1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * Copyright (C) 1993 Linus Torvalds 4 * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999 5 * SMP-safe vmalloc/vfree/ioremap, Tigran Aivazian <tigran@veritas.com>, May 2000 6 * Major rework to support vmap/vunmap, Christoph Hellwig, SGI, August 2002 7 * Numa awareness, Christoph Lameter, SGI, June 2005 8 * Improving global KVA allocator, Uladzislau Rezki, Sony, May 2019 9 */ 10 11 #include <linux/vmalloc.h> 12 #include <linux/mm.h> 13 #include <linux/module.h> 14 #include <linux/highmem.h> 15 #include <linux/sched/signal.h> 16 #include <linux/slab.h> 17 #include <linux/spinlock.h> 18 #include <linux/interrupt.h> 19 #include <linux/proc_fs.h> 20 #include <linux/seq_file.h> 21 #include <linux/set_memory.h> 22 #include <linux/debugobjects.h> 23 #include <linux/kallsyms.h> 24 #include <linux/list.h> 25 #include <linux/notifier.h> 26 #include <linux/rbtree.h> 27 #include <linux/xarray.h> 28 #include <linux/io.h> 29 #include <linux/rcupdate.h> 30 #include <linux/pfn.h> 31 #include <linux/kmemleak.h> 32 #include <linux/atomic.h> 33 #include <linux/compiler.h> 34 #include <linux/llist.h> 35 #include <linux/bitops.h> 36 #include <linux/rbtree_augmented.h> 37 #include <linux/overflow.h> 38 #include <linux/pgtable.h> 39 #include <linux/uaccess.h> 40 #include <linux/hugetlb.h> 41 #include <asm/tlbflush.h> 42 #include <asm/shmparam.h> 43 44 #include "internal.h" 45 #include "pgalloc-track.h" 46 47 #ifdef CONFIG_HAVE_ARCH_HUGE_VMAP 48 static unsigned int __ro_after_init ioremap_max_page_shift = BITS_PER_LONG - 1; 49 50 static int __init set_nohugeiomap(char *str) 51 { 52 ioremap_max_page_shift = PAGE_SHIFT; 53 return 0; 54 } 55 early_param("nohugeiomap", set_nohugeiomap); 56 #else /* CONFIG_HAVE_ARCH_HUGE_VMAP */ 57 static const unsigned int ioremap_max_page_shift = PAGE_SHIFT; 58 #endif /* CONFIG_HAVE_ARCH_HUGE_VMAP */ 59 60 #ifdef CONFIG_HAVE_ARCH_HUGE_VMALLOC 61 static bool __ro_after_init vmap_allow_huge = true; 62 63 static int __init set_nohugevmalloc(char *str) 64 { 65 vmap_allow_huge = false; 66 return 0; 67 } 68 early_param("nohugevmalloc", set_nohugevmalloc); 69 #else /* CONFIG_HAVE_ARCH_HUGE_VMALLOC */ 70 static const bool vmap_allow_huge = false; 71 #endif /* CONFIG_HAVE_ARCH_HUGE_VMALLOC */ 72 73 bool is_vmalloc_addr(const void *x) 74 { 75 unsigned long addr = (unsigned long)x; 76 77 return addr >= VMALLOC_START && addr < VMALLOC_END; 78 } 79 EXPORT_SYMBOL(is_vmalloc_addr); 80 81 struct vfree_deferred { 82 struct llist_head list; 83 struct work_struct wq; 84 }; 85 static DEFINE_PER_CPU(struct vfree_deferred, vfree_deferred); 86 87 static void __vunmap(const void *, int); 88 89 static void free_work(struct work_struct *w) 90 { 91 struct vfree_deferred *p = container_of(w, struct vfree_deferred, wq); 92 struct llist_node *t, *llnode; 93 94 llist_for_each_safe(llnode, t, llist_del_all(&p->list)) 95 __vunmap((void *)llnode, 1); 96 } 97 98 /*** Page table manipulation functions ***/ 99 static int vmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, 100 phys_addr_t phys_addr, pgprot_t prot, 101 unsigned int max_page_shift, pgtbl_mod_mask *mask) 102 { 103 pte_t *pte; 104 u64 pfn; 105 unsigned long size = PAGE_SIZE; 106 107 pfn = phys_addr >> PAGE_SHIFT; 108 pte = pte_alloc_kernel_track(pmd, addr, mask); 109 if (!pte) 110 return -ENOMEM; 111 do { 112 BUG_ON(!pte_none(*pte)); 113 114 #ifdef CONFIG_HUGETLB_PAGE 115 size = arch_vmap_pte_range_map_size(addr, end, pfn, max_page_shift); 116 if (size != PAGE_SIZE) { 117 pte_t 
entry = pfn_pte(pfn, prot); 118 119 entry = pte_mkhuge(entry); 120 entry = arch_make_huge_pte(entry, ilog2(size), 0); 121 set_huge_pte_at(&init_mm, addr, pte, entry); 122 pfn += PFN_DOWN(size); 123 continue; 124 } 125 #endif 126 set_pte_at(&init_mm, addr, pte, pfn_pte(pfn, prot)); 127 pfn++; 128 } while (pte += PFN_DOWN(size), addr += size, addr != end); 129 *mask |= PGTBL_PTE_MODIFIED; 130 return 0; 131 } 132 133 static int vmap_try_huge_pmd(pmd_t *pmd, unsigned long addr, unsigned long end, 134 phys_addr_t phys_addr, pgprot_t prot, 135 unsigned int max_page_shift) 136 { 137 if (max_page_shift < PMD_SHIFT) 138 return 0; 139 140 if (!arch_vmap_pmd_supported(prot)) 141 return 0; 142 143 if ((end - addr) != PMD_SIZE) 144 return 0; 145 146 if (!IS_ALIGNED(addr, PMD_SIZE)) 147 return 0; 148 149 if (!IS_ALIGNED(phys_addr, PMD_SIZE)) 150 return 0; 151 152 if (pmd_present(*pmd) && !pmd_free_pte_page(pmd, addr)) 153 return 0; 154 155 return pmd_set_huge(pmd, phys_addr, prot); 156 } 157 158 static int vmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end, 159 phys_addr_t phys_addr, pgprot_t prot, 160 unsigned int max_page_shift, pgtbl_mod_mask *mask) 161 { 162 pmd_t *pmd; 163 unsigned long next; 164 165 pmd = pmd_alloc_track(&init_mm, pud, addr, mask); 166 if (!pmd) 167 return -ENOMEM; 168 do { 169 next = pmd_addr_end(addr, end); 170 171 if (vmap_try_huge_pmd(pmd, addr, next, phys_addr, prot, 172 max_page_shift)) { 173 *mask |= PGTBL_PMD_MODIFIED; 174 continue; 175 } 176 177 if (vmap_pte_range(pmd, addr, next, phys_addr, prot, max_page_shift, mask)) 178 return -ENOMEM; 179 } while (pmd++, phys_addr += (next - addr), addr = next, addr != end); 180 return 0; 181 } 182 183 static int vmap_try_huge_pud(pud_t *pud, unsigned long addr, unsigned long end, 184 phys_addr_t phys_addr, pgprot_t prot, 185 unsigned int max_page_shift) 186 { 187 if (max_page_shift < PUD_SHIFT) 188 return 0; 189 190 if (!arch_vmap_pud_supported(prot)) 191 return 0; 192 193 if ((end - addr) != PUD_SIZE) 194 return 0; 195 196 if (!IS_ALIGNED(addr, PUD_SIZE)) 197 return 0; 198 199 if (!IS_ALIGNED(phys_addr, PUD_SIZE)) 200 return 0; 201 202 if (pud_present(*pud) && !pud_free_pmd_page(pud, addr)) 203 return 0; 204 205 return pud_set_huge(pud, phys_addr, prot); 206 } 207 208 static int vmap_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end, 209 phys_addr_t phys_addr, pgprot_t prot, 210 unsigned int max_page_shift, pgtbl_mod_mask *mask) 211 { 212 pud_t *pud; 213 unsigned long next; 214 215 pud = pud_alloc_track(&init_mm, p4d, addr, mask); 216 if (!pud) 217 return -ENOMEM; 218 do { 219 next = pud_addr_end(addr, end); 220 221 if (vmap_try_huge_pud(pud, addr, next, phys_addr, prot, 222 max_page_shift)) { 223 *mask |= PGTBL_PUD_MODIFIED; 224 continue; 225 } 226 227 if (vmap_pmd_range(pud, addr, next, phys_addr, prot, 228 max_page_shift, mask)) 229 return -ENOMEM; 230 } while (pud++, phys_addr += (next - addr), addr = next, addr != end); 231 return 0; 232 } 233 234 static int vmap_try_huge_p4d(p4d_t *p4d, unsigned long addr, unsigned long end, 235 phys_addr_t phys_addr, pgprot_t prot, 236 unsigned int max_page_shift) 237 { 238 if (max_page_shift < P4D_SHIFT) 239 return 0; 240 241 if (!arch_vmap_p4d_supported(prot)) 242 return 0; 243 244 if ((end - addr) != P4D_SIZE) 245 return 0; 246 247 if (!IS_ALIGNED(addr, P4D_SIZE)) 248 return 0; 249 250 if (!IS_ALIGNED(phys_addr, P4D_SIZE)) 251 return 0; 252 253 if (p4d_present(*p4d) && !p4d_free_pud_page(p4d, addr)) 254 return 0; 255 256 return p4d_set_huge(p4d, phys_addr, prot); 
257 } 258 259 static int vmap_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end, 260 phys_addr_t phys_addr, pgprot_t prot, 261 unsigned int max_page_shift, pgtbl_mod_mask *mask) 262 { 263 p4d_t *p4d; 264 unsigned long next; 265 266 p4d = p4d_alloc_track(&init_mm, pgd, addr, mask); 267 if (!p4d) 268 return -ENOMEM; 269 do { 270 next = p4d_addr_end(addr, end); 271 272 if (vmap_try_huge_p4d(p4d, addr, next, phys_addr, prot, 273 max_page_shift)) { 274 *mask |= PGTBL_P4D_MODIFIED; 275 continue; 276 } 277 278 if (vmap_pud_range(p4d, addr, next, phys_addr, prot, 279 max_page_shift, mask)) 280 return -ENOMEM; 281 } while (p4d++, phys_addr += (next - addr), addr = next, addr != end); 282 return 0; 283 } 284 285 static int vmap_range_noflush(unsigned long addr, unsigned long end, 286 phys_addr_t phys_addr, pgprot_t prot, 287 unsigned int max_page_shift) 288 { 289 pgd_t *pgd; 290 unsigned long start; 291 unsigned long next; 292 int err; 293 pgtbl_mod_mask mask = 0; 294 295 might_sleep(); 296 BUG_ON(addr >= end); 297 298 start = addr; 299 pgd = pgd_offset_k(addr); 300 do { 301 next = pgd_addr_end(addr, end); 302 err = vmap_p4d_range(pgd, addr, next, phys_addr, prot, 303 max_page_shift, &mask); 304 if (err) 305 break; 306 } while (pgd++, phys_addr += (next - addr), addr = next, addr != end); 307 308 if (mask & ARCH_PAGE_TABLE_SYNC_MASK) 309 arch_sync_kernel_mappings(start, end); 310 311 return err; 312 } 313 314 int ioremap_page_range(unsigned long addr, unsigned long end, 315 phys_addr_t phys_addr, pgprot_t prot) 316 { 317 int err; 318 319 err = vmap_range_noflush(addr, end, phys_addr, pgprot_nx(prot), 320 ioremap_max_page_shift); 321 flush_cache_vmap(addr, end); 322 return err; 323 } 324 325 static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, 326 pgtbl_mod_mask *mask) 327 { 328 pte_t *pte; 329 330 pte = pte_offset_kernel(pmd, addr); 331 do { 332 pte_t ptent = ptep_get_and_clear(&init_mm, addr, pte); 333 WARN_ON(!pte_none(ptent) && !pte_present(ptent)); 334 } while (pte++, addr += PAGE_SIZE, addr != end); 335 *mask |= PGTBL_PTE_MODIFIED; 336 } 337 338 static void vunmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end, 339 pgtbl_mod_mask *mask) 340 { 341 pmd_t *pmd; 342 unsigned long next; 343 int cleared; 344 345 pmd = pmd_offset(pud, addr); 346 do { 347 next = pmd_addr_end(addr, end); 348 349 cleared = pmd_clear_huge(pmd); 350 if (cleared || pmd_bad(*pmd)) 351 *mask |= PGTBL_PMD_MODIFIED; 352 353 if (cleared) 354 continue; 355 if (pmd_none_or_clear_bad(pmd)) 356 continue; 357 vunmap_pte_range(pmd, addr, next, mask); 358 359 cond_resched(); 360 } while (pmd++, addr = next, addr != end); 361 } 362 363 static void vunmap_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end, 364 pgtbl_mod_mask *mask) 365 { 366 pud_t *pud; 367 unsigned long next; 368 int cleared; 369 370 pud = pud_offset(p4d, addr); 371 do { 372 next = pud_addr_end(addr, end); 373 374 cleared = pud_clear_huge(pud); 375 if (cleared || pud_bad(*pud)) 376 *mask |= PGTBL_PUD_MODIFIED; 377 378 if (cleared) 379 continue; 380 if (pud_none_or_clear_bad(pud)) 381 continue; 382 vunmap_pmd_range(pud, addr, next, mask); 383 } while (pud++, addr = next, addr != end); 384 } 385 386 static void vunmap_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end, 387 pgtbl_mod_mask *mask) 388 { 389 p4d_t *p4d; 390 unsigned long next; 391 int cleared; 392 393 p4d = p4d_offset(pgd, addr); 394 do { 395 next = p4d_addr_end(addr, end); 396 397 cleared = p4d_clear_huge(p4d); 398 if (cleared || 
p4d_bad(*p4d)) 399 *mask |= PGTBL_P4D_MODIFIED; 400 401 if (cleared) 402 continue; 403 if (p4d_none_or_clear_bad(p4d)) 404 continue; 405 vunmap_pud_range(p4d, addr, next, mask); 406 } while (p4d++, addr = next, addr != end); 407 } 408 409 /* 410 * vunmap_range_noflush is similar to vunmap_range, but does not 411 * flush caches or TLBs. 412 * 413 * The caller is responsible for calling flush_cache_vmap() before calling 414 * this function, and flush_tlb_kernel_range after it has returned 415 * successfully (and before the addresses are expected to cause a page fault 416 * or be re-mapped for something else, if TLB flushes are being delayed or 417 * coalesced). 418 * 419 * This is an internal function only. Do not use outside mm/. 420 */ 421 void vunmap_range_noflush(unsigned long start, unsigned long end) 422 { 423 unsigned long next; 424 pgd_t *pgd; 425 unsigned long addr = start; 426 pgtbl_mod_mask mask = 0; 427 428 BUG_ON(addr >= end); 429 pgd = pgd_offset_k(addr); 430 do { 431 next = pgd_addr_end(addr, end); 432 if (pgd_bad(*pgd)) 433 mask |= PGTBL_PGD_MODIFIED; 434 if (pgd_none_or_clear_bad(pgd)) 435 continue; 436 vunmap_p4d_range(pgd, addr, next, &mask); 437 } while (pgd++, addr = next, addr != end); 438 439 if (mask & ARCH_PAGE_TABLE_SYNC_MASK) 440 arch_sync_kernel_mappings(start, end); 441 } 442 443 /** 444 * vunmap_range - unmap kernel virtual addresses 445 * @addr: start of the VM area to unmap 446 * @end: end of the VM area to unmap (non-inclusive) 447 * 448 * Clears any present PTEs in the virtual address range, flushes TLBs and 449 * caches. Any subsequent access to the address before it has been re-mapped 450 * is a kernel bug. 451 */ 452 void vunmap_range(unsigned long addr, unsigned long end) 453 { 454 flush_cache_vunmap(addr, end); 455 vunmap_range_noflush(addr, end); 456 flush_tlb_kernel_range(addr, end); 457 } 458 459 static int vmap_pages_pte_range(pmd_t *pmd, unsigned long addr, 460 unsigned long end, pgprot_t prot, struct page **pages, int *nr, 461 pgtbl_mod_mask *mask) 462 { 463 pte_t *pte; 464 465 /* 466 * nr is a running index into the array which helps higher level 467 * callers keep track of where we're up to. 
468 */ 469 470 pte = pte_alloc_kernel_track(pmd, addr, mask); 471 if (!pte) 472 return -ENOMEM; 473 do { 474 struct page *page = pages[*nr]; 475 476 if (WARN_ON(!pte_none(*pte))) 477 return -EBUSY; 478 if (WARN_ON(!page)) 479 return -ENOMEM; 480 set_pte_at(&init_mm, addr, pte, mk_pte(page, prot)); 481 (*nr)++; 482 } while (pte++, addr += PAGE_SIZE, addr != end); 483 *mask |= PGTBL_PTE_MODIFIED; 484 return 0; 485 } 486 487 static int vmap_pages_pmd_range(pud_t *pud, unsigned long addr, 488 unsigned long end, pgprot_t prot, struct page **pages, int *nr, 489 pgtbl_mod_mask *mask) 490 { 491 pmd_t *pmd; 492 unsigned long next; 493 494 pmd = pmd_alloc_track(&init_mm, pud, addr, mask); 495 if (!pmd) 496 return -ENOMEM; 497 do { 498 next = pmd_addr_end(addr, end); 499 if (vmap_pages_pte_range(pmd, addr, next, prot, pages, nr, mask)) 500 return -ENOMEM; 501 } while (pmd++, addr = next, addr != end); 502 return 0; 503 } 504 505 static int vmap_pages_pud_range(p4d_t *p4d, unsigned long addr, 506 unsigned long end, pgprot_t prot, struct page **pages, int *nr, 507 pgtbl_mod_mask *mask) 508 { 509 pud_t *pud; 510 unsigned long next; 511 512 pud = pud_alloc_track(&init_mm, p4d, addr, mask); 513 if (!pud) 514 return -ENOMEM; 515 do { 516 next = pud_addr_end(addr, end); 517 if (vmap_pages_pmd_range(pud, addr, next, prot, pages, nr, mask)) 518 return -ENOMEM; 519 } while (pud++, addr = next, addr != end); 520 return 0; 521 } 522 523 static int vmap_pages_p4d_range(pgd_t *pgd, unsigned long addr, 524 unsigned long end, pgprot_t prot, struct page **pages, int *nr, 525 pgtbl_mod_mask *mask) 526 { 527 p4d_t *p4d; 528 unsigned long next; 529 530 p4d = p4d_alloc_track(&init_mm, pgd, addr, mask); 531 if (!p4d) 532 return -ENOMEM; 533 do { 534 next = p4d_addr_end(addr, end); 535 if (vmap_pages_pud_range(p4d, addr, next, prot, pages, nr, mask)) 536 return -ENOMEM; 537 } while (p4d++, addr = next, addr != end); 538 return 0; 539 } 540 541 static int vmap_small_pages_range_noflush(unsigned long addr, unsigned long end, 542 pgprot_t prot, struct page **pages) 543 { 544 unsigned long start = addr; 545 pgd_t *pgd; 546 unsigned long next; 547 int err = 0; 548 int nr = 0; 549 pgtbl_mod_mask mask = 0; 550 551 BUG_ON(addr >= end); 552 pgd = pgd_offset_k(addr); 553 do { 554 next = pgd_addr_end(addr, end); 555 if (pgd_bad(*pgd)) 556 mask |= PGTBL_PGD_MODIFIED; 557 err = vmap_pages_p4d_range(pgd, addr, next, prot, pages, &nr, &mask); 558 if (err) 559 return err; 560 } while (pgd++, addr = next, addr != end); 561 562 if (mask & ARCH_PAGE_TABLE_SYNC_MASK) 563 arch_sync_kernel_mappings(start, end); 564 565 return 0; 566 } 567 568 /* 569 * vmap_pages_range_noflush is similar to vmap_pages_range, but does not 570 * flush caches. 571 * 572 * The caller is responsible for calling flush_cache_vmap() after this 573 * function returns successfully and before the addresses are accessed. 574 * 575 * This is an internal function only. Do not use outside mm/. 
576 */ 577 int vmap_pages_range_noflush(unsigned long addr, unsigned long end, 578 pgprot_t prot, struct page **pages, unsigned int page_shift) 579 { 580 unsigned int i, nr = (end - addr) >> PAGE_SHIFT; 581 582 WARN_ON(page_shift < PAGE_SHIFT); 583 584 if (!IS_ENABLED(CONFIG_HAVE_ARCH_HUGE_VMALLOC) || 585 page_shift == PAGE_SHIFT) 586 return vmap_small_pages_range_noflush(addr, end, prot, pages); 587 588 for (i = 0; i < nr; i += 1U << (page_shift - PAGE_SHIFT)) { 589 int err; 590 591 err = vmap_range_noflush(addr, addr + (1UL << page_shift), 592 __pa(page_address(pages[i])), prot, 593 page_shift); 594 if (err) 595 return err; 596 597 addr += 1UL << page_shift; 598 } 599 600 return 0; 601 } 602 603 /** 604 * vmap_pages_range - map pages to a kernel virtual address 605 * @addr: start of the VM area to map 606 * @end: end of the VM area to map (non-inclusive) 607 * @prot: page protection flags to use 608 * @pages: pages to map (always PAGE_SIZE pages) 609 * @page_shift: maximum shift that the pages may be mapped with, @pages must 610 * be aligned and contiguous up to at least this shift. 611 * 612 * RETURNS: 613 * 0 on success, -errno on failure. 614 */ 615 static int vmap_pages_range(unsigned long addr, unsigned long end, 616 pgprot_t prot, struct page **pages, unsigned int page_shift) 617 { 618 int err; 619 620 err = vmap_pages_range_noflush(addr, end, prot, pages, page_shift); 621 flush_cache_vmap(addr, end); 622 return err; 623 } 624 625 int is_vmalloc_or_module_addr(const void *x) 626 { 627 /* 628 * ARM, x86-64 and sparc64 put modules in a special place, 629 * and fall back on vmalloc() if that fails. Others 630 * just put it in the vmalloc space. 631 */ 632 #if defined(CONFIG_MODULES) && defined(MODULES_VADDR) 633 unsigned long addr = (unsigned long)x; 634 if (addr >= MODULES_VADDR && addr < MODULES_END) 635 return 1; 636 #endif 637 return is_vmalloc_addr(x); 638 } 639 640 /* 641 * Walk a vmap address to the struct page it maps. Huge vmap mappings will 642 * return the tail page that corresponds to the base page address, which 643 * matches small vmap mappings. 
644 */ 645 struct page *vmalloc_to_page(const void *vmalloc_addr) 646 { 647 unsigned long addr = (unsigned long) vmalloc_addr; 648 struct page *page = NULL; 649 pgd_t *pgd = pgd_offset_k(addr); 650 p4d_t *p4d; 651 pud_t *pud; 652 pmd_t *pmd; 653 pte_t *ptep, pte; 654 655 /* 656 * XXX we might need to change this if we add VIRTUAL_BUG_ON for 657 * architectures that do not vmalloc module space 658 */ 659 VIRTUAL_BUG_ON(!is_vmalloc_or_module_addr(vmalloc_addr)); 660 661 if (pgd_none(*pgd)) 662 return NULL; 663 if (WARN_ON_ONCE(pgd_leaf(*pgd))) 664 return NULL; /* XXX: no allowance for huge pgd */ 665 if (WARN_ON_ONCE(pgd_bad(*pgd))) 666 return NULL; 667 668 p4d = p4d_offset(pgd, addr); 669 if (p4d_none(*p4d)) 670 return NULL; 671 if (p4d_leaf(*p4d)) 672 return p4d_page(*p4d) + ((addr & ~P4D_MASK) >> PAGE_SHIFT); 673 if (WARN_ON_ONCE(p4d_bad(*p4d))) 674 return NULL; 675 676 pud = pud_offset(p4d, addr); 677 if (pud_none(*pud)) 678 return NULL; 679 if (pud_leaf(*pud)) 680 return pud_page(*pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT); 681 if (WARN_ON_ONCE(pud_bad(*pud))) 682 return NULL; 683 684 pmd = pmd_offset(pud, addr); 685 if (pmd_none(*pmd)) 686 return NULL; 687 if (pmd_leaf(*pmd)) 688 return pmd_page(*pmd) + ((addr & ~PMD_MASK) >> PAGE_SHIFT); 689 if (WARN_ON_ONCE(pmd_bad(*pmd))) 690 return NULL; 691 692 ptep = pte_offset_map(pmd, addr); 693 pte = *ptep; 694 if (pte_present(pte)) 695 page = pte_page(pte); 696 pte_unmap(ptep); 697 698 return page; 699 } 700 EXPORT_SYMBOL(vmalloc_to_page); 701 702 /* 703 * Map a vmalloc()-space virtual address to the physical page frame number. 704 */ 705 unsigned long vmalloc_to_pfn(const void *vmalloc_addr) 706 { 707 return page_to_pfn(vmalloc_to_page(vmalloc_addr)); 708 } 709 EXPORT_SYMBOL(vmalloc_to_pfn); 710 711 712 /*** Global kva allocator ***/ 713 714 #define DEBUG_AUGMENT_PROPAGATE_CHECK 0 715 #define DEBUG_AUGMENT_LOWEST_MATCH_CHECK 0 716 717 718 static DEFINE_SPINLOCK(vmap_area_lock); 719 static DEFINE_SPINLOCK(free_vmap_area_lock); 720 /* Export for kexec only */ 721 LIST_HEAD(vmap_area_list); 722 static struct rb_root vmap_area_root = RB_ROOT; 723 static bool vmap_initialized __read_mostly; 724 725 static struct rb_root purge_vmap_area_root = RB_ROOT; 726 static LIST_HEAD(purge_vmap_area_list); 727 static DEFINE_SPINLOCK(purge_vmap_area_lock); 728 729 /* 730 * This kmem_cache is used for vmap_area objects. Instead of 731 * allocating from slab we reuse an object from this cache to 732 * make things faster. Especially in "no edge" splitting of 733 * free block. 734 */ 735 static struct kmem_cache *vmap_area_cachep; 736 737 /* 738 * This linked list is used in pair with free_vmap_area_root. 739 * It gives O(1) access to prev/next to perform fast coalescing. 740 */ 741 static LIST_HEAD(free_vmap_area_list); 742 743 /* 744 * This augment red-black tree represents the free vmap space. 745 * All vmap_area objects in this tree are sorted by va->va_start 746 * address. It is used for allocation and merging when a vmap 747 * object is released. 748 * 749 * Each vmap_area node contains a maximum available free block 750 * of its sub-tree, right or left. Therefore it is possible to 751 * find a lowest match of free area. 752 */ 753 static struct rb_root free_vmap_area_root = RB_ROOT; 754 755 /* 756 * Preload a CPU with one object for "no edge" split case. The 757 * aim is to get rid of allocations from the atomic context, thus 758 * to use more permissive allocation masks. 
759 */ 760 static DEFINE_PER_CPU(struct vmap_area *, ne_fit_preload_node); 761 762 static __always_inline unsigned long 763 va_size(struct vmap_area *va) 764 { 765 return (va->va_end - va->va_start); 766 } 767 768 static __always_inline unsigned long 769 get_subtree_max_size(struct rb_node *node) 770 { 771 struct vmap_area *va; 772 773 va = rb_entry_safe(node, struct vmap_area, rb_node); 774 return va ? va->subtree_max_size : 0; 775 } 776 777 /* 778 * Gets called when remove the node and rotate. 779 */ 780 static __always_inline unsigned long 781 compute_subtree_max_size(struct vmap_area *va) 782 { 783 return max3(va_size(va), 784 get_subtree_max_size(va->rb_node.rb_left), 785 get_subtree_max_size(va->rb_node.rb_right)); 786 } 787 788 RB_DECLARE_CALLBACKS_MAX(static, free_vmap_area_rb_augment_cb, 789 struct vmap_area, rb_node, unsigned long, subtree_max_size, va_size) 790 791 static void purge_vmap_area_lazy(void); 792 static BLOCKING_NOTIFIER_HEAD(vmap_notify_list); 793 static unsigned long lazy_max_pages(void); 794 795 static atomic_long_t nr_vmalloc_pages; 796 797 unsigned long vmalloc_nr_pages(void) 798 { 799 return atomic_long_read(&nr_vmalloc_pages); 800 } 801 802 static struct vmap_area *find_vmap_area_exceed_addr(unsigned long addr) 803 { 804 struct vmap_area *va = NULL; 805 struct rb_node *n = vmap_area_root.rb_node; 806 807 while (n) { 808 struct vmap_area *tmp; 809 810 tmp = rb_entry(n, struct vmap_area, rb_node); 811 if (tmp->va_end > addr) { 812 va = tmp; 813 if (tmp->va_start <= addr) 814 break; 815 816 n = n->rb_left; 817 } else 818 n = n->rb_right; 819 } 820 821 return va; 822 } 823 824 static struct vmap_area *__find_vmap_area(unsigned long addr) 825 { 826 struct rb_node *n = vmap_area_root.rb_node; 827 828 while (n) { 829 struct vmap_area *va; 830 831 va = rb_entry(n, struct vmap_area, rb_node); 832 if (addr < va->va_start) 833 n = n->rb_left; 834 else if (addr >= va->va_end) 835 n = n->rb_right; 836 else 837 return va; 838 } 839 840 return NULL; 841 } 842 843 /* 844 * This function returns back addresses of parent node 845 * and its left or right link for further processing. 846 * 847 * Otherwise NULL is returned. In that case all further 848 * steps regarding inserting of conflicting overlap range 849 * have to be declined and actually considered as a bug. 850 */ 851 static __always_inline struct rb_node ** 852 find_va_links(struct vmap_area *va, 853 struct rb_root *root, struct rb_node *from, 854 struct rb_node **parent) 855 { 856 struct vmap_area *tmp_va; 857 struct rb_node **link; 858 859 if (root) { 860 link = &root->rb_node; 861 if (unlikely(!*link)) { 862 *parent = NULL; 863 return link; 864 } 865 } else { 866 link = &from; 867 } 868 869 /* 870 * Go to the bottom of the tree. When we hit the last point 871 * we end up with parent rb_node and correct direction, i name 872 * it link, where the new va->rb_node will be attached to. 873 */ 874 do { 875 tmp_va = rb_entry(*link, struct vmap_area, rb_node); 876 877 /* 878 * During the traversal we also do some sanity check. 879 * Trigger the BUG() if there are sides(left/right) 880 * or full overlaps. 
881 */ 882 if (va->va_start < tmp_va->va_end && 883 va->va_end <= tmp_va->va_start) 884 link = &(*link)->rb_left; 885 else if (va->va_end > tmp_va->va_start && 886 va->va_start >= tmp_va->va_end) 887 link = &(*link)->rb_right; 888 else { 889 WARN(1, "vmalloc bug: 0x%lx-0x%lx overlaps with 0x%lx-0x%lx\n", 890 va->va_start, va->va_end, tmp_va->va_start, tmp_va->va_end); 891 892 return NULL; 893 } 894 } while (*link); 895 896 *parent = &tmp_va->rb_node; 897 return link; 898 } 899 900 static __always_inline struct list_head * 901 get_va_next_sibling(struct rb_node *parent, struct rb_node **link) 902 { 903 struct list_head *list; 904 905 if (unlikely(!parent)) 906 /* 907 * The red-black tree where we try to find VA neighbors 908 * before merging or inserting is empty, i.e. it means 909 * there is no free vmap space. Normally it does not 910 * happen but we handle this case anyway. 911 */ 912 return NULL; 913 914 list = &rb_entry(parent, struct vmap_area, rb_node)->list; 915 return (&parent->rb_right == link ? list->next : list); 916 } 917 918 static __always_inline void 919 link_va(struct vmap_area *va, struct rb_root *root, 920 struct rb_node *parent, struct rb_node **link, struct list_head *head) 921 { 922 /* 923 * VA is still not in the list, but we can 924 * identify its future previous list_head node. 925 */ 926 if (likely(parent)) { 927 head = &rb_entry(parent, struct vmap_area, rb_node)->list; 928 if (&parent->rb_right != link) 929 head = head->prev; 930 } 931 932 /* Insert to the rb-tree */ 933 rb_link_node(&va->rb_node, parent, link); 934 if (root == &free_vmap_area_root) { 935 /* 936 * Some explanation here. Just perform simple insertion 937 * to the tree. We do not set va->subtree_max_size to 938 * its current size before calling rb_insert_augmented(). 939 * It is because of we populate the tree from the bottom 940 * to parent levels when the node _is_ in the tree. 941 * 942 * Therefore we set subtree_max_size to zero after insertion, 943 * to let __augment_tree_propagate_from() puts everything to 944 * the correct order later on. 945 */ 946 rb_insert_augmented(&va->rb_node, 947 root, &free_vmap_area_rb_augment_cb); 948 va->subtree_max_size = 0; 949 } else { 950 rb_insert_color(&va->rb_node, root); 951 } 952 953 /* Address-sort this list */ 954 list_add(&va->list, head); 955 } 956 957 static __always_inline void 958 unlink_va(struct vmap_area *va, struct rb_root *root) 959 { 960 if (WARN_ON(RB_EMPTY_NODE(&va->rb_node))) 961 return; 962 963 if (root == &free_vmap_area_root) 964 rb_erase_augmented(&va->rb_node, 965 root, &free_vmap_area_rb_augment_cb); 966 else 967 rb_erase(&va->rb_node, root); 968 969 list_del(&va->list); 970 RB_CLEAR_NODE(&va->rb_node); 971 } 972 973 #if DEBUG_AUGMENT_PROPAGATE_CHECK 974 static void 975 augment_tree_propagate_check(void) 976 { 977 struct vmap_area *va; 978 unsigned long computed_size; 979 980 list_for_each_entry(va, &free_vmap_area_list, list) { 981 computed_size = compute_subtree_max_size(va); 982 if (computed_size != va->subtree_max_size) 983 pr_emerg("tree is corrupted: %lu, %lu\n", 984 va_size(va), va->subtree_max_size); 985 } 986 } 987 #endif 988 989 /* 990 * This function populates subtree_max_size from bottom to upper 991 * levels starting from VA point. The propagation must be done 992 * when VA size is modified by changing its va_start/va_end. Or 993 * in case of newly inserting of VA to the tree. 
994 * 995 * It means that __augment_tree_propagate_from() must be called: 996 * - After VA has been inserted to the tree(free path); 997 * - After VA has been shrunk(allocation path); 998 * - After VA has been increased(merging path). 999 * 1000 * Please note that, it does not mean that upper parent nodes 1001 * and their subtree_max_size are recalculated all the time up 1002 * to the root node. 1003 * 1004 * 4--8 1005 * /\ 1006 * / \ 1007 * / \ 1008 * 2--2 8--8 1009 * 1010 * For example if we modify the node 4, shrinking it to 2, then 1011 * no any modification is required. If we shrink the node 2 to 1 1012 * its subtree_max_size is updated only, and set to 1. If we shrink 1013 * the node 8 to 6, then its subtree_max_size is set to 6 and parent 1014 * node becomes 4--6. 1015 */ 1016 static __always_inline void 1017 augment_tree_propagate_from(struct vmap_area *va) 1018 { 1019 /* 1020 * Populate the tree from bottom towards the root until 1021 * the calculated maximum available size of checked node 1022 * is equal to its current one. 1023 */ 1024 free_vmap_area_rb_augment_cb_propagate(&va->rb_node, NULL); 1025 1026 #if DEBUG_AUGMENT_PROPAGATE_CHECK 1027 augment_tree_propagate_check(); 1028 #endif 1029 } 1030 1031 static void 1032 insert_vmap_area(struct vmap_area *va, 1033 struct rb_root *root, struct list_head *head) 1034 { 1035 struct rb_node **link; 1036 struct rb_node *parent; 1037 1038 link = find_va_links(va, root, NULL, &parent); 1039 if (link) 1040 link_va(va, root, parent, link, head); 1041 } 1042 1043 static void 1044 insert_vmap_area_augment(struct vmap_area *va, 1045 struct rb_node *from, struct rb_root *root, 1046 struct list_head *head) 1047 { 1048 struct rb_node **link; 1049 struct rb_node *parent; 1050 1051 if (from) 1052 link = find_va_links(va, NULL, from, &parent); 1053 else 1054 link = find_va_links(va, root, NULL, &parent); 1055 1056 if (link) { 1057 link_va(va, root, parent, link, head); 1058 augment_tree_propagate_from(va); 1059 } 1060 } 1061 1062 /* 1063 * Merge de-allocated chunk of VA memory with previous 1064 * and next free blocks. If coalesce is not done a new 1065 * free area is inserted. If VA has been merged, it is 1066 * freed. 1067 * 1068 * Please note, it can return NULL in case of overlap 1069 * ranges, followed by WARN() report. Despite it is a 1070 * buggy behaviour, a system can be alive and keep 1071 * ongoing. 1072 */ 1073 static __always_inline struct vmap_area * 1074 merge_or_add_vmap_area(struct vmap_area *va, 1075 struct rb_root *root, struct list_head *head) 1076 { 1077 struct vmap_area *sibling; 1078 struct list_head *next; 1079 struct rb_node **link; 1080 struct rb_node *parent; 1081 bool merged = false; 1082 1083 /* 1084 * Find a place in the tree where VA potentially will be 1085 * inserted, unless it is merged with its sibling/siblings. 1086 */ 1087 link = find_va_links(va, root, NULL, &parent); 1088 if (!link) 1089 return NULL; 1090 1091 /* 1092 * Get next node of VA to check if merging can be done. 1093 */ 1094 next = get_va_next_sibling(parent, link); 1095 if (unlikely(next == NULL)) 1096 goto insert; 1097 1098 /* 1099 * start end 1100 * | | 1101 * |<------VA------>|<-----Next----->| 1102 * | | 1103 * start end 1104 */ 1105 if (next != head) { 1106 sibling = list_entry(next, struct vmap_area, list); 1107 if (sibling->va_start == va->va_end) { 1108 sibling->va_start = va->va_start; 1109 1110 /* Free vmap_area object. */ 1111 kmem_cache_free(vmap_area_cachep, va); 1112 1113 /* Point to the new merged area. 
*/ 1114 va = sibling; 1115 merged = true; 1116 } 1117 } 1118 1119 /* 1120 * start end 1121 * | | 1122 * |<-----Prev----->|<------VA------>| 1123 * | | 1124 * start end 1125 */ 1126 if (next->prev != head) { 1127 sibling = list_entry(next->prev, struct vmap_area, list); 1128 if (sibling->va_end == va->va_start) { 1129 /* 1130 * If both neighbors are coalesced, it is important 1131 * to unlink the "next" node first, followed by merging 1132 * with "previous" one. Otherwise the tree might not be 1133 * fully populated if a sibling's augmented value is 1134 * "normalized" because of rotation operations. 1135 */ 1136 if (merged) 1137 unlink_va(va, root); 1138 1139 sibling->va_end = va->va_end; 1140 1141 /* Free vmap_area object. */ 1142 kmem_cache_free(vmap_area_cachep, va); 1143 1144 /* Point to the new merged area. */ 1145 va = sibling; 1146 merged = true; 1147 } 1148 } 1149 1150 insert: 1151 if (!merged) 1152 link_va(va, root, parent, link, head); 1153 1154 return va; 1155 } 1156 1157 static __always_inline struct vmap_area * 1158 merge_or_add_vmap_area_augment(struct vmap_area *va, 1159 struct rb_root *root, struct list_head *head) 1160 { 1161 va = merge_or_add_vmap_area(va, root, head); 1162 if (va) 1163 augment_tree_propagate_from(va); 1164 1165 return va; 1166 } 1167 1168 static __always_inline bool 1169 is_within_this_va(struct vmap_area *va, unsigned long size, 1170 unsigned long align, unsigned long vstart) 1171 { 1172 unsigned long nva_start_addr; 1173 1174 if (va->va_start > vstart) 1175 nva_start_addr = ALIGN(va->va_start, align); 1176 else 1177 nva_start_addr = ALIGN(vstart, align); 1178 1179 /* Can be overflowed due to big size or alignment. */ 1180 if (nva_start_addr + size < nva_start_addr || 1181 nva_start_addr < vstart) 1182 return false; 1183 1184 return (nva_start_addr + size <= va->va_end); 1185 } 1186 1187 /* 1188 * Find the first free block(lowest start address) in the tree, 1189 * that will accomplish the request corresponding to passing 1190 * parameters. 1191 */ 1192 static __always_inline struct vmap_area * 1193 find_vmap_lowest_match(unsigned long size, 1194 unsigned long align, unsigned long vstart) 1195 { 1196 struct vmap_area *va; 1197 struct rb_node *node; 1198 unsigned long length; 1199 1200 /* Start from the root. */ 1201 node = free_vmap_area_root.rb_node; 1202 1203 /* Adjust the search size for alignment overhead. */ 1204 length = size + align - 1; 1205 1206 while (node) { 1207 va = rb_entry(node, struct vmap_area, rb_node); 1208 1209 if (get_subtree_max_size(node->rb_left) >= length && 1210 vstart < va->va_start) { 1211 node = node->rb_left; 1212 } else { 1213 if (is_within_this_va(va, size, align, vstart)) 1214 return va; 1215 1216 /* 1217 * Does not make sense to go deeper towards the right 1218 * sub-tree if it does not have a free block that is 1219 * equal or bigger to the requested search length. 1220 */ 1221 if (get_subtree_max_size(node->rb_right) >= length) { 1222 node = node->rb_right; 1223 continue; 1224 } 1225 1226 /* 1227 * OK. We roll back and find the first right sub-tree, 1228 * that will satisfy the search criteria. It can happen 1229 * only once due to "vstart" restriction. 
1230 */ 1231 while ((node = rb_parent(node))) { 1232 va = rb_entry(node, struct vmap_area, rb_node); 1233 if (is_within_this_va(va, size, align, vstart)) 1234 return va; 1235 1236 if (get_subtree_max_size(node->rb_right) >= length && 1237 vstart <= va->va_start) { 1238 node = node->rb_right; 1239 break; 1240 } 1241 } 1242 } 1243 } 1244 1245 return NULL; 1246 } 1247 1248 #if DEBUG_AUGMENT_LOWEST_MATCH_CHECK 1249 #include <linux/random.h> 1250 1251 static struct vmap_area * 1252 find_vmap_lowest_linear_match(unsigned long size, 1253 unsigned long align, unsigned long vstart) 1254 { 1255 struct vmap_area *va; 1256 1257 list_for_each_entry(va, &free_vmap_area_list, list) { 1258 if (!is_within_this_va(va, size, align, vstart)) 1259 continue; 1260 1261 return va; 1262 } 1263 1264 return NULL; 1265 } 1266 1267 static void 1268 find_vmap_lowest_match_check(unsigned long size) 1269 { 1270 struct vmap_area *va_1, *va_2; 1271 unsigned long vstart; 1272 unsigned int rnd; 1273 1274 get_random_bytes(&rnd, sizeof(rnd)); 1275 vstart = VMALLOC_START + rnd; 1276 1277 va_1 = find_vmap_lowest_match(size, 1, vstart); 1278 va_2 = find_vmap_lowest_linear_match(size, 1, vstart); 1279 1280 if (va_1 != va_2) 1281 pr_emerg("not lowest: t: 0x%p, l: 0x%p, v: 0x%lx\n", 1282 va_1, va_2, vstart); 1283 } 1284 #endif 1285 1286 enum fit_type { 1287 NOTHING_FIT = 0, 1288 FL_FIT_TYPE = 1, /* full fit */ 1289 LE_FIT_TYPE = 2, /* left edge fit */ 1290 RE_FIT_TYPE = 3, /* right edge fit */ 1291 NE_FIT_TYPE = 4 /* no edge fit */ 1292 }; 1293 1294 static __always_inline enum fit_type 1295 classify_va_fit_type(struct vmap_area *va, 1296 unsigned long nva_start_addr, unsigned long size) 1297 { 1298 enum fit_type type; 1299 1300 /* Check if it is within VA. */ 1301 if (nva_start_addr < va->va_start || 1302 nva_start_addr + size > va->va_end) 1303 return NOTHING_FIT; 1304 1305 /* Now classify. */ 1306 if (va->va_start == nva_start_addr) { 1307 if (va->va_end == nva_start_addr + size) 1308 type = FL_FIT_TYPE; 1309 else 1310 type = LE_FIT_TYPE; 1311 } else if (va->va_end == nva_start_addr + size) { 1312 type = RE_FIT_TYPE; 1313 } else { 1314 type = NE_FIT_TYPE; 1315 } 1316 1317 return type; 1318 } 1319 1320 static __always_inline int 1321 adjust_va_to_fit_type(struct vmap_area *va, 1322 unsigned long nva_start_addr, unsigned long size, 1323 enum fit_type type) 1324 { 1325 struct vmap_area *lva = NULL; 1326 1327 if (type == FL_FIT_TYPE) { 1328 /* 1329 * No need to split VA, it fully fits. 1330 * 1331 * | | 1332 * V NVA V 1333 * |---------------| 1334 */ 1335 unlink_va(va, &free_vmap_area_root); 1336 kmem_cache_free(vmap_area_cachep, va); 1337 } else if (type == LE_FIT_TYPE) { 1338 /* 1339 * Split left edge of fit VA. 1340 * 1341 * | | 1342 * V NVA V R 1343 * |-------|-------| 1344 */ 1345 va->va_start += size; 1346 } else if (type == RE_FIT_TYPE) { 1347 /* 1348 * Split right edge of fit VA. 1349 * 1350 * | | 1351 * L V NVA V 1352 * |-------|-------| 1353 */ 1354 va->va_end = nva_start_addr; 1355 } else if (type == NE_FIT_TYPE) { 1356 /* 1357 * Split no edge of fit VA. 1358 * 1359 * | | 1360 * L V NVA V R 1361 * |---|-------|---| 1362 */ 1363 lva = __this_cpu_xchg(ne_fit_preload_node, NULL); 1364 if (unlikely(!lva)) { 1365 /* 1366 * For percpu allocator we do not do any pre-allocation 1367 * and leave it as it is. The reason is it most likely 1368 * never ends up with NE_FIT_TYPE splitting. In case of 1369 * percpu allocations offsets and sizes are aligned to 1370 * fixed align request, i.e. 
RE_FIT_TYPE and FL_FIT_TYPE 1371 * are its main fitting cases. 1372 * 1373 * There are a few exceptions though, as an example it is 1374 * a first allocation (early boot up) when we have "one" 1375 * big free space that has to be split. 1376 * 1377 * Also we can hit this path in case of regular "vmap" 1378 * allocations, if "this" current CPU was not preloaded. 1379 * See the comment in alloc_vmap_area() why. If so, then 1380 * GFP_NOWAIT is used instead to get an extra object for 1381 * split purpose. That is rare and most time does not 1382 * occur. 1383 * 1384 * What happens if an allocation gets failed. Basically, 1385 * an "overflow" path is triggered to purge lazily freed 1386 * areas to free some memory, then, the "retry" path is 1387 * triggered to repeat one more time. See more details 1388 * in alloc_vmap_area() function. 1389 */ 1390 lva = kmem_cache_alloc(vmap_area_cachep, GFP_NOWAIT); 1391 if (!lva) 1392 return -1; 1393 } 1394 1395 /* 1396 * Build the remainder. 1397 */ 1398 lva->va_start = va->va_start; 1399 lva->va_end = nva_start_addr; 1400 1401 /* 1402 * Shrink this VA to remaining size. 1403 */ 1404 va->va_start = nva_start_addr + size; 1405 } else { 1406 return -1; 1407 } 1408 1409 if (type != FL_FIT_TYPE) { 1410 augment_tree_propagate_from(va); 1411 1412 if (lva) /* type == NE_FIT_TYPE */ 1413 insert_vmap_area_augment(lva, &va->rb_node, 1414 &free_vmap_area_root, &free_vmap_area_list); 1415 } 1416 1417 return 0; 1418 } 1419 1420 /* 1421 * Returns a start address of the newly allocated area, if success. 1422 * Otherwise a vend is returned that indicates failure. 1423 */ 1424 static __always_inline unsigned long 1425 __alloc_vmap_area(unsigned long size, unsigned long align, 1426 unsigned long vstart, unsigned long vend) 1427 { 1428 unsigned long nva_start_addr; 1429 struct vmap_area *va; 1430 enum fit_type type; 1431 int ret; 1432 1433 va = find_vmap_lowest_match(size, align, vstart); 1434 if (unlikely(!va)) 1435 return vend; 1436 1437 if (va->va_start > vstart) 1438 nva_start_addr = ALIGN(va->va_start, align); 1439 else 1440 nva_start_addr = ALIGN(vstart, align); 1441 1442 /* Check the "vend" restriction. */ 1443 if (nva_start_addr + size > vend) 1444 return vend; 1445 1446 /* Classify what we have found. */ 1447 type = classify_va_fit_type(va, nva_start_addr, size); 1448 if (WARN_ON_ONCE(type == NOTHING_FIT)) 1449 return vend; 1450 1451 /* Update the free vmap_area. */ 1452 ret = adjust_va_to_fit_type(va, nva_start_addr, size, type); 1453 if (ret) 1454 return vend; 1455 1456 #if DEBUG_AUGMENT_LOWEST_MATCH_CHECK 1457 find_vmap_lowest_match_check(size); 1458 #endif 1459 1460 return nva_start_addr; 1461 } 1462 1463 /* 1464 * Free a region of KVA allocated by alloc_vmap_area 1465 */ 1466 static void free_vmap_area(struct vmap_area *va) 1467 { 1468 /* 1469 * Remove from the busy tree/list. 1470 */ 1471 spin_lock(&vmap_area_lock); 1472 unlink_va(va, &vmap_area_root); 1473 spin_unlock(&vmap_area_lock); 1474 1475 /* 1476 * Insert/Merge it back to the free tree/list. 1477 */ 1478 spin_lock(&free_vmap_area_lock); 1479 merge_or_add_vmap_area_augment(va, &free_vmap_area_root, &free_vmap_area_list); 1480 spin_unlock(&free_vmap_area_lock); 1481 } 1482 1483 static inline void 1484 preload_this_cpu_lock(spinlock_t *lock, gfp_t gfp_mask, int node) 1485 { 1486 struct vmap_area *va = NULL; 1487 1488 /* 1489 * Preload this CPU with one extra vmap_area object. It is used 1490 * when fit type of free area is NE_FIT_TYPE. It guarantees that 1491 * a CPU that does an allocation is preloaded. 
1492 * 1493 * We do it in non-atomic context, thus it allows us to use more 1494 * permissive allocation masks to be more stable under low memory 1495 * condition and high memory pressure. 1496 */ 1497 if (!this_cpu_read(ne_fit_preload_node)) 1498 va = kmem_cache_alloc_node(vmap_area_cachep, gfp_mask, node); 1499 1500 spin_lock(lock); 1501 1502 if (va && __this_cpu_cmpxchg(ne_fit_preload_node, NULL, va)) 1503 kmem_cache_free(vmap_area_cachep, va); 1504 } 1505 1506 /* 1507 * Allocate a region of KVA of the specified size and alignment, within the 1508 * vstart and vend. 1509 */ 1510 static struct vmap_area *alloc_vmap_area(unsigned long size, 1511 unsigned long align, 1512 unsigned long vstart, unsigned long vend, 1513 int node, gfp_t gfp_mask) 1514 { 1515 struct vmap_area *va; 1516 unsigned long freed; 1517 unsigned long addr; 1518 int purged = 0; 1519 int ret; 1520 1521 BUG_ON(!size); 1522 BUG_ON(offset_in_page(size)); 1523 BUG_ON(!is_power_of_2(align)); 1524 1525 if (unlikely(!vmap_initialized)) 1526 return ERR_PTR(-EBUSY); 1527 1528 might_sleep(); 1529 gfp_mask = gfp_mask & GFP_RECLAIM_MASK; 1530 1531 va = kmem_cache_alloc_node(vmap_area_cachep, gfp_mask, node); 1532 if (unlikely(!va)) 1533 return ERR_PTR(-ENOMEM); 1534 1535 /* 1536 * Only scan the relevant parts containing pointers to other objects 1537 * to avoid false negatives. 1538 */ 1539 kmemleak_scan_area(&va->rb_node, SIZE_MAX, gfp_mask); 1540 1541 retry: 1542 preload_this_cpu_lock(&free_vmap_area_lock, gfp_mask, node); 1543 addr = __alloc_vmap_area(size, align, vstart, vend); 1544 spin_unlock(&free_vmap_area_lock); 1545 1546 /* 1547 * If an allocation fails, the "vend" address is 1548 * returned. Therefore trigger the overflow path. 1549 */ 1550 if (unlikely(addr == vend)) 1551 goto overflow; 1552 1553 va->va_start = addr; 1554 va->va_end = addr + size; 1555 va->vm = NULL; 1556 1557 spin_lock(&vmap_area_lock); 1558 insert_vmap_area(va, &vmap_area_root, &vmap_area_list); 1559 spin_unlock(&vmap_area_lock); 1560 1561 BUG_ON(!IS_ALIGNED(va->va_start, align)); 1562 BUG_ON(va->va_start < vstart); 1563 BUG_ON(va->va_end > vend); 1564 1565 ret = kasan_populate_vmalloc(addr, size); 1566 if (ret) { 1567 free_vmap_area(va); 1568 return ERR_PTR(ret); 1569 } 1570 1571 return va; 1572 1573 overflow: 1574 if (!purged) { 1575 purge_vmap_area_lazy(); 1576 purged = 1; 1577 goto retry; 1578 } 1579 1580 freed = 0; 1581 blocking_notifier_call_chain(&vmap_notify_list, 0, &freed); 1582 1583 if (freed > 0) { 1584 purged = 0; 1585 goto retry; 1586 } 1587 1588 if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit()) 1589 pr_warn("vmap allocation for size %lu failed: use vmalloc=<size> to increase size\n", 1590 size); 1591 1592 kmem_cache_free(vmap_area_cachep, va); 1593 return ERR_PTR(-EBUSY); 1594 } 1595 1596 int register_vmap_purge_notifier(struct notifier_block *nb) 1597 { 1598 return blocking_notifier_chain_register(&vmap_notify_list, nb); 1599 } 1600 EXPORT_SYMBOL_GPL(register_vmap_purge_notifier); 1601 1602 int unregister_vmap_purge_notifier(struct notifier_block *nb) 1603 { 1604 return blocking_notifier_chain_unregister(&vmap_notify_list, nb); 1605 } 1606 EXPORT_SYMBOL_GPL(unregister_vmap_purge_notifier); 1607 1608 /* 1609 * lazy_max_pages is the maximum amount of virtual address space we gather up 1610 * before attempting to purge with a TLB flush. 
1611 * 1612 * There is a tradeoff here: a larger number will cover more kernel page tables 1613 * and take slightly longer to purge, but it will linearly reduce the number of 1614 * global TLB flushes that must be performed. It would seem natural to scale 1615 * this number up linearly with the number of CPUs (because vmapping activity 1616 * could also scale linearly with the number of CPUs), however it is likely 1617 * that in practice, workloads might be constrained in other ways that mean 1618 * vmap activity will not scale linearly with CPUs. Also, I want to be 1619 * conservative and not introduce a big latency on huge systems, so go with 1620 * a less aggressive log scale. It will still be an improvement over the old 1621 * code, and it will be simple to change the scale factor if we find that it 1622 * becomes a problem on bigger systems. 1623 */ 1624 static unsigned long lazy_max_pages(void) 1625 { 1626 unsigned int log; 1627 1628 log = fls(num_online_cpus()); 1629 1630 return log * (32UL * 1024 * 1024 / PAGE_SIZE); 1631 } 1632 1633 static atomic_long_t vmap_lazy_nr = ATOMIC_LONG_INIT(0); 1634 1635 /* 1636 * Serialize vmap purging. There is no actual critical section protected 1637 * by this look, but we want to avoid concurrent calls for performance 1638 * reasons and to make the pcpu_get_vm_areas more deterministic. 1639 */ 1640 static DEFINE_MUTEX(vmap_purge_lock); 1641 1642 /* for per-CPU blocks */ 1643 static void purge_fragmented_blocks_allcpus(void); 1644 1645 #ifdef CONFIG_X86_64 1646 /* 1647 * called before a call to iounmap() if the caller wants vm_area_struct's 1648 * immediately freed. 1649 */ 1650 void set_iounmap_nonlazy(void) 1651 { 1652 atomic_long_set(&vmap_lazy_nr, lazy_max_pages()+1); 1653 } 1654 #endif /* CONFIG_X86_64 */ 1655 1656 /* 1657 * Purges all lazily-freed vmap areas. 1658 */ 1659 static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end) 1660 { 1661 unsigned long resched_threshold; 1662 struct list_head local_pure_list; 1663 struct vmap_area *va, *n_va; 1664 1665 lockdep_assert_held(&vmap_purge_lock); 1666 1667 spin_lock(&purge_vmap_area_lock); 1668 purge_vmap_area_root = RB_ROOT; 1669 list_replace_init(&purge_vmap_area_list, &local_pure_list); 1670 spin_unlock(&purge_vmap_area_lock); 1671 1672 if (unlikely(list_empty(&local_pure_list))) 1673 return false; 1674 1675 start = min(start, 1676 list_first_entry(&local_pure_list, 1677 struct vmap_area, list)->va_start); 1678 1679 end = max(end, 1680 list_last_entry(&local_pure_list, 1681 struct vmap_area, list)->va_end); 1682 1683 flush_tlb_kernel_range(start, end); 1684 resched_threshold = lazy_max_pages() << 1; 1685 1686 spin_lock(&free_vmap_area_lock); 1687 list_for_each_entry_safe(va, n_va, &local_pure_list, list) { 1688 unsigned long nr = (va->va_end - va->va_start) >> PAGE_SHIFT; 1689 unsigned long orig_start = va->va_start; 1690 unsigned long orig_end = va->va_end; 1691 1692 /* 1693 * Finally insert or merge lazily-freed area. It is 1694 * detached and there is no need to "unlink" it from 1695 * anything. 
1696 */ 1697 va = merge_or_add_vmap_area_augment(va, &free_vmap_area_root, 1698 &free_vmap_area_list); 1699 1700 if (!va) 1701 continue; 1702 1703 if (is_vmalloc_or_module_addr((void *)orig_start)) 1704 kasan_release_vmalloc(orig_start, orig_end, 1705 va->va_start, va->va_end); 1706 1707 atomic_long_sub(nr, &vmap_lazy_nr); 1708 1709 if (atomic_long_read(&vmap_lazy_nr) < resched_threshold) 1710 cond_resched_lock(&free_vmap_area_lock); 1711 } 1712 spin_unlock(&free_vmap_area_lock); 1713 return true; 1714 } 1715 1716 /* 1717 * Kick off a purge of the outstanding lazy areas. Don't bother if somebody 1718 * is already purging. 1719 */ 1720 static void try_purge_vmap_area_lazy(void) 1721 { 1722 if (mutex_trylock(&vmap_purge_lock)) { 1723 __purge_vmap_area_lazy(ULONG_MAX, 0); 1724 mutex_unlock(&vmap_purge_lock); 1725 } 1726 } 1727 1728 /* 1729 * Kick off a purge of the outstanding lazy areas. 1730 */ 1731 static void purge_vmap_area_lazy(void) 1732 { 1733 mutex_lock(&vmap_purge_lock); 1734 purge_fragmented_blocks_allcpus(); 1735 __purge_vmap_area_lazy(ULONG_MAX, 0); 1736 mutex_unlock(&vmap_purge_lock); 1737 } 1738 1739 /* 1740 * Free a vmap area, caller ensuring that the area has been unmapped 1741 * and flush_cache_vunmap had been called for the correct range 1742 * previously. 1743 */ 1744 static void free_vmap_area_noflush(struct vmap_area *va) 1745 { 1746 unsigned long nr_lazy; 1747 1748 spin_lock(&vmap_area_lock); 1749 unlink_va(va, &vmap_area_root); 1750 spin_unlock(&vmap_area_lock); 1751 1752 nr_lazy = atomic_long_add_return((va->va_end - va->va_start) >> 1753 PAGE_SHIFT, &vmap_lazy_nr); 1754 1755 /* 1756 * Merge or place it to the purge tree/list. 1757 */ 1758 spin_lock(&purge_vmap_area_lock); 1759 merge_or_add_vmap_area(va, 1760 &purge_vmap_area_root, &purge_vmap_area_list); 1761 spin_unlock(&purge_vmap_area_lock); 1762 1763 /* After this point, we may free va at any time */ 1764 if (unlikely(nr_lazy > lazy_max_pages())) 1765 try_purge_vmap_area_lazy(); 1766 } 1767 1768 /* 1769 * Free and unmap a vmap area 1770 */ 1771 static void free_unmap_vmap_area(struct vmap_area *va) 1772 { 1773 flush_cache_vunmap(va->va_start, va->va_end); 1774 vunmap_range_noflush(va->va_start, va->va_end); 1775 if (debug_pagealloc_enabled_static()) 1776 flush_tlb_kernel_range(va->va_start, va->va_end); 1777 1778 free_vmap_area_noflush(va); 1779 } 1780 1781 static struct vmap_area *find_vmap_area(unsigned long addr) 1782 { 1783 struct vmap_area *va; 1784 1785 spin_lock(&vmap_area_lock); 1786 va = __find_vmap_area(addr); 1787 spin_unlock(&vmap_area_lock); 1788 1789 return va; 1790 } 1791 1792 /*** Per cpu kva allocator ***/ 1793 1794 /* 1795 * vmap space is limited especially on 32 bit architectures. Ensure there is 1796 * room for at least 16 percpu vmap blocks per CPU. 1797 */ 1798 /* 1799 * If we had a constant VMALLOC_START and VMALLOC_END, we'd like to be able 1800 * to #define VMALLOC_SPACE (VMALLOC_END-VMALLOC_START). Guess 1801 * instead (we just need a rough idea) 1802 */ 1803 #if BITS_PER_LONG == 32 1804 #define VMALLOC_SPACE (128UL*1024*1024) 1805 #else 1806 #define VMALLOC_SPACE (128UL*1024*1024*1024) 1807 #endif 1808 1809 #define VMALLOC_PAGES (VMALLOC_SPACE / PAGE_SIZE) 1810 #define VMAP_MAX_ALLOC BITS_PER_LONG /* 256K with 4K pages */ 1811 #define VMAP_BBMAP_BITS_MAX 1024 /* 4MB with 4K pages */ 1812 #define VMAP_BBMAP_BITS_MIN (VMAP_MAX_ALLOC*2) 1813 #define VMAP_MIN(x, y) ((x) < (y) ? (x) : (y)) /* can't use min() */ 1814 #define VMAP_MAX(x, y) ((x) > (y) ? 
(x) : (y)) /* can't use max() */ 1815 #define VMAP_BBMAP_BITS \ 1816 VMAP_MIN(VMAP_BBMAP_BITS_MAX, \ 1817 VMAP_MAX(VMAP_BBMAP_BITS_MIN, \ 1818 VMALLOC_PAGES / roundup_pow_of_two(NR_CPUS) / 16)) 1819 1820 #define VMAP_BLOCK_SIZE (VMAP_BBMAP_BITS * PAGE_SIZE) 1821 1822 struct vmap_block_queue { 1823 spinlock_t lock; 1824 struct list_head free; 1825 }; 1826 1827 struct vmap_block { 1828 spinlock_t lock; 1829 struct vmap_area *va; 1830 unsigned long free, dirty; 1831 unsigned long dirty_min, dirty_max; /*< dirty range */ 1832 struct list_head free_list; 1833 struct rcu_head rcu_head; 1834 struct list_head purge; 1835 }; 1836 1837 /* Queue of free and dirty vmap blocks, for allocation and flushing purposes */ 1838 static DEFINE_PER_CPU(struct vmap_block_queue, vmap_block_queue); 1839 1840 /* 1841 * XArray of vmap blocks, indexed by address, to quickly find a vmap block 1842 * in the free path. Could get rid of this if we change the API to return a 1843 * "cookie" from alloc, to be passed to free. But no big deal yet. 1844 */ 1845 static DEFINE_XARRAY(vmap_blocks); 1846 1847 /* 1848 * We should probably have a fallback mechanism to allocate virtual memory 1849 * out of partially filled vmap blocks. However vmap block sizing should be 1850 * fairly reasonable according to the vmalloc size, so it shouldn't be a 1851 * big problem. 1852 */ 1853 1854 static unsigned long addr_to_vb_idx(unsigned long addr) 1855 { 1856 addr -= VMALLOC_START & ~(VMAP_BLOCK_SIZE-1); 1857 addr /= VMAP_BLOCK_SIZE; 1858 return addr; 1859 } 1860 1861 static void *vmap_block_vaddr(unsigned long va_start, unsigned long pages_off) 1862 { 1863 unsigned long addr; 1864 1865 addr = va_start + (pages_off << PAGE_SHIFT); 1866 BUG_ON(addr_to_vb_idx(addr) != addr_to_vb_idx(va_start)); 1867 return (void *)addr; 1868 } 1869 1870 /** 1871 * new_vmap_block - allocates new vmap_block and occupies 2^order pages in this 1872 * block. 
Of course pages number can't exceed VMAP_BBMAP_BITS 1873 * @order: how many 2^order pages should be occupied in newly allocated block 1874 * @gfp_mask: flags for the page level allocator 1875 * 1876 * Return: virtual address in a newly allocated block or ERR_PTR(-errno) 1877 */ 1878 static void *new_vmap_block(unsigned int order, gfp_t gfp_mask) 1879 { 1880 struct vmap_block_queue *vbq; 1881 struct vmap_block *vb; 1882 struct vmap_area *va; 1883 unsigned long vb_idx; 1884 int node, err; 1885 void *vaddr; 1886 1887 node = numa_node_id(); 1888 1889 vb = kmalloc_node(sizeof(struct vmap_block), 1890 gfp_mask & GFP_RECLAIM_MASK, node); 1891 if (unlikely(!vb)) 1892 return ERR_PTR(-ENOMEM); 1893 1894 va = alloc_vmap_area(VMAP_BLOCK_SIZE, VMAP_BLOCK_SIZE, 1895 VMALLOC_START, VMALLOC_END, 1896 node, gfp_mask); 1897 if (IS_ERR(va)) { 1898 kfree(vb); 1899 return ERR_CAST(va); 1900 } 1901 1902 vaddr = vmap_block_vaddr(va->va_start, 0); 1903 spin_lock_init(&vb->lock); 1904 vb->va = va; 1905 /* At least something should be left free */ 1906 BUG_ON(VMAP_BBMAP_BITS <= (1UL << order)); 1907 vb->free = VMAP_BBMAP_BITS - (1UL << order); 1908 vb->dirty = 0; 1909 vb->dirty_min = VMAP_BBMAP_BITS; 1910 vb->dirty_max = 0; 1911 INIT_LIST_HEAD(&vb->free_list); 1912 1913 vb_idx = addr_to_vb_idx(va->va_start); 1914 err = xa_insert(&vmap_blocks, vb_idx, vb, gfp_mask); 1915 if (err) { 1916 kfree(vb); 1917 free_vmap_area(va); 1918 return ERR_PTR(err); 1919 } 1920 1921 vbq = &get_cpu_var(vmap_block_queue); 1922 spin_lock(&vbq->lock); 1923 list_add_tail_rcu(&vb->free_list, &vbq->free); 1924 spin_unlock(&vbq->lock); 1925 put_cpu_var(vmap_block_queue); 1926 1927 return vaddr; 1928 } 1929 1930 static void free_vmap_block(struct vmap_block *vb) 1931 { 1932 struct vmap_block *tmp; 1933 1934 tmp = xa_erase(&vmap_blocks, addr_to_vb_idx(vb->va->va_start)); 1935 BUG_ON(tmp != vb); 1936 1937 free_vmap_area_noflush(vb->va); 1938 kfree_rcu(vb, rcu_head); 1939 } 1940 1941 static void purge_fragmented_blocks(int cpu) 1942 { 1943 LIST_HEAD(purge); 1944 struct vmap_block *vb; 1945 struct vmap_block *n_vb; 1946 struct vmap_block_queue *vbq = &per_cpu(vmap_block_queue, cpu); 1947 1948 rcu_read_lock(); 1949 list_for_each_entry_rcu(vb, &vbq->free, free_list) { 1950 1951 if (!(vb->free + vb->dirty == VMAP_BBMAP_BITS && vb->dirty != VMAP_BBMAP_BITS)) 1952 continue; 1953 1954 spin_lock(&vb->lock); 1955 if (vb->free + vb->dirty == VMAP_BBMAP_BITS && vb->dirty != VMAP_BBMAP_BITS) { 1956 vb->free = 0; /* prevent further allocs after releasing lock */ 1957 vb->dirty = VMAP_BBMAP_BITS; /* prevent purging it again */ 1958 vb->dirty_min = 0; 1959 vb->dirty_max = VMAP_BBMAP_BITS; 1960 spin_lock(&vbq->lock); 1961 list_del_rcu(&vb->free_list); 1962 spin_unlock(&vbq->lock); 1963 spin_unlock(&vb->lock); 1964 list_add_tail(&vb->purge, &purge); 1965 } else 1966 spin_unlock(&vb->lock); 1967 } 1968 rcu_read_unlock(); 1969 1970 list_for_each_entry_safe(vb, n_vb, &purge, purge) { 1971 list_del(&vb->purge); 1972 free_vmap_block(vb); 1973 } 1974 } 1975 1976 static void purge_fragmented_blocks_allcpus(void) 1977 { 1978 int cpu; 1979 1980 for_each_possible_cpu(cpu) 1981 purge_fragmented_blocks(cpu); 1982 } 1983 1984 static void *vb_alloc(unsigned long size, gfp_t gfp_mask) 1985 { 1986 struct vmap_block_queue *vbq; 1987 struct vmap_block *vb; 1988 void *vaddr = NULL; 1989 unsigned int order; 1990 1991 BUG_ON(offset_in_page(size)); 1992 BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC); 1993 if (WARN_ON(size == 0)) { 1994 /* 1995 * Allocating 0 bytes isn't what caller wants 
since 1996 * get_order(0) returns funny result. Just warn and terminate 1997 * early. 1998 */ 1999 return NULL; 2000 } 2001 order = get_order(size); 2002 2003 rcu_read_lock(); 2004 vbq = &get_cpu_var(vmap_block_queue); 2005 list_for_each_entry_rcu(vb, &vbq->free, free_list) { 2006 unsigned long pages_off; 2007 2008 spin_lock(&vb->lock); 2009 if (vb->free < (1UL << order)) { 2010 spin_unlock(&vb->lock); 2011 continue; 2012 } 2013 2014 pages_off = VMAP_BBMAP_BITS - vb->free; 2015 vaddr = vmap_block_vaddr(vb->va->va_start, pages_off); 2016 vb->free -= 1UL << order; 2017 if (vb->free == 0) { 2018 spin_lock(&vbq->lock); 2019 list_del_rcu(&vb->free_list); 2020 spin_unlock(&vbq->lock); 2021 } 2022 2023 spin_unlock(&vb->lock); 2024 break; 2025 } 2026 2027 put_cpu_var(vmap_block_queue); 2028 rcu_read_unlock(); 2029 2030 /* Allocate new block if nothing was found */ 2031 if (!vaddr) 2032 vaddr = new_vmap_block(order, gfp_mask); 2033 2034 return vaddr; 2035 } 2036 2037 static void vb_free(unsigned long addr, unsigned long size) 2038 { 2039 unsigned long offset; 2040 unsigned int order; 2041 struct vmap_block *vb; 2042 2043 BUG_ON(offset_in_page(size)); 2044 BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC); 2045 2046 flush_cache_vunmap(addr, addr + size); 2047 2048 order = get_order(size); 2049 offset = (addr & (VMAP_BLOCK_SIZE - 1)) >> PAGE_SHIFT; 2050 vb = xa_load(&vmap_blocks, addr_to_vb_idx(addr)); 2051 2052 vunmap_range_noflush(addr, addr + size); 2053 2054 if (debug_pagealloc_enabled_static()) 2055 flush_tlb_kernel_range(addr, addr + size); 2056 2057 spin_lock(&vb->lock); 2058 2059 /* Expand dirty range */ 2060 vb->dirty_min = min(vb->dirty_min, offset); 2061 vb->dirty_max = max(vb->dirty_max, offset + (1UL << order)); 2062 2063 vb->dirty += 1UL << order; 2064 if (vb->dirty == VMAP_BBMAP_BITS) { 2065 BUG_ON(vb->free); 2066 spin_unlock(&vb->lock); 2067 free_vmap_block(vb); 2068 } else 2069 spin_unlock(&vb->lock); 2070 } 2071 2072 static void _vm_unmap_aliases(unsigned long start, unsigned long end, int flush) 2073 { 2074 int cpu; 2075 2076 if (unlikely(!vmap_initialized)) 2077 return; 2078 2079 might_sleep(); 2080 2081 for_each_possible_cpu(cpu) { 2082 struct vmap_block_queue *vbq = &per_cpu(vmap_block_queue, cpu); 2083 struct vmap_block *vb; 2084 2085 rcu_read_lock(); 2086 list_for_each_entry_rcu(vb, &vbq->free, free_list) { 2087 spin_lock(&vb->lock); 2088 if (vb->dirty && vb->dirty != VMAP_BBMAP_BITS) { 2089 unsigned long va_start = vb->va->va_start; 2090 unsigned long s, e; 2091 2092 s = va_start + (vb->dirty_min << PAGE_SHIFT); 2093 e = va_start + (vb->dirty_max << PAGE_SHIFT); 2094 2095 start = min(s, start); 2096 end = max(e, end); 2097 2098 flush = 1; 2099 } 2100 spin_unlock(&vb->lock); 2101 } 2102 rcu_read_unlock(); 2103 } 2104 2105 mutex_lock(&vmap_purge_lock); 2106 purge_fragmented_blocks_allcpus(); 2107 if (!__purge_vmap_area_lazy(start, end) && flush) 2108 flush_tlb_kernel_range(start, end); 2109 mutex_unlock(&vmap_purge_lock); 2110 } 2111 2112 /** 2113 * vm_unmap_aliases - unmap outstanding lazy aliases in the vmap layer 2114 * 2115 * The vmap/vmalloc layer lazily flushes kernel virtual mappings primarily 2116 * to amortize TLB flushing overheads. What this means is that any page you 2117 * have now, may, in a former life, have been mapped into kernel virtual 2118 * address by the vmap layer and so there might be some CPUs with TLB entries 2119 * still referencing that page (additional to the regular 1:1 kernel mapping). 2120 * 2121 * vm_unmap_aliases flushes all such lazy mappings. 
After it returns, we can 2122 * be sure that none of the pages we have control over will have any aliases 2123 * from the vmap layer. 2124 */ 2125 void vm_unmap_aliases(void) 2126 { 2127 unsigned long start = ULONG_MAX, end = 0; 2128 int flush = 0; 2129 2130 _vm_unmap_aliases(start, end, flush); 2131 } 2132 EXPORT_SYMBOL_GPL(vm_unmap_aliases); 2133 2134 /** 2135 * vm_unmap_ram - unmap linear kernel address space set up by vm_map_ram 2136 * @mem: the pointer returned by vm_map_ram 2137 * @count: the count passed to that vm_map_ram call (cannot unmap partial) 2138 */ 2139 void vm_unmap_ram(const void *mem, unsigned int count) 2140 { 2141 unsigned long size = (unsigned long)count << PAGE_SHIFT; 2142 unsigned long addr = (unsigned long)mem; 2143 struct vmap_area *va; 2144 2145 might_sleep(); 2146 BUG_ON(!addr); 2147 BUG_ON(addr < VMALLOC_START); 2148 BUG_ON(addr > VMALLOC_END); 2149 BUG_ON(!PAGE_ALIGNED(addr)); 2150 2151 kasan_poison_vmalloc(mem, size); 2152 2153 if (likely(count <= VMAP_MAX_ALLOC)) { 2154 debug_check_no_locks_freed(mem, size); 2155 vb_free(addr, size); 2156 return; 2157 } 2158 2159 va = find_vmap_area(addr); 2160 BUG_ON(!va); 2161 debug_check_no_locks_freed((void *)va->va_start, 2162 (va->va_end - va->va_start)); 2163 free_unmap_vmap_area(va); 2164 } 2165 EXPORT_SYMBOL(vm_unmap_ram); 2166 2167 /** 2168 * vm_map_ram - map pages linearly into kernel virtual address (vmalloc space) 2169 * @pages: an array of pointers to the pages to be mapped 2170 * @count: number of pages 2171 * @node: prefer to allocate data structures on this node 2172 * 2173 * If you use this function for less than VMAP_MAX_ALLOC pages, it can be 2174 * faster than vmap(). However, mixing long-lived and short-lived objects 2175 * mapped with vm_map_ram() can badly fragment this address space 2176 * (especially on a 32bit machine), and allocations may eventually start to 2177 * fail. Please use this function for short-lived objects.
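 *
 * A minimal usage sketch, for illustration only ("pages" and "nr" are
 * assumed to have been set up by the caller, e.g. from alloc_page()):
 *
 *	void *va = vm_map_ram(pages, nr, NUMA_NO_NODE);
 *
 *	if (va) {
 *		memset(va, 0, (unsigned long)nr << PAGE_SHIFT);
 *		vm_unmap_ram(va, nr);
 *	}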
2178 * 2179 * Returns: a pointer to the address that has been mapped, or %NULL on failure 2180 */ 2181 void *vm_map_ram(struct page **pages, unsigned int count, int node) 2182 { 2183 unsigned long size = (unsigned long)count << PAGE_SHIFT; 2184 unsigned long addr; 2185 void *mem; 2186 2187 if (likely(count <= VMAP_MAX_ALLOC)) { 2188 mem = vb_alloc(size, GFP_KERNEL); 2189 if (IS_ERR(mem)) 2190 return NULL; 2191 addr = (unsigned long)mem; 2192 } else { 2193 struct vmap_area *va; 2194 va = alloc_vmap_area(size, PAGE_SIZE, 2195 VMALLOC_START, VMALLOC_END, node, GFP_KERNEL); 2196 if (IS_ERR(va)) 2197 return NULL; 2198 2199 addr = va->va_start; 2200 mem = (void *)addr; 2201 } 2202 2203 kasan_unpoison_vmalloc(mem, size); 2204 2205 if (vmap_pages_range(addr, addr + size, PAGE_KERNEL, 2206 pages, PAGE_SHIFT) < 0) { 2207 vm_unmap_ram(mem, count); 2208 return NULL; 2209 } 2210 2211 return mem; 2212 } 2213 EXPORT_SYMBOL(vm_map_ram); 2214 2215 static struct vm_struct *vmlist __initdata; 2216 2217 static inline unsigned int vm_area_page_order(struct vm_struct *vm) 2218 { 2219 #ifdef CONFIG_HAVE_ARCH_HUGE_VMALLOC 2220 return vm->page_order; 2221 #else 2222 return 0; 2223 #endif 2224 } 2225 2226 static inline void set_vm_area_page_order(struct vm_struct *vm, unsigned int order) 2227 { 2228 #ifdef CONFIG_HAVE_ARCH_HUGE_VMALLOC 2229 vm->page_order = order; 2230 #else 2231 BUG_ON(order != 0); 2232 #endif 2233 } 2234 2235 /** 2236 * vm_area_add_early - add vmap area early during boot 2237 * @vm: vm_struct to add 2238 * 2239 * This function is used to add fixed kernel vm area to vmlist before 2240 * vmalloc_init() is called. @vm->addr, @vm->size, and @vm->flags 2241 * should contain proper values and the other fields should be zero. 2242 * 2243 * DO NOT USE THIS FUNCTION UNLESS YOU KNOW WHAT YOU'RE DOING. 2244 */ 2245 void __init vm_area_add_early(struct vm_struct *vm) 2246 { 2247 struct vm_struct *tmp, **p; 2248 2249 BUG_ON(vmap_initialized); 2250 for (p = &vmlist; (tmp = *p) != NULL; p = &tmp->next) { 2251 if (tmp->addr >= vm->addr) { 2252 BUG_ON(tmp->addr < vm->addr + vm->size); 2253 break; 2254 } else 2255 BUG_ON(tmp->addr + tmp->size > vm->addr); 2256 } 2257 vm->next = *p; 2258 *p = vm; 2259 } 2260 2261 /** 2262 * vm_area_register_early - register vmap area early during boot 2263 * @vm: vm_struct to register 2264 * @align: requested alignment 2265 * 2266 * This function is used to register kernel vm area before 2267 * vmalloc_init() is called. @vm->size and @vm->flags should contain 2268 * proper values on entry and other fields should be zero. On return, 2269 * vm->addr contains the allocated address. 2270 * 2271 * DO NOT USE THIS FUNCTION UNLESS YOU KNOW WHAT YOU'RE DOING. 
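 *
 * Purely illustrative sketch of the expected calling pattern (the static
 * vm_struct, its size and its flags are assumptions, not taken from a
 * real caller):
 *
 *	static struct vm_struct early_map;
 *
 *	early_map.flags = VM_ALLOC;
 *	early_map.size = SZ_1M;
 *	vm_area_register_early(&early_map, PAGE_SIZE);
 *
 * after which early_map.addr holds the reserved virtual address.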
2272 */ 2273 void __init vm_area_register_early(struct vm_struct *vm, size_t align) 2274 { 2275 static size_t vm_init_off __initdata; 2276 unsigned long addr; 2277 2278 addr = ALIGN(VMALLOC_START + vm_init_off, align); 2279 vm_init_off = PFN_ALIGN(addr + vm->size) - VMALLOC_START; 2280 2281 vm->addr = (void *)addr; 2282 2283 vm_area_add_early(vm); 2284 } 2285 2286 static void vmap_init_free_space(void) 2287 { 2288 unsigned long vmap_start = 1; 2289 const unsigned long vmap_end = ULONG_MAX; 2290 struct vmap_area *busy, *free; 2291 2292 /* 2293 * B F B B B F 2294 * -|-----|.....|-----|-----|-----|.....|- 2295 * | The KVA space | 2296 * |<--------------------------------->| 2297 */ 2298 list_for_each_entry(busy, &vmap_area_list, list) { 2299 if (busy->va_start - vmap_start > 0) { 2300 free = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT); 2301 if (!WARN_ON_ONCE(!free)) { 2302 free->va_start = vmap_start; 2303 free->va_end = busy->va_start; 2304 2305 insert_vmap_area_augment(free, NULL, 2306 &free_vmap_area_root, 2307 &free_vmap_area_list); 2308 } 2309 } 2310 2311 vmap_start = busy->va_end; 2312 } 2313 2314 if (vmap_end - vmap_start > 0) { 2315 free = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT); 2316 if (!WARN_ON_ONCE(!free)) { 2317 free->va_start = vmap_start; 2318 free->va_end = vmap_end; 2319 2320 insert_vmap_area_augment(free, NULL, 2321 &free_vmap_area_root, 2322 &free_vmap_area_list); 2323 } 2324 } 2325 } 2326 2327 void __init vmalloc_init(void) 2328 { 2329 struct vmap_area *va; 2330 struct vm_struct *tmp; 2331 int i; 2332 2333 /* 2334 * Create the cache for vmap_area objects. 2335 */ 2336 vmap_area_cachep = KMEM_CACHE(vmap_area, SLAB_PANIC); 2337 2338 for_each_possible_cpu(i) { 2339 struct vmap_block_queue *vbq; 2340 struct vfree_deferred *p; 2341 2342 vbq = &per_cpu(vmap_block_queue, i); 2343 spin_lock_init(&vbq->lock); 2344 INIT_LIST_HEAD(&vbq->free); 2345 p = &per_cpu(vfree_deferred, i); 2346 init_llist_head(&p->list); 2347 INIT_WORK(&p->wq, free_work); 2348 } 2349 2350 /* Import existing vmlist entries. */ 2351 for (tmp = vmlist; tmp; tmp = tmp->next) { 2352 va = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT); 2353 if (WARN_ON_ONCE(!va)) 2354 continue; 2355 2356 va->va_start = (unsigned long)tmp->addr; 2357 va->va_end = va->va_start + tmp->size; 2358 va->vm = tmp; 2359 insert_vmap_area(va, &vmap_area_root, &vmap_area_list); 2360 } 2361 2362 /* 2363 * Now we can initialize a free vmap space. 2364 */ 2365 vmap_init_free_space(); 2366 vmap_initialized = true; 2367 } 2368 2369 static inline void setup_vmalloc_vm_locked(struct vm_struct *vm, 2370 struct vmap_area *va, unsigned long flags, const void *caller) 2371 { 2372 vm->flags = flags; 2373 vm->addr = (void *)va->va_start; 2374 vm->size = va->va_end - va->va_start; 2375 vm->caller = caller; 2376 va->vm = vm; 2377 } 2378 2379 static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va, 2380 unsigned long flags, const void *caller) 2381 { 2382 spin_lock(&vmap_area_lock); 2383 setup_vmalloc_vm_locked(vm, va, flags, caller); 2384 spin_unlock(&vmap_area_lock); 2385 } 2386 2387 static void clear_vm_uninitialized_flag(struct vm_struct *vm) 2388 { 2389 /* 2390 * Before removing VM_UNINITIALIZED, 2391 * we should make sure that vm has proper values. 2392 * Pair with smp_rmb() in show_numa_info(). 
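 * That is, every vm_struct field written while the area was being set up
 * must be visible to other CPUs before they can observe VM_UNINITIALIZED
 * cleared.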
2393 */ 2394 smp_wmb(); 2395 vm->flags &= ~VM_UNINITIALIZED; 2396 } 2397 2398 static struct vm_struct *__get_vm_area_node(unsigned long size, 2399 unsigned long align, unsigned long shift, unsigned long flags, 2400 unsigned long start, unsigned long end, int node, 2401 gfp_t gfp_mask, const void *caller) 2402 { 2403 struct vmap_area *va; 2404 struct vm_struct *area; 2405 unsigned long requested_size = size; 2406 2407 BUG_ON(in_interrupt()); 2408 size = ALIGN(size, 1ul << shift); 2409 if (unlikely(!size)) 2410 return NULL; 2411 2412 if (flags & VM_IOREMAP) 2413 align = 1ul << clamp_t(int, get_count_order_long(size), 2414 PAGE_SHIFT, IOREMAP_MAX_ORDER); 2415 2416 area = kzalloc_node(sizeof(*area), gfp_mask & GFP_RECLAIM_MASK, node); 2417 if (unlikely(!area)) 2418 return NULL; 2419 2420 if (!(flags & VM_NO_GUARD)) 2421 size += PAGE_SIZE; 2422 2423 va = alloc_vmap_area(size, align, start, end, node, gfp_mask); 2424 if (IS_ERR(va)) { 2425 kfree(area); 2426 return NULL; 2427 } 2428 2429 kasan_unpoison_vmalloc((void *)va->va_start, requested_size); 2430 2431 setup_vmalloc_vm(area, va, flags, caller); 2432 2433 return area; 2434 } 2435 2436 struct vm_struct *__get_vm_area_caller(unsigned long size, unsigned long flags, 2437 unsigned long start, unsigned long end, 2438 const void *caller) 2439 { 2440 return __get_vm_area_node(size, 1, PAGE_SHIFT, flags, start, end, 2441 NUMA_NO_NODE, GFP_KERNEL, caller); 2442 } 2443 2444 /** 2445 * get_vm_area - reserve a contiguous kernel virtual area 2446 * @size: size of the area 2447 * @flags: %VM_IOREMAP for I/O mappings or VM_ALLOC 2448 * 2449 * Search an area of @size in the kernel virtual mapping area, 2450 * and reserved it for out purposes. Returns the area descriptor 2451 * on success or %NULL on failure. 2452 * 2453 * Return: the area descriptor on success or %NULL on failure. 2454 */ 2455 struct vm_struct *get_vm_area(unsigned long size, unsigned long flags) 2456 { 2457 return __get_vm_area_node(size, 1, PAGE_SHIFT, flags, 2458 VMALLOC_START, VMALLOC_END, 2459 NUMA_NO_NODE, GFP_KERNEL, 2460 __builtin_return_address(0)); 2461 } 2462 2463 struct vm_struct *get_vm_area_caller(unsigned long size, unsigned long flags, 2464 const void *caller) 2465 { 2466 return __get_vm_area_node(size, 1, PAGE_SHIFT, flags, 2467 VMALLOC_START, VMALLOC_END, 2468 NUMA_NO_NODE, GFP_KERNEL, caller); 2469 } 2470 2471 /** 2472 * find_vm_area - find a continuous kernel virtual area 2473 * @addr: base address 2474 * 2475 * Search for the kernel VM area starting at @addr, and return it. 2476 * It is up to the caller to do all required locking to keep the returned 2477 * pointer valid. 2478 * 2479 * Return: the area descriptor on success or %NULL on failure. 2480 */ 2481 struct vm_struct *find_vm_area(const void *addr) 2482 { 2483 struct vmap_area *va; 2484 2485 va = find_vmap_area((unsigned long)addr); 2486 if (!va) 2487 return NULL; 2488 2489 return va->vm; 2490 } 2491 2492 /** 2493 * remove_vm_area - find and remove a continuous kernel virtual area 2494 * @addr: base address 2495 * 2496 * Search for the kernel VM area starting at @addr, and remove it. 2497 * This function returns the found VM area, but using it is NOT safe 2498 * on SMP machines, except for its size or flags. 2499 * 2500 * Return: the area descriptor on success or %NULL on failure. 
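 *
 * A minimal sketch of the usual pairing ("addr" is assumed to be the start
 * address of a known area); freeing the returned descriptor is the
 * caller's responsibility, exactly as free_vm_area() does below:
 *
 *	struct vm_struct *vm = remove_vm_area(addr);
 *
 *	if (vm)
 *		kfree(vm);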
2501 */ 2502 struct vm_struct *remove_vm_area(const void *addr) 2503 { 2504 struct vmap_area *va; 2505 2506 might_sleep(); 2507 2508 spin_lock(&vmap_area_lock); 2509 va = __find_vmap_area((unsigned long)addr); 2510 if (va && va->vm) { 2511 struct vm_struct *vm = va->vm; 2512 2513 va->vm = NULL; 2514 spin_unlock(&vmap_area_lock); 2515 2516 kasan_free_shadow(vm); 2517 free_unmap_vmap_area(va); 2518 2519 return vm; 2520 } 2521 2522 spin_unlock(&vmap_area_lock); 2523 return NULL; 2524 } 2525 2526 static inline void set_area_direct_map(const struct vm_struct *area, 2527 int (*set_direct_map)(struct page *page)) 2528 { 2529 int i; 2530 2531 /* HUGE_VMALLOC passes small pages to set_direct_map */ 2532 for (i = 0; i < area->nr_pages; i++) 2533 if (page_address(area->pages[i])) 2534 set_direct_map(area->pages[i]); 2535 } 2536 2537 /* Handle removing and resetting vm mappings related to the vm_struct. */ 2538 static void vm_remove_mappings(struct vm_struct *area, int deallocate_pages) 2539 { 2540 unsigned long start = ULONG_MAX, end = 0; 2541 unsigned int page_order = vm_area_page_order(area); 2542 int flush_reset = area->flags & VM_FLUSH_RESET_PERMS; 2543 int flush_dmap = 0; 2544 int i; 2545 2546 remove_vm_area(area->addr); 2547 2548 /* If this is not VM_FLUSH_RESET_PERMS memory, no need for the below. */ 2549 if (!flush_reset) 2550 return; 2551 2552 /* 2553 * If not deallocating pages, just do the flush of the VM area and 2554 * return. 2555 */ 2556 if (!deallocate_pages) { 2557 vm_unmap_aliases(); 2558 return; 2559 } 2560 2561 /* 2562 * If execution gets here, flush the vm mapping and reset the direct 2563 * map. Find the start and end range of the direct mappings to make sure 2564 * the vm_unmap_aliases() flush includes the direct map. 2565 */ 2566 for (i = 0; i < area->nr_pages; i += 1U << page_order) { 2567 unsigned long addr = (unsigned long)page_address(area->pages[i]); 2568 if (addr) { 2569 unsigned long page_size; 2570 2571 page_size = PAGE_SIZE << page_order; 2572 start = min(addr, start); 2573 end = max(addr + page_size, end); 2574 flush_dmap = 1; 2575 } 2576 } 2577 2578 /* 2579 * Set direct map to something invalid so that it won't be cached if 2580 * there are any accesses after the TLB flush, then flush the TLB and 2581 * reset the direct map permissions to the default. 
2582 */ 2583 set_area_direct_map(area, set_direct_map_invalid_noflush); 2584 _vm_unmap_aliases(start, end, flush_dmap); 2585 set_area_direct_map(area, set_direct_map_default_noflush); 2586 } 2587 2588 static void __vunmap(const void *addr, int deallocate_pages) 2589 { 2590 struct vm_struct *area; 2591 2592 if (!addr) 2593 return; 2594 2595 if (WARN(!PAGE_ALIGNED(addr), "Trying to vfree() bad address (%p)\n", 2596 addr)) 2597 return; 2598 2599 area = find_vm_area(addr); 2600 if (unlikely(!area)) { 2601 WARN(1, KERN_ERR "Trying to vfree() nonexistent vm area (%p)\n", 2602 addr); 2603 return; 2604 } 2605 2606 debug_check_no_locks_freed(area->addr, get_vm_area_size(area)); 2607 debug_check_no_obj_freed(area->addr, get_vm_area_size(area)); 2608 2609 kasan_poison_vmalloc(area->addr, get_vm_area_size(area)); 2610 2611 vm_remove_mappings(area, deallocate_pages); 2612 2613 if (deallocate_pages) { 2614 unsigned int page_order = vm_area_page_order(area); 2615 int i; 2616 2617 for (i = 0; i < area->nr_pages; i += 1U << page_order) { 2618 struct page *page = area->pages[i]; 2619 2620 BUG_ON(!page); 2621 __free_pages(page, page_order); 2622 cond_resched(); 2623 } 2624 atomic_long_sub(area->nr_pages, &nr_vmalloc_pages); 2625 2626 kvfree(area->pages); 2627 } 2628 2629 kfree(area); 2630 } 2631 2632 static inline void __vfree_deferred(const void *addr) 2633 { 2634 /* 2635 * Use raw_cpu_ptr() because this can be called from preemptible 2636 * context. Preemption is absolutely fine here, because the llist_add() 2637 * implementation is lockless, so it works even if we are adding to 2638 * another cpu's list. schedule_work() should be fine with this too. 2639 */ 2640 struct vfree_deferred *p = raw_cpu_ptr(&vfree_deferred); 2641 2642 if (llist_add((struct llist_node *)addr, &p->list)) 2643 schedule_work(&p->wq); 2644 } 2645 2646 /** 2647 * vfree_atomic - release memory allocated by vmalloc() 2648 * @addr: memory base address 2649 * 2650 * This one is just like vfree() but can be called in any atomic context 2651 * except NMIs. 2652 */ 2653 void vfree_atomic(const void *addr) 2654 { 2655 BUG_ON(in_nmi()); 2656 2657 kmemleak_free(addr); 2658 2659 if (!addr) 2660 return; 2661 __vfree_deferred(addr); 2662 } 2663 2664 static void __vfree(const void *addr) 2665 { 2666 if (unlikely(in_interrupt())) 2667 __vfree_deferred(addr); 2668 else 2669 __vunmap(addr, 1); 2670 } 2671 2672 /** 2673 * vfree - Release memory allocated by vmalloc() 2674 * @addr: Memory base address 2675 * 2676 * Free the virtually continuous memory area starting at @addr, as obtained 2677 * from one of the vmalloc() family of APIs. This will usually also free the 2678 * physical memory underlying the virtual allocation, but that memory is 2679 * reference counted, so it will not be freed until the last user goes away. 2680 * 2681 * If @addr is NULL, no operation is performed. 2682 * 2683 * Context: 2684 * May sleep if called *not* from interrupt context. 2685 * Must not be called in NMI context (strictly speaking, it could be 2686 * if we have CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG, but making the calling 2687 * conventions for vfree() arch-dependent would be a really bad idea). 
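 *
 * As an illustrative sketch only (the lock and pointer are assumptions):
 * because vfree() may sleep outside of interrupt context, a caller that
 * has to free while holding a spinlock should use vfree_atomic() instead:
 *
 *	spin_lock(&my_lock);
 *	vfree_atomic(p);
 *	spin_unlock(&my_lock);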
2688 */ 2689 void vfree(const void *addr) 2690 { 2691 BUG_ON(in_nmi()); 2692 2693 kmemleak_free(addr); 2694 2695 might_sleep_if(!in_interrupt()); 2696 2697 if (!addr) 2698 return; 2699 2700 __vfree(addr); 2701 } 2702 EXPORT_SYMBOL(vfree); 2703 2704 /** 2705 * vunmap - release virtual mapping obtained by vmap() 2706 * @addr: memory base address 2707 * 2708 * Free the virtually contiguous memory area starting at @addr, 2709 * which was created from the page array passed to vmap(). 2710 * 2711 * Must not be called in interrupt context. 2712 */ 2713 void vunmap(const void *addr) 2714 { 2715 BUG_ON(in_interrupt()); 2716 might_sleep(); 2717 if (addr) 2718 __vunmap(addr, 0); 2719 } 2720 EXPORT_SYMBOL(vunmap); 2721 2722 /** 2723 * vmap - map an array of pages into virtually contiguous space 2724 * @pages: array of page pointers 2725 * @count: number of pages to map 2726 * @flags: vm_area->flags 2727 * @prot: page protection for the mapping 2728 * 2729 * Maps @count pages from @pages into contiguous kernel virtual space. 2730 * If @flags contains %VM_MAP_PUT_PAGES the ownership of the pages array itself 2731 * (which must be kmalloc or vmalloc memory) and one reference per pages in it 2732 * are transferred from the caller to vmap(), and will be freed / dropped when 2733 * vfree() is called on the return value. 2734 * 2735 * Return: the address of the area or %NULL on failure 2736 */ 2737 void *vmap(struct page **pages, unsigned int count, 2738 unsigned long flags, pgprot_t prot) 2739 { 2740 struct vm_struct *area; 2741 unsigned long addr; 2742 unsigned long size; /* In bytes */ 2743 2744 might_sleep(); 2745 2746 if (count > totalram_pages()) 2747 return NULL; 2748 2749 size = (unsigned long)count << PAGE_SHIFT; 2750 area = get_vm_area_caller(size, flags, __builtin_return_address(0)); 2751 if (!area) 2752 return NULL; 2753 2754 addr = (unsigned long)area->addr; 2755 if (vmap_pages_range(addr, addr + size, pgprot_nx(prot), 2756 pages, PAGE_SHIFT) < 0) { 2757 vunmap(area->addr); 2758 return NULL; 2759 } 2760 2761 if (flags & VM_MAP_PUT_PAGES) { 2762 area->pages = pages; 2763 area->nr_pages = count; 2764 } 2765 return area->addr; 2766 } 2767 EXPORT_SYMBOL(vmap); 2768 2769 #ifdef CONFIG_VMAP_PFN 2770 struct vmap_pfn_data { 2771 unsigned long *pfns; 2772 pgprot_t prot; 2773 unsigned int idx; 2774 }; 2775 2776 static int vmap_pfn_apply(pte_t *pte, unsigned long addr, void *private) 2777 { 2778 struct vmap_pfn_data *data = private; 2779 2780 if (WARN_ON_ONCE(pfn_valid(data->pfns[data->idx]))) 2781 return -EINVAL; 2782 *pte = pte_mkspecial(pfn_pte(data->pfns[data->idx++], data->prot)); 2783 return 0; 2784 } 2785 2786 /** 2787 * vmap_pfn - map an array of PFNs into virtually contiguous space 2788 * @pfns: array of PFNs 2789 * @count: number of pages to map 2790 * @prot: page protection for the mapping 2791 * 2792 * Maps @count PFNs from @pfns into contiguous kernel virtual space and returns 2793 * the start address of the mapping. 
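 *
 * Illustrative sketch only ("pfns", "nr", "bar_phys", "vaddr" and the loop
 * index are placeholders); note that the PFNs must not be backed by struct
 * pages, as pfn_valid() PFNs are rejected with a warning:
 *
 *	for (i = 0; i < nr; i++)
 *		pfns[i] = (bar_phys >> PAGE_SHIFT) + i;
 *
 *	vaddr = vmap_pfn(pfns, nr, pgprot_noncached(PAGE_KERNEL));
 *	if (!vaddr)
 *		return -ENOMEM;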
2794 */ 2795 void *vmap_pfn(unsigned long *pfns, unsigned int count, pgprot_t prot) 2796 { 2797 struct vmap_pfn_data data = { .pfns = pfns, .prot = pgprot_nx(prot) }; 2798 struct vm_struct *area; 2799 2800 area = get_vm_area_caller(count * PAGE_SIZE, VM_IOREMAP, 2801 __builtin_return_address(0)); 2802 if (!area) 2803 return NULL; 2804 if (apply_to_page_range(&init_mm, (unsigned long)area->addr, 2805 count * PAGE_SIZE, vmap_pfn_apply, &data)) { 2806 free_vm_area(area); 2807 return NULL; 2808 } 2809 return area->addr; 2810 } 2811 EXPORT_SYMBOL_GPL(vmap_pfn); 2812 #endif /* CONFIG_VMAP_PFN */ 2813 2814 static inline unsigned int 2815 vm_area_alloc_pages(gfp_t gfp, int nid, 2816 unsigned int order, unsigned int nr_pages, struct page **pages) 2817 { 2818 unsigned int nr_allocated = 0; 2819 2820 /* 2821 * For order-0 pages we make use of bulk allocator, if 2822 * the page array is partly or not at all populated due 2823 * to fails, fallback to a single page allocator that is 2824 * more permissive. 2825 */ 2826 if (!order) { 2827 while (nr_allocated < nr_pages) { 2828 unsigned int nr, nr_pages_request; 2829 2830 /* 2831 * A maximum allowed request is hard-coded and is 100 2832 * pages per call. That is done in order to prevent a 2833 * long preemption off scenario in the bulk-allocator 2834 * so the range is [1:100]. 2835 */ 2836 nr_pages_request = min(100U, nr_pages - nr_allocated); 2837 2838 nr = alloc_pages_bulk_array_node(gfp, nid, 2839 nr_pages_request, pages + nr_allocated); 2840 2841 nr_allocated += nr; 2842 cond_resched(); 2843 2844 /* 2845 * If zero or pages were obtained partly, 2846 * fallback to a single page allocator. 2847 */ 2848 if (nr != nr_pages_request) 2849 break; 2850 } 2851 } else 2852 /* 2853 * Compound pages required for remap_vmalloc_page if 2854 * high-order pages. 2855 */ 2856 gfp |= __GFP_COMP; 2857 2858 /* High-order pages or fallback path if "bulk" fails. */ 2859 while (nr_allocated < nr_pages) { 2860 struct page *page; 2861 int i; 2862 2863 page = alloc_pages_node(nid, gfp, order); 2864 if (unlikely(!page)) 2865 break; 2866 2867 /* 2868 * Careful, we allocate and map page-order pages, but 2869 * tracking is done per PAGE_SIZE page so as to keep the 2870 * vm_struct APIs independent of the physical/mapped size. 2871 */ 2872 for (i = 0; i < (1U << order); i++) 2873 pages[nr_allocated + i] = page + i; 2874 2875 cond_resched(); 2876 nr_allocated += 1U << order; 2877 } 2878 2879 return nr_allocated; 2880 } 2881 2882 static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, 2883 pgprot_t prot, unsigned int page_shift, 2884 int node) 2885 { 2886 const gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO; 2887 unsigned long addr = (unsigned long)area->addr; 2888 unsigned long size = get_vm_area_size(area); 2889 unsigned long array_size; 2890 unsigned int nr_small_pages = size >> PAGE_SHIFT; 2891 unsigned int page_order; 2892 2893 array_size = (unsigned long)nr_small_pages * sizeof(struct page *); 2894 gfp_mask |= __GFP_NOWARN; 2895 if (!(gfp_mask & (GFP_DMA | GFP_DMA32))) 2896 gfp_mask |= __GFP_HIGHMEM; 2897 2898 /* Please note that the recursion is strictly bounded. 
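 * The nested __vmalloc_node() call only has to provide the array of page
 * pointers, which is smaller than the original request by a factor of
 * PAGE_SIZE / sizeof(struct page *), so each nesting level shrinks quickly
 * and the chain terminates after a few steps.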
*/ 2899 if (array_size > PAGE_SIZE) { 2900 area->pages = __vmalloc_node(array_size, 1, nested_gfp, node, 2901 area->caller); 2902 } else { 2903 area->pages = kmalloc_node(array_size, nested_gfp, node); 2904 } 2905 2906 if (!area->pages) { 2907 warn_alloc(gfp_mask, NULL, 2908 "vmalloc error: size %lu, failed to allocated page array size %lu", 2909 nr_small_pages * PAGE_SIZE, array_size); 2910 free_vm_area(area); 2911 return NULL; 2912 } 2913 2914 set_vm_area_page_order(area, page_shift - PAGE_SHIFT); 2915 page_order = vm_area_page_order(area); 2916 2917 area->nr_pages = vm_area_alloc_pages(gfp_mask, node, 2918 page_order, nr_small_pages, area->pages); 2919 2920 atomic_long_add(area->nr_pages, &nr_vmalloc_pages); 2921 2922 /* 2923 * If not enough pages were obtained to accomplish an 2924 * allocation request, free them via __vfree() if any. 2925 */ 2926 if (area->nr_pages != nr_small_pages) { 2927 warn_alloc(gfp_mask, NULL, 2928 "vmalloc error: size %lu, page order %u, failed to allocate pages", 2929 area->nr_pages * PAGE_SIZE, page_order); 2930 goto fail; 2931 } 2932 2933 if (vmap_pages_range(addr, addr + size, prot, area->pages, 2934 page_shift) < 0) { 2935 warn_alloc(gfp_mask, NULL, 2936 "vmalloc error: size %lu, failed to map pages", 2937 area->nr_pages * PAGE_SIZE); 2938 goto fail; 2939 } 2940 2941 return area->addr; 2942 2943 fail: 2944 __vfree(area->addr); 2945 return NULL; 2946 } 2947 2948 /** 2949 * __vmalloc_node_range - allocate virtually contiguous memory 2950 * @size: allocation size 2951 * @align: desired alignment 2952 * @start: vm area range start 2953 * @end: vm area range end 2954 * @gfp_mask: flags for the page level allocator 2955 * @prot: protection mask for the allocated pages 2956 * @vm_flags: additional vm area flags (e.g. %VM_NO_GUARD) 2957 * @node: node to use for allocation or NUMA_NO_NODE 2958 * @caller: caller's return address 2959 * 2960 * Allocate enough pages to cover @size from the page level 2961 * allocator with @gfp_mask flags. Map them into contiguous 2962 * kernel virtual space, using a pagetable protection of @prot. 2963 * 2964 * Return: the address of the area or %NULL on failure 2965 */ 2966 void *__vmalloc_node_range(unsigned long size, unsigned long align, 2967 unsigned long start, unsigned long end, gfp_t gfp_mask, 2968 pgprot_t prot, unsigned long vm_flags, int node, 2969 const void *caller) 2970 { 2971 struct vm_struct *area; 2972 void *addr; 2973 unsigned long real_size = size; 2974 unsigned long real_align = align; 2975 unsigned int shift = PAGE_SHIFT; 2976 2977 if (WARN_ON_ONCE(!size)) 2978 return NULL; 2979 2980 if ((size >> PAGE_SHIFT) > totalram_pages()) { 2981 warn_alloc(gfp_mask, NULL, 2982 "vmalloc error: size %lu, exceeds total pages", 2983 real_size); 2984 return NULL; 2985 } 2986 2987 if (vmap_allow_huge && !(vm_flags & VM_NO_HUGE_VMAP)) { 2988 unsigned long size_per_node; 2989 2990 /* 2991 * Try huge pages. Only try for PAGE_KERNEL allocations, 2992 * others like modules don't yet expect huge pages in 2993 * their allocations due to apply_to_page_range not 2994 * supporting them. 
2995 */ 2996 2997 size_per_node = size; 2998 if (node == NUMA_NO_NODE) 2999 size_per_node /= num_online_nodes(); 3000 if (arch_vmap_pmd_supported(prot) && size_per_node >= PMD_SIZE) 3001 shift = PMD_SHIFT; 3002 else 3003 shift = arch_vmap_pte_supported_shift(size_per_node); 3004 3005 align = max(real_align, 1UL << shift); 3006 size = ALIGN(real_size, 1UL << shift); 3007 } 3008 3009 again: 3010 area = __get_vm_area_node(real_size, align, shift, VM_ALLOC | 3011 VM_UNINITIALIZED | vm_flags, start, end, node, 3012 gfp_mask, caller); 3013 if (!area) { 3014 warn_alloc(gfp_mask, NULL, 3015 "vmalloc error: size %lu, vm_struct allocation failed", 3016 real_size); 3017 goto fail; 3018 } 3019 3020 addr = __vmalloc_area_node(area, gfp_mask, prot, shift, node); 3021 if (!addr) 3022 goto fail; 3023 3024 /* 3025 * In this function, newly allocated vm_struct has VM_UNINITIALIZED 3026 * flag. It means that vm_struct is not fully initialized. 3027 * Now, it is fully initialized, so remove this flag here. 3028 */ 3029 clear_vm_uninitialized_flag(area); 3030 3031 size = PAGE_ALIGN(size); 3032 kmemleak_vmalloc(area, size, gfp_mask); 3033 3034 return addr; 3035 3036 fail: 3037 if (shift > PAGE_SHIFT) { 3038 shift = PAGE_SHIFT; 3039 align = real_align; 3040 size = real_size; 3041 goto again; 3042 } 3043 3044 return NULL; 3045 } 3046 3047 /** 3048 * __vmalloc_node - allocate virtually contiguous memory 3049 * @size: allocation size 3050 * @align: desired alignment 3051 * @gfp_mask: flags for the page level allocator 3052 * @node: node to use for allocation or NUMA_NO_NODE 3053 * @caller: caller's return address 3054 * 3055 * Allocate enough pages to cover @size from the page level allocator with 3056 * @gfp_mask flags. Map them into contiguous kernel virtual space. 3057 * 3058 * Reclaim modifiers in @gfp_mask - __GFP_NORETRY, __GFP_RETRY_MAYFAIL 3059 * and __GFP_NOFAIL are not supported 3060 * 3061 * Any use of gfp flags outside of GFP_KERNEL should be consulted 3062 * with mm people. 3063 * 3064 * Return: pointer to the allocated memory or %NULL on error 3065 */ 3066 void *__vmalloc_node(unsigned long size, unsigned long align, 3067 gfp_t gfp_mask, int node, const void *caller) 3068 { 3069 return __vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END, 3070 gfp_mask, PAGE_KERNEL, 0, node, caller); 3071 } 3072 /* 3073 * This is only for performance analysis of vmalloc and stress purpose. 3074 * It is required by vmalloc test module, therefore do not use it other 3075 * than that. 3076 */ 3077 #ifdef CONFIG_TEST_VMALLOC_MODULE 3078 EXPORT_SYMBOL_GPL(__vmalloc_node); 3079 #endif 3080 3081 void *__vmalloc(unsigned long size, gfp_t gfp_mask) 3082 { 3083 return __vmalloc_node(size, 1, gfp_mask, NUMA_NO_NODE, 3084 __builtin_return_address(0)); 3085 } 3086 EXPORT_SYMBOL(__vmalloc); 3087 3088 /** 3089 * vmalloc - allocate virtually contiguous memory 3090 * @size: allocation size 3091 * 3092 * Allocate enough pages to cover @size from the page level 3093 * allocator and map them into contiguous kernel virtual space. 3094 * 3095 * For tight control over page level allocator and protection flags 3096 * use __vmalloc() instead. 
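 *
 * A minimal usage sketch ("nr_entries" and "struct foo" are placeholders):
 *
 *	void *buf = vmalloc(array_size(nr_entries, sizeof(struct foo)));
 *
 *	if (!buf)
 *		return -ENOMEM;
 *
 * and release it with vfree(buf) once the buffer is no longer needed.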
3097 * 3098 * Return: pointer to the allocated memory or %NULL on error 3099 */ 3100 void *vmalloc(unsigned long size) 3101 { 3102 return __vmalloc_node(size, 1, GFP_KERNEL, NUMA_NO_NODE, 3103 __builtin_return_address(0)); 3104 } 3105 EXPORT_SYMBOL(vmalloc); 3106 3107 /** 3108 * vmalloc_no_huge - allocate virtually contiguous memory using small pages 3109 * @size: allocation size 3110 * 3111 * Allocate enough non-huge pages to cover @size from the page level 3112 * allocator and map them into contiguous kernel virtual space. 3113 * 3114 * Return: pointer to the allocated memory or %NULL on error 3115 */ 3116 void *vmalloc_no_huge(unsigned long size) 3117 { 3118 return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END, 3119 GFP_KERNEL, PAGE_KERNEL, VM_NO_HUGE_VMAP, 3120 NUMA_NO_NODE, __builtin_return_address(0)); 3121 } 3122 EXPORT_SYMBOL(vmalloc_no_huge); 3123 3124 /** 3125 * vzalloc - allocate virtually contiguous memory with zero fill 3126 * @size: allocation size 3127 * 3128 * Allocate enough pages to cover @size from the page level 3129 * allocator and map them into contiguous kernel virtual space. 3130 * The memory allocated is set to zero. 3131 * 3132 * For tight control over page level allocator and protection flags 3133 * use __vmalloc() instead. 3134 * 3135 * Return: pointer to the allocated memory or %NULL on error 3136 */ 3137 void *vzalloc(unsigned long size) 3138 { 3139 return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_ZERO, NUMA_NO_NODE, 3140 __builtin_return_address(0)); 3141 } 3142 EXPORT_SYMBOL(vzalloc); 3143 3144 /** 3145 * vmalloc_user - allocate zeroed virtually contiguous memory for userspace 3146 * @size: allocation size 3147 * 3148 * The resulting memory area is zeroed so it can be mapped to userspace 3149 * without leaking data. 3150 * 3151 * Return: pointer to the allocated memory or %NULL on error 3152 */ 3153 void *vmalloc_user(unsigned long size) 3154 { 3155 return __vmalloc_node_range(size, SHMLBA, VMALLOC_START, VMALLOC_END, 3156 GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL, 3157 VM_USERMAP, NUMA_NO_NODE, 3158 __builtin_return_address(0)); 3159 } 3160 EXPORT_SYMBOL(vmalloc_user); 3161 3162 /** 3163 * vmalloc_node - allocate memory on a specific node 3164 * @size: allocation size 3165 * @node: numa node 3166 * 3167 * Allocate enough pages to cover @size from the page level 3168 * allocator and map them into contiguous kernel virtual space. 3169 * 3170 * For tight control over page level allocator and protection flags 3171 * use __vmalloc() instead. 3172 * 3173 * Return: pointer to the allocated memory or %NULL on error 3174 */ 3175 void *vmalloc_node(unsigned long size, int node) 3176 { 3177 return __vmalloc_node(size, 1, GFP_KERNEL, node, 3178 __builtin_return_address(0)); 3179 } 3180 EXPORT_SYMBOL(vmalloc_node); 3181 3182 /** 3183 * vzalloc_node - allocate memory on a specific node with zero fill 3184 * @size: allocation size 3185 * @node: numa node 3186 * 3187 * Allocate enough pages to cover @size from the page level 3188 * allocator and map them into contiguous kernel virtual space. 3189 * The memory allocated is set to zero. 
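 *
 * Illustrative sketch only ("stats", "nr_entries" and "cpu" are
 * placeholders): a typical use is a zeroed buffer placed close to the
 * CPUs that will touch it:
 *
 *	stats = vzalloc_node(array_size(nr_entries, sizeof(*stats)),
 *			     cpu_to_node(cpu));
 *	if (!stats)
 *		return -ENOMEM;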
3190 * 3191 * Return: pointer to the allocated memory or %NULL on error 3192 */ 3193 void *vzalloc_node(unsigned long size, int node) 3194 { 3195 return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_ZERO, node, 3196 __builtin_return_address(0)); 3197 } 3198 EXPORT_SYMBOL(vzalloc_node); 3199 3200 #if defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA32) 3201 #define GFP_VMALLOC32 (GFP_DMA32 | GFP_KERNEL) 3202 #elif defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA) 3203 #define GFP_VMALLOC32 (GFP_DMA | GFP_KERNEL) 3204 #else 3205 /* 3206 * 64b systems should always have either DMA or DMA32 zones. For others 3207 * GFP_DMA32 should do the right thing and use the normal zone. 3208 */ 3209 #define GFP_VMALLOC32 (GFP_DMA32 | GFP_KERNEL) 3210 #endif 3211 3212 /** 3213 * vmalloc_32 - allocate virtually contiguous memory (32bit addressable) 3214 * @size: allocation size 3215 * 3216 * Allocate enough 32bit PA addressable pages to cover @size from the 3217 * page level allocator and map them into contiguous kernel virtual space. 3218 * 3219 * Return: pointer to the allocated memory or %NULL on error 3220 */ 3221 void *vmalloc_32(unsigned long size) 3222 { 3223 return __vmalloc_node(size, 1, GFP_VMALLOC32, NUMA_NO_NODE, 3224 __builtin_return_address(0)); 3225 } 3226 EXPORT_SYMBOL(vmalloc_32); 3227 3228 /** 3229 * vmalloc_32_user - allocate zeroed virtually contiguous 32bit memory 3230 * @size: allocation size 3231 * 3232 * The resulting memory area is 32bit addressable and zeroed so it can be 3233 * mapped to userspace without leaking data. 3234 * 3235 * Return: pointer to the allocated memory or %NULL on error 3236 */ 3237 void *vmalloc_32_user(unsigned long size) 3238 { 3239 return __vmalloc_node_range(size, SHMLBA, VMALLOC_START, VMALLOC_END, 3240 GFP_VMALLOC32 | __GFP_ZERO, PAGE_KERNEL, 3241 VM_USERMAP, NUMA_NO_NODE, 3242 __builtin_return_address(0)); 3243 } 3244 EXPORT_SYMBOL(vmalloc_32_user); 3245 3246 /* 3247 * small helper routine , copy contents to buf from addr. 3248 * If the page is not present, fill zero. 3249 */ 3250 3251 static int aligned_vread(char *buf, char *addr, unsigned long count) 3252 { 3253 struct page *p; 3254 int copied = 0; 3255 3256 while (count) { 3257 unsigned long offset, length; 3258 3259 offset = offset_in_page(addr); 3260 length = PAGE_SIZE - offset; 3261 if (length > count) 3262 length = count; 3263 p = vmalloc_to_page(addr); 3264 /* 3265 * To do safe access to this _mapped_ area, we need 3266 * lock. But adding lock here means that we need to add 3267 * overhead of vmalloc()/vfree() calls for this _debug_ 3268 * interface, rarely used. Instead of that, we'll use 3269 * kmap() and get small overhead in this access function. 3270 */ 3271 if (p) { 3272 /* We can expect USER0 is not used -- see vread() */ 3273 void *map = kmap_atomic(p); 3274 memcpy(buf, map + offset, length); 3275 kunmap_atomic(map); 3276 } else 3277 memset(buf, 0, length); 3278 3279 addr += length; 3280 buf += length; 3281 copied += length; 3282 count -= length; 3283 } 3284 return copied; 3285 } 3286 3287 /** 3288 * vread() - read vmalloc area in a safe way. 3289 * @buf: buffer for reading data 3290 * @addr: vm address. 3291 * @count: number of bytes to be read. 3292 * 3293 * This function checks that addr is a valid vmalloc'ed area, and 3294 * copy data from that area to a given buffer. If the given memory range 3295 * of [addr...addr+count) includes some valid address, data is copied to 3296 * proper area of @buf. If there are memory holes, they'll be zero-filled. 
3297 * IOREMAP area is treated as memory hole and no copy is done. 3298 * 3299 * If [addr...addr+count) doesn't includes any intersects with alive 3300 * vm_struct area, returns 0. @buf should be kernel's buffer. 3301 * 3302 * Note: In usual ops, vread() is never necessary because the caller 3303 * should know vmalloc() area is valid and can use memcpy(). 3304 * This is for routines which have to access vmalloc area without 3305 * any information, as /proc/kcore. 3306 * 3307 * Return: number of bytes for which addr and buf should be increased 3308 * (same number as @count) or %0 if [addr...addr+count) doesn't 3309 * include any intersection with valid vmalloc area 3310 */ 3311 long vread(char *buf, char *addr, unsigned long count) 3312 { 3313 struct vmap_area *va; 3314 struct vm_struct *vm; 3315 char *vaddr, *buf_start = buf; 3316 unsigned long buflen = count; 3317 unsigned long n; 3318 3319 /* Don't allow overflow */ 3320 if ((unsigned long) addr + count < count) 3321 count = -(unsigned long) addr; 3322 3323 spin_lock(&vmap_area_lock); 3324 va = find_vmap_area_exceed_addr((unsigned long)addr); 3325 if (!va) 3326 goto finished; 3327 3328 /* no intersects with alive vmap_area */ 3329 if ((unsigned long)addr + count <= va->va_start) 3330 goto finished; 3331 3332 list_for_each_entry_from(va, &vmap_area_list, list) { 3333 if (!count) 3334 break; 3335 3336 if (!va->vm) 3337 continue; 3338 3339 vm = va->vm; 3340 vaddr = (char *) vm->addr; 3341 if (addr >= vaddr + get_vm_area_size(vm)) 3342 continue; 3343 while (addr < vaddr) { 3344 if (count == 0) 3345 goto finished; 3346 *buf = '\0'; 3347 buf++; 3348 addr++; 3349 count--; 3350 } 3351 n = vaddr + get_vm_area_size(vm) - addr; 3352 if (n > count) 3353 n = count; 3354 if (!(vm->flags & VM_IOREMAP)) 3355 aligned_vread(buf, addr, n); 3356 else /* IOREMAP area is treated as memory hole */ 3357 memset(buf, 0, n); 3358 buf += n; 3359 addr += n; 3360 count -= n; 3361 } 3362 finished: 3363 spin_unlock(&vmap_area_lock); 3364 3365 if (buf == buf_start) 3366 return 0; 3367 /* zero-fill memory holes */ 3368 if (buf != buf_start + buflen) 3369 memset(buf, 0, buflen - (buf - buf_start)); 3370 3371 return buflen; 3372 } 3373 3374 /** 3375 * remap_vmalloc_range_partial - map vmalloc pages to userspace 3376 * @vma: vma to cover 3377 * @uaddr: target user address to start at 3378 * @kaddr: virtual address of vmalloc kernel memory 3379 * @pgoff: offset from @kaddr to start at 3380 * @size: size of map area 3381 * 3382 * Returns: 0 for success, -Exxx on failure 3383 * 3384 * This function checks that @kaddr is a valid vmalloc'ed area, 3385 * and that it is big enough to cover the range starting at 3386 * @uaddr in @vma. Will return failure if that criteria isn't 3387 * met. 
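 *
 * An illustrative sketch of the common pattern, via the full-range
 * remap_vmalloc_range() wrapper below ("my_buf" is assumed to have been
 * allocated with vmalloc_user() so that VM_USERMAP is set):
 *
 *	static int my_mmap(struct file *file, struct vm_area_struct *vma)
 *	{
 *		return remap_vmalloc_range(vma, my_buf, 0);
 *	}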
3388 * 3389 * Similar to remap_pfn_range() (see mm/memory.c) 3390 */ 3391 int remap_vmalloc_range_partial(struct vm_area_struct *vma, unsigned long uaddr, 3392 void *kaddr, unsigned long pgoff, 3393 unsigned long size) 3394 { 3395 struct vm_struct *area; 3396 unsigned long off; 3397 unsigned long end_index; 3398 3399 if (check_shl_overflow(pgoff, PAGE_SHIFT, &off)) 3400 return -EINVAL; 3401 3402 size = PAGE_ALIGN(size); 3403 3404 if (!PAGE_ALIGNED(uaddr) || !PAGE_ALIGNED(kaddr)) 3405 return -EINVAL; 3406 3407 area = find_vm_area(kaddr); 3408 if (!area) 3409 return -EINVAL; 3410 3411 if (!(area->flags & (VM_USERMAP | VM_DMA_COHERENT))) 3412 return -EINVAL; 3413 3414 if (check_add_overflow(size, off, &end_index) || 3415 end_index > get_vm_area_size(area)) 3416 return -EINVAL; 3417 kaddr += off; 3418 3419 do { 3420 struct page *page = vmalloc_to_page(kaddr); 3421 int ret; 3422 3423 ret = vm_insert_page(vma, uaddr, page); 3424 if (ret) 3425 return ret; 3426 3427 uaddr += PAGE_SIZE; 3428 kaddr += PAGE_SIZE; 3429 size -= PAGE_SIZE; 3430 } while (size > 0); 3431 3432 vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; 3433 3434 return 0; 3435 } 3436 3437 /** 3438 * remap_vmalloc_range - map vmalloc pages to userspace 3439 * @vma: vma to cover (map full range of vma) 3440 * @addr: vmalloc memory 3441 * @pgoff: number of pages into addr before first page to map 3442 * 3443 * Returns: 0 for success, -Exxx on failure 3444 * 3445 * This function checks that addr is a valid vmalloc'ed area, and 3446 * that it is big enough to cover the vma. Will return failure if 3447 * that criteria isn't met. 3448 * 3449 * Similar to remap_pfn_range() (see mm/memory.c) 3450 */ 3451 int remap_vmalloc_range(struct vm_area_struct *vma, void *addr, 3452 unsigned long pgoff) 3453 { 3454 return remap_vmalloc_range_partial(vma, vma->vm_start, 3455 addr, pgoff, 3456 vma->vm_end - vma->vm_start); 3457 } 3458 EXPORT_SYMBOL(remap_vmalloc_range); 3459 3460 void free_vm_area(struct vm_struct *area) 3461 { 3462 struct vm_struct *ret; 3463 ret = remove_vm_area(area->addr); 3464 BUG_ON(ret != area); 3465 kfree(area); 3466 } 3467 EXPORT_SYMBOL_GPL(free_vm_area); 3468 3469 #ifdef CONFIG_SMP 3470 static struct vmap_area *node_to_va(struct rb_node *n) 3471 { 3472 return rb_entry_safe(n, struct vmap_area, rb_node); 3473 } 3474 3475 /** 3476 * pvm_find_va_enclose_addr - find the vmap_area @addr belongs to 3477 * @addr: target address 3478 * 3479 * Returns: vmap_area if it is found. If there is no such area 3480 * the first highest(reverse order) vmap_area is returned 3481 * i.e. va->va_start < addr && va->va_end < addr or NULL 3482 * if there are no any areas before @addr. 3483 */ 3484 static struct vmap_area * 3485 pvm_find_va_enclose_addr(unsigned long addr) 3486 { 3487 struct vmap_area *va, *tmp; 3488 struct rb_node *n; 3489 3490 n = free_vmap_area_root.rb_node; 3491 va = NULL; 3492 3493 while (n) { 3494 tmp = rb_entry(n, struct vmap_area, rb_node); 3495 if (tmp->va_start <= addr) { 3496 va = tmp; 3497 if (tmp->va_end >= addr) 3498 break; 3499 3500 n = n->rb_right; 3501 } else { 3502 n = n->rb_left; 3503 } 3504 } 3505 3506 return va; 3507 } 3508 3509 /** 3510 * pvm_determine_end_from_reverse - find the highest aligned address 3511 * of free block below VMALLOC_END 3512 * @va: 3513 * in - the VA we start the search(reverse order); 3514 * out - the VA with the highest aligned end address. 
3515 * @align: alignment for required highest address 3516 * 3517 * Returns: determined end address within vmap_area 3518 */ 3519 static unsigned long 3520 pvm_determine_end_from_reverse(struct vmap_area **va, unsigned long align) 3521 { 3522 unsigned long vmalloc_end = VMALLOC_END & ~(align - 1); 3523 unsigned long addr; 3524 3525 if (likely(*va)) { 3526 list_for_each_entry_from_reverse((*va), 3527 &free_vmap_area_list, list) { 3528 addr = min((*va)->va_end & ~(align - 1), vmalloc_end); 3529 if ((*va)->va_start < addr) 3530 return addr; 3531 } 3532 } 3533 3534 return 0; 3535 } 3536 3537 /** 3538 * pcpu_get_vm_areas - allocate vmalloc areas for percpu allocator 3539 * @offsets: array containing offset of each area 3540 * @sizes: array containing size of each area 3541 * @nr_vms: the number of areas to allocate 3542 * @align: alignment, all entries in @offsets and @sizes must be aligned to this 3543 * 3544 * Returns: kmalloc'd vm_struct pointer array pointing to allocated 3545 * vm_structs on success, %NULL on failure 3546 * 3547 * Percpu allocator wants to use congruent vm areas so that it can 3548 * maintain the offsets among percpu areas. This function allocates 3549 * congruent vmalloc areas for it with GFP_KERNEL. These areas tend to 3550 * be scattered pretty far, distance between two areas easily going up 3551 * to gigabytes. To avoid interacting with regular vmallocs, these 3552 * areas are allocated from top. 3553 * 3554 * Despite its complicated look, this allocator is rather simple. It 3555 * does everything top-down and scans free blocks from the end looking 3556 * for matching base. While scanning, if any of the areas do not fit the 3557 * base address is pulled down to fit the area. Scanning is repeated till 3558 * all the areas fit and then all necessary data structures are inserted 3559 * and the result is returned. 3560 */ 3561 struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets, 3562 const size_t *sizes, int nr_vms, 3563 size_t align) 3564 { 3565 const unsigned long vmalloc_start = ALIGN(VMALLOC_START, align); 3566 const unsigned long vmalloc_end = VMALLOC_END & ~(align - 1); 3567 struct vmap_area **vas, *va; 3568 struct vm_struct **vms; 3569 int area, area2, last_area, term_area; 3570 unsigned long base, start, size, end, last_end, orig_start, orig_end; 3571 bool purged = false; 3572 enum fit_type type; 3573 3574 /* verify parameters and allocate data structures */ 3575 BUG_ON(offset_in_page(align) || !is_power_of_2(align)); 3576 for (last_area = 0, area = 0; area < nr_vms; area++) { 3577 start = offsets[area]; 3578 end = start + sizes[area]; 3579 3580 /* is everything aligned properly? 
*/ 3581 BUG_ON(!IS_ALIGNED(offsets[area], align)); 3582 BUG_ON(!IS_ALIGNED(sizes[area], align)); 3583 3584 /* detect the area with the highest address */ 3585 if (start > offsets[last_area]) 3586 last_area = area; 3587 3588 for (area2 = area + 1; area2 < nr_vms; area2++) { 3589 unsigned long start2 = offsets[area2]; 3590 unsigned long end2 = start2 + sizes[area2]; 3591 3592 BUG_ON(start2 < end && start < end2); 3593 } 3594 } 3595 last_end = offsets[last_area] + sizes[last_area]; 3596 3597 if (vmalloc_end - vmalloc_start < last_end) { 3598 WARN_ON(true); 3599 return NULL; 3600 } 3601 3602 vms = kcalloc(nr_vms, sizeof(vms[0]), GFP_KERNEL); 3603 vas = kcalloc(nr_vms, sizeof(vas[0]), GFP_KERNEL); 3604 if (!vas || !vms) 3605 goto err_free2; 3606 3607 for (area = 0; area < nr_vms; area++) { 3608 vas[area] = kmem_cache_zalloc(vmap_area_cachep, GFP_KERNEL); 3609 vms[area] = kzalloc(sizeof(struct vm_struct), GFP_KERNEL); 3610 if (!vas[area] || !vms[area]) 3611 goto err_free; 3612 } 3613 retry: 3614 spin_lock(&free_vmap_area_lock); 3615 3616 /* start scanning - we scan from the top, begin with the last area */ 3617 area = term_area = last_area; 3618 start = offsets[area]; 3619 end = start + sizes[area]; 3620 3621 va = pvm_find_va_enclose_addr(vmalloc_end); 3622 base = pvm_determine_end_from_reverse(&va, align) - end; 3623 3624 while (true) { 3625 /* 3626 * base might have underflowed, add last_end before 3627 * comparing. 3628 */ 3629 if (base + last_end < vmalloc_start + last_end) 3630 goto overflow; 3631 3632 /* 3633 * Fitting base has not been found. 3634 */ 3635 if (va == NULL) 3636 goto overflow; 3637 3638 /* 3639 * If required width exceeds current VA block, move 3640 * base downwards and then recheck. 3641 */ 3642 if (base + end > va->va_end) { 3643 base = pvm_determine_end_from_reverse(&va, align) - end; 3644 term_area = area; 3645 continue; 3646 } 3647 3648 /* 3649 * If this VA does not fit, move base downwards and recheck. 3650 */ 3651 if (base + start < va->va_start) { 3652 va = node_to_va(rb_prev(&va->rb_node)); 3653 base = pvm_determine_end_from_reverse(&va, align) - end; 3654 term_area = area; 3655 continue; 3656 } 3657 3658 /* 3659 * This area fits, move on to the previous one. If 3660 * the previous one is the terminal one, we're done. 3661 */ 3662 area = (area + nr_vms - 1) % nr_vms; 3663 if (area == term_area) 3664 break; 3665 3666 start = offsets[area]; 3667 end = start + sizes[area]; 3668 va = pvm_find_va_enclose_addr(base + end); 3669 } 3670 3671 /* we've found a fitting base, insert all va's */ 3672 for (area = 0; area < nr_vms; area++) { 3673 int ret; 3674 3675 start = base + offsets[area]; 3676 size = sizes[area]; 3677 3678 va = pvm_find_va_enclose_addr(start); 3679 if (WARN_ON_ONCE(va == NULL)) 3680 /* It is a BUG(), but trigger recovery instead. */ 3681 goto recovery; 3682 3683 type = classify_va_fit_type(va, start, size); 3684 if (WARN_ON_ONCE(type == NOTHING_FIT)) 3685 /* It is a BUG(), but trigger recovery instead. */ 3686 goto recovery; 3687 3688 ret = adjust_va_to_fit_type(va, start, size, type); 3689 if (unlikely(ret)) 3690 goto recovery; 3691 3692 /* Allocated area. 
*/ 3693 va = vas[area]; 3694 va->va_start = start; 3695 va->va_end = start + size; 3696 } 3697 3698 spin_unlock(&free_vmap_area_lock); 3699 3700 /* populate the kasan shadow space */ 3701 for (area = 0; area < nr_vms; area++) { 3702 if (kasan_populate_vmalloc(vas[area]->va_start, sizes[area])) 3703 goto err_free_shadow; 3704 3705 kasan_unpoison_vmalloc((void *)vas[area]->va_start, 3706 sizes[area]); 3707 } 3708 3709 /* insert all vm's */ 3710 spin_lock(&vmap_area_lock); 3711 for (area = 0; area < nr_vms; area++) { 3712 insert_vmap_area(vas[area], &vmap_area_root, &vmap_area_list); 3713 3714 setup_vmalloc_vm_locked(vms[area], vas[area], VM_ALLOC, 3715 pcpu_get_vm_areas); 3716 } 3717 spin_unlock(&vmap_area_lock); 3718 3719 kfree(vas); 3720 return vms; 3721 3722 recovery: 3723 /* 3724 * Remove previously allocated areas. There is no 3725 * need in removing these areas from the busy tree, 3726 * because they are inserted only on the final step 3727 * and when pcpu_get_vm_areas() is success. 3728 */ 3729 while (area--) { 3730 orig_start = vas[area]->va_start; 3731 orig_end = vas[area]->va_end; 3732 va = merge_or_add_vmap_area_augment(vas[area], &free_vmap_area_root, 3733 &free_vmap_area_list); 3734 if (va) 3735 kasan_release_vmalloc(orig_start, orig_end, 3736 va->va_start, va->va_end); 3737 vas[area] = NULL; 3738 } 3739 3740 overflow: 3741 spin_unlock(&free_vmap_area_lock); 3742 if (!purged) { 3743 purge_vmap_area_lazy(); 3744 purged = true; 3745 3746 /* Before "retry", check if we recover. */ 3747 for (area = 0; area < nr_vms; area++) { 3748 if (vas[area]) 3749 continue; 3750 3751 vas[area] = kmem_cache_zalloc( 3752 vmap_area_cachep, GFP_KERNEL); 3753 if (!vas[area]) 3754 goto err_free; 3755 } 3756 3757 goto retry; 3758 } 3759 3760 err_free: 3761 for (area = 0; area < nr_vms; area++) { 3762 if (vas[area]) 3763 kmem_cache_free(vmap_area_cachep, vas[area]); 3764 3765 kfree(vms[area]); 3766 } 3767 err_free2: 3768 kfree(vas); 3769 kfree(vms); 3770 return NULL; 3771 3772 err_free_shadow: 3773 spin_lock(&free_vmap_area_lock); 3774 /* 3775 * We release all the vmalloc shadows, even the ones for regions that 3776 * hadn't been successfully added. This relies on kasan_release_vmalloc 3777 * being able to tolerate this case. 3778 */ 3779 for (area = 0; area < nr_vms; area++) { 3780 orig_start = vas[area]->va_start; 3781 orig_end = vas[area]->va_end; 3782 va = merge_or_add_vmap_area_augment(vas[area], &free_vmap_area_root, 3783 &free_vmap_area_list); 3784 if (va) 3785 kasan_release_vmalloc(orig_start, orig_end, 3786 va->va_start, va->va_end); 3787 vas[area] = NULL; 3788 kfree(vms[area]); 3789 } 3790 spin_unlock(&free_vmap_area_lock); 3791 kfree(vas); 3792 kfree(vms); 3793 return NULL; 3794 } 3795 3796 /** 3797 * pcpu_free_vm_areas - free vmalloc areas for percpu allocator 3798 * @vms: vm_struct pointer array returned by pcpu_get_vm_areas() 3799 * @nr_vms: the number of allocated areas 3800 * 3801 * Free vm_structs and the array allocated by pcpu_get_vm_areas(). 
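 *
 * Illustrative pairing only (in practice the caller is the percpu
 * allocator; "offsets", "sizes", "nr" and "align" are placeholders):
 *
 *	vms = pcpu_get_vm_areas(offsets, sizes, nr, align);
 *	if (!vms)
 *		return -ENOMEM;
 *
 * with a matching pcpu_free_vm_areas(vms, nr) once the areas are no
 * longer needed.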
3802 */ 3803 void pcpu_free_vm_areas(struct vm_struct **vms, int nr_vms) 3804 { 3805 int i; 3806 3807 for (i = 0; i < nr_vms; i++) 3808 free_vm_area(vms[i]); 3809 kfree(vms); 3810 } 3811 #endif /* CONFIG_SMP */ 3812 3813 #ifdef CONFIG_PRINTK 3814 bool vmalloc_dump_obj(void *object) 3815 { 3816 struct vm_struct *vm; 3817 void *objp = (void *)PAGE_ALIGN((unsigned long)object); 3818 3819 vm = find_vm_area(objp); 3820 if (!vm) 3821 return false; 3822 pr_cont(" %u-page vmalloc region starting at %#lx allocated at %pS\n", 3823 vm->nr_pages, (unsigned long)vm->addr, vm->caller); 3824 return true; 3825 } 3826 #endif 3827 3828 #ifdef CONFIG_PROC_FS 3829 static void *s_start(struct seq_file *m, loff_t *pos) 3830 __acquires(&vmap_purge_lock) 3831 __acquires(&vmap_area_lock) 3832 { 3833 mutex_lock(&vmap_purge_lock); 3834 spin_lock(&vmap_area_lock); 3835 3836 return seq_list_start(&vmap_area_list, *pos); 3837 } 3838 3839 static void *s_next(struct seq_file *m, void *p, loff_t *pos) 3840 { 3841 return seq_list_next(p, &vmap_area_list, pos); 3842 } 3843 3844 static void s_stop(struct seq_file *m, void *p) 3845 __releases(&vmap_area_lock) 3846 __releases(&vmap_purge_lock) 3847 { 3848 spin_unlock(&vmap_area_lock); 3849 mutex_unlock(&vmap_purge_lock); 3850 } 3851 3852 static void show_numa_info(struct seq_file *m, struct vm_struct *v) 3853 { 3854 if (IS_ENABLED(CONFIG_NUMA)) { 3855 unsigned int nr, *counters = m->private; 3856 3857 if (!counters) 3858 return; 3859 3860 if (v->flags & VM_UNINITIALIZED) 3861 return; 3862 /* Pair with smp_wmb() in clear_vm_uninitialized_flag() */ 3863 smp_rmb(); 3864 3865 memset(counters, 0, nr_node_ids * sizeof(unsigned int)); 3866 3867 for (nr = 0; nr < v->nr_pages; nr++) 3868 counters[page_to_nid(v->pages[nr])]++; 3869 3870 for_each_node_state(nr, N_HIGH_MEMORY) 3871 if (counters[nr]) 3872 seq_printf(m, " N%u=%u", nr, counters[nr]); 3873 } 3874 } 3875 3876 static void show_purge_info(struct seq_file *m) 3877 { 3878 struct vmap_area *va; 3879 3880 spin_lock(&purge_vmap_area_lock); 3881 list_for_each_entry(va, &purge_vmap_area_list, list) { 3882 seq_printf(m, "0x%pK-0x%pK %7ld unpurged vm_area\n", 3883 (void *)va->va_start, (void *)va->va_end, 3884 va->va_end - va->va_start); 3885 } 3886 spin_unlock(&purge_vmap_area_lock); 3887 } 3888 3889 static int s_show(struct seq_file *m, void *p) 3890 { 3891 struct vmap_area *va; 3892 struct vm_struct *v; 3893 3894 va = list_entry(p, struct vmap_area, list); 3895 3896 /* 3897 * s_show can encounter race with remove_vm_area, !vm on behalf 3898 * of vmap area is being tear down or vm_map_ram allocation. 
3899 */ 3900 if (!va->vm) { 3901 seq_printf(m, "0x%pK-0x%pK %7ld vm_map_ram\n", 3902 (void *)va->va_start, (void *)va->va_end, 3903 va->va_end - va->va_start); 3904 3905 return 0; 3906 } 3907 3908 v = va->vm; 3909 3910 seq_printf(m, "0x%pK-0x%pK %7ld", 3911 v->addr, v->addr + v->size, v->size); 3912 3913 if (v->caller) 3914 seq_printf(m, " %pS", v->caller); 3915 3916 if (v->nr_pages) 3917 seq_printf(m, " pages=%d", v->nr_pages); 3918 3919 if (v->phys_addr) 3920 seq_printf(m, " phys=%pa", &v->phys_addr); 3921 3922 if (v->flags & VM_IOREMAP) 3923 seq_puts(m, " ioremap"); 3924 3925 if (v->flags & VM_ALLOC) 3926 seq_puts(m, " vmalloc"); 3927 3928 if (v->flags & VM_MAP) 3929 seq_puts(m, " vmap"); 3930 3931 if (v->flags & VM_USERMAP) 3932 seq_puts(m, " user"); 3933 3934 if (v->flags & VM_DMA_COHERENT) 3935 seq_puts(m, " dma-coherent"); 3936 3937 if (is_vmalloc_addr(v->pages)) 3938 seq_puts(m, " vpages"); 3939 3940 show_numa_info(m, v); 3941 seq_putc(m, '\n'); 3942 3943 /* 3944 * As a final step, dump "unpurged" areas. 3945 */ 3946 if (list_is_last(&va->list, &vmap_area_list)) 3947 show_purge_info(m); 3948 3949 return 0; 3950 } 3951 3952 static const struct seq_operations vmalloc_op = { 3953 .start = s_start, 3954 .next = s_next, 3955 .stop = s_stop, 3956 .show = s_show, 3957 }; 3958 3959 static int __init proc_vmalloc_init(void) 3960 { 3961 if (IS_ENABLED(CONFIG_NUMA)) 3962 proc_create_seq_private("vmallocinfo", 0400, NULL, 3963 &vmalloc_op, 3964 nr_node_ids * sizeof(unsigned int), NULL); 3965 else 3966 proc_create_seq("vmallocinfo", 0400, NULL, &vmalloc_op); 3967 return 0; 3968 } 3969 module_init(proc_vmalloc_init); 3970 3971 #endif 3972