// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 1993 Linus Torvalds
 * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
 * SMP-safe vmalloc/vfree/ioremap, Tigran Aivazian <tigran@veritas.com>, May 2000
 * Major rework to support vmap/vunmap, Christoph Hellwig, SGI, August 2002
 * Numa awareness, Christoph Lameter, SGI, June 2005
 * Improving global KVA allocator, Uladzislau Rezki, Sony, May 2019
 */

#include <linux/vmalloc.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/highmem.h>
#include <linux/sched/signal.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/interrupt.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/set_memory.h>
#include <linux/debugobjects.h>
#include <linux/kallsyms.h>
#include <linux/list.h>
#include <linux/notifier.h>
#include <linux/rbtree.h>
#include <linux/xarray.h>
#include <linux/io.h>
#include <linux/rcupdate.h>
#include <linux/pfn.h>
#include <linux/kmemleak.h>
#include <linux/atomic.h>
#include <linux/compiler.h>
#include <linux/memcontrol.h>
#include <linux/llist.h>
#include <linux/uio.h>
#include <linux/bitops.h>
#include <linux/rbtree_augmented.h>
#include <linux/overflow.h>
#include <linux/pgtable.h>
#include <linux/hugetlb.h>
#include <linux/sched/mm.h>
#include <asm/tlbflush.h>
#include <asm/shmparam.h>
#include <linux/page_owner.h>

#define CREATE_TRACE_POINTS
#include <trace/events/vmalloc.h>

#include "internal.h"
#include "pgalloc-track.h"

#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
static unsigned int __ro_after_init ioremap_max_page_shift = BITS_PER_LONG - 1;

static int __init set_nohugeiomap(char *str)
{
	ioremap_max_page_shift = PAGE_SHIFT;
	return 0;
}
early_param("nohugeiomap", set_nohugeiomap);
#else /* CONFIG_HAVE_ARCH_HUGE_VMAP */
static const unsigned int ioremap_max_page_shift = PAGE_SHIFT;
#endif	/* CONFIG_HAVE_ARCH_HUGE_VMAP */

#ifdef CONFIG_HAVE_ARCH_HUGE_VMALLOC
static bool __ro_after_init vmap_allow_huge = true;

static int __init set_nohugevmalloc(char *str)
{
	vmap_allow_huge = false;
	return 0;
}
early_param("nohugevmalloc", set_nohugevmalloc);
#else /* CONFIG_HAVE_ARCH_HUGE_VMALLOC */
static const bool vmap_allow_huge = false;
#endif	/* CONFIG_HAVE_ARCH_HUGE_VMALLOC */

bool is_vmalloc_addr(const void *x)
{
	unsigned long addr = (unsigned long)kasan_reset_tag(x);

	return addr >= VMALLOC_START && addr < VMALLOC_END;
}
EXPORT_SYMBOL(is_vmalloc_addr);
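
/*
 * Editor's note (illustrative sketch, not part of the original file): the
 * range check above is what lets generic helpers route a pointer back to
 * the allocator it came from. A minimal, hypothetical dispatch in the
 * spirit of kvfree() could look like this:
 */
static __maybe_unused void example_free_by_origin(const void *addr)
{
	if (is_vmalloc_addr(addr))
		vfree(addr);	/* came from vmalloc()/vmap() */
	else
		kfree(addr);	/* came from the slab allocator */
}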

struct vfree_deferred {
	struct llist_head list;
	struct work_struct wq;
};
static DEFINE_PER_CPU(struct vfree_deferred, vfree_deferred);

/*** Page table manipulation functions ***/
static int vmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
			phys_addr_t phys_addr, pgprot_t prot,
			unsigned int max_page_shift, pgtbl_mod_mask *mask)
{
	pte_t *pte;
	u64 pfn;
	struct page *page;
	unsigned long size = PAGE_SIZE;

	if (WARN_ON_ONCE(!PAGE_ALIGNED(end - addr)))
		return -EINVAL;

	pfn = phys_addr >> PAGE_SHIFT;
	pte = pte_alloc_kernel_track(pmd, addr, mask);
	if (!pte)
		return -ENOMEM;

	arch_enter_lazy_mmu_mode();

	do {
		if (unlikely(!pte_none(ptep_get(pte)))) {
			if (pfn_valid(pfn)) {
				page = pfn_to_page(pfn);
				dump_page(page, "remapping already mapped page");
			}
			BUG();
		}

#ifdef CONFIG_HUGETLB_PAGE
		size = arch_vmap_pte_range_map_size(addr, end, pfn, max_page_shift);
		if (size != PAGE_SIZE) {
			pte_t entry = pfn_pte(pfn, prot);

			entry = arch_make_huge_pte(entry, ilog2(size), 0);
			set_huge_pte_at(&init_mm, addr, pte, entry, size);
			pfn += PFN_DOWN(size);
			continue;
		}
#endif
		set_pte_at(&init_mm, addr, pte, pfn_pte(pfn, prot));
		pfn++;
	} while (pte += PFN_DOWN(size), addr += size, addr != end);

	arch_leave_lazy_mmu_mode();
	*mask |= PGTBL_PTE_MODIFIED;
	return 0;
}

static int vmap_try_huge_pmd(pmd_t *pmd, unsigned long addr, unsigned long end,
			phys_addr_t phys_addr, pgprot_t prot,
			unsigned int max_page_shift)
{
	if (max_page_shift < PMD_SHIFT)
		return 0;

	if (!arch_vmap_pmd_supported(prot))
		return 0;

	if ((end - addr) != PMD_SIZE)
		return 0;

	if (!IS_ALIGNED(addr, PMD_SIZE))
		return 0;

	if (!IS_ALIGNED(phys_addr, PMD_SIZE))
		return 0;

	if (pmd_present(*pmd) && !pmd_free_pte_page(pmd, addr))
		return 0;

	return pmd_set_huge(pmd, phys_addr, prot);
}

static int vmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
			phys_addr_t phys_addr, pgprot_t prot,
			unsigned int max_page_shift, pgtbl_mod_mask *mask)
{
	pmd_t *pmd;
	unsigned long next;
	int err = 0;

	pmd = pmd_alloc_track(&init_mm, pud, addr, mask);
	if (!pmd)
		return -ENOMEM;
	do {
		next = pmd_addr_end(addr, end);

		if (vmap_try_huge_pmd(pmd, addr, next, phys_addr, prot,
					max_page_shift)) {
			*mask |= PGTBL_PMD_MODIFIED;
			continue;
		}

		err = vmap_pte_range(pmd, addr, next, phys_addr, prot, max_page_shift, mask);
		if (err)
			break;
	} while (pmd++, phys_addr += (next - addr), addr = next, addr != end);
	return err;
}

static int vmap_try_huge_pud(pud_t *pud, unsigned long addr, unsigned long end,
			phys_addr_t phys_addr, pgprot_t prot,
			unsigned int max_page_shift)
{
	if (max_page_shift < PUD_SHIFT)
		return 0;

	if (!arch_vmap_pud_supported(prot))
		return 0;

	if ((end - addr) != PUD_SIZE)
		return 0;

	if (!IS_ALIGNED(addr, PUD_SIZE))
		return 0;

	if (!IS_ALIGNED(phys_addr, PUD_SIZE))
		return 0;

	if (pud_present(*pud) && !pud_free_pmd_page(pud, addr))
		return 0;

	return pud_set_huge(pud, phys_addr, prot);
}

static int vmap_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end,
			phys_addr_t phys_addr, pgprot_t prot,
			unsigned int max_page_shift, pgtbl_mod_mask *mask)
{
	pud_t *pud;
	unsigned long next;
	int err = 0;

	pud = pud_alloc_track(&init_mm, p4d, addr, mask);
	if (!pud)
		return -ENOMEM;
	do {
		next = pud_addr_end(addr, end);

		if (vmap_try_huge_pud(pud, addr, next, phys_addr, prot,
					max_page_shift)) {
			*mask |= PGTBL_PUD_MODIFIED;
			continue;
		}

		err = vmap_pmd_range(pud, addr, next, phys_addr, prot, max_page_shift, mask);
		if (err)
			break;
	} while (pud++, phys_addr += (next - addr), addr = next, addr != end);
	return err;
}

static int vmap_try_huge_p4d(p4d_t *p4d, unsigned long addr, unsigned long end,
			phys_addr_t phys_addr, pgprot_t prot,
			unsigned int max_page_shift)
{
	if (max_page_shift < P4D_SHIFT)
		return 0;

	if (!arch_vmap_p4d_supported(prot))
		return 0;

	if ((end - addr) != P4D_SIZE)
		return 0;

	if (!IS_ALIGNED(addr, P4D_SIZE))
		return 0;

	if (!IS_ALIGNED(phys_addr, P4D_SIZE))
		return 0;

	if (p4d_present(*p4d) && !p4d_free_pud_page(p4d, addr))
		return 0;

	return p4d_set_huge(p4d, phys_addr, prot);
}

static int vmap_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end,
			phys_addr_t phys_addr, pgprot_t prot,
			unsigned int max_page_shift, pgtbl_mod_mask *mask)
{
	p4d_t *p4d;
	unsigned long next;
	int err = 0;

	p4d = p4d_alloc_track(&init_mm, pgd, addr, mask);
	if (!p4d)
		return -ENOMEM;
	do {
		next = p4d_addr_end(addr, end);

		if (vmap_try_huge_p4d(p4d, addr, next, phys_addr, prot,
					max_page_shift)) {
			*mask |= PGTBL_P4D_MODIFIED;
			continue;
		}

		err = vmap_pud_range(p4d, addr, next, phys_addr, prot, max_page_shift, mask);
		if (err)
			break;
	} while (p4d++, phys_addr += (next - addr), addr = next, addr != end);
	return err;
}

static int vmap_range_noflush(unsigned long addr, unsigned long end,
			phys_addr_t phys_addr, pgprot_t prot,
			unsigned int max_page_shift)
{
	pgd_t *pgd;
	unsigned long start;
	unsigned long next;
	int err;
	pgtbl_mod_mask mask = 0;

	might_sleep();
	BUG_ON(addr >= end);

	start = addr;
	pgd = pgd_offset_k(addr);
	do {
		next = pgd_addr_end(addr, end);
		err = vmap_p4d_range(pgd, addr, next, phys_addr, prot,
					max_page_shift, &mask);
		if (err)
			break;
	} while (pgd++, phys_addr += (next - addr), addr = next, addr != end);

	if (mask & ARCH_PAGE_TABLE_SYNC_MASK)
		arch_sync_kernel_mappings(start, end);

	return err;
}

int vmap_page_range(unsigned long addr, unsigned long end,
		    phys_addr_t phys_addr, pgprot_t prot)
{
	int err;

	err = vmap_range_noflush(addr, end, phys_addr, pgprot_nx(prot),
				 ioremap_max_page_shift);
	flush_cache_vmap(addr, end);
	if (!err)
		err = kmsan_ioremap_page_range(addr, end, phys_addr, prot,
					       ioremap_max_page_shift);
	return err;
}

int ioremap_page_range(unsigned long addr, unsigned long end,
		phys_addr_t phys_addr, pgprot_t prot)
{
	struct vm_struct *area;

	area = find_vm_area((void *)addr);
	if (!area || !(area->flags & VM_IOREMAP)) {
		WARN_ONCE(1, "vm_area at addr %lx is not marked as VM_IOREMAP\n", addr);
		return -EINVAL;
	}
	if (addr != (unsigned long)area->addr ||
	    (void *)end != area->addr + get_vm_area_size(area)) {
		WARN_ONCE(1, "ioremap request [%lx,%lx) doesn't match vm_area [%lx, %lx)\n",
			  addr, end, (long)area->addr,
			  (long)area->addr + get_vm_area_size(area));
		return -ERANGE;
	}
	return vmap_page_range(addr, end, phys_addr, prot);
}

static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
			     pgtbl_mod_mask *mask)
{
	pte_t *pte;
	pte_t ptent;
	unsigned long size = PAGE_SIZE;

	pte = pte_offset_kernel(pmd, addr);
	arch_enter_lazy_mmu_mode();

	do {
#ifdef CONFIG_HUGETLB_PAGE
		size = arch_vmap_pte_range_unmap_size(addr, pte);
		if (size != PAGE_SIZE) {
			if (WARN_ON(!IS_ALIGNED(addr, size))) {
				addr = ALIGN_DOWN(addr, size);
				pte = PTR_ALIGN_DOWN(pte, sizeof(*pte) * (size >> PAGE_SHIFT));
			}
			ptent = huge_ptep_get_and_clear(&init_mm, addr, pte, size);
			if (WARN_ON(end - addr < size))
				size = end - addr;
		} else
#endif
			ptent = ptep_get_and_clear(&init_mm, addr, pte);
		WARN_ON(!pte_none(ptent) && !pte_present(ptent));
	} while (pte += (size >> PAGE_SHIFT), addr += size, addr != end);

	arch_leave_lazy_mmu_mode();
	*mask |= PGTBL_PTE_MODIFIED;
}

static void vunmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
			     pgtbl_mod_mask *mask)
{
	pmd_t *pmd;
	unsigned long next;
	int cleared;

	pmd = pmd_offset(pud, addr);
	do {
		next = pmd_addr_end(addr, end);

		cleared = pmd_clear_huge(pmd);
		if (cleared || pmd_bad(*pmd))
			*mask |= PGTBL_PMD_MODIFIED;

		if (cleared) {
			WARN_ON(next - addr < PMD_SIZE);
			continue;
		}
		if (pmd_none_or_clear_bad(pmd))
			continue;
		vunmap_pte_range(pmd, addr, next, mask);

		cond_resched();
	} while (pmd++, addr = next, addr != end);
}

static void vunmap_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end,
			     pgtbl_mod_mask *mask)
{
	pud_t *pud;
	unsigned long next;
	int cleared;

	pud = pud_offset(p4d, addr);
	do {
		next = pud_addr_end(addr, end);

		cleared = pud_clear_huge(pud);
		if (cleared || pud_bad(*pud))
			*mask |= PGTBL_PUD_MODIFIED;

		if (cleared) {
			WARN_ON(next - addr < PUD_SIZE);
			continue;
		}
		if (pud_none_or_clear_bad(pud))
			continue;
		vunmap_pmd_range(pud, addr, next, mask);
	} while (pud++, addr = next, addr != end);
}

static void vunmap_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end,
			     pgtbl_mod_mask *mask)
{
	p4d_t *p4d;
	unsigned long next;

	p4d = p4d_offset(pgd, addr);
	do {
		next = p4d_addr_end(addr, end);

		p4d_clear_huge(p4d);
		if (p4d_bad(*p4d))
			*mask |= PGTBL_P4D_MODIFIED;

		if (p4d_none_or_clear_bad(p4d))
			continue;
		vunmap_pud_range(p4d, addr, next, mask);
	} while (p4d++, addr = next, addr != end);
}

/*
 * vunmap_range_noflush is similar to vunmap_range, but does not
 * flush caches or TLBs.
 *
 * The caller is responsible for calling flush_cache_vunmap() before calling
 * this function, and flush_tlb_kernel_range() after it has returned
 * successfully (and before the addresses are expected to cause a page fault
 * or be re-mapped for something else, if TLB flushes are being delayed or
 * coalesced).
 *
 * This is an internal function only. Do not use outside mm/.
 */
void __vunmap_range_noflush(unsigned long start, unsigned long end)
{
	unsigned long next;
	pgd_t *pgd;
	unsigned long addr = start;
	pgtbl_mod_mask mask = 0;

	BUG_ON(addr >= end);
	pgd = pgd_offset_k(addr);
	do {
		next = pgd_addr_end(addr, end);
		if (pgd_bad(*pgd))
			mask |= PGTBL_PGD_MODIFIED;
		if (pgd_none_or_clear_bad(pgd))
			continue;
		vunmap_p4d_range(pgd, addr, next, &mask);
	} while (pgd++, addr = next, addr != end);

	if (mask & ARCH_PAGE_TABLE_SYNC_MASK)
		arch_sync_kernel_mappings(start, end);
}

void vunmap_range_noflush(unsigned long start, unsigned long end)
{
	kmsan_vunmap_range_noflush(start, end);
	__vunmap_range_noflush(start, end);
}
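
/*
 * Editor's note (illustrative sketch, not part of the original file): the
 * comment above spells out the contract of the *_noflush variant. A
 * hypothetical caller batching two unmaps and issuing one coalesced TLB
 * flush over the union could look like this:
 */
static __maybe_unused void example_batched_vunmap(unsigned long a_start, unsigned long a_end,
						  unsigned long b_start, unsigned long b_end)
{
	/* Architectures that need it flush the virtual cache first. */
	flush_cache_vunmap(a_start, a_end);
	flush_cache_vunmap(b_start, b_end);

	vunmap_range_noflush(a_start, a_end);
	vunmap_range_noflush(b_start, b_end);

	/* One TLB flush covering both ranges. */
	flush_tlb_kernel_range(min(a_start, b_start), max(a_end, b_end));
}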

/**
 * vunmap_range - unmap kernel virtual addresses
 * @addr: start of the VM area to unmap
 * @end: end of the VM area to unmap (non-inclusive)
 *
 * Clears any present PTEs in the virtual address range, flushes TLBs and
 * caches. Any subsequent access to the address before it has been re-mapped
 * is a kernel bug.
 */
void vunmap_range(unsigned long addr, unsigned long end)
{
	flush_cache_vunmap(addr, end);
	vunmap_range_noflush(addr, end);
	flush_tlb_kernel_range(addr, end);
}

static int vmap_pages_pte_range(pmd_t *pmd, unsigned long addr,
		unsigned long end, pgprot_t prot, struct page **pages, int *nr,
		pgtbl_mod_mask *mask)
{
	int err = 0;
	pte_t *pte;

	/*
	 * nr is a running index into the array which helps higher level
	 * callers keep track of where we're up to.
	 */

	pte = pte_alloc_kernel_track(pmd, addr, mask);
	if (!pte)
		return -ENOMEM;

	arch_enter_lazy_mmu_mode();

	do {
		struct page *page = pages[*nr];

		if (WARN_ON(!pte_none(ptep_get(pte)))) {
			err = -EBUSY;
			break;
		}
		if (WARN_ON(!page)) {
			err = -ENOMEM;
			break;
		}
		if (WARN_ON(!pfn_valid(page_to_pfn(page)))) {
			err = -EINVAL;
			break;
		}

		set_pte_at(&init_mm, addr, pte, mk_pte(page, prot));
		(*nr)++;
	} while (pte++, addr += PAGE_SIZE, addr != end);

	arch_leave_lazy_mmu_mode();
	*mask |= PGTBL_PTE_MODIFIED;

	return err;
}

static int vmap_pages_pmd_range(pud_t *pud, unsigned long addr,
		unsigned long end, pgprot_t prot, struct page **pages, int *nr,
		pgtbl_mod_mask *mask)
{
	pmd_t *pmd;
	unsigned long next;

	pmd = pmd_alloc_track(&init_mm, pud, addr, mask);
	if (!pmd)
		return -ENOMEM;
	do {
		next = pmd_addr_end(addr, end);
		if (vmap_pages_pte_range(pmd, addr, next, prot, pages, nr, mask))
			return -ENOMEM;
	} while (pmd++, addr = next, addr != end);
	return 0;
}

static int vmap_pages_pud_range(p4d_t *p4d, unsigned long addr,
		unsigned long end, pgprot_t prot, struct page **pages, int *nr,
		pgtbl_mod_mask *mask)
{
	pud_t *pud;
	unsigned long next;

	pud = pud_alloc_track(&init_mm, p4d, addr, mask);
	if (!pud)
		return -ENOMEM;
	do {
		next = pud_addr_end(addr, end);
		if (vmap_pages_pmd_range(pud, addr, next, prot, pages, nr, mask))
			return -ENOMEM;
	} while (pud++, addr = next, addr != end);
	return 0;
}

static int vmap_pages_p4d_range(pgd_t *pgd, unsigned long addr,
		unsigned long end, pgprot_t prot, struct page **pages, int *nr,
		pgtbl_mod_mask *mask)
{
	p4d_t *p4d;
	unsigned long next;

	p4d = p4d_alloc_track(&init_mm, pgd, addr, mask);
	if (!p4d)
		return -ENOMEM;
	do {
		next = p4d_addr_end(addr, end);
		if (vmap_pages_pud_range(p4d, addr, next, prot, pages, nr, mask))
			return -ENOMEM;
	} while (p4d++, addr = next, addr != end);
	return 0;
}

static int vmap_small_pages_range_noflush(unsigned long addr, unsigned long end,
		pgprot_t prot, struct page **pages)
{
	unsigned long start = addr;
	pgd_t *pgd;
	unsigned long next;
	int err = 0;
	int nr = 0;
	pgtbl_mod_mask mask = 0;

	BUG_ON(addr >= end);
	pgd = pgd_offset_k(addr);
	do {
		next = pgd_addr_end(addr, end);
		if (pgd_bad(*pgd))
			mask |= PGTBL_PGD_MODIFIED;
		err = vmap_pages_p4d_range(pgd, addr, next, prot, pages, &nr, &mask);
		if (err)
			break;
	} while (pgd++, addr = next, addr != end);

	if (mask & ARCH_PAGE_TABLE_SYNC_MASK)
		arch_sync_kernel_mappings(start, end);

	return err;
}

/*
 * vmap_pages_range_noflush is similar to vmap_pages_range, but does not
 * flush caches.
 *
 * The caller is responsible for calling flush_cache_vmap() after this
 * function returns successfully and before the addresses are accessed.
 *
 * This is an internal function only. Do not use outside mm/.
 */
int __vmap_pages_range_noflush(unsigned long addr, unsigned long end,
		pgprot_t prot, struct page **pages, unsigned int page_shift)
{
	unsigned int i, nr = (end - addr) >> PAGE_SHIFT;

	WARN_ON(page_shift < PAGE_SHIFT);

	if (!IS_ENABLED(CONFIG_HAVE_ARCH_HUGE_VMALLOC) ||
			page_shift == PAGE_SHIFT)
		return vmap_small_pages_range_noflush(addr, end, prot, pages);

	for (i = 0; i < nr; i += 1U << (page_shift - PAGE_SHIFT)) {
		int err;

		err = vmap_range_noflush(addr, addr + (1UL << page_shift),
					page_to_phys(pages[i]), prot,
					page_shift);
		if (err)
			return err;

		addr += 1UL << page_shift;
	}

	return 0;
}

int vmap_pages_range_noflush(unsigned long addr, unsigned long end,
		pgprot_t prot, struct page **pages, unsigned int page_shift,
		gfp_t gfp_mask)
{
	int ret = kmsan_vmap_pages_range_noflush(addr, end, prot, pages,
						 page_shift, gfp_mask);

	if (ret)
		return ret;
	return __vmap_pages_range_noflush(addr, end, prot, pages, page_shift);
}

static int __vmap_pages_range(unsigned long addr, unsigned long end,
		pgprot_t prot, struct page **pages, unsigned int page_shift,
		gfp_t gfp_mask)
{
	int err;

	err = vmap_pages_range_noflush(addr, end, prot, pages, page_shift, gfp_mask);
	flush_cache_vmap(addr, end);
	return err;
}

/**
 * vmap_pages_range - map pages to a kernel virtual address
 * @addr: start of the VM area to map
 * @end: end of the VM area to map (non-inclusive)
 * @prot: page protection flags to use
 * @pages: pages to map (always PAGE_SIZE pages)
 * @page_shift: maximum shift that the pages may be mapped with, @pages must
 * be aligned and contiguous up to at least this shift.
 *
 * RETURNS:
 * 0 on success, -errno on failure.
 */
int vmap_pages_range(unsigned long addr, unsigned long end,
		pgprot_t prot, struct page **pages, unsigned int page_shift)
{
	return __vmap_pages_range(addr, end, prot, pages, page_shift, GFP_KERNEL);
}

static int check_sparse_vm_area(struct vm_struct *area, unsigned long start,
				unsigned long end)
{
	might_sleep();
	if (WARN_ON_ONCE(area->flags & VM_FLUSH_RESET_PERMS))
		return -EINVAL;
	if (WARN_ON_ONCE(area->flags & VM_NO_GUARD))
		return -EINVAL;
	if (WARN_ON_ONCE(!(area->flags & VM_SPARSE)))
		return -EINVAL;
	if ((end - start) >> PAGE_SHIFT > totalram_pages())
		return -E2BIG;
	if (start < (unsigned long)area->addr ||
	    (void *)end > area->addr + get_vm_area_size(area))
		return -ERANGE;
	return 0;
}

/**
 * vm_area_map_pages - map pages inside given sparse vm_area
 * @area: vm_area
 * @start: start address inside vm_area
 * @end: end address inside vm_area
 * @pages: pages to map (always PAGE_SIZE pages)
 */
int vm_area_map_pages(struct vm_struct *area, unsigned long start,
		      unsigned long end, struct page **pages)
{
	int err;

	err = check_sparse_vm_area(area, start, end);
	if (err)
		return err;

	return vmap_pages_range(start, end, PAGE_KERNEL, pages, PAGE_SHIFT);
}

/**
 * vm_area_unmap_pages - unmap pages inside given sparse vm_area
 * @area: vm_area
 * @start: start address inside vm_area
 * @end: end address inside vm_area
 */
void vm_area_unmap_pages(struct vm_struct *area, unsigned long start,
			 unsigned long end)
{
	if (check_sparse_vm_area(area, start, end))
		return;

	vunmap_range(start, end);
}
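
/*
 * Editor's note (illustrative sketch, not part of the original file): a
 * sparse vm_area is typically reserved once and then backed page by page
 * on demand. The hypothetical helper below assumes get_vm_area() and
 * free_vm_area() are used to create/destroy the VM_SPARSE area; error
 * handling is trimmed for brevity.
 */
static __maybe_unused int example_sparse_area_usage(void)
{
	struct vm_struct *area;
	struct page *page;
	unsigned long va;
	int err;

	area = get_vm_area(16 * PAGE_SIZE, VM_SPARSE);
	if (!area)
		return -ENOMEM;

	page = alloc_page(GFP_KERNEL);
	if (!page) {
		free_vm_area(area);
		return -ENOMEM;
	}

	/* Back just the first page of the sparse range, then drop it again. */
	va = (unsigned long)area->addr;
	err = vm_area_map_pages(area, va, va + PAGE_SIZE, &page);
	if (!err)
		vm_area_unmap_pages(area, va, va + PAGE_SIZE);

	__free_page(page);
	free_vm_area(area);
	return err;
}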

int is_vmalloc_or_module_addr(const void *x)
{
	/*
	 * ARM, x86-64 and sparc64 put modules in a special place,
	 * and fall back on vmalloc() if that fails. Others
	 * just put it in the vmalloc space.
	 */
#if defined(CONFIG_EXECMEM) && defined(MODULES_VADDR)
	unsigned long addr = (unsigned long)kasan_reset_tag(x);
	if (addr >= MODULES_VADDR && addr < MODULES_END)
		return 1;
#endif
	return is_vmalloc_addr(x);
}
EXPORT_SYMBOL_GPL(is_vmalloc_or_module_addr);

/*
 * Walk a vmap address to the struct page it maps. Huge vmap mappings will
 * return the tail page that corresponds to the base page address, which
 * matches small vmap mappings.
 */
struct page *vmalloc_to_page(const void *vmalloc_addr)
{
	unsigned long addr = (unsigned long) vmalloc_addr;
	struct page *page = NULL;
	pgd_t *pgd = pgd_offset_k(addr);
	p4d_t *p4d;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *ptep, pte;

	/*
	 * XXX we might need to change this if we add VIRTUAL_BUG_ON for
	 * architectures that do not vmalloc module space
	 */
	VIRTUAL_BUG_ON(!is_vmalloc_or_module_addr(vmalloc_addr));

	if (pgd_none(*pgd))
		return NULL;
	if (WARN_ON_ONCE(pgd_leaf(*pgd)))
		return NULL; /* XXX: no allowance for huge pgd */
	if (WARN_ON_ONCE(pgd_bad(*pgd)))
		return NULL;

	p4d = p4d_offset(pgd, addr);
	if (p4d_none(*p4d))
		return NULL;
	if (p4d_leaf(*p4d))
		return p4d_page(*p4d) + ((addr & ~P4D_MASK) >> PAGE_SHIFT);
	if (WARN_ON_ONCE(p4d_bad(*p4d)))
		return NULL;

	pud = pud_offset(p4d, addr);
	if (pud_none(*pud))
		return NULL;
	if (pud_leaf(*pud))
		return pud_page(*pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
	if (WARN_ON_ONCE(pud_bad(*pud)))
		return NULL;

	pmd = pmd_offset(pud, addr);
	if (pmd_none(*pmd))
		return NULL;
	if (pmd_leaf(*pmd))
		return pmd_page(*pmd) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
	if (WARN_ON_ONCE(pmd_bad(*pmd)))
		return NULL;

	ptep = pte_offset_kernel(pmd, addr);
	pte = ptep_get(ptep);
	if (pte_present(pte))
		page = pte_page(pte);

	return page;
}
EXPORT_SYMBOL(vmalloc_to_page);

/*
 * Map a vmalloc()-space virtual address to the physical page frame number.
 */
unsigned long vmalloc_to_pfn(const void *vmalloc_addr)
{
	return page_to_pfn(vmalloc_to_page(vmalloc_addr));
}
EXPORT_SYMBOL(vmalloc_to_pfn);
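
/*
 * Editor's note (illustrative sketch, not part of the original file):
 * vmalloc memory is only virtually contiguous, so a physical address has
 * to be derived per page. A hypothetical helper for a single address:
 */
static __maybe_unused phys_addr_t example_vmalloc_to_phys(const void *vmalloc_addr)
{
	/* Physical base of the backing page plus the offset within it. */
	return page_to_phys(vmalloc_to_page(vmalloc_addr)) +
	       offset_in_page(vmalloc_addr);
}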


/*** Global kva allocator ***/

#define DEBUG_AUGMENT_PROPAGATE_CHECK 0
#define DEBUG_AUGMENT_LOWEST_MATCH_CHECK 0


static DEFINE_SPINLOCK(free_vmap_area_lock);
static bool vmap_initialized __read_mostly;

/*
 * This kmem_cache is used for vmap_area objects. Instead of
 * allocating from slab we reuse an object from this cache to
 * make things faster. Especially in the "no edge" split of a
 * free block.
 */
static struct kmem_cache *vmap_area_cachep;

/*
 * This linked list is used in pair with free_vmap_area_root.
 * It gives O(1) access to prev/next to perform fast coalescing.
 */
static LIST_HEAD(free_vmap_area_list);

/*
 * This augmented red-black tree represents the free vmap space.
 * All vmap_area objects in this tree are sorted by va->va_start
 * address. It is used for allocation and merging when a vmap
 * object is released.
 *
 * Each vmap_area node contains the maximum available free block
 * size of its sub-tree, right or left. Therefore it is possible
 * to find the lowest match of a free area.
 */
static struct rb_root free_vmap_area_root = RB_ROOT;

/*
 * Preload a CPU with one object for the "no edge" split case. The
 * aim is to get rid of allocations from the atomic context, thus
 * to use more permissive allocation masks.
 */
static DEFINE_PER_CPU(struct vmap_area *, ne_fit_preload_node);

/*
 * This structure defines a single, solid model where a list and
 * rb-tree are part of one entity protected by the lock. Nodes are
 * sorted in ascending order, thus for O(1) access to left/right
 * neighbors a list is used as well as for sequential traversal.
 */
struct rb_list {
	struct rb_root root;
	struct list_head head;
	spinlock_t lock;
};

/*
 * A fast size storage contains VAs up to 1M size. A pool consists
 * of ready-to-go VAs of a certain size linked to each other. An
 * index into the pool array is derived from the VA size in pages,
 * i.e. a VA of N pages lands in pool[N - 1].
 */
#define MAX_VA_SIZE_PAGES	256

struct vmap_pool {
	struct list_head head;
	unsigned long len;
};

/*
 * An effective vmap-node logic. Users make use of nodes instead
 * of a global heap. It allows to balance access and mitigate
 * contention.
 */
static struct vmap_node {
	/* Simple size segregated storage. */
	struct vmap_pool pool[MAX_VA_SIZE_PAGES];
	spinlock_t pool_lock;
	bool skip_populate;

	/* Bookkeeping data of this node. */
	struct rb_list busy;
	struct rb_list lazy;

	/*
	 * Ready-to-free areas.
	 */
	struct list_head purge_list;
	struct work_struct purge_work;
	unsigned long nr_purged;
} single;

/*
 * Initial setup consists of one single node, i.e. balancing
 * is fully disabled. Later on, after vmap is initialized, these
 * parameters are updated based on the system capacity.
 */
static struct vmap_node *vmap_nodes = &single;
static __read_mostly unsigned int nr_vmap_nodes = 1;
static __read_mostly unsigned int vmap_zone_size = 1;

/* A simple iterator over all vmap-nodes. */
#define for_each_vmap_node(vn)	\
	for ((vn) = &vmap_nodes[0];	\
	     (vn) < &vmap_nodes[nr_vmap_nodes]; (vn)++)

static inline unsigned int
addr_to_node_id(unsigned long addr)
{
	return (addr / vmap_zone_size) % nr_vmap_nodes;
}

static inline struct vmap_node *
addr_to_node(unsigned long addr)
{
	return &vmap_nodes[addr_to_node_id(addr)];
}

static inline struct vmap_node *
id_to_node(unsigned int id)
{
	return &vmap_nodes[id % nr_vmap_nodes];
}

static inline unsigned int
node_to_id(struct vmap_node *node)
{
	/* Pointer arithmetic. */
	unsigned int id = node - vmap_nodes;

	if (likely(id < nr_vmap_nodes))
		return id;

	WARN_ONCE(1, "An address 0x%p is out-of-bounds.\n", node);
	return 0;
}

/*
 * We use the value 0 to represent "no node", that is why
 * an encoded value will be the node-id incremented by 1.
 * It is always greater than 0. A valid node_id which can
 * be encoded is [0:nr_vmap_nodes - 1]. If a passed node_id
 * is not valid 0 is returned.
 */
static unsigned int
encode_vn_id(unsigned int node_id)
{
	/* Can store U8_MAX [0:254] nodes. */
	if (node_id < nr_vmap_nodes)
		return (node_id + 1) << BITS_PER_BYTE;

	/* Warn and no node encoded. */
	WARN_ONCE(1, "Encode wrong node id (%u)\n", node_id);
	return 0;
}

/*
 * Returns the node-id extracted from an encoded value; the valid
 * range is [0:nr_vmap_nodes-1]. Otherwise nr_vmap_nodes is
 * returned if the extracted data is wrong.
 */
static unsigned int
decode_vn_id(unsigned int val)
{
	unsigned int node_id = (val >> BITS_PER_BYTE) - 1;

	/* Can store U8_MAX [0:254] nodes. */
	if (node_id < nr_vmap_nodes)
		return node_id;

	/* If it was _not_ zero, warn. */
	WARN_ONCE(node_id != UINT_MAX,
		"Decode wrong node id (%d)\n", node_id);

	return nr_vmap_nodes;
}

static bool
is_vn_id_valid(unsigned int node_id)
{
	if (node_id < nr_vmap_nodes)
		return true;

	return false;
}
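
/*
 * Editor's note (illustrative): a concrete round trip of the encoding
 * above, assuming nr_vmap_nodes > 3. encode_vn_id(3) yields
 * (3 + 1) << 8 = 0x400, and decode_vn_id(0x400) recovers
 * (0x400 >> 8) - 1 = 3. An encoded value of 0 decodes to UINT_MAX
 * internally and is reported as "no node" (nr_vmap_nodes) without
 * a warning.
 */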

static __always_inline unsigned long
va_size(struct vmap_area *va)
{
	return (va->va_end - va->va_start);
}

static __always_inline unsigned long
get_subtree_max_size(struct rb_node *node)
{
	struct vmap_area *va;

	va = rb_entry_safe(node, struct vmap_area, rb_node);
	return va ? va->subtree_max_size : 0;
}

RB_DECLARE_CALLBACKS_MAX(static, free_vmap_area_rb_augment_cb,
	struct vmap_area, rb_node, unsigned long, subtree_max_size, va_size)

static void reclaim_and_purge_vmap_areas(void);
static BLOCKING_NOTIFIER_HEAD(vmap_notify_list);
static void drain_vmap_area_work(struct work_struct *work);
static DECLARE_WORK(drain_vmap_work, drain_vmap_area_work);

static __cacheline_aligned_in_smp atomic_long_t nr_vmalloc_pages;
static __cacheline_aligned_in_smp atomic_long_t vmap_lazy_nr;

unsigned long vmalloc_nr_pages(void)
{
	return atomic_long_read(&nr_vmalloc_pages);
}

static struct vmap_area *__find_vmap_area(unsigned long addr, struct rb_root *root)
{
	struct rb_node *n = root->rb_node;

	addr = (unsigned long)kasan_reset_tag((void *)addr);

	while (n) {
		struct vmap_area *va;

		va = rb_entry(n, struct vmap_area, rb_node);
		if (addr < va->va_start)
			n = n->rb_left;
		else if (addr >= va->va_end)
			n = n->rb_right;
		else
			return va;
	}

	return NULL;
}

/* Look up the first VA which satisfies addr < va_end, NULL if none. */
static struct vmap_area *
__find_vmap_area_exceed_addr(unsigned long addr, struct rb_root *root)
{
	struct vmap_area *va = NULL;
	struct rb_node *n = root->rb_node;

	addr = (unsigned long)kasan_reset_tag((void *)addr);

	while (n) {
		struct vmap_area *tmp;

		tmp = rb_entry(n, struct vmap_area, rb_node);
		if (tmp->va_end > addr) {
			va = tmp;
			if (tmp->va_start <= addr)
				break;

			n = n->rb_left;
		} else
			n = n->rb_right;
	}

	return va;
}

/*
 * Returns the node where the first VA that satisfies addr < va_end resides.
 * On success the node is locked. The user is responsible for unlocking it
 * once the VA no longer needs to be accessed.
 *
 * Returns NULL if nothing is found.
 */
static struct vmap_node *
find_vmap_area_exceed_addr_lock(unsigned long addr, struct vmap_area **va)
{
	unsigned long va_start_lowest;
	struct vmap_node *vn;

repeat:
	va_start_lowest = 0;

	for_each_vmap_node(vn) {
		spin_lock(&vn->busy.lock);
		*va = __find_vmap_area_exceed_addr(addr, &vn->busy.root);

		if (*va)
			if (!va_start_lowest || (*va)->va_start < va_start_lowest)
				va_start_lowest = (*va)->va_start;
		spin_unlock(&vn->busy.lock);
	}

	/*
	 * Check that the found VA still exists, it might have gone away.
	 * In that case we repeat the search because the VA has been removed
	 * concurrently and we need to proceed to the next one, which is a
	 * rare case.
	 */
	if (va_start_lowest) {
		vn = addr_to_node(va_start_lowest);

		spin_lock(&vn->busy.lock);
		*va = __find_vmap_area(va_start_lowest, &vn->busy.root);

		if (*va)
			return vn;

		spin_unlock(&vn->busy.lock);
		goto repeat;
	}

	return NULL;
}

/*
 * This function returns the address of the parent node and its
 * left or right link for further processing.
 *
 * Otherwise NULL is returned. In that case all further steps
 * regarding the insertion of a conflicting, overlapping range
 * have to be declined and are actually considered a bug.
 */
static __always_inline struct rb_node **
find_va_links(struct vmap_area *va,
	struct rb_root *root, struct rb_node *from,
	struct rb_node **parent)
{
	struct vmap_area *tmp_va;
	struct rb_node **link;

	if (root) {
		link = &root->rb_node;
		if (unlikely(!*link)) {
			*parent = NULL;
			return link;
		}
	} else {
		link = &from;
	}

	/*
	 * Go to the bottom of the tree. When we hit the last point
	 * we end up with the parent rb_node and the correct direction,
	 * named "link" here, where the new va->rb_node will be attached.
	 */
	do {
		tmp_va = rb_entry(*link, struct vmap_area, rb_node);

		/*
		 * During the traversal we also do some sanity checks.
		 * Trigger the WARN() if there are partial (left/right)
		 * or full overlaps.
		 */
		if (va->va_end <= tmp_va->va_start)
			link = &(*link)->rb_left;
		else if (va->va_start >= tmp_va->va_end)
			link = &(*link)->rb_right;
		else {
			WARN(1, "vmalloc bug: 0x%lx-0x%lx overlaps with 0x%lx-0x%lx\n",
				va->va_start, va->va_end, tmp_va->va_start, tmp_va->va_end);

			return NULL;
		}
	} while (*link);

	*parent = &tmp_va->rb_node;
	return link;
}

static __always_inline struct list_head *
get_va_next_sibling(struct rb_node *parent, struct rb_node **link)
{
	struct list_head *list;

	if (unlikely(!parent))
		/*
		 * The red-black tree where we try to find VA neighbors
		 * before merging or inserting is empty, i.e. it means
		 * there is no free vmap space. Normally it does not
		 * happen but we handle this case anyway.
		 */
		return NULL;

	list = &rb_entry(parent, struct vmap_area, rb_node)->list;
	return (&parent->rb_right == link ? list->next : list);
}

static __always_inline void
__link_va(struct vmap_area *va, struct rb_root *root,
	struct rb_node *parent, struct rb_node **link,
	struct list_head *head, bool augment)
{
	/*
	 * VA is still not in the list, but we can
	 * identify its future previous list_head node.
	 */
	if (likely(parent)) {
		head = &rb_entry(parent, struct vmap_area, rb_node)->list;
		if (&parent->rb_right != link)
			head = head->prev;
	}

	/* Insert to the rb-tree */
	rb_link_node(&va->rb_node, parent, link);
	if (augment) {
		/*
		 * Some explanation here. Just perform a simple insertion
		 * into the tree. We do not set va->subtree_max_size to
		 * its current size before calling rb_insert_augmented().
		 * It is because we populate the tree from the bottom
		 * to parent levels when the node _is_ in the tree.
		 *
		 * Therefore we set subtree_max_size to zero after insertion,
		 * to let __augment_tree_propagate_from() put everything in
		 * the correct order later on.
		 */
		rb_insert_augmented(&va->rb_node,
			root, &free_vmap_area_rb_augment_cb);
		va->subtree_max_size = 0;
	} else {
		rb_insert_color(&va->rb_node, root);
	}

	/* Address-sort this list */
	list_add(&va->list, head);
}

static __always_inline void
link_va(struct vmap_area *va, struct rb_root *root,
	struct rb_node *parent, struct rb_node **link,
	struct list_head *head)
{
	__link_va(va, root, parent, link, head, false);
}

static __always_inline void
link_va_augment(struct vmap_area *va, struct rb_root *root,
	struct rb_node *parent, struct rb_node **link,
	struct list_head *head)
{
	__link_va(va, root, parent, link, head, true);
}

static __always_inline void
__unlink_va(struct vmap_area *va, struct rb_root *root, bool augment)
{
	if (WARN_ON(RB_EMPTY_NODE(&va->rb_node)))
		return;

	if (augment)
		rb_erase_augmented(&va->rb_node,
			root, &free_vmap_area_rb_augment_cb);
	else
		rb_erase(&va->rb_node, root);

	list_del_init(&va->list);
	RB_CLEAR_NODE(&va->rb_node);
}

static __always_inline void
unlink_va(struct vmap_area *va, struct rb_root *root)
{
	__unlink_va(va, root, false);
}

static __always_inline void
unlink_va_augment(struct vmap_area *va, struct rb_root *root)
{
	__unlink_va(va, root, true);
}

#if DEBUG_AUGMENT_PROPAGATE_CHECK
/*
 * Gets called when a node is removed and the tree is rotated.
 */
static __always_inline unsigned long
compute_subtree_max_size(struct vmap_area *va)
{
	return max3(va_size(va),
		get_subtree_max_size(va->rb_node.rb_left),
		get_subtree_max_size(va->rb_node.rb_right));
}

static void
augment_tree_propagate_check(void)
{
	struct vmap_area *va;
	unsigned long computed_size;

	list_for_each_entry(va, &free_vmap_area_list, list) {
		computed_size = compute_subtree_max_size(va);
		if (computed_size != va->subtree_max_size)
			pr_emerg("tree is corrupted: %lu, %lu\n",
				va_size(va), va->subtree_max_size);
	}
}
#endif

/*
 * This function populates subtree_max_size from bottom to upper
 * levels starting from the VA point. The propagation must be done
 * when the VA size is modified by changing its va_start/va_end, or
 * when a new VA is inserted into the tree.
 *
 * It means that __augment_tree_propagate_from() must be called:
 * - After VA has been inserted to the tree(free path);
 * - After VA has been shrunk(allocation path);
 * - After VA has been increased(merging path).
 *
 * Please note that it does not mean that upper parent nodes
 * and their subtree_max_size are recalculated all the time up
 * to the root node.
 *
 *       4--8
 *        /\
 *       /  \
 *      /    \
 *    2--2  8--8
 *
 * For example if we modify the node 4, shrinking it to 2, then
 * no modification is required. If we shrink the node 2 to 1
 * its subtree_max_size is updated only, and set to 1. If we shrink
 * the node 8 to 6, then its subtree_max_size is set to 6 and the
 * parent node becomes 4--6.
 */
static __always_inline void
augment_tree_propagate_from(struct vmap_area *va)
{
	/*
	 * Populate the tree from the bottom towards the root until
	 * the calculated maximum available size of a checked node
	 * is equal to its current one.
	 */
	free_vmap_area_rb_augment_cb_propagate(&va->rb_node, NULL);

#if DEBUG_AUGMENT_PROPAGATE_CHECK
	augment_tree_propagate_check();
#endif
}

static void
insert_vmap_area(struct vmap_area *va,
	struct rb_root *root, struct list_head *head)
{
	struct rb_node **link;
	struct rb_node *parent;

	link = find_va_links(va, root, NULL, &parent);
	if (link)
		link_va(va, root, parent, link, head);
}

static void
insert_vmap_area_augment(struct vmap_area *va,
	struct rb_node *from, struct rb_root *root,
	struct list_head *head)
{
	struct rb_node **link;
	struct rb_node *parent;

	if (from)
		link = find_va_links(va, NULL, from, &parent);
	else
		link = find_va_links(va, root, NULL, &parent);

	if (link) {
		link_va_augment(va, root, parent, link, head);
		augment_tree_propagate_from(va);
	}
}

/*
 * Merge a de-allocated chunk of VA memory with the previous
 * and next free blocks. If no coalescing is done, a new
 * free area is inserted. If the VA has been merged, it is
 * freed.
 *
 * Please note, it can return NULL in case of overlapping
 * ranges, followed by a WARN() report. Despite being buggy
 * behaviour, the system can stay alive and keep going.
 */
static __always_inline struct vmap_area *
__merge_or_add_vmap_area(struct vmap_area *va,
	struct rb_root *root, struct list_head *head, bool augment)
{
	struct vmap_area *sibling;
	struct list_head *next;
	struct rb_node **link;
	struct rb_node *parent;
	bool merged = false;

	/*
	 * Find a place in the tree where VA potentially will be
	 * inserted, unless it is merged with its sibling/siblings.
	 */
	link = find_va_links(va, root, NULL, &parent);
	if (!link)
		return NULL;

	/*
	 * Get next node of VA to check if merging can be done.
	 */
	next = get_va_next_sibling(parent, link);
	if (unlikely(next == NULL))
		goto insert;

	/*
	 * start            end
	 * |                |
	 * |<------VA------>|<-----Next----->|
	 *                  |                |
	 *                  start            end
	 */
	if (next != head) {
		sibling = list_entry(next, struct vmap_area, list);
		if (sibling->va_start == va->va_end) {
			sibling->va_start = va->va_start;

			/* Free vmap_area object. */
			kmem_cache_free(vmap_area_cachep, va);

			/* Point to the new merged area. */
			va = sibling;
			merged = true;
		}
	}

	/*
	 * start            end
	 * |                |
	 * |<-----Prev----->|<------VA------>|
	 *                  |                |
	 *                  start            end
	 */
	if (next->prev != head) {
		sibling = list_entry(next->prev, struct vmap_area, list);
		if (sibling->va_end == va->va_start) {
			/*
			 * If both neighbors are coalesced, it is important
			 * to unlink the "next" node first, followed by merging
			 * with "previous" one. Otherwise the tree might not be
			 * fully populated if a sibling's augmented value is
			 * "normalized" because of rotation operations.
			 */
			if (merged)
				__unlink_va(va, root, augment);

			sibling->va_end = va->va_end;

			/* Free vmap_area object. */
			kmem_cache_free(vmap_area_cachep, va);

			/* Point to the new merged area. */
			va = sibling;
			merged = true;
		}
	}

insert:
	if (!merged)
		__link_va(va, root, parent, link, head, augment);

	return va;
}

static __always_inline struct vmap_area *
merge_or_add_vmap_area(struct vmap_area *va,
	struct rb_root *root, struct list_head *head)
{
	return __merge_or_add_vmap_area(va, root, head, false);
}

static __always_inline struct vmap_area *
merge_or_add_vmap_area_augment(struct vmap_area *va,
	struct rb_root *root, struct list_head *head)
{
	va = __merge_or_add_vmap_area(va, root, head, true);
	if (va)
		augment_tree_propagate_from(va);

	return va;
}

static __always_inline bool
is_within_this_va(struct vmap_area *va, unsigned long size,
	unsigned long align, unsigned long vstart)
{
	unsigned long nva_start_addr;

	if (va->va_start > vstart)
		nva_start_addr = ALIGN(va->va_start, align);
	else
		nva_start_addr = ALIGN(vstart, align);

	/* Can be overflowed due to big size or alignment. */
	if (nva_start_addr + size < nva_start_addr ||
			nva_start_addr < vstart)
		return false;

	return (nva_start_addr + size <= va->va_end);
}

/*
 * Find the first free block (lowest start address) in the tree
 * that will accomplish the request corresponding to the passed
 * parameters. Please note, with an alignment bigger than PAGE_SIZE,
 * the search length is adjusted to account for the worst case
 * alignment overhead.
 */
static __always_inline struct vmap_area *
find_vmap_lowest_match(struct rb_root *root, unsigned long size,
	unsigned long align, unsigned long vstart, bool adjust_search_size)
{
	struct vmap_area *va;
	struct rb_node *node;
	unsigned long length;

	/* Start from the root. */
	node = root->rb_node;

	/* Adjust the search size for alignment overhead. */
	length = adjust_search_size ? size + align - 1 : size;

	while (node) {
		va = rb_entry(node, struct vmap_area, rb_node);

		if (get_subtree_max_size(node->rb_left) >= length &&
				vstart < va->va_start) {
			node = node->rb_left;
		} else {
			if (is_within_this_va(va, size, align, vstart))
				return va;

			/*
			 * Does not make sense to go deeper towards the right
			 * sub-tree if it does not have a free block that is
			 * equal to or bigger than the requested search length.
			 */
			if (get_subtree_max_size(node->rb_right) >= length) {
				node = node->rb_right;
				continue;
			}

			/*
			 * OK. We roll back and find the first right sub-tree
			 * that will satisfy the search criteria. It can happen
			 * due to a "vstart" restriction or an alignment overhead
			 * that is bigger than PAGE_SIZE.
			 */
			while ((node = rb_parent(node))) {
				va = rb_entry(node, struct vmap_area, rb_node);
				if (is_within_this_va(va, size, align, vstart))
					return va;

				if (get_subtree_max_size(node->rb_right) >= length &&
						vstart <= va->va_start) {
					/*
					 * Shift the vstart forward. Please note, we update it with
					 * the parent's start address adding "1" because we do not want
					 * to enter the same sub-tree after it has already been checked
					 * and no suitable free block was found there.
					 */
					vstart = va->va_start + 1;
					node = node->rb_right;
					break;
				}
			}
		}
	}

	return NULL;
}

#if DEBUG_AUGMENT_LOWEST_MATCH_CHECK
#include <linux/random.h>

static struct vmap_area *
find_vmap_lowest_linear_match(struct list_head *head, unsigned long size,
	unsigned long align, unsigned long vstart)
{
	struct vmap_area *va;

	list_for_each_entry(va, head, list) {
		if (!is_within_this_va(va, size, align, vstart))
			continue;

		return va;
	}

	return NULL;
}

static void
find_vmap_lowest_match_check(struct rb_root *root, struct list_head *head,
			     unsigned long size, unsigned long align)
{
	struct vmap_area *va_1, *va_2;
	unsigned long vstart;
	unsigned int rnd;

	get_random_bytes(&rnd, sizeof(rnd));
	vstart = VMALLOC_START + rnd;

	va_1 = find_vmap_lowest_match(root, size, align, vstart, false);
	va_2 = find_vmap_lowest_linear_match(head, size, align, vstart);

	if (va_1 != va_2)
		pr_emerg("not lowest: t: 0x%p, l: 0x%p, v: 0x%lx\n",
			va_1, va_2, vstart);
}
#endif

enum fit_type {
	NOTHING_FIT = 0,
	FL_FIT_TYPE = 1,	/* full fit */
	LE_FIT_TYPE = 2,	/* left edge fit */
	RE_FIT_TYPE = 3,	/* right edge fit */
	NE_FIT_TYPE = 4		/* no edge fit */
};

static __always_inline enum fit_type
classify_va_fit_type(struct vmap_area *va,
	unsigned long nva_start_addr, unsigned long size)
{
	enum fit_type type;

	/* Check if it is within VA. */
	if (nva_start_addr < va->va_start ||
			nva_start_addr + size > va->va_end)
		return NOTHING_FIT;

	/* Now classify. */
	if (va->va_start == nva_start_addr) {
		if (va->va_end == nva_start_addr + size)
			type = FL_FIT_TYPE;
		else
			type = LE_FIT_TYPE;
	} else if (va->va_end == nva_start_addr + size) {
		type = RE_FIT_TYPE;
	} else {
		type = NE_FIT_TYPE;
	}

	return type;
}
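
/*
 * Editor's note (illustrative): a concrete classification, assuming a free
 * VA spanning [0x1000, 0x5000). A request for 0x4000 bytes at 0x1000 is a
 * full fit (FL_FIT_TYPE); 0x1000 bytes at 0x1000 is a left-edge fit
 * (LE_FIT_TYPE); 0x1000 bytes at 0x4000 is a right-edge fit (RE_FIT_TYPE);
 * and 0x1000 bytes at 0x2000 is a no-edge fit (NE_FIT_TYPE), the only case
 * that requires a second vmap_area for the left remainder.
 */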

static __always_inline int
va_clip(struct rb_root *root, struct list_head *head,
		struct vmap_area *va, unsigned long nva_start_addr,
		unsigned long size)
{
	struct vmap_area *lva = NULL;
	enum fit_type type = classify_va_fit_type(va, nva_start_addr, size);

	if (type == FL_FIT_TYPE) {
		/*
		 * No need to split VA, it fully fits.
		 *
		 * |               |
		 * V      NVA      V
		 * |---------------|
		 */
		unlink_va_augment(va, root);
		kmem_cache_free(vmap_area_cachep, va);
	} else if (type == LE_FIT_TYPE) {
		/*
		 * Split left edge of fit VA.
		 *
		 * |       |
		 * V  NVA  V   R
		 * |-------|-------|
		 */
		va->va_start += size;
	} else if (type == RE_FIT_TYPE) {
		/*
		 * Split right edge of fit VA.
		 *
		 *         |       |
		 *     L   V  NVA  V
		 * |-------|-------|
		 */
		va->va_end = nva_start_addr;
	} else if (type == NE_FIT_TYPE) {
		/*
		 * Split no edge of fit VA.
		 *
		 *     |       |
		 *   L V  NVA  V R
		 * |---|-------|---|
		 */
		lva = __this_cpu_xchg(ne_fit_preload_node, NULL);
		if (unlikely(!lva)) {
			/*
			 * For the percpu allocator we do not do any pre-allocation
			 * and leave it as it is. The reason is it most likely
			 * never ends up with NE_FIT_TYPE splitting. In case of
			 * percpu allocations offsets and sizes are aligned to
			 * a fixed align request, i.e. RE_FIT_TYPE and FL_FIT_TYPE
			 * are its main fitting cases.
			 *
			 * There are a few exceptions though, as an example it is
			 * the first allocation (early boot up) when we have "one"
			 * big free space that has to be split.
			 *
			 * Also we can hit this path in case of regular "vmap"
			 * allocations, if "this" current CPU was not preloaded.
			 * See the comment in alloc_vmap_area() why. If so, then
			 * GFP_NOWAIT is used instead to get an extra object for
			 * split purposes. That is rare and most of the time does
			 * not occur.
			 *
			 * What happens if an allocation fails? Basically, an
			 * "overflow" path is triggered to purge lazily freed
			 * areas to free some memory, then the "retry" path is
			 * triggered to repeat one more time. See more details
			 * in alloc_vmap_area() function.
			 */
			lva = kmem_cache_alloc(vmap_area_cachep, GFP_NOWAIT);
			if (!lva)
				return -ENOMEM;
		}

		/*
		 * Build the remainder.
		 */
		lva->va_start = va->va_start;
		lva->va_end = nva_start_addr;

		/*
		 * Shrink this VA to remaining size.
		 */
		va->va_start = nva_start_addr + size;
	} else {
		return -EINVAL;
	}

	if (type != FL_FIT_TYPE) {
		augment_tree_propagate_from(va);

		if (lva)	/* type == NE_FIT_TYPE */
			insert_vmap_area_augment(lva, &va->rb_node, root, head);
	}

	return 0;
}

static unsigned long
va_alloc(struct vmap_area *va,
		struct rb_root *root, struct list_head *head,
		unsigned long size, unsigned long align,
		unsigned long vstart, unsigned long vend)
{
	unsigned long nva_start_addr;
	int ret;

	if (va->va_start > vstart)
		nva_start_addr = ALIGN(va->va_start, align);
	else
		nva_start_addr = ALIGN(vstart, align);

	/* Check the "vend" restriction. */
	if (nva_start_addr + size > vend)
		return -ERANGE;

	/* Update the free vmap_area. */
	ret = va_clip(root, head, va, nva_start_addr, size);
	if (WARN_ON_ONCE(ret))
		return ret;

	return nva_start_addr;
}

/*
 * Returns a start address of the newly allocated area, if success.
 * Otherwise an error value is returned that indicates failure.
 */
static __always_inline unsigned long
__alloc_vmap_area(struct rb_root *root, struct list_head *head,
	unsigned long size, unsigned long align,
	unsigned long vstart, unsigned long vend)
{
	bool adjust_search_size = true;
	unsigned long nva_start_addr;
	struct vmap_area *va;

	/*
	 * Do not adjust when:
	 *   a) align <= PAGE_SIZE, because it does not make any sense.
	 *      All blocks(their start addresses) are at least PAGE_SIZE
	 *      aligned anyway;
	 *   b) a short range where a requested size corresponds to exactly
	 *      specified [vstart:vend] interval and an alignment > PAGE_SIZE.
	 *      With adjusted search length an allocation would not succeed.
	 */
	if (align <= PAGE_SIZE || (align > PAGE_SIZE && (vend - vstart) == size))
		adjust_search_size = false;

	va = find_vmap_lowest_match(root, size, align, vstart, adjust_search_size);
	if (unlikely(!va))
		return -ENOENT;

	nva_start_addr = va_alloc(va, root, head, size, align, vstart, vend);

#if DEBUG_AUGMENT_LOWEST_MATCH_CHECK
	if (!IS_ERR_VALUE(nva_start_addr))
		find_vmap_lowest_match_check(root, head, size, align);
#endif

	return nva_start_addr;
}

/*
 * Free a region of KVA allocated by alloc_vmap_area
 */
static void free_vmap_area(struct vmap_area *va)
{
	struct vmap_node *vn = addr_to_node(va->va_start);

	/*
	 * Remove from the busy tree/list.
	 */
	spin_lock(&vn->busy.lock);
	unlink_va(va, &vn->busy.root);
	spin_unlock(&vn->busy.lock);

	/*
	 * Insert/Merge it back to the free tree/list.
	 */
	spin_lock(&free_vmap_area_lock);
	merge_or_add_vmap_area_augment(va, &free_vmap_area_root, &free_vmap_area_list);
	spin_unlock(&free_vmap_area_lock);
}

static inline void
preload_this_cpu_lock(spinlock_t *lock, gfp_t gfp_mask, int node)
{
	struct vmap_area *va = NULL, *tmp;

	/*
	 * Preload this CPU with one extra vmap_area object. It is used
	 * when the fit type of a free area is NE_FIT_TYPE. It guarantees
	 * that a CPU that does an allocation is preloaded.
	 *
	 * We do it in non-atomic context, which allows us to use more
	 * permissive allocation masks and be more stable under low memory
	 * conditions and high memory pressure.
	 */
	if (!this_cpu_read(ne_fit_preload_node))
		va = kmem_cache_alloc_node(vmap_area_cachep, gfp_mask, node);

	spin_lock(lock);

	tmp = NULL;
	if (va && !__this_cpu_try_cmpxchg(ne_fit_preload_node, &tmp, va))
		kmem_cache_free(vmap_area_cachep, va);
}

static struct vmap_pool *
size_to_va_pool(struct vmap_node *vn, unsigned long size)
{
	unsigned int idx = (size - 1) / PAGE_SIZE;

	if (idx < MAX_VA_SIZE_PAGES)
		return &vn->pool[idx];

	return NULL;
}

static bool
node_pool_add_va(struct vmap_node *n, struct vmap_area *va)
{
	struct vmap_pool *vp;

	vp = size_to_va_pool(n, va_size(va));
	if (!vp)
		return false;

	spin_lock(&n->pool_lock);
	list_add(&va->list, &vp->head);
	WRITE_ONCE(vp->len, vp->len + 1);
	spin_unlock(&n->pool_lock);

	return true;
}

static struct vmap_area *
node_pool_del_va(struct vmap_node *vn, unsigned long size,
		unsigned long align, unsigned long vstart,
		unsigned long vend)
{
	struct vmap_area *va = NULL;
	struct vmap_pool *vp;
	int err = 0;

	vp = size_to_va_pool(vn, size);
	if (!vp || list_empty(&vp->head))
		return NULL;

	spin_lock(&vn->pool_lock);
	if (!list_empty(&vp->head)) {
		va = list_first_entry(&vp->head, struct vmap_area, list);

		if (IS_ALIGNED(va->va_start, align)) {
			/*
			 * Do some sanity checks and emit a warning
			 * if one of the checks below detects an error.
			 */
			err |= (va_size(va) != size);
			err |= (va->va_start < vstart);
			err |= (va->va_end > vend);

			if (!WARN_ON_ONCE(err)) {
				list_del_init(&va->list);
				WRITE_ONCE(vp->len, vp->len - 1);
			} else {
				va = NULL;
			}
		} else {
			list_move_tail(&va->list, &vp->head);
			va = NULL;
		}
	}
	spin_unlock(&vn->pool_lock);

	return va;
}

static struct vmap_area *
node_alloc(unsigned long size, unsigned long align,
		unsigned long vstart, unsigned long vend,
		unsigned long *addr, unsigned int *vn_id)
{
	struct vmap_area *va;

	*vn_id = 0;
	*addr = -EINVAL;

	/*
	 * Fallback to the global heap if this is not a vmalloc request
	 * or there is only one node.
	 */
	if (vstart != VMALLOC_START || vend != VMALLOC_END ||
			nr_vmap_nodes == 1)
		return NULL;

	*vn_id = raw_smp_processor_id() % nr_vmap_nodes;
	va = node_pool_del_va(id_to_node(*vn_id), size, align, vstart, vend);
	*vn_id = encode_vn_id(*vn_id);

	if (va)
		*addr = va->va_start;

	return va;
}

static inline void setup_vmalloc_vm(struct vm_struct *vm,
	struct vmap_area *va, unsigned long flags, const void *caller)
{
	vm->flags = flags;
	vm->addr = (void *)va->va_start;
	vm->size = vm->requested_size = va_size(va);
	vm->caller = caller;
	va->vm = vm;
}

/*
 * Allocate a region of KVA of the specified size and alignment, within the
 * vstart and vend. If vm is passed in, the two will also be bound.
 */
static struct vmap_area *alloc_vmap_area(unsigned long size,
				unsigned long align,
				unsigned long vstart, unsigned long vend,
				int node, gfp_t gfp_mask,
				unsigned long va_flags, struct vm_struct *vm)
{
	struct vmap_node *vn;
	struct vmap_area *va;
	unsigned long freed;
	unsigned long addr;
	unsigned int vn_id;
	bool allow_block;
	int purged = 0;
	int ret;

	if (unlikely(!size || offset_in_page(size) || !is_power_of_2(align)))
		return ERR_PTR(-EINVAL);

	if (unlikely(!vmap_initialized))
		return ERR_PTR(-EBUSY);

	/* Only reclaim behaviour flags are relevant. */
	gfp_mask = gfp_mask & GFP_RECLAIM_MASK;
	allow_block = gfpflags_allow_blocking(gfp_mask);
	might_sleep_if(allow_block);

	/*
	 * If the lookup here fails and the VA is later obtained from the
	 * global heap, it is still marked with this "vn_id" so that it is
	 * returned to this node's pool later. This makes it possible to
	 * populate the pools based on user demand.
	 *
	 * On success a ready to go VA is returned.
	 */
	va = node_alloc(size, align, vstart, vend, &addr, &vn_id);
	if (!va) {
		va = kmem_cache_alloc_node(vmap_area_cachep, gfp_mask, node);
		if (unlikely(!va))
			return ERR_PTR(-ENOMEM);

		/*
		 * Only scan the relevant parts containing pointers to other objects
		 * to avoid false negatives.
		 */
		kmemleak_scan_area(&va->rb_node, SIZE_MAX, gfp_mask);
	}

retry:
	if (IS_ERR_VALUE(addr)) {
		preload_this_cpu_lock(&free_vmap_area_lock, gfp_mask, node);
		addr = __alloc_vmap_area(&free_vmap_area_root, &free_vmap_area_list,
			size, align, vstart, vend);
		spin_unlock(&free_vmap_area_lock);

		/*
		 * This is not a fast path. Check if yielding is needed. This
		 * is the only reschedule point in the vmalloc() path.
2081 */ 2082 if (allow_block) 2083 cond_resched(); 2084 } 2085 2086 trace_alloc_vmap_area(addr, size, align, vstart, vend, IS_ERR_VALUE(addr)); 2087 2088 /* 2089 * If an allocation fails, the error value is 2090 * returned. Therefore trigger the overflow path. 2091 */ 2092 if (IS_ERR_VALUE(addr)) { 2093 if (allow_block) 2094 goto overflow; 2095 2096 /* 2097 * We can not trigger any reclaim logic because 2098 * sleeping is not allowed, thus fail an allocation. 2099 */ 2100 goto out_free_va; 2101 } 2102 2103 va->va_start = addr; 2104 va->va_end = addr + size; 2105 va->vm = NULL; 2106 va->flags = (va_flags | vn_id); 2107 2108 if (vm) { 2109 vm->addr = (void *)va->va_start; 2110 vm->size = va_size(va); 2111 va->vm = vm; 2112 } 2113 2114 vn = addr_to_node(va->va_start); 2115 2116 spin_lock(&vn->busy.lock); 2117 insert_vmap_area(va, &vn->busy.root, &vn->busy.head); 2118 spin_unlock(&vn->busy.lock); 2119 2120 BUG_ON(!IS_ALIGNED(va->va_start, align)); 2121 BUG_ON(va->va_start < vstart); 2122 BUG_ON(va->va_end > vend); 2123 2124 ret = kasan_populate_vmalloc(addr, size, gfp_mask); 2125 if (ret) { 2126 free_vmap_area(va); 2127 return ERR_PTR(ret); 2128 } 2129 2130 return va; 2131 2132 overflow: 2133 if (!purged) { 2134 reclaim_and_purge_vmap_areas(); 2135 purged = 1; 2136 goto retry; 2137 } 2138 2139 freed = 0; 2140 blocking_notifier_call_chain(&vmap_notify_list, 0, &freed); 2141 2142 if (freed > 0) { 2143 purged = 0; 2144 goto retry; 2145 } 2146 2147 if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit()) 2148 pr_warn("vmalloc_node_range for size %lu failed: Address range restricted to %#lx - %#lx\n", 2149 size, vstart, vend); 2150 2151 out_free_va: 2152 kmem_cache_free(vmap_area_cachep, va); 2153 return ERR_PTR(-EBUSY); 2154 } 2155 2156 int register_vmap_purge_notifier(struct notifier_block *nb) 2157 { 2158 return blocking_notifier_chain_register(&vmap_notify_list, nb); 2159 } 2160 EXPORT_SYMBOL_GPL(register_vmap_purge_notifier); 2161 2162 int unregister_vmap_purge_notifier(struct notifier_block *nb) 2163 { 2164 return blocking_notifier_chain_unregister(&vmap_notify_list, nb); 2165 } 2166 EXPORT_SYMBOL_GPL(unregister_vmap_purge_notifier); 2167 2168 /* 2169 * lazy_max_pages is the maximum amount of virtual address space we gather up 2170 * before attempting to purge with a TLB flush. 2171 * 2172 * There is a tradeoff here: a larger number will cover more kernel page tables 2173 * and take slightly longer to purge, but it will linearly reduce the number of 2174 * global TLB flushes that must be performed. It would seem natural to scale 2175 * this number up linearly with the number of CPUs (because vmapping activity 2176 * could also scale linearly with the number of CPUs), however it is likely 2177 * that in practice, workloads might be constrained in other ways that mean 2178 * vmap activity will not scale linearly with CPUs. Also, I want to be 2179 * conservative and not introduce a big latency on huge systems, so go with 2180 * a less aggressive log scale. It will still be an improvement over the old 2181 * code, and it will be simple to change the scale factor if we find that it 2182 * becomes a problem on bigger systems. 2183 */ 2184 static unsigned long lazy_max_pages(void) 2185 { 2186 unsigned int log; 2187 2188 log = fls(num_online_cpus()); 2189 2190 return log * (32UL * 1024 * 1024 / PAGE_SIZE); 2191 } 2192 2193 /* 2194 * Serialize vmap purging. 
There is no actual critical section protected 2195 * by this lock, but we want to avoid concurrent calls for performance 2196 * reasons and to make the pcpu_get_vm_areas more deterministic. 2197 */ 2198 static DEFINE_MUTEX(vmap_purge_lock); 2199 2200 /* for per-CPU blocks */ 2201 static void purge_fragmented_blocks_allcpus(void); 2202 2203 static void 2204 reclaim_list_global(struct list_head *head) 2205 { 2206 struct vmap_area *va, *n; 2207 2208 if (list_empty(head)) 2209 return; 2210 2211 spin_lock(&free_vmap_area_lock); 2212 list_for_each_entry_safe(va, n, head, list) 2213 merge_or_add_vmap_area_augment(va, 2214 &free_vmap_area_root, &free_vmap_area_list); 2215 spin_unlock(&free_vmap_area_lock); 2216 } 2217 2218 static void 2219 decay_va_pool_node(struct vmap_node *vn, bool full_decay) 2220 { 2221 LIST_HEAD(decay_list); 2222 struct rb_root decay_root = RB_ROOT; 2223 struct vmap_area *va, *nva; 2224 unsigned long n_decay, pool_len; 2225 int i; 2226 2227 for (i = 0; i < MAX_VA_SIZE_PAGES; i++) { 2228 LIST_HEAD(tmp_list); 2229 2230 if (list_empty(&vn->pool[i].head)) 2231 continue; 2232 2233 /* Detach the pool, so no-one can access it. */ 2234 spin_lock(&vn->pool_lock); 2235 list_replace_init(&vn->pool[i].head, &tmp_list); 2236 spin_unlock(&vn->pool_lock); 2237 2238 pool_len = n_decay = vn->pool[i].len; 2239 WRITE_ONCE(vn->pool[i].len, 0); 2240 2241 /* Decay a pool by ~25% out of left objects. */ 2242 if (!full_decay) 2243 n_decay >>= 2; 2244 pool_len -= n_decay; 2245 2246 list_for_each_entry_safe(va, nva, &tmp_list, list) { 2247 if (!n_decay--) 2248 break; 2249 2250 list_del_init(&va->list); 2251 merge_or_add_vmap_area(va, &decay_root, &decay_list); 2252 } 2253 2254 /* 2255 * Attach the pool back if it has been partly decayed. 2256 * Please note, it is supposed that nobody(other contexts) 2257 * can populate the pool therefore a simple list replace 2258 * operation takes place here. 
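 *
 * Worked example (full_decay == false): a pool holding 8 areas gets
 * n_decay = 8 >> 2 = 2, so the first two detached entries are merged
 * back into the global free tree via the decay list, and the remaining
 * six are re-attached here with vp->len set to 6.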
2259 */ 2260 if (!list_empty(&tmp_list)) { 2261 spin_lock(&vn->pool_lock); 2262 list_replace_init(&tmp_list, &vn->pool[i].head); 2263 WRITE_ONCE(vn->pool[i].len, pool_len); 2264 spin_unlock(&vn->pool_lock); 2265 } 2266 } 2267 2268 reclaim_list_global(&decay_list); 2269 } 2270 2271 static void 2272 kasan_release_vmalloc_node(struct vmap_node *vn) 2273 { 2274 struct vmap_area *va; 2275 unsigned long start, end; 2276 2277 start = list_first_entry(&vn->purge_list, struct vmap_area, list)->va_start; 2278 end = list_last_entry(&vn->purge_list, struct vmap_area, list)->va_end; 2279 2280 list_for_each_entry(va, &vn->purge_list, list) { 2281 if (is_vmalloc_or_module_addr((void *) va->va_start)) 2282 kasan_release_vmalloc(va->va_start, va->va_end, 2283 va->va_start, va->va_end, 2284 KASAN_VMALLOC_PAGE_RANGE); 2285 } 2286 2287 kasan_release_vmalloc(start, end, start, end, KASAN_VMALLOC_TLB_FLUSH); 2288 } 2289 2290 static void purge_vmap_node(struct work_struct *work) 2291 { 2292 struct vmap_node *vn = container_of(work, 2293 struct vmap_node, purge_work); 2294 unsigned long nr_purged_pages = 0; 2295 struct vmap_area *va, *n_va; 2296 LIST_HEAD(local_list); 2297 2298 if (IS_ENABLED(CONFIG_KASAN_VMALLOC)) 2299 kasan_release_vmalloc_node(vn); 2300 2301 vn->nr_purged = 0; 2302 2303 list_for_each_entry_safe(va, n_va, &vn->purge_list, list) { 2304 unsigned long nr = va_size(va) >> PAGE_SHIFT; 2305 unsigned int vn_id = decode_vn_id(va->flags); 2306 2307 list_del_init(&va->list); 2308 2309 nr_purged_pages += nr; 2310 vn->nr_purged++; 2311 2312 if (is_vn_id_valid(vn_id) && !vn->skip_populate) 2313 if (node_pool_add_va(vn, va)) 2314 continue; 2315 2316 /* Go back to global. */ 2317 list_add(&va->list, &local_list); 2318 } 2319 2320 atomic_long_sub(nr_purged_pages, &vmap_lazy_nr); 2321 2322 reclaim_list_global(&local_list); 2323 } 2324 2325 /* 2326 * Purges all lazily-freed vmap areas. 2327 */ 2328 static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end, 2329 bool full_pool_decay) 2330 { 2331 unsigned long nr_purged_areas = 0; 2332 unsigned int nr_purge_helpers; 2333 static cpumask_t purge_nodes; 2334 unsigned int nr_purge_nodes; 2335 struct vmap_node *vn; 2336 int i; 2337 2338 lockdep_assert_held(&vmap_purge_lock); 2339 2340 /* 2341 * Use cpumask to mark which node has to be processed. 2342 */ 2343 purge_nodes = CPU_MASK_NONE; 2344 2345 for_each_vmap_node(vn) { 2346 INIT_LIST_HEAD(&vn->purge_list); 2347 vn->skip_populate = full_pool_decay; 2348 decay_va_pool_node(vn, full_pool_decay); 2349 2350 if (RB_EMPTY_ROOT(&vn->lazy.root)) 2351 continue; 2352 2353 spin_lock(&vn->lazy.lock); 2354 WRITE_ONCE(vn->lazy.root.rb_node, NULL); 2355 list_replace_init(&vn->lazy.head, &vn->purge_list); 2356 spin_unlock(&vn->lazy.lock); 2357 2358 start = min(start, list_first_entry(&vn->purge_list, 2359 struct vmap_area, list)->va_start); 2360 2361 end = max(end, list_last_entry(&vn->purge_list, 2362 struct vmap_area, list)->va_end); 2363 2364 cpumask_set_cpu(node_to_id(vn), &purge_nodes); 2365 } 2366 2367 nr_purge_nodes = cpumask_weight(&purge_nodes); 2368 if (nr_purge_nodes > 0) { 2369 flush_tlb_kernel_range(start, end); 2370 2371 /* One extra worker is per a lazy_max_pages() full set minus one. 
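 *
 * For example, if four nodes have lazy areas queued and vmap_lazy_nr
 * is three times lazy_max_pages(), nr_purge_helpers becomes
 * clamp(3, 1, 4) - 1 = 2: two nodes are handed to kworkers below and
 * the remaining two are purged directly in this context.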
*/ 2372 nr_purge_helpers = atomic_long_read(&vmap_lazy_nr) / lazy_max_pages(); 2373 nr_purge_helpers = clamp(nr_purge_helpers, 1U, nr_purge_nodes) - 1; 2374 2375 for_each_cpu(i, &purge_nodes) { 2376 vn = &vmap_nodes[i]; 2377 2378 if (nr_purge_helpers > 0) { 2379 INIT_WORK(&vn->purge_work, purge_vmap_node); 2380 2381 if (cpumask_test_cpu(i, cpu_online_mask)) 2382 schedule_work_on(i, &vn->purge_work); 2383 else 2384 schedule_work(&vn->purge_work); 2385 2386 nr_purge_helpers--; 2387 } else { 2388 vn->purge_work.func = NULL; 2389 purge_vmap_node(&vn->purge_work); 2390 nr_purged_areas += vn->nr_purged; 2391 } 2392 } 2393 2394 for_each_cpu(i, &purge_nodes) { 2395 vn = &vmap_nodes[i]; 2396 2397 if (vn->purge_work.func) { 2398 flush_work(&vn->purge_work); 2399 nr_purged_areas += vn->nr_purged; 2400 } 2401 } 2402 } 2403 2404 trace_purge_vmap_area_lazy(start, end, nr_purged_areas); 2405 return nr_purged_areas > 0; 2406 } 2407 2408 /* 2409 * Reclaim vmap areas by purging fragmented blocks and purge_vmap_area_list. 2410 */ 2411 static void reclaim_and_purge_vmap_areas(void) 2412 2413 { 2414 mutex_lock(&vmap_purge_lock); 2415 purge_fragmented_blocks_allcpus(); 2416 __purge_vmap_area_lazy(ULONG_MAX, 0, true); 2417 mutex_unlock(&vmap_purge_lock); 2418 } 2419 2420 static void drain_vmap_area_work(struct work_struct *work) 2421 { 2422 mutex_lock(&vmap_purge_lock); 2423 __purge_vmap_area_lazy(ULONG_MAX, 0, false); 2424 mutex_unlock(&vmap_purge_lock); 2425 } 2426 2427 /* 2428 * Free a vmap area, caller ensuring that the area has been unmapped, 2429 * unlinked and flush_cache_vunmap had been called for the correct 2430 * range previously. 2431 */ 2432 static void free_vmap_area_noflush(struct vmap_area *va) 2433 { 2434 unsigned long nr_lazy_max = lazy_max_pages(); 2435 unsigned long va_start = va->va_start; 2436 unsigned int vn_id = decode_vn_id(va->flags); 2437 struct vmap_node *vn; 2438 unsigned long nr_lazy; 2439 2440 if (WARN_ON_ONCE(!list_empty(&va->list))) 2441 return; 2442 2443 nr_lazy = atomic_long_add_return_relaxed(va_size(va) >> PAGE_SHIFT, 2444 &vmap_lazy_nr); 2445 2446 /* 2447 * If it was request by a certain node we would like to 2448 * return it to that node, i.e. its pool for later reuse. 2449 */ 2450 vn = is_vn_id_valid(vn_id) ? 2451 id_to_node(vn_id):addr_to_node(va->va_start); 2452 2453 spin_lock(&vn->lazy.lock); 2454 insert_vmap_area(va, &vn->lazy.root, &vn->lazy.head); 2455 spin_unlock(&vn->lazy.lock); 2456 2457 trace_free_vmap_area_noflush(va_start, nr_lazy, nr_lazy_max); 2458 2459 /* After this point, we may free va at any time */ 2460 if (unlikely(nr_lazy > nr_lazy_max)) 2461 schedule_work(&drain_vmap_work); 2462 } 2463 2464 /* 2465 * Free and unmap a vmap area 2466 */ 2467 static void free_unmap_vmap_area(struct vmap_area *va) 2468 { 2469 flush_cache_vunmap(va->va_start, va->va_end); 2470 vunmap_range_noflush(va->va_start, va->va_end); 2471 if (debug_pagealloc_enabled_static()) 2472 flush_tlb_kernel_range(va->va_start, va->va_end); 2473 2474 free_vmap_area_noflush(va); 2475 } 2476 2477 struct vmap_area *find_vmap_area(unsigned long addr) 2478 { 2479 struct vmap_node *vn; 2480 struct vmap_area *va; 2481 int i, j; 2482 2483 if (unlikely(!vmap_initialized)) 2484 return NULL; 2485 2486 /* 2487 * An addr_to_node_id(addr) converts an address to a node index 2488 * where a VA is located. If VA spans several zones and passed 2489 * addr is not the same as va->va_start, what is not common, we 2490 * may need to scan extra nodes. 
See an example: 2491 * 2492 * <----va----> 2493 * -|-----|-----|-----|-----|- 2494 * 1 2 0 1 2495 * 2496 * VA resides in node 1 whereas it spans 1, 2 an 0. If passed 2497 * addr is within 2 or 0 nodes we should do extra work. 2498 */ 2499 i = j = addr_to_node_id(addr); 2500 do { 2501 vn = &vmap_nodes[i]; 2502 2503 spin_lock(&vn->busy.lock); 2504 va = __find_vmap_area(addr, &vn->busy.root); 2505 spin_unlock(&vn->busy.lock); 2506 2507 if (va) 2508 return va; 2509 } while ((i = (i + nr_vmap_nodes - 1) % nr_vmap_nodes) != j); 2510 2511 return NULL; 2512 } 2513 2514 static struct vmap_area *find_unlink_vmap_area(unsigned long addr) 2515 { 2516 struct vmap_node *vn; 2517 struct vmap_area *va; 2518 int i, j; 2519 2520 /* 2521 * Check the comment in the find_vmap_area() about the loop. 2522 */ 2523 i = j = addr_to_node_id(addr); 2524 do { 2525 vn = &vmap_nodes[i]; 2526 2527 spin_lock(&vn->busy.lock); 2528 va = __find_vmap_area(addr, &vn->busy.root); 2529 if (va) 2530 unlink_va(va, &vn->busy.root); 2531 spin_unlock(&vn->busy.lock); 2532 2533 if (va) 2534 return va; 2535 } while ((i = (i + nr_vmap_nodes - 1) % nr_vmap_nodes) != j); 2536 2537 return NULL; 2538 } 2539 2540 /*** Per cpu kva allocator ***/ 2541 2542 /* 2543 * vmap space is limited especially on 32 bit architectures. Ensure there is 2544 * room for at least 16 percpu vmap blocks per CPU. 2545 */ 2546 /* 2547 * If we had a constant VMALLOC_START and VMALLOC_END, we'd like to be able 2548 * to #define VMALLOC_SPACE (VMALLOC_END-VMALLOC_START). Guess 2549 * instead (we just need a rough idea) 2550 */ 2551 #if BITS_PER_LONG == 32 2552 #define VMALLOC_SPACE (128UL*1024*1024) 2553 #else 2554 #define VMALLOC_SPACE (128UL*1024*1024*1024) 2555 #endif 2556 2557 #define VMALLOC_PAGES (VMALLOC_SPACE / PAGE_SIZE) 2558 #define VMAP_MAX_ALLOC BITS_PER_LONG /* 256K with 4K pages */ 2559 #define VMAP_BBMAP_BITS_MAX 1024 /* 4MB with 4K pages */ 2560 #define VMAP_BBMAP_BITS_MIN (VMAP_MAX_ALLOC*2) 2561 #define VMAP_MIN(x, y) ((x) < (y) ? (x) : (y)) /* can't use min() */ 2562 #define VMAP_MAX(x, y) ((x) > (y) ? (x) : (y)) /* can't use max() */ 2563 #define VMAP_BBMAP_BITS \ 2564 VMAP_MIN(VMAP_BBMAP_BITS_MAX, \ 2565 VMAP_MAX(VMAP_BBMAP_BITS_MIN, \ 2566 VMALLOC_PAGES / roundup_pow_of_two(NR_CPUS) / 16)) 2567 2568 #define VMAP_BLOCK_SIZE (VMAP_BBMAP_BITS * PAGE_SIZE) 2569 2570 /* 2571 * Purge threshold to prevent overeager purging of fragmented blocks for 2572 * regular operations: Purge if vb->free is less than 1/4 of the capacity. 2573 */ 2574 #define VMAP_PURGE_THRESHOLD (VMAP_BBMAP_BITS / 4) 2575 2576 #define VMAP_RAM 0x1 /* indicates vm_map_ram area*/ 2577 #define VMAP_BLOCK 0x2 /* mark out the vmap_block sub-type*/ 2578 #define VMAP_FLAGS_MASK 0x3 2579 2580 struct vmap_block_queue { 2581 spinlock_t lock; 2582 struct list_head free; 2583 2584 /* 2585 * An xarray requires an extra memory dynamically to 2586 * be allocated. If it is an issue, we can use rb-tree 2587 * instead. 
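 *
 * (For scale: with the guesses above on 64-bit, 4 KB pages and
 * NR_CPUS = 64, VMALLOC_PAGES is 32M, so VMAP_BBMAP_BITS clamps to its
 * 1024 maximum and each vmap block covers VMAP_BLOCK_SIZE = 4 MB of
 * address space.)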
2588 */ 2589 struct xarray vmap_blocks; 2590 }; 2591 2592 struct vmap_block { 2593 spinlock_t lock; 2594 struct vmap_area *va; 2595 unsigned long free, dirty; 2596 DECLARE_BITMAP(used_map, VMAP_BBMAP_BITS); 2597 unsigned long dirty_min, dirty_max; /*< dirty range */ 2598 struct list_head free_list; 2599 struct rcu_head rcu_head; 2600 struct list_head purge; 2601 unsigned int cpu; 2602 }; 2603 2604 /* Queue of free and dirty vmap blocks, for allocation and flushing purposes */ 2605 static DEFINE_PER_CPU(struct vmap_block_queue, vmap_block_queue); 2606 2607 /* 2608 * In order to fast access to any "vmap_block" associated with a 2609 * specific address, we use a hash. 2610 * 2611 * A per-cpu vmap_block_queue is used in both ways, to serialize 2612 * an access to free block chains among CPUs(alloc path) and it 2613 * also acts as a vmap_block hash(alloc/free paths). It means we 2614 * overload it, since we already have the per-cpu array which is 2615 * used as a hash table. When used as a hash a 'cpu' passed to 2616 * per_cpu() is not actually a CPU but rather a hash index. 2617 * 2618 * A hash function is addr_to_vb_xa() which hashes any address 2619 * to a specific index(in a hash) it belongs to. This then uses a 2620 * per_cpu() macro to access an array with generated index. 2621 * 2622 * An example: 2623 * 2624 * CPU_1 CPU_2 CPU_0 2625 * | | | 2626 * V V V 2627 * 0 10 20 30 40 50 60 2628 * |------|------|------|------|------|------|...<vmap address space> 2629 * CPU0 CPU1 CPU2 CPU0 CPU1 CPU2 2630 * 2631 * - CPU_1 invokes vm_unmap_ram(6), 6 belongs to CPU0 zone, thus 2632 * it access: CPU0/INDEX0 -> vmap_blocks -> xa_lock; 2633 * 2634 * - CPU_2 invokes vm_unmap_ram(11), 11 belongs to CPU1 zone, thus 2635 * it access: CPU1/INDEX1 -> vmap_blocks -> xa_lock; 2636 * 2637 * - CPU_0 invokes vm_unmap_ram(20), 20 belongs to CPU2 zone, thus 2638 * it access: CPU2/INDEX2 -> vmap_blocks -> xa_lock. 2639 * 2640 * This technique almost always avoids lock contention on insert/remove, 2641 * however xarray spinlocks protect against any contention that remains. 2642 */ 2643 static struct xarray * 2644 addr_to_vb_xa(unsigned long addr) 2645 { 2646 int index = (addr / VMAP_BLOCK_SIZE) % nr_cpu_ids; 2647 2648 /* 2649 * Please note, nr_cpu_ids points on a highest set 2650 * possible bit, i.e. we never invoke cpumask_next() 2651 * if an index points on it which is nr_cpu_ids - 1. 2652 */ 2653 if (!cpu_possible(index)) 2654 index = cpumask_next(index, cpu_possible_mask); 2655 2656 return &per_cpu(vmap_block_queue, index).vmap_blocks; 2657 } 2658 2659 /* 2660 * We should probably have a fallback mechanism to allocate virtual memory 2661 * out of partially filled vmap blocks. However vmap block sizing should be 2662 * fairly reasonable according to the vmalloc size, so it shouldn't be a 2663 * big problem. 2664 */ 2665 2666 static unsigned long addr_to_vb_idx(unsigned long addr) 2667 { 2668 addr -= VMALLOC_START & ~(VMAP_BLOCK_SIZE-1); 2669 addr /= VMAP_BLOCK_SIZE; 2670 return addr; 2671 } 2672 2673 static void *vmap_block_vaddr(unsigned long va_start, unsigned long pages_off) 2674 { 2675 unsigned long addr; 2676 2677 addr = va_start + (pages_off << PAGE_SHIFT); 2678 BUG_ON(addr_to_vb_idx(addr) != addr_to_vb_idx(va_start)); 2679 return (void *)addr; 2680 } 2681 2682 /** 2683 * new_vmap_block - allocates new vmap_block and occupies 2^order pages in this 2684 * block. 
Of course pages number can't exceed VMAP_BBMAP_BITS 2685 * @order: how many 2^order pages should be occupied in newly allocated block 2686 * @gfp_mask: flags for the page level allocator 2687 * 2688 * Return: virtual address in a newly allocated block or ERR_PTR(-errno) 2689 */ 2690 static void *new_vmap_block(unsigned int order, gfp_t gfp_mask) 2691 { 2692 struct vmap_block_queue *vbq; 2693 struct vmap_block *vb; 2694 struct vmap_area *va; 2695 struct xarray *xa; 2696 unsigned long vb_idx; 2697 int node, err; 2698 void *vaddr; 2699 2700 node = numa_node_id(); 2701 2702 vb = kmalloc_node(sizeof(struct vmap_block), gfp_mask, node); 2703 if (unlikely(!vb)) 2704 return ERR_PTR(-ENOMEM); 2705 2706 va = alloc_vmap_area(VMAP_BLOCK_SIZE, VMAP_BLOCK_SIZE, 2707 VMALLOC_START, VMALLOC_END, 2708 node, gfp_mask, 2709 VMAP_RAM|VMAP_BLOCK, NULL); 2710 if (IS_ERR(va)) { 2711 kfree(vb); 2712 return ERR_CAST(va); 2713 } 2714 2715 vaddr = vmap_block_vaddr(va->va_start, 0); 2716 spin_lock_init(&vb->lock); 2717 vb->va = va; 2718 /* At least something should be left free */ 2719 BUG_ON(VMAP_BBMAP_BITS <= (1UL << order)); 2720 bitmap_zero(vb->used_map, VMAP_BBMAP_BITS); 2721 vb->free = VMAP_BBMAP_BITS - (1UL << order); 2722 vb->dirty = 0; 2723 vb->dirty_min = VMAP_BBMAP_BITS; 2724 vb->dirty_max = 0; 2725 bitmap_set(vb->used_map, 0, (1UL << order)); 2726 INIT_LIST_HEAD(&vb->free_list); 2727 vb->cpu = raw_smp_processor_id(); 2728 2729 xa = addr_to_vb_xa(va->va_start); 2730 vb_idx = addr_to_vb_idx(va->va_start); 2731 err = xa_insert(xa, vb_idx, vb, gfp_mask); 2732 if (err) { 2733 kfree(vb); 2734 free_vmap_area(va); 2735 return ERR_PTR(err); 2736 } 2737 /* 2738 * list_add_tail_rcu could happened in another core 2739 * rather than vb->cpu due to task migration, which 2740 * is safe as list_add_tail_rcu will ensure the list's 2741 * integrity together with list_for_each_rcu from read 2742 * side. 
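 *
 * For instance, a task may record vb->cpu = 1 above, get migrated to
 * CPU3 before taking the lock below, and still add the block to CPU1's
 * free list; readers walking that list under RCU see a consistent
 * list either way.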
2743 */ 2744 vbq = per_cpu_ptr(&vmap_block_queue, vb->cpu); 2745 spin_lock(&vbq->lock); 2746 list_add_tail_rcu(&vb->free_list, &vbq->free); 2747 spin_unlock(&vbq->lock); 2748 2749 return vaddr; 2750 } 2751 2752 static void free_vmap_block(struct vmap_block *vb) 2753 { 2754 struct vmap_node *vn; 2755 struct vmap_block *tmp; 2756 struct xarray *xa; 2757 2758 xa = addr_to_vb_xa(vb->va->va_start); 2759 tmp = xa_erase(xa, addr_to_vb_idx(vb->va->va_start)); 2760 BUG_ON(tmp != vb); 2761 2762 vn = addr_to_node(vb->va->va_start); 2763 spin_lock(&vn->busy.lock); 2764 unlink_va(vb->va, &vn->busy.root); 2765 spin_unlock(&vn->busy.lock); 2766 2767 free_vmap_area_noflush(vb->va); 2768 kfree_rcu(vb, rcu_head); 2769 } 2770 2771 static bool purge_fragmented_block(struct vmap_block *vb, 2772 struct list_head *purge_list, bool force_purge) 2773 { 2774 struct vmap_block_queue *vbq = &per_cpu(vmap_block_queue, vb->cpu); 2775 2776 if (vb->free + vb->dirty != VMAP_BBMAP_BITS || 2777 vb->dirty == VMAP_BBMAP_BITS) 2778 return false; 2779 2780 /* Don't overeagerly purge usable blocks unless requested */ 2781 if (!(force_purge || vb->free < VMAP_PURGE_THRESHOLD)) 2782 return false; 2783 2784 /* prevent further allocs after releasing lock */ 2785 WRITE_ONCE(vb->free, 0); 2786 /* prevent purging it again */ 2787 WRITE_ONCE(vb->dirty, VMAP_BBMAP_BITS); 2788 vb->dirty_min = 0; 2789 vb->dirty_max = VMAP_BBMAP_BITS; 2790 spin_lock(&vbq->lock); 2791 list_del_rcu(&vb->free_list); 2792 spin_unlock(&vbq->lock); 2793 list_add_tail(&vb->purge, purge_list); 2794 return true; 2795 } 2796 2797 static void free_purged_blocks(struct list_head *purge_list) 2798 { 2799 struct vmap_block *vb, *n_vb; 2800 2801 list_for_each_entry_safe(vb, n_vb, purge_list, purge) { 2802 list_del(&vb->purge); 2803 free_vmap_block(vb); 2804 } 2805 } 2806 2807 static void purge_fragmented_blocks(int cpu) 2808 { 2809 LIST_HEAD(purge); 2810 struct vmap_block *vb; 2811 struct vmap_block_queue *vbq = &per_cpu(vmap_block_queue, cpu); 2812 2813 rcu_read_lock(); 2814 list_for_each_entry_rcu(vb, &vbq->free, free_list) { 2815 unsigned long free = READ_ONCE(vb->free); 2816 unsigned long dirty = READ_ONCE(vb->dirty); 2817 2818 if (free + dirty != VMAP_BBMAP_BITS || 2819 dirty == VMAP_BBMAP_BITS) 2820 continue; 2821 2822 spin_lock(&vb->lock); 2823 purge_fragmented_block(vb, &purge, true); 2824 spin_unlock(&vb->lock); 2825 } 2826 rcu_read_unlock(); 2827 free_purged_blocks(&purge); 2828 } 2829 2830 static void purge_fragmented_blocks_allcpus(void) 2831 { 2832 int cpu; 2833 2834 for_each_possible_cpu(cpu) 2835 purge_fragmented_blocks(cpu); 2836 } 2837 2838 static void *vb_alloc(unsigned long size, gfp_t gfp_mask) 2839 { 2840 struct vmap_block_queue *vbq; 2841 struct vmap_block *vb; 2842 void *vaddr = NULL; 2843 unsigned int order; 2844 2845 BUG_ON(offset_in_page(size)); 2846 BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC); 2847 if (WARN_ON(size == 0)) { 2848 /* 2849 * Allocating 0 bytes isn't what caller wants since 2850 * get_order(0) returns funny result. Just warn and terminate 2851 * early. 
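 *
 * Note also that sizes are rounded up to a power-of-two number of
 * pages: a 3-page (12 KB with 4 KB pages) request gets order = 2 from
 * get_order() below and consumes four bits of the block's used_map.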
2852 */ 2853 return ERR_PTR(-EINVAL); 2854 } 2855 order = get_order(size); 2856 2857 rcu_read_lock(); 2858 vbq = raw_cpu_ptr(&vmap_block_queue); 2859 list_for_each_entry_rcu(vb, &vbq->free, free_list) { 2860 unsigned long pages_off; 2861 2862 if (READ_ONCE(vb->free) < (1UL << order)) 2863 continue; 2864 2865 spin_lock(&vb->lock); 2866 if (vb->free < (1UL << order)) { 2867 spin_unlock(&vb->lock); 2868 continue; 2869 } 2870 2871 pages_off = VMAP_BBMAP_BITS - vb->free; 2872 vaddr = vmap_block_vaddr(vb->va->va_start, pages_off); 2873 WRITE_ONCE(vb->free, vb->free - (1UL << order)); 2874 bitmap_set(vb->used_map, pages_off, (1UL << order)); 2875 if (vb->free == 0) { 2876 spin_lock(&vbq->lock); 2877 list_del_rcu(&vb->free_list); 2878 spin_unlock(&vbq->lock); 2879 } 2880 2881 spin_unlock(&vb->lock); 2882 break; 2883 } 2884 2885 rcu_read_unlock(); 2886 2887 /* Allocate new block if nothing was found */ 2888 if (!vaddr) 2889 vaddr = new_vmap_block(order, gfp_mask); 2890 2891 return vaddr; 2892 } 2893 2894 static void vb_free(unsigned long addr, unsigned long size) 2895 { 2896 unsigned long offset; 2897 unsigned int order; 2898 struct vmap_block *vb; 2899 struct xarray *xa; 2900 2901 BUG_ON(offset_in_page(size)); 2902 BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC); 2903 2904 flush_cache_vunmap(addr, addr + size); 2905 2906 order = get_order(size); 2907 offset = (addr & (VMAP_BLOCK_SIZE - 1)) >> PAGE_SHIFT; 2908 2909 xa = addr_to_vb_xa(addr); 2910 vb = xa_load(xa, addr_to_vb_idx(addr)); 2911 2912 spin_lock(&vb->lock); 2913 bitmap_clear(vb->used_map, offset, (1UL << order)); 2914 spin_unlock(&vb->lock); 2915 2916 vunmap_range_noflush(addr, addr + size); 2917 2918 if (debug_pagealloc_enabled_static()) 2919 flush_tlb_kernel_range(addr, addr + size); 2920 2921 spin_lock(&vb->lock); 2922 2923 /* Expand the not yet TLB flushed dirty range */ 2924 vb->dirty_min = min(vb->dirty_min, offset); 2925 vb->dirty_max = max(vb->dirty_max, offset + (1UL << order)); 2926 2927 WRITE_ONCE(vb->dirty, vb->dirty + (1UL << order)); 2928 if (vb->dirty == VMAP_BBMAP_BITS) { 2929 BUG_ON(vb->free); 2930 spin_unlock(&vb->lock); 2931 free_vmap_block(vb); 2932 } else 2933 spin_unlock(&vb->lock); 2934 } 2935 2936 static void _vm_unmap_aliases(unsigned long start, unsigned long end, int flush) 2937 { 2938 LIST_HEAD(purge_list); 2939 int cpu; 2940 2941 if (unlikely(!vmap_initialized)) 2942 return; 2943 2944 mutex_lock(&vmap_purge_lock); 2945 2946 for_each_possible_cpu(cpu) { 2947 struct vmap_block_queue *vbq = &per_cpu(vmap_block_queue, cpu); 2948 struct vmap_block *vb; 2949 unsigned long idx; 2950 2951 rcu_read_lock(); 2952 xa_for_each(&vbq->vmap_blocks, idx, vb) { 2953 spin_lock(&vb->lock); 2954 2955 /* 2956 * Try to purge a fragmented block first. If it's 2957 * not purgeable, check whether there is dirty 2958 * space to be flushed. 
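 *
 * Example of the second case: if pages 5..8 of a block were
 * vb_free()'d but not yet flushed, dirty_min is 5 and dirty_max is 9,
 * so the range [va_start + 5 * PAGE_SIZE, va_start + 9 * PAGE_SIZE)
 * is folded into the start/end that get flushed below.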
2959 */ 2960 if (!purge_fragmented_block(vb, &purge_list, false) && 2961 vb->dirty_max && vb->dirty != VMAP_BBMAP_BITS) { 2962 unsigned long va_start = vb->va->va_start; 2963 unsigned long s, e; 2964 2965 s = va_start + (vb->dirty_min << PAGE_SHIFT); 2966 e = va_start + (vb->dirty_max << PAGE_SHIFT); 2967 2968 start = min(s, start); 2969 end = max(e, end); 2970 2971 /* Prevent that this is flushed again */ 2972 vb->dirty_min = VMAP_BBMAP_BITS; 2973 vb->dirty_max = 0; 2974 2975 flush = 1; 2976 } 2977 spin_unlock(&vb->lock); 2978 } 2979 rcu_read_unlock(); 2980 } 2981 free_purged_blocks(&purge_list); 2982 2983 if (!__purge_vmap_area_lazy(start, end, false) && flush) 2984 flush_tlb_kernel_range(start, end); 2985 mutex_unlock(&vmap_purge_lock); 2986 } 2987 2988 /** 2989 * vm_unmap_aliases - unmap outstanding lazy aliases in the vmap layer 2990 * 2991 * The vmap/vmalloc layer lazily flushes kernel virtual mappings primarily 2992 * to amortize TLB flushing overheads. What this means is that any page you 2993 * have now, may, in a former life, have been mapped into kernel virtual 2994 * address by the vmap layer and so there might be some CPUs with TLB entries 2995 * still referencing that page (additional to the regular 1:1 kernel mapping). 2996 * 2997 * vm_unmap_aliases flushes all such lazy mappings. After it returns, we can 2998 * be sure that none of the pages we have control over will have any aliases 2999 * from the vmap layer. 3000 */ 3001 void vm_unmap_aliases(void) 3002 { 3003 _vm_unmap_aliases(ULONG_MAX, 0, 0); 3004 } 3005 EXPORT_SYMBOL_GPL(vm_unmap_aliases); 3006 3007 /** 3008 * vm_unmap_ram - unmap linear kernel address space set up by vm_map_ram 3009 * @mem: the pointer returned by vm_map_ram 3010 * @count: the count passed to that vm_map_ram call (cannot unmap partial) 3011 */ 3012 void vm_unmap_ram(const void *mem, unsigned int count) 3013 { 3014 unsigned long size = (unsigned long)count << PAGE_SHIFT; 3015 unsigned long addr = (unsigned long)kasan_reset_tag(mem); 3016 struct vmap_area *va; 3017 3018 might_sleep(); 3019 BUG_ON(!addr); 3020 BUG_ON(addr < VMALLOC_START); 3021 BUG_ON(addr > VMALLOC_END); 3022 BUG_ON(!PAGE_ALIGNED(addr)); 3023 3024 kasan_poison_vmalloc(mem, size); 3025 3026 if (likely(count <= VMAP_MAX_ALLOC)) { 3027 debug_check_no_locks_freed(mem, size); 3028 vb_free(addr, size); 3029 return; 3030 } 3031 3032 va = find_unlink_vmap_area(addr); 3033 if (WARN_ON_ONCE(!va)) 3034 return; 3035 3036 debug_check_no_locks_freed((void *)va->va_start, va_size(va)); 3037 free_unmap_vmap_area(va); 3038 } 3039 EXPORT_SYMBOL(vm_unmap_ram); 3040 3041 /** 3042 * vm_map_ram - map pages linearly into kernel virtual address (vmalloc space) 3043 * @pages: an array of pointers to the pages to be mapped 3044 * @count: number of pages 3045 * @node: prefer to allocate data structures on this node 3046 * 3047 * If you use this function for less than VMAP_MAX_ALLOC pages, it could be 3048 * faster than vmap so it's good. But if you mix long-life and short-life 3049 * objects with vm_map_ram(), it could consume lots of address space through 3050 * fragmentation (especially on a 32bit machine). You could see failures in 3051 * the end. Please use this function for short-lived objects. 
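 *
 * A minimal usage sketch (hypothetical caller; page allocation and
 * error handling are elided):
 *
 *	struct page *pages[4];
 *	void *va;
 *
 *	... fill pages[] with four allocated pages ...
 *	va = vm_map_ram(pages, 4, NUMA_NO_NODE);
 *	if (!va)
 *		return -ENOMEM;
 *	... short-lived use of the mapping ...
 *	vm_unmap_ram(va, 4);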
3052 * 3053 * Returns: a pointer to the address that has been mapped, or %NULL on failure 3054 */ 3055 void *vm_map_ram(struct page **pages, unsigned int count, int node) 3056 { 3057 unsigned long size = (unsigned long)count << PAGE_SHIFT; 3058 unsigned long addr; 3059 void *mem; 3060 3061 if (likely(count <= VMAP_MAX_ALLOC)) { 3062 mem = vb_alloc(size, GFP_KERNEL); 3063 if (IS_ERR(mem)) 3064 return NULL; 3065 addr = (unsigned long)mem; 3066 } else { 3067 struct vmap_area *va; 3068 va = alloc_vmap_area(size, PAGE_SIZE, 3069 VMALLOC_START, VMALLOC_END, 3070 node, GFP_KERNEL, VMAP_RAM, 3071 NULL); 3072 if (IS_ERR(va)) 3073 return NULL; 3074 3075 addr = va->va_start; 3076 mem = (void *)addr; 3077 } 3078 3079 if (vmap_pages_range(addr, addr + size, PAGE_KERNEL, 3080 pages, PAGE_SHIFT) < 0) { 3081 vm_unmap_ram(mem, count); 3082 return NULL; 3083 } 3084 3085 /* 3086 * Mark the pages as accessible, now that they are mapped. 3087 * With hardware tag-based KASAN, marking is skipped for 3088 * non-VM_ALLOC mappings, see __kasan_unpoison_vmalloc(). 3089 */ 3090 mem = kasan_unpoison_vmalloc(mem, size, KASAN_VMALLOC_PROT_NORMAL); 3091 3092 return mem; 3093 } 3094 EXPORT_SYMBOL(vm_map_ram); 3095 3096 static struct vm_struct *vmlist __initdata; 3097 3098 static inline unsigned int vm_area_page_order(struct vm_struct *vm) 3099 { 3100 #ifdef CONFIG_HAVE_ARCH_HUGE_VMALLOC 3101 return vm->page_order; 3102 #else 3103 return 0; 3104 #endif 3105 } 3106 3107 unsigned int get_vm_area_page_order(struct vm_struct *vm) 3108 { 3109 return vm_area_page_order(vm); 3110 } 3111 3112 static inline void set_vm_area_page_order(struct vm_struct *vm, unsigned int order) 3113 { 3114 #ifdef CONFIG_HAVE_ARCH_HUGE_VMALLOC 3115 vm->page_order = order; 3116 #else 3117 BUG_ON(order != 0); 3118 #endif 3119 } 3120 3121 /** 3122 * vm_area_add_early - add vmap area early during boot 3123 * @vm: vm_struct to add 3124 * 3125 * This function is used to add fixed kernel vm area to vmlist before 3126 * vmalloc_init() is called. @vm->addr, @vm->size, and @vm->flags 3127 * should contain proper values and the other fields should be zero. 3128 * 3129 * DO NOT USE THIS FUNCTION UNLESS YOU KNOW WHAT YOU'RE DOING. 3130 */ 3131 void __init vm_area_add_early(struct vm_struct *vm) 3132 { 3133 struct vm_struct *tmp, **p; 3134 3135 BUG_ON(vmap_initialized); 3136 for (p = &vmlist; (tmp = *p) != NULL; p = &tmp->next) { 3137 if (tmp->addr >= vm->addr) { 3138 BUG_ON(tmp->addr < vm->addr + vm->size); 3139 break; 3140 } else 3141 BUG_ON(tmp->addr + tmp->size > vm->addr); 3142 } 3143 vm->next = *p; 3144 *p = vm; 3145 } 3146 3147 /** 3148 * vm_area_register_early - register vmap area early during boot 3149 * @vm: vm_struct to register 3150 * @align: requested alignment 3151 * 3152 * This function is used to register kernel vm area before 3153 * vmalloc_init() is called. @vm->size and @vm->flags should contain 3154 * proper values on entry and other fields should be zero. On return, 3155 * vm->addr contains the allocated address. 3156 * 3157 * DO NOT USE THIS FUNCTION UNLESS YOU KNOW WHAT YOU'RE DOING. 
3158 */ 3159 void __init vm_area_register_early(struct vm_struct *vm, size_t align) 3160 { 3161 unsigned long addr = ALIGN(VMALLOC_START, align); 3162 struct vm_struct *cur, **p; 3163 3164 BUG_ON(vmap_initialized); 3165 3166 for (p = &vmlist; (cur = *p) != NULL; p = &cur->next) { 3167 if ((unsigned long)cur->addr - addr >= vm->size) 3168 break; 3169 addr = ALIGN((unsigned long)cur->addr + cur->size, align); 3170 } 3171 3172 BUG_ON(addr > VMALLOC_END - vm->size); 3173 vm->addr = (void *)addr; 3174 vm->next = *p; 3175 *p = vm; 3176 kasan_populate_early_vm_area_shadow(vm->addr, vm->size); 3177 } 3178 3179 static void clear_vm_uninitialized_flag(struct vm_struct *vm) 3180 { 3181 /* 3182 * Before removing VM_UNINITIALIZED, 3183 * we should make sure that vm has proper values. 3184 * Pair with smp_rmb() in vread_iter() and vmalloc_info_show(). 3185 */ 3186 smp_wmb(); 3187 vm->flags &= ~VM_UNINITIALIZED; 3188 } 3189 3190 struct vm_struct *__get_vm_area_node(unsigned long size, 3191 unsigned long align, unsigned long shift, unsigned long flags, 3192 unsigned long start, unsigned long end, int node, 3193 gfp_t gfp_mask, const void *caller) 3194 { 3195 struct vmap_area *va; 3196 struct vm_struct *area; 3197 unsigned long requested_size = size; 3198 3199 BUG_ON(in_interrupt()); 3200 size = ALIGN(size, 1ul << shift); 3201 if (unlikely(!size)) 3202 return NULL; 3203 3204 if (flags & VM_IOREMAP) 3205 align = 1ul << clamp_t(int, get_count_order_long(size), 3206 PAGE_SHIFT, IOREMAP_MAX_ORDER); 3207 3208 area = kzalloc_node(sizeof(*area), gfp_mask & GFP_RECLAIM_MASK, node); 3209 if (unlikely(!area)) 3210 return NULL; 3211 3212 if (!(flags & VM_NO_GUARD)) 3213 size += PAGE_SIZE; 3214 3215 area->flags = flags; 3216 area->caller = caller; 3217 area->requested_size = requested_size; 3218 3219 va = alloc_vmap_area(size, align, start, end, node, gfp_mask, 0, area); 3220 if (IS_ERR(va)) { 3221 kfree(area); 3222 return NULL; 3223 } 3224 3225 /* 3226 * Mark pages for non-VM_ALLOC mappings as accessible. Do it now as a 3227 * best-effort approach, as they can be mapped outside of vmalloc code. 3228 * For VM_ALLOC mappings, the pages are marked as accessible after 3229 * getting mapped in __vmalloc_node_range(). 3230 * With hardware tag-based KASAN, marking is skipped for 3231 * non-VM_ALLOC mappings, see __kasan_unpoison_vmalloc(). 3232 */ 3233 if (!(flags & VM_ALLOC)) 3234 area->addr = kasan_unpoison_vmalloc(area->addr, requested_size, 3235 KASAN_VMALLOC_PROT_NORMAL); 3236 3237 return area; 3238 } 3239 3240 struct vm_struct *__get_vm_area_caller(unsigned long size, unsigned long flags, 3241 unsigned long start, unsigned long end, 3242 const void *caller) 3243 { 3244 return __get_vm_area_node(size, 1, PAGE_SHIFT, flags, start, end, 3245 NUMA_NO_NODE, GFP_KERNEL, caller); 3246 } 3247 3248 /** 3249 * get_vm_area - reserve a contiguous kernel virtual area 3250 * @size: size of the area 3251 * @flags: %VM_IOREMAP for I/O mappings or VM_ALLOC 3252 * 3253 * Search an area of @size in the kernel virtual mapping area, 3254 * and reserved it for out purposes. Returns the area descriptor 3255 * on success or %NULL on failure. 3256 * 3257 * Return: the area descriptor on success or %NULL on failure. 
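 *
 * A minimal usage sketch (hypothetical driver; error handling elided):
 *
 *	struct vm_struct *area = get_vm_area(SZ_1M, VM_IOREMAP);
 *
 *	if (!area)
 *		return -ENOMEM;
 *
 * area->addr then points at 1 MB (plus a guard page) of reserved
 * kernel VA which the caller can map, e.g. with ioremap_page_range(),
 * and later release with free_vm_area().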
3258 */ 3259 struct vm_struct *get_vm_area(unsigned long size, unsigned long flags) 3260 { 3261 return __get_vm_area_node(size, 1, PAGE_SHIFT, flags, 3262 VMALLOC_START, VMALLOC_END, 3263 NUMA_NO_NODE, GFP_KERNEL, 3264 __builtin_return_address(0)); 3265 } 3266 3267 struct vm_struct *get_vm_area_caller(unsigned long size, unsigned long flags, 3268 const void *caller) 3269 { 3270 return __get_vm_area_node(size, 1, PAGE_SHIFT, flags, 3271 VMALLOC_START, VMALLOC_END, 3272 NUMA_NO_NODE, GFP_KERNEL, caller); 3273 } 3274 3275 /** 3276 * find_vm_area - find a continuous kernel virtual area 3277 * @addr: base address 3278 * 3279 * Search for the kernel VM area starting at @addr, and return it. 3280 * It is up to the caller to do all required locking to keep the returned 3281 * pointer valid. 3282 * 3283 * Return: the area descriptor on success or %NULL on failure. 3284 */ 3285 struct vm_struct *find_vm_area(const void *addr) 3286 { 3287 struct vmap_area *va; 3288 3289 va = find_vmap_area((unsigned long)addr); 3290 if (!va) 3291 return NULL; 3292 3293 return va->vm; 3294 } 3295 3296 /** 3297 * remove_vm_area - find and remove a continuous kernel virtual area 3298 * @addr: base address 3299 * 3300 * Search for the kernel VM area starting at @addr, and remove it. 3301 * This function returns the found VM area, but using it is NOT safe 3302 * on SMP machines, except for its size or flags. 3303 * 3304 * Return: the area descriptor on success or %NULL on failure. 3305 */ 3306 struct vm_struct *remove_vm_area(const void *addr) 3307 { 3308 struct vmap_area *va; 3309 struct vm_struct *vm; 3310 3311 might_sleep(); 3312 3313 if (WARN(!PAGE_ALIGNED(addr), "Trying to vfree() bad address (%p)\n", 3314 addr)) 3315 return NULL; 3316 3317 va = find_unlink_vmap_area((unsigned long)addr); 3318 if (!va || !va->vm) 3319 return NULL; 3320 vm = va->vm; 3321 3322 debug_check_no_locks_freed(vm->addr, get_vm_area_size(vm)); 3323 debug_check_no_obj_freed(vm->addr, get_vm_area_size(vm)); 3324 kasan_free_module_shadow(vm); 3325 kasan_poison_vmalloc(vm->addr, get_vm_area_size(vm)); 3326 3327 free_unmap_vmap_area(va); 3328 return vm; 3329 } 3330 3331 static inline void set_area_direct_map(const struct vm_struct *area, 3332 int (*set_direct_map)(struct page *page)) 3333 { 3334 int i; 3335 3336 /* HUGE_VMALLOC passes small pages to set_direct_map */ 3337 for (i = 0; i < area->nr_pages; i++) 3338 if (page_address(area->pages[i])) 3339 set_direct_map(area->pages[i]); 3340 } 3341 3342 /* 3343 * Flush the vm mapping and reset the direct map. 3344 */ 3345 static void vm_reset_perms(struct vm_struct *area) 3346 { 3347 unsigned long start = ULONG_MAX, end = 0; 3348 unsigned int page_order = vm_area_page_order(area); 3349 int flush_dmap = 0; 3350 int i; 3351 3352 /* 3353 * Find the start and end range of the direct mappings to make sure that 3354 * the vm_unmap_aliases() flush includes the direct map. 3355 */ 3356 for (i = 0; i < area->nr_pages; i += 1U << page_order) { 3357 unsigned long addr = (unsigned long)page_address(area->pages[i]); 3358 3359 if (addr) { 3360 unsigned long page_size; 3361 3362 page_size = PAGE_SIZE << page_order; 3363 start = min(addr, start); 3364 end = max(addr + page_size, end); 3365 flush_dmap = 1; 3366 } 3367 } 3368 3369 /* 3370 * Set direct map to something invalid so that it won't be cached if 3371 * there are any accesses after the TLB flush, then flush the TLB and 3372 * reset the direct map permissions to the default. 
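 *
 * For example, an allocation made with VM_FLUSH_RESET_PERMS whose
 * pages were later made read-only or executable via set_memory_*()
 * can simply be vfree()'d: the calls below invalidate the direct-map
 * aliases, flush, and restore default permissions before the pages go
 * back to the page allocator.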
3373 */ 3374 set_area_direct_map(area, set_direct_map_invalid_noflush); 3375 _vm_unmap_aliases(start, end, flush_dmap); 3376 set_area_direct_map(area, set_direct_map_default_noflush); 3377 } 3378 3379 static void delayed_vfree_work(struct work_struct *w) 3380 { 3381 struct vfree_deferred *p = container_of(w, struct vfree_deferred, wq); 3382 struct llist_node *t, *llnode; 3383 3384 llist_for_each_safe(llnode, t, llist_del_all(&p->list)) 3385 vfree(llnode); 3386 } 3387 3388 /** 3389 * vfree_atomic - release memory allocated by vmalloc() 3390 * @addr: memory base address 3391 * 3392 * This one is just like vfree() but can be called in any atomic context 3393 * except NMIs. 3394 */ 3395 void vfree_atomic(const void *addr) 3396 { 3397 struct vfree_deferred *p = raw_cpu_ptr(&vfree_deferred); 3398 3399 BUG_ON(in_nmi()); 3400 kmemleak_free(addr); 3401 3402 /* 3403 * Use raw_cpu_ptr() because this can be called from preemptible 3404 * context. Preemption is absolutely fine here, because the llist_add() 3405 * implementation is lockless, so it works even if we are adding to 3406 * another cpu's list. schedule_work() should be fine with this too. 3407 */ 3408 if (addr && llist_add((struct llist_node *)addr, &p->list)) 3409 schedule_work(&p->wq); 3410 } 3411 3412 /** 3413 * vfree - Release memory allocated by vmalloc() 3414 * @addr: Memory base address 3415 * 3416 * Free the virtually continuous memory area starting at @addr, as obtained 3417 * from one of the vmalloc() family of APIs. This will usually also free the 3418 * physical memory underlying the virtual allocation, but that memory is 3419 * reference counted, so it will not be freed until the last user goes away. 3420 * 3421 * If @addr is NULL, no operation is performed. 3422 * 3423 * Context: 3424 * May sleep if called *not* from interrupt context. 3425 * Must not be called in NMI context (strictly speaking, it could be 3426 * if we have CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG, but making the calling 3427 * conventions for vfree() arch-dependent would be a really bad idea). 3428 */ 3429 void vfree(const void *addr) 3430 { 3431 struct vm_struct *vm; 3432 int i; 3433 3434 if (unlikely(in_interrupt())) { 3435 vfree_atomic(addr); 3436 return; 3437 } 3438 3439 BUG_ON(in_nmi()); 3440 kmemleak_free(addr); 3441 might_sleep(); 3442 3443 if (!addr) 3444 return; 3445 3446 vm = remove_vm_area(addr); 3447 if (unlikely(!vm)) { 3448 WARN(1, KERN_ERR "Trying to vfree() nonexistent vm area (%p)\n", 3449 addr); 3450 return; 3451 } 3452 3453 if (unlikely(vm->flags & VM_FLUSH_RESET_PERMS)) 3454 vm_reset_perms(vm); 3455 /* All pages of vm should be charged to same memcg, so use first one. */ 3456 if (vm->nr_pages && !(vm->flags & VM_MAP_PUT_PAGES)) 3457 mod_memcg_page_state(vm->pages[0], MEMCG_VMALLOC, -vm->nr_pages); 3458 for (i = 0; i < vm->nr_pages; i++) { 3459 struct page *page = vm->pages[i]; 3460 3461 BUG_ON(!page); 3462 /* 3463 * High-order allocs for huge vmallocs are split, so 3464 * can be freed as an array of order-0 allocations 3465 */ 3466 __free_page(page); 3467 cond_resched(); 3468 } 3469 if (!(vm->flags & VM_MAP_PUT_PAGES)) 3470 atomic_long_sub(vm->nr_pages, &nr_vmalloc_pages); 3471 kvfree(vm->pages); 3472 kfree(vm); 3473 } 3474 EXPORT_SYMBOL(vfree); 3475 3476 /** 3477 * vunmap - release virtual mapping obtained by vmap() 3478 * @addr: memory base address 3479 * 3480 * Free the virtually contiguous memory area starting at @addr, 3481 * which was created from the page array passed to vmap(). 3482 * 3483 * Must not be called in interrupt context. 
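 *
 * Typical pairing with vmap() (illustrative sketch, error handling
 * elided):
 *
 *	void *va = vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL);
 *
 *	if (!va)
 *		return -ENOMEM;
 *	...
 *	vunmap(va);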
3484 */ 3485 void vunmap(const void *addr) 3486 { 3487 struct vm_struct *vm; 3488 3489 BUG_ON(in_interrupt()); 3490 might_sleep(); 3491 3492 if (!addr) 3493 return; 3494 vm = remove_vm_area(addr); 3495 if (unlikely(!vm)) { 3496 WARN(1, KERN_ERR "Trying to vunmap() nonexistent vm area (%p)\n", 3497 addr); 3498 return; 3499 } 3500 kfree(vm); 3501 } 3502 EXPORT_SYMBOL(vunmap); 3503 3504 /** 3505 * vmap - map an array of pages into virtually contiguous space 3506 * @pages: array of page pointers 3507 * @count: number of pages to map 3508 * @flags: vm_area->flags 3509 * @prot: page protection for the mapping 3510 * 3511 * Maps @count pages from @pages into contiguous kernel virtual space. 3512 * If @flags contains %VM_MAP_PUT_PAGES the ownership of the pages array itself 3513 * (which must be kmalloc or vmalloc memory) and one reference per pages in it 3514 * are transferred from the caller to vmap(), and will be freed / dropped when 3515 * vfree() is called on the return value. 3516 * 3517 * Return: the address of the area or %NULL on failure 3518 */ 3519 void *vmap(struct page **pages, unsigned int count, 3520 unsigned long flags, pgprot_t prot) 3521 { 3522 struct vm_struct *area; 3523 unsigned long addr; 3524 unsigned long size; /* In bytes */ 3525 3526 might_sleep(); 3527 3528 if (WARN_ON_ONCE(flags & VM_FLUSH_RESET_PERMS)) 3529 return NULL; 3530 3531 /* 3532 * Your top guard is someone else's bottom guard. Not having a top 3533 * guard compromises someone else's mappings too. 3534 */ 3535 if (WARN_ON_ONCE(flags & VM_NO_GUARD)) 3536 flags &= ~VM_NO_GUARD; 3537 3538 if (count > totalram_pages()) 3539 return NULL; 3540 3541 size = (unsigned long)count << PAGE_SHIFT; 3542 area = get_vm_area_caller(size, flags, __builtin_return_address(0)); 3543 if (!area) 3544 return NULL; 3545 3546 addr = (unsigned long)area->addr; 3547 if (vmap_pages_range(addr, addr + size, pgprot_nx(prot), 3548 pages, PAGE_SHIFT) < 0) { 3549 vunmap(area->addr); 3550 return NULL; 3551 } 3552 3553 if (flags & VM_MAP_PUT_PAGES) { 3554 area->pages = pages; 3555 area->nr_pages = count; 3556 } 3557 return area->addr; 3558 } 3559 EXPORT_SYMBOL(vmap); 3560 3561 #ifdef CONFIG_VMAP_PFN 3562 struct vmap_pfn_data { 3563 unsigned long *pfns; 3564 pgprot_t prot; 3565 unsigned int idx; 3566 }; 3567 3568 static int vmap_pfn_apply(pte_t *pte, unsigned long addr, void *private) 3569 { 3570 struct vmap_pfn_data *data = private; 3571 unsigned long pfn = data->pfns[data->idx]; 3572 pte_t ptent; 3573 3574 if (WARN_ON_ONCE(pfn_valid(pfn))) 3575 return -EINVAL; 3576 3577 ptent = pte_mkspecial(pfn_pte(pfn, data->prot)); 3578 set_pte_at(&init_mm, addr, pte, ptent); 3579 3580 data->idx++; 3581 return 0; 3582 } 3583 3584 /** 3585 * vmap_pfn - map an array of PFNs into virtually contiguous space 3586 * @pfns: array of PFNs 3587 * @count: number of pages to map 3588 * @prot: page protection for the mapping 3589 * 3590 * Maps @count PFNs from @pfns into contiguous kernel virtual space and returns 3591 * the start address of the mapping. 
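 *
 * Illustrative sketch for PFNs that have no struct page (e.g. device
 * memory; names and sizes are made up, error handling elided):
 *
 *	unsigned long pfns[16];
 *	void *va;
 *
 *	... fill pfns[] from the device aperture ...
 *	va = vmap_pfn(pfns, 16, pgprot_writecombine(PAGE_KERNEL));
 *	if (!va)
 *		return -ENOMEM;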
3592 */ 3593 void *vmap_pfn(unsigned long *pfns, unsigned int count, pgprot_t prot) 3594 { 3595 struct vmap_pfn_data data = { .pfns = pfns, .prot = pgprot_nx(prot) }; 3596 struct vm_struct *area; 3597 3598 area = get_vm_area_caller(count * PAGE_SIZE, VM_IOREMAP, 3599 __builtin_return_address(0)); 3600 if (!area) 3601 return NULL; 3602 if (apply_to_page_range(&init_mm, (unsigned long)area->addr, 3603 count * PAGE_SIZE, vmap_pfn_apply, &data)) { 3604 free_vm_area(area); 3605 return NULL; 3606 } 3607 3608 flush_cache_vmap((unsigned long)area->addr, 3609 (unsigned long)area->addr + count * PAGE_SIZE); 3610 3611 return area->addr; 3612 } 3613 EXPORT_SYMBOL_GPL(vmap_pfn); 3614 #endif /* CONFIG_VMAP_PFN */ 3615 3616 /* 3617 * Helper for vmalloc to adjust the gfp flags for certain allocations. 3618 */ 3619 static inline gfp_t vmalloc_gfp_adjust(gfp_t flags, const bool large) 3620 { 3621 flags |= __GFP_NOWARN; 3622 if (large) 3623 flags &= ~__GFP_NOFAIL; 3624 return flags; 3625 } 3626 3627 static inline unsigned int 3628 vm_area_alloc_pages(gfp_t gfp, int nid, 3629 unsigned int order, unsigned int nr_pages, struct page **pages) 3630 { 3631 unsigned int nr_allocated = 0; 3632 unsigned int nr_remaining = nr_pages; 3633 unsigned int max_attempt_order = MAX_PAGE_ORDER; 3634 struct page *page; 3635 int i; 3636 unsigned int large_order = ilog2(nr_remaining); 3637 gfp_t large_gfp = vmalloc_gfp_adjust(gfp, large_order) & ~__GFP_DIRECT_RECLAIM; 3638 3639 large_order = min(max_attempt_order, large_order); 3640 3641 /* 3642 * Initially, attempt to have the page allocator give us large order 3643 * pages. Do not attempt allocating smaller than order chunks since 3644 * __vmap_pages_range() expects physically contigous pages of exactly 3645 * order long chunks. 3646 */ 3647 while (large_order > order && nr_remaining) { 3648 if (nid == NUMA_NO_NODE) 3649 page = alloc_pages_noprof(large_gfp, large_order); 3650 else 3651 page = alloc_pages_node_noprof(nid, large_gfp, large_order); 3652 3653 if (unlikely(!page)) { 3654 max_attempt_order = --large_order; 3655 continue; 3656 } 3657 3658 split_page(page, large_order); 3659 for (i = 0; i < (1U << large_order); i++) 3660 pages[nr_allocated + i] = page + i; 3661 3662 nr_allocated += 1U << large_order; 3663 nr_remaining = nr_pages - nr_allocated; 3664 3665 large_order = ilog2(nr_remaining); 3666 large_order = min(max_attempt_order, large_order); 3667 } 3668 3669 /* 3670 * For order-0 pages we make use of bulk allocator, if 3671 * the page array is partly or not at all populated due 3672 * to fails, fallback to a single page allocator that is 3673 * more permissive. 3674 */ 3675 if (!order) { 3676 while (nr_allocated < nr_pages) { 3677 unsigned int nr, nr_pages_request; 3678 3679 /* 3680 * A maximum allowed request is hard-coded and is 100 3681 * pages per call. That is done in order to prevent a 3682 * long preemption off scenario in the bulk-allocator 3683 * so the range is [1:100]. 3684 */ 3685 nr_pages_request = min(100U, nr_pages - nr_allocated); 3686 3687 /* memory allocation should consider mempolicy, we can't 3688 * wrongly use nearest node when nid == NUMA_NO_NODE, 3689 * otherwise memory may be allocated in only one node, 3690 * but mempolicy wants to alloc memory by interleaving. 
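 *
 * For example, under an MPOL_INTERLEAVE policy spanning nodes 0 and 1,
 * the mempolicy-aware bulk call below spreads the request across both
 * nodes, whereas falling back to the nearest-node bulk call would
 * ignore the policy and fill it entirely from one node.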
3691 */ 3692 if (IS_ENABLED(CONFIG_NUMA) && nid == NUMA_NO_NODE) 3693 nr = alloc_pages_bulk_mempolicy_noprof(gfp, 3694 nr_pages_request, 3695 pages + nr_allocated); 3696 else 3697 nr = alloc_pages_bulk_node_noprof(gfp, nid, 3698 nr_pages_request, 3699 pages + nr_allocated); 3700 3701 nr_allocated += nr; 3702 3703 /* 3704 * If zero or pages were obtained partly, 3705 * fallback to a single page allocator. 3706 */ 3707 if (nr != nr_pages_request) 3708 break; 3709 } 3710 } 3711 3712 /* High-order pages or fallback path if "bulk" fails. */ 3713 while (nr_allocated < nr_pages) { 3714 if (!(gfp & __GFP_NOFAIL) && fatal_signal_pending(current)) 3715 break; 3716 3717 if (nid == NUMA_NO_NODE) 3718 page = alloc_pages_noprof(gfp, order); 3719 else 3720 page = alloc_pages_node_noprof(nid, gfp, order); 3721 3722 if (unlikely(!page)) 3723 break; 3724 3725 /* 3726 * High-order allocations must be able to be treated as 3727 * independent small pages by callers (as they can with 3728 * small-page vmallocs). Some drivers do their own refcounting 3729 * on vmalloc_to_page() pages, some use page->mapping, 3730 * page->lru, etc. 3731 */ 3732 if (order) 3733 split_page(page, order); 3734 3735 /* 3736 * Careful, we allocate and map page-order pages, but 3737 * tracking is done per PAGE_SIZE page so as to keep the 3738 * vm_struct APIs independent of the physical/mapped size. 3739 */ 3740 for (i = 0; i < (1U << order); i++) 3741 pages[nr_allocated + i] = page + i; 3742 3743 nr_allocated += 1U << order; 3744 } 3745 3746 return nr_allocated; 3747 } 3748 3749 static LLIST_HEAD(pending_vm_area_cleanup); 3750 static void cleanup_vm_area_work(struct work_struct *work) 3751 { 3752 struct vm_struct *area, *tmp; 3753 struct llist_node *head; 3754 3755 head = llist_del_all(&pending_vm_area_cleanup); 3756 if (!head) 3757 return; 3758 3759 llist_for_each_entry_safe(area, tmp, head, llnode) { 3760 if (!area->pages) 3761 free_vm_area(area); 3762 else 3763 vfree(area->addr); 3764 } 3765 } 3766 3767 /* 3768 * Helper for __vmalloc_area_node() to defer cleanup 3769 * of partially initialized vm_struct in error paths. 3770 */ 3771 static DECLARE_WORK(cleanup_vm_area, cleanup_vm_area_work); 3772 static void defer_vm_area_cleanup(struct vm_struct *area) 3773 { 3774 if (llist_add(&area->llnode, &pending_vm_area_cleanup)) 3775 schedule_work(&cleanup_vm_area); 3776 } 3777 3778 /* 3779 * Page tables allocations ignore external GFP. Enforces it by 3780 * the memalloc scope API. It is used by vmalloc internals and 3781 * KASAN shadow population only. 3782 * 3783 * GFP to scope mapping: 3784 * 3785 * non-blocking (no __GFP_DIRECT_RECLAIM) - memalloc_noreclaim_save() 3786 * GFP_NOFS - memalloc_nofs_save() 3787 * GFP_NOIO - memalloc_noio_save() 3788 * 3789 * Returns a flag cookie to pair with restore. 3790 */ 3791 unsigned int 3792 memalloc_apply_gfp_scope(gfp_t gfp_mask) 3793 { 3794 unsigned int flags = 0; 3795 3796 if (!gfpflags_allow_blocking(gfp_mask)) 3797 flags = memalloc_noreclaim_save(); 3798 else if ((gfp_mask & (__GFP_FS | __GFP_IO)) == __GFP_IO) 3799 flags = memalloc_nofs_save(); 3800 else if ((gfp_mask & (__GFP_FS | __GFP_IO)) == 0) 3801 flags = memalloc_noio_save(); 3802 3803 /* 0 - no scope applied. 
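 *
 * E.g. GFP_NOFS enters the nofs scope, GFP_NOIO the noio scope, and a
 * non-blocking mask such as GFP_NOWAIT the noreclaim scope, while
 * plain GFP_KERNEL matches none of the branches and returns 0.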
*/ 3804 return flags; 3805 } 3806 3807 void 3808 memalloc_restore_scope(unsigned int flags) 3809 { 3810 if (flags) 3811 memalloc_flags_restore(flags); 3812 } 3813 3814 static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, 3815 pgprot_t prot, unsigned int page_shift, 3816 int node) 3817 { 3818 const gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO; 3819 bool nofail = gfp_mask & __GFP_NOFAIL; 3820 unsigned long addr = (unsigned long)area->addr; 3821 unsigned long size = get_vm_area_size(area); 3822 unsigned long array_size; 3823 unsigned int nr_small_pages = size >> PAGE_SHIFT; 3824 unsigned int page_order; 3825 unsigned int flags; 3826 int ret; 3827 3828 array_size = (unsigned long)nr_small_pages * sizeof(struct page *); 3829 3830 /* __GFP_NOFAIL and "noblock" flags are mutually exclusive. */ 3831 if (!gfpflags_allow_blocking(gfp_mask)) 3832 nofail = false; 3833 3834 if (!(gfp_mask & (GFP_DMA | GFP_DMA32))) 3835 gfp_mask |= __GFP_HIGHMEM; 3836 3837 /* Please note that the recursion is strictly bounded. */ 3838 if (array_size > PAGE_SIZE) { 3839 area->pages = __vmalloc_node_noprof(array_size, 1, nested_gfp, node, 3840 area->caller); 3841 } else { 3842 area->pages = kmalloc_node_noprof(array_size, nested_gfp, node); 3843 } 3844 3845 if (!area->pages) { 3846 warn_alloc(gfp_mask, NULL, 3847 "vmalloc error: size %lu, failed to allocated page array size %lu", 3848 nr_small_pages * PAGE_SIZE, array_size); 3849 goto fail; 3850 } 3851 3852 set_vm_area_page_order(area, page_shift - PAGE_SHIFT); 3853 page_order = vm_area_page_order(area); 3854 3855 /* 3856 * High-order nofail allocations are really expensive and 3857 * potentially dangerous (pre-mature OOM, disruptive reclaim 3858 * and compaction etc. 3859 * 3860 * Please note, the __vmalloc_node_range_noprof() falls-back 3861 * to order-0 pages if high-order attempt is unsuccessful. 3862 */ 3863 area->nr_pages = vm_area_alloc_pages( 3864 vmalloc_gfp_adjust(gfp_mask, page_order), node, 3865 page_order, nr_small_pages, area->pages); 3866 3867 atomic_long_add(area->nr_pages, &nr_vmalloc_pages); 3868 /* All pages of vm should be charged to same memcg, so use first one. */ 3869 if (gfp_mask & __GFP_ACCOUNT && area->nr_pages) 3870 mod_memcg_page_state(area->pages[0], MEMCG_VMALLOC, 3871 area->nr_pages); 3872 3873 /* 3874 * If not enough pages were obtained to accomplish an 3875 * allocation request, free them via vfree() if any. 3876 */ 3877 if (area->nr_pages != nr_small_pages) { 3878 /* 3879 * vm_area_alloc_pages() can fail due to insufficient memory but 3880 * also:- 3881 * 3882 * - a pending fatal signal 3883 * - insufficient huge page-order pages 3884 * 3885 * Since we always retry allocations at order-0 in the huge page 3886 * case a warning for either is spurious. 
3887 */ 3888 if (!fatal_signal_pending(current) && page_order == 0) 3889 warn_alloc(gfp_mask, NULL, 3890 "vmalloc error: size %lu, failed to allocate pages", 3891 area->nr_pages * PAGE_SIZE); 3892 goto fail; 3893 } 3894 3895 /* 3896 * page tables allocations ignore external gfp mask, enforce it 3897 * by the scope API 3898 */ 3899 flags = memalloc_apply_gfp_scope(gfp_mask); 3900 do { 3901 ret = __vmap_pages_range(addr, addr + size, prot, area->pages, 3902 page_shift, nested_gfp); 3903 if (nofail && (ret < 0)) 3904 schedule_timeout_uninterruptible(1); 3905 } while (nofail && (ret < 0)); 3906 memalloc_restore_scope(flags); 3907 3908 if (ret < 0) { 3909 warn_alloc(gfp_mask, NULL, 3910 "vmalloc error: size %lu, failed to map pages", 3911 area->nr_pages * PAGE_SIZE); 3912 goto fail; 3913 } 3914 3915 return area->addr; 3916 3917 fail: 3918 defer_vm_area_cleanup(area); 3919 return NULL; 3920 } 3921 3922 /* 3923 * See __vmalloc_node_range() for a clear list of supported vmalloc flags. 3924 * This gfp lists all flags currently passed through vmalloc. Currently, 3925 * __GFP_ZERO is used by BPF and __GFP_NORETRY is used by percpu. Both drm 3926 * and BPF also use GFP_USER. Additionally, various users pass 3927 * GFP_KERNEL_ACCOUNT. Xfs uses __GFP_NOLOCKDEP. 3928 */ 3929 #define GFP_VMALLOC_SUPPORTED (GFP_KERNEL | GFP_ATOMIC | GFP_NOWAIT |\ 3930 __GFP_NOFAIL | __GFP_ZERO | __GFP_NORETRY |\ 3931 GFP_NOFS | GFP_NOIO | GFP_KERNEL_ACCOUNT |\ 3932 GFP_USER | __GFP_NOLOCKDEP) 3933 3934 static gfp_t vmalloc_fix_flags(gfp_t flags) 3935 { 3936 gfp_t invalid_mask = flags & ~GFP_VMALLOC_SUPPORTED; 3937 3938 flags &= GFP_VMALLOC_SUPPORTED; 3939 WARN_ONCE(1, "Unexpected gfp: %#x (%pGg). Fixing up to gfp: %#x (%pGg). Fix your code!\n", 3940 invalid_mask, &invalid_mask, flags, &flags); 3941 return flags; 3942 } 3943 3944 /** 3945 * __vmalloc_node_range - allocate virtually contiguous memory 3946 * @size: allocation size 3947 * @align: desired alignment 3948 * @start: vm area range start 3949 * @end: vm area range end 3950 * @gfp_mask: flags for the page level allocator 3951 * @prot: protection mask for the allocated pages 3952 * @vm_flags: additional vm area flags (e.g. %VM_NO_GUARD) 3953 * @node: node to use for allocation or NUMA_NO_NODE 3954 * @caller: caller's return address 3955 * 3956 * Allocate enough pages to cover @size from the page level 3957 * allocator with @gfp_mask flags and map them into contiguous 3958 * virtual range with protection @prot. 3959 * 3960 * Supported GFP classes: %GFP_KERNEL, %GFP_ATOMIC, %GFP_NOWAIT, 3961 * %GFP_NOFS and %GFP_NOIO. Zone modifiers are not supported. 3962 * Please note %GFP_ATOMIC and %GFP_NOWAIT are supported only 3963 * by __vmalloc(). 3964 * 3965 * Retry modifiers: only %__GFP_NOFAIL is supported; %__GFP_NORETRY 3966 * and %__GFP_RETRY_MAYFAIL are not supported. 3967 * 3968 * %__GFP_NOWARN can be used to suppress failure messages. 3969 * 3970 * Can not be called from interrupt nor NMI contexts. 
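 *
 * For example (illustrative): a caller that cannot sleep can use
 * __vmalloc(size, GFP_NOWAIT | __GFP_NOWARN) and must tolerate
 * failure, while a filesystem holding a transaction would pass
 * GFP_NOFS so that neither the page allocations nor the page-table
 * allocations recurse into the filesystem.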
3971 * Return: the address of the area or %NULL on failure 3972 */ 3973 void *__vmalloc_node_range_noprof(unsigned long size, unsigned long align, 3974 unsigned long start, unsigned long end, gfp_t gfp_mask, 3975 pgprot_t prot, unsigned long vm_flags, int node, 3976 const void *caller) 3977 { 3978 struct vm_struct *area; 3979 void *ret; 3980 kasan_vmalloc_flags_t kasan_flags = KASAN_VMALLOC_NONE; 3981 unsigned long original_align = align; 3982 unsigned int shift = PAGE_SHIFT; 3983 3984 if (WARN_ON_ONCE(!size)) 3985 return NULL; 3986 3987 if ((size >> PAGE_SHIFT) > totalram_pages()) { 3988 warn_alloc(gfp_mask, NULL, 3989 "vmalloc error: size %lu, exceeds total pages", 3990 size); 3991 return NULL; 3992 } 3993 3994 if (vmap_allow_huge && (vm_flags & VM_ALLOW_HUGE_VMAP)) { 3995 /* 3996 * Try huge pages. Only try for PAGE_KERNEL allocations, 3997 * others like modules don't yet expect huge pages in 3998 * their allocations due to apply_to_page_range not 3999 * supporting them. 4000 */ 4001 4002 if (arch_vmap_pmd_supported(prot) && size >= PMD_SIZE) 4003 shift = PMD_SHIFT; 4004 else 4005 shift = arch_vmap_pte_supported_shift(size); 4006 4007 align = max(original_align, 1UL << shift); 4008 } 4009 4010 again: 4011 area = __get_vm_area_node(size, align, shift, VM_ALLOC | 4012 VM_UNINITIALIZED | vm_flags, start, end, node, 4013 gfp_mask, caller); 4014 if (!area) { 4015 bool nofail = gfp_mask & __GFP_NOFAIL; 4016 warn_alloc(gfp_mask, NULL, 4017 "vmalloc error: size %lu, vm_struct allocation failed%s", 4018 size, (nofail) ? ". Retrying." : ""); 4019 if (nofail) { 4020 schedule_timeout_uninterruptible(1); 4021 goto again; 4022 } 4023 goto fail; 4024 } 4025 4026 /* 4027 * Prepare arguments for __vmalloc_area_node() and 4028 * kasan_unpoison_vmalloc(). 4029 */ 4030 if (pgprot_val(prot) == pgprot_val(PAGE_KERNEL)) { 4031 if (kasan_hw_tags_enabled()) { 4032 /* 4033 * Modify protection bits to allow tagging. 4034 * This must be done before mapping. 4035 */ 4036 prot = arch_vmap_pgprot_tagged(prot); 4037 4038 /* 4039 * Skip page_alloc poisoning and zeroing for physical 4040 * pages backing VM_ALLOC mapping. Memory is instead 4041 * poisoned and zeroed by kasan_unpoison_vmalloc(). 4042 */ 4043 gfp_mask |= __GFP_SKIP_KASAN | __GFP_SKIP_ZERO; 4044 } 4045 4046 /* Take note that the mapping is PAGE_KERNEL. */ 4047 kasan_flags |= KASAN_VMALLOC_PROT_NORMAL; 4048 } 4049 4050 /* Allocate physical pages and map them into vmalloc space. */ 4051 ret = __vmalloc_area_node(area, gfp_mask, prot, shift, node); 4052 if (!ret) 4053 goto fail; 4054 4055 /* 4056 * Mark the pages as accessible, now that they are mapped. 4057 * The condition for setting KASAN_VMALLOC_INIT should complement the 4058 * one in post_alloc_hook() with regards to the __GFP_SKIP_ZERO check 4059 * to make sure that memory is initialized under the same conditions. 4060 * Tag-based KASAN modes only assign tags to normal non-executable 4061 * allocations, see __kasan_unpoison_vmalloc(). 4062 */ 4063 kasan_flags |= KASAN_VMALLOC_VM_ALLOC; 4064 if (!want_init_on_free() && want_init_on_alloc(gfp_mask) && 4065 (gfp_mask & __GFP_SKIP_ZERO)) 4066 kasan_flags |= KASAN_VMALLOC_INIT; 4067 /* KASAN_VMALLOC_PROT_NORMAL already set if required. */ 4068 area->addr = kasan_unpoison_vmalloc(area->addr, size, kasan_flags); 4069 4070 /* 4071 * In this function, newly allocated vm_struct has VM_UNINITIALIZED 4072 * flag. It means that vm_struct is not fully initialized. 4073 * Now, it is fully initialized, so remove this flag here. 
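 * Clearing the flag pairs with the smp_rmb() in readers such as
 * vread_iter() and vmalloc_info_show(), which skip areas that still
 * carry VM_UNINITIALIZED.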
4074 */ 4075 clear_vm_uninitialized_flag(area); 4076 4077 if (!(vm_flags & VM_DEFER_KMEMLEAK)) 4078 kmemleak_vmalloc(area, PAGE_ALIGN(size), gfp_mask); 4079 4080 return area->addr; 4081 4082 fail: 4083 if (shift > PAGE_SHIFT) { 4084 shift = PAGE_SHIFT; 4085 align = original_align; 4086 goto again; 4087 } 4088 4089 return NULL; 4090 } 4091 4092 /** 4093 * __vmalloc_node - allocate virtually contiguous memory 4094 * @size: allocation size 4095 * @align: desired alignment 4096 * @gfp_mask: flags for the page level allocator 4097 * @node: node to use for allocation or NUMA_NO_NODE 4098 * @caller: caller's return address 4099 * 4100 * Allocate enough pages to cover @size from the page level allocator with 4101 * @gfp_mask flags. Map them into contiguous kernel virtual space. 4102 * 4103 * Semantics of @gfp_mask (including reclaim/retry modifiers such as 4104 * __GFP_NOFAIL) are the same as in __vmalloc_node_range_noprof(). 4105 * 4106 * Return: pointer to the allocated memory or %NULL on error 4107 */ 4108 void *__vmalloc_node_noprof(unsigned long size, unsigned long align, 4109 gfp_t gfp_mask, int node, const void *caller) 4110 { 4111 return __vmalloc_node_range_noprof(size, align, VMALLOC_START, VMALLOC_END, 4112 gfp_mask, PAGE_KERNEL, 0, node, caller); 4113 } 4114 /* 4115 * This is only for performance analysis of vmalloc and stress purpose. 4116 * It is required by vmalloc test module, therefore do not use it other 4117 * than that. 4118 */ 4119 #ifdef CONFIG_TEST_VMALLOC_MODULE 4120 EXPORT_SYMBOL_GPL(__vmalloc_node_noprof); 4121 #endif 4122 4123 void *__vmalloc_noprof(unsigned long size, gfp_t gfp_mask) 4124 { 4125 if (unlikely(gfp_mask & ~GFP_VMALLOC_SUPPORTED)) 4126 gfp_mask = vmalloc_fix_flags(gfp_mask); 4127 return __vmalloc_node_noprof(size, 1, gfp_mask, NUMA_NO_NODE, 4128 __builtin_return_address(0)); 4129 } 4130 EXPORT_SYMBOL(__vmalloc_noprof); 4131 4132 /** 4133 * vmalloc - allocate virtually contiguous memory 4134 * @size: allocation size 4135 * 4136 * Allocate enough pages to cover @size from the page level 4137 * allocator and map them into contiguous kernel virtual space. 4138 * 4139 * For tight control over page level allocator and protection flags 4140 * use __vmalloc() instead. 4141 * 4142 * Return: pointer to the allocated memory or %NULL on error 4143 */ 4144 void *vmalloc_noprof(unsigned long size) 4145 { 4146 return __vmalloc_node_noprof(size, 1, GFP_KERNEL, NUMA_NO_NODE, 4147 __builtin_return_address(0)); 4148 } 4149 EXPORT_SYMBOL(vmalloc_noprof); 4150 4151 /** 4152 * vmalloc_huge_node - allocate virtually contiguous memory, allow huge pages 4153 * @size: allocation size 4154 * @gfp_mask: flags for the page level allocator 4155 * @node: node to use for allocation or NUMA_NO_NODE 4156 * 4157 * Allocate enough pages to cover @size from the page level 4158 * allocator and map them into contiguous kernel virtual space. 
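 *
 * For example (an illustrative sketch only; the size and node values
 * are arbitrary):
 *
 *	void *p = vmalloc_huge_node(8 * PMD_SIZE, GFP_KERNEL, NUMA_NO_NODE);
 *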
4159 * If @size is greater than or equal to PMD_SIZE, allow using 4160 * huge pages for the memory 4161 * 4162 * Return: pointer to the allocated memory or %NULL on error 4163 */ 4164 void *vmalloc_huge_node_noprof(unsigned long size, gfp_t gfp_mask, int node) 4165 { 4166 if (unlikely(gfp_mask & ~GFP_VMALLOC_SUPPORTED)) 4167 gfp_mask = vmalloc_fix_flags(gfp_mask); 4168 return __vmalloc_node_range_noprof(size, 1, VMALLOC_START, VMALLOC_END, 4169 gfp_mask, PAGE_KERNEL, VM_ALLOW_HUGE_VMAP, 4170 node, __builtin_return_address(0)); 4171 } 4172 EXPORT_SYMBOL_GPL(vmalloc_huge_node_noprof); 4173 4174 /** 4175 * vzalloc - allocate virtually contiguous memory with zero fill 4176 * @size: allocation size 4177 * 4178 * Allocate enough pages to cover @size from the page level 4179 * allocator and map them into contiguous kernel virtual space. 4180 * The memory allocated is set to zero. 4181 * 4182 * For tight control over page level allocator and protection flags 4183 * use __vmalloc() instead. 4184 * 4185 * Return: pointer to the allocated memory or %NULL on error 4186 */ 4187 void *vzalloc_noprof(unsigned long size) 4188 { 4189 return __vmalloc_node_noprof(size, 1, GFP_KERNEL | __GFP_ZERO, NUMA_NO_NODE, 4190 __builtin_return_address(0)); 4191 } 4192 EXPORT_SYMBOL(vzalloc_noprof); 4193 4194 /** 4195 * vmalloc_user - allocate zeroed virtually contiguous memory for userspace 4196 * @size: allocation size 4197 * 4198 * The resulting memory area is zeroed so it can be mapped to userspace 4199 * without leaking data. 4200 * 4201 * Return: pointer to the allocated memory or %NULL on error 4202 */ 4203 void *vmalloc_user_noprof(unsigned long size) 4204 { 4205 return __vmalloc_node_range_noprof(size, SHMLBA, VMALLOC_START, VMALLOC_END, 4206 GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL, 4207 VM_USERMAP, NUMA_NO_NODE, 4208 __builtin_return_address(0)); 4209 } 4210 EXPORT_SYMBOL(vmalloc_user_noprof); 4211 4212 /** 4213 * vmalloc_node - allocate memory on a specific node 4214 * @size: allocation size 4215 * @node: numa node 4216 * 4217 * Allocate enough pages to cover @size from the page level 4218 * allocator and map them into contiguous kernel virtual space. 4219 * 4220 * For tight control over page level allocator and protection flags 4221 * use __vmalloc() instead. 4222 * 4223 * Return: pointer to the allocated memory or %NULL on error 4224 */ 4225 void *vmalloc_node_noprof(unsigned long size, int node) 4226 { 4227 return __vmalloc_node_noprof(size, 1, GFP_KERNEL, node, 4228 __builtin_return_address(0)); 4229 } 4230 EXPORT_SYMBOL(vmalloc_node_noprof); 4231 4232 /** 4233 * vzalloc_node - allocate memory on a specific node with zero fill 4234 * @size: allocation size 4235 * @node: numa node 4236 * 4237 * Allocate enough pages to cover @size from the page level 4238 * allocator and map them into contiguous kernel virtual space. 4239 * The memory allocated is set to zero. 
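 *
 * For example (illustrative only; the size is arbitrary and "nid" is a
 * placeholder for the caller's target node):
 *
 *	void *buf = vzalloc_node(SZ_64K, nid);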
4240 * 4241 * Return: pointer to the allocated memory or %NULL on error 4242 */ 4243 void *vzalloc_node_noprof(unsigned long size, int node) 4244 { 4245 return __vmalloc_node_noprof(size, 1, GFP_KERNEL | __GFP_ZERO, node, 4246 __builtin_return_address(0)); 4247 } 4248 EXPORT_SYMBOL(vzalloc_node_noprof); 4249 4250 /** 4251 * vrealloc_node_align_noprof - reallocate virtually contiguous memory; contents 4252 * remain unchanged 4253 * @p: object to reallocate memory for 4254 * @size: the size to reallocate 4255 * @align: requested alignment 4256 * @flags: the flags for the page level allocator 4257 * @nid: node number of the target node 4258 * 4259 * If @p is %NULL, vrealloc_XXX() behaves exactly like vmalloc_XXX(). If @size 4260 * is 0 and @p is not a %NULL pointer, the object pointed to is freed. 4261 * 4262 * If the caller wants the new memory to be on specific node *only*, 4263 * __GFP_THISNODE flag should be set, otherwise the function will try to avoid 4264 * reallocation and possibly disregard the specified @nid. 4265 * 4266 * If __GFP_ZERO logic is requested, callers must ensure that, starting with the 4267 * initial memory allocation, every subsequent call to this API for the same 4268 * memory allocation is flagged with __GFP_ZERO. Otherwise, it is possible that 4269 * __GFP_ZERO is not fully honored by this API. 4270 * 4271 * Requesting an alignment that is bigger than the alignment of the existing 4272 * allocation will fail. 4273 * 4274 * In any case, the contents of the object pointed to are preserved up to the 4275 * lesser of the new and old sizes. 4276 * 4277 * This function must not be called concurrently with itself or vfree() for the 4278 * same memory allocation. 4279 * 4280 * Return: pointer to the allocated memory; %NULL if @size is zero or in case of 4281 * failure 4282 */ 4283 void *vrealloc_node_align_noprof(const void *p, size_t size, unsigned long align, 4284 gfp_t flags, int nid) 4285 { 4286 struct vm_struct *vm = NULL; 4287 size_t alloced_size = 0; 4288 size_t old_size = 0; 4289 void *n; 4290 4291 if (!size) { 4292 vfree(p); 4293 return NULL; 4294 } 4295 4296 if (p) { 4297 vm = find_vm_area(p); 4298 if (unlikely(!vm)) { 4299 WARN(1, "Trying to vrealloc() nonexistent vm area (%p)\n", p); 4300 return NULL; 4301 } 4302 4303 alloced_size = get_vm_area_size(vm); 4304 old_size = vm->requested_size; 4305 if (WARN(alloced_size < old_size, 4306 "vrealloc() has mismatched area vs requested sizes (%p)\n", p)) 4307 return NULL; 4308 if (WARN(!IS_ALIGNED((unsigned long)p, align), 4309 "will not reallocate with a bigger alignment (0x%lx)\n", align)) 4310 return NULL; 4311 if (unlikely(flags & __GFP_THISNODE) && nid != NUMA_NO_NODE && 4312 nid != page_to_nid(vmalloc_to_page(p))) 4313 goto need_realloc; 4314 } 4315 4316 /* 4317 * TODO: Shrink the vm_area, i.e. unmap and free unused pages. What 4318 * would be a good heuristic for when to shrink the vm_area? 4319 */ 4320 if (size <= old_size) { 4321 /* Zero out "freed" memory, potentially for future realloc. */ 4322 if (want_init_on_free() || want_init_on_alloc(flags)) 4323 memset((void *)p + size, 0, old_size - size); 4324 vm->requested_size = size; 4325 kasan_poison_vmalloc(p + size, old_size - size); 4326 return (void *)p; 4327 } 4328 4329 /* 4330 * We already have the bytes available in the allocation; use them. 
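 * Only the KASAN state for the re-exposed range needs to be updated
 * here; the backing pages are already allocated and mapped.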
4331 */ 4332 if (size <= alloced_size) { 4333 kasan_unpoison_vmalloc(p + old_size, size - old_size, 4334 KASAN_VMALLOC_PROT_NORMAL); 4335 /* 4336 * No need to zero memory here, as unused memory will have 4337 * already been zeroed at initial allocation time or during 4338 * realloc shrink time. 4339 */ 4340 vm->requested_size = size; 4341 return (void *)p; 4342 } 4343 4344 need_realloc: 4345 /* TODO: Grow the vm_area, i.e. allocate and map additional pages. */ 4346 n = __vmalloc_node_noprof(size, align, flags, nid, __builtin_return_address(0)); 4347 4348 if (!n) 4349 return NULL; 4350 4351 if (p) { 4352 memcpy(n, p, old_size); 4353 vfree(p); 4354 } 4355 4356 return n; 4357 } 4358 4359 #if defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA32) 4360 #define GFP_VMALLOC32 (GFP_DMA32 | GFP_KERNEL) 4361 #elif defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA) 4362 #define GFP_VMALLOC32 (GFP_DMA | GFP_KERNEL) 4363 #else 4364 /* 4365 * 64b systems should always have either DMA or DMA32 zones. For others 4366 * GFP_DMA32 should do the right thing and use the normal zone. 4367 */ 4368 #define GFP_VMALLOC32 (GFP_DMA32 | GFP_KERNEL) 4369 #endif 4370 4371 /** 4372 * vmalloc_32 - allocate virtually contiguous memory (32bit addressable) 4373 * @size: allocation size 4374 * 4375 * Allocate enough 32bit PA addressable pages to cover @size from the 4376 * page level allocator and map them into contiguous kernel virtual space. 4377 * 4378 * Return: pointer to the allocated memory or %NULL on error 4379 */ 4380 void *vmalloc_32_noprof(unsigned long size) 4381 { 4382 return __vmalloc_node_noprof(size, 1, GFP_VMALLOC32, NUMA_NO_NODE, 4383 __builtin_return_address(0)); 4384 } 4385 EXPORT_SYMBOL(vmalloc_32_noprof); 4386 4387 /** 4388 * vmalloc_32_user - allocate zeroed virtually contiguous 32bit memory 4389 * @size: allocation size 4390 * 4391 * The resulting memory area is 32bit addressable and zeroed so it can be 4392 * mapped to userspace without leaking data. 4393 * 4394 * Return: pointer to the allocated memory or %NULL on error 4395 */ 4396 void *vmalloc_32_user_noprof(unsigned long size) 4397 { 4398 return __vmalloc_node_range_noprof(size, SHMLBA, VMALLOC_START, VMALLOC_END, 4399 GFP_VMALLOC32 | __GFP_ZERO, PAGE_KERNEL, 4400 VM_USERMAP, NUMA_NO_NODE, 4401 __builtin_return_address(0)); 4402 } 4403 EXPORT_SYMBOL(vmalloc_32_user_noprof); 4404 4405 /* 4406 * Atomically zero bytes in the iterator. 4407 * 4408 * Returns the number of zeroed bytes. 4409 */ 4410 static size_t zero_iter(struct iov_iter *iter, size_t count) 4411 { 4412 size_t remains = count; 4413 4414 while (remains > 0) { 4415 size_t num, copied; 4416 4417 num = min_t(size_t, remains, PAGE_SIZE); 4418 copied = copy_page_to_iter_nofault(ZERO_PAGE(0), 0, num, iter); 4419 remains -= copied; 4420 4421 if (copied < num) 4422 break; 4423 } 4424 4425 return count - remains; 4426 } 4427 4428 /* 4429 * small helper routine, copy contents to iter from addr. 4430 * If the page is not present, fill zero. 4431 * 4432 * Returns the number of copied bytes. 4433 */ 4434 static size_t aligned_vread_iter(struct iov_iter *iter, 4435 const char *addr, size_t count) 4436 { 4437 size_t remains = count; 4438 struct page *page; 4439 4440 while (remains > 0) { 4441 unsigned long offset, length; 4442 size_t copied = 0; 4443 4444 offset = offset_in_page(addr); 4445 length = PAGE_SIZE - offset; 4446 if (length > remains) 4447 length = remains; 4448 page = vmalloc_to_page(addr); 4449 /* 4450 * To do safe access to this _mapped_ area, we need lock. 
But 4451 * adding a lock here means that we need to add the overhead of 4452 * vmalloc()/vfree() calls for this _debug_ interface, which is rarely 4453 * used. Instead of that, we'll use a local mapping via 4454 * copy_page_to_iter_nofault() and accept a small overhead in 4455 * this access function. 4456 */ 4457 if (page) 4458 copied = copy_page_to_iter_nofault(page, offset, 4459 length, iter); 4460 else 4461 copied = zero_iter(iter, length); 4462 4463 addr += copied; 4464 remains -= copied; 4465 4466 if (copied != length) 4467 break; 4468 } 4469 4470 return count - remains; 4471 } 4472 4473 /* 4474 * Read from a vm_map_ram region of memory. 4475 * 4476 * Returns the number of copied bytes. 4477 */ 4478 static size_t vmap_ram_vread_iter(struct iov_iter *iter, const char *addr, 4479 size_t count, unsigned long flags) 4480 { 4481 char *start; 4482 struct vmap_block *vb; 4483 struct xarray *xa; 4484 unsigned long offset; 4485 unsigned int rs, re; 4486 size_t remains, n; 4487 4488 /* 4489 * If it's an area created by the vm_map_ram() interface directly, but 4490 * not further subdivided with management delegated to a vmap_block, 4491 * handle it here. 4492 */ 4493 if (!(flags & VMAP_BLOCK)) 4494 return aligned_vread_iter(iter, addr, count); 4495 4496 remains = count; 4497 4498 /* 4499 * The area is split into regions and tracked with a vmap_block; read out 4500 * each region and zero-fill the holes between regions. 4501 */ 4502 xa = addr_to_vb_xa((unsigned long) addr); 4503 vb = xa_load(xa, addr_to_vb_idx((unsigned long)addr)); 4504 if (!vb) 4505 goto finished_zero; 4506 4507 spin_lock(&vb->lock); 4508 if (bitmap_empty(vb->used_map, VMAP_BBMAP_BITS)) { 4509 spin_unlock(&vb->lock); 4510 goto finished_zero; 4511 } 4512 4513 for_each_set_bitrange(rs, re, vb->used_map, VMAP_BBMAP_BITS) { 4514 size_t copied; 4515 4516 if (remains == 0) 4517 goto finished; 4518 4519 start = vmap_block_vaddr(vb->va->va_start, rs); 4520 4521 if (addr < start) { 4522 size_t to_zero = min_t(size_t, start - addr, remains); 4523 size_t zeroed = zero_iter(iter, to_zero); 4524 4525 addr += zeroed; 4526 remains -= zeroed; 4527 4528 if (remains == 0 || zeroed != to_zero) 4529 goto finished; 4530 } 4531 4532 /* It could start reading from the middle of a used region. */ 4533 offset = offset_in_page(addr); 4534 n = ((re - rs + 1) << PAGE_SHIFT) - offset; 4535 if (n > remains) 4536 n = remains; 4537 4538 copied = aligned_vread_iter(iter, start + offset, n); 4539 4540 addr += copied; 4541 remains -= copied; 4542 4543 if (copied != n) 4544 goto finished; 4545 } 4546 4547 spin_unlock(&vb->lock); 4548 4549 finished_zero: 4550 /* zero-fill the remaining dirty or free regions */ 4551 return count - remains + zero_iter(iter, remains); 4552 finished: 4553 /* We couldn't copy/zero everything */ 4554 spin_unlock(&vb->lock); 4555 return count - remains; 4556 } 4557 4558 /** 4559 * vread_iter() - read vmalloc area in a safe way to an iterator. 4560 * @iter: the iterator to which data should be written. 4561 * @addr: vm address. 4562 * @count: number of bytes to be read. 4563 * 4564 * This function checks that addr is a valid vmalloc'ed area, and 4565 * copies data from that area to the given iterator. If the given memory range 4566 * of [addr...addr+count) includes some valid address, data is copied to 4567 * the proper area of @iter. If there are memory holes, they'll be zero-filled. 4568 * An IOREMAP area is treated as a memory hole and no copy is done. 4569 * 4570 * If [addr...addr+count) doesn't include any intersection with a live 4571 * vm_struct area, 0 is returned.
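 *
 * A sketch of a caller (illustrative only; "buf" and "len" are
 * placeholders and error handling is omitted):
 *
 *	struct iov_iter iter;
 *	struct kvec kvec = { .iov_base = buf, .iov_len = len };
 *	long copied;
 *
 *	iov_iter_kvec(&iter, ITER_DEST, &kvec, 1, len);
 *	copied = vread_iter(&iter, addr, len);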
4572 * 4573 * Note: In usual ops, vread_iter() is never necessary because the caller 4574 * should know the vmalloc() area is valid and can use memcpy(). 4575 * This is for routines which have to access the vmalloc area without 4576 * any prior information, such as /proc/kcore. 4577 * 4578 * Return: number of bytes for which @addr and @iter should be advanced 4579 * (same number as @count) or %0 if [addr...addr+count) doesn't 4580 * include any intersection with a valid vmalloc area 4581 */ 4582 long vread_iter(struct iov_iter *iter, const char *addr, size_t count) 4583 { 4584 struct vmap_node *vn; 4585 struct vmap_area *va; 4586 struct vm_struct *vm; 4587 char *vaddr; 4588 size_t n, size, flags, remains; 4589 unsigned long next; 4590 4591 addr = kasan_reset_tag(addr); 4592 4593 /* Don't allow overflow */ 4594 if ((unsigned long) addr + count < count) 4595 count = -(unsigned long) addr; 4596 4597 remains = count; 4598 4599 vn = find_vmap_area_exceed_addr_lock((unsigned long) addr, &va); 4600 if (!vn) 4601 goto finished_zero; 4602 4603 /* No intersection with a live vmap_area. */ 4604 if ((unsigned long)addr + remains <= va->va_start) 4605 goto finished_zero; 4606 4607 do { 4608 size_t copied; 4609 4610 if (remains == 0) 4611 goto finished; 4612 4613 vm = va->vm; 4614 flags = va->flags & VMAP_FLAGS_MASK; 4615 /* 4616 * VMAP_BLOCK indicates a sub-type of vm_map_ram area and needs 4617 * to be set together with VMAP_RAM. 4618 */ 4619 WARN_ON(flags == VMAP_BLOCK); 4620 4621 if (!vm && !flags) 4622 goto next_va; 4623 4624 if (vm && (vm->flags & VM_UNINITIALIZED)) 4625 goto next_va; 4626 4627 /* Pair with smp_wmb() in clear_vm_uninitialized_flag() */ 4628 smp_rmb(); 4629 4630 vaddr = (char *) va->va_start; 4631 size = vm ? get_vm_area_size(vm) : va_size(va); 4632 4633 if (addr >= vaddr + size) 4634 goto next_va; 4635 4636 if (addr < vaddr) { 4637 size_t to_zero = min_t(size_t, vaddr - addr, remains); 4638 size_t zeroed = zero_iter(iter, to_zero); 4639 4640 addr += zeroed; 4641 remains -= zeroed; 4642 4643 if (remains == 0 || zeroed != to_zero) 4644 goto finished; 4645 } 4646 4647 n = vaddr + size - addr; 4648 if (n > remains) 4649 n = remains; 4650 4651 if (flags & VMAP_RAM) 4652 copied = vmap_ram_vread_iter(iter, addr, n, flags); 4653 else if (!(vm && (vm->flags & (VM_IOREMAP | VM_SPARSE)))) 4654 copied = aligned_vread_iter(iter, addr, n); 4655 else /* IOREMAP | SPARSE area is treated as memory hole */ 4656 copied = zero_iter(iter, n); 4657 4658 addr += copied; 4659 remains -= copied; 4660 4661 if (copied != n) 4662 goto finished; 4663 4664 next_va: 4665 next = va->va_end; 4666 spin_unlock(&vn->busy.lock); 4667 } while ((vn = find_vmap_area_exceed_addr_lock(next, &va))); 4668 4669 finished_zero: 4670 if (vn) 4671 spin_unlock(&vn->busy.lock); 4672 4673 /* zero-fill memory holes */ 4674 return count - remains + zero_iter(iter, remains); 4675 finished: 4676 /* Nothing remains, or we couldn't copy/zero everything. */ 4677 if (vn) 4678 spin_unlock(&vn->busy.lock); 4679 4680 return count - remains; 4681 } 4682 4683 /** 4684 * remap_vmalloc_range_partial - map vmalloc pages to userspace 4685 * @vma: vma to cover 4686 * @uaddr: target user address to start at 4687 * @kaddr: virtual address of vmalloc kernel memory 4688 * @pgoff: offset from @kaddr to start at 4689 * @size: size of map area 4690 * 4691 * Returns: 0 for success, -Exxx on failure 4692 * 4693 * This function checks that @kaddr is a valid vmalloc'ed area 4694 * created with VM_USERMAP or VM_DMA_COHERENT set, and that it is 4695 * big enough to cover the range starting at @uaddr in @vma; failure is returned if these conditions are not met.
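 *
 * For example (an illustrative sketch; "my_buf" stands for a buffer
 * that was allocated with vmalloc_user() or another VM_USERMAP
 * allocation):
 *
 *	static int my_mmap(struct file *file, struct vm_area_struct *vma)
 *	{
 *		return remap_vmalloc_range(vma, my_buf, 0);
 *	}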
4696 * 4697 * 4698 * Similar to remap_pfn_range() (see mm/memory.c) 4699 */ 4700 int remap_vmalloc_range_partial(struct vm_area_struct *vma, unsigned long uaddr, 4701 void *kaddr, unsigned long pgoff, 4702 unsigned long size) 4703 { 4704 struct vm_struct *area; 4705 unsigned long off; 4706 unsigned long end_index; 4707 4708 if (check_shl_overflow(pgoff, PAGE_SHIFT, &off)) 4709 return -EINVAL; 4710 4711 size = PAGE_ALIGN(size); 4712 4713 if (!PAGE_ALIGNED(uaddr) || !PAGE_ALIGNED(kaddr)) 4714 return -EINVAL; 4715 4716 area = find_vm_area(kaddr); 4717 if (!area) 4718 return -EINVAL; 4719 4720 if (!(area->flags & (VM_USERMAP | VM_DMA_COHERENT))) 4721 return -EINVAL; 4722 4723 if (check_add_overflow(size, off, &end_index) || 4724 end_index > get_vm_area_size(area)) 4725 return -EINVAL; 4726 kaddr += off; 4727 4728 do { 4729 struct page *page = vmalloc_to_page(kaddr); 4730 int ret; 4731 4732 ret = vm_insert_page(vma, uaddr, page); 4733 if (ret) 4734 return ret; 4735 4736 uaddr += PAGE_SIZE; 4737 kaddr += PAGE_SIZE; 4738 size -= PAGE_SIZE; 4739 } while (size > 0); 4740 4741 vm_flags_set(vma, VM_DONTEXPAND | VM_DONTDUMP); 4742 4743 return 0; 4744 } 4745 4746 /** 4747 * remap_vmalloc_range - map vmalloc pages to userspace 4748 * @vma: vma to cover (map full range of vma) 4749 * @addr: vmalloc memory 4750 * @pgoff: number of pages into addr before first page to map 4751 * 4752 * Returns: 0 for success, -Exxx on failure 4753 * 4754 * This function checks that addr is a valid vmalloc'ed area, and 4755 * that it is big enough to cover the vma. Will return failure if 4756 * that criterion isn't met. 4757 * 4758 * Similar to remap_pfn_range() (see mm/memory.c) 4759 */ 4760 int remap_vmalloc_range(struct vm_area_struct *vma, void *addr, 4761 unsigned long pgoff) 4762 { 4763 return remap_vmalloc_range_partial(vma, vma->vm_start, 4764 addr, pgoff, 4765 vma->vm_end - vma->vm_start); 4766 } 4767 EXPORT_SYMBOL(remap_vmalloc_range); 4768 4769 void free_vm_area(struct vm_struct *area) 4770 { 4771 struct vm_struct *ret; 4772 ret = remove_vm_area(area->addr); 4773 BUG_ON(ret != area); 4774 kfree(area); 4775 } 4776 EXPORT_SYMBOL_GPL(free_vm_area); 4777 4778 #ifdef CONFIG_SMP 4779 static struct vmap_area *node_to_va(struct rb_node *n) 4780 { 4781 return rb_entry_safe(n, struct vmap_area, rb_node); 4782 } 4783 4784 /** 4785 * pvm_find_va_enclose_addr - find the vmap_area @addr belongs to 4786 * @addr: target address 4787 * 4788 * Returns: the vmap_area if it is found. If there is no such area, 4789 * the first highest (reverse order) vmap_area is returned, 4790 * i.e. va->va_start < addr && va->va_end < addr, or NULL 4791 * if there are no areas at all before @addr. 4792 */ 4793 static struct vmap_area * 4794 pvm_find_va_enclose_addr(unsigned long addr) 4795 { 4796 struct vmap_area *va, *tmp; 4797 struct rb_node *n; 4798 4799 n = free_vmap_area_root.rb_node; 4800 va = NULL; 4801 4802 while (n) { 4803 tmp = rb_entry(n, struct vmap_area, rb_node); 4804 if (tmp->va_start <= addr) { 4805 va = tmp; 4806 if (tmp->va_end >= addr) 4807 break; 4808 4809 n = n->rb_right; 4810 } else { 4811 n = n->rb_left; 4812 } 4813 } 4814 4815 return va; 4816 } 4817 4818 /** 4819 * pvm_determine_end_from_reverse - find the highest aligned address 4820 * of free block below VMALLOC_END 4821 * @va: 4822 * in - the VA we start the search from (reverse order); 4823 * out - the VA with the highest aligned end address.
4824 * @align: alignment for required highest address 4825 * 4826 * Returns: determined end address within vmap_area 4827 */ 4828 static unsigned long 4829 pvm_determine_end_from_reverse(struct vmap_area **va, unsigned long align) 4830 { 4831 unsigned long vmalloc_end = VMALLOC_END & ~(align - 1); 4832 unsigned long addr; 4833 4834 if (likely(*va)) { 4835 list_for_each_entry_from_reverse((*va), 4836 &free_vmap_area_list, list) { 4837 addr = min((*va)->va_end & ~(align - 1), vmalloc_end); 4838 if ((*va)->va_start < addr) 4839 return addr; 4840 } 4841 } 4842 4843 return 0; 4844 } 4845 4846 /** 4847 * pcpu_get_vm_areas - allocate vmalloc areas for percpu allocator 4848 * @offsets: array containing offset of each area 4849 * @sizes: array containing size of each area 4850 * @nr_vms: the number of areas to allocate 4851 * @align: alignment, all entries in @offsets and @sizes must be aligned to this 4852 * 4853 * Returns: kmalloc'd vm_struct pointer array pointing to allocated 4854 * vm_structs on success, %NULL on failure 4855 * 4856 * Percpu allocator wants to use congruent vm areas so that it can 4857 * maintain the offsets among percpu areas. This function allocates 4858 * congruent vmalloc areas for it with GFP_KERNEL. These areas tend to 4859 * be scattered pretty far, distance between two areas easily going up 4860 * to gigabytes. To avoid interacting with regular vmallocs, these 4861 * areas are allocated from top. 4862 * 4863 * Despite its complicated look, this allocator is rather simple. It 4864 * does everything top-down and scans free blocks from the end looking 4865 * for matching base. While scanning, if any of the areas do not fit the 4866 * base address is pulled down to fit the area. Scanning is repeated till 4867 * all the areas fit and then all necessary data structures are inserted 4868 * and the result is returned. 4869 */ 4870 struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets, 4871 const size_t *sizes, int nr_vms, 4872 size_t align) 4873 { 4874 const unsigned long vmalloc_start = ALIGN(VMALLOC_START, align); 4875 const unsigned long vmalloc_end = VMALLOC_END & ~(align - 1); 4876 struct vmap_area **vas, *va; 4877 struct vm_struct **vms; 4878 int area, area2, last_area, term_area; 4879 unsigned long base, start, size, end, last_end, orig_start, orig_end; 4880 bool purged = false; 4881 4882 /* verify parameters and allocate data structures */ 4883 BUG_ON(offset_in_page(align) || !is_power_of_2(align)); 4884 for (last_area = 0, area = 0; area < nr_vms; area++) { 4885 start = offsets[area]; 4886 end = start + sizes[area]; 4887 4888 /* is everything aligned properly? 
*/ 4889 BUG_ON(!IS_ALIGNED(offsets[area], align)); 4890 BUG_ON(!IS_ALIGNED(sizes[area], align)); 4891 4892 /* detect the area with the highest address */ 4893 if (start > offsets[last_area]) 4894 last_area = area; 4895 4896 for (area2 = area + 1; area2 < nr_vms; area2++) { 4897 unsigned long start2 = offsets[area2]; 4898 unsigned long end2 = start2 + sizes[area2]; 4899 4900 BUG_ON(start2 < end && start < end2); 4901 } 4902 } 4903 last_end = offsets[last_area] + sizes[last_area]; 4904 4905 if (vmalloc_end - vmalloc_start < last_end) { 4906 WARN_ON(true); 4907 return NULL; 4908 } 4909 4910 vms = kcalloc(nr_vms, sizeof(vms[0]), GFP_KERNEL); 4911 vas = kcalloc(nr_vms, sizeof(vas[0]), GFP_KERNEL); 4912 if (!vas || !vms) 4913 goto err_free2; 4914 4915 for (area = 0; area < nr_vms; area++) { 4916 vas[area] = kmem_cache_zalloc(vmap_area_cachep, GFP_KERNEL); 4917 vms[area] = kzalloc(sizeof(struct vm_struct), GFP_KERNEL); 4918 if (!vas[area] || !vms[area]) 4919 goto err_free; 4920 } 4921 retry: 4922 spin_lock(&free_vmap_area_lock); 4923 4924 /* start scanning - we scan from the top, begin with the last area */ 4925 area = term_area = last_area; 4926 start = offsets[area]; 4927 end = start + sizes[area]; 4928 4929 va = pvm_find_va_enclose_addr(vmalloc_end); 4930 base = pvm_determine_end_from_reverse(&va, align) - end; 4931 4932 while (true) { 4933 /* 4934 * base might have underflowed, add last_end before 4935 * comparing. 4936 */ 4937 if (base + last_end < vmalloc_start + last_end) 4938 goto overflow; 4939 4940 /* 4941 * Fitting base has not been found. 4942 */ 4943 if (va == NULL) 4944 goto overflow; 4945 4946 /* 4947 * If required width exceeds current VA block, move 4948 * base downwards and then recheck. 4949 */ 4950 if (base + end > va->va_end) { 4951 base = pvm_determine_end_from_reverse(&va, align) - end; 4952 term_area = area; 4953 continue; 4954 } 4955 4956 /* 4957 * If this VA does not fit, move base downwards and recheck. 4958 */ 4959 if (base + start < va->va_start) { 4960 va = node_to_va(rb_prev(&va->rb_node)); 4961 base = pvm_determine_end_from_reverse(&va, align) - end; 4962 term_area = area; 4963 continue; 4964 } 4965 4966 /* 4967 * This area fits, move on to the previous one. If 4968 * the previous one is the terminal one, we're done. 4969 */ 4970 area = (area + nr_vms - 1) % nr_vms; 4971 if (area == term_area) 4972 break; 4973 4974 start = offsets[area]; 4975 end = start + sizes[area]; 4976 va = pvm_find_va_enclose_addr(base + end); 4977 } 4978 4979 /* we've found a fitting base, insert all va's */ 4980 for (area = 0; area < nr_vms; area++) { 4981 int ret; 4982 4983 start = base + offsets[area]; 4984 size = sizes[area]; 4985 4986 va = pvm_find_va_enclose_addr(start); 4987 if (WARN_ON_ONCE(va == NULL)) 4988 /* It is a BUG(), but trigger recovery instead. */ 4989 goto recovery; 4990 4991 ret = va_clip(&free_vmap_area_root, 4992 &free_vmap_area_list, va, start, size); 4993 if (WARN_ON_ONCE(unlikely(ret))) 4994 /* It is a BUG(), but trigger recovery instead. */ 4995 goto recovery; 4996 4997 /* Allocated area. 
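 * Give the preallocated vmap_area the range that was just clipped out
 * of the free tree; it is inserted into the busy tree later, once all
 * requested areas have been carved out.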
*/ 4998 va = vas[area]; 4999 va->va_start = start; 5000 va->va_end = start + size; 5001 } 5002 5003 spin_unlock(&free_vmap_area_lock); 5004 5005 /* populate the kasan shadow space */ 5006 for (area = 0; area < nr_vms; area++) { 5007 if (kasan_populate_vmalloc(vas[area]->va_start, sizes[area], GFP_KERNEL)) 5008 goto err_free_shadow; 5009 } 5010 5011 /* insert all vm's */ 5012 for (area = 0; area < nr_vms; area++) { 5013 struct vmap_node *vn = addr_to_node(vas[area]->va_start); 5014 5015 spin_lock(&vn->busy.lock); 5016 insert_vmap_area(vas[area], &vn->busy.root, &vn->busy.head); 5017 setup_vmalloc_vm(vms[area], vas[area], VM_ALLOC, 5018 pcpu_get_vm_areas); 5019 spin_unlock(&vn->busy.lock); 5020 } 5021 5022 /* 5023 * Mark allocated areas as accessible. Do it now as a best-effort 5024 * approach, as they can be mapped outside of vmalloc code. 5025 * With hardware tag-based KASAN, marking is skipped for 5026 * non-VM_ALLOC mappings, see __kasan_unpoison_vmalloc(). 5027 */ 5028 for (area = 0; area < nr_vms; area++) 5029 vms[area]->addr = kasan_unpoison_vmalloc(vms[area]->addr, 5030 vms[area]->size, KASAN_VMALLOC_PROT_NORMAL); 5031 5032 kfree(vas); 5033 return vms; 5034 5035 recovery: 5036 /* 5037 * Remove previously allocated areas. There is no 5038 * need in removing these areas from the busy tree, 5039 * because they are inserted only on the final step 5040 * and when pcpu_get_vm_areas() is success. 5041 */ 5042 while (area--) { 5043 orig_start = vas[area]->va_start; 5044 orig_end = vas[area]->va_end; 5045 va = merge_or_add_vmap_area_augment(vas[area], &free_vmap_area_root, 5046 &free_vmap_area_list); 5047 if (va) 5048 kasan_release_vmalloc(orig_start, orig_end, 5049 va->va_start, va->va_end, 5050 KASAN_VMALLOC_PAGE_RANGE | KASAN_VMALLOC_TLB_FLUSH); 5051 vas[area] = NULL; 5052 } 5053 5054 overflow: 5055 spin_unlock(&free_vmap_area_lock); 5056 if (!purged) { 5057 reclaim_and_purge_vmap_areas(); 5058 purged = true; 5059 5060 /* Before "retry", check if we recover. */ 5061 for (area = 0; area < nr_vms; area++) { 5062 if (vas[area]) 5063 continue; 5064 5065 vas[area] = kmem_cache_zalloc( 5066 vmap_area_cachep, GFP_KERNEL); 5067 if (!vas[area]) 5068 goto err_free; 5069 } 5070 5071 goto retry; 5072 } 5073 5074 err_free: 5075 for (area = 0; area < nr_vms; area++) { 5076 if (vas[area]) 5077 kmem_cache_free(vmap_area_cachep, vas[area]); 5078 5079 kfree(vms[area]); 5080 } 5081 err_free2: 5082 kfree(vas); 5083 kfree(vms); 5084 return NULL; 5085 5086 err_free_shadow: 5087 spin_lock(&free_vmap_area_lock); 5088 /* 5089 * We release all the vmalloc shadows, even the ones for regions that 5090 * hadn't been successfully added. This relies on kasan_release_vmalloc 5091 * being able to tolerate this case. 5092 */ 5093 for (area = 0; area < nr_vms; area++) { 5094 orig_start = vas[area]->va_start; 5095 orig_end = vas[area]->va_end; 5096 va = merge_or_add_vmap_area_augment(vas[area], &free_vmap_area_root, 5097 &free_vmap_area_list); 5098 if (va) 5099 kasan_release_vmalloc(orig_start, orig_end, 5100 va->va_start, va->va_end, 5101 KASAN_VMALLOC_PAGE_RANGE | KASAN_VMALLOC_TLB_FLUSH); 5102 vas[area] = NULL; 5103 kfree(vms[area]); 5104 } 5105 spin_unlock(&free_vmap_area_lock); 5106 kfree(vas); 5107 kfree(vms); 5108 return NULL; 5109 } 5110 5111 /** 5112 * pcpu_free_vm_areas - free vmalloc areas for percpu allocator 5113 * @vms: vm_struct pointer array returned by pcpu_get_vm_areas() 5114 * @nr_vms: the number of allocated areas 5115 * 5116 * Free vm_structs and the array allocated by pcpu_get_vm_areas(). 
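 *
 * A minimal pairing sketch (illustrative only; building the @offsets
 * and @sizes arrays is the caller's responsibility):
 *
 *	vms = pcpu_get_vm_areas(offsets, sizes, nr_vms, align);
 *	if (!vms)
 *		return -ENOMEM;
 *	...
 *	pcpu_free_vm_areas(vms, nr_vms);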
5117 */ 5118 void pcpu_free_vm_areas(struct vm_struct **vms, int nr_vms) 5119 { 5120 int i; 5121 5122 for (i = 0; i < nr_vms; i++) 5123 free_vm_area(vms[i]); 5124 kfree(vms); 5125 } 5126 #endif /* CONFIG_SMP */ 5127 5128 #ifdef CONFIG_PRINTK 5129 bool vmalloc_dump_obj(void *object) 5130 { 5131 const void *caller; 5132 struct vm_struct *vm; 5133 struct vmap_area *va; 5134 struct vmap_node *vn; 5135 unsigned long addr; 5136 unsigned int nr_pages; 5137 5138 addr = PAGE_ALIGN((unsigned long) object); 5139 vn = addr_to_node(addr); 5140 5141 if (!spin_trylock(&vn->busy.lock)) 5142 return false; 5143 5144 va = __find_vmap_area(addr, &vn->busy.root); 5145 if (!va || !va->vm) { 5146 spin_unlock(&vn->busy.lock); 5147 return false; 5148 } 5149 5150 vm = va->vm; 5151 addr = (unsigned long) vm->addr; 5152 caller = vm->caller; 5153 nr_pages = vm->nr_pages; 5154 spin_unlock(&vn->busy.lock); 5155 5156 pr_cont(" %u-page vmalloc region starting at %#lx allocated at %pS\n", 5157 nr_pages, addr, caller); 5158 5159 return true; 5160 } 5161 #endif 5162 5163 #ifdef CONFIG_PROC_FS 5164 5165 /* 5166 * Print number of pages allocated on each memory node. 5167 * 5168 * This function can only be called if CONFIG_NUMA is enabled 5169 * and VM_UNINITIALIZED bit in v->flags is disabled. 5170 */ 5171 static void show_numa_info(struct seq_file *m, struct vm_struct *v, 5172 unsigned int *counters) 5173 { 5174 unsigned int nr; 5175 unsigned int step = 1U << vm_area_page_order(v); 5176 5177 if (!counters) 5178 return; 5179 5180 memset(counters, 0, nr_node_ids * sizeof(unsigned int)); 5181 5182 for (nr = 0; nr < v->nr_pages; nr += step) 5183 counters[page_to_nid(v->pages[nr])] += step; 5184 for_each_node_state(nr, N_HIGH_MEMORY) 5185 if (counters[nr]) 5186 seq_printf(m, " N%u=%u", nr, counters[nr]); 5187 } 5188 5189 static void show_purge_info(struct seq_file *m) 5190 { 5191 struct vmap_node *vn; 5192 struct vmap_area *va; 5193 5194 for_each_vmap_node(vn) { 5195 spin_lock(&vn->lazy.lock); 5196 list_for_each_entry(va, &vn->lazy.head, list) { 5197 seq_printf(m, "0x%pK-0x%pK %7ld unpurged vm_area\n", 5198 (void *)va->va_start, (void *)va->va_end, 5199 va_size(va)); 5200 } 5201 spin_unlock(&vn->lazy.lock); 5202 } 5203 } 5204 5205 static int vmalloc_info_show(struct seq_file *m, void *p) 5206 { 5207 struct vmap_node *vn; 5208 struct vmap_area *va; 5209 struct vm_struct *v; 5210 unsigned int *counters; 5211 5212 if (IS_ENABLED(CONFIG_NUMA)) 5213 counters = kmalloc_array(nr_node_ids, sizeof(unsigned int), GFP_KERNEL); 5214 5215 for_each_vmap_node(vn) { 5216 spin_lock(&vn->busy.lock); 5217 list_for_each_entry(va, &vn->busy.head, list) { 5218 if (!va->vm) { 5219 if (va->flags & VMAP_RAM) 5220 seq_printf(m, "0x%pK-0x%pK %7ld vm_map_ram\n", 5221 (void *)va->va_start, (void *)va->va_end, 5222 va_size(va)); 5223 5224 continue; 5225 } 5226 5227 v = va->vm; 5228 if (v->flags & VM_UNINITIALIZED) 5229 continue; 5230 5231 /* Pair with smp_wmb() in clear_vm_uninitialized_flag() */ 5232 smp_rmb(); 5233 5234 seq_printf(m, "0x%pK-0x%pK %7ld", 5235 v->addr, v->addr + v->size, v->size); 5236 5237 if (v->caller) 5238 seq_printf(m, " %pS", v->caller); 5239 5240 if (v->nr_pages) 5241 seq_printf(m, " pages=%d", v->nr_pages); 5242 5243 if (v->phys_addr) 5244 seq_printf(m, " phys=%pa", &v->phys_addr); 5245 5246 if (v->flags & VM_IOREMAP) 5247 seq_puts(m, " ioremap"); 5248 5249 if (v->flags & VM_SPARSE) 5250 seq_puts(m, " sparse"); 5251 5252 if (v->flags & VM_ALLOC) 5253 seq_puts(m, " vmalloc"); 5254 5255 if (v->flags & VM_MAP) 5256 seq_puts(m, " vmap"); 
5257 5258 if (v->flags & VM_USERMAP) 5259 seq_puts(m, " user"); 5260 5261 if (v->flags & VM_DMA_COHERENT) 5262 seq_puts(m, " dma-coherent"); 5263 5264 if (is_vmalloc_addr(v->pages)) 5265 seq_puts(m, " vpages"); 5266 5267 if (IS_ENABLED(CONFIG_NUMA)) 5268 show_numa_info(m, v, counters); 5269 5270 seq_putc(m, '\n'); 5271 } 5272 spin_unlock(&vn->busy.lock); 5273 } 5274 5275 /* 5276 * As a final step, dump "unpurged" areas. 5277 */ 5278 show_purge_info(m); 5279 if (IS_ENABLED(CONFIG_NUMA)) 5280 kfree(counters); 5281 return 0; 5282 } 5283 5284 static int __init proc_vmalloc_init(void) 5285 { 5286 proc_create_single("vmallocinfo", 0400, NULL, vmalloc_info_show); 5287 return 0; 5288 } 5289 module_init(proc_vmalloc_init); 5290 5291 #endif 5292 5293 static void __init vmap_init_free_space(void) 5294 { 5295 unsigned long vmap_start = 1; 5296 const unsigned long vmap_end = ULONG_MAX; 5297 struct vmap_area *free; 5298 struct vm_struct *busy; 5299 5300 /* 5301 * B F B B B F 5302 * -|-----|.....|-----|-----|-----|.....|- 5303 * | The KVA space | 5304 * |<--------------------------------->| 5305 */ 5306 for (busy = vmlist; busy; busy = busy->next) { 5307 if ((unsigned long) busy->addr - vmap_start > 0) { 5308 free = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT); 5309 if (!WARN_ON_ONCE(!free)) { 5310 free->va_start = vmap_start; 5311 free->va_end = (unsigned long) busy->addr; 5312 5313 insert_vmap_area_augment(free, NULL, 5314 &free_vmap_area_root, 5315 &free_vmap_area_list); 5316 } 5317 } 5318 5319 vmap_start = (unsigned long) busy->addr + busy->size; 5320 } 5321 5322 if (vmap_end - vmap_start > 0) { 5323 free = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT); 5324 if (!WARN_ON_ONCE(!free)) { 5325 free->va_start = vmap_start; 5326 free->va_end = vmap_end; 5327 5328 insert_vmap_area_augment(free, NULL, 5329 &free_vmap_area_root, 5330 &free_vmap_area_list); 5331 } 5332 } 5333 } 5334 5335 static void vmap_init_nodes(void) 5336 { 5337 struct vmap_node *vn; 5338 int i; 5339 5340 #if BITS_PER_LONG == 64 5341 /* 5342 * A high threshold of max nodes is fixed and bound to 128, 5343 * thus a scale factor is 1 for systems where number of cores 5344 * are less or equal to specified threshold. 5345 * 5346 * As for NUMA-aware notes. For bigger systems, for example 5347 * NUMA with multi-sockets, where we can end-up with thousands 5348 * of cores in total, a "sub-numa-clustering" should be added. 5349 * 5350 * In this case a NUMA domain is considered as a single entity 5351 * with dedicated sub-nodes in it which describe one group or 5352 * set of cores. Therefore a per-domain purging is supposed to 5353 * be added as well as a per-domain balancing. 5354 */ 5355 int n = clamp_t(unsigned int, num_possible_cpus(), 1, 128); 5356 5357 if (n > 1) { 5358 vn = kmalloc_array(n, sizeof(*vn), GFP_NOWAIT); 5359 if (vn) { 5360 /* Node partition is 16 pages. */ 5361 vmap_zone_size = (1 << 4) * PAGE_SIZE; 5362 nr_vmap_nodes = n; 5363 vmap_nodes = vn; 5364 } else { 5365 pr_err("Failed to allocate an array. 
Disable a node layer\n"); 5366 } 5367 } 5368 #endif 5369 5370 for_each_vmap_node(vn) { 5371 vn->busy.root = RB_ROOT; 5372 INIT_LIST_HEAD(&vn->busy.head); 5373 spin_lock_init(&vn->busy.lock); 5374 5375 vn->lazy.root = RB_ROOT; 5376 INIT_LIST_HEAD(&vn->lazy.head); 5377 spin_lock_init(&vn->lazy.lock); 5378 5379 for (i = 0; i < MAX_VA_SIZE_PAGES; i++) { 5380 INIT_LIST_HEAD(&vn->pool[i].head); 5381 WRITE_ONCE(vn->pool[i].len, 0); 5382 } 5383 5384 spin_lock_init(&vn->pool_lock); 5385 } 5386 } 5387 5388 static unsigned long 5389 vmap_node_shrink_count(struct shrinker *shrink, struct shrink_control *sc) 5390 { 5391 unsigned long count = 0; 5392 struct vmap_node *vn; 5393 int i; 5394 5395 for_each_vmap_node(vn) { 5396 for (i = 0; i < MAX_VA_SIZE_PAGES; i++) 5397 count += READ_ONCE(vn->pool[i].len); 5398 } 5399 5400 return count ? count : SHRINK_EMPTY; 5401 } 5402 5403 static unsigned long 5404 vmap_node_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) 5405 { 5406 struct vmap_node *vn; 5407 5408 for_each_vmap_node(vn) 5409 decay_va_pool_node(vn, true); 5410 5411 return SHRINK_STOP; 5412 } 5413 5414 void __init vmalloc_init(void) 5415 { 5416 struct shrinker *vmap_node_shrinker; 5417 struct vmap_area *va; 5418 struct vmap_node *vn; 5419 struct vm_struct *tmp; 5420 int i; 5421 5422 /* 5423 * Create the cache for vmap_area objects. 5424 */ 5425 vmap_area_cachep = KMEM_CACHE(vmap_area, SLAB_PANIC); 5426 5427 for_each_possible_cpu(i) { 5428 struct vmap_block_queue *vbq; 5429 struct vfree_deferred *p; 5430 5431 vbq = &per_cpu(vmap_block_queue, i); 5432 spin_lock_init(&vbq->lock); 5433 INIT_LIST_HEAD(&vbq->free); 5434 p = &per_cpu(vfree_deferred, i); 5435 init_llist_head(&p->list); 5436 INIT_WORK(&p->wq, delayed_vfree_work); 5437 xa_init(&vbq->vmap_blocks); 5438 } 5439 5440 /* 5441 * Setup nodes before importing vmlist. 5442 */ 5443 vmap_init_nodes(); 5444 5445 /* Import existing vmlist entries. */ 5446 for (tmp = vmlist; tmp; tmp = tmp->next) { 5447 va = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT); 5448 if (WARN_ON_ONCE(!va)) 5449 continue; 5450 5451 va->va_start = (unsigned long)tmp->addr; 5452 va->va_end = va->va_start + tmp->size; 5453 va->vm = tmp; 5454 5455 vn = addr_to_node(va->va_start); 5456 insert_vmap_area(va, &vn->busy.root, &vn->busy.head); 5457 } 5458 5459 /* 5460 * Now we can initialize a free vmap space. 5461 */ 5462 vmap_init_free_space(); 5463 vmap_initialized = true; 5464 5465 vmap_node_shrinker = shrinker_alloc(0, "vmap-node"); 5466 if (!vmap_node_shrinker) { 5467 pr_err("Failed to allocate vmap-node shrinker!\n"); 5468 return; 5469 } 5470 5471 vmap_node_shrinker->count_objects = vmap_node_shrink_count; 5472 vmap_node_shrinker->scan_objects = vmap_node_shrink_scan; 5473 shrinker_register(vmap_node_shrinker); 5474 } 5475