1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * Copyright (C) 1993 Linus Torvalds 4 * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999 5 * SMP-safe vmalloc/vfree/ioremap, Tigran Aivazian <tigran@veritas.com>, May 2000 6 * Major rework to support vmap/vunmap, Christoph Hellwig, SGI, August 2002 7 * Numa awareness, Christoph Lameter, SGI, June 2005 8 * Improving global KVA allocator, Uladzislau Rezki, Sony, May 2019 9 */ 10 11 #include <linux/vmalloc.h> 12 #include <linux/mm.h> 13 #include <linux/module.h> 14 #include <linux/highmem.h> 15 #include <linux/sched/signal.h> 16 #include <linux/slab.h> 17 #include <linux/spinlock.h> 18 #include <linux/interrupt.h> 19 #include <linux/proc_fs.h> 20 #include <linux/seq_file.h> 21 #include <linux/set_memory.h> 22 #include <linux/debugobjects.h> 23 #include <linux/kallsyms.h> 24 #include <linux/list.h> 25 #include <linux/notifier.h> 26 #include <linux/rbtree.h> 27 #include <linux/xarray.h> 28 #include <linux/io.h> 29 #include <linux/rcupdate.h> 30 #include <linux/pfn.h> 31 #include <linux/kmemleak.h> 32 #include <linux/atomic.h> 33 #include <linux/compiler.h> 34 #include <linux/memcontrol.h> 35 #include <linux/llist.h> 36 #include <linux/bitops.h> 37 #include <linux/rbtree_augmented.h> 38 #include <linux/overflow.h> 39 #include <linux/pgtable.h> 40 #include <linux/uaccess.h> 41 #include <linux/hugetlb.h> 42 #include <linux/sched/mm.h> 43 #include <asm/tlbflush.h> 44 #include <asm/shmparam.h> 45 46 #define CREATE_TRACE_POINTS 47 #include <trace/events/vmalloc.h> 48 49 #include "internal.h" 50 #include "pgalloc-track.h" 51 52 #ifdef CONFIG_HAVE_ARCH_HUGE_VMAP 53 static unsigned int __ro_after_init ioremap_max_page_shift = BITS_PER_LONG - 1; 54 55 static int __init set_nohugeiomap(char *str) 56 { 57 ioremap_max_page_shift = PAGE_SHIFT; 58 return 0; 59 } 60 early_param("nohugeiomap", set_nohugeiomap); 61 #else /* CONFIG_HAVE_ARCH_HUGE_VMAP */ 62 static const unsigned int ioremap_max_page_shift = PAGE_SHIFT; 63 #endif /* CONFIG_HAVE_ARCH_HUGE_VMAP */ 64 65 #ifdef CONFIG_HAVE_ARCH_HUGE_VMALLOC 66 static bool __ro_after_init vmap_allow_huge = true; 67 68 static int __init set_nohugevmalloc(char *str) 69 { 70 vmap_allow_huge = false; 71 return 0; 72 } 73 early_param("nohugevmalloc", set_nohugevmalloc); 74 #else /* CONFIG_HAVE_ARCH_HUGE_VMALLOC */ 75 static const bool vmap_allow_huge = false; 76 #endif /* CONFIG_HAVE_ARCH_HUGE_VMALLOC */ 77 78 bool is_vmalloc_addr(const void *x) 79 { 80 unsigned long addr = (unsigned long)kasan_reset_tag(x); 81 82 return addr >= VMALLOC_START && addr < VMALLOC_END; 83 } 84 EXPORT_SYMBOL(is_vmalloc_addr); 85 86 struct vfree_deferred { 87 struct llist_head list; 88 struct work_struct wq; 89 }; 90 static DEFINE_PER_CPU(struct vfree_deferred, vfree_deferred); 91 92 static void __vunmap(const void *, int); 93 94 static void free_work(struct work_struct *w) 95 { 96 struct vfree_deferred *p = container_of(w, struct vfree_deferred, wq); 97 struct llist_node *t, *llnode; 98 99 llist_for_each_safe(llnode, t, llist_del_all(&p->list)) 100 __vunmap((void *)llnode, 1); 101 } 102 103 /*** Page table manipulation functions ***/ 104 static int vmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, 105 phys_addr_t phys_addr, pgprot_t prot, 106 unsigned int max_page_shift, pgtbl_mod_mask *mask) 107 { 108 pte_t *pte; 109 u64 pfn; 110 unsigned long size = PAGE_SIZE; 111 112 pfn = phys_addr >> PAGE_SHIFT; 113 pte = pte_alloc_kernel_track(pmd, addr, mask); 114 if (!pte) 115 return -ENOMEM; 116 do { 117 
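		/*
		 * One mapping is installed per iteration. With
		 * CONFIG_HUGETLB_PAGE the architecture may report a
		 * contiguous-PTE size larger than PAGE_SIZE below, in which
		 * case a single huge PTE covers "size" bytes and the pfn and
		 * address advance by that amount instead of one page.
		 */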
BUG_ON(!pte_none(*pte)); 118 119 #ifdef CONFIG_HUGETLB_PAGE 120 size = arch_vmap_pte_range_map_size(addr, end, pfn, max_page_shift); 121 if (size != PAGE_SIZE) { 122 pte_t entry = pfn_pte(pfn, prot); 123 124 entry = arch_make_huge_pte(entry, ilog2(size), 0); 125 set_huge_pte_at(&init_mm, addr, pte, entry); 126 pfn += PFN_DOWN(size); 127 continue; 128 } 129 #endif 130 set_pte_at(&init_mm, addr, pte, pfn_pte(pfn, prot)); 131 pfn++; 132 } while (pte += PFN_DOWN(size), addr += size, addr != end); 133 *mask |= PGTBL_PTE_MODIFIED; 134 return 0; 135 } 136 137 static int vmap_try_huge_pmd(pmd_t *pmd, unsigned long addr, unsigned long end, 138 phys_addr_t phys_addr, pgprot_t prot, 139 unsigned int max_page_shift) 140 { 141 if (max_page_shift < PMD_SHIFT) 142 return 0; 143 144 if (!arch_vmap_pmd_supported(prot)) 145 return 0; 146 147 if ((end - addr) != PMD_SIZE) 148 return 0; 149 150 if (!IS_ALIGNED(addr, PMD_SIZE)) 151 return 0; 152 153 if (!IS_ALIGNED(phys_addr, PMD_SIZE)) 154 return 0; 155 156 if (pmd_present(*pmd) && !pmd_free_pte_page(pmd, addr)) 157 return 0; 158 159 return pmd_set_huge(pmd, phys_addr, prot); 160 } 161 162 static int vmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end, 163 phys_addr_t phys_addr, pgprot_t prot, 164 unsigned int max_page_shift, pgtbl_mod_mask *mask) 165 { 166 pmd_t *pmd; 167 unsigned long next; 168 169 pmd = pmd_alloc_track(&init_mm, pud, addr, mask); 170 if (!pmd) 171 return -ENOMEM; 172 do { 173 next = pmd_addr_end(addr, end); 174 175 if (vmap_try_huge_pmd(pmd, addr, next, phys_addr, prot, 176 max_page_shift)) { 177 *mask |= PGTBL_PMD_MODIFIED; 178 continue; 179 } 180 181 if (vmap_pte_range(pmd, addr, next, phys_addr, prot, max_page_shift, mask)) 182 return -ENOMEM; 183 } while (pmd++, phys_addr += (next - addr), addr = next, addr != end); 184 return 0; 185 } 186 187 static int vmap_try_huge_pud(pud_t *pud, unsigned long addr, unsigned long end, 188 phys_addr_t phys_addr, pgprot_t prot, 189 unsigned int max_page_shift) 190 { 191 if (max_page_shift < PUD_SHIFT) 192 return 0; 193 194 if (!arch_vmap_pud_supported(prot)) 195 return 0; 196 197 if ((end - addr) != PUD_SIZE) 198 return 0; 199 200 if (!IS_ALIGNED(addr, PUD_SIZE)) 201 return 0; 202 203 if (!IS_ALIGNED(phys_addr, PUD_SIZE)) 204 return 0; 205 206 if (pud_present(*pud) && !pud_free_pmd_page(pud, addr)) 207 return 0; 208 209 return pud_set_huge(pud, phys_addr, prot); 210 } 211 212 static int vmap_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end, 213 phys_addr_t phys_addr, pgprot_t prot, 214 unsigned int max_page_shift, pgtbl_mod_mask *mask) 215 { 216 pud_t *pud; 217 unsigned long next; 218 219 pud = pud_alloc_track(&init_mm, p4d, addr, mask); 220 if (!pud) 221 return -ENOMEM; 222 do { 223 next = pud_addr_end(addr, end); 224 225 if (vmap_try_huge_pud(pud, addr, next, phys_addr, prot, 226 max_page_shift)) { 227 *mask |= PGTBL_PUD_MODIFIED; 228 continue; 229 } 230 231 if (vmap_pmd_range(pud, addr, next, phys_addr, prot, 232 max_page_shift, mask)) 233 return -ENOMEM; 234 } while (pud++, phys_addr += (next - addr), addr = next, addr != end); 235 return 0; 236 } 237 238 static int vmap_try_huge_p4d(p4d_t *p4d, unsigned long addr, unsigned long end, 239 phys_addr_t phys_addr, pgprot_t prot, 240 unsigned int max_page_shift) 241 { 242 if (max_page_shift < P4D_SHIFT) 243 return 0; 244 245 if (!arch_vmap_p4d_supported(prot)) 246 return 0; 247 248 if ((end - addr) != P4D_SIZE) 249 return 0; 250 251 if (!IS_ALIGNED(addr, P4D_SIZE)) 252 return 0; 253 254 if (!IS_ALIGNED(phys_addr, P4D_SIZE)) 255 
return 0; 256 257 if (p4d_present(*p4d) && !p4d_free_pud_page(p4d, addr)) 258 return 0; 259 260 return p4d_set_huge(p4d, phys_addr, prot); 261 } 262 263 static int vmap_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end, 264 phys_addr_t phys_addr, pgprot_t prot, 265 unsigned int max_page_shift, pgtbl_mod_mask *mask) 266 { 267 p4d_t *p4d; 268 unsigned long next; 269 270 p4d = p4d_alloc_track(&init_mm, pgd, addr, mask); 271 if (!p4d) 272 return -ENOMEM; 273 do { 274 next = p4d_addr_end(addr, end); 275 276 if (vmap_try_huge_p4d(p4d, addr, next, phys_addr, prot, 277 max_page_shift)) { 278 *mask |= PGTBL_P4D_MODIFIED; 279 continue; 280 } 281 282 if (vmap_pud_range(p4d, addr, next, phys_addr, prot, 283 max_page_shift, mask)) 284 return -ENOMEM; 285 } while (p4d++, phys_addr += (next - addr), addr = next, addr != end); 286 return 0; 287 } 288 289 static int vmap_range_noflush(unsigned long addr, unsigned long end, 290 phys_addr_t phys_addr, pgprot_t prot, 291 unsigned int max_page_shift) 292 { 293 pgd_t *pgd; 294 unsigned long start; 295 unsigned long next; 296 int err; 297 pgtbl_mod_mask mask = 0; 298 299 might_sleep(); 300 BUG_ON(addr >= end); 301 302 start = addr; 303 pgd = pgd_offset_k(addr); 304 do { 305 next = pgd_addr_end(addr, end); 306 err = vmap_p4d_range(pgd, addr, next, phys_addr, prot, 307 max_page_shift, &mask); 308 if (err) 309 break; 310 } while (pgd++, phys_addr += (next - addr), addr = next, addr != end); 311 312 if (mask & ARCH_PAGE_TABLE_SYNC_MASK) 313 arch_sync_kernel_mappings(start, end); 314 315 return err; 316 } 317 318 int ioremap_page_range(unsigned long addr, unsigned long end, 319 phys_addr_t phys_addr, pgprot_t prot) 320 { 321 int err; 322 323 err = vmap_range_noflush(addr, end, phys_addr, pgprot_nx(prot), 324 ioremap_max_page_shift); 325 flush_cache_vmap(addr, end); 326 if (!err) 327 kmsan_ioremap_page_range(addr, end, phys_addr, prot, 328 ioremap_max_page_shift); 329 return err; 330 } 331 332 static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, 333 pgtbl_mod_mask *mask) 334 { 335 pte_t *pte; 336 337 pte = pte_offset_kernel(pmd, addr); 338 do { 339 pte_t ptent = ptep_get_and_clear(&init_mm, addr, pte); 340 WARN_ON(!pte_none(ptent) && !pte_present(ptent)); 341 } while (pte++, addr += PAGE_SIZE, addr != end); 342 *mask |= PGTBL_PTE_MODIFIED; 343 } 344 345 static void vunmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end, 346 pgtbl_mod_mask *mask) 347 { 348 pmd_t *pmd; 349 unsigned long next; 350 int cleared; 351 352 pmd = pmd_offset(pud, addr); 353 do { 354 next = pmd_addr_end(addr, end); 355 356 cleared = pmd_clear_huge(pmd); 357 if (cleared || pmd_bad(*pmd)) 358 *mask |= PGTBL_PMD_MODIFIED; 359 360 if (cleared) 361 continue; 362 if (pmd_none_or_clear_bad(pmd)) 363 continue; 364 vunmap_pte_range(pmd, addr, next, mask); 365 366 cond_resched(); 367 } while (pmd++, addr = next, addr != end); 368 } 369 370 static void vunmap_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end, 371 pgtbl_mod_mask *mask) 372 { 373 pud_t *pud; 374 unsigned long next; 375 int cleared; 376 377 pud = pud_offset(p4d, addr); 378 do { 379 next = pud_addr_end(addr, end); 380 381 cleared = pud_clear_huge(pud); 382 if (cleared || pud_bad(*pud)) 383 *mask |= PGTBL_PUD_MODIFIED; 384 385 if (cleared) 386 continue; 387 if (pud_none_or_clear_bad(pud)) 388 continue; 389 vunmap_pmd_range(pud, addr, next, mask); 390 } while (pud++, addr = next, addr != end); 391 } 392 393 static void vunmap_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end, 
394 pgtbl_mod_mask *mask) 395 { 396 p4d_t *p4d; 397 unsigned long next; 398 399 p4d = p4d_offset(pgd, addr); 400 do { 401 next = p4d_addr_end(addr, end); 402 403 p4d_clear_huge(p4d); 404 if (p4d_bad(*p4d)) 405 *mask |= PGTBL_P4D_MODIFIED; 406 407 if (p4d_none_or_clear_bad(p4d)) 408 continue; 409 vunmap_pud_range(p4d, addr, next, mask); 410 } while (p4d++, addr = next, addr != end); 411 } 412 413 /* 414 * vunmap_range_noflush is similar to vunmap_range, but does not 415 * flush caches or TLBs. 416 * 417 * The caller is responsible for calling flush_cache_vmap() before calling 418 * this function, and flush_tlb_kernel_range after it has returned 419 * successfully (and before the addresses are expected to cause a page fault 420 * or be re-mapped for something else, if TLB flushes are being delayed or 421 * coalesced). 422 * 423 * This is an internal function only. Do not use outside mm/. 424 */ 425 void __vunmap_range_noflush(unsigned long start, unsigned long end) 426 { 427 unsigned long next; 428 pgd_t *pgd; 429 unsigned long addr = start; 430 pgtbl_mod_mask mask = 0; 431 432 BUG_ON(addr >= end); 433 pgd = pgd_offset_k(addr); 434 do { 435 next = pgd_addr_end(addr, end); 436 if (pgd_bad(*pgd)) 437 mask |= PGTBL_PGD_MODIFIED; 438 if (pgd_none_or_clear_bad(pgd)) 439 continue; 440 vunmap_p4d_range(pgd, addr, next, &mask); 441 } while (pgd++, addr = next, addr != end); 442 443 if (mask & ARCH_PAGE_TABLE_SYNC_MASK) 444 arch_sync_kernel_mappings(start, end); 445 } 446 447 void vunmap_range_noflush(unsigned long start, unsigned long end) 448 { 449 kmsan_vunmap_range_noflush(start, end); 450 __vunmap_range_noflush(start, end); 451 } 452 453 /** 454 * vunmap_range - unmap kernel virtual addresses 455 * @addr: start of the VM area to unmap 456 * @end: end of the VM area to unmap (non-inclusive) 457 * 458 * Clears any present PTEs in the virtual address range, flushes TLBs and 459 * caches. Any subsequent access to the address before it has been re-mapped 460 * is a kernel bug. 461 */ 462 void vunmap_range(unsigned long addr, unsigned long end) 463 { 464 flush_cache_vunmap(addr, end); 465 vunmap_range_noflush(addr, end); 466 flush_tlb_kernel_range(addr, end); 467 } 468 469 static int vmap_pages_pte_range(pmd_t *pmd, unsigned long addr, 470 unsigned long end, pgprot_t prot, struct page **pages, int *nr, 471 pgtbl_mod_mask *mask) 472 { 473 pte_t *pte; 474 475 /* 476 * nr is a running index into the array which helps higher level 477 * callers keep track of where we're up to. 
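	 *
	 * Each pages[*nr] entry is mapped with exactly one PTE here, so on
	 * return *nr equals the number of pages consumed so far and the
	 * pmd/pud/p4d callers can resume from the correct array slot.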
478 */ 479 480 pte = pte_alloc_kernel_track(pmd, addr, mask); 481 if (!pte) 482 return -ENOMEM; 483 do { 484 struct page *page = pages[*nr]; 485 486 if (WARN_ON(!pte_none(*pte))) 487 return -EBUSY; 488 if (WARN_ON(!page)) 489 return -ENOMEM; 490 if (WARN_ON(!pfn_valid(page_to_pfn(page)))) 491 return -EINVAL; 492 493 set_pte_at(&init_mm, addr, pte, mk_pte(page, prot)); 494 (*nr)++; 495 } while (pte++, addr += PAGE_SIZE, addr != end); 496 *mask |= PGTBL_PTE_MODIFIED; 497 return 0; 498 } 499 500 static int vmap_pages_pmd_range(pud_t *pud, unsigned long addr, 501 unsigned long end, pgprot_t prot, struct page **pages, int *nr, 502 pgtbl_mod_mask *mask) 503 { 504 pmd_t *pmd; 505 unsigned long next; 506 507 pmd = pmd_alloc_track(&init_mm, pud, addr, mask); 508 if (!pmd) 509 return -ENOMEM; 510 do { 511 next = pmd_addr_end(addr, end); 512 if (vmap_pages_pte_range(pmd, addr, next, prot, pages, nr, mask)) 513 return -ENOMEM; 514 } while (pmd++, addr = next, addr != end); 515 return 0; 516 } 517 518 static int vmap_pages_pud_range(p4d_t *p4d, unsigned long addr, 519 unsigned long end, pgprot_t prot, struct page **pages, int *nr, 520 pgtbl_mod_mask *mask) 521 { 522 pud_t *pud; 523 unsigned long next; 524 525 pud = pud_alloc_track(&init_mm, p4d, addr, mask); 526 if (!pud) 527 return -ENOMEM; 528 do { 529 next = pud_addr_end(addr, end); 530 if (vmap_pages_pmd_range(pud, addr, next, prot, pages, nr, mask)) 531 return -ENOMEM; 532 } while (pud++, addr = next, addr != end); 533 return 0; 534 } 535 536 static int vmap_pages_p4d_range(pgd_t *pgd, unsigned long addr, 537 unsigned long end, pgprot_t prot, struct page **pages, int *nr, 538 pgtbl_mod_mask *mask) 539 { 540 p4d_t *p4d; 541 unsigned long next; 542 543 p4d = p4d_alloc_track(&init_mm, pgd, addr, mask); 544 if (!p4d) 545 return -ENOMEM; 546 do { 547 next = p4d_addr_end(addr, end); 548 if (vmap_pages_pud_range(p4d, addr, next, prot, pages, nr, mask)) 549 return -ENOMEM; 550 } while (p4d++, addr = next, addr != end); 551 return 0; 552 } 553 554 static int vmap_small_pages_range_noflush(unsigned long addr, unsigned long end, 555 pgprot_t prot, struct page **pages) 556 { 557 unsigned long start = addr; 558 pgd_t *pgd; 559 unsigned long next; 560 int err = 0; 561 int nr = 0; 562 pgtbl_mod_mask mask = 0; 563 564 BUG_ON(addr >= end); 565 pgd = pgd_offset_k(addr); 566 do { 567 next = pgd_addr_end(addr, end); 568 if (pgd_bad(*pgd)) 569 mask |= PGTBL_PGD_MODIFIED; 570 err = vmap_pages_p4d_range(pgd, addr, next, prot, pages, &nr, &mask); 571 if (err) 572 return err; 573 } while (pgd++, addr = next, addr != end); 574 575 if (mask & ARCH_PAGE_TABLE_SYNC_MASK) 576 arch_sync_kernel_mappings(start, end); 577 578 return 0; 579 } 580 581 /* 582 * vmap_pages_range_noflush is similar to vmap_pages_range, but does not 583 * flush caches. 584 * 585 * The caller is responsible for calling flush_cache_vmap() after this 586 * function returns successfully and before the addresses are accessed. 587 * 588 * This is an internal function only. Do not use outside mm/. 
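 *
 * For huge mappings (page_shift > PAGE_SHIFT) only every
 * (1 << (page_shift - PAGE_SHIFT))-th entry of @pages is looked at: each one
 * is taken as the head of a physically contiguous, suitably aligned group
 * and mapped in one call to vmap_range_noflush(). With 4KB base pages and a
 * 2MB page_shift, for instance, pages[0], pages[512], pages[1024], ... are
 * used.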
589 */ 590 int __vmap_pages_range_noflush(unsigned long addr, unsigned long end, 591 pgprot_t prot, struct page **pages, unsigned int page_shift) 592 { 593 unsigned int i, nr = (end - addr) >> PAGE_SHIFT; 594 595 WARN_ON(page_shift < PAGE_SHIFT); 596 597 if (!IS_ENABLED(CONFIG_HAVE_ARCH_HUGE_VMALLOC) || 598 page_shift == PAGE_SHIFT) 599 return vmap_small_pages_range_noflush(addr, end, prot, pages); 600 601 for (i = 0; i < nr; i += 1U << (page_shift - PAGE_SHIFT)) { 602 int err; 603 604 err = vmap_range_noflush(addr, addr + (1UL << page_shift), 605 page_to_phys(pages[i]), prot, 606 page_shift); 607 if (err) 608 return err; 609 610 addr += 1UL << page_shift; 611 } 612 613 return 0; 614 } 615 616 int vmap_pages_range_noflush(unsigned long addr, unsigned long end, 617 pgprot_t prot, struct page **pages, unsigned int page_shift) 618 { 619 kmsan_vmap_pages_range_noflush(addr, end, prot, pages, page_shift); 620 return __vmap_pages_range_noflush(addr, end, prot, pages, page_shift); 621 } 622 623 /** 624 * vmap_pages_range - map pages to a kernel virtual address 625 * @addr: start of the VM area to map 626 * @end: end of the VM area to map (non-inclusive) 627 * @prot: page protection flags to use 628 * @pages: pages to map (always PAGE_SIZE pages) 629 * @page_shift: maximum shift that the pages may be mapped with, @pages must 630 * be aligned and contiguous up to at least this shift. 631 * 632 * RETURNS: 633 * 0 on success, -errno on failure. 634 */ 635 static int vmap_pages_range(unsigned long addr, unsigned long end, 636 pgprot_t prot, struct page **pages, unsigned int page_shift) 637 { 638 int err; 639 640 err = vmap_pages_range_noflush(addr, end, prot, pages, page_shift); 641 flush_cache_vmap(addr, end); 642 return err; 643 } 644 645 int is_vmalloc_or_module_addr(const void *x) 646 { 647 /* 648 * ARM, x86-64 and sparc64 put modules in a special place, 649 * and fall back on vmalloc() if that fails. Others 650 * just put it in the vmalloc space. 651 */ 652 #if defined(CONFIG_MODULES) && defined(MODULES_VADDR) 653 unsigned long addr = (unsigned long)kasan_reset_tag(x); 654 if (addr >= MODULES_VADDR && addr < MODULES_END) 655 return 1; 656 #endif 657 return is_vmalloc_addr(x); 658 } 659 EXPORT_SYMBOL_GPL(is_vmalloc_or_module_addr); 660 661 /* 662 * Walk a vmap address to the struct page it maps. Huge vmap mappings will 663 * return the tail page that corresponds to the base page address, which 664 * matches small vmap mappings. 
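 *
 * For example, an address one small page into a 2MB PMD-level mapping
 * resolves to pmd_page() + 1, the same struct page a 4KB mapping of that
 * address would have used.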
665 */ 666 struct page *vmalloc_to_page(const void *vmalloc_addr) 667 { 668 unsigned long addr = (unsigned long) vmalloc_addr; 669 struct page *page = NULL; 670 pgd_t *pgd = pgd_offset_k(addr); 671 p4d_t *p4d; 672 pud_t *pud; 673 pmd_t *pmd; 674 pte_t *ptep, pte; 675 676 /* 677 * XXX we might need to change this if we add VIRTUAL_BUG_ON for 678 * architectures that do not vmalloc module space 679 */ 680 VIRTUAL_BUG_ON(!is_vmalloc_or_module_addr(vmalloc_addr)); 681 682 if (pgd_none(*pgd)) 683 return NULL; 684 if (WARN_ON_ONCE(pgd_leaf(*pgd))) 685 return NULL; /* XXX: no allowance for huge pgd */ 686 if (WARN_ON_ONCE(pgd_bad(*pgd))) 687 return NULL; 688 689 p4d = p4d_offset(pgd, addr); 690 if (p4d_none(*p4d)) 691 return NULL; 692 if (p4d_leaf(*p4d)) 693 return p4d_page(*p4d) + ((addr & ~P4D_MASK) >> PAGE_SHIFT); 694 if (WARN_ON_ONCE(p4d_bad(*p4d))) 695 return NULL; 696 697 pud = pud_offset(p4d, addr); 698 if (pud_none(*pud)) 699 return NULL; 700 if (pud_leaf(*pud)) 701 return pud_page(*pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT); 702 if (WARN_ON_ONCE(pud_bad(*pud))) 703 return NULL; 704 705 pmd = pmd_offset(pud, addr); 706 if (pmd_none(*pmd)) 707 return NULL; 708 if (pmd_leaf(*pmd)) 709 return pmd_page(*pmd) + ((addr & ~PMD_MASK) >> PAGE_SHIFT); 710 if (WARN_ON_ONCE(pmd_bad(*pmd))) 711 return NULL; 712 713 ptep = pte_offset_map(pmd, addr); 714 pte = *ptep; 715 if (pte_present(pte)) 716 page = pte_page(pte); 717 pte_unmap(ptep); 718 719 return page; 720 } 721 EXPORT_SYMBOL(vmalloc_to_page); 722 723 /* 724 * Map a vmalloc()-space virtual address to the physical page frame number. 725 */ 726 unsigned long vmalloc_to_pfn(const void *vmalloc_addr) 727 { 728 return page_to_pfn(vmalloc_to_page(vmalloc_addr)); 729 } 730 EXPORT_SYMBOL(vmalloc_to_pfn); 731 732 733 /*** Global kva allocator ***/ 734 735 #define DEBUG_AUGMENT_PROPAGATE_CHECK 0 736 #define DEBUG_AUGMENT_LOWEST_MATCH_CHECK 0 737 738 739 static DEFINE_SPINLOCK(vmap_area_lock); 740 static DEFINE_SPINLOCK(free_vmap_area_lock); 741 /* Export for kexec only */ 742 LIST_HEAD(vmap_area_list); 743 static struct rb_root vmap_area_root = RB_ROOT; 744 static bool vmap_initialized __read_mostly; 745 746 static struct rb_root purge_vmap_area_root = RB_ROOT; 747 static LIST_HEAD(purge_vmap_area_list); 748 static DEFINE_SPINLOCK(purge_vmap_area_lock); 749 750 /* 751 * This kmem_cache is used for vmap_area objects. Instead of 752 * allocating from slab we reuse an object from this cache to 753 * make things faster. Especially in "no edge" splitting of 754 * free block. 755 */ 756 static struct kmem_cache *vmap_area_cachep; 757 758 /* 759 * This linked list is used in pair with free_vmap_area_root. 760 * It gives O(1) access to prev/next to perform fast coalescing. 761 */ 762 static LIST_HEAD(free_vmap_area_list); 763 764 /* 765 * This augment red-black tree represents the free vmap space. 766 * All vmap_area objects in this tree are sorted by va->va_start 767 * address. It is used for allocation and merging when a vmap 768 * object is released. 769 * 770 * Each vmap_area node contains a maximum available free block 771 * of its sub-tree, right or left. Therefore it is possible to 772 * find a lowest match of free area. 773 */ 774 static struct rb_root free_vmap_area_root = RB_ROOT; 775 776 /* 777 * Preload a CPU with one object for "no edge" split case. The 778 * aim is to get rid of allocations from the atomic context, thus 779 * to use more permissive allocation masks. 
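 *
 * The preloaded object is consumed by adjust_va_to_fit_type() only when an
 * allocation carves a hole out of the middle of a free area (NE_FIT_TYPE)
 * and a second vmap_area is needed to describe the left-over lower part.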
780 */ 781 static DEFINE_PER_CPU(struct vmap_area *, ne_fit_preload_node); 782 783 static __always_inline unsigned long 784 va_size(struct vmap_area *va) 785 { 786 return (va->va_end - va->va_start); 787 } 788 789 static __always_inline unsigned long 790 get_subtree_max_size(struct rb_node *node) 791 { 792 struct vmap_area *va; 793 794 va = rb_entry_safe(node, struct vmap_area, rb_node); 795 return va ? va->subtree_max_size : 0; 796 } 797 798 RB_DECLARE_CALLBACKS_MAX(static, free_vmap_area_rb_augment_cb, 799 struct vmap_area, rb_node, unsigned long, subtree_max_size, va_size) 800 801 static void purge_vmap_area_lazy(void); 802 static BLOCKING_NOTIFIER_HEAD(vmap_notify_list); 803 static void drain_vmap_area_work(struct work_struct *work); 804 static DECLARE_WORK(drain_vmap_work, drain_vmap_area_work); 805 806 static atomic_long_t nr_vmalloc_pages; 807 808 unsigned long vmalloc_nr_pages(void) 809 { 810 return atomic_long_read(&nr_vmalloc_pages); 811 } 812 813 /* Look up the first VA which satisfies addr < va_end, NULL if none. */ 814 static struct vmap_area *find_vmap_area_exceed_addr(unsigned long addr) 815 { 816 struct vmap_area *va = NULL; 817 struct rb_node *n = vmap_area_root.rb_node; 818 819 addr = (unsigned long)kasan_reset_tag((void *)addr); 820 821 while (n) { 822 struct vmap_area *tmp; 823 824 tmp = rb_entry(n, struct vmap_area, rb_node); 825 if (tmp->va_end > addr) { 826 va = tmp; 827 if (tmp->va_start <= addr) 828 break; 829 830 n = n->rb_left; 831 } else 832 n = n->rb_right; 833 } 834 835 return va; 836 } 837 838 static struct vmap_area *__find_vmap_area(unsigned long addr, struct rb_root *root) 839 { 840 struct rb_node *n = root->rb_node; 841 842 addr = (unsigned long)kasan_reset_tag((void *)addr); 843 844 while (n) { 845 struct vmap_area *va; 846 847 va = rb_entry(n, struct vmap_area, rb_node); 848 if (addr < va->va_start) 849 n = n->rb_left; 850 else if (addr >= va->va_end) 851 n = n->rb_right; 852 else 853 return va; 854 } 855 856 return NULL; 857 } 858 859 /* 860 * This function returns back addresses of parent node 861 * and its left or right link for further processing. 862 * 863 * Otherwise NULL is returned. In that case all further 864 * steps regarding inserting of conflicting overlap range 865 * have to be declined and actually considered as a bug. 866 */ 867 static __always_inline struct rb_node ** 868 find_va_links(struct vmap_area *va, 869 struct rb_root *root, struct rb_node *from, 870 struct rb_node **parent) 871 { 872 struct vmap_area *tmp_va; 873 struct rb_node **link; 874 875 if (root) { 876 link = &root->rb_node; 877 if (unlikely(!*link)) { 878 *parent = NULL; 879 return link; 880 } 881 } else { 882 link = &from; 883 } 884 885 /* 886 * Go to the bottom of the tree. When we hit the last point 887 * we end up with parent rb_node and correct direction, i name 888 * it link, where the new va->rb_node will be attached to. 889 */ 890 do { 891 tmp_va = rb_entry(*link, struct vmap_area, rb_node); 892 893 /* 894 * During the traversal we also do some sanity check. 895 * Trigger the BUG() if there are sides(left/right) 896 * or full overlaps. 
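		 * The overlap case is reported with WARN() and NULL is
		 * returned so the caller can decline the insertion.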
897 */ 898 if (va->va_end <= tmp_va->va_start) 899 link = &(*link)->rb_left; 900 else if (va->va_start >= tmp_va->va_end) 901 link = &(*link)->rb_right; 902 else { 903 WARN(1, "vmalloc bug: 0x%lx-0x%lx overlaps with 0x%lx-0x%lx\n", 904 va->va_start, va->va_end, tmp_va->va_start, tmp_va->va_end); 905 906 return NULL; 907 } 908 } while (*link); 909 910 *parent = &tmp_va->rb_node; 911 return link; 912 } 913 914 static __always_inline struct list_head * 915 get_va_next_sibling(struct rb_node *parent, struct rb_node **link) 916 { 917 struct list_head *list; 918 919 if (unlikely(!parent)) 920 /* 921 * The red-black tree where we try to find VA neighbors 922 * before merging or inserting is empty, i.e. it means 923 * there is no free vmap space. Normally it does not 924 * happen but we handle this case anyway. 925 */ 926 return NULL; 927 928 list = &rb_entry(parent, struct vmap_area, rb_node)->list; 929 return (&parent->rb_right == link ? list->next : list); 930 } 931 932 static __always_inline void 933 __link_va(struct vmap_area *va, struct rb_root *root, 934 struct rb_node *parent, struct rb_node **link, 935 struct list_head *head, bool augment) 936 { 937 /* 938 * VA is still not in the list, but we can 939 * identify its future previous list_head node. 940 */ 941 if (likely(parent)) { 942 head = &rb_entry(parent, struct vmap_area, rb_node)->list; 943 if (&parent->rb_right != link) 944 head = head->prev; 945 } 946 947 /* Insert to the rb-tree */ 948 rb_link_node(&va->rb_node, parent, link); 949 if (augment) { 950 /* 951 * Some explanation here. Just perform simple insertion 952 * to the tree. We do not set va->subtree_max_size to 953 * its current size before calling rb_insert_augmented(). 954 * It is because we populate the tree from the bottom 955 * to parent levels when the node _is_ in the tree. 956 * 957 * Therefore we set subtree_max_size to zero after insertion, 958 * to let __augment_tree_propagate_from() puts everything to 959 * the correct order later on. 960 */ 961 rb_insert_augmented(&va->rb_node, 962 root, &free_vmap_area_rb_augment_cb); 963 va->subtree_max_size = 0; 964 } else { 965 rb_insert_color(&va->rb_node, root); 966 } 967 968 /* Address-sort this list */ 969 list_add(&va->list, head); 970 } 971 972 static __always_inline void 973 link_va(struct vmap_area *va, struct rb_root *root, 974 struct rb_node *parent, struct rb_node **link, 975 struct list_head *head) 976 { 977 __link_va(va, root, parent, link, head, false); 978 } 979 980 static __always_inline void 981 link_va_augment(struct vmap_area *va, struct rb_root *root, 982 struct rb_node *parent, struct rb_node **link, 983 struct list_head *head) 984 { 985 __link_va(va, root, parent, link, head, true); 986 } 987 988 static __always_inline void 989 __unlink_va(struct vmap_area *va, struct rb_root *root, bool augment) 990 { 991 if (WARN_ON(RB_EMPTY_NODE(&va->rb_node))) 992 return; 993 994 if (augment) 995 rb_erase_augmented(&va->rb_node, 996 root, &free_vmap_area_rb_augment_cb); 997 else 998 rb_erase(&va->rb_node, root); 999 1000 list_del_init(&va->list); 1001 RB_CLEAR_NODE(&va->rb_node); 1002 } 1003 1004 static __always_inline void 1005 unlink_va(struct vmap_area *va, struct rb_root *root) 1006 { 1007 __unlink_va(va, root, false); 1008 } 1009 1010 static __always_inline void 1011 unlink_va_augment(struct vmap_area *va, struct rb_root *root) 1012 { 1013 __unlink_va(va, root, true); 1014 } 1015 1016 #if DEBUG_AUGMENT_PROPAGATE_CHECK 1017 /* 1018 * Gets called when remove the node and rotate. 
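 * It recomputes a node's subtree_max_size from its own size and both
 * children, and is used by the propagate checker below.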
1019 */ 1020 static __always_inline unsigned long 1021 compute_subtree_max_size(struct vmap_area *va) 1022 { 1023 return max3(va_size(va), 1024 get_subtree_max_size(va->rb_node.rb_left), 1025 get_subtree_max_size(va->rb_node.rb_right)); 1026 } 1027 1028 static void 1029 augment_tree_propagate_check(void) 1030 { 1031 struct vmap_area *va; 1032 unsigned long computed_size; 1033 1034 list_for_each_entry(va, &free_vmap_area_list, list) { 1035 computed_size = compute_subtree_max_size(va); 1036 if (computed_size != va->subtree_max_size) 1037 pr_emerg("tree is corrupted: %lu, %lu\n", 1038 va_size(va), va->subtree_max_size); 1039 } 1040 } 1041 #endif 1042 1043 /* 1044 * This function populates subtree_max_size from bottom to upper 1045 * levels starting from VA point. The propagation must be done 1046 * when VA size is modified by changing its va_start/va_end. Or 1047 * in case of newly inserting of VA to the tree. 1048 * 1049 * It means that __augment_tree_propagate_from() must be called: 1050 * - After VA has been inserted to the tree(free path); 1051 * - After VA has been shrunk(allocation path); 1052 * - After VA has been increased(merging path). 1053 * 1054 * Please note that, it does not mean that upper parent nodes 1055 * and their subtree_max_size are recalculated all the time up 1056 * to the root node. 1057 * 1058 * 4--8 1059 * /\ 1060 * / \ 1061 * / \ 1062 * 2--2 8--8 1063 * 1064 * For example if we modify the node 4, shrinking it to 2, then 1065 * no any modification is required. If we shrink the node 2 to 1 1066 * its subtree_max_size is updated only, and set to 1. If we shrink 1067 * the node 8 to 6, then its subtree_max_size is set to 6 and parent 1068 * node becomes 4--6. 1069 */ 1070 static __always_inline void 1071 augment_tree_propagate_from(struct vmap_area *va) 1072 { 1073 /* 1074 * Populate the tree from bottom towards the root until 1075 * the calculated maximum available size of checked node 1076 * is equal to its current one. 1077 */ 1078 free_vmap_area_rb_augment_cb_propagate(&va->rb_node, NULL); 1079 1080 #if DEBUG_AUGMENT_PROPAGATE_CHECK 1081 augment_tree_propagate_check(); 1082 #endif 1083 } 1084 1085 static void 1086 insert_vmap_area(struct vmap_area *va, 1087 struct rb_root *root, struct list_head *head) 1088 { 1089 struct rb_node **link; 1090 struct rb_node *parent; 1091 1092 link = find_va_links(va, root, NULL, &parent); 1093 if (link) 1094 link_va(va, root, parent, link, head); 1095 } 1096 1097 static void 1098 insert_vmap_area_augment(struct vmap_area *va, 1099 struct rb_node *from, struct rb_root *root, 1100 struct list_head *head) 1101 { 1102 struct rb_node **link; 1103 struct rb_node *parent; 1104 1105 if (from) 1106 link = find_va_links(va, NULL, from, &parent); 1107 else 1108 link = find_va_links(va, root, NULL, &parent); 1109 1110 if (link) { 1111 link_va_augment(va, root, parent, link, head); 1112 augment_tree_propagate_from(va); 1113 } 1114 } 1115 1116 /* 1117 * Merge de-allocated chunk of VA memory with previous 1118 * and next free blocks. If coalesce is not done a new 1119 * free area is inserted. If VA has been merged, it is 1120 * freed. 1121 * 1122 * Please note, it can return NULL in case of overlap 1123 * ranges, followed by WARN() report. Despite it is a 1124 * buggy behaviour, a system can be alive and keep 1125 * ongoing. 
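 *
 * For example, returning [8,12) to a free list that already holds [4,8)
 * and [12,16) first extends the next sibling to [8,16) and then the
 * previous one to [4,16), freeing the two vmap_area objects that are no
 * longer needed.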
1126 */ 1127 static __always_inline struct vmap_area * 1128 __merge_or_add_vmap_area(struct vmap_area *va, 1129 struct rb_root *root, struct list_head *head, bool augment) 1130 { 1131 struct vmap_area *sibling; 1132 struct list_head *next; 1133 struct rb_node **link; 1134 struct rb_node *parent; 1135 bool merged = false; 1136 1137 /* 1138 * Find a place in the tree where VA potentially will be 1139 * inserted, unless it is merged with its sibling/siblings. 1140 */ 1141 link = find_va_links(va, root, NULL, &parent); 1142 if (!link) 1143 return NULL; 1144 1145 /* 1146 * Get next node of VA to check if merging can be done. 1147 */ 1148 next = get_va_next_sibling(parent, link); 1149 if (unlikely(next == NULL)) 1150 goto insert; 1151 1152 /* 1153 * start end 1154 * | | 1155 * |<------VA------>|<-----Next----->| 1156 * | | 1157 * start end 1158 */ 1159 if (next != head) { 1160 sibling = list_entry(next, struct vmap_area, list); 1161 if (sibling->va_start == va->va_end) { 1162 sibling->va_start = va->va_start; 1163 1164 /* Free vmap_area object. */ 1165 kmem_cache_free(vmap_area_cachep, va); 1166 1167 /* Point to the new merged area. */ 1168 va = sibling; 1169 merged = true; 1170 } 1171 } 1172 1173 /* 1174 * start end 1175 * | | 1176 * |<-----Prev----->|<------VA------>| 1177 * | | 1178 * start end 1179 */ 1180 if (next->prev != head) { 1181 sibling = list_entry(next->prev, struct vmap_area, list); 1182 if (sibling->va_end == va->va_start) { 1183 /* 1184 * If both neighbors are coalesced, it is important 1185 * to unlink the "next" node first, followed by merging 1186 * with "previous" one. Otherwise the tree might not be 1187 * fully populated if a sibling's augmented value is 1188 * "normalized" because of rotation operations. 1189 */ 1190 if (merged) 1191 __unlink_va(va, root, augment); 1192 1193 sibling->va_end = va->va_end; 1194 1195 /* Free vmap_area object. */ 1196 kmem_cache_free(vmap_area_cachep, va); 1197 1198 /* Point to the new merged area. */ 1199 va = sibling; 1200 merged = true; 1201 } 1202 } 1203 1204 insert: 1205 if (!merged) 1206 __link_va(va, root, parent, link, head, augment); 1207 1208 return va; 1209 } 1210 1211 static __always_inline struct vmap_area * 1212 merge_or_add_vmap_area(struct vmap_area *va, 1213 struct rb_root *root, struct list_head *head) 1214 { 1215 return __merge_or_add_vmap_area(va, root, head, false); 1216 } 1217 1218 static __always_inline struct vmap_area * 1219 merge_or_add_vmap_area_augment(struct vmap_area *va, 1220 struct rb_root *root, struct list_head *head) 1221 { 1222 va = __merge_or_add_vmap_area(va, root, head, true); 1223 if (va) 1224 augment_tree_propagate_from(va); 1225 1226 return va; 1227 } 1228 1229 static __always_inline bool 1230 is_within_this_va(struct vmap_area *va, unsigned long size, 1231 unsigned long align, unsigned long vstart) 1232 { 1233 unsigned long nva_start_addr; 1234 1235 if (va->va_start > vstart) 1236 nva_start_addr = ALIGN(va->va_start, align); 1237 else 1238 nva_start_addr = ALIGN(vstart, align); 1239 1240 /* Can be overflowed due to big size or alignment. */ 1241 if (nva_start_addr + size < nva_start_addr || 1242 nva_start_addr < vstart) 1243 return false; 1244 1245 return (nva_start_addr + size <= va->va_end); 1246 } 1247 1248 /* 1249 * Find the first free block(lowest start address) in the tree, 1250 * that will accomplish the request corresponding to passing 1251 * parameters. 
Please note, with an alignment bigger than PAGE_SIZE, 1252 * a search length is adjusted to account for worst case alignment 1253 * overhead. 1254 */ 1255 static __always_inline struct vmap_area * 1256 find_vmap_lowest_match(struct rb_root *root, unsigned long size, 1257 unsigned long align, unsigned long vstart, bool adjust_search_size) 1258 { 1259 struct vmap_area *va; 1260 struct rb_node *node; 1261 unsigned long length; 1262 1263 /* Start from the root. */ 1264 node = root->rb_node; 1265 1266 /* Adjust the search size for alignment overhead. */ 1267 length = adjust_search_size ? size + align - 1 : size; 1268 1269 while (node) { 1270 va = rb_entry(node, struct vmap_area, rb_node); 1271 1272 if (get_subtree_max_size(node->rb_left) >= length && 1273 vstart < va->va_start) { 1274 node = node->rb_left; 1275 } else { 1276 if (is_within_this_va(va, size, align, vstart)) 1277 return va; 1278 1279 /* 1280 * Does not make sense to go deeper towards the right 1281 * sub-tree if it does not have a free block that is 1282 * equal or bigger to the requested search length. 1283 */ 1284 if (get_subtree_max_size(node->rb_right) >= length) { 1285 node = node->rb_right; 1286 continue; 1287 } 1288 1289 /* 1290 * OK. We roll back and find the first right sub-tree, 1291 * that will satisfy the search criteria. It can happen 1292 * due to "vstart" restriction or an alignment overhead 1293 * that is bigger then PAGE_SIZE. 1294 */ 1295 while ((node = rb_parent(node))) { 1296 va = rb_entry(node, struct vmap_area, rb_node); 1297 if (is_within_this_va(va, size, align, vstart)) 1298 return va; 1299 1300 if (get_subtree_max_size(node->rb_right) >= length && 1301 vstart <= va->va_start) { 1302 /* 1303 * Shift the vstart forward. Please note, we update it with 1304 * parent's start address adding "1" because we do not want 1305 * to enter same sub-tree after it has already been checked 1306 * and no suitable free block found there. 
1307 */ 1308 vstart = va->va_start + 1; 1309 node = node->rb_right; 1310 break; 1311 } 1312 } 1313 } 1314 } 1315 1316 return NULL; 1317 } 1318 1319 #if DEBUG_AUGMENT_LOWEST_MATCH_CHECK 1320 #include <linux/random.h> 1321 1322 static struct vmap_area * 1323 find_vmap_lowest_linear_match(struct list_head *head, unsigned long size, 1324 unsigned long align, unsigned long vstart) 1325 { 1326 struct vmap_area *va; 1327 1328 list_for_each_entry(va, head, list) { 1329 if (!is_within_this_va(va, size, align, vstart)) 1330 continue; 1331 1332 return va; 1333 } 1334 1335 return NULL; 1336 } 1337 1338 static void 1339 find_vmap_lowest_match_check(struct rb_root *root, struct list_head *head, 1340 unsigned long size, unsigned long align) 1341 { 1342 struct vmap_area *va_1, *va_2; 1343 unsigned long vstart; 1344 unsigned int rnd; 1345 1346 get_random_bytes(&rnd, sizeof(rnd)); 1347 vstart = VMALLOC_START + rnd; 1348 1349 va_1 = find_vmap_lowest_match(root, size, align, vstart, false); 1350 va_2 = find_vmap_lowest_linear_match(head, size, align, vstart); 1351 1352 if (va_1 != va_2) 1353 pr_emerg("not lowest: t: 0x%p, l: 0x%p, v: 0x%lx\n", 1354 va_1, va_2, vstart); 1355 } 1356 #endif 1357 1358 enum fit_type { 1359 NOTHING_FIT = 0, 1360 FL_FIT_TYPE = 1, /* full fit */ 1361 LE_FIT_TYPE = 2, /* left edge fit */ 1362 RE_FIT_TYPE = 3, /* right edge fit */ 1363 NE_FIT_TYPE = 4 /* no edge fit */ 1364 }; 1365 1366 static __always_inline enum fit_type 1367 classify_va_fit_type(struct vmap_area *va, 1368 unsigned long nva_start_addr, unsigned long size) 1369 { 1370 enum fit_type type; 1371 1372 /* Check if it is within VA. */ 1373 if (nva_start_addr < va->va_start || 1374 nva_start_addr + size > va->va_end) 1375 return NOTHING_FIT; 1376 1377 /* Now classify. */ 1378 if (va->va_start == nva_start_addr) { 1379 if (va->va_end == nva_start_addr + size) 1380 type = FL_FIT_TYPE; 1381 else 1382 type = LE_FIT_TYPE; 1383 } else if (va->va_end == nva_start_addr + size) { 1384 type = RE_FIT_TYPE; 1385 } else { 1386 type = NE_FIT_TYPE; 1387 } 1388 1389 return type; 1390 } 1391 1392 static __always_inline int 1393 adjust_va_to_fit_type(struct rb_root *root, struct list_head *head, 1394 struct vmap_area *va, unsigned long nva_start_addr, 1395 unsigned long size) 1396 { 1397 struct vmap_area *lva = NULL; 1398 enum fit_type type = classify_va_fit_type(va, nva_start_addr, size); 1399 1400 if (type == FL_FIT_TYPE) { 1401 /* 1402 * No need to split VA, it fully fits. 1403 * 1404 * | | 1405 * V NVA V 1406 * |---------------| 1407 */ 1408 unlink_va_augment(va, root); 1409 kmem_cache_free(vmap_area_cachep, va); 1410 } else if (type == LE_FIT_TYPE) { 1411 /* 1412 * Split left edge of fit VA. 1413 * 1414 * | | 1415 * V NVA V R 1416 * |-------|-------| 1417 */ 1418 va->va_start += size; 1419 } else if (type == RE_FIT_TYPE) { 1420 /* 1421 * Split right edge of fit VA. 1422 * 1423 * | | 1424 * L V NVA V 1425 * |-------|-------| 1426 */ 1427 va->va_end = nva_start_addr; 1428 } else if (type == NE_FIT_TYPE) { 1429 /* 1430 * Split no edge of fit VA. 1431 * 1432 * | | 1433 * L V NVA V R 1434 * |---|-------|---| 1435 */ 1436 lva = __this_cpu_xchg(ne_fit_preload_node, NULL); 1437 if (unlikely(!lva)) { 1438 /* 1439 * For percpu allocator we do not do any pre-allocation 1440 * and leave it as it is. The reason is it most likely 1441 * never ends up with NE_FIT_TYPE splitting. In case of 1442 * percpu allocations offsets and sizes are aligned to 1443 * fixed align request, i.e. RE_FIT_TYPE and FL_FIT_TYPE 1444 * are its main fitting cases. 
1445 * 1446 * There are a few exceptions though, as an example it is 1447 * a first allocation (early boot up) when we have "one" 1448 * big free space that has to be split. 1449 * 1450 * Also we can hit this path in case of regular "vmap" 1451 * allocations, if "this" current CPU was not preloaded. 1452 * See the comment in alloc_vmap_area() why. If so, then 1453 * GFP_NOWAIT is used instead to get an extra object for 1454 * split purpose. That is rare and most time does not 1455 * occur. 1456 * 1457 * What happens if an allocation gets failed. Basically, 1458 * an "overflow" path is triggered to purge lazily freed 1459 * areas to free some memory, then, the "retry" path is 1460 * triggered to repeat one more time. See more details 1461 * in alloc_vmap_area() function. 1462 */ 1463 lva = kmem_cache_alloc(vmap_area_cachep, GFP_NOWAIT); 1464 if (!lva) 1465 return -1; 1466 } 1467 1468 /* 1469 * Build the remainder. 1470 */ 1471 lva->va_start = va->va_start; 1472 lva->va_end = nva_start_addr; 1473 1474 /* 1475 * Shrink this VA to remaining size. 1476 */ 1477 va->va_start = nva_start_addr + size; 1478 } else { 1479 return -1; 1480 } 1481 1482 if (type != FL_FIT_TYPE) { 1483 augment_tree_propagate_from(va); 1484 1485 if (lva) /* type == NE_FIT_TYPE */ 1486 insert_vmap_area_augment(lva, &va->rb_node, root, head); 1487 } 1488 1489 return 0; 1490 } 1491 1492 /* 1493 * Returns a start address of the newly allocated area, if success. 1494 * Otherwise a vend is returned that indicates failure. 1495 */ 1496 static __always_inline unsigned long 1497 __alloc_vmap_area(struct rb_root *root, struct list_head *head, 1498 unsigned long size, unsigned long align, 1499 unsigned long vstart, unsigned long vend) 1500 { 1501 bool adjust_search_size = true; 1502 unsigned long nva_start_addr; 1503 struct vmap_area *va; 1504 int ret; 1505 1506 /* 1507 * Do not adjust when: 1508 * a) align <= PAGE_SIZE, because it does not make any sense. 1509 * All blocks(their start addresses) are at least PAGE_SIZE 1510 * aligned anyway; 1511 * b) a short range where a requested size corresponds to exactly 1512 * specified [vstart:vend] interval and an alignment > PAGE_SIZE. 1513 * With adjusted search length an allocation would not succeed. 1514 */ 1515 if (align <= PAGE_SIZE || (align > PAGE_SIZE && (vend - vstart) == size)) 1516 adjust_search_size = false; 1517 1518 va = find_vmap_lowest_match(root, size, align, vstart, adjust_search_size); 1519 if (unlikely(!va)) 1520 return vend; 1521 1522 if (va->va_start > vstart) 1523 nva_start_addr = ALIGN(va->va_start, align); 1524 else 1525 nva_start_addr = ALIGN(vstart, align); 1526 1527 /* Check the "vend" restriction. */ 1528 if (nva_start_addr + size > vend) 1529 return vend; 1530 1531 /* Update the free vmap_area. */ 1532 ret = adjust_va_to_fit_type(root, head, va, nva_start_addr, size); 1533 if (WARN_ON_ONCE(ret)) 1534 return vend; 1535 1536 #if DEBUG_AUGMENT_LOWEST_MATCH_CHECK 1537 find_vmap_lowest_match_check(root, head, size, align); 1538 #endif 1539 1540 return nva_start_addr; 1541 } 1542 1543 /* 1544 * Free a region of KVA allocated by alloc_vmap_area 1545 */ 1546 static void free_vmap_area(struct vmap_area *va) 1547 { 1548 /* 1549 * Remove from the busy tree/list. 1550 */ 1551 spin_lock(&vmap_area_lock); 1552 unlink_va(va, &vmap_area_root); 1553 spin_unlock(&vmap_area_lock); 1554 1555 /* 1556 * Insert/Merge it back to the free tree/list. 
1557 */ 1558 spin_lock(&free_vmap_area_lock); 1559 merge_or_add_vmap_area_augment(va, &free_vmap_area_root, &free_vmap_area_list); 1560 spin_unlock(&free_vmap_area_lock); 1561 } 1562 1563 static inline void 1564 preload_this_cpu_lock(spinlock_t *lock, gfp_t gfp_mask, int node) 1565 { 1566 struct vmap_area *va = NULL; 1567 1568 /* 1569 * Preload this CPU with one extra vmap_area object. It is used 1570 * when fit type of free area is NE_FIT_TYPE. It guarantees that 1571 * a CPU that does an allocation is preloaded. 1572 * 1573 * We do it in non-atomic context, thus it allows us to use more 1574 * permissive allocation masks to be more stable under low memory 1575 * condition and high memory pressure. 1576 */ 1577 if (!this_cpu_read(ne_fit_preload_node)) 1578 va = kmem_cache_alloc_node(vmap_area_cachep, gfp_mask, node); 1579 1580 spin_lock(lock); 1581 1582 if (va && __this_cpu_cmpxchg(ne_fit_preload_node, NULL, va)) 1583 kmem_cache_free(vmap_area_cachep, va); 1584 } 1585 1586 /* 1587 * Allocate a region of KVA of the specified size and alignment, within the 1588 * vstart and vend. 1589 */ 1590 static struct vmap_area *alloc_vmap_area(unsigned long size, 1591 unsigned long align, 1592 unsigned long vstart, unsigned long vend, 1593 int node, gfp_t gfp_mask) 1594 { 1595 struct vmap_area *va; 1596 unsigned long freed; 1597 unsigned long addr; 1598 int purged = 0; 1599 int ret; 1600 1601 BUG_ON(!size); 1602 BUG_ON(offset_in_page(size)); 1603 BUG_ON(!is_power_of_2(align)); 1604 1605 if (unlikely(!vmap_initialized)) 1606 return ERR_PTR(-EBUSY); 1607 1608 might_sleep(); 1609 gfp_mask = gfp_mask & GFP_RECLAIM_MASK; 1610 1611 va = kmem_cache_alloc_node(vmap_area_cachep, gfp_mask, node); 1612 if (unlikely(!va)) 1613 return ERR_PTR(-ENOMEM); 1614 1615 /* 1616 * Only scan the relevant parts containing pointers to other objects 1617 * to avoid false negatives. 1618 */ 1619 kmemleak_scan_area(&va->rb_node, SIZE_MAX, gfp_mask); 1620 1621 retry: 1622 preload_this_cpu_lock(&free_vmap_area_lock, gfp_mask, node); 1623 addr = __alloc_vmap_area(&free_vmap_area_root, &free_vmap_area_list, 1624 size, align, vstart, vend); 1625 spin_unlock(&free_vmap_area_lock); 1626 1627 trace_alloc_vmap_area(addr, size, align, vstart, vend, addr == vend); 1628 1629 /* 1630 * If an allocation fails, the "vend" address is 1631 * returned. Therefore trigger the overflow path. 
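	 *
	 * The overflow path below purges all lazily-freed areas and retries
	 * once; if that is still not enough, the vmap_notify_list notifiers
	 * get a chance to release memory and the allocation is retried again
	 * before finally failing with -EBUSY.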
1632 */ 1633 if (unlikely(addr == vend)) 1634 goto overflow; 1635 1636 va->va_start = addr; 1637 va->va_end = addr + size; 1638 va->vm = NULL; 1639 1640 spin_lock(&vmap_area_lock); 1641 insert_vmap_area(va, &vmap_area_root, &vmap_area_list); 1642 spin_unlock(&vmap_area_lock); 1643 1644 BUG_ON(!IS_ALIGNED(va->va_start, align)); 1645 BUG_ON(va->va_start < vstart); 1646 BUG_ON(va->va_end > vend); 1647 1648 ret = kasan_populate_vmalloc(addr, size); 1649 if (ret) { 1650 free_vmap_area(va); 1651 return ERR_PTR(ret); 1652 } 1653 1654 return va; 1655 1656 overflow: 1657 if (!purged) { 1658 purge_vmap_area_lazy(); 1659 purged = 1; 1660 goto retry; 1661 } 1662 1663 freed = 0; 1664 blocking_notifier_call_chain(&vmap_notify_list, 0, &freed); 1665 1666 if (freed > 0) { 1667 purged = 0; 1668 goto retry; 1669 } 1670 1671 if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit()) 1672 pr_warn("vmap allocation for size %lu failed: use vmalloc=<size> to increase size\n", 1673 size); 1674 1675 kmem_cache_free(vmap_area_cachep, va); 1676 return ERR_PTR(-EBUSY); 1677 } 1678 1679 int register_vmap_purge_notifier(struct notifier_block *nb) 1680 { 1681 return blocking_notifier_chain_register(&vmap_notify_list, nb); 1682 } 1683 EXPORT_SYMBOL_GPL(register_vmap_purge_notifier); 1684 1685 int unregister_vmap_purge_notifier(struct notifier_block *nb) 1686 { 1687 return blocking_notifier_chain_unregister(&vmap_notify_list, nb); 1688 } 1689 EXPORT_SYMBOL_GPL(unregister_vmap_purge_notifier); 1690 1691 /* 1692 * lazy_max_pages is the maximum amount of virtual address space we gather up 1693 * before attempting to purge with a TLB flush. 1694 * 1695 * There is a tradeoff here: a larger number will cover more kernel page tables 1696 * and take slightly longer to purge, but it will linearly reduce the number of 1697 * global TLB flushes that must be performed. It would seem natural to scale 1698 * this number up linearly with the number of CPUs (because vmapping activity 1699 * could also scale linearly with the number of CPUs), however it is likely 1700 * that in practice, workloads might be constrained in other ways that mean 1701 * vmap activity will not scale linearly with CPUs. Also, I want to be 1702 * conservative and not introduce a big latency on huge systems, so go with 1703 * a less aggressive log scale. It will still be an improvement over the old 1704 * code, and it will be simple to change the scale factor if we find that it 1705 * becomes a problem on bigger systems. 1706 */ 1707 static unsigned long lazy_max_pages(void) 1708 { 1709 unsigned int log; 1710 1711 log = fls(num_online_cpus()); 1712 1713 return log * (32UL * 1024 * 1024 / PAGE_SIZE); 1714 } 1715 1716 static atomic_long_t vmap_lazy_nr = ATOMIC_LONG_INIT(0); 1717 1718 /* 1719 * Serialize vmap purging. There is no actual critical section protected 1720 * by this lock, but we want to avoid concurrent calls for performance 1721 * reasons and to make the pcpu_get_vm_areas more deterministic. 1722 */ 1723 static DEFINE_MUTEX(vmap_purge_lock); 1724 1725 /* for per-CPU blocks */ 1726 static void purge_fragmented_blocks_allcpus(void); 1727 1728 /* 1729 * Purges all lazily-freed vmap areas. 
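 *
 * All pending areas are detached from the purge tree/list in one step under
 * purge_vmap_area_lock, a single TLB flush covers the [start:end] range
 * (grown to span every detached area), and each area is then merged back
 * into the free tree. Returns true if at least one area was purged.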
1730 */ 1731 static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end) 1732 { 1733 unsigned long resched_threshold; 1734 unsigned int num_purged_areas = 0; 1735 struct list_head local_purge_list; 1736 struct vmap_area *va, *n_va; 1737 1738 lockdep_assert_held(&vmap_purge_lock); 1739 1740 spin_lock(&purge_vmap_area_lock); 1741 purge_vmap_area_root = RB_ROOT; 1742 list_replace_init(&purge_vmap_area_list, &local_purge_list); 1743 spin_unlock(&purge_vmap_area_lock); 1744 1745 if (unlikely(list_empty(&local_purge_list))) 1746 goto out; 1747 1748 start = min(start, 1749 list_first_entry(&local_purge_list, 1750 struct vmap_area, list)->va_start); 1751 1752 end = max(end, 1753 list_last_entry(&local_purge_list, 1754 struct vmap_area, list)->va_end); 1755 1756 flush_tlb_kernel_range(start, end); 1757 resched_threshold = lazy_max_pages() << 1; 1758 1759 spin_lock(&free_vmap_area_lock); 1760 list_for_each_entry_safe(va, n_va, &local_purge_list, list) { 1761 unsigned long nr = (va->va_end - va->va_start) >> PAGE_SHIFT; 1762 unsigned long orig_start = va->va_start; 1763 unsigned long orig_end = va->va_end; 1764 1765 /* 1766 * Finally insert or merge lazily-freed area. It is 1767 * detached and there is no need to "unlink" it from 1768 * anything. 1769 */ 1770 va = merge_or_add_vmap_area_augment(va, &free_vmap_area_root, 1771 &free_vmap_area_list); 1772 1773 if (!va) 1774 continue; 1775 1776 if (is_vmalloc_or_module_addr((void *)orig_start)) 1777 kasan_release_vmalloc(orig_start, orig_end, 1778 va->va_start, va->va_end); 1779 1780 atomic_long_sub(nr, &vmap_lazy_nr); 1781 num_purged_areas++; 1782 1783 if (atomic_long_read(&vmap_lazy_nr) < resched_threshold) 1784 cond_resched_lock(&free_vmap_area_lock); 1785 } 1786 spin_unlock(&free_vmap_area_lock); 1787 1788 out: 1789 trace_purge_vmap_area_lazy(start, end, num_purged_areas); 1790 return num_purged_areas > 0; 1791 } 1792 1793 /* 1794 * Kick off a purge of the outstanding lazy areas. 1795 */ 1796 static void purge_vmap_area_lazy(void) 1797 { 1798 mutex_lock(&vmap_purge_lock); 1799 purge_fragmented_blocks_allcpus(); 1800 __purge_vmap_area_lazy(ULONG_MAX, 0); 1801 mutex_unlock(&vmap_purge_lock); 1802 } 1803 1804 static void drain_vmap_area_work(struct work_struct *work) 1805 { 1806 unsigned long nr_lazy; 1807 1808 do { 1809 mutex_lock(&vmap_purge_lock); 1810 __purge_vmap_area_lazy(ULONG_MAX, 0); 1811 mutex_unlock(&vmap_purge_lock); 1812 1813 /* Recheck if further work is required. */ 1814 nr_lazy = atomic_long_read(&vmap_lazy_nr); 1815 } while (nr_lazy > lazy_max_pages()); 1816 } 1817 1818 /* 1819 * Free a vmap area, caller ensuring that the area has been unmapped 1820 * and flush_cache_vunmap had been called for the correct range 1821 * previously. 1822 */ 1823 static void free_vmap_area_noflush(struct vmap_area *va) 1824 { 1825 unsigned long nr_lazy_max = lazy_max_pages(); 1826 unsigned long va_start = va->va_start; 1827 unsigned long nr_lazy; 1828 1829 spin_lock(&vmap_area_lock); 1830 unlink_va(va, &vmap_area_root); 1831 spin_unlock(&vmap_area_lock); 1832 1833 nr_lazy = atomic_long_add_return((va->va_end - va->va_start) >> 1834 PAGE_SHIFT, &vmap_lazy_nr); 1835 1836 /* 1837 * Merge or place it to the purge tree/list. 
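	 *
	 * Once the total of lazily-freed pages exceeds lazy_max_pages(), the
	 * drain worker is scheduled. With 4KB pages that threshold is
	 * fls(num_online_cpus()) * 8192 pages, e.g. 32768 pages (128MB) on
	 * an 8-CPU system.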
1838 */ 1839 spin_lock(&purge_vmap_area_lock); 1840 merge_or_add_vmap_area(va, 1841 &purge_vmap_area_root, &purge_vmap_area_list); 1842 spin_unlock(&purge_vmap_area_lock); 1843 1844 trace_free_vmap_area_noflush(va_start, nr_lazy, nr_lazy_max); 1845 1846 /* After this point, we may free va at any time */ 1847 if (unlikely(nr_lazy > nr_lazy_max)) 1848 schedule_work(&drain_vmap_work); 1849 } 1850 1851 /* 1852 * Free and unmap a vmap area 1853 */ 1854 static void free_unmap_vmap_area(struct vmap_area *va) 1855 { 1856 flush_cache_vunmap(va->va_start, va->va_end); 1857 vunmap_range_noflush(va->va_start, va->va_end); 1858 if (debug_pagealloc_enabled_static()) 1859 flush_tlb_kernel_range(va->va_start, va->va_end); 1860 1861 free_vmap_area_noflush(va); 1862 } 1863 1864 struct vmap_area *find_vmap_area(unsigned long addr) 1865 { 1866 struct vmap_area *va; 1867 1868 spin_lock(&vmap_area_lock); 1869 va = __find_vmap_area(addr, &vmap_area_root); 1870 spin_unlock(&vmap_area_lock); 1871 1872 return va; 1873 } 1874 1875 /*** Per cpu kva allocator ***/ 1876 1877 /* 1878 * vmap space is limited especially on 32 bit architectures. Ensure there is 1879 * room for at least 16 percpu vmap blocks per CPU. 1880 */ 1881 /* 1882 * If we had a constant VMALLOC_START and VMALLOC_END, we'd like to be able 1883 * to #define VMALLOC_SPACE (VMALLOC_END-VMALLOC_START). Guess 1884 * instead (we just need a rough idea) 1885 */ 1886 #if BITS_PER_LONG == 32 1887 #define VMALLOC_SPACE (128UL*1024*1024) 1888 #else 1889 #define VMALLOC_SPACE (128UL*1024*1024*1024) 1890 #endif 1891 1892 #define VMALLOC_PAGES (VMALLOC_SPACE / PAGE_SIZE) 1893 #define VMAP_MAX_ALLOC BITS_PER_LONG /* 256K with 4K pages */ 1894 #define VMAP_BBMAP_BITS_MAX 1024 /* 4MB with 4K pages */ 1895 #define VMAP_BBMAP_BITS_MIN (VMAP_MAX_ALLOC*2) 1896 #define VMAP_MIN(x, y) ((x) < (y) ? (x) : (y)) /* can't use min() */ 1897 #define VMAP_MAX(x, y) ((x) > (y) ? (x) : (y)) /* can't use max() */ 1898 #define VMAP_BBMAP_BITS \ 1899 VMAP_MIN(VMAP_BBMAP_BITS_MAX, \ 1900 VMAP_MAX(VMAP_BBMAP_BITS_MIN, \ 1901 VMALLOC_PAGES / roundup_pow_of_two(NR_CPUS) / 16)) 1902 1903 #define VMAP_BLOCK_SIZE (VMAP_BBMAP_BITS * PAGE_SIZE) 1904 1905 struct vmap_block_queue { 1906 spinlock_t lock; 1907 struct list_head free; 1908 }; 1909 1910 struct vmap_block { 1911 spinlock_t lock; 1912 struct vmap_area *va; 1913 unsigned long free, dirty; 1914 unsigned long dirty_min, dirty_max; /*< dirty range */ 1915 struct list_head free_list; 1916 struct rcu_head rcu_head; 1917 struct list_head purge; 1918 }; 1919 1920 /* Queue of free and dirty vmap blocks, for allocation and flushing purposes */ 1921 static DEFINE_PER_CPU(struct vmap_block_queue, vmap_block_queue); 1922 1923 /* 1924 * XArray of vmap blocks, indexed by address, to quickly find a vmap block 1925 * in the free path. Could get rid of this if we change the API to return a 1926 * "cookie" from alloc, to be passed to free. But no big deal yet. 1927 */ 1928 static DEFINE_XARRAY(vmap_blocks); 1929 1930 /* 1931 * We should probably have a fallback mechanism to allocate virtual memory 1932 * out of partially filled vmap blocks. However vmap block sizing should be 1933 * fairly reasonable according to the vmalloc size, so it shouldn't be a 1934 * big problem. 
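 *
 * As a sizing illustration, assuming 4KB pages on a 64-bit kernel:
 * VMALLOC_PAGES is 32M pages, so with NR_CPUS rounded up to 64 the raw
 * estimate is 32M / 64 / 16 = 32768 bits per block, which the clamp above
 * caps at VMAP_BBMAP_BITS_MAX = 1024, i.e. a 4MB VMAP_BLOCK_SIZE.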
1935 */ 1936 1937 static unsigned long addr_to_vb_idx(unsigned long addr) 1938 { 1939 addr -= VMALLOC_START & ~(VMAP_BLOCK_SIZE-1); 1940 addr /= VMAP_BLOCK_SIZE; 1941 return addr; 1942 } 1943 1944 static void *vmap_block_vaddr(unsigned long va_start, unsigned long pages_off) 1945 { 1946 unsigned long addr; 1947 1948 addr = va_start + (pages_off << PAGE_SHIFT); 1949 BUG_ON(addr_to_vb_idx(addr) != addr_to_vb_idx(va_start)); 1950 return (void *)addr; 1951 } 1952 1953 /** 1954 * new_vmap_block - allocates new vmap_block and occupies 2^order pages in this 1955 * block. Of course pages number can't exceed VMAP_BBMAP_BITS 1956 * @order: how many 2^order pages should be occupied in newly allocated block 1957 * @gfp_mask: flags for the page level allocator 1958 * 1959 * Return: virtual address in a newly allocated block or ERR_PTR(-errno) 1960 */ 1961 static void *new_vmap_block(unsigned int order, gfp_t gfp_mask) 1962 { 1963 struct vmap_block_queue *vbq; 1964 struct vmap_block *vb; 1965 struct vmap_area *va; 1966 unsigned long vb_idx; 1967 int node, err; 1968 void *vaddr; 1969 1970 node = numa_node_id(); 1971 1972 vb = kmalloc_node(sizeof(struct vmap_block), 1973 gfp_mask & GFP_RECLAIM_MASK, node); 1974 if (unlikely(!vb)) 1975 return ERR_PTR(-ENOMEM); 1976 1977 va = alloc_vmap_area(VMAP_BLOCK_SIZE, VMAP_BLOCK_SIZE, 1978 VMALLOC_START, VMALLOC_END, 1979 node, gfp_mask); 1980 if (IS_ERR(va)) { 1981 kfree(vb); 1982 return ERR_CAST(va); 1983 } 1984 1985 vaddr = vmap_block_vaddr(va->va_start, 0); 1986 spin_lock_init(&vb->lock); 1987 vb->va = va; 1988 /* At least something should be left free */ 1989 BUG_ON(VMAP_BBMAP_BITS <= (1UL << order)); 1990 vb->free = VMAP_BBMAP_BITS - (1UL << order); 1991 vb->dirty = 0; 1992 vb->dirty_min = VMAP_BBMAP_BITS; 1993 vb->dirty_max = 0; 1994 INIT_LIST_HEAD(&vb->free_list); 1995 1996 vb_idx = addr_to_vb_idx(va->va_start); 1997 err = xa_insert(&vmap_blocks, vb_idx, vb, gfp_mask); 1998 if (err) { 1999 kfree(vb); 2000 free_vmap_area(va); 2001 return ERR_PTR(err); 2002 } 2003 2004 vbq = raw_cpu_ptr(&vmap_block_queue); 2005 spin_lock(&vbq->lock); 2006 list_add_tail_rcu(&vb->free_list, &vbq->free); 2007 spin_unlock(&vbq->lock); 2008 2009 return vaddr; 2010 } 2011 2012 static void free_vmap_block(struct vmap_block *vb) 2013 { 2014 struct vmap_block *tmp; 2015 2016 tmp = xa_erase(&vmap_blocks, addr_to_vb_idx(vb->va->va_start)); 2017 BUG_ON(tmp != vb); 2018 2019 free_vmap_area_noflush(vb->va); 2020 kfree_rcu(vb, rcu_head); 2021 } 2022 2023 static void purge_fragmented_blocks(int cpu) 2024 { 2025 LIST_HEAD(purge); 2026 struct vmap_block *vb; 2027 struct vmap_block *n_vb; 2028 struct vmap_block_queue *vbq = &per_cpu(vmap_block_queue, cpu); 2029 2030 rcu_read_lock(); 2031 list_for_each_entry_rcu(vb, &vbq->free, free_list) { 2032 2033 if (!(vb->free + vb->dirty == VMAP_BBMAP_BITS && vb->dirty != VMAP_BBMAP_BITS)) 2034 continue; 2035 2036 spin_lock(&vb->lock); 2037 if (vb->free + vb->dirty == VMAP_BBMAP_BITS && vb->dirty != VMAP_BBMAP_BITS) { 2038 vb->free = 0; /* prevent further allocs after releasing lock */ 2039 vb->dirty = VMAP_BBMAP_BITS; /* prevent purging it again */ 2040 vb->dirty_min = 0; 2041 vb->dirty_max = VMAP_BBMAP_BITS; 2042 spin_lock(&vbq->lock); 2043 list_del_rcu(&vb->free_list); 2044 spin_unlock(&vbq->lock); 2045 spin_unlock(&vb->lock); 2046 list_add_tail(&vb->purge, &purge); 2047 } else 2048 spin_unlock(&vb->lock); 2049 } 2050 rcu_read_unlock(); 2051 2052 list_for_each_entry_safe(vb, n_vb, &purge, purge) { 2053 list_del(&vb->purge); 2054 free_vmap_block(vb); 2055 
} 2056 } 2057 2058 static void purge_fragmented_blocks_allcpus(void) 2059 { 2060 int cpu; 2061 2062 for_each_possible_cpu(cpu) 2063 purge_fragmented_blocks(cpu); 2064 } 2065 2066 static void *vb_alloc(unsigned long size, gfp_t gfp_mask) 2067 { 2068 struct vmap_block_queue *vbq; 2069 struct vmap_block *vb; 2070 void *vaddr = NULL; 2071 unsigned int order; 2072 2073 BUG_ON(offset_in_page(size)); 2074 BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC); 2075 if (WARN_ON(size == 0)) { 2076 /* 2077 * Allocating 0 bytes isn't what caller wants since 2078 * get_order(0) returns funny result. Just warn and terminate 2079 * early. 2080 */ 2081 return NULL; 2082 } 2083 order = get_order(size); 2084 2085 rcu_read_lock(); 2086 vbq = raw_cpu_ptr(&vmap_block_queue); 2087 list_for_each_entry_rcu(vb, &vbq->free, free_list) { 2088 unsigned long pages_off; 2089 2090 spin_lock(&vb->lock); 2091 if (vb->free < (1UL << order)) { 2092 spin_unlock(&vb->lock); 2093 continue; 2094 } 2095 2096 pages_off = VMAP_BBMAP_BITS - vb->free; 2097 vaddr = vmap_block_vaddr(vb->va->va_start, pages_off); 2098 vb->free -= 1UL << order; 2099 if (vb->free == 0) { 2100 spin_lock(&vbq->lock); 2101 list_del_rcu(&vb->free_list); 2102 spin_unlock(&vbq->lock); 2103 } 2104 2105 spin_unlock(&vb->lock); 2106 break; 2107 } 2108 2109 rcu_read_unlock(); 2110 2111 /* Allocate new block if nothing was found */ 2112 if (!vaddr) 2113 vaddr = new_vmap_block(order, gfp_mask); 2114 2115 return vaddr; 2116 } 2117 2118 static void vb_free(unsigned long addr, unsigned long size) 2119 { 2120 unsigned long offset; 2121 unsigned int order; 2122 struct vmap_block *vb; 2123 2124 BUG_ON(offset_in_page(size)); 2125 BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC); 2126 2127 flush_cache_vunmap(addr, addr + size); 2128 2129 order = get_order(size); 2130 offset = (addr & (VMAP_BLOCK_SIZE - 1)) >> PAGE_SHIFT; 2131 vb = xa_load(&vmap_blocks, addr_to_vb_idx(addr)); 2132 2133 vunmap_range_noflush(addr, addr + size); 2134 2135 if (debug_pagealloc_enabled_static()) 2136 flush_tlb_kernel_range(addr, addr + size); 2137 2138 spin_lock(&vb->lock); 2139 2140 /* Expand dirty range */ 2141 vb->dirty_min = min(vb->dirty_min, offset); 2142 vb->dirty_max = max(vb->dirty_max, offset + (1UL << order)); 2143 2144 vb->dirty += 1UL << order; 2145 if (vb->dirty == VMAP_BBMAP_BITS) { 2146 BUG_ON(vb->free); 2147 spin_unlock(&vb->lock); 2148 free_vmap_block(vb); 2149 } else 2150 spin_unlock(&vb->lock); 2151 } 2152 2153 static void _vm_unmap_aliases(unsigned long start, unsigned long end, int flush) 2154 { 2155 int cpu; 2156 2157 if (unlikely(!vmap_initialized)) 2158 return; 2159 2160 might_sleep(); 2161 2162 for_each_possible_cpu(cpu) { 2163 struct vmap_block_queue *vbq = &per_cpu(vmap_block_queue, cpu); 2164 struct vmap_block *vb; 2165 2166 rcu_read_lock(); 2167 list_for_each_entry_rcu(vb, &vbq->free, free_list) { 2168 spin_lock(&vb->lock); 2169 if (vb->dirty && vb->dirty != VMAP_BBMAP_BITS) { 2170 unsigned long va_start = vb->va->va_start; 2171 unsigned long s, e; 2172 2173 s = va_start + (vb->dirty_min << PAGE_SHIFT); 2174 e = va_start + (vb->dirty_max << PAGE_SHIFT); 2175 2176 start = min(s, start); 2177 end = max(e, end); 2178 2179 flush = 1; 2180 } 2181 spin_unlock(&vb->lock); 2182 } 2183 rcu_read_unlock(); 2184 } 2185 2186 mutex_lock(&vmap_purge_lock); 2187 purge_fragmented_blocks_allcpus(); 2188 if (!__purge_vmap_area_lazy(start, end) && flush) 2189 flush_tlb_kernel_range(start, end); 2190 mutex_unlock(&vmap_purge_lock); 2191 } 2192 2193 /** 2194 * vm_unmap_aliases - unmap outstanding lazy aliases in 
the vmap layer 2195 * 2196 * The vmap/vmalloc layer lazily flushes kernel virtual mappings primarily 2197 * to amortize TLB flushing overheads. What this means is that any page you 2198 * have now, may, in a former life, have been mapped into kernel virtual 2199 * address by the vmap layer and so there might be some CPUs with TLB entries 2200 * still referencing that page (additional to the regular 1:1 kernel mapping). 2201 * 2202 * vm_unmap_aliases flushes all such lazy mappings. After it returns, we can 2203 * be sure that none of the pages we have control over will have any aliases 2204 * from the vmap layer. 2205 */ 2206 void vm_unmap_aliases(void) 2207 { 2208 unsigned long start = ULONG_MAX, end = 0; 2209 int flush = 0; 2210 2211 _vm_unmap_aliases(start, end, flush); 2212 } 2213 EXPORT_SYMBOL_GPL(vm_unmap_aliases); 2214 2215 /** 2216 * vm_unmap_ram - unmap linear kernel address space set up by vm_map_ram 2217 * @mem: the pointer returned by vm_map_ram 2218 * @count: the count passed to that vm_map_ram call (cannot unmap partial) 2219 */ 2220 void vm_unmap_ram(const void *mem, unsigned int count) 2221 { 2222 unsigned long size = (unsigned long)count << PAGE_SHIFT; 2223 unsigned long addr = (unsigned long)kasan_reset_tag(mem); 2224 struct vmap_area *va; 2225 2226 might_sleep(); 2227 BUG_ON(!addr); 2228 BUG_ON(addr < VMALLOC_START); 2229 BUG_ON(addr > VMALLOC_END); 2230 BUG_ON(!PAGE_ALIGNED(addr)); 2231 2232 kasan_poison_vmalloc(mem, size); 2233 2234 if (likely(count <= VMAP_MAX_ALLOC)) { 2235 debug_check_no_locks_freed(mem, size); 2236 vb_free(addr, size); 2237 return; 2238 } 2239 2240 va = find_vmap_area(addr); 2241 BUG_ON(!va); 2242 debug_check_no_locks_freed((void *)va->va_start, 2243 (va->va_end - va->va_start)); 2244 free_unmap_vmap_area(va); 2245 } 2246 EXPORT_SYMBOL(vm_unmap_ram); 2247 2248 /** 2249 * vm_map_ram - map pages linearly into kernel virtual address (vmalloc space) 2250 * @pages: an array of pointers to the pages to be mapped 2251 * @count: number of pages 2252 * @node: prefer to allocate data structures on this node 2253 * 2254 * If you use this function for less than VMAP_MAX_ALLOC pages, it could be 2255 * faster than vmap so it's good. But if you mix long-life and short-life 2256 * objects with vm_map_ram(), it could consume lots of address space through 2257 * fragmentation (especially on a 32bit machine). You could see failures in 2258 * the end. Please use this function for short-lived objects. 2259 * 2260 * Returns: a pointer to the address that has been mapped, or %NULL on failure 2261 */ 2262 void *vm_map_ram(struct page **pages, unsigned int count, int node) 2263 { 2264 unsigned long size = (unsigned long)count << PAGE_SHIFT; 2265 unsigned long addr; 2266 void *mem; 2267 2268 if (likely(count <= VMAP_MAX_ALLOC)) { 2269 mem = vb_alloc(size, GFP_KERNEL); 2270 if (IS_ERR(mem)) 2271 return NULL; 2272 addr = (unsigned long)mem; 2273 } else { 2274 struct vmap_area *va; 2275 va = alloc_vmap_area(size, PAGE_SIZE, 2276 VMALLOC_START, VMALLOC_END, node, GFP_KERNEL); 2277 if (IS_ERR(va)) 2278 return NULL; 2279 2280 addr = va->va_start; 2281 mem = (void *)addr; 2282 } 2283 2284 if (vmap_pages_range(addr, addr + size, PAGE_KERNEL, 2285 pages, PAGE_SHIFT) < 0) { 2286 vm_unmap_ram(mem, count); 2287 return NULL; 2288 } 2289 2290 /* 2291 * Mark the pages as accessible, now that they are mapped. 2292 * With hardware tag-based KASAN, marking is skipped for 2293 * non-VM_ALLOC mappings, see __kasan_unpoison_vmalloc(). 
2294 */ 2295 mem = kasan_unpoison_vmalloc(mem, size, KASAN_VMALLOC_PROT_NORMAL); 2296 2297 return mem; 2298 } 2299 EXPORT_SYMBOL(vm_map_ram); 2300 2301 static struct vm_struct *vmlist __initdata; 2302 2303 static inline unsigned int vm_area_page_order(struct vm_struct *vm) 2304 { 2305 #ifdef CONFIG_HAVE_ARCH_HUGE_VMALLOC 2306 return vm->page_order; 2307 #else 2308 return 0; 2309 #endif 2310 } 2311 2312 static inline void set_vm_area_page_order(struct vm_struct *vm, unsigned int order) 2313 { 2314 #ifdef CONFIG_HAVE_ARCH_HUGE_VMALLOC 2315 vm->page_order = order; 2316 #else 2317 BUG_ON(order != 0); 2318 #endif 2319 } 2320 2321 /** 2322 * vm_area_add_early - add vmap area early during boot 2323 * @vm: vm_struct to add 2324 * 2325 * This function is used to add fixed kernel vm area to vmlist before 2326 * vmalloc_init() is called. @vm->addr, @vm->size, and @vm->flags 2327 * should contain proper values and the other fields should be zero. 2328 * 2329 * DO NOT USE THIS FUNCTION UNLESS YOU KNOW WHAT YOU'RE DOING. 2330 */ 2331 void __init vm_area_add_early(struct vm_struct *vm) 2332 { 2333 struct vm_struct *tmp, **p; 2334 2335 BUG_ON(vmap_initialized); 2336 for (p = &vmlist; (tmp = *p) != NULL; p = &tmp->next) { 2337 if (tmp->addr >= vm->addr) { 2338 BUG_ON(tmp->addr < vm->addr + vm->size); 2339 break; 2340 } else 2341 BUG_ON(tmp->addr + tmp->size > vm->addr); 2342 } 2343 vm->next = *p; 2344 *p = vm; 2345 } 2346 2347 /** 2348 * vm_area_register_early - register vmap area early during boot 2349 * @vm: vm_struct to register 2350 * @align: requested alignment 2351 * 2352 * This function is used to register kernel vm area before 2353 * vmalloc_init() is called. @vm->size and @vm->flags should contain 2354 * proper values on entry and other fields should be zero. On return, 2355 * vm->addr contains the allocated address. 2356 * 2357 * DO NOT USE THIS FUNCTION UNLESS YOU KNOW WHAT YOU'RE DOING. 
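 *
 * A minimal, hypothetical sketch of an early-boot caller (illustrative
 * only; "early_vm" and the 2 MB size are placeholders, not taken from an
 * in-tree user):
 *
 *    static struct vm_struct early_vm;
 *
 *    early_vm.flags = VM_ALLOC;
 *    early_vm.size  = 2UL << 20;
 *    vm_area_register_early(&early_vm, PAGE_SIZE);
 *    /* early_vm.addr now holds the reserved kernel virtual range */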
2358 */ 2359 void __init vm_area_register_early(struct vm_struct *vm, size_t align) 2360 { 2361 unsigned long addr = ALIGN(VMALLOC_START, align); 2362 struct vm_struct *cur, **p; 2363 2364 BUG_ON(vmap_initialized); 2365 2366 for (p = &vmlist; (cur = *p) != NULL; p = &cur->next) { 2367 if ((unsigned long)cur->addr - addr >= vm->size) 2368 break; 2369 addr = ALIGN((unsigned long)cur->addr + cur->size, align); 2370 } 2371 2372 BUG_ON(addr > VMALLOC_END - vm->size); 2373 vm->addr = (void *)addr; 2374 vm->next = *p; 2375 *p = vm; 2376 kasan_populate_early_vm_area_shadow(vm->addr, vm->size); 2377 } 2378 2379 static void vmap_init_free_space(void) 2380 { 2381 unsigned long vmap_start = 1; 2382 const unsigned long vmap_end = ULONG_MAX; 2383 struct vmap_area *busy, *free; 2384 2385 /* 2386 * B F B B B F 2387 * -|-----|.....|-----|-----|-----|.....|- 2388 * | The KVA space | 2389 * |<--------------------------------->| 2390 */ 2391 list_for_each_entry(busy, &vmap_area_list, list) { 2392 if (busy->va_start - vmap_start > 0) { 2393 free = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT); 2394 if (!WARN_ON_ONCE(!free)) { 2395 free->va_start = vmap_start; 2396 free->va_end = busy->va_start; 2397 2398 insert_vmap_area_augment(free, NULL, 2399 &free_vmap_area_root, 2400 &free_vmap_area_list); 2401 } 2402 } 2403 2404 vmap_start = busy->va_end; 2405 } 2406 2407 if (vmap_end - vmap_start > 0) { 2408 free = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT); 2409 if (!WARN_ON_ONCE(!free)) { 2410 free->va_start = vmap_start; 2411 free->va_end = vmap_end; 2412 2413 insert_vmap_area_augment(free, NULL, 2414 &free_vmap_area_root, 2415 &free_vmap_area_list); 2416 } 2417 } 2418 } 2419 2420 void __init vmalloc_init(void) 2421 { 2422 struct vmap_area *va; 2423 struct vm_struct *tmp; 2424 int i; 2425 2426 /* 2427 * Create the cache for vmap_area objects. 2428 */ 2429 vmap_area_cachep = KMEM_CACHE(vmap_area, SLAB_PANIC); 2430 2431 for_each_possible_cpu(i) { 2432 struct vmap_block_queue *vbq; 2433 struct vfree_deferred *p; 2434 2435 vbq = &per_cpu(vmap_block_queue, i); 2436 spin_lock_init(&vbq->lock); 2437 INIT_LIST_HEAD(&vbq->free); 2438 p = &per_cpu(vfree_deferred, i); 2439 init_llist_head(&p->list); 2440 INIT_WORK(&p->wq, free_work); 2441 } 2442 2443 /* Import existing vmlist entries. */ 2444 for (tmp = vmlist; tmp; tmp = tmp->next) { 2445 va = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT); 2446 if (WARN_ON_ONCE(!va)) 2447 continue; 2448 2449 va->va_start = (unsigned long)tmp->addr; 2450 va->va_end = va->va_start + tmp->size; 2451 va->vm = tmp; 2452 insert_vmap_area(va, &vmap_area_root, &vmap_area_list); 2453 } 2454 2455 /* 2456 * Now we can initialize a free vmap space. 2457 */ 2458 vmap_init_free_space(); 2459 vmap_initialized = true; 2460 } 2461 2462 static inline void setup_vmalloc_vm_locked(struct vm_struct *vm, 2463 struct vmap_area *va, unsigned long flags, const void *caller) 2464 { 2465 vm->flags = flags; 2466 vm->addr = (void *)va->va_start; 2467 vm->size = va->va_end - va->va_start; 2468 vm->caller = caller; 2469 va->vm = vm; 2470 } 2471 2472 static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va, 2473 unsigned long flags, const void *caller) 2474 { 2475 spin_lock(&vmap_area_lock); 2476 setup_vmalloc_vm_locked(vm, va, flags, caller); 2477 spin_unlock(&vmap_area_lock); 2478 } 2479 2480 static void clear_vm_uninitialized_flag(struct vm_struct *vm) 2481 { 2482 /* 2483 * Before removing VM_UNINITIALIZED, 2484 * we should make sure that vm has proper values. 
2485 * Pair with smp_rmb() in show_numa_info(). 2486 */ 2487 smp_wmb(); 2488 vm->flags &= ~VM_UNINITIALIZED; 2489 } 2490 2491 static struct vm_struct *__get_vm_area_node(unsigned long size, 2492 unsigned long align, unsigned long shift, unsigned long flags, 2493 unsigned long start, unsigned long end, int node, 2494 gfp_t gfp_mask, const void *caller) 2495 { 2496 struct vmap_area *va; 2497 struct vm_struct *area; 2498 unsigned long requested_size = size; 2499 2500 BUG_ON(in_interrupt()); 2501 size = ALIGN(size, 1ul << shift); 2502 if (unlikely(!size)) 2503 return NULL; 2504 2505 if (flags & VM_IOREMAP) 2506 align = 1ul << clamp_t(int, get_count_order_long(size), 2507 PAGE_SHIFT, IOREMAP_MAX_ORDER); 2508 2509 area = kzalloc_node(sizeof(*area), gfp_mask & GFP_RECLAIM_MASK, node); 2510 if (unlikely(!area)) 2511 return NULL; 2512 2513 if (!(flags & VM_NO_GUARD)) 2514 size += PAGE_SIZE; 2515 2516 va = alloc_vmap_area(size, align, start, end, node, gfp_mask); 2517 if (IS_ERR(va)) { 2518 kfree(area); 2519 return NULL; 2520 } 2521 2522 setup_vmalloc_vm(area, va, flags, caller); 2523 2524 /* 2525 * Mark pages for non-VM_ALLOC mappings as accessible. Do it now as a 2526 * best-effort approach, as they can be mapped outside of vmalloc code. 2527 * For VM_ALLOC mappings, the pages are marked as accessible after 2528 * getting mapped in __vmalloc_node_range(). 2529 * With hardware tag-based KASAN, marking is skipped for 2530 * non-VM_ALLOC mappings, see __kasan_unpoison_vmalloc(). 2531 */ 2532 if (!(flags & VM_ALLOC)) 2533 area->addr = kasan_unpoison_vmalloc(area->addr, requested_size, 2534 KASAN_VMALLOC_PROT_NORMAL); 2535 2536 return area; 2537 } 2538 2539 struct vm_struct *__get_vm_area_caller(unsigned long size, unsigned long flags, 2540 unsigned long start, unsigned long end, 2541 const void *caller) 2542 { 2543 return __get_vm_area_node(size, 1, PAGE_SHIFT, flags, start, end, 2544 NUMA_NO_NODE, GFP_KERNEL, caller); 2545 } 2546 2547 /** 2548 * get_vm_area - reserve a contiguous kernel virtual area 2549 * @size: size of the area 2550 * @flags: %VM_IOREMAP for I/O mappings or VM_ALLOC 2551 * 2552 * Search an area of @size in the kernel virtual mapping area, 2553 * and reserve it for our purposes. Returns the area descriptor 2554 * on success or %NULL on failure. 2555 * 2556 * Return: the area descriptor on success or %NULL on failure. 2557 */ 2558 struct vm_struct *get_vm_area(unsigned long size, unsigned long flags) 2559 { 2560 return __get_vm_area_node(size, 1, PAGE_SHIFT, flags, 2561 VMALLOC_START, VMALLOC_END, 2562 NUMA_NO_NODE, GFP_KERNEL, 2563 __builtin_return_address(0)); 2564 } 2565 2566 struct vm_struct *get_vm_area_caller(unsigned long size, unsigned long flags, 2567 const void *caller) 2568 { 2569 return __get_vm_area_node(size, 1, PAGE_SHIFT, flags, 2570 VMALLOC_START, VMALLOC_END, 2571 NUMA_NO_NODE, GFP_KERNEL, caller); 2572 } 2573 2574 /** 2575 * find_vm_area - find a continuous kernel virtual area 2576 * @addr: base address 2577 * 2578 * Search for the kernel VM area starting at @addr, and return it. 2579 * It is up to the caller to do all required locking to keep the returned 2580 * pointer valid. 2581 * 2582 * Return: the area descriptor on success or %NULL on failure.
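 *
 * Illustrative sketch (hypothetical caller; "ptr" is a placeholder for a
 * pointer known to lie inside a vmalloc area that stays alive across the
 * lookup):
 *
 *    struct vm_struct *area = find_vm_area(ptr);
 *
 *    if (area)
 *        pr_debug("%p: %lu bytes, flags %#lx\n", ptr,
 *                 get_vm_area_size(area), area->flags);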
2583 */ 2584 struct vm_struct *find_vm_area(const void *addr) 2585 { 2586 struct vmap_area *va; 2587 2588 va = find_vmap_area((unsigned long)addr); 2589 if (!va) 2590 return NULL; 2591 2592 return va->vm; 2593 } 2594 2595 /** 2596 * remove_vm_area - find and remove a continuous kernel virtual area 2597 * @addr: base address 2598 * 2599 * Search for the kernel VM area starting at @addr, and remove it. 2600 * This function returns the found VM area, but using it is NOT safe 2601 * on SMP machines, except for its size or flags. 2602 * 2603 * Return: the area descriptor on success or %NULL on failure. 2604 */ 2605 struct vm_struct *remove_vm_area(const void *addr) 2606 { 2607 struct vmap_area *va; 2608 2609 might_sleep(); 2610 2611 spin_lock(&vmap_area_lock); 2612 va = __find_vmap_area((unsigned long)addr, &vmap_area_root); 2613 if (va && va->vm) { 2614 struct vm_struct *vm = va->vm; 2615 2616 va->vm = NULL; 2617 spin_unlock(&vmap_area_lock); 2618 2619 kasan_free_module_shadow(vm); 2620 free_unmap_vmap_area(va); 2621 2622 return vm; 2623 } 2624 2625 spin_unlock(&vmap_area_lock); 2626 return NULL; 2627 } 2628 2629 static inline void set_area_direct_map(const struct vm_struct *area, 2630 int (*set_direct_map)(struct page *page)) 2631 { 2632 int i; 2633 2634 /* HUGE_VMALLOC passes small pages to set_direct_map */ 2635 for (i = 0; i < area->nr_pages; i++) 2636 if (page_address(area->pages[i])) 2637 set_direct_map(area->pages[i]); 2638 } 2639 2640 /* Handle removing and resetting vm mappings related to the vm_struct. */ 2641 static void vm_remove_mappings(struct vm_struct *area, int deallocate_pages) 2642 { 2643 unsigned long start = ULONG_MAX, end = 0; 2644 unsigned int page_order = vm_area_page_order(area); 2645 int flush_reset = area->flags & VM_FLUSH_RESET_PERMS; 2646 int flush_dmap = 0; 2647 int i; 2648 2649 remove_vm_area(area->addr); 2650 2651 /* If this is not VM_FLUSH_RESET_PERMS memory, no need for the below. */ 2652 if (!flush_reset) 2653 return; 2654 2655 /* 2656 * If not deallocating pages, just do the flush of the VM area and 2657 * return. 2658 */ 2659 if (!deallocate_pages) { 2660 vm_unmap_aliases(); 2661 return; 2662 } 2663 2664 /* 2665 * If execution gets here, flush the vm mapping and reset the direct 2666 * map. Find the start and end range of the direct mappings to make sure 2667 * the vm_unmap_aliases() flush includes the direct map. 2668 */ 2669 for (i = 0; i < area->nr_pages; i += 1U << page_order) { 2670 unsigned long addr = (unsigned long)page_address(area->pages[i]); 2671 if (addr) { 2672 unsigned long page_size; 2673 2674 page_size = PAGE_SIZE << page_order; 2675 start = min(addr, start); 2676 end = max(addr + page_size, end); 2677 flush_dmap = 1; 2678 } 2679 } 2680 2681 /* 2682 * Set direct map to something invalid so that it won't be cached if 2683 * there are any accesses after the TLB flush, then flush the TLB and 2684 * reset the direct map permissions to the default. 
2685 */ 2686 set_area_direct_map(area, set_direct_map_invalid_noflush); 2687 _vm_unmap_aliases(start, end, flush_dmap); 2688 set_area_direct_map(area, set_direct_map_default_noflush); 2689 } 2690 2691 static void __vunmap(const void *addr, int deallocate_pages) 2692 { 2693 struct vm_struct *area; 2694 2695 if (!addr) 2696 return; 2697 2698 if (WARN(!PAGE_ALIGNED(addr), "Trying to vfree() bad address (%p)\n", 2699 addr)) 2700 return; 2701 2702 area = find_vm_area(addr); 2703 if (unlikely(!area)) { 2704 WARN(1, KERN_ERR "Trying to vfree() nonexistent vm area (%p)\n", 2705 addr); 2706 return; 2707 } 2708 2709 debug_check_no_locks_freed(area->addr, get_vm_area_size(area)); 2710 debug_check_no_obj_freed(area->addr, get_vm_area_size(area)); 2711 2712 kasan_poison_vmalloc(area->addr, get_vm_area_size(area)); 2713 2714 vm_remove_mappings(area, deallocate_pages); 2715 2716 if (deallocate_pages) { 2717 int i; 2718 2719 for (i = 0; i < area->nr_pages; i++) { 2720 struct page *page = area->pages[i]; 2721 2722 BUG_ON(!page); 2723 mod_memcg_page_state(page, MEMCG_VMALLOC, -1); 2724 /* 2725 * High-order allocs for huge vmallocs are split, so 2726 * can be freed as an array of order-0 allocations 2727 */ 2728 __free_pages(page, 0); 2729 cond_resched(); 2730 } 2731 atomic_long_sub(area->nr_pages, &nr_vmalloc_pages); 2732 2733 kvfree(area->pages); 2734 } 2735 2736 kfree(area); 2737 } 2738 2739 static inline void __vfree_deferred(const void *addr) 2740 { 2741 /* 2742 * Use raw_cpu_ptr() because this can be called from preemptible 2743 * context. Preemption is absolutely fine here, because the llist_add() 2744 * implementation is lockless, so it works even if we are adding to 2745 * another cpu's list. schedule_work() should be fine with this too. 2746 */ 2747 struct vfree_deferred *p = raw_cpu_ptr(&vfree_deferred); 2748 2749 if (llist_add((struct llist_node *)addr, &p->list)) 2750 schedule_work(&p->wq); 2751 } 2752 2753 /** 2754 * vfree_atomic - release memory allocated by vmalloc() 2755 * @addr: memory base address 2756 * 2757 * This one is just like vfree() but can be called in any atomic context 2758 * except NMIs. 2759 */ 2760 void vfree_atomic(const void *addr) 2761 { 2762 BUG_ON(in_nmi()); 2763 2764 kmemleak_free(addr); 2765 2766 if (!addr) 2767 return; 2768 __vfree_deferred(addr); 2769 } 2770 2771 static void __vfree(const void *addr) 2772 { 2773 if (unlikely(in_interrupt())) 2774 __vfree_deferred(addr); 2775 else 2776 __vunmap(addr, 1); 2777 } 2778 2779 /** 2780 * vfree - Release memory allocated by vmalloc() 2781 * @addr: Memory base address 2782 * 2783 * Free the virtually continuous memory area starting at @addr, as obtained 2784 * from one of the vmalloc() family of APIs. This will usually also free the 2785 * physical memory underlying the virtual allocation, but that memory is 2786 * reference counted, so it will not be freed until the last user goes away. 2787 * 2788 * If @addr is NULL, no operation is performed. 2789 * 2790 * Context: 2791 * May sleep if called *not* from interrupt context. 2792 * Must not be called in NMI context (strictly speaking, it could be 2793 * if we have CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG, but making the calling 2794 * conventions for vfree() arch-dependent would be a really bad idea). 
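 *
 * A hedged sketch of the context rules above (hypothetical code; names
 * are placeholders):
 *
 *    void *buf = vmalloc(len);
 *    ...
 *    vfree(buf);        /* process context: fine, may sleep */
 *
 * From an atomic section (but never NMI), defer the free instead:
 *
 *    spin_lock(&lock);
 *    ...
 *    vfree_atomic(buf);    /* queued and freed later from a workqueue */
 *    spin_unlock(&lock);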
2795 */ 2796 void vfree(const void *addr) 2797 { 2798 BUG_ON(in_nmi()); 2799 2800 kmemleak_free(addr); 2801 2802 might_sleep_if(!in_interrupt()); 2803 2804 if (!addr) 2805 return; 2806 2807 __vfree(addr); 2808 } 2809 EXPORT_SYMBOL(vfree); 2810 2811 /** 2812 * vunmap - release virtual mapping obtained by vmap() 2813 * @addr: memory base address 2814 * 2815 * Free the virtually contiguous memory area starting at @addr, 2816 * which was created from the page array passed to vmap(). 2817 * 2818 * Must not be called in interrupt context. 2819 */ 2820 void vunmap(const void *addr) 2821 { 2822 BUG_ON(in_interrupt()); 2823 might_sleep(); 2824 if (addr) 2825 __vunmap(addr, 0); 2826 } 2827 EXPORT_SYMBOL(vunmap); 2828 2829 /** 2830 * vmap - map an array of pages into virtually contiguous space 2831 * @pages: array of page pointers 2832 * @count: number of pages to map 2833 * @flags: vm_area->flags 2834 * @prot: page protection for the mapping 2835 * 2836 * Maps @count pages from @pages into contiguous kernel virtual space. 2837 * If @flags contains %VM_MAP_PUT_PAGES the ownership of the pages array itself 2838 * (which must be kmalloc or vmalloc memory) and one reference per pages in it 2839 * are transferred from the caller to vmap(), and will be freed / dropped when 2840 * vfree() is called on the return value. 2841 * 2842 * Return: the address of the area or %NULL on failure 2843 */ 2844 void *vmap(struct page **pages, unsigned int count, 2845 unsigned long flags, pgprot_t prot) 2846 { 2847 struct vm_struct *area; 2848 unsigned long addr; 2849 unsigned long size; /* In bytes */ 2850 2851 might_sleep(); 2852 2853 /* 2854 * Your top guard is someone else's bottom guard. Not having a top 2855 * guard compromises someone else's mappings too. 2856 */ 2857 if (WARN_ON_ONCE(flags & VM_NO_GUARD)) 2858 flags &= ~VM_NO_GUARD; 2859 2860 if (count > totalram_pages()) 2861 return NULL; 2862 2863 size = (unsigned long)count << PAGE_SHIFT; 2864 area = get_vm_area_caller(size, flags, __builtin_return_address(0)); 2865 if (!area) 2866 return NULL; 2867 2868 addr = (unsigned long)area->addr; 2869 if (vmap_pages_range(addr, addr + size, pgprot_nx(prot), 2870 pages, PAGE_SHIFT) < 0) { 2871 vunmap(area->addr); 2872 return NULL; 2873 } 2874 2875 if (flags & VM_MAP_PUT_PAGES) { 2876 area->pages = pages; 2877 area->nr_pages = count; 2878 } 2879 return area->addr; 2880 } 2881 EXPORT_SYMBOL(vmap); 2882 2883 #ifdef CONFIG_VMAP_PFN 2884 struct vmap_pfn_data { 2885 unsigned long *pfns; 2886 pgprot_t prot; 2887 unsigned int idx; 2888 }; 2889 2890 static int vmap_pfn_apply(pte_t *pte, unsigned long addr, void *private) 2891 { 2892 struct vmap_pfn_data *data = private; 2893 2894 if (WARN_ON_ONCE(pfn_valid(data->pfns[data->idx]))) 2895 return -EINVAL; 2896 *pte = pte_mkspecial(pfn_pte(data->pfns[data->idx++], data->prot)); 2897 return 0; 2898 } 2899 2900 /** 2901 * vmap_pfn - map an array of PFNs into virtually contiguous space 2902 * @pfns: array of PFNs 2903 * @count: number of pages to map 2904 * @prot: page protection for the mapping 2905 * 2906 * Maps @count PFNs from @pfns into contiguous kernel virtual space and returns 2907 * the start address of the mapping. 
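 *
 * This is intended for PFNs that have no struct page behind them (the
 * helper rejects pfn_valid() PFNs). A hypothetical sketch for a driver
 * exposing @count pages of device memory starting at "base_pfn" (the
 * names and the pgprot choice are placeholders):
 *
 *    unsigned long *pfns = kmalloc_array(count, sizeof(*pfns), GFP_KERNEL);
 *    unsigned int i;
 *    void *vaddr;
 *
 *    if (!pfns)
 *        return NULL;
 *    for (i = 0; i < count; i++)
 *        pfns[i] = base_pfn + i;
 *    vaddr = vmap_pfn(pfns, count, pgprot_noncached(PAGE_KERNEL));
 *    kfree(pfns);    /* the PFN array is not referenced after return */
 *
 * The mapping can later be torn down with vunmap().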
2908 */ 2909 void *vmap_pfn(unsigned long *pfns, unsigned int count, pgprot_t prot) 2910 { 2911 struct vmap_pfn_data data = { .pfns = pfns, .prot = pgprot_nx(prot) }; 2912 struct vm_struct *area; 2913 2914 area = get_vm_area_caller(count * PAGE_SIZE, VM_IOREMAP, 2915 __builtin_return_address(0)); 2916 if (!area) 2917 return NULL; 2918 if (apply_to_page_range(&init_mm, (unsigned long)area->addr, 2919 count * PAGE_SIZE, vmap_pfn_apply, &data)) { 2920 free_vm_area(area); 2921 return NULL; 2922 } 2923 return area->addr; 2924 } 2925 EXPORT_SYMBOL_GPL(vmap_pfn); 2926 #endif /* CONFIG_VMAP_PFN */ 2927 2928 static inline unsigned int 2929 vm_area_alloc_pages(gfp_t gfp, int nid, 2930 unsigned int order, unsigned int nr_pages, struct page **pages) 2931 { 2932 unsigned int nr_allocated = 0; 2933 struct page *page; 2934 int i; 2935 2936 /* 2937 * For order-0 pages we make use of the bulk allocator. If 2938 * the page array ends up only partly populated (or not populated at all) due 2939 * to allocation failures, fall back to a single page allocator that is 2940 * more permissive. 2941 */ 2942 if (!order) { 2943 gfp_t bulk_gfp = gfp & ~__GFP_NOFAIL; 2944 2945 while (nr_allocated < nr_pages) { 2946 unsigned int nr, nr_pages_request; 2947 2948 /* 2949 * The maximum allowed request is hard-coded to 100 2950 * pages per call, in order to prevent a 2951 * long preemption-off scenario in the bulk allocator, 2952 * so the range is [1:100]. 2953 */ 2954 nr_pages_request = min(100U, nr_pages - nr_allocated); 2955 2956 /* Memory allocation should honour the mempolicy: we must not 2957 * blindly use the nearest node when nid == NUMA_NO_NODE, 2958 * otherwise memory may be allocated on only one node 2959 * while the mempolicy wants allocations interleaved across nodes. 2960 */ 2961 if (IS_ENABLED(CONFIG_NUMA) && nid == NUMA_NO_NODE) 2962 nr = alloc_pages_bulk_array_mempolicy(bulk_gfp, 2963 nr_pages_request, 2964 pages + nr_allocated); 2965 2966 else 2967 nr = alloc_pages_bulk_array_node(bulk_gfp, nid, 2968 nr_pages_request, 2969 pages + nr_allocated); 2970 2971 nr_allocated += nr; 2972 cond_resched(); 2973 2974 /* 2975 * If no pages were obtained, or the request was only partly satisfied, 2976 * fall back to a single page allocator. 2977 */ 2978 if (nr != nr_pages_request) 2979 break; 2980 } 2981 } 2982 2983 /* High-order pages or fallback path if "bulk" fails. */ 2984 2985 while (nr_allocated < nr_pages) { 2986 if (fatal_signal_pending(current)) 2987 break; 2988 2989 if (nid == NUMA_NO_NODE) 2990 page = alloc_pages(gfp, order); 2991 else 2992 page = alloc_pages_node(nid, gfp, order); 2993 if (unlikely(!page)) 2994 break; 2995 /* 2996 * Higher order allocations must be able to be treated as 2997 * independent small pages by callers (as they can with 2998 * small-page vmallocs). Some drivers do their own refcounting 2999 * on vmalloc_to_page() pages, some use page->mapping, 3000 * page->lru, etc. 3001 */ 3002 if (order) 3003 split_page(page, order); 3004 3005 /* 3006 * Careful, we allocate and map page-order pages, but 3007 * tracking is done per PAGE_SIZE page so as to keep the 3008 * vm_struct APIs independent of the physical/mapped size.
3009 */ 3010 for (i = 0; i < (1U << order); i++) 3011 pages[nr_allocated + i] = page + i; 3012 3013 cond_resched(); 3014 nr_allocated += 1U << order; 3015 } 3016 3017 return nr_allocated; 3018 } 3019 3020 static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, 3021 pgprot_t prot, unsigned int page_shift, 3022 int node) 3023 { 3024 const gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO; 3025 bool nofail = gfp_mask & __GFP_NOFAIL; 3026 unsigned long addr = (unsigned long)area->addr; 3027 unsigned long size = get_vm_area_size(area); 3028 unsigned long array_size; 3029 unsigned int nr_small_pages = size >> PAGE_SHIFT; 3030 unsigned int page_order; 3031 unsigned int flags; 3032 int ret; 3033 3034 array_size = (unsigned long)nr_small_pages * sizeof(struct page *); 3035 gfp_mask |= __GFP_NOWARN; 3036 if (!(gfp_mask & (GFP_DMA | GFP_DMA32))) 3037 gfp_mask |= __GFP_HIGHMEM; 3038 3039 /* Please note that the recursion is strictly bounded. */ 3040 if (array_size > PAGE_SIZE) { 3041 area->pages = __vmalloc_node(array_size, 1, nested_gfp, node, 3042 area->caller); 3043 } else { 3044 area->pages = kmalloc_node(array_size, nested_gfp, node); 3045 } 3046 3047 if (!area->pages) { 3048 warn_alloc(gfp_mask, NULL, 3049 "vmalloc error: size %lu, failed to allocated page array size %lu", 3050 nr_small_pages * PAGE_SIZE, array_size); 3051 free_vm_area(area); 3052 return NULL; 3053 } 3054 3055 set_vm_area_page_order(area, page_shift - PAGE_SHIFT); 3056 page_order = vm_area_page_order(area); 3057 3058 area->nr_pages = vm_area_alloc_pages(gfp_mask | __GFP_NOWARN, 3059 node, page_order, nr_small_pages, area->pages); 3060 3061 atomic_long_add(area->nr_pages, &nr_vmalloc_pages); 3062 if (gfp_mask & __GFP_ACCOUNT) { 3063 int i; 3064 3065 for (i = 0; i < area->nr_pages; i++) 3066 mod_memcg_page_state(area->pages[i], MEMCG_VMALLOC, 1); 3067 } 3068 3069 /* 3070 * If not enough pages were obtained to accomplish an 3071 * allocation request, free them via __vfree() if any. 3072 */ 3073 if (area->nr_pages != nr_small_pages) { 3074 warn_alloc(gfp_mask, NULL, 3075 "vmalloc error: size %lu, page order %u, failed to allocate pages", 3076 area->nr_pages * PAGE_SIZE, page_order); 3077 goto fail; 3078 } 3079 3080 /* 3081 * page tables allocations ignore external gfp mask, enforce it 3082 * by the scope API 3083 */ 3084 if ((gfp_mask & (__GFP_FS | __GFP_IO)) == __GFP_IO) 3085 flags = memalloc_nofs_save(); 3086 else if ((gfp_mask & (__GFP_FS | __GFP_IO)) == 0) 3087 flags = memalloc_noio_save(); 3088 3089 do { 3090 ret = vmap_pages_range(addr, addr + size, prot, area->pages, 3091 page_shift); 3092 if (nofail && (ret < 0)) 3093 schedule_timeout_uninterruptible(1); 3094 } while (nofail && (ret < 0)); 3095 3096 if ((gfp_mask & (__GFP_FS | __GFP_IO)) == __GFP_IO) 3097 memalloc_nofs_restore(flags); 3098 else if ((gfp_mask & (__GFP_FS | __GFP_IO)) == 0) 3099 memalloc_noio_restore(flags); 3100 3101 if (ret < 0) { 3102 warn_alloc(gfp_mask, NULL, 3103 "vmalloc error: size %lu, failed to map pages", 3104 area->nr_pages * PAGE_SIZE); 3105 goto fail; 3106 } 3107 3108 return area->addr; 3109 3110 fail: 3111 __vfree(area->addr); 3112 return NULL; 3113 } 3114 3115 /** 3116 * __vmalloc_node_range - allocate virtually contiguous memory 3117 * @size: allocation size 3118 * @align: desired alignment 3119 * @start: vm area range start 3120 * @end: vm area range end 3121 * @gfp_mask: flags for the page level allocator 3122 * @prot: protection mask for the allocated pages 3123 * @vm_flags: additional vm area flags (e.g. 
%VM_NO_GUARD) 3124 * @node: node to use for allocation or NUMA_NO_NODE 3125 * @caller: caller's return address 3126 * 3127 * Allocate enough pages to cover @size from the page level 3128 * allocator with @gfp_mask flags. Please note that the full set of gfp 3129 * flags are not supported. GFP_KERNEL, GFP_NOFS and GFP_NOIO are all 3130 * supported. 3131 * Zone modifiers are not supported. From the reclaim modifiers 3132 * __GFP_DIRECT_RECLAIM is required (aka GFP_NOWAIT is not supported) 3133 * and only __GFP_NOFAIL is supported (i.e. __GFP_NORETRY and 3134 * __GFP_RETRY_MAYFAIL are not supported). 3135 * 3136 * __GFP_NOWARN can be used to suppress failures messages. 3137 * 3138 * Map them into contiguous kernel virtual space, using a pagetable 3139 * protection of @prot. 3140 * 3141 * Return: the address of the area or %NULL on failure 3142 */ 3143 void *__vmalloc_node_range(unsigned long size, unsigned long align, 3144 unsigned long start, unsigned long end, gfp_t gfp_mask, 3145 pgprot_t prot, unsigned long vm_flags, int node, 3146 const void *caller) 3147 { 3148 struct vm_struct *area; 3149 void *ret; 3150 kasan_vmalloc_flags_t kasan_flags = KASAN_VMALLOC_NONE; 3151 unsigned long real_size = size; 3152 unsigned long real_align = align; 3153 unsigned int shift = PAGE_SHIFT; 3154 3155 if (WARN_ON_ONCE(!size)) 3156 return NULL; 3157 3158 if ((size >> PAGE_SHIFT) > totalram_pages()) { 3159 warn_alloc(gfp_mask, NULL, 3160 "vmalloc error: size %lu, exceeds total pages", 3161 real_size); 3162 return NULL; 3163 } 3164 3165 if (vmap_allow_huge && (vm_flags & VM_ALLOW_HUGE_VMAP)) { 3166 unsigned long size_per_node; 3167 3168 /* 3169 * Try huge pages. Only try for PAGE_KERNEL allocations, 3170 * others like modules don't yet expect huge pages in 3171 * their allocations due to apply_to_page_range not 3172 * supporting them. 3173 */ 3174 3175 size_per_node = size; 3176 if (node == NUMA_NO_NODE) 3177 size_per_node /= num_online_nodes(); 3178 if (arch_vmap_pmd_supported(prot) && size_per_node >= PMD_SIZE) 3179 shift = PMD_SHIFT; 3180 else 3181 shift = arch_vmap_pte_supported_shift(size_per_node); 3182 3183 align = max(real_align, 1UL << shift); 3184 size = ALIGN(real_size, 1UL << shift); 3185 } 3186 3187 again: 3188 area = __get_vm_area_node(real_size, align, shift, VM_ALLOC | 3189 VM_UNINITIALIZED | vm_flags, start, end, node, 3190 gfp_mask, caller); 3191 if (!area) { 3192 bool nofail = gfp_mask & __GFP_NOFAIL; 3193 warn_alloc(gfp_mask, NULL, 3194 "vmalloc error: size %lu, vm_struct allocation failed%s", 3195 real_size, (nofail) ? ". Retrying." : ""); 3196 if (nofail) { 3197 schedule_timeout_uninterruptible(1); 3198 goto again; 3199 } 3200 goto fail; 3201 } 3202 3203 /* 3204 * Prepare arguments for __vmalloc_area_node() and 3205 * kasan_unpoison_vmalloc(). 3206 */ 3207 if (pgprot_val(prot) == pgprot_val(PAGE_KERNEL)) { 3208 if (kasan_hw_tags_enabled()) { 3209 /* 3210 * Modify protection bits to allow tagging. 3211 * This must be done before mapping. 3212 */ 3213 prot = arch_vmap_pgprot_tagged(prot); 3214 3215 /* 3216 * Skip page_alloc poisoning and zeroing for physical 3217 * pages backing VM_ALLOC mapping. Memory is instead 3218 * poisoned and zeroed by kasan_unpoison_vmalloc(). 3219 */ 3220 gfp_mask |= __GFP_SKIP_KASAN_UNPOISON | __GFP_SKIP_ZERO; 3221 } 3222 3223 /* Take note that the mapping is PAGE_KERNEL. */ 3224 kasan_flags |= KASAN_VMALLOC_PROT_NORMAL; 3225 } 3226 3227 /* Allocate physical pages and map them into vmalloc space. 
*/ 3228 ret = __vmalloc_area_node(area, gfp_mask, prot, shift, node); 3229 if (!ret) 3230 goto fail; 3231 3232 /* 3233 * Mark the pages as accessible, now that they are mapped. 3234 * The condition for setting KASAN_VMALLOC_INIT should complement the 3235 * one in post_alloc_hook() with regards to the __GFP_SKIP_ZERO check 3236 * to make sure that memory is initialized under the same conditions. 3237 * Tag-based KASAN modes only assign tags to normal non-executable 3238 * allocations, see __kasan_unpoison_vmalloc(). 3239 */ 3240 kasan_flags |= KASAN_VMALLOC_VM_ALLOC; 3241 if (!want_init_on_free() && want_init_on_alloc(gfp_mask) && 3242 (gfp_mask & __GFP_SKIP_ZERO)) 3243 kasan_flags |= KASAN_VMALLOC_INIT; 3244 /* KASAN_VMALLOC_PROT_NORMAL already set if required. */ 3245 area->addr = kasan_unpoison_vmalloc(area->addr, real_size, kasan_flags); 3246 3247 /* 3248 * In this function, newly allocated vm_struct has VM_UNINITIALIZED 3249 * flag. It means that vm_struct is not fully initialized. 3250 * Now, it is fully initialized, so remove this flag here. 3251 */ 3252 clear_vm_uninitialized_flag(area); 3253 3254 size = PAGE_ALIGN(size); 3255 if (!(vm_flags & VM_DEFER_KMEMLEAK)) 3256 kmemleak_vmalloc(area, size, gfp_mask); 3257 3258 return area->addr; 3259 3260 fail: 3261 if (shift > PAGE_SHIFT) { 3262 shift = PAGE_SHIFT; 3263 align = real_align; 3264 size = real_size; 3265 goto again; 3266 } 3267 3268 return NULL; 3269 } 3270 3271 /** 3272 * __vmalloc_node - allocate virtually contiguous memory 3273 * @size: allocation size 3274 * @align: desired alignment 3275 * @gfp_mask: flags for the page level allocator 3276 * @node: node to use for allocation or NUMA_NO_NODE 3277 * @caller: caller's return address 3278 * 3279 * Allocate enough pages to cover @size from the page level allocator with 3280 * @gfp_mask flags. Map them into contiguous kernel virtual space. 3281 * 3282 * Reclaim modifiers in @gfp_mask - __GFP_NORETRY, __GFP_RETRY_MAYFAIL 3283 * and __GFP_NOFAIL are not supported 3284 * 3285 * Any use of gfp flags outside of GFP_KERNEL should be consulted 3286 * with mm people. 3287 * 3288 * Return: pointer to the allocated memory or %NULL on error 3289 */ 3290 void *__vmalloc_node(unsigned long size, unsigned long align, 3291 gfp_t gfp_mask, int node, const void *caller) 3292 { 3293 return __vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END, 3294 gfp_mask, PAGE_KERNEL, 0, node, caller); 3295 } 3296 /* 3297 * This is only for performance analysis of vmalloc and stress purpose. 3298 * It is required by vmalloc test module, therefore do not use it other 3299 * than that. 3300 */ 3301 #ifdef CONFIG_TEST_VMALLOC_MODULE 3302 EXPORT_SYMBOL_GPL(__vmalloc_node); 3303 #endif 3304 3305 void *__vmalloc(unsigned long size, gfp_t gfp_mask) 3306 { 3307 return __vmalloc_node(size, 1, gfp_mask, NUMA_NO_NODE, 3308 __builtin_return_address(0)); 3309 } 3310 EXPORT_SYMBOL(__vmalloc); 3311 3312 /** 3313 * vmalloc - allocate virtually contiguous memory 3314 * @size: allocation size 3315 * 3316 * Allocate enough pages to cover @size from the page level 3317 * allocator and map them into contiguous kernel virtual space. 3318 * 3319 * For tight control over page level allocator and protection flags 3320 * use __vmalloc() instead. 
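 *
 * A minimal usage sketch (illustrative; "nr" and "struct entry" are
 * placeholders):
 *
 *    struct entry *tbl = vmalloc(array_size(nr, sizeof(*tbl)));
 *
 *    if (!tbl)
 *        return -ENOMEM;
 *    ...
 *    vfree(tbl);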
3321 * 3322 * Return: pointer to the allocated memory or %NULL on error 3323 */ 3324 void *vmalloc(unsigned long size) 3325 { 3326 return __vmalloc_node(size, 1, GFP_KERNEL, NUMA_NO_NODE, 3327 __builtin_return_address(0)); 3328 } 3329 EXPORT_SYMBOL(vmalloc); 3330 3331 /** 3332 * vmalloc_huge - allocate virtually contiguous memory, allow huge pages 3333 * @size: allocation size 3334 * @gfp_mask: flags for the page level allocator 3335 * 3336 * Allocate enough pages to cover @size from the page level 3337 * allocator and map them into contiguous kernel virtual space. 3338 * If @size is greater than or equal to PMD_SIZE, allow using 3339 * huge pages for the memory 3340 * 3341 * Return: pointer to the allocated memory or %NULL on error 3342 */ 3343 void *vmalloc_huge(unsigned long size, gfp_t gfp_mask) 3344 { 3345 return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END, 3346 gfp_mask, PAGE_KERNEL, VM_ALLOW_HUGE_VMAP, 3347 NUMA_NO_NODE, __builtin_return_address(0)); 3348 } 3349 EXPORT_SYMBOL_GPL(vmalloc_huge); 3350 3351 /** 3352 * vzalloc - allocate virtually contiguous memory with zero fill 3353 * @size: allocation size 3354 * 3355 * Allocate enough pages to cover @size from the page level 3356 * allocator and map them into contiguous kernel virtual space. 3357 * The memory allocated is set to zero. 3358 * 3359 * For tight control over page level allocator and protection flags 3360 * use __vmalloc() instead. 3361 * 3362 * Return: pointer to the allocated memory or %NULL on error 3363 */ 3364 void *vzalloc(unsigned long size) 3365 { 3366 return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_ZERO, NUMA_NO_NODE, 3367 __builtin_return_address(0)); 3368 } 3369 EXPORT_SYMBOL(vzalloc); 3370 3371 /** 3372 * vmalloc_user - allocate zeroed virtually contiguous memory for userspace 3373 * @size: allocation size 3374 * 3375 * The resulting memory area is zeroed so it can be mapped to userspace 3376 * without leaking data. 3377 * 3378 * Return: pointer to the allocated memory or %NULL on error 3379 */ 3380 void *vmalloc_user(unsigned long size) 3381 { 3382 return __vmalloc_node_range(size, SHMLBA, VMALLOC_START, VMALLOC_END, 3383 GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL, 3384 VM_USERMAP, NUMA_NO_NODE, 3385 __builtin_return_address(0)); 3386 } 3387 EXPORT_SYMBOL(vmalloc_user); 3388 3389 /** 3390 * vmalloc_node - allocate memory on a specific node 3391 * @size: allocation size 3392 * @node: numa node 3393 * 3394 * Allocate enough pages to cover @size from the page level 3395 * allocator and map them into contiguous kernel virtual space. 3396 * 3397 * For tight control over page level allocator and protection flags 3398 * use __vmalloc() instead. 3399 * 3400 * Return: pointer to the allocated memory or %NULL on error 3401 */ 3402 void *vmalloc_node(unsigned long size, int node) 3403 { 3404 return __vmalloc_node(size, 1, GFP_KERNEL, node, 3405 __builtin_return_address(0)); 3406 } 3407 EXPORT_SYMBOL(vmalloc_node); 3408 3409 /** 3410 * vzalloc_node - allocate memory on a specific node with zero fill 3411 * @size: allocation size 3412 * @node: numa node 3413 * 3414 * Allocate enough pages to cover @size from the page level 3415 * allocator and map them into contiguous kernel virtual space. 3416 * The memory allocated is set to zero. 
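 *
 * Hypothetical sketch ("size" and "nid" are placeholders): a pre-zeroed,
 * node-local buffer could be obtained with
 *
 *    void *scratch = vzalloc_node(size, nid);
 *
 *    if (!scratch)
 *        return -ENOMEM;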
3417 * 3418 * Return: pointer to the allocated memory or %NULL on error 3419 */ 3420 void *vzalloc_node(unsigned long size, int node) 3421 { 3422 return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_ZERO, node, 3423 __builtin_return_address(0)); 3424 } 3425 EXPORT_SYMBOL(vzalloc_node); 3426 3427 #if defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA32) 3428 #define GFP_VMALLOC32 (GFP_DMA32 | GFP_KERNEL) 3429 #elif defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA) 3430 #define GFP_VMALLOC32 (GFP_DMA | GFP_KERNEL) 3431 #else 3432 /* 3433 * 64b systems should always have either DMA or DMA32 zones. For others 3434 * GFP_DMA32 should do the right thing and use the normal zone. 3435 */ 3436 #define GFP_VMALLOC32 (GFP_DMA32 | GFP_KERNEL) 3437 #endif 3438 3439 /** 3440 * vmalloc_32 - allocate virtually contiguous memory (32bit addressable) 3441 * @size: allocation size 3442 * 3443 * Allocate enough 32bit PA addressable pages to cover @size from the 3444 * page level allocator and map them into contiguous kernel virtual space. 3445 * 3446 * Return: pointer to the allocated memory or %NULL on error 3447 */ 3448 void *vmalloc_32(unsigned long size) 3449 { 3450 return __vmalloc_node(size, 1, GFP_VMALLOC32, NUMA_NO_NODE, 3451 __builtin_return_address(0)); 3452 } 3453 EXPORT_SYMBOL(vmalloc_32); 3454 3455 /** 3456 * vmalloc_32_user - allocate zeroed virtually contiguous 32bit memory 3457 * @size: allocation size 3458 * 3459 * The resulting memory area is 32bit addressable and zeroed so it can be 3460 * mapped to userspace without leaking data. 3461 * 3462 * Return: pointer to the allocated memory or %NULL on error 3463 */ 3464 void *vmalloc_32_user(unsigned long size) 3465 { 3466 return __vmalloc_node_range(size, SHMLBA, VMALLOC_START, VMALLOC_END, 3467 GFP_VMALLOC32 | __GFP_ZERO, PAGE_KERNEL, 3468 VM_USERMAP, NUMA_NO_NODE, 3469 __builtin_return_address(0)); 3470 } 3471 EXPORT_SYMBOL(vmalloc_32_user); 3472 3473 /* 3474 * small helper routine , copy contents to buf from addr. 3475 * If the page is not present, fill zero. 3476 */ 3477 3478 static int aligned_vread(char *buf, char *addr, unsigned long count) 3479 { 3480 struct page *p; 3481 int copied = 0; 3482 3483 while (count) { 3484 unsigned long offset, length; 3485 3486 offset = offset_in_page(addr); 3487 length = PAGE_SIZE - offset; 3488 if (length > count) 3489 length = count; 3490 p = vmalloc_to_page(addr); 3491 /* 3492 * To do safe access to this _mapped_ area, we need 3493 * lock. But adding lock here means that we need to add 3494 * overhead of vmalloc()/vfree() calls for this _debug_ 3495 * interface, rarely used. Instead of that, we'll use 3496 * kmap() and get small overhead in this access function. 3497 */ 3498 if (p) { 3499 /* We can expect USER0 is not used -- see vread() */ 3500 void *map = kmap_atomic(p); 3501 memcpy(buf, map + offset, length); 3502 kunmap_atomic(map); 3503 } else 3504 memset(buf, 0, length); 3505 3506 addr += length; 3507 buf += length; 3508 copied += length; 3509 count -= length; 3510 } 3511 return copied; 3512 } 3513 3514 /** 3515 * vread() - read vmalloc area in a safe way. 3516 * @buf: buffer for reading data 3517 * @addr: vm address. 3518 * @count: number of bytes to be read. 3519 * 3520 * This function checks that addr is a valid vmalloc'ed area, and 3521 * copy data from that area to a given buffer. If the given memory range 3522 * of [addr...addr+count) includes some valid address, data is copied to 3523 * proper area of @buf. If there are memory holes, they'll be zero-filled. 
3524 * IOREMAP area is treated as memory hole and no copy is done. 3525 * 3526 * If [addr...addr+count) doesn't includes any intersects with alive 3527 * vm_struct area, returns 0. @buf should be kernel's buffer. 3528 * 3529 * Note: In usual ops, vread() is never necessary because the caller 3530 * should know vmalloc() area is valid and can use memcpy(). 3531 * This is for routines which have to access vmalloc area without 3532 * any information, as /proc/kcore. 3533 * 3534 * Return: number of bytes for which addr and buf should be increased 3535 * (same number as @count) or %0 if [addr...addr+count) doesn't 3536 * include any intersection with valid vmalloc area 3537 */ 3538 long vread(char *buf, char *addr, unsigned long count) 3539 { 3540 struct vmap_area *va; 3541 struct vm_struct *vm; 3542 char *vaddr, *buf_start = buf; 3543 unsigned long buflen = count; 3544 unsigned long n; 3545 3546 addr = kasan_reset_tag(addr); 3547 3548 /* Don't allow overflow */ 3549 if ((unsigned long) addr + count < count) 3550 count = -(unsigned long) addr; 3551 3552 spin_lock(&vmap_area_lock); 3553 va = find_vmap_area_exceed_addr((unsigned long)addr); 3554 if (!va) 3555 goto finished; 3556 3557 /* no intersects with alive vmap_area */ 3558 if ((unsigned long)addr + count <= va->va_start) 3559 goto finished; 3560 3561 list_for_each_entry_from(va, &vmap_area_list, list) { 3562 if (!count) 3563 break; 3564 3565 if (!va->vm) 3566 continue; 3567 3568 vm = va->vm; 3569 vaddr = (char *) vm->addr; 3570 if (addr >= vaddr + get_vm_area_size(vm)) 3571 continue; 3572 while (addr < vaddr) { 3573 if (count == 0) 3574 goto finished; 3575 *buf = '\0'; 3576 buf++; 3577 addr++; 3578 count--; 3579 } 3580 n = vaddr + get_vm_area_size(vm) - addr; 3581 if (n > count) 3582 n = count; 3583 if (!(vm->flags & VM_IOREMAP)) 3584 aligned_vread(buf, addr, n); 3585 else /* IOREMAP area is treated as memory hole */ 3586 memset(buf, 0, n); 3587 buf += n; 3588 addr += n; 3589 count -= n; 3590 } 3591 finished: 3592 spin_unlock(&vmap_area_lock); 3593 3594 if (buf == buf_start) 3595 return 0; 3596 /* zero-fill memory holes */ 3597 if (buf != buf_start + buflen) 3598 memset(buf, 0, buflen - (buf - buf_start)); 3599 3600 return buflen; 3601 } 3602 3603 /** 3604 * remap_vmalloc_range_partial - map vmalloc pages to userspace 3605 * @vma: vma to cover 3606 * @uaddr: target user address to start at 3607 * @kaddr: virtual address of vmalloc kernel memory 3608 * @pgoff: offset from @kaddr to start at 3609 * @size: size of map area 3610 * 3611 * Returns: 0 for success, -Exxx on failure 3612 * 3613 * This function checks that @kaddr is a valid vmalloc'ed area, 3614 * and that it is big enough to cover the range starting at 3615 * @uaddr in @vma. Will return failure if that criteria isn't 3616 * met. 
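 *
 * A hedged sketch of a hypothetical ->mmap() handler ("buf" and "pgoff"
 * are placeholders; the buffer must carry VM_USERMAP, e.g. by having been
 * allocated with vmalloc_user()):
 *
 *    return remap_vmalloc_range_partial(vma, vma->vm_start, buf, pgoff,
 *                                       vma->vm_end - vma->vm_start);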
3617 * 3618 * Similar to remap_pfn_range() (see mm/memory.c) 3619 */ 3620 int remap_vmalloc_range_partial(struct vm_area_struct *vma, unsigned long uaddr, 3621 void *kaddr, unsigned long pgoff, 3622 unsigned long size) 3623 { 3624 struct vm_struct *area; 3625 unsigned long off; 3626 unsigned long end_index; 3627 3628 if (check_shl_overflow(pgoff, PAGE_SHIFT, &off)) 3629 return -EINVAL; 3630 3631 size = PAGE_ALIGN(size); 3632 3633 if (!PAGE_ALIGNED(uaddr) || !PAGE_ALIGNED(kaddr)) 3634 return -EINVAL; 3635 3636 area = find_vm_area(kaddr); 3637 if (!area) 3638 return -EINVAL; 3639 3640 if (!(area->flags & (VM_USERMAP | VM_DMA_COHERENT))) 3641 return -EINVAL; 3642 3643 if (check_add_overflow(size, off, &end_index) || 3644 end_index > get_vm_area_size(area)) 3645 return -EINVAL; 3646 kaddr += off; 3647 3648 do { 3649 struct page *page = vmalloc_to_page(kaddr); 3650 int ret; 3651 3652 ret = vm_insert_page(vma, uaddr, page); 3653 if (ret) 3654 return ret; 3655 3656 uaddr += PAGE_SIZE; 3657 kaddr += PAGE_SIZE; 3658 size -= PAGE_SIZE; 3659 } while (size > 0); 3660 3661 vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; 3662 3663 return 0; 3664 } 3665 3666 /** 3667 * remap_vmalloc_range - map vmalloc pages to userspace 3668 * @vma: vma to cover (map full range of vma) 3669 * @addr: vmalloc memory 3670 * @pgoff: number of pages into addr before first page to map 3671 * 3672 * Returns: 0 for success, -Exxx on failure 3673 * 3674 * This function checks that addr is a valid vmalloc'ed area, and 3675 * that it is big enough to cover the vma. Will return failure if 3676 * that criteria isn't met. 3677 * 3678 * Similar to remap_pfn_range() (see mm/memory.c) 3679 */ 3680 int remap_vmalloc_range(struct vm_area_struct *vma, void *addr, 3681 unsigned long pgoff) 3682 { 3683 return remap_vmalloc_range_partial(vma, vma->vm_start, 3684 addr, pgoff, 3685 vma->vm_end - vma->vm_start); 3686 } 3687 EXPORT_SYMBOL(remap_vmalloc_range); 3688 3689 void free_vm_area(struct vm_struct *area) 3690 { 3691 struct vm_struct *ret; 3692 ret = remove_vm_area(area->addr); 3693 BUG_ON(ret != area); 3694 kfree(area); 3695 } 3696 EXPORT_SYMBOL_GPL(free_vm_area); 3697 3698 #ifdef CONFIG_SMP 3699 static struct vmap_area *node_to_va(struct rb_node *n) 3700 { 3701 return rb_entry_safe(n, struct vmap_area, rb_node); 3702 } 3703 3704 /** 3705 * pvm_find_va_enclose_addr - find the vmap_area @addr belongs to 3706 * @addr: target address 3707 * 3708 * Returns: vmap_area if it is found. If there is no such area 3709 * the first highest(reverse order) vmap_area is returned 3710 * i.e. va->va_start < addr && va->va_end < addr or NULL 3711 * if there are no any areas before @addr. 3712 */ 3713 static struct vmap_area * 3714 pvm_find_va_enclose_addr(unsigned long addr) 3715 { 3716 struct vmap_area *va, *tmp; 3717 struct rb_node *n; 3718 3719 n = free_vmap_area_root.rb_node; 3720 va = NULL; 3721 3722 while (n) { 3723 tmp = rb_entry(n, struct vmap_area, rb_node); 3724 if (tmp->va_start <= addr) { 3725 va = tmp; 3726 if (tmp->va_end >= addr) 3727 break; 3728 3729 n = n->rb_right; 3730 } else { 3731 n = n->rb_left; 3732 } 3733 } 3734 3735 return va; 3736 } 3737 3738 /** 3739 * pvm_determine_end_from_reverse - find the highest aligned address 3740 * of free block below VMALLOC_END 3741 * @va: 3742 * in - the VA we start the search(reverse order); 3743 * out - the VA with the highest aligned end address. 
3744 * @align: alignment for required highest address 3745 * 3746 * Returns: determined end address within vmap_area 3747 */ 3748 static unsigned long 3749 pvm_determine_end_from_reverse(struct vmap_area **va, unsigned long align) 3750 { 3751 unsigned long vmalloc_end = VMALLOC_END & ~(align - 1); 3752 unsigned long addr; 3753 3754 if (likely(*va)) { 3755 list_for_each_entry_from_reverse((*va), 3756 &free_vmap_area_list, list) { 3757 addr = min((*va)->va_end & ~(align - 1), vmalloc_end); 3758 if ((*va)->va_start < addr) 3759 return addr; 3760 } 3761 } 3762 3763 return 0; 3764 } 3765 3766 /** 3767 * pcpu_get_vm_areas - allocate vmalloc areas for percpu allocator 3768 * @offsets: array containing offset of each area 3769 * @sizes: array containing size of each area 3770 * @nr_vms: the number of areas to allocate 3771 * @align: alignment, all entries in @offsets and @sizes must be aligned to this 3772 * 3773 * Returns: kmalloc'd vm_struct pointer array pointing to allocated 3774 * vm_structs on success, %NULL on failure 3775 * 3776 * Percpu allocator wants to use congruent vm areas so that it can 3777 * maintain the offsets among percpu areas. This function allocates 3778 * congruent vmalloc areas for it with GFP_KERNEL. These areas tend to 3779 * be scattered pretty far, distance between two areas easily going up 3780 * to gigabytes. To avoid interacting with regular vmallocs, these 3781 * areas are allocated from top. 3782 * 3783 * Despite its complicated look, this allocator is rather simple. It 3784 * does everything top-down and scans free blocks from the end looking 3785 * for matching base. While scanning, if any of the areas do not fit the 3786 * base address is pulled down to fit the area. Scanning is repeated till 3787 * all the areas fit and then all necessary data structures are inserted 3788 * and the result is returned. 3789 */ 3790 struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets, 3791 const size_t *sizes, int nr_vms, 3792 size_t align) 3793 { 3794 const unsigned long vmalloc_start = ALIGN(VMALLOC_START, align); 3795 const unsigned long vmalloc_end = VMALLOC_END & ~(align - 1); 3796 struct vmap_area **vas, *va; 3797 struct vm_struct **vms; 3798 int area, area2, last_area, term_area; 3799 unsigned long base, start, size, end, last_end, orig_start, orig_end; 3800 bool purged = false; 3801 3802 /* verify parameters and allocate data structures */ 3803 BUG_ON(offset_in_page(align) || !is_power_of_2(align)); 3804 for (last_area = 0, area = 0; area < nr_vms; area++) { 3805 start = offsets[area]; 3806 end = start + sizes[area]; 3807 3808 /* is everything aligned properly? 
*/ 3809 BUG_ON(!IS_ALIGNED(offsets[area], align)); 3810 BUG_ON(!IS_ALIGNED(sizes[area], align)); 3811 3812 /* detect the area with the highest address */ 3813 if (start > offsets[last_area]) 3814 last_area = area; 3815 3816 for (area2 = area + 1; area2 < nr_vms; area2++) { 3817 unsigned long start2 = offsets[area2]; 3818 unsigned long end2 = start2 + sizes[area2]; 3819 3820 BUG_ON(start2 < end && start < end2); 3821 } 3822 } 3823 last_end = offsets[last_area] + sizes[last_area]; 3824 3825 if (vmalloc_end - vmalloc_start < last_end) { 3826 WARN_ON(true); 3827 return NULL; 3828 } 3829 3830 vms = kcalloc(nr_vms, sizeof(vms[0]), GFP_KERNEL); 3831 vas = kcalloc(nr_vms, sizeof(vas[0]), GFP_KERNEL); 3832 if (!vas || !vms) 3833 goto err_free2; 3834 3835 for (area = 0; area < nr_vms; area++) { 3836 vas[area] = kmem_cache_zalloc(vmap_area_cachep, GFP_KERNEL); 3837 vms[area] = kzalloc(sizeof(struct vm_struct), GFP_KERNEL); 3838 if (!vas[area] || !vms[area]) 3839 goto err_free; 3840 } 3841 retry: 3842 spin_lock(&free_vmap_area_lock); 3843 3844 /* start scanning - we scan from the top, begin with the last area */ 3845 area = term_area = last_area; 3846 start = offsets[area]; 3847 end = start + sizes[area]; 3848 3849 va = pvm_find_va_enclose_addr(vmalloc_end); 3850 base = pvm_determine_end_from_reverse(&va, align) - end; 3851 3852 while (true) { 3853 /* 3854 * base might have underflowed, add last_end before 3855 * comparing. 3856 */ 3857 if (base + last_end < vmalloc_start + last_end) 3858 goto overflow; 3859 3860 /* 3861 * Fitting base has not been found. 3862 */ 3863 if (va == NULL) 3864 goto overflow; 3865 3866 /* 3867 * If required width exceeds current VA block, move 3868 * base downwards and then recheck. 3869 */ 3870 if (base + end > va->va_end) { 3871 base = pvm_determine_end_from_reverse(&va, align) - end; 3872 term_area = area; 3873 continue; 3874 } 3875 3876 /* 3877 * If this VA does not fit, move base downwards and recheck. 3878 */ 3879 if (base + start < va->va_start) { 3880 va = node_to_va(rb_prev(&va->rb_node)); 3881 base = pvm_determine_end_from_reverse(&va, align) - end; 3882 term_area = area; 3883 continue; 3884 } 3885 3886 /* 3887 * This area fits, move on to the previous one. If 3888 * the previous one is the terminal one, we're done. 3889 */ 3890 area = (area + nr_vms - 1) % nr_vms; 3891 if (area == term_area) 3892 break; 3893 3894 start = offsets[area]; 3895 end = start + sizes[area]; 3896 va = pvm_find_va_enclose_addr(base + end); 3897 } 3898 3899 /* we've found a fitting base, insert all va's */ 3900 for (area = 0; area < nr_vms; area++) { 3901 int ret; 3902 3903 start = base + offsets[area]; 3904 size = sizes[area]; 3905 3906 va = pvm_find_va_enclose_addr(start); 3907 if (WARN_ON_ONCE(va == NULL)) 3908 /* It is a BUG(), but trigger recovery instead. */ 3909 goto recovery; 3910 3911 ret = adjust_va_to_fit_type(&free_vmap_area_root, 3912 &free_vmap_area_list, 3913 va, start, size); 3914 if (WARN_ON_ONCE(unlikely(ret))) 3915 /* It is a BUG(), but trigger recovery instead. */ 3916 goto recovery; 3917 3918 /* Allocated area. 

	/* we've found a fitting base, insert all va's */
	for (area = 0; area < nr_vms; area++) {
		int ret;

		start = base + offsets[area];
		size = sizes[area];

		va = pvm_find_va_enclose_addr(start);
		if (WARN_ON_ONCE(va == NULL))
			/* It is a BUG(), but trigger recovery instead. */
			goto recovery;

		ret = adjust_va_to_fit_type(&free_vmap_area_root,
					    &free_vmap_area_list,
					    va, start, size);
		if (WARN_ON_ONCE(unlikely(ret)))
			/* It is a BUG(), but trigger recovery instead. */
			goto recovery;

		/* Allocated area. */
		va = vas[area];
		va->va_start = start;
		va->va_end = start + size;
	}

	spin_unlock(&free_vmap_area_lock);

	/* populate the kasan shadow space */
	for (area = 0; area < nr_vms; area++) {
		if (kasan_populate_vmalloc(vas[area]->va_start, sizes[area]))
			goto err_free_shadow;
	}

	/* insert all vm's */
	spin_lock(&vmap_area_lock);
	for (area = 0; area < nr_vms; area++) {
		insert_vmap_area(vas[area], &vmap_area_root, &vmap_area_list);

		setup_vmalloc_vm_locked(vms[area], vas[area], VM_ALLOC,
					pcpu_get_vm_areas);
	}
	spin_unlock(&vmap_area_lock);

	/*
	 * Mark allocated areas as accessible.  Do it now as a best-effort
	 * approach, as they can be mapped outside of vmalloc code.
	 * With hardware tag-based KASAN, marking is skipped for
	 * non-VM_ALLOC mappings, see __kasan_unpoison_vmalloc().
	 */
	for (area = 0; area < nr_vms; area++)
		vms[area]->addr = kasan_unpoison_vmalloc(vms[area]->addr,
				vms[area]->size, KASAN_VMALLOC_PROT_NORMAL);

	kfree(vas);
	return vms;

recovery:
	/*
	 * Remove previously allocated areas.  There is no need to remove
	 * these areas from the busy tree, because they are inserted there
	 * only on the final step and only when pcpu_get_vm_areas()
	 * succeeds.
	 */
	while (area--) {
		orig_start = vas[area]->va_start;
		orig_end = vas[area]->va_end;
		va = merge_or_add_vmap_area_augment(vas[area], &free_vmap_area_root,
						    &free_vmap_area_list);
		if (va)
			kasan_release_vmalloc(orig_start, orig_end,
				va->va_start, va->va_end);
		vas[area] = NULL;
	}

overflow:
	spin_unlock(&free_vmap_area_lock);
	if (!purged) {
		purge_vmap_area_lazy();
		purged = true;

		/* Before "retry", check if we recover. */
		for (area = 0; area < nr_vms; area++) {
			if (vas[area])
				continue;

			vas[area] = kmem_cache_zalloc(
				vmap_area_cachep, GFP_KERNEL);
			if (!vas[area])
				goto err_free;
		}

		goto retry;
	}

err_free:
	for (area = 0; area < nr_vms; area++) {
		if (vas[area])
			kmem_cache_free(vmap_area_cachep, vas[area]);

		kfree(vms[area]);
	}
err_free2:
	kfree(vas);
	kfree(vms);
	return NULL;

err_free_shadow:
	spin_lock(&free_vmap_area_lock);
	/*
	 * We release all the vmalloc shadows, even the ones for regions that
	 * hadn't been successfully added.  This relies on kasan_release_vmalloc
	 * being able to tolerate this case.
	 */
	for (area = 0; area < nr_vms; area++) {
		orig_start = vas[area]->va_start;
		orig_end = vas[area]->va_end;
		va = merge_or_add_vmap_area_augment(vas[area], &free_vmap_area_root,
						    &free_vmap_area_list);
		if (va)
			kasan_release_vmalloc(orig_start, orig_end,
					      va->va_start, va->va_end);
		vas[area] = NULL;
		kfree(vms[area]);
	}
	spin_unlock(&free_vmap_area_lock);
	kfree(vas);
	kfree(vms);
	return NULL;
}
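
/*
 * Illustrative sketch of the intended calling pattern.  The real user is
 * the percpu allocator; "unit" and "atom_align" below are hypothetical
 * placeholders, with atom_align a power-of-two multiple of PAGE_SIZE and
 * unit a multiple of atom_align:
 *
 *	unsigned long offsets[] = { 0 * unit, 3 * unit };
 *	size_t sizes[] = { unit, unit };
 *	struct vm_struct **vms;
 *
 *	vms = pcpu_get_vm_areas(offsets, sizes, 2, atom_align);
 *	if (!vms)
 *		return -ENOMEM;
 *	... map pages at vms[0]->addr and vms[1]->addr, which are exactly
 *	3 * unit apart ...
 *	pcpu_free_vm_areas(vms, 2);
 */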

/**
 * pcpu_free_vm_areas - free vmalloc areas for percpu allocator
 * @vms: vm_struct pointer array returned by pcpu_get_vm_areas()
 * @nr_vms: the number of allocated areas
 *
 * Free vm_structs and the array allocated by pcpu_get_vm_areas().
 */
void pcpu_free_vm_areas(struct vm_struct **vms, int nr_vms)
{
	int i;

	for (i = 0; i < nr_vms; i++)
		free_vm_area(vms[i]);
	kfree(vms);
}
#endif	/* CONFIG_SMP */

#ifdef CONFIG_PRINTK
bool vmalloc_dump_obj(void *object)
{
	struct vm_struct *vm;
	void *objp = (void *)PAGE_ALIGN((unsigned long)object);

	vm = find_vm_area(objp);
	if (!vm)
		return false;
	pr_cont(" %u-page vmalloc region starting at %#lx allocated at %pS\n",
		vm->nr_pages, (unsigned long)vm->addr, vm->caller);
	return true;
}
#endif

#ifdef CONFIG_PROC_FS
static void *s_start(struct seq_file *m, loff_t *pos)
	__acquires(&vmap_purge_lock)
	__acquires(&vmap_area_lock)
{
	mutex_lock(&vmap_purge_lock);
	spin_lock(&vmap_area_lock);

	return seq_list_start(&vmap_area_list, *pos);
}

static void *s_next(struct seq_file *m, void *p, loff_t *pos)
{
	return seq_list_next(p, &vmap_area_list, pos);
}

static void s_stop(struct seq_file *m, void *p)
	__releases(&vmap_area_lock)
	__releases(&vmap_purge_lock)
{
	spin_unlock(&vmap_area_lock);
	mutex_unlock(&vmap_purge_lock);
}

static void show_numa_info(struct seq_file *m, struct vm_struct *v)
{
	if (IS_ENABLED(CONFIG_NUMA)) {
		unsigned int nr, *counters = m->private;
		unsigned int step = 1U << vm_area_page_order(v);

		if (!counters)
			return;

		if (v->flags & VM_UNINITIALIZED)
			return;
		/* Pair with smp_wmb() in clear_vm_uninitialized_flag() */
		smp_rmb();

		memset(counters, 0, nr_node_ids * sizeof(unsigned int));

		for (nr = 0; nr < v->nr_pages; nr += step)
			counters[page_to_nid(v->pages[nr])] += step;
		for_each_node_state(nr, N_HIGH_MEMORY)
			if (counters[nr])
				seq_printf(m, " N%u=%u", nr, counters[nr]);
	}
}

static void show_purge_info(struct seq_file *m)
{
	struct vmap_area *va;

	spin_lock(&purge_vmap_area_lock);
	list_for_each_entry(va, &purge_vmap_area_list, list) {
		seq_printf(m, "0x%pK-0x%pK %7ld unpurged vm_area\n",
			   (void *)va->va_start, (void *)va->va_end,
			   va->va_end - va->va_start);
	}
	spin_unlock(&purge_vmap_area_lock);
}
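
/*
 * s_show() below emits one line of /proc/vmallocinfo per vmap area,
 * roughly of the form:
 *
 *	<start>-<end> <size> [<caller>] [pages=N] [phys=..]
 *	[ioremap] [vmalloc] [vmap] [user] [dma-coherent] [vpages] [N<nid>=<npages> ...]
 *
 * Areas without a vm_struct are tagged "vm_map_ram", and after the last
 * entry show_purge_info() appends the not-yet-purged, lazily freed areas.
 */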

static int s_show(struct seq_file *m, void *p)
{
	struct vmap_area *va;
	struct vm_struct *v;

	va = list_entry(p, struct vmap_area, list);

	/*
	 * s_show can race with remove_vm_area(): a NULL ->vm means the
	 * vmap area is being torn down or is a vm_map_ram allocation,
	 * which has no vm_struct attached.
	 */
	if (!va->vm) {
		seq_printf(m, "0x%pK-0x%pK %7ld vm_map_ram\n",
			(void *)va->va_start, (void *)va->va_end,
			va->va_end - va->va_start);

		goto final;
	}

	v = va->vm;

	seq_printf(m, "0x%pK-0x%pK %7ld",
		v->addr, v->addr + v->size, v->size);

	if (v->caller)
		seq_printf(m, " %pS", v->caller);

	if (v->nr_pages)
		seq_printf(m, " pages=%d", v->nr_pages);

	if (v->phys_addr)
		seq_printf(m, " phys=%pa", &v->phys_addr);

	if (v->flags & VM_IOREMAP)
		seq_puts(m, " ioremap");

	if (v->flags & VM_ALLOC)
		seq_puts(m, " vmalloc");

	if (v->flags & VM_MAP)
		seq_puts(m, " vmap");

	if (v->flags & VM_USERMAP)
		seq_puts(m, " user");

	if (v->flags & VM_DMA_COHERENT)
		seq_puts(m, " dma-coherent");

	if (is_vmalloc_addr(v->pages))
		seq_puts(m, " vpages");

	show_numa_info(m, v);
	seq_putc(m, '\n');

	/*
	 * As a final step, dump "unpurged" areas.
	 */
final:
	if (list_is_last(&va->list, &vmap_area_list))
		show_purge_info(m);

	return 0;
}

static const struct seq_operations vmalloc_op = {
	.start = s_start,
	.next = s_next,
	.stop = s_stop,
	.show = s_show,
};

static int __init proc_vmalloc_init(void)
{
	if (IS_ENABLED(CONFIG_NUMA))
		proc_create_seq_private("vmallocinfo", 0400, NULL,
				&vmalloc_op,
				nr_node_ids * sizeof(unsigned int), NULL);
	else
		proc_create_seq("vmallocinfo", 0400, NULL, &vmalloc_op);
	return 0;
}
module_init(proc_vmalloc_init);

#endif