// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright IBM Corp. 2006
 */

#include <linux/memory_hotplug.h>
#include <linux/cpufeature.h>
#include <linux/memblock.h>
#include <linux/pfn.h>
#include <linux/mm.h>
#include <linux/init.h>
#include <linux/list.h>
#include <linux/hugetlb.h>
#include <linux/slab.h>
#include <linux/sort.h>
#include <asm/page-states.h>
#include <asm/abs_lowcore.h>
#include <asm/cacheflush.h>
#include <asm/maccess.h>
#include <asm/nospec-branch.h>
#include <asm/ctlreg.h>
#include <asm/pgalloc.h>
#include <asm/setup.h>
#include <asm/tlbflush.h>
#include <asm/sections.h>
#include <asm/set_memory.h>
#include <asm/physmem_info.h>

/*
 * Taken by the entry points in this file that modify kernel page tables:
 * vmemmap_populate(), vmemmap_free(), vmem_add_mapping(),
 * vmem_remove_mapping(), vmem_map_4k_page() and vmem_unmap_4k_page().
 */
static DEFINE_MUTEX(vmem_mutex);

/*
 * Allocate PAGE_SIZE << order bytes.
 *
 * Uses __get_free_pages(GFP_KERNEL, order) once slab_is_available() reports
 * true; before that the memory comes from memblock_alloc(), aligned to its
 * own size. Callers check the result for NULL.
 */
static void __ref *vmem_alloc_pages(unsigned int order)
{
	unsigned long size = PAGE_SIZE << order;

	if (slab_is_available())
		return (void *)__get_free_pages(GFP_KERNEL, order);
	return memblock_alloc(size, size);
}

/*
 * Release 2^order pages starting at virtual address @addr.
 *
 * If @altmap is given, the pages are only handed back to it through
 * vmem_altmap_free(). Otherwise pages flagged PageReserved are released one
 * by one with free_reserved_page(), and everything else goes through
 * free_pages().
 */
static void vmem_free_pages(unsigned long addr, int order, struct vmem_altmap *altmap)
{
	unsigned int nr_pages = 1 << order;
	struct page *page;

	if (altmap) {
		vmem_altmap_free(altmap, 1 << order);
		return;
	}
	page = virt_to_page((void *)addr);
	if (PageReserved(page)) {
		/* allocated from memblock */
		while (nr_pages--)
			free_reserved_page(page++);
	} else {
		free_pages(addr, order);
	}
}

/*
 * Allocate a table of order CRST_ALLOC_ORDER, initialise it with
 * crst_table_init(table, @val) and call __arch_set_page_dat() for its
 * 1 << CRST_ALLOC_ORDER pages.
 *
 * Returns the table, or NULL if the allocation failed.
 */
void *vmem_crst_alloc(unsigned long val)
{
	unsigned long *table;

	table = vmem_alloc_pages(CRST_ALLOC_ORDER);
	if (!table)
		return NULL;
	crst_table_init(table, val);
	__arch_set_page_dat(table, 1UL << CRST_ALLOC_ORDER);
	return table;
}

/*
 * Allocate a page table with all PTRS_PER_PTE entries set to _PAGE_INVALID.
 *
 * The table comes from page_table_alloc(&init_mm) once the slab allocator
 * is available, and from a PAGE_SIZE memblock allocation before that.
 *
 * Returns the table, or NULL if the allocation failed.
 */
pte_t __ref *vmem_pte_alloc(void)
{
	pte_t *pte;

	if (slab_is_available())
		pte = (pte_t *)page_table_alloc(&init_mm);
	else
		pte = (pte_t *)memblock_alloc(PAGE_SIZE, PAGE_SIZE);
	if (!pte)
		return NULL;
	memset64((u64 *)pte, _PAGE_INVALID, PTRS_PER_PTE);
	__arch_set_page_dat(pte, 1);
	return pte;
}

/*
 * Release a page table through page_table_free() on init_mm.
 *
 * NOTE(review): vmem_pte_alloc() may hand out memblock memory, while this
 * always uses page_table_free() - confirm that is intended for tables
 * allocated before the slab allocator was available.
 */
static void vmem_pte_free(unsigned long *table)
{
	page_table_free(&init_mm, table);
}

/*
 * Byte pattern written over the parts of a PMD_SIZE vmemmap frame that are
 * currently unused; vmemmap_unuse_sub_pmd() checks for it to decide whether
 * the whole frame can be freed.
 */
#define PAGE_UNUSED 0xFD

/*
 * The unused vmemmap range, which was not yet memset(PAGE_UNUSED) ranges
 * from unused_sub_pmd_start to next PMD_SIZE boundary.
 * A value of 0 means that no such range is pending.
 */
static unsigned long unused_sub_pmd_start;

/*
 * Fill the pending unused range (unused_sub_pmd_start up to the next
 * PMD_SIZE boundary) with PAGE_UNUSED and forget about it. Does nothing if
 * no range is pending.
 */
static void vmemmap_flush_unused_sub_pmd(void)
{
	if (!unused_sub_pmd_start)
		return;
	memset((void *)unused_sub_pmd_start, PAGE_UNUSED,
	       ALIGN(unused_sub_pmd_start, PMD_SIZE) - unused_sub_pmd_start);
	unused_sub_pmd_start = 0;
}

/*
 * Mark the range beginning at @start as used by zeroing the first
 * sizeof(struct page) bytes. @end is not used.
 */
static void vmemmap_mark_sub_pmd_used(unsigned long start, unsigned long end)
{
	/*
	 * As we expect to add in the same granularity as we remove, it's
	 * sufficient to mark only some piece used to block the memmap page from
	 * getting removed (just in case the memmap never gets initialized,
	 * e.g., because the memory block never gets onlined).
	 */
	memset((void *)start, 0, sizeof(struct page));
}

/*
 * Account for [@start, @end) being populated inside a PMD that is already
 * mapped by a large frame (called from modify_pmd_table() for vmemmap
 * additions that hit an existing leaf entry).
 */
static void vmemmap_use_sub_pmd(unsigned long start, unsigned long end)
{
	/*
	 * We only optimize if the new used range directly follows the
	 * previously unused range (esp., when populating consecutive sections).
	 */
	if (unused_sub_pmd_start == start) {
		unused_sub_pmd_start = end;
		/* The frame is now used up to its end: nothing left to track. */
		if (likely(IS_ALIGNED(unused_sub_pmd_start, PMD_SIZE)))
			unused_sub_pmd_start = 0;
		return;
	}
	vmemmap_flush_unused_sub_pmd();
	vmemmap_mark_sub_pmd_used(start, end);
}

/*
 * Account for [@start, @end) being populated inside a freshly installed
 * large frame that the range does not cover completely (called from
 * modify_pmd_table() right after set_pmd()).
 */
static void vmemmap_use_new_sub_pmd(unsigned long start, unsigned long end)
{
	unsigned long page = ALIGN_DOWN(start, PMD_SIZE);

	vmemmap_flush_unused_sub_pmd();

	/* Could be our memmap page is filled with PAGE_UNUSED already ... */
	vmemmap_mark_sub_pmd_used(start, end);

	/* Mark the unused parts of the new memmap page PAGE_UNUSED. */
	if (!IS_ALIGNED(start, PMD_SIZE))
		memset((void *)page, PAGE_UNUSED, start - page);
	/*
	 * We want to avoid memset(PAGE_UNUSED) when populating the vmemmap of
	 * consecutive sections. Remember for the last added PMD the last
	 * unused range in the populated PMD.
	 */
	if (!IS_ALIGNED(end, PMD_SIZE))
		unused_sub_pmd_start = end;
}

/*
 * Fill [@start, @end) with PAGE_UNUSED (after flushing any pending unused
 * range) and scan the enclosing PMD_SIZE frame.
 *
 * Returns true if the PMD is completely unused and can be freed.
 */
static bool vmemmap_unuse_sub_pmd(unsigned long start, unsigned long end)
{
	unsigned long page = ALIGN_DOWN(start, PMD_SIZE);

	vmemmap_flush_unused_sub_pmd();
	memset((void *)start, PAGE_UNUSED, end - start);
	return !memchr_inv((void *)page, PAGE_UNUSED, PMD_SIZE);
}

/*
 * Add or remove the 4K mappings for [@addr, @end) in the page table that
 * @pmd points to.
 *
 * @add:    true to fill entries that are none, false to clear entries that
 *          are not none.
 * @direct: true for the 1:1 mapping: new entries map __pa(addr), nothing is
 *          allocated or freed, and PG_DIRECT_MAP_4K is updated by the number
 *          of entries changed. false for vmemmap: backing pages come from
 *          vmemmap_alloc_block_buf() and are released with vmem_free_pages().
 * @altmap: passed through to the vmemmap allocation and free helpers.
 *
 * Returns 0 on success or -ENOMEM if a backing page could not be allocated.
 *
 * __ref: we'll only call vmemmap_alloc_block() via vmemmap_populate()
 */
static int __ref modify_pte_table(pmd_t *pmd, unsigned long addr,
				  unsigned long end, bool add, bool direct,
				  struct vmem_altmap *altmap)
{
	unsigned long prot, pages = 0;
	int ret = -ENOMEM;
	pte_t *pte, entry;

	prot = pgprot_val(PAGE_KERNEL);
	pte = pte_offset_kernel(pmd, addr);
	for (; addr < end; addr += PAGE_SIZE, pte++) {
		entry = ptep_get(pte);
		if (!add) {
			if (pte_none(entry))
				continue;
			/* Free the backing page before the entry goes away. */
			if (!direct)
				vmem_free_pages((unsigned long)pfn_to_virt(pte_pfn(entry)), get_order(PAGE_SIZE), altmap);
			pte_clear(&init_mm, addr, pte);
		} else if (pte_none(entry)) {
			if (!direct) {
				void *new_page = vmemmap_alloc_block_buf(PAGE_SIZE, NUMA_NO_NODE, altmap);

				if (!new_page)
					goto out;
				set_pte(pte, __pte(__pa(new_page) | prot));
			} else {
				set_pte(pte, __pte(__pa(addr) | prot));
			}
		} else {
			/* Adding over an existing entry: leave it alone. */
			continue;
		}
		pages++;
	}
	ret = 0;
out:
	/* Also reached on allocation failure, so partial work is counted. */
	if (direct)
		update_page_count(PG_DIRECT_MAP_4K, add ? pages : -pages);
	return ret;
}

/*
 * Free the page table referenced by @pmd and clear the pmd entry, but only
 * if all PTRS_PER_PTE entries of that table are none.
 */
static void try_free_pte_table(pmd_t *pmd, unsigned long start)
{
	pte_t *pte;
	int i;

	/* We can safely assume this is fully in 1:1 mapping & vmemmap area */
	pte = pte_offset_kernel(pmd, start);
	for (i = 0; i < PTRS_PER_PTE; i++, pte++) {
		if (!pte_none(ptep_get(pte)))
			return;
	}
	vmem_pte_free((unsigned long *)pmd_deref(pmdp_get(pmd)));
	pmd_clear(pmd);
}

/*
 * Add or remove the mappings for [@addr, @end) in the segment table that
 * @pud points to, descending into page tables where no large frame is used.
 *
 * @add, @direct and @altmap have the same meaning as in modify_pte_table().
 * For direct mappings PG_DIRECT_MAP_1M is updated by the number of large
 * entries set or cleared.
 *
 * Returns 0 on success or -ENOMEM if an allocation failed.
 *
 * __ref: we'll only call vmemmap_alloc_block() via vmemmap_populate()
 */
static int __ref modify_pmd_table(pud_t *pud, unsigned long addr,
				  unsigned long end, bool add, bool direct,
				  struct vmem_altmap *altmap)
{
	unsigned long next, prot, pages = 0;
	int ret = -ENOMEM;
	pmd_t entry;
	pmd_t *pmd;
	pte_t *pte;

	prot = pgprot_val(SEGMENT_KERNEL);
	pmd = pmd_offset(pud, addr);
	for (; addr < end; addr = next, pmd++) {
		next = pmd_addr_end(addr, end);
		entry = pmdp_get(pmd);
		if (!add) {
			if (pmd_none(entry))
				continue;
			if (pmd_leaf(entry)) {
				if (IS_ALIGNED(addr, PMD_SIZE) &&
				    IS_ALIGNED(next, PMD_SIZE)) {
					/* Whole large frame is covered: drop it. */
					if (!direct)
						vmem_free_pages(pmd_deref(entry), get_order(PMD_SIZE), altmap);
					pmd_clear(pmd);
					pages++;
				} else if (!direct && vmemmap_unuse_sub_pmd(addr, next)) {
					/*
					 * Partially covered vmemmap frame that is now
					 * completely PAGE_UNUSED: free it as well.
					 */
					vmem_free_pages(pmd_deref(entry), get_order(PMD_SIZE), altmap);
					pmd_clear(pmd);
				}
				/*
				 * A partially covered large frame of the direct
				 * mapping is left untouched.
				 */
				continue;
			}
		} else if (pmd_none(entry)) {
			if (IS_ALIGNED(addr, PMD_SIZE) &&
			    IS_ALIGNED(next, PMD_SIZE) &&
			    cpu_has_edat1() && direct &&
			    !debug_pagealloc_enabled()) {
				/* Direct mapping: map the full segment 1:1. */
				set_pmd(pmd, __pmd(__pa(addr) | prot));
				pages++;
				continue;
			} else if (!direct && cpu_has_edat1()) {
				void *new_page;

				/*
				 * Use 1MB frames for vmemmap if available. We
				 * always use large frames even if they are only
				 * partially used. Otherwise we would have also
				 * page tables since vmemmap_populate gets
				 * called for each section separately.
				 */
				new_page = vmemmap_alloc_block_buf(PMD_SIZE, NUMA_NO_NODE, altmap);
				if (new_page) {
					set_pmd(pmd, __pmd(__pa(new_page) | prot));
					if (!IS_ALIGNED(addr, PMD_SIZE) ||
					    !IS_ALIGNED(next, PMD_SIZE)) {
						vmemmap_use_new_sub_pmd(addr, next);
					}
					continue;
				}
				/* Large allocation failed: fall back to 4K pages. */
			}
			pte = vmem_pte_alloc();
			if (!pte)
				goto out;
			pmd_populate(&init_mm, pmd, pte);
		} else if (pmd_leaf(entry)) {
			/* Adding into a segment that is already a large frame. */
			if (!direct)
				vmemmap_use_sub_pmd(addr, next);
			continue;
		}
		ret = modify_pte_table(pmd, addr, next, add, direct, altmap);
		if (ret)
			goto out;
		if (!add)
			try_free_pte_table(pmd, addr & PMD_MASK);
	}
	ret = 0;
out:
	if (direct)
		update_page_count(PG_DIRECT_MAP_1M, add ? pages : -pages);
	return ret;
}

/*
 * Free the segment table referenced by @pud and clear the pud entry, but
 * only if all PTRS_PER_PMD entries of that table are none.
 */
static void try_free_pmd_table(pud_t *pud, unsigned long start)
{
	pmd_t *pmd;
	int i;

	pmd = pmd_offset(pud, start);
	for (i = 0; i < PTRS_PER_PMD; i++, pmd++)
		if (!pmd_none(pmdp_get(pmd)))
			return;
	vmem_free_pages(pud_deref(pudp_get(pud)), CRST_ALLOC_ORDER, NULL);
	pud_clear(pud);
}

/*
 * Add or remove the mappings for [@addr, @end) in the region-third table
 * that @p4d points to, descending into segment tables where no PUD_SIZE
 * leaf entry is used.
 *
 * @add, @direct and @altmap have the same meaning as in modify_pte_table().
 * For direct mappings PG_DIRECT_MAP_2G is updated by the number of leaf
 * entries set or cleared.
 *
 * Returns 0 on success or -ENOMEM if an allocation failed.
 */
static int modify_pud_table(p4d_t *p4d, unsigned long addr, unsigned long end,
			    bool add, bool direct, struct vmem_altmap *altmap)
{
	unsigned long next, prot, pages = 0;
	int ret = -ENOMEM;
	pud_t *pud, entry;
	pmd_t *pmd;

	prot = pgprot_val(REGION3_KERNEL);
	pud = pud_offset(p4d, addr);
	for (; addr < end; addr = next, pud++) {
		next = pud_addr_end(addr, end);
		entry = pudp_get(pud);
		if (!add) {
			if (pud_none(entry))
				continue;
			if (pud_leaf(entry)) {
				if (IS_ALIGNED(addr, PUD_SIZE) &&
				    IS_ALIGNED(next, PUD_SIZE)) {
					/* Whole leaf is covered: drop it. */
					if (!direct)
						vmem_free_pages(pud_deref(entry), get_order(PUD_SIZE), altmap);
					pud_clear(pud);
					pages++;
					continue;
				} else {
					/*
					 * Partially covered leaf: split it, then
					 * continue with the segment table below.
					 *
					 * NOTE(review): the result of
					 * split_pud_page() is not checked here; if
					 * it can fail, modify_pmd_table() would be
					 * called on an entry that is still a leaf -
					 * confirm against its definition.
					 */
					split_pud_page(pud, addr & PUD_MASK);
				}
			}
		} else if (pud_none(entry)) {
			if (IS_ALIGNED(addr, PUD_SIZE) &&
			    IS_ALIGNED(next, PUD_SIZE) &&
			    cpu_has_edat2() && direct &&
			    !debug_pagealloc_enabled()) {
				/* Direct mapping: map the full region 1:1. */
				set_pud(pud, __pud(__pa(addr) | prot));
				pages++;
				continue;
			}
			pmd = vmem_crst_alloc(_SEGMENT_ENTRY_EMPTY);
			if (!pmd)
				goto out;
			pud_populate(&init_mm, pud, pmd);
		} else if (pud_leaf(entry)) {
			/* Adding into a range already covered by a leaf entry. */
			continue;
		}
		ret = modify_pmd_table(pud, addr, next, add, direct, altmap);
		if (ret)
			goto out;
		if (!add)
			try_free_pmd_table(pud, addr & PUD_MASK);
	}
	ret = 0;
out:
	if (direct)
		update_page_count(PG_DIRECT_MAP_2G, add ? pages : -pages);
	return ret;
}

/*
 * Free the region-third table referenced by @p4d and clear the p4d entry,
 * but only if all PTRS_PER_PUD entries of that table are none.
 */
static void try_free_pud_table(p4d_t *p4d, unsigned long start)
{
	pud_t *pud;
	int i;

	pud = pud_offset(p4d, start);
	for (i = 0; i < PTRS_PER_PUD; i++, pud++) {
		if (!pud_none(pudp_get(pud)))
			return;
	}
	vmem_free_pages(p4d_deref(p4dp_get(p4d)), CRST_ALLOC_ORDER, NULL);
	p4d_clear(p4d);
}

/*
 * Add or remove the mappings for [@addr, @end) below the p4d entries that
 * @pgd points to. Missing lower-level tables are allocated when adding;
 * tables that became empty are freed when removing.
 *
 * Returns 0 on success or -ENOMEM if an allocation failed.
 */
static int modify_p4d_table(pgd_t *pgd, unsigned long addr, unsigned long end,
			    bool add, bool direct, struct vmem_altmap *altmap)
{
	unsigned long next;
	int ret = -ENOMEM;
	p4d_t *p4d, entry;
	pud_t *pud;

	p4d = p4d_offset(pgd, addr);
	for (; addr < end; addr = next, p4d++) {
		next = p4d_addr_end(addr, end);
		entry = p4dp_get(p4d);
		if (!add) {
			if (p4d_none(entry))
				continue;
		} else if (p4d_none(entry)) {
			pud = vmem_crst_alloc(_REGION3_ENTRY_EMPTY);
			if (!pud)
				goto out;
			p4d_populate(&init_mm, p4d, pud);
		}
		ret = modify_pud_table(p4d, addr, next, add, direct, altmap);
		if (ret)
			goto out;
		if (!add)
			try_free_pud_table(p4d, addr & P4D_MASK);
	}
	ret = 0;
out:
	return ret;
}

/*
 * Free the table referenced by @pgd and clear the pgd entry, but only if all
 * PTRS_PER_P4D entries of that table are none.
 */
static void try_free_p4d_table(pgd_t *pgd, unsigned long start)
{
	p4d_t *p4d;
	int i;

	p4d = p4d_offset(pgd, start);
	for (i = 0; i < PTRS_PER_P4D; i++, p4d++) {
		if (!p4d_none(p4dp_get(p4d)))
			return;
	}
	vmem_free_pages(pgd_deref(pgdp_get(pgd)), CRST_ALLOC_ORDER, NULL);
	pgd_clear(pgd);
}

/*
 * Add or remove kernel mappings for the virtual range [@start, @end).
 *
 * @add:    true to populate the range, false to tear it down.
 * @direct: true for the 1:1 mapping, false for vmemmap (see
 *          modify_pte_table()).
 * @altmap: passed down to the vmemmap allocation and free helpers.
 *
 * Returns -EINVAL (with a one-time warning) if @start or @end is not page
 * aligned, or if @end lies above __abs_lowcore; with CONFIG_KASAN a range
 * fully inside [KASAN_SHADOW_START, KASAN_SHADOW_END] is exempt from the
 * latter check. Returns -ENOMEM if a table or page allocation failed, and
 * 0 on success.
 *
 * When removing, flush_tlb_kernel_range() is called for the whole range
 * before returning, including on the error path.
 */
static int modify_pagetable(unsigned long start, unsigned long end, bool add,
			    bool direct, struct vmem_altmap *altmap)
{
	unsigned long addr, next;
	int ret = -ENOMEM;
	pgd_t *pgd, entry;
	p4d_t *p4d;

	if (WARN_ON_ONCE(!PAGE_ALIGNED(start | end)))
		return -EINVAL;
	/* Don't mess with any tables not fully in 1:1 mapping, vmemmap & kasan area */
#ifdef CONFIG_KASAN
	if (WARN_ON_ONCE(!(start >= KASAN_SHADOW_START && end <= KASAN_SHADOW_END) &&
			 end > __abs_lowcore))
		return -EINVAL;
#else
	if (WARN_ON_ONCE(end > __abs_lowcore))
		return -EINVAL;
#endif
	for (addr = start; addr < end; addr = next) {
		next = pgd_addr_end(addr, end);
		pgd = pgd_offset_k(addr);
		entry = pgdp_get(pgd);

		if (!add) {
			if (pgd_none(entry))
				continue;
		} else if (pgd_none(entry)) {
			p4d = vmem_crst_alloc(_REGION2_ENTRY_EMPTY);
			if (!p4d)
				goto out;
			pgd_populate(&init_mm, pgd, p4d);
		}
		ret = modify_p4d_table(pgd, addr, next, add, direct, altmap);
		if (ret)
			goto out;
		if (!add)
			try_free_p4d_table(pgd, addr & PGDIR_MASK);
	}
	ret = 0;
out:
	if (!add)
		flush_tlb_kernel_range(start, end);
	return ret;
}

/* Populate [@start, @end); thin wrapper around modify_pagetable(add=true). */
static int add_pagetable(unsigned long start, unsigned long end, bool direct,
			 struct vmem_altmap *altmap)
{
	return modify_pagetable(start, end, true, direct, altmap);
}

/* Tear down [@start, @end); thin wrapper around modify_pagetable(add=false). */
static int remove_pagetable(unsigned long start, unsigned long end, bool direct,
			    struct vmem_altmap *altmap)
{
	return modify_pagetable(start, end, false, direct, altmap);
}

/*
 * Add a physical memory range to the 1:1 mapping.
 * @start is a physical address and is converted with __va() first.
 */
static int vmem_add_range(unsigned long start, unsigned long size)
{
	start = (unsigned long)__va(start);
	return add_pagetable(start, start + size, true, NULL);
}

/*
 * Remove a physical memory range from the 1:1 mapping.
 * @start is a physical address and is converted with __va() first. The
 * result of remove_pagetable() is not propagated.
 */
static void vmem_remove_range(unsigned long start, unsigned long size)
{
	start = (unsigned long)__va(start);
	remove_pagetable(start, start + size, true, NULL);
}

/*
 * Add a backed mem_map array to the virtual mem_map array.
 *
 * Populates [@start, @end) under vmem_mutex. If that fails, whatever was
 * populated for the range is removed again before the error is returned.
 */
int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node,
			       struct vmem_altmap *altmap)
{
	int ret;

	mutex_lock(&vmem_mutex);
	/* We don't care about the node, just use NUMA_NO_NODE on allocations */
	ret = add_pagetable(start, end, false, altmap);
	if (ret)
		remove_pagetable(start, end, false, altmap);
	mutex_unlock(&vmem_mutex);
	return ret;
}

#ifdef CONFIG_MEMORY_HOTPLUG

/* Remove the vmemmap backing for [@start, @end) under vmem_mutex. */
void vmemmap_free(unsigned long start, unsigned long end,
		  struct vmem_altmap *altmap)
{
	mutex_lock(&vmem_mutex);
	remove_pagetable(start, end, false, altmap);
	mutex_unlock(&vmem_mutex);
}

#endif

/*
 * Remove the physical range [@start, @start + @size) from the 1:1 mapping
 * under vmem_mutex.
 */
void vmem_remove_mapping(unsigned long start, unsigned long size)
{
	mutex_lock(&vmem_mutex);
	vmem_remove_range(start, size);
	mutex_unlock(&vmem_mutex);
}

/* Report the mappable range as [0, max_mappable - 1] (end is inclusive). */
struct range arch_get_mappable_range(void)
{
	struct range mhp_range;

	mhp_range.start = 0;
	mhp_range.end = max_mappable - 1;
	return mhp_range;
}

/*
 * Add the physical range [@start, @start + @size) to the 1:1 mapping.
 *
 * Returns -ERANGE if the range is not fully inside
 * arch_get_mappable_range() or if @start + @size wraps around. Otherwise
 * returns the result of vmem_add_range(); on failure the range is removed
 * again before returning.
 */
int vmem_add_mapping(unsigned long start, unsigned long size)
{
	struct range range = arch_get_mappable_range();
	int ret;

	if (start < range.start ||
	    start + size > range.end + 1 ||
	    start + size < start)
		return -ERANGE;

	mutex_lock(&vmem_mutex);
	ret = vmem_add_range(start, size);
	if (ret)
		vmem_remove_range(start, size);
	mutex_unlock(&vmem_mutex);
	return ret;
}

/*
 * Allocate new or return existing page-table entry, but do not map it
 * to any physical address. If missing, allocate segment- and region-
 * table entries along. Meeting a large segment- or region-table entry
 * while traversing is an error, since the function is expected to be
 * called against virtual regions reserved for 4KB mappings only.
 *
 * With @alloc false nothing is allocated: a missing table at any level
 * makes the function return NULL. NULL is also returned if an allocation
 * fails or a leaf pud/pmd entry is met (the latter with a one-time warning).
 */
pte_t *vmem_get_alloc_pte(unsigned long addr, bool alloc)
{
	pte_t *ptep = NULL;
	pud_t pud_entry;
	pmd_t pmd_entry;
	pgd_t *pgd;
	p4d_t *p4d;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	pgd = pgd_offset_k(addr);
	if (pgd_none(pgdp_get(pgd))) {
		if (!alloc)
			goto out;
		p4d = vmem_crst_alloc(_REGION2_ENTRY_EMPTY);
		if (!p4d)
			goto out;
		pgd_populate(&init_mm, pgd, p4d);
	}
	p4d = p4d_offset(pgd, addr);
	if (p4d_none(p4dp_get(p4d))) {
		if (!alloc)
			goto out;
		pud = vmem_crst_alloc(_REGION3_ENTRY_EMPTY);
		if (!pud)
			goto out;
		p4d_populate(&init_mm, p4d, pud);
	}
	pud = pud_offset(p4d, addr);
	pud_entry = pudp_get(pud);
	if (pud_none(pud_entry)) {
		if (!alloc)
			goto out;
		pmd = vmem_crst_alloc(_SEGMENT_ENTRY_EMPTY);
		if (!pmd)
			goto out;
		pud_populate(&init_mm, pud, pmd);
	} else if (WARN_ON_ONCE(pud_leaf(pud_entry))) {
		goto out;
	}
	pmd = pmd_offset(pud, addr);
	pmd_entry = pmdp_get(pmd);
	if (pmd_none(pmd_entry)) {
		if (!alloc)
			goto out;
		pte = vmem_pte_alloc();
		if (!pte)
			goto out;
		pmd_populate(&init_mm, pmd, pte);
	} else if (WARN_ON_ONCE(pmd_leaf(pmd_entry))) {
		goto out;
	}
	ptep = pte_offset_kernel(pmd, addr);
out:
	return ptep;
}

/*
 * Map the 4K page at virtual address @addr to physical address @phys with
 * protection @prot. Does not take vmem_mutex itself; vmem_map_4k_page() is
 * the locked variant.
 *
 * @alloc is passed to vmem_get_alloc_pte() and controls whether missing
 * page tables may be allocated.
 *
 * Returns 0 on success, -EINVAL if @addr is not PAGE_SIZE aligned, or
 * -ENOMEM if vmem_get_alloc_pte() did not return an entry.
 */
int __vmem_map_4k_page(unsigned long addr, unsigned long phys, pgprot_t prot, bool alloc)
{
	pte_t *ptep, pte;

	if (!IS_ALIGNED(addr, PAGE_SIZE))
		return -EINVAL;
	ptep = vmem_get_alloc_pte(addr, alloc);
	if (!ptep)
		return -ENOMEM;
	/* Invalidate the current entry before the new one is written. */
	__ptep_ipte(addr, ptep, 0, 0, IPTE_GLOBAL);
	pte = mk_pte_phys(phys, prot);
	set_pte(ptep, pte);
	return 0;
}

/*
 * Locked variant of __vmem_map_4k_page() that always allows page-table
 * allocation. Returns the result of __vmem_map_4k_page().
 */
int vmem_map_4k_page(unsigned long addr, unsigned long phys, pgprot_t prot)
{
	int rc;

	mutex_lock(&vmem_mutex);
	rc = __vmem_map_4k_page(addr, phys, prot, true);
	mutex_unlock(&vmem_mutex);
	return rc;
}

/*
 * Invalidate and clear the kernel pte for @addr under vmem_mutex. Page
 * tables are not freed here.
 *
 * NOTE(review): the result of virt_to_kpte() is used without a NULL check;
 * presumably callers only pass addresses that were mapped with
 * vmem_map_4k_page() before - confirm.
 */
void vmem_unmap_4k_page(unsigned long addr)
{
	pte_t *ptep;

	mutex_lock(&vmem_mutex);
	ptep = virt_to_kpte(addr);
	__ptep_ipte(addr, ptep, 0, 0, IPTE_GLOBAL);
	pte_clear(&init_mm, addr, ptep);
	mutex_unlock(&vmem_mutex);
}

/*
 * Apply the final protections to the kernel image at init time:
 * __set_memory_rox() on [_stext, _etext) and on the amode31 text,
 * __set_memory_ro() on [_etext, __end_rodata). With debug_pagealloc enabled
 * the identity mapping of ident_map_size bytes starting at __va(0) is
 * additionally passed to __set_memory_4k().
 */
void __init vmem_map_init(void)
{
	__set_memory_rox(_stext, _etext);
	__set_memory_ro(_etext, __end_rodata);
	__set_memory_rox(__stext_amode31, __etext_amode31);
	/*
	 * If the BEAR-enhancement facility is not installed the first
	 * prefix page is used to return to the previous context with
	 * an LPSWE instruction and therefore must be executable.
	 */
	if (!cpu_has_bear())
		set_memory_x(0, 1);
	if (debug_pagealloc_enabled())
		__set_memory_4k(__va(0), absolute_pointer(__va(0)) + ident_map_size);
	/* Size of [_stext, __end_rodata) in KiB. */
	pr_info("Write protected kernel read-only data: %luk\n",
		(unsigned long)(__end_rodata - _stext) >> 10);
}