// SPDX-License-Identifier: GPL-2.0
/*
 *    Copyright IBM Corp. 2006
 */

#include <linux/memory_hotplug.h>
#include <linux/cpufeature.h>
#include <linux/memblock.h>
#include <linux/pfn.h>
#include <linux/mm.h>
#include <linux/init.h>
#include <linux/list.h>
#include <linux/hugetlb.h>
#include <linux/slab.h>
#include <linux/sort.h>
#include <asm/page-states.h>
#include <asm/abs_lowcore.h>
#include <asm/cacheflush.h>
#include <asm/maccess.h>
#include <asm/nospec-branch.h>
#include <asm/ctlreg.h>
#include <asm/pgalloc.h>
#include <asm/setup.h>
#include <asm/tlbflush.h>
#include <asm/sections.h>
#include <asm/set_memory.h>
#include <asm/physmem_info.h>

/*
 * Taken by the exported entry points in this file (vmemmap_populate(),
 * vmemmap_free(), vmem_add_mapping(), vmem_remove_mapping(),
 * vmem_map_4k_page(), vmem_unmap_4k_page()) around every page-table
 * modification. The static helpers and __vmem_map_4k_page() do not take it
 * themselves.
 */
static DEFINE_MUTEX(vmem_mutex);

/*
 * Allocate PAGE_SIZE << order bytes of memory.
 *
 * Once slab is available the pages come from __get_free_pages(GFP_KERNEL);
 * before that they come from memblock, aligned to the allocation size.
 * Callers in this file treat a NULL return as allocation failure.
 */
static void __ref *vmem_alloc_pages(unsigned int order)
{
	unsigned long size = PAGE_SIZE << order;

	if (slab_is_available())
		return (void *)__get_free_pages(GFP_KERNEL, order);
	return memblock_alloc(size, size);
}

/*
 * Release the 1 << order pages starting at virtual address @addr.
 *
 * @altmap: if non-NULL, the pages are handed back with vmem_altmap_free()
 *          and nothing else is done.
 *
 * Without an altmap, pages flagged PageReserved are released one by one
 * with free_reserved_page(); all other pages go back via free_pages().
 */
static void vmem_free_pages(unsigned long addr, int order, struct vmem_altmap *altmap)
{
	unsigned int nr_pages = 1 << order;
	struct page *page;

	if (altmap) {
		vmem_altmap_free(altmap, 1 << order);
		return;
	}
	page = virt_to_page((void *)addr);
	if (PageReserved(page)) {
		/* allocated from memblock */
		while (nr_pages--)
			free_reserved_page(page++);
	} else {
		free_pages(addr, order);
	}
}

/*
 * Allocate a region or segment table of CRST_ALLOC_ORDER pages and
 * initialize it with @val through crst_table_init().
 *
 * The table is also passed to __arch_set_page_dat() together with its page
 * count. NOTE(review): presumably this marks the pages as holding DAT
 * tables - confirm against asm/page-states.h.
 *
 * Returns the new table, or NULL if the allocation failed.
 */
void *vmem_crst_alloc(unsigned long val)
{
	unsigned long *table;

	table = vmem_alloc_pages(CRST_ALLOC_ORDER);
	if (!table)
		return NULL;
	crst_table_init(table, val);
	__arch_set_page_dat(table, 1UL << CRST_ALLOC_ORDER);
	return table;
}

/*
 * Allocate a page table for init_mm with all PTRS_PER_PTE entries set to
 * _PAGE_INVALID.
 *
 * Uses page_table_alloc() once slab is available, otherwise a page-aligned
 * PAGE_SIZE allocation from memblock. Returns NULL on allocation failure.
 */
pte_t __ref *vmem_pte_alloc(void)
{
	pte_t *pte;

	if (slab_is_available())
		pte = (pte_t *)page_table_alloc(&init_mm);
	else
		pte = (pte_t *)memblock_alloc(PAGE_SIZE, PAGE_SIZE);
	if (!pte)
		return NULL;
	memset64((u64 *)pte, _PAGE_INVALID, PTRS_PER_PTE);
	__arch_set_page_dat(pte, 1);
	return pte;
}

/* Give a page table of init_mm back through page_table_free(). */
static void vmem_pte_free(unsigned long *table)
{
	page_table_free(&init_mm, table);
}

/* Fill byte written into vmemmap ranges that are currently not in use. */
#define PAGE_UNUSED 0xFD

/*
 * The unused vmemmap range, which was not yet memset(PAGE_UNUSED), ranges
 * from unused_sub_pmd_start to next PMD_SIZE boundary.
 * A value of 0 means that no such range is pending.
 */
static unsigned long unused_sub_pmd_start;

/*
 * Write PAGE_UNUSED into the pending range, i.e. from unused_sub_pmd_start
 * up to the next PMD_SIZE boundary, and forget about it. Does nothing if no
 * range is pending.
 */
static void vmemmap_flush_unused_sub_pmd(void)
{
	if (!unused_sub_pmd_start)
		return;
	memset((void *)unused_sub_pmd_start, PAGE_UNUSED,
	       ALIGN(unused_sub_pmd_start, PMD_SIZE) - unused_sub_pmd_start);
	unused_sub_pmd_start = 0;
}

/*
 * Mark the vmemmap range beginning at @start as used by zeroing its first
 * struct page. @end is not evaluated here.
 */
static void vmemmap_mark_sub_pmd_used(unsigned long start, unsigned long end)
{
	/*
	 * As we expect to add in the same granularity as we remove, it's
	 * sufficient to mark only some piece used to block the memmap page from
	 * getting removed (just in case the memmap never gets initialized,
	 * e.g., because the memory block never gets onlined).
	 */
	memset((void *)start, 0, sizeof(struct page));
}

/*
 * Account [start, end) as used within a large frame that is already mapped.
 *
 * If the range starts exactly at the pending unused range, only the
 * bookkeeping is advanced and no memory is written. Otherwise the pending
 * range is flushed and the new range is marked as used.
 */
static void vmemmap_use_sub_pmd(unsigned long start, unsigned long end)
{
	/*
	 * We only optimize if the new used range directly follows the
	 * previously unused range (esp., when populating consecutive sections).
	 */
	if (unused_sub_pmd_start == start) {
		unused_sub_pmd_start = end;
		/* Reached the PMD boundary: nothing is left pending. */
		if (likely(IS_ALIGNED(unused_sub_pmd_start, PMD_SIZE)))
			unused_sub_pmd_start = 0;
		return;
	}
	vmemmap_flush_unused_sub_pmd();
	vmemmap_mark_sub_pmd_used(start, end);
}

/*
 * Account [start, end) as used within a large frame that was just mapped
 * and is only partially covered by the range.
 *
 * The part of the frame in front of @start is filled with PAGE_UNUSED right
 * away; the part behind @end is only remembered in unused_sub_pmd_start.
 */
static void vmemmap_use_new_sub_pmd(unsigned long start, unsigned long end)
{
	unsigned long page = ALIGN_DOWN(start, PMD_SIZE);

	vmemmap_flush_unused_sub_pmd();

	/* Could be our memmap page is filled with PAGE_UNUSED already ... */
	vmemmap_mark_sub_pmd_used(start, end);

	/* Mark the unused parts of the new memmap page PAGE_UNUSED. */
	if (!IS_ALIGNED(start, PMD_SIZE))
		memset((void *)page, PAGE_UNUSED, start - page);
	/*
	 * We want to avoid memset(PAGE_UNUSED) when populating the vmemmap of
	 * consecutive sections. Remember for the last added PMD the last
	 * unused range in the populated PMD.
	 */
	if (!IS_ALIGNED(end, PMD_SIZE))
		unused_sub_pmd_start = end;
}

/*
 * Mark [start, end) as unused by filling it with PAGE_UNUSED, after flushing
 * any pending unused range.
 *
 * Returns true if the PMD is completely unused and can be freed, i.e. if
 * all PMD_SIZE bytes of the enclosing frame now contain PAGE_UNUSED.
 */
static bool vmemmap_unuse_sub_pmd(unsigned long start, unsigned long end)
{
	unsigned long page = ALIGN_DOWN(start, PMD_SIZE);

	vmemmap_flush_unused_sub_pmd();
	memset((void *)start, PAGE_UNUSED, end - start);
	return !memchr_inv((void *)page, PAGE_UNUSED, PMD_SIZE);
}

/*
 * Add or remove the 4K mappings for [addr, end) in the page table that
 * @pmd points to.
 *
 * @add:    true to create missing entries, false to remove existing ones.
 * @direct: true for the 1:1 mapping (an entry maps __pa(addr) itself and
 *          PG_DIRECT_MAP_4K is updated), false for vmemmap (an entry maps a
 *          newly allocated block, which is freed again on removal).
 * @altmap: passed through to the vmemmap block allocation and to
 *          vmem_free_pages().
 *
 * Entries that are already in the requested state are skipped and not
 * counted. Returns 0 on success or -ENOMEM if a vmemmap block could not be
 * allocated; entries handled before the failure stay in place.
 *
 * __ref: we'll only call vmemmap_alloc_block() via vmemmap_populate()
 */
static int __ref modify_pte_table(pmd_t *pmd, unsigned long addr,
				  unsigned long end, bool add, bool direct,
				  struct vmem_altmap *altmap)
{
	unsigned long prot, pages = 0;
	int ret = -ENOMEM;
	pte_t *pte;

	prot = pgprot_val(PAGE_KERNEL);
	pte = pte_offset_kernel(pmd, addr);
	for (; addr < end; addr += PAGE_SIZE, pte++) {
		if (!add) {
			if (pte_none(*pte))
				continue;
			/* vmemmap: free the backing page before dropping the entry */
			if (!direct)
				vmem_free_pages((unsigned long)pfn_to_virt(pte_pfn(*pte)), get_order(PAGE_SIZE), altmap);
			pte_clear(&init_mm, addr, pte);
		} else if (pte_none(*pte)) {
			if (!direct) {
				void *new_page = vmemmap_alloc_block_buf(PAGE_SIZE, NUMA_NO_NODE, altmap);

				if (!new_page)
					goto out;
				set_pte(pte, __pte(__pa(new_page) | prot));
			} else {
				set_pte(pte, __pte(__pa(addr) | prot));
			}
		} else {
			/* Already mapped, nothing to add. */
			continue;
		}
		pages++;
	}
	ret = 0;
out:
	/*
	 * NOTE(review): pages is unsigned, so -pages wraps around; this
	 * presumably relies on update_page_count() taking a signed delta -
	 * confirm.
	 */
	if (direct)
		update_page_count(PG_DIRECT_MAP_4K, add ? pages : -pages);
	return ret;
}

/*
 * Free the page table that @pmd points to and clear the pmd entry, but only
 * if none of its PTRS_PER_PTE entries, looked up starting at @start, is
 * still in use.
 */
static void try_free_pte_table(pmd_t *pmd, unsigned long start)
{
	pte_t *pte;
	int i;

	/* We can safely assume this is fully in 1:1 mapping & vmemmap area */
	pte = pte_offset_kernel(pmd, start);
	for (i = 0; i < PTRS_PER_PTE; i++, pte++) {
		if (!pte_none(*pte))
			return;
	}
	vmem_pte_free((unsigned long *) pmd_deref(*pmd));
	pmd_clear(pmd);
}

/*
 * Add or remove the mappings for [addr, end) in the segment table that
 * @pud points to. @add, @direct and @altmap have the same meaning as for
 * modify_pte_table().
 *
 * Removal:
 *  - a leaf entry fully covered by the range is cleared (for vmemmap the
 *    backing frame is freed first),
 *  - a partially covered vmemmap leaf entry is only freed and cleared once
 *    vmemmap_unuse_sub_pmd() reports the whole frame as unused,
 *  - otherwise the walk descends into the page table, which is freed
 *    afterwards if it became empty.
 *
 * Addition:
 *  - an empty entry of the 1:1 mapping becomes a large mapping if the range
 *    covers the whole segment, cpu_has_edat1() is true and
 *    debug_pagealloc_enabled() is false,
 *  - an empty vmemmap entry becomes a large frame if cpu_has_edat1() is
 *    true and the PMD_SIZE allocation succeeds,
 *  - in all remaining cases a page table is allocated and filled by
 *    modify_pte_table(),
 *  - an existing leaf entry is left alone; for vmemmap the range is
 *    recorded as used.
 *
 * Returns 0 on success, -ENOMEM if a page table could not be allocated, or
 * the error returned by modify_pte_table(). For @direct the number of large
 * mappings added or removed is reported as PG_DIRECT_MAP_1M.
 *
 * __ref: we'll only call vmemmap_alloc_block() via vmemmap_populate()
 */
static int __ref modify_pmd_table(pud_t *pud, unsigned long addr,
				  unsigned long end, bool add, bool direct,
				  struct vmem_altmap *altmap)
{
	unsigned long next, prot, pages = 0;
	int ret = -ENOMEM;
	pmd_t *pmd;
	pte_t *pte;

	prot = pgprot_val(SEGMENT_KERNEL);
	pmd = pmd_offset(pud, addr);
	for (; addr < end; addr = next, pmd++) {
		next = pmd_addr_end(addr, end);
		if (!add) {
			if (pmd_none(*pmd))
				continue;
			if (pmd_leaf(*pmd)) {
				if (IS_ALIGNED(addr, PMD_SIZE) &&
				    IS_ALIGNED(next, PMD_SIZE)) {
					if (!direct)
						vmem_free_pages(pmd_deref(*pmd), get_order(PMD_SIZE), altmap);
					pmd_clear(pmd);
					pages++;
				} else if (!direct && vmemmap_unuse_sub_pmd(addr, next)) {
					vmem_free_pages(pmd_deref(*pmd), get_order(PMD_SIZE), altmap);
					pmd_clear(pmd);
				}
				continue;
			}
		} else if (pmd_none(*pmd)) {
			if (IS_ALIGNED(addr, PMD_SIZE) &&
			    IS_ALIGNED(next, PMD_SIZE) &&
			    cpu_has_edat1() && direct &&
			    !debug_pagealloc_enabled()) {
				set_pmd(pmd, __pmd(__pa(addr) | prot));
				pages++;
				continue;
			} else if (!direct && cpu_has_edat1()) {
				void *new_page;

				/*
				 * Use 1MB frames for vmemmap if available. We
				 * always use large frames even if they are only
				 * partially used. Otherwise we would have also
				 * page tables since vmemmap_populate gets
				 * called for each section separately.
				 */
				new_page = vmemmap_alloc_block_buf(PMD_SIZE, NUMA_NO_NODE, altmap);
				if (new_page) {
					set_pmd(pmd, __pmd(__pa(new_page) | prot));
					if (!IS_ALIGNED(addr, PMD_SIZE) ||
					    !IS_ALIGNED(next, PMD_SIZE)) {
						vmemmap_use_new_sub_pmd(addr, next);
					}
					continue;
				}
				/* No large frame available: fall back to a page table. */
			}
			pte = vmem_pte_alloc();
			if (!pte)
				goto out;
			pmd_populate(&init_mm, pmd, pte);
		} else if (pmd_leaf(*pmd)) {
			if (!direct)
				vmemmap_use_sub_pmd(addr, next);
			continue;
		}
		ret = modify_pte_table(pmd, addr, next, add, direct, altmap);
		if (ret)
			goto out;
		if (!add)
			try_free_pte_table(pmd, addr & PMD_MASK);
	}
	ret = 0;
out:
	if (direct)
		update_page_count(PG_DIRECT_MAP_1M, add ? pages : -pages);
	return ret;
}

/*
 * Free the segment table that @pud points to and clear the pud entry, but
 * only if none of its PTRS_PER_PMD entries, looked up starting at @start,
 * is still in use.
 */
static void try_free_pmd_table(pud_t *pud, unsigned long start)
{
	pmd_t *pmd;
	int i;

	pmd = pmd_offset(pud, start);
	for (i = 0; i < PTRS_PER_PMD; i++, pmd++)
		if (!pmd_none(*pmd))
			return;
	vmem_free_pages(pud_deref(*pud), CRST_ALLOC_ORDER, NULL);
	pud_clear(pud);
}

/*
 * Add or remove the mappings for [addr, end) in the region-third table that
 * @p4d points to. @add, @direct and @altmap have the same meaning as for
 * modify_pte_table().
 *
 * Removal: a leaf entry fully covered by the range is cleared (for !direct
 * the backing memory is freed first). A partially covered leaf entry is
 * split with split_pud_page() and the walk then descends into the resulting
 * segment table, which is freed afterwards if it became empty.
 *
 * Addition: an empty entry of the 1:1 mapping becomes a large mapping if the
 * range covers the whole region, cpu_has_edat2() is true and
 * debug_pagealloc_enabled() is false; otherwise a segment table is allocated
 * and filled by modify_pmd_table(). Existing leaf entries are skipped.
 *
 * Returns 0 on success, -ENOMEM if a segment table could not be allocated,
 * or the error returned by modify_pmd_table(). For @direct the number of
 * large mappings added or removed is reported as PG_DIRECT_MAP_2G.
 */
static int modify_pud_table(p4d_t *p4d, unsigned long addr, unsigned long end,
			    bool add, bool direct, struct vmem_altmap *altmap)
{
	unsigned long next, prot, pages = 0;
	int ret = -ENOMEM;
	pud_t *pud;
	pmd_t *pmd;

	prot = pgprot_val(REGION3_KERNEL);
	pud = pud_offset(p4d, addr);
	for (; addr < end; addr = next, pud++) {
		next = pud_addr_end(addr, end);
		if (!add) {
			if (pud_none(*pud))
				continue;
			if (pud_leaf(*pud)) {
				if (IS_ALIGNED(addr, PUD_SIZE) &&
				    IS_ALIGNED(next, PUD_SIZE)) {
					if (!direct)
						vmem_free_pages(pud_deref(*pud), get_order(PUD_SIZE), altmap);
					pud_clear(pud);
					pages++;
					continue;
				} else {
					/* Partial removal: split, then descend below. */
					split_pud_page(pud, addr & PUD_MASK);
				}
			}
		} else if (pud_none(*pud)) {
			if (IS_ALIGNED(addr, PUD_SIZE) &&
			    IS_ALIGNED(next, PUD_SIZE) &&
			    cpu_has_edat2() && direct &&
			    !debug_pagealloc_enabled()) {
				set_pud(pud, __pud(__pa(addr) | prot));
				pages++;
				continue;
			}
			pmd = vmem_crst_alloc(_SEGMENT_ENTRY_EMPTY);
			if (!pmd)
				goto out;
			pud_populate(&init_mm, pud, pmd);
		} else if (pud_leaf(*pud)) {
			continue;
		}
		ret = modify_pmd_table(pud, addr, next, add, direct, altmap);
		if (ret)
			goto out;
		if (!add)
			try_free_pmd_table(pud, addr & PUD_MASK);
	}
	ret = 0;
out:
	if (direct)
		update_page_count(PG_DIRECT_MAP_2G, add ? pages : -pages);
	return ret;
}

/*
 * Free the region-third table that @p4d points to and clear the p4d entry,
 * but only if none of its PTRS_PER_PUD entries, looked up starting at
 * @start, is still in use.
 */
static void try_free_pud_table(p4d_t *p4d, unsigned long start)
{
	pud_t *pud;
	int i;

	pud = pud_offset(p4d, start);
	for (i = 0; i < PTRS_PER_PUD; i++, pud++) {
		if (!pud_none(*pud))
			return;
	}
	vmem_free_pages(p4d_deref(*p4d), CRST_ALLOC_ORDER, NULL);
	p4d_clear(p4d);
}

/*
 * Add or remove the mappings for [addr, end) in the region-second table
 * that @pgd points to.
 *
 * Empty entries are skipped on removal and get a newly allocated
 * region-third table on addition; the actual work is done by
 * modify_pud_table(). After a removal the lower-level table is freed if it
 * became empty.
 *
 * Returns 0 on success, -ENOMEM if a table could not be allocated, or the
 * error returned by modify_pud_table().
 */
static int modify_p4d_table(pgd_t *pgd, unsigned long addr, unsigned long end,
			    bool add, bool direct, struct vmem_altmap *altmap)
{
	unsigned long next;
	int ret = -ENOMEM;
	p4d_t *p4d;
	pud_t *pud;

	p4d = p4d_offset(pgd, addr);
	for (; addr < end; addr = next, p4d++) {
		next = p4d_addr_end(addr, end);
		if (!add) {
			if (p4d_none(*p4d))
				continue;
		} else if (p4d_none(*p4d)) {
			pud = vmem_crst_alloc(_REGION3_ENTRY_EMPTY);
			if (!pud)
				goto out;
			p4d_populate(&init_mm, p4d, pud);
		}
		ret = modify_pud_table(p4d, addr, next, add, direct, altmap);
		if (ret)
			goto out;
		if (!add)
			try_free_pud_table(p4d, addr & P4D_MASK);
	}
	ret = 0;
out:
	return ret;
}

/*
 * Free the region-second table that @pgd points to and clear the pgd entry,
 * but only if none of its PTRS_PER_P4D entries, looked up starting at
 * @start, is still in use.
 */
static void try_free_p4d_table(pgd_t *pgd, unsigned long start)
{
	p4d_t *p4d;
	int i;

	p4d = p4d_offset(pgd, start);
	for (i = 0; i < PTRS_PER_P4D; i++, p4d++) {
		if (!p4d_none(*p4d))
			return;
	}
	vmem_free_pages(pgd_deref(*pgd), CRST_ALLOC_ORDER, NULL);
	pgd_clear(pgd);
}

/*
 * Add or remove kernel page-table mappings for the virtual range
 * [start, end).
 *
 * @add:    true to add mappings, false to remove them.
 * @direct: true for the 1:1 mapping, false for vmemmap; see
 *          modify_pte_table().
 * @altmap: passed down to the allocation and free helpers.
 *
 * Returns -EINVAL (with a one-time warning) if @start or @end is not page
 * aligned or if the range is outside of the supported area, -ENOMEM if a
 * table could not be allocated, the error of a lower level, or 0 on
 * success. On removal the kernel TLB is flushed for the whole range, also
 * when the walk stopped early because of an error.
 */
static int modify_pagetable(unsigned long start, unsigned long end, bool add,
			    bool direct, struct vmem_altmap *altmap)
{
	unsigned long addr, next;
	int ret = -ENOMEM;
	pgd_t *pgd;
	p4d_t *p4d;

	if (WARN_ON_ONCE(!PAGE_ALIGNED(start | end)))
		return -EINVAL;
	/* Don't mess with any tables not fully in 1:1 mapping, vmemmap & kasan area */
#ifdef CONFIG_KASAN
	if (WARN_ON_ONCE(!(start >= KASAN_SHADOW_START && end <= KASAN_SHADOW_END) &&
			 end > __abs_lowcore))
		return -EINVAL;
#else
	if (WARN_ON_ONCE(end > __abs_lowcore))
		return -EINVAL;
#endif
	for (addr = start; addr < end; addr = next) {
		next = pgd_addr_end(addr, end);
		pgd = pgd_offset_k(addr);

		if (!add) {
			if (pgd_none(*pgd))
				continue;
		} else if (pgd_none(*pgd)) {
			p4d = vmem_crst_alloc(_REGION2_ENTRY_EMPTY);
			if (!p4d)
				goto out;
			pgd_populate(&init_mm, pgd, p4d);
		}
		ret = modify_p4d_table(pgd, addr, next, add, direct, altmap);
		if (ret)
			goto out;
		if (!add)
			try_free_p4d_table(pgd, addr & PGDIR_MASK);
	}
	ret = 0;
out:
	if (!add)
		flush_tlb_kernel_range(start, end);
	return ret;
}

/* Add mappings for the virtual range [start, end); see modify_pagetable(). */
static int add_pagetable(unsigned long start, unsigned long end, bool direct,
			 struct vmem_altmap *altmap)
{
	return modify_pagetable(start, end, true, direct, altmap);
}

/* Remove mappings for the virtual range [start, end); see modify_pagetable(). */
static int remove_pagetable(unsigned long start, unsigned long end, bool direct,
			    struct vmem_altmap *altmap)
{
	return modify_pagetable(start, end, false, direct, altmap);
}

/*
 * Add a physical memory range to the 1:1 mapping.
 * @start is a physical address and is converted with __va() first.
 */
static int vmem_add_range(unsigned long start, unsigned long size)
{
	start = (unsigned long)__va(start);
	return add_pagetable(start, start + size, true, NULL);
}

/*
 * Remove a physical memory range from the 1:1 mapping.
 * @start is a physical address and is converted with __va() first. The
 * return value of remove_pagetable() is not evaluated.
 */
static void vmem_remove_range(unsigned long start, unsigned long size)
{
	start = (unsigned long)__va(start);
	remove_pagetable(start, start + size, true, NULL);
}

/*
 * Add a backed mem_map array to the virtual mem_map array.
 *
 * If populating [start, end) fails, whatever was set up for the range is
 * removed again before the error is returned. @node is not used.
 */
int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node,
			       struct vmem_altmap *altmap)
{
	int ret;

	mutex_lock(&vmem_mutex);
	/* We don't care about the node, just use NUMA_NO_NODE on allocations */
	ret = add_pagetable(start, end, false, altmap);
	if (ret)
		remove_pagetable(start, end, false, altmap);
	mutex_unlock(&vmem_mutex);
	return ret;
}

#ifdef CONFIG_MEMORY_HOTPLUG

/* Remove the vmemmap mappings for [start, end) and free their backing memory. */
void vmemmap_free(unsigned long start, unsigned long end,
		  struct vmem_altmap *altmap)
{
	mutex_lock(&vmem_mutex);
	remove_pagetable(start, end, false, altmap);
	mutex_unlock(&vmem_mutex);
}

#endif

/*
 * Remove the physical range starting at @start with length @size from the
 * 1:1 mapping, serialized by vmem_mutex.
 */
void vmem_remove_mapping(unsigned long start, unsigned long size)
{
	mutex_lock(&vmem_mutex);
	vmem_remove_range(start, size);
	mutex_unlock(&vmem_mutex);
}

/* Return the inclusive range [0, max_mappable - 1]. */
struct range arch_get_mappable_range(void)
{
	struct range mhp_range;

	mhp_range.start = 0;
	mhp_range.end = max_mappable - 1;
	return mhp_range;
}

/*
 * Add the physical range starting at @start with length @size to the 1:1
 * mapping.
 *
 * Returns -ERANGE if the range is not within arch_get_mappable_range() or
 * if start + size wraps around. If adding the mapping fails, the range is
 * removed again and the error of vmem_add_range() is returned. Returns 0 on
 * success.
 */
int vmem_add_mapping(unsigned long start, unsigned long size)
{
	struct range range = arch_get_mappable_range();
	int ret;

	if (start < range.start ||
	    start + size > range.end + 1 ||
	    start + size < start)
		return -ERANGE;

	mutex_lock(&vmem_mutex);
	ret = vmem_add_range(start, size);
	if (ret)
		vmem_remove_range(start, size);
	mutex_unlock(&vmem_mutex);
	return ret;
}

/*
 * Allocate new or return existing page-table entry, but do not map it
 * to any physical address. If missing, allocate segment- and region-
 * table entries along. Meeting a large segment- or region-table entry
 * while traversing is an error, since the function is expected to be
 * called against virtual regions reserved for 4KB mappings only.
 *
 * With @alloc == false nothing is allocated. NULL is returned if a table is
 * missing and @alloc is false, if an allocation fails, or if a large entry
 * is found (the latter also triggers a one-time warning).
 */
pte_t *vmem_get_alloc_pte(unsigned long addr, bool alloc)
{
	pte_t *ptep = NULL;
	pgd_t *pgd;
	p4d_t *p4d;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	pgd = pgd_offset_k(addr);
	if (pgd_none(*pgd)) {
		if (!alloc)
			goto out;
		p4d = vmem_crst_alloc(_REGION2_ENTRY_EMPTY);
		if (!p4d)
			goto out;
		pgd_populate(&init_mm, pgd, p4d);
	}
	p4d = p4d_offset(pgd, addr);
	if (p4d_none(*p4d)) {
		if (!alloc)
			goto out;
		pud = vmem_crst_alloc(_REGION3_ENTRY_EMPTY);
		if (!pud)
			goto out;
		p4d_populate(&init_mm, p4d, pud);
	}
	pud = pud_offset(p4d, addr);
	if (pud_none(*pud)) {
		if (!alloc)
			goto out;
		pmd = vmem_crst_alloc(_SEGMENT_ENTRY_EMPTY);
		if (!pmd)
			goto out;
		pud_populate(&init_mm, pud, pmd);
	} else if (WARN_ON_ONCE(pud_leaf(*pud))) {
		goto out;
	}
	pmd = pmd_offset(pud, addr);
	if (pmd_none(*pmd)) {
		if (!alloc)
			goto out;
		pte = vmem_pte_alloc();
		if (!pte)
			goto out;
		pmd_populate(&init_mm, pmd, pte);
	} else if (WARN_ON_ONCE(pmd_leaf(*pmd))) {
		goto out;
	}
	ptep = pte_offset_kernel(pmd, addr);
out:
	return ptep;
}

/*
 * Map the 4K page at virtual address @addr to physical address @phys with
 * protection @prot. Does not take vmem_mutex; vmem_map_4k_page() is the
 * locked variant.
 *
 * @alloc: whether missing page tables may be allocated.
 *
 * The entry is invalidated with __ptep_ipte() before the new value is
 * written. Returns -EINVAL if @addr is not page aligned, -ENOMEM if no
 * page-table entry could be obtained, 0 on success.
 */
int __vmem_map_4k_page(unsigned long addr, unsigned long phys, pgprot_t prot, bool alloc)
{
	pte_t *ptep, pte;

	if (!IS_ALIGNED(addr, PAGE_SIZE))
		return -EINVAL;
	ptep = vmem_get_alloc_pte(addr, alloc);
	if (!ptep)
		return -ENOMEM;
	__ptep_ipte(addr, ptep, 0, 0, IPTE_GLOBAL);
	pte = mk_pte_phys(phys, prot);
	set_pte(ptep, pte);
	return 0;
}

/*
 * Locked variant of __vmem_map_4k_page() which allocates missing page
 * tables. Returns the result of __vmem_map_4k_page().
 */
int vmem_map_4k_page(unsigned long addr, unsigned long phys, pgprot_t prot)
{
	int rc;

	mutex_lock(&vmem_mutex);
	rc = __vmem_map_4k_page(addr, phys, prot, true);
	mutex_unlock(&vmem_mutex);
	return rc;
}

/*
 * Invalidate and clear the page-table entry that maps @addr.
 *
 * NOTE(review): the result of virt_to_kpte() is used without a check;
 * presumably callers only pass addresses which were mapped with
 * vmem_map_4k_page() before - confirm.
 */
void vmem_unmap_4k_page(unsigned long addr)
{
	pte_t *ptep;

	mutex_lock(&vmem_mutex);
	ptep = virt_to_kpte(addr);
	__ptep_ipte(addr, ptep, 0, 0, IPTE_GLOBAL);
	pte_clear(&init_mm, addr, ptep);
	mutex_unlock(&vmem_mutex);
}

/*
 * Apply the memory protection for the kernel image at boot:
 * __set_memory_rox() for _stext.._etext and for the amode31 text section,
 * __set_memory_ro() for _etext..__end_rodata. With debug_pagealloc enabled
 * the range from __va(0) up to ident_map_size is passed to
 * __set_memory_4k(). Finally the size of _stext..__end_rodata is logged in
 * KiB.
 */
void __init vmem_map_init(void)
{
	__set_memory_rox(_stext, _etext);
	__set_memory_ro(_etext, __end_rodata);
	__set_memory_rox(__stext_amode31, __etext_amode31);
	/*
	 * If the BEAR-enhancement facility is not installed the first
	 * prefix page is used to return to the previous context with
	 * an LPSWE instruction and therefore must be executable.
	 */
	if (!cpu_has_bear())
		set_memory_x(0, 1);
	if (debug_pagealloc_enabled())
		__set_memory_4k(__va(0), absolute_pointer(__va(0)) + ident_map_size);
	pr_info("Write protected kernel read-only data: %luk\n",
		(unsigned long)(__end_rodata - _stext) >> 10);
}