// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright IBM Corp. 2006
 */

#include <linux/memory_hotplug.h>
#include <linux/memblock.h>
#include <linux/pfn.h>
#include <linux/mm.h>
#include <linux/init.h>
#include <linux/list.h>
#include <linux/hugetlb.h>
#include <linux/slab.h>
#include <linux/sort.h>
#include <asm/cacheflush.h>
#include <asm/nospec-branch.h>
#include <asm/pgalloc.h>
#include <asm/setup.h>
#include <asm/tlbflush.h>
#include <asm/sections.h>
#include <asm/set_memory.h>

static DEFINE_MUTEX(vmem_mutex);

static void __ref *vmem_alloc_pages(unsigned int order)
{
	unsigned long size = PAGE_SIZE << order;

	if (slab_is_available())
		return (void *)__get_free_pages(GFP_KERNEL, order);
	return memblock_alloc(size, size);
}

static void vmem_free_pages(unsigned long addr, int order)
{
	/* We don't expect boot memory to be removed ever. */
	if (!slab_is_available() ||
	    WARN_ON_ONCE(PageReserved(virt_to_page((void *)addr))))
		return;
	free_pages(addr, order);
}

void *vmem_crst_alloc(unsigned long val)
{
	unsigned long *table;

	table = vmem_alloc_pages(CRST_ALLOC_ORDER);
	if (table)
		crst_table_init(table, val);
	return table;
}

pte_t __ref *vmem_pte_alloc(void)
{
	unsigned long size = PTRS_PER_PTE * sizeof(pte_t);
	pte_t *pte;

	if (slab_is_available())
		pte = (pte_t *) page_table_alloc(&init_mm);
	else
		pte = (pte_t *) memblock_alloc(size, size);
	if (!pte)
		return NULL;
	memset64((u64 *)pte, _PAGE_INVALID, PTRS_PER_PTE);
	return pte;
}

static void vmem_pte_free(unsigned long *table)
{
	/* We don't expect boot memory to be removed ever. */
	if (!slab_is_available() ||
	    WARN_ON_ONCE(PageReserved(virt_to_page(table))))
		return;
	page_table_free(&init_mm, table);
}

#define PAGE_UNUSED 0xFD

/*
 * The unused vmemmap range, which was not yet memset(PAGE_UNUSED), ranges
 * from unused_sub_pmd_start to the next PMD_SIZE boundary.
 */
static unsigned long unused_sub_pmd_start;

static void vmemmap_flush_unused_sub_pmd(void)
{
	if (!unused_sub_pmd_start)
		return;
	memset((void *)unused_sub_pmd_start, PAGE_UNUSED,
	       ALIGN(unused_sub_pmd_start, PMD_SIZE) - unused_sub_pmd_start);
	unused_sub_pmd_start = 0;
}

static void vmemmap_mark_sub_pmd_used(unsigned long start, unsigned long end)
{
	/*
	 * As we expect to add in the same granularity as we remove, it's
	 * sufficient to mark only some piece used to block the memmap page from
	 * getting removed (just in case the memmap never gets initialized,
	 * e.g., because the memory block never gets onlined).
	 */
	memset((void *)start, 0, sizeof(struct page));
}

static void vmemmap_use_sub_pmd(unsigned long start, unsigned long end)
{
	/*
	 * We only optimize if the new used range directly follows the
	 * previously unused range (esp., when populating consecutive sections).
	 */
	if (unused_sub_pmd_start == start) {
		unused_sub_pmd_start = end;
		if (likely(IS_ALIGNED(unused_sub_pmd_start, PMD_SIZE)))
			unused_sub_pmd_start = 0;
		return;
	}
	vmemmap_flush_unused_sub_pmd();
	vmemmap_mark_sub_pmd_used(start, end);
}

static void vmemmap_use_new_sub_pmd(unsigned long start, unsigned long end)
{
	unsigned long page = ALIGN_DOWN(start, PMD_SIZE);

	vmemmap_flush_unused_sub_pmd();

	/* Could be our memmap page is filled with PAGE_UNUSED already ... */
	vmemmap_mark_sub_pmd_used(start, end);

	/* Mark the unused parts of the new memmap page PAGE_UNUSED. */
	if (!IS_ALIGNED(start, PMD_SIZE))
		memset((void *)page, PAGE_UNUSED, start - page);
	/*
	 * We want to avoid memset(PAGE_UNUSED) when populating the vmemmap of
	 * consecutive sections. Remember for the last added PMD the last
	 * unused range in the populated PMD.
	 */
	if (!IS_ALIGNED(end, PMD_SIZE))
		unused_sub_pmd_start = end;
}

/* Returns true if the PMD is completely unused and can be freed. */
static bool vmemmap_unuse_sub_pmd(unsigned long start, unsigned long end)
{
	unsigned long page = ALIGN_DOWN(start, PMD_SIZE);

	vmemmap_flush_unused_sub_pmd();
	memset((void *)start, PAGE_UNUSED, end - start);
	return !memchr_inv((void *)page, PAGE_UNUSED, PMD_SIZE);
}
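
/*
 * Illustrative sketch only, not derived from a real call trace: when the
 * vmemmap ranges [A, B) and [B, C) of two consecutive memory sections fall
 * into the same 1 MB PMD and are populated one after the other, the helpers
 * above avoid redundant PAGE_UNUSED memsets:
 *
 *	vmemmap_use_new_sub_pmd(A, B);	// fresh PMD: mark [A, B) used, fill
 *					// the unused head before A (if any)
 *					// with PAGE_UNUSED and remember B in
 *					// unused_sub_pmd_start
 *	vmemmap_use_sub_pmd(B, C);	// starts exactly at the remembered
 *					// tail, so only advance it to C (or
 *					// clear it when C hits the boundary)
 *
 * Only when the next used range is not consecutive is the remembered tail
 * filled with PAGE_UNUSED by vmemmap_flush_unused_sub_pmd().
 */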

/* __ref: we'll only call vmemmap_alloc_block() via vmemmap_populate() */
static int __ref modify_pte_table(pmd_t *pmd, unsigned long addr,
				  unsigned long end, bool add, bool direct)
{
	unsigned long prot, pages = 0;
	int ret = -ENOMEM;
	pte_t *pte;

	prot = pgprot_val(PAGE_KERNEL);
	if (!MACHINE_HAS_NX)
		prot &= ~_PAGE_NOEXEC;

	pte = pte_offset_kernel(pmd, addr);
	for (; addr < end; addr += PAGE_SIZE, pte++) {
		if (!add) {
			if (pte_none(*pte))
				continue;
			if (!direct)
				vmem_free_pages((unsigned long) pfn_to_virt(pte_pfn(*pte)), 0);
			pte_clear(&init_mm, addr, pte);
		} else if (pte_none(*pte)) {
			if (!direct) {
				void *new_page = vmemmap_alloc_block(PAGE_SIZE, NUMA_NO_NODE);

				if (!new_page)
					goto out;
				set_pte(pte, __pte(__pa(new_page) | prot));
			} else {
				set_pte(pte, __pte(__pa(addr) | prot));
			}
		} else {
			continue;
		}
		pages++;
	}
	ret = 0;
out:
	if (direct)
		update_page_count(PG_DIRECT_MAP_4K, add ? pages : -pages);
	return ret;
}

static void try_free_pte_table(pmd_t *pmd, unsigned long start)
{
	pte_t *pte;
	int i;

	/* We can safely assume this is fully in 1:1 mapping & vmemmap area */
	pte = pte_offset_kernel(pmd, start);
	for (i = 0; i < PTRS_PER_PTE; i++, pte++) {
		if (!pte_none(*pte))
			return;
	}
	vmem_pte_free((unsigned long *) pmd_deref(*pmd));
	pmd_clear(pmd);
}
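
/*
 * Note on the table-freeing helpers: try_free_pte_table() above and the
 * try_free_{pmd,pud,p4d}_table() helpers below only release a lower-level
 * table once every entry in it is none. All of them are reached via
 * modify_pagetable(), whose callers in this file hold vmem_mutex, so the
 * emptiness scan and the subsequent free are serialized.
 */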

/* __ref: we'll only call vmemmap_alloc_block() via vmemmap_populate() */
static int __ref modify_pmd_table(pud_t *pud, unsigned long addr,
				  unsigned long end, bool add, bool direct)
{
	unsigned long next, prot, pages = 0;
	int ret = -ENOMEM;
	pmd_t *pmd;
	pte_t *pte;

	prot = pgprot_val(SEGMENT_KERNEL);
	if (!MACHINE_HAS_NX)
		prot &= ~_SEGMENT_ENTRY_NOEXEC;

	pmd = pmd_offset(pud, addr);
	for (; addr < end; addr = next, pmd++) {
		next = pmd_addr_end(addr, end);
		if (!add) {
			if (pmd_none(*pmd))
				continue;
			if (pmd_large(*pmd)) {
				if (IS_ALIGNED(addr, PMD_SIZE) &&
				    IS_ALIGNED(next, PMD_SIZE)) {
					if (!direct)
						vmem_free_pages(pmd_deref(*pmd), get_order(PMD_SIZE));
					pmd_clear(pmd);
					pages++;
				} else if (!direct && vmemmap_unuse_sub_pmd(addr, next)) {
					vmem_free_pages(pmd_deref(*pmd), get_order(PMD_SIZE));
					pmd_clear(pmd);
				}
				continue;
			}
		} else if (pmd_none(*pmd)) {
			if (IS_ALIGNED(addr, PMD_SIZE) &&
			    IS_ALIGNED(next, PMD_SIZE) &&
			    MACHINE_HAS_EDAT1 && direct &&
			    !debug_pagealloc_enabled()) {
				set_pmd(pmd, __pmd(__pa(addr) | prot));
				pages++;
				continue;
			} else if (!direct && MACHINE_HAS_EDAT1) {
				void *new_page;

				/*
				 * Use 1MB frames for vmemmap if available. We
				 * always use large frames, even if they are
				 * only partially used; otherwise we would also
				 * end up with page tables, since
				 * vmemmap_populate gets called for each
				 * section separately.
				 */
				new_page = vmemmap_alloc_block(PMD_SIZE, NUMA_NO_NODE);
				if (new_page) {
					set_pmd(pmd, __pmd(__pa(new_page) | prot));
					if (!IS_ALIGNED(addr, PMD_SIZE) ||
					    !IS_ALIGNED(next, PMD_SIZE)) {
						vmemmap_use_new_sub_pmd(addr, next);
					}
					continue;
				}
			}
			pte = vmem_pte_alloc();
			if (!pte)
				goto out;
			pmd_populate(&init_mm, pmd, pte);
		} else if (pmd_large(*pmd)) {
			if (!direct)
				vmemmap_use_sub_pmd(addr, next);
			continue;
		}
		ret = modify_pte_table(pmd, addr, next, add, direct);
		if (ret)
			goto out;
		if (!add)
			try_free_pte_table(pmd, addr & PMD_MASK);
	}
	ret = 0;
out:
	if (direct)
		update_page_count(PG_DIRECT_MAP_1M, add ? pages : -pages);
	return ret;
}

static void try_free_pmd_table(pud_t *pud, unsigned long start)
{
	pmd_t *pmd;
	int i;

	pmd = pmd_offset(pud, start);
	for (i = 0; i < PTRS_PER_PMD; i++, pmd++)
		if (!pmd_none(*pmd))
			return;
	vmem_free_pages(pud_deref(*pud), CRST_ALLOC_ORDER);
	pud_clear(pud);
}

static int modify_pud_table(p4d_t *p4d, unsigned long addr, unsigned long end,
			    bool add, bool direct)
{
	unsigned long next, prot, pages = 0;
	int ret = -ENOMEM;
	pud_t *pud;
	pmd_t *pmd;

	prot = pgprot_val(REGION3_KERNEL);
	if (!MACHINE_HAS_NX)
		prot &= ~_REGION_ENTRY_NOEXEC;
	pud = pud_offset(p4d, addr);
	for (; addr < end; addr = next, pud++) {
		next = pud_addr_end(addr, end);
		if (!add) {
			if (pud_none(*pud))
				continue;
			if (pud_large(*pud)) {
				if (IS_ALIGNED(addr, PUD_SIZE) &&
				    IS_ALIGNED(next, PUD_SIZE)) {
					pud_clear(pud);
					pages++;
				}
				continue;
			}
		} else if (pud_none(*pud)) {
			if (IS_ALIGNED(addr, PUD_SIZE) &&
			    IS_ALIGNED(next, PUD_SIZE) &&
			    MACHINE_HAS_EDAT2 && direct &&
			    !debug_pagealloc_enabled()) {
				set_pud(pud, __pud(__pa(addr) | prot));
				pages++;
				continue;
			}
			pmd = vmem_crst_alloc(_SEGMENT_ENTRY_EMPTY);
			if (!pmd)
				goto out;
			pud_populate(&init_mm, pud, pmd);
		} else if (pud_large(*pud)) {
			continue;
		}
		ret = modify_pmd_table(pud, addr, next, add, direct);
		if (ret)
			goto out;
		if (!add)
			try_free_pmd_table(pud, addr & PUD_MASK);
	}
	ret = 0;
out:
	if (direct)
		update_page_count(PG_DIRECT_MAP_2G, add ? pages : -pages);
	return ret;
}

static void try_free_pud_table(p4d_t *p4d, unsigned long start)
{
	pud_t *pud;
	int i;

	pud = pud_offset(p4d, start);
	for (i = 0; i < PTRS_PER_PUD; i++, pud++) {
		if (!pud_none(*pud))
			return;
	}
	vmem_free_pages(p4d_deref(*p4d), CRST_ALLOC_ORDER);
	p4d_clear(p4d);
}

static int modify_p4d_table(pgd_t *pgd, unsigned long addr, unsigned long end,
			    bool add, bool direct)
{
	unsigned long next;
	int ret = -ENOMEM;
	p4d_t *p4d;
	pud_t *pud;

	p4d = p4d_offset(pgd, addr);
	for (; addr < end; addr = next, p4d++) {
		next = p4d_addr_end(addr, end);
		if (!add) {
			if (p4d_none(*p4d))
				continue;
		} else if (p4d_none(*p4d)) {
			pud = vmem_crst_alloc(_REGION3_ENTRY_EMPTY);
			if (!pud)
				goto out;
			p4d_populate(&init_mm, p4d, pud);
		}
		ret = modify_pud_table(p4d, addr, next, add, direct);
		if (ret)
			goto out;
		if (!add)
			try_free_pud_table(p4d, addr & P4D_MASK);
	}
	ret = 0;
out:
	return ret;
}

static void try_free_p4d_table(pgd_t *pgd, unsigned long start)
{
	p4d_t *p4d;
	int i;

	p4d = p4d_offset(pgd, start);
	for (i = 0; i < PTRS_PER_P4D; i++, p4d++) {
		if (!p4d_none(*p4d))
			return;
	}
	vmem_free_pages(pgd_deref(*pgd), CRST_ALLOC_ORDER);
	pgd_clear(pgd);
}
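
/*
 * modify_pagetable() is the common worker for all mapping changes in this
 * file. It walks the kernel page tables from pgd down to pte level for the
 * page-aligned range [start, end), which must lie below VMALLOC_START.
 * With add set, missing tables and entries are populated; with add cleared,
 * entries are removed, tables that became completely empty are freed via
 * the try_free_*_table() helpers, and the range is flushed from the TLB.
 * The direct flag selects the 1:1 mapping (entries point at __pa(addr) and
 * the PG_DIRECT_MAP_* counters are updated) as opposed to the vmemmap,
 * where backing pages come from vmemmap_alloc_block() and partially used
 * 1 MB PMDs are tracked with PAGE_UNUSED.
 */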
static int modify_pagetable(unsigned long start, unsigned long end, bool add,
			    bool direct)
{
	unsigned long addr, next;
	int ret = -ENOMEM;
	pgd_t *pgd;
	p4d_t *p4d;

	if (WARN_ON_ONCE(!PAGE_ALIGNED(start | end)))
		return -EINVAL;
	/* Don't mess with any tables not fully in 1:1 mapping & vmemmap area */
	if (WARN_ON_ONCE(end > VMALLOC_START))
		return -EINVAL;
	for (addr = start; addr < end; addr = next) {
		next = pgd_addr_end(addr, end);
		pgd = pgd_offset_k(addr);

		if (!add) {
			if (pgd_none(*pgd))
				continue;
		} else if (pgd_none(*pgd)) {
			p4d = vmem_crst_alloc(_REGION2_ENTRY_EMPTY);
			if (!p4d)
				goto out;
			pgd_populate(&init_mm, pgd, p4d);
		}
		ret = modify_p4d_table(pgd, addr, next, add, direct);
		if (ret)
			goto out;
		if (!add)
			try_free_p4d_table(pgd, addr & PGDIR_MASK);
	}
	ret = 0;
out:
	if (!add)
		flush_tlb_kernel_range(start, end);
	return ret;
}

static int add_pagetable(unsigned long start, unsigned long end, bool direct)
{
	return modify_pagetable(start, end, true, direct);
}

static int remove_pagetable(unsigned long start, unsigned long end, bool direct)
{
	return modify_pagetable(start, end, false, direct);
}

/*
 * Add a physical memory range to the 1:1 mapping.
 */
static int vmem_add_range(unsigned long start, unsigned long size)
{
	start = (unsigned long)__va(start);
	return add_pagetable(start, start + size, true);
}

/*
 * Remove a physical memory range from the 1:1 mapping.
 */
static void vmem_remove_range(unsigned long start, unsigned long size)
{
	start = (unsigned long)__va(start);
	remove_pagetable(start, start + size, true);
}

/*
 * Add a backed mem_map array to the virtual mem_map array.
 */
int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node,
			       struct vmem_altmap *altmap)
{
	int ret;

	mutex_lock(&vmem_mutex);
	/* We don't care about the node, just use NUMA_NO_NODE on allocations */
	ret = add_pagetable(start, end, false);
	if (ret)
		remove_pagetable(start, end, false);
	mutex_unlock(&vmem_mutex);
	return ret;
}

void vmemmap_free(unsigned long start, unsigned long end,
		  struct vmem_altmap *altmap)
{
	mutex_lock(&vmem_mutex);
	remove_pagetable(start, end, false);
	mutex_unlock(&vmem_mutex);
}

void vmem_remove_mapping(unsigned long start, unsigned long size)
{
	mutex_lock(&vmem_mutex);
	vmem_remove_range(start, size);
	mutex_unlock(&vmem_mutex);
}

struct range arch_get_mappable_range(void)
{
	struct range mhp_range;

	mhp_range.start = 0;
	mhp_range.end = max_mappable - 1;
	return mhp_range;
}

int vmem_add_mapping(unsigned long start, unsigned long size)
{
	struct range range = arch_get_mappable_range();
	int ret;

	if (start < range.start ||
	    start + size > range.end + 1 ||
	    start + size < start)
		return -ERANGE;

	mutex_lock(&vmem_mutex);
	ret = vmem_add_range(start, size);
	if (ret)
		vmem_remove_range(start, size);
	mutex_unlock(&vmem_mutex);
	return ret;
}
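
/*
 * Illustrative sketch only, not part of this file: a hypothetical caller
 * that makes the physical range [start, start + size) usable through the
 * 1:1 mapping and gives it up again when its own setup fails. Everything
 * except vmem_add_mapping() and vmem_remove_mapping() is made up for the
 * example.
 *
 *	static int example_attach_range(unsigned long start, unsigned long size)
 *	{
 *		int rc;
 *
 *		rc = vmem_add_mapping(start, size);
 *		if (rc)
 *			return rc;	// -ERANGE, or a page-table setup error
 *		rc = example_use_range(start, size);
 *		if (rc)
 *			vmem_remove_mapping(start, size);
 *		return rc;
 *	}
 */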

/*
 * Allocate new or return existing page-table entry, but do not map it
 * to any physical address. If missing, allocate segment- and region-
 * table entries along the way. Meeting a large segment- or region-table
 * entry while traversing is an error, since the function is expected to
 * be called against virtual regions reserved for 4KB mappings only.
 */
pte_t *vmem_get_alloc_pte(unsigned long addr, bool alloc)
{
	pte_t *ptep = NULL;
	pgd_t *pgd;
	p4d_t *p4d;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	pgd = pgd_offset_k(addr);
	if (pgd_none(*pgd)) {
		if (!alloc)
			goto out;
		p4d = vmem_crst_alloc(_REGION2_ENTRY_EMPTY);
		if (!p4d)
			goto out;
		pgd_populate(&init_mm, pgd, p4d);
	}
	p4d = p4d_offset(pgd, addr);
	if (p4d_none(*p4d)) {
		if (!alloc)
			goto out;
		pud = vmem_crst_alloc(_REGION3_ENTRY_EMPTY);
		if (!pud)
			goto out;
		p4d_populate(&init_mm, p4d, pud);
	}
	pud = pud_offset(p4d, addr);
	if (pud_none(*pud)) {
		if (!alloc)
			goto out;
		pmd = vmem_crst_alloc(_SEGMENT_ENTRY_EMPTY);
		if (!pmd)
			goto out;
		pud_populate(&init_mm, pud, pmd);
	} else if (WARN_ON_ONCE(pud_large(*pud))) {
		goto out;
	}
	pmd = pmd_offset(pud, addr);
	if (pmd_none(*pmd)) {
		if (!alloc)
			goto out;
		pte = vmem_pte_alloc();
		if (!pte)
			goto out;
		pmd_populate(&init_mm, pmd, pte);
	} else if (WARN_ON_ONCE(pmd_large(*pmd))) {
		goto out;
	}
	ptep = pte_offset_kernel(pmd, addr);
out:
	return ptep;
}

int __vmem_map_4k_page(unsigned long addr, unsigned long phys, pgprot_t prot, bool alloc)
{
	pte_t *ptep, pte;

	if (!IS_ALIGNED(addr, PAGE_SIZE))
		return -EINVAL;
	ptep = vmem_get_alloc_pte(addr, alloc);
	if (!ptep)
		return -ENOMEM;
	__ptep_ipte(addr, ptep, 0, 0, IPTE_GLOBAL);
	pte = mk_pte_phys(phys, prot);
	set_pte(ptep, pte);
	return 0;
}

int vmem_map_4k_page(unsigned long addr, unsigned long phys, pgprot_t prot)
{
	int rc;

	mutex_lock(&vmem_mutex);
	rc = __vmem_map_4k_page(addr, phys, prot, true);
	mutex_unlock(&vmem_mutex);
	return rc;
}

void vmem_unmap_4k_page(unsigned long addr)
{
	pte_t *ptep;

	mutex_lock(&vmem_mutex);
	ptep = virt_to_kpte(addr);
	__ptep_ipte(addr, ptep, 0, 0, IPTE_GLOBAL);
	pte_clear(&init_mm, addr, ptep);
	mutex_unlock(&vmem_mutex);
}

void __init vmem_map_init(void)
{
	__set_memory_rox(_stext, _etext);
	__set_memory_ro(_etext, __end_rodata);
	__set_memory_rox(_sinittext, _einittext);
	__set_memory_rox(__stext_amode31, __etext_amode31);
	/*
	 * If the BEAR-enhancement facility is not installed the first
	 * prefix page is used to return to the previous context with
	 * an LPSWE instruction and therefore must be executable.
	 */
	if (!static_key_enabled(&cpu_has_bear))
		set_memory_x(0, 1);
	if (debug_pagealloc_enabled()) {
		/*
		 * Use RELOC_HIDE() as long as __va(0) translates to NULL,
		 * since performing pointer arithmetic on a NULL pointer
		 * has undefined behavior and generates compiler warnings.
		 */
		__set_memory_4k(__va(0), RELOC_HIDE(__va(0), ident_map_size));
	}
	if (MACHINE_HAS_NX)
		ctl_set_bit(0, 20);
	pr_info("Write protected kernel read-only data: %luk\n",
		(unsigned long)(__end_rodata - _stext) >> 10);
}
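
/*
 * Illustrative sketch only, not part of this file: using the 4K mapping
 * helpers above to map one physical page at a kernel virtual address and
 * to remove the mapping again. The caller, address and protection are made
 * up for the example; vmem_map_4k_page() takes vmem_mutex internally.
 *
 *	static int example_map_one_page(unsigned long vaddr, unsigned long paddr)
 *	{
 *		int rc;
 *
 *		rc = vmem_map_4k_page(vaddr, paddr, PAGE_KERNEL);
 *		if (rc)
 *			return rc;
 *		// ... access the page through vaddr ...
 *		vmem_unmap_4k_page(vaddr);
 *		return 0;
 *	}
 */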