// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright IBM Corp. 2006
 */

#include <linux/memory_hotplug.h>
#include <linux/cpufeature.h>
#include <linux/memblock.h>
#include <linux/pfn.h>
#include <linux/mm.h>
#include <linux/init.h>
#include <linux/list.h>
#include <linux/hugetlb.h>
#include <linux/slab.h>
#include <linux/sort.h>
#include <asm/page-states.h>
#include <asm/abs_lowcore.h>
#include <asm/cacheflush.h>
#include <asm/maccess.h>
#include <asm/nospec-branch.h>
#include <asm/ctlreg.h>
#include <asm/pgalloc.h>
#include <asm/setup.h>
#include <asm/tlbflush.h>
#include <asm/sections.h>
#include <asm/set_memory.h>
#include <asm/physmem_info.h>

static DEFINE_MUTEX(vmem_mutex);

static void __ref *vmem_alloc_pages(unsigned int order)
{
	unsigned long size = PAGE_SIZE << order;

	if (slab_is_available())
		return (void *)__get_free_pages(GFP_KERNEL, order);
	return memblock_alloc(size, size);
}

static void vmem_free_pages(unsigned long addr, int order, struct vmem_altmap *altmap)
{
	if (altmap) {
		vmem_altmap_free(altmap, 1 << order);
		return;
	}
	/* We don't expect boot memory to be removed ever. */
	if (!slab_is_available() ||
	    WARN_ON_ONCE(PageReserved(virt_to_page((void *)addr))))
		return;
	free_pages(addr, order);
}

void *vmem_crst_alloc(unsigned long val)
{
	unsigned long *table;

	table = vmem_alloc_pages(CRST_ALLOC_ORDER);
	if (!table)
		return NULL;
	crst_table_init(table, val);
	__arch_set_page_dat(table, 1UL << CRST_ALLOC_ORDER);
	return table;
}

pte_t __ref *vmem_pte_alloc(void)
{
	unsigned long size = PTRS_PER_PTE * sizeof(pte_t);
	pte_t *pte;

	if (slab_is_available())
		pte = (pte_t *) page_table_alloc(&init_mm);
	else
		pte = (pte_t *) memblock_alloc(size, size);
	if (!pte)
		return NULL;
	memset64((u64 *)pte, _PAGE_INVALID, PTRS_PER_PTE);
	__arch_set_page_dat(pte, 1);
	return pte;
}

static void vmem_pte_free(unsigned long *table)
{
	/* We don't expect boot memory to be removed ever. */
	if (!slab_is_available() ||
	    WARN_ON_ONCE(PageReserved(virt_to_page(table))))
		return;
	page_table_free(&init_mm, table);
}

#define PAGE_UNUSED 0xFD

/*
 * The unused vmemmap range, which was not yet memset(PAGE_UNUSED), ranges
 * from unused_sub_pmd_start to the next PMD_SIZE boundary.
 */
static unsigned long unused_sub_pmd_start;

static void vmemmap_flush_unused_sub_pmd(void)
{
	if (!unused_sub_pmd_start)
		return;
	memset((void *)unused_sub_pmd_start, PAGE_UNUSED,
	       ALIGN(unused_sub_pmd_start, PMD_SIZE) - unused_sub_pmd_start);
	unused_sub_pmd_start = 0;
}

static void vmemmap_mark_sub_pmd_used(unsigned long start, unsigned long end)
{
	/*
	 * As we expect to add in the same granularity as we remove, it's
	 * sufficient to mark only some piece used to block the memmap page from
	 * getting removed (just in case the memmap never gets initialized,
	 * e.g., because the memory block never gets onlined).
	 */
	memset((void *)start, 0, sizeof(struct page));
}

static void vmemmap_use_sub_pmd(unsigned long start, unsigned long end)
{
	/*
	 * We only optimize if the new used range directly follows the
	 * previously unused range (esp., when populating consecutive sections).
	 */
	if (unused_sub_pmd_start == start) {
		unused_sub_pmd_start = end;
		if (likely(IS_ALIGNED(unused_sub_pmd_start, PMD_SIZE)))
			unused_sub_pmd_start = 0;
		return;
	}
	vmemmap_flush_unused_sub_pmd();
	vmemmap_mark_sub_pmd_used(start, end);
}

static void vmemmap_use_new_sub_pmd(unsigned long start, unsigned long end)
{
	unsigned long page = ALIGN_DOWN(start, PMD_SIZE);

	vmemmap_flush_unused_sub_pmd();

	/* Could be our memmap page is filled with PAGE_UNUSED already ... */
	vmemmap_mark_sub_pmd_used(start, end);

	/* Mark the unused parts of the new memmap page PAGE_UNUSED. */
	if (!IS_ALIGNED(start, PMD_SIZE))
		memset((void *)page, PAGE_UNUSED, start - page);
	/*
	 * We want to avoid memset(PAGE_UNUSED) when populating the vmemmap of
	 * consecutive sections. Remember for the last added PMD the last
	 * unused range in the populated PMD.
	 */
	if (!IS_ALIGNED(end, PMD_SIZE))
		unused_sub_pmd_start = end;
}

/* Returns true if the PMD is completely unused and can be freed. */
static bool vmemmap_unuse_sub_pmd(unsigned long start, unsigned long end)
{
	unsigned long page = ALIGN_DOWN(start, PMD_SIZE);

	vmemmap_flush_unused_sub_pmd();
	memset((void *)start, PAGE_UNUSED, end - start);
	return !memchr_inv((void *)page, PAGE_UNUSED, PMD_SIZE);
}
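
/*
 * Illustrative sketch (not called anywhere, addresses are hypothetical) of
 * how the sub-PMD helpers above cooperate when two vmemmap ranges share one
 * 1M PMD:
 *
 *	vmemmap_use_new_sub_pmd(start, mid);	// fresh PMD: head before start
 *						// gets PAGE_UNUSED, tail after
 *						// mid stays pending in
 *						// unused_sub_pmd_start
 *	vmemmap_use_sub_pmd(mid, end);		// consecutive range: no memset,
 *						// just advance/clear the marker
 *	freeable = vmemmap_unuse_sub_pmd(start, mid);
 *						// true once the whole PMD reads
 *						// as PAGE_UNUSED and the huge
 *						// page may be freed
 */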

/* __ref: we'll only call vmemmap_alloc_block() via vmemmap_populate() */
static int __ref modify_pte_table(pmd_t *pmd, unsigned long addr,
				  unsigned long end, bool add, bool direct,
				  struct vmem_altmap *altmap)
{
	unsigned long prot, pages = 0;
	int ret = -ENOMEM;
	pte_t *pte;

	prot = pgprot_val(PAGE_KERNEL);
	pte = pte_offset_kernel(pmd, addr);
	for (; addr < end; addr += PAGE_SIZE, pte++) {
		if (!add) {
			if (pte_none(*pte))
				continue;
			if (!direct)
				vmem_free_pages((unsigned long)pfn_to_virt(pte_pfn(*pte)), get_order(PAGE_SIZE), altmap);
			pte_clear(&init_mm, addr, pte);
		} else if (pte_none(*pte)) {
			if (!direct) {
				void *new_page = vmemmap_alloc_block_buf(PAGE_SIZE, NUMA_NO_NODE, altmap);

				if (!new_page)
					goto out;
				set_pte(pte, __pte(__pa(new_page) | prot));
			} else {
				set_pte(pte, __pte(__pa(addr) | prot));
			}
		} else {
			continue;
		}
		pages++;
	}
	ret = 0;
out:
	if (direct)
		update_page_count(PG_DIRECT_MAP_4K, add ? pages : -pages);
	return ret;
}

static void try_free_pte_table(pmd_t *pmd, unsigned long start)
{
	pte_t *pte;
	int i;

	/* We can safely assume this is fully in 1:1 mapping & vmemmap area */
	pte = pte_offset_kernel(pmd, start);
	for (i = 0; i < PTRS_PER_PTE; i++, pte++) {
		if (!pte_none(*pte))
			return;
	}
	vmem_pte_free((unsigned long *) pmd_deref(*pmd));
	pmd_clear(pmd);
}
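
/*
 * Note on the modify_*_table() walkers: one walker handles both directions.
 * With add=true missing entries are populated, with add=false present
 * entries are cleared again. direct=true operates on the 1:1 mapping, where
 * a virtual address simply maps to __pa(addr) and the mapped amount is
 * accounted via update_page_count(); direct=false operates on the vmemmap,
 * where backing pages come from the altmap or vmemmap_alloc_block_buf().
 */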

/* __ref: we'll only call vmemmap_alloc_block() via vmemmap_populate() */
static int __ref modify_pmd_table(pud_t *pud, unsigned long addr,
				  unsigned long end, bool add, bool direct,
				  struct vmem_altmap *altmap)
{
	unsigned long next, prot, pages = 0;
	int ret = -ENOMEM;
	pmd_t *pmd;
	pte_t *pte;

	prot = pgprot_val(SEGMENT_KERNEL);
	pmd = pmd_offset(pud, addr);
	for (; addr < end; addr = next, pmd++) {
		next = pmd_addr_end(addr, end);
		if (!add) {
			if (pmd_none(*pmd))
				continue;
			if (pmd_leaf(*pmd)) {
				if (IS_ALIGNED(addr, PMD_SIZE) &&
				    IS_ALIGNED(next, PMD_SIZE)) {
					if (!direct)
						vmem_free_pages(pmd_deref(*pmd), get_order(PMD_SIZE), altmap);
					pmd_clear(pmd);
					pages++;
				} else if (!direct && vmemmap_unuse_sub_pmd(addr, next)) {
					vmem_free_pages(pmd_deref(*pmd), get_order(PMD_SIZE), altmap);
					pmd_clear(pmd);
				}
				continue;
			}
		} else if (pmd_none(*pmd)) {
			if (IS_ALIGNED(addr, PMD_SIZE) &&
			    IS_ALIGNED(next, PMD_SIZE) &&
			    cpu_has_edat1() && direct &&
			    !debug_pagealloc_enabled()) {
				set_pmd(pmd, __pmd(__pa(addr) | prot));
				pages++;
				continue;
			} else if (!direct && cpu_has_edat1()) {
				void *new_page;

				/*
				 * Use 1MB frames for vmemmap if available. We
				 * always use large frames even if they are only
				 * partially used. Otherwise we would also need
				 * page tables, since vmemmap_populate() gets
				 * called for each section separately.
				 */
				new_page = vmemmap_alloc_block_buf(PMD_SIZE, NUMA_NO_NODE, altmap);
				if (new_page) {
					set_pmd(pmd, __pmd(__pa(new_page) | prot));
					if (!IS_ALIGNED(addr, PMD_SIZE) ||
					    !IS_ALIGNED(next, PMD_SIZE)) {
						vmemmap_use_new_sub_pmd(addr, next);
					}
					continue;
				}
			}
			pte = vmem_pte_alloc();
			if (!pte)
				goto out;
			pmd_populate(&init_mm, pmd, pte);
		} else if (pmd_leaf(*pmd)) {
			if (!direct)
				vmemmap_use_sub_pmd(addr, next);
			continue;
		}
		ret = modify_pte_table(pmd, addr, next, add, direct, altmap);
		if (ret)
			goto out;
		if (!add)
			try_free_pte_table(pmd, addr & PMD_MASK);
	}
	ret = 0;
out:
	if (direct)
		update_page_count(PG_DIRECT_MAP_1M, add ? pages : -pages);
	return ret;
}

static void try_free_pmd_table(pud_t *pud, unsigned long start)
{
	pmd_t *pmd;
	int i;

	pmd = pmd_offset(pud, start);
	for (i = 0; i < PTRS_PER_PMD; i++, pmd++)
		if (!pmd_none(*pmd))
			return;
	vmem_free_pages(pud_deref(*pud), CRST_ALLOC_ORDER, NULL);
	pud_clear(pud);
}
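
/*
 * At the region-third (pud) level below, 2G leaf entries are only used for
 * the 1:1 mapping, and only if EDAT2 is available and page debugging is
 * off; the vmemmap never uses 2G frames. This mirrors the 1M handling at
 * the segment level above, which additionally allows large frames for the
 * vmemmap when EDAT1 is available.
 */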

static int modify_pud_table(p4d_t *p4d, unsigned long addr, unsigned long end,
			    bool add, bool direct, struct vmem_altmap *altmap)
{
	unsigned long next, prot, pages = 0;
	int ret = -ENOMEM;
	pud_t *pud;
	pmd_t *pmd;

	prot = pgprot_val(REGION3_KERNEL);
	pud = pud_offset(p4d, addr);
	for (; addr < end; addr = next, pud++) {
		next = pud_addr_end(addr, end);
		if (!add) {
			if (pud_none(*pud))
				continue;
			if (pud_leaf(*pud)) {
				if (IS_ALIGNED(addr, PUD_SIZE) &&
				    IS_ALIGNED(next, PUD_SIZE)) {
					pud_clear(pud);
					pages++;
				}
				continue;
			}
		} else if (pud_none(*pud)) {
			if (IS_ALIGNED(addr, PUD_SIZE) &&
			    IS_ALIGNED(next, PUD_SIZE) &&
			    cpu_has_edat2() && direct &&
			    !debug_pagealloc_enabled()) {
				set_pud(pud, __pud(__pa(addr) | prot));
				pages++;
				continue;
			}
			pmd = vmem_crst_alloc(_SEGMENT_ENTRY_EMPTY);
			if (!pmd)
				goto out;
			pud_populate(&init_mm, pud, pmd);
		} else if (pud_leaf(*pud)) {
			continue;
		}
		ret = modify_pmd_table(pud, addr, next, add, direct, altmap);
		if (ret)
			goto out;
		if (!add)
			try_free_pmd_table(pud, addr & PUD_MASK);
	}
	ret = 0;
out:
	if (direct)
		update_page_count(PG_DIRECT_MAP_2G, add ? pages : -pages);
	return ret;
}

static void try_free_pud_table(p4d_t *p4d, unsigned long start)
{
	pud_t *pud;
	int i;

	pud = pud_offset(p4d, start);
	for (i = 0; i < PTRS_PER_PUD; i++, pud++) {
		if (!pud_none(*pud))
			return;
	}
	vmem_free_pages(p4d_deref(*p4d), CRST_ALLOC_ORDER, NULL);
	p4d_clear(p4d);
}

static int modify_p4d_table(pgd_t *pgd, unsigned long addr, unsigned long end,
			    bool add, bool direct, struct vmem_altmap *altmap)
{
	unsigned long next;
	int ret = -ENOMEM;
	p4d_t *p4d;
	pud_t *pud;

	p4d = p4d_offset(pgd, addr);
	for (; addr < end; addr = next, p4d++) {
		next = p4d_addr_end(addr, end);
		if (!add) {
			if (p4d_none(*p4d))
				continue;
		} else if (p4d_none(*p4d)) {
			pud = vmem_crst_alloc(_REGION3_ENTRY_EMPTY);
			if (!pud)
				goto out;
			p4d_populate(&init_mm, p4d, pud);
		}
		ret = modify_pud_table(p4d, addr, next, add, direct, altmap);
		if (ret)
			goto out;
		if (!add)
			try_free_pud_table(p4d, addr & P4D_MASK);
	}
	ret = 0;
out:
	return ret;
}

static void try_free_p4d_table(pgd_t *pgd, unsigned long start)
{
	p4d_t *p4d;
	int i;

	p4d = p4d_offset(pgd, start);
	for (i = 0; i < PTRS_PER_P4D; i++, p4d++) {
		if (!p4d_none(*p4d))
			return;
	}
	vmem_free_pages(pgd_deref(*pgd), CRST_ALLOC_ORDER, NULL);
	pgd_clear(pgd);
}

static int modify_pagetable(unsigned long start, unsigned long end, bool add,
			    bool direct, struct vmem_altmap *altmap)
{
	unsigned long addr, next;
	int ret = -ENOMEM;
	pgd_t *pgd;
	p4d_t *p4d;

	if (WARN_ON_ONCE(!PAGE_ALIGNED(start | end)))
		return -EINVAL;
	/* Don't mess with any tables not fully in 1:1 mapping & vmemmap area */
	if (WARN_ON_ONCE(end > __abs_lowcore))
		return -EINVAL;
	for (addr = start; addr < end; addr = next) {
		next = pgd_addr_end(addr, end);
		pgd = pgd_offset_k(addr);

		if (!add) {
			if (pgd_none(*pgd))
				continue;
		} else if (pgd_none(*pgd)) {
			p4d = vmem_crst_alloc(_REGION2_ENTRY_EMPTY);
			if (!p4d)
				goto out;
			pgd_populate(&init_mm, pgd, p4d);
		}
		ret = modify_p4d_table(pgd, addr, next, add, direct, altmap);
		if (ret)
			goto out;
		if (!add)
			try_free_p4d_table(pgd, addr & PGDIR_MASK);
	}
	ret = 0;
out:
	if (!add)
		flush_tlb_kernel_range(start, end);
	return ret;
}

static int add_pagetable(unsigned long start, unsigned long end, bool direct,
			 struct vmem_altmap *altmap)
{
	return modify_pagetable(start, end, true, direct, altmap);
}

static int remove_pagetable(unsigned long start, unsigned long end, bool direct,
			    struct vmem_altmap *altmap)
{
	return modify_pagetable(start, end, false, direct, altmap);
}
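
/*
 * The walkers above have two consumers: the 1:1 (identity) mapping helpers
 * below, which translate a physical range to its __va() alias, and the
 * vmemmap populate/free paths further down. All modifications are
 * serialized by vmem_mutex, taken in the exported entry points.
 */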

/*
 * Add a physical memory range to the 1:1 mapping.
 */
static int vmem_add_range(unsigned long start, unsigned long size)
{
	start = (unsigned long)__va(start);
	return add_pagetable(start, start + size, true, NULL);
}

/*
 * Remove a physical memory range from the 1:1 mapping.
 */
static void vmem_remove_range(unsigned long start, unsigned long size)
{
	start = (unsigned long)__va(start);
	remove_pagetable(start, start + size, true, NULL);
}

/*
 * Add a backed mem_map array to the virtual mem_map array.
 */
int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node,
			       struct vmem_altmap *altmap)
{
	int ret;

	mutex_lock(&vmem_mutex);
	/* We don't care about the node, just use NUMA_NO_NODE on allocations */
	ret = add_pagetable(start, end, false, altmap);
	if (ret)
		remove_pagetable(start, end, false, altmap);
	mutex_unlock(&vmem_mutex);
	return ret;
}

#ifdef CONFIG_MEMORY_HOTPLUG

void vmemmap_free(unsigned long start, unsigned long end,
		  struct vmem_altmap *altmap)
{
	mutex_lock(&vmem_mutex);
	remove_pagetable(start, end, false, altmap);
	mutex_unlock(&vmem_mutex);
}

#endif

void vmem_remove_mapping(unsigned long start, unsigned long size)
{
	mutex_lock(&vmem_mutex);
	vmem_remove_range(start, size);
	mutex_unlock(&vmem_mutex);
}

struct range arch_get_mappable_range(void)
{
	struct range mhp_range;

	mhp_range.start = 0;
	mhp_range.end = max_mappable - 1;
	return mhp_range;
}

int vmem_add_mapping(unsigned long start, unsigned long size)
{
	struct range range = arch_get_mappable_range();
	int ret;

	if (start < range.start ||
	    start + size > range.end + 1 ||
	    start + size < start)
		return -ERANGE;

	mutex_lock(&vmem_mutex);
	ret = vmem_add_range(start, size);
	if (ret)
		vmem_remove_range(start, size);
	mutex_unlock(&vmem_mutex);
	return ret;
}

/*
 * Allocate new or return existing page-table entry, but do not map it
 * to any physical address. If missing, allocate segment- and region-
 * table entries along the way. Meeting a large segment- or region-table
 * entry while traversing is an error, since the function is expected to
 * be called against virtual regions reserved for 4KB mappings only.
 */
pte_t *vmem_get_alloc_pte(unsigned long addr, bool alloc)
{
	pte_t *ptep = NULL;
	pgd_t *pgd;
	p4d_t *p4d;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	pgd = pgd_offset_k(addr);
	if (pgd_none(*pgd)) {
		if (!alloc)
			goto out;
		p4d = vmem_crst_alloc(_REGION2_ENTRY_EMPTY);
		if (!p4d)
			goto out;
		pgd_populate(&init_mm, pgd, p4d);
	}
	p4d = p4d_offset(pgd, addr);
	if (p4d_none(*p4d)) {
		if (!alloc)
			goto out;
		pud = vmem_crst_alloc(_REGION3_ENTRY_EMPTY);
		if (!pud)
			goto out;
		p4d_populate(&init_mm, p4d, pud);
	}
	pud = pud_offset(p4d, addr);
	if (pud_none(*pud)) {
		if (!alloc)
			goto out;
		pmd = vmem_crst_alloc(_SEGMENT_ENTRY_EMPTY);
		if (!pmd)
			goto out;
		pud_populate(&init_mm, pud, pmd);
	} else if (WARN_ON_ONCE(pud_leaf(*pud))) {
		goto out;
	}
	pmd = pmd_offset(pud, addr);
	if (pmd_none(*pmd)) {
		if (!alloc)
			goto out;
		pte = vmem_pte_alloc();
		if (!pte)
			goto out;
		pmd_populate(&init_mm, pmd, pte);
	} else if (WARN_ON_ONCE(pmd_leaf(*pmd))) {
		goto out;
	}
	ptep = pte_offset_kernel(pmd, addr);
out:
	return ptep;
}

int __vmem_map_4k_page(unsigned long addr, unsigned long phys, pgprot_t prot, bool alloc)
{
	pte_t *ptep, pte;

	if (!IS_ALIGNED(addr, PAGE_SIZE))
		return -EINVAL;
	ptep = vmem_get_alloc_pte(addr, alloc);
	if (!ptep)
		return -ENOMEM;
	__ptep_ipte(addr, ptep, 0, 0, IPTE_GLOBAL);
	pte = mk_pte_phys(phys, prot);
	set_pte(ptep, pte);
	return 0;
}

int vmem_map_4k_page(unsigned long addr, unsigned long phys, pgprot_t prot)
{
	int rc;

	mutex_lock(&vmem_mutex);
	rc = __vmem_map_4k_page(addr, phys, prot, true);
	mutex_unlock(&vmem_mutex);
	return rc;
}

void vmem_unmap_4k_page(unsigned long addr)
{
	pte_t *ptep;

	mutex_lock(&vmem_mutex);
	ptep = virt_to_kpte(addr);
	__ptep_ipte(addr, ptep, 0, 0, IPTE_GLOBAL);
	pte_clear(&init_mm, addr, ptep);
	mutex_unlock(&vmem_mutex);
}
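
/*
 * Minimal usage sketch for the 4K helpers above (hypothetical caller;
 * some_virt_addr/some_phys_addr are placeholders): map a single page at a
 * page-aligned virtual address and tear it down again.
 *
 *	if (!vmem_map_4k_page(some_virt_addr, some_phys_addr, PAGE_KERNEL))
 *		vmem_unmap_4k_page(some_virt_addr);
 */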

void __init vmem_map_init(void)
{
	__set_memory_rox(_stext, _etext);
	__set_memory_ro(_etext, __end_rodata);
	__set_memory_rox(__stext_amode31, __etext_amode31);
	/*
	 * If the BEAR-enhancement facility is not installed the first
	 * prefix page is used to return to the previous context with
	 * an LPSWE instruction and therefore must be executable.
	 */
	if (!cpu_has_bear())
		set_memory_x(0, 1);
	if (debug_pagealloc_enabled())
		__set_memory_4k(__va(0), absolute_pointer(__va(0)) + ident_map_size);
	pr_info("Write protected kernel read-only data: %luk\n",
		(unsigned long)(__end_rodata - _stext) >> 10);
}