// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright IBM Corp. 2006
 */

#include <linux/memory_hotplug.h>
#include <linux/bootmem_info.h>
#include <linux/cpufeature.h>
#include <linux/memblock.h>
#include <linux/pfn.h>
#include <linux/mm.h>
#include <linux/init.h>
#include <linux/list.h>
#include <linux/hugetlb.h>
#include <linux/slab.h>
#include <linux/sort.h>
#include <asm/page-states.h>
#include <asm/abs_lowcore.h>
#include <asm/cacheflush.h>
#include <asm/maccess.h>
#include <asm/nospec-branch.h>
#include <asm/ctlreg.h>
#include <asm/pgalloc.h>
#include <asm/setup.h>
#include <asm/tlbflush.h>
#include <asm/sections.h>
#include <asm/set_memory.h>
#include <asm/physmem_info.h>

static DEFINE_MUTEX(vmem_mutex);

static void __ref *vmem_alloc_pages(unsigned int order)
{
	unsigned long size = PAGE_SIZE << order;

	if (slab_is_available())
		return (void *)__get_free_pages(GFP_KERNEL, order);
	return memblock_alloc(size, size);
}

static void vmem_free_pages(unsigned long addr, int order, struct vmem_altmap *altmap)
{
	unsigned int nr_pages = 1 << order;
	struct page *page;

	if (altmap) {
		vmem_altmap_free(altmap, 1 << order);
		return;
	}
	page = virt_to_page((void *)addr);
	if (PageReserved(page)) {
		/* allocated from memblock */
		while (nr_pages--)
			free_bootmem_page(page++);
	} else {
		free_pages(addr, order);
	}
}

void *vmem_crst_alloc(unsigned long val)
{
	unsigned long *table;

	table = vmem_alloc_pages(CRST_ALLOC_ORDER);
	if (!table)
		return NULL;
	crst_table_init(table, val);
	__arch_set_page_dat(table, 1UL << CRST_ALLOC_ORDER);
	return table;
}

pte_t __ref *vmem_pte_alloc(void)
{
	pte_t *pte;

	if (slab_is_available())
		pte = (pte_t *)page_table_alloc(&init_mm);
	else
		pte = (pte_t *)memblock_alloc(PAGE_SIZE, PAGE_SIZE);
	if (!pte)
		return NULL;
	memset64((u64 *)pte, _PAGE_INVALID, PTRS_PER_PTE);
	__arch_set_page_dat(pte, 1);
	return pte;
}

static void vmem_pte_free(unsigned long *table)
{
	page_table_free(&init_mm, table);
}

#define PAGE_UNUSED 0xFD

/*
 * The unused vmemmap range, which was not yet memset(PAGE_UNUSED), ranges
 * from unused_sub_pmd_start to the next PMD_SIZE boundary.
 */
static unsigned long unused_sub_pmd_start;

static void vmemmap_flush_unused_sub_pmd(void)
{
	if (!unused_sub_pmd_start)
		return;
	memset((void *)unused_sub_pmd_start, PAGE_UNUSED,
	       ALIGN(unused_sub_pmd_start, PMD_SIZE) - unused_sub_pmd_start);
	unused_sub_pmd_start = 0;
}

static void vmemmap_mark_sub_pmd_used(unsigned long start, unsigned long end)
{
	/*
	 * As we expect to add in the same granularity as we remove, it's
	 * sufficient to mark only some piece used to block the memmap page from
	 * getting removed (just in case the memmap never gets initialized,
	 * e.g., because the memory block never gets onlined).
	 */
	memset((void *)start, 0, sizeof(struct page));
}

static void vmemmap_use_sub_pmd(unsigned long start, unsigned long end)
{
	/*
	 * We only optimize if the new used range directly follows the
	 * previously unused range (esp., when populating consecutive sections).
	 */
	if (unused_sub_pmd_start == start) {
		unused_sub_pmd_start = end;
		if (likely(IS_ALIGNED(unused_sub_pmd_start, PMD_SIZE)))
			unused_sub_pmd_start = 0;
		return;
	}
	vmemmap_flush_unused_sub_pmd();
	vmemmap_mark_sub_pmd_used(start, end);
}

static void vmemmap_use_new_sub_pmd(unsigned long start, unsigned long end)
{
	unsigned long page = ALIGN_DOWN(start, PMD_SIZE);

	vmemmap_flush_unused_sub_pmd();

	/* Could be our memmap page is filled with PAGE_UNUSED already ... */
	vmemmap_mark_sub_pmd_used(start, end);

	/* Mark the unused parts of the new memmap page PAGE_UNUSED. */
	if (!IS_ALIGNED(start, PMD_SIZE))
		memset((void *)page, PAGE_UNUSED, start - page);
	/*
	 * We want to avoid memset(PAGE_UNUSED) when populating the vmemmap of
	 * consecutive sections. Remember for the last added PMD the last
	 * unused range in the populated PMD.
	 */
	if (!IS_ALIGNED(end, PMD_SIZE))
		unused_sub_pmd_start = end;
}

/* Returns true if the PMD is completely unused and can be freed. */
static bool vmemmap_unuse_sub_pmd(unsigned long start, unsigned long end)
{
	unsigned long page = ALIGN_DOWN(start, PMD_SIZE);

	vmemmap_flush_unused_sub_pmd();
	memset((void *)start, PAGE_UNUSED, end - start);
	return !memchr_inv((void *)page, PAGE_UNUSED, PMD_SIZE);
}
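
/*
 * Worked example of the PAGE_UNUSED bookkeeping above (the sizes are only
 * illustrative; the exact layout depends on the section size and on
 * sizeof(struct page)):
 *
 *  - populate section A: its memmap ends in the middle of a 1 MB (PMD_SIZE)
 *    frame; the unused tail is not memset yet, only remembered in
 *    unused_sub_pmd_start.
 *  - populate the consecutive section B: its memmap starts exactly at
 *    unused_sub_pmd_start, so the pending memset(PAGE_UNUSED) is skipped.
 *  - remove section A: its part of the frame is filled with PAGE_UNUSED;
 *    the frame stays allocated because B still uses the rest of it.
 *  - remove section B: memchr_inv() now finds only PAGE_UNUSED bytes, so
 *    vmemmap_unuse_sub_pmd() reports that the whole frame can be freed.
 */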

/* __ref: we'll only call vmemmap_alloc_block() via vmemmap_populate() */
static int __ref modify_pte_table(pmd_t *pmd, unsigned long addr,
				  unsigned long end, bool add, bool direct,
				  struct vmem_altmap *altmap)
{
	unsigned long prot, pages = 0;
	int ret = -ENOMEM;
	pte_t *pte;

	prot = pgprot_val(PAGE_KERNEL);
	pte = pte_offset_kernel(pmd, addr);
	for (; addr < end; addr += PAGE_SIZE, pte++) {
		if (!add) {
			if (pte_none(*pte))
				continue;
			if (!direct)
				vmem_free_pages((unsigned long)pfn_to_virt(pte_pfn(*pte)), get_order(PAGE_SIZE), altmap);
			pte_clear(&init_mm, addr, pte);
		} else if (pte_none(*pte)) {
			if (!direct) {
				void *new_page = vmemmap_alloc_block_buf(PAGE_SIZE, NUMA_NO_NODE, altmap);

				if (!new_page)
					goto out;
				set_pte(pte, __pte(__pa(new_page) | prot));
			} else {
				set_pte(pte, __pte(__pa(addr) | prot));
			}
		} else {
			continue;
		}
		pages++;
	}
	ret = 0;
out:
	if (direct)
		update_page_count(PG_DIRECT_MAP_4K, add ? pages : -pages);
	return ret;
}

static void try_free_pte_table(pmd_t *pmd, unsigned long start)
{
	pte_t *pte;
	int i;

	/* We can safely assume this is fully in 1:1 mapping & vmemmap area */
	pte = pte_offset_kernel(pmd, start);
	for (i = 0; i < PTRS_PER_PTE; i++, pte++) {
		if (!pte_none(*pte))
			return;
	}
	vmem_pte_free((unsigned long *) pmd_deref(*pmd));
	pmd_clear(pmd);
}

/* __ref: we'll only call vmemmap_alloc_block() via vmemmap_populate() */
static int __ref modify_pmd_table(pud_t *pud, unsigned long addr,
				  unsigned long end, bool add, bool direct,
				  struct vmem_altmap *altmap)
{
	unsigned long next, prot, pages = 0;
	int ret = -ENOMEM;
	pmd_t *pmd;
	pte_t *pte;

	prot = pgprot_val(SEGMENT_KERNEL);
	pmd = pmd_offset(pud, addr);
	for (; addr < end; addr = next, pmd++) {
		next = pmd_addr_end(addr, end);
		if (!add) {
			if (pmd_none(*pmd))
				continue;
			if (pmd_leaf(*pmd)) {
				if (IS_ALIGNED(addr, PMD_SIZE) &&
				    IS_ALIGNED(next, PMD_SIZE)) {
					if (!direct)
						vmem_free_pages(pmd_deref(*pmd), get_order(PMD_SIZE), altmap);
					pmd_clear(pmd);
					pages++;
				} else if (!direct && vmemmap_unuse_sub_pmd(addr, next)) {
					vmem_free_pages(pmd_deref(*pmd), get_order(PMD_SIZE), altmap);
					pmd_clear(pmd);
				}
				continue;
			}
		} else if (pmd_none(*pmd)) {
			if (IS_ALIGNED(addr, PMD_SIZE) &&
			    IS_ALIGNED(next, PMD_SIZE) &&
			    cpu_has_edat1() && direct &&
			    !debug_pagealloc_enabled()) {
				set_pmd(pmd, __pmd(__pa(addr) | prot));
				pages++;
				continue;
			} else if (!direct && cpu_has_edat1()) {
				void *new_page;

				/*
				 * Use 1MB frames for vmemmap if available. We
				 * always use large frames even if they are only
				 * partially used. Otherwise we would also end up
				 * with page tables, since vmemmap_populate gets
				 * called for each section separately.
				 */
				new_page = vmemmap_alloc_block_buf(PMD_SIZE, NUMA_NO_NODE, altmap);
				if (new_page) {
					set_pmd(pmd, __pmd(__pa(new_page) | prot));
					if (!IS_ALIGNED(addr, PMD_SIZE) ||
					    !IS_ALIGNED(next, PMD_SIZE)) {
						vmemmap_use_new_sub_pmd(addr, next);
					}
					continue;
				}
			}
			pte = vmem_pte_alloc();
			if (!pte)
				goto out;
			pmd_populate(&init_mm, pmd, pte);
		} else if (pmd_leaf(*pmd)) {
			if (!direct)
				vmemmap_use_sub_pmd(addr, next);
			continue;
		}
		ret = modify_pte_table(pmd, addr, next, add, direct, altmap);
		if (ret)
			goto out;
		if (!add)
			try_free_pte_table(pmd, addr & PMD_MASK);
	}
	ret = 0;
out:
	if (direct)
		update_page_count(PG_DIRECT_MAP_1M, add ? pages : -pages);
	return ret;
}
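
/*
 * The try_free_*_table() helpers (including try_free_pte_table() above) are
 * used on the removal path only: once every entry of a lower-level table has
 * become none, the table itself is freed again and the corresponding entry
 * in the next higher level is cleared.
 */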

static void try_free_pmd_table(pud_t *pud, unsigned long start)
{
	pmd_t *pmd;
	int i;

	pmd = pmd_offset(pud, start);
	for (i = 0; i < PTRS_PER_PMD; i++, pmd++)
		if (!pmd_none(*pmd))
			return;
	vmem_free_pages(pud_deref(*pud), CRST_ALLOC_ORDER, NULL);
	pud_clear(pud);
}

static int modify_pud_table(p4d_t *p4d, unsigned long addr, unsigned long end,
			    bool add, bool direct, struct vmem_altmap *altmap)
{
	unsigned long next, prot, pages = 0;
	int ret = -ENOMEM;
	pud_t *pud;
	pmd_t *pmd;

	prot = pgprot_val(REGION3_KERNEL);
	pud = pud_offset(p4d, addr);
	for (; addr < end; addr = next, pud++) {
		next = pud_addr_end(addr, end);
		if (!add) {
			if (pud_none(*pud))
				continue;
			if (pud_leaf(*pud)) {
				if (IS_ALIGNED(addr, PUD_SIZE) &&
				    IS_ALIGNED(next, PUD_SIZE)) {
					pud_clear(pud);
					pages++;
				}
				continue;
			}
		} else if (pud_none(*pud)) {
			if (IS_ALIGNED(addr, PUD_SIZE) &&
			    IS_ALIGNED(next, PUD_SIZE) &&
			    cpu_has_edat2() && direct &&
			    !debug_pagealloc_enabled()) {
				set_pud(pud, __pud(__pa(addr) | prot));
				pages++;
				continue;
			}
			pmd = vmem_crst_alloc(_SEGMENT_ENTRY_EMPTY);
			if (!pmd)
				goto out;
			pud_populate(&init_mm, pud, pmd);
		} else if (pud_leaf(*pud)) {
			continue;
		}
		ret = modify_pmd_table(pud, addr, next, add, direct, altmap);
		if (ret)
			goto out;
		if (!add)
			try_free_pmd_table(pud, addr & PUD_MASK);
	}
	ret = 0;
out:
	if (direct)
		update_page_count(PG_DIRECT_MAP_2G, add ? pages : -pages);
	return ret;
}

static void try_free_pud_table(p4d_t *p4d, unsigned long start)
{
	pud_t *pud;
	int i;

	pud = pud_offset(p4d, start);
	for (i = 0; i < PTRS_PER_PUD; i++, pud++) {
		if (!pud_none(*pud))
			return;
	}
	vmem_free_pages(p4d_deref(*p4d), CRST_ALLOC_ORDER, NULL);
	p4d_clear(p4d);
}

static int modify_p4d_table(pgd_t *pgd, unsigned long addr, unsigned long end,
			    bool add, bool direct, struct vmem_altmap *altmap)
{
	unsigned long next;
	int ret = -ENOMEM;
	p4d_t *p4d;
	pud_t *pud;

	p4d = p4d_offset(pgd, addr);
	for (; addr < end; addr = next, p4d++) {
		next = p4d_addr_end(addr, end);
		if (!add) {
			if (p4d_none(*p4d))
				continue;
		} else if (p4d_none(*p4d)) {
			pud = vmem_crst_alloc(_REGION3_ENTRY_EMPTY);
			if (!pud)
				goto out;
			p4d_populate(&init_mm, p4d, pud);
		}
		ret = modify_pud_table(p4d, addr, next, add, direct, altmap);
		if (ret)
			goto out;
		if (!add)
			try_free_pud_table(p4d, addr & P4D_MASK);
	}
	ret = 0;
out:
	return ret;
}

static void try_free_p4d_table(pgd_t *pgd, unsigned long start)
{
	p4d_t *p4d;
	int i;

	p4d = p4d_offset(pgd, start);
	for (i = 0; i < PTRS_PER_P4D; i++, p4d++) {
		if (!p4d_none(*p4d))
			return;
	}
	vmem_free_pages(pgd_deref(*pgd), CRST_ALLOC_ORDER, NULL);
	pgd_clear(pgd);
}
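
/*
 * modify_pagetable() is the common entry point for adding and removing
 * kernel mappings: direct=true operates on the 1:1 mapping of physical
 * memory and keeps the PG_DIRECT_MAP_* counters up to date, direct=false
 * operates on the virtual memmap, whose backing pages are allocated on
 * demand (optionally from the given altmap) and freed again on removal.
 * All callers in this file serialize these operations with vmem_mutex.
 */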
static int modify_pagetable(unsigned long start, unsigned long end, bool add,
			    bool direct, struct vmem_altmap *altmap)
{
	unsigned long addr, next;
	int ret = -ENOMEM;
	pgd_t *pgd;
	p4d_t *p4d;

	if (WARN_ON_ONCE(!PAGE_ALIGNED(start | end)))
		return -EINVAL;
	/* Don't mess with any tables not fully in 1:1 mapping & vmemmap area */
	if (WARN_ON_ONCE(end > __abs_lowcore))
		return -EINVAL;
	for (addr = start; addr < end; addr = next) {
		next = pgd_addr_end(addr, end);
		pgd = pgd_offset_k(addr);

		if (!add) {
			if (pgd_none(*pgd))
				continue;
		} else if (pgd_none(*pgd)) {
			p4d = vmem_crst_alloc(_REGION2_ENTRY_EMPTY);
			if (!p4d)
				goto out;
			pgd_populate(&init_mm, pgd, p4d);
		}
		ret = modify_p4d_table(pgd, addr, next, add, direct, altmap);
		if (ret)
			goto out;
		if (!add)
			try_free_p4d_table(pgd, addr & PGDIR_MASK);
	}
	ret = 0;
out:
	if (!add)
		flush_tlb_kernel_range(start, end);
	return ret;
}

static int add_pagetable(unsigned long start, unsigned long end, bool direct,
			 struct vmem_altmap *altmap)
{
	return modify_pagetable(start, end, true, direct, altmap);
}

static int remove_pagetable(unsigned long start, unsigned long end, bool direct,
			    struct vmem_altmap *altmap)
{
	return modify_pagetable(start, end, false, direct, altmap);
}

/*
 * Add a physical memory range to the 1:1 mapping.
 */
static int vmem_add_range(unsigned long start, unsigned long size)
{
	start = (unsigned long)__va(start);
	return add_pagetable(start, start + size, true, NULL);
}

/*
 * Remove a physical memory range from the 1:1 mapping.
 */
static void vmem_remove_range(unsigned long start, unsigned long size)
{
	start = (unsigned long)__va(start);
	remove_pagetable(start, start + size, true, NULL);
}

/*
 * Add a backed mem_map array to the virtual mem_map array.
 */
int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node,
			       struct vmem_altmap *altmap)
{
	int ret;

	mutex_lock(&vmem_mutex);
	/* We don't care about the node, just use NUMA_NO_NODE on allocations */
	ret = add_pagetable(start, end, false, altmap);
	if (ret)
		remove_pagetable(start, end, false, altmap);
	mutex_unlock(&vmem_mutex);
	return ret;
}

#ifdef CONFIG_MEMORY_HOTPLUG

void vmemmap_free(unsigned long start, unsigned long end,
		  struct vmem_altmap *altmap)
{
	mutex_lock(&vmem_mutex);
	remove_pagetable(start, end, false, altmap);
	mutex_unlock(&vmem_mutex);
}

#endif

void vmem_remove_mapping(unsigned long start, unsigned long size)
{
	mutex_lock(&vmem_mutex);
	vmem_remove_range(start, size);
	mutex_unlock(&vmem_mutex);
}

struct range arch_get_mappable_range(void)
{
	struct range mhp_range;

	mhp_range.start = 0;
	mhp_range.end = max_mappable - 1;
	return mhp_range;
}

int vmem_add_mapping(unsigned long start, unsigned long size)
{
	struct range range = arch_get_mappable_range();
	int ret;

	if (start < range.start ||
	    start + size > range.end + 1 ||
	    start + size < start)
		return -ERANGE;

	mutex_lock(&vmem_mutex);
	ret = vmem_add_range(start, size);
	if (ret)
		vmem_remove_range(start, size);
	mutex_unlock(&vmem_mutex);
	return ret;
}

/*
 * Allocate new or return existing page-table entry, but do not map it
 * to any physical address. If missing, allocate segment- and region-
 * table entries along the way. Meeting a large segment- or region-table
 * entry while traversing is an error, since the function is expected to
 * be called against virtual regions reserved for 4KB mappings only.
 */
pte_t *vmem_get_alloc_pte(unsigned long addr, bool alloc)
{
	pte_t *ptep = NULL;
	pgd_t *pgd;
	p4d_t *p4d;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	pgd = pgd_offset_k(addr);
	if (pgd_none(*pgd)) {
		if (!alloc)
			goto out;
		p4d = vmem_crst_alloc(_REGION2_ENTRY_EMPTY);
		if (!p4d)
			goto out;
		pgd_populate(&init_mm, pgd, p4d);
	}
	p4d = p4d_offset(pgd, addr);
	if (p4d_none(*p4d)) {
		if (!alloc)
			goto out;
		pud = vmem_crst_alloc(_REGION3_ENTRY_EMPTY);
		if (!pud)
			goto out;
		p4d_populate(&init_mm, p4d, pud);
	}
	pud = pud_offset(p4d, addr);
	if (pud_none(*pud)) {
		if (!alloc)
			goto out;
		pmd = vmem_crst_alloc(_SEGMENT_ENTRY_EMPTY);
		if (!pmd)
			goto out;
		pud_populate(&init_mm, pud, pmd);
	} else if (WARN_ON_ONCE(pud_leaf(*pud))) {
		goto out;
	}
	pmd = pmd_offset(pud, addr);
	if (pmd_none(*pmd)) {
		if (!alloc)
			goto out;
		pte = vmem_pte_alloc();
		if (!pte)
			goto out;
		pmd_populate(&init_mm, pmd, pte);
	} else if (WARN_ON_ONCE(pmd_leaf(*pmd))) {
		goto out;
	}
	ptep = pte_offset_kernel(pmd, addr);
out:
	return ptep;
}

int __vmem_map_4k_page(unsigned long addr, unsigned long phys, pgprot_t prot, bool alloc)
{
	pte_t *ptep, pte;

	if (!IS_ALIGNED(addr, PAGE_SIZE))
		return -EINVAL;
	ptep = vmem_get_alloc_pte(addr, alloc);
	if (!ptep)
		return -ENOMEM;
	__ptep_ipte(addr, ptep, 0, 0, IPTE_GLOBAL);
	pte = mk_pte_phys(phys, prot);
	set_pte(ptep, pte);
	return 0;
}

int vmem_map_4k_page(unsigned long addr, unsigned long phys, pgprot_t prot)
{
	int rc;

	mutex_lock(&vmem_mutex);
	rc = __vmem_map_4k_page(addr, phys, prot, true);
	mutex_unlock(&vmem_mutex);
	return rc;
}

void vmem_unmap_4k_page(unsigned long addr)
{
	pte_t *ptep;

	mutex_lock(&vmem_mutex);
	ptep = virt_to_kpte(addr);
	__ptep_ipte(addr, ptep, 0, 0, IPTE_GLOBAL);
	pte_clear(&init_mm, addr, ptep);
	mutex_unlock(&vmem_mutex);
}

void __init vmem_map_init(void)
{
	__set_memory_rox(_stext, _etext);
	__set_memory_ro(_etext, __end_rodata);
	__set_memory_rox(__stext_amode31, __etext_amode31);
	/*
	 * If the BEAR-enhancement facility is not installed, the first
	 * prefix page is used to return to the previous context with
	 * an LPSWE instruction and therefore must be executable.
	 */
	if (!cpu_has_bear())
		set_memory_x(0, 1);
	if (debug_pagealloc_enabled())
		__set_memory_4k(__va(0), absolute_pointer(__va(0)) + ident_map_size);
	pr_info("Write protected kernel read-only data: %luk\n",
		(unsigned long)(__end_rodata - _stext) >> 10);
}
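
/*
 * Usage sketch for the fixed 4K mapping helpers above (illustrative only;
 * "addr" and "phys" stand for caller-chosen, page-aligned values):
 *
 *	rc = vmem_map_4k_page(addr, phys, PAGE_KERNEL);
 *	if (rc)
 *		return rc;
 *	...access the mapping at addr...
 *	vmem_unmap_4k_page(addr);
 *
 * __vmem_map_4k_page() with alloc=false additionally serves callers that
 * must not allocate page tables; it only reuses tables that already exist
 * and otherwise fails with -ENOMEM.
 */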