// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Page table handling routines for radix page table.
 *
 * Copyright 2015-2016, Aneesh Kumar K.V, IBM Corporation.
 */

#define pr_fmt(fmt) "radix-mmu: " fmt

#include <linux/io.h>
#include <linux/kernel.h>
#include <linux/sched/mm.h>
#include <linux/memblock.h>
#include <linux/of.h>
#include <linux/of_fdt.h>
#include <linux/mm.h>
#include <linux/page_table_check.h>
#include <linux/hugetlb.h>
#include <linux/string_helpers.h>
#include <linux/memory.h>
#include <linux/kfence.h>

#include <asm/pgalloc.h>
#include <asm/mmu_context.h>
#include <asm/dma.h>
#include <asm/machdep.h>
#include <asm/mmu.h>
#include <asm/firmware.h>
#include <asm/powernv.h>
#include <asm/sections.h>
#include <asm/smp.h>
#include <asm/trace.h>
#include <asm/uaccess.h>
#include <asm/ultravisor.h>
#include <asm/set_memory.h>
#include <asm/kfence.h>

#include <trace/events/thp.h>

#include <mm/mmu_decl.h>

unsigned int mmu_base_pid;

static __ref void *early_alloc_pgtable(unsigned long size, int nid,
			unsigned long region_start, unsigned long region_end)
{
	phys_addr_t min_addr = MEMBLOCK_LOW_LIMIT;
	phys_addr_t max_addr = MEMBLOCK_ALLOC_ANYWHERE;
	void *ptr;

	if (region_start)
		min_addr = region_start;
	if (region_end)
		max_addr = region_end;

	ptr = memblock_alloc_try_nid(size, size, min_addr, max_addr, nid);

	if (!ptr)
		panic("%s: Failed to allocate %lu bytes align=0x%lx nid=%d from=%pa max_addr=%pa\n",
		      __func__, size, size, nid, &min_addr, &max_addr);

	return ptr;
}

/*
 * When allocating pud or pmd pointers, we allocate a complete page
 * of PAGE_SIZE rather than PUD_TABLE_SIZE or PMD_TABLE_SIZE. This
 * is to ensure that the page obtained from the memblock allocator
 * can be completely used as page table page and can be freed
 * correctly when the page table entries are removed.
 */
static int early_map_kernel_page(unsigned long ea, unsigned long pa,
			  pgprot_t flags,
			  unsigned int map_page_size,
			  int nid,
			  unsigned long region_start, unsigned long region_end)
{
	unsigned long pfn = pa >> PAGE_SHIFT;
	pgd_t *pgdp;
	p4d_t *p4dp;
	pud_t *pudp;
	pmd_t *pmdp;
	pte_t *ptep;

	pgdp = pgd_offset_k(ea);
	p4dp = p4d_offset(pgdp, ea);
	if (p4d_none(*p4dp)) {
		pudp = early_alloc_pgtable(PAGE_SIZE, nid,
					   region_start, region_end);
		p4d_populate(&init_mm, p4dp, pudp);
	}
	pudp = pud_offset(p4dp, ea);
	if (map_page_size == PUD_SIZE) {
		ptep = (pte_t *)pudp;
		goto set_the_pte;
	}
	if (pud_none(*pudp)) {
		pmdp = early_alloc_pgtable(PAGE_SIZE, nid, region_start,
					   region_end);
		pud_populate(&init_mm, pudp, pmdp);
	}
	pmdp = pmd_offset(pudp, ea);
	if (map_page_size == PMD_SIZE) {
		ptep = pmdp_ptep(pmdp);
		goto set_the_pte;
	}
	if (!pmd_present(*pmdp)) {
		ptep = early_alloc_pgtable(PAGE_SIZE, nid,
					   region_start, region_end);
		pmd_populate_kernel(&init_mm, pmdp, ptep);
	}
	ptep = pte_offset_kernel(pmdp, ea);

set_the_pte:
	set_pte_at(&init_mm, ea, ptep, pfn_pte(pfn, flags));
	asm volatile("ptesync": : :"memory");
	return 0;
}

/*
 * nid, region_start, and region_end are hints to try to place the page
 * table memory in the same node or region.
 */
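/*
 * Illustrative usage (sketch, not an existing call site): mapping a 2M
 * region through radix__map_kernel_page() below, e.g.
 *	radix__map_kernel_page(vaddr, paddr, PAGE_KERNEL, PMD_SIZE);
 * stops the walk at the PMD level and installs a leaf PTE there, so no
 * PTE-level table is ever allocated for that range.
 */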
static int __map_kernel_page(unsigned long ea, unsigned long pa,
			  pgprot_t flags,
			  unsigned int map_page_size,
			  int nid,
			  unsigned long region_start, unsigned long region_end)
{
	unsigned long pfn = pa >> PAGE_SHIFT;
	pgd_t *pgdp;
	p4d_t *p4dp;
	pud_t *pudp;
	pmd_t *pmdp;
	pte_t *ptep;
	/*
	 * Make sure task size is correct as per the max addr
	 */
	BUILD_BUG_ON(TASK_SIZE_USER64 > RADIX_PGTABLE_RANGE);

#ifdef CONFIG_PPC_64K_PAGES
	BUILD_BUG_ON(RADIX_KERN_MAP_SIZE != (1UL << MAX_EA_BITS_PER_CONTEXT));
#endif

	if (unlikely(!slab_is_available()))
		return early_map_kernel_page(ea, pa, flags, map_page_size,
					     nid, region_start, region_end);

	/*
	 * Should make page table allocation functions be able to take a
	 * node, so we can place kernel page tables on the right nodes after
	 * boot.
	 */
	pgdp = pgd_offset_k(ea);
	p4dp = p4d_offset(pgdp, ea);
	pudp = pud_alloc(&init_mm, p4dp, ea);
	if (!pudp)
		return -ENOMEM;
	if (map_page_size == PUD_SIZE) {
		ptep = (pte_t *)pudp;
		goto set_the_pte;
	}
	pmdp = pmd_alloc(&init_mm, pudp, ea);
	if (!pmdp)
		return -ENOMEM;
	if (map_page_size == PMD_SIZE) {
		ptep = pmdp_ptep(pmdp);
		goto set_the_pte;
	}
	ptep = pte_alloc_kernel(pmdp, ea);
	if (!ptep)
		return -ENOMEM;

set_the_pte:
	set_pte_at(&init_mm, ea, ptep, pfn_pte(pfn, flags));
	asm volatile("ptesync": : :"memory");
	return 0;
}

int radix__map_kernel_page(unsigned long ea, unsigned long pa,
			  pgprot_t flags,
			  unsigned int map_page_size)
{
	return __map_kernel_page(ea, pa, flags, map_page_size, -1, 0, 0);
}

#ifdef CONFIG_STRICT_KERNEL_RWX
static void radix__change_memory_range(unsigned long start, unsigned long end,
				       unsigned long clear)
{
	unsigned long idx;
	pgd_t *pgdp;
	p4d_t *p4dp;
	pud_t *pudp;
	pmd_t *pmdp;
	pte_t *ptep;

	start = ALIGN_DOWN(start, PAGE_SIZE);
	end = PAGE_ALIGN(end); // aligns up

	pr_debug("Changing flags on range %lx-%lx removing 0x%lx\n",
		 start, end, clear);

	for (idx = start; idx < end; idx += PAGE_SIZE) {
		pgdp = pgd_offset_k(idx);
		p4dp = p4d_offset(pgdp, idx);
		pudp = pud_alloc(&init_mm, p4dp, idx);
		if (!pudp)
			continue;
		if (pud_leaf(*pudp)) {
			ptep = (pte_t *)pudp;
			goto update_the_pte;
		}
		pmdp = pmd_alloc(&init_mm, pudp, idx);
		if (!pmdp)
			continue;
		if (pmd_leaf(*pmdp)) {
			ptep = pmdp_ptep(pmdp);
			goto update_the_pte;
		}
		ptep = pte_alloc_kernel(pmdp, idx);
		if (!ptep)
			continue;
update_the_pte:
		radix__pte_update(&init_mm, idx, ptep, clear, 0, 0);
	}

	radix__flush_tlb_kernel_range(start, end);
}

void radix__mark_rodata_ro(void)
{
	unsigned long start, end;

	start = (unsigned long)_stext;
	end = (unsigned long)__end_rodata;

	radix__change_memory_range(start, end, _PAGE_WRITE);

	for (start = PAGE_OFFSET; start < (unsigned long)_stext; start += PAGE_SIZE) {
		end = start + PAGE_SIZE;
		if (overlaps_interrupt_vector_text(start, end))
			radix__change_memory_range(start, end, _PAGE_WRITE);
		else
			break;
	}
}

void radix__mark_initmem_nx(void)
{
	unsigned long start = (unsigned long)__init_begin;
	unsigned long end = (unsigned long)__init_end;

	radix__change_memory_range(start, end, _PAGE_EXEC);
}
#endif /* CONFIG_STRICT_KERNEL_RWX */

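/*
 * Overview (descriptive note): print_mapping(), next_boundary() and
 * create_physical_mapping() below build the kernel linear mapping with the
 * largest page size that the alignment, the remaining gap and the
 * strict-RWX boundaries allow, trying 1G, then 2M, then the base page
 * size. For example, a 1G-aligned chunk with at least 1G left before the
 * next boundary is mapped with a single 1G leaf entry when MMU_PAGE_1G is
 * available.
 */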
static inline void __meminit
print_mapping(unsigned long start, unsigned long end, unsigned long size, bool exec)
{
	char buf[10];

	if (end <= start)
		return;

	string_get_size(size, 1, STRING_UNITS_2, buf, sizeof(buf));

	pr_info("Mapped 0x%016lx-0x%016lx with %s pages%s\n", start, end, buf,
		exec ? " (exec)" : "");
}

static unsigned long next_boundary(unsigned long addr, unsigned long end)
{
#ifdef CONFIG_STRICT_KERNEL_RWX
	unsigned long stext_phys;

	stext_phys = __pa_symbol(_stext);

	// Relocatable kernel running at non-zero real address
	if (stext_phys != 0) {
		// The end of interrupts code at zero is a rodata boundary
		unsigned long end_intr = __pa_symbol(__end_interrupts) - stext_phys;

		if (addr < end_intr)
			return end_intr;

		// Start of relocated kernel text is a rodata boundary
		if (addr < stext_phys)
			return stext_phys;
	}

	if (addr < __pa_symbol(__srwx_boundary))
		return __pa_symbol(__srwx_boundary);
#endif
	return end;
}

static int __meminit create_physical_mapping(unsigned long start,
					     unsigned long end,
					     int nid, pgprot_t _prot,
					     unsigned long mapping_sz_limit)
{
	unsigned long vaddr, addr, mapping_size = 0;
	bool prev_exec, exec = false;
	pgprot_t prot;
	int psize;
	unsigned long max_mapping_size = memory_block_size;

	if (mapping_sz_limit < max_mapping_size)
		max_mapping_size = mapping_sz_limit;

	if (debug_pagealloc_enabled())
		max_mapping_size = PAGE_SIZE;

	start = ALIGN(start, PAGE_SIZE);
	end = ALIGN_DOWN(end, PAGE_SIZE);
	for (addr = start; addr < end; addr += mapping_size) {
		unsigned long gap, previous_size;
		int rc;

		gap = next_boundary(addr, end) - addr;
		if (gap > max_mapping_size)
			gap = max_mapping_size;
		previous_size = mapping_size;
		prev_exec = exec;

		if (IS_ALIGNED(addr, PUD_SIZE) && gap >= PUD_SIZE &&
		    mmu_psize_defs[MMU_PAGE_1G].shift) {
			mapping_size = PUD_SIZE;
			psize = MMU_PAGE_1G;
		} else if (IS_ALIGNED(addr, PMD_SIZE) && gap >= PMD_SIZE &&
			   mmu_psize_defs[MMU_PAGE_2M].shift) {
			mapping_size = PMD_SIZE;
			psize = MMU_PAGE_2M;
		} else {
			mapping_size = PAGE_SIZE;
			psize = mmu_virtual_psize;
		}

		vaddr = (unsigned long)__va(addr);

		if (overlaps_kernel_text(vaddr, vaddr + mapping_size) ||
		    overlaps_interrupt_vector_text(vaddr, vaddr + mapping_size)) {
			prot = PAGE_KERNEL_X;
			exec = true;
		} else {
			prot = _prot;
			exec = false;
		}

		if (mapping_size != previous_size || exec != prev_exec) {
			print_mapping(start, addr, previous_size, prev_exec);
			start = addr;
		}

		rc = __map_kernel_page(vaddr, addr, prot, mapping_size, nid, start, end);
		if (rc)
			return rc;

		update_page_count(psize, 1);
	}

	print_mapping(start, addr, mapping_size, exec);
	return 0;
}

#ifdef CONFIG_KFENCE
static __init phys_addr_t alloc_kfence_pool(void)
{
	phys_addr_t kfence_pool;

	/*
	 * TODO: Support for enabling KFENCE after bootup depends on the
	 * ability to split page table mappings. As such support is not
	 * currently implemented for radix pagetables, KFENCE can only be
	 * enabled at system startup for now.
	 *
	 * After support for splitting mappings is available on radix,
	 * alloc_kfence_pool() & map_kfence_pool() can be dropped and
	 * the mapping for __kfence_pool memory can be
	 * split during arch_kfence_init_pool().
	 */
	if (!kfence_early_init)
		goto no_kfence;

	kfence_pool = memblock_phys_alloc(KFENCE_POOL_SIZE, PAGE_SIZE);
	if (!kfence_pool)
		goto no_kfence;

	memblock_mark_nomap(kfence_pool, KFENCE_POOL_SIZE);
	return kfence_pool;

no_kfence:
	disable_kfence();
	return 0;
}

static __init void map_kfence_pool(phys_addr_t kfence_pool)
{
	if (!kfence_pool)
		return;

	if (create_physical_mapping(kfence_pool, kfence_pool + KFENCE_POOL_SIZE,
				    -1, PAGE_KERNEL, PAGE_SIZE))
		goto err;

	memblock_clear_nomap(kfence_pool, KFENCE_POOL_SIZE);
	__kfence_pool = __va(kfence_pool);
	return;

err:
	memblock_phys_free(kfence_pool, KFENCE_POOL_SIZE);
	disable_kfence();
}
#else
static inline phys_addr_t alloc_kfence_pool(void) { return 0; }
static inline void map_kfence_pool(phys_addr_t kfence_pool) { }
#endif

static void __init radix_init_pgtable(void)
{
	phys_addr_t kfence_pool;
	unsigned long rts_field;
	phys_addr_t start, end;
	u64 i;

	/* We don't support slb for radix */
	slb_set_size(0);

	kfence_pool = alloc_kfence_pool();

	/*
	 * Create the linear mapping
	 */
	for_each_mem_range(i, &start, &end) {
		/*
		 * The memblock allocator is up at this point, so the
		 * page tables will be allocated within the range. No
		 * need for a node (which we don't have yet).
		 */

		if (end >= RADIX_VMALLOC_START) {
			pr_warn("Outside the supported range\n");
			continue;
		}

		WARN_ON(create_physical_mapping(start, end,
						-1, PAGE_KERNEL, ~0UL));
	}

	map_kfence_pool(kfence_pool);

	if (!cpu_has_feature(CPU_FTR_HVMODE) &&
	    cpu_has_feature(CPU_FTR_P9_RADIX_PREFETCH_BUG)) {
		/*
		 * Older versions of KVM on these machines prefer if the
		 * guest only uses the low 19 PID bits.
		 */
		mmu_pid_bits = 19;
	}
	mmu_base_pid = 1;

	/*
	 * Allocate Partition table and process table for the
	 * host.
	 */
	BUG_ON(PRTB_SIZE_SHIFT > 36);
	process_tb = early_alloc_pgtable(1UL << PRTB_SIZE_SHIFT, -1, 0, 0);
	/*
	 * Fill in the process table.
	 */
	rts_field = radix__get_tree_size();
	process_tb->prtb0 = cpu_to_be64(rts_field | __pa(init_mm.pgd) | RADIX_PGD_INDEX_SIZE);

	/*
	 * The init_mm context is given the first available (non-zero) PID,
	 * which is the "guard PID" and contains no page table. PIDR should
	 * never be set to zero because that duplicates the kernel address
	 * space at the 0x0... offset (quadrant 0)!
	 *
	 * An arbitrary PID that may later be allocated by the PID allocator
	 * for userspace processes must not be used either, because that
	 * would cause stale user mappings for that PID on CPUs outside of
	 * the TLB invalidation scheme (because it won't be in mm_cpumask).
	 *
	 * So permanently carve out one PID for the purpose of a guard PID.
	 */
	init_mm.context.id = mmu_base_pid;
	mmu_base_pid++;
}

static void __init radix_init_partition_table(void)
{
	unsigned long rts_field, dw0, dw1;

	mmu_partition_table_init();
	rts_field = radix__get_tree_size();
	dw0 = rts_field | __pa(init_mm.pgd) | RADIX_PGD_INDEX_SIZE | PATB_HR;
	dw1 = __pa(process_tb) | (PRTB_SIZE_SHIFT - 12) | PATB_GR;
	mmu_partition_table_set_entry(0, dw0, dw1, false);

	pr_info("Initializing Radix MMU\n");
}

static int __init get_idx_from_shift(unsigned int shift)
{
	int idx = -1;

	switch (shift) {
	case 0xc:
		idx = MMU_PAGE_4K;
		break;
	case 0x10:
		idx = MMU_PAGE_64K;
		break;
	case 0x15:
		idx = MMU_PAGE_2M;
		break;
	case 0x1e:
		idx = MMU_PAGE_1G;
		break;
	}
	return idx;
}

static int __init radix_dt_scan_page_sizes(unsigned long node,
					   const char *uname, int depth,
					   void *data)
{
	int size = 0;
	int shift, idx;
	unsigned int ap;
	const __be32 *prop;
	const char *type = of_get_flat_dt_prop(node, "device_type", NULL);

	/* We are scanning "cpu" nodes only */
	if (type == NULL || strcmp(type, "cpu") != 0)
		return 0;

	/* Grab page size encodings */
	prop = of_get_flat_dt_prop(node, "ibm,processor-radix-AP-encodings", &size);
	if (!prop)
		return 0;

	pr_info("Page sizes from device-tree:\n");
	for (; size >= 4; size -= 4, ++prop) {

		struct mmu_psize_def *def;

		/* top 3 bits are the AP encoding */
		shift = be32_to_cpu(prop[0]) & ~(0xe << 28);
		ap = be32_to_cpu(prop[0]) >> 29;
		pr_info("Page size shift = %d AP=0x%x\n", shift, ap);

		idx = get_idx_from_shift(shift);
		if (idx < 0)
			continue;

		def = &mmu_psize_defs[idx];
		def->shift = shift;
		def->ap = ap;
		def->h_rpt_pgsize = psize_to_rpti_pgsize(idx);
	}

	/* needed ? */
	cur_cpu_spec->mmu_features &= ~MMU_FTR_NO_SLBIE_B;
	return 1;
}

void __init radix__early_init_devtree(void)
{
	int rc;

	/*
	 * Try to find the available page sizes in the device-tree
	 */
	rc = of_scan_flat_dt(radix_dt_scan_page_sizes, NULL);
	if (!rc) {
		/*
		 * No page size details found in the device tree.
		 * Let's assume we have page 4k and 64k support
		 */
		mmu_psize_defs[MMU_PAGE_4K].shift = 12;
		mmu_psize_defs[MMU_PAGE_4K].ap = 0x0;
		mmu_psize_defs[MMU_PAGE_4K].h_rpt_pgsize =
			psize_to_rpti_pgsize(MMU_PAGE_4K);

		mmu_psize_defs[MMU_PAGE_64K].shift = 16;
		mmu_psize_defs[MMU_PAGE_64K].ap = 0x5;
		mmu_psize_defs[MMU_PAGE_64K].h_rpt_pgsize =
			psize_to_rpti_pgsize(MMU_PAGE_64K);
	}
}

void __init radix__early_init_mmu(void)
{
	unsigned long lpcr;

#ifdef CONFIG_PPC_64S_HASH_MMU
#ifdef CONFIG_PPC_64K_PAGES
	/* PAGE_SIZE mappings */
	mmu_virtual_psize = MMU_PAGE_64K;
#else
	mmu_virtual_psize = MMU_PAGE_4K;
#endif
#endif
	/*
	 * initialize page table size
	 */
	__pte_index_size = RADIX_PTE_INDEX_SIZE;
	__pmd_index_size = RADIX_PMD_INDEX_SIZE;
	__pud_index_size = RADIX_PUD_INDEX_SIZE;
	__pgd_index_size = RADIX_PGD_INDEX_SIZE;
	__pud_cache_index = RADIX_PUD_INDEX_SIZE;
	__pte_table_size = RADIX_PTE_TABLE_SIZE;
	__pmd_table_size = RADIX_PMD_TABLE_SIZE;
	__pud_table_size = RADIX_PUD_TABLE_SIZE;
	__pgd_table_size = RADIX_PGD_TABLE_SIZE;

	__pmd_val_bits = RADIX_PMD_VAL_BITS;
	__pud_val_bits = RADIX_PUD_VAL_BITS;
	__pgd_val_bits = RADIX_PGD_VAL_BITS;

	__kernel_virt_start = RADIX_KERN_VIRT_START;
	__vmalloc_start = RADIX_VMALLOC_START;
	__vmalloc_end = RADIX_VMALLOC_END;
	__kernel_io_start = RADIX_KERN_IO_START;
	__kernel_io_end = RADIX_KERN_IO_END;
	vmemmap = (struct page *)RADIX_VMEMMAP_START;
	ioremap_bot = IOREMAP_BASE;

#ifdef CONFIG_PCI
	pci_io_base = ISA_IO_BASE;
#endif
	__pte_frag_nr = RADIX_PTE_FRAG_NR;
	__pte_frag_size_shift = RADIX_PTE_FRAG_SIZE_SHIFT;
	__pmd_frag_nr = RADIX_PMD_FRAG_NR;
	__pmd_frag_size_shift = RADIX_PMD_FRAG_SIZE_SHIFT;

	radix_init_pgtable();

	if (!firmware_has_feature(FW_FEATURE_LPAR)) {
		lpcr = mfspr(SPRN_LPCR);
		mtspr(SPRN_LPCR, lpcr | LPCR_UPRT | LPCR_HR);
		radix_init_partition_table();
	} else {
		radix_init_pseries();
	}

	memblock_set_current_limit(MEMBLOCK_ALLOC_ANYWHERE);

	/* Switch to the guard PID before turning on the MMU */
	radix__switch_mmu_context(NULL, &init_mm);
	tlbiel_all();
}

void radix__early_init_mmu_secondary(void)
{
	unsigned long lpcr;
	/*
	 * update partition table control register and UPRT
	 */
	if (!firmware_has_feature(FW_FEATURE_LPAR)) {
		lpcr = mfspr(SPRN_LPCR);
		mtspr(SPRN_LPCR, lpcr | LPCR_UPRT | LPCR_HR);

		set_ptcr_when_no_uv(__pa(partition_tb) |
				    (PATB_SIZE_SHIFT - 12));
	}

	radix__switch_mmu_context(NULL, &init_mm);
	tlbiel_all();

	/* Make sure userspace can't change the AMR */
	mtspr(SPRN_UAMOR, 0);
}

/* Called during kexec sequence with MMU off */
notrace void radix__mmu_cleanup_all(void)
{
	unsigned long lpcr;

	if (!firmware_has_feature(FW_FEATURE_LPAR)) {
		lpcr = mfspr(SPRN_LPCR);
		mtspr(SPRN_LPCR, lpcr & ~LPCR_UPRT);
		set_ptcr_when_no_uv(0);
		powernv_set_nmmu_ptcr(0);
		radix__flush_tlb_all();
	}
}

#ifdef CONFIG_MEMORY_HOTPLUG
static void free_pte_table(pte_t *pte_start, pmd_t *pmd)
{
	pte_t *pte;
	int i;

	for (i = 0; i < PTRS_PER_PTE; i++) {
		pte = pte_start + i;
		if (!pte_none(*pte))
			return;
	}

	pte_free_kernel(&init_mm, pte_start);
	pmd_clear(pmd);
}

static void free_pmd_table(pmd_t *pmd_start, pud_t *pud)
{
	pmd_t *pmd;
	int i;

	for (i = 0; i < PTRS_PER_PMD; i++) {
		pmd = pmd_start + i;
		if (!pmd_none(*pmd))
			return;
	}

	pmd_free(&init_mm, pmd_start);
	pud_clear(pud);
}

static void free_pud_table(pud_t *pud_start, p4d_t *p4d)
{
	pud_t *pud;
	int i;

	for (i = 0; i < PTRS_PER_PUD; i++) {
		pud = pud_start + i;
		if (!pud_none(*pud))
			return;
	}

	pud_free(&init_mm, pud_start);
	p4d_clear(p4d);
}

#ifdef CONFIG_SPARSEMEM_VMEMMAP
static bool __meminit vmemmap_pmd_is_unused(unsigned long addr, unsigned long end)
{
	unsigned long start = ALIGN_DOWN(addr, PMD_SIZE);

	return !vmemmap_populated(start, PMD_SIZE);
}

static bool __meminit vmemmap_page_is_unused(unsigned long addr, unsigned long end)
{
	unsigned long start = ALIGN_DOWN(addr, PAGE_SIZE);

	return !vmemmap_populated(start, PAGE_SIZE);
}
#endif

static void __meminit free_vmemmap_pages(struct page *page,
					 struct vmem_altmap *altmap,
					 int order)
{
	unsigned int nr_pages = 1 << order;

	if (altmap) {
		unsigned long alt_start, alt_end;
		unsigned long base_pfn = page_to_pfn(page);

		/*
		 * With a 2M vmemmap mapping we can have things set up
		 * such that even though an altmap is specified we never
		 * used the altmap.
		 */
		alt_start = altmap->base_pfn;
		alt_end = altmap->base_pfn + altmap->reserve + altmap->free;

		if (base_pfn >= alt_start && base_pfn < alt_end) {
			vmem_altmap_free(altmap, nr_pages);
			return;
		}
	}

	if (PageReserved(page)) {
		/* allocated from memblock */
		while (nr_pages--)
			free_reserved_page(page++);
	} else
		__free_pages(page, order);
}

static void __meminit remove_pte_table(pte_t *pte_start, unsigned long addr,
				       unsigned long end, bool direct,
				       struct vmem_altmap *altmap)
{
	unsigned long next, pages = 0;
	pte_t *pte;

	pte = pte_start + pte_index(addr);
	for (; addr < end; addr = next, pte++) {
		next = (addr + PAGE_SIZE) & PAGE_MASK;
		if (next > end)
			next = end;

		if (!pte_present(*pte))
			continue;

		if (PAGE_ALIGNED(addr) && PAGE_ALIGNED(next)) {
			if (!direct)
				free_vmemmap_pages(pte_page(*pte), altmap, 0);
			pte_clear(&init_mm, addr, pte);
			pages++;
		}
#ifdef CONFIG_SPARSEMEM_VMEMMAP
		else if (!direct && vmemmap_page_is_unused(addr, next)) {
			free_vmemmap_pages(pte_page(*pte), altmap, 0);
			pte_clear(&init_mm, addr, pte);
		}
#endif
	}
	if (direct)
		update_page_count(mmu_virtual_psize, -pages);
}

static void __meminit remove_pmd_table(pmd_t *pmd_start, unsigned long addr,
				       unsigned long end, bool direct,
				       struct vmem_altmap *altmap)
{
	unsigned long next, pages = 0;
	pte_t *pte_base;
	pmd_t *pmd;

	pmd = pmd_start + pmd_index(addr);
	for (; addr < end; addr = next, pmd++) {
		next = pmd_addr_end(addr, end);

		if (!pmd_present(*pmd))
			continue;

		if (pmd_leaf(*pmd)) {
			if (IS_ALIGNED(addr, PMD_SIZE) &&
			    IS_ALIGNED(next, PMD_SIZE)) {
				if (!direct)
					free_vmemmap_pages(pmd_page(*pmd), altmap, get_order(PMD_SIZE));
				pte_clear(&init_mm, addr, (pte_t *)pmd);
				pages++;
			}
#ifdef CONFIG_SPARSEMEM_VMEMMAP
			else if (!direct && vmemmap_pmd_is_unused(addr, next)) {
				free_vmemmap_pages(pmd_page(*pmd), altmap, get_order(PMD_SIZE));
				pte_clear(&init_mm, addr, (pte_t *)pmd);
			}
#endif
			continue;
		}

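		/*
		 * Not a leaf entry: descend into the PTE table below this
		 * PMD, remove the entries in this range, then free the PTE
		 * table page itself if it is now completely empty.
		 */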
		pte_base = (pte_t *)pmd_page_vaddr(*pmd);
		remove_pte_table(pte_base, addr, next, direct, altmap);
		free_pte_table(pte_base, pmd);
	}
	if (direct)
		update_page_count(MMU_PAGE_2M, -pages);
}

static void __meminit remove_pud_table(pud_t *pud_start, unsigned long addr,
				       unsigned long end, bool direct,
				       struct vmem_altmap *altmap)
{
	unsigned long next, pages = 0;
	pmd_t *pmd_base;
	pud_t *pud;

	pud = pud_start + pud_index(addr);
	for (; addr < end; addr = next, pud++) {
		next = pud_addr_end(addr, end);

		if (!pud_present(*pud))
			continue;

		if (pud_leaf(*pud)) {
			if (!IS_ALIGNED(addr, PUD_SIZE) ||
			    !IS_ALIGNED(next, PUD_SIZE)) {
				WARN_ONCE(1, "%s: unaligned range\n", __func__);
				continue;
			}
			pte_clear(&init_mm, addr, (pte_t *)pud);
			pages++;
			continue;
		}

		pmd_base = pud_pgtable(*pud);
		remove_pmd_table(pmd_base, addr, next, direct, altmap);
		free_pmd_table(pmd_base, pud);
	}
	if (direct)
		update_page_count(MMU_PAGE_1G, -pages);
}

static void __meminit
remove_pagetable(unsigned long start, unsigned long end, bool direct,
		 struct vmem_altmap *altmap)
{
	unsigned long addr, next;
	pud_t *pud_base;
	pgd_t *pgd;
	p4d_t *p4d;

	spin_lock(&init_mm.page_table_lock);

	for (addr = start; addr < end; addr = next) {
		next = pgd_addr_end(addr, end);

		pgd = pgd_offset_k(addr);
		p4d = p4d_offset(pgd, addr);
		if (!p4d_present(*p4d))
			continue;

		if (p4d_leaf(*p4d)) {
			if (!IS_ALIGNED(addr, P4D_SIZE) ||
			    !IS_ALIGNED(next, P4D_SIZE)) {
				WARN_ONCE(1, "%s: unaligned range\n", __func__);
				continue;
			}

			pte_clear(&init_mm, addr, (pte_t *)pgd);
			continue;
		}

		pud_base = p4d_pgtable(*p4d);
		remove_pud_table(pud_base, addr, next, direct, altmap);
		free_pud_table(pud_base, p4d);
	}

	spin_unlock(&init_mm.page_table_lock);
	radix__flush_tlb_kernel_range(start, end);
}

int __meminit radix__create_section_mapping(unsigned long start,
					    unsigned long end, int nid,
					    pgprot_t prot)
{
	if (end >= RADIX_VMALLOC_START) {
		pr_warn("Outside the supported range\n");
		return -1;
	}

	return create_physical_mapping(__pa(start), __pa(end),
				       nid, prot, ~0UL);
}

int __meminit radix__remove_section_mapping(unsigned long start, unsigned long end)
{
	remove_pagetable(start, end, true, NULL);
	return 0;
}
#endif /* CONFIG_MEMORY_HOTPLUG */

#ifdef CONFIG_SPARSEMEM_VMEMMAP
static int __map_kernel_page_nid(unsigned long ea, unsigned long pa,
				 pgprot_t flags, unsigned int map_page_size,
				 int nid)
{
	return __map_kernel_page(ea, pa, flags, map_page_size, nid, 0, 0);
}

int __meminit radix__vmemmap_create_mapping(unsigned long start,
				      unsigned long page_size,
				      unsigned long phys)
{
	/* Create a PTE encoding */
	int nid = early_pfn_to_nid(phys >> PAGE_SHIFT);
	int ret;

	if ((start + page_size) >= RADIX_VMEMMAP_END) {
		pr_warn("Outside the supported range\n");
		return -1;
	}

	ret = __map_kernel_page_nid(start, phys, PAGE_KERNEL, page_size, nid);
	BUG_ON(ret);

	return 0;
}

#ifdef CONFIG_ARCH_WANT_OPTIMIZE_DAX_VMEMMAP
bool vmemmap_can_optimize(struct vmem_altmap *altmap, struct dev_pagemap *pgmap)
{
	if (radix_enabled())
		return __vmemmap_can_optimize(altmap, pgmap);

	return false;
}
#endif

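/*
 * Note (descriptive, based on how the generic vmemmap code is structured):
 * vmemmap_check_pmd() and vmemmap_set_pmd() below are the arch hooks the
 * common huge-vmemmap population path uses to test for and install 2M
 * leaf entries in the vmemmap area.
 */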
int __meminit vmemmap_check_pmd(pmd_t *pmdp, int node,
				unsigned long addr, unsigned long next)
{
	int large = pmd_leaf(*pmdp);

	if (large)
		vmemmap_verify(pmdp_ptep(pmdp), node, addr, next);

	return large;
}

void __meminit vmemmap_set_pmd(pmd_t *pmdp, void *p, int node,
			       unsigned long addr, unsigned long next)
{
	pte_t entry;
	pte_t *ptep = pmdp_ptep(pmdp);

	VM_BUG_ON(!IS_ALIGNED(addr, PMD_SIZE));
	entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL);
	set_pte_at(&init_mm, addr, ptep, entry);
	asm volatile("ptesync": : :"memory");

	vmemmap_verify(ptep, node, addr, next);
}

static pte_t * __meminit radix__vmemmap_pte_populate(pmd_t *pmdp, unsigned long addr,
						      int node,
						      struct vmem_altmap *altmap,
						      struct page *reuse)
{
	pte_t *pte = pte_offset_kernel(pmdp, addr);

	if (pte_none(*pte)) {
		pte_t entry;
		void *p;

		if (!reuse) {
			/*
			 * Make sure we don't create altmap mappings
			 * covering things outside the device.
			 */
			if (altmap && altmap_cross_boundary(altmap, addr, PAGE_SIZE))
				altmap = NULL;

			p = vmemmap_alloc_block_buf(PAGE_SIZE, node, altmap);
			if (!p && altmap)
				p = vmemmap_alloc_block_buf(PAGE_SIZE, node, NULL);
			if (!p)
				return NULL;
			pr_debug("PAGE_SIZE vmemmap mapping\n");
		} else {
			/*
			 * When a PTE/PMD entry is freed from the init_mm
			 * there's a free_pages() call to this page allocated
			 * above. Thus this get_page() is paired with the
			 * put_page_testzero() on the freeing path.
			 * This can only be called by certain ZONE_DEVICE paths,
			 * and through vmemmap_populate_compound_pages() when
			 * slab is available.
			 */
			get_page(reuse);
			p = page_to_virt(reuse);
			pr_debug("Tail page reuse vmemmap mapping\n");
		}

		VM_BUG_ON(!PAGE_ALIGNED(addr));
		entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL);
		set_pte_at(&init_mm, addr, pte, entry);
		asm volatile("ptesync": : :"memory");
	}
	return pte;
}

static inline pud_t *vmemmap_pud_alloc(p4d_t *p4dp, int node,
				       unsigned long address)
{
	pud_t *pud;

	/* To keep it simple, all early vmemmap mappings are done at PAGE_SIZE */
	if (unlikely(p4d_none(*p4dp))) {
		if (unlikely(!slab_is_available())) {
			pud = early_alloc_pgtable(PAGE_SIZE, node, 0, 0);
			p4d_populate(&init_mm, p4dp, pud);
			/* go to the pud_offset */
		} else
			return pud_alloc(&init_mm, p4dp, address);
	}
	return pud_offset(p4dp, address);
}

static inline pmd_t *vmemmap_pmd_alloc(pud_t *pudp, int node,
				       unsigned long address)
{
	pmd_t *pmd;

	/* To keep it simple, all early vmemmap mappings are done at PAGE_SIZE */
	if (unlikely(pud_none(*pudp))) {
		if (unlikely(!slab_is_available())) {
			pmd = early_alloc_pgtable(PAGE_SIZE, node, 0, 0);
			pud_populate(&init_mm, pudp, pmd);
		} else
			return pmd_alloc(&init_mm, pudp, address);
	}
	return pmd_offset(pudp, address);
}

static inline pte_t *vmemmap_pte_alloc(pmd_t *pmdp, int node,
				       unsigned long address)
{
	pte_t *pte;

	/* To keep it simple, all early vmemmap mappings are done at PAGE_SIZE */
	if (unlikely(pmd_none(*pmdp))) {
		if (unlikely(!slab_is_available())) {
			pte = early_alloc_pgtable(PAGE_SIZE, node, 0, 0);
			pmd_populate(&init_mm, pmdp, pte);
		} else
			return pte_alloc_kernel(pmdp, address);
	}
	return pte_offset_kernel(pmdp, address);
}

int __meminit radix__vmemmap_populate(unsigned long start, unsigned long end, int node,
				      struct vmem_altmap *altmap)
{
	unsigned long addr;
	unsigned long next;
	pgd_t *pgd;
	p4d_t *p4d;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	/*
	 * If an altmap is present, make sure we align the start vmemmap addr
	 * to PAGE_SIZE so that we calculate the correct start_pfn in the
	 * altmap boundary check to decide whether we should use the altmap or
	 * RAM based backing memory allocation. Also the address needs to be
	 * aligned for the set_pte operation. If the start addr is already
	 * PMD_SIZE aligned and within the altmap boundary then we will
	 * try to use a pmd size altmap mapping, else we go for a page size
	 * mapping.
	 *
	 * If an altmap is not present, align the vmemmap addr to PMD_SIZE and
	 * always allocate a PMD size page for vmemmap backing.
	 */

	if (altmap)
		start = ALIGN_DOWN(start, PAGE_SIZE);
	else
		start = ALIGN_DOWN(start, PMD_SIZE);

	for (addr = start; addr < end; addr = next) {
		next = pmd_addr_end(addr, end);

		pgd = pgd_offset_k(addr);
		p4d = p4d_offset(pgd, addr);
		pud = vmemmap_pud_alloc(p4d, node, addr);
		if (!pud)
			return -ENOMEM;
		pmd = vmemmap_pmd_alloc(pud, node, addr);
		if (!pmd)
			return -ENOMEM;

		if (pmd_none(READ_ONCE(*pmd))) {
			void *p;

			/*
			 * Keep it simple by checking addr PMD_SIZE alignment
			 * and verifying the device boundary condition.
			 * For us to use a pmd mapping, both addr and pfn should
			 * be aligned. We skip if addr is not aligned and for
			 * the pfn we hope we have extra area in the altmap that
			 * can help to find an aligned block. This can result
			 * in altmap block allocation failures, in which case
			 * we fall back to RAM for the vmemmap allocation.
			 */
			if (altmap && (!IS_ALIGNED(addr, PMD_SIZE) ||
				       altmap_cross_boundary(altmap, addr, PMD_SIZE))) {
				/*
				 * make sure we don't create altmap mappings
				 * covering things outside the device.
				 */
				goto base_mapping;
			}

			p = vmemmap_alloc_block_buf(PMD_SIZE, node, altmap);
			if (p) {
				vmemmap_set_pmd(pmd, p, node, addr, next);
				pr_debug("PMD_SIZE vmemmap mapping\n");
				continue;
			} else {
				/*
				 * A vmemmap block allocation can fail due to
				 * alignment requirements when we are trying to
				 * align things aggressively, thereby running
				 * out of space. Try a base mapping on failure.
				 */
				goto base_mapping;
			}
		} else if (vmemmap_check_pmd(pmd, node, addr, next)) {
			/*
			 * If a huge mapping exists due to an early call to
			 * vmemmap_populate, let's try to use that.
			 */
			continue;
		}
base_mapping:
		/*
		 * We were not able to allocate higher order memory to back
		 * the memmap, or we found a pointer to a pte page.
		 * Allocate a base page size vmemmap instead.
		 */
		pte = vmemmap_pte_alloc(pmd, node, addr);
		if (!pte)
			return -ENOMEM;

		pte = radix__vmemmap_pte_populate(pmd, addr, node, altmap, NULL);
		if (!pte)
			return -ENOMEM;

		vmemmap_verify(pte, node, addr, addr + PAGE_SIZE);
		next = addr + PAGE_SIZE;
	}
	return 0;
}

static pte_t * __meminit radix__vmemmap_populate_address(unsigned long addr, int node,
							  struct vmem_altmap *altmap,
							  struct page *reuse)
{
	pgd_t *pgd;
	p4d_t *p4d;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	pgd = pgd_offset_k(addr);
	p4d = p4d_offset(pgd, addr);
	pud = vmemmap_pud_alloc(p4d, node, addr);
	if (!pud)
		return NULL;
	pmd = vmemmap_pmd_alloc(pud, node, addr);
	if (!pmd)
		return NULL;
	if (pmd_leaf(*pmd))
		/*
		 * The second page is mapped as a hugepage due to a nearby request.
		 * Force our mapping to page size without deduplication
		 */
		return NULL;
	pte = vmemmap_pte_alloc(pmd, node, addr);
	if (!pte)
		return NULL;
	radix__vmemmap_pte_populate(pmd, addr, node, NULL, NULL);
	vmemmap_verify(pte, node, addr, addr + PAGE_SIZE);

	return pte;
}

static pte_t * __meminit vmemmap_compound_tail_page(unsigned long addr,
						    unsigned long pfn_offset, int node)
{
	pgd_t *pgd;
	p4d_t *p4d;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;
	unsigned long map_addr;

	/* the second vmemmap page which we use for duplication */
	map_addr = addr - pfn_offset * sizeof(struct page) + PAGE_SIZE;
	pgd = pgd_offset_k(map_addr);
	p4d = p4d_offset(pgd, map_addr);
	pud = vmemmap_pud_alloc(p4d, node, map_addr);
	if (!pud)
		return NULL;
	pmd = vmemmap_pmd_alloc(pud, node, map_addr);
	if (!pmd)
		return NULL;
	if (pmd_leaf(*pmd))
		/*
		 * The second page is mapped as a hugepage due to a nearby request.
		 * Force our mapping to page size without deduplication
		 */
		return NULL;
	pte = vmemmap_pte_alloc(pmd, node, map_addr);
	if (!pte)
		return NULL;
	/*
	 * Check if there exists a mapping to the left
	 */
	if (pte_none(*pte)) {
		/*
		 * Populate the head page vmemmap page.
		 * It can fall in a different pmd, hence
		 * vmemmap_populate_address()
		 */
		pte = radix__vmemmap_populate_address(map_addr - PAGE_SIZE, node, NULL, NULL);
		if (!pte)
			return NULL;
		/*
		 * Populate the tail pages vmemmap page
		 */
		pte = radix__vmemmap_pte_populate(pmd, map_addr, node, NULL, NULL);
		if (!pte)
			return NULL;
		vmemmap_verify(pte, node, map_addr, map_addr + PAGE_SIZE);
		return pte;
	}
	return pte;
}

int __meminit vmemmap_populate_compound_pages(unsigned long start_pfn,
					      unsigned long start,
					      unsigned long end, int node,
					      struct dev_pagemap *pgmap)
{
	/*
	 * We want to map things at base page size so that we can save
	 * space in the vmemmap. Otherwise we could end up with huge
	 * mappings covering both edges.
	 */
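	/*
	 * Scheme (descriptive note): for each compound page, the head vmemmap
	 * page and the first tail vmemmap page get their own backing pages;
	 * every later tail page re-uses the first tail's backing page via
	 * radix__vmemmap_pte_populate(..., reuse), so the remaining struct
	 * pages of the compound page share one physical page.
	 */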
	unsigned long addr;
	unsigned long addr_pfn = start_pfn;
	unsigned long next;
	pgd_t *pgd;
	p4d_t *p4d;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	for (addr = start; addr < end; addr = next) {

		pgd = pgd_offset_k(addr);
		p4d = p4d_offset(pgd, addr);
		pud = vmemmap_pud_alloc(p4d, node, addr);
		if (!pud)
			return -ENOMEM;
		pmd = vmemmap_pmd_alloc(pud, node, addr);
		if (!pmd)
			return -ENOMEM;

		if (pmd_leaf(READ_ONCE(*pmd))) {
			/* existing huge mapping. Skip the range */
			addr_pfn += (PMD_SIZE >> PAGE_SHIFT);
			next = pmd_addr_end(addr, end);
			continue;
		}
		pte = vmemmap_pte_alloc(pmd, node, addr);
		if (!pte)
			return -ENOMEM;
		if (!pte_none(*pte)) {
			/*
			 * This could be because we already have a compound
			 * page whose VMEMMAP_RESERVE_NR pages were mapped and
			 * this request falls within those pages.
			 */
			addr_pfn += 1;
			next = addr + PAGE_SIZE;
			continue;
		} else {
			unsigned long nr_pages = pgmap_vmemmap_nr(pgmap);
			unsigned long pfn_offset = addr_pfn - ALIGN_DOWN(addr_pfn, nr_pages);
			pte_t *tail_page_pte;

			/*
			 * If the address is aligned to the huge page size it
			 * is the head mapping.
			 */
			if (pfn_offset == 0) {
				/* Populate the head page vmemmap page */
				pte = radix__vmemmap_pte_populate(pmd, addr, node, NULL, NULL);
				if (!pte)
					return -ENOMEM;
				vmemmap_verify(pte, node, addr, addr + PAGE_SIZE);

				/*
				 * Populate the tail pages vmemmap page.
				 * It can fall in a different pmd, hence
				 * vmemmap_populate_address()
				 */
				pte = radix__vmemmap_populate_address(addr + PAGE_SIZE, node, NULL, NULL);
				if (!pte)
					return -ENOMEM;

				addr_pfn += 2;
				next = addr + 2 * PAGE_SIZE;
				continue;
			}
			/*
			 * Get the 2nd mapping details.
			 * Also create it if it doesn't exist.
			 */
			tail_page_pte = vmemmap_compound_tail_page(addr, pfn_offset, node);
			if (!tail_page_pte) {

				pte = radix__vmemmap_pte_populate(pmd, addr, node, NULL, NULL);
				if (!pte)
					return -ENOMEM;
				vmemmap_verify(pte, node, addr, addr + PAGE_SIZE);

				addr_pfn += 1;
				next = addr + PAGE_SIZE;
				continue;
			}

			pte = radix__vmemmap_pte_populate(pmd, addr, node, NULL, pte_page(*tail_page_pte));
			if (!pte)
				return -ENOMEM;
			vmemmap_verify(pte, node, addr, addr + PAGE_SIZE);

			addr_pfn += 1;
			next = addr + PAGE_SIZE;
			continue;
		}
	}
	return 0;
}

#ifdef CONFIG_MEMORY_HOTPLUG
void __meminit radix__vmemmap_remove_mapping(unsigned long start, unsigned long page_size)
{
	remove_pagetable(start, start + page_size, true, NULL);
}

void __ref radix__vmemmap_free(unsigned long start, unsigned long end,
			       struct vmem_altmap *altmap)
{
	remove_pagetable(start, end, false, altmap);
}
#endif
#endif

#ifdef CONFIG_TRANSPARENT_HUGEPAGE

unsigned long radix__pmd_hugepage_update(struct mm_struct *mm, unsigned long addr,
					 pmd_t *pmdp, unsigned long clr,
					 unsigned long set)
{
	unsigned long old;

#ifdef CONFIG_DEBUG_VM
	WARN_ON(!radix__pmd_trans_huge(*pmdp));
	assert_spin_locked(pmd_lockptr(mm, pmdp));
#endif

	old = radix__pte_update(mm, addr, pmdp_ptep(pmdp), clr, set, 1);
	trace_hugepage_update_pmd(addr, old, clr, set);

	return old;
}

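/*
 * PUD-level counterpart of radix__pmd_hugepage_update() above, used by the
 * huge-PUD (e.g. DAX) paths to clear and set PTE bits on a 1G leaf entry
 * under the PUD lock.
 */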
unsigned long radix__pud_hugepage_update(struct mm_struct *mm, unsigned long addr,
					 pud_t *pudp, unsigned long clr,
					 unsigned long set)
{
	unsigned long old;

#ifdef CONFIG_DEBUG_VM
	WARN_ON(!pud_trans_huge(*pudp));
	assert_spin_locked(pud_lockptr(mm, pudp));
#endif

	old = radix__pte_update(mm, addr, pudp_ptep(pudp), clr, set, 1);
	trace_hugepage_update_pud(addr, old, clr, set);

	return old;
}

pmd_t radix__pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address,
				 pmd_t *pmdp)
{
	pmd_t pmd;

	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
	VM_BUG_ON(radix__pmd_trans_huge(*pmdp));
	/*
	 * khugepaged calls this for a normal pmd
	 */
	pmd = *pmdp;
	pmd_clear(pmdp);

	page_table_check_pmd_clear(vma->vm_mm, address, pmd);

	radix__flush_tlb_collapsed_pmd(vma->vm_mm, address);

	return pmd;
}

/*
 * For us pgtable_t is pte_t *. In order to save the deposited
 * page table, we consider the allocated page table as a list
 * head. On withdraw we need to make sure we zero out the used
 * list_head memory area.
 */
void radix__pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
				       pgtable_t pgtable)
{
	struct list_head *lh = (struct list_head *) pgtable;

	assert_spin_locked(pmd_lockptr(mm, pmdp));

	/* FIFO */
	if (!pmd_huge_pte(mm, pmdp))
		INIT_LIST_HEAD(lh);
	else
		list_add(lh, (struct list_head *) pmd_huge_pte(mm, pmdp));
	pmd_huge_pte(mm, pmdp) = pgtable;
}

pgtable_t radix__pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
{
	pte_t *ptep;
	pgtable_t pgtable;
	struct list_head *lh;

	assert_spin_locked(pmd_lockptr(mm, pmdp));

	/* FIFO */
	pgtable = pmd_huge_pte(mm, pmdp);
	lh = (struct list_head *) pgtable;
	if (list_empty(lh))
		pmd_huge_pte(mm, pmdp) = NULL;
	else {
		pmd_huge_pte(mm, pmdp) = (pgtable_t) lh->next;
		list_del(lh);
	}
	ptep = (pte_t *) pgtable;
	*ptep = __pte(0);
	ptep++;
	*ptep = __pte(0);
	return pgtable;
}

pmd_t radix__pmdp_huge_get_and_clear(struct mm_struct *mm,
				     unsigned long addr, pmd_t *pmdp)
{
	pmd_t old_pmd;
	unsigned long old;

	old = radix__pmd_hugepage_update(mm, addr, pmdp, ~0UL, 0);
	old_pmd = __pmd(old);
	return old_pmd;
}

pud_t radix__pudp_huge_get_and_clear(struct mm_struct *mm,
				     unsigned long addr, pud_t *pudp)
{
	pud_t old_pud;
	unsigned long old;

	old = radix__pud_hugepage_update(mm, addr, pudp, ~0UL, 0);
	old_pud = __pud(old);
	return old_pud;
}

#endif /* CONFIG_TRANSPARENT_HUGEPAGE */

void radix__ptep_set_access_flags(struct vm_area_struct *vma, pte_t *ptep,
				  pte_t entry, unsigned long address, int psize)
{
	struct mm_struct *mm = vma->vm_mm;
	unsigned long set = pte_val(entry) & (_PAGE_DIRTY | _PAGE_SOFT_DIRTY |
					      _PAGE_ACCESSED | _PAGE_RW | _PAGE_EXEC);

	unsigned long change = pte_val(entry) ^ pte_val(*ptep);
	/*
	 * On POWER9, the NMMU is not able to relax PTE access permissions
	 * for a translation with a TLB. The PTE must be invalidated and the
	 * TLB flushed before the new PTE is installed.
	 *
	 * This only needs to be done for radix, because hash translation does
	 * flush when updating the linux pte (and we don't support NMMU
	 * accelerators on HPT on POWER9 anyway XXX: do we?).
	 *
	 * POWER10 (and P9P) NMMU does behave as per ISA.
	 */
	if (!cpu_has_feature(CPU_FTR_ARCH_31) && (change & _PAGE_RW) &&
	    atomic_read(&mm->context.copros) > 0) {
		unsigned long old_pte, new_pte;

		old_pte = __radix_pte_update(ptep, _PAGE_PRESENT, _PAGE_INVALID);
		new_pte = old_pte | set;
		radix__flush_tlb_page_psize(mm, address, psize);
		__radix_pte_update(ptep, _PAGE_INVALID, new_pte);
	} else {
		__radix_pte_update(ptep, 0, set);
		/*
		 * Book3S does not require a TLB flush when relaxing access
		 * restrictions when the address space (modulo the POWER9 nest
		 * MMU issue above) because the MMU will reload the PTE after
		 * taking an access fault, as defined by the architecture. See
		 * "Setting a Reference or Change Bit or Upgrading Access
		 * Authority (PTE Subject to Atomic Hardware Updates)" in
		 * Power ISA Version 3.1B.
		 */
	}
	/* See the ptesync comment in radix__set_pte_at */
}

void radix__ptep_modify_prot_commit(struct vm_area_struct *vma,
				    unsigned long addr, pte_t *ptep,
				    pte_t old_pte, pte_t pte)
{
	struct mm_struct *mm = vma->vm_mm;

	/*
	 * POWER9 NMMU must flush the TLB after clearing the PTE before
	 * installing a PTE with more relaxed access permissions, see
	 * radix__ptep_set_access_flags.
	 */
	if (!cpu_has_feature(CPU_FTR_ARCH_31) &&
	    is_pte_rw_upgrade(pte_val(old_pte), pte_val(pte)) &&
	    (atomic_read(&mm->context.copros) > 0))
		radix__flush_tlb_page(vma, addr);

	set_pte_at_unchecked(mm, addr, ptep, pte);
}

int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot)
{
	pte_t *ptep = (pte_t *)pud;
	pte_t new_pud = pfn_pte(__phys_to_pfn(addr), prot);

	if (!radix_enabled())
		return 0;

	set_pte_at_unchecked(&init_mm, 0 /* radix unused */, ptep, new_pud);

	return 1;
}

int pud_clear_huge(pud_t *pud)
{
	if (pud_leaf(*pud)) {
		pud_clear(pud);
		return 1;
	}

	return 0;
}

int pud_free_pmd_page(pud_t *pud, unsigned long addr)
{
	pmd_t *pmd;
	int i;

	pmd = pud_pgtable(*pud);
	pud_clear(pud);

	flush_tlb_kernel_range(addr, addr + PUD_SIZE);

	for (i = 0; i < PTRS_PER_PMD; i++) {
		if (!pmd_none(pmd[i])) {
			pte_t *pte;
			pte = (pte_t *)pmd_page_vaddr(pmd[i]);

			pte_free_kernel(&init_mm, pte);
		}
	}

	pmd_free(&init_mm, pmd);

	return 1;
}

int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot)
{
	pte_t *ptep = (pte_t *)pmd;
	pte_t new_pmd = pfn_pte(__phys_to_pfn(addr), prot);

	if (!radix_enabled())
		return 0;

	set_pte_at_unchecked(&init_mm, 0 /* radix unused */, ptep, new_pmd);

	return 1;
}

int pmd_clear_huge(pmd_t *pmd)
{
	if (pmd_leaf(*pmd)) {
		pmd_clear(pmd);
		return 1;
	}

	return 0;
}

int pmd_free_pte_page(pmd_t *pmd, unsigned long addr)
{
	pte_t *pte;

	pte = (pte_t *)pmd_page_vaddr(*pmd);
	pmd_clear(pmd);

	flush_tlb_kernel_range(addr, addr + PMD_SIZE);

	pte_free_kernel(&init_mm, pte);

	return 1;
}