// SPDX-License-Identifier: GPL-2.0-only
/*
 * Based on arch/arm/mm/mmu.c
 *
 * Copyright (C) 1995-2005 Russell King
 * Copyright (C) 2012 ARM Ltd.
 */

#include <linux/cache.h>
#include <linux/export.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/init.h>
#include <linux/ioport.h>
#include <linux/kexec.h>
#include <linux/libfdt.h>
#include <linux/mman.h>
#include <linux/nodemask.h>
#include <linux/memblock.h>
#include <linux/memremap.h>
#include <linux/memory.h>
#include <linux/fs.h>
#include <linux/io.h>
#include <linux/mm.h>
#include <linux/vmalloc.h>
#include <linux/set_memory.h>
#include <linux/kfence.h>
#include <linux/pkeys.h>
#include <linux/mm_inline.h>
#include <linux/pagewalk.h>
#include <linux/stop_machine.h>

#include <asm/barrier.h>
#include <asm/cputype.h>
#include <asm/fixmap.h>
#include <asm/kasan.h>
#include <asm/kernel-pgtable.h>
#include <asm/sections.h>
#include <asm/setup.h>
#include <linux/sizes.h>
#include <asm/tlb.h>
#include <asm/mmu_context.h>
#include <asm/ptdump.h>
#include <asm/tlbflush.h>
#include <asm/pgalloc.h>
#include <asm/kfence.h>

/*
 * Flags taken by the mapping-creation helpers in this file (the "flags"
 * argument threaded from __create_pgd_mapping() down to the per-level
 * alloc_init_*() helpers):
 *
 *   NO_BLOCK_MAPPINGS - never install PUD/PMD block (section) entries.
 *   NO_CONT_MAPPINGS  - never set PTE_CONT on the entries being written.
 *   NO_EXEC_MAPPINGS  - set the PXNTable attribute on any table entry that
 *                       is newly allocated while creating the mapping.
 */
#define NO_BLOCK_MAPPINGS	BIT(0)
#define NO_CONT_MAPPINGS	BIT(1)
#define NO_EXEC_MAPPINGS	BIT(2)	/* assumes FEAT_HPDS is not used */

DEFINE_STATIC_KEY_FALSE(arm64_ptdump_lock_key);

u64 kimage_voffset __ro_after_init;
EXPORT_SYMBOL(kimage_voffset);

u32 __boot_cpu_mode[] = { BOOT_CPU_MODE_EL2, BOOT_CPU_MODE_EL1 };

/*
 * True until mark_rodata_ro() clears it. While it is true,
 * set_swapper_pgd() writes swapper entries directly instead of going
 * through the fixmap.
 */
static bool rodata_is_rw __ro_after_init = true;

/*
 * The booting CPU updates the failed status @__early_cpu_boot_status,
 * with MMU turned off.
*/
long __section(".mmuoff.data.write") __early_cpu_boot_status;

/* Serialises use of the pgd fixmap slot in set_swapper_pgd(). */
static DEFINE_SPINLOCK(swapper_pgdir_lock);
/* Held by __create_pgd_mapping() around the whole table walk. */
static DEFINE_MUTEX(fixmap_lock);

/*
 * Store @pgd into the swapper page-table slot @pgdp.
 *
 * While rodata_is_rw is still true the slot is written in place, followed
 * by dsb(ishst) + isb(). Once it has been cleared, the slot is written
 * through a temporary pgd fixmap alias, with swapper_pgdir_lock held for
 * the duration of the set_fixmap/write/clear_fixmap sequence.
 */
void noinstr set_swapper_pgd(pgd_t *pgdp, pgd_t pgd)
{
	pgd_t *fixmap_pgdp;

	/*
	 * Don't bother with the fixmap if swapper_pg_dir is still mapped
	 * writable in the kernel mapping.
	 */
	if (rodata_is_rw) {
		WRITE_ONCE(*pgdp, pgd);
		dsb(ishst);
		isb();
		return;
	}

	spin_lock(&swapper_pgdir_lock);
	fixmap_pgdp = pgd_set_fixmap(__pa_symbol(pgdp));
	WRITE_ONCE(*fixmap_pgdp, pgd);
	/*
	 * We need dsb(ishst) here to ensure the page-table-walker sees
	 * our new entry before set_p?d() returns. The fixmap's
	 * flush_tlb_kernel_range() via clear_fixmap() does this for us.
	 */
	pgd_clear_fixmap();
	spin_unlock(&swapper_pgdir_lock);
}

/*
 * Pick the protection to use when mapping the physical page @pfn on behalf
 * of @file:
 *   - pfn_is_map_memory(@pfn) is false -> non-cached variant of @vma_prot;
 *   - otherwise, @file opened O_SYNC   -> write-combine variant;
 *   - otherwise                        -> @vma_prot unchanged.
 * @size is not used by this implementation.
 */
pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
			      unsigned long size, pgprot_t vma_prot)
{
	if (!pfn_is_map_memory(pfn))
		return pgprot_noncached(vma_prot);
	else if (file->f_flags & O_SYNC)
		return pgprot_writecombine(vma_prot);
	return vma_prot;
}
EXPORT_SYMBOL(phys_mem_access_prot);

/*
 * Boot-time page-table allocator: take one PAGE_SIZE-aligned page from
 * memblock and return its physical address. Allocation failure panics, so
 * this never returns an error value. @pgtable_level is unused here; the
 * parameter exists so the function matches the pgtable_alloc callback type.
 */
static phys_addr_t __init early_pgtable_alloc(enum pgtable_level pgtable_level)
{
	phys_addr_t phys;

	phys = memblock_phys_alloc_range(PAGE_SIZE, PAGE_SIZE, 0,
					 MEMBLOCK_ALLOC_NOLEAKTRACE);
	if (!phys)
		panic("Failed to allocate page table page\n");

	return phys;
}

/*
 * Decide whether a page-table entry may be changed from @old to @new in
 * place. Returns true when either value is invalid, or when the two values
 * have the same pfn and differ only in the bits collected in 'mask' below;
 * returns false otherwise (including any Non-Global -> Global transition).
 */
bool pgattr_change_is_safe(pteval_t old, pteval_t new)
{
	/*
	 * The following mapping attributes may be updated in live
	 * kernel mappings without the need for break-before-make.
	 */
	pteval_t mask = PTE_PXN | PTE_RDONLY | PTE_WRITE | PTE_NG |
			PTE_SWBITS_MASK;

	/* creating or taking down mappings is always safe */
	if (!pte_valid(__pte(old)) || !pte_valid(__pte(new)))
		return true;

	/* A live entry's pfn should not change */
	if (pte_pfn(__pte(old)) != pte_pfn(__pte(new)))
		return false;

	/* Transitioning from Non-Global to Global is unsafe */
	if (old & ~new & PTE_NG)
		return false;

	/*
	 * Changing the memory type between Normal and Normal-Tagged is safe
	 * since Tagged is considered a permission attribute from the
	 * mismatched attribute aliases perspective.
	 */
	if (((old & PTE_ATTRINDX_MASK) == PTE_ATTRINDX(MT_NORMAL) ||
	     (old & PTE_ATTRINDX_MASK) == PTE_ATTRINDX(MT_NORMAL_TAGGED)) &&
	    ((new & PTE_ATTRINDX_MASK) == PTE_ATTRINDX(MT_NORMAL) ||
	     (new & PTE_ATTRINDX_MASK) == PTE_ATTRINDX(MT_NORMAL_TAGGED)))
		mask |= PTE_ATTRINDX_MASK;

	return ((old ^ new) & ~mask) == 0;
}

/* Zero a freshly allocated table page and order the stores with a dsb. */
static void init_clear_pgtable(void *table)
{
	clear_page(table);

	/* Ensure the zeroing is observed by page table walks. */
	dsb(ishst);
}

/*
 * Write page entries for [@addr, @end), one per PAGE_SIZE, starting at
 * @ptep and mapping consecutive pages from @phys with @prot. Each write is
 * checked against the previous value with pgattr_change_is_safe(); an
 * unsafe change is a BUG. No barrier is issued here.
 */
static void init_pte(pte_t *ptep, unsigned long addr, unsigned long end,
		     phys_addr_t phys, pgprot_t prot)
{
	do {
		pte_t old_pte = __ptep_get(ptep);

		/*
		 * Required barriers to make this visible to the table walker
		 * are deferred to the end of alloc_init_cont_pte().
		 */
		__set_pte_nosync(ptep, pfn_pte(__phys_to_pfn(phys), prot));

		/*
		 * After the PTE entry has been populated once, we
		 * only allow updates to the permission attributes.
		 */
		BUG_ON(!pgattr_change_is_safe(pte_val(old_pte),
					      pte_val(__ptep_get(ptep))));

		phys += PAGE_SIZE;
	} while (ptep++, addr += PAGE_SIZE, addr != end);
}

/*
 * Return true if any of the CONT_PTES entries starting at @ptep is valid
 * but does not carry the contiguous bit. Used to avoid adding PTE_CONT to
 * a range that is already live with non-contiguous entries.
 */
static bool pte_range_has_valid_noncont(pte_t *ptep)
{
	for (int i = 0; i < CONT_PTES; i++) {
		pte_t pte = __ptep_get(&ptep[i]);

		if (pte_valid(pte) && !pte_cont(pte))
			return true;
	}
	return false;
}

/*
 * Map [@addr, @end) at PTE level beneath the PMD entry @pmdp.
 *
 * If *@pmdp is empty a PTE table is obtained from @pgtable_alloc (which
 * must then be non-NULL), zeroed and installed; NO_EXEC_MAPPINGS in @flags
 * adds PMD_TABLE_PXN to the new table entry. The table is accessed through
 * the pte fixmap slot, which is released before returning on the success
 * path.
 *
 * Returns 0 on success or -ENOMEM if @pgtable_alloc returned
 * INVALID_PHYS_ADDR. A leaf or bad *@pmdp is a BUG.
 */
static int alloc_init_cont_pte(pmd_t *pmdp, unsigned long addr,
			       unsigned long end, phys_addr_t phys,
			       pgprot_t prot,
			       phys_addr_t (*pgtable_alloc)(enum pgtable_level),
			       int flags)
{
	unsigned long next;
	pmd_t pmd = READ_ONCE(*pmdp);
	pte_t *ptep;

	BUG_ON(pmd_leaf(pmd));
	if (pmd_none(pmd)) {
		pmdval_t pmdval = PMD_TYPE_TABLE | PMD_TABLE_UXN | PMD_TABLE_AF;
		phys_addr_t pte_phys;

		if (flags & NO_EXEC_MAPPINGS)
			pmdval |= PMD_TABLE_PXN;
		BUG_ON(!pgtable_alloc);
		pte_phys = pgtable_alloc(PGTABLE_LEVEL_PTE);
		if (pte_phys == INVALID_PHYS_ADDR)
			return -ENOMEM;
		ptep = pte_set_fixmap(pte_phys);
		init_clear_pgtable(ptep);
		ptep += pte_index(addr);
		__pmd_populate(pmdp, pte_phys, pmdval);
	} else {
		BUG_ON(pmd_bad(pmd));
		ptep = pte_set_fixmap_offset(pmdp, addr);
	}

	do {
		pgprot_t __prot = prot;

		next = pte_cont_addr_end(addr, end);

		/*
		 * use a contiguous mapping if the range is suitably aligned
		 * (the alignment test comes first, so the noncont scan only
		 * runs when ptep sits at the start of a contiguous group)
		 */
		if ((((addr | next | phys) & ~CONT_PTE_MASK) == 0) &&
		    (flags & NO_CONT_MAPPINGS) == 0 &&
		    !pte_range_has_valid_noncont(ptep))
			__prot = __pgprot(pgprot_val(prot) | PTE_CONT);

		init_pte(ptep, addr, next, phys, __prot);

		ptep += pte_index(next) - pte_index(addr);
		phys += next - addr;
	} while (addr = next, addr != end);

	/*
	 * Note: barriers and maintenance necessary to clear the fixmap slot
	 * ensure that all previous pgtable writes are visible to the table
	 * walker.
	 */
	pte_clear_fixmap();

	return 0;
}

/*
 * Populate PMD entries for [@addr, @end) starting at @pmdp.
 *
 * For each PMD-sized step a block entry is written when @addr, the step
 * end and @phys are all PMD-aligned, NO_BLOCK_MAPPINGS is clear and the
 * existing entry is not a table; otherwise the step is mapped with PTEs
 * via alloc_init_cont_pte().
 *
 * Returns 0 on success or the error returned by alloc_init_cont_pte().
 */
static int init_pmd(pmd_t *pmdp, unsigned long addr, unsigned long end,
		    phys_addr_t phys, pgprot_t prot,
		    phys_addr_t (*pgtable_alloc)(enum pgtable_level), int flags)
{
	unsigned long next;

	do {
		pmd_t old_pmd = READ_ONCE(*pmdp);

		next = pmd_addr_end(addr, end);

		/* try section mapping first */
		if (((addr | next | phys) & ~PMD_MASK) == 0 &&
		    (flags & NO_BLOCK_MAPPINGS) == 0 &&
		    !pmd_table(old_pmd)) {
			WARN_ON(!pmd_set_huge(pmdp, phys, prot));

			/*
			 * After the PMD entry has been populated once, we
			 * only allow updates to the permission attributes.
			 */
			BUG_ON(!pgattr_change_is_safe(pmd_val(old_pmd),
						      READ_ONCE(pmd_val(*pmdp))));
		} else {
			int ret;

			ret = alloc_init_cont_pte(pmdp, addr, next, phys, prot,
						  pgtable_alloc, flags);
			if (ret)
				return ret;

			/* A pre-existing table entry must not have changed. */
			VM_WARN_ON_ONCE(pmd_val(old_pmd) != 0 &&
					pmd_val(old_pmd) != READ_ONCE(pmd_val(*pmdp)));
		}
		phys += next - addr;
	} while (pmdp++, addr = next, addr != end);

	return 0;
}

/*
 * PMD-level counterpart of pte_range_has_valid_noncont(): return true if
 * any of the CONT_PMDS entries starting at @pmdp is valid but lacks the
 * contiguous bit.
 */
static bool pmd_range_has_valid_noncont(pmd_t *pmdp)
{
	for (int i = 0; i < CONT_PMDS; i++) {
		pte_t pte = pmd_pte(READ_ONCE(pmdp[i]));

		if (pte_valid(pte) && !pte_cont(pte))
			return true;
	}
	return false;
}

/*
 * Map [@addr, @end) at PMD level (or below) beneath the PUD entry @pudp.
 *
 * Allocates, zeroes and installs a PMD table if *@pudp is empty, then
 * walks the range in contiguous-PMD-sized steps, adding PTE_CONT to the
 * protection passed to init_pmd() when the step is suitably aligned,
 * NO_CONT_MAPPINGS is clear and no valid non-contiguous entry is already
 * present. The PMD table is accessed through the pmd fixmap slot, which is
 * cleared on both the success path and the init_pmd() failure path.
 *
 * Returns 0 on success, -ENOMEM if the table allocation failed, or the
 * error returned by init_pmd().
 */
static int alloc_init_cont_pmd(pud_t *pudp, unsigned long addr,
			       unsigned long end, phys_addr_t phys,
			       pgprot_t prot,
			       phys_addr_t (*pgtable_alloc)(enum pgtable_level),
			       int flags)
{
	int ret;
	unsigned long next;
	pud_t pud = READ_ONCE(*pudp);
	pmd_t *pmdp;

	/*
	 * Check for initial section mappings in the pgd/pud.
	 */
	BUG_ON(pud_leaf(pud));
	if (pud_none(pud)) {
		pudval_t pudval = PUD_TYPE_TABLE | PUD_TABLE_UXN | PUD_TABLE_AF;
		phys_addr_t pmd_phys;

		if (flags & NO_EXEC_MAPPINGS)
			pudval |= PUD_TABLE_PXN;
		BUG_ON(!pgtable_alloc);
		pmd_phys = pgtable_alloc(PGTABLE_LEVEL_PMD);
		if (pmd_phys == INVALID_PHYS_ADDR)
			return -ENOMEM;
		pmdp = pmd_set_fixmap(pmd_phys);
		init_clear_pgtable(pmdp);
		pmdp += pmd_index(addr);
		__pud_populate(pudp, pmd_phys, pudval);
	} else {
		BUG_ON(pud_bad(pud));
		pmdp = pmd_set_fixmap_offset(pudp, addr);
	}

	do {
		pgprot_t __prot = prot;

		next = pmd_cont_addr_end(addr, end);

		/* use a contiguous mapping if the range is suitably aligned */
		if ((((addr | next | phys) & ~CONT_PMD_MASK) == 0) &&
		    (flags & NO_CONT_MAPPINGS) == 0 &&
		    !pmd_range_has_valid_noncont(pmdp))
			__prot = __pgprot(pgprot_val(prot) | PTE_CONT);

		ret = init_pmd(pmdp, addr, next, phys, __prot, pgtable_alloc, flags);
		if (ret)
			goto out;

		pmdp += pmd_index(next) - pmd_index(addr);
		phys += next - addr;
	} while (addr = next, addr != end);

out:
	pmd_clear_fixmap();

	return ret;
}

/*
 * Map [@addr, @end) at PUD level (or below) beneath the P4D entry @p4dp.
 *
 * Allocates, zeroes and installs a PUD table if *@p4dp is empty. For each
 * PUD-sized step a block entry is written when pud_sect_supported(), the
 * step and @phys are PUD-aligned, NO_BLOCK_MAPPINGS is clear and the
 * existing entry is not a table; otherwise the step is handed to
 * alloc_init_cont_pmd(). 'ret' starts at 0 because the block path never
 * assigns it.
 *
 * Returns 0 on success, -ENOMEM if the table allocation failed, or the
 * error returned by alloc_init_cont_pmd(). The pud fixmap slot is cleared
 * on every path that set it.
 */
static int alloc_init_pud(p4d_t *p4dp, unsigned long addr, unsigned long end,
			  phys_addr_t phys, pgprot_t prot,
			  phys_addr_t (*pgtable_alloc)(enum pgtable_level),
			  int flags)
{
	int ret = 0;
	unsigned long next;
	p4d_t p4d = READ_ONCE(*p4dp);
	pud_t *pudp;

	if (p4d_none(p4d)) {
		p4dval_t p4dval = P4D_TYPE_TABLE | P4D_TABLE_UXN | P4D_TABLE_AF;
		phys_addr_t pud_phys;

		if (flags & NO_EXEC_MAPPINGS)
			p4dval |= P4D_TABLE_PXN;
		BUG_ON(!pgtable_alloc);
		pud_phys = pgtable_alloc(PGTABLE_LEVEL_PUD);
		if (pud_phys == INVALID_PHYS_ADDR)
			return -ENOMEM;
		pudp = pud_set_fixmap(pud_phys);
		init_clear_pgtable(pudp);
		pudp += pud_index(addr);
		__p4d_populate(p4dp, pud_phys, p4dval);
	} else {
		BUG_ON(p4d_bad(p4d));
		pudp = pud_set_fixmap_offset(p4dp, addr);
	}

	do {
		pud_t old_pud = READ_ONCE(*pudp);

		next = pud_addr_end(addr, end);

		/*
		 * For 4K granule only, attempt to put down a 1GB block
		 */
		if (pud_sect_supported() &&
		    ((addr | next | phys) & ~PUD_MASK) == 0 &&
		    (flags & NO_BLOCK_MAPPINGS) == 0 &&
		    !pud_table(old_pud)) {
			WARN_ON(!pud_set_huge(pudp, phys, prot));

			/*
			 * After the PUD entry has been populated once, we
			 * only allow updates to the permission attributes.
			 */
			BUG_ON(!pgattr_change_is_safe(pud_val(old_pud),
						      READ_ONCE(pud_val(*pudp))));
		} else {
			ret = alloc_init_cont_pmd(pudp, addr, next, phys, prot,
						  pgtable_alloc, flags);
			if (ret)
				goto out;

			/* A pre-existing table entry must not have changed. */
			VM_WARN_ON_ONCE(pud_val(old_pud) != 0 &&
					pud_val(old_pud) != READ_ONCE(pud_val(*pudp)));
		}
		phys += next - addr;
	} while (pudp++, addr = next, addr != end);

out:
	pud_clear_fixmap();

	return ret;
}

/*
 * Map [@addr, @end) at P4D level (or below) beneath the PGD entry @pgdp.
 *
 * Allocates, zeroes and installs a P4D table if *@pgdp is empty, then
 * calls alloc_init_pud() for each P4D-sized step.
 *
 * Returns 0 on success, -ENOMEM if the table allocation failed, or the
 * error returned by alloc_init_pud(). The p4d fixmap slot is cleared on
 * every path that set it.
 */
static int alloc_init_p4d(pgd_t *pgdp, unsigned long addr, unsigned long end,
			  phys_addr_t phys, pgprot_t prot,
			  phys_addr_t (*pgtable_alloc)(enum pgtable_level),
			  int flags)
{
	int ret;
	unsigned long next;
	pgd_t pgd = READ_ONCE(*pgdp);
	p4d_t *p4dp;

	if (pgd_none(pgd)) {
		pgdval_t pgdval = PGD_TYPE_TABLE | PGD_TABLE_UXN | PGD_TABLE_AF;
		phys_addr_t p4d_phys;

		if (flags & NO_EXEC_MAPPINGS)
			pgdval |= PGD_TABLE_PXN;
		BUG_ON(!pgtable_alloc);
		p4d_phys = pgtable_alloc(PGTABLE_LEVEL_P4D);
		if (p4d_phys == INVALID_PHYS_ADDR)
			return -ENOMEM;
		p4dp = p4d_set_fixmap(p4d_phys);
		init_clear_pgtable(p4dp);
		p4dp += p4d_index(addr);
		__pgd_populate(pgdp, p4d_phys, pgdval);
	} else {
		BUG_ON(pgd_bad(pgd));
		p4dp = p4d_set_fixmap_offset(pgdp, addr);
	}

	do {
		p4d_t old_p4d = READ_ONCE(*p4dp);

		next = p4d_addr_end(addr, end);

		ret = alloc_init_pud(p4dp, addr, next, phys, prot,
				     pgtable_alloc, flags);
		if (ret)
			goto out;

		/* A pre-existing table entry must not have changed. */
		VM_WARN_ON_ONCE(p4d_val(old_p4d) != 0 &&
				p4d_val(old_p4d) != READ_ONCE(p4d_val(*p4dp)));

		phys += next - addr;
	} while (p4dp++, addr = next, addr != end);

out:
	p4d_clear_fixmap();

	return ret;
}

/*
 * Create or update the mapping of @size bytes of physical memory at @phys
 * to virtual address @virt in the page tables rooted at @pgdir.
 *
 * @phys and @virt must share the same offset within a page; the range is
 * widened to whole pages before it is walked. @pgtable_alloc supplies new
 * table pages and may be NULL only if no new table is needed (the per-level
 * helpers BUG otherwise). @flags is a mask of NO_*_MAPPINGS.
 *
 * Returns 0 on success, -EINVAL (with a warning) on an offset mismatch, or
 * the error returned by alloc_init_p4d().
 *
 * NOTE(review): the walk is a do/while and therefore runs at least once;
 * an empty range (size == 0 with a page-aligned @virt) is presumably never
 * passed in - confirm against callers.
 */
static int __create_pgd_mapping_locked(pgd_t *pgdir, phys_addr_t phys,
				       unsigned long virt, phys_addr_t size,
				       pgprot_t prot,
				       phys_addr_t (*pgtable_alloc)(enum pgtable_level),
				       int flags)
{
	int ret;
	unsigned long addr, end, next;
	pgd_t *pgdp = pgd_offset_pgd(pgdir, virt);

	/*
	 * If the virtual and physical address don't have the same offset
	 * within a page, we cannot map the region as the caller expects.
	 */
	if (WARN_ON((phys ^ virt) & ~PAGE_MASK))
		return -EINVAL;

	phys &= PAGE_MASK;
	addr = virt & PAGE_MASK;
	end = PAGE_ALIGN(virt + size);

	do {
		next = pgd_addr_end(addr, end);
		ret = alloc_init_p4d(pgdp, addr, next, phys, prot, pgtable_alloc,
				     flags);
		if (ret)
			return ret;
		phys += next - addr;
	} while (pgdp++, addr = next, addr != end);

	return 0;
}

/*
 * Wrapper around __create_pgd_mapping_locked() that holds fixmap_lock for
 * the duration of the walk. Returns whatever the locked variant returns.
 */
static int __create_pgd_mapping(pgd_t *pgdir, phys_addr_t phys,
				unsigned long virt, phys_addr_t size,
				pgprot_t prot,
				phys_addr_t (*pgtable_alloc)(enum pgtable_level),
				int flags)
{
	int ret;

	mutex_lock(&fixmap_lock);
	ret = __create_pgd_mapping_locked(pgdir, phys, virt, size, prot,
					  pgtable_alloc, flags);
	mutex_unlock(&fixmap_lock);

	return ret;
}

/*
 * Same as __create_pgd_mapping() but any failure is fatal: a non-zero
 * return value results in panic().
 */
static void early_create_pgd_mapping(pgd_t *pgdir, phys_addr_t phys,
				     unsigned long virt, phys_addr_t size,
				     pgprot_t prot,
				     phys_addr_t (*pgtable_alloc)(enum pgtable_level),
				     int flags)
{
	int ret;

	ret = __create_pgd_mapping(pgdir, phys, virt, size, prot, pgtable_alloc,
				   flags);
	if (ret)
		panic("Failed to create page tables\n");
}

/*
 * Allocate one page-table page with pagetable_alloc() and run the
 * constructor that matches @pgtable_level, passing @mm to the PTE and PMD
 * constructors.
 *
 * __GFP_ZERO is stripped from @gfp, so the page is returned unzeroed; the
 * alloc_init_*() callers clear it with init_clear_pgtable(), while
 * split_pmd()/split_pud() overwrite every entry.
 *
 * Returns the physical address of the page, or INVALID_PHYS_ADDR if the
 * allocation failed. A failing PTE/PMD constructor is a BUG; being asked
 * for PGTABLE_LEVEL_PGD only triggers VM_WARN_ON().
 */
static phys_addr_t __pgd_pgtable_alloc(struct mm_struct *mm, gfp_t gfp,
				       enum pgtable_level pgtable_level)
{
	/* Page is zeroed by init_clear_pgtable() so don't duplicate effort. */
	struct ptdesc *ptdesc = pagetable_alloc(gfp & ~__GFP_ZERO, 0);
	phys_addr_t pa;

	if (!ptdesc)
		return INVALID_PHYS_ADDR;

	pa = page_to_phys(ptdesc_page(ptdesc));

	switch (pgtable_level) {
	case PGTABLE_LEVEL_PTE:
		BUG_ON(!pagetable_pte_ctor(mm, ptdesc));
		break;
	case PGTABLE_LEVEL_PMD:
		BUG_ON(!pagetable_pmd_ctor(mm, ptdesc));
		break;
	case PGTABLE_LEVEL_PUD:
		pagetable_pud_ctor(ptdesc);
		break;
	case PGTABLE_LEVEL_P4D:
		pagetable_p4d_ctor(ptdesc);
		break;
	case PGTABLE_LEVEL_PGD:
		VM_WARN_ON(1);
		break;
	}

	return pa;
}

/* Allocate a table page for init_mm using the caller-supplied @gfp. */
static phys_addr_t
pgd_pgtable_alloc_init_mm_gfp(enum pgtable_level pgtable_level, gfp_t gfp)
{
	return __pgd_pgtable_alloc(&init_mm, gfp, pgtable_level);
}

/* Allocate a table page for init_mm with GFP_PGTABLE_KERNEL. */
static phys_addr_t __maybe_unused
pgd_pgtable_alloc_init_mm(enum pgtable_level pgtable_level)
{
	return pgd_pgtable_alloc_init_mm_gfp(pgtable_level, GFP_PGTABLE_KERNEL);
}

/*
 * Allocate a table page with a NULL mm passed to the constructors; used as
 * the allocator for create_pgd_mapping().
 */
static phys_addr_t
pgd_pgtable_alloc_special_mm(enum pgtable_level pgtable_level)
{
	return __pgd_pgtable_alloc(NULL, GFP_PGTABLE_KERNEL, pgtable_level);
}

/*
 * Clear the contiguous bit on all CONT_PTES entries of the contiguous
 * group that contains @ptep (the pointer is first aligned down to the
 * start of the group).
 */
static void split_contpte(pte_t *ptep)
{
	int i;

	ptep = PTR_ALIGN_DOWN(ptep, sizeof(*ptep) * CONT_PTES);
	for (i = 0; i < CONT_PTES; i++, ptep++)
		__set_pte(ptep, pte_mknoncont(__ptep_get(ptep)));
}

/*
 * Replace the PMD leaf @pmd stored at @pmdp with a table of PTRS_PER_PTE
 * page entries covering the same pfn range.
 *
 * The page protection is derived from @pmd; if @pmd is not valid the PTEs
 * are made invalid too. PTE_CONT is cleared and then set again only when
 * @to_cont is true. PMD_TABLE_PXN is set on the table entry when the leaf
 * had PMD_SECT_PXN. The new table is allocated for init_mm with @gfp and
 * accessed through phys_to_virt().
 *
 * Returns 0 on success or -ENOMEM if the table page could not be
 * allocated (in which case *@pmdp is left untouched).
 */
static int split_pmd(pmd_t *pmdp, pmd_t pmd, gfp_t gfp, bool to_cont)
{
	pmdval_t tableprot = PMD_TYPE_TABLE | PMD_TABLE_UXN | PMD_TABLE_AF;
	unsigned long pfn = pmd_pfn(pmd);
	pgprot_t prot = pmd_pgprot(pmd);
	phys_addr_t pte_phys;
	pte_t *ptep;
	int i;

	pte_phys = pgd_pgtable_alloc_init_mm_gfp(PGTABLE_LEVEL_PTE, gfp);
	if (pte_phys == INVALID_PHYS_ADDR)
		return -ENOMEM;
	ptep = (pte_t *)phys_to_virt(pte_phys);

	if (pgprot_val(prot) & PMD_SECT_PXN)
		tableprot |= PMD_TABLE_PXN;

	prot = __pgprot((pgprot_val(prot) & ~PTE_TYPE_MASK) | PTE_TYPE_PAGE);
	if (!pmd_valid(pmd))
		prot = pte_pgprot(pte_mkinvalid(pfn_pte(0, prot)));
	prot = __pgprot(pgprot_val(prot) & ~PTE_CONT);
	if (to_cont)
		prot = __pgprot(pgprot_val(prot) | PTE_CONT);

	for (i = 0; i < PTRS_PER_PTE; i++, ptep++, pfn++)
		__set_pte(ptep, pfn_pte(pfn, prot));

	/*
	 * Ensure the pte entries are visible to the table walker by the time
	 * the pmd entry that points to the ptes is visible.
	 */
	dsb(ishst);
	__pmd_populate(pmdp, pte_phys, tableprot);

	return 0;
}

/*
 * Clear the contiguous bit on all CONT_PMDS entries of the contiguous
 * group that contains @pmdp (the pointer is first aligned down to the
 * start of the group).
 */
static void split_contpmd(pmd_t *pmdp)
{
	int i;

	pmdp = PTR_ALIGN_DOWN(pmdp, sizeof(*pmdp) * CONT_PMDS);
	for (i = 0; i < CONT_PMDS; i++, pmdp++)
		set_pmd(pmdp, pmd_mknoncont(pmdp_get(pmdp)));
}

/*
 * Replace the PUD leaf @pud stored at @pudp with a table of PTRS_PER_PMD
 * PMD block entries covering the same pfn range (each entry advances the
 * pfn by PMD_SIZE >> PAGE_SHIFT).
 *
 * Mirrors split_pmd(): an invalid @pud yields invalid PMDs, PTE_CONT is
 * cleared and re-set only when @to_cont is true, and the table entry gets
 * PUD_TABLE_PXN when the leaf's protection has PMD_SECT_PXN set.
 * NOTE(review): the PXN test uses the PMD_SECT_* name on a PUD-level
 * protection - presumably the bit is shared between levels; confirm.
 *
 * Returns 0 on success or -ENOMEM if the table page could not be
 * allocated (in which case *@pudp is left untouched).
 */
static int split_pud(pud_t *pudp, pud_t pud, gfp_t gfp, bool to_cont)
{
	pudval_t tableprot = PUD_TYPE_TABLE | PUD_TABLE_UXN | PUD_TABLE_AF;
	unsigned int step = PMD_SIZE >> PAGE_SHIFT;
	unsigned long pfn = pud_pfn(pud);
	pgprot_t prot = pud_pgprot(pud);
	phys_addr_t pmd_phys;
	pmd_t *pmdp;
	int i;

	pmd_phys = pgd_pgtable_alloc_init_mm_gfp(PGTABLE_LEVEL_PMD, gfp);
	if (pmd_phys == INVALID_PHYS_ADDR)
		return -ENOMEM;
	pmdp = (pmd_t *)phys_to_virt(pmd_phys);

	if (pgprot_val(prot) & PMD_SECT_PXN)
		tableprot |= PUD_TABLE_PXN;

	prot = __pgprot((pgprot_val(prot) & ~PMD_TYPE_MASK) | PMD_TYPE_SECT);
	if (!pud_valid(pud))
		prot = pmd_pgprot(pmd_mkinvalid(pfn_pmd(0, prot)));
	prot = __pgprot(pgprot_val(prot) & ~PTE_CONT);
	if (to_cont)
		prot = __pgprot(pgprot_val(prot) | PTE_CONT);

	for (i = 0; i < PTRS_PER_PMD; i++, pmdp++, pfn += step)
		set_pmd(pmdp, pfn_pmd(pfn, prot));

	/*
	 * Ensure the pmd entries are visible to the table walker by the time
	 * the pud entry that points to the pmds is visible.
	 */
	dsb(ishst);
	__pud_populate(pudp, pmd_phys, tableprot);

	return 0;
}

/*
 * Make sure @addr lies on a leaf-mapping boundary in the kernel page
 * tables, splitting the leaf that contains it one level at a time
 * (pud -> contpmd -> pmd -> contpte -> pte) until it does.
 *
 * The walk stops as soon as @addr is aligned to the size of the level
 * being examined or the entry at that level is not present. Called from
 * split_kernel_leaf_mapping() with pgtable_split_lock held.
 *
 * Returns 0 on success (including "nothing to do") or the error returned
 * by split_pud()/split_pmd().
 */
static int split_kernel_leaf_mapping_locked(unsigned long addr)
{
	pgd_t *pgdp, pgd;
	p4d_t *p4dp, p4d;
	pud_t *pudp, pud;
	pmd_t *pmdp, pmd;
	pte_t *ptep, pte;
	int ret = 0;

	/*
	 * PGD: If addr is PGD aligned then addr already describes a leaf
	 * boundary. If not present then there is nothing to split.
	 */
	if (ALIGN_DOWN(addr, PGDIR_SIZE) == addr)
		goto out;
	pgdp = pgd_offset_k(addr);
	pgd = pgdp_get(pgdp);
	if (!pgd_present(pgd))
		goto out;

	/*
	 * P4D: If addr is P4D aligned then addr already describes a leaf
	 * boundary. If not present then there is nothing to split.
	 */
	if (ALIGN_DOWN(addr, P4D_SIZE) == addr)
		goto out;
	p4dp = p4d_offset(pgdp, addr);
	p4d = p4dp_get(p4dp);
	if (!p4d_present(p4d))
		goto out;

	/*
	 * PUD: If addr is PUD aligned then addr already describes a leaf
	 * boundary. If not present then there is nothing to split. Otherwise,
	 * if we have a pud leaf, split to contpmd.
	 */
	if (ALIGN_DOWN(addr, PUD_SIZE) == addr)
		goto out;
	pudp = pud_offset(p4dp, addr);
	pud = pudp_get(pudp);
	if (!pud_present(pud))
		goto out;
	if (pud_leaf(pud)) {
		ret = split_pud(pudp, pud, GFP_PGTABLE_KERNEL, true);
		if (ret)
			goto out;
	}

	/*
	 * CONTPMD: If addr is CONTPMD aligned then addr already describes a
	 * leaf boundary. If not present then there is nothing to split.
	 * Otherwise, if we have a contpmd leaf, split to pmd.
	 */
	if (ALIGN_DOWN(addr, CONT_PMD_SIZE) == addr)
		goto out;
	pmdp = pmd_offset(pudp, addr);
	pmd = pmdp_get(pmdp);
	if (!pmd_present(pmd))
		goto out;
	if (pmd_leaf(pmd)) {
		if (pmd_cont(pmd))
			split_contpmd(pmdp);
		/*
		 * PMD: If addr is PMD aligned then addr already describes a
		 * leaf boundary. Otherwise, split to contpte.
		 */
		if (ALIGN_DOWN(addr, PMD_SIZE) == addr)
			goto out;
		ret = split_pmd(pmdp, pmd, GFP_PGTABLE_KERNEL, true);
		if (ret)
			goto out;
	}

	/*
	 * CONTPTE: If addr is CONTPTE aligned then addr already describes a
	 * leaf boundary. If not present then there is nothing to split.
	 * Otherwise, if we have a contpte leaf, split to pte.
	 */
	if (ALIGN_DOWN(addr, CONT_PTE_SIZE) == addr)
		goto out;
	ptep = pte_offset_kernel(pmdp, addr);
	pte = __ptep_get(ptep);
	if (!pte_present(pte))
		goto out;
	if (pte_cont(pte))
		split_contpte(ptep);

out:
	return ret;
}

/*
 * Return true if the linear map has to be created with page-granular
 * mappings only: always when debug_pagealloc is enabled; never when
 * BBML2_NOABORT is available (system-wide once capabilities are
 * finalized, otherwise on the calling CPU); and otherwise when rodata_full,
 * KFENCE direct-map manipulation or realm world requires it.
 */
static inline bool force_pte_mapping(void)
{
	const bool bbml2 = system_capabilities_finalized() ?
		system_supports_bbml2_noabort() : cpu_supports_bbml2_noabort();

	if (debug_pagealloc_enabled())
		return true;
	if (bbml2)
		return false;
	return rodata_full || arm64_kfence_can_set_direct_map() || is_realm_world();
}

/* Serialises the page-table splitting paths that take it in this file. */
static DEFINE_MUTEX(pgtable_split_lock);
/*
 * Set by map_mem(): true when the linear map was not forced to pte
 * granularity but the direct map may still need to be modified.
 */
static bool linear_map_requires_bbml2;

/*
 * Split kernel leaf mappings so that both @start and @end fall on leaf
 * boundaries, allowing the permissions of [@start, @end) to be changed
 * independently of its neighbours.
 *
 * Returns 0 when nothing needs doing or the split succeeded, -EBUSY when
 * called too early in boot to split safely, -EINVAL when @start or @end
 * is not page-aligned, or the error returned by
 * split_kernel_leaf_mapping_locked(). Takes pgtable_split_lock, so may
 * sleep once past the early-return checks.
 */
int split_kernel_leaf_mapping(unsigned long start, unsigned long end)
{
	int ret;

	/*
	 * If the region is within a pte-mapped area, there is no need to try to
	 * split. Additionally, CONFIG_DEBUG_PAGEALLOC and CONFIG_KFENCE may
	 * change permissions from atomic context so for those cases (which are
	 * always pte-mapped), we must not go any further because taking the
	 * mutex below may sleep. Do not call force_pte_mapping() here because
	 * it could return a confusing result if called from a secondary cpu
	 * prior to finalizing caps. Instead, linear_map_requires_bbml2 gives us
	 * what we need.
	 */
	if (!linear_map_requires_bbml2 || is_kfence_address((void *)start))
		return 0;

	if (!system_supports_bbml2_noabort()) {
		/*
		 * !BBML2_NOABORT systems should not be trying to change
		 * permissions on anything that is not pte-mapped in the first
		 * place. Just return early and let the permission change code
		 * raise a warning if not already pte-mapped.
		 */
		if (system_capabilities_finalized())
			return 0;

		/*
		 * Boot-time: split_kernel_leaf_mapping_locked() allocates from
		 * page allocator. Can't split until it's available.
		 */
		if (WARN_ON(!page_alloc_available))
			return -EBUSY;

		/*
		 * Boot-time: Started secondary cpus but don't know if they
		 * support BBML2_NOABORT yet. Can't allow splitting in this
		 * window in case they don't.
		 */
		if (WARN_ON(num_online_cpus() > 1))
			return -EBUSY;
	}

	/*
	 * Ensure start and end are at least page-aligned since this is the
	 * finest granularity we can split to.
	 */
	if (start != PAGE_ALIGN(start) || end != PAGE_ALIGN(end))
		return -EINVAL;

	mutex_lock(&pgtable_split_lock);
	lazy_mmu_mode_enable();

	/*
	 * split_kernel_leaf_mapping_locked() may sleep; this is not a
	 * problem for ARM64 since ARM64's lazy MMU implementation allows
	 * sleeping.
	 *
	 * Optimize for the common case of splitting out a single page from a
	 * larger mapping. Here we can just split on the "least aligned" of
	 * start and end and this will guarantee that there must also be a split
	 * on the more aligned address since both addresses must be in the
	 * same contpte block and it must have been split to ptes.
	 */
	if (end - start == PAGE_SIZE) {
		start = __ffs(start) < __ffs(end) ? start : end;
		ret = split_kernel_leaf_mapping_locked(start);
	} else {
		ret = split_kernel_leaf_mapping_locked(start);
		if (!ret)
			ret = split_kernel_leaf_mapping_locked(end);
	}

	lazy_mmu_mode_disable();
	mutex_unlock(&pgtable_split_lock);
	return ret;
}

/*
 * Page-walk callback: split a PUD leaf into non-contiguous PMD entries.
 * The gfp mask for the new table is read from walk->private. Returns 0 or
 * the error from split_pud().
 */
static int split_to_ptes_pud_entry(pud_t *pudp, unsigned long addr,
				   unsigned long next, struct mm_walk *walk)
{
	gfp_t gfp = *(gfp_t *)walk->private;
	pud_t pud = pudp_get(pudp);
	int ret = 0;

	if (pud_leaf(pud))
		ret = split_pud(pudp, pud, gfp, false);

	return ret;
}

/*
 * Page-walk callback: split a PMD leaf (first dropping its contiguous
 * group, if any) into non-contiguous PTEs. The gfp mask for the new table
 * is read from walk->private. Returns 0 or the error from split_pmd().
 */
static int split_to_ptes_pmd_entry(pmd_t *pmdp, unsigned long addr,
				   unsigned long next, struct mm_walk *walk)
{
	gfp_t gfp = *(gfp_t *)walk->private;
	pmd_t pmd = pmdp_get(pmdp);
	int ret = 0;

	if (pmd_leaf(pmd)) {
		if (pmd_cont(pmd))
			split_contpmd(pmdp);
		ret = split_pmd(pmdp, pmd, gfp, false);

		/*
		 * We have split the pmd directly to ptes so there is no need to
		 * visit each pte to check if they are contpte.
		 */
		walk->action = ACTION_CONTINUE;
	}

	return ret;
}

/*
 * Page-walk callback: if the PTE is part of a contiguous group, clear the
 * contiguous bit on the whole group. Always returns 0.
 */
static int split_to_ptes_pte_entry(pte_t *ptep, unsigned long addr,
				   unsigned long next, struct mm_walk *walk)
{
	pte_t pte = __ptep_get(ptep);

	if (pte_cont(pte))
		split_contpte(ptep);

	return 0;
}

/* Callbacks used by range_split_to_ptes() to reduce a range to plain PTEs. */
static const struct mm_walk_ops split_to_ptes_ops = {
	.pud_entry	= split_to_ptes_pud_entry,
	.pmd_entry	= split_to_ptes_pmd_entry,
	.pte_entry	= split_to_ptes_pte_entry,
};

/*
 * Split every block and contiguous mapping in the kernel range
 * [@start, @end) down to individual PTEs, allocating new tables with
 * @gfp (handed to the callbacks through the walk's private pointer).
 * Runs inside a lazy-MMU section. Returns the value of the page-table
 * walk: 0 on success or the first callback error.
 */
static int range_split_to_ptes(unsigned long start, unsigned long end, gfp_t gfp)
{
	int ret;

	lazy_mmu_mode_enable();
	ret = walk_kernel_page_table_range_lockless(start, end,
					&split_to_ptes_ops, NULL, &gfp);
	lazy_mmu_mode_disable();

	return ret;
}

/*
 * Rendezvous flag used by linear_map_split_to_ptes(): set to 1 before
 * stop_machine(), polled by the boot CPU until it equals
 * num_online_cpus(), and written back to 0 once the split is complete.
 */
u32 idmap_kpti_bbml2_flag;

static void __init init_idmap_kpti_bbml2_flag(void)
{
	WRITE_ONCE(idmap_kpti_bbml2_flag, 1);
	/* Must be visible to other CPUs before stop_machine() is called. */
	smp_mb();
}

/*
 * stop_machine() callback run on every online CPU. CPU0 waits for the
 * others to park, splits the linear map (minus the kernel image alias) to
 * PTEs, flushes the TLB for the linear range and clears the rendezvous
 * flag. Every other CPU switches to the idmap and calls
 * wait_linear_map_split_to_ptes() through its physical address, then
 * switches back once that returns. A split failure panics. Always
 * returns 0.
 */
static int __init linear_map_split_to_ptes(void *__unused)
{
	/*
	 * Repainting the linear map must be done by CPU0 (the boot CPU) because
	 * that's the only CPU that we know supports BBML2. The other CPUs will
	 * be held in a waiting area with the idmap active.
	 */
	if (!smp_processor_id()) {
		unsigned long lstart = _PAGE_OFFSET(vabits_actual);
		unsigned long lend = PAGE_END;
		unsigned long kstart = (unsigned long)lm_alias(_stext);
		unsigned long kend = (unsigned long)lm_alias(__init_begin);
		int ret;

		/*
		 * Wait for all secondary CPUs to be put into the waiting area.
		 */
		smp_cond_load_acquire(&idmap_kpti_bbml2_flag, VAL == num_online_cpus());

		/*
		 * Walk all of the linear map [lstart, lend), except the kernel
		 * linear map alias [kstart, kend), and split all mappings to
		 * PTE. The kernel alias remains static throughout runtime so
		 * can continue to be safely mapped with large mappings.
		 */
		ret = range_split_to_ptes(lstart, kstart, GFP_ATOMIC);
		if (!ret)
			ret = range_split_to_ptes(kend, lend, GFP_ATOMIC);
		if (ret)
			panic("Failed to split linear map\n");
		flush_tlb_kernel_range(lstart, lend);

		/*
		 * Relies on dsb in flush_tlb_kernel_range() to avoid reordering
		 * before any page table split operations.
		 */
		WRITE_ONCE(idmap_kpti_bbml2_flag, 0);
	} else {
		typedef void (wait_split_fn)(void);
		extern wait_split_fn wait_linear_map_split_to_ptes;
		wait_split_fn *wait_fn;

		wait_fn = (void *)__pa_symbol(wait_linear_map_split_to_ptes);

		/*
		 * At least one secondary CPU doesn't support BBML2 so cannot
		 * tolerate the size of the live mappings changing. So have the
		 * secondary CPUs wait for the boot CPU to make the changes
		 * with the idmap active and init_mm inactive.
		 */
		cpu_install_idmap();
		wait_fn();
		cpu_uninstall_idmap();
	}

	return 0;
}

/*
 * If the linear map was built relying on BBML2_NOABORT but the finalized
 * system does not support it, repaint the linear map with PTEs under
 * stop_machine(). Otherwise do nothing.
 */
void __init linear_map_maybe_split_to_ptes(void)
{
	if (linear_map_requires_bbml2 && !system_supports_bbml2_noabort()) {
		init_idmap_kpti_bbml2_flag();
		stop_machine(linear_map_split_to_ptes, NULL, cpu_online_mask);
	}
}

/*
 * This function can only be used to modify existing table entries,
 * without allocating new levels of table. Note that this permits the
 * creation of new section or page entries.
1016 */ 1017 void __init create_mapping_noalloc(phys_addr_t phys, unsigned long virt, 1018 phys_addr_t size, pgprot_t prot) 1019 { 1020 if (virt < PAGE_OFFSET) { 1021 pr_warn("BUG: not creating mapping for %pa at 0x%016lx - outside kernel range\n", 1022 &phys, virt); 1023 return; 1024 } 1025 early_create_pgd_mapping(init_mm.pgd, phys, virt, size, prot, NULL, 0); 1026 } 1027 1028 void __init create_pgd_mapping(struct mm_struct *mm, phys_addr_t phys, 1029 unsigned long virt, phys_addr_t size, 1030 pgprot_t prot, bool page_mappings_only) 1031 { 1032 int flags = 0; 1033 1034 BUG_ON(mm == &init_mm); 1035 1036 if (page_mappings_only) 1037 flags = NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS; 1038 1039 early_create_pgd_mapping(mm->pgd, phys, virt, size, prot, 1040 pgd_pgtable_alloc_special_mm, flags); 1041 } 1042 1043 static void update_mapping_prot(phys_addr_t phys, unsigned long virt, 1044 phys_addr_t size, pgprot_t prot) 1045 { 1046 if (virt < PAGE_OFFSET) { 1047 pr_warn("BUG: not updating mapping for %pa at 0x%016lx - outside kernel range\n", 1048 &phys, virt); 1049 return; 1050 } 1051 1052 early_create_pgd_mapping(init_mm.pgd, phys, virt, size, prot, NULL, 0); 1053 1054 /* flush the TLBs after updating live kernel mappings */ 1055 flush_tlb_kernel_range(virt, virt + size); 1056 } 1057 1058 static void __init __map_memblock(phys_addr_t start, phys_addr_t end, 1059 pgprot_t prot, int flags) 1060 { 1061 early_create_pgd_mapping(swapper_pg_dir, start, __phys_to_virt(start), 1062 end - start, prot, early_pgtable_alloc, flags); 1063 } 1064 1065 void __init mark_linear_text_alias_ro(void) 1066 { 1067 /* 1068 * Remove the write permissions from the linear alias of .text/.rodata 1069 */ 1070 update_mapping_prot(__pa_symbol(_text), (unsigned long)lm_alias(_text), 1071 (unsigned long)__init_begin - (unsigned long)_text, 1072 PAGE_KERNEL_RO); 1073 } 1074 1075 #ifdef CONFIG_KFENCE 1076 1077 bool __ro_after_init kfence_early_init = !!CONFIG_KFENCE_SAMPLE_INTERVAL; 1078 1079 /* 
early_param() will be parsed before map_mem() below. */ 1080 static int __init parse_kfence_early_init(char *arg) 1081 { 1082 int val; 1083 1084 if (get_option(&arg, &val)) 1085 kfence_early_init = !!val; 1086 return 0; 1087 } 1088 early_param("kfence.sample_interval", parse_kfence_early_init); 1089 1090 static void __init arm64_kfence_map_pool(void) 1091 { 1092 phys_addr_t kfence_pool; 1093 1094 if (!kfence_early_init) 1095 return; 1096 1097 kfence_pool = memblock_phys_alloc(KFENCE_POOL_SIZE, PAGE_SIZE); 1098 if (!kfence_pool) { 1099 pr_err("failed to allocate kfence pool\n"); 1100 kfence_early_init = false; 1101 return; 1102 } 1103 1104 /* KFENCE pool needs page-level mapping. */ 1105 __map_memblock(kfence_pool, kfence_pool + KFENCE_POOL_SIZE, 1106 pgprot_tagged(PAGE_KERNEL), 1107 NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS | NO_EXEC_MAPPINGS); 1108 __kfence_pool = phys_to_virt(kfence_pool); 1109 } 1110 1111 bool arch_kfence_init_pool(void) 1112 { 1113 unsigned long start = (unsigned long)__kfence_pool; 1114 unsigned long end = start + KFENCE_POOL_SIZE; 1115 int ret; 1116 1117 /* Exit early if we know the linear map is already pte-mapped. */ 1118 if (force_pte_mapping()) 1119 return true; 1120 1121 /* Kfence pool is already pte-mapped for the early init case. */ 1122 if (kfence_early_init) 1123 return true; 1124 1125 mutex_lock(&pgtable_split_lock); 1126 ret = range_split_to_ptes(start, end, GFP_PGTABLE_KERNEL); 1127 mutex_unlock(&pgtable_split_lock); 1128 1129 /* 1130 * Since the system supports bbml2_noabort, tlb invalidation is not 1131 * required here; the pgtable mappings have been split to pte but larger 1132 * entries may safely linger in the TLB. 
*/

	/* range_split_to_ptes() returns 0 on success. */
	return !ret;
}
#else /* CONFIG_KFENCE */

/* Nothing to map when KFENCE is not built in. */
static inline void arm64_kfence_map_pool(void) { }

#endif /* CONFIG_KFENCE */

/*
 * Build the linear map: the KFENCE pool first, then the linear aliases of
 * the kernel image regions, then every memblock memory range, and finally
 * remap the alias of [__init_end, __bss_stop) read-only.
 */
static void __init map_mem(void)
{
	static const u64 direct_map_end = _PAGE_END(VA_BITS_MIN);
	phys_addr_t kernel_start = __pa_symbol(_text);
	phys_addr_t init_begin = __pa_symbol(__init_begin);
	phys_addr_t init_end = __pa_symbol(__init_end);
	phys_addr_t kernel_end = __pa_symbol(__bss_stop);
	phys_addr_t start, end;
	int flags = NO_EXEC_MAPPINGS;
	u64 i;

	/*
	 * Setting hierarchical PXNTable attributes on table entries covering
	 * the linear region is only possible if it is guaranteed that no table
	 * entries at any level are being shared between the linear region and
	 * the vmalloc region. Check whether this is true for the PGD level, in
	 * which case it is guaranteed to be true for all other levels as well.
	 * (Unless we are running with support for LPA2, in which case the
	 * entire reduced VA space is covered by a single pgd_t which will have
	 * been populated without the PXNTable attribute by the time we get here.)
	 */
	BUILD_BUG_ON(pgd_index(direct_map_end - 1) == pgd_index(direct_map_end) &&
		     pgd_index(_PAGE_OFFSET(VA_BITS_MIN)) != PTRS_PER_PGD - 1);

	arm64_kfence_map_pool();

	linear_map_requires_bbml2 = !force_pte_mapping() && can_set_direct_map();

	/* Restrict the linear map to page mappings when pte mapping is forced. */
	if (force_pte_mapping())
		flags |= NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS;

	/*
	 * Map the linear alias of the [_text, __init_begin) interval first
	 * so that its write permissions can be removed later without the need
	 * to split any block mappings created by the loop below.
	 *
	 * Write permissions are needed for alternatives patching, and will be
	 * removed later by mark_linear_text_alias_ro() above. This makes the
	 * contents of the region accessible to subsystems such as hibernate,
	 * but protects it from inadvertent modification or execution.
	 */
	__map_memblock(kernel_start, init_begin, pgprot_tagged(PAGE_KERNEL),
		       flags);

	/* Map the kernel data/bss so it can be remapped later */
	__map_memblock(init_end, kernel_end, pgprot_tagged(PAGE_KERNEL),
		       flags);

	/* map all the memory banks */
	for_each_mem_range(i, &start, &end) {
		/*
		 * The linear map must allow allocation tags reading/writing
		 * if MTE is present. Otherwise, it has the same attributes as
		 * PAGE_KERNEL.
		 */
		__map_memblock(start, end, pgprot_tagged(PAGE_KERNEL),
			       flags);
	}

	/* Map the kernel data/bss read-only in the linear map */
	__map_memblock(init_end, kernel_end, PAGE_KERNEL_RO, flags);
	/* The permission change above touched a live mapping, so flush it. */
	flush_tlb_kernel_range((unsigned long)lm_alias(__init_end),
			       (unsigned long)lm_alias(__bss_stop));
}

/*
 * Make [__start_rodata, __init_begin) and [_text, _stext) read-only in the
 * kernel image mapping, and record that rodata is no longer writable.
 */
void mark_rodata_ro(void)
{
	unsigned long section_size;

	/*
	 * mark .rodata as read only. Use __init_begin rather than __end_rodata
	 * to cover NOTES and EXCEPTION_TABLE.
	 */
	section_size = (unsigned long)__init_begin - (unsigned long)__start_rodata;
	/* set_swapper_pgd() goes through the fixmap once this flag is false. */
	WRITE_ONCE(rodata_is_rw, false);
	update_mapping_prot(__pa_symbol(__start_rodata), (unsigned long)__start_rodata,
			    section_size, PAGE_KERNEL_RO);
	/* mark the range between _text and _stext as read only.
*/
	update_mapping_prot(__pa_symbol(_text), (unsigned long)_text,
			    (unsigned long)_stext - (unsigned long)_text,
			    PAGE_KERNEL_RO);
}

/*
 * Fill in @vma to describe the kernel image range [va_start, va_end) and
 * register it with vm_area_add_early(). Both the physical start address and
 * the size must be page aligned. One extra page is added to the recorded
 * size unless VM_NO_GUARD is set in @vm_flags.
 */
static void __init declare_vma(struct vm_struct *vma,
			       void *va_start, void *va_end,
			       unsigned long vm_flags)
{
	phys_addr_t pa_start = __pa_symbol(va_start);
	unsigned long size = va_end - va_start;

	BUG_ON(!PAGE_ALIGNED(pa_start));
	BUG_ON(!PAGE_ALIGNED(size));

	if (!(vm_flags & VM_NO_GUARD))
		size += PAGE_SIZE;

	vma->addr	= va_start;
	vma->phys_addr	= pa_start;
	vma->size	= size;
	vma->flags	= VM_MAP | vm_flags;
	vma->caller	= __builtin_return_address(0);

	vm_area_add_early(vma);
}

#ifdef CONFIG_UNMAP_KERNEL_AT_EL0
/* Temporary virtual address used below: the topmost PMD_SIZE of the VA space. */
#define KPTI_NG_TEMP_VA		(-(1UL << PMD_SHIFT))

/* Allocation cursor for kpti_ng_pgd_alloc(); moves downwards one page at a time. */
static phys_addr_t kpti_ng_temp_alloc __initdata;

/*
 * Page table allocator for the temporary KPTI hierarchy: hands out the page
 * immediately below the previously returned one. @pgtable_level is unused.
 */
static phys_addr_t __init kpti_ng_pgd_alloc(enum pgtable_level pgtable_level)
{
	kpti_ng_temp_alloc -= PAGE_SIZE;
	return kpti_ng_temp_alloc;
}

/*
 * stop_machine() callback: every CPU calls idmap_kpti_install_ng_mappings()
 * through its physical address while the ID map is installed. CPU 0
 * additionally builds the temporary page tables beforehand and releases
 * them afterwards.
 */
static int __init __kpti_install_ng_mappings(void *__unused)
{
	typedef void (kpti_remap_fn)(int, int, phys_addr_t, unsigned long);
	extern kpti_remap_fn idmap_kpti_install_ng_mappings;
	kpti_remap_fn *remap_fn;

	int cpu = smp_processor_id();
	int levels = CONFIG_PGTABLE_LEVELS;
	int order = order_base_2(levels);
	u64 kpti_ng_temp_pgd_pa = 0;
	pgd_t *kpti_ng_temp_pgd;
	u64 alloc = 0;

	/* Drop one level when the configured top level is not enabled at runtime. */
	if (levels == 5 && !pgtable_l5_enabled())
		levels = 4;
	else if (levels == 4 && !pgtable_l4_enabled())
		levels = 3;

	remap_fn = (void *)__pa_symbol(idmap_kpti_install_ng_mappings);

	if (!cpu) {
		int ret;

		alloc = __get_free_pages(GFP_ATOMIC | __GFP_ZERO, order);
		/* The PGD is the last of the 'levels' pages; the allocator walks down from it. */
		kpti_ng_temp_pgd = (pgd_t *)(alloc + (levels - 1) * PAGE_SIZE);
		kpti_ng_temp_alloc = kpti_ng_temp_pgd_pa = __pa(kpti_ng_temp_pgd);

		//
		// Create a minimal page table hierarchy that permits us to map
		// the swapper page tables temporarily as we traverse them.
		//
		// The physical pages are laid out as follows:
		//
		// +--------+-/-------+-/------ +-/------ +-\\\--------+
		// :  PTE[] : | PMD[] : | PUD[] : | P4D[] : ||| PGD[]  :
		// +--------+-\-------+-\------ +-\------ +-///--------+
		//      ^
		// The first page is mapped into this hierarchy at a PMD_SHIFT
		// aligned virtual address, so that we can manipulate the PTE
		// level entries while the mapping is active. The first entry
		// covers the PTE[] page itself, the remaining entries are free
		// to be used as an ad-hoc fixmap.
		//
		ret = __create_pgd_mapping_locked(kpti_ng_temp_pgd, __pa(alloc),
						  KPTI_NG_TEMP_VA, PAGE_SIZE, PAGE_KERNEL,
						  kpti_ng_pgd_alloc, 0);
		if (ret)
			panic("Failed to create page tables\n");
	}

	cpu_install_idmap();
	remap_fn(cpu, num_online_cpus(), kpti_ng_temp_pgd_pa, KPTI_NG_TEMP_VA);
	cpu_uninstall_idmap();

	if (!cpu) {
		free_pages(alloc, order);
		arm64_use_ng_mappings = true;
	}

	return 0;
}

/*
 * Convert the kernel page tables to non-global mappings when KPTI is in use
 * and this has not been done already, by running the helper above on all
 * online CPUs under stop_machine().
 */
void __init kpti_install_ng_mappings(void)
{
	/* Check whether KPTI is going to be used */
	if (!arm64_kernel_unmapped_at_el0())
		return;

	/*
	 * We don't need to rewrite the page-tables if either we've done
	 * it already or we have KASLR enabled and therefore have not
	 * created any global mappings at all.
	 */
	if (arm64_use_ng_mappings)
		return;

	init_idmap_kpti_bbml2_flag();
	stop_machine(__kpti_install_ng_mappings, NULL, cpu_online_mask);
}

/* Protection for kernel text: read-only+exec when rodata_enabled, otherwise writable exec. */
static pgprot_t __init kernel_exec_prot(void)
{
	return rodata_enabled ?
PAGE_KERNEL_ROX : PAGE_KERNEL_EXEC;
}

/*
 * core_initcall: when the kernel is unmapped at EL0, map the entry
 * trampoline text into tramp_pg_dir at TRAMP_VALIAS and into the kernel
 * page tables through the FIX_ENTRY_TRAMP_TEXT* fixmap slots. Always
 * returns 0.
 */
static int __init map_entry_trampoline(void)
{
	int i;

	if (!arm64_kernel_unmapped_at_el0())
		return 0;

	pgprot_t prot = kernel_exec_prot();
	phys_addr_t pa_start = __pa_symbol(__entry_tramp_text_start);

	/* The trampoline is always mapped and can therefore be global */
	pgprot_val(prot) &= ~PTE_NG;

	/* Map only the text into the trampoline page table */
	memset(tramp_pg_dir, 0, PGD_SIZE);
	early_create_pgd_mapping(tramp_pg_dir, pa_start, TRAMP_VALIAS,
				 entry_tramp_text_size(), prot,
				 pgd_pgtable_alloc_init_mm, NO_BLOCK_MAPPINGS);

	/* Map both the text and data into the kernel page table */
	for (i = 0; i < DIV_ROUND_UP(entry_tramp_text_size(), PAGE_SIZE); i++)
		__set_fixmap(FIX_ENTRY_TRAMP_TEXT1 - i,
			     pa_start + i * PAGE_SIZE, prot);

	/*
	 * 'i' now indexes the page following the trampoline text; it is
	 * mapped read-only into the next fixmap slot on relocatable kernels.
	 */
	if (IS_ENABLED(CONFIG_RELOCATABLE))
		__set_fixmap(FIX_ENTRY_TRAMP_TEXT1 - i,
			     pa_start + i * PAGE_SIZE, PAGE_KERNEL_RO);

	return 0;
}
core_initcall(map_entry_trampoline);
#endif

/*
 * Declare the VMA areas for the kernel
 */
static void __init declare_kernel_vmas(void)
{
	static struct vm_struct vmlinux_seg[KERNEL_SEGMENT_COUNT];

	/* Only the final segment (_data.._end) is declared without VM_NO_GUARD. */
	declare_vma(&vmlinux_seg[0], _text, _etext, VM_NO_GUARD);
	declare_vma(&vmlinux_seg[1], __start_rodata, __inittext_begin, VM_NO_GUARD);
	declare_vma(&vmlinux_seg[2], __inittext_begin, __inittext_end, VM_NO_GUARD);
	declare_vma(&vmlinux_seg[3], __initdata_begin, __initdata_end, VM_NO_GUARD);
	declare_vma(&vmlinux_seg[4], _data, _end, 0);
}

/* Defined outside this file; used by create_idmap() to populate the ID map. */
void __pi_map_range(phys_addr_t *pte, u64 start, u64 end, phys_addr_t pa,
		    pgprot_t prot, int level, pte_t *tbl, bool may_use_cont,
		    u64 va_offset);

/* Statically reserved, page aligned table pages for the two ID map regions. */
static u8 idmap_ptes[IDMAP_LEVELS - 1][PAGE_SIZE] __aligned(PAGE_SIZE) __ro_after_init,
	  kpti_bbml2_ptes[IDMAP_LEVELS - 1][PAGE_SIZE]
__aligned(PAGE_SIZE) __ro_after_init;

/*
 * Populate idmap_pg_dir with an identity (VA == PA) read-only executable
 * mapping of the ID map text and, when needed, a read-write mapping of
 * idmap_kpti_bbml2_flag.
 */
static void __init create_idmap(void)
{
	phys_addr_t start = __pa_symbol(__idmap_text_start);
	phys_addr_t end   = __pa_symbol(__idmap_text_end);
	phys_addr_t ptep  = __pa_symbol(idmap_ptes);

	__pi_map_range(&ptep, start, end, start, PAGE_KERNEL_ROX,
		       IDMAP_ROOT_LEVEL, (pte_t *)idmap_pg_dir, false,
		       __phys_to_virt(ptep) - ptep);

	if (linear_map_requires_bbml2 ||
	    (IS_ENABLED(CONFIG_UNMAP_KERNEL_AT_EL0) && !arm64_use_ng_mappings)) {
		phys_addr_t pa = __pa_symbol(&idmap_kpti_bbml2_flag);

		/*
		 * The KPTI G-to-nG conversion code needs a read-write mapping
		 * of its synchronization flag in the ID map. This is also used
		 * when splitting the linear map to ptes if a secondary CPU
		 * doesn't support bbml2.
		 */
		ptep = __pa_symbol(kpti_bbml2_ptes);
		__pi_map_range(&ptep, pa, pa + sizeof(u32), pa, PAGE_KERNEL,
			       IDMAP_ROOT_LEVEL, (pte_t *)idmap_pg_dir, false,
			       __phys_to_virt(ptep) - ptep);
	}
}

/*
 * Set up the linear map, allow memblock to resize its arrays, create the ID
 * map and declare the kernel image VMAs, in that order.
 */
void __init paging_init(void)
{
	map_mem();

	memblock_allow_resize();

	create_idmap();
	declare_kernel_vmas();
}

#ifdef CONFIG_MEMORY_HOTPLUG
/*
 * Release @size bytes starting at @page: back to @altmap when one is given,
 * otherwise to the page allocator (warning if the page is marked reserved).
 */
static void free_hotplug_page_range(struct page *page, size_t size,
				    struct vmem_altmap *altmap)
{
	if (altmap) {
		vmem_altmap_free(altmap, size >> PAGE_SHIFT);
	} else {
		WARN_ON(PageReserved(page));
		__free_pages(page, get_order(size));
	}
}

/* Run the page table destructor on @page and free it (never via an altmap). */
static void free_hotplug_pgtable_page(struct page *page)
{
	pagetable_dtor(page_ptdesc(page));
	free_hotplug_page_range(page, PAGE_SIZE, NULL);
}

/*
 * Return true when [start, end), with @start rounded down by @mask, lies
 * within [floor, ceiling). A @ceiling of 0 is not masked, so the final
 * comparison against 'ceiling - 1' then always passes.
 */
static bool pgtable_range_aligned(unsigned long start, unsigned long end,
				  unsigned long floor, unsigned long ceiling,
				  unsigned long mask)
{
	start &= mask;
	if (start < floor)
		return false;

	if (ceiling) {
		ceiling &= mask;
		if (!ceiling)
return false;
	}

	/* Subtracting 1 from both sides makes a zero ceiling compare as the maximum. */
	if (end - 1 > ceiling - 1)
		return false;
	return true;
}

/*
 * Clear every present pte in [addr, end) under @pmdp. With @free_mapped set,
 * each page is flushed from the TLB and then freed (to @altmap if given).
 */
static void unmap_hotplug_pte_range(pmd_t *pmdp, unsigned long addr,
				    unsigned long end, bool free_mapped,
				    struct vmem_altmap *altmap)
{
	pte_t *ptep, pte;

	do {
		ptep = pte_offset_kernel(pmdp, addr);
		pte = __ptep_get(ptep);
		if (pte_none(pte))
			continue;

		WARN_ON(!pte_present(pte));
		__pte_clear(&init_mm, addr, ptep);
		if (free_mapped) {
			/* CONT blocks are not supported in the vmemmap */
			WARN_ON(pte_cont(pte));
			flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
			free_hotplug_page_range(pte_page(pte),
						PAGE_SIZE, altmap);
		}
		/* unmap_hotplug_range() flushes TLB for !free_mapped */
	} while (addr += PAGE_SIZE, addr < end);
}

/*
 * PMD level of the hotplug unmap walk: leaf entries are cleared (and their
 * memory freed when @free_mapped is set), table entries are descended into.
 */
static void unmap_hotplug_pmd_range(pud_t *pudp, unsigned long addr,
				    unsigned long end, bool free_mapped,
				    struct vmem_altmap *altmap)
{
	unsigned long next;
	pmd_t *pmdp, pmd;

	do {
		next = pmd_addr_end(addr, end);
		pmdp = pmd_offset(pudp, addr);
		pmd = READ_ONCE(*pmdp);
		if (pmd_none(pmd))
			continue;

		WARN_ON(!pmd_present(pmd));
		if (pmd_leaf(pmd)) {
			pmd_clear(pmdp);
			if (free_mapped) {
				/* CONT blocks are not supported in the vmemmap */
				WARN_ON(pmd_cont(pmd));
				flush_tlb_kernel_range(addr, addr + PMD_SIZE);
				free_hotplug_page_range(pmd_page(pmd),
							PMD_SIZE, altmap);
			}
			/* unmap_hotplug_range() flushes TLB for !free_mapped */
			continue;
		}
		WARN_ON(!pmd_table(pmd));
		unmap_hotplug_pte_range(pmdp, addr, next, free_mapped, altmap);
	} while (addr = next, addr < end);
}

/*
 * PUD level of the hotplug unmap walk: leaf entries are cleared (and their
 * memory freed when @free_mapped is set), table entries are descended into.
 */
static void unmap_hotplug_pud_range(p4d_t *p4dp, unsigned long addr,
				    unsigned long end, bool free_mapped,
				    struct vmem_altmap *altmap)
{
	unsigned long next;
	pud_t *pudp, pud;

	do {
		next = pud_addr_end(addr, end);
		pudp = pud_offset(p4dp, addr);
		pud = READ_ONCE(*pudp);
		if (pud_none(pud))
			continue;

		WARN_ON(!pud_present(pud));
		if (pud_leaf(pud)) {
			pud_clear(pudp);
			if (free_mapped) {
				flush_tlb_kernel_range(addr, addr + PUD_SIZE);
				free_hotplug_page_range(pud_page(pud),
							PUD_SIZE, altmap);
			}
			/* unmap_hotplug_range() flushes TLB for !free_mapped */
			continue;
		}
		WARN_ON(!pud_table(pud));
		unmap_hotplug_pmd_range(pudp, addr, next, free_mapped, altmap);
	} while (addr = next, addr < end);
}

/* P4D level of the hotplug unmap walk; only descends into populated entries. */
static void unmap_hotplug_p4d_range(pgd_t *pgdp, unsigned long addr,
				    unsigned long end, bool free_mapped,
				    struct vmem_altmap *altmap)
{
	unsigned long next;
	p4d_t *p4dp, p4d;

	do {
		next = p4d_addr_end(addr, end);
		p4dp = p4d_offset(pgdp, addr);
		p4d = READ_ONCE(*p4dp);
		if (p4d_none(p4d))
			continue;

		WARN_ON(!p4d_present(p4d));
		unmap_hotplug_pud_range(p4dp, addr, next, free_mapped, altmap);
	} while (addr = next, addr < end);
}

/*
 * Remove all leaf mappings of the kernel range [addr, end). With
 * @free_mapped the mapped memory itself is freed (optionally to @altmap)
 * and the TLB is flushed per entry; without it a single flush of the whole
 * range is issued once the walk has finished.
 */
static void unmap_hotplug_range(unsigned long addr, unsigned long end,
				bool free_mapped, struct vmem_altmap *altmap)
{
	unsigned long start = addr;
	unsigned long next;
	pgd_t *pgdp, pgd;

	/*
	 * altmap can only be used as vmemmap mapping backing memory.
	 * In case the backing memory itself is not being freed, then
	 * altmap is irrelevant. Warn about this inconsistency when
	 * encountered.
*/
	WARN_ON(!free_mapped && altmap);

	do {
		next = pgd_addr_end(addr, end);
		pgdp = pgd_offset_k(addr);
		pgd = READ_ONCE(*pgdp);
		if (pgd_none(pgd))
			continue;

		WARN_ON(!pgd_present(pgd));
		unmap_hotplug_p4d_range(pgdp, addr, next, free_mapped, altmap);
	} while (addr = next, addr < end);

	/* Per-entry flushes were skipped in this mode; flush the whole range once. */
	if (!free_mapped)
		flush_tlb_kernel_range(start, end);
}

/*
 * Verify that all ptes in [addr, end) have already been cleared and, if the
 * range passes the floor/ceiling alignment check and every entry of the pte
 * table is empty, clear the pmd entry and free the table page.
 */
static void free_empty_pte_table(pmd_t *pmdp, unsigned long addr,
				 unsigned long end, unsigned long floor,
				 unsigned long ceiling)
{
	pte_t *ptep, pte;
	unsigned long i, start = addr;

	do {
		ptep = pte_offset_kernel(pmdp, addr);
		pte = __ptep_get(ptep);

		/*
		 * This is just a sanity check here which verifies that
		 * pte clearing has been done by earlier unmap loops.
		 */
		WARN_ON(!pte_none(pte));
	} while (addr += PAGE_SIZE, addr < end);

	if (!pgtable_range_aligned(start, end, floor, ceiling, PMD_MASK))
		return;

	/*
	 * Check whether we can free the pte page if the rest of the
	 * entries are empty. Overlaps with other regions have been
	 * handled by the floor/ceiling check.
*/
	/* Offset 0 yields the first entry of the pte table for a full scan. */
	ptep = pte_offset_kernel(pmdp, 0UL);
	for (i = 0; i < PTRS_PER_PTE; i++) {
		if (!pte_none(__ptep_get(&ptep[i])))
			return;
	}

	/* Unhook the table and flush before the page holding it is freed. */
	pmd_clear(pmdp);
	__flush_tlb_kernel_pgtable(start);
	free_hotplug_pgtable_page(virt_to_page(ptep));
}

/*
 * Try to free the pte tables below every populated pmd in [addr, end), then
 * free the pmd table itself if the range passes the floor/ceiling alignment
 * check and all of its entries are empty. The pmd table is never freed when
 * CONFIG_PGTABLE_LEVELS <= 2.
 */
static void free_empty_pmd_table(pud_t *pudp, unsigned long addr,
				 unsigned long end, unsigned long floor,
				 unsigned long ceiling)
{
	pmd_t *pmdp, pmd;
	unsigned long i, next, start = addr;

	do {
		next = pmd_addr_end(addr, end);
		pmdp = pmd_offset(pudp, addr);
		pmd = READ_ONCE(*pmdp);
		if (pmd_none(pmd))
			continue;

		/* Leaf entries should have been removed by the unmap pass. */
		WARN_ON(!pmd_present(pmd) || !pmd_table(pmd));
		free_empty_pte_table(pmdp, addr, next, floor, ceiling);
	} while (addr = next, addr < end);

	if (CONFIG_PGTABLE_LEVELS <= 2)
		return;

	if (!pgtable_range_aligned(start, end, floor, ceiling, PUD_MASK))
		return;

	/*
	 * Check whether we can free the pmd page if the rest of the
	 * entries are empty. Overlaps with other regions have been
	 * handled by the floor/ceiling check.
*/
	/* Offset 0 yields the first entry of the pmd table for a full scan. */
	pmdp = pmd_offset(pudp, 0UL);
	for (i = 0; i < PTRS_PER_PMD; i++) {
		if (!pmd_none(READ_ONCE(pmdp[i])))
			return;
	}

	/* Unhook the table and flush before the page holding it is freed. */
	pud_clear(pudp);
	__flush_tlb_kernel_pgtable(start);
	free_hotplug_pgtable_page(virt_to_page(pmdp));
}

/*
 * Try to free the pmd tables below every populated pud in [addr, end), then
 * free the pud table itself if the range passes the floor/ceiling alignment
 * check and all of its entries are empty. The pud table is never freed when
 * pgtable_l4_enabled() is false.
 */
static void free_empty_pud_table(p4d_t *p4dp, unsigned long addr,
				 unsigned long end, unsigned long floor,
				 unsigned long ceiling)
{
	pud_t *pudp, pud;
	unsigned long i, next, start = addr;

	do {
		next = pud_addr_end(addr, end);
		pudp = pud_offset(p4dp, addr);
		pud = READ_ONCE(*pudp);
		if (pud_none(pud))
			continue;

		/* Leaf entries should have been removed by the unmap pass. */
		WARN_ON(!pud_present(pud) || !pud_table(pud));
		free_empty_pmd_table(pudp, addr, next, floor, ceiling);
	} while (addr = next, addr < end);

	if (!pgtable_l4_enabled())
		return;

	if (!pgtable_range_aligned(start, end, floor, ceiling, P4D_MASK))
		return;

	/*
	 * Check whether we can free the pud page if the rest of the
	 * entries are empty. Overlaps with other regions have been
	 * handled by the floor/ceiling check.
*/
	/* Offset 0 yields the first entry of the pud table for a full scan. */
	pudp = pud_offset(p4dp, 0UL);
	for (i = 0; i < PTRS_PER_PUD; i++) {
		if (!pud_none(READ_ONCE(pudp[i])))
			return;
	}

	/* Unhook the table and flush before the page holding it is freed. */
	p4d_clear(p4dp);
	__flush_tlb_kernel_pgtable(start);
	free_hotplug_pgtable_page(virt_to_page(pudp));
}

/*
 * Try to free the pud tables below every populated p4d in [addr, end), then
 * free the p4d table itself if the range passes the floor/ceiling alignment
 * check and all of its entries are empty. The p4d table is never freed when
 * pgtable_l5_enabled() is false.
 */
static void free_empty_p4d_table(pgd_t *pgdp, unsigned long addr,
				 unsigned long end, unsigned long floor,
				 unsigned long ceiling)
{
	p4d_t *p4dp, p4d;
	unsigned long i, next, start = addr;

	do {
		next = p4d_addr_end(addr, end);
		p4dp = p4d_offset(pgdp, addr);
		p4d = READ_ONCE(*p4dp);
		if (p4d_none(p4d))
			continue;

		WARN_ON(!p4d_present(p4d));
		free_empty_pud_table(p4dp, addr, next, floor, ceiling);
	} while (addr = next, addr < end);

	if (!pgtable_l5_enabled())
		return;

	if (!pgtable_range_aligned(start, end, floor, ceiling, PGDIR_MASK))
		return;

	/*
	 * Check whether we can free the p4d page if the rest of the
	 * entries are empty. Overlaps with other regions have been
	 * handled by the floor/ceiling check.
*/
	/* Offset 0 yields the first entry of the p4d table for a full scan. */
	p4dp = p4d_offset(pgdp, 0UL);
	for (i = 0; i < PTRS_PER_P4D; i++) {
		if (!p4d_none(READ_ONCE(p4dp[i])))
			return;
	}

	/* Unhook the table and flush before the page holding it is freed. */
	pgd_clear(pgdp);
	__flush_tlb_kernel_pgtable(start);
	free_hotplug_pgtable_page(virt_to_page(p4dp));
}

/*
 * Walk the kernel page tables over [addr, end) and free every table page
 * that has become empty, without touching tables that extend beyond
 * [floor, ceiling).
 */
static void free_empty_tables(unsigned long addr, unsigned long end,
			      unsigned long floor, unsigned long ceiling)
{
	unsigned long next;
	pgd_t *pgdp, pgd;

	do {
		next = pgd_addr_end(addr, end);
		pgdp = pgd_offset_k(addr);
		pgd = READ_ONCE(*pgdp);
		if (pgd_none(pgd))
			continue;

		WARN_ON(!pgd_present(pgd));
		free_empty_p4d_table(pgdp, addr, next, floor, ceiling);
	} while (addr = next, addr < end);
}
#endif

/*
 * Install a PROT_SECT_NORMAL huge mapping of the memory at @p into @pmdp.
 * @node, @addr and @next are unused here.
 */
void __meminit vmemmap_set_pmd(pmd_t *pmdp, void *p, int node,
			       unsigned long addr, unsigned long next)
{
	pmd_set_huge(pmdp, __pa(p), __pgprot(PROT_SECT_NORMAL));
}

/* Run vmemmap_verify() on @pmdp and return non-zero if it is a leaf entry. */
int __meminit vmemmap_check_pmd(pmd_t *pmdp, int node,
				unsigned long addr, unsigned long next)
{
	vmemmap_verify((pte_t *)pmdp, node, addr, next);

	return pmd_leaf(READ_ONCE(*pmdp));
}

/*
 * Populate the vmemmap for [start, end). Huge pages are used only with 4K
 * base pages and when the range covers a full section's worth of struct
 * pages; otherwise base pages are used.
 */
int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node,
		struct vmem_altmap *altmap)
{
	WARN_ON((start < VMEMMAP_START) || (end > VMEMMAP_END));
	/* [start, end] should be within one section */
	WARN_ON_ONCE(end - start > PAGES_PER_SECTION * sizeof(struct page));

	if (!IS_ENABLED(CONFIG_ARM64_4K_PAGES) ||
	    (end - start < PAGES_PER_SECTION * sizeof(struct page)))
		return vmemmap_populate_basepages(start, end, node, altmap);
	else
		return vmemmap_populate_hugepages(start, end, node, altmap);
}

#ifdef CONFIG_MEMORY_HOTPLUG
/*
 * Tear down the vmemmap for [start, end): unmap it and free the backing
 * memory, then free any page table pages that became empty.
 */
void vmemmap_free(unsigned long start, unsigned long end,
		struct vmem_altmap *altmap)
{
	WARN_ON((start < VMEMMAP_START) || (end > VMEMMAP_END));

unmap_hotplug_range(start, end, true, altmap);
	free_empty_tables(start, end, VMEMMAP_START, VMEMMAP_END);
}
#endif /* CONFIG_MEMORY_HOTPLUG */

/*
 * Install a PUD level block mapping of @phys with @prot. Returns 1 on
 * success and 0 if pgattr_change_is_safe() rejects the transition from the
 * current entry.
 */
int pud_set_huge(pud_t *pudp, phys_addr_t phys, pgprot_t prot)
{
	pud_t new_pud = pfn_pud(__phys_to_pfn(phys), mk_pud_sect_prot(prot));

	/* Only allow permission changes for now */
	if (!pgattr_change_is_safe(READ_ONCE(pud_val(*pudp)),
				   pud_val(new_pud)))
		return 0;

	VM_BUG_ON(phys & ~PUD_MASK);
	set_pud(pudp, new_pud);
	return 1;
}

/*
 * Install a PMD level block mapping of @phys with @prot. Returns 1 on
 * success and 0 if pgattr_change_is_safe() rejects the transition from the
 * current entry.
 */
int pmd_set_huge(pmd_t *pmdp, phys_addr_t phys, pgprot_t prot)
{
	pmd_t new_pmd = pfn_pmd(__phys_to_pfn(phys), mk_pmd_sect_prot(prot));

	/* Only allow permission changes for now */
	if (!pgattr_change_is_safe(READ_ONCE(pmd_val(*pmdp)),
				   pmd_val(new_pmd)))
		return 0;

	VM_BUG_ON(phys & ~PMD_MASK);
	set_pmd(pmdp, new_pmd);
	return 1;
}

#ifndef __PAGETABLE_P4D_FOLDED
/* Intentionally empty: nothing is done for the P4D level. */
void p4d_clear_huge(p4d_t *p4dp)
{
}
#endif

/* Clear @pudp if it is a leaf entry. Returns 1 if it was cleared, else 0. */
int pud_clear_huge(pud_t *pudp)
{
	if (!pud_leaf(READ_ONCE(*pudp)))
		return 0;
	pud_clear(pudp);
	return 1;
}

/* Clear @pmdp if it is a leaf entry. Returns 1 if it was cleared, else 0. */
int pmd_clear_huge(pmd_t *pmdp)
{
	if (!pmd_leaf(READ_ONCE(*pmdp)))
		return 0;
	pmd_clear(pmdp);
	return 1;
}

/*
 * Detach and free the pte table referenced by @pmdp. Always returns 1; a
 * non-table entry only triggers VM_WARN_ON(). When @acquire_mmap_lock is
 * set and the ptdump static key is enabled, init_mm's mmap lock is taken
 * and released for reading before the table is freed.
 */
static int __pmd_free_pte_page(pmd_t *pmdp, unsigned long addr,
			       bool acquire_mmap_lock)
{
	pte_t *table;
	pmd_t pmd;

	pmd = READ_ONCE(*pmdp);

	if (!pmd_table(pmd)) {
		VM_WARN_ON(1);
		return 1;
	}

	/* See comment in pud_free_pmd_page for static key logic */
	table = pte_offset_kernel(pmdp, addr);
	pmd_clear(pmdp);
	__flush_tlb_kernel_pgtable(addr);
	if (static_branch_unlikely(&arm64_ptdump_lock_key) && acquire_mmap_lock) {
		mmap_read_lock(&init_mm);
		mmap_read_unlock(&init_mm);
	}

	pte_free_kernel(NULL, table);
	return 1;
}

/* Free the pte table under @pmdp, synchronising with ptdump if it is active. */
int pmd_free_pte_page(pmd_t *pmdp, unsigned long addr)
{
	/* If ptdump is walking the pagetables, acquire init_mm.mmap_lock */
	return __pmd_free_pte_page(pmdp, addr, /* acquire_mmap_lock = */ true);
}

/*
 * Detach the pmd table referenced by @pudp, free every pte table it still
 * points to and then free the pmd table itself. Always returns 1; a
 * non-table entry only triggers VM_WARN_ON().
 */
int pud_free_pmd_page(pud_t *pudp, unsigned long addr)
{
	pmd_t *table;
	pmd_t *pmdp;
	pud_t pud;
	unsigned long next, end;

	pud = READ_ONCE(*pudp);

	if (!pud_table(pud)) {
		VM_WARN_ON(1);
		return 1;
	}

	table = pmd_offset(pudp, addr);

	/*
	 * Our objective is to prevent ptdump from reading a PMD table which has
	 * been freed. In this race, if pud_free_pmd_page observes the key on
	 * (which got flipped by ptdump) then the mmap lock sequence here will,
	 * as a result of the mmap write lock/unlock sequence in ptdump, give
	 * us the correct synchronization. If not, this means that ptdump has
	 * not yet started walking the pagetables - the sequence of barriers
	 * issued by __flush_tlb_kernel_pgtable() guarantees that ptdump will
	 * observe an empty PUD.
	 */
	pud_clear(pudp);
	__flush_tlb_kernel_pgtable(addr);
	if (static_branch_unlikely(&arm64_ptdump_lock_key)) {
		mmap_read_lock(&init_mm);
		mmap_read_unlock(&init_mm);
	}

	/* Visit every pmd slot covering the PUD_SIZE region starting at addr. */
	pmdp = table;
	next = addr;
	end = addr + PUD_SIZE;
	do {
		if (pmd_present(pmdp_get(pmdp)))
			/*
			 * PMD has been isolated, so ptdump won't see it. No
			 * need to acquire init_mm.mmap_lock.
*/
			__pmd_free_pte_page(pmdp, next, /* acquire_mmap_lock = */ false);
	} while (pmdp++, next += PMD_SIZE, next != end);

	pmd_free(NULL, table);
	return 1;
}

#ifdef CONFIG_MEMORY_HOTPLUG
/*
 * Remove the linear map entries for [start, start + size) without freeing
 * the mapped memory, then free any page table pages that became empty.
 * Warns if @pgdir is not init_mm.pgd or the range leaves the linear region.
 */
static void __remove_pgd_mapping(pgd_t *pgdir, unsigned long start, u64 size)
{
	unsigned long end = start + size;

	WARN_ON(pgdir != init_mm.pgd);
	WARN_ON((start < PAGE_OFFSET) || (end > PAGE_END));

	unmap_hotplug_range(start, end, false, NULL);
	free_empty_tables(start, end, PAGE_OFFSET, PAGE_END);
}

/*
 * Return the inclusive physical address range that the linear map can
 * cover, derived from the first and last linear map virtual addresses.
 */
struct range arch_get_mappable_range(void)
{
	struct range mhp_range;
	phys_addr_t start_linear_pa = __pa(_PAGE_OFFSET(vabits_actual));
	phys_addr_t end_linear_pa = __pa(PAGE_END - 1);

	if (IS_ENABLED(CONFIG_RANDOMIZE_BASE)) {
		/*
		 * Check for a wrap, it is possible because of randomized linear
		 * mapping the start physical address is actually bigger than
		 * the end physical address. In this case set start to zero
		 * because [0, end_linear_pa] range must still be able to cover
		 * all addressable physical addresses.
		 */
		if (start_linear_pa > end_linear_pa)
			start_linear_pa = 0;
	}

	WARN_ON(start_linear_pa > end_linear_pa);

	/*
	 * Linear mapping region is the range [PAGE_OFFSET..(PAGE_END - 1)]
	 * accommodating both its ends but excluding PAGE_END. Max physical
	 * range which can be mapped inside this linear mapping range, must
	 * also be derived from its end points.
*/
	mhp_range.start = start_linear_pa;
	mhp_range.end =  end_linear_pa;

	return mhp_range;
}

/*
 * Hot-add the physical range [start, start + size): map it in the linear
 * map, clear its memblock NOMAP marking, add its pages and raise max_pfn if
 * needed. Returns 0 on success; on failure the linear mapping of the range
 * is removed again and the error code is returned.
 */
int arch_add_memory(int nid, u64 start, u64 size,
		    struct mhp_params *params)
{
	int ret, flags = NO_EXEC_MAPPINGS;

	VM_BUG_ON(!mhp_range_allowed(start, size, true));

	if (force_pte_mapping())
		flags |= NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS;

	ret = __create_pgd_mapping(swapper_pg_dir, start, __phys_to_virt(start),
				   size, params->pgprot, pgd_pgtable_alloc_init_mm,
				   flags);
	if (ret)
		goto err;

	memblock_clear_nomap(start, size);

	ret = __add_pages(nid, start >> PAGE_SHIFT, size >> PAGE_SHIFT,
			  params);
	if (ret)
		goto err;

	/* Address of hotplugged memory can be smaller */
	max_pfn = max(max_pfn, PFN_UP(start + size));
	max_low_pfn = max_pfn;

	return 0;

err:
	/* Undo whatever part of the linear mapping was created above. */
	__remove_pgd_mapping(swapper_pg_dir,
			     __phys_to_virt(start), size);
	return ret;
}

/*
 * Hot-remove the physical range [start, start + size): remove its pages
 * first and then tear down its linear mapping.
 */
void arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap)
{
	unsigned long start_pfn = start >> PAGE_SHIFT;
	unsigned long nr_pages = size >> PAGE_SHIFT;

	__remove_pages(start_pfn, nr_pages, altmap);
	__remove_pgd_mapping(swapper_pg_dir, __phys_to_virt(start), size);
}


/*
 * Return true if @addr falls strictly inside a present leaf (or contiguous)
 * entry of the kernel page tables, i.e. if using @addr as a range boundary
 * would require that entry to be split.
 */
static bool addr_splits_kernel_leaf(unsigned long addr)
{
	pgd_t *pgdp, pgd;
	p4d_t *p4dp, p4d;
	pud_t *pudp, pud;
	pmd_t *pmdp, pmd;
	pte_t *ptep, pte;

	/*
	 * If the given address points at the start address of
	 * a possible leaf, we certainly won't split. Otherwise,
	 * check if we would actually split a leaf by traversing
	 * the page tables further.
*/
	/*
	 * At each level below: an address aligned to that level's size cannot
	 * split anything at or below it, and a non-present entry means there
	 * is no mapping to split.
	 */
	if (IS_ALIGNED(addr, PGDIR_SIZE))
		return false;

	pgdp = pgd_offset_k(addr);
	pgd = pgdp_get(pgdp);
	if (!pgd_present(pgd))
		return false;

	if (IS_ALIGNED(addr, P4D_SIZE))
		return false;

	p4dp = p4d_offset(pgdp, addr);
	p4d = p4dp_get(p4dp);
	if (!p4d_present(p4d))
		return false;

	if (IS_ALIGNED(addr, PUD_SIZE))
		return false;

	pudp = pud_offset(p4dp, addr);
	pud = pudp_get(pudp);
	if (!pud_present(pud))
		return false;

	/* addr is not PUD aligned here, so a PUD leaf would be split. */
	if (pud_leaf(pud))
		return true;

	if (IS_ALIGNED(addr, CONT_PMD_SIZE))
		return false;

	pmdp = pmd_offset(pudp, addr);
	pmd = pmdp_get(pmdp);
	if (!pmd_present(pmd))
		return false;

	/* addr is not CONT_PMD aligned here, so a contiguous PMD run would be split. */
	if (pmd_cont(pmd))
		return true;

	if (IS_ALIGNED(addr, PMD_SIZE))
		return false;

	if (pmd_leaf(pmd))
		return true;

	if (IS_ALIGNED(addr, CONT_PTE_SIZE))
		return false;

	ptep = pte_offset_kernel(pmdp, addr);
	pte = __ptep_get(ptep);
	if (!pte_present(pte))
		return false;

	if (pte_cont(pte))
		return true;

	return !IS_ALIGNED(addr, PAGE_SIZE);
}

/*
 * Return true if neither edge of the pfn range [pfn, pfn + nr_pages) splits
 * a leaf entry, both in the linear map and in the vmemmap. A warning is
 * printed and false returned otherwise.
 */
static bool can_unmap_without_split(unsigned long pfn, unsigned long nr_pages)
{
	unsigned long phys_start, phys_end, start, end;

	phys_start = PFN_PHYS(pfn);
	phys_end = phys_start + nr_pages * PAGE_SIZE;

	/* PFN range's linear map edges are leaf entry aligned */
	start = __phys_to_virt(phys_start);
	end = __phys_to_virt(phys_end);
	if (addr_splits_kernel_leaf(start) || addr_splits_kernel_leaf(end)) {
		pr_warn("[%lx %lx] splits a leaf entry in linear map\n",
			phys_start, phys_end);
		return false;
	}

	/* PFN range's vmemmap edges are leaf entry aligned */
	BUILD_BUG_ON(!IS_ENABLED(CONFIG_SPARSEMEM_VMEMMAP));
	start = (unsigned long)pfn_to_page(pfn);
	end = (unsigned long)pfn_to_page(pfn +
nr_pages); 2135 if (addr_splits_kernel_leaf(start) || addr_splits_kernel_leaf(end)) { 2136 pr_warn("[%lx %lx] splits a leaf entry in vmemmap\n", 2137 phys_start, phys_end); 2138 return false; 2139 } 2140 return true; 2141 } 2142 2143 /* 2144 * This memory hotplug notifier helps prevent boot memory from being 2145 * inadvertently removed as it blocks pfn range offlining process in 2146 * __offline_pages(). Hence this prevents both offlining as well as 2147 * removal process for boot memory which is initially always online. 2148 * In future if and when boot memory could be removed, this notifier 2149 * should be dropped and free_hotplug_page_range() should handle any 2150 * reserved pages allocated during boot. 2151 * 2152 * This also blocks any memory remove that would have caused a split 2153 * in leaf entry in kernel linear or vmemmap mapping. 2154 */ 2155 static int prevent_memory_remove_notifier(struct notifier_block *nb, 2156 unsigned long action, void *data) 2157 { 2158 struct mem_section *ms; 2159 struct memory_notify *arg = data; 2160 unsigned long end_pfn = arg->start_pfn + arg->nr_pages; 2161 unsigned long pfn = arg->start_pfn; 2162 2163 if ((action != MEM_GOING_OFFLINE) && (action != MEM_OFFLINE)) 2164 return NOTIFY_OK; 2165 2166 for (; pfn < end_pfn; pfn += PAGES_PER_SECTION) { 2167 unsigned long start = PFN_PHYS(pfn); 2168 unsigned long end = start + (1UL << PA_SECTION_SHIFT); 2169 2170 ms = __pfn_to_section(pfn); 2171 if (!early_section(ms)) 2172 continue; 2173 2174 if (action == MEM_GOING_OFFLINE) { 2175 /* 2176 * Boot memory removal is not supported. Prevent 2177 * it via blocking any attempted offline request 2178 * for the boot memory and just report it. 2179 */ 2180 pr_warn("Boot memory [%lx %lx] offlining attempted\n", start, end); 2181 return NOTIFY_BAD; 2182 } else if (action == MEM_OFFLINE) { 2183 /* 2184 * This should have never happened. Boot memory 2185 * offlining should have been prevented by this 2186 * very notifier. 
Probably some memory removal 2187 * procedure might have changed which would then 2188 * require further debug. 2189 */ 2190 pr_err("Boot memory [%lx %lx] offlined\n", start, end); 2191 2192 /* 2193 * Core memory hotplug does not process a return 2194 * code from the notifier for MEM_OFFLINE events. 2195 * The error condition has been reported. Return 2196 * from here as if ignored. 2197 */ 2198 return NOTIFY_DONE; 2199 } 2200 } 2201 2202 if (!can_unmap_without_split(pfn, arg->nr_pages)) 2203 return NOTIFY_BAD; 2204 2205 return NOTIFY_OK; 2206 } 2207 2208 static struct notifier_block prevent_memory_remove_nb = { 2209 .notifier_call = prevent_memory_remove_notifier, 2210 }; 2211 2212 /* 2213 * This ensures that boot memory sections on the platform are online 2214 * from early boot. Memory sections could not be prevented from being 2215 * offlined, unless for some reason they are not online to begin with. 2216 * This helps validate the basic assumption on which the above memory 2217 * event notifier works to prevent boot memory section offlining and 2218 * its possible removal. 2219 */ 2220 static void validate_bootmem_online(void) 2221 { 2222 phys_addr_t start, end, addr; 2223 struct mem_section *ms; 2224 u64 i; 2225 2226 /* 2227 * Scanning across all memblock might be expensive 2228 * on some big memory systems. Hence enable this 2229 * validation only with DEBUG_VM. 2230 */ 2231 if (!IS_ENABLED(CONFIG_DEBUG_VM)) 2232 return; 2233 2234 for_each_mem_range(i, &start, &end) { 2235 for (addr = start; addr < end; addr += (1UL << PA_SECTION_SHIFT)) { 2236 ms = __pfn_to_section(PHYS_PFN(addr)); 2237 2238 /* 2239 * All memory ranges in the system at this point 2240 * should have been marked as early sections. 2241 */ 2242 WARN_ON(!early_section(ms)); 2243 2244 /* 2245 * Memory notifier mechanism here to prevent boot 2246 * memory offlining depends on the fact that each 2247 * early section memory on the system is initially 2248 * online. 
Otherwise a given memory section which 2249 * is already offline will be overlooked and can 2250 * be removed completely. Call out such sections. 2251 */ 2252 if (!online_section(ms)) 2253 pr_err("Boot memory [%llx %llx] is offline, can be removed\n", 2254 addr, addr + (1UL << PA_SECTION_SHIFT)); 2255 } 2256 } 2257 } 2258 2259 static int __init prevent_memory_remove_init(void) 2260 { 2261 int ret = 0; 2262 2263 if (!IS_ENABLED(CONFIG_MEMORY_HOTREMOVE)) 2264 return ret; 2265 2266 validate_bootmem_online(); 2267 ret = register_memory_notifier(&prevent_memory_remove_nb); 2268 if (ret) 2269 pr_err("%s: Notifier registration failed %d\n", __func__, ret); 2270 2271 return ret; 2272 } 2273 early_initcall(prevent_memory_remove_init); 2274 #endif 2275 2276 pte_t modify_prot_start_ptes(struct vm_area_struct *vma, unsigned long addr, 2277 pte_t *ptep, unsigned int nr) 2278 { 2279 pte_t pte = get_and_clear_ptes(vma->vm_mm, addr, ptep, nr); 2280 2281 if (alternative_has_cap_unlikely(ARM64_WORKAROUND_2645198)) { 2282 /* 2283 * Break-before-make (BBM) is required for all user space mappings 2284 * when the permission changes from executable to non-executable 2285 * in cases where cpu is affected with errata #2645198. 
2286 */ 2287 if (pte_accessible(vma->vm_mm, pte) && pte_user_exec(pte)) 2288 __flush_tlb_range(vma, addr, nr * PAGE_SIZE, 2289 PAGE_SIZE, 3, TLBF_NOWALKCACHE); 2290 } 2291 2292 return pte; 2293 } 2294 2295 pte_t ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) 2296 { 2297 return modify_prot_start_ptes(vma, addr, ptep, 1); 2298 } 2299 2300 void modify_prot_commit_ptes(struct vm_area_struct *vma, unsigned long addr, 2301 pte_t *ptep, pte_t old_pte, pte_t pte, 2302 unsigned int nr) 2303 { 2304 set_ptes(vma->vm_mm, addr, ptep, pte, nr); 2305 } 2306 2307 void ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, 2308 pte_t old_pte, pte_t pte) 2309 { 2310 modify_prot_commit_ptes(vma, addr, ptep, old_pte, pte, 1); 2311 } 2312 2313 /* 2314 * Atomically replaces the active TTBR1_EL1 PGD with a new VA-compatible PGD, 2315 * avoiding the possibility of conflicting TLB entries being allocated. 2316 */ 2317 void __cpu_replace_ttbr1(pgd_t *pgdp, bool cnp) 2318 { 2319 typedef void (ttbr_replace_func)(phys_addr_t); 2320 extern ttbr_replace_func idmap_cpu_replace_ttbr1; 2321 ttbr_replace_func *replace_phys; 2322 unsigned long daif; 2323 2324 /* phys_to_ttbr() zeros lower 2 bits of ttbr with 52-bit PA */ 2325 phys_addr_t ttbr1 = phys_to_ttbr(virt_to_phys(pgdp)); 2326 2327 if (cnp) 2328 ttbr1 |= TTBRx_EL1_CnP; 2329 2330 replace_phys = (void *)__pa_symbol(idmap_cpu_replace_ttbr1); 2331 2332 cpu_install_idmap(); 2333 2334 /* 2335 * We really don't want to take *any* exceptions while TTBR1 is 2336 * in the process of being replaced so mask everything. 
2337 */ 2338 daif = local_daif_save(); 2339 replace_phys(ttbr1); 2340 local_daif_restore(daif); 2341 2342 cpu_uninstall_idmap(); 2343 } 2344 2345 #ifdef CONFIG_ARCH_HAS_PKEYS 2346 int arch_set_user_pkey_access(int pkey, unsigned long init_val) 2347 { 2348 u64 new_por; 2349 u64 old_por; 2350 2351 if (!system_supports_poe()) 2352 return -ENOSPC; 2353 2354 /* 2355 * This code should only be called with valid 'pkey' 2356 * values originating from in-kernel users. Complain 2357 * if a bad value is observed. 2358 */ 2359 if (WARN_ON_ONCE(pkey >= arch_max_pkey())) 2360 return -EINVAL; 2361 2362 /* Set the bits we need in POR: */ 2363 new_por = POE_RWX; 2364 if (init_val & PKEY_DISABLE_WRITE) 2365 new_por &= ~POE_W; 2366 if (init_val & PKEY_DISABLE_ACCESS) 2367 new_por &= ~POE_RW; 2368 if (init_val & PKEY_DISABLE_READ) 2369 new_por &= ~POE_R; 2370 if (init_val & PKEY_DISABLE_EXECUTE) 2371 new_por &= ~POE_X; 2372 2373 /* Shift the bits in to the correct place in POR for pkey: */ 2374 new_por = POR_ELx_PERM_PREP(pkey, new_por); 2375 2376 /* Get old POR and mask off any old bits in place: */ 2377 old_por = read_sysreg_s(SYS_POR_EL0); 2378 old_por &= ~(POE_MASK << POR_ELx_PERM_SHIFT(pkey)); 2379 2380 /* Write old part along with new part: */ 2381 write_sysreg_s(old_por | new_por, SYS_POR_EL0); 2382 2383 return 0; 2384 } 2385 #endif 2386