1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * Based on arch/arm/mm/mmu.c 4 * 5 * Copyright (C) 1995-2005 Russell King 6 * Copyright (C) 2012 ARM Ltd. 7 */ 8 9 #include <linux/cache.h> 10 #include <linux/export.h> 11 #include <linux/kernel.h> 12 #include <linux/errno.h> 13 #include <linux/init.h> 14 #include <linux/ioport.h> 15 #include <linux/kexec.h> 16 #include <linux/libfdt.h> 17 #include <linux/mman.h> 18 #include <linux/nodemask.h> 19 #include <linux/memblock.h> 20 #include <linux/memremap.h> 21 #include <linux/memory.h> 22 #include <linux/fs.h> 23 #include <linux/io.h> 24 #include <linux/mm.h> 25 #include <linux/vmalloc.h> 26 #include <linux/set_memory.h> 27 #include <linux/kfence.h> 28 #include <linux/pkeys.h> 29 #include <linux/mm_inline.h> 30 #include <linux/pagewalk.h> 31 #include <linux/stop_machine.h> 32 33 #include <asm/barrier.h> 34 #include <asm/cputype.h> 35 #include <asm/fixmap.h> 36 #include <asm/kasan.h> 37 #include <asm/kernel-pgtable.h> 38 #include <asm/sections.h> 39 #include <asm/setup.h> 40 #include <linux/sizes.h> 41 #include <asm/tlb.h> 42 #include <asm/mmu_context.h> 43 #include <asm/ptdump.h> 44 #include <asm/tlbflush.h> 45 #include <asm/pgalloc.h> 46 #include <asm/kfence.h> 47 48 #define NO_BLOCK_MAPPINGS BIT(0) 49 #define NO_CONT_MAPPINGS BIT(1) 50 #define NO_EXEC_MAPPINGS BIT(2) /* assumes FEAT_HPDS is not used */ 51 52 #define INVALID_PHYS_ADDR (-1ULL) 53 54 DEFINE_STATIC_KEY_FALSE(arm64_ptdump_lock_key); 55 56 u64 kimage_voffset __ro_after_init; 57 EXPORT_SYMBOL(kimage_voffset); 58 59 u32 __boot_cpu_mode[] = { BOOT_CPU_MODE_EL2, BOOT_CPU_MODE_EL1 }; 60 61 static bool rodata_is_rw __ro_after_init = true; 62 63 /* 64 * The booting CPU updates the failed status @__early_cpu_boot_status, 65 * with MMU turned off. 66 */ 67 long __section(".mmuoff.data.write") __early_cpu_boot_status; 68 69 /* 70 * Empty_zero_page is a special page that is used for zero-initialized data 71 * and COW. 72 */ 73 unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)] __page_aligned_bss; 74 EXPORT_SYMBOL(empty_zero_page); 75 76 static DEFINE_SPINLOCK(swapper_pgdir_lock); 77 static DEFINE_MUTEX(fixmap_lock); 78 79 void noinstr set_swapper_pgd(pgd_t *pgdp, pgd_t pgd) 80 { 81 pgd_t *fixmap_pgdp; 82 83 /* 84 * Don't bother with the fixmap if swapper_pg_dir is still mapped 85 * writable in the kernel mapping. 86 */ 87 if (rodata_is_rw) { 88 WRITE_ONCE(*pgdp, pgd); 89 dsb(ishst); 90 isb(); 91 return; 92 } 93 94 spin_lock(&swapper_pgdir_lock); 95 fixmap_pgdp = pgd_set_fixmap(__pa_symbol(pgdp)); 96 WRITE_ONCE(*fixmap_pgdp, pgd); 97 /* 98 * We need dsb(ishst) here to ensure the page-table-walker sees 99 * our new entry before set_p?d() returns. The fixmap's 100 * flush_tlb_kernel_range() via clear_fixmap() does this for us. 101 */ 102 pgd_clear_fixmap(); 103 spin_unlock(&swapper_pgdir_lock); 104 } 105 106 pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, 107 unsigned long size, pgprot_t vma_prot) 108 { 109 if (!pfn_is_map_memory(pfn)) 110 return pgprot_noncached(vma_prot); 111 else if (file->f_flags & O_SYNC) 112 return pgprot_writecombine(vma_prot); 113 return vma_prot; 114 } 115 EXPORT_SYMBOL(phys_mem_access_prot); 116 117 static phys_addr_t __init early_pgtable_alloc(enum pgtable_type pgtable_type) 118 { 119 phys_addr_t phys; 120 121 phys = memblock_phys_alloc_range(PAGE_SIZE, PAGE_SIZE, 0, 122 MEMBLOCK_ALLOC_NOLEAKTRACE); 123 if (!phys) 124 panic("Failed to allocate page table page\n"); 125 126 return phys; 127 } 128 129 bool pgattr_change_is_safe(pteval_t old, pteval_t new) 130 { 131 /* 132 * The following mapping attributes may be updated in live 133 * kernel mappings without the need for break-before-make. 134 */ 135 pteval_t mask = PTE_PXN | PTE_RDONLY | PTE_WRITE | PTE_NG | 136 PTE_SWBITS_MASK; 137 138 /* creating or taking down mappings is always safe */ 139 if (!pte_valid(__pte(old)) || !pte_valid(__pte(new))) 140 return true; 141 142 /* A live entry's pfn should not change */ 143 if (pte_pfn(__pte(old)) != pte_pfn(__pte(new))) 144 return false; 145 146 /* live contiguous mappings may not be manipulated at all */ 147 if ((old | new) & PTE_CONT) 148 return false; 149 150 /* Transitioning from Non-Global to Global is unsafe */ 151 if (old & ~new & PTE_NG) 152 return false; 153 154 /* 155 * Changing the memory type between Normal and Normal-Tagged is safe 156 * since Tagged is considered a permission attribute from the 157 * mismatched attribute aliases perspective. 158 */ 159 if (((old & PTE_ATTRINDX_MASK) == PTE_ATTRINDX(MT_NORMAL) || 160 (old & PTE_ATTRINDX_MASK) == PTE_ATTRINDX(MT_NORMAL_TAGGED)) && 161 ((new & PTE_ATTRINDX_MASK) == PTE_ATTRINDX(MT_NORMAL) || 162 (new & PTE_ATTRINDX_MASK) == PTE_ATTRINDX(MT_NORMAL_TAGGED))) 163 mask |= PTE_ATTRINDX_MASK; 164 165 return ((old ^ new) & ~mask) == 0; 166 } 167 168 static void init_clear_pgtable(void *table) 169 { 170 clear_page(table); 171 172 /* Ensure the zeroing is observed by page table walks. */ 173 dsb(ishst); 174 } 175 176 static void init_pte(pte_t *ptep, unsigned long addr, unsigned long end, 177 phys_addr_t phys, pgprot_t prot) 178 { 179 do { 180 pte_t old_pte = __ptep_get(ptep); 181 182 /* 183 * Required barriers to make this visible to the table walker 184 * are deferred to the end of alloc_init_cont_pte(). 185 */ 186 __set_pte_nosync(ptep, pfn_pte(__phys_to_pfn(phys), prot)); 187 188 /* 189 * After the PTE entry has been populated once, we 190 * only allow updates to the permission attributes. 191 */ 192 BUG_ON(!pgattr_change_is_safe(pte_val(old_pte), 193 pte_val(__ptep_get(ptep)))); 194 195 phys += PAGE_SIZE; 196 } while (ptep++, addr += PAGE_SIZE, addr != end); 197 } 198 199 static int alloc_init_cont_pte(pmd_t *pmdp, unsigned long addr, 200 unsigned long end, phys_addr_t phys, 201 pgprot_t prot, 202 phys_addr_t (*pgtable_alloc)(enum pgtable_type), 203 int flags) 204 { 205 unsigned long next; 206 pmd_t pmd = READ_ONCE(*pmdp); 207 pte_t *ptep; 208 209 BUG_ON(pmd_sect(pmd)); 210 if (pmd_none(pmd)) { 211 pmdval_t pmdval = PMD_TYPE_TABLE | PMD_TABLE_UXN | PMD_TABLE_AF; 212 phys_addr_t pte_phys; 213 214 if (flags & NO_EXEC_MAPPINGS) 215 pmdval |= PMD_TABLE_PXN; 216 BUG_ON(!pgtable_alloc); 217 pte_phys = pgtable_alloc(TABLE_PTE); 218 if (pte_phys == INVALID_PHYS_ADDR) 219 return -ENOMEM; 220 ptep = pte_set_fixmap(pte_phys); 221 init_clear_pgtable(ptep); 222 ptep += pte_index(addr); 223 __pmd_populate(pmdp, pte_phys, pmdval); 224 } else { 225 BUG_ON(pmd_bad(pmd)); 226 ptep = pte_set_fixmap_offset(pmdp, addr); 227 } 228 229 do { 230 pgprot_t __prot = prot; 231 232 next = pte_cont_addr_end(addr, end); 233 234 /* use a contiguous mapping if the range is suitably aligned */ 235 if ((((addr | next | phys) & ~CONT_PTE_MASK) == 0) && 236 (flags & NO_CONT_MAPPINGS) == 0) 237 __prot = __pgprot(pgprot_val(prot) | PTE_CONT); 238 239 init_pte(ptep, addr, next, phys, __prot); 240 241 ptep += pte_index(next) - pte_index(addr); 242 phys += next - addr; 243 } while (addr = next, addr != end); 244 245 /* 246 * Note: barriers and maintenance necessary to clear the fixmap slot 247 * ensure that all previous pgtable writes are visible to the table 248 * walker. 249 */ 250 pte_clear_fixmap(); 251 252 return 0; 253 } 254 255 static int init_pmd(pmd_t *pmdp, unsigned long addr, unsigned long end, 256 phys_addr_t phys, pgprot_t prot, 257 phys_addr_t (*pgtable_alloc)(enum pgtable_type), int flags) 258 { 259 unsigned long next; 260 261 do { 262 pmd_t old_pmd = READ_ONCE(*pmdp); 263 264 next = pmd_addr_end(addr, end); 265 266 /* try section mapping first */ 267 if (((addr | next | phys) & ~PMD_MASK) == 0 && 268 (flags & NO_BLOCK_MAPPINGS) == 0) { 269 pmd_set_huge(pmdp, phys, prot); 270 271 /* 272 * After the PMD entry has been populated once, we 273 * only allow updates to the permission attributes. 274 */ 275 BUG_ON(!pgattr_change_is_safe(pmd_val(old_pmd), 276 READ_ONCE(pmd_val(*pmdp)))); 277 } else { 278 int ret; 279 280 ret = alloc_init_cont_pte(pmdp, addr, next, phys, prot, 281 pgtable_alloc, flags); 282 if (ret) 283 return ret; 284 285 BUG_ON(pmd_val(old_pmd) != 0 && 286 pmd_val(old_pmd) != READ_ONCE(pmd_val(*pmdp))); 287 } 288 phys += next - addr; 289 } while (pmdp++, addr = next, addr != end); 290 291 return 0; 292 } 293 294 static int alloc_init_cont_pmd(pud_t *pudp, unsigned long addr, 295 unsigned long end, phys_addr_t phys, 296 pgprot_t prot, 297 phys_addr_t (*pgtable_alloc)(enum pgtable_type), 298 int flags) 299 { 300 int ret; 301 unsigned long next; 302 pud_t pud = READ_ONCE(*pudp); 303 pmd_t *pmdp; 304 305 /* 306 * Check for initial section mappings in the pgd/pud. 307 */ 308 BUG_ON(pud_sect(pud)); 309 if (pud_none(pud)) { 310 pudval_t pudval = PUD_TYPE_TABLE | PUD_TABLE_UXN | PUD_TABLE_AF; 311 phys_addr_t pmd_phys; 312 313 if (flags & NO_EXEC_MAPPINGS) 314 pudval |= PUD_TABLE_PXN; 315 BUG_ON(!pgtable_alloc); 316 pmd_phys = pgtable_alloc(TABLE_PMD); 317 if (pmd_phys == INVALID_PHYS_ADDR) 318 return -ENOMEM; 319 pmdp = pmd_set_fixmap(pmd_phys); 320 init_clear_pgtable(pmdp); 321 pmdp += pmd_index(addr); 322 __pud_populate(pudp, pmd_phys, pudval); 323 } else { 324 BUG_ON(pud_bad(pud)); 325 pmdp = pmd_set_fixmap_offset(pudp, addr); 326 } 327 328 do { 329 pgprot_t __prot = prot; 330 331 next = pmd_cont_addr_end(addr, end); 332 333 /* use a contiguous mapping if the range is suitably aligned */ 334 if ((((addr | next | phys) & ~CONT_PMD_MASK) == 0) && 335 (flags & NO_CONT_MAPPINGS) == 0) 336 __prot = __pgprot(pgprot_val(prot) | PTE_CONT); 337 338 ret = init_pmd(pmdp, addr, next, phys, __prot, pgtable_alloc, flags); 339 if (ret) 340 goto out; 341 342 pmdp += pmd_index(next) - pmd_index(addr); 343 phys += next - addr; 344 } while (addr = next, addr != end); 345 346 out: 347 pmd_clear_fixmap(); 348 349 return ret; 350 } 351 352 static int alloc_init_pud(p4d_t *p4dp, unsigned long addr, unsigned long end, 353 phys_addr_t phys, pgprot_t prot, 354 phys_addr_t (*pgtable_alloc)(enum pgtable_type), 355 int flags) 356 { 357 int ret = 0; 358 unsigned long next; 359 p4d_t p4d = READ_ONCE(*p4dp); 360 pud_t *pudp; 361 362 if (p4d_none(p4d)) { 363 p4dval_t p4dval = P4D_TYPE_TABLE | P4D_TABLE_UXN | P4D_TABLE_AF; 364 phys_addr_t pud_phys; 365 366 if (flags & NO_EXEC_MAPPINGS) 367 p4dval |= P4D_TABLE_PXN; 368 BUG_ON(!pgtable_alloc); 369 pud_phys = pgtable_alloc(TABLE_PUD); 370 if (pud_phys == INVALID_PHYS_ADDR) 371 return -ENOMEM; 372 pudp = pud_set_fixmap(pud_phys); 373 init_clear_pgtable(pudp); 374 pudp += pud_index(addr); 375 __p4d_populate(p4dp, pud_phys, p4dval); 376 } else { 377 BUG_ON(p4d_bad(p4d)); 378 pudp = pud_set_fixmap_offset(p4dp, addr); 379 } 380 381 do { 382 pud_t old_pud = READ_ONCE(*pudp); 383 384 next = pud_addr_end(addr, end); 385 386 /* 387 * For 4K granule only, attempt to put down a 1GB block 388 */ 389 if (pud_sect_supported() && 390 ((addr | next | phys) & ~PUD_MASK) == 0 && 391 (flags & NO_BLOCK_MAPPINGS) == 0) { 392 pud_set_huge(pudp, phys, prot); 393 394 /* 395 * After the PUD entry has been populated once, we 396 * only allow updates to the permission attributes. 397 */ 398 BUG_ON(!pgattr_change_is_safe(pud_val(old_pud), 399 READ_ONCE(pud_val(*pudp)))); 400 } else { 401 ret = alloc_init_cont_pmd(pudp, addr, next, phys, prot, 402 pgtable_alloc, flags); 403 if (ret) 404 goto out; 405 406 BUG_ON(pud_val(old_pud) != 0 && 407 pud_val(old_pud) != READ_ONCE(pud_val(*pudp))); 408 } 409 phys += next - addr; 410 } while (pudp++, addr = next, addr != end); 411 412 out: 413 pud_clear_fixmap(); 414 415 return ret; 416 } 417 418 static int alloc_init_p4d(pgd_t *pgdp, unsigned long addr, unsigned long end, 419 phys_addr_t phys, pgprot_t prot, 420 phys_addr_t (*pgtable_alloc)(enum pgtable_type), 421 int flags) 422 { 423 int ret; 424 unsigned long next; 425 pgd_t pgd = READ_ONCE(*pgdp); 426 p4d_t *p4dp; 427 428 if (pgd_none(pgd)) { 429 pgdval_t pgdval = PGD_TYPE_TABLE | PGD_TABLE_UXN | PGD_TABLE_AF; 430 phys_addr_t p4d_phys; 431 432 if (flags & NO_EXEC_MAPPINGS) 433 pgdval |= PGD_TABLE_PXN; 434 BUG_ON(!pgtable_alloc); 435 p4d_phys = pgtable_alloc(TABLE_P4D); 436 if (p4d_phys == INVALID_PHYS_ADDR) 437 return -ENOMEM; 438 p4dp = p4d_set_fixmap(p4d_phys); 439 init_clear_pgtable(p4dp); 440 p4dp += p4d_index(addr); 441 __pgd_populate(pgdp, p4d_phys, pgdval); 442 } else { 443 BUG_ON(pgd_bad(pgd)); 444 p4dp = p4d_set_fixmap_offset(pgdp, addr); 445 } 446 447 do { 448 p4d_t old_p4d = READ_ONCE(*p4dp); 449 450 next = p4d_addr_end(addr, end); 451 452 ret = alloc_init_pud(p4dp, addr, next, phys, prot, 453 pgtable_alloc, flags); 454 if (ret) 455 goto out; 456 457 BUG_ON(p4d_val(old_p4d) != 0 && 458 p4d_val(old_p4d) != READ_ONCE(p4d_val(*p4dp))); 459 460 phys += next - addr; 461 } while (p4dp++, addr = next, addr != end); 462 463 out: 464 p4d_clear_fixmap(); 465 466 return ret; 467 } 468 469 static int __create_pgd_mapping_locked(pgd_t *pgdir, phys_addr_t phys, 470 unsigned long virt, phys_addr_t size, 471 pgprot_t prot, 472 phys_addr_t (*pgtable_alloc)(enum pgtable_type), 473 int flags) 474 { 475 int ret; 476 unsigned long addr, end, next; 477 pgd_t *pgdp = pgd_offset_pgd(pgdir, virt); 478 479 /* 480 * If the virtual and physical address don't have the same offset 481 * within a page, we cannot map the region as the caller expects. 482 */ 483 if (WARN_ON((phys ^ virt) & ~PAGE_MASK)) 484 return -EINVAL; 485 486 phys &= PAGE_MASK; 487 addr = virt & PAGE_MASK; 488 end = PAGE_ALIGN(virt + size); 489 490 do { 491 next = pgd_addr_end(addr, end); 492 ret = alloc_init_p4d(pgdp, addr, next, phys, prot, pgtable_alloc, 493 flags); 494 if (ret) 495 return ret; 496 phys += next - addr; 497 } while (pgdp++, addr = next, addr != end); 498 499 return 0; 500 } 501 502 static int __create_pgd_mapping(pgd_t *pgdir, phys_addr_t phys, 503 unsigned long virt, phys_addr_t size, 504 pgprot_t prot, 505 phys_addr_t (*pgtable_alloc)(enum pgtable_type), 506 int flags) 507 { 508 int ret; 509 510 mutex_lock(&fixmap_lock); 511 ret = __create_pgd_mapping_locked(pgdir, phys, virt, size, prot, 512 pgtable_alloc, flags); 513 mutex_unlock(&fixmap_lock); 514 515 return ret; 516 } 517 518 static void early_create_pgd_mapping(pgd_t *pgdir, phys_addr_t phys, 519 unsigned long virt, phys_addr_t size, 520 pgprot_t prot, 521 phys_addr_t (*pgtable_alloc)(enum pgtable_type), 522 int flags) 523 { 524 int ret; 525 526 ret = __create_pgd_mapping(pgdir, phys, virt, size, prot, pgtable_alloc, 527 flags); 528 if (ret) 529 panic("Failed to create page tables\n"); 530 } 531 532 static phys_addr_t __pgd_pgtable_alloc(struct mm_struct *mm, gfp_t gfp, 533 enum pgtable_type pgtable_type) 534 { 535 /* Page is zeroed by init_clear_pgtable() so don't duplicate effort. */ 536 struct ptdesc *ptdesc = pagetable_alloc(gfp & ~__GFP_ZERO, 0); 537 phys_addr_t pa; 538 539 if (!ptdesc) 540 return INVALID_PHYS_ADDR; 541 542 pa = page_to_phys(ptdesc_page(ptdesc)); 543 544 switch (pgtable_type) { 545 case TABLE_PTE: 546 BUG_ON(!pagetable_pte_ctor(mm, ptdesc)); 547 break; 548 case TABLE_PMD: 549 BUG_ON(!pagetable_pmd_ctor(mm, ptdesc)); 550 break; 551 case TABLE_PUD: 552 pagetable_pud_ctor(ptdesc); 553 break; 554 case TABLE_P4D: 555 pagetable_p4d_ctor(ptdesc); 556 break; 557 } 558 559 return pa; 560 } 561 562 static phys_addr_t 563 pgd_pgtable_alloc_init_mm_gfp(enum pgtable_type pgtable_type, gfp_t gfp) 564 { 565 return __pgd_pgtable_alloc(&init_mm, gfp, pgtable_type); 566 } 567 568 static phys_addr_t __maybe_unused 569 pgd_pgtable_alloc_init_mm(enum pgtable_type pgtable_type) 570 { 571 return pgd_pgtable_alloc_init_mm_gfp(pgtable_type, GFP_PGTABLE_KERNEL); 572 } 573 574 static phys_addr_t 575 pgd_pgtable_alloc_special_mm(enum pgtable_type pgtable_type) 576 { 577 return __pgd_pgtable_alloc(NULL, GFP_PGTABLE_KERNEL, pgtable_type); 578 } 579 580 static void split_contpte(pte_t *ptep) 581 { 582 int i; 583 584 ptep = PTR_ALIGN_DOWN(ptep, sizeof(*ptep) * CONT_PTES); 585 for (i = 0; i < CONT_PTES; i++, ptep++) 586 __set_pte(ptep, pte_mknoncont(__ptep_get(ptep))); 587 } 588 589 static int split_pmd(pmd_t *pmdp, pmd_t pmd, gfp_t gfp, bool to_cont) 590 { 591 pmdval_t tableprot = PMD_TYPE_TABLE | PMD_TABLE_UXN | PMD_TABLE_AF; 592 unsigned long pfn = pmd_pfn(pmd); 593 pgprot_t prot = pmd_pgprot(pmd); 594 phys_addr_t pte_phys; 595 pte_t *ptep; 596 int i; 597 598 pte_phys = pgd_pgtable_alloc_init_mm_gfp(TABLE_PTE, gfp); 599 if (pte_phys == INVALID_PHYS_ADDR) 600 return -ENOMEM; 601 ptep = (pte_t *)phys_to_virt(pte_phys); 602 603 if (pgprot_val(prot) & PMD_SECT_PXN) 604 tableprot |= PMD_TABLE_PXN; 605 606 prot = __pgprot((pgprot_val(prot) & ~PTE_TYPE_MASK) | PTE_TYPE_PAGE); 607 prot = __pgprot(pgprot_val(prot) & ~PTE_CONT); 608 if (to_cont) 609 prot = __pgprot(pgprot_val(prot) | PTE_CONT); 610 611 for (i = 0; i < PTRS_PER_PTE; i++, ptep++, pfn++) 612 __set_pte(ptep, pfn_pte(pfn, prot)); 613 614 /* 615 * Ensure the pte entries are visible to the table walker by the time 616 * the pmd entry that points to the ptes is visible. 617 */ 618 dsb(ishst); 619 __pmd_populate(pmdp, pte_phys, tableprot); 620 621 return 0; 622 } 623 624 static void split_contpmd(pmd_t *pmdp) 625 { 626 int i; 627 628 pmdp = PTR_ALIGN_DOWN(pmdp, sizeof(*pmdp) * CONT_PMDS); 629 for (i = 0; i < CONT_PMDS; i++, pmdp++) 630 set_pmd(pmdp, pmd_mknoncont(pmdp_get(pmdp))); 631 } 632 633 static int split_pud(pud_t *pudp, pud_t pud, gfp_t gfp, bool to_cont) 634 { 635 pudval_t tableprot = PUD_TYPE_TABLE | PUD_TABLE_UXN | PUD_TABLE_AF; 636 unsigned int step = PMD_SIZE >> PAGE_SHIFT; 637 unsigned long pfn = pud_pfn(pud); 638 pgprot_t prot = pud_pgprot(pud); 639 phys_addr_t pmd_phys; 640 pmd_t *pmdp; 641 int i; 642 643 pmd_phys = pgd_pgtable_alloc_init_mm_gfp(TABLE_PMD, gfp); 644 if (pmd_phys == INVALID_PHYS_ADDR) 645 return -ENOMEM; 646 pmdp = (pmd_t *)phys_to_virt(pmd_phys); 647 648 if (pgprot_val(prot) & PMD_SECT_PXN) 649 tableprot |= PUD_TABLE_PXN; 650 651 prot = __pgprot((pgprot_val(prot) & ~PMD_TYPE_MASK) | PMD_TYPE_SECT); 652 prot = __pgprot(pgprot_val(prot) & ~PTE_CONT); 653 if (to_cont) 654 prot = __pgprot(pgprot_val(prot) | PTE_CONT); 655 656 for (i = 0; i < PTRS_PER_PMD; i++, pmdp++, pfn += step) 657 set_pmd(pmdp, pfn_pmd(pfn, prot)); 658 659 /* 660 * Ensure the pmd entries are visible to the table walker by the time 661 * the pud entry that points to the pmds is visible. 662 */ 663 dsb(ishst); 664 __pud_populate(pudp, pmd_phys, tableprot); 665 666 return 0; 667 } 668 669 static int split_kernel_leaf_mapping_locked(unsigned long addr) 670 { 671 pgd_t *pgdp, pgd; 672 p4d_t *p4dp, p4d; 673 pud_t *pudp, pud; 674 pmd_t *pmdp, pmd; 675 pte_t *ptep, pte; 676 int ret = 0; 677 678 /* 679 * PGD: If addr is PGD aligned then addr already describes a leaf 680 * boundary. If not present then there is nothing to split. 681 */ 682 if (ALIGN_DOWN(addr, PGDIR_SIZE) == addr) 683 goto out; 684 pgdp = pgd_offset_k(addr); 685 pgd = pgdp_get(pgdp); 686 if (!pgd_present(pgd)) 687 goto out; 688 689 /* 690 * P4D: If addr is P4D aligned then addr already describes a leaf 691 * boundary. If not present then there is nothing to split. 692 */ 693 if (ALIGN_DOWN(addr, P4D_SIZE) == addr) 694 goto out; 695 p4dp = p4d_offset(pgdp, addr); 696 p4d = p4dp_get(p4dp); 697 if (!p4d_present(p4d)) 698 goto out; 699 700 /* 701 * PUD: If addr is PUD aligned then addr already describes a leaf 702 * boundary. If not present then there is nothing to split. Otherwise, 703 * if we have a pud leaf, split to contpmd. 704 */ 705 if (ALIGN_DOWN(addr, PUD_SIZE) == addr) 706 goto out; 707 pudp = pud_offset(p4dp, addr); 708 pud = pudp_get(pudp); 709 if (!pud_present(pud)) 710 goto out; 711 if (pud_leaf(pud)) { 712 ret = split_pud(pudp, pud, GFP_PGTABLE_KERNEL, true); 713 if (ret) 714 goto out; 715 } 716 717 /* 718 * CONTPMD: If addr is CONTPMD aligned then addr already describes a 719 * leaf boundary. If not present then there is nothing to split. 720 * Otherwise, if we have a contpmd leaf, split to pmd. 721 */ 722 if (ALIGN_DOWN(addr, CONT_PMD_SIZE) == addr) 723 goto out; 724 pmdp = pmd_offset(pudp, addr); 725 pmd = pmdp_get(pmdp); 726 if (!pmd_present(pmd)) 727 goto out; 728 if (pmd_leaf(pmd)) { 729 if (pmd_cont(pmd)) 730 split_contpmd(pmdp); 731 /* 732 * PMD: If addr is PMD aligned then addr already describes a 733 * leaf boundary. Otherwise, split to contpte. 734 */ 735 if (ALIGN_DOWN(addr, PMD_SIZE) == addr) 736 goto out; 737 ret = split_pmd(pmdp, pmd, GFP_PGTABLE_KERNEL, true); 738 if (ret) 739 goto out; 740 } 741 742 /* 743 * CONTPTE: If addr is CONTPTE aligned then addr already describes a 744 * leaf boundary. If not present then there is nothing to split. 745 * Otherwise, if we have a contpte leaf, split to pte. 746 */ 747 if (ALIGN_DOWN(addr, CONT_PTE_SIZE) == addr) 748 goto out; 749 ptep = pte_offset_kernel(pmdp, addr); 750 pte = __ptep_get(ptep); 751 if (!pte_present(pte)) 752 goto out; 753 if (pte_cont(pte)) 754 split_contpte(ptep); 755 756 out: 757 return ret; 758 } 759 760 static inline bool force_pte_mapping(void) 761 { 762 const bool bbml2 = system_capabilities_finalized() ? 763 system_supports_bbml2_noabort() : cpu_supports_bbml2_noabort(); 764 765 if (debug_pagealloc_enabled()) 766 return true; 767 if (bbml2) 768 return false; 769 return rodata_full || arm64_kfence_can_set_direct_map() || is_realm_world(); 770 } 771 772 static inline bool split_leaf_mapping_possible(void) 773 { 774 /* 775 * !BBML2_NOABORT systems should never run into scenarios where we would 776 * have to split. So exit early and let calling code detect it and raise 777 * a warning. 778 */ 779 if (!system_supports_bbml2_noabort()) 780 return false; 781 return !force_pte_mapping(); 782 } 783 784 static DEFINE_MUTEX(pgtable_split_lock); 785 786 int split_kernel_leaf_mapping(unsigned long start, unsigned long end) 787 { 788 int ret; 789 790 /* 791 * Exit early if the region is within a pte-mapped area or if we can't 792 * split. For the latter case, the permission change code will raise a 793 * warning if not already pte-mapped. 794 */ 795 if (!split_leaf_mapping_possible() || is_kfence_address((void *)start)) 796 return 0; 797 798 /* 799 * Ensure start and end are at least page-aligned since this is the 800 * finest granularity we can split to. 801 */ 802 if (start != PAGE_ALIGN(start) || end != PAGE_ALIGN(end)) 803 return -EINVAL; 804 805 mutex_lock(&pgtable_split_lock); 806 arch_enter_lazy_mmu_mode(); 807 808 /* 809 * The split_kernel_leaf_mapping_locked() may sleep, it is not a 810 * problem for ARM64 since ARM64's lazy MMU implementation allows 811 * sleeping. 812 * 813 * Optimize for the common case of splitting out a single page from a 814 * larger mapping. Here we can just split on the "least aligned" of 815 * start and end and this will guarantee that there must also be a split 816 * on the more aligned address since the both addresses must be in the 817 * same contpte block and it must have been split to ptes. 818 */ 819 if (end - start == PAGE_SIZE) { 820 start = __ffs(start) < __ffs(end) ? start : end; 821 ret = split_kernel_leaf_mapping_locked(start); 822 } else { 823 ret = split_kernel_leaf_mapping_locked(start); 824 if (!ret) 825 ret = split_kernel_leaf_mapping_locked(end); 826 } 827 828 arch_leave_lazy_mmu_mode(); 829 mutex_unlock(&pgtable_split_lock); 830 return ret; 831 } 832 833 static int split_to_ptes_pud_entry(pud_t *pudp, unsigned long addr, 834 unsigned long next, struct mm_walk *walk) 835 { 836 gfp_t gfp = *(gfp_t *)walk->private; 837 pud_t pud = pudp_get(pudp); 838 int ret = 0; 839 840 if (pud_leaf(pud)) 841 ret = split_pud(pudp, pud, gfp, false); 842 843 return ret; 844 } 845 846 static int split_to_ptes_pmd_entry(pmd_t *pmdp, unsigned long addr, 847 unsigned long next, struct mm_walk *walk) 848 { 849 gfp_t gfp = *(gfp_t *)walk->private; 850 pmd_t pmd = pmdp_get(pmdp); 851 int ret = 0; 852 853 if (pmd_leaf(pmd)) { 854 if (pmd_cont(pmd)) 855 split_contpmd(pmdp); 856 ret = split_pmd(pmdp, pmd, gfp, false); 857 858 /* 859 * We have split the pmd directly to ptes so there is no need to 860 * visit each pte to check if they are contpte. 861 */ 862 walk->action = ACTION_CONTINUE; 863 } 864 865 return ret; 866 } 867 868 static int split_to_ptes_pte_entry(pte_t *ptep, unsigned long addr, 869 unsigned long next, struct mm_walk *walk) 870 { 871 pte_t pte = __ptep_get(ptep); 872 873 if (pte_cont(pte)) 874 split_contpte(ptep); 875 876 return 0; 877 } 878 879 static const struct mm_walk_ops split_to_ptes_ops = { 880 .pud_entry = split_to_ptes_pud_entry, 881 .pmd_entry = split_to_ptes_pmd_entry, 882 .pte_entry = split_to_ptes_pte_entry, 883 }; 884 885 static int range_split_to_ptes(unsigned long start, unsigned long end, gfp_t gfp) 886 { 887 int ret; 888 889 arch_enter_lazy_mmu_mode(); 890 ret = walk_kernel_page_table_range_lockless(start, end, 891 &split_to_ptes_ops, NULL, &gfp); 892 arch_leave_lazy_mmu_mode(); 893 894 return ret; 895 } 896 897 static bool linear_map_requires_bbml2 __initdata; 898 899 u32 idmap_kpti_bbml2_flag; 900 901 static void __init init_idmap_kpti_bbml2_flag(void) 902 { 903 WRITE_ONCE(idmap_kpti_bbml2_flag, 1); 904 /* Must be visible to other CPUs before stop_machine() is called. */ 905 smp_mb(); 906 } 907 908 static int __init linear_map_split_to_ptes(void *__unused) 909 { 910 /* 911 * Repainting the linear map must be done by CPU0 (the boot CPU) because 912 * that's the only CPU that we know supports BBML2. The other CPUs will 913 * be held in a waiting area with the idmap active. 914 */ 915 if (!smp_processor_id()) { 916 unsigned long lstart = _PAGE_OFFSET(vabits_actual); 917 unsigned long lend = PAGE_END; 918 unsigned long kstart = (unsigned long)lm_alias(_stext); 919 unsigned long kend = (unsigned long)lm_alias(__init_begin); 920 int ret; 921 922 /* 923 * Wait for all secondary CPUs to be put into the waiting area. 924 */ 925 smp_cond_load_acquire(&idmap_kpti_bbml2_flag, VAL == num_online_cpus()); 926 927 /* 928 * Walk all of the linear map [lstart, lend), except the kernel 929 * linear map alias [kstart, kend), and split all mappings to 930 * PTE. The kernel alias remains static throughout runtime so 931 * can continue to be safely mapped with large mappings. 932 */ 933 ret = range_split_to_ptes(lstart, kstart, GFP_ATOMIC); 934 if (!ret) 935 ret = range_split_to_ptes(kend, lend, GFP_ATOMIC); 936 if (ret) 937 panic("Failed to split linear map\n"); 938 flush_tlb_kernel_range(lstart, lend); 939 940 /* 941 * Relies on dsb in flush_tlb_kernel_range() to avoid reordering 942 * before any page table split operations. 943 */ 944 WRITE_ONCE(idmap_kpti_bbml2_flag, 0); 945 } else { 946 typedef void (wait_split_fn)(void); 947 extern wait_split_fn wait_linear_map_split_to_ptes; 948 wait_split_fn *wait_fn; 949 950 wait_fn = (void *)__pa_symbol(wait_linear_map_split_to_ptes); 951 952 /* 953 * At least one secondary CPU doesn't support BBML2 so cannot 954 * tolerate the size of the live mappings changing. So have the 955 * secondary CPUs wait for the boot CPU to make the changes 956 * with the idmap active and init_mm inactive. 957 */ 958 cpu_install_idmap(); 959 wait_fn(); 960 cpu_uninstall_idmap(); 961 } 962 963 return 0; 964 } 965 966 void __init linear_map_maybe_split_to_ptes(void) 967 { 968 if (linear_map_requires_bbml2 && !system_supports_bbml2_noabort()) { 969 init_idmap_kpti_bbml2_flag(); 970 stop_machine(linear_map_split_to_ptes, NULL, cpu_online_mask); 971 } 972 } 973 974 /* 975 * This function can only be used to modify existing table entries, 976 * without allocating new levels of table. Note that this permits the 977 * creation of new section or page entries. 978 */ 979 void __init create_mapping_noalloc(phys_addr_t phys, unsigned long virt, 980 phys_addr_t size, pgprot_t prot) 981 { 982 if (virt < PAGE_OFFSET) { 983 pr_warn("BUG: not creating mapping for %pa at 0x%016lx - outside kernel range\n", 984 &phys, virt); 985 return; 986 } 987 early_create_pgd_mapping(init_mm.pgd, phys, virt, size, prot, NULL, 988 NO_CONT_MAPPINGS); 989 } 990 991 void __init create_pgd_mapping(struct mm_struct *mm, phys_addr_t phys, 992 unsigned long virt, phys_addr_t size, 993 pgprot_t prot, bool page_mappings_only) 994 { 995 int flags = 0; 996 997 BUG_ON(mm == &init_mm); 998 999 if (page_mappings_only) 1000 flags = NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS; 1001 1002 early_create_pgd_mapping(mm->pgd, phys, virt, size, prot, 1003 pgd_pgtable_alloc_special_mm, flags); 1004 } 1005 1006 static void update_mapping_prot(phys_addr_t phys, unsigned long virt, 1007 phys_addr_t size, pgprot_t prot) 1008 { 1009 if (virt < PAGE_OFFSET) { 1010 pr_warn("BUG: not updating mapping for %pa at 0x%016lx - outside kernel range\n", 1011 &phys, virt); 1012 return; 1013 } 1014 1015 early_create_pgd_mapping(init_mm.pgd, phys, virt, size, prot, NULL, 1016 NO_CONT_MAPPINGS); 1017 1018 /* flush the TLBs after updating live kernel mappings */ 1019 flush_tlb_kernel_range(virt, virt + size); 1020 } 1021 1022 static void __init __map_memblock(pgd_t *pgdp, phys_addr_t start, 1023 phys_addr_t end, pgprot_t prot, int flags) 1024 { 1025 early_create_pgd_mapping(pgdp, start, __phys_to_virt(start), end - start, 1026 prot, early_pgtable_alloc, flags); 1027 } 1028 1029 void __init mark_linear_text_alias_ro(void) 1030 { 1031 /* 1032 * Remove the write permissions from the linear alias of .text/.rodata 1033 */ 1034 update_mapping_prot(__pa_symbol(_text), (unsigned long)lm_alias(_text), 1035 (unsigned long)__init_begin - (unsigned long)_text, 1036 PAGE_KERNEL_RO); 1037 } 1038 1039 #ifdef CONFIG_KFENCE 1040 1041 bool __ro_after_init kfence_early_init = !!CONFIG_KFENCE_SAMPLE_INTERVAL; 1042 1043 /* early_param() will be parsed before map_mem() below. */ 1044 static int __init parse_kfence_early_init(char *arg) 1045 { 1046 int val; 1047 1048 if (get_option(&arg, &val)) 1049 kfence_early_init = !!val; 1050 return 0; 1051 } 1052 early_param("kfence.sample_interval", parse_kfence_early_init); 1053 1054 static phys_addr_t __init arm64_kfence_alloc_pool(void) 1055 { 1056 phys_addr_t kfence_pool; 1057 1058 if (!kfence_early_init) 1059 return 0; 1060 1061 kfence_pool = memblock_phys_alloc(KFENCE_POOL_SIZE, PAGE_SIZE); 1062 if (!kfence_pool) { 1063 pr_err("failed to allocate kfence pool\n"); 1064 kfence_early_init = false; 1065 return 0; 1066 } 1067 1068 /* Temporarily mark as NOMAP. */ 1069 memblock_mark_nomap(kfence_pool, KFENCE_POOL_SIZE); 1070 1071 return kfence_pool; 1072 } 1073 1074 static void __init arm64_kfence_map_pool(phys_addr_t kfence_pool, pgd_t *pgdp) 1075 { 1076 if (!kfence_pool) 1077 return; 1078 1079 /* KFENCE pool needs page-level mapping. */ 1080 __map_memblock(pgdp, kfence_pool, kfence_pool + KFENCE_POOL_SIZE, 1081 pgprot_tagged(PAGE_KERNEL), 1082 NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS); 1083 memblock_clear_nomap(kfence_pool, KFENCE_POOL_SIZE); 1084 __kfence_pool = phys_to_virt(kfence_pool); 1085 } 1086 1087 bool arch_kfence_init_pool(void) 1088 { 1089 unsigned long start = (unsigned long)__kfence_pool; 1090 unsigned long end = start + KFENCE_POOL_SIZE; 1091 int ret; 1092 1093 /* Exit early if we know the linear map is already pte-mapped. */ 1094 if (!split_leaf_mapping_possible()) 1095 return true; 1096 1097 /* Kfence pool is already pte-mapped for the early init case. */ 1098 if (kfence_early_init) 1099 return true; 1100 1101 mutex_lock(&pgtable_split_lock); 1102 ret = range_split_to_ptes(start, end, GFP_PGTABLE_KERNEL); 1103 mutex_unlock(&pgtable_split_lock); 1104 1105 /* 1106 * Since the system supports bbml2_noabort, tlb invalidation is not 1107 * required here; the pgtable mappings have been split to pte but larger 1108 * entries may safely linger in the TLB. 1109 */ 1110 1111 return !ret; 1112 } 1113 #else /* CONFIG_KFENCE */ 1114 1115 static inline phys_addr_t arm64_kfence_alloc_pool(void) { return 0; } 1116 static inline void arm64_kfence_map_pool(phys_addr_t kfence_pool, pgd_t *pgdp) { } 1117 1118 #endif /* CONFIG_KFENCE */ 1119 1120 static void __init map_mem(pgd_t *pgdp) 1121 { 1122 static const u64 direct_map_end = _PAGE_END(VA_BITS_MIN); 1123 phys_addr_t kernel_start = __pa_symbol(_text); 1124 phys_addr_t kernel_end = __pa_symbol(__init_begin); 1125 phys_addr_t start, end; 1126 phys_addr_t early_kfence_pool; 1127 int flags = NO_EXEC_MAPPINGS; 1128 u64 i; 1129 1130 /* 1131 * Setting hierarchical PXNTable attributes on table entries covering 1132 * the linear region is only possible if it is guaranteed that no table 1133 * entries at any level are being shared between the linear region and 1134 * the vmalloc region. Check whether this is true for the PGD level, in 1135 * which case it is guaranteed to be true for all other levels as well. 1136 * (Unless we are running with support for LPA2, in which case the 1137 * entire reduced VA space is covered by a single pgd_t which will have 1138 * been populated without the PXNTable attribute by the time we get here.) 1139 */ 1140 BUILD_BUG_ON(pgd_index(direct_map_end - 1) == pgd_index(direct_map_end) && 1141 pgd_index(_PAGE_OFFSET(VA_BITS_MIN)) != PTRS_PER_PGD - 1); 1142 1143 early_kfence_pool = arm64_kfence_alloc_pool(); 1144 1145 linear_map_requires_bbml2 = !force_pte_mapping() && can_set_direct_map(); 1146 1147 if (force_pte_mapping()) 1148 flags |= NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS; 1149 1150 /* 1151 * Take care not to create a writable alias for the 1152 * read-only text and rodata sections of the kernel image. 1153 * So temporarily mark them as NOMAP to skip mappings in 1154 * the following for-loop 1155 */ 1156 memblock_mark_nomap(kernel_start, kernel_end - kernel_start); 1157 1158 /* map all the memory banks */ 1159 for_each_mem_range(i, &start, &end) { 1160 if (start >= end) 1161 break; 1162 /* 1163 * The linear map must allow allocation tags reading/writing 1164 * if MTE is present. Otherwise, it has the same attributes as 1165 * PAGE_KERNEL. 1166 */ 1167 __map_memblock(pgdp, start, end, pgprot_tagged(PAGE_KERNEL), 1168 flags); 1169 } 1170 1171 /* 1172 * Map the linear alias of the [_text, __init_begin) interval 1173 * as non-executable now, and remove the write permission in 1174 * mark_linear_text_alias_ro() below (which will be called after 1175 * alternative patching has completed). This makes the contents 1176 * of the region accessible to subsystems such as hibernate, 1177 * but protects it from inadvertent modification or execution. 1178 * Note that contiguous mappings cannot be remapped in this way, 1179 * so we should avoid them here. 1180 */ 1181 __map_memblock(pgdp, kernel_start, kernel_end, 1182 PAGE_KERNEL, NO_CONT_MAPPINGS); 1183 memblock_clear_nomap(kernel_start, kernel_end - kernel_start); 1184 arm64_kfence_map_pool(early_kfence_pool, pgdp); 1185 } 1186 1187 void mark_rodata_ro(void) 1188 { 1189 unsigned long section_size; 1190 1191 /* 1192 * mark .rodata as read only. Use __init_begin rather than __end_rodata 1193 * to cover NOTES and EXCEPTION_TABLE. 1194 */ 1195 section_size = (unsigned long)__init_begin - (unsigned long)__start_rodata; 1196 WRITE_ONCE(rodata_is_rw, false); 1197 update_mapping_prot(__pa_symbol(__start_rodata), (unsigned long)__start_rodata, 1198 section_size, PAGE_KERNEL_RO); 1199 /* mark the range between _text and _stext as read only. */ 1200 update_mapping_prot(__pa_symbol(_text), (unsigned long)_text, 1201 (unsigned long)_stext - (unsigned long)_text, 1202 PAGE_KERNEL_RO); 1203 } 1204 1205 static void __init declare_vma(struct vm_struct *vma, 1206 void *va_start, void *va_end, 1207 unsigned long vm_flags) 1208 { 1209 phys_addr_t pa_start = __pa_symbol(va_start); 1210 unsigned long size = va_end - va_start; 1211 1212 BUG_ON(!PAGE_ALIGNED(pa_start)); 1213 BUG_ON(!PAGE_ALIGNED(size)); 1214 1215 if (!(vm_flags & VM_NO_GUARD)) 1216 size += PAGE_SIZE; 1217 1218 vma->addr = va_start; 1219 vma->phys_addr = pa_start; 1220 vma->size = size; 1221 vma->flags = VM_MAP | vm_flags; 1222 vma->caller = __builtin_return_address(0); 1223 1224 vm_area_add_early(vma); 1225 } 1226 1227 #ifdef CONFIG_UNMAP_KERNEL_AT_EL0 1228 #define KPTI_NG_TEMP_VA (-(1UL << PMD_SHIFT)) 1229 1230 static phys_addr_t kpti_ng_temp_alloc __initdata; 1231 1232 static phys_addr_t __init kpti_ng_pgd_alloc(enum pgtable_type type) 1233 { 1234 kpti_ng_temp_alloc -= PAGE_SIZE; 1235 return kpti_ng_temp_alloc; 1236 } 1237 1238 static int __init __kpti_install_ng_mappings(void *__unused) 1239 { 1240 typedef void (kpti_remap_fn)(int, int, phys_addr_t, unsigned long); 1241 extern kpti_remap_fn idmap_kpti_install_ng_mappings; 1242 kpti_remap_fn *remap_fn; 1243 1244 int cpu = smp_processor_id(); 1245 int levels = CONFIG_PGTABLE_LEVELS; 1246 int order = order_base_2(levels); 1247 u64 kpti_ng_temp_pgd_pa = 0; 1248 pgd_t *kpti_ng_temp_pgd; 1249 u64 alloc = 0; 1250 1251 if (levels == 5 && !pgtable_l5_enabled()) 1252 levels = 4; 1253 else if (levels == 4 && !pgtable_l4_enabled()) 1254 levels = 3; 1255 1256 remap_fn = (void *)__pa_symbol(idmap_kpti_install_ng_mappings); 1257 1258 if (!cpu) { 1259 int ret; 1260 1261 alloc = __get_free_pages(GFP_ATOMIC | __GFP_ZERO, order); 1262 kpti_ng_temp_pgd = (pgd_t *)(alloc + (levels - 1) * PAGE_SIZE); 1263 kpti_ng_temp_alloc = kpti_ng_temp_pgd_pa = __pa(kpti_ng_temp_pgd); 1264 1265 // 1266 // Create a minimal page table hierarchy that permits us to map 1267 // the swapper page tables temporarily as we traverse them. 1268 // 1269 // The physical pages are laid out as follows: 1270 // 1271 // +--------+-/-------+-/------ +-/------ +-\\\--------+ 1272 // : PTE[] : | PMD[] : | PUD[] : | P4D[] : ||| PGD[] : 1273 // +--------+-\-------+-\------ +-\------ +-///--------+ 1274 // ^ 1275 // The first page is mapped into this hierarchy at a PMD_SHIFT 1276 // aligned virtual address, so that we can manipulate the PTE 1277 // level entries while the mapping is active. The first entry 1278 // covers the PTE[] page itself, the remaining entries are free 1279 // to be used as a ad-hoc fixmap. 1280 // 1281 ret = __create_pgd_mapping_locked(kpti_ng_temp_pgd, __pa(alloc), 1282 KPTI_NG_TEMP_VA, PAGE_SIZE, PAGE_KERNEL, 1283 kpti_ng_pgd_alloc, 0); 1284 if (ret) 1285 panic("Failed to create page tables\n"); 1286 } 1287 1288 cpu_install_idmap(); 1289 remap_fn(cpu, num_online_cpus(), kpti_ng_temp_pgd_pa, KPTI_NG_TEMP_VA); 1290 cpu_uninstall_idmap(); 1291 1292 if (!cpu) { 1293 free_pages(alloc, order); 1294 arm64_use_ng_mappings = true; 1295 } 1296 1297 return 0; 1298 } 1299 1300 void __init kpti_install_ng_mappings(void) 1301 { 1302 /* Check whether KPTI is going to be used */ 1303 if (!arm64_kernel_unmapped_at_el0()) 1304 return; 1305 1306 /* 1307 * We don't need to rewrite the page-tables if either we've done 1308 * it already or we have KASLR enabled and therefore have not 1309 * created any global mappings at all. 1310 */ 1311 if (arm64_use_ng_mappings) 1312 return; 1313 1314 init_idmap_kpti_bbml2_flag(); 1315 stop_machine(__kpti_install_ng_mappings, NULL, cpu_online_mask); 1316 } 1317 1318 static pgprot_t __init kernel_exec_prot(void) 1319 { 1320 return rodata_enabled ? PAGE_KERNEL_ROX : PAGE_KERNEL_EXEC; 1321 } 1322 1323 static int __init map_entry_trampoline(void) 1324 { 1325 int i; 1326 1327 if (!arm64_kernel_unmapped_at_el0()) 1328 return 0; 1329 1330 pgprot_t prot = kernel_exec_prot(); 1331 phys_addr_t pa_start = __pa_symbol(__entry_tramp_text_start); 1332 1333 /* The trampoline is always mapped and can therefore be global */ 1334 pgprot_val(prot) &= ~PTE_NG; 1335 1336 /* Map only the text into the trampoline page table */ 1337 memset(tramp_pg_dir, 0, PGD_SIZE); 1338 early_create_pgd_mapping(tramp_pg_dir, pa_start, TRAMP_VALIAS, 1339 entry_tramp_text_size(), prot, 1340 pgd_pgtable_alloc_init_mm, NO_BLOCK_MAPPINGS); 1341 1342 /* Map both the text and data into the kernel page table */ 1343 for (i = 0; i < DIV_ROUND_UP(entry_tramp_text_size(), PAGE_SIZE); i++) 1344 __set_fixmap(FIX_ENTRY_TRAMP_TEXT1 - i, 1345 pa_start + i * PAGE_SIZE, prot); 1346 1347 if (IS_ENABLED(CONFIG_RELOCATABLE)) 1348 __set_fixmap(FIX_ENTRY_TRAMP_TEXT1 - i, 1349 pa_start + i * PAGE_SIZE, PAGE_KERNEL_RO); 1350 1351 return 0; 1352 } 1353 core_initcall(map_entry_trampoline); 1354 #endif 1355 1356 /* 1357 * Declare the VMA areas for the kernel 1358 */ 1359 static void __init declare_kernel_vmas(void) 1360 { 1361 static struct vm_struct vmlinux_seg[KERNEL_SEGMENT_COUNT]; 1362 1363 declare_vma(&vmlinux_seg[0], _text, _etext, VM_NO_GUARD); 1364 declare_vma(&vmlinux_seg[1], __start_rodata, __inittext_begin, VM_NO_GUARD); 1365 declare_vma(&vmlinux_seg[2], __inittext_begin, __inittext_end, VM_NO_GUARD); 1366 declare_vma(&vmlinux_seg[3], __initdata_begin, __initdata_end, VM_NO_GUARD); 1367 declare_vma(&vmlinux_seg[4], _data, _end, 0); 1368 } 1369 1370 void __pi_map_range(phys_addr_t *pte, u64 start, u64 end, phys_addr_t pa, 1371 pgprot_t prot, int level, pte_t *tbl, bool may_use_cont, 1372 u64 va_offset); 1373 1374 static u8 idmap_ptes[IDMAP_LEVELS - 1][PAGE_SIZE] __aligned(PAGE_SIZE) __ro_after_init, 1375 kpti_bbml2_ptes[IDMAP_LEVELS - 1][PAGE_SIZE] __aligned(PAGE_SIZE) __ro_after_init; 1376 1377 static void __init create_idmap(void) 1378 { 1379 phys_addr_t start = __pa_symbol(__idmap_text_start); 1380 phys_addr_t end = __pa_symbol(__idmap_text_end); 1381 phys_addr_t ptep = __pa_symbol(idmap_ptes); 1382 1383 __pi_map_range(&ptep, start, end, start, PAGE_KERNEL_ROX, 1384 IDMAP_ROOT_LEVEL, (pte_t *)idmap_pg_dir, false, 1385 __phys_to_virt(ptep) - ptep); 1386 1387 if (linear_map_requires_bbml2 || 1388 (IS_ENABLED(CONFIG_UNMAP_KERNEL_AT_EL0) && !arm64_use_ng_mappings)) { 1389 phys_addr_t pa = __pa_symbol(&idmap_kpti_bbml2_flag); 1390 1391 /* 1392 * The KPTI G-to-nG conversion code needs a read-write mapping 1393 * of its synchronization flag in the ID map. This is also used 1394 * when splitting the linear map to ptes if a secondary CPU 1395 * doesn't support bbml2. 1396 */ 1397 ptep = __pa_symbol(kpti_bbml2_ptes); 1398 __pi_map_range(&ptep, pa, pa + sizeof(u32), pa, PAGE_KERNEL, 1399 IDMAP_ROOT_LEVEL, (pte_t *)idmap_pg_dir, false, 1400 __phys_to_virt(ptep) - ptep); 1401 } 1402 } 1403 1404 void __init paging_init(void) 1405 { 1406 map_mem(swapper_pg_dir); 1407 1408 memblock_allow_resize(); 1409 1410 create_idmap(); 1411 declare_kernel_vmas(); 1412 } 1413 1414 #ifdef CONFIG_MEMORY_HOTPLUG 1415 static void free_hotplug_page_range(struct page *page, size_t size, 1416 struct vmem_altmap *altmap) 1417 { 1418 if (altmap) { 1419 vmem_altmap_free(altmap, size >> PAGE_SHIFT); 1420 } else { 1421 WARN_ON(PageReserved(page)); 1422 __free_pages(page, get_order(size)); 1423 } 1424 } 1425 1426 static void free_hotplug_pgtable_page(struct page *page) 1427 { 1428 free_hotplug_page_range(page, PAGE_SIZE, NULL); 1429 } 1430 1431 static bool pgtable_range_aligned(unsigned long start, unsigned long end, 1432 unsigned long floor, unsigned long ceiling, 1433 unsigned long mask) 1434 { 1435 start &= mask; 1436 if (start < floor) 1437 return false; 1438 1439 if (ceiling) { 1440 ceiling &= mask; 1441 if (!ceiling) 1442 return false; 1443 } 1444 1445 if (end - 1 > ceiling - 1) 1446 return false; 1447 return true; 1448 } 1449 1450 static void unmap_hotplug_pte_range(pmd_t *pmdp, unsigned long addr, 1451 unsigned long end, bool free_mapped, 1452 struct vmem_altmap *altmap) 1453 { 1454 pte_t *ptep, pte; 1455 1456 do { 1457 ptep = pte_offset_kernel(pmdp, addr); 1458 pte = __ptep_get(ptep); 1459 if (pte_none(pte)) 1460 continue; 1461 1462 WARN_ON(!pte_present(pte)); 1463 __pte_clear(&init_mm, addr, ptep); 1464 flush_tlb_kernel_range(addr, addr + PAGE_SIZE); 1465 if (free_mapped) 1466 free_hotplug_page_range(pte_page(pte), 1467 PAGE_SIZE, altmap); 1468 } while (addr += PAGE_SIZE, addr < end); 1469 } 1470 1471 static void unmap_hotplug_pmd_range(pud_t *pudp, unsigned long addr, 1472 unsigned long end, bool free_mapped, 1473 struct vmem_altmap *altmap) 1474 { 1475 unsigned long next; 1476 pmd_t *pmdp, pmd; 1477 1478 do { 1479 next = pmd_addr_end(addr, end); 1480 pmdp = pmd_offset(pudp, addr); 1481 pmd = READ_ONCE(*pmdp); 1482 if (pmd_none(pmd)) 1483 continue; 1484 1485 WARN_ON(!pmd_present(pmd)); 1486 if (pmd_sect(pmd)) { 1487 pmd_clear(pmdp); 1488 1489 /* 1490 * One TLBI should be sufficient here as the PMD_SIZE 1491 * range is mapped with a single block entry. 1492 */ 1493 flush_tlb_kernel_range(addr, addr + PAGE_SIZE); 1494 if (free_mapped) 1495 free_hotplug_page_range(pmd_page(pmd), 1496 PMD_SIZE, altmap); 1497 continue; 1498 } 1499 WARN_ON(!pmd_table(pmd)); 1500 unmap_hotplug_pte_range(pmdp, addr, next, free_mapped, altmap); 1501 } while (addr = next, addr < end); 1502 } 1503 1504 static void unmap_hotplug_pud_range(p4d_t *p4dp, unsigned long addr, 1505 unsigned long end, bool free_mapped, 1506 struct vmem_altmap *altmap) 1507 { 1508 unsigned long next; 1509 pud_t *pudp, pud; 1510 1511 do { 1512 next = pud_addr_end(addr, end); 1513 pudp = pud_offset(p4dp, addr); 1514 pud = READ_ONCE(*pudp); 1515 if (pud_none(pud)) 1516 continue; 1517 1518 WARN_ON(!pud_present(pud)); 1519 if (pud_sect(pud)) { 1520 pud_clear(pudp); 1521 1522 /* 1523 * One TLBI should be sufficient here as the PUD_SIZE 1524 * range is mapped with a single block entry. 1525 */ 1526 flush_tlb_kernel_range(addr, addr + PAGE_SIZE); 1527 if (free_mapped) 1528 free_hotplug_page_range(pud_page(pud), 1529 PUD_SIZE, altmap); 1530 continue; 1531 } 1532 WARN_ON(!pud_table(pud)); 1533 unmap_hotplug_pmd_range(pudp, addr, next, free_mapped, altmap); 1534 } while (addr = next, addr < end); 1535 } 1536 1537 static void unmap_hotplug_p4d_range(pgd_t *pgdp, unsigned long addr, 1538 unsigned long end, bool free_mapped, 1539 struct vmem_altmap *altmap) 1540 { 1541 unsigned long next; 1542 p4d_t *p4dp, p4d; 1543 1544 do { 1545 next = p4d_addr_end(addr, end); 1546 p4dp = p4d_offset(pgdp, addr); 1547 p4d = READ_ONCE(*p4dp); 1548 if (p4d_none(p4d)) 1549 continue; 1550 1551 WARN_ON(!p4d_present(p4d)); 1552 unmap_hotplug_pud_range(p4dp, addr, next, free_mapped, altmap); 1553 } while (addr = next, addr < end); 1554 } 1555 1556 static void unmap_hotplug_range(unsigned long addr, unsigned long end, 1557 bool free_mapped, struct vmem_altmap *altmap) 1558 { 1559 unsigned long next; 1560 pgd_t *pgdp, pgd; 1561 1562 /* 1563 * altmap can only be used as vmemmap mapping backing memory. 1564 * In case the backing memory itself is not being freed, then 1565 * altmap is irrelevant. Warn about this inconsistency when 1566 * encountered. 1567 */ 1568 WARN_ON(!free_mapped && altmap); 1569 1570 do { 1571 next = pgd_addr_end(addr, end); 1572 pgdp = pgd_offset_k(addr); 1573 pgd = READ_ONCE(*pgdp); 1574 if (pgd_none(pgd)) 1575 continue; 1576 1577 WARN_ON(!pgd_present(pgd)); 1578 unmap_hotplug_p4d_range(pgdp, addr, next, free_mapped, altmap); 1579 } while (addr = next, addr < end); 1580 } 1581 1582 static void free_empty_pte_table(pmd_t *pmdp, unsigned long addr, 1583 unsigned long end, unsigned long floor, 1584 unsigned long ceiling) 1585 { 1586 pte_t *ptep, pte; 1587 unsigned long i, start = addr; 1588 1589 do { 1590 ptep = pte_offset_kernel(pmdp, addr); 1591 pte = __ptep_get(ptep); 1592 1593 /* 1594 * This is just a sanity check here which verifies that 1595 * pte clearing has been done by earlier unmap loops. 1596 */ 1597 WARN_ON(!pte_none(pte)); 1598 } while (addr += PAGE_SIZE, addr < end); 1599 1600 if (!pgtable_range_aligned(start, end, floor, ceiling, PMD_MASK)) 1601 return; 1602 1603 /* 1604 * Check whether we can free the pte page if the rest of the 1605 * entries are empty. Overlap with other regions have been 1606 * handled by the floor/ceiling check. 1607 */ 1608 ptep = pte_offset_kernel(pmdp, 0UL); 1609 for (i = 0; i < PTRS_PER_PTE; i++) { 1610 if (!pte_none(__ptep_get(&ptep[i]))) 1611 return; 1612 } 1613 1614 pmd_clear(pmdp); 1615 __flush_tlb_kernel_pgtable(start); 1616 free_hotplug_pgtable_page(virt_to_page(ptep)); 1617 } 1618 1619 static void free_empty_pmd_table(pud_t *pudp, unsigned long addr, 1620 unsigned long end, unsigned long floor, 1621 unsigned long ceiling) 1622 { 1623 pmd_t *pmdp, pmd; 1624 unsigned long i, next, start = addr; 1625 1626 do { 1627 next = pmd_addr_end(addr, end); 1628 pmdp = pmd_offset(pudp, addr); 1629 pmd = READ_ONCE(*pmdp); 1630 if (pmd_none(pmd)) 1631 continue; 1632 1633 WARN_ON(!pmd_present(pmd) || !pmd_table(pmd) || pmd_sect(pmd)); 1634 free_empty_pte_table(pmdp, addr, next, floor, ceiling); 1635 } while (addr = next, addr < end); 1636 1637 if (CONFIG_PGTABLE_LEVELS <= 2) 1638 return; 1639 1640 if (!pgtable_range_aligned(start, end, floor, ceiling, PUD_MASK)) 1641 return; 1642 1643 /* 1644 * Check whether we can free the pmd page if the rest of the 1645 * entries are empty. Overlap with other regions have been 1646 * handled by the floor/ceiling check. 1647 */ 1648 pmdp = pmd_offset(pudp, 0UL); 1649 for (i = 0; i < PTRS_PER_PMD; i++) { 1650 if (!pmd_none(READ_ONCE(pmdp[i]))) 1651 return; 1652 } 1653 1654 pud_clear(pudp); 1655 __flush_tlb_kernel_pgtable(start); 1656 free_hotplug_pgtable_page(virt_to_page(pmdp)); 1657 } 1658 1659 static void free_empty_pud_table(p4d_t *p4dp, unsigned long addr, 1660 unsigned long end, unsigned long floor, 1661 unsigned long ceiling) 1662 { 1663 pud_t *pudp, pud; 1664 unsigned long i, next, start = addr; 1665 1666 do { 1667 next = pud_addr_end(addr, end); 1668 pudp = pud_offset(p4dp, addr); 1669 pud = READ_ONCE(*pudp); 1670 if (pud_none(pud)) 1671 continue; 1672 1673 WARN_ON(!pud_present(pud) || !pud_table(pud) || pud_sect(pud)); 1674 free_empty_pmd_table(pudp, addr, next, floor, ceiling); 1675 } while (addr = next, addr < end); 1676 1677 if (!pgtable_l4_enabled()) 1678 return; 1679 1680 if (!pgtable_range_aligned(start, end, floor, ceiling, P4D_MASK)) 1681 return; 1682 1683 /* 1684 * Check whether we can free the pud page if the rest of the 1685 * entries are empty. Overlap with other regions have been 1686 * handled by the floor/ceiling check. 1687 */ 1688 pudp = pud_offset(p4dp, 0UL); 1689 for (i = 0; i < PTRS_PER_PUD; i++) { 1690 if (!pud_none(READ_ONCE(pudp[i]))) 1691 return; 1692 } 1693 1694 p4d_clear(p4dp); 1695 __flush_tlb_kernel_pgtable(start); 1696 free_hotplug_pgtable_page(virt_to_page(pudp)); 1697 } 1698 1699 static void free_empty_p4d_table(pgd_t *pgdp, unsigned long addr, 1700 unsigned long end, unsigned long floor, 1701 unsigned long ceiling) 1702 { 1703 p4d_t *p4dp, p4d; 1704 unsigned long i, next, start = addr; 1705 1706 do { 1707 next = p4d_addr_end(addr, end); 1708 p4dp = p4d_offset(pgdp, addr); 1709 p4d = READ_ONCE(*p4dp); 1710 if (p4d_none(p4d)) 1711 continue; 1712 1713 WARN_ON(!p4d_present(p4d)); 1714 free_empty_pud_table(p4dp, addr, next, floor, ceiling); 1715 } while (addr = next, addr < end); 1716 1717 if (!pgtable_l5_enabled()) 1718 return; 1719 1720 if (!pgtable_range_aligned(start, end, floor, ceiling, PGDIR_MASK)) 1721 return; 1722 1723 /* 1724 * Check whether we can free the p4d page if the rest of the 1725 * entries are empty. Overlap with other regions have been 1726 * handled by the floor/ceiling check. 1727 */ 1728 p4dp = p4d_offset(pgdp, 0UL); 1729 for (i = 0; i < PTRS_PER_P4D; i++) { 1730 if (!p4d_none(READ_ONCE(p4dp[i]))) 1731 return; 1732 } 1733 1734 pgd_clear(pgdp); 1735 __flush_tlb_kernel_pgtable(start); 1736 free_hotplug_pgtable_page(virt_to_page(p4dp)); 1737 } 1738 1739 static void free_empty_tables(unsigned long addr, unsigned long end, 1740 unsigned long floor, unsigned long ceiling) 1741 { 1742 unsigned long next; 1743 pgd_t *pgdp, pgd; 1744 1745 do { 1746 next = pgd_addr_end(addr, end); 1747 pgdp = pgd_offset_k(addr); 1748 pgd = READ_ONCE(*pgdp); 1749 if (pgd_none(pgd)) 1750 continue; 1751 1752 WARN_ON(!pgd_present(pgd)); 1753 free_empty_p4d_table(pgdp, addr, next, floor, ceiling); 1754 } while (addr = next, addr < end); 1755 } 1756 #endif 1757 1758 void __meminit vmemmap_set_pmd(pmd_t *pmdp, void *p, int node, 1759 unsigned long addr, unsigned long next) 1760 { 1761 pmd_set_huge(pmdp, __pa(p), __pgprot(PROT_SECT_NORMAL)); 1762 } 1763 1764 int __meminit vmemmap_check_pmd(pmd_t *pmdp, int node, 1765 unsigned long addr, unsigned long next) 1766 { 1767 vmemmap_verify((pte_t *)pmdp, node, addr, next); 1768 1769 return pmd_sect(READ_ONCE(*pmdp)); 1770 } 1771 1772 int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node, 1773 struct vmem_altmap *altmap) 1774 { 1775 WARN_ON((start < VMEMMAP_START) || (end > VMEMMAP_END)); 1776 /* [start, end] should be within one section */ 1777 WARN_ON_ONCE(end - start > PAGES_PER_SECTION * sizeof(struct page)); 1778 1779 if (!IS_ENABLED(CONFIG_ARM64_4K_PAGES) || 1780 (end - start < PAGES_PER_SECTION * sizeof(struct page))) 1781 return vmemmap_populate_basepages(start, end, node, altmap); 1782 else 1783 return vmemmap_populate_hugepages(start, end, node, altmap); 1784 } 1785 1786 #ifdef CONFIG_MEMORY_HOTPLUG 1787 void vmemmap_free(unsigned long start, unsigned long end, 1788 struct vmem_altmap *altmap) 1789 { 1790 WARN_ON((start < VMEMMAP_START) || (end > VMEMMAP_END)); 1791 1792 unmap_hotplug_range(start, end, true, altmap); 1793 free_empty_tables(start, end, VMEMMAP_START, VMEMMAP_END); 1794 } 1795 #endif /* CONFIG_MEMORY_HOTPLUG */ 1796 1797 int pud_set_huge(pud_t *pudp, phys_addr_t phys, pgprot_t prot) 1798 { 1799 pud_t new_pud = pfn_pud(__phys_to_pfn(phys), mk_pud_sect_prot(prot)); 1800 1801 /* Only allow permission changes for now */ 1802 if (!pgattr_change_is_safe(READ_ONCE(pud_val(*pudp)), 1803 pud_val(new_pud))) 1804 return 0; 1805 1806 VM_BUG_ON(phys & ~PUD_MASK); 1807 set_pud(pudp, new_pud); 1808 return 1; 1809 } 1810 1811 int pmd_set_huge(pmd_t *pmdp, phys_addr_t phys, pgprot_t prot) 1812 { 1813 pmd_t new_pmd = pfn_pmd(__phys_to_pfn(phys), mk_pmd_sect_prot(prot)); 1814 1815 /* Only allow permission changes for now */ 1816 if (!pgattr_change_is_safe(READ_ONCE(pmd_val(*pmdp)), 1817 pmd_val(new_pmd))) 1818 return 0; 1819 1820 VM_BUG_ON(phys & ~PMD_MASK); 1821 set_pmd(pmdp, new_pmd); 1822 return 1; 1823 } 1824 1825 #ifndef __PAGETABLE_P4D_FOLDED 1826 void p4d_clear_huge(p4d_t *p4dp) 1827 { 1828 } 1829 #endif 1830 1831 int pud_clear_huge(pud_t *pudp) 1832 { 1833 if (!pud_sect(READ_ONCE(*pudp))) 1834 return 0; 1835 pud_clear(pudp); 1836 return 1; 1837 } 1838 1839 int pmd_clear_huge(pmd_t *pmdp) 1840 { 1841 if (!pmd_sect(READ_ONCE(*pmdp))) 1842 return 0; 1843 pmd_clear(pmdp); 1844 return 1; 1845 } 1846 1847 static int __pmd_free_pte_page(pmd_t *pmdp, unsigned long addr, 1848 bool acquire_mmap_lock) 1849 { 1850 pte_t *table; 1851 pmd_t pmd; 1852 1853 pmd = READ_ONCE(*pmdp); 1854 1855 if (!pmd_table(pmd)) { 1856 VM_WARN_ON(1); 1857 return 1; 1858 } 1859 1860 /* See comment in pud_free_pmd_page for static key logic */ 1861 table = pte_offset_kernel(pmdp, addr); 1862 pmd_clear(pmdp); 1863 __flush_tlb_kernel_pgtable(addr); 1864 if (static_branch_unlikely(&arm64_ptdump_lock_key) && acquire_mmap_lock) { 1865 mmap_read_lock(&init_mm); 1866 mmap_read_unlock(&init_mm); 1867 } 1868 1869 pte_free_kernel(NULL, table); 1870 return 1; 1871 } 1872 1873 int pmd_free_pte_page(pmd_t *pmdp, unsigned long addr) 1874 { 1875 /* If ptdump is walking the pagetables, acquire init_mm.mmap_lock */ 1876 return __pmd_free_pte_page(pmdp, addr, /* acquire_mmap_lock = */ true); 1877 } 1878 1879 int pud_free_pmd_page(pud_t *pudp, unsigned long addr) 1880 { 1881 pmd_t *table; 1882 pmd_t *pmdp; 1883 pud_t pud; 1884 unsigned long next, end; 1885 1886 pud = READ_ONCE(*pudp); 1887 1888 if (!pud_table(pud)) { 1889 VM_WARN_ON(1); 1890 return 1; 1891 } 1892 1893 table = pmd_offset(pudp, addr); 1894 1895 /* 1896 * Our objective is to prevent ptdump from reading a PMD table which has 1897 * been freed. In this race, if pud_free_pmd_page observes the key on 1898 * (which got flipped by ptdump) then the mmap lock sequence here will, 1899 * as a result of the mmap write lock/unlock sequence in ptdump, give 1900 * us the correct synchronization. If not, this means that ptdump has 1901 * yet not started walking the pagetables - the sequence of barriers 1902 * issued by __flush_tlb_kernel_pgtable() guarantees that ptdump will 1903 * observe an empty PUD. 1904 */ 1905 pud_clear(pudp); 1906 __flush_tlb_kernel_pgtable(addr); 1907 if (static_branch_unlikely(&arm64_ptdump_lock_key)) { 1908 mmap_read_lock(&init_mm); 1909 mmap_read_unlock(&init_mm); 1910 } 1911 1912 pmdp = table; 1913 next = addr; 1914 end = addr + PUD_SIZE; 1915 do { 1916 if (pmd_present(pmdp_get(pmdp))) 1917 /* 1918 * PMD has been isolated, so ptdump won't see it. No 1919 * need to acquire init_mm.mmap_lock. 1920 */ 1921 __pmd_free_pte_page(pmdp, next, /* acquire_mmap_lock = */ false); 1922 } while (pmdp++, next += PMD_SIZE, next != end); 1923 1924 pmd_free(NULL, table); 1925 return 1; 1926 } 1927 1928 #ifdef CONFIG_MEMORY_HOTPLUG 1929 static void __remove_pgd_mapping(pgd_t *pgdir, unsigned long start, u64 size) 1930 { 1931 unsigned long end = start + size; 1932 1933 WARN_ON(pgdir != init_mm.pgd); 1934 WARN_ON((start < PAGE_OFFSET) || (end > PAGE_END)); 1935 1936 unmap_hotplug_range(start, end, false, NULL); 1937 free_empty_tables(start, end, PAGE_OFFSET, PAGE_END); 1938 } 1939 1940 struct range arch_get_mappable_range(void) 1941 { 1942 struct range mhp_range; 1943 phys_addr_t start_linear_pa = __pa(_PAGE_OFFSET(vabits_actual)); 1944 phys_addr_t end_linear_pa = __pa(PAGE_END - 1); 1945 1946 if (IS_ENABLED(CONFIG_RANDOMIZE_BASE)) { 1947 /* 1948 * Check for a wrap, it is possible because of randomized linear 1949 * mapping the start physical address is actually bigger than 1950 * the end physical address. In this case set start to zero 1951 * because [0, end_linear_pa] range must still be able to cover 1952 * all addressable physical addresses. 1953 */ 1954 if (start_linear_pa > end_linear_pa) 1955 start_linear_pa = 0; 1956 } 1957 1958 WARN_ON(start_linear_pa > end_linear_pa); 1959 1960 /* 1961 * Linear mapping region is the range [PAGE_OFFSET..(PAGE_END - 1)] 1962 * accommodating both its ends but excluding PAGE_END. Max physical 1963 * range which can be mapped inside this linear mapping range, must 1964 * also be derived from its end points. 1965 */ 1966 mhp_range.start = start_linear_pa; 1967 mhp_range.end = end_linear_pa; 1968 1969 return mhp_range; 1970 } 1971 1972 int arch_add_memory(int nid, u64 start, u64 size, 1973 struct mhp_params *params) 1974 { 1975 int ret, flags = NO_EXEC_MAPPINGS; 1976 1977 VM_BUG_ON(!mhp_range_allowed(start, size, true)); 1978 1979 if (force_pte_mapping()) 1980 flags |= NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS; 1981 1982 ret = __create_pgd_mapping(swapper_pg_dir, start, __phys_to_virt(start), 1983 size, params->pgprot, pgd_pgtable_alloc_init_mm, 1984 flags); 1985 if (ret) 1986 goto err; 1987 1988 memblock_clear_nomap(start, size); 1989 1990 ret = __add_pages(nid, start >> PAGE_SHIFT, size >> PAGE_SHIFT, 1991 params); 1992 if (ret) 1993 goto err; 1994 1995 /* Address of hotplugged memory can be smaller */ 1996 max_pfn = max(max_pfn, PFN_UP(start + size)); 1997 max_low_pfn = max_pfn; 1998 1999 return 0; 2000 2001 err: 2002 __remove_pgd_mapping(swapper_pg_dir, 2003 __phys_to_virt(start), size); 2004 return ret; 2005 } 2006 2007 void arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap) 2008 { 2009 unsigned long start_pfn = start >> PAGE_SHIFT; 2010 unsigned long nr_pages = size >> PAGE_SHIFT; 2011 2012 __remove_pages(start_pfn, nr_pages, altmap); 2013 __remove_pgd_mapping(swapper_pg_dir, __phys_to_virt(start), size); 2014 } 2015 2016 /* 2017 * This memory hotplug notifier helps prevent boot memory from being 2018 * inadvertently removed as it blocks pfn range offlining process in 2019 * __offline_pages(). Hence this prevents both offlining as well as 2020 * removal process for boot memory which is initially always online. 2021 * In future if and when boot memory could be removed, this notifier 2022 * should be dropped and free_hotplug_page_range() should handle any 2023 * reserved pages allocated during boot. 2024 */ 2025 static int prevent_bootmem_remove_notifier(struct notifier_block *nb, 2026 unsigned long action, void *data) 2027 { 2028 struct mem_section *ms; 2029 struct memory_notify *arg = data; 2030 unsigned long end_pfn = arg->start_pfn + arg->nr_pages; 2031 unsigned long pfn = arg->start_pfn; 2032 2033 if ((action != MEM_GOING_OFFLINE) && (action != MEM_OFFLINE)) 2034 return NOTIFY_OK; 2035 2036 for (; pfn < end_pfn; pfn += PAGES_PER_SECTION) { 2037 unsigned long start = PFN_PHYS(pfn); 2038 unsigned long end = start + (1UL << PA_SECTION_SHIFT); 2039 2040 ms = __pfn_to_section(pfn); 2041 if (!early_section(ms)) 2042 continue; 2043 2044 if (action == MEM_GOING_OFFLINE) { 2045 /* 2046 * Boot memory removal is not supported. Prevent 2047 * it via blocking any attempted offline request 2048 * for the boot memory and just report it. 2049 */ 2050 pr_warn("Boot memory [%lx %lx] offlining attempted\n", start, end); 2051 return NOTIFY_BAD; 2052 } else if (action == MEM_OFFLINE) { 2053 /* 2054 * This should have never happened. Boot memory 2055 * offlining should have been prevented by this 2056 * very notifier. Probably some memory removal 2057 * procedure might have changed which would then 2058 * require further debug. 2059 */ 2060 pr_err("Boot memory [%lx %lx] offlined\n", start, end); 2061 2062 /* 2063 * Core memory hotplug does not process a return 2064 * code from the notifier for MEM_OFFLINE events. 2065 * The error condition has been reported. Return 2066 * from here as if ignored. 2067 */ 2068 return NOTIFY_DONE; 2069 } 2070 } 2071 return NOTIFY_OK; 2072 } 2073 2074 static struct notifier_block prevent_bootmem_remove_nb = { 2075 .notifier_call = prevent_bootmem_remove_notifier, 2076 }; 2077 2078 /* 2079 * This ensures that boot memory sections on the platform are online 2080 * from early boot. Memory sections could not be prevented from being 2081 * offlined, unless for some reason they are not online to begin with. 2082 * This helps validate the basic assumption on which the above memory 2083 * event notifier works to prevent boot memory section offlining and 2084 * its possible removal. 2085 */ 2086 static void validate_bootmem_online(void) 2087 { 2088 phys_addr_t start, end, addr; 2089 struct mem_section *ms; 2090 u64 i; 2091 2092 /* 2093 * Scanning across all memblock might be expensive 2094 * on some big memory systems. Hence enable this 2095 * validation only with DEBUG_VM. 2096 */ 2097 if (!IS_ENABLED(CONFIG_DEBUG_VM)) 2098 return; 2099 2100 for_each_mem_range(i, &start, &end) { 2101 for (addr = start; addr < end; addr += (1UL << PA_SECTION_SHIFT)) { 2102 ms = __pfn_to_section(PHYS_PFN(addr)); 2103 2104 /* 2105 * All memory ranges in the system at this point 2106 * should have been marked as early sections. 2107 */ 2108 WARN_ON(!early_section(ms)); 2109 2110 /* 2111 * Memory notifier mechanism here to prevent boot 2112 * memory offlining depends on the fact that each 2113 * early section memory on the system is initially 2114 * online. Otherwise a given memory section which 2115 * is already offline will be overlooked and can 2116 * be removed completely. Call out such sections. 2117 */ 2118 if (!online_section(ms)) 2119 pr_err("Boot memory [%llx %llx] is offline, can be removed\n", 2120 addr, addr + (1UL << PA_SECTION_SHIFT)); 2121 } 2122 } 2123 } 2124 2125 static int __init prevent_bootmem_remove_init(void) 2126 { 2127 int ret = 0; 2128 2129 if (!IS_ENABLED(CONFIG_MEMORY_HOTREMOVE)) 2130 return ret; 2131 2132 validate_bootmem_online(); 2133 ret = register_memory_notifier(&prevent_bootmem_remove_nb); 2134 if (ret) 2135 pr_err("%s: Notifier registration failed %d\n", __func__, ret); 2136 2137 return ret; 2138 } 2139 early_initcall(prevent_bootmem_remove_init); 2140 #endif 2141 2142 pte_t modify_prot_start_ptes(struct vm_area_struct *vma, unsigned long addr, 2143 pte_t *ptep, unsigned int nr) 2144 { 2145 pte_t pte = get_and_clear_ptes(vma->vm_mm, addr, ptep, nr); 2146 2147 if (alternative_has_cap_unlikely(ARM64_WORKAROUND_2645198)) { 2148 /* 2149 * Break-before-make (BBM) is required for all user space mappings 2150 * when the permission changes from executable to non-executable 2151 * in cases where cpu is affected with errata #2645198. 2152 */ 2153 if (pte_accessible(vma->vm_mm, pte) && pte_user_exec(pte)) 2154 __flush_tlb_range(vma, addr, nr * PAGE_SIZE, 2155 PAGE_SIZE, true, 3); 2156 } 2157 2158 return pte; 2159 } 2160 2161 pte_t ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) 2162 { 2163 return modify_prot_start_ptes(vma, addr, ptep, 1); 2164 } 2165 2166 void modify_prot_commit_ptes(struct vm_area_struct *vma, unsigned long addr, 2167 pte_t *ptep, pte_t old_pte, pte_t pte, 2168 unsigned int nr) 2169 { 2170 set_ptes(vma->vm_mm, addr, ptep, pte, nr); 2171 } 2172 2173 void ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, 2174 pte_t old_pte, pte_t pte) 2175 { 2176 modify_prot_commit_ptes(vma, addr, ptep, old_pte, pte, 1); 2177 } 2178 2179 /* 2180 * Atomically replaces the active TTBR1_EL1 PGD with a new VA-compatible PGD, 2181 * avoiding the possibility of conflicting TLB entries being allocated. 2182 */ 2183 void __cpu_replace_ttbr1(pgd_t *pgdp, bool cnp) 2184 { 2185 typedef void (ttbr_replace_func)(phys_addr_t); 2186 extern ttbr_replace_func idmap_cpu_replace_ttbr1; 2187 ttbr_replace_func *replace_phys; 2188 unsigned long daif; 2189 2190 /* phys_to_ttbr() zeros lower 2 bits of ttbr with 52-bit PA */ 2191 phys_addr_t ttbr1 = phys_to_ttbr(virt_to_phys(pgdp)); 2192 2193 if (cnp) 2194 ttbr1 |= TTBR_CNP_BIT; 2195 2196 replace_phys = (void *)__pa_symbol(idmap_cpu_replace_ttbr1); 2197 2198 cpu_install_idmap(); 2199 2200 /* 2201 * We really don't want to take *any* exceptions while TTBR1 is 2202 * in the process of being replaced so mask everything. 2203 */ 2204 daif = local_daif_save(); 2205 replace_phys(ttbr1); 2206 local_daif_restore(daif); 2207 2208 cpu_uninstall_idmap(); 2209 } 2210 2211 #ifdef CONFIG_ARCH_HAS_PKEYS 2212 int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, unsigned long init_val) 2213 { 2214 u64 new_por; 2215 u64 old_por; 2216 2217 if (!system_supports_poe()) 2218 return -ENOSPC; 2219 2220 /* 2221 * This code should only be called with valid 'pkey' 2222 * values originating from in-kernel users. Complain 2223 * if a bad value is observed. 2224 */ 2225 if (WARN_ON_ONCE(pkey >= arch_max_pkey())) 2226 return -EINVAL; 2227 2228 /* Set the bits we need in POR: */ 2229 new_por = POE_RWX; 2230 if (init_val & PKEY_DISABLE_WRITE) 2231 new_por &= ~POE_W; 2232 if (init_val & PKEY_DISABLE_ACCESS) 2233 new_por &= ~POE_RW; 2234 if (init_val & PKEY_DISABLE_READ) 2235 new_por &= ~POE_R; 2236 if (init_val & PKEY_DISABLE_EXECUTE) 2237 new_por &= ~POE_X; 2238 2239 /* Shift the bits in to the correct place in POR for pkey: */ 2240 new_por = POR_ELx_PERM_PREP(pkey, new_por); 2241 2242 /* Get old POR and mask off any old bits in place: */ 2243 old_por = read_sysreg_s(SYS_POR_EL0); 2244 old_por &= ~(POE_MASK << POR_ELx_PERM_SHIFT(pkey)); 2245 2246 /* Write old part along with new part: */ 2247 write_sysreg_s(old_por | new_por, SYS_POR_EL0); 2248 2249 return 0; 2250 } 2251 #endif 2252