// SPDX-License-Identifier: GPL-2.0
#include <linux/mm.h>
#include <linux/gfp.h>
#include <linux/hugetlb.h>
#include <asm/pgalloc.h>
#include <asm/tlb.h>
#include <asm/fixmap.h>
#include <asm/mtrr.h>

#ifdef CONFIG_DYNAMIC_PHYSICAL_MASK
phys_addr_t physical_mask __ro_after_init = (1ULL << __PHYSICAL_MASK_SHIFT) - 1;
EXPORT_SYMBOL(physical_mask);
#endif

pgtable_t pte_alloc_one(struct mm_struct *mm)
{
	return __pte_alloc_one(mm, GFP_PGTABLE_USER);
}

void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte)
{
	paravirt_release_pte(page_to_pfn(pte));
	tlb_remove_ptdesc(tlb, page_ptdesc(pte));
}

#if CONFIG_PGTABLE_LEVELS > 2
void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd)
{
	paravirt_release_pmd(__pa(pmd) >> PAGE_SHIFT);
	/*
	 * NOTE! For PAE, any changes to the top page-directory-pointer-table
	 * entries need a full cr3 reload to flush.
	 */
#ifdef CONFIG_X86_PAE
	tlb->need_flush_all = 1;
#endif
	tlb_remove_ptdesc(tlb, virt_to_ptdesc(pmd));
}

#if CONFIG_PGTABLE_LEVELS > 3
void ___pud_free_tlb(struct mmu_gather *tlb, pud_t *pud)
{
	paravirt_release_pud(__pa(pud) >> PAGE_SHIFT);
	tlb_remove_ptdesc(tlb, virt_to_ptdesc(pud));
}

#if CONFIG_PGTABLE_LEVELS > 4
void ___p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d)
{
	paravirt_release_p4d(__pa(p4d) >> PAGE_SHIFT);
	tlb_remove_ptdesc(tlb, virt_to_ptdesc(p4d));
}
#endif	/* CONFIG_PGTABLE_LEVELS > 4 */
#endif	/* CONFIG_PGTABLE_LEVELS > 3 */
#endif	/* CONFIG_PGTABLE_LEVELS > 2 */

static inline void pgd_list_add(pgd_t *pgd)
{
	struct ptdesc *ptdesc = virt_to_ptdesc(pgd);

	list_add(&ptdesc->pt_list, &pgd_list);
}

static inline void pgd_list_del(pgd_t *pgd)
{
	struct ptdesc *ptdesc = virt_to_ptdesc(pgd);

	list_del(&ptdesc->pt_list);
}

#define UNSHARED_PTRS_PER_PGD				\
	(SHARED_KERNEL_PMD ? KERNEL_PGD_BOUNDARY : PTRS_PER_PGD)
#define MAX_UNSHARED_PTRS_PER_PGD			\
	MAX_T(size_t, KERNEL_PGD_BOUNDARY, PTRS_PER_PGD)


static void pgd_set_mm(pgd_t *pgd, struct mm_struct *mm)
{
	virt_to_ptdesc(pgd)->pt_mm = mm;
}

struct mm_struct *pgd_page_get_mm(struct page *page)
{
	return page_ptdesc(page)->pt_mm;
}

static void pgd_ctor(struct mm_struct *mm, pgd_t *pgd)
{
	/* If the pgd points to a shared pagetable level (either the
	   ptes in non-PAE, or shared PMD in PAE), then just copy the
	   references from swapper_pg_dir. */
	if (CONFIG_PGTABLE_LEVELS == 2 ||
	    (CONFIG_PGTABLE_LEVELS == 3 && SHARED_KERNEL_PMD) ||
	    CONFIG_PGTABLE_LEVELS >= 4) {
		clone_pgd_range(pgd + KERNEL_PGD_BOUNDARY,
				swapper_pg_dir + KERNEL_PGD_BOUNDARY,
				KERNEL_PGD_PTRS);
	}

	/* list required to sync kernel mapping updates */
	if (!SHARED_KERNEL_PMD) {
		pgd_set_mm(pgd, mm);
		pgd_list_add(pgd);
	}
}

static void pgd_dtor(pgd_t *pgd)
{
	if (SHARED_KERNEL_PMD)
		return;

	spin_lock(&pgd_lock);
	pgd_list_del(pgd);
	spin_unlock(&pgd_lock);
}

/*
 * List of all pgd's needed for non-PAE so it can invalidate entries
 * in both cached and uncached pgd's; not needed for PAE since the
 * kernel pmd is shared. If PAE were not to share the pmd a similar
 * tactic would be needed. This is essentially codepath-based locking
 * against pageattr.c; it is the unique case in which a valid change
 * of kernel pagetables can't be lazily synchronized by vmalloc faults.
 * vmalloc faults work because attached pagetables are never freed.
 * -- nyc
 */

#ifdef CONFIG_X86_PAE
/*
 * In PAE mode, we need to do a cr3 reload (=tlb flush) when
 * updating the top-level pagetable entries to guarantee the
 * processor notices the update. Since this is expensive, and
 * all 4 top-level entries are used almost immediately in a
 * new process's life, we just pre-populate them here.
 *
 * Also, if we're in a paravirt environment where the kernel pmd is
 * not shared between pagetables (!SHARED_KERNEL_PMD), we allocate
 * and initialize the kernel pmds here.
 */
#define PREALLOCATED_PMDS	UNSHARED_PTRS_PER_PGD
#define MAX_PREALLOCATED_PMDS	MAX_UNSHARED_PTRS_PER_PGD

/*
 * We allocate separate PMDs for the kernel part of the user page-table
 * when PTI is enabled. We need them to map the per-process LDT into the
 * user-space page-table.
 */
#define PREALLOCATED_USER_PMDS	(boot_cpu_has(X86_FEATURE_PTI) ? \
					KERNEL_PGD_PTRS : 0)
#define MAX_PREALLOCATED_USER_PMDS KERNEL_PGD_PTRS

void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd)
{
	paravirt_alloc_pmd(mm, __pa(pmd) >> PAGE_SHIFT);

	/* Note: almost everything apart from _PAGE_PRESENT is
	   reserved at the pmd (PDPT) level. */
	set_pud(pudp, __pud(__pa(pmd) | _PAGE_PRESENT));

	/*
	 * According to Intel App note "TLBs, Paging-Structure Caches,
	 * and Their Invalidation", April 2007, document 317080-001,
	 * section 8.1: in PAE mode we explicitly have to flush the
	 * TLB via cr3 if the top-level pgd is changed...
	 */
	flush_tlb_mm(mm);
}
#else  /* !CONFIG_X86_PAE */

/* No need to prepopulate any pagetable entries in non-PAE modes. */
#define PREALLOCATED_PMDS	0
#define MAX_PREALLOCATED_PMDS	0
#define PREALLOCATED_USER_PMDS	0
#define MAX_PREALLOCATED_USER_PMDS 0
#endif	/* CONFIG_X86_PAE */

static void free_pmds(struct mm_struct *mm, pmd_t *pmds[], int count)
{
	int i;
	struct ptdesc *ptdesc;

	for (i = 0; i < count; i++)
		if (pmds[i]) {
			ptdesc = virt_to_ptdesc(pmds[i]);

			pagetable_dtor(ptdesc);
			pagetable_free(ptdesc);
			mm_dec_nr_pmds(mm);
		}
}

static int preallocate_pmds(struct mm_struct *mm, pmd_t *pmds[], int count)
{
	int i;
	bool failed = false;
	gfp_t gfp = GFP_PGTABLE_USER;

	if (mm == &init_mm)
		gfp &= ~__GFP_ACCOUNT;
	gfp &= ~__GFP_HIGHMEM;

	for (i = 0; i < count; i++) {
		pmd_t *pmd = NULL;
		struct ptdesc *ptdesc = pagetable_alloc(gfp, 0);

		if (!ptdesc)
			failed = true;
		if (ptdesc && !pagetable_pmd_ctor(ptdesc)) {
			pagetable_free(ptdesc);
			ptdesc = NULL;
			failed = true;
		}
		if (ptdesc) {
			mm_inc_nr_pmds(mm);
			pmd = ptdesc_address(ptdesc);
		}

		pmds[i] = pmd;
	}

	if (failed) {
		free_pmds(mm, pmds, count);
		return -ENOMEM;
	}

	return 0;
}

/*
 * Mop up any pmd pages which may still be attached to the pgd.
 * Normally they will be freed by munmap/exit_mmap, but any pmd we
 * preallocate which never got a corresponding vma will need to be
 * freed manually.
 */
static void mop_up_one_pmd(struct mm_struct *mm, pgd_t *pgdp)
{
	pgd_t pgd = *pgdp;

	if (pgd_val(pgd) != 0) {
		pmd_t *pmd = (pmd_t *)pgd_page_vaddr(pgd);

		pgd_clear(pgdp);

		paravirt_release_pmd(pgd_val(pgd) >> PAGE_SHIFT);
		pmd_free(mm, pmd);
		mm_dec_nr_pmds(mm);
	}
}

static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp)
{
	int i;

	for (i = 0; i < PREALLOCATED_PMDS; i++)
		mop_up_one_pmd(mm, &pgdp[i]);

#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION

	if (!boot_cpu_has(X86_FEATURE_PTI))
		return;

	pgdp = kernel_to_user_pgdp(pgdp);

	for (i = 0; i < PREALLOCATED_USER_PMDS; i++)
		mop_up_one_pmd(mm, &pgdp[i + KERNEL_PGD_BOUNDARY]);
#endif
}

static void pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmds[])
{
	p4d_t *p4d;
	pud_t *pud;
	int i;

	p4d = p4d_offset(pgd, 0);
	pud = pud_offset(p4d, 0);

	for (i = 0; i < PREALLOCATED_PMDS; i++, pud++) {
		pmd_t *pmd = pmds[i];

		if (i >= KERNEL_PGD_BOUNDARY)
			memcpy(pmd, (pmd_t *)pgd_page_vaddr(swapper_pg_dir[i]),
			       sizeof(pmd_t) * PTRS_PER_PMD);

		pud_populate(mm, pud, pmd);
	}
}

#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION
static void pgd_prepopulate_user_pmd(struct mm_struct *mm,
				     pgd_t *k_pgd, pmd_t *pmds[])
{
	pgd_t *s_pgd = kernel_to_user_pgdp(swapper_pg_dir);
	pgd_t *u_pgd = kernel_to_user_pgdp(k_pgd);
	p4d_t *u_p4d;
	pud_t *u_pud;
	int i;

	u_p4d = p4d_offset(u_pgd, 0);
	u_pud = pud_offset(u_p4d, 0);

	s_pgd += KERNEL_PGD_BOUNDARY;
	u_pud += KERNEL_PGD_BOUNDARY;

	for (i = 0; i < PREALLOCATED_USER_PMDS; i++, u_pud++, s_pgd++) {
		pmd_t *pmd = pmds[i];

		memcpy(pmd, (pmd_t *)pgd_page_vaddr(*s_pgd),
		       sizeof(pmd_t) * PTRS_PER_PMD);

		pud_populate(mm, u_pud, pmd);
	}

}
#else
static void pgd_prepopulate_user_pmd(struct mm_struct *mm,
				     pgd_t *k_pgd, pmd_t *pmds[])
{
}
#endif
/*
 * Xen paravirt assumes that the pgd table fits in one page, and 64-bit
 * kernels also assume that the pgd is one page.
 *
 * But a kernel with PAE paging that is not running as a Xen domain
 * only needs to allocate 32 bytes for the pgd instead of one page.
 */
#ifdef CONFIG_X86_PAE

#include <linux/slab.h>

#define PGD_SIZE	(PTRS_PER_PGD * sizeof(pgd_t))
#define PGD_ALIGN	32

static struct kmem_cache *pgd_cache;

void __init pgtable_cache_init(void)
{
	/*
	 * When a PAE kernel is running as a Xen domain, it does not use
	 * a shared kernel pmd, and that requires a whole page for the pgd.
	 */
	if (!SHARED_KERNEL_PMD)
		return;

	/*
	 * When a PAE kernel is not running as a Xen domain, it uses a
	 * shared kernel pmd, which does not require a whole page for the
	 * pgd: 32 bytes are enough. Create a 32-byte slab cache at boot
	 * time for pgd table allocations.
	 */
	pgd_cache = kmem_cache_create("pgd_cache", PGD_SIZE, PGD_ALIGN,
				      SLAB_PANIC, NULL);
}

static inline pgd_t *_pgd_alloc(struct mm_struct *mm)
{
	/*
	 * Without SHARED_KERNEL_PMD, the PAE kernel is running as a Xen
	 * domain, so allocate a whole page for the pgd.
	 */
	if (!SHARED_KERNEL_PMD)
		return __pgd_alloc(mm, pgd_allocation_order());

	/*
	 * The PAE kernel is not running as a Xen domain, so a 32-byte
	 * slab allocation is enough for the pgd and saves memory.
	 */
	return kmem_cache_alloc(pgd_cache, GFP_PGTABLE_USER);
}

static inline void _pgd_free(struct mm_struct *mm, pgd_t *pgd)
{
	if (!SHARED_KERNEL_PMD)
		__pgd_free(mm, pgd);
	else
		kmem_cache_free(pgd_cache, pgd);
}
#else

static inline pgd_t *_pgd_alloc(struct mm_struct *mm)
{
	return __pgd_alloc(mm, pgd_allocation_order());
}

static inline void _pgd_free(struct mm_struct *mm, pgd_t *pgd)
{
	__pgd_free(mm, pgd);
}
#endif /* CONFIG_X86_PAE */

pgd_t *pgd_alloc(struct mm_struct *mm)
{
	pgd_t *pgd;
	pmd_t *u_pmds[MAX_PREALLOCATED_USER_PMDS];
	pmd_t *pmds[MAX_PREALLOCATED_PMDS];

	pgd = _pgd_alloc(mm);

	if (pgd == NULL)
		goto out;

	mm->pgd = pgd;

	if (sizeof(pmds) != 0 &&
			preallocate_pmds(mm, pmds, PREALLOCATED_PMDS) != 0)
		goto out_free_pgd;

	if (sizeof(u_pmds) != 0 &&
			preallocate_pmds(mm, u_pmds, PREALLOCATED_USER_PMDS) != 0)
		goto out_free_pmds;

	if (paravirt_pgd_alloc(mm) != 0)
		goto out_free_user_pmds;

	/*
	 * Make sure that pre-populating the pmds is atomic with
	 * respect to anything walking the pgd_list, so that they
	 * never see a partially populated pgd.
	 */
	spin_lock(&pgd_lock);

	pgd_ctor(mm, pgd);
	if (sizeof(pmds) != 0)
		pgd_prepopulate_pmd(mm, pgd, pmds);

	if (sizeof(u_pmds) != 0)
		pgd_prepopulate_user_pmd(mm, pgd, u_pmds);

	spin_unlock(&pgd_lock);

	return pgd;

out_free_user_pmds:
	if (sizeof(u_pmds) != 0)
		free_pmds(mm, u_pmds, PREALLOCATED_USER_PMDS);
out_free_pmds:
	if (sizeof(pmds) != 0)
		free_pmds(mm, pmds, PREALLOCATED_PMDS);
out_free_pgd:
	_pgd_free(mm, pgd);
out:
	return NULL;
}

void pgd_free(struct mm_struct *mm, pgd_t *pgd)
{
	pgd_mop_up_pmds(mm, pgd);
	pgd_dtor(pgd);
	paravirt_pgd_free(mm, pgd);
	_pgd_free(mm, pgd);
}

/*
 * Used to set accessed or dirty bits in the page table entries
 * on other architectures. On x86, the accessed and dirty bits
 * are tracked by hardware. However, do_wp_page calls this function
 * to also make the pte writeable at the same time the dirty bit is
 * set. In that case we do actually need to write the PTE.
 */
int ptep_set_access_flags(struct vm_area_struct *vma,
			  unsigned long address, pte_t *ptep,
			  pte_t entry, int dirty)
{
	int changed = !pte_same(*ptep, entry);

	if (changed && dirty)
		set_pte(ptep, entry);

	return changed;
}

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
int pmdp_set_access_flags(struct vm_area_struct *vma,
			  unsigned long address, pmd_t *pmdp,
			  pmd_t entry, int dirty)
{
	int changed = !pmd_same(*pmdp, entry);

	VM_BUG_ON(address & ~HPAGE_PMD_MASK);

	if (changed && dirty) {
		set_pmd(pmdp, entry);
		/*
		 * We had a write-protection fault here and changed the pmd
		 * to be more permissive. No need to flush the TLB for that,
		 * #PF is architecturally guaranteed to do that and in the
		 * worst-case we'll generate a spurious fault.
		 */
	}

	return changed;
}

int pudp_set_access_flags(struct vm_area_struct *vma, unsigned long address,
			  pud_t *pudp, pud_t entry, int dirty)
{
	int changed = !pud_same(*pudp, entry);

	VM_BUG_ON(address & ~HPAGE_PUD_MASK);

	if (changed && dirty) {
		set_pud(pudp, entry);
		/*
		 * We had a write-protection fault here and changed the pud
		 * to be more permissive. No need to flush the TLB for that,
		 * #PF is architecturally guaranteed to do that and in the
		 * worst-case we'll generate a spurious fault.
		 */
	}

	return changed;
}
#endif

int ptep_test_and_clear_young(struct vm_area_struct *vma,
			      unsigned long addr, pte_t *ptep)
{
	int ret = 0;

	if (pte_young(*ptep))
		ret = test_and_clear_bit(_PAGE_BIT_ACCESSED,
					 (unsigned long *) &ptep->pte);

	return ret;
}

#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG)
int pmdp_test_and_clear_young(struct vm_area_struct *vma,
			      unsigned long addr, pmd_t *pmdp)
{
	int ret = 0;

	if (pmd_young(*pmdp))
		ret = test_and_clear_bit(_PAGE_BIT_ACCESSED,
					 (unsigned long *)pmdp);

	return ret;
}
#endif

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
int pudp_test_and_clear_young(struct vm_area_struct *vma,
			      unsigned long addr, pud_t *pudp)
{
	int ret = 0;

	if (pud_young(*pudp))
		ret = test_and_clear_bit(_PAGE_BIT_ACCESSED,
					 (unsigned long *)pudp);

	return ret;
}
#endif

int ptep_clear_flush_young(struct vm_area_struct *vma,
			   unsigned long address, pte_t *ptep)
{
	/*
	 * On x86 CPUs, clearing the accessed bit without a TLB flush
	 * doesn't cause data corruption. [ It could cause incorrect
	 * page aging and the (mistaken) reclaim of hot pages, but the
	 * chance of that should be relatively low. ]
	 *
	 * So as a performance optimization don't flush the TLB when
	 * clearing the accessed bit, it will eventually be flushed by
	 * a context switch or a VM operation anyway. [ In the rare
	 * event of it not getting flushed for a long time the delay
	 * shouldn't really matter because there's no real memory
	 * pressure for swapout to react to. ]
	 */
	return ptep_test_and_clear_young(vma, address, ptep);
}

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
int pmdp_clear_flush_young(struct vm_area_struct *vma,
			   unsigned long address, pmd_t *pmdp)
{
	int young;

	VM_BUG_ON(address & ~HPAGE_PMD_MASK);

	young = pmdp_test_and_clear_young(vma, address, pmdp);
	if (young)
		flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);

	return young;
}

pmd_t pmdp_invalidate_ad(struct vm_area_struct *vma, unsigned long address,
			 pmd_t *pmdp)
{
	VM_WARN_ON_ONCE(!pmd_present(*pmdp));

	/*
	 * No flush is necessary. Once an invalid PTE is established, the PTE's
	 * access and dirty bits cannot be updated.
	 */
	return pmdp_establish(vma, address, pmdp, pmd_mkinvalid(*pmdp));
}
#endif

#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && \
	defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
pud_t pudp_invalidate(struct vm_area_struct *vma, unsigned long address,
		      pud_t *pudp)
{
	VM_WARN_ON_ONCE(!pud_present(*pudp));
	pud_t old = pudp_establish(vma, address, pudp, pud_mkinvalid(*pudp));
	flush_pud_tlb_range(vma, address, address + HPAGE_PUD_SIZE);
	return old;
}
#endif

/**
 * reserve_top_address - reserves a hole in the top of kernel address space
 * @reserve: size of hole to reserve
 *
 * Can be used to relocate the fixmap area and poke a hole in the top
 * of kernel address space to make room for a hypervisor.
 */
void __init reserve_top_address(unsigned long reserve)
{
#ifdef CONFIG_X86_32
	BUG_ON(fixmaps_set > 0);
	__FIXADDR_TOP = round_down(-reserve, 1 << PMD_SHIFT) - PAGE_SIZE;
	printk(KERN_INFO "Reserving virtual address space above 0x%08lx (rounded to 0x%08lx)\n",
	       -reserve, __FIXADDR_TOP + PAGE_SIZE);
#endif
}

int fixmaps_set;

void __native_set_fixmap(enum fixed_addresses idx, pte_t pte)
{
	unsigned long address = __fix_to_virt(idx);

#ifdef CONFIG_X86_64
	/*
	 * Ensure that the static initial page tables are covering the
	 * fixmap completely.
	 */
	BUILD_BUG_ON(__end_of_permanent_fixed_addresses >
		     (FIXMAP_PMD_NUM * PTRS_PER_PTE));
#endif

	if (idx >= __end_of_fixed_addresses) {
		BUG();
		return;
	}
	set_pte_vaddr(address, pte);
	fixmaps_set++;
}

void native_set_fixmap(unsigned /* enum fixed_addresses */ idx,
		       phys_addr_t phys, pgprot_t flags)
{
	/* Sanitize 'prot' against any unsupported bits: */
	pgprot_val(flags) &= __default_kernel_pte_mask;

	__native_set_fixmap(idx, pfn_pte(phys >> PAGE_SHIFT, flags));
}

#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
#ifdef CONFIG_X86_5LEVEL
/**
 * p4d_set_huge - setup kernel P4D mapping
 *
 * No 512GB pages yet -- always return 0
 */
int p4d_set_huge(p4d_t *p4d, phys_addr_t addr, pgprot_t prot)
{
	return 0;
}

/**
 * p4d_clear_huge - clear kernel P4D mapping when it is set
 *
 * No 512GB pages yet -- nothing to do
 */
void p4d_clear_huge(p4d_t *p4d)
{
}
#endif

/**
 * pud_set_huge - setup kernel PUD mapping
 *
 * MTRRs can override PAT memory types with 4KiB granularity. Therefore, this
 * function sets up a huge page only if the complete range has the same MTRR
 * caching mode.
 *
 * Callers should try to decrease page size (1GB -> 2MB -> 4K) if the bigger
 * page mapping attempt fails.
 *
 * Returns 1 on success and 0 on failure.
 */
int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot)
{
	u8 uniform;

	mtrr_type_lookup(addr, addr + PUD_SIZE, &uniform);
	if (!uniform)
		return 0;

	/* Bail out if we are on a populated non-leaf entry: */
	if (pud_present(*pud) && !pud_leaf(*pud))
		return 0;

	set_pte((pte_t *)pud, pfn_pte(
		(u64)addr >> PAGE_SHIFT,
		__pgprot(protval_4k_2_large(pgprot_val(prot)) | _PAGE_PSE)));

	return 1;
}

/**
 * pmd_set_huge - setup kernel PMD mapping
 *
 * See text over pud_set_huge() above.
 *
 * Returns 1 on success and 0 on failure.
 */
int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot)
{
	u8 uniform;

	mtrr_type_lookup(addr, addr + PMD_SIZE, &uniform);
	if (!uniform) {
		pr_warn_once("%s: Cannot satisfy [mem %#010llx-%#010llx] with a huge-page mapping due to MTRR override.\n",
			     __func__, addr, addr + PMD_SIZE);
		return 0;
	}

	/* Bail out if we are on a populated non-leaf entry: */
	if (pmd_present(*pmd) && !pmd_leaf(*pmd))
		return 0;

	set_pte((pte_t *)pmd, pfn_pte(
		(u64)addr >> PAGE_SHIFT,
		__pgprot(protval_4k_2_large(pgprot_val(prot)) | _PAGE_PSE)));

	return 1;
}

/**
 * pud_clear_huge - clear kernel PUD mapping when it is set
 *
 * Returns 1 on success and 0 on failure (no PUD map is found).
 */
int pud_clear_huge(pud_t *pud)
{
	if (pud_leaf(*pud)) {
		pud_clear(pud);
		return 1;
	}

	return 0;
}

/**
 * pmd_clear_huge - clear kernel PMD mapping when it is set
 *
 * Returns 1 on success and 0 on failure (no PMD map is found).
 */
int pmd_clear_huge(pmd_t *pmd)
{
	if (pmd_leaf(*pmd)) {
		pmd_clear(pmd);
		return 1;
	}

	return 0;
}

#ifdef CONFIG_X86_64
/**
 * pud_free_pmd_page - Clear pud entry and free pmd page.
 * @pud: Pointer to a PUD.
 * @addr: Virtual address associated with pud.
 *
 * Context: The pud range has been unmapped and TLB purged.
 * Return: 1 if clearing the entry succeeded. 0 otherwise.
 *
 * NOTE: Callers must allow a single page allocation.
 */
int pud_free_pmd_page(pud_t *pud, unsigned long addr)
{
	pmd_t *pmd, *pmd_sv;
	pte_t *pte;
	int i;

	pmd = pud_pgtable(*pud);
	pmd_sv = (pmd_t *)__get_free_page(GFP_KERNEL);
	if (!pmd_sv)
		return 0;

	for (i = 0; i < PTRS_PER_PMD; i++) {
		pmd_sv[i] = pmd[i];
		if (!pmd_none(pmd[i]))
			pmd_clear(&pmd[i]);
	}

	pud_clear(pud);

	/* INVLPG to clear all paging-structure caches */
	flush_tlb_kernel_range(addr, addr + PAGE_SIZE-1);

	for (i = 0; i < PTRS_PER_PMD; i++) {
		if (!pmd_none(pmd_sv[i])) {
			pte = (pte_t *)pmd_page_vaddr(pmd_sv[i]);
			free_page((unsigned long)pte);
		}
	}

	free_page((unsigned long)pmd_sv);

	pagetable_dtor(virt_to_ptdesc(pmd));
	free_page((unsigned long)pmd);

	return 1;
}

/**
 * pmd_free_pte_page - Clear pmd entry and free pte page.
 * @pmd: Pointer to a PMD.
 * @addr: Virtual address associated with pmd.
 *
 * Context: The pmd range has been unmapped and TLB purged.
 * Return: 1 if clearing the entry succeeded. 0 otherwise.
 */
int pmd_free_pte_page(pmd_t *pmd, unsigned long addr)
{
	pte_t *pte;

	pte = (pte_t *)pmd_page_vaddr(*pmd);
	pmd_clear(pmd);

	/* INVLPG to clear all paging-structure caches */
	flush_tlb_kernel_range(addr, addr + PAGE_SIZE-1);

	free_page((unsigned long)pte);

	return 1;
}

#else /* !CONFIG_X86_64 */

/*
 * Disable free page handling on x86-PAE. This assures that ioremap()
 * does not update sync'd pmd entries. See vmalloc_sync_one().
 */
int pmd_free_pte_page(pmd_t *pmd, unsigned long addr)
{
	return pmd_none(*pmd);
}

#endif /* CONFIG_X86_64 */
#endif /* CONFIG_HAVE_ARCH_HUGE_VMAP */

pte_t pte_mkwrite(pte_t pte, struct vm_area_struct *vma)
{
	if (vma->vm_flags & VM_SHADOW_STACK)
		return pte_mkwrite_shstk(pte);

	pte = pte_mkwrite_novma(pte);

	return pte_clear_saveddirty(pte);
}

pmd_t pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
{
	if (vma->vm_flags & VM_SHADOW_STACK)
		return pmd_mkwrite_shstk(pmd);

	pmd = pmd_mkwrite_novma(pmd);

	return pmd_clear_saveddirty(pmd);
}

void arch_check_zapped_pte(struct vm_area_struct *vma, pte_t pte)
{
	/*
	 * Hardware before shadow stack can (rarely) set Dirty=1
	 * on a Write=0 PTE. So the below condition
	 * only indicates a software bug when shadow stack is
	 * supported by the HW. This checking is covered in
	 * pte_shstk().
	 */
	VM_WARN_ON_ONCE(!(vma->vm_flags & VM_SHADOW_STACK) &&
			pte_shstk(pte));
}

void arch_check_zapped_pmd(struct vm_area_struct *vma, pmd_t pmd)
{
	/* See note in arch_check_zapped_pte() */
	VM_WARN_ON_ONCE(!(vma->vm_flags & VM_SHADOW_STACK) &&
			pmd_shstk(pmd));
}

void arch_check_zapped_pud(struct vm_area_struct *vma, pud_t pud)
{
	/* See note in arch_check_zapped_pte() */
	VM_WARN_ON_ONCE(!(vma->vm_flags & VM_SHADOW_STACK) && pud_shstk(pud));
}