// SPDX-License-Identifier: GPL-2.0
#include <linux/mm.h>
#include <linux/gfp.h>
#include <linux/hugetlb.h>
#include <asm/pgalloc.h>
#include <asm/tlb.h>
#include <asm/fixmap.h>
#include <asm/mtrr.h>

#ifdef CONFIG_DYNAMIC_PHYSICAL_MASK
phys_addr_t physical_mask __ro_after_init = (1ULL << __PHYSICAL_MASK_SHIFT) - 1;
EXPORT_SYMBOL(physical_mask);
SYM_PIC_ALIAS(physical_mask);
#endif

pgtable_t pte_alloc_one(struct mm_struct *mm)
{
        return __pte_alloc_one(mm, GFP_PGTABLE_USER);
}

void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte)
{
        paravirt_release_pte(page_to_pfn(pte));
        tlb_remove_ptdesc(tlb, page_ptdesc(pte));
}

#if CONFIG_PGTABLE_LEVELS > 2
void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd)
{
        paravirt_release_pmd(__pa(pmd) >> PAGE_SHIFT);
        /*
         * NOTE! For PAE, any changes to the top page-directory-pointer-table
         * entries need a full cr3 reload to flush.
         */
#ifdef CONFIG_X86_PAE
        tlb->need_flush_all = 1;
#endif
        tlb_remove_ptdesc(tlb, virt_to_ptdesc(pmd));
}

#if CONFIG_PGTABLE_LEVELS > 3
void ___pud_free_tlb(struct mmu_gather *tlb, pud_t *pud)
{
        paravirt_release_pud(__pa(pud) >> PAGE_SHIFT);
        tlb_remove_ptdesc(tlb, virt_to_ptdesc(pud));
}

#if CONFIG_PGTABLE_LEVELS > 4
void ___p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d)
{
        paravirt_release_p4d(__pa(p4d) >> PAGE_SHIFT);
        tlb_remove_ptdesc(tlb, virt_to_ptdesc(p4d));
}
#endif /* CONFIG_PGTABLE_LEVELS > 4 */
#endif /* CONFIG_PGTABLE_LEVELS > 3 */
#endif /* CONFIG_PGTABLE_LEVELS > 2 */

static inline void pgd_list_add(pgd_t *pgd)
{
        struct ptdesc *ptdesc = virt_to_ptdesc(pgd);

        list_add(&ptdesc->pt_list, &pgd_list);
}

static inline void pgd_list_del(pgd_t *pgd)
{
        struct ptdesc *ptdesc = virt_to_ptdesc(pgd);

        list_del(&ptdesc->pt_list);
}

static void pgd_set_mm(pgd_t *pgd, struct mm_struct *mm)
{
        virt_to_ptdesc(pgd)->pt_mm = mm;
}

struct mm_struct *pgd_page_get_mm(struct page *page)
{
        return page_ptdesc(page)->pt_mm;
}

static void pgd_ctor(struct mm_struct *mm, pgd_t *pgd)
{
        /* PAE preallocates all its PMDs. No cloning needed. */
        if (!IS_ENABLED(CONFIG_X86_PAE))
                clone_pgd_range(pgd + KERNEL_PGD_BOUNDARY,
                                swapper_pg_dir + KERNEL_PGD_BOUNDARY,
                                KERNEL_PGD_PTRS);

        /* List used to sync kernel mapping updates */
        pgd_set_mm(pgd, mm);
        pgd_list_add(pgd);
}

static void pgd_dtor(pgd_t *pgd)
{
        spin_lock(&pgd_lock);
        pgd_list_del(pgd);
        spin_unlock(&pgd_lock);
}

/*
 * List of all pgd's needed for non-PAE so it can invalidate entries
 * in both cached and uncached pgd's; not needed for PAE since the
 * kernel pmd is shared. If PAE were not to share the pmd a similar
 * tactic would be needed. This is essentially codepath-based locking
 * against pageattr.c; it is the unique case in which a valid change
 * of kernel pagetables can't be lazily synchronized by vmalloc faults.
 * vmalloc faults work because attached pagetables are never freed.
 * -- nyc
 */

#ifdef CONFIG_X86_PAE
/*
 * In PAE mode, we need to do a cr3 reload (=tlb flush) when
 * updating the top-level pagetable entries to guarantee the
 * processor notices the update. Since this is expensive, and
 * all 4 top-level entries are used almost immediately in a
 * new process's life, we just pre-populate them here.
 */
#define PREALLOCATED_PMDS       PTRS_PER_PGD
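
/*
 * Illustrative sketch (comment only, not compiled): under PAE,
 * PTRS_PER_PGD is 4 and each top-level entry covers 1 GiB, so
 * pgd_alloc() below preallocates four PMD pages and later wires them
 * in via pgd_prepopulate_pmd()/pud_populate(), roughly:
 *
 *      pmd_t *pmds[PREALLOCATED_PMDS];                 // 4 slots under PAE
 *      preallocate_pmds(mm, pmds, PREALLOCATED_PMDS);  // one PMD page each
 */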

/*
 * "USER_PMDS" are the PMDs for the user copy of the page tables when
 * PTI is enabled. They do not exist when PTI is disabled. Note that
 * this is distinct from the user _portion_ of the kernel page tables
 * which always exists.
 *
 * We allocate separate PMDs for the kernel part of the user page-table
 * when PTI is enabled. We need them to map the per-process LDT into the
 * user-space page-table.
 */
#define PREALLOCATED_USER_PMDS  (boot_cpu_has(X86_FEATURE_PTI) ? \
                                        KERNEL_PGD_PTRS : 0)
#define MAX_PREALLOCATED_USER_PMDS KERNEL_PGD_PTRS

void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd)
{
        paravirt_alloc_pmd(mm, __pa(pmd) >> PAGE_SHIFT);

        /* Note: almost everything apart from _PAGE_PRESENT is
           reserved at the pmd (PDPT) level. */
        set_pud(pudp, __pud(__pa(pmd) | _PAGE_PRESENT));

        /*
         * According to Intel App note "TLBs, Paging-Structure Caches,
         * and Their Invalidation", April 2007, document 317080-001,
         * section 8.1: in PAE mode we explicitly have to flush the
         * TLB via cr3 if the top-level pgd is changed...
         */
        flush_tlb_mm(mm);
}
#else  /* !CONFIG_X86_PAE */

/* No need to prepopulate any pagetable entries in non-PAE modes. */
#define PREALLOCATED_PMDS       0
#define PREALLOCATED_USER_PMDS  0
#define MAX_PREALLOCATED_USER_PMDS 0
#endif  /* CONFIG_X86_PAE */

static void free_pmds(struct mm_struct *mm, pmd_t *pmds[], int count)
{
        int i;
        struct ptdesc *ptdesc;

        for (i = 0; i < count; i++)
                if (pmds[i]) {
                        ptdesc = virt_to_ptdesc(pmds[i]);

                        pagetable_dtor(ptdesc);
                        pagetable_free(ptdesc);
                        mm_dec_nr_pmds(mm);
                }
}

static int preallocate_pmds(struct mm_struct *mm, pmd_t *pmds[], int count)
{
        int i;
        bool failed = false;
        gfp_t gfp = GFP_PGTABLE_USER;

        if (mm == &init_mm)
                gfp &= ~__GFP_ACCOUNT;
        gfp &= ~__GFP_HIGHMEM;

        for (i = 0; i < count; i++) {
                pmd_t *pmd = NULL;
                struct ptdesc *ptdesc = pagetable_alloc(gfp, 0);

                if (!ptdesc)
                        failed = true;
                if (ptdesc && !pagetable_pmd_ctor(mm, ptdesc)) {
                        pagetable_free(ptdesc);
                        ptdesc = NULL;
                        failed = true;
                }
                if (ptdesc) {
                        mm_inc_nr_pmds(mm);
                        pmd = ptdesc_address(ptdesc);
                }

                pmds[i] = pmd;
        }

        if (failed) {
                free_pmds(mm, pmds, count);
                return -ENOMEM;
        }

        return 0;
}

/*
 * Mop up any pmd pages which may still be attached to the pgd.
 * Normally they will be freed by munmap/exit_mmap, but any pmd we
 * preallocate which never got a corresponding vma will need to be
 * freed manually.
 */
static void mop_up_one_pmd(struct mm_struct *mm, pgd_t *pgdp)
{
        pgd_t pgd = *pgdp;

        if (pgd_val(pgd) != 0) {
                pmd_t *pmd = (pmd_t *)pgd_page_vaddr(pgd);

                pgd_clear(pgdp);

                paravirt_release_pmd(pgd_val(pgd) >> PAGE_SHIFT);
                pmd_free(mm, pmd);
                mm_dec_nr_pmds(mm);
        }
}

static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp)
{
        int i;

        for (i = 0; i < PREALLOCATED_PMDS; i++)
                mop_up_one_pmd(mm, &pgdp[i]);

#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION

        if (!boot_cpu_has(X86_FEATURE_PTI))
                return;

        pgdp = kernel_to_user_pgdp(pgdp);

        for (i = 0; i < PREALLOCATED_USER_PMDS; i++)
                mop_up_one_pmd(mm, &pgdp[i + KERNEL_PGD_BOUNDARY]);
#endif
}

static void pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmds[])
{
        p4d_t *p4d;
        pud_t *pud;
        int i;

        p4d = p4d_offset(pgd, 0);
        pud = pud_offset(p4d, 0);

        for (i = 0; i < PREALLOCATED_PMDS; i++, pud++) {
                pmd_t *pmd = pmds[i];

                if (i >= KERNEL_PGD_BOUNDARY)
                        memcpy(pmd, (pmd_t *)pgd_page_vaddr(swapper_pg_dir[i]),
                               sizeof(pmd_t) * PTRS_PER_PMD);

                pud_populate(mm, pud, pmd);
        }
}

#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION
static void pgd_prepopulate_user_pmd(struct mm_struct *mm,
                                     pgd_t *k_pgd, pmd_t *pmds[])
{
        pgd_t *s_pgd = kernel_to_user_pgdp(swapper_pg_dir);
        pgd_t *u_pgd = kernel_to_user_pgdp(k_pgd);
        p4d_t *u_p4d;
        pud_t *u_pud;
        int i;

        u_p4d = p4d_offset(u_pgd, 0);
        u_pud = pud_offset(u_p4d, 0);

        s_pgd += KERNEL_PGD_BOUNDARY;
        u_pud += KERNEL_PGD_BOUNDARY;

        for (i = 0; i < PREALLOCATED_USER_PMDS; i++, u_pud++, s_pgd++) {
                pmd_t *pmd = pmds[i];

                memcpy(pmd, (pmd_t *)pgd_page_vaddr(*s_pgd),
                       sizeof(pmd_t) * PTRS_PER_PMD);

                pud_populate(mm, u_pud, pmd);
        }
}
#else
static void pgd_prepopulate_user_pmd(struct mm_struct *mm,
                                     pgd_t *k_pgd, pmd_t *pmds[])
{
}
#endif

static inline pgd_t *_pgd_alloc(struct mm_struct *mm)
{
        /*
         * PTI and Xen need a whole page for the PAE PGD
         * even though the hardware only needs 32 bytes.
         *
         * For simplicity, allocate a page for all users.
         */
        return __pgd_alloc(mm, pgd_allocation_order());
}

static inline void _pgd_free(struct mm_struct *mm, pgd_t *pgd)
{
        __pgd_free(mm, pgd);
}
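
/*
 * Illustrative note (comment only, not compiled): with PTI enabled the
 * PGD is an order-1 (8 KiB) allocation; the kernel PGD lives in the
 * first 4 KiB half and the user copy in the second, which is what lets
 * kernel_to_user_pgdp() switch between the two by flipping a single
 * pointer bit. A hedged sketch of the relationship:
 *
 *      pgd_t *k_pgd = _pgd_alloc(mm);                  // 2 pages when PTI is on
 *      pgd_t *u_pgd = kernel_to_user_pgdp(k_pgd);      // second half of the pair
 */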

pgd_t *pgd_alloc(struct mm_struct *mm)
{
        pgd_t *pgd;
        pmd_t *u_pmds[MAX_PREALLOCATED_USER_PMDS];
        pmd_t *pmds[PREALLOCATED_PMDS];

        pgd = _pgd_alloc(mm);

        if (pgd == NULL)
                goto out;

        mm->pgd = pgd;

        if (sizeof(pmds) != 0 &&
                        preallocate_pmds(mm, pmds, PREALLOCATED_PMDS) != 0)
                goto out_free_pgd;

        if (sizeof(u_pmds) != 0 &&
                        preallocate_pmds(mm, u_pmds, PREALLOCATED_USER_PMDS) != 0)
                goto out_free_pmds;

        if (paravirt_pgd_alloc(mm) != 0)
                goto out_free_user_pmds;

        /*
         * Make sure that pre-populating the pmds is atomic with
         * respect to anything walking the pgd_list, so that they
         * never see a partially populated pgd.
         */
        spin_lock(&pgd_lock);

        pgd_ctor(mm, pgd);
        if (sizeof(pmds) != 0)
                pgd_prepopulate_pmd(mm, pgd, pmds);

        if (sizeof(u_pmds) != 0)
                pgd_prepopulate_user_pmd(mm, pgd, u_pmds);

        spin_unlock(&pgd_lock);

        return pgd;

out_free_user_pmds:
        if (sizeof(u_pmds) != 0)
                free_pmds(mm, u_pmds, PREALLOCATED_USER_PMDS);
out_free_pmds:
        if (sizeof(pmds) != 0)
                free_pmds(mm, pmds, PREALLOCATED_PMDS);
out_free_pgd:
        _pgd_free(mm, pgd);
out:
        return NULL;
}

void pgd_free(struct mm_struct *mm, pgd_t *pgd)
{
        pgd_mop_up_pmds(mm, pgd);
        pgd_dtor(pgd);
        paravirt_pgd_free(mm, pgd);
        _pgd_free(mm, pgd);
}

/*
 * Used to set accessed or dirty bits in the page table entries
 * on other architectures. On x86, the accessed and dirty bits
 * are tracked by hardware. However, do_wp_page calls this function
 * to also make the pte writeable at the same time the dirty bit is
 * set. In that case we do actually need to write the PTE.
 */
int ptep_set_access_flags(struct vm_area_struct *vma,
                          unsigned long address, pte_t *ptep,
                          pte_t entry, int dirty)
{
        int changed = !pte_same(*ptep, entry);

        if (changed && dirty)
                set_pte(ptep, entry);

        return changed;
}

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
int pmdp_set_access_flags(struct vm_area_struct *vma,
                          unsigned long address, pmd_t *pmdp,
                          pmd_t entry, int dirty)
{
        int changed = !pmd_same(*pmdp, entry);

        VM_BUG_ON(address & ~HPAGE_PMD_MASK);

        if (changed && dirty) {
                set_pmd(pmdp, entry);
                /*
                 * We had a write-protection fault here and changed the pmd
                 * to a more permissive one. No need to flush the TLB for
                 * that, #PF is architecturally guaranteed to do that and in
                 * the worst-case we'll generate a spurious fault.
                 */
        }

        return changed;
}

int pudp_set_access_flags(struct vm_area_struct *vma, unsigned long address,
                          pud_t *pudp, pud_t entry, int dirty)
{
        int changed = !pud_same(*pudp, entry);

        VM_BUG_ON(address & ~HPAGE_PUD_MASK);

        if (changed && dirty) {
                set_pud(pudp, entry);
                /*
                 * We had a write-protection fault here and changed the pud
                 * to a more permissive one. No need to flush the TLB for
                 * that, #PF is architecturally guaranteed to do that and in
                 * the worst-case we'll generate a spurious fault.
                 */
        }

        return changed;
}
#endif
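
/*
 * Illustrative sketch (comment only, abridged names, not part of this
 * file): generic mm code is expected to use ptep_set_access_flags()
 * roughly like this when upgrading a PTE after a write-protection fault:
 *
 *      entry = pte_mkyoung(pte_mkdirty(pte_mkwrite(orig_pte, vma)));
 *      if (ptep_set_access_flags(vma, addr, ptep, entry, 1))
 *              update_mmu_cache(vma, addr, ptep);      // no TLB flush needed
 */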

int ptep_test_and_clear_young(struct vm_area_struct *vma,
                              unsigned long addr, pte_t *ptep)
{
        int ret = 0;

        if (pte_young(*ptep))
                ret = test_and_clear_bit(_PAGE_BIT_ACCESSED,
                                         (unsigned long *) &ptep->pte);

        return ret;
}

#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG)
int pmdp_test_and_clear_young(struct vm_area_struct *vma,
                              unsigned long addr, pmd_t *pmdp)
{
        int ret = 0;

        if (pmd_young(*pmdp))
                ret = test_and_clear_bit(_PAGE_BIT_ACCESSED,
                                         (unsigned long *)pmdp);

        return ret;
}
#endif

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
int pudp_test_and_clear_young(struct vm_area_struct *vma,
                              unsigned long addr, pud_t *pudp)
{
        int ret = 0;

        if (pud_young(*pudp))
                ret = test_and_clear_bit(_PAGE_BIT_ACCESSED,
                                         (unsigned long *)pudp);

        return ret;
}
#endif

int ptep_clear_flush_young(struct vm_area_struct *vma,
                           unsigned long address, pte_t *ptep)
{
        /*
         * On x86 CPUs, clearing the accessed bit without a TLB flush
         * doesn't cause data corruption. [ It could cause incorrect
         * page aging and the (mistaken) reclaim of hot pages, but the
         * chance of that should be relatively low. ]
         *
         * So as a performance optimization don't flush the TLB when
         * clearing the accessed bit, it will eventually be flushed by
         * a context switch or a VM operation anyway. [ In the rare
         * event of it not getting flushed for a long time the delay
         * shouldn't really matter because there's no real memory
         * pressure for swapout to react to. ]
         */
        return ptep_test_and_clear_young(vma, address, ptep);
}

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
int pmdp_clear_flush_young(struct vm_area_struct *vma,
                           unsigned long address, pmd_t *pmdp)
{
        int young;

        VM_BUG_ON(address & ~HPAGE_PMD_MASK);

        young = pmdp_test_and_clear_young(vma, address, pmdp);
        if (young)
                flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);

        return young;
}

pmd_t pmdp_invalidate_ad(struct vm_area_struct *vma, unsigned long address,
                         pmd_t *pmdp)
{
        VM_WARN_ON_ONCE(!pmd_present(*pmdp));

        /*
         * No flush is necessary. Once an invalid PTE is established, the PTE's
         * access and dirty bits cannot be updated.
         */
        return pmdp_establish(vma, address, pmdp, pmd_mkinvalid(*pmdp));
}
#endif

#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && \
        defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
pud_t pudp_invalidate(struct vm_area_struct *vma, unsigned long address,
                      pud_t *pudp)
{
        VM_WARN_ON_ONCE(!pud_present(*pudp));
        pud_t old = pudp_establish(vma, address, pudp, pud_mkinvalid(*pudp));
        flush_pud_tlb_range(vma, address, address + HPAGE_PUD_SIZE);
        return old;
}
#endif

/**
 * reserve_top_address - Reserve a hole in the top of the kernel address space
 * @reserve: Size of hole to reserve
 *
 * Can be used to relocate the fixmap area and poke a hole in the top
 * of the kernel address space to make room for a hypervisor.
 */
void __init reserve_top_address(unsigned long reserve)
{
#ifdef CONFIG_X86_32
        BUG_ON(fixmaps_set > 0);
        __FIXADDR_TOP = round_down(-reserve, 1 << PMD_SHIFT) - PAGE_SIZE;
        printk(KERN_INFO "Reserving virtual address space above 0x%08lx (rounded to 0x%08lx)\n",
               -reserve, __FIXADDR_TOP + PAGE_SIZE);
#endif
}

int fixmaps_set;

void __native_set_fixmap(enum fixed_addresses idx, pte_t pte)
{
        unsigned long address = __fix_to_virt(idx);

#ifdef CONFIG_X86_64
        /*
         * Ensure that the static initial page tables are covering the
         * fixmap completely.
         */
        BUILD_BUG_ON(__end_of_permanent_fixed_addresses >
                     (FIXMAP_PMD_NUM * PTRS_PER_PTE));
#endif

        if (idx >= __end_of_fixed_addresses) {
                BUG();
                return;
        }
        set_pte_vaddr(address, pte);
        fixmaps_set++;
}

void native_set_fixmap(unsigned /* enum fixed_addresses */ idx,
                       phys_addr_t phys, pgprot_t flags)
{
        /* Sanitize 'flags' against any unsupported bits: */
        pgprot_val(flags) &= __default_kernel_pte_mask;

        __native_set_fixmap(idx, pfn_pte(phys >> PAGE_SHIFT, flags));
}

#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
#if CONFIG_PGTABLE_LEVELS > 4
/**
 * p4d_set_huge - Set up kernel P4D mapping
 * @p4d: Pointer to the P4D entry
 * @addr: Virtual address associated with the P4D entry
 * @prot: Protection bits to use
 *
 * No 512GB pages yet -- always return 0
 */
int p4d_set_huge(p4d_t *p4d, phys_addr_t addr, pgprot_t prot)
{
        return 0;
}

/**
 * p4d_clear_huge - Clear kernel P4D mapping when it is set
 * @p4d: Pointer to the P4D entry to clear
 *
 * No 512GB pages yet -- do nothing
 */
void p4d_clear_huge(p4d_t *p4d)
{
}
#endif

/**
 * pud_set_huge - Set up kernel PUD mapping
 * @pud: Pointer to the PUD entry
 * @addr: Virtual address associated with the PUD entry
 * @prot: Protection bits to use
 *
 * MTRRs can override PAT memory types with 4KiB granularity. Therefore, this
 * function sets up a huge page only if the complete range has the same MTRR
 * caching mode.
 *
 * Callers should try to decrease page size (1GB -> 2MB -> 4K) if the bigger
 * page mapping attempt fails.
 *
 * Returns 1 on success and 0 on failure.
 */
int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot)
{
        u8 uniform;

        mtrr_type_lookup(addr, addr + PUD_SIZE, &uniform);
        if (!uniform)
                return 0;

        /* Bail out if we are on a populated non-leaf entry: */
        if (pud_present(*pud) && !pud_leaf(*pud))
                return 0;

        set_pte((pte_t *)pud, pfn_pte(
                (u64)addr >> PAGE_SHIFT,
                __pgprot(protval_4k_2_large(pgprot_val(prot)) | _PAGE_PSE)));

        return 1;
}
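
/*
 * Illustrative sketch (comment only, hypothetical caller): the fallback
 * order the kernel-doc above asks for looks roughly like this, where
 * map_with_4k_ptes() stands in for whatever small-page path the caller
 * already has:
 *
 *      if (can_use_1g && pud_set_huge(pud, phys, prot))
 *              return 0;                               // mapped as one 1 GiB leaf
 *      if (can_use_2m && pmd_set_huge(pmd, phys, prot))
 *              return 0;                               // mapped as 2 MiB leaves
 *      return map_with_4k_ptes(addr, phys, prot);      // plain 4 KiB PTEs
 */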

/**
 * pmd_set_huge - Set up kernel PMD mapping
 * @pmd: Pointer to the PMD entry
 * @addr: Virtual address associated with the PMD entry
 * @prot: Protection bits to use
 *
 * See text over pud_set_huge() above.
 *
 * Returns 1 on success and 0 on failure.
 */
int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot)
{
        u8 uniform;

        mtrr_type_lookup(addr, addr + PMD_SIZE, &uniform);
        if (!uniform) {
                pr_warn_once("%s: Cannot satisfy [mem %#010llx-%#010llx] with a huge-page mapping due to MTRR override.\n",
                             __func__, addr, addr + PMD_SIZE);
                return 0;
        }

        /* Bail out if we are on a populated non-leaf entry: */
        if (pmd_present(*pmd) && !pmd_leaf(*pmd))
                return 0;

        set_pte((pte_t *)pmd, pfn_pte(
                (u64)addr >> PAGE_SHIFT,
                __pgprot(protval_4k_2_large(pgprot_val(prot)) | _PAGE_PSE)));

        return 1;
}

/**
 * pud_clear_huge - Clear kernel PUD mapping when it is set
 * @pud: Pointer to the PUD entry to clear.
 *
 * Returns 1 on success and 0 on failure (no PUD map is found).
 */
int pud_clear_huge(pud_t *pud)
{
        if (pud_leaf(*pud)) {
                pud_clear(pud);
                return 1;
        }

        return 0;
}

/**
 * pmd_clear_huge - Clear kernel PMD mapping when it is set
 * @pmd: Pointer to the PMD entry to clear.
 *
 * Returns 1 on success and 0 on failure (no PMD map is found).
 */
int pmd_clear_huge(pmd_t *pmd)
{
        if (pmd_leaf(*pmd)) {
                pmd_clear(pmd);
                return 1;
        }

        return 0;
}

#ifdef CONFIG_X86_64
/**
 * pud_free_pmd_page - Clear PUD entry and free PMD page
 * @pud: Pointer to a PUD
 * @addr: Virtual address associated with PUD
 *
 * Context: The PUD range has been unmapped and TLB purged.
 * Return: 1 if clearing the entry succeeded. 0 otherwise.
 *
 * NOTE: Callers must allow a single page allocation.
 */
int pud_free_pmd_page(pud_t *pud, unsigned long addr)
{
        pmd_t *pmd, *pmd_sv;
        pte_t *pte;
        int i;

        pmd = pud_pgtable(*pud);
        pmd_sv = (pmd_t *)__get_free_page(GFP_KERNEL);
        if (!pmd_sv)
                return 0;

        for (i = 0; i < PTRS_PER_PMD; i++) {
                pmd_sv[i] = pmd[i];
                if (!pmd_none(pmd[i]))
                        pmd_clear(&pmd[i]);
        }

        pud_clear(pud);

        /* INVLPG to clear all paging-structure caches */
        flush_tlb_kernel_range(addr, addr + PAGE_SIZE-1);

        for (i = 0; i < PTRS_PER_PMD; i++) {
                if (!pmd_none(pmd_sv[i])) {
                        pte = (pte_t *)pmd_page_vaddr(pmd_sv[i]);
                        pte_free_kernel(&init_mm, pte);
                }
        }

        free_page((unsigned long)pmd_sv);

        pmd_free(&init_mm, pmd);

        return 1;
}

/**
 * pmd_free_pte_page - Clear PMD entry and free PTE page.
 * @pmd: Pointer to the PMD
 * @addr: Virtual address associated with PMD
 *
 * Context: The PMD range has been unmapped and TLB purged.
 * Return: 1 if clearing the entry succeeded. 0 otherwise.
 */
int pmd_free_pte_page(pmd_t *pmd, unsigned long addr)
{
        pte_t *pte;

        pte = (pte_t *)pmd_page_vaddr(*pmd);
        pmd_clear(pmd);

        /* INVLPG to clear all paging-structure caches */
        flush_tlb_kernel_range(addr, addr + PAGE_SIZE-1);

        pte_free_kernel(&init_mm, pte);

        return 1;
}
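
/*
 * Illustrative sketch (comment only, hypothetical caller): a huge-vmap
 * style caller that wants to install a large leaf over a slot which
 * still holds an old, already-unmapped lower-level table is expected to
 * do roughly:
 *
 *      if (pmd_present(*pmd) && !pmd_free_pte_page(pmd, addr))
 *              return 0;                       // keep using 4 KiB PTEs
 *      pmd_set_huge(pmd, phys, prot);          // now the slot is free
 */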

#else /* !CONFIG_X86_64 */

/*
 * Disable free page handling on x86-PAE. This assures that ioremap()
 * does not update sync'd PMD entries. See vmalloc_sync_one().
 */
int pmd_free_pte_page(pmd_t *pmd, unsigned long addr)
{
        return pmd_none(*pmd);
}

#endif /* CONFIG_X86_64 */
#endif /* CONFIG_HAVE_ARCH_HUGE_VMAP */

pte_t pte_mkwrite(pte_t pte, struct vm_area_struct *vma)
{
        if (vma->vm_flags & VM_SHADOW_STACK)
                return pte_mkwrite_shstk(pte);

        pte = pte_mkwrite_novma(pte);

        return pte_clear_saveddirty(pte);
}

pmd_t pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
{
        if (vma->vm_flags & VM_SHADOW_STACK)
                return pmd_mkwrite_shstk(pmd);

        pmd = pmd_mkwrite_novma(pmd);

        return pmd_clear_saveddirty(pmd);
}

void arch_check_zapped_pte(struct vm_area_struct *vma, pte_t pte)
{
        /*
         * Hardware before shadow stack can (rarely) set Dirty=1
         * on a Write=0 PTE. So the below condition
         * only indicates a software bug when shadow stack is
         * supported by the HW. This checking is covered in
         * pte_shstk().
         */
        VM_WARN_ON_ONCE(!(vma->vm_flags & VM_SHADOW_STACK) &&
                        pte_shstk(pte));
}

void arch_check_zapped_pmd(struct vm_area_struct *vma, pmd_t pmd)
{
        /* See note in arch_check_zapped_pte() */
        VM_WARN_ON_ONCE(!(vma->vm_flags & VM_SHADOW_STACK) &&
                        pmd_shstk(pmd));
}

void arch_check_zapped_pud(struct vm_area_struct *vma, pud_t pud)
{
        /* See note in arch_check_zapped_pte() */
        VM_WARN_ON_ONCE(!(vma->vm_flags & VM_SHADOW_STACK) && pud_shstk(pud));
}
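
/*
 * Illustrative note (comment only): a shadow-stack PTE is encoded as
 * Write=0,Dirty=1, which is why the pte/pmd_mkwrite() helpers above end
 * with *_clear_saveddirty(). Roughly, for a normal (non shadow stack)
 * VMA:
 *
 *      pte = pte_mkwrite_novma(pte);           // set Write=1
 *      pte = pte_clear_saveddirty(pte);        // fold SavedDirty back into Dirty
 */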