1 // SPDX-License-Identifier: GPL-2.0 2 #include <linux/mm.h> 3 #include <linux/gfp.h> 4 #include <linux/hugetlb.h> 5 #include <asm/pgalloc.h> 6 #include <asm/tlb.h> 7 #include <asm/fixmap.h> 8 #include <asm/mtrr.h> 9 10 #ifdef CONFIG_DYNAMIC_PHYSICAL_MASK 11 phys_addr_t physical_mask __ro_after_init = (1ULL << __PHYSICAL_MASK_SHIFT) - 1; 12 EXPORT_SYMBOL(physical_mask); 13 SYM_PIC_ALIAS(physical_mask); 14 #endif 15 16 pgtable_t pte_alloc_one(struct mm_struct *mm) 17 { 18 return __pte_alloc_one(mm, GFP_PGTABLE_USER); 19 } 20 21 void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte) 22 { 23 paravirt_release_pte(page_to_pfn(pte)); 24 tlb_remove_ptdesc(tlb, page_ptdesc(pte)); 25 } 26 27 #if CONFIG_PGTABLE_LEVELS > 2 28 void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd) 29 { 30 paravirt_release_pmd(__pa(pmd) >> PAGE_SHIFT); 31 /* 32 * NOTE! For PAE, any changes to the top page-directory-pointer-table 33 * entries need a full cr3 reload to flush. 34 */ 35 #ifdef CONFIG_X86_PAE 36 tlb->need_flush_all = 1; 37 #endif 38 tlb_remove_ptdesc(tlb, virt_to_ptdesc(pmd)); 39 } 40 41 #if CONFIG_PGTABLE_LEVELS > 3 42 void ___pud_free_tlb(struct mmu_gather *tlb, pud_t *pud) 43 { 44 paravirt_release_pud(__pa(pud) >> PAGE_SHIFT); 45 tlb_remove_ptdesc(tlb, virt_to_ptdesc(pud)); 46 } 47 48 #if CONFIG_PGTABLE_LEVELS > 4 49 void ___p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d) 50 { 51 paravirt_release_p4d(__pa(p4d) >> PAGE_SHIFT); 52 tlb_remove_ptdesc(tlb, virt_to_ptdesc(p4d)); 53 } 54 #endif /* CONFIG_PGTABLE_LEVELS > 4 */ 55 #endif /* CONFIG_PGTABLE_LEVELS > 3 */ 56 #endif /* CONFIG_PGTABLE_LEVELS > 2 */ 57 58 static inline void pgd_list_add(pgd_t *pgd) 59 { 60 struct ptdesc *ptdesc = virt_to_ptdesc(pgd); 61 62 list_add(&ptdesc->pt_list, &pgd_list); 63 } 64 65 static inline void pgd_list_del(pgd_t *pgd) 66 { 67 struct ptdesc *ptdesc = virt_to_ptdesc(pgd); 68 69 list_del(&ptdesc->pt_list); 70 } 71 72 static void pgd_set_mm(pgd_t *pgd, struct mm_struct *mm) 73 { 74 virt_to_ptdesc(pgd)->pt_mm = mm; 75 } 76 77 struct mm_struct *pgd_page_get_mm(struct page *page) 78 { 79 return page_ptdesc(page)->pt_mm; 80 } 81 82 static void pgd_ctor(struct mm_struct *mm, pgd_t *pgd) 83 { 84 /* PAE preallocates all its PMDs. No cloning needed. */ 85 if (!IS_ENABLED(CONFIG_X86_PAE)) 86 clone_pgd_range(pgd + KERNEL_PGD_BOUNDARY, 87 swapper_pg_dir + KERNEL_PGD_BOUNDARY, 88 KERNEL_PGD_PTRS); 89 90 /* List used to sync kernel mapping updates */ 91 pgd_set_mm(pgd, mm); 92 pgd_list_add(pgd); 93 } 94 95 static void pgd_dtor(pgd_t *pgd) 96 { 97 spin_lock(&pgd_lock); 98 pgd_list_del(pgd); 99 spin_unlock(&pgd_lock); 100 } 101 102 #ifdef CONFIG_X86_PAE 103 /* 104 * In PAE mode, we need to do a cr3 reload (=tlb flush) when 105 * updating the top-level pagetable entries to guarantee the 106 * processor notices the update. Since this is expensive, and 107 * all 4 top-level entries are used almost immediately in a 108 * new process's life, we just pre-populate them here. 109 */ 110 #define PREALLOCATED_PMDS PTRS_PER_PGD 111 112 /* 113 * "USER_PMDS" are the PMDs for the user copy of the page tables when 114 * PTI is enabled. They do not exist when PTI is disabled. Note that 115 * this is distinct from the user _portion_ of the kernel page tables 116 * which always exists. 117 * 118 * We allocate separate PMDs for the kernel part of the user page-table 119 * when PTI is enabled. We need them to map the per-process LDT into the 120 * user-space page-table. 121 */ 122 #define PREALLOCATED_USER_PMDS (boot_cpu_has(X86_FEATURE_PTI) ? \ 123 KERNEL_PGD_PTRS : 0) 124 #define MAX_PREALLOCATED_USER_PMDS KERNEL_PGD_PTRS 125 126 void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd) 127 { 128 paravirt_alloc_pmd(mm, __pa(pmd) >> PAGE_SHIFT); 129 130 /* Note: almost everything apart from _PAGE_PRESENT is 131 reserved at the pmd (PDPT) level. */ 132 set_pud(pudp, __pud(__pa(pmd) | _PAGE_PRESENT)); 133 134 /* 135 * According to Intel App note "TLBs, Paging-Structure Caches, 136 * and Their Invalidation", April 2007, document 317080-001, 137 * section 8.1: in PAE mode we explicitly have to flush the 138 * TLB via cr3 if the top-level pgd is changed... 139 */ 140 flush_tlb_mm(mm); 141 } 142 #else /* !CONFIG_X86_PAE */ 143 144 /* No need to prepopulate any pagetable entries in non-PAE modes. */ 145 #define PREALLOCATED_PMDS 0 146 #define PREALLOCATED_USER_PMDS 0 147 #define MAX_PREALLOCATED_USER_PMDS 0 148 #endif /* CONFIG_X86_PAE */ 149 150 static void free_pmds(struct mm_struct *mm, pmd_t *pmds[], int count) 151 { 152 int i; 153 struct ptdesc *ptdesc; 154 155 for (i = 0; i < count; i++) 156 if (pmds[i]) { 157 ptdesc = virt_to_ptdesc(pmds[i]); 158 159 pagetable_dtor(ptdesc); 160 pagetable_free(ptdesc); 161 mm_dec_nr_pmds(mm); 162 } 163 } 164 165 static int preallocate_pmds(struct mm_struct *mm, pmd_t *pmds[], int count) 166 { 167 int i; 168 bool failed = false; 169 gfp_t gfp = GFP_PGTABLE_USER; 170 171 if (mm == &init_mm) 172 gfp &= ~__GFP_ACCOUNT; 173 gfp &= ~__GFP_HIGHMEM; 174 175 for (i = 0; i < count; i++) { 176 pmd_t *pmd = NULL; 177 struct ptdesc *ptdesc = pagetable_alloc(gfp, 0); 178 179 if (!ptdesc) 180 failed = true; 181 if (ptdesc && !pagetable_pmd_ctor(mm, ptdesc)) { 182 pagetable_free(ptdesc); 183 ptdesc = NULL; 184 failed = true; 185 } 186 if (ptdesc) { 187 mm_inc_nr_pmds(mm); 188 pmd = ptdesc_address(ptdesc); 189 } 190 191 pmds[i] = pmd; 192 } 193 194 if (failed) { 195 free_pmds(mm, pmds, count); 196 return -ENOMEM; 197 } 198 199 return 0; 200 } 201 202 /* 203 * Mop up any pmd pages which may still be attached to the pgd. 204 * Normally they will be freed by munmap/exit_mmap, but any pmd we 205 * preallocate which never got a corresponding vma will need to be 206 * freed manually. 207 */ 208 static void mop_up_one_pmd(struct mm_struct *mm, pgd_t *pgdp) 209 { 210 pgd_t pgd = *pgdp; 211 212 if (pgd_val(pgd) != 0) { 213 pmd_t *pmd = (pmd_t *)pgd_page_vaddr(pgd); 214 215 pgd_clear(pgdp); 216 217 paravirt_release_pmd(pgd_val(pgd) >> PAGE_SHIFT); 218 pmd_free(mm, pmd); 219 mm_dec_nr_pmds(mm); 220 } 221 } 222 223 static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp) 224 { 225 int i; 226 227 for (i = 0; i < PREALLOCATED_PMDS; i++) 228 mop_up_one_pmd(mm, &pgdp[i]); 229 230 #ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION 231 232 if (!boot_cpu_has(X86_FEATURE_PTI)) 233 return; 234 235 pgdp = kernel_to_user_pgdp(pgdp); 236 237 for (i = 0; i < PREALLOCATED_USER_PMDS; i++) 238 mop_up_one_pmd(mm, &pgdp[i + KERNEL_PGD_BOUNDARY]); 239 #endif 240 } 241 242 static void pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmds[]) 243 { 244 p4d_t *p4d; 245 pud_t *pud; 246 int i; 247 248 p4d = p4d_offset(pgd, 0); 249 pud = pud_offset(p4d, 0); 250 251 for (i = 0; i < PREALLOCATED_PMDS; i++, pud++) { 252 pmd_t *pmd = pmds[i]; 253 254 if (i >= KERNEL_PGD_BOUNDARY) 255 memcpy(pmd, (pmd_t *)pgd_page_vaddr(swapper_pg_dir[i]), 256 sizeof(pmd_t) * PTRS_PER_PMD); 257 258 pud_populate(mm, pud, pmd); 259 } 260 } 261 262 #ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION 263 static void pgd_prepopulate_user_pmd(struct mm_struct *mm, 264 pgd_t *k_pgd, pmd_t *pmds[]) 265 { 266 pgd_t *s_pgd = kernel_to_user_pgdp(swapper_pg_dir); 267 pgd_t *u_pgd = kernel_to_user_pgdp(k_pgd); 268 p4d_t *u_p4d; 269 pud_t *u_pud; 270 int i; 271 272 u_p4d = p4d_offset(u_pgd, 0); 273 u_pud = pud_offset(u_p4d, 0); 274 275 s_pgd += KERNEL_PGD_BOUNDARY; 276 u_pud += KERNEL_PGD_BOUNDARY; 277 278 for (i = 0; i < PREALLOCATED_USER_PMDS; i++, u_pud++, s_pgd++) { 279 pmd_t *pmd = pmds[i]; 280 281 memcpy(pmd, (pmd_t *)pgd_page_vaddr(*s_pgd), 282 sizeof(pmd_t) * PTRS_PER_PMD); 283 284 pud_populate(mm, u_pud, pmd); 285 } 286 287 } 288 #else 289 static void pgd_prepopulate_user_pmd(struct mm_struct *mm, 290 pgd_t *k_pgd, pmd_t *pmds[]) 291 { 292 } 293 #endif 294 295 static inline pgd_t *_pgd_alloc(struct mm_struct *mm) 296 { 297 /* 298 * PTI and Xen need a whole page for the PAE PGD 299 * even though the hardware only needs 32 bytes. 300 * 301 * For simplicity, allocate a page for all users. 302 */ 303 return __pgd_alloc(mm, pgd_allocation_order()); 304 } 305 306 static inline void _pgd_free(struct mm_struct *mm, pgd_t *pgd) 307 { 308 __pgd_free(mm, pgd); 309 } 310 311 pgd_t *pgd_alloc(struct mm_struct *mm) 312 { 313 pgd_t *pgd; 314 pmd_t *u_pmds[MAX_PREALLOCATED_USER_PMDS]; 315 pmd_t *pmds[PREALLOCATED_PMDS]; 316 317 pgd = _pgd_alloc(mm); 318 319 if (pgd == NULL) 320 goto out; 321 322 mm->pgd = pgd; 323 324 if (sizeof(pmds) != 0 && 325 preallocate_pmds(mm, pmds, PREALLOCATED_PMDS) != 0) 326 goto out_free_pgd; 327 328 if (sizeof(u_pmds) != 0 && 329 preallocate_pmds(mm, u_pmds, PREALLOCATED_USER_PMDS) != 0) 330 goto out_free_pmds; 331 332 if (paravirt_pgd_alloc(mm) != 0) 333 goto out_free_user_pmds; 334 335 /* 336 * Make sure that pre-populating the pmds is atomic with 337 * respect to anything walking the pgd_list, so that they 338 * never see a partially populated pgd. 339 */ 340 spin_lock(&pgd_lock); 341 342 pgd_ctor(mm, pgd); 343 if (sizeof(pmds) != 0) 344 pgd_prepopulate_pmd(mm, pgd, pmds); 345 346 if (sizeof(u_pmds) != 0) 347 pgd_prepopulate_user_pmd(mm, pgd, u_pmds); 348 349 spin_unlock(&pgd_lock); 350 351 return pgd; 352 353 out_free_user_pmds: 354 if (sizeof(u_pmds) != 0) 355 free_pmds(mm, u_pmds, PREALLOCATED_USER_PMDS); 356 out_free_pmds: 357 if (sizeof(pmds) != 0) 358 free_pmds(mm, pmds, PREALLOCATED_PMDS); 359 out_free_pgd: 360 _pgd_free(mm, pgd); 361 out: 362 return NULL; 363 } 364 365 void pgd_free(struct mm_struct *mm, pgd_t *pgd) 366 { 367 pgd_mop_up_pmds(mm, pgd); 368 pgd_dtor(pgd); 369 paravirt_pgd_free(mm, pgd); 370 _pgd_free(mm, pgd); 371 } 372 373 /* 374 * Used to set accessed or dirty bits in the page table entries 375 * on other architectures. On x86, the accessed and dirty bits 376 * are tracked by hardware. However, do_wp_page calls this function 377 * to also make the pte writeable at the same time the dirty bit is 378 * set. In that case we do actually need to write the PTE. 379 */ 380 int ptep_set_access_flags(struct vm_area_struct *vma, 381 unsigned long address, pte_t *ptep, 382 pte_t entry, int dirty) 383 { 384 int changed = !pte_same(*ptep, entry); 385 386 if (changed && dirty) 387 set_pte(ptep, entry); 388 389 return changed; 390 } 391 392 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 393 int pmdp_set_access_flags(struct vm_area_struct *vma, 394 unsigned long address, pmd_t *pmdp, 395 pmd_t entry, int dirty) 396 { 397 int changed = !pmd_same(*pmdp, entry); 398 399 VM_BUG_ON(address & ~HPAGE_PMD_MASK); 400 401 if (changed && dirty) { 402 set_pmd(pmdp, entry); 403 /* 404 * We had a write-protection fault here and changed the pmd 405 * to to more permissive. No need to flush the TLB for that, 406 * #PF is architecturally guaranteed to do that and in the 407 * worst-case we'll generate a spurious fault. 408 */ 409 } 410 411 return changed; 412 } 413 414 int pudp_set_access_flags(struct vm_area_struct *vma, unsigned long address, 415 pud_t *pudp, pud_t entry, int dirty) 416 { 417 int changed = !pud_same(*pudp, entry); 418 419 VM_BUG_ON(address & ~HPAGE_PUD_MASK); 420 421 if (changed && dirty) { 422 set_pud(pudp, entry); 423 /* 424 * We had a write-protection fault here and changed the pud 425 * to to more permissive. No need to flush the TLB for that, 426 * #PF is architecturally guaranteed to do that and in the 427 * worst-case we'll generate a spurious fault. 428 */ 429 } 430 431 return changed; 432 } 433 #endif 434 435 bool ptep_test_and_clear_young(struct vm_area_struct *vma, 436 unsigned long addr, pte_t *ptep) 437 { 438 bool ret = false; 439 440 if (pte_young(*ptep)) 441 ret = test_and_clear_bit(_PAGE_BIT_ACCESSED, 442 (unsigned long *) &ptep->pte); 443 444 return ret; 445 } 446 447 #if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG) 448 bool pmdp_test_and_clear_young(struct vm_area_struct *vma, 449 unsigned long addr, pmd_t *pmdp) 450 { 451 bool ret = false; 452 453 if (pmd_young(*pmdp)) 454 ret = test_and_clear_bit(_PAGE_BIT_ACCESSED, 455 (unsigned long *)pmdp); 456 457 return ret; 458 } 459 #endif 460 461 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 462 bool pudp_test_and_clear_young(struct vm_area_struct *vma, 463 unsigned long addr, pud_t *pudp) 464 { 465 bool ret = false; 466 467 if (pud_young(*pudp)) 468 ret = test_and_clear_bit(_PAGE_BIT_ACCESSED, 469 (unsigned long *)pudp); 470 471 return ret; 472 } 473 #endif 474 475 bool ptep_clear_flush_young(struct vm_area_struct *vma, 476 unsigned long address, pte_t *ptep) 477 { 478 /* 479 * On x86 CPUs, clearing the accessed bit without a TLB flush 480 * doesn't cause data corruption. [ It could cause incorrect 481 * page aging and the (mistaken) reclaim of hot pages, but the 482 * chance of that should be relatively low. ] 483 * 484 * So as a performance optimization don't flush the TLB when 485 * clearing the accessed bit, it will eventually be flushed by 486 * a context switch or a VM operation anyway. [ In the rare 487 * event of it not getting flushed for a long time the delay 488 * shouldn't really matter because there's no real memory 489 * pressure for swapout to react to. ] 490 */ 491 return ptep_test_and_clear_young(vma, address, ptep); 492 } 493 494 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 495 bool pmdp_clear_flush_young(struct vm_area_struct *vma, 496 unsigned long address, pmd_t *pmdp) 497 { 498 bool young; 499 500 VM_BUG_ON(address & ~HPAGE_PMD_MASK); 501 502 young = pmdp_test_and_clear_young(vma, address, pmdp); 503 if (young) 504 flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); 505 506 return young; 507 } 508 509 pmd_t pmdp_invalidate_ad(struct vm_area_struct *vma, unsigned long address, 510 pmd_t *pmdp) 511 { 512 VM_WARN_ON_ONCE(!pmd_present(*pmdp)); 513 514 /* 515 * No flush is necessary. Once an invalid PTE is established, the PTE's 516 * access and dirty bits cannot be updated. 517 */ 518 return pmdp_establish(vma, address, pmdp, pmd_mkinvalid(*pmdp)); 519 } 520 #endif 521 522 #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && \ 523 defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) 524 pud_t pudp_invalidate(struct vm_area_struct *vma, unsigned long address, 525 pud_t *pudp) 526 { 527 VM_WARN_ON_ONCE(!pud_present(*pudp)); 528 pud_t old = pudp_establish(vma, address, pudp, pud_mkinvalid(*pudp)); 529 flush_pud_tlb_range(vma, address, address + HPAGE_PUD_SIZE); 530 return old; 531 } 532 #endif 533 534 /** 535 * reserve_top_address - Reserve a hole in the top of the kernel address space 536 * @reserve: Size of hole to reserve 537 * 538 * Can be used to relocate the fixmap area and poke a hole in the top 539 * of the kernel address space to make room for a hypervisor. 540 */ 541 void __init reserve_top_address(unsigned long reserve) 542 { 543 #ifdef CONFIG_X86_32 544 BUG_ON(fixmaps_set > 0); 545 __FIXADDR_TOP = round_down(-reserve, 1 << PMD_SHIFT) - PAGE_SIZE; 546 printk(KERN_INFO "Reserving virtual address space above 0x%08lx (rounded to 0x%08lx)\n", 547 -reserve, __FIXADDR_TOP + PAGE_SIZE); 548 #endif 549 } 550 551 int fixmaps_set; 552 553 void __native_set_fixmap(enum fixed_addresses idx, pte_t pte) 554 { 555 unsigned long address = __fix_to_virt(idx); 556 557 #ifdef CONFIG_X86_64 558 /* 559 * Ensure that the static initial page tables are covering the 560 * fixmap completely. 561 */ 562 BUILD_BUG_ON(__end_of_permanent_fixed_addresses > 563 (FIXMAP_PMD_NUM * PTRS_PER_PTE)); 564 #endif 565 566 if (idx >= __end_of_fixed_addresses) { 567 BUG(); 568 return; 569 } 570 set_pte_vaddr(address, pte); 571 fixmaps_set++; 572 } 573 574 void native_set_fixmap(unsigned /* enum fixed_addresses */ idx, 575 phys_addr_t phys, pgprot_t flags) 576 { 577 /* Sanitize 'prot' against any unsupported bits: */ 578 pgprot_val(flags) &= __default_kernel_pte_mask; 579 580 __native_set_fixmap(idx, pfn_pte(phys >> PAGE_SHIFT, flags)); 581 } 582 583 #ifdef CONFIG_HAVE_ARCH_HUGE_VMAP 584 #if CONFIG_PGTABLE_LEVELS > 4 585 /** 586 * p4d_set_huge - Set up kernel P4D mapping 587 * @p4d: Pointer to the P4D entry 588 * @addr: Virtual address associated with the P4D entry 589 * @prot: Protection bits to use 590 * 591 * No 512GB pages yet -- always return 0 592 */ 593 int p4d_set_huge(p4d_t *p4d, phys_addr_t addr, pgprot_t prot) 594 { 595 return 0; 596 } 597 598 /** 599 * p4d_clear_huge - Clear kernel P4D mapping when it is set 600 * @p4d: Pointer to the P4D entry to clear 601 * 602 * No 512GB pages yet -- do nothing 603 */ 604 void p4d_clear_huge(p4d_t *p4d) 605 { 606 } 607 #endif 608 609 /** 610 * pud_set_huge - Set up kernel PUD mapping 611 * @pud: Pointer to the PUD entry 612 * @addr: Virtual address associated with the PUD entry 613 * @prot: Protection bits to use 614 * 615 * MTRRs can override PAT memory types with 4KiB granularity. Therefore, this 616 * function sets up a huge page only if the complete range has the same MTRR 617 * caching mode. 618 * 619 * Callers should try to decrease page size (1GB -> 2MB -> 4K) if the bigger 620 * page mapping attempt fails. 621 * 622 * Returns 1 on success and 0 on failure. 623 */ 624 int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot) 625 { 626 u8 uniform; 627 628 mtrr_type_lookup(addr, addr + PUD_SIZE, &uniform); 629 if (!uniform) 630 return 0; 631 632 /* Bail out if we are we on a populated non-leaf entry: */ 633 if (pud_present(*pud) && !pud_leaf(*pud)) 634 return 0; 635 636 set_pte((pte_t *)pud, pfn_pte( 637 (u64)addr >> PAGE_SHIFT, 638 __pgprot(protval_4k_2_large(pgprot_val(prot)) | _PAGE_PSE))); 639 640 return 1; 641 } 642 643 /** 644 * pmd_set_huge - Set up kernel PMD mapping 645 * @pmd: Pointer to the PMD entry 646 * @addr: Virtual address associated with the PMD entry 647 * @prot: Protection bits to use 648 * 649 * See text over pud_set_huge() above. 650 * 651 * Returns 1 on success and 0 on failure. 652 */ 653 int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot) 654 { 655 u8 uniform; 656 657 mtrr_type_lookup(addr, addr + PMD_SIZE, &uniform); 658 if (!uniform) { 659 pr_warn_once("%s: Cannot satisfy [mem %#010llx-%#010llx] with a huge-page mapping due to MTRR override.\n", 660 __func__, addr, addr + PMD_SIZE); 661 return 0; 662 } 663 664 /* Bail out if we are we on a populated non-leaf entry: */ 665 if (pmd_present(*pmd) && !pmd_leaf(*pmd)) 666 return 0; 667 668 set_pte((pte_t *)pmd, pfn_pte( 669 (u64)addr >> PAGE_SHIFT, 670 __pgprot(protval_4k_2_large(pgprot_val(prot)) | _PAGE_PSE))); 671 672 return 1; 673 } 674 675 /** 676 * pud_clear_huge - Clear kernel PUD mapping when it is set 677 * @pud: Pointer to the PUD entry to clear. 678 * 679 * Returns 1 on success and 0 on failure (no PUD map is found). 680 */ 681 int pud_clear_huge(pud_t *pud) 682 { 683 if (pud_leaf(*pud)) { 684 pud_clear(pud); 685 return 1; 686 } 687 688 return 0; 689 } 690 691 /** 692 * pmd_clear_huge - Clear kernel PMD mapping when it is set 693 * @pmd: Pointer to the PMD entry to clear. 694 * 695 * Returns 1 on success and 0 on failure (no PMD map is found). 696 */ 697 int pmd_clear_huge(pmd_t *pmd) 698 { 699 if (pmd_leaf(*pmd)) { 700 pmd_clear(pmd); 701 return 1; 702 } 703 704 return 0; 705 } 706 707 #ifdef CONFIG_X86_64 708 /** 709 * pud_free_pmd_page - Clear PUD entry and free PMD page 710 * @pud: Pointer to a PUD 711 * @addr: Virtual address associated with PUD 712 * 713 * Context: The PUD range has been unmapped and TLB purged. 714 * Return: 1 if clearing the entry succeeded. 0 otherwise. 715 * 716 * NOTE: Callers must allow a single page allocation. 717 */ 718 int pud_free_pmd_page(pud_t *pud, unsigned long addr) 719 { 720 pmd_t *pmd, *pmd_sv; 721 struct ptdesc *pt; 722 int i; 723 724 pmd = pud_pgtable(*pud); 725 pmd_sv = (pmd_t *)__get_free_page(GFP_KERNEL); 726 if (!pmd_sv) 727 return 0; 728 729 for (i = 0; i < PTRS_PER_PMD; i++) { 730 pmd_sv[i] = pmd[i]; 731 if (!pmd_none(pmd[i])) 732 pmd_clear(&pmd[i]); 733 } 734 735 pud_clear(pud); 736 737 /* INVLPG to clear all paging-structure caches */ 738 flush_tlb_kernel_range(addr, addr + PAGE_SIZE-1); 739 740 for (i = 0; i < PTRS_PER_PMD; i++) { 741 if (!pmd_none(pmd_sv[i])) { 742 pt = page_ptdesc(pmd_page(pmd_sv[i])); 743 pagetable_dtor_free(pt); 744 } 745 } 746 747 free_page((unsigned long)pmd_sv); 748 749 pmd_free(&init_mm, pmd); 750 751 return 1; 752 } 753 754 /** 755 * pmd_free_pte_page - Clear PMD entry and free PTE page. 756 * @pmd: Pointer to the PMD 757 * @addr: Virtual address associated with PMD 758 * 759 * Context: The PMD range has been unmapped and TLB purged. 760 * Return: 1 if clearing the entry succeeded. 0 otherwise. 761 */ 762 int pmd_free_pte_page(pmd_t *pmd, unsigned long addr) 763 { 764 struct ptdesc *pt; 765 766 pt = page_ptdesc(pmd_page(*pmd)); 767 pmd_clear(pmd); 768 769 /* INVLPG to clear all paging-structure caches */ 770 flush_tlb_kernel_range(addr, addr + PAGE_SIZE-1); 771 772 pagetable_dtor_free(pt); 773 774 return 1; 775 } 776 777 #else /* !CONFIG_X86_64 */ 778 779 /* 780 * Disable free page handling on x86-PAE. This assures that ioremap() 781 * does not update sync'd PMD entries. See vmalloc_sync_one(). 782 */ 783 int pmd_free_pte_page(pmd_t *pmd, unsigned long addr) 784 { 785 return pmd_none(*pmd); 786 } 787 788 #endif /* CONFIG_X86_64 */ 789 #endif /* CONFIG_HAVE_ARCH_HUGE_VMAP */ 790 791 pte_t pte_mkwrite(pte_t pte, struct vm_area_struct *vma) 792 { 793 if (vma->vm_flags & VM_SHADOW_STACK) 794 return pte_mkwrite_shstk(pte); 795 796 pte = pte_mkwrite_novma(pte); 797 798 return pte_clear_saveddirty(pte); 799 } 800 801 pmd_t pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma) 802 { 803 if (vma->vm_flags & VM_SHADOW_STACK) 804 return pmd_mkwrite_shstk(pmd); 805 806 pmd = pmd_mkwrite_novma(pmd); 807 808 return pmd_clear_saveddirty(pmd); 809 } 810 811 void arch_check_zapped_pte(struct vm_area_struct *vma, pte_t pte) 812 { 813 /* 814 * Hardware before shadow stack can (rarely) set Dirty=1 815 * on a Write=0 PTE. So the below condition 816 * only indicates a software bug when shadow stack is 817 * supported by the HW. This checking is covered in 818 * pte_shstk(). 819 */ 820 VM_WARN_ON_ONCE(!(vma->vm_flags & VM_SHADOW_STACK) && 821 pte_shstk(pte)); 822 } 823 824 void arch_check_zapped_pmd(struct vm_area_struct *vma, pmd_t pmd) 825 { 826 /* See note in arch_check_zapped_pte() */ 827 VM_WARN_ON_ONCE(!(vma->vm_flags & VM_SHADOW_STACK) && 828 pmd_shstk(pmd)); 829 } 830 831 void arch_check_zapped_pud(struct vm_area_struct *vma, pud_t pud) 832 { 833 /* See note in arch_check_zapped_pte() */ 834 VM_WARN_ON_ONCE(!(vma->vm_flags & VM_SHADOW_STACK) && pud_shstk(pud)); 835 } 836