// SPDX-License-Identifier: GPL-2.0
#include <linux/mm.h>
#include <linux/gfp.h>
#include <linux/hugetlb.h>
#include <asm/pgalloc.h>
#include <asm/tlb.h>
#include <asm/fixmap.h>
#include <asm/mtrr.h>

#ifdef CONFIG_DYNAMIC_PHYSICAL_MASK
phys_addr_t physical_mask __ro_after_init = (1ULL << __PHYSICAL_MASK_SHIFT) - 1;
EXPORT_SYMBOL(physical_mask);
#endif

#ifdef CONFIG_HIGHPTE
#define PGTABLE_HIGHMEM __GFP_HIGHMEM
#else
#define PGTABLE_HIGHMEM 0
#endif

#ifndef CONFIG_PARAVIRT
#ifndef CONFIG_PT_RECLAIM
static inline
void paravirt_tlb_remove_table(struct mmu_gather *tlb, void *table)
{
	struct ptdesc *ptdesc = (struct ptdesc *)table;

	pagetable_dtor(ptdesc);
	tlb_remove_page(tlb, ptdesc_page(ptdesc));
}
#else
static inline
void paravirt_tlb_remove_table(struct mmu_gather *tlb, void *table)
{
	tlb_remove_table(tlb, table);
}
#endif /* !CONFIG_PT_RECLAIM */
#endif /* !CONFIG_PARAVIRT */

gfp_t __userpte_alloc_gfp = GFP_PGTABLE_USER | PGTABLE_HIGHMEM;

pgtable_t pte_alloc_one(struct mm_struct *mm)
{
	return __pte_alloc_one(mm, __userpte_alloc_gfp);
}

static int __init setup_userpte(char *arg)
{
	if (!arg)
		return -EINVAL;

	/*
	 * "userpte=nohigh" disables allocation of user pagetables in
	 * high memory.
	 */
	if (strcmp(arg, "nohigh") == 0)
		__userpte_alloc_gfp &= ~__GFP_HIGHMEM;
	else
		return -EINVAL;
	return 0;
}
early_param("userpte", setup_userpte);

void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte)
{
	paravirt_release_pte(page_to_pfn(pte));
	paravirt_tlb_remove_table(tlb, page_ptdesc(pte));
}

#if CONFIG_PGTABLE_LEVELS > 2
void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd)
{
	paravirt_release_pmd(__pa(pmd) >> PAGE_SHIFT);
	/*
	 * NOTE! For PAE, any changes to the top page-directory-pointer-table
	 * entries need a full cr3 reload to flush.
	 */
#ifdef CONFIG_X86_PAE
	tlb->need_flush_all = 1;
#endif
	paravirt_tlb_remove_table(tlb, virt_to_ptdesc(pmd));
}

#if CONFIG_PGTABLE_LEVELS > 3
void ___pud_free_tlb(struct mmu_gather *tlb, pud_t *pud)
{
	paravirt_release_pud(__pa(pud) >> PAGE_SHIFT);
	paravirt_tlb_remove_table(tlb, virt_to_ptdesc(pud));
}

#if CONFIG_PGTABLE_LEVELS > 4
void ___p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d)
{
	paravirt_release_p4d(__pa(p4d) >> PAGE_SHIFT);
	paravirt_tlb_remove_table(tlb, virt_to_ptdesc(p4d));
}
#endif	/* CONFIG_PGTABLE_LEVELS > 4 */
#endif	/* CONFIG_PGTABLE_LEVELS > 3 */
#endif	/* CONFIG_PGTABLE_LEVELS > 2 */

static inline void pgd_list_add(pgd_t *pgd)
{
	struct ptdesc *ptdesc = virt_to_ptdesc(pgd);

	list_add(&ptdesc->pt_list, &pgd_list);
}

static inline void pgd_list_del(pgd_t *pgd)
{
	struct ptdesc *ptdesc = virt_to_ptdesc(pgd);

	list_del(&ptdesc->pt_list);
}
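/*
 * Number of pgd entries that are private to each pgd: with a shared
 * kernel pmd only the user part (below KERNEL_PGD_BOUNDARY) differs
 * between pgds, otherwise every one of the PTRS_PER_PGD entries does.
 * MAX_UNSHARED_PTRS_PER_PGD is the compile-time upper bound of the two,
 * used (via MAX_PREALLOCATED_PMDS) to size the on-stack pmd arrays in
 * pgd_alloc().
 */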
#define UNSHARED_PTRS_PER_PGD				\
	(SHARED_KERNEL_PMD ? KERNEL_PGD_BOUNDARY : PTRS_PER_PGD)
#define MAX_UNSHARED_PTRS_PER_PGD			\
	MAX_T(size_t, KERNEL_PGD_BOUNDARY, PTRS_PER_PGD)


static void pgd_set_mm(pgd_t *pgd, struct mm_struct *mm)
{
	virt_to_ptdesc(pgd)->pt_mm = mm;
}

struct mm_struct *pgd_page_get_mm(struct page *page)
{
	return page_ptdesc(page)->pt_mm;
}

static void pgd_ctor(struct mm_struct *mm, pgd_t *pgd)
{
	/* If the pgd points to a shared pagetable level (either the
	   ptes in non-PAE, or shared PMD in PAE), then just copy the
	   references from swapper_pg_dir. */
	if (CONFIG_PGTABLE_LEVELS == 2 ||
	    (CONFIG_PGTABLE_LEVELS == 3 && SHARED_KERNEL_PMD) ||
	    CONFIG_PGTABLE_LEVELS >= 4) {
		clone_pgd_range(pgd + KERNEL_PGD_BOUNDARY,
				swapper_pg_dir + KERNEL_PGD_BOUNDARY,
				KERNEL_PGD_PTRS);
	}

	/* list required to sync kernel mapping updates */
	if (!SHARED_KERNEL_PMD) {
		pgd_set_mm(pgd, mm);
		pgd_list_add(pgd);
	}
}

static void pgd_dtor(pgd_t *pgd)
{
	if (SHARED_KERNEL_PMD)
		return;

	spin_lock(&pgd_lock);
	pgd_list_del(pgd);
	spin_unlock(&pgd_lock);
}

/*
 * List of all pgd's needed for non-PAE so it can invalidate entries
 * in both cached and uncached pgd's; not needed for PAE since the
 * kernel pmd is shared. If PAE were not to share the pmd a similar
 * tactic would be needed. This is essentially codepath-based locking
 * against pageattr.c; it is the unique case in which a valid change
 * of kernel pagetables can't be lazily synchronized by vmalloc faults.
 * vmalloc faults work because attached pagetables are never freed.
 * -- nyc
 */

#ifdef CONFIG_X86_PAE
/*
 * In PAE mode, we need to do a cr3 reload (=tlb flush) when
 * updating the top-level pagetable entries to guarantee the
 * processor notices the update. Since this is expensive, and
 * all 4 top-level entries are used almost immediately in a
 * new process's life, we just pre-populate them here.
 *
 * Also, if we're in a paravirt environment where the kernel pmd is
 * not shared between pagetables (!SHARED_KERNEL_PMD), we allocate
 * and initialize the kernel pmds here.
 */
#define PREALLOCATED_PMDS	UNSHARED_PTRS_PER_PGD
#define MAX_PREALLOCATED_PMDS	MAX_UNSHARED_PTRS_PER_PGD

/*
 * We allocate separate PMDs for the kernel part of the user page-table
 * when PTI is enabled. We need them to map the per-process LDT into the
 * user-space page-table.
 */
#define PREALLOCATED_USER_PMDS	(boot_cpu_has(X86_FEATURE_PTI) ?	\
					KERNEL_PGD_PTRS : 0)
#define MAX_PREALLOCATED_USER_PMDS KERNEL_PGD_PTRS

void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd)
{
	paravirt_alloc_pmd(mm, __pa(pmd) >> PAGE_SHIFT);

	/* Note: almost everything apart from _PAGE_PRESENT is
	   reserved at the pmd (PDPT) level. */
	set_pud(pudp, __pud(__pa(pmd) | _PAGE_PRESENT));

	/*
	 * According to Intel App note "TLBs, Paging-Structure Caches,
	 * and Their Invalidation", April 2007, document 317080-001,
	 * section 8.1: in PAE mode we explicitly have to flush the
	 * TLB via cr3 if the top-level pgd is changed...
	 */
	flush_tlb_mm(mm);
}
#else  /* !CONFIG_X86_PAE */

/* No need to prepopulate any pagetable entries in non-PAE modes. */
#define PREALLOCATED_PMDS	0
#define MAX_PREALLOCATED_PMDS	0
#define PREALLOCATED_USER_PMDS	0
#define MAX_PREALLOCATED_USER_PMDS 0
#endif	/* CONFIG_X86_PAE */

static void free_pmds(struct mm_struct *mm, pmd_t *pmds[], int count)
{
	int i;
	struct ptdesc *ptdesc;

	for (i = 0; i < count; i++)
		if (pmds[i]) {
			ptdesc = virt_to_ptdesc(pmds[i]);

			pagetable_dtor(ptdesc);
			pagetable_free(ptdesc);
			mm_dec_nr_pmds(mm);
		}
}

static int preallocate_pmds(struct mm_struct *mm, pmd_t *pmds[], int count)
{
	int i;
	bool failed = false;
	gfp_t gfp = GFP_PGTABLE_USER;

	if (mm == &init_mm)
		gfp &= ~__GFP_ACCOUNT;
	gfp &= ~__GFP_HIGHMEM;

	for (i = 0; i < count; i++) {
		pmd_t *pmd = NULL;
		struct ptdesc *ptdesc = pagetable_alloc(gfp, 0);

		if (!ptdesc)
			failed = true;
		if (ptdesc && !pagetable_pmd_ctor(ptdesc)) {
			pagetable_free(ptdesc);
			ptdesc = NULL;
			failed = true;
		}
		if (ptdesc) {
			mm_inc_nr_pmds(mm);
			pmd = ptdesc_address(ptdesc);
		}

		pmds[i] = pmd;
	}

	if (failed) {
		free_pmds(mm, pmds, count);
		return -ENOMEM;
	}

	return 0;
}

/*
 * Mop up any pmd pages which may still be attached to the pgd.
 * Normally they will be freed by munmap/exit_mmap, but any pmd we
 * preallocate which never got a corresponding vma will need to be
 * freed manually.
 */
static void mop_up_one_pmd(struct mm_struct *mm, pgd_t *pgdp)
{
	pgd_t pgd = *pgdp;

	if (pgd_val(pgd) != 0) {
		pmd_t *pmd = (pmd_t *)pgd_page_vaddr(pgd);

		pgd_clear(pgdp);

		paravirt_release_pmd(pgd_val(pgd) >> PAGE_SHIFT);
		pmd_free(mm, pmd);
		mm_dec_nr_pmds(mm);
	}
}

static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp)
{
	int i;

	for (i = 0; i < PREALLOCATED_PMDS; i++)
		mop_up_one_pmd(mm, &pgdp[i]);

#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION

	if (!boot_cpu_has(X86_FEATURE_PTI))
		return;

	pgdp = kernel_to_user_pgdp(pgdp);

	for (i = 0; i < PREALLOCATED_USER_PMDS; i++)
		mop_up_one_pmd(mm, &pgdp[i + KERNEL_PGD_BOUNDARY]);
#endif
}

static void pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmds[])
{
	p4d_t *p4d;
	pud_t *pud;
	int i;

	p4d = p4d_offset(pgd, 0);
	pud = pud_offset(p4d, 0);

	for (i = 0; i < PREALLOCATED_PMDS; i++, pud++) {
		pmd_t *pmd = pmds[i];

		if (i >= KERNEL_PGD_BOUNDARY)
			memcpy(pmd, (pmd_t *)pgd_page_vaddr(swapper_pg_dir[i]),
			       sizeof(pmd_t) * PTRS_PER_PMD);

		pud_populate(mm, pud, pmd);
	}
}

#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION
static void pgd_prepopulate_user_pmd(struct mm_struct *mm,
				     pgd_t *k_pgd, pmd_t *pmds[])
{
	pgd_t *s_pgd = kernel_to_user_pgdp(swapper_pg_dir);
	pgd_t *u_pgd = kernel_to_user_pgdp(k_pgd);
	p4d_t *u_p4d;
	pud_t *u_pud;
	int i;

	u_p4d = p4d_offset(u_pgd, 0);
	u_pud = pud_offset(u_p4d, 0);

	s_pgd += KERNEL_PGD_BOUNDARY;
	u_pud += KERNEL_PGD_BOUNDARY;

	for (i = 0; i < PREALLOCATED_USER_PMDS; i++, u_pud++, s_pgd++) {
		pmd_t *pmd = pmds[i];

		memcpy(pmd, (pmd_t *)pgd_page_vaddr(*s_pgd),
		       sizeof(pmd_t) * PTRS_PER_PMD);

		pud_populate(mm, u_pud, pmd);
	}
}
#else
static void pgd_prepopulate_user_pmd(struct mm_struct *mm,
				     pgd_t *k_pgd, pmd_t *pmds[])
{
}
#endif
/*
 * Xen paravirt assumes the pgd table is in one page. The 64-bit kernel also
 * assumes that the pgd is in one page.
 *
 * But a kernel with PAE paging that is not running as a Xen domain
 * only needs to allocate 32 bytes for the pgd instead of one page.
 */
#ifdef CONFIG_X86_PAE

#include <linux/slab.h>

#define PGD_SIZE	(PTRS_PER_PGD * sizeof(pgd_t))
#define PGD_ALIGN	32

static struct kmem_cache *pgd_cache;

void __init pgtable_cache_init(void)
{
	/*
	 * When a PAE kernel is running as a Xen domain, it does not use
	 * a shared kernel pmd, and that requires a whole page for the pgd.
	 */
	if (!SHARED_KERNEL_PMD)
		return;

	/*
	 * When a PAE kernel is not running as a Xen domain, it uses a
	 * shared kernel pmd, which does not require a whole page for the
	 * pgd: 32 bytes are enough. Create a 32-byte slab cache for pgd
	 * table allocations at boot time.
	 */
	pgd_cache = kmem_cache_create("pgd_cache", PGD_SIZE, PGD_ALIGN,
				      SLAB_PANIC, NULL);
}

static inline pgd_t *_pgd_alloc(struct mm_struct *mm)
{
	/*
	 * Without SHARED_KERNEL_PMD the PAE kernel is running as a Xen
	 * domain, so allocate one page for the pgd.
	 */
	if (!SHARED_KERNEL_PMD)
		return __pgd_alloc(mm, PGD_ALLOCATION_ORDER);

	/*
	 * The PAE kernel is not running as a Xen domain, so allocate the
	 * pgd from the 32-byte slab to save memory.
	 */
	return kmem_cache_alloc(pgd_cache, GFP_PGTABLE_USER);
}

static inline void _pgd_free(struct mm_struct *mm, pgd_t *pgd)
{
	if (!SHARED_KERNEL_PMD)
		__pgd_free(mm, pgd);
	else
		kmem_cache_free(pgd_cache, pgd);
}
#else

static inline pgd_t *_pgd_alloc(struct mm_struct *mm)
{
	return __pgd_alloc(mm, PGD_ALLOCATION_ORDER);
}

static inline void _pgd_free(struct mm_struct *mm, pgd_t *pgd)
{
	__pgd_free(mm, pgd);
}
#endif /* CONFIG_X86_PAE */

pgd_t *pgd_alloc(struct mm_struct *mm)
{
	pgd_t *pgd;
	pmd_t *u_pmds[MAX_PREALLOCATED_USER_PMDS];
	pmd_t *pmds[MAX_PREALLOCATED_PMDS];

	pgd = _pgd_alloc(mm);

	if (pgd == NULL)
		goto out;

	mm->pgd = pgd;

	if (sizeof(pmds) != 0 &&
	    preallocate_pmds(mm, pmds, PREALLOCATED_PMDS) != 0)
		goto out_free_pgd;

	if (sizeof(u_pmds) != 0 &&
	    preallocate_pmds(mm, u_pmds, PREALLOCATED_USER_PMDS) != 0)
		goto out_free_pmds;

	if (paravirt_pgd_alloc(mm) != 0)
		goto out_free_user_pmds;

	/*
	 * Make sure that pre-populating the pmds is atomic with
	 * respect to anything walking the pgd_list, so that they
	 * never see a partially populated pgd.
	 */
	spin_lock(&pgd_lock);

	pgd_ctor(mm, pgd);
	if (sizeof(pmds) != 0)
		pgd_prepopulate_pmd(mm, pgd, pmds);

	if (sizeof(u_pmds) != 0)
		pgd_prepopulate_user_pmd(mm, pgd, u_pmds);

	spin_unlock(&pgd_lock);

	return pgd;

out_free_user_pmds:
	if (sizeof(u_pmds) != 0)
		free_pmds(mm, u_pmds, PREALLOCATED_USER_PMDS);
out_free_pmds:
	if (sizeof(pmds) != 0)
		free_pmds(mm, pmds, PREALLOCATED_PMDS);
out_free_pgd:
	_pgd_free(mm, pgd);
out:
	return NULL;
}

void pgd_free(struct mm_struct *mm, pgd_t *pgd)
{
	pgd_mop_up_pmds(mm, pgd);
	pgd_dtor(pgd);
	paravirt_pgd_free(mm, pgd);
	_pgd_free(mm, pgd);
}
/*
 * Used to set accessed or dirty bits in the page table entries
 * on other architectures. On x86, the accessed and dirty bits
 * are tracked by hardware. However, do_wp_page calls this function
 * to also make the pte writeable at the same time the dirty bit is
 * set. In that case we do actually need to write the PTE.
 */
int ptep_set_access_flags(struct vm_area_struct *vma,
			  unsigned long address, pte_t *ptep,
			  pte_t entry, int dirty)
{
	int changed = !pte_same(*ptep, entry);

	if (changed && dirty)
		set_pte(ptep, entry);

	return changed;
}

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
int pmdp_set_access_flags(struct vm_area_struct *vma,
			  unsigned long address, pmd_t *pmdp,
			  pmd_t entry, int dirty)
{
	int changed = !pmd_same(*pmdp, entry);

	VM_BUG_ON(address & ~HPAGE_PMD_MASK);

	if (changed && dirty) {
		set_pmd(pmdp, entry);
		/*
		 * We had a write-protection fault here and changed the pmd
		 * to be more permissive. No need to flush the TLB for that,
		 * #PF is architecturally guaranteed to do that and in the
		 * worst-case we'll generate a spurious fault.
		 */
	}

	return changed;
}

int pudp_set_access_flags(struct vm_area_struct *vma, unsigned long address,
			  pud_t *pudp, pud_t entry, int dirty)
{
	int changed = !pud_same(*pudp, entry);

	VM_BUG_ON(address & ~HPAGE_PUD_MASK);

	if (changed && dirty) {
		set_pud(pudp, entry);
		/*
		 * We had a write-protection fault here and changed the pud
		 * to be more permissive. No need to flush the TLB for that,
		 * #PF is architecturally guaranteed to do that and in the
		 * worst-case we'll generate a spurious fault.
		 */
	}

	return changed;
}
#endif

int ptep_test_and_clear_young(struct vm_area_struct *vma,
			      unsigned long addr, pte_t *ptep)
{
	int ret = 0;

	if (pte_young(*ptep))
		ret = test_and_clear_bit(_PAGE_BIT_ACCESSED,
					 (unsigned long *) &ptep->pte);

	return ret;
}

#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG)
int pmdp_test_and_clear_young(struct vm_area_struct *vma,
			      unsigned long addr, pmd_t *pmdp)
{
	int ret = 0;

	if (pmd_young(*pmdp))
		ret = test_and_clear_bit(_PAGE_BIT_ACCESSED,
					 (unsigned long *)pmdp);

	return ret;
}
#endif

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
int pudp_test_and_clear_young(struct vm_area_struct *vma,
			      unsigned long addr, pud_t *pudp)
{
	int ret = 0;

	if (pud_young(*pudp))
		ret = test_and_clear_bit(_PAGE_BIT_ACCESSED,
					 (unsigned long *)pudp);

	return ret;
}
#endif
int ptep_clear_flush_young(struct vm_area_struct *vma,
			   unsigned long address, pte_t *ptep)
{
	/*
	 * On x86 CPUs, clearing the accessed bit without a TLB flush
	 * doesn't cause data corruption. [ It could cause incorrect
	 * page aging and the (mistaken) reclaim of hot pages, but the
	 * chance of that should be relatively low. ]
	 *
	 * So as a performance optimization don't flush the TLB when
	 * clearing the accessed bit, it will eventually be flushed by
	 * a context switch or a VM operation anyway. [ In the rare
	 * event of it not getting flushed for a long time the delay
	 * shouldn't really matter because there's no real memory
	 * pressure for swapout to react to. ]
	 */
	return ptep_test_and_clear_young(vma, address, ptep);
}

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
int pmdp_clear_flush_young(struct vm_area_struct *vma,
			   unsigned long address, pmd_t *pmdp)
{
	int young;

	VM_BUG_ON(address & ~HPAGE_PMD_MASK);

	young = pmdp_test_and_clear_young(vma, address, pmdp);
	if (young)
		flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);

	return young;
}

pmd_t pmdp_invalidate_ad(struct vm_area_struct *vma, unsigned long address,
			 pmd_t *pmdp)
{
	VM_WARN_ON_ONCE(!pmd_present(*pmdp));

	/*
	 * No flush is necessary. Once an invalid PTE is established, the PTE's
	 * access and dirty bits cannot be updated.
	 */
	return pmdp_establish(vma, address, pmdp, pmd_mkinvalid(*pmdp));
}
#endif

#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && \
	defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
pud_t pudp_invalidate(struct vm_area_struct *vma, unsigned long address,
		      pud_t *pudp)
{
	VM_WARN_ON_ONCE(!pud_present(*pudp));
	pud_t old = pudp_establish(vma, address, pudp, pud_mkinvalid(*pudp));

	flush_pud_tlb_range(vma, address, address + HPAGE_PUD_SIZE);
	return old;
}
#endif

/**
 * reserve_top_address - reserves a hole in the top of kernel address space
 * @reserve: size of hole to reserve
 *
 * Can be used to relocate the fixmap area and poke a hole in the top
 * of kernel address space to make room for a hypervisor.
 */
void __init reserve_top_address(unsigned long reserve)
{
#ifdef CONFIG_X86_32
	BUG_ON(fixmaps_set > 0);
	__FIXADDR_TOP = round_down(-reserve, 1 << PMD_SHIFT) - PAGE_SIZE;
	printk(KERN_INFO "Reserving virtual address space above 0x%08lx (rounded to 0x%08lx)\n",
	       -reserve, __FIXADDR_TOP + PAGE_SIZE);
#endif
}

int fixmaps_set;

void __native_set_fixmap(enum fixed_addresses idx, pte_t pte)
{
	unsigned long address = __fix_to_virt(idx);

#ifdef CONFIG_X86_64
	/*
	 * Ensure that the static initial page tables are covering the
	 * fixmap completely.
	 */
	BUILD_BUG_ON(__end_of_permanent_fixed_addresses >
		     (FIXMAP_PMD_NUM * PTRS_PER_PTE));
#endif

	if (idx >= __end_of_fixed_addresses) {
		BUG();
		return;
	}
	set_pte_vaddr(address, pte);
	fixmaps_set++;
}

void native_set_fixmap(unsigned /* enum fixed_addresses */ idx,
		       phys_addr_t phys, pgprot_t flags)
{
	/* Sanitize 'prot' against any unsupported bits: */
	pgprot_val(flags) &= __default_kernel_pte_mask;

	__native_set_fixmap(idx, pfn_pte(phys >> PAGE_SHIFT, flags));
}

#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
#ifdef CONFIG_X86_5LEVEL
/**
 * p4d_set_huge - setup kernel P4D mapping
 *
 * No 512GB pages yet -- always return 0
 */
int p4d_set_huge(p4d_t *p4d, phys_addr_t addr, pgprot_t prot)
{
	return 0;
}

/**
 * p4d_clear_huge - clear kernel P4D mapping when it is set
 *
 * No 512GB pages yet -- nothing to do
 */
void p4d_clear_huge(p4d_t *p4d)
{
}
#endif
/**
 * pud_set_huge - setup kernel PUD mapping
 *
 * MTRRs can override PAT memory types with 4KiB granularity. Therefore, this
 * function sets up a huge page only if the complete range has the same MTRR
 * caching mode.
 *
 * Callers should try to decrease page size (1GB -> 2MB -> 4K) if the bigger
 * page mapping attempt fails.
 *
 * Returns 1 on success and 0 on failure.
 */
int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot)
{
	u8 uniform;

	mtrr_type_lookup(addr, addr + PUD_SIZE, &uniform);
	if (!uniform)
		return 0;

	/* Bail out if we are on a populated non-leaf entry: */
	if (pud_present(*pud) && !pud_leaf(*pud))
		return 0;

	set_pte((pte_t *)pud, pfn_pte(
		(u64)addr >> PAGE_SHIFT,
		__pgprot(protval_4k_2_large(pgprot_val(prot)) | _PAGE_PSE)));

	return 1;
}

/**
 * pmd_set_huge - setup kernel PMD mapping
 *
 * See text over pud_set_huge() above.
 *
 * Returns 1 on success and 0 on failure.
 */
int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot)
{
	u8 uniform;

	mtrr_type_lookup(addr, addr + PMD_SIZE, &uniform);
	if (!uniform) {
		pr_warn_once("%s: Cannot satisfy [mem %#010llx-%#010llx] with a huge-page mapping due to MTRR override.\n",
			     __func__, addr, addr + PMD_SIZE);
		return 0;
	}

	/* Bail out if we are on a populated non-leaf entry: */
	if (pmd_present(*pmd) && !pmd_leaf(*pmd))
		return 0;

	set_pte((pte_t *)pmd, pfn_pte(
		(u64)addr >> PAGE_SHIFT,
		__pgprot(protval_4k_2_large(pgprot_val(prot)) | _PAGE_PSE)));

	return 1;
}

/**
 * pud_clear_huge - clear kernel PUD mapping when it is set
 *
 * Returns 1 on success and 0 on failure (no PUD map is found).
 */
int pud_clear_huge(pud_t *pud)
{
	if (pud_leaf(*pud)) {
		pud_clear(pud);
		return 1;
	}

	return 0;
}

/**
 * pmd_clear_huge - clear kernel PMD mapping when it is set
 *
 * Returns 1 on success and 0 on failure (no PMD map is found).
 */
int pmd_clear_huge(pmd_t *pmd)
{
	if (pmd_leaf(*pmd)) {
		pmd_clear(pmd);
		return 1;
	}

	return 0;
}

#ifdef CONFIG_X86_64
/**
 * pud_free_pmd_page - Clear pud entry and free pmd page.
 * @pud: Pointer to a PUD.
 * @addr: Virtual address associated with pud.
 *
 * Context: The pud range has been unmapped and TLB purged.
 * Return: 1 if clearing the entry succeeded. 0 otherwise.
 *
 * NOTE: Callers must allow a single page allocation.
 */
int pud_free_pmd_page(pud_t *pud, unsigned long addr)
{
	pmd_t *pmd, *pmd_sv;
	pte_t *pte;
	int i;

	pmd = pud_pgtable(*pud);
	pmd_sv = (pmd_t *)__get_free_page(GFP_KERNEL);
	if (!pmd_sv)
		return 0;

	for (i = 0; i < PTRS_PER_PMD; i++) {
		pmd_sv[i] = pmd[i];
		if (!pmd_none(pmd[i]))
			pmd_clear(&pmd[i]);
	}

	pud_clear(pud);

	/* INVLPG to clear all paging-structure caches */
	flush_tlb_kernel_range(addr, addr + PAGE_SIZE-1);

	for (i = 0; i < PTRS_PER_PMD; i++) {
		if (!pmd_none(pmd_sv[i])) {
			pte = (pte_t *)pmd_page_vaddr(pmd_sv[i]);
			free_page((unsigned long)pte);
		}
	}

	free_page((unsigned long)pmd_sv);

	pagetable_dtor(virt_to_ptdesc(pmd));
	free_page((unsigned long)pmd);

	return 1;
}

/**
 * pmd_free_pte_page - Clear pmd entry and free pte page.
 * @pmd: Pointer to a PMD.
 * @addr: Virtual address associated with pmd.
 *
 * Context: The pmd range has been unmapped and TLB purged.
 * Return: 1 if clearing the entry succeeded. 0 otherwise.
 */
int pmd_free_pte_page(pmd_t *pmd, unsigned long addr)
{
	pte_t *pte;

	pte = (pte_t *)pmd_page_vaddr(*pmd);
	pmd_clear(pmd);

	/* INVLPG to clear all paging-structure caches */
	flush_tlb_kernel_range(addr, addr + PAGE_SIZE-1);

	free_page((unsigned long)pte);

	return 1;
}

#else /* !CONFIG_X86_64 */

/*
 * Disable free page handling on x86-PAE. This assures that ioremap()
 * does not update sync'd pmd entries. See vmalloc_sync_one().
 */
int pmd_free_pte_page(pmd_t *pmd, unsigned long addr)
{
	return pmd_none(*pmd);
}

#endif /* CONFIG_X86_64 */
#endif /* CONFIG_HAVE_ARCH_HUGE_VMAP */

pte_t pte_mkwrite(pte_t pte, struct vm_area_struct *vma)
{
	if (vma->vm_flags & VM_SHADOW_STACK)
		return pte_mkwrite_shstk(pte);

	pte = pte_mkwrite_novma(pte);

	return pte_clear_saveddirty(pte);
}

pmd_t pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
{
	if (vma->vm_flags & VM_SHADOW_STACK)
		return pmd_mkwrite_shstk(pmd);

	pmd = pmd_mkwrite_novma(pmd);

	return pmd_clear_saveddirty(pmd);
}

void arch_check_zapped_pte(struct vm_area_struct *vma, pte_t pte)
{
	/*
	 * Hardware before shadow stack can (rarely) set Dirty=1
	 * on a Write=0 PTE. So the below condition
	 * only indicates a software bug when shadow stack is
	 * supported by the HW. This checking is covered in
	 * pte_shstk().
	 */
	VM_WARN_ON_ONCE(!(vma->vm_flags & VM_SHADOW_STACK) &&
			pte_shstk(pte));
}

void arch_check_zapped_pmd(struct vm_area_struct *vma, pmd_t pmd)
{
	/* See note in arch_check_zapped_pte() */
	VM_WARN_ON_ONCE(!(vma->vm_flags & VM_SHADOW_STACK) &&
			pmd_shstk(pmd));
}

void arch_check_zapped_pud(struct vm_area_struct *vma, pud_t pud)
{
	/* See note in arch_check_zapped_pte() */
	VM_WARN_ON_ONCE(!(vma->vm_flags & VM_SHADOW_STACK) && pud_shstk(pud));
}