// SPDX-License-Identifier: GPL-2.0
#include <linux/mm.h>
#include <linux/gfp.h>
#include <linux/hugetlb.h>
#include <asm/pgalloc.h>
#include <asm/tlb.h>
#include <asm/fixmap.h>
#include <asm/mtrr.h>

#ifdef CONFIG_DYNAMIC_PHYSICAL_MASK
phys_addr_t physical_mask __ro_after_init = (1ULL << __PHYSICAL_MASK_SHIFT) - 1;
EXPORT_SYMBOL(physical_mask);
SYM_PIC_ALIAS(physical_mask);
#endif

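/*
 * Allocate a PTE page for a user address space. GFP_PGTABLE_USER gives a
 * zeroed, memcg-accounted page, so the new table starts out with all
 * entries cleared.
 */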
pgtable_t pte_alloc_one(struct mm_struct *mm)
{
	return __pte_alloc_one(mm, GFP_PGTABLE_USER);
}

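/*
 * Free a PTE page via the mmu_gather: notify any paravirt backend that the
 * page is no longer a page table, then hand it to the TLB batching code,
 * which frees it once the relevant TLB entries have been flushed.
 */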
void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte)
{
	paravirt_release_pte(page_to_pfn(pte));
	tlb_remove_ptdesc(tlb, page_ptdesc(pte));
}

#if CONFIG_PGTABLE_LEVELS > 2
void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd)
{
	paravirt_release_pmd(__pa(pmd) >> PAGE_SHIFT);
	/*
	 * NOTE! For PAE, any changes to the top page-directory-pointer-table
	 * entries need a full cr3 reload to flush.
	 */
#ifdef CONFIG_X86_PAE
	tlb->need_flush_all = 1;
#endif
	tlb_remove_ptdesc(tlb, virt_to_ptdesc(pmd));
}

#if CONFIG_PGTABLE_LEVELS > 3
void ___pud_free_tlb(struct mmu_gather *tlb, pud_t *pud)
{
	paravirt_release_pud(__pa(pud) >> PAGE_SHIFT);
	tlb_remove_ptdesc(tlb, virt_to_ptdesc(pud));
}

#if CONFIG_PGTABLE_LEVELS > 4
void ___p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d)
{
	paravirt_release_p4d(__pa(p4d) >> PAGE_SHIFT);
	tlb_remove_ptdesc(tlb, virt_to_ptdesc(p4d));
}
#endif	/* CONFIG_PGTABLE_LEVELS > 4 */
#endif	/* CONFIG_PGTABLE_LEVELS > 3 */
#endif	/* CONFIG_PGTABLE_LEVELS > 2 */

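/*
 * Per-pgd bookkeeping helpers: every pgd is kept on pgd_list (manipulated
 * under pgd_lock) so that kernel mapping changes can be propagated to all
 * of them; see the larger comment further down for the rationale.
 */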
static inline void pgd_list_add(pgd_t *pgd)
{
	struct ptdesc *ptdesc = virt_to_ptdesc(pgd);

	list_add(&ptdesc->pt_list, &pgd_list);
}

static inline void pgd_list_del(pgd_t *pgd)
{
	struct ptdesc *ptdesc = virt_to_ptdesc(pgd);

	list_del(&ptdesc->pt_list);
}

static void pgd_set_mm(pgd_t *pgd, struct mm_struct *mm)
{
	virt_to_ptdesc(pgd)->pt_mm = mm;
}

struct mm_struct *pgd_page_get_mm(struct page *page)
{
	return page_ptdesc(page)->pt_mm;
}

static void pgd_ctor(struct mm_struct *mm, pgd_t *pgd)
{
	/* PAE preallocates all its PMDs. No cloning needed. */
	if (!IS_ENABLED(CONFIG_X86_PAE))
		clone_pgd_range(pgd + KERNEL_PGD_BOUNDARY,
				swapper_pg_dir + KERNEL_PGD_BOUNDARY,
				KERNEL_PGD_PTRS);

	/* List used to sync kernel mapping updates */
	pgd_set_mm(pgd, mm);
	pgd_list_add(pgd);
}

static void pgd_dtor(pgd_t *pgd)
{
	spin_lock(&pgd_lock);
	pgd_list_del(pgd);
	spin_unlock(&pgd_lock);
}

/*
 * A list of all pgds is needed on non-PAE so that pageattr.c can
 * invalidate entries in both cached and uncached pgds. It is not needed
 * for PAE since the kernel pmd is shared; if PAE were not to share the
 * pmd, a similar tactic would be needed. This is essentially
 * codepath-based locking against pageattr.c; it is the unique case in
 * which a valid change of kernel pagetables can't be lazily synchronized
 * by vmalloc faults. vmalloc faults work because attached pagetables are
 * never freed.
 * -- nyc
 */

#ifdef CONFIG_X86_PAE
/*
 * In PAE mode, we need to do a cr3 reload (=tlb flush) when
 * updating the top-level pagetable entries to guarantee the
 * processor notices the update. Since this is expensive, and
 * all 4 top-level entries are used almost immediately in a
 * new process's life, we just pre-populate them here.
 */
#define PREALLOCATED_PMDS	PTRS_PER_PGD

/*
 * "USER_PMDS" are the PMDs for the user copy of the page tables when
 * PTI is enabled. They do not exist when PTI is disabled. Note that
 * this is distinct from the user _portion_ of the kernel page tables
 * which always exists.
 *
 * We allocate separate PMDs for the kernel part of the user page-table
 * when PTI is enabled. We need them to map the per-process LDT into the
 * user-space page-table.
 */
#define PREALLOCATED_USER_PMDS	(boot_cpu_has(X86_FEATURE_PTI) ? \
					KERNEL_PGD_PTRS : 0)
#define MAX_PREALLOCATED_USER_PMDS KERNEL_PGD_PTRS

void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd)
{
	paravirt_alloc_pmd(mm, __pa(pmd) >> PAGE_SHIFT);

	/*
	 * Note: almost everything apart from _PAGE_PRESENT is
	 * reserved at the pmd (PDPT) level.
	 */
	set_pud(pudp, __pud(__pa(pmd) | _PAGE_PRESENT));

	/*
	 * According to Intel App note "TLBs, Paging-Structure Caches,
	 * and Their Invalidation", April 2007, document 317080-001,
	 * section 8.1: in PAE mode we explicitly have to flush the
	 * TLB via cr3 if the top-level pgd is changed...
	 */
	flush_tlb_mm(mm);
}
#else  /* !CONFIG_X86_PAE */

/* No need to prepopulate any pagetable entries in non-PAE modes. */
#define PREALLOCATED_PMDS	0
#define PREALLOCATED_USER_PMDS	0
#define MAX_PREALLOCATED_USER_PMDS 0
#endif	/* CONFIG_X86_PAE */

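/*
 * The helpers below manage the PMD pages that are preallocated for a new
 * pgd: PREALLOCATED_PMDS on PAE, plus PREALLOCATED_USER_PMDS when PTI is
 * enabled. free_pmds() also undoes a partially successful
 * preallocate_pmds().
 */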
static void free_pmds(struct mm_struct *mm, pmd_t *pmds[], int count)
{
	int i;
	struct ptdesc *ptdesc;

	for (i = 0; i < count; i++)
		if (pmds[i]) {
			ptdesc = virt_to_ptdesc(pmds[i]);

			pagetable_dtor(ptdesc);
			pagetable_free(ptdesc);
			mm_dec_nr_pmds(mm);
		}
}

static int preallocate_pmds(struct mm_struct *mm, pmd_t *pmds[], int count)
{
	int i;
	bool failed = false;
	gfp_t gfp = GFP_PGTABLE_USER;

	if (mm == &init_mm)
		gfp &= ~__GFP_ACCOUNT;
	gfp &= ~__GFP_HIGHMEM;

	for (i = 0; i < count; i++) {
		pmd_t *pmd = NULL;
		struct ptdesc *ptdesc = pagetable_alloc(gfp, 0);

		if (!ptdesc)
			failed = true;
		if (ptdesc && !pagetable_pmd_ctor(mm, ptdesc)) {
			pagetable_free(ptdesc);
			ptdesc = NULL;
			failed = true;
		}
		if (ptdesc) {
			mm_inc_nr_pmds(mm);
			pmd = ptdesc_address(ptdesc);
		}

		pmds[i] = pmd;
	}

	if (failed) {
		free_pmds(mm, pmds, count);
		return -ENOMEM;
	}

	return 0;
}

/*
 * Mop up any pmd pages which may still be attached to the pgd.
 * Normally they will be freed by munmap/exit_mmap, but any pmd we
 * preallocate which never got a corresponding vma will need to be
 * freed manually.
 */
static void mop_up_one_pmd(struct mm_struct *mm, pgd_t *pgdp)
{
	pgd_t pgd = *pgdp;

	if (pgd_val(pgd) != 0) {
		pmd_t *pmd = (pmd_t *)pgd_page_vaddr(pgd);

		pgd_clear(pgdp);

		paravirt_release_pmd(pgd_val(pgd) >> PAGE_SHIFT);
		pmd_free(mm, pmd);
		mm_dec_nr_pmds(mm);
	}
}

static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp)
{
	int i;

	for (i = 0; i < PREALLOCATED_PMDS; i++)
		mop_up_one_pmd(mm, &pgdp[i]);

#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION

	if (!boot_cpu_has(X86_FEATURE_PTI))
		return;

	pgdp = kernel_to_user_pgdp(pgdp);

	for (i = 0; i < PREALLOCATED_USER_PMDS; i++)
		mop_up_one_pmd(mm, &pgdp[i + KERNEL_PGD_BOUNDARY]);
#endif
}

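/*
 * Hook the preallocated PMDs into the new pgd. Entries at or above
 * KERNEL_PGD_BOUNDARY are first filled with a copy of the corresponding
 * kernel PMD from swapper_pg_dir so that the kernel mappings are shared.
 */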
static void pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmds[])
{
	p4d_t *p4d;
	pud_t *pud;
	int i;

	p4d = p4d_offset(pgd, 0);
	pud = pud_offset(p4d, 0);

	for (i = 0; i < PREALLOCATED_PMDS; i++, pud++) {
		pmd_t *pmd = pmds[i];

		if (i >= KERNEL_PGD_BOUNDARY)
			memcpy(pmd, (pmd_t *)pgd_page_vaddr(swapper_pg_dir[i]),
			       sizeof(pmd_t) * PTRS_PER_PMD);

		pud_populate(mm, pud, pmd);
	}
}

#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION
static void pgd_prepopulate_user_pmd(struct mm_struct *mm,
				     pgd_t *k_pgd, pmd_t *pmds[])
{
	pgd_t *s_pgd = kernel_to_user_pgdp(swapper_pg_dir);
	pgd_t *u_pgd = kernel_to_user_pgdp(k_pgd);
	p4d_t *u_p4d;
	pud_t *u_pud;
	int i;

	u_p4d = p4d_offset(u_pgd, 0);
	u_pud = pud_offset(u_p4d, 0);

	s_pgd += KERNEL_PGD_BOUNDARY;
	u_pud += KERNEL_PGD_BOUNDARY;

	for (i = 0; i < PREALLOCATED_USER_PMDS; i++, u_pud++, s_pgd++) {
		pmd_t *pmd = pmds[i];

		memcpy(pmd, (pmd_t *)pgd_page_vaddr(*s_pgd),
		       sizeof(pmd_t) * PTRS_PER_PMD);

		pud_populate(mm, u_pud, pmd);
	}
}
#else
static void pgd_prepopulate_user_pmd(struct mm_struct *mm,
				     pgd_t *k_pgd, pmd_t *pmds[])
{
}
#endif

static inline pgd_t *_pgd_alloc(struct mm_struct *mm)
{
	/*
	 * PTI and Xen need a whole page for the PAE PGD
	 * even though the hardware only needs 32 bytes.
	 *
	 * For simplicity, allocate a page for all users.
	 */
	return __pgd_alloc(mm, pgd_allocation_order());
}

static inline void _pgd_free(struct mm_struct *mm, pgd_t *pgd)
{
	__pgd_free(mm, pgd);
}

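/*
 * pgd_alloc()/pgd_free() are the arch hooks used by the core mm when an
 * address space is created and torn down (e.g. from mm_init() and
 * __mmdrop() in kernel/fork.c).
 */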
pgd_t *pgd_alloc(struct mm_struct *mm)
{
	pgd_t *pgd;
	pmd_t *u_pmds[MAX_PREALLOCATED_USER_PMDS];
	pmd_t *pmds[PREALLOCATED_PMDS];

	pgd = _pgd_alloc(mm);

	if (pgd == NULL)
		goto out;

	mm->pgd = pgd;

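	/*
	 * The sizeof() checks below are compile-time guards: on non-PAE
	 * builds the pmds[]/u_pmds[] arrays are zero-sized, so the
	 * preallocation and free paths are optimized away entirely.
	 */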
	if (sizeof(pmds) != 0 &&
	    preallocate_pmds(mm, pmds, PREALLOCATED_PMDS) != 0)
		goto out_free_pgd;

	if (sizeof(u_pmds) != 0 &&
	    preallocate_pmds(mm, u_pmds, PREALLOCATED_USER_PMDS) != 0)
		goto out_free_pmds;

	if (paravirt_pgd_alloc(mm) != 0)
		goto out_free_user_pmds;

	/*
	 * Make sure that pre-populating the pmds is atomic with
	 * respect to anything walking the pgd_list, so that they
	 * never see a partially populated pgd.
	 */
	spin_lock(&pgd_lock);

	pgd_ctor(mm, pgd);
	if (sizeof(pmds) != 0)
		pgd_prepopulate_pmd(mm, pgd, pmds);

	if (sizeof(u_pmds) != 0)
		pgd_prepopulate_user_pmd(mm, pgd, u_pmds);

	spin_unlock(&pgd_lock);

	return pgd;

out_free_user_pmds:
	if (sizeof(u_pmds) != 0)
		free_pmds(mm, u_pmds, PREALLOCATED_USER_PMDS);
out_free_pmds:
	if (sizeof(pmds) != 0)
		free_pmds(mm, pmds, PREALLOCATED_PMDS);
out_free_pgd:
	_pgd_free(mm, pgd);
out:
	return NULL;
}

void pgd_free(struct mm_struct *mm, pgd_t *pgd)
{
	pgd_mop_up_pmds(mm, pgd);
	pgd_dtor(pgd);
	paravirt_pgd_free(mm, pgd);
	_pgd_free(mm, pgd);
}

/*
 * Used to set accessed or dirty bits in the page table entries
 * on other architectures. On x86, the accessed and dirty bits
 * are tracked by hardware. However, do_wp_page calls this function
 * to also make the pte writable at the same time the dirty bit is
 * set. In that case we do actually need to write the PTE.
 */
int ptep_set_access_flags(struct vm_area_struct *vma,
			  unsigned long address, pte_t *ptep,
			  pte_t entry, int dirty)
{
	int changed = !pte_same(*ptep, entry);

	if (changed && dirty)
		set_pte(ptep, entry);

	return changed;
}

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
int pmdp_set_access_flags(struct vm_area_struct *vma,
			  unsigned long address, pmd_t *pmdp,
			  pmd_t entry, int dirty)
{
	int changed = !pmd_same(*pmdp, entry);

	VM_BUG_ON(address & ~HPAGE_PMD_MASK);

	if (changed && dirty) {
		set_pmd(pmdp, entry);
		/*
		 * We had a write-protection fault here and changed the pmd
		 * to be more permissive. No need to flush the TLB for that,
		 * #PF is architecturally guaranteed to do that and in the
		 * worst-case we'll generate a spurious fault.
		 */
	}

	return changed;
}

int pudp_set_access_flags(struct vm_area_struct *vma, unsigned long address,
			  pud_t *pudp, pud_t entry, int dirty)
{
	int changed = !pud_same(*pudp, entry);

	VM_BUG_ON(address & ~HPAGE_PUD_MASK);

	if (changed && dirty) {
		set_pud(pudp, entry);
		/*
		 * We had a write-protection fault here and changed the pud
		 * to be more permissive. No need to flush the TLB for that,
		 * #PF is architecturally guaranteed to do that and in the
		 * worst-case we'll generate a spurious fault.
		 */
	}

	return changed;
}
#endif

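/*
 * Clear the accessed bit with an atomic test_and_clear_bit() directly on
 * the PTE word. No TLB flush is done here; see ptep_clear_flush_young()
 * below for why that is safe on x86.
 */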
int ptep_test_and_clear_young(struct vm_area_struct *vma,
			      unsigned long addr, pte_t *ptep)
{
	int ret = 0;

	if (pte_young(*ptep))
		ret = test_and_clear_bit(_PAGE_BIT_ACCESSED,
					 (unsigned long *) &ptep->pte);

	return ret;
}

#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG)
int pmdp_test_and_clear_young(struct vm_area_struct *vma,
			      unsigned long addr, pmd_t *pmdp)
{
	int ret = 0;

	if (pmd_young(*pmdp))
		ret = test_and_clear_bit(_PAGE_BIT_ACCESSED,
					 (unsigned long *)pmdp);

	return ret;
}
#endif

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
int pudp_test_and_clear_young(struct vm_area_struct *vma,
			      unsigned long addr, pud_t *pudp)
{
	int ret = 0;

	if (pud_young(*pudp))
		ret = test_and_clear_bit(_PAGE_BIT_ACCESSED,
					 (unsigned long *)pudp);

	return ret;
}
#endif

int ptep_clear_flush_young(struct vm_area_struct *vma,
			   unsigned long address, pte_t *ptep)
{
	/*
	 * On x86 CPUs, clearing the accessed bit without a TLB flush
	 * doesn't cause data corruption. [ It could cause incorrect
	 * page aging and the (mistaken) reclaim of hot pages, but the
	 * chance of that should be relatively low. ]
	 *
	 * So as a performance optimization don't flush the TLB when
	 * clearing the accessed bit, it will eventually be flushed by
	 * a context switch or a VM operation anyway. [ In the rare
	 * event of it not getting flushed for a long time the delay
	 * shouldn't really matter because there's no real memory
	 * pressure for swapout to react to. ]
	 */
	return ptep_test_and_clear_young(vma, address, ptep);
}

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
int pmdp_clear_flush_young(struct vm_area_struct *vma,
			   unsigned long address, pmd_t *pmdp)
{
	int young;

	VM_BUG_ON(address & ~HPAGE_PMD_MASK);

	young = pmdp_test_and_clear_young(vma, address, pmdp);
	if (young)
		flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);

	return young;
}

pmd_t pmdp_invalidate_ad(struct vm_area_struct *vma, unsigned long address,
			 pmd_t *pmdp)
{
	VM_WARN_ON_ONCE(!pmd_present(*pmdp));

	/*
	 * No flush is necessary. Once an invalid PTE is established, the PTE's
	 * access and dirty bits cannot be updated.
	 */
	return pmdp_establish(vma, address, pmdp, pmd_mkinvalid(*pmdp));
}
#endif

#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && \
	defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
pud_t pudp_invalidate(struct vm_area_struct *vma, unsigned long address,
		      pud_t *pudp)
{
	VM_WARN_ON_ONCE(!pud_present(*pudp));
	pud_t old = pudp_establish(vma, address, pudp, pud_mkinvalid(*pudp));
	flush_pud_tlb_range(vma, address, address + HPAGE_PUD_SIZE);
	return old;
}
#endif

/**
 * reserve_top_address - Reserve a hole in the top of the kernel address space
 * @reserve: Size of hole to reserve
 *
 * Can be used to relocate the fixmap area and poke a hole in the top
 * of the kernel address space to make room for a hypervisor.
 */
void __init reserve_top_address(unsigned long reserve)
{
#ifdef CONFIG_X86_32
	BUG_ON(fixmaps_set > 0);
	__FIXADDR_TOP = round_down(-reserve, 1 << PMD_SHIFT) - PAGE_SIZE;
	printk(KERN_INFO "Reserving virtual address space above 0x%08lx (rounded to 0x%08lx)\n",
	       -reserve, __FIXADDR_TOP + PAGE_SIZE);
#endif
}

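/*
 * Number of fixmap entries installed so far. reserve_top_address() must be
 * called before any fixmap PTE is set, and checks this count.
 */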
int fixmaps_set;

void __native_set_fixmap(enum fixed_addresses idx, pte_t pte)
{
	unsigned long address = __fix_to_virt(idx);

#ifdef CONFIG_X86_64
	/*
	 * Ensure that the static initial page tables are covering the
	 * fixmap completely.
	 */
	BUILD_BUG_ON(__end_of_permanent_fixed_addresses >
		     (FIXMAP_PMD_NUM * PTRS_PER_PTE));
#endif

	if (idx >= __end_of_fixed_addresses) {
		BUG();
		return;
	}
	set_pte_vaddr(address, pte);
	fixmaps_set++;
}

void native_set_fixmap(unsigned /* enum fixed_addresses */ idx,
		       phys_addr_t phys, pgprot_t flags)
{
	/* Sanitize 'flags' against any unsupported bits: */
	pgprot_val(flags) &= __default_kernel_pte_mask;

	__native_set_fixmap(idx, pfn_pte(phys >> PAGE_SHIFT, flags));
}

#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
#if CONFIG_PGTABLE_LEVELS > 4
/**
 * p4d_set_huge - Set up kernel P4D mapping
 * @p4d: Pointer to the P4D entry
 * @addr: Virtual address associated with the P4D entry
 * @prot: Protection bits to use
 *
 * No 512GB pages yet -- always return 0
 */
int p4d_set_huge(p4d_t *p4d, phys_addr_t addr, pgprot_t prot)
{
	return 0;
}

/**
 * p4d_clear_huge - Clear kernel P4D mapping when it is set
 * @p4d: Pointer to the P4D entry to clear
 *
 * No 512GB pages yet -- do nothing
 */
void p4d_clear_huge(p4d_t *p4d)
{
}
#endif

/**
 * pud_set_huge - Set up kernel PUD mapping
 * @pud: Pointer to the PUD entry
 * @addr: Virtual address associated with the PUD entry
 * @prot: Protection bits to use
 *
 * MTRRs can override PAT memory types with 4KiB granularity. Therefore, this
 * function sets up a huge page only if the complete range has the same MTRR
 * caching mode.
 *
 * Callers should try to decrease page size (1GB -> 2MB -> 4K) if the bigger
 * page mapping attempt fails.
 *
 * Returns 1 on success and 0 on failure.
 */
int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot)
{
	u8 uniform;

	mtrr_type_lookup(addr, addr + PUD_SIZE, &uniform);
	if (!uniform)
		return 0;

	/* Bail out if we are on a populated non-leaf entry: */
	if (pud_present(*pud) && !pud_leaf(*pud))
		return 0;

	set_pte((pte_t *)pud, pfn_pte(
		(u64)addr >> PAGE_SHIFT,
		__pgprot(protval_4k_2_large(pgprot_val(prot)) | _PAGE_PSE)));

	return 1;
}

/**
 * pmd_set_huge - Set up kernel PMD mapping
 * @pmd: Pointer to the PMD entry
 * @addr: Virtual address associated with the PMD entry
 * @prot: Protection bits to use
 *
 * See the comment above pud_set_huge().
 *
 * Returns 1 on success and 0 on failure.
 */
int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot)
{
	u8 uniform;

	mtrr_type_lookup(addr, addr + PMD_SIZE, &uniform);
	if (!uniform) {
		pr_warn_once("%s: Cannot satisfy [mem %#010llx-%#010llx] with a huge-page mapping due to MTRR override.\n",
			     __func__, addr, addr + PMD_SIZE);
		return 0;
	}

	/* Bail out if we are on a populated non-leaf entry: */
	if (pmd_present(*pmd) && !pmd_leaf(*pmd))
		return 0;

	set_pte((pte_t *)pmd, pfn_pte(
		(u64)addr >> PAGE_SHIFT,
		__pgprot(protval_4k_2_large(pgprot_val(prot)) | _PAGE_PSE)));

	return 1;
}

/**
 * pud_clear_huge - Clear kernel PUD mapping when it is set
 * @pud: Pointer to the PUD entry to clear.
 *
 * Returns 1 on success and 0 on failure (no PUD map is found).
 */
int pud_clear_huge(pud_t *pud)
{
	if (pud_leaf(*pud)) {
		pud_clear(pud);
		return 1;
	}

	return 0;
}

/**
 * pmd_clear_huge - Clear kernel PMD mapping when it is set
 * @pmd: Pointer to the PMD entry to clear.
 *
 * Returns 1 on success and 0 on failure (no PMD map is found).
 */
int pmd_clear_huge(pmd_t *pmd)
{
	if (pmd_leaf(*pmd)) {
		pmd_clear(pmd);
		return 1;
	}

	return 0;
}

#ifdef CONFIG_X86_64
/**
 * pud_free_pmd_page - Clear PUD entry and free PMD page
 * @pud: Pointer to a PUD
 * @addr: Virtual address associated with PUD
 *
 * Context: The PUD range has been unmapped and TLB purged.
 * Return: 1 if clearing the entry succeeded. 0 otherwise.
 *
 * NOTE: Callers must allow a single page allocation.
 */
int pud_free_pmd_page(pud_t *pud, unsigned long addr)
{
	pmd_t *pmd, *pmd_sv;
	pte_t *pte;
	int i;

	pmd = pud_pgtable(*pud);
	pmd_sv = (pmd_t *)__get_free_page(GFP_KERNEL);
	if (!pmd_sv)
		return 0;

	for (i = 0; i < PTRS_PER_PMD; i++) {
		pmd_sv[i] = pmd[i];
		if (!pmd_none(pmd[i]))
			pmd_clear(&pmd[i]);
	}

	pud_clear(pud);

	/* INVLPG to clear all paging-structure caches */
	flush_tlb_kernel_range(addr, addr + PAGE_SIZE-1);

	for (i = 0; i < PTRS_PER_PMD; i++) {
		if (!pmd_none(pmd_sv[i])) {
			pte = (pte_t *)pmd_page_vaddr(pmd_sv[i]);
			pte_free_kernel(&init_mm, pte);
		}
	}

	free_page((unsigned long)pmd_sv);

	pmd_free(&init_mm, pmd);

	return 1;
}

/**
 * pmd_free_pte_page - Clear PMD entry and free PTE page.
 * @pmd: Pointer to the PMD
 * @addr: Virtual address associated with PMD
 *
 * Context: The PMD range has been unmapped and TLB purged.
 * Return: 1 if clearing the entry succeeded. 0 otherwise.
 */
int pmd_free_pte_page(pmd_t *pmd, unsigned long addr)
{
	pte_t *pte;

	pte = (pte_t *)pmd_page_vaddr(*pmd);
	pmd_clear(pmd);

	/* INVLPG to clear all paging-structure caches */
	flush_tlb_kernel_range(addr, addr + PAGE_SIZE-1);

	pte_free_kernel(&init_mm, pte);

	return 1;
}

#else /* !CONFIG_X86_64 */

/*
 * Disable free page handling on x86-PAE. This assures that ioremap()
 * does not update sync'd PMD entries. See vmalloc_sync_one().
 */
int pmd_free_pte_page(pmd_t *pmd, unsigned long addr)
{
	return pmd_none(*pmd);
}

#endif /* CONFIG_X86_64 */
#endif /* CONFIG_HAVE_ARCH_HUGE_VMAP */

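/*
 * Shadow-stack aware mkwrite helpers: shadow-stack VMAs get the special
 * shadow-stack PTE/PMD encoding, everything else gets a normally writable
 * entry.
 */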
pte_t pte_mkwrite(pte_t pte, struct vm_area_struct *vma)
{
	if (vma->vm_flags & VM_SHADOW_STACK)
		return pte_mkwrite_shstk(pte);

	pte = pte_mkwrite_novma(pte);

	return pte_clear_saveddirty(pte);
}

pmd_t pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
{
	if (vma->vm_flags & VM_SHADOW_STACK)
		return pmd_mkwrite_shstk(pmd);

	pmd = pmd_mkwrite_novma(pmd);

	return pmd_clear_saveddirty(pmd);
}

void arch_check_zapped_pte(struct vm_area_struct *vma, pte_t pte)
{
	/*
	 * Hardware before shadow stack can (rarely) set Dirty=1 on a
	 * Write=0 PTE. So the below condition only indicates a software
	 * bug when shadow stack is supported by the HW. This checking is
	 * covered in pte_shstk().
	 */
	VM_WARN_ON_ONCE(!(vma->vm_flags & VM_SHADOW_STACK) &&
			pte_shstk(pte));
}

void arch_check_zapped_pmd(struct vm_area_struct *vma, pmd_t pmd)
{
	/* See note in arch_check_zapped_pte() */
	VM_WARN_ON_ONCE(!(vma->vm_flags & VM_SHADOW_STACK) &&
			pmd_shstk(pmd));
}

void arch_check_zapped_pud(struct vm_area_struct *vma, pud_t pud)
{
	/* See note in arch_check_zapped_pte() */
	VM_WARN_ON_ONCE(!(vma->vm_flags & VM_SHADOW_STACK) && pud_shstk(pud));
}