xref: /linux/arch/x86/mm/pgtable.c (revision 4b99990cdf9560e8a071640baf19f312e6ae02f4)
1 // SPDX-License-Identifier: GPL-2.0
2 #include <linux/mm.h>
3 #include <linux/gfp.h>
4 #include <linux/hugetlb.h>
5 #include <asm/pgalloc.h>
6 #include <asm/tlb.h>
7 #include <asm/fixmap.h>
8 #include <asm/mtrr.h>
9 
#ifdef CONFIG_DYNAMIC_PHYSICAL_MASK
/*
 * Mask of physical address bits, initialised so that the low
 * __PHYSICAL_MASK_SHIFT bits are set.  Only built when
 * CONFIG_DYNAMIC_PHYSICAL_MASK is enabled, and placed in the
 * __ro_after_init section.
 */
phys_addr_t physical_mask __ro_after_init = (1ULL << __PHYSICAL_MASK_SHIFT) - 1;
EXPORT_SYMBOL(physical_mask);
SYM_PIC_ALIAS(physical_mask);
#endif
15 
16 pgtable_t pte_alloc_one(struct mm_struct *mm)
17 {
18 	return __pte_alloc_one(mm, GFP_PGTABLE_USER);
19 }
20 
/*
 * Release a PTE page: tell the paravirt layer the page frame is going
 * away, then hand its ptdesc to the mmu_gather @tlb for removal.
 */
void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte)
{
	struct ptdesc *ptdesc = page_ptdesc(pte);

	paravirt_release_pte(page_to_pfn(pte));
	tlb_remove_ptdesc(tlb, ptdesc);
}
26 
27 #if CONFIG_PGTABLE_LEVELS > 2
28 void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd)
29 {
30 	paravirt_release_pmd(__pa(pmd) >> PAGE_SHIFT);
31 	/*
32 	 * NOTE! For PAE, any changes to the top page-directory-pointer-table
33 	 * entries need a full cr3 reload to flush.
34 	 */
35 #ifdef CONFIG_X86_PAE
36 	tlb->need_flush_all = 1;
37 #endif
38 	tlb_remove_ptdesc(tlb, virt_to_ptdesc(pmd));
39 }
40 
41 #if CONFIG_PGTABLE_LEVELS > 3
42 void ___pud_free_tlb(struct mmu_gather *tlb, pud_t *pud)
43 {
44 	paravirt_release_pud(__pa(pud) >> PAGE_SHIFT);
45 	tlb_remove_ptdesc(tlb, virt_to_ptdesc(pud));
46 }
47 
48 #if CONFIG_PGTABLE_LEVELS > 4
49 void ___p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d)
50 {
51 	paravirt_release_p4d(__pa(p4d) >> PAGE_SHIFT);
52 	tlb_remove_ptdesc(tlb, virt_to_ptdesc(p4d));
53 }
54 #endif	/* CONFIG_PGTABLE_LEVELS > 4 */
55 #endif	/* CONFIG_PGTABLE_LEVELS > 3 */
56 #endif	/* CONFIG_PGTABLE_LEVELS > 2 */
57 
58 static inline void pgd_list_add(pgd_t *pgd)
59 {
60 	struct ptdesc *ptdesc = virt_to_ptdesc(pgd);
61 
62 	list_add(&ptdesc->pt_list, &pgd_list);
63 }
64 
65 static inline void pgd_list_del(pgd_t *pgd)
66 {
67 	struct ptdesc *ptdesc = virt_to_ptdesc(pgd);
68 
69 	list_del(&ptdesc->pt_list);
70 }
71 
72 static void pgd_set_mm(pgd_t *pgd, struct mm_struct *mm)
73 {
74 	virt_to_ptdesc(pgd)->pt_mm = mm;
75 }
76 
77 struct mm_struct *pgd_page_get_mm(struct page *page)
78 {
79 	return page_ptdesc(page)->pt_mm;
80 }
81 
82 static void pgd_ctor(struct mm_struct *mm, pgd_t *pgd)
83 {
84 	/* PAE preallocates all its PMDs.  No cloning needed. */
85 	if (!IS_ENABLED(CONFIG_X86_PAE))
86 		clone_pgd_range(pgd + KERNEL_PGD_BOUNDARY,
87 				swapper_pg_dir + KERNEL_PGD_BOUNDARY,
88 				KERNEL_PGD_PTRS);
89 
90 	/* List used to sync kernel mapping updates */
91 	pgd_set_mm(pgd, mm);
92 	pgd_list_add(pgd);
93 }
94 
95 static void pgd_dtor(pgd_t *pgd)
96 {
97 	spin_lock(&pgd_lock);
98 	pgd_list_del(pgd);
99 	spin_unlock(&pgd_lock);
100 }
101 
102 #ifdef CONFIG_X86_PAE
/*
 * In PAE mode, we need to do a cr3 reload (=tlb flush) when
 * updating the top-level pagetable entries to guarantee the
 * processor notices the update.  Since this is expensive, and
 * all 4 top-level entries are used almost immediately in a
 * new process's life, we just pre-populate them here.
 *
 * PREALLOCATED_PMDS is therefore one PMD per top-level entry.
 */
#define PREALLOCATED_PMDS	PTRS_PER_PGD

/*
 * "USER_PMDS" are the PMDs for the user copy of the page tables when
 * PTI is enabled. They do not exist when PTI is disabled.  Note that
 * this is distinct from the user _portion_ of the kernel page tables
 * which always exists.
 *
 * We allocate separate PMDs for the kernel part of the user page-table
 * when PTI is enabled. We need them to map the per-process LDT into the
 * user-space page-table.
 *
 * PREALLOCATED_USER_PMDS is a run-time expression (it tests the PTI CPU
 * feature), so MAX_PREALLOCATED_USER_PMDS provides the compile-time
 * upper bound that pgd_alloc() uses to size its on-stack array.
 */
#define PREALLOCATED_USER_PMDS	 (boot_cpu_has(X86_FEATURE_PTI) ? \
					KERNEL_PGD_PTRS : 0)
#define MAX_PREALLOCATED_USER_PMDS KERNEL_PGD_PTRS
125 
126 void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd)
127 {
128 	paravirt_alloc_pmd(mm, __pa(pmd) >> PAGE_SHIFT);
129 
130 	/* Note: almost everything apart from _PAGE_PRESENT is
131 	   reserved at the pmd (PDPT) level. */
132 	set_pud(pudp, __pud(__pa(pmd) | _PAGE_PRESENT));
133 
134 	/*
135 	 * According to Intel App note "TLBs, Paging-Structure Caches,
136 	 * and Their Invalidation", April 2007, document 317080-001,
137 	 * section 8.1: in PAE mode we explicitly have to flush the
138 	 * TLB via cr3 if the top-level pgd is changed...
139 	 */
140 	flush_tlb_mm(mm);
141 }
142 #else  /* !CONFIG_X86_PAE */
143 
/*
 * No need to prepopulate any pagetable entries in non-PAE modes.
 * All three counts are zero, which makes the arrays in pgd_alloc()
 * zero-sized and lets its sizeof() checks skip the preallocation calls.
 */
#define PREALLOCATED_PMDS	0
#define PREALLOCATED_USER_PMDS	 0
#define MAX_PREALLOCATED_USER_PMDS 0
148 #endif	/* CONFIG_X86_PAE */
149 
150 static void free_pmds(struct mm_struct *mm, pmd_t *pmds[], int count)
151 {
152 	int i;
153 	struct ptdesc *ptdesc;
154 
155 	for (i = 0; i < count; i++)
156 		if (pmds[i]) {
157 			ptdesc = virt_to_ptdesc(pmds[i]);
158 
159 			pagetable_dtor(ptdesc);
160 			pagetable_free(ptdesc);
161 			mm_dec_nr_pmds(mm);
162 		}
163 }
164 
165 static int preallocate_pmds(struct mm_struct *mm, pmd_t *pmds[], int count)
166 {
167 	int i;
168 	bool failed = false;
169 	gfp_t gfp = GFP_PGTABLE_USER;
170 
171 	if (mm == &init_mm)
172 		gfp &= ~__GFP_ACCOUNT;
173 	gfp &= ~__GFP_HIGHMEM;
174 
175 	for (i = 0; i < count; i++) {
176 		pmd_t *pmd = NULL;
177 		struct ptdesc *ptdesc = pagetable_alloc(gfp, 0);
178 
179 		if (!ptdesc)
180 			failed = true;
181 		if (ptdesc && !pagetable_pmd_ctor(mm, ptdesc)) {
182 			pagetable_free(ptdesc);
183 			ptdesc = NULL;
184 			failed = true;
185 		}
186 		if (ptdesc) {
187 			mm_inc_nr_pmds(mm);
188 			pmd = ptdesc_address(ptdesc);
189 		}
190 
191 		pmds[i] = pmd;
192 	}
193 
194 	if (failed) {
195 		free_pmds(mm, pmds, count);
196 		return -ENOMEM;
197 	}
198 
199 	return 0;
200 }
201 
202 /*
203  * Mop up any pmd pages which may still be attached to the pgd.
204  * Normally they will be freed by munmap/exit_mmap, but any pmd we
205  * preallocate which never got a corresponding vma will need to be
206  * freed manually.
207  */
208 static void mop_up_one_pmd(struct mm_struct *mm, pgd_t *pgdp)
209 {
210 	pgd_t pgd = *pgdp;
211 
212 	if (pgd_val(pgd) != 0) {
213 		pmd_t *pmd = (pmd_t *)pgd_page_vaddr(pgd);
214 
215 		pgd_clear(pgdp);
216 
217 		paravirt_release_pmd(pgd_val(pgd) >> PAGE_SHIFT);
218 		pmd_free(mm, pmd);
219 		mm_dec_nr_pmds(mm);
220 	}
221 }
222 
223 static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp)
224 {
225 	int i;
226 
227 	for (i = 0; i < PREALLOCATED_PMDS; i++)
228 		mop_up_one_pmd(mm, &pgdp[i]);
229 
230 #ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION
231 
232 	if (!boot_cpu_has(X86_FEATURE_PTI))
233 		return;
234 
235 	pgdp = kernel_to_user_pgdp(pgdp);
236 
237 	for (i = 0; i < PREALLOCATED_USER_PMDS; i++)
238 		mop_up_one_pmd(mm, &pgdp[i + KERNEL_PGD_BOUNDARY]);
239 #endif
240 }
241 
242 static void pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmds[])
243 {
244 	p4d_t *p4d;
245 	pud_t *pud;
246 	int i;
247 
248 	p4d = p4d_offset(pgd, 0);
249 	pud = pud_offset(p4d, 0);
250 
251 	for (i = 0; i < PREALLOCATED_PMDS; i++, pud++) {
252 		pmd_t *pmd = pmds[i];
253 
254 		if (i >= KERNEL_PGD_BOUNDARY)
255 			memcpy(pmd, (pmd_t *)pgd_page_vaddr(swapper_pg_dir[i]),
256 			       sizeof(pmd_t) * PTRS_PER_PMD);
257 
258 		pud_populate(mm, pud, pmd);
259 	}
260 }
261 
262 #ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION
263 static void pgd_prepopulate_user_pmd(struct mm_struct *mm,
264 				     pgd_t *k_pgd, pmd_t *pmds[])
265 {
266 	pgd_t *s_pgd = kernel_to_user_pgdp(swapper_pg_dir);
267 	pgd_t *u_pgd = kernel_to_user_pgdp(k_pgd);
268 	p4d_t *u_p4d;
269 	pud_t *u_pud;
270 	int i;
271 
272 	u_p4d = p4d_offset(u_pgd, 0);
273 	u_pud = pud_offset(u_p4d, 0);
274 
275 	s_pgd += KERNEL_PGD_BOUNDARY;
276 	u_pud += KERNEL_PGD_BOUNDARY;
277 
278 	for (i = 0; i < PREALLOCATED_USER_PMDS; i++, u_pud++, s_pgd++) {
279 		pmd_t *pmd = pmds[i];
280 
281 		memcpy(pmd, (pmd_t *)pgd_page_vaddr(*s_pgd),
282 		       sizeof(pmd_t) * PTRS_PER_PMD);
283 
284 		pud_populate(mm, u_pud, pmd);
285 	}
286 
287 }
288 #else
289 static void pgd_prepopulate_user_pmd(struct mm_struct *mm,
290 				     pgd_t *k_pgd, pmd_t *pmds[])
291 {
292 }
293 #endif
294 
295 static inline pgd_t *_pgd_alloc(struct mm_struct *mm)
296 {
297 	/*
298 	 * PTI and Xen need a whole page for the PAE PGD
299 	 * even though the hardware only needs 32 bytes.
300 	 *
301 	 * For simplicity, allocate a page for all users.
302 	 */
303 	return __pgd_alloc(mm, pgd_allocation_order());
304 }
305 
306 static inline void _pgd_free(struct mm_struct *mm, pgd_t *pgd)
307 {
308 	__pgd_free(mm, pgd);
309 }
310 
311 pgd_t *pgd_alloc(struct mm_struct *mm)
312 {
313 	pgd_t *pgd;
314 	pmd_t *u_pmds[MAX_PREALLOCATED_USER_PMDS];
315 	pmd_t *pmds[PREALLOCATED_PMDS];
316 
317 	pgd = _pgd_alloc(mm);
318 
319 	if (pgd == NULL)
320 		goto out;
321 
322 	mm->pgd = pgd;
323 
324 	if (sizeof(pmds) != 0 &&
325 			preallocate_pmds(mm, pmds, PREALLOCATED_PMDS) != 0)
326 		goto out_free_pgd;
327 
328 	if (sizeof(u_pmds) != 0 &&
329 			preallocate_pmds(mm, u_pmds, PREALLOCATED_USER_PMDS) != 0)
330 		goto out_free_pmds;
331 
332 	if (paravirt_pgd_alloc(mm) != 0)
333 		goto out_free_user_pmds;
334 
335 	/*
336 	 * Make sure that pre-populating the pmds is atomic with
337 	 * respect to anything walking the pgd_list, so that they
338 	 * never see a partially populated pgd.
339 	 */
340 	spin_lock(&pgd_lock);
341 
342 	pgd_ctor(mm, pgd);
343 	if (sizeof(pmds) != 0)
344 		pgd_prepopulate_pmd(mm, pgd, pmds);
345 
346 	if (sizeof(u_pmds) != 0)
347 		pgd_prepopulate_user_pmd(mm, pgd, u_pmds);
348 
349 	spin_unlock(&pgd_lock);
350 
351 	return pgd;
352 
353 out_free_user_pmds:
354 	if (sizeof(u_pmds) != 0)
355 		free_pmds(mm, u_pmds, PREALLOCATED_USER_PMDS);
356 out_free_pmds:
357 	if (sizeof(pmds) != 0)
358 		free_pmds(mm, pmds, PREALLOCATED_PMDS);
359 out_free_pgd:
360 	_pgd_free(mm, pgd);
361 out:
362 	return NULL;
363 }
364 
365 void pgd_free(struct mm_struct *mm, pgd_t *pgd)
366 {
367 	pgd_mop_up_pmds(mm, pgd);
368 	pgd_dtor(pgd);
369 	paravirt_pgd_free(mm, pgd);
370 	_pgd_free(mm, pgd);
371 }
372 
373 /*
374  * Used to set accessed or dirty bits in the page table entries
375  * on other architectures. On x86, the accessed and dirty bits
376  * are tracked by hardware. However, do_wp_page calls this function
377  * to also make the pte writeable at the same time the dirty bit is
378  * set. In that case we do actually need to write the PTE.
379  */
380 int ptep_set_access_flags(struct vm_area_struct *vma,
381 			  unsigned long address, pte_t *ptep,
382 			  pte_t entry, int dirty)
383 {
384 	int changed = !pte_same(*ptep, entry);
385 
386 	if (changed && dirty)
387 		set_pte(ptep, entry);
388 
389 	return changed;
390 }
391 
392 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
393 int pmdp_set_access_flags(struct vm_area_struct *vma,
394 			  unsigned long address, pmd_t *pmdp,
395 			  pmd_t entry, int dirty)
396 {
397 	int changed = !pmd_same(*pmdp, entry);
398 
399 	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
400 
401 	if (changed && dirty) {
402 		set_pmd(pmdp, entry);
403 		/*
404 		 * We had a write-protection fault here and changed the pmd
405 		 * to to more permissive. No need to flush the TLB for that,
406 		 * #PF is architecturally guaranteed to do that and in the
407 		 * worst-case we'll generate a spurious fault.
408 		 */
409 	}
410 
411 	return changed;
412 }
413 
414 int pudp_set_access_flags(struct vm_area_struct *vma, unsigned long address,
415 			  pud_t *pudp, pud_t entry, int dirty)
416 {
417 	int changed = !pud_same(*pudp, entry);
418 
419 	VM_BUG_ON(address & ~HPAGE_PUD_MASK);
420 
421 	if (changed && dirty) {
422 		set_pud(pudp, entry);
423 		/*
424 		 * We had a write-protection fault here and changed the pud
425 		 * to to more permissive. No need to flush the TLB for that,
426 		 * #PF is architecturally guaranteed to do that and in the
427 		 * worst-case we'll generate a spurious fault.
428 		 */
429 	}
430 
431 	return changed;
432 }
433 #endif
434 
435 bool ptep_test_and_clear_young(struct vm_area_struct *vma,
436 		unsigned long addr, pte_t *ptep)
437 {
438 	bool ret = false;
439 
440 	if (pte_young(*ptep))
441 		ret = test_and_clear_bit(_PAGE_BIT_ACCESSED,
442 					 (unsigned long *) &ptep->pte);
443 
444 	return ret;
445 }
446 
447 #if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG)
448 bool pmdp_test_and_clear_young(struct vm_area_struct *vma,
449 		unsigned long addr, pmd_t *pmdp)
450 {
451 	bool ret = false;
452 
453 	if (pmd_young(*pmdp))
454 		ret = test_and_clear_bit(_PAGE_BIT_ACCESSED,
455 					 (unsigned long *)pmdp);
456 
457 	return ret;
458 }
459 #endif
460 
461 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
462 bool pudp_test_and_clear_young(struct vm_area_struct *vma,
463 		unsigned long addr, pud_t *pudp)
464 {
465 	bool ret = false;
466 
467 	if (pud_young(*pudp))
468 		ret = test_and_clear_bit(_PAGE_BIT_ACCESSED,
469 					 (unsigned long *)pudp);
470 
471 	return ret;
472 }
473 #endif
474 
475 bool ptep_clear_flush_young(struct vm_area_struct *vma,
476 		unsigned long address, pte_t *ptep)
477 {
478 	/*
479 	 * On x86 CPUs, clearing the accessed bit without a TLB flush
480 	 * doesn't cause data corruption. [ It could cause incorrect
481 	 * page aging and the (mistaken) reclaim of hot pages, but the
482 	 * chance of that should be relatively low. ]
483 	 *
484 	 * So as a performance optimization don't flush the TLB when
485 	 * clearing the accessed bit, it will eventually be flushed by
486 	 * a context switch or a VM operation anyway. [ In the rare
487 	 * event of it not getting flushed for a long time the delay
488 	 * shouldn't really matter because there's no real memory
489 	 * pressure for swapout to react to. ]
490 	 */
491 	return ptep_test_and_clear_young(vma, address, ptep);
492 }
493 
494 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
495 bool pmdp_clear_flush_young(struct vm_area_struct *vma,
496 		unsigned long address, pmd_t *pmdp)
497 {
498 	bool young;
499 
500 	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
501 
502 	young = pmdp_test_and_clear_young(vma, address, pmdp);
503 	if (young)
504 		flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
505 
506 	return young;
507 }
508 
509 pmd_t pmdp_invalidate_ad(struct vm_area_struct *vma, unsigned long address,
510 			 pmd_t *pmdp)
511 {
512 	VM_WARN_ON_ONCE(!pmd_present(*pmdp));
513 
514 	/*
515 	 * No flush is necessary. Once an invalid PTE is established, the PTE's
516 	 * access and dirty bits cannot be updated.
517 	 */
518 	return pmdp_establish(vma, address, pmdp, pmd_mkinvalid(*pmdp));
519 }
520 #endif
521 
522 #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && \
523 	defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
524 pud_t pudp_invalidate(struct vm_area_struct *vma, unsigned long address,
525 		     pud_t *pudp)
526 {
527 	VM_WARN_ON_ONCE(!pud_present(*pudp));
528 	pud_t old = pudp_establish(vma, address, pudp, pud_mkinvalid(*pudp));
529 	flush_pud_tlb_range(vma, address, address + HPAGE_PUD_SIZE);
530 	return old;
531 }
532 #endif
533 
534 /**
535  * reserve_top_address - Reserve a hole in the top of the kernel address space
536  * @reserve: Size of hole to reserve
537  *
538  * Can be used to relocate the fixmap area and poke a hole in the top
539  * of the kernel address space to make room for a hypervisor.
540  */
541 void __init reserve_top_address(unsigned long reserve)
542 {
543 #ifdef CONFIG_X86_32
544 	BUG_ON(fixmaps_set > 0);
545 	__FIXADDR_TOP = round_down(-reserve, 1 << PMD_SHIFT) - PAGE_SIZE;
546 	printk(KERN_INFO "Reserving virtual address space above 0x%08lx (rounded to 0x%08lx)\n",
547 	       -reserve, __FIXADDR_TOP + PAGE_SIZE);
548 #endif
549 }
550 
/*
 * Incremented by __native_set_fixmap() each time it installs an entry;
 * reserve_top_address() BUGs if this is already non-zero.
 */
int fixmaps_set;
552 
553 void __native_set_fixmap(enum fixed_addresses idx, pte_t pte)
554 {
555 	unsigned long address = __fix_to_virt(idx);
556 
557 #ifdef CONFIG_X86_64
558        /*
559 	* Ensure that the static initial page tables are covering the
560 	* fixmap completely.
561 	*/
562 	BUILD_BUG_ON(__end_of_permanent_fixed_addresses >
563 		     (FIXMAP_PMD_NUM * PTRS_PER_PTE));
564 #endif
565 
566 	if (idx >= __end_of_fixed_addresses) {
567 		BUG();
568 		return;
569 	}
570 	set_pte_vaddr(address, pte);
571 	fixmaps_set++;
572 }
573 
574 void native_set_fixmap(unsigned /* enum fixed_addresses */ idx,
575 		       phys_addr_t phys, pgprot_t flags)
576 {
577 	/* Sanitize 'prot' against any unsupported bits: */
578 	pgprot_val(flags) &= __default_kernel_pte_mask;
579 
580 	__native_set_fixmap(idx, pfn_pte(phys >> PAGE_SHIFT, flags));
581 }
582 
583 #ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
584 #if CONFIG_PGTABLE_LEVELS > 4
585 /**
586  * p4d_set_huge - Set up kernel P4D mapping
587  * @p4d: Pointer to the P4D entry
588  * @addr: Virtual address associated with the P4D entry
589  * @prot: Protection bits to use
590  *
591  * No 512GB pages yet -- always return 0
592  */
593 int p4d_set_huge(p4d_t *p4d, phys_addr_t addr, pgprot_t prot)
594 {
595 	return 0;
596 }
597 
598 /**
599  * p4d_clear_huge - Clear kernel P4D mapping when it is set
600  * @p4d: Pointer to the P4D entry to clear
601  *
602  * No 512GB pages yet -- do nothing
603  */
604 void p4d_clear_huge(p4d_t *p4d)
605 {
606 }
607 #endif
608 
609 /**
610  * pud_set_huge - Set up kernel PUD mapping
611  * @pud: Pointer to the PUD entry
612  * @addr: Virtual address associated with the PUD entry
613  * @prot: Protection bits to use
614  *
615  * MTRRs can override PAT memory types with 4KiB granularity. Therefore, this
616  * function sets up a huge page only if the complete range has the same MTRR
617  * caching mode.
618  *
619  * Callers should try to decrease page size (1GB -> 2MB -> 4K) if the bigger
620  * page mapping attempt fails.
621  *
622  * Returns 1 on success and 0 on failure.
623  */
624 int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot)
625 {
626 	u8 uniform;
627 
628 	mtrr_type_lookup(addr, addr + PUD_SIZE, &uniform);
629 	if (!uniform)
630 		return 0;
631 
632 	/* Bail out if we are we on a populated non-leaf entry: */
633 	if (pud_present(*pud) && !pud_leaf(*pud))
634 		return 0;
635 
636 	set_pte((pte_t *)pud, pfn_pte(
637 		(u64)addr >> PAGE_SHIFT,
638 		__pgprot(protval_4k_2_large(pgprot_val(prot)) | _PAGE_PSE)));
639 
640 	return 1;
641 }
642 
643 /**
644  * pmd_set_huge - Set up kernel PMD mapping
645  * @pmd: Pointer to the PMD entry
646  * @addr: Virtual address associated with the PMD entry
647  * @prot: Protection bits to use
648  *
649  * See text over pud_set_huge() above.
650  *
651  * Returns 1 on success and 0 on failure.
652  */
653 int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot)
654 {
655 	u8 uniform;
656 
657 	mtrr_type_lookup(addr, addr + PMD_SIZE, &uniform);
658 	if (!uniform) {
659 		pr_warn_once("%s: Cannot satisfy [mem %#010llx-%#010llx] with a huge-page mapping due to MTRR override.\n",
660 			     __func__, addr, addr + PMD_SIZE);
661 		return 0;
662 	}
663 
664 	/* Bail out if we are we on a populated non-leaf entry: */
665 	if (pmd_present(*pmd) && !pmd_leaf(*pmd))
666 		return 0;
667 
668 	set_pte((pte_t *)pmd, pfn_pte(
669 		(u64)addr >> PAGE_SHIFT,
670 		__pgprot(protval_4k_2_large(pgprot_val(prot)) | _PAGE_PSE)));
671 
672 	return 1;
673 }
674 
675 /**
676  * pud_clear_huge - Clear kernel PUD mapping when it is set
677  * @pud: Pointer to the PUD entry to clear.
678  *
679  * Returns 1 on success and 0 on failure (no PUD map is found).
680  */
681 int pud_clear_huge(pud_t *pud)
682 {
683 	if (pud_leaf(*pud)) {
684 		pud_clear(pud);
685 		return 1;
686 	}
687 
688 	return 0;
689 }
690 
691 /**
692  * pmd_clear_huge - Clear kernel PMD mapping when it is set
693  * @pmd: Pointer to the PMD entry to clear.
694  *
695  * Returns 1 on success and 0 on failure (no PMD map is found).
696  */
697 int pmd_clear_huge(pmd_t *pmd)
698 {
699 	if (pmd_leaf(*pmd)) {
700 		pmd_clear(pmd);
701 		return 1;
702 	}
703 
704 	return 0;
705 }
706 
707 #ifdef CONFIG_X86_64
708 /**
709  * pud_free_pmd_page - Clear PUD entry and free PMD page
710  * @pud: Pointer to a PUD
711  * @addr: Virtual address associated with PUD
712  *
713  * Context: The PUD range has been unmapped and TLB purged.
714  * Return: 1 if clearing the entry succeeded. 0 otherwise.
715  *
716  * NOTE: Callers must allow a single page allocation.
717  */
718 int pud_free_pmd_page(pud_t *pud, unsigned long addr)
719 {
720 	pmd_t *pmd, *pmd_sv;
721 	struct ptdesc *pt;
722 	int i;
723 
724 	pmd = pud_pgtable(*pud);
725 	pmd_sv = (pmd_t *)__get_free_page(GFP_KERNEL);
726 	if (!pmd_sv)
727 		return 0;
728 
729 	for (i = 0; i < PTRS_PER_PMD; i++) {
730 		pmd_sv[i] = pmd[i];
731 		if (!pmd_none(pmd[i]))
732 			pmd_clear(&pmd[i]);
733 	}
734 
735 	pud_clear(pud);
736 
737 	/* INVLPG to clear all paging-structure caches */
738 	flush_tlb_kernel_range(addr, addr + PAGE_SIZE-1);
739 
740 	for (i = 0; i < PTRS_PER_PMD; i++) {
741 		if (!pmd_none(pmd_sv[i])) {
742 			pt = page_ptdesc(pmd_page(pmd_sv[i]));
743 			pagetable_dtor_free(pt);
744 		}
745 	}
746 
747 	free_page((unsigned long)pmd_sv);
748 
749 	pmd_free(&init_mm, pmd);
750 
751 	return 1;
752 }
753 
754 /**
755  * pmd_free_pte_page - Clear PMD entry and free PTE page.
756  * @pmd: Pointer to the PMD
757  * @addr: Virtual address associated with PMD
758  *
759  * Context: The PMD range has been unmapped and TLB purged.
760  * Return: 1 if clearing the entry succeeded. 0 otherwise.
761  */
762 int pmd_free_pte_page(pmd_t *pmd, unsigned long addr)
763 {
764 	struct ptdesc *pt;
765 
766 	pt = page_ptdesc(pmd_page(*pmd));
767 	pmd_clear(pmd);
768 
769 	/* INVLPG to clear all paging-structure caches */
770 	flush_tlb_kernel_range(addr, addr + PAGE_SIZE-1);
771 
772 	pagetable_dtor_free(pt);
773 
774 	return 1;
775 }
776 
777 #else /* !CONFIG_X86_64 */
778 
779 /*
780  * Disable free page handling on x86-PAE. This assures that ioremap()
781  * does not update sync'd PMD entries. See vmalloc_sync_one().
782  */
783 int pmd_free_pte_page(pmd_t *pmd, unsigned long addr)
784 {
785 	return pmd_none(*pmd);
786 }
787 
788 #endif /* CONFIG_X86_64 */
789 #endif	/* CONFIG_HAVE_ARCH_HUGE_VMAP */
790 
791 pte_t pte_mkwrite(pte_t pte, struct vm_area_struct *vma)
792 {
793 	if (vma->vm_flags & VM_SHADOW_STACK)
794 		return pte_mkwrite_shstk(pte);
795 
796 	pte = pte_mkwrite_novma(pte);
797 
798 	return pte_clear_saveddirty(pte);
799 }
800 
801 pmd_t pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
802 {
803 	if (vma->vm_flags & VM_SHADOW_STACK)
804 		return pmd_mkwrite_shstk(pmd);
805 
806 	pmd = pmd_mkwrite_novma(pmd);
807 
808 	return pmd_clear_saveddirty(pmd);
809 }
810 
811 void arch_check_zapped_pte(struct vm_area_struct *vma, pte_t pte)
812 {
813 	/*
814 	 * Hardware before shadow stack can (rarely) set Dirty=1
815 	 * on a Write=0 PTE. So the below condition
816 	 * only indicates a software bug when shadow stack is
817 	 * supported by the HW. This checking is covered in
818 	 * pte_shstk().
819 	 */
820 	VM_WARN_ON_ONCE(!(vma->vm_flags & VM_SHADOW_STACK) &&
821 			pte_shstk(pte));
822 }
823 
824 void arch_check_zapped_pmd(struct vm_area_struct *vma, pmd_t pmd)
825 {
826 	/* See note in arch_check_zapped_pte() */
827 	VM_WARN_ON_ONCE(!(vma->vm_flags & VM_SHADOW_STACK) &&
828 			pmd_shstk(pmd));
829 }
830 
831 void arch_check_zapped_pud(struct vm_area_struct *vma, pud_t pud)
832 {
833 	/* See note in arch_check_zapped_pte() */
834 	VM_WARN_ON_ONCE(!(vma->vm_flags & VM_SHADOW_STACK) && pud_shstk(pud));
835 }
836