xref: /linux/arch/x86/mm/pgtable.c (revision 00c010e130e58301db2ea0cec1eadc931e1cb8cf)
1 // SPDX-License-Identifier: GPL-2.0
2 #include <linux/mm.h>
3 #include <linux/gfp.h>
4 #include <linux/hugetlb.h>
5 #include <asm/pgalloc.h>
6 #include <asm/tlb.h>
7 #include <asm/fixmap.h>
8 #include <asm/mtrr.h>
9 
10 #ifdef CONFIG_DYNAMIC_PHYSICAL_MASK
11 phys_addr_t physical_mask __ro_after_init = (1ULL << __PHYSICAL_MASK_SHIFT) - 1;
12 EXPORT_SYMBOL(physical_mask);
13 SYM_PIC_ALIAS(physical_mask);
14 #endif
15 
16 pgtable_t pte_alloc_one(struct mm_struct *mm)
17 {
18 	return __pte_alloc_one(mm, GFP_PGTABLE_USER);
19 }
20 
21 void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte)
22 {
23 	paravirt_release_pte(page_to_pfn(pte));
24 	tlb_remove_ptdesc(tlb, page_ptdesc(pte));
25 }
26 
27 #if CONFIG_PGTABLE_LEVELS > 2
28 void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd)
29 {
30 	paravirt_release_pmd(__pa(pmd) >> PAGE_SHIFT);
31 	/*
32 	 * NOTE! For PAE, any changes to the top page-directory-pointer-table
33 	 * entries need a full cr3 reload to flush.
34 	 */
35 #ifdef CONFIG_X86_PAE
36 	tlb->need_flush_all = 1;
37 #endif
38 	tlb_remove_ptdesc(tlb, virt_to_ptdesc(pmd));
39 }
40 
41 #if CONFIG_PGTABLE_LEVELS > 3
42 void ___pud_free_tlb(struct mmu_gather *tlb, pud_t *pud)
43 {
44 	paravirt_release_pud(__pa(pud) >> PAGE_SHIFT);
45 	tlb_remove_ptdesc(tlb, virt_to_ptdesc(pud));
46 }
47 
48 #if CONFIG_PGTABLE_LEVELS > 4
49 void ___p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d)
50 {
51 	paravirt_release_p4d(__pa(p4d) >> PAGE_SHIFT);
52 	tlb_remove_ptdesc(tlb, virt_to_ptdesc(p4d));
53 }
54 #endif	/* CONFIG_PGTABLE_LEVELS > 4 */
55 #endif	/* CONFIG_PGTABLE_LEVELS > 3 */
56 #endif	/* CONFIG_PGTABLE_LEVELS > 2 */
57 
58 static inline void pgd_list_add(pgd_t *pgd)
59 {
60 	struct ptdesc *ptdesc = virt_to_ptdesc(pgd);
61 
62 	list_add(&ptdesc->pt_list, &pgd_list);
63 }
64 
65 static inline void pgd_list_del(pgd_t *pgd)
66 {
67 	struct ptdesc *ptdesc = virt_to_ptdesc(pgd);
68 
69 	list_del(&ptdesc->pt_list);
70 }
71 
72 static void pgd_set_mm(pgd_t *pgd, struct mm_struct *mm)
73 {
74 	virt_to_ptdesc(pgd)->pt_mm = mm;
75 }
76 
77 struct mm_struct *pgd_page_get_mm(struct page *page)
78 {
79 	return page_ptdesc(page)->pt_mm;
80 }
81 
82 static void pgd_ctor(struct mm_struct *mm, pgd_t *pgd)
83 {
84 	/* PAE preallocates all its PMDs.  No cloning needed. */
85 	if (!IS_ENABLED(CONFIG_X86_PAE))
86 		clone_pgd_range(pgd + KERNEL_PGD_BOUNDARY,
87 				swapper_pg_dir + KERNEL_PGD_BOUNDARY,
88 				KERNEL_PGD_PTRS);
89 
90 	/* List used to sync kernel mapping updates */
91 	pgd_set_mm(pgd, mm);
92 	pgd_list_add(pgd);
93 }
94 
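/*
 * pgd_ctor() runs with pgd_lock already held by its caller (pgd_alloc());
 * pgd_dtor() takes the lock itself.
 */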
95 static void pgd_dtor(pgd_t *pgd)
96 {
97 	spin_lock(&pgd_lock);
98 	pgd_list_del(pgd);
99 	spin_unlock(&pgd_lock);
100 }
101 
102 /*
103  * List of all pgd's needed for non-PAE so it can invalidate entries
104  * in both cached and uncached pgd's; not needed for PAE since the
105  * kernel pmd is shared. If PAE were not to share the pmd a similar
106  * tactic would be needed. This is essentially codepath-based locking
107  * against pageattr.c; it is the unique case in which a valid change
108  * of kernel pagetables can't be lazily synchronized by vmalloc faults.
109  * vmalloc faults work because attached pagetables are never freed.
110  * -- nyc
111  */
112 
113 #ifdef CONFIG_X86_PAE
114 /*
115  * In PAE mode, we need to do a cr3 reload (=tlb flush) when
116  * updating the top-level pagetable entries to guarantee the
117  * processor notices the update.  Since this is expensive, and
118  * all 4 top-level entries are used almost immediately in a
119  * new process's life, we just pre-populate them here.
120  */
121 #define PREALLOCATED_PMDS	PTRS_PER_PGD
122 
123 /*
124  * "USER_PMDS" are the PMDs for the user copy of the page tables when
125  * PTI is enabled. They do not exist when PTI is disabled.  Note that
126  * this is distinct from the user _portion_ of the kernel page tables
127  * which always exists.
128  *
129  * We allocate separate PMDs for the kernel part of the user page-table
130  * when PTI is enabled. We need them to map the per-process LDT into the
131  * user-space page-table.
132  */
133 #define PREALLOCATED_USER_PMDS	 (boot_cpu_has(X86_FEATURE_PTI) ? \
134 					KERNEL_PGD_PTRS : 0)
135 #define MAX_PREALLOCATED_USER_PMDS KERNEL_PGD_PTRS
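/*
 * MAX_PREALLOCATED_USER_PMDS is the compile-time bound used to size the
 * on-stack array in pgd_alloc(); PREALLOCATED_USER_PMDS is the runtime
 * count and is zero when PTI is disabled.
 */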
136 
137 void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd)
138 {
139 	paravirt_alloc_pmd(mm, __pa(pmd) >> PAGE_SHIFT);
140 
141 	/* Note: almost everything apart from _PAGE_PRESENT is
142 	   reserved at the pmd (PDPT) level. */
143 	set_pud(pudp, __pud(__pa(pmd) | _PAGE_PRESENT));
144 
145 	/*
146 	 * According to Intel App note "TLBs, Paging-Structure Caches,
147 	 * and Their Invalidation", April 2007, document 317080-001,
148 	 * section 8.1: in PAE mode we explicitly have to flush the
149 	 * TLB via cr3 if the top-level pgd is changed...
150 	 */
151 	flush_tlb_mm(mm);
152 }
153 #else  /* !CONFIG_X86_PAE */
154 
155 /* No need to prepopulate any pagetable entries in non-PAE modes. */
156 #define PREALLOCATED_PMDS	0
157 #define PREALLOCATED_USER_PMDS	 0
158 #define MAX_PREALLOCATED_USER_PMDS 0
159 #endif	/* CONFIG_X86_PAE */
160 
161 static void free_pmds(struct mm_struct *mm, pmd_t *pmds[], int count)
162 {
163 	int i;
164 	struct ptdesc *ptdesc;
165 
166 	for (i = 0; i < count; i++)
167 		if (pmds[i]) {
168 			ptdesc = virt_to_ptdesc(pmds[i]);
169 
170 			pagetable_dtor(ptdesc);
171 			pagetable_free(ptdesc);
172 			mm_dec_nr_pmds(mm);
173 		}
174 }
175 
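/*
 * Allocate 'count' PMD pages up front.  init_mm allocations are not
 * charged to a memcg (__GFP_ACCOUNT cleared) and PMD pages never come
 * from highmem.  On any failure, everything allocated so far is freed
 * and -ENOMEM is returned.
 */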
176 static int preallocate_pmds(struct mm_struct *mm, pmd_t *pmds[], int count)
177 {
178 	int i;
179 	bool failed = false;
180 	gfp_t gfp = GFP_PGTABLE_USER;
181 
182 	if (mm == &init_mm)
183 		gfp &= ~__GFP_ACCOUNT;
184 	gfp &= ~__GFP_HIGHMEM;
185 
186 	for (i = 0; i < count; i++) {
187 		pmd_t *pmd = NULL;
188 		struct ptdesc *ptdesc = pagetable_alloc(gfp, 0);
189 
190 		if (!ptdesc)
191 			failed = true;
192 		if (ptdesc && !pagetable_pmd_ctor(mm, ptdesc)) {
193 			pagetable_free(ptdesc);
194 			ptdesc = NULL;
195 			failed = true;
196 		}
197 		if (ptdesc) {
198 			mm_inc_nr_pmds(mm);
199 			pmd = ptdesc_address(ptdesc);
200 		}
201 
202 		pmds[i] = pmd;
203 	}
204 
205 	if (failed) {
206 		free_pmds(mm, pmds, count);
207 		return -ENOMEM;
208 	}
209 
210 	return 0;
211 }
212 
213 /*
214  * Mop up any pmd pages which may still be attached to the pgd.
215  * Normally they will be freed by munmap/exit_mmap, but any pmd we
216  * preallocate which never got a corresponding vma will need to be
217  * freed manually.
218  */
219 static void mop_up_one_pmd(struct mm_struct *mm, pgd_t *pgdp)
220 {
221 	pgd_t pgd = *pgdp;
222 
223 	if (pgd_val(pgd) != 0) {
224 		pmd_t *pmd = (pmd_t *)pgd_page_vaddr(pgd);
225 
226 		pgd_clear(pgdp);
227 
228 		paravirt_release_pmd(pgd_val(pgd) >> PAGE_SHIFT);
229 		pmd_free(mm, pmd);
230 		mm_dec_nr_pmds(mm);
231 	}
232 }
233 
234 static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp)
235 {
236 	int i;
237 
238 	for (i = 0; i < PREALLOCATED_PMDS; i++)
239 		mop_up_one_pmd(mm, &pgdp[i]);
240 
241 #ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION
242 
243 	if (!boot_cpu_has(X86_FEATURE_PTI))
244 		return;
245 
246 	pgdp = kernel_to_user_pgdp(pgdp);
247 
248 	for (i = 0; i < PREALLOCATED_USER_PMDS; i++)
249 		mop_up_one_pmd(mm, &pgdp[i + KERNEL_PGD_BOUNDARY]);
250 #endif
251 }
252 
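/*
 * Attach the preallocated PMDs to the new pgd.  Entries at or above
 * KERNEL_PGD_BOUNDARY are first copied from swapper_pg_dir so the new
 * mm shares the kernel mappings.
 */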
253 static void pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmds[])
254 {
255 	p4d_t *p4d;
256 	pud_t *pud;
257 	int i;
258 
259 	p4d = p4d_offset(pgd, 0);
260 	pud = pud_offset(p4d, 0);
261 
262 	for (i = 0; i < PREALLOCATED_PMDS; i++, pud++) {
263 		pmd_t *pmd = pmds[i];
264 
265 		if (i >= KERNEL_PGD_BOUNDARY)
266 			memcpy(pmd, (pmd_t *)pgd_page_vaddr(swapper_pg_dir[i]),
267 			       sizeof(pmd_t) * PTRS_PER_PMD);
268 
269 		pud_populate(mm, pud, pmd);
270 	}
271 }
272 
273 #ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION
274 static void pgd_prepopulate_user_pmd(struct mm_struct *mm,
275 				     pgd_t *k_pgd, pmd_t *pmds[])
276 {
277 	pgd_t *s_pgd = kernel_to_user_pgdp(swapper_pg_dir);
278 	pgd_t *u_pgd = kernel_to_user_pgdp(k_pgd);
279 	p4d_t *u_p4d;
280 	pud_t *u_pud;
281 	int i;
282 
283 	u_p4d = p4d_offset(u_pgd, 0);
284 	u_pud = pud_offset(u_p4d, 0);
285 
286 	s_pgd += KERNEL_PGD_BOUNDARY;
287 	u_pud += KERNEL_PGD_BOUNDARY;
288 
289 	for (i = 0; i < PREALLOCATED_USER_PMDS; i++, u_pud++, s_pgd++) {
290 		pmd_t *pmd = pmds[i];
291 
292 		memcpy(pmd, (pmd_t *)pgd_page_vaddr(*s_pgd),
293 		       sizeof(pmd_t) * PTRS_PER_PMD);
294 
295 		pud_populate(mm, u_pud, pmd);
296 	}
297 
298 }
299 #else
300 static void pgd_prepopulate_user_pmd(struct mm_struct *mm,
301 				     pgd_t *k_pgd, pmd_t *pmds[])
302 {
303 }
304 #endif
305 
306 static inline pgd_t *_pgd_alloc(struct mm_struct *mm)
307 {
308 	/*
309 	 * PTI and Xen need a whole page for the PAE PGD
310 	 * even though the hardware only needs 32 bytes.
311 	 *
312 	 * For simplicity, allocate a page for all users.
313 	 */
314 	return __pgd_alloc(mm, pgd_allocation_order());
315 }
316 
317 static inline void _pgd_free(struct mm_struct *mm, pgd_t *pgd)
318 {
319 	__pgd_free(mm, pgd);
320 }
321 
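/*
 * Allocate and set up a new pgd: allocate the pgd page, preallocate the
 * PAE/PTI PMDs where required, then construct everything under pgd_lock
 * so walkers of pgd_list never observe a partially populated pgd.
 */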
322 pgd_t *pgd_alloc(struct mm_struct *mm)
323 {
324 	pgd_t *pgd;
325 	pmd_t *u_pmds[MAX_PREALLOCATED_USER_PMDS];
326 	pmd_t *pmds[PREALLOCATED_PMDS];
327 
328 	pgd = _pgd_alloc(mm);
329 
330 	if (pgd == NULL)
331 		goto out;
332 
333 	mm->pgd = pgd;
334 
335 	if (sizeof(pmds) != 0 &&
336 			preallocate_pmds(mm, pmds, PREALLOCATED_PMDS) != 0)
337 		goto out_free_pgd;
338 
339 	if (sizeof(u_pmds) != 0 &&
340 			preallocate_pmds(mm, u_pmds, PREALLOCATED_USER_PMDS) != 0)
341 		goto out_free_pmds;
342 
343 	if (paravirt_pgd_alloc(mm) != 0)
344 		goto out_free_user_pmds;
345 
346 	/*
347 	 * Make sure that pre-populating the pmds is atomic with
348 	 * respect to anything walking the pgd_list, so that they
349 	 * never see a partially populated pgd.
350 	 */
351 	spin_lock(&pgd_lock);
352 
353 	pgd_ctor(mm, pgd);
354 	if (sizeof(pmds) != 0)
355 		pgd_prepopulate_pmd(mm, pgd, pmds);
356 
357 	if (sizeof(u_pmds) != 0)
358 		pgd_prepopulate_user_pmd(mm, pgd, u_pmds);
359 
360 	spin_unlock(&pgd_lock);
361 
362 	return pgd;
363 
364 out_free_user_pmds:
365 	if (sizeof(u_pmds) != 0)
366 		free_pmds(mm, u_pmds, PREALLOCATED_USER_PMDS);
367 out_free_pmds:
368 	if (sizeof(pmds) != 0)
369 		free_pmds(mm, pmds, PREALLOCATED_PMDS);
370 out_free_pgd:
371 	_pgd_free(mm, pgd);
372 out:
373 	return NULL;
374 }
375 
376 void pgd_free(struct mm_struct *mm, pgd_t *pgd)
377 {
378 	pgd_mop_up_pmds(mm, pgd);
379 	pgd_dtor(pgd);
380 	paravirt_pgd_free(mm, pgd);
381 	_pgd_free(mm, pgd);
382 }
383 
384 /*
385  * Used to set accessed or dirty bits in the page table entries
386  * on other architectures. On x86, the accessed and dirty bits
387  * are tracked by hardware. However, do_wp_page calls this function
388  * to also make the pte writeable at the same time the dirty bit is
389  * set. In that case we do actually need to write the PTE.
390  */
391 int ptep_set_access_flags(struct vm_area_struct *vma,
392 			  unsigned long address, pte_t *ptep,
393 			  pte_t entry, int dirty)
394 {
395 	int changed = !pte_same(*ptep, entry);
396 
397 	if (changed && dirty)
398 		set_pte(ptep, entry);
399 
400 	return changed;
401 }
402 
403 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
404 int pmdp_set_access_flags(struct vm_area_struct *vma,
405 			  unsigned long address, pmd_t *pmdp,
406 			  pmd_t entry, int dirty)
407 {
408 	int changed = !pmd_same(*pmdp, entry);
409 
410 	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
411 
412 	if (changed && dirty) {
413 		set_pmd(pmdp, entry);
414 		/*
415 		 * We had a write-protection fault here and changed the pmd
416 		 * to be more permissive. No need to flush the TLB for that,
417 		 * #PF is architecturally guaranteed to do that and in the
418 		 * worst-case we'll generate a spurious fault.
419 		 */
420 	}
421 
422 	return changed;
423 }
424 
425 int pudp_set_access_flags(struct vm_area_struct *vma, unsigned long address,
426 			  pud_t *pudp, pud_t entry, int dirty)
427 {
428 	int changed = !pud_same(*pudp, entry);
429 
430 	VM_BUG_ON(address & ~HPAGE_PUD_MASK);
431 
432 	if (changed && dirty) {
433 		set_pud(pudp, entry);
434 		/*
435 		 * We had a write-protection fault here and changed the pud
436 		 * to be more permissive. No need to flush the TLB for that,
437 		 * #PF is architecturally guaranteed to do that and in the
438 		 * worst-case we'll generate a spurious fault.
439 		 */
440 	}
441 
442 	return changed;
443 }
444 #endif
445 
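/*
 * Clear the Accessed bit with an atomic test_and_clear_bit() so that a
 * concurrent hardware update of the PTE (e.g. setting the Dirty bit)
 * cannot be lost.  Flushing the TLB is left to the caller.
 */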
446 int ptep_test_and_clear_young(struct vm_area_struct *vma,
447 			      unsigned long addr, pte_t *ptep)
448 {
449 	int ret = 0;
450 
451 	if (pte_young(*ptep))
452 		ret = test_and_clear_bit(_PAGE_BIT_ACCESSED,
453 					 (unsigned long *) &ptep->pte);
454 
455 	return ret;
456 }
457 
458 #if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG)
459 int pmdp_test_and_clear_young(struct vm_area_struct *vma,
460 			      unsigned long addr, pmd_t *pmdp)
461 {
462 	int ret = 0;
463 
464 	if (pmd_young(*pmdp))
465 		ret = test_and_clear_bit(_PAGE_BIT_ACCESSED,
466 					 (unsigned long *)pmdp);
467 
468 	return ret;
469 }
470 #endif
471 
472 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
473 int pudp_test_and_clear_young(struct vm_area_struct *vma,
474 			      unsigned long addr, pud_t *pudp)
475 {
476 	int ret = 0;
477 
478 	if (pud_young(*pudp))
479 		ret = test_and_clear_bit(_PAGE_BIT_ACCESSED,
480 					 (unsigned long *)pudp);
481 
482 	return ret;
483 }
484 #endif
485 
486 int ptep_clear_flush_young(struct vm_area_struct *vma,
487 			   unsigned long address, pte_t *ptep)
488 {
489 	/*
490 	 * On x86 CPUs, clearing the accessed bit without a TLB flush
491 	 * doesn't cause data corruption. [ It could cause incorrect
492 	 * page aging and the (mistaken) reclaim of hot pages, but the
493 	 * chance of that should be relatively low. ]
494 	 *
495 	 * So as a performance optimization don't flush the TLB when
496 	 * clearing the accessed bit, it will eventually be flushed by
497 	 * a context switch or a VM operation anyway. [ In the rare
498 	 * event of it not getting flushed for a long time the delay
499 	 * shouldn't really matter because there's no real memory
500 	 * pressure for swapout to react to. ]
501 	 */
502 	return ptep_test_and_clear_young(vma, address, ptep);
503 }
504 
505 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
506 int pmdp_clear_flush_young(struct vm_area_struct *vma,
507 			   unsigned long address, pmd_t *pmdp)
508 {
509 	int young;
510 
511 	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
512 
513 	young = pmdp_test_and_clear_young(vma, address, pmdp);
514 	if (young)
515 		flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
516 
517 	return young;
518 }
519 
520 pmd_t pmdp_invalidate_ad(struct vm_area_struct *vma, unsigned long address,
521 			 pmd_t *pmdp)
522 {
523 	VM_WARN_ON_ONCE(!pmd_present(*pmdp));
524 
525 	/*
526 	 * No flush is necessary. Once an invalid PTE is established, the PTE's
527 	 * access and dirty bits cannot be updated.
528 	 */
529 	return pmdp_establish(vma, address, pmdp, pmd_mkinvalid(*pmdp));
530 }
531 #endif
532 
533 #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && \
534 	defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
535 pud_t pudp_invalidate(struct vm_area_struct *vma, unsigned long address,
536 		     pud_t *pudp)
537 {
538 	VM_WARN_ON_ONCE(!pud_present(*pudp));
539 	pud_t old = pudp_establish(vma, address, pudp, pud_mkinvalid(*pudp));
540 	flush_pud_tlb_range(vma, address, address + HPAGE_PUD_SIZE);
541 	return old;
542 }
543 #endif
544 
545 /**
546  * reserve_top_address - Reserve a hole in the top of the kernel address space
547  * @reserve: Size of hole to reserve
548  *
549  * Can be used to relocate the fixmap area and poke a hole in the top
550  * of the kernel address space to make room for a hypervisor.
551  */
552 void __init reserve_top_address(unsigned long reserve)
553 {
554 #ifdef CONFIG_X86_32
555 	BUG_ON(fixmaps_set > 0);
556 	__FIXADDR_TOP = round_down(-reserve, 1 << PMD_SHIFT) - PAGE_SIZE;
557 	printk(KERN_INFO "Reserving virtual address space above 0x%08lx (rounded to 0x%08lx)\n",
558 	       -reserve, __FIXADDR_TOP + PAGE_SIZE);
559 #endif
560 }
561 
562 int fixmaps_set;
563 
564 void __native_set_fixmap(enum fixed_addresses idx, pte_t pte)
565 {
566 	unsigned long address = __fix_to_virt(idx);
567 
568 #ifdef CONFIG_X86_64
569        /*
570 	* Ensure that the static initial page tables are covering the
571 	* fixmap completely.
572 	*/
573 	BUILD_BUG_ON(__end_of_permanent_fixed_addresses >
574 		     (FIXMAP_PMD_NUM * PTRS_PER_PTE));
575 #endif
576 
577 	if (idx >= __end_of_fixed_addresses) {
578 		BUG();
579 		return;
580 	}
581 	set_pte_vaddr(address, pte);
582 	fixmaps_set++;
583 }
584 
585 void native_set_fixmap(unsigned /* enum fixed_addresses */ idx,
586 		       phys_addr_t phys, pgprot_t flags)
587 {
588 	/* Sanitize 'prot' against any unsupported bits: */
589 	pgprot_val(flags) &= __default_kernel_pte_mask;
590 
591 	__native_set_fixmap(idx, pfn_pte(phys >> PAGE_SHIFT, flags));
592 }
593 
594 #ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
595 #if CONFIG_PGTABLE_LEVELS > 4
596 /**
597  * p4d_set_huge - Set up kernel P4D mapping
598  * @p4d: Pointer to the P4D entry
599  * @addr: Virtual address associated with the P4D entry
600  * @prot: Protection bits to use
601  *
602  * No 512GB pages yet -- always return 0
603  */
604 int p4d_set_huge(p4d_t *p4d, phys_addr_t addr, pgprot_t prot)
605 {
606 	return 0;
607 }
608 
609 /**
610  * p4d_clear_huge - Clear kernel P4D mapping when it is set
611  * @p4d: Pointer to the P4D entry to clear
612  *
613  * No 512GB pages yet -- do nothing
614  */
615 void p4d_clear_huge(p4d_t *p4d)
616 {
617 }
618 #endif
619 
620 /**
621  * pud_set_huge - Set up kernel PUD mapping
622  * @pud: Pointer to the PUD entry
623  * @addr: Virtual address associated with the PUD entry
624  * @prot: Protection bits to use
625  *
626  * MTRRs can override PAT memory types with 4KiB granularity. Therefore, this
627  * function sets up a huge page only if the complete range has the same MTRR
628  * caching mode.
629  *
630  * Callers should try to decrease page size (1GB -> 2MB -> 4K) if the bigger
631  * page mapping attempt fails.
632  *
633  * Returns 1 on success and 0 on failure.
634  */
635 int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot)
636 {
637 	u8 uniform;
638 
639 	mtrr_type_lookup(addr, addr + PUD_SIZE, &uniform);
640 	if (!uniform)
641 		return 0;
642 
643 	/* Bail out if we are on a populated non-leaf entry: */
644 	if (pud_present(*pud) && !pud_leaf(*pud))
645 		return 0;
646 
647 	set_pte((pte_t *)pud, pfn_pte(
648 		(u64)addr >> PAGE_SHIFT,
649 		__pgprot(protval_4k_2_large(pgprot_val(prot)) | _PAGE_PSE)));
650 
651 	return 1;
652 }
653 
654 /**
655  * pmd_set_huge - Set up kernel PMD mapping
656  * @pmd: Pointer to the PMD entry
657  * @addr: Virtual address associated with the PMD entry
658  * @prot: Protection bits to use
659  *
660  * See text over pud_set_huge() above.
661  *
662  * Returns 1 on success and 0 on failure.
663  */
664 int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot)
665 {
666 	u8 uniform;
667 
668 	mtrr_type_lookup(addr, addr + PMD_SIZE, &uniform);
669 	if (!uniform) {
670 		pr_warn_once("%s: Cannot satisfy [mem %#010llx-%#010llx] with a huge-page mapping due to MTRR override.\n",
671 			     __func__, addr, addr + PMD_SIZE);
672 		return 0;
673 	}
674 
675 	/* Bail out if we are on a populated non-leaf entry: */
676 	if (pmd_present(*pmd) && !pmd_leaf(*pmd))
677 		return 0;
678 
679 	set_pte((pte_t *)pmd, pfn_pte(
680 		(u64)addr >> PAGE_SHIFT,
681 		__pgprot(protval_4k_2_large(pgprot_val(prot)) | _PAGE_PSE)));
682 
683 	return 1;
684 }
685 
686 /**
687  * pud_clear_huge - Clear kernel PUD mapping when it is set
688  * @pud: Pointer to the PUD entry to clear.
689  *
690  * Returns 1 on success and 0 on failure (no PUD map is found).
691  */
692 int pud_clear_huge(pud_t *pud)
693 {
694 	if (pud_leaf(*pud)) {
695 		pud_clear(pud);
696 		return 1;
697 	}
698 
699 	return 0;
700 }
701 
702 /**
703  * pmd_clear_huge - Clear kernel PMD mapping when it is set
704  * @pmd: Pointer to the PMD entry to clear.
705  *
706  * Returns 1 on success and 0 on failure (no PMD map is found).
707  */
708 int pmd_clear_huge(pmd_t *pmd)
709 {
710 	if (pmd_leaf(*pmd)) {
711 		pmd_clear(pmd);
712 		return 1;
713 	}
714 
715 	return 0;
716 }
717 
718 #ifdef CONFIG_X86_64
719 /**
720  * pud_free_pmd_page - Clear PUD entry and free PMD page
721  * @pud: Pointer to a PUD
722  * @addr: Virtual address associated with PUD
723  *
724  * Context: The PUD range has been unmapped and TLB purged.
725  * Return: 1 if clearing the entry succeeded. 0 otherwise.
726  *
727  * NOTE: Callers must allow a single page allocation.
728  */
729 int pud_free_pmd_page(pud_t *pud, unsigned long addr)
730 {
731 	pmd_t *pmd, *pmd_sv;
732 	pte_t *pte;
733 	int i;
734 
735 	pmd = pud_pgtable(*pud);
736 	pmd_sv = (pmd_t *)__get_free_page(GFP_KERNEL);
737 	if (!pmd_sv)
738 		return 0;
739 
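	/*
	 * Snapshot the PMD entries before clearing them: the PTE pages they
	 * reference are freed only after the PUD is cleared and the
	 * paging-structure caches have been flushed below.
	 */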
740 	for (i = 0; i < PTRS_PER_PMD; i++) {
741 		pmd_sv[i] = pmd[i];
742 		if (!pmd_none(pmd[i]))
743 			pmd_clear(&pmd[i]);
744 	}
745 
746 	pud_clear(pud);
747 
748 	/* INVLPG to clear all paging-structure caches */
749 	flush_tlb_kernel_range(addr, addr + PAGE_SIZE-1);
750 
751 	for (i = 0; i < PTRS_PER_PMD; i++) {
752 		if (!pmd_none(pmd_sv[i])) {
753 			pte = (pte_t *)pmd_page_vaddr(pmd_sv[i]);
754 			pte_free_kernel(&init_mm, pte);
755 		}
756 	}
757 
758 	free_page((unsigned long)pmd_sv);
759 
760 	pmd_free(&init_mm, pmd);
761 
762 	return 1;
763 }
764 
765 /**
766  * pmd_free_pte_page - Clear PMD entry and free PTE page.
767  * @pmd: Pointer to the PMD
768  * @addr: Virtual address associated with PMD
769  *
770  * Context: The PMD range has been unmapped and TLB purged.
771  * Return: 1 if clearing the entry succeeded. 0 otherwise.
772  */
773 int pmd_free_pte_page(pmd_t *pmd, unsigned long addr)
774 {
775 	pte_t *pte;
776 
777 	pte = (pte_t *)pmd_page_vaddr(*pmd);
778 	pmd_clear(pmd);
779 
780 	/* INVLPG to clear all paging-structure caches */
781 	flush_tlb_kernel_range(addr, addr + PAGE_SIZE-1);
782 
783 	pte_free_kernel(&init_mm, pte);
784 
785 	return 1;
786 }
787 
788 #else /* !CONFIG_X86_64 */
789 
790 /*
791  * Disable free page handling on x86-PAE. This ensures that ioremap()
792  * does not update sync'd PMD entries. See vmalloc_sync_one().
793  */
794 int pmd_free_pte_page(pmd_t *pmd, unsigned long addr)
795 {
796 	return pmd_none(*pmd);
797 }
798 
799 #endif /* CONFIG_X86_64 */
800 #endif	/* CONFIG_HAVE_ARCH_HUGE_VMAP */
801 
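/*
 * With shadow stacks, Write=0,Dirty=1 encodes a shadow-stack mapping.
 * For shadow-stack VMAs create exactly that encoding; for everything
 * else make the entry writable and let pte_clear_saveddirty() sort out
 * the software saved-dirty bit.
 */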
802 pte_t pte_mkwrite(pte_t pte, struct vm_area_struct *vma)
803 {
804 	if (vma->vm_flags & VM_SHADOW_STACK)
805 		return pte_mkwrite_shstk(pte);
806 
807 	pte = pte_mkwrite_novma(pte);
808 
809 	return pte_clear_saveddirty(pte);
810 }
811 
812 pmd_t pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
813 {
814 	if (vma->vm_flags & VM_SHADOW_STACK)
815 		return pmd_mkwrite_shstk(pmd);
816 
817 	pmd = pmd_mkwrite_novma(pmd);
818 
819 	return pmd_clear_saveddirty(pmd);
820 }
821 
822 void arch_check_zapped_pte(struct vm_area_struct *vma, pte_t pte)
823 {
824 	/*
825 	 * Hardware before shadow stack can (rarely) set Dirty=1
826 	 * on a Write=0 PTE. So the below condition
827 	 * only indicates a software bug when shadow stack is
828 	 * supported by the HW. This checking is covered in
829 	 * pte_shstk().
830 	 */
831 	VM_WARN_ON_ONCE(!(vma->vm_flags & VM_SHADOW_STACK) &&
832 			pte_shstk(pte));
833 }
834 
835 void arch_check_zapped_pmd(struct vm_area_struct *vma, pmd_t pmd)
836 {
837 	/* See note in arch_check_zapped_pte() */
838 	VM_WARN_ON_ONCE(!(vma->vm_flags & VM_SHADOW_STACK) &&
839 			pmd_shstk(pmd));
840 }
841 
842 void arch_check_zapped_pud(struct vm_area_struct *vma, pud_t pud)
843 {
844 	/* See note in arch_check_zapped_pte() */
845 	VM_WARN_ON_ONCE(!(vma->vm_flags & VM_SHADOW_STACK) && pud_shstk(pud));
846 }
847