xref: /linux/mm/pgtable-generic.c (revision 2d46a397472191a10b0df294d64da542bfd1de57)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  *  mm/pgtable-generic.c
4  *
5  *  Generic pgtable methods declared in linux/pgtable.h
6  *
7  *  Copyright (C) 2010  Linus Torvalds
8  */
9 
10 #include <linux/pagemap.h>
11 #include <linux/hugetlb.h>
12 #include <linux/pgtable.h>
13 #include <linux/swap.h>
14 #include <linux/swapops.h>
15 #include <linux/mm_inline.h>
16 #include <linux/iommu.h>
17 #include <linux/pgalloc.h>
18 
19 #include <asm/tlb.h>
20 
21 /*
22  * If a p?d_bad entry is found while walking page tables, report
23  * the error, before resetting entry to p?d_none.  Usually (but
24  * very seldom) called out from the p?d_none_or_clear_bad macros.
25  */
26 
void pgd_clear_bad(pgd_t *pgd)
{
	/* Log the corrupt entry's value before wiping it. */
	pgd_ERROR(*pgd);
	/* Reset to pgd_none so subsequent walks see an empty entry. */
	pgd_clear(pgd);
}
32 
#ifndef __PAGETABLE_P4D_FOLDED
/* As pgd_clear_bad(), but for a corrupt p4d entry. */
void p4d_clear_bad(p4d_t *p4d)
{
	p4d_ERROR(*p4d);
	p4d_clear(p4d);
}
#endif
40 
#ifndef __PAGETABLE_PUD_FOLDED
/* As pgd_clear_bad(), but for a corrupt pud entry. */
void pud_clear_bad(pud_t *pud)
{
	pud_ERROR(*pud);
	pud_clear(pud);
}
#endif
48 
49 /*
50  * Note that the pmd variant below can't be stub'ed out just as for p4d/pud
51  * above. pmd folding is special and typically pmd_* macros refer to upper
52  * level even when folded
53  */
/* As pgd_clear_bad(), but for a corrupt pmd entry (never stubbed out; see above). */
void pmd_clear_bad(pmd_t *pmd)
{
	pmd_ERROR(*pmd);
	pmd_clear(pmd);
}
59 
60 #ifndef __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
61 /*
62  * Only sets the access flags (dirty, accessed), as well as write
63  * permission. Furthermore, we know it always gets set to a "more
64  * permissive" setting, which allows most architectures to optimize
65  * this. We return whether the PTE actually changed, which in turn
 * instructs the caller to do things like update_mmu_cache.  This
67  * used to be done in the caller, but sparc needs minor faults to
68  * force that call on sun4c so we changed this macro slightly
69  */
70 int ptep_set_access_flags(struct vm_area_struct *vma,
71 			  unsigned long address, pte_t *ptep,
72 			  pte_t entry, int dirty)
73 {
74 	int changed = !pte_same(ptep_get(ptep), entry);
75 	if (changed) {
76 		set_pte_at(vma->vm_mm, address, ptep, entry);
77 		flush_tlb_fix_spurious_fault(vma, address, ptep);
78 	}
79 	return changed;
80 }
81 #endif
82 
83 #ifndef __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH
84 bool ptep_clear_flush_young(struct vm_area_struct *vma,
85 		unsigned long address, pte_t *ptep)
86 {
87 	bool young;
88 
89 	young = ptep_test_and_clear_young(vma, address, ptep);
90 	if (young)
91 		flush_tlb_page(vma, address);
92 	return young;
93 }
94 #endif
95 
96 #ifndef __HAVE_ARCH_PTEP_CLEAR_FLUSH
97 pte_t ptep_clear_flush(struct vm_area_struct *vma, unsigned long address,
98 		       pte_t *ptep)
99 {
100 	struct mm_struct *mm = (vma)->vm_mm;
101 	pte_t pte;
102 	pte = ptep_get_and_clear(mm, address, ptep);
103 	if (pte_accessible(mm, pte))
104 		flush_tlb_page(vma, address);
105 	return pte;
106 }
107 #endif
108 
109 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
110 
111 #ifndef __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS
112 int pmdp_set_access_flags(struct vm_area_struct *vma,
113 			  unsigned long address, pmd_t *pmdp,
114 			  pmd_t entry, int dirty)
115 {
116 	int changed = !pmd_same(*pmdp, entry);
117 	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
118 	if (changed) {
119 		set_pmd_at(vma->vm_mm, address, pmdp, entry);
120 		flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
121 	}
122 	return changed;
123 }
124 #endif
125 
126 #ifndef __HAVE_ARCH_PMDP_CLEAR_YOUNG_FLUSH
127 bool pmdp_clear_flush_young(struct vm_area_struct *vma,
128 		unsigned long address, pmd_t *pmdp)
129 {
130 	bool young;
131 
132 	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
133 	young = pmdp_test_and_clear_young(vma, address, pmdp);
134 	if (young)
135 		flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
136 	return young;
137 }
138 #endif
139 
140 #ifndef __HAVE_ARCH_PMDP_HUGE_CLEAR_FLUSH
141 pmd_t pmdp_huge_clear_flush(struct vm_area_struct *vma, unsigned long address,
142 			    pmd_t *pmdp)
143 {
144 	pmd_t pmd;
145 	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
146 	VM_BUG_ON(pmd_present(*pmdp) && !pmd_trans_huge(*pmdp));
147 	pmd = pmdp_huge_get_and_clear(vma->vm_mm, address, pmdp);
148 	flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
149 	return pmd;
150 }
151 
152 #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
153 pud_t pudp_huge_clear_flush(struct vm_area_struct *vma, unsigned long address,
154 			    pud_t *pudp)
155 {
156 	pud_t pud;
157 
158 	VM_BUG_ON(address & ~HPAGE_PUD_MASK);
159 	VM_BUG_ON(!pud_trans_huge(*pudp));
160 	pud = pudp_huge_get_and_clear(vma->vm_mm, address, pudp);
161 	flush_pud_tlb_range(vma, address, address + HPAGE_PUD_SIZE);
162 	return pud;
163 }
164 #endif
165 #endif
166 
#ifndef __HAVE_ARCH_PGTABLE_DEPOSIT
/*
 * Stash a preallocated PTE page table under the pmd lock, so it can be
 * handed back later by pgtable_trans_huge_withdraw().  The most recently
 * tracked table is kept in pmd_huge_pte(); earlier deposits hang off its
 * ->lru list, giving FIFO withdraw order.
 */
void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
				pgtable_t pgtable)
{
	/* Caller must hold the pmd split lock (or page_table_lock). */
	assert_spin_locked(pmd_lockptr(mm, pmdp));

	/* FIFO */
	if (!pmd_huge_pte(mm, pmdp))
		/* First deposit: its lru becomes the list head. */
		INIT_LIST_HEAD(&pgtable->lru);
	else
		list_add(&pgtable->lru, &pmd_huge_pte(mm, pmdp)->lru);
	pmd_huge_pte(mm, pmdp) = pgtable;
}
#endif
181 
#ifndef __HAVE_ARCH_PGTABLE_WITHDRAW
/* no "address" argument so destroys page coloring of some arch */
/*
 * Take back a page table previously stashed by pgtable_trans_huge_deposit().
 * Returns the current head and promotes the next list entry (if any) to be
 * the new pmd_huge_pte().  Caller must guarantee a deposit exists: a NULL
 * pmd_huge_pte() here would be dereferenced.
 */
pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
{
	pgtable_t pgtable;

	assert_spin_locked(pmd_lockptr(mm, pmdp));

	/* FIFO */
	pgtable = pmd_huge_pte(mm, pmdp);
	/* Next deposited table (or NULL if this was the last one). */
	pmd_huge_pte(mm, pmdp) = list_first_entry_or_null(&pgtable->lru,
							  struct page, lru);
	if (pmd_huge_pte(mm, pmdp))
		/* Unlink the departing head from the remaining list. */
		list_del(&pgtable->lru);
	return pgtable;
}
#endif
199 
200 #ifndef __HAVE_ARCH_PMDP_INVALIDATE
201 pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
202 		     pmd_t *pmdp)
203 {
204 	VM_WARN_ON_ONCE(!pmd_present(*pmdp));
205 	pmd_t old = pmdp_establish(vma, address, pmdp, pmd_mkinvalid(*pmdp));
206 	flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
207 	return old;
208 }
209 #endif
210 
#ifndef __HAVE_ARCH_PMDP_INVALIDATE_AD
/*
 * Generic fallback: plain pmdp_invalidate().  Architectures that need
 * special handling of the accessed/dirty bits provide their own version.
 */
pmd_t pmdp_invalidate_ad(struct vm_area_struct *vma, unsigned long address,
			 pmd_t *pmdp)
{
	VM_WARN_ON_ONCE(!pmd_present(*pmdp));
	return pmdp_invalidate(vma, address, pmdp);
}
#endif
219 
220 #ifndef pmdp_collapse_flush
221 pmd_t pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address,
222 			  pmd_t *pmdp)
223 {
224 	/*
225 	 * pmd and hugepage pte format are same. So we could
226 	 * use the same function.
227 	 */
228 	pmd_t pmd;
229 
230 	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
231 	VM_BUG_ON(pmd_trans_huge(*pmdp));
232 	pmd = pmdp_huge_get_and_clear(vma->vm_mm, address, pmdp);
233 
234 	/* collapse entails shooting down ptes not pmd */
235 	flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
236 	return pmd;
237 }
238 #endif
239 
240 /* arch define pte_free_defer in asm/pgalloc.h for its own implementation */
241 #ifndef pte_free_defer
242 static void pte_free_now(struct rcu_head *head)
243 {
244 	struct page *page;
245 
246 	page = container_of(head, struct page, rcu_head);
247 	pte_free(NULL /* mm not passed and not used */, (pgtable_t)page);
248 }
249 
250 void pte_free_defer(struct mm_struct *mm, pgtable_t pgtable)
251 {
252 	struct page *page;
253 
254 	page = pgtable;
255 	call_rcu(&page->rcu_head, pte_free_now);
256 }
257 #endif /* pte_free_defer */
258 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
259 
260 #if defined(CONFIG_GUP_GET_PXX_LOW_HIGH) && \
261 	(defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RCU))
262 /*
263  * See the comment above ptep_get_lockless() in include/linux/pgtable.h:
264  * the barriers in pmdp_get_lockless() cannot guarantee that the value in
265  * pmd_high actually belongs with the value in pmd_low; but holding interrupts
266  * off blocks the TLB flush between present updates, which guarantees that a
267  * successful __pte_offset_map() points to a page from matched halves.
268  */
/* Disable interrupts around the split pmd read; see comment above. */
static unsigned long pmdp_get_lockless_start(void)
{
	unsigned long irqflags;

	local_irq_save(irqflags);
	return irqflags;
}
static void pmdp_get_lockless_end(unsigned long irqflags)
{
	local_irq_restore(irqflags);
}
#else
/* In this configuration no interrupt window needs closing: no-op stubs. */
static unsigned long pmdp_get_lockless_start(void) { return 0; }
static void pmdp_get_lockless_end(unsigned long irqflags) { }
#endif
284 
/*
 * Locklessly read *pmd and, if it points to a page table, map and return
 * the pte pointer for addr (NULL otherwise).  On success, rcu_read_lock()
 * is left held until pte_unmap(); the pmd value read is also copied to
 * *pmdvalp when that is non-NULL.  See the big comment further down.
 */
pte_t *__pte_offset_map(pmd_t *pmd, unsigned long addr, pmd_t *pmdvalp)
{
	unsigned long irqflags;
	pmd_t pmdval;

	rcu_read_lock();
	irqflags = pmdp_get_lockless_start();
	pmdval = pmdp_get_lockless(pmd);
	pmdp_get_lockless_end(irqflags);

	if (pmdvalp)
		*pmdvalp = pmdval;
	/* Empty or non-present entry: no page table to map. */
	if (unlikely(pmd_none(pmdval) || !pmd_present(pmdval)))
		goto nomap;
	/* A huge pmd maps the range directly; there are no ptes below it. */
	if (unlikely(pmd_trans_huge(pmdval)))
		goto nomap;
	if (unlikely(pmd_bad(pmdval))) {
		pmd_clear_bad(pmd);
		goto nomap;
	}
	return __pte_map(&pmdval, addr);
nomap:
	/* Failure: drop the RCU read lock ourselves before returning NULL. */
	rcu_read_unlock();
	return NULL;
}
310 
311 pte_t *pte_offset_map_ro_nolock(struct mm_struct *mm, pmd_t *pmd,
312 				unsigned long addr, spinlock_t **ptlp)
313 {
314 	pmd_t pmdval;
315 	pte_t *pte;
316 
317 	pte = __pte_offset_map(pmd, addr, &pmdval);
318 	if (likely(pte))
319 		*ptlp = pte_lockptr(mm, &pmdval);
320 	return pte;
321 }
322 
323 pte_t *pte_offset_map_rw_nolock(struct mm_struct *mm, pmd_t *pmd,
324 				unsigned long addr, pmd_t *pmdvalp,
325 				spinlock_t **ptlp)
326 {
327 	pte_t *pte;
328 
329 	VM_WARN_ON_ONCE(!pmdvalp);
330 	pte = __pte_offset_map(pmd, addr, pmdvalp);
331 	if (likely(pte))
332 		*ptlp = pte_lockptr(mm, pmdvalp);
333 	return pte;
334 }
335 
336 /*
337  * pte_offset_map_lock(mm, pmd, addr, ptlp) is usually called with the pmd
338  * pointer for addr, reached by walking down the mm's pgd, p4d, pud for addr:
339  * either while holding mmap_lock or vma lock for read or for write; or in
340  * truncate or rmap context, while holding file's i_mmap_lock or anon_vma lock
341  * for read (or for write). In a few cases, it may be used with pmd pointing to
342  * a pmd_t already copied to or constructed on the stack.
343  *
344  * When successful, it returns the pte pointer for addr, with its page table
345  * kmapped if necessary (when CONFIG_HIGHPTE), and locked against concurrent
346  * modification by software, with a pointer to that spinlock in ptlp (in some
347  * configs mm->page_table_lock, in SPLIT_PTLOCK configs a spinlock in table's
348  * struct page).  pte_unmap_unlock(pte, ptl) to unlock and unmap afterwards.
349  *
350  * But it is unsuccessful, returning NULL with *ptlp unchanged, if there is no
351  * page table at *pmd: if, for example, the page table has just been removed,
352  * or replaced by the huge pmd of a THP.  (When successful, *pmd is rechecked
353  * after acquiring the ptlock, and retried internally if it changed: so that a
354  * page table can be safely removed or replaced by THP while holding its lock.)
355  *
356  * pte_offset_map(pmd, addr), and its internal helper __pte_offset_map() above,
357  * just returns the pte pointer for addr, its page table kmapped if necessary;
358  * or NULL if there is no page table at *pmd.  It does not attempt to lock the
359  * page table, so cannot normally be used when the page table is to be updated,
360  * or when entries read must be stable.  But it does take rcu_read_lock(): so
361  * that even when page table is racily removed, it remains a valid though empty
362  * and disconnected table.  Until pte_unmap(pte) unmaps and rcu_read_unlock()s
363  * afterwards.
364  *
365  * pte_offset_map_ro_nolock(mm, pmd, addr, ptlp), above, is like pte_offset_map();
366  * but when successful, it also outputs a pointer to the spinlock in ptlp - as
367  * pte_offset_map_lock() does, but in this case without locking it.  This helps
368  * the caller to avoid a later pte_lockptr(mm, *pmd), which might by that time
369  * act on a changed *pmd: pte_offset_map_ro_nolock() provides the correct spinlock
370  * pointer for the page table that it returns. Even after grabbing the spinlock,
371  * we might be looking either at a page table that is still mapped or one that
372  * was unmapped and is about to get freed. But for R/O access this is sufficient.
373  * So it is only applicable for read-only cases where any modification operations
374  * to the page table are not allowed even if the corresponding spinlock is held
375  * afterwards.
376  *
377  * pte_offset_map_rw_nolock(mm, pmd, addr, pmdvalp, ptlp), above, is like
 * pte_offset_map_ro_nolock(); but when successful, it also outputs the pmdval.
379  * It is applicable for may-write cases where any modification operations to the
380  * page table may happen after the corresponding spinlock is held afterwards.
381  * But the users should make sure the page table is stable like checking pte_same()
382  * or checking pmd_same() by using the output pmdval before performing the write
383  * operations.
384  *
385  * Note: "RO" / "RW" expresses the intended semantics, not that the *kmap* will
386  * be read-only/read-write protected.
387  *
388  * Note that free_pgtables(), used after unmapping detached vmas, or when
389  * exiting the whole mm, does not take page table lock before freeing a page
390  * table, and may not use RCU at all: "outsiders" like khugepaged should avoid
391  * pte_offset_map() and co once the vma is detached from mm or mm_users is zero.
392  */
393 pte_t *pte_offset_map_lock(struct mm_struct *mm, pmd_t *pmd,
394 			   unsigned long addr, spinlock_t **ptlp)
395 {
396 	spinlock_t *ptl;
397 	pmd_t pmdval;
398 	pte_t *pte;
399 again:
400 	pte = __pte_offset_map(pmd, addr, &pmdval);
401 	if (unlikely(!pte))
402 		return pte;
403 	ptl = pte_lockptr(mm, &pmdval);
404 	spin_lock(ptl);
405 	if (likely(pmd_same(pmdval, pmdp_get_lockless(pmd)))) {
406 		*ptlp = ptl;
407 		return pte;
408 	}
409 	pte_unmap_unlock(pte, ptl);
410 	goto again;
411 }
412 
413 #ifdef CONFIG_ASYNC_KERNEL_PGTABLE_FREE
static void kernel_pgtable_work_func(struct work_struct *work);

/* Pending kernel page tables, queued here and freed from a workqueue. */
static struct {
	struct list_head list;
	/* protect above ptdesc lists */
	spinlock_t lock;
	struct work_struct work;
} kernel_pgtable_work = {
	.list = LIST_HEAD_INIT(kernel_pgtable_work.list),
	.lock = __SPIN_LOCK_UNLOCKED(kernel_pgtable_work.lock),
	.work = __WORK_INITIALIZER(kernel_pgtable_work.work, kernel_pgtable_work_func),
};
426 
/* Worker: free all queued kernel page tables. */
static void kernel_pgtable_work_func(struct work_struct *work)
{
	struct ptdesc *pt, *next;
	LIST_HEAD(page_list);

	/* Detach the whole pending list so the lock is held only briefly. */
	spin_lock(&kernel_pgtable_work.lock);
	list_splice_tail_init(&kernel_pgtable_work.list, &page_list);
	spin_unlock(&kernel_pgtable_work.lock);

	/* Invalidate IOMMU SVA caches of the kernel VA range before freeing. */
	iommu_sva_invalidate_kva_range(PAGE_OFFSET, TLB_FLUSH_ALL);
	list_for_each_entry_safe(pt, next, &page_list, pt_list)
		__pagetable_free(pt);
}
440 
/* Queue a kernel page table for asynchronous freeing by the worker above. */
void pagetable_free_kernel(struct ptdesc *pt)
{
	spin_lock(&kernel_pgtable_work.lock);
	list_add(&pt->pt_list, &kernel_pgtable_work.list);
	spin_unlock(&kernel_pgtable_work.lock);

	/* Kick the worker; scheduling an already-pending work is a no-op. */
	schedule_work(&kernel_pgtable_work.work);
}
449 #endif
450