xref: /linux/arch/s390/mm/gmap_helpers.c (revision 1fd1dc41724319406b0aff221a352a400b0ddfc5)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  *  Helper functions for KVM guest address space mapping code
4  *
5  *    Copyright IBM Corp. 2007, 2025
6  */
7 
8 #include <linux/export.h>
9 #include <linux/mm_types.h>
10 #include <linux/mmap_lock.h>
11 #include <linux/mm.h>
12 #include <linux/hugetlb.h>
13 #include <linux/swap.h>
14 #include <linux/leafops.h>
15 #include <linux/pagewalk.h>
16 #include <linux/ksm.h>
17 #include <asm/gmap_helpers.h>
18 
19 /**
20  * ptep_zap_softleaf_entry() - discard a software leaf entry.
21  * @mm: the mm
22  * @entry: the software leaf entry that needs to be zapped
23  *
24  * Discards the given software leaf entry. If the leaf entry was an actual
25  * swap entry (and not a migration entry, for example), the actual swapped
26  * page is also discarded from swap.
27  */
28 static void ptep_zap_softleaf_entry(struct mm_struct *mm, softleaf_t entry)
29 {
30 	if (softleaf_is_swap(entry))
31 		dec_mm_counter(mm, MM_SWAPENTS);
32 	else if (softleaf_is_migration(entry))
33 		dec_mm_counter(mm, mm_counter(softleaf_to_folio(entry)));
34 	swap_put_entries_direct(entry, 1);
35 }
36 
37 /**
38  * gmap_helper_zap_one_page() - discard a page if it was swapped.
39  * @mm: the mm
40  * @vmaddr: the userspace virtual address that needs to be discarded
41  *
42  * If the given address maps to a swap entry, discard it.
43  *
44  * Context: needs to be called while holding the mmap lock.
45  */
46 void gmap_helper_zap_one_page(struct mm_struct *mm, unsigned long vmaddr)
47 {
48 	struct vm_area_struct *vma;
49 	spinlock_t *ptl;
50 	pte_t *ptep;
51 
52 	mmap_assert_locked(mm);
53 
54 	/* Find the vm address for the guest address */
55 	vma = vma_lookup(mm, vmaddr);
56 	if (!vma || is_vm_hugetlb_page(vma))
57 		return;
58 
59 	/* Get pointer to the page table entry */
60 	ptep = get_locked_pte(mm, vmaddr, &ptl);
61 	if (unlikely(!ptep))
62 		return;
63 	if (pte_swap(*ptep)) {
64 		ptep_zap_softleaf_entry(mm, softleaf_from_pte(*ptep));
65 		pte_clear(mm, vmaddr, ptep);
66 	}
67 	pte_unmap_unlock(ptep, ptl);
68 }
69 EXPORT_SYMBOL_GPL(gmap_helper_zap_one_page);
70 
71 /**
72  * gmap_helper_discard() - discard user pages in the given range
73  * @mm: the mm
74  * @vmaddr: starting userspace address
75  * @end: end address (first address outside the range)
76  *
77  * All userpace pages in the range [@vamddr, @end) are discarded and unmapped.
78  *
79  * Context: needs to be called while holding the mmap lock.
80  */
81 void gmap_helper_discard(struct mm_struct *mm, unsigned long vmaddr, unsigned long end)
82 {
83 	struct vm_area_struct *vma;
84 
85 	mmap_assert_locked(mm);
86 
87 	while (vmaddr < end) {
88 		vma = find_vma_intersection(mm, vmaddr, end);
89 		if (!vma)
90 			return;
91 		if (!is_vm_hugetlb_page(vma))
92 			zap_page_range_single(vma, vmaddr, min(end, vma->vm_end) - vmaddr, NULL);
93 		vmaddr = vma->vm_end;
94 	}
95 }
96 EXPORT_SYMBOL_GPL(gmap_helper_discard);
97 
/**
 * gmap_helper_try_set_pte_unused() - mark a pte entry as unused
 * @mm: the mm
 * @vmaddr: the userspace address whose pte is to be marked
 *
 * Mark the pte corresponding the given address as unused. This will cause
 * core mm code to just drop this page instead of swapping it.
 *
 * This is a best-effort operation: if the page table lock is contended, or
 * if the page table entry cannot be found or changes concurrently, the
 * function simply returns without marking anything.
 *
 * This function needs to be called with interrupts disabled (for example
 * while holding a spinlock), or while holding the mmap lock. Normally this
 * function is called as a result of an unmap operation, and thus KVM common
 * code will already hold kvm->mmu_lock in write mode.
 *
 * Context: Needs to be called while holding the mmap lock or with interrupts
 *          disabled.
 */
void gmap_helper_try_set_pte_unused(struct mm_struct *mm, unsigned long vmaddr)
{
	pmd_t *pmdp, pmd, pmdval;
	pud_t *pudp, pud;
	p4d_t *p4dp, p4d;
	pgd_t *pgdp, pgd;
	spinlock_t *ptl;	/* Lock for the host (userspace) page table */
	pte_t *ptep;

	/* Lockless walk down the page table; bail out at any absent level. */
	pgdp = pgd_offset(mm, vmaddr);
	pgd = pgdp_get(pgdp);
	if (pgd_none(pgd) || !pgd_present(pgd))
		return;

	p4dp = p4d_offset(pgdp, vmaddr);
	p4d = p4dp_get(p4dp);
	if (p4d_none(p4d) || !p4d_present(p4d))
		return;

	/* Leaf (large) puds and pmds have no pte entry to mark; bail out. */
	pudp = pud_offset(p4dp, vmaddr);
	pud = pudp_get(pudp);
	if (pud_none(pud) || pud_leaf(pud) || !pud_present(pud))
		return;

	pmdp = pmd_offset(pudp, vmaddr);
	pmd = pmdp_get_lockless(pmdp);
	if (pmd_none(pmd) || pmd_leaf(pmd) || !pmd_present(pmd))
		return;

	/* Map the pte without taking the ptl yet; pmdval is saved for a recheck. */
	ptep = pte_offset_map_rw_nolock(mm, pmdp, vmaddr, &pmdval, &ptl);
	if (!ptep)
		return;

	/*
	 * Several paths exists that takes the ptl lock and then call the
	 * mmu_notifier, which takes the mmu_lock. The unmap path, instead,
	 * takes the mmu_lock in write mode first, and then potentially
	 * calls this function, which takes the ptl lock. This can lead to a
	 * deadlock.
	 * The unused page mechanism is only an optimization, if the
	 * _PAGE_UNUSED bit is not set, the unused page is swapped as normal
	 * instead of being discarded.
	 * If the lock is contended the bit is not set and the deadlock is
	 * avoided.
	 */
	if (spin_trylock(ptl)) {
		/*
		 * Make sure the pte we are touching is still the correct
		 * one. In theory this check should not be needed, but
		 * better safe than sorry.
		 * Disabling interrupts or holding the mmap lock is enough to
		 * guarantee that no concurrent updates to the page tables
		 * are possible.
		 */
		if (likely(pmd_same(pmdval, pmdp_get_lockless(pmdp))))
			__atomic64_or(_PAGE_UNUSED, (long *)ptep);
		spin_unlock(ptl);
	}

	pte_unmap(ptep);
}
EXPORT_SYMBOL_GPL(gmap_helper_try_set_pte_unused);
176 
177 static int find_zeropage_pte_entry(pte_t *pte, unsigned long addr,
178 				   unsigned long end, struct mm_walk *walk)
179 {
180 	unsigned long *found_addr = walk->private;
181 
182 	/* Return 1 of the page is a zeropage. */
183 	if (is_zero_pfn(pte_pfn(*pte))) {
184 		/*
185 		 * Shared zeropage in e.g., a FS DAX mapping? We cannot do the
186 		 * right thing and likely don't care: FAULT_FLAG_UNSHARE
187 		 * currently only works in COW mappings, which is also where
188 		 * mm_forbids_zeropage() is checked.
189 		 */
190 		if (!is_cow_mapping(walk->vma->vm_flags))
191 			return -EFAULT;
192 
193 		*found_addr = addr;
194 		return 1;
195 	}
196 	return 0;
197 }
198 
/* Walk ops used to stop (return 1) at the first shared zeropage in a range. */
static const struct mm_walk_ops find_zeropage_ops = {
	.pte_entry      = find_zeropage_pte_entry,
	.walk_lock      = PGWALK_WRLOCK,
};
203 
204 /** __gmap_helper_unshare_zeropages() - unshare all shared zeropages
205  * @mm: the mm whose zero pages are to be unshared
206  *
207  * Unshare all shared zeropages, replacing them by anonymous pages. Note that
208  * we cannot simply zap all shared zeropages, because this could later
209  * trigger unexpected userfaultfd missing events.
210  *
211  * This must be called after mm->context.allow_cow_sharing was
212  * set to 0, to avoid future mappings of shared zeropages.
213  *
214  * mm contracts with s390, that even if mm were to remove a page table,
215  * and racing with walk_page_range_vma() calling pte_offset_map_lock()
216  * would fail, it will never insert a page table containing empty zero
217  * pages once mm_forbids_zeropage(mm) i.e.
218  * mm->context.allow_cow_sharing is set to 0.
219  */
220 static int __gmap_helper_unshare_zeropages(struct mm_struct *mm)
221 {
222 	struct vm_area_struct *vma;
223 	VMA_ITERATOR(vmi, mm, 0);
224 	unsigned long addr;
225 	vm_fault_t fault;
226 	int rc;
227 
228 	for_each_vma(vmi, vma) {
229 		/*
230 		 * We could only look at COW mappings, but it's more future
231 		 * proof to catch unexpected zeropages in other mappings and
232 		 * fail.
233 		 */
234 		if ((vma->vm_flags & VM_PFNMAP) || is_vm_hugetlb_page(vma))
235 			continue;
236 		addr = vma->vm_start;
237 
238 retry:
239 		rc = walk_page_range_vma(vma, addr, vma->vm_end,
240 					 &find_zeropage_ops, &addr);
241 		if (rc < 0)
242 			return rc;
243 		else if (!rc)
244 			continue;
245 
246 		/* addr was updated by find_zeropage_pte_entry() */
247 		fault = handle_mm_fault(vma, addr,
248 					FAULT_FLAG_UNSHARE | FAULT_FLAG_REMOTE,
249 					NULL);
250 		if (fault & VM_FAULT_OOM)
251 			return -ENOMEM;
252 		/*
253 		 * See break_ksm(): even after handle_mm_fault() returned 0, we
254 		 * must start the lookup from the current address, because
255 		 * handle_mm_fault() may back out if there's any difficulty.
256 		 *
257 		 * VM_FAULT_SIGBUS and VM_FAULT_SIGSEGV are unexpected but
258 		 * maybe they could trigger in the future on concurrent
259 		 * truncation. In that case, the shared zeropage would be gone
260 		 * and we can simply retry and make progress.
261 		 */
262 		cond_resched();
263 		goto retry;
264 	}
265 
266 	return 0;
267 }
268 
269 /**
270  * gmap_helper_disable_cow_sharing() - disable all COW sharing
271  *
272  * Disable most COW-sharing of memory pages for the whole process:
273  * (1) Disable KSM and unmerge/unshare any KSM pages.
274  * (2) Disallow shared zeropages and unshare any zerpages that are mapped.
275  *
276  * Not that we currently don't bother with COW-shared pages that are shared
277  * with parent/child processes due to fork().
278  */
279 int gmap_helper_disable_cow_sharing(void)
280 {
281 	struct mm_struct *mm = current->mm;
282 	int rc;
283 
284 	mmap_assert_write_locked(mm);
285 
286 	if (!mm->context.allow_cow_sharing)
287 		return 0;
288 
289 	mm->context.allow_cow_sharing = 0;
290 
291 	/* Replace all shared zeropages by anonymous pages. */
292 	rc = __gmap_helper_unshare_zeropages(mm);
293 	/*
294 	 * Make sure to disable KSM (if enabled for the whole process or
295 	 * individual VMAs). Note that nothing currently hinders user space
296 	 * from re-enabling it.
297 	 */
298 	if (!rc)
299 		rc = ksm_disable(mm);
300 	if (rc)
301 		mm->context.allow_cow_sharing = 1;
302 	return rc;
303 }
304 EXPORT_SYMBOL_GPL(gmap_helper_disable_cow_sharing);
305