// SPDX-License-Identifier: GPL-2.0
/*
 * Helper functions for KVM guest address space mapping code
 *
 * Copyright IBM Corp. 2007, 2025
 */

#include <linux/export.h>
#include <linux/mm_types.h>
#include <linux/mmap_lock.h>
#include <linux/mm.h>
#include <linux/hugetlb.h>
#include <linux/swap.h>
#include <linux/leafops.h>
#include <linux/pagewalk.h>
#include <linux/ksm.h>
#include <asm/gmap_helpers.h>
#include <asm/pgtable.h>

/**
 * ptep_zap_softleaf_entry() - discard a software leaf entry.
 * @mm: the mm
 * @entry: the software leaf entry that needs to be zapped
 *
 * Discards the given software leaf entry. If the leaf entry was an actual
 * swap entry (and not a migration entry, for example), the actual swapped
 * page is also discarded from swap.
 */
static void ptep_zap_softleaf_entry(struct mm_struct *mm, softleaf_t entry)
{
	if (softleaf_is_swap(entry))
		dec_mm_counter(mm, MM_SWAPENTS);
	else if (softleaf_is_migration(entry))
		dec_mm_counter(mm, mm_counter(softleaf_to_folio(entry)));
	free_swap_and_cache(entry);
}

/**
 * gmap_helper_zap_one_page() - discard a page if it was swapped.
 * @mm: the mm
 * @vmaddr: the userspace virtual address that needs to be discarded
 *
 * If the given address maps to a swap entry, discard it.
 *
 * Context: needs to be called while holding the mmap lock.
 */
void gmap_helper_zap_one_page(struct mm_struct *mm, unsigned long vmaddr)
{
	struct vm_area_struct *vma;
	unsigned long pgstev;
	spinlock_t *ptl;
	pgste_t pgste;
	pte_t *ptep;

	mmap_assert_locked(mm);

	/* Find the vm address for the guest address */
	vma = vma_lookup(mm, vmaddr);
	if (!vma || is_vm_hugetlb_page(vma))
		return;

	/* Get pointer to the page table entry */
	ptep = get_locked_pte(mm, vmaddr, &ptl);
	if (unlikely(!ptep))
		return;
	if (pte_swap(*ptep)) {
		preempt_disable();
		pgste = pgste_get_lock(ptep);
		pgstev = pgste_val(pgste);

		if ((pgstev & _PGSTE_GPS_USAGE_MASK) == _PGSTE_GPS_USAGE_UNUSED ||
		    (pgstev & _PGSTE_GPS_ZERO)) {
			ptep_zap_softleaf_entry(mm, softleaf_from_pte(*ptep));
			pte_clear(mm, vmaddr, ptep);
		}

		pgste_set_unlock(ptep, pgste);
		preempt_enable();
	}
	pte_unmap_unlock(ptep, ptl);
}
EXPORT_SYMBOL_GPL(gmap_helper_zap_one_page);
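
/*
 * Illustrative usage sketch (not part of this file): a hypothetical caller
 * that has already translated a guest address into the userspace address
 * vmaddr could drop the corresponding swapped page like this, with the mmap
 * lock held for reading:
 *
 *	mmap_read_lock(mm);
 *	gmap_helper_zap_one_page(mm, vmaddr);
 *	mmap_read_unlock(mm);
 */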

/**
 * gmap_helper_discard() - discard user pages in the given range
 * @mm: the mm
 * @vmaddr: starting userspace address
 * @end: end address (first address outside the range)
 *
 * All userspace pages in the range [@vmaddr, @end) are discarded and unmapped.
 *
 * Context: needs to be called while holding the mmap lock.
 */
void gmap_helper_discard(struct mm_struct *mm, unsigned long vmaddr, unsigned long end)
{
	struct vm_area_struct *vma;

	mmap_assert_locked(mm);

	while (vmaddr < end) {
		vma = find_vma_intersection(mm, vmaddr, end);
		if (!vma)
			return;
		if (!is_vm_hugetlb_page(vma))
			zap_page_range_single(vma, vmaddr, min(end, vma->vm_end) - vmaddr, NULL);
		vmaddr = vma->vm_end;
	}
}
EXPORT_SYMBOL_GPL(gmap_helper_discard);
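
/*
 * Illustrative usage sketch (hypothetical range, not part of this file):
 * discarding the userspace backing of a whole guest memory region, for
 * example after the guest reported it as unused:
 *
 *	mmap_read_lock(mm);
 *	gmap_helper_discard(mm, region_start, region_start + region_size);
 *	mmap_read_unlock(mm);
 */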

static int find_zeropage_pte_entry(pte_t *pte, unsigned long addr,
				   unsigned long end, struct mm_walk *walk)
{
	unsigned long *found_addr = walk->private;

	/* Return 1 if the page is a zeropage. */
	if (is_zero_pfn(pte_pfn(*pte))) {
		/*
		 * Shared zeropage in e.g., a FS DAX mapping? We cannot do the
		 * right thing and likely don't care: FAULT_FLAG_UNSHARE
		 * currently only works in COW mappings, which is also where
		 * mm_forbids_zeropage() is checked.
		 */
		if (!is_cow_mapping(walk->vma->vm_flags))
			return -EFAULT;

		*found_addr = addr;
		return 1;
	}
	return 0;
}

static const struct mm_walk_ops find_zeropage_ops = {
	.pte_entry	= find_zeropage_pte_entry,
	.walk_lock	= PGWALK_WRLOCK,
};
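
/*
 * Note on how the ops above are driven (see __gmap_helper_unshare_zeropages()
 * below): walk_page_range_vma() returns the first non-zero value returned by
 * the pte_entry callback. A return value of 1 therefore means "shared
 * zeropage found, its address stored via walk->private", a negative value is
 * an error, and 0 means the range was walked without finding a zeropage.
 */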

/**
 * __gmap_helper_unshare_zeropages() - unshare all shared zeropages
 * @mm: the mm whose zero pages are to be unshared
 *
 * Unshare all shared zeropages, replacing them by anonymous pages. Note that
 * we cannot simply zap all shared zeropages, because this could later
 * trigger unexpected userfaultfd missing events.
 *
 * This must be called after mm->context.allow_cow_sharing was
 * set to 0, to avoid future mappings of shared zeropages.
 *
 * The core mm code contracts with s390 that, even if it were to remove a
 * page table (in which case a racing walk_page_range_vma() calling
 * pte_offset_map_lock() would fail), it will never insert a page table
 * containing empty zero pages once mm_forbids_zeropage(mm) is true, i.e.
 * once mm->context.allow_cow_sharing has been set to 0.
 *
 * Return: 0 on success, a negative error code otherwise.
 */
static int __gmap_helper_unshare_zeropages(struct mm_struct *mm)
{
	struct vm_area_struct *vma;
	VMA_ITERATOR(vmi, mm, 0);
	unsigned long addr;
	vm_fault_t fault;
	int rc;

	for_each_vma(vmi, vma) {
		/*
		 * We could only look at COW mappings, but it's more future
		 * proof to catch unexpected zeropages in other mappings and
		 * fail.
		 */
		if ((vma->vm_flags & VM_PFNMAP) || is_vm_hugetlb_page(vma))
			continue;
		addr = vma->vm_start;

retry:
		rc = walk_page_range_vma(vma, addr, vma->vm_end,
					 &find_zeropage_ops, &addr);
		if (rc < 0)
			return rc;
		else if (!rc)
			continue;

		/* addr was updated by find_zeropage_pte_entry() */
		fault = handle_mm_fault(vma, addr,
					FAULT_FLAG_UNSHARE | FAULT_FLAG_REMOTE,
					NULL);
		if (fault & VM_FAULT_OOM)
			return -ENOMEM;
		/*
		 * See break_ksm(): even after handle_mm_fault() returned 0, we
		 * must start the lookup from the current address, because
		 * handle_mm_fault() may back out if there's any difficulty.
		 *
		 * VM_FAULT_SIGBUS and VM_FAULT_SIGSEGV are unexpected but
		 * maybe they could trigger in the future on concurrent
		 * truncation. In that case, the shared zeropage would be gone
		 * and we can simply retry and make progress.
		 */
		cond_resched();
		goto retry;
	}

	return 0;
}

/**
 * gmap_helper_disable_cow_sharing() - disable all COW sharing
 *
 * Disable most COW-sharing of memory pages for the whole process:
 * (1) Disable KSM and unmerge/unshare any KSM pages.
 * (2) Disallow shared zeropages and unshare any zeropages that are mapped.
 *
 * Note that we currently don't bother with COW-shared pages that are shared
 * with parent/child processes due to fork().
 *
 * Return: 0 on success, a negative error code otherwise.
 */
int gmap_helper_disable_cow_sharing(void)
{
	struct mm_struct *mm = current->mm;
	int rc;

	mmap_assert_write_locked(mm);

	if (!mm->context.allow_cow_sharing)
		return 0;

	mm->context.allow_cow_sharing = 0;

	/* Replace all shared zeropages by anonymous pages. */
	rc = __gmap_helper_unshare_zeropages(mm);
	/*
	 * Make sure to disable KSM (if enabled for the whole process or
	 * individual VMAs). Note that nothing currently hinders user space
	 * from re-enabling it.
	 */
	if (!rc)
		rc = ksm_disable(mm);
	if (rc)
		mm->context.allow_cow_sharing = 1;
	return rc;
}
EXPORT_SYMBOL_GPL(gmap_helper_disable_cow_sharing);
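
/*
 * Illustrative usage sketch (assumed caller, not part of this file): a
 * hypervisor setup path such as KVM could disable COW sharing for the
 * current process while holding the mmap lock for writing:
 *
 *	int rc;
 *
 *	mmap_write_lock(current->mm);
 *	rc = gmap_helper_disable_cow_sharing();
 *	mmap_write_unlock(current->mm);
 */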