// SPDX-License-Identifier: GPL-2.0
#define CREATE_TRACE_POINTS
#include <trace/events/mmap_lock.h>

#include <linux/mm.h>
#include <linux/cgroup.h>
#include <linux/memcontrol.h>
#include <linux/mmap_lock.h>
#include <linux/mutex.h>
#include <linux/percpu.h>
#include <linux/rcupdate.h>
#include <linux/smp.h>
#include <linux/trace_events.h>
#include <linux/local_lock.h>

EXPORT_TRACEPOINT_SYMBOL(mmap_lock_start_locking);
EXPORT_TRACEPOINT_SYMBOL(mmap_lock_acquire_returned);
EXPORT_TRACEPOINT_SYMBOL(mmap_lock_released);

#ifdef CONFIG_TRACING
/*
 * Trace calls must be in a separate file, as otherwise there's a circular
 * dependency between linux/mmap_lock.h and trace/events/mmap_lock.h.
 */
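
/*
 * These out-of-line wrappers let the inline lock helpers in linux/mmap_lock.h
 * fire the tracepoints without including the trace header there. The call
 * chain below is an illustrative sketch of that arrangement, not a verbatim
 * copy of the inline helpers:
 *
 *     mmap_write_lock(mm)
 *       -> __mmap_lock_trace_start_locking(mm, true)      (inline helper)
 *         -> __mmap_lock_do_trace_start_locking(mm, true) (this file)
 *           -> trace_mmap_lock_start_locking(mm, true)
 */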

void __mmap_lock_do_trace_start_locking(struct mm_struct *mm, bool write)
{
        trace_mmap_lock_start_locking(mm, write);
}
EXPORT_SYMBOL(__mmap_lock_do_trace_start_locking);

void __mmap_lock_do_trace_acquire_returned(struct mm_struct *mm, bool write,
                                           bool success)
{
        trace_mmap_lock_acquire_returned(mm, write, success);
}
EXPORT_SYMBOL(__mmap_lock_do_trace_acquire_returned);

void __mmap_lock_do_trace_released(struct mm_struct *mm, bool write)
{
        trace_mmap_lock_released(mm, write);
}
EXPORT_SYMBOL(__mmap_lock_do_trace_released);
#endif /* CONFIG_TRACING */

#ifdef CONFIG_MMU
#ifdef CONFIG_PER_VMA_LOCK
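/*
 * Write-side entry: bump vm_refcnt by VMA_LOCK_OFFSET to exclude new readers
 * and wait for existing ones to drain. Returns false if the vma is already
 * detached (vm_refcnt == 0); with the mmap lock held for write, nothing can
 * re-attach it underneath us.
 */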
static inline bool __vma_enter_locked(struct vm_area_struct *vma, bool detaching)
{
        unsigned int tgt_refcnt = VMA_LOCK_OFFSET;

        /* Additional refcnt if the vma is attached. */
        if (!detaching)
                tgt_refcnt++;

        /*
         * If vma is detached then only vma_mark_attached() can raise the
         * vm_refcnt. mmap_write_lock prevents racing with vma_mark_attached().
         */
        if (!refcount_add_not_zero(VMA_LOCK_OFFSET, &vma->vm_refcnt))
                return false;

        rwsem_acquire(&vma->vmlock_dep_map, 0, 0, _RET_IP_);
        rcuwait_wait_event(&vma->vm_mm->vma_writer_wait,
                           refcount_read(&vma->vm_refcnt) == tgt_refcnt,
                           TASK_UNINTERRUPTIBLE);
        lock_acquired(&vma->vmlock_dep_map, _RET_IP_);

        return true;
}

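/*
 * Drop the VMA_LOCK_OFFSET bias taken in __vma_enter_locked(). *detached
 * reports whether vm_refcnt dropped to zero, i.e. the vma is now detached.
 */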
static inline void __vma_exit_locked(struct vm_area_struct *vma, bool *detached)
{
        *detached = refcount_sub_and_test(VMA_LOCK_OFFSET, &vma->vm_refcnt);
        rwsem_release(&vma->vmlock_dep_map, _RET_IP_);
}

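/*
 * Write-lock the vma: wait for any readers to drain (if the vma is attached),
 * then record the current mmap write-lock sequence in vm_lock_seq so that
 * vma_start_read() refuses to lock this vma until the mmap write lock is
 * released or downgraded.
 */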
void __vma_start_write(struct vm_area_struct *vma, unsigned int mm_lock_seq)
{
        bool locked;

        /*
         * __vma_enter_locked() returns false immediately if the vma is not
         * attached, otherwise it waits until the vm_refcnt indicates that the
         * vma is attached with no readers.
         */
        locked = __vma_enter_locked(vma, false);

        /*
         * We should use WRITE_ONCE() here because we can have concurrent reads
         * from the early lockless pessimistic check in vma_start_read().
         * We don't really care about the correctness of that early check, but
         * we should use WRITE_ONCE() for cleanliness and to keep KCSAN happy.
         */
        WRITE_ONCE(vma->vm_lock_seq, mm_lock_seq);

        if (locked) {
                bool detached;

                __vma_exit_locked(vma, &detached);
                WARN_ON_ONCE(detached); /* vma should remain attached */
        }
}
EXPORT_SYMBOL_GPL(__vma_start_write);

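/*
 * Detach the vma from its mm: drop the attach reference on vm_refcnt and, in
 * the rare case that temporary readers still hold references, wait for them
 * to drain before returning.
 */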
void vma_mark_detached(struct vm_area_struct *vma)
{
        vma_assert_write_locked(vma);
        vma_assert_attached(vma);

        /*
         * We are the only writer, so no need to use vma_refcount_put().
         * The condition below is unlikely because the vma has already been
         * write-locked and readers can increment vm_refcnt only temporarily
         * before they check vm_lock_seq, realize the vma is locked and drop
         * back the vm_refcnt. That is a narrow window for observing a raised
         * vm_refcnt.
         */
        if (unlikely(!refcount_dec_and_test(&vma->vm_refcnt))) {
                /* Wait until vma is detached with no readers. */
                if (__vma_enter_locked(vma, true)) {
                        bool detached;

                        __vma_exit_locked(vma, &detached);
                        WARN_ON_ONCE(!detached);
                }
        }
}

/*
 * Lookup and lock a VMA under RCU protection. The returned VMA is guaranteed
 * to be stable and not isolated. If the VMA is not found or is being
 * modified, the function returns NULL.
 */
struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm,
                                          unsigned long address)
{
        MA_STATE(mas, &mm->mm_mt, address, address);
        struct vm_area_struct *vma;

        rcu_read_lock();
retry:
        vma = mas_walk(&mas);
        if (!vma)
                goto inval;

        vma = vma_start_read(mm, vma);
        if (IS_ERR_OR_NULL(vma)) {
                /* Check if the VMA got isolated after we found it */
                if (PTR_ERR(vma) == -EAGAIN) {
                        count_vm_vma_lock_event(VMA_LOCK_MISS);
                        /* The area was replaced with another one */
                        goto retry;
                }

                /* Failed to lock the VMA */
                goto inval;
        }
        /*
         * At this point, we have a stable reference to a VMA: The VMA is
         * locked and we know it hasn't already been isolated.
         * From here on, we can access the VMA without worrying about which
         * fields are accessible for RCU readers.
         */

        /* Check if the vma we locked is the right one. */
        if (unlikely(vma->vm_mm != mm ||
                     address < vma->vm_start || address >= vma->vm_end))
                goto inval_end_read;

        rcu_read_unlock();
        return vma;

inval_end_read:
        vma_end_read(vma);
inval:
        rcu_read_unlock();
        count_vm_vma_lock_event(VMA_LOCK_ABORT);
        return NULL;
}
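
/*
 * Illustrative caller pattern (a sketch of how architecture page fault
 * handlers typically use this; not lifted from any particular arch):
 *
 *     vma = lock_vma_under_rcu(mm, address);
 *     if (!vma)
 *             return;         // fall back to the mmap_read_lock() path
 *     fault = handle_mm_fault(vma, address, flags | FAULT_FLAG_VMA_LOCK, regs);
 *     if (!(fault & (VM_FAULT_RETRY | VM_FAULT_COMPLETED)))
 *             vma_end_read(vma);
 */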
#endif /* CONFIG_PER_VMA_LOCK */

#ifdef CONFIG_LOCK_MM_AND_FIND_VMA
#include <linux/extable.h>

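/*
 * Take the mmap read lock without risking a deadlock: trylock first, and for
 * kernel-mode faults only sleep on the lock if the faulting instruction has
 * an exception table fixup (i.e. the fault was expected). Returns true with
 * the lock held, false without it.
 */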
static inline bool get_mmap_lock_carefully(struct mm_struct *mm, struct pt_regs *regs)
{
        if (likely(mmap_read_trylock(mm)))
                return true;

        if (regs && !user_mode(regs)) {
                unsigned long ip = exception_ip(regs);
                if (!search_exception_tables(ip))
                        return false;
        }

        return !mmap_read_lock_killable(mm);
}

static inline bool mmap_upgrade_trylock(struct mm_struct *mm)
{
        /*
         * We don't have this operation yet.
         *
         * It should be easy enough to do: it's basically an
         * atomic_long_try_cmpxchg_acquire()
         * from RWSEM_READER_BIAS -> RWSEM_WRITER_LOCKED, but
         * it also needs the proper lockdep magic etc.
         */
        return false;
}
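
/*
 * A rough sketch of what such an upgrade could look like if the rwsem
 * internals were visible here (RWSEM_READER_BIAS and RWSEM_WRITER_LOCKED are
 * private to kernel/locking/rwsem.c, and the lockdep bookkeeping is omitted);
 * purely for illustration:
 *
 *     long old = RWSEM_READER_BIAS;     // we are the sole reader
 *
 *     return atomic_long_try_cmpxchg_acquire(&mm->mmap_lock.count, &old,
 *                                            RWSEM_WRITER_LOCKED);
 */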

static inline bool upgrade_mmap_lock_carefully(struct mm_struct *mm, struct pt_regs *regs)
{
        mmap_read_unlock(mm);
        if (regs && !user_mode(regs)) {
                unsigned long ip = exception_ip(regs);
                if (!search_exception_tables(ip))
                        return false;
        }
        return !mmap_write_lock_killable(mm);
}

/*
 * Helper for page fault handling.
 *
 * This is kind of equivalent to "mmap_read_lock()" followed
 * by "find_extend_vma()", except it's a lot more careful about
 * the locking (and will drop the lock on failure).
 *
 * For example, if we have a kernel bug that causes a page
 * fault, we don't want to just use mmap_read_lock() to get
 * the mm lock, because that would deadlock if the bug were
 * to happen while we're holding the mm lock for writing.
 *
 * So this checks the exception tables on kernel faults in
 * order to only block on the mm lock for instructions that
 * are actually expected to fault.
 *
 * We can also actually take the mm lock for writing if we
 * need to extend the vma, which helps the VM layer a lot.
 */
struct vm_area_struct *lock_mm_and_find_vma(struct mm_struct *mm,
                                            unsigned long addr, struct pt_regs *regs)
{
        struct vm_area_struct *vma;

        if (!get_mmap_lock_carefully(mm, regs))
                return NULL;

        vma = find_vma(mm, addr);
        if (likely(vma && (vma->vm_start <= addr)))
                return vma;

        /*
         * Well, dang. We might still be successful, but only
         * if we can extend a vma to do so.
         */
        if (!vma || !(vma->vm_flags & VM_GROWSDOWN)) {
                mmap_read_unlock(mm);
                return NULL;
        }

        /*
         * We can try to upgrade the mmap lock atomically,
         * in which case we can continue to use the vma
         * we already looked up.
         *
         * Otherwise we'll have to drop the mmap lock and
         * re-take it, and also look up the vma again,
         * re-checking it.
         */
        if (!mmap_upgrade_trylock(mm)) {
                if (!upgrade_mmap_lock_carefully(mm, regs))
                        return NULL;

                vma = find_vma(mm, addr);
                if (!vma)
                        goto fail;
                if (vma->vm_start <= addr)
                        goto success;
                if (!(vma->vm_flags & VM_GROWSDOWN))
                        goto fail;
        }

        if (expand_stack_locked(vma, addr))
                goto fail;

success:
        mmap_write_downgrade(mm);
        return vma;

fail:
        mmap_write_unlock(mm);
        return NULL;
}
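
/*
 * Illustrative call site in an architecture page fault handler (a sketch,
 * not lifted from any particular arch). On success the mmap lock is held
 * for reading, even when the helper had to upgrade and expand the stack:
 *
 *     vma = lock_mm_and_find_vma(mm, address, regs);
 *     if (!vma)
 *             return;                  // lock already dropped
 *     fault = handle_mm_fault(vma, address, flags, regs);
 *     ...
 *     mmap_read_unlock(mm);
 */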
#endif /* CONFIG_LOCK_MM_AND_FIND_VMA */

#else /* CONFIG_MMU */

/*
 * At least xtensa ends up having protection faults even with no
 * MMU. No stack expansion, at least.
 */
struct vm_area_struct *lock_mm_and_find_vma(struct mm_struct *mm,
                                            unsigned long addr, struct pt_regs *regs)
{
        struct vm_area_struct *vma;

        mmap_read_lock(mm);
        vma = vma_lookup(mm, addr);
        if (!vma)
                mmap_read_unlock(mm);
        return vma;
}

#endif /* CONFIG_MMU */
