// SPDX-License-Identifier: GPL-2.0
#define CREATE_TRACE_POINTS
#include <trace/events/mmap_lock.h>

#include <linux/mm.h>
#include <linux/cgroup.h>
#include <linux/memcontrol.h>
#include <linux/mmap_lock.h>
#include <linux/mutex.h>
#include <linux/percpu.h>
#include <linux/rcupdate.h>
#include <linux/smp.h>
#include <linux/trace_events.h>
#include <linux/local_lock.h>

EXPORT_TRACEPOINT_SYMBOL(mmap_lock_start_locking);
EXPORT_TRACEPOINT_SYMBOL(mmap_lock_acquire_returned);
EXPORT_TRACEPOINT_SYMBOL(mmap_lock_released);

#ifdef CONFIG_TRACING
/*
 * Trace calls must be in a separate file, as otherwise there's a circular
 * dependency between linux/mmap_lock.h and trace/events/mmap_lock.h.
 */

void __mmap_lock_do_trace_start_locking(struct mm_struct *mm, bool write)
{
	trace_mmap_lock_start_locking(mm, write);
}
EXPORT_SYMBOL(__mmap_lock_do_trace_start_locking);

void __mmap_lock_do_trace_acquire_returned(struct mm_struct *mm, bool write,
					   bool success)
{
	trace_mmap_lock_acquire_returned(mm, write, success);
}
EXPORT_SYMBOL(__mmap_lock_do_trace_acquire_returned);

void __mmap_lock_do_trace_released(struct mm_struct *mm, bool write)
{
	trace_mmap_lock_released(mm, write);
}
EXPORT_SYMBOL(__mmap_lock_do_trace_released);
#endif /* CONFIG_TRACING */

#ifdef CONFIG_MMU
#ifdef CONFIG_PER_VMA_LOCK
/*
 * __vma_enter_locked() returns 0 immediately if the vma is not attached,
 * otherwise it waits for any current readers to finish and returns 1.
 * Returns -EINTR if a signal is received while waiting.
 */
static inline int __vma_enter_locked(struct vm_area_struct *vma,
				     bool detaching, int state)
{
	int err;
	unsigned int tgt_refcnt = VMA_LOCK_OFFSET;

	mmap_assert_write_locked(vma->vm_mm);

	/* Additional refcnt if the vma is attached. */
	if (!detaching)
		tgt_refcnt++;

	/*
	 * If vma is detached then only vma_mark_attached() can raise the
	 * vm_refcnt. mmap_write_lock prevents racing with vma_mark_attached().
	 */
	if (!refcount_add_not_zero(VMA_LOCK_OFFSET, &vma->vm_refcnt))
		return 0;

	rwsem_acquire(&vma->vmlock_dep_map, 0, 0, _RET_IP_);
	err = rcuwait_wait_event(&vma->vm_mm->vma_writer_wait,
				 refcount_read(&vma->vm_refcnt) == tgt_refcnt,
				 state);
	if (err) {
		if (refcount_sub_and_test(VMA_LOCK_OFFSET, &vma->vm_refcnt)) {
			/*
			 * The wait failed, but the last reader went away
			 * as well. Tell the caller the VMA is detached.
			 */
			WARN_ON_ONCE(!detaching);
			err = 0;
		}
		rwsem_release(&vma->vmlock_dep_map, _RET_IP_);
		return err;
	}
	lock_acquired(&vma->vmlock_dep_map, _RET_IP_);

	return 1;
}

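/*
 * Drop the VMA_LOCK_OFFSET reference taken by __vma_enter_locked() and report
 * through @detached whether that was the last reference, i.e. whether the vma
 * is now detached.
 */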
static inline void __vma_exit_locked(struct vm_area_struct *vma, bool *detached)
{
	*detached = refcount_sub_and_test(VMA_LOCK_OFFSET, &vma->vm_refcnt);
	rwsem_release(&vma->vmlock_dep_map, _RET_IP_);
}

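/*
 * Write-lock the vma: wait for any existing readers to drop their references,
 * then publish the current mm_lock_seq so readers can see the vma is locked.
 * Returns 0 on success, or -EINTR if the wait for readers was interrupted by
 * a signal.
 */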
int __vma_start_write(struct vm_area_struct *vma, unsigned int mm_lock_seq,
		      int state)
{
	int locked;

	locked = __vma_enter_locked(vma, false, state);
	if (locked < 0)
		return locked;

	/*
	 * We should use WRITE_ONCE() here because we can have concurrent reads
	 * from the early lockless pessimistic check in vma_start_read().
	 * We don't really care about the correctness of that early check, but
	 * we should use WRITE_ONCE() for cleanliness and to keep KCSAN happy.
	 */
	WRITE_ONCE(vma->vm_lock_seq, mm_lock_seq);

	if (locked) {
		bool detached;

		__vma_exit_locked(vma, &detached);
		WARN_ON_ONCE(detached); /* vma should remain attached */
	}

	return 0;
}
EXPORT_SYMBOL_GPL(__vma_start_write);

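/*
 * Mark a write-locked, attached vma as detached by dropping its attach
 * reference. If readers still hold temporary references, wait uninterruptibly
 * until the last of them is gone.
 */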
void vma_mark_detached(struct vm_area_struct *vma)
{
	vma_assert_write_locked(vma);
	vma_assert_attached(vma);

	/*
	 * We are the only writer, so no need to use vma_refcount_put().
	 * The condition below is unlikely because the vma has been already
	 * write-locked and readers can increment vm_refcnt only temporarily
	 * before they check vm_lock_seq, realize the vma is locked and drop
	 * back the vm_refcnt. That is a narrow window for observing a raised
	 * vm_refcnt.
	 */
	if (unlikely(!refcount_dec_and_test(&vma->vm_refcnt))) {
		/* Wait until vma is detached with no readers. */
		if (__vma_enter_locked(vma, true, TASK_UNINTERRUPTIBLE)) {
			bool detached;

			__vma_exit_locked(vma, &detached);
			WARN_ON_ONCE(!detached);
		}
	}
}

/*
 * Try to read-lock a vma. The function is allowed to occasionally yield a
 * false locked result to avoid performance overhead, in which case we fall
 * back to using mmap_lock. The function should never yield a false unlocked
 * result. A false locked result is possible if mm_lock_seq overflows or if
 * the vma gets reused and attached to a different mm before we lock it.
 * Returns the vma on success, NULL on failure to lock, and ERR_PTR(-EAGAIN)
 * if the vma got detached.
 *
 * IMPORTANT: RCU lock must be held upon entering the function, but upon error
 * IT IS RELEASED. The caller must handle this correctly.
 */
static inline struct vm_area_struct *vma_start_read(struct mm_struct *mm,
						    struct vm_area_struct *vma)
{
	struct mm_struct *other_mm;
	int oldcnt;

	RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "no rcu lock held");
	/*
	 * Check before locking. A race might cause false locked result.
	 * We can use READ_ONCE() for the mm_lock_seq here, and don't need
	 * ACQUIRE semantics, because this is just a lockless check whose result
	 * we don't rely on for anything - the mm_lock_seq read against which we
	 * need ordering is below.
	 */
	if (READ_ONCE(vma->vm_lock_seq) == READ_ONCE(mm->mm_lock_seq.sequence)) {
		vma = NULL;
		goto err;
	}

	/*
	 * If VMA_LOCK_OFFSET is set, __refcount_inc_not_zero_limited_acquire()
	 * will fail because VMA_REF_LIMIT is less than VMA_LOCK_OFFSET.
	 * Acquire fence is required here to avoid reordering against later
	 * vm_lock_seq check and checks inside lock_vma_under_rcu().
	 */
	if (unlikely(!__refcount_inc_not_zero_limited_acquire(&vma->vm_refcnt, &oldcnt,
							      VMA_REF_LIMIT))) {
		/* return EAGAIN if vma got detached from under us */
		vma = oldcnt ? NULL : ERR_PTR(-EAGAIN);
		goto err;
	}

	rwsem_acquire_read(&vma->vmlock_dep_map, 0, 1, _RET_IP_);

	if (unlikely(vma->vm_mm != mm))
		goto err_unstable;

	/*
	 * Overflow of vm_lock_seq/mm_lock_seq might produce false locked result.
	 * False unlocked result is impossible because we modify and check
	 * vma->vm_lock_seq under vma->vm_refcnt protection and mm->mm_lock_seq
	 * modification invalidates all existing locks.
	 *
	 * We must use ACQUIRE semantics for the mm_lock_seq so that if we are
	 * racing with vma_end_write_all(), we only start reading from the VMA
	 * after it has been unlocked.
	 * This pairs with RELEASE semantics in vma_end_write_all().
	 */
	if (unlikely(vma->vm_lock_seq == raw_read_seqcount(&mm->mm_lock_seq))) {
		vma_refcount_put(vma);
		vma = NULL;
		goto err;
	}

	return vma;
err:
	rcu_read_unlock();

	return vma;
err_unstable:
	/*
	 * If vma got attached to another mm from under us, that mm is not
	 * stable and can be freed in the narrow window after vma->vm_refcnt
	 * is dropped and before rcuwait_wake_up(mm) is called. Grab it before
	 * releasing vma->vm_refcnt.
	 */
	other_mm = vma->vm_mm; /* use a copy as vma can be freed after we drop vm_refcnt */

	/* __mmdrop() is a heavy operation, do it after dropping RCU lock. */
	rcu_read_unlock();
	mmgrab(other_mm);
	vma_refcount_put(vma);
	mmdrop(other_mm);

	return NULL;
}

/*
 * Lookup and lock a VMA under RCU protection. The returned VMA is guaranteed
 * to be stable and not isolated. If the VMA is not found or is being modified,
 * the function returns NULL.
 */
struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm,
					  unsigned long address)
{
	MA_STATE(mas, &mm->mm_mt, address, address);
	struct vm_area_struct *vma;

retry:
	rcu_read_lock();
	vma = mas_walk(&mas);
	if (!vma) {
		rcu_read_unlock();
		goto inval;
	}

	vma = vma_start_read(mm, vma);
	if (IS_ERR_OR_NULL(vma)) {
		/* Check if the VMA got isolated after we found it */
		if (PTR_ERR(vma) == -EAGAIN) {
			count_vm_vma_lock_event(VMA_LOCK_MISS);
			/* The area was replaced with another one */
			mas_set(&mas, address);
			goto retry;
		}

		/* Failed to lock the VMA */
		goto inval;
	}
	/*
	 * At this point, we have a stable reference to a VMA: The VMA is
	 * locked and we know it hasn't already been isolated.
	 * From here on, we can access the VMA without worrying about which
	 * fields are accessible for RCU readers.
	 */
	rcu_read_unlock();

	/* Check if the vma we locked is the right one. */
	if (unlikely(address < vma->vm_start || address >= vma->vm_end)) {
		vma_end_read(vma);
		goto inval;
	}

	return vma;

inval:
	count_vm_vma_lock_event(VMA_LOCK_ABORT);
	return NULL;
}

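/*
 * Fallback path for lock_next_vma(): re-walk the tree from @from_addr under
 * mmap_read_lock and read-lock the vma found there. Returns the locked vma,
 * NULL if there is none, or an ERR_PTR() on failure.
 */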
static struct vm_area_struct *lock_next_vma_under_mmap_lock(struct mm_struct *mm,
							     struct vma_iterator *vmi,
							     unsigned long from_addr)
{
	struct vm_area_struct *vma;
	int ret;

	ret = mmap_read_lock_killable(mm);
	if (ret)
		return ERR_PTR(ret);

	/* Lookup the vma at the last position again under mmap_read_lock */
	vma_iter_set(vmi, from_addr);
	vma = vma_next(vmi);
	if (vma) {
		/* Very unlikely vma->vm_refcnt overflow case */
		if (unlikely(!vma_start_read_locked(vma)))
			vma = ERR_PTR(-EAGAIN);
	}

	mmap_read_unlock(mm);

	return vma;
}

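/*
 * Find and read-lock the next vma at or after @from_addr. Called with the RCU
 * read lock held; the lock may be dropped and re-acquired internally when
 * falling back to mmap_lock, but it is held again on return. Returns the
 * locked vma, NULL if there are no more vmas, or an ERR_PTR() on failure.
 */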
struct vm_area_struct *lock_next_vma(struct mm_struct *mm,
				     struct vma_iterator *vmi,
				     unsigned long from_addr)
{
	struct vm_area_struct *vma;
	unsigned int mm_wr_seq;
	bool mmap_unlocked;

	RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "no rcu read lock held");
retry:
	/* Start mmap_lock speculation in case we need to verify the vma later */
	mmap_unlocked = mmap_lock_speculate_try_begin(mm, &mm_wr_seq);
	vma = vma_next(vmi);
	if (!vma)
		return NULL;

	vma = vma_start_read(mm, vma);
	if (IS_ERR_OR_NULL(vma)) {
		/*
		 * Retry immediately if the vma gets detached from under us.
		 * Infinite loop should not happen because the vma we find will
		 * have to be constantly knocked out from under us.
		 */
		if (PTR_ERR(vma) == -EAGAIN) {
			/* reset to search from the last address */
			rcu_read_lock();
			vma_iter_set(vmi, from_addr);
			goto retry;
		}

		goto fallback;
	}

	/* Verify the vma is not behind the last search position. */
	if (unlikely(from_addr >= vma->vm_end))
		goto fallback_unlock;

	/*
	 * vma can be ahead of the last search position but we need to verify
	 * it was not shrunk after we found it and another vma has not been
	 * installed ahead of it. Otherwise we might observe a gap that should
	 * not be there.
	 */
	if (from_addr < vma->vm_start) {
		/* Verify only if the address space might have changed since vma lookup. */
		if (!mmap_unlocked || mmap_lock_speculate_retry(mm, mm_wr_seq)) {
			vma_iter_set(vmi, from_addr);
			if (vma != vma_next(vmi))
				goto fallback_unlock;
		}
	}

	return vma;

fallback_unlock:
	rcu_read_unlock();
	vma_end_read(vma);
fallback:
	vma = lock_next_vma_under_mmap_lock(mm, vmi, from_addr);
	rcu_read_lock();
	/* Reinitialize the iterator after re-entering rcu read section */
	vma_iter_set(vmi, IS_ERR_OR_NULL(vma) ? from_addr : vma->vm_end);

	return vma;
}
#endif /* CONFIG_PER_VMA_LOCK */

#ifdef CONFIG_LOCK_MM_AND_FIND_VMA
#include <linux/extable.h>

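/*
 * Take mmap_lock for reading on a page fault. For kernel-mode faults, block
 * on the lock only if the faulting instruction has an exception-table fixup;
 * otherwise a fault caused by a kernel bug while the lock is already held for
 * writing would deadlock here.
 */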
static inline bool get_mmap_lock_carefully(struct mm_struct *mm, struct pt_regs *regs)
{
	if (likely(mmap_read_trylock(mm)))
		return true;

	if (regs && !user_mode(regs)) {
		unsigned long ip = exception_ip(regs);
		if (!search_exception_tables(ip))
			return false;
	}

	return !mmap_read_lock_killable(mm);
}

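/* Try to upgrade a held mmap read lock to a write lock without dropping it. */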
static inline bool mmap_upgrade_trylock(struct mm_struct *mm)
{
	/*
	 * We don't have this operation yet.
	 *
	 * It should be easy enough to do: it's basically an
	 * atomic_long_try_cmpxchg_acquire()
	 * from RWSEM_READER_BIAS -> RWSEM_WRITER_LOCKED, but
	 * it also needs the proper lockdep magic etc.
	 */
	return false;
}

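/*
 * Drop the mmap read lock and take it for writing instead, applying the same
 * exception-table check as get_mmap_lock_carefully() for kernel-mode faults.
 */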
static inline bool upgrade_mmap_lock_carefully(struct mm_struct *mm, struct pt_regs *regs)
{
	mmap_read_unlock(mm);
	if (regs && !user_mode(regs)) {
		unsigned long ip = exception_ip(regs);
		if (!search_exception_tables(ip))
			return false;
	}
	return !mmap_write_lock_killable(mm);
}

/*
 * Helper for page fault handling.
 *
 * This is kind of equivalent to "mmap_read_lock()" followed
 * by "find_extend_vma()", except it's a lot more careful about
 * the locking (and will drop the lock on failure).
 *
 * For example, if we have a kernel bug that causes a page
 * fault, we don't want to just use mmap_read_lock() to get
 * the mm lock, because that would deadlock if the bug were
 * to happen while we're holding the mm lock for writing.
 *
 * So this checks the exception tables on kernel faults in
 * order to only do this all for instructions that are actually
 * expected to fault.
 *
 * We can also actually take the mm lock for writing if we
 * need to extend the vma, which helps the VM layer a lot.
 */
struct vm_area_struct *lock_mm_and_find_vma(struct mm_struct *mm,
			unsigned long addr, struct pt_regs *regs)
{
	struct vm_area_struct *vma;

	if (!get_mmap_lock_carefully(mm, regs))
		return NULL;

	vma = find_vma(mm, addr);
	if (likely(vma && (vma->vm_start <= addr)))
		return vma;

	/*
	 * Well, dang. We might still be successful, but only
	 * if we can extend a vma to do so.
	 */
	if (!vma || !(vma->vm_flags & VM_GROWSDOWN)) {
		mmap_read_unlock(mm);
		return NULL;
	}

	/*
	 * We can try to upgrade the mmap lock atomically,
	 * in which case we can continue to use the vma
	 * we already looked up.
	 *
	 * Otherwise we'll have to drop the mmap lock and
	 * re-take it, and also look up the vma again,
	 * re-checking it.
	 */
	if (!mmap_upgrade_trylock(mm)) {
		if (!upgrade_mmap_lock_carefully(mm, regs))
			return NULL;

		vma = find_vma(mm, addr);
		if (!vma)
			goto fail;
		if (vma->vm_start <= addr)
			goto success;
		if (!(vma->vm_flags & VM_GROWSDOWN))
			goto fail;
	}

	if (expand_stack_locked(vma, addr))
		goto fail;

success:
	mmap_write_downgrade(mm);
	return vma;

fail:
	mmap_write_unlock(mm);
	return NULL;
}
#endif /* CONFIG_LOCK_MM_AND_FIND_VMA */

#else /* CONFIG_MMU */

/*
 * At least xtensa ends up having protection faults even with no
 * MMU. No stack expansion, at least.
 */
struct vm_area_struct *lock_mm_and_find_vma(struct mm_struct *mm,
			unsigned long addr, struct pt_regs *regs)
{
	struct vm_area_struct *vma;

	mmap_read_lock(mm);
	vma = vma_lookup(mm, addr);
	if (!vma)
		mmap_read_unlock(mm);
	return vma;
}

#endif /* CONFIG_MMU */