xref: /linux/mm/mmap_lock.c (revision 7203ca412fc8e8a0588e9adc0f777d3163f8dff3)
// SPDX-License-Identifier: GPL-2.0
#define CREATE_TRACE_POINTS
#include <trace/events/mmap_lock.h>

#include <linux/mm.h>
#include <linux/cgroup.h>
#include <linux/memcontrol.h>
#include <linux/mmap_lock.h>
#include <linux/mutex.h>
#include <linux/percpu.h>
#include <linux/rcupdate.h>
#include <linux/smp.h>
#include <linux/trace_events.h>
#include <linux/local_lock.h>

EXPORT_TRACEPOINT_SYMBOL(mmap_lock_start_locking);
EXPORT_TRACEPOINT_SYMBOL(mmap_lock_acquire_returned);
EXPORT_TRACEPOINT_SYMBOL(mmap_lock_released);

#ifdef CONFIG_TRACING
/*
 * Trace calls must be in a separate file, as otherwise there's a circular
 * dependency between linux/mmap_lock.h and trace/events/mmap_lock.h.
 */

void __mmap_lock_do_trace_start_locking(struct mm_struct *mm, bool write)
{
	trace_mmap_lock_start_locking(mm, write);
}
EXPORT_SYMBOL(__mmap_lock_do_trace_start_locking);

void __mmap_lock_do_trace_acquire_returned(struct mm_struct *mm, bool write,
					   bool success)
{
	trace_mmap_lock_acquire_returned(mm, write, success);
}
EXPORT_SYMBOL(__mmap_lock_do_trace_acquire_returned);

void __mmap_lock_do_trace_released(struct mm_struct *mm, bool write)
{
	trace_mmap_lock_released(mm, write);
}
EXPORT_SYMBOL(__mmap_lock_do_trace_released);
#endif /* CONFIG_TRACING */
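
/*
 * For reference, the inline helpers in include/linux/mmap_lock.h are expected
 * to call into the wrappers above only when the corresponding tracepoint is
 * enabled, roughly along these lines (a sketch, not the verbatim header code):
 *
 *	static inline void __mmap_lock_trace_start_locking(struct mm_struct *mm,
 *							    bool write)
 *	{
 *		if (tracepoint_enabled(mmap_lock_start_locking))
 *			__mmap_lock_do_trace_start_locking(mm, write);
 *	}
 *
 * Keeping the trace_*() calls in this file avoids pulling
 * trace/events/mmap_lock.h into linux/mmap_lock.h, which is the circular
 * dependency the comment above refers to.
 */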

#ifdef CONFIG_MMU
#ifdef CONFIG_PER_VMA_LOCK
/*
 * __vma_enter_locked() returns 0 immediately if the vma is not
 * attached, otherwise it waits for any current readers to finish and
 * returns 1.  Returns -EINTR if a signal is received while waiting.
 */
static inline int __vma_enter_locked(struct vm_area_struct *vma,
		bool detaching, int state)
{
	int err;
	unsigned int tgt_refcnt = VMA_LOCK_OFFSET;

	mmap_assert_write_locked(vma->vm_mm);

	/* Additional refcnt if the vma is attached. */
	if (!detaching)
		tgt_refcnt++;

	/*
	 * If vma is detached then only vma_mark_attached() can raise the
	 * vm_refcnt. mmap_write_lock prevents racing with vma_mark_attached().
	 */
	if (!refcount_add_not_zero(VMA_LOCK_OFFSET, &vma->vm_refcnt))
		return 0;

	rwsem_acquire(&vma->vmlock_dep_map, 0, 0, _RET_IP_);
	err = rcuwait_wait_event(&vma->vm_mm->vma_writer_wait,
		   refcount_read(&vma->vm_refcnt) == tgt_refcnt,
		   state);
	if (err) {
		if (refcount_sub_and_test(VMA_LOCK_OFFSET, &vma->vm_refcnt)) {
			/*
			 * The wait failed, but the last reader went away
			 * as well.  Tell the caller the VMA is detached.
			 */
			WARN_ON_ONCE(!detaching);
			err = 0;
		}
		rwsem_release(&vma->vmlock_dep_map, _RET_IP_);
		return err;
	}
	lock_acquired(&vma->vmlock_dep_map, _RET_IP_);

	return 1;
}

static inline void __vma_exit_locked(struct vm_area_struct *vma, bool *detached)
{
	*detached = refcount_sub_and_test(VMA_LOCK_OFFSET, &vma->vm_refcnt);
	rwsem_release(&vma->vmlock_dep_map, _RET_IP_);
}
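
/*
 * A rough picture of the vm_refcnt arithmetic used above and in
 * vma_start_read() below: an attached, unlocked vma holds one reference,
 * each active reader adds one more, and a write-locking thread adds
 * VMA_LOCK_OFFSET. Because VMA_LOCK_OFFSET exceeds VMA_REF_LIMIT, new
 * readers fail their limited increment while a writer is pending, and the
 * writer only has to wait for the count to drop back to VMA_LOCK_OFFSET
 * (when detaching) or VMA_LOCK_OFFSET + 1 (when keeping the vma attached),
 * i.e. for the existing readers to go away.
 */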

int __vma_start_write(struct vm_area_struct *vma, unsigned int mm_lock_seq,
		int state)
{
	int locked;

	locked = __vma_enter_locked(vma, false, state);
	if (locked < 0)
		return locked;

	/*
	 * We should use WRITE_ONCE() here because we can have concurrent reads
	 * from the early lockless pessimistic check in vma_start_read().
	 * We don't really care about the correctness of that early check, but
	 * we should use WRITE_ONCE() for cleanliness and to keep KCSAN happy.
	 */
	WRITE_ONCE(vma->vm_lock_seq, mm_lock_seq);

	if (locked) {
		bool detached;

		__vma_exit_locked(vma, &detached);
		WARN_ON_ONCE(detached); /* vma should remain attached */
	}

	return 0;
}
EXPORT_SYMBOL_GPL(__vma_start_write);
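
/*
 * Callers normally reach __vma_start_write() through the vma_start_write()
 * wrapper in the mm headers, which skips the call when the vma is already
 * write-locked under the current mmap_write_lock. A sketch of the caller
 * side (the exact wrapper may differ):
 *
 *	mmap_write_lock(mm);
 *	...
 *	vma_start_write(vma);	// waits for readers, blocks new ones
 *	... modify the vma ...
 *	mmap_write_unlock(mm);	// vma_end_write_all() drops all vma write locks
 */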

void vma_mark_detached(struct vm_area_struct *vma)
{
	vma_assert_write_locked(vma);
	vma_assert_attached(vma);

	/*
	 * We are the only writer, so no need to use vma_refcount_put().
	 * The condition below is unlikely because the vma has already been
	 * write-locked and readers can increment vm_refcnt only temporarily
	 * before they check vm_lock_seq, realize the vma is locked and drop
	 * the vm_refcnt back. That is a narrow window for observing a raised
	 * vm_refcnt.
	 */
	if (unlikely(!refcount_dec_and_test(&vma->vm_refcnt))) {
		/* Wait until vma is detached with no readers. */
		if (__vma_enter_locked(vma, true, TASK_UNINTERRUPTIBLE)) {
			bool detached;

			__vma_exit_locked(vma, &detached);
			WARN_ON_ONCE(!detached);
		}
	}
}

/*
 * Try to read-lock a vma. The function is allowed to occasionally yield false
 * locked result to avoid performance overhead, in which case we fall back to
 * using mmap_lock. The function should never yield false unlocked result.
 * False locked result is possible if mm_lock_seq overflows or if vma gets
 * reused and attached to a different mm before we lock it.
 * Returns the vma on success, NULL on failure to lock, and ERR_PTR(-EAGAIN)
 * if the vma got detached.
 *
 * IMPORTANT: RCU lock must be held upon entering the function, but upon error
 *            IT IS RELEASED. The caller must handle this correctly.
 */
static inline struct vm_area_struct *vma_start_read(struct mm_struct *mm,
						    struct vm_area_struct *vma)
{
	struct mm_struct *other_mm;
	int oldcnt;

	RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "no rcu lock held");
	/*
	 * Check before locking. A race might cause false locked result.
	 * We can use READ_ONCE() for the mm_lock_seq here, and don't need
	 * ACQUIRE semantics, because this is just a lockless check whose result
	 * we don't rely on for anything - the mm_lock_seq read against which we
	 * need ordering is below.
	 */
	if (READ_ONCE(vma->vm_lock_seq) == READ_ONCE(mm->mm_lock_seq.sequence)) {
		vma = NULL;
		goto err;
	}

	/*
	 * If VMA_LOCK_OFFSET is set, __refcount_inc_not_zero_limited_acquire()
	 * will fail because VMA_REF_LIMIT is less than VMA_LOCK_OFFSET.
	 * Acquire fence is required here to avoid reordering against later
	 * vm_lock_seq check and checks inside lock_vma_under_rcu().
	 */
	if (unlikely(!__refcount_inc_not_zero_limited_acquire(&vma->vm_refcnt, &oldcnt,
							      VMA_REF_LIMIT))) {
		/* return EAGAIN if vma got detached from under us */
		vma = oldcnt ? NULL : ERR_PTR(-EAGAIN);
		goto err;
	}

	rwsem_acquire_read(&vma->vmlock_dep_map, 0, 1, _RET_IP_);

	if (unlikely(vma->vm_mm != mm))
		goto err_unstable;

	/*
	 * Overflow of vm_lock_seq/mm_lock_seq might produce false locked result.
	 * False unlocked result is impossible because we modify and check
	 * vma->vm_lock_seq under vma->vm_refcnt protection and mm->mm_lock_seq
	 * modification invalidates all existing locks.
	 *
	 * We must use ACQUIRE semantics for the mm_lock_seq so that if we are
	 * racing with vma_end_write_all(), we only start reading from the VMA
	 * after it has been unlocked.
	 * This pairs with RELEASE semantics in vma_end_write_all().
	 */
	if (unlikely(vma->vm_lock_seq == raw_read_seqcount(&mm->mm_lock_seq))) {
		vma_refcount_put(vma);
		vma = NULL;
		goto err;
	}

	return vma;
err:
	rcu_read_unlock();

	return vma;
err_unstable:
	/*
	 * If vma got attached to another mm from under us, that mm is not
	 * stable and can be freed in the narrow window after vma->vm_refcnt
	 * is dropped and before rcuwait_wake_up(mm) is called. Grab it before
	 * releasing vma->vm_refcnt.
	 */
	other_mm = vma->vm_mm; /* use a copy as vma can be freed after we drop vm_refcnt */

	/* __mmdrop() is a heavy operation, do it after dropping RCU lock. */
	rcu_read_unlock();
	mmgrab(other_mm);
	vma_refcount_put(vma);
	mmdrop(other_mm);

	return NULL;
}
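
/*
 * Caller-side pattern for vma_start_read() (see lock_vma_under_rcu() and
 * lock_next_vma() below): the three possible outcomes have to be told apart,
 * and the RCU read lock is already dropped on the error paths:
 *
 *	vma = vma_start_read(mm, vma);
 *	if (IS_ERR_OR_NULL(vma)) {
 *		if (PTR_ERR(vma) == -EAGAIN)
 *			// vma got detached, retry the lookup
 *		else
 *			// could not lock, fall back to mmap_lock
 *	}
 */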

/*
 * Lookup and lock a VMA under RCU protection. Returned VMA is guaranteed to be
 * stable and not isolated. If the VMA is not found or is being modified the
 * function returns NULL.
 */
struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm,
					  unsigned long address)
{
	MA_STATE(mas, &mm->mm_mt, address, address);
	struct vm_area_struct *vma;

retry:
	rcu_read_lock();
	vma = mas_walk(&mas);
	if (!vma) {
		rcu_read_unlock();
		goto inval;
	}

	vma = vma_start_read(mm, vma);
	if (IS_ERR_OR_NULL(vma)) {
		/* Check if the VMA got isolated after we found it */
		if (PTR_ERR(vma) == -EAGAIN) {
			count_vm_vma_lock_event(VMA_LOCK_MISS);
			/* The area was replaced with another one */
			mas_set(&mas, address);
			goto retry;
		}

		/* Failed to lock the VMA */
		goto inval;
	}
	/*
	 * At this point, we have a stable reference to a VMA: The VMA is
	 * locked and we know it hasn't already been isolated.
	 * From here on, we can access the VMA without worrying about which
	 * fields are accessible for RCU readers.
	 */
	rcu_read_unlock();

	/* Check if the vma we locked is the right one. */
	if (unlikely(address < vma->vm_start || address >= vma->vm_end)) {
		vma_end_read(vma);
		goto inval;
	}

	return vma;

inval:
	count_vm_vma_lock_event(VMA_LOCK_ABORT);
	return NULL;
}
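
/*
 * Typical page fault usage (a sketch; see the per-arch fault handlers for the
 * real thing): try the per-VMA lock first and fall back to mmap_lock only
 * when it cannot be taken:
 *
 *	vma = lock_vma_under_rcu(mm, address);
 *	if (!vma)
 *		goto lock_mmap;		// mmap_read_lock() fallback path
 *	fault = handle_mm_fault(vma, address, flags | FAULT_FLAG_VMA_LOCK, regs);
 *	if (!(fault & (VM_FAULT_RETRY | VM_FAULT_COMPLETED)))
 *		vma_end_read(vma);
 */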

static struct vm_area_struct *lock_next_vma_under_mmap_lock(struct mm_struct *mm,
							    struct vma_iterator *vmi,
							    unsigned long from_addr)
{
	struct vm_area_struct *vma;
	int ret;

	ret = mmap_read_lock_killable(mm);
	if (ret)
		return ERR_PTR(ret);

	/* Lookup the vma at the last position again under mmap_read_lock */
	vma_iter_set(vmi, from_addr);
	vma = vma_next(vmi);
	if (vma) {
		/* Very unlikely vma->vm_refcnt overflow case */
		if (unlikely(!vma_start_read_locked(vma)))
			vma = ERR_PTR(-EAGAIN);
	}

	mmap_read_unlock(mm);

	return vma;
}

struct vm_area_struct *lock_next_vma(struct mm_struct *mm,
				     struct vma_iterator *vmi,
				     unsigned long from_addr)
{
	struct vm_area_struct *vma;
	unsigned int mm_wr_seq;
	bool mmap_unlocked;

	RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "no rcu read lock held");
retry:
	/* Start mmap_lock speculation in case we need to verify the vma later */
	mmap_unlocked = mmap_lock_speculate_try_begin(mm, &mm_wr_seq);
	vma = vma_next(vmi);
	if (!vma)
		return NULL;

	vma = vma_start_read(mm, vma);
	if (IS_ERR_OR_NULL(vma)) {
		/*
		 * Retry immediately if the vma gets detached from under us.
		 * Infinite loop should not happen because the vma we find will
		 * have to be constantly knocked out from under us.
		 */
		if (PTR_ERR(vma) == -EAGAIN) {
			/* reset to search from the last address */
			rcu_read_lock();
			vma_iter_set(vmi, from_addr);
			goto retry;
		}

		goto fallback;
	}

	/* Verify the vma is not behind the last search position. */
	if (unlikely(from_addr >= vma->vm_end))
		goto fallback_unlock;

	/*
	 * vma can be ahead of the last search position but we need to verify
	 * it was not shrunk after we found it and another vma has not been
	 * installed ahead of it. Otherwise we might observe a gap that should
	 * not be there.
	 */
	if (from_addr < vma->vm_start) {
		/* Verify only if the address space might have changed since vma lookup. */
		if (!mmap_unlocked || mmap_lock_speculate_retry(mm, mm_wr_seq)) {
			vma_iter_set(vmi, from_addr);
			if (vma != vma_next(vmi))
				goto fallback_unlock;
		}
	}

	return vma;

fallback_unlock:
	rcu_read_unlock();
	vma_end_read(vma);
fallback:
	vma = lock_next_vma_under_mmap_lock(mm, vmi, from_addr);
	rcu_read_lock();
	/* Reinitialize the iterator after re-entering rcu read section */
	vma_iter_set(vmi, IS_ERR_OR_NULL(vma) ? from_addr : vma->vm_end);

	return vma;
}
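
/*
 * lock_next_vma() is meant for iterating over an address space under the RCU
 * read lock (as the /proc/<pid>/maps readers do), locking one vma at a time.
 * A sketch of such a loop, assuming @vmi was set up with vma_iter_init():
 *
 *	rcu_read_lock();
 *	while ((vma = lock_next_vma(mm, &vmi, addr)) != NULL) {
 *		if (IS_ERR(vma))
 *			break;		// e.g. killed while taking mmap_lock
 *		addr = vma->vm_end;
 *		... report the vma ...
 *		vma_end_read(vma);
 *	}
 *	rcu_read_unlock();
 */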
#endif /* CONFIG_PER_VMA_LOCK */

#ifdef CONFIG_LOCK_MM_AND_FIND_VMA
#include <linux/extable.h>

static inline bool get_mmap_lock_carefully(struct mm_struct *mm, struct pt_regs *regs)
{
	if (likely(mmap_read_trylock(mm)))
		return true;

	if (regs && !user_mode(regs)) {
		unsigned long ip = exception_ip(regs);
		if (!search_exception_tables(ip))
			return false;
	}

	return !mmap_read_lock_killable(mm);
}

static inline bool mmap_upgrade_trylock(struct mm_struct *mm)
{
	/*
	 * We don't have this operation yet.
	 *
	 * It should be easy enough to do: it's basically a
	 *    atomic_long_try_cmpxchg_acquire()
	 * from RWSEM_READER_BIAS -> RWSEM_WRITER_LOCKED, but
	 * it also needs the proper lockdep magic etc.
	 */
	return false;
}
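
/*
 * For illustration only: the missing upgrade would look roughly like the
 * cmpxchg below, but RWSEM_READER_BIAS and RWSEM_WRITER_LOCKED are private
 * to kernel/locking/rwsem.c and the lockdep state would also need to be
 * transitioned, so this is a sketch rather than something that can be done
 * from here today:
 *
 *	long cnt = RWSEM_READER_BIAS;	// we are the sole reader
 *
 *	return atomic_long_try_cmpxchg_acquire(&mm->mmap_lock.count, &cnt,
 *					       RWSEM_WRITER_LOCKED);
 */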

static inline bool upgrade_mmap_lock_carefully(struct mm_struct *mm, struct pt_regs *regs)
{
	mmap_read_unlock(mm);
	if (regs && !user_mode(regs)) {
		unsigned long ip = exception_ip(regs);
		if (!search_exception_tables(ip))
			return false;
	}
	return !mmap_write_lock_killable(mm);
}

/*
 * Helper for page fault handling.
 *
 * This is kind of equivalent to "mmap_read_lock()" followed
 * by "find_extend_vma()", except it's a lot more careful about
 * the locking (and will drop the lock on failure).
 *
 * For example, if we have a kernel bug that causes a page
 * fault, we don't want to just use mmap_read_lock() to get
 * the mm lock, because that would deadlock if the bug were
 * to happen while we're holding the mm lock for writing.
 *
 * So this checks the exception tables on kernel faults in
 * order to only do this all for instructions that are actually
 * expected to fault.
 *
 * We can also actually take the mm lock for writing if we
 * need to extend the vma, which helps the VM layer a lot.
 */
struct vm_area_struct *lock_mm_and_find_vma(struct mm_struct *mm,
			unsigned long addr, struct pt_regs *regs)
{
	struct vm_area_struct *vma;

	if (!get_mmap_lock_carefully(mm, regs))
		return NULL;

	vma = find_vma(mm, addr);
	if (likely(vma && (vma->vm_start <= addr)))
		return vma;

	/*
	 * Well, dang. We might still be successful, but only
	 * if we can extend a vma to do so.
	 */
	if (!vma || !(vma->vm_flags & VM_GROWSDOWN)) {
		mmap_read_unlock(mm);
		return NULL;
	}

	/*
	 * We can try to upgrade the mmap lock atomically,
	 * in which case we can continue to use the vma
	 * we already looked up.
	 *
	 * Otherwise we'll have to drop the mmap lock and
	 * re-take it, and also look up the vma again,
	 * re-checking it.
	 */
	if (!mmap_upgrade_trylock(mm)) {
		if (!upgrade_mmap_lock_carefully(mm, regs))
			return NULL;

		vma = find_vma(mm, addr);
		if (!vma)
			goto fail;
		if (vma->vm_start <= addr)
			goto success;
		if (!(vma->vm_flags & VM_GROWSDOWN))
			goto fail;
	}

	if (expand_stack_locked(vma, addr))
		goto fail;

success:
	mmap_write_downgrade(mm);
	return vma;

fail:
	mmap_write_unlock(mm);
	return NULL;
}
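
/*
 * A sketch of how an architecture's fault handler uses this (details vary
 * per arch; names like bad_area_nosemaphore() are arch-specific):
 *
 *	vma = lock_mm_and_find_vma(mm, address, regs);
 *	if (unlikely(!vma)) {
 *		bad_area_nosemaphore(regs, error_code, address);
 *		return;			// lock already dropped on failure
 *	}
 *	fault = handle_mm_fault(vma, address, flags, regs);
 *	...
 *	mmap_read_unlock(mm);		// held for read on success
 */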
#endif /* CONFIG_LOCK_MM_AND_FIND_VMA */

#else /* CONFIG_MMU */

/*
 * At least xtensa ends up having protection faults even with no
 * MMU.. No stack expansion, at least.
 */
struct vm_area_struct *lock_mm_and_find_vma(struct mm_struct *mm,
			unsigned long addr, struct pt_regs *regs)
{
	struct vm_area_struct *vma;

	mmap_read_lock(mm);
	vma = vma_lookup(mm, addr);
	if (!vma)
		mmap_read_unlock(mm);
	return vma;
}

#endif /* CONFIG_MMU */