xref: /linux/mm/mmap_lock.c (revision 37104286f9390a3da330c299b01cabfb4c98af7c)
// SPDX-License-Identifier: GPL-2.0
#define CREATE_TRACE_POINTS
#include <trace/events/mmap_lock.h>

#include <linux/mm.h>
#include <linux/cgroup.h>
#include <linux/memcontrol.h>
#include <linux/mmap_lock.h>
#include <linux/mutex.h>
#include <linux/percpu.h>
#include <linux/rcupdate.h>
#include <linux/smp.h>
#include <linux/trace_events.h>
#include <linux/local_lock.h>

EXPORT_TRACEPOINT_SYMBOL(mmap_lock_start_locking);
EXPORT_TRACEPOINT_SYMBOL(mmap_lock_acquire_returned);
EXPORT_TRACEPOINT_SYMBOL(mmap_lock_released);

#ifdef CONFIG_TRACING
/*
 * Trace calls must be in a separate file, as otherwise there's a circular
 * dependency between linux/mmap_lock.h and trace/events/mmap_lock.h.
 */

void __mmap_lock_do_trace_start_locking(struct mm_struct *mm, bool write)
{
	trace_mmap_lock_start_locking(mm, write);
}
EXPORT_SYMBOL(__mmap_lock_do_trace_start_locking);

void __mmap_lock_do_trace_acquire_returned(struct mm_struct *mm, bool write,
					   bool success)
{
	trace_mmap_lock_acquire_returned(mm, write, success);
}
EXPORT_SYMBOL(__mmap_lock_do_trace_acquire_returned);

void __mmap_lock_do_trace_released(struct mm_struct *mm, bool write)
{
	trace_mmap_lock_released(mm, write);
}
EXPORT_SYMBOL(__mmap_lock_do_trace_released);
#endif /* CONFIG_TRACING */
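/*
 * Illustrative sketch (not part of this file): the wrappers above are reached
 * through the inline lock helpers in include/linux/mmap_lock.h, which only
 * call into here when the corresponding tracepoints are enabled. Simplified,
 * roughly:
 *
 *	static inline void mmap_write_lock(struct mm_struct *mm)
 *	{
 *		__mmap_lock_trace_start_locking(mm, true);
 *		down_write(&mm->mmap_lock);
 *		__mmap_lock_trace_acquire_returned(mm, true, true);
 *	}
 *
 * The exact helper bodies differ across kernel versions; see mmap_lock.h for
 * the authoritative definitions.
 */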

#ifdef CONFIG_MMU
#ifdef CONFIG_PER_VMA_LOCK
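/*
 * Overview (added for clarity, not from the original file): per-VMA locking
 * encodes the lock state in vma->vm_refcnt. The values assumed below follow
 * the definitions in the mm headers: 0 means the vma is detached, 1 means
 * attached with no readers, each reader adds 1 (bounded by VMA_REF_LIMIT),
 * and a writer waiting for readers to drain adds VMA_LOCK_OFFSET. Writers
 * are additionally serialized by mmap_lock held for write.
 */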
/*
 * Return values: 0 if the vma is detached,
 * 1 if the vma is attached with no readers,
 * -EINTR if a signal is received while waiting.
 */
static inline int __vma_enter_locked(struct vm_area_struct *vma,
		bool detaching, int state)
{
	int err;
	unsigned int tgt_refcnt = VMA_LOCK_OFFSET;

	/* Additional refcnt if the vma is attached. */
	if (!detaching)
		tgt_refcnt++;

	/*
	 * If vma is detached then only vma_mark_attached() can raise the
	 * vm_refcnt. mmap_write_lock prevents racing with vma_mark_attached().
	 */
	if (!refcount_add_not_zero(VMA_LOCK_OFFSET, &vma->vm_refcnt))
		return 0;

	rwsem_acquire(&vma->vmlock_dep_map, 0, 0, _RET_IP_);
	err = rcuwait_wait_event(&vma->vm_mm->vma_writer_wait,
		   refcount_read(&vma->vm_refcnt) == tgt_refcnt,
		   state);
	if (err) {
		rwsem_release(&vma->vmlock_dep_map, _RET_IP_);
		return err;
	}
	lock_acquired(&vma->vmlock_dep_map, _RET_IP_);

	return 1;
}

static inline void __vma_exit_locked(struct vm_area_struct *vma, bool *detached)
{
	*detached = refcount_sub_and_test(VMA_LOCK_OFFSET, &vma->vm_refcnt);
	rwsem_release(&vma->vmlock_dep_map, _RET_IP_);
}

int __vma_start_write(struct vm_area_struct *vma, unsigned int mm_lock_seq,
		int state)
{
	int locked;

	/*
	 * __vma_enter_locked() returns 0 immediately if the vma is not
	 * attached, and a negative error if the wait is interrupted by a
	 * signal; otherwise it waits until the refcount indicates that the
	 * vma is attached with no readers.
	 */
	locked = __vma_enter_locked(vma, false, state);
	if (locked < 0)
		return locked;

	/*
	 * We should use WRITE_ONCE() here because we can have concurrent reads
	 * from the early lockless pessimistic check in vma_start_read().
	 * We don't really care about the correctness of that early check, but
	 * we should use WRITE_ONCE() for cleanliness and to keep KCSAN happy.
	 */
	WRITE_ONCE(vma->vm_lock_seq, mm_lock_seq);

	if (locked) {
		bool detached;

		__vma_exit_locked(vma, &detached);
		WARN_ON_ONCE(detached); /* vma should remain attached */
	}

	return 0;
}
EXPORT_SYMBOL_GPL(__vma_start_write);
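/*
 * Illustrative caller (a sketch, not from this file): vma_start_write() in
 * the mm headers takes the fast path when the vma is already write-locked
 * and only falls back to __vma_start_write() otherwise. Simplified, and
 * assuming the current helper names:
 *
 *	static inline void vma_start_write(struct vm_area_struct *vma)
 *	{
 *		unsigned int mm_lock_seq;
 *
 *		if (__is_vma_write_locked(vma, &mm_lock_seq))
 *			return;
 *		__vma_start_write(vma, mm_lock_seq, TASK_UNINTERRUPTIBLE);
 *	}
 *
 * A killable variant would pass TASK_KILLABLE instead and propagate the
 * return value to the caller.
 */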

void vma_mark_detached(struct vm_area_struct *vma)
{
	vma_assert_write_locked(vma);
	vma_assert_attached(vma);

	/*
	 * We are the only writer, so no need to use vma_refcount_put().
	 * The condition below is unlikely because the vma has already been
	 * write-locked and readers can increment vm_refcnt only temporarily,
	 * before they check vm_lock_seq, realize the vma is locked and drop
	 * the vm_refcnt back. That is a narrow window for observing a raised
	 * vm_refcnt.
	 */
	if (unlikely(!refcount_dec_and_test(&vma->vm_refcnt))) {
		/* Wait until vma is detached with no readers. */
		if (__vma_enter_locked(vma, true, TASK_UNINTERRUPTIBLE)) {
			bool detached;

			__vma_exit_locked(vma, &detached);
			WARN_ON_ONCE(!detached);
		}
	}
}
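/*
 * For context (a sketch, not part of this file): the attach-side counterpart,
 * vma_mark_attached(), is an inline helper in the mm headers and is expected
 * to look roughly like this, also requiring the vma write lock:
 *
 *	static inline void vma_mark_attached(struct vm_area_struct *vma)
 *	{
 *		vma_assert_write_locked(vma);
 *		vma_assert_detached(vma);
 *		refcount_set_release(&vma->vm_refcnt, 1);
 *	}
 *
 * The RELEASE ordering pairs with the ACQUIRE in vma_start_read() so that a
 * reader which observes a non-zero vm_refcnt also observes the initialized
 * vma.
 */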

/*
 * Try to read-lock a vma. The function is allowed to occasionally yield a
 * false locked result to avoid performance overhead, in which case we fall
 * back to using mmap_lock. The function should never yield a false unlocked
 * result. A false locked result is possible if mm_lock_seq overflows or if
 * the vma gets reused and attached to a different mm before we lock it.
 * Returns the vma on success, NULL on failure to lock and ERR_PTR(-EAGAIN)
 * if the vma got detached.
 *
 * IMPORTANT: RCU lock must be held upon entering the function, but upon error
 *            IT IS RELEASED. The caller must handle this correctly.
 */
static inline struct vm_area_struct *vma_start_read(struct mm_struct *mm,
						    struct vm_area_struct *vma)
{
	struct mm_struct *other_mm;
	int oldcnt;

	RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "no rcu lock held");
	/*
	 * Check before locking. A race might cause false locked result.
	 * We can use READ_ONCE() for the mm_lock_seq here, and don't need
	 * ACQUIRE semantics, because this is just a lockless check whose result
	 * we don't rely on for anything - the mm_lock_seq read against which we
	 * need ordering is below.
	 */
	if (READ_ONCE(vma->vm_lock_seq) == READ_ONCE(mm->mm_lock_seq.sequence)) {
		vma = NULL;
		goto err;
	}

	/*
	 * If VMA_LOCK_OFFSET is set, __refcount_inc_not_zero_limited_acquire()
	 * will fail because VMA_REF_LIMIT is less than VMA_LOCK_OFFSET.
	 * Acquire fence is required here to avoid reordering against later
	 * vm_lock_seq check and checks inside lock_vma_under_rcu().
	 */
	if (unlikely(!__refcount_inc_not_zero_limited_acquire(&vma->vm_refcnt, &oldcnt,
							      VMA_REF_LIMIT))) {
		/* return ERR_PTR(-EAGAIN) if the vma got detached from under us */
		vma = oldcnt ? NULL : ERR_PTR(-EAGAIN);
		goto err;
	}

	rwsem_acquire_read(&vma->vmlock_dep_map, 0, 1, _RET_IP_);

	if (unlikely(vma->vm_mm != mm))
		goto err_unstable;

	/*
	 * Overflow of vm_lock_seq/mm_lock_seq might produce false locked result.
	 * False unlocked result is impossible because we modify and check
	 * vma->vm_lock_seq under vma->vm_refcnt protection and mm->mm_lock_seq
	 * modification invalidates all existing locks.
	 *
	 * We must use ACQUIRE semantics for the mm_lock_seq so that if we are
	 * racing with vma_end_write_all(), we only start reading from the VMA
	 * after it has been unlocked.
	 * This pairs with RELEASE semantics in vma_end_write_all().
	 */
	if (unlikely(vma->vm_lock_seq == raw_read_seqcount(&mm->mm_lock_seq))) {
		vma_refcount_put(vma);
		vma = NULL;
		goto err;
	}

	return vma;
err:
	rcu_read_unlock();

	return vma;
err_unstable:
	/*
	 * If vma got attached to another mm from under us, that mm is not
	 * stable and can be freed in the narrow window after vma->vm_refcnt
	 * is dropped and before rcuwait_wake_up(mm) is called. Grab it before
	 * releasing vma->vm_refcnt.
	 */
	other_mm = vma->vm_mm; /* use a copy as vma can be freed after we drop vm_refcnt */

	/* __mmdrop() is a heavy operation, do it after dropping RCU lock. */
	rcu_read_unlock();
	mmgrab(other_mm);
	vma_refcount_put(vma);
	mmdrop(other_mm);

	return NULL;
}

/*
 * Lookup and lock a VMA under RCU protection. Returned VMA is guaranteed to be
 * stable and not isolated. If the VMA is not found or is being modified the
 * function returns NULL.
 */
struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm,
					  unsigned long address)
{
	MA_STATE(mas, &mm->mm_mt, address, address);
	struct vm_area_struct *vma;

retry:
	rcu_read_lock();
	vma = mas_walk(&mas);
	if (!vma) {
		rcu_read_unlock();
		goto inval;
	}

	vma = vma_start_read(mm, vma);
	if (IS_ERR_OR_NULL(vma)) {
		/* Check if the VMA got isolated after we found it */
		if (PTR_ERR(vma) == -EAGAIN) {
			count_vm_vma_lock_event(VMA_LOCK_MISS);
			/* The area was replaced with another one */
			goto retry;
		}

		/* Failed to lock the VMA */
		goto inval;
	}
	/*
	 * At this point, we have a stable reference to a VMA: The VMA is
	 * locked and we know it hasn't already been isolated.
	 * From here on, we can access the VMA without worrying about which
	 * fields are accessible for RCU readers.
	 */
	rcu_read_unlock();

	/* Check if the vma we locked is the right one. */
	if (unlikely(address < vma->vm_start || address >= vma->vm_end)) {
		vma_end_read(vma);
		goto inval;
	}

	return vma;

inval:
	count_vm_vma_lock_event(VMA_LOCK_ABORT);
	return NULL;
}
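/*
 * Illustrative use (a sketch, not from this file) of the lock_vma_under_rcu()
 * contract, loosely following what architecture page fault handlers such as
 * x86's do_user_addr_fault() do; the permission check is arch-specific and
 * error handling is omitted:
 *
 *	vma = lock_vma_under_rcu(mm, address);
 *	if (!vma)
 *		goto lock_mmap;			// fall back to the mmap_lock path
 *	if (arch_access_error(error_code, vma)) {	// hypothetical helper name
 *		vma_end_read(vma);
 *		goto lock_mmap;
 *	}
 *	fault = handle_mm_fault(vma, address, flags | FAULT_FLAG_VMA_LOCK, regs);
 *	if (!(fault & (VM_FAULT_RETRY | VM_FAULT_COMPLETED)))
 *		vma_end_read(vma);
 */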

static struct vm_area_struct *lock_next_vma_under_mmap_lock(struct mm_struct *mm,
							    struct vma_iterator *vmi,
							    unsigned long from_addr)
{
	struct vm_area_struct *vma;
	int ret;

	ret = mmap_read_lock_killable(mm);
	if (ret)
		return ERR_PTR(ret);

	/* Lookup the vma at the last position again under mmap_read_lock */
	vma_iter_set(vmi, from_addr);
	vma = vma_next(vmi);
	if (vma) {
		/* Very unlikely vma->vm_refcnt overflow case */
		if (unlikely(!vma_start_read_locked(vma)))
			vma = ERR_PTR(-EAGAIN);
	}

	mmap_read_unlock(mm);

	return vma;
}

struct vm_area_struct *lock_next_vma(struct mm_struct *mm,
				     struct vma_iterator *vmi,
				     unsigned long from_addr)
{
	struct vm_area_struct *vma;
	unsigned int mm_wr_seq;
	bool mmap_unlocked;

	RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "no rcu read lock held");
retry:
	/* Start mmap_lock speculation in case we need to verify the vma later */
	mmap_unlocked = mmap_lock_speculate_try_begin(mm, &mm_wr_seq);
	vma = vma_next(vmi);
	if (!vma)
		return NULL;

	vma = vma_start_read(mm, vma);
	if (IS_ERR_OR_NULL(vma)) {
		/*
		 * Retry immediately if the vma gets detached from under us.
		 * An infinite loop should not happen because that would require
		 * the vma we find to be constantly knocked out from under us.
		 */
		if (PTR_ERR(vma) == -EAGAIN) {
			/* reset to search from the last address */
			rcu_read_lock();
			vma_iter_set(vmi, from_addr);
			goto retry;
		}

		goto fallback;
	}

	/* Verify the vma is not behind the last search position. */
	if (unlikely(from_addr >= vma->vm_end))
		goto fallback_unlock;

	/*
	 * vma can be ahead of the last search position but we need to verify
	 * it was not shrunk after we found it and another vma has not been
	 * installed ahead of it. Otherwise we might observe a gap that should
	 * not be there.
	 */
	if (from_addr < vma->vm_start) {
		/* Verify only if the address space might have changed since vma lookup. */
		if (!mmap_unlocked || mmap_lock_speculate_retry(mm, mm_wr_seq)) {
			vma_iter_set(vmi, from_addr);
			if (vma != vma_next(vmi))
				goto fallback_unlock;
		}
	}

	return vma;

fallback_unlock:
	rcu_read_unlock();
	vma_end_read(vma);
fallback:
	vma = lock_next_vma_under_mmap_lock(mm, vmi, from_addr);
	rcu_read_lock();
	/* Reinitialize the iterator after re-entering rcu read section */
	vma_iter_set(vmi, IS_ERR_OR_NULL(vma) ? from_addr : vma->vm_end);

	return vma;
}
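/*
 * Expected calling pattern (a sketch, not from this file; the real consumer
 * is the /proc/<pid>/maps code in fs/proc/task_mmu.c and may differ in
 * detail). The caller iterates under rcu_read_lock() and read-unlocks each
 * vma when done with it:
 *
 *	rcu_read_lock();
 *	vma_iter_init(&vmi, mm, addr);
 *	while ((vma = lock_next_vma(mm, &vmi, addr)) != NULL) {
 *		if (IS_ERR(vma))
 *			break;		// e.g. -EINTR from mmap_read_lock_killable()
 *		addr = vma->vm_end;
 *		// ... use the read-locked vma ...
 *		vma_end_read(vma);
 *	}
 *	rcu_read_unlock();
 */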
#endif /* CONFIG_PER_VMA_LOCK */

#ifdef CONFIG_LOCK_MM_AND_FIND_VMA
#include <linux/extable.h>

static inline bool get_mmap_lock_carefully(struct mm_struct *mm, struct pt_regs *regs)
{
	if (likely(mmap_read_trylock(mm)))
		return true;

	if (regs && !user_mode(regs)) {
		unsigned long ip = exception_ip(regs);
		if (!search_exception_tables(ip))
			return false;
	}

	return !mmap_read_lock_killable(mm);
}

static inline bool mmap_upgrade_trylock(struct mm_struct *mm)
{
	/*
	 * We don't have this operation yet.
	 *
	 * It should be easy enough to do: it's basically a
	 *    atomic_long_try_cmpxchg_acquire()
	 * from RWSEM_READER_BIAS -> RWSEM_WRITER_LOCKED, but
	 * it also needs the proper lockdep magic etc.
	 */
	return false;
}
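/*
 * Illustrative sketch only (there is no such rwsem API today, as the comment
 * above notes): the upgrade it has in mind would look roughly like the
 * following, ignoring the required lockdep annotations. RWSEM_READER_BIAS and
 * RWSEM_WRITER_LOCKED are private to kernel/locking/rwsem.c, which is one
 * reason this cannot simply be written here:
 *
 *	static inline bool rwsem_try_upgrade(struct rw_semaphore *sem)
 *	{
 *		long expected = RWSEM_READER_BIAS;	// sole reader, no writer
 *
 *		return atomic_long_try_cmpxchg_acquire(&sem->count, &expected,
 *						       RWSEM_WRITER_LOCKED);
 *	}
 */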

static inline bool upgrade_mmap_lock_carefully(struct mm_struct *mm, struct pt_regs *regs)
{
	mmap_read_unlock(mm);
	if (regs && !user_mode(regs)) {
		unsigned long ip = exception_ip(regs);
		if (!search_exception_tables(ip))
			return false;
	}
	return !mmap_write_lock_killable(mm);
}

/*
 * Helper for page fault handling.
 *
 * This is kind of equivalent to "mmap_read_lock()" followed
 * by "find_extend_vma()", except it's a lot more careful about
 * the locking (and will drop the lock on failure).
 *
 * For example, if we have a kernel bug that causes a page
 * fault, we don't want to just use mmap_read_lock() to get
 * the mm lock, because that would deadlock if the bug were
 * to happen while we're holding the mm lock for writing.
 *
 * So this checks the exception tables on kernel faults in
 * order to only do this all for instructions that are actually
 * expected to fault.
 *
 * We can also actually take the mm lock for writing if we
 * need to extend the vma, which helps the VM layer a lot.
 */
struct vm_area_struct *lock_mm_and_find_vma(struct mm_struct *mm,
			unsigned long addr, struct pt_regs *regs)
{
	struct vm_area_struct *vma;

	if (!get_mmap_lock_carefully(mm, regs))
		return NULL;

	vma = find_vma(mm, addr);
	if (likely(vma && (vma->vm_start <= addr)))
		return vma;

	/*
	 * Well, dang. We might still be successful, but only
	 * if we can extend a vma to do so.
	 */
	if (!vma || !(vma->vm_flags & VM_GROWSDOWN)) {
		mmap_read_unlock(mm);
		return NULL;
	}

	/*
	 * We can try to upgrade the mmap lock atomically,
	 * in which case we can continue to use the vma
	 * we already looked up.
	 *
	 * Otherwise we'll have to drop the mmap lock and
	 * re-take it, and also look up the vma again,
	 * re-checking it.
	 */
	if (!mmap_upgrade_trylock(mm)) {
		if (!upgrade_mmap_lock_carefully(mm, regs))
			return NULL;

		vma = find_vma(mm, addr);
		if (!vma)
			goto fail;
		if (vma->vm_start <= addr)
			goto success;
		if (!(vma->vm_flags & VM_GROWSDOWN))
			goto fail;
	}

	if (expand_stack_locked(vma, addr))
		goto fail;

success:
	mmap_write_downgrade(mm);
	return vma;

fail:
	mmap_write_unlock(mm);
	return NULL;
}
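/*
 * Illustrative use (a sketch, not from this file), following the pattern in
 * architecture fault handlers that select CONFIG_LOCK_MM_AND_FIND_VMA; the
 * bad-area handling is arch-specific:
 *
 *	vma = lock_mm_and_find_vma(mm, address, regs);
 *	if (unlikely(!vma)) {
 *		bad_area_nosemaphore(regs, error_code, address);  // arch-specific
 *		return;
 *	}
 *	// mmap_lock is now held for read and vma covers (or was extended to
 *	// cover) address:
 *	fault = handle_mm_fault(vma, address, flags, regs);
 */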
#endif /* CONFIG_LOCK_MM_AND_FIND_VMA */

#else /* CONFIG_MMU */

/*
 * At least xtensa ends up having protection faults even with no
 * MMU. No stack expansion, at least.
 */
struct vm_area_struct *lock_mm_and_find_vma(struct mm_struct *mm,
			unsigned long addr, struct pt_regs *regs)
{
	struct vm_area_struct *vma;

	mmap_read_lock(mm);
	vma = vma_lookup(mm, addr);
	if (!vma)
		mmap_read_unlock(mm);
	return vma;
}

#endif /* CONFIG_MMU */