/* linux/mm/mmap_lock.c (revision 8804d970fab45726b3c7cd7f240b31122aa94219) */
// SPDX-License-Identifier: GPL-2.0
#define CREATE_TRACE_POINTS
#include <trace/events/mmap_lock.h>

#include <linux/mm.h>
#include <linux/cgroup.h>
#include <linux/memcontrol.h>
#include <linux/mmap_lock.h>
#include <linux/mutex.h>
#include <linux/percpu.h>
#include <linux/rcupdate.h>
#include <linux/smp.h>
#include <linux/trace_events.h>
#include <linux/local_lock.h>

EXPORT_TRACEPOINT_SYMBOL(mmap_lock_start_locking);
EXPORT_TRACEPOINT_SYMBOL(mmap_lock_acquire_returned);
EXPORT_TRACEPOINT_SYMBOL(mmap_lock_released);

#ifdef CONFIG_TRACING
/*
 * Trace calls must be in a separate file, as otherwise there's a circular
 * dependency between linux/mmap_lock.h and trace/events/mmap_lock.h.
 */

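/*
 * Note on usage: these out-of-line helpers are expected to be reached only
 * from the inline wrappers in linux/mmap_lock.h when the corresponding
 * tracepoint is enabled (via a tracepoint_enabled() check there), so the
 * common locking paths stay cheap.
 */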
void __mmap_lock_do_trace_start_locking(struct mm_struct *mm, bool write)
{
	trace_mmap_lock_start_locking(mm, write);
}
EXPORT_SYMBOL(__mmap_lock_do_trace_start_locking);

void __mmap_lock_do_trace_acquire_returned(struct mm_struct *mm, bool write,
					   bool success)
{
	trace_mmap_lock_acquire_returned(mm, write, success);
}
EXPORT_SYMBOL(__mmap_lock_do_trace_acquire_returned);

void __mmap_lock_do_trace_released(struct mm_struct *mm, bool write)
{
	trace_mmap_lock_released(mm, write);
}
EXPORT_SYMBOL(__mmap_lock_do_trace_released);
#endif /* CONFIG_TRACING */

#ifdef CONFIG_MMU
#ifdef CONFIG_PER_VMA_LOCK
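/*
 * Overview of the vm_refcnt scheme used below (inferred from the helpers in
 * this file and in mmap_lock.h): a detached vma has vm_refcnt == 0 and an
 * attached vma with no readers has vm_refcnt == 1. Read-lockers increment it
 * (bounded by VMA_REF_LIMIT), while a writer temporarily adds VMA_LOCK_OFFSET
 * and waits for readers to drain.
 */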
static inline bool __vma_enter_locked(struct vm_area_struct *vma, bool detaching)
{
	unsigned int tgt_refcnt = VMA_LOCK_OFFSET;

	/* Additional refcnt if the vma is attached. */
	if (!detaching)
		tgt_refcnt++;

	/*
	 * If vma is detached then only vma_mark_attached() can raise the
	 * vm_refcnt. mmap_write_lock prevents racing with vma_mark_attached().
	 */
	if (!refcount_add_not_zero(VMA_LOCK_OFFSET, &vma->vm_refcnt))
		return false;

	rwsem_acquire(&vma->vmlock_dep_map, 0, 0, _RET_IP_);
	rcuwait_wait_event(&vma->vm_mm->vma_writer_wait,
		   refcount_read(&vma->vm_refcnt) == tgt_refcnt,
		   TASK_UNINTERRUPTIBLE);
	lock_acquired(&vma->vmlock_dep_map, _RET_IP_);

	return true;
}

static inline void __vma_exit_locked(struct vm_area_struct *vma, bool *detached)
{
	*detached = refcount_sub_and_test(VMA_LOCK_OFFSET, &vma->vm_refcnt);
	rwsem_release(&vma->vmlock_dep_map, _RET_IP_);
}

void __vma_start_write(struct vm_area_struct *vma, unsigned int mm_lock_seq)
{
	bool locked;

	/*
	 * __vma_enter_locked() returns false immediately if the vma is not
	 * attached, otherwise it waits until the refcnt indicates that the
	 * vma is attached with no readers.
	 */
	locked = __vma_enter_locked(vma, false);

	/*
	 * We should use WRITE_ONCE() here because we can have concurrent reads
	 * from the early lockless pessimistic check in vma_start_read().
	 * We don't really care about the correctness of that early check, but
	 * we should use WRITE_ONCE() for cleanliness and to keep KCSAN happy.
	 */
	WRITE_ONCE(vma->vm_lock_seq, mm_lock_seq);

	if (locked) {
		bool detached;

		__vma_exit_locked(vma, &detached);
		WARN_ON_ONCE(detached); /* vma should remain attached */
	}
}
EXPORT_SYMBOL_GPL(__vma_start_write);
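/*
 * Illustrative write-side pattern (a sketch only; callers normally use the
 * vma_start_write() wrapper, for which __vma_start_write() is the slow path,
 * rather than calling this function directly):
 *
 *	mmap_write_lock(mm);
 *	vma_start_write(vma);	 (excludes per-vma readers of this vma)
 *	... modify the vma ...
 *	mmap_write_unlock(mm);	 (vma_end_write_all() invalidates the vma write locks)
 */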

void vma_mark_detached(struct vm_area_struct *vma)
{
	vma_assert_write_locked(vma);
	vma_assert_attached(vma);

	/*
	 * We are the only writer, so no need to use vma_refcount_put().
	 * The condition below is unlikely because the vma has already been
	 * write-locked and readers can increment vm_refcnt only temporarily
	 * before they check vm_lock_seq, realize the vma is locked and drop
	 * the vm_refcnt again. That is a narrow window for observing a raised
	 * vm_refcnt.
	 */
	if (unlikely(!refcount_dec_and_test(&vma->vm_refcnt))) {
		/* Wait until vma is detached with no readers. */
		if (__vma_enter_locked(vma, true)) {
			bool detached;

			__vma_exit_locked(vma, &detached);
			WARN_ON_ONCE(!detached);
		}
	}
}

/*
 * Try to read-lock a vma. The function is allowed to occasionally yield a
 * false locked result to avoid performance overhead, in which case we fall
 * back to using mmap_lock. The function should never yield a false unlocked
 * result. A false locked result is possible if mm_lock_seq overflows or if
 * the vma gets reused and attached to a different mm before we lock it.
 * Returns the vma on success, NULL on failure to lock and ERR_PTR(-EAGAIN)
 * if the vma got detached.
 *
 * IMPORTANT: RCU lock must be held upon entering the function, but upon error
 *            IT IS RELEASED. The caller must handle this correctly.
 */
static inline struct vm_area_struct *vma_start_read(struct mm_struct *mm,
						    struct vm_area_struct *vma)
{
	struct mm_struct *other_mm;
	int oldcnt;

	RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "no rcu lock held");
	/*
	 * Check before locking. A race might cause false locked result.
	 * We can use READ_ONCE() for the mm_lock_seq here, and don't need
	 * ACQUIRE semantics, because this is just a lockless check whose result
	 * we don't rely on for anything - the mm_lock_seq read against which we
	 * need ordering is below.
	 */
	if (READ_ONCE(vma->vm_lock_seq) == READ_ONCE(mm->mm_lock_seq.sequence)) {
		vma = NULL;
		goto err;
	}

	/*
	 * If VMA_LOCK_OFFSET is set, __refcount_inc_not_zero_limited_acquire()
	 * will fail because VMA_REF_LIMIT is less than VMA_LOCK_OFFSET.
	 * Acquire fence is required here to avoid reordering against later
	 * vm_lock_seq check and checks inside lock_vma_under_rcu().
	 */
	if (unlikely(!__refcount_inc_not_zero_limited_acquire(&vma->vm_refcnt, &oldcnt,
							      VMA_REF_LIMIT))) {
		/* return EAGAIN if vma got detached from under us */
		vma = oldcnt ? NULL : ERR_PTR(-EAGAIN);
		goto err;
	}

	rwsem_acquire_read(&vma->vmlock_dep_map, 0, 1, _RET_IP_);

	if (unlikely(vma->vm_mm != mm))
		goto err_unstable;

	/*
	 * Overflow of vm_lock_seq/mm_lock_seq might produce false locked result.
	 * False unlocked result is impossible because we modify and check
	 * vma->vm_lock_seq under vma->vm_refcnt protection and mm->mm_lock_seq
	 * modification invalidates all existing locks.
	 *
	 * We must use ACQUIRE semantics for the mm_lock_seq so that if we are
	 * racing with vma_end_write_all(), we only start reading from the VMA
	 * after it has been unlocked.
	 * This pairs with RELEASE semantics in vma_end_write_all().
	 */
	if (unlikely(vma->vm_lock_seq == raw_read_seqcount(&mm->mm_lock_seq))) {
		vma_refcount_put(vma);
		vma = NULL;
		goto err;
	}

	return vma;
err:
	rcu_read_unlock();

	return vma;
err_unstable:
	/*
	 * If vma got attached to another mm from under us, that mm is not
	 * stable and can be freed in the narrow window after vma->vm_refcnt
	 * is dropped and before rcuwait_wake_up(mm) is called. Grab it before
	 * releasing vma->vm_refcnt.
	 */
	other_mm = vma->vm_mm; /* use a copy as vma can be freed after we drop vm_refcnt */

	/* __mmdrop() is a heavy operation, do it after dropping RCU lock. */
	rcu_read_unlock();
	mmgrab(other_mm);
	vma_refcount_put(vma);
	mmdrop(other_mm);

	return NULL;
}

/*
 * Lookup and lock a VMA under RCU protection. Returned VMA is guaranteed to be
 * stable and not isolated. If the VMA is not found or is being modified, the
 * function returns NULL.
 */
struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm,
					  unsigned long address)
{
	MA_STATE(mas, &mm->mm_mt, address, address);
	struct vm_area_struct *vma;

retry:
	rcu_read_lock();
	vma = mas_walk(&mas);
	if (!vma) {
		rcu_read_unlock();
		goto inval;
	}

	vma = vma_start_read(mm, vma);
	if (IS_ERR_OR_NULL(vma)) {
		/* Check if the VMA got isolated after we found it */
		if (PTR_ERR(vma) == -EAGAIN) {
			count_vm_vma_lock_event(VMA_LOCK_MISS);
			/* The area was replaced with another one */
			goto retry;
		}

		/* Failed to lock the VMA */
		goto inval;
	}
	/*
	 * At this point, we have a stable reference to a VMA: The VMA is
	 * locked and we know it hasn't already been isolated.
	 * From here on, we can access the VMA without worrying about which
	 * fields are accessible for RCU readers.
	 */
	rcu_read_unlock();

	/* Check if the vma we locked is the right one. */
	if (unlikely(address < vma->vm_start || address >= vma->vm_end)) {
		vma_end_read(vma);
		goto inval;
	}

	return vma;

inval:
	count_vm_vma_lock_event(VMA_LOCK_ABORT);
	return NULL;
}
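/*
 * Typical caller pattern (illustrative sketch; the real users are the
 * architecture page fault handlers, not this file):
 *
 *	vma = lock_vma_under_rcu(mm, address);
 *	if (vma) {
 *		... handle the fault against the read-locked vma ...
 *		vma_end_read(vma);
 *	} else {
 *		... fall back to taking mmap_read_lock() ...
 *	}
 */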

static struct vm_area_struct *lock_next_vma_under_mmap_lock(struct mm_struct *mm,
							    struct vma_iterator *vmi,
							    unsigned long from_addr)
{
	struct vm_area_struct *vma;
	int ret;

	ret = mmap_read_lock_killable(mm);
	if (ret)
		return ERR_PTR(ret);

	/* Lookup the vma at the last position again under mmap_read_lock */
	vma_iter_set(vmi, from_addr);
	vma = vma_next(vmi);
	if (vma) {
		/* Very unlikely vma->vm_refcnt overflow case */
		if (unlikely(!vma_start_read_locked(vma)))
			vma = ERR_PTR(-EAGAIN);
	}

	mmap_read_unlock(mm);

	return vma;
}

struct vm_area_struct *lock_next_vma(struct mm_struct *mm,
				     struct vma_iterator *vmi,
				     unsigned long from_addr)
{
	struct vm_area_struct *vma;
	unsigned int mm_wr_seq;
	bool mmap_unlocked;

	RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "no rcu read lock held");
retry:
	/* Start mmap_lock speculation in case we need to verify the vma later */
	mmap_unlocked = mmap_lock_speculate_try_begin(mm, &mm_wr_seq);
	vma = vma_next(vmi);
	if (!vma)
		return NULL;

	vma = vma_start_read(mm, vma);
	if (IS_ERR_OR_NULL(vma)) {
		/*
		 * Retry immediately if the vma got detached from under us.
		 * An infinite loop should not happen, because that would
		 * require the vma we find to be constantly knocked out from
		 * under us.
		 */
		if (PTR_ERR(vma) == -EAGAIN) {
			/* reset to search from the last address */
			rcu_read_lock();
			vma_iter_set(vmi, from_addr);
			goto retry;
		}

		goto fallback;
	}

	/* Verify the vma is not behind the last search position. */
	if (unlikely(from_addr >= vma->vm_end))
		goto fallback_unlock;

	/*
	 * The vma can be ahead of the last search position, but we need to
	 * verify that it was not shrunk after we found it and that no other
	 * vma has been installed ahead of it. Otherwise we might observe a
	 * gap that should not be there.
	 */
	if (from_addr < vma->vm_start) {
		/* Verify only if the address space might have changed since vma lookup. */
		if (!mmap_unlocked || mmap_lock_speculate_retry(mm, mm_wr_seq)) {
			vma_iter_set(vmi, from_addr);
			if (vma != vma_next(vmi))
				goto fallback_unlock;
		}
	}

	return vma;

fallback_unlock:
	rcu_read_unlock();
	vma_end_read(vma);
fallback:
	vma = lock_next_vma_under_mmap_lock(mm, vmi, from_addr);
	rcu_read_lock();
	/* Reinitialize the iterator after re-entering rcu read section */
	vma_iter_set(vmi, IS_ERR_OR_NULL(vma) ? from_addr : vma->vm_end);

	return vma;
}
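/*
 * Illustrative iteration sketch (assumptions: the caller sets up the
 * vma_iterator itself and manages the rcu read section around the walk, as
 * the lockdep assertion above requires; error handling is elided):
 *
 *	rcu_read_lock();
 *	while ((vma = lock_next_vma(mm, &vmi, addr)) != NULL) {
 *		if (IS_ERR(vma))
 *			break;
 *		addr = vma->vm_end;
 *		... inspect the read-locked vma ...
 *		vma_end_read(vma);
 *	}
 *	rcu_read_unlock();
 */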
#endif /* CONFIG_PER_VMA_LOCK */

#ifdef CONFIG_LOCK_MM_AND_FIND_VMA
#include <linux/extable.h>

static inline bool get_mmap_lock_carefully(struct mm_struct *mm, struct pt_regs *regs)
{
	if (likely(mmap_read_trylock(mm)))
		return true;

	if (regs && !user_mode(regs)) {
		unsigned long ip = exception_ip(regs);
		if (!search_exception_tables(ip))
			return false;
	}

	return !mmap_read_lock_killable(mm);
}

static inline bool mmap_upgrade_trylock(struct mm_struct *mm)
{
	/*
	 * We don't have this operation yet.
	 *
	 * It should be easy enough to do: it's basically an
	 *    atomic_long_try_cmpxchg_acquire()
	 * from RWSEM_READER_BIAS -> RWSEM_WRITER_LOCKED, but
	 * it also needs the proper lockdep magic etc.
	 */
	return false;
}
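/*
 * Hypothetical sketch of the missing upgrade, based purely on the comment
 * above (RWSEM_READER_BIAS and RWSEM_WRITER_LOCKED are rwsem internals, and
 * a real implementation would also need the lockdep annotations):
 *
 *	long count = RWSEM_READER_BIAS;
 *
 *	return atomic_long_try_cmpxchg_acquire(&mm->mmap_lock.count, &count,
 *					       RWSEM_WRITER_LOCKED);
 */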

static inline bool upgrade_mmap_lock_carefully(struct mm_struct *mm, struct pt_regs *regs)
{
	mmap_read_unlock(mm);
	if (regs && !user_mode(regs)) {
		unsigned long ip = exception_ip(regs);
		if (!search_exception_tables(ip))
			return false;
	}
	return !mmap_write_lock_killable(mm);
}

/*
 * Helper for page fault handling.
 *
 * This is kind of equivalent to "mmap_read_lock()" followed
 * by "find_extend_vma()", except it's a lot more careful about
 * the locking (and will drop the lock on failure).
 *
 * For example, if we have a kernel bug that causes a page
 * fault, we don't want to just use mmap_read_lock() to get
 * the mm lock, because that would deadlock if the bug were
 * to happen while we're holding the mm lock for writing.
 *
 * So this checks the exception tables on kernel faults in
 * order to only do this all for instructions that are actually
 * expected to fault.
 *
 * We can also actually take the mm lock for writing if we
 * need to extend the vma, which helps the VM layer a lot.
 */
struct vm_area_struct *lock_mm_and_find_vma(struct mm_struct *mm,
			unsigned long addr, struct pt_regs *regs)
{
	struct vm_area_struct *vma;

	if (!get_mmap_lock_carefully(mm, regs))
		return NULL;

	vma = find_vma(mm, addr);
	if (likely(vma && (vma->vm_start <= addr)))
		return vma;

	/*
	 * Well, dang. We might still be successful, but only
	 * if we can extend a vma to do so.
	 */
	if (!vma || !(vma->vm_flags & VM_GROWSDOWN)) {
		mmap_read_unlock(mm);
		return NULL;
	}

	/*
	 * We can try to upgrade the mmap lock atomically,
	 * in which case we can continue to use the vma
	 * we already looked up.
	 *
	 * Otherwise we'll have to drop the mmap lock and
	 * re-take it, and also look up the vma again,
	 * re-checking it.
	 */
	if (!mmap_upgrade_trylock(mm)) {
		if (!upgrade_mmap_lock_carefully(mm, regs))
			return NULL;

		vma = find_vma(mm, addr);
		if (!vma)
			goto fail;
		if (vma->vm_start <= addr)
			goto success;
		if (!(vma->vm_flags & VM_GROWSDOWN))
			goto fail;
	}

	if (expand_stack_locked(vma, addr))
		goto fail;

success:
	mmap_write_downgrade(mm);
	return vma;

fail:
	mmap_write_unlock(mm);
	return NULL;
}
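/*
 * Illustrative fault-handler usage (sketch; the real callers are the
 * per-architecture page fault handlers):
 *
 *	vma = lock_mm_and_find_vma(mm, address, regs);
 *	if (!vma)
 *		... bad access: the mmap lock has already been dropped ...
 *	fault = handle_mm_fault(vma, address, flags, regs);
 *	... then release with mmap_read_unlock(mm) as appropriate ...
 *
 * On success the mm is returned read-locked: the write lock taken for stack
 * expansion is downgraded before returning.
 */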
#endif /* CONFIG_LOCK_MM_AND_FIND_VMA */

#else /* CONFIG_MMU */

/*
 * At least xtensa ends up having protection faults even with no
 * MMU.. No stack expansion, at least.
 */
struct vm_area_struct *lock_mm_and_find_vma(struct mm_struct *mm,
			unsigned long addr, struct pt_regs *regs)
{
	struct vm_area_struct *vma;

	mmap_read_lock(mm);
	vma = vma_lookup(mm, addr);
	if (!vma)
		mmap_read_unlock(mm);
	return vma;
}

#endif /* CONFIG_MMU */