// SPDX-License-Identifier: GPL-2.0
#define CREATE_TRACE_POINTS
#include <trace/events/mmap_lock.h>

#include <linux/mm.h>
#include <linux/cgroup.h>
#include <linux/memcontrol.h>
#include <linux/mmap_lock.h>
#include <linux/mutex.h>
#include <linux/percpu.h>
#include <linux/rcupdate.h>
#include <linux/smp.h>
#include <linux/trace_events.h>
#include <linux/local_lock.h>

EXPORT_TRACEPOINT_SYMBOL(mmap_lock_start_locking);
EXPORT_TRACEPOINT_SYMBOL(mmap_lock_acquire_returned);
EXPORT_TRACEPOINT_SYMBOL(mmap_lock_released);

#ifdef CONFIG_TRACING
/*
 * Trace calls must be in a separate file, as otherwise there's a circular
 * dependency between linux/mmap_lock.h and trace/events/mmap_lock.h.
 */

void __mmap_lock_do_trace_start_locking(struct mm_struct *mm, bool write)
{
	trace_mmap_lock_start_locking(mm, write);
}
EXPORT_SYMBOL(__mmap_lock_do_trace_start_locking);

void __mmap_lock_do_trace_acquire_returned(struct mm_struct *mm, bool write,
					   bool success)
{
	trace_mmap_lock_acquire_returned(mm, write, success);
}
EXPORT_SYMBOL(__mmap_lock_do_trace_acquire_returned);

void __mmap_lock_do_trace_released(struct mm_struct *mm, bool write)
{
	trace_mmap_lock_released(mm, write);
}
EXPORT_SYMBOL(__mmap_lock_do_trace_released);
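
/*
 * For reference, the inline wrappers in include/linux/mmap_lock.h are expected
 * to call into the helpers above only when the corresponding tracepoint is
 * enabled, roughly along these lines (a hedged sketch; the exact guard and
 * wrapper shown here are illustrative rather than copied from the header):
 *
 *	static inline void __mmap_lock_trace_start_locking(struct mm_struct *mm,
 *							    bool write)
 *	{
 *		if (tracepoint_enabled(mmap_lock_start_locking))
 *			__mmap_lock_do_trace_start_locking(mm, write);
 *	}
 *
 * Keeping the trace_*() invocations out-of-line in this file is what breaks
 * the circular include dependency described above.
 */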
#endif /* CONFIG_TRACING */

#ifdef CONFIG_MMU
#ifdef CONFIG_PER_VMA_LOCK

/* State shared across __vma_[start, end]_exclude_readers. */
struct vma_exclude_readers_state {
	/* Input parameters. */
	struct vm_area_struct *vma;
	int state; /* TASK_KILLABLE or TASK_UNINTERRUPTIBLE. */
	bool detaching;

	/* Output parameters. */
	bool detached;
	bool exclusive; /* Are we exclusively locked? */
};

/*
 * Now that all readers have been evicted, mark the VMA as being out of the
 * 'exclude readers' state.
 */
static void __vma_end_exclude_readers(struct vma_exclude_readers_state *ves)
{
	struct vm_area_struct *vma = ves->vma;

	VM_WARN_ON_ONCE(ves->detached);

	ves->detached = refcount_sub_and_test(VM_REFCNT_EXCLUDE_READERS_FLAG,
					      &vma->vm_refcnt);
	__vma_lockdep_release_exclusive(vma);
}

static unsigned int get_target_refcnt(struct vma_exclude_readers_state *ves)
{
	const unsigned int tgt = ves->detaching ? 0 : 1;

	return tgt | VM_REFCNT_EXCLUDE_READERS_FLAG;
}
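
/*
 * A worked example of the vm_refcnt values involved (illustrative; the
 * authoritative description lives with the vm_area_struct->vm_refcnt field):
 *
 *	detached:                 vm_refcnt == 0
 *	attached, no readers:     vm_refcnt == 1
 *	attached, N readers:      vm_refcnt == 1 + N
 *	writer excluding readers: the above + VM_REFCNT_EXCLUDE_READERS_FLAG
 *
 * get_target_refcnt() therefore yields 1 | VM_REFCNT_EXCLUDE_READERS_FLAG when
 * write-locking (the VMA stays attached) and 0 | VM_REFCNT_EXCLUDE_READERS_FLAG
 * when detaching - i.e. the value vm_refcnt must fall to once every remaining
 * reader has dropped its reference, at which point the rcuwait below is woken.
 */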

/*
 * Mark the VMA as being in a state of excluding readers, check to see if any
 * VMA read locks are indeed held, and if so wait for them to be released.
 *
 * Note that this function pairs with vma_refcount_put() which will wake up this
 * thread when it detects that the last reader has released its lock.
 *
 * The ves->state parameter should be set to TASK_UNINTERRUPTIBLE if the thread
 * must sleep uninterruptibly, or to TASK_KILLABLE if a fatal signal is
 * permitted to interrupt the wait.
 *
 * The function sets the ves->exclusive parameter to true if readers were
 * excluded, or false if the VMA was detached or an error arose on wait.
 *
 * If the function indicates an exclusive lock was acquired via ves->exclusive,
 * the caller is required to invoke __vma_end_exclude_readers() once the
 * exclusive state is no longer required.
 *
 * If ves->state is set to something other than TASK_UNINTERRUPTIBLE, the
 * function may also return -EINTR to indicate a fatal signal was received while
 * waiting.  Otherwise, the function returns 0.
 */
static int __vma_start_exclude_readers(struct vma_exclude_readers_state *ves)
{
	struct vm_area_struct *vma = ves->vma;
	unsigned int tgt_refcnt = get_target_refcnt(ves);
	int err = 0;

	mmap_assert_write_locked(vma->vm_mm);

	/*
	 * If vma is detached then only vma_mark_attached() can raise the
	 * vm_refcnt. mmap_write_lock prevents racing with vma_mark_attached().
	 *
	 * See the comment describing the vm_area_struct->vm_refcnt field for
	 * details of possible refcnt values.
	 */
	if (!refcount_add_not_zero(VM_REFCNT_EXCLUDE_READERS_FLAG, &vma->vm_refcnt)) {
		ves->detached = true;
		return 0;
	}

	__vma_lockdep_acquire_exclusive(vma);
	err = rcuwait_wait_event(&vma->vm_mm->vma_writer_wait,
		   refcount_read(&vma->vm_refcnt) == tgt_refcnt,
		   ves->state);
	if (err) {
		__vma_end_exclude_readers(ves);
		return err;
	}

	__vma_lockdep_stat_mark_acquired(vma);
	ves->exclusive = true;
	return 0;
}
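
/*
 * The rcuwait above is woken from the read-unlock side, vma_refcount_put() in
 * include/linux/mmap_lock.h. A hedged sketch of its expected shape (helper
 * names and the exact wake-up condition are illustrative; the real check is
 * more precise about being the last reader):
 *
 *	struct mm_struct *mm = vma->vm_mm; // vma may be freed once the ref drops
 *	int oldcnt;
 *
 *	__vma_lockdep_release_read(vma);
 *	if (!__refcount_dec_and_test(&vma->vm_refcnt, &oldcnt) &&
 *	    (oldcnt & VM_REFCNT_EXCLUDE_READERS_FLAG))
 *		rcuwait_wake_up(&mm->vma_writer_wait);
 *
 * Spurious wake-ups are harmless here because rcuwait_wait_event() re-checks
 * its condition before returning.
 */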

int __vma_start_write(struct vm_area_struct *vma, int state)
{
	const unsigned int mm_lock_seq = __vma_raw_mm_seqnum(vma);
	struct vma_exclude_readers_state ves = {
		.vma = vma,
		.state = state,
	};
	int err;

	err = __vma_start_exclude_readers(&ves);
	if (err) {
		WARN_ON_ONCE(ves.detached);
		return err;
	}

	/*
	 * We should use WRITE_ONCE() here because we can have concurrent reads
	 * from the early lockless pessimistic check in vma_start_read().
	 * We don't really care about the correctness of that early check, but
	 * we should use WRITE_ONCE() for cleanliness and to keep KCSAN happy.
	 */
	WRITE_ONCE(vma->vm_lock_seq, mm_lock_seq);

	if (ves.exclusive) {
		__vma_end_exclude_readers(&ves);
		/* VMA should remain attached. */
		WARN_ON_ONCE(ves.detached);
	}

	return 0;
}
EXPORT_SYMBOL_GPL(__vma_start_write);
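
/*
 * Callers normally reach __vma_start_write() via the inline wrappers in
 * include/linux/mmap_lock.h, which short-circuit when the VMA is already
 * write-locked in this mmap_write_lock() cycle. A hedged sketch of their
 * expected shape (the fast-path helper name is illustrative):
 *
 *	static inline void vma_start_write(struct vm_area_struct *vma)
 *	{
 *		if (__is_vma_write_locked(vma))
 *			return;
 *		__vma_start_write(vma, TASK_UNINTERRUPTIBLE);
 *	}
 *
 *	static inline int vma_start_write_killable(struct vm_area_struct *vma)
 *	{
 *		if (__is_vma_write_locked(vma))
 *			return 0;
 *		return __vma_start_write(vma, TASK_KILLABLE);
 *	}
 *
 * so only the slow path that actually has to exclude readers ends up here.
 */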

void __vma_exclude_readers_for_detach(struct vm_area_struct *vma)
{
	struct vma_exclude_readers_state ves = {
		.vma = vma,
		.state = TASK_UNINTERRUPTIBLE,
		.detaching = true,
	};
	int err;

	/*
	 * Wait until the VMA is detached with no readers. Since we hold the VMA
	 * write lock, the only read locks that might still be present are from
	 * threads which bumped the reference count while trying to acquire the
	 * read lock and will drop it again once they notice the write lock is
	 * held.
	 */
	err = __vma_start_exclude_readers(&ves);
	if (!err && ves.exclusive) {
		/*
		 * Once this is complete, no readers can increment the
		 * reference count, and the VMA is marked detached.
		 */
		__vma_end_exclude_readers(&ves);
	}
	/* If an error arose but we were detached anyway, we don't care. */
	WARN_ON_ONCE(!ves.detached);
}
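
/*
 * The expected caller of this helper is vma_mark_detached() in
 * include/linux/mmap_lock.h; a hedged sketch of that pairing (the fast path
 * shown is illustrative rather than copied from the header):
 *
 *	vma_assert_write_locked(vma);
 *
 *	// Drop the "attached" reference; if a transient reader still holds
 *	// the refcount, fall back to the slow path above.
 *	if (unlikely(!refcount_dec_and_test(&vma->vm_refcnt)))
 *		__vma_exclude_readers_for_detach(vma);
 *
 * Either way the VMA ends up with vm_refcnt == 0, and no reader can
 * re-acquire it until vma_mark_attached() is called again.
 */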

/*
 * Try to read-lock a vma. The function is allowed to occasionally yield a false
 * locked result to avoid performance overhead, in which case we fall back to
 * using mmap_lock. The function should never yield a false unlocked result.
 * A false locked result is possible if mm_lock_seq overflows or if the vma gets
 * reused and attached to a different mm before we lock it.
 * Returns the vma on success, NULL on failure to lock, and ERR_PTR(-EAGAIN) if
 * the vma got detached.
 *
 * IMPORTANT: The RCU read lock must be held upon entering the function, but
 *            upon error IT IS RELEASED. The caller must handle this correctly.
 */
static inline struct vm_area_struct *vma_start_read(struct mm_struct *mm,
						    struct vm_area_struct *vma)
{
	struct mm_struct *other_mm;
	int oldcnt;

	RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "no rcu lock held");
	/*
	 * Check before locking. A race might cause a false locked result.
	 * We can use READ_ONCE() for the mm_lock_seq here, and don't need
	 * ACQUIRE semantics, because this is just a lockless check whose result
	 * we don't rely on for anything - the mm_lock_seq read against which we
	 * need ordering is below.
	 */
	if (READ_ONCE(vma->vm_lock_seq) == READ_ONCE(mm->mm_lock_seq.sequence)) {
		vma = NULL;
		goto err;
	}

	/*
	 * If VM_REFCNT_EXCLUDE_READERS_FLAG is set,
	 * __refcount_inc_not_zero_limited_acquire() will fail because
	 * VM_REFCNT_LIMIT is less than VM_REFCNT_EXCLUDE_READERS_FLAG.
	 *
	 * Acquire fence is required here to avoid reordering against later
	 * vm_lock_seq check and checks inside lock_vma_under_rcu().
	 */
	if (unlikely(!__refcount_inc_not_zero_limited_acquire(&vma->vm_refcnt, &oldcnt,
							      VM_REFCNT_LIMIT))) {
		/* return EAGAIN if vma got detached from under us */
		vma = oldcnt ? NULL : ERR_PTR(-EAGAIN);
		goto err;
	}

	__vma_lockdep_acquire_read(vma);

	if (unlikely(vma->vm_mm != mm))
		goto err_unstable;

	/*
	 * Overflow of vm_lock_seq/mm_lock_seq might produce a false locked
	 * result. A false unlocked result is impossible because we modify and
	 * check vma->vm_lock_seq under vma->vm_refcnt protection and
	 * mm->mm_lock_seq modification invalidates all existing locks.
	 *
	 * We must use ACQUIRE semantics for the mm_lock_seq so that if we are
	 * racing with vma_end_write_all(), we only start reading from the VMA
	 * after it has been unlocked.
	 * This pairs with RELEASE semantics in vma_end_write_all().
	 */
	if (unlikely(vma->vm_lock_seq == raw_read_seqcount(&mm->mm_lock_seq))) {
		vma_refcount_put(vma);
		vma = NULL;
		goto err;
	}

	return vma;
err:
	rcu_read_unlock();

	return vma;
err_unstable:
	/*
	 * If vma got attached to another mm from under us, that mm is not
	 * stable and can be freed in the narrow window after vma->vm_refcnt
	 * is dropped and before rcuwait_wake_up(mm) is called. Grab it before
	 * releasing vma->vm_refcnt.
	 */
	other_mm = vma->vm_mm; /* use a copy as vma can be freed after we drop vm_refcnt */

	/* __mmdrop() is a heavy operation, do it after dropping RCU lock. */
	rcu_read_unlock();
	mmgrab(other_mm);
	vma_refcount_put(vma);
	mmdrop(other_mm);

	return NULL;
}

/*
 * Lookup and lock a VMA under RCU protection. The returned VMA is guaranteed
 * to be stable and not isolated. If the VMA is not found or is being modified,
 * the function returns NULL.
 */
struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm,
					  unsigned long address)
{
	MA_STATE(mas, &mm->mm_mt, address, address);
	struct vm_area_struct *vma;

retry:
	rcu_read_lock();
	vma = mas_walk(&mas);
	if (!vma) {
		rcu_read_unlock();
		goto inval;
	}

	vma = vma_start_read(mm, vma);
	if (IS_ERR_OR_NULL(vma)) {
		/* Check if the VMA got isolated after we found it */
		if (PTR_ERR(vma) == -EAGAIN) {
			count_vm_vma_lock_event(VMA_LOCK_MISS);
			/* The area was replaced with another one */
			mas_set(&mas, address);
			goto retry;
		}

		/* Failed to lock the VMA */
		goto inval;
	}
	/*
	 * At this point, we have a stable reference to a VMA: The VMA is
	 * locked and we know it hasn't already been isolated.
	 * From here on, we can access the VMA without worrying about which
	 * fields are accessible for RCU readers.
	 */
	rcu_read_unlock();

	/* Check if the vma we locked is the right one. */
	if (unlikely(address < vma->vm_start || address >= vma->vm_end)) {
		vma_end_read(vma);
		goto inval;
	}

	return vma;

inval:
	count_vm_vma_lock_event(VMA_LOCK_ABORT);
	return NULL;
}
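
/*
 * Typical use is in the architecture page fault handlers (a condensed, hedged
 * sketch; the access checks, labels and retry handling vary per architecture):
 *
 *	vma = lock_vma_under_rcu(mm, address);
 *	if (!vma)
 *		goto fall_back_to_mmap_lock;
 *
 *	fault = handle_mm_fault(vma, address, flags | FAULT_FLAG_VMA_LOCK, regs);
 *	if (!(fault & (VM_FAULT_RETRY | VM_FAULT_COMPLETED)))
 *		vma_end_read(vma);
 *
 * On a NULL return the caller simply retries the fault under mmap_read_lock(),
 * which is why a false "failed to lock" result here only costs performance,
 * never correctness.
 */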

static struct vm_area_struct *lock_next_vma_under_mmap_lock(struct mm_struct *mm,
							    struct vma_iterator *vmi,
							    unsigned long from_addr)
{
	struct vm_area_struct *vma;
	int ret;

	ret = mmap_read_lock_killable(mm);
	if (ret)
		return ERR_PTR(ret);

	/* Lookup the vma at the last position again under mmap_read_lock */
	vma_iter_set(vmi, from_addr);
	vma = vma_next(vmi);
	if (vma) {
		/* Very unlikely vma->vm_refcnt overflow case */
		if (unlikely(!vma_start_read_locked(vma)))
			vma = ERR_PTR(-EAGAIN);
	}

	mmap_read_unlock(mm);

	return vma;
}

struct vm_area_struct *lock_next_vma(struct mm_struct *mm,
				     struct vma_iterator *vmi,
				     unsigned long from_addr)
{
	struct vm_area_struct *vma;
	unsigned int mm_wr_seq;
	bool mmap_unlocked;

	RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "no rcu read lock held");
retry:
	/* Start mmap_lock speculation in case we need to verify the vma later */
	mmap_unlocked = mmap_lock_speculate_try_begin(mm, &mm_wr_seq);
	vma = vma_next(vmi);
	if (!vma)
		return NULL;

	vma = vma_start_read(mm, vma);
	if (IS_ERR_OR_NULL(vma)) {
		/*
		 * Retry immediately if the vma gets detached from under us.
		 * An infinite loop should not happen because the vma we found
		 * would have to be constantly knocked out from under us.
		 */
		if (PTR_ERR(vma) == -EAGAIN) {
			/* reset to search from the last address */
			rcu_read_lock();
			vma_iter_set(vmi, from_addr);
			goto retry;
		}

		goto fallback;
	}

	/* Verify the vma is not behind the last search position. */
	if (unlikely(from_addr >= vma->vm_end))
		goto fallback_unlock;

	/*
	 * vma can be ahead of the last search position but we need to verify
	 * it was not shrunk after we found it and another vma has not been
	 * installed ahead of it. Otherwise we might observe a gap that should
	 * not be there.
	 */
	if (from_addr < vma->vm_start) {
		/* Verify only if the address space might have changed since vma lookup. */
		if (!mmap_unlocked || mmap_lock_speculate_retry(mm, mm_wr_seq)) {
			vma_iter_set(vmi, from_addr);
			if (vma != vma_next(vmi))
				goto fallback_unlock;
		}
	}

	return vma;

fallback_unlock:
	rcu_read_unlock();
	vma_end_read(vma);
fallback:
	vma = lock_next_vma_under_mmap_lock(mm, vmi, from_addr);
	rcu_read_lock();
	/* Reinitialize the iterator after re-entering the rcu read section */
	vma_iter_set(vmi, IS_ERR_OR_NULL(vma) ? from_addr : vma->vm_end);

	return vma;
}
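
/*
 * Typical use is lockless iteration over an address space, e.g. when dumping
 * mappings. A hedged sketch of the expected calling pattern (the surrounding
 * loop is illustrative):
 *
 *	VMA_ITERATOR(vmi, mm, 0);
 *	unsigned long addr = 0;
 *	struct vm_area_struct *vma;
 *
 *	rcu_read_lock();
 *	for (;;) {
 *		vma = lock_next_vma(mm, &vmi, addr);
 *		if (!vma || IS_ERR(vma))
 *			break;
 *		addr = vma->vm_end;	// read while the vma is still locked
 *		... use the read-locked vma ...
 *		vma_end_read(vma);
 *	}
 *	rcu_read_unlock();
 *
 * Each returned vma is read-locked and must be released with vma_end_read();
 * an ERR_PTR() return means the mmap_lock fallback itself failed (e.g. a fatal
 * signal in mmap_read_lock_killable()).
 */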
#endif /* CONFIG_PER_VMA_LOCK */

#ifdef CONFIG_LOCK_MM_AND_FIND_VMA
#include <linux/extable.h>

static inline bool get_mmap_lock_carefully(struct mm_struct *mm, struct pt_regs *regs)
{
	if (likely(mmap_read_trylock(mm)))
		return true;

	if (regs && !user_mode(regs)) {
		unsigned long ip = exception_ip(regs);
		if (!search_exception_tables(ip))
			return false;
	}

	return !mmap_read_lock_killable(mm);
}

static inline bool mmap_upgrade_trylock(struct mm_struct *mm)
{
	/*
	 * We don't have this operation yet.
	 *
	 * It should be easy enough to do: it's basically an
	 *    atomic_long_try_cmpxchg_acquire()
	 * from RWSEM_READER_BIAS -> RWSEM_WRITER_LOCKED, but
	 * it also needs the proper lockdep magic etc.
	 */
	return false;
}
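
/*
 * A hedged sketch of what such an upgrade might look like if it were ever
 * implemented (purely illustrative; RWSEM_READER_BIAS and RWSEM_WRITER_LOCKED
 * are private to kernel/locking/rwsem.c, and the lockdep/owner hand-over is
 * glossed over here, which is much of why this does not exist yet):
 *
 *	long cnt = RWSEM_READER_BIAS;
 *
 *	if (!atomic_long_try_cmpxchg_acquire(&mm->mmap_lock.count, &cnt,
 *					     RWSEM_WRITER_LOCKED))
 *		return false;	// not the sole reader, or a writer is involved
 *	// ... transfer lockdep/owner state from reader to writer ...
 *	return true;
 *
 * i.e. the upgrade can only succeed when we are the single reader and nobody
 * else holds or is waiting for the lock.
 */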

static inline bool upgrade_mmap_lock_carefully(struct mm_struct *mm, struct pt_regs *regs)
{
	mmap_read_unlock(mm);
	if (regs && !user_mode(regs)) {
		unsigned long ip = exception_ip(regs);
		if (!search_exception_tables(ip))
			return false;
	}
	return !mmap_write_lock_killable(mm);
}

/*
 * Helper for page fault handling.
 *
 * This is kind of equivalent to "mmap_read_lock()" followed
 * by "find_extend_vma()", except it's a lot more careful about
 * the locking (and will drop the lock on failure).
 *
 * For example, if we have a kernel bug that causes a page
 * fault, we don't want to just use mmap_read_lock() to get
 * the mm lock, because that would deadlock if the bug were
 * to happen while we're holding the mm lock for writing.
 *
 * So this checks the exception tables on kernel faults in
 * order to only do this all for instructions that are actually
 * expected to fault.
 *
 * We can also actually take the mm lock for writing if we
 * need to extend the vma, which helps the VM layer a lot.
 */
struct vm_area_struct *lock_mm_and_find_vma(struct mm_struct *mm,
			unsigned long addr, struct pt_regs *regs)
{
	struct vm_area_struct *vma;

	if (!get_mmap_lock_carefully(mm, regs))
		return NULL;

	vma = find_vma(mm, addr);
	if (likely(vma && (vma->vm_start <= addr)))
		return vma;

	/*
	 * Well, dang. We might still be successful, but only
	 * if we can extend a vma to do so.
	 */
	if (!vma || !(vma->vm_flags & VM_GROWSDOWN)) {
		mmap_read_unlock(mm);
		return NULL;
	}

	/*
	 * We can try to upgrade the mmap lock atomically,
	 * in which case we can continue to use the vma
	 * we already looked up.
	 *
	 * Otherwise we'll have to drop the mmap lock and
	 * re-take it, and also look up the vma again,
	 * re-checking it.
	 */
	if (!mmap_upgrade_trylock(mm)) {
		if (!upgrade_mmap_lock_carefully(mm, regs))
			return NULL;

		vma = find_vma(mm, addr);
		if (!vma)
			goto fail;
		if (vma->vm_start <= addr)
			goto success;
		if (!(vma->vm_flags & VM_GROWSDOWN))
			goto fail;
	}

	if (expand_stack_locked(vma, addr))
		goto fail;

success:
	mmap_write_downgrade(mm);
	return vma;

fail:
	mmap_write_unlock(mm);
	return NULL;
}
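
/*
 * Typical use from an architecture fault handler, once the per-VMA lock path
 * has not resolved the fault (a hedged sketch; the bad-area helper and the
 * fault flags are placeholders, and retry handling is omitted):
 *
 *	vma = lock_mm_and_find_vma(mm, address, regs);
 *	if (unlikely(!vma))
 *		return handle_bad_area(regs, address);	// no lock is held here
 *
 *	fault = handle_mm_fault(vma, address, flags, regs);
 *	...
 *	mmap_read_unlock(mm);
 *
 * Note that on success the mmap lock is held for reading - the write lock
 * taken for stack expansion is downgraded before returning - and on failure
 * no lock is held at all.
 */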
#endif /* CONFIG_LOCK_MM_AND_FIND_VMA */

#else /* CONFIG_MMU */

/*
 * At least xtensa ends up having protection faults even with no
 * MMU. No stack expansion, at least.
 */
struct vm_area_struct *lock_mm_and_find_vma(struct mm_struct *mm,
			unsigned long addr, struct pt_regs *regs)
{
	struct vm_area_struct *vma;

	mmap_read_lock(mm);
	vma = vma_lookup(mm, addr);
	if (!vma)
		mmap_read_unlock(mm);
	return vma;
}

#endif /* CONFIG_MMU */