/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_MMAP_LOCK_H
#define _LINUX_MMAP_LOCK_H

/* Avoid a dependency loop by declaring here. */
extern int rcuwait_wake_up(struct rcuwait *w);

#include <linux/lockdep.h>
#include <linux/mm_types.h>
#include <linux/mmdebug.h>
#include <linux/rwsem.h>
#include <linux/tracepoint-defs.h>
#include <linux/types.h>
#include <linux/cleanup.h>
#include <linux/sched/mm.h>

#define MMAP_LOCK_INITIALIZER(name) \
	.mmap_lock = __RWSEM_INITIALIZER((name).mmap_lock),

DECLARE_TRACEPOINT(mmap_lock_start_locking);
DECLARE_TRACEPOINT(mmap_lock_acquire_returned);
DECLARE_TRACEPOINT(mmap_lock_released);

#ifdef CONFIG_TRACING

void __mmap_lock_do_trace_start_locking(struct mm_struct *mm, bool write);
void __mmap_lock_do_trace_acquire_returned(struct mm_struct *mm, bool write,
					   bool success);
void __mmap_lock_do_trace_released(struct mm_struct *mm, bool write);

static inline void __mmap_lock_trace_start_locking(struct mm_struct *mm,
						   bool write)
{
	if (tracepoint_enabled(mmap_lock_start_locking))
		__mmap_lock_do_trace_start_locking(mm, write);
}

static inline void __mmap_lock_trace_acquire_returned(struct mm_struct *mm,
						      bool write, bool success)
{
	if (tracepoint_enabled(mmap_lock_acquire_returned))
		__mmap_lock_do_trace_acquire_returned(mm, write, success);
}

static inline void __mmap_lock_trace_released(struct mm_struct *mm, bool write)
{
	if (tracepoint_enabled(mmap_lock_released))
		__mmap_lock_do_trace_released(mm, write);
}

#else /* !CONFIG_TRACING */

static inline void __mmap_lock_trace_start_locking(struct mm_struct *mm,
						   bool write)
{
}

static inline void __mmap_lock_trace_acquire_returned(struct mm_struct *mm,
						      bool write, bool success)
{
}

static inline void __mmap_lock_trace_released(struct mm_struct *mm, bool write)
{
}

#endif /* CONFIG_TRACING */

static inline void mmap_assert_locked(const struct mm_struct *mm)
{
	rwsem_assert_held(&mm->mmap_lock);
}

static inline void mmap_assert_write_locked(const struct mm_struct *mm)
{
	rwsem_assert_held_write(&mm->mmap_lock);
}

#ifdef CONFIG_PER_VMA_LOCK

static inline void mm_lock_seqcount_init(struct mm_struct *mm)
{
	seqcount_init(&mm->mm_lock_seq);
}

static inline void mm_lock_seqcount_begin(struct mm_struct *mm)
{
	do_raw_write_seqcount_begin(&mm->mm_lock_seq);
}

static inline void mm_lock_seqcount_end(struct mm_struct *mm)
{
	ASSERT_EXCLUSIVE_WRITER(mm->mm_lock_seq);
	do_raw_write_seqcount_end(&mm->mm_lock_seq);
}

static inline bool mmap_lock_speculate_try_begin(struct mm_struct *mm, unsigned int *seq)
{
	/*
	 * Since mmap_lock is a sleeping lock, and waiting for it to become
	 * unlocked is more or less equivalent to taking it ourselves, don't
	 * bother with the speculative path if mmap_lock is already
	 * write-locked; take the slow path, which takes the lock, instead.
	 */
	return raw_seqcount_try_begin(&mm->mm_lock_seq, *seq);
}

static inline bool mmap_lock_speculate_retry(struct mm_struct *mm, unsigned int seq)
{
	return read_seqcount_retry(&mm->mm_lock_seq, seq);
}
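
/*
 * Usage sketch (illustrative, not a definition in this header): a speculative
 * reader pairs mmap_lock_speculate_try_begin() with
 * mmap_lock_speculate_retry() and falls back to taking mmap_lock when
 * speculation is refused or invalidated. The "read mm state" steps stand in
 * for caller-specific work.
 *
 *	unsigned int seq;
 *	bool done = false;
 *
 *	if (mmap_lock_speculate_try_begin(mm, &seq)) {
 *		... read mm state locklessly ...
 *		done = !mmap_lock_speculate_retry(mm, seq);
 *	}
 *	if (!done) {
 *		mmap_read_lock(mm);
 *		... redo the read under the lock ...
 *		mmap_read_unlock(mm);
 *	}
 */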

static inline void vma_lock_init(struct vm_area_struct *vma, bool reset_refcnt)
{
#ifdef CONFIG_DEBUG_LOCK_ALLOC
	static struct lock_class_key lockdep_key;

	lockdep_init_map(&vma->vmlock_dep_map, "vm_lock", &lockdep_key, 0);
#endif
	if (reset_refcnt)
		refcount_set(&vma->vm_refcnt, 0);
	vma->vm_lock_seq = UINT_MAX;
}

static inline bool is_vma_writer_only(int refcnt)
{
	/*
	 * With a writer and no readers, refcnt is VMA_LOCK_OFFSET if the vma
	 * is detached and (VMA_LOCK_OFFSET + 1) if it is attached. Waiting on
	 * a detached vma happens only in vma_mark_detached() and is a rare
	 * case, therefore most of the time there will be no unnecessary wakeup.
	 */
	return refcnt & VMA_LOCK_OFFSET && refcnt <= VMA_LOCK_OFFSET + 1;
}

static inline void vma_refcount_put(struct vm_area_struct *vma)
{
	/* Use a copy of vm_mm in case vma is freed after we drop vm_refcnt */
	struct mm_struct *mm = vma->vm_mm;
	int oldcnt;

	rwsem_release(&vma->vmlock_dep_map, _RET_IP_);
	if (!__refcount_dec_and_test(&vma->vm_refcnt, &oldcnt)) {

		if (is_vma_writer_only(oldcnt - 1))
			rcuwait_wake_up(&mm->vma_writer_wait);
	}
}

/*
 * Try to read-lock a vma. The function is allowed to occasionally yield a
 * false locked result to avoid performance overhead, in which case we fall
 * back to using mmap_lock. The function should never yield a false unlocked
 * result. A false locked result is possible if mm_lock_seq overflows or if
 * the vma gets reused and attached to a different mm before we lock it.
 * Returns the vma on success, NULL on failure to lock and ERR_PTR(-EAGAIN)
 * if the vma got detached.
 *
 * WARNING! The vma passed to this function cannot be used if the function
 * fails to lock it because in certain cases the RCU lock is dropped and then
 * reacquired. Once the RCU lock is dropped the vma can be concurrently freed.
 */
static inline struct vm_area_struct *vma_start_read(struct mm_struct *mm,
						    struct vm_area_struct *vma)
{
	int oldcnt;

	/*
	 * Check before locking. A race might cause a false locked result.
	 * We can use READ_ONCE() for the mm_lock_seq here, and don't need
	 * ACQUIRE semantics, because this is just a lockless check whose result
	 * we don't rely on for anything - the mm_lock_seq read against which we
	 * need ordering is below.
	 */
	if (READ_ONCE(vma->vm_lock_seq) == READ_ONCE(mm->mm_lock_seq.sequence))
		return NULL;

	/*
	 * If VMA_LOCK_OFFSET is set, __refcount_inc_not_zero_limited_acquire()
	 * will fail because VMA_REF_LIMIT is less than VMA_LOCK_OFFSET.
	 * Acquire fence is required here to avoid reordering against later
	 * vm_lock_seq check and checks inside lock_vma_under_rcu().
	 */
	if (unlikely(!__refcount_inc_not_zero_limited_acquire(&vma->vm_refcnt, &oldcnt,
							      VMA_REF_LIMIT))) {
		/* return EAGAIN if vma got detached from under us */
		return oldcnt ? NULL : ERR_PTR(-EAGAIN);
	}

	rwsem_acquire_read(&vma->vmlock_dep_map, 0, 1, _RET_IP_);

	/*
	 * If vma got attached to another mm from under us, that mm is not
	 * stable and can be freed in the narrow window after vma->vm_refcnt
	 * is dropped and before rcuwait_wake_up(mm) is called. Grab it before
	 * releasing vma->vm_refcnt.
	 */
	if (unlikely(vma->vm_mm != mm)) {
		/* Use a copy of vm_mm in case vma is freed after we drop vm_refcnt */
		struct mm_struct *other_mm = vma->vm_mm;

		/*
		 * __mmdrop() is a heavy operation and we don't need RCU
		 * protection here. Release RCU lock during these operations.
		 * We reinstate the RCU read lock as the caller expects it to
		 * be held when this function returns even on error.
		 */
		rcu_read_unlock();
		mmgrab(other_mm);
		vma_refcount_put(vma);
		mmdrop(other_mm);
		rcu_read_lock();
		return NULL;
	}

	/*
	 * Overflow of vm_lock_seq/mm_lock_seq might produce false locked result.
	 * False unlocked result is impossible because we modify and check
	 * vma->vm_lock_seq under vma->vm_refcnt protection and mm->mm_lock_seq
	 * modification invalidates all existing locks.
	 *
	 * We must use ACQUIRE semantics for the mm_lock_seq so that if we are
	 * racing with vma_end_write_all(), we only start reading from the VMA
	 * after it has been unlocked.
	 * This pairs with RELEASE semantics in vma_end_write_all().
	 */
	if (unlikely(vma->vm_lock_seq == raw_read_seqcount(&mm->mm_lock_seq))) {
		vma_refcount_put(vma);
		return NULL;
	}

	return vma;
}
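
/*
 * Usage sketch (illustrative; lock_vma_under_rcu() below is a typical user of
 * this helper): vma_start_read() must be called under rcu_read_lock(), and a
 * successfully locked vma is released with vma_end_read(). The vma lookup is
 * a placeholder.
 *
 *	rcu_read_lock();
 *	vma = ... look up the vma covering address ...;
 *	if (vma)
 *		vma = vma_start_read(mm, vma);
 *	if (IS_ERR_OR_NULL(vma)) {
 *		... fall back to mmap_read_lock(mm) ...
 *	} else {
 *		... use the read-locked vma ...
 *		vma_end_read(vma);
 *	}
 *	rcu_read_unlock();
 */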

/*
 * Use only while holding the mmap read lock, which guarantees that locking
 * will not fail (nobody can concurrently write-lock the vma).
 * vma_start_read() should not be used in such cases because it might fail
 * due to mm_lock_seq overflow. This functionality is used to obtain a vma
 * read lock and then drop the mmap read lock.
 */
static inline bool vma_start_read_locked_nested(struct vm_area_struct *vma, int subclass)
{
	int oldcnt;

	mmap_assert_locked(vma->vm_mm);
	if (unlikely(!__refcount_inc_not_zero_limited_acquire(&vma->vm_refcnt, &oldcnt,
							      VMA_REF_LIMIT)))
		return false;

	rwsem_acquire_read(&vma->vmlock_dep_map, 0, 1, _RET_IP_);
	return true;
}

/*
 * Use only while holding the mmap read lock, which guarantees that locking
 * will not fail (nobody can concurrently write-lock the vma).
 * vma_start_read() should not be used in such cases because it might fail
 * due to mm_lock_seq overflow. This functionality is used to obtain a vma
 * read lock and then drop the mmap read lock.
 */
static inline bool vma_start_read_locked(struct vm_area_struct *vma)
{
	return vma_start_read_locked_nested(vma, 0);
}
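
/*
 * Usage sketch (illustrative): take the per-VMA read lock while mmap_lock is
 * read-held, then drop mmap_lock and keep only the vma lock. find_vma() here
 * is just one possible way to obtain the vma.
 *
 *	mmap_read_lock(mm);
 *	vma = find_vma(mm, address);
 *	if (vma && vma_start_read_locked(vma)) {
 *		mmap_read_unlock(mm);
 *		... operate on the read-locked vma ...
 *		vma_end_read(vma);
 *	} else {
 *		... keep working under mmap_lock ...
 *		mmap_read_unlock(mm);
 *	}
 */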

static inline void vma_end_read(struct vm_area_struct *vma)
{
	vma_refcount_put(vma);
}

/* WARNING! Can only be used if mmap_lock is expected to be write-locked */
static bool __is_vma_write_locked(struct vm_area_struct *vma, unsigned int *mm_lock_seq)
{
	mmap_assert_write_locked(vma->vm_mm);

	/*
	 * The current task is holding mmap_write_lock, so neither
	 * vma->vm_lock_seq nor mm->mm_lock_seq can be concurrently modified.
	 */
	*mm_lock_seq = vma->vm_mm->mm_lock_seq.sequence;
	return (vma->vm_lock_seq == *mm_lock_seq);
}

void __vma_start_write(struct vm_area_struct *vma, unsigned int mm_lock_seq);

/*
 * Begin writing to a VMA.
 * Exclude concurrent readers under the per-VMA lock until the currently
 * write-locked mmap_lock is dropped or downgraded.
 */
static inline void vma_start_write(struct vm_area_struct *vma)
{
	unsigned int mm_lock_seq;

	if (__is_vma_write_locked(vma, &mm_lock_seq))
		return;

	__vma_start_write(vma, mm_lock_seq);
}
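
/*
 * Usage sketch (illustrative): writers hold mmap_lock for writing and call
 * vma_start_write() before modifying fields that lockless readers may
 * inspect, so that concurrent per-VMA-lock readers are excluded.
 *
 *	mmap_write_lock(mm);
 *	vma_start_write(vma);
 *	... modify the vma ...
 *	mmap_write_unlock(mm);
 *
 * The final unlock also ends all vma write locks via vma_end_write_all().
 */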

static inline void vma_assert_write_locked(struct vm_area_struct *vma)
{
	unsigned int mm_lock_seq;

	VM_BUG_ON_VMA(!__is_vma_write_locked(vma, &mm_lock_seq), vma);
}

static inline void vma_assert_locked(struct vm_area_struct *vma)
{
	unsigned int mm_lock_seq;

	VM_BUG_ON_VMA(refcount_read(&vma->vm_refcnt) <= 1 &&
		      !__is_vma_write_locked(vma, &mm_lock_seq), vma);
}

/*
 * WARNING: to avoid racing with vma_mark_attached()/vma_mark_detached(), these
 * assertions should be made either under mmap_write_lock or when the object
 * has been isolated under mmap_write_lock, ensuring no competing writers.
 */
static inline void vma_assert_attached(struct vm_area_struct *vma)
{
	WARN_ON_ONCE(!refcount_read(&vma->vm_refcnt));
}

static inline void vma_assert_detached(struct vm_area_struct *vma)
{
	WARN_ON_ONCE(refcount_read(&vma->vm_refcnt));
}

static inline void vma_mark_attached(struct vm_area_struct *vma)
{
	vma_assert_write_locked(vma);
	vma_assert_detached(vma);
	refcount_set_release(&vma->vm_refcnt, 1);
}

void vma_mark_detached(struct vm_area_struct *vma);

struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm,
					  unsigned long address);
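
/*
 * Usage sketch (illustrative, modeled on the page fault path): try the
 * per-VMA lock first and fall back to mmap_lock when that fails.
 *
 *	vma = lock_vma_under_rcu(mm, address);
 *	if (vma) {
 *		... handle the fault under the vma read lock ...
 *		vma_end_read(vma);
 *	} else {
 *		mmap_read_lock(mm);
 *		vma = find_vma(mm, address);
 *		... handle the fault under mmap_lock ...
 *		mmap_read_unlock(mm);
 *	}
 */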

/*
 * Locks the next vma pointed to by the iterator. Confirms the locked vma has
 * not been modified and will retry under mmap_lock protection if modification
 * was detected. Should be called from an RCU read-side critical section.
 * Returns either a valid locked VMA, NULL if there are no more VMAs, or
 * -EINTR if the process was interrupted.
 */
struct vm_area_struct *lock_next_vma(struct mm_struct *mm,
				     struct vma_iterator *iter,
				     unsigned long address);
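
/*
 * Usage sketch (illustrative; assumes the returned vma, like other per-VMA
 * read locks, is released with vma_end_read() once the caller is done):
 *
 *	vma_iter_init(&iter, mm, addr);
 *	rcu_read_lock();
 *	for (;;) {
 *		vma = lock_next_vma(mm, &iter, addr);
 *		if (IS_ERR_OR_NULL(vma))
 *			break;
 *		... inspect the read-locked vma ...
 *		addr = vma->vm_end;
 *		vma_end_read(vma);
 *	}
 *	rcu_read_unlock();
 */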

#else /* CONFIG_PER_VMA_LOCK */

static inline void mm_lock_seqcount_init(struct mm_struct *mm) {}
static inline void mm_lock_seqcount_begin(struct mm_struct *mm) {}
static inline void mm_lock_seqcount_end(struct mm_struct *mm) {}

static inline bool mmap_lock_speculate_try_begin(struct mm_struct *mm, unsigned int *seq)
{
	return false;
}

static inline bool mmap_lock_speculate_retry(struct mm_struct *mm, unsigned int seq)
{
	return true;
}
static inline void vma_lock_init(struct vm_area_struct *vma, bool reset_refcnt) {}
static inline struct vm_area_struct *vma_start_read(struct mm_struct *mm,
						    struct vm_area_struct *vma)
		{ return NULL; }
static inline void vma_end_read(struct vm_area_struct *vma) {}
static inline void vma_start_write(struct vm_area_struct *vma) {}
static inline void vma_assert_write_locked(struct vm_area_struct *vma)
		{ mmap_assert_write_locked(vma->vm_mm); }
static inline void vma_assert_attached(struct vm_area_struct *vma) {}
static inline void vma_assert_detached(struct vm_area_struct *vma) {}
static inline void vma_mark_attached(struct vm_area_struct *vma) {}
static inline void vma_mark_detached(struct vm_area_struct *vma) {}

static inline struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm,
		unsigned long address)
{
	return NULL;
}

static inline void vma_assert_locked(struct vm_area_struct *vma)
{
	mmap_assert_locked(vma->vm_mm);
}

#endif /* CONFIG_PER_VMA_LOCK */

static inline void mmap_write_lock(struct mm_struct *mm)
{
	__mmap_lock_trace_start_locking(mm, true);
	down_write(&mm->mmap_lock);
	mm_lock_seqcount_begin(mm);
	__mmap_lock_trace_acquire_returned(mm, true, true);
}

static inline void mmap_write_lock_nested(struct mm_struct *mm, int subclass)
{
	__mmap_lock_trace_start_locking(mm, true);
	down_write_nested(&mm->mmap_lock, subclass);
	mm_lock_seqcount_begin(mm);
	__mmap_lock_trace_acquire_returned(mm, true, true);
}

static inline int mmap_write_lock_killable(struct mm_struct *mm)
{
	int ret;

	__mmap_lock_trace_start_locking(mm, true);
	ret = down_write_killable(&mm->mmap_lock);
	if (!ret)
		mm_lock_seqcount_begin(mm);
	__mmap_lock_trace_acquire_returned(mm, true, ret == 0);
	return ret;
}
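
/*
 * Usage sketch (illustrative): the killable variant lets a fatal signal abort
 * the wait, so the return value must be checked before touching the mm.
 *
 *	if (mmap_write_lock_killable(mm))
 *		return -EINTR;
 *	... modify the address space ...
 *	mmap_write_unlock(mm);
 */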

/*
 * Drop all currently-held per-VMA locks.
 * This is called from the mmap_lock implementation directly before releasing
 * a write-locked mmap_lock (or downgrading it to read-locked).
 * This should normally NOT be called manually from other places.
 * If you want to call this manually anyway, keep in mind that this will release
 * *all* VMA write locks, including ones from further up the stack.
 */
static inline void vma_end_write_all(struct mm_struct *mm)
{
	mmap_assert_write_locked(mm);
	mm_lock_seqcount_end(mm);
}

static inline void mmap_write_unlock(struct mm_struct *mm)
{
	__mmap_lock_trace_released(mm, true);
	vma_end_write_all(mm);
	up_write(&mm->mmap_lock);
}

static inline void mmap_write_downgrade(struct mm_struct *mm)
{
	__mmap_lock_trace_acquire_returned(mm, false, true);
	vma_end_write_all(mm);
	downgrade_write(&mm->mmap_lock);
}
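
/*
 * Usage sketch (illustrative): modify the address space under the write lock,
 * then downgrade so other readers can proceed while read-only work finishes.
 * The downgraded lock is dropped with mmap_read_unlock().
 *
 *	mmap_write_lock(mm);
 *	... update the address space ...
 *	mmap_write_downgrade(mm);
 *	... read-only work on the new state ...
 *	mmap_read_unlock(mm);
 */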

static inline void mmap_read_lock(struct mm_struct *mm)
{
	__mmap_lock_trace_start_locking(mm, false);
	down_read(&mm->mmap_lock);
	__mmap_lock_trace_acquire_returned(mm, false, true);
}

static inline int mmap_read_lock_killable(struct mm_struct *mm)
{
	int ret;

	__mmap_lock_trace_start_locking(mm, false);
	ret = down_read_killable(&mm->mmap_lock);
	__mmap_lock_trace_acquire_returned(mm, false, ret == 0);
	return ret;
}

static inline bool mmap_read_trylock(struct mm_struct *mm)
{
	bool ret;

	__mmap_lock_trace_start_locking(mm, false);
	ret = down_read_trylock(&mm->mmap_lock) != 0;
	__mmap_lock_trace_acquire_returned(mm, false, ret);
	return ret;
}

static inline void mmap_read_unlock(struct mm_struct *mm)
{
	__mmap_lock_trace_released(mm, false);
	up_read(&mm->mmap_lock);
}

DEFINE_GUARD(mmap_read_lock, struct mm_struct *,
	     mmap_read_lock(_T), mmap_read_unlock(_T))
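
/*
 * Usage sketch (illustrative; the function name is hypothetical): the guard
 * defined above enables scope-based locking via <linux/cleanup.h>;
 * mmap_read_unlock() runs automatically when the guard goes out of scope.
 *
 *	static int do_something_readonly(struct mm_struct *mm)
 *	{
 *		guard(mmap_read_lock)(mm);
 *		... mmap_lock is read-held for the rest of the scope ...
 *		return 0;
 *	}
 */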

static inline void mmap_read_unlock_non_owner(struct mm_struct *mm)
{
	__mmap_lock_trace_released(mm, false);
	up_read_non_owner(&mm->mmap_lock);
}

static inline int mmap_lock_is_contended(struct mm_struct *mm)
{
	return rwsem_is_contended(&mm->mmap_lock);
}

#endif /* _LINUX_MMAP_LOCK_H */