xref: /linux/mm/mmap_lock.c (revision e0c0ab04f6785abaa71b9b8dc252cb1a2072c225)
1 // SPDX-License-Identifier: GPL-2.0
2 #define CREATE_TRACE_POINTS
3 #include <trace/events/mmap_lock.h>
4 
5 #include <linux/mm.h>
6 #include <linux/cgroup.h>
7 #include <linux/memcontrol.h>
8 #include <linux/mmap_lock.h>
9 #include <linux/mutex.h>
10 #include <linux/percpu.h>
11 #include <linux/rcupdate.h>
12 #include <linux/smp.h>
13 #include <linux/trace_events.h>
14 #include <linux/local_lock.h>
15 
/* Export the three mmap_lock tracepoints (see trace/events/mmap_lock.h). */
EXPORT_TRACEPOINT_SYMBOL(mmap_lock_start_locking);
EXPORT_TRACEPOINT_SYMBOL(mmap_lock_acquire_returned);
EXPORT_TRACEPOINT_SYMBOL(mmap_lock_released);
19 
20 #ifdef CONFIG_TRACING
21 /*
22  * Trace calls must be in a separate file, as otherwise there's a circular
23  * dependency between linux/mmap_lock.h and trace/events/mmap_lock.h.
24  */
25 
/*
 * Fire the mmap_lock_start_locking tracepoint for @mm.
 *
 * @write is handed to the tracepoint unchanged; presumably it distinguishes
 * a write-lock attempt from a read-lock attempt (confirm against the callers
 * in linux/mmap_lock.h).
 */
void __mmap_lock_do_trace_start_locking(struct mm_struct *mm, bool write)
{
	trace_mmap_lock_start_locking(mm, write);
}
EXPORT_SYMBOL(__mmap_lock_do_trace_start_locking);
31 
/*
 * Fire the mmap_lock_acquire_returned tracepoint for @mm.
 *
 * @write and @success are handed to the tracepoint unchanged; presumably
 * @success reports whether the lock acquisition attempt succeeded (confirm
 * against the callers in linux/mmap_lock.h).
 */
void __mmap_lock_do_trace_acquire_returned(struct mm_struct *mm, bool write,
					   bool success)
{
	trace_mmap_lock_acquire_returned(mm, write, success);
}
EXPORT_SYMBOL(__mmap_lock_do_trace_acquire_returned);
38 
/*
 * Fire the mmap_lock_released tracepoint for @mm.
 *
 * @write is handed to the tracepoint unchanged; presumably it tells whether
 * the released lock was held for writing (confirm against the callers in
 * linux/mmap_lock.h).
 */
void __mmap_lock_do_trace_released(struct mm_struct *mm, bool write)
{
	trace_mmap_lock_released(mm, write);
}
EXPORT_SYMBOL(__mmap_lock_do_trace_released);
44 #endif /* CONFIG_TRACING */
45 
46 #ifdef CONFIG_MMU
47 #ifdef CONFIG_PER_VMA_LOCK
48 static inline bool __vma_enter_locked(struct vm_area_struct *vma, bool detaching)
49 {
50 	unsigned int tgt_refcnt = VMA_LOCK_OFFSET;
51 
52 	/* Additional refcnt if the vma is attached. */
53 	if (!detaching)
54 		tgt_refcnt++;
55 
56 	/*
57 	 * If vma is detached then only vma_mark_attached() can raise the
58 	 * vm_refcnt. mmap_write_lock prevents racing with vma_mark_attached().
59 	 */
60 	if (!refcount_add_not_zero(VMA_LOCK_OFFSET, &vma->vm_refcnt))
61 		return false;
62 
63 	rwsem_acquire(&vma->vmlock_dep_map, 0, 0, _RET_IP_);
64 	rcuwait_wait_event(&vma->vm_mm->vma_writer_wait,
65 		   refcount_read(&vma->vm_refcnt) == tgt_refcnt,
66 		   TASK_UNINTERRUPTIBLE);
67 	lock_acquired(&vma->vmlock_dep_map, _RET_IP_);
68 
69 	return true;
70 }
71 
/*
 * Counterpart of __vma_enter_locked(): subtract VMA_LOCK_OFFSET from
 * vma->vm_refcnt and release the vmlock_dep_map lockdep annotation.
 *
 * *@detached is set to true if the subtraction brought vm_refcnt to zero,
 * false otherwise.
 */
static inline void __vma_exit_locked(struct vm_area_struct *vma, bool *detached)
{
	*detached = refcount_sub_and_test(VMA_LOCK_OFFSET, &vma->vm_refcnt);
	rwsem_release(&vma->vmlock_dep_map, _RET_IP_);
}
77 
78 void __vma_start_write(struct vm_area_struct *vma, unsigned int mm_lock_seq)
79 {
80 	bool locked;
81 
82 	/*
83 	 * __vma_enter_locked() returns false immediately if the vma is not
84 	 * attached, otherwise it waits until refcnt is indicating that vma
85 	 * is attached with no readers.
86 	 */
87 	locked = __vma_enter_locked(vma, false);
88 
89 	/*
90 	 * We should use WRITE_ONCE() here because we can have concurrent reads
91 	 * from the early lockless pessimistic check in vma_start_read().
92 	 * We don't really care about the correctness of that early check, but
93 	 * we should use WRITE_ONCE() for cleanliness and to keep KCSAN happy.
94 	 */
95 	WRITE_ONCE(vma->vm_lock_seq, mm_lock_seq);
96 
97 	if (locked) {
98 		bool detached;
99 
100 		__vma_exit_locked(vma, &detached);
101 		WARN_ON_ONCE(detached); /* vma should remain attached */
102 	}
103 }
104 EXPORT_SYMBOL_GPL(__vma_start_write);
105 
106 void vma_mark_detached(struct vm_area_struct *vma)
107 {
108 	vma_assert_write_locked(vma);
109 	vma_assert_attached(vma);
110 
111 	/*
112 	 * We are the only writer, so no need to use vma_refcount_put().
113 	 * The condition below is unlikely because the vma has been already
114 	 * write-locked and readers can increment vm_refcnt only temporarily
115 	 * before they check vm_lock_seq, realize the vma is locked and drop
116 	 * back the vm_refcnt. That is a narrow window for observing a raised
117 	 * vm_refcnt.
118 	 */
119 	if (unlikely(!refcount_dec_and_test(&vma->vm_refcnt))) {
120 		/* Wait until vma is detached with no readers. */
121 		if (__vma_enter_locked(vma, true)) {
122 			bool detached;
123 
124 			__vma_exit_locked(vma, &detached);
125 			WARN_ON_ONCE(!detached);
126 		}
127 	}
128 }
129 
130 /*
131  * Lookup and lock a VMA under RCU protection. Returned VMA is guaranteed to be
132  * stable and not isolated. If the VMA is not found or is being modified the
133  * function returns NULL.
134  */
135 struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm,
136 					  unsigned long address)
137 {
138 	MA_STATE(mas, &mm->mm_mt, address, address);
139 	struct vm_area_struct *vma;
140 
141 	rcu_read_lock();
142 retry:
143 	vma = mas_walk(&mas);
144 	if (!vma)
145 		goto inval;
146 
147 	vma = vma_start_read(mm, vma);
148 	if (IS_ERR_OR_NULL(vma)) {
149 		/* Check if the VMA got isolated after we found it */
150 		if (PTR_ERR(vma) == -EAGAIN) {
151 			count_vm_vma_lock_event(VMA_LOCK_MISS);
152 			/* The area was replaced with another one */
153 			goto retry;
154 		}
155 
156 		/* Failed to lock the VMA */
157 		goto inval;
158 	}
159 	/*
160 	 * At this point, we have a stable reference to a VMA: The VMA is
161 	 * locked and we know it hasn't already been isolated.
162 	 * From here on, we can access the VMA without worrying about which
163 	 * fields are accessible for RCU readers.
164 	 */
165 
166 	/* Check if the vma we locked is the right one. */
167 	if (unlikely(vma->vm_mm != mm ||
168 		     address < vma->vm_start || address >= vma->vm_end))
169 		goto inval_end_read;
170 
171 	rcu_read_unlock();
172 	return vma;
173 
174 inval_end_read:
175 	vma_end_read(vma);
176 inval:
177 	rcu_read_unlock();
178 	count_vm_vma_lock_event(VMA_LOCK_ABORT);
179 	return NULL;
180 }
181 #endif /* CONFIG_PER_VMA_LOCK */
182 
183 #ifdef CONFIG_LOCK_MM_AND_FIND_VMA
184 #include <linux/extable.h>
185 
186 static inline bool get_mmap_lock_carefully(struct mm_struct *mm, struct pt_regs *regs)
187 {
188 	if (likely(mmap_read_trylock(mm)))
189 		return true;
190 
191 	if (regs && !user_mode(regs)) {
192 		unsigned long ip = exception_ip(regs);
193 		if (!search_exception_tables(ip))
194 			return false;
195 	}
196 
197 	return !mmap_read_lock_killable(mm);
198 }
199 
/*
 * Placeholder for an atomic read -> write upgrade of the mmap lock.
 *
 * Not implemented: always returns false (and leaves @mm untouched), so
 * callers have to fall back to dropping the read lock and taking the
 * write lock.
 */
static inline bool mmap_upgrade_trylock(struct mm_struct *mm)
{
	/*
	 * We don't have this operation yet.
	 *
	 * It should be easy enough to do: it's basically a
	 *    atomic_long_try_cmpxchg_acquire()
	 * from RWSEM_READER_BIAS -> RWSEM_WRITER_LOCKED, but
	 * it also needs the proper lockdep magic etc.
	 */
	return false;
}
212 
213 static inline bool upgrade_mmap_lock_carefully(struct mm_struct *mm, struct pt_regs *regs)
214 {
215 	mmap_read_unlock(mm);
216 	if (regs && !user_mode(regs)) {
217 		unsigned long ip = exception_ip(regs);
218 		if (!search_exception_tables(ip))
219 			return false;
220 	}
221 	return !mmap_write_lock_killable(mm);
222 }
223 
/*
 * Helper for page fault handling.
 *
 * This is kind of equivalent to "mmap_read_lock()" followed
 * by "find_extend_vma()", except it's a lot more careful about
 * the locking (and will drop the lock on failure).
 *
 * For example, if we have a kernel bug that causes a page
 * fault, we don't want to just use mmap_read_lock() to get
 * the mm lock, because that would deadlock if the bug were
 * to happen while we're holding the mm lock for writing.
 *
 * So this checks the exception tables on kernel faults in
 * order to only do this all for instructions that are actually
 * expected to fault.
 *
 * We can also actually take the mm lock for writing if we
 * need to extend the vma, which helps the VM layer a lot.
 *
 * Returns the vma containing @addr with the mmap lock held for
 * reading, or NULL with the mmap lock not held.
 */
struct vm_area_struct *lock_mm_and_find_vma(struct mm_struct *mm,
			unsigned long addr, struct pt_regs *regs)
{
	struct vm_area_struct *vma;

	/* Nothing to undo on failure: the read lock was never obtained. */
	if (!get_mmap_lock_carefully(mm, regs))
		return NULL;

	/* Fast path: a vma already covers addr; return with the read lock held. */
	vma = find_vma(mm, addr);
	if (likely(vma && (vma->vm_start <= addr)))
		return vma;

	/*
	 * Well, dang. We might still be successful, but only
	 * if we can extend a vma to do so.
	 */
	if (!vma || !(vma->vm_flags & VM_GROWSDOWN)) {
		mmap_read_unlock(mm);
		return NULL;
	}

	/*
	 * We can try to upgrade the mmap lock atomically,
	 * in which case we can continue to use the vma
	 * we already looked up.
	 *
	 * Otherwise we'll have to drop the mmap lock and
	 * re-take it, and also look up the vma again,
	 * re-checking it.
	 */
	if (!mmap_upgrade_trylock(mm)) {
		/* On failure the helper has already dropped the read lock. */
		if (!upgrade_mmap_lock_carefully(mm, regs))
			return NULL;

		/* The lock was dropped in between, so repeat all the checks. */
		vma = find_vma(mm, addr);
		if (!vma)
			goto fail;
		/* The vma now covers addr; no expansion needed. */
		if (vma->vm_start <= addr)
			goto success;
		if (!(vma->vm_flags & VM_GROWSDOWN))
			goto fail;
	}

	/* Write lock held: try to grow the vma down to addr. */
	if (expand_stack_locked(vma, addr))
		goto fail;

success:
	/* Hand the caller a read lock, as on the fast path. */
	mmap_write_downgrade(mm);
	return vma;

fail:
	mmap_write_unlock(mm);
	return NULL;
}
297 #endif /* CONFIG_LOCK_MM_AND_FIND_VMA */
298 
299 #else /* CONFIG_MMU */
300 
301 /*
302  * At least xtensa ends up having protection faults even with no
303  * MMU.. No stack expansion, at least.
304  */
305 struct vm_area_struct *lock_mm_and_find_vma(struct mm_struct *mm,
306 			unsigned long addr, struct pt_regs *regs)
307 {
308 	struct vm_area_struct *vma;
309 
310 	mmap_read_lock(mm);
311 	vma = vma_lookup(mm, addr);
312 	if (!vma)
313 		mmap_read_unlock(mm);
314 	return vma;
315 }
316 
317 #endif /* CONFIG_MMU */
318