// SPDX-License-Identifier: GPL-2.0
#define CREATE_TRACE_POINTS
#include <trace/events/mmap_lock.h>

#include <linux/mm.h>
#include <linux/cgroup.h>
#include <linux/memcontrol.h>
#include <linux/mmap_lock.h>
#include <linux/mutex.h>
#include <linux/percpu.h>
#include <linux/rcupdate.h>
#include <linux/smp.h>
#include <linux/trace_events.h>
#include <linux/local_lock.h>

EXPORT_TRACEPOINT_SYMBOL(mmap_lock_start_locking);
EXPORT_TRACEPOINT_SYMBOL(mmap_lock_acquire_returned);
EXPORT_TRACEPOINT_SYMBOL(mmap_lock_released);

#ifdef CONFIG_TRACING
/*
 * Trace calls must be in a separate file, as otherwise there's a circular
 * dependency between linux/mmap_lock.h and trace/events/mmap_lock.h.
 */

void __mmap_lock_do_trace_start_locking(struct mm_struct *mm, bool write)
{
	trace_mmap_lock_start_locking(mm, write);
}
EXPORT_SYMBOL(__mmap_lock_do_trace_start_locking);

void __mmap_lock_do_trace_acquire_returned(struct mm_struct *mm, bool write,
					   bool success)
{
	trace_mmap_lock_acquire_returned(mm, write, success);
}
EXPORT_SYMBOL(__mmap_lock_do_trace_acquire_returned);

void __mmap_lock_do_trace_released(struct mm_struct *mm, bool write)
{
	trace_mmap_lock_released(mm, write);
}
EXPORT_SYMBOL(__mmap_lock_do_trace_released);
#endif /* CONFIG_TRACING */

#ifdef CONFIG_MMU
#ifdef CONFIG_PER_VMA_LOCK
static inline bool __vma_enter_locked(struct vm_area_struct *vma, bool detaching)
{
	unsigned int tgt_refcnt = VMA_LOCK_OFFSET;

	/* Additional refcnt if the vma is attached. */
	if (!detaching)
		tgt_refcnt++;

	/*
	 * If vma is detached then only vma_mark_attached() can raise the
	 * vm_refcnt. mmap_write_lock prevents racing with vma_mark_attached().
	 */
	if (!refcount_add_not_zero(VMA_LOCK_OFFSET, &vma->vm_refcnt))
		return false;

	rwsem_acquire(&vma->vmlock_dep_map, 0, 0, _RET_IP_);
	rcuwait_wait_event(&vma->vm_mm->vma_writer_wait,
			   refcount_read(&vma->vm_refcnt) == tgt_refcnt,
			   TASK_UNINTERRUPTIBLE);
	lock_acquired(&vma->vmlock_dep_map, _RET_IP_);

	return true;
}

static inline void __vma_exit_locked(struct vm_area_struct *vma, bool *detached)
{
	*detached = refcount_sub_and_test(VMA_LOCK_OFFSET, &vma->vm_refcnt);
	rwsem_release(&vma->vmlock_dep_map, _RET_IP_);
}

void __vma_start_write(struct vm_area_struct *vma, unsigned int mm_lock_seq)
{
	bool locked;

	/*
	 * __vma_enter_locked() returns false immediately if the vma is not
	 * attached, otherwise it waits until the refcnt indicates that the
	 * vma is attached with no readers.
	 */
	locked = __vma_enter_locked(vma, false);

	/*
	 * We should use WRITE_ONCE() here because we can have concurrent reads
	 * from the early lockless pessimistic check in vma_start_read().
	 * We don't really care about the correctness of that early check, but
	 * we should use WRITE_ONCE() for cleanliness and to keep KCSAN happy.
	 */
	WRITE_ONCE(vma->vm_lock_seq, mm_lock_seq);

	if (locked) {
		bool detached;

		__vma_exit_locked(vma, &detached);
		WARN_ON_ONCE(detached); /* vma should remain attached */
	}
}
EXPORT_SYMBOL_GPL(__vma_start_write);
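
/*
 * Usage sketch (illustrative only, not compiled as part of this file):
 * callers do not invoke __vma_start_write() directly.  They call the
 * vma_start_write() wrapper from <linux/mmap_lock.h> while holding
 * mmap_lock for writing, so existing per-VMA readers drain before the
 * VMA is modified.  Roughly:
 *
 *	mmap_write_lock(mm);
 *	vma = find_vma(mm, addr);
 *	if (vma) {
 *		vma_start_write(vma);
 *		... modify the vma ...
 *	}
 *	mmap_write_unlock(mm);
 *
 * The caller shape above is an assumption for illustration; the real
 * callers are the mmap/munmap/mprotect style paths.
 */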

void vma_mark_detached(struct vm_area_struct *vma)
{
	vma_assert_write_locked(vma);
	vma_assert_attached(vma);

	/*
	 * We are the only writer, so no need to use vma_refcount_put().
	 * The condition below is unlikely because the vma has already been
	 * write-locked and readers can increment vm_refcnt only temporarily
	 * before they check vm_lock_seq, realize the vma is locked and drop
	 * the vm_refcnt back. That is a narrow window for observing a raised
	 * vm_refcnt.
	 */
	if (unlikely(!refcount_dec_and_test(&vma->vm_refcnt))) {
		/* Wait until vma is detached with no readers. */
		if (__vma_enter_locked(vma, true)) {
			bool detached;

			__vma_exit_locked(vma, &detached);
			WARN_ON_ONCE(!detached);
		}
	}
}

/*
 * Lookup and lock a VMA under RCU protection. Returned VMA is guaranteed to be
 * stable and not isolated. If the VMA is not found or is being modified the
 * function returns NULL.
 */
struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm,
					  unsigned long address)
{
	MA_STATE(mas, &mm->mm_mt, address, address);
	struct vm_area_struct *vma;

	rcu_read_lock();
retry:
	vma = mas_walk(&mas);
	if (!vma)
		goto inval;

	vma = vma_start_read(mm, vma);
	if (IS_ERR_OR_NULL(vma)) {
		/* Check if the VMA got isolated after we found it */
		if (PTR_ERR(vma) == -EAGAIN) {
			count_vm_vma_lock_event(VMA_LOCK_MISS);
			/* The area was replaced with another one */
			goto retry;
		}

		/* Failed to lock the VMA */
		goto inval;
	}
	/*
	 * At this point, we have a stable reference to a VMA: The VMA is
	 * locked and we know it hasn't already been isolated.
	 * From here on, we can access the VMA without worrying about which
	 * fields are accessible for RCU readers.
	 */

	/* Check if the vma we locked is the right one. */
	if (unlikely(address < vma->vm_start || address >= vma->vm_end))
		goto inval_end_read;

	rcu_read_unlock();
	return vma;

inval_end_read:
	vma_end_read(vma);
inval:
	rcu_read_unlock();
	count_vm_vma_lock_event(VMA_LOCK_ABORT);
	return NULL;
}
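
/*
 * Usage sketch (illustrative only, not compiled as part of this file):
 * architecture page fault handlers typically try the per-VMA lock first and
 * fall back to mmap_lock when lock_vma_under_rcu() returns NULL, roughly:
 *
 *	vma = lock_vma_under_rcu(mm, address);
 *	if (!vma)
 *		goto fall_back_to_mmap_lock;
 *	fault = handle_mm_fault(vma, address, flags | FAULT_FLAG_VMA_LOCK, regs);
 *	if (!(fault & (VM_FAULT_RETRY | VM_FAULT_COMPLETED)))
 *		vma_end_read(vma);
 *
 * The label and surrounding control flow are assumptions for illustration;
 * see an architecture's fault handler for the exact sequence.
 */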

static struct vm_area_struct *lock_next_vma_under_mmap_lock(struct mm_struct *mm,
							     struct vma_iterator *vmi,
							     unsigned long from_addr)
{
	struct vm_area_struct *vma;
	int ret;

	ret = mmap_read_lock_killable(mm);
	if (ret)
		return ERR_PTR(ret);

	/* Lookup the vma at the last position again under mmap_read_lock */
	vma_iter_set(vmi, from_addr);
	vma = vma_next(vmi);
	if (vma) {
		/* Very unlikely vma->vm_refcnt overflow case */
		if (unlikely(!vma_start_read_locked(vma)))
			vma = ERR_PTR(-EAGAIN);
	}

	mmap_read_unlock(mm);

	return vma;
}

struct vm_area_struct *lock_next_vma(struct mm_struct *mm,
				     struct vma_iterator *vmi,
				     unsigned long from_addr)
{
	struct vm_area_struct *vma;
	unsigned int mm_wr_seq;
	bool mmap_unlocked;

	RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "no rcu read lock held");
retry:
	/* Start mmap_lock speculation in case we need to verify the vma later */
	mmap_unlocked = mmap_lock_speculate_try_begin(mm, &mm_wr_seq);
	vma = vma_next(vmi);
	if (!vma)
		return NULL;

	vma = vma_start_read(mm, vma);
	if (IS_ERR_OR_NULL(vma)) {
		/*
		 * Retry immediately if the vma gets detached from under us.
		 * An infinite loop should not happen because the vma we find
		 * would have to be constantly knocked out from under us.
		 */
		if (PTR_ERR(vma) == -EAGAIN) {
			/* reset to search from the last address */
			vma_iter_set(vmi, from_addr);
			goto retry;
		}

		goto fallback;
	}

	/* Verify the vma is not behind the last search position. */
	if (unlikely(from_addr >= vma->vm_end))
		goto fallback_unlock;

	/*
	 * vma can be ahead of the last search position but we need to verify
	 * it was not shrunk after we found it and another vma has not been
	 * installed ahead of it. Otherwise we might observe a gap that should
	 * not be there.
	 */
	if (from_addr < vma->vm_start) {
		/* Verify only if the address space might have changed since vma lookup. */
		if (!mmap_unlocked || mmap_lock_speculate_retry(mm, mm_wr_seq)) {
			vma_iter_set(vmi, from_addr);
			if (vma != vma_next(vmi))
				goto fallback_unlock;
		}
	}

	return vma;

fallback_unlock:
	vma_end_read(vma);
fallback:
	rcu_read_unlock();
	vma = lock_next_vma_under_mmap_lock(mm, vmi, from_addr);
	rcu_read_lock();
	/* Reinitialize the iterator after re-entering the rcu read section */
	vma_iter_set(vmi, IS_ERR_OR_NULL(vma) ? from_addr : vma->vm_end);

	return vma;
}
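
/*
 * Usage sketch (illustrative only, not compiled as part of this file):
 * lock_next_vma() is meant for iterating over VMAs under rcu_read_lock()
 * without holding mmap_lock, roughly:
 *
 *	VMA_ITERATOR(vmi, mm, addr);
 *
 *	rcu_read_lock();
 *	while ((vma = lock_next_vma(mm, &vmi, addr)) != NULL) {
 *		if (IS_ERR(vma))
 *			break;
 *		... inspect the locked vma ...
 *		addr = vma->vm_end;
 *		vma_end_read(vma);
 *	}
 *	rcu_read_unlock();
 *
 * The loop shape is an assumption for illustration; the lockless
 * /proc/<pid>/maps reading code is the kind of user this interface targets.
 */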

#endif /* CONFIG_PER_VMA_LOCK */

#ifdef CONFIG_LOCK_MM_AND_FIND_VMA
#include <linux/extable.h>

static inline bool get_mmap_lock_carefully(struct mm_struct *mm, struct pt_regs *regs)
{
	if (likely(mmap_read_trylock(mm)))
		return true;

	if (regs && !user_mode(regs)) {
		unsigned long ip = exception_ip(regs);
		if (!search_exception_tables(ip))
			return false;
	}

	return !mmap_read_lock_killable(mm);
}

static inline bool mmap_upgrade_trylock(struct mm_struct *mm)
{
	/*
	 * We don't have this operation yet.
	 *
	 * It should be easy enough to do: it's basically an
	 *    atomic_long_try_cmpxchg_acquire()
	 * from RWSEM_READER_BIAS -> RWSEM_WRITER_LOCKED, but
	 * it also needs the proper lockdep magic etc.
	 */
	return false;
}

static inline bool upgrade_mmap_lock_carefully(struct mm_struct *mm, struct pt_regs *regs)
{
	mmap_read_unlock(mm);
	if (regs && !user_mode(regs)) {
		unsigned long ip = exception_ip(regs);
		if (!search_exception_tables(ip))
			return false;
	}
	return !mmap_write_lock_killable(mm);
}

/*
 * Helper for page fault handling.
 *
 * This is kind of equivalent to "mmap_read_lock()" followed
 * by "find_extend_vma()", except it's a lot more careful about
 * the locking (and will drop the lock on failure).
 *
 * For example, if we have a kernel bug that causes a page
 * fault, we don't want to just use mmap_read_lock() to get
 * the mm lock, because that would deadlock if the bug were
 * to happen while we're holding the mm lock for writing.
 *
 * So this checks the exception tables on kernel faults in
 * order to do all this only for instructions that are actually
 * expected to fault.
 *
 * We can also actually take the mm lock for writing if we
 * need to extend the vma, which helps the VM layer a lot.
 */
struct vm_area_struct *lock_mm_and_find_vma(struct mm_struct *mm,
			unsigned long addr, struct pt_regs *regs)
{
	struct vm_area_struct *vma;

	if (!get_mmap_lock_carefully(mm, regs))
		return NULL;

	vma = find_vma(mm, addr);
	if (likely(vma && (vma->vm_start <= addr)))
		return vma;

	/*
	 * Well, dang. We might still be successful, but only
	 * if we can extend a vma to do so.
	 */
	if (!vma || !(vma->vm_flags & VM_GROWSDOWN)) {
		mmap_read_unlock(mm);
		return NULL;
	}

	/*
	 * We can try to upgrade the mmap lock atomically,
	 * in which case we can continue to use the vma
	 * we already looked up.
	 *
	 * Otherwise we'll have to drop the mmap lock and
	 * re-take it, and also look up the vma again,
	 * re-checking it.
	 */
	if (!mmap_upgrade_trylock(mm)) {
		if (!upgrade_mmap_lock_carefully(mm, regs))
			return NULL;

		vma = find_vma(mm, addr);
		if (!vma)
			goto fail;
		if (vma->vm_start <= addr)
			goto success;
		if (!(vma->vm_flags & VM_GROWSDOWN))
			goto fail;
	}

	if (expand_stack_locked(vma, addr))
		goto fail;

success:
	mmap_write_downgrade(mm);
	return vma;

fail:
	mmap_write_unlock(mm);
	return NULL;
}
#endif /* CONFIG_LOCK_MM_AND_FIND_VMA */

#else /* CONFIG_MMU */

/*
 * At least xtensa ends up having protection faults even with no
 * MMU. No stack expansion, at least.
 */
struct vm_area_struct *lock_mm_and_find_vma(struct mm_struct *mm,
			unsigned long addr, struct pt_regs *regs)
{
	struct vm_area_struct *vma;

	mmap_read_lock(mm);
	vma = vma_lookup(mm, addr);
	if (!vma)
		mmap_read_unlock(mm);
	return vma;
}

#endif /* CONFIG_MMU */
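
/*
 * Usage sketch (illustrative only, not compiled as part of this file):
 * a fault handler using lock_mm_and_find_vma() typically looks roughly like:
 *
 *	vma = lock_mm_and_find_vma(mm, address, regs);
 *	if (unlikely(!vma)) {
 *		... handle the bad area; no lock is held here ...
 *		return;
 *	}
 *	fault = handle_mm_fault(vma, address, flags, regs);
 *	...
 *	mmap_read_unlock(mm);
 *
 * The surrounding control flow is an assumption for illustration; the key
 * contract is that the function returns with mmap_lock held for reading on
 * success and with no lock held on failure.
 */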