// SPDX-License-Identifier: GPL-2.0
#define CREATE_TRACE_POINTS
#include <trace/events/mmap_lock.h>

#include <linux/mm.h>
#include <linux/cgroup.h>
#include <linux/memcontrol.h>
#include <linux/mmap_lock.h>
#include <linux/mutex.h>
#include <linux/percpu.h>
#include <linux/rcupdate.h>
#include <linux/smp.h>
#include <linux/trace_events.h>
#include <linux/local_lock.h>

EXPORT_TRACEPOINT_SYMBOL(mmap_lock_start_locking);
EXPORT_TRACEPOINT_SYMBOL(mmap_lock_acquire_returned);
EXPORT_TRACEPOINT_SYMBOL(mmap_lock_released);

#ifdef CONFIG_TRACING
/*
 * Trace calls must be in a separate file, as otherwise there's a circular
 * dependency between linux/mmap_lock.h and trace/events/mmap_lock.h.
 */

void __mmap_lock_do_trace_start_locking(struct mm_struct *mm, bool write)
{
	trace_mmap_lock_start_locking(mm, write);
}
EXPORT_SYMBOL(__mmap_lock_do_trace_start_locking);

void __mmap_lock_do_trace_acquire_returned(struct mm_struct *mm, bool write,
					   bool success)
{
	trace_mmap_lock_acquire_returned(mm, write, success);
}
EXPORT_SYMBOL(__mmap_lock_do_trace_acquire_returned);

void __mmap_lock_do_trace_released(struct mm_struct *mm, bool write)
{
	trace_mmap_lock_released(mm, write);
}
EXPORT_SYMBOL(__mmap_lock_do_trace_released);
#endif /* CONFIG_TRACING */

#ifdef CONFIG_MMU
#ifdef CONFIG_PER_VMA_LOCK
/*
 * __vma_enter_locked() returns 0 immediately if the vma is not
 * attached, otherwise it waits for any current readers to finish and
 * returns 1. Returns -EINTR if a signal is received while waiting.
 */
static inline int __vma_enter_locked(struct vm_area_struct *vma,
				     bool detaching, int state)
{
	int err;
	unsigned int tgt_refcnt = VMA_LOCK_OFFSET;

	mmap_assert_write_locked(vma->vm_mm);

	/* Additional refcnt if the vma is attached. */
	if (!detaching)
		tgt_refcnt++;

	/*
	 * If vma is detached then only vma_mark_attached() can raise the
	 * vm_refcnt. mmap_write_lock prevents racing with vma_mark_attached().
	 */
	if (!refcount_add_not_zero(VMA_LOCK_OFFSET, &vma->vm_refcnt))
		return 0;

	rwsem_acquire(&vma->vmlock_dep_map, 0, 0, _RET_IP_);
	err = rcuwait_wait_event(&vma->vm_mm->vma_writer_wait,
				 refcount_read(&vma->vm_refcnt) == tgt_refcnt,
				 state);
	if (err) {
		if (refcount_sub_and_test(VMA_LOCK_OFFSET, &vma->vm_refcnt)) {
			/*
			 * The wait failed, but the last reader went away
			 * as well. Tell the caller the VMA is detached.
			 */
			WARN_ON_ONCE(!detaching);
			err = 0;
		}
		rwsem_release(&vma->vmlock_dep_map, _RET_IP_);
		return err;
	}
	lock_acquired(&vma->vmlock_dep_map, _RET_IP_);

	return 1;
}

static inline void __vma_exit_locked(struct vm_area_struct *vma, bool *detached)
{
	*detached = refcount_sub_and_test(VMA_LOCK_OFFSET, &vma->vm_refcnt);
	rwsem_release(&vma->vmlock_dep_map, _RET_IP_);
}

int __vma_start_write(struct vm_area_struct *vma, unsigned int mm_lock_seq,
		      int state)
{
	int locked;

	locked = __vma_enter_locked(vma, false, state);
	if (locked < 0)
		return locked;

	/*
	 * We should use WRITE_ONCE() here because we can have concurrent reads
	 * from the early lockless pessimistic check in vma_start_read().
	 * We don't really care about the correctness of that early check, but
	 * we should use WRITE_ONCE() for cleanliness and to keep KCSAN happy.
	 */
	WRITE_ONCE(vma->vm_lock_seq, mm_lock_seq);

	if (locked) {
		bool detached;

		__vma_exit_locked(vma, &detached);
		WARN_ON_ONCE(detached); /* vma should remain attached */
	}

	return 0;
}
EXPORT_SYMBOL_GPL(__vma_start_write);

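/*
 * Illustrative sketch, not code from this file: __vma_start_write() is
 * normally reached through the vma_start_write() wrapper, which a writer
 * calls while holding mmap_lock for writing, before it modifies the vma.
 * The outline below is only an approximation of a typical write-side
 * sequence:
 *
 *	mmap_write_lock(mm);
 *	vma = find_vma(mm, addr);
 *	vma_start_write(vma);		(write-locks this one vma)
 *	... modify the vma ...
 *	mmap_write_unlock(mm);		(vma_end_write_all() drops all vma locks)
 */
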
void vma_mark_detached(struct vm_area_struct *vma)
{
	vma_assert_write_locked(vma);
	vma_assert_attached(vma);

	/*
	 * We are the only writer, so no need to use vma_refcount_put().
	 * The condition below is unlikely because the vma has already been
	 * write-locked and readers can increment vm_refcnt only temporarily
	 * before they check vm_lock_seq, realize the vma is locked and drop
	 * back the vm_refcnt. That is a narrow window for observing a raised
	 * vm_refcnt.
	 */
	if (unlikely(!refcount_dec_and_test(&vma->vm_refcnt))) {
		/* Wait until vma is detached with no readers. */
		if (__vma_enter_locked(vma, true, TASK_UNINTERRUPTIBLE)) {
			bool detached;

			__vma_exit_locked(vma, &detached);
			WARN_ON_ONCE(!detached);
		}
	}
}

/*
 * Try to read-lock a vma. The function is allowed to occasionally yield false
 * locked result to avoid performance overhead, in which case we fall back to
 * using mmap_lock. The function should never yield false unlocked result.
 * False locked result is possible if mm_lock_seq overflows or if vma gets
 * reused and attached to a different mm before we lock it.
 * Returns the vma on success, NULL on failure to lock, and ERR_PTR(-EAGAIN)
 * if the vma got detached.
 *
 * IMPORTANT: RCU lock must be held upon entering the function, but upon error
 * IT IS RELEASED. The caller must handle this correctly.
 */
static inline struct vm_area_struct *vma_start_read(struct mm_struct *mm,
						     struct vm_area_struct *vma)
{
	struct mm_struct *other_mm;
	int oldcnt;

	RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "no rcu lock held");
	/*
	 * Check before locking. A race might cause false locked result.
	 * We can use READ_ONCE() for the mm_lock_seq here, and don't need
	 * ACQUIRE semantics, because this is just a lockless check whose result
	 * we don't rely on for anything - the mm_lock_seq read against which we
	 * need ordering is below.
	 */
	if (READ_ONCE(vma->vm_lock_seq) == READ_ONCE(mm->mm_lock_seq.sequence)) {
		vma = NULL;
		goto err;
	}

	/*
	 * If VMA_LOCK_OFFSET is set, __refcount_inc_not_zero_limited_acquire()
	 * will fail because VMA_REF_LIMIT is less than VMA_LOCK_OFFSET.
	 * Acquire fence is required here to avoid reordering against later
	 * vm_lock_seq check and checks inside lock_vma_under_rcu().
	 */
	if (unlikely(!__refcount_inc_not_zero_limited_acquire(&vma->vm_refcnt, &oldcnt,
							      VMA_REF_LIMIT))) {
		/* return EAGAIN if vma got detached from under us */
		vma = oldcnt ? NULL : ERR_PTR(-EAGAIN);
		goto err;
	}

	rwsem_acquire_read(&vma->vmlock_dep_map, 0, 1, _RET_IP_);

	if (unlikely(vma->vm_mm != mm))
		goto err_unstable;

	/*
	 * Overflow of vm_lock_seq/mm_lock_seq might produce false locked result.
	 * False unlocked result is impossible because we modify and check
	 * vma->vm_lock_seq under vma->vm_refcnt protection and mm->mm_lock_seq
	 * modification invalidates all existing locks.
	 *
	 * We must use ACQUIRE semantics for the mm_lock_seq so that if we are
	 * racing with vma_end_write_all(), we only start reading from the VMA
	 * after it has been unlocked.
	 * This pairs with RELEASE semantics in vma_end_write_all().
	 */
	if (unlikely(vma->vm_lock_seq == raw_read_seqcount(&mm->mm_lock_seq))) {
		vma_refcount_put(vma);
		vma = NULL;
		goto err;
	}

	return vma;
err:
	rcu_read_unlock();

	return vma;
err_unstable:
	/*
	 * If vma got attached to another mm from under us, that mm is not
	 * stable and can be freed in the narrow window after vma->vm_refcnt
	 * is dropped and before rcuwait_wake_up(mm) is called. Grab it before
	 * releasing vma->vm_refcnt.
	 */
	other_mm = vma->vm_mm; /* use a copy as vma can be freed after we drop vm_refcnt */

	/* __mmdrop() is a heavy operation, do it after dropping RCU lock. */
	rcu_read_unlock();
	mmgrab(other_mm);
	vma_refcount_put(vma);
	mmdrop(other_mm);

	return NULL;
}

/*
 * Lookup and lock a VMA under RCU protection. Returned VMA is guaranteed to be
 * stable and not isolated. If the VMA is not found or is being modified, the
 * function returns NULL.
 */
struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm,
					  unsigned long address)
{
	MA_STATE(mas, &mm->mm_mt, address, address);
	struct vm_area_struct *vma;

retry:
	rcu_read_lock();
	vma = mas_walk(&mas);
	if (!vma) {
		rcu_read_unlock();
		goto inval;
	}

	vma = vma_start_read(mm, vma);
	if (IS_ERR_OR_NULL(vma)) {
		/* Check if the VMA got isolated after we found it */
		if (PTR_ERR(vma) == -EAGAIN) {
			count_vm_vma_lock_event(VMA_LOCK_MISS);
			/* The area was replaced with another one */
			mas_set(&mas, address);
			goto retry;
		}

		/* Failed to lock the VMA */
		goto inval;
	}
	/*
	 * At this point, we have a stable reference to a VMA: The VMA is
	 * locked and we know it hasn't already been isolated.
	 * From here on, we can access the VMA without worrying about which
	 * fields are accessible for RCU readers.
	 */
	rcu_read_unlock();

	/* Check if the vma we locked is the right one. */
	if (unlikely(address < vma->vm_start || address >= vma->vm_end)) {
		vma_end_read(vma);
		goto inval;
	}

	return vma;

inval:
	count_vm_vma_lock_event(VMA_LOCK_ABORT);
	return NULL;
}

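/*
 * Illustrative sketch, not code from this file: architecture page fault
 * handlers typically try lock_vma_under_rcu() first and fall back to the
 * mmap_lock path when it returns NULL. The exact fault flags and label
 * names vary by architecture; roughly:
 *
 *	vma = lock_vma_under_rcu(mm, address);
 *	if (!vma)
 *		goto lock_mmap;		(fall back to the mmap_lock path)
 *	fault = handle_mm_fault(vma, address, flags | FAULT_FLAG_VMA_LOCK, regs);
 *	if (!(fault & (VM_FAULT_RETRY | VM_FAULT_COMPLETED)))
 *		vma_end_read(vma);
 *	if (fault & VM_FAULT_RETRY)
 *		goto lock_mmap;		(retry under mmap_lock)
 */
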
static struct vm_area_struct *lock_next_vma_under_mmap_lock(struct mm_struct *mm,
							     struct vma_iterator *vmi,
							     unsigned long from_addr)
{
	struct vm_area_struct *vma;
	int ret;

	ret = mmap_read_lock_killable(mm);
	if (ret)
		return ERR_PTR(ret);

	/* Lookup the vma at the last position again under mmap_read_lock */
	vma_iter_set(vmi, from_addr);
	vma = vma_next(vmi);
	if (vma) {
		/* Very unlikely vma->vm_refcnt overflow case */
		if (unlikely(!vma_start_read_locked(vma)))
			vma = ERR_PTR(-EAGAIN);
	}

	mmap_read_unlock(mm);

	return vma;
}

struct vm_area_struct *lock_next_vma(struct mm_struct *mm,
				     struct vma_iterator *vmi,
				     unsigned long from_addr)
{
	struct vm_area_struct *vma;
	unsigned int mm_wr_seq;
	bool mmap_unlocked;

	RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "no rcu read lock held");
retry:
	/* Start mmap_lock speculation in case we need to verify the vma later */
	mmap_unlocked = mmap_lock_speculate_try_begin(mm, &mm_wr_seq);
	vma = vma_next(vmi);
	if (!vma)
		return NULL;

	vma = vma_start_read(mm, vma);
	if (IS_ERR_OR_NULL(vma)) {
		/*
		 * Retry immediately if the vma gets detached from under us.
		 * An infinite loop should not happen because that would require
		 * the vma we find to be constantly knocked out from under us.
		 */
		if (PTR_ERR(vma) == -EAGAIN) {
			/* reset to search from the last address */
			rcu_read_lock();
			vma_iter_set(vmi, from_addr);
			goto retry;
		}

		goto fallback;
	}

	/* Verify the vma is not behind the last search position. */
	if (unlikely(from_addr >= vma->vm_end))
		goto fallback_unlock;

	/*
	 * vma can be ahead of the last search position but we need to verify
	 * it was not shrunk after we found it and another vma has not been
	 * installed ahead of it. Otherwise we might observe a gap that should
	 * not be there.
	 */
	if (from_addr < vma->vm_start) {
		/* Verify only if the address space might have changed since vma lookup. */
		if (!mmap_unlocked || mmap_lock_speculate_retry(mm, mm_wr_seq)) {
			vma_iter_set(vmi, from_addr);
			if (vma != vma_next(vmi))
				goto fallback_unlock;
		}
	}

	return vma;

fallback_unlock:
	rcu_read_unlock();
	vma_end_read(vma);
fallback:
	vma = lock_next_vma_under_mmap_lock(mm, vmi, from_addr);
	rcu_read_lock();
	/* Reinitialize the iterator after re-entering rcu read section */
	vma_iter_set(vmi, IS_ERR_OR_NULL(vma) ? from_addr : vma->vm_end);

	return vma;
}
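
/*
 * Illustrative sketch, not code from this file: lock_next_vma() is intended
 * for iterating over vmas under RCU, as done by the /proc/<pid>/maps reader.
 * A simplified caller, with most of the error handling omitted, could look
 * roughly like this:
 *
 *	rcu_read_lock();
 *	vma_iter_init(&vmi, mm, 0);
 *	for (addr = 0; (vma = lock_next_vma(mm, &vmi, addr)) != NULL; ) {
 *		if (IS_ERR(vma))
 *			break;		(e.g. mmap_read_lock_killable() failed)
 *		addr = vma->vm_end;
 *		... use the vma ...
 *		vma_end_read(vma);
 *	}
 *	rcu_read_unlock();
 */
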
#endif /* CONFIG_PER_VMA_LOCK */

#ifdef CONFIG_LOCK_MM_AND_FIND_VMA
#include <linux/extable.h>

static inline bool get_mmap_lock_carefully(struct mm_struct *mm, struct pt_regs *regs)
{
	if (likely(mmap_read_trylock(mm)))
		return true;

	if (regs && !user_mode(regs)) {
		unsigned long ip = exception_ip(regs);
		if (!search_exception_tables(ip))
			return false;
	}

	return !mmap_read_lock_killable(mm);
}

static inline bool mmap_upgrade_trylock(struct mm_struct *mm)
{
	/*
	 * We don't have this operation yet.
	 *
	 * It should be easy enough to do: it's basically an
	 * atomic_long_try_cmpxchg_acquire()
	 * from RWSEM_READER_BIAS -> RWSEM_WRITER_LOCKED, but
	 * it also needs the proper lockdep magic etc.
	 */
	return false;
}

static inline bool upgrade_mmap_lock_carefully(struct mm_struct *mm, struct pt_regs *regs)
{
	mmap_read_unlock(mm);
	if (regs && !user_mode(regs)) {
		unsigned long ip = exception_ip(regs);
		if (!search_exception_tables(ip))
			return false;
	}
	return !mmap_write_lock_killable(mm);
}

/*
 * Helper for page fault handling.
 *
 * This is kind of equivalent to "mmap_read_lock()" followed
 * by "find_extend_vma()", except it's a lot more careful about
 * the locking (and will drop the lock on failure).
 *
 * For example, if we have a kernel bug that causes a page
 * fault, we don't want to just use mmap_read_lock() to get
 * the mm lock, because that would deadlock if the bug were
 * to happen while we're holding the mm lock for writing.
 *
 * So this checks the exception tables on kernel faults in
 * order to only do all this for instructions that are actually
 * expected to fault.
 *
 * We can also actually take the mm lock for writing if we
 * need to extend the vma, which helps the VM layer a lot.
 */
struct vm_area_struct *lock_mm_and_find_vma(struct mm_struct *mm,
			unsigned long addr, struct pt_regs *regs)
{
	struct vm_area_struct *vma;

	if (!get_mmap_lock_carefully(mm, regs))
		return NULL;

	vma = find_vma(mm, addr);
	if (likely(vma && (vma->vm_start <= addr)))
		return vma;

	/*
	 * Well, dang. We might still be successful, but only
	 * if we can extend a vma to do so.
	 */
	if (!vma || !(vma->vm_flags & VM_GROWSDOWN)) {
		mmap_read_unlock(mm);
		return NULL;
	}

	/*
	 * We can try to upgrade the mmap lock atomically,
	 * in which case we can continue to use the vma
	 * we already looked up.
	 *
	 * Otherwise we'll have to drop the mmap lock and
	 * re-take it, and also look up the vma again,
	 * re-checking it.
	 */
	if (!mmap_upgrade_trylock(mm)) {
		if (!upgrade_mmap_lock_carefully(mm, regs))
			return NULL;

		vma = find_vma(mm, addr);
		if (!vma)
			goto fail;
		if (vma->vm_start <= addr)
			goto success;
		if (!(vma->vm_flags & VM_GROWSDOWN))
			goto fail;
	}

	if (expand_stack_locked(vma, addr))
		goto fail;

success:
	mmap_write_downgrade(mm);
	return vma;

fail:
	mmap_write_unlock(mm);
	return NULL;
}
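
/*
 * Illustrative sketch, not code from this file: an architecture selecting
 * CONFIG_LOCK_MM_AND_FIND_VMA uses this helper in its fault handler roughly
 * as below; label names are placeholders and the details differ per
 * architecture:
 *
 *	vma = lock_mm_and_find_vma(mm, address, regs);
 *	if (!vma)
 *		goto bad_area_nosemaphore;	(mmap_lock already dropped)
 *	fault = handle_mm_fault(vma, address, flags, regs);
 *	...
 *	mmap_read_unlock(mm);			(read lock is held on success)
 */
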
#endif /* CONFIG_LOCK_MM_AND_FIND_VMA */

#else /* CONFIG_MMU */

/*
 * At least xtensa ends up having protection faults even with no
 * MMU.. No stack expansion, at least.
 */
struct vm_area_struct *lock_mm_and_find_vma(struct mm_struct *mm,
			unsigned long addr, struct pt_regs *regs)
{
	struct vm_area_struct *vma;

	mmap_read_lock(mm);
	vma = vma_lookup(mm, addr);
	if (!vma)
		mmap_read_unlock(mm);
	return vma;
}

#endif /* CONFIG_MMU */