// SPDX-License-Identifier: GPL-2.0
#define CREATE_TRACE_POINTS
#include <trace/events/mmap_lock.h>

#include <linux/mm.h>
#include <linux/cgroup.h>
#include <linux/memcontrol.h>
#include <linux/mmap_lock.h>
#include <linux/mutex.h>
#include <linux/percpu.h>
#include <linux/rcupdate.h>
#include <linux/smp.h>
#include <linux/trace_events.h>
#include <linux/local_lock.h>

EXPORT_TRACEPOINT_SYMBOL(mmap_lock_start_locking);
EXPORT_TRACEPOINT_SYMBOL(mmap_lock_acquire_returned);
EXPORT_TRACEPOINT_SYMBOL(mmap_lock_released);

#ifdef CONFIG_TRACING
/*
 * Trace calls must be in a separate file, as otherwise there's a circular
 * dependency between linux/mmap_lock.h and trace/events/mmap_lock.h.
 */

void __mmap_lock_do_trace_start_locking(struct mm_struct *mm, bool write)
{
	trace_mmap_lock_start_locking(mm, write);
}
EXPORT_SYMBOL(__mmap_lock_do_trace_start_locking);

void __mmap_lock_do_trace_acquire_returned(struct mm_struct *mm, bool write,
					   bool success)
{
	trace_mmap_lock_acquire_returned(mm, write, success);
}
EXPORT_SYMBOL(__mmap_lock_do_trace_acquire_returned);

void __mmap_lock_do_trace_released(struct mm_struct *mm, bool write)
{
	trace_mmap_lock_released(mm, write);
}
EXPORT_SYMBOL(__mmap_lock_do_trace_released);
#endif /* CONFIG_TRACING */

#ifdef CONFIG_MMU
#ifdef CONFIG_PER_VMA_LOCK

/* State shared across __vma_[start, end]_exclude_readers. */
struct vma_exclude_readers_state {
	/* Input parameters. */
	struct vm_area_struct *vma;
	int state;	/* TASK_KILLABLE or TASK_UNINTERRUPTIBLE. */
	bool detaching;

	/* Output parameters. */
	bool detached;
	bool exclusive;	/* Are we exclusively locked? */
};

/*
 * Now that all readers have been evicted, mark the VMA as being out of the
 * 'exclude readers' state.
 */
static void __vma_end_exclude_readers(struct vma_exclude_readers_state *ves)
{
	struct vm_area_struct *vma = ves->vma;

	VM_WARN_ON_ONCE(ves->detached);

	ves->detached = refcount_sub_and_test(VM_REFCNT_EXCLUDE_READERS_FLAG,
					      &vma->vm_refcnt);
	__vma_lockdep_release_exclusive(vma);
}

static unsigned int get_target_refcnt(struct vma_exclude_readers_state *ves)
{
	const unsigned int tgt = ves->detaching ? 0 : 1;

	return tgt | VM_REFCNT_EXCLUDE_READERS_FLAG;
}

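/*
 * Illustration of the target values computed above: once readers have been
 * excluded and drained, a writer that keeps the VMA attached waits for
 * vm_refcnt == (1 | VM_REFCNT_EXCLUDE_READERS_FLAG), i.e. only the attach
 * reference plus the exclusion flag remain, while a detaching writer waits
 * for vm_refcnt == VM_REFCNT_EXCLUDE_READERS_FLAG, i.e. nothing but the flag
 * itself remains.
 */
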
/*
 * Mark the VMA as being in a state of excluding readers, check to see if any
 * VMA read locks are indeed held, and if so wait for them to be released.
 *
 * Note that this function pairs with vma_refcount_put() which will wake up this
 * thread when it detects that the last reader has released its lock.
 *
 * The ves->state parameter ought to be set to TASK_UNINTERRUPTIBLE in cases
 * where we wish the thread to sleep uninterruptibly or TASK_KILLABLE if a fatal
 * signal is permitted to kill it.
 *
 * The function sets the ves->exclusive parameter to true if readers were
 * excluded, or false if the VMA was detached or an error arose on wait.
 *
 * If the function indicates an exclusive lock was acquired via ves->exclusive
 * the caller is required to invoke __vma_end_exclude_readers() once the
 * exclusive state is no longer required.
 *
 * If ves->state is set to something other than TASK_UNINTERRUPTIBLE, the
 * function may also return -EINTR to indicate a fatal signal was received while
 * waiting. Otherwise, the function returns 0.
 */
static int __vma_start_exclude_readers(struct vma_exclude_readers_state *ves)
{
	struct vm_area_struct *vma = ves->vma;
	unsigned int tgt_refcnt = get_target_refcnt(ves);
	int err = 0;

	mmap_assert_write_locked(vma->vm_mm);

	/*
	 * If vma is detached then only vma_mark_attached() can raise the
	 * vm_refcnt. mmap_write_lock prevents racing with vma_mark_attached().
	 *
	 * See the comment describing the vm_area_struct->vm_refcnt field for
	 * details of possible refcnt values.
	 */
	if (!refcount_add_not_zero(VM_REFCNT_EXCLUDE_READERS_FLAG, &vma->vm_refcnt)) {
		ves->detached = true;
		return 0;
	}

	__vma_lockdep_acquire_exclusive(vma);
	err = rcuwait_wait_event(&vma->vm_mm->vma_writer_wait,
				 refcount_read(&vma->vm_refcnt) == tgt_refcnt,
				 ves->state);
	if (err) {
		__vma_end_exclude_readers(ves);
		return err;
	}

	__vma_lockdep_stat_mark_acquired(vma);
	ves->exclusive = true;
	return 0;
}

int __vma_start_write(struct vm_area_struct *vma, int state)
{
	const unsigned int mm_lock_seq = __vma_raw_mm_seqnum(vma);
	struct vma_exclude_readers_state ves = {
		.vma = vma,
		.state = state,
	};
	int err;

	err = __vma_start_exclude_readers(&ves);
	if (err) {
		WARN_ON_ONCE(ves.detached);
		return err;
	}

	/*
	 * We should use WRITE_ONCE() here because we can have concurrent reads
	 * from the early lockless pessimistic check in vma_start_read().
	 * We don't really care about the correctness of that early check, but
	 * we should use WRITE_ONCE() for cleanliness and to keep KCSAN happy.
	 */
	WRITE_ONCE(vma->vm_lock_seq, mm_lock_seq);

	if (ves.exclusive) {
		__vma_end_exclude_readers(&ves);
		/* VMA should remain attached. */
		WARN_ON_ONCE(ves.detached);
	}

	return 0;
}
EXPORT_SYMBOL_GPL(__vma_start_write);

void __vma_exclude_readers_for_detach(struct vm_area_struct *vma)
{
	struct vma_exclude_readers_state ves = {
		.vma = vma,
		.state = TASK_UNINTERRUPTIBLE,
		.detaching = true,
	};
	int err;

	/*
	 * Wait until the VMA is detached with no readers. Since we hold the VMA
	 * write lock, the only read locks that might be present are those from
	 * threads trying to acquire the read lock and incrementing the
	 * reference count before realising the write lock is held and
	 * decrementing it.
	 */
	err = __vma_start_exclude_readers(&ves);
	if (!err && ves.exclusive) {
		/*
		 * Once this is complete, no readers can increment the
		 * reference count, and the VMA is marked detached.
		 */
		__vma_end_exclude_readers(&ves);
	}
	/* If an error arose but we were detached anyway, we don't care. */
	WARN_ON_ONCE(!ves.detached);
}

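/*
 * Typical caller-side shape for the write-lock path above, as a sketch only;
 * it assumes the usual vma_start_write() wrapper in <linux/mmap_lock.h>
 * forwards to __vma_start_write(vma, TASK_UNINTERRUPTIBLE):
 *
 *	mmap_write_lock(mm);
 *	vma = find_vma(mm, addr);
 *	if (vma) {
 *		vma_start_write(vma);	<- waits for existing readers to drain
 *		... modify the VMA ...
 *	}
 *	mmap_write_unlock(mm);
 */
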
/*
 * Try to read-lock a vma. The function is allowed to occasionally yield false
 * locked result to avoid performance overhead, in which case we fall back to
 * using mmap_lock. The function should never yield false unlocked result.
 * False locked result is possible if mm_lock_seq overflows or if vma gets
 * reused and attached to a different mm before we lock it.
 * Returns the vma on success, NULL on failure to lock, and ERR_PTR(-EAGAIN)
 * if the vma got detached.
 *
 * IMPORTANT: RCU lock must be held upon entering the function, but upon error
 * IT IS RELEASED. The caller must handle this correctly.
 */
static inline struct vm_area_struct *vma_start_read(struct mm_struct *mm,
						    struct vm_area_struct *vma)
{
	struct mm_struct *other_mm;
	int oldcnt;

	RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "no rcu lock held");
	/*
	 * Check before locking. A race might cause false locked result.
	 * We can use READ_ONCE() for the mm_lock_seq here, and don't need
	 * ACQUIRE semantics, because this is just a lockless check whose result
	 * we don't rely on for anything - the mm_lock_seq read against which we
	 * need ordering is below.
	 */
	if (READ_ONCE(vma->vm_lock_seq) == READ_ONCE(mm->mm_lock_seq.sequence)) {
		vma = NULL;
		goto err;
	}

	/*
	 * If VM_REFCNT_EXCLUDE_READERS_FLAG is set,
	 * __refcount_inc_not_zero_limited_acquire() will fail because
	 * VM_REFCNT_LIMIT is less than VM_REFCNT_EXCLUDE_READERS_FLAG.
	 *
	 * Acquire fence is required here to avoid reordering against later
	 * vm_lock_seq check and checks inside lock_vma_under_rcu().
	 */
	if (unlikely(!__refcount_inc_not_zero_limited_acquire(&vma->vm_refcnt, &oldcnt,
							      VM_REFCNT_LIMIT))) {
		/* return EAGAIN if vma got detached from under us */
		vma = oldcnt ? NULL : ERR_PTR(-EAGAIN);
		goto err;
	}

	__vma_lockdep_acquire_read(vma);

	if (unlikely(vma->vm_mm != mm))
		goto err_unstable;

	/*
	 * Overflow of vm_lock_seq/mm_lock_seq might produce false locked result.
	 * False unlocked result is impossible because we modify and check
	 * vma->vm_lock_seq under vma->vm_refcnt protection and mm->mm_lock_seq
	 * modification invalidates all existing locks.
	 *
	 * We must use ACQUIRE semantics for the mm_lock_seq so that if we are
	 * racing with vma_end_write_all(), we only start reading from the VMA
	 * after it has been unlocked.
	 * This pairs with RELEASE semantics in vma_end_write_all().
	 */
	if (unlikely(vma->vm_lock_seq == raw_read_seqcount(&mm->mm_lock_seq))) {
		vma_refcount_put(vma);
		vma = NULL;
		goto err;
	}

	return vma;
err:
	rcu_read_unlock();

	return vma;
err_unstable:
	/*
	 * If vma got attached to another mm from under us, that mm is not
	 * stable and can be freed in the narrow window after vma->vm_refcnt
	 * is dropped and before rcuwait_wake_up(mm) is called. Grab it before
	 * releasing vma->vm_refcnt.
	 */
	other_mm = vma->vm_mm; /* use a copy as vma can be freed after we drop vm_refcnt */

	/* __mmdrop() is a heavy operation, do it after dropping RCU lock. */
	rcu_read_unlock();
	mmgrab(other_mm);
	vma_refcount_put(vma);
	mmdrop(other_mm);

	return NULL;
}

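/*
 * Summary of the contract implemented above, for the callers below:
 *
 *  - returns the vma:          read lock acquired, RCU read lock still held
 *  - returns NULL:             could not lock (write-locked, refcount
 *                              saturated, sequence matched, or the vma moved
 *                              to another mm); RCU read lock already dropped
 *  - returns ERR_PTR(-EAGAIN): the vma was detached; RCU read lock already
 *                              dropped, the caller may redo the lookup
 */
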
/*
 * Lookup and lock a VMA under RCU protection. Returned VMA is guaranteed to be
 * stable and not isolated. If the VMA is not found or is being modified the
 * function returns NULL.
 */
struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm,
					  unsigned long address)
{
	MA_STATE(mas, &mm->mm_mt, address, address);
	struct vm_area_struct *vma;

retry:
	rcu_read_lock();
	vma = mas_walk(&mas);
	if (!vma) {
		rcu_read_unlock();
		goto inval;
	}

	vma = vma_start_read(mm, vma);
	if (IS_ERR_OR_NULL(vma)) {
		/* Check if the VMA got isolated after we found it */
		if (PTR_ERR(vma) == -EAGAIN) {
			count_vm_vma_lock_event(VMA_LOCK_MISS);
			/* The area was replaced with another one */
			mas_set(&mas, address);
			goto retry;
		}

		/* Failed to lock the VMA */
		goto inval;
	}
	/*
	 * At this point, we have a stable reference to a VMA: The VMA is
	 * locked and we know it hasn't already been isolated.
	 * From here on, we can access the VMA without worrying about which
	 * fields are accessible for RCU readers.
	 */
	rcu_read_unlock();

	/* Check if the vma we locked is the right one. */
	if (unlikely(address < vma->vm_start || address >= vma->vm_end)) {
		vma_end_read(vma);
		goto inval;
	}

	return vma;

inval:
	count_vm_vma_lock_event(VMA_LOCK_ABORT);
	return NULL;
}

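/*
 * Condensed sketch (not copied from any specific architecture) of how page
 * fault handlers typically consume lock_vma_under_rcu(); access checks and
 * retry accounting are omitted:
 *
 *	vma = lock_vma_under_rcu(mm, address);
 *	if (!vma)
 *		goto fall_back_to_mmap_lock;
 *	fault = handle_mm_fault(vma, address, flags | FAULT_FLAG_VMA_LOCK, regs);
 *	if (!(fault & (VM_FAULT_RETRY | VM_FAULT_COMPLETED)))
 *		vma_end_read(vma);
 */
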
static struct vm_area_struct *lock_next_vma_under_mmap_lock(struct mm_struct *mm,
							     struct vma_iterator *vmi,
							     unsigned long from_addr)
{
	struct vm_area_struct *vma;
	int ret;

	ret = mmap_read_lock_killable(mm);
	if (ret)
		return ERR_PTR(ret);

	/* Lookup the vma at the last position again under mmap_read_lock */
	vma_iter_set(vmi, from_addr);
	vma = vma_next(vmi);
	if (vma) {
		/* Very unlikely vma->vm_refcnt overflow case */
		if (unlikely(!vma_start_read_locked(vma)))
			vma = ERR_PTR(-EAGAIN);
	}

	mmap_read_unlock(mm);

	return vma;
}

struct vm_area_struct *lock_next_vma(struct mm_struct *mm,
				     struct vma_iterator *vmi,
				     unsigned long from_addr)
{
	struct vm_area_struct *vma;
	unsigned int mm_wr_seq;
	bool mmap_unlocked;

	RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "no rcu read lock held");
retry:
	/* Start mmap_lock speculation in case we need to verify the vma later */
	mmap_unlocked = mmap_lock_speculate_try_begin(mm, &mm_wr_seq);
	vma = vma_next(vmi);
	if (!vma)
		return NULL;

	vma = vma_start_read(mm, vma);
	if (IS_ERR_OR_NULL(vma)) {
		/*
		 * Retry immediately if the vma gets detached from under us.
		 * Infinite loop should not happen because the vma we find will
		 * have to be constantly knocked out from under us.
		 */
		if (PTR_ERR(vma) == -EAGAIN) {
			/* reset to search from the last address */
			rcu_read_lock();
			vma_iter_set(vmi, from_addr);
			goto retry;
		}

		goto fallback;
	}

	/* Verify the vma is not behind the last search position. */
	if (unlikely(from_addr >= vma->vm_end))
		goto fallback_unlock;

	/*
	 * vma can be ahead of the last search position but we need to verify
	 * it was not shrunk after we found it and another vma has not been
	 * installed ahead of it. Otherwise we might observe a gap that should
	 * not be there.
	 */
	if (from_addr < vma->vm_start) {
		/*
		 * Verify only if the address space might have changed since
		 * vma lookup.
		 */
		if (!mmap_unlocked || mmap_lock_speculate_retry(mm, mm_wr_seq)) {
			vma_iter_set(vmi, from_addr);
			if (vma != vma_next(vmi))
				goto fallback_unlock;
		}
	}

	return vma;

fallback_unlock:
	rcu_read_unlock();
	vma_end_read(vma);
fallback:
	vma = lock_next_vma_under_mmap_lock(mm, vmi, from_addr);
	rcu_read_lock();
	/* Reinitialize the iterator after re-entering rcu read section */
	vma_iter_set(vmi, IS_ERR_OR_NULL(vma) ? from_addr : vma->vm_end);

	return vma;
}

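/*
 * Sketch of the iteration pattern lock_next_vma() is built for (e.g. lockless
 * /proc/<pid>/maps style readers); reporting and error handling are elided,
 * and the caller-side details are illustrative rather than a copy of an
 * in-tree user:
 *
 *	rcu_read_lock();
 *	vma_iter_init(&vmi, mm, addr);
 *	for (;;) {
 *		vma = lock_next_vma(mm, &vmi, addr);
 *		if (IS_ERR_OR_NULL(vma))
 *			break;
 *		addr = vma->vm_end;
 *		... report the vma ...
 *		vma_end_read(vma);
 *	}
 *	rcu_read_unlock();
 */
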
#endif /* CONFIG_PER_VMA_LOCK */

#ifdef CONFIG_LOCK_MM_AND_FIND_VMA
#include <linux/extable.h>

static inline bool get_mmap_lock_carefully(struct mm_struct *mm, struct pt_regs *regs)
{
	if (likely(mmap_read_trylock(mm)))
		return true;

	if (regs && !user_mode(regs)) {
		unsigned long ip = exception_ip(regs);
		if (!search_exception_tables(ip))
			return false;
	}

	return !mmap_read_lock_killable(mm);
}

static inline bool mmap_upgrade_trylock(struct mm_struct *mm)
{
	/*
	 * We don't have this operation yet.
	 *
	 * It should be easy enough to do: it's basically a
	 *    atomic_long_try_cmpxchg_acquire()
	 * from RWSEM_READER_BIAS -> RWSEM_WRITER_LOCKED, but
	 * it also needs the proper lockdep magic etc.
	 */
	return false;
}

static inline bool upgrade_mmap_lock_carefully(struct mm_struct *mm, struct pt_regs *regs)
{
	mmap_read_unlock(mm);
	if (regs && !user_mode(regs)) {
		unsigned long ip = exception_ip(regs);
		if (!search_exception_tables(ip))
			return false;
	}
	return !mmap_write_lock_killable(mm);
}

/*
 * Helper for page fault handling.
 *
 * This is kind of equivalent to "mmap_read_lock()" followed
 * by "find_extend_vma()", except it's a lot more careful about
 * the locking (and will drop the lock on failure).
 *
 * For example, if we have a kernel bug that causes a page
 * fault, we don't want to just use mmap_read_lock() to get
 * the mm lock, because that would deadlock if the bug were
 * to happen while we're holding the mm lock for writing.
 *
 * So this checks the exception tables on kernel faults in
 * order to only do this all for instructions that are actually
 * expected to fault.
 *
 * We can also actually take the mm lock for writing if we
 * need to extend the vma, which helps the VM layer a lot.
 */
struct vm_area_struct *lock_mm_and_find_vma(struct mm_struct *mm,
			unsigned long addr, struct pt_regs *regs)
{
	struct vm_area_struct *vma;

	if (!get_mmap_lock_carefully(mm, regs))
		return NULL;

	vma = find_vma(mm, addr);
	if (likely(vma && (vma->vm_start <= addr)))
		return vma;

	/*
	 * Well, dang. We might still be successful, but only
	 * if we can extend a vma to do so.
	 */
	if (!vma || !(vma->vm_flags & VM_GROWSDOWN)) {
		mmap_read_unlock(mm);
		return NULL;
	}

	/*
	 * We can try to upgrade the mmap lock atomically,
	 * in which case we can continue to use the vma
	 * we already looked up.
	 *
	 * Otherwise we'll have to drop the mmap lock and
	 * re-take it, and also look up the vma again,
	 * re-checking it.
	 */
	if (!mmap_upgrade_trylock(mm)) {
		if (!upgrade_mmap_lock_carefully(mm, regs))
			return NULL;

		vma = find_vma(mm, addr);
		if (!vma)
			goto fail;
		if (vma->vm_start <= addr)
			goto success;
		if (!(vma->vm_flags & VM_GROWSDOWN))
			goto fail;
	}

	if (expand_stack_locked(vma, addr))
		goto fail;

success:
	mmap_write_downgrade(mm);
	return vma;

fail:
	mmap_write_unlock(mm);
	return NULL;
}
#endif /* CONFIG_LOCK_MM_AND_FIND_VMA */

#else /* CONFIG_MMU */

/*
 * At least xtensa ends up having protection faults even with no
 * MMU.. No stack expansion, at least.
 */
struct vm_area_struct *lock_mm_and_find_vma(struct mm_struct *mm,
			unsigned long addr, struct pt_regs *regs)
{
	struct vm_area_struct *vma;

	mmap_read_lock(mm);
	vma = vma_lookup(mm, addr);
	if (!vma)
		mmap_read_unlock(mm);
	return vma;
}

#endif /* CONFIG_MMU */

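/*
 * Caller-side contract for lock_mm_and_find_vma() in both variants above: on
 * success the mm is read-locked and the vma is returned; on failure the lock
 * has already been dropped (or was never taken), so the caller must not
 * unlock. A typical fault-handler shape, with illustrative names:
 *
 *	vma = lock_mm_and_find_vma(mm, address, regs);
 *	if (unlikely(!vma))
 *		return bad_area_nosemaphore(regs, address);	<- mmap lock not held
 *	fault = handle_mm_fault(vma, address, flags, regs);
 *	...
 *	mmap_read_unlock(mm);
 */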