// SPDX-License-Identifier: GPL-2.0
#define CREATE_TRACE_POINTS
#include <trace/events/mmap_lock.h>

#include <linux/mm.h>
#include <linux/cgroup.h>
#include <linux/memcontrol.h>
#include <linux/mmap_lock.h>
#include <linux/mutex.h>
#include <linux/percpu.h>
#include <linux/rcupdate.h>
#include <linux/smp.h>
#include <linux/trace_events.h>
#include <linux/local_lock.h>

EXPORT_TRACEPOINT_SYMBOL(mmap_lock_start_locking);
EXPORT_TRACEPOINT_SYMBOL(mmap_lock_acquire_returned);
EXPORT_TRACEPOINT_SYMBOL(mmap_lock_released);

#ifdef CONFIG_TRACING
/*
 * Trace calls must be in a separate file, as otherwise there's a circular
 * dependency between linux/mmap_lock.h and trace/events/mmap_lock.h.
 */

void __mmap_lock_do_trace_start_locking(struct mm_struct *mm, bool write)
{
        trace_mmap_lock_start_locking(mm, write);
}
EXPORT_SYMBOL(__mmap_lock_do_trace_start_locking);

void __mmap_lock_do_trace_acquire_returned(struct mm_struct *mm, bool write,
                                           bool success)
{
        trace_mmap_lock_acquire_returned(mm, write, success);
}
EXPORT_SYMBOL(__mmap_lock_do_trace_acquire_returned);

void __mmap_lock_do_trace_released(struct mm_struct *mm, bool write)
{
        trace_mmap_lock_released(mm, write);
}
EXPORT_SYMBOL(__mmap_lock_do_trace_released);
#endif /* CONFIG_TRACING */

#ifdef CONFIG_MMU
#ifdef CONFIG_PER_VMA_LOCK
/*
 * Return value: 0 if vma detached,
 *               1 if vma attached with no readers,
 *               -EINTR if signal received.
 */
static inline int __vma_enter_locked(struct vm_area_struct *vma,
                                     bool detaching, int state)
{
        int err;
        unsigned int tgt_refcnt = VMA_LOCK_OFFSET;

        /* Additional refcnt if the vma is attached. */
        if (!detaching)
                tgt_refcnt++;

        /*
         * If vma is detached then only vma_mark_attached() can raise the
         * vm_refcnt. mmap_write_lock prevents racing with vma_mark_attached().
         */
        if (!refcount_add_not_zero(VMA_LOCK_OFFSET, &vma->vm_refcnt))
                return 0;

        rwsem_acquire(&vma->vmlock_dep_map, 0, 0, _RET_IP_);
        err = rcuwait_wait_event(&vma->vm_mm->vma_writer_wait,
                                 refcount_read(&vma->vm_refcnt) == tgt_refcnt,
                                 state);
        if (err) {
                rwsem_release(&vma->vmlock_dep_map, _RET_IP_);
                return err;
        }
        lock_acquired(&vma->vmlock_dep_map, _RET_IP_);

        return 1;
}

static inline void __vma_exit_locked(struct vm_area_struct *vma, bool *detached)
{
        *detached = refcount_sub_and_test(VMA_LOCK_OFFSET, &vma->vm_refcnt);
        rwsem_release(&vma->vmlock_dep_map, _RET_IP_);
}

int __vma_start_write(struct vm_area_struct *vma, unsigned int mm_lock_seq,
                      int state)
{
        int locked;

        /*
         * __vma_enter_locked() returns 0 immediately if the vma is not
         * attached; otherwise it waits until the refcount indicates that the
         * vma is attached with no readers, or a signal interrupts the wait.
         */
        locked = __vma_enter_locked(vma, false, state);
        if (locked < 0)
                return locked;

        /*
         * We should use WRITE_ONCE() here because we can have concurrent reads
         * from the early lockless pessimistic check in vma_start_read().
         * We don't really care about the correctness of that early check, but
         * we should use WRITE_ONCE() for cleanliness and to keep KCSAN happy.
         */
        WRITE_ONCE(vma->vm_lock_seq, mm_lock_seq);

        if (locked) {
                bool detached;

                __vma_exit_locked(vma, &detached);
                WARN_ON_ONCE(detached); /* vma should remain attached */
        }

        return 0;
}
EXPORT_SYMBOL_GPL(__vma_start_write);

void vma_mark_detached(struct vm_area_struct *vma)
{
        vma_assert_write_locked(vma);
        vma_assert_attached(vma);

        /*
         * We are the only writer, so no need to use vma_refcount_put().
         * The condition below is unlikely because the vma has already been
         * write-locked and readers can increment vm_refcnt only temporarily
         * before they check vm_lock_seq, realize the vma is locked and drop
         * back the vm_refcnt. That is a narrow window for observing a raised
         * vm_refcnt.
         */
        if (unlikely(!refcount_dec_and_test(&vma->vm_refcnt))) {
                /* Wait until vma is detached with no readers. */
                if (__vma_enter_locked(vma, true, TASK_UNINTERRUPTIBLE)) {
                        bool detached;

                        __vma_exit_locked(vma, &detached);
                        WARN_ON_ONCE(!detached);
                }
        }
}
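/*
 * Illustrative sketch of how the write-lock side above is expected to be
 * used. This is not a call site in this file; the real callers live in the
 * vma manipulation code under mm/, and the snippet only shows the intended
 * ordering:
 *
 *        mmap_write_lock(mm);
 *        vma_start_write(vma);           // wait for existing readers to drain
 *        ...modify the vma, or remove it from the tree and call
 *           vma_mark_detached()...
 *        mmap_write_unlock(mm);          // vma_end_write_all() invalidates all
 *                                        // vma write locks by bumping mm_lock_seq
 */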

/*
 * Try to read-lock a vma. The function is allowed to occasionally yield false
 * locked result to avoid performance overhead, in which case we fall back to
 * using mmap_lock. The function should never yield false unlocked result.
 * False locked result is possible if mm_lock_seq overflows or if vma gets
 * reused and attached to a different mm before we lock it.
 * Returns the vma on success, NULL on failure to lock and ERR_PTR(-EAGAIN) if
 * the vma got detached.
 *
 * IMPORTANT: RCU lock must be held upon entering the function, but upon error
 * IT IS RELEASED. The caller must handle this correctly.
 */
static inline struct vm_area_struct *vma_start_read(struct mm_struct *mm,
                                                    struct vm_area_struct *vma)
{
        struct mm_struct *other_mm;
        int oldcnt;

        RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "no rcu lock held");
        /*
         * Check before locking. A race might cause false locked result.
         * We can use READ_ONCE() for the mm_lock_seq here, and don't need
         * ACQUIRE semantics, because this is just a lockless check whose result
         * we don't rely on for anything - the mm_lock_seq read against which we
         * need ordering is below.
         */
        if (READ_ONCE(vma->vm_lock_seq) == READ_ONCE(mm->mm_lock_seq.sequence)) {
                vma = NULL;
                goto err;
        }

        /*
         * If VMA_LOCK_OFFSET is set, __refcount_inc_not_zero_limited_acquire()
         * will fail because VMA_REF_LIMIT is less than VMA_LOCK_OFFSET.
         * Acquire fence is required here to avoid reordering against later
         * vm_lock_seq check and checks inside lock_vma_under_rcu().
         */
        if (unlikely(!__refcount_inc_not_zero_limited_acquire(&vma->vm_refcnt, &oldcnt,
                                                              VMA_REF_LIMIT))) {
                /* return EAGAIN if vma got detached from under us */
                vma = oldcnt ? NULL : ERR_PTR(-EAGAIN);
                goto err;
        }

        rwsem_acquire_read(&vma->vmlock_dep_map, 0, 1, _RET_IP_);

        if (unlikely(vma->vm_mm != mm))
                goto err_unstable;

        /*
         * Overflow of vm_lock_seq/mm_lock_seq might produce false locked result.
         * False unlocked result is impossible because we modify and check
         * vma->vm_lock_seq under vma->vm_refcnt protection and mm->mm_lock_seq
         * modification invalidates all existing locks.
         *
         * We must use ACQUIRE semantics for the mm_lock_seq so that if we are
         * racing with vma_end_write_all(), we only start reading from the VMA
         * after it has been unlocked.
         * This pairs with RELEASE semantics in vma_end_write_all().
         */
        if (unlikely(vma->vm_lock_seq == raw_read_seqcount(&mm->mm_lock_seq))) {
                vma_refcount_put(vma);
                vma = NULL;
                goto err;
        }

        return vma;
err:
        rcu_read_unlock();

        return vma;
err_unstable:
        /*
         * If vma got attached to another mm from under us, that mm is not
         * stable and can be freed in the narrow window after vma->vm_refcnt
         * is dropped and before rcuwait_wake_up(mm) is called. Grab it before
         * releasing vma->vm_refcnt.
         */
        other_mm = vma->vm_mm; /* use a copy as vma can be freed after we drop vm_refcnt */

        /* __mmdrop() is a heavy operation, do it after dropping RCU lock. */
        rcu_read_unlock();
        mmgrab(other_mm);
        vma_refcount_put(vma);
        mmdrop(other_mm);

        return NULL;
}

/*
 * Lookup and lock a VMA under RCU protection. Returned VMA is guaranteed to be
 * stable and not isolated. If the VMA is not found or is being modified the
 * function returns NULL.
 */
struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm,
                                          unsigned long address)
{
        MA_STATE(mas, &mm->mm_mt, address, address);
        struct vm_area_struct *vma;

retry:
        rcu_read_lock();
        vma = mas_walk(&mas);
        if (!vma) {
                rcu_read_unlock();
                goto inval;
        }

        vma = vma_start_read(mm, vma);
        if (IS_ERR_OR_NULL(vma)) {
                /* Check if the VMA got isolated after we found it */
                if (PTR_ERR(vma) == -EAGAIN) {
                        count_vm_vma_lock_event(VMA_LOCK_MISS);
                        /* The area was replaced with another one */
                        goto retry;
                }

                /* Failed to lock the VMA */
                goto inval;
        }
        /*
         * At this point, we have a stable reference to a VMA: The VMA is
         * locked and we know it hasn't already been isolated.
         * From here on, we can access the VMA without worrying about which
         * fields are accessible for RCU readers.
         */
        rcu_read_unlock();

        /* Check if the vma we locked is the right one. */
        if (unlikely(address < vma->vm_start || address >= vma->vm_end)) {
                vma_end_read(vma);
                goto inval;
        }

        return vma;

inval:
        count_vm_vma_lock_event(VMA_LOCK_ABORT);
        return NULL;
}
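/*
 * Illustrative sketch (simplified) of the expected lock_vma_under_rcu()
 * caller pattern in the arch page fault handlers; see those handlers for the
 * real code and the full set of checks:
 *
 *        vma = lock_vma_under_rcu(mm, address);
 *        if (!vma)
 *                ...fall back to the mmap_lock path...
 *        fault = handle_mm_fault(vma, address, flags | FAULT_FLAG_VMA_LOCK, regs);
 *        if (!(fault & (VM_FAULT_RETRY | VM_FAULT_COMPLETED)))
 *                vma_end_read(vma);
 *
 * The caller owns a read lock on the returned vma and must drop it with
 * vma_end_read() unless the fault handling code has already released it.
 */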

static struct vm_area_struct *lock_next_vma_under_mmap_lock(struct mm_struct *mm,
                                                             struct vma_iterator *vmi,
                                                             unsigned long from_addr)
{
        struct vm_area_struct *vma;
        int ret;

        ret = mmap_read_lock_killable(mm);
        if (ret)
                return ERR_PTR(ret);

        /* Lookup the vma at the last position again under mmap_read_lock */
        vma_iter_set(vmi, from_addr);
        vma = vma_next(vmi);
        if (vma) {
                /* Very unlikely vma->vm_refcnt overflow case */
                if (unlikely(!vma_start_read_locked(vma)))
                        vma = ERR_PTR(-EAGAIN);
        }

        mmap_read_unlock(mm);

        return vma;
}

struct vm_area_struct *lock_next_vma(struct mm_struct *mm,
                                     struct vma_iterator *vmi,
                                     unsigned long from_addr)
{
        struct vm_area_struct *vma;
        unsigned int mm_wr_seq;
        bool mmap_unlocked;

        RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "no rcu read lock held");
retry:
        /* Start mmap_lock speculation in case we need to verify the vma later */
        mmap_unlocked = mmap_lock_speculate_try_begin(mm, &mm_wr_seq);
        vma = vma_next(vmi);
        if (!vma)
                return NULL;

        vma = vma_start_read(mm, vma);
        if (IS_ERR_OR_NULL(vma)) {
                /*
                 * Retry immediately if the vma gets detached from under us.
                 * Infinite loop should not happen because the vma we find will
                 * have to be constantly knocked out from under us.
                 */
                if (PTR_ERR(vma) == -EAGAIN) {
                        /* reset to search from the last address */
                        rcu_read_lock();
                        vma_iter_set(vmi, from_addr);
                        goto retry;
                }

                goto fallback;
        }

        /* Verify the vma is not behind the last search position. */
        if (unlikely(from_addr >= vma->vm_end))
                goto fallback_unlock;

        /*
         * vma can be ahead of the last search position but we need to verify
         * it was not shrunk after we found it and another vma has not been
         * installed ahead of it. Otherwise we might observe a gap that should
         * not be there.
         */
        if (from_addr < vma->vm_start) {
                /* Verify only if the address space might have changed since vma lookup. */
                if (!mmap_unlocked || mmap_lock_speculate_retry(mm, mm_wr_seq)) {
                        vma_iter_set(vmi, from_addr);
                        if (vma != vma_next(vmi))
                                goto fallback_unlock;
                }
        }

        return vma;

fallback_unlock:
        rcu_read_unlock();
        vma_end_read(vma);
fallback:
        vma = lock_next_vma_under_mmap_lock(mm, vmi, from_addr);
        rcu_read_lock();
        /* Reinitialize the iterator after re-entering rcu read section */
        vma_iter_set(vmi, IS_ERR_OR_NULL(vma) ? from_addr : vma->vm_end);

        return vma;
}
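/*
 * Illustrative sketch of the intended lock_next_vma() iteration pattern
 * (roughly what the lockless /proc/<pid>/maps reading does); error handling
 * is simplified and the reporting step is only a placeholder:
 *
 *        VMA_ITERATOR(vmi, mm, 0);
 *        unsigned long addr = 0;
 *
 *        rcu_read_lock();
 *        while ((vma = lock_next_vma(mm, &vmi, addr)) != NULL) {
 *                if (IS_ERR(vma))
 *                        break;          // e.g. -EINTR from the mmap_lock fallback
 *                ...report the vma...
 *                addr = vma->vm_end;
 *                vma_end_read(vma);
 *        }
 *        rcu_read_unlock();
 *
 * lock_next_vma() re-takes the RCU read lock itself after falling back to
 * mmap_lock, so the caller stays inside a single RCU read-side section.
 */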
#endif /* CONFIG_PER_VMA_LOCK */

#ifdef CONFIG_LOCK_MM_AND_FIND_VMA
#include <linux/extable.h>

static inline bool get_mmap_lock_carefully(struct mm_struct *mm, struct pt_regs *regs)
{
        if (likely(mmap_read_trylock(mm)))
                return true;

        if (regs && !user_mode(regs)) {
                unsigned long ip = exception_ip(regs);
                if (!search_exception_tables(ip))
                        return false;
        }

        return !mmap_read_lock_killable(mm);
}

static inline bool mmap_upgrade_trylock(struct mm_struct *mm)
{
        /*
         * We don't have this operation yet.
         *
         * It should be easy enough to do: it's basically an
         * atomic_long_try_cmpxchg_acquire()
         * from RWSEM_READER_BIAS -> RWSEM_WRITER_LOCKED, but
         * it also needs the proper lockdep magic etc.
         */
        return false;
}

static inline bool upgrade_mmap_lock_carefully(struct mm_struct *mm, struct pt_regs *regs)
{
        mmap_read_unlock(mm);
        if (regs && !user_mode(regs)) {
                unsigned long ip = exception_ip(regs);
                if (!search_exception_tables(ip))
                        return false;
        }
        return !mmap_write_lock_killable(mm);
}

/*
 * Helper for page fault handling.
 *
 * This is kind of equivalent to "mmap_read_lock()" followed
 * by "find_extend_vma()", except it's a lot more careful about
 * the locking (and will drop the lock on failure).
 *
 * For example, if we have a kernel bug that causes a page
 * fault, we don't want to just use mmap_read_lock() to get
 * the mm lock, because that would deadlock if the bug were
 * to happen while we're holding the mm lock for writing.
 *
 * So this checks the exception tables on kernel faults in
 * order to only do all this for instructions that are actually
 * expected to fault.
 *
 * We can also actually take the mm lock for writing if we
 * need to extend the vma, which helps the VM layer a lot.
 */
struct vm_area_struct *lock_mm_and_find_vma(struct mm_struct *mm,
                        unsigned long addr, struct pt_regs *regs)
{
        struct vm_area_struct *vma;

        if (!get_mmap_lock_carefully(mm, regs))
                return NULL;

        vma = find_vma(mm, addr);
        if (likely(vma && (vma->vm_start <= addr)))
                return vma;

        /*
         * Well, dang. We might still be successful, but only
         * if we can extend a vma to do so.
         */
        if (!vma || !(vma->vm_flags & VM_GROWSDOWN)) {
                mmap_read_unlock(mm);
                return NULL;
        }

        /*
         * We can try to upgrade the mmap lock atomically,
         * in which case we can continue to use the vma
         * we already looked up.
         *
         * Otherwise we'll have to drop the mmap lock and
         * re-take it, and also look up the vma again,
         * re-checking it.
         */
        if (!mmap_upgrade_trylock(mm)) {
                if (!upgrade_mmap_lock_carefully(mm, regs))
                        return NULL;

                vma = find_vma(mm, addr);
                if (!vma)
                        goto fail;
                if (vma->vm_start <= addr)
                        goto success;
                if (!(vma->vm_flags & VM_GROWSDOWN))
                        goto fail;
        }

        if (expand_stack_locked(vma, addr))
                goto fail;

success:
        mmap_write_downgrade(mm);
        return vma;

fail:
        mmap_write_unlock(mm);
        return NULL;
}
#endif /* CONFIG_LOCK_MM_AND_FIND_VMA */

#else /* CONFIG_MMU */

/*
 * At least xtensa ends up having protection faults even with no
 * MMU.. No stack expansion, at least.
 */
struct vm_area_struct *lock_mm_and_find_vma(struct mm_struct *mm,
                        unsigned long addr, struct pt_regs *regs)
{
        struct vm_area_struct *vma;

        mmap_read_lock(mm);
        vma = vma_lookup(mm, addr);
        if (!vma)
                mmap_read_unlock(mm);
        return vma;
}

#endif /* CONFIG_MMU */
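
/*
 * Illustrative sketch of how lock_mm_and_find_vma() (either variant above) is
 * expected to be called from an arch fault handler; retry/completion handling
 * is omitted and the real handlers live in the per-arch mm fault code:
 *
 *        vma = lock_mm_and_find_vma(mm, address, regs);
 *        if (!vma)
 *                ...no usable vma; mmap_lock is NOT held at this point...
 *        fault = handle_mm_fault(vma, address, flags, regs);
 *        mmap_read_unlock(mm);
 *
 * On success the mm is returned read-locked (the write lock taken for stack
 * expansion is downgraded before returning); on failure the lock has already
 * been dropped.
 */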