// SPDX-License-Identifier: GPL-2.0
#define CREATE_TRACE_POINTS
#include <trace/events/mmap_lock.h>

#include <linux/mm.h>
#include <linux/cgroup.h>
#include <linux/memcontrol.h>
#include <linux/mmap_lock.h>
#include <linux/mutex.h>
#include <linux/percpu.h>
#include <linux/rcupdate.h>
#include <linux/smp.h>
#include <linux/trace_events.h>
#include <linux/local_lock.h>

EXPORT_TRACEPOINT_SYMBOL(mmap_lock_start_locking);
EXPORT_TRACEPOINT_SYMBOL(mmap_lock_acquire_returned);
EXPORT_TRACEPOINT_SYMBOL(mmap_lock_released);

#ifdef CONFIG_TRACING
/*
 * Trace calls must be in a separate file, as otherwise there's a circular
 * dependency between linux/mmap_lock.h and trace/events/mmap_lock.h.
 */

void __mmap_lock_do_trace_start_locking(struct mm_struct *mm, bool write)
{
        trace_mmap_lock_start_locking(mm, write);
}
EXPORT_SYMBOL(__mmap_lock_do_trace_start_locking);

void __mmap_lock_do_trace_acquire_returned(struct mm_struct *mm, bool write,
                                           bool success)
{
        trace_mmap_lock_acquire_returned(mm, write, success);
}
EXPORT_SYMBOL(__mmap_lock_do_trace_acquire_returned);

void __mmap_lock_do_trace_released(struct mm_struct *mm, bool write)
{
        trace_mmap_lock_released(mm, write);
}
EXPORT_SYMBOL(__mmap_lock_do_trace_released);
#endif /* CONFIG_TRACING */

#ifdef CONFIG_MMU
#ifdef CONFIG_PER_VMA_LOCK
static inline bool __vma_enter_locked(struct vm_area_struct *vma, bool detaching)
{
        unsigned int tgt_refcnt = VMA_LOCK_OFFSET;

        /* Additional refcnt if the vma is attached. */
        if (!detaching)
                tgt_refcnt++;

        /*
         * If vma is detached then only vma_mark_attached() can raise the
         * vm_refcnt. mmap_write_lock prevents racing with vma_mark_attached().
         */
        if (!refcount_add_not_zero(VMA_LOCK_OFFSET, &vma->vm_refcnt))
                return false;

        rwsem_acquire(&vma->vmlock_dep_map, 0, 0, _RET_IP_);
        rcuwait_wait_event(&vma->vm_mm->vma_writer_wait,
                           refcount_read(&vma->vm_refcnt) == tgt_refcnt,
                           TASK_UNINTERRUPTIBLE);
        lock_acquired(&vma->vmlock_dep_map, _RET_IP_);

        return true;
}

static inline void __vma_exit_locked(struct vm_area_struct *vma, bool *detached)
{
        *detached = refcount_sub_and_test(VMA_LOCK_OFFSET, &vma->vm_refcnt);
        rwsem_release(&vma->vmlock_dep_map, _RET_IP_);
}

void __vma_start_write(struct vm_area_struct *vma, unsigned int mm_lock_seq)
{
        bool locked;

        /*
         * __vma_enter_locked() returns false immediately if the vma is not
         * attached, otherwise it waits until the refcnt indicates that the
         * vma is attached with no readers.
         */
        locked = __vma_enter_locked(vma, false);

        /*
         * We should use WRITE_ONCE() here because we can have concurrent reads
         * from the early lockless pessimistic check in vma_start_read().
         * We don't really care about the correctness of that early check, but
         * we should use WRITE_ONCE() for cleanliness and to keep KCSAN happy.
         */
        WRITE_ONCE(vma->vm_lock_seq, mm_lock_seq);

        if (locked) {
                bool detached;

                __vma_exit_locked(vma, &detached);
                WARN_ON_ONCE(detached); /* vma should remain attached */
        }
}
EXPORT_SYMBOL_GPL(__vma_start_write);
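
/*
 * Illustrative sketch, not part of the original file: the write-side pattern
 * __vma_start_write() is built for. A writer excludes other writers with the
 * mmap_lock, write-locks the individual vma through the vma_start_write()
 * wrapper (which ends up here), modifies it, and releases all vma write locks
 * at once when mmap_write_unlock() bumps mm_lock_seq via vma_end_write_all().
 * The helper name example_modify_vma() is hypothetical.
 */
#if 0 /* illustrative sketch, not compiled */
static void example_modify_vma(struct mm_struct *mm, struct vm_area_struct *vma)
{
        mmap_write_lock(mm);            /* excludes other writers */
        vma_start_write(vma);           /* waits until current readers drain */
        /* ... modify vma fields here ... */
        mmap_write_unlock(mm);          /* vma_end_write_all() unlocks every vma */
}
#endif /* illustrative sketch */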

void vma_mark_detached(struct vm_area_struct *vma)
{
        vma_assert_write_locked(vma);
        vma_assert_attached(vma);

        /*
         * We are the only writer, so no need to use vma_refcount_put().
         * The condition below is unlikely because the vma has already been
         * write-locked and readers can increment vm_refcnt only temporarily,
         * before they check vm_lock_seq, realize the vma is locked and drop
         * the vm_refcnt again. That is a narrow window for observing a raised
         * vm_refcnt.
         */
        if (unlikely(!refcount_dec_and_test(&vma->vm_refcnt))) {
                /* Wait until vma is detached with no readers. */
                if (__vma_enter_locked(vma, true)) {
                        bool detached;

                        __vma_exit_locked(vma, &detached);
                        WARN_ON_ONCE(!detached);
                }
        }
}

/*
 * Try to read-lock a vma. The function is allowed to occasionally yield a
 * false locked result to avoid performance overhead, in which case we fall
 * back to using mmap_lock. The function should never yield a false unlocked
 * result. A false locked result is possible if mm_lock_seq overflows or if
 * the vma gets reused and attached to a different mm before we lock it.
 * Returns the vma on success, NULL on failure to lock, and ERR_PTR(-EAGAIN)
 * if the vma got detached.
 *
 * IMPORTANT: RCU lock must be held upon entering the function, but upon error
 * IT IS RELEASED. The caller must handle this correctly.
 */
static inline struct vm_area_struct *vma_start_read(struct mm_struct *mm,
                                                    struct vm_area_struct *vma)
{
        struct mm_struct *other_mm;
        int oldcnt;

        RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "no rcu lock held");
        /*
         * Check before locking. A race might cause a false locked result.
         * We can use READ_ONCE() for the mm_lock_seq here, and don't need
         * ACQUIRE semantics, because this is just a lockless check whose result
         * we don't rely on for anything - the mm_lock_seq read against which we
         * need ordering is below.
         */
        if (READ_ONCE(vma->vm_lock_seq) == READ_ONCE(mm->mm_lock_seq.sequence)) {
                vma = NULL;
                goto err;
        }

        /*
         * If VMA_LOCK_OFFSET is set, __refcount_inc_not_zero_limited_acquire()
         * will fail because VMA_REF_LIMIT is less than VMA_LOCK_OFFSET.
         * Acquire fence is required here to avoid reordering against later
         * vm_lock_seq check and checks inside lock_vma_under_rcu().
         */
        if (unlikely(!__refcount_inc_not_zero_limited_acquire(&vma->vm_refcnt, &oldcnt,
                                                              VMA_REF_LIMIT))) {
                /* return EAGAIN if vma got detached from under us */
                vma = oldcnt ? NULL : ERR_PTR(-EAGAIN);
                goto err;
        }

        rwsem_acquire_read(&vma->vmlock_dep_map, 0, 1, _RET_IP_);

        if (unlikely(vma->vm_mm != mm))
                goto err_unstable;

        /*
         * Overflow of vm_lock_seq/mm_lock_seq might produce a false locked
         * result. A false unlocked result is impossible because we modify and
         * check vma->vm_lock_seq under vma->vm_refcnt protection and
         * mm->mm_lock_seq modification invalidates all existing locks.
         *
         * We must use ACQUIRE semantics for the mm_lock_seq so that if we are
         * racing with vma_end_write_all(), we only start reading from the VMA
         * after it has been unlocked.
         * This pairs with RELEASE semantics in vma_end_write_all().
         */
        if (unlikely(vma->vm_lock_seq == raw_read_seqcount(&mm->mm_lock_seq))) {
                vma_refcount_put(vma);
                vma = NULL;
                goto err;
        }

        return vma;
err:
        rcu_read_unlock();

        return vma;
err_unstable:
        /*
         * If vma got attached to another mm from under us, that mm is not
         * stable and can be freed in the narrow window after vma->vm_refcnt
         * is dropped and before rcuwait_wake_up(mm) is called. Grab it before
         * releasing vma->vm_refcnt.
         */
        other_mm = vma->vm_mm; /* use a copy as vma can be freed after we drop vm_refcnt */

        /* __mmdrop() is a heavy operation, do it after dropping RCU lock. */
        rcu_read_unlock();
        mmgrab(other_mm);
        vma_refcount_put(vma);
        mmdrop(other_mm);

        return NULL;
}
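
/*
 * Illustrative sketch, not part of the original file: the calling convention
 * that both lookup helpers below rely on. vma_start_read() must be entered
 * under rcu_read_lock(); on a NULL or ERR_PTR() result it has already dropped
 * the RCU read lock, while on success the caller keeps RCU held until it is
 * done inspecting the vma and eventually pairs the lock with vma_end_read().
 * The helper name example_lookup_and_lock() is hypothetical.
 */
#if 0 /* illustrative sketch, not compiled */
static struct vm_area_struct *example_lookup_and_lock(struct mm_struct *mm,
                                                      unsigned long address)
{
        MA_STATE(mas, &mm->mm_mt, address, address);
        struct vm_area_struct *vma;

        rcu_read_lock();
        vma = mas_walk(&mas);                   /* lockless maple tree lookup */
        if (!vma) {
                rcu_read_unlock();
                return NULL;
        }

        vma = vma_start_read(mm, vma);
        if (IS_ERR_OR_NULL(vma))
                return NULL;                    /* RCU was already dropped for us */

        rcu_read_unlock();
        return vma;                             /* caller releases it with vma_end_read() */
}
#endif /* illustrative sketch */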

/*
 * Lookup and lock a VMA under RCU protection. Returned VMA is guaranteed to be
 * stable and not isolated. If the VMA is not found or is being modified the
 * function returns NULL.
 */
struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm,
                                          unsigned long address)
{
        MA_STATE(mas, &mm->mm_mt, address, address);
        struct vm_area_struct *vma;

retry:
        rcu_read_lock();
        vma = mas_walk(&mas);
        if (!vma) {
                rcu_read_unlock();
                goto inval;
        }

        vma = vma_start_read(mm, vma);
        if (IS_ERR_OR_NULL(vma)) {
                /* Check if the VMA got isolated after we found it */
                if (PTR_ERR(vma) == -EAGAIN) {
                        count_vm_vma_lock_event(VMA_LOCK_MISS);
                        /* The area was replaced with another one */
                        mas_set(&mas, address);
                        goto retry;
                }

                /* Failed to lock the VMA */
                goto inval;
        }
        /*
         * At this point, we have a stable reference to a VMA: The VMA is
         * locked and we know it hasn't already been isolated.
         * From here on, we can access the VMA without worrying about which
         * fields are accessible for RCU readers.
         */
        rcu_read_unlock();

        /* Check if the vma we locked is the right one. */
        if (unlikely(address < vma->vm_start || address >= vma->vm_end)) {
                vma_end_read(vma);
                goto inval;
        }

        return vma;

inval:
        count_vm_vma_lock_event(VMA_LOCK_ABORT);
        return NULL;
}
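
/*
 * Illustrative sketch, not part of the original file: how an architecture
 * page fault handler typically consumes lock_vma_under_rcu(), falling back to
 * the mmap_lock path whenever the lockless lookup refuses to lock the vma.
 * The helper name example_fault_fast_path() is hypothetical and the flag
 * handling is simplified.
 */
#if 0 /* illustrative sketch, not compiled */
static vm_fault_t example_fault_fast_path(struct mm_struct *mm,
                                          unsigned long address,
                                          unsigned int flags,
                                          struct pt_regs *regs)
{
        struct vm_area_struct *vma;
        vm_fault_t fault;

        vma = lock_vma_under_rcu(mm, address);
        if (!vma)
                return VM_FAULT_RETRY;  /* caller retries under mmap_lock */

        fault = handle_mm_fault(vma, address, flags | FAULT_FLAG_VMA_LOCK, regs);
        /* on retry/completion the vma lock has already been dropped for us */
        if (!(fault & (VM_FAULT_RETRY | VM_FAULT_COMPLETED)))
                vma_end_read(vma);

        return fault;
}
#endif /* illustrative sketch */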

static struct vm_area_struct *lock_next_vma_under_mmap_lock(struct mm_struct *mm,
                                                            struct vma_iterator *vmi,
                                                            unsigned long from_addr)
{
        struct vm_area_struct *vma;
        int ret;

        ret = mmap_read_lock_killable(mm);
        if (ret)
                return ERR_PTR(ret);

        /* Lookup the vma at the last position again under mmap_read_lock */
        vma_iter_set(vmi, from_addr);
        vma = vma_next(vmi);
        if (vma) {
                /* Very unlikely vma->vm_refcnt overflow case */
                if (unlikely(!vma_start_read_locked(vma)))
                        vma = ERR_PTR(-EAGAIN);
        }

        mmap_read_unlock(mm);

        return vma;
}

struct vm_area_struct *lock_next_vma(struct mm_struct *mm,
                                     struct vma_iterator *vmi,
                                     unsigned long from_addr)
{
        struct vm_area_struct *vma;
        unsigned int mm_wr_seq;
        bool mmap_unlocked;

        RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "no rcu read lock held");
retry:
        /* Start mmap_lock speculation in case we need to verify the vma later */
        mmap_unlocked = mmap_lock_speculate_try_begin(mm, &mm_wr_seq);
        vma = vma_next(vmi);
        if (!vma)
                return NULL;

        vma = vma_start_read(mm, vma);
        if (IS_ERR_OR_NULL(vma)) {
                /*
                 * Retry immediately if the vma gets detached from under us.
                 * Infinite loop should not happen because the vma we find will
                 * have to be constantly knocked out from under us.
                 */
                if (PTR_ERR(vma) == -EAGAIN) {
                        /* reset to search from the last address */
                        rcu_read_lock();
                        vma_iter_set(vmi, from_addr);
                        goto retry;
                }

                goto fallback;
        }

        /* Verify the vma is not behind the last search position. */
        if (unlikely(from_addr >= vma->vm_end))
                goto fallback_unlock;

        /*
         * vma can be ahead of the last search position but we need to verify
         * it was not shrunk after we found it and another vma has not been
         * installed ahead of it. Otherwise we might observe a gap that should
         * not be there.
         */
        if (from_addr < vma->vm_start) {
                /* Verify only if the address space might have changed since vma lookup. */
                if (!mmap_unlocked || mmap_lock_speculate_retry(mm, mm_wr_seq)) {
                        vma_iter_set(vmi, from_addr);
                        if (vma != vma_next(vmi))
                                goto fallback_unlock;
                }
        }

        return vma;

fallback_unlock:
        rcu_read_unlock();
        vma_end_read(vma);
fallback:
        vma = lock_next_vma_under_mmap_lock(mm, vmi, from_addr);
        rcu_read_lock();
        /* Reinitialize the iterator after re-entering rcu read section */
        vma_iter_set(vmi, IS_ERR_OR_NULL(vma) ? from_addr : vma->vm_end);

        return vma;
}
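
/*
 * Illustrative sketch, not part of the original file: walking an address space
 * with lock_next_vma() the way a lockless /proc/<pid>/maps style reader might,
 * holding the RCU read lock across the walk and the per-VMA read lock only
 * while one vma is being examined. The helper name example_walk_vmas() is
 * hypothetical.
 */
#if 0 /* illustrative sketch, not compiled */
static void example_walk_vmas(struct mm_struct *mm)
{
        VMA_ITERATOR(vmi, mm, 0);
        struct vm_area_struct *vma;
        unsigned long addr = 0;

        rcu_read_lock();
        while ((vma = lock_next_vma(mm, &vmi, addr)) != NULL) {
                if (IS_ERR(vma))
                        break;                  /* e.g. killed while waiting for mmap_lock */
                addr = vma->vm_end;             /* continue after this vma */
                /* ... examine the read-locked vma ... */
                vma_end_read(vma);
        }
        rcu_read_unlock();
}
#endif /* illustrative sketch */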

#endif /* CONFIG_PER_VMA_LOCK */

#ifdef CONFIG_LOCK_MM_AND_FIND_VMA
#include <linux/extable.h>

static inline bool get_mmap_lock_carefully(struct mm_struct *mm, struct pt_regs *regs)
{
        if (likely(mmap_read_trylock(mm)))
                return true;

        if (regs && !user_mode(regs)) {
                unsigned long ip = exception_ip(regs);
                if (!search_exception_tables(ip))
                        return false;
        }

        return !mmap_read_lock_killable(mm);
}

static inline bool mmap_upgrade_trylock(struct mm_struct *mm)
{
        /*
         * We don't have this operation yet.
         *
         * It should be easy enough to do: it's basically a
         *    atomic_long_try_cmpxchg_acquire()
         * from RWSEM_READER_BIAS -> RWSEM_WRITER_LOCKED, but
         * it also needs the proper lockdep magic etc.
         */
        return false;
}

static inline bool upgrade_mmap_lock_carefully(struct mm_struct *mm, struct pt_regs *regs)
{
        mmap_read_unlock(mm);
        if (regs && !user_mode(regs)) {
                unsigned long ip = exception_ip(regs);
                if (!search_exception_tables(ip))
                        return false;
        }
        return !mmap_write_lock_killable(mm);
}

/*
 * Helper for page fault handling.
 *
 * This is kind of equivalent to "mmap_read_lock()" followed
 * by "find_extend_vma()", except it's a lot more careful about
 * the locking (and will drop the lock on failure).
 *
 * For example, if we have a kernel bug that causes a page
 * fault, we don't want to just use mmap_read_lock() to get
 * the mm lock, because that would deadlock if the bug were
 * to happen while we're holding the mm lock for writing.
 *
 * So this checks the exception tables on kernel faults in
 * order to only do this all for instructions that are actually
 * expected to fault.
 *
 * We can also actually take the mm lock for writing if we
 * need to extend the vma, which helps the VM layer a lot.
 */
struct vm_area_struct *lock_mm_and_find_vma(struct mm_struct *mm,
                        unsigned long addr, struct pt_regs *regs)
{
        struct vm_area_struct *vma;

        if (!get_mmap_lock_carefully(mm, regs))
                return NULL;

        vma = find_vma(mm, addr);
        if (likely(vma && (vma->vm_start <= addr)))
                return vma;

        /*
         * Well, dang. We might still be successful, but only
         * if we can extend a vma to do so.
         */
        if (!vma || !(vma->vm_flags & VM_GROWSDOWN)) {
                mmap_read_unlock(mm);
                return NULL;
        }

        /*
         * We can try to upgrade the mmap lock atomically,
         * in which case we can continue to use the vma
         * we already looked up.
         *
         * Otherwise we'll have to drop the mmap lock and
         * re-take it, and also look up the vma again,
         * re-checking it.
         */
        if (!mmap_upgrade_trylock(mm)) {
                if (!upgrade_mmap_lock_carefully(mm, regs))
                        return NULL;

                vma = find_vma(mm, addr);
                if (!vma)
                        goto fail;
                if (vma->vm_start <= addr)
                        goto success;
                if (!(vma->vm_flags & VM_GROWSDOWN))
                        goto fail;
        }

        if (expand_stack_locked(vma, addr))
                goto fail;

success:
        mmap_write_downgrade(mm);
        return vma;

fail:
        mmap_write_unlock(mm);
        return NULL;
}
#endif /* CONFIG_LOCK_MM_AND_FIND_VMA */

#else /* CONFIG_MMU */

/*
 * At least xtensa ends up having protection faults even with no
 * MMU.. No stack expansion, at least.
 */
struct vm_area_struct *lock_mm_and_find_vma(struct mm_struct *mm,
                        unsigned long addr, struct pt_regs *regs)
{
        struct vm_area_struct *vma;

        mmap_read_lock(mm);
        vma = vma_lookup(mm, addr);
        if (!vma)
                mmap_read_unlock(mm);
        return vma;
}

#endif /* CONFIG_MMU */
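
/*
 * Illustrative sketch, not part of the original file: the slow-path shape an
 * architecture fault handler takes around lock_mm_and_find_vma(). On success
 * the mm is returned read-locked with a vma covering the faulting address; on
 * failure no lock is held. The helper name example_fault_slow_path() is
 * hypothetical and the flag handling is simplified.
 */
#if 0 /* illustrative sketch, not compiled */
static vm_fault_t example_fault_slow_path(struct mm_struct *mm,
                                          unsigned long address,
                                          unsigned int flags,
                                          struct pt_regs *regs)
{
        struct vm_area_struct *vma;
        vm_fault_t fault;

        vma = lock_mm_and_find_vma(mm, address, regs);
        if (unlikely(!vma))
                return VM_FAULT_SIGSEGV;        /* no vma, and no lock left to drop */

        fault = handle_mm_fault(vma, address, flags, regs);
        /* on retry/completion the mmap_lock has already been dropped for us */
        if (!(fault & (VM_FAULT_RETRY | VM_FAULT_COMPLETED)))
                mmap_read_unlock(mm);

        return fault;
}
#endif /* illustrative sketch */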