// SPDX-License-Identifier: GPL-2.0
#define CREATE_TRACE_POINTS
#include <trace/events/mmap_lock.h>

#include <linux/mm.h>
#include <linux/cgroup.h>
#include <linux/memcontrol.h>
#include <linux/mmap_lock.h>
#include <linux/mutex.h>
#include <linux/percpu.h>
#include <linux/rcupdate.h>
#include <linux/smp.h>
#include <linux/trace_events.h>
#include <linux/local_lock.h>

EXPORT_TRACEPOINT_SYMBOL(mmap_lock_start_locking);
EXPORT_TRACEPOINT_SYMBOL(mmap_lock_acquire_returned);
EXPORT_TRACEPOINT_SYMBOL(mmap_lock_released);

#ifdef CONFIG_TRACING
/*
 * Trace calls must be in a separate file, as otherwise there's a circular
 * dependency between linux/mmap_lock.h and trace/events/mmap_lock.h.
 */

void __mmap_lock_do_trace_start_locking(struct mm_struct *mm, bool write)
{
	trace_mmap_lock_start_locking(mm, write);
}
EXPORT_SYMBOL(__mmap_lock_do_trace_start_locking);

void __mmap_lock_do_trace_acquire_returned(struct mm_struct *mm, bool write,
					   bool success)
{
	trace_mmap_lock_acquire_returned(mm, write, success);
}
EXPORT_SYMBOL(__mmap_lock_do_trace_acquire_returned);

void __mmap_lock_do_trace_released(struct mm_struct *mm, bool write)
{
	trace_mmap_lock_released(mm, write);
}
EXPORT_SYMBOL(__mmap_lock_do_trace_released);
#endif /* CONFIG_TRACING */

#ifdef CONFIG_MMU
#ifdef CONFIG_PER_VMA_LOCK
static inline bool __vma_enter_locked(struct vm_area_struct *vma, bool detaching)
{
	unsigned int tgt_refcnt = VMA_LOCK_OFFSET;

	/* Additional refcnt if the vma is attached. */
	if (!detaching)
		tgt_refcnt++;

	/*
	 * If vma is detached then only vma_mark_attached() can raise the
	 * vm_refcnt. mmap_write_lock prevents racing with vma_mark_attached().
	 */
	if (!refcount_add_not_zero(VMA_LOCK_OFFSET, &vma->vm_refcnt))
		return false;

	rwsem_acquire(&vma->vmlock_dep_map, 0, 0, _RET_IP_);
	rcuwait_wait_event(&vma->vm_mm->vma_writer_wait,
			   refcount_read(&vma->vm_refcnt) == tgt_refcnt,
			   TASK_UNINTERRUPTIBLE);
	lock_acquired(&vma->vmlock_dep_map, _RET_IP_);

	return true;
}

static inline void __vma_exit_locked(struct vm_area_struct *vma, bool *detached)
{
	*detached = refcount_sub_and_test(VMA_LOCK_OFFSET, &vma->vm_refcnt);
	rwsem_release(&vma->vmlock_dep_map, _RET_IP_);
}

void __vma_start_write(struct vm_area_struct *vma, unsigned int mm_lock_seq)
{
	bool locked;

	/*
	 * __vma_enter_locked() returns false immediately if the vma is not
	 * attached, otherwise it waits until the refcnt indicates that the
	 * vma is attached with no readers.
	 */
	locked = __vma_enter_locked(vma, false);

	/*
	 * We should use WRITE_ONCE() here because we can have concurrent reads
	 * from the early lockless pessimistic check in vma_start_read().
	 * We don't really care about the correctness of that early check, but
	 * we should use WRITE_ONCE() for cleanliness and to keep KCSAN happy.
	 */
	WRITE_ONCE(vma->vm_lock_seq, mm_lock_seq);

	if (locked) {
		bool detached;

		__vma_exit_locked(vma, &detached);
		WARN_ON_ONCE(detached); /* vma should remain attached */
	}
}
EXPORT_SYMBOL_GPL(__vma_start_write);

void vma_mark_detached(struct vm_area_struct *vma)
{
	vma_assert_write_locked(vma);
	vma_assert_attached(vma);

	/*
	 * We are the only writer, so no need to use vma_refcount_put().
	 * The condition below is unlikely because the vma has been already
	 * write-locked and readers can increment vm_refcnt only temporarily
	 * before they check vm_lock_seq, realize the vma is locked and drop
	 * back the vm_refcnt. That is a narrow window for observing a raised
	 * vm_refcnt.
	 */
	if (unlikely(!refcount_dec_and_test(&vma->vm_refcnt))) {
		/* Wait until vma is detached with no readers. */
		if (__vma_enter_locked(vma, true)) {
			bool detached;

			__vma_exit_locked(vma, &detached);
			WARN_ON_ONCE(!detached);
		}
	}
}

/*
 * Try to read-lock a vma. The function is allowed to occasionally yield false
 * locked result to avoid performance overhead, in which case we fall back to
 * using mmap_lock. The function should never yield false unlocked result.
 * False locked result is possible if mm_lock_seq overflows or if vma gets
 * reused and attached to a different mm before we lock it.
 * Returns the vma on success, NULL on failure to lock, and ERR_PTR(-EAGAIN)
 * if the vma got detached.
 *
 * IMPORTANT: RCU lock must be held upon entering the function, but upon error
 * IT IS RELEASED. The caller must handle this correctly.
 */
static inline struct vm_area_struct *vma_start_read(struct mm_struct *mm,
						    struct vm_area_struct *vma)
{
	struct mm_struct *other_mm;
	int oldcnt;

	RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "no rcu lock held");
	/*
	 * Check before locking. A race might cause false locked result.
	 * We can use READ_ONCE() for the mm_lock_seq here, and don't need
	 * ACQUIRE semantics, because this is just a lockless check whose result
	 * we don't rely on for anything - the mm_lock_seq read against which we
	 * need ordering is below.
	 */
	if (READ_ONCE(vma->vm_lock_seq) == READ_ONCE(mm->mm_lock_seq.sequence)) {
		vma = NULL;
		goto err;
	}

	/*
	 * If VMA_LOCK_OFFSET is set, __refcount_inc_not_zero_limited_acquire()
	 * will fail because VMA_REF_LIMIT is less than VMA_LOCK_OFFSET.
	 * Acquire fence is required here to avoid reordering against later
	 * vm_lock_seq check and checks inside lock_vma_under_rcu().
	 */
	if (unlikely(!__refcount_inc_not_zero_limited_acquire(&vma->vm_refcnt, &oldcnt,
							      VMA_REF_LIMIT))) {
		/* return EAGAIN if vma got detached from under us */
		vma = oldcnt ? NULL : ERR_PTR(-EAGAIN);
		goto err;
	}

	rwsem_acquire_read(&vma->vmlock_dep_map, 0, 1, _RET_IP_);

	if (unlikely(vma->vm_mm != mm))
		goto err_unstable;

	/*
	 * Overflow of vm_lock_seq/mm_lock_seq might produce false locked result.
	 * False unlocked result is impossible because we modify and check
	 * vma->vm_lock_seq under vma->vm_refcnt protection and mm->mm_lock_seq
	 * modification invalidates all existing locks.
	 *
	 * We must use ACQUIRE semantics for the mm_lock_seq so that if we are
	 * racing with vma_end_write_all(), we only start reading from the VMA
	 * after it has been unlocked.
	 * This pairs with RELEASE semantics in vma_end_write_all().
	 */
	if (unlikely(vma->vm_lock_seq == raw_read_seqcount(&mm->mm_lock_seq))) {
		vma_refcount_put(vma);
		vma = NULL;
		goto err;
	}

	return vma;
err:
	rcu_read_unlock();

	return vma;
err_unstable:
	/*
	 * If vma got attached to another mm from under us, that mm is not
	 * stable and can be freed in the narrow window after vma->vm_refcnt
	 * is dropped and before rcuwait_wake_up(mm) is called. Grab it before
	 * releasing vma->vm_refcnt.
	 */
	other_mm = vma->vm_mm; /* use a copy as vma can be freed after we drop vm_refcnt */

	/* __mmdrop() is a heavy operation, do it after dropping RCU lock. */
	rcu_read_unlock();
	mmgrab(other_mm);
	vma_refcount_put(vma);
	mmdrop(other_mm);

	return NULL;
}

/*
 * Lookup and lock a VMA under RCU protection. Returned VMA is guaranteed to be
 * stable and not isolated. If the VMA is not found or is being modified, the
 * function returns NULL.
 */
struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm,
					  unsigned long address)
{
	MA_STATE(mas, &mm->mm_mt, address, address);
	struct vm_area_struct *vma;

retry:
	rcu_read_lock();
	vma = mas_walk(&mas);
	if (!vma) {
		rcu_read_unlock();
		goto inval;
	}

	vma = vma_start_read(mm, vma);
	if (IS_ERR_OR_NULL(vma)) {
		/* Check if the VMA got isolated after we found it */
		if (PTR_ERR(vma) == -EAGAIN) {
			count_vm_vma_lock_event(VMA_LOCK_MISS);
			/* The area was replaced with another one */
			goto retry;
		}

		/* Failed to lock the VMA */
		goto inval;
	}
	/*
	 * At this point, we have a stable reference to a VMA: The VMA is
	 * locked and we know it hasn't already been isolated.
	 * From here on, we can access the VMA without worrying about which
	 * fields are accessible for RCU readers.
	 */
	rcu_read_unlock();

	/* Check if the vma we locked is the right one. */
	if (unlikely(address < vma->vm_start || address >= vma->vm_end)) {
		vma_end_read(vma);
		goto inval;
	}

	return vma;

inval:
	count_vm_vma_lock_event(VMA_LOCK_ABORT);
	return NULL;
}

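/*
 * Illustrative sketch, not part of this file: the caller pattern that
 * lock_vma_under_rcu() is designed for, based on how architecture page
 * fault handlers typically use it. The access check and labels are the
 * caller's own (hypothetical here); error paths are elided.
 *
 *	vma = lock_vma_under_rcu(mm, address);
 *	if (!vma)
 *		goto lock_mmap;			// fall back to mmap_lock path
 *
 *	if (unlikely(access_error(error_code, vma)))	// caller-side check
 *		goto bad_access;
 *
 *	fault = handle_mm_fault(vma, address, flags | FAULT_FLAG_VMA_LOCK, regs);
 *	if (!(fault & (VM_FAULT_RETRY | VM_FAULT_COMPLETED)))
 *		vma_end_read(vma);
 */
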
static struct vm_area_struct *lock_next_vma_under_mmap_lock(struct mm_struct *mm,
							     struct vma_iterator *vmi,
							     unsigned long from_addr)
{
	struct vm_area_struct *vma;
	int ret;

	ret = mmap_read_lock_killable(mm);
	if (ret)
		return ERR_PTR(ret);

	/* Lookup the vma at the last position again under mmap_read_lock */
	vma_iter_set(vmi, from_addr);
	vma = vma_next(vmi);
	if (vma) {
		/* Very unlikely vma->vm_refcnt overflow case */
		if (unlikely(!vma_start_read_locked(vma)))
			vma = ERR_PTR(-EAGAIN);
	}

	mmap_read_unlock(mm);

	return vma;
}

/*
 * Lookup and read-lock the next VMA after the last search position @from_addr.
 * May temporarily drop the RCU read lock and fall back to taking mmap_lock, in
 * which case the iterator is reinitialized before returning. Returns NULL if
 * no further VMA exists, or an ERR_PTR() if the fallback path fails.
 */
struct vm_area_struct *lock_next_vma(struct mm_struct *mm,
				     struct vma_iterator *vmi,
				     unsigned long from_addr)
{
	struct vm_area_struct *vma;
	unsigned int mm_wr_seq;
	bool mmap_unlocked;

	RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "no rcu read lock held");
retry:
	/* Start mmap_lock speculation in case we need to verify the vma later */
	mmap_unlocked = mmap_lock_speculate_try_begin(mm, &mm_wr_seq);
	vma = vma_next(vmi);
	if (!vma)
		return NULL;

	vma = vma_start_read(mm, vma);
	if (IS_ERR_OR_NULL(vma)) {
		/*
		 * Retry immediately if the vma gets detached from under us.
		 * Infinite loop should not happen because the vma we find will
		 * have to be constantly knocked out from under us.
		 */
		if (PTR_ERR(vma) == -EAGAIN) {
			/* reset to search from the last address */
			rcu_read_lock();
			vma_iter_set(vmi, from_addr);
			goto retry;
		}

		goto fallback;
	}

	/* Verify the vma is not behind the last search position. */
	if (unlikely(from_addr >= vma->vm_end))
		goto fallback_unlock;

	/*
	 * vma can be ahead of the last search position but we need to verify
	 * it was not shrunk after we found it and another vma has not been
	 * installed ahead of it. Otherwise we might observe a gap that should
	 * not be there.
	 */
	if (from_addr < vma->vm_start) {
		/* Verify only if the address space might have changed since vma lookup. */
		if (!mmap_unlocked || mmap_lock_speculate_retry(mm, mm_wr_seq)) {
			vma_iter_set(vmi, from_addr);
			if (vma != vma_next(vmi))
				goto fallback_unlock;
		}
	}

	return vma;

fallback_unlock:
	rcu_read_unlock();
	vma_end_read(vma);
fallback:
	vma = lock_next_vma_under_mmap_lock(mm, vmi, from_addr);
	rcu_read_lock();
	/* Reinitialize the iterator after re-entering rcu read section */
	vma_iter_set(vmi, IS_ERR_OR_NULL(vma) ? from_addr : vma->vm_end);

	return vma;
}
#endif /* CONFIG_PER_VMA_LOCK */

#ifdef CONFIG_LOCK_MM_AND_FIND_VMA
#include <linux/extable.h>

static inline bool get_mmap_lock_carefully(struct mm_struct *mm, struct pt_regs *regs)
{
	if (likely(mmap_read_trylock(mm)))
		return true;

	if (regs && !user_mode(regs)) {
		unsigned long ip = exception_ip(regs);
		if (!search_exception_tables(ip))
			return false;
	}

	return !mmap_read_lock_killable(mm);
}

static inline bool mmap_upgrade_trylock(struct mm_struct *mm)
{
	/*
	 * We don't have this operation yet.
	 *
	 * It should be easy enough to do: it's basically a
	 *	atomic_long_try_cmpxchg_acquire()
	 * from RWSEM_READER_BIAS -> RWSEM_WRITER_LOCKED, but
	 * it also needs the proper lockdep magic etc.
	 */
	return false;
}

static inline bool upgrade_mmap_lock_carefully(struct mm_struct *mm, struct pt_regs *regs)
{
	mmap_read_unlock(mm);
	if (regs && !user_mode(regs)) {
		unsigned long ip = exception_ip(regs);
		if (!search_exception_tables(ip))
			return false;
	}
	return !mmap_write_lock_killable(mm);
}

/*
 * Helper for page fault handling.
 *
 * This is kind of equivalent to "mmap_read_lock()" followed
 * by "find_extend_vma()", except it's a lot more careful about
 * the locking (and will drop the lock on failure).
 *
 * For example, if we have a kernel bug that causes a page
 * fault, we don't want to just use mmap_read_lock() to get
 * the mm lock, because that would deadlock if the bug were
 * to happen while we're holding the mm lock for writing.
 *
 * So this checks the exception tables on kernel faults in
 * order to only do this all for instructions that are actually
 * expected to fault.
 *
 * We can also actually take the mm lock for writing if we
 * need to extend the vma, which helps the VM layer a lot.
 */
struct vm_area_struct *lock_mm_and_find_vma(struct mm_struct *mm,
			unsigned long addr, struct pt_regs *regs)
{
	struct vm_area_struct *vma;

	if (!get_mmap_lock_carefully(mm, regs))
		return NULL;

	vma = find_vma(mm, addr);
	if (likely(vma && (vma->vm_start <= addr)))
		return vma;

	/*
	 * Well, dang. We might still be successful, but only
	 * if we can extend a vma to do so.
	 */
	if (!vma || !(vma->vm_flags & VM_GROWSDOWN)) {
		mmap_read_unlock(mm);
		return NULL;
	}

	/*
	 * We can try to upgrade the mmap lock atomically,
	 * in which case we can continue to use the vma
	 * we already looked up.
	 *
	 * Otherwise we'll have to drop the mmap lock and
	 * re-take it, and also look up the vma again,
	 * re-checking it.
	 */
	if (!mmap_upgrade_trylock(mm)) {
		if (!upgrade_mmap_lock_carefully(mm, regs))
			return NULL;

		vma = find_vma(mm, addr);
		if (!vma)
			goto fail;
		if (vma->vm_start <= addr)
			goto success;
		if (!(vma->vm_flags & VM_GROWSDOWN))
			goto fail;
	}

	if (expand_stack_locked(vma, addr))
		goto fail;

success:
	mmap_write_downgrade(mm);
	return vma;

fail:
	mmap_write_unlock(mm);
	return NULL;
}
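
/*
 * Illustrative sketch, not part of this file: a typical caller of
 * lock_mm_and_find_vma() in an architecture fault handler. On failure the
 * mmap lock has already been dropped; on success the caller holds it for
 * reading (it is downgraded above). Names such as bad_area_nosemaphore()
 * are the caller's own; error handling is elided.
 *
 *	vma = lock_mm_and_find_vma(mm, address, regs);
 *	if (unlikely(!vma)) {
 *		bad_area_nosemaphore(regs, error_code, address);
 *		return;
 *	}
 *
 *	fault = handle_mm_fault(vma, address, flags, regs);
 *	...
 *	mmap_read_unlock(mm);
 */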
#endif /* CONFIG_LOCK_MM_AND_FIND_VMA */

#else /* CONFIG_MMU */

/*
 * At least xtensa ends up having protection faults even with no
 * MMU.. No stack expansion, at least.
 */
struct vm_area_struct *lock_mm_and_find_vma(struct mm_struct *mm,
			unsigned long addr, struct pt_regs *regs)
{
	struct vm_area_struct *vma;

	mmap_read_lock(mm);
	vma = vma_lookup(mm, addr);
	if (!vma)
		mmap_read_unlock(mm);
	return vma;
}

#endif /* CONFIG_MMU */