// SPDX-License-Identifier: GPL-2.0
#define CREATE_TRACE_POINTS
#include <trace/events/mmap_lock.h>

#include <linux/mm.h>
#include <linux/cgroup.h>
#include <linux/memcontrol.h>
#include <linux/mmap_lock.h>
#include <linux/mutex.h>
#include <linux/percpu.h>
#include <linux/rcupdate.h>
#include <linux/smp.h>
#include <linux/trace_events.h>
#include <linux/local_lock.h>

EXPORT_TRACEPOINT_SYMBOL(mmap_lock_start_locking);
EXPORT_TRACEPOINT_SYMBOL(mmap_lock_acquire_returned);
EXPORT_TRACEPOINT_SYMBOL(mmap_lock_released);

#ifdef CONFIG_TRACING
/*
 * Trace calls must be in a separate file, as otherwise there's a circular
 * dependency between linux/mmap_lock.h and trace/events/mmap_lock.h.
 */

void __mmap_lock_do_trace_start_locking(struct mm_struct *mm, bool write)
{
	trace_mmap_lock_start_locking(mm, write);
}
EXPORT_SYMBOL(__mmap_lock_do_trace_start_locking);

void __mmap_lock_do_trace_acquire_returned(struct mm_struct *mm, bool write,
					   bool success)
{
	trace_mmap_lock_acquire_returned(mm, write, success);
}
EXPORT_SYMBOL(__mmap_lock_do_trace_acquire_returned);

void __mmap_lock_do_trace_released(struct mm_struct *mm, bool write)
{
	trace_mmap_lock_released(mm, write);
}
EXPORT_SYMBOL(__mmap_lock_do_trace_released);
#endif /* CONFIG_TRACING */

#ifdef CONFIG_MMU
#ifdef CONFIG_PER_VMA_LOCK
static inline bool __vma_enter_locked(struct vm_area_struct *vma, bool detaching)
{
	unsigned int tgt_refcnt = VMA_LOCK_OFFSET;

	/* Additional refcnt if the vma is attached. */
	if (!detaching)
		tgt_refcnt++;

	/*
	 * If vma is detached then only vma_mark_attached() can raise the
	 * vm_refcnt. mmap_write_lock prevents racing with vma_mark_attached().
	 */
	if (!refcount_add_not_zero(VMA_LOCK_OFFSET, &vma->vm_refcnt))
		return false;

	rwsem_acquire(&vma->vmlock_dep_map, 0, 0, _RET_IP_);
	rcuwait_wait_event(&vma->vm_mm->vma_writer_wait,
			   refcount_read(&vma->vm_refcnt) == tgt_refcnt,
			   TASK_UNINTERRUPTIBLE);
	lock_acquired(&vma->vmlock_dep_map, _RET_IP_);

	return true;
}

static inline void __vma_exit_locked(struct vm_area_struct *vma, bool *detached)
{
	*detached = refcount_sub_and_test(VMA_LOCK_OFFSET, &vma->vm_refcnt);
	rwsem_release(&vma->vmlock_dep_map, _RET_IP_);
}

void __vma_start_write(struct vm_area_struct *vma, unsigned int mm_lock_seq)
{
	bool locked;

	/*
	 * __vma_enter_locked() returns false immediately if the vma is not
	 * attached, otherwise it waits until the refcnt indicates that the
	 * vma is attached with no readers.
	 */
	locked = __vma_enter_locked(vma, false);

	/*
	 * We should use WRITE_ONCE() here because we can have concurrent reads
	 * from the early lockless pessimistic check in vma_start_read().
	 * We don't really care about the correctness of that early check, but
	 * we should use WRITE_ONCE() for cleanliness and to keep KCSAN happy.
	 */
	WRITE_ONCE(vma->vm_lock_seq, mm_lock_seq);

	if (locked) {
		bool detached;

		__vma_exit_locked(vma, &detached);
		WARN_ON_ONCE(detached); /* vma should remain attached */
	}
}
EXPORT_SYMBOL_GPL(__vma_start_write);
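
/*
 * Illustrative sketch, not part of this file: the caller pattern that
 * __vma_start_write() above serves. vma_start_write() (the inline helper
 * declared alongside this code) falls back here when the VMA is not already
 * write-locked; writers are expected to hold the mmap_lock for write. The
 * modify_vma() call below is a hypothetical stand-in for the real change:
 *
 *	mmap_write_lock(mm);
 *	vma = find_vma(mm, addr);
 *	if (vma) {
 *		vma_start_write(vma);	// make per-VMA-lock readers back off
 *		modify_vma(vma);	// hypothetical modification
 *	}
 *	mmap_write_unlock(mm);
 *
 * There is no per-VMA unlock for writers: all write-locked VMAs are released
 * together via vma_end_write_all() when the mmap_lock write section ends.
 */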

void vma_mark_detached(struct vm_area_struct *vma)
{
	vma_assert_write_locked(vma);
	vma_assert_attached(vma);

	/*
	 * We are the only writer, so no need to use vma_refcount_put().
	 * The condition below is unlikely because the vma has already been
	 * write-locked and readers can increment vm_refcnt only temporarily
	 * before they check vm_lock_seq, realize the vma is locked and drop
	 * back the vm_refcnt. That is a narrow window for observing a raised
	 * vm_refcnt.
	 */
	if (unlikely(!refcount_dec_and_test(&vma->vm_refcnt))) {
		/* Wait until vma is detached with no readers. */
		if (__vma_enter_locked(vma, true)) {
			bool detached;

			__vma_exit_locked(vma, &detached);
			WARN_ON_ONCE(!detached);
		}
	}
}

/*
 * Look up and lock a VMA under RCU protection. The returned VMA is guaranteed
 * to be stable and not isolated. If the VMA is not found or is being modified
 * the function returns NULL.
 */
struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm,
					  unsigned long address)
{
	MA_STATE(mas, &mm->mm_mt, address, address);
	struct vm_area_struct *vma;

	rcu_read_lock();
retry:
	vma = mas_walk(&mas);
	if (!vma)
		goto inval;

	vma = vma_start_read(mm, vma);
	if (IS_ERR_OR_NULL(vma)) {
		/* Check if the VMA got isolated after we found it */
		if (PTR_ERR(vma) == -EAGAIN) {
			count_vm_vma_lock_event(VMA_LOCK_MISS);
			/* The area was replaced with another one */
			goto retry;
		}

		/* Failed to lock the VMA */
		goto inval;
	}
	/*
	 * At this point, we have a stable reference to a VMA: The VMA is
	 * locked and we know it hasn't already been isolated.
	 * From here on, we can access the VMA without worrying about which
	 * fields are accessible for RCU readers.
	 */

	/* Check if the vma we locked is the right one. */
	if (unlikely(vma->vm_mm != mm ||
		     address < vma->vm_start || address >= vma->vm_end))
		goto inval_end_read;

	rcu_read_unlock();
	return vma;

inval_end_read:
	vma_end_read(vma);
inval:
	rcu_read_unlock();
	count_vm_vma_lock_event(VMA_LOCK_ABORT);
	return NULL;
}
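
/*
 * Illustrative sketch, not part of this file: how a fault handler typically
 * consumes lock_vma_under_rcu(). The RCU read section is entered and exited
 * inside the function itself; on success the VMA is read-locked and must be
 * released with vma_end_read() unless the fault code already dropped it
 * (access checks and retry details of real handlers are omitted here):
 *
 *	vma = lock_vma_under_rcu(mm, address);
 *	if (!vma)
 *		goto lock_mmap;		// fall back to the mmap_lock path
 *	fault = handle_mm_fault(vma, address, flags | FAULT_FLAG_VMA_LOCK, regs);
 *	if (!(fault & (VM_FAULT_RETRY | VM_FAULT_COMPLETED)))
 *		vma_end_read(vma);
 */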

static struct vm_area_struct *lock_next_vma_under_mmap_lock(struct mm_struct *mm,
							     struct vma_iterator *vmi,
							     unsigned long from_addr)
{
	struct vm_area_struct *vma;
	int ret;

	ret = mmap_read_lock_killable(mm);
	if (ret)
		return ERR_PTR(ret);

	/* Look up the vma at the last position again under mmap_read_lock */
	vma_iter_set(vmi, from_addr);
	vma = vma_next(vmi);
	if (vma) {
		/* Very unlikely vma->vm_refcnt overflow case */
		if (unlikely(!vma_start_read_locked(vma)))
			vma = ERR_PTR(-EAGAIN);
	}

	mmap_read_unlock(mm);

	return vma;
}

struct vm_area_struct *lock_next_vma(struct mm_struct *mm,
				     struct vma_iterator *vmi,
				     unsigned long from_addr)
{
	struct vm_area_struct *vma;
	unsigned int mm_wr_seq;
	bool mmap_unlocked;

	RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "no rcu read lock held");
retry:
	/* Start mmap_lock speculation in case we need to verify the vma later */
	mmap_unlocked = mmap_lock_speculate_try_begin(mm, &mm_wr_seq);
	vma = vma_next(vmi);
	if (!vma)
		return NULL;

	vma = vma_start_read(mm, vma);
	if (IS_ERR_OR_NULL(vma)) {
		/*
		 * Retry immediately if the vma gets detached from under us.
		 * Infinite loop should not happen because the vma we find will
		 * have to be constantly knocked out from under us.
		 */
		if (PTR_ERR(vma) == -EAGAIN) {
			/* reset to search from the last address */
			vma_iter_set(vmi, from_addr);
			goto retry;
		}

		goto fallback;
	}

	/*
	 * Verify the vma we locked belongs to the same address space and it's
	 * not behind the last search position.
	 */
	if (unlikely(vma->vm_mm != mm || from_addr >= vma->vm_end))
		goto fallback_unlock;

	/*
	 * vma can be ahead of the last search position but we need to verify
	 * it was not shrunk after we found it and another vma has not been
	 * installed ahead of it. Otherwise we might observe a gap that should
	 * not be there.
	 */
	if (from_addr < vma->vm_start) {
		/* Verify only if the address space might have changed since vma lookup. */
		if (!mmap_unlocked || mmap_lock_speculate_retry(mm, mm_wr_seq)) {
			vma_iter_set(vmi, from_addr);
			if (vma != vma_next(vmi))
				goto fallback_unlock;
		}
	}

	return vma;

fallback_unlock:
	vma_end_read(vma);
fallback:
	rcu_read_unlock();
	vma = lock_next_vma_under_mmap_lock(mm, vmi, from_addr);
	rcu_read_lock();
	/* Reinitialize the iterator after re-entering rcu read section */
	vma_iter_set(vmi, IS_ERR_OR_NULL(vma) ? from_addr : vma->vm_end);

	return vma;
}
#endif /* CONFIG_PER_VMA_LOCK */
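
/*
 * Illustrative sketch, not part of this file: iterating an address space with
 * lock_next_vma(), roughly what a /proc/<pid>/maps style reader does. The
 * caller holds the RCU read lock; lock_next_vma() may drop and re-take it
 * internally when it falls back to the mmap_lock. Every VMA it returns is
 * read-locked and must be released with vma_end_read():
 *
 *	VMA_ITERATOR(vmi, mm, 0);
 *	unsigned long addr = 0;
 *	struct vm_area_struct *vma;
 *
 *	rcu_read_lock();
 *	while ((vma = lock_next_vma(mm, &vmi, addr)) != NULL) {
 *		if (IS_ERR(vma))
 *			break;			// e.g. -EINTR from the fallback path
 *		addr = vma->vm_end;		// remember before unlocking
 *		// ... report the vma ...
 *		vma_end_read(vma);
 *	}
 *	rcu_read_unlock();
 */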

#ifdef CONFIG_LOCK_MM_AND_FIND_VMA
#include <linux/extable.h>

static inline bool get_mmap_lock_carefully(struct mm_struct *mm, struct pt_regs *regs)
{
	if (likely(mmap_read_trylock(mm)))
		return true;

	if (regs && !user_mode(regs)) {
		unsigned long ip = exception_ip(regs);
		if (!search_exception_tables(ip))
			return false;
	}

	return !mmap_read_lock_killable(mm);
}

static inline bool mmap_upgrade_trylock(struct mm_struct *mm)
{
	/*
	 * We don't have this operation yet.
	 *
	 * It should be easy enough to do: it's basically an
	 * atomic_long_try_cmpxchg_acquire()
	 * from RWSEM_READER_BIAS -> RWSEM_WRITER_LOCKED, but
	 * it also needs the proper lockdep magic etc.
	 */
	return false;
}

static inline bool upgrade_mmap_lock_carefully(struct mm_struct *mm, struct pt_regs *regs)
{
	mmap_read_unlock(mm);
	if (regs && !user_mode(regs)) {
		unsigned long ip = exception_ip(regs);
		if (!search_exception_tables(ip))
			return false;
	}
	return !mmap_write_lock_killable(mm);
}

/*
 * Helper for page fault handling.
 *
 * This is kind of equivalent to "mmap_read_lock()" followed
 * by "find_extend_vma()", except it's a lot more careful about
 * the locking (and will drop the lock on failure).
 *
 * For example, if we have a kernel bug that causes a page
 * fault, we don't want to just use mmap_read_lock() to get
 * the mm lock, because that would deadlock if the bug were
 * to happen while we're holding the mm lock for writing.
 *
 * So this checks the exception tables on kernel faults in
 * order to only do this all for instructions that are actually
 * expected to fault.
 *
 * We can also actually take the mm lock for writing if we
 * need to extend the vma, which helps the VM layer a lot.
 */
struct vm_area_struct *lock_mm_and_find_vma(struct mm_struct *mm,
			unsigned long addr, struct pt_regs *regs)
{
	struct vm_area_struct *vma;

	if (!get_mmap_lock_carefully(mm, regs))
		return NULL;

	vma = find_vma(mm, addr);
	if (likely(vma && (vma->vm_start <= addr)))
		return vma;

	/*
	 * Well, dang. We might still be successful, but only
	 * if we can extend a vma to do so.
	 */
	if (!vma || !(vma->vm_flags & VM_GROWSDOWN)) {
		mmap_read_unlock(mm);
		return NULL;
	}

	/*
	 * We can try to upgrade the mmap lock atomically,
	 * in which case we can continue to use the vma
	 * we already looked up.
	 *
	 * Otherwise we'll have to drop the mmap lock and
	 * re-take it, and also look up the vma again,
	 * re-checking it.
	 */
	if (!mmap_upgrade_trylock(mm)) {
		if (!upgrade_mmap_lock_carefully(mm, regs))
			return NULL;

		vma = find_vma(mm, addr);
		if (!vma)
			goto fail;
		if (vma->vm_start <= addr)
			goto success;
		if (!(vma->vm_flags & VM_GROWSDOWN))
			goto fail;
	}

	if (expand_stack_locked(vma, addr))
		goto fail;

success:
	mmap_write_downgrade(mm);
	return vma;

fail:
	mmap_write_unlock(mm);
	return NULL;
}
#endif /* CONFIG_LOCK_MM_AND_FIND_VMA */

#else /* CONFIG_MMU */

/*
 * At least xtensa ends up having protection faults even with no
 * MMU. No stack expansion, at least.
 */
struct vm_area_struct *lock_mm_and_find_vma(struct mm_struct *mm,
			unsigned long addr, struct pt_regs *regs)
{
	struct vm_area_struct *vma;

	mmap_read_lock(mm);
	vma = vma_lookup(mm, addr);
	if (!vma)
		mmap_read_unlock(mm);
	return vma;
}

#endif /* CONFIG_MMU */
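
/*
 * Illustrative sketch, not part of this file: the arch fault-handler shape
 * lock_mm_and_find_vma() is written for. On success the mmap_lock is held
 * for read (a write lock taken for stack expansion is downgraded before
 * returning), so the caller pairs it with mmap_read_unlock(); on failure the
 * lock has already been dropped:
 *
 *	vma = lock_mm_and_find_vma(mm, address, regs);
 *	if (!vma)
 *		goto bad_area_nosemaphore;	// hypothetical label, lock not held
 *	fault = handle_mm_fault(vma, address, flags, regs);
 *	// ... retry / error handling ...
 *	mmap_read_unlock(mm);
 */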