1 /* SPDX-License-Identifier: GPL-2.0 */ 2 /* 3 * include/linux/userfaultfd_k.h 4 * 5 * Copyright (C) 2015 Red Hat, Inc. 6 * 7 */ 8 9 #ifndef _LINUX_USERFAULTFD_K_H 10 #define _LINUX_USERFAULTFD_K_H 11 12 #ifdef CONFIG_USERFAULTFD 13 14 #include <linux/userfaultfd.h> /* linux/include/uapi/linux/userfaultfd.h */ 15 16 #include <linux/fcntl.h> 17 #include <linux/mm.h> 18 #include <linux/swap.h> 19 #include <linux/leafops.h> 20 #include <asm-generic/pgtable_uffd.h> 21 #include <linux/hugetlb_inline.h> 22 23 /* The set of all possible UFFD-related VM flags. */ 24 #define __VM_UFFD_FLAGS (VM_UFFD_MISSING | VM_UFFD_WP | VM_UFFD_MINOR) 25 26 #define __VMA_UFFD_FLAGS mk_vma_flags(VMA_UFFD_MISSING_BIT, VMA_UFFD_WP_BIT, \ 27 VMA_UFFD_MINOR_BIT) 28 29 /* 30 * CAREFUL: Check include/uapi/asm-generic/fcntl.h when defining 31 * new flags, since they might collide with O_* ones. We want 32 * to re-use O_* flags that couldn't possibly have a meaning 33 * from userfaultfd, in order to leave a free define-space for 34 * shared O_* flags. 35 */ 36 #define UFFD_SHARED_FCNTL_FLAGS (O_CLOEXEC | O_NONBLOCK) 37 38 /* 39 * Start with fault_pending_wqh and fault_wqh so they're more likely 40 * to be in the same cacheline. 41 * 42 * Locking order: 43 * fd_wqh.lock 44 * fault_pending_wqh.lock 45 * fault_wqh.lock 46 * event_wqh.lock 47 * 48 * To avoid deadlocks, IRQs must be disabled when taking any of the above locks, 49 * since fd_wqh.lock is taken by aio_poll() while it's holding a lock that's 50 * also taken in IRQ context. 51 */ 52 struct userfaultfd_ctx { 53 /* waitqueue head for the pending (i.e. not read) userfaults */ 54 wait_queue_head_t fault_pending_wqh; 55 /* waitqueue head for the userfaults */ 56 wait_queue_head_t fault_wqh; 57 /* waitqueue head for the pseudo fd to wakeup poll/read */ 58 wait_queue_head_t fd_wqh; 59 /* waitqueue head for events */ 60 wait_queue_head_t event_wqh; 61 /* a refile sequence protected by fault_pending_wqh lock */ 62 seqcount_spinlock_t refile_seq; 63 /* pseudo fd refcounting */ 64 refcount_t refcount; 65 /* userfaultfd syscall flags */ 66 unsigned int flags; 67 /* features requested from the userspace */ 68 unsigned int features; 69 /* released */ 70 bool released; 71 /* 72 * Prevents userfaultfd operations (fill/move/wp) from happening while 73 * some non-cooperative event(s) is taking place. Increments are done 74 * in write-mode. Whereas, userfaultfd operations, which includes 75 * reading mmap_changing, is done under read-mode. 76 */ 77 struct rw_semaphore map_changing_lock; 78 /* memory mappings are changing because of non-cooperative event */ 79 atomic_t mmap_changing; 80 /* mm with one ore more vmas attached to this userfaultfd_ctx */ 81 struct mm_struct *mm; 82 }; 83 84 extern vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason); 85 86 /* VMA userfaultfd operations */ 87 struct vm_uffd_ops { 88 /* Checks if a VMA can support userfaultfd */ 89 bool (*can_userfault)(struct vm_area_struct *vma, vm_flags_t vm_flags); 90 /* 91 * Called to resolve UFFDIO_CONTINUE request. 92 * Should return the folio found at pgoff in the VMA's pagecache if it 93 * exists or ERR_PTR otherwise. 94 * The returned folio is locked and with reference held. 95 */ 96 struct folio *(*get_folio_noalloc)(struct inode *inode, pgoff_t pgoff); 97 /* 98 * Called during resolution of UFFDIO_COPY request. 99 * Should allocate and return a folio or NULL if allocation fails. 100 */ 101 struct folio *(*alloc_folio)(struct vm_area_struct *vma, 102 unsigned long addr); 103 /* 104 * Called during resolution of UFFDIO_COPY request. 105 * Should only be called with a folio returned by alloc_folio() above. 106 * The folio will be set to locked. 107 * Returns 0 on success, error code on failure. 108 */ 109 int (*filemap_add)(struct folio *folio, struct vm_area_struct *vma, 110 unsigned long addr); 111 /* 112 * Called during resolution of UFFDIO_COPY request on the error 113 * handling path. 114 * Should revert the operation of ->filemap_add(). 115 */ 116 void (*filemap_remove)(struct folio *folio, struct vm_area_struct *vma); 117 }; 118 119 /* A combined operation mode + behavior flags. */ 120 typedef unsigned int __bitwise uffd_flags_t; 121 122 /* Mutually exclusive modes of operation. */ 123 enum mfill_atomic_mode { 124 MFILL_ATOMIC_COPY, 125 MFILL_ATOMIC_ZEROPAGE, 126 MFILL_ATOMIC_CONTINUE, 127 MFILL_ATOMIC_POISON, 128 NR_MFILL_ATOMIC_MODES, 129 }; 130 131 #define MFILL_ATOMIC_MODE_BITS (const_ilog2(NR_MFILL_ATOMIC_MODES - 1) + 1) 132 #define MFILL_ATOMIC_BIT(nr) BIT(MFILL_ATOMIC_MODE_BITS + (nr)) 133 #define MFILL_ATOMIC_FLAG(nr) ((__force uffd_flags_t) MFILL_ATOMIC_BIT(nr)) 134 #define MFILL_ATOMIC_MODE_MASK ((__force uffd_flags_t) (MFILL_ATOMIC_BIT(0) - 1)) 135 136 static inline bool uffd_flags_mode_is(uffd_flags_t flags, enum mfill_atomic_mode expected) 137 { 138 return (flags & MFILL_ATOMIC_MODE_MASK) == ((__force uffd_flags_t) expected); 139 } 140 141 static inline uffd_flags_t uffd_flags_set_mode(uffd_flags_t flags, enum mfill_atomic_mode mode) 142 { 143 flags &= ~MFILL_ATOMIC_MODE_MASK; 144 return flags | ((__force uffd_flags_t) mode); 145 } 146 147 /* Flags controlling behavior. These behavior changes are mode-independent. */ 148 #define MFILL_ATOMIC_WP MFILL_ATOMIC_FLAG(0) 149 150 extern ssize_t mfill_atomic_copy(struct userfaultfd_ctx *ctx, unsigned long dst_start, 151 unsigned long src_start, unsigned long len, 152 uffd_flags_t flags); 153 extern ssize_t mfill_atomic_zeropage(struct userfaultfd_ctx *ctx, 154 unsigned long dst_start, 155 unsigned long len); 156 extern ssize_t mfill_atomic_continue(struct userfaultfd_ctx *ctx, unsigned long dst_start, 157 unsigned long len, uffd_flags_t flags); 158 extern ssize_t mfill_atomic_poison(struct userfaultfd_ctx *ctx, unsigned long start, 159 unsigned long len, uffd_flags_t flags); 160 extern int mwriteprotect_range(struct userfaultfd_ctx *ctx, unsigned long start, 161 unsigned long len, bool enable_wp); 162 extern long uffd_wp_range(struct vm_area_struct *vma, 163 unsigned long start, unsigned long len, bool enable_wp); 164 165 /* move_pages */ 166 void double_pt_lock(spinlock_t *ptl1, spinlock_t *ptl2); 167 void double_pt_unlock(spinlock_t *ptl1, spinlock_t *ptl2); 168 ssize_t move_pages(struct userfaultfd_ctx *ctx, unsigned long dst_start, 169 unsigned long src_start, unsigned long len, __u64 flags); 170 int move_pages_huge_pmd(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, pmd_t dst_pmdval, 171 struct vm_area_struct *dst_vma, 172 struct vm_area_struct *src_vma, 173 unsigned long dst_addr, unsigned long src_addr); 174 175 /* mm helpers */ 176 static inline bool is_mergeable_vm_userfaultfd_ctx(struct vm_area_struct *vma, 177 struct vm_userfaultfd_ctx vm_ctx) 178 { 179 return vma->vm_userfaultfd_ctx.ctx == vm_ctx.ctx; 180 } 181 182 /* 183 * Never enable huge pmd sharing on some uffd registered vmas: 184 * 185 * - VM_UFFD_WP VMAs, because write protect information is per pgtable entry. 186 * 187 * - VM_UFFD_MINOR VMAs, because otherwise we would never get minor faults for 188 * VMAs which share huge pmds. (If you have two mappings to the same 189 * underlying pages, and fault in the non-UFFD-registered one with a write, 190 * with huge pmd sharing this would *also* setup the second UFFD-registered 191 * mapping, and we'd not get minor faults.) 192 */ 193 static inline bool uffd_disable_huge_pmd_share(struct vm_area_struct *vma) 194 { 195 return vma->vm_flags & (VM_UFFD_WP | VM_UFFD_MINOR); 196 } 197 198 /* 199 * Don't do fault around for either WP or MINOR registered uffd range. For 200 * MINOR registered range, fault around will be a total disaster and ptes can 201 * be installed without notifications; for WP it should mostly be fine as long 202 * as the fault around checks for pte_none() before the installation, however 203 * to be super safe we just forbid it. 204 */ 205 static inline bool uffd_disable_fault_around(struct vm_area_struct *vma) 206 { 207 return vma->vm_flags & (VM_UFFD_WP | VM_UFFD_MINOR); 208 } 209 210 static inline bool userfaultfd_missing(struct vm_area_struct *vma) 211 { 212 return vma->vm_flags & VM_UFFD_MISSING; 213 } 214 215 static inline bool userfaultfd_wp(struct vm_area_struct *vma) 216 { 217 return vma->vm_flags & VM_UFFD_WP; 218 } 219 220 static inline bool userfaultfd_minor(struct vm_area_struct *vma) 221 { 222 return vma->vm_flags & VM_UFFD_MINOR; 223 } 224 225 static inline bool userfaultfd_pte_wp(struct vm_area_struct *vma, 226 pte_t pte) 227 { 228 return userfaultfd_wp(vma) && pte_uffd_wp(pte); 229 } 230 231 static inline bool userfaultfd_huge_pmd_wp(struct vm_area_struct *vma, 232 pmd_t pmd) 233 { 234 return userfaultfd_wp(vma) && pmd_uffd_wp(pmd); 235 } 236 237 static inline bool userfaultfd_armed(struct vm_area_struct *vma) 238 { 239 return vma->vm_flags & __VM_UFFD_FLAGS; 240 } 241 242 bool vma_can_userfault(struct vm_area_struct *vma, vm_flags_t vm_flags, 243 bool wp_async); 244 245 static inline bool vma_has_uffd_without_event_remap(struct vm_area_struct *vma) 246 { 247 struct userfaultfd_ctx *uffd_ctx = vma->vm_userfaultfd_ctx.ctx; 248 249 return uffd_ctx && (uffd_ctx->features & UFFD_FEATURE_EVENT_REMAP) == 0; 250 } 251 252 extern int dup_userfaultfd(struct vm_area_struct *, struct list_head *); 253 extern void dup_userfaultfd_complete(struct list_head *); 254 void dup_userfaultfd_fail(struct list_head *); 255 256 extern void mremap_userfaultfd_prep(struct vm_area_struct *, 257 struct vm_userfaultfd_ctx *); 258 extern void mremap_userfaultfd_complete(struct vm_userfaultfd_ctx *, 259 unsigned long from, unsigned long to, 260 unsigned long len); 261 void mremap_userfaultfd_fail(struct vm_userfaultfd_ctx *); 262 263 extern bool userfaultfd_remove(struct vm_area_struct *vma, 264 unsigned long start, 265 unsigned long end); 266 267 extern int userfaultfd_unmap_prep(struct vm_area_struct *vma, 268 unsigned long start, unsigned long end, struct list_head *uf); 269 extern void userfaultfd_unmap_complete(struct mm_struct *mm, 270 struct list_head *uf); 271 extern bool userfaultfd_wp_unpopulated(struct vm_area_struct *vma); 272 extern bool userfaultfd_wp_async(struct vm_area_struct *vma); 273 274 void userfaultfd_reset_ctx(struct vm_area_struct *vma); 275 276 struct vm_area_struct *userfaultfd_clear_vma(struct vma_iterator *vmi, 277 struct vm_area_struct *prev, 278 struct vm_area_struct *vma, 279 unsigned long start, 280 unsigned long end); 281 282 int userfaultfd_register_range(struct userfaultfd_ctx *ctx, 283 struct vm_area_struct *vma, 284 vm_flags_t vm_flags, 285 unsigned long start, unsigned long end, 286 bool wp_async); 287 288 void userfaultfd_release_new(struct userfaultfd_ctx *ctx); 289 290 void userfaultfd_release_all(struct mm_struct *mm, 291 struct userfaultfd_ctx *ctx); 292 293 static inline bool userfaultfd_wp_use_markers(struct vm_area_struct *vma) 294 { 295 /* Only wr-protect mode uses pte markers */ 296 if (!userfaultfd_wp(vma)) 297 return false; 298 299 /* File-based uffd-wp always need markers */ 300 if (!vma_is_anonymous(vma)) 301 return true; 302 303 /* 304 * Anonymous uffd-wp only needs the markers if WP_UNPOPULATED 305 * enabled (to apply markers on zero pages). 306 */ 307 return userfaultfd_wp_unpopulated(vma); 308 } 309 310 /* 311 * Returns true if this is a swap pte and was uffd-wp wr-protected in either 312 * forms (pte marker or a normal swap pte), false otherwise. 313 */ 314 static inline bool pte_swp_uffd_wp_any(pte_t pte) 315 { 316 if (!uffd_supports_wp_marker()) 317 return false; 318 319 if (pte_present(pte)) 320 return false; 321 322 if (pte_swp_uffd_wp(pte)) 323 return true; 324 325 if (pte_is_uffd_wp_marker(pte)) 326 return true; 327 328 return false; 329 } 330 #else /* CONFIG_USERFAULTFD */ 331 332 /* mm helpers */ 333 static inline vm_fault_t handle_userfault(struct vm_fault *vmf, 334 unsigned long reason) 335 { 336 return VM_FAULT_SIGBUS; 337 } 338 339 static inline long uffd_wp_range(struct vm_area_struct *vma, 340 unsigned long start, unsigned long len, 341 bool enable_wp) 342 { 343 return false; 344 } 345 346 static inline bool is_mergeable_vm_userfaultfd_ctx(struct vm_area_struct *vma, 347 struct vm_userfaultfd_ctx vm_ctx) 348 { 349 return true; 350 } 351 352 static inline bool userfaultfd_missing(struct vm_area_struct *vma) 353 { 354 return false; 355 } 356 357 static inline bool userfaultfd_wp(struct vm_area_struct *vma) 358 { 359 return false; 360 } 361 362 static inline bool userfaultfd_minor(struct vm_area_struct *vma) 363 { 364 return false; 365 } 366 367 static inline bool userfaultfd_pte_wp(struct vm_area_struct *vma, 368 pte_t pte) 369 { 370 return false; 371 } 372 373 static inline bool userfaultfd_huge_pmd_wp(struct vm_area_struct *vma, 374 pmd_t pmd) 375 { 376 return false; 377 } 378 379 380 static inline bool userfaultfd_armed(struct vm_area_struct *vma) 381 { 382 return false; 383 } 384 385 static inline int dup_userfaultfd(struct vm_area_struct *vma, 386 struct list_head *l) 387 { 388 return 0; 389 } 390 391 static inline void dup_userfaultfd_complete(struct list_head *l) 392 { 393 } 394 395 static inline void dup_userfaultfd_fail(struct list_head *l) 396 { 397 } 398 399 static inline void mremap_userfaultfd_prep(struct vm_area_struct *vma, 400 struct vm_userfaultfd_ctx *ctx) 401 { 402 } 403 404 static inline void mremap_userfaultfd_complete(struct vm_userfaultfd_ctx *ctx, 405 unsigned long from, 406 unsigned long to, 407 unsigned long len) 408 { 409 } 410 411 static inline void mremap_userfaultfd_fail(struct vm_userfaultfd_ctx *ctx) 412 { 413 } 414 415 static inline bool userfaultfd_remove(struct vm_area_struct *vma, 416 unsigned long start, 417 unsigned long end) 418 { 419 return true; 420 } 421 422 static inline int userfaultfd_unmap_prep(struct vm_area_struct *vma, 423 unsigned long start, unsigned long end, 424 struct list_head *uf) 425 { 426 return 0; 427 } 428 429 static inline void userfaultfd_unmap_complete(struct mm_struct *mm, 430 struct list_head *uf) 431 { 432 } 433 434 static inline bool uffd_disable_fault_around(struct vm_area_struct *vma) 435 { 436 return false; 437 } 438 439 static inline bool userfaultfd_wp_unpopulated(struct vm_area_struct *vma) 440 { 441 return false; 442 } 443 444 static inline bool userfaultfd_wp_async(struct vm_area_struct *vma) 445 { 446 return false; 447 } 448 449 static inline bool vma_has_uffd_without_event_remap(struct vm_area_struct *vma) 450 { 451 return false; 452 } 453 454 static inline bool userfaultfd_wp_use_markers(struct vm_area_struct *vma) 455 { 456 return false; 457 } 458 459 /* 460 * Returns true if this is a swap pte and was uffd-wp wr-protected in either 461 * forms (pte marker or a normal swap pte), false otherwise. 462 */ 463 static inline bool pte_swp_uffd_wp_any(pte_t pte) 464 { 465 return false; 466 } 467 #endif /* CONFIG_USERFAULTFD */ 468 #endif /* _LINUX_USERFAULTFD_K_H */ 469