/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_RMAP_H
#define _LINUX_RMAP_H
/*
 * Declarations for Reverse Mapping functions in mm/rmap.c
 */

#include <linux/list.h>
#include <linux/slab.h>
#include <linux/mm.h>
#include <linux/rwsem.h>
#include <linux/memcontrol.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/memremap.h>
#include <linux/bit_spinlock.h>

/*
 * The anon_vma heads a list of private "related" vmas, to scan if
 * an anonymous page pointing to this anon_vma needs to be unmapped:
 * the vmas on the list will be related by forking, or by splitting.
 *
 * Since vmas come and go as they are split and merged (particularly
 * in mprotect), the mapping field of an anonymous page cannot point
 * directly to a vma: instead it points to an anon_vma, on whose list
 * the related vmas can be easily linked or unlinked.
 *
 * After unlinking the last vma on the list, we must garbage collect
 * the anon_vma object itself: we're guaranteed no page can be
 * pointing to this anon_vma once its vma list is empty.
 */
struct anon_vma {
	struct anon_vma *root;		/* Root of this anon_vma tree */
	struct rw_semaphore rwsem;	/* W: modification, R: walking the list */
	/*
	 * The refcount is taken on an anon_vma when there is no
	 * guarantee that the vma of page tables will exist for
	 * the duration of the operation. A caller that takes
	 * the reference is responsible for clearing up the
	 * anon_vma if they are the last user on release.
	 */
	atomic_t refcount;

	/*
	 * Count of child anon_vmas. Equal to the count of all anon_vmas that
	 * have ->parent pointing to this one, including itself.
	 *
	 * This counter is used for making decisions about reusing an anon_vma
	 * instead of forking a new one. See the comments in anon_vma_clone().
	 */
	unsigned long num_children;
	/* Count of VMAs whose ->anon_vma pointer points to this object. */
	unsigned long num_active_vmas;

	struct anon_vma *parent;	/* Parent of this anon_vma */

	/*
	 * NOTE: the LSB of the rb_root.rb_node is set by
	 * mm_take_all_locks() _after_ taking the above lock. So the
	 * rb_root must only be read/written after taking the above lock
	 * to be sure to see a valid next pointer. The LSB bit itself
	 * is serialized by a system wide lock only visible to
	 * mm_take_all_locks() (mm_all_locks_mutex).
	 */

	/* Interval tree of private "related" vmas */
	struct rb_root_cached rb_root;
};
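/*
 * Minimal usage sketch (illustrative only): walking the VMAs related to an
 * anon_vma takes the root rwsem for reading, while linking or unlinking VMAs
 * takes it for writing. anon_vma_interval_tree_foreach() is assumed to be the
 * interval-tree iterator declared elsewhere in the kernel, and "avc" is a
 * struct anon_vma_chain pointer (see below):
 *
 *	down_read(&anon_vma->root->rwsem);
 *	anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) {
 *		struct vm_area_struct *vma = avc->vma;
 *		...
 *	}
 *	up_read(&anon_vma->root->rwsem);
 *
 * A caller that cannot guarantee the anon_vma stays alive for the duration of
 * the walk must additionally hold a reference via the refcount above.
 */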
/*
 * The copy-on-write semantics of fork mean that an anon_vma
 * can become associated with multiple processes. Furthermore,
 * each child process will have its own anon_vma, where new
 * pages for that process are instantiated.
 *
 * This structure allows us to find the anon_vmas associated
 * with a VMA, or the VMAs associated with an anon_vma.
 * The "same_vma" list contains the anon_vma_chains linking
 * all the anon_vmas associated with this VMA.
 * The "rb" field indexes on an interval tree the anon_vma_chains
 * which link all the VMAs associated with this anon_vma.
 */
struct anon_vma_chain {
	struct vm_area_struct *vma;
	struct anon_vma *anon_vma;
	struct list_head same_vma;	/* locked by mmap_lock & page_table_lock */
	struct rb_node rb;		/* locked by anon_vma->rwsem */
	unsigned long rb_subtree_last;
#ifdef CONFIG_DEBUG_VM_RB
	unsigned long cached_vma_start, cached_vma_last;
#endif
};

enum ttu_flags {
	TTU_USE_SHARED_ZEROPAGE	= 0x2,	/* for unused pages of large folios */
	TTU_SPLIT_HUGE_PMD	= 0x4,	/* split huge PMD if any */
	TTU_IGNORE_MLOCK	= 0x8,	/* ignore mlock */
	TTU_SYNC		= 0x10,	/* avoid racy checks with PVMW_SYNC */
	TTU_HWPOISON		= 0x20,	/* do convert pte to hwpoison entry */
	TTU_BATCH_FLUSH		= 0x40,	/* Batch TLB flushes where possible
					 * and caller guarantees they will
					 * do a final flush if necessary */
	TTU_RMAP_LOCKED		= 0x80,	/* do not grab rmap lock:
					 * caller holds it */
};

#ifdef CONFIG_MMU

void anon_vma_init(void);	/* create anon_vma_cachep */

#ifdef CONFIG_MM_ID
static __always_inline void folio_lock_large_mapcount(struct folio *folio)
{
	bit_spin_lock(FOLIO_MM_IDS_LOCK_BITNUM, &folio->_mm_ids);
}

static __always_inline void folio_unlock_large_mapcount(struct folio *folio)
{
	__bit_spin_unlock(FOLIO_MM_IDS_LOCK_BITNUM, &folio->_mm_ids);
}

static inline unsigned int folio_mm_id(const struct folio *folio, int idx)
{
	VM_WARN_ON_ONCE(idx != 0 && idx != 1);
	return folio->_mm_id[idx] & MM_ID_MASK;
}

static inline void folio_set_mm_id(struct folio *folio, int idx, mm_id_t id)
{
	VM_WARN_ON_ONCE(idx != 0 && idx != 1);
	folio->_mm_id[idx] &= ~MM_ID_MASK;
	folio->_mm_id[idx] |= id;
}

static inline void __folio_large_mapcount_sanity_checks(const struct folio *folio,
		int diff, mm_id_t mm_id)
{
	VM_WARN_ON_ONCE(!folio_test_large(folio) || folio_test_hugetlb(folio));
	VM_WARN_ON_ONCE(diff <= 0);
	VM_WARN_ON_ONCE(mm_id < MM_ID_MIN || mm_id > MM_ID_MAX);

	/*
	 * Make sure we can detect at least one complete PTE mapping of the
	 * folio in a single MM as "exclusively mapped". This is primarily
	 * a check on 32bit, where we currently reduce the size of the per-MM
	 * mapcount to a short.
	 */
	VM_WARN_ON_ONCE(diff > folio_large_nr_pages(folio));
	VM_WARN_ON_ONCE(folio_large_nr_pages(folio) - 1 > MM_ID_MAPCOUNT_MAX);

	VM_WARN_ON_ONCE(folio_mm_id(folio, 0) == MM_ID_DUMMY &&
			folio->_mm_id_mapcount[0] != -1);
	VM_WARN_ON_ONCE(folio_mm_id(folio, 0) != MM_ID_DUMMY &&
			folio->_mm_id_mapcount[0] < 0);
	VM_WARN_ON_ONCE(folio_mm_id(folio, 1) == MM_ID_DUMMY &&
			folio->_mm_id_mapcount[1] != -1);
	VM_WARN_ON_ONCE(folio_mm_id(folio, 1) != MM_ID_DUMMY &&
			folio->_mm_id_mapcount[1] < 0);
	VM_WARN_ON_ONCE(!folio_mapped(folio) &&
			test_bit(FOLIO_MM_IDS_SHARED_BITNUM, &folio->_mm_ids));
}
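/*
 * Worked example (illustrative only) of the two-slot per-MM tracking
 * implemented by the helpers below: consider an order-9 folio (512 pages)
 * fully PTE-mapped by a single MM. Since mapcounts start at -1, the stored
 * _large_mapcount is 511, slot 0 holds that MM's id with
 * _mm_id_mapcount[0] == 511, and the folio counts as exclusively mapped.
 * If a second MM maps all 512 pages as well (e.g. after fork()), the stored
 * _large_mapcount becomes 1023 (1024 mappings), slot 1 takes the second MM's
 * id with _mm_id_mapcount[1] == 511, and FOLIO_MM_IDS_SHARED_BIT gets set.
 * Once either MM unmaps all of its pages again, the remaining slot owns all
 * mappings and the shared bit is cleared.
 */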
static __always_inline void folio_set_large_mapcount(struct folio *folio,
		int mapcount, struct vm_area_struct *vma)
{
	__folio_large_mapcount_sanity_checks(folio, mapcount, vma->vm_mm->mm_id);

	VM_WARN_ON_ONCE(folio_mm_id(folio, 0) != MM_ID_DUMMY);
	VM_WARN_ON_ONCE(folio_mm_id(folio, 1) != MM_ID_DUMMY);

	/* Note: mapcounts start at -1. */
	atomic_set(&folio->_large_mapcount, mapcount - 1);
	folio->_mm_id_mapcount[0] = mapcount - 1;
	folio_set_mm_id(folio, 0, vma->vm_mm->mm_id);
}

static __always_inline int folio_add_return_large_mapcount(struct folio *folio,
		int diff, struct vm_area_struct *vma)
{
	const mm_id_t mm_id = vma->vm_mm->mm_id;
	int new_mapcount_val;

	folio_lock_large_mapcount(folio);
	__folio_large_mapcount_sanity_checks(folio, diff, mm_id);

	new_mapcount_val = atomic_read(&folio->_large_mapcount) + diff;
	atomic_set(&folio->_large_mapcount, new_mapcount_val);

	/*
	 * If a folio is mapped more than once into an MM on 32bit, we
	 * can in theory overflow the per-MM mapcount (although only for
	 * fairly large folios), turning it negative. In that case, just
	 * free up the slot and mark the folio "mapped shared", otherwise
	 * we might be in trouble when unmapping pages later.
	 */
	if (folio_mm_id(folio, 0) == mm_id) {
		folio->_mm_id_mapcount[0] += diff;
		if (!IS_ENABLED(CONFIG_64BIT) && unlikely(folio->_mm_id_mapcount[0] < 0)) {
			folio->_mm_id_mapcount[0] = -1;
			folio_set_mm_id(folio, 0, MM_ID_DUMMY);
			folio->_mm_ids |= FOLIO_MM_IDS_SHARED_BIT;
		}
	} else if (folio_mm_id(folio, 1) == mm_id) {
		folio->_mm_id_mapcount[1] += diff;
		if (!IS_ENABLED(CONFIG_64BIT) && unlikely(folio->_mm_id_mapcount[1] < 0)) {
			folio->_mm_id_mapcount[1] = -1;
			folio_set_mm_id(folio, 1, MM_ID_DUMMY);
			folio->_mm_ids |= FOLIO_MM_IDS_SHARED_BIT;
		}
	} else if (folio_mm_id(folio, 0) == MM_ID_DUMMY) {
		folio_set_mm_id(folio, 0, mm_id);
		folio->_mm_id_mapcount[0] = diff - 1;
		/* We might have other mappings already. */
		if (new_mapcount_val != diff - 1)
			folio->_mm_ids |= FOLIO_MM_IDS_SHARED_BIT;
	} else if (folio_mm_id(folio, 1) == MM_ID_DUMMY) {
		folio_set_mm_id(folio, 1, mm_id);
		folio->_mm_id_mapcount[1] = diff - 1;
		/* Slot 0 certainly has mappings as well. */
		folio->_mm_ids |= FOLIO_MM_IDS_SHARED_BIT;
	}
	folio_unlock_large_mapcount(folio);
	return new_mapcount_val + 1;
}
#define folio_add_large_mapcount folio_add_return_large_mapcount
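/*
 * The value returned above is the total number of tracked mappings of the
 * folio after the update (the stored counter starts at -1). An illustrative,
 * hypothetical caller pattern, not necessarily how mm/rmap.c uses it:
 *
 *	if (folio_add_return_large_mapcount(folio, nr_pages, vma) == nr_pages)
 *		// the folio had no mappings tracked by the large mapcount
 *		...
 */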
static __always_inline int folio_sub_return_large_mapcount(struct folio *folio,
		int diff, struct vm_area_struct *vma)
{
	const mm_id_t mm_id = vma->vm_mm->mm_id;
	int new_mapcount_val;

	folio_lock_large_mapcount(folio);
	__folio_large_mapcount_sanity_checks(folio, diff, mm_id);

	new_mapcount_val = atomic_read(&folio->_large_mapcount) - diff;
	atomic_set(&folio->_large_mapcount, new_mapcount_val);

	/*
	 * There are valid corner cases where we might underflow a per-MM
	 * mapcount (some mappings added when no slot was free, some mappings
	 * added once a slot was free), so we always set it to -1 once we go
	 * negative.
	 */
	if (folio_mm_id(folio, 0) == mm_id) {
		folio->_mm_id_mapcount[0] -= diff;
		if (folio->_mm_id_mapcount[0] >= 0)
			goto out;
		folio->_mm_id_mapcount[0] = -1;
		folio_set_mm_id(folio, 0, MM_ID_DUMMY);
	} else if (folio_mm_id(folio, 1) == mm_id) {
		folio->_mm_id_mapcount[1] -= diff;
		if (folio->_mm_id_mapcount[1] >= 0)
			goto out;
		folio->_mm_id_mapcount[1] = -1;
		folio_set_mm_id(folio, 1, MM_ID_DUMMY);
	}

	/*
	 * If one MM slot owns all mappings, the folio is mapped exclusively.
	 * Note that if the folio is now unmapped (new_mapcount_val == -1), both
	 * slots must be free (mapcount == -1), and we'll also mark it as
	 * exclusive.
	 */
	if (folio->_mm_id_mapcount[0] == new_mapcount_val ||
	    folio->_mm_id_mapcount[1] == new_mapcount_val)
		folio->_mm_ids &= ~FOLIO_MM_IDS_SHARED_BIT;
out:
	folio_unlock_large_mapcount(folio);
	return new_mapcount_val + 1;
}
#define folio_sub_large_mapcount folio_sub_return_large_mapcount
#else /* !CONFIG_MM_ID */
/*
 * See __folio_rmap_sanity_checks(), we might map large folios even without
 * CONFIG_TRANSPARENT_HUGEPAGE. We'll keep that working for now.
 */
static inline void folio_set_large_mapcount(struct folio *folio, int mapcount,
		struct vm_area_struct *vma)
{
	/* Note: mapcounts start at -1. */
	atomic_set(&folio->_large_mapcount, mapcount - 1);
}

static inline void folio_add_large_mapcount(struct folio *folio,
		int diff, struct vm_area_struct *vma)
{
	atomic_add(diff, &folio->_large_mapcount);
}

static inline int folio_add_return_large_mapcount(struct folio *folio,
		int diff, struct vm_area_struct *vma)
{
	BUILD_BUG();
}

static inline void folio_sub_large_mapcount(struct folio *folio,
		int diff, struct vm_area_struct *vma)
{
	atomic_sub(diff, &folio->_large_mapcount);
}

static inline int folio_sub_return_large_mapcount(struct folio *folio,
		int diff, struct vm_area_struct *vma)
{
	BUILD_BUG();
}
#endif /* CONFIG_MM_ID */

#define folio_inc_large_mapcount(folio, vma) \
	folio_add_large_mapcount(folio, 1, vma)
#define folio_inc_return_large_mapcount(folio, vma) \
	folio_add_return_large_mapcount(folio, 1, vma)
#define folio_dec_large_mapcount(folio, vma) \
	folio_sub_large_mapcount(folio, 1, vma)
#define folio_dec_return_large_mapcount(folio, vma) \
	folio_sub_return_large_mapcount(folio, 1, vma)

/* RMAP flags, currently only relevant for some anon rmap operations. */
typedef int __bitwise rmap_t;

/*
 * No special request: A mapped anonymous (sub)page is possibly shared between
 * processes.
 */
#define RMAP_NONE		((__force rmap_t)0)

/* The anonymous (sub)page is exclusive to a single process. */
#define RMAP_EXCLUSIVE		((__force rmap_t)BIT(0))
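/*
 * Illustrative sketch only: a caller mapping a freshly allocated anonymous
 * folio into a single process would typically pass RMAP_EXCLUSIVE so the
 * pages are marked PageAnonExclusive, e.g. (addr being the caller's fault
 * address):
 *
 *	folio_add_new_anon_rmap(folio, vma, addr, RMAP_EXCLUSIVE);
 *
 * RMAP_NONE is used when the mapped (sub)page may already be shared between
 * processes.
 */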
static __always_inline void __folio_rmap_sanity_checks(const struct folio *folio,
		const struct page *page, int nr_pages, enum pgtable_level level)
{
	/* hugetlb folios are handled separately. */
	VM_WARN_ON_FOLIO(folio_test_hugetlb(folio), folio);

	/* When (un)mapping zeropages, we should never touch ref+mapcount. */
	VM_WARN_ON_FOLIO(is_zero_folio(folio), folio);

	/*
	 * TODO: we get driver-allocated folios that have nothing to do with
	 * the rmap using vm_insert_page(); therefore, we cannot assume that
	 * folio_test_large_rmappable() holds for large folios. We should
	 * handle any desired mapcount+stats accounting for these folios in
	 * VM_MIXEDMAP VMAs separately, and then sanity-check here that
	 * we really only get rmappable folios.
	 */

	VM_WARN_ON_ONCE(nr_pages <= 0);
	VM_WARN_ON_FOLIO(page_folio(page) != folio, folio);
	VM_WARN_ON_FOLIO(page_folio(page + nr_pages - 1) != folio, folio);

	switch (level) {
	case PGTABLE_LEVEL_PTE:
		break;
	case PGTABLE_LEVEL_PMD:
		/*
		 * We don't support folios larger than a single PMD yet. So
		 * when PGTABLE_LEVEL_PMD is set, we assume that we are creating
		 * a single "entire" mapping of the folio.
		 */
		VM_WARN_ON_FOLIO(folio_nr_pages(folio) != HPAGE_PMD_NR, folio);
		VM_WARN_ON_FOLIO(nr_pages != HPAGE_PMD_NR, folio);
		break;
	case PGTABLE_LEVEL_PUD:
		/*
		 * Assume that we are creating a single "entire" mapping of the
		 * folio.
		 */
		VM_WARN_ON_FOLIO(folio_nr_pages(folio) != HPAGE_PUD_NR, folio);
		VM_WARN_ON_FOLIO(nr_pages != HPAGE_PUD_NR, folio);
		break;
	default:
		BUILD_BUG();
	}

	/*
	 * Anon folios must have an associated live anon_vma as long as they're
	 * mapped into userspace.
	 * Note that the atomic_read() mainly does two things:
	 *
	 * 1. In KASAN builds with CONFIG_SLUB_RCU_DEBUG, it causes KASAN to
	 *    check that the associated anon_vma has not yet been freed (subject
	 *    to KASAN's usual limitations). This check will pass if the
	 *    anon_vma's refcount has already dropped to 0 but an RCU grace
	 *    period hasn't passed since then.
	 * 2. If the anon_vma has not yet been freed, it checks that the
	 *    anon_vma still has a nonzero refcount (as opposed to being in the
	 *    middle of an RCU delay for getting freed).
	 */
	if (folio_test_anon(folio) && !folio_test_ksm(folio)) {
		unsigned long mapping = (unsigned long)folio->mapping;
		struct anon_vma *anon_vma;

		anon_vma = (void *)(mapping - FOLIO_MAPPING_ANON);
		VM_WARN_ON_FOLIO(atomic_read(&anon_vma->refcount) == 0, folio);
	}
}

/*
 * rmap interfaces called when adding or removing pte of page
 */
void folio_move_anon_rmap(struct folio *, struct vm_area_struct *);
void folio_add_anon_rmap_ptes(struct folio *, struct page *, int nr_pages,
		struct vm_area_struct *, unsigned long address, rmap_t flags);
#define folio_add_anon_rmap_pte(folio, page, vma, address, flags) \
	folio_add_anon_rmap_ptes(folio, page, 1, vma, address, flags)
void folio_add_anon_rmap_pmd(struct folio *, struct page *,
		struct vm_area_struct *, unsigned long address, rmap_t flags);
void folio_add_new_anon_rmap(struct folio *, struct vm_area_struct *,
		unsigned long address, rmap_t flags);
void folio_add_file_rmap_ptes(struct folio *, struct page *, int nr_pages,
		struct vm_area_struct *);
#define folio_add_file_rmap_pte(folio, page, vma) \
	folio_add_file_rmap_ptes(folio, page, 1, vma)
void folio_add_file_rmap_pmd(struct folio *, struct page *,
		struct vm_area_struct *);
void folio_add_file_rmap_pud(struct folio *, struct page *,
		struct vm_area_struct *);
void folio_remove_rmap_ptes(struct folio *, struct page *, int nr_pages,
		struct vm_area_struct *);
#define folio_remove_rmap_pte(folio, page, vma) \
	folio_remove_rmap_ptes(folio, page, 1, vma)
void folio_remove_rmap_pmd(struct folio *, struct page *,
		struct vm_area_struct *);
void folio_remove_rmap_pud(struct folio *, struct page *,
		struct vm_area_struct *);

void hugetlb_add_anon_rmap(struct folio *, struct vm_area_struct *,
		unsigned long address, rmap_t flags);
void hugetlb_add_new_anon_rmap(struct folio *, struct vm_area_struct *,
		unsigned long address);
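/*
 * Illustrative pairing sketch only (the real callers live in mm/ and are more
 * involved): a PTE that maps a page of an anon folio is accounted via
 * folio_add_anon_rmap_pte() when it is established and via
 * folio_remove_rmap_pte() after it has been cleared, both under the page
 * table lock:
 *
 *	folio_add_anon_rmap_pte(folio, page, vma, addr, RMAP_NONE);
 *	...
 *	pteval = ptep_get_and_clear(vma->vm_mm, addr, ptep);
 *	folio_remove_rmap_pte(folio, page, vma);
 */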
/* See folio_try_dup_anon_rmap_*() */
static inline int hugetlb_try_dup_anon_rmap(struct folio *folio,
		struct vm_area_struct *vma)
{
	VM_WARN_ON_FOLIO(!folio_test_hugetlb(folio), folio);
	VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio);

	if (PageAnonExclusive(&folio->page)) {
		if (unlikely(folio_needs_cow_for_dma(vma, folio)))
			return -EBUSY;
		ClearPageAnonExclusive(&folio->page);
	}
	atomic_inc(&folio->_entire_mapcount);
	atomic_inc(&folio->_large_mapcount);
	return 0;
}

/* See folio_try_share_anon_rmap_*() */
static inline int hugetlb_try_share_anon_rmap(struct folio *folio)
{
	VM_WARN_ON_FOLIO(!folio_test_hugetlb(folio), folio);
	VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio);
	VM_WARN_ON_FOLIO(!PageAnonExclusive(&folio->page), folio);

	/* Paired with the memory barrier in try_grab_folio(). */
	if (IS_ENABLED(CONFIG_HAVE_GUP_FAST))
		smp_mb();

	if (unlikely(folio_maybe_dma_pinned(folio)))
		return -EBUSY;
	ClearPageAnonExclusive(&folio->page);

	/*
	 * This is conceptually a smp_wmb() paired with the smp_rmb() in
	 * gup_must_unshare().
	 */
	if (IS_ENABLED(CONFIG_HAVE_GUP_FAST))
		smp_mb__after_atomic();
	return 0;
}

static inline void hugetlb_add_file_rmap(struct folio *folio)
{
	VM_WARN_ON_FOLIO(!folio_test_hugetlb(folio), folio);
	VM_WARN_ON_FOLIO(folio_test_anon(folio), folio);

	atomic_inc(&folio->_entire_mapcount);
	atomic_inc(&folio->_large_mapcount);
}

static inline void hugetlb_remove_rmap(struct folio *folio)
{
	VM_WARN_ON_FOLIO(!folio_test_hugetlb(folio), folio);

	atomic_dec(&folio->_entire_mapcount);
	atomic_dec(&folio->_large_mapcount);
}

static __always_inline void __folio_dup_file_rmap(struct folio *folio,
		struct page *page, int nr_pages, struct vm_area_struct *dst_vma,
		enum pgtable_level level)
{
	const int orig_nr_pages = nr_pages;

	__folio_rmap_sanity_checks(folio, page, nr_pages, level);

	switch (level) {
	case PGTABLE_LEVEL_PTE:
		if (!folio_test_large(folio)) {
			atomic_inc(&folio->_mapcount);
			break;
		}

		if (IS_ENABLED(CONFIG_PAGE_MAPCOUNT)) {
			do {
				atomic_inc(&page->_mapcount);
			} while (page++, --nr_pages > 0);
		}
		folio_add_large_mapcount(folio, orig_nr_pages, dst_vma);
		break;
	case PGTABLE_LEVEL_PMD:
	case PGTABLE_LEVEL_PUD:
		atomic_inc(&folio->_entire_mapcount);
		folio_inc_large_mapcount(folio, dst_vma);
		break;
	default:
		BUILD_BUG();
	}
}
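/*
 * Worked example (illustrative only) of the bookkeeping done by the helper
 * above: duplicating 16 PTE mappings of a large folio bumps _large_mapcount
 * by 16 and, only when CONFIG_PAGE_MAPCOUNT is enabled, each of the 16
 * per-page _mapcount values by one. A PMD or PUD duplication instead bumps
 * _entire_mapcount and _large_mapcount by one each.
 */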
/**
 * folio_dup_file_rmap_ptes - duplicate PTE mappings of a page range of a folio
 * @folio: The folio to duplicate the mappings of
 * @page: The first page to duplicate the mappings of
 * @nr_pages: The number of pages of which the mapping will be duplicated
 * @dst_vma: The destination vm area
 *
 * The page range of the folio is defined by [page, page + nr_pages)
 *
 * The caller needs to hold the page table lock.
 */
static inline void folio_dup_file_rmap_ptes(struct folio *folio,
		struct page *page, int nr_pages, struct vm_area_struct *dst_vma)
{
	__folio_dup_file_rmap(folio, page, nr_pages, dst_vma, PGTABLE_LEVEL_PTE);
}

static __always_inline void folio_dup_file_rmap_pte(struct folio *folio,
		struct page *page, struct vm_area_struct *dst_vma)
{
	__folio_dup_file_rmap(folio, page, 1, dst_vma, PGTABLE_LEVEL_PTE);
}

/**
 * folio_dup_file_rmap_pmd - duplicate a PMD mapping of a page range of a folio
 * @folio: The folio to duplicate the mapping of
 * @page: The first page to duplicate the mapping of
 * @dst_vma: The destination vm area
 *
 * The page range of the folio is defined by [page, page + HPAGE_PMD_NR)
 *
 * The caller needs to hold the page table lock.
 */
static inline void folio_dup_file_rmap_pmd(struct folio *folio,
		struct page *page, struct vm_area_struct *dst_vma)
{
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	__folio_dup_file_rmap(folio, page, HPAGE_PMD_NR, dst_vma, PGTABLE_LEVEL_PMD);
#else
	WARN_ON_ONCE(true);
#endif
}
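/*
 * Illustrative sketch of the intended split between the "file" and "anon"
 * duplication helpers on a fork()-style copy path; the real logic lives in
 * mm/ and is more involved. copy_for_child() is a placeholder for the
 * caller's fallback path:
 *
 *	if (folio_test_anon(folio)) {
 *		if (folio_try_dup_anon_rmap_pte(folio, page, dst_vma, src_vma))
 *			return copy_for_child();	// may be pinned: copy
 *	} else {
 *		folio_dup_file_rmap_pte(folio, page, dst_vma);	// never fails
 *	}
 */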
static __always_inline int __folio_try_dup_anon_rmap(struct folio *folio,
		struct page *page, int nr_pages, struct vm_area_struct *dst_vma,
		struct vm_area_struct *src_vma, enum pgtable_level level)
{
	const int orig_nr_pages = nr_pages;
	bool maybe_pinned;
	int i;

	VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio);
	__folio_rmap_sanity_checks(folio, page, nr_pages, level);

	/*
	 * If this folio may have been pinned by the parent process,
	 * don't allow duplicating the mappings but instead require, e.g.,
	 * copying the subpage immediately for the child so that we'll always
	 * guarantee the pinned folio won't be randomly replaced in the
	 * future on write faults.
	 */
	maybe_pinned = likely(!folio_is_device_private(folio)) &&
		       unlikely(folio_needs_cow_for_dma(src_vma, folio));

	/*
	 * No need to check+clear for already shared PTEs/PMDs of the
	 * folio. But if any page is PageAnonExclusive, we must fall back to
	 * copying if the folio may be pinned.
	 */
	switch (level) {
	case PGTABLE_LEVEL_PTE:
		if (unlikely(maybe_pinned)) {
			for (i = 0; i < nr_pages; i++)
				if (PageAnonExclusive(page + i))
					return -EBUSY;
		}

		if (!folio_test_large(folio)) {
			if (PageAnonExclusive(page))
				ClearPageAnonExclusive(page);
			atomic_inc(&folio->_mapcount);
			break;
		}

		do {
			if (PageAnonExclusive(page))
				ClearPageAnonExclusive(page);
			if (IS_ENABLED(CONFIG_PAGE_MAPCOUNT))
				atomic_inc(&page->_mapcount);
		} while (page++, --nr_pages > 0);
		folio_add_large_mapcount(folio, orig_nr_pages, dst_vma);
		break;
	case PGTABLE_LEVEL_PMD:
	case PGTABLE_LEVEL_PUD:
		if (PageAnonExclusive(page)) {
			if (unlikely(maybe_pinned))
				return -EBUSY;
			ClearPageAnonExclusive(page);
		}
		atomic_inc(&folio->_entire_mapcount);
		folio_inc_large_mapcount(folio, dst_vma);
		break;
	default:
		BUILD_BUG();
	}
	return 0;
}

/**
 * folio_try_dup_anon_rmap_ptes - try duplicating PTE mappings of a page range
 *				  of a folio
 * @folio: The folio to duplicate the mappings of
 * @page: The first page to duplicate the mappings of
 * @nr_pages: The number of pages of which the mapping will be duplicated
 * @dst_vma: The destination vm area
 * @src_vma: The vm area from which the mappings are duplicated
 *
 * The page range of the folio is defined by [page, page + nr_pages)
 *
 * The caller needs to hold the page table lock and the
 * vma->vm_mm->write_protect_seq.
 *
 * Duplicating the mappings can only fail if the folio may be pinned; device
 * private folios cannot get pinned and consequently this function cannot fail
 * for them.
 *
 * If duplicating the mappings succeeded, the duplicated PTEs have to be R/O in
 * the parent and the child. They must *not* be writable after this call
 * succeeded.
 *
 * Returns 0 if duplicating the mappings succeeded. Returns -EBUSY otherwise.
 */
static inline int folio_try_dup_anon_rmap_ptes(struct folio *folio,
		struct page *page, int nr_pages, struct vm_area_struct *dst_vma,
		struct vm_area_struct *src_vma)
{
	return __folio_try_dup_anon_rmap(folio, page, nr_pages, dst_vma,
					 src_vma, PGTABLE_LEVEL_PTE);
}

static __always_inline int folio_try_dup_anon_rmap_pte(struct folio *folio,
		struct page *page, struct vm_area_struct *dst_vma,
		struct vm_area_struct *src_vma)
{
	return __folio_try_dup_anon_rmap(folio, page, 1, dst_vma, src_vma,
					 PGTABLE_LEVEL_PTE);
}
/**
 * folio_try_dup_anon_rmap_pmd - try duplicating a PMD mapping of a page range
 *				 of a folio
 * @folio: The folio to duplicate the mapping of
 * @page: The first page to duplicate the mapping of
 * @dst_vma: The destination vm area
 * @src_vma: The vm area from which the mapping is duplicated
 *
 * The page range of the folio is defined by [page, page + HPAGE_PMD_NR)
 *
 * The caller needs to hold the page table lock and the
 * vma->vm_mm->write_protect_seq.
 *
 * Duplicating the mapping can only fail if the folio may be pinned; device
 * private folios cannot get pinned and consequently this function cannot fail
 * for them.
 *
 * If duplicating the mapping succeeds, the duplicated PMD has to be R/O in
 * the parent and the child. They must *not* be writable after this call
 * succeeded.
 *
 * Returns 0 if duplicating the mapping succeeded. Returns -EBUSY otherwise.
 */
static inline int folio_try_dup_anon_rmap_pmd(struct folio *folio,
		struct page *page, struct vm_area_struct *dst_vma,
		struct vm_area_struct *src_vma)
{
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	return __folio_try_dup_anon_rmap(folio, page, HPAGE_PMD_NR, dst_vma,
					 src_vma, PGTABLE_LEVEL_PMD);
#else
	WARN_ON_ONCE(true);
	return -EBUSY;
#endif
}
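/*
 * Sketch only: a PMD-level caller on the fork() path handles the -EBUSY case
 * by falling back, e.g. by splitting the huge mapping or copying for the
 * child; the exact fallback is the caller's business:
 *
 *	if (folio_try_dup_anon_rmap_pmd(folio, page, dst_vma, src_vma)) {
 *		... fall back (split the huge mapping or copy) ...
 *	}
 */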
static __always_inline int __folio_try_share_anon_rmap(struct folio *folio,
		struct page *page, int nr_pages, enum pgtable_level level)
{
	VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio);
	VM_WARN_ON_FOLIO(!PageAnonExclusive(page), folio);
	__folio_rmap_sanity_checks(folio, page, nr_pages, level);

	/* device private folios cannot get pinned via GUP. */
	if (unlikely(folio_is_device_private(folio))) {
		ClearPageAnonExclusive(page);
		return 0;
	}

	/*
	 * We have to make sure that when we clear PageAnonExclusive, that
	 * the page is not pinned and that concurrent GUP-fast won't succeed in
	 * concurrently pinning the page.
	 *
	 * Conceptually, PageAnonExclusive clearing consists of:
	 * (A1) Clear PTE
	 * (A2) Check if the page is pinned; back off if so.
	 * (A3) Clear PageAnonExclusive
	 * (A4) Restore PTE (optional, but certainly not writable)
	 *
	 * When clearing PageAnonExclusive, we cannot possibly map the page
	 * writable again, because anon pages that may be shared must never
	 * be writable. So in any case, if the PTE was writable it cannot
	 * be writable anymore afterwards and there would be a PTE change. Only
	 * if the PTE wasn't writable, there might not be a PTE change.
	 *
	 * Conceptually, GUP-fast pinning of an anon page consists of:
	 * (B1) Read the PTE
	 * (B2) FOLL_WRITE: check if the PTE is not writable; back off if so.
	 * (B3) Pin the mapped page
	 * (B4) Check if the PTE changed by re-reading it; back off if so.
	 * (B5) If the original PTE is not writable, check if
	 *	PageAnonExclusive is not set; back off if so.
	 *
	 * If the PTE was writable, we only have to make sure that GUP-fast
	 * observes a PTE change and properly backs off.
	 *
	 * If the PTE was not writable, we have to make sure that GUP-fast either
	 * detects a (temporary) PTE change or that PageAnonExclusive is cleared
	 * and properly backs off.
	 *
	 * Consequently, when clearing PageAnonExclusive(), we have to make
	 * sure that (A1), (A2)/(A3) and (A4) happen in the right memory
	 * order. In GUP-fast pinning code, we have to make sure that (B3),(B4)
	 * and (B5) happen in the right memory order.
	 *
	 * We assume that there might not be a memory barrier after
	 * clearing/invalidating the PTE (A1) and before restoring the PTE (A4),
	 * so we use explicit ones here.
	 */

	/* Paired with the memory barrier in try_grab_folio(). */
	if (IS_ENABLED(CONFIG_HAVE_GUP_FAST))
		smp_mb();

	if (unlikely(folio_maybe_dma_pinned(folio)))
		return -EBUSY;
	ClearPageAnonExclusive(page);

	/*
	 * This is conceptually a smp_wmb() paired with the smp_rmb() in
	 * gup_must_unshare().
	 */
	if (IS_ENABLED(CONFIG_HAVE_GUP_FAST))
		smp_mb__after_atomic();
	return 0;
}

/**
 * folio_try_share_anon_rmap_pte - try marking an exclusive anonymous page
 *				   mapped by a PTE possibly shared to prepare
 *				   for KSM or temporary unmapping
 * @folio: The folio to share a mapping of
 * @page: The mapped exclusive page
 *
 * The caller needs to hold the page table lock and has to have the page table
 * entries cleared/invalidated.
 *
 * This is similar to folio_try_dup_anon_rmap_pte(), however, not used during
 * fork() to duplicate mappings, but instead to prepare for KSM or temporarily
 * unmapping parts of a folio (swap, migration) via folio_remove_rmap_pte().
 *
 * Marking the mapped page shared can only fail if the folio may be pinned;
 * device private folios cannot get pinned and consequently this function
 * cannot fail for them.
 *
 * Returns 0 if marking the mapped page possibly shared succeeded. Returns
 * -EBUSY otherwise.
 */
static inline int folio_try_share_anon_rmap_pte(struct folio *folio,
		struct page *page)
{
	return __folio_try_share_anon_rmap(folio, page, 1, PGTABLE_LEVEL_PTE);
}
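/*
 * Sketch (illustrative only) of the (A1)-(A4) sequence described above when
 * temporarily unmapping an exclusive anon page, e.g. for swap; mm, addr, ptep
 * and pteval are the caller's, and the page table lock is held:
 *
 *	pteval = ptep_get_and_clear(mm, addr, ptep);		// (A1)
 *	if (folio_try_share_anon_rmap_pte(folio, page)) {	// (A2) + (A3)
 *		set_pte_at(mm, addr, ptep, pteval);		// back off
 *		return false;
 *	}
 *	// success: install a swap/migration entry instead of the present PTE
 */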
/**
 * folio_try_share_anon_rmap_pmd - try marking an exclusive anonymous page
 *				   range mapped by a PMD possibly shared to
 *				   prepare for temporary unmapping
 * @folio: The folio to share the mapping of
 * @page: The first page to share the mapping of
 *
 * The page range of the folio is defined by [page, page + HPAGE_PMD_NR)
 *
 * The caller needs to hold the page table lock and has to have the page table
 * entries cleared/invalidated.
 *
 * This is similar to folio_try_dup_anon_rmap_pmd(), however, not used during
 * fork() to duplicate a mapping, but instead to prepare for temporarily
 * unmapping parts of a folio (swap, migration) via folio_remove_rmap_pmd().
 *
 * Marking the mapped pages shared can only fail if the folio may be pinned;
 * device private folios cannot get pinned and consequently this function
 * cannot fail for them.
 *
 * Returns 0 if marking the mapped pages possibly shared succeeded. Returns
 * -EBUSY otherwise.
 */
static inline int folio_try_share_anon_rmap_pmd(struct folio *folio,
		struct page *page)
{
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	return __folio_try_share_anon_rmap(folio, page, HPAGE_PMD_NR,
					   PGTABLE_LEVEL_PMD);
#else
	WARN_ON_ONCE(true);
	return -EBUSY;
#endif
}

/*
 * Called from mm/vmscan.c to handle paging out
 */
int folio_referenced(struct folio *, int is_locked,
			struct mem_cgroup *memcg, vm_flags_t *vm_flags);

void try_to_migrate(struct folio *folio, enum ttu_flags flags);
void try_to_unmap(struct folio *, enum ttu_flags flags);

struct page *make_device_exclusive(struct mm_struct *mm, unsigned long addr,
		void *owner, struct folio **foliop);

/* Avoid racy checks */
#define PVMW_SYNC		(1 << 0)
/* Look for migration entries rather than present PTEs */
#define PVMW_MIGRATION		(1 << 1)

/* Result flags */

/* The page is mapped across page table boundary */
#define PVMW_PGTABLE_CROSSED	(1 << 16)

struct page_vma_mapped_walk {
	unsigned long pfn;
	unsigned long nr_pages;
	pgoff_t pgoff;
	struct vm_area_struct *vma;
	unsigned long address;
	pmd_t *pmd;
	pte_t *pte;
	spinlock_t *ptl;
	unsigned int flags;
};

#define DEFINE_FOLIO_VMA_WALK(name, _folio, _vma, _address, _flags)	\
	struct page_vma_mapped_walk name = {				\
		.pfn = folio_pfn(_folio),				\
		.nr_pages = folio_nr_pages(_folio),			\
		.pgoff = folio_pgoff(_folio),				\
		.vma = _vma,						\
		.address = _address,					\
		.flags = _flags,					\
	}

static inline void page_vma_mapped_walk_done(struct page_vma_mapped_walk *pvmw)
{
	/* HugeTLB pte is set to the relevant page table entry without pte_mapped. */
	if (pvmw->pte && !is_vm_hugetlb_page(pvmw->vma))
		pte_unmap(pvmw->pte);
	if (pvmw->ptl)
		spin_unlock(pvmw->ptl);
}

/**
 * page_vma_mapped_walk_restart - Restart the page table walk.
 * @pvmw: Pointer to struct page_vma_mapped_walk.
 *
 * It restarts the page table walk when changes occur in the page
 * table, such as splitting a PMD. Ensures that the PTL held during
 * the previous walk is released and resets the state to allow for
 * a new walk starting at the current address stored in pvmw->address.
 */
static inline void
page_vma_mapped_walk_restart(struct page_vma_mapped_walk *pvmw)
{
	WARN_ON_ONCE(!pvmw->pmd && !pvmw->pte);

	if (likely(pvmw->ptl))
		spin_unlock(pvmw->ptl);
	else
		WARN_ON_ONCE(1);

	pvmw->ptl = NULL;
	pvmw->pmd = NULL;
	pvmw->pte = NULL;
}

bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw);
unsigned long page_address_in_vma(const struct folio *folio,
		const struct page *, const struct vm_area_struct *);
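/*
 * Typical usage sketch for the walk API above (the implementation lives in
 * mm/page_vma_mapped.c):
 *
 *	DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, 0);
 *
 *	while (page_vma_mapped_walk(&pvmw)) {
 *		// pvmw.pte (or pvmw.pmd) points at a mapping of the folio,
 *		// with pvmw.ptl held; pvmw.address is the mapped address.
 *		...
 *	}
 *
 * Breaking out of the loop early requires calling page_vma_mapped_walk_done()
 * to drop the page table lock and unmap the PTE.
 */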
/*
 * Cleans the PTEs of shared mappings.
 * (and since clean PTEs should also be readonly, write protects them too)
 *
 * returns the number of cleaned PTEs.
 */
int folio_mkclean(struct folio *);

int mapping_wrprotect_range(struct address_space *mapping, pgoff_t pgoff,
		unsigned long pfn, unsigned long nr_pages);

int pfn_mkclean_range(unsigned long pfn, unsigned long nr_pages, pgoff_t pgoff,
		      struct vm_area_struct *vma);

void remove_migration_ptes(struct folio *src, struct folio *dst,
		enum ttu_flags flags);

/*
 * rmap_walk_control: To control rmap traversing for specific needs
 *
 * arg: passed to rmap_one() and invalid_vma()
 * try_lock: bail out if the rmap lock is contended
 * contended: indicate the rmap traversal bailed out due to lock contention
 * rmap_one: executed on each vma where page is mapped
 * done: for checking traversing termination condition
 * anon_lock: for getting anon_lock by optimized way rather than default
 * invalid_vma: for skipping uninteresting vmas
 */
struct rmap_walk_control {
	void *arg;
	bool try_lock;
	bool contended;
	/*
	 * Return false if page table scanning in rmap_walk should be stopped.
	 * Otherwise, return true.
	 */
	bool (*rmap_one)(struct folio *folio, struct vm_area_struct *vma,
					unsigned long addr, void *arg);
	int (*done)(struct folio *folio);
	struct anon_vma *(*anon_lock)(const struct folio *folio,
				      struct rmap_walk_control *rwc);
	bool (*invalid_vma)(struct vm_area_struct *vma, void *arg);
};

void rmap_walk(struct folio *folio, struct rmap_walk_control *rwc);
void rmap_walk_locked(struct folio *folio, struct rmap_walk_control *rwc);
struct anon_vma *folio_lock_anon_vma_read(const struct folio *folio,
					  struct rmap_walk_control *rwc);

#else	/* !CONFIG_MMU */

#define anon_vma_init()		do {} while (0)
#define anon_vma_prepare(vma)	(0)

static inline int folio_referenced(struct folio *folio, int is_locked,
				  struct mem_cgroup *memcg,
				  vm_flags_t *vm_flags)
{
	*vm_flags = 0;
	return 0;
}

static inline void try_to_unmap(struct folio *folio, enum ttu_flags flags)
{
}

static inline int folio_mkclean(struct folio *folio)
{
	return 0;
}
#endif	/* CONFIG_MMU */

#endif	/* _LINUX_RMAP_H */