// SPDX-License-Identifier: GPL-2.0-only
/*
 * fs/dax.c - Direct Access filesystem code
 * Copyright (c) 2013-2014 Intel Corporation
 * Author: Matthew Wilcox <matthew.r.wilcox@intel.com>
 * Author: Ross Zwisler <ross.zwisler@linux.intel.com>
 */

#include <linux/atomic.h>
#include <linux/blkdev.h>
#include <linux/buffer_head.h>
#include <linux/dax.h>
#include <linux/fs.h>
#include <linux/highmem.h>
#include <linux/memcontrol.h>
#include <linux/mm.h>
#include <linux/mutex.h>
#include <linux/pagevec.h>
#include <linux/sched.h>
#include <linux/sched/signal.h>
#include <linux/uio.h>
#include <linux/vmstat.h>
#include <linux/pfn_t.h>
#include <linux/sizes.h>
#include <linux/mmu_notifier.h>
#include <linux/iomap.h>
#include <linux/rmap.h>
#include <asm/pgalloc.h>

#define CREATE_TRACE_POINTS
#include <trace/events/fs_dax.h>

/* We choose 4096 entries - same as per-zone page wait tables */
#define DAX_WAIT_TABLE_BITS 12
#define DAX_WAIT_TABLE_ENTRIES (1 << DAX_WAIT_TABLE_BITS)

/* The 'colour' (ie low bits) within a PMD of a page offset. */
#define PG_PMD_COLOUR	((PMD_SIZE >> PAGE_SHIFT) - 1)
#define PG_PMD_NR	(PMD_SIZE >> PAGE_SHIFT)

static wait_queue_head_t wait_table[DAX_WAIT_TABLE_ENTRIES];

static int __init init_dax_wait_table(void)
{
	int i;

	for (i = 0; i < DAX_WAIT_TABLE_ENTRIES; i++)
		init_waitqueue_head(wait_table + i);
	return 0;
}
fs_initcall(init_dax_wait_table);

/*
 * DAX pagecache entries use XArray value entries so they can't be mistaken
 * for pages. We use one bit for locking, one bit for the entry size (PMD)
 * and two more to tell us if the entry is a zero page or an empty entry that
 * is just used for locking. In total four special bits.
 *
 * If the PMD bit isn't set the entry has size PAGE_SIZE, and if the ZERO_PAGE
 * and EMPTY bits aren't set the entry is a normal DAX entry with a filesystem
 * block allocation.
 */
#define DAX_SHIFT	(4)
#define DAX_LOCKED	(1UL << 0)
#define DAX_PMD		(1UL << 1)
#define DAX_ZERO_PAGE	(1UL << 2)
#define DAX_EMPTY	(1UL << 3)

static unsigned long dax_to_pfn(void *entry)
{
	return xa_to_value(entry) >> DAX_SHIFT;
}

static struct folio *dax_to_folio(void *entry)
{
	return page_folio(pfn_to_page(dax_to_pfn(entry)));
}

static void *dax_make_entry(pfn_t pfn, unsigned long flags)
{
	return xa_mk_value(flags | (pfn_t_to_pfn(pfn) << DAX_SHIFT));
}

static bool dax_is_locked(void *entry)
{
	return xa_to_value(entry) & DAX_LOCKED;
}

static unsigned int dax_entry_order(void *entry)
{
	if (xa_to_value(entry) & DAX_PMD)
		return PMD_ORDER;
	return 0;
}

static unsigned long dax_is_pmd_entry(void *entry)
{
	return xa_to_value(entry) & DAX_PMD;
}

static bool dax_is_pte_entry(void *entry)
{
	return !(xa_to_value(entry) & DAX_PMD);
}

static int dax_is_zero_entry(void *entry)
{
	return xa_to_value(entry) & DAX_ZERO_PAGE;
}

static int dax_is_empty_entry(void *entry)
{
	return xa_to_value(entry) & DAX_EMPTY;
}

/*
 * true if the entry that was found is of a smaller order than the entry
 * we were looking for
 */
static bool dax_is_conflict(void *entry)
{
	return entry == XA_RETRY_ENTRY;
}

/*
 * DAX page cache entry locking
 */
struct exceptional_entry_key {
	struct xarray *xa;
	pgoff_t entry_start;
};

struct wait_exceptional_entry_queue {
	wait_queue_entry_t wait;
	struct exceptional_entry_key key;
};

/**
 * enum dax_wake_mode: waitqueue wakeup behaviour
 * @WAKE_ALL: wake all waiters in the waitqueue
 * @WAKE_NEXT: wake only the first waiter in the waitqueue
 */
enum dax_wake_mode {
	WAKE_ALL,
	WAKE_NEXT,
};

static wait_queue_head_t *dax_entry_waitqueue(struct xa_state *xas,
		void *entry, struct exceptional_entry_key *key)
{
	unsigned long hash;
	unsigned long index = xas->xa_index;

	/*
	 * If 'entry' is a PMD, align the 'index' that we use for the wait
	 * queue to the start of that PMD. This ensures that all offsets in
	 * the range covered by the PMD map to the same bit lock.
	 */
	if (dax_is_pmd_entry(entry))
		index &= ~PG_PMD_COLOUR;
	key->xa = xas->xa;
	key->entry_start = index;

	hash = hash_long((unsigned long)xas->xa ^ index, DAX_WAIT_TABLE_BITS);
	return wait_table + hash;
}

static int wake_exceptional_entry_func(wait_queue_entry_t *wait,
		unsigned int mode, int sync, void *keyp)
{
	struct exceptional_entry_key *key = keyp;
	struct wait_exceptional_entry_queue *ewait =
		container_of(wait, struct wait_exceptional_entry_queue, wait);

	if (key->xa != ewait->key.xa ||
	    key->entry_start != ewait->key.entry_start)
		return 0;
	return autoremove_wake_function(wait, mode, sync, NULL);
}

/*
 * @entry may no longer be the entry at the index in the mapping.
 * The important information it's conveying is whether the entry at
 * this index used to be a PMD entry.
 */
static void dax_wake_entry(struct xa_state *xas, void *entry,
			   enum dax_wake_mode mode)
{
	struct exceptional_entry_key key;
	wait_queue_head_t *wq;

	wq = dax_entry_waitqueue(xas, entry, &key);

	/*
	 * Checking for locked entry and prepare_to_wait_exclusive() happens
	 * under the i_pages lock, ditto for entry handling in our callers.
	 * So at this point all tasks that could have seen our entry locked
	 * must be in the waitqueue and the following check will see them.
	 */
	if (waitqueue_active(wq))
		__wake_up(wq, TASK_NORMAL, mode == WAKE_ALL ? 0 : 1, &key);
}

/*
 * Look up entry in page cache, wait for it to become unlocked if it
 * is a DAX entry and return it. The caller must subsequently call
 * put_unlocked_entry() if it did not lock the entry or dax_unlock_entry()
 * if it did. The entry returned may have a larger order than @order.
 * If @order is larger than the order of the entry found in i_pages, this
 * function returns a dax_is_conflict entry.
 *
 * Must be called with the i_pages lock held.
 */
static void *get_next_unlocked_entry(struct xa_state *xas, unsigned int order)
{
	void *entry;
	struct wait_exceptional_entry_queue ewait;
	wait_queue_head_t *wq;

	init_wait(&ewait.wait);
	ewait.wait.func = wake_exceptional_entry_func;

	for (;;) {
		entry = xas_find_conflict(xas);
		if (!entry || WARN_ON_ONCE(!xa_is_value(entry)))
			return entry;
		if (dax_entry_order(entry) < order)
			return XA_RETRY_ENTRY;
		if (!dax_is_locked(entry))
			return entry;

		wq = dax_entry_waitqueue(xas, entry, &ewait.key);
		prepare_to_wait_exclusive(wq, &ewait.wait,
					  TASK_UNINTERRUPTIBLE);
		xas_unlock_irq(xas);
		xas_reset(xas);
		schedule();
		finish_wait(wq, &ewait.wait);
		xas_lock_irq(xas);
	}
}

/*
 * Wait for the given entry to become unlocked. Caller must hold the i_pages
 * lock and call either put_unlocked_entry() if it did not lock the entry or
 * dax_unlock_entry() if it did. Returns an unlocked entry if still present.
 */
static void *wait_entry_unlocked_exclusive(struct xa_state *xas, void *entry)
{
	struct wait_exceptional_entry_queue ewait;
	wait_queue_head_t *wq;

	init_wait(&ewait.wait);
	ewait.wait.func = wake_exceptional_entry_func;

	while (unlikely(dax_is_locked(entry))) {
		wq = dax_entry_waitqueue(xas, entry, &ewait.key);
		prepare_to_wait_exclusive(wq, &ewait.wait,
					  TASK_UNINTERRUPTIBLE);
		xas_pause(xas);
		xas_unlock_irq(xas);
		schedule();
		finish_wait(wq, &ewait.wait);
		xas_lock_irq(xas);
		entry = xas_load(xas);
	}

	if (xa_is_internal(entry))
		return NULL;

	return entry;
}

/*
 * The only thing keeping the address space around is the i_pages lock
 * (it's cycled in clear_inode() after removing the entries from i_pages)
 * After we call xas_unlock_irq(), we cannot touch xas->xa.
 */
static void wait_entry_unlocked(struct xa_state *xas, void *entry)
{
	struct wait_exceptional_entry_queue ewait;
	wait_queue_head_t *wq;

	init_wait(&ewait.wait);
	ewait.wait.func = wake_exceptional_entry_func;

	wq = dax_entry_waitqueue(xas, entry, &ewait.key);
	/*
	 * Unlike get_next_unlocked_entry() there is no guarantee that this
	 * path ever successfully retrieves an unlocked entry before an
	 * inode dies. Perform a non-exclusive wait in case this path
	 * never successfully performs its own wake up.
	 */
	prepare_to_wait(wq, &ewait.wait, TASK_UNINTERRUPTIBLE);
	xas_unlock_irq(xas);
	schedule();
	finish_wait(wq, &ewait.wait);
}

static void put_unlocked_entry(struct xa_state *xas, void *entry,
			       enum dax_wake_mode mode)
{
	if (entry && !dax_is_conflict(entry))
		dax_wake_entry(xas, entry, mode);
}

/*
 * We used the xa_state to get the entry, but then we locked the entry and
 * dropped the xa_lock, so we know the xa_state is stale and must be reset
 * before use.
 */
static void dax_unlock_entry(struct xa_state *xas, void *entry)
{
	void *old;

	BUG_ON(dax_is_locked(entry));
	xas_reset(xas);
	xas_lock_irq(xas);
	old = xas_store(xas, entry);
	xas_unlock_irq(xas);
	BUG_ON(!dax_is_locked(old));
	dax_wake_entry(xas, entry, WAKE_NEXT);
}

/*
 * Return: The entry stored at this location before it was locked.
 */
static void *dax_lock_entry(struct xa_state *xas, void *entry)
{
	unsigned long v = xa_to_value(entry);
	return xas_store(xas, xa_mk_value(v | DAX_LOCKED));
}

static unsigned long dax_entry_size(void *entry)
{
	if (dax_is_zero_entry(entry))
		return 0;
	else if (dax_is_empty_entry(entry))
		return 0;
	else if (dax_is_pmd_entry(entry))
		return PMD_SIZE;
	else
		return PAGE_SIZE;
}

/*
 * A DAX folio is considered shared if it has no mapping set and ->share (which
 * shares the ->index field) is non-zero. Note this may return false even if the
 * page is shared between multiple files but has not yet actually been mapped
 * into multiple address spaces.
 */
static inline bool dax_folio_is_shared(struct folio *folio)
{
	return !folio->mapping && folio->share;
}

/*
 * When it is called by dax_insert_entry(), the shared flag will indicate
 * whether this entry is shared by multiple files. If the page has not
 * previously been associated with any mappings the ->mapping and ->index
 * fields will be set. If it has already been associated with a mapping
 * the mapping will be cleared and the share count set. It's then up to
 * reverse map users like memory_failure() to call back into the filesystem to
 * recover ->mapping and ->index information. For example by implementing
 * dax_holder_operations.
 */
static void dax_folio_make_shared(struct folio *folio)
{
	/*
	 * folio is not currently shared so mark it as shared by clearing
	 * folio->mapping.
	 */
	folio->mapping = NULL;

	/*
	 * folio has previously been mapped into one address space so set the
	 * share count.
	 */
	folio->share = 1;
}

static inline unsigned long dax_folio_put(struct folio *folio)
{
	unsigned long ref;
	int order, i;

	if (!dax_folio_is_shared(folio))
		ref = 0;
	else
		ref = --folio->share;

	if (ref)
		return ref;

	folio->mapping = NULL;
	order = folio_order(folio);
	if (!order)
		return 0;

	for (i = 0; i < (1UL << order); i++) {
		struct dev_pagemap *pgmap = page_pgmap(&folio->page);
		struct page *page = folio_page(folio, i);
		struct folio *new_folio = (struct folio *)page;

		ClearPageHead(page);
		clear_compound_head(page);

		new_folio->mapping = NULL;
		/*
		 * Reset pgmap which was over-written by
		 * prep_compound_page().
		 */
		new_folio->pgmap = pgmap;
		new_folio->share = 0;
		WARN_ON_ONCE(folio_ref_count(new_folio));
	}

	return ref;
}

static void dax_folio_init(void *entry)
{
	struct folio *folio = dax_to_folio(entry);
	int order = dax_entry_order(entry);

	/*
	 * Folio should have been split back to order-0 pages in
	 * dax_folio_put() when they were removed from their
	 * final mapping.
	 */
	WARN_ON_ONCE(folio_order(folio));

	if (order > 0) {
		prep_compound_page(&folio->page, order);
		if (order > 1)
			INIT_LIST_HEAD(&folio->_deferred_list);
		WARN_ON_ONCE(folio_ref_count(folio));
	}
}

static void dax_associate_entry(void *entry, struct address_space *mapping,
				struct vm_area_struct *vma,
				unsigned long address, bool shared)
{
	unsigned long size = dax_entry_size(entry), index;
	struct folio *folio = dax_to_folio(entry);

	if (dax_is_zero_entry(entry) || dax_is_empty_entry(entry))
		return;

	if (IS_ENABLED(CONFIG_FS_DAX_LIMITED))
		return;

	index = linear_page_index(vma, address & ~(size - 1));
	if (shared && (folio->mapping || dax_folio_is_shared(folio))) {
		if (folio->mapping)
			dax_folio_make_shared(folio);

		WARN_ON_ONCE(!folio->share);
		WARN_ON_ONCE(dax_entry_order(entry) != folio_order(folio));
		folio->share++;
	} else {
		WARN_ON_ONCE(folio->mapping);
		dax_folio_init(entry);
		folio = dax_to_folio(entry);
		folio->mapping = mapping;
		folio->index = index;
	}
}

static void dax_disassociate_entry(void *entry, struct address_space *mapping,
				   bool trunc)
{
	struct folio *folio = dax_to_folio(entry);

	if (IS_ENABLED(CONFIG_FS_DAX_LIMITED))
		return;

	if (dax_is_zero_entry(entry) || dax_is_empty_entry(entry))
		return;

	dax_folio_put(folio);
}

static struct page *dax_busy_page(void *entry)
{
	struct folio *folio = dax_to_folio(entry);

	if (dax_is_zero_entry(entry) || dax_is_empty_entry(entry))
		return NULL;

	if (folio_ref_count(folio) - folio_mapcount(folio))
		return &folio->page;
	else
		return NULL;
}

/**
 * dax_lock_folio - Lock the DAX entry corresponding to a folio
 * @folio: The folio whose entry we want to lock
 *
 * Context: Process context.
 * Return: A cookie to pass to dax_unlock_folio() or 0 if the entry could
 * not be locked.
 */
dax_entry_t dax_lock_folio(struct folio *folio)
{
	XA_STATE(xas, NULL, 0);
	void *entry;

	/* Ensure folio->mapping isn't freed while we look at it */
	rcu_read_lock();
	for (;;) {
		struct address_space *mapping = READ_ONCE(folio->mapping);

		entry = NULL;
		if (!mapping || !dax_mapping(mapping))
			break;

		/*
		 * In the device-dax case there's no need to lock, a
		 * struct dev_pagemap pin is sufficient to keep the
		 * inode alive, and we assume we have dev_pagemap pin
		 * otherwise we would not have a valid pfn_to_page()
		 * translation.
		 */
		entry = (void *)~0UL;
		if (S_ISCHR(mapping->host->i_mode))
			break;

		xas.xa = &mapping->i_pages;
		xas_lock_irq(&xas);
		if (mapping != folio->mapping) {
			xas_unlock_irq(&xas);
			continue;
		}
		xas_set(&xas, folio->index);
		entry = xas_load(&xas);
		if (dax_is_locked(entry)) {
			rcu_read_unlock();
			wait_entry_unlocked(&xas, entry);
			rcu_read_lock();
			continue;
		}
		dax_lock_entry(&xas, entry);
		xas_unlock_irq(&xas);
		break;
	}
	rcu_read_unlock();
	return (dax_entry_t)entry;
}

void dax_unlock_folio(struct folio *folio, dax_entry_t cookie)
{
	struct address_space *mapping = folio->mapping;
	XA_STATE(xas, &mapping->i_pages, folio->index);

	if (S_ISCHR(mapping->host->i_mode))
		return;

	dax_unlock_entry(&xas, (void *)cookie);
}

/*
 * dax_lock_mapping_entry - Lock the DAX entry corresponding to a mapping
 * @mapping: the file's mapping whose entry we want to lock
 * @index: the offset within this file
 * @page: output the dax page corresponding to this dax entry
 *
 * Return: A cookie to pass to dax_unlock_mapping_entry() or 0 if the entry
 * could not be locked.
 */
dax_entry_t dax_lock_mapping_entry(struct address_space *mapping, pgoff_t index,
		struct page **page)
{
	XA_STATE(xas, NULL, 0);
	void *entry;

	rcu_read_lock();
	for (;;) {
		entry = NULL;
		if (!dax_mapping(mapping))
			break;

		xas.xa = &mapping->i_pages;
		xas_lock_irq(&xas);
		xas_set(&xas, index);
		entry = xas_load(&xas);
		if (dax_is_locked(entry)) {
			rcu_read_unlock();
			wait_entry_unlocked(&xas, entry);
			rcu_read_lock();
			continue;
		}
		if (!entry ||
		    dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) {
			/*
			 * Because we are looking up the entry by the file's
			 * mapping and index, the entry may not have been
			 * inserted yet, or it may even be a zero/empty entry.
			 * We don't treat this as an error case, so return a
			 * special value and do not output @page.
			 */
			entry = (void *)~0UL;
		} else {
			*page = pfn_to_page(dax_to_pfn(entry));
			dax_lock_entry(&xas, entry);
		}
		xas_unlock_irq(&xas);
		break;
	}
	rcu_read_unlock();
	return (dax_entry_t)entry;
}

void dax_unlock_mapping_entry(struct address_space *mapping, pgoff_t index,
		dax_entry_t cookie)
{
	XA_STATE(xas, &mapping->i_pages, index);

	if (cookie == ~0UL)
		return;

	dax_unlock_entry(&xas, (void *)cookie);
}

/*
 * Find page cache entry at given index. If it is a DAX entry, return it
 * with the entry locked. If the page cache doesn't contain an entry at
 * that index, add a locked empty entry.
 *
 * When requesting an entry with size DAX_PMD, grab_mapping_entry() will
 * either return that locked entry or will return VM_FAULT_FALLBACK.
 * This will happen if there are any PTE entries within the PMD range
 * that we are requesting.
 *
 * We always favor PTE entries over PMD entries. There isn't a flow where we
 * evict PTE entries in order to 'upgrade' them to a PMD entry. A PMD
 * insertion will fail if it finds any PTE entries already in the tree, and a
 * PTE insertion will cause an existing PMD entry to be unmapped and
 * downgraded to PTE entries. This happens for both PMD zero pages as
 * well as PMD empty entries.
 *
 * The exception to this downgrade path is for PMD entries that have
 * real storage backing them. We will leave these real PMD entries in
 * the tree, and PTE writes will simply dirty the entire PMD entry.
 *
 * Note: Unlike filemap_fault() we don't honor FAULT_FLAG_RETRY flags. For
 * persistent memory the benefit is doubtful. We can add that later if we can
 * show it helps.
 *
 * On error, this function does not return an ERR_PTR. Instead it returns
 * a VM_FAULT code, encoded as an xarray internal entry. The ERR_PTR values
 * overlap with xarray value entries.
 */
static void *grab_mapping_entry(struct xa_state *xas,
		struct address_space *mapping, unsigned int order)
{
	unsigned long index = xas->xa_index;
	bool pmd_downgrade;	/* splitting PMD entry into PTE entries? */
	void *entry;

retry:
	pmd_downgrade = false;
	xas_lock_irq(xas);
	entry = get_next_unlocked_entry(xas, order);

	if (entry) {
		if (dax_is_conflict(entry))
			goto fallback;
		if (!xa_is_value(entry)) {
			xas_set_err(xas, -EIO);
			goto out_unlock;
		}

		if (order == 0) {
			if (dax_is_pmd_entry(entry) &&
			    (dax_is_zero_entry(entry) ||
			     dax_is_empty_entry(entry))) {
				pmd_downgrade = true;
			}
		}
	}

	if (pmd_downgrade) {
		/*
		 * Make sure 'entry' remains valid while we drop
		 * the i_pages lock.
		 */
		dax_lock_entry(xas, entry);

		/*
		 * Besides huge zero pages the only other thing that gets
		 * downgraded are empty entries which don't need to be
		 * unmapped.
		 */
		if (dax_is_zero_entry(entry)) {
			xas_unlock_irq(xas);
			unmap_mapping_pages(mapping,
					xas->xa_index & ~PG_PMD_COLOUR,
					PG_PMD_NR, false);
			xas_reset(xas);
			xas_lock_irq(xas);
		}

		dax_disassociate_entry(entry, mapping, false);
		xas_store(xas, NULL);	/* undo the PMD join */
		dax_wake_entry(xas, entry, WAKE_ALL);
		mapping->nrpages -= PG_PMD_NR;
		entry = NULL;
		xas_set(xas, index);
	}

	if (entry) {
		dax_lock_entry(xas, entry);
	} else {
		unsigned long flags = DAX_EMPTY;

		if (order > 0)
			flags |= DAX_PMD;
		entry = dax_make_entry(pfn_to_pfn_t(0), flags);
		dax_lock_entry(xas, entry);
		if (xas_error(xas))
			goto out_unlock;
		mapping->nrpages += 1UL << order;
	}

out_unlock:
	xas_unlock_irq(xas);
	if (xas_nomem(xas, mapping_gfp_mask(mapping) & ~__GFP_HIGHMEM))
		goto retry;
	if (xas->xa_node == XA_ERROR(-ENOMEM))
		return xa_mk_internal(VM_FAULT_OOM);
	if (xas_error(xas))
		return xa_mk_internal(VM_FAULT_SIGBUS);
	return entry;
fallback:
	xas_unlock_irq(xas);
	return xa_mk_internal(VM_FAULT_FALLBACK);
}

/**
 * dax_layout_busy_page_range - find first pinned page in @mapping
 * @mapping: address space to scan for a page with ref count > 1
 * @start: Starting offset. Page containing 'start' is included.
 * @end: End offset. Page containing 'end' is included. If 'end' is LLONG_MAX,
 *       pages from 'start' till the end of file are included.
 *
 * DAX requires ZONE_DEVICE mapped pages. These pages are never
 * 'onlined' to the page allocator so they are considered idle when
 * page->count == 1. A filesystem uses this interface to determine if
 * any page in the mapping is busy, i.e. for DMA, or other
 * get_user_pages() usages.
 *
 * It is expected that the filesystem is holding locks to block the
 * establishment of new mappings in this address_space. I.e. it expects
 * to be able to run unmap_mapping_range() and subsequently not race
 * mapping_mapped() becoming true.
 */
struct page *dax_layout_busy_page_range(struct address_space *mapping,
					loff_t start, loff_t end)
{
	void *entry;
	unsigned int scanned = 0;
	struct page *page = NULL;
	pgoff_t start_idx = start >> PAGE_SHIFT;
	pgoff_t end_idx;
	XA_STATE(xas, &mapping->i_pages, start_idx);

	/*
	 * In the 'limited' case get_user_pages() for dax is disabled.
	 */
	if (IS_ENABLED(CONFIG_FS_DAX_LIMITED))
		return NULL;

	if (!dax_mapping(mapping))
		return NULL;

	/* If end == LLONG_MAX, all pages from start till the end of file */
	if (end == LLONG_MAX)
		end_idx = ULONG_MAX;
	else
		end_idx = end >> PAGE_SHIFT;
	/*
	 * If we race get_user_pages_fast() here either we'll see the
	 * elevated page count in the iteration and wait, or
	 * get_user_pages_fast() will see that the page it took a reference
	 * against is no longer mapped in the page tables and bail to the
	 * get_user_pages() slow path. The slow path is protected by
	 * pte_lock() and pmd_lock(). New references are not taken without
	 * holding those locks, and unmap_mapping_pages() will not zero the
	 * pte or pmd without holding the respective lock, so we are
	 * guaranteed to either see new references or prevent new
	 * references from being established.
	 */
	unmap_mapping_pages(mapping, start_idx, end_idx - start_idx + 1, 0);

	xas_lock_irq(&xas);
	xas_for_each(&xas, entry, end_idx) {
		if (WARN_ON_ONCE(!xa_is_value(entry)))
			continue;
		entry = wait_entry_unlocked_exclusive(&xas, entry);
		if (entry)
			page = dax_busy_page(entry);
		put_unlocked_entry(&xas, entry, WAKE_NEXT);
		if (page)
			break;
		if (++scanned % XA_CHECK_SCHED)
			continue;

		xas_pause(&xas);
		xas_unlock_irq(&xas);
		cond_resched();
		xas_lock_irq(&xas);
	}
	xas_unlock_irq(&xas);
	return page;
}
EXPORT_SYMBOL_GPL(dax_layout_busy_page_range);

struct page *dax_layout_busy_page(struct address_space *mapping)
{
	return dax_layout_busy_page_range(mapping, 0, LLONG_MAX);
}
EXPORT_SYMBOL_GPL(dax_layout_busy_page);

static int __dax_invalidate_entry(struct address_space *mapping,
				  pgoff_t index, bool trunc)
{
	XA_STATE(xas, &mapping->i_pages, index);
	int ret = 0;
	void *entry;

	xas_lock_irq(&xas);
	entry = get_next_unlocked_entry(&xas, 0);
	if (!entry || WARN_ON_ONCE(!xa_is_value(entry)))
		goto out;
	if (!trunc &&
	    (xas_get_mark(&xas, PAGECACHE_TAG_DIRTY) ||
	     xas_get_mark(&xas, PAGECACHE_TAG_TOWRITE)))
		goto out;
	dax_disassociate_entry(entry, mapping, trunc);
	xas_store(&xas, NULL);
	mapping->nrpages -= 1UL << dax_entry_order(entry);
	ret = 1;
out:
	put_unlocked_entry(&xas, entry, WAKE_ALL);
	xas_unlock_irq(&xas);
	return ret;
}

static int __dax_clear_dirty_range(struct address_space *mapping,
		pgoff_t start, pgoff_t end)
{
	XA_STATE(xas, &mapping->i_pages, start);
	unsigned int scanned = 0;
	void *entry;

	xas_lock_irq(&xas);
	xas_for_each(&xas, entry, end) {
		entry = wait_entry_unlocked_exclusive(&xas, entry);
		if (!entry)
			continue;
		xas_clear_mark(&xas, PAGECACHE_TAG_DIRTY);
		xas_clear_mark(&xas, PAGECACHE_TAG_TOWRITE);
		put_unlocked_entry(&xas, entry, WAKE_NEXT);

		if (++scanned % XA_CHECK_SCHED)
			continue;

		xas_pause(&xas);
		xas_unlock_irq(&xas);
		cond_resched();
		xas_lock_irq(&xas);
	}
	xas_unlock_irq(&xas);

	return 0;
}

/*
 * Delete DAX entry at @index from @mapping. Wait for it
 * to be unlocked before deleting it.
 */
int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index)
{
	int ret = __dax_invalidate_entry(mapping, index, true);

	/*
	 * This gets called from the truncate / punch_hole path. As such, the
	 * caller must hold locks protecting against concurrent modifications
	 * of the page cache (usually fs-private i_mmap_sem for writing). Since
	 * the caller has seen a DAX entry for this index, we better find it
	 * at that index as well...
	 */
	WARN_ON_ONCE(!ret);
	return ret;
}

void dax_delete_mapping_range(struct address_space *mapping,
				loff_t start, loff_t end)
{
	void *entry;
	pgoff_t start_idx = start >> PAGE_SHIFT;
	pgoff_t end_idx;
	XA_STATE(xas, &mapping->i_pages, start_idx);

	/* If end == LLONG_MAX, all pages from start till the end of file */
	if (end == LLONG_MAX)
		end_idx = ULONG_MAX;
	else
		end_idx = end >> PAGE_SHIFT;

	xas_lock_irq(&xas);
	xas_for_each(&xas, entry, end_idx) {
		if (!xa_is_value(entry))
			continue;
		entry = wait_entry_unlocked_exclusive(&xas, entry);
		if (!entry)
			continue;
		dax_disassociate_entry(entry, mapping, true);
		xas_store(&xas, NULL);
		mapping->nrpages -= 1UL << dax_entry_order(entry);
		put_unlocked_entry(&xas, entry, WAKE_ALL);
	}
	xas_unlock_irq(&xas);
}
EXPORT_SYMBOL_GPL(dax_delete_mapping_range);

static int wait_page_idle(struct page *page,
			  void (cb)(struct inode *),
			  struct inode *inode)
{
	return ___wait_var_event(page, dax_page_is_idle(page),
				 TASK_INTERRUPTIBLE, 0, 0, cb(inode));
}

static void wait_page_idle_uninterruptible(struct page *page,
					   struct inode *inode)
{
	___wait_var_event(page, dax_page_is_idle(page),
			  TASK_UNINTERRUPTIBLE, 0, 0, schedule());
}

/*
 * Unmaps the inode and waits for any DMA to complete prior to deleting the
 * DAX mapping entries for the range.
 *
 * For NOWAIT behavior, pass @cb as NULL to early-exit on first found
 * busy page
 */
int dax_break_layout(struct inode *inode, loff_t start, loff_t end,
		void (cb)(struct inode *))
{
	struct page *page;
	int error = 0;

	if (!dax_mapping(inode->i_mapping))
		return 0;

	do {
		page = dax_layout_busy_page_range(inode->i_mapping, start, end);
		if (!page)
			break;
		if (!cb) {
			error = -ERESTARTSYS;
			break;
		}

		error = wait_page_idle(page, cb, inode);
	} while (error == 0);

	if (!page)
		dax_delete_mapping_range(inode->i_mapping, start, end);

	return error;
}
EXPORT_SYMBOL_GPL(dax_break_layout);

void dax_break_layout_final(struct inode *inode)
{
	struct page *page;

	if (!dax_mapping(inode->i_mapping))
		return;

	do {
		page = dax_layout_busy_page_range(inode->i_mapping, 0,
						  LLONG_MAX);
		if (!page)
			break;

		wait_page_idle_uninterruptible(page, inode);
	} while (true);

	if (!page)
		dax_delete_mapping_range(inode->i_mapping, 0, LLONG_MAX);
}
EXPORT_SYMBOL_GPL(dax_break_layout_final);

/*
 * Invalidate DAX entry if it is clean.
 */
int dax_invalidate_mapping_entry_sync(struct address_space *mapping,
				      pgoff_t index)
{
	return __dax_invalidate_entry(mapping, index, false);
}

static pgoff_t dax_iomap_pgoff(const struct iomap *iomap, loff_t pos)
{
	return PHYS_PFN(iomap->addr + (pos & PAGE_MASK) - iomap->offset);
}

static int copy_cow_page_dax(struct vm_fault *vmf, const struct iomap_iter *iter)
{
	pgoff_t pgoff = dax_iomap_pgoff(&iter->iomap, iter->pos);
	void *vto, *kaddr;
	long rc;
	int id;

	id = dax_read_lock();
	rc = dax_direct_access(iter->iomap.dax_dev, pgoff, 1, DAX_ACCESS,
			       &kaddr, NULL);
	if (rc < 0) {
		dax_read_unlock(id);
		return rc;
	}
	vto = kmap_atomic(vmf->cow_page);
	copy_user_page(vto, kaddr, vmf->address, vmf->cow_page);
	kunmap_atomic(vto);
	dax_read_unlock(id);
	return 0;
}

/*
 * MAP_SYNC on a dax mapping guarantees dirty metadata is
 * flushed on write-faults (non-cow), but not read-faults.
 */
static bool dax_fault_is_synchronous(const struct iomap_iter *iter,
		struct vm_area_struct *vma)
{
	return (iter->flags & IOMAP_WRITE) && (vma->vm_flags & VM_SYNC) &&
		(iter->iomap.flags & IOMAP_F_DIRTY);
}

/*
 * By this point grab_mapping_entry() has ensured that we have a locked entry
 * of the appropriate size so we don't have to worry about downgrading PMDs to
 * PTEs. If we happen to be trying to insert a PTE and there is a PMD
 * already in the tree, we will skip the insertion and just dirty the PMD as
 * appropriate.
 */
static void *dax_insert_entry(struct xa_state *xas, struct vm_fault *vmf,
		const struct iomap_iter *iter, void *entry, pfn_t pfn,
		unsigned long flags)
{
	struct address_space *mapping = vmf->vma->vm_file->f_mapping;
	void *new_entry = dax_make_entry(pfn, flags);
	bool write = iter->flags & IOMAP_WRITE;
	bool dirty = write && !dax_fault_is_synchronous(iter, vmf->vma);
	bool shared = iter->iomap.flags & IOMAP_F_SHARED;

	if (dirty)
		__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);

	if (shared || (dax_is_zero_entry(entry) && !(flags & DAX_ZERO_PAGE))) {
		unsigned long index = xas->xa_index;
		/* we are replacing a zero page with block mapping */
		if (dax_is_pmd_entry(entry))
			unmap_mapping_pages(mapping, index & ~PG_PMD_COLOUR,
					PG_PMD_NR, false);
		else /* pte entry */
			unmap_mapping_pages(mapping, index, 1, false);
	}

	xas_reset(xas);
	xas_lock_irq(xas);
	if (shared || dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) {
		void *old;

		dax_disassociate_entry(entry, mapping, false);
		dax_associate_entry(new_entry, mapping, vmf->vma,
				vmf->address, shared);

		/*
		 * Only swap our new entry into the page cache if the current
		 * entry is a zero page or an empty entry. If a normal PTE or
		 * PMD entry is already in the cache, we leave it alone. This
		 * means that if we are trying to insert a PTE and the
		 * existing entry is a PMD, we will just leave the PMD in the
		 * tree and dirty it if necessary.
		 */
		old = dax_lock_entry(xas, new_entry);
		WARN_ON_ONCE(old != xa_mk_value(xa_to_value(entry) |
					DAX_LOCKED));
		entry = new_entry;
	} else {
		xas_load(xas);	/* Walk the xa_state */
	}

	if (dirty)
		xas_set_mark(xas, PAGECACHE_TAG_DIRTY);

	if (write && shared)
		xas_set_mark(xas, PAGECACHE_TAG_TOWRITE);

	xas_unlock_irq(xas);
	return entry;
}

static int dax_writeback_one(struct xa_state *xas, struct dax_device *dax_dev,
		struct address_space *mapping, void *entry)
{
	unsigned long pfn, index, count, end;
	long ret = 0;
	struct vm_area_struct *vma;

	/*
	 * A page got tagged dirty in DAX mapping? Something is seriously
	 * wrong.
	 */
	if (WARN_ON(!xa_is_value(entry)))
		return -EIO;

	if (unlikely(dax_is_locked(entry))) {
		void *old_entry = entry;

		entry = get_next_unlocked_entry(xas, 0);

		/* Entry got punched out / reallocated? */
		if (!entry || WARN_ON_ONCE(!xa_is_value(entry)))
			goto put_unlocked;
		/*
		 * Entry got reallocated elsewhere? No need to writeback.
		 * We have to compare pfns as we must not bail out due to
		 * difference in lockbit or entry type.
		 */
		if (dax_to_pfn(old_entry) != dax_to_pfn(entry))
			goto put_unlocked;
		if (WARN_ON_ONCE(dax_is_empty_entry(entry) ||
					dax_is_zero_entry(entry))) {
			ret = -EIO;
			goto put_unlocked;
		}

		/* Another fsync thread may have already done this entry */
		if (!xas_get_mark(xas, PAGECACHE_TAG_TOWRITE))
			goto put_unlocked;
	}

	/* Lock the entry to serialize with page faults */
	dax_lock_entry(xas, entry);

	/*
	 * We can clear the tag now but we have to be careful so that concurrent
	 * dax_writeback_one() calls for the same index cannot finish before we
	 * actually flush the caches. This is achieved as the calls will look
	 * at the entry only under the i_pages lock and once they do that
	 * they will see the entry locked and wait for it to unlock.
	 */
	xas_clear_mark(xas, PAGECACHE_TAG_TOWRITE);
	xas_unlock_irq(xas);

	/*
	 * If dax_writeback_mapping_range() was given a wbc->range_start
	 * in the middle of a PMD, the 'index' we use needs to be
	 * aligned to the start of the PMD.
	 * This allows us to flush for PMD_SIZE and not have to worry about
	 * partial PMD writebacks.
	 */
	pfn = dax_to_pfn(entry);
	count = 1UL << dax_entry_order(entry);
	index = xas->xa_index & ~(count - 1);
	end = index + count - 1;

	/* Walk all mappings of a given index of a file and writeprotect them */
	i_mmap_lock_read(mapping);
	vma_interval_tree_foreach(vma, &mapping->i_mmap, index, end) {
		pfn_mkclean_range(pfn, count, index, vma);
		cond_resched();
	}
	i_mmap_unlock_read(mapping);

	dax_flush(dax_dev, page_address(pfn_to_page(pfn)), count * PAGE_SIZE);
	/*
	 * After we have flushed the cache, we can clear the dirty tag. There
	 * cannot be new dirty data in the pfn after the flush has completed as
	 * the pfn mappings are writeprotected and fault waits for mapping
	 * entry lock.
	 */
	xas_reset(xas);
	xas_lock_irq(xas);
	xas_store(xas, entry);
	xas_clear_mark(xas, PAGECACHE_TAG_DIRTY);
	dax_wake_entry(xas, entry, WAKE_NEXT);

	trace_dax_writeback_one(mapping->host, index, count);
	return ret;

put_unlocked:
	put_unlocked_entry(xas, entry, WAKE_NEXT);
	return ret;
}

/*
 * Flush the mapping to the persistent domain within the byte range of [start,
 * end]. This is required by data integrity operations to ensure file data is
 * on persistent storage prior to completion of the operation.
 */
int dax_writeback_mapping_range(struct address_space *mapping,
		struct dax_device *dax_dev, struct writeback_control *wbc)
{
	XA_STATE(xas, &mapping->i_pages, wbc->range_start >> PAGE_SHIFT);
	struct inode *inode = mapping->host;
	pgoff_t end_index = wbc->range_end >> PAGE_SHIFT;
	void *entry;
	int ret = 0;
	unsigned int scanned = 0;

	if (WARN_ON_ONCE(inode->i_blkbits != PAGE_SHIFT))
		return -EIO;

	if (mapping_empty(mapping) || wbc->sync_mode != WB_SYNC_ALL)
		return 0;

	trace_dax_writeback_range(inode, xas.xa_index, end_index);

	tag_pages_for_writeback(mapping, xas.xa_index, end_index);

	xas_lock_irq(&xas);
	xas_for_each_marked(&xas, entry, end_index, PAGECACHE_TAG_TOWRITE) {
		ret = dax_writeback_one(&xas, dax_dev, mapping, entry);
		if (ret < 0) {
			mapping_set_error(mapping, ret);
			break;
		}
		if (++scanned % XA_CHECK_SCHED)
			continue;

		xas_pause(&xas);
		xas_unlock_irq(&xas);
		cond_resched();
		xas_lock_irq(&xas);
	}
	xas_unlock_irq(&xas);
	trace_dax_writeback_range_done(inode, xas.xa_index, end_index);
	return ret;
}
EXPORT_SYMBOL_GPL(dax_writeback_mapping_range);

static int dax_iomap_direct_access(const struct iomap *iomap, loff_t pos,
		size_t size, void **kaddr, pfn_t *pfnp)
{
	pgoff_t pgoff = dax_iomap_pgoff(iomap, pos);
	int id, rc = 0;
	long length;

	id = dax_read_lock();
	length = dax_direct_access(iomap->dax_dev, pgoff, PHYS_PFN(size),
				   DAX_ACCESS, kaddr, pfnp);
	if (length < 0) {
		rc = length;
		goto out;
	}
	if (!pfnp)
		goto out_check_addr;
	rc = -EINVAL;
	if (PFN_PHYS(length) < size)
		goto out;
	if (pfn_t_to_pfn(*pfnp) & (PHYS_PFN(size)-1))
		goto out;

	rc = 0;

out_check_addr:
	if (!kaddr)
		goto out;
	if (!*kaddr)
		rc = -EFAULT;
out:
	dax_read_unlock(id);
	return rc;
}

/**
 * dax_iomap_copy_around - Prepare for an unaligned write to a shared/cow page
 * by copying the data before and after the range to be written.
 * @pos:	address to do copy from.
 * @length:	size of copy operation.
 * @align_size:	aligned w.r.t align_size (either PMD_SIZE or PAGE_SIZE)
 * @srcmap:	iomap srcmap
 * @daddr:	destination address to copy to.
 *
 * This can be called from two places. Either during DAX write fault (page
 * aligned), to copy the length size data to daddr. Or, while doing normal DAX
 * write operation, dax_iomap_iter() might call this to do the copy of either
 * start or end unaligned address. In the latter case the rest of the copy of
 * aligned ranges is taken care by dax_iomap_iter() itself.
 * If the srcmap contains invalid data, such as HOLE and UNWRITTEN, zero the
 * area to make sure no old data remains.
 */
static int dax_iomap_copy_around(loff_t pos, uint64_t length, size_t align_size,
		const struct iomap *srcmap, void *daddr)
{
	loff_t head_off = pos & (align_size - 1);
	size_t size = ALIGN(head_off + length, align_size);
	loff_t end = pos + length;
	loff_t pg_end = round_up(end, align_size);
	/* copy_all is usually in page fault case */
	bool copy_all = head_off == 0 && end == pg_end;
	/* zero the edges if srcmap is a HOLE or IOMAP_UNWRITTEN */
	bool zero_edge = srcmap->flags & IOMAP_F_SHARED ||
			 srcmap->type == IOMAP_UNWRITTEN;
	void *saddr = NULL;
	int ret = 0;

	if (!zero_edge) {
		ret = dax_iomap_direct_access(srcmap, pos, size, &saddr, NULL);
		if (ret)
			return dax_mem2blk_err(ret);
	}

	if (copy_all) {
		if (zero_edge)
			memset(daddr, 0, size);
		else
			ret = copy_mc_to_kernel(daddr, saddr, length);
		goto out;
	}

	/* Copy the head part of the range */
	if (head_off) {
		if (zero_edge)
			memset(daddr, 0, head_off);
		else {
			ret = copy_mc_to_kernel(daddr, saddr, head_off);
			if (ret)
				return -EIO;
		}
	}

	/* Copy the tail part of the range */
	if (end < pg_end) {
		loff_t tail_off = head_off + length;
		loff_t tail_len = pg_end - end;

		if (zero_edge)
			memset(daddr + tail_off, 0, tail_len);
		else {
			ret = copy_mc_to_kernel(daddr + tail_off,
					saddr + tail_off, tail_len);
			if (ret)
				return -EIO;
		}
	}
out:
	if (zero_edge)
		dax_flush(srcmap->dax_dev, daddr, size);
	return ret ? -EIO : 0;
}

/*
 * The user has performed a load from a hole in the file. Allocating a new
 * page in the file would cause excessive storage usage for workloads with
 * sparse files. Instead we insert a read-only mapping of the 4k zero page.
 * If this page is ever written to we will re-fault and change the mapping to
 * point to real DAX storage instead.
 */
static vm_fault_t dax_load_hole(struct xa_state *xas, struct vm_fault *vmf,
		const struct iomap_iter *iter, void **entry)
{
	struct inode *inode = iter->inode;
	unsigned long vaddr = vmf->address;
	pfn_t pfn = pfn_to_pfn_t(my_zero_pfn(vaddr));
	vm_fault_t ret;

	*entry = dax_insert_entry(xas, vmf, iter, *entry, pfn, DAX_ZERO_PAGE);

	ret = vmf_insert_page_mkwrite(vmf, pfn_t_to_page(pfn), false);
	trace_dax_load_hole(inode, vmf, ret);
	return ret;
}

#ifdef CONFIG_FS_DAX_PMD
static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf,
		const struct iomap_iter *iter, void **entry)
{
	struct address_space *mapping = vmf->vma->vm_file->f_mapping;
	unsigned long pmd_addr = vmf->address & PMD_MASK;
	struct vm_area_struct *vma = vmf->vma;
	struct inode *inode = mapping->host;
	pgtable_t pgtable = NULL;
	struct folio *zero_folio;
	spinlock_t *ptl;
	pmd_t pmd_entry;
	pfn_t pfn;

	zero_folio = mm_get_huge_zero_folio(vmf->vma->vm_mm);

	if (unlikely(!zero_folio))
		goto fallback;

	pfn = page_to_pfn_t(&zero_folio->page);
	*entry = dax_insert_entry(xas, vmf, iter, *entry, pfn,
				  DAX_PMD | DAX_ZERO_PAGE);

	if (arch_needs_pgtable_deposit()) {
		pgtable = pte_alloc_one(vma->vm_mm);
		if (!pgtable)
			return VM_FAULT_OOM;
	}

	ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd);
	if (!pmd_none(*(vmf->pmd))) {
		spin_unlock(ptl);
		goto fallback;
	}

	if (pgtable) {
		pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
		mm_inc_nr_ptes(vma->vm_mm);
	}
	pmd_entry = mk_pmd(&zero_folio->page, vmf->vma->vm_page_prot);
	pmd_entry = pmd_mkhuge(pmd_entry);
	set_pmd_at(vmf->vma->vm_mm, pmd_addr, vmf->pmd, pmd_entry);
	spin_unlock(ptl);
	trace_dax_pmd_load_hole(inode, vmf, zero_folio, *entry);
	return VM_FAULT_NOPAGE;

fallback:
	if (pgtable)
		pte_free(vma->vm_mm, pgtable);
	trace_dax_pmd_load_hole_fallback(inode, vmf, zero_folio, *entry);
	return VM_FAULT_FALLBACK;
}
#else
static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf,
		const struct iomap_iter *iter, void **entry)
{
	return VM_FAULT_FALLBACK;
}
#endif /* CONFIG_FS_DAX_PMD */

static int dax_unshare_iter(struct iomap_iter *iter)
{
	struct iomap *iomap = &iter->iomap;
	const struct iomap *srcmap = iomap_iter_srcmap(iter);
	loff_t copy_pos = iter->pos;
	u64 copy_len = iomap_length(iter);
	u32 mod;
	int id = 0;
	s64 ret;
	void *daddr = NULL, *saddr = NULL;

	if (!iomap_want_unshare_iter(iter))
		return iomap_iter_advance_full(iter);

	/*
	 * Extend the file range to be aligned to fsblock/pagesize, because
	 * we need to copy entire blocks, not just the byte range specified.
	 * Invalidate the mapping because we're about to CoW.
	 */
	mod = offset_in_page(copy_pos);
	if (mod) {
		copy_len += mod;
		copy_pos -= mod;
	}

	mod = offset_in_page(copy_pos + copy_len);
	if (mod)
		copy_len += PAGE_SIZE - mod;

	invalidate_inode_pages2_range(iter->inode->i_mapping,
				      copy_pos >> PAGE_SHIFT,
				      (copy_pos + copy_len - 1) >> PAGE_SHIFT);

	id = dax_read_lock();
	ret = dax_iomap_direct_access(iomap, copy_pos, copy_len, &daddr, NULL);
	if (ret < 0)
		goto out_unlock;

	ret = dax_iomap_direct_access(srcmap, copy_pos, copy_len, &saddr, NULL);
	if (ret < 0)
		goto out_unlock;

	if (copy_mc_to_kernel(daddr, saddr, copy_len) != 0)
		ret = -EIO;

out_unlock:
	dax_read_unlock(id);
	if (ret < 0)
		return dax_mem2blk_err(ret);
	return iomap_iter_advance_full(iter);
}

int dax_file_unshare(struct inode *inode, loff_t pos, loff_t len,
		const struct iomap_ops *ops)
{
	struct iomap_iter iter = {
		.inode		= inode,
		.pos		= pos,
		.flags		= IOMAP_WRITE | IOMAP_UNSHARE | IOMAP_DAX,
	};
	loff_t size = i_size_read(inode);
	int ret;

	if (pos < 0 || pos >= size)
		return 0;

	iter.len = min(len, size - pos);
	while ((ret = iomap_iter(&iter, ops)) > 0)
		iter.status = dax_unshare_iter(&iter);
	return ret;
}
EXPORT_SYMBOL_GPL(dax_file_unshare);

static int dax_memzero(struct iomap_iter *iter, loff_t pos, size_t size)
{
	const struct iomap *iomap = &iter->iomap;
	const struct iomap *srcmap = iomap_iter_srcmap(iter);
	unsigned offset = offset_in_page(pos);
	pgoff_t pgoff = dax_iomap_pgoff(iomap, pos);
	void *kaddr;
	long ret;

	ret = dax_direct_access(iomap->dax_dev, pgoff, 1, DAX_ACCESS, &kaddr,
				NULL);
	if (ret < 0)
		return dax_mem2blk_err(ret);

	memset(kaddr + offset, 0, size);
	if (iomap->flags & IOMAP_F_SHARED)
		ret = dax_iomap_copy_around(pos, size, PAGE_SIZE, srcmap,
					    kaddr);
	else
		dax_flush(iomap->dax_dev, kaddr + offset, size);
	return ret;
}

static int dax_zero_iter(struct iomap_iter *iter, bool *did_zero)
{
	const struct iomap *iomap = &iter->iomap;
	const struct iomap *srcmap = iomap_iter_srcmap(iter);
	u64 length = iomap_length(iter);
	int ret;

	/* already zeroed? we're done. */
	if (srcmap->type == IOMAP_HOLE || srcmap->type == IOMAP_UNWRITTEN)
		return iomap_iter_advance(iter, &length);

	/*
	 * invalidate the pages whose sharing state is to be changed
	 * because of CoW.
	 */
	if (iomap->flags & IOMAP_F_SHARED)
		invalidate_inode_pages2_range(iter->inode->i_mapping,
					      iter->pos >> PAGE_SHIFT,
					      (iter->pos + length - 1) >> PAGE_SHIFT);

	do {
		loff_t pos = iter->pos;
		unsigned offset = offset_in_page(pos);
		pgoff_t pgoff = dax_iomap_pgoff(iomap, pos);
		int id;

		length = min_t(u64, PAGE_SIZE - offset, length);

		id = dax_read_lock();
		if (IS_ALIGNED(pos, PAGE_SIZE) && length == PAGE_SIZE)
			ret = dax_zero_page_range(iomap->dax_dev, pgoff, 1);
		else
			ret = dax_memzero(iter, pos, length);
		dax_read_unlock(id);

		if (ret < 0)
			return ret;

		ret = iomap_iter_advance(iter, &length);
		if (ret)
			return ret;
	} while (length > 0);

	if (did_zero)
		*did_zero = true;
	return ret;
}

int dax_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero,
		const struct iomap_ops *ops)
{
	struct iomap_iter iter = {
		.inode		= inode,
		.pos		= pos,
		.len		= len,
		.flags		= IOMAP_DAX | IOMAP_ZERO,
	};
	int ret;

	while ((ret = iomap_iter(&iter, ops)) > 0)
		iter.status = dax_zero_iter(&iter, did_zero);
	return ret;
}
EXPORT_SYMBOL_GPL(dax_zero_range);

int dax_truncate_page(struct inode *inode, loff_t pos, bool *did_zero,
		const struct iomap_ops *ops)
{
	unsigned int blocksize = i_blocksize(inode);
	unsigned int off = pos & (blocksize - 1);

	/* Block boundary? Nothing to do */
	if (!off)
		return 0;
	return dax_zero_range(inode, pos, blocksize - off, did_zero, ops);
}
EXPORT_SYMBOL_GPL(dax_truncate_page);

static int dax_iomap_iter(struct iomap_iter *iomi, struct iov_iter *iter)
{
	const struct iomap *iomap = &iomi->iomap;
	const struct iomap *srcmap = iomap_iter_srcmap(iomi);
	loff_t length = iomap_length(iomi);
	loff_t pos = iomi->pos;
	struct dax_device *dax_dev = iomap->dax_dev;
	loff_t end = pos + length, done = 0;
	bool write = iov_iter_rw(iter) == WRITE;
	bool cow = write && iomap->flags & IOMAP_F_SHARED;
	ssize_t ret = 0;
	size_t xfer;
	int id;

	if (!write) {
		end = min(end, i_size_read(iomi->inode));
		if (pos >= end)
			return 0;

		if (iomap->type == IOMAP_HOLE || iomap->type == IOMAP_UNWRITTEN) {
			done = iov_iter_zero(min(length, end - pos), iter);
			return iomap_iter_advance(iomi, &done);
		}
	}

	/*
	 * In DAX mode, enforce either pure overwrites of written extents, or
	 * writes to unwritten extents as part of a copy-on-write operation.
	 */
	if (WARN_ON_ONCE(iomap->type != IOMAP_MAPPED &&
			!(iomap->flags & IOMAP_F_SHARED)))
		return -EIO;

	/*
	 * Write can allocate blocks for an area which has a hole page mapped
	 * into page tables. We have to tear down these mappings so that data
	 * written by write(2) is visible in mmap.
	 */
	if (iomap->flags & IOMAP_F_NEW || cow) {
		/*
		 * Filesystem allows CoW on non-shared extents. The src extents
		 * may have been mmapped with dirty mark before. To be able to
		 * invalidate its dax entries, we need to clear the dirty mark
		 * in advance.
		 */
		if (cow)
			__dax_clear_dirty_range(iomi->inode->i_mapping,
						pos >> PAGE_SHIFT,
						(end - 1) >> PAGE_SHIFT);
		invalidate_inode_pages2_range(iomi->inode->i_mapping,
					      pos >> PAGE_SHIFT,
					      (end - 1) >> PAGE_SHIFT);
	}

	id = dax_read_lock();
	while ((pos = iomi->pos) < end) {
		unsigned offset = pos & (PAGE_SIZE - 1);
		const size_t size = ALIGN(length + offset, PAGE_SIZE);
		pgoff_t pgoff = dax_iomap_pgoff(iomap, pos);
		ssize_t map_len;
		bool recovery = false;
		void *kaddr;

		if (fatal_signal_pending(current)) {
			ret = -EINTR;
			break;
		}

		map_len = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size),
				DAX_ACCESS, &kaddr, NULL);
		if (map_len == -EHWPOISON && iov_iter_rw(iter) == WRITE) {
			map_len = dax_direct_access(dax_dev, pgoff,
					PHYS_PFN(size), DAX_RECOVERY_WRITE,
					&kaddr, NULL);
			if (map_len > 0)
				recovery = true;
		}
		if (map_len < 0) {
			ret = dax_mem2blk_err(map_len);
			break;
		}

		if (cow) {
			ret = dax_iomap_copy_around(pos, length, PAGE_SIZE,
						    srcmap, kaddr);
			if (ret)
				break;
		}

		map_len = PFN_PHYS(map_len);
		kaddr += offset;
		map_len -= offset;
		if (map_len > end - pos)
			map_len = end - pos;

		if (recovery)
			xfer = dax_recovery_write(dax_dev, pgoff, kaddr,
					map_len, iter);
		else if (write)
			xfer = dax_copy_from_iter(dax_dev, pgoff, kaddr,
					map_len, iter);
		else
			xfer = dax_copy_to_iter(dax_dev, pgoff, kaddr,
					map_len, iter);

		length = xfer;
		ret = iomap_iter_advance(iomi, &length);
		if (!ret && xfer == 0)
			ret = -EFAULT;
		if (xfer < map_len)
			break;
	}
	dax_read_unlock(id);

	return ret;
}

/**
 * dax_iomap_rw - Perform I/O to a DAX file
 * @iocb:	The control block for this I/O
 * @iter:	The addresses to do I/O from or to
 * @ops:	iomap ops passed from the file system
 *
 * This function performs read and write operations to directly mapped
 * persistent memory. The caller needs to take care of read/write exclusion
 * and evicting any page cache pages in the region under I/O.
 */
ssize_t
dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter,
		const struct iomap_ops *ops)
{
	struct iomap_iter iomi = {
		.inode		= iocb->ki_filp->f_mapping->host,
		.pos		= iocb->ki_pos,
		.len		= iov_iter_count(iter),
		.flags		= IOMAP_DAX,
	};
	loff_t done = 0;
	int ret;

	if (!iomi.len)
		return 0;

	if (iov_iter_rw(iter) == WRITE) {
		lockdep_assert_held_write(&iomi.inode->i_rwsem);
		iomi.flags |= IOMAP_WRITE;
	} else {
		lockdep_assert_held(&iomi.inode->i_rwsem);
	}

	if (iocb->ki_flags & IOCB_NOWAIT)
		iomi.flags |= IOMAP_NOWAIT;

	while ((ret = iomap_iter(&iomi, ops)) > 0)
		iomi.status = dax_iomap_iter(&iomi, iter);

	done = iomi.pos - iocb->ki_pos;
	iocb->ki_pos = iomi.pos;
	return done ? done : ret;
}
EXPORT_SYMBOL_GPL(dax_iomap_rw);

static vm_fault_t dax_fault_return(int error)
{
	if (error == 0)
		return VM_FAULT_NOPAGE;
	return vmf_error(error);
}

/*
 * When handling a synchronous page fault and the inode needs an fsync, we can
 * insert the PTE/PMD into page tables only after that fsync has happened. Skip
 * insertion for now and return the pfn so that caller can insert it after the
 * fsync is done.
 */
static vm_fault_t dax_fault_synchronous_pfnp(pfn_t *pfnp, pfn_t pfn)
{
	if (WARN_ON_ONCE(!pfnp))
		return VM_FAULT_SIGBUS;
	*pfnp = pfn;
	return VM_FAULT_NEEDDSYNC;
}

static vm_fault_t dax_fault_cow_page(struct vm_fault *vmf,
		const struct iomap_iter *iter)
{
	vm_fault_t ret;
	int error = 0;

	switch (iter->iomap.type) {
	case IOMAP_HOLE:
	case IOMAP_UNWRITTEN:
		clear_user_highpage(vmf->cow_page, vmf->address);
		break;
	case IOMAP_MAPPED:
		error = copy_cow_page_dax(vmf, iter);
		break;
	default:
		WARN_ON_ONCE(1);
		error = -EIO;
		break;
	}

	if (error)
		return dax_fault_return(error);

	__SetPageUptodate(vmf->cow_page);
	ret = finish_fault(vmf);
	if (!ret)
		return VM_FAULT_DONE_COW;
	return ret;
}

/**
 * dax_fault_iter - Common actor to handle pfn insertion in PTE/PMD fault.
 * @vmf:	vm fault instance
 * @iter:	iomap iter
 * @pfnp:	pfn to be returned
 * @xas:	the dax mapping tree of a file
 * @entry:	an unlocked dax entry to be inserted
 * @pmd:	distinguish whether it is a pmd fault
 */
static vm_fault_t dax_fault_iter(struct vm_fault *vmf,
		const struct iomap_iter *iter, pfn_t *pfnp,
		struct xa_state *xas, void **entry, bool pmd)
{
	const struct iomap *iomap = &iter->iomap;
	const struct iomap *srcmap = iomap_iter_srcmap(iter);
	size_t size = pmd ? PMD_SIZE : PAGE_SIZE;
	loff_t pos = (loff_t)xas->xa_index << PAGE_SHIFT;
	bool write = iter->flags & IOMAP_WRITE;
	unsigned long entry_flags = pmd ? DAX_PMD : 0;
	struct folio *folio;
	int ret, err = 0;
	pfn_t pfn;
	void *kaddr;

	if (!pmd && vmf->cow_page)
		return dax_fault_cow_page(vmf, iter);

	/* if we are reading UNWRITTEN and HOLE, return a hole. */
	if (!write &&
	    (iomap->type == IOMAP_UNWRITTEN || iomap->type == IOMAP_HOLE)) {
		if (!pmd)
			return dax_load_hole(xas, vmf, iter, entry);
		return dax_pmd_load_hole(xas, vmf, iter, entry);
	}

	if (iomap->type != IOMAP_MAPPED && !(iomap->flags & IOMAP_F_SHARED)) {
		WARN_ON_ONCE(1);
		return pmd ? VM_FAULT_FALLBACK : VM_FAULT_SIGBUS;
	}

	err = dax_iomap_direct_access(iomap, pos, size, &kaddr, &pfn);
	if (err)
		return pmd ? VM_FAULT_FALLBACK : dax_fault_return(err);

	*entry = dax_insert_entry(xas, vmf, iter, *entry, pfn, entry_flags);

	if (write && iomap->flags & IOMAP_F_SHARED) {
		err = dax_iomap_copy_around(pos, size, size, srcmap, kaddr);
		if (err)
			return dax_fault_return(err);
	}

	folio = dax_to_folio(*entry);
	if (dax_fault_is_synchronous(iter, vmf->vma))
		return dax_fault_synchronous_pfnp(pfnp, pfn);

	folio_ref_inc(folio);
	if (pmd)
		ret = vmf_insert_folio_pmd(vmf, pfn_folio(pfn_t_to_pfn(pfn)),
					write);
	else
		ret = vmf_insert_page_mkwrite(vmf, pfn_t_to_page(pfn), write);
	folio_put(folio);

	return ret;
}

static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
			       int *iomap_errp, const struct iomap_ops *ops)
{
	struct address_space *mapping = vmf->vma->vm_file->f_mapping;
	XA_STATE(xas, &mapping->i_pages, vmf->pgoff);
	struct iomap_iter iter = {
		.inode		= mapping->host,
		.pos		= (loff_t)vmf->pgoff << PAGE_SHIFT,
		.len		= PAGE_SIZE,
		.flags		= IOMAP_DAX | IOMAP_FAULT,
	};
	vm_fault_t ret = 0;
	void *entry;
	int error;

	trace_dax_pte_fault(iter.inode, vmf, ret);
	/*
	 * Check whether offset isn't beyond end of file now. Caller is supposed
	 * to hold locks serializing us with truncate / punch hole so this is
	 * a reliable test.
	 */
	if (iter.pos >= i_size_read(iter.inode)) {
		ret = VM_FAULT_SIGBUS;
		goto out;
	}

	if ((vmf->flags & FAULT_FLAG_WRITE) && !vmf->cow_page)
		iter.flags |= IOMAP_WRITE;

	entry = grab_mapping_entry(&xas, mapping, 0);
	if (xa_is_internal(entry)) {
		ret = xa_to_internal(entry);
		goto out;
	}

	/*
	 * It is possible, particularly with mixed reads & writes to private
	 * mappings, that we have raced with a PMD fault that overlaps with
	 * the PTE we need to set up. If so just return and the fault will be
	 * retried.
	 */
	if (pmd_trans_huge(*vmf->pmd) || pmd_devmap(*vmf->pmd)) {
		ret = VM_FAULT_NOPAGE;
		goto unlock_entry;
	}

	while ((error = iomap_iter(&iter, ops)) > 0) {
		if (WARN_ON_ONCE(iomap_length(&iter) < PAGE_SIZE)) {
			iter.status = -EIO;	/* fs corruption? */
			continue;
		}

		ret = dax_fault_iter(vmf, &iter, pfnp, &xas, &entry, false);
		if (ret != VM_FAULT_SIGBUS &&
		    (iter.iomap.flags & IOMAP_F_NEW)) {
			count_vm_event(PGMAJFAULT);
			count_memcg_event_mm(vmf->vma->vm_mm, PGMAJFAULT);
			ret |= VM_FAULT_MAJOR;
		}

		if (!(ret & VM_FAULT_ERROR)) {
			u64 length = PAGE_SIZE;
			iter.status = iomap_iter_advance(&iter, &length);
		}
	}

	if (iomap_errp)
		*iomap_errp = error;
	if (!ret && error)
		ret = dax_fault_return(error);

unlock_entry:
	dax_unlock_entry(&xas, entry);
out:
	trace_dax_pte_fault_done(iter.inode, vmf, ret);
	return ret;
}

#ifdef CONFIG_FS_DAX_PMD
static bool dax_fault_check_fallback(struct vm_fault *vmf, struct xa_state *xas,
		pgoff_t max_pgoff)
{
	unsigned long pmd_addr = vmf->address & PMD_MASK;
	bool write = vmf->flags & FAULT_FLAG_WRITE;

	/*
	 * Make sure that the faulting address's PMD offset (color) matches
	 * the PMD offset from the start of the file. This is necessary so
	 * that a PMD range in the page table overlaps exactly with a PMD
	 * range in the page cache.
         */
        if ((vmf->pgoff & PG_PMD_COLOUR) !=
            ((vmf->address >> PAGE_SHIFT) & PG_PMD_COLOUR))
                return true;

        /* Fall back to PTEs if we're going to COW */
        if (write && !(vmf->vma->vm_flags & VM_SHARED))
                return true;

        /* If the PMD would extend outside the VMA */
        if (pmd_addr < vmf->vma->vm_start)
                return true;
        if ((pmd_addr + PMD_SIZE) > vmf->vma->vm_end)
                return true;

        /* If the PMD would extend beyond the file size */
        if ((xas->xa_index | PG_PMD_COLOUR) >= max_pgoff)
                return true;

        return false;
}
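
/*
 * Illustrative note (editorial addition, not from the original source): with
 * 4 KiB pages and 2 MiB PMDs, PG_PMD_COLOUR is 511, so the colour check above
 * compares the low nine bits of the file page offset with the low nine bits
 * of the faulting virtual page number. For example, a fault on the 3rd page
 * of its 2 MiB virtual region can only be served by a PMD if
 * vmf->pgoff & 511 is also 3; otherwise no 2 MiB extent can be mapped
 * naturally aligned in both the file and the page tables, and we fall back
 * to PTEs.
 */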
static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
                const struct iomap_ops *ops)
{
        struct address_space *mapping = vmf->vma->vm_file->f_mapping;
        XA_STATE_ORDER(xas, &mapping->i_pages, vmf->pgoff, PMD_ORDER);
        struct iomap_iter iter = {
                .inode          = mapping->host,
                .len            = PMD_SIZE,
                .flags          = IOMAP_DAX | IOMAP_FAULT,
        };
        vm_fault_t ret = VM_FAULT_FALLBACK;
        pgoff_t max_pgoff;
        void *entry;

        if (vmf->flags & FAULT_FLAG_WRITE)
                iter.flags |= IOMAP_WRITE;

        /*
         * Check whether offset isn't beyond end of file now. Caller is
         * supposed to hold locks serializing us with truncate / punch hole so
         * this is a reliable test.
         */
        max_pgoff = DIV_ROUND_UP(i_size_read(iter.inode), PAGE_SIZE);

        trace_dax_pmd_fault(iter.inode, vmf, max_pgoff, 0);

        if (xas.xa_index >= max_pgoff) {
                ret = VM_FAULT_SIGBUS;
                goto out;
        }

        if (dax_fault_check_fallback(vmf, &xas, max_pgoff))
                goto fallback;

        /*
         * grab_mapping_entry() will make sure we get an empty PMD entry,
         * a zero PMD entry or a DAX PMD. If it can't (because a PTE
         * entry is already in the array, for instance), it will return
         * VM_FAULT_FALLBACK.
         */
        entry = grab_mapping_entry(&xas, mapping, PMD_ORDER);
        if (xa_is_internal(entry)) {
                ret = xa_to_internal(entry);
                goto fallback;
        }

        /*
         * It is possible, particularly with mixed reads & writes to private
         * mappings, that we have raced with a PTE fault that overlaps with
         * the PMD we need to set up. If so just return and the fault will be
         * retried.
         */
        if (!pmd_none(*vmf->pmd) && !pmd_trans_huge(*vmf->pmd) &&
                        !pmd_devmap(*vmf->pmd)) {
                ret = 0;
                goto unlock_entry;
        }

        iter.pos = (loff_t)xas.xa_index << PAGE_SHIFT;
        while (iomap_iter(&iter, ops) > 0) {
                if (iomap_length(&iter) < PMD_SIZE)
                        continue; /* actually breaks out of the loop */

                ret = dax_fault_iter(vmf, &iter, pfnp, &xas, &entry, true);
                if (ret != VM_FAULT_FALLBACK) {
                        u64 length = PMD_SIZE;
                        iter.status = iomap_iter_advance(&iter, &length);
                }
        }

unlock_entry:
        dax_unlock_entry(&xas, entry);
fallback:
        if (ret == VM_FAULT_FALLBACK) {
                split_huge_pmd(vmf->vma, vmf->pmd, vmf->address);
                count_vm_event(THP_FAULT_FALLBACK);
        }
out:
        trace_dax_pmd_fault_done(iter.inode, vmf, max_pgoff, ret);
        return ret;
}
#else
static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
                const struct iomap_ops *ops)
{
        return VM_FAULT_FALLBACK;
}
#endif /* CONFIG_FS_DAX_PMD */

/**
 * dax_iomap_fault - handle a page fault on a DAX file
 * @vmf: The description of the fault
 * @order: Order of the page to fault in
 * @pfnp: PFN to insert for synchronous faults if fsync is required
 * @iomap_errp: Storage for detailed error code in case of error
 * @ops: Iomap ops passed from the file system
 *
 * When a page fault occurs, filesystems may call this helper in
 * their fault handler for DAX files. dax_iomap_fault() assumes the caller
 * has done all the necessary locking for page fault to proceed
 * successfully.
 */
vm_fault_t dax_iomap_fault(struct vm_fault *vmf, unsigned int order,
                    pfn_t *pfnp, int *iomap_errp, const struct iomap_ops *ops)
{
        if (order == 0)
                return dax_iomap_pte_fault(vmf, pfnp, iomap_errp, ops);
        else if (order == PMD_ORDER)
                return dax_iomap_pmd_fault(vmf, pfnp, ops);
        else
                return VM_FAULT_FALLBACK;
}
EXPORT_SYMBOL_GPL(dax_iomap_fault);
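
/*
 * Example (illustrative sketch, editorial addition, not part of the original
 * file): a DAX-capable filesystem typically wires dax_iomap_fault() into its
 * vm_operations_struct fault handlers, taking whatever lock serializes faults
 * against truncate before calling in. The names example_dax_fault() and
 * example_iomap_ops below are hypothetical; see the ext4 and xfs fault paths
 * for real callers.
 *
 *	static vm_fault_t example_dax_fault(struct vm_fault *vmf,
 *					    unsigned int order)
 *	{
 *		struct address_space *mapping = vmf->vma->vm_file->f_mapping;
 *		vm_fault_t ret;
 *		pfn_t pfn;
 *
 *		filemap_invalidate_lock_shared(mapping);
 *		ret = dax_iomap_fault(vmf, order, &pfn, NULL,
 *				      &example_iomap_ops);
 *		filemap_invalidate_unlock_shared(mapping);
 *
 *		return ret;
 *	}
 */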

/*
 * dax_insert_pfn_mkwrite - insert PTE or PMD entry into page tables
 * @vmf: The description of the fault
 * @pfn: PFN to insert
 * @order: Order of entry to insert.
 *
 * This function inserts a writeable PTE or PMD entry into the page tables
 * for an mmaped DAX file. It also marks the page cache entry as dirty.
 */
static vm_fault_t
dax_insert_pfn_mkwrite(struct vm_fault *vmf, pfn_t pfn, unsigned int order)
{
        struct address_space *mapping = vmf->vma->vm_file->f_mapping;
        XA_STATE_ORDER(xas, &mapping->i_pages, vmf->pgoff, order);
        struct folio *folio;
        void *entry;
        vm_fault_t ret;

        xas_lock_irq(&xas);
        entry = get_next_unlocked_entry(&xas, order);
        /* Did we race with someone splitting entry or so? */
        if (!entry || dax_is_conflict(entry) ||
            (order == 0 && !dax_is_pte_entry(entry))) {
                put_unlocked_entry(&xas, entry, WAKE_NEXT);
                xas_unlock_irq(&xas);
                trace_dax_insert_pfn_mkwrite_no_entry(mapping->host, vmf,
                                                      VM_FAULT_NOPAGE);
                return VM_FAULT_NOPAGE;
        }
        xas_set_mark(&xas, PAGECACHE_TAG_DIRTY);
        dax_lock_entry(&xas, entry);
        xas_unlock_irq(&xas);
        folio = pfn_folio(pfn_t_to_pfn(pfn));
        folio_ref_inc(folio);
        if (order == 0)
                ret = vmf_insert_page_mkwrite(vmf, &folio->page, true);
#ifdef CONFIG_FS_DAX_PMD
        else if (order == PMD_ORDER)
                ret = vmf_insert_folio_pmd(vmf, folio, FAULT_FLAG_WRITE);
#endif
        else
                ret = VM_FAULT_FALLBACK;
        folio_put(folio);
        dax_unlock_entry(&xas, entry);
        trace_dax_insert_pfn_mkwrite(mapping->host, vmf, ret);
        return ret;
}

/**
 * dax_finish_sync_fault - finish synchronous page fault
 * @vmf: The description of the fault
 * @order: Order of entry to be inserted
 * @pfn: PFN to insert
 *
 * This function ensures that the file range touched by the page fault is
 * stored persistently on the media and handles inserting of appropriate page
 * table entry.
 */
vm_fault_t dax_finish_sync_fault(struct vm_fault *vmf, unsigned int order,
                pfn_t pfn)
{
        int err;
        loff_t start = ((loff_t)vmf->pgoff) << PAGE_SHIFT;
        size_t len = PAGE_SIZE << order;

        err = vfs_fsync_range(vmf->vma->vm_file, start, start + len - 1, 1);
        if (err)
                return VM_FAULT_SIGBUS;
        return dax_insert_pfn_mkwrite(vmf, pfn, order);
}
EXPORT_SYMBOL_GPL(dax_finish_sync_fault);
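
/*
 * Example (illustrative sketch, editorial addition, not part of the original
 * file): when a fault on a MAP_SYNC mapping returns VM_FAULT_NEEDDSYNC, the
 * filesystem completes the fault by calling dax_finish_sync_fault(), which
 * fsyncs the faulted range and only then inserts the page table entry. The
 * names example_dax_huge_fault() and example_iomap_ops are hypothetical.
 *
 *	static vm_fault_t example_dax_huge_fault(struct vm_fault *vmf,
 *						 unsigned int order)
 *	{
 *		vm_fault_t ret;
 *		pfn_t pfn;
 *
 *		ret = dax_iomap_fault(vmf, order, &pfn, NULL,
 *				      &example_iomap_ops);
 *		if (ret & VM_FAULT_NEEDDSYNC)
 *			ret = dax_finish_sync_fault(vmf, order, pfn);
 *		return ret;
 *	}
 */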

static int dax_range_compare_iter(struct iomap_iter *it_src,
                struct iomap_iter *it_dest, u64 len, bool *same)
{
        const struct iomap *smap = &it_src->iomap;
        const struct iomap *dmap = &it_dest->iomap;
        loff_t pos1 = it_src->pos, pos2 = it_dest->pos;
        u64 dest_len;
        void *saddr, *daddr;
        int id, ret;

        len = min(len, min(smap->length, dmap->length));

        if (smap->type == IOMAP_HOLE && dmap->type == IOMAP_HOLE) {
                *same = true;
                goto advance;
        }

        if (smap->type == IOMAP_HOLE || dmap->type == IOMAP_HOLE) {
                *same = false;
                return 0;
        }

        id = dax_read_lock();
        ret = dax_iomap_direct_access(smap, pos1, ALIGN(pos1 + len, PAGE_SIZE),
                                      &saddr, NULL);
        if (ret < 0)
                goto out_unlock;

        ret = dax_iomap_direct_access(dmap, pos2, ALIGN(pos2 + len, PAGE_SIZE),
                                      &daddr, NULL);
        if (ret < 0)
                goto out_unlock;

        *same = !memcmp(saddr, daddr, len);
        if (!*same)
                len = 0;
        dax_read_unlock(id);

advance:
        dest_len = len;
        ret = iomap_iter_advance(it_src, &len);
        if (!ret)
                ret = iomap_iter_advance(it_dest, &dest_len);
        return ret;

out_unlock:
        dax_read_unlock(id);
        return -EIO;
}

int dax_dedupe_file_range_compare(struct inode *src, loff_t srcoff,
                struct inode *dst, loff_t dstoff, loff_t len, bool *same,
                const struct iomap_ops *ops)
{
        struct iomap_iter src_iter = {
                .inode          = src,
                .pos            = srcoff,
                .len            = len,
                .flags          = IOMAP_DAX,
        };
        struct iomap_iter dst_iter = {
                .inode          = dst,
                .pos            = dstoff,
                .len            = len,
                .flags          = IOMAP_DAX,
        };
        int ret, status;

        while ((ret = iomap_iter(&src_iter, ops)) > 0 &&
               (ret = iomap_iter(&dst_iter, ops)) > 0) {
                status = dax_range_compare_iter(&src_iter, &dst_iter,
                                min(src_iter.len, dst_iter.len), same);
                if (status < 0)
                        return ret;
                src_iter.status = dst_iter.status = status;
        }
        return ret;
}

int dax_remap_file_range_prep(struct file *file_in, loff_t pos_in,
                              struct file *file_out, loff_t pos_out,
                              loff_t *len, unsigned int remap_flags,
                              const struct iomap_ops *ops)
{
        return __generic_remap_file_range_prep(file_in, pos_in, file_out,
                                               pos_out, len, remap_flags, ops);
}
EXPORT_SYMBOL_GPL(dax_remap_file_range_prep);
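
/*
 * Example (illustrative sketch, editorial addition, not part of the original
 * file): a filesystem implementing reflink on DAX files can route its remap
 * preparation through dax_remap_file_range_prep() when the source inode is
 * DAX and fall back to the generic page cache variant otherwise, loosely
 * following the pattern used by xfs. The names example_remap_prep() and
 * example_read_iomap_ops are hypothetical.
 *
 *	static int example_remap_prep(struct file *file_in, loff_t pos_in,
 *				      struct file *file_out, loff_t pos_out,
 *				      loff_t *len, unsigned int remap_flags)
 *	{
 *		if (IS_DAX(file_inode(file_in)))
 *			return dax_remap_file_range_prep(file_in, pos_in,
 *					file_out, pos_out, len, remap_flags,
 *					&example_read_iomap_ops);
 *		return generic_remap_file_range_prep(file_in, pos_in, file_out,
 *				pos_out, len, remap_flags);
 *	}
 */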