// SPDX-License-Identifier: GPL-2.0-only
/*
 * fs/dax.c - Direct Access filesystem code
 * Copyright (c) 2013-2014 Intel Corporation
 * Author: Matthew Wilcox <matthew.r.wilcox@intel.com>
 * Author: Ross Zwisler <ross.zwisler@linux.intel.com>
 */

#include <linux/atomic.h>
#include <linux/blkdev.h>
#include <linux/buffer_head.h>
#include <linux/dax.h>
#include <linux/fs.h>
#include <linux/highmem.h>
#include <linux/memcontrol.h>
#include <linux/mm.h>
#include <linux/mutex.h>
#include <linux/sched.h>
#include <linux/sched/signal.h>
#include <linux/uio.h>
#include <linux/vmstat.h>
#include <linux/sizes.h>
#include <linux/mmu_notifier.h>
#include <linux/iomap.h>
#include <linux/rmap.h>
#include <linux/pgalloc.h>

#define CREATE_TRACE_POINTS
#include <trace/events/fs_dax.h>

/* We choose 4096 entries - same as per-zone page wait tables */
#define DAX_WAIT_TABLE_BITS 12
#define DAX_WAIT_TABLE_ENTRIES (1 << DAX_WAIT_TABLE_BITS)

/* The 'colour' (ie low bits) within a PMD of a page offset. */
#define PG_PMD_COLOUR	((PMD_SIZE >> PAGE_SHIFT) - 1)
#define PG_PMD_NR	(PMD_SIZE >> PAGE_SHIFT)

static wait_queue_head_t wait_table[DAX_WAIT_TABLE_ENTRIES];

static int __init init_dax_wait_table(void)
{
	int i;

	for (i = 0; i < DAX_WAIT_TABLE_ENTRIES; i++)
		init_waitqueue_head(wait_table + i);
	return 0;
}
fs_initcall(init_dax_wait_table);

/*
 * DAX pagecache entries use XArray value entries so they can't be mistaken
 * for pages.  We use one bit for locking, one bit for the entry size (PMD)
 * and two more to tell us if the entry is a zero page or an empty entry that
 * is just used for locking.  In total four special bits.
 *
 * If the PMD bit isn't set the entry has size PAGE_SIZE, and if the ZERO_PAGE
 * and EMPTY bits aren't set the entry is a normal DAX entry with a filesystem
 * block allocation.
 */
#define DAX_SHIFT	(4)
#define DAX_LOCKED	(1UL << 0)
#define DAX_PMD		(1UL << 1)
#define DAX_ZERO_PAGE	(1UL << 2)
#define DAX_EMPTY	(1UL << 3)
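
/*
 * Worked example (illustrative only): dax_make_entry(0x1234, DAX_PMD)
 * encodes the value (0x1234 << DAX_SHIFT) | DAX_PMD == 0x12342, which
 * xa_mk_value() then tags as an XArray value entry.  dax_to_pfn()
 * reverses this by shifting the four flag bits back out, returning
 * 0x1234.  Locking only toggles DAX_LOCKED (bit 0), giving 0x12343,
 * so the pfn and the size/type bits survive lock/unlock cycles.
 */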

static unsigned long dax_to_pfn(void *entry)
{
	return xa_to_value(entry) >> DAX_SHIFT;
}

static struct folio *dax_to_folio(void *entry)
{
	return page_folio(pfn_to_page(dax_to_pfn(entry)));
}

static void *dax_make_entry(unsigned long pfn, unsigned long flags)
{
	return xa_mk_value(flags | (pfn << DAX_SHIFT));
}

static bool dax_is_locked(void *entry)
{
	return xa_to_value(entry) & DAX_LOCKED;
}

static unsigned int dax_entry_order(void *entry)
{
	if (xa_to_value(entry) & DAX_PMD)
		return PMD_ORDER;
	return 0;
}

static unsigned long dax_is_pmd_entry(void *entry)
{
	return xa_to_value(entry) & DAX_PMD;
}

static bool dax_is_pte_entry(void *entry)
{
	return !(xa_to_value(entry) & DAX_PMD);
}

static int dax_is_zero_entry(void *entry)
{
	return xa_to_value(entry) & DAX_ZERO_PAGE;
}

static int dax_is_empty_entry(void *entry)
{
	return xa_to_value(entry) & DAX_EMPTY;
}

/*
 * true if the entry that was found is of a smaller order than the entry
 * we were looking for
 */
static bool dax_is_conflict(void *entry)
{
	return entry == XA_RETRY_ENTRY;
}

/*
 * DAX page cache entry locking
 */
struct exceptional_entry_key {
	struct xarray *xa;
	pgoff_t entry_start;
};

struct wait_exceptional_entry_queue {
	wait_queue_entry_t wait;
	struct exceptional_entry_key key;
};

/**
 * enum dax_wake_mode: waitqueue wakeup behaviour
 * @WAKE_ALL: wake all waiters in the waitqueue
 * @WAKE_NEXT: wake only the first waiter in the waitqueue
 */
enum dax_wake_mode {
	WAKE_ALL,
	WAKE_NEXT,
};

static wait_queue_head_t *dax_entry_waitqueue(struct xa_state *xas,
		void *entry, struct exceptional_entry_key *key)
{
	unsigned long hash;
	unsigned long index = xas->xa_index;

	/*
	 * If 'entry' is a PMD, align the 'index' that we use for the wait
	 * queue to the start of that PMD.  This ensures that all offsets in
	 * the range covered by the PMD map to the same bit lock.
	 */
	if (dax_is_pmd_entry(entry))
		index &= ~PG_PMD_COLOUR;
	key->xa = xas->xa;
	key->entry_start = index;

	hash = hash_long((unsigned long)xas->xa ^ index, DAX_WAIT_TABLE_BITS);
	return wait_table + hash;
}

static int wake_exceptional_entry_func(wait_queue_entry_t *wait,
		unsigned int mode, int sync, void *keyp)
{
	struct exceptional_entry_key *key = keyp;
	struct wait_exceptional_entry_queue *ewait =
		container_of(wait, struct wait_exceptional_entry_queue, wait);

	if (key->xa != ewait->key.xa ||
	    key->entry_start != ewait->key.entry_start)
		return 0;
	return autoremove_wake_function(wait, mode, sync, NULL);
}
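
/*
 * Worked example (illustrative only): on x86-64, PG_PMD_COLOUR is 0x1ff
 * (512 pages per PMD).  A waiter on index 0x2ab of a PMD entry and a
 * waker on index 0x200 both mask with ~PG_PMD_COLOUR, so they build the
 * same (xa, 0x200) key and hash to the same wait_table bucket; waiters
 * whose key does not match are filtered out again by
 * wake_exceptional_entry_func() above.
 */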
183 */ 184 static void dax_wake_entry(struct xa_state *xas, void *entry, 185 enum dax_wake_mode mode) 186 { 187 struct exceptional_entry_key key; 188 wait_queue_head_t *wq; 189 190 wq = dax_entry_waitqueue(xas, entry, &key); 191 192 /* 193 * Checking for locked entry and prepare_to_wait_exclusive() happens 194 * under the i_pages lock, ditto for entry handling in our callers. 195 * So at this point all tasks that could have seen our entry locked 196 * must be in the waitqueue and the following check will see them. 197 */ 198 if (waitqueue_active(wq)) 199 __wake_up(wq, TASK_NORMAL, mode == WAKE_ALL ? 0 : 1, &key); 200 } 201 202 /* 203 * Look up entry in page cache, wait for it to become unlocked if it 204 * is a DAX entry and return it. The caller must subsequently call 205 * put_unlocked_entry() if it did not lock the entry or dax_unlock_entry() 206 * if it did. The entry returned may have a larger order than @order. 207 * If @order is larger than the order of the entry found in i_pages, this 208 * function returns a dax_is_conflict entry. 209 * 210 * Must be called with the i_pages lock held. 211 */ 212 static void *get_next_unlocked_entry(struct xa_state *xas, unsigned int order) 213 { 214 void *entry; 215 struct wait_exceptional_entry_queue ewait; 216 wait_queue_head_t *wq; 217 218 init_wait(&ewait.wait); 219 ewait.wait.func = wake_exceptional_entry_func; 220 221 for (;;) { 222 entry = xas_find_conflict(xas); 223 if (!entry || WARN_ON_ONCE(!xa_is_value(entry))) 224 return entry; 225 if (dax_entry_order(entry) < order) 226 return XA_RETRY_ENTRY; 227 if (!dax_is_locked(entry)) 228 return entry; 229 230 wq = dax_entry_waitqueue(xas, entry, &ewait.key); 231 prepare_to_wait_exclusive(wq, &ewait.wait, 232 TASK_UNINTERRUPTIBLE); 233 xas_unlock_irq(xas); 234 xas_reset(xas); 235 schedule(); 236 finish_wait(wq, &ewait.wait); 237 xas_lock_irq(xas); 238 } 239 } 240 241 /* 242 * Wait for the given entry to become unlocked. Caller must hold the i_pages 243 * lock and call either put_unlocked_entry() if it did not lock the entry or 244 * dax_unlock_entry() if it did. Returns an unlocked entry if still present. 245 */ 246 static void *wait_entry_unlocked_exclusive(struct xa_state *xas, void *entry) 247 { 248 struct wait_exceptional_entry_queue ewait; 249 wait_queue_head_t *wq; 250 251 init_wait(&ewait.wait); 252 ewait.wait.func = wake_exceptional_entry_func; 253 254 while (unlikely(dax_is_locked(entry))) { 255 wq = dax_entry_waitqueue(xas, entry, &ewait.key); 256 prepare_to_wait_exclusive(wq, &ewait.wait, 257 TASK_UNINTERRUPTIBLE); 258 xas_reset(xas); 259 xas_unlock_irq(xas); 260 schedule(); 261 finish_wait(wq, &ewait.wait); 262 xas_lock_irq(xas); 263 entry = xas_load(xas); 264 } 265 266 if (xa_is_internal(entry)) 267 return NULL; 268 269 return entry; 270 } 271 272 /* 273 * The only thing keeping the address space around is the i_pages lock 274 * (it's cycled in clear_inode() after removing the entries from i_pages) 275 * After we call xas_unlock_irq(), we cannot touch xas->xa. 276 */ 277 static void wait_entry_unlocked(struct xa_state *xas, void *entry) 278 { 279 struct wait_exceptional_entry_queue ewait; 280 wait_queue_head_t *wq; 281 282 init_wait(&ewait.wait); 283 ewait.wait.func = wake_exceptional_entry_func; 284 285 wq = dax_entry_waitqueue(xas, entry, &ewait.key); 286 /* 287 * Unlike get_next_unlocked_entry() there is no guarantee that this 288 * path ever successfully retrieves an unlocked entry before an 289 * inode dies. 
	 * Perform a non-exclusive wait in case this path
	 * never successfully performs its own wake up.
	 */
	prepare_to_wait(wq, &ewait.wait, TASK_UNINTERRUPTIBLE);
	xas_unlock_irq(xas);
	schedule();
	finish_wait(wq, &ewait.wait);
}

static void put_unlocked_entry(struct xa_state *xas, void *entry,
			       enum dax_wake_mode mode)
{
	if (entry && !dax_is_conflict(entry))
		dax_wake_entry(xas, entry, mode);
}

/*
 * We used the xa_state to get the entry, but then we locked the entry and
 * dropped the xa_lock, so we know the xa_state is stale and must be reset
 * before use.
 */
static void dax_unlock_entry(struct xa_state *xas, void *entry)
{
	void *old;

	BUG_ON(dax_is_locked(entry));
	xas_reset(xas);
	xas_lock_irq(xas);
	old = xas_store(xas, entry);
	xas_unlock_irq(xas);
	BUG_ON(!dax_is_locked(old));
	dax_wake_entry(xas, entry, WAKE_NEXT);
}

/*
 * Return: The entry stored at this location before it was locked.
 */
static void *dax_lock_entry(struct xa_state *xas, void *entry)
{
	unsigned long v = xa_to_value(entry);

	return xas_store(xas, xa_mk_value(v | DAX_LOCKED));
}

static unsigned long dax_entry_size(void *entry)
{
	if (dax_is_zero_entry(entry))
		return 0;
	else if (dax_is_empty_entry(entry))
		return 0;
	else if (dax_is_pmd_entry(entry))
		return PMD_SIZE;
	else
		return PAGE_SIZE;
}

/*
 * A DAX folio is considered shared if it has no mapping set and ->share (which
 * shares the ->index field) is non-zero. Note this may return false even if the
 * page is shared between multiple files but has not yet actually been mapped
 * into multiple address spaces.
 */
static inline bool dax_folio_is_shared(struct folio *folio)
{
	return !folio->mapping && folio->share;
}

/*
 * When it is called by dax_insert_entry(), the shared flag will indicate
 * whether this entry is shared by multiple files.  If the page has not
 * previously been associated with any mappings the ->mapping and ->index
 * fields will be set.  If it has already been associated with a mapping
 * the mapping will be cleared and the share count set.  It's then up to
 * reverse map users like memory_failure() to call back into the filesystem to
 * recover ->mapping and ->index information, for example by implementing
 * dax_holder_operations.
 */
static void dax_folio_make_shared(struct folio *folio)
{
	/*
	 * folio is not currently shared so mark it as shared by clearing
	 * folio->mapping.
	 */
	folio->mapping = NULL;

	/*
	 * folio has previously been mapped into one address space so set the
	 * share count.
	 */
	folio->share = 1;
}

/**
 * dax_folio_reset_order - Reset a compound DAX folio to order-0 pages
 * @folio: The folio to reset
 *
 * Splits a compound folio back into individual order-0 pages,
 * clearing compound state and restoring pgmap pointers.
 *
 * Returns: the original folio order (0 if already order-0)
 */
int dax_folio_reset_order(struct folio *folio)
{
	struct dev_pagemap *pgmap = page_pgmap(&folio->page);
	int order = folio_order(folio);

	/*
	 * DAX maintains the invariant that folio->share != 0 only when
	 * folio->mapping == NULL (enforced by dax_folio_make_shared()).
	 * Equivalently: folio->mapping != NULL implies folio->share == 0.
398 * Callers ensure share has been decremented to zero before 399 * calling here, so unconditionally clearing both fields is 400 * correct. 401 */ 402 folio->mapping = NULL; 403 folio->share = 0; 404 405 if (!order) { 406 /* 407 * Restore pgmap explicitly even for order-0 folios. For the 408 * dax_folio_put() caller this is a no-op (same value), but 409 * fsdev_clear_folio_state() may call this on folios that 410 * were previously compound and need pgmap re-established. 411 */ 412 folio->pgmap = pgmap; 413 return 0; 414 } 415 416 folio_reset_order(folio); 417 418 for (int i = 0; i < (1UL << order); i++) { 419 struct page *page = folio_page(folio, i); 420 struct folio *f = (struct folio *)page; 421 422 ClearPageHead(page); 423 clear_compound_head(page); 424 f->mapping = NULL; 425 f->share = 0; 426 f->pgmap = pgmap; 427 } 428 429 return order; 430 } 431 EXPORT_SYMBOL_GPL(dax_folio_reset_order); 432 433 static inline unsigned long dax_folio_put(struct folio *folio) 434 { 435 unsigned long ref; 436 int order, i; 437 438 if (!dax_folio_is_shared(folio)) 439 ref = 0; 440 else 441 ref = --folio->share; 442 443 if (ref) 444 return ref; 445 446 order = dax_folio_reset_order(folio); 447 448 /* Debug check: verify refcounts are zero for all sub-folios */ 449 for (i = 0; i < (1UL << order); i++) { 450 struct page *page = folio_page(folio, i); 451 452 WARN_ON_ONCE(folio_ref_count((struct folio *)page)); 453 } 454 455 return ref; 456 } 457 458 static void dax_folio_init(void *entry) 459 { 460 struct folio *folio = dax_to_folio(entry); 461 int order = dax_entry_order(entry); 462 463 /* 464 * Folio should have been split back to order-0 pages in 465 * dax_folio_put() when they were removed from their 466 * final mapping. 467 */ 468 WARN_ON_ONCE(folio_order(folio)); 469 470 if (order > 0) { 471 prep_compound_page(&folio->page, order); 472 if (order > 1) 473 INIT_LIST_HEAD(&folio->_deferred_list); 474 WARN_ON_ONCE(folio_ref_count(folio)); 475 } 476 } 477 478 static void dax_associate_entry(void *entry, struct address_space *mapping, 479 struct vm_area_struct *vma, 480 unsigned long address, bool shared) 481 { 482 unsigned long size = dax_entry_size(entry), index; 483 struct folio *folio = dax_to_folio(entry); 484 485 if (dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) 486 return; 487 488 index = linear_page_index(vma, address & ~(size - 1)); 489 if (shared && (folio->mapping || dax_folio_is_shared(folio))) { 490 if (folio->mapping) 491 dax_folio_make_shared(folio); 492 493 WARN_ON_ONCE(!folio->share); 494 WARN_ON_ONCE(dax_entry_order(entry) != folio_order(folio)); 495 folio->share++; 496 } else { 497 WARN_ON_ONCE(folio->mapping); 498 dax_folio_init(entry); 499 folio = dax_to_folio(entry); 500 folio->mapping = mapping; 501 folio->index = index; 502 } 503 } 504 505 static void dax_disassociate_entry(void *entry, struct address_space *mapping, 506 bool trunc) 507 { 508 struct folio *folio = dax_to_folio(entry); 509 510 if (dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) 511 return; 512 513 dax_folio_put(folio); 514 } 515 516 static struct page *dax_busy_page(void *entry) 517 { 518 struct folio *folio = dax_to_folio(entry); 519 520 if (dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) 521 return NULL; 522 523 if (folio_ref_count(folio) - folio_mapcount(folio)) 524 return &folio->page; 525 else 526 return NULL; 527 } 528 529 /** 530 * dax_lock_folio - Lock the DAX entry corresponding to a folio 531 * @folio: The folio whose entry we want to lock 532 * 533 * Context: Process context. 
534 * Return: A cookie to pass to dax_unlock_folio() or 0 if the entry could 535 * not be locked. 536 */ 537 dax_entry_t dax_lock_folio(struct folio *folio) 538 { 539 XA_STATE(xas, NULL, 0); 540 void *entry; 541 542 /* Ensure folio->mapping isn't freed while we look at it */ 543 rcu_read_lock(); 544 for (;;) { 545 struct address_space *mapping = READ_ONCE(folio->mapping); 546 547 entry = NULL; 548 if (!mapping || !dax_mapping(mapping)) 549 break; 550 551 /* 552 * In the device-dax case there's no need to lock, a 553 * struct dev_pagemap pin is sufficient to keep the 554 * inode alive, and we assume we have dev_pagemap pin 555 * otherwise we would not have a valid pfn_to_page() 556 * translation. 557 */ 558 entry = (void *)~0UL; 559 if (S_ISCHR(mapping->host->i_mode)) 560 break; 561 562 xas.xa = &mapping->i_pages; 563 xas_lock_irq(&xas); 564 if (mapping != folio->mapping) { 565 xas_unlock_irq(&xas); 566 continue; 567 } 568 xas_set(&xas, folio->index); 569 entry = xas_load(&xas); 570 if (dax_is_locked(entry)) { 571 rcu_read_unlock(); 572 wait_entry_unlocked(&xas, entry); 573 rcu_read_lock(); 574 continue; 575 } 576 dax_lock_entry(&xas, entry); 577 xas_unlock_irq(&xas); 578 break; 579 } 580 rcu_read_unlock(); 581 return (dax_entry_t)entry; 582 } 583 584 void dax_unlock_folio(struct folio *folio, dax_entry_t cookie) 585 { 586 struct address_space *mapping = folio->mapping; 587 XA_STATE(xas, &mapping->i_pages, folio->index); 588 589 if (S_ISCHR(mapping->host->i_mode)) 590 return; 591 592 dax_unlock_entry(&xas, (void *)cookie); 593 } 594 595 /* 596 * dax_lock_mapping_entry - Lock the DAX entry corresponding to a mapping 597 * @mapping: the file's mapping whose entry we want to lock 598 * @index: the offset within this file 599 * @page: output the dax page corresponding to this dax entry 600 * 601 * Return: A cookie to pass to dax_unlock_mapping_entry() or 0 if the entry 602 * could not be locked. 603 */ 604 dax_entry_t dax_lock_mapping_entry(struct address_space *mapping, pgoff_t index, 605 struct page **page) 606 { 607 XA_STATE(xas, NULL, 0); 608 void *entry; 609 610 rcu_read_lock(); 611 for (;;) { 612 entry = NULL; 613 if (!dax_mapping(mapping)) 614 break; 615 616 xas.xa = &mapping->i_pages; 617 xas_lock_irq(&xas); 618 xas_set(&xas, index); 619 entry = xas_load(&xas); 620 if (dax_is_locked(entry)) { 621 rcu_read_unlock(); 622 wait_entry_unlocked(&xas, entry); 623 rcu_read_lock(); 624 continue; 625 } 626 if (!entry || 627 dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) { 628 /* 629 * Because we are looking for entry from file's mapping 630 * and index, so the entry may not be inserted for now, 631 * or even a zero/empty entry. We don't think this is 632 * an error case. So, return a special value and do 633 * not output @page. 634 */ 635 entry = (void *)~0UL; 636 } else { 637 *page = pfn_to_page(dax_to_pfn(entry)); 638 dax_lock_entry(&xas, entry); 639 } 640 xas_unlock_irq(&xas); 641 break; 642 } 643 rcu_read_unlock(); 644 return (dax_entry_t)entry; 645 } 646 647 void dax_unlock_mapping_entry(struct address_space *mapping, pgoff_t index, 648 dax_entry_t cookie) 649 { 650 XA_STATE(xas, &mapping->i_pages, index); 651 652 if (cookie == ~0UL) 653 return; 654 655 dax_unlock_entry(&xas, (void *)cookie); 656 } 657 658 /* 659 * Find page cache entry at given index. If it is a DAX entry, return it 660 * with the entry locked. If the page cache doesn't contain an entry at 661 * that index, add a locked empty entry. 
662 * 663 * When requesting an entry with size DAX_PMD, grab_mapping_entry() will 664 * either return that locked entry or will return VM_FAULT_FALLBACK. 665 * This will happen if there are any PTE entries within the PMD range 666 * that we are requesting. 667 * 668 * We always favor PTE entries over PMD entries. There isn't a flow where we 669 * evict PTE entries in order to 'upgrade' them to a PMD entry. A PMD 670 * insertion will fail if it finds any PTE entries already in the tree, and a 671 * PTE insertion will cause an existing PMD entry to be unmapped and 672 * downgraded to PTE entries. This happens for both PMD zero pages as 673 * well as PMD empty entries. 674 * 675 * The exception to this downgrade path is for PMD entries that have 676 * real storage backing them. We will leave these real PMD entries in 677 * the tree, and PTE writes will simply dirty the entire PMD entry. 678 * 679 * Note: Unlike filemap_fault() we don't honor FAULT_FLAG_RETRY flags. For 680 * persistent memory the benefit is doubtful. We can add that later if we can 681 * show it helps. 682 * 683 * On error, this function does not return an ERR_PTR. Instead it returns 684 * a VM_FAULT code, encoded as an xarray internal entry. The ERR_PTR values 685 * overlap with xarray value entries. 686 */ 687 static void *grab_mapping_entry(struct xa_state *xas, 688 struct address_space *mapping, unsigned int order) 689 { 690 unsigned long index = xas->xa_index; 691 bool pmd_downgrade; /* splitting PMD entry into PTE entries? */ 692 void *entry; 693 694 retry: 695 pmd_downgrade = false; 696 xas_lock_irq(xas); 697 entry = get_next_unlocked_entry(xas, order); 698 699 if (entry) { 700 if (dax_is_conflict(entry)) 701 goto fallback; 702 if (!xa_is_value(entry)) { 703 xas_set_err(xas, -EIO); 704 goto out_unlock; 705 } 706 707 if (order == 0) { 708 if (dax_is_pmd_entry(entry) && 709 (dax_is_zero_entry(entry) || 710 dax_is_empty_entry(entry))) { 711 pmd_downgrade = true; 712 } 713 } 714 } 715 716 if (pmd_downgrade) { 717 /* 718 * Make sure 'entry' remains valid while we drop 719 * the i_pages lock. 720 */ 721 dax_lock_entry(xas, entry); 722 723 /* 724 * Besides huge zero pages the only other thing that gets 725 * downgraded are empty entries which don't need to be 726 * unmapped. 
727 */ 728 if (dax_is_zero_entry(entry)) { 729 xas_unlock_irq(xas); 730 unmap_mapping_pages(mapping, 731 xas->xa_index & ~PG_PMD_COLOUR, 732 PG_PMD_NR, false); 733 xas_reset(xas); 734 xas_lock_irq(xas); 735 } 736 737 dax_disassociate_entry(entry, mapping, false); 738 xas_store(xas, NULL); /* undo the PMD join */ 739 dax_wake_entry(xas, entry, WAKE_ALL); 740 mapping->nrpages -= PG_PMD_NR; 741 entry = NULL; 742 xas_set(xas, index); 743 } 744 745 if (entry) { 746 dax_lock_entry(xas, entry); 747 } else { 748 unsigned long flags = DAX_EMPTY; 749 750 if (order > 0) 751 flags |= DAX_PMD; 752 entry = dax_make_entry(0, flags); 753 dax_lock_entry(xas, entry); 754 if (xas_error(xas)) 755 goto out_unlock; 756 mapping->nrpages += 1UL << order; 757 } 758 759 out_unlock: 760 xas_unlock_irq(xas); 761 if (xas_nomem(xas, mapping_gfp_mask(mapping) & ~__GFP_HIGHMEM)) 762 goto retry; 763 if (xas->xa_node == XA_ERROR(-ENOMEM)) 764 return xa_mk_internal(VM_FAULT_OOM); 765 if (xas_error(xas)) 766 return xa_mk_internal(VM_FAULT_SIGBUS); 767 return entry; 768 fallback: 769 xas_unlock_irq(xas); 770 return xa_mk_internal(VM_FAULT_FALLBACK); 771 } 772 773 /** 774 * dax_layout_busy_page_range - find first pinned page in @mapping 775 * @mapping: address space to scan for a page with ref count > 1 776 * @start: Starting offset. Page containing 'start' is included. 777 * @end: End offset. Page containing 'end' is included. If 'end' is LLONG_MAX, 778 * pages from 'start' till the end of file are included. 779 * 780 * DAX requires ZONE_DEVICE mapped pages. These pages are never 781 * 'onlined' to the page allocator so they are considered idle when 782 * page->count == 1. A filesystem uses this interface to determine if 783 * any page in the mapping is busy, i.e. for DMA, or other 784 * get_user_pages() usages. 785 * 786 * It is expected that the filesystem is holding locks to block the 787 * establishment of new mappings in this address_space. I.e. it expects 788 * to be able to run unmap_mapping_range() and subsequently not race 789 * mapping_mapped() becoming true. 790 */ 791 struct page *dax_layout_busy_page_range(struct address_space *mapping, 792 loff_t start, loff_t end) 793 { 794 void *entry; 795 unsigned int scanned = 0; 796 struct page *page = NULL; 797 pgoff_t start_idx = start >> PAGE_SHIFT; 798 pgoff_t end_idx; 799 XA_STATE(xas, &mapping->i_pages, start_idx); 800 801 if (!dax_mapping(mapping)) 802 return NULL; 803 804 /* If end == LLONG_MAX, all pages from start to till end of file */ 805 if (end == LLONG_MAX) 806 end_idx = ULONG_MAX; 807 else 808 end_idx = end >> PAGE_SHIFT; 809 /* 810 * If we race get_user_pages_fast() here either we'll see the 811 * elevated page count in the iteration and wait, or 812 * get_user_pages_fast() will see that the page it took a reference 813 * against is no longer mapped in the page tables and bail to the 814 * get_user_pages() slow path. The slow path is protected by 815 * pte_lock() and pmd_lock(). New references are not taken without 816 * holding those locks, and unmap_mapping_pages() will not zero the 817 * pte or pmd without holding the respective lock, so we are 818 * guaranteed to either see new references or prevent new 819 * references from being established. 
820 */ 821 unmap_mapping_pages(mapping, start_idx, end_idx - start_idx + 1, 0); 822 823 xas_lock_irq(&xas); 824 xas_for_each(&xas, entry, end_idx) { 825 if (WARN_ON_ONCE(!xa_is_value(entry))) 826 continue; 827 entry = wait_entry_unlocked_exclusive(&xas, entry); 828 if (entry) 829 page = dax_busy_page(entry); 830 put_unlocked_entry(&xas, entry, WAKE_NEXT); 831 if (page) 832 break; 833 if (++scanned % XA_CHECK_SCHED) 834 continue; 835 836 xas_pause(&xas); 837 xas_unlock_irq(&xas); 838 cond_resched(); 839 xas_lock_irq(&xas); 840 } 841 xas_unlock_irq(&xas); 842 return page; 843 } 844 EXPORT_SYMBOL_GPL(dax_layout_busy_page_range); 845 846 struct page *dax_layout_busy_page(struct address_space *mapping) 847 { 848 return dax_layout_busy_page_range(mapping, 0, LLONG_MAX); 849 } 850 EXPORT_SYMBOL_GPL(dax_layout_busy_page); 851 852 static int __dax_invalidate_entry(struct address_space *mapping, 853 pgoff_t index, bool trunc) 854 { 855 XA_STATE(xas, &mapping->i_pages, index); 856 int ret = 0; 857 void *entry; 858 859 xas_lock_irq(&xas); 860 entry = get_next_unlocked_entry(&xas, 0); 861 if (!entry || WARN_ON_ONCE(!xa_is_value(entry))) 862 goto out; 863 if (!trunc && 864 (xas_get_mark(&xas, PAGECACHE_TAG_DIRTY) || 865 xas_get_mark(&xas, PAGECACHE_TAG_TOWRITE))) 866 goto out; 867 dax_disassociate_entry(entry, mapping, trunc); 868 xas_store(&xas, NULL); 869 mapping->nrpages -= 1UL << dax_entry_order(entry); 870 ret = 1; 871 out: 872 put_unlocked_entry(&xas, entry, WAKE_ALL); 873 xas_unlock_irq(&xas); 874 return ret; 875 } 876 877 static int __dax_clear_dirty_range(struct address_space *mapping, 878 pgoff_t start, pgoff_t end) 879 { 880 XA_STATE(xas, &mapping->i_pages, start); 881 unsigned int scanned = 0; 882 void *entry; 883 884 xas_lock_irq(&xas); 885 xas_for_each(&xas, entry, end) { 886 entry = wait_entry_unlocked_exclusive(&xas, entry); 887 if (!entry) 888 continue; 889 xas_clear_mark(&xas, PAGECACHE_TAG_DIRTY); 890 xas_clear_mark(&xas, PAGECACHE_TAG_TOWRITE); 891 put_unlocked_entry(&xas, entry, WAKE_NEXT); 892 893 if (++scanned % XA_CHECK_SCHED) 894 continue; 895 896 xas_pause(&xas); 897 xas_unlock_irq(&xas); 898 cond_resched(); 899 xas_lock_irq(&xas); 900 } 901 xas_unlock_irq(&xas); 902 903 return 0; 904 } 905 906 /* 907 * Delete DAX entry at @index from @mapping. Wait for it 908 * to be unlocked before deleting it. 909 */ 910 int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index) 911 { 912 int ret = __dax_invalidate_entry(mapping, index, true); 913 914 /* 915 * This gets called from truncate / punch_hole path. As such, the caller 916 * must hold locks protecting against concurrent modifications of the 917 * page cache (usually fs-private i_mmap_sem for writing). Since the 918 * caller has seen a DAX entry for this index, we better find it 919 * at that index as well... 
920 */ 921 WARN_ON_ONCE(!ret); 922 return ret; 923 } 924 925 void dax_delete_mapping_range(struct address_space *mapping, 926 loff_t start, loff_t end) 927 { 928 void *entry; 929 pgoff_t start_idx = start >> PAGE_SHIFT; 930 pgoff_t end_idx; 931 XA_STATE(xas, &mapping->i_pages, start_idx); 932 933 /* If end == LLONG_MAX, all pages from start to till end of file */ 934 if (end == LLONG_MAX) 935 end_idx = ULONG_MAX; 936 else 937 end_idx = end >> PAGE_SHIFT; 938 939 xas_lock_irq(&xas); 940 xas_for_each(&xas, entry, end_idx) { 941 if (!xa_is_value(entry)) 942 continue; 943 entry = wait_entry_unlocked_exclusive(&xas, entry); 944 if (!entry) 945 continue; 946 dax_disassociate_entry(entry, mapping, true); 947 xas_store(&xas, NULL); 948 mapping->nrpages -= 1UL << dax_entry_order(entry); 949 put_unlocked_entry(&xas, entry, WAKE_ALL); 950 } 951 xas_unlock_irq(&xas); 952 } 953 EXPORT_SYMBOL_GPL(dax_delete_mapping_range); 954 955 static int wait_page_idle(struct page *page, 956 void (cb)(struct inode *), 957 struct inode *inode) 958 { 959 return ___wait_var_event(page, dax_page_is_idle(page), 960 TASK_INTERRUPTIBLE, 0, 0, cb(inode)); 961 } 962 963 static void wait_page_idle_uninterruptible(struct page *page, 964 struct inode *inode) 965 { 966 ___wait_var_event(page, dax_page_is_idle(page), 967 TASK_UNINTERRUPTIBLE, 0, 0, schedule()); 968 } 969 970 /* 971 * Unmaps the inode and waits for any DMA to complete prior to deleting the 972 * DAX mapping entries for the range. 973 * 974 * For NOWAIT behavior, pass @cb as NULL to early-exit on first found 975 * busy page 976 */ 977 int dax_break_layout(struct inode *inode, loff_t start, loff_t end, 978 void (cb)(struct inode *)) 979 { 980 struct page *page; 981 int error = 0; 982 983 if (!dax_mapping(inode->i_mapping)) 984 return 0; 985 986 do { 987 page = dax_layout_busy_page_range(inode->i_mapping, start, end); 988 if (!page) 989 break; 990 if (!cb) { 991 error = -ERESTARTSYS; 992 break; 993 } 994 995 error = wait_page_idle(page, cb, inode); 996 } while (error == 0); 997 998 if (!page) 999 dax_delete_mapping_range(inode->i_mapping, start, end); 1000 1001 return error; 1002 } 1003 EXPORT_SYMBOL_GPL(dax_break_layout); 1004 1005 void dax_break_layout_final(struct inode *inode) 1006 { 1007 struct page *page; 1008 1009 if (!dax_mapping(inode->i_mapping)) 1010 return; 1011 1012 do { 1013 page = dax_layout_busy_page_range(inode->i_mapping, 0, 1014 LLONG_MAX); 1015 if (!page) 1016 break; 1017 1018 wait_page_idle_uninterruptible(page, inode); 1019 } while (true); 1020 1021 if (!page) 1022 dax_delete_mapping_range(inode->i_mapping, 0, LLONG_MAX); 1023 } 1024 EXPORT_SYMBOL_GPL(dax_break_layout_final); 1025 1026 /* 1027 * Invalidate DAX entry if it is clean. 
1028 */ 1029 int dax_invalidate_mapping_entry_sync(struct address_space *mapping, 1030 pgoff_t index) 1031 { 1032 return __dax_invalidate_entry(mapping, index, false); 1033 } 1034 1035 static pgoff_t dax_iomap_pgoff(const struct iomap *iomap, loff_t pos) 1036 { 1037 return PHYS_PFN(iomap->addr + (pos & PAGE_MASK) - iomap->offset); 1038 } 1039 1040 static int copy_cow_page_dax(struct vm_fault *vmf, const struct iomap_iter *iter) 1041 { 1042 pgoff_t pgoff = dax_iomap_pgoff(&iter->iomap, iter->pos); 1043 void *vto, *kaddr; 1044 long rc; 1045 int id; 1046 1047 id = dax_read_lock(); 1048 rc = dax_direct_access(iter->iomap.dax_dev, pgoff, 1, DAX_ACCESS, 1049 &kaddr, NULL); 1050 if (rc < 0) { 1051 dax_read_unlock(id); 1052 return rc; 1053 } 1054 vto = kmap_atomic(vmf->cow_page); 1055 copy_user_page(vto, kaddr, vmf->address, vmf->cow_page); 1056 kunmap_atomic(vto); 1057 dax_read_unlock(id); 1058 return 0; 1059 } 1060 1061 /* 1062 * MAP_SYNC on a dax mapping guarantees dirty metadata is 1063 * flushed on write-faults (non-cow), but not read-faults. 1064 */ 1065 static bool dax_fault_is_synchronous(const struct iomap_iter *iter, 1066 struct vm_area_struct *vma) 1067 { 1068 return (iter->flags & IOMAP_WRITE) && (vma->vm_flags & VM_SYNC) && 1069 (iter->iomap.flags & IOMAP_F_DIRTY); 1070 } 1071 1072 /* 1073 * By this point grab_mapping_entry() has ensured that we have a locked entry 1074 * of the appropriate size so we don't have to worry about downgrading PMDs to 1075 * PTEs. If we happen to be trying to insert a PTE and there is a PMD 1076 * already in the tree, we will skip the insertion and just dirty the PMD as 1077 * appropriate. 1078 */ 1079 static void *dax_insert_entry(struct xa_state *xas, struct vm_fault *vmf, 1080 const struct iomap_iter *iter, void *entry, unsigned long pfn, 1081 unsigned long flags) 1082 { 1083 struct address_space *mapping = vmf->vma->vm_file->f_mapping; 1084 void *new_entry = dax_make_entry(pfn, flags); 1085 bool write = iter->flags & IOMAP_WRITE; 1086 bool dirty = write && !dax_fault_is_synchronous(iter, vmf->vma); 1087 bool shared = iter->iomap.flags & IOMAP_F_SHARED; 1088 1089 if (dirty) 1090 __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); 1091 1092 if (shared || (dax_is_zero_entry(entry) && !(flags & DAX_ZERO_PAGE))) { 1093 unsigned long index = xas->xa_index; 1094 /* we are replacing a zero page with block mapping */ 1095 if (dax_is_pmd_entry(entry)) 1096 unmap_mapping_pages(mapping, index & ~PG_PMD_COLOUR, 1097 PG_PMD_NR, false); 1098 else /* pte entry */ 1099 unmap_mapping_pages(mapping, index, 1, false); 1100 } 1101 1102 xas_reset(xas); 1103 xas_lock_irq(xas); 1104 if (shared || dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) { 1105 void *old; 1106 1107 dax_disassociate_entry(entry, mapping, false); 1108 dax_associate_entry(new_entry, mapping, vmf->vma, 1109 vmf->address, shared); 1110 1111 /* 1112 * Only swap our new entry into the page cache if the current 1113 * entry is a zero page or an empty entry. If a normal PTE or 1114 * PMD entry is already in the cache, we leave it alone. This 1115 * means that if we are trying to insert a PTE and the 1116 * existing entry is a PMD, we will just leave the PMD in the 1117 * tree and dirty it if necessary. 
1118 */ 1119 old = dax_lock_entry(xas, new_entry); 1120 WARN_ON_ONCE(old != xa_mk_value(xa_to_value(entry) | 1121 DAX_LOCKED)); 1122 entry = new_entry; 1123 } else { 1124 xas_load(xas); /* Walk the xa_state */ 1125 } 1126 1127 if (dirty) 1128 xas_set_mark(xas, PAGECACHE_TAG_DIRTY); 1129 1130 if (write && shared) 1131 xas_set_mark(xas, PAGECACHE_TAG_TOWRITE); 1132 1133 xas_unlock_irq(xas); 1134 return entry; 1135 } 1136 1137 static int dax_writeback_one(struct xa_state *xas, struct dax_device *dax_dev, 1138 struct address_space *mapping, void *entry) 1139 { 1140 unsigned long pfn, index, count, end; 1141 long ret = 0; 1142 struct vm_area_struct *vma; 1143 1144 /* 1145 * A page got tagged dirty in DAX mapping? Something is seriously 1146 * wrong. 1147 */ 1148 if (WARN_ON(!xa_is_value(entry))) 1149 return -EIO; 1150 1151 if (unlikely(dax_is_locked(entry))) { 1152 void *old_entry = entry; 1153 1154 entry = get_next_unlocked_entry(xas, 0); 1155 1156 /* Entry got punched out / reallocated? */ 1157 if (!entry || WARN_ON_ONCE(!xa_is_value(entry))) 1158 goto put_unlocked; 1159 /* 1160 * Entry got reallocated elsewhere? No need to writeback. 1161 * We have to compare pfns as we must not bail out due to 1162 * difference in lockbit or entry type. 1163 */ 1164 if (dax_to_pfn(old_entry) != dax_to_pfn(entry)) 1165 goto put_unlocked; 1166 if (WARN_ON_ONCE(dax_is_empty_entry(entry) || 1167 dax_is_zero_entry(entry))) { 1168 ret = -EIO; 1169 goto put_unlocked; 1170 } 1171 1172 /* Another fsync thread may have already done this entry */ 1173 if (!xas_get_mark(xas, PAGECACHE_TAG_TOWRITE)) 1174 goto put_unlocked; 1175 } 1176 1177 /* Lock the entry to serialize with page faults */ 1178 dax_lock_entry(xas, entry); 1179 1180 /* 1181 * We can clear the tag now but we have to be careful so that concurrent 1182 * dax_writeback_one() calls for the same index cannot finish before we 1183 * actually flush the caches. This is achieved as the calls will look 1184 * at the entry only under the i_pages lock and once they do that 1185 * they will see the entry locked and wait for it to unlock. 1186 */ 1187 xas_clear_mark(xas, PAGECACHE_TAG_TOWRITE); 1188 xas_unlock_irq(xas); 1189 1190 /* 1191 * If dax_writeback_mapping_range() was given a wbc->range_start 1192 * in the middle of a PMD, the 'index' we use needs to be 1193 * aligned to the start of the PMD. 1194 * This allows us to flush for PMD_SIZE and not have to worry about 1195 * partial PMD writebacks. 1196 */ 1197 pfn = dax_to_pfn(entry); 1198 count = 1UL << dax_entry_order(entry); 1199 index = xas->xa_index & ~(count - 1); 1200 end = index + count - 1; 1201 1202 /* Walk all mappings of a given index of a file and writeprotect them */ 1203 i_mmap_lock_read(mapping); 1204 vma_interval_tree_foreach(vma, &mapping->i_mmap, index, end) { 1205 pfn_mkclean_range(pfn, count, index, vma); 1206 cond_resched(); 1207 } 1208 i_mmap_unlock_read(mapping); 1209 1210 dax_flush(dax_dev, page_address(pfn_to_page(pfn)), count * PAGE_SIZE); 1211 /* 1212 * After we have flushed the cache, we can clear the dirty tag. There 1213 * cannot be new dirty data in the pfn after the flush has completed as 1214 * the pfn mappings are writeprotected and fault waits for mapping 1215 * entry lock. 
1216 */ 1217 xas_reset(xas); 1218 xas_lock_irq(xas); 1219 xas_store(xas, entry); 1220 xas_clear_mark(xas, PAGECACHE_TAG_DIRTY); 1221 dax_wake_entry(xas, entry, WAKE_NEXT); 1222 1223 trace_dax_writeback_one(mapping->host, index, count); 1224 return ret; 1225 1226 put_unlocked: 1227 put_unlocked_entry(xas, entry, WAKE_NEXT); 1228 return ret; 1229 } 1230 1231 /* 1232 * Flush the mapping to the persistent domain within the byte range of [start, 1233 * end]. This is required by data integrity operations to ensure file data is 1234 * on persistent storage prior to completion of the operation. 1235 */ 1236 int dax_writeback_mapping_range(struct address_space *mapping, 1237 struct dax_device *dax_dev, struct writeback_control *wbc) 1238 { 1239 XA_STATE(xas, &mapping->i_pages, wbc->range_start >> PAGE_SHIFT); 1240 struct inode *inode = mapping->host; 1241 pgoff_t end_index = wbc->range_end >> PAGE_SHIFT; 1242 void *entry; 1243 int ret = 0; 1244 unsigned int scanned = 0; 1245 1246 if (WARN_ON_ONCE(inode->i_blkbits != PAGE_SHIFT)) 1247 return -EIO; 1248 1249 if (mapping_empty(mapping) || wbc->sync_mode != WB_SYNC_ALL) 1250 return 0; 1251 1252 trace_dax_writeback_range(inode, xas.xa_index, end_index); 1253 1254 tag_pages_for_writeback(mapping, xas.xa_index, end_index); 1255 1256 xas_lock_irq(&xas); 1257 xas_for_each_marked(&xas, entry, end_index, PAGECACHE_TAG_TOWRITE) { 1258 ret = dax_writeback_one(&xas, dax_dev, mapping, entry); 1259 if (ret < 0) { 1260 mapping_set_error(mapping, ret); 1261 break; 1262 } 1263 if (++scanned % XA_CHECK_SCHED) 1264 continue; 1265 1266 xas_pause(&xas); 1267 xas_unlock_irq(&xas); 1268 cond_resched(); 1269 xas_lock_irq(&xas); 1270 } 1271 xas_unlock_irq(&xas); 1272 trace_dax_writeback_range_done(inode, xas.xa_index, end_index); 1273 return ret; 1274 } 1275 EXPORT_SYMBOL_GPL(dax_writeback_mapping_range); 1276 1277 static int dax_iomap_direct_access(const struct iomap *iomap, loff_t pos, 1278 size_t size, void **kaddr, unsigned long *pfnp) 1279 { 1280 pgoff_t pgoff = dax_iomap_pgoff(iomap, pos); 1281 int id, rc = 0; 1282 long length; 1283 1284 id = dax_read_lock(); 1285 length = dax_direct_access(iomap->dax_dev, pgoff, PHYS_PFN(size), 1286 DAX_ACCESS, kaddr, pfnp); 1287 if (length < 0) { 1288 rc = length; 1289 goto out; 1290 } 1291 if (!pfnp) 1292 goto out_check_addr; 1293 rc = -EINVAL; 1294 if (PFN_PHYS(length) < size) 1295 goto out; 1296 if (*pfnp & (PHYS_PFN(size)-1)) 1297 goto out; 1298 1299 rc = 0; 1300 1301 out_check_addr: 1302 if (!kaddr) 1303 goto out; 1304 if (!*kaddr) 1305 rc = -EFAULT; 1306 out: 1307 dax_read_unlock(id); 1308 return rc; 1309 } 1310 1311 /** 1312 * dax_iomap_copy_around - Prepare for an unaligned write to a shared/cow page 1313 * by copying the data before and after the range to be written. 1314 * @pos: address to do copy from. 1315 * @length: size of copy operation. 1316 * @align_size: aligned w.r.t align_size (either PMD_SIZE or PAGE_SIZE) 1317 * @srcmap: iomap srcmap 1318 * @daddr: destination address to copy to. 1319 * 1320 * This can be called from two places. Either during DAX write fault (page 1321 * aligned), to copy the length size data to daddr. Or, while doing normal DAX 1322 * write operation, dax_iomap_iter() might call this to do the copy of either 1323 * start or end unaligned address. In the latter case the rest of the copy of 1324 * aligned ranges is taken care by dax_iomap_iter() itself. 1325 * If the srcmap contains invalid data, such as HOLE and UNWRITTEN, zero the 1326 * area to make sure no old data remains. 
1327 */ 1328 static int dax_iomap_copy_around(loff_t pos, uint64_t length, size_t align_size, 1329 const struct iomap *srcmap, void *daddr) 1330 { 1331 loff_t head_off = pos & (align_size - 1); 1332 size_t size = ALIGN(head_off + length, align_size); 1333 loff_t end = pos + length; 1334 loff_t pg_end = round_up(end, align_size); 1335 /* copy_all is usually in page fault case */ 1336 bool copy_all = head_off == 0 && end == pg_end; 1337 /* zero the edges if srcmap is a HOLE or IOMAP_UNWRITTEN */ 1338 bool zero_edge = srcmap->flags & IOMAP_F_SHARED || 1339 srcmap->type == IOMAP_UNWRITTEN; 1340 void *saddr = NULL; 1341 int ret = 0; 1342 1343 if (!zero_edge) { 1344 ret = dax_iomap_direct_access(srcmap, pos, size, &saddr, NULL); 1345 if (ret) 1346 return dax_mem2blk_err(ret); 1347 } 1348 1349 if (copy_all) { 1350 if (zero_edge) 1351 memset(daddr, 0, size); 1352 else 1353 ret = copy_mc_to_kernel(daddr, saddr, length); 1354 goto out; 1355 } 1356 1357 /* Copy the head part of the range */ 1358 if (head_off) { 1359 if (zero_edge) 1360 memset(daddr, 0, head_off); 1361 else { 1362 ret = copy_mc_to_kernel(daddr, saddr, head_off); 1363 if (ret) 1364 return -EIO; 1365 } 1366 } 1367 1368 /* Copy the tail part of the range */ 1369 if (end < pg_end) { 1370 loff_t tail_off = head_off + length; 1371 loff_t tail_len = pg_end - end; 1372 1373 if (zero_edge) 1374 memset(daddr + tail_off, 0, tail_len); 1375 else { 1376 ret = copy_mc_to_kernel(daddr + tail_off, 1377 saddr + tail_off, tail_len); 1378 if (ret) 1379 return -EIO; 1380 } 1381 } 1382 out: 1383 if (zero_edge) 1384 dax_flush(srcmap->dax_dev, daddr, size); 1385 return ret ? -EIO : 0; 1386 } 1387 1388 /* 1389 * The user has performed a load from a hole in the file. Allocating a new 1390 * page in the file would cause excessive storage usage for workloads with 1391 * sparse files. Instead we insert a read-only mapping of the 4k zero page. 1392 * If this page is ever written to we will re-fault and change the mapping to 1393 * point to real DAX storage instead. 
1394 */ 1395 static vm_fault_t dax_load_hole(struct xa_state *xas, struct vm_fault *vmf, 1396 const struct iomap_iter *iter, void **entry) 1397 { 1398 struct inode *inode = iter->inode; 1399 unsigned long vaddr = vmf->address; 1400 unsigned long pfn = zero_pfn(vaddr); 1401 vm_fault_t ret; 1402 1403 *entry = dax_insert_entry(xas, vmf, iter, *entry, pfn, DAX_ZERO_PAGE); 1404 1405 ret = vmf_insert_page_mkwrite(vmf, pfn_to_page(pfn), false); 1406 trace_dax_load_hole(inode, vmf, ret); 1407 return ret; 1408 } 1409 1410 #ifdef CONFIG_FS_DAX_PMD 1411 static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf, 1412 const struct iomap_iter *iter, void **entry) 1413 { 1414 struct address_space *mapping = vmf->vma->vm_file->f_mapping; 1415 struct inode *inode = mapping->host; 1416 struct folio *zero_folio; 1417 vm_fault_t ret; 1418 1419 zero_folio = mm_get_huge_zero_folio(vmf->vma->vm_mm); 1420 1421 if (unlikely(!zero_folio)) { 1422 trace_dax_pmd_load_hole_fallback(inode, vmf, zero_folio, *entry); 1423 return VM_FAULT_FALLBACK; 1424 } 1425 1426 *entry = dax_insert_entry(xas, vmf, iter, *entry, folio_pfn(zero_folio), 1427 DAX_PMD | DAX_ZERO_PAGE); 1428 1429 ret = vmf_insert_folio_pmd(vmf, zero_folio, false); 1430 if (ret == VM_FAULT_NOPAGE) 1431 trace_dax_pmd_load_hole(inode, vmf, zero_folio, *entry); 1432 return ret; 1433 } 1434 #else 1435 static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf, 1436 const struct iomap_iter *iter, void **entry) 1437 { 1438 return VM_FAULT_FALLBACK; 1439 } 1440 #endif /* CONFIG_FS_DAX_PMD */ 1441 1442 static int dax_unshare_iter(struct iomap_iter *iter) 1443 { 1444 struct iomap *iomap = &iter->iomap; 1445 const struct iomap *srcmap = iomap_iter_srcmap(iter); 1446 loff_t copy_pos = iter->pos; 1447 u64 copy_len = iomap_length(iter); 1448 u32 mod; 1449 int id = 0; 1450 s64 ret; 1451 void *daddr = NULL, *saddr = NULL; 1452 1453 if (!iomap_want_unshare_iter(iter)) 1454 return iomap_iter_advance_full(iter); 1455 1456 /* 1457 * Extend the file range to be aligned to fsblock/pagesize, because 1458 * we need to copy entire blocks, not just the byte range specified. 1459 * Invalidate the mapping because we're about to CoW. 
1460 */ 1461 mod = offset_in_page(copy_pos); 1462 if (mod) { 1463 copy_len += mod; 1464 copy_pos -= mod; 1465 } 1466 1467 mod = offset_in_page(copy_pos + copy_len); 1468 if (mod) 1469 copy_len += PAGE_SIZE - mod; 1470 1471 invalidate_inode_pages2_range(iter->inode->i_mapping, 1472 copy_pos >> PAGE_SHIFT, 1473 (copy_pos + copy_len - 1) >> PAGE_SHIFT); 1474 1475 id = dax_read_lock(); 1476 ret = dax_iomap_direct_access(iomap, copy_pos, copy_len, &daddr, NULL); 1477 if (ret < 0) 1478 goto out_unlock; 1479 1480 ret = dax_iomap_direct_access(srcmap, copy_pos, copy_len, &saddr, NULL); 1481 if (ret < 0) 1482 goto out_unlock; 1483 1484 if (copy_mc_to_kernel(daddr, saddr, copy_len) != 0) 1485 ret = -EIO; 1486 1487 out_unlock: 1488 dax_read_unlock(id); 1489 if (ret < 0) 1490 return dax_mem2blk_err(ret); 1491 return iomap_iter_advance_full(iter); 1492 } 1493 1494 int dax_file_unshare(struct inode *inode, loff_t pos, loff_t len, 1495 const struct iomap_ops *ops) 1496 { 1497 struct iomap_iter iter = { 1498 .inode = inode, 1499 .pos = pos, 1500 .flags = IOMAP_WRITE | IOMAP_UNSHARE | IOMAP_DAX, 1501 }; 1502 loff_t size = i_size_read(inode); 1503 int ret; 1504 1505 if (pos < 0 || pos >= size) 1506 return 0; 1507 1508 iter.len = min(len, size - pos); 1509 while ((ret = iomap_iter(&iter, ops)) > 0) 1510 iter.status = dax_unshare_iter(&iter); 1511 return ret; 1512 } 1513 EXPORT_SYMBOL_GPL(dax_file_unshare); 1514 1515 static int dax_memzero(struct iomap_iter *iter, loff_t pos, size_t size) 1516 { 1517 const struct iomap *iomap = &iter->iomap; 1518 const struct iomap *srcmap = iomap_iter_srcmap(iter); 1519 unsigned offset = offset_in_page(pos); 1520 pgoff_t pgoff = dax_iomap_pgoff(iomap, pos); 1521 void *kaddr; 1522 long ret; 1523 1524 ret = dax_direct_access(iomap->dax_dev, pgoff, 1, DAX_ACCESS, &kaddr, 1525 NULL); 1526 if (ret < 0) 1527 return dax_mem2blk_err(ret); 1528 1529 memset(kaddr + offset, 0, size); 1530 if (iomap->flags & IOMAP_F_SHARED) 1531 ret = dax_iomap_copy_around(pos, size, PAGE_SIZE, srcmap, 1532 kaddr); 1533 else 1534 dax_flush(iomap->dax_dev, kaddr + offset, size); 1535 return ret; 1536 } 1537 1538 static int dax_zero_iter(struct iomap_iter *iter, bool *did_zero) 1539 { 1540 const struct iomap *iomap = &iter->iomap; 1541 const struct iomap *srcmap = iomap_iter_srcmap(iter); 1542 u64 length = iomap_length(iter); 1543 int ret; 1544 1545 /* already zeroed? we're done. */ 1546 if (srcmap->type == IOMAP_HOLE || srcmap->type == IOMAP_UNWRITTEN) 1547 return iomap_iter_advance(iter, length); 1548 1549 /* 1550 * invalidate the pages whose sharing state is to be changed 1551 * because of CoW. 
1552 */ 1553 if (iomap->flags & IOMAP_F_SHARED) 1554 invalidate_inode_pages2_range(iter->inode->i_mapping, 1555 iter->pos >> PAGE_SHIFT, 1556 (iter->pos + length - 1) >> PAGE_SHIFT); 1557 1558 do { 1559 loff_t pos = iter->pos; 1560 unsigned offset = offset_in_page(pos); 1561 pgoff_t pgoff = dax_iomap_pgoff(iomap, pos); 1562 int id; 1563 1564 length = min_t(u64, PAGE_SIZE - offset, length); 1565 1566 id = dax_read_lock(); 1567 if (IS_ALIGNED(pos, PAGE_SIZE) && length == PAGE_SIZE) 1568 ret = dax_zero_page_range(iomap->dax_dev, pgoff, 1); 1569 else 1570 ret = dax_memzero(iter, pos, length); 1571 dax_read_unlock(id); 1572 1573 if (ret < 0) 1574 return ret; 1575 1576 ret = iomap_iter_advance(iter, length); 1577 if (ret) 1578 return ret; 1579 } while ((length = iomap_length(iter)) > 0); 1580 1581 if (did_zero) 1582 *did_zero = true; 1583 return ret; 1584 } 1585 1586 int dax_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero, 1587 const struct iomap_ops *ops) 1588 { 1589 struct iomap_iter iter = { 1590 .inode = inode, 1591 .pos = pos, 1592 .len = len, 1593 .flags = IOMAP_DAX | IOMAP_ZERO, 1594 }; 1595 int ret; 1596 1597 while ((ret = iomap_iter(&iter, ops)) > 0) 1598 iter.status = dax_zero_iter(&iter, did_zero); 1599 return ret; 1600 } 1601 EXPORT_SYMBOL_GPL(dax_zero_range); 1602 1603 int dax_truncate_page(struct inode *inode, loff_t pos, bool *did_zero, 1604 const struct iomap_ops *ops) 1605 { 1606 unsigned int blocksize = i_blocksize(inode); 1607 unsigned int off = pos & (blocksize - 1); 1608 1609 /* Block boundary? Nothing to do */ 1610 if (!off) 1611 return 0; 1612 return dax_zero_range(inode, pos, blocksize - off, did_zero, ops); 1613 } 1614 EXPORT_SYMBOL_GPL(dax_truncate_page); 1615 1616 static int dax_iomap_iter(struct iomap_iter *iomi, struct iov_iter *iter) 1617 { 1618 const struct iomap *iomap = &iomi->iomap; 1619 const struct iomap *srcmap = iomap_iter_srcmap(iomi); 1620 loff_t length = iomap_length(iomi); 1621 loff_t pos = iomi->pos; 1622 struct dax_device *dax_dev = iomap->dax_dev; 1623 loff_t end = pos + length, done = 0; 1624 bool write = iov_iter_rw(iter) == WRITE; 1625 bool cow = write && iomap->flags & IOMAP_F_SHARED; 1626 ssize_t ret = 0; 1627 size_t xfer; 1628 int id; 1629 1630 if (!write) { 1631 end = min(end, i_size_read(iomi->inode)); 1632 if (pos >= end) 1633 return 0; 1634 1635 if (iomap->type == IOMAP_HOLE || iomap->type == IOMAP_UNWRITTEN) { 1636 done = iov_iter_zero(min(length, end - pos), iter); 1637 return iomap_iter_advance(iomi, done); 1638 } 1639 } 1640 1641 /* 1642 * In DAX mode, enforce either pure overwrites of written extents, or 1643 * writes to unwritten extents as part of a copy-on-write operation. 1644 */ 1645 if (WARN_ON_ONCE(iomap->type != IOMAP_MAPPED && 1646 !(iomap->flags & IOMAP_F_SHARED))) 1647 return -EIO; 1648 1649 /* 1650 * Write can allocate block for an area which has a hole page mapped 1651 * into page tables. We have to tear down these mappings so that data 1652 * written by write(2) is visible in mmap. 1653 */ 1654 if (iomap->flags & IOMAP_F_NEW || cow) { 1655 /* 1656 * Filesystem allows CoW on non-shared extents. The src extents 1657 * may have been mmapped with dirty mark before. To be able to 1658 * invalidate its dax entries, we need to clear the dirty mark 1659 * in advance. 
1660 */ 1661 if (cow) 1662 __dax_clear_dirty_range(iomi->inode->i_mapping, 1663 pos >> PAGE_SHIFT, 1664 (end - 1) >> PAGE_SHIFT); 1665 invalidate_inode_pages2_range(iomi->inode->i_mapping, 1666 pos >> PAGE_SHIFT, 1667 (end - 1) >> PAGE_SHIFT); 1668 } 1669 1670 id = dax_read_lock(); 1671 while ((pos = iomi->pos) < end) { 1672 unsigned offset = pos & (PAGE_SIZE - 1); 1673 const size_t size = ALIGN(length + offset, PAGE_SIZE); 1674 pgoff_t pgoff = dax_iomap_pgoff(iomap, pos); 1675 ssize_t map_len; 1676 bool recovery = false; 1677 void *kaddr; 1678 1679 if (fatal_signal_pending(current)) { 1680 ret = -EINTR; 1681 break; 1682 } 1683 1684 map_len = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size), 1685 DAX_ACCESS, &kaddr, NULL); 1686 if (map_len == -EHWPOISON && iov_iter_rw(iter) == WRITE) { 1687 map_len = dax_direct_access(dax_dev, pgoff, 1688 PHYS_PFN(size), DAX_RECOVERY_WRITE, 1689 &kaddr, NULL); 1690 if (map_len > 0) 1691 recovery = true; 1692 } 1693 if (map_len < 0) { 1694 ret = dax_mem2blk_err(map_len); 1695 break; 1696 } 1697 1698 if (cow) { 1699 ret = dax_iomap_copy_around(pos, length, PAGE_SIZE, 1700 srcmap, kaddr); 1701 if (ret) 1702 break; 1703 } 1704 1705 map_len = PFN_PHYS(map_len); 1706 kaddr += offset; 1707 map_len -= offset; 1708 if (map_len > end - pos) 1709 map_len = end - pos; 1710 1711 if (recovery) 1712 xfer = dax_recovery_write(dax_dev, pgoff, kaddr, 1713 map_len, iter); 1714 else if (write) 1715 xfer = dax_copy_from_iter(dax_dev, pgoff, kaddr, 1716 map_len, iter); 1717 else 1718 xfer = dax_copy_to_iter(dax_dev, pgoff, kaddr, 1719 map_len, iter); 1720 1721 ret = iomap_iter_advance(iomi, xfer); 1722 if (!ret && xfer == 0) 1723 ret = -EFAULT; 1724 if (xfer < map_len) 1725 break; 1726 length = iomap_length(iomi); 1727 } 1728 dax_read_unlock(id); 1729 1730 return ret; 1731 } 1732 1733 /** 1734 * dax_iomap_rw - Perform I/O to a DAX file 1735 * @iocb: The control block for this I/O 1736 * @iter: The addresses to do I/O from or to 1737 * @ops: iomap ops passed from the file system 1738 * 1739 * This function performs read and write operations to directly mapped 1740 * persistent memory. The callers needs to take care of read/write exclusion 1741 * and evicting any page cache pages in the region under I/O. 1742 */ 1743 ssize_t 1744 dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter, 1745 const struct iomap_ops *ops) 1746 { 1747 struct iomap_iter iomi = { 1748 .inode = iocb->ki_filp->f_mapping->host, 1749 .pos = iocb->ki_pos, 1750 .len = iov_iter_count(iter), 1751 .flags = IOMAP_DAX, 1752 }; 1753 loff_t done = 0; 1754 int ret; 1755 1756 if (WARN_ON_ONCE(iocb->ki_flags & IOCB_ATOMIC)) 1757 return -EIO; 1758 1759 if (!iomi.len) 1760 return 0; 1761 1762 if (iov_iter_rw(iter) == WRITE) { 1763 lockdep_assert_held_write(&iomi.inode->i_rwsem); 1764 iomi.flags |= IOMAP_WRITE; 1765 } else if (!sb_rdonly(iomi.inode->i_sb)) { 1766 lockdep_assert_held(&iomi.inode->i_rwsem); 1767 } 1768 1769 if (iocb->ki_flags & IOCB_NOWAIT) 1770 iomi.flags |= IOMAP_NOWAIT; 1771 1772 while ((ret = iomap_iter(&iomi, ops)) > 0) 1773 iomi.status = dax_iomap_iter(&iomi, iter); 1774 1775 done = iomi.pos - iocb->ki_pos; 1776 iocb->ki_pos = iomi.pos; 1777 return done ? 

/**
 * dax_iomap_rw - Perform I/O to a DAX file
 * @iocb:	The control block for this I/O
 * @iter:	The addresses to do I/O from or to
 * @ops:	iomap ops passed from the file system
 *
 * This function performs read and write operations to directly mapped
 * persistent memory.  The caller needs to take care of read/write exclusion
 * and evicting any page cache pages in the region under I/O.
 */
ssize_t
dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter,
		const struct iomap_ops *ops)
{
	struct iomap_iter iomi = {
		.inode		= iocb->ki_filp->f_mapping->host,
		.pos		= iocb->ki_pos,
		.len		= iov_iter_count(iter),
		.flags		= IOMAP_DAX,
	};
	loff_t done = 0;
	int ret;

	if (WARN_ON_ONCE(iocb->ki_flags & IOCB_ATOMIC))
		return -EIO;

	if (!iomi.len)
		return 0;

	if (iov_iter_rw(iter) == WRITE) {
		lockdep_assert_held_write(&iomi.inode->i_rwsem);
		iomi.flags |= IOMAP_WRITE;
	} else if (!sb_rdonly(iomi.inode->i_sb)) {
		lockdep_assert_held(&iomi.inode->i_rwsem);
	}

	if (iocb->ki_flags & IOCB_NOWAIT)
		iomi.flags |= IOMAP_NOWAIT;

	while ((ret = iomap_iter(&iomi, ops)) > 0)
		iomi.status = dax_iomap_iter(&iomi, iter);

	done = iomi.pos - iocb->ki_pos;
	iocb->ki_pos = iomi.pos;
	return done ? done : ret;
}
EXPORT_SYMBOL_GPL(dax_iomap_rw);

static vm_fault_t dax_fault_return(int error)
{
	if (error == 0)
		return VM_FAULT_NOPAGE;
	return vmf_error(error);
}

/*
 * When handling a synchronous page fault and the inode needs an fsync, we
 * can insert the PTE/PMD into page tables only after that fsync happened.
 * Skip insertion for now and return the pfn so that the caller can insert
 * it after the fsync is done.
 */
static vm_fault_t dax_fault_synchronous_pfnp(unsigned long *pfnp,
					     unsigned long pfn)
{
	if (WARN_ON_ONCE(!pfnp))
		return VM_FAULT_SIGBUS;
	*pfnp = pfn;
	return VM_FAULT_NEEDDSYNC;
}

static vm_fault_t dax_fault_cow_page(struct vm_fault *vmf,
		const struct iomap_iter *iter)
{
	vm_fault_t ret;
	int error = 0;

	switch (iter->iomap.type) {
	case IOMAP_HOLE:
	case IOMAP_UNWRITTEN:
		clear_user_highpage(vmf->cow_page, vmf->address);
		break;
	case IOMAP_MAPPED:
		error = copy_cow_page_dax(vmf, iter);
		break;
	default:
		WARN_ON_ONCE(1);
		error = -EIO;
		break;
	}

	if (error)
		return dax_fault_return(error);

	__SetPageUptodate(vmf->cow_page);
	ret = finish_fault(vmf);
	if (!ret)
		return VM_FAULT_DONE_COW;
	return ret;
}
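
/*
 * Usage sketch (hypothetical filesystem code, not part of this file): a
 * fault handler for a MAP_SYNC vma finishes a synchronous write fault
 * in two steps around its metadata commit:
 *
 *	ret = dax_iomap_fault(vmf, order, &pfn, NULL, &my_fs_iomap_ops);
 *	if (ret & VM_FAULT_NEEDDSYNC) {
 *		// fsync the metadata covering the faulted range, then:
 *		ret = dax_finish_sync_fault(vmf, order, pfn);
 *	}
 *
 * dax_finish_sync_fault() (not shown in this excerpt) re-inserts the
 * now-durable pfn via dax_insert_pfn_mkwrite() below.
 */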
VM_FAULT_FALLBACK : dax_fault_return(err); 1876 1877 *entry = dax_insert_entry(xas, vmf, iter, *entry, pfn, entry_flags); 1878 1879 if (write && iomap->flags & IOMAP_F_SHARED) { 1880 err = dax_iomap_copy_around(pos, size, size, srcmap, kaddr); 1881 if (err) 1882 return dax_fault_return(err); 1883 } 1884 1885 folio = dax_to_folio(*entry); 1886 if (dax_fault_is_synchronous(iter, vmf->vma)) 1887 return dax_fault_synchronous_pfnp(pfnp, pfn); 1888 1889 folio_ref_inc(folio); 1890 if (pmd) 1891 ret = vmf_insert_folio_pmd(vmf, pfn_folio(pfn), write); 1892 else 1893 ret = vmf_insert_page_mkwrite(vmf, pfn_to_page(pfn), write); 1894 folio_put(folio); 1895 1896 return ret; 1897 } 1898 1899 static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, unsigned long *pfnp, 1900 int *iomap_errp, const struct iomap_ops *ops) 1901 { 1902 struct address_space *mapping = vmf->vma->vm_file->f_mapping; 1903 XA_STATE(xas, &mapping->i_pages, vmf->pgoff); 1904 struct iomap_iter iter = { 1905 .inode = mapping->host, 1906 .pos = (loff_t)vmf->pgoff << PAGE_SHIFT, 1907 .len = PAGE_SIZE, 1908 .flags = IOMAP_DAX | IOMAP_FAULT, 1909 }; 1910 vm_fault_t ret = 0; 1911 void *entry; 1912 int error; 1913 1914 trace_dax_pte_fault(iter.inode, vmf, ret); 1915 /* 1916 * Check whether offset isn't beyond end of file now. Caller is supposed 1917 * to hold locks serializing us with truncate / punch hole so this is 1918 * a reliable test. 1919 */ 1920 if (iter.pos >= i_size_read(iter.inode)) { 1921 ret = VM_FAULT_SIGBUS; 1922 goto out; 1923 } 1924 1925 if ((vmf->flags & FAULT_FLAG_WRITE) && !vmf->cow_page) 1926 iter.flags |= IOMAP_WRITE; 1927 1928 entry = grab_mapping_entry(&xas, mapping, 0); 1929 if (xa_is_internal(entry)) { 1930 ret = xa_to_internal(entry); 1931 goto out; 1932 } 1933 1934 /* 1935 * It is possible, particularly with mixed reads & writes to private 1936 * mappings, that we have raced with a PMD fault that overlaps with 1937 * the PTE we need to set up. If so just return and the fault will be 1938 * retried. 1939 */ 1940 if (pmd_trans_huge(*vmf->pmd)) { 1941 ret = VM_FAULT_NOPAGE; 1942 goto unlock_entry; 1943 } 1944 1945 while ((error = iomap_iter(&iter, ops)) > 0) { 1946 if (WARN_ON_ONCE(iomap_length(&iter) < PAGE_SIZE)) { 1947 iter.status = -EIO; /* fs corruption? */ 1948 continue; 1949 } 1950 1951 ret = dax_fault_iter(vmf, &iter, pfnp, &xas, &entry, false); 1952 if (ret != VM_FAULT_SIGBUS && 1953 (iter.iomap.flags & IOMAP_F_NEW)) { 1954 count_vm_event(PGMAJFAULT); 1955 count_memcg_event_mm(vmf->vma->vm_mm, PGMAJFAULT); 1956 ret |= VM_FAULT_MAJOR; 1957 } 1958 1959 if (!(ret & VM_FAULT_ERROR)) 1960 iter.status = iomap_iter_advance(&iter, PAGE_SIZE); 1961 } 1962 1963 if (iomap_errp) 1964 *iomap_errp = error; 1965 if (!ret && error) 1966 ret = dax_fault_return(error); 1967 1968 unlock_entry: 1969 dax_unlock_entry(&xas, entry); 1970 out: 1971 trace_dax_pte_fault_done(iter.inode, vmf, ret); 1972 return ret; 1973 } 1974 1975 #ifdef CONFIG_FS_DAX_PMD 1976 static bool dax_fault_check_fallback(struct vm_fault *vmf, struct xa_state *xas, 1977 pgoff_t max_pgoff) 1978 { 1979 unsigned long pmd_addr = vmf->address & PMD_MASK; 1980 bool write = vmf->flags & FAULT_FLAG_WRITE; 1981 1982 /* 1983 * Make sure that the faulting address's PMD offset (color) matches 1984 * the PMD offset from the start of the file. This is necessary so 1985 * that a PMD range in the page table overlaps exactly with a PMD 1986 * range in the page cache. 
static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, unsigned long *pfnp,
                const struct iomap_ops *ops)
{
        struct address_space *mapping = vmf->vma->vm_file->f_mapping;
        XA_STATE_ORDER(xas, &mapping->i_pages, vmf->pgoff, PMD_ORDER);
        struct iomap_iter iter = {
                .inode          = mapping->host,
                .len            = PMD_SIZE,
                .flags          = IOMAP_DAX | IOMAP_FAULT,
        };
        vm_fault_t ret = VM_FAULT_FALLBACK;
        pgoff_t max_pgoff;
        void *entry;

        if (vmf->flags & FAULT_FLAG_WRITE)
                iter.flags |= IOMAP_WRITE;

        /*
         * Check that the offset isn't beyond the end of the file now.
         * The caller is supposed to hold locks serializing us with
         * truncate / punch hole, so this is a reliable test.
         */
        max_pgoff = DIV_ROUND_UP(i_size_read(iter.inode), PAGE_SIZE);

        trace_dax_pmd_fault(iter.inode, vmf, max_pgoff, 0);

        if (xas.xa_index >= max_pgoff) {
                ret = VM_FAULT_SIGBUS;
                goto out;
        }

        if (dax_fault_check_fallback(vmf, &xas, max_pgoff))
                goto fallback;

        /*
         * grab_mapping_entry() will make sure we get an empty PMD entry,
         * a zero PMD entry or a DAX PMD.  If it can't (because a PTE
         * entry is already in the array, for instance), it will return
         * VM_FAULT_FALLBACK.
         */
        entry = grab_mapping_entry(&xas, mapping, PMD_ORDER);
        if (xa_is_internal(entry)) {
                ret = xa_to_internal(entry);
                goto fallback;
        }

        /*
         * It is possible, particularly with mixed reads & writes to private
         * mappings, that we have raced with a PTE fault that overlaps with
         * the PMD we need to set up.  If so just return and the fault will
         * be retried.
         */
        if (!pmd_none(*vmf->pmd) && !pmd_trans_huge(*vmf->pmd)) {
                ret = 0;
                goto unlock_entry;
        }

        iter.pos = (loff_t)xas.xa_index << PAGE_SHIFT;
        while (iomap_iter(&iter, ops) > 0) {
                if (iomap_length(&iter) < PMD_SIZE)
                        continue; /* actually breaks out of the loop */

                ret = dax_fault_iter(vmf, &iter, pfnp, &xas, &entry, true);
                if (ret != VM_FAULT_FALLBACK)
                        iter.status = iomap_iter_advance(&iter, PMD_SIZE);
        }

unlock_entry:
        dax_unlock_entry(&xas, entry);
fallback:
        if (ret == VM_FAULT_FALLBACK) {
                split_huge_pmd(vmf->vma, vmf->pmd, vmf->address);
                count_vm_event(THP_FAULT_FALLBACK);
        }
out:
        trace_dax_pmd_fault_done(iter.inode, vmf, max_pgoff, ret);
        return ret;
}
#else
static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, unsigned long *pfnp,
                const struct iomap_ops *ops)
{
        return VM_FAULT_FALLBACK;
}
#endif /* CONFIG_FS_DAX_PMD */

/**
 * dax_iomap_fault - handle a page fault on a DAX file
 * @vmf: The description of the fault
 * @order: Order of the page to fault in
 * @pfnp: PFN to insert for synchronous faults if fsync is required
 * @iomap_errp: Storage for the detailed error code in case of error
 * @ops: Iomap ops passed from the file system
 *
 * When a page fault occurs, filesystems may call this helper in
 * their fault handler for DAX files. dax_iomap_fault() assumes the caller
 * has done all the necessary locking for the page fault to proceed
 * successfully.
 */
vm_fault_t dax_iomap_fault(struct vm_fault *vmf, unsigned int order,
                unsigned long *pfnp, int *iomap_errp,
                const struct iomap_ops *ops)
{
        if (order == 0)
                return dax_iomap_pte_fault(vmf, pfnp, iomap_errp, ops);
        else if (order == PMD_ORDER)
                return dax_iomap_pmd_fault(vmf, pfnp, ops);
        else
                return VM_FAULT_FALLBACK;
}
EXPORT_SYMBOL_GPL(dax_iomap_fault);
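
/*
 * A minimal sketch of a caller, loosely modelled on how ext4 and XFS wire
 * this up in their fault handlers.  The iomap ops, lock helpers and names
 * here are illustrative assumptions, not part of this file:
 *
 *      static vm_fault_t example_dax_huge_fault(struct vm_fault *vmf,
 *                                               unsigned int order)
 *      {
 *              unsigned long pfn;
 *              vm_fault_t ret;
 *
 *              example_ilock_shared(vmf);      // serialize vs. truncate
 *              ret = dax_iomap_fault(vmf, order, &pfn, NULL,
 *                                    &example_iomap_ops);
 *              example_iunlock_shared(vmf);
 *
 *              // Synchronous (MAP_SYNC) write faults are only finished
 *              // once the metadata has been made durable:
 *              if (ret & VM_FAULT_NEEDDSYNC)
 *                      ret = dax_finish_sync_fault(vmf, order, pfn);
 *              return ret;
 *      }
 */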
/*
 * dax_insert_pfn_mkwrite - insert PTE or PMD entry into page tables
 * @vmf: The description of the fault
 * @pfn: PFN to insert
 * @order: Order of entry to insert.
 *
 * This function inserts a writeable PTE or PMD entry into the page tables
 * for an mmapped DAX file.  It also marks the page cache entry as dirty.
 */
static vm_fault_t dax_insert_pfn_mkwrite(struct vm_fault *vmf,
                unsigned long pfn, unsigned int order)
{
        struct address_space *mapping = vmf->vma->vm_file->f_mapping;
        XA_STATE_ORDER(xas, &mapping->i_pages, vmf->pgoff, order);
        struct folio *folio;
        void *entry;
        vm_fault_t ret;

        xas_lock_irq(&xas);
        entry = get_next_unlocked_entry(&xas, order);
        /* Did we race with someone splitting the entry or similar? */
        if (!entry || dax_is_conflict(entry) ||
            (order == 0 && !dax_is_pte_entry(entry))) {
                put_unlocked_entry(&xas, entry, WAKE_NEXT);
                xas_unlock_irq(&xas);
                trace_dax_insert_pfn_mkwrite_no_entry(mapping->host, vmf,
                                                      VM_FAULT_NOPAGE);
                return VM_FAULT_NOPAGE;
        }
        xas_set_mark(&xas, PAGECACHE_TAG_DIRTY);
        dax_lock_entry(&xas, entry);
        xas_unlock_irq(&xas);
        folio = pfn_folio(pfn);
        folio_ref_inc(folio);
        if (order == 0)
                ret = vmf_insert_page_mkwrite(vmf, &folio->page, true);
#ifdef CONFIG_FS_DAX_PMD
        else if (order == PMD_ORDER)
                ret = vmf_insert_folio_pmd(vmf, folio, FAULT_FLAG_WRITE);
#endif
        else
                ret = VM_FAULT_FALLBACK;
        folio_put(folio);
        dax_unlock_entry(&xas, entry);
        trace_dax_insert_pfn_mkwrite(mapping->host, vmf, ret);
        return ret;
}

/**
 * dax_finish_sync_fault - finish synchronous page fault
 * @vmf: The description of the fault
 * @order: Order of entry to be inserted
 * @pfn: PFN to insert
 *
 * This function ensures that the file range touched by the page fault is
 * stored persistently on the media and then inserts the appropriate page
 * table entry.
 */
vm_fault_t dax_finish_sync_fault(struct vm_fault *vmf, unsigned int order,
                unsigned long pfn)
{
        int err;
        loff_t start = ((loff_t)vmf->pgoff) << PAGE_SHIFT;
        size_t len = PAGE_SIZE << order;

        err = vfs_fsync_range(vmf->vma->vm_file, start, start + len - 1, 1);
        if (err)
                return VM_FAULT_SIGBUS;
        return dax_insert_pfn_mkwrite(vmf, pfn, order);
}
EXPORT_SYMBOL_GPL(dax_finish_sync_fault);
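
/*
 * The MAP_SYNC round trip, for reference: a write fault on a MAP_SYNC
 * mapping returns VM_FAULT_NEEDDSYNC from dax_iomap_fault() together with
 * the pfn, the filesystem makes the allocation durable via the
 * vfs_fsync_range() call above (datasync semantics suffice, hence the
 * final argument of 1), and only then is the writeable PTE/PMD inserted.
 * This ordering is what lets userspace persist stores with CPU cache
 * flushes alone once the page is writeably mapped.
 */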
static int dax_range_compare_iter(struct iomap_iter *it_src,
                struct iomap_iter *it_dest, u64 len, bool *same)
{
        const struct iomap *smap = &it_src->iomap;
        const struct iomap *dmap = &it_dest->iomap;
        loff_t pos1 = it_src->pos, pos2 = it_dest->pos;
        void *saddr, *daddr;
        int id, ret;

        len = min(len, min(smap->length, dmap->length));

        if (smap->type == IOMAP_HOLE && dmap->type == IOMAP_HOLE) {
                *same = true;
                goto advance;
        }

        if (smap->type == IOMAP_HOLE || dmap->type == IOMAP_HOLE) {
                *same = false;
                return 0;
        }

        id = dax_read_lock();
        ret = dax_iomap_direct_access(smap, pos1, ALIGN(pos1 + len, PAGE_SIZE),
                                      &saddr, NULL);
        if (ret < 0)
                goto out_unlock;

        ret = dax_iomap_direct_access(dmap, pos2, ALIGN(pos2 + len, PAGE_SIZE),
                                      &daddr, NULL);
        if (ret < 0)
                goto out_unlock;

        *same = !memcmp(saddr, daddr, len);
        if (!*same)
                len = 0;
        dax_read_unlock(id);

advance:
        ret = iomap_iter_advance(it_src, len);
        if (!ret)
                ret = iomap_iter_advance(it_dest, len);
        return ret;

out_unlock:
        dax_read_unlock(id);
        return -EIO;
}

int dax_dedupe_file_range_compare(struct inode *src, loff_t srcoff,
                struct inode *dst, loff_t dstoff, loff_t len, bool *same,
                const struct iomap_ops *ops)
{
        struct iomap_iter src_iter = {
                .inode          = src,
                .pos            = srcoff,
                .len            = len,
                .flags          = IOMAP_DAX,
        };
        struct iomap_iter dst_iter = {
                .inode          = dst,
                .pos            = dstoff,
                .len            = len,
                .flags          = IOMAP_DAX,
        };
        int ret, status;

        while ((ret = iomap_iter(&src_iter, ops)) > 0 &&
               (ret = iomap_iter(&dst_iter, ops)) > 0) {
                status = dax_range_compare_iter(&src_iter, &dst_iter,
                                min(src_iter.len, dst_iter.len), same);
                if (status < 0)
                        return status;
                src_iter.status = dst_iter.status = status;
        }
        return ret;
}
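
/*
 * For reference, filesystems do not normally call the compare helper
 * above directly: dax_remap_file_range_prep() below hands the iomap ops
 * to __generic_remap_file_range_prep(), which is expected to call back
 * into dax_dedupe_file_range_compare() for FIDEDUPERANGE requests on DAX
 * inodes, in place of the page-cache based
 * vfs_dedupe_file_range_compare().
 */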
int dax_remap_file_range_prep(struct file *file_in, loff_t pos_in,
                              struct file *file_out, loff_t pos_out,
                              loff_t *len, unsigned int remap_flags,
                              const struct iomap_ops *ops)
{
        return __generic_remap_file_range_prep(file_in, pos_in, file_out,
                                               pos_out, len, remap_flags, ops);
}
EXPORT_SYMBOL_GPL(dax_remap_file_range_prep);
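
/*
 * A minimal sketch of a filesystem wiring this into ->remap_file_range.
 * The function and ops names and the locking are illustrative assumptions,
 * not part of this file:
 *
 *      static loff_t example_remap_file_range(struct file *file_in,
 *                      loff_t pos_in, struct file *file_out, loff_t pos_out,
 *                      loff_t len, unsigned int remap_flags)
 *      {
 *              int ret;
 *
 *              if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY))
 *                      return -EINVAL;
 *
 *              example_lock_two_inodes(file_in, file_out);
 *              ret = dax_remap_file_range_prep(file_in, pos_in, file_out,
 *                              pos_out, &len, remap_flags,
 *                              &example_read_iomap_ops);
 *              if (ret || len == 0)
 *                      goto out_unlock;
 *              ret = example_do_remap_blocks(file_in, pos_in, file_out,
 *                                            pos_out, len);
 *      out_unlock:
 *              example_unlock_two_inodes(file_in, file_out);
 *              return ret < 0 ? ret : len;
 *      }
 */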