1 /* 2 * fs/dax.c - Direct Access filesystem code 3 * Copyright (c) 2013-2014 Intel Corporation 4 * Author: Matthew Wilcox <matthew.r.wilcox@intel.com> 5 * Author: Ross Zwisler <ross.zwisler@linux.intel.com> 6 * 7 * This program is free software; you can redistribute it and/or modify it 8 * under the terms and conditions of the GNU General Public License, 9 * version 2, as published by the Free Software Foundation. 10 * 11 * This program is distributed in the hope it will be useful, but WITHOUT 12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for 14 * more details. 15 */ 16 17 #include <linux/atomic.h> 18 #include <linux/blkdev.h> 19 #include <linux/buffer_head.h> 20 #include <linux/dax.h> 21 #include <linux/fs.h> 22 #include <linux/genhd.h> 23 #include <linux/highmem.h> 24 #include <linux/memcontrol.h> 25 #include <linux/mm.h> 26 #include <linux/mutex.h> 27 #include <linux/pagevec.h> 28 #include <linux/pmem.h> 29 #include <linux/sched.h> 30 #include <linux/sched/signal.h> 31 #include <linux/uio.h> 32 #include <linux/vmstat.h> 33 #include <linux/pfn_t.h> 34 #include <linux/sizes.h> 35 #include <linux/mmu_notifier.h> 36 #include <linux/iomap.h> 37 #include "internal.h" 38 39 #define CREATE_TRACE_POINTS 40 #include <trace/events/fs_dax.h> 41 42 /* We choose 4096 entries - same as per-zone page wait tables */ 43 #define DAX_WAIT_TABLE_BITS 12 44 #define DAX_WAIT_TABLE_ENTRIES (1 << DAX_WAIT_TABLE_BITS) 45 46 static wait_queue_head_t wait_table[DAX_WAIT_TABLE_ENTRIES]; 47 48 static int __init init_dax_wait_table(void) 49 { 50 int i; 51 52 for (i = 0; i < DAX_WAIT_TABLE_ENTRIES; i++) 53 init_waitqueue_head(wait_table + i); 54 return 0; 55 } 56 fs_initcall(init_dax_wait_table); 57 58 static int dax_is_pmd_entry(void *entry) 59 { 60 return (unsigned long)entry & RADIX_DAX_PMD; 61 } 62 63 static int dax_is_pte_entry(void *entry) 64 { 65 return !((unsigned long)entry & RADIX_DAX_PMD); 66 } 67 68 static int dax_is_zero_entry(void *entry) 69 { 70 return (unsigned long)entry & RADIX_DAX_HZP; 71 } 72 73 static int dax_is_empty_entry(void *entry) 74 { 75 return (unsigned long)entry & RADIX_DAX_EMPTY; 76 } 77 78 /* 79 * DAX radix tree locking 80 */ 81 struct exceptional_entry_key { 82 struct address_space *mapping; 83 pgoff_t entry_start; 84 }; 85 86 struct wait_exceptional_entry_queue { 87 wait_queue_t wait; 88 struct exceptional_entry_key key; 89 }; 90 91 static wait_queue_head_t *dax_entry_waitqueue(struct address_space *mapping, 92 pgoff_t index, void *entry, struct exceptional_entry_key *key) 93 { 94 unsigned long hash; 95 96 /* 97 * If 'entry' is a PMD, align the 'index' that we use for the wait 98 * queue to the start of that PMD. This ensures that all offsets in 99 * the range covered by the PMD map to the same bit lock. 100 */ 101 if (dax_is_pmd_entry(entry)) 102 index &= ~((1UL << (PMD_SHIFT - PAGE_SHIFT)) - 1); 103 104 key->mapping = mapping; 105 key->entry_start = index; 106 107 hash = hash_long((unsigned long)mapping ^ index, DAX_WAIT_TABLE_BITS); 108 return wait_table + hash; 109 } 110 111 static int wake_exceptional_entry_func(wait_queue_t *wait, unsigned int mode, 112 int sync, void *keyp) 113 { 114 struct exceptional_entry_key *key = keyp; 115 struct wait_exceptional_entry_queue *ewait = 116 container_of(wait, struct wait_exceptional_entry_queue, wait); 117 118 if (key->mapping != ewait->key.mapping || 119 key->entry_start != ewait->key.entry_start) 120 return 0; 121 return autoremove_wake_function(wait, mode, sync, NULL); 122 } 123 124 /* 125 * Check whether the given slot is locked. The function must be called with 126 * mapping->tree_lock held 127 */ 128 static inline int slot_locked(struct address_space *mapping, void **slot) 129 { 130 unsigned long entry = (unsigned long) 131 radix_tree_deref_slot_protected(slot, &mapping->tree_lock); 132 return entry & RADIX_DAX_ENTRY_LOCK; 133 } 134 135 /* 136 * Mark the given slot is locked. The function must be called with 137 * mapping->tree_lock held 138 */ 139 static inline void *lock_slot(struct address_space *mapping, void **slot) 140 { 141 unsigned long entry = (unsigned long) 142 radix_tree_deref_slot_protected(slot, &mapping->tree_lock); 143 144 entry |= RADIX_DAX_ENTRY_LOCK; 145 radix_tree_replace_slot(&mapping->page_tree, slot, (void *)entry); 146 return (void *)entry; 147 } 148 149 /* 150 * Mark the given slot is unlocked. The function must be called with 151 * mapping->tree_lock held 152 */ 153 static inline void *unlock_slot(struct address_space *mapping, void **slot) 154 { 155 unsigned long entry = (unsigned long) 156 radix_tree_deref_slot_protected(slot, &mapping->tree_lock); 157 158 entry &= ~(unsigned long)RADIX_DAX_ENTRY_LOCK; 159 radix_tree_replace_slot(&mapping->page_tree, slot, (void *)entry); 160 return (void *)entry; 161 } 162 163 /* 164 * Lookup entry in radix tree, wait for it to become unlocked if it is 165 * exceptional entry and return it. The caller must call 166 * put_unlocked_mapping_entry() when he decided not to lock the entry or 167 * put_locked_mapping_entry() when he locked the entry and now wants to 168 * unlock it. 169 * 170 * The function must be called with mapping->tree_lock held. 171 */ 172 static void *get_unlocked_mapping_entry(struct address_space *mapping, 173 pgoff_t index, void ***slotp) 174 { 175 void *entry, **slot; 176 struct wait_exceptional_entry_queue ewait; 177 wait_queue_head_t *wq; 178 179 init_wait(&ewait.wait); 180 ewait.wait.func = wake_exceptional_entry_func; 181 182 for (;;) { 183 entry = __radix_tree_lookup(&mapping->page_tree, index, NULL, 184 &slot); 185 if (!entry || !radix_tree_exceptional_entry(entry) || 186 !slot_locked(mapping, slot)) { 187 if (slotp) 188 *slotp = slot; 189 return entry; 190 } 191 192 wq = dax_entry_waitqueue(mapping, index, entry, &ewait.key); 193 prepare_to_wait_exclusive(wq, &ewait.wait, 194 TASK_UNINTERRUPTIBLE); 195 spin_unlock_irq(&mapping->tree_lock); 196 schedule(); 197 finish_wait(wq, &ewait.wait); 198 spin_lock_irq(&mapping->tree_lock); 199 } 200 } 201 202 static void dax_unlock_mapping_entry(struct address_space *mapping, 203 pgoff_t index) 204 { 205 void *entry, **slot; 206 207 spin_lock_irq(&mapping->tree_lock); 208 entry = __radix_tree_lookup(&mapping->page_tree, index, NULL, &slot); 209 if (WARN_ON_ONCE(!entry || !radix_tree_exceptional_entry(entry) || 210 !slot_locked(mapping, slot))) { 211 spin_unlock_irq(&mapping->tree_lock); 212 return; 213 } 214 unlock_slot(mapping, slot); 215 spin_unlock_irq(&mapping->tree_lock); 216 dax_wake_mapping_entry_waiter(mapping, index, entry, false); 217 } 218 219 static void put_locked_mapping_entry(struct address_space *mapping, 220 pgoff_t index, void *entry) 221 { 222 if (!radix_tree_exceptional_entry(entry)) { 223 unlock_page(entry); 224 put_page(entry); 225 } else { 226 dax_unlock_mapping_entry(mapping, index); 227 } 228 } 229 230 /* 231 * Called when we are done with radix tree entry we looked up via 232 * get_unlocked_mapping_entry() and which we didn't lock in the end. 233 */ 234 static void put_unlocked_mapping_entry(struct address_space *mapping, 235 pgoff_t index, void *entry) 236 { 237 if (!radix_tree_exceptional_entry(entry)) 238 return; 239 240 /* We have to wake up next waiter for the radix tree entry lock */ 241 dax_wake_mapping_entry_waiter(mapping, index, entry, false); 242 } 243 244 /* 245 * Find radix tree entry at given index. If it points to a page, return with 246 * the page locked. If it points to the exceptional entry, return with the 247 * radix tree entry locked. If the radix tree doesn't contain given index, 248 * create empty exceptional entry for the index and return with it locked. 249 * 250 * When requesting an entry with size RADIX_DAX_PMD, grab_mapping_entry() will 251 * either return that locked entry or will return an error. This error will 252 * happen if there are any 4k entries (either zero pages or DAX entries) 253 * within the 2MiB range that we are requesting. 254 * 255 * We always favor 4k entries over 2MiB entries. There isn't a flow where we 256 * evict 4k entries in order to 'upgrade' them to a 2MiB entry. A 2MiB 257 * insertion will fail if it finds any 4k entries already in the tree, and a 258 * 4k insertion will cause an existing 2MiB entry to be unmapped and 259 * downgraded to 4k entries. This happens for both 2MiB huge zero pages as 260 * well as 2MiB empty entries. 261 * 262 * The exception to this downgrade path is for 2MiB DAX PMD entries that have 263 * real storage backing them. We will leave these real 2MiB DAX entries in 264 * the tree, and PTE writes will simply dirty the entire 2MiB DAX entry. 265 * 266 * Note: Unlike filemap_fault() we don't honor FAULT_FLAG_RETRY flags. For 267 * persistent memory the benefit is doubtful. We can add that later if we can 268 * show it helps. 269 */ 270 static void *grab_mapping_entry(struct address_space *mapping, pgoff_t index, 271 unsigned long size_flag) 272 { 273 bool pmd_downgrade = false; /* splitting 2MiB entry into 4k entries? */ 274 void *entry, **slot; 275 276 restart: 277 spin_lock_irq(&mapping->tree_lock); 278 entry = get_unlocked_mapping_entry(mapping, index, &slot); 279 280 if (entry) { 281 if (size_flag & RADIX_DAX_PMD) { 282 if (!radix_tree_exceptional_entry(entry) || 283 dax_is_pte_entry(entry)) { 284 put_unlocked_mapping_entry(mapping, index, 285 entry); 286 entry = ERR_PTR(-EEXIST); 287 goto out_unlock; 288 } 289 } else { /* trying to grab a PTE entry */ 290 if (radix_tree_exceptional_entry(entry) && 291 dax_is_pmd_entry(entry) && 292 (dax_is_zero_entry(entry) || 293 dax_is_empty_entry(entry))) { 294 pmd_downgrade = true; 295 } 296 } 297 } 298 299 /* No entry for given index? Make sure radix tree is big enough. */ 300 if (!entry || pmd_downgrade) { 301 int err; 302 303 if (pmd_downgrade) { 304 /* 305 * Make sure 'entry' remains valid while we drop 306 * mapping->tree_lock. 307 */ 308 entry = lock_slot(mapping, slot); 309 } 310 311 spin_unlock_irq(&mapping->tree_lock); 312 /* 313 * Besides huge zero pages the only other thing that gets 314 * downgraded are empty entries which don't need to be 315 * unmapped. 316 */ 317 if (pmd_downgrade && dax_is_zero_entry(entry)) 318 unmap_mapping_range(mapping, 319 (index << PAGE_SHIFT) & PMD_MASK, PMD_SIZE, 0); 320 321 err = radix_tree_preload( 322 mapping_gfp_mask(mapping) & ~__GFP_HIGHMEM); 323 if (err) { 324 if (pmd_downgrade) 325 put_locked_mapping_entry(mapping, index, entry); 326 return ERR_PTR(err); 327 } 328 spin_lock_irq(&mapping->tree_lock); 329 330 if (!entry) { 331 /* 332 * We needed to drop the page_tree lock while calling 333 * radix_tree_preload() and we didn't have an entry to 334 * lock. See if another thread inserted an entry at 335 * our index during this time. 336 */ 337 entry = __radix_tree_lookup(&mapping->page_tree, index, 338 NULL, &slot); 339 if (entry) { 340 radix_tree_preload_end(); 341 spin_unlock_irq(&mapping->tree_lock); 342 goto restart; 343 } 344 } 345 346 if (pmd_downgrade) { 347 radix_tree_delete(&mapping->page_tree, index); 348 mapping->nrexceptional--; 349 dax_wake_mapping_entry_waiter(mapping, index, entry, 350 true); 351 } 352 353 entry = dax_radix_locked_entry(0, size_flag | RADIX_DAX_EMPTY); 354 355 err = __radix_tree_insert(&mapping->page_tree, index, 356 dax_radix_order(entry), entry); 357 radix_tree_preload_end(); 358 if (err) { 359 spin_unlock_irq(&mapping->tree_lock); 360 /* 361 * Our insertion of a DAX entry failed, most likely 362 * because we were inserting a PMD entry and it 363 * collided with a PTE sized entry at a different 364 * index in the PMD range. We haven't inserted 365 * anything into the radix tree and have no waiters to 366 * wake. 367 */ 368 return ERR_PTR(err); 369 } 370 /* Good, we have inserted empty locked entry into the tree. */ 371 mapping->nrexceptional++; 372 spin_unlock_irq(&mapping->tree_lock); 373 return entry; 374 } 375 /* Normal page in radix tree? */ 376 if (!radix_tree_exceptional_entry(entry)) { 377 struct page *page = entry; 378 379 get_page(page); 380 spin_unlock_irq(&mapping->tree_lock); 381 lock_page(page); 382 /* Page got truncated? Retry... */ 383 if (unlikely(page->mapping != mapping)) { 384 unlock_page(page); 385 put_page(page); 386 goto restart; 387 } 388 return page; 389 } 390 entry = lock_slot(mapping, slot); 391 out_unlock: 392 spin_unlock_irq(&mapping->tree_lock); 393 return entry; 394 } 395 396 /* 397 * We do not necessarily hold the mapping->tree_lock when we call this 398 * function so it is possible that 'entry' is no longer a valid item in the 399 * radix tree. This is okay because all we really need to do is to find the 400 * correct waitqueue where tasks might be waiting for that old 'entry' and 401 * wake them. 402 */ 403 void dax_wake_mapping_entry_waiter(struct address_space *mapping, 404 pgoff_t index, void *entry, bool wake_all) 405 { 406 struct exceptional_entry_key key; 407 wait_queue_head_t *wq; 408 409 wq = dax_entry_waitqueue(mapping, index, entry, &key); 410 411 /* 412 * Checking for locked entry and prepare_to_wait_exclusive() happens 413 * under mapping->tree_lock, ditto for entry handling in our callers. 414 * So at this point all tasks that could have seen our entry locked 415 * must be in the waitqueue and the following check will see them. 416 */ 417 if (waitqueue_active(wq)) 418 __wake_up(wq, TASK_NORMAL, wake_all ? 0 : 1, &key); 419 } 420 421 static int __dax_invalidate_mapping_entry(struct address_space *mapping, 422 pgoff_t index, bool trunc) 423 { 424 int ret = 0; 425 void *entry; 426 struct radix_tree_root *page_tree = &mapping->page_tree; 427 428 spin_lock_irq(&mapping->tree_lock); 429 entry = get_unlocked_mapping_entry(mapping, index, NULL); 430 if (!entry || !radix_tree_exceptional_entry(entry)) 431 goto out; 432 if (!trunc && 433 (radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_DIRTY) || 434 radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_TOWRITE))) 435 goto out; 436 radix_tree_delete(page_tree, index); 437 mapping->nrexceptional--; 438 ret = 1; 439 out: 440 put_unlocked_mapping_entry(mapping, index, entry); 441 spin_unlock_irq(&mapping->tree_lock); 442 return ret; 443 } 444 /* 445 * Delete exceptional DAX entry at @index from @mapping. Wait for radix tree 446 * entry to get unlocked before deleting it. 447 */ 448 int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index) 449 { 450 int ret = __dax_invalidate_mapping_entry(mapping, index, true); 451 452 /* 453 * This gets called from truncate / punch_hole path. As such, the caller 454 * must hold locks protecting against concurrent modifications of the 455 * radix tree (usually fs-private i_mmap_sem for writing). Since the 456 * caller has seen exceptional entry for this index, we better find it 457 * at that index as well... 458 */ 459 WARN_ON_ONCE(!ret); 460 return ret; 461 } 462 463 /* 464 * Invalidate exceptional DAX entry if easily possible. This handles DAX 465 * entries for invalidate_inode_pages() so we evict the entry only if we can 466 * do so without blocking. 467 */ 468 int dax_invalidate_mapping_entry(struct address_space *mapping, pgoff_t index) 469 { 470 int ret = 0; 471 void *entry, **slot; 472 struct radix_tree_root *page_tree = &mapping->page_tree; 473 474 spin_lock_irq(&mapping->tree_lock); 475 entry = __radix_tree_lookup(page_tree, index, NULL, &slot); 476 if (!entry || !radix_tree_exceptional_entry(entry) || 477 slot_locked(mapping, slot)) 478 goto out; 479 if (radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_DIRTY) || 480 radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_TOWRITE)) 481 goto out; 482 radix_tree_delete(page_tree, index); 483 mapping->nrexceptional--; 484 ret = 1; 485 out: 486 spin_unlock_irq(&mapping->tree_lock); 487 if (ret) 488 dax_wake_mapping_entry_waiter(mapping, index, entry, true); 489 return ret; 490 } 491 492 /* 493 * Invalidate exceptional DAX entry if it is clean. 494 */ 495 int dax_invalidate_mapping_entry_sync(struct address_space *mapping, 496 pgoff_t index) 497 { 498 return __dax_invalidate_mapping_entry(mapping, index, false); 499 } 500 501 /* 502 * The user has performed a load from a hole in the file. Allocating 503 * a new page in the file would cause excessive storage usage for 504 * workloads with sparse files. We allocate a page cache page instead. 505 * We'll kick it out of the page cache if it's ever written to, 506 * otherwise it will simply fall out of the page cache under memory 507 * pressure without ever having been dirtied. 508 */ 509 static int dax_load_hole(struct address_space *mapping, void **entry, 510 struct vm_fault *vmf) 511 { 512 struct inode *inode = mapping->host; 513 struct page *page; 514 int ret; 515 516 /* Hole page already exists? Return it... */ 517 if (!radix_tree_exceptional_entry(*entry)) { 518 page = *entry; 519 goto finish_fault; 520 } 521 522 /* This will replace locked radix tree entry with a hole page */ 523 page = find_or_create_page(mapping, vmf->pgoff, 524 vmf->gfp_mask | __GFP_ZERO); 525 if (!page) { 526 ret = VM_FAULT_OOM; 527 goto out; 528 } 529 530 finish_fault: 531 vmf->page = page; 532 ret = finish_fault(vmf); 533 vmf->page = NULL; 534 *entry = page; 535 if (!ret) { 536 /* Grab reference for PTE that is now referencing the page */ 537 get_page(page); 538 ret = VM_FAULT_NOPAGE; 539 } 540 out: 541 trace_dax_load_hole(inode, vmf, ret); 542 return ret; 543 } 544 545 static int copy_user_dax(struct block_device *bdev, struct dax_device *dax_dev, 546 sector_t sector, size_t size, struct page *to, 547 unsigned long vaddr) 548 { 549 void *vto, *kaddr; 550 pgoff_t pgoff; 551 pfn_t pfn; 552 long rc; 553 int id; 554 555 rc = bdev_dax_pgoff(bdev, sector, size, &pgoff); 556 if (rc) 557 return rc; 558 559 id = dax_read_lock(); 560 rc = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size), &kaddr, &pfn); 561 if (rc < 0) { 562 dax_read_unlock(id); 563 return rc; 564 } 565 vto = kmap_atomic(to); 566 copy_user_page(vto, (void __force *)kaddr, vaddr, to); 567 kunmap_atomic(vto); 568 dax_read_unlock(id); 569 return 0; 570 } 571 572 /* 573 * By this point grab_mapping_entry() has ensured that we have a locked entry 574 * of the appropriate size so we don't have to worry about downgrading PMDs to 575 * PTEs. If we happen to be trying to insert a PTE and there is a PMD 576 * already in the tree, we will skip the insertion and just dirty the PMD as 577 * appropriate. 578 */ 579 static void *dax_insert_mapping_entry(struct address_space *mapping, 580 struct vm_fault *vmf, 581 void *entry, sector_t sector, 582 unsigned long flags) 583 { 584 struct radix_tree_root *page_tree = &mapping->page_tree; 585 int error = 0; 586 bool hole_fill = false; 587 void *new_entry; 588 pgoff_t index = vmf->pgoff; 589 590 if (vmf->flags & FAULT_FLAG_WRITE) 591 __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); 592 593 /* Replacing hole page with block mapping? */ 594 if (!radix_tree_exceptional_entry(entry)) { 595 hole_fill = true; 596 /* 597 * Unmap the page now before we remove it from page cache below. 598 * The page is locked so it cannot be faulted in again. 599 */ 600 unmap_mapping_range(mapping, vmf->pgoff << PAGE_SHIFT, 601 PAGE_SIZE, 0); 602 error = radix_tree_preload(vmf->gfp_mask & ~__GFP_HIGHMEM); 603 if (error) 604 return ERR_PTR(error); 605 } else if (dax_is_zero_entry(entry) && !(flags & RADIX_DAX_HZP)) { 606 /* replacing huge zero page with PMD block mapping */ 607 unmap_mapping_range(mapping, 608 (vmf->pgoff << PAGE_SHIFT) & PMD_MASK, PMD_SIZE, 0); 609 } 610 611 spin_lock_irq(&mapping->tree_lock); 612 new_entry = dax_radix_locked_entry(sector, flags); 613 614 if (hole_fill) { 615 __delete_from_page_cache(entry, NULL); 616 /* Drop pagecache reference */ 617 put_page(entry); 618 error = __radix_tree_insert(page_tree, index, 619 dax_radix_order(new_entry), new_entry); 620 if (error) { 621 new_entry = ERR_PTR(error); 622 goto unlock; 623 } 624 mapping->nrexceptional++; 625 } else if (dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) { 626 /* 627 * Only swap our new entry into the radix tree if the current 628 * entry is a zero page or an empty entry. If a normal PTE or 629 * PMD entry is already in the tree, we leave it alone. This 630 * means that if we are trying to insert a PTE and the 631 * existing entry is a PMD, we will just leave the PMD in the 632 * tree and dirty it if necessary. 633 */ 634 struct radix_tree_node *node; 635 void **slot; 636 void *ret; 637 638 ret = __radix_tree_lookup(page_tree, index, &node, &slot); 639 WARN_ON_ONCE(ret != entry); 640 __radix_tree_replace(page_tree, node, slot, 641 new_entry, NULL, NULL); 642 } 643 if (vmf->flags & FAULT_FLAG_WRITE) 644 radix_tree_tag_set(page_tree, index, PAGECACHE_TAG_DIRTY); 645 unlock: 646 spin_unlock_irq(&mapping->tree_lock); 647 if (hole_fill) { 648 radix_tree_preload_end(); 649 /* 650 * We don't need hole page anymore, it has been replaced with 651 * locked radix tree entry now. 652 */ 653 if (mapping->a_ops->freepage) 654 mapping->a_ops->freepage(entry); 655 unlock_page(entry); 656 put_page(entry); 657 } 658 return new_entry; 659 } 660 661 static inline unsigned long 662 pgoff_address(pgoff_t pgoff, struct vm_area_struct *vma) 663 { 664 unsigned long address; 665 666 address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); 667 VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma); 668 return address; 669 } 670 671 /* Walk all mappings of a given index of a file and writeprotect them */ 672 static void dax_mapping_entry_mkclean(struct address_space *mapping, 673 pgoff_t index, unsigned long pfn) 674 { 675 struct vm_area_struct *vma; 676 pte_t pte, *ptep = NULL; 677 pmd_t *pmdp = NULL; 678 spinlock_t *ptl; 679 bool changed; 680 681 i_mmap_lock_read(mapping); 682 vma_interval_tree_foreach(vma, &mapping->i_mmap, index, index) { 683 unsigned long address; 684 685 cond_resched(); 686 687 if (!(vma->vm_flags & VM_SHARED)) 688 continue; 689 690 address = pgoff_address(index, vma); 691 changed = false; 692 if (follow_pte_pmd(vma->vm_mm, address, &ptep, &pmdp, &ptl)) 693 continue; 694 695 if (pmdp) { 696 #ifdef CONFIG_FS_DAX_PMD 697 pmd_t pmd; 698 699 if (pfn != pmd_pfn(*pmdp)) 700 goto unlock_pmd; 701 if (!pmd_dirty(*pmdp) && !pmd_write(*pmdp)) 702 goto unlock_pmd; 703 704 flush_cache_page(vma, address, pfn); 705 pmd = pmdp_huge_clear_flush(vma, address, pmdp); 706 pmd = pmd_wrprotect(pmd); 707 pmd = pmd_mkclean(pmd); 708 set_pmd_at(vma->vm_mm, address, pmdp, pmd); 709 changed = true; 710 unlock_pmd: 711 spin_unlock(ptl); 712 #endif 713 } else { 714 if (pfn != pte_pfn(*ptep)) 715 goto unlock_pte; 716 if (!pte_dirty(*ptep) && !pte_write(*ptep)) 717 goto unlock_pte; 718 719 flush_cache_page(vma, address, pfn); 720 pte = ptep_clear_flush(vma, address, ptep); 721 pte = pte_wrprotect(pte); 722 pte = pte_mkclean(pte); 723 set_pte_at(vma->vm_mm, address, ptep, pte); 724 changed = true; 725 unlock_pte: 726 pte_unmap_unlock(ptep, ptl); 727 } 728 729 if (changed) 730 mmu_notifier_invalidate_page(vma->vm_mm, address); 731 } 732 i_mmap_unlock_read(mapping); 733 } 734 735 static int dax_writeback_one(struct block_device *bdev, 736 struct dax_device *dax_dev, struct address_space *mapping, 737 pgoff_t index, void *entry) 738 { 739 struct radix_tree_root *page_tree = &mapping->page_tree; 740 void *entry2, **slot, *kaddr; 741 long ret = 0, id; 742 sector_t sector; 743 pgoff_t pgoff; 744 size_t size; 745 pfn_t pfn; 746 747 /* 748 * A page got tagged dirty in DAX mapping? Something is seriously 749 * wrong. 750 */ 751 if (WARN_ON(!radix_tree_exceptional_entry(entry))) 752 return -EIO; 753 754 spin_lock_irq(&mapping->tree_lock); 755 entry2 = get_unlocked_mapping_entry(mapping, index, &slot); 756 /* Entry got punched out / reallocated? */ 757 if (!entry2 || !radix_tree_exceptional_entry(entry2)) 758 goto put_unlocked; 759 /* 760 * Entry got reallocated elsewhere? No need to writeback. We have to 761 * compare sectors as we must not bail out due to difference in lockbit 762 * or entry type. 763 */ 764 if (dax_radix_sector(entry2) != dax_radix_sector(entry)) 765 goto put_unlocked; 766 if (WARN_ON_ONCE(dax_is_empty_entry(entry) || 767 dax_is_zero_entry(entry))) { 768 ret = -EIO; 769 goto put_unlocked; 770 } 771 772 /* Another fsync thread may have already written back this entry */ 773 if (!radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_TOWRITE)) 774 goto put_unlocked; 775 /* Lock the entry to serialize with page faults */ 776 entry = lock_slot(mapping, slot); 777 /* 778 * We can clear the tag now but we have to be careful so that concurrent 779 * dax_writeback_one() calls for the same index cannot finish before we 780 * actually flush the caches. This is achieved as the calls will look 781 * at the entry only under tree_lock and once they do that they will 782 * see the entry locked and wait for it to unlock. 783 */ 784 radix_tree_tag_clear(page_tree, index, PAGECACHE_TAG_TOWRITE); 785 spin_unlock_irq(&mapping->tree_lock); 786 787 /* 788 * Even if dax_writeback_mapping_range() was given a wbc->range_start 789 * in the middle of a PMD, the 'index' we are given will be aligned to 790 * the start index of the PMD, as will the sector we pull from 791 * 'entry'. This allows us to flush for PMD_SIZE and not have to 792 * worry about partial PMD writebacks. 793 */ 794 sector = dax_radix_sector(entry); 795 size = PAGE_SIZE << dax_radix_order(entry); 796 797 id = dax_read_lock(); 798 ret = bdev_dax_pgoff(bdev, sector, size, &pgoff); 799 if (ret) 800 goto dax_unlock; 801 802 /* 803 * dax_direct_access() may sleep, so cannot hold tree_lock over 804 * its invocation. 805 */ 806 ret = dax_direct_access(dax_dev, pgoff, size / PAGE_SIZE, &kaddr, &pfn); 807 if (ret < 0) 808 goto dax_unlock; 809 810 if (WARN_ON_ONCE(ret < size / PAGE_SIZE)) { 811 ret = -EIO; 812 goto dax_unlock; 813 } 814 815 dax_mapping_entry_mkclean(mapping, index, pfn_t_to_pfn(pfn)); 816 wb_cache_pmem(kaddr, size); 817 /* 818 * After we have flushed the cache, we can clear the dirty tag. There 819 * cannot be new dirty data in the pfn after the flush has completed as 820 * the pfn mappings are writeprotected and fault waits for mapping 821 * entry lock. 822 */ 823 spin_lock_irq(&mapping->tree_lock); 824 radix_tree_tag_clear(page_tree, index, PAGECACHE_TAG_DIRTY); 825 spin_unlock_irq(&mapping->tree_lock); 826 trace_dax_writeback_one(mapping->host, index, size >> PAGE_SHIFT); 827 dax_unlock: 828 dax_read_unlock(id); 829 put_locked_mapping_entry(mapping, index, entry); 830 return ret; 831 832 put_unlocked: 833 put_unlocked_mapping_entry(mapping, index, entry2); 834 spin_unlock_irq(&mapping->tree_lock); 835 return ret; 836 } 837 838 /* 839 * Flush the mapping to the persistent domain within the byte range of [start, 840 * end]. This is required by data integrity operations to ensure file data is 841 * on persistent storage prior to completion of the operation. 842 */ 843 int dax_writeback_mapping_range(struct address_space *mapping, 844 struct block_device *bdev, struct writeback_control *wbc) 845 { 846 struct inode *inode = mapping->host; 847 pgoff_t start_index, end_index; 848 pgoff_t indices[PAGEVEC_SIZE]; 849 struct dax_device *dax_dev; 850 struct pagevec pvec; 851 bool done = false; 852 int i, ret = 0; 853 854 if (WARN_ON_ONCE(inode->i_blkbits != PAGE_SHIFT)) 855 return -EIO; 856 857 if (!mapping->nrexceptional || wbc->sync_mode != WB_SYNC_ALL) 858 return 0; 859 860 dax_dev = dax_get_by_host(bdev->bd_disk->disk_name); 861 if (!dax_dev) 862 return -EIO; 863 864 start_index = wbc->range_start >> PAGE_SHIFT; 865 end_index = wbc->range_end >> PAGE_SHIFT; 866 867 trace_dax_writeback_range(inode, start_index, end_index); 868 869 tag_pages_for_writeback(mapping, start_index, end_index); 870 871 pagevec_init(&pvec, 0); 872 while (!done) { 873 pvec.nr = find_get_entries_tag(mapping, start_index, 874 PAGECACHE_TAG_TOWRITE, PAGEVEC_SIZE, 875 pvec.pages, indices); 876 877 if (pvec.nr == 0) 878 break; 879 880 for (i = 0; i < pvec.nr; i++) { 881 if (indices[i] > end_index) { 882 done = true; 883 break; 884 } 885 886 ret = dax_writeback_one(bdev, dax_dev, mapping, 887 indices[i], pvec.pages[i]); 888 if (ret < 0) 889 goto out; 890 } 891 } 892 out: 893 put_dax(dax_dev); 894 trace_dax_writeback_range_done(inode, start_index, end_index); 895 return (ret < 0 ? ret : 0); 896 } 897 EXPORT_SYMBOL_GPL(dax_writeback_mapping_range); 898 899 static int dax_insert_mapping(struct address_space *mapping, 900 struct block_device *bdev, struct dax_device *dax_dev, 901 sector_t sector, size_t size, void **entryp, 902 struct vm_area_struct *vma, struct vm_fault *vmf) 903 { 904 unsigned long vaddr = vmf->address; 905 void *entry = *entryp; 906 void *ret, *kaddr; 907 pgoff_t pgoff; 908 int id, rc; 909 pfn_t pfn; 910 911 rc = bdev_dax_pgoff(bdev, sector, size, &pgoff); 912 if (rc) 913 return rc; 914 915 id = dax_read_lock(); 916 rc = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size), &kaddr, &pfn); 917 if (rc < 0) { 918 dax_read_unlock(id); 919 return rc; 920 } 921 dax_read_unlock(id); 922 923 ret = dax_insert_mapping_entry(mapping, vmf, entry, sector, 0); 924 if (IS_ERR(ret)) 925 return PTR_ERR(ret); 926 *entryp = ret; 927 928 trace_dax_insert_mapping(mapping->host, vmf, ret); 929 return vm_insert_mixed(vma, vaddr, pfn); 930 } 931 932 /** 933 * dax_pfn_mkwrite - handle first write to DAX page 934 * @vmf: The description of the fault 935 */ 936 int dax_pfn_mkwrite(struct vm_fault *vmf) 937 { 938 struct file *file = vmf->vma->vm_file; 939 struct address_space *mapping = file->f_mapping; 940 struct inode *inode = mapping->host; 941 void *entry, **slot; 942 pgoff_t index = vmf->pgoff; 943 944 spin_lock_irq(&mapping->tree_lock); 945 entry = get_unlocked_mapping_entry(mapping, index, &slot); 946 if (!entry || !radix_tree_exceptional_entry(entry)) { 947 if (entry) 948 put_unlocked_mapping_entry(mapping, index, entry); 949 spin_unlock_irq(&mapping->tree_lock); 950 trace_dax_pfn_mkwrite_no_entry(inode, vmf, VM_FAULT_NOPAGE); 951 return VM_FAULT_NOPAGE; 952 } 953 radix_tree_tag_set(&mapping->page_tree, index, PAGECACHE_TAG_DIRTY); 954 entry = lock_slot(mapping, slot); 955 spin_unlock_irq(&mapping->tree_lock); 956 /* 957 * If we race with somebody updating the PTE and finish_mkwrite_fault() 958 * fails, we don't care. We need to return VM_FAULT_NOPAGE and retry 959 * the fault in either case. 960 */ 961 finish_mkwrite_fault(vmf); 962 put_locked_mapping_entry(mapping, index, entry); 963 trace_dax_pfn_mkwrite(inode, vmf, VM_FAULT_NOPAGE); 964 return VM_FAULT_NOPAGE; 965 } 966 EXPORT_SYMBOL_GPL(dax_pfn_mkwrite); 967 968 static bool dax_range_is_aligned(struct block_device *bdev, 969 unsigned int offset, unsigned int length) 970 { 971 unsigned short sector_size = bdev_logical_block_size(bdev); 972 973 if (!IS_ALIGNED(offset, sector_size)) 974 return false; 975 if (!IS_ALIGNED(length, sector_size)) 976 return false; 977 978 return true; 979 } 980 981 int __dax_zero_page_range(struct block_device *bdev, 982 struct dax_device *dax_dev, sector_t sector, 983 unsigned int offset, unsigned int size) 984 { 985 if (dax_range_is_aligned(bdev, offset, size)) { 986 sector_t start_sector = sector + (offset >> 9); 987 988 return blkdev_issue_zeroout(bdev, start_sector, 989 size >> 9, GFP_NOFS, 0); 990 } else { 991 pgoff_t pgoff; 992 long rc, id; 993 void *kaddr; 994 pfn_t pfn; 995 996 rc = bdev_dax_pgoff(bdev, sector, size, &pgoff); 997 if (rc) 998 return rc; 999 1000 id = dax_read_lock(); 1001 rc = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size), &kaddr, 1002 &pfn); 1003 if (rc < 0) { 1004 dax_read_unlock(id); 1005 return rc; 1006 } 1007 clear_pmem(kaddr + offset, size); 1008 dax_read_unlock(id); 1009 } 1010 return 0; 1011 } 1012 EXPORT_SYMBOL_GPL(__dax_zero_page_range); 1013 1014 static sector_t dax_iomap_sector(struct iomap *iomap, loff_t pos) 1015 { 1016 return iomap->blkno + (((pos & PAGE_MASK) - iomap->offset) >> 9); 1017 } 1018 1019 static loff_t 1020 dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data, 1021 struct iomap *iomap) 1022 { 1023 struct block_device *bdev = iomap->bdev; 1024 struct dax_device *dax_dev = iomap->dax_dev; 1025 struct iov_iter *iter = data; 1026 loff_t end = pos + length, done = 0; 1027 ssize_t ret = 0; 1028 int id; 1029 1030 if (iov_iter_rw(iter) == READ) { 1031 end = min(end, i_size_read(inode)); 1032 if (pos >= end) 1033 return 0; 1034 1035 if (iomap->type == IOMAP_HOLE || iomap->type == IOMAP_UNWRITTEN) 1036 return iov_iter_zero(min(length, end - pos), iter); 1037 } 1038 1039 if (WARN_ON_ONCE(iomap->type != IOMAP_MAPPED)) 1040 return -EIO; 1041 1042 /* 1043 * Write can allocate block for an area which has a hole page mapped 1044 * into page tables. We have to tear down these mappings so that data 1045 * written by write(2) is visible in mmap. 1046 */ 1047 if ((iomap->flags & IOMAP_F_NEW) && inode->i_mapping->nrpages) { 1048 invalidate_inode_pages2_range(inode->i_mapping, 1049 pos >> PAGE_SHIFT, 1050 (end - 1) >> PAGE_SHIFT); 1051 } 1052 1053 id = dax_read_lock(); 1054 while (pos < end) { 1055 unsigned offset = pos & (PAGE_SIZE - 1); 1056 const size_t size = ALIGN(length + offset, PAGE_SIZE); 1057 const sector_t sector = dax_iomap_sector(iomap, pos); 1058 ssize_t map_len; 1059 pgoff_t pgoff; 1060 void *kaddr; 1061 pfn_t pfn; 1062 1063 if (fatal_signal_pending(current)) { 1064 ret = -EINTR; 1065 break; 1066 } 1067 1068 ret = bdev_dax_pgoff(bdev, sector, size, &pgoff); 1069 if (ret) 1070 break; 1071 1072 map_len = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size), 1073 &kaddr, &pfn); 1074 if (map_len < 0) { 1075 ret = map_len; 1076 break; 1077 } 1078 1079 map_len = PFN_PHYS(map_len); 1080 kaddr += offset; 1081 map_len -= offset; 1082 if (map_len > end - pos) 1083 map_len = end - pos; 1084 1085 if (iov_iter_rw(iter) == WRITE) 1086 map_len = copy_from_iter_pmem(kaddr, map_len, iter); 1087 else 1088 map_len = copy_to_iter(kaddr, map_len, iter); 1089 if (map_len <= 0) { 1090 ret = map_len ? map_len : -EFAULT; 1091 break; 1092 } 1093 1094 pos += map_len; 1095 length -= map_len; 1096 done += map_len; 1097 } 1098 dax_read_unlock(id); 1099 1100 return done ? done : ret; 1101 } 1102 1103 /** 1104 * dax_iomap_rw - Perform I/O to a DAX file 1105 * @iocb: The control block for this I/O 1106 * @iter: The addresses to do I/O from or to 1107 * @ops: iomap ops passed from the file system 1108 * 1109 * This function performs read and write operations to directly mapped 1110 * persistent memory. The callers needs to take care of read/write exclusion 1111 * and evicting any page cache pages in the region under I/O. 1112 */ 1113 ssize_t 1114 dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter, 1115 const struct iomap_ops *ops) 1116 { 1117 struct address_space *mapping = iocb->ki_filp->f_mapping; 1118 struct inode *inode = mapping->host; 1119 loff_t pos = iocb->ki_pos, ret = 0, done = 0; 1120 unsigned flags = 0; 1121 1122 if (iov_iter_rw(iter) == WRITE) { 1123 lockdep_assert_held_exclusive(&inode->i_rwsem); 1124 flags |= IOMAP_WRITE; 1125 } else { 1126 lockdep_assert_held(&inode->i_rwsem); 1127 } 1128 1129 while (iov_iter_count(iter)) { 1130 ret = iomap_apply(inode, pos, iov_iter_count(iter), flags, ops, 1131 iter, dax_iomap_actor); 1132 if (ret <= 0) 1133 break; 1134 pos += ret; 1135 done += ret; 1136 } 1137 1138 iocb->ki_pos += done; 1139 return done ? done : ret; 1140 } 1141 EXPORT_SYMBOL_GPL(dax_iomap_rw); 1142 1143 static int dax_fault_return(int error) 1144 { 1145 if (error == 0) 1146 return VM_FAULT_NOPAGE; 1147 if (error == -ENOMEM) 1148 return VM_FAULT_OOM; 1149 return VM_FAULT_SIGBUS; 1150 } 1151 1152 static int dax_iomap_pte_fault(struct vm_fault *vmf, 1153 const struct iomap_ops *ops) 1154 { 1155 struct address_space *mapping = vmf->vma->vm_file->f_mapping; 1156 struct inode *inode = mapping->host; 1157 unsigned long vaddr = vmf->address; 1158 loff_t pos = (loff_t)vmf->pgoff << PAGE_SHIFT; 1159 sector_t sector; 1160 struct iomap iomap = { 0 }; 1161 unsigned flags = IOMAP_FAULT; 1162 int error, major = 0; 1163 int vmf_ret = 0; 1164 void *entry; 1165 1166 trace_dax_pte_fault(inode, vmf, vmf_ret); 1167 /* 1168 * Check whether offset isn't beyond end of file now. Caller is supposed 1169 * to hold locks serializing us with truncate / punch hole so this is 1170 * a reliable test. 1171 */ 1172 if (pos >= i_size_read(inode)) { 1173 vmf_ret = VM_FAULT_SIGBUS; 1174 goto out; 1175 } 1176 1177 if ((vmf->flags & FAULT_FLAG_WRITE) && !vmf->cow_page) 1178 flags |= IOMAP_WRITE; 1179 1180 /* 1181 * Note that we don't bother to use iomap_apply here: DAX required 1182 * the file system block size to be equal the page size, which means 1183 * that we never have to deal with more than a single extent here. 1184 */ 1185 error = ops->iomap_begin(inode, pos, PAGE_SIZE, flags, &iomap); 1186 if (error) { 1187 vmf_ret = dax_fault_return(error); 1188 goto out; 1189 } 1190 if (WARN_ON_ONCE(iomap.offset + iomap.length < pos + PAGE_SIZE)) { 1191 vmf_ret = dax_fault_return(-EIO); /* fs corruption? */ 1192 goto finish_iomap; 1193 } 1194 1195 entry = grab_mapping_entry(mapping, vmf->pgoff, 0); 1196 if (IS_ERR(entry)) { 1197 vmf_ret = dax_fault_return(PTR_ERR(entry)); 1198 goto finish_iomap; 1199 } 1200 1201 sector = dax_iomap_sector(&iomap, pos); 1202 1203 if (vmf->cow_page) { 1204 switch (iomap.type) { 1205 case IOMAP_HOLE: 1206 case IOMAP_UNWRITTEN: 1207 clear_user_highpage(vmf->cow_page, vaddr); 1208 break; 1209 case IOMAP_MAPPED: 1210 error = copy_user_dax(iomap.bdev, iomap.dax_dev, 1211 sector, PAGE_SIZE, vmf->cow_page, vaddr); 1212 break; 1213 default: 1214 WARN_ON_ONCE(1); 1215 error = -EIO; 1216 break; 1217 } 1218 1219 if (error) 1220 goto error_unlock_entry; 1221 1222 __SetPageUptodate(vmf->cow_page); 1223 vmf_ret = finish_fault(vmf); 1224 if (!vmf_ret) 1225 vmf_ret = VM_FAULT_DONE_COW; 1226 goto unlock_entry; 1227 } 1228 1229 switch (iomap.type) { 1230 case IOMAP_MAPPED: 1231 if (iomap.flags & IOMAP_F_NEW) { 1232 count_vm_event(PGMAJFAULT); 1233 mem_cgroup_count_vm_event(vmf->vma->vm_mm, PGMAJFAULT); 1234 major = VM_FAULT_MAJOR; 1235 } 1236 error = dax_insert_mapping(mapping, iomap.bdev, iomap.dax_dev, 1237 sector, PAGE_SIZE, &entry, vmf->vma, vmf); 1238 /* -EBUSY is fine, somebody else faulted on the same PTE */ 1239 if (error == -EBUSY) 1240 error = 0; 1241 break; 1242 case IOMAP_UNWRITTEN: 1243 case IOMAP_HOLE: 1244 if (!(vmf->flags & FAULT_FLAG_WRITE)) { 1245 vmf_ret = dax_load_hole(mapping, &entry, vmf); 1246 goto unlock_entry; 1247 } 1248 /*FALLTHRU*/ 1249 default: 1250 WARN_ON_ONCE(1); 1251 error = -EIO; 1252 break; 1253 } 1254 1255 error_unlock_entry: 1256 vmf_ret = dax_fault_return(error) | major; 1257 unlock_entry: 1258 put_locked_mapping_entry(mapping, vmf->pgoff, entry); 1259 finish_iomap: 1260 if (ops->iomap_end) { 1261 int copied = PAGE_SIZE; 1262 1263 if (vmf_ret & VM_FAULT_ERROR) 1264 copied = 0; 1265 /* 1266 * The fault is done by now and there's no way back (other 1267 * thread may be already happily using PTE we have installed). 1268 * Just ignore error from ->iomap_end since we cannot do much 1269 * with it. 1270 */ 1271 ops->iomap_end(inode, pos, PAGE_SIZE, copied, flags, &iomap); 1272 } 1273 out: 1274 trace_dax_pte_fault_done(inode, vmf, vmf_ret); 1275 return vmf_ret; 1276 } 1277 1278 #ifdef CONFIG_FS_DAX_PMD 1279 /* 1280 * The 'colour' (ie low bits) within a PMD of a page offset. This comes up 1281 * more often than one might expect in the below functions. 1282 */ 1283 #define PG_PMD_COLOUR ((PMD_SIZE >> PAGE_SHIFT) - 1) 1284 1285 static int dax_pmd_insert_mapping(struct vm_fault *vmf, struct iomap *iomap, 1286 loff_t pos, void **entryp) 1287 { 1288 struct address_space *mapping = vmf->vma->vm_file->f_mapping; 1289 const sector_t sector = dax_iomap_sector(iomap, pos); 1290 struct dax_device *dax_dev = iomap->dax_dev; 1291 struct block_device *bdev = iomap->bdev; 1292 struct inode *inode = mapping->host; 1293 const size_t size = PMD_SIZE; 1294 void *ret = NULL, *kaddr; 1295 long length = 0; 1296 pgoff_t pgoff; 1297 pfn_t pfn; 1298 int id; 1299 1300 if (bdev_dax_pgoff(bdev, sector, size, &pgoff) != 0) 1301 goto fallback; 1302 1303 id = dax_read_lock(); 1304 length = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size), &kaddr, &pfn); 1305 if (length < 0) 1306 goto unlock_fallback; 1307 length = PFN_PHYS(length); 1308 1309 if (length < size) 1310 goto unlock_fallback; 1311 if (pfn_t_to_pfn(pfn) & PG_PMD_COLOUR) 1312 goto unlock_fallback; 1313 if (!pfn_t_devmap(pfn)) 1314 goto unlock_fallback; 1315 dax_read_unlock(id); 1316 1317 ret = dax_insert_mapping_entry(mapping, vmf, *entryp, sector, 1318 RADIX_DAX_PMD); 1319 if (IS_ERR(ret)) 1320 goto fallback; 1321 *entryp = ret; 1322 1323 trace_dax_pmd_insert_mapping(inode, vmf, length, pfn, ret); 1324 return vmf_insert_pfn_pmd(vmf->vma, vmf->address, vmf->pmd, 1325 pfn, vmf->flags & FAULT_FLAG_WRITE); 1326 1327 unlock_fallback: 1328 dax_read_unlock(id); 1329 fallback: 1330 trace_dax_pmd_insert_mapping_fallback(inode, vmf, length, pfn, ret); 1331 return VM_FAULT_FALLBACK; 1332 } 1333 1334 static int dax_pmd_load_hole(struct vm_fault *vmf, struct iomap *iomap, 1335 void **entryp) 1336 { 1337 struct address_space *mapping = vmf->vma->vm_file->f_mapping; 1338 unsigned long pmd_addr = vmf->address & PMD_MASK; 1339 struct inode *inode = mapping->host; 1340 struct page *zero_page; 1341 void *ret = NULL; 1342 spinlock_t *ptl; 1343 pmd_t pmd_entry; 1344 1345 zero_page = mm_get_huge_zero_page(vmf->vma->vm_mm); 1346 1347 if (unlikely(!zero_page)) 1348 goto fallback; 1349 1350 ret = dax_insert_mapping_entry(mapping, vmf, *entryp, 0, 1351 RADIX_DAX_PMD | RADIX_DAX_HZP); 1352 if (IS_ERR(ret)) 1353 goto fallback; 1354 *entryp = ret; 1355 1356 ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd); 1357 if (!pmd_none(*(vmf->pmd))) { 1358 spin_unlock(ptl); 1359 goto fallback; 1360 } 1361 1362 pmd_entry = mk_pmd(zero_page, vmf->vma->vm_page_prot); 1363 pmd_entry = pmd_mkhuge(pmd_entry); 1364 set_pmd_at(vmf->vma->vm_mm, pmd_addr, vmf->pmd, pmd_entry); 1365 spin_unlock(ptl); 1366 trace_dax_pmd_load_hole(inode, vmf, zero_page, ret); 1367 return VM_FAULT_NOPAGE; 1368 1369 fallback: 1370 trace_dax_pmd_load_hole_fallback(inode, vmf, zero_page, ret); 1371 return VM_FAULT_FALLBACK; 1372 } 1373 1374 static int dax_iomap_pmd_fault(struct vm_fault *vmf, 1375 const struct iomap_ops *ops) 1376 { 1377 struct vm_area_struct *vma = vmf->vma; 1378 struct address_space *mapping = vma->vm_file->f_mapping; 1379 unsigned long pmd_addr = vmf->address & PMD_MASK; 1380 bool write = vmf->flags & FAULT_FLAG_WRITE; 1381 unsigned int iomap_flags = (write ? IOMAP_WRITE : 0) | IOMAP_FAULT; 1382 struct inode *inode = mapping->host; 1383 int result = VM_FAULT_FALLBACK; 1384 struct iomap iomap = { 0 }; 1385 pgoff_t max_pgoff, pgoff; 1386 void *entry; 1387 loff_t pos; 1388 int error; 1389 1390 /* 1391 * Check whether offset isn't beyond end of file now. Caller is 1392 * supposed to hold locks serializing us with truncate / punch hole so 1393 * this is a reliable test. 1394 */ 1395 pgoff = linear_page_index(vma, pmd_addr); 1396 max_pgoff = (i_size_read(inode) - 1) >> PAGE_SHIFT; 1397 1398 trace_dax_pmd_fault(inode, vmf, max_pgoff, 0); 1399 1400 /* Fall back to PTEs if we're going to COW */ 1401 if (write && !(vma->vm_flags & VM_SHARED)) 1402 goto fallback; 1403 1404 /* If the PMD would extend outside the VMA */ 1405 if (pmd_addr < vma->vm_start) 1406 goto fallback; 1407 if ((pmd_addr + PMD_SIZE) > vma->vm_end) 1408 goto fallback; 1409 1410 if (pgoff > max_pgoff) { 1411 result = VM_FAULT_SIGBUS; 1412 goto out; 1413 } 1414 1415 /* If the PMD would extend beyond the file size */ 1416 if ((pgoff | PG_PMD_COLOUR) > max_pgoff) 1417 goto fallback; 1418 1419 /* 1420 * Note that we don't use iomap_apply here. We aren't doing I/O, only 1421 * setting up a mapping, so really we're using iomap_begin() as a way 1422 * to look up our filesystem block. 1423 */ 1424 pos = (loff_t)pgoff << PAGE_SHIFT; 1425 error = ops->iomap_begin(inode, pos, PMD_SIZE, iomap_flags, &iomap); 1426 if (error) 1427 goto fallback; 1428 1429 if (iomap.offset + iomap.length < pos + PMD_SIZE) 1430 goto finish_iomap; 1431 1432 /* 1433 * grab_mapping_entry() will make sure we get a 2M empty entry, a DAX 1434 * PMD or a HZP entry. If it can't (because a 4k page is already in 1435 * the tree, for instance), it will return -EEXIST and we just fall 1436 * back to 4k entries. 1437 */ 1438 entry = grab_mapping_entry(mapping, pgoff, RADIX_DAX_PMD); 1439 if (IS_ERR(entry)) 1440 goto finish_iomap; 1441 1442 switch (iomap.type) { 1443 case IOMAP_MAPPED: 1444 result = dax_pmd_insert_mapping(vmf, &iomap, pos, &entry); 1445 break; 1446 case IOMAP_UNWRITTEN: 1447 case IOMAP_HOLE: 1448 if (WARN_ON_ONCE(write)) 1449 goto unlock_entry; 1450 result = dax_pmd_load_hole(vmf, &iomap, &entry); 1451 break; 1452 default: 1453 WARN_ON_ONCE(1); 1454 break; 1455 } 1456 1457 unlock_entry: 1458 put_locked_mapping_entry(mapping, pgoff, entry); 1459 finish_iomap: 1460 if (ops->iomap_end) { 1461 int copied = PMD_SIZE; 1462 1463 if (result == VM_FAULT_FALLBACK) 1464 copied = 0; 1465 /* 1466 * The fault is done by now and there's no way back (other 1467 * thread may be already happily using PMD we have installed). 1468 * Just ignore error from ->iomap_end since we cannot do much 1469 * with it. 1470 */ 1471 ops->iomap_end(inode, pos, PMD_SIZE, copied, iomap_flags, 1472 &iomap); 1473 } 1474 fallback: 1475 if (result == VM_FAULT_FALLBACK) { 1476 split_huge_pmd(vma, vmf->pmd, vmf->address); 1477 count_vm_event(THP_FAULT_FALLBACK); 1478 } 1479 out: 1480 trace_dax_pmd_fault_done(inode, vmf, max_pgoff, result); 1481 return result; 1482 } 1483 #else 1484 static int dax_iomap_pmd_fault(struct vm_fault *vmf, 1485 const struct iomap_ops *ops) 1486 { 1487 return VM_FAULT_FALLBACK; 1488 } 1489 #endif /* CONFIG_FS_DAX_PMD */ 1490 1491 /** 1492 * dax_iomap_fault - handle a page fault on a DAX file 1493 * @vmf: The description of the fault 1494 * @ops: iomap ops passed from the file system 1495 * 1496 * When a page fault occurs, filesystems may call this helper in 1497 * their fault handler for DAX files. dax_iomap_fault() assumes the caller 1498 * has done all the necessary locking for page fault to proceed 1499 * successfully. 1500 */ 1501 int dax_iomap_fault(struct vm_fault *vmf, enum page_entry_size pe_size, 1502 const struct iomap_ops *ops) 1503 { 1504 switch (pe_size) { 1505 case PE_SIZE_PTE: 1506 return dax_iomap_pte_fault(vmf, ops); 1507 case PE_SIZE_PMD: 1508 return dax_iomap_pmd_fault(vmf, ops); 1509 default: 1510 return VM_FAULT_FALLBACK; 1511 } 1512 } 1513 EXPORT_SYMBOL_GPL(dax_iomap_fault); 1514