/*
 * fs/dax.c - Direct Access filesystem code
 * Copyright (c) 2013-2014 Intel Corporation
 * Author: Matthew Wilcox <matthew.r.wilcox@intel.com>
 * Author: Ross Zwisler <ross.zwisler@linux.intel.com>
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
 * more details.
 */

#include <linux/atomic.h>
#include <linux/blkdev.h>
#include <linux/buffer_head.h>
#include <linux/dax.h>
#include <linux/fs.h>
#include <linux/genhd.h>
#include <linux/highmem.h>
#include <linux/memcontrol.h>
#include <linux/mm.h>
#include <linux/mutex.h>
#include <linux/pagevec.h>
#include <linux/pmem.h>
#include <linux/sched.h>
#include <linux/uio.h>
#include <linux/vmstat.h>
#include <linux/pfn_t.h>
#include <linux/sizes.h>

static long dax_map_atomic(struct block_device *bdev, struct blk_dax_ctl *dax)
{
	struct request_queue *q = bdev->bd_queue;
	long rc = -EIO;

	dax->addr = (void __pmem *) ERR_PTR(-EIO);
	if (blk_queue_enter(q, true) != 0)
		return rc;

	rc = bdev_direct_access(bdev, dax);
	if (rc < 0) {
		dax->addr = (void __pmem *) ERR_PTR(rc);
		blk_queue_exit(q);
		return rc;
	}
	return rc;
}

static void dax_unmap_atomic(struct block_device *bdev,
		const struct blk_dax_ctl *dax)
{
	if (IS_ERR(dax->addr))
		return;
	blk_queue_exit(bdev->bd_queue);
}

/*
 * dax_clear_blocks() is called from within transaction context from XFS,
 * and hence this means the stack from this point must follow GFP_NOFS
 * semantics for all operations.
 */
int dax_clear_blocks(struct inode *inode, sector_t block, long _size)
{
	struct block_device *bdev = inode->i_sb->s_bdev;
	struct blk_dax_ctl dax = {
		.sector = block << (inode->i_blkbits - 9),
		.size = _size,
	};

	might_sleep();
	do {
		long count, sz;

		count = dax_map_atomic(bdev, &dax);
		if (count < 0)
			return count;
		sz = min_t(long, count, SZ_128K);
		clear_pmem(dax.addr, sz);
		dax.size -= sz;
		dax.sector += sz / 512;
		dax_unmap_atomic(bdev, &dax);
		cond_resched();
	} while (dax.size);

	wmb_pmem();
	return 0;
}
EXPORT_SYMBOL_GPL(dax_clear_blocks);

/* the clear_pmem() calls are ordered by a wmb_pmem() in the caller */
static void dax_new_buf(void __pmem *addr, unsigned size, unsigned first,
		loff_t pos, loff_t end)
{
	loff_t final = end - pos + first; /* The final byte of the buffer */

	if (first > 0)
		clear_pmem(addr, first);
	if (final < size)
		clear_pmem(addr + final, size - final);
}

static bool buffer_written(struct buffer_head *bh)
{
	return buffer_mapped(bh) && !buffer_unwritten(bh);
}

/*
 * When ext4 encounters a hole, it returns without modifying the buffer_head
 * which means that we can't trust b_size.  To cope with this, we set b_state
 * to 0 before calling get_block and, if any bit is set, we know we can trust
 * b_size.  Unfortunate, really, since ext4 knows precisely how long a hole is
 * and would save us time calling get_block repeatedly.
 */
static bool buffer_size_valid(struct buffer_head *bh)
{
	return bh->b_state != 0;
}

static sector_t to_sector(const struct buffer_head *bh,
		const struct inode *inode)
{
	sector_t sector = bh->b_blocknr << (inode->i_blkbits - 9);

	return sector;
}

static ssize_t dax_io(struct inode *inode, struct iov_iter *iter,
		      loff_t start, loff_t end, get_block_t get_block,
		      struct buffer_head *bh)
{
	loff_t pos = start, max = start, bh_max = start;
	bool hole = false, need_wmb = false;
	struct block_device *bdev = NULL;
	int rw = iov_iter_rw(iter), rc;
	long map_len = 0;
	struct blk_dax_ctl dax = {
		.addr = (void __pmem *) ERR_PTR(-EIO),
	};

	if (rw == READ)
		end = min(end, i_size_read(inode));

	while (pos < end) {
		size_t len;
		if (pos == max) {
			unsigned blkbits = inode->i_blkbits;
			long page = pos >> PAGE_SHIFT;
			sector_t block = page << (PAGE_SHIFT - blkbits);
			unsigned first = pos - (block << blkbits);
			long size;

			if (pos == bh_max) {
				bh->b_size = PAGE_ALIGN(end - pos);
				bh->b_state = 0;
				rc = get_block(inode, block, bh, rw == WRITE);
				if (rc)
					break;
				if (!buffer_size_valid(bh))
					bh->b_size = 1 << blkbits;
				bh_max = pos - first + bh->b_size;
				bdev = bh->b_bdev;
			} else {
				unsigned done = bh->b_size -
						(bh_max - (pos - first));
				bh->b_blocknr += done >> blkbits;
				bh->b_size -= done;
			}

			hole = rw == READ && !buffer_written(bh);
			if (hole) {
				size = bh->b_size - first;
			} else {
				dax_unmap_atomic(bdev, &dax);
				dax.sector = to_sector(bh, inode);
				dax.size = bh->b_size;
				map_len = dax_map_atomic(bdev, &dax);
				if (map_len < 0) {
					rc = map_len;
					break;
				}
				if (buffer_unwritten(bh) || buffer_new(bh)) {
					dax_new_buf(dax.addr, map_len, first,
							pos, end);
					need_wmb = true;
				}
				dax.addr += first;
				size = map_len - first;
			}
			max = min(pos + size, end);
		}

		if (iov_iter_rw(iter) == WRITE) {
			len = copy_from_iter_pmem(dax.addr, max - pos, iter);
			need_wmb = true;
		} else if (!hole)
			len = copy_to_iter((void __force *) dax.addr,
					max - pos, iter);
		else
			len = iov_iter_zero(max - pos, iter);

		if (!len) {
			rc = -EFAULT;
			break;
		}

		pos += len;
		if (!IS_ERR(dax.addr))
			dax.addr += len;
	}

	if (need_wmb)
		wmb_pmem();
	dax_unmap_atomic(bdev, &dax);

	return (pos == start) ? rc : pos - start;
}

/**
 * dax_do_io - Perform I/O to a DAX file
 * @iocb: The control block for this I/O
 * @inode: The file which the I/O is directed at
 * @iter: The addresses to do I/O from or to
 * @pos: The file offset where the I/O starts
 * @get_block: The filesystem method used to translate file offsets to blocks
 * @end_io: A filesystem callback for I/O completion
 * @flags: See below
 *
 * This function uses the same locking scheme as do_blockdev_direct_IO:
 * If @flags has DIO_LOCKING set, we assume that the i_mutex is held by the
 * caller for writes.  For reads, we take and release the i_mutex ourselves.
 * If DIO_LOCKING is not set, the filesystem takes care of its own locking.
 * As with do_blockdev_direct_IO(), we increment i_dio_count while the I/O
 * is in progress.
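 *
 * As a rough illustration (not part of this function's contract), a
 * filesystem's ->direct_IO method typically just forwards to this helper
 * when the inode is DAX-capable.  The sketch below assumes an ext2-style
 * setup, with ext2_get_block as the get_block callback and no end_io:
 *
 *	if (IS_DAX(inode))
 *		ret = dax_do_io(iocb, inode, iter, offset, ext2_get_block,
 *				NULL, DIO_LOCKING);
 *	else
 *		ret = blockdev_direct_IO(iocb, inode, iter, offset,
 *					 ext2_get_block);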
 */
ssize_t dax_do_io(struct kiocb *iocb, struct inode *inode,
		struct iov_iter *iter, loff_t pos, get_block_t get_block,
		dio_iodone_t end_io, int flags)
{
	struct buffer_head bh;
	ssize_t retval = -EINVAL;
	loff_t end = pos + iov_iter_count(iter);

	memset(&bh, 0, sizeof(bh));
	bh.b_bdev = inode->i_sb->s_bdev;

	if ((flags & DIO_LOCKING) && iov_iter_rw(iter) == READ) {
		struct address_space *mapping = inode->i_mapping;
		inode_lock(inode);
		retval = filemap_write_and_wait_range(mapping, pos, end - 1);
		if (retval) {
			inode_unlock(inode);
			goto out;
		}
	}

	/* Protects against truncate */
	if (!(flags & DIO_SKIP_DIO_COUNT))
		inode_dio_begin(inode);

	retval = dax_io(inode, iter, pos, end, get_block, &bh);

	if ((flags & DIO_LOCKING) && iov_iter_rw(iter) == READ)
		inode_unlock(inode);

	if ((retval > 0) && end_io)
		end_io(iocb, pos, retval, bh.b_private);

	if (!(flags & DIO_SKIP_DIO_COUNT))
		inode_dio_end(inode);
 out:
	return retval;
}
EXPORT_SYMBOL_GPL(dax_do_io);

/*
 * The user has performed a load from a hole in the file.  Allocating
 * a new page in the file would cause excessive storage usage for
 * workloads with sparse files.  We allocate a page cache page instead.
 * We'll kick it out of the page cache if it's ever written to,
 * otherwise it will simply fall out of the page cache under memory
 * pressure without ever having been dirtied.
 */
static int dax_load_hole(struct address_space *mapping, struct page *page,
							struct vm_fault *vmf)
{
	unsigned long size;
	struct inode *inode = mapping->host;
	if (!page)
		page = find_or_create_page(mapping, vmf->pgoff,
						GFP_KERNEL | __GFP_ZERO);
	if (!page)
		return VM_FAULT_OOM;
	/* Recheck i_size under page lock to avoid truncate race */
	size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
	if (vmf->pgoff >= size) {
		unlock_page(page);
		page_cache_release(page);
		return VM_FAULT_SIGBUS;
	}

	vmf->page = page;
	return VM_FAULT_LOCKED;
}

static int copy_user_bh(struct page *to, struct inode *inode,
		struct buffer_head *bh, unsigned long vaddr)
{
	struct blk_dax_ctl dax = {
		.sector = to_sector(bh, inode),
		.size = bh->b_size,
	};
	struct block_device *bdev = bh->b_bdev;
	void *vto;

	if (dax_map_atomic(bdev, &dax) < 0)
		return PTR_ERR(dax.addr);
	vto = kmap_atomic(to);
	copy_user_page(vto, (void __force *)dax.addr, vaddr, to);
	kunmap_atomic(vto);
	dax_unmap_atomic(bdev, &dax);
	return 0;
}

#define NO_SECTOR -1
#define DAX_PMD_INDEX(page_index) (page_index & (PMD_MASK >> PAGE_CACHE_SHIFT))

static int dax_radix_entry(struct address_space *mapping, pgoff_t index,
		sector_t sector, bool pmd_entry, bool dirty)
{
	struct radix_tree_root *page_tree = &mapping->page_tree;
	pgoff_t pmd_index = DAX_PMD_INDEX(index);
	int type, error = 0;
	void *entry;

	WARN_ON_ONCE(pmd_entry && !dirty);
	__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);

	spin_lock_irq(&mapping->tree_lock);

	entry = radix_tree_lookup(page_tree, pmd_index);
	if (entry && RADIX_DAX_TYPE(entry) == RADIX_DAX_PMD) {
		index = pmd_index;
		goto dirty;
	}

	entry = radix_tree_lookup(page_tree, index);
	if (entry) {
		type = RADIX_DAX_TYPE(entry);
		if (WARN_ON_ONCE(type != RADIX_DAX_PTE &&
					type != RADIX_DAX_PMD)) {
			error = -EIO;
			goto unlock;
		}

		if (!pmd_entry || type == RADIX_DAX_PMD)
			goto dirty;

		/*
		 * We only insert dirty PMD entries into the radix tree.  This
		 * means we don't need to worry about removing a dirty PTE
		 * entry and inserting a clean PMD entry, thus reducing the
		 * range we would flush with a follow-up fsync/msync call.
		 */
		radix_tree_delete(&mapping->page_tree, index);
		mapping->nrexceptional--;
	}

	if (sector == NO_SECTOR) {
		/*
		 * This can happen during correct operation if our pfn_mkwrite
		 * fault raced against a hole punch operation.  If this
		 * happens the pte that was hole punched will have been
		 * unmapped and the radix tree entry will have been removed by
		 * the time we are called, but the call will still happen.  We
		 * will return all the way up to wp_pfn_shared(), where the
		 * pte_same() check will fail, eventually causing the page
		 * fault to be retried by the CPU.
		 */
		goto unlock;
	}

	error = radix_tree_insert(page_tree, index,
			RADIX_DAX_ENTRY(sector, pmd_entry));
	if (error)
		goto unlock;

	mapping->nrexceptional++;
 dirty:
	if (dirty)
		radix_tree_tag_set(page_tree, index, PAGECACHE_TAG_DIRTY);
 unlock:
	spin_unlock_irq(&mapping->tree_lock);
	return error;
}

static int dax_writeback_one(struct block_device *bdev,
		struct address_space *mapping, pgoff_t index, void *entry)
{
	struct radix_tree_root *page_tree = &mapping->page_tree;
	int type = RADIX_DAX_TYPE(entry);
	struct radix_tree_node *node;
	struct blk_dax_ctl dax;
	void **slot;
	int ret = 0;

	spin_lock_irq(&mapping->tree_lock);
	/*
	 * Regular page slots are stabilized by the page lock even
	 * without the tree itself locked.  These unlocked entries
	 * need verification under the tree lock.
	 */
	if (!__radix_tree_lookup(page_tree, index, &node, &slot))
		goto unlock;
	if (*slot != entry)
		goto unlock;

	/* another fsync thread may have already written back this entry */
	if (!radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_TOWRITE))
		goto unlock;

	if (WARN_ON_ONCE(type != RADIX_DAX_PTE && type != RADIX_DAX_PMD)) {
		ret = -EIO;
		goto unlock;
	}

	dax.sector = RADIX_DAX_SECTOR(entry);
	dax.size = (type == RADIX_DAX_PMD ? PMD_SIZE : PAGE_SIZE);
	spin_unlock_irq(&mapping->tree_lock);

	/*
	 * We cannot hold tree_lock while calling dax_map_atomic() because it
	 * eventually calls cond_resched().
	 */
	ret = dax_map_atomic(bdev, &dax);
	if (ret < 0)
		return ret;

	if (WARN_ON_ONCE(ret < dax.size)) {
		ret = -EIO;
		goto unmap;
	}

	wb_cache_pmem(dax.addr, dax.size);

	spin_lock_irq(&mapping->tree_lock);
	radix_tree_tag_clear(page_tree, index, PAGECACHE_TAG_TOWRITE);
	spin_unlock_irq(&mapping->tree_lock);
 unmap:
	dax_unmap_atomic(bdev, &dax);
	return ret;

 unlock:
	spin_unlock_irq(&mapping->tree_lock);
	return ret;
}

/*
 * Flush the mapping to the persistent domain within the byte range of [start,
 * end].  This is required by data integrity operations to ensure file data is
 * on persistent storage prior to completion of the operation.
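 *
 * A minimal sketch of the caller side (a hypothetical fsync-style data
 * integrity path; "mapping", "start" and "end" are assumed to be in scope):
 *
 *	if (dax_mapping(mapping)) {
 *		ret = dax_writeback_mapping_range(mapping, start, end);
 *		if (ret)
 *			return ret;
 *	}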
 */
int dax_writeback_mapping_range(struct address_space *mapping, loff_t start,
		loff_t end)
{
	struct inode *inode = mapping->host;
	struct block_device *bdev = inode->i_sb->s_bdev;
	pgoff_t start_index, end_index, pmd_index;
	pgoff_t indices[PAGEVEC_SIZE];
	struct pagevec pvec;
	bool done = false;
	int i, ret = 0;
	void *entry;

	if (WARN_ON_ONCE(inode->i_blkbits != PAGE_SHIFT))
		return -EIO;

	start_index = start >> PAGE_CACHE_SHIFT;
	end_index = end >> PAGE_CACHE_SHIFT;
	pmd_index = DAX_PMD_INDEX(start_index);

	rcu_read_lock();
	entry = radix_tree_lookup(&mapping->page_tree, pmd_index);
	rcu_read_unlock();

	/* see if the start of our range is covered by a PMD entry */
	if (entry && RADIX_DAX_TYPE(entry) == RADIX_DAX_PMD)
		start_index = pmd_index;

	tag_pages_for_writeback(mapping, start_index, end_index);

	pagevec_init(&pvec, 0);
	while (!done) {
		pvec.nr = find_get_entries_tag(mapping, start_index,
				PAGECACHE_TAG_TOWRITE, PAGEVEC_SIZE,
				pvec.pages, indices);

		if (pvec.nr == 0)
			break;

		for (i = 0; i < pvec.nr; i++) {
			if (indices[i] > end_index) {
				done = true;
				break;
			}

			ret = dax_writeback_one(bdev, mapping, indices[i],
					pvec.pages[i]);
			if (ret < 0)
				return ret;
		}
	}
	wmb_pmem();
	return 0;
}
EXPORT_SYMBOL_GPL(dax_writeback_mapping_range);

static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh,
			struct vm_area_struct *vma, struct vm_fault *vmf)
{
	unsigned long vaddr = (unsigned long)vmf->virtual_address;
	struct address_space *mapping = inode->i_mapping;
	struct block_device *bdev = bh->b_bdev;
	struct blk_dax_ctl dax = {
		.sector = to_sector(bh, inode),
		.size = bh->b_size,
	};
	pgoff_t size;
	int error;

	i_mmap_lock_read(mapping);

	/*
	 * Check truncate didn't happen while we were allocating a block.
	 * If it did, this block may or may not be still allocated to the
	 * file.  We can't tell the filesystem to free it because we can't
	 * take i_mutex here.  In the worst case, the file still has blocks
	 * allocated past the end of the file.
	 */
	size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
	if (unlikely(vmf->pgoff >= size)) {
		error = -EIO;
		goto out;
	}

	if (dax_map_atomic(bdev, &dax) < 0) {
		error = PTR_ERR(dax.addr);
		goto out;
	}

	if (buffer_unwritten(bh) || buffer_new(bh)) {
		clear_pmem(dax.addr, PAGE_SIZE);
		wmb_pmem();
	}
	dax_unmap_atomic(bdev, &dax);

	error = dax_radix_entry(mapping, vmf->pgoff, dax.sector, false,
			vmf->flags & FAULT_FLAG_WRITE);
	if (error)
		goto out;

	error = vm_insert_mixed(vma, vaddr, dax.pfn);

 out:
	i_mmap_unlock_read(mapping);

	return error;
}

/**
 * __dax_fault - handle a page fault on a DAX file
 * @vma: The virtual memory area where the fault occurred
 * @vmf: The description of the fault
 * @get_block: The filesystem method used to translate file offsets to blocks
 * @complete_unwritten: The filesystem method used to convert unwritten blocks
 *	to written so the data written to them is exposed.  This is required
 *	by write faults for filesystems that will return unwritten extent
 *	mappings from @get_block, but it is optional for reads as
 *	dax_insert_mapping() will always zero unwritten blocks.
 *	If the fs does not support unwritten extents, then it should
 *	pass NULL.
 *
 * When a page fault occurs, filesystems may call this helper in their
 * fault handler for DAX files.  __dax_fault() assumes the caller has done all
 * the necessary locking for the page fault to proceed successfully.
 */
int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
			get_block_t get_block, dax_iodone_t complete_unwritten)
{
	struct file *file = vma->vm_file;
	struct address_space *mapping = file->f_mapping;
	struct inode *inode = mapping->host;
	struct page *page;
	struct buffer_head bh;
	unsigned long vaddr = (unsigned long)vmf->virtual_address;
	unsigned blkbits = inode->i_blkbits;
	sector_t block;
	pgoff_t size;
	int error;
	int major = 0;

	size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
	if (vmf->pgoff >= size)
		return VM_FAULT_SIGBUS;

	memset(&bh, 0, sizeof(bh));
	block = (sector_t)vmf->pgoff << (PAGE_SHIFT - blkbits);
	bh.b_bdev = inode->i_sb->s_bdev;
	bh.b_size = PAGE_SIZE;

 repeat:
	page = find_get_page(mapping, vmf->pgoff);
	if (page) {
		if (!lock_page_or_retry(page, vma->vm_mm, vmf->flags)) {
			page_cache_release(page);
			return VM_FAULT_RETRY;
		}
		if (unlikely(page->mapping != mapping)) {
			unlock_page(page);
			page_cache_release(page);
			goto repeat;
		}
		size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
		if (unlikely(vmf->pgoff >= size)) {
			/*
			 * We have a struct page covering a hole in the file
			 * from a read fault and we've raced with a truncate
			 */
			error = -EIO;
			goto unlock_page;
		}
	}

	error = get_block(inode, block, &bh, 0);
	if (!error && (bh.b_size < PAGE_SIZE))
		error = -EIO;		/* fs corruption? */
	if (error)
		goto unlock_page;

	if (!buffer_mapped(&bh) && !buffer_unwritten(&bh) && !vmf->cow_page) {
		if (vmf->flags & FAULT_FLAG_WRITE) {
			error = get_block(inode, block, &bh, 1);
			count_vm_event(PGMAJFAULT);
			mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
			major = VM_FAULT_MAJOR;
			if (!error && (bh.b_size < PAGE_SIZE))
				error = -EIO;
			if (error)
				goto unlock_page;
		} else {
			return dax_load_hole(mapping, page, vmf);
		}
	}

	if (vmf->cow_page) {
		struct page *new_page = vmf->cow_page;
		if (buffer_written(&bh))
			error = copy_user_bh(new_page, inode, &bh, vaddr);
		else
			clear_user_highpage(new_page, vaddr);
		if (error)
			goto unlock_page;
		vmf->page = page;
		if (!page) {
			i_mmap_lock_read(mapping);
			/* Check we didn't race with truncate */
			size = (i_size_read(inode) + PAGE_SIZE - 1) >>
								PAGE_SHIFT;
			if (vmf->pgoff >= size) {
				i_mmap_unlock_read(mapping);
				error = -EIO;
				goto out;
			}
		}
		return VM_FAULT_LOCKED;
	}

	/* Check we didn't race with a read fault installing a new page */
	if (!page && major)
		page = find_lock_page(mapping, vmf->pgoff);

	if (page) {
		unmap_mapping_range(mapping, vmf->pgoff << PAGE_SHIFT,
						PAGE_CACHE_SIZE, 0);
		delete_from_page_cache(page);
		unlock_page(page);
		page_cache_release(page);
		page = NULL;
	}

	/*
	 * If we successfully insert the new mapping over an unwritten extent,
	 * we need to ensure we convert the unwritten extent.
	 * If there is an error inserting the mapping, the filesystem needs to
	 * leave it as unwritten to prevent exposure of the stale underlying
	 * data to userspace, but we still need to call the completion
	 * function so the private resources on the mapping buffer can be
	 * released.  We indicate what the callback should do via the uptodate
	 * variable, same as for normal BH based IO completions.
	 */
	error = dax_insert_mapping(inode, &bh, vma, vmf);
	if (buffer_unwritten(&bh)) {
		if (complete_unwritten)
			complete_unwritten(&bh, !error);
		else
			WARN_ON_ONCE(!(vmf->flags & FAULT_FLAG_WRITE));
	}

 out:
	if (error == -ENOMEM)
		return VM_FAULT_OOM | major;
	/* -EBUSY is fine, somebody else faulted on the same PTE */
	if ((error < 0) && (error != -EBUSY))
		return VM_FAULT_SIGBUS | major;
	return VM_FAULT_NOPAGE | major;

 unlock_page:
	if (page) {
		unlock_page(page);
		page_cache_release(page);
	}
	goto out;
}
EXPORT_SYMBOL(__dax_fault);

/**
 * dax_fault - handle a page fault on a DAX file
 * @vma: The virtual memory area where the fault occurred
 * @vmf: The description of the fault
 * @get_block: The filesystem method used to translate file offsets to blocks
 * @complete_unwritten: The filesystem method used to convert unwritten blocks
 *	to written, as for __dax_fault()
 *
 * When a page fault occurs, filesystems may call this helper in their
 * fault handler for DAX files.
 */
int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
	      get_block_t get_block, dax_iodone_t complete_unwritten)
{
	int result;
	struct super_block *sb = file_inode(vma->vm_file)->i_sb;

	if (vmf->flags & FAULT_FLAG_WRITE) {
		sb_start_pagefault(sb);
		file_update_time(vma->vm_file);
	}
	result = __dax_fault(vma, vmf, get_block, complete_unwritten);
	if (vmf->flags & FAULT_FLAG_WRITE)
		sb_end_pagefault(sb);

	return result;
}
EXPORT_SYMBOL_GPL(dax_fault);

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/*
 * The 'colour' (ie low bits) within a PMD of a page offset.  This comes up
 * more often than one might expect in the below function.
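 *
 * For illustration: with 4K pages and 2M PMDs (the common x86_64 case),
 * PG_PMD_COLOUR is (SZ_2M >> 12) - 1 = 511 = 0x1ff, so
 * "pgoff & ~PG_PMD_COLOUR" is the first page index covered by the PMD and
 * "pgoff | PG_PMD_COLOUR" is the last.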
 */
#define PG_PMD_COLOUR	((PMD_SIZE >> PAGE_SHIFT) - 1)

static void __dax_dbg(struct buffer_head *bh, unsigned long address,
		const char *reason, const char *fn)
{
	if (bh) {
		char bname[BDEVNAME_SIZE];
		bdevname(bh->b_bdev, bname);
		pr_debug("%s: %s addr: %lx dev %s state %lx start %lld "
			"length %zd fallback: %s\n", fn, current->comm,
			address, bname, bh->b_state, (u64)bh->b_blocknr,
			bh->b_size, reason);
	} else {
		pr_debug("%s: %s addr: %lx fallback: %s\n", fn,
			current->comm, address, reason);
	}
}

#define dax_pmd_dbg(bh, address, reason)	__dax_dbg(bh, address, reason, "dax_pmd")

int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
		pmd_t *pmd, unsigned int flags, get_block_t get_block,
		dax_iodone_t complete_unwritten)
{
	struct file *file = vma->vm_file;
	struct address_space *mapping = file->f_mapping;
	struct inode *inode = mapping->host;
	struct buffer_head bh;
	unsigned blkbits = inode->i_blkbits;
	unsigned long pmd_addr = address & PMD_MASK;
	bool write = flags & FAULT_FLAG_WRITE;
	struct block_device *bdev;
	pgoff_t size, pgoff;
	sector_t block;
	int error, result = 0;
	bool alloc = false;

	/* dax pmd mappings require pfn_t_devmap() */
	if (!IS_ENABLED(CONFIG_FS_DAX_PMD))
		return VM_FAULT_FALLBACK;

	/* Fall back to PTEs if we're going to COW */
	if (write && !(vma->vm_flags & VM_SHARED)) {
		split_huge_pmd(vma, pmd, address);
		dax_pmd_dbg(NULL, address, "cow write");
		return VM_FAULT_FALLBACK;
	}
	/* If the PMD would extend outside the VMA */
	if (pmd_addr < vma->vm_start) {
		dax_pmd_dbg(NULL, address, "vma start unaligned");
		return VM_FAULT_FALLBACK;
	}
	if ((pmd_addr + PMD_SIZE) > vma->vm_end) {
		dax_pmd_dbg(NULL, address, "vma end unaligned");
		return VM_FAULT_FALLBACK;
	}

	pgoff = linear_page_index(vma, pmd_addr);
	size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
	if (pgoff >= size)
		return VM_FAULT_SIGBUS;
	/* If the PMD would cover blocks out of the file */
	if ((pgoff | PG_PMD_COLOUR) >= size) {
		dax_pmd_dbg(NULL, address,
				"offset + huge page size > file size");
		return VM_FAULT_FALLBACK;
	}

	memset(&bh, 0, sizeof(bh));
	bh.b_bdev = inode->i_sb->s_bdev;
	block = (sector_t)pgoff << (PAGE_SHIFT - blkbits);

	bh.b_size = PMD_SIZE;

	if (get_block(inode, block, &bh, 0) != 0)
		return VM_FAULT_SIGBUS;

	if (!buffer_mapped(&bh) && write) {
		if (get_block(inode, block, &bh, 1) != 0)
			return VM_FAULT_SIGBUS;
		alloc = true;
	}

	bdev = bh.b_bdev;

	/*
	 * If the filesystem isn't willing to tell us the length of a hole,
	 * just fall back to PTEs.  Calling get_block 512 times in a loop
	 * would be silly.
	 */
	if (!buffer_size_valid(&bh) || bh.b_size < PMD_SIZE) {
		dax_pmd_dbg(&bh, address, "allocated block too small");
		return VM_FAULT_FALLBACK;
	}

	/*
	 * If we allocated new storage, make sure no process has any
	 * zero pages covering this hole
	 */
	if (alloc) {
		loff_t lstart = pgoff << PAGE_SHIFT;
		loff_t lend = lstart + PMD_SIZE - 1; /* inclusive */

		truncate_pagecache_range(inode, lstart, lend);
	}

	i_mmap_lock_read(mapping);

	/*
	 * If a truncate happened while we were allocating blocks, we may
	 * leave blocks allocated to the file that are beyond EOF.
	 * We can't take i_mutex here, so just leave them hanging; they'll be
	 * freed when the file is deleted.
	 */
	size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
	if (pgoff >= size) {
		result = VM_FAULT_SIGBUS;
		goto out;
	}
	if ((pgoff | PG_PMD_COLOUR) >= size) {
		dax_pmd_dbg(&bh, address,
				"offset + huge page size > file size");
		goto fallback;
	}

	if (!write && !buffer_mapped(&bh) && buffer_uptodate(&bh)) {
		spinlock_t *ptl;
		pmd_t entry;
		struct page *zero_page = get_huge_zero_page();

		if (unlikely(!zero_page)) {
			dax_pmd_dbg(&bh, address, "no zero page");
			goto fallback;
		}

		ptl = pmd_lock(vma->vm_mm, pmd);
		if (!pmd_none(*pmd)) {
			spin_unlock(ptl);
			dax_pmd_dbg(&bh, address, "pmd already present");
			goto fallback;
		}

		dev_dbg(part_to_dev(bdev->bd_part),
				"%s: %s addr: %lx pfn: <zero> sect: %llx\n",
				__func__, current->comm, address,
				(unsigned long long) to_sector(&bh, inode));

		entry = mk_pmd(zero_page, vma->vm_page_prot);
		entry = pmd_mkhuge(entry);
		set_pmd_at(vma->vm_mm, pmd_addr, pmd, entry);
		result = VM_FAULT_NOPAGE;
		spin_unlock(ptl);
	} else {
		struct blk_dax_ctl dax = {
			.sector = to_sector(&bh, inode),
			.size = PMD_SIZE,
		};
		long length = dax_map_atomic(bdev, &dax);

		if (length < 0) {
			result = VM_FAULT_SIGBUS;
			goto out;
		}
		if (length < PMD_SIZE) {
			dax_pmd_dbg(&bh, address, "dax-length too small");
			dax_unmap_atomic(bdev, &dax);
			goto fallback;
		}
		if (pfn_t_to_pfn(dax.pfn) & PG_PMD_COLOUR) {
			dax_pmd_dbg(&bh, address, "pfn unaligned");
			dax_unmap_atomic(bdev, &dax);
			goto fallback;
		}

		if (!pfn_t_devmap(dax.pfn)) {
			dax_unmap_atomic(bdev, &dax);
			dax_pmd_dbg(&bh, address, "pfn not in memmap");
			goto fallback;
		}

		if (buffer_unwritten(&bh) || buffer_new(&bh)) {
			clear_pmem(dax.addr, PMD_SIZE);
			wmb_pmem();
			count_vm_event(PGMAJFAULT);
			mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
			result |= VM_FAULT_MAJOR;
		}
		dax_unmap_atomic(bdev, &dax);

		/*
		 * For PTE faults we insert a radix tree entry for reads, and
		 * leave it clean.  Then on the first write we dirty the radix
		 * tree entry via the dax_pfn_mkwrite() path.  This sequence
		 * allows the dax_pfn_mkwrite() call to be simpler and avoid a
		 * call into get_block() to translate the pgoff to a sector in
		 * order to be able to create a new radix tree entry.
		 *
		 * The PMD path doesn't have an equivalent to
		 * dax_pfn_mkwrite(), though, so for a read followed by a
		 * write we traverse all the way through __dax_pmd_fault()
		 * twice.  This means we can just skip inserting a radix tree
		 * entry completely on the initial read and just wait until
		 * the write to insert a dirty entry.
		 */
		if (write) {
			error = dax_radix_entry(mapping, pgoff, dax.sector,
					true, true);
			if (error) {
				dax_pmd_dbg(&bh, address,
						"PMD radix insertion failed");
				goto fallback;
			}
		}

		dev_dbg(part_to_dev(bdev->bd_part),
				"%s: %s addr: %lx pfn: %lx sect: %llx\n",
				__func__, current->comm, address,
				pfn_t_to_pfn(dax.pfn),
				(unsigned long long) dax.sector);
		result |= vmf_insert_pfn_pmd(vma, address, pmd,
				dax.pfn, write);
	}

 out:
	i_mmap_unlock_read(mapping);

	if (buffer_unwritten(&bh))
		complete_unwritten(&bh, !(result & VM_FAULT_ERROR));

	return result;

 fallback:
	count_vm_event(THP_FAULT_FALLBACK);
	result = VM_FAULT_FALLBACK;
	goto out;
}
EXPORT_SYMBOL_GPL(__dax_pmd_fault);

/**
 * dax_pmd_fault - handle a PMD fault on a DAX file
 * @vma: The virtual memory area where the fault occurred
 * @address: The faulting address
 * @pmd: The PMD within @vma that is being faulted
 * @flags: The fault flags (FAULT_FLAG_WRITE is the one that matters here)
 * @get_block: The filesystem method used to translate file offsets to blocks
 * @complete_unwritten: The filesystem method used to convert unwritten blocks
 *	to written, as for __dax_fault()
 *
 * When a page fault occurs, filesystems may call this helper in their
 * pmd_fault handler for DAX files.
 */
int dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
			pmd_t *pmd, unsigned int flags, get_block_t get_block,
			dax_iodone_t complete_unwritten)
{
	int result;
	struct super_block *sb = file_inode(vma->vm_file)->i_sb;

	if (flags & FAULT_FLAG_WRITE) {
		sb_start_pagefault(sb);
		file_update_time(vma->vm_file);
	}
	result = __dax_pmd_fault(vma, address, pmd, flags, get_block,
				complete_unwritten);
	if (flags & FAULT_FLAG_WRITE)
		sb_end_pagefault(sb);

	return result;
}
EXPORT_SYMBOL_GPL(dax_pmd_fault);
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */

/**
 * dax_pfn_mkwrite - handle first write to DAX page
 * @vma: The virtual memory area where the fault occurred
 * @vmf: The description of the fault
 */
int dax_pfn_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	struct file *file = vma->vm_file;

	/*
	 * We pass NO_SECTOR to dax_radix_entry() because we expect that a
	 * RADIX_DAX_PTE entry already exists in the radix tree from a
	 * previous call to __dax_fault().  We just want to look up that PTE
	 * entry using vmf->pgoff and make sure the dirty tag is set.  This
	 * saves us from having to make a call to get_block() here to look
	 * up the sector.
	 */
	dax_radix_entry(file->f_mapping, vmf->pgoff, NO_SECTOR, false, true);
	return VM_FAULT_NOPAGE;
}
EXPORT_SYMBOL_GPL(dax_pfn_mkwrite);

/**
 * dax_zero_page_range - zero a range within a page of a DAX file
 * @inode: The file being truncated
 * @from: The file offset that is being truncated to
 * @length: The number of bytes to zero
 * @get_block: The filesystem method used to translate file offsets to blocks
 *
 * This function can be called by a filesystem when it is zeroing part of a
 * page in a DAX file.  This is intended for hole-punch operations.  If
 * you are truncating a file, the helper function dax_truncate_page() may be
 * more convenient.
 *
 * We work in terms of PAGE_CACHE_SIZE here for commonality with
 * block_truncate_page(), but we could go down to PAGE_SIZE if the filesystem
 * took care of disposing of the unnecessary blocks.
 * Even if the filesystem block size is smaller than PAGE_SIZE, we have to
 * zero the rest of the page since the file might be mmapped.
 */
int dax_zero_page_range(struct inode *inode, loff_t from, unsigned length,
							get_block_t get_block)
{
	struct buffer_head bh;
	pgoff_t index = from >> PAGE_CACHE_SHIFT;
	unsigned offset = from & (PAGE_CACHE_SIZE-1);
	int err;

	/* Block boundary? Nothing to do */
	if (!length)
		return 0;
	BUG_ON((offset + length) > PAGE_CACHE_SIZE);

	memset(&bh, 0, sizeof(bh));
	bh.b_bdev = inode->i_sb->s_bdev;
	bh.b_size = PAGE_CACHE_SIZE;
	err = get_block(inode, index, &bh, 0);
	if (err < 0)
		return err;
	if (buffer_written(&bh)) {
		struct block_device *bdev = bh.b_bdev;
		struct blk_dax_ctl dax = {
			.sector = to_sector(&bh, inode),
			.size = PAGE_CACHE_SIZE,
		};

		if (dax_map_atomic(bdev, &dax) < 0)
			return PTR_ERR(dax.addr);
		clear_pmem(dax.addr + offset, length);
		wmb_pmem();
		dax_unmap_atomic(bdev, &dax);
	}

	return 0;
}
EXPORT_SYMBOL_GPL(dax_zero_page_range);

/**
 * dax_truncate_page - handle a partial page being truncated in a DAX file
 * @inode: The file being truncated
 * @from: The file offset that is being truncated to
 * @get_block: The filesystem method used to translate file offsets to blocks
 *
 * Similar to block_truncate_page(), this function can be called by a
 * filesystem when it is truncating a DAX file to handle the partial page.
 *
 * We work in terms of PAGE_CACHE_SIZE here for commonality with
 * block_truncate_page(), but we could go down to PAGE_SIZE if the filesystem
 * took care of disposing of the unnecessary blocks.  Even if the filesystem
 * block size is smaller than PAGE_SIZE, we have to zero the rest of the page
 * since the file might be mmapped.
 */
int dax_truncate_page(struct inode *inode, loff_t from, get_block_t get_block)
{
	unsigned length = PAGE_CACHE_ALIGN(from) - from;
	return dax_zero_page_range(inode, from, length, get_block);
}
EXPORT_SYMBOL_GPL(dax_truncate_page);
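
/*
 * Usage note (illustrative only, not compiled here): a DAX-aware filesystem
 * typically wires the fault helpers above into its vm_operations_struct
 * together with its own get_block_t callback.  The "myfs_*" names below are
 * hypothetical; NULL is passed for @complete_unwritten on the assumption
 * that the filesystem does not hand out unwritten extents:
 *
 *	static int myfs_dax_fault(struct vm_area_struct *vma,
 *				  struct vm_fault *vmf)
 *	{
 *		return dax_fault(vma, vmf, myfs_get_block, NULL);
 *	}
 *
 *	static const struct vm_operations_struct myfs_dax_vm_ops = {
 *		.fault		= myfs_dax_fault,
 *		.page_mkwrite	= myfs_dax_fault,
 *		.pfn_mkwrite	= dax_pfn_mkwrite,
 *	};
 */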