/*
 * fs/dax.c - Direct Access filesystem code
 * Copyright (c) 2013-2014 Intel Corporation
 * Author: Matthew Wilcox <matthew.r.wilcox@intel.com>
 * Author: Ross Zwisler <ross.zwisler@linux.intel.com>
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
 * more details.
 */

#include <linux/atomic.h>
#include <linux/blkdev.h>
#include <linux/buffer_head.h>
#include <linux/dax.h>
#include <linux/fs.h>
#include <linux/genhd.h>
#include <linux/highmem.h>
#include <linux/memcontrol.h>
#include <linux/mm.h>
#include <linux/mutex.h>
#include <linux/pmem.h>
#include <linux/sched.h>
#include <linux/uio.h>
#include <linux/vmstat.h>

int dax_clear_blocks(struct inode *inode, sector_t block, long size)
{
	struct block_device *bdev = inode->i_sb->s_bdev;
	sector_t sector = block << (inode->i_blkbits - 9);

	might_sleep();
	do {
		void __pmem *addr;
		unsigned long pfn;
		long count;

		count = bdev_direct_access(bdev, sector, &addr, &pfn, size);
		if (count < 0)
			return count;
		BUG_ON(size < count);
		while (count > 0) {
			unsigned pgsz = PAGE_SIZE - offset_in_page(addr);
			if (pgsz > count)
				pgsz = count;
			clear_pmem(addr, pgsz);
			addr += pgsz;
			size -= pgsz;
			count -= pgsz;
			BUG_ON(pgsz & 511);
			sector += pgsz / 512;
			cond_resched();
		}
	} while (size);

	wmb_pmem();
	return 0;
}
EXPORT_SYMBOL_GPL(dax_clear_blocks);
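
/*
 * Example (illustrative sketch, not part of this file): a filesystem that
 * supports DAX would typically call dax_clear_blocks() from its block
 * allocation path so that newly allocated blocks never expose stale media
 * contents to a later load or mmap.  The helper names below
 * ("example_alloc_blocks", "example_fs_allocate") are hypothetical:
 *
 *	static int example_alloc_blocks(struct inode *inode, sector_t iblock,
 *					struct buffer_head *bh, int create)
 *	{
 *		int err = example_fs_allocate(inode, iblock, bh, create);
 *
 *		if (!err && create && buffer_new(bh) && IS_DAX(inode))
 *			err = dax_clear_blocks(inode, bh->b_blocknr,
 *					       bh->b_size);
 *		return err;
 *	}
 *
 * Note that dax_clear_blocks() takes the starting filesystem block number
 * and a length in bytes, and issues the wmb_pmem() itself once the whole
 * range has been zeroed.
 */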

static long dax_get_addr(struct buffer_head *bh, void __pmem **addr,
		unsigned blkbits)
{
	unsigned long pfn;
	sector_t sector = bh->b_blocknr << (blkbits - 9);
	return bdev_direct_access(bh->b_bdev, sector, addr, &pfn, bh->b_size);
}

/* the clear_pmem() calls are ordered by a wmb_pmem() in the caller */
static void dax_new_buf(void __pmem *addr, unsigned size, unsigned first,
		loff_t pos, loff_t end)
{
	loff_t final = end - pos + first; /* The final byte of the buffer */

	if (first > 0)
		clear_pmem(addr, first);
	if (final < size)
		clear_pmem(addr + final, size - final);
}

static bool buffer_written(struct buffer_head *bh)
{
	return buffer_mapped(bh) && !buffer_unwritten(bh);
}

/*
 * When ext4 encounters a hole, it returns without modifying the buffer_head
 * which means that we can't trust b_size.  To cope with this, we set b_state
 * to 0 before calling get_block and, if any bit is set, we know we can trust
 * b_size.  Unfortunate, really, since ext4 knows precisely how long a hole is
 * and would save us time calling get_block repeatedly.
 */
static bool buffer_size_valid(struct buffer_head *bh)
{
	return bh->b_state != 0;
}

static ssize_t dax_io(struct inode *inode, struct iov_iter *iter,
		      loff_t start, loff_t end, get_block_t get_block,
		      struct buffer_head *bh)
{
	ssize_t retval = 0;
	loff_t pos = start;
	loff_t max = start;
	loff_t bh_max = start;
	void __pmem *addr;
	bool hole = false;
	bool need_wmb = false;

	if (iov_iter_rw(iter) != WRITE)
		end = min(end, i_size_read(inode));

	while (pos < end) {
		size_t len;
		if (pos == max) {
			unsigned blkbits = inode->i_blkbits;
			long page = pos >> PAGE_SHIFT;
			sector_t block = page << (PAGE_SHIFT - blkbits);
			unsigned first = pos - (block << blkbits);
			long size;

			if (pos == bh_max) {
				bh->b_size = PAGE_ALIGN(end - pos);
				bh->b_state = 0;
				retval = get_block(inode, block, bh,
						   iov_iter_rw(iter) == WRITE);
				if (retval)
					break;
				if (!buffer_size_valid(bh))
					bh->b_size = 1 << blkbits;
				bh_max = pos - first + bh->b_size;
			} else {
				unsigned done = bh->b_size -
					(bh_max - (pos - first));
				bh->b_blocknr += done >> blkbits;
				bh->b_size -= done;
			}

			hole = iov_iter_rw(iter) != WRITE && !buffer_written(bh);
			if (hole) {
				addr = NULL;
				size = bh->b_size - first;
			} else {
				retval = dax_get_addr(bh, &addr, blkbits);
				if (retval < 0)
					break;
				if (buffer_unwritten(bh) || buffer_new(bh)) {
					dax_new_buf(addr, retval, first, pos,
							end);
					need_wmb = true;
				}
				addr += first;
				size = retval - first;
			}
			max = min(pos + size, end);
		}

		if (iov_iter_rw(iter) == WRITE) {
			len = copy_from_iter_pmem(addr, max - pos, iter);
			need_wmb = true;
		} else if (!hole)
			len = copy_to_iter((void __force *)addr, max - pos,
					iter);
		else
			len = iov_iter_zero(max - pos, iter);

		if (!len)
			break;

		pos += len;
		addr += len;
	}

	if (need_wmb)
		wmb_pmem();

	return (pos == start) ? retval : pos - start;
}

/**
 * dax_do_io - Perform I/O to a DAX file
 * @iocb: The control block for this I/O
 * @inode: The file which the I/O is directed at
 * @iter: The addresses to do I/O from or to
 * @pos: The file offset where the I/O starts
 * @get_block: The filesystem method used to translate file offsets to blocks
 * @end_io: A filesystem callback for I/O completion
 * @flags: See below
 *
 * This function uses the same locking scheme as do_blockdev_direct_IO:
 * If @flags has DIO_LOCKING set, we assume that the i_mutex is held by the
 * caller for writes.  For reads, we take and release the i_mutex ourselves.
 * If DIO_LOCKING is not set, the filesystem takes care of its own locking.
 * As with do_blockdev_direct_IO(), we increment i_dio_count while the I/O
 * is in progress.
 */
ssize_t dax_do_io(struct kiocb *iocb, struct inode *inode,
		  struct iov_iter *iter, loff_t pos, get_block_t get_block,
		  dio_iodone_t end_io, int flags)
{
	struct buffer_head bh;
	ssize_t retval = -EINVAL;
	loff_t end = pos + iov_iter_count(iter);

	memset(&bh, 0, sizeof(bh));

	if ((flags & DIO_LOCKING) && iov_iter_rw(iter) == READ) {
		struct address_space *mapping = inode->i_mapping;
		mutex_lock(&inode->i_mutex);
		retval = filemap_write_and_wait_range(mapping, pos, end - 1);
		if (retval) {
			mutex_unlock(&inode->i_mutex);
			goto out;
		}
	}

	/* Protects against truncate */
	if (!(flags & DIO_SKIP_DIO_COUNT))
		inode_dio_begin(inode);

	retval = dax_io(inode, iter, pos, end, get_block, &bh);

	if ((flags & DIO_LOCKING) && iov_iter_rw(iter) == READ)
		mutex_unlock(&inode->i_mutex);

	if ((retval > 0) && end_io)
		end_io(iocb, pos, retval, bh.b_private);

	if (!(flags & DIO_SKIP_DIO_COUNT))
		inode_dio_end(inode);
out:
	return retval;
}
EXPORT_SYMBOL_GPL(dax_do_io);
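
/*
 * Example (illustrative sketch, not part of this file): a filesystem's
 * ->direct_IO method can route DAX inodes through dax_do_io() and fall back
 * to the regular direct I/O path otherwise.  "example_get_block" stands in
 * for the filesystem's own get_block_t implementation:
 *
 *	static ssize_t example_direct_IO(struct kiocb *iocb,
 *					 struct iov_iter *iter, loff_t offset)
 *	{
 *		struct inode *inode = file_inode(iocb->ki_filp);
 *
 *		if (IS_DAX(inode))
 *			return dax_do_io(iocb, inode, iter, offset,
 *					 example_get_block, NULL, DIO_LOCKING);
 *		return blockdev_direct_IO(iocb, inode, iter, offset,
 *					  example_get_block);
 *	}
 *
 * Passing NULL for @end_io is fine when the filesystem has no completion
 * work to do; DIO_LOCKING asks dax_do_io() to take i_mutex for reads, as
 * described in the kernel-doc above.
 */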

/*
 * The user has performed a load from a hole in the file.  Allocating
 * a new page in the file would cause excessive storage usage for
 * workloads with sparse files.  We allocate a page cache page instead.
 * We'll kick it out of the page cache if it's ever written to,
 * otherwise it will simply fall out of the page cache under memory
 * pressure without ever having been dirtied.
 */
static int dax_load_hole(struct address_space *mapping, struct page *page,
							struct vm_fault *vmf)
{
	unsigned long size;
	struct inode *inode = mapping->host;
	if (!page)
		page = find_or_create_page(mapping, vmf->pgoff,
						GFP_KERNEL | __GFP_ZERO);
	if (!page)
		return VM_FAULT_OOM;
	/* Recheck i_size under page lock to avoid truncate race */
	size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
	if (vmf->pgoff >= size) {
		unlock_page(page);
		page_cache_release(page);
		return VM_FAULT_SIGBUS;
	}

	vmf->page = page;
	return VM_FAULT_LOCKED;
}

static int copy_user_bh(struct page *to, struct buffer_head *bh,
			unsigned blkbits, unsigned long vaddr)
{
	void __pmem *vfrom;
	void *vto;

	if (dax_get_addr(bh, &vfrom, blkbits) < 0)
		return -EIO;
	vto = kmap_atomic(to);
	copy_user_page(vto, (void __force *)vfrom, vaddr, to);
	kunmap_atomic(vto);
	return 0;
}

static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh,
			struct vm_area_struct *vma, struct vm_fault *vmf)
{
	sector_t sector = bh->b_blocknr << (inode->i_blkbits - 9);
	unsigned long vaddr = (unsigned long)vmf->virtual_address;
	void __pmem *addr;
	unsigned long pfn;
	pgoff_t size;
	int error;

	/*
	 * Check truncate didn't happen while we were allocating a block.
	 * If it did, this block may or may not be still allocated to the
	 * file.  We can't tell the filesystem to free it because we can't
	 * take i_mutex here.  In the worst case, the file still has blocks
	 * allocated past the end of the file.
	 */
	size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
	if (unlikely(vmf->pgoff >= size)) {
		error = -EIO;
		goto out;
	}

	error = bdev_direct_access(bh->b_bdev, sector, &addr, &pfn, bh->b_size);
	if (error < 0)
		goto out;
	if (error < PAGE_SIZE) {
		error = -EIO;
		goto out;
	}

	if (buffer_unwritten(bh) || buffer_new(bh)) {
		clear_pmem(addr, PAGE_SIZE);
		wmb_pmem();
	}

	error = vm_insert_mixed(vma, vaddr, pfn);

out:
	return error;
}

/**
 * __dax_fault - handle a page fault on a DAX file
 * @vma: The virtual memory area where the fault occurred
 * @vmf: The description of the fault
 * @get_block: The filesystem method used to translate file offsets to blocks
 * @complete_unwritten: The filesystem method used to convert unwritten blocks
 *	to written so the data written to them is exposed.  This is required
 *	by write faults for filesystems that will return unwritten extent
 *	mappings from @get_block, but it is optional for reads as
 *	dax_insert_mapping() will always zero unwritten blocks.  If the fs
 *	does not support unwritten extents, it should pass NULL.
 *
 * When a page fault occurs, filesystems may call this helper in their
 * fault handler for DAX files. __dax_fault() assumes the caller has done all
 * the necessary locking for the page fault to proceed successfully.
 */
int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
			get_block_t get_block, dax_iodone_t complete_unwritten)
{
	struct file *file = vma->vm_file;
	struct address_space *mapping = file->f_mapping;
	struct inode *inode = mapping->host;
	struct page *page;
	struct buffer_head bh;
	unsigned long vaddr = (unsigned long)vmf->virtual_address;
	unsigned blkbits = inode->i_blkbits;
	sector_t block;
	pgoff_t size;
	int error;
	int major = 0;

	size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
	if (vmf->pgoff >= size)
		return VM_FAULT_SIGBUS;

	memset(&bh, 0, sizeof(bh));
	block = (sector_t)vmf->pgoff << (PAGE_SHIFT - blkbits);
	bh.b_size = PAGE_SIZE;

repeat:
	page = find_get_page(mapping, vmf->pgoff);
	if (page) {
		if (!lock_page_or_retry(page, vma->vm_mm, vmf->flags)) {
			page_cache_release(page);
			return VM_FAULT_RETRY;
		}
		if (unlikely(page->mapping != mapping)) {
			unlock_page(page);
			page_cache_release(page);
			goto repeat;
		}
		size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
		if (unlikely(vmf->pgoff >= size)) {
			/*
			 * We have a struct page covering a hole in the file
			 * from a read fault and we've raced with a truncate
			 */
			error = -EIO;
			goto unlock;
		}
	} else {
		i_mmap_lock_write(mapping);
	}

	error = get_block(inode, block, &bh, 0);
	if (!error && (bh.b_size < PAGE_SIZE))
		error = -EIO;		/* fs corruption? */
	if (error)
		goto unlock;

	if (!buffer_mapped(&bh) && !buffer_unwritten(&bh) && !vmf->cow_page) {
		if (vmf->flags & FAULT_FLAG_WRITE) {
			error = get_block(inode, block, &bh, 1);
			count_vm_event(PGMAJFAULT);
			mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
			major = VM_FAULT_MAJOR;
			if (!error && (bh.b_size < PAGE_SIZE))
				error = -EIO;
			if (error)
				goto unlock;
		} else {
			i_mmap_unlock_write(mapping);
			return dax_load_hole(mapping, page, vmf);
		}
	}

	if (vmf->cow_page) {
		struct page *new_page = vmf->cow_page;
		if (buffer_written(&bh))
			error = copy_user_bh(new_page, &bh, blkbits, vaddr);
		else
			clear_user_highpage(new_page, vaddr);
		if (error)
			goto unlock;
		vmf->page = page;
		if (!page) {
			/* Check we didn't race with truncate */
			size = (i_size_read(inode) + PAGE_SIZE - 1) >>
								PAGE_SHIFT;
			if (vmf->pgoff >= size) {
				error = -EIO;
				goto unlock;
			}
		}
		return VM_FAULT_LOCKED;
	}

	/* Check we didn't race with a read fault installing a new page */
	if (!page && major)
		page = find_lock_page(mapping, vmf->pgoff);

	if (page) {
		unmap_mapping_range(mapping, vmf->pgoff << PAGE_SHIFT,
							PAGE_CACHE_SIZE, 0);
		delete_from_page_cache(page);
		unlock_page(page);
		page_cache_release(page);
	}

	/*
	 * If we successfully insert the new mapping over an unwritten extent,
	 * we need to ensure we convert the unwritten extent.  If there is an
	 * error inserting the mapping, the filesystem needs to leave it as
	 * unwritten to prevent exposure of the stale underlying data to
	 * userspace, but we still need to call the completion function so
	 * the private resources on the mapping buffer can be released.  We
	 * indicate what the callback should do via the uptodate variable,
	 * same as for normal BH based IO completions.
	 */
	error = dax_insert_mapping(inode, &bh, vma, vmf);
	if (buffer_unwritten(&bh)) {
		if (complete_unwritten)
			complete_unwritten(&bh, !error);
		else
			WARN_ON_ONCE(!(vmf->flags & FAULT_FLAG_WRITE));
	}

	if (!page)
		i_mmap_unlock_write(mapping);
out:
	if (error == -ENOMEM)
		return VM_FAULT_OOM | major;
	/* -EBUSY is fine, somebody else faulted on the same PTE */
	if ((error < 0) && (error != -EBUSY))
		return VM_FAULT_SIGBUS | major;
	return VM_FAULT_NOPAGE | major;

unlock:
	if (page) {
		unlock_page(page);
		page_cache_release(page);
	} else {
		i_mmap_unlock_write(mapping);
	}

	goto out;
}
EXPORT_SYMBOL(__dax_fault);

/**
 * dax_fault - handle a page fault on a DAX file
 * @vma: The virtual memory area where the fault occurred
 * @vmf: The description of the fault
 * @get_block: The filesystem method used to translate file offsets to blocks
 * @complete_unwritten: The filesystem method used to convert unwritten blocks
 *	to written so the data written to them is exposed.  See __dax_fault().
 *
 * When a page fault occurs, filesystems may call this helper in their
 * fault handler for DAX files.
 */
int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
	      get_block_t get_block, dax_iodone_t complete_unwritten)
{
	int result;
	struct super_block *sb = file_inode(vma->vm_file)->i_sb;

	if (vmf->flags & FAULT_FLAG_WRITE) {
		sb_start_pagefault(sb);
		file_update_time(vma->vm_file);
	}
	result = __dax_fault(vma, vmf, get_block, complete_unwritten);
	if (vmf->flags & FAULT_FLAG_WRITE)
		sb_end_pagefault(sb);

	return result;
}
EXPORT_SYMBOL_GPL(dax_fault);
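
/*
 * Example (illustrative sketch, not part of this file): a filesystem's
 * ->fault handler for DAX files is typically a thin wrapper around
 * dax_fault().  The names "example_dax_fault", "example_get_block" and
 * "example_convert_unwritten" are hypothetical; a filesystem that never
 * returns unwritten extents from its get_block would pass NULL instead of
 * a conversion callback:
 *
 *	static int example_dax_fault(struct vm_area_struct *vma,
 *				     struct vm_fault *vmf)
 *	{
 *		return dax_fault(vma, vmf, example_get_block,
 *				 example_convert_unwritten);
 *	}
 */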

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/*
 * The 'colour' (ie low bits) within a PMD of a page offset.  This comes up
 * more often than one might expect in the below function.
 */
#define PG_PMD_COLOUR	((PMD_SIZE >> PAGE_SHIFT) - 1)

int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
		pmd_t *pmd, unsigned int flags, get_block_t get_block,
		dax_iodone_t complete_unwritten)
{
	struct file *file = vma->vm_file;
	struct address_space *mapping = file->f_mapping;
	struct inode *inode = mapping->host;
	struct buffer_head bh;
	unsigned blkbits = inode->i_blkbits;
	unsigned long pmd_addr = address & PMD_MASK;
	bool write = flags & FAULT_FLAG_WRITE;
	long length;
	void __pmem *kaddr;
	pgoff_t size, pgoff;
	sector_t block, sector;
	unsigned long pfn;
	int result = 0;

	/* Fall back to PTEs if we're going to COW */
	if (write && !(vma->vm_flags & VM_SHARED))
		return VM_FAULT_FALLBACK;
	/* If the PMD would extend outside the VMA */
	if (pmd_addr < vma->vm_start)
		return VM_FAULT_FALLBACK;
	if ((pmd_addr + PMD_SIZE) > vma->vm_end)
		return VM_FAULT_FALLBACK;

	pgoff = linear_page_index(vma, pmd_addr);
	size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
	if (pgoff >= size)
		return VM_FAULT_SIGBUS;
	/* If the PMD would cover blocks out of the file */
	if ((pgoff | PG_PMD_COLOUR) >= size)
		return VM_FAULT_FALLBACK;

	memset(&bh, 0, sizeof(bh));
	block = (sector_t)pgoff << (PAGE_SHIFT - blkbits);

	bh.b_size = PMD_SIZE;
	i_mmap_lock_write(mapping);
	length = get_block(inode, block, &bh, write);
	if (length)
		return VM_FAULT_SIGBUS;

	/*
	 * If the filesystem isn't willing to tell us the length of a hole,
	 * just fall back to PTEs.  Calling get_block 512 times in a loop
	 * would be silly.
	 */
	if (!buffer_size_valid(&bh) || bh.b_size < PMD_SIZE)
		goto fallback;

	sector = bh.b_blocknr << (blkbits - 9);

	if (buffer_unwritten(&bh) || buffer_new(&bh)) {
		int i;

		length = bdev_direct_access(bh.b_bdev, sector, &kaddr, &pfn,
						bh.b_size);
		if (length < 0) {
			result = VM_FAULT_SIGBUS;
			goto out;
		}
		if ((length < PMD_SIZE) || (pfn & PG_PMD_COLOUR))
			goto fallback;

		for (i = 0; i < PTRS_PER_PMD; i++)
			clear_pmem(kaddr + i * PAGE_SIZE, PAGE_SIZE);
		wmb_pmem();
		count_vm_event(PGMAJFAULT);
		mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
		result |= VM_FAULT_MAJOR;
	}

	/*
	 * If we allocated new storage, make sure no process has any
	 * zero pages covering this hole
	 */
	if (buffer_new(&bh)) {
		i_mmap_unlock_write(mapping);
		unmap_mapping_range(mapping, pgoff << PAGE_SHIFT, PMD_SIZE, 0);
		i_mmap_lock_write(mapping);
	}

	/*
	 * If a truncate happened while we were allocating blocks, we may
	 * leave blocks allocated to the file that are beyond EOF.  We can't
	 * take i_mutex here, so just leave them hanging; they'll be freed
	 * when the file is deleted.
	 */
	size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
	if (pgoff >= size) {
		result = VM_FAULT_SIGBUS;
		goto out;
	}
	if ((pgoff | PG_PMD_COLOUR) >= size)
		goto fallback;

	if (!write && !buffer_mapped(&bh) && buffer_uptodate(&bh)) {
		spinlock_t *ptl;
		pmd_t entry;
		struct page *zero_page = get_huge_zero_page();

		if (unlikely(!zero_page))
			goto fallback;

		ptl = pmd_lock(vma->vm_mm, pmd);
		if (!pmd_none(*pmd)) {
			spin_unlock(ptl);
			goto fallback;
		}

		entry = mk_pmd(zero_page, vma->vm_page_prot);
		entry = pmd_mkhuge(entry);
		set_pmd_at(vma->vm_mm, pmd_addr, pmd, entry);
		result = VM_FAULT_NOPAGE;
		spin_unlock(ptl);
	} else {
		length = bdev_direct_access(bh.b_bdev, sector, &kaddr, &pfn,
						bh.b_size);
		if (length < 0) {
			result = VM_FAULT_SIGBUS;
			goto out;
		}
		if ((length < PMD_SIZE) || (pfn & PG_PMD_COLOUR))
			goto fallback;

		result |= vmf_insert_pfn_pmd(vma, address, pmd, pfn, write);
	}

out:
	if (buffer_unwritten(&bh))
		complete_unwritten(&bh, !(result & VM_FAULT_ERROR));

	i_mmap_unlock_write(mapping);

	return result;

fallback:
	count_vm_event(THP_FAULT_FALLBACK);
	result = VM_FAULT_FALLBACK;
	goto out;
}
EXPORT_SYMBOL_GPL(__dax_pmd_fault);

/**
 * dax_pmd_fault - handle a PMD fault on a DAX file
 * @vma: The virtual memory area where the fault occurred
 * @address: The virtual address at which the fault occurred
 * @pmd: The PMD entry in the page table to fill in
 * @flags: The FAULT_FLAG_* flags describing the fault
 * @get_block: The filesystem method used to translate file offsets to blocks
 * @complete_unwritten: The filesystem method used to convert unwritten blocks
 *	to written so the data written to them is exposed.  See __dax_fault().
 *
 * When a page fault occurs, filesystems may call this helper in their
 * pmd_fault handler for DAX files.
 */
int dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
			pmd_t *pmd, unsigned int flags, get_block_t get_block,
			dax_iodone_t complete_unwritten)
{
	int result;
	struct super_block *sb = file_inode(vma->vm_file)->i_sb;

	if (flags & FAULT_FLAG_WRITE) {
		sb_start_pagefault(sb);
		file_update_time(vma->vm_file);
	}
	result = __dax_pmd_fault(vma, address, pmd, flags, get_block,
				complete_unwritten);
	if (flags & FAULT_FLAG_WRITE)
		sb_end_pagefault(sb);

	return result;
}
EXPORT_SYMBOL_GPL(dax_pmd_fault);
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */

/**
 * dax_pfn_mkwrite - handle first write to DAX page
 * @vma: The virtual memory area where the fault occurred
 * @vmf: The description of the fault
 */
int dax_pfn_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	struct super_block *sb = file_inode(vma->vm_file)->i_sb;

	sb_start_pagefault(sb);
	file_update_time(vma->vm_file);
	sb_end_pagefault(sb);
	return VM_FAULT_NOPAGE;
}
EXPORT_SYMBOL_GPL(dax_pfn_mkwrite);
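
/*
 * Example (illustrative sketch, not part of this file): the fault helpers
 * above are normally wired up through a DAX-specific vm_operations_struct
 * in the filesystem's ->mmap method.  All "example_*" names are
 * hypothetical:
 *
 *	static const struct vm_operations_struct example_dax_vm_ops = {
 *		.fault		= example_dax_fault,
 *		.pmd_fault	= example_dax_pmd_fault,
 *		.page_mkwrite	= example_dax_fault,
 *		.pfn_mkwrite	= dax_pfn_mkwrite,
 *	};
 *
 *	static int example_file_mmap(struct file *file,
 *				     struct vm_area_struct *vma)
 *	{
 *		if (!IS_DAX(file_inode(file)))
 *			return generic_file_mmap(file, vma);
 *
 *		file_accessed(file);
 *		vma->vm_ops = &example_dax_vm_ops;
 *		vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE;
 *		return 0;
 *	}
 */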

/**
 * dax_zero_page_range - zero a range within a page of a DAX file
 * @inode: The file being truncated
 * @from: The file offset that is being truncated to
 * @length: The number of bytes to zero
 * @get_block: The filesystem method used to translate file offsets to blocks
 *
 * This function can be called by a filesystem when it is zeroing part of a
 * page in a DAX file.  This is intended for hole-punch operations.  If
 * you are truncating a file, the helper function dax_truncate_page() may be
 * more convenient.
 *
 * We work in terms of PAGE_CACHE_SIZE here for commonality with
 * block_truncate_page(), but we could go down to PAGE_SIZE if the filesystem
 * took care of disposing of the unnecessary blocks.  Even if the filesystem
 * block size is smaller than PAGE_SIZE, we have to zero the rest of the page
 * since the file might be mmapped.
 */
int dax_zero_page_range(struct inode *inode, loff_t from, unsigned length,
							get_block_t get_block)
{
	struct buffer_head bh;
	pgoff_t index = from >> PAGE_CACHE_SHIFT;
	unsigned offset = from & (PAGE_CACHE_SIZE-1);
	int err;

	/* Block boundary? Nothing to do */
	if (!length)
		return 0;
	BUG_ON((offset + length) > PAGE_CACHE_SIZE);

	memset(&bh, 0, sizeof(bh));
	bh.b_size = PAGE_CACHE_SIZE;
	err = get_block(inode, index, &bh, 0);
	if (err < 0)
		return err;
	if (buffer_written(&bh)) {
		void __pmem *addr;
		err = dax_get_addr(&bh, &addr, inode->i_blkbits);
		if (err < 0)
			return err;
		clear_pmem(addr + offset, length);
		wmb_pmem();
	}

	return 0;
}
EXPORT_SYMBOL_GPL(dax_zero_page_range);
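
/*
 * Example (illustrative sketch, not part of this file): hole-punch code can
 * use dax_zero_page_range() to zero the partial page at the start of the
 * punched range (the partial page at the end is handled the same way)
 * before deallocating the whole blocks in between.  "example_get_block" is
 * hypothetical, and the sketch assumes the range spans more than one page:
 *
 *	static int example_zero_punch_head(struct inode *inode, loff_t offset)
 *	{
 *		unsigned partial = offset & (PAGE_CACHE_SIZE - 1);
 *
 *		if (!partial)
 *			return 0;
 *		return dax_zero_page_range(inode, offset,
 *					   PAGE_CACHE_SIZE - partial,
 *					   example_get_block);
 *	}
 */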

/**
 * dax_truncate_page - handle a partial page being truncated in a DAX file
 * @inode: The file being truncated
 * @from: The file offset that is being truncated to
 * @get_block: The filesystem method used to translate file offsets to blocks
 *
 * Similar to block_truncate_page(), this function can be called by a
 * filesystem when it is truncating a DAX file to handle the partial page.
 *
 * We work in terms of PAGE_CACHE_SIZE here for commonality with
 * block_truncate_page(), but we could go down to PAGE_SIZE if the filesystem
 * took care of disposing of the unnecessary blocks.  Even if the filesystem
 * block size is smaller than PAGE_SIZE, we have to zero the rest of the page
 * since the file might be mmapped.
 */
int dax_truncate_page(struct inode *inode, loff_t from, get_block_t get_block)
{
	unsigned length = PAGE_CACHE_ALIGN(from) - from;
	return dax_zero_page_range(inode, from, length, get_block);
}
EXPORT_SYMBOL_GPL(dax_truncate_page);
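
/*
 * Example (illustrative sketch, not part of this file): a filesystem's
 * truncate path typically zeroes the new partial last page before updating
 * i_size, choosing dax_truncate_page() or block_truncate_page() depending
 * on whether the inode uses DAX.  "example_get_block" is hypothetical, and
 * freeing of the truncated blocks is omitted:
 *
 *	static int example_setsize(struct inode *inode, loff_t newsize)
 *	{
 *		int error;
 *
 *		if (IS_DAX(inode))
 *			error = dax_truncate_page(inode, newsize,
 *						  example_get_block);
 *		else
 *			error = block_truncate_page(inode->i_mapping,
 *						    newsize, example_get_block);
 *		if (error)
 *			return error;
 *
 *		truncate_setsize(inode, newsize);
 *		return 0;
 *	}
 */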