/*
 * fs/dax.c - Direct Access filesystem code
 * Copyright (c) 2013-2014 Intel Corporation
 * Author: Matthew Wilcox <matthew.r.wilcox@intel.com>
 * Author: Ross Zwisler <ross.zwisler@linux.intel.com>
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
 * more details.
 */

#include <linux/atomic.h>
#include <linux/blkdev.h>
#include <linux/buffer_head.h>
#include <linux/fs.h>
#include <linux/genhd.h>
#include <linux/highmem.h>
#include <linux/memcontrol.h>
#include <linux/mm.h>
#include <linux/mutex.h>
#include <linux/sched.h>
#include <linux/uio.h>
#include <linux/vmstat.h>

int dax_clear_blocks(struct inode *inode, sector_t block, long size)
{
	struct block_device *bdev = inode->i_sb->s_bdev;
	sector_t sector = block << (inode->i_blkbits - 9);

	might_sleep();
	do {
		void *addr;
		unsigned long pfn;
		long count;

		count = bdev_direct_access(bdev, sector, &addr, &pfn, size);
		if (count < 0)
			return count;
		BUG_ON(size < count);
		while (count > 0) {
			unsigned pgsz = PAGE_SIZE - offset_in_page(addr);
			if (pgsz > count)
				pgsz = count;
			if (pgsz < PAGE_SIZE)
				memset(addr, 0, pgsz);
			else
				clear_page(addr);
			addr += pgsz;
			size -= pgsz;
			count -= pgsz;
			BUG_ON(pgsz & 511);
			sector += pgsz / 512;
			cond_resched();
		}
	} while (size);

	return 0;
}
EXPORT_SYMBOL_GPL(dax_clear_blocks);
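
/*
 * Example (illustrative sketch, not part of the original file): a
 * filesystem's block-allocation path might call dax_clear_blocks() to zero
 * freshly allocated storage before the new block is linked into its
 * metadata, so stale media contents are never exposed through a DAX
 * mapping.  example_zero_new_block() is a hypothetical helper; ext2, for
 * instance, does something similar from its get_block path.
 */
static inline int example_zero_new_block(struct inode *inode, sector_t block)
{
	/* @size is in bytes: one filesystem block's worth */
	return dax_clear_blocks(inode, block, 1 << inode->i_blkbits);
}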

static long dax_get_addr(struct buffer_head *bh, void **addr, unsigned blkbits)
{
	unsigned long pfn;
	sector_t sector = bh->b_blocknr << (blkbits - 9);
	return bdev_direct_access(bh->b_bdev, sector, addr, &pfn, bh->b_size);
}

static void dax_new_buf(void *addr, unsigned size, unsigned first, loff_t pos,
			loff_t end)
{
	loff_t final = end - pos + first; /* The final byte of the buffer */

	if (first > 0)
		memset(addr, 0, first);
	if (final < size)
		memset(addr + final, 0, size - final);
}

static bool buffer_written(struct buffer_head *bh)
{
	return buffer_mapped(bh) && !buffer_unwritten(bh);
}

/*
 * When ext4 encounters a hole, it returns without modifying the buffer_head
 * which means that we can't trust b_size.  To cope with this, we set b_state
 * to 0 before calling get_block and, if any bit is set, we know we can trust
 * b_size.  Unfortunate, really, since ext4 knows precisely how long a hole is
 * and would save us time calling get_block repeatedly.
 */
static bool buffer_size_valid(struct buffer_head *bh)
{
	return bh->b_state != 0;
}

static ssize_t dax_io(struct inode *inode, struct iov_iter *iter,
			loff_t start, loff_t end, get_block_t get_block,
			struct buffer_head *bh)
{
	ssize_t retval = 0;
	loff_t pos = start;
	loff_t max = start;
	loff_t bh_max = start;
	void *addr;
	bool hole = false;

	if (iov_iter_rw(iter) != WRITE)
		end = min(end, i_size_read(inode));

	while (pos < end) {
		unsigned len;
		if (pos == max) {
			unsigned blkbits = inode->i_blkbits;
			sector_t block = pos >> blkbits;
			unsigned first = pos - (block << blkbits);
			long size;

			if (pos == bh_max) {
				bh->b_size = PAGE_ALIGN(end - pos);
				bh->b_state = 0;
				retval = get_block(inode, block, bh,
						   iov_iter_rw(iter) == WRITE);
				if (retval)
					break;
				if (!buffer_size_valid(bh))
					bh->b_size = 1 << blkbits;
				bh_max = pos - first + bh->b_size;
			} else {
				unsigned done = bh->b_size -
					(bh_max - (pos - first));
				bh->b_blocknr += done >> blkbits;
				bh->b_size -= done;
			}

			hole = iov_iter_rw(iter) != WRITE && !buffer_written(bh);
			if (hole) {
				addr = NULL;
				size = bh->b_size - first;
			} else {
				retval = dax_get_addr(bh, &addr, blkbits);
				if (retval < 0)
					break;
				if (buffer_unwritten(bh) || buffer_new(bh))
					dax_new_buf(addr, retval, first, pos,
							end);
				addr += first;
				size = retval - first;
			}
			max = min(pos + size, end);
		}

		if (iov_iter_rw(iter) == WRITE)
			len = copy_from_iter_nocache(addr, max - pos, iter);
		else if (!hole)
			len = copy_to_iter(addr, max - pos, iter);
		else
			len = iov_iter_zero(max - pos, iter);

		if (!len)
			break;

		pos += len;
		addr += len;
	}

	return (pos == start) ? retval : pos - start;
}

/**
 * dax_do_io - Perform I/O to a DAX file
 * @iocb: The control block for this I/O
 * @inode: The file which the I/O is directed at
 * @iter: The addresses to do I/O from or to
 * @pos: The file offset where the I/O starts
 * @get_block: The filesystem method used to translate file offsets to blocks
 * @end_io: A filesystem callback for I/O completion
 * @flags: See below
 *
 * This function uses the same locking scheme as do_blockdev_direct_IO:
 * If @flags has DIO_LOCKING set, we assume that the i_mutex is held by the
 * caller for writes.  For reads, we take and release the i_mutex ourselves.
 * If DIO_LOCKING is not set, the filesystem takes care of its own locking.
 * As with do_blockdev_direct_IO(), we increment i_dio_count while the I/O
 * is in progress.
 */
ssize_t dax_do_io(struct kiocb *iocb, struct inode *inode,
		  struct iov_iter *iter, loff_t pos, get_block_t get_block,
		  dio_iodone_t end_io, int flags)
{
	struct buffer_head bh;
	ssize_t retval = -EINVAL;
	loff_t end = pos + iov_iter_count(iter);

	memset(&bh, 0, sizeof(bh));

	if ((flags & DIO_LOCKING) && iov_iter_rw(iter) == READ) {
		struct address_space *mapping = inode->i_mapping;
		mutex_lock(&inode->i_mutex);
		retval = filemap_write_and_wait_range(mapping, pos, end - 1);
		if (retval) {
			mutex_unlock(&inode->i_mutex);
			goto out;
		}
	}

	/* Protects against truncate */
	if (!(flags & DIO_SKIP_DIO_COUNT))
		inode_dio_begin(inode);

	retval = dax_io(inode, iter, pos, end, get_block, &bh);

	if ((flags & DIO_LOCKING) && iov_iter_rw(iter) == READ)
		mutex_unlock(&inode->i_mutex);

	if ((retval > 0) && end_io)
		end_io(iocb, pos, retval, bh.b_private);

	if (!(flags & DIO_SKIP_DIO_COUNT))
		inode_dio_end(inode);
out:
	return retval;
}
EXPORT_SYMBOL_GPL(dax_do_io);
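
/*
 * Example (illustrative sketch, not part of the original file): a minimal
 * ->direct_IO address_space operation for a DAX-capable filesystem, in the
 * spirit of ext2's.  example_direct_IO() and example_get_block() are
 * hypothetical names; a real implementation would fall back to the normal
 * direct-IO path (e.g. blockdev_direct_IO()) for non-DAX inodes and pass an
 * end_io callback if it has per-I/O completion work to do.
 */
extern int example_get_block(struct inode *inode, sector_t iblock,
			     struct buffer_head *bh_result, int create);

static inline ssize_t example_direct_IO(struct kiocb *iocb,
					struct iov_iter *iter, loff_t offset)
{
	struct inode *inode = file_inode(iocb->ki_filp);

	if (!IS_DAX(inode))
		return -EINVAL;	/* a real fs would use its non-DAX path */

	/* DIO_LOCKING: same i_mutex rules as do_blockdev_direct_IO() */
	return dax_do_io(iocb, inode, iter, offset, example_get_block,
			 NULL, DIO_LOCKING);
}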

/*
 * The user has performed a load from a hole in the file.  Allocating
 * a new page in the file would cause excessive storage usage for
 * workloads with sparse files.  We allocate a page cache page instead.
 * We'll kick it out of the page cache if it's ever written to,
 * otherwise it will simply fall out of the page cache under memory
 * pressure without ever having been dirtied.
 */
static int dax_load_hole(struct address_space *mapping, struct page *page,
							struct vm_fault *vmf)
{
	unsigned long size;
	struct inode *inode = mapping->host;
	if (!page)
		page = find_or_create_page(mapping, vmf->pgoff,
						GFP_KERNEL | __GFP_ZERO);
	if (!page)
		return VM_FAULT_OOM;
	/* Recheck i_size under page lock to avoid truncate race */
	size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
	if (vmf->pgoff >= size) {
		unlock_page(page);
		page_cache_release(page);
		return VM_FAULT_SIGBUS;
	}

	vmf->page = page;
	return VM_FAULT_LOCKED;
}

static int copy_user_bh(struct page *to, struct buffer_head *bh,
			unsigned blkbits, unsigned long vaddr)
{
	void *vfrom, *vto;
	if (dax_get_addr(bh, &vfrom, blkbits) < 0)
		return -EIO;
	vto = kmap_atomic(to);
	copy_user_page(vto, vfrom, vaddr, to);
	kunmap_atomic(vto);
	return 0;
}

static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh,
			struct vm_area_struct *vma, struct vm_fault *vmf)
{
	struct address_space *mapping = inode->i_mapping;
	sector_t sector = bh->b_blocknr << (inode->i_blkbits - 9);
	unsigned long vaddr = (unsigned long)vmf->virtual_address;
	void *addr;
	unsigned long pfn;
	pgoff_t size;
	int error;

	i_mmap_lock_read(mapping);

	/*
	 * Check truncate didn't happen while we were allocating a block.
	 * If it did, this block may or may not be still allocated to the
	 * file.  We can't tell the filesystem to free it because we can't
	 * take i_mutex here.  In the worst case, the file still has blocks
	 * allocated past the end of the file.
	 */
	size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
	if (unlikely(vmf->pgoff >= size)) {
		error = -EIO;
		goto out;
	}

	error = bdev_direct_access(bh->b_bdev, sector, &addr, &pfn, bh->b_size);
	if (error < 0)
		goto out;
	if (error < PAGE_SIZE) {
		error = -EIO;
		goto out;
	}

	if (buffer_unwritten(bh) || buffer_new(bh))
		clear_page(addr);

	error = vm_insert_mixed(vma, vaddr, pfn);

out:
	i_mmap_unlock_read(mapping);

	return error;
}

/**
 * __dax_fault - handle a page fault on a DAX file
 * @vma: The virtual memory area where the fault occurred
 * @vmf: The description of the fault
 * @get_block: The filesystem method used to translate file offsets to blocks
 * @complete_unwritten: The filesystem method used to convert unwritten blocks
 *	to written so the data written to them is exposed.  This is required
 *	by write faults for filesystems that will return unwritten extent
 *	mappings from @get_block, but it is optional for reads as
 *	dax_insert_mapping() will always zero unwritten blocks.  If the fs
 *	does not support unwritten extents, then it should pass NULL.
 *
 * When a page fault occurs, filesystems may call this helper in their
 * fault handler for DAX files.  __dax_fault() assumes the caller has done all
 * the necessary locking for the page fault to proceed successfully.
 */
int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
			get_block_t get_block, dax_iodone_t complete_unwritten)
{
	struct file *file = vma->vm_file;
	struct address_space *mapping = file->f_mapping;
	struct inode *inode = mapping->host;
	struct page *page;
	struct buffer_head bh;
	unsigned long vaddr = (unsigned long)vmf->virtual_address;
	unsigned blkbits = inode->i_blkbits;
	sector_t block;
	pgoff_t size;
	int error;
	int major = 0;

	size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
	if (vmf->pgoff >= size)
		return VM_FAULT_SIGBUS;

	memset(&bh, 0, sizeof(bh));
	block = (sector_t)vmf->pgoff << (PAGE_SHIFT - blkbits);
	bh.b_size = PAGE_SIZE;

repeat:
	page = find_get_page(mapping, vmf->pgoff);
	if (page) {
		if (!lock_page_or_retry(page, vma->vm_mm, vmf->flags)) {
			page_cache_release(page);
			return VM_FAULT_RETRY;
		}
		if (unlikely(page->mapping != mapping)) {
			unlock_page(page);
			page_cache_release(page);
			goto repeat;
		}
		size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
		if (unlikely(vmf->pgoff >= size)) {
			/*
			 * We have a struct page covering a hole in the file
			 * from a read fault and we've raced with a truncate
			 */
			error = -EIO;
			goto unlock_page;
		}
	}

	error = get_block(inode, block, &bh, 0);
	if (!error && (bh.b_size < PAGE_SIZE))
		error = -EIO;		/* fs corruption? */
	if (error)
		goto unlock_page;

	if (!buffer_mapped(&bh) && !buffer_unwritten(&bh) && !vmf->cow_page) {
		if (vmf->flags & FAULT_FLAG_WRITE) {
			error = get_block(inode, block, &bh, 1);
			count_vm_event(PGMAJFAULT);
			mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
			major = VM_FAULT_MAJOR;
			if (!error && (bh.b_size < PAGE_SIZE))
				error = -EIO;
			if (error)
				goto unlock_page;
		} else {
			return dax_load_hole(mapping, page, vmf);
		}
	}

	if (vmf->cow_page) {
		struct page *new_page = vmf->cow_page;
		if (buffer_written(&bh))
			error = copy_user_bh(new_page, &bh, blkbits, vaddr);
		else
			clear_user_highpage(new_page, vaddr);
		if (error)
			goto unlock_page;
		vmf->page = page;
		if (!page) {
			i_mmap_lock_read(mapping);
			/* Check we didn't race with truncate */
			size = (i_size_read(inode) + PAGE_SIZE - 1) >>
								PAGE_SHIFT;
			if (vmf->pgoff >= size) {
				i_mmap_unlock_read(mapping);
				error = -EIO;
				goto out;
			}
		}
		return VM_FAULT_LOCKED;
	}

	/* Check we didn't race with a read fault installing a new page */
	if (!page && major)
		page = find_lock_page(mapping, vmf->pgoff);

	if (page) {
		unmap_mapping_range(mapping, vmf->pgoff << PAGE_SHIFT,
							PAGE_CACHE_SIZE, 0);
		delete_from_page_cache(page);
		unlock_page(page);
		page_cache_release(page);
	}

	/*
	 * If we successfully insert the new mapping over an unwritten extent,
	 * we need to ensure we convert the unwritten extent.  If there is an
	 * error inserting the mapping, the filesystem needs to leave it as
	 * unwritten to prevent exposure of the stale underlying data to
	 * userspace, but we still need to call the completion function so
	 * the private resources on the mapping buffer can be released.  We
	 * indicate what the callback should do via the uptodate variable, same
	 * as for normal BH based IO completions.
	 */
	error = dax_insert_mapping(inode, &bh, vma, vmf);
	if (buffer_unwritten(&bh)) {
		if (complete_unwritten)
			complete_unwritten(&bh, !error);
		else
			WARN_ON_ONCE(!(vmf->flags & FAULT_FLAG_WRITE));
	}

out:
	if (error == -ENOMEM)
		return VM_FAULT_OOM | major;
	/* -EBUSY is fine, somebody else faulted on the same PTE */
	if ((error < 0) && (error != -EBUSY))
		return VM_FAULT_SIGBUS | major;
	return VM_FAULT_NOPAGE | major;

unlock_page:
	if (page) {
		unlock_page(page);
		page_cache_release(page);
	}
	goto out;
}
EXPORT_SYMBOL(__dax_fault);
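
/*
 * Example (illustrative sketch, not part of the original file): a filesystem
 * ->fault handler that calls __dax_fault() directly so it can wrap the fault
 * in its own accounting, locking or journalling, in the spirit of ext4/XFS.
 * example_fs_dax_fault() and example_end_io_unwritten() are hypothetical;
 * example_get_block() is the hypothetical block-mapping callback declared in
 * an earlier sketch.  The completion callback is what converts unwritten
 * extents to written once the fault has populated and mapped them.
 */
extern void example_end_io_unwritten(struct buffer_head *bh, int uptodate);

static inline int example_fs_dax_fault(struct vm_area_struct *vma,
				       struct vm_fault *vmf)
{
	struct super_block *sb = file_inode(vma->vm_file)->i_sb;
	int result;

	if (vmf->flags & FAULT_FLAG_WRITE) {
		sb_start_pagefault(sb);
		file_update_time(vma->vm_file);
		/* a journalling filesystem would start a transaction here */
	}

	result = __dax_fault(vma, vmf, example_get_block,
			     example_end_io_unwritten);

	if (vmf->flags & FAULT_FLAG_WRITE)
		sb_end_pagefault(sb);

	return result;
}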

/**
 * dax_fault - handle a page fault on a DAX file
 * @vma: The virtual memory area where the fault occurred
 * @vmf: The description of the fault
 * @get_block: The filesystem method used to translate file offsets to blocks
 * @complete_unwritten: The filesystem method used to convert unwritten blocks
 *	to written so the data written to them is exposed.  See __dax_fault()
 *	for details; filesystems that do not use unwritten extents should
 *	pass NULL.
 *
 * When a page fault occurs, filesystems may call this helper in their
 * fault handler for DAX files.
 */
int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
	      get_block_t get_block, dax_iodone_t complete_unwritten)
{
	int result;
	struct super_block *sb = file_inode(vma->vm_file)->i_sb;

	if (vmf->flags & FAULT_FLAG_WRITE) {
		sb_start_pagefault(sb);
		file_update_time(vma->vm_file);
	}
	result = __dax_fault(vma, vmf, get_block, complete_unwritten);
	if (vmf->flags & FAULT_FLAG_WRITE)
		sb_end_pagefault(sb);

	return result;
}
EXPORT_SYMBOL_GPL(dax_fault);

/**
 * dax_pfn_mkwrite - handle first write to DAX page
 * @vma: The virtual memory area where the fault occurred
 * @vmf: The description of the fault
 *
 * The block is already allocated and mapped, so all we need to do here is
 * update the file's timestamps; the core MM code then makes the PTE writable.
 */
int dax_pfn_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	struct super_block *sb = file_inode(vma->vm_file)->i_sb;

	sb_start_pagefault(sb);
	file_update_time(vma->vm_file);
	sb_end_pagefault(sb);
	return VM_FAULT_NOPAGE;
}
EXPORT_SYMBOL_GPL(dax_pfn_mkwrite);
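
/*
 * Example (illustrative sketch, not part of the original file): wiring the
 * fault helpers above into a filesystem's mmap path, loosely modelled on
 * ext2.  example_dax_vm_ops, example_dax_fault() and example_file_mmap()
 * are hypothetical; example_get_block() is the hypothetical block-mapping
 * callback declared in an earlier sketch.  A filesystem without unwritten
 * extents can pass NULL for the completion callback, as shown here.
 */
static inline int example_dax_fault(struct vm_area_struct *vma,
				    struct vm_fault *vmf)
{
	return dax_fault(vma, vmf, example_get_block, NULL);
}

static const struct vm_operations_struct example_dax_vm_ops = {
	.fault		= example_dax_fault,
	.page_mkwrite	= example_dax_fault,
	.pfn_mkwrite	= dax_pfn_mkwrite,
};

static inline int example_file_mmap(struct file *file,
				    struct vm_area_struct *vma)
{
	if (!IS_DAX(file_inode(file)))
		return generic_file_mmap(file, vma);

	file_accessed(file);
	vma->vm_ops = &example_dax_vm_ops;
	/* DAX PTEs are pfn-based, so the VMA must allow mixed mappings */
	vma->vm_flags |= VM_MIXEDMAP;
	return 0;
}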

/**
 * dax_zero_page_range - zero a range within a page of a DAX file
 * @inode: The file being truncated
 * @from: The file offset that is being truncated to
 * @length: The number of bytes to zero
 * @get_block: The filesystem method used to translate file offsets to blocks
 *
 * This function can be called by a filesystem when it is zeroing part of a
 * page in a DAX file.  This is intended for hole-punch operations.  If
 * you are truncating a file, the helper function dax_truncate_page() may be
 * more convenient.
 *
 * We work in terms of PAGE_CACHE_SIZE here for commonality with
 * block_truncate_page(), but we could go down to PAGE_SIZE if the filesystem
 * took care of disposing of the unnecessary blocks.  Even if the filesystem
 * block size is smaller than PAGE_SIZE, we have to zero the rest of the page
 * since the file might be mmapped.
 */
int dax_zero_page_range(struct inode *inode, loff_t from, unsigned length,
							get_block_t get_block)
{
	struct buffer_head bh;
	pgoff_t index = from >> PAGE_CACHE_SHIFT;
	unsigned offset = from & (PAGE_CACHE_SIZE-1);
	int err;

	/* Block boundary? Nothing to do */
	if (!length)
		return 0;
	BUG_ON((offset + length) > PAGE_CACHE_SIZE);

	memset(&bh, 0, sizeof(bh));
	bh.b_size = PAGE_CACHE_SIZE;
	err = get_block(inode, index, &bh, 0);
	if (err < 0)
		return err;
	if (buffer_written(&bh)) {
		void *addr;
		err = dax_get_addr(&bh, &addr, inode->i_blkbits);
		if (err < 0)
			return err;
		memset(addr + offset, 0, length);
	}

	return 0;
}
EXPORT_SYMBOL_GPL(dax_zero_page_range);

/**
 * dax_truncate_page - handle a partial page being truncated in a DAX file
 * @inode: The file being truncated
 * @from: The file offset that is being truncated to
 * @get_block: The filesystem method used to translate file offsets to blocks
 *
 * Similar to block_truncate_page(), this function can be called by a
 * filesystem when it is truncating a DAX file to handle the partial page.
 *
 * We work in terms of PAGE_CACHE_SIZE here for commonality with
 * block_truncate_page(), but we could go down to PAGE_SIZE if the filesystem
 * took care of disposing of the unnecessary blocks.  Even if the filesystem
 * block size is smaller than PAGE_SIZE, we have to zero the rest of the page
 * since the file might be mmapped.
 */
int dax_truncate_page(struct inode *inode, loff_t from, get_block_t get_block)
{
	unsigned length = PAGE_CACHE_ALIGN(from) - from;
	return dax_zero_page_range(inode, from, length, get_block);
}
EXPORT_SYMBOL_GPL(dax_truncate_page);
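
/*
 * Example (illustrative sketch, not part of the original file): how a
 * filesystem's truncate path might use dax_truncate_page() to zero the tail
 * of the last page, falling back to block_truncate_page() for non-DAX
 * inodes, much as ext2 does.  example_block_truncate() is a hypothetical
 * helper and example_get_block() is the hypothetical block-mapping callback
 * declared in an earlier sketch; a hole-punch implementation would call
 * dax_zero_page_range() directly for the partial pages at each end of the
 * punched range.
 */
static inline int example_block_truncate(struct inode *inode, loff_t newsize)
{
	if (IS_DAX(inode))
		return dax_truncate_page(inode, newsize, example_get_block);

	return block_truncate_page(inode->i_mapping, newsize,
				   example_get_block);
}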