1 /* 2 * linux/fs/ext4/file.c 3 * 4 * Copyright (C) 1992, 1993, 1994, 1995 5 * Remy Card (card@masi.ibp.fr) 6 * Laboratoire MASI - Institut Blaise Pascal 7 * Universite Pierre et Marie Curie (Paris VI) 8 * 9 * from 10 * 11 * linux/fs/minix/file.c 12 * 13 * Copyright (C) 1991, 1992 Linus Torvalds 14 * 15 * ext4 fs regular file handling primitives 16 * 17 * 64-bit file support on 64-bit platforms by Jakub Jelinek 18 * (jj@sunsite.ms.mff.cuni.cz) 19 */ 20 21 #include <linux/time.h> 22 #include <linux/fs.h> 23 #include <linux/mount.h> 24 #include <linux/path.h> 25 #include <linux/dax.h> 26 #include <linux/quotaops.h> 27 #include <linux/pagevec.h> 28 #include <linux/uio.h> 29 #include "ext4.h" 30 #include "ext4_jbd2.h" 31 #include "xattr.h" 32 #include "acl.h" 33 34 /* 35 * Called when an inode is released. Note that this is different 36 * from ext4_file_open: open gets called at every open, but release 37 * gets called only when /all/ the files are closed. 38 */ 39 static int ext4_release_file(struct inode *inode, struct file *filp) 40 { 41 if (ext4_test_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE)) { 42 ext4_alloc_da_blocks(inode); 43 ext4_clear_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE); 44 } 45 /* if we are the last writer on the inode, drop the block reservation */ 46 if ((filp->f_mode & FMODE_WRITE) && 47 (atomic_read(&inode->i_writecount) == 1) && 48 !EXT4_I(inode)->i_reserved_data_blocks) 49 { 50 down_write(&EXT4_I(inode)->i_data_sem); 51 ext4_discard_preallocations(inode); 52 up_write(&EXT4_I(inode)->i_data_sem); 53 } 54 if (is_dx(inode) && filp->private_data) 55 ext4_htree_free_dir_info(filp->private_data); 56 57 return 0; 58 } 59 60 static void ext4_unwritten_wait(struct inode *inode) 61 { 62 wait_queue_head_t *wq = ext4_ioend_wq(inode); 63 64 wait_event(*wq, (atomic_read(&EXT4_I(inode)->i_unwritten) == 0)); 65 } 66 67 /* 68 * This tests whether the IO in question is block-aligned or not. 
69 * Ext4 utilizes unwritten extents when hole-filling during direct IO, and they 70 * are converted to written only after the IO is complete. Until they are 71 * mapped, these blocks appear as holes, so dio_zero_block() will assume that 72 * it needs to zero out portions of the start and/or end block. If 2 AIO 73 * threads are at work on the same unwritten block, they must be synchronized 74 * or one thread will zero the other's data, causing corruption. 75 */ 76 static int 77 ext4_unaligned_aio(struct inode *inode, struct iov_iter *from, loff_t pos) 78 { 79 struct super_block *sb = inode->i_sb; 80 int blockmask = sb->s_blocksize - 1; 81 82 if (pos >= i_size_read(inode)) 83 return 0; 84 85 if ((pos | iov_iter_alignment(from)) & blockmask) 86 return 1; 87 88 return 0; 89 } 90 91 static ssize_t 92 ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from) 93 { 94 struct inode *inode = file_inode(iocb->ki_filp); 95 int o_direct = iocb->ki_flags & IOCB_DIRECT; 96 int unaligned_aio = 0; 97 int overwrite = 0; 98 ssize_t ret; 99 100 inode_lock(inode); 101 ret = generic_write_checks(iocb, from); 102 if (ret <= 0) 103 goto out; 104 105 /* 106 * Unaligned direct AIO must be serialized among each other as zeroing 107 * of partial blocks of two competing unaligned AIOs can result in data 108 * corruption. 109 */ 110 if (o_direct && ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) && 111 !is_sync_kiocb(iocb) && 112 ext4_unaligned_aio(inode, from, iocb->ki_pos)) { 113 unaligned_aio = 1; 114 ext4_unwritten_wait(inode); 115 } 116 117 /* 118 * If we have encountered a bitmap-format file, the size limit 119 * is smaller than s_maxbytes, which is for extent-mapped files. 
120 */ 121 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) { 122 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 123 124 if (iocb->ki_pos >= sbi->s_bitmap_maxbytes) { 125 ret = -EFBIG; 126 goto out; 127 } 128 iov_iter_truncate(from, sbi->s_bitmap_maxbytes - iocb->ki_pos); 129 } 130 131 iocb->private = &overwrite; 132 if (o_direct) { 133 size_t length = iov_iter_count(from); 134 loff_t pos = iocb->ki_pos; 135 136 /* check whether we do a DIO overwrite or not */ 137 if (ext4_should_dioread_nolock(inode) && !unaligned_aio && 138 pos + length <= i_size_read(inode)) { 139 struct ext4_map_blocks map; 140 unsigned int blkbits = inode->i_blkbits; 141 int err, len; 142 143 map.m_lblk = pos >> blkbits; 144 map.m_len = EXT4_MAX_BLOCKS(length, pos, blkbits); 145 len = map.m_len; 146 147 err = ext4_map_blocks(NULL, inode, &map, 0); 148 /* 149 * 'err==len' means that all of blocks has 150 * been preallocated no matter they are 151 * initialized or not. For excluding 152 * unwritten extents, we need to check 153 * m_flags. There are two conditions that 154 * indicate for initialized extents. 1) If we 155 * hit extent cache, EXT4_MAP_MAPPED flag is 156 * returned; 2) If we do a real lookup, 157 * non-flags are returned. So we should check 158 * these two conditions. 
159 */ 160 if (err == len && (map.m_flags & EXT4_MAP_MAPPED)) 161 overwrite = 1; 162 } 163 } 164 165 ret = __generic_file_write_iter(iocb, from); 166 inode_unlock(inode); 167 168 if (ret > 0) 169 ret = generic_write_sync(iocb, ret); 170 171 return ret; 172 173 out: 174 inode_unlock(inode); 175 return ret; 176 } 177 178 #ifdef CONFIG_FS_DAX 179 static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf) 180 { 181 int result; 182 handle_t *handle = NULL; 183 struct inode *inode = file_inode(vma->vm_file); 184 struct super_block *sb = inode->i_sb; 185 bool write = vmf->flags & FAULT_FLAG_WRITE; 186 187 if (write) { 188 sb_start_pagefault(sb); 189 file_update_time(vma->vm_file); 190 down_read(&EXT4_I(inode)->i_mmap_sem); 191 handle = ext4_journal_start_sb(sb, EXT4_HT_WRITE_PAGE, 192 EXT4_DATA_TRANS_BLOCKS(sb)); 193 } else 194 down_read(&EXT4_I(inode)->i_mmap_sem); 195 196 if (IS_ERR(handle)) 197 result = VM_FAULT_SIGBUS; 198 else 199 result = dax_fault(vma, vmf, ext4_dax_get_block); 200 201 if (write) { 202 if (!IS_ERR(handle)) 203 ext4_journal_stop(handle); 204 up_read(&EXT4_I(inode)->i_mmap_sem); 205 sb_end_pagefault(sb); 206 } else 207 up_read(&EXT4_I(inode)->i_mmap_sem); 208 209 return result; 210 } 211 212 static int ext4_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr, 213 pmd_t *pmd, unsigned int flags) 214 { 215 int result; 216 handle_t *handle = NULL; 217 struct inode *inode = file_inode(vma->vm_file); 218 struct super_block *sb = inode->i_sb; 219 bool write = flags & FAULT_FLAG_WRITE; 220 221 if (write) { 222 sb_start_pagefault(sb); 223 file_update_time(vma->vm_file); 224 down_read(&EXT4_I(inode)->i_mmap_sem); 225 handle = ext4_journal_start_sb(sb, EXT4_HT_WRITE_PAGE, 226 ext4_chunk_trans_blocks(inode, 227 PMD_SIZE / PAGE_SIZE)); 228 } else 229 down_read(&EXT4_I(inode)->i_mmap_sem); 230 231 if (IS_ERR(handle)) 232 result = VM_FAULT_SIGBUS; 233 else 234 result = dax_pmd_fault(vma, addr, pmd, flags, 235 ext4_dax_get_block); 236 237 
if (write) { 238 if (!IS_ERR(handle)) 239 ext4_journal_stop(handle); 240 up_read(&EXT4_I(inode)->i_mmap_sem); 241 sb_end_pagefault(sb); 242 } else 243 up_read(&EXT4_I(inode)->i_mmap_sem); 244 245 return result; 246 } 247 248 /* 249 * Handle write fault for VM_MIXEDMAP mappings. Similarly to ext4_dax_fault() 250 * handler we check for races agaist truncate. Note that since we cycle through 251 * i_mmap_sem, we are sure that also any hole punching that began before we 252 * were called is finished by now and so if it included part of the file we 253 * are working on, our pte will get unmapped and the check for pte_same() in 254 * wp_pfn_shared() fails. Thus fault gets retried and things work out as 255 * desired. 256 */ 257 static int ext4_dax_pfn_mkwrite(struct vm_area_struct *vma, 258 struct vm_fault *vmf) 259 { 260 struct inode *inode = file_inode(vma->vm_file); 261 struct super_block *sb = inode->i_sb; 262 loff_t size; 263 int ret; 264 265 sb_start_pagefault(sb); 266 file_update_time(vma->vm_file); 267 down_read(&EXT4_I(inode)->i_mmap_sem); 268 size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; 269 if (vmf->pgoff >= size) 270 ret = VM_FAULT_SIGBUS; 271 else 272 ret = dax_pfn_mkwrite(vma, vmf); 273 up_read(&EXT4_I(inode)->i_mmap_sem); 274 sb_end_pagefault(sb); 275 276 return ret; 277 } 278 279 static const struct vm_operations_struct ext4_dax_vm_ops = { 280 .fault = ext4_dax_fault, 281 .pmd_fault = ext4_dax_pmd_fault, 282 .page_mkwrite = ext4_dax_fault, 283 .pfn_mkwrite = ext4_dax_pfn_mkwrite, 284 }; 285 #else 286 #define ext4_dax_vm_ops ext4_file_vm_ops 287 #endif 288 289 static const struct vm_operations_struct ext4_file_vm_ops = { 290 .fault = ext4_filemap_fault, 291 .map_pages = filemap_map_pages, 292 .page_mkwrite = ext4_page_mkwrite, 293 }; 294 295 static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma) 296 { 297 struct inode *inode = file->f_mapping->host; 298 299 if (ext4_encrypted_inode(inode)) { 300 int err = 
fscrypt_get_encryption_info(inode); 301 if (err) 302 return 0; 303 if (!fscrypt_has_encryption_key(inode)) 304 return -ENOKEY; 305 } 306 file_accessed(file); 307 if (IS_DAX(file_inode(file))) { 308 vma->vm_ops = &ext4_dax_vm_ops; 309 vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE; 310 } else { 311 vma->vm_ops = &ext4_file_vm_ops; 312 } 313 return 0; 314 } 315 316 static int ext4_file_open(struct inode * inode, struct file * filp) 317 { 318 struct super_block *sb = inode->i_sb; 319 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 320 struct vfsmount *mnt = filp->f_path.mnt; 321 struct dentry *dir; 322 struct path path; 323 char buf[64], *cp; 324 int ret; 325 326 if (unlikely(!(sbi->s_mount_flags & EXT4_MF_MNTDIR_SAMPLED) && 327 !(sb->s_flags & MS_RDONLY))) { 328 sbi->s_mount_flags |= EXT4_MF_MNTDIR_SAMPLED; 329 /* 330 * Sample where the filesystem has been mounted and 331 * store it in the superblock for sysadmin convenience 332 * when trying to sort through large numbers of block 333 * devices or filesystem images. 
334 */ 335 memset(buf, 0, sizeof(buf)); 336 path.mnt = mnt; 337 path.dentry = mnt->mnt_root; 338 cp = d_path(&path, buf, sizeof(buf)); 339 if (!IS_ERR(cp)) { 340 handle_t *handle; 341 int err; 342 343 handle = ext4_journal_start_sb(sb, EXT4_HT_MISC, 1); 344 if (IS_ERR(handle)) 345 return PTR_ERR(handle); 346 BUFFER_TRACE(sbi->s_sbh, "get_write_access"); 347 err = ext4_journal_get_write_access(handle, sbi->s_sbh); 348 if (err) { 349 ext4_journal_stop(handle); 350 return err; 351 } 352 strlcpy(sbi->s_es->s_last_mounted, cp, 353 sizeof(sbi->s_es->s_last_mounted)); 354 ext4_handle_dirty_super(handle, sb); 355 ext4_journal_stop(handle); 356 } 357 } 358 if (ext4_encrypted_inode(inode)) { 359 ret = fscrypt_get_encryption_info(inode); 360 if (ret) 361 return -EACCES; 362 if (!fscrypt_has_encryption_key(inode)) 363 return -ENOKEY; 364 } 365 366 dir = dget_parent(file_dentry(filp)); 367 if (ext4_encrypted_inode(d_inode(dir)) && 368 !fscrypt_has_permitted_context(d_inode(dir), inode)) { 369 ext4_warning(inode->i_sb, 370 "Inconsistent encryption contexts: %lu/%lu", 371 (unsigned long) d_inode(dir)->i_ino, 372 (unsigned long) inode->i_ino); 373 dput(dir); 374 return -EPERM; 375 } 376 dput(dir); 377 /* 378 * Set up the jbd2_inode if we are opening the inode for 379 * writing and the journal is present 380 */ 381 if (filp->f_mode & FMODE_WRITE) { 382 ret = ext4_inode_attach_jinode(inode); 383 if (ret < 0) 384 return ret; 385 } 386 return dquot_file_open(inode, filp); 387 } 388 389 /* 390 * Here we use ext4_map_blocks() to get a block mapping for a extent-based 391 * file rather than ext4_ext_walk_space() because we can introduce 392 * SEEK_DATA/SEEK_HOLE for block-mapped and extent-mapped file at the same 393 * function. When extent status tree has been fully implemented, it will 394 * track all extent status for a file and we can directly use it to 395 * retrieve the offset for SEEK_DATA/SEEK_HOLE. 
 */

/*
 * When we retrieve the offset for SEEK_DATA/SEEK_HOLE, we need to look up
 * the page cache to check whether there is any data in the range
 * [startoff, endoff], because if this range contains an unwritten extent,
 * we treat the extent as data or as a hole according to whether the page
 * cache has data for it or not.
 *
 * @inode:   inode whose page cache (inode->i_mapping) is searched
 * @whence:  SEEK_DATA or SEEK_HOLE
 * @end_blk: logical block bounding the search; converted to the byte
 *           offset 'endoff' by shifting with the filesystem block bits
 * @offset:  in: byte offset to start searching from;
 *           out: offset of the data/hole that was found (see below)
 *
 * Returns 1 if a data region (SEEK_DATA) or hole (SEEK_HOLE) was found,
 * 0 otherwise.  Note that the two early "found" exits taken before the
 * per-page loop leave *offset untouched, i.e. equal to the start offset.
 *
 * A buffer counts as data if it is uptodate or unwritten, and as a hole
 * otherwise.
 */
static int ext4_find_unwritten_pgoff(struct inode *inode,
				     int whence,
				     ext4_lblk_t end_blk,
				     loff_t *offset)
{
	struct pagevec pvec;
	unsigned int blkbits;
	pgoff_t index;
	pgoff_t end;
	loff_t endoff;
	loff_t startoff;
	loff_t lastoff;
	int found = 0;

	blkbits = inode->i_sb->s_blocksize_bits;
	startoff = *offset;
	/* lastoff tracks the byte offset up to which we have scanned. */
	lastoff = startoff;
	endoff = (loff_t)end_blk << blkbits;

	/* Page-cache indices covering [startoff, endoff]. */
	index = startoff >> PAGE_SHIFT;
	end = endoff >> PAGE_SHIFT;

	pagevec_init(&pvec, 0);
	do {
		int i, num;
		unsigned long nr_pages;

		/*
		 * NOTE(review): when index == end this evaluates to 0, so
		 * the lookup below is asked for zero pages.  Presumably it
		 * then returns 0 and the range is treated as having no
		 * cached pages even if the page at 'index' exists - confirm
		 * against pagevec_lookup() and consider requesting at least
		 * one page.
		 */
		num = min_t(pgoff_t, end - index, PAGEVEC_SIZE);
		nr_pages = pagevec_lookup(&pvec, inode->i_mapping, index,
					  (pgoff_t)num);
		if (nr_pages == 0) {
			/* No cached pages at all: nothing counts as data. */
			if (whence == SEEK_DATA)
				break;

			BUG_ON(whence != SEEK_HOLE);
			/*
			 * If this is the first time to go into the loop and
			 * offset is not beyond the end offset, it will be a
			 * hole at this offset
			 */
			if (lastoff == startoff || lastoff < endoff)
				found = 1;
			break;
		}

		/*
		 * If this is the first time to go into the loop and
		 * offset is smaller than the first page offset, it will be a
		 * hole at this offset.
		 */
		if (lastoff == startoff && whence == SEEK_HOLE &&
		    lastoff < page_offset(pvec.pages[0])) {
			found = 1;
			break;
		}

		for (i = 0; i < nr_pages; i++) {
			struct page *page = pvec.pages[i];
			struct buffer_head *bh, *head;

			/*
			 * If the current offset is not beyond the end of given
			 * range, it will be a hole.
			 */
			if (lastoff < endoff && whence == SEEK_HOLE &&
			    page->index > end) {
				found = 1;
				*offset = lastoff;
				goto out;
			}

			lock_page(page);

			/* Skip pages that no longer belong to this mapping. */
			if (unlikely(page->mapping != inode->i_mapping)) {
				unlock_page(page);
				continue;
			}

			/*
			 * Pages without buffers are skipped; lastoff is not
			 * advanced for them.
			 */
			if (!page_has_buffers(page)) {
				unlock_page(page);
				continue;
			}

			/*
			 * NOTE(review): this test is always true here, since
			 * the !page_has_buffers() case just above has
			 * already continued.
			 */
			if (page_has_buffers(page)) {
				lastoff = page_offset(page);
				bh = head = page_buffers(page);
				/* Walk the circular list of the page's buffers. */
				do {
					if (buffer_uptodate(bh) ||
					    buffer_unwritten(bh)) {
						if (whence == SEEK_DATA)
							found = 1;
					} else {
						if (whence == SEEK_HOLE)
							found = 1;
					}
					if (found) {
						/*
						 * Never report an offset
						 * before the caller's start.
						 */
						*offset = max_t(loff_t,
							startoff, lastoff);
						unlock_page(page);
						goto out;
					}
					lastoff += bh->b_size;
					bh = bh->b_this_page;
				} while (bh != head);
			}

			lastoff = page_offset(page) + PAGE_SIZE;
			unlock_page(page);
		}

		/*
		 * The no. of pages is less than our desired, that would be a
		 * hole in there.
		 */
		if (nr_pages < num && whence == SEEK_HOLE) {
			found = 1;
			*offset = lastoff;
			break;
		}

		/* Resume after the last page seen (i == nr_pages here). */
		index = pvec.pages[i - 1]->index + 1;
		pagevec_release(&pvec);
	} while (index <= end);

out:
	pagevec_release(&pvec);
	return found;
}

/*
 * ext4_seek_data() retrieves the offset for SEEK_DATA.
536 */ 537 static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize) 538 { 539 struct inode *inode = file->f_mapping->host; 540 struct extent_status es; 541 ext4_lblk_t start, last, end; 542 loff_t dataoff, isize; 543 int blkbits; 544 int ret; 545 546 inode_lock(inode); 547 548 isize = i_size_read(inode); 549 if (offset >= isize) { 550 inode_unlock(inode); 551 return -ENXIO; 552 } 553 554 blkbits = inode->i_sb->s_blocksize_bits; 555 start = offset >> blkbits; 556 last = start; 557 end = isize >> blkbits; 558 dataoff = offset; 559 560 do { 561 ret = ext4_get_next_extent(inode, last, end - last + 1, &es); 562 if (ret <= 0) { 563 /* No extent found -> no data */ 564 if (ret == 0) 565 ret = -ENXIO; 566 inode_unlock(inode); 567 return ret; 568 } 569 570 last = es.es_lblk; 571 if (last != start) 572 dataoff = (loff_t)last << blkbits; 573 if (!ext4_es_is_unwritten(&es)) 574 break; 575 576 /* 577 * If there is a unwritten extent at this offset, 578 * it will be as a data or a hole according to page 579 * cache that has data or not. 580 */ 581 if (ext4_find_unwritten_pgoff(inode, SEEK_DATA, 582 es.es_lblk + es.es_len, &dataoff)) 583 break; 584 last += es.es_len; 585 dataoff = (loff_t)last << blkbits; 586 cond_resched(); 587 } while (last <= end); 588 589 inode_unlock(inode); 590 591 if (dataoff > isize) 592 return -ENXIO; 593 594 return vfs_setpos(file, dataoff, maxsize); 595 } 596 597 /* 598 * ext4_seek_hole() retrieves the offset for SEEK_HOLE. 
599 */ 600 static loff_t ext4_seek_hole(struct file *file, loff_t offset, loff_t maxsize) 601 { 602 struct inode *inode = file->f_mapping->host; 603 struct extent_status es; 604 ext4_lblk_t start, last, end; 605 loff_t holeoff, isize; 606 int blkbits; 607 int ret; 608 609 inode_lock(inode); 610 611 isize = i_size_read(inode); 612 if (offset >= isize) { 613 inode_unlock(inode); 614 return -ENXIO; 615 } 616 617 blkbits = inode->i_sb->s_blocksize_bits; 618 start = offset >> blkbits; 619 last = start; 620 end = isize >> blkbits; 621 holeoff = offset; 622 623 do { 624 ret = ext4_get_next_extent(inode, last, end - last + 1, &es); 625 if (ret < 0) { 626 inode_unlock(inode); 627 return ret; 628 } 629 /* Found a hole? */ 630 if (ret == 0 || es.es_lblk > last) { 631 if (last != start) 632 holeoff = (loff_t)last << blkbits; 633 break; 634 } 635 /* 636 * If there is a unwritten extent at this offset, 637 * it will be as a data or a hole according to page 638 * cache that has data or not. 639 */ 640 if (ext4_es_is_unwritten(&es) && 641 ext4_find_unwritten_pgoff(inode, SEEK_HOLE, 642 last + es.es_len, &holeoff)) 643 break; 644 645 last += es.es_len; 646 holeoff = (loff_t)last << blkbits; 647 cond_resched(); 648 } while (last <= end); 649 650 inode_unlock(inode); 651 652 if (holeoff > isize) 653 holeoff = isize; 654 655 return vfs_setpos(file, holeoff, maxsize); 656 } 657 658 /* 659 * ext4_llseek() handles both block-mapped and extent-mapped maxbytes values 660 * by calling generic_file_llseek_size() with the appropriate maxbytes 661 * value for each. 
662 */ 663 loff_t ext4_llseek(struct file *file, loff_t offset, int whence) 664 { 665 struct inode *inode = file->f_mapping->host; 666 loff_t maxbytes; 667 668 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) 669 maxbytes = EXT4_SB(inode->i_sb)->s_bitmap_maxbytes; 670 else 671 maxbytes = inode->i_sb->s_maxbytes; 672 673 switch (whence) { 674 case SEEK_SET: 675 case SEEK_CUR: 676 case SEEK_END: 677 return generic_file_llseek_size(file, offset, whence, 678 maxbytes, i_size_read(inode)); 679 case SEEK_DATA: 680 return ext4_seek_data(file, offset, maxbytes); 681 case SEEK_HOLE: 682 return ext4_seek_hole(file, offset, maxbytes); 683 } 684 685 return -EINVAL; 686 } 687 688 const struct file_operations ext4_file_operations = { 689 .llseek = ext4_llseek, 690 .read_iter = generic_file_read_iter, 691 .write_iter = ext4_file_write_iter, 692 .unlocked_ioctl = ext4_ioctl, 693 #ifdef CONFIG_COMPAT 694 .compat_ioctl = ext4_compat_ioctl, 695 #endif 696 .mmap = ext4_file_mmap, 697 .open = ext4_file_open, 698 .release = ext4_release_file, 699 .fsync = ext4_sync_file, 700 .get_unmapped_area = thp_get_unmapped_area, 701 .splice_read = generic_file_splice_read, 702 .splice_write = iter_file_splice_write, 703 .fallocate = ext4_fallocate, 704 }; 705 706 const struct inode_operations ext4_file_inode_operations = { 707 .setattr = ext4_setattr, 708 .getattr = ext4_getattr, 709 .listxattr = ext4_listxattr, 710 .get_acl = ext4_get_acl, 711 .set_acl = ext4_set_acl, 712 .fiemap = ext4_fiemap, 713 }; 714 715