/*
 *  linux/fs/ext4/file.c
 *
 *  Copyright (C) 1992, 1993, 1994, 1995
 *  Remy Card (card@masi.ibp.fr)
 *  Laboratoire MASI - Institut Blaise Pascal
 *  Universite Pierre et Marie Curie (Paris VI)
 *
 *  from
 *
 *  linux/fs/minix/file.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 *
 *  ext4 fs regular file handling primitives
 *
 *  64-bit file support on 64-bit platforms by Jakub Jelinek
 *	(jj@sunsite.ms.mff.cuni.cz)
 */

#include <linux/time.h>
#include <linux/fs.h>
#include <linux/mount.h>
#include <linux/path.h>
#include <linux/quotaops.h>
#include <linux/pagevec.h>
#include <linux/uio.h>
#include "ext4.h"
#include "ext4_jbd2.h"
#include "xattr.h"
#include "acl.h"

/*
 * Called when an inode is released. Note that this is different
 * from ext4_file_open: open gets called at every open, but release
 * gets called only when /all/ the files are closed.
 */
static int ext4_release_file(struct inode *inode, struct file *filp)
{
	if (ext4_test_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE)) {
		ext4_alloc_da_blocks(inode);
		ext4_clear_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE);
	}
	/* if we are the last writer on the inode, drop the block reservation */
	if ((filp->f_mode & FMODE_WRITE) &&
			(atomic_read(&inode->i_writecount) == 1) &&
			!EXT4_I(inode)->i_reserved_data_blocks)
	{
		down_write(&EXT4_I(inode)->i_data_sem);
		ext4_discard_preallocations(inode);
		up_write(&EXT4_I(inode)->i_data_sem);
	}
	if (is_dx(inode) && filp->private_data)
		ext4_htree_free_dir_info(filp->private_data);

	return 0;
}

static void ext4_unwritten_wait(struct inode *inode)
{
	wait_queue_head_t *wq = ext4_ioend_wq(inode);

	wait_event(*wq, (atomic_read(&EXT4_I(inode)->i_unwritten) == 0));
}

/*
 * This tests whether the IO in question is block-aligned or not.
 * Ext4 utilizes unwritten extents when hole-filling during direct IO, and they
 * are converted to written only after the IO is complete.  Until they are
 * mapped, these blocks appear as holes, so dio_zero_block() will assume that
 * it needs to zero out portions of the start and/or end block.  If 2 AIO
 * threads are at work on the same unwritten block, they must be synchronized
 * or one thread will zero the other's data, causing corruption.
 */
static int
ext4_unaligned_aio(struct inode *inode, struct iov_iter *from, loff_t pos)
{
	struct super_block *sb = inode->i_sb;
	int blockmask = sb->s_blocksize - 1;

	if (pos >= i_size_read(inode))
		return 0;

	if ((pos | iov_iter_alignment(from)) & blockmask)
		return 1;

	return 0;
}

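/*
 * Write entry point for regular files.  Unaligned AIO DIO (and O_APPEND
 * AIO DIO) on extent-mapped inodes is serialized via the inode's AIO
 * mutex, bitmap-mapped files are held to their smaller size limit, and
 * direct writes that only overwrite already-initialized blocks are
 * detected up front, with the result handed to the DIO path through
 * iocb->private.
 */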
static ssize_t
ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
	struct file *file = iocb->ki_filp;
	struct inode *inode = file_inode(iocb->ki_filp);
	struct mutex *aio_mutex = NULL;
	struct blk_plug plug;
	int o_direct = iocb->ki_flags & IOCB_DIRECT;
	int overwrite = 0;
	ssize_t ret;

	/*
	 * Unaligned direct AIO must be serialized; see the comment above.
	 * In the case of O_APPEND, assume that we must always serialize.
	 */
	if (o_direct &&
	    ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) &&
	    !is_sync_kiocb(iocb) &&
	    ((iocb->ki_flags & IOCB_APPEND) ||
	     ext4_unaligned_aio(inode, from, iocb->ki_pos))) {
		aio_mutex = ext4_aio_mutex(inode);
		mutex_lock(aio_mutex);
		ext4_unwritten_wait(inode);
	}

	mutex_lock(&inode->i_mutex);
	ret = generic_write_checks(iocb, from);
	if (ret <= 0)
		goto out;

	/*
	 * If we have encountered a bitmap-format file, the size limit
	 * is smaller than s_maxbytes, which is for extent-mapped files.
	 */
	if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
		struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);

		if (iocb->ki_pos >= sbi->s_bitmap_maxbytes) {
			ret = -EFBIG;
			goto out;
		}
		iov_iter_truncate(from, sbi->s_bitmap_maxbytes - iocb->ki_pos);
	}

	iocb->private = &overwrite;
	if (o_direct) {
		size_t length = iov_iter_count(from);
		loff_t pos = iocb->ki_pos;
		blk_start_plug(&plug);

		/* check whether we do a DIO overwrite or not */
		if (ext4_should_dioread_nolock(inode) && !aio_mutex &&
		    !file->f_mapping->nrpages && pos + length <= i_size_read(inode)) {
			struct ext4_map_blocks map;
			unsigned int blkbits = inode->i_blkbits;
			int err, len;

			map.m_lblk = pos >> blkbits;
			map.m_len = (EXT4_BLOCK_ALIGN(pos + length, blkbits) >> blkbits)
				- map.m_lblk;
			len = map.m_len;

			err = ext4_map_blocks(NULL, inode, &map, 0);
			/*
			 * 'err == len' means that all of the requested blocks
			 * are already allocated, whether or not they are
			 * initialized.  To exclude unwritten extents we also
			 * have to check m_flags: EXT4_MAP_MAPPED is only set
			 * when the range maps to initialized (written)
			 * extents, so only then can the write be treated as
			 * a pure overwrite.
			 */
			if (err == len && (map.m_flags & EXT4_MAP_MAPPED))
				overwrite = 1;
		}
	}

	ret = __generic_file_write_iter(iocb, from);
	mutex_unlock(&inode->i_mutex);

	if (ret > 0) {
		ssize_t err;

		err = generic_write_sync(file, iocb->ki_pos - ret, ret);
		if (err < 0)
			ret = err;
	}
	if (o_direct)
		blk_finish_plug(&plug);

	if (aio_mutex)
		mutex_unlock(aio_mutex);
	return ret;

out:
	mutex_unlock(&inode->i_mutex);
	if (aio_mutex)
		mutex_unlock(aio_mutex);
	return ret;
}

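/*
 * mmap fault handlers.  On DAX-capable storage, faults map file blocks
 * directly into the task's address space and bypass the page cache; all
 * other inodes use the normal filemap fault path, with ext4_page_mkwrite()
 * handling write faults.
 */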
#ifdef CONFIG_FS_DAX
static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	return dax_fault(vma, vmf, ext4_get_block);
					/* Is this the right get_block? */
}

static int ext4_dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	return dax_mkwrite(vma, vmf, ext4_get_block);
}

static const struct vm_operations_struct ext4_dax_vm_ops = {
	.fault		= ext4_dax_fault,
	.page_mkwrite	= ext4_dax_mkwrite,
	.pfn_mkwrite	= dax_pfn_mkwrite,
};
#else
#define ext4_dax_vm_ops	ext4_file_vm_ops
#endif

static const struct vm_operations_struct ext4_file_vm_ops = {
	.fault		= filemap_fault,
	.map_pages	= filemap_map_pages,
	.page_mkwrite	= ext4_page_mkwrite,
};

static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma)
{
	struct inode *inode = file->f_mapping->host;

	if (ext4_encrypted_inode(inode)) {
		int err = ext4_get_encryption_info(inode);
		if (err)
			return 0;
		if (ext4_encryption_info(inode) == NULL)
			return -ENOKEY;
	}
	file_accessed(file);
	if (IS_DAX(file_inode(file))) {
		vma->vm_ops = &ext4_dax_vm_ops;
		vma->vm_flags |= VM_MIXEDMAP;
	} else {
		vma->vm_ops = &ext4_file_vm_ops;
	}
	return 0;
}

static int ext4_file_open(struct inode * inode, struct file * filp)
{
	struct super_block *sb = inode->i_sb;
	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
	struct vfsmount *mnt = filp->f_path.mnt;
	struct path path;
	char buf[64], *cp;
	int ret;

	if (unlikely(!(sbi->s_mount_flags & EXT4_MF_MNTDIR_SAMPLED) &&
		     !(sb->s_flags & MS_RDONLY))) {
		sbi->s_mount_flags |= EXT4_MF_MNTDIR_SAMPLED;
		/*
		 * Sample where the filesystem has been mounted and
		 * store it in the superblock for sysadmin convenience
		 * when trying to sort through large numbers of block
		 * devices or filesystem images.
		 */
		memset(buf, 0, sizeof(buf));
		path.mnt = mnt;
		path.dentry = mnt->mnt_root;
		cp = d_path(&path, buf, sizeof(buf));
		if (!IS_ERR(cp)) {
			handle_t *handle;
			int err;

			handle = ext4_journal_start_sb(sb, EXT4_HT_MISC, 1);
			if (IS_ERR(handle))
				return PTR_ERR(handle);
			BUFFER_TRACE(sbi->s_sbh, "get_write_access");
			err = ext4_journal_get_write_access(handle, sbi->s_sbh);
			if (err) {
				ext4_journal_stop(handle);
				return err;
			}
			strlcpy(sbi->s_es->s_last_mounted, cp,
				sizeof(sbi->s_es->s_last_mounted));
			ext4_handle_dirty_super(handle, sb);
			ext4_journal_stop(handle);
		}
	}
	if (ext4_encrypted_inode(inode)) {
		ret = ext4_get_encryption_info(inode);
		if (ret)
			return -EACCES;
		if (ext4_encryption_info(inode) == NULL)
			return -ENOKEY;
	}
	/*
	 * Set up the jbd2_inode if we are opening the inode for
	 * writing and the journal is present
	 */
	if (filp->f_mode & FMODE_WRITE) {
		ret = ext4_inode_attach_jinode(inode);
		if (ret < 0)
			return ret;
	}
	return dquot_file_open(inode, filp);
}

/*
 * Here we use ext4_map_blocks() to get a block mapping for an extent-based
 * file rather than ext4_ext_walk_space(), because that lets us implement
 * SEEK_DATA/SEEK_HOLE for block-mapped and extent-mapped files in the same
 * function.  Once the extent status tree is fully implemented, it will track
 * all extent status for a file and we can use it directly to retrieve the
 * offset for SEEK_DATA/SEEK_HOLE.
 */

/*
 * When we retrieve the offset for SEEK_DATA/SEEK_HOLE, we have to look up
 * the page cache to check whether there is data in the range
 * [startoff, endoff], because if this range contains an unwritten extent,
 * we treat the extent as data or as a hole depending on whether the page
 * cache has data for it.
 */
static int ext4_find_unwritten_pgoff(struct inode *inode,
				     int whence,
				     struct ext4_map_blocks *map,
				     loff_t *offset)
{
	struct pagevec pvec;
	unsigned int blkbits;
	pgoff_t index;
	pgoff_t end;
	loff_t endoff;
	loff_t startoff;
	loff_t lastoff;
	int found = 0;

	blkbits = inode->i_sb->s_blocksize_bits;
	startoff = *offset;
	lastoff = startoff;
	endoff = (loff_t)(map->m_lblk + map->m_len) << blkbits;

	index = startoff >> PAGE_CACHE_SHIFT;
	end = endoff >> PAGE_CACHE_SHIFT;

	pagevec_init(&pvec, 0);
	do {
		int i, num;
		unsigned long nr_pages;

		num = min_t(pgoff_t, end - index, PAGEVEC_SIZE);
		nr_pages = pagevec_lookup(&pvec, inode->i_mapping, index,
					  (pgoff_t)num);
		if (nr_pages == 0) {
			if (whence == SEEK_DATA)
				break;

			BUG_ON(whence != SEEK_HOLE);
			/*
			 * If this is the first time in the loop and the
			 * offset is not beyond the end offset, it is a hole
			 * at this offset.
			 */
			if (lastoff == startoff || lastoff < endoff)
				found = 1;
			break;
		}

		/*
		 * If this is the first time in the loop and the offset is
		 * smaller than the first page offset, it is a hole at this
		 * offset.
		 */
		if (lastoff == startoff && whence == SEEK_HOLE &&
		    lastoff < page_offset(pvec.pages[0])) {
			found = 1;
			break;
		}

		for (i = 0; i < nr_pages; i++) {
			struct page *page = pvec.pages[i];
			struct buffer_head *bh, *head;

			/*
			 * If the current offset is not beyond the end of the
			 * given range, it is a hole.
			 */
			if (lastoff < endoff && whence == SEEK_HOLE &&
			    page->index > end) {
				found = 1;
				*offset = lastoff;
				goto out;
			}

			lock_page(page);

			if (unlikely(page->mapping != inode->i_mapping)) {
				unlock_page(page);
				continue;
			}

			if (!page_has_buffers(page)) {
				unlock_page(page);
				continue;
			}

			if (page_has_buffers(page)) {
				lastoff = page_offset(page);
				bh = head = page_buffers(page);
				do {
					if (buffer_uptodate(bh) ||
					    buffer_unwritten(bh)) {
						if (whence == SEEK_DATA)
							found = 1;
					} else {
						if (whence == SEEK_HOLE)
							found = 1;
					}
					if (found) {
						*offset = max_t(loff_t,
							startoff, lastoff);
						unlock_page(page);
						goto out;
					}
					lastoff += bh->b_size;
					bh = bh->b_this_page;
				} while (bh != head);
			}

			lastoff = page_offset(page) + PAGE_SIZE;
			unlock_page(page);
		}

		/*
		 * Fewer pages were found than we asked for; the rest of the
		 * range is a hole.
		 */
		if (nr_pages < num && whence == SEEK_HOLE) {
			found = 1;
			*offset = lastoff;
			break;
		}

		index = pvec.pages[i - 1]->index + 1;
		pagevec_release(&pvec);
	} while (index <= end);

out:
	pagevec_release(&pvec);
	return found;
}

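/*
 * SEEK_DATA/SEEK_HOLE walk the file block by block using ext4_map_blocks():
 * written extents count as data, delayed extents are treated as data, and
 * unwritten extents are resolved via ext4_find_unwritten_pgoff() above,
 * depending on whether the page cache holds data for them.
 */
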
/*
 * ext4_seek_data() retrieves the offset for SEEK_DATA.
 */
static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize)
{
	struct inode *inode = file->f_mapping->host;
	struct ext4_map_blocks map;
	struct extent_status es;
	ext4_lblk_t start, last, end;
	loff_t dataoff, isize;
	int blkbits;
	int ret = 0;

	mutex_lock(&inode->i_mutex);

	isize = i_size_read(inode);
	if (offset >= isize) {
		mutex_unlock(&inode->i_mutex);
		return -ENXIO;
	}

	blkbits = inode->i_sb->s_blocksize_bits;
	start = offset >> blkbits;
	last = start;
	end = isize >> blkbits;
	dataoff = offset;

	do {
		map.m_lblk = last;
		map.m_len = end - last + 1;
		ret = ext4_map_blocks(NULL, inode, &map, 0);
		if (ret > 0 && !(map.m_flags & EXT4_MAP_UNWRITTEN)) {
			if (last != start)
				dataoff = (loff_t)last << blkbits;
			break;
		}

		/*
		 * If there is a delayed extent at this offset,
		 * it is treated as data.
		 */
		ext4_es_find_delayed_extent_range(inode, last, last, &es);
		if (es.es_len != 0 && in_range(last, es.es_lblk, es.es_len)) {
			if (last != start)
				dataoff = (loff_t)last << blkbits;
			break;
		}

		/*
		 * If there is an unwritten extent at this offset, it is
		 * treated as data or as a hole depending on whether the
		 * page cache has data for it.
		 */
		if (map.m_flags & EXT4_MAP_UNWRITTEN) {
			int unwritten;
			unwritten = ext4_find_unwritten_pgoff(inode, SEEK_DATA,
							      &map, &dataoff);
			if (unwritten)
				break;
		}

		last++;
		dataoff = (loff_t)last << blkbits;
	} while (last <= end);

	mutex_unlock(&inode->i_mutex);

	if (dataoff > isize)
		return -ENXIO;

	return vfs_setpos(file, dataoff, maxsize);
}

/*
 * ext4_seek_hole() retrieves the offset for SEEK_HOLE.
 */
static loff_t ext4_seek_hole(struct file *file, loff_t offset, loff_t maxsize)
{
	struct inode *inode = file->f_mapping->host;
	struct ext4_map_blocks map;
	struct extent_status es;
	ext4_lblk_t start, last, end;
	loff_t holeoff, isize;
	int blkbits;
	int ret = 0;

	mutex_lock(&inode->i_mutex);

	isize = i_size_read(inode);
	if (offset >= isize) {
		mutex_unlock(&inode->i_mutex);
		return -ENXIO;
	}

	blkbits = inode->i_sb->s_blocksize_bits;
	start = offset >> blkbits;
	last = start;
	end = isize >> blkbits;
	holeoff = offset;

	do {
		map.m_lblk = last;
		map.m_len = end - last + 1;
		ret = ext4_map_blocks(NULL, inode, &map, 0);
		if (ret > 0 && !(map.m_flags & EXT4_MAP_UNWRITTEN)) {
			last += ret;
			holeoff = (loff_t)last << blkbits;
			continue;
		}

		/*
		 * If there is a delayed extent at this offset,
		 * we will skip this extent.
		 */
		ext4_es_find_delayed_extent_range(inode, last, last, &es);
		if (es.es_len != 0 && in_range(last, es.es_lblk, es.es_len)) {
			last = es.es_lblk + es.es_len;
			holeoff = (loff_t)last << blkbits;
			continue;
		}

		/*
		 * If there is an unwritten extent at this offset, it is
		 * treated as data or as a hole depending on whether the
		 * page cache has data for it.
		 */
		if (map.m_flags & EXT4_MAP_UNWRITTEN) {
			int unwritten;
			unwritten = ext4_find_unwritten_pgoff(inode, SEEK_HOLE,
							      &map, &holeoff);
			if (!unwritten) {
				last += ret;
				holeoff = (loff_t)last << blkbits;
				continue;
			}
		}

		/* find a hole */
		break;
	} while (last <= end);

	mutex_unlock(&inode->i_mutex);

	if (holeoff > isize)
		holeoff = isize;

	return vfs_setpos(file, holeoff, maxsize);
}

/*
 * ext4_llseek() handles both block-mapped and extent-mapped maxbytes values
 * by calling generic_file_llseek_size() with the appropriate maxbytes
 * value for each.
 */
loff_t ext4_llseek(struct file *file, loff_t offset, int whence)
{
	struct inode *inode = file->f_mapping->host;
	loff_t maxbytes;

	if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
		maxbytes = EXT4_SB(inode->i_sb)->s_bitmap_maxbytes;
	else
		maxbytes = inode->i_sb->s_maxbytes;

	switch (whence) {
	case SEEK_SET:
	case SEEK_CUR:
	case SEEK_END:
		return generic_file_llseek_size(file, offset, whence,
						maxbytes, i_size_read(inode));
	case SEEK_DATA:
		return ext4_seek_data(file, offset, maxbytes);
	case SEEK_HOLE:
		return ext4_seek_hole(file, offset, maxbytes);
	}

	return -EINVAL;
}

const struct file_operations ext4_file_operations = {
	.llseek		= ext4_llseek,
	.read_iter	= generic_file_read_iter,
	.write_iter	= ext4_file_write_iter,
	.unlocked_ioctl = ext4_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl	= ext4_compat_ioctl,
#endif
	.mmap		= ext4_file_mmap,
	.open		= ext4_file_open,
	.release	= ext4_release_file,
	.fsync		= ext4_sync_file,
	.splice_read	= generic_file_splice_read,
	.splice_write	= iter_file_splice_write,
	.fallocate	= ext4_fallocate,
};

const struct inode_operations ext4_file_inode_operations = {
	.setattr	= ext4_setattr,
	.getattr	= ext4_getattr,
	.setxattr	= generic_setxattr,
	.getxattr	= generic_getxattr,
	.listxattr	= ext4_listxattr,
	.removexattr	= generic_removexattr,
	.get_acl	= ext4_get_acl,
	.set_acl	= ext4_set_acl,
	.fiemap		= ext4_fiemap,
};