/*
 * Copyright (c) 2000-2005 Silicon Graphics, Inc.
 * All Rights Reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it would be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_da_format.h"
#include "xfs_da_btree.h"
#include "xfs_inode.h"
#include "xfs_trans.h"
#include "xfs_inode_item.h"
#include "xfs_bmap.h"
#include "xfs_bmap_util.h"
#include "xfs_error.h"
#include "xfs_dir2.h"
#include "xfs_dir2_priv.h"
#include "xfs_ioctl.h"
#include "xfs_trace.h"
#include "xfs_log.h"
#include "xfs_icache.h"
#include "xfs_pnfs.h"

#include <linux/dcache.h>
#include <linux/falloc.h>
#include <linux/pagevec.h>
#include <linux/backing-dev.h>

static const struct vm_operations_struct xfs_file_vm_ops;

/*
 * Locking primitives for read and write IO paths to ensure we consistently use
 * and order the inode->i_mutex, ip->i_lock and ip->i_iolock.
 */
static inline void
xfs_rw_ilock(
	struct xfs_inode	*ip,
	int			type)
{
	if (type & XFS_IOLOCK_EXCL)
		inode_lock(VFS_I(ip));
	xfs_ilock(ip, type);
}

static inline void
xfs_rw_iunlock(
	struct xfs_inode	*ip,
	int			type)
{
	xfs_iunlock(ip, type);
	if (type & XFS_IOLOCK_EXCL)
		inode_unlock(VFS_I(ip));
}

static inline void
xfs_rw_ilock_demote(
	struct xfs_inode	*ip,
	int			type)
{
	xfs_ilock_demote(ip, type);
	if (type & XFS_IOLOCK_EXCL)
		inode_unlock(VFS_I(ip));
}

/*
 * xfs_iozero clears the specified range supplied via the page cache (except in
 * the DAX case). Writes through the page cache will allocate blocks over holes,
 * though the callers usually map the holes first and avoid them. If a block is
 * not completely zeroed, then it will be read from disk before being partially
 * zeroed.
 *
 * In the DAX case, we can just directly write to the underlying pages. This
 * will not allocate blocks, but will avoid holes and unwritten extents and so
 * not do unnecessary work.
 */
int
xfs_iozero(
	struct xfs_inode	*ip,	/* inode			*/
	loff_t			pos,	/* offset in file		*/
	size_t			count)	/* size of data to zero		*/
{
	struct page		*page;
	struct address_space	*mapping;
	int			status = 0;


	mapping = VFS_I(ip)->i_mapping;
	do {
		unsigned offset, bytes;
		void *fsdata;

		offset = (pos & (PAGE_SIZE - 1)); /* Within page */
		bytes = PAGE_SIZE - offset;
		if (bytes > count)
			bytes = count;

		if (IS_DAX(VFS_I(ip))) {
			status = dax_zero_page_range(VFS_I(ip), pos, bytes,
						     xfs_get_blocks_direct);
			if (status)
				break;
		} else {
			status = pagecache_write_begin(NULL, mapping, pos, bytes,
						AOP_FLAG_UNINTERRUPTIBLE,
						&page, &fsdata);
			if (status)
				break;

			zero_user(page, offset, bytes);

			status = pagecache_write_end(NULL, mapping, pos, bytes,
						bytes, page, fsdata);
			WARN_ON(status <= 0); /* can't return less than zero! */
			status = 0;
		}
		pos += bytes;
		count -= bytes;
	} while (count);

	return status;
}

int
xfs_update_prealloc_flags(
	struct xfs_inode	*ip,
	enum xfs_prealloc_flags	flags)
{
	struct xfs_trans	*tp;
	int			error;

	error = xfs_trans_alloc(ip->i_mount, &M_RES(ip->i_mount)->tr_writeid,
			0, 0, 0, &tp);
	if (error)
		return error;

	xfs_ilock(ip, XFS_ILOCK_EXCL);
	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);

	if (!(flags & XFS_PREALLOC_INVISIBLE)) {
		VFS_I(ip)->i_mode &= ~S_ISUID;
		if (VFS_I(ip)->i_mode & S_IXGRP)
			VFS_I(ip)->i_mode &= ~S_ISGID;
		xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
	}

	if (flags & XFS_PREALLOC_SET)
		ip->i_d.di_flags |= XFS_DIFLAG_PREALLOC;
	if (flags & XFS_PREALLOC_CLEAR)
		ip->i_d.di_flags &= ~XFS_DIFLAG_PREALLOC;

	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
	if (flags & XFS_PREALLOC_SYNC)
		xfs_trans_set_sync(tp);
	return xfs_trans_commit(tp);
}

/*
 * Fsync operations on directories are much simpler than on regular files,
 * as there is no file data to flush, and thus also no need for explicit
 * cache flush operations, and there are no non-transaction metadata updates
 * on directories either.
 */
STATIC int
xfs_dir_fsync(
	struct file		*file,
	loff_t			start,
	loff_t			end,
	int			datasync)
{
	struct xfs_inode	*ip = XFS_I(file->f_mapping->host);
	struct xfs_mount	*mp = ip->i_mount;
	xfs_lsn_t		lsn = 0;

	trace_xfs_dir_fsync(ip);

	xfs_ilock(ip, XFS_ILOCK_SHARED);
	if (xfs_ipincount(ip))
		lsn = ip->i_itemp->ili_last_lsn;
	xfs_iunlock(ip, XFS_ILOCK_SHARED);

	if (!lsn)
		return 0;
	return _xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, NULL);
}

STATIC int
xfs_file_fsync(
	struct file		*file,
	loff_t			start,
	loff_t			end,
	int			datasync)
{
	struct inode		*inode = file->f_mapping->host;
	struct xfs_inode	*ip = XFS_I(inode);
	struct xfs_mount	*mp = ip->i_mount;
	int			error = 0;
	int			log_flushed = 0;
	xfs_lsn_t		lsn = 0;

	trace_xfs_file_fsync(ip);

	error = filemap_write_and_wait_range(inode->i_mapping, start, end);
	if (error)
		return error;

	if (XFS_FORCED_SHUTDOWN(mp))
		return -EIO;

	xfs_iflags_clear(ip, XFS_ITRUNCATED);

	if (mp->m_flags & XFS_MOUNT_BARRIER) {
		/*
		 * If we have an RT and/or log subvolume we need to make sure
		 * to flush the write cache of the device used for file data
		 * first.
		 * This is to ensure newly written file data makes it to disk
		 * before logging the new inode size in case of an extending
		 * write.
		 */
		if (XFS_IS_REALTIME_INODE(ip))
			xfs_blkdev_issue_flush(mp->m_rtdev_targp);
		else if (mp->m_logdev_targp != mp->m_ddev_targp)
			xfs_blkdev_issue_flush(mp->m_ddev_targp);
	}

	/*
	 * All metadata updates are logged, which means that we just have to
	 * flush the log up to the latest LSN that touched the inode. If we have
	 * concurrent fsync/fdatasync() calls, we need them to all block on the
	 * log force before we clear the ili_fsync_fields field. This ensures
	 * that we don't get a racing sync operation that does not wait for the
	 * metadata to hit the journal before returning. If we race with
	 * clearing the ili_fsync_fields, then all that will happen is the log
	 * force will do nothing as the lsn will already be on disk. We can't
	 * race with setting ili_fsync_fields because that is done under
	 * XFS_ILOCK_EXCL, and that can't happen because we hold the lock shared
	 * until after the ili_fsync_fields is cleared.
	 */
	xfs_ilock(ip, XFS_ILOCK_SHARED);
	if (xfs_ipincount(ip)) {
		if (!datasync ||
		    (ip->i_itemp->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP))
			lsn = ip->i_itemp->ili_last_lsn;
	}

	if (lsn) {
		error = _xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, &log_flushed);
		ip->i_itemp->ili_fsync_fields = 0;
	}
	xfs_iunlock(ip, XFS_ILOCK_SHARED);

	/*
	 * If we only have a single device, and the log force above was
	 * a no-op we might have to flush the data device cache here.
	 * This can only happen for fdatasync/O_DSYNC if we were overwriting
	 * an already allocated file and thus do not have any metadata to
	 * commit.
	 */
	if ((mp->m_flags & XFS_MOUNT_BARRIER) &&
	    mp->m_logdev_targp == mp->m_ddev_targp &&
	    !XFS_IS_REALTIME_INODE(ip) &&
	    !log_flushed)
		xfs_blkdev_issue_flush(mp->m_ddev_targp);

	return error;
}

STATIC ssize_t
xfs_file_read_iter(
	struct kiocb		*iocb,
	struct iov_iter		*to)
{
	struct file		*file = iocb->ki_filp;
	struct inode		*inode = file->f_mapping->host;
	struct xfs_inode	*ip = XFS_I(inode);
	struct xfs_mount	*mp = ip->i_mount;
	size_t			size = iov_iter_count(to);
	ssize_t			ret = 0;
	int			ioflags = 0;
	xfs_fsize_t		n;
	loff_t			pos = iocb->ki_pos;

	XFS_STATS_INC(mp, xs_read_calls);

	if (unlikely(iocb->ki_flags & IOCB_DIRECT))
		ioflags |= XFS_IO_ISDIRECT;
	if (file->f_mode & FMODE_NOCMTIME)
		ioflags |= XFS_IO_INVIS;

	if ((ioflags & XFS_IO_ISDIRECT) && !IS_DAX(inode)) {
		xfs_buftarg_t	*target =
			XFS_IS_REALTIME_INODE(ip) ?
				mp->m_rtdev_targp : mp->m_ddev_targp;
		/* DIO must be aligned to device logical sector size */
		if ((pos | size) & target->bt_logical_sectormask) {
			if (pos == i_size_read(inode))
				return 0;
			return -EINVAL;
		}
	}

	n = mp->m_super->s_maxbytes - pos;
	if (n <= 0 || size == 0)
		return 0;

	if (n < size)
		size = n;

	if (XFS_FORCED_SHUTDOWN(mp))
		return -EIO;

	/*
	 * Locking is a bit tricky here. If we take an exclusive lock for direct
	 * IO, we effectively serialise all new concurrent read IO to this file
	 * and block it behind IO that is currently in progress because IO in
	 * progress holds the IO lock shared.
	 * We only need to hold the lock exclusive to blow away the page cache,
	 * so only take the lock exclusively if the page cache needs
	 * invalidation. This allows the normal direct IO case of no page cache
	 * pages to proceed concurrently without serialisation.
	 */
	xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
	if ((ioflags & XFS_IO_ISDIRECT) && inode->i_mapping->nrpages) {
		xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
		xfs_rw_ilock(ip, XFS_IOLOCK_EXCL);

		/*
		 * The generic dio code only flushes the range of the particular
		 * I/O. Because we take an exclusive lock here, this whole
		 * sequence is considerably more expensive for us. This has a
		 * noticeable performance impact for any file with cached pages,
		 * even when outside of the range of the particular I/O.
		 *
		 * Hence, amortize the cost of the lock against a full file
		 * flush and reduce the chances of repeated iolock cycles going
		 * forward.
		 */
		if (inode->i_mapping->nrpages) {
			ret = filemap_write_and_wait(VFS_I(ip)->i_mapping);
			if (ret) {
				xfs_rw_iunlock(ip, XFS_IOLOCK_EXCL);
				return ret;
			}

			/*
			 * Invalidate whole pages. This can return an error if
			 * we fail to invalidate a page, but this should never
			 * happen on XFS. Warn if it does fail.
			 */
			ret = invalidate_inode_pages2(VFS_I(ip)->i_mapping);
			WARN_ON_ONCE(ret);
			ret = 0;
		}
		xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL);
	}

	trace_xfs_file_read(ip, size, pos, ioflags);

	ret = generic_file_read_iter(iocb, to);
	if (ret > 0)
		XFS_STATS_ADD(mp, xs_read_bytes, ret);

	xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
	return ret;
}

STATIC ssize_t
xfs_file_splice_read(
	struct file		*infilp,
	loff_t			*ppos,
	struct pipe_inode_info	*pipe,
	size_t			count,
	unsigned int		flags)
{
	struct xfs_inode	*ip = XFS_I(infilp->f_mapping->host);
	int			ioflags = 0;
	ssize_t			ret;

	XFS_STATS_INC(ip->i_mount, xs_read_calls);

	if (infilp->f_mode & FMODE_NOCMTIME)
		ioflags |= XFS_IO_INVIS;

	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
		return -EIO;

	trace_xfs_file_splice_read(ip, count, *ppos, ioflags);

	/*
	 * DAX inodes cannot use the page cache for splice, so we have to push
	 * them through the VFS IO path. This means it goes through
	 * ->read_iter, which for us takes the XFS_IOLOCK_SHARED. Hence we
	 * cannot lock the splice operation at this level for DAX inodes.
	 */
	if (IS_DAX(VFS_I(ip))) {
		ret = default_file_splice_read(infilp, ppos, pipe, count,
					       flags);
		goto out;
	}

	xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
	ret = generic_file_splice_read(infilp, ppos, pipe, count, flags);
	xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
out:
	if (ret > 0)
		XFS_STATS_ADD(ip->i_mount, xs_read_bytes, ret);
	return ret;
}

/*
 * This routine is called to handle zeroing any space in the last block of the
 * file that is beyond the EOF. We do this since the size is being increased
 * without writing anything to that block and we don't want to read the
 * garbage on the disk.
 */
STATIC int				/* error (positive) */
xfs_zero_last_block(
	struct xfs_inode	*ip,
	xfs_fsize_t		offset,
	xfs_fsize_t		isize,
	bool			*did_zeroing)
{
	struct xfs_mount	*mp = ip->i_mount;
	xfs_fileoff_t		last_fsb = XFS_B_TO_FSBT(mp, isize);
	int			zero_offset = XFS_B_FSB_OFFSET(mp, isize);
	int			zero_len;
	int			nimaps = 1;
	int			error = 0;
	struct xfs_bmbt_irec	imap;

	xfs_ilock(ip, XFS_ILOCK_EXCL);
	error = xfs_bmapi_read(ip, last_fsb, 1, &imap, &nimaps, 0);
	xfs_iunlock(ip, XFS_ILOCK_EXCL);
	if (error)
		return error;

	ASSERT(nimaps > 0);

	/*
	 * If the block underlying isize is just a hole, then there
	 * is nothing to zero.
	 */
	if (imap.br_startblock == HOLESTARTBLOCK)
		return 0;

	zero_len = mp->m_sb.sb_blocksize - zero_offset;
	if (isize + zero_len > offset)
		zero_len = offset - isize;
	*did_zeroing = true;
	return xfs_iozero(ip, isize, zero_len);
}

/*
 * Zero any on disk space between the current EOF and the new, larger EOF.
 *
 * This handles the normal case of zeroing the remainder of the last block in
 * the file and the unusual case of zeroing blocks out beyond the size of the
 * file. This second case only happens with fixed size extents and when the
 * system crashes before the inode size was updated but after blocks were
 * allocated.
 *
 * Expects the iolock to be held exclusive, and will take the ilock internally.
 */
int					/* error (positive) */
xfs_zero_eof(
	struct xfs_inode	*ip,
	xfs_off_t		offset,		/* starting I/O offset */
	xfs_fsize_t		isize,		/* current inode size */
	bool			*did_zeroing)
{
	struct xfs_mount	*mp = ip->i_mount;
	xfs_fileoff_t		start_zero_fsb;
	xfs_fileoff_t		end_zero_fsb;
	xfs_fileoff_t		zero_count_fsb;
	xfs_fileoff_t		last_fsb;
	xfs_fileoff_t		zero_off;
	xfs_fsize_t		zero_len;
	int			nimaps;
	int			error = 0;
	struct xfs_bmbt_irec	imap;

	ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
	ASSERT(offset > isize);

	trace_xfs_zero_eof(ip, isize, offset - isize);

	/*
	 * First handle zeroing the block on which isize resides.
	 *
	 * We only zero a part of that block so it is handled specially.
	 */
	if (XFS_B_FSB_OFFSET(mp, isize) != 0) {
		error = xfs_zero_last_block(ip, offset, isize, did_zeroing);
		if (error)
			return error;
	}

	/*
	 * Calculate the range between the new size and the old where blocks
	 * needing to be zeroed may exist.
	 *
	 * To get the block where the last byte in the file currently resides,
	 * we need to subtract one from the size and truncate back to a block
	 * boundary. We subtract 1 in case the size is exactly on a block
	 * boundary.
	 */
	last_fsb = isize ? XFS_B_TO_FSBT(mp, isize - 1) : (xfs_fileoff_t)-1;
	start_zero_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)isize);
	end_zero_fsb = XFS_B_TO_FSBT(mp, offset - 1);
	ASSERT((xfs_sfiloff_t)last_fsb < (xfs_sfiloff_t)start_zero_fsb);
	if (last_fsb == end_zero_fsb) {
		/*
		 * The size was only incremented on its last block.
		 * We took care of that above, so just return.
		 */
		return 0;
	}

	ASSERT(start_zero_fsb <= end_zero_fsb);
	while (start_zero_fsb <= end_zero_fsb) {
		nimaps = 1;
		zero_count_fsb = end_zero_fsb - start_zero_fsb + 1;

		xfs_ilock(ip, XFS_ILOCK_EXCL);
		error = xfs_bmapi_read(ip, start_zero_fsb, zero_count_fsb,
				       &imap, &nimaps, 0);
		xfs_iunlock(ip, XFS_ILOCK_EXCL);
		if (error)
			return error;

		ASSERT(nimaps > 0);

		if (imap.br_state == XFS_EXT_UNWRITTEN ||
		    imap.br_startblock == HOLESTARTBLOCK) {
			start_zero_fsb = imap.br_startoff + imap.br_blockcount;
			ASSERT(start_zero_fsb <= (end_zero_fsb + 1));
			continue;
		}

		/*
		 * There are blocks we need to zero.
		 */
		zero_off = XFS_FSB_TO_B(mp, start_zero_fsb);
		zero_len = XFS_FSB_TO_B(mp, imap.br_blockcount);

		if ((zero_off + zero_len) > offset)
			zero_len = offset - zero_off;

		error = xfs_iozero(ip, zero_off, zero_len);
		if (error)
			return error;

		*did_zeroing = true;
		start_zero_fsb = imap.br_startoff + imap.br_blockcount;
		ASSERT(start_zero_fsb <= (end_zero_fsb + 1));
	}

	return 0;
}

/*
 * Common pre-write limit and setup checks.
 *
 * Called with the iolock held either shared or exclusive according to
 * @iolock, and returns with it held. Might upgrade the iolock to exclusive
 * if called for a direct write beyond i_size.
 */
STATIC ssize_t
xfs_file_aio_write_checks(
	struct kiocb		*iocb,
	struct iov_iter		*from,
	int			*iolock)
{
	struct file		*file = iocb->ki_filp;
	struct inode		*inode = file->f_mapping->host;
	struct xfs_inode	*ip = XFS_I(inode);
	ssize_t			error = 0;
	size_t			count = iov_iter_count(from);
	bool			drained_dio = false;

restart:
	error = generic_write_checks(iocb, from);
	if (error <= 0)
		return error;

	error = xfs_break_layouts(inode, iolock, true);
	if (error)
		return error;

	/* For changing security info in file_remove_privs() we need i_mutex */
	if (*iolock == XFS_IOLOCK_SHARED && !IS_NOSEC(inode)) {
		xfs_rw_iunlock(ip, *iolock);
		*iolock = XFS_IOLOCK_EXCL;
		xfs_rw_ilock(ip, *iolock);
		goto restart;
	}
	/*
	 * If the offset is beyond the size of the file, we need to zero any
	 * blocks that fall between the existing EOF and the start of this
	 * write. If zeroing is needed and we are currently holding the
	 * iolock shared, we need to update it to exclusive which implies
	 * having to redo all checks before.
	 *
	 * We need to serialise against EOF updates that occur in IO
	 * completions here. We want to make sure that nobody is changing the
	 * size while we do this check until we have placed an IO barrier (i.e.
	 * hold the XFS_IOLOCK_EXCL) that prevents new IO from being dispatched.
	 * The spinlock effectively forms a memory barrier once we have the
	 * XFS_IOLOCK_EXCL so we are guaranteed to see the latest EOF value
	 * and hence be able to correctly determine if we need to run zeroing.
	 */
	spin_lock(&ip->i_flags_lock);
	if (iocb->ki_pos > i_size_read(inode)) {
		bool	zero = false;

		spin_unlock(&ip->i_flags_lock);
		if (!drained_dio) {
			if (*iolock == XFS_IOLOCK_SHARED) {
				xfs_rw_iunlock(ip, *iolock);
				*iolock = XFS_IOLOCK_EXCL;
				xfs_rw_ilock(ip, *iolock);
				iov_iter_reexpand(from, count);
			}
			/*
			 * We now have an IO submission barrier in place, but
			 * AIO can do EOF updates during IO completion and hence
			 * we now need to wait for all of them to drain. Non-AIO
			 * DIO will have drained before we are given the
			 * XFS_IOLOCK_EXCL, and so for most cases this wait is a
			 * no-op.
			 */
			inode_dio_wait(inode);
			drained_dio = true;
			goto restart;
		}
		error = xfs_zero_eof(ip, iocb->ki_pos, i_size_read(inode), &zero);
		if (error)
			return error;
	} else
		spin_unlock(&ip->i_flags_lock);

	/*
	 * Updating the timestamps will grab the ilock again from
	 * xfs_fs_dirty_inode, so we have to call it after dropping the
	 * lock above. Eventually we should look into a way to avoid
	 * the pointless lock roundtrip.
	 */
	if (likely(!(file->f_mode & FMODE_NOCMTIME))) {
		error = file_update_time(file);
		if (error)
			return error;
	}

	/*
	 * If we're writing the file then make sure to clear the setuid and
	 * setgid bits if the process is not being run by root. This keeps
	 * people from modifying setuid and setgid binaries.
	 */
	if (!IS_NOSEC(inode))
		return file_remove_privs(file);
	return 0;
}

/*
 * xfs_file_dio_aio_write - handle direct IO writes
 *
 * Lock the inode appropriately to prepare for and issue a direct IO write.
 * By separating it from the buffered write path we remove all the
 * tricky-to-follow locking changes and looping.
 *
 * If there are cached pages or we're extending the file, we need IOLOCK_EXCL
 * until we're sure the bytes at the new EOF have been zeroed and/or the cached
 * pages are flushed out.
 *
 * In most cases the direct IO writes will be done holding IOLOCK_SHARED
 * allowing them to be done in parallel with reads and other direct IO writes.
 * However, if the IO is not aligned to filesystem blocks, the direct IO layer
 * needs to do sub-block zeroing and that requires serialisation against other
 * direct IOs to the same block. In this case we need to serialise the
 * submission of the unaligned IOs so that we don't get racing block zeroing in
 * the dio layer. To avoid the problem with aio, we also need to wait for
 * outstanding IOs to complete so that unwritten extent conversion is completed
 * before we try to map the overlapping block. This is currently implemented by
 * hitting it with a big hammer (i.e. inode_dio_wait()).
 *
 * Returns with locks held indicated by @iolock and errors indicated by
 * negative return values.
 */
STATIC ssize_t
xfs_file_dio_aio_write(
	struct kiocb		*iocb,
	struct iov_iter		*from)
{
	struct file		*file = iocb->ki_filp;
	struct address_space	*mapping = file->f_mapping;
	struct inode		*inode = mapping->host;
	struct xfs_inode	*ip = XFS_I(inode);
	struct xfs_mount	*mp = ip->i_mount;
	ssize_t			ret = 0;
	int			unaligned_io = 0;
	int			iolock;
	size_t			count = iov_iter_count(from);
	loff_t			end;
	struct iov_iter		data;
	struct xfs_buftarg	*target = XFS_IS_REALTIME_INODE(ip) ?
					mp->m_rtdev_targp : mp->m_ddev_targp;

	/* DIO must be aligned to device logical sector size */
	if (!IS_DAX(inode) &&
	    ((iocb->ki_pos | count) & target->bt_logical_sectormask))
		return -EINVAL;

	/* "unaligned" here means not aligned to a filesystem block */
	if ((iocb->ki_pos & mp->m_blockmask) ||
	    ((iocb->ki_pos + count) & mp->m_blockmask))
		unaligned_io = 1;

	/*
	 * We don't need to take an exclusive lock unless the page cache needs
	 * to be invalidated or unaligned IO is being executed. We don't need to
	 * consider the EOF extension case here because
	 * xfs_file_aio_write_checks() will relock the inode as necessary for
	 * EOF zeroing cases and fill out the new inode size as appropriate.
	 */
	if (unaligned_io || mapping->nrpages)
		iolock = XFS_IOLOCK_EXCL;
	else
		iolock = XFS_IOLOCK_SHARED;
	xfs_rw_ilock(ip, iolock);

	/*
	 * Recheck if there are cached pages that need invalidating after we
	 * got the iolock to protect against other threads adding new pages
	 * while we were waiting for the iolock.
	 */
	if (mapping->nrpages && iolock == XFS_IOLOCK_SHARED) {
		xfs_rw_iunlock(ip, iolock);
		iolock = XFS_IOLOCK_EXCL;
		xfs_rw_ilock(ip, iolock);
	}

	ret = xfs_file_aio_write_checks(iocb, from, &iolock);
	if (ret)
		goto out;
	count = iov_iter_count(from);
	end = iocb->ki_pos + count - 1;

	/*
	 * See xfs_file_read_iter() for why we do a full-file flush here.
	 */
	if (mapping->nrpages) {
		ret = filemap_write_and_wait(VFS_I(ip)->i_mapping);
		if (ret)
			goto out;
		/*
		 * Invalidate whole pages. This can return an error if we fail
		 * to invalidate a page, but this should never happen on XFS.
		 * Warn if it does fail.
		 */
		ret = invalidate_inode_pages2(VFS_I(ip)->i_mapping);
		WARN_ON_ONCE(ret);
		ret = 0;
	}

	/*
	 * If we are doing unaligned IO, wait for all other IO to drain,
	 * otherwise demote the lock if we had to flush cached pages
	 */
	if (unaligned_io)
		inode_dio_wait(inode);
	else if (iolock == XFS_IOLOCK_EXCL) {
		xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL);
		iolock = XFS_IOLOCK_SHARED;
	}

	trace_xfs_file_direct_write(ip, count, iocb->ki_pos, 0);

	data = *from;
	ret = mapping->a_ops->direct_IO(iocb, &data);

	/* see generic_file_direct_write() for why this is necessary */
	if (mapping->nrpages) {
		invalidate_inode_pages2_range(mapping,
					      iocb->ki_pos >> PAGE_SHIFT,
					      end >> PAGE_SHIFT);
	}

	if (ret > 0) {
		iocb->ki_pos += ret;
		iov_iter_advance(from, ret);
	}
out:
	xfs_rw_iunlock(ip, iolock);

	/*
	 * No fallback to buffered IO on errors for XFS. DAX can result in
	 * partial writes, but direct IO will either complete fully or fail.
	 */
	ASSERT(ret < 0 || ret == count || IS_DAX(VFS_I(ip)));
	return ret;
}

STATIC ssize_t
xfs_file_buffered_aio_write(
	struct kiocb		*iocb,
	struct iov_iter		*from)
{
	struct file		*file = iocb->ki_filp;
	struct address_space	*mapping = file->f_mapping;
	struct inode		*inode = mapping->host;
	struct xfs_inode	*ip = XFS_I(inode);
	ssize_t			ret;
	int			enospc = 0;
	int			iolock = XFS_IOLOCK_EXCL;

	xfs_rw_ilock(ip, iolock);

	ret = xfs_file_aio_write_checks(iocb, from, &iolock);
	if (ret)
		goto out;

	/* We can write back this queue in page reclaim */
	current->backing_dev_info = inode_to_bdi(inode);

write_retry:
	trace_xfs_file_buffered_write(ip, iov_iter_count(from),
				      iocb->ki_pos, 0);
	ret = generic_perform_write(file, from, iocb->ki_pos);
	if (likely(ret >= 0))
		iocb->ki_pos += ret;

	/*
	 * If we hit a space limit, try to free up some lingering preallocated
	 * space before returning an error. In the case of ENOSPC, first try to
	 * write back all dirty inodes to free up some of the excess reserved
	 * metadata space. This reduces the chances that the eofblocks scan
	 * waits on dirty mappings. Since xfs_flush_inodes() is serialized, this
	 * also behaves as a filter to prevent too many eofblocks scans from
	 * running at the same time.
	 */
	if (ret == -EDQUOT && !enospc) {
		enospc = xfs_inode_free_quota_eofblocks(ip);
		if (enospc)
			goto write_retry;
	} else if (ret == -ENOSPC && !enospc) {
		struct xfs_eofblocks eofb = {0};

		enospc = 1;
		xfs_flush_inodes(ip->i_mount);
		eofb.eof_scan_owner = ip->i_ino; /* for locking */
		eofb.eof_flags = XFS_EOF_FLAGS_SYNC;
		xfs_icache_free_eofblocks(ip->i_mount, &eofb);
		goto write_retry;
	}

	current->backing_dev_info = NULL;
out:
	xfs_rw_iunlock(ip, iolock);
	return ret;
}

STATIC ssize_t
xfs_file_write_iter(
	struct kiocb		*iocb,
	struct iov_iter		*from)
{
	struct file		*file = iocb->ki_filp;
	struct address_space	*mapping = file->f_mapping;
	struct inode		*inode = mapping->host;
	struct xfs_inode	*ip = XFS_I(inode);
	ssize_t			ret;
	size_t			ocount = iov_iter_count(from);

	XFS_STATS_INC(ip->i_mount, xs_write_calls);

	if (ocount == 0)
		return 0;

	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
		return -EIO;

	if ((iocb->ki_flags & IOCB_DIRECT) || IS_DAX(inode))
		ret = xfs_file_dio_aio_write(iocb, from);
	else
		ret = xfs_file_buffered_aio_write(iocb, from);

	if (ret > 0) {
		XFS_STATS_ADD(ip->i_mount, xs_write_bytes, ret);

		/* Handle various SYNC-type writes */
		ret = generic_write_sync(iocb, ret);
	}
	return ret;
}

#define XFS_FALLOC_FL_SUPPORTED						\
		(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |		\
		 FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE |	\
		 FALLOC_FL_INSERT_RANGE)

STATIC long
xfs_file_fallocate(
	struct file		*file,
	int			mode,
	loff_t			offset,
	loff_t			len)
{
	struct inode		*inode = file_inode(file);
	struct xfs_inode	*ip = XFS_I(inode);
	long			error;
	enum xfs_prealloc_flags	flags = 0;
	uint			iolock = XFS_IOLOCK_EXCL;
	loff_t			new_size = 0;
	bool			do_file_insert = 0;

	if (!S_ISREG(inode->i_mode))
		return -EINVAL;
	if (mode & ~XFS_FALLOC_FL_SUPPORTED)
		return -EOPNOTSUPP;

	xfs_ilock(ip, iolock);
	error = xfs_break_layouts(inode, &iolock, false);
	if (error)
		goto out_unlock;

	xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
	iolock |= XFS_MMAPLOCK_EXCL;

	if (mode & FALLOC_FL_PUNCH_HOLE) {
		error = xfs_free_file_space(ip, offset, len);
		if (error)
			goto out_unlock;
	} else if (mode & FALLOC_FL_COLLAPSE_RANGE) {
		unsigned blksize_mask = (1 << inode->i_blkbits) - 1;

		if (offset & blksize_mask || len & blksize_mask) {
			error = -EINVAL;
			goto out_unlock;
		}

		/*
		 * There is no need to overlap collapse range with EOF,
		 * in which case it is effectively a truncate operation
		 */
		if (offset + len >= i_size_read(inode)) {
			error = -EINVAL;
			goto out_unlock;
		}

		new_size = i_size_read(inode) - len;

		error = xfs_collapse_file_space(ip, offset, len);
		if (error)
			goto out_unlock;
	} else if (mode & FALLOC_FL_INSERT_RANGE) {
		unsigned blksize_mask = (1 << inode->i_blkbits) - 1;

		new_size = i_size_read(inode) + len;
		if (offset & blksize_mask || len & blksize_mask) {
			error = -EINVAL;
			goto out_unlock;
		}

		/* check the new inode size does not wrap through zero */
		if (new_size > inode->i_sb->s_maxbytes) {
			error = -EFBIG;
			goto out_unlock;
		}

		/* Offset should be less than i_size */
		if (offset >= i_size_read(inode)) {
			error = -EINVAL;
			goto out_unlock;
		}
		do_file_insert = 1;
	} else {
		flags |= XFS_PREALLOC_SET;

		if (!(mode & FALLOC_FL_KEEP_SIZE) &&
		    offset + len > i_size_read(inode)) {
			new_size = offset + len;
			error = inode_newsize_ok(inode, new_size);
			if (error)
				goto out_unlock;
		}

		if (mode & FALLOC_FL_ZERO_RANGE)
			error = xfs_zero_file_space(ip, offset, len);
		else
			error = xfs_alloc_file_space(ip, offset, len,
						     XFS_BMAPI_PREALLOC);
		if (error)
			goto out_unlock;
	}

	if (file->f_flags & O_DSYNC)
		flags |= XFS_PREALLOC_SYNC;

	error = xfs_update_prealloc_flags(ip, flags);
	if (error)
		goto out_unlock;

	/* Change file size if needed */
	if (new_size) {
		struct iattr iattr;

		iattr.ia_valid = ATTR_SIZE;
		iattr.ia_size = new_size;
		error = xfs_setattr_size(ip, &iattr);
		if (error)
			goto out_unlock;
	}

	/*
	 * Perform hole insertion now that the file size has been updated so
	 * that if we crash during the operation we don't leave shifted
	 * extents past EOF and hence lose access to the data that is
	 * contained within them.
	 */
	if (do_file_insert)
		error = xfs_insert_file_space(ip, offset, len);

out_unlock:
	xfs_iunlock(ip, iolock);
	return error;
}


STATIC int
xfs_file_open(
	struct inode	*inode,
	struct file	*file)
{
	if (!(file->f_flags & O_LARGEFILE) && i_size_read(inode) > MAX_NON_LFS)
		return -EFBIG;
	if (XFS_FORCED_SHUTDOWN(XFS_M(inode->i_sb)))
		return -EIO;
	return 0;
}

STATIC int
xfs_dir_open(
	struct inode	*inode,
	struct file	*file)
{
	struct xfs_inode *ip = XFS_I(inode);
	int		mode;
	int		error;

	error = xfs_file_open(inode, file);
	if (error)
		return error;

	/*
	 * If there are any blocks, read-ahead block 0 as we're almost
	 * certain to have the next operation be a read there.
	 */
	mode = xfs_ilock_data_map_shared(ip);
	if (ip->i_d.di_nextents > 0)
		xfs_dir3_data_readahead(ip, 0, -1);
	xfs_iunlock(ip, mode);
	return 0;
}

STATIC int
xfs_file_release(
	struct inode	*inode,
	struct file	*filp)
{
	return xfs_release(XFS_I(inode));
}

STATIC int
xfs_file_readdir(
	struct file	*file,
	struct dir_context *ctx)
{
	struct inode	*inode = file_inode(file);
	xfs_inode_t	*ip = XFS_I(inode);
	size_t		bufsize;

	/*
	 * The Linux API doesn't pass the total size of the buffer we read
	 * into down to the filesystem. With the filldir concept it's not
	 * needed for correct information, but the XFS dir2 leaf code wants
	 * an estimate of the buffer size to calculate its readahead window
	 * and size the buffers used for mapping to physical blocks.
	 *
	 * Try to give it an estimate that's good enough, maybe at some
	 * point we can change the ->readdir prototype to include the
	 * buffer size. For now we use the current glibc buffer size.
	 */
	bufsize = (size_t)min_t(loff_t, 32768, ip->i_d.di_size);

	return xfs_readdir(ip, ctx, bufsize);
}

/*
 * This type is designed to indicate the type of offset we would like
 * to search from page cache for xfs_seek_hole_data().
 */
enum {
	HOLE_OFF = 0,
	DATA_OFF,
};

/*
 * Lookup the desired type of offset from the given page.
 *
 * On success, return true and the offset argument will point to the
 * start of the region that was found. Otherwise this function will
 * return false and keep the offset argument unchanged.
 */
STATIC bool
xfs_lookup_buffer_offset(
	struct page		*page,
	loff_t			*offset,
	unsigned int		type)
{
	loff_t			lastoff = page_offset(page);
	bool			found = false;
	struct buffer_head	*bh, *head;

	bh = head = page_buffers(page);
	do {
		/*
		 * Unwritten extents that have data in the page
		 * cache covering them can be identified by the
		 * BH_Unwritten state flag. Pages with multiple
		 * buffers might have a mix of holes, data and
		 * unwritten extents - any buffer with valid
		 * data in it should have BH_Uptodate flag set
		 * on it.
		 */
		if (buffer_unwritten(bh) ||
		    buffer_uptodate(bh)) {
			if (type == DATA_OFF)
				found = true;
		} else {
			if (type == HOLE_OFF)
				found = true;
		}

		if (found) {
			*offset = lastoff;
			break;
		}
		lastoff += bh->b_size;
	} while ((bh = bh->b_this_page) != head);

	return found;
}

/*
 * This routine is called to find out and return a data or hole offset
 * from the page cache for unwritten extents according to the desired
 * type for xfs_seek_hole_data().
 *
 * The argument offset is used to tell where we start to search from the
 * page cache. Map is used to figure out the end points of the range to
 * look up pages.
 *
 * Return true if the desired type of offset was found, and the argument
 * offset is filled with that address. Otherwise, return false and keep
 * offset unchanged.
 */
STATIC bool
xfs_find_get_desired_pgoff(
	struct inode		*inode,
	struct xfs_bmbt_irec	*map,
	unsigned int		type,
	loff_t			*offset)
{
	struct xfs_inode	*ip = XFS_I(inode);
	struct xfs_mount	*mp = ip->i_mount;
	struct pagevec		pvec;
	pgoff_t			index;
	pgoff_t			end;
	loff_t			endoff;
	loff_t			startoff = *offset;
	loff_t			lastoff = startoff;
	bool			found = false;

	pagevec_init(&pvec, 0);

	index = startoff >> PAGE_SHIFT;
	endoff = XFS_FSB_TO_B(mp, map->br_startoff + map->br_blockcount);
	end = endoff >> PAGE_SHIFT;
	do {
		int		want;
		unsigned	nr_pages;
		unsigned int	i;

		want = min_t(pgoff_t, end - index, PAGEVEC_SIZE);
		nr_pages = pagevec_lookup(&pvec, inode->i_mapping, index,
					  want);
		/*
		 * No page was mapped into the given range. If we are
		 * searching for holes and this is the first pass through the
		 * loop, the given offset landed in a hole, so return it.
		 *
		 * If we have already stepped through some block buffers
		 * looking for holes and they all contained data, then the
		 * last offset already points to the end of the last mapped
		 * page; if that has not reached the end of the search range,
		 * there must be a hole between them.
		 */
		if (nr_pages == 0) {
			/* Data search found nothing */
			if (type == DATA_OFF)
				break;

			ASSERT(type == HOLE_OFF);
			if (lastoff == startoff || lastoff < endoff) {
				found = true;
				*offset = lastoff;
			}
			break;
		}

		/*
		 * At least one page was found. If this is the first time we
		 * step into the loop, and if the first page index offset is
		 * greater than the given search offset, a hole was found.
		 */
		if (type == HOLE_OFF && lastoff == startoff &&
		    lastoff < page_offset(pvec.pages[0])) {
			found = true;
			break;
		}

		for (i = 0; i < nr_pages; i++) {
			struct page	*page = pvec.pages[i];
			loff_t		b_offset;

			/*
			 * At this point, the page may be truncated or
			 * invalidated (changing page->mapping to NULL),
			 * or even swizzled back from swapper_space to tmpfs
			 * file mapping. However, page->index will not change
			 * because we have a reference on the page.
			 *
			 * Searching is done if the page index is out of
			 * range. If the current offset has not reached the
			 * end of the specified search range, there should be
			 * a hole between them.
			 */
			if (page->index > end) {
				if (type == HOLE_OFF && lastoff < endoff) {
					*offset = lastoff;
					found = true;
				}
				goto out;
			}

			lock_page(page);
			/*
			 * Page truncated or invalidated (page->mapping == NULL).
			 * We can freely skip it and proceed to check the next
			 * page.
			 */
			if (unlikely(page->mapping != inode->i_mapping)) {
				unlock_page(page);
				continue;
			}

			if (!page_has_buffers(page)) {
				unlock_page(page);
				continue;
			}

			found = xfs_lookup_buffer_offset(page, &b_offset, type);
			if (found) {
				/*
				 * The found offset may be less than the start
				 * point to search if this is the first time to
				 * come here.
				 */
				*offset = max_t(loff_t, startoff, b_offset);
				unlock_page(page);
				goto out;
			}

			/*
			 * Either we were searching for data and found nothing,
			 * or we were searching for a hole and found a data
			 * buffer.
			 * In either case the next page probably contains what
			 * we are looking for, so update the last offset to
			 * point at it.
			 */
			lastoff = page_offset(page) + PAGE_SIZE;
			unlock_page(page);
		}

		/*
		 * Fewer pages were returned than we wanted, so the search is
		 * done. Nothing was found if we were searching for data, but
		 * if we were searching for a hole there is one behind the
		 * last offset.
		 */
		if (nr_pages < want) {
			if (type == HOLE_OFF) {
				*offset = lastoff;
				found = true;
			}
			break;
		}

		index = pvec.pages[i - 1]->index + 1;
		pagevec_release(&pvec);
	} while (index <= end);

out:
	pagevec_release(&pvec);
	return found;
}

/*
 * caller must lock inode with xfs_ilock_data_map_shared,
 * can we craft an appropriate ASSERT?
 *
 * end is because the VFS-level lseek interface is defined such that any
 * offset past i_size shall return -ENXIO, but we use this for quota code
 * which does not maintain i_size, and we want to SEEK_DATA past i_size.
 */
loff_t
__xfs_seek_hole_data(
	struct inode		*inode,
	loff_t			start,
	loff_t			end,
	int			whence)
{
	struct xfs_inode	*ip = XFS_I(inode);
	struct xfs_mount	*mp = ip->i_mount;
	loff_t			uninitialized_var(offset);
	xfs_fileoff_t		fsbno;
	xfs_filblks_t		lastbno;
	int			error;

	if (start >= end) {
		error = -ENXIO;
		goto out_error;
	}

	/*
	 * Try to read extents from the first block indicated
	 * by fsbno to the end block of the file.
	 */
	fsbno = XFS_B_TO_FSBT(mp, start);
	lastbno = XFS_B_TO_FSB(mp, end);

	for (;;) {
		struct xfs_bmbt_irec	map[2];
		int			nmap = 2;
		unsigned int		i;

		error = xfs_bmapi_read(ip, fsbno, lastbno - fsbno, map, &nmap,
				       XFS_BMAPI_ENTIRE);
		if (error)
			goto out_error;

		/* No extents at given offset, must be beyond EOF */
		if (nmap == 0) {
			error = -ENXIO;
			goto out_error;
		}

		for (i = 0; i < nmap; i++) {
			offset = max_t(loff_t, start,
				       XFS_FSB_TO_B(mp, map[i].br_startoff));

			/* Landed in the hole we wanted? */
			if (whence == SEEK_HOLE &&
			    map[i].br_startblock == HOLESTARTBLOCK)
				goto out;

			/* Landed in the data extent we wanted? */
			if (whence == SEEK_DATA &&
			    (map[i].br_startblock == DELAYSTARTBLOCK ||
			     (map[i].br_state == XFS_EXT_NORM &&
			      !isnullstartblock(map[i].br_startblock))))
				goto out;

			/*
			 * Landed in an unwritten extent, try to search
			 * for hole or data from page cache.
			 */
			if (map[i].br_state == XFS_EXT_UNWRITTEN) {
				if (xfs_find_get_desired_pgoff(inode, &map[i],
				      whence == SEEK_HOLE ? HOLE_OFF : DATA_OFF,
							      &offset))
					goto out;
			}
		}

		/*
		 * We only received one extent out of the two requested. This
		 * means we've hit EOF and didn't find what we are looking for.
		 */
		if (nmap == 1) {
			/*
			 * If we were looking for a hole, set offset to
			 * the end of the file (i.e., there is an implicit
			 * hole at the end of any file).
			 */
			if (whence == SEEK_HOLE) {
				offset = end;
				break;
			}
			/*
			 * If we were looking for data, it's nowhere to be found
			 */
			ASSERT(whence == SEEK_DATA);
			error = -ENXIO;
			goto out_error;
		}

		ASSERT(i > 1);

		/*
		 * Nothing was found, proceed to the next round of search
		 * if the next reading offset is not at or beyond EOF.
		 */
		fsbno = map[i - 1].br_startoff + map[i - 1].br_blockcount;
		start = XFS_FSB_TO_B(mp, fsbno);
		if (start >= end) {
			if (whence == SEEK_HOLE) {
				offset = end;
				break;
			}
			ASSERT(whence == SEEK_DATA);
			error = -ENXIO;
			goto out_error;
		}
	}

out:
	/*
	 * If at this point we have found the hole we wanted, the returned
	 * offset may be bigger than the file size as it may be aligned to
	 * page boundary for unwritten extents. We need to deal with this
	 * situation in particular.
	 */
	if (whence == SEEK_HOLE)
		offset = min_t(loff_t, offset, end);

	return offset;

out_error:
	return error;
}

STATIC loff_t
xfs_seek_hole_data(
	struct file		*file,
	loff_t			start,
	int			whence)
{
	struct inode		*inode = file->f_mapping->host;
	struct xfs_inode	*ip = XFS_I(inode);
	struct xfs_mount	*mp = ip->i_mount;
	uint			lock;
	loff_t			offset, end;
	int			error = 0;

	if (XFS_FORCED_SHUTDOWN(mp))
		return -EIO;

	lock = xfs_ilock_data_map_shared(ip);

	end = i_size_read(inode);
	offset = __xfs_seek_hole_data(inode, start, end, whence);
	if (offset < 0) {
		error = offset;
		goto out_unlock;
	}

	offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes);

out_unlock:
	xfs_iunlock(ip, lock);

	if (error)
		return error;
	return offset;
}

STATIC loff_t
xfs_file_llseek(
	struct file	*file,
	loff_t		offset,
	int		whence)
{
	switch (whence) {
	case SEEK_END:
	case SEEK_CUR:
	case SEEK_SET:
		return generic_file_llseek(file, offset, whence);
	case SEEK_HOLE:
	case SEEK_DATA:
		return xfs_seek_hole_data(file, offset, whence);
	default:
		return -EINVAL;
	}
}

/*
 * Locking for serialisation of IO during page faults. This results in a lock
 * ordering of:
 *
 * mmap_sem (MM)
 *   sb_start_pagefault(vfs, freeze)
 *     i_mmaplock (XFS - truncate serialisation)
 *       page_lock (MM)
 *         i_lock (XFS - extent map serialisation)
 */

/*
 * mmap()d file has taken write protection fault and is being made writable. We
 * can set the page state up correctly for a writable page, which means we can
 * do correct delalloc accounting (ENOSPC checking!) and unwritten extent
 * mapping.
 */
STATIC int
xfs_filemap_page_mkwrite(
	struct vm_area_struct	*vma,
	struct vm_fault		*vmf)
{
	struct inode		*inode = file_inode(vma->vm_file);
	int			ret;

	trace_xfs_filemap_page_mkwrite(XFS_I(inode));

	sb_start_pagefault(inode->i_sb);
	file_update_time(vma->vm_file);
	xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);

	if (IS_DAX(inode)) {
		ret = __dax_mkwrite(vma, vmf, xfs_get_blocks_dax_fault);
	} else {
		ret = block_page_mkwrite(vma, vmf, xfs_get_blocks);
		ret = block_page_mkwrite_return(ret);
	}

	xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
	sb_end_pagefault(inode->i_sb);

	return ret;
}

STATIC int
xfs_filemap_fault(
	struct vm_area_struct	*vma,
	struct vm_fault		*vmf)
{
	struct inode		*inode = file_inode(vma->vm_file);
	int			ret;

	trace_xfs_filemap_fault(XFS_I(inode));

	/* DAX can shortcut the normal fault path on write faults! */
	if ((vmf->flags & FAULT_FLAG_WRITE) && IS_DAX(inode))
		return xfs_filemap_page_mkwrite(vma, vmf);

	xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
	if (IS_DAX(inode)) {
		/*
		 * we do not want to trigger unwritten extent conversion on read
		 * faults - that is unnecessary overhead and would also require
		 * changes to xfs_get_blocks_direct() to map unwritten extent
		 * ioend for conversion on read-only mappings.
		 */
		ret = __dax_fault(vma, vmf, xfs_get_blocks_dax_fault);
	} else
		ret = filemap_fault(vma, vmf);
	xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);

	return ret;
}

/*
 * Similar to xfs_filemap_fault(), the DAX fault path can call into here on
 * both read and write faults. Hence we need to handle both cases. There is no
 * ->pmd_mkwrite callout for huge pages, so we have a single function here to
 * handle both cases. @flags carries the information on the type of fault
 * occurring.
 */
STATIC int
xfs_filemap_pmd_fault(
	struct vm_area_struct	*vma,
	unsigned long		addr,
	pmd_t			*pmd,
	unsigned int		flags)
{
	struct inode		*inode = file_inode(vma->vm_file);
	struct xfs_inode	*ip = XFS_I(inode);
	int			ret;

	if (!IS_DAX(inode))
		return VM_FAULT_FALLBACK;

	trace_xfs_filemap_pmd_fault(ip);

	if (flags & FAULT_FLAG_WRITE) {
		sb_start_pagefault(inode->i_sb);
		file_update_time(vma->vm_file);
	}

	xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
	ret = __dax_pmd_fault(vma, addr, pmd, flags, xfs_get_blocks_dax_fault);
	xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);

	if (flags & FAULT_FLAG_WRITE)
		sb_end_pagefault(inode->i_sb);

	return ret;
}

/*
 * pfn_mkwrite was originally intended to ensure we capture time stamp
 * updates on write faults. In reality, it needs to serialise against
 * truncate similar to page_mkwrite. Hence we cycle the XFS_MMAPLOCK_SHARED
 * to ensure we serialise the fault barrier in place.
 */
static int
xfs_filemap_pfn_mkwrite(
	struct vm_area_struct	*vma,
	struct vm_fault		*vmf)
{

	struct inode		*inode = file_inode(vma->vm_file);
	struct xfs_inode	*ip = XFS_I(inode);
	int			ret = VM_FAULT_NOPAGE;
	loff_t			size;

	trace_xfs_filemap_pfn_mkwrite(ip);

	sb_start_pagefault(inode->i_sb);
	file_update_time(vma->vm_file);

	/* check if the faulting page hasn't raced with truncate */
	xfs_ilock(ip, XFS_MMAPLOCK_SHARED);
	size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
	if (vmf->pgoff >= size)
		ret = VM_FAULT_SIGBUS;
	else if (IS_DAX(inode))
		ret = dax_pfn_mkwrite(vma, vmf);
	xfs_iunlock(ip, XFS_MMAPLOCK_SHARED);
	sb_end_pagefault(inode->i_sb);
	return ret;

}

static const struct vm_operations_struct xfs_file_vm_ops = {
	.fault		= xfs_filemap_fault,
	.pmd_fault	= xfs_filemap_pmd_fault,
	.map_pages	= filemap_map_pages,
	.page_mkwrite	= xfs_filemap_page_mkwrite,
	.pfn_mkwrite	= xfs_filemap_pfn_mkwrite,
};

STATIC int
xfs_file_mmap(
	struct file	*filp,
	struct vm_area_struct *vma)
{
	file_accessed(filp);
	vma->vm_ops = &xfs_file_vm_ops;
	if (IS_DAX(file_inode(filp)))
		vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE;
	return 0;
}

const struct file_operations xfs_file_operations = {
	.llseek		= xfs_file_llseek,
	.read_iter	= xfs_file_read_iter,
	.write_iter	= xfs_file_write_iter,
	.splice_read	= xfs_file_splice_read,
	.splice_write	= iter_file_splice_write,
	.unlocked_ioctl	= xfs_file_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl	= xfs_file_compat_ioctl,
#endif
	.mmap		= xfs_file_mmap,
	.open		= xfs_file_open,
	.release	= xfs_file_release,
	.fsync		= xfs_file_fsync,
	.fallocate	= xfs_file_fallocate,
};

const struct file_operations xfs_dir_file_operations = {
	.open		= xfs_dir_open,
	.read		= generic_read_dir,
	.iterate_shared	= xfs_file_readdir,
	.llseek		= generic_file_llseek,
	.unlocked_ioctl	= xfs_file_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl	= xfs_file_compat_ioctl,
#endif
	.fsync		= xfs_dir_fsync,
};