/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2011, Lawrence Livermore National Security, LLC.
 * Copyright (c) 2015 by Chunwei Chen. All rights reserved.
 */


#ifdef CONFIG_COMPAT
#include <linux/compat.h>
#endif
#include <sys/file.h>
#include <sys/dmu_objset.h>
#include <sys/zfs_znode.h>
#include <sys/zfs_vfsops.h>
#include <sys/zfs_vnops.h>
#include <sys/zfs_project.h>
#ifdef HAVE_VFS_SET_PAGE_DIRTY_NOBUFFERS
#include <linux/pagemap.h>
#endif

/*
 * When using fallocate(2) to preallocate space, inflate the requested
 * capacity check by 10% to account for the required metadata blocks.
 */
static unsigned int zfs_fallocate_reserve_percent = 110;

static int
zpl_open(struct inode *ip, struct file *filp)
{
	cred_t *cr = CRED();
	int error;
	fstrans_cookie_t cookie;

	error = generic_file_open(ip, filp);
	if (error)
		return (error);

	crhold(cr);
	cookie = spl_fstrans_mark();
	error = -zfs_open(ip, filp->f_mode, filp->f_flags, cr);
	spl_fstrans_unmark(cookie);
	crfree(cr);
	ASSERT3S(error, <=, 0);

	return (error);
}

static int
zpl_release(struct inode *ip, struct file *filp)
{
	cred_t *cr = CRED();
	int error;
	fstrans_cookie_t cookie;

	cookie = spl_fstrans_mark();
	if (ITOZ(ip)->z_atime_dirty)
		zfs_mark_inode_dirty(ip);

	crhold(cr);
	error = -zfs_close(ip, filp->f_flags, cr);
	spl_fstrans_unmark(cookie);
	crfree(cr);
	ASSERT3S(error, <=, 0);

	return (error);
}

static int
zpl_iterate(struct file *filp, zpl_dir_context_t *ctx)
{
	cred_t *cr = CRED();
	int error;
	fstrans_cookie_t cookie;

	crhold(cr);
	cookie = spl_fstrans_mark();
	error = -zfs_readdir(file_inode(filp), ctx, cr);
	spl_fstrans_unmark(cookie);
	crfree(cr);
	ASSERT3S(error, <=, 0);

	return (error);
}
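
/*
 * Legacy readdir() emulation below: on kernels without the newer
 * ->iterate()/->iterate_shared() hooks (->iterate() appeared around
 * Linux 3.11), the old filldir-based interface is adapted by wrapping
 * the callback state in a zpl_dir_context_t and reusing zpl_iterate().
 */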
#if !defined(HAVE_VFS_ITERATE) && !defined(HAVE_VFS_ITERATE_SHARED)
static int
zpl_readdir(struct file *filp, void *dirent, filldir_t filldir)
{
	zpl_dir_context_t ctx =
	    ZPL_DIR_CONTEXT_INIT(dirent, filldir, filp->f_pos);
	int error;

	error = zpl_iterate(filp, &ctx);
	filp->f_pos = ctx.pos;

	return (error);
}
#endif /* !HAVE_VFS_ITERATE && !HAVE_VFS_ITERATE_SHARED */

#if defined(HAVE_FSYNC_WITHOUT_DENTRY)
/*
 * Linux 2.6.35 - 3.0 API,
 * As of 2.6.35 the dentry argument to the fops->fsync() hook was deemed
 * redundant. The dentry is still accessible via filp->f_path.dentry,
 * and we are guaranteed that filp will never be NULL.
 */
static int
zpl_fsync(struct file *filp, int datasync)
{
	struct inode *inode = filp->f_mapping->host;
	cred_t *cr = CRED();
	int error;
	fstrans_cookie_t cookie;

	crhold(cr);
	cookie = spl_fstrans_mark();
	error = -zfs_fsync(ITOZ(inode), datasync, cr);
	spl_fstrans_unmark(cookie);
	crfree(cr);
	ASSERT3S(error, <=, 0);

	return (error);
}

#ifdef HAVE_FILE_AIO_FSYNC
static int
zpl_aio_fsync(struct kiocb *kiocb, int datasync)
{
	return (zpl_fsync(kiocb->ki_filp, datasync));
}
#endif

#elif defined(HAVE_FSYNC_RANGE)
/*
 * Linux 3.1 API,
 * As of 3.1 the responsibility to call filemap_write_and_wait_range() has
 * been pushed down into the .fsync() vfs hook. Additionally, the i_mutex
 * lock is no longer held by the caller. For zfs we don't require the lock
 * to be held, so we don't acquire it.
 */
static int
zpl_fsync(struct file *filp, loff_t start, loff_t end, int datasync)
{
	struct inode *inode = filp->f_mapping->host;
	cred_t *cr = CRED();
	int error;
	fstrans_cookie_t cookie;

	error = filemap_write_and_wait_range(inode->i_mapping, start, end);
	if (error)
		return (error);

	crhold(cr);
	cookie = spl_fstrans_mark();
	error = -zfs_fsync(ITOZ(inode), datasync, cr);
	spl_fstrans_unmark(cookie);
	crfree(cr);
	ASSERT3S(error, <=, 0);

	return (error);
}

#ifdef HAVE_FILE_AIO_FSYNC
static int
zpl_aio_fsync(struct kiocb *kiocb, int datasync)
{
	return (zpl_fsync(kiocb->ki_filp, kiocb->ki_pos, -1, datasync));
}
#endif

#else
#error "Unsupported fops->fsync() implementation"
#endif

static inline int
zfs_io_flags(struct kiocb *kiocb)
{
	int flags = 0;

#if defined(IOCB_DSYNC)
	if (kiocb->ki_flags & IOCB_DSYNC)
		flags |= O_DSYNC;
#endif
#if defined(IOCB_SYNC)
	if (kiocb->ki_flags & IOCB_SYNC)
		flags |= O_SYNC;
#endif
#if defined(IOCB_APPEND)
	if (kiocb->ki_flags & IOCB_APPEND)
		flags |= O_APPEND;
#endif
#if defined(IOCB_DIRECT)
	if (kiocb->ki_flags & IOCB_DIRECT)
		flags |= O_DIRECT;
#endif
	return (flags);
}
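
/*
 * zfs_io_flags() above translates per-I/O kiocb flags, set from the
 * open(2) flags or (on kernels that support them) per-call via the
 * preadv2(2)/pwritev2(2) RWF_* flags, into the POSIX O_* flags that
 * zfs_read() and zfs_write() understand.
 */
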
/*
 * If relatime is enabled, call file_accessed() only if
 * zfs_relatime_need_update() is true. This is needed since datasets
 * with an inherited "relatime" property aren't necessarily mounted with
 * the MNT_RELATIME flag (e.g. after `zfs set relatime=...`), which is
 * what the relatime test in the VFS, relatime_need_update(), is based on.
 */
static inline void
zpl_file_accessed(struct file *filp)
{
	struct inode *ip = filp->f_mapping->host;

	if (!IS_NOATIME(ip) && ITOZSB(ip)->z_relatime) {
		if (zfs_relatime_need_update(ip))
			file_accessed(filp);
	} else {
		file_accessed(filp);
	}
}

#if defined(HAVE_VFS_RW_ITERATE)

/*
 * When HAVE_VFS_IOV_ITER is defined the iov_iter structure supports
 * iovecs, kvecs, bvecs and pipes, plus all the required interfaces to
 * manipulate the iov_iter are available. In which case the full iov_iter
 * can be attached to the uio and correctly handled in the lower layers.
 * Otherwise, for older kernels extract the iovec and pass it instead.
 */
static void
zpl_uio_init(zfs_uio_t *uio, struct kiocb *kiocb, struct iov_iter *to,
    loff_t pos, ssize_t count, size_t skip)
{
#if defined(HAVE_VFS_IOV_ITER)
	zfs_uio_iov_iter_init(uio, to, pos, count, skip);
#else
#ifdef HAVE_IOV_ITER_TYPE
	zfs_uio_iovec_init(uio, to->iov, to->nr_segs, pos,
	    iov_iter_type(to) & ITER_KVEC ? UIO_SYSSPACE : UIO_USERSPACE,
	    count, skip);
#else
	zfs_uio_iovec_init(uio, to->iov, to->nr_segs, pos,
	    to->type & ITER_KVEC ? UIO_SYSSPACE : UIO_USERSPACE,
	    count, skip);
#endif
#endif
}

static ssize_t
zpl_iter_read(struct kiocb *kiocb, struct iov_iter *to)
{
	cred_t *cr = CRED();
	fstrans_cookie_t cookie;
	struct file *filp = kiocb->ki_filp;
	ssize_t count = iov_iter_count(to);
	zfs_uio_t uio;

	zpl_uio_init(&uio, kiocb, to, kiocb->ki_pos, count, 0);

	crhold(cr);
	cookie = spl_fstrans_mark();

	int error = -zfs_read(ITOZ(filp->f_mapping->host), &uio,
	    filp->f_flags | zfs_io_flags(kiocb), cr);

	spl_fstrans_unmark(cookie);
	crfree(cr);

	if (error < 0)
		return (error);

	ssize_t read = count - uio.uio_resid;
	kiocb->ki_pos += read;

	zpl_file_accessed(filp);

	return (read);
}

static inline ssize_t
zpl_generic_write_checks(struct kiocb *kiocb, struct iov_iter *from,
    size_t *countp)
{
#ifdef HAVE_GENERIC_WRITE_CHECKS_KIOCB
	ssize_t ret = generic_write_checks(kiocb, from);
	if (ret <= 0)
		return (ret);

	*countp = ret;
#else
	struct file *file = kiocb->ki_filp;
	struct address_space *mapping = file->f_mapping;
	struct inode *ip = mapping->host;
	int isblk = S_ISBLK(ip->i_mode);

	*countp = iov_iter_count(from);
	ssize_t ret = generic_write_checks(file, &kiocb->ki_pos, countp,
	    isblk);
	if (ret)
		return (ret);
#endif

	return (0);
}

static ssize_t
zpl_iter_write(struct kiocb *kiocb, struct iov_iter *from)
{
	cred_t *cr = CRED();
	fstrans_cookie_t cookie;
	struct file *filp = kiocb->ki_filp;
	struct inode *ip = filp->f_mapping->host;
	zfs_uio_t uio;
	size_t count = 0;
	ssize_t ret;

	ret = zpl_generic_write_checks(kiocb, from, &count);
	if (ret)
		return (ret);

	zpl_uio_init(&uio, kiocb, from, kiocb->ki_pos, count, from->iov_offset);

	crhold(cr);
	cookie = spl_fstrans_mark();

	int error = -zfs_write(ITOZ(ip), &uio,
	    filp->f_flags | zfs_io_flags(kiocb), cr);

	spl_fstrans_unmark(cookie);
	crfree(cr);

	if (error < 0)
		return (error);

	ssize_t wrote = count - uio.uio_resid;
	kiocb->ki_pos += wrote;

	return (wrote);
}

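/*
 * Legacy kiocb read/write paths below: used on kernels without the
 * ->read_iter()/->write_iter() API (introduced around Linux 3.16),
 * where the VFS still invokes the ->aio_read()/->aio_write() hooks.
 */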
#else /* !HAVE_VFS_RW_ITERATE */

static ssize_t
zpl_aio_read(struct kiocb *kiocb, const struct iovec *iov,
    unsigned long nr_segs, loff_t pos)
{
	cred_t *cr = CRED();
	fstrans_cookie_t cookie;
	struct file *filp = kiocb->ki_filp;
	size_t count;
	ssize_t ret;

	ret = generic_segment_checks(iov, &nr_segs, &count, VERIFY_WRITE);
	if (ret)
		return (ret);

	zfs_uio_t uio;
	zfs_uio_iovec_init(&uio, iov, nr_segs, kiocb->ki_pos, UIO_USERSPACE,
	    count, 0);

	crhold(cr);
	cookie = spl_fstrans_mark();

	int error = -zfs_read(ITOZ(filp->f_mapping->host), &uio,
	    filp->f_flags | zfs_io_flags(kiocb), cr);

	spl_fstrans_unmark(cookie);
	crfree(cr);

	if (error < 0)
		return (error);

	ssize_t read = count - uio.uio_resid;
	kiocb->ki_pos += read;

	zpl_file_accessed(filp);

	return (read);
}

static ssize_t
zpl_aio_write(struct kiocb *kiocb, const struct iovec *iov,
    unsigned long nr_segs, loff_t pos)
{
	cred_t *cr = CRED();
	fstrans_cookie_t cookie;
	struct file *filp = kiocb->ki_filp;
	struct inode *ip = filp->f_mapping->host;
	size_t count;
	ssize_t ret;

	ret = generic_segment_checks(iov, &nr_segs, &count, VERIFY_READ);
	if (ret)
		return (ret);

	ret = generic_write_checks(filp, &pos, &count, S_ISBLK(ip->i_mode));
	if (ret)
		return (ret);

	zfs_uio_t uio;
	zfs_uio_iovec_init(&uio, iov, nr_segs, kiocb->ki_pos, UIO_USERSPACE,
	    count, 0);

	crhold(cr);
	cookie = spl_fstrans_mark();

	int error = -zfs_write(ITOZ(ip), &uio,
	    filp->f_flags | zfs_io_flags(kiocb), cr);

	spl_fstrans_unmark(cookie);
	crfree(cr);

	if (error < 0)
		return (error);

	ssize_t wrote = count - uio.uio_resid;
	kiocb->ki_pos += wrote;

	return (wrote);
}
#endif /* HAVE_VFS_RW_ITERATE */

#if defined(HAVE_VFS_RW_ITERATE)
static ssize_t
zpl_direct_IO_impl(int rw, struct kiocb *kiocb, struct iov_iter *iter)
{
	if (rw == WRITE)
		return (zpl_iter_write(kiocb, iter));
	else
		return (zpl_iter_read(kiocb, iter));
}
#if defined(HAVE_VFS_DIRECT_IO_ITER)
static ssize_t
zpl_direct_IO(struct kiocb *kiocb, struct iov_iter *iter)
{
	return (zpl_direct_IO_impl(iov_iter_rw(iter), kiocb, iter));
}
#elif defined(HAVE_VFS_DIRECT_IO_ITER_OFFSET)
static ssize_t
zpl_direct_IO(struct kiocb *kiocb, struct iov_iter *iter, loff_t pos)
{
	ASSERT3S(pos, ==, kiocb->ki_pos);
	return (zpl_direct_IO_impl(iov_iter_rw(iter), kiocb, iter));
}
#elif defined(HAVE_VFS_DIRECT_IO_ITER_RW_OFFSET)
static ssize_t
zpl_direct_IO(int rw, struct kiocb *kiocb, struct iov_iter *iter, loff_t pos)
{
	ASSERT3S(pos, ==, kiocb->ki_pos);
	return (zpl_direct_IO_impl(rw, kiocb, iter));
}
#else
#error "Unknown direct IO interface"
#endif

#else /* HAVE_VFS_RW_ITERATE */

#if defined(HAVE_VFS_DIRECT_IO_IOVEC)
static ssize_t
zpl_direct_IO(int rw, struct kiocb *kiocb, const struct iovec *iov,
    loff_t pos, unsigned long nr_segs)
{
	if (rw == WRITE)
		return (zpl_aio_write(kiocb, iov, nr_segs, pos));
	else
		return (zpl_aio_read(kiocb, iov, nr_segs, pos));
}
#elif defined(HAVE_VFS_DIRECT_IO_ITER_RW_OFFSET)
static ssize_t
zpl_direct_IO(int rw, struct kiocb *kiocb, struct iov_iter *iter, loff_t pos)
{
	const struct iovec *iovp = iov_iter_iovec(iter);
	unsigned long nr_segs = iter->nr_segs;

	ASSERT3S(pos, ==, kiocb->ki_pos);
	if (rw == WRITE)
		return (zpl_aio_write(kiocb, iovp, nr_segs, pos));
	else
		return (zpl_aio_read(kiocb, iovp, nr_segs, pos));
}
#else
#error "Unknown direct IO interface"
#endif

#endif /* HAVE_VFS_RW_ITERATE */

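/*
 * Hypothetical usage from user space: lseek(fd, 0, SEEK_DATA) returns
 * the start of the first data region at or after offset 0, and
 * lseek(fd, off, SEEK_HOLE) the start of the next hole; both fail with
 * errno ENXIO when the requested offset is at or beyond end of file.
 */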
static loff_t
zpl_llseek(struct file *filp, loff_t offset, int whence)
{
#if defined(SEEK_HOLE) && defined(SEEK_DATA)
	fstrans_cookie_t cookie;

	if (whence == SEEK_DATA || whence == SEEK_HOLE) {
		struct inode *ip = filp->f_mapping->host;
		loff_t maxbytes = ip->i_sb->s_maxbytes;
		loff_t error;

		spl_inode_lock_shared(ip);
		cookie = spl_fstrans_mark();
		error = -zfs_holey(ITOZ(ip), whence, &offset);
		spl_fstrans_unmark(cookie);
		if (error == 0)
			error = lseek_execute(filp, ip, offset, maxbytes);
		spl_inode_unlock_shared(ip);

		return (error);
	}
#endif /* SEEK_HOLE && SEEK_DATA */

	return (generic_file_llseek(filp, offset, whence));
}

/*
 * It's worth taking a moment to describe how mmap is implemented
 * for zfs because it differs considerably from other Linux filesystems.
 * However, this issue is handled the same way under OpenSolaris.
 *
 * The issue is that by design zfs bypasses the Linux page cache and
 * leaves all caching up to the ARC. This has been shown to work
 * well for the common read(2)/write(2) case. However, mmap(2)
 * is a problem because it relies on being tightly integrated with the
 * page cache. To handle this we cache mmap'ed files twice, once in
 * the ARC and a second time in the page cache. The code is careful
 * to keep both copies synchronized.
 *
 * When a file with an mmap'ed region is written to using write(2)
 * both the data in the ARC and existing pages in the page cache
 * are updated. For a read(2) data will be read first from the page
 * cache then the ARC if needed. Neither a write(2) nor a read(2)
 * will ever result in new pages being added to the page cache.
 *
 * New pages are added to the page cache only via .readpage() which
 * is called when the vfs needs to read a page off disk to back the
 * virtual memory region. These pages may be modified without
 * notifying the ARC and will be written out periodically via
 * .writepage(). This will occur due to either a sync or the usual
 * page aging behavior. Note that because a read(2) of a mmap'ed file
 * will always check the page cache first, correct data will still be
 * returned even when the ARC is out of date.
 *
 * While this implementation ensures correct behavior it does have
 * some drawbacks. The most obvious of which is that it increases
 * the required memory footprint when accessing mmap'ed files. It
 * also adds additional complexity to the code, keeping both caches
 * synchronized.
 *
 * Longer term it may be possible to cleanly resolve this wart by
 * mapping page cache pages directly on to the ARC buffers. The
 * Linux address space operations are flexible enough to allow
 * selection of which pages back a particular index. The trick
 * would be working out the details of which subsystem is in
 * charge, the ARC, the page cache, or both. It may also prove
 * helpful to move the ARC buffers to scatter-gather lists
 * rather than a vmalloc'ed region.
 */
static int
zpl_mmap(struct file *filp, struct vm_area_struct *vma)
{
	struct inode *ip = filp->f_mapping->host;
	znode_t *zp = ITOZ(ip);
	int error;
	fstrans_cookie_t cookie;

	cookie = spl_fstrans_mark();
	error = -zfs_map(ip, vma->vm_pgoff, (caddr_t *)vma->vm_start,
	    (size_t)(vma->vm_end - vma->vm_start), vma->vm_flags);
	spl_fstrans_unmark(cookie);
	if (error)
		return (error);

	error = generic_file_mmap(filp, vma);
	if (error)
		return (error);

	mutex_enter(&zp->z_lock);
	zp->z_is_mapped = B_TRUE;
	mutex_exit(&zp->z_lock);

	return (error);
}

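/*
 * The z_is_mapped flag set above tells the regular read/write paths
 * that a mapping exists, so they keep the page cache copy coherent
 * with the ARC (see the mappedread()/update_pages() helpers in the
 * zfs_vnops code).
 */
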
/*
 * Populate a page with data for the Linux page cache. This function is
 * only used to support mmap(2). There will be an identical copy of the
 * data in the ARC which is kept up to date via .write() and .writepage().
 */
static inline int
zpl_readpage_common(struct page *pp)
{
	struct inode *ip;
	struct page *pl[1];
	int error = 0;
	fstrans_cookie_t cookie;

	ASSERT(PageLocked(pp));
	ip = pp->mapping->host;
	pl[0] = pp;

	cookie = spl_fstrans_mark();
	error = -zfs_getpage(ip, pl, 1);
	spl_fstrans_unmark(cookie);

	if (error) {
		SetPageError(pp);
		ClearPageUptodate(pp);
	} else {
		ClearPageError(pp);
		SetPageUptodate(pp);
		flush_dcache_page(pp);
	}

	unlock_page(pp);
	return (error);
}

static int
zpl_readpage(struct file *filp, struct page *pp)
{
	return (zpl_readpage_common(pp));
}

static int
zpl_readpage_filler(void *data, struct page *pp)
{
	return (zpl_readpage_common(pp));
}

/*
 * Populate a set of pages with data for the Linux page cache. This
 * function will only be called for read ahead and never for demand
 * paging. For simplicity, the code relies on read_cache_pages() to
 * correctly lock each page for IO and call zpl_readpage().
 */
static int
zpl_readpages(struct file *filp, struct address_space *mapping,
    struct list_head *pages, unsigned nr_pages)
{
	return (read_cache_pages(mapping, pages, zpl_readpage_filler, NULL));
}

static int
zpl_putpage(struct page *pp, struct writeback_control *wbc, void *data)
{
	struct address_space *mapping = data;
	fstrans_cookie_t cookie;

	ASSERT(PageLocked(pp));
	ASSERT(!PageWriteback(pp));

	cookie = spl_fstrans_mark();
	(void) zfs_putpage(mapping->host, pp, wbc);
	spl_fstrans_unmark(cookie);

	return (0);
}

static int
zpl_writepages(struct address_space *mapping, struct writeback_control *wbc)
{
	znode_t *zp = ITOZ(mapping->host);
	zfsvfs_t *zfsvfs = ITOZSB(mapping->host);
	enum writeback_sync_modes sync_mode;
	int result;

	ZPL_ENTER(zfsvfs);
	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
		wbc->sync_mode = WB_SYNC_ALL;
	ZPL_EXIT(zfsvfs);
	sync_mode = wbc->sync_mode;

	/*
	 * We don't want to run write_cache_pages() in SYNC mode here, because
	 * that would make putpage() wait for a single page to be committed to
	 * disk every single time, resulting in atrocious performance. Instead
	 * we run it once in non-SYNC mode so that the ZIL gets all the data,
	 * and then we commit it all in one go.
	 */
	wbc->sync_mode = WB_SYNC_NONE;
	result = write_cache_pages(mapping, wbc, zpl_putpage, mapping);
	if (sync_mode != wbc->sync_mode) {
		ZPL_ENTER(zfsvfs);
		ZPL_VERIFY_ZP(zp);
		if (zfsvfs->z_log != NULL)
			zil_commit(zfsvfs->z_log, zp->z_id);
		ZPL_EXIT(zfsvfs);

		/*
		 * We need to call write_cache_pages() again (we can't just
		 * return after the commit) because the previous call in
		 * non-SYNC mode does not guarantee that we got all the dirty
		 * pages (see the implementation of write_cache_pages() for
		 * details). That being said, this is a no-op in most cases.
		 */
		wbc->sync_mode = sync_mode;
		result = write_cache_pages(mapping, wbc, zpl_putpage, mapping);
	}
	return (result);
}

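/*
 * For example, an msync(MS_SYNC) or fsync(2) over a dirty mapping
 * reaches zpl_writepages() with wbc->sync_mode == WB_SYNC_ALL; the
 * dirty pages are pushed to the ZIL in a single WB_SYNC_NONE pass and
 * then made durable with one zil_commit() call rather than one commit
 * per page.
 */
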
/*
 * Write out dirty pages to the ARC. This function is only required to
 * support mmap(2). Mapped pages may be dirtied by memory operations
 * which never call .write(). These dirty pages are kept in sync with
 * the ARC buffers via this hook.
 */
static int
zpl_writepage(struct page *pp, struct writeback_control *wbc)
{
	if (ITOZSB(pp->mapping->host)->z_os->os_sync == ZFS_SYNC_ALWAYS)
		wbc->sync_mode = WB_SYNC_ALL;

	return (zpl_putpage(pp, wbc, pp->mapping));
}

/*
 * The flag combination which matches the behavior of zfs_space() is
 * FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE. The FALLOC_FL_PUNCH_HOLE
 * flag was introduced in the 2.6.38 kernel.
 *
 * The original mode=0 (allocate space) behavior can be reasonably emulated
 * by checking if enough space exists and creating a sparse file, as real
 * persistent space reservation is not possible due to COW, snapshots, etc.
 */
static long
zpl_fallocate_common(struct inode *ip, int mode, loff_t offset, loff_t len)
{
	cred_t *cr = CRED();
	loff_t olen;
	fstrans_cookie_t cookie;
	int error = 0;

	if ((mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) != 0)
		return (-EOPNOTSUPP);

	if (offset < 0 || len <= 0)
		return (-EINVAL);

	spl_inode_lock(ip);
	olen = i_size_read(ip);

	crhold(cr);
	cookie = spl_fstrans_mark();
	if (mode & FALLOC_FL_PUNCH_HOLE) {
		flock64_t bf;

		if (offset > olen)
			goto out_unmark;

		if (offset + len > olen)
			len = olen - offset;
		bf.l_type = F_WRLCK;
		bf.l_whence = SEEK_SET;
		bf.l_start = offset;
		bf.l_len = len;
		bf.l_pid = 0;

		error = -zfs_space(ITOZ(ip), F_FREESP, &bf, O_RDWR, offset, cr);
	} else if ((mode & ~FALLOC_FL_KEEP_SIZE) == 0) {
		unsigned int percent = zfs_fallocate_reserve_percent;
		struct kstatfs statfs;

		/* Legacy mode, disable fallocate compatibility. */
		if (percent == 0) {
			error = -EOPNOTSUPP;
			goto out_unmark;
		}

		/*
		 * Use zfs_statvfs() instead of dmu_objset_space() since it
		 * also checks project quota limits, which are relevant here.
		 */
		error = zfs_statvfs(ip, &statfs);
		if (error)
			goto out_unmark;

		/*
		 * Shrink available space a bit to account for overhead/races.
		 * We know the product previously fit into availbytes from
		 * dmu_objset_space(), so the smaller product will also fit.
		 */
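		/*
		 * Worked example: with the default percent = 110, the
		 * effective per-block capacity below becomes
		 * f_bsize * 100 / 110, i.e. roughly 91% of f_bsize, so a
		 * request only succeeds when about len * 1.1 bytes of
		 * capacity are available.
		 */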
		if (len > statfs.f_bavail * (statfs.f_bsize * 100 / percent)) {
			error = -ENOSPC;
			goto out_unmark;
		}
		if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > olen)
			error = zfs_freesp(ITOZ(ip), offset + len, 0, 0, FALSE);
	}
out_unmark:
	spl_fstrans_unmark(cookie);
	spl_inode_unlock(ip);

	crfree(cr);

	return (error);
}

static long
zpl_fallocate(struct file *filp, int mode, loff_t offset, loff_t len)
{
	return (zpl_fallocate_common(file_inode(filp),
	    mode, offset, len));
}

static int
zpl_ioctl_getversion(struct file *filp, void __user *arg)
{
	uint32_t generation = file_inode(filp)->i_generation;

	return (copy_to_user(arg, &generation, sizeof (generation)));
}

#define	ZFS_FL_USER_VISIBLE	(FS_FL_USER_VISIBLE | ZFS_PROJINHERIT_FL)
#define	ZFS_FL_USER_MODIFIABLE	(FS_FL_USER_MODIFIABLE | ZFS_PROJINHERIT_FL)

static uint32_t
__zpl_ioctl_getflags(struct inode *ip)
{
	uint64_t zfs_flags = ITOZ(ip)->z_pflags;
	uint32_t ioctl_flags = 0;

	if (zfs_flags & ZFS_IMMUTABLE)
		ioctl_flags |= FS_IMMUTABLE_FL;

	if (zfs_flags & ZFS_APPENDONLY)
		ioctl_flags |= FS_APPEND_FL;

	if (zfs_flags & ZFS_NODUMP)
		ioctl_flags |= FS_NODUMP_FL;

	if (zfs_flags & ZFS_PROJINHERIT)
		ioctl_flags |= ZFS_PROJINHERIT_FL;

	return (ioctl_flags & ZFS_FL_USER_VISIBLE);
}

/*
 * Map zfs file z_pflags (xvattr_t) to linux file attributes. Only file
 * attributes common to both Linux and Solaris are mapped.
 */
static int
zpl_ioctl_getflags(struct file *filp, void __user *arg)
{
	uint32_t flags;
	int err;

	flags = __zpl_ioctl_getflags(file_inode(filp));
	err = copy_to_user(arg, &flags, sizeof (flags));

	return (err);
}

/*
 * fchange() is a helper macro to detect if we have been asked to change a
 * flag. This is ugly, but the requirement that we do this is a consequence
 * of how the Linux file attribute interface was designed. Another
 * consequence is that concurrent modification of files suffers from a TOCTOU
 * race. Neither are things we can fix without modifying the kernel-userland
 * interface, which is outside of our jurisdiction.
 */

#define	fchange(f0, f1, b0, b1) (!((f0) & (b0)) != !((f1) & (b1)))
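
/*
 * For example, fchange(req_flags, zfs_flags, FS_IMMUTABLE_FL,
 * ZFS_IMMUTABLE) is non-zero only when exactly one side has its
 * immutable bit set, i.e. when the caller is actually toggling the
 * flag rather than leaving it unchanged.
 */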

static int
__zpl_ioctl_setflags(struct inode *ip, uint32_t ioctl_flags, xvattr_t *xva)
{
	uint64_t zfs_flags = ITOZ(ip)->z_pflags;
	xoptattr_t *xoap;

	if (ioctl_flags & ~(FS_IMMUTABLE_FL | FS_APPEND_FL | FS_NODUMP_FL |
	    ZFS_PROJINHERIT_FL))
		return (-EOPNOTSUPP);

	if (ioctl_flags & ~ZFS_FL_USER_MODIFIABLE)
		return (-EACCES);

	if ((fchange(ioctl_flags, zfs_flags, FS_IMMUTABLE_FL, ZFS_IMMUTABLE) ||
	    fchange(ioctl_flags, zfs_flags, FS_APPEND_FL, ZFS_APPENDONLY)) &&
	    !capable(CAP_LINUX_IMMUTABLE))
		return (-EPERM);

	if (!zpl_inode_owner_or_capable(kcred->user_ns, ip))
		return (-EACCES);

	xva_init(xva);
	xoap = xva_getxoptattr(xva);

	XVA_SET_REQ(xva, XAT_IMMUTABLE);
	if (ioctl_flags & FS_IMMUTABLE_FL)
		xoap->xoa_immutable = B_TRUE;

	XVA_SET_REQ(xva, XAT_APPENDONLY);
	if (ioctl_flags & FS_APPEND_FL)
		xoap->xoa_appendonly = B_TRUE;

	XVA_SET_REQ(xva, XAT_NODUMP);
	if (ioctl_flags & FS_NODUMP_FL)
		xoap->xoa_nodump = B_TRUE;

	XVA_SET_REQ(xva, XAT_PROJINHERIT);
	if (ioctl_flags & ZFS_PROJINHERIT_FL)
		xoap->xoa_projinherit = B_TRUE;

	return (0);
}

static int
zpl_ioctl_setflags(struct file *filp, void __user *arg)
{
	struct inode *ip = file_inode(filp);
	uint32_t flags;
	cred_t *cr = CRED();
	xvattr_t xva;
	int err;
	fstrans_cookie_t cookie;

	if (copy_from_user(&flags, arg, sizeof (flags)))
		return (-EFAULT);

	err = __zpl_ioctl_setflags(ip, flags, &xva);
	if (err)
		return (err);

	crhold(cr);
	cookie = spl_fstrans_mark();
	err = -zfs_setattr(ITOZ(ip), (vattr_t *)&xva, 0, cr);
	spl_fstrans_unmark(cookie);
	crfree(cr);

	return (err);
}

static int
zpl_ioctl_getxattr(struct file *filp, void __user *arg)
{
	zfsxattr_t fsx = { 0 };
	struct inode *ip = file_inode(filp);
	int err;

	fsx.fsx_xflags = __zpl_ioctl_getflags(ip);
	fsx.fsx_projid = ITOZ(ip)->z_projid;
	err = copy_to_user(arg, &fsx, sizeof (fsx));

	return (err);
}

static int
zpl_ioctl_setxattr(struct file *filp, void __user *arg)
{
	struct inode *ip = file_inode(filp);
	zfsxattr_t fsx;
	cred_t *cr = CRED();
	xvattr_t xva;
	xoptattr_t *xoap;
	int err;
	fstrans_cookie_t cookie;

	if (copy_from_user(&fsx, arg, sizeof (fsx)))
		return (-EFAULT);

	if (!zpl_is_valid_projid(fsx.fsx_projid))
		return (-EINVAL);

	err = __zpl_ioctl_setflags(ip, fsx.fsx_xflags, &xva);
	if (err)
		return (err);

	xoap = xva_getxoptattr(&xva);
	XVA_SET_REQ(&xva, XAT_PROJID);
	xoap->xoa_projid = fsx.fsx_projid;

	crhold(cr);
	cookie = spl_fstrans_mark();
	err = -zfs_setattr(ITOZ(ip), (vattr_t *)&xva, 0, cr);
	spl_fstrans_unmark(cookie);
	crfree(cr);

	return (err);
}

static long
zpl_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
	switch (cmd) {
	case FS_IOC_GETVERSION:
		return (zpl_ioctl_getversion(filp, (void *)arg));
	case FS_IOC_GETFLAGS:
		return (zpl_ioctl_getflags(filp, (void *)arg));
	case FS_IOC_SETFLAGS:
		return (zpl_ioctl_setflags(filp, (void *)arg));
	case ZFS_IOC_FSGETXATTR:
		return (zpl_ioctl_getxattr(filp, (void *)arg));
	case ZFS_IOC_FSSETXATTR:
		return (zpl_ioctl_setxattr(filp, (void *)arg));
	default:
		return (-ENOTTY);
	}
}

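/*
 * From user space these commands are issued through the standard
 * ioctl(2) interface, for example (hypothetical snippet):
 *
 *	int flags;
 *	ioctl(fd, FS_IOC_GETFLAGS, &flags);
 *	flags |= FS_APPEND_FL;
 *	ioctl(fd, FS_IOC_SETFLAGS, &flags);
 */
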
#ifdef CONFIG_COMPAT
static long
zpl_compat_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
	switch (cmd) {
	case FS_IOC32_GETVERSION:
		cmd = FS_IOC_GETVERSION;
		break;
	case FS_IOC32_GETFLAGS:
		cmd = FS_IOC_GETFLAGS;
		break;
	case FS_IOC32_SETFLAGS:
		cmd = FS_IOC_SETFLAGS;
		break;
	default:
		return (-ENOTTY);
	}
	return (zpl_ioctl(filp, cmd, (unsigned long)compat_ptr(arg)));
}
#endif /* CONFIG_COMPAT */


const struct address_space_operations zpl_address_space_operations = {
	.readpages	= zpl_readpages,
	.readpage	= zpl_readpage,
	.writepage	= zpl_writepage,
	.writepages	= zpl_writepages,
	.direct_IO	= zpl_direct_IO,
#ifdef HAVE_VFS_SET_PAGE_DIRTY_NOBUFFERS
	.set_page_dirty	= __set_page_dirty_nobuffers,
#endif
};

const struct file_operations zpl_file_operations = {
	.open		= zpl_open,
	.release	= zpl_release,
	.llseek		= zpl_llseek,
#ifdef HAVE_VFS_RW_ITERATE
#ifdef HAVE_NEW_SYNC_READ
	.read		= new_sync_read,
	.write		= new_sync_write,
#endif
	.read_iter	= zpl_iter_read,
	.write_iter	= zpl_iter_write,
#ifdef HAVE_VFS_IOV_ITER
	.splice_read	= generic_file_splice_read,
	.splice_write	= iter_file_splice_write,
#endif
#else
	.read		= do_sync_read,
	.write		= do_sync_write,
	.aio_read	= zpl_aio_read,
	.aio_write	= zpl_aio_write,
#endif
	.mmap		= zpl_mmap,
	.fsync		= zpl_fsync,
#ifdef HAVE_FILE_AIO_FSYNC
	.aio_fsync	= zpl_aio_fsync,
#endif
	.fallocate	= zpl_fallocate,
	.unlocked_ioctl	= zpl_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl	= zpl_compat_ioctl,
#endif
};

const struct file_operations zpl_dir_file_operations = {
	.llseek		= generic_file_llseek,
	.read		= generic_read_dir,
#if defined(HAVE_VFS_ITERATE_SHARED)
	.iterate_shared	= zpl_iterate,
#elif defined(HAVE_VFS_ITERATE)
	.iterate	= zpl_iterate,
#else
	.readdir	= zpl_readdir,
#endif
	.fsync		= zpl_fsync,
	.unlocked_ioctl	= zpl_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl	= zpl_compat_ioctl,
#endif
};

/* BEGIN CSTYLED */
module_param(zfs_fallocate_reserve_percent, uint, 0644);
MODULE_PARM_DESC(zfs_fallocate_reserve_percent,
	"Percentage of length to use for the available capacity check");
/* END CSTYLED */
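
/*
 * The parameter above is runtime tunable; for example (assuming the
 * module is loaded as "zfs"):
 *
 *	echo 100 > /sys/module/zfs/parameters/zfs_fallocate_reserve_percent
 *
 * Setting it to 0 disables the mode=0 fallocate(2) emulation entirely
 * and restores the legacy EOPNOTSUPP behavior.
 */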