1 // SPDX-License-Identifier: CDDL-1.0 2 /* 3 * CDDL HEADER START 4 * 5 * The contents of this file are subject to the terms of the 6 * Common Development and Distribution License (the "License"). 7 * You may not use this file except in compliance with the License. 8 * 9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 10 * or https://opensource.org/licenses/CDDL-1.0. 11 * See the License for the specific language governing permissions 12 * and limitations under the License. 13 * 14 * When distributing Covered Code, include this CDDL HEADER in each 15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 16 * If applicable, add the following below this CDDL HEADER, with the 17 * fields enclosed by brackets "[]" replaced with your own identifying 18 * information: Portions Copyright [yyyy] [name of copyright owner] 19 * 20 * CDDL HEADER END 21 */ 22 23 /* 24 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 25 * Copyright (c) 2012, 2018 by Delphix. All rights reserved. 26 * Copyright (c) 2015 by Chunwei Chen. All rights reserved. 27 * Copyright 2017 Nexenta Systems, Inc. 28 * Copyright (c) 2021, 2022 by Pawel Jakub Dawidek 29 * Copyright (c) 2025, Rob Norris <robn@despairlabs.com> 30 */ 31 32 /* Portions Copyright 2007 Jeremy Teo */ 33 /* Portions Copyright 2010 Robert Milkowski */ 34 35 #include <sys/types.h> 36 #include <sys/param.h> 37 #include <sys/time.h> 38 #include <sys/sysmacros.h> 39 #include <sys/vfs.h> 40 #include <sys/file.h> 41 #include <sys/stat.h> 42 #include <sys/kmem.h> 43 #include <sys/cmn_err.h> 44 #include <sys/errno.h> 45 #include <sys/zfs_dir.h> 46 #include <sys/zfs_acl.h> 47 #include <sys/zfs_ioctl.h> 48 #include <sys/fs/zfs.h> 49 #include <sys/dmu.h> 50 #include <sys/dmu_objset.h> 51 #include <sys/dsl_crypt.h> 52 #include <sys/spa.h> 53 #include <sys/txg.h> 54 #include <sys/dbuf.h> 55 #include <sys/policy.h> 56 #include <sys/zfeature.h> 57 #include <sys/zfs_vnops.h> 58 #include <sys/zfs_quota.h> 59 #include <sys/zfs_vfsops.h> 60 #include <sys/zfs_znode.h> 61 62 /* 63 * Enables access to the block cloning feature. If this setting is 0, then even 64 * if feature@block_cloning is enabled, using functions and system calls that 65 * attempt to clone blocks will act as though the feature is disabled. 66 */ 67 int zfs_bclone_enabled = 1; 68 69 /* 70 * When set zfs_clone_range() waits for dirty data to be written to disk. 71 * This allows the clone operation to reliably succeed when a file is modified 72 * and then immediately cloned. For small files this may be slower than making 73 * a copy of the file and is therefore not the default. However, in certain 74 * scenarios this behavior may be desirable so a tunable is provided. 75 */ 76 int zfs_bclone_wait_dirty = 0; 77 78 /* 79 * Enable Direct I/O. If this setting is 0, then all I/O requests will be 80 * directed through the ARC acting as though the dataset property direct was 81 * set to disabled. 82 * 83 * Disabled by default on FreeBSD until a potential range locking issue in 84 * zfs_getpages() can be resolved. 85 */ 86 #ifdef __FreeBSD__ 87 static int zfs_dio_enabled = 0; 88 #else 89 static int zfs_dio_enabled = 1; 90 #endif 91 92 /* 93 * Strictly enforce alignment for Direct I/O requests, returning EINVAL 94 * if not page-aligned instead of silently falling back to uncached I/O. 95 */ 96 static int zfs_dio_strict = 0; 97 98 99 /* 100 * Maximum bytes to read per chunk in zfs_read(). 
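 * The default is 1 MiB.  This limit is exported read-write as the
 * zfs_vnops_read_chunk_size module parameter (see the ZFS_MODULE_PARAM
 * declarations at the end of this file).  zfs_read() ignores it for
 * Direct I/O requests and uses DMU_MAX_ACCESS instead, since all of the
 * request's pages have already been mapped.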
101 */ 102 static uint64_t zfs_vnops_read_chunk_size = 1024 * 1024; 103 104 int 105 zfs_fsync(znode_t *zp, int syncflag, cred_t *cr) 106 { 107 int error = 0; 108 zfsvfs_t *zfsvfs = ZTOZSB(zp); 109 110 if (zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED) { 111 if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) 112 return (error); 113 atomic_inc_32(&zp->z_sync_writes_cnt); 114 zil_commit(zfsvfs->z_log, zp->z_id); 115 atomic_dec_32(&zp->z_sync_writes_cnt); 116 zfs_exit(zfsvfs, FTAG); 117 } 118 return (error); 119 } 120 121 122 #if defined(SEEK_HOLE) && defined(SEEK_DATA) 123 /* 124 * Lseek support for finding holes (cmd == SEEK_HOLE) and 125 * data (cmd == SEEK_DATA). "off" is an in/out parameter. 126 */ 127 static int 128 zfs_holey_common(znode_t *zp, ulong_t cmd, loff_t *off) 129 { 130 zfs_locked_range_t *lr; 131 uint64_t noff = (uint64_t)*off; /* new offset */ 132 uint64_t file_sz; 133 int error; 134 boolean_t hole; 135 136 file_sz = zp->z_size; 137 if (noff >= file_sz) { 138 return (SET_ERROR(ENXIO)); 139 } 140 141 if (cmd == F_SEEK_HOLE) 142 hole = B_TRUE; 143 else 144 hole = B_FALSE; 145 146 /* Flush any mmap()'d data to disk */ 147 if (zn_has_cached_data(zp, 0, file_sz - 1)) 148 zn_flush_cached_data(zp, B_TRUE); 149 150 lr = zfs_rangelock_enter(&zp->z_rangelock, 0, UINT64_MAX, RL_READER); 151 error = dmu_offset_next(ZTOZSB(zp)->z_os, zp->z_id, hole, &noff); 152 zfs_rangelock_exit(lr); 153 154 if (error == ESRCH) 155 return (SET_ERROR(ENXIO)); 156 157 /* File was dirty, so fall back to using generic logic */ 158 if (error == EBUSY) { 159 if (hole) 160 *off = file_sz; 161 162 return (0); 163 } 164 165 /* 166 * We could find a hole that begins after the logical end-of-file, 167 * because dmu_offset_next() only works on whole blocks. If the 168 * EOF falls mid-block, then indicate that the "virtual hole" 169 * at the end of the file begins at the logical EOF, rather than 170 * at the end of the last block. 171 */ 172 if (noff > file_sz) { 173 ASSERT(hole); 174 noff = file_sz; 175 } 176 177 if (noff < *off) 178 return (error); 179 *off = noff; 180 return (error); 181 } 182 183 int 184 zfs_holey(znode_t *zp, ulong_t cmd, loff_t *off) 185 { 186 zfsvfs_t *zfsvfs = ZTOZSB(zp); 187 int error; 188 189 if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) 190 return (error); 191 192 error = zfs_holey_common(zp, cmd, off); 193 194 zfs_exit(zfsvfs, FTAG); 195 return (error); 196 } 197 #endif /* SEEK_HOLE && SEEK_DATA */ 198 199 int 200 zfs_access(znode_t *zp, int mode, int flag, cred_t *cr) 201 { 202 zfsvfs_t *zfsvfs = ZTOZSB(zp); 203 int error; 204 205 if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) 206 return (error); 207 208 if (flag & V_ACE_MASK) 209 #if defined(__linux__) 210 error = zfs_zaccess(zp, mode, flag, B_FALSE, cr, 211 zfs_init_idmap); 212 #else 213 error = zfs_zaccess(zp, mode, flag, B_FALSE, cr, 214 NULL); 215 #endif 216 else 217 #if defined(__linux__) 218 error = zfs_zaccess_rwx(zp, mode, flag, cr, zfs_init_idmap); 219 #else 220 error = zfs_zaccess_rwx(zp, mode, flag, cr, NULL); 221 #endif 222 223 zfs_exit(zfsvfs, FTAG); 224 return (error); 225 } 226 227 /* 228 * Determine if Direct I/O has been requested (either via the O_DIRECT flag or 229 * the "direct" dataset property). When inherited by the property only apply 230 * the O_DIRECT flag to correctly aligned IO requests. The rational for this 231 * is it allows the property to be safely set on a dataset without forcing 232 * all of the applications to be aware of the alignment restrictions. 
When 233 * O_DIRECT is explicitly requested by an application, return EINVAL if the 234 * request is unaligned. In all cases, if the range for this request has 235 * been mmap'ed then we will perform buffered I/O to keep the mapped region 236 * synchronized with the ARC. 237 * 238 * It is possible that a file's pages could be mmap'ed after it is checked 239 * here. If so, that is handled accordingly in zfs_write(). See comments in the 240 * following area for how this is handled: 241 * zfs_write() -> update_pages() 242 */ 243 static int 244 zfs_setup_direct(struct znode *zp, zfs_uio_t *uio, zfs_uio_rw_t rw, 245 int *ioflagp) 246 { 247 zfsvfs_t *zfsvfs = ZTOZSB(zp); 248 objset_t *os = zfsvfs->z_os; 249 int ioflag = *ioflagp; 250 int error = 0; 251 252 if (os->os_direct == ZFS_DIRECT_ALWAYS) { 253 /* Force either direct or uncached I/O. */ 254 ioflag |= O_DIRECT; 255 } 256 257 if ((ioflag & O_DIRECT) == 0) 258 goto out; 259 260 if (!zfs_dio_enabled || os->os_direct == ZFS_DIRECT_DISABLED) { 261 /* 262 * Direct I/O is disabled. The I/O request will be directed 263 * through the ARC as uncached I/O. 264 */ 265 goto out; 266 } 267 268 if (!zfs_uio_page_aligned(uio) || 269 !zfs_uio_aligned(uio, PAGE_SIZE)) { 270 /* 271 * Misaligned requests can be executed through the ARC as 272 * uncached I/O. But if O_DIRECT was set by the user and we 273 * were set to be strict, then it is a failure. 274 */ 275 if ((*ioflagp & O_DIRECT) && zfs_dio_strict) 276 error = SET_ERROR(EINVAL); 277 goto out; 278 } 279 280 if (zn_has_cached_data(zp, zfs_uio_offset(uio), 281 zfs_uio_offset(uio) + zfs_uio_resid(uio) - 1)) { 282 /* 283 * The region is mmap'ed. The I/O request will be directed 284 * through the ARC as uncached I/O. 285 */ 286 goto out; 287 } 288 289 /* 290 * For short writes the page mapping of Direct I/O makes no sense. 291 * Direct them through the ARC as uncached I/O. 292 */ 293 if (rw == UIO_WRITE && zfs_uio_resid(uio) < zp->z_blksz) 294 goto out; 295 296 error = zfs_uio_get_dio_pages_alloc(uio, rw); 297 if (error) 298 goto out; 299 ASSERT(uio->uio_extflg & UIO_DIRECT); 300 301 out: 302 *ioflagp = ioflag; 303 return (error); 304 } 305 306 /* 307 * Read bytes from specified file into supplied buffer. 308 * 309 * IN: zp - inode of file to be read from. 310 * uio - structure supplying read location, range info, 311 * and return buffer. 312 * ioflag - O_SYNC flags; used to provide FRSYNC semantics. 313 * O_DIRECT flag; used to bypass page cache. 314 * cr - credentials of caller. 315 * 316 * OUT: uio - updated offset and range, buffer filled. 317 * 318 * RETURN: 0 on success, error code on failure. 319 * 320 * Side Effects: 321 * inode - atime updated if byte count > 0 322 */ 323 int 324 zfs_read(struct znode *zp, zfs_uio_t *uio, int ioflag, cred_t *cr) 325 { 326 (void) cr; 327 int error = 0; 328 boolean_t frsync = B_FALSE; 329 boolean_t dio_checksum_failure = B_FALSE; 330 331 zfsvfs_t *zfsvfs = ZTOZSB(zp); 332 if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) 333 return (error); 334 335 if (zp->z_pflags & ZFS_AV_QUARANTINED) { 336 zfs_exit(zfsvfs, FTAG); 337 return (SET_ERROR(EACCES)); 338 } 339 340 /* We don't copy out anything useful for directories.
*/ 341 if (Z_ISDIR(ZTOTYPE(zp))) { 342 zfs_exit(zfsvfs, FTAG); 343 return (SET_ERROR(EISDIR)); 344 } 345 346 /* 347 * Validate file offset 348 */ 349 if (zfs_uio_offset(uio) < (offset_t)0) { 350 zfs_exit(zfsvfs, FTAG); 351 return (SET_ERROR(EINVAL)); 352 } 353 354 /* 355 * Fasttrack empty reads 356 */ 357 if (zfs_uio_resid(uio) == 0) { 358 zfs_exit(zfsvfs, FTAG); 359 return (0); 360 } 361 362 #ifdef FRSYNC 363 /* 364 * If we're in FRSYNC mode, sync out this znode before reading it. 365 * Only do this for non-snapshots. 366 * 367 * Some platforms do not support FRSYNC and instead map it 368 * to O_SYNC, which results in unnecessary calls to zil_commit. We 369 * only honor FRSYNC requests on platforms which support it. 370 */ 371 frsync = !!(ioflag & FRSYNC); 372 #endif 373 if (zfsvfs->z_log && 374 (frsync || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)) 375 zil_commit(zfsvfs->z_log, zp->z_id); 376 377 /* 378 * Lock the range against changes. 379 */ 380 zfs_locked_range_t *lr = zfs_rangelock_enter(&zp->z_rangelock, 381 zfs_uio_offset(uio), zfs_uio_resid(uio), RL_READER); 382 383 /* 384 * If we are reading past end-of-file we can skip 385 * to the end; but we might still need to set atime. 386 */ 387 if (zfs_uio_offset(uio) >= zp->z_size) { 388 error = 0; 389 goto out; 390 } 391 ASSERT(zfs_uio_offset(uio) < zp->z_size); 392 393 /* 394 * Setting up Direct I/O if requested. 395 */ 396 error = zfs_setup_direct(zp, uio, UIO_READ, &ioflag); 397 if (error) { 398 goto out; 399 } 400 401 #if defined(__linux__) 402 ssize_t start_offset = zfs_uio_offset(uio); 403 #endif 404 ssize_t chunk_size = zfs_vnops_read_chunk_size; 405 ssize_t n = MIN(zfs_uio_resid(uio), zp->z_size - zfs_uio_offset(uio)); 406 ssize_t start_resid = n; 407 ssize_t dio_remaining_resid = 0; 408 409 dmu_flags_t dflags = DMU_READ_PREFETCH; 410 if (ioflag & O_DIRECT) 411 dflags |= DMU_UNCACHEDIO; 412 if (uio->uio_extflg & UIO_DIRECT) { 413 /* 414 * All pages for an O_DIRECT request have already been mapped 415 * so there's no compelling reason to handle this uio in 416 * smaller chunks. 417 */ 418 chunk_size = DMU_MAX_ACCESS; 419 420 /* 421 * In the event that the O_DIRECT request is reading the entire 422 * file, it is possible the file's length is not page-size 423 * aligned. However, lower layers expect that the Direct I/O 424 * request is page-aligned. In this case, as much of the file 425 * as can be read using Direct I/O is read that way, and the remaining 426 * amount will be read through the ARC. 427 * 428 * This is still consistent with the semantics of Direct I/O in 429 * ZFS as at a minimum the I/O request must be page-aligned. 430 */ 431 dio_remaining_resid = n - P2ALIGN_TYPED(n, PAGE_SIZE, ssize_t); 432 if (dio_remaining_resid != 0) 433 n -= dio_remaining_resid; 434 dflags |= DMU_DIRECTIO; 435 } 436 437 while (n > 0) { 438 ssize_t nbytes = MIN(n, chunk_size - 439 P2PHASE(zfs_uio_offset(uio), chunk_size)); 440 #ifdef UIO_NOCOPY 441 if (zfs_uio_segflg(uio) == UIO_NOCOPY) 442 error = mappedread_sf(zp, nbytes, uio); 443 else 444 #endif 445 if (zn_has_cached_data(zp, zfs_uio_offset(uio), 446 zfs_uio_offset(uio) + nbytes - 1)) { 447 error = mappedread(zp, nbytes, uio); 448 } else { 449 error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl), 450 uio, nbytes, dflags); 451 } 452 453 if (error) { 454 /* convert checksum errors into IO errors */ 455 if (error == ECKSUM) { 456 /* 457 * If a Direct I/O read returned a checksum 458 * verify error, then it must be treated as 459 * suspicious.
The contents of the buffer could 460 * have been manipulated while the I/O was in 461 * flight. In this case, the remainder of the I/O 462 * request will just be reissued through the 463 * ARC. 464 */ 465 if (uio->uio_extflg & UIO_DIRECT) { 466 dio_checksum_failure = B_TRUE; 467 uio->uio_extflg &= ~UIO_DIRECT; 468 n += dio_remaining_resid; 469 dio_remaining_resid = 0; 470 continue; 471 } else { 472 error = SET_ERROR(EIO); 473 } 474 } 475 476 #if defined(__linux__) 477 /* 478 * if we actually read some bytes, bubbling EFAULT 479 * up to become EAGAIN isn't what we want here... 480 * 481 * ...on Linux, at least. On FBSD, doing this breaks. 482 */ 483 if (error == EFAULT && 484 (zfs_uio_offset(uio) - start_offset) != 0) 485 error = 0; 486 #endif 487 break; 488 } 489 490 n -= nbytes; 491 } 492 493 if (error == 0 && (uio->uio_extflg & UIO_DIRECT) && 494 dio_remaining_resid != 0) { 495 /* 496 * Temporarily remove the UIO_DIRECT flag from the UIO so the 497 * remainder of the file can be read using the ARC. 498 */ 499 uio->uio_extflg &= ~UIO_DIRECT; 500 dflags &= ~DMU_DIRECTIO; 501 502 if (zn_has_cached_data(zp, zfs_uio_offset(uio), 503 zfs_uio_offset(uio) + dio_remaining_resid - 1)) { 504 error = mappedread(zp, dio_remaining_resid, uio); 505 } else { 506 error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl), uio, 507 dio_remaining_resid, dflags); 508 } 509 uio->uio_extflg |= UIO_DIRECT; 510 dflags |= DMU_DIRECTIO; 511 512 if (error != 0) 513 n += dio_remaining_resid; 514 } else if (error && (uio->uio_extflg & UIO_DIRECT)) { 515 n += dio_remaining_resid; 516 } 517 int64_t nread = start_resid - n; 518 519 dataset_kstats_update_read_kstats(&zfsvfs->z_kstat, nread); 520 out: 521 zfs_rangelock_exit(lr); 522 523 if (dio_checksum_failure == B_TRUE) 524 uio->uio_extflg |= UIO_DIRECT; 525 526 /* 527 * Cleanup for Direct I/O if requested. 528 */ 529 if (uio->uio_extflg & UIO_DIRECT) 530 zfs_uio_free_dio_pages(uio, UIO_READ); 531 532 ZFS_ACCESSTIME_STAMP(zfsvfs, zp); 533 zfs_exit(zfsvfs, FTAG); 534 return (error); 535 } 536 537 static void 538 zfs_clear_setid_bits_if_necessary(zfsvfs_t *zfsvfs, znode_t *zp, cred_t *cr, 539 uint64_t *clear_setid_bits_txgp, dmu_tx_t *tx) 540 { 541 zilog_t *zilog = zfsvfs->z_log; 542 const uint64_t uid = KUID_TO_SUID(ZTOUID(zp)); 543 544 ASSERT(clear_setid_bits_txgp != NULL); 545 ASSERT(tx != NULL); 546 547 /* 548 * Clear Set-UID/Set-GID bits on successful write if not 549 * privileged and at least one of the execute bits is set. 550 * 551 * It would be nice to do this after all writes have 552 * been done, but that would still expose the ISUID/ISGID 553 * to another app after the partial write is committed. 554 * 555 * Note: we don't call zfs_fuid_map_id() here because 556 * user 0 is not an ephemeral uid. 557 */ 558 mutex_enter(&zp->z_acl_lock); 559 if ((zp->z_mode & (S_IXUSR | (S_IXUSR >> 3) | (S_IXUSR >> 6))) != 0 && 560 (zp->z_mode & (S_ISUID | S_ISGID)) != 0 && 561 secpolicy_vnode_setid_retain(zp, cr, 562 ((zp->z_mode & S_ISUID) != 0 && uid == 0)) != 0) { 563 uint64_t newmode; 564 565 zp->z_mode &= ~(S_ISUID | S_ISGID); 566 newmode = zp->z_mode; 567 (void) sa_update(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs), 568 (void *)&newmode, sizeof (uint64_t), tx); 569 570 mutex_exit(&zp->z_acl_lock); 571 572 /* 573 * Make sure SUID/SGID bits will be removed when we replay the 574 * log. If the setid bits keep coming back, don't log more 575 * than one TX_SETATTR per transaction group.
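 * (*clear_setid_bits_txgp remembers the txg in which we last logged a
 * TX_SETATTR for this reason, which is what limits us to one such record
 * per txg in the check below.)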
576 */ 577 if (*clear_setid_bits_txgp != dmu_tx_get_txg(tx)) { 578 vattr_t va = {0}; 579 580 va.va_mask = ATTR_MODE; 581 va.va_nodeid = zp->z_id; 582 va.va_mode = newmode; 583 zfs_log_setattr(zilog, tx, TX_SETATTR, zp, &va, 584 ATTR_MODE, NULL); 585 *clear_setid_bits_txgp = dmu_tx_get_txg(tx); 586 } 587 } else { 588 mutex_exit(&zp->z_acl_lock); 589 } 590 } 591 592 /* 593 * Write the bytes to a file. 594 * 595 * IN: zp - znode of file to be written to. 596 * uio - structure supplying write location, range info, 597 * and data buffer. 598 * ioflag - O_APPEND flag set if in append mode. 599 * O_DIRECT flag; used to bypass page cache. 600 * cr - credentials of caller. 601 * 602 * OUT: uio - updated offset and range. 603 * 604 * RETURN: 0 if success 605 * error code if failure 606 * 607 * Timestamps: 608 * ip - ctime|mtime updated if byte count > 0 609 */ 610 int 611 zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr) 612 { 613 int error = 0, error1; 614 ssize_t start_resid = zfs_uio_resid(uio); 615 uint64_t clear_setid_bits_txg = 0; 616 boolean_t o_direct_defer = B_FALSE; 617 618 /* 619 * Fasttrack empty write 620 */ 621 ssize_t n = start_resid; 622 if (n == 0) 623 return (0); 624 625 zfsvfs_t *zfsvfs = ZTOZSB(zp); 626 if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) 627 return (error); 628 629 sa_bulk_attr_t bulk[4]; 630 int count = 0; 631 uint64_t mtime[2], ctime[2]; 632 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); 633 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); 634 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL, 635 &zp->z_size, 8); 636 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, 637 &zp->z_pflags, 8); 638 639 /* 640 * Callers might not be able to detect properly that we are read-only, 641 * so check it explicitly here. 642 */ 643 if (zfs_is_readonly(zfsvfs)) { 644 zfs_exit(zfsvfs, FTAG); 645 return (SET_ERROR(EROFS)); 646 } 647 648 /* 649 * If immutable or not appending then return EPERM. 650 * Intentionally allow ZFS_READONLY through here. 651 * See zfs_zaccess_common() 652 */ 653 if ((zp->z_pflags & ZFS_IMMUTABLE) || 654 ((zp->z_pflags & ZFS_APPENDONLY) && !(ioflag & O_APPEND) && 655 (zfs_uio_offset(uio) < zp->z_size))) { 656 zfs_exit(zfsvfs, FTAG); 657 return (SET_ERROR(EPERM)); 658 } 659 660 /* 661 * Validate file offset 662 */ 663 offset_t woff = ioflag & O_APPEND ? zp->z_size : zfs_uio_offset(uio); 664 if (woff < 0) { 665 zfs_exit(zfsvfs, FTAG); 666 return (SET_ERROR(EINVAL)); 667 } 668 669 /* 670 * Setting up Direct I/O if requested. 671 */ 672 error = zfs_setup_direct(zp, uio, UIO_WRITE, &ioflag); 673 if (error) { 674 zfs_exit(zfsvfs, FTAG); 675 return (SET_ERROR(error)); 676 } 677 678 /* 679 * Pre-fault the pages to ensure slow (eg NFS) pages 680 * don't hold up txg. 681 */ 682 ssize_t pfbytes = MIN(n, DMU_MAX_ACCESS >> 1); 683 if (zfs_uio_prefaultpages(pfbytes, uio)) { 684 zfs_exit(zfsvfs, FTAG); 685 return (SET_ERROR(EFAULT)); 686 } 687 688 /* 689 * If in append mode, set the io offset pointer to eof. 690 */ 691 zfs_locked_range_t *lr; 692 if (ioflag & O_APPEND) { 693 /* 694 * Obtain an appending range lock to guarantee file append 695 * semantics. We reset the write offset once we have the lock. 696 */ 697 lr = zfs_rangelock_enter(&zp->z_rangelock, 0, n, RL_APPEND); 698 woff = lr->lr_offset; 699 if (lr->lr_length == UINT64_MAX) { 700 /* 701 * We overlocked the file because this write will cause 702 * the file block size to increase. 
703 * Note that zp_size cannot change with this lock held. 704 */ 705 woff = zp->z_size; 706 } 707 zfs_uio_setoffset(uio, woff); 708 /* 709 * We need to update the starting offset as well because it is 710 * set previously in the ZPL (Linux) and VNOPS (FreeBSD) 711 * layers. 712 */ 713 zfs_uio_setsoffset(uio, woff); 714 } else { 715 /* 716 * Note that if the file block size will change as a result of 717 * this write, then this range lock will lock the entire file 718 * so that we can re-write the block safely. 719 */ 720 lr = zfs_rangelock_enter(&zp->z_rangelock, woff, n, RL_WRITER); 721 } 722 723 if (zn_rlimit_fsize_uio(zp, uio)) { 724 zfs_rangelock_exit(lr); 725 zfs_exit(zfsvfs, FTAG); 726 return (SET_ERROR(EFBIG)); 727 } 728 729 const rlim64_t limit = MAXOFFSET_T; 730 731 if (woff >= limit) { 732 zfs_rangelock_exit(lr); 733 zfs_exit(zfsvfs, FTAG); 734 return (SET_ERROR(EFBIG)); 735 } 736 737 if (n > limit - woff) 738 n = limit - woff; 739 740 uint64_t end_size = MAX(zp->z_size, woff + n); 741 zilog_t *zilog = zfsvfs->z_log; 742 boolean_t commit = (ioflag & (O_SYNC | O_DSYNC)) || 743 (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS); 744 745 const uint64_t uid = KUID_TO_SUID(ZTOUID(zp)); 746 const uint64_t gid = KGID_TO_SGID(ZTOGID(zp)); 747 const uint64_t projid = zp->z_projid; 748 749 /* 750 * In the event we are increasing the file block size 751 * (lr_length == UINT64_MAX), we will direct the write to the ARC. 752 * Because zfs_grow_blocksize() will read from the ARC in order to 753 * grow the dbuf, we avoid doing Direct I/O here as that would cause 754 * data written to disk to be overwritten by data in the ARC during 755 * the sync phase. Besides writing data twice to disk, we also 756 * want to avoid consistency concerns between data in the ARC and 757 * on disk while growing the file's blocksize. 758 * 759 * We will only temporarily remove Direct I/O and put it back after 760 * we have grown the blocksize. We do this in the event a request 761 * is larger than max_blksz, so further requests to 762 * dmu_write_uio_dbuf() will still issue the requests using Direct 763 * I/O. 764 * 765 * As an example: 766 * The first block of the file is being written as a 4k request with 767 * a recordsize of 1K. The first 1K issued in the loop below will go 768 * through the ARC; however, the following 3 1K requests will 769 * use Direct I/O. 770 */ 771 if (uio->uio_extflg & UIO_DIRECT && lr->lr_length == UINT64_MAX) { 772 uio->uio_extflg &= ~UIO_DIRECT; 773 o_direct_defer = B_TRUE; 774 } 775 776 /* 777 * Write the file in reasonable size chunks. Each chunk is written 778 * in a separate transaction; this keeps the intent log records small 779 * and allows us to do more fine-grained space accounting. 780 */ 781 while (n > 0) { 782 woff = zfs_uio_offset(uio); 783 784 if (zfs_id_overblockquota(zfsvfs, DMU_USERUSED_OBJECT, uid) || 785 zfs_id_overblockquota(zfsvfs, DMU_GROUPUSED_OBJECT, gid) || 786 (projid != ZFS_DEFAULT_PROJID && 787 zfs_id_overblockquota(zfsvfs, DMU_PROJECTUSED_OBJECT, 788 projid))) { 789 error = SET_ERROR(EDQUOT); 790 break; 791 } 792 793 uint64_t blksz; 794 if (lr->lr_length == UINT64_MAX && zp->z_size <= zp->z_blksz) { 795 if (zp->z_blksz > zfsvfs->z_max_blksz && 796 !ISP2(zp->z_blksz)) { 797 /* 798 * File's blocksize is already larger than the 799 * "recordsize" property. Only let it grow to 800 * the next power of 2.
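 * (Since the blocksize is not a power of 2 in this
 * branch, 1 << highbit64() below rounds it up to the
 * next power of 2.)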
801 */ 802 blksz = 1 << highbit64(zp->z_blksz); 803 } else { 804 blksz = zfsvfs->z_max_blksz; 805 } 806 blksz = MIN(blksz, P2ROUNDUP(end_size, 807 SPA_MINBLOCKSIZE)); 808 blksz = MAX(blksz, zp->z_blksz); 809 } else { 810 blksz = zp->z_blksz; 811 } 812 813 arc_buf_t *abuf = NULL; 814 ssize_t nbytes = n; 815 if (n >= blksz && woff >= zp->z_size && 816 P2PHASE(woff, blksz) == 0 && 817 !(uio->uio_extflg & UIO_DIRECT) && 818 (blksz >= SPA_OLD_MAXBLOCKSIZE || n < 4 * blksz)) { 819 /* 820 * This write covers a full block. "Borrow" a buffer 821 * from the dmu so that we can fill it before we enter 822 * a transaction. This avoids the possibility of 823 * holding up the transaction if the data copy hangs 824 * up on a pagefault (e.g., from an NFS server mapping). 825 */ 826 abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl), 827 blksz); 828 ASSERT(abuf != NULL); 829 ASSERT(arc_buf_size(abuf) == blksz); 830 if ((error = zfs_uiocopy(abuf->b_data, blksz, 831 UIO_WRITE, uio, &nbytes))) { 832 dmu_return_arcbuf(abuf); 833 break; 834 } 835 ASSERT3S(nbytes, ==, blksz); 836 } else { 837 nbytes = MIN(n, (DMU_MAX_ACCESS >> 1) - 838 P2PHASE(woff, blksz)); 839 if (pfbytes < nbytes) { 840 if (zfs_uio_prefaultpages(nbytes, uio)) { 841 error = SET_ERROR(EFAULT); 842 break; 843 } 844 pfbytes = nbytes; 845 } 846 } 847 848 /* 849 * Start a transaction. 850 */ 851 dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os); 852 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); 853 dmu_buf_impl_t *db = (dmu_buf_impl_t *)sa_get_db(zp->z_sa_hdl); 854 DB_DNODE_ENTER(db); 855 dmu_tx_hold_write_by_dnode(tx, DB_DNODE(db), woff, nbytes); 856 DB_DNODE_EXIT(db); 857 zfs_sa_upgrade_txholds(tx, zp); 858 error = dmu_tx_assign(tx, DMU_TX_WAIT); 859 if (error) { 860 dmu_tx_abort(tx); 861 if (abuf != NULL) 862 dmu_return_arcbuf(abuf); 863 break; 864 } 865 866 /* 867 * NB: We must call zfs_clear_setid_bits_if_necessary before 868 * committing the transaction! 869 */ 870 871 /* 872 * If rangelock_enter() over-locked we grow the blocksize 873 * and then reduce the lock range. This will only happen 874 * on the first iteration since rangelock_reduce() will 875 * shrink down lr_length to the appropriate size. 876 */ 877 if (lr->lr_length == UINT64_MAX) { 878 zfs_grow_blocksize(zp, blksz, tx); 879 zfs_rangelock_reduce(lr, woff, n); 880 } 881 882 dmu_flags_t dflags = DMU_READ_PREFETCH; 883 if (ioflag & O_DIRECT) 884 dflags |= DMU_UNCACHEDIO; 885 if (uio->uio_extflg & UIO_DIRECT) 886 dflags |= DMU_DIRECTIO; 887 888 ssize_t tx_bytes; 889 if (abuf == NULL) { 890 tx_bytes = zfs_uio_resid(uio); 891 zfs_uio_fault_disable(uio, B_TRUE); 892 error = dmu_write_uio_dbuf(sa_get_db(zp->z_sa_hdl), 893 uio, nbytes, tx, dflags); 894 zfs_uio_fault_disable(uio, B_FALSE); 895 #ifdef __linux__ 896 if (error == EFAULT) { 897 zfs_clear_setid_bits_if_necessary(zfsvfs, zp, 898 cr, &clear_setid_bits_txg, tx); 899 dmu_tx_commit(tx); 900 /* 901 * Account for partial writes before 902 * continuing the loop. 903 * Update needs to occur before the next 904 * zfs_uio_prefaultpages, or prefaultpages may 905 * error, and we may break the loop early. 906 */ 907 n -= tx_bytes - zfs_uio_resid(uio); 908 pfbytes -= tx_bytes - zfs_uio_resid(uio); 909 continue; 910 } 911 #endif 912 /* 913 * On FreeBSD, EFAULT should be propagated back to the 914 * VFS, which will handle faulting and will retry. 
*/ 916 if (error != 0 && error != EFAULT) { 917 zfs_clear_setid_bits_if_necessary(zfsvfs, zp, 918 cr, &clear_setid_bits_txg, tx); 919 dmu_tx_commit(tx); 920 break; 921 } 922 tx_bytes -= zfs_uio_resid(uio); 923 } else { 924 /* 925 * Thus, we're writing a full block at a block-aligned 926 * offset and extending the file past EOF. 927 * 928 * dmu_assign_arcbuf_by_dbuf() will directly assign the 929 * arc buffer to a dbuf. 930 */ 931 error = dmu_assign_arcbuf_by_dbuf( 932 sa_get_db(zp->z_sa_hdl), woff, abuf, tx, dflags); 933 if (error != 0) { 934 /* 935 * XXX This might not be necessary if 936 * dmu_assign_arcbuf_by_dbuf is guaranteed 937 * to be atomic. 938 */ 939 zfs_clear_setid_bits_if_necessary(zfsvfs, zp, 940 cr, &clear_setid_bits_txg, tx); 941 dmu_return_arcbuf(abuf); 942 dmu_tx_commit(tx); 943 break; 944 } 945 ASSERT3S(nbytes, <=, zfs_uio_resid(uio)); 946 zfs_uioskip(uio, nbytes); 947 tx_bytes = nbytes; 948 } 949 /* 950 * There is a window where a file's pages can be mmap'ed after 951 * zfs_setup_direct() is called. This is due to the fact that 952 * the rangelock in this function is acquired after calling 953 * zfs_setup_direct(). This is done so that 954 * zfs_uio_prefaultpages() does not attempt to fault in pages 955 * on Linux for Direct I/O requests. This is not necessary as 956 * the pages are pinned in memory and cannot be faulted out. 957 * Ideally, the rangelock would be held before calling 958 * zfs_setup_direct() and zfs_uio_prefaultpages(); however, 959 * this can lead to a deadlock as zfs_getpage() also acquires 960 * the rangelock as a RL_WRITER and prefaulting the pages can 961 * lead to zfs_getpage() being called. 962 * 963 * In the case of the pages being mapped after 964 * zfs_setup_direct() is called, the call to update_pages() 965 * will still be made to make sure there is consistency between 966 * the ARC and the Linux page cache. This is an unfortunate 967 * situation as the data will be read back into the ARC after 968 * the Direct I/O write has completed, but this is the penalty 969 * for writing to a mmap'ed region of a file using Direct I/O. 970 */ 971 if (tx_bytes && 972 zn_has_cached_data(zp, woff, woff + tx_bytes - 1)) { 973 update_pages(zp, woff, tx_bytes, zfsvfs->z_os); 974 } 975 976 /* 977 * If we made no progress, we're done. If we made even 978 * partial progress, update the znode and ZIL accordingly. 979 */ 980 if (tx_bytes == 0) { 981 (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs), 982 (void *)&zp->z_size, sizeof (uint64_t), tx); 983 dmu_tx_commit(tx); 984 ASSERT(error != 0); 985 break; 986 } 987 988 zfs_clear_setid_bits_if_necessary(zfsvfs, zp, cr, 989 &clear_setid_bits_txg, tx); 990 991 zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime); 992 993 /* 994 * Update the file size (zp_size) if it has changed; 995 * account for possible concurrent updates. 996 */ 997 while ((end_size = zp->z_size) < zfs_uio_offset(uio)) { 998 (void) atomic_cas_64(&zp->z_size, end_size, 999 zfs_uio_offset(uio)); 1000 ASSERT(error == 0 || error == EFAULT); 1001 } 1002 /* 1003 * If we are replaying and eof is non-zero then force 1004 * the file size to the specified eof. Note, there's no 1005 * concurrency during replay. 1006 */ 1007 if (zfsvfs->z_replay && zfsvfs->z_replay_eof != 0) 1008 zp->z_size = zfsvfs->z_replay_eof; 1009 1010 error1 = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); 1011 if (error1 != 0) 1012 /* Avoid clobbering EFAULT.
*/ 1013 error = error1; 1014 1015 /* 1016 * NB: During replay, the TX_SETATTR record logged by 1017 * zfs_clear_setid_bits_if_necessary must precede any of 1018 * the TX_WRITE records logged here. 1019 */ 1020 zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, commit, 1021 uio->uio_extflg & UIO_DIRECT ? B_TRUE : B_FALSE, NULL, 1022 NULL); 1023 1024 dmu_tx_commit(tx); 1025 1026 /* 1027 * Direct I/O was deferred in order to grow the first block. 1028 * At this point it can be re-enabled for subsequent writes. 1029 */ 1030 if (o_direct_defer) { 1031 ASSERT(ioflag & O_DIRECT); 1032 uio->uio_extflg |= UIO_DIRECT; 1033 o_direct_defer = B_FALSE; 1034 } 1035 1036 if (error != 0) 1037 break; 1038 ASSERT3S(tx_bytes, ==, nbytes); 1039 n -= nbytes; 1040 pfbytes -= nbytes; 1041 } 1042 1043 if (o_direct_defer) { 1044 ASSERT(ioflag & O_DIRECT); 1045 uio->uio_extflg |= UIO_DIRECT; 1046 o_direct_defer = B_FALSE; 1047 } 1048 1049 zfs_znode_update_vfs(zp); 1050 zfs_rangelock_exit(lr); 1051 1052 /* 1053 * Cleanup for Direct I/O if requested. 1054 */ 1055 if (uio->uio_extflg & UIO_DIRECT) 1056 zfs_uio_free_dio_pages(uio, UIO_WRITE); 1057 1058 /* 1059 * If we're in replay mode, or we made no progress, or the 1060 * uio data is inaccessible return an error. Otherwise, it's 1061 * at least a partial write, so it's successful. 1062 */ 1063 if (zfsvfs->z_replay || zfs_uio_resid(uio) == start_resid || 1064 error == EFAULT) { 1065 zfs_exit(zfsvfs, FTAG); 1066 return (error); 1067 } 1068 1069 if (commit) 1070 zil_commit(zilog, zp->z_id); 1071 1072 int64_t nwritten = start_resid - zfs_uio_resid(uio); 1073 dataset_kstats_update_write_kstats(&zfsvfs->z_kstat, nwritten); 1074 1075 zfs_exit(zfsvfs, FTAG); 1076 return (0); 1077 } 1078 1079 /* 1080 * Rewrite a range of file as-is without modification. 1081 * 1082 * IN: zp - znode of file to be rewritten. 1083 * off - Offset of the range to rewrite. 1084 * len - Length of the range to rewrite. 1085 * flags - Random rewrite parameters. 1086 * arg - flags-specific argument. 1087 * 1088 * RETURN: 0 if success 1089 * error code if failure 1090 */ 1091 int 1092 zfs_rewrite(znode_t *zp, uint64_t off, uint64_t len, uint64_t flags, 1093 uint64_t arg) 1094 { 1095 int error; 1096 1097 if (flags != 0 || arg != 0) 1098 return (SET_ERROR(EINVAL)); 1099 1100 zfsvfs_t *zfsvfs = ZTOZSB(zp); 1101 if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) 1102 return (error); 1103 1104 if (zfs_is_readonly(zfsvfs)) { 1105 zfs_exit(zfsvfs, FTAG); 1106 return (SET_ERROR(EROFS)); 1107 } 1108 1109 if (off >= zp->z_size) { 1110 zfs_exit(zfsvfs, FTAG); 1111 return (0); 1112 } 1113 if (len == 0 || len > zp->z_size - off) 1114 len = zp->z_size - off; 1115 1116 /* Flush any mmap()'d data to disk */ 1117 if (zn_has_cached_data(zp, off, off + len - 1)) 1118 zn_flush_cached_data(zp, B_TRUE); 1119 1120 zfs_locked_range_t *lr; 1121 lr = zfs_rangelock_enter(&zp->z_rangelock, off, len, RL_WRITER); 1122 1123 const uint64_t uid = KUID_TO_SUID(ZTOUID(zp)); 1124 const uint64_t gid = KGID_TO_SGID(ZTOGID(zp)); 1125 const uint64_t projid = zp->z_projid; 1126 1127 dmu_buf_impl_t *db = (dmu_buf_impl_t *)sa_get_db(zp->z_sa_hdl); 1128 DB_DNODE_ENTER(db); 1129 dnode_t *dn = DB_DNODE(db); 1130 1131 uint64_t n, noff = off, nr = 0, nw = 0; 1132 while (len > 0) { 1133 /* 1134 * Rewrite only actual data, skipping any holes. This might 1135 * be inaccurate for dirty files, but we don't really care. 1136 */ 1137 if (noff == off) { 1138 /* Find next data in the file. 
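 * (dnode_next_offset() returns ESRCH when there is no more
 * data past noff; that case is treated as a clean end of the
 * range below.)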
*/ 1139 error = dnode_next_offset(dn, 0, &noff, 1, 1, 0); 1140 if (error || noff >= off + len) { 1141 if (error == ESRCH) /* No more data. */ 1142 error = 0; 1143 break; 1144 } 1145 ASSERT3U(noff, >=, off); 1146 len -= noff - off; 1147 off = noff; 1148 1149 /* Find where the data end. */ 1150 error = dnode_next_offset(dn, DNODE_FIND_HOLE, &noff, 1151 1, 1, 0); 1152 if (error != 0) 1153 noff = off + len; 1154 } 1155 ASSERT3U(noff, >, off); 1156 1157 if (zfs_id_overblockquota(zfsvfs, DMU_USERUSED_OBJECT, uid) || 1158 zfs_id_overblockquota(zfsvfs, DMU_GROUPUSED_OBJECT, gid) || 1159 (projid != ZFS_DEFAULT_PROJID && 1160 zfs_id_overblockquota(zfsvfs, DMU_PROJECTUSED_OBJECT, 1161 projid))) { 1162 error = SET_ERROR(EDQUOT); 1163 break; 1164 } 1165 1166 n = MIN(MIN(len, noff - off), 1167 DMU_MAX_ACCESS / 2 - P2PHASE(off, zp->z_blksz)); 1168 1169 dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os); 1170 dmu_tx_hold_write_by_dnode(tx, dn, off, n); 1171 error = dmu_tx_assign(tx, DMU_TX_WAIT); 1172 if (error) { 1173 dmu_tx_abort(tx); 1174 break; 1175 } 1176 1177 /* Mark all dbufs within range as dirty to trigger rewrite. */ 1178 dmu_buf_t **dbp; 1179 int numbufs; 1180 error = dmu_buf_hold_array_by_dnode(dn, off, n, TRUE, FTAG, 1181 &numbufs, &dbp, DMU_READ_PREFETCH | DMU_UNCACHEDIO); 1182 if (error) { 1183 dmu_tx_abort(tx); 1184 break; 1185 } 1186 for (int i = 0; i < numbufs; i++) { 1187 nr += dbp[i]->db_size; 1188 if (dmu_buf_is_dirty(dbp[i], tx)) 1189 continue; 1190 nw += dbp[i]->db_size; 1191 dmu_buf_will_dirty(dbp[i], tx); 1192 } 1193 dmu_buf_rele_array(dbp, numbufs, FTAG); 1194 1195 dmu_tx_commit(tx); 1196 1197 len -= n; 1198 off += n; 1199 1200 if (issig()) { 1201 error = SET_ERROR(EINTR); 1202 break; 1203 } 1204 } 1205 1206 DB_DNODE_EXIT(db); 1207 1208 dataset_kstats_update_read_kstats(&zfsvfs->z_kstat, nr); 1209 dataset_kstats_update_write_kstats(&zfsvfs->z_kstat, nw); 1210 1211 zfs_rangelock_exit(lr); 1212 zfs_exit(zfsvfs, FTAG); 1213 return (error); 1214 } 1215 1216 int 1217 zfs_getsecattr(znode_t *zp, vsecattr_t *vsecp, int flag, cred_t *cr) 1218 { 1219 zfsvfs_t *zfsvfs = ZTOZSB(zp); 1220 int error; 1221 boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; 1222 1223 if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) 1224 return (error); 1225 error = zfs_getacl(zp, vsecp, skipaclchk, cr); 1226 zfs_exit(zfsvfs, FTAG); 1227 1228 return (error); 1229 } 1230 1231 int 1232 zfs_setsecattr(znode_t *zp, vsecattr_t *vsecp, int flag, cred_t *cr) 1233 { 1234 zfsvfs_t *zfsvfs = ZTOZSB(zp); 1235 int error; 1236 boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; 1237 zilog_t *zilog; 1238 1239 if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) 1240 return (error); 1241 zilog = zfsvfs->z_log; 1242 error = zfs_setacl(zp, vsecp, skipaclchk, cr); 1243 1244 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) 1245 zil_commit(zilog, 0); 1246 1247 zfs_exit(zfsvfs, FTAG); 1248 return (error); 1249 } 1250 1251 /* 1252 * Get the optimal alignment to ensure direct IO can be performed without 1253 * incurring any RMW penalty on write. If direct IO is not enabled for this 1254 * file, returns an error. 1255 */ 1256 int 1257 zfs_get_direct_alignment(znode_t *zp, uint64_t *alignp) 1258 { 1259 zfsvfs_t *zfsvfs = ZTOZSB(zp); 1260 1261 if (!zfs_dio_enabled || zfsvfs->z_os->os_direct == ZFS_DIRECT_DISABLED) 1262 return (SET_ERROR(EOPNOTSUPP)); 1263 1264 /* 1265 * If the file has multiple blocks, then its block size is fixed 1266 * forever, and so is the ideal alignment. 
1267 * 1268 * If however it only has a single block, then we want to return the 1269 * max block size it could possibly grown to (ie, the dataset 1270 * recordsize). We do this so that a program querying alignment 1271 * immediately after the file is created gets a value that won't change 1272 * once the file has grown into the second block and beyond. 1273 * 1274 * Because we don't have a count of blocks easily available here, we 1275 * check if the apparent file size is smaller than its current block 1276 * size (meaning, the file hasn't yet grown into the current block 1277 * size) and then, check if the block size is smaller than the dataset 1278 * maximum (meaning, if the file grew past the current block size, the 1279 * block size could would be increased). 1280 */ 1281 if (zp->z_size <= zp->z_blksz && zp->z_blksz < zfsvfs->z_max_blksz) 1282 *alignp = MAX(zfsvfs->z_max_blksz, PAGE_SIZE); 1283 else 1284 *alignp = MAX(zp->z_blksz, PAGE_SIZE); 1285 1286 return (0); 1287 } 1288 1289 #ifdef ZFS_DEBUG 1290 static int zil_fault_io = 0; 1291 #endif 1292 1293 static void zfs_get_done(zgd_t *zgd, int error); 1294 1295 /* 1296 * Get data to generate a TX_WRITE intent log record. 1297 */ 1298 int 1299 zfs_get_data(void *arg, uint64_t gen, lr_write_t *lr, char *buf, 1300 struct lwb *lwb, zio_t *zio) 1301 { 1302 zfsvfs_t *zfsvfs = arg; 1303 objset_t *os = zfsvfs->z_os; 1304 znode_t *zp; 1305 uint64_t object = lr->lr_foid; 1306 uint64_t offset = lr->lr_offset; 1307 uint64_t size = lr->lr_length; 1308 zgd_t *zgd; 1309 int error = 0; 1310 uint64_t zp_gen; 1311 1312 ASSERT3P(lwb, !=, NULL); 1313 ASSERT3U(size, !=, 0); 1314 1315 /* 1316 * Nothing to do if the file has been removed 1317 */ 1318 if (zfs_zget(zfsvfs, object, &zp) != 0) 1319 return (SET_ERROR(ENOENT)); 1320 if (zp->z_unlinked) { 1321 /* 1322 * Release the vnode asynchronously as we currently have the 1323 * txg stopped from syncing. 1324 */ 1325 zfs_zrele_async(zp); 1326 return (SET_ERROR(ENOENT)); 1327 } 1328 /* check if generation number matches */ 1329 if (sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs), &zp_gen, 1330 sizeof (zp_gen)) != 0) { 1331 zfs_zrele_async(zp); 1332 return (SET_ERROR(EIO)); 1333 } 1334 if (zp_gen != gen) { 1335 zfs_zrele_async(zp); 1336 return (SET_ERROR(ENOENT)); 1337 } 1338 1339 zgd = kmem_zalloc(sizeof (zgd_t), KM_SLEEP); 1340 zgd->zgd_lwb = lwb; 1341 zgd->zgd_private = zp; 1342 1343 /* 1344 * Write records come in two flavors: immediate and indirect. 1345 * For small writes it's cheaper to store the data with the 1346 * log record (immediate); for large writes it's cheaper to 1347 * sync the data and get a pointer to it (indirect) so that 1348 * we don't have to write the data twice. 1349 */ 1350 if (buf != NULL) { /* immediate write */ 1351 zgd->zgd_lr = zfs_rangelock_enter(&zp->z_rangelock, offset, 1352 size, RL_READER); 1353 /* test for truncation needs to be done while range locked */ 1354 if (offset >= zp->z_size) { 1355 error = SET_ERROR(ENOENT); 1356 } else { 1357 error = dmu_read(os, object, offset, size, buf, 1358 DMU_READ_NO_PREFETCH | DMU_KEEP_CACHING); 1359 } 1360 ASSERT(error == 0 || error == ENOENT); 1361 } else { /* indirect write */ 1362 ASSERT3P(zio, !=, NULL); 1363 /* 1364 * Have to lock the whole block to ensure when it's 1365 * written out and its checksum is being calculated 1366 * that no one can change the data. We need to re-check 1367 * blocksize after we get the lock in case it's changed! 1368 */ 1369 for (;;) { 1370 uint64_t blkoff; 1371 size = zp->z_blksz; 1372 blkoff = ISP2(size) ? 
P2PHASE(offset, size) : offset; 1373 offset -= blkoff; 1374 zgd->zgd_lr = zfs_rangelock_enter(&zp->z_rangelock, 1375 offset, size, RL_READER); 1376 if (zp->z_blksz == size) 1377 break; 1378 offset += blkoff; 1379 zfs_rangelock_exit(zgd->zgd_lr); 1380 } 1381 /* test for truncation needs to be done while range locked */ 1382 if (lr->lr_offset >= zp->z_size) 1383 error = SET_ERROR(ENOENT); 1384 #ifdef ZFS_DEBUG 1385 if (zil_fault_io) { 1386 error = SET_ERROR(EIO); 1387 zil_fault_io = 0; 1388 } 1389 #endif 1390 1391 dmu_buf_t *dbp; 1392 if (error == 0) 1393 error = dmu_buf_hold_noread(os, object, offset, zgd, 1394 &dbp); 1395 1396 if (error == 0) { 1397 zgd->zgd_db = dbp; 1398 dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbp; 1399 boolean_t direct_write = B_FALSE; 1400 mutex_enter(&db->db_mtx); 1401 dbuf_dirty_record_t *dr = 1402 dbuf_find_dirty_eq(db, lr->lr_common.lrc_txg); 1403 if (dr != NULL && dr->dt.dl.dr_diowrite) 1404 direct_write = B_TRUE; 1405 mutex_exit(&db->db_mtx); 1406 1407 /* 1408 * All Direct I/O writes will have already completed and 1409 * the block pointer can be immediately stored in the 1410 * log record. 1411 */ 1412 if (direct_write) { 1413 /* 1414 * A Direct I/O write always covers an entire 1415 * block. 1416 */ 1417 ASSERT3U(dbp->db_size, ==, zp->z_blksz); 1418 lr->lr_blkptr = dr->dt.dl.dr_overridden_by; 1419 zfs_get_done(zgd, 0); 1420 return (0); 1421 } 1422 1423 blkptr_t *bp = &lr->lr_blkptr; 1424 zgd->zgd_bp = bp; 1425 1426 ASSERT3U(dbp->db_offset, ==, offset); 1427 ASSERT3U(dbp->db_size, ==, size); 1428 1429 error = dmu_sync(zio, lr->lr_common.lrc_txg, 1430 zfs_get_done, zgd); 1431 ASSERT(error || lr->lr_length <= size); 1432 1433 /* 1434 * On success, we need to wait for the write I/O 1435 * initiated by dmu_sync() to complete before we can 1436 * release this dbuf. We will finish everything up 1437 * in the zfs_get_done() callback. 1438 */ 1439 if (error == 0) 1440 return (0); 1441 1442 if (error == EALREADY) { 1443 lr->lr_common.lrc_txtype = TX_WRITE2; 1444 /* 1445 * TX_WRITE2 relies on the data previously 1446 * written by the TX_WRITE that caused 1447 * EALREADY. We zero out the BP because 1448 * it is the old, currently-on-disk BP. 1449 */ 1450 zgd->zgd_bp = NULL; 1451 BP_ZERO(bp); 1452 error = 0; 1453 } 1454 } 1455 } 1456 1457 zfs_get_done(zgd, error); 1458 1459 return (error); 1460 } 1461 1462 static void 1463 zfs_get_done(zgd_t *zgd, int error) 1464 { 1465 (void) error; 1466 znode_t *zp = zgd->zgd_private; 1467 1468 if (zgd->zgd_db) 1469 dmu_buf_rele(zgd->zgd_db, zgd); 1470 1471 zfs_rangelock_exit(zgd->zgd_lr); 1472 1473 /* 1474 * Release the vnode asynchronously as we currently have the 1475 * txg stopped from syncing. 1476 */ 1477 zfs_zrele_async(zp); 1478 1479 kmem_free(zgd, sizeof (zgd_t)); 1480 } 1481 1482 static int 1483 zfs_enter_two(zfsvfs_t *zfsvfs1, zfsvfs_t *zfsvfs2, const char *tag) 1484 { 1485 int error; 1486 1487 /* Swap. Not sure if the order of zfs_enter()s is important. 
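 * Ordering the two zfsvfs by address at least makes the order
 * deterministic for any given pair, so two concurrent cross-dataset
 * operations cannot enter the same pair in opposite orders.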
*/ 1488 if (zfsvfs1 > zfsvfs2) { 1489 zfsvfs_t *tmpzfsvfs; 1490 1491 tmpzfsvfs = zfsvfs2; 1492 zfsvfs2 = zfsvfs1; 1493 zfsvfs1 = tmpzfsvfs; 1494 } 1495 1496 error = zfs_enter(zfsvfs1, tag); 1497 if (error != 0) 1498 return (error); 1499 if (zfsvfs1 != zfsvfs2) { 1500 error = zfs_enter(zfsvfs2, tag); 1501 if (error != 0) { 1502 zfs_exit(zfsvfs1, tag); 1503 return (error); 1504 } 1505 } 1506 1507 return (0); 1508 } 1509 1510 static void 1511 zfs_exit_two(zfsvfs_t *zfsvfs1, zfsvfs_t *zfsvfs2, const char *tag) 1512 { 1513 1514 zfs_exit(zfsvfs1, tag); 1515 if (zfsvfs1 != zfsvfs2) 1516 zfs_exit(zfsvfs2, tag); 1517 } 1518 1519 /* 1520 * We split each clone request in chunks that can fit into a single ZIL 1521 * log entry. Each ZIL log entry can fit 130816 bytes for a block cloning 1522 * operation (see zil_max_log_data() and zfs_log_clone_range()). This gives 1523 * us room for storing 1022 block pointers. 1524 * 1525 * On success, the function return the number of bytes copied in *lenp. 1526 * Note, it doesn't return how much bytes are left to be copied. 1527 * On errors which are caused by any file system limitations or 1528 * brt limitations `EINVAL` is returned. In the most cases a user 1529 * requested bad parameters, it could be possible to clone the file but 1530 * some parameters don't match the requirements. 1531 */ 1532 int 1533 zfs_clone_range(znode_t *inzp, uint64_t *inoffp, znode_t *outzp, 1534 uint64_t *outoffp, uint64_t *lenp, cred_t *cr) 1535 { 1536 zfsvfs_t *inzfsvfs, *outzfsvfs; 1537 objset_t *inos, *outos; 1538 zfs_locked_range_t *inlr, *outlr; 1539 dmu_buf_impl_t *db; 1540 dmu_tx_t *tx; 1541 zilog_t *zilog; 1542 uint64_t inoff, outoff, len, done; 1543 uint64_t outsize, size; 1544 int error; 1545 int count = 0; 1546 sa_bulk_attr_t bulk[3]; 1547 uint64_t mtime[2], ctime[2]; 1548 uint64_t uid, gid, projid; 1549 blkptr_t *bps; 1550 size_t maxblocks, nbps; 1551 uint_t inblksz; 1552 uint64_t clear_setid_bits_txg = 0; 1553 uint64_t last_synced_txg = 0; 1554 1555 inoff = *inoffp; 1556 outoff = *outoffp; 1557 len = *lenp; 1558 done = 0; 1559 1560 inzfsvfs = ZTOZSB(inzp); 1561 outzfsvfs = ZTOZSB(outzp); 1562 1563 /* 1564 * We need to call zfs_enter() potentially on two different datasets, 1565 * so we need a dedicated function for that. 1566 */ 1567 error = zfs_enter_two(inzfsvfs, outzfsvfs, FTAG); 1568 if (error != 0) 1569 return (error); 1570 1571 inos = inzfsvfs->z_os; 1572 outos = outzfsvfs->z_os; 1573 1574 /* 1575 * Both source and destination have to belong to the same storage pool. 1576 */ 1577 if (dmu_objset_spa(inos) != dmu_objset_spa(outos)) { 1578 zfs_exit_two(inzfsvfs, outzfsvfs, FTAG); 1579 return (SET_ERROR(EXDEV)); 1580 } 1581 1582 /* 1583 * outos and inos belongs to the same storage pool. 1584 * see a few lines above, only one check. 1585 */ 1586 if (!spa_feature_is_enabled(dmu_objset_spa(outos), 1587 SPA_FEATURE_BLOCK_CLONING)) { 1588 zfs_exit_two(inzfsvfs, outzfsvfs, FTAG); 1589 return (SET_ERROR(EOPNOTSUPP)); 1590 } 1591 1592 ASSERT(!outzfsvfs->z_replay); 1593 1594 /* 1595 * Block cloning from an unencrypted dataset into an encrypted 1596 * dataset and vice versa is not supported. 1597 */ 1598 if (inos->os_encrypted != outos->os_encrypted) { 1599 zfs_exit_two(inzfsvfs, outzfsvfs, FTAG); 1600 return (SET_ERROR(EXDEV)); 1601 } 1602 1603 /* 1604 * Cloning across encrypted datasets is possible only if they 1605 * share the same master key. 
*/ 1607 if (inos != outos && inos->os_encrypted && 1608 !dmu_objset_crypto_key_equal(inos, outos)) { 1609 zfs_exit_two(inzfsvfs, outzfsvfs, FTAG); 1610 return (SET_ERROR(EXDEV)); 1611 } 1612 1613 error = zfs_verify_zp(inzp); 1614 if (error == 0) 1615 error = zfs_verify_zp(outzp); 1616 if (error != 0) { 1617 zfs_exit_two(inzfsvfs, outzfsvfs, FTAG); 1618 return (error); 1619 } 1620 1621 /* 1622 * We don't copy the source file's flags, which is why we don't allow 1623 * cloning of files that are in quarantine. 1624 */ 1625 if (inzp->z_pflags & ZFS_AV_QUARANTINED) { 1626 zfs_exit_two(inzfsvfs, outzfsvfs, FTAG); 1627 return (SET_ERROR(EACCES)); 1628 } 1629 1630 if (inoff >= inzp->z_size) { 1631 *lenp = 0; 1632 zfs_exit_two(inzfsvfs, outzfsvfs, FTAG); 1633 return (0); 1634 } 1635 if (len > inzp->z_size - inoff) { 1636 len = inzp->z_size - inoff; 1637 } 1638 if (len == 0) { 1639 *lenp = 0; 1640 zfs_exit_two(inzfsvfs, outzfsvfs, FTAG); 1641 return (0); 1642 } 1643 1644 /* 1645 * Callers might not be able to detect properly that we are read-only, 1646 * so check it explicitly here. 1647 */ 1648 if (zfs_is_readonly(outzfsvfs)) { 1649 zfs_exit_two(inzfsvfs, outzfsvfs, FTAG); 1650 return (SET_ERROR(EROFS)); 1651 } 1652 1653 /* 1654 * If immutable or not appending then return EPERM. 1655 * Intentionally allow ZFS_READONLY through here. 1656 * See zfs_zaccess_common() 1657 */ 1658 if ((outzp->z_pflags & ZFS_IMMUTABLE) != 0) { 1659 zfs_exit_two(inzfsvfs, outzfsvfs, FTAG); 1660 return (SET_ERROR(EPERM)); 1661 } 1662 1663 /* 1664 * No overlapping if we are cloning within the same file. 1665 */ 1666 if (inzp == outzp) { 1667 if (inoff < outoff + len && outoff < inoff + len) { 1668 zfs_exit_two(inzfsvfs, outzfsvfs, FTAG); 1669 return (SET_ERROR(EINVAL)); 1670 } 1671 } 1672 1673 /* Flush any mmap()'d data to disk */ 1674 if (zn_has_cached_data(inzp, inoff, inoff + len - 1)) 1675 zn_flush_cached_data(inzp, B_TRUE); 1676 1677 /* 1678 * Maintain predictable lock order. 1679 */ 1680 if (inzp < outzp || (inzp == outzp && inoff < outoff)) { 1681 inlr = zfs_rangelock_enter(&inzp->z_rangelock, inoff, len, 1682 RL_READER); 1683 outlr = zfs_rangelock_enter(&outzp->z_rangelock, outoff, len, 1684 RL_WRITER); 1685 } else { 1686 outlr = zfs_rangelock_enter(&outzp->z_rangelock, outoff, len, 1687 RL_WRITER); 1688 inlr = zfs_rangelock_enter(&inzp->z_rangelock, inoff, len, 1689 RL_READER); 1690 } 1691 1692 inblksz = inzp->z_blksz; 1693 1694 /* 1695 * We cannot clone into a file with a different block size if we can't 1696 * grow it (block size is already bigger, has more than one block, or 1697 * not locked for growth). There are other possible reasons for the 1698 * grow to fail, but we cover what we can before opening the transaction 1699 * and detect the rest after we try to do it. 1700 */ 1701 if (inblksz < outzp->z_blksz) { 1702 error = SET_ERROR(EINVAL); 1703 goto unlock; 1704 } 1705 if (inblksz != outzp->z_blksz && (outzp->z_size > outzp->z_blksz || 1706 outlr->lr_length != UINT64_MAX)) { 1707 error = SET_ERROR(EINVAL); 1708 goto unlock; 1709 } 1710 1711 /* 1712 * Block size must be power-of-2 if destination offset != 0. 1713 * There can be no multiple blocks of non-power-of-2 size. 1714 */ 1715 if (outoff != 0 && !ISP2(inblksz)) { 1716 error = SET_ERROR(EINVAL); 1717 goto unlock; 1718 } 1719 1720 /* 1721 * Offsets and len must be at block boundaries.
*/ 1723 if ((inoff % inblksz) != 0 || (outoff % inblksz) != 0) { 1724 error = SET_ERROR(EINVAL); 1725 goto unlock; 1726 } 1727 /* 1728 * Length must be a multiple of blksz, except for the end of the file. 1729 */ 1730 if ((len % inblksz) != 0 && 1731 (len < inzp->z_size - inoff || len < outzp->z_size - outoff)) { 1732 error = SET_ERROR(EINVAL); 1733 goto unlock; 1734 } 1735 1736 /* 1737 * If we are copying only one block and it is smaller than the recordsize 1738 * property, do not allow the destination to grow beyond one block if it 1739 * is not there yet. Otherwise the destination will get stuck with 1740 * that block size forever, which can be as small as 512 bytes, no 1741 * matter how big the destination grows later. 1742 */ 1743 if (len <= inblksz && inblksz < outzfsvfs->z_max_blksz && 1744 outzp->z_size <= inblksz && outoff + len > inblksz) { 1745 error = SET_ERROR(EINVAL); 1746 goto unlock; 1747 } 1748 1749 error = zn_rlimit_fsize(outoff + len); 1750 if (error != 0) { 1751 goto unlock; 1752 } 1753 1754 if (inoff >= MAXOFFSET_T || outoff >= MAXOFFSET_T) { 1755 error = SET_ERROR(EFBIG); 1756 goto unlock; 1757 } 1758 1759 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(outzfsvfs), NULL, 1760 &mtime, 16); 1761 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(outzfsvfs), NULL, 1762 &ctime, 16); 1763 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(outzfsvfs), NULL, 1764 &outzp->z_size, 8); 1765 1766 zilog = outzfsvfs->z_log; 1767 maxblocks = zil_max_log_data(zilog, sizeof (lr_clone_range_t)) / 1768 sizeof (bps[0]); 1769 1770 uid = KUID_TO_SUID(ZTOUID(outzp)); 1771 gid = KGID_TO_SGID(ZTOGID(outzp)); 1772 projid = outzp->z_projid; 1773 1774 bps = vmem_alloc(sizeof (bps[0]) * maxblocks, KM_SLEEP); 1775 1776 /* 1777 * Clone the file in reasonable size chunks. Each chunk is cloned 1778 * in a separate transaction; this keeps the intent log records small 1779 * and allows us to do more fine-grained space accounting. 1780 */ 1781 while (len > 0) { 1782 size = MIN(inblksz * maxblocks, len); 1783 1784 if (zfs_id_overblockquota(outzfsvfs, DMU_USERUSED_OBJECT, 1785 uid) || 1786 zfs_id_overblockquota(outzfsvfs, DMU_GROUPUSED_OBJECT, 1787 gid) || 1788 (projid != ZFS_DEFAULT_PROJID && 1789 zfs_id_overblockquota(outzfsvfs, DMU_PROJECTUSED_OBJECT, 1790 projid))) { 1791 error = SET_ERROR(EDQUOT); 1792 break; 1793 } 1794 1795 nbps = maxblocks; 1796 last_synced_txg = spa_last_synced_txg(dmu_objset_spa(inos)); 1797 error = dmu_read_l0_bps(inos, inzp->z_id, inoff, size, bps, 1798 &nbps); 1799 if (error != 0) { 1800 /* 1801 * If we are trying to clone a block that was created 1802 * in the current transaction group, the error will be 1803 * EAGAIN here. Based on zfs_bclone_wait_dirty either 1804 * return a shortened range to the caller so it can 1805 * fall back, or wait for the next TXG and check again. 1806 */ 1807 if (error == EAGAIN && zfs_bclone_wait_dirty) { 1808 txg_wait_synced(dmu_objset_pool(inos), 1809 last_synced_txg + 1); 1810 continue; 1811 } 1812 1813 break; 1814 } 1815 1816 /* 1817 * Start a transaction. 1818 */ 1819 tx = dmu_tx_create(outos); 1820 dmu_tx_hold_sa(tx, outzp->z_sa_hdl, B_FALSE); 1821 db = (dmu_buf_impl_t *)sa_get_db(outzp->z_sa_hdl); 1822 DB_DNODE_ENTER(db); 1823 dmu_tx_hold_clone_by_dnode(tx, DB_DNODE(db), outoff, size); 1824 DB_DNODE_EXIT(db); 1825 zfs_sa_upgrade_txholds(tx, outzp); 1826 error = dmu_tx_assign(tx, DMU_TX_WAIT); 1827 if (error != 0) { 1828 dmu_tx_abort(tx); 1829 break; 1830 } 1831 1832 /* 1833 * Copy source znode's block size.
This is done only if the 1834 * whole znode is locked (see zfs_rangelock_cb()) and only 1835 * on the first iteration since zfs_rangelock_reduce() will 1836 * shrink down lr_length to the appropriate size. 1837 */ 1838 if (outlr->lr_length == UINT64_MAX) { 1839 zfs_grow_blocksize(outzp, inblksz, tx); 1840 1841 /* 1842 * Block growth may fail for many reasons we can not 1843 * predict here. If it happen the cloning is doomed. 1844 */ 1845 if (inblksz != outzp->z_blksz) { 1846 error = SET_ERROR(EINVAL); 1847 dmu_tx_abort(tx); 1848 break; 1849 } 1850 1851 /* 1852 * Round range lock up to the block boundary, so we 1853 * prevent appends until we are done. 1854 */ 1855 zfs_rangelock_reduce(outlr, outoff, 1856 ((len - 1) / inblksz + 1) * inblksz); 1857 } 1858 1859 error = dmu_brt_clone(outos, outzp->z_id, outoff, size, tx, 1860 bps, nbps); 1861 if (error != 0) { 1862 dmu_tx_commit(tx); 1863 break; 1864 } 1865 1866 if (zn_has_cached_data(outzp, outoff, outoff + size - 1)) { 1867 update_pages(outzp, outoff, size, outos); 1868 } 1869 1870 zfs_clear_setid_bits_if_necessary(outzfsvfs, outzp, cr, 1871 &clear_setid_bits_txg, tx); 1872 1873 zfs_tstamp_update_setup(outzp, CONTENT_MODIFIED, mtime, ctime); 1874 1875 /* 1876 * Update the file size (zp_size) if it has changed; 1877 * account for possible concurrent updates. 1878 */ 1879 while ((outsize = outzp->z_size) < outoff + size) { 1880 (void) atomic_cas_64(&outzp->z_size, outsize, 1881 outoff + size); 1882 } 1883 1884 error = sa_bulk_update(outzp->z_sa_hdl, bulk, count, tx); 1885 1886 zfs_log_clone_range(zilog, tx, TX_CLONE_RANGE, outzp, outoff, 1887 size, inblksz, bps, nbps); 1888 1889 dmu_tx_commit(tx); 1890 1891 if (error != 0) 1892 break; 1893 1894 inoff += size; 1895 outoff += size; 1896 len -= size; 1897 done += size; 1898 1899 if (issig()) { 1900 error = SET_ERROR(EINTR); 1901 break; 1902 } 1903 } 1904 1905 vmem_free(bps, sizeof (bps[0]) * maxblocks); 1906 zfs_znode_update_vfs(outzp); 1907 1908 unlock: 1909 zfs_rangelock_exit(outlr); 1910 zfs_rangelock_exit(inlr); 1911 1912 if (done > 0) { 1913 /* 1914 * If we have made at least partial progress, reset the error. 1915 */ 1916 error = 0; 1917 1918 ZFS_ACCESSTIME_STAMP(inzfsvfs, inzp); 1919 1920 if (outos->os_sync == ZFS_SYNC_ALWAYS) { 1921 zil_commit(zilog, outzp->z_id); 1922 } 1923 1924 *inoffp += done; 1925 *outoffp += done; 1926 *lenp = done; 1927 } else { 1928 /* 1929 * If we made no progress, there must be a good reason. 1930 * EOF is handled explicitly above, before the loop. 1931 */ 1932 ASSERT3S(error, !=, 0); 1933 } 1934 1935 zfs_exit_two(inzfsvfs, outzfsvfs, FTAG); 1936 1937 return (error); 1938 } 1939 1940 /* 1941 * Usual pattern would be to call zfs_clone_range() from zfs_replay_clone(), 1942 * but we cannot do that, because when replaying we don't have source znode 1943 * available. This is why we need a dedicated replay function. 
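 * The TX_CLONE_RANGE record already carries the block pointers being
 * cloned, so the replay function below simply re-applies them with
 * dmu_brt_clone() instead of reading them from a source file.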
1944 */ 1945 int 1946 zfs_clone_range_replay(znode_t *zp, uint64_t off, uint64_t len, uint64_t blksz, 1947 const blkptr_t *bps, size_t nbps) 1948 { 1949 zfsvfs_t *zfsvfs; 1950 dmu_buf_impl_t *db; 1951 dmu_tx_t *tx; 1952 int error; 1953 int count = 0; 1954 sa_bulk_attr_t bulk[3]; 1955 uint64_t mtime[2], ctime[2]; 1956 1957 ASSERT3U(off, <, MAXOFFSET_T); 1958 ASSERT3U(len, >, 0); 1959 ASSERT3U(nbps, >, 0); 1960 1961 zfsvfs = ZTOZSB(zp); 1962 1963 ASSERT(spa_feature_is_enabled(dmu_objset_spa(zfsvfs->z_os), 1964 SPA_FEATURE_BLOCK_CLONING)); 1965 1966 if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) 1967 return (error); 1968 1969 ASSERT(zfsvfs->z_replay); 1970 ASSERT(!zfs_is_readonly(zfsvfs)); 1971 1972 if ((off % blksz) != 0) { 1973 zfs_exit(zfsvfs, FTAG); 1974 return (SET_ERROR(EINVAL)); 1975 } 1976 1977 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); 1978 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); 1979 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL, 1980 &zp->z_size, 8); 1981 1982 /* 1983 * Start a transaction. 1984 */ 1985 tx = dmu_tx_create(zfsvfs->z_os); 1986 1987 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); 1988 db = (dmu_buf_impl_t *)sa_get_db(zp->z_sa_hdl); 1989 DB_DNODE_ENTER(db); 1990 dmu_tx_hold_clone_by_dnode(tx, DB_DNODE(db), off, len); 1991 DB_DNODE_EXIT(db); 1992 zfs_sa_upgrade_txholds(tx, zp); 1993 error = dmu_tx_assign(tx, DMU_TX_WAIT); 1994 if (error != 0) { 1995 dmu_tx_abort(tx); 1996 zfs_exit(zfsvfs, FTAG); 1997 return (error); 1998 } 1999 2000 if (zp->z_blksz < blksz) 2001 zfs_grow_blocksize(zp, blksz, tx); 2002 2003 dmu_brt_clone(zfsvfs->z_os, zp->z_id, off, len, tx, bps, nbps); 2004 2005 zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime); 2006 2007 if (zp->z_size < off + len) 2008 zp->z_size = off + len; 2009 2010 error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); 2011 2012 /* 2013 * zil_replaying() not only check if we are replaying ZIL, but also 2014 * updates the ZIL header to record replay progress. 2015 */ 2016 VERIFY(zil_replaying(zfsvfs->z_log, tx)); 2017 2018 dmu_tx_commit(tx); 2019 2020 zfs_znode_update_vfs(zp); 2021 2022 zfs_exit(zfsvfs, FTAG); 2023 2024 return (error); 2025 } 2026 2027 EXPORT_SYMBOL(zfs_access); 2028 EXPORT_SYMBOL(zfs_fsync); 2029 EXPORT_SYMBOL(zfs_holey); 2030 EXPORT_SYMBOL(zfs_read); 2031 EXPORT_SYMBOL(zfs_write); 2032 EXPORT_SYMBOL(zfs_getsecattr); 2033 EXPORT_SYMBOL(zfs_setsecattr); 2034 EXPORT_SYMBOL(zfs_clone_range); 2035 EXPORT_SYMBOL(zfs_clone_range_replay); 2036 2037 ZFS_MODULE_PARAM(zfs_vnops, zfs_vnops_, read_chunk_size, U64, ZMOD_RW, 2038 "Bytes to read per chunk"); 2039 2040 ZFS_MODULE_PARAM(zfs, zfs_, bclone_enabled, INT, ZMOD_RW, 2041 "Enable block cloning"); 2042 2043 ZFS_MODULE_PARAM(zfs, zfs_, bclone_wait_dirty, INT, ZMOD_RW, 2044 "Wait for dirty blocks when cloning"); 2045 2046 ZFS_MODULE_PARAM(zfs, zfs_, dio_enabled, INT, ZMOD_RW, 2047 "Enable Direct I/O"); 2048 2049 ZFS_MODULE_PARAM(zfs, zfs_, dio_strict, INT, ZMOD_RW, 2050 "Return errors on misaligned Direct I/O"); 2051
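
/*
 * Illustration only (not part of the build): a minimal userspace sketch of
 * the Direct I/O alignment contract enforced by zfs_setup_direct() above.
 * When O_DIRECT is requested explicitly and zfs_dio_strict is set, a read
 * or write whose offset, length, or buffer address is not page-aligned
 * fails with EINVAL; otherwise misaligned requests silently fall back to
 * uncached I/O through the ARC.  The function name and file path below are
 * hypothetical.
 *
 *	#define _GNU_SOURCE
 *	#include <fcntl.h>
 *	#include <stdlib.h>
 *	#include <string.h>
 *	#include <unistd.h>
 *
 *	int
 *	aligned_write_example(void)
 *	{
 *		size_t pgsz = (size_t)sysconf(_SC_PAGESIZE);
 *		void *buf;
 *		int fd, ret = -1;
 *
 *		if (posix_memalign(&buf, pgsz, pgsz) != 0)
 *			return (-1);
 *		memset(buf, 0, pgsz);
 *		fd = open("/tank/fs/file", O_WRONLY | O_CREAT | O_DIRECT, 0644);
 *		if (fd != -1) {
 *			if (pwrite(fd, buf, pgsz, 0) == (ssize_t)pgsz)
 *				ret = 0;
 *			(void) close(fd);
 *		}
 *		free(buf);
 *		return (ret);
 *	}
 */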