1 // SPDX-License-Identifier: CDDL-1.0 2 /* 3 * CDDL HEADER START 4 * 5 * The contents of this file are subject to the terms of the 6 * Common Development and Distribution License (the "License"). 7 * You may not use this file except in compliance with the License. 8 * 9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 10 * or https://opensource.org/licenses/CDDL-1.0. 11 * See the License for the specific language governing permissions 12 * and limitations under the License. 13 * 14 * When distributing Covered Code, include this CDDL HEADER in each 15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 16 * If applicable, add the following below this CDDL HEADER, with the 17 * fields enclosed by brackets "[]" replaced with your own identifying 18 * information: Portions Copyright [yyyy] [name of copyright owner] 19 * 20 * CDDL HEADER END 21 */ 22 23 /* 24 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 25 * Copyright (c) 2012, 2018 by Delphix. All rights reserved. 26 * Copyright (c) 2015 by Chunwei Chen. All rights reserved. 27 * Copyright 2017 Nexenta Systems, Inc. 28 * Copyright (c) 2021, 2022 by Pawel Jakub Dawidek 29 * Copyright (c) 2025, Rob Norris <robn@despairlabs.com> 30 */ 31 32 /* Portions Copyright 2007 Jeremy Teo */ 33 /* Portions Copyright 2010 Robert Milkowski */ 34 35 #include <sys/types.h> 36 #include <sys/param.h> 37 #include <sys/time.h> 38 #include <sys/sysmacros.h> 39 #include <sys/vfs.h> 40 #include <sys/file.h> 41 #include <sys/stat.h> 42 #include <sys/kmem.h> 43 #include <sys/cmn_err.h> 44 #include <sys/errno.h> 45 #include <sys/zfs_dir.h> 46 #include <sys/zfs_acl.h> 47 #include <sys/zfs_ioctl.h> 48 #include <sys/fs/zfs.h> 49 #include <sys/dmu.h> 50 #include <sys/dmu_objset.h> 51 #include <sys/dsl_crypt.h> 52 #include <sys/dsl_dataset.h> 53 #include <sys/spa.h> 54 #include <sys/txg.h> 55 #include <sys/dbuf.h> 56 #include <sys/policy.h> 57 #include <sys/zfeature.h> 58 #include <sys/zfs_vnops.h> 59 #include <sys/zfs_quota.h> 60 #include <sys/zfs_vfsops.h> 61 #include <sys/zfs_znode.h> 62 63 /* 64 * Enables access to the block cloning feature. If this setting is 0, then even 65 * if feature@block_cloning is enabled, using functions and system calls that 66 * attempt to clone blocks will act as though the feature is disabled. 67 */ 68 int zfs_bclone_enabled = 1; 69 70 /* 71 * When set to 1 the FICLONE and FICLONERANGE ioctls will wait for any dirty 72 * data to be written to disk before proceeding. This ensures that the clone 73 * operation reliably succeeds, even if a file is modified and then immediately 74 * cloned. Note that for small files this may be slower than simply copying 75 * the file. When set to 0 the clone operation will immediately fail if it 76 * encounters any dirty blocks. By default waiting is enabled. 77 */ 78 int zfs_bclone_wait_dirty = 1; 79 80 /* 81 * Enable Direct I/O. If this setting is 0, then all I/O requests will be 82 * directed through the ARC acting as though the dataset property direct was 83 * set to disabled. 84 * 85 * Disabled by default on FreeBSD until a potential range locking issue in 86 * zfs_getpages() can be resolved. 87 */ 88 #ifdef __FreeBSD__ 89 static int zfs_dio_enabled = 0; 90 #else 91 static int zfs_dio_enabled = 1; 92 #endif 93 94 /* 95 * Strictly enforce alignment for Direct I/O requests, returning EINVAL 96 * if not page-aligned instead of silently falling back to uncached I/O. 
97 */ 98 static int zfs_dio_strict = 0; 99 100 101 /* 102 * Maximum bytes to read per chunk in zfs_read(). 103 */ 104 #ifdef _ILP32 105 static uint64_t zfs_vnops_read_chunk_size = 1024 * 1024; 106 #else 107 static uint64_t zfs_vnops_read_chunk_size = DMU_MAX_ACCESS / 2; 108 #endif 109 110 int 111 zfs_fsync(znode_t *zp, int syncflag, cred_t *cr) 112 { 113 int error = 0; 114 zfsvfs_t *zfsvfs = ZTOZSB(zp); 115 116 if (zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED) { 117 if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) 118 return (error); 119 zil_commit(zfsvfs->z_log, zp->z_id); 120 zfs_exit(zfsvfs, FTAG); 121 } 122 return (error); 123 } 124 125 126 #if defined(SEEK_HOLE) && defined(SEEK_DATA) 127 /* 128 * Lseek support for finding holes (cmd == SEEK_HOLE) and 129 * data (cmd == SEEK_DATA). "off" is an in/out parameter. 130 */ 131 static int 132 zfs_holey_common(znode_t *zp, ulong_t cmd, loff_t *off) 133 { 134 zfs_locked_range_t *lr; 135 uint64_t noff = (uint64_t)*off; /* new offset */ 136 uint64_t file_sz; 137 int error; 138 boolean_t hole; 139 140 file_sz = zp->z_size; 141 if (noff >= file_sz) { 142 return (SET_ERROR(ENXIO)); 143 } 144 145 if (cmd == F_SEEK_HOLE) 146 hole = B_TRUE; 147 else 148 hole = B_FALSE; 149 150 /* Flush any mmap()'d data to disk */ 151 if (zn_has_cached_data(zp, 0, file_sz - 1)) 152 zn_flush_cached_data(zp, B_TRUE); 153 154 lr = zfs_rangelock_enter(&zp->z_rangelock, 0, UINT64_MAX, RL_READER); 155 error = dmu_offset_next(ZTOZSB(zp)->z_os, zp->z_id, hole, &noff); 156 zfs_rangelock_exit(lr); 157 158 if (error == ESRCH) 159 return (SET_ERROR(ENXIO)); 160 161 /* File was dirty, so fall back to using generic logic */ 162 if (error == EBUSY) { 163 if (hole) 164 *off = file_sz; 165 166 return (0); 167 } 168 169 /* 170 * We could find a hole that begins after the logical end-of-file, 171 * because dmu_offset_next() only works on whole blocks. If the 172 * EOF falls mid-block, then indicate that the "virtual hole" 173 * at the end of the file begins at the logical EOF, rather than 174 * at the end of the last block. 175 */ 176 if (noff > file_sz) { 177 ASSERT(hole); 178 noff = file_sz; 179 } 180 181 if (noff < *off) 182 return (error); 183 *off = noff; 184 return (error); 185 } 186 187 int 188 zfs_holey(znode_t *zp, ulong_t cmd, loff_t *off) 189 { 190 zfsvfs_t *zfsvfs = ZTOZSB(zp); 191 int error; 192 193 if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) 194 return (error); 195 196 error = zfs_holey_common(zp, cmd, off); 197 198 zfs_exit(zfsvfs, FTAG); 199 return (error); 200 } 201 #endif /* SEEK_HOLE && SEEK_DATA */ 202 203 int 204 zfs_access(znode_t *zp, int mode, int flag, cred_t *cr) 205 { 206 zfsvfs_t *zfsvfs = ZTOZSB(zp); 207 int error; 208 209 if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) 210 return (error); 211 212 if (flag & V_ACE_MASK) 213 #if defined(__linux__) 214 error = zfs_zaccess(zp, mode, flag, B_FALSE, cr, 215 zfs_init_idmap); 216 #else 217 error = zfs_zaccess(zp, mode, flag, B_FALSE, cr, 218 NULL); 219 #endif 220 else 221 #if defined(__linux__) 222 error = zfs_zaccess_rwx(zp, mode, flag, cr, zfs_init_idmap); 223 #else 224 error = zfs_zaccess_rwx(zp, mode, flag, cr, NULL); 225 #endif 226 227 zfs_exit(zfsvfs, FTAG); 228 return (error); 229 } 230 231 /* 232 * Determine if Direct I/O has been requested (either via the O_DIRECT flag or 233 * the "direct" dataset property). When inherited by the property only apply 234 * the O_DIRECT flag to correctly aligned IO requests. 
The rationale for this
 * is that it allows the property to be safely set on a dataset without forcing
 * all of the applications to be aware of the alignment restrictions. When
 * O_DIRECT is explicitly requested by an application, return EINVAL if the
 * request is unaligned. In all cases, if the range for this request has
 * been mmap'ed then we will perform buffered I/O to keep the mapped region
 * synchronized with the ARC.
 *
 * It is possible that a file's pages could be mmap'ed after it is checked
 * here. If so, that is handled accordingly in zfs_write(). See comments in the
 * following area for how this is handled:
 *	zfs_write() -> update_pages()
 */
static int
zfs_setup_direct(struct znode *zp, zfs_uio_t *uio, zfs_uio_rw_t rw,
    int *ioflagp)
{
	zfsvfs_t *zfsvfs = ZTOZSB(zp);
	objset_t *os = zfsvfs->z_os;
	int ioflag = *ioflagp;
	int error = 0;

	if (os->os_direct == ZFS_DIRECT_ALWAYS) {
		/* Force either direct or uncached I/O. */
		ioflag |= O_DIRECT;
	}

	if ((ioflag & O_DIRECT) == 0)
		goto out;

	if (!zfs_dio_enabled || os->os_direct == ZFS_DIRECT_DISABLED) {
		/*
		 * Direct I/O is disabled. The I/O request will be directed
		 * through the ARC as uncached I/O.
		 */
		goto out;
	}

	if (!zfs_uio_page_aligned(uio) ||
	    !zfs_uio_aligned(uio, PAGE_SIZE)) {
		/*
		 * Misaligned requests can be executed through the ARC as
		 * uncached I/O. But if O_DIRECT was set by user and we
		 * were set to be strict, then it is a failure.
		 */
		if ((*ioflagp & O_DIRECT) && zfs_dio_strict)
			error = SET_ERROR(EINVAL);
		goto out;
	}

	if (zn_has_cached_data(zp, zfs_uio_offset(uio),
	    zfs_uio_offset(uio) + zfs_uio_resid(uio) - 1)) {
		/*
		 * The region is mmap'ed. The I/O request will be directed
		 * through the ARC as uncached I/O.
		 */
		goto out;
	}

	/*
	 * For short writes the page mapping of Direct I/O makes no sense.
	 * Direct them through the ARC as uncached I/O.
	 */
	if (rw == UIO_WRITE && zfs_uio_resid(uio) < zp->z_blksz)
		goto out;

	error = zfs_uio_get_dio_pages_alloc(uio, rw);
	if (error)
		goto out;
	ASSERT(uio->uio_extflg & UIO_DIRECT);

out:
	*ioflagp = ioflag;
	return (error);
}

/*
 * Read bytes from specified file into supplied buffer.
 *
 * IN:	zp	- inode of file to be read from.
 *	uio	- structure supplying read location, range info,
 *		  and return buffer.
 *	ioflag	- O_SYNC flags; used to provide FRSYNC semantics.
 *		  O_DIRECT flag; used to bypass page cache.
 *	cr	- credentials of caller.
 *
 * OUT:	uio	- updated offset and range, buffer filled.
 *
 * RETURN:	0 on success, error code on failure.
 *
 * Side Effects:
 *	inode - atime updated if byte count > 0
 */
int
zfs_read(struct znode *zp, zfs_uio_t *uio, int ioflag, cred_t *cr)
{
	(void) cr;
	int error = 0;
	boolean_t frsync = B_FALSE;
	boolean_t dio_checksum_failure = B_FALSE;

	zfsvfs_t *zfsvfs = ZTOZSB(zp);
	if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
		return (error);

	if (zp->z_pflags & ZFS_AV_QUARANTINED) {
		zfs_exit(zfsvfs, FTAG);
		return (SET_ERROR(EACCES));
	}

	/* We don't copy out anything useful for directories. */
	if (Z_ISDIR(ZTOTYPE(zp))) {
		zfs_exit(zfsvfs, FTAG);
		return (SET_ERROR(EISDIR));
	}

	/*
	 * Validate file offset
	 */
	if (zfs_uio_offset(uio) < (offset_t)0) {
		zfs_exit(zfsvfs, FTAG);
		return (SET_ERROR(EINVAL));
	}

	/*
	 * Fasttrack empty reads
	 */
	if (zfs_uio_resid(uio) == 0) {
		zfs_exit(zfsvfs, FTAG);
		return (0);
	}

#ifdef FRSYNC
	/*
	 * If we're in FRSYNC mode, sync out this znode before reading it.
	 * Only do this for non-snapshots.
	 *
	 * Some platforms do not support FRSYNC and instead map it
	 * to O_SYNC, which results in unnecessary calls to zil_commit. We
	 * only honor FRSYNC requests on platforms which support it.
	 */
	frsync = !!(ioflag & FRSYNC);
#endif
	if (zfsvfs->z_log &&
	    (frsync || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS))
		zil_commit(zfsvfs->z_log, zp->z_id);

	/*
	 * Lock the range against changes.
	 */
	zfs_locked_range_t *lr = zfs_rangelock_enter(&zp->z_rangelock,
	    zfs_uio_offset(uio), zfs_uio_resid(uio), RL_READER);

	/*
	 * If we are reading past end-of-file we can skip
	 * to the end; but we might still need to set atime.
	 */
	if (zfs_uio_offset(uio) >= zp->z_size) {
		error = 0;
		goto out;
	}
	ASSERT(zfs_uio_offset(uio) < zp->z_size);

	/*
	 * Setting up Direct I/O if requested.
	 */
	error = zfs_setup_direct(zp, uio, UIO_READ, &ioflag);
	if (error) {
		goto out;
	}

#if defined(__linux__)
	ssize_t start_offset = zfs_uio_offset(uio);
#endif
	uint_t blksz = zp->z_blksz;
	ssize_t chunk_size;
	ssize_t n = MIN(zfs_uio_resid(uio), zp->z_size - zfs_uio_offset(uio));
	ssize_t start_resid = n;
	ssize_t dio_remaining_resid = 0;

	dmu_flags_t dflags = DMU_READ_PREFETCH;
	if (ioflag & O_DIRECT)
		dflags |= DMU_UNCACHEDIO;
	if (uio->uio_extflg & UIO_DIRECT) {
		/*
		 * All pages for an O_DIRECT request have already been mapped
		 * so there's no compelling reason to handle this uio in
		 * smaller chunks.
		 */
		chunk_size = DMU_MAX_ACCESS;

		/*
		 * In the event that the O_DIRECT request is reading the entire
		 * file, it is possible the file's length is not page-size
		 * aligned. However, lower layers expect that the Direct I/O
		 * request is page-aligned. In this case, as much of the file
		 * as can be read using Direct I/O is read that way, and the
		 * remaining amount will be read through the ARC.
		 *
		 * This is still consistent with the semantics of Direct I/O in
		 * ZFS as at a minimum the I/O request must be page-aligned.
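		 *
		 * For example (illustrative, assuming 4 KiB pages): a
		 * page-aligned Direct I/O read covering all of a 10000-byte
		 * file reads the first 8192 bytes as Direct I/O and the
		 * trailing 1808 bytes (dio_remaining_resid) through the ARC.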
435 */ 436 dio_remaining_resid = n - P2ALIGN_TYPED(n, PAGE_SIZE, ssize_t); 437 if (dio_remaining_resid != 0) 438 n -= dio_remaining_resid; 439 dflags |= DMU_DIRECTIO; 440 } else { 441 chunk_size = MIN(MAX(zfs_vnops_read_chunk_size, blksz), 442 DMU_MAX_ACCESS / 2); 443 } 444 445 while (n > 0) { 446 ssize_t nbytes = MIN(n, chunk_size - 447 P2PHASE(zfs_uio_offset(uio), blksz)); 448 #ifdef UIO_NOCOPY 449 if (zfs_uio_segflg(uio) == UIO_NOCOPY) 450 error = mappedread_sf(zp, nbytes, uio); 451 else 452 #endif 453 if (zn_has_cached_data(zp, zfs_uio_offset(uio), 454 zfs_uio_offset(uio) + nbytes - 1)) { 455 error = mappedread(zp, nbytes, uio); 456 } else { 457 error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl), 458 uio, nbytes, dflags); 459 } 460 461 if (error) { 462 /* convert checksum errors into IO errors */ 463 if (error == ECKSUM) { 464 /* 465 * If a Direct I/O read returned a checksum 466 * verify error, then it must be treated as 467 * suspicious. The contents of the buffer could 468 * have beeen manipulated while the I/O was in 469 * flight. In this case, the remainder of I/O 470 * request will just be reissued through the 471 * ARC. 472 */ 473 if (uio->uio_extflg & UIO_DIRECT) { 474 dio_checksum_failure = B_TRUE; 475 uio->uio_extflg &= ~UIO_DIRECT; 476 n += dio_remaining_resid; 477 dio_remaining_resid = 0; 478 continue; 479 } else { 480 error = SET_ERROR(EIO); 481 } 482 } 483 484 #if defined(__linux__) 485 /* 486 * if we actually read some bytes, bubbling EFAULT 487 * up to become EAGAIN isn't what we want here... 488 * 489 * ...on Linux, at least. On FBSD, doing this breaks. 490 */ 491 if (error == EFAULT && 492 (zfs_uio_offset(uio) - start_offset) != 0) 493 error = 0; 494 #endif 495 break; 496 } 497 498 n -= nbytes; 499 } 500 501 if (error == 0 && (uio->uio_extflg & UIO_DIRECT) && 502 dio_remaining_resid != 0) { 503 /* 504 * Temporarily remove the UIO_DIRECT flag from the UIO so the 505 * remainder of the file can be read using the ARC. 506 */ 507 uio->uio_extflg &= ~UIO_DIRECT; 508 dflags &= ~DMU_DIRECTIO; 509 510 if (zn_has_cached_data(zp, zfs_uio_offset(uio), 511 zfs_uio_offset(uio) + dio_remaining_resid - 1)) { 512 error = mappedread(zp, dio_remaining_resid, uio); 513 } else { 514 error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl), uio, 515 dio_remaining_resid, dflags); 516 } 517 uio->uio_extflg |= UIO_DIRECT; 518 dflags |= DMU_DIRECTIO; 519 520 if (error != 0) 521 n += dio_remaining_resid; 522 } else if (error && (uio->uio_extflg & UIO_DIRECT)) { 523 n += dio_remaining_resid; 524 } 525 int64_t nread = start_resid - n; 526 527 dataset_kstats_update_read_kstats(&zfsvfs->z_kstat, nread); 528 out: 529 zfs_rangelock_exit(lr); 530 531 if (dio_checksum_failure == B_TRUE) 532 uio->uio_extflg |= UIO_DIRECT; 533 534 /* 535 * Cleanup for Direct I/O if requested. 536 */ 537 if (uio->uio_extflg & UIO_DIRECT) 538 zfs_uio_free_dio_pages(uio, UIO_READ); 539 540 ZFS_ACCESSTIME_STAMP(zfsvfs, zp); 541 zfs_exit(zfsvfs, FTAG); 542 return (error); 543 } 544 545 static void 546 zfs_clear_setid_bits_if_necessary(zfsvfs_t *zfsvfs, znode_t *zp, cred_t *cr, 547 uint64_t *clear_setid_bits_txgp, dmu_tx_t *tx) 548 { 549 zilog_t *zilog = zfsvfs->z_log; 550 const uint64_t uid = KUID_TO_SUID(ZTOUID(zp)); 551 552 ASSERT(clear_setid_bits_txgp != NULL); 553 ASSERT(tx != NULL); 554 555 /* 556 * Clear Set-UID/Set-GID bits on successful write if not 557 * privileged and at least one of the execute bits is set. 
558 * 559 * It would be nice to do this after all writes have 560 * been done, but that would still expose the ISUID/ISGID 561 * to another app after the partial write is committed. 562 * 563 * Note: we don't call zfs_fuid_map_id() here because 564 * user 0 is not an ephemeral uid. 565 */ 566 mutex_enter(&zp->z_acl_lock); 567 if ((zp->z_mode & (S_IXUSR | (S_IXUSR >> 3) | (S_IXUSR >> 6))) != 0 && 568 (zp->z_mode & (S_ISUID | S_ISGID)) != 0 && 569 secpolicy_vnode_setid_retain(zp, cr, 570 ((zp->z_mode & S_ISUID) != 0 && uid == 0)) != 0) { 571 uint64_t newmode; 572 573 zp->z_mode &= ~(S_ISUID | S_ISGID); 574 newmode = zp->z_mode; 575 (void) sa_update(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs), 576 (void *)&newmode, sizeof (uint64_t), tx); 577 578 mutex_exit(&zp->z_acl_lock); 579 580 /* 581 * Make sure SUID/SGID bits will be removed when we replay the 582 * log. If the setid bits are keep coming back, don't log more 583 * than one TX_SETATTR per transaction group. 584 */ 585 if (*clear_setid_bits_txgp != dmu_tx_get_txg(tx)) { 586 vattr_t va = {0}; 587 588 va.va_mask = ATTR_MODE; 589 va.va_nodeid = zp->z_id; 590 va.va_mode = newmode; 591 zfs_log_setattr(zilog, tx, TX_SETATTR, zp, &va, 592 ATTR_MODE, NULL); 593 *clear_setid_bits_txgp = dmu_tx_get_txg(tx); 594 } 595 } else { 596 mutex_exit(&zp->z_acl_lock); 597 } 598 } 599 600 /* 601 * Write the bytes to a file. 602 * 603 * IN: zp - znode of file to be written to. 604 * uio - structure supplying write location, range info, 605 * and data buffer. 606 * ioflag - O_APPEND flag set if in append mode. 607 * O_DIRECT flag; used to bypass page cache. 608 * cr - credentials of caller. 609 * 610 * OUT: uio - updated offset and range. 611 * 612 * RETURN: 0 if success 613 * error code if failure 614 * 615 * Timestamps: 616 * ip - ctime|mtime updated if byte count > 0 617 */ 618 int 619 zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr) 620 { 621 int error = 0, error1; 622 ssize_t start_resid = zfs_uio_resid(uio); 623 uint64_t clear_setid_bits_txg = 0; 624 boolean_t o_direct_defer = B_FALSE; 625 626 /* 627 * Fasttrack empty write 628 */ 629 ssize_t n = start_resid; 630 if (n == 0) 631 return (0); 632 633 zfsvfs_t *zfsvfs = ZTOZSB(zp); 634 if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) 635 return (error); 636 637 sa_bulk_attr_t bulk[4]; 638 int count = 0; 639 uint64_t mtime[2], ctime[2]; 640 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); 641 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); 642 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL, 643 &zp->z_size, 8); 644 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, 645 &zp->z_pflags, 8); 646 647 /* 648 * Callers might not be able to detect properly that we are read-only, 649 * so check it explicitly here. 650 */ 651 if (zfs_is_readonly(zfsvfs)) { 652 zfs_exit(zfsvfs, FTAG); 653 return (SET_ERROR(EROFS)); 654 } 655 656 /* 657 * If immutable or not appending then return EPERM. 658 * Intentionally allow ZFS_READONLY through here. 659 * See zfs_zaccess_common() 660 */ 661 if ((zp->z_pflags & ZFS_IMMUTABLE) || 662 ((zp->z_pflags & ZFS_APPENDONLY) && !(ioflag & O_APPEND) && 663 (zfs_uio_offset(uio) < zp->z_size))) { 664 zfs_exit(zfsvfs, FTAG); 665 return (SET_ERROR(EPERM)); 666 } 667 668 /* 669 * Validate file offset 670 */ 671 offset_t woff = ioflag & O_APPEND ? zp->z_size : zfs_uio_offset(uio); 672 if (woff < 0) { 673 zfs_exit(zfsvfs, FTAG); 674 return (SET_ERROR(EINVAL)); 675 } 676 677 /* 678 * Setting up Direct I/O if requested. 
	 */
	error = zfs_setup_direct(zp, uio, UIO_WRITE, &ioflag);
	if (error) {
		zfs_exit(zfsvfs, FTAG);
		return (SET_ERROR(error));
	}

	/*
	 * Pre-fault the pages to ensure slow (eg NFS) pages
	 * don't hold up txg.
	 */
	ssize_t pfbytes = MIN(n, DMU_MAX_ACCESS >> 1);
	if (zfs_uio_prefaultpages(pfbytes, uio)) {
		zfs_exit(zfsvfs, FTAG);
		return (SET_ERROR(EFAULT));
	}

	/*
	 * If in append mode, set the io offset pointer to eof.
	 */
	zfs_locked_range_t *lr;
	if (ioflag & O_APPEND) {
		/*
		 * Obtain an appending range lock to guarantee file append
		 * semantics. We reset the write offset once we have the lock.
		 */
		lr = zfs_rangelock_enter(&zp->z_rangelock, 0, n, RL_APPEND);
		woff = lr->lr_offset;
		if (lr->lr_length == UINT64_MAX) {
			/*
			 * We overlocked the file because this write will cause
			 * the file block size to increase.
			 * Note that zp_size cannot change with this lock held.
			 */
			woff = zp->z_size;
		}
		zfs_uio_setoffset(uio, woff);
		/*
		 * We need to update the starting offset as well because it is
		 * set previously in the ZPL (Linux) and VNOPS (FreeBSD)
		 * layers.
		 */
		zfs_uio_setsoffset(uio, woff);
	} else {
		/*
		 * Note that if the file block size will change as a result of
		 * this write, then this range lock will lock the entire file
		 * so that we can re-write the block safely.
		 */
		lr = zfs_rangelock_enter(&zp->z_rangelock, woff, n, RL_WRITER);
	}

	if (zn_rlimit_fsize_uio(zp, uio)) {
		zfs_rangelock_exit(lr);
		zfs_exit(zfsvfs, FTAG);
		return (SET_ERROR(EFBIG));
	}

	const rlim64_t limit = MAXOFFSET_T;

	if (woff >= limit) {
		zfs_rangelock_exit(lr);
		zfs_exit(zfsvfs, FTAG);
		return (SET_ERROR(EFBIG));
	}

	if (n > limit - woff)
		n = limit - woff;

	uint64_t end_size = MAX(zp->z_size, woff + n);
	zilog_t *zilog = zfsvfs->z_log;
	boolean_t commit = (ioflag & (O_SYNC | O_DSYNC)) ||
	    (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS);

	const uint64_t uid = KUID_TO_SUID(ZTOUID(zp));
	const uint64_t gid = KGID_TO_SGID(ZTOGID(zp));
	const uint64_t projid = zp->z_projid;

	/*
	 * In the event we are increasing the file block size
	 * (lr_length == UINT64_MAX), we will direct the write to the ARC.
	 * Because zfs_grow_blocksize() will read from the ARC in order to
	 * grow the dbuf, we avoid doing Direct I/O here as that would cause
	 * data written to disk to be overwritten by data in the ARC during
	 * the sync phase. Besides writing data twice to disk, we also
	 * want to avoid consistency concerns between data in the ARC and
	 * on disk while growing the file's blocksize.
	 *
	 * We will only temporarily remove Direct I/O and put it back after
	 * we have grown the blocksize. We do this in the event a request
	 * is larger than max_blksz, so further requests to
	 * dmu_write_uio_dbuf() will still issue the requests using Direct
	 * IO.
	 *
	 * As an example:
	 * The first block of the file is being written as a 4k request with
	 * a recordsize of 1K. The first 1K issued in the loop below will go
	 * through the ARC; however, the following 3 1K requests will
	 * use Direct I/O.
	 */
	if (uio->uio_extflg & UIO_DIRECT && lr->lr_length == UINT64_MAX) {
		uio->uio_extflg &= ~UIO_DIRECT;
		o_direct_defer = B_TRUE;
	}

	/*
	 * Write the file in reasonable size chunks.
Each chunk is written 786 * in a separate transaction; this keeps the intent log records small 787 * and allows us to do more fine-grained space accounting. 788 */ 789 while (n > 0) { 790 woff = zfs_uio_offset(uio); 791 792 if (zfs_id_overblockquota(zfsvfs, DMU_USERUSED_OBJECT, uid) || 793 zfs_id_overblockquota(zfsvfs, DMU_GROUPUSED_OBJECT, gid) || 794 (projid != ZFS_DEFAULT_PROJID && 795 zfs_id_overblockquota(zfsvfs, DMU_PROJECTUSED_OBJECT, 796 projid))) { 797 error = SET_ERROR(EDQUOT); 798 break; 799 } 800 801 uint64_t blksz; 802 if (lr->lr_length == UINT64_MAX && zp->z_size <= zp->z_blksz) { 803 if (zp->z_blksz > zfsvfs->z_max_blksz && 804 !ISP2(zp->z_blksz)) { 805 /* 806 * File's blocksize is already larger than the 807 * "recordsize" property. Only let it grow to 808 * the next power of 2. 809 */ 810 blksz = 1 << highbit64(zp->z_blksz); 811 } else { 812 blksz = zfsvfs->z_max_blksz; 813 } 814 blksz = MIN(blksz, P2ROUNDUP(end_size, 815 SPA_MINBLOCKSIZE)); 816 blksz = MAX(blksz, zp->z_blksz); 817 } else { 818 blksz = zp->z_blksz; 819 } 820 821 arc_buf_t *abuf = NULL; 822 ssize_t nbytes = n; 823 if (n >= blksz && woff >= zp->z_size && 824 P2PHASE(woff, blksz) == 0 && 825 !(uio->uio_extflg & UIO_DIRECT) && 826 (blksz >= SPA_OLD_MAXBLOCKSIZE || n < 4 * blksz)) { 827 /* 828 * This write covers a full block. "Borrow" a buffer 829 * from the dmu so that we can fill it before we enter 830 * a transaction. This avoids the possibility of 831 * holding up the transaction if the data copy hangs 832 * up on a pagefault (e.g., from an NFS server mapping). 833 */ 834 abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl), 835 blksz); 836 ASSERT(abuf != NULL); 837 ASSERT(arc_buf_size(abuf) == blksz); 838 if ((error = zfs_uiocopy(abuf->b_data, blksz, 839 UIO_WRITE, uio, &nbytes))) { 840 dmu_return_arcbuf(abuf); 841 break; 842 } 843 ASSERT3S(nbytes, ==, blksz); 844 } else { 845 nbytes = MIN(n, (DMU_MAX_ACCESS >> 1) - 846 P2PHASE(woff, blksz)); 847 if (pfbytes < nbytes) { 848 if (zfs_uio_prefaultpages(nbytes, uio)) { 849 error = SET_ERROR(EFAULT); 850 break; 851 } 852 pfbytes = nbytes; 853 } 854 } 855 856 /* 857 * Start a transaction. 858 */ 859 dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os); 860 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); 861 dmu_buf_impl_t *db = (dmu_buf_impl_t *)sa_get_db(zp->z_sa_hdl); 862 DB_DNODE_ENTER(db); 863 dmu_tx_hold_write_by_dnode(tx, DB_DNODE(db), woff, nbytes); 864 DB_DNODE_EXIT(db); 865 zfs_sa_upgrade_txholds(tx, zp); 866 error = dmu_tx_assign(tx, DMU_TX_WAIT); 867 if (error) { 868 dmu_tx_abort(tx); 869 if (abuf != NULL) 870 dmu_return_arcbuf(abuf); 871 break; 872 } 873 874 /* 875 * NB: We must call zfs_clear_setid_bits_if_necessary before 876 * committing the transaction! 877 */ 878 879 /* 880 * If rangelock_enter() over-locked we grow the blocksize 881 * and then reduce the lock range. This will only happen 882 * on the first iteration since rangelock_reduce() will 883 * shrink down lr_length to the appropriate size. 
		 */
		if (lr->lr_length == UINT64_MAX) {
			zfs_grow_blocksize(zp, blksz, tx);
			zfs_rangelock_reduce(lr, woff, n);
		}

		dmu_flags_t dflags = DMU_READ_PREFETCH;
		if (ioflag & O_DIRECT)
			dflags |= DMU_UNCACHEDIO;
		if (uio->uio_extflg & UIO_DIRECT)
			dflags |= DMU_DIRECTIO;

		ssize_t tx_bytes;
		if (abuf == NULL) {
			tx_bytes = zfs_uio_resid(uio);
			zfs_uio_fault_disable(uio, B_TRUE);
			error = dmu_write_uio_dbuf(sa_get_db(zp->z_sa_hdl),
			    uio, nbytes, tx, dflags);
			zfs_uio_fault_disable(uio, B_FALSE);
#ifdef __linux__
			if (error == EFAULT) {
				zfs_clear_setid_bits_if_necessary(zfsvfs, zp,
				    cr, &clear_setid_bits_txg, tx);
				dmu_tx_commit(tx);
				/*
				 * Account for partial writes before
				 * continuing the loop.
				 * Update needs to occur before the next
				 * zfs_uio_prefaultpages, or prefaultpages may
				 * error, and we may break the loop early.
				 */
				n -= tx_bytes - zfs_uio_resid(uio);
				pfbytes -= tx_bytes - zfs_uio_resid(uio);
				continue;
			}
#endif
			/*
			 * On FreeBSD, EFAULT should be propagated back to the
			 * VFS, which will handle faulting and will retry.
			 */
			if (error != 0 && error != EFAULT) {
				zfs_clear_setid_bits_if_necessary(zfsvfs, zp,
				    cr, &clear_setid_bits_txg, tx);
				dmu_tx_commit(tx);
				break;
			}
			tx_bytes -= zfs_uio_resid(uio);
		} else {
			/*
			 * Thus, we're writing a full block at a block-aligned
			 * offset and extending the file past EOF.
			 *
			 * dmu_assign_arcbuf_by_dbuf() will directly assign the
			 * arc buffer to a dbuf.
			 */
			error = dmu_assign_arcbuf_by_dbuf(
			    sa_get_db(zp->z_sa_hdl), woff, abuf, tx, dflags);
			if (error != 0) {
				/*
				 * XXX This might not be necessary if
				 * dmu_assign_arcbuf_by_dbuf is guaranteed
				 * to be atomic.
				 */
				zfs_clear_setid_bits_if_necessary(zfsvfs, zp,
				    cr, &clear_setid_bits_txg, tx);
				dmu_return_arcbuf(abuf);
				dmu_tx_commit(tx);
				break;
			}
			ASSERT3S(nbytes, <=, zfs_uio_resid(uio));
			zfs_uioskip(uio, nbytes);
			tx_bytes = nbytes;
		}
		/*
		 * There is a window where a file's pages can be mmap'ed after
		 * zfs_setup_direct() is called. This is due to the fact that
		 * the rangelock in this function is acquired after calling
		 * zfs_setup_direct(). This is done so that
		 * zfs_uio_prefaultpages() does not attempt to fault in pages
		 * on Linux for Direct I/O requests. This is not necessary as
		 * the pages are pinned in memory and can not be faulted out.
		 * Ideally, the rangelock would be held before calling
		 * zfs_setup_direct() and zfs_uio_prefaultpages(); however,
		 * this can lead to a deadlock as zfs_getpage() also acquires
		 * the rangelock as a RL_WRITER and prefaulting the pages can
		 * lead to zfs_getpage() being called.
		 *
		 * In the case of the pages being mapped after
		 * zfs_setup_direct() is called, the call to update_pages()
		 * will still be made to make sure there is consistency between
		 * the ARC and the Linux page cache. This is an unfortunate
		 * situation as the data will be read back into the ARC after
		 * the Direct I/O write has completed, but this is the penalty
		 * for writing to a mmap'ed region of a file using Direct I/O.
		 */
		if (tx_bytes &&
		    zn_has_cached_data(zp, woff, woff + tx_bytes - 1)) {
			update_pages(zp, woff, tx_bytes, zfsvfs->z_os);
		}

		/*
		 * If we made no progress, we're done. If we made even
		 * partial progress, update the znode and ZIL accordingly.
987 */ 988 if (tx_bytes == 0) { 989 (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs), 990 (void *)&zp->z_size, sizeof (uint64_t), tx); 991 dmu_tx_commit(tx); 992 ASSERT(error != 0); 993 break; 994 } 995 996 zfs_clear_setid_bits_if_necessary(zfsvfs, zp, cr, 997 &clear_setid_bits_txg, tx); 998 999 zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime); 1000 1001 /* 1002 * Update the file size (zp_size) if it has changed; 1003 * account for possible concurrent updates. 1004 */ 1005 while ((end_size = zp->z_size) < zfs_uio_offset(uio)) { 1006 (void) atomic_cas_64(&zp->z_size, end_size, 1007 zfs_uio_offset(uio)); 1008 ASSERT(error == 0 || error == EFAULT); 1009 } 1010 /* 1011 * If we are replaying and eof is non zero then force 1012 * the file size to the specified eof. Note, there's no 1013 * concurrency during replay. 1014 */ 1015 if (zfsvfs->z_replay && zfsvfs->z_replay_eof != 0) 1016 zp->z_size = zfsvfs->z_replay_eof; 1017 1018 error1 = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); 1019 if (error1 != 0) 1020 /* Avoid clobbering EFAULT. */ 1021 error = error1; 1022 1023 /* 1024 * NB: During replay, the TX_SETATTR record logged by 1025 * zfs_clear_setid_bits_if_necessary must precede any of 1026 * the TX_WRITE records logged here. 1027 */ 1028 zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, commit, 1029 uio->uio_extflg & UIO_DIRECT ? B_TRUE : B_FALSE, NULL, 1030 NULL); 1031 1032 dmu_tx_commit(tx); 1033 1034 /* 1035 * Direct I/O was deferred in order to grow the first block. 1036 * At this point it can be re-enabled for subsequent writes. 1037 */ 1038 if (o_direct_defer) { 1039 ASSERT(ioflag & O_DIRECT); 1040 uio->uio_extflg |= UIO_DIRECT; 1041 o_direct_defer = B_FALSE; 1042 } 1043 1044 if (error != 0) 1045 break; 1046 ASSERT3S(tx_bytes, ==, nbytes); 1047 n -= nbytes; 1048 pfbytes -= nbytes; 1049 } 1050 1051 if (o_direct_defer) { 1052 ASSERT(ioflag & O_DIRECT); 1053 uio->uio_extflg |= UIO_DIRECT; 1054 o_direct_defer = B_FALSE; 1055 } 1056 1057 zfs_znode_update_vfs(zp); 1058 zfs_rangelock_exit(lr); 1059 1060 /* 1061 * Cleanup for Direct I/O if requested. 1062 */ 1063 if (uio->uio_extflg & UIO_DIRECT) 1064 zfs_uio_free_dio_pages(uio, UIO_WRITE); 1065 1066 /* 1067 * If we're in replay mode, or we made no progress, or the 1068 * uio data is inaccessible return an error. Otherwise, it's 1069 * at least a partial write, so it's successful. 1070 */ 1071 if (zfsvfs->z_replay || zfs_uio_resid(uio) == start_resid || 1072 error == EFAULT) { 1073 zfs_exit(zfsvfs, FTAG); 1074 return (error); 1075 } 1076 1077 if (commit) 1078 zil_commit(zilog, zp->z_id); 1079 1080 int64_t nwritten = start_resid - zfs_uio_resid(uio); 1081 dataset_kstats_update_write_kstats(&zfsvfs->z_kstat, nwritten); 1082 1083 zfs_exit(zfsvfs, FTAG); 1084 return (0); 1085 } 1086 1087 /* 1088 * Rewrite a range of file as-is without modification. 1089 * 1090 * IN: zp - znode of file to be rewritten. 1091 * off - Offset of the range to rewrite. 1092 * len - Length of the range to rewrite. 1093 * flags - Random rewrite parameters. 1094 * arg - flags-specific argument. 
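 *			Only ZFS_REWRITE_PHYSICAL is accepted here; any other
 *			flag bit, or a non-zero arg, returns EINVAL.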
1095 * 1096 * RETURN: 0 if success 1097 * error code if failure 1098 */ 1099 int 1100 zfs_rewrite(znode_t *zp, uint64_t off, uint64_t len, uint64_t flags, 1101 uint64_t arg) 1102 { 1103 int error; 1104 1105 if ((flags & ~ZFS_REWRITE_PHYSICAL) != 0 || arg != 0) 1106 return (SET_ERROR(EINVAL)); 1107 1108 zfsvfs_t *zfsvfs = ZTOZSB(zp); 1109 if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) 1110 return (error); 1111 1112 /* Check if physical rewrite is allowed */ 1113 spa_t *spa = zfsvfs->z_os->os_spa; 1114 if ((flags & ZFS_REWRITE_PHYSICAL) && 1115 !spa_feature_is_enabled(spa, SPA_FEATURE_PHYSICAL_REWRITE)) { 1116 zfs_exit(zfsvfs, FTAG); 1117 return (SET_ERROR(ENOTSUP)); 1118 } 1119 1120 if (zfs_is_readonly(zfsvfs)) { 1121 zfs_exit(zfsvfs, FTAG); 1122 return (SET_ERROR(EROFS)); 1123 } 1124 1125 if (off >= zp->z_size) { 1126 zfs_exit(zfsvfs, FTAG); 1127 return (0); 1128 } 1129 if (len == 0 || len > zp->z_size - off) 1130 len = zp->z_size - off; 1131 1132 /* Flush any mmap()'d data to disk */ 1133 if (zn_has_cached_data(zp, off, off + len - 1)) 1134 zn_flush_cached_data(zp, B_TRUE); 1135 1136 zfs_locked_range_t *lr; 1137 lr = zfs_rangelock_enter(&zp->z_rangelock, off, len, RL_WRITER); 1138 1139 const uint64_t uid = KUID_TO_SUID(ZTOUID(zp)); 1140 const uint64_t gid = KGID_TO_SGID(ZTOGID(zp)); 1141 const uint64_t projid = zp->z_projid; 1142 1143 dmu_buf_impl_t *db = (dmu_buf_impl_t *)sa_get_db(zp->z_sa_hdl); 1144 DB_DNODE_ENTER(db); 1145 dnode_t *dn = DB_DNODE(db); 1146 1147 uint64_t n, noff = off, nr = 0, nw = 0; 1148 while (len > 0) { 1149 /* 1150 * Rewrite only actual data, skipping any holes. This might 1151 * be inaccurate for dirty files, but we don't really care. 1152 */ 1153 if (noff == off) { 1154 /* Find next data in the file. */ 1155 error = dnode_next_offset(dn, 0, &noff, 1, 1, 0); 1156 if (error || noff >= off + len) { 1157 if (error == ESRCH) /* No more data. */ 1158 error = 0; 1159 break; 1160 } 1161 ASSERT3U(noff, >=, off); 1162 len -= noff - off; 1163 off = noff; 1164 1165 /* Find where the data end. */ 1166 error = dnode_next_offset(dn, DNODE_FIND_HOLE, &noff, 1167 1, 1, 0); 1168 if (error != 0) 1169 noff = off + len; 1170 } 1171 ASSERT3U(noff, >, off); 1172 1173 if (zfs_id_overblockquota(zfsvfs, DMU_USERUSED_OBJECT, uid) || 1174 zfs_id_overblockquota(zfsvfs, DMU_GROUPUSED_OBJECT, gid) || 1175 (projid != ZFS_DEFAULT_PROJID && 1176 zfs_id_overblockquota(zfsvfs, DMU_PROJECTUSED_OBJECT, 1177 projid))) { 1178 error = SET_ERROR(EDQUOT); 1179 break; 1180 } 1181 1182 n = MIN(MIN(len, noff - off), 1183 DMU_MAX_ACCESS / 2 - P2PHASE(off, zp->z_blksz)); 1184 1185 dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os); 1186 dmu_tx_hold_write_by_dnode(tx, dn, off, n); 1187 error = dmu_tx_assign(tx, DMU_TX_WAIT); 1188 if (error) { 1189 dmu_tx_abort(tx); 1190 break; 1191 } 1192 1193 /* Mark all dbufs within range as dirty to trigger rewrite. 
		 */
		dmu_buf_t **dbp;
		int numbufs;
		error = dmu_buf_hold_array_by_dnode(dn, off, n, TRUE, FTAG,
		    &numbufs, &dbp, DMU_READ_PREFETCH | DMU_UNCACHEDIO);
		if (error) {
			dmu_tx_commit(tx);
			break;
		}
		for (int i = 0; i < numbufs; i++) {
			nr += dbp[i]->db_size;
			if (dmu_buf_is_dirty(dbp[i], tx))
				continue;
			nw += dbp[i]->db_size;
			if (flags & ZFS_REWRITE_PHYSICAL)
				dmu_buf_will_rewrite(dbp[i], tx);
			else
				dmu_buf_will_dirty(dbp[i], tx);
		}
		dmu_buf_rele_array(dbp, numbufs, FTAG);

		dmu_tx_commit(tx);

		len -= n;
		off += n;

		if (issig()) {
			error = SET_ERROR(EINTR);
			break;
		}
	}

	DB_DNODE_EXIT(db);

	dataset_kstats_update_read_kstats(&zfsvfs->z_kstat, nr);
	dataset_kstats_update_write_kstats(&zfsvfs->z_kstat, nw);

	zfs_rangelock_exit(lr);
	zfs_exit(zfsvfs, FTAG);
	return (error);
}

int
zfs_getsecattr(znode_t *zp, vsecattr_t *vsecp, int flag, cred_t *cr)
{
	zfsvfs_t *zfsvfs = ZTOZSB(zp);
	int error;
	boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;

	if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
		return (error);
	error = zfs_getacl(zp, vsecp, skipaclchk, cr);
	zfs_exit(zfsvfs, FTAG);

	return (error);
}

int
zfs_setsecattr(znode_t *zp, vsecattr_t *vsecp, int flag, cred_t *cr)
{
	zfsvfs_t *zfsvfs = ZTOZSB(zp);
	int error;
	boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
	zilog_t *zilog;

	if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
		return (error);
	zilog = zfsvfs->z_log;
	error = zfs_setacl(zp, vsecp, skipaclchk, cr);

	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
		zil_commit(zilog, 0);

	zfs_exit(zfsvfs, FTAG);
	return (error);
}

/*
 * Get the optimal alignment to ensure direct IO can be performed without
 * incurring any RMW penalty on write. If direct IO is not enabled for this
 * file, returns an error.
 */
int
zfs_get_direct_alignment(znode_t *zp, uint64_t *alignp)
{
	zfsvfs_t *zfsvfs = ZTOZSB(zp);

	if (!zfs_dio_enabled || zfsvfs->z_os->os_direct == ZFS_DIRECT_DISABLED)
		return (SET_ERROR(EOPNOTSUPP));

	/*
	 * If the file has multiple blocks, then its block size is fixed
	 * forever, and so is the ideal alignment.
	 *
	 * If however it only has a single block, then we want to return the
	 * max block size it could possibly grow to (i.e., the dataset
	 * recordsize). We do this so that a program querying alignment
	 * immediately after the file is created gets a value that won't change
	 * once the file has grown into the second block and beyond.
	 *
	 * Because we don't have a count of blocks easily available here, we
	 * check if the apparent file size is smaller than its current block
	 * size (meaning, the file hasn't yet grown into the current block
	 * size) and then, check if the block size is smaller than the dataset
	 * maximum (meaning, if the file grew past the current block size, the
	 * block size would be increased).
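	 *
	 * For example (illustrative): on a dataset with recordsize=128K, a
	 * freshly written 4 KiB file reports 128 KiB alignment, while a file
	 * that has already grown to multiple blocks reports its fixed block
	 * size (never less than PAGE_SIZE).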
1299 */ 1300 if (zp->z_size <= zp->z_blksz && zp->z_blksz < zfsvfs->z_max_blksz) 1301 *alignp = MAX(zfsvfs->z_max_blksz, PAGE_SIZE); 1302 else 1303 *alignp = MAX(zp->z_blksz, PAGE_SIZE); 1304 1305 return (0); 1306 } 1307 1308 #ifdef ZFS_DEBUG 1309 static int zil_fault_io = 0; 1310 #endif 1311 1312 static void zfs_get_done(zgd_t *zgd, int error); 1313 1314 /* 1315 * Get data to generate a TX_WRITE intent log record. 1316 */ 1317 int 1318 zfs_get_data(void *arg, uint64_t gen, lr_write_t *lr, char *buf, 1319 struct lwb *lwb, zio_t *zio) 1320 { 1321 zfsvfs_t *zfsvfs = arg; 1322 objset_t *os = zfsvfs->z_os; 1323 znode_t *zp; 1324 uint64_t object = lr->lr_foid; 1325 uint64_t offset = lr->lr_offset; 1326 uint64_t size = lr->lr_length; 1327 zgd_t *zgd; 1328 int error = 0; 1329 uint64_t zp_gen; 1330 1331 ASSERT3P(lwb, !=, NULL); 1332 ASSERT3U(size, !=, 0); 1333 1334 /* 1335 * Nothing to do if the file has been removed 1336 */ 1337 if (zfs_zget(zfsvfs, object, &zp) != 0) 1338 return (SET_ERROR(ENOENT)); 1339 if (zp->z_unlinked) { 1340 /* 1341 * Release the vnode asynchronously as we currently have the 1342 * txg stopped from syncing. 1343 */ 1344 zfs_zrele_async(zp); 1345 return (SET_ERROR(ENOENT)); 1346 } 1347 /* check if generation number matches */ 1348 if (sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs), &zp_gen, 1349 sizeof (zp_gen)) != 0) { 1350 zfs_zrele_async(zp); 1351 return (SET_ERROR(EIO)); 1352 } 1353 if (zp_gen != gen) { 1354 zfs_zrele_async(zp); 1355 return (SET_ERROR(ENOENT)); 1356 } 1357 1358 zgd = kmem_zalloc(sizeof (zgd_t), KM_SLEEP); 1359 zgd->zgd_lwb = lwb; 1360 zgd->zgd_private = zp; 1361 1362 /* 1363 * Write records come in two flavors: immediate and indirect. 1364 * For small writes it's cheaper to store the data with the 1365 * log record (immediate); for large writes it's cheaper to 1366 * sync the data and get a pointer to it (indirect) so that 1367 * we don't have to write the data twice. 1368 */ 1369 if (buf != NULL) { /* immediate write */ 1370 zgd->zgd_lr = zfs_rangelock_enter(&zp->z_rangelock, offset, 1371 size, RL_READER); 1372 /* test for truncation needs to be done while range locked */ 1373 if (offset >= zp->z_size) { 1374 error = SET_ERROR(ENOENT); 1375 } else { 1376 error = dmu_read(os, object, offset, size, buf, 1377 DMU_READ_NO_PREFETCH | DMU_KEEP_CACHING); 1378 } 1379 ASSERT(error == 0 || error == ENOENT); 1380 } else { /* indirect write */ 1381 ASSERT3P(zio, !=, NULL); 1382 /* 1383 * Have to lock the whole block to ensure when it's 1384 * written out and its checksum is being calculated 1385 * that no one can change the data. We need to re-check 1386 * blocksize after we get the lock in case it's changed! 1387 */ 1388 for (;;) { 1389 uint64_t blkoff; 1390 size = zp->z_blksz; 1391 blkoff = ISP2(size) ? 
P2PHASE(offset, size) : offset; 1392 offset -= blkoff; 1393 zgd->zgd_lr = zfs_rangelock_enter(&zp->z_rangelock, 1394 offset, size, RL_READER); 1395 if (zp->z_blksz == size) 1396 break; 1397 offset += blkoff; 1398 zfs_rangelock_exit(zgd->zgd_lr); 1399 } 1400 /* test for truncation needs to be done while range locked */ 1401 if (lr->lr_offset >= zp->z_size) 1402 error = SET_ERROR(ENOENT); 1403 #ifdef ZFS_DEBUG 1404 if (zil_fault_io) { 1405 error = SET_ERROR(EIO); 1406 zil_fault_io = 0; 1407 } 1408 #endif 1409 1410 dmu_buf_t *dbp; 1411 if (error == 0) 1412 error = dmu_buf_hold_noread(os, object, offset, zgd, 1413 &dbp); 1414 1415 if (error == 0) { 1416 zgd->zgd_db = dbp; 1417 dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbp; 1418 boolean_t direct_write = B_FALSE; 1419 mutex_enter(&db->db_mtx); 1420 dbuf_dirty_record_t *dr = 1421 dbuf_find_dirty_eq(db, lr->lr_common.lrc_txg); 1422 if (dr != NULL && dr->dt.dl.dr_diowrite) 1423 direct_write = B_TRUE; 1424 mutex_exit(&db->db_mtx); 1425 1426 /* 1427 * All Direct I/O writes will have already completed and 1428 * the block pointer can be immediately stored in the 1429 * log record. 1430 */ 1431 if (direct_write) { 1432 /* 1433 * A Direct I/O write always covers an entire 1434 * block. 1435 */ 1436 ASSERT3U(dbp->db_size, ==, zp->z_blksz); 1437 lr->lr_blkptr = dr->dt.dl.dr_overridden_by; 1438 zfs_get_done(zgd, 0); 1439 return (0); 1440 } 1441 1442 blkptr_t *bp = &lr->lr_blkptr; 1443 zgd->zgd_bp = bp; 1444 1445 ASSERT3U(dbp->db_offset, ==, offset); 1446 ASSERT3U(dbp->db_size, ==, size); 1447 1448 error = dmu_sync(zio, lr->lr_common.lrc_txg, 1449 zfs_get_done, zgd); 1450 ASSERT(error || lr->lr_length <= size); 1451 1452 /* 1453 * On success, we need to wait for the write I/O 1454 * initiated by dmu_sync() to complete before we can 1455 * release this dbuf. We will finish everything up 1456 * in the zfs_get_done() callback. 1457 */ 1458 if (error == 0) 1459 return (0); 1460 1461 if (error == EALREADY) { 1462 lr->lr_common.lrc_txtype = TX_WRITE2; 1463 /* 1464 * TX_WRITE2 relies on the data previously 1465 * written by the TX_WRITE that caused 1466 * EALREADY. We zero out the BP because 1467 * it is the old, currently-on-disk BP. 1468 */ 1469 zgd->zgd_bp = NULL; 1470 BP_ZERO(bp); 1471 error = 0; 1472 } 1473 } 1474 } 1475 1476 zfs_get_done(zgd, error); 1477 1478 return (error); 1479 } 1480 1481 static void 1482 zfs_get_done(zgd_t *zgd, int error) 1483 { 1484 (void) error; 1485 znode_t *zp = zgd->zgd_private; 1486 1487 if (zgd->zgd_db) 1488 dmu_buf_rele(zgd->zgd_db, zgd); 1489 1490 zfs_rangelock_exit(zgd->zgd_lr); 1491 1492 /* 1493 * Release the vnode asynchronously as we currently have the 1494 * txg stopped from syncing. 1495 */ 1496 zfs_zrele_async(zp); 1497 1498 kmem_free(zgd, sizeof (zgd_t)); 1499 } 1500 1501 static int 1502 zfs_enter_two(zfsvfs_t *zfsvfs1, zfsvfs_t *zfsvfs2, const char *tag) 1503 { 1504 int error; 1505 1506 /* Swap. Not sure if the order of zfs_enter()s is important. 
	 */
	if (zfsvfs1 > zfsvfs2) {
		zfsvfs_t *tmpzfsvfs;

		tmpzfsvfs = zfsvfs2;
		zfsvfs2 = zfsvfs1;
		zfsvfs1 = tmpzfsvfs;
	}

	error = zfs_enter(zfsvfs1, tag);
	if (error != 0)
		return (error);
	if (zfsvfs1 != zfsvfs2) {
		error = zfs_enter(zfsvfs2, tag);
		if (error != 0) {
			zfs_exit(zfsvfs1, tag);
			return (error);
		}
	}

	return (0);
}

static void
zfs_exit_two(zfsvfs_t *zfsvfs1, zfsvfs_t *zfsvfs2, const char *tag)
{

	zfs_exit(zfsvfs1, tag);
	if (zfsvfs1 != zfsvfs2)
		zfs_exit(zfsvfs2, tag);
}

/*
 * We split each clone request in chunks that can fit into a single ZIL
 * log entry. Each ZIL log entry can fit 130816 bytes for a block cloning
 * operation (see zil_max_log_data() and zfs_log_clone_range()). This gives
 * us room for storing 1022 block pointers.
 *
 * On success, the function returns the number of bytes copied in *lenp.
 * Note that it doesn't return how many bytes are left to be copied.
 * On errors caused by any file system or BRT limitations, `EINVAL` is
 * returned. In most cases the user requested bad parameters: it might be
 * possible to clone the file, but some parameters don't match the
 * requirements.
 */
int
zfs_clone_range(znode_t *inzp, uint64_t *inoffp, znode_t *outzp,
    uint64_t *outoffp, uint64_t *lenp, cred_t *cr)
{
	zfsvfs_t *inzfsvfs, *outzfsvfs;
	objset_t *inos, *outos;
	zfs_locked_range_t *inlr, *outlr;
	dmu_buf_impl_t *db;
	dmu_tx_t *tx;
	zilog_t *zilog;
	uint64_t inoff, outoff, len, done;
	uint64_t outsize, size;
	int error;
	int count = 0;
	sa_bulk_attr_t bulk[3];
	uint64_t mtime[2], ctime[2];
	uint64_t uid, gid, projid;
	blkptr_t *bps;
	size_t maxblocks, nbps;
	uint_t inblksz;
	uint64_t clear_setid_bits_txg = 0;
	uint64_t last_synced_txg = 0;

	inoff = *inoffp;
	outoff = *outoffp;
	len = *lenp;
	done = 0;

	inzfsvfs = ZTOZSB(inzp);
	outzfsvfs = ZTOZSB(outzp);

	/*
	 * We need to call zfs_enter() potentially on two different datasets,
	 * so we need a dedicated function for that.
	 */
	error = zfs_enter_two(inzfsvfs, outzfsvfs, FTAG);
	if (error != 0)
		return (error);

	inos = inzfsvfs->z_os;
	outos = outzfsvfs->z_os;

	/*
	 * Both source and destination have to belong to the same storage pool.
	 */
	if (dmu_objset_spa(inos) != dmu_objset_spa(outos)) {
		zfs_exit_two(inzfsvfs, outzfsvfs, FTAG);
		return (SET_ERROR(EXDEV));
	}

	/*
	 * outos and inos belong to the same storage pool, as checked a few
	 * lines above, so only one feature check is needed.
	 */
	if (!spa_feature_is_enabled(dmu_objset_spa(outos),
	    SPA_FEATURE_BLOCK_CLONING)) {
		zfs_exit_two(inzfsvfs, outzfsvfs, FTAG);
		return (SET_ERROR(EOPNOTSUPP));
	}

	ASSERT(!outzfsvfs->z_replay);

	/*
	 * Block cloning from an unencrypted dataset into an encrypted
	 * dataset and vice versa is not supported.
	 */
	if (inos->os_encrypted != outos->os_encrypted) {
		zfs_exit_two(inzfsvfs, outzfsvfs, FTAG);
		return (SET_ERROR(EXDEV));
	}

	/*
	 * Cloning across encrypted datasets is possible only if they
	 * share the same master key.
	 */
	if (inos != outos && inos->os_encrypted &&
	    !dmu_objset_crypto_key_equal(inos, outos)) {
		zfs_exit_two(inzfsvfs, outzfsvfs, FTAG);
		return (SET_ERROR(EXDEV));
	}

	error = zfs_verify_zp(inzp);
	if (error == 0)
		error = zfs_verify_zp(outzp);
	if (error != 0) {
		zfs_exit_two(inzfsvfs, outzfsvfs, FTAG);
		return (error);
	}

	/*
	 * We don't copy the source file's flags, which is why we don't allow
	 * cloning files that are in quarantine.
	 */
	if (inzp->z_pflags & ZFS_AV_QUARANTINED) {
		zfs_exit_two(inzfsvfs, outzfsvfs, FTAG);
		return (SET_ERROR(EACCES));
	}

	if (inoff >= inzp->z_size) {
		*lenp = 0;
		zfs_exit_two(inzfsvfs, outzfsvfs, FTAG);
		return (0);
	}
	if (len > inzp->z_size - inoff) {
		len = inzp->z_size - inoff;
	}
	if (len == 0) {
		*lenp = 0;
		zfs_exit_two(inzfsvfs, outzfsvfs, FTAG);
		return (0);
	}

	/*
	 * Callers might not be able to detect properly that we are read-only,
	 * so check it explicitly here.
	 */
	if (zfs_is_readonly(outzfsvfs)) {
		zfs_exit_two(inzfsvfs, outzfsvfs, FTAG);
		return (SET_ERROR(EROFS));
	}

	/*
	 * If immutable or not appending then return EPERM.
	 * Intentionally allow ZFS_READONLY through here.
	 * See zfs_zaccess_common()
	 */
	if ((outzp->z_pflags & ZFS_IMMUTABLE) != 0) {
		zfs_exit_two(inzfsvfs, outzfsvfs, FTAG);
		return (SET_ERROR(EPERM));
	}

	/*
	 * No overlapping if we are cloning within the same file.
	 */
	if (inzp == outzp) {
		if (inoff < outoff + len && outoff < inoff + len) {
			zfs_exit_two(inzfsvfs, outzfsvfs, FTAG);
			return (SET_ERROR(EINVAL));
		}
	}

	/* Flush any mmap()'d data to disk */
	if (zn_has_cached_data(inzp, inoff, inoff + len - 1))
		zn_flush_cached_data(inzp, B_TRUE);

	/*
	 * Maintain predictable lock order.
	 */
	if (inzp < outzp || (inzp == outzp && inoff < outoff)) {
		inlr = zfs_rangelock_enter(&inzp->z_rangelock, inoff, len,
		    RL_READER);
		outlr = zfs_rangelock_enter(&outzp->z_rangelock, outoff, len,
		    RL_WRITER);
	} else {
		outlr = zfs_rangelock_enter(&outzp->z_rangelock, outoff, len,
		    RL_WRITER);
		inlr = zfs_rangelock_enter(&inzp->z_rangelock, inoff, len,
		    RL_READER);
	}

	inblksz = inzp->z_blksz;

	/*
	 * We cannot clone into a file with different block size if we can't
	 * grow it (block size is already bigger, has more than one block, or
	 * not locked for growth). There are other possible reasons for the
	 * grow to fail, but we cover what we can before opening transaction
	 * and the rest detect after we try to do it.
	 */
	if (inblksz < outzp->z_blksz) {
		error = SET_ERROR(EINVAL);
		goto unlock;
	}
	if (inblksz != outzp->z_blksz && (outzp->z_size > outzp->z_blksz ||
	    outlr->lr_length != UINT64_MAX)) {
		error = SET_ERROR(EINVAL);
		goto unlock;
	}

	/*
	 * Block size must be power-of-2 if destination offset != 0.
	 * There can be no multiple blocks of non-power-of-2 size.
	 */
	if (outoff != 0 && !ISP2(inblksz)) {
		error = SET_ERROR(EINVAL);
		goto unlock;
	}

	/*
	 * Offsets and len must be at block boundaries.
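	 * For example (illustrative), with a 128 KiB source block size both
	 * inoff and outoff must be multiples of 128 KiB.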
	 */
	if ((inoff % inblksz) != 0 || (outoff % inblksz) != 0) {
		error = SET_ERROR(EINVAL);
		goto unlock;
	}

	/*
	 * Length must be a multiple of blksz, except for the end of the file.
	 */
	if ((len % inblksz) != 0 &&
	    (len < inzp->z_size - inoff || len < outzp->z_size - outoff)) {
		error = SET_ERROR(EINVAL);
		goto unlock;
	}

	/*
	 * If we are copying only one block and it is smaller than recordsize
	 * property, do not allow the destination to grow beyond one block if
	 * it is not there yet. Otherwise the destination will get stuck with
	 * that block size forever, which can be as small as 512 bytes, no
	 * matter how big the destination grows later.
	 */
	if (len <= inblksz && inblksz < outzfsvfs->z_max_blksz &&
	    outzp->z_size <= inblksz && outoff + len > inblksz) {
		error = SET_ERROR(EINVAL);
		goto unlock;
	}

	error = zn_rlimit_fsize(outoff + len);
	if (error != 0) {
		goto unlock;
	}

	if (inoff >= MAXOFFSET_T || outoff >= MAXOFFSET_T) {
		error = SET_ERROR(EFBIG);
		goto unlock;
	}

	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(outzfsvfs), NULL,
	    &mtime, 16);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(outzfsvfs), NULL,
	    &ctime, 16);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(outzfsvfs), NULL,
	    &outzp->z_size, 8);

	zilog = outzfsvfs->z_log;
	maxblocks = zil_max_log_data(zilog, sizeof (lr_clone_range_t)) /
	    sizeof (bps[0]);

	uid = KUID_TO_SUID(ZTOUID(outzp));
	gid = KGID_TO_SGID(ZTOGID(outzp));
	projid = outzp->z_projid;

	bps = vmem_alloc(sizeof (bps[0]) * maxblocks, KM_SLEEP);

	/*
	 * Clone the file in reasonable size chunks. Each chunk is cloned
	 * in a separate transaction; this keeps the intent log records small
	 * and allows us to do more fine-grained space accounting.
	 */
	while (len > 0) {
		size = MIN(inblksz * maxblocks, len);

		if (zfs_id_overblockquota(outzfsvfs, DMU_USERUSED_OBJECT,
		    uid) ||
		    zfs_id_overblockquota(outzfsvfs, DMU_GROUPUSED_OBJECT,
		    gid) ||
		    (projid != ZFS_DEFAULT_PROJID &&
		    zfs_id_overblockquota(outzfsvfs, DMU_PROJECTUSED_OBJECT,
		    projid))) {
			error = SET_ERROR(EDQUOT);
			break;
		}

		nbps = maxblocks;
		last_synced_txg = spa_last_synced_txg(dmu_objset_spa(inos));
		error = dmu_read_l0_bps(inos, inzp->z_id, inoff, size, bps,
		    &nbps);
		if (error != 0) {
			/*
			 * If we are trying to clone a block that was created
			 * in the current transaction group, the error will be
			 * EAGAIN here. Based on zfs_bclone_wait_dirty either
			 * return a shortened range to the caller so it can
			 * fallback, or wait for the next TXG and check again.
			 */
			if (error == EAGAIN && zfs_bclone_wait_dirty) {
				txg_wait_flag_t wait_flags =
				    spa_get_failmode(dmu_objset_spa(inos)) ==
				    ZIO_FAILURE_MODE_CONTINUE ?
				    TXG_WAIT_SUSPEND : 0;
				error = txg_wait_synced_flags(
				    dmu_objset_pool(inos), last_synced_txg + 1,
				    wait_flags);
				if (error == 0)
					continue;
				ASSERT3U(error, ==, ESHUTDOWN);
				error = SET_ERROR(EIO);
			}

			break;
		}

		/*
		 * Start a transaction.
1845 */ 1846 tx = dmu_tx_create(outos); 1847 dmu_tx_hold_sa(tx, outzp->z_sa_hdl, B_FALSE); 1848 db = (dmu_buf_impl_t *)sa_get_db(outzp->z_sa_hdl); 1849 DB_DNODE_ENTER(db); 1850 dmu_tx_hold_clone_by_dnode(tx, DB_DNODE(db), outoff, size, 1851 inblksz); 1852 DB_DNODE_EXIT(db); 1853 zfs_sa_upgrade_txholds(tx, outzp); 1854 error = dmu_tx_assign(tx, DMU_TX_WAIT); 1855 if (error != 0) { 1856 dmu_tx_abort(tx); 1857 break; 1858 } 1859 1860 /* 1861 * Copy source znode's block size. This is done only if the 1862 * whole znode is locked (see zfs_rangelock_cb()) and only 1863 * on the first iteration since zfs_rangelock_reduce() will 1864 * shrink down lr_length to the appropriate size. 1865 */ 1866 if (outlr->lr_length == UINT64_MAX) { 1867 zfs_grow_blocksize(outzp, inblksz, tx); 1868 1869 /* 1870 * Block growth may fail for many reasons we can not 1871 * predict here. If it happen the cloning is doomed. 1872 */ 1873 if (inblksz != outzp->z_blksz) { 1874 error = SET_ERROR(EINVAL); 1875 dmu_tx_commit(tx); 1876 break; 1877 } 1878 1879 /* 1880 * Round range lock up to the block boundary, so we 1881 * prevent appends until we are done. 1882 */ 1883 zfs_rangelock_reduce(outlr, outoff, 1884 ((len - 1) / inblksz + 1) * inblksz); 1885 } 1886 1887 error = dmu_brt_clone(outos, outzp->z_id, outoff, size, tx, 1888 bps, nbps); 1889 if (error != 0) { 1890 dmu_tx_commit(tx); 1891 break; 1892 } 1893 1894 if (zn_has_cached_data(outzp, outoff, outoff + size - 1)) { 1895 update_pages(outzp, outoff, size, outos); 1896 } 1897 1898 zfs_clear_setid_bits_if_necessary(outzfsvfs, outzp, cr, 1899 &clear_setid_bits_txg, tx); 1900 1901 zfs_tstamp_update_setup(outzp, CONTENT_MODIFIED, mtime, ctime); 1902 1903 /* 1904 * Update the file size (zp_size) if it has changed; 1905 * account for possible concurrent updates. 1906 */ 1907 while ((outsize = outzp->z_size) < outoff + size) { 1908 (void) atomic_cas_64(&outzp->z_size, outsize, 1909 outoff + size); 1910 } 1911 1912 error = sa_bulk_update(outzp->z_sa_hdl, bulk, count, tx); 1913 1914 zfs_log_clone_range(zilog, tx, TX_CLONE_RANGE, outzp, outoff, 1915 size, inblksz, bps, nbps); 1916 1917 dmu_tx_commit(tx); 1918 1919 if (error != 0) 1920 break; 1921 1922 inoff += size; 1923 outoff += size; 1924 len -= size; 1925 done += size; 1926 1927 if (issig()) { 1928 error = SET_ERROR(EINTR); 1929 break; 1930 } 1931 } 1932 1933 vmem_free(bps, sizeof (bps[0]) * maxblocks); 1934 zfs_znode_update_vfs(outzp); 1935 1936 unlock: 1937 zfs_rangelock_exit(outlr); 1938 zfs_rangelock_exit(inlr); 1939 1940 if (done > 0) { 1941 /* 1942 * If we have made at least partial progress, reset the error. 1943 */ 1944 error = 0; 1945 1946 ZFS_ACCESSTIME_STAMP(inzfsvfs, inzp); 1947 1948 if (outos->os_sync == ZFS_SYNC_ALWAYS) { 1949 zil_commit(zilog, outzp->z_id); 1950 } 1951 1952 *inoffp += done; 1953 *outoffp += done; 1954 *lenp = done; 1955 } else { 1956 /* 1957 * If we made no progress, there must be a good reason. 1958 * EOF is handled explicitly above, before the loop. 1959 */ 1960 ASSERT3S(error, !=, 0); 1961 } 1962 1963 zfs_exit_two(inzfsvfs, outzfsvfs, FTAG); 1964 1965 return (error); 1966 } 1967 1968 /* 1969 * Usual pattern would be to call zfs_clone_range() from zfs_replay_clone(), 1970 * but we cannot do that, because when replaying we don't have source znode 1971 * available. This is why we need a dedicated replay function. 
1972 */ 1973 int 1974 zfs_clone_range_replay(znode_t *zp, uint64_t off, uint64_t len, uint64_t blksz, 1975 const blkptr_t *bps, size_t nbps) 1976 { 1977 zfsvfs_t *zfsvfs; 1978 dmu_buf_impl_t *db; 1979 dmu_tx_t *tx; 1980 int error; 1981 int count = 0; 1982 sa_bulk_attr_t bulk[3]; 1983 uint64_t mtime[2], ctime[2]; 1984 1985 ASSERT3U(off, <, MAXOFFSET_T); 1986 ASSERT3U(len, >, 0); 1987 ASSERT3U(nbps, >, 0); 1988 1989 zfsvfs = ZTOZSB(zp); 1990 1991 ASSERT(spa_feature_is_enabled(dmu_objset_spa(zfsvfs->z_os), 1992 SPA_FEATURE_BLOCK_CLONING)); 1993 1994 if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) 1995 return (error); 1996 1997 ASSERT(zfsvfs->z_replay); 1998 ASSERT(!zfs_is_readonly(zfsvfs)); 1999 2000 if ((off % blksz) != 0) { 2001 zfs_exit(zfsvfs, FTAG); 2002 return (SET_ERROR(EINVAL)); 2003 } 2004 2005 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); 2006 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); 2007 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL, 2008 &zp->z_size, 8); 2009 2010 /* 2011 * Start a transaction. 2012 */ 2013 tx = dmu_tx_create(zfsvfs->z_os); 2014 2015 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); 2016 db = (dmu_buf_impl_t *)sa_get_db(zp->z_sa_hdl); 2017 DB_DNODE_ENTER(db); 2018 dmu_tx_hold_clone_by_dnode(tx, DB_DNODE(db), off, len, blksz); 2019 DB_DNODE_EXIT(db); 2020 zfs_sa_upgrade_txholds(tx, zp); 2021 error = dmu_tx_assign(tx, DMU_TX_WAIT); 2022 if (error != 0) { 2023 dmu_tx_abort(tx); 2024 zfs_exit(zfsvfs, FTAG); 2025 return (error); 2026 } 2027 2028 if (zp->z_blksz < blksz) 2029 zfs_grow_blocksize(zp, blksz, tx); 2030 2031 dmu_brt_clone(zfsvfs->z_os, zp->z_id, off, len, tx, bps, nbps); 2032 2033 zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime); 2034 2035 if (zp->z_size < off + len) 2036 zp->z_size = off + len; 2037 2038 error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); 2039 2040 /* 2041 * zil_replaying() not only check if we are replaying ZIL, but also 2042 * updates the ZIL header to record replay progress. 2043 */ 2044 VERIFY(zil_replaying(zfsvfs->z_log, tx)); 2045 2046 dmu_tx_commit(tx); 2047 2048 zfs_znode_update_vfs(zp); 2049 2050 zfs_exit(zfsvfs, FTAG); 2051 2052 return (error); 2053 } 2054 2055 EXPORT_SYMBOL(zfs_access); 2056 EXPORT_SYMBOL(zfs_fsync); 2057 EXPORT_SYMBOL(zfs_holey); 2058 EXPORT_SYMBOL(zfs_read); 2059 EXPORT_SYMBOL(zfs_write); 2060 EXPORT_SYMBOL(zfs_getsecattr); 2061 EXPORT_SYMBOL(zfs_setsecattr); 2062 EXPORT_SYMBOL(zfs_clone_range); 2063 EXPORT_SYMBOL(zfs_clone_range_replay); 2064 2065 ZFS_MODULE_PARAM(zfs_vnops, zfs_vnops_, read_chunk_size, U64, ZMOD_RW, 2066 "Bytes to read per chunk"); 2067 2068 ZFS_MODULE_PARAM(zfs, zfs_, bclone_enabled, INT, ZMOD_RW, 2069 "Enable block cloning"); 2070 2071 ZFS_MODULE_PARAM(zfs, zfs_, bclone_wait_dirty, INT, ZMOD_RW, 2072 "Wait for dirty blocks when cloning"); 2073 2074 ZFS_MODULE_PARAM(zfs, zfs_, dio_enabled, INT, ZMOD_RW, 2075 "Enable Direct I/O"); 2076 2077 ZFS_MODULE_PARAM(zfs, zfs_, dio_strict, INT, ZMOD_RW, 2078 "Return errors on misaligned Direct I/O"); 2079
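/*
 * Illustrative examples of adjusting the tunables above at runtime (paths and
 * names are platform conventions, not guaranteed): on Linux they are typically
 * exposed as module parameters, e.g.
 *	echo 0 > /sys/module/zfs/parameters/zfs_bclone_enabled
 * while on FreeBSD the equivalents usually appear as sysctls, e.g.
 *	sysctl vfs.zfs.bclone_enabled=0
 */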