1 // SPDX-License-Identifier: CDDL-1.0 2 /* 3 * CDDL HEADER START 4 * 5 * The contents of this file are subject to the terms of the 6 * Common Development and Distribution License (the "License"). 7 * You may not use this file except in compliance with the License. 8 * 9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 10 * or https://opensource.org/licenses/CDDL-1.0. 11 * See the License for the specific language governing permissions 12 * and limitations under the License. 13 * 14 * When distributing Covered Code, include this CDDL HEADER in each 15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 16 * If applicable, add the following below this CDDL HEADER, with the 17 * fields enclosed by brackets "[]" replaced with your own identifying 18 * information: Portions Copyright [yyyy] [name of copyright owner] 19 * 20 * CDDL HEADER END 21 */ 22 23 /* 24 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 25 * Copyright (c) 2012, 2018 by Delphix. All rights reserved. 26 * Copyright (c) 2015 by Chunwei Chen. All rights reserved. 27 * Copyright 2017 Nexenta Systems, Inc. 28 * Copyright (c) 2021, 2022 by Pawel Jakub Dawidek 29 * Copyright (c) 2025, Rob Norris <robn@despairlabs.com> 30 * Copyright (c) 2025, Klara, Inc. 31 */ 32 33 /* Portions Copyright 2007 Jeremy Teo */ 34 /* Portions Copyright 2010 Robert Milkowski */ 35 36 #include <sys/types.h> 37 #include <sys/param.h> 38 #include <sys/time.h> 39 #include <sys/sysmacros.h> 40 #include <sys/vfs.h> 41 #include <sys/file.h> 42 #include <sys/stat.h> 43 #include <sys/kmem.h> 44 #include <sys/cmn_err.h> 45 #include <sys/errno.h> 46 #include <sys/zfs_dir.h> 47 #include <sys/zfs_acl.h> 48 #include <sys/zfs_ioctl.h> 49 #include <sys/fs/zfs.h> 50 #include <sys/dmu.h> 51 #include <sys/dmu_objset.h> 52 #include <sys/dsl_crypt.h> 53 #include <sys/dsl_dataset.h> 54 #include <sys/spa.h> 55 #include <sys/txg.h> 56 #include <sys/dbuf.h> 57 #include <sys/policy.h> 58 #include <sys/zfeature.h> 59 #include <sys/zfs_vnops.h> 60 #include <sys/zfs_quota.h> 61 #include <sys/zfs_vfsops.h> 62 #include <sys/zfs_znode.h> 63 64 /* 65 * Enables access to the block cloning feature. If this setting is 0, then even 66 * if feature@block_cloning is enabled, using functions and system calls that 67 * attempt to clone blocks will act as though the feature is disabled. 68 */ 69 int zfs_bclone_enabled = 1; 70 71 /* 72 * When set to 1 the FICLONE and FICLONERANGE ioctls will wait for any dirty 73 * data to be written to disk before proceeding. This ensures that the clone 74 * operation reliably succeeds, even if a file is modified and then immediately 75 * cloned. Note that for small files this may be slower than simply copying 76 * the file. When set to 0 the clone operation will immediately fail if it 77 * encounters any dirty blocks. By default waiting is enabled. 78 */ 79 int zfs_bclone_wait_dirty = 1; 80 81 /* 82 * Enable Direct I/O. If this setting is 0, then all I/O requests will be 83 * directed through the ARC acting as though the dataset property direct was 84 * set to disabled. 85 * 86 * Disabled by default on FreeBSD until a potential range locking issue in 87 * zfs_getpages() can be resolved. 88 */ 89 #ifdef __FreeBSD__ 90 static int zfs_dio_enabled = 0; 91 #else 92 static int zfs_dio_enabled = 1; 93 #endif 94 95 /* 96 * Strictly enforce alignment for Direct I/O requests, returning EINVAL 97 * if not page-aligned instead of silently falling back to uncached I/O. 
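 *
 * As an illustrative userspace sketch (assuming 4 KiB pages; not part of
 * this file): with zfs_dio_strict=1 an application issuing O_DIRECT I/O
 * must align the buffer, offset and length itself, e.g.
 *
 *	void *buf;
 *	int fd = open(path, O_RDONLY | O_DIRECT);
 *	(void) posix_memalign(&buf, 4096, 4096);	<- page-aligned memory
 *	(void) pread(fd, buf, 4096, 0);			<- aligned: Direct I/O
 *	(void) pread(fd, buf, 1000, 512);		<- misaligned: EINVAL
 *
 * With zfs_dio_strict=0 the misaligned request would instead fall back to
 * uncached I/O through the ARC.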
98 */ 99 static int zfs_dio_strict = 0; 100 101 102 /* 103 * Maximum bytes to read per chunk in zfs_read(). 104 */ 105 #ifdef _ILP32 106 static uint64_t zfs_vnops_read_chunk_size = 1024 * 1024; 107 #else 108 static uint64_t zfs_vnops_read_chunk_size = DMU_MAX_ACCESS / 2; 109 #endif 110 111 int 112 zfs_fsync(znode_t *zp, int syncflag, cred_t *cr) 113 { 114 int error = 0; 115 zfsvfs_t *zfsvfs = ZTOZSB(zp); 116 117 if (zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED) { 118 if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) 119 return (error); 120 error = zil_commit(zfsvfs->z_log, zp->z_id); 121 zfs_exit(zfsvfs, FTAG); 122 } 123 return (error); 124 } 125 126 127 #if defined(SEEK_HOLE) && defined(SEEK_DATA) 128 /* 129 * Lseek support for finding holes (cmd == SEEK_HOLE) and 130 * data (cmd == SEEK_DATA). "off" is an in/out parameter. 131 */ 132 static int 133 zfs_holey_common(znode_t *zp, ulong_t cmd, loff_t *off) 134 { 135 zfs_locked_range_t *lr; 136 uint64_t noff = (uint64_t)*off; /* new offset */ 137 uint64_t file_sz; 138 int error; 139 boolean_t hole; 140 141 file_sz = zp->z_size; 142 if (noff >= file_sz) { 143 return (SET_ERROR(ENXIO)); 144 } 145 146 if (cmd == F_SEEK_HOLE) 147 hole = B_TRUE; 148 else 149 hole = B_FALSE; 150 151 /* Flush any mmap()'d data to disk */ 152 if (zn_has_cached_data(zp, 0, file_sz - 1)) 153 zn_flush_cached_data(zp, B_TRUE); 154 155 lr = zfs_rangelock_enter(&zp->z_rangelock, 0, UINT64_MAX, RL_READER); 156 error = dmu_offset_next(ZTOZSB(zp)->z_os, zp->z_id, hole, &noff); 157 zfs_rangelock_exit(lr); 158 159 if (error == ESRCH) 160 return (SET_ERROR(ENXIO)); 161 162 /* File was dirty, so fall back to using generic logic */ 163 if (error == EBUSY) { 164 if (hole) 165 *off = file_sz; 166 167 return (0); 168 } 169 170 /* 171 * We could find a hole that begins after the logical end-of-file, 172 * because dmu_offset_next() only works on whole blocks. If the 173 * EOF falls mid-block, then indicate that the "virtual hole" 174 * at the end of the file begins at the logical EOF, rather than 175 * at the end of the last block. 176 */ 177 if (noff > file_sz) { 178 ASSERT(hole); 179 noff = file_sz; 180 } 181 182 if (noff < *off) 183 return (error); 184 *off = noff; 185 return (error); 186 } 187 188 int 189 zfs_holey(znode_t *zp, ulong_t cmd, loff_t *off) 190 { 191 zfsvfs_t *zfsvfs = ZTOZSB(zp); 192 int error; 193 194 if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) 195 return (error); 196 197 error = zfs_holey_common(zp, cmd, off); 198 199 zfs_exit(zfsvfs, FTAG); 200 return (error); 201 } 202 #endif /* SEEK_HOLE && SEEK_DATA */ 203 204 int 205 zfs_access(znode_t *zp, int mode, int flag, cred_t *cr) 206 { 207 zfsvfs_t *zfsvfs = ZTOZSB(zp); 208 int error; 209 210 if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) 211 return (error); 212 213 if (flag & V_ACE_MASK) 214 #if defined(__linux__) 215 error = zfs_zaccess(zp, mode, flag, B_FALSE, cr, 216 zfs_init_idmap); 217 #else 218 error = zfs_zaccess(zp, mode, flag, B_FALSE, cr, 219 NULL); 220 #endif 221 else 222 #if defined(__linux__) 223 error = zfs_zaccess_rwx(zp, mode, flag, cr, zfs_init_idmap); 224 #else 225 error = zfs_zaccess_rwx(zp, mode, flag, cr, NULL); 226 #endif 227 228 zfs_exit(zfsvfs, FTAG); 229 return (error); 230 } 231 232 /* 233 * Determine if Direct I/O has been requested (either via the O_DIRECT flag or 234 * the "direct" dataset property). When inherited by the property only apply 235 * the O_DIRECT flag to correctly aligned IO requests. 
 * The rationale for this is that it allows the property to be safely set on
 * a dataset without forcing all of the applications to be aware of the
 * alignment restrictions. When O_DIRECT is explicitly requested by an
 * application, return EINVAL if the request is unaligned. In all cases, if
 * the range for this request has been mmap'ed then we will perform buffered
 * I/O to keep the mapped region synchronized with the ARC.
 *
 * It is possible that a file's pages could be mmap'ed after it is checked
 * here. If so, that is handled accordingly in zfs_write(). See comments in
 * the following area for how this is handled:
 * zfs_write() -> update_pages()
 */
static int
zfs_setup_direct(struct znode *zp, zfs_uio_t *uio, zfs_uio_rw_t rw,
    int *ioflagp)
{
	zfsvfs_t *zfsvfs = ZTOZSB(zp);
	objset_t *os = zfsvfs->z_os;
	int ioflag = *ioflagp;
	int error = 0;

	if (os->os_direct == ZFS_DIRECT_ALWAYS) {
		/* Force either direct or uncached I/O. */
		ioflag |= O_DIRECT;
	}

	if ((ioflag & O_DIRECT) == 0)
		goto out;

	if (!zfs_dio_enabled || os->os_direct == ZFS_DIRECT_DISABLED) {
		/*
		 * Direct I/O is disabled. The I/O request will be directed
		 * through the ARC as uncached I/O.
		 */
		goto out;
	}

	if (!zfs_uio_page_aligned(uio) ||
	    !zfs_uio_aligned(uio, PAGE_SIZE)) {
		/*
		 * Misaligned requests can be executed through the ARC as
		 * uncached I/O. But if O_DIRECT was set by the user and we
		 * were set to be strict, then it is a failure.
		 */
		if ((*ioflagp & O_DIRECT) && zfs_dio_strict)
			error = SET_ERROR(EINVAL);
		goto out;
	}

	if (zn_has_cached_data(zp, zfs_uio_offset(uio),
	    zfs_uio_offset(uio) + zfs_uio_resid(uio) - 1)) {
		/*
		 * The region is mmap'ed. The I/O request will be directed
		 * through the ARC as uncached I/O.
		 */
		goto out;
	}

	/*
	 * For short writes the page mapping of Direct I/O makes no sense.
	 * Direct them through the ARC as uncached I/O.
	 */
	if (rw == UIO_WRITE && zfs_uio_resid(uio) < zp->z_blksz)
		goto out;

	error = zfs_uio_get_dio_pages_alloc(uio, rw);
	if (error)
		goto out;
	ASSERT(uio->uio_extflg & UIO_DIRECT);

out:
	*ioflagp = ioflag;
	return (error);
}

/*
 * Read bytes from specified file into supplied buffer.
 *
 *	IN:	zp	- inode of file to be read from.
 *		uio	- structure supplying read location, range info,
 *			  and return buffer.
 *		ioflag	- O_SYNC flags; used to provide FRSYNC semantics.
 *			  O_DIRECT flag; used to bypass page cache.
 *		cr	- credentials of caller.
 *
 *	OUT:	uio	- updated offset and range, buffer filled.
 *
 *	RETURN:	0 on success, error code on failure.
 *
 * Side Effects:
 *	inode - atime updated if byte count > 0
 */
int
zfs_read(struct znode *zp, zfs_uio_t *uio, int ioflag, cred_t *cr)
{
	(void) cr;
	int error = 0;
	boolean_t frsync = B_FALSE;
	boolean_t dio_checksum_failure = B_FALSE;

	zfsvfs_t *zfsvfs = ZTOZSB(zp);
	if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
		return (error);

	if (zp->z_pflags & ZFS_AV_QUARANTINED) {
		zfs_exit(zfsvfs, FTAG);
		return (SET_ERROR(EACCES));
	}

	/* We don't copy out anything useful for directories. */
	if (Z_ISDIR(ZTOTYPE(zp))) {
		zfs_exit(zfsvfs, FTAG);
		return (SET_ERROR(EISDIR));
	}

	/*
	 * Validate file offset
	 */
	if (zfs_uio_offset(uio) < (offset_t)0) {
		zfs_exit(zfsvfs, FTAG);
		return (SET_ERROR(EINVAL));
	}

	/*
	 * Fasttrack empty reads
	 */
	if (zfs_uio_resid(uio) == 0) {
		zfs_exit(zfsvfs, FTAG);
		return (0);
	}

#ifdef FRSYNC
	/*
	 * If we're in FRSYNC mode, sync out this znode before reading it.
	 * Only do this for non-snapshots.
	 *
	 * Some platforms do not support FRSYNC and instead map it
	 * to O_SYNC, which results in unnecessary calls to zil_commit. We
	 * only honor FRSYNC requests on platforms which support it.
	 */
	frsync = !!(ioflag & FRSYNC);
#endif
	if (zfsvfs->z_log &&
	    (frsync || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)) {
		error = zil_commit(zfsvfs->z_log, zp->z_id);
		if (error != 0) {
			zfs_exit(zfsvfs, FTAG);
			return (error);
		}
	}

	/*
	 * Lock the range against changes.
	 */
	zfs_locked_range_t *lr = zfs_rangelock_enter(&zp->z_rangelock,
	    zfs_uio_offset(uio), zfs_uio_resid(uio), RL_READER);

	/*
	 * If we are reading past end-of-file we can skip
	 * to the end; but we might still need to set atime.
	 */
	if (zfs_uio_offset(uio) >= zp->z_size) {
		error = 0;
		goto out;
	}
	ASSERT(zfs_uio_offset(uio) < zp->z_size);

	/*
	 * Setting up Direct I/O if requested.
	 */
	error = zfs_setup_direct(zp, uio, UIO_READ, &ioflag);
	if (error) {
		goto out;
	}

#if defined(__linux__)
	ssize_t start_offset = zfs_uio_offset(uio);
#endif
	uint_t blksz = zp->z_blksz;
	ssize_t chunk_size;
	ssize_t n = MIN(zfs_uio_resid(uio), zp->z_size - zfs_uio_offset(uio));
	ssize_t start_resid = n;
	ssize_t dio_remaining_resid = 0;

	dmu_flags_t dflags = DMU_READ_PREFETCH;
	if (ioflag & O_DIRECT)
		dflags |= DMU_UNCACHEDIO;
	if (uio->uio_extflg & UIO_DIRECT) {
		/*
		 * All pages for an O_DIRECT request have already been mapped
		 * so there's no compelling reason to handle this uio in
		 * smaller chunks.
		 */
		chunk_size = DMU_MAX_ACCESS;

		/*
		 * In the event that the O_DIRECT request is reading the entire
		 * file, it is possible the file's length is not page-size
		 * aligned. However, lower layers expect that the Direct I/O
		 * request is page-aligned. In this case, as much of the file
		 * as possible is read using Direct I/O and the remaining
		 * amount is read through the ARC.
		 *
		 * This is still consistent with the semantics of Direct I/O in
		 * ZFS, since at a minimum the I/O request must be page-aligned.
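		 *
		 * For example (illustrative numbers, assuming 4 KiB pages): an
		 * O_DIRECT read of a 1,000,000-byte file from offset 0 issues
		 * P2ALIGN(1000000, 4096) = 999424 bytes below as Direct I/O
		 * and leaves dio_remaining_resid = 576 bytes to be read
		 * through the ARC once the loop has consumed the aligned
		 * portion.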
441 */ 442 dio_remaining_resid = n - P2ALIGN_TYPED(n, PAGE_SIZE, ssize_t); 443 if (dio_remaining_resid != 0) 444 n -= dio_remaining_resid; 445 dflags |= DMU_DIRECTIO; 446 } else { 447 chunk_size = MIN(MAX(zfs_vnops_read_chunk_size, blksz), 448 DMU_MAX_ACCESS / 2); 449 } 450 451 while (n > 0) { 452 ssize_t nbytes = MIN(n, chunk_size - 453 P2PHASE(zfs_uio_offset(uio), blksz)); 454 #ifdef UIO_NOCOPY 455 if (zfs_uio_segflg(uio) == UIO_NOCOPY) 456 error = mappedread_sf(zp, nbytes, uio); 457 else 458 #endif 459 if (zn_has_cached_data(zp, zfs_uio_offset(uio), 460 zfs_uio_offset(uio) + nbytes - 1)) { 461 error = mappedread(zp, nbytes, uio); 462 } else { 463 error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl), 464 uio, nbytes, dflags); 465 } 466 467 if (error) { 468 /* convert checksum errors into IO errors */ 469 if (error == ECKSUM) { 470 /* 471 * If a Direct I/O read returned a checksum 472 * verify error, then it must be treated as 473 * suspicious. The contents of the buffer could 474 * have beeen manipulated while the I/O was in 475 * flight. In this case, the remainder of I/O 476 * request will just be reissued through the 477 * ARC. 478 */ 479 if (uio->uio_extflg & UIO_DIRECT) { 480 dio_checksum_failure = B_TRUE; 481 uio->uio_extflg &= ~UIO_DIRECT; 482 n += dio_remaining_resid; 483 dio_remaining_resid = 0; 484 continue; 485 } else { 486 error = SET_ERROR(EIO); 487 } 488 } 489 490 #if defined(__linux__) 491 /* 492 * if we actually read some bytes, bubbling EFAULT 493 * up to become EAGAIN isn't what we want here... 494 * 495 * ...on Linux, at least. On FBSD, doing this breaks. 496 */ 497 if (error == EFAULT && 498 (zfs_uio_offset(uio) - start_offset) != 0) 499 error = 0; 500 #endif 501 break; 502 } 503 504 n -= nbytes; 505 } 506 507 if (error == 0 && (uio->uio_extflg & UIO_DIRECT) && 508 dio_remaining_resid != 0) { 509 /* 510 * Temporarily remove the UIO_DIRECT flag from the UIO so the 511 * remainder of the file can be read using the ARC. 512 */ 513 uio->uio_extflg &= ~UIO_DIRECT; 514 dflags &= ~DMU_DIRECTIO; 515 516 if (zn_has_cached_data(zp, zfs_uio_offset(uio), 517 zfs_uio_offset(uio) + dio_remaining_resid - 1)) { 518 error = mappedread(zp, dio_remaining_resid, uio); 519 } else { 520 error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl), uio, 521 dio_remaining_resid, dflags); 522 } 523 uio->uio_extflg |= UIO_DIRECT; 524 dflags |= DMU_DIRECTIO; 525 526 if (error != 0) 527 n += dio_remaining_resid; 528 } else if (error && (uio->uio_extflg & UIO_DIRECT)) { 529 n += dio_remaining_resid; 530 } 531 int64_t nread = start_resid - n; 532 533 dataset_kstats_update_read_kstats(&zfsvfs->z_kstat, nread); 534 out: 535 zfs_rangelock_exit(lr); 536 537 if (dio_checksum_failure == B_TRUE) 538 uio->uio_extflg |= UIO_DIRECT; 539 540 /* 541 * Cleanup for Direct I/O if requested. 542 */ 543 if (uio->uio_extflg & UIO_DIRECT) 544 zfs_uio_free_dio_pages(uio, UIO_READ); 545 546 ZFS_ACCESSTIME_STAMP(zfsvfs, zp); 547 zfs_exit(zfsvfs, FTAG); 548 return (error); 549 } 550 551 static void 552 zfs_clear_setid_bits_if_necessary(zfsvfs_t *zfsvfs, znode_t *zp, cred_t *cr, 553 uint64_t *clear_setid_bits_txgp, dmu_tx_t *tx) 554 { 555 zilog_t *zilog = zfsvfs->z_log; 556 const uint64_t uid = KUID_TO_SUID(ZTOUID(zp)); 557 558 ASSERT(clear_setid_bits_txgp != NULL); 559 ASSERT(tx != NULL); 560 561 /* 562 * Clear Set-UID/Set-GID bits on successful write if not 563 * privileged and at least one of the execute bits is set. 
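	 *
	 * As an observable (hypothetical) userspace example: an unprivileged
	 * owner of a mode-04755 binary who does
	 *
	 *	fd = open("prog", O_WRONLY);
	 *	write(fd, buf, 1);
	 *	fstat(fd, &st);
	 *
	 * should afterwards see st.st_mode carrying 0755, the SUID bit having
	 * been dropped here.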
564 * 565 * It would be nice to do this after all writes have 566 * been done, but that would still expose the ISUID/ISGID 567 * to another app after the partial write is committed. 568 * 569 * Note: we don't call zfs_fuid_map_id() here because 570 * user 0 is not an ephemeral uid. 571 */ 572 mutex_enter(&zp->z_acl_lock); 573 if ((zp->z_mode & (S_IXUSR | (S_IXUSR >> 3) | (S_IXUSR >> 6))) != 0 && 574 (zp->z_mode & (S_ISUID | S_ISGID)) != 0 && 575 secpolicy_vnode_setid_retain(zp, cr, 576 ((zp->z_mode & S_ISUID) != 0 && uid == 0)) != 0) { 577 uint64_t newmode; 578 579 zp->z_mode &= ~(S_ISUID | S_ISGID); 580 newmode = zp->z_mode; 581 (void) sa_update(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs), 582 (void *)&newmode, sizeof (uint64_t), tx); 583 584 mutex_exit(&zp->z_acl_lock); 585 586 /* 587 * Make sure SUID/SGID bits will be removed when we replay the 588 * log. If the setid bits are keep coming back, don't log more 589 * than one TX_SETATTR per transaction group. 590 */ 591 if (*clear_setid_bits_txgp != dmu_tx_get_txg(tx)) { 592 vattr_t va = {0}; 593 594 va.va_mask = ATTR_MODE; 595 va.va_nodeid = zp->z_id; 596 va.va_mode = newmode; 597 zfs_log_setattr(zilog, tx, TX_SETATTR, zp, &va, 598 ATTR_MODE, NULL); 599 *clear_setid_bits_txgp = dmu_tx_get_txg(tx); 600 } 601 } else { 602 mutex_exit(&zp->z_acl_lock); 603 } 604 } 605 606 /* 607 * Write the bytes to a file. 608 * 609 * IN: zp - znode of file to be written to. 610 * uio - structure supplying write location, range info, 611 * and data buffer. 612 * ioflag - O_APPEND flag set if in append mode. 613 * O_DIRECT flag; used to bypass page cache. 614 * cr - credentials of caller. 615 * 616 * OUT: uio - updated offset and range. 617 * 618 * RETURN: 0 if success 619 * error code if failure 620 * 621 * Timestamps: 622 * ip - ctime|mtime updated if byte count > 0 623 */ 624 int 625 zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr) 626 { 627 int error = 0, error1; 628 ssize_t start_resid = zfs_uio_resid(uio); 629 uint64_t clear_setid_bits_txg = 0; 630 boolean_t o_direct_defer = B_FALSE; 631 632 /* 633 * Fasttrack empty write 634 */ 635 ssize_t n = start_resid; 636 if (n == 0) 637 return (0); 638 639 zfsvfs_t *zfsvfs = ZTOZSB(zp); 640 if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) 641 return (error); 642 643 sa_bulk_attr_t bulk[4]; 644 int count = 0; 645 uint64_t mtime[2], ctime[2]; 646 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); 647 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); 648 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL, 649 &zp->z_size, 8); 650 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, 651 &zp->z_pflags, 8); 652 653 /* 654 * Callers might not be able to detect properly that we are read-only, 655 * so check it explicitly here. 656 */ 657 if (zfs_is_readonly(zfsvfs)) { 658 zfs_exit(zfsvfs, FTAG); 659 return (SET_ERROR(EROFS)); 660 } 661 662 /* 663 * If immutable or not appending then return EPERM. 664 * Intentionally allow ZFS_READONLY through here. 665 * See zfs_zaccess_common() 666 */ 667 if ((zp->z_pflags & ZFS_IMMUTABLE) || 668 ((zp->z_pflags & ZFS_APPENDONLY) && !(ioflag & O_APPEND) && 669 (zfs_uio_offset(uio) < zp->z_size))) { 670 zfs_exit(zfsvfs, FTAG); 671 return (SET_ERROR(EPERM)); 672 } 673 674 /* 675 * Validate file offset 676 */ 677 offset_t woff = ioflag & O_APPEND ? zp->z_size : zfs_uio_offset(uio); 678 if (woff < 0) { 679 zfs_exit(zfsvfs, FTAG); 680 return (SET_ERROR(EINVAL)); 681 } 682 683 /* 684 * Setting up Direct I/O if requested. 
	 */
	error = zfs_setup_direct(zp, uio, UIO_WRITE, &ioflag);
	if (error) {
		zfs_exit(zfsvfs, FTAG);
		return (SET_ERROR(error));
	}

	/*
	 * Pre-fault the pages to ensure slow (e.g., NFS) pages
	 * don't hold up txg.
	 */
	ssize_t pfbytes = MIN(n, DMU_MAX_ACCESS >> 1);
	if (zfs_uio_prefaultpages(pfbytes, uio)) {
		zfs_exit(zfsvfs, FTAG);
		return (SET_ERROR(EFAULT));
	}

	/*
	 * If in append mode, set the io offset pointer to eof.
	 */
	zfs_locked_range_t *lr;
	if (ioflag & O_APPEND) {
		/*
		 * Obtain an appending range lock to guarantee file append
		 * semantics.  We reset the write offset once we have the lock.
		 */
		lr = zfs_rangelock_enter(&zp->z_rangelock, 0, n, RL_APPEND);
		woff = lr->lr_offset;
		if (lr->lr_length == UINT64_MAX) {
			/*
			 * We overlocked the file because this write will cause
			 * the file block size to increase.
			 * Note that zp_size cannot change with this lock held.
			 */
			woff = zp->z_size;
		}
		zfs_uio_setoffset(uio, woff);
		/*
		 * We need to update the starting offset as well because it is
		 * set previously in the ZPL (Linux) and VNOPS (FreeBSD)
		 * layers.
		 */
		zfs_uio_setsoffset(uio, woff);
	} else {
		/*
		 * Note that if the file block size will change as a result of
		 * this write, then this range lock will lock the entire file
		 * so that we can re-write the block safely.
		 */
		lr = zfs_rangelock_enter(&zp->z_rangelock, woff, n, RL_WRITER);
	}

	if (zn_rlimit_fsize_uio(zp, uio)) {
		zfs_rangelock_exit(lr);
		zfs_exit(zfsvfs, FTAG);
		return (SET_ERROR(EFBIG));
	}

	const rlim64_t limit = MAXOFFSET_T;

	if (woff >= limit) {
		zfs_rangelock_exit(lr);
		zfs_exit(zfsvfs, FTAG);
		return (SET_ERROR(EFBIG));
	}

	if (n > limit - woff)
		n = limit - woff;

	uint64_t end_size = MAX(zp->z_size, woff + n);
	zilog_t *zilog = zfsvfs->z_log;
	boolean_t commit = (ioflag & (O_SYNC | O_DSYNC)) ||
	    (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS);

	const uint64_t uid = KUID_TO_SUID(ZTOUID(zp));
	const uint64_t gid = KGID_TO_SGID(ZTOGID(zp));
	const uint64_t projid = zp->z_projid;

	/*
	 * In the event we are increasing the file block size
	 * (lr_length == UINT64_MAX), we will direct the write to the ARC.
	 * Because zfs_grow_blocksize() will read from the ARC in order to
	 * grow the dbuf, we avoid doing Direct I/O here as that would cause
	 * data written to disk to be overwritten by data in the ARC during
	 * the sync phase. Besides writing data twice to disk, we also
	 * want to avoid consistency concerns between data in the ARC and
	 * on disk while growing the file's blocksize.
	 *
	 * We will only temporarily remove Direct I/O and put it back after
	 * we have grown the blocksize. We do this in the event a request
	 * is larger than max_blksz, so further requests to
	 * dmu_write_uio_dbuf() will still issue the requests using Direct
	 * I/O.
	 *
	 * As an example:
	 * The first block of the file is being written as a 4k request with
	 * a recordsize of 1K. The first 1K issued in the loop below will go
	 * through the ARC; however, the following three 1K requests will
	 * use Direct I/O.
	 */
	if (uio->uio_extflg & UIO_DIRECT && lr->lr_length == UINT64_MAX) {
		uio->uio_extflg &= ~UIO_DIRECT;
		o_direct_defer = B_TRUE;
	}

	/*
	 * Write the file in reasonable size chunks.
Each chunk is written 792 * in a separate transaction; this keeps the intent log records small 793 * and allows us to do more fine-grained space accounting. 794 */ 795 while (n > 0) { 796 woff = zfs_uio_offset(uio); 797 798 if (zfs_id_overblockquota(zfsvfs, DMU_USERUSED_OBJECT, uid) || 799 zfs_id_overblockquota(zfsvfs, DMU_GROUPUSED_OBJECT, gid) || 800 (projid != ZFS_DEFAULT_PROJID && 801 zfs_id_overblockquota(zfsvfs, DMU_PROJECTUSED_OBJECT, 802 projid))) { 803 error = SET_ERROR(EDQUOT); 804 break; 805 } 806 807 uint64_t blksz; 808 if (lr->lr_length == UINT64_MAX && zp->z_size <= zp->z_blksz) { 809 if (zp->z_blksz > zfsvfs->z_max_blksz && 810 !ISP2(zp->z_blksz)) { 811 /* 812 * File's blocksize is already larger than the 813 * "recordsize" property. Only let it grow to 814 * the next power of 2. 815 */ 816 blksz = 1 << highbit64(zp->z_blksz); 817 } else { 818 blksz = zfsvfs->z_max_blksz; 819 } 820 blksz = MIN(blksz, P2ROUNDUP(end_size, 821 SPA_MINBLOCKSIZE)); 822 blksz = MAX(blksz, zp->z_blksz); 823 } else { 824 blksz = zp->z_blksz; 825 } 826 827 arc_buf_t *abuf = NULL; 828 ssize_t nbytes = n; 829 if (n >= blksz && woff >= zp->z_size && 830 P2PHASE(woff, blksz) == 0 && 831 !(uio->uio_extflg & UIO_DIRECT) && 832 (blksz >= SPA_OLD_MAXBLOCKSIZE || n < 4 * blksz)) { 833 /* 834 * This write covers a full block. "Borrow" a buffer 835 * from the dmu so that we can fill it before we enter 836 * a transaction. This avoids the possibility of 837 * holding up the transaction if the data copy hangs 838 * up on a pagefault (e.g., from an NFS server mapping). 839 */ 840 abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl), 841 blksz); 842 ASSERT(abuf != NULL); 843 ASSERT(arc_buf_size(abuf) == blksz); 844 if ((error = zfs_uiocopy(abuf->b_data, blksz, 845 UIO_WRITE, uio, &nbytes))) { 846 dmu_return_arcbuf(abuf); 847 break; 848 } 849 ASSERT3S(nbytes, ==, blksz); 850 } else { 851 nbytes = MIN(n, (DMU_MAX_ACCESS >> 1) - 852 P2PHASE(woff, blksz)); 853 if (pfbytes < nbytes) { 854 if (zfs_uio_prefaultpages(nbytes, uio)) { 855 error = SET_ERROR(EFAULT); 856 break; 857 } 858 pfbytes = nbytes; 859 } 860 } 861 862 /* 863 * Start a transaction. 864 */ 865 dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os); 866 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); 867 dmu_buf_impl_t *db = (dmu_buf_impl_t *)sa_get_db(zp->z_sa_hdl); 868 DB_DNODE_ENTER(db); 869 dmu_tx_hold_write_by_dnode(tx, DB_DNODE(db), woff, nbytes); 870 DB_DNODE_EXIT(db); 871 zfs_sa_upgrade_txholds(tx, zp); 872 error = dmu_tx_assign(tx, DMU_TX_WAIT); 873 if (error) { 874 dmu_tx_abort(tx); 875 if (abuf != NULL) 876 dmu_return_arcbuf(abuf); 877 break; 878 } 879 880 /* 881 * NB: We must call zfs_clear_setid_bits_if_necessary before 882 * committing the transaction! 883 */ 884 885 /* 886 * If rangelock_enter() over-locked we grow the blocksize 887 * and then reduce the lock range. This will only happen 888 * on the first iteration since rangelock_reduce() will 889 * shrink down lr_length to the appropriate size. 
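		 *
		 * For example (illustrative numbers): the first 5000-byte
		 * write to a new file on a dataset with recordsize=128K
		 * computes blksz = MIN(128K, P2ROUNDUP(5000, 512)) = 5120
		 * above, grows the lone block to 5120 bytes here, and then
		 * shrinks the lock back down to just this write's range.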
		 */
		if (lr->lr_length == UINT64_MAX) {
			zfs_grow_blocksize(zp, blksz, tx);
			zfs_rangelock_reduce(lr, woff, n);
		}

		dmu_flags_t dflags = DMU_READ_PREFETCH;
		if (ioflag & O_DIRECT)
			dflags |= DMU_UNCACHEDIO;
		if (uio->uio_extflg & UIO_DIRECT)
			dflags |= DMU_DIRECTIO;

		ssize_t tx_bytes;
		if (abuf == NULL) {
			tx_bytes = zfs_uio_resid(uio);
			zfs_uio_fault_disable(uio, B_TRUE);
			error = dmu_write_uio_dbuf(sa_get_db(zp->z_sa_hdl),
			    uio, nbytes, tx, dflags);
			zfs_uio_fault_disable(uio, B_FALSE);
#ifdef __linux__
			if (error == EFAULT) {
				zfs_clear_setid_bits_if_necessary(zfsvfs, zp,
				    cr, &clear_setid_bits_txg, tx);
				dmu_tx_commit(tx);
				/*
				 * Account for partial writes before
				 * continuing the loop.
				 * Update needs to occur before the next
				 * zfs_uio_prefaultpages, or prefaultpages may
				 * error, and we may break the loop early.
				 */
				n -= tx_bytes - zfs_uio_resid(uio);
				pfbytes -= tx_bytes - zfs_uio_resid(uio);
				continue;
			}
#endif
			/*
			 * On FreeBSD, EFAULT should be propagated back to the
			 * VFS, which will handle faulting and will retry.
			 */
			if (error != 0 && error != EFAULT) {
				zfs_clear_setid_bits_if_necessary(zfsvfs, zp,
				    cr, &clear_setid_bits_txg, tx);
				dmu_tx_commit(tx);
				break;
			}
			tx_bytes -= zfs_uio_resid(uio);
		} else {
			/*
			 * In this case we're writing a full block at a
			 * block-aligned offset and extending the file past
			 * EOF.
			 *
			 * dmu_assign_arcbuf_by_dbuf() will directly assign the
			 * arc buffer to a dbuf.
			 */
			error = dmu_assign_arcbuf_by_dbuf(
			    sa_get_db(zp->z_sa_hdl), woff, abuf, tx, dflags);
			if (error != 0) {
				/*
				 * XXX This might not be necessary if
				 * dmu_assign_arcbuf_by_dbuf is guaranteed
				 * to be atomic.
				 */
				zfs_clear_setid_bits_if_necessary(zfsvfs, zp,
				    cr, &clear_setid_bits_txg, tx);
				dmu_return_arcbuf(abuf);
				dmu_tx_commit(tx);
				break;
			}
			ASSERT3S(nbytes, <=, zfs_uio_resid(uio));
			zfs_uioskip(uio, nbytes);
			tx_bytes = nbytes;
		}
		/*
		 * There is a window where a file's pages can be mmap'ed after
		 * zfs_setup_direct() is called. This is due to the fact that
		 * the rangelock in this function is acquired after calling
		 * zfs_setup_direct(). This is done so that
		 * zfs_uio_prefaultpages() does not attempt to fault in pages
		 * on Linux for Direct I/O requests. This is not necessary as
		 * the pages are pinned in memory and cannot be faulted out.
		 * Ideally, the rangelock would be held before calling
		 * zfs_setup_direct() and zfs_uio_prefaultpages(); however,
		 * this can lead to a deadlock as zfs_getpage() also acquires
		 * the rangelock as a RL_WRITER and prefaulting the pages can
		 * lead to zfs_getpage() being called.
		 *
		 * In the case of the pages being mapped after
		 * zfs_setup_direct() is called, the call to update_pages()
		 * will still be made to make sure there is consistency between
		 * the ARC and the Linux page cache. This is an unfortunate
		 * situation as the data will be read back into the ARC after
		 * the Direct I/O write has completed, but this is the penalty
		 * for writing to a mmap'ed region of a file using Direct I/O.
		 */
		if (tx_bytes &&
		    zn_has_cached_data(zp, woff, woff + tx_bytes - 1)) {
			update_pages(zp, woff, tx_bytes, zfsvfs->z_os);
		}

		/*
		 * If we made no progress, we're done.  If we made even
		 * partial progress, update the znode and ZIL accordingly.
993 */ 994 if (tx_bytes == 0) { 995 (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs), 996 (void *)&zp->z_size, sizeof (uint64_t), tx); 997 dmu_tx_commit(tx); 998 ASSERT(error != 0); 999 break; 1000 } 1001 1002 zfs_clear_setid_bits_if_necessary(zfsvfs, zp, cr, 1003 &clear_setid_bits_txg, tx); 1004 1005 zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime); 1006 1007 /* 1008 * Update the file size (zp_size) if it has changed; 1009 * account for possible concurrent updates. 1010 */ 1011 while ((end_size = zp->z_size) < zfs_uio_offset(uio)) { 1012 (void) atomic_cas_64(&zp->z_size, end_size, 1013 zfs_uio_offset(uio)); 1014 ASSERT(error == 0 || error == EFAULT); 1015 } 1016 /* 1017 * If we are replaying and eof is non zero then force 1018 * the file size to the specified eof. Note, there's no 1019 * concurrency during replay. 1020 */ 1021 if (zfsvfs->z_replay && zfsvfs->z_replay_eof != 0) 1022 zp->z_size = zfsvfs->z_replay_eof; 1023 1024 error1 = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); 1025 if (error1 != 0) 1026 /* Avoid clobbering EFAULT. */ 1027 error = error1; 1028 1029 /* 1030 * NB: During replay, the TX_SETATTR record logged by 1031 * zfs_clear_setid_bits_if_necessary must precede any of 1032 * the TX_WRITE records logged here. 1033 */ 1034 zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, commit, 1035 uio->uio_extflg & UIO_DIRECT ? B_TRUE : B_FALSE, NULL, 1036 NULL); 1037 1038 dmu_tx_commit(tx); 1039 1040 /* 1041 * Direct I/O was deferred in order to grow the first block. 1042 * At this point it can be re-enabled for subsequent writes. 1043 */ 1044 if (o_direct_defer) { 1045 ASSERT(ioflag & O_DIRECT); 1046 uio->uio_extflg |= UIO_DIRECT; 1047 o_direct_defer = B_FALSE; 1048 } 1049 1050 if (error != 0) 1051 break; 1052 ASSERT3S(tx_bytes, ==, nbytes); 1053 n -= nbytes; 1054 pfbytes -= nbytes; 1055 } 1056 1057 if (o_direct_defer) { 1058 ASSERT(ioflag & O_DIRECT); 1059 uio->uio_extflg |= UIO_DIRECT; 1060 o_direct_defer = B_FALSE; 1061 } 1062 1063 zfs_znode_update_vfs(zp); 1064 zfs_rangelock_exit(lr); 1065 1066 /* 1067 * Cleanup for Direct I/O if requested. 1068 */ 1069 if (uio->uio_extflg & UIO_DIRECT) 1070 zfs_uio_free_dio_pages(uio, UIO_WRITE); 1071 1072 /* 1073 * If we're in replay mode, or we made no progress, or the 1074 * uio data is inaccessible return an error. Otherwise, it's 1075 * at least a partial write, so it's successful. 1076 */ 1077 if (zfsvfs->z_replay || zfs_uio_resid(uio) == start_resid || 1078 error == EFAULT) { 1079 zfs_exit(zfsvfs, FTAG); 1080 return (error); 1081 } 1082 1083 if (commit) { 1084 error = zil_commit(zilog, zp->z_id); 1085 if (error != 0) { 1086 zfs_exit(zfsvfs, FTAG); 1087 return (error); 1088 } 1089 } 1090 1091 int64_t nwritten = start_resid - zfs_uio_resid(uio); 1092 dataset_kstats_update_write_kstats(&zfsvfs->z_kstat, nwritten); 1093 1094 zfs_exit(zfsvfs, FTAG); 1095 return (0); 1096 } 1097 1098 /* 1099 * Rewrite a range of file as-is without modification. 1100 * 1101 * IN: zp - znode of file to be rewritten. 1102 * off - Offset of the range to rewrite. 1103 * len - Length of the range to rewrite. 1104 * flags - Random rewrite parameters. 1105 * arg - flags-specific argument. 
1106 * 1107 * RETURN: 0 if success 1108 * error code if failure 1109 */ 1110 int 1111 zfs_rewrite(znode_t *zp, uint64_t off, uint64_t len, uint64_t flags, 1112 uint64_t arg) 1113 { 1114 int error; 1115 1116 if ((flags & ~ZFS_REWRITE_PHYSICAL) != 0 || arg != 0) 1117 return (SET_ERROR(EINVAL)); 1118 1119 zfsvfs_t *zfsvfs = ZTOZSB(zp); 1120 if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) 1121 return (error); 1122 1123 /* Check if physical rewrite is allowed */ 1124 spa_t *spa = zfsvfs->z_os->os_spa; 1125 if ((flags & ZFS_REWRITE_PHYSICAL) && 1126 !spa_feature_is_enabled(spa, SPA_FEATURE_PHYSICAL_REWRITE)) { 1127 zfs_exit(zfsvfs, FTAG); 1128 return (SET_ERROR(ENOTSUP)); 1129 } 1130 1131 if (zfs_is_readonly(zfsvfs)) { 1132 zfs_exit(zfsvfs, FTAG); 1133 return (SET_ERROR(EROFS)); 1134 } 1135 1136 if (off >= zp->z_size) { 1137 zfs_exit(zfsvfs, FTAG); 1138 return (0); 1139 } 1140 if (len == 0 || len > zp->z_size - off) 1141 len = zp->z_size - off; 1142 1143 /* Flush any mmap()'d data to disk */ 1144 if (zn_has_cached_data(zp, off, off + len - 1)) 1145 zn_flush_cached_data(zp, B_TRUE); 1146 1147 zfs_locked_range_t *lr; 1148 lr = zfs_rangelock_enter(&zp->z_rangelock, off, len, RL_WRITER); 1149 1150 const uint64_t uid = KUID_TO_SUID(ZTOUID(zp)); 1151 const uint64_t gid = KGID_TO_SGID(ZTOGID(zp)); 1152 const uint64_t projid = zp->z_projid; 1153 1154 dmu_buf_impl_t *db = (dmu_buf_impl_t *)sa_get_db(zp->z_sa_hdl); 1155 DB_DNODE_ENTER(db); 1156 dnode_t *dn = DB_DNODE(db); 1157 1158 uint64_t n, noff = off, nr = 0, nw = 0; 1159 while (len > 0) { 1160 /* 1161 * Rewrite only actual data, skipping any holes. This might 1162 * be inaccurate for dirty files, but we don't really care. 1163 */ 1164 if (noff == off) { 1165 /* Find next data in the file. */ 1166 error = dnode_next_offset(dn, 0, &noff, 1, 1, 0); 1167 if (error || noff >= off + len) { 1168 if (error == ESRCH) /* No more data. */ 1169 error = 0; 1170 break; 1171 } 1172 ASSERT3U(noff, >=, off); 1173 len -= noff - off; 1174 off = noff; 1175 1176 /* Find where the data end. */ 1177 error = dnode_next_offset(dn, DNODE_FIND_HOLE, &noff, 1178 1, 1, 0); 1179 if (error != 0) 1180 noff = off + len; 1181 } 1182 ASSERT3U(noff, >, off); 1183 1184 if (zfs_id_overblockquota(zfsvfs, DMU_USERUSED_OBJECT, uid) || 1185 zfs_id_overblockquota(zfsvfs, DMU_GROUPUSED_OBJECT, gid) || 1186 (projid != ZFS_DEFAULT_PROJID && 1187 zfs_id_overblockquota(zfsvfs, DMU_PROJECTUSED_OBJECT, 1188 projid))) { 1189 error = SET_ERROR(EDQUOT); 1190 break; 1191 } 1192 1193 n = MIN(MIN(len, noff - off), 1194 DMU_MAX_ACCESS / 2 - P2PHASE(off, zp->z_blksz)); 1195 1196 dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os); 1197 dmu_tx_hold_write_by_dnode(tx, dn, off, n); 1198 error = dmu_tx_assign(tx, DMU_TX_WAIT); 1199 if (error) { 1200 dmu_tx_abort(tx); 1201 break; 1202 } 1203 1204 /* Mark all dbufs within range as dirty to trigger rewrite. 
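		 * Buffers that are already dirty are counted in nr but left
		 * alone: they are queued in an open TXG and will be written
		 * out (and reallocated) regardless, so only newly dirtied
		 * buffers are charged to the write kstats via nw below.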
		 */
		dmu_buf_t **dbp;
		int numbufs;
		error = dmu_buf_hold_array_by_dnode(dn, off, n, TRUE, FTAG,
		    &numbufs, &dbp, DMU_READ_PREFETCH | DMU_UNCACHEDIO);
		if (error) {
			dmu_tx_commit(tx);
			break;
		}
		for (int i = 0; i < numbufs; i++) {
			nr += dbp[i]->db_size;
			if (dmu_buf_is_dirty(dbp[i], tx))
				continue;
			nw += dbp[i]->db_size;
			if (flags & ZFS_REWRITE_PHYSICAL)
				dmu_buf_will_rewrite(dbp[i], tx);
			else
				dmu_buf_will_dirty(dbp[i], tx);
		}
		dmu_buf_rele_array(dbp, numbufs, FTAG);

		dmu_tx_commit(tx);

		len -= n;
		off += n;

		if (issig()) {
			error = SET_ERROR(EINTR);
			break;
		}
	}

	DB_DNODE_EXIT(db);

	dataset_kstats_update_read_kstats(&zfsvfs->z_kstat, nr);
	dataset_kstats_update_write_kstats(&zfsvfs->z_kstat, nw);

	zfs_rangelock_exit(lr);
	zfs_exit(zfsvfs, FTAG);
	return (error);
}

int
zfs_getsecattr(znode_t *zp, vsecattr_t *vsecp, int flag, cred_t *cr)
{
	zfsvfs_t *zfsvfs = ZTOZSB(zp);
	int error;
	boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;

	if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
		return (error);
	error = zfs_getacl(zp, vsecp, skipaclchk, cr);
	zfs_exit(zfsvfs, FTAG);

	return (error);
}

int
zfs_setsecattr(znode_t *zp, vsecattr_t *vsecp, int flag, cred_t *cr)
{
	zfsvfs_t *zfsvfs = ZTOZSB(zp);
	int error;
	boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
	zilog_t *zilog;

	if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
		return (error);
	zilog = zfsvfs->z_log;
	error = zfs_setacl(zp, vsecp, skipaclchk, cr);

	if (error == 0 && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
		error = zil_commit(zilog, 0);

	zfs_exit(zfsvfs, FTAG);
	return (error);
}

/*
 * Get the optimal alignment to ensure direct IO can be performed without
 * incurring any RMW penalty on write. If direct IO is not enabled for this
 * file, returns an error.
 */
int
zfs_get_direct_alignment(znode_t *zp, uint64_t *alignp)
{
	zfsvfs_t *zfsvfs = ZTOZSB(zp);

	if (!zfs_dio_enabled || zfsvfs->z_os->os_direct == ZFS_DIRECT_DISABLED)
		return (SET_ERROR(EOPNOTSUPP));

	/*
	 * If the file has multiple blocks, then its block size is fixed
	 * forever, and so is the ideal alignment.
	 *
	 * If however it only has a single block, then we want to return the
	 * max block size it could possibly grow to (ie, the dataset
	 * recordsize). We do this so that a program querying alignment
	 * immediately after the file is created gets a value that won't change
	 * once the file has grown into the second block and beyond.
	 *
	 * Because we don't have a count of blocks easily available here, we
	 * check if the apparent file size is smaller than its current block
	 * size (meaning, the file hasn't yet grown into the current block
	 * size) and then check if the block size is smaller than the dataset
	 * maximum (meaning, if the file grew past the current block size, the
	 * block size would be increased).
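	 *
	 * A userspace consumer would typically discover this value through
	 * statx(2) with STATX_DIOALIGN (assuming the platform glue exposes
	 * this function's result there; the statx fields themselves are
	 * standard on Linux 6.1+), e.g.
	 *
	 *	struct statx sx;
	 *	statx(fd, "", AT_EMPTY_PATH, STATX_DIOALIGN, &sx);
	 *	buffer alignment:  sx.stx_dio_mem_align
	 *	offset alignment:  sx.stx_dio_offset_align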
1310 */ 1311 if (zp->z_size <= zp->z_blksz && zp->z_blksz < zfsvfs->z_max_blksz) 1312 *alignp = MAX(zfsvfs->z_max_blksz, PAGE_SIZE); 1313 else 1314 *alignp = MAX(zp->z_blksz, PAGE_SIZE); 1315 1316 return (0); 1317 } 1318 1319 #ifdef ZFS_DEBUG 1320 static int zil_fault_io = 0; 1321 #endif 1322 1323 static void zfs_get_done(zgd_t *zgd, int error); 1324 1325 /* 1326 * Get data to generate a TX_WRITE intent log record. 1327 */ 1328 int 1329 zfs_get_data(void *arg, uint64_t gen, lr_write_t *lr, char *buf, 1330 struct lwb *lwb, zio_t *zio) 1331 { 1332 zfsvfs_t *zfsvfs = arg; 1333 objset_t *os = zfsvfs->z_os; 1334 znode_t *zp; 1335 uint64_t object = lr->lr_foid; 1336 uint64_t offset = lr->lr_offset; 1337 uint64_t size = lr->lr_length; 1338 zgd_t *zgd; 1339 int error = 0; 1340 uint64_t zp_gen; 1341 1342 ASSERT3P(lwb, !=, NULL); 1343 ASSERT3U(size, !=, 0); 1344 1345 /* 1346 * Nothing to do if the file has been removed 1347 */ 1348 if (zfs_zget(zfsvfs, object, &zp) != 0) 1349 return (SET_ERROR(ENOENT)); 1350 if (zp->z_unlinked) { 1351 /* 1352 * Release the vnode asynchronously as we currently have the 1353 * txg stopped from syncing. 1354 */ 1355 zfs_zrele_async(zp); 1356 return (SET_ERROR(ENOENT)); 1357 } 1358 /* check if generation number matches */ 1359 if (sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs), &zp_gen, 1360 sizeof (zp_gen)) != 0) { 1361 zfs_zrele_async(zp); 1362 return (SET_ERROR(EIO)); 1363 } 1364 if (zp_gen != gen) { 1365 zfs_zrele_async(zp); 1366 return (SET_ERROR(ENOENT)); 1367 } 1368 1369 zgd = kmem_zalloc(sizeof (zgd_t), KM_SLEEP); 1370 zgd->zgd_lwb = lwb; 1371 zgd->zgd_private = zp; 1372 1373 /* 1374 * Write records come in two flavors: immediate and indirect. 1375 * For small writes it's cheaper to store the data with the 1376 * log record (immediate); for large writes it's cheaper to 1377 * sync the data and get a pointer to it (indirect) so that 1378 * we don't have to write the data twice. 1379 */ 1380 if (buf != NULL) { /* immediate write */ 1381 zgd->zgd_lr = zfs_rangelock_enter(&zp->z_rangelock, offset, 1382 size, RL_READER); 1383 /* test for truncation needs to be done while range locked */ 1384 if (offset >= zp->z_size) { 1385 error = SET_ERROR(ENOENT); 1386 } else { 1387 error = dmu_read(os, object, offset, size, buf, 1388 DMU_READ_NO_PREFETCH | DMU_KEEP_CACHING); 1389 } 1390 ASSERT(error == 0 || error == ENOENT); 1391 } else { /* indirect write */ 1392 ASSERT3P(zio, !=, NULL); 1393 /* 1394 * Have to lock the whole block to ensure when it's 1395 * written out and its checksum is being calculated 1396 * that no one can change the data. We need to re-check 1397 * blocksize after we get the lock in case it's changed! 1398 */ 1399 for (;;) { 1400 uint64_t blkoff; 1401 size = zp->z_blksz; 1402 blkoff = ISP2(size) ? 
P2PHASE(offset, size) : offset; 1403 offset -= blkoff; 1404 zgd->zgd_lr = zfs_rangelock_enter(&zp->z_rangelock, 1405 offset, size, RL_READER); 1406 if (zp->z_blksz == size) 1407 break; 1408 offset += blkoff; 1409 zfs_rangelock_exit(zgd->zgd_lr); 1410 } 1411 /* test for truncation needs to be done while range locked */ 1412 if (lr->lr_offset >= zp->z_size) 1413 error = SET_ERROR(ENOENT); 1414 #ifdef ZFS_DEBUG 1415 if (zil_fault_io) { 1416 error = SET_ERROR(EIO); 1417 zil_fault_io = 0; 1418 } 1419 #endif 1420 1421 dmu_buf_t *dbp; 1422 if (error == 0) 1423 error = dmu_buf_hold_noread(os, object, offset, zgd, 1424 &dbp); 1425 1426 if (error == 0) { 1427 zgd->zgd_db = dbp; 1428 dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbp; 1429 boolean_t direct_write = B_FALSE; 1430 mutex_enter(&db->db_mtx); 1431 dbuf_dirty_record_t *dr = 1432 dbuf_find_dirty_eq(db, lr->lr_common.lrc_txg); 1433 if (dr != NULL && dr->dt.dl.dr_diowrite) 1434 direct_write = B_TRUE; 1435 mutex_exit(&db->db_mtx); 1436 1437 /* 1438 * All Direct I/O writes will have already completed and 1439 * the block pointer can be immediately stored in the 1440 * log record. 1441 */ 1442 if (direct_write) { 1443 /* 1444 * A Direct I/O write always covers an entire 1445 * block. 1446 */ 1447 ASSERT3U(dbp->db_size, ==, zp->z_blksz); 1448 lr->lr_blkptr = dr->dt.dl.dr_overridden_by; 1449 zfs_get_done(zgd, 0); 1450 return (0); 1451 } 1452 1453 blkptr_t *bp = &lr->lr_blkptr; 1454 zgd->zgd_bp = bp; 1455 1456 ASSERT3U(dbp->db_offset, ==, offset); 1457 ASSERT3U(dbp->db_size, ==, size); 1458 1459 error = dmu_sync(zio, lr->lr_common.lrc_txg, 1460 zfs_get_done, zgd); 1461 ASSERT(error || lr->lr_length <= size); 1462 1463 /* 1464 * On success, we need to wait for the write I/O 1465 * initiated by dmu_sync() to complete before we can 1466 * release this dbuf. We will finish everything up 1467 * in the zfs_get_done() callback. 1468 */ 1469 if (error == 0) 1470 return (0); 1471 1472 if (error == EALREADY) { 1473 lr->lr_common.lrc_txtype = TX_WRITE2; 1474 /* 1475 * TX_WRITE2 relies on the data previously 1476 * written by the TX_WRITE that caused 1477 * EALREADY. We zero out the BP because 1478 * it is the old, currently-on-disk BP. 1479 */ 1480 zgd->zgd_bp = NULL; 1481 BP_ZERO(bp); 1482 error = 0; 1483 } 1484 } 1485 } 1486 1487 zfs_get_done(zgd, error); 1488 1489 return (error); 1490 } 1491 1492 static void 1493 zfs_get_done(zgd_t *zgd, int error) 1494 { 1495 (void) error; 1496 znode_t *zp = zgd->zgd_private; 1497 1498 if (zgd->zgd_db) 1499 dmu_buf_rele(zgd->zgd_db, zgd); 1500 1501 zfs_rangelock_exit(zgd->zgd_lr); 1502 1503 /* 1504 * Release the vnode asynchronously as we currently have the 1505 * txg stopped from syncing. 1506 */ 1507 zfs_zrele_async(zp); 1508 1509 kmem_free(zgd, sizeof (zgd_t)); 1510 } 1511 1512 static int 1513 zfs_enter_two(zfsvfs_t *zfsvfs1, zfsvfs_t *zfsvfs2, const char *tag) 1514 { 1515 int error; 1516 1517 /* Swap. Not sure if the order of zfs_enter()s is important. 
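	 * Ordering the two zfs_enter() calls by zfsvfs address at least gives
	 * every cross-dataset caller the same acquisition order, which is
	 * cheap insurance even if the order turns out not to matter.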
	 */
	if (zfsvfs1 > zfsvfs2) {
		zfsvfs_t *tmpzfsvfs;

		tmpzfsvfs = zfsvfs2;
		zfsvfs2 = zfsvfs1;
		zfsvfs1 = tmpzfsvfs;
	}

	error = zfs_enter(zfsvfs1, tag);
	if (error != 0)
		return (error);
	if (zfsvfs1 != zfsvfs2) {
		error = zfs_enter(zfsvfs2, tag);
		if (error != 0) {
			zfs_exit(zfsvfs1, tag);
			return (error);
		}
	}

	return (0);
}

static void
zfs_exit_two(zfsvfs_t *zfsvfs1, zfsvfs_t *zfsvfs2, const char *tag)
{

	zfs_exit(zfsvfs1, tag);
	if (zfsvfs1 != zfsvfs2)
		zfs_exit(zfsvfs2, tag);
}

/*
 * We split each clone request into chunks that can fit into a single ZIL
 * log entry. Each ZIL log entry can fit 130816 bytes for a block cloning
 * operation (see zil_max_log_data() and zfs_log_clone_range()). This gives
 * us room for storing 1022 block pointers (128 bytes each).
 *
 * On success, the function returns the number of bytes copied in *lenp.
 * Note that it does not return how many bytes are left to be copied.
 * Errors caused by file system or BRT limitations are returned as EINVAL.
 * In most cases this means the caller passed parameters that do not meet
 * the cloning requirements, even though the file could be cloned with
 * different parameters.
 */
int
zfs_clone_range(znode_t *inzp, uint64_t *inoffp, znode_t *outzp,
    uint64_t *outoffp, uint64_t *lenp, cred_t *cr)
{
	zfsvfs_t *inzfsvfs, *outzfsvfs;
	objset_t *inos, *outos;
	zfs_locked_range_t *inlr, *outlr;
	dmu_buf_impl_t *db;
	dmu_tx_t *tx;
	zilog_t *zilog;
	uint64_t inoff, outoff, len, done;
	uint64_t outsize, size;
	int error;
	int count = 0;
	sa_bulk_attr_t bulk[3];
	uint64_t mtime[2], ctime[2];
	uint64_t uid, gid, projid;
	blkptr_t *bps;
	size_t maxblocks, nbps;
	uint_t inblksz;
	uint64_t clear_setid_bits_txg = 0;
	uint64_t last_synced_txg = 0;

	inoff = *inoffp;
	outoff = *outoffp;
	len = *lenp;
	done = 0;

	inzfsvfs = ZTOZSB(inzp);
	outzfsvfs = ZTOZSB(outzp);

	/*
	 * We need to call zfs_enter() potentially on two different datasets,
	 * so we need a dedicated function for that.
	 */
	error = zfs_enter_two(inzfsvfs, outzfsvfs, FTAG);
	if (error != 0)
		return (error);

	inos = inzfsvfs->z_os;
	outos = outzfsvfs->z_os;

	/*
	 * Both source and destination have to belong to the same storage pool.
	 */
	if (dmu_objset_spa(inos) != dmu_objset_spa(outos)) {
		zfs_exit_two(inzfsvfs, outzfsvfs, FTAG);
		return (SET_ERROR(EXDEV));
	}

	/*
	 * outos and inos belong to the same storage pool (checked just
	 * above), so checking the feature on one of them is enough.
	 */
	if (!spa_feature_is_enabled(dmu_objset_spa(outos),
	    SPA_FEATURE_BLOCK_CLONING)) {
		zfs_exit_two(inzfsvfs, outzfsvfs, FTAG);
		return (SET_ERROR(EOPNOTSUPP));
	}

	ASSERT(!outzfsvfs->z_replay);

	/*
	 * Block cloning from an unencrypted dataset into an encrypted
	 * dataset and vice versa is not supported.
	 */
	if (inos->os_encrypted != outos->os_encrypted) {
		zfs_exit_two(inzfsvfs, outzfsvfs, FTAG);
		return (SET_ERROR(EXDEV));
	}

	/*
	 * Cloning across encrypted datasets is possible only if they
	 * share the same master key.
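	 *
	 * For reference, requests typically reach this function from
	 * userspace via copy_file_range(2) or the FICLONE/FICLONERANGE
	 * ioctls (the exact routing is platform glue and an assumption
	 * here). An illustrative FICLONERANGE call cloning a whole source
	 * file:
	 *
	 *	struct file_clone_range fcr = {
	 *		.src_fd = in_fd,
	 *		.src_offset = 0,
	 *		.src_length = 0,	(0 = clone through EOF)
	 *		.dest_offset = 0,
	 *	};
	 *	ioctl(out_fd, FICLONERANGE, &fcr);
	 *
	 * Tools such as cp --reflink=auto fall back to an ordinary copy when
	 * this path fails, e.g. with the EXDEV cases rejected in this
	 * function.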
	 */
	if (inos != outos && inos->os_encrypted &&
	    !dmu_objset_crypto_key_equal(inos, outos)) {
		zfs_exit_two(inzfsvfs, outzfsvfs, FTAG);
		return (SET_ERROR(EXDEV));
	}

	error = zfs_verify_zp(inzp);
	if (error == 0)
		error = zfs_verify_zp(outzp);
	if (error != 0) {
		zfs_exit_two(inzfsvfs, outzfsvfs, FTAG);
		return (error);
	}

	/*
	 * We don't copy the source file's flags, which is why we don't allow
	 * cloning files that are in quarantine.
	 */
	if (inzp->z_pflags & ZFS_AV_QUARANTINED) {
		zfs_exit_two(inzfsvfs, outzfsvfs, FTAG);
		return (SET_ERROR(EACCES));
	}

	if (inoff >= inzp->z_size) {
		*lenp = 0;
		zfs_exit_two(inzfsvfs, outzfsvfs, FTAG);
		return (0);
	}
	if (len > inzp->z_size - inoff) {
		len = inzp->z_size - inoff;
	}
	if (len == 0) {
		*lenp = 0;
		zfs_exit_two(inzfsvfs, outzfsvfs, FTAG);
		return (0);
	}

	/*
	 * Callers might not be able to detect properly that we are read-only,
	 * so check it explicitly here.
	 */
	if (zfs_is_readonly(outzfsvfs)) {
		zfs_exit_two(inzfsvfs, outzfsvfs, FTAG);
		return (SET_ERROR(EROFS));
	}

	/*
	 * If immutable then return EPERM.
	 * Intentionally allow ZFS_READONLY through here.
	 * See zfs_zaccess_common()
	 */
	if ((outzp->z_pflags & ZFS_IMMUTABLE) != 0) {
		zfs_exit_two(inzfsvfs, outzfsvfs, FTAG);
		return (SET_ERROR(EPERM));
	}

	/*
	 * No overlapping if we are cloning within the same file.
	 */
	if (inzp == outzp) {
		if (inoff < outoff + len && outoff < inoff + len) {
			zfs_exit_two(inzfsvfs, outzfsvfs, FTAG);
			return (SET_ERROR(EINVAL));
		}
	}

	/* Flush any mmap()'d data to disk */
	if (zn_has_cached_data(inzp, inoff, inoff + len - 1))
		zn_flush_cached_data(inzp, B_TRUE);

	/*
	 * Maintain predictable lock order.
	 */
	if (inzp < outzp || (inzp == outzp && inoff < outoff)) {
		inlr = zfs_rangelock_enter(&inzp->z_rangelock, inoff, len,
		    RL_READER);
		outlr = zfs_rangelock_enter(&outzp->z_rangelock, outoff, len,
		    RL_WRITER);
	} else {
		outlr = zfs_rangelock_enter(&outzp->z_rangelock, outoff, len,
		    RL_WRITER);
		inlr = zfs_rangelock_enter(&inzp->z_rangelock, inoff, len,
		    RL_READER);
	}

	inblksz = inzp->z_blksz;

	/*
	 * We cannot clone into a file with a different block size if we can't
	 * grow it (block size is already bigger, has more than one block, or
	 * not locked for growth). There are other possible reasons for the
	 * grow to fail, but we cover what we can before opening the
	 * transaction and detect the rest after we try to do it.
	 */
	if (inblksz < outzp->z_blksz) {
		error = SET_ERROR(EINVAL);
		goto unlock;
	}
	if (inblksz != outzp->z_blksz && (outzp->z_size > outzp->z_blksz ||
	    outlr->lr_length != UINT64_MAX)) {
		error = SET_ERROR(EINVAL);
		goto unlock;
	}

	/*
	 * Block size must be power-of-2 if destination offset != 0.
	 * There cannot be multiple blocks of non-power-of-2 size.
	 */
	if (outoff != 0 && !ISP2(inblksz)) {
		error = SET_ERROR(EINVAL);
		goto unlock;
	}

	/*
	 * Offsets and len must be at block boundaries.
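	 *
	 * For example, with a 128K source block size, cloning with
	 * inoff = outoff = 0 and len a multiple of 128K (or running to EOF,
	 * see the next check) is accepted, while inoff = 64K is rejected
	 * with EINVAL below.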
	 */
	if ((inoff % inblksz) != 0 || (outoff % inblksz) != 0) {
		error = SET_ERROR(EINVAL);
		goto unlock;
	}
	/*
	 * Length must be a multiple of blksz, except for the end of the file.
	 */
	if ((len % inblksz) != 0 &&
	    (len < inzp->z_size - inoff || len < outzp->z_size - outoff)) {
		error = SET_ERROR(EINVAL);
		goto unlock;
	}

	/*
	 * If we are copying only one block and it is smaller than the
	 * recordsize property, do not allow the destination to grow beyond
	 * one block if it is not there yet. Otherwise the destination will
	 * get stuck with that block size forever, which can be as small as
	 * 512 bytes, no matter how big the destination grows later.
	 */
	if (len <= inblksz && inblksz < outzfsvfs->z_max_blksz &&
	    outzp->z_size <= inblksz && outoff + len > inblksz) {
		error = SET_ERROR(EINVAL);
		goto unlock;
	}

	error = zn_rlimit_fsize(outoff + len);
	if (error != 0) {
		goto unlock;
	}

	if (inoff >= MAXOFFSET_T || outoff >= MAXOFFSET_T) {
		error = SET_ERROR(EFBIG);
		goto unlock;
	}

	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(outzfsvfs), NULL,
	    &mtime, 16);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(outzfsvfs), NULL,
	    &ctime, 16);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(outzfsvfs), NULL,
	    &outzp->z_size, 8);

	zilog = outzfsvfs->z_log;
	maxblocks = zil_max_log_data(zilog, sizeof (lr_clone_range_t)) /
	    sizeof (bps[0]);

	uid = KUID_TO_SUID(ZTOUID(outzp));
	gid = KGID_TO_SGID(ZTOGID(outzp));
	projid = outzp->z_projid;

	bps = vmem_alloc(sizeof (bps[0]) * maxblocks, KM_SLEEP);

	/*
	 * Clone the file in reasonable size chunks. Each chunk is cloned
	 * in a separate transaction; this keeps the intent log records small
	 * and allows us to do more fine-grained space accounting.
	 */
	while (len > 0) {
		size = MIN(inblksz * maxblocks, len);

		if (zfs_id_overblockquota(outzfsvfs, DMU_USERUSED_OBJECT,
		    uid) ||
		    zfs_id_overblockquota(outzfsvfs, DMU_GROUPUSED_OBJECT,
		    gid) ||
		    (projid != ZFS_DEFAULT_PROJID &&
		    zfs_id_overblockquota(outzfsvfs, DMU_PROJECTUSED_OBJECT,
		    projid))) {
			error = SET_ERROR(EDQUOT);
			break;
		}

		nbps = maxblocks;
		last_synced_txg = spa_last_synced_txg(dmu_objset_spa(inos));
		error = dmu_read_l0_bps(inos, inzp->z_id, inoff, size, bps,
		    &nbps);
		if (error != 0) {
			/*
			 * If we are trying to clone a block that was created
			 * in the current transaction group, the error will be
			 * EAGAIN here. Based on zfs_bclone_wait_dirty, either
			 * return a shortened range to the caller so it can
			 * fall back, or wait for the next TXG and check again.
			 */
			if (error == EAGAIN && zfs_bclone_wait_dirty) {
				txg_wait_flag_t wait_flags =
				    spa_get_failmode(dmu_objset_spa(inos)) ==
				    ZIO_FAILURE_MODE_CONTINUE ?
				    TXG_WAIT_SUSPEND : 0;
				error = txg_wait_synced_flags(
				    dmu_objset_pool(inos), last_synced_txg + 1,
				    wait_flags);
				if (error == 0)
					continue;
				ASSERT3U(error, ==, ESHUTDOWN);
				error = SET_ERROR(EIO);
			}

			break;
		}

		/*
		 * Start a transaction.
1856 */ 1857 tx = dmu_tx_create(outos); 1858 dmu_tx_hold_sa(tx, outzp->z_sa_hdl, B_FALSE); 1859 db = (dmu_buf_impl_t *)sa_get_db(outzp->z_sa_hdl); 1860 DB_DNODE_ENTER(db); 1861 dmu_tx_hold_clone_by_dnode(tx, DB_DNODE(db), outoff, size, 1862 inblksz); 1863 DB_DNODE_EXIT(db); 1864 zfs_sa_upgrade_txholds(tx, outzp); 1865 error = dmu_tx_assign(tx, DMU_TX_WAIT); 1866 if (error != 0) { 1867 dmu_tx_abort(tx); 1868 break; 1869 } 1870 1871 /* 1872 * Copy source znode's block size. This is done only if the 1873 * whole znode is locked (see zfs_rangelock_cb()) and only 1874 * on the first iteration since zfs_rangelock_reduce() will 1875 * shrink down lr_length to the appropriate size. 1876 */ 1877 if (outlr->lr_length == UINT64_MAX) { 1878 zfs_grow_blocksize(outzp, inblksz, tx); 1879 1880 /* 1881 * Block growth may fail for many reasons we can not 1882 * predict here. If it happen the cloning is doomed. 1883 */ 1884 if (inblksz != outzp->z_blksz) { 1885 error = SET_ERROR(EINVAL); 1886 dmu_tx_commit(tx); 1887 break; 1888 } 1889 1890 /* 1891 * Round range lock up to the block boundary, so we 1892 * prevent appends until we are done. 1893 */ 1894 zfs_rangelock_reduce(outlr, outoff, 1895 ((len - 1) / inblksz + 1) * inblksz); 1896 } 1897 1898 error = dmu_brt_clone(outos, outzp->z_id, outoff, size, tx, 1899 bps, nbps); 1900 if (error != 0) { 1901 dmu_tx_commit(tx); 1902 break; 1903 } 1904 1905 if (zn_has_cached_data(outzp, outoff, outoff + size - 1)) { 1906 update_pages(outzp, outoff, size, outos); 1907 } 1908 1909 zfs_clear_setid_bits_if_necessary(outzfsvfs, outzp, cr, 1910 &clear_setid_bits_txg, tx); 1911 1912 zfs_tstamp_update_setup(outzp, CONTENT_MODIFIED, mtime, ctime); 1913 1914 /* 1915 * Update the file size (zp_size) if it has changed; 1916 * account for possible concurrent updates. 1917 */ 1918 while ((outsize = outzp->z_size) < outoff + size) { 1919 (void) atomic_cas_64(&outzp->z_size, outsize, 1920 outoff + size); 1921 } 1922 1923 error = sa_bulk_update(outzp->z_sa_hdl, bulk, count, tx); 1924 1925 zfs_log_clone_range(zilog, tx, TX_CLONE_RANGE, outzp, outoff, 1926 size, inblksz, bps, nbps); 1927 1928 dmu_tx_commit(tx); 1929 1930 if (error != 0) 1931 break; 1932 1933 inoff += size; 1934 outoff += size; 1935 len -= size; 1936 done += size; 1937 1938 if (issig()) { 1939 error = SET_ERROR(EINTR); 1940 break; 1941 } 1942 } 1943 1944 vmem_free(bps, sizeof (bps[0]) * maxblocks); 1945 zfs_znode_update_vfs(outzp); 1946 1947 unlock: 1948 zfs_rangelock_exit(outlr); 1949 zfs_rangelock_exit(inlr); 1950 1951 if (done > 0) { 1952 /* 1953 * If we have made at least partial progress, reset the error. 1954 */ 1955 error = 0; 1956 1957 ZFS_ACCESSTIME_STAMP(inzfsvfs, inzp); 1958 1959 if (outos->os_sync == ZFS_SYNC_ALWAYS) { 1960 error = zil_commit(zilog, outzp->z_id); 1961 } 1962 1963 *inoffp += done; 1964 *outoffp += done; 1965 *lenp = done; 1966 } else { 1967 /* 1968 * If we made no progress, there must be a good reason. 1969 * EOF is handled explicitly above, before the loop. 1970 */ 1971 ASSERT3S(error, !=, 0); 1972 } 1973 1974 zfs_exit_two(inzfsvfs, outzfsvfs, FTAG); 1975 1976 return (error); 1977 } 1978 1979 /* 1980 * Usual pattern would be to call zfs_clone_range() from zfs_replay_clone(), 1981 * but we cannot do that, because when replaying we don't have source znode 1982 * available. This is why we need a dedicated replay function. 
1983 */ 1984 int 1985 zfs_clone_range_replay(znode_t *zp, uint64_t off, uint64_t len, uint64_t blksz, 1986 const blkptr_t *bps, size_t nbps) 1987 { 1988 zfsvfs_t *zfsvfs; 1989 dmu_buf_impl_t *db; 1990 dmu_tx_t *tx; 1991 int error; 1992 int count = 0; 1993 sa_bulk_attr_t bulk[3]; 1994 uint64_t mtime[2], ctime[2]; 1995 1996 ASSERT3U(off, <, MAXOFFSET_T); 1997 ASSERT3U(len, >, 0); 1998 ASSERT3U(nbps, >, 0); 1999 2000 zfsvfs = ZTOZSB(zp); 2001 2002 ASSERT(spa_feature_is_enabled(dmu_objset_spa(zfsvfs->z_os), 2003 SPA_FEATURE_BLOCK_CLONING)); 2004 2005 if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) 2006 return (error); 2007 2008 ASSERT(zfsvfs->z_replay); 2009 ASSERT(!zfs_is_readonly(zfsvfs)); 2010 2011 if ((off % blksz) != 0) { 2012 zfs_exit(zfsvfs, FTAG); 2013 return (SET_ERROR(EINVAL)); 2014 } 2015 2016 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); 2017 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); 2018 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL, 2019 &zp->z_size, 8); 2020 2021 /* 2022 * Start a transaction. 2023 */ 2024 tx = dmu_tx_create(zfsvfs->z_os); 2025 2026 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); 2027 db = (dmu_buf_impl_t *)sa_get_db(zp->z_sa_hdl); 2028 DB_DNODE_ENTER(db); 2029 dmu_tx_hold_clone_by_dnode(tx, DB_DNODE(db), off, len, blksz); 2030 DB_DNODE_EXIT(db); 2031 zfs_sa_upgrade_txholds(tx, zp); 2032 error = dmu_tx_assign(tx, DMU_TX_WAIT); 2033 if (error != 0) { 2034 dmu_tx_abort(tx); 2035 zfs_exit(zfsvfs, FTAG); 2036 return (error); 2037 } 2038 2039 if (zp->z_blksz < blksz) 2040 zfs_grow_blocksize(zp, blksz, tx); 2041 2042 dmu_brt_clone(zfsvfs->z_os, zp->z_id, off, len, tx, bps, nbps); 2043 2044 zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime); 2045 2046 if (zp->z_size < off + len) 2047 zp->z_size = off + len; 2048 2049 error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); 2050 2051 /* 2052 * zil_replaying() not only check if we are replaying ZIL, but also 2053 * updates the ZIL header to record replay progress. 2054 */ 2055 VERIFY(zil_replaying(zfsvfs->z_log, tx)); 2056 2057 dmu_tx_commit(tx); 2058 2059 zfs_znode_update_vfs(zp); 2060 2061 zfs_exit(zfsvfs, FTAG); 2062 2063 return (error); 2064 } 2065 2066 EXPORT_SYMBOL(zfs_access); 2067 EXPORT_SYMBOL(zfs_fsync); 2068 EXPORT_SYMBOL(zfs_holey); 2069 EXPORT_SYMBOL(zfs_read); 2070 EXPORT_SYMBOL(zfs_write); 2071 EXPORT_SYMBOL(zfs_getsecattr); 2072 EXPORT_SYMBOL(zfs_setsecattr); 2073 EXPORT_SYMBOL(zfs_clone_range); 2074 EXPORT_SYMBOL(zfs_clone_range_replay); 2075 2076 ZFS_MODULE_PARAM(zfs_vnops, zfs_vnops_, read_chunk_size, U64, ZMOD_RW, 2077 "Bytes to read per chunk"); 2078 2079 ZFS_MODULE_PARAM(zfs, zfs_, bclone_enabled, INT, ZMOD_RW, 2080 "Enable block cloning"); 2081 2082 ZFS_MODULE_PARAM(zfs, zfs_, bclone_wait_dirty, INT, ZMOD_RW, 2083 "Wait for dirty blocks when cloning"); 2084 2085 ZFS_MODULE_PARAM(zfs, zfs_, dio_enabled, INT, ZMOD_RW, 2086 "Enable Direct I/O"); 2087 2088 ZFS_MODULE_PARAM(zfs, zfs_, dio_strict, INT, ZMOD_RW, 2089 "Return errors on misaligned Direct I/O"); 2090
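
/*
 * EXAMPLE (illustrative usage note, not part of the build): the tunables
 * declared above are normally adjusted at runtime, e.g. on Linux through
 * module parameters (paths assumed to follow the usual OpenZFS layout):
 *
 *	echo 0 > /sys/module/zfs/parameters/zfs_bclone_enabled
 *	echo 1 > /sys/module/zfs/parameters/zfs_dio_strict
 *
 * or on FreeBSD through sysctl(8), e.g. sysctl vfs.zfs.dio_strict=1.
 */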