// SPDX-License-Identifier: CDDL-1.0
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
 * Copyright (c) 2015 by Chunwei Chen. All rights reserved.
 * Copyright 2017 Nexenta Systems, Inc.
 * Copyright (c) 2021, 2022 by Pawel Jakub Dawidek
 * Copyright (c) 2025, Rob Norris <robn@despairlabs.com>
 */

/* Portions Copyright 2007 Jeremy Teo */
/* Portions Copyright 2010 Robert Milkowski */

#include <sys/types.h>
#include <sys/param.h>
#include <sys/time.h>
#include <sys/sysmacros.h>
#include <sys/vfs.h>
#include <sys/file.h>
#include <sys/stat.h>
#include <sys/kmem.h>
#include <sys/cmn_err.h>
#include <sys/errno.h>
#include <sys/zfs_dir.h>
#include <sys/zfs_acl.h>
#include <sys/zfs_ioctl.h>
#include <sys/fs/zfs.h>
#include <sys/dmu.h>
#include <sys/dmu_objset.h>
#include <sys/dsl_crypt.h>
#include <sys/spa.h>
#include <sys/txg.h>
#include <sys/dbuf.h>
#include <sys/policy.h>
#include <sys/zfeature.h>
#include <sys/zfs_vnops.h>
#include <sys/zfs_quota.h>
#include <sys/zfs_vfsops.h>
#include <sys/zfs_znode.h>

/*
 * Enables access to the block cloning feature. If this setting is 0, then even
 * if feature@block_cloning is enabled, using functions and system calls that
 * attempt to clone blocks will act as though the feature is disabled.
 */
int zfs_bclone_enabled = 1;

/*
 * When set zfs_clone_range() waits for dirty data to be written to disk.
 * This allows the clone operation to reliably succeed when a file is modified
 * and then immediately cloned. For small files this may be slower than making
 * a copy of the file and is therefore not the default. However, in certain
 * scenarios this behavior may be desirable so a tunable is provided.
 */
int zfs_bclone_wait_dirty = 0;

/*
 * Enable Direct I/O. If this setting is 0, then all I/O requests will be
 * directed through the ARC acting as though the dataset property direct was
 * set to disabled.
 *
 * Disabled by default on FreeBSD until a potential range locking issue in
 * zfs_getpages() can be resolved.
 */
#ifdef __FreeBSD__
static int zfs_dio_enabled = 0;
#else
static int zfs_dio_enabled = 1;
#endif

/*
 * Strictly enforce alignment for Direct I/O requests, returning EINVAL
 * if not page-aligned instead of silently falling back to uncached I/O.
 */
static int zfs_dio_strict = 0;

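/*
 * The tunables above (and zfs_vnops_read_chunk_size below) are exposed as
 * module parameters via the ZFS_MODULE_PARAM() declarations at the bottom of
 * this file. As an illustration, on Linux they can typically be inspected or
 * changed at runtime under /sys/module/zfs/parameters/, e.g.
 * /sys/module/zfs/parameters/zfs_bclone_enabled; other platforms provide
 * their own equivalent mechanism.
 */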

/*
 * Maximum bytes to read per chunk in zfs_read().
 */
#ifdef _ILP32
static uint64_t zfs_vnops_read_chunk_size = 1024 * 1024;
#else
static uint64_t zfs_vnops_read_chunk_size = DMU_MAX_ACCESS / 2;
#endif

int
zfs_fsync(znode_t *zp, int syncflag, cred_t *cr)
{
	int error = 0;
	zfsvfs_t *zfsvfs = ZTOZSB(zp);

	if (zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED) {
		if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
			return (error);
		atomic_inc_32(&zp->z_sync_writes_cnt);
		zil_commit(zfsvfs->z_log, zp->z_id);
		atomic_dec_32(&zp->z_sync_writes_cnt);
		zfs_exit(zfsvfs, FTAG);
	}
	return (error);
}


#if defined(SEEK_HOLE) && defined(SEEK_DATA)
/*
 * Lseek support for finding holes (cmd == SEEK_HOLE) and
 * data (cmd == SEEK_DATA). "off" is an in/out parameter.
 */
static int
zfs_holey_common(znode_t *zp, ulong_t cmd, loff_t *off)
{
	zfs_locked_range_t *lr;
	uint64_t noff = (uint64_t)*off; /* new offset */
	uint64_t file_sz;
	int error;
	boolean_t hole;

	file_sz = zp->z_size;
	if (noff >= file_sz) {
		return (SET_ERROR(ENXIO));
	}

	if (cmd == F_SEEK_HOLE)
		hole = B_TRUE;
	else
		hole = B_FALSE;

	/* Flush any mmap()'d data to disk */
	if (zn_has_cached_data(zp, 0, file_sz - 1))
		zn_flush_cached_data(zp, B_TRUE);

	lr = zfs_rangelock_enter(&zp->z_rangelock, 0, UINT64_MAX, RL_READER);
	error = dmu_offset_next(ZTOZSB(zp)->z_os, zp->z_id, hole, &noff);
	zfs_rangelock_exit(lr);

	if (error == ESRCH)
		return (SET_ERROR(ENXIO));

	/* File was dirty, so fall back to using generic logic */
	if (error == EBUSY) {
		if (hole)
			*off = file_sz;

		return (0);
	}

	/*
	 * We could find a hole that begins after the logical end-of-file,
	 * because dmu_offset_next() only works on whole blocks. If the
	 * EOF falls mid-block, then indicate that the "virtual hole"
	 * at the end of the file begins at the logical EOF, rather than
	 * at the end of the last block.
	 */
	if (noff > file_sz) {
		ASSERT(hole);
		noff = file_sz;
	}

	if (noff < *off)
		return (error);
	*off = noff;
	return (error);
}

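/*
 * Illustrative example of the "virtual hole" handling above: a 200000-byte
 * file on a dataset with recordsize=128K occupies two 128K blocks, so
 * dmu_offset_next() reports the trailing hole at 262144; SEEK_HOLE callers
 * instead see it at the logical EOF, 200000.
 */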

int
zfs_holey(znode_t *zp, ulong_t cmd, loff_t *off)
{
	zfsvfs_t *zfsvfs = ZTOZSB(zp);
	int error;

	if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
		return (error);

	error = zfs_holey_common(zp, cmd, off);

	zfs_exit(zfsvfs, FTAG);
	return (error);
}
#endif /* SEEK_HOLE && SEEK_DATA */

int
zfs_access(znode_t *zp, int mode, int flag, cred_t *cr)
{
	zfsvfs_t *zfsvfs = ZTOZSB(zp);
	int error;

	if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
		return (error);

	if (flag & V_ACE_MASK)
#if defined(__linux__)
		error = zfs_zaccess(zp, mode, flag, B_FALSE, cr,
		    zfs_init_idmap);
#else
		error = zfs_zaccess(zp, mode, flag, B_FALSE, cr,
		    NULL);
#endif
	else
#if defined(__linux__)
		error = zfs_zaccess_rwx(zp, mode, flag, cr, zfs_init_idmap);
#else
		error = zfs_zaccess_rwx(zp, mode, flag, cr, NULL);
#endif

	zfs_exit(zfsvfs, FTAG);
	return (error);
}

/*
 * Determine if Direct I/O has been requested (either via the O_DIRECT flag or
 * the "direct" dataset property). When inherited by the property, only apply
 * the O_DIRECT flag to correctly aligned I/O requests. The rationale for this
 * is that it allows the property to be safely set on a dataset without forcing
 * all of the applications to be aware of the alignment restrictions. When
 * O_DIRECT is explicitly requested by an application, return EINVAL if the
 * request is unaligned. In all cases, if the range for this request has
 * been mmap'ed then we will perform buffered I/O to keep the mapped region
 * synchronized with the ARC.
 *
 * It is possible that a file's pages could be mmap'ed after it is checked
 * here. If so, that is handled accordingly in zfs_write(). See comments in the
 * following area for how this is handled:
 * zfs_write() -> update_pages()
 */
static int
zfs_setup_direct(struct znode *zp, zfs_uio_t *uio, zfs_uio_rw_t rw,
    int *ioflagp)
{
	zfsvfs_t *zfsvfs = ZTOZSB(zp);
	objset_t *os = zfsvfs->z_os;
	int ioflag = *ioflagp;
	int error = 0;

	if (os->os_direct == ZFS_DIRECT_ALWAYS) {
		/* Force either direct or uncached I/O. */
		ioflag |= O_DIRECT;
	}

	if ((ioflag & O_DIRECT) == 0)
		goto out;

	if (!zfs_dio_enabled || os->os_direct == ZFS_DIRECT_DISABLED) {
		/*
		 * Direct I/O is disabled. The I/O request will be directed
		 * through the ARC as uncached I/O.
		 */
		goto out;
	}

	if (!zfs_uio_page_aligned(uio) ||
	    !zfs_uio_aligned(uio, PAGE_SIZE)) {
		/*
		 * Misaligned requests can be executed through the ARC as
		 * uncached I/O. But if O_DIRECT was set by the user and we
		 * were set to be strict, then it is a failure.
		 */
		if ((*ioflagp & O_DIRECT) && zfs_dio_strict)
			error = SET_ERROR(EINVAL);
		goto out;
	}

	if (zn_has_cached_data(zp, zfs_uio_offset(uio),
	    zfs_uio_offset(uio) + zfs_uio_resid(uio) - 1)) {
		/*
		 * The region is mmap'ed. The I/O request will be directed
		 * through the ARC as uncached I/O.
		 */
		goto out;
	}

	/*
	 * For short writes the page mapping of Direct I/O makes no sense.
	 * Direct them through the ARC as uncached I/O.
	 */
	if (rw == UIO_WRITE && zfs_uio_resid(uio) < zp->z_blksz)
		goto out;

	error = zfs_uio_get_dio_pages_alloc(uio, rw);
	if (error)
		goto out;
	ASSERT(uio->uio_extflg & UIO_DIRECT);

out:
	*ioflagp = ioflag;
	return (error);
}

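/*
 * Illustrative application of the policy implemented above, assuming 4 KiB
 * pages and direct=standard: an O_DIRECT read of 8192 bytes at offset 4096
 * into a page-aligned buffer, from a region that is not mmap'ed, is eligible
 * for Direct I/O, while the same read at offset 100 is executed as uncached
 * I/O through the ARC, or fails with EINVAL when zfs_dio_strict is set.
 */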

/*
 * Read bytes from specified file into supplied buffer.
 *
 *	IN:	zp	- inode of file to be read from.
 *		uio	- structure supplying read location, range info,
 *			  and return buffer.
 *		ioflag	- O_SYNC flags; used to provide FRSYNC semantics.
 *			  O_DIRECT flag; used to bypass page cache.
 *		cr	- credentials of caller.
 *
 *	OUT:	uio	- updated offset and range, buffer filled.
 *
 *	RETURN:	0 on success, error code on failure.
 *
 * Side Effects:
 *	inode - atime updated if byte count > 0
 */
int
zfs_read(struct znode *zp, zfs_uio_t *uio, int ioflag, cred_t *cr)
{
	(void) cr;
	int error = 0;
	boolean_t frsync = B_FALSE;
	boolean_t dio_checksum_failure = B_FALSE;

	zfsvfs_t *zfsvfs = ZTOZSB(zp);
	if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
		return (error);

	if (zp->z_pflags & ZFS_AV_QUARANTINED) {
		zfs_exit(zfsvfs, FTAG);
		return (SET_ERROR(EACCES));
	}

	/* We don't copy out anything useful for directories. */
	if (Z_ISDIR(ZTOTYPE(zp))) {
		zfs_exit(zfsvfs, FTAG);
		return (SET_ERROR(EISDIR));
	}

	/*
	 * Validate file offset
	 */
	if (zfs_uio_offset(uio) < (offset_t)0) {
		zfs_exit(zfsvfs, FTAG);
		return (SET_ERROR(EINVAL));
	}

	/*
	 * Fasttrack empty reads
	 */
	if (zfs_uio_resid(uio) == 0) {
		zfs_exit(zfsvfs, FTAG);
		return (0);
	}

#ifdef FRSYNC
	/*
	 * If we're in FRSYNC mode, sync out this znode before reading it.
	 * Only do this for non-snapshots.
	 *
	 * Some platforms do not support FRSYNC and instead map it
	 * to O_SYNC, which results in unnecessary calls to zil_commit. We
	 * only honor FRSYNC requests on platforms which support it.
	 */
	frsync = !!(ioflag & FRSYNC);
#endif
	if (zfsvfs->z_log &&
	    (frsync || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS))
		zil_commit(zfsvfs->z_log, zp->z_id);

	/*
	 * Lock the range against changes.
	 */
	zfs_locked_range_t *lr = zfs_rangelock_enter(&zp->z_rangelock,
	    zfs_uio_offset(uio), zfs_uio_resid(uio), RL_READER);

	/*
	 * If we are reading past end-of-file we can skip
	 * to the end; but we might still need to set atime.
	 */
	if (zfs_uio_offset(uio) >= zp->z_size) {
		error = 0;
		goto out;
	}
	ASSERT(zfs_uio_offset(uio) < zp->z_size);

	/*
	 * Set up Direct I/O if requested.
	 */
	error = zfs_setup_direct(zp, uio, UIO_READ, &ioflag);
	if (error) {
		goto out;
	}

#if defined(__linux__)
	ssize_t start_offset = zfs_uio_offset(uio);
#endif
	uint_t blksz = zp->z_blksz;
	ssize_t chunk_size;
	ssize_t n = MIN(zfs_uio_resid(uio), zp->z_size - zfs_uio_offset(uio));
	ssize_t start_resid = n;
	ssize_t dio_remaining_resid = 0;

	dmu_flags_t dflags = DMU_READ_PREFETCH;
	if (ioflag & O_DIRECT)
		dflags |= DMU_UNCACHEDIO;
	if (uio->uio_extflg & UIO_DIRECT) {
		/*
		 * All pages for an O_DIRECT request have already been mapped
		 * so there's no compelling reason to handle this uio in
		 * smaller chunks.
		 */
		chunk_size = DMU_MAX_ACCESS;

		/*
		 * In the event that the O_DIRECT request is reading the entire
		 * file, it is possible the file's length is not page-size
		 * aligned. However, lower layers expect that the Direct I/O
		 * request is page-aligned. In this case, as much of the file
		 * as can be read using Direct I/O is, and the remaining
		 * amount will be read through the ARC.
		 *
		 * This is still consistent with the semantics of Direct I/O in
		 * ZFS as at a minimum the I/O request must be page-aligned.
		 */
		dio_remaining_resid = n - P2ALIGN_TYPED(n, PAGE_SIZE, ssize_t);
		if (dio_remaining_resid != 0)
			n -= dio_remaining_resid;
		dflags |= DMU_DIRECTIO;
	} else {
		chunk_size = MIN(MAX(zfs_vnops_read_chunk_size, blksz),
		    DMU_MAX_ACCESS / 2);
	}

	while (n > 0) {
		ssize_t nbytes = MIN(n, chunk_size -
		    P2PHASE(zfs_uio_offset(uio), blksz));
#ifdef UIO_NOCOPY
		if (zfs_uio_segflg(uio) == UIO_NOCOPY)
			error = mappedread_sf(zp, nbytes, uio);
		else
#endif
		if (zn_has_cached_data(zp, zfs_uio_offset(uio),
		    zfs_uio_offset(uio) + nbytes - 1)) {
			error = mappedread(zp, nbytes, uio);
		} else {
			error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl),
			    uio, nbytes, dflags);
		}

		if (error) {
			/* convert checksum errors into IO errors */
			if (error == ECKSUM) {
				/*
				 * If a Direct I/O read returned a checksum
				 * verify error, then it must be treated as
				 * suspicious. The contents of the buffer could
				 * have been manipulated while the I/O was in
				 * flight. In this case, the remainder of the
				 * I/O request will just be reissued through
				 * the ARC.
				 */
				if (uio->uio_extflg & UIO_DIRECT) {
					dio_checksum_failure = B_TRUE;
					uio->uio_extflg &= ~UIO_DIRECT;
					n += dio_remaining_resid;
					dio_remaining_resid = 0;
					continue;
				} else {
					error = SET_ERROR(EIO);
				}
			}

#if defined(__linux__)
			/*
			 * if we actually read some bytes, bubbling EFAULT
			 * up to become EAGAIN isn't what we want here...
			 *
			 * ...on Linux, at least. On FBSD, doing this breaks.
			 */
			if (error == EFAULT &&
			    (zfs_uio_offset(uio) - start_offset) != 0)
				error = 0;
#endif
			break;
		}

		n -= nbytes;
	}

	if (error == 0 && (uio->uio_extflg & UIO_DIRECT) &&
	    dio_remaining_resid != 0) {
		/*
		 * Temporarily remove the UIO_DIRECT flag from the UIO so the
		 * remainder of the file can be read using the ARC.
		 */
		uio->uio_extflg &= ~UIO_DIRECT;
		dflags &= ~DMU_DIRECTIO;

		if (zn_has_cached_data(zp, zfs_uio_offset(uio),
		    zfs_uio_offset(uio) + dio_remaining_resid - 1)) {
			error = mappedread(zp, dio_remaining_resid, uio);
		} else {
			error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl), uio,
			    dio_remaining_resid, dflags);
		}
		uio->uio_extflg |= UIO_DIRECT;
		dflags |= DMU_DIRECTIO;

		if (error != 0)
			n += dio_remaining_resid;
	} else if (error && (uio->uio_extflg & UIO_DIRECT)) {
		n += dio_remaining_resid;
	}
	int64_t nread = start_resid - n;

	dataset_kstats_update_read_kstats(&zfsvfs->z_kstat, nread);
out:
	zfs_rangelock_exit(lr);

	if (dio_checksum_failure == B_TRUE)
		uio->uio_extflg |= UIO_DIRECT;

	/*
	 * Cleanup for Direct I/O if requested.
	 */
	if (uio->uio_extflg & UIO_DIRECT)
		zfs_uio_free_dio_pages(uio, UIO_READ);

	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
	zfs_exit(zfsvfs, FTAG);
	return (error);
}

static void
zfs_clear_setid_bits_if_necessary(zfsvfs_t *zfsvfs, znode_t *zp, cred_t *cr,
    uint64_t *clear_setid_bits_txgp, dmu_tx_t *tx)
{
	zilog_t *zilog = zfsvfs->z_log;
	const uint64_t uid = KUID_TO_SUID(ZTOUID(zp));

	ASSERT(clear_setid_bits_txgp != NULL);
	ASSERT(tx != NULL);

	/*
	 * Clear Set-UID/Set-GID bits on successful write if not
	 * privileged and at least one of the execute bits is set.
	 *
	 * It would be nice to do this after all writes have
	 * been done, but that would still expose the ISUID/ISGID
	 * to another app after the partial write is committed.
	 *
	 * Note: we don't call zfs_fuid_map_id() here because
	 * user 0 is not an ephemeral uid.
	 */
	mutex_enter(&zp->z_acl_lock);
	if ((zp->z_mode & (S_IXUSR | (S_IXUSR >> 3) | (S_IXUSR >> 6))) != 0 &&
	    (zp->z_mode & (S_ISUID | S_ISGID)) != 0 &&
	    secpolicy_vnode_setid_retain(zp, cr,
	    ((zp->z_mode & S_ISUID) != 0 && uid == 0)) != 0) {
		uint64_t newmode;

		zp->z_mode &= ~(S_ISUID | S_ISGID);
		newmode = zp->z_mode;
		(void) sa_update(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs),
		    (void *)&newmode, sizeof (uint64_t), tx);

		mutex_exit(&zp->z_acl_lock);

		/*
		 * Make sure SUID/SGID bits will be removed when we replay the
		 * log. If the setid bits keep coming back, don't log more
		 * than one TX_SETATTR per transaction group.
		 */
		if (*clear_setid_bits_txgp != dmu_tx_get_txg(tx)) {
			vattr_t va = {0};

			va.va_mask = ATTR_MODE;
			va.va_nodeid = zp->z_id;
			va.va_mode = newmode;
			zfs_log_setattr(zilog, tx, TX_SETATTR, zp, &va,
			    ATTR_MODE, NULL);
			*clear_setid_bits_txgp = dmu_tx_get_txg(tx);
		}
	} else {
		mutex_exit(&zp->z_acl_lock);
	}
}

/*
 * Write the bytes to a file.
 *
 *	IN:	zp	- znode of file to be written to.
 *		uio	- structure supplying write location, range info,
 *			  and data buffer.
 *		ioflag	- O_APPEND flag set if in append mode.
 *			  O_DIRECT flag; used to bypass page cache.
 *		cr	- credentials of caller.
 *
 *	OUT:	uio	- updated offset and range.
 *
 *	RETURN:	0 if success
 *		error code if failure
 *
 * Timestamps:
 *	ip - ctime|mtime updated if byte count > 0
 */
int
zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr)
{
	int error = 0, error1;
	ssize_t start_resid = zfs_uio_resid(uio);
	uint64_t clear_setid_bits_txg = 0;
	boolean_t o_direct_defer = B_FALSE;

	/*
	 * Fasttrack empty write
	 */
	ssize_t n = start_resid;
	if (n == 0)
		return (0);

	zfsvfs_t *zfsvfs = ZTOZSB(zp);
	if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
		return (error);

	sa_bulk_attr_t bulk[4];
	int count = 0;
	uint64_t mtime[2], ctime[2];
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
	    &zp->z_size, 8);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
	    &zp->z_pflags, 8);

	/*
	 * Callers might not be able to detect properly that we are read-only,
	 * so check it explicitly here.
	 */
	if (zfs_is_readonly(zfsvfs)) {
		zfs_exit(zfsvfs, FTAG);
		return (SET_ERROR(EROFS));
	}

	/*
	 * If immutable or not appending then return EPERM.
	 * Intentionally allow ZFS_READONLY through here.
	 * See zfs_zaccess_common()
	 */
	if ((zp->z_pflags & ZFS_IMMUTABLE) ||
	    ((zp->z_pflags & ZFS_APPENDONLY) && !(ioflag & O_APPEND) &&
	    (zfs_uio_offset(uio) < zp->z_size))) {
		zfs_exit(zfsvfs, FTAG);
		return (SET_ERROR(EPERM));
	}

	/*
	 * Validate file offset
	 */
	offset_t woff = ioflag & O_APPEND ? zp->z_size : zfs_uio_offset(uio);
	if (woff < 0) {
		zfs_exit(zfsvfs, FTAG);
		return (SET_ERROR(EINVAL));
	}

	/*
	 * Set up Direct I/O if requested.
	 */
	error = zfs_setup_direct(zp, uio, UIO_WRITE, &ioflag);
	if (error) {
		zfs_exit(zfsvfs, FTAG);
		return (SET_ERROR(error));
	}

	/*
	 * Pre-fault the pages to ensure slow (eg NFS) pages
	 * don't hold up txg.
	 */
	ssize_t pfbytes = MIN(n, DMU_MAX_ACCESS >> 1);
	if (zfs_uio_prefaultpages(pfbytes, uio)) {
		zfs_exit(zfsvfs, FTAG);
		return (SET_ERROR(EFAULT));
	}

	/*
	 * If in append mode, set the io offset pointer to eof.
	 */
	zfs_locked_range_t *lr;
	if (ioflag & O_APPEND) {
		/*
		 * Obtain an appending range lock to guarantee file append
		 * semantics. We reset the write offset once we have the lock.
		 */
		lr = zfs_rangelock_enter(&zp->z_rangelock, 0, n, RL_APPEND);
		woff = lr->lr_offset;
		if (lr->lr_length == UINT64_MAX) {
			/*
			 * We overlocked the file because this write will cause
			 * the file block size to increase.
			 * Note that zp_size cannot change with this lock held.
			 */
			woff = zp->z_size;
		}
		zfs_uio_setoffset(uio, woff);
		/*
		 * We need to update the starting offset as well because it is
		 * set previously in the ZPL (Linux) and VNOPS (FreeBSD)
		 * layers.
		 */
		zfs_uio_setsoffset(uio, woff);
	} else {
		/*
		 * Note that if the file block size will change as a result of
		 * this write, then this range lock will lock the entire file
		 * so that we can re-write the block safely.
		 */
		lr = zfs_rangelock_enter(&zp->z_rangelock, woff, n, RL_WRITER);
	}

	if (zn_rlimit_fsize_uio(zp, uio)) {
		zfs_rangelock_exit(lr);
		zfs_exit(zfsvfs, FTAG);
		return (SET_ERROR(EFBIG));
	}

	const rlim64_t limit = MAXOFFSET_T;

	if (woff >= limit) {
		zfs_rangelock_exit(lr);
		zfs_exit(zfsvfs, FTAG);
		return (SET_ERROR(EFBIG));
	}

	if (n > limit - woff)
		n = limit - woff;

	uint64_t end_size = MAX(zp->z_size, woff + n);
	zilog_t *zilog = zfsvfs->z_log;
	boolean_t commit = (ioflag & (O_SYNC | O_DSYNC)) ||
	    (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS);

	const uint64_t uid = KUID_TO_SUID(ZTOUID(zp));
	const uint64_t gid = KGID_TO_SGID(ZTOGID(zp));
	const uint64_t projid = zp->z_projid;

	/*
	 * In the event we are increasing the file block size
	 * (lr_length == UINT64_MAX), we will direct the write to the ARC.
	 * Because zfs_grow_blocksize() will read from the ARC in order to
	 * grow the dbuf, we avoid doing Direct I/O here as that would cause
	 * data written to disk to be overwritten by data in the ARC during
	 * the sync phase. Besides writing data twice to disk, we also
	 * want to avoid consistency concerns between data in the ARC and
	 * on disk while growing the file's blocksize.
	 *
	 * We will only temporarily remove Direct I/O and put it back after
	 * we have grown the blocksize. We do this in the event a request
	 * is larger than max_blksz, so further requests to
	 * dmu_write_uio_dbuf() will still issue the requests using Direct
	 * I/O.
	 *
	 * As an example:
	 * The first block of the file is being written as a 4k request with
	 * a recordsize of 1K. The first 1K issued in the loop below will go
	 * through the ARC; however, the following 3 1K requests will
	 * use Direct I/O.
	 */
	if (uio->uio_extflg & UIO_DIRECT && lr->lr_length == UINT64_MAX) {
		uio->uio_extflg &= ~UIO_DIRECT;
		o_direct_defer = B_TRUE;
	}

	/*
	 * Write the file in reasonable size chunks. Each chunk is written
	 * in a separate transaction; this keeps the intent log records small
	 * and allows us to do more fine-grained space accounting.
	 */
	while (n > 0) {
		woff = zfs_uio_offset(uio);

		if (zfs_id_overblockquota(zfsvfs, DMU_USERUSED_OBJECT, uid) ||
		    zfs_id_overblockquota(zfsvfs, DMU_GROUPUSED_OBJECT, gid) ||
		    (projid != ZFS_DEFAULT_PROJID &&
		    zfs_id_overblockquota(zfsvfs, DMU_PROJECTUSED_OBJECT,
		    projid))) {
			error = SET_ERROR(EDQUOT);
			break;
		}

		uint64_t blksz;
		if (lr->lr_length == UINT64_MAX && zp->z_size <= zp->z_blksz) {
			if (zp->z_blksz > zfsvfs->z_max_blksz &&
			    !ISP2(zp->z_blksz)) {
				/*
				 * File's blocksize is already larger than the
				 * "recordsize" property. Only let it grow to
				 * the next power of 2.
				 */
				blksz = 1 << highbit64(zp->z_blksz);
			} else {
				blksz = zfsvfs->z_max_blksz;
			}
			blksz = MIN(blksz, P2ROUNDUP(end_size,
			    SPA_MINBLOCKSIZE));
			blksz = MAX(blksz, zp->z_blksz);
		} else {
			blksz = zp->z_blksz;
		}

		arc_buf_t *abuf = NULL;
		ssize_t nbytes = n;
		if (n >= blksz && woff >= zp->z_size &&
		    P2PHASE(woff, blksz) == 0 &&
		    !(uio->uio_extflg & UIO_DIRECT) &&
		    (blksz >= SPA_OLD_MAXBLOCKSIZE || n < 4 * blksz)) {
			/*
			 * This write covers a full block. "Borrow" a buffer
			 * from the dmu so that we can fill it before we enter
			 * a transaction. This avoids the possibility of
			 * holding up the transaction if the data copy hangs
			 * up on a pagefault (e.g., from an NFS server mapping).
			 */
			abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
			    blksz);
			ASSERT(abuf != NULL);
			ASSERT(arc_buf_size(abuf) == blksz);
			if ((error = zfs_uiocopy(abuf->b_data, blksz,
			    UIO_WRITE, uio, &nbytes))) {
				dmu_return_arcbuf(abuf);
				break;
			}
			ASSERT3S(nbytes, ==, blksz);
		} else {
			nbytes = MIN(n, (DMU_MAX_ACCESS >> 1) -
			    P2PHASE(woff, blksz));
			if (pfbytes < nbytes) {
				if (zfs_uio_prefaultpages(nbytes, uio)) {
					error = SET_ERROR(EFAULT);
					break;
				}
				pfbytes = nbytes;
			}
		}

		/*
		 * Start a transaction.
		 */
		dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os);
		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
		dmu_buf_impl_t *db = (dmu_buf_impl_t *)sa_get_db(zp->z_sa_hdl);
		DB_DNODE_ENTER(db);
		dmu_tx_hold_write_by_dnode(tx, DB_DNODE(db), woff, nbytes);
		DB_DNODE_EXIT(db);
		zfs_sa_upgrade_txholds(tx, zp);
		error = dmu_tx_assign(tx, DMU_TX_WAIT);
		if (error) {
			dmu_tx_abort(tx);
			if (abuf != NULL)
				dmu_return_arcbuf(abuf);
			break;
		}

		/*
		 * NB: We must call zfs_clear_setid_bits_if_necessary before
		 * committing the transaction!
		 */

		/*
		 * If rangelock_enter() over-locked we grow the blocksize
		 * and then reduce the lock range. This will only happen
		 * on the first iteration since rangelock_reduce() will
		 * shrink down lr_length to the appropriate size.
		 */
		if (lr->lr_length == UINT64_MAX) {
			zfs_grow_blocksize(zp, blksz, tx);
			zfs_rangelock_reduce(lr, woff, n);
		}

		dmu_flags_t dflags = DMU_READ_PREFETCH;
		if (ioflag & O_DIRECT)
			dflags |= DMU_UNCACHEDIO;
		if (uio->uio_extflg & UIO_DIRECT)
			dflags |= DMU_DIRECTIO;

		ssize_t tx_bytes;
		if (abuf == NULL) {
			tx_bytes = zfs_uio_resid(uio);
			zfs_uio_fault_disable(uio, B_TRUE);
			error = dmu_write_uio_dbuf(sa_get_db(zp->z_sa_hdl),
			    uio, nbytes, tx, dflags);
			zfs_uio_fault_disable(uio, B_FALSE);
#ifdef __linux__
			if (error == EFAULT) {
				zfs_clear_setid_bits_if_necessary(zfsvfs, zp,
				    cr, &clear_setid_bits_txg, tx);
				dmu_tx_commit(tx);
				/*
				 * Account for partial writes before
				 * continuing the loop.
				 * Update needs to occur before the next
				 * zfs_uio_prefaultpages, or prefaultpages may
				 * error, and we may break the loop early.
				 */
				n -= tx_bytes - zfs_uio_resid(uio);
				pfbytes -= tx_bytes - zfs_uio_resid(uio);
				continue;
			}
#endif
			/*
			 * On FreeBSD, EFAULT should be propagated back to the
			 * VFS, which will handle faulting and will retry.
			 */
			if (error != 0 && error != EFAULT) {
				zfs_clear_setid_bits_if_necessary(zfsvfs, zp,
				    cr, &clear_setid_bits_txg, tx);
				dmu_tx_commit(tx);
				break;
			}
			tx_bytes -= zfs_uio_resid(uio);
		} else {
			/*
			 * Thus, we're writing a full block at a block-aligned
			 * offset and extending the file past EOF.
			 *
			 * dmu_assign_arcbuf_by_dbuf() will directly assign the
			 * arc buffer to a dbuf.
			 */
			error = dmu_assign_arcbuf_by_dbuf(
			    sa_get_db(zp->z_sa_hdl), woff, abuf, tx, dflags);
			if (error != 0) {
				/*
				 * XXX This might not be necessary if
				 * dmu_assign_arcbuf_by_dbuf is guaranteed
				 * to be atomic.
				 */
				zfs_clear_setid_bits_if_necessary(zfsvfs, zp,
				    cr, &clear_setid_bits_txg, tx);
				dmu_return_arcbuf(abuf);
				dmu_tx_commit(tx);
				break;
			}
			ASSERT3S(nbytes, <=, zfs_uio_resid(uio));
			zfs_uioskip(uio, nbytes);
			tx_bytes = nbytes;
		}
		/*
		 * There is a window where a file's pages can be mmap'ed after
		 * zfs_setup_direct() is called. This is due to the fact that
		 * the rangelock in this function is acquired after calling
		 * zfs_setup_direct(). This is done so that
		 * zfs_uio_prefaultpages() does not attempt to fault in pages
		 * on Linux for Direct I/O requests. This is not necessary as
		 * the pages are pinned in memory and can not be faulted out.
		 * Ideally, the rangelock would be held before calling
		 * zfs_setup_direct() and zfs_uio_prefaultpages(); however,
		 * this can lead to a deadlock as zfs_getpage() also acquires
		 * the rangelock as a RL_WRITER and prefaulting the pages can
		 * lead to zfs_getpage() being called.
		 *
		 * In the case of the pages being mapped after
		 * zfs_setup_direct() is called, the call to update_pages()
		 * will still be made to make sure there is consistency between
		 * the ARC and the Linux page cache. This is an unfortunate
		 * situation as the data will be read back into the ARC after
		 * the Direct I/O write has completed, but this is the penalty
		 * for writing to a mmap'ed region of a file using Direct I/O.
		 */
		if (tx_bytes &&
		    zn_has_cached_data(zp, woff, woff + tx_bytes - 1)) {
			update_pages(zp, woff, tx_bytes, zfsvfs->z_os);
		}

		/*
		 * If we made no progress, we're done. If we made even
		 * partial progress, update the znode and ZIL accordingly.
		 */
		if (tx_bytes == 0) {
			(void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
			    (void *)&zp->z_size, sizeof (uint64_t), tx);
			dmu_tx_commit(tx);
			ASSERT(error != 0);
			break;
		}

		zfs_clear_setid_bits_if_necessary(zfsvfs, zp, cr,
		    &clear_setid_bits_txg, tx);

		zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime);

		/*
		 * Update the file size (zp_size) if it has changed;
		 * account for possible concurrent updates.
		 */
		while ((end_size = zp->z_size) < zfs_uio_offset(uio)) {
			(void) atomic_cas_64(&zp->z_size, end_size,
			    zfs_uio_offset(uio));
			ASSERT(error == 0 || error == EFAULT);
		}
		/*
		 * If we are replaying and eof is non-zero then force
		 * the file size to the specified eof. Note, there's no
		 * concurrency during replay.
		 */
		if (zfsvfs->z_replay && zfsvfs->z_replay_eof != 0)
			zp->z_size = zfsvfs->z_replay_eof;

		error1 = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
		if (error1 != 0)
			/* Avoid clobbering EFAULT. */
			error = error1;

		/*
		 * NB: During replay, the TX_SETATTR record logged by
		 * zfs_clear_setid_bits_if_necessary must precede any of
		 * the TX_WRITE records logged here.
		 */
		zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, commit,
		    uio->uio_extflg & UIO_DIRECT ? B_TRUE : B_FALSE, NULL,
		    NULL);

		dmu_tx_commit(tx);

		/*
		 * Direct I/O was deferred in order to grow the first block.
		 * At this point it can be re-enabled for subsequent writes.
		 */
		if (o_direct_defer) {
			ASSERT(ioflag & O_DIRECT);
			uio->uio_extflg |= UIO_DIRECT;
			o_direct_defer = B_FALSE;
		}

		if (error != 0)
			break;
		ASSERT3S(tx_bytes, ==, nbytes);
		n -= nbytes;
		pfbytes -= nbytes;
	}

	if (o_direct_defer) {
		ASSERT(ioflag & O_DIRECT);
		uio->uio_extflg |= UIO_DIRECT;
		o_direct_defer = B_FALSE;
	}

	zfs_znode_update_vfs(zp);
	zfs_rangelock_exit(lr);

	/*
	 * Cleanup for Direct I/O if requested.
	 */
	if (uio->uio_extflg & UIO_DIRECT)
		zfs_uio_free_dio_pages(uio, UIO_WRITE);

	/*
	 * If we're in replay mode, or we made no progress, or the
	 * uio data is inaccessible return an error. Otherwise, it's
	 * at least a partial write, so it's successful.
	 */
	if (zfsvfs->z_replay || zfs_uio_resid(uio) == start_resid ||
	    error == EFAULT) {
		zfs_exit(zfsvfs, FTAG);
		return (error);
	}

	if (commit)
		zil_commit(zilog, zp->z_id);

	int64_t nwritten = start_resid - zfs_uio_resid(uio);
	dataset_kstats_update_write_kstats(&zfsvfs->z_kstat, nwritten);

	zfs_exit(zfsvfs, FTAG);
	return (0);
}

/*
 * Rewrite a range of a file as-is without modification.
 *
 *	IN:	zp	- znode of file to be rewritten.
 *		off	- Offset of the range to rewrite.
 *		len	- Length of the range to rewrite.
 *		flags	- Random rewrite parameters.
 *		arg	- flags-specific argument.
 *
 *	RETURN:	0 if success
 *		error code if failure
 */
int
zfs_rewrite(znode_t *zp, uint64_t off, uint64_t len, uint64_t flags,
    uint64_t arg)
{
	int error;

	if (flags != 0 || arg != 0)
		return (SET_ERROR(EINVAL));

	zfsvfs_t *zfsvfs = ZTOZSB(zp);
	if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
		return (error);

	if (zfs_is_readonly(zfsvfs)) {
		zfs_exit(zfsvfs, FTAG);
		return (SET_ERROR(EROFS));
	}

	if (off >= zp->z_size) {
		zfs_exit(zfsvfs, FTAG);
		return (0);
	}
	if (len == 0 || len > zp->z_size - off)
		len = zp->z_size - off;

	/* Flush any mmap()'d data to disk */
	if (zn_has_cached_data(zp, off, off + len - 1))
		zn_flush_cached_data(zp, B_TRUE);

	zfs_locked_range_t *lr;
	lr = zfs_rangelock_enter(&zp->z_rangelock, off, len, RL_WRITER);

	const uint64_t uid = KUID_TO_SUID(ZTOUID(zp));
	const uint64_t gid = KGID_TO_SGID(ZTOGID(zp));
	const uint64_t projid = zp->z_projid;

	dmu_buf_impl_t *db = (dmu_buf_impl_t *)sa_get_db(zp->z_sa_hdl);
	DB_DNODE_ENTER(db);
	dnode_t *dn = DB_DNODE(db);

	uint64_t n, noff = off, nr = 0, nw = 0;
	while (len > 0) {
		/*
		 * Rewrite only actual data, skipping any holes. This might
		 * be inaccurate for dirty files, but we don't really care.
		 */
		if (noff == off) {
			/* Find next data in the file. */
			error = dnode_next_offset(dn, 0, &noff, 1, 1, 0);
			if (error || noff >= off + len) {
				if (error == ESRCH) /* No more data. */
					error = 0;
				break;
			}
			ASSERT3U(noff, >=, off);
			len -= noff - off;
			off = noff;

			/* Find where the data ends. */
			error = dnode_next_offset(dn, DNODE_FIND_HOLE, &noff,
			    1, 1, 0);
			if (error != 0)
				noff = off + len;
		}
		ASSERT3U(noff, >, off);

		if (zfs_id_overblockquota(zfsvfs, DMU_USERUSED_OBJECT, uid) ||
		    zfs_id_overblockquota(zfsvfs, DMU_GROUPUSED_OBJECT, gid) ||
		    (projid != ZFS_DEFAULT_PROJID &&
		    zfs_id_overblockquota(zfsvfs, DMU_PROJECTUSED_OBJECT,
		    projid))) {
			error = SET_ERROR(EDQUOT);
			break;
		}

		n = MIN(MIN(len, noff - off),
		    DMU_MAX_ACCESS / 2 - P2PHASE(off, zp->z_blksz));

		dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os);
		dmu_tx_hold_write_by_dnode(tx, dn, off, n);
		error = dmu_tx_assign(tx, DMU_TX_WAIT);
		if (error) {
			dmu_tx_abort(tx);
			break;
		}

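		/*
		 * Dirtying the dbufs below without modifying their contents
		 * causes the sync phase to write the same data out again at
		 * newly allocated locations, picking up the dataset's current
		 * settings (e.g. compression or checksum), which is the point
		 * of zfs_rewrite().
		 */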
		/* Mark all dbufs within range as dirty to trigger rewrite. */
		dmu_buf_t **dbp;
		int numbufs;
		error = dmu_buf_hold_array_by_dnode(dn, off, n, TRUE, FTAG,
		    &numbufs, &dbp, DMU_READ_PREFETCH | DMU_UNCACHEDIO);
		if (error) {
			dmu_tx_commit(tx);
			break;
		}
		for (int i = 0; i < numbufs; i++) {
			nr += dbp[i]->db_size;
			if (dmu_buf_is_dirty(dbp[i], tx))
				continue;
			nw += dbp[i]->db_size;
			dmu_buf_will_dirty(dbp[i], tx);
		}
		dmu_buf_rele_array(dbp, numbufs, FTAG);

		dmu_tx_commit(tx);

		len -= n;
		off += n;

		if (issig()) {
			error = SET_ERROR(EINTR);
			break;
		}
	}

	DB_DNODE_EXIT(db);

	dataset_kstats_update_read_kstats(&zfsvfs->z_kstat, nr);
	dataset_kstats_update_write_kstats(&zfsvfs->z_kstat, nw);

	zfs_rangelock_exit(lr);
	zfs_exit(zfsvfs, FTAG);
	return (error);
}

int
zfs_getsecattr(znode_t *zp, vsecattr_t *vsecp, int flag, cred_t *cr)
{
	zfsvfs_t *zfsvfs = ZTOZSB(zp);
	int error;
	boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;

	if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
		return (error);
	error = zfs_getacl(zp, vsecp, skipaclchk, cr);
	zfs_exit(zfsvfs, FTAG);

	return (error);
}

int
zfs_setsecattr(znode_t *zp, vsecattr_t *vsecp, int flag, cred_t *cr)
{
	zfsvfs_t *zfsvfs = ZTOZSB(zp);
	int error;
	boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
	zilog_t *zilog;

	if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
		return (error);
	zilog = zfsvfs->z_log;
	error = zfs_setacl(zp, vsecp, skipaclchk, cr);

	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
		zil_commit(zilog, 0);

	zfs_exit(zfsvfs, FTAG);
	return (error);
}

/*
 * Get the optimal alignment to ensure direct IO can be performed without
 * incurring any RMW penalty on write. If direct IO is not enabled for this
 * file, returns an error.
 */
int
zfs_get_direct_alignment(znode_t *zp, uint64_t *alignp)
{
	zfsvfs_t *zfsvfs = ZTOZSB(zp);

	if (!zfs_dio_enabled || zfsvfs->z_os->os_direct == ZFS_DIRECT_DISABLED)
		return (SET_ERROR(EOPNOTSUPP));

	/*
	 * If the file has multiple blocks, then its block size is fixed
	 * forever, and so is the ideal alignment.
	 *
	 * If however it only has a single block, then we want to return the
	 * max block size it could possibly grow to (ie, the dataset
	 * recordsize). We do this so that a program querying alignment
	 * immediately after the file is created gets a value that won't change
	 * once the file has grown into the second block and beyond.
	 *
	 * Because we don't have a count of blocks easily available here, we
	 * check if the apparent file size is smaller than its current block
	 * size (meaning, the file hasn't yet grown into the current block
	 * size) and then check if the block size is smaller than the dataset
	 * maximum (meaning, if the file grew past the current block size, the
	 * block size would be increased).
	 */
	if (zp->z_size <= zp->z_blksz && zp->z_blksz < zfsvfs->z_max_blksz)
		*alignp = MAX(zfsvfs->z_max_blksz, PAGE_SIZE);
	else
		*alignp = MAX(zp->z_blksz, PAGE_SIZE);

	return (0);
}

#ifdef ZFS_DEBUG
static int zil_fault_io = 0;
#endif

static void zfs_get_done(zgd_t *zgd, int error);

/*
 * Get data to generate a TX_WRITE intent log record.
 */
int
zfs_get_data(void *arg, uint64_t gen, lr_write_t *lr, char *buf,
    struct lwb *lwb, zio_t *zio)
{
	zfsvfs_t *zfsvfs = arg;
	objset_t *os = zfsvfs->z_os;
	znode_t *zp;
	uint64_t object = lr->lr_foid;
	uint64_t offset = lr->lr_offset;
	uint64_t size = lr->lr_length;
	zgd_t *zgd;
	int error = 0;
	uint64_t zp_gen;

	ASSERT3P(lwb, !=, NULL);
	ASSERT3U(size, !=, 0);

	/*
	 * Nothing to do if the file has been removed
	 */
	if (zfs_zget(zfsvfs, object, &zp) != 0)
		return (SET_ERROR(ENOENT));
	if (zp->z_unlinked) {
		/*
		 * Release the vnode asynchronously as we currently have the
		 * txg stopped from syncing.
		 */
		zfs_zrele_async(zp);
		return (SET_ERROR(ENOENT));
	}
	/* check if generation number matches */
	if (sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs), &zp_gen,
	    sizeof (zp_gen)) != 0) {
		zfs_zrele_async(zp);
		return (SET_ERROR(EIO));
	}
	if (zp_gen != gen) {
		zfs_zrele_async(zp);
		return (SET_ERROR(ENOENT));
	}

	zgd = kmem_zalloc(sizeof (zgd_t), KM_SLEEP);
	zgd->zgd_lwb = lwb;
	zgd->zgd_private = zp;

	/*
	 * Write records come in two flavors: immediate and indirect.
	 * For small writes it's cheaper to store the data with the
	 * log record (immediate); for large writes it's cheaper to
	 * sync the data and get a pointer to it (indirect) so that
	 * we don't have to write the data twice.
	 */
	if (buf != NULL) { /* immediate write */
		zgd->zgd_lr = zfs_rangelock_enter(&zp->z_rangelock, offset,
		    size, RL_READER);
		/* test for truncation needs to be done while range locked */
		if (offset >= zp->z_size) {
			error = SET_ERROR(ENOENT);
		} else {
			error = dmu_read(os, object, offset, size, buf,
			    DMU_READ_NO_PREFETCH | DMU_KEEP_CACHING);
		}
		ASSERT(error == 0 || error == ENOENT);
	} else { /* indirect write */
		ASSERT3P(zio, !=, NULL);
		/*
		 * Have to lock the whole block to ensure when it's
		 * written out and its checksum is being calculated
		 * that no one can change the data. We need to re-check
		 * blocksize after we get the lock in case it's changed!
		 */
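		/*
		 * For example, if another thread grows the file's block size
		 * from 4K to 128K after z_blksz is sampled below but before
		 * the range lock is taken, the post-lock check sees the
		 * mismatch and the loop retries with the new block size.
		 */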
		for (;;) {
			uint64_t blkoff;
			size = zp->z_blksz;
			blkoff = ISP2(size) ? P2PHASE(offset, size) : offset;
			offset -= blkoff;
			zgd->zgd_lr = zfs_rangelock_enter(&zp->z_rangelock,
			    offset, size, RL_READER);
			if (zp->z_blksz == size)
				break;
			offset += blkoff;
			zfs_rangelock_exit(zgd->zgd_lr);
		}
		/* test for truncation needs to be done while range locked */
		if (lr->lr_offset >= zp->z_size)
			error = SET_ERROR(ENOENT);
#ifdef ZFS_DEBUG
		if (zil_fault_io) {
			error = SET_ERROR(EIO);
			zil_fault_io = 0;
		}
#endif

		dmu_buf_t *dbp;
		if (error == 0)
			error = dmu_buf_hold_noread(os, object, offset, zgd,
			    &dbp);

		if (error == 0) {
			zgd->zgd_db = dbp;
			dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbp;
			boolean_t direct_write = B_FALSE;
			mutex_enter(&db->db_mtx);
			dbuf_dirty_record_t *dr =
			    dbuf_find_dirty_eq(db, lr->lr_common.lrc_txg);
			if (dr != NULL && dr->dt.dl.dr_diowrite)
				direct_write = B_TRUE;
			mutex_exit(&db->db_mtx);

			/*
			 * All Direct I/O writes will have already completed
			 * and the block pointer can be immediately stored in
			 * the log record.
			 */
			if (direct_write) {
				/*
				 * A Direct I/O write always covers an entire
				 * block.
				 */
				ASSERT3U(dbp->db_size, ==, zp->z_blksz);
				lr->lr_blkptr = dr->dt.dl.dr_overridden_by;
				zfs_get_done(zgd, 0);
				return (0);
			}

			blkptr_t *bp = &lr->lr_blkptr;
			zgd->zgd_bp = bp;

			ASSERT3U(dbp->db_offset, ==, offset);
			ASSERT3U(dbp->db_size, ==, size);

			error = dmu_sync(zio, lr->lr_common.lrc_txg,
			    zfs_get_done, zgd);
			ASSERT(error || lr->lr_length <= size);

			/*
			 * On success, we need to wait for the write I/O
			 * initiated by dmu_sync() to complete before we can
			 * release this dbuf. We will finish everything up
			 * in the zfs_get_done() callback.
			 */
			if (error == 0)
				return (0);

			if (error == EALREADY) {
				lr->lr_common.lrc_txtype = TX_WRITE2;
				/*
				 * TX_WRITE2 relies on the data previously
				 * written by the TX_WRITE that caused
				 * EALREADY. We zero out the BP because
				 * it is the old, currently-on-disk BP.
				 */
				zgd->zgd_bp = NULL;
				BP_ZERO(bp);
				error = 0;
			}
		}
	}

	zfs_get_done(zgd, error);

	return (error);
}

static void
zfs_get_done(zgd_t *zgd, int error)
{
	(void) error;
	znode_t *zp = zgd->zgd_private;

	if (zgd->zgd_db)
		dmu_buf_rele(zgd->zgd_db, zgd);

	zfs_rangelock_exit(zgd->zgd_lr);

	/*
	 * Release the vnode asynchronously as we currently have the
	 * txg stopped from syncing.
	 */
	zfs_zrele_async(zp);

	kmem_free(zgd, sizeof (zgd_t));
}

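/*
 * The remainder of this file implements block cloning: zfs_enter_two() and
 * zfs_exit_two() handle entering and exiting two (possibly different)
 * filesystems, zfs_clone_range() performs the clone itself via
 * dmu_brt_clone(), and zfs_clone_range_replay() replays TX_CLONE_RANGE
 * records from the ZIL.
 */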
static int
zfs_enter_two(zfsvfs_t *zfsvfs1, zfsvfs_t *zfsvfs2, const char *tag)
{
	int error;

	/* Swap. Not sure if the order of zfs_enter()s is important. */
	if (zfsvfs1 > zfsvfs2) {
		zfsvfs_t *tmpzfsvfs;

		tmpzfsvfs = zfsvfs2;
		zfsvfs2 = zfsvfs1;
		zfsvfs1 = tmpzfsvfs;
	}

	error = zfs_enter(zfsvfs1, tag);
	if (error != 0)
		return (error);
	if (zfsvfs1 != zfsvfs2) {
		error = zfs_enter(zfsvfs2, tag);
		if (error != 0) {
			zfs_exit(zfsvfs1, tag);
			return (error);
		}
	}

	return (0);
}

static void
zfs_exit_two(zfsvfs_t *zfsvfs1, zfsvfs_t *zfsvfs2, const char *tag)
{

	zfs_exit(zfsvfs1, tag);
	if (zfsvfs1 != zfsvfs2)
		zfs_exit(zfsvfs2, tag);
}

/*
 * We split each clone request into chunks that can fit into a single ZIL
 * log entry. Each ZIL log entry can fit 130816 bytes for a block cloning
 * operation (see zil_max_log_data() and zfs_log_clone_range()). This gives
 * us room for storing 1022 block pointers.
 *
 * On success, the function returns the number of bytes copied in *lenp.
 * Note, it doesn't return how many bytes are left to be copied.
 * Errors caused by file system limitations or BRT limitations are reported
 * as EINVAL. In most such cases the user requested bad parameters: it might
 * be possible to clone the file, but some parameters don't match the
 * requirements.
 */
int
zfs_clone_range(znode_t *inzp, uint64_t *inoffp, znode_t *outzp,
    uint64_t *outoffp, uint64_t *lenp, cred_t *cr)
{
	zfsvfs_t *inzfsvfs, *outzfsvfs;
	objset_t *inos, *outos;
	zfs_locked_range_t *inlr, *outlr;
	dmu_buf_impl_t *db;
	dmu_tx_t *tx;
	zilog_t *zilog;
	uint64_t inoff, outoff, len, done;
	uint64_t outsize, size;
	int error;
	int count = 0;
	sa_bulk_attr_t bulk[3];
	uint64_t mtime[2], ctime[2];
	uint64_t uid, gid, projid;
	blkptr_t *bps;
	size_t maxblocks, nbps;
	uint_t inblksz;
	uint64_t clear_setid_bits_txg = 0;
	uint64_t last_synced_txg = 0;

	inoff = *inoffp;
	outoff = *outoffp;
	len = *lenp;
	done = 0;

	inzfsvfs = ZTOZSB(inzp);
	outzfsvfs = ZTOZSB(outzp);

	/*
	 * We need to call zfs_enter() potentially on two different datasets,
	 * so we need a dedicated function for that.
	 */
	error = zfs_enter_two(inzfsvfs, outzfsvfs, FTAG);
	if (error != 0)
		return (error);

	inos = inzfsvfs->z_os;
	outos = outzfsvfs->z_os;

	/*
	 * Both source and destination have to belong to the same storage pool.
	 */
	if (dmu_objset_spa(inos) != dmu_objset_spa(outos)) {
		zfs_exit_two(inzfsvfs, outzfsvfs, FTAG);
		return (SET_ERROR(EXDEV));
	}

	/*
	 * outos and inos belong to the same storage pool (checked just
	 * above), so only one feature check is needed.
	 */
	if (!spa_feature_is_enabled(dmu_objset_spa(outos),
	    SPA_FEATURE_BLOCK_CLONING)) {
		zfs_exit_two(inzfsvfs, outzfsvfs, FTAG);
		return (SET_ERROR(EOPNOTSUPP));
	}

	ASSERT(!outzfsvfs->z_replay);

	/*
	 * Block cloning from an unencrypted dataset into an encrypted
	 * dataset and vice versa is not supported.
	 */
	if (inos->os_encrypted != outos->os_encrypted) {
		zfs_exit_two(inzfsvfs, outzfsvfs, FTAG);
		return (SET_ERROR(EXDEV));
	}

	/*
	 * Cloning across encrypted datasets is possible only if they
	 * share the same master key.
	 */
	if (inos != outos && inos->os_encrypted &&
	    !dmu_objset_crypto_key_equal(inos, outos)) {
		zfs_exit_two(inzfsvfs, outzfsvfs, FTAG);
		return (SET_ERROR(EXDEV));
	}

	error = zfs_verify_zp(inzp);
	if (error == 0)
		error = zfs_verify_zp(outzp);
	if (error != 0) {
		zfs_exit_two(inzfsvfs, outzfsvfs, FTAG);
		return (error);
	}

	/*
	 * We don't copy the source file's flags, which is why we don't allow
	 * cloning of files that are in quarantine.
	 */
	if (inzp->z_pflags & ZFS_AV_QUARANTINED) {
		zfs_exit_two(inzfsvfs, outzfsvfs, FTAG);
		return (SET_ERROR(EACCES));
	}

	if (inoff >= inzp->z_size) {
		*lenp = 0;
		zfs_exit_two(inzfsvfs, outzfsvfs, FTAG);
		return (0);
	}
	if (len > inzp->z_size - inoff) {
		len = inzp->z_size - inoff;
	}
	if (len == 0) {
		*lenp = 0;
		zfs_exit_two(inzfsvfs, outzfsvfs, FTAG);
		return (0);
	}

	/*
	 * Callers might not be able to detect properly that we are read-only,
	 * so check it explicitly here.
	 */
	if (zfs_is_readonly(outzfsvfs)) {
		zfs_exit_two(inzfsvfs, outzfsvfs, FTAG);
		return (SET_ERROR(EROFS));
	}

	/*
	 * If immutable or not appending then return EPERM.
	 * Intentionally allow ZFS_READONLY through here.
	 * See zfs_zaccess_common()
	 */
	if ((outzp->z_pflags & ZFS_IMMUTABLE) != 0) {
		zfs_exit_two(inzfsvfs, outzfsvfs, FTAG);
		return (SET_ERROR(EPERM));
	}

	/*
	 * No overlapping if we are cloning within the same file.
	 */
	if (inzp == outzp) {
		if (inoff < outoff + len && outoff < inoff + len) {
			zfs_exit_two(inzfsvfs, outzfsvfs, FTAG);
			return (SET_ERROR(EINVAL));
		}
	}

	/* Flush any mmap()'d data to disk */
	if (zn_has_cached_data(inzp, inoff, inoff + len - 1))
		zn_flush_cached_data(inzp, B_TRUE);

	/*
	 * Maintain predictable lock order.
	 */
	if (inzp < outzp || (inzp == outzp && inoff < outoff)) {
		inlr = zfs_rangelock_enter(&inzp->z_rangelock, inoff, len,
		    RL_READER);
		outlr = zfs_rangelock_enter(&outzp->z_rangelock, outoff, len,
		    RL_WRITER);
	} else {
		outlr = zfs_rangelock_enter(&outzp->z_rangelock, outoff, len,
		    RL_WRITER);
		inlr = zfs_rangelock_enter(&inzp->z_rangelock, inoff, len,
		    RL_READER);
	}

	inblksz = inzp->z_blksz;

	/*
	 * We cannot clone into a file with a different block size if we can't
	 * grow it (block size is already bigger, has more than one block, or
	 * not locked for growth). There are other possible reasons for the
	 * grow to fail, but we cover what we can before opening the
	 * transaction and detect the rest after we try to do it.
	 */
	if (inblksz < outzp->z_blksz) {
		error = SET_ERROR(EINVAL);
		goto unlock;
	}
	if (inblksz != outzp->z_blksz && (outzp->z_size > outzp->z_blksz ||
	    outlr->lr_length != UINT64_MAX)) {
		error = SET_ERROR(EINVAL);
		goto unlock;
	}

	/*
	 * Block size must be power-of-2 if destination offset != 0.
	 * There can be no multiple blocks of non-power-of-2 size.
	 */
	if (outoff != 0 && !ISP2(inblksz)) {
		error = SET_ERROR(EINVAL);
		goto unlock;
	}

	/*
	 * Offsets and len must be at block boundaries.
	 */
	if ((inoff % inblksz) != 0 || (outoff % inblksz) != 0) {
		error = SET_ERROR(EINVAL);
		goto unlock;
	}
	/*
	 * Length must be a multiple of blksz, except for the end of the file.
	 */
	if ((len % inblksz) != 0 &&
	    (len < inzp->z_size - inoff || len < outzp->z_size - outoff)) {
		error = SET_ERROR(EINVAL);
		goto unlock;
	}

	/*
	 * If we are copying only one block and it is smaller than the
	 * recordsize property, do not allow the destination to grow beyond
	 * one block if it is not there yet. Otherwise the destination will
	 * get stuck with that block size forever, which can be as small as
	 * 512 bytes, no matter how big the destination grows later.
	 */
	if (len <= inblksz && inblksz < outzfsvfs->z_max_blksz &&
	    outzp->z_size <= inblksz && outoff + len > inblksz) {
		error = SET_ERROR(EINVAL);
		goto unlock;
	}

	error = zn_rlimit_fsize(outoff + len);
	if (error != 0) {
		goto unlock;
	}

	if (inoff >= MAXOFFSET_T || outoff >= MAXOFFSET_T) {
		error = SET_ERROR(EFBIG);
		goto unlock;
	}

	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(outzfsvfs), NULL,
	    &mtime, 16);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(outzfsvfs), NULL,
	    &ctime, 16);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(outzfsvfs), NULL,
	    &outzp->z_size, 8);

	zilog = outzfsvfs->z_log;
	maxblocks = zil_max_log_data(zilog, sizeof (lr_clone_range_t)) /
	    sizeof (bps[0]);

	uid = KUID_TO_SUID(ZTOUID(outzp));
	gid = KGID_TO_SGID(ZTOGID(outzp));
	projid = outzp->z_projid;

	bps = vmem_alloc(sizeof (bps[0]) * maxblocks, KM_SLEEP);

	/*
	 * Clone the file in reasonable size chunks. Each chunk is cloned
	 * in a separate transaction; this keeps the intent log records small
	 * and allows us to do more fine-grained space accounting.
	 */
	while (len > 0) {
		size = MIN(inblksz * maxblocks, len);

		if (zfs_id_overblockquota(outzfsvfs, DMU_USERUSED_OBJECT,
		    uid) ||
		    zfs_id_overblockquota(outzfsvfs, DMU_GROUPUSED_OBJECT,
		    gid) ||
		    (projid != ZFS_DEFAULT_PROJID &&
		    zfs_id_overblockquota(outzfsvfs, DMU_PROJECTUSED_OBJECT,
		    projid))) {
			error = SET_ERROR(EDQUOT);
			break;
		}

		nbps = maxblocks;
		last_synced_txg = spa_last_synced_txg(dmu_objset_spa(inos));
		error = dmu_read_l0_bps(inos, inzp->z_id, inoff, size, bps,
		    &nbps);
		if (error != 0) {
			/*
			 * If we are trying to clone a block that was created
			 * in the current transaction group, the error will be
			 * EAGAIN here. Based on zfs_bclone_wait_dirty either
			 * return a shortened range to the caller so it can
			 * fall back, or wait for the next TXG and check again.
			 */
			if (error == EAGAIN && zfs_bclone_wait_dirty) {
				txg_wait_flag_t wait_flags =
				    spa_get_failmode(dmu_objset_spa(inos)) ==
				    ZIO_FAILURE_MODE_CONTINUE ?
				    TXG_WAIT_SUSPEND : 0;
				error = txg_wait_synced_flags(
				    dmu_objset_pool(inos), last_synced_txg + 1,
				    wait_flags);
				if (error == 0)
					continue;
				ASSERT3U(error, ==, ESHUTDOWN);
				error = SET_ERROR(EIO);
			}

			break;
		}

		/*
		 * Start a transaction.
		 */
		tx = dmu_tx_create(outos);
		dmu_tx_hold_sa(tx, outzp->z_sa_hdl, B_FALSE);
		db = (dmu_buf_impl_t *)sa_get_db(outzp->z_sa_hdl);
		DB_DNODE_ENTER(db);
		dmu_tx_hold_clone_by_dnode(tx, DB_DNODE(db), outoff, size,
		    inblksz);
		DB_DNODE_EXIT(db);
		zfs_sa_upgrade_txholds(tx, outzp);
		error = dmu_tx_assign(tx, DMU_TX_WAIT);
		if (error != 0) {
			dmu_tx_abort(tx);
			break;
		}

		/*
		 * Copy the source znode's block size. This is done only if the
		 * whole znode is locked (see zfs_rangelock_cb()) and only
		 * on the first iteration since zfs_rangelock_reduce() will
		 * shrink down lr_length to the appropriate size.
		 */
		if (outlr->lr_length == UINT64_MAX) {
			zfs_grow_blocksize(outzp, inblksz, tx);

			/*
			 * Block growth may fail for many reasons we cannot
			 * predict here. If it happens, the cloning is doomed.
			 */
			if (inblksz != outzp->z_blksz) {
				error = SET_ERROR(EINVAL);
				dmu_tx_commit(tx);
				break;
			}

			/*
			 * Round the range lock up to the block boundary, so we
			 * prevent appends until we are done.
			 */
			zfs_rangelock_reduce(outlr, outoff,
			    ((len - 1) / inblksz + 1) * inblksz);
		}

		error = dmu_brt_clone(outos, outzp->z_id, outoff, size, tx,
		    bps, nbps);
		if (error != 0) {
			dmu_tx_commit(tx);
			break;
		}

		if (zn_has_cached_data(outzp, outoff, outoff + size - 1)) {
			update_pages(outzp, outoff, size, outos);
		}

		zfs_clear_setid_bits_if_necessary(outzfsvfs, outzp, cr,
		    &clear_setid_bits_txg, tx);

		zfs_tstamp_update_setup(outzp, CONTENT_MODIFIED, mtime, ctime);

		/*
		 * Update the file size (zp_size) if it has changed;
		 * account for possible concurrent updates.
		 */
		while ((outsize = outzp->z_size) < outoff + size) {
			(void) atomic_cas_64(&outzp->z_size, outsize,
			    outoff + size);
		}

		error = sa_bulk_update(outzp->z_sa_hdl, bulk, count, tx);

		zfs_log_clone_range(zilog, tx, TX_CLONE_RANGE, outzp, outoff,
		    size, inblksz, bps, nbps);

		dmu_tx_commit(tx);

		if (error != 0)
			break;

		inoff += size;
		outoff += size;
		len -= size;
		done += size;

		if (issig()) {
			error = SET_ERROR(EINTR);
			break;
		}
	}

	vmem_free(bps, sizeof (bps[0]) * maxblocks);
	zfs_znode_update_vfs(outzp);

unlock:
	zfs_rangelock_exit(outlr);
	zfs_rangelock_exit(inlr);

	if (done > 0) {
		/*
		 * If we have made at least partial progress, reset the error.
		 */
		error = 0;

		ZFS_ACCESSTIME_STAMP(inzfsvfs, inzp);

		if (outos->os_sync == ZFS_SYNC_ALWAYS) {
			zil_commit(zilog, outzp->z_id);
		}

		*inoffp += done;
		*outoffp += done;
		*lenp = done;
	} else {
		/*
		 * If we made no progress, there must be a good reason.
		 * EOF is handled explicitly above, before the loop.
		 */
		ASSERT3S(error, !=, 0);
	}

	zfs_exit_two(inzfsvfs, outzfsvfs, FTAG);

	return (error);
}

/*
 * The usual pattern would be to call zfs_clone_range() from
 * zfs_replay_clone(), but we cannot do that, because when replaying we don't
 * have the source znode available. This is why we need a dedicated replay
 * function.
 */
int
zfs_clone_range_replay(znode_t *zp, uint64_t off, uint64_t len, uint64_t blksz,
    const blkptr_t *bps, size_t nbps)
{
	zfsvfs_t *zfsvfs;
	dmu_buf_impl_t *db;
	dmu_tx_t *tx;
	int error;
	int count = 0;
	sa_bulk_attr_t bulk[3];
	uint64_t mtime[2], ctime[2];

	ASSERT3U(off, <, MAXOFFSET_T);
	ASSERT3U(len, >, 0);
	ASSERT3U(nbps, >, 0);

	zfsvfs = ZTOZSB(zp);

	ASSERT(spa_feature_is_enabled(dmu_objset_spa(zfsvfs->z_os),
	    SPA_FEATURE_BLOCK_CLONING));

	if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
		return (error);

	ASSERT(zfsvfs->z_replay);
	ASSERT(!zfs_is_readonly(zfsvfs));

	if ((off % blksz) != 0) {
		zfs_exit(zfsvfs, FTAG);
		return (SET_ERROR(EINVAL));
	}

	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
	    &zp->z_size, 8);

	/*
	 * Start a transaction.
	 */
	tx = dmu_tx_create(zfsvfs->z_os);

	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
	db = (dmu_buf_impl_t *)sa_get_db(zp->z_sa_hdl);
	DB_DNODE_ENTER(db);
	dmu_tx_hold_clone_by_dnode(tx, DB_DNODE(db), off, len, blksz);
	DB_DNODE_EXIT(db);
	zfs_sa_upgrade_txholds(tx, zp);
	error = dmu_tx_assign(tx, DMU_TX_WAIT);
	if (error != 0) {
		dmu_tx_abort(tx);
		zfs_exit(zfsvfs, FTAG);
		return (error);
	}

	if (zp->z_blksz < blksz)
		zfs_grow_blocksize(zp, blksz, tx);

	dmu_brt_clone(zfsvfs->z_os, zp->z_id, off, len, tx, bps, nbps);

	zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime);

	if (zp->z_size < off + len)
		zp->z_size = off + len;

	error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);

	/*
	 * zil_replaying() not only checks whether we are replaying the ZIL,
	 * but also updates the ZIL header to record replay progress.
	 */
	VERIFY(zil_replaying(zfsvfs->z_log, tx));

	dmu_tx_commit(tx);

	zfs_znode_update_vfs(zp);

	zfs_exit(zfsvfs, FTAG);

	return (error);
}

EXPORT_SYMBOL(zfs_access);
EXPORT_SYMBOL(zfs_fsync);
EXPORT_SYMBOL(zfs_holey);
EXPORT_SYMBOL(zfs_read);
EXPORT_SYMBOL(zfs_write);
EXPORT_SYMBOL(zfs_getsecattr);
EXPORT_SYMBOL(zfs_setsecattr);
EXPORT_SYMBOL(zfs_clone_range);
EXPORT_SYMBOL(zfs_clone_range_replay);

ZFS_MODULE_PARAM(zfs_vnops, zfs_vnops_, read_chunk_size, U64, ZMOD_RW,
	"Bytes to read per chunk");

ZFS_MODULE_PARAM(zfs, zfs_, bclone_enabled, INT, ZMOD_RW,
	"Enable block cloning");

ZFS_MODULE_PARAM(zfs, zfs_, bclone_wait_dirty, INT, ZMOD_RW,
	"Wait for dirty blocks when cloning");

ZFS_MODULE_PARAM(zfs, zfs_, dio_enabled, INT, ZMOD_RW,
	"Enable Direct I/O");

ZFS_MODULE_PARAM(zfs, zfs_, dio_strict, INT, ZMOD_RW,
	"Return errors on misaligned Direct I/O");