xref: /freebsd/sys/contrib/openzfs/module/zfs/zfs_vnops.c (revision 61145dc2b94f12f6a47344fb9aac702321880e43)
1 // SPDX-License-Identifier: CDDL-1.0
2 /*
3  * CDDL HEADER START
4  *
5  * The contents of this file are subject to the terms of the
6  * Common Development and Distribution License (the "License").
7  * You may not use this file except in compliance with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or https://opensource.org/licenses/CDDL-1.0.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 
23 /*
24  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
25  * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
26  * Copyright (c) 2015 by Chunwei Chen. All rights reserved.
27  * Copyright 2017 Nexenta Systems, Inc.
28  * Copyright (c) 2021, 2022 by Pawel Jakub Dawidek
29  * Copyright (c) 2025, Rob Norris <robn@despairlabs.com>
30  */
31 
32 /* Portions Copyright 2007 Jeremy Teo */
33 /* Portions Copyright 2010 Robert Milkowski */
34 
35 #include <sys/types.h>
36 #include <sys/param.h>
37 #include <sys/time.h>
38 #include <sys/sysmacros.h>
39 #include <sys/vfs.h>
40 #include <sys/file.h>
41 #include <sys/stat.h>
42 #include <sys/kmem.h>
43 #include <sys/cmn_err.h>
44 #include <sys/errno.h>
45 #include <sys/zfs_dir.h>
46 #include <sys/zfs_acl.h>
47 #include <sys/zfs_ioctl.h>
48 #include <sys/fs/zfs.h>
49 #include <sys/dmu.h>
50 #include <sys/dmu_objset.h>
51 #include <sys/dsl_crypt.h>
52 #include <sys/spa.h>
53 #include <sys/txg.h>
54 #include <sys/dbuf.h>
55 #include <sys/policy.h>
56 #include <sys/zfeature.h>
57 #include <sys/zfs_vnops.h>
58 #include <sys/zfs_quota.h>
59 #include <sys/zfs_vfsops.h>
60 #include <sys/zfs_znode.h>
61 
62 /*
63  * Enables access to the block cloning feature. If this setting is 0, then even
64  * if feature@block_cloning is enabled, using functions and system calls that
65  * attempt to clone blocks will act as though the feature is disabled.
66  */
67 int zfs_bclone_enabled = 1;
68 
69 /*
70  * When set zfs_clone_range() waits for dirty data to be written to disk.
71  * This allows the clone operation to reliably succeed when a file is modified
72  * and then immediately cloned. For small files this may be slower than making
73  * a copy of the file and is therefore not the default.  However, in certain
74  * scenarios this behavior may be desirable so a tunable is provided.
75  */
76 int zfs_bclone_wait_dirty = 0;
77 
78 /*
79  * Enable Direct I/O. If this setting is 0, then all I/O requests will be
80  * directed through the ARC, acting as though the dataset property direct was
81  * set to disabled.
82  *
83  * Disabled by default on FreeBSD until a potential range locking issue in
84  * zfs_getpages() can be resolved.
85  */
86 #ifdef __FreeBSD__
87 static int zfs_dio_enabled = 0;
88 #else
89 static int zfs_dio_enabled = 1;
90 #endif
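
/*
 * Note: zfs_bclone_enabled, zfs_bclone_wait_dirty and zfs_dio_enabled are
 * exported as tunables via the ZFS_MODULE_PARAM() declarations at the end of
 * this file.  On Linux they typically appear under
 * /sys/module/zfs/parameters/ and on FreeBSD as vfs.zfs.* sysctls.
 */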
91 
92 
93 /*
94  * Maximum bytes to read per chunk in zfs_read().
95  */
96 static uint64_t zfs_vnops_read_chunk_size = 1024 * 1024;
97 
98 int
99 zfs_fsync(znode_t *zp, int syncflag, cred_t *cr)
100 {
101 	int error = 0;
102 	zfsvfs_t *zfsvfs = ZTOZSB(zp);
103 
104 	if (zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED) {
105 		if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
106 			return (error);
107 		atomic_inc_32(&zp->z_sync_writes_cnt);
108 		zil_commit(zfsvfs->z_log, zp->z_id);
109 		atomic_dec_32(&zp->z_sync_writes_cnt);
110 		zfs_exit(zfsvfs, FTAG);
111 	}
112 	return (error);
113 }
114 
115 
116 #if defined(SEEK_HOLE) && defined(SEEK_DATA)
117 /*
118  * Lseek support for finding holes (cmd == SEEK_HOLE) and
119  * data (cmd == SEEK_DATA). "off" is an in/out parameter.
120  */
121 static int
122 zfs_holey_common(znode_t *zp, ulong_t cmd, loff_t *off)
123 {
124 	zfs_locked_range_t *lr;
125 	uint64_t noff = (uint64_t)*off; /* new offset */
126 	uint64_t file_sz;
127 	int error;
128 	boolean_t hole;
129 
130 	file_sz = zp->z_size;
131 	if (noff >= file_sz)  {
132 		return (SET_ERROR(ENXIO));
133 	}
134 
135 	if (cmd == F_SEEK_HOLE)
136 		hole = B_TRUE;
137 	else
138 		hole = B_FALSE;
139 
140 	/* Flush any mmap()'d data to disk */
141 	if (zn_has_cached_data(zp, 0, file_sz - 1))
142 		zn_flush_cached_data(zp, B_TRUE);
143 
144 	lr = zfs_rangelock_enter(&zp->z_rangelock, 0, UINT64_MAX, RL_READER);
145 	error = dmu_offset_next(ZTOZSB(zp)->z_os, zp->z_id, hole, &noff);
146 	zfs_rangelock_exit(lr);
147 
148 	if (error == ESRCH)
149 		return (SET_ERROR(ENXIO));
150 
151 	/* File was dirty, so fall back to using generic logic */
152 	if (error == EBUSY) {
153 		if (hole)
154 			*off = file_sz;
155 
156 		return (0);
157 	}
158 
159 	/*
160 	 * We could find a hole that begins after the logical end-of-file,
161 	 * because dmu_offset_next() only works on whole blocks.  If the
162 	 * EOF falls mid-block, then indicate that the "virtual hole"
163 	 * at the end of the file begins at the logical EOF, rather than
164 	 * at the end of the last block.
165 	 */
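	/*
	 * For example (illustrative numbers): with a 128K block size and a
	 * logical file size of 130000 bytes, dmu_offset_next() reports the
	 * hole at offset 131072 (the end of the last block); the clamp below
	 * moves it back to the logical EOF at 130000.
	 */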
166 	if (noff > file_sz) {
167 		ASSERT(hole);
168 		noff = file_sz;
169 	}
170 
171 	if (noff < *off)
172 		return (error);
173 	*off = noff;
174 	return (error);
175 }
176 
177 int
178 zfs_holey(znode_t *zp, ulong_t cmd, loff_t *off)
179 {
180 	zfsvfs_t *zfsvfs = ZTOZSB(zp);
181 	int error;
182 
183 	if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
184 		return (error);
185 
186 	error = zfs_holey_common(zp, cmd, off);
187 
188 	zfs_exit(zfsvfs, FTAG);
189 	return (error);
190 }
191 #endif /* SEEK_HOLE && SEEK_DATA */
192 
193 int
194 zfs_access(znode_t *zp, int mode, int flag, cred_t *cr)
195 {
196 	zfsvfs_t *zfsvfs = ZTOZSB(zp);
197 	int error;
198 
199 	if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
200 		return (error);
201 
202 	if (flag & V_ACE_MASK)
203 #if defined(__linux__)
204 		error = zfs_zaccess(zp, mode, flag, B_FALSE, cr,
205 		    zfs_init_idmap);
206 #else
207 		error = zfs_zaccess(zp, mode, flag, B_FALSE, cr,
208 		    NULL);
209 #endif
210 	else
211 #if defined(__linux__)
212 		error = zfs_zaccess_rwx(zp, mode, flag, cr, zfs_init_idmap);
213 #else
214 		error = zfs_zaccess_rwx(zp, mode, flag, cr, NULL);
215 #endif
216 
217 	zfs_exit(zfsvfs, FTAG);
218 	return (error);
219 }
220 
221 /*
222  * Determine if Direct I/O has been requested (either via the O_DIRECT flag or
223  * the "direct" dataset property). When it is inherited via the property, only
224  * apply the O_DIRECT flag to correctly aligned I/O requests. The rationale for
225  * this is that it allows the property to be safely set on a dataset without
226  * forcing all of the applications to be aware of the alignment restrictions.
227  * When O_DIRECT is explicitly requested by an application, return EINVAL if
228  * the request is unaligned.  In all cases, if the range for this request has
229  * been mmap'ed then we will perform buffered I/O to keep the mapped region
230  * synchronized with the ARC.
231  *
232  * It is possible that a file's pages could be mmap'ed after it is checked
233  * here. If so, that is handled accordingly in zfs_write(). See comments in the
234  * following area for how this is handled:
235  * zfs_write() -> update_pages()
236  */
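/*
 * For illustration, a userspace caller that satisfies the explicit O_DIRECT
 * alignment requirement enforced below might look like the following sketch
 * (hypothetical path, 4 KiB pages assumed, error handling omitted):
 *
 *	void *buf;
 *	int fd = open("/pool/fs/file", O_WRONLY | O_DIRECT);
 *	(void) posix_memalign(&buf, 4096, 8192);
 *	(void) pwrite(fd, buf, 8192, 0);
 *
 * With direct=standard, the same 8192-byte write at offset 100, or from an
 * unaligned buffer, fails here with EINVAL; with direct=always and no
 * O_DIRECT flag, an unaligned request instead falls back to the ARC.
 */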
237 static int
238 zfs_setup_direct(struct znode *zp, zfs_uio_t *uio, zfs_uio_rw_t rw,
239     int *ioflagp)
240 {
241 	zfsvfs_t *zfsvfs = ZTOZSB(zp);
242 	objset_t *os = zfsvfs->z_os;
243 	int ioflag = *ioflagp;
244 	int error = 0;
245 
246 	if (!zfs_dio_enabled || os->os_direct == ZFS_DIRECT_DISABLED ||
247 	    zn_has_cached_data(zp, zfs_uio_offset(uio),
248 	    zfs_uio_offset(uio) + zfs_uio_resid(uio) - 1)) {
249 		/*
250 		 * Direct I/O is disabled or the region is mmap'ed. In either
251 		 * case the I/O request will just be directed through the ARC.
252 		 */
253 		ioflag &= ~O_DIRECT;
254 		goto out;
255 	} else if (os->os_direct == ZFS_DIRECT_ALWAYS &&
256 	    zfs_uio_page_aligned(uio) &&
257 	    zfs_uio_aligned(uio, PAGE_SIZE)) {
258 		if ((rw == UIO_WRITE && zfs_uio_resid(uio) >= zp->z_blksz) ||
259 		    (rw == UIO_READ)) {
260 			ioflag |= O_DIRECT;
261 		}
262 	} else if (os->os_direct == ZFS_DIRECT_ALWAYS && (ioflag & O_DIRECT)) {
263 		/*
264 		 * Direct I/O was requested through the direct=always property,
265 		 * but it is not properly PAGE_SIZE aligned. The request will be
266 		 * directed through the ARC.
267 		 */
268 		ioflag &= ~O_DIRECT;
269 	}
270 
271 	if (ioflag & O_DIRECT) {
272 		if (!zfs_uio_page_aligned(uio) ||
273 		    !zfs_uio_aligned(uio, PAGE_SIZE)) {
274 			error = SET_ERROR(EINVAL);
275 			goto out;
276 		}
277 
278 		error = zfs_uio_get_dio_pages_alloc(uio, rw);
279 		if (error) {
280 			goto out;
281 		}
282 	}
283 
284 	IMPLY(ioflag & O_DIRECT, uio->uio_extflg & UIO_DIRECT);
285 	ASSERT0(error);
286 
287 out:
288 	*ioflagp = ioflag;
289 	return (error);
290 }
291 
292 /*
293  * Read bytes from specified file into supplied buffer.
294  *
295  *	IN:	zp	- inode of file to be read from.
296  *		uio	- structure supplying read location, range info,
297  *			  and return buffer.
298  *		ioflag	- O_SYNC flags; used to provide FRSYNC semantics.
299  *			  O_DIRECT flag; used to bypass page cache.
300  *		cr	- credentials of caller.
301  *
302  *	OUT:	uio	- updated offset and range, buffer filled.
303  *
304  *	RETURN:	0 on success, error code on failure.
305  *
306  * Side Effects:
307  *	inode - atime updated if byte count > 0
308  */
309 int
310 zfs_read(struct znode *zp, zfs_uio_t *uio, int ioflag, cred_t *cr)
311 {
312 	(void) cr;
313 	int error = 0;
314 	boolean_t frsync = B_FALSE;
315 	boolean_t dio_checksum_failure = B_FALSE;
316 
317 	zfsvfs_t *zfsvfs = ZTOZSB(zp);
318 	if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
319 		return (error);
320 
321 	if (zp->z_pflags & ZFS_AV_QUARANTINED) {
322 		zfs_exit(zfsvfs, FTAG);
323 		return (SET_ERROR(EACCES));
324 	}
325 
326 	/* We don't copy out anything useful for directories. */
327 	if (Z_ISDIR(ZTOTYPE(zp))) {
328 		zfs_exit(zfsvfs, FTAG);
329 		return (SET_ERROR(EISDIR));
330 	}
331 
332 	/*
333 	 * Validate file offset
334 	 */
335 	if (zfs_uio_offset(uio) < (offset_t)0) {
336 		zfs_exit(zfsvfs, FTAG);
337 		return (SET_ERROR(EINVAL));
338 	}
339 
340 	/*
341 	 * Fasttrack empty reads
342 	 */
343 	if (zfs_uio_resid(uio) == 0) {
344 		zfs_exit(zfsvfs, FTAG);
345 		return (0);
346 	}
347 
348 #ifdef FRSYNC
349 	/*
350 	 * If we're in FRSYNC mode, sync out this znode before reading it.
351 	 * Only do this for non-snapshots.
352 	 *
353 	 * Some platforms do not support FRSYNC and instead map it
354 	 * to O_SYNC, which results in unnecessary calls to zil_commit. We
355 	 * only honor FRSYNC requests on platforms which support it.
356 	 */
357 	frsync = !!(ioflag & FRSYNC);
358 #endif
359 	if (zfsvfs->z_log &&
360 	    (frsync || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS))
361 		zil_commit(zfsvfs->z_log, zp->z_id);
362 
363 	/*
364 	 * Lock the range against changes.
365 	 */
366 	zfs_locked_range_t *lr = zfs_rangelock_enter(&zp->z_rangelock,
367 	    zfs_uio_offset(uio), zfs_uio_resid(uio), RL_READER);
368 
369 	/*
370 	 * If we are reading past end-of-file we can skip
371 	 * to the end; but we might still need to set atime.
372 	 */
373 	if (zfs_uio_offset(uio) >= zp->z_size) {
374 		error = 0;
375 		goto out;
376 	}
377 	ASSERT(zfs_uio_offset(uio) < zp->z_size);
378 
379 	/*
380 	 * Setting up Direct I/O if requested.
381 	 */
382 	error = zfs_setup_direct(zp, uio, UIO_READ, &ioflag);
383 	if (error) {
384 		goto out;
385 	}
386 
387 #if defined(__linux__)
388 	ssize_t start_offset = zfs_uio_offset(uio);
389 #endif
390 	ssize_t chunk_size = zfs_vnops_read_chunk_size;
391 	ssize_t n = MIN(zfs_uio_resid(uio), zp->z_size - zfs_uio_offset(uio));
392 	ssize_t start_resid = n;
393 	ssize_t dio_remaining_resid = 0;
394 
395 	if (uio->uio_extflg & UIO_DIRECT) {
396 		/*
397 		 * All pages for an O_DIRECT request have already been mapped
398 		 * so there's no compelling reason to handle this uio in
399 		 * smaller chunks.
400 		 */
401 		chunk_size = DMU_MAX_ACCESS;
402 
403 		/*
404 		 * In the event that the O_DIRECT request is reading the entire
405 		 * file, it is possible the file's length is not page-size
406 		 * aligned. However, lower layers expect that the Direct I/O
407 		 * request is page-aligned. In this case, as much of the file
408 		 * that can be read using Direct I/O happens and the remaining
409 		 * as possible is read using Direct I/O, and the remaining
410 		 *
411 		 * This is still consistent with the semantics of Direct I/O in
412 		 * ZFS as at a minimum the I/O request must be page-aligned.
413 		 */
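		/*
		 * For example (illustrative numbers, 4K PAGE_SIZE): for
		 * n = 10000 bytes, P2ALIGN_TYPED() yields 8192, so
		 * dio_remaining_resid = 1808.  The first 8192 bytes are read
		 * with Direct I/O and the trailing 1808 bytes are read
		 * through the ARC once the main loop below finishes.
		 */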
414 		dio_remaining_resid = n - P2ALIGN_TYPED(n, PAGE_SIZE, ssize_t);
415 		if (dio_remaining_resid != 0)
416 			n -= dio_remaining_resid;
417 	}
418 
419 	while (n > 0) {
420 		ssize_t nbytes = MIN(n, chunk_size -
421 		    P2PHASE(zfs_uio_offset(uio), chunk_size));
422 #ifdef UIO_NOCOPY
423 		if (zfs_uio_segflg(uio) == UIO_NOCOPY)
424 			error = mappedread_sf(zp, nbytes, uio);
425 		else
426 #endif
427 		if (zn_has_cached_data(zp, zfs_uio_offset(uio),
428 		    zfs_uio_offset(uio) + nbytes - 1)) {
429 			error = mappedread(zp, nbytes, uio);
430 		} else {
431 			error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl),
432 			    uio, nbytes);
433 		}
434 
435 		if (error) {
436 			/* convert checksum errors into IO errors */
437 			if (error == ECKSUM) {
438 				/*
439 				 * If a Direct I/O read returned a checksum
440 				 * verify error, then it must be treated as
441 				 * suspicious. The contents of the buffer could
442 				 * have been manipulated while the I/O was in
443 				 * flight. In this case, the remainder of the I/O
444 				 * request will just be reissued through the
445 				 * ARC.
446 				 */
447 				if (uio->uio_extflg & UIO_DIRECT) {
448 					dio_checksum_failure = B_TRUE;
449 					uio->uio_extflg &= ~UIO_DIRECT;
450 					n += dio_remaining_resid;
451 					dio_remaining_resid = 0;
452 					continue;
453 				} else {
454 					error = SET_ERROR(EIO);
455 				}
456 			}
457 
458 #if defined(__linux__)
459 			/*
460 			 * if we actually read some bytes, bubbling EFAULT
461 			 * up to become EAGAIN isn't what we want here...
462 			 *
463 			 * ...on Linux, at least. On FBSD, doing this breaks.
464 			 */
465 			if (error == EFAULT &&
466 			    (zfs_uio_offset(uio) - start_offset) != 0)
467 				error = 0;
468 #endif
469 			break;
470 		}
471 
472 		n -= nbytes;
473 	}
474 
475 	if (error == 0 && (uio->uio_extflg & UIO_DIRECT) &&
476 	    dio_remaining_resid != 0) {
477 		/*
478 		 * Temporarily remove the UIO_DIRECT flag from the UIO so the
479 		 * remainder of the file can be read using the ARC.
480 		 */
481 		uio->uio_extflg &= ~UIO_DIRECT;
482 
483 		if (zn_has_cached_data(zp, zfs_uio_offset(uio),
484 		    zfs_uio_offset(uio) + dio_remaining_resid - 1)) {
485 			error = mappedread(zp, dio_remaining_resid, uio);
486 		} else {
487 			error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl), uio,
488 			    dio_remaining_resid);
489 		}
490 		uio->uio_extflg |= UIO_DIRECT;
491 
492 		if (error != 0)
493 			n += dio_remaining_resid;
494 	} else if (error && (uio->uio_extflg & UIO_DIRECT)) {
495 		n += dio_remaining_resid;
496 	}
497 	int64_t nread = start_resid - n;
498 
499 	dataset_kstats_update_read_kstats(&zfsvfs->z_kstat, nread);
500 out:
501 	zfs_rangelock_exit(lr);
502 
503 	if (dio_checksum_failure == B_TRUE)
504 		uio->uio_extflg |= UIO_DIRECT;
505 
506 	/*
507 	 * Cleanup for Direct I/O if requested.
508 	 */
509 	if (uio->uio_extflg & UIO_DIRECT)
510 		zfs_uio_free_dio_pages(uio, UIO_READ);
511 
512 	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
513 	zfs_exit(zfsvfs, FTAG);
514 	return (error);
515 }
516 
517 static void
518 zfs_clear_setid_bits_if_necessary(zfsvfs_t *zfsvfs, znode_t *zp, cred_t *cr,
519     uint64_t *clear_setid_bits_txgp, dmu_tx_t *tx)
520 {
521 	zilog_t *zilog = zfsvfs->z_log;
522 	const uint64_t uid = KUID_TO_SUID(ZTOUID(zp));
523 
524 	ASSERT(clear_setid_bits_txgp != NULL);
525 	ASSERT(tx != NULL);
526 
527 	/*
528 	 * Clear Set-UID/Set-GID bits on successful write if not
529 	 * privileged and at least one of the execute bits is set.
530 	 *
531 	 * It would be nice to do this after all writes have
532 	 * been done, but that would still expose the ISUID/ISGID
533 	 * to another app after the partial write is committed.
534 	 *
535 	 * Note: we don't call zfs_fuid_map_id() here because
536 	 * user 0 is not an ephemeral uid.
537 	 */
538 	mutex_enter(&zp->z_acl_lock);
539 	if ((zp->z_mode & (S_IXUSR | (S_IXUSR >> 3) | (S_IXUSR >> 6))) != 0 &&
540 	    (zp->z_mode & (S_ISUID | S_ISGID)) != 0 &&
541 	    secpolicy_vnode_setid_retain(zp, cr,
542 	    ((zp->z_mode & S_ISUID) != 0 && uid == 0)) != 0) {
543 		uint64_t newmode;
544 
545 		zp->z_mode &= ~(S_ISUID | S_ISGID);
546 		newmode = zp->z_mode;
547 		(void) sa_update(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs),
548 		    (void *)&newmode, sizeof (uint64_t), tx);
549 
550 		mutex_exit(&zp->z_acl_lock);
551 
552 		/*
553 		 * Make sure SUID/SGID bits will be removed when we replay the
554 		 * log. If the setid bits keep coming back, don't log more
555 		 * than one TX_SETATTR per transaction group.
556 		 */
557 		if (*clear_setid_bits_txgp != dmu_tx_get_txg(tx)) {
558 			vattr_t va = {0};
559 
560 			va.va_mask = ATTR_MODE;
561 			va.va_nodeid = zp->z_id;
562 			va.va_mode = newmode;
563 			zfs_log_setattr(zilog, tx, TX_SETATTR, zp, &va,
564 			    ATTR_MODE, NULL);
565 			*clear_setid_bits_txgp = dmu_tx_get_txg(tx);
566 		}
567 	} else {
568 		mutex_exit(&zp->z_acl_lock);
569 	}
570 }
571 
572 /*
573  * Write the bytes to a file.
574  *
575  *	IN:	zp	- znode of file to be written to.
576  *		uio	- structure supplying write location, range info,
577  *			  and data buffer.
578  *		ioflag	- O_APPEND flag set if in append mode.
579  *			  O_DIRECT flag; used to bypass page cache.
580  *		cr	- credentials of caller.
581  *
582  *	OUT:	uio	- updated offset and range.
583  *
584  *	RETURN:	0 if success
585  *		error code if failure
586  *
587  * Timestamps:
588  *	ip - ctime|mtime updated if byte count > 0
589  */
590 int
591 zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr)
592 {
593 	int error = 0, error1;
594 	ssize_t start_resid = zfs_uio_resid(uio);
595 	uint64_t clear_setid_bits_txg = 0;
596 	boolean_t o_direct_defer = B_FALSE;
597 
598 	/*
599 	 * Fasttrack empty write
600 	 */
601 	ssize_t n = start_resid;
602 	if (n == 0)
603 		return (0);
604 
605 	zfsvfs_t *zfsvfs = ZTOZSB(zp);
606 	if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
607 		return (error);
608 
609 	sa_bulk_attr_t bulk[4];
610 	int count = 0;
611 	uint64_t mtime[2], ctime[2];
612 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
613 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
614 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
615 	    &zp->z_size, 8);
616 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
617 	    &zp->z_pflags, 8);
618 
619 	/*
620 	 * Callers might not be able to detect properly that we are read-only,
621 	 * so check it explicitly here.
622 	 */
623 	if (zfs_is_readonly(zfsvfs)) {
624 		zfs_exit(zfsvfs, FTAG);
625 		return (SET_ERROR(EROFS));
626 	}
627 
628 	/*
629 	 * If immutable or not appending then return EPERM.
630 	 * Intentionally allow ZFS_READONLY through here.
631 	 * See zfs_zaccess_common()
632 	 */
633 	if ((zp->z_pflags & ZFS_IMMUTABLE) ||
634 	    ((zp->z_pflags & ZFS_APPENDONLY) && !(ioflag & O_APPEND) &&
635 	    (zfs_uio_offset(uio) < zp->z_size))) {
636 		zfs_exit(zfsvfs, FTAG);
637 		return (SET_ERROR(EPERM));
638 	}
639 
640 	/*
641 	 * Validate file offset
642 	 */
643 	offset_t woff = ioflag & O_APPEND ? zp->z_size : zfs_uio_offset(uio);
644 	if (woff < 0) {
645 		zfs_exit(zfsvfs, FTAG);
646 		return (SET_ERROR(EINVAL));
647 	}
648 
649 	/*
650 	 * Setting up Direct I/O if requested.
651 	 */
652 	error = zfs_setup_direct(zp, uio, UIO_WRITE, &ioflag);
653 	if (error) {
654 		zfs_exit(zfsvfs, FTAG);
655 		return (SET_ERROR(error));
656 	}
657 
658 	/*
659 	 * Pre-fault the pages to ensure slow (e.g. NFS) pages
660 	 * don't hold up txg.
661 	 */
662 	ssize_t pfbytes = MIN(n, DMU_MAX_ACCESS >> 1);
663 	if (zfs_uio_prefaultpages(pfbytes, uio)) {
664 		zfs_exit(zfsvfs, FTAG);
665 		return (SET_ERROR(EFAULT));
666 	}
667 
668 	/*
669 	 * If in append mode, set the io offset pointer to eof.
670 	 */
671 	zfs_locked_range_t *lr;
672 	if (ioflag & O_APPEND) {
673 		/*
674 		 * Obtain an appending range lock to guarantee file append
675 		 * semantics.  We reset the write offset once we have the lock.
676 		 */
677 		lr = zfs_rangelock_enter(&zp->z_rangelock, 0, n, RL_APPEND);
678 		woff = lr->lr_offset;
679 		if (lr->lr_length == UINT64_MAX) {
680 			/*
681 			 * We overlocked the file because this write will cause
682 			 * the file block size to increase.
683 			 * Note that zp_size cannot change with this lock held.
684 			 */
685 			woff = zp->z_size;
686 		}
687 		zfs_uio_setoffset(uio, woff);
688 		/*
689 		 * We need to update the starting offset as well because it is
690 		 * set previously in the ZPL (Linux) and VNOPS (FreeBSD)
691 		 * layers.
692 		 */
693 		zfs_uio_setsoffset(uio, woff);
694 	} else {
695 		/*
696 		 * Note that if the file block size will change as a result of
697 		 * this write, then this range lock will lock the entire file
698 		 * so that we can re-write the block safely.
699 		 */
700 		lr = zfs_rangelock_enter(&zp->z_rangelock, woff, n, RL_WRITER);
701 	}
702 
703 	if (zn_rlimit_fsize_uio(zp, uio)) {
704 		zfs_rangelock_exit(lr);
705 		zfs_exit(zfsvfs, FTAG);
706 		return (SET_ERROR(EFBIG));
707 	}
708 
709 	const rlim64_t limit = MAXOFFSET_T;
710 
711 	if (woff >= limit) {
712 		zfs_rangelock_exit(lr);
713 		zfs_exit(zfsvfs, FTAG);
714 		return (SET_ERROR(EFBIG));
715 	}
716 
717 	if (n > limit - woff)
718 		n = limit - woff;
719 
720 	uint64_t end_size = MAX(zp->z_size, woff + n);
721 	zilog_t *zilog = zfsvfs->z_log;
722 	boolean_t commit = (ioflag & (O_SYNC | O_DSYNC)) ||
723 	    (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS);
724 
725 	const uint64_t uid = KUID_TO_SUID(ZTOUID(zp));
726 	const uint64_t gid = KGID_TO_SGID(ZTOGID(zp));
727 	const uint64_t projid = zp->z_projid;
728 
729 	/*
730 	 * In the event we are increasing the file block size
731 	 * (lr_length == UINT64_MAX), we will direct the write to the ARC.
732 	 * Because zfs_grow_blocksize() will read from the ARC in order to
733 	 * grow the dbuf, we avoid doing Direct I/O here as that would cause
734 	 * data written to disk to be overwritten by data in the ARC during
735 	 * the sync phase. Besides writing data twice to disk, we also
736 	 * want to avoid consistency concerns between data in the ARC and
737 	 * on disk while growing the file's blocksize.
738 	 *
739 	 * We will only temporarily remove Direct I/O and put it back after
740 	 * we have grown the blocksize. We do this in the event a request
741 	 * is larger than max_blksz, so further requests to
742 	 * dmu_write_uio_dbuf() will still issue the requests using Direct
743 	 * IO.
744 	 *
745 	 * As an example:
746 	 * The first block of the file is being written as a 4k request with
747 	 * a recordsize of 1K. The first 1K issued in the loop below will go
748 	 * through the ARC; however, the following 3 1K requests will
749 	 * use Direct I/O.
750 	 */
751 	if (uio->uio_extflg & UIO_DIRECT && lr->lr_length == UINT64_MAX) {
752 		uio->uio_extflg &= ~UIO_DIRECT;
753 		o_direct_defer = B_TRUE;
754 	}
755 
756 	/*
757 	 * Write the file in reasonable size chunks.  Each chunk is written
758 	 * in a separate transaction; this keeps the intent log records small
759 	 * and allows us to do more fine-grained space accounting.
760 	 */
761 	while (n > 0) {
762 		woff = zfs_uio_offset(uio);
763 
764 		if (zfs_id_overblockquota(zfsvfs, DMU_USERUSED_OBJECT, uid) ||
765 		    zfs_id_overblockquota(zfsvfs, DMU_GROUPUSED_OBJECT, gid) ||
766 		    (projid != ZFS_DEFAULT_PROJID &&
767 		    zfs_id_overblockquota(zfsvfs, DMU_PROJECTUSED_OBJECT,
768 		    projid))) {
769 			error = SET_ERROR(EDQUOT);
770 			break;
771 		}
772 
773 		uint64_t blksz;
774 		if (lr->lr_length == UINT64_MAX && zp->z_size <= zp->z_blksz) {
775 			if (zp->z_blksz > zfsvfs->z_max_blksz &&
776 			    !ISP2(zp->z_blksz)) {
777 				/*
778 				 * File's blocksize is already larger than the
779 				 * "recordsize" property.  Only let it grow to
780 				 * the next power of 2.
781 				 */
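				/*
				 * For example (illustrative numbers): a
				 * non-power-of-2 z_blksz of 196608 (192K)
				 * rounds up to 1 << 18 == 262144 (256K).
				 */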
782 				blksz = 1 << highbit64(zp->z_blksz);
783 			} else {
784 				blksz = zfsvfs->z_max_blksz;
785 			}
786 			blksz = MIN(blksz, P2ROUNDUP(end_size,
787 			    SPA_MINBLOCKSIZE));
788 			blksz = MAX(blksz, zp->z_blksz);
789 		} else {
790 			blksz = zp->z_blksz;
791 		}
792 
793 		arc_buf_t *abuf = NULL;
794 		ssize_t nbytes = n;
795 		if (n >= blksz && woff >= zp->z_size &&
796 		    P2PHASE(woff, blksz) == 0 &&
797 		    !(uio->uio_extflg & UIO_DIRECT) &&
798 		    (blksz >= SPA_OLD_MAXBLOCKSIZE || n < 4 * blksz)) {
799 			/*
800 			 * This write covers a full block.  "Borrow" a buffer
801 			 * from the dmu so that we can fill it before we enter
802 			 * a transaction.  This avoids the possibility of
803 			 * holding up the transaction if the data copy hangs
804 			 * up on a pagefault (e.g., from an NFS server mapping).
805 			 */
806 			abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
807 			    blksz);
808 			ASSERT(abuf != NULL);
809 			ASSERT(arc_buf_size(abuf) == blksz);
810 			if ((error = zfs_uiocopy(abuf->b_data, blksz,
811 			    UIO_WRITE, uio, &nbytes))) {
812 				dmu_return_arcbuf(abuf);
813 				break;
814 			}
815 			ASSERT3S(nbytes, ==, blksz);
816 		} else {
817 			nbytes = MIN(n, (DMU_MAX_ACCESS >> 1) -
818 			    P2PHASE(woff, blksz));
819 			if (pfbytes < nbytes) {
820 				if (zfs_uio_prefaultpages(nbytes, uio)) {
821 					error = SET_ERROR(EFAULT);
822 					break;
823 				}
824 				pfbytes = nbytes;
825 			}
826 		}
827 
828 		/*
829 		 * Start a transaction.
830 		 */
831 		dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os);
832 		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
833 		dmu_buf_impl_t *db = (dmu_buf_impl_t *)sa_get_db(zp->z_sa_hdl);
834 		DB_DNODE_ENTER(db);
835 		dmu_tx_hold_write_by_dnode(tx, DB_DNODE(db), woff, nbytes);
836 		DB_DNODE_EXIT(db);
837 		zfs_sa_upgrade_txholds(tx, zp);
838 		error = dmu_tx_assign(tx, DMU_TX_WAIT);
839 		if (error) {
840 			dmu_tx_abort(tx);
841 			if (abuf != NULL)
842 				dmu_return_arcbuf(abuf);
843 			break;
844 		}
845 
846 		/*
847 		 * NB: We must call zfs_clear_setid_bits_if_necessary before
848 		 * committing the transaction!
849 		 */
850 
851 		/*
852 		 * If rangelock_enter() over-locked we grow the blocksize
853 		 * and then reduce the lock range.  This will only happen
854 		 * on the first iteration since rangelock_reduce() will
855 		 * shrink down lr_length to the appropriate size.
856 		 */
857 		if (lr->lr_length == UINT64_MAX) {
858 			zfs_grow_blocksize(zp, blksz, tx);
859 			zfs_rangelock_reduce(lr, woff, n);
860 		}
861 
862 		ssize_t tx_bytes;
863 		if (abuf == NULL) {
864 			tx_bytes = zfs_uio_resid(uio);
865 			zfs_uio_fault_disable(uio, B_TRUE);
866 			error = dmu_write_uio_dbuf(sa_get_db(zp->z_sa_hdl),
867 			    uio, nbytes, tx);
868 			zfs_uio_fault_disable(uio, B_FALSE);
869 #ifdef __linux__
870 			if (error == EFAULT) {
871 				zfs_clear_setid_bits_if_necessary(zfsvfs, zp,
872 				    cr, &clear_setid_bits_txg, tx);
873 				dmu_tx_commit(tx);
874 				/*
875 				 * Account for partial writes before
876 				 * continuing the loop.
877 				 * Update needs to occur before the next
878 				 * zfs_uio_prefaultpages, or prefaultpages may
879 				 * error, and we may break the loop early.
880 				 */
881 				n -= tx_bytes - zfs_uio_resid(uio);
882 				pfbytes -= tx_bytes - zfs_uio_resid(uio);
883 				continue;
884 			}
885 #endif
886 			/*
887 			 * On FreeBSD, EFAULT should be propagated back to the
888 			 * VFS, which will handle faulting and will retry.
889 			 */
890 			if (error != 0 && error != EFAULT) {
891 				zfs_clear_setid_bits_if_necessary(zfsvfs, zp,
892 				    cr, &clear_setid_bits_txg, tx);
893 				dmu_tx_commit(tx);
894 				break;
895 			}
896 			tx_bytes -= zfs_uio_resid(uio);
897 		} else {
898 			/*
899 			 * Thus, we're writing a full block at a block-aligned
900 			 * offset and extending the file past EOF.
901 			 *
902 			 * dmu_assign_arcbuf_by_dbuf() will directly assign the
903 			 * arc buffer to a dbuf.
904 			 */
905 			error = dmu_assign_arcbuf_by_dbuf(
906 			    sa_get_db(zp->z_sa_hdl), woff, abuf, tx);
907 			if (error != 0) {
908 				/*
909 				 * XXX This might not be necessary if
910 				 * dmu_assign_arcbuf_by_dbuf is guaranteed
911 				 * to be atomic.
912 				 */
913 				zfs_clear_setid_bits_if_necessary(zfsvfs, zp,
914 				    cr, &clear_setid_bits_txg, tx);
915 				dmu_return_arcbuf(abuf);
916 				dmu_tx_commit(tx);
917 				break;
918 			}
919 			ASSERT3S(nbytes, <=, zfs_uio_resid(uio));
920 			zfs_uioskip(uio, nbytes);
921 			tx_bytes = nbytes;
922 		}
923 		/*
924 		 * There is a window where a file's pages can be mmap'ed after
925 		 * zfs_setup_direct() is called. This is due to the fact that
926 		 * the rangelock in this function is acquired after calling
927 		 * zfs_setup_direct(). This is done so that
928 		 * zfs_uio_prefaultpages() does not attempt to fault in pages
929 		 * on Linux for Direct I/O requests. This is not necessary as
930 		 * the pages are pinned in memory and can not be faulted out.
931 		 * Ideally, the rangelock would be held before calling
932 		 * zfs_setup_direct() and zfs_uio_prefaultpages(); however,
933 		 * this can lead to a deadlock as zfs_getpage() also acquires
934 		 * the rangelock as a RL_WRITER and prefaulting the pages can
935 		 * lead to zfs_getpage() being called.
936 		 *
937 		 * In the case of the pages being mapped after
938 		 * zfs_setup_direct() is called, the call to update_pages()
939 		 * will still be made to make sure there is consistency between
940 		 * the ARC and the Linux page cache. This is an unfortunate
941 		 * situation as the data will be read back into the ARC after
942 		 * the Direct I/O write has completed, but this is the penalty
943 		 * for writing to a mmap'ed region of a file using Direct I/O.
944 		 */
945 		if (tx_bytes &&
946 		    zn_has_cached_data(zp, woff, woff + tx_bytes - 1)) {
947 			update_pages(zp, woff, tx_bytes, zfsvfs->z_os);
948 		}
949 
950 		/*
951 		 * If we made no progress, we're done.  If we made even
952 		 * partial progress, update the znode and ZIL accordingly.
953 		 */
954 		if (tx_bytes == 0) {
955 			(void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
956 			    (void *)&zp->z_size, sizeof (uint64_t), tx);
957 			dmu_tx_commit(tx);
958 			ASSERT(error != 0);
959 			break;
960 		}
961 
962 		zfs_clear_setid_bits_if_necessary(zfsvfs, zp, cr,
963 		    &clear_setid_bits_txg, tx);
964 
965 		zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime);
966 
967 		/*
968 		 * Update the file size (zp_size) if it has changed;
969 		 * account for possible concurrent updates.
970 		 */
971 		while ((end_size = zp->z_size) < zfs_uio_offset(uio)) {
972 			(void) atomic_cas_64(&zp->z_size, end_size,
973 			    zfs_uio_offset(uio));
974 			ASSERT(error == 0 || error == EFAULT);
975 		}
976 		/*
977 		 * If we are replaying and eof is non zero then force
978 		 * the file size to the specified eof. Note, there's no
979 		 * concurrency during replay.
980 		 */
981 		if (zfsvfs->z_replay && zfsvfs->z_replay_eof != 0)
982 			zp->z_size = zfsvfs->z_replay_eof;
983 
984 		error1 = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
985 		if (error1 != 0)
986 			/* Avoid clobbering EFAULT. */
987 			error = error1;
988 
989 		/*
990 		 * NB: During replay, the TX_SETATTR record logged by
991 		 * zfs_clear_setid_bits_if_necessary must precede any of
992 		 * the TX_WRITE records logged here.
993 		 */
994 		zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, commit,
995 		    uio->uio_extflg & UIO_DIRECT ? B_TRUE : B_FALSE, NULL,
996 		    NULL);
997 
998 		dmu_tx_commit(tx);
999 
1000 		/*
1001 		 * Direct I/O was deferred in order to grow the first block.
1002 		 * At this point it can be re-enabled for subsequent writes.
1003 		 */
1004 		if (o_direct_defer) {
1005 			ASSERT(ioflag & O_DIRECT);
1006 			uio->uio_extflg |= UIO_DIRECT;
1007 			o_direct_defer = B_FALSE;
1008 		}
1009 
1010 		if (error != 0)
1011 			break;
1012 		ASSERT3S(tx_bytes, ==, nbytes);
1013 		n -= nbytes;
1014 		pfbytes -= nbytes;
1015 	}
1016 
1017 	if (o_direct_defer) {
1018 		ASSERT(ioflag & O_DIRECT);
1019 		uio->uio_extflg |= UIO_DIRECT;
1020 		o_direct_defer = B_FALSE;
1021 	}
1022 
1023 	zfs_znode_update_vfs(zp);
1024 	zfs_rangelock_exit(lr);
1025 
1026 	/*
1027 	 * Cleanup for Direct I/O if requested.
1028 	 */
1029 	if (uio->uio_extflg & UIO_DIRECT)
1030 		zfs_uio_free_dio_pages(uio, UIO_WRITE);
1031 
1032 	/*
1033 	 * If we're in replay mode, or we made no progress, or the
1034 	 * uio data is inaccessible, return an error.  Otherwise, it's
1035 	 * at least a partial write, so it's successful.
1036 	 */
1037 	if (zfsvfs->z_replay || zfs_uio_resid(uio) == start_resid ||
1038 	    error == EFAULT) {
1039 		zfs_exit(zfsvfs, FTAG);
1040 		return (error);
1041 	}
1042 
1043 	if (commit)
1044 		zil_commit(zilog, zp->z_id);
1045 
1046 	int64_t nwritten = start_resid - zfs_uio_resid(uio);
1047 	dataset_kstats_update_write_kstats(&zfsvfs->z_kstat, nwritten);
1048 
1049 	zfs_exit(zfsvfs, FTAG);
1050 	return (0);
1051 }
1052 
1053 int
1054 zfs_getsecattr(znode_t *zp, vsecattr_t *vsecp, int flag, cred_t *cr)
1055 {
1056 	zfsvfs_t *zfsvfs = ZTOZSB(zp);
1057 	int error;
1058 	boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
1059 
1060 	if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
1061 		return (error);
1062 	error = zfs_getacl(zp, vsecp, skipaclchk, cr);
1063 	zfs_exit(zfsvfs, FTAG);
1064 
1065 	return (error);
1066 }
1067 
1068 int
1069 zfs_setsecattr(znode_t *zp, vsecattr_t *vsecp, int flag, cred_t *cr)
1070 {
1071 	zfsvfs_t *zfsvfs = ZTOZSB(zp);
1072 	int error;
1073 	boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
1074 	zilog_t	*zilog;
1075 
1076 	if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
1077 		return (error);
1078 	zilog = zfsvfs->z_log;
1079 	error = zfs_setacl(zp, vsecp, skipaclchk, cr);
1080 
1081 	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
1082 		zil_commit(zilog, 0);
1083 
1084 	zfs_exit(zfsvfs, FTAG);
1085 	return (error);
1086 }
1087 
1088 /*
1089  * Get the optimal alignment to ensure direct IO can be performed without
1090  * incurring any RMW penalty on write. If direct IO is not enabled for this
1091  * file, returns an error.
1092  */
1093 int
1094 zfs_get_direct_alignment(znode_t *zp, uint64_t *alignp)
1095 {
1096 	zfsvfs_t *zfsvfs = ZTOZSB(zp);
1097 
1098 	if (!zfs_dio_enabled || zfsvfs->z_os->os_direct == ZFS_DIRECT_DISABLED)
1099 		return (SET_ERROR(EOPNOTSUPP));
1100 
1101 	/*
1102 	 * If the file has multiple blocks, then its block size is fixed
1103 	 * forever, and so is the ideal alignment.
1104 	 *
1105 	 * If, however, it only has a single block, then we want to return the
1106 	 * max block size it could possibly grow to (i.e., the dataset
1107 	 * recordsize). We do this so that a program querying alignment
1108 	 * immediately after the file is created gets a value that won't change
1109 	 * once the file has grown into the second block and beyond.
1110 	 *
1111 	 * Because we don't have a count of blocks easily available here, we
1112 	 * check if the apparent file size is smaller than its current block
1113 	 * size (meaning, the file hasn't yet grown into the current block
1114 	 * size) and then, check if the block size is smaller than the dataset
1115 	 * maximum (meaning, if the file grew past the current block size, the
1116 	 * block size would be increased).
1117 	 */
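	/*
	 * For example (illustrative numbers): on a dataset with
	 * recordsize=128K, a file still holding a single 512-byte block
	 * reports a 128K alignment rather than 512, so a caller that aligns
	 * its I/O to the reported value never needs to re-query as the file
	 * grows.
	 */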
1118 	if (zp->z_size <= zp->z_blksz && zp->z_blksz < zfsvfs->z_max_blksz)
1119 		*alignp = MAX(zfsvfs->z_max_blksz, PAGE_SIZE);
1120 	else
1121 		*alignp = MAX(zp->z_blksz, PAGE_SIZE);
1122 
1123 	return (0);
1124 }
1125 
1126 #ifdef ZFS_DEBUG
1127 static int zil_fault_io = 0;
1128 #endif
1129 
1130 static void zfs_get_done(zgd_t *zgd, int error);
1131 
1132 /*
1133  * Get data to generate a TX_WRITE intent log record.
1134  */
1135 int
1136 zfs_get_data(void *arg, uint64_t gen, lr_write_t *lr, char *buf,
1137     struct lwb *lwb, zio_t *zio)
1138 {
1139 	zfsvfs_t *zfsvfs = arg;
1140 	objset_t *os = zfsvfs->z_os;
1141 	znode_t *zp;
1142 	uint64_t object = lr->lr_foid;
1143 	uint64_t offset = lr->lr_offset;
1144 	uint64_t size = lr->lr_length;
1145 	zgd_t *zgd;
1146 	int error = 0;
1147 	uint64_t zp_gen;
1148 
1149 	ASSERT3P(lwb, !=, NULL);
1150 	ASSERT3U(size, !=, 0);
1151 
1152 	/*
1153 	 * Nothing to do if the file has been removed
1154 	 */
1155 	if (zfs_zget(zfsvfs, object, &zp) != 0)
1156 		return (SET_ERROR(ENOENT));
1157 	if (zp->z_unlinked) {
1158 		/*
1159 		 * Release the vnode asynchronously as we currently have the
1160 		 * txg stopped from syncing.
1161 		 */
1162 		zfs_zrele_async(zp);
1163 		return (SET_ERROR(ENOENT));
1164 	}
1165 	/* check if generation number matches */
1166 	if (sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs), &zp_gen,
1167 	    sizeof (zp_gen)) != 0) {
1168 		zfs_zrele_async(zp);
1169 		return (SET_ERROR(EIO));
1170 	}
1171 	if (zp_gen != gen) {
1172 		zfs_zrele_async(zp);
1173 		return (SET_ERROR(ENOENT));
1174 	}
1175 
1176 	zgd = kmem_zalloc(sizeof (zgd_t), KM_SLEEP);
1177 	zgd->zgd_lwb = lwb;
1178 	zgd->zgd_private = zp;
1179 
1180 	/*
1181 	 * Write records come in two flavors: immediate and indirect.
1182 	 * For small writes it's cheaper to store the data with the
1183 	 * log record (immediate); for large writes it's cheaper to
1184 	 * sync the data and get a pointer to it (indirect) so that
1185 	 * we don't have to write the data twice.
1186 	 */
1187 	if (buf != NULL) { /* immediate write */
1188 		zgd->zgd_lr = zfs_rangelock_enter(&zp->z_rangelock, offset,
1189 		    size, RL_READER);
1190 		/* test for truncation needs to be done while range locked */
1191 		if (offset >= zp->z_size) {
1192 			error = SET_ERROR(ENOENT);
1193 		} else {
1194 			error = dmu_read(os, object, offset, size, buf,
1195 			    DMU_READ_NO_PREFETCH);
1196 		}
1197 		ASSERT(error == 0 || error == ENOENT);
1198 	} else { /* indirect write */
1199 		ASSERT3P(zio, !=, NULL);
1200 		/*
1201 		 * Have to lock the whole block to ensure when it's
1202 		 * written out and its checksum is being calculated
1203 		 * that no one can change the data. We need to re-check
1204 		 * blocksize after we get the lock in case it's changed!
1205 		 */
1206 		for (;;) {
1207 			uint64_t blkoff;
1208 			size = zp->z_blksz;
1209 			blkoff = ISP2(size) ? P2PHASE(offset, size) : offset;
1210 			offset -= blkoff;
1211 			zgd->zgd_lr = zfs_rangelock_enter(&zp->z_rangelock,
1212 			    offset, size, RL_READER);
1213 			if (zp->z_blksz == size)
1214 				break;
1215 			offset += blkoff;
1216 			zfs_rangelock_exit(zgd->zgd_lr);
1217 		}
1218 		/* test for truncation needs to be done while range locked */
1219 		if (lr->lr_offset >= zp->z_size)
1220 			error = SET_ERROR(ENOENT);
1221 #ifdef ZFS_DEBUG
1222 		if (zil_fault_io) {
1223 			error = SET_ERROR(EIO);
1224 			zil_fault_io = 0;
1225 		}
1226 #endif
1227 
1228 		dmu_buf_t *dbp;
1229 		if (error == 0)
1230 			error = dmu_buf_hold_noread(os, object, offset, zgd,
1231 			    &dbp);
1232 
1233 		if (error == 0) {
1234 			zgd->zgd_db = dbp;
1235 			dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbp;
1236 			boolean_t direct_write = B_FALSE;
1237 			mutex_enter(&db->db_mtx);
1238 			dbuf_dirty_record_t *dr =
1239 			    dbuf_find_dirty_eq(db, lr->lr_common.lrc_txg);
1240 			if (dr != NULL && dr->dt.dl.dr_diowrite)
1241 				direct_write = B_TRUE;
1242 			mutex_exit(&db->db_mtx);
1243 
1244 			/*
1245 			 * All Direct I/O writes will have already completed and
1246 			 * the block pointer can be immediately stored in the
1247 			 * log record.
1248 			 */
1249 			if (direct_write) {
1250 				/*
1251 				 * A Direct I/O write always covers an entire
1252 				 * block.
1253 				 */
1254 				ASSERT3U(dbp->db_size, ==, zp->z_blksz);
1255 				lr->lr_blkptr = dr->dt.dl.dr_overridden_by;
1256 				zfs_get_done(zgd, 0);
1257 				return (0);
1258 			}
1259 
1260 			blkptr_t *bp = &lr->lr_blkptr;
1261 			zgd->zgd_bp = bp;
1262 
1263 			ASSERT3U(dbp->db_offset, ==, offset);
1264 			ASSERT3U(dbp->db_size, ==, size);
1265 
1266 			error = dmu_sync(zio, lr->lr_common.lrc_txg,
1267 			    zfs_get_done, zgd);
1268 			ASSERT(error || lr->lr_length <= size);
1269 
1270 			/*
1271 			 * On success, we need to wait for the write I/O
1272 			 * initiated by dmu_sync() to complete before we can
1273 			 * release this dbuf.  We will finish everything up
1274 			 * in the zfs_get_done() callback.
1275 			 */
1276 			if (error == 0)
1277 				return (0);
1278 
1279 			if (error == EALREADY) {
1280 				lr->lr_common.lrc_txtype = TX_WRITE2;
1281 				/*
1282 				 * TX_WRITE2 relies on the data previously
1283 				 * written by the TX_WRITE that caused
1284 				 * EALREADY.  We zero out the BP because
1285 				 * it is the old, currently-on-disk BP.
1286 				 */
1287 				zgd->zgd_bp = NULL;
1288 				BP_ZERO(bp);
1289 				error = 0;
1290 			}
1291 		}
1292 	}
1293 
1294 	zfs_get_done(zgd, error);
1295 
1296 	return (error);
1297 }
1298 
1299 static void
1300 zfs_get_done(zgd_t *zgd, int error)
1301 {
1302 	(void) error;
1303 	znode_t *zp = zgd->zgd_private;
1304 
1305 	if (zgd->zgd_db)
1306 		dmu_buf_rele(zgd->zgd_db, zgd);
1307 
1308 	zfs_rangelock_exit(zgd->zgd_lr);
1309 
1310 	/*
1311 	 * Release the vnode asynchronously as we currently have the
1312 	 * txg stopped from syncing.
1313 	 */
1314 	zfs_zrele_async(zp);
1315 
1316 	kmem_free(zgd, sizeof (zgd_t));
1317 }
1318 
1319 static int
1320 zfs_enter_two(zfsvfs_t *zfsvfs1, zfsvfs_t *zfsvfs2, const char *tag)
1321 {
1322 	int error;
1323 
1324 	/* Swap. Not sure if the order of zfs_enter()s is important. */
1325 	if (zfsvfs1 > zfsvfs2) {
1326 		zfsvfs_t *tmpzfsvfs;
1327 
1328 		tmpzfsvfs = zfsvfs2;
1329 		zfsvfs2 = zfsvfs1;
1330 		zfsvfs1 = tmpzfsvfs;
1331 	}
1332 
1333 	error = zfs_enter(zfsvfs1, tag);
1334 	if (error != 0)
1335 		return (error);
1336 	if (zfsvfs1 != zfsvfs2) {
1337 		error = zfs_enter(zfsvfs2, tag);
1338 		if (error != 0) {
1339 			zfs_exit(zfsvfs1, tag);
1340 			return (error);
1341 		}
1342 	}
1343 
1344 	return (0);
1345 }
1346 
1347 static void
1348 zfs_exit_two(zfsvfs_t *zfsvfs1, zfsvfs_t *zfsvfs2, const char *tag)
1349 {
1350 
1351 	zfs_exit(zfsvfs1, tag);
1352 	if (zfsvfs1 != zfsvfs2)
1353 		zfs_exit(zfsvfs2, tag);
1354 }
1355 
1356 /*
1357  * We split each clone request in chunks that can fit into a single ZIL
1358  * log entry. Each ZIL log entry can fit 130816 bytes for a block cloning
1359  * operation (see zil_max_log_data() and zfs_log_clone_range()). This gives
1360  * us room for storing 1022 block pointers.
1361  *
1362  * On success, the function returns the number of bytes copied in *lenp.
1363  * Note, it doesn't return how many bytes are left to be copied.
1364  * On errors which are caused by any file system or BRT limitation,
1365  * `EINVAL` is returned. In most cases the user requested bad parameters:
1366  * it might be possible to clone the file, but some parameters don't match
1367  * the requirements.
1368  */
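/*
 * As a back-of-the-envelope check of the numbers above: a blkptr_t is 128
 * bytes, so 130816 / 128 = 1022 block pointers per chunk, and with a 128K
 * recordsize a single chunk can clone up to 1022 * 128K (just under 128M)
 * at a time.
 */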
1369 int
1370 zfs_clone_range(znode_t *inzp, uint64_t *inoffp, znode_t *outzp,
1371     uint64_t *outoffp, uint64_t *lenp, cred_t *cr)
1372 {
1373 	zfsvfs_t	*inzfsvfs, *outzfsvfs;
1374 	objset_t	*inos, *outos;
1375 	zfs_locked_range_t *inlr, *outlr;
1376 	dmu_buf_impl_t	*db;
1377 	dmu_tx_t	*tx;
1378 	zilog_t		*zilog;
1379 	uint64_t	inoff, outoff, len, done;
1380 	uint64_t	outsize, size;
1381 	int		error;
1382 	int		count = 0;
1383 	sa_bulk_attr_t	bulk[3];
1384 	uint64_t	mtime[2], ctime[2];
1385 	uint64_t	uid, gid, projid;
1386 	blkptr_t	*bps;
1387 	size_t		maxblocks, nbps;
1388 	uint_t		inblksz;
1389 	uint64_t	clear_setid_bits_txg = 0;
1390 	uint64_t	last_synced_txg = 0;
1391 
1392 	inoff = *inoffp;
1393 	outoff = *outoffp;
1394 	len = *lenp;
1395 	done = 0;
1396 
1397 	inzfsvfs = ZTOZSB(inzp);
1398 	outzfsvfs = ZTOZSB(outzp);
1399 
1400 	/*
1401 	 * We need to call zfs_enter() potentially on two different datasets,
1402 	 * so we need a dedicated function for that.
1403 	 */
1404 	error = zfs_enter_two(inzfsvfs, outzfsvfs, FTAG);
1405 	if (error != 0)
1406 		return (error);
1407 
1408 	inos = inzfsvfs->z_os;
1409 	outos = outzfsvfs->z_os;
1410 
1411 	/*
1412 	 * Both source and destination have to belong to the same storage pool.
1413 	 */
1414 	if (dmu_objset_spa(inos) != dmu_objset_spa(outos)) {
1415 		zfs_exit_two(inzfsvfs, outzfsvfs, FTAG);
1416 		return (SET_ERROR(EXDEV));
1417 	}
1418 
1419 	/*
1420 	 * outos and inos belong to the same storage pool, as verified a few
1421 	 * lines above, so only one check is needed.
1422 	 */
1423 	if (!spa_feature_is_enabled(dmu_objset_spa(outos),
1424 	    SPA_FEATURE_BLOCK_CLONING)) {
1425 		zfs_exit_two(inzfsvfs, outzfsvfs, FTAG);
1426 		return (SET_ERROR(EOPNOTSUPP));
1427 	}
1428 
1429 	ASSERT(!outzfsvfs->z_replay);
1430 
1431 	/*
1432 	 * Block cloning from an unencrypted dataset into an encrypted
1433 	 * dataset and vice versa is not supported.
1434 	 */
1435 	if (inos->os_encrypted != outos->os_encrypted) {
1436 		zfs_exit_two(inzfsvfs, outzfsvfs, FTAG);
1437 		return (SET_ERROR(EXDEV));
1438 	}
1439 
1440 	/*
1441 	 * Cloning across encrypted datasets is possible only if they
1442 	 * share the same master key.
1443 	 */
1444 	if (inos != outos && inos->os_encrypted &&
1445 	    !dmu_objset_crypto_key_equal(inos, outos)) {
1446 		zfs_exit_two(inzfsvfs, outzfsvfs, FTAG);
1447 		return (SET_ERROR(EXDEV));
1448 	}
1449 
1450 	error = zfs_verify_zp(inzp);
1451 	if (error == 0)
1452 		error = zfs_verify_zp(outzp);
1453 	if (error != 0) {
1454 		zfs_exit_two(inzfsvfs, outzfsvfs, FTAG);
1455 		return (error);
1456 	}
1457 
1458 	/*
1459 	 * We don't copy the source file's flags, which is why we don't allow
1460 	 * cloning files that are in quarantine.
1461 	 */
1462 	if (inzp->z_pflags & ZFS_AV_QUARANTINED) {
1463 		zfs_exit_two(inzfsvfs, outzfsvfs, FTAG);
1464 		return (SET_ERROR(EACCES));
1465 	}
1466 
1467 	if (inoff >= inzp->z_size) {
1468 		*lenp = 0;
1469 		zfs_exit_two(inzfsvfs, outzfsvfs, FTAG);
1470 		return (0);
1471 	}
1472 	if (len > inzp->z_size - inoff) {
1473 		len = inzp->z_size - inoff;
1474 	}
1475 	if (len == 0) {
1476 		*lenp = 0;
1477 		zfs_exit_two(inzfsvfs, outzfsvfs, FTAG);
1478 		return (0);
1479 	}
1480 
1481 	/*
1482 	 * Callers might not be able to detect properly that we are read-only,
1483 	 * so check it explicitly here.
1484 	 */
1485 	if (zfs_is_readonly(outzfsvfs)) {
1486 		zfs_exit_two(inzfsvfs, outzfsvfs, FTAG);
1487 		return (SET_ERROR(EROFS));
1488 	}
1489 
1490 	/*
1491 	 * If immutable or not appending then return EPERM.
1492 	 * Intentionally allow ZFS_READONLY through here.
1493 	 * See zfs_zaccess_common()
1494 	 */
1495 	if ((outzp->z_pflags & ZFS_IMMUTABLE) != 0) {
1496 		zfs_exit_two(inzfsvfs, outzfsvfs, FTAG);
1497 		return (SET_ERROR(EPERM));
1498 	}
1499 
1500 	/*
1501 	 * No overlapping if we are cloning within the same file.
1502 	 */
1503 	if (inzp == outzp) {
1504 		if (inoff < outoff + len && outoff < inoff + len) {
1505 			zfs_exit_two(inzfsvfs, outzfsvfs, FTAG);
1506 			return (SET_ERROR(EINVAL));
1507 		}
1508 	}
1509 
1510 	/* Flush any mmap()'d data to disk */
1511 	if (zn_has_cached_data(inzp, inoff, inoff + len - 1))
1512 		zn_flush_cached_data(inzp, B_TRUE);
1513 
1514 	/*
1515 	 * Maintain predictable lock order.
1516 	 */
1517 	if (inzp < outzp || (inzp == outzp && inoff < outoff)) {
1518 		inlr = zfs_rangelock_enter(&inzp->z_rangelock, inoff, len,
1519 		    RL_READER);
1520 		outlr = zfs_rangelock_enter(&outzp->z_rangelock, outoff, len,
1521 		    RL_WRITER);
1522 	} else {
1523 		outlr = zfs_rangelock_enter(&outzp->z_rangelock, outoff, len,
1524 		    RL_WRITER);
1525 		inlr = zfs_rangelock_enter(&inzp->z_rangelock, inoff, len,
1526 		    RL_READER);
1527 	}
1528 
1529 	inblksz = inzp->z_blksz;
1530 
1531 	/*
1532 	 * We cannot clone into a file with a different block size if we can't
1533 	 * grow it (block size is already bigger, has more than one block, or
1534 	 * not locked for growth).  There are other possible reasons for the
1535 	 * grow to fail, but we cover what we can before opening the
1536 	 * transaction and detect the rest after we try to do it.
1537 	 */
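	/*
	 * For example (illustrative numbers): cloning from a source with
	 * 128K blocks into a destination that already holds two 8K blocks
	 * fails with EINVAL below, since the destination's block size can no
	 * longer be grown to match.
	 */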
1538 	if (inblksz < outzp->z_blksz) {
1539 		error = SET_ERROR(EINVAL);
1540 		goto unlock;
1541 	}
1542 	if (inblksz != outzp->z_blksz && (outzp->z_size > outzp->z_blksz ||
1543 	    outlr->lr_length != UINT64_MAX)) {
1544 		error = SET_ERROR(EINVAL);
1545 		goto unlock;
1546 	}
1547 
1548 	/*
1549 	 * Block size must be power-of-2 if destination offset != 0.
1550 	 * There can be no multiple blocks of non-power-of-2 size.
1551 	 */
1552 	if (outoff != 0 && !ISP2(inblksz)) {
1553 		error = SET_ERROR(EINVAL);
1554 		goto unlock;
1555 	}
1556 
1557 	/*
1558 	 * Offsets and len must be at block boundaries.
1559 	 */
1560 	if ((inoff % inblksz) != 0 || (outoff % inblksz) != 0) {
1561 		error = SET_ERROR(EINVAL);
1562 		goto unlock;
1563 	}
1564 	/*
1565 	 * Length must be a multiple of blksz, except for the end of the file.
1566 	 */
1567 	if ((len % inblksz) != 0 &&
1568 	    (len < inzp->z_size - inoff || len < outzp->z_size - outoff)) {
1569 		error = SET_ERROR(EINVAL);
1570 		goto unlock;
1571 	}
1572 
1573 	/*
1574 	 * If we are copying only one block and it is smaller than the recordsize
1575 	 * property, do not allow the destination to grow beyond one block if it
1576 	 * is not there yet.  Otherwise the destination will get stuck with
1577 	 * that block size forever, which can be as small as 512 bytes, no
1578 	 * matter how big the destination grows later.
1579 	 */
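	/*
	 * For example (illustrative numbers, recordsize=128K): cloning a
	 * single 4K block from the source to offset 128K of an empty
	 * destination is rejected with EINVAL below, since it would leave
	 * the destination stuck with a 4K block size forever.
	 */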
1580 	if (len <= inblksz && inblksz < outzfsvfs->z_max_blksz &&
1581 	    outzp->z_size <= inblksz && outoff + len > inblksz) {
1582 		error = SET_ERROR(EINVAL);
1583 		goto unlock;
1584 	}
1585 
1586 	error = zn_rlimit_fsize(outoff + len);
1587 	if (error != 0) {
1588 		goto unlock;
1589 	}
1590 
1591 	if (inoff >= MAXOFFSET_T || outoff >= MAXOFFSET_T) {
1592 		error = SET_ERROR(EFBIG);
1593 		goto unlock;
1594 	}
1595 
1596 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(outzfsvfs), NULL,
1597 	    &mtime, 16);
1598 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(outzfsvfs), NULL,
1599 	    &ctime, 16);
1600 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(outzfsvfs), NULL,
1601 	    &outzp->z_size, 8);
1602 
1603 	zilog = outzfsvfs->z_log;
1604 	maxblocks = zil_max_log_data(zilog, sizeof (lr_clone_range_t)) /
1605 	    sizeof (bps[0]);
1606 
1607 	uid = KUID_TO_SUID(ZTOUID(outzp));
1608 	gid = KGID_TO_SGID(ZTOGID(outzp));
1609 	projid = outzp->z_projid;
1610 
1611 	bps = vmem_alloc(sizeof (bps[0]) * maxblocks, KM_SLEEP);
1612 
1613 	/*
1614 	 * Clone the file in reasonable size chunks.  Each chunk is cloned
1615 	 * in a separate transaction; this keeps the intent log records small
1616 	 * and allows us to do more fine-grained space accounting.
1617 	 */
1618 	while (len > 0) {
1619 		size = MIN(inblksz * maxblocks, len);
1620 
1621 		if (zfs_id_overblockquota(outzfsvfs, DMU_USERUSED_OBJECT,
1622 		    uid) ||
1623 		    zfs_id_overblockquota(outzfsvfs, DMU_GROUPUSED_OBJECT,
1624 		    gid) ||
1625 		    (projid != ZFS_DEFAULT_PROJID &&
1626 		    zfs_id_overblockquota(outzfsvfs, DMU_PROJECTUSED_OBJECT,
1627 		    projid))) {
1628 			error = SET_ERROR(EDQUOT);
1629 			break;
1630 		}
1631 
1632 		nbps = maxblocks;
1633 		last_synced_txg = spa_last_synced_txg(dmu_objset_spa(inos));
1634 		error = dmu_read_l0_bps(inos, inzp->z_id, inoff, size, bps,
1635 		    &nbps);
1636 		if (error != 0) {
1637 			/*
1638 			 * If we are trying to clone a block that was created
1639 			 * in the current transaction group, the error will be
1640 			 * EAGAIN here.  Based on zfs_bclone_wait_dirty either
1641 			 * return a shortened range to the caller so it can
1642 			 * fall back, or wait for the next TXG and check again.
1643 			 */
1644 			if (error == EAGAIN && zfs_bclone_wait_dirty) {
1645 				txg_wait_synced(dmu_objset_pool(inos),
1646 				    last_synced_txg + 1);
1647 				continue;
1648 			}
1649 
1650 			break;
1651 		}
1652 
1653 		/*
1654 		 * Start a transaction.
1655 		 */
1656 		tx = dmu_tx_create(outos);
1657 		dmu_tx_hold_sa(tx, outzp->z_sa_hdl, B_FALSE);
1658 		db = (dmu_buf_impl_t *)sa_get_db(outzp->z_sa_hdl);
1659 		DB_DNODE_ENTER(db);
1660 		dmu_tx_hold_clone_by_dnode(tx, DB_DNODE(db), outoff, size);
1661 		DB_DNODE_EXIT(db);
1662 		zfs_sa_upgrade_txholds(tx, outzp);
1663 		error = dmu_tx_assign(tx, DMU_TX_WAIT);
1664 		if (error != 0) {
1665 			dmu_tx_abort(tx);
1666 			break;
1667 		}
1668 
1669 		/*
1670 		 * Copy source znode's block size. This is done only if the
1671 		 * whole znode is locked (see zfs_rangelock_cb()) and only
1672 		 * on the first iteration since zfs_rangelock_reduce() will
1673 		 * shrink down lr_length to the appropriate size.
1674 		 */
1675 		if (outlr->lr_length == UINT64_MAX) {
1676 			zfs_grow_blocksize(outzp, inblksz, tx);
1677 
1678 			/*
1679 			 * Block growth may fail for many reasons we can not
1680 			 * predict here.  If it happens, the cloning is doomed.
1681 			 */
1682 			if (inblksz != outzp->z_blksz) {
1683 				error = SET_ERROR(EINVAL);
1684 				dmu_tx_abort(tx);
1685 				break;
1686 			}
1687 
1688 			/*
1689 			 * Round range lock up to the block boundary, so we
1690 			 * prevent appends until we are done.
1691 			 */
1692 			zfs_rangelock_reduce(outlr, outoff,
1693 			    ((len - 1) / inblksz + 1) * inblksz);
1694 		}
1695 
1696 		error = dmu_brt_clone(outos, outzp->z_id, outoff, size, tx,
1697 		    bps, nbps);
1698 		if (error != 0) {
1699 			dmu_tx_commit(tx);
1700 			break;
1701 		}
1702 
1703 		if (zn_has_cached_data(outzp, outoff, outoff + size - 1)) {
1704 			update_pages(outzp, outoff, size, outos);
1705 		}
1706 
1707 		zfs_clear_setid_bits_if_necessary(outzfsvfs, outzp, cr,
1708 		    &clear_setid_bits_txg, tx);
1709 
1710 		zfs_tstamp_update_setup(outzp, CONTENT_MODIFIED, mtime, ctime);
1711 
1712 		/*
1713 		 * Update the file size (zp_size) if it has changed;
1714 		 * account for possible concurrent updates.
1715 		 */
1716 		while ((outsize = outzp->z_size) < outoff + size) {
1717 			(void) atomic_cas_64(&outzp->z_size, outsize,
1718 			    outoff + size);
1719 		}
1720 
1721 		error = sa_bulk_update(outzp->z_sa_hdl, bulk, count, tx);
1722 
1723 		zfs_log_clone_range(zilog, tx, TX_CLONE_RANGE, outzp, outoff,
1724 		    size, inblksz, bps, nbps);
1725 
1726 		dmu_tx_commit(tx);
1727 
1728 		if (error != 0)
1729 			break;
1730 
1731 		inoff += size;
1732 		outoff += size;
1733 		len -= size;
1734 		done += size;
1735 
1736 		if (issig()) {
1737 			error = SET_ERROR(EINTR);
1738 			break;
1739 		}
1740 	}
1741 
1742 	vmem_free(bps, sizeof (bps[0]) * maxblocks);
1743 	zfs_znode_update_vfs(outzp);
1744 
1745 unlock:
1746 	zfs_rangelock_exit(outlr);
1747 	zfs_rangelock_exit(inlr);
1748 
1749 	if (done > 0) {
1750 		/*
1751 		 * If we have made at least partial progress, reset the error.
1752 		 */
1753 		error = 0;
1754 
1755 		ZFS_ACCESSTIME_STAMP(inzfsvfs, inzp);
1756 
1757 		if (outos->os_sync == ZFS_SYNC_ALWAYS) {
1758 			zil_commit(zilog, outzp->z_id);
1759 		}
1760 
1761 		*inoffp += done;
1762 		*outoffp += done;
1763 		*lenp = done;
1764 	} else {
1765 		/*
1766 		 * If we made no progress, there must be a good reason.
1767 		 * EOF is handled explicitly above, before the loop.
1768 		 */
1769 		ASSERT3S(error, !=, 0);
1770 	}
1771 
1772 	zfs_exit_two(inzfsvfs, outzfsvfs, FTAG);
1773 
1774 	return (error);
1775 }
1776 
1777 /*
1778  * Usual pattern would be to call zfs_clone_range() from zfs_replay_clone(),
1779  * but we cannot do that, because when replaying we don't have source znode
1780  * available. This is why we need a dedicated replay function.
1781  */
1782 int
1783 zfs_clone_range_replay(znode_t *zp, uint64_t off, uint64_t len, uint64_t blksz,
1784     const blkptr_t *bps, size_t nbps)
1785 {
1786 	zfsvfs_t	*zfsvfs;
1787 	dmu_buf_impl_t	*db;
1788 	dmu_tx_t	*tx;
1789 	int		error;
1790 	int		count = 0;
1791 	sa_bulk_attr_t	bulk[3];
1792 	uint64_t	mtime[2], ctime[2];
1793 
1794 	ASSERT3U(off, <, MAXOFFSET_T);
1795 	ASSERT3U(len, >, 0);
1796 	ASSERT3U(nbps, >, 0);
1797 
1798 	zfsvfs = ZTOZSB(zp);
1799 
1800 	ASSERT(spa_feature_is_enabled(dmu_objset_spa(zfsvfs->z_os),
1801 	    SPA_FEATURE_BLOCK_CLONING));
1802 
1803 	if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
1804 		return (error);
1805 
1806 	ASSERT(zfsvfs->z_replay);
1807 	ASSERT(!zfs_is_readonly(zfsvfs));
1808 
1809 	if ((off % blksz) != 0) {
1810 		zfs_exit(zfsvfs, FTAG);
1811 		return (SET_ERROR(EINVAL));
1812 	}
1813 
1814 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
1815 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
1816 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
1817 	    &zp->z_size, 8);
1818 
1819 	/*
1820 	 * Start a transaction.
1821 	 */
1822 	tx = dmu_tx_create(zfsvfs->z_os);
1823 
1824 	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
1825 	db = (dmu_buf_impl_t *)sa_get_db(zp->z_sa_hdl);
1826 	DB_DNODE_ENTER(db);
1827 	dmu_tx_hold_clone_by_dnode(tx, DB_DNODE(db), off, len);
1828 	DB_DNODE_EXIT(db);
1829 	zfs_sa_upgrade_txholds(tx, zp);
1830 	error = dmu_tx_assign(tx, DMU_TX_WAIT);
1831 	if (error != 0) {
1832 		dmu_tx_abort(tx);
1833 		zfs_exit(zfsvfs, FTAG);
1834 		return (error);
1835 	}
1836 
1837 	if (zp->z_blksz < blksz)
1838 		zfs_grow_blocksize(zp, blksz, tx);
1839 
1840 	dmu_brt_clone(zfsvfs->z_os, zp->z_id, off, len, tx, bps, nbps);
1841 
1842 	zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime);
1843 
1844 	if (zp->z_size < off + len)
1845 		zp->z_size = off + len;
1846 
1847 	error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
1848 
1849 	/*
1850 	 * zil_replaying() not only checks if we are replaying the ZIL, but also
1851 	 * updates the ZIL header to record replay progress.
1852 	 */
1853 	VERIFY(zil_replaying(zfsvfs->z_log, tx));
1854 
1855 	dmu_tx_commit(tx);
1856 
1857 	zfs_znode_update_vfs(zp);
1858 
1859 	zfs_exit(zfsvfs, FTAG);
1860 
1861 	return (error);
1862 }
1863 
1864 EXPORT_SYMBOL(zfs_access);
1865 EXPORT_SYMBOL(zfs_fsync);
1866 EXPORT_SYMBOL(zfs_holey);
1867 EXPORT_SYMBOL(zfs_read);
1868 EXPORT_SYMBOL(zfs_write);
1869 EXPORT_SYMBOL(zfs_getsecattr);
1870 EXPORT_SYMBOL(zfs_setsecattr);
1871 EXPORT_SYMBOL(zfs_clone_range);
1872 EXPORT_SYMBOL(zfs_clone_range_replay);
1873 
1874 ZFS_MODULE_PARAM(zfs_vnops, zfs_vnops_, read_chunk_size, U64, ZMOD_RW,
1875 	"Bytes to read per chunk");
1876 
1877 ZFS_MODULE_PARAM(zfs, zfs_, bclone_enabled, INT, ZMOD_RW,
1878 	"Enable block cloning");
1879 
1880 ZFS_MODULE_PARAM(zfs, zfs_, bclone_wait_dirty, INT, ZMOD_RW,
1881 	"Wait for dirty blocks when cloning");
1882 
1883 ZFS_MODULE_PARAM(zfs, zfs_, dio_enabled, INT, ZMOD_RW,
1884 	"Enable Direct I/O");
1885