xref: /illumos-gate/usr/src/uts/common/fs/zfs/dmu_send.c (revision 33c72b7598992897b94815b1f47b7b8077e53808)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23  * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
24  * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
25  * Copyright (c) 2014, Joyent, Inc. All rights reserved.
26  * Copyright 2014 HybridCluster. All rights reserved.
27  * Copyright 2016 RackTop Systems.
28  * Copyright (c) 2014 Integros [integros.com]
29  */
30 
31 #include <sys/dmu.h>
32 #include <sys/dmu_impl.h>
33 #include <sys/dmu_tx.h>
34 #include <sys/dbuf.h>
35 #include <sys/dnode.h>
36 #include <sys/zfs_context.h>
37 #include <sys/dmu_objset.h>
38 #include <sys/dmu_traverse.h>
39 #include <sys/dsl_dataset.h>
40 #include <sys/dsl_dir.h>
41 #include <sys/dsl_prop.h>
42 #include <sys/dsl_pool.h>
43 #include <sys/dsl_synctask.h>
44 #include <sys/zfs_ioctl.h>
45 #include <sys/zap.h>
46 #include <sys/zio_checksum.h>
47 #include <sys/zfs_znode.h>
48 #include <zfs_fletcher.h>
49 #include <sys/avl.h>
50 #include <sys/ddt.h>
51 #include <sys/zfs_onexit.h>
52 #include <sys/dmu_send.h>
53 #include <sys/dsl_destroy.h>
54 #include <sys/blkptr.h>
55 #include <sys/dsl_bookmark.h>
56 #include <sys/zfeature.h>
57 #include <sys/bqueue.h>
58 
/*
 * Set this tunable to TRUE to replace corrupt data with 0x2f5baddb10c
 * instead of failing the send with EIO when a data block cannot be read.
 */
int zfs_send_corrupt_data = B_FALSE;
/*
 * NOTE(review): not referenced in this part of the file; presumably the
 * size budget, in bytes, of the queue between the traversal thread and the
 * sending thread -- confirm against the code that initializes the bqueue.
 */
int zfs_send_queue_length = 16 * 1024 * 1024;
/* Set this tunable to FALSE to disable setting of DRR_FLAG_FREERECORDS */
int zfs_send_set_freerecords_bit = B_TRUE;
/* Set this tunable to FALSE to disable sending unmodified spill blocks. */
int zfs_send_unmodified_spill_blocks = B_TRUE;

/*
 * Use this to override the recordsize calculation for fast zfs send estimates.
 */
uint64_t zfs_override_estimate_recordsize = 0;
71 
/*
 * Number of bytes of an object covered by one block pointer at the given
 * level: the data block size (datablkszsec sectors, converted to bytes with
 * SPA_MINBLOCKSHIFT) multiplied by 2^(indblkshift - SPA_BLKPTRSHIFT) for
 * every level above zero.  The result is computed in 64 bits.
 *
 * Every parameter is parenthesized so that callers may pass arbitrary
 * expressions without operator-precedence surprises.
 */
#define	BP_SPAN(datablkszsec, indblkshift, level) \
	(((uint64_t)(datablkszsec)) << (SPA_MINBLOCKSHIFT + \
	(level) * ((indblkshift) - SPA_BLKPTRSHIFT)))
75 
/*
 * Arguments and shared state for the traversal thread
 * (send_traverse_thread()) and its per-block callback (send_cb()).
 */
struct send_thread_arg {
	bqueue_t	q;		/* send_cb() enqueues records here */
	dsl_dataset_t	*ds;		/* Dataset to traverse */
	uint64_t	fromtxg;	/* Traverse from this txg */
	int		flags;		/* flags to pass to traverse_dataset */
	int		error_code;	/* traversal result, unless EINTR */
	boolean_t	cancel;		/* when set, send_cb() returns EINTR */
	zbookmark_phys_t resume;	/* bookmark to resume traversal from */
};
85 
/*
 * One unit of work handed from the traversal thread to do_dump(): a copy of
 * a visited block pointer plus the dnode geometry needed to interpret it.
 */
struct send_block_record {
	boolean_t		eos_marker; /* Marks the end of the stream */
	blkptr_t		bp;	/* copy of the visited block pointer */
	zbookmark_phys_t	zb;	/* copy of the block's bookmark */
	uint8_t			indblkshift;	/* dn_indblkshift of the dnode */
	uint16_t		datablkszsec;	/* dn_datablkszsec of the dnode */
	bqueue_node_t		ln;	/* presumably bqueue linkage; confirm */
};
94 
/* Forward declaration: dump_dnode() calls do_dump() ahead of its definition. */
static int do_dump(dmu_sendarg_t *dsa, struct send_block_record *data);
96 
97 static int
98 dump_bytes(dmu_sendarg_t *dsp, void *buf, int len)
99 {
100 	dsl_dataset_t *ds = dmu_objset_ds(dsp->dsa_os);
101 	ssize_t resid; /* have to get resid to get detailed errno */
102 
103 	/*
104 	 * The code does not rely on len being a multiple of 8.  We keep
105 	 * this assertion because of the corresponding assertion in
106 	 * receive_read().  Keeping this assertion ensures that we do not
107 	 * inadvertently break backwards compatibility (causing the assertion
108 	 * in receive_read() to trigger on old software). Newer feature flags
109 	 * (such as raw send) may break this assertion since they were
110 	 * introduced after the requirement was made obsolete.
111 	 */
112 
113 	ASSERT(len % 8 == 0 ||
114 	    (dsp->dsa_featureflags & DMU_BACKUP_FEATURE_RAW) != 0);
115 
116 	dsp->dsa_err = vn_rdwr(UIO_WRITE, dsp->dsa_vp,
117 	    (caddr_t)buf, len,
118 	    0, UIO_SYSSPACE, FAPPEND, RLIM64_INFINITY, CRED(), &resid);
119 
120 	mutex_enter(&ds->ds_sendstream_lock);
121 	*dsp->dsa_off += len;
122 	mutex_exit(&ds->ds_sendstream_lock);
123 
124 	return (dsp->dsa_err);
125 }
126 
127 /*
128  * For all record types except BEGIN, fill in the checksum (overlaid in
129  * drr_u.drr_checksum.drr_checksum).  The checksum verifies everything
130  * up to the start of the checksum itself.
131  */
132 static int
133 dump_record(dmu_sendarg_t *dsp, void *payload, int payload_len)
134 {
135 	ASSERT3U(offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum),
136 	    ==, sizeof (dmu_replay_record_t) - sizeof (zio_cksum_t));
137 	(void) fletcher_4_incremental_native(dsp->dsa_drr,
138 	    offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum),
139 	    &dsp->dsa_zc);
140 	if (dsp->dsa_drr->drr_type == DRR_BEGIN) {
141 		dsp->dsa_sent_begin = B_TRUE;
142 	} else {
143 		ASSERT(ZIO_CHECKSUM_IS_ZERO(&dsp->dsa_drr->drr_u.
144 		    drr_checksum.drr_checksum));
145 		dsp->dsa_drr->drr_u.drr_checksum.drr_checksum = dsp->dsa_zc;
146 	}
147 	if (dsp->dsa_drr->drr_type == DRR_END) {
148 		dsp->dsa_sent_end = B_TRUE;
149 	}
150 	(void) fletcher_4_incremental_native(&dsp->dsa_drr->
151 	    drr_u.drr_checksum.drr_checksum,
152 	    sizeof (zio_cksum_t), &dsp->dsa_zc);
153 	if (dump_bytes(dsp, dsp->dsa_drr, sizeof (dmu_replay_record_t)) != 0)
154 		return (SET_ERROR(EINTR));
155 	if (payload_len != 0) {
156 		(void) fletcher_4_incremental_native(payload, payload_len,
157 		    &dsp->dsa_zc);
158 		if (dump_bytes(dsp, payload, payload_len) != 0)
159 			return (SET_ERROR(EINTR));
160 	}
161 	return (0);
162 }
163 
164 /*
165  * Fill in the drr_free struct, or perform aggregation if the previous record is
166  * also a free record, and the two are adjacent.
167  *
168  * Note that we send free records even for a full send, because we want to be
169  * able to receive a full send as a clone, which requires a list of all the free
170  * and freeobject records that were generated on the source.
171  */
172 static int
173 dump_free(dmu_sendarg_t *dsp, uint64_t object, uint64_t offset,
174     uint64_t length)
175 {
176 	struct drr_free *drrf = &(dsp->dsa_drr->drr_u.drr_free);
177 
178 	/*
179 	 * When we receive a free record, dbuf_free_range() assumes
180 	 * that the receiving system doesn't have any dbufs in the range
181 	 * being freed.  This is always true because there is a one-record
182 	 * constraint: we only send one WRITE record for any given
183 	 * object,offset.  We know that the one-record constraint is
184 	 * true because we always send data in increasing order by
185 	 * object,offset.
186 	 *
187 	 * If the increasing-order constraint ever changes, we should find
188 	 * another way to assert that the one-record constraint is still
189 	 * satisfied.
190 	 */
191 	ASSERT(object > dsp->dsa_last_data_object ||
192 	    (object == dsp->dsa_last_data_object &&
193 	    offset > dsp->dsa_last_data_offset));
194 
195 	/*
196 	 * If there is a pending op, but it's not PENDING_FREE, push it out,
197 	 * since free block aggregation can only be done for blocks of the
198 	 * same type (i.e., DRR_FREE records can only be aggregated with
199 	 * other DRR_FREE records.  DRR_FREEOBJECTS records can only be
200 	 * aggregated with other DRR_FREEOBJECTS records.
201 	 */
202 	if (dsp->dsa_pending_op != PENDING_NONE &&
203 	    dsp->dsa_pending_op != PENDING_FREE) {
204 		if (dump_record(dsp, NULL, 0) != 0)
205 			return (SET_ERROR(EINTR));
206 		dsp->dsa_pending_op = PENDING_NONE;
207 	}
208 
209 	if (dsp->dsa_pending_op == PENDING_FREE) {
210 		/*
211 		 * There should never be a PENDING_FREE if length is
212 		 * DMU_OBJECT_END (because dump_dnode is the only place where
213 		 * this function is called with a DMU_OBJECT_END, and only after
214 		 * flushing any pending record).
215 		 */
216 		ASSERT(length != DMU_OBJECT_END);
217 		/*
218 		 * Check to see whether this free block can be aggregated
219 		 * with pending one.
220 		 */
221 		if (drrf->drr_object == object && drrf->drr_offset +
222 		    drrf->drr_length == offset) {
223 			if (offset + length < offset)
224 				drrf->drr_length = DMU_OBJECT_END;
225 			else
226 				drrf->drr_length += length;
227 			return (0);
228 		} else {
229 			/* not a continuation.  Push out pending record */
230 			if (dump_record(dsp, NULL, 0) != 0)
231 				return (SET_ERROR(EINTR));
232 			dsp->dsa_pending_op = PENDING_NONE;
233 		}
234 	}
235 	/* create a FREE record and make it pending */
236 	bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t));
237 	dsp->dsa_drr->drr_type = DRR_FREE;
238 	drrf->drr_object = object;
239 	drrf->drr_offset = offset;
240 	if (offset + length < offset)
241 		drrf->drr_length = DMU_OBJECT_END;
242 	else
243 		drrf->drr_length = length;
244 	drrf->drr_toguid = dsp->dsa_toguid;
245 	if (length == DMU_OBJECT_END) {
246 		if (dump_record(dsp, NULL, 0) != 0)
247 			return (SET_ERROR(EINTR));
248 	} else {
249 		dsp->dsa_pending_op = PENDING_FREE;
250 	}
251 
252 	return (0);
253 }
254 
255 static int
256 dump_write(dmu_sendarg_t *dsp, dmu_object_type_t type, uint64_t object,
257     uint64_t offset, int lsize, int psize, const blkptr_t *bp, void *data)
258 {
259 	uint64_t payload_size;
260 	boolean_t raw = (dsp->dsa_featureflags & DMU_BACKUP_FEATURE_RAW);
261 	struct drr_write *drrw = &(dsp->dsa_drr->drr_u.drr_write);
262 
263 	/*
264 	 * We send data in increasing object, offset order.
265 	 * See comment in dump_free() for details.
266 	 */
267 	ASSERT(object > dsp->dsa_last_data_object ||
268 	    (object == dsp->dsa_last_data_object &&
269 	    offset > dsp->dsa_last_data_offset));
270 	dsp->dsa_last_data_object = object;
271 	dsp->dsa_last_data_offset = offset + lsize - 1;
272 
273 	/*
274 	 * If there is any kind of pending aggregation (currently either
275 	 * a grouping of free objects or free blocks), push it out to
276 	 * the stream, since aggregation can't be done across operations
277 	 * of different types.
278 	 */
279 	if (dsp->dsa_pending_op != PENDING_NONE) {
280 		if (dump_record(dsp, NULL, 0) != 0)
281 			return (SET_ERROR(EINTR));
282 		dsp->dsa_pending_op = PENDING_NONE;
283 	}
284 	/* write a WRITE record */
285 	bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t));
286 	dsp->dsa_drr->drr_type = DRR_WRITE;
287 	drrw->drr_object = object;
288 	drrw->drr_type = type;
289 	drrw->drr_offset = offset;
290 	drrw->drr_toguid = dsp->dsa_toguid;
291 	drrw->drr_logical_size = lsize;
292 
293 	/* only set the compression fields if the buf is compressed or raw */
294 	if (raw || lsize != psize) {
295 		ASSERT(!BP_IS_EMBEDDED(bp));
296 		ASSERT3S(psize, >, 0);
297 
298 		if (raw) {
299 			ASSERT(BP_IS_PROTECTED(bp));
300 
301 			/*
302 			 * This is a raw protected block so we need to pass
303 			 * along everything the receiving side will need to
304 			 * interpret this block, including the byteswap, salt,
305 			 * IV, and MAC.
306 			 */
307 			if (BP_SHOULD_BYTESWAP(bp))
308 				drrw->drr_flags |= DRR_RAW_BYTESWAP;
309 			zio_crypt_decode_params_bp(bp, drrw->drr_salt,
310 			    drrw->drr_iv);
311 			zio_crypt_decode_mac_bp(bp, drrw->drr_mac);
312 		} else {
313 			/* this is a compressed block */
314 			ASSERT(dsp->dsa_featureflags &
315 			    DMU_BACKUP_FEATURE_COMPRESSED);
316 			ASSERT(!BP_SHOULD_BYTESWAP(bp));
317 			ASSERT(!DMU_OT_IS_METADATA(BP_GET_TYPE(bp)));
318 			ASSERT3U(BP_GET_COMPRESS(bp), !=, ZIO_COMPRESS_OFF);
319 			ASSERT3S(lsize, >=, psize);
320 		}
321 
322 		/* set fields common to compressed and raw sends */
323 		drrw->drr_compressiontype = BP_GET_COMPRESS(bp);
324 		drrw->drr_compressed_size = psize;
325 		payload_size = drrw->drr_compressed_size;
326 	} else {
327 		payload_size = drrw->drr_logical_size;
328 	}
329 
330 	if (bp == NULL || BP_IS_EMBEDDED(bp) || (BP_IS_PROTECTED(bp) && !raw)) {
331 		/*
332 		 * There's no pre-computed checksum for partial-block writes,
333 		 * embedded BP's, or encrypted BP's that are being sent as
334 		 * plaintext, so (like fletcher4-checkummed blocks) userland
335 		 * will have to compute a dedup-capable checksum itself.
336 		 */
337 		drrw->drr_checksumtype = ZIO_CHECKSUM_OFF;
338 	} else {
339 		drrw->drr_checksumtype = BP_GET_CHECKSUM(bp);
340 		if (zio_checksum_table[drrw->drr_checksumtype].ci_flags &
341 		    ZCHECKSUM_FLAG_DEDUP)
342 			drrw->drr_flags |= DRR_CHECKSUM_DEDUP;
343 		DDK_SET_LSIZE(&drrw->drr_key, BP_GET_LSIZE(bp));
344 		DDK_SET_PSIZE(&drrw->drr_key, BP_GET_PSIZE(bp));
345 		DDK_SET_COMPRESS(&drrw->drr_key, BP_GET_COMPRESS(bp));
346 		DDK_SET_CRYPT(&drrw->drr_key, BP_IS_PROTECTED(bp));
347 		drrw->drr_key.ddk_cksum = bp->blk_cksum;
348 	}
349 
350 	if (dump_record(dsp, data, payload_size) != 0)
351 		return (SET_ERROR(EINTR));
352 	return (0);
353 }
354 
355 static int
356 dump_write_embedded(dmu_sendarg_t *dsp, uint64_t object, uint64_t offset,
357     int blksz, const blkptr_t *bp)
358 {
359 	char buf[BPE_PAYLOAD_SIZE];
360 	struct drr_write_embedded *drrw =
361 	    &(dsp->dsa_drr->drr_u.drr_write_embedded);
362 
363 	if (dsp->dsa_pending_op != PENDING_NONE) {
364 		if (dump_record(dsp, NULL, 0) != 0)
365 			return (EINTR);
366 		dsp->dsa_pending_op = PENDING_NONE;
367 	}
368 
369 	ASSERT(BP_IS_EMBEDDED(bp));
370 
371 	bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t));
372 	dsp->dsa_drr->drr_type = DRR_WRITE_EMBEDDED;
373 	drrw->drr_object = object;
374 	drrw->drr_offset = offset;
375 	drrw->drr_length = blksz;
376 	drrw->drr_toguid = dsp->dsa_toguid;
377 	drrw->drr_compression = BP_GET_COMPRESS(bp);
378 	drrw->drr_etype = BPE_GET_ETYPE(bp);
379 	drrw->drr_lsize = BPE_GET_LSIZE(bp);
380 	drrw->drr_psize = BPE_GET_PSIZE(bp);
381 
382 	decode_embedded_bp_compressed(bp, buf);
383 
384 	if (dump_record(dsp, buf, P2ROUNDUP(drrw->drr_psize, 8)) != 0)
385 		return (EINTR);
386 	return (0);
387 }
388 
389 static int
390 dump_spill(dmu_sendarg_t *dsp, const blkptr_t *bp, uint64_t object, void *data)
391 {
392 	struct drr_spill *drrs = &(dsp->dsa_drr->drr_u.drr_spill);
393 	uint64_t blksz = BP_GET_LSIZE(bp);
394 	uint64_t payload_size = blksz;
395 
396 	if (dsp->dsa_pending_op != PENDING_NONE) {
397 		if (dump_record(dsp, NULL, 0) != 0)
398 			return (SET_ERROR(EINTR));
399 		dsp->dsa_pending_op = PENDING_NONE;
400 	}
401 
402 	/* write a SPILL record */
403 	bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t));
404 	dsp->dsa_drr->drr_type = DRR_SPILL;
405 	drrs->drr_object = object;
406 	drrs->drr_length = blksz;
407 	drrs->drr_toguid = dsp->dsa_toguid;
408 
409 	/* See comment in dump_dnode() for full details */
410 	if (zfs_send_unmodified_spill_blocks &&
411 	    (bp->blk_birth <= dsp->dsa_fromtxg)) {
412 		drrs->drr_flags |= DRR_SPILL_UNMODIFIED;
413 	}
414 
415 	/* handle raw send fields */
416 	if (dsp->dsa_featureflags & DMU_BACKUP_FEATURE_RAW) {
417 		ASSERT(BP_IS_PROTECTED(bp));
418 
419 		if (BP_SHOULD_BYTESWAP(bp))
420 			drrs->drr_flags |= DRR_RAW_BYTESWAP;
421 		drrs->drr_compressiontype = BP_GET_COMPRESS(bp);
422 		drrs->drr_compressed_size = BP_GET_PSIZE(bp);
423 		zio_crypt_decode_params_bp(bp, drrs->drr_salt, drrs->drr_iv);
424 		zio_crypt_decode_mac_bp(bp, drrs->drr_mac);
425 		payload_size = drrs->drr_compressed_size;
426 	}
427 
428 	if (dump_record(dsp, data, payload_size) != 0)
429 		return (SET_ERROR(EINTR));
430 	return (0);
431 }
432 
433 static int
434 dump_freeobjects(dmu_sendarg_t *dsp, uint64_t firstobj, uint64_t numobjs)
435 {
436 	struct drr_freeobjects *drrfo = &(dsp->dsa_drr->drr_u.drr_freeobjects);
437 
438 	/*
439 	 * If there is a pending op, but it's not PENDING_FREEOBJECTS,
440 	 * push it out, since free block aggregation can only be done for
441 	 * blocks of the same type (i.e., DRR_FREE records can only be
442 	 * aggregated with other DRR_FREE records.  DRR_FREEOBJECTS records
443 	 * can only be aggregated with other DRR_FREEOBJECTS records.
444 	 */
445 	if (dsp->dsa_pending_op != PENDING_NONE &&
446 	    dsp->dsa_pending_op != PENDING_FREEOBJECTS) {
447 		if (dump_record(dsp, NULL, 0) != 0)
448 			return (SET_ERROR(EINTR));
449 		dsp->dsa_pending_op = PENDING_NONE;
450 	}
451 	if (dsp->dsa_pending_op == PENDING_FREEOBJECTS) {
452 		/*
453 		 * See whether this free object array can be aggregated
454 		 * with pending one
455 		 */
456 		if (drrfo->drr_firstobj + drrfo->drr_numobjs == firstobj) {
457 			drrfo->drr_numobjs += numobjs;
458 			return (0);
459 		} else {
460 			/* can't be aggregated.  Push out pending record */
461 			if (dump_record(dsp, NULL, 0) != 0)
462 				return (SET_ERROR(EINTR));
463 			dsp->dsa_pending_op = PENDING_NONE;
464 		}
465 	}
466 
467 	/* write a FREEOBJECTS record */
468 	bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t));
469 	dsp->dsa_drr->drr_type = DRR_FREEOBJECTS;
470 	drrfo->drr_firstobj = firstobj;
471 	drrfo->drr_numobjs = numobjs;
472 	drrfo->drr_toguid = dsp->dsa_toguid;
473 
474 	dsp->dsa_pending_op = PENDING_FREEOBJECTS;
475 
476 	return (0);
477 }
478 
/*
 * Emit the records describing one dnode:
 *   - a FREEOBJECTS entry if the dnode is absent or unallocated; otherwise
 *   - a DRR_OBJECT record (with the bonus buffer as payload),
 *   - a DRR_FREE record covering everything past the object's last block,
 *   - optionally a DRR_SPILL record for an unmodified spill block.
 *
 *	bp	block pointer of the dnode block containing dnp; it is only
 *		consulted for raw sends
 *	object	object number of the dnode
 *	dnp	the on-disk dnode, or NULL
 *
 * Objects below the resume point are skipped.  Returns 0 on success or
 * EINTR if a record could not be written.
 */
static int
dump_dnode(dmu_sendarg_t *dsp, const blkptr_t *bp, uint64_t object,
    dnode_phys_t *dnp)
{
	struct drr_object *drro = &(dsp->dsa_drr->drr_u.drr_object);
	int bonuslen;

	if (object < dsp->dsa_resume_object) {
		/*
		 * Note: when resuming, we will visit all the dnodes in
		 * the block of dnodes that we are resuming from.  In
		 * this case it's unnecessary to send the dnodes prior to
		 * the one we are resuming from.  We should be at most one
		 * block's worth of dnodes behind the resume point.
		 */
		ASSERT3U(dsp->dsa_resume_object - object, <,
		    1 << (DNODE_BLOCK_SHIFT - DNODE_SHIFT));
		return (0);
	}

	/* An absent or unallocated dnode is reported as a freed object. */
	if (dnp == NULL || dnp->dn_type == DMU_OT_NONE)
		return (dump_freeobjects(dsp, object, 1));

	/* OBJECT records are never aggregated; flush whatever is pending. */
	if (dsp->dsa_pending_op != PENDING_NONE) {
		if (dump_record(dsp, NULL, 0) != 0)
			return (SET_ERROR(EINTR));
		dsp->dsa_pending_op = PENDING_NONE;
	}

	/* write an OBJECT record */
	bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t));
	dsp->dsa_drr->drr_type = DRR_OBJECT;
	drro->drr_object = object;
	drro->drr_type = dnp->dn_type;
	drro->drr_bonustype = dnp->dn_bonustype;
	drro->drr_blksz = dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT;
	drro->drr_bonuslen = dnp->dn_bonuslen;
	drro->drr_dn_slots = dnp->dn_extra_slots + 1;
	drro->drr_checksumtype = dnp->dn_checksum;
	drro->drr_compress = dnp->dn_compress;
	drro->drr_toguid = dsp->dsa_toguid;

	/* Without the large-blocks feature, cap the advertised block size. */
	if (!(dsp->dsa_featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS) &&
	    drro->drr_blksz > SPA_OLD_MAXBLOCKSIZE)
		drro->drr_blksz = SPA_OLD_MAXBLOCKSIZE;

	/* The bonus payload is padded to a multiple of 8 bytes. */
	bonuslen = P2ROUNDUP(dnp->dn_bonuslen, 8);

	if ((dsp->dsa_featureflags & DMU_BACKUP_FEATURE_RAW)) {
		ASSERT(BP_IS_ENCRYPTED(bp));

		if (BP_SHOULD_BYTESWAP(bp))
			drro->drr_flags |= DRR_RAW_BYTESWAP;

		/* needed for reconstructing dnp on recv side */
		drro->drr_maxblkid = dnp->dn_maxblkid;
		drro->drr_indblkshift = dnp->dn_indblkshift;
		drro->drr_nlevels = dnp->dn_nlevels;
		drro->drr_nblkptr = dnp->dn_nblkptr;

		/*
		 * Since we encrypt the entire bonus area, the (raw) part
		 * beyond the bonuslen is actually nonzero, so we need
		 * to send it.
		 */
		if (bonuslen != 0) {
			drro->drr_raw_bonuslen = DN_MAX_BONUS_LEN(dnp);
			bonuslen = drro->drr_raw_bonuslen;
		}
	}

	/*
	 * DRR_OBJECT_SPILL is set for every dnode which references a
	 * spill block.  This allows the receiving pool to definitively
	 * determine when a spill block should be kept or freed.
	 */
	if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR)
		drro->drr_flags |= DRR_OBJECT_SPILL;

	if (dump_record(dsp, DN_BONUS(dnp), bonuslen) != 0)
		return (SET_ERROR(EINTR));

	/* Free anything past the end of the file. */
	if (dump_free(dsp, object, (dnp->dn_maxblkid + 1) *
	    (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT), DMU_OBJECT_END) != 0)
		return (SET_ERROR(EINTR));

	/*
	 * Send DRR_SPILL records for unmodified spill blocks.  This is useful
	 * because changing certain attributes of the object (e.g. blocksize)
	 * can cause old versions of ZFS to incorrectly remove a spill block.
	 * Including these records in the stream forces an up to date version
	 * to always be written ensuring they're never lost.  Current versions
	 * of the code which understand the DRR_FLAG_SPILL_BLOCK feature can
	 * ignore these unmodified spill blocks.
	 */
	if (zfs_send_unmodified_spill_blocks &&
	    (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) &&
	    (DN_SPILL_BLKPTR(dnp)->blk_birth <= dsp->dsa_fromtxg)) {
		struct send_block_record record;

		/*
		 * Build a synthetic queue record for the spill block and run
		 * it through do_dump() as if the traversal had visited it.
		 */
		bzero(&record, sizeof (struct send_block_record));
		record.eos_marker = B_FALSE;
		record.bp = *DN_SPILL_BLKPTR(dnp);
		SET_BOOKMARK(&(record.zb), dmu_objset_id(dsp->dsa_os),
		    object, 0, DMU_SPILL_BLKID);

		if (do_dump(dsp, &record) != 0)
			return (SET_ERROR(EINTR));
	}

	/* dsa_err holds the result of the most recent stream write. */
	if (dsp->dsa_err != 0)
		return (SET_ERROR(EINTR));
	return (0);
}
594 
595 static int
596 dump_object_range(dmu_sendarg_t *dsp, const blkptr_t *bp, uint64_t firstobj,
597     uint64_t numslots)
598 {
599 	struct drr_object_range *drror =
600 	    &(dsp->dsa_drr->drr_u.drr_object_range);
601 
602 	/* we only use this record type for raw sends */
603 	ASSERT(BP_IS_PROTECTED(bp));
604 	ASSERT(dsp->dsa_featureflags & DMU_BACKUP_FEATURE_RAW);
605 	ASSERT3U(BP_GET_COMPRESS(bp), ==, ZIO_COMPRESS_OFF);
606 	ASSERT3U(BP_GET_TYPE(bp), ==, DMU_OT_DNODE);
607 	ASSERT0(BP_GET_LEVEL(bp));
608 
609 	if (dsp->dsa_pending_op != PENDING_NONE) {
610 		if (dump_record(dsp, NULL, 0) != 0)
611 			return (SET_ERROR(EINTR));
612 		dsp->dsa_pending_op = PENDING_NONE;
613 	}
614 
615 	bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t));
616 	dsp->dsa_drr->drr_type = DRR_OBJECT_RANGE;
617 	drror->drr_firstobj = firstobj;
618 	drror->drr_numslots = numslots;
619 	drror->drr_toguid = dsp->dsa_toguid;
620 	if (BP_SHOULD_BYTESWAP(bp))
621 		drror->drr_flags |= DRR_RAW_BYTESWAP;
622 	zio_crypt_decode_params_bp(bp, drror->drr_salt, drror->drr_iv);
623 	zio_crypt_decode_mac_bp(bp, drror->drr_mac);
624 
625 	if (dump_record(dsp, NULL, 0) != 0)
626 		return (SET_ERROR(EINTR));
627 	return (0);
628 }
629 
630 static boolean_t
631 backup_do_embed(dmu_sendarg_t *dsp, const blkptr_t *bp)
632 {
633 	if (!BP_IS_EMBEDDED(bp))
634 		return (B_FALSE);
635 
636 	/*
637 	 * Compression function must be legacy, or explicitly enabled.
638 	 */
639 	if ((BP_GET_COMPRESS(bp) >= ZIO_COMPRESS_LEGACY_FUNCTIONS &&
640 	    !(dsp->dsa_featureflags & DMU_BACKUP_FEATURE_LZ4)))
641 		return (B_FALSE);
642 
643 	/*
644 	 * Embed type must be explicitly enabled.
645 	 */
646 	switch (BPE_GET_ETYPE(bp)) {
647 	case BP_EMBEDDED_TYPE_DATA:
648 		if (dsp->dsa_featureflags & DMU_BACKUP_FEATURE_EMBED_DATA)
649 			return (B_TRUE);
650 		break;
651 	default:
652 		return (B_FALSE);
653 	}
654 	return (B_FALSE);
655 }
656 
657 /*
658  * This is the callback function to traverse_dataset that acts as the worker
659  * thread for dmu_send_impl.
660  */
661 /*ARGSUSED*/
662 static int
663 send_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
664     const zbookmark_phys_t *zb, const struct dnode_phys *dnp, void *arg)
665 {
666 	struct send_thread_arg *sta = arg;
667 	struct send_block_record *record;
668 	uint64_t record_size;
669 	int err = 0;
670 
671 	ASSERT(zb->zb_object == DMU_META_DNODE_OBJECT ||
672 	    zb->zb_object >= sta->resume.zb_object);
673 	ASSERT3P(sta->ds, !=, NULL);
674 
675 	if (sta->cancel)
676 		return (SET_ERROR(EINTR));
677 
678 	if (bp == NULL) {
679 		ASSERT3U(zb->zb_level, ==, ZB_DNODE_LEVEL);
680 		return (0);
681 	} else if (zb->zb_level < 0) {
682 		return (0);
683 	}
684 
685 	record = kmem_zalloc(sizeof (struct send_block_record), KM_SLEEP);
686 	record->eos_marker = B_FALSE;
687 	record->bp = *bp;
688 	record->zb = *zb;
689 	record->indblkshift = dnp->dn_indblkshift;
690 	record->datablkszsec = dnp->dn_datablkszsec;
691 	record_size = dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT;
692 	bqueue_enqueue(&sta->q, record, record_size);
693 
694 	return (err);
695 }
696 
697 /*
698  * This function kicks off the traverse_dataset.  It also handles setting the
699  * error code of the thread in case something goes wrong, and pushes the End of
700  * Stream record when the traverse_dataset call has finished.  If there is no
701  * dataset to traverse, the thread immediately pushes End of Stream marker.
702  */
703 static void
704 send_traverse_thread(void *arg)
705 {
706 	struct send_thread_arg *st_arg = arg;
707 	int err;
708 	struct send_block_record *data;
709 
710 	if (st_arg->ds != NULL) {
711 		err = traverse_dataset_resume(st_arg->ds,
712 		    st_arg->fromtxg, &st_arg->resume,
713 		    st_arg->flags, send_cb, st_arg);
714 
715 		if (err != EINTR)
716 			st_arg->error_code = err;
717 	}
718 	data = kmem_zalloc(sizeof (*data), KM_SLEEP);
719 	data->eos_marker = B_TRUE;
720 	bqueue_enqueue(&st_arg->q, data, 1);
721 	thread_exit();
722 }
723 
724 /*
725  * This function actually handles figuring out what kind of record needs to be
726  * dumped, reading the data (which has hopefully been prefetched), and calling
727  * the appropriate helper function.
728  */
729 static int
730 do_dump(dmu_sendarg_t *dsa, struct send_block_record *data)
731 {
732 	dsl_dataset_t *ds = dmu_objset_ds(dsa->dsa_os);
733 	const blkptr_t *bp = &data->bp;
734 	const zbookmark_phys_t *zb = &data->zb;
735 	uint8_t indblkshift = data->indblkshift;
736 	uint16_t dblkszsec = data->datablkszsec;
737 	spa_t *spa = ds->ds_dir->dd_pool->dp_spa;
738 	dmu_object_type_t type = bp ? BP_GET_TYPE(bp) : DMU_OT_NONE;
739 	int err = 0;
740 
741 	ASSERT3U(zb->zb_level, >=, 0);
742 
743 	ASSERT(zb->zb_object == DMU_META_DNODE_OBJECT ||
744 	    zb->zb_object >= dsa->dsa_resume_object);
745 
746 	/*
747 	 * All bps of an encrypted os should have the encryption bit set.
748 	 * If this is not true it indicates tampering and we report an error.
749 	 */
750 	if (dsa->dsa_os->os_encrypted &&
751 	    !BP_IS_HOLE(bp) && !BP_USES_CRYPT(bp)) {
752 		spa_log_error(spa, zb);
753 		zfs_panic_recover("unencrypted block in encrypted "
754 		    "object set %llu", ds->ds_object);
755 		return (SET_ERROR(EIO));
756 	}
757 
758 	if (zb->zb_object != DMU_META_DNODE_OBJECT &&
759 	    DMU_OBJECT_IS_SPECIAL(zb->zb_object)) {
760 		return (0);
761 	} else if (BP_IS_HOLE(bp) &&
762 	    zb->zb_object == DMU_META_DNODE_OBJECT) {
763 		uint64_t span = BP_SPAN(dblkszsec, indblkshift, zb->zb_level);
764 		uint64_t dnobj = (zb->zb_blkid * span) >> DNODE_SHIFT;
765 		err = dump_freeobjects(dsa, dnobj, span >> DNODE_SHIFT);
766 	} else if (BP_IS_HOLE(bp)) {
767 		uint64_t span = BP_SPAN(dblkszsec, indblkshift, zb->zb_level);
768 		uint64_t offset = zb->zb_blkid * span;
769 		/* Don't dump free records for offsets > DMU_OBJECT_END */
770 		if (zb->zb_blkid == 0 || span <= DMU_OBJECT_END / zb->zb_blkid)
771 			err = dump_free(dsa, zb->zb_object, offset, span);
772 	} else if (zb->zb_level > 0 || type == DMU_OT_OBJSET) {
773 		return (0);
774 	} else if (type == DMU_OT_DNODE) {
775 		int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT;
776 		arc_flags_t aflags = ARC_FLAG_WAIT;
777 		arc_buf_t *abuf;
778 		enum zio_flag zioflags = ZIO_FLAG_CANFAIL;
779 
780 		if (dsa->dsa_featureflags & DMU_BACKUP_FEATURE_RAW) {
781 			ASSERT(BP_IS_ENCRYPTED(bp));
782 			ASSERT3U(BP_GET_COMPRESS(bp), ==, ZIO_COMPRESS_OFF);
783 			zioflags |= ZIO_FLAG_RAW;
784 		}
785 
786 		ASSERT0(zb->zb_level);
787 
788 		if (arc_read(NULL, spa, bp, arc_getbuf_func, &abuf,
789 		    ZIO_PRIORITY_ASYNC_READ, zioflags, &aflags, zb) != 0)
790 			return (SET_ERROR(EIO));
791 
792 		dnode_phys_t *blk = abuf->b_data;
793 		uint64_t dnobj = zb->zb_blkid * epb;
794 
795 		/*
796 		 * Raw sends require sending encryption parameters for the
797 		 * block of dnodes. Regular sends do not need to send this
798 		 * info.
799 		 */
800 		if (dsa->dsa_featureflags & DMU_BACKUP_FEATURE_RAW) {
801 			ASSERT(arc_is_encrypted(abuf));
802 			err = dump_object_range(dsa, bp, dnobj, epb);
803 		}
804 
805 		if (err == 0) {
806 			for (int i = 0; i < epb;
807 			    i += blk[i].dn_extra_slots + 1) {
808 				err = dump_dnode(dsa, bp, dnobj + i, blk + i);
809 				if (err != 0)
810 					break;
811 			}
812 		}
813 		arc_buf_destroy(abuf, &abuf);
814 	} else if (type == DMU_OT_SA) {
815 		arc_flags_t aflags = ARC_FLAG_WAIT;
816 		arc_buf_t *abuf;
817 		enum zio_flag zioflags = ZIO_FLAG_CANFAIL;
818 
819 		if (dsa->dsa_featureflags & DMU_BACKUP_FEATURE_RAW) {
820 			ASSERT(BP_IS_PROTECTED(bp));
821 			zioflags |= ZIO_FLAG_RAW;
822 		}
823 
824 		if (arc_read(NULL, spa, bp, arc_getbuf_func, &abuf,
825 		    ZIO_PRIORITY_ASYNC_READ, zioflags, &aflags, zb) != 0)
826 			return (SET_ERROR(EIO));
827 
828 		err = dump_spill(dsa, bp, zb->zb_object, abuf->b_data);
829 		arc_buf_destroy(abuf, &abuf);
830 	} else if (backup_do_embed(dsa, bp)) {
831 		/* it's an embedded level-0 block of a regular object */
832 		int blksz = dblkszsec << SPA_MINBLOCKSHIFT;
833 		ASSERT0(zb->zb_level);
834 		err = dump_write_embedded(dsa, zb->zb_object,
835 		    zb->zb_blkid * blksz, blksz, bp);
836 	} else {
837 		/* it's a level-0 block of a regular object */
838 		arc_flags_t aflags = ARC_FLAG_WAIT;
839 		arc_buf_t *abuf;
840 		int blksz = dblkszsec << SPA_MINBLOCKSHIFT;
841 		uint64_t offset;
842 
843 		/*
844 		 * If we have large blocks stored on disk but the send flags
845 		 * don't allow us to send large blocks, we split the data from
846 		 * the arc buf into chunks.
847 		 */
848 		boolean_t split_large_blocks = blksz > SPA_OLD_MAXBLOCKSIZE &&
849 		    !(dsa->dsa_featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS);
850 
851 		/*
852 		 * Raw sends require that we always get raw data as it exists
853 		 * on disk, so we assert that we are not splitting blocks here.
854 		 */
855 		boolean_t request_raw =
856 		    (dsa->dsa_featureflags & DMU_BACKUP_FEATURE_RAW) != 0;
857 
858 		/*
859 		 * We should only request compressed data from the ARC if all
860 		 * the following are true:
861 		 *  - stream compression was requested
862 		 *  - we aren't splitting large blocks into smaller chunks
863 		 *  - the data won't need to be byteswapped before sending
864 		 *  - this isn't an embedded block
865 		 *  - this isn't metadata (if receiving on a different endian
866 		 *    system it can be byteswapped more easily)
867 		 */
868 		boolean_t request_compressed =
869 		    (dsa->dsa_featureflags & DMU_BACKUP_FEATURE_COMPRESSED) &&
870 		    !split_large_blocks && !BP_SHOULD_BYTESWAP(bp) &&
871 		    !BP_IS_EMBEDDED(bp) && !DMU_OT_IS_METADATA(BP_GET_TYPE(bp));
872 
873 		IMPLY(request_raw, !split_large_blocks);
874 		IMPLY(request_raw, BP_IS_PROTECTED(bp));
875 		ASSERT0(zb->zb_level);
876 		ASSERT(zb->zb_object > dsa->dsa_resume_object ||
877 		    (zb->zb_object == dsa->dsa_resume_object &&
878 		    zb->zb_blkid * blksz >= dsa->dsa_resume_offset));
879 
880 		ASSERT0(zb->zb_level);
881 		ASSERT(zb->zb_object > dsa->dsa_resume_object ||
882 		    (zb->zb_object == dsa->dsa_resume_object &&
883 		    zb->zb_blkid * blksz >= dsa->dsa_resume_offset));
884 
885 		ASSERT3U(blksz, ==, BP_GET_LSIZE(bp));
886 
887 		enum zio_flag zioflags = ZIO_FLAG_CANFAIL;
888 		if (request_raw)
889 			zioflags |= ZIO_FLAG_RAW;
890 		else if (request_compressed)
891 			zioflags |= ZIO_FLAG_RAW_COMPRESS;
892 
893 		if (arc_read(NULL, spa, bp, arc_getbuf_func, &abuf,
894 		    ZIO_PRIORITY_ASYNC_READ, zioflags, &aflags, zb) != 0) {
895 			if (zfs_send_corrupt_data) {
896 				/* Send a block filled with 0x"zfs badd bloc" */
897 				abuf = arc_alloc_buf(spa, &abuf, ARC_BUFC_DATA,
898 				    blksz);
899 				uint64_t *ptr;
900 				for (ptr = abuf->b_data;
901 				    (char *)ptr < (char *)abuf->b_data + blksz;
902 				    ptr++)
903 					*ptr = 0x2f5baddb10cULL;
904 			} else {
905 				return (SET_ERROR(EIO));
906 			}
907 		}
908 
909 		offset = zb->zb_blkid * blksz;
910 
911 		if (split_large_blocks) {
912 			ASSERT0(arc_is_encrypted(abuf));
913 			ASSERT3U(arc_get_compression(abuf), ==,
914 			    ZIO_COMPRESS_OFF);
915 			char *buf = abuf->b_data;
916 			while (blksz > 0 && err == 0) {
917 				int n = MIN(blksz, SPA_OLD_MAXBLOCKSIZE);
918 				err = dump_write(dsa, type, zb->zb_object,
919 				    offset, n, n, NULL, buf);
920 				offset += n;
921 				buf += n;
922 				blksz -= n;
923 			}
924 		} else {
925 			err = dump_write(dsa, type, zb->zb_object, offset,
926 			    blksz, arc_buf_size(abuf), bp, abuf->b_data);
927 		}
928 		arc_buf_destroy(abuf, &abuf);
929 	}
930 
931 	ASSERT(err == 0 || err == EINTR);
932 	return (err);
933 }
934 
935 /*
936  * Pop the new data off the queue, and free the old data.
937  */
938 static struct send_block_record *
939 get_next_record(bqueue_t *bq, struct send_block_record *data)
940 {
941 	struct send_block_record *tmp = bqueue_dequeue(bq);
942 	kmem_free(data, sizeof (*data));
943 	return (tmp);
944 }
945 
946 /*
947  * Actually do the bulk of the work in a zfs send.
948  *
949  * Note: Releases dp using the specified tag.
950  */
951 static int
952 dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *to_ds,
953     zfs_bookmark_phys_t *ancestor_zb, boolean_t is_clone,
954     boolean_t embedok, boolean_t large_block_ok, boolean_t compressok,
955     boolean_t rawok, int outfd, uint64_t resumeobj, uint64_t resumeoff,
956     vnode_t *vp, offset_t *off)
957 {
958 	objset_t *os;
959 	dmu_replay_record_t *drr;
960 	dmu_sendarg_t *dsp;
961 	int err;
962 	uint64_t fromtxg = 0;
963 	uint64_t featureflags = 0;
964 	struct send_thread_arg to_arg = { 0 };
965 
966 	err = dmu_objset_from_ds(to_ds, &os);
967 	if (err != 0) {
968 		dsl_pool_rele(dp, tag);
969 		return (err);
970 	}
971 
972 	/*
973 	 * If this is a non-raw send of an encrypted ds, we can ensure that
974 	 * the objset_phys_t is authenticated. This is safe because this is
975 	 * either a snapshot or we have owned the dataset, ensuring that
976 	 * it can't be modified.
977 	 */
978 	if (!rawok && os->os_encrypted &&
979 	    arc_is_unauthenticated(os->os_phys_buf)) {
980 		zbookmark_phys_t zb;
981 
982 		SET_BOOKMARK(&zb, to_ds->ds_object, ZB_ROOT_OBJECT,
983 		    ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
984 		err = arc_untransform(os->os_phys_buf, os->os_spa,
985 		    &zb, B_FALSE);
986 		if (err != 0) {
987 			dsl_pool_rele(dp, tag);
988 			return (err);
989 		}
990 
991 		ASSERT0(arc_is_unauthenticated(os->os_phys_buf));
992 	}
993 
994 	drr = kmem_zalloc(sizeof (dmu_replay_record_t), KM_SLEEP);
995 	drr->drr_type = DRR_BEGIN;
996 	drr->drr_u.drr_begin.drr_magic = DMU_BACKUP_MAGIC;
997 	DMU_SET_STREAM_HDRTYPE(drr->drr_u.drr_begin.drr_versioninfo,
998 	    DMU_SUBSTREAM);
999 
1000 #ifdef _KERNEL
1001 	if (dmu_objset_type(os) == DMU_OST_ZFS) {
1002 		uint64_t version;
1003 		if (zfs_get_zplprop(os, ZFS_PROP_VERSION, &version) != 0) {
1004 			kmem_free(drr, sizeof (dmu_replay_record_t));
1005 			dsl_pool_rele(dp, tag);
1006 			return (SET_ERROR(EINVAL));
1007 		}
1008 		if (version >= ZPL_VERSION_SA) {
1009 			featureflags |= DMU_BACKUP_FEATURE_SA_SPILL;
1010 		}
1011 	}
1012 #endif
1013 
1014 	/* raw sends imply large_block_ok */
1015 	if ((large_block_ok || rawok) &&
1016 	    to_ds->ds_feature_inuse[SPA_FEATURE_LARGE_BLOCKS])
1017 		featureflags |= DMU_BACKUP_FEATURE_LARGE_BLOCKS;
1018 	if (to_ds->ds_feature_inuse[SPA_FEATURE_LARGE_DNODE])
1019 		featureflags |= DMU_BACKUP_FEATURE_LARGE_DNODE;
1020 
1021 	/* encrypted datasets will not have embedded blocks */
1022 	if ((embedok || rawok) && !os->os_encrypted &&
1023 	    spa_feature_is_active(dp->dp_spa, SPA_FEATURE_EMBEDDED_DATA)) {
1024 		featureflags |= DMU_BACKUP_FEATURE_EMBED_DATA;
1025 	}
1026 
1027 	/* raw send implies compressok */
1028 	if (compressok || rawok)
1029 		featureflags |= DMU_BACKUP_FEATURE_COMPRESSED;
1030 	if (rawok && os->os_encrypted)
1031 		featureflags |= DMU_BACKUP_FEATURE_RAW;
1032 
1033 	if ((featureflags &
1034 	    (DMU_BACKUP_FEATURE_EMBED_DATA | DMU_BACKUP_FEATURE_COMPRESSED |
1035 	    DMU_BACKUP_FEATURE_RAW)) != 0 &&
1036 	    spa_feature_is_active(dp->dp_spa, SPA_FEATURE_LZ4_COMPRESS)) {
1037 		featureflags |= DMU_BACKUP_FEATURE_LZ4;
1038 	}
1039 
1040 	if (resumeobj != 0 || resumeoff != 0) {
1041 		featureflags |= DMU_BACKUP_FEATURE_RESUMING;
1042 	}
1043 
1044 	DMU_SET_FEATUREFLAGS(drr->drr_u.drr_begin.drr_versioninfo,
1045 	    featureflags);
1046 
1047 	drr->drr_u.drr_begin.drr_creation_time =
1048 	    dsl_dataset_phys(to_ds)->ds_creation_time;
1049 	drr->drr_u.drr_begin.drr_type = dmu_objset_type(os);
1050 	if (is_clone)
1051 		drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_CLONE;
1052 	drr->drr_u.drr_begin.drr_toguid = dsl_dataset_phys(to_ds)->ds_guid;
1053 	if (dsl_dataset_phys(to_ds)->ds_flags & DS_FLAG_CI_DATASET)
1054 		drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_CI_DATA;
1055 	if (zfs_send_set_freerecords_bit)
1056 		drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_FREERECORDS;
1057 
1058 	drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_SPILL_BLOCK;
1059 
1060 	if (ancestor_zb != NULL) {
1061 		drr->drr_u.drr_begin.drr_fromguid =
1062 		    ancestor_zb->zbm_guid;
1063 		fromtxg = ancestor_zb->zbm_creation_txg;
1064 	}
1065 	dsl_dataset_name(to_ds, drr->drr_u.drr_begin.drr_toname);
1066 	if (!to_ds->ds_is_snapshot) {
1067 		(void) strlcat(drr->drr_u.drr_begin.drr_toname, "@--head--",
1068 		    sizeof (drr->drr_u.drr_begin.drr_toname));
1069 	}
1070 
1071 	dsp = kmem_zalloc(sizeof (dmu_sendarg_t), KM_SLEEP);
1072 
1073 	dsp->dsa_drr = drr;
1074 	dsp->dsa_vp = vp;
1075 	dsp->dsa_outfd = outfd;
1076 	dsp->dsa_proc = curproc;
1077 	dsp->dsa_os = os;
1078 	dsp->dsa_off = off;
1079 	dsp->dsa_toguid = dsl_dataset_phys(to_ds)->ds_guid;
1080 	dsp->dsa_fromtxg = fromtxg;
1081 	dsp->dsa_pending_op = PENDING_NONE;
1082 	dsp->dsa_featureflags = featureflags;
1083 	dsp->dsa_resume_object = resumeobj;
1084 	dsp->dsa_resume_offset = resumeoff;
1085 
1086 	mutex_enter(&to_ds->ds_sendstream_lock);
1087 	list_insert_head(&to_ds->ds_sendstreams, dsp);
1088 	mutex_exit(&to_ds->ds_sendstream_lock);
1089 
1090 	dsl_dataset_long_hold(to_ds, FTAG);
1091 	dsl_pool_rele(dp, tag);
1092 
1093 	void *payload = NULL;
1094 	size_t payload_len = 0;
1095 	/* handle features that require a DRR_BEGIN payload */
1096 	if (featureflags &
1097 	    (DMU_BACKUP_FEATURE_RESUMING | DMU_BACKUP_FEATURE_RAW)) {
1098 		nvlist_t *keynvl = NULL;
1099 		nvlist_t *nvl = fnvlist_alloc();
1100 
1101 		if (featureflags & DMU_BACKUP_FEATURE_RESUMING) {
1102 			dmu_object_info_t to_doi;
1103 			err = dmu_object_info(os, resumeobj, &to_doi);
1104 			if (err != 0) {
1105 				fnvlist_free(nvl);
1106 				goto out;
1107 			}
1108 
1109 			SET_BOOKMARK(&to_arg.resume, to_ds->ds_object,
1110 			    resumeobj, 0,
1111 			    resumeoff / to_doi.doi_data_block_size);
1112 
1113 			fnvlist_add_uint64(nvl, "resume_object", resumeobj);
1114 			fnvlist_add_uint64(nvl, "resume_offset", resumeoff);
1115 		}
1116 
1117 		if (featureflags & DMU_BACKUP_FEATURE_RAW) {
1118 			uint64_t ivset_guid = (ancestor_zb != NULL) ?
1119 			    ancestor_zb->zbm_ivset_guid : 0;
1120 
1121 			ASSERT(os->os_encrypted);
1122 
1123 			err = dsl_crypto_populate_key_nvlist(to_ds,
1124 			    ivset_guid, &keynvl);
1125 			if (err != 0) {
1126 				fnvlist_free(nvl);
1127 				goto out;
1128 			}
1129 
1130 			fnvlist_add_nvlist(nvl, "crypt_keydata", keynvl);
1131 		}
1132 
1133 		payload = fnvlist_pack(nvl, &payload_len);
1134 		drr->drr_payloadlen = payload_len;
1135 		fnvlist_free(keynvl);
1136 		fnvlist_free(nvl);
1137 	}
1138 
1139 	err = dump_record(dsp, payload, payload_len);
1140 	fnvlist_pack_free(payload, payload_len);
1141 	if (err != 0) {
1142 		err = dsp->dsa_err;
1143 		goto out;
1144 	}
1145 
1146 	err = bqueue_init(&to_arg.q, zfs_send_queue_length,
1147 	    offsetof(struct send_block_record, ln));
1148 	to_arg.error_code = 0;
1149 	to_arg.cancel = B_FALSE;
1150 	to_arg.ds = to_ds;
1151 	to_arg.fromtxg = fromtxg;
1152 	to_arg.flags = TRAVERSE_PRE | TRAVERSE_PREFETCH;
1153 	if (rawok)
1154 		to_arg.flags |= TRAVERSE_NO_DECRYPT;
1155 	(void) thread_create(NULL, 0, send_traverse_thread, &to_arg, 0, curproc,
1156 	    TS_RUN, minclsyspri);
1157 
1158 	struct send_block_record *to_data;
1159 	to_data = bqueue_dequeue(&to_arg.q);
1160 
1161 	while (!to_data->eos_marker && err == 0) {
1162 		err = do_dump(dsp, to_data);
1163 		to_data = get_next_record(&to_arg.q, to_data);
1164 		if (issig(JUSTLOOKING) && issig(FORREAL))
1165 			err = EINTR;
1166 	}
1167 
1168 	if (err != 0) {
1169 		to_arg.cancel = B_TRUE;
1170 		while (!to_data->eos_marker) {
1171 			to_data = get_next_record(&to_arg.q, to_data);
1172 		}
1173 	}
1174 	kmem_free(to_data, sizeof (*to_data));
1175 
1176 	bqueue_destroy(&to_arg.q);
1177 
1178 	if (err == 0 && to_arg.error_code != 0)
1179 		err = to_arg.error_code;
1180 
1181 	if (err != 0)
1182 		goto out;
1183 
1184 	if (dsp->dsa_pending_op != PENDING_NONE)
1185 		if (dump_record(dsp, NULL, 0) != 0)
1186 			err = SET_ERROR(EINTR);
1187 
1188 	if (err != 0) {
1189 		if (err == EINTR && dsp->dsa_err != 0)
1190 			err = dsp->dsa_err;
1191 		goto out;
1192 	}
1193 
1194 	bzero(drr, sizeof (dmu_replay_record_t));
1195 	drr->drr_type = DRR_END;
1196 	drr->drr_u.drr_end.drr_checksum = dsp->dsa_zc;
1197 	drr->drr_u.drr_end.drr_toguid = dsp->dsa_toguid;
1198 
1199 	if (dump_record(dsp, NULL, 0) != 0)
1200 		err = dsp->dsa_err;
1201 out:
1202 	mutex_enter(&to_ds->ds_sendstream_lock);
1203 	list_remove(&to_ds->ds_sendstreams, dsp);
1204 	mutex_exit(&to_ds->ds_sendstream_lock);
1205 
1206 	VERIFY(err != 0 || (dsp->dsa_sent_begin && dsp->dsa_sent_end));
1207 
1208 	kmem_free(drr, sizeof (dmu_replay_record_t));
1209 	kmem_free(dsp, sizeof (dmu_sendarg_t));
1210 
1211 	dsl_dataset_long_rele(to_ds, FTAG);
1212 
1213 	return (err);
1214 }
1215 
1216 int
1217 dmu_send_obj(const char *pool, uint64_t tosnap, uint64_t fromsnap,
1218     boolean_t embedok, boolean_t large_block_ok, boolean_t compressok,
1219     boolean_t rawok, int outfd, vnode_t *vp, offset_t *off)
1220 {
1221 	dsl_pool_t *dp;
1222 	dsl_dataset_t *ds;
1223 	dsl_dataset_t *fromds = NULL;
1224 	ds_hold_flags_t dsflags = (rawok) ? 0 : DS_HOLD_FLAG_DECRYPT;
1225 	int err;
1226 
1227 	err = dsl_pool_hold(pool, FTAG, &dp);
1228 	if (err != 0)
1229 		return (err);
1230 
1231 	err = dsl_dataset_hold_obj_flags(dp, tosnap, dsflags, FTAG, &ds);
1232 	if (err != 0) {
1233 		dsl_pool_rele(dp, FTAG);
1234 		return (err);
1235 	}
1236 
1237 	if (fromsnap != 0) {
1238 		zfs_bookmark_phys_t zb = { 0 };
1239 		boolean_t is_clone;
1240 
1241 		err = dsl_dataset_hold_obj(dp, fromsnap, FTAG, &fromds);
1242 		if (err != 0) {
1243 			dsl_dataset_rele_flags(ds, dsflags, FTAG);
1244 			dsl_pool_rele(dp, FTAG);
1245 			return (err);
1246 		}
1247 		if (!dsl_dataset_is_before(ds, fromds, 0)) {
1248 			err = SET_ERROR(EXDEV);
1249 			dsl_dataset_rele(fromds, FTAG);
1250 			dsl_dataset_rele_flags(ds, dsflags, FTAG);
1251 			dsl_pool_rele(dp, FTAG);
1252 			return (err);
1253 		}
1254 
1255 		zb.zbm_creation_time =
1256 		    dsl_dataset_phys(fromds)->ds_creation_time;
1257 		zb.zbm_creation_txg = dsl_dataset_phys(fromds)->ds_creation_txg;
1258 		zb.zbm_guid = dsl_dataset_phys(fromds)->ds_guid;
1259 
1260 		if (dsl_dataset_is_zapified(fromds)) {
1261 			(void) zap_lookup(dp->dp_meta_objset,
1262 			    fromds->ds_object, DS_FIELD_IVSET_GUID, 8, 1,
1263 			    &zb.zbm_ivset_guid);
1264 		}
1265 
1266 		is_clone = (fromds->ds_dir != ds->ds_dir);
1267 		dsl_dataset_rele(fromds, FTAG);
1268 		err = dmu_send_impl(FTAG, dp, ds, &zb, is_clone,
1269 		    embedok, large_block_ok, compressok, rawok, outfd,
1270 		    0, 0, vp, off);
1271 	} else {
1272 		err = dmu_send_impl(FTAG, dp, ds, NULL, B_FALSE,
1273 		    embedok, large_block_ok, compressok, rawok, outfd,
1274 		    0, 0, vp, off);
1275 	}
1276 	dsl_dataset_rele_flags(ds, dsflags, FTAG);
1277 	return (err);
1278 }
1279 
1280 int
1281 dmu_send(const char *tosnap, const char *fromsnap, boolean_t embedok,
1282     boolean_t large_block_ok, boolean_t compressok, boolean_t rawok,
1283     int outfd, uint64_t resumeobj, uint64_t resumeoff, vnode_t *vp,
1284     offset_t *off)
1285 {
1286 	dsl_pool_t *dp;
1287 	dsl_dataset_t *ds;
1288 	int err;
1289 	ds_hold_flags_t dsflags = (rawok) ? 0 : DS_HOLD_FLAG_DECRYPT;
1290 	boolean_t owned = B_FALSE;
1291 
1292 	if (fromsnap != NULL && strpbrk(fromsnap, "@#") == NULL)
1293 		return (SET_ERROR(EINVAL));
1294 
1295 	err = dsl_pool_hold(tosnap, FTAG, &dp);
1296 	if (err != 0)
1297 		return (err);
1298 
1299 	if (strchr(tosnap, '@') == NULL && spa_writeable(dp->dp_spa)) {
1300 		/*
1301 		 * We are sending a filesystem or volume.  Ensure
1302 		 * that it doesn't change by owning the dataset.
1303 		 */
1304 		err = dsl_dataset_own(dp, tosnap, dsflags, FTAG, &ds);
1305 		owned = B_TRUE;
1306 	} else {
1307 		err = dsl_dataset_hold_flags(dp, tosnap, dsflags, FTAG, &ds);
1308 	}
1309 	if (err != 0) {
1310 		dsl_pool_rele(dp, FTAG);
1311 		return (err);
1312 	}
1313 
1314 	if (fromsnap != NULL) {
1315 		zfs_bookmark_phys_t zb = { 0 };
1316 		boolean_t is_clone = B_FALSE;
1317 		int fsnamelen = strchr(tosnap, '@') - tosnap;
1318 
1319 		/*
1320 		 * If the fromsnap is in a different filesystem, then
1321 		 * mark the send stream as a clone.
1322 		 */
1323 		if (strncmp(tosnap, fromsnap, fsnamelen) != 0 ||
1324 		    (fromsnap[fsnamelen] != '@' &&
1325 		    fromsnap[fsnamelen] != '#')) {
1326 			is_clone = B_TRUE;
1327 		}
1328 
1329 		if (strchr(fromsnap, '@')) {
1330 			dsl_dataset_t *fromds;
1331 			err = dsl_dataset_hold(dp, fromsnap, FTAG, &fromds);
1332 			if (err == 0) {
1333 				if (!dsl_dataset_is_before(ds, fromds, 0))
1334 					err = SET_ERROR(EXDEV);
1335 				zb.zbm_creation_time =
1336 				    dsl_dataset_phys(fromds)->ds_creation_time;
1337 				zb.zbm_creation_txg =
1338 				    dsl_dataset_phys(fromds)->ds_creation_txg;
1339 				zb.zbm_guid = dsl_dataset_phys(fromds)->ds_guid;
1340 				is_clone = (ds->ds_dir != fromds->ds_dir);
1341 
1342 				if (dsl_dataset_is_zapified(fromds)) {
1343 					(void) zap_lookup(dp->dp_meta_objset,
1344 					    fromds->ds_object,
1345 					    DS_FIELD_IVSET_GUID, 8, 1,
1346 					    &zb.zbm_ivset_guid);
1347 				}
1348 				dsl_dataset_rele(fromds, FTAG);
1349 			}
1350 		} else {
1351 			err = dsl_bookmark_lookup(dp, fromsnap, ds, &zb);
1352 		}
1353 		if (err != 0) {
1354 			if (owned)
1355 				dsl_dataset_disown(ds, dsflags, FTAG);
1356 			else
1357 				dsl_dataset_rele_flags(ds, dsflags, FTAG);
1358 
1359 			dsl_pool_rele(dp, FTAG);
1360 			return (err);
1361 		}
1362 		err = dmu_send_impl(FTAG, dp, ds, &zb, is_clone,
1363 		    embedok, large_block_ok, compressok, rawok,
1364 		    outfd, resumeobj, resumeoff, vp, off);
1365 	} else {
1366 		err = dmu_send_impl(FTAG, dp, ds, NULL, B_FALSE,
1367 		    embedok, large_block_ok, compressok, rawok,
1368 		    outfd, resumeobj, resumeoff, vp, off);
1369 	}
1370 	if (owned)
1371 		dsl_dataset_disown(ds, dsflags, FTAG);
1372 	else
1373 		dsl_dataset_rele_flags(ds, dsflags, FTAG);
1374 
1375 	return (err);
1376 }
1377 
1378 static int
1379 dmu_adjust_send_estimate_for_indirects(dsl_dataset_t *ds, uint64_t uncompressed,
1380     uint64_t compressed, boolean_t stream_compressed, uint64_t *sizep)
1381 {
1382 	int err = 0;
1383 	uint64_t size;
1384 	/*
1385 	 * Assume that space (both on-disk and in-stream) is dominated by
1386 	 * data.  We will adjust for indirect blocks and the copies property,
1387 	 * but ignore per-object space used (eg, dnodes and DRR_OBJECT records).
1388 	 */
1389 	uint64_t recordsize;
1390 	uint64_t record_count;
1391 	objset_t *os;
1392 	VERIFY0(dmu_objset_from_ds(ds, &os));
1393 
1394 	/* Assume all (uncompressed) blocks are recordsize. */
1395 	if (zfs_override_estimate_recordsize != 0) {
1396 		recordsize = zfs_override_estimate_recordsize;
1397 	} else if (os->os_phys->os_type == DMU_OST_ZVOL) {
1398 		err = dsl_prop_get_int_ds(ds,
1399 		    zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), &recordsize);
1400 	} else {
1401 		err = dsl_prop_get_int_ds(ds,
1402 		    zfs_prop_to_name(ZFS_PROP_RECORDSIZE), &recordsize);
1403 	}
1404 	if (err != 0)
1405 		return (err);
1406 	record_count = uncompressed / recordsize;
1407 
1408 	/*
1409 	 * If we're estimating a send size for a compressed stream, use the
1410 	 * compressed data size to estimate the stream size. Otherwise, use the
1411 	 * uncompressed data size.
1412 	 */
1413 	size = stream_compressed ? compressed : uncompressed;
1414 
1415 	/*
1416 	 * Subtract out approximate space used by indirect blocks.
1417 	 * Assume most space is used by data blocks (non-indirect, non-dnode).
1418 	 * Assume no ditto blocks or internal fragmentation.
1419 	 *
1420 	 * Therefore, space used by indirect blocks is sizeof(blkptr_t) per
1421 	 * block.
1422 	 */
1423 	size -= record_count * sizeof (blkptr_t);
1424 
1425 	/* Add in the space for the record associated with each block. */
1426 	size += record_count * sizeof (dmu_replay_record_t);
1427 
1428 	*sizep = size;
1429 
1430 	return (0);
1431 }
1432 
1433 int
1434 dmu_send_estimate(dsl_dataset_t *ds, dsl_dataset_t *fromds,
1435     boolean_t stream_compressed, uint64_t *sizep)
1436 {
1437 	dsl_pool_t *dp = ds->ds_dir->dd_pool;
1438 	int err;
1439 	uint64_t uncomp, comp;
1440 
1441 	ASSERT(dsl_pool_config_held(dp));
1442 
1443 	/* tosnap must be a snapshot */
1444 	if (!ds->ds_is_snapshot)
1445 		return (SET_ERROR(EINVAL));
1446 
1447 	/* fromsnap, if provided, must be a snapshot */
1448 	if (fromds != NULL && !fromds->ds_is_snapshot)
1449 		return (SET_ERROR(EINVAL));
1450 
1451 	/*
1452 	 * fromsnap must be an earlier snapshot from the same fs as tosnap,
1453 	 * or the origin's fs.
1454 	 */
1455 	if (fromds != NULL && !dsl_dataset_is_before(ds, fromds, 0))
1456 		return (SET_ERROR(EXDEV));
1457 
1458 	/* Get compressed and uncompressed size estimates of changed data. */
1459 	if (fromds == NULL) {
1460 		uncomp = dsl_dataset_phys(ds)->ds_uncompressed_bytes;
1461 		comp = dsl_dataset_phys(ds)->ds_compressed_bytes;
1462 	} else {
1463 		uint64_t used;
1464 		err = dsl_dataset_space_written(fromds, ds,
1465 		    &used, &comp, &uncomp);
1466 		if (err != 0)
1467 			return (err);
1468 	}
1469 
1470 	err = dmu_adjust_send_estimate_for_indirects(ds, uncomp, comp,
1471 	    stream_compressed, sizep);
1472 	/*
1473 	 * Add the size of the BEGIN and END records to the estimate.
1474 	 */
1475 	*sizep += 2 * sizeof (dmu_replay_record_t);
1476 	return (err);
1477 }
1478 
/*
 * Running totals filled in by dmu_calculate_send_traversal() as it visits
 * the non-hole block pointers of a dataset.
 */
struct calculate_send_arg {
	uint64_t uncompressed;	/* sum of BP_GET_UCSIZE() of visited bps */
	uint64_t compressed;	/* sum of BP_GET_PSIZE() of visited bps */
};
1483 
1484 /*
1485  * Simple callback used to traverse the blocks of a snapshot and sum their
1486  * uncompressed and compressed sizes.
1487  */
1488 /* ARGSUSED */
1489 static int
1490 dmu_calculate_send_traversal(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
1491     const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
1492 {
1493 	struct calculate_send_arg *space = arg;
1494 	if (bp != NULL && !BP_IS_HOLE(bp)) {
1495 		space->uncompressed += BP_GET_UCSIZE(bp);
1496 		space->compressed += BP_GET_PSIZE(bp);
1497 	}
1498 	return (0);
1499 }
1500 
1501 /*
1502  * Given a desination snapshot and a TXG, calculate the approximate size of a
1503  * send stream sent from that TXG. from_txg may be zero, indicating that the
1504  * whole snapshot will be sent.
1505  */
1506 int
1507 dmu_send_estimate_from_txg(dsl_dataset_t *ds, uint64_t from_txg,
1508     boolean_t stream_compressed, uint64_t *sizep)
1509 {
1510 	dsl_pool_t *dp = ds->ds_dir->dd_pool;
1511 	int err;
1512 	struct calculate_send_arg size = { 0 };
1513 
1514 	ASSERT(dsl_pool_config_held(dp));
1515 
1516 	/* tosnap must be a snapshot */
1517 	if (!ds->ds_is_snapshot)
1518 		return (SET_ERROR(EINVAL));
1519 
1520 	/* verify that from_txg is before the provided snapshot was taken */
1521 	if (from_txg >= dsl_dataset_phys(ds)->ds_creation_txg) {
1522 		return (SET_ERROR(EXDEV));
1523 	}
1524 
1525 	/*
1526 	 * traverse the blocks of the snapshot with birth times after
1527 	 * from_txg, summing their uncompressed size
1528 	 */
1529 	err = traverse_dataset(ds, from_txg,
1530 	    TRAVERSE_POST | TRAVERSE_NO_DECRYPT,
1531 	    dmu_calculate_send_traversal, &size);
1532 	if (err)
1533 		return (err);
1534 
1535 	err = dmu_adjust_send_estimate_for_indirects(ds, size.uncompressed,
1536 	    size.compressed, stream_compressed, sizep);
1537 	return (err);
1538 }
1539