xref: /titanic_41/usr/src/lib/libzfs/common/libzfs_sendrecv.c (revision 98c507c4288789fc67365c4cb51f80eb641e7182)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #include <assert.h>
28 #include <ctype.h>
29 #include <errno.h>
30 #include <libintl.h>
31 #include <stdio.h>
32 #include <stdlib.h>
33 #include <strings.h>
34 #include <unistd.h>
35 #include <stddef.h>
36 #include <fcntl.h>
37 #include <sys/mount.h>
38 #include <pthread.h>
39 #include <umem.h>
40 
41 #include <libzfs.h>
42 
43 #include "zfs_namecheck.h"
44 #include "zfs_prop.h"
45 #include "zfs_fletcher.h"
46 #include "libzfs_impl.h"
47 #include <sha2.h>
48 
49 static int zfs_receive_impl(libzfs_handle_t *, const char *, recvflags_t,
50     int, avl_tree_t *, char **);
51 
52 static const zio_cksum_t zero_cksum = { 0 };
53 
/*
 * Argument block handed to the cksummer() thread.
 */
typedef struct dedup_arg {
	/* fd that cksummer() wraps with fdopen() and reads records from */
	int	inputfd;
	/* fd that cksummer() writes the processed stream to */
	int	outputfd;
	/* libzfs handle that cksummer() passes on to ddt_update() */
	libzfs_handle_t  *dedup_hdl;
} dedup_arg_t;
59 
/*
 * Location of a data block previously seen in the stream.  cksummer()
 * fills these fields from a DRR_WRITE record (drr_toguid, drr_object and
 * drr_offset respectively) and copies them into the drr_ref* fields of a
 * DRR_WRITE_BYREF record when a duplicate is found.
 */
typedef struct dataref {
	uint64_t ref_guid;
	uint64_t ref_object;
	uint64_t ref_offset;
} dataref_t;
65 
/*
 * One entry of the dedup table: a block checksum and the reference to
 * the block that carried it.  Entries that land in the same hash bucket
 * are chained through dde_next.
 */
typedef struct dedup_entry {
	struct dedup_entry	*dde_next;	/* next entry in this bucket */
	zio_cksum_t dde_chksum;			/* checksum used as the key */
	dataref_t dde_ref;			/* where the block was seen */
} dedup_entry_t;
71 
72 #define	MAX_DDT_PHYSMEM_PERCENT		20
73 #define	SMALLEST_POSSIBLE_MAX_DDT_MB		128
74 
/*
 * In-memory dedup table used by cksummer(): an array of hash buckets,
 * each heading a singly linked chain of dedup_entry_t.
 */
typedef struct dedup_table {
	/* bucket array, indexed by the low numhashbits bits of the checksum */
	dedup_entry_t	**dedup_hash_array;
	/* umem cache that dedup_entry_t objects are allocated from */
	umem_cache_t	*ddecache;
	uint64_t	max_ddt_size;  /* max dedup table size in bytes */
	uint64_t	cur_ddt_size;  /* current dedup table size in bytes */
	/* incremented by ddt_hash_append() for every entry added */
	uint64_t	ddt_count;
	/* number of checksum bits used to select a bucket */
	int		numhashbits;
	/* set once the "table full" condition has been reported */
	boolean_t	ddt_full;
} dedup_table_t;
84 
/*
 * Return the 1-based position of the most significant set bit of n,
 * i.e. the number of right shifts needed to reduce n to zero.
 * Returns 0 when n is 0.
 */
static int
high_order_bit(uint64_t n)
{
	int nbits = 0;

	while (n != 0) {
		n >>= 1;
		nbits++;
	}
	return (nbits);
}
94 
/*
 * Read a single item of len bytes from stream into buf.
 *
 * Returns the fread() item count: 1 when the whole item was read, 0 when
 * it could not be read in full (EOF or error).
 */
static size_t
ssread(void *buf, size_t len, FILE *stream)
{
	return (fread(buf, len, 1, stream));
}
105 
106 static void
107 ddt_hash_append(libzfs_handle_t *hdl, dedup_table_t *ddt, dedup_entry_t **ddepp,
108     zio_cksum_t *cs, dataref_t *dr)
109 {
110 	dedup_entry_t	*dde;
111 
112 	if (ddt->cur_ddt_size >= ddt->max_ddt_size) {
113 		if (ddt->ddt_full == B_FALSE) {
114 			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
115 			    "Dedup table full.  Deduplication will continue "
116 			    "with existing table entries"));
117 			ddt->ddt_full = B_TRUE;
118 		}
119 		return;
120 	}
121 
122 	if ((dde = umem_cache_alloc(ddt->ddecache, UMEM_DEFAULT))
123 	    != NULL) {
124 		assert(*ddepp == NULL);
125 		dde->dde_next = NULL;
126 		dde->dde_chksum = *cs;
127 		dde->dde_ref = *dr;
128 		*ddepp = dde;
129 		ddt->cur_ddt_size += sizeof (dedup_entry_t);
130 		ddt->ddt_count++;
131 	}
132 }
133 
134 /*
135  * Using the specified dedup table, do a lookup for an entry with
136  * the checksum cs.  If found, return the block's reference info
137  * in *dr. Otherwise, insert a new entry in the dedup table, using
138  * the reference information specified by *dr.
139  *
140  * return value:  true - entry was found
141  *		  false - entry was not found
142  */
143 static boolean_t
144 ddt_update(libzfs_handle_t *hdl, dedup_table_t *ddt, zio_cksum_t *cs,
145     dataref_t *dr)
146 {
147 	uint32_t hashcode;
148 	dedup_entry_t **ddepp;
149 
150 	hashcode = BF64_GET(cs->zc_word[0], 0, ddt->numhashbits);
151 
152 	for (ddepp = &(ddt->dedup_hash_array[hashcode]); *ddepp != NULL;
153 	    ddepp = &((*ddepp)->dde_next)) {
154 		if (ZIO_CHECKSUM_EQUAL(((*ddepp)->dde_chksum), *cs)) {
155 			*dr = (*ddepp)->dde_ref;
156 			return (B_TRUE);
157 		}
158 	}
159 	ddt_hash_append(hdl, ddt, ddepp, cs, dr);
160 	return (B_FALSE);
161 }
162 
163 static int
164 cksum_and_write(const void *buf, uint64_t len, zio_cksum_t *zc, int outfd)
165 {
166 	fletcher_4_incremental_native(buf, len, zc);
167 	return (write(outfd, buf, len));
168 }
169 
170 /*
171  * This function is started in a separate thread when the dedup option
172  * has been requested.  The main send thread determines the list of
173  * snapshots to be included in the send stream and makes the ioctl calls
174  * for each one.  But instead of having the ioctl send the output to the
175  * the output fd specified by the caller of zfs_send()), the
176  * ioctl is told to direct the output to a pipe, which is read by the
177  * alternate thread running THIS function.  This function does the
178  * dedup'ing by:
179  *  1. building a dedup table (the DDT)
180  *  2. doing checksums on each data block and inserting a record in the DDT
181  *  3. looking for matching checksums, and
182  *  4.  sending a DRR_WRITE_BYREF record instead of a write record whenever
183  *      a duplicate block is found.
184  * The output of this function then goes to the output fd requested
185  * by the caller of zfs_send().
186  */
187 static void *
188 cksummer(void *arg)
189 {
190 	dedup_arg_t *dda = arg;
191 	char *buf = malloc(1<<20);
192 	dmu_replay_record_t thedrr;
193 	dmu_replay_record_t *drr = &thedrr;
194 	struct drr_begin *drrb = &thedrr.drr_u.drr_begin;
195 	struct drr_end *drre = &thedrr.drr_u.drr_end;
196 	struct drr_object *drro = &thedrr.drr_u.drr_object;
197 	struct drr_write *drrw = &thedrr.drr_u.drr_write;
198 	FILE *ofp;
199 	int outfd;
200 	dmu_replay_record_t wbr_drr;
201 	struct drr_write_byref *wbr_drrr = &wbr_drr.drr_u.drr_write_byref;
202 	dedup_table_t ddt;
203 	zio_cksum_t stream_cksum;
204 	uint64_t physmem = sysconf(_SC_PHYS_PAGES) * sysconf(_SC_PAGESIZE);
205 	uint64_t numbuckets;
206 
207 	ddt.max_ddt_size =
208 	    MAX((physmem * MAX_DDT_PHYSMEM_PERCENT)/100,
209 	    SMALLEST_POSSIBLE_MAX_DDT_MB<<20);
210 
211 	numbuckets = ddt.max_ddt_size/(sizeof (dedup_entry_t));
212 
213 	/*
214 	 * numbuckets must be a power of 2.  Increase number to
215 	 * a power of 2 if necessary.
216 	 */
217 	if (!ISP2(numbuckets))
218 		numbuckets = 1 << high_order_bit(numbuckets);
219 
220 	ddt.dedup_hash_array = calloc(numbuckets, sizeof (dedup_entry_t *));
221 	ddt.ddecache = umem_cache_create("dde", sizeof (dedup_entry_t), 0,
222 	    NULL, NULL, NULL, NULL, NULL, 0);
223 	ddt.cur_ddt_size = numbuckets * sizeof (dedup_entry_t *);
224 	ddt.numhashbits = high_order_bit(numbuckets) - 1;
225 	ddt.ddt_full = B_FALSE;
226 
227 	/* Initialize the write-by-reference block. */
228 	wbr_drr.drr_type = DRR_WRITE_BYREF;
229 	wbr_drr.drr_payloadlen = 0;
230 
231 	outfd = dda->outputfd;
232 	ofp = fdopen(dda->inputfd, "r");
233 	while (ssread(drr, sizeof (dmu_replay_record_t), ofp) != 0) {
234 
235 		switch (drr->drr_type) {
236 		case DRR_BEGIN:
237 		{
238 			int	fflags;
239 			ZIO_SET_CHECKSUM(&stream_cksum, 0, 0, 0, 0);
240 
241 			/* set the DEDUP feature flag for this stream */
242 			fflags = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo);
243 			fflags |= DMU_BACKUP_FEATURE_DEDUP;
244 			DMU_SET_FEATUREFLAGS(drrb->drr_versioninfo, fflags);
245 
246 			if (cksum_and_write(drr, sizeof (dmu_replay_record_t),
247 			    &stream_cksum, outfd) == -1)
248 				goto out;
249 			if (DMU_GET_STREAM_HDRTYPE(drrb->drr_versioninfo) ==
250 			    DMU_COMPOUNDSTREAM && drr->drr_payloadlen != 0) {
251 				int sz = drr->drr_payloadlen;
252 
253 				if (sz > 1<<20) {
254 					free(buf);
255 					buf = malloc(sz);
256 				}
257 				(void) ssread(buf, sz, ofp);
258 				if (ferror(stdin))
259 					perror("fread");
260 				if (cksum_and_write(buf, sz, &stream_cksum,
261 				    outfd) == -1)
262 					goto out;
263 			}
264 			break;
265 		}
266 
267 		case DRR_END:
268 		{
269 			/* use the recalculated checksum */
270 			ZIO_SET_CHECKSUM(&drre->drr_checksum,
271 			    stream_cksum.zc_word[0], stream_cksum.zc_word[1],
272 			    stream_cksum.zc_word[2], stream_cksum.zc_word[3]);
273 			if ((write(outfd, drr,
274 			    sizeof (dmu_replay_record_t))) == -1)
275 				goto out;
276 			break;
277 		}
278 
279 		case DRR_OBJECT:
280 		{
281 			if (cksum_and_write(drr, sizeof (dmu_replay_record_t),
282 			    &stream_cksum, outfd) == -1)
283 				goto out;
284 			if (drro->drr_bonuslen > 0) {
285 				(void) ssread(buf,
286 				    P2ROUNDUP((uint64_t)drro->drr_bonuslen, 8),
287 				    ofp);
288 				if (cksum_and_write(buf,
289 				    P2ROUNDUP((uint64_t)drro->drr_bonuslen, 8),
290 				    &stream_cksum, outfd) == -1)
291 					goto out;
292 			}
293 			break;
294 		}
295 
296 		case DRR_FREEOBJECTS:
297 		{
298 			if (cksum_and_write(drr, sizeof (dmu_replay_record_t),
299 			    &stream_cksum, outfd) == -1)
300 				goto out;
301 			break;
302 		}
303 
304 		case DRR_WRITE:
305 		{
306 			dataref_t	dataref;
307 
308 			(void) ssread(buf, drrw->drr_length, ofp);
309 			/*
310 			 * If the block doesn't already have a dedup
311 			 * checksum, calculate one.
312 			 */
313 			if (ZIO_CHECKSUM_EQUAL(drrw->drr_blkcksum,
314 			    zero_cksum)) {
315 				SHA256_CTX	ctx;
316 				zio_cksum_t	tmpsha256;
317 
318 				SHA256Init(&ctx);
319 				SHA256Update(&ctx, buf, drrw->drr_length);
320 				SHA256Final(&tmpsha256, &ctx);
321 				drrw->drr_blkcksum.zc_word[0] =
322 				    BE_64(tmpsha256.zc_word[0]);
323 				drrw->drr_blkcksum.zc_word[1] =
324 				    BE_64(tmpsha256.zc_word[1]);
325 				drrw->drr_blkcksum.zc_word[2] =
326 				    BE_64(tmpsha256.zc_word[2]);
327 				drrw->drr_blkcksum.zc_word[3] =
328 				    BE_64(tmpsha256.zc_word[3]);
329 			}
330 
331 			dataref.ref_guid = drrw->drr_toguid;
332 			dataref.ref_object = drrw->drr_object;
333 			dataref.ref_offset = drrw->drr_offset;
334 
335 			if (ddt_update(dda->dedup_hdl, &ddt,
336 			    &drrw->drr_blkcksum, &dataref)) {
337 				/* block already present in stream */
338 				wbr_drrr->drr_object = drrw->drr_object;
339 				wbr_drrr->drr_offset = drrw->drr_offset;
340 				wbr_drrr->drr_length = drrw->drr_length;
341 				wbr_drrr->drr_toguid = drrw->drr_toguid;
342 				wbr_drrr->drr_refguid = dataref.ref_guid;
343 				wbr_drrr->drr_refobject =
344 				    dataref.ref_object;
345 				wbr_drrr->drr_refoffset =
346 				    dataref.ref_offset;
347 
348 				wbr_drrr->drr_blkcksum = drrw->drr_blkcksum;
349 
350 				if (cksum_and_write(&wbr_drr,
351 				    sizeof (dmu_replay_record_t), &stream_cksum,
352 				    outfd) == -1)
353 					goto out;
354 			} else {
355 				/* block not previously seen */
356 				if (cksum_and_write(drr,
357 				    sizeof (dmu_replay_record_t), &stream_cksum,
358 				    outfd) == -1)
359 					goto out;
360 				if (cksum_and_write(buf,
361 				    drrw->drr_length,
362 				    &stream_cksum, outfd) == -1)
363 					goto out;
364 			}
365 			break;
366 		}
367 
368 		case DRR_FREE:
369 		{
370 			if (cksum_and_write(drr, sizeof (dmu_replay_record_t),
371 			    &stream_cksum, outfd) == -1)
372 				goto out;
373 			break;
374 		}
375 
376 		default:
377 			(void) printf("INVALID record type 0x%x\n",
378 			    drr->drr_type);
379 			/* should never happen, so assert */
380 			assert(B_FALSE);
381 		}
382 	}
383 out:
384 	umem_cache_destroy(ddt.ddecache);
385 	free(ddt.dedup_hash_array);
386 	free(buf);
387 	(void) fclose(ofp);
388 
389 	return (NULL);
390 }
391 
392 /*
393  * Routines for dealing with the AVL tree of fs-nvlists
394  */
/*
 * Node of the AVL tree built by fsavl_create(): one per snapshot, keyed
 * by the snapshot's guid.
 */
typedef struct fsavl_node {
	avl_node_t fn_node;	/* AVL linkage */
	nvlist_t *fn_nvfs;	/* nvlist of the fs the snapshot belongs to */
	char *fn_snapname;	/* name as returned by nvpair_name() */
	uint64_t fn_guid;	/* snapshot guid; the sort key */
} fsavl_node_t;
401 
402 static int
403 fsavl_compare(const void *arg1, const void *arg2)
404 {
405 	const fsavl_node_t *fn1 = arg1;
406 	const fsavl_node_t *fn2 = arg2;
407 
408 	if (fn1->fn_guid > fn2->fn_guid)
409 		return (+1);
410 	else if (fn1->fn_guid < fn2->fn_guid)
411 		return (-1);
412 	else
413 		return (0);
414 }
415 
416 /*
417  * Given the GUID of a snapshot, find its containing filesystem and
418  * (optionally) name.
419  */
420 static nvlist_t *
421 fsavl_find(avl_tree_t *avl, uint64_t snapguid, char **snapname)
422 {
423 	fsavl_node_t fn_find;
424 	fsavl_node_t *fn;
425 
426 	fn_find.fn_guid = snapguid;
427 
428 	fn = avl_find(avl, &fn_find, NULL);
429 	if (fn) {
430 		if (snapname)
431 			*snapname = fn->fn_snapname;
432 		return (fn->fn_nvfs);
433 	}
434 	return (NULL);
435 }
436 
437 static void
438 fsavl_destroy(avl_tree_t *avl)
439 {
440 	fsavl_node_t *fn;
441 	void *cookie;
442 
443 	if (avl == NULL)
444 		return;
445 
446 	cookie = NULL;
447 	while ((fn = avl_destroy_nodes(avl, &cookie)) != NULL)
448 		free(fn);
449 	avl_destroy(avl);
450 	free(avl);
451 }
452 
453 /*
454  * Given an nvlist, produce an avl tree of snapshots, ordered by guid
455  */
456 static avl_tree_t *
457 fsavl_create(nvlist_t *fss)
458 {
459 	avl_tree_t *fsavl;
460 	nvpair_t *fselem = NULL;
461 
462 	if ((fsavl = malloc(sizeof (avl_tree_t))) == NULL)
463 		return (NULL);
464 
465 	avl_create(fsavl, fsavl_compare, sizeof (fsavl_node_t),
466 	    offsetof(fsavl_node_t, fn_node));
467 
468 	while ((fselem = nvlist_next_nvpair(fss, fselem)) != NULL) {
469 		nvlist_t *nvfs, *snaps;
470 		nvpair_t *snapelem = NULL;
471 
472 		VERIFY(0 == nvpair_value_nvlist(fselem, &nvfs));
473 		VERIFY(0 == nvlist_lookup_nvlist(nvfs, "snaps", &snaps));
474 
475 		while ((snapelem =
476 		    nvlist_next_nvpair(snaps, snapelem)) != NULL) {
477 			fsavl_node_t *fn;
478 			uint64_t guid;
479 
480 			VERIFY(0 == nvpair_value_uint64(snapelem, &guid));
481 			if ((fn = malloc(sizeof (fsavl_node_t))) == NULL) {
482 				fsavl_destroy(fsavl);
483 				return (NULL);
484 			}
485 			fn->fn_nvfs = nvfs;
486 			fn->fn_snapname = nvpair_name(snapelem);
487 			fn->fn_guid = guid;
488 
489 			/*
490 			 * Note: if there are multiple snaps with the
491 			 * same GUID, we ignore all but one.
492 			 */
493 			if (avl_find(fsavl, fn, NULL) == NULL)
494 				avl_add(fsavl, fn);
495 			else
496 				free(fn);
497 		}
498 	}
499 
500 	return (fsavl);
501 }
502 
503 /*
504  * Routines for dealing with the giant nvlist of fs-nvlists, etc.
505  */
/*
 * State threaded through send_iterate_fs() and send_iterate_snap() while
 * the header nvlist is being gathered.
 */
typedef struct send_data {
	/*
	 * guid of the fromsnap (or, failing that, the tosnap) found among
	 * the snapshots of the fs being visited; recorded in each child
	 * fs as "parentfromsnap".
	 */
	uint64_t parent_fromsnap_guid;
	/* snapshot name -> guid, for the fs currently being visited */
	nvlist_t *parent_snaps;
	/* the result: one nvlist per fs, keyed by the fs guid in hex */
	nvlist_t *fss;
	/* snapshot name -> properties, for the fs currently being visited */
	nvlist_t *snapprops;
	/* short snapshot names to match against; either may be NULL */
	const char *fromsnap;
	const char *tosnap;

	/*
	 * The header nvlist is of the following format:
	 * {
	 *   "tosnap" -> string
	 *   "fromsnap" -> string (if incremental)
	 *   "fss" -> {
	 *	id -> {
	 *
	 *	 "name" -> string (full name; for debugging)
	 *	 "parentfromsnap" -> number (guid of fromsnap in parent)
	 *
	 *	 "props" -> { name -> value (only if set here) }
	 *	 "snaps" -> { name (lastname) -> number (guid) }
	 *	 "snapprops" -> { name (lastname) -> { name -> value } }
	 *
	 *	 "origin" -> number (guid) (if clone)
	 *	 "sent" -> boolean (not on-disk)
	 *	}
	 *   }
	 * }
	 *
	 */
} send_data_t;
537 
538 static void send_iterate_prop(zfs_handle_t *zhp, nvlist_t *nv);
539 
540 static int
541 send_iterate_snap(zfs_handle_t *zhp, void *arg)
542 {
543 	send_data_t *sd = arg;
544 	uint64_t guid = zhp->zfs_dmustats.dds_guid;
545 	char *snapname;
546 	nvlist_t *nv;
547 
548 	snapname = strrchr(zhp->zfs_name, '@')+1;
549 
550 	VERIFY(0 == nvlist_add_uint64(sd->parent_snaps, snapname, guid));
551 	/*
552 	 * NB: if there is no fromsnap here (it's a newly created fs in
553 	 * an incremental replication), we will substitute the tosnap.
554 	 */
555 	if ((sd->fromsnap && strcmp(snapname, sd->fromsnap) == 0) ||
556 	    (sd->parent_fromsnap_guid == 0 && sd->tosnap &&
557 	    strcmp(snapname, sd->tosnap) == 0)) {
558 		sd->parent_fromsnap_guid = guid;
559 	}
560 
561 	VERIFY(0 == nvlist_alloc(&nv, NV_UNIQUE_NAME, 0));
562 	send_iterate_prop(zhp, nv);
563 	VERIFY(0 == nvlist_add_nvlist(sd->snapprops, snapname, nv));
564 	nvlist_free(nv);
565 
566 	zfs_close(zhp);
567 	return (0);
568 }
569 
570 static void
571 send_iterate_prop(zfs_handle_t *zhp, nvlist_t *nv)
572 {
573 	nvpair_t *elem = NULL;
574 
575 	while ((elem = nvlist_next_nvpair(zhp->zfs_props, elem)) != NULL) {
576 		char *propname = nvpair_name(elem);
577 		zfs_prop_t prop = zfs_name_to_prop(propname);
578 		nvlist_t *propnv;
579 
580 		if (!zfs_prop_user(propname)) {
581 			/*
582 			 * Realistically, this should never happen.  However,
583 			 * we want the ability to add DSL properties without
584 			 * needing to make incompatible version changes.  We
585 			 * need to ignore unknown properties to allow older
586 			 * software to still send datasets containing these
587 			 * properties, with the unknown properties elided.
588 			 */
589 			if (prop == ZPROP_INVAL)
590 				continue;
591 
592 			if (zfs_prop_readonly(prop))
593 				continue;
594 		}
595 
596 		verify(nvpair_value_nvlist(elem, &propnv) == 0);
597 		if (prop == ZFS_PROP_QUOTA || prop == ZFS_PROP_RESERVATION ||
598 		    prop == ZFS_PROP_REFQUOTA ||
599 		    prop == ZFS_PROP_REFRESERVATION) {
600 			/* these guys are modifyable, but have no source */
601 			uint64_t value;
602 			verify(nvlist_lookup_uint64(propnv,
603 			    ZPROP_VALUE, &value) == 0);
604 			if (zhp->zfs_type == ZFS_TYPE_SNAPSHOT)
605 				continue;
606 		} else {
607 			char *source;
608 			if (nvlist_lookup_string(propnv,
609 			    ZPROP_SOURCE, &source) != 0)
610 				continue;
611 			if (strcmp(source, zhp->zfs_name) != 0)
612 				continue;
613 		}
614 
615 		if (zfs_prop_user(propname) ||
616 		    zfs_prop_get_type(prop) == PROP_TYPE_STRING) {
617 			char *value;
618 			verify(nvlist_lookup_string(propnv,
619 			    ZPROP_VALUE, &value) == 0);
620 			VERIFY(0 == nvlist_add_string(nv, propname, value));
621 		} else {
622 			uint64_t value;
623 			verify(nvlist_lookup_uint64(propnv,
624 			    ZPROP_VALUE, &value) == 0);
625 			VERIFY(0 == nvlist_add_uint64(nv, propname, value));
626 		}
627 	}
628 }
629 
630 /*
631  * recursively generate nvlists describing datasets.  See comment
632  * for the data structure send_data_t above for description of contents
633  * of the nvlist.
634  */
635 static int
636 send_iterate_fs(zfs_handle_t *zhp, void *arg)
637 {
638 	send_data_t *sd = arg;
639 	nvlist_t *nvfs, *nv;
640 	int rv;
641 	uint64_t parent_fromsnap_guid_save = sd->parent_fromsnap_guid;
642 	uint64_t guid = zhp->zfs_dmustats.dds_guid;
643 	char guidstring[64];
644 
645 	VERIFY(0 == nvlist_alloc(&nvfs, NV_UNIQUE_NAME, 0));
646 	VERIFY(0 == nvlist_add_string(nvfs, "name", zhp->zfs_name));
647 	VERIFY(0 == nvlist_add_uint64(nvfs, "parentfromsnap",
648 	    sd->parent_fromsnap_guid));
649 
650 	if (zhp->zfs_dmustats.dds_origin[0]) {
651 		zfs_handle_t *origin = zfs_open(zhp->zfs_hdl,
652 		    zhp->zfs_dmustats.dds_origin, ZFS_TYPE_SNAPSHOT);
653 		if (origin == NULL)
654 			return (-1);
655 		VERIFY(0 == nvlist_add_uint64(nvfs, "origin",
656 		    origin->zfs_dmustats.dds_guid));
657 	}
658 
659 	/* iterate over props */
660 	VERIFY(0 == nvlist_alloc(&nv, NV_UNIQUE_NAME, 0));
661 	send_iterate_prop(zhp, nv);
662 	VERIFY(0 == nvlist_add_nvlist(nvfs, "props", nv));
663 	nvlist_free(nv);
664 
665 	/* iterate over snaps, and set sd->parent_fromsnap_guid */
666 	sd->parent_fromsnap_guid = 0;
667 	VERIFY(0 == nvlist_alloc(&sd->parent_snaps, NV_UNIQUE_NAME, 0));
668 	VERIFY(0 == nvlist_alloc(&sd->snapprops, NV_UNIQUE_NAME, 0));
669 	(void) zfs_iter_snapshots(zhp, send_iterate_snap, sd);
670 	VERIFY(0 == nvlist_add_nvlist(nvfs, "snaps", sd->parent_snaps));
671 	VERIFY(0 == nvlist_add_nvlist(nvfs, "snapprops", sd->snapprops));
672 	nvlist_free(sd->parent_snaps);
673 	nvlist_free(sd->snapprops);
674 
675 	/* add this fs to nvlist */
676 	(void) snprintf(guidstring, sizeof (guidstring),
677 	    "0x%llx", (longlong_t)guid);
678 	VERIFY(0 == nvlist_add_nvlist(sd->fss, guidstring, nvfs));
679 	nvlist_free(nvfs);
680 
681 	/* iterate over children */
682 	rv = zfs_iter_filesystems(zhp, send_iterate_fs, sd);
683 
684 	sd->parent_fromsnap_guid = parent_fromsnap_guid_save;
685 
686 	zfs_close(zhp);
687 	return (rv);
688 }
689 
690 static int
691 gather_nvlist(libzfs_handle_t *hdl, const char *fsname, const char *fromsnap,
692     const char *tosnap, nvlist_t **nvlp, avl_tree_t **avlp)
693 {
694 	zfs_handle_t *zhp;
695 	send_data_t sd = { 0 };
696 	int error;
697 
698 	zhp = zfs_open(hdl, fsname, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME);
699 	if (zhp == NULL)
700 		return (EZFS_BADTYPE);
701 
702 	VERIFY(0 == nvlist_alloc(&sd.fss, NV_UNIQUE_NAME, 0));
703 	sd.fromsnap = fromsnap;
704 	sd.tosnap = tosnap;
705 
706 	if ((error = send_iterate_fs(zhp, &sd)) != 0) {
707 		nvlist_free(sd.fss);
708 		if (avlp != NULL)
709 			*avlp = NULL;
710 		*nvlp = NULL;
711 		return (error);
712 	}
713 
714 	if (avlp != NULL && (*avlp = fsavl_create(sd.fss)) == NULL) {
715 		nvlist_free(sd.fss);
716 		*nvlp = NULL;
717 		return (EZFS_NOMEM);
718 	}
719 
720 	*nvlp = sd.fss;
721 	return (0);
722 }
723 
724 /*
725  * Routines for dealing with the sorted snapshot functionality
726  */
/*
 * AVL node used by zfs_iter_snapshots_sorted() to hold one snapshot
 * handle while the snapshots are being ordered.
 */
typedef struct zfs_node {
	zfs_handle_t	*zn_handle;	/* the snapshot */
	avl_node_t	zn_avlnode;	/* AVL linkage */
} zfs_node_t;
731 
732 static int
733 zfs_sort_snaps(zfs_handle_t *zhp, void *data)
734 {
735 	avl_tree_t *avl = data;
736 	zfs_node_t *node = zfs_alloc(zhp->zfs_hdl, sizeof (zfs_node_t));
737 
738 	node->zn_handle = zhp;
739 	avl_add(avl, node);
740 	return (0);
741 }
742 
743 /* ARGSUSED */
744 static int
745 zfs_snapshot_compare(const void *larg, const void *rarg)
746 {
747 	zfs_handle_t *l = ((zfs_node_t *)larg)->zn_handle;
748 	zfs_handle_t *r = ((zfs_node_t *)rarg)->zn_handle;
749 	uint64_t lcreate, rcreate;
750 
751 	/*
752 	 * Sort them according to creation time.  We use the hidden
753 	 * CREATETXG property to get an absolute ordering of snapshots.
754 	 */
755 	lcreate = zfs_prop_get_int(l, ZFS_PROP_CREATETXG);
756 	rcreate = zfs_prop_get_int(r, ZFS_PROP_CREATETXG);
757 
758 	if (lcreate < rcreate)
759 		return (-1);
760 	else if (lcreate > rcreate)
761 		return (+1);
762 	else
763 		return (0);
764 }
765 
766 int
767 zfs_iter_snapshots_sorted(zfs_handle_t *zhp, zfs_iter_f callback, void *data)
768 {
769 	int ret = 0;
770 	zfs_node_t *node;
771 	avl_tree_t avl;
772 	void *cookie = NULL;
773 
774 	avl_create(&avl, zfs_snapshot_compare,
775 	    sizeof (zfs_node_t), offsetof(zfs_node_t, zn_avlnode));
776 
777 	ret = zfs_iter_snapshots(zhp, zfs_sort_snaps, &avl);
778 
779 	for (node = avl_first(&avl); node != NULL; node = AVL_NEXT(&avl, node))
780 		ret |= callback(node->zn_handle, data);
781 
782 	while ((node = avl_destroy_nodes(&avl, &cookie)) != NULL)
783 		free(node);
784 
785 	avl_destroy(&avl);
786 
787 	return (ret);
788 }
789 
790 /*
791  * Routines specific to "zfs send"
792  */
/*
 * State shared by dump_filesystems(), dump_filesystem() and
 * dump_snapshot() while a send stream is being generated.
 */
typedef struct send_dump_data {
	/* these are all just the short snapname (the part after the @) */
	const char *fromsnap;
	const char *tosnap;
	/* last snapshot sent (or the fromsnap); "" before the first one */
	char prevsnap[ZFS_MAXNAMELEN];
	/*
	 * seenfrom/seento track progress through the sorted snapshots of
	 * one filesystem; replicate, doall and fromorigin select the kind
	 * of stream being produced.
	 */
	boolean_t seenfrom, seento, replicate, doall, fromorigin;
	/* print a "sending from ... to ..." line for each snapshot */
	boolean_t verbose;
	/* fd handed to the ZFS_IOC_SEND ioctl */
	int outfd;
	/* set when a snapshot had to be skipped with a warning */
	boolean_t err;
	/* per-filesystem nvlists walked by dump_filesystems() */
	nvlist_t *fss;
	/* snapshot guid -> filesystem lookup, see fsavl_find() */
	avl_tree_t *fsavl;
	/* optional filter; a snapshot is skipped when it returns B_FALSE */
	snapfilter_cb_t *filter_cb;
	void *filter_cb_arg;
} send_dump_data_t;
807 
808 /*
809  * Dumps a backup of the given snapshot (incremental from fromsnap if it's not
810  * NULL) to the file descriptor specified by outfd.
811  */
812 static int
813 dump_ioctl(zfs_handle_t *zhp, const char *fromsnap, boolean_t fromorigin,
814     int outfd)
815 {
816 	zfs_cmd_t zc = { 0 };
817 	libzfs_handle_t *hdl = zhp->zfs_hdl;
818 
819 	assert(zhp->zfs_type == ZFS_TYPE_SNAPSHOT);
820 	assert(fromsnap == NULL || fromsnap[0] == '\0' || !fromorigin);
821 
822 	(void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name));
823 	if (fromsnap)
824 		(void) strlcpy(zc.zc_value, fromsnap, sizeof (zc.zc_value));
825 	zc.zc_cookie = outfd;
826 	zc.zc_obj = fromorigin;
827 
828 	if (ioctl(zhp->zfs_hdl->libzfs_fd, ZFS_IOC_SEND, &zc) != 0) {
829 		char errbuf[1024];
830 		(void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
831 		    "warning: cannot send '%s'"), zhp->zfs_name);
832 
833 		switch (errno) {
834 
835 		case EXDEV:
836 			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
837 			    "not an earlier snapshot from the same fs"));
838 			return (zfs_error(hdl, EZFS_CROSSTARGET, errbuf));
839 
840 		case ENOENT:
841 			if (zfs_dataset_exists(hdl, zc.zc_name,
842 			    ZFS_TYPE_SNAPSHOT)) {
843 				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
844 				    "incremental source (@%s) does not exist"),
845 				    zc.zc_value);
846 			}
847 			return (zfs_error(hdl, EZFS_NOENT, errbuf));
848 
849 		case EDQUOT:
850 		case EFBIG:
851 		case EIO:
852 		case ENOLINK:
853 		case ENOSPC:
854 		case ENOSTR:
855 		case ENXIO:
856 		case EPIPE:
857 		case ERANGE:
858 		case EFAULT:
859 		case EROFS:
860 			zfs_error_aux(hdl, strerror(errno));
861 			return (zfs_error(hdl, EZFS_BADBACKUP, errbuf));
862 
863 		default:
864 			return (zfs_standard_error(hdl, errno, errbuf));
865 		}
866 	}
867 
868 	return (0);
869 }
870 
871 static int
872 dump_snapshot(zfs_handle_t *zhp, void *arg)
873 {
874 	send_dump_data_t *sdd = arg;
875 	const char *thissnap;
876 	int err;
877 
878 	thissnap = strchr(zhp->zfs_name, '@') + 1;
879 
880 	if (sdd->fromsnap && !sdd->seenfrom &&
881 	    strcmp(sdd->fromsnap, thissnap) == 0) {
882 		sdd->seenfrom = B_TRUE;
883 		(void) strcpy(sdd->prevsnap, thissnap);
884 		zfs_close(zhp);
885 		return (0);
886 	}
887 
888 	if (sdd->seento || !sdd->seenfrom) {
889 		zfs_close(zhp);
890 		return (0);
891 	}
892 
893 	if (strcmp(sdd->tosnap, thissnap) == 0)
894 		sdd->seento = B_TRUE;
895 
896 	/*
897 	 * If a filter function exists, call it to determine whether
898 	 * this snapshot will be sent.
899 	 */
900 	if (sdd->filter_cb != NULL &&
901 	    sdd->filter_cb(zhp, sdd->filter_cb_arg) == B_FALSE) {
902 		/*
903 		 * This snapshot is filtered out.  Don't send it, and don't
904 		 * set prevsnap, so it will be as if this snapshot didn't
905 		 * exist, and the next accepted snapshot will be sent as
906 		 * an incremental from the last accepted one, or as the
907 		 * first (and full) snapshot in the case of a replication,
908 		 * non-incremental send.
909 		 */
910 		zfs_close(zhp);
911 		return (0);
912 	}
913 
914 	/* send it */
915 	if (sdd->verbose) {
916 		(void) fprintf(stderr, "sending from @%s to %s\n",
917 		    sdd->prevsnap, zhp->zfs_name);
918 	}
919 
920 	err = dump_ioctl(zhp, sdd->prevsnap,
921 	    sdd->prevsnap[0] == '\0' && (sdd->fromorigin || sdd->replicate),
922 	    sdd->outfd);
923 
924 	(void) strcpy(sdd->prevsnap, thissnap);
925 	zfs_close(zhp);
926 	return (err);
927 }
928 
929 static int
930 dump_filesystem(zfs_handle_t *zhp, void *arg)
931 {
932 	int rv = 0;
933 	send_dump_data_t *sdd = arg;
934 	boolean_t missingfrom = B_FALSE;
935 	zfs_cmd_t zc = { 0 };
936 
937 	(void) snprintf(zc.zc_name, sizeof (zc.zc_name), "%s@%s",
938 	    zhp->zfs_name, sdd->tosnap);
939 	if (ioctl(zhp->zfs_hdl->libzfs_fd, ZFS_IOC_OBJSET_STATS, &zc) != 0) {
940 		(void) fprintf(stderr, "WARNING: "
941 		    "could not send %s@%s: does not exist\n",
942 		    zhp->zfs_name, sdd->tosnap);
943 		sdd->err = B_TRUE;
944 		return (0);
945 	}
946 
947 	if (sdd->replicate && sdd->fromsnap) {
948 		/*
949 		 * If this fs does not have fromsnap, and we're doing
950 		 * recursive, we need to send a full stream from the
951 		 * beginning (or an incremental from the origin if this
952 		 * is a clone).  If we're doing non-recursive, then let
953 		 * them get the error.
954 		 */
955 		(void) snprintf(zc.zc_name, sizeof (zc.zc_name), "%s@%s",
956 		    zhp->zfs_name, sdd->fromsnap);
957 		if (ioctl(zhp->zfs_hdl->libzfs_fd,
958 		    ZFS_IOC_OBJSET_STATS, &zc) != 0) {
959 			missingfrom = B_TRUE;
960 		}
961 	}
962 
963 	if (sdd->doall) {
964 		sdd->seenfrom = sdd->seento = sdd->prevsnap[0] = 0;
965 		if (sdd->fromsnap == NULL || missingfrom)
966 			sdd->seenfrom = B_TRUE;
967 
968 		rv = zfs_iter_snapshots_sorted(zhp, dump_snapshot, arg);
969 		if (!sdd->seenfrom) {
970 			(void) fprintf(stderr,
971 			    "WARNING: could not send %s@%s:\n"
972 			    "incremental source (%s@%s) does not exist\n",
973 			    zhp->zfs_name, sdd->tosnap,
974 			    zhp->zfs_name, sdd->fromsnap);
975 			sdd->err = B_TRUE;
976 		} else if (!sdd->seento) {
977 			if (sdd->fromsnap) {
978 				(void) fprintf(stderr,
979 				    "WARNING: could not send %s@%s:\n"
980 				    "incremental source (%s@%s) "
981 				    "is not earlier than it\n",
982 				    zhp->zfs_name, sdd->tosnap,
983 				    zhp->zfs_name, sdd->fromsnap);
984 			} else {
985 				(void) fprintf(stderr, "WARNING: "
986 				    "could not send %s@%s: does not exist\n",
987 				    zhp->zfs_name, sdd->tosnap);
988 			}
989 			sdd->err = B_TRUE;
990 		}
991 	} else {
992 		zfs_handle_t *snapzhp;
993 		char snapname[ZFS_MAXNAMELEN];
994 
995 		(void) snprintf(snapname, sizeof (snapname), "%s@%s",
996 		    zfs_get_name(zhp), sdd->tosnap);
997 		snapzhp = zfs_open(zhp->zfs_hdl, snapname, ZFS_TYPE_SNAPSHOT);
998 		if (snapzhp == NULL) {
999 			rv = -1;
1000 		} else {
1001 			if (sdd->filter_cb == NULL ||
1002 			    sdd->filter_cb(snapzhp, sdd->filter_cb_arg) ==
1003 			    B_TRUE) {
1004 				rv = dump_ioctl(snapzhp,
1005 				    missingfrom ? NULL : sdd->fromsnap,
1006 				    sdd->fromorigin || missingfrom,
1007 				    sdd->outfd);
1008 			}
1009 			sdd->seento = B_TRUE;
1010 			zfs_close(snapzhp);
1011 		}
1012 	}
1013 
1014 	return (rv);
1015 }
1016 
1017 static int
1018 dump_filesystems(zfs_handle_t *rzhp, void *arg)
1019 {
1020 	send_dump_data_t *sdd = arg;
1021 	nvpair_t *fspair;
1022 	boolean_t needagain, progress;
1023 
1024 	if (!sdd->replicate)
1025 		return (dump_filesystem(rzhp, sdd));
1026 
1027 again:
1028 	needagain = progress = B_FALSE;
1029 	for (fspair = nvlist_next_nvpair(sdd->fss, NULL); fspair;
1030 	    fspair = nvlist_next_nvpair(sdd->fss, fspair)) {
1031 		nvlist_t *fslist;
1032 		char *fsname;
1033 		zfs_handle_t *zhp;
1034 		int err;
1035 		uint64_t origin_guid = 0;
1036 		nvlist_t *origin_nv;
1037 
1038 		VERIFY(nvpair_value_nvlist(fspair, &fslist) == 0);
1039 		if (nvlist_lookup_boolean(fslist, "sent") == 0)
1040 			continue;
1041 
1042 		VERIFY(nvlist_lookup_string(fslist, "name", &fsname) == 0);
1043 		(void) nvlist_lookup_uint64(fslist, "origin", &origin_guid);
1044 
1045 		origin_nv = fsavl_find(sdd->fsavl, origin_guid, NULL);
1046 		if (origin_nv &&
1047 		    nvlist_lookup_boolean(origin_nv, "sent") == ENOENT) {
1048 				/*
1049 				 * origin has not been sent yet;
1050 				 * skip this clone.
1051 				 */
1052 				needagain = B_TRUE;
1053 				continue;
1054 		}
1055 
1056 		zhp = zfs_open(rzhp->zfs_hdl, fsname, ZFS_TYPE_DATASET);
1057 		if (zhp == NULL)
1058 			return (-1);
1059 		err = dump_filesystem(zhp, sdd);
1060 		VERIFY(nvlist_add_boolean(fslist, "sent") == 0);
1061 		progress = B_TRUE;
1062 		zfs_close(zhp);
1063 		if (err)
1064 			return (err);
1065 	}
1066 	if (needagain) {
1067 		assert(progress);
1068 		goto again;
1069 	}
1070 	return (0);
1071 }
1072 
1073 /*
1074  * Generate a send stream for the dataset identified by the argument zhp.
1075  *
1076  * The content of the send stream is the snapshot identified by
1077  * 'tosnap'.  Incremental streams are requested in two ways:
1078  *     - from the snapshot identified by "fromsnap" (if non-null) or
1079  *     - from the origin of the dataset identified by zhp, which must
1080  *	 be a clone.  In this case, "fromsnap" is null and "fromorigin"
1081  *	 is TRUE.
1082  *
1083  * The send stream is recursive (i.e. dumps a hierarchy of snapshots) and
1084  * uses a special header (with a hdrtype field of DMU_COMPOUNDSTREAM)
1085  * if "replicate" is set.  If "doall" is set, dump all the intermediate
1086  * snapshots. The DMU_COMPOUNDSTREAM header is used in the "doall"
1087  * case too.
1088  */
1089 int
1090 zfs_send(zfs_handle_t *zhp, const char *fromsnap, const char *tosnap,
1091     sendflags_t flags, int outfd, snapfilter_cb_t filter_func,
1092     void *cb_arg)
1093 {
1094 	char errbuf[1024];
1095 	send_dump_data_t sdd = { 0 };
1096 	int err;
1097 	nvlist_t *fss = NULL;
1098 	avl_tree_t *fsavl = NULL;
1099 	char holdtag[128];
1100 	static uint64_t holdseq;
1101 	int spa_version;
1102 	boolean_t holdsnaps = B_FALSE;
1103 	pthread_t tid;
1104 	int pipefd[2];
1105 	dedup_arg_t dda = { 0 };
1106 	int featureflags = 0;
1107 
1108 	(void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
1109 	    "cannot send '%s'"), zhp->zfs_name);
1110 
1111 	if (fromsnap && fromsnap[0] == '\0') {
1112 		zfs_error_aux(zhp->zfs_hdl, dgettext(TEXT_DOMAIN,
1113 		    "zero-length incremental source"));
1114 		return (zfs_error(zhp->zfs_hdl, EZFS_NOENT, errbuf));
1115 	}
1116 
1117 	if (zfs_spa_version(zhp, &spa_version) == 0 &&
1118 	    spa_version >= SPA_VERSION_USERREFS)
1119 		holdsnaps = B_TRUE;
1120 
1121 	if (flags.dedup) {
1122 		featureflags |= DMU_BACKUP_FEATURE_DEDUP;
1123 		if (err = pipe(pipefd)) {
1124 			zfs_error_aux(zhp->zfs_hdl, strerror(errno));
1125 			return (zfs_error(zhp->zfs_hdl, EZFS_PIPEFAILED,
1126 			    errbuf));
1127 		}
1128 		dda.outputfd = outfd;
1129 		dda.inputfd = pipefd[1];
1130 		dda.dedup_hdl = zhp->zfs_hdl;
1131 		if (err = pthread_create(&tid, NULL, cksummer, &dda)) {
1132 			(void) close(pipefd[0]);
1133 			(void) close(pipefd[1]);
1134 			zfs_error_aux(zhp->zfs_hdl, strerror(errno));
1135 			return (zfs_error(zhp->zfs_hdl,
1136 			    EZFS_THREADCREATEFAILED, errbuf));
1137 		}
1138 	}
1139 
1140 	if (flags.replicate || flags.doall) {
1141 		dmu_replay_record_t drr = { 0 };
1142 		char *packbuf = NULL;
1143 		size_t buflen = 0;
1144 		zio_cksum_t zc = { 0 };
1145 
1146 		assert(fromsnap || flags.doall);
1147 
1148 		if (holdsnaps) {
1149 			(void) snprintf(holdtag, sizeof (holdtag),
1150 			    ".send-%d-%llu", getpid(), (u_longlong_t)holdseq);
1151 			++holdseq;
1152 			err = zfs_hold_range(zhp, fromsnap, tosnap,
1153 			    holdtag, B_TRUE);
1154 			if (err)
1155 				goto err_out;
1156 		}
1157 
1158 
1159 		if (flags.replicate) {
1160 			nvlist_t *hdrnv;
1161 
1162 			VERIFY(0 == nvlist_alloc(&hdrnv, NV_UNIQUE_NAME, 0));
1163 			if (fromsnap) {
1164 				VERIFY(0 == nvlist_add_string(hdrnv,
1165 				    "fromsnap", fromsnap));
1166 			}
1167 			VERIFY(0 == nvlist_add_string(hdrnv, "tosnap", tosnap));
1168 
1169 			err = gather_nvlist(zhp->zfs_hdl, zhp->zfs_name,
1170 			    fromsnap, tosnap, &fss, &fsavl);
1171 			if (err) {
1172 				if (holdsnaps) {
1173 					(void) zfs_release_range(zhp, fromsnap,
1174 					    tosnap, holdtag);
1175 				}
1176 				goto err_out;
1177 			}
1178 			VERIFY(0 == nvlist_add_nvlist(hdrnv, "fss", fss));
1179 			err = nvlist_pack(hdrnv, &packbuf, &buflen,
1180 			    NV_ENCODE_XDR, 0);
1181 			nvlist_free(hdrnv);
1182 			if (err) {
1183 				fsavl_destroy(fsavl);
1184 				nvlist_free(fss);
1185 				if (holdsnaps) {
1186 					(void) zfs_release_range(zhp, fromsnap,
1187 					    tosnap, holdtag);
1188 				}
1189 				goto stderr_out;
1190 			}
1191 		}
1192 
1193 		/* write first begin record */
1194 		drr.drr_type = DRR_BEGIN;
1195 		drr.drr_u.drr_begin.drr_magic = DMU_BACKUP_MAGIC;
1196 		DMU_SET_STREAM_HDRTYPE(drr.drr_u.drr_begin.drr_versioninfo,
1197 		    DMU_COMPOUNDSTREAM);
1198 		DMU_SET_FEATUREFLAGS(drr.drr_u.drr_begin.drr_versioninfo,
1199 		    featureflags);
1200 		(void) snprintf(drr.drr_u.drr_begin.drr_toname,
1201 		    sizeof (drr.drr_u.drr_begin.drr_toname),
1202 		    "%s@%s", zhp->zfs_name, tosnap);
1203 		drr.drr_payloadlen = buflen;
1204 		err = cksum_and_write(&drr, sizeof (drr), &zc, outfd);
1205 
1206 		/* write header nvlist */
1207 		if (err != -1 && flags.replicate) {
1208 			err = cksum_and_write(packbuf, buflen, &zc, outfd);
1209 		}
1210 		free(packbuf);
1211 		if (err == -1) {
1212 			fsavl_destroy(fsavl);
1213 			nvlist_free(fss);
1214 			if (holdsnaps) {
1215 				(void) zfs_release_range(zhp, fromsnap, tosnap,
1216 				    holdtag);
1217 			}
1218 			err = errno;
1219 			goto stderr_out;
1220 		}
1221 
1222 		/* write end record */
1223 		if (err != -1) {
1224 			bzero(&drr, sizeof (drr));
1225 			drr.drr_type = DRR_END;
1226 			drr.drr_u.drr_end.drr_checksum = zc;
1227 			err = write(outfd, &drr, sizeof (drr));
1228 			if (err == -1) {
1229 				fsavl_destroy(fsavl);
1230 				nvlist_free(fss);
1231 				if (holdsnaps) {
1232 					(void) zfs_release_range(zhp, fromsnap,
1233 					    tosnap, holdtag);
1234 				}
1235 				err = errno;
1236 				goto stderr_out;
1237 			}
1238 		}
1239 	}
1240 
1241 	/* dump each stream */
1242 	sdd.fromsnap = fromsnap;
1243 	sdd.tosnap = tosnap;
1244 	if (flags.dedup)
1245 		sdd.outfd = pipefd[0];
1246 	else
1247 		sdd.outfd = outfd;
1248 	sdd.replicate = flags.replicate;
1249 	sdd.doall = flags.doall;
1250 	sdd.fromorigin = flags.fromorigin;
1251 	sdd.fss = fss;
1252 	sdd.fsavl = fsavl;
1253 	sdd.verbose = flags.verbose;
1254 	sdd.filter_cb = filter_func;
1255 	sdd.filter_cb_arg = cb_arg;
1256 	err = dump_filesystems(zhp, &sdd);
1257 	fsavl_destroy(fsavl);
1258 	nvlist_free(fss);
1259 
1260 	if (flags.dedup) {
1261 		(void) close(pipefd[0]);
1262 		(void) pthread_join(tid, NULL);
1263 	}
1264 	if (flags.replicate || flags.doall) {
1265 		/*
1266 		 * write final end record.  NB: want to do this even if
1267 		 * there was some error, because it might not be totally
1268 		 * failed.
1269 		 */
1270 		dmu_replay_record_t drr = { 0 };
1271 		drr.drr_type = DRR_END;
1272 		if (holdsnaps) {
1273 			(void) zfs_release_range(zhp, fromsnap, tosnap,
1274 			    holdtag);
1275 		}
1276 		if (write(outfd, &drr, sizeof (drr)) == -1) {
1277 			return (zfs_standard_error(zhp->zfs_hdl,
1278 			    errno, errbuf));
1279 		}
1280 	}
1281 
1282 	return (err || sdd.err);
1283 
1284 stderr_out:
1285 	err = zfs_standard_error(zhp->zfs_hdl, err, errbuf);
1286 err_out:
1287 	if (flags.dedup) {
1288 		(void) pthread_cancel(tid);
1289 		(void) pthread_join(tid, NULL);
1290 		(void) close(pipefd[0]);
1291 	}
1292 	return (err);
1293 }
1294 
1295 /*
1296  * Routines specific to "zfs recv"
1297  */
1298 
1299 static int
1300 recv_read(libzfs_handle_t *hdl, int fd, void *buf, int ilen,
1301     boolean_t byteswap, zio_cksum_t *zc)
1302 {
1303 	char *cp = buf;
1304 	int rv;
1305 	int len = ilen;
1306 
1307 	do {
1308 		rv = read(fd, cp, len);
1309 		cp += rv;
1310 		len -= rv;
1311 	} while (rv > 0);
1312 
1313 	if (rv < 0 || len != 0) {
1314 		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
1315 		    "failed to read from stream"));
1316 		return (zfs_error(hdl, EZFS_BADSTREAM, dgettext(TEXT_DOMAIN,
1317 		    "cannot receive")));
1318 	}
1319 
1320 	if (zc) {
1321 		if (byteswap)
1322 			fletcher_4_incremental_byteswap(buf, ilen, zc);
1323 		else
1324 			fletcher_4_incremental_native(buf, ilen, zc);
1325 	}
1326 	return (0);
1327 }
1328 
1329 static int
1330 recv_read_nvlist(libzfs_handle_t *hdl, int fd, int len, nvlist_t **nvp,
1331     boolean_t byteswap, zio_cksum_t *zc)
1332 {
1333 	char *buf;
1334 	int err;
1335 
1336 	buf = zfs_alloc(hdl, len);
1337 	if (buf == NULL)
1338 		return (ENOMEM);
1339 
1340 	err = recv_read(hdl, fd, buf, len, byteswap, zc);
1341 	if (err != 0) {
1342 		free(buf);
1343 		return (err);
1344 	}
1345 
1346 	err = nvlist_unpack(buf, len, nvp, 0);
1347 	free(buf);
1348 	if (err != 0) {
1349 		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid "
1350 		    "stream (malformed nvlist)"));
1351 		return (EINVAL);
1352 	}
1353 	return (0);
1354 }
1355 
1356 static int
1357 recv_rename(libzfs_handle_t *hdl, const char *name, const char *tryname,
1358     int baselen, char *newname, recvflags_t flags)
1359 {
1360 	static int seq;
1361 	zfs_cmd_t zc = { 0 };
1362 	int err;
1363 	prop_changelist_t *clp;
1364 	zfs_handle_t *zhp;
1365 
1366 	zhp = zfs_open(hdl, name, ZFS_TYPE_DATASET);
1367 	if (zhp == NULL)
1368 		return (-1);
1369 	clp = changelist_gather(zhp, ZFS_PROP_NAME, 0,
1370 	    flags.force ? MS_FORCE : 0);
1371 	zfs_close(zhp);
1372 	if (clp == NULL)
1373 		return (-1);
1374 	err = changelist_prefix(clp);
1375 	if (err)
1376 		return (err);
1377 
1378 	zc.zc_objset_type = DMU_OST_ZFS;
1379 	(void) strlcpy(zc.zc_name, name, sizeof (zc.zc_name));
1380 
1381 	if (tryname) {
1382 		(void) strcpy(newname, tryname);
1383 
1384 		(void) strlcpy(zc.zc_value, tryname, sizeof (zc.zc_value));
1385 
1386 		if (flags.verbose) {
1387 			(void) printf("attempting rename %s to %s\n",
1388 			    zc.zc_name, zc.zc_value);
1389 		}
1390 		err = ioctl(hdl->libzfs_fd, ZFS_IOC_RENAME, &zc);
1391 		if (err == 0)
1392 			changelist_rename(clp, name, tryname);
1393 	} else {
1394 		err = ENOENT;
1395 	}
1396 
1397 	if (err != 0 && strncmp(name+baselen, "recv-", 5) != 0) {
1398 		seq++;
1399 
1400 		(void) strncpy(newname, name, baselen);
1401 		(void) snprintf(newname+baselen, ZFS_MAXNAMELEN-baselen,
1402 		    "recv-%u-%u", getpid(), seq);
1403 		(void) strlcpy(zc.zc_value, newname, sizeof (zc.zc_value));
1404 
1405 		if (flags.verbose) {
1406 			(void) printf("failed - trying rename %s to %s\n",
1407 			    zc.zc_name, zc.zc_value);
1408 		}
1409 		err = ioctl(hdl->libzfs_fd, ZFS_IOC_RENAME, &zc);
1410 		if (err == 0)
1411 			changelist_rename(clp, name, newname);
1412 		if (err && flags.verbose) {
1413 			(void) printf("failed (%u) - "
1414 			    "will try again on next pass\n", errno);
1415 		}
1416 		err = EAGAIN;
1417 	} else if (flags.verbose) {
1418 		if (err == 0)
1419 			(void) printf("success\n");
1420 		else
1421 			(void) printf("failed (%u)\n", errno);
1422 	}
1423 
1424 	(void) changelist_postfix(clp);
1425 	changelist_free(clp);
1426 
1427 	return (err);
1428 }
1429 
1430 static int
1431 recv_destroy(libzfs_handle_t *hdl, const char *name, int baselen,
1432     char *newname, recvflags_t flags)
1433 {
1434 	zfs_cmd_t zc = { 0 };
1435 	int err = 0;
1436 	prop_changelist_t *clp;
1437 	zfs_handle_t *zhp;
1438 	boolean_t defer = B_FALSE;
1439 	int spa_version;
1440 
1441 	zhp = zfs_open(hdl, name, ZFS_TYPE_DATASET);
1442 	if (zhp == NULL)
1443 		return (-1);
1444 	clp = changelist_gather(zhp, ZFS_PROP_NAME, 0,
1445 	    flags.force ? MS_FORCE : 0);
1446 	if (zfs_get_type(zhp) == ZFS_TYPE_SNAPSHOT &&
1447 	    zfs_spa_version(zhp, &spa_version) == 0 &&
1448 	    spa_version >= SPA_VERSION_USERREFS)
1449 		defer = B_TRUE;
1450 	zfs_close(zhp);
1451 	if (clp == NULL)
1452 		return (-1);
1453 	err = changelist_prefix(clp);
1454 	if (err)
1455 		return (err);
1456 
1457 	zc.zc_objset_type = DMU_OST_ZFS;
1458 	zc.zc_defer_destroy = defer;
1459 	(void) strlcpy(zc.zc_name, name, sizeof (zc.zc_name));
1460 
1461 	if (flags.verbose)
1462 		(void) printf("attempting destroy %s\n", zc.zc_name);
1463 	err = ioctl(hdl->libzfs_fd, ZFS_IOC_DESTROY, &zc);
1464 	if (err == 0) {
1465 		if (flags.verbose)
1466 			(void) printf("success\n");
1467 		changelist_remove(clp, zc.zc_name);
1468 	}
1469 
1470 	(void) changelist_postfix(clp);
1471 	changelist_free(clp);
1472 
1473 	/*
1474 	 * Deferred destroy should always succeed. Since we can't tell
1475 	 * if it destroyed the dataset or just marked it for deferred
1476 	 * destroy, always do the rename just in case.
1477 	 */
1478 	if (err != 0 || defer)
1479 		err = recv_rename(hdl, name, NULL, baselen, newname, flags);
1480 
1481 	return (err);
1482 }
1483 
/*
 * Search state shared between guid_to_name() and guid_to_name_cb().
 */
typedef struct guid_to_name_data {
	uint64_t guid;	/* guid compared against each visited dataset */
	char *name;	/* caller's buffer; receives the matching name */
} guid_to_name_data_t;
1488 
1489 static int
1490 guid_to_name_cb(zfs_handle_t *zhp, void *arg)
1491 {
1492 	guid_to_name_data_t *gtnd = arg;
1493 	int err;
1494 
1495 	if (zhp->zfs_dmustats.dds_guid == gtnd->guid) {
1496 		(void) strcpy(gtnd->name, zhp->zfs_name);
1497 		return (EEXIST);
1498 	}
1499 	err = zfs_iter_children(zhp, guid_to_name_cb, gtnd);
1500 	zfs_close(zhp);
1501 	return (err);
1502 }
1503 
1504 static int
1505 guid_to_name(libzfs_handle_t *hdl, const char *parent, uint64_t guid,
1506     char *name)
1507 {
1508 	/* exhaustive search all local snapshots */
1509 	guid_to_name_data_t gtnd;
1510 	int err = 0;
1511 	zfs_handle_t *zhp;
1512 	char *cp;
1513 
1514 	gtnd.guid = guid;
1515 	gtnd.name = name;
1516 
1517 	if (strchr(parent, '@') == NULL) {
1518 		zhp = make_dataset_handle(hdl, parent);
1519 		if (zhp != NULL) {
1520 			err = zfs_iter_children(zhp, guid_to_name_cb, &gtnd);
1521 			zfs_close(zhp);
1522 			if (err == EEXIST)
1523 				return (0);
1524 		}
1525 	}
1526 
1527 	cp = strchr(parent, '/');
1528 	if (cp)
1529 		*cp = '\0';
1530 	zhp = make_dataset_handle(hdl, parent);
1531 	if (cp)
1532 		*cp = '/';
1533 
1534 	if (zhp) {
1535 		err = zfs_iter_children(zhp, guid_to_name_cb, &gtnd);
1536 		zfs_close(zhp);
1537 	}
1538 
1539 	return (err == EEXIST ? 0 : ENOENT);
1540 
1541 }
1542 
1543 /*
1544  * Return true if dataset guid1 is created before guid2.
1545  */
1546 static int
1547 created_before(libzfs_handle_t *hdl, avl_tree_t *avl,
1548     uint64_t guid1, uint64_t guid2)
1549 {
1550 	nvlist_t *nvfs;
1551 	char *fsname, *snapname;
1552 	char buf[ZFS_MAXNAMELEN];
1553 	int rv;
1554 	zfs_node_t zn1, zn2;
1555 
1556 	if (guid2 == 0)
1557 		return (0);
1558 	if (guid1 == 0)
1559 		return (1);
1560 
1561 	nvfs = fsavl_find(avl, guid1, &snapname);
1562 	VERIFY(0 == nvlist_lookup_string(nvfs, "name", &fsname));
1563 	(void) snprintf(buf, sizeof (buf), "%s@%s", fsname, snapname);
1564 	zn1.zn_handle = zfs_open(hdl, buf, ZFS_TYPE_SNAPSHOT);
1565 	if (zn1.zn_handle == NULL)
1566 		return (-1);
1567 
1568 	nvfs = fsavl_find(avl, guid2, &snapname);
1569 	VERIFY(0 == nvlist_lookup_string(nvfs, "name", &fsname));
1570 	(void) snprintf(buf, sizeof (buf), "%s@%s", fsname, snapname);
1571 	zn2.zn_handle = zfs_open(hdl, buf, ZFS_TYPE_SNAPSHOT);
1572 	if (zn2.zn_handle == NULL) {
1573 		zfs_close(zn2.zn_handle);
1574 		return (-1);
1575 	}
1576 
1577 	rv = (zfs_snapshot_compare(&zn1, &zn2) == -1);
1578 
1579 	zfs_close(zn1.zn_handle);
1580 	zfs_close(zn2.zn_handle);
1581 
1582 	return (rv);
1583 }
1584 
1585 static int
1586 recv_incremental_replication(libzfs_handle_t *hdl, const char *tofs,
1587     recvflags_t flags, nvlist_t *stream_nv, avl_tree_t *stream_avl)
1588 {
1589 	nvlist_t *local_nv;
1590 	avl_tree_t *local_avl;
1591 	nvpair_t *fselem, *nextfselem;
1592 	char *tosnap, *fromsnap;
1593 	char newname[ZFS_MAXNAMELEN];
1594 	int error;
1595 	boolean_t needagain, progress;
1596 	char *s1, *s2;
1597 
1598 	VERIFY(0 == nvlist_lookup_string(stream_nv, "fromsnap", &fromsnap));
1599 	VERIFY(0 == nvlist_lookup_string(stream_nv, "tosnap", &tosnap));
1600 
1601 	if (flags.dryrun)
1602 		return (0);
1603 
1604 again:
1605 	needagain = progress = B_FALSE;
1606 
1607 	if ((error = gather_nvlist(hdl, tofs, fromsnap, NULL,
1608 	    &local_nv, &local_avl)) != 0)
1609 		return (error);
1610 
1611 	/*
1612 	 * Process deletes and renames
1613 	 */
1614 	for (fselem = nvlist_next_nvpair(local_nv, NULL);
1615 	    fselem; fselem = nextfselem) {
1616 		nvlist_t *nvfs, *snaps;
1617 		nvlist_t *stream_nvfs = NULL;
1618 		nvpair_t *snapelem, *nextsnapelem;
1619 		uint64_t fromguid = 0;
1620 		uint64_t originguid = 0;
1621 		uint64_t stream_originguid = 0;
1622 		uint64_t parent_fromsnap_guid, stream_parent_fromsnap_guid;
1623 		char *fsname, *stream_fsname;
1624 
1625 		nextfselem = nvlist_next_nvpair(local_nv, fselem);
1626 
1627 		VERIFY(0 == nvpair_value_nvlist(fselem, &nvfs));
1628 		VERIFY(0 == nvlist_lookup_nvlist(nvfs, "snaps", &snaps));
1629 		VERIFY(0 == nvlist_lookup_string(nvfs, "name", &fsname));
1630 		VERIFY(0 == nvlist_lookup_uint64(nvfs, "parentfromsnap",
1631 		    &parent_fromsnap_guid));
1632 		(void) nvlist_lookup_uint64(nvfs, "origin", &originguid);
1633 
1634 		/*
1635 		 * First find the stream's fs, so we can check for
1636 		 * a different origin (due to "zfs promote")
1637 		 */
1638 		for (snapelem = nvlist_next_nvpair(snaps, NULL);
1639 		    snapelem; snapelem = nvlist_next_nvpair(snaps, snapelem)) {
1640 			uint64_t thisguid;
1641 
1642 			VERIFY(0 == nvpair_value_uint64(snapelem, &thisguid));
1643 			stream_nvfs = fsavl_find(stream_avl, thisguid, NULL);
1644 
1645 			if (stream_nvfs != NULL)
1646 				break;
1647 		}
1648 
1649 		/* check for promote */
1650 		(void) nvlist_lookup_uint64(stream_nvfs, "origin",
1651 		    &stream_originguid);
1652 		if (stream_nvfs && originguid != stream_originguid) {
1653 			switch (created_before(hdl, local_avl,
1654 			    stream_originguid, originguid)) {
1655 			case 1: {
1656 				/* promote it! */
1657 				zfs_cmd_t zc = { 0 };
1658 				nvlist_t *origin_nvfs;
1659 				char *origin_fsname;
1660 
1661 				if (flags.verbose)
1662 					(void) printf("promoting %s\n", fsname);
1663 
1664 				origin_nvfs = fsavl_find(local_avl, originguid,
1665 				    NULL);
1666 				VERIFY(0 == nvlist_lookup_string(origin_nvfs,
1667 				    "name", &origin_fsname));
1668 				(void) strlcpy(zc.zc_value, origin_fsname,
1669 				    sizeof (zc.zc_value));
1670 				(void) strlcpy(zc.zc_name, fsname,
1671 				    sizeof (zc.zc_name));
1672 				error = zfs_ioctl(hdl, ZFS_IOC_PROMOTE, &zc);
1673 				if (error == 0)
1674 					progress = B_TRUE;
1675 				break;
1676 			}
1677 			default:
1678 				break;
1679 			case -1:
1680 				fsavl_destroy(local_avl);
1681 				nvlist_free(local_nv);
1682 				return (-1);
1683 			}
1684 			/*
1685 			 * We had/have the wrong origin, therefore our
1686 			 * list of snapshots is wrong.  Need to handle
1687 			 * them on the next pass.
1688 			 */
1689 			needagain = B_TRUE;
1690 			continue;
1691 		}
1692 
1693 		for (snapelem = nvlist_next_nvpair(snaps, NULL);
1694 		    snapelem; snapelem = nextsnapelem) {
1695 			uint64_t thisguid;
1696 			char *stream_snapname;
1697 			nvlist_t *found, *props;
1698 
1699 			nextsnapelem = nvlist_next_nvpair(snaps, snapelem);
1700 
1701 			VERIFY(0 == nvpair_value_uint64(snapelem, &thisguid));
1702 			found = fsavl_find(stream_avl, thisguid,
1703 			    &stream_snapname);
1704 
1705 			/* check for delete */
1706 			if (found == NULL) {
1707 				char name[ZFS_MAXNAMELEN];
1708 
1709 				if (!flags.force)
1710 					continue;
1711 
1712 				(void) snprintf(name, sizeof (name), "%s@%s",
1713 				    fsname, nvpair_name(snapelem));
1714 
1715 				error = recv_destroy(hdl, name,
1716 				    strlen(fsname)+1, newname, flags);
1717 				if (error)
1718 					needagain = B_TRUE;
1719 				else
1720 					progress = B_TRUE;
1721 				continue;
1722 			}
1723 
1724 			stream_nvfs = found;
1725 
1726 			if (0 == nvlist_lookup_nvlist(stream_nvfs, "snapprops",
1727 			    &props) && 0 == nvlist_lookup_nvlist(props,
1728 			    stream_snapname, &props)) {
1729 				zfs_cmd_t zc = { 0 };
1730 
1731 				zc.zc_cookie = B_TRUE; /* clear current props */
1732 				(void) snprintf(zc.zc_name, sizeof (zc.zc_name),
1733 				    "%s@%s", fsname, nvpair_name(snapelem));
1734 				if (zcmd_write_src_nvlist(hdl, &zc,
1735 				    props) == 0) {
1736 					(void) zfs_ioctl(hdl,
1737 					    ZFS_IOC_SET_PROP, &zc);
1738 					zcmd_free_nvlists(&zc);
1739 				}
1740 			}
1741 
1742 			/* check for different snapname */
1743 			if (strcmp(nvpair_name(snapelem),
1744 			    stream_snapname) != 0) {
1745 				char name[ZFS_MAXNAMELEN];
1746 				char tryname[ZFS_MAXNAMELEN];
1747 
1748 				(void) snprintf(name, sizeof (name), "%s@%s",
1749 				    fsname, nvpair_name(snapelem));
1750 				(void) snprintf(tryname, sizeof (name), "%s@%s",
1751 				    fsname, stream_snapname);
1752 
1753 				error = recv_rename(hdl, name, tryname,
1754 				    strlen(fsname)+1, newname, flags);
1755 				if (error)
1756 					needagain = B_TRUE;
1757 				else
1758 					progress = B_TRUE;
1759 			}
1760 
1761 			if (strcmp(stream_snapname, fromsnap) == 0)
1762 				fromguid = thisguid;
1763 		}
1764 
1765 		/* check for delete */
1766 		if (stream_nvfs == NULL) {
1767 			if (!flags.force)
1768 				continue;
1769 
1770 			error = recv_destroy(hdl, fsname, strlen(tofs)+1,
1771 			    newname, flags);
1772 			if (error)
1773 				needagain = B_TRUE;
1774 			else
1775 				progress = B_TRUE;
1776 			continue;
1777 		}
1778 
1779 		if (fromguid == 0 && flags.verbose) {
1780 			(void) printf("local fs %s does not have fromsnap "
1781 			    "(%s in stream); must have been deleted locally; "
1782 			    "ignoring\n", fsname, fromsnap);
1783 			continue;
1784 		}
1785 
1786 		VERIFY(0 == nvlist_lookup_string(stream_nvfs,
1787 		    "name", &stream_fsname));
1788 		VERIFY(0 == nvlist_lookup_uint64(stream_nvfs,
1789 		    "parentfromsnap", &stream_parent_fromsnap_guid));
1790 
1791 		s1 = strrchr(fsname, '/');
1792 		s2 = strrchr(stream_fsname, '/');
1793 
1794 		/* check for rename */
1795 		if ((stream_parent_fromsnap_guid != 0 &&
1796 		    stream_parent_fromsnap_guid != parent_fromsnap_guid) ||
1797 		    ((s1 != NULL) && (s2 != NULL) && strcmp(s1, s2) != 0)) {
1798 			nvlist_t *parent;
1799 			char tryname[ZFS_MAXNAMELEN];
1800 
1801 			parent = fsavl_find(local_avl,
1802 			    stream_parent_fromsnap_guid, NULL);
1803 			/*
1804 			 * NB: parent might not be found if we used the
1805 			 * tosnap for stream_parent_fromsnap_guid,
1806 			 * because the parent is a newly-created fs;
1807 			 * we'll be able to rename it after we recv the
1808 			 * new fs.
1809 			 */
1810 			if (parent != NULL) {
1811 				char *pname;
1812 
1813 				VERIFY(0 == nvlist_lookup_string(parent, "name",
1814 				    &pname));
1815 				(void) snprintf(tryname, sizeof (tryname),
1816 				    "%s%s", pname, strrchr(stream_fsname, '/'));
1817 			} else {
1818 				tryname[0] = '\0';
1819 				if (flags.verbose) {
1820 					(void) printf("local fs %s new parent "
1821 					    "not found\n", fsname);
1822 				}
1823 			}
1824 
1825 			error = recv_rename(hdl, fsname, tryname,
1826 			    strlen(tofs)+1, newname, flags);
1827 			if (error)
1828 				needagain = B_TRUE;
1829 			else
1830 				progress = B_TRUE;
1831 		}
1832 	}
1833 
1834 	fsavl_destroy(local_avl);
1835 	nvlist_free(local_nv);
1836 
1837 	if (needagain && progress) {
1838 		/* do another pass to fix up temporary names */
1839 		if (flags.verbose)
1840 			(void) printf("another pass:\n");
1841 		goto again;
1842 	}
1843 
1844 	return (needagain);
1845 }
1846 
1847 static int
1848 zfs_receive_package(libzfs_handle_t *hdl, int fd, const char *destname,
1849     recvflags_t flags, dmu_replay_record_t *drr, zio_cksum_t *zc,
1850     char **top_zfs)
1851 {
1852 	nvlist_t *stream_nv = NULL;
1853 	avl_tree_t *stream_avl = NULL;
1854 	char *fromsnap = NULL;
1855 	char tofs[ZFS_MAXNAMELEN];
1856 	char errbuf[1024];
1857 	dmu_replay_record_t drre;
1858 	int error;
1859 	boolean_t anyerr = B_FALSE;
1860 	boolean_t softerr = B_FALSE;
1861 
1862 	(void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
1863 	    "cannot receive"));
1864 
1865 	if (strchr(destname, '@')) {
1866 		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
1867 		    "can not specify snapshot name for multi-snapshot stream"));
1868 		return (zfs_error(hdl, EZFS_BADSTREAM, errbuf));
1869 	}
1870 
1871 	assert(drr->drr_type == DRR_BEGIN);
1872 	assert(drr->drr_u.drr_begin.drr_magic == DMU_BACKUP_MAGIC);
1873 	assert(DMU_GET_STREAM_HDRTYPE(drr->drr_u.drr_begin.drr_versioninfo) ==
1874 	    DMU_COMPOUNDSTREAM);
1875 
1876 	/*
1877 	 * Read in the nvlist from the stream.
1878 	 */
1879 	if (drr->drr_payloadlen != 0) {
1880 		if (!flags.isprefix) {
1881 			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
1882 			    "must use -d to receive replication "
1883 			    "(send -R) stream"));
1884 			return (zfs_error(hdl, EZFS_BADSTREAM, errbuf));
1885 		}
1886 
1887 		error = recv_read_nvlist(hdl, fd, drr->drr_payloadlen,
1888 		    &stream_nv, flags.byteswap, zc);
1889 		if (error) {
1890 			error = zfs_error(hdl, EZFS_BADSTREAM, errbuf);
1891 			goto out;
1892 		}
1893 	}
1894 
1895 	/*
1896 	 * Read in the end record and verify checksum.
1897 	 */
1898 	if (0 != (error = recv_read(hdl, fd, &drre, sizeof (drre),
1899 	    flags.byteswap, NULL)))
1900 		goto out;
1901 	if (flags.byteswap) {
1902 		drre.drr_type = BSWAP_32(drre.drr_type);
1903 		drre.drr_u.drr_end.drr_checksum.zc_word[0] =
1904 		    BSWAP_64(drre.drr_u.drr_end.drr_checksum.zc_word[0]);
1905 		drre.drr_u.drr_end.drr_checksum.zc_word[1] =
1906 		    BSWAP_64(drre.drr_u.drr_end.drr_checksum.zc_word[1]);
1907 		drre.drr_u.drr_end.drr_checksum.zc_word[2] =
1908 		    BSWAP_64(drre.drr_u.drr_end.drr_checksum.zc_word[2]);
1909 		drre.drr_u.drr_end.drr_checksum.zc_word[3] =
1910 		    BSWAP_64(drre.drr_u.drr_end.drr_checksum.zc_word[3]);
1911 	}
1912 	if (drre.drr_type != DRR_END) {
1913 		error = zfs_error(hdl, EZFS_BADSTREAM, errbuf);
1914 		goto out;
1915 	}
1916 	if (!ZIO_CHECKSUM_EQUAL(drre.drr_u.drr_end.drr_checksum, *zc)) {
1917 		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
1918 		    "incorrect header checksum"));
1919 		error = zfs_error(hdl, EZFS_BADSTREAM, errbuf);
1920 		goto out;
1921 	}
1922 
1923 	(void) nvlist_lookup_string(stream_nv, "fromsnap", &fromsnap);
1924 
1925 	if (drr->drr_payloadlen != 0) {
1926 		nvlist_t *stream_fss;
1927 
1928 		VERIFY(0 == nvlist_lookup_nvlist(stream_nv, "fss",
1929 		    &stream_fss));
1930 		if ((stream_avl = fsavl_create(stream_fss)) == NULL) {
1931 			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
1932 			    "couldn't allocate avl tree"));
1933 			error = zfs_error(hdl, EZFS_NOMEM, errbuf);
1934 			goto out;
1935 		}
1936 
1937 		if (fromsnap != NULL) {
1938 			(void) strlcpy(tofs, destname, ZFS_MAXNAMELEN);
1939 			if (flags.isprefix) {
1940 				int i = strcspn(drr->drr_u.drr_begin.drr_toname,
1941 				    "/@");
1942 				/* zfs_receive_one() will create_parents() */
1943 				(void) strlcat(tofs,
1944 				    &drr->drr_u.drr_begin.drr_toname[i],
1945 				    ZFS_MAXNAMELEN);
1946 				*strchr(tofs, '@') = '\0';
1947 			}
1948 			softerr = recv_incremental_replication(hdl, tofs,
1949 			    flags, stream_nv, stream_avl);
1950 		}
1951 	}
1952 
1953 
1954 	/* Finally, receive each contained stream */
1955 	do {
1956 		/*
1957 		 * we should figure out if it has a recoverable
1958 		 * error, in which case do a recv_skip() and drive on.
1959 		 * Note, if we fail due to already having this guid,
1960 		 * zfs_receive_one() will take care of it (ie,
1961 		 * recv_skip() and return 0).
1962 		 */
1963 		error = zfs_receive_impl(hdl, destname, flags, fd,
1964 		    stream_avl, top_zfs);
1965 		if (error == ENODATA) {
1966 			error = 0;
1967 			break;
1968 		}
1969 		anyerr |= error;
1970 	} while (error == 0);
1971 
1972 	if (drr->drr_payloadlen != 0 && fromsnap != NULL) {
1973 		/*
1974 		 * Now that we have the fs's they sent us, try the
1975 		 * renames again.
1976 		 */
1977 		softerr = recv_incremental_replication(hdl, tofs, flags,
1978 		    stream_nv, stream_avl);
1979 	}
1980 
1981 out:
1982 	fsavl_destroy(stream_avl);
1983 	if (stream_nv)
1984 		nvlist_free(stream_nv);
1985 	if (softerr)
1986 		error = -2;
1987 	if (anyerr)
1988 		error = -1;
1989 	return (error);
1990 }
1991 
1992 static int
1993 recv_skip(libzfs_handle_t *hdl, int fd, boolean_t byteswap)
1994 {
1995 	dmu_replay_record_t *drr;
1996 	void *buf = malloc(1<<20);
1997 	char errbuf[1024];
1998 
1999 	(void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
2000 	    "cannot receive:"));
2001 
2002 	/* XXX would be great to use lseek if possible... */
2003 	drr = buf;
2004 
2005 	while (recv_read(hdl, fd, drr, sizeof (dmu_replay_record_t),
2006 	    byteswap, NULL) == 0) {
2007 		if (byteswap)
2008 			drr->drr_type = BSWAP_32(drr->drr_type);
2009 
2010 		switch (drr->drr_type) {
2011 		case DRR_BEGIN:
2012 			/* NB: not to be used on v2 stream packages */
2013 			if (drr->drr_payloadlen != 0) {
2014 				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
2015 				    "invalid substream header"));
2016 				return (zfs_error(hdl, EZFS_BADSTREAM, errbuf));
2017 			}
2018 			break;
2019 
2020 		case DRR_END:
2021 			free(buf);
2022 			return (0);
2023 
2024 		case DRR_OBJECT:
2025 			if (byteswap) {
2026 				drr->drr_u.drr_object.drr_bonuslen =
2027 				    BSWAP_32(drr->drr_u.drr_object.
2028 				    drr_bonuslen);
2029 			}
2030 			(void) recv_read(hdl, fd, buf,
2031 			    P2ROUNDUP(drr->drr_u.drr_object.drr_bonuslen, 8),
2032 			    B_FALSE, NULL);
2033 			break;
2034 
2035 		case DRR_WRITE:
2036 			if (byteswap) {
2037 				drr->drr_u.drr_write.drr_length =
2038 				    BSWAP_64(drr->drr_u.drr_write.drr_length);
2039 			}
2040 			(void) recv_read(hdl, fd, buf,
2041 			    drr->drr_u.drr_write.drr_length, B_FALSE, NULL);
2042 			break;
2043 
2044 		case DRR_WRITE_BYREF:
2045 		case DRR_FREEOBJECTS:
2046 		case DRR_FREE:
2047 			break;
2048 
2049 		default:
2050 			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
2051 			    "invalid record type"));
2052 			return (zfs_error(hdl, EZFS_BADSTREAM, errbuf));
2053 		}
2054 	}
2055 
2056 	free(buf);
2057 	return (-1);
2058 }
2059 
2060 /*
2061  * Restores a backup of tosnap from the file descriptor specified by infd.
2062  */
2063 static int
2064 zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap,
2065     recvflags_t flags, dmu_replay_record_t *drr,
2066     dmu_replay_record_t *drr_noswap, avl_tree_t *stream_avl,
2067     char **top_zfs)
2068 {
2069 	zfs_cmd_t zc = { 0 };
2070 	time_t begin_time;
2071 	int ioctl_err, ioctl_errno, err, choplen;
2072 	char *cp;
2073 	struct drr_begin *drrb = &drr->drr_u.drr_begin;
2074 	char errbuf[1024];
2075 	char chopprefix[ZFS_MAXNAMELEN];
2076 	boolean_t newfs = B_FALSE;
2077 	boolean_t stream_wantsnewfs;
2078 	uint64_t parent_snapguid = 0;
2079 	prop_changelist_t *clp = NULL;
2080 	nvlist_t *snapprops_nvlist = NULL;
2081 
2082 	begin_time = time(NULL);
2083 
2084 	(void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
2085 	    "cannot receive"));
2086 
2087 	if (stream_avl != NULL) {
2088 		char *snapname;
2089 		nvlist_t *fs = fsavl_find(stream_avl, drrb->drr_toguid,
2090 		    &snapname);
2091 		nvlist_t *props;
2092 		int ret;
2093 
2094 		(void) nvlist_lookup_uint64(fs, "parentfromsnap",
2095 		    &parent_snapguid);
2096 		err = nvlist_lookup_nvlist(fs, "props", &props);
2097 		if (err)
2098 			VERIFY(0 == nvlist_alloc(&props, NV_UNIQUE_NAME, 0));
2099 
2100 		if (flags.canmountoff) {
2101 			VERIFY(0 == nvlist_add_uint64(props,
2102 			    zfs_prop_to_name(ZFS_PROP_CANMOUNT), 0));
2103 		}
2104 		ret = zcmd_write_src_nvlist(hdl, &zc, props);
2105 		if (err)
2106 			nvlist_free(props);
2107 
2108 		if (0 == nvlist_lookup_nvlist(fs, "snapprops", &props)) {
2109 			VERIFY(0 == nvlist_lookup_nvlist(props,
2110 			    snapname, &snapprops_nvlist));
2111 		}
2112 
2113 		if (ret != 0)
2114 			return (-1);
2115 	}
2116 
2117 	/*
2118 	 * Determine how much of the snapshot name stored in the stream
2119 	 * we are going to tack on to the name they specified on the
2120 	 * command line, and how much we are going to chop off.
2121 	 *
2122 	 * If they specified a snapshot, chop the entire name stored in
2123 	 * the stream.
2124 	 */
2125 	(void) strcpy(chopprefix, drrb->drr_toname);
2126 	if (flags.isprefix) {
2127 		/*
2128 		 * They specified a fs with -d, we want to tack on
2129 		 * everything but the pool name stored in the stream
2130 		 */
2131 		if (strchr(tosnap, '@')) {
2132 			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid "
2133 			    "argument - snapshot not allowed with -d"));
2134 			return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf));
2135 		}
2136 		cp = strchr(chopprefix, '/');
2137 		if (cp == NULL)
2138 			cp = strchr(chopprefix, '@');
2139 		*cp = '\0';
2140 	} else if (strchr(tosnap, '@') == NULL) {
2141 		/*
2142 		 * If they specified a filesystem without -d, we want to
2143 		 * tack on everything after the fs specified in the
2144 		 * first name from the stream.
2145 		 */
2146 		cp = strchr(chopprefix, '@');
2147 		*cp = '\0';
2148 	}
2149 	choplen = strlen(chopprefix);
2150 
2151 	/*
2152 	 * Determine name of destination snapshot, store in zc_value.
2153 	 */
2154 	(void) strcpy(zc.zc_top_ds, tosnap);
2155 	(void) strcpy(zc.zc_value, tosnap);
2156 	(void) strncat(zc.zc_value, drrb->drr_toname+choplen,
2157 	    sizeof (zc.zc_value));
2158 	if (!zfs_name_valid(zc.zc_value, ZFS_TYPE_SNAPSHOT)) {
2159 		zcmd_free_nvlists(&zc);
2160 		return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf));
2161 	}
2162 
2163 	/*
2164 	 * Determine the name of the origin snapshot, store in zc_string.
2165 	 */
2166 	if (drrb->drr_flags & DRR_FLAG_CLONE) {
2167 		if (guid_to_name(hdl, tosnap,
2168 		    drrb->drr_fromguid, zc.zc_string) != 0) {
2169 			zcmd_free_nvlists(&zc);
2170 			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
2171 			    "local origin for clone %s does not exist"),
2172 			    zc.zc_value);
2173 			return (zfs_error(hdl, EZFS_NOENT, errbuf));
2174 		}
2175 		if (flags.verbose)
2176 			(void) printf("found clone origin %s\n", zc.zc_string);
2177 	}
2178 
2179 	stream_wantsnewfs = (drrb->drr_fromguid == NULL ||
2180 	    (drrb->drr_flags & DRR_FLAG_CLONE));
2181 
2182 	if (stream_wantsnewfs) {
2183 		/*
2184 		 * if the parent fs does not exist, look for it based on
2185 		 * the parent snap GUID
2186 		 */
2187 		(void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
2188 		    "cannot receive new filesystem stream"));
2189 
2190 		(void) strcpy(zc.zc_name, zc.zc_value);
2191 		cp = strrchr(zc.zc_name, '/');
2192 		if (cp)
2193 			*cp = '\0';
2194 		if (cp &&
2195 		    !zfs_dataset_exists(hdl, zc.zc_name, ZFS_TYPE_DATASET)) {
2196 			char suffix[ZFS_MAXNAMELEN];
2197 			(void) strcpy(suffix, strrchr(zc.zc_value, '/'));
2198 			if (guid_to_name(hdl, tosnap, parent_snapguid,
2199 			    zc.zc_value) == 0) {
2200 				*strchr(zc.zc_value, '@') = '\0';
2201 				(void) strcat(zc.zc_value, suffix);
2202 			}
2203 		}
2204 	} else {
2205 		/*
2206 		 * if the fs does not exist, look for it based on the
2207 		 * fromsnap GUID
2208 		 */
2209 		(void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
2210 		    "cannot receive incremental stream"));
2211 
2212 		(void) strcpy(zc.zc_name, zc.zc_value);
2213 		*strchr(zc.zc_name, '@') = '\0';
2214 
2215 		if (!zfs_dataset_exists(hdl, zc.zc_name, ZFS_TYPE_DATASET)) {
2216 			char snap[ZFS_MAXNAMELEN];
2217 			(void) strcpy(snap, strchr(zc.zc_value, '@'));
2218 			if (guid_to_name(hdl, tosnap, drrb->drr_fromguid,
2219 			    zc.zc_value) == 0) {
2220 				*strchr(zc.zc_value, '@') = '\0';
2221 				(void) strcat(zc.zc_value, snap);
2222 			}
2223 		}
2224 	}
2225 
2226 	(void) strcpy(zc.zc_name, zc.zc_value);
2227 	*strchr(zc.zc_name, '@') = '\0';
2228 
2229 	if (zfs_dataset_exists(hdl, zc.zc_name, ZFS_TYPE_DATASET)) {
2230 		zfs_handle_t *zhp;
2231 		/*
2232 		 * Destination fs exists.  Therefore this should either
2233 		 * be an incremental, or the stream specifies a new fs
2234 		 * (full stream or clone) and they want us to blow it
2235 		 * away (and have therefore specified -F and removed any
2236 		 * snapshots).
2237 		 */
2238 
2239 		if (stream_wantsnewfs) {
2240 			if (!flags.force) {
2241 				zcmd_free_nvlists(&zc);
2242 				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
2243 				    "destination '%s' exists\n"
2244 				    "must specify -F to overwrite it"),
2245 				    zc.zc_name);
2246 				return (zfs_error(hdl, EZFS_EXISTS, errbuf));
2247 			}
2248 			if (ioctl(hdl->libzfs_fd, ZFS_IOC_SNAPSHOT_LIST_NEXT,
2249 			    &zc) == 0) {
2250 				zcmd_free_nvlists(&zc);
2251 				zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
2252 				    "destination has snapshots (eg. %s)\n"
2253 				    "must destroy them to overwrite it"),
2254 				    zc.zc_name);
2255 				return (zfs_error(hdl, EZFS_EXISTS, errbuf));
2256 			}
2257 		}
2258 
2259 		if ((zhp = zfs_open(hdl, zc.zc_name,
2260 		    ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME)) == NULL) {
2261 			zcmd_free_nvlists(&zc);
2262 			return (-1);
2263 		}
2264 
2265 		if (stream_wantsnewfs &&
2266 		    zhp->zfs_dmustats.dds_origin[0]) {
2267 			zcmd_free_nvlists(&zc);
2268 			zfs_close(zhp);
2269 			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
2270 			    "destination '%s' is a clone\n"
2271 			    "must destroy it to overwrite it"),
2272 			    zc.zc_name);
2273 			return (zfs_error(hdl, EZFS_EXISTS, errbuf));
2274 		}
2275 
2276 		if (!flags.dryrun && zhp->zfs_type == ZFS_TYPE_FILESYSTEM &&
2277 		    stream_wantsnewfs) {
2278 			/* We can't do online recv in this case */
2279 			clp = changelist_gather(zhp, ZFS_PROP_NAME, 0, 0);
2280 			if (clp == NULL) {
2281 				zfs_close(zhp);
2282 				zcmd_free_nvlists(&zc);
2283 				return (-1);
2284 			}
2285 			if (changelist_prefix(clp) != 0) {
2286 				changelist_free(clp);
2287 				zfs_close(zhp);
2288 				zcmd_free_nvlists(&zc);
2289 				return (-1);
2290 			}
2291 		}
2292 		zfs_close(zhp);
2293 	} else {
2294 		/*
2295 		 * Destination filesystem does not exist.  Therefore we better
2296 		 * be creating a new filesystem (either from a full backup, or
2297 		 * a clone).  It would therefore be invalid if the user
2298 		 * specified only the pool name (i.e. if the destination name
2299 		 * contained no slash character).
2300 		 */
2301 		if (!stream_wantsnewfs ||
2302 		    (cp = strrchr(zc.zc_name, '/')) == NULL) {
2303 			zcmd_free_nvlists(&zc);
2304 			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
2305 			    "destination '%s' does not exist"), zc.zc_name);
2306 			return (zfs_error(hdl, EZFS_NOENT, errbuf));
2307 		}
2308 
2309 		/*
2310 		 * Trim off the final dataset component so we perform the
2311 		 * recvbackup ioctl to the filesystem's parent.
2312 		 */
2313 		*cp = '\0';
2314 
2315 		if (flags.isprefix && !flags.dryrun &&
2316 		    create_parents(hdl, zc.zc_value, strlen(tosnap)) != 0) {
2317 			zcmd_free_nvlists(&zc);
2318 			return (zfs_error(hdl, EZFS_BADRESTORE, errbuf));
2319 		}
2320 
2321 		newfs = B_TRUE;
2322 	}
2323 
2324 	zc.zc_begin_record = drr_noswap->drr_u.drr_begin;
2325 	zc.zc_cookie = infd;
2326 	zc.zc_guid = flags.force;
2327 	if (flags.verbose) {
2328 		(void) printf("%s %s stream of %s into %s\n",
2329 		    flags.dryrun ? "would receive" : "receiving",
2330 		    drrb->drr_fromguid ? "incremental" : "full",
2331 		    drrb->drr_toname, zc.zc_value);
2332 		(void) fflush(stdout);
2333 	}
2334 
2335 	if (flags.dryrun) {
2336 		zcmd_free_nvlists(&zc);
2337 		return (recv_skip(hdl, infd, flags.byteswap));
2338 	}
2339 
2340 	err = ioctl_err = zfs_ioctl(hdl, ZFS_IOC_RECV, &zc);
2341 	ioctl_errno = errno;
2342 	zcmd_free_nvlists(&zc);
2343 
2344 	if (err == 0 && snapprops_nvlist) {
2345 		zfs_cmd_t zc2 = { 0 };
2346 
2347 		(void) strcpy(zc2.zc_name, zc.zc_value);
2348 		if (zcmd_write_src_nvlist(hdl, &zc2, snapprops_nvlist) == 0) {
2349 			(void) zfs_ioctl(hdl, ZFS_IOC_SET_PROP, &zc2);
2350 			zcmd_free_nvlists(&zc2);
2351 		}
2352 	}
2353 
2354 	if (err && (ioctl_errno == ENOENT || ioctl_errno == ENODEV)) {
2355 		/*
2356 		 * It may be that this snapshot already exists,
2357 		 * in which case we want to consume & ignore it
2358 		 * rather than failing.
2359 		 */
2360 		avl_tree_t *local_avl;
2361 		nvlist_t *local_nv, *fs;
2362 		char *cp = strchr(zc.zc_value, '@');
2363 
2364 		/*
2365 		 * XXX Do this faster by just iterating over snaps in
2366 		 * this fs.  Also if zc_value does not exist, we will
2367 		 * get a strange "does not exist" error message.
2368 		 */
2369 		*cp = '\0';
2370 		if (gather_nvlist(hdl, zc.zc_value, NULL, NULL,
2371 		    &local_nv, &local_avl) == 0) {
2372 			*cp = '@';
2373 			fs = fsavl_find(local_avl, drrb->drr_toguid, NULL);
2374 			fsavl_destroy(local_avl);
2375 			nvlist_free(local_nv);
2376 
2377 			if (fs != NULL) {
2378 				if (flags.verbose) {
2379 					(void) printf("snap %s already exists; "
2380 					    "ignoring\n", zc.zc_value);
2381 				}
2382 				ioctl_err = recv_skip(hdl, infd,
2383 				    flags.byteswap);
2384 			}
2385 		}
2386 		*cp = '@';
2387 	}
2388 
2389 
2390 	if (ioctl_err != 0) {
2391 		switch (ioctl_errno) {
2392 		case ENODEV:
2393 			cp = strchr(zc.zc_value, '@');
2394 			*cp = '\0';
2395 			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
2396 			    "most recent snapshot of %s does not\n"
2397 			    "match incremental source"), zc.zc_value);
2398 			(void) zfs_error(hdl, EZFS_BADRESTORE, errbuf);
2399 			*cp = '@';
2400 			break;
2401 		case ETXTBSY:
2402 			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
2403 			    "destination %s has been modified\n"
2404 			    "since most recent snapshot"), zc.zc_name);
2405 			(void) zfs_error(hdl, EZFS_BADRESTORE, errbuf);
2406 			break;
2407 		case EEXIST:
2408 			cp = strchr(zc.zc_value, '@');
2409 			if (newfs) {
2410 				/* it's the containing fs that exists */
2411 				*cp = '\0';
2412 			}
2413 			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
2414 			    "destination already exists"));
2415 			(void) zfs_error_fmt(hdl, EZFS_EXISTS,
2416 			    dgettext(TEXT_DOMAIN, "cannot restore to %s"),
2417 			    zc.zc_value);
2418 			*cp = '@';
2419 			break;
2420 		case EINVAL:
2421 			(void) zfs_error(hdl, EZFS_BADSTREAM, errbuf);
2422 			break;
2423 		case ECKSUM:
2424 			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
2425 			    "invalid stream (checksum mismatch)"));
2426 			(void) zfs_error(hdl, EZFS_BADSTREAM, errbuf);
2427 			break;
2428 		default:
2429 			(void) zfs_standard_error(hdl, ioctl_errno, errbuf);
2430 		}
2431 	}
2432 
2433 	/*
2434 	 * Mount the target filesystem (if created).  Also mount any
2435 	 * children of the target filesystem if we did a replication
2436 	 * receive (indicated by stream_avl being non-NULL).
2437 	 */
2438 	cp = strchr(zc.zc_value, '@');
2439 	if (cp && (ioctl_err == 0 || !newfs)) {
2440 		zfs_handle_t *h;
2441 
2442 		*cp = '\0';
2443 		h = zfs_open(hdl, zc.zc_value,
2444 		    ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME);
2445 		if (h != NULL) {
2446 			if (h->zfs_type == ZFS_TYPE_VOLUME) {
2447 				*cp = '@';
2448 			} else if (newfs || stream_avl) {
2449 				/*
2450 				 * Track the first/top of hierarchy fs,
2451 				 * for mounting and sharing later.
2452 				 */
2453 				if (top_zfs && *top_zfs == NULL)
2454 					*top_zfs = zfs_strdup(hdl, zc.zc_value);
2455 			}
2456 			zfs_close(h);
2457 		}
2458 		*cp = '@';
2459 	}
2460 
2461 	if (clp) {
2462 		err |= changelist_postfix(clp);
2463 		changelist_free(clp);
2464 	}
2465 
2466 	if (err || ioctl_err)
2467 		return (-1);
2468 
2469 	if (flags.verbose) {
2470 		char buf1[64];
2471 		char buf2[64];
2472 		uint64_t bytes = zc.zc_cookie;
2473 		time_t delta = time(NULL) - begin_time;
2474 		if (delta == 0)
2475 			delta = 1;
2476 		zfs_nicenum(bytes, buf1, sizeof (buf1));
2477 		zfs_nicenum(bytes/delta, buf2, sizeof (buf1));
2478 
2479 		(void) printf("received %sB stream in %lu seconds (%sB/sec)\n",
2480 		    buf1, delta, buf2);
2481 	}
2482 
2483 	return (0);
2484 }
2485 
2486 static int
2487 zfs_receive_impl(libzfs_handle_t *hdl, const char *tosnap, recvflags_t flags,
2488     int infd, avl_tree_t *stream_avl, char **top_zfs)
2489 {
2490 	int err;
2491 	dmu_replay_record_t drr, drr_noswap;
2492 	struct drr_begin *drrb = &drr.drr_u.drr_begin;
2493 	char errbuf[1024];
2494 	zio_cksum_t zcksum = { 0 };
2495 	uint64_t featureflags;
2496 	int hdrtype;
2497 
2498 	(void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
2499 	    "cannot receive"));
2500 
2501 	if (flags.isprefix &&
2502 	    !zfs_dataset_exists(hdl, tosnap, ZFS_TYPE_DATASET)) {
2503 		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "specified fs "
2504 		    "(%s) does not exist"), tosnap);
2505 		return (zfs_error(hdl, EZFS_NOENT, errbuf));
2506 	}
2507 
2508 	/* read in the BEGIN record */
2509 	if (0 != (err = recv_read(hdl, infd, &drr, sizeof (drr), B_FALSE,
2510 	    &zcksum)))
2511 		return (err);
2512 
2513 	if (drr.drr_type == DRR_END || drr.drr_type == BSWAP_32(DRR_END)) {
2514 		/* It's the double end record at the end of a package */
2515 		return (ENODATA);
2516 	}
2517 
2518 	/* the kernel needs the non-byteswapped begin record */
2519 	drr_noswap = drr;
2520 
2521 	flags.byteswap = B_FALSE;
2522 	if (drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC)) {
2523 		/*
2524 		 * We computed the checksum in the wrong byteorder in
2525 		 * recv_read() above; do it again correctly.
2526 		 */
2527 		bzero(&zcksum, sizeof (zio_cksum_t));
2528 		fletcher_4_incremental_byteswap(&drr, sizeof (drr), &zcksum);
2529 		flags.byteswap = B_TRUE;
2530 
2531 		drr.drr_type = BSWAP_32(drr.drr_type);
2532 		drr.drr_payloadlen = BSWAP_32(drr.drr_payloadlen);
2533 		drrb->drr_magic = BSWAP_64(drrb->drr_magic);
2534 		drrb->drr_versioninfo = BSWAP_64(drrb->drr_versioninfo);
2535 		drrb->drr_creation_time = BSWAP_64(drrb->drr_creation_time);
2536 		drrb->drr_type = BSWAP_32(drrb->drr_type);
2537 		drrb->drr_flags = BSWAP_32(drrb->drr_flags);
2538 		drrb->drr_toguid = BSWAP_64(drrb->drr_toguid);
2539 		drrb->drr_fromguid = BSWAP_64(drrb->drr_fromguid);
2540 	}
2541 
2542 	if (drrb->drr_magic != DMU_BACKUP_MAGIC || drr.drr_type != DRR_BEGIN) {
2543 		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid "
2544 		    "stream (bad magic number)"));
2545 		return (zfs_error(hdl, EZFS_BADSTREAM, errbuf));
2546 	}
2547 
2548 	featureflags = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo);
2549 	hdrtype = DMU_GET_STREAM_HDRTYPE(drrb->drr_versioninfo);
2550 
2551 	if (!DMU_STREAM_SUPPORTED(featureflags) ||
2552 	    (hdrtype != DMU_SUBSTREAM && hdrtype != DMU_COMPOUNDSTREAM)) {
2553 		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
2554 		    "stream has unsupported feature, feature flags = %lx"),
2555 		    featureflags);
2556 		return (zfs_error(hdl, EZFS_BADSTREAM, errbuf));
2557 	}
2558 
2559 	if (strchr(drrb->drr_toname, '@') == NULL) {
2560 		zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid "
2561 		    "stream (bad snapshot name)"));
2562 		return (zfs_error(hdl, EZFS_BADSTREAM, errbuf));
2563 	}
2564 
2565 	if (DMU_GET_STREAM_HDRTYPE(drrb->drr_versioninfo) == DMU_SUBSTREAM) {
2566 		return (zfs_receive_one(hdl, infd, tosnap, flags,
2567 		    &drr, &drr_noswap, stream_avl, top_zfs));
2568 	} else {  /* must be DMU_COMPOUNDSTREAM */
2569 		assert(DMU_GET_STREAM_HDRTYPE(drrb->drr_versioninfo) ==
2570 		    DMU_COMPOUNDSTREAM);
2571 		return (zfs_receive_package(hdl, infd, tosnap, flags,
2572 		    &drr, &zcksum, top_zfs));
2573 	}
2574 }
2575 
2576 /*
2577  * Restores a backup of tosnap from the file descriptor specified by infd.
2578  * Return 0 on total success, -2 if some things couldn't be
2579  * destroyed/renamed/promoted, -1 if some things couldn't be received.
2580  * (-1 will override -2).
2581  */
2582 int
2583 zfs_receive(libzfs_handle_t *hdl, const char *tosnap, recvflags_t flags,
2584     int infd, avl_tree_t *stream_avl)
2585 {
2586 	char *top_zfs = NULL;
2587 	int err;
2588 
2589 	err = zfs_receive_impl(hdl, tosnap, flags, infd, stream_avl, &top_zfs);
2590 
2591 	if (err == 0 && !flags.nomount && top_zfs) {
2592 		zfs_handle_t *zhp;
2593 		prop_changelist_t *clp;
2594 
2595 		zhp = zfs_open(hdl, top_zfs, ZFS_TYPE_FILESYSTEM);
2596 		if (zhp != NULL) {
2597 			clp = changelist_gather(zhp, ZFS_PROP_MOUNTPOINT,
2598 			    CL_GATHER_MOUNT_ALWAYS, 0);
2599 			zfs_close(zhp);
2600 			if (clp != NULL) {
2601 				/* mount and share received datasets */
2602 				err = changelist_postfix(clp);
2603 				changelist_free(clp);
2604 			}
2605 		}
2606 		if (zhp == NULL || clp == NULL || err)
2607 			err = -1;
2608 	}
2609 	if (top_zfs)
2610 		free(top_zfs);
2611 
2612 	return (err);
2613 }
2614