xref: /titanic_41/usr/src/uts/common/fs/zfs/dmu.c (revision 75d01c9ab5ef6f1bbac9f9d4eb379d5c38583d82)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 #include <sys/dmu.h>
30 #include <sys/dmu_impl.h>
31 #include <sys/dmu_tx.h>
32 #include <sys/dbuf.h>
33 #include <sys/dnode.h>
34 #include <sys/zfs_context.h>
35 #include <sys/dmu_objset.h>
36 #include <sys/dmu_traverse.h>
37 #include <sys/dsl_dataset.h>
38 #include <sys/dsl_dir.h>
39 #include <sys/dsl_pool.h>
40 #include <sys/dmu_zfetch.h>
41 #include <sys/zfs_ioctl.h>
42 #include <sys/zap.h>
43 
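/*
 * Per-object-type information, indexed by dmu_object_type_t.  Each entry
 * gives the byteswap routine used when interpreting blocks of that type
 * from a machine of the opposite endianness, a boolean that (judging from
 * its users) flags the type as metadata, and a human-readable name.
 */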
44 const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = {
45 	{	byteswap_uint8_array,	TRUE,	"unallocated"		},
46 	{	zap_byteswap,		TRUE,	"object directory"	},
47 	{	byteswap_uint64_array,	TRUE,	"object array"		},
48 	{	byteswap_uint8_array,	TRUE,	"packed nvlist"		},
49 	{	byteswap_uint64_array,	TRUE,	"packed nvlist size"	},
50 	{	byteswap_uint64_array,	TRUE,	"bplist"		},
51 	{	byteswap_uint64_array,	TRUE,	"bplist header"		},
52 	{	byteswap_uint64_array,	TRUE,	"SPA space map header"	},
53 	{	byteswap_uint64_array,	TRUE,	"SPA space map"		},
54 	{	byteswap_uint64_array,	TRUE,	"ZIL intent log"	},
55 	{	dnode_buf_byteswap,	TRUE,	"DMU dnode"		},
56 	{	dmu_objset_byteswap,	TRUE,	"DMU objset"		},
57 	{	byteswap_uint64_array,	TRUE,	"DSL directory"		},
58 	{	zap_byteswap,		TRUE,	"DSL directory child map"},
59 	{	zap_byteswap,		TRUE,	"DSL dataset snap map"	},
60 	{	zap_byteswap,		TRUE,	"DSL props"		},
61 	{	byteswap_uint64_array,	TRUE,	"DSL dataset"		},
62 	{	zfs_znode_byteswap,	TRUE,	"ZFS znode"		},
63 	{	zfs_acl_byteswap,	TRUE,	"ZFS ACL"		},
64 	{	byteswap_uint8_array,	FALSE,	"ZFS plain file"	},
65 	{	zap_byteswap,		TRUE,	"ZFS directory"		},
66 	{	zap_byteswap,		TRUE,	"ZFS master node"	},
67 	{	zap_byteswap,		TRUE,	"ZFS delete queue"	},
68 	{	byteswap_uint8_array,	FALSE,	"zvol object"		},
69 	{	zap_byteswap,		TRUE,	"zvol prop"		},
70 	{	byteswap_uint8_array,	FALSE,	"other uint8[]"		},
71 	{	byteswap_uint64_array,	FALSE,	"other uint64[]"	},
72 	{	zap_byteswap,		TRUE,	"other ZAP"		},
73 };
74 
75 static int
76 dmu_buf_read_array_impl(dmu_buf_impl_t **dbp, int numbufs, uint32_t flags)
77 {
78 	int i, err = 0;
79 	dnode_t *dn;
80 	zio_t *zio;
81 	int canfail;
82 	uint64_t rd_sz;
83 
84 	if (numbufs == 0)
85 		return (0);
86 
87 	rd_sz = numbufs * dbp[0]->db.db_size;
88 	ASSERT(rd_sz <= DMU_MAX_ACCESS);
89 
90 	dn = dbp[0]->db_dnode;
91 	if (flags & DB_RF_CANFAIL) {
92 		canfail = 1;
93 	} else {
94 		canfail = 0;
95 	}
96 	zio = zio_root(dn->dn_objset->os_spa, NULL, NULL, canfail);
97 
98 	/* don't prefetch if the read is large */
99 	if (rd_sz >= zfetch_array_rd_sz) {
100 		flags |= DB_RF_NOPREFETCH;
101 	}
102 
103 	/* initiate async reads */
104 	rw_enter(&dn->dn_struct_rwlock, RW_READER);
105 	for (i = 0; i < numbufs; i++) {
106 		if (dbp[i]->db_state == DB_UNCACHED)
107 			dbuf_read_impl(dbp[i], zio, flags);
108 	}
109 	rw_exit(&dn->dn_struct_rwlock);
110 	err = zio_wait(zio);
111 
112 	if (err)
113 		return (err);
114 
115 	/* wait for other io to complete */
116 	for (i = 0; i < numbufs; i++) {
117 		mutex_enter(&dbp[i]->db_mtx);
118 		while (dbp[i]->db_state == DB_READ ||
119 		    dbp[i]->db_state == DB_FILL)
120 			cv_wait(&dbp[i]->db_changed, &dbp[i]->db_mtx);
121 		ASSERT(dbp[i]->db_state == DB_CACHED);
122 		mutex_exit(&dbp[i]->db_mtx);
123 	}
124 
125 	return (0);
126 }
127 
128 void
129 dmu_buf_read_array(dmu_buf_t **dbp_fake, int numbufs)
130 {
131 	dmu_buf_impl_t **dbp = (dmu_buf_impl_t **)dbp_fake;
132 	int err;
133 
134 	err = dmu_buf_read_array_impl(dbp, numbufs, DB_RF_MUST_SUCCEED);
135 	ASSERT(err == 0);
136 }
137 
138 int
139 dmu_buf_read_array_canfail(dmu_buf_t **dbp_fake, int numbufs)
140 {
141 	dmu_buf_impl_t **dbp = (dmu_buf_impl_t **)dbp_fake;
142 
143 	return (dmu_buf_read_array_impl(dbp, numbufs, DB_RF_CANFAIL));
144 }
145 
146 dmu_buf_t *
147 dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset)
148 {
149 	dnode_t *dn;
150 	uint64_t blkid;
151 	dmu_buf_impl_t *db;
152 
153 	/* dataset_verify(dd); */
154 
155 	dn = dnode_hold(os->os, object, FTAG);
156 	blkid = dbuf_whichblock(dn, offset);
157 	rw_enter(&dn->dn_struct_rwlock, RW_READER);
158 	db = dbuf_hold(dn, blkid);
159 	rw_exit(&dn->dn_struct_rwlock);
160 	dnode_rele(dn, FTAG);
161 	return (&db->db);
162 }
163 
164 dmu_buf_t *
165 dmu_bonus_hold(objset_t *os, uint64_t object)
166 {
167 	return (dmu_bonus_hold_tag(os, object, NULL));
168 }
169 
170 int
171 dmu_bonus_max(void)
172 {
173 	return (DN_MAX_BONUSLEN);
174 }
175 
176 /*
177  * Returns held bonus buffer if the object exists, NULL if it doesn't.
178  */
179 dmu_buf_t *
180 dmu_bonus_hold_tag(objset_t *os, uint64_t object, void *tag)
181 {
182 	dnode_t *dn = dnode_hold(os->os, object, FTAG);
183 	dmu_buf_impl_t *db;
184 
185 	if (dn == NULL)
186 		return (NULL);
187 
188 	db = dbuf_hold_bonus(dn, tag);
189 	/* XXX - hack: hold the first block if this is a ZAP object */
190 	if (dmu_ot[dn->dn_type].ot_byteswap == zap_byteswap) {
191 		rw_enter(&dn->dn_struct_rwlock, RW_READER);
192 		dn->dn_db0 = dbuf_hold(dn, 0);
193 		rw_exit(&dn->dn_struct_rwlock);
194 	}
195 	dnode_rele(dn, FTAG);
196 	return (&db->db);
197 }
198 
199 static dmu_buf_t **
200 dbuf_hold_array(dnode_t *dn,
201     uint64_t offset, uint64_t length, int *numbufsp)
202 {
203 	dmu_buf_t **dbp;
204 	uint64_t blkid, nblks, i;
205 
206 	if (length == 0) {
207 		if (numbufsp)
208 			*numbufsp = 0;
209 		return (NULL);
210 	}
211 
212 	rw_enter(&dn->dn_struct_rwlock, RW_READER);
213 	if (dn->dn_datablkshift) {
214 		int blkshift = dn->dn_datablkshift;
215 		nblks = (P2ROUNDUP(offset+length, 1ULL<<blkshift) -
216 			P2ALIGN(offset, 1ULL<<blkshift)) >> blkshift;
217 	} else {
218 		ASSERT3U(offset + length, <=, dn->dn_datablksz);
219 		nblks = 1;
220 	}
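	/*
	 * Worked example (a sketch, not from any particular pool): with
	 * 128K data blocks (blkshift == 17), offset == 100K and
	 * length == 200K, P2ROUNDUP(300K, 128K) == 384K and
	 * P2ALIGN(100K, 128K) == 0, so nblks == 384K >> 17 == 3 -- the
	 * read touches blocks 0, 1 and 2 even though it is only 200K long.
	 */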
221 	dbp = kmem_alloc(sizeof (dmu_buf_t *) * nblks, KM_SLEEP);
222 
223 	blkid = dbuf_whichblock(dn, offset);
224 	for (i = 0; i < nblks; i++) {
225 		dmu_buf_impl_t *dbuf;
226 		dbuf = dbuf_hold(dn, blkid+i);
227 		dbp[i] = &dbuf->db;
228 	}
229 	rw_exit(&dn->dn_struct_rwlock);
230 
231 	if (numbufsp)
232 		*numbufsp = nblks;
233 	return (dbp);
234 }
235 
236 dmu_buf_t **
237 dmu_buf_hold_array(objset_t *os, uint64_t object,
238 	uint64_t offset, uint64_t length, int *numbufsp)
239 {
240 	dnode_t *dn;
241 	dmu_buf_t **dbp;
242 
243 	ASSERT(length <= DMU_MAX_ACCESS);
244 
245 	if (length == 0) {
246 		if (numbufsp)
247 			*numbufsp = 0;
248 		return (NULL);
249 	}
250 
251 	dn = dnode_hold(os->os, object, FTAG);
252 	dbp = dbuf_hold_array(dn, offset, length, numbufsp);
253 	dnode_rele(dn, FTAG);
254 
255 	return (dbp);
256 }
257 
258 void
259 dmu_buf_add_ref(dmu_buf_t *dbuf, void *tag)
260 {
261 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf;
262 	dbuf_add_ref(db, tag);
263 }
264 
265 void
266 dmu_buf_remove_ref(dmu_buf_t *dbuf, void *tag)
267 {
268 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf;
269 	dbuf_remove_ref(db, tag);
270 }
271 
272 void
273 dmu_buf_rele(dmu_buf_t *dbuf_fake)
274 {
275 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf_fake;
276 
277 	/* XXX - hack: release the first block if this is a ZAP object */
278 	if (db->db_blkid == DB_BONUS_BLKID &&
279 	    dmu_ot[db->db_dnode->dn_type].ot_byteswap == zap_byteswap)
280 		dbuf_rele(db->db_dnode->dn_db0);
281 	dbuf_rele(db);
282 }
283 
284 void
285 dmu_buf_rele_tag(dmu_buf_t *dbuf_fake, void *tag)
286 {
287 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf_fake;
288 
289 	/* XXX - hack: release the first block if this is a ZAP object */
290 	if (db->db_blkid == DB_BONUS_BLKID &&
291 	    dmu_ot[db->db_dnode->dn_type].ot_byteswap == zap_byteswap)
292 		dbuf_rele(db->db_dnode->dn_db0);
293 	dbuf_remove_ref(db, tag);
294 }
295 
296 void
297 dmu_buf_rele_array(dmu_buf_t **dbp_fake, int numbufs)
298 {
299 	int i;
300 	dmu_buf_impl_t **dbp = (dmu_buf_impl_t **)dbp_fake;
301 
302 	if (numbufs == 0)
303 		return;
304 
305 	ASSERT((numbufs * dbp[0]->db.db_size) <= DMU_MAX_ACCESS);
306 
307 	for (i = 0; i < numbufs; i++)
308 		dbuf_rele(dbp[i]);
309 
310 	kmem_free(dbp, sizeof (dmu_buf_t *) * numbufs);
311 }
312 
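/*
 * Issue prefetch I/O for [offset, offset+len) of the given object.  As a
 * special case, len == 0 means the caller is only interested in the bonus
 * buffer, so just prefetch the meta-dnode block that holds this object's
 * dnode.  Prefetches are advisory; failures are silently ignored.
 */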
313 void
314 dmu_prefetch(objset_t *os, uint64_t object, uint64_t offset, uint64_t len)
315 {
316 	dnode_t *dn;
317 	uint64_t blkid;
318 	int nblks, i;
319 
320 	if (len == 0) {  /* they're interested in the bonus buffer */
321 		dn = os->os->os_meta_dnode;
322 
323 		if (object == 0 || object >= DN_MAX_OBJECT)
324 			return;
325 
326 		rw_enter(&dn->dn_struct_rwlock, RW_READER);
327 		blkid = dbuf_whichblock(dn, object * sizeof (dnode_phys_t));
328 		dbuf_prefetch(dn, blkid);
329 		rw_exit(&dn->dn_struct_rwlock);
330 		return;
331 	}
332 
333 	/*
334 	 * XXX - Note, if the dnode for the requested object is not
335 	 * already cached, we will do a *synchronous* read in the
336 	 * dnode_hold() call.  The same is true for any indirects.
337 	 */
338 	dn = dnode_hold(os->os, object, FTAG);
339 	if (dn == NULL)
340 		return;
341 
342 	rw_enter(&dn->dn_struct_rwlock, RW_READER);
343 	if (dn->dn_datablkshift) {
344 		int blkshift = dn->dn_datablkshift;
345 		nblks = (P2ROUNDUP(offset+len, 1<<blkshift) -
346 			P2ALIGN(offset, 1<<blkshift)) >> blkshift;
347 	} else {
348 		nblks = (offset < dn->dn_datablksz);
349 	}
350 
351 	if (nblks != 0) {
352 		blkid = dbuf_whichblock(dn, offset);
353 		for (i = 0; i < nblks; i++)
354 			dbuf_prefetch(dn, blkid+i);
355 	}
356 
357 	rw_exit(&dn->dn_struct_rwlock);
358 
359 	dnode_rele(dn, FTAG);
360 }
361 
362 void
363 dmu_free_range(objset_t *os, uint64_t object, uint64_t offset,
364     uint64_t size, dmu_tx_t *tx)
365 {
366 	dnode_t *dn = dnode_hold(os->os, object, FTAG);
367 	ASSERT(offset < UINT64_MAX);
368 	ASSERT(size == -1ULL || size <= UINT64_MAX - offset);
369 	dnode_free_range(dn, offset, size, tx);
370 	dnode_rele(dn, FTAG);
371 }
372 
373 static int
374 dmu_read_impl(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
375     void *buf, uint32_t flags)
376 {
377 	dnode_t *dn;
378 	dmu_buf_t **dbp;
379 	int numbufs, i;
380 
381 	dn = dnode_hold(os->os, object, FTAG);
382 
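	/*
	 * For objects stored in a single (possibly odd-sized) block there
	 * is no datablkshift; clamp the request to what the block actually
	 * holds and zero-fill the remainder of the caller's buffer.
	 */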
383 	if (dn->dn_datablkshift == 0) {
384 		int newsz = offset > dn->dn_datablksz ? 0 :
385 		    MIN(size, dn->dn_datablksz - offset);
386 		bzero((char *)buf + newsz, size - newsz);
387 		size = newsz;
388 	}
389 
390 	dnode_rele(dn, FTAG);
391 
392 	if (size == 0)
393 		return (0);
394 
395 	while (size > 0) {
396 		uint64_t mylen = MIN(size, DMU_MAX_ACCESS / 2);
397 		int err;
398 
399 		/*
400 		 * NB: we could do this block-at-a-time, but it's nice
401 		 * to be reading in parallel.
402 		 */
403 		dbp = dmu_buf_hold_array(os, object, offset, mylen, &numbufs);
404 		err = dmu_buf_read_array_impl((dmu_buf_impl_t **)dbp, numbufs,
405 		    flags);
406 		if (err) {
407 			dmu_buf_rele_array(dbp, numbufs);
408 			return (err);
409 		}
410 
411 		for (i = 0; i < numbufs; i++) {
412 			int tocpy;
413 			int bufoff;
414 			dmu_buf_t *db = dbp[i];
415 
416 			ASSERT(size > 0);
417 
418 			bufoff = offset - db->db_offset;
419 			tocpy = (int)MIN(db->db_size - bufoff, size);
420 
421 			bcopy((char *)db->db_data + bufoff, buf, tocpy);
422 
423 			offset += tocpy;
424 			size -= tocpy;
425 			buf = (char *)buf + tocpy;
426 		}
427 		dmu_buf_rele_array(dbp, numbufs);
428 	}
429 	return (0);
430 }
431 
432 void
433 dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
434     void *buf)
435 {
436 	int err;
437 
438 	err = dmu_read_impl(os, object, offset, size, buf, DB_RF_MUST_SUCCEED);
439 	ASSERT3U(err, ==, 0);
440 }
441 
442 int
443 dmu_read_canfail(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
444     void *buf)
445 {
446 	return (dmu_read_impl(os, object, offset, size, buf, DB_RF_CANFAIL));
447 }
448 
449 void
450 dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
451     const void *buf, dmu_tx_t *tx)
452 {
453 	dmu_buf_t **dbp;
454 	int numbufs, i;
455 
456 	dbp = dmu_buf_hold_array(os, object, offset, size, &numbufs);
457 
458 	for (i = 0; i < numbufs; i++) {
459 		int tocpy;
460 		int bufoff;
461 		dmu_buf_t *db = dbp[i];
462 
463 		ASSERT(size > 0);
464 
465 		bufoff = offset - db->db_offset;
466 		tocpy = (int)MIN(db->db_size - bufoff, size);
467 
468 		ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size);
469 
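		/*
		 * If we are overwriting an entire block there is no need to
		 * read its old contents first: will_fill() gives us a buffer
		 * to fill outright, while will_dirty() reads the block in so
		 * a partial overwrite can be merged with the existing data.
		 */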
470 		if (tocpy == db->db_size)
471 			dmu_buf_will_fill(db, tx);
472 		else
473 			dmu_buf_will_dirty(db, tx);
474 
475 		bcopy(buf, (char *)db->db_data + bufoff, tocpy);
476 
477 		if (tocpy == db->db_size)
478 			dmu_buf_fill_done(db, tx);
479 
480 		offset += tocpy;
481 		size -= tocpy;
482 		buf = (char *)buf + tocpy;
483 	}
484 	dmu_buf_rele_array(dbp, numbufs);
485 }
486 
487 #ifdef _KERNEL
488 int
489 dmu_write_uio(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
490     uio_t *uio, dmu_tx_t *tx)
491 {
492 	dmu_buf_t **dbp;
493 	int numbufs, i;
494 	int err = 0;
495 
496 	dbp = dmu_buf_hold_array(os, object, offset, size, &numbufs);
497 
498 	for (i = 0; i < numbufs; i++) {
499 		int tocpy;
500 		int bufoff;
501 		dmu_buf_t *db = dbp[i];
502 
503 		ASSERT(size > 0);
504 
505 		bufoff = offset - db->db_offset;
506 		tocpy = (int)MIN(db->db_size - bufoff, size);
507 
508 		ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size);
509 
510 		if (tocpy == db->db_size)
511 			dmu_buf_will_fill(db, tx);
512 		else
513 			dmu_buf_will_dirty(db, tx);
514 
515 		/*
516 		 * XXX uiomove could block forever (eg. nfs-backed
517 		 * pages).  There needs to be a uiolockdown() function
518 		 * to lock the pages in memory, so that uiomove won't
519 		 * block.
520 		 */
521 		err = uiomove((char *)db->db_data + bufoff, tocpy,
522 		    UIO_WRITE, uio);
523 
524 		if (tocpy == db->db_size)
525 			dmu_buf_fill_done(db, tx);
526 
527 		if (err)
528 			break;
529 
530 		offset += tocpy;
531 		size -= tocpy;
532 	}
533 	dmu_buf_rele_array(dbp, numbufs);
534 	return (err);
535 }
536 #endif
537 
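/*
 * Backup (send) stream support.  The stream written to ba->vp is a
 * sequence of dmu_replay_record_t headers: a BEGIN record, then OBJECT,
 * FREEOBJECTS, WRITE (each WRITE header followed by the raw block data)
 * and FREE records as the dataset is traversed, and finally an END record.
 * dmu_recvbackup() below consumes this same format.
 */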
538 struct backuparg {
539 	dmu_replay_record_t *drr;
540 	vnode_t *vp;
541 	objset_t *os;
542 	int err;
543 };
544 
545 static int
546 dump_bytes(struct backuparg *ba, void *buf, int len)
547 {
548 	ssize_t resid; /* have to get resid to get detailed errno */
549 	/* Need to compute checksum here */
550 	ASSERT3U(len % 8, ==, 0);
551 	ba->err = vn_rdwr(UIO_WRITE, ba->vp,
552 	    (caddr_t)buf, len,
553 	    0, UIO_SYSSPACE, FAPPEND, RLIM_INFINITY, CRED(), &resid);
554 	return (ba->err);
555 }
556 
557 static int
558 dump_free(struct backuparg *ba, uint64_t object, uint64_t offset,
559     uint64_t length)
560 {
561 	/* write a FREE record */
562 	bzero(ba->drr, sizeof (dmu_replay_record_t));
563 	ba->drr->drr_type = DRR_FREE;
564 	ba->drr->drr_u.drr_free.drr_object = object;
565 	ba->drr->drr_u.drr_free.drr_offset = offset;
566 	ba->drr->drr_u.drr_free.drr_length = length;
567 
568 	if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)))
569 		return (EINTR);
570 	return (0);
571 }
572 
573 static int
574 dump_data(struct backuparg *ba, dmu_object_type_t type,
575     uint64_t object, uint64_t offset, int blksz, void *data)
576 {
577 	/* write a DATA record */
578 	bzero(ba->drr, sizeof (dmu_replay_record_t));
579 	ba->drr->drr_type = DRR_WRITE;
580 	ba->drr->drr_u.drr_write.drr_object = object;
581 	ba->drr->drr_u.drr_write.drr_type = type;
582 	ba->drr->drr_u.drr_write.drr_offset = offset;
583 	ba->drr->drr_u.drr_write.drr_length = blksz;
584 
585 	if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)))
586 		return (EINTR);
587 	if (dump_bytes(ba, data, blksz))
588 		return (EINTR);
589 	return (0);
590 }
591 
592 static int
593 dump_freeobjects(struct backuparg *ba, uint64_t firstobj, uint64_t numobjs)
594 {
595 	/* write a FREEOBJECTS record */
596 	bzero(ba->drr, sizeof (dmu_replay_record_t));
597 	ba->drr->drr_type = DRR_FREEOBJECTS;
598 	ba->drr->drr_u.drr_freeobjects.drr_firstobj = firstobj;
599 	ba->drr->drr_u.drr_freeobjects.drr_numobjs = numobjs;
600 
601 	if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)))
602 		return (EINTR);
603 	return (0);
604 }
605 
606 static int
607 dump_dnode(struct backuparg *ba, uint64_t object, dnode_phys_t *dnp)
608 {
609 	if (dnp == NULL || dnp->dn_type == DMU_OT_NONE)
610 		return (dump_freeobjects(ba, object, 1));
611 
612 	/* write an OBJECT record */
613 	bzero(ba->drr, sizeof (dmu_replay_record_t));
614 	ba->drr->drr_type = DRR_OBJECT;
615 	ba->drr->drr_u.drr_object.drr_object = object;
616 	ba->drr->drr_u.drr_object.drr_type = dnp->dn_type;
617 	ba->drr->drr_u.drr_object.drr_bonustype = dnp->dn_bonustype;
618 	ba->drr->drr_u.drr_object.drr_blksz =
619 	    dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT;
620 	ba->drr->drr_u.drr_object.drr_bonuslen = dnp->dn_bonuslen;
621 	ba->drr->drr_u.drr_object.drr_checksum = dnp->dn_checksum;
622 	ba->drr->drr_u.drr_object.drr_compress = dnp->dn_compress;
623 
624 	if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)))
625 		return (EINTR);
626 
627 	if (dump_bytes(ba, DN_BONUS(dnp), P2ROUNDUP(dnp->dn_bonuslen, 8)))
628 		return (EINTR);
629 
630 	/* free anything past the end of the file */
631 	if (dump_free(ba, object, (dnp->dn_maxblkid + 1) *
632 	    (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT), -1ULL))
633 		return (EINTR);
634 	if (ba->err)
635 		return (EINTR);
636 	return (0);
637 }
638 
639 #define	BP_SPAN(dnp, level) \
640 	(((uint64_t)dnp->dn_datablkszsec) << (SPA_MINBLOCKSHIFT + \
641 	(level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT)))
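/*
 * BP_SPAN(dnp, level) is the number of bytes of object data covered by a
 * single block pointer at the given level.  For example (a sketch; the
 * values depend on the object): with 128K data blocks (dn_datablkszsec ==
 * 256), dn_indblkshift == 14 and SPA_BLKPTRSHIFT == 7, a level-1 blkptr
 * spans 256 << (9 + 1 * (14 - 7)) == 16M, i.e. the 128 blkptrs in one 16K
 * indirect block times 128K of data each.
 */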
642 
643 static int
644 backup_cb(traverse_blk_cache_t *bc, spa_t *spa, void *arg)
645 {
646 	struct backuparg *ba = arg;
647 	uint64_t object = bc->bc_bookmark.zb_object;
648 	int level = bc->bc_bookmark.zb_level;
649 	uint64_t blkid = bc->bc_bookmark.zb_blkid;
650 	blkptr_t *bp = bc->bc_blkptr.blk_birth ? &bc->bc_blkptr : NULL;
651 	dmu_object_type_t type = bp ? BP_GET_TYPE(bp) : DMU_OT_NONE;
652 	void *data = bc->bc_data;
653 	int err = 0;
654 
655 	if (issig(JUSTLOOKING))
656 		return (EINTR);
657 
658 	ASSERT(data || bp == NULL);
659 
660 	if (bp == NULL && object == 0) {
661 		uint64_t span = BP_SPAN(bc->bc_dnode, level);
662 		uint64_t dnobj = (blkid * span) >> DNODE_SHIFT;
663 		err = dump_freeobjects(ba, dnobj, span >> DNODE_SHIFT);
664 	} else if (bp == NULL) {
665 		uint64_t span = BP_SPAN(bc->bc_dnode, level);
666 		err = dump_free(ba, object, blkid * span, span);
667 	} else if (data && level == 0 && type == DMU_OT_DNODE) {
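		/*
		 * A level-0 block of the meta-dnode is an array of
		 * dnode_phys_t; emit an OBJECT record (or a FREEOBJECTS
		 * record, via dump_dnode) for each one.  The first object
		 * number in this block is blkid << (DNODE_BLOCK_SHIFT -
		 * DNODE_SHIFT), i.e. blkid times the dnodes per block.
		 */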
668 		dnode_phys_t *blk = data;
669 		int i;
670 		int blksz = BP_GET_LSIZE(bp);
671 
672 		for (i = 0; i < blksz >> DNODE_SHIFT; i++) {
673 			uint64_t dnobj =
674 			    (blkid << (DNODE_BLOCK_SHIFT - DNODE_SHIFT)) + i;
675 			err = dump_dnode(ba, dnobj, blk+i);
676 			if (err)
677 				break;
678 		}
679 	} else if (level == 0 &&
680 	    type != DMU_OT_DNODE && type != DMU_OT_OBJSET) {
681 		int blksz = BP_GET_LSIZE(bp);
682 		if (data == NULL) {
683 			arc_buf_t *abuf;
684 
685 			(void) arc_read(NULL, spa, bp,
686 			    dmu_ot[type].ot_byteswap, arc_getbuf_func, &abuf,
687 			    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_MUSTSUCCEED,
688 			    ARC_WAIT);
689 
690 			if (abuf) {
691 				err = dump_data(ba, type, object, blkid * blksz,
692 				    blksz, abuf->b_data);
693 				arc_buf_free(abuf, &abuf);
694 			}
695 		} else {
696 			err = dump_data(ba, type, object, blkid * blksz,
697 			    blksz, data);
698 		}
699 	}
700 
701 	ASSERT(err == 0 || err == EINTR);
702 	return (err);
703 }
704 
705 int
706 dmu_sendbackup(objset_t *tosnap, objset_t *fromsnap, vnode_t *vp)
707 {
708 	dsl_dataset_t *ds = tosnap->os->os_dsl_dataset;
709 	dsl_dataset_t *fromds = fromsnap ? fromsnap->os->os_dsl_dataset : NULL;
710 	dmu_replay_record_t *drr;
711 	struct backuparg ba;
712 	int err;
713 
714 	/* tosnap must be a snapshot */
715 	if (ds->ds_phys->ds_next_snap_obj == 0)
716 		return (EINVAL);
717 
718 	/* fromsnap must be an earlier snapshot from the same fs as tosnap */
719 	if (fromds && (ds->ds_dir != fromds->ds_dir ||
720 	    fromds->ds_phys->ds_creation_txg >=
721 	    ds->ds_phys->ds_creation_txg))
722 		return (EXDEV);
723 
724 	drr = kmem_zalloc(sizeof (dmu_replay_record_t), KM_SLEEP);
725 	drr->drr_type = DRR_BEGIN;
726 	drr->drr_u.drr_begin.drr_magic = DMU_BACKUP_MAGIC;
727 	drr->drr_u.drr_begin.drr_version = DMU_BACKUP_VERSION;
728 	drr->drr_u.drr_begin.drr_creation_time =
729 	    ds->ds_phys->ds_creation_time;
730 	drr->drr_u.drr_begin.drr_type = tosnap->os->os_phys->os_type;
731 	drr->drr_u.drr_begin.drr_toguid = ds->ds_phys->ds_guid;
732 	if (fromds)
733 		drr->drr_u.drr_begin.drr_fromguid = fromds->ds_phys->ds_guid;
734 	dsl_dataset_name(ds, drr->drr_u.drr_begin.drr_toname);
735 
736 	ba.drr = drr;
737 	ba.vp = vp;
738 	ba.os = tosnap;
739 
740 	if (dump_bytes(&ba, drr, sizeof (dmu_replay_record_t))) {
741 		kmem_free(drr, sizeof (dmu_replay_record_t));
742 		return (ba.err);
743 	}
744 
745 	err = traverse_dsl_dataset(ds,
746 	    fromds ? fromds->ds_phys->ds_creation_txg : 0,
747 	    ADVANCE_PRE | ADVANCE_HOLES | ADVANCE_DATA | ADVANCE_NOLOCK,
748 	    backup_cb, &ba);
749 
750 	if (err) {
751 		if (err == EINTR && ba.err)
752 			err = ba.err;
		kmem_free(drr, sizeof (dmu_replay_record_t));
753 		return (err);
754 	}
755 
756 	bzero(drr, sizeof (dmu_replay_record_t));
757 	drr->drr_type = DRR_END;
758 
759 	if (dump_bytes(&ba, drr, sizeof (dmu_replay_record_t))) {
		kmem_free(drr, sizeof (dmu_replay_record_t));
760 		return (ba.err);
	}
761 
762 	kmem_free(drr, sizeof (dmu_replay_record_t));
763 
764 	return (0);
765 }
766 
767 struct restorearg {
768 	int err;
769 	int byteswap;
770 	vnode_t *vp;
771 	char *buf;
772 	uint64_t voff;
773 	int buflen; /* number of valid bytes in buf */
774 	int bufoff; /* next offset to read */
775 	int bufsize; /* amount of memory allocated for buf */
776 };
777 
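/*
 * Syncing-context callback (run via dsl_dir_sync_task()) for the BEGIN
 * record of an incremental backup: check that the target is a filesystem
 * whose most recent snapshot matches drr_fromguid and has not been
 * modified since, and that the new snapshot name does not already exist,
 * then mark the dataset as restoring.
 */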
778 static int
779 replay_incremental_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx)
780 {
781 	struct drr_begin *drrb = arg;
782 	dsl_dataset_t *ds = NULL;
783 	dsl_dataset_t *ds_prev = NULL;
784 	const char *snapname;
785 	int err = EINVAL;
786 	uint64_t val;
787 
788 	/* this must be a filesystem */
789 	if (dd->dd_phys->dd_head_dataset_obj == 0)
790 		goto die;
791 
792 	ds = dsl_dataset_open_obj(dd->dd_pool, dd->dd_phys->dd_head_dataset_obj,
793 	    NULL, DS_MODE_EXCLUSIVE, FTAG);
794 
795 	if (ds == NULL) {
796 		err = EBUSY;
797 		goto die;
798 	}
799 
800 	/* must already be a snapshot of this fs */
801 	if (ds->ds_phys->ds_prev_snap_obj == 0) {
802 		err = ENODEV;
803 		goto die;
804 	}
805 
806 	/* most recent snapshot must match fromguid */
807 	ds_prev = dsl_dataset_open_obj(dd->dd_pool,
808 	    ds->ds_phys->ds_prev_snap_obj, NULL,
809 	    DS_MODE_STANDARD | DS_MODE_READONLY, FTAG);
810 	if (ds_prev->ds_phys->ds_guid != drrb->drr_fromguid) {
811 		err = ENODEV;
812 		goto die;
813 	}
814 
815 	/* must not have any changes since most recent snapshot */
816 	if (ds->ds_phys->ds_bp.blk_birth >
817 	    ds_prev->ds_phys->ds_creation_txg) {
818 		err = ETXTBSY;
819 		goto die;
820 	}
821 
822 	/* new snapshot name must not exist */
823 	snapname = strrchr(drrb->drr_toname, '@');
824 	if (snapname == NULL) {
825 		err = EEXIST;
826 		goto die;
827 	}
828 	snapname++;
829 	err = zap_lookup(dd->dd_pool->dp_meta_objset,
830 	    ds->ds_phys->ds_snapnames_zapobj, snapname, 8, 1, &val);
831 	if (err != ENOENT) {
832 		if (err == 0)
833 			err = EEXIST;
834 		dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG);
835 		dsl_dataset_close(ds_prev, DS_MODE_STANDARD, FTAG);
836 		return (err);
837 	}
838 
839 	dsl_dataset_close(ds_prev, DS_MODE_STANDARD, FTAG);
840 
841 	/* The point of no (unsuccessful) return. */
842 
843 	dmu_buf_will_dirty(ds->ds_dbuf, tx);
844 	ds->ds_phys->ds_restoring = TRUE;
845 
846 	dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG);
847 	return (0);
848 
849 die:
850 	if (ds_prev)
851 		dsl_dataset_close(ds_prev, DS_MODE_STANDARD, FTAG);
852 	if (ds)
853 		dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG);
854 	return (err);
855 }
856 
857 static int
858 replay_full_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx)
859 {
860 	struct drr_begin *drrb = arg;
861 	int err;
862 	char *fsfullname, *fslastname, *cp;
863 	dsl_dataset_t *ds;
864 
865 	fsfullname = kmem_alloc(MAXNAMELEN, KM_SLEEP);
866 	(void) strncpy(fsfullname, drrb->drr_toname, MAXNAMELEN);
867 	cp = strchr(fsfullname, '@');
868 	if (cp == NULL) {
869 		kmem_free(fsfullname, MAXNAMELEN);
870 		return (EINVAL);
871 	}
872 	*cp = '\0';
873 	fslastname = strrchr(fsfullname, '/');
874 	if (fslastname == NULL) {
875 		kmem_free(fsfullname, MAXNAMELEN);
876 		return (EINVAL);
877 	}
878 	fslastname++;
879 
880 	err = dsl_dataset_create_sync(dd, fsfullname, fslastname, NULL, tx);
881 	if (err) {
882 		kmem_free(fsfullname, MAXNAMELEN);
883 		return (err);
884 	}
885 
886 	/* the point of no (unsuccessful) return */
887 
888 	err = dsl_dataset_open_spa(dd->dd_pool->dp_spa, fsfullname,
889 	    DS_MODE_EXCLUSIVE, FTAG, &ds);
890 	ASSERT3U(err, ==, 0);
891 	kmem_free(fsfullname, MAXNAMELEN);
892 
893 	(void) dmu_objset_create_impl(dsl_dataset_get_spa(ds),
894 	    ds, drrb->drr_type, tx);
895 
896 	dmu_buf_will_dirty(ds->ds_dbuf, tx);
897 	ds->ds_phys->ds_restoring = TRUE;
898 
899 	dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG);
900 	return (0);
901 }
902 
903 static int
904 replay_end_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx)
905 {
906 	struct drr_begin *drrb = arg;
907 	int err;
908 	char *snapname;
909 	dsl_dataset_t *ds;
910 
911 	/* XXX verify that drr_toname is in dd */
912 
913 	snapname = strchr(drrb->drr_toname, '@');
914 	if (snapname == NULL)
915 		return (EINVAL);
916 	snapname++;
917 
918 	/* create snapshot */
919 	err = dsl_dataset_snapshot_sync(dd, snapname, tx);
920 	if (err)
921 		return (err);
922 
923 	/* set snapshot's creation time and guid */
924 	err = dsl_dataset_open_spa(dd->dd_pool->dp_spa, drrb->drr_toname,
925 	    DS_MODE_PRIMARY | DS_MODE_READONLY | DS_MODE_RESTORE, FTAG, &ds);
926 	ASSERT3U(err, ==, 0);
927 
928 	dmu_buf_will_dirty(ds->ds_dbuf, tx);
929 	ds->ds_phys->ds_creation_time = drrb->drr_creation_time;
930 	ds->ds_phys->ds_guid = drrb->drr_toguid;
931 	ds->ds_phys->ds_restoring = FALSE;
932 
933 	dsl_dataset_close(ds, DS_MODE_PRIMARY, FTAG);
934 
935 	ds = dsl_dataset_open_obj(dd->dd_pool, dd->dd_phys->dd_head_dataset_obj,
936 	    NULL, DS_MODE_STANDARD | DS_MODE_RESTORE, FTAG);
937 	dmu_buf_will_dirty(ds->ds_dbuf, tx);
938 	ds->ds_phys->ds_restoring = FALSE;
939 	dsl_dataset_close(ds, DS_MODE_STANDARD, FTAG);
940 
941 	return (0);
942 }
943 
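/*
 * Return a pointer to the next 'len' bytes of the backup stream, reading
 * more data from ra->vp into ra->buf as needed.  Any unconsumed bytes are
 * first slid to the front of the buffer so a record never straddles the
 * end of it.  On error, sets ra->err and returns NULL.
 */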
944 void *
945 restore_read(struct restorearg *ra, int len)
946 {
947 	void *rv;
948 
949 	/* some things will require 8-byte alignment, so everything must */
950 	ASSERT3U(len % 8, ==, 0);
951 
952 	while (ra->buflen - ra->bufoff < len) {
953 		ssize_t resid;
954 		int leftover = ra->buflen - ra->bufoff;
955 
956 		(void) memmove(ra->buf, ra->buf + ra->bufoff, leftover);
957 		ra->err = vn_rdwr(UIO_READ, ra->vp,
958 		    (caddr_t)ra->buf + leftover, ra->bufsize - leftover,
959 		    ra->voff, UIO_SYSSPACE, FAPPEND,
960 		    RLIM_INFINITY, CRED(), &resid);
961 
962 		/* Need to compute checksum */
963 
964 		ra->voff += ra->bufsize - leftover - resid;
965 		ra->buflen = ra->bufsize - resid;
966 		ra->bufoff = 0;
967 		if (resid == ra->bufsize - leftover)
968 			ra->err = EINVAL;
969 		if (ra->err)
970 			return (NULL);
971 	}
972 
973 	ASSERT3U(ra->bufoff % 8, ==, 0);
974 	ASSERT3U(ra->buflen - ra->bufoff, >=, len);
975 	rv = ra->buf + ra->bufoff;
976 	ra->bufoff += len;
977 	return (rv);
978 }
979 
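/*
 * The stream is written in the sender's native byte order.  If the
 * receiver detects the opposite order (drr_magic matches only after
 * BSWAP_64), every record header must be byteswapped before use; block
 * and bonus data are swapped separately via the per-type ot_byteswap
 * routines.
 */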
980 static void
981 backup_byteswap(dmu_replay_record_t *drr)
982 {
983 #define	DO64(X) (drr->drr_u.X = BSWAP_64(drr->drr_u.X))
984 #define	DO32(X) (drr->drr_u.X = BSWAP_32(drr->drr_u.X))
985 	drr->drr_type = BSWAP_32(drr->drr_type);
986 	switch (drr->drr_type) {
987 	case DRR_BEGIN:
988 		DO64(drr_begin.drr_magic);
989 		DO64(drr_begin.drr_version);
990 		DO64(drr_begin.drr_creation_time);
991 		DO32(drr_begin.drr_type);
992 		DO64(drr_begin.drr_toguid);
993 		DO64(drr_begin.drr_fromguid);
994 		break;
995 	case DRR_OBJECT:
996 		DO64(drr_object.drr_object);
997 		/* DO64(drr_object.drr_allocation_txg); */
998 		DO32(drr_object.drr_type);
999 		DO32(drr_object.drr_bonustype);
1000 		DO32(drr_object.drr_blksz);
1001 		DO32(drr_object.drr_bonuslen);
1002 		break;
1003 	case DRR_FREEOBJECTS:
1004 		DO64(drr_freeobjects.drr_firstobj);
1005 		DO64(drr_freeobjects.drr_numobjs);
1006 		break;
1007 	case DRR_WRITE:
1008 		DO64(drr_write.drr_object);
1009 		DO32(drr_write.drr_type);
1010 		DO64(drr_write.drr_offset);
1011 		DO64(drr_write.drr_length);
1012 		break;
1013 	case DRR_FREE:
1014 		DO64(drr_free.drr_object);
1015 		DO64(drr_free.drr_offset);
1016 		DO64(drr_free.drr_length);
1017 		break;
1018 	case DRR_END:
1019 		DO64(drr_end.drr_checksum);
1020 		break;
1021 	}
1022 #undef DO64
1023 #undef DO32
1024 }
1025 
1026 static int
1027 restore_object(struct restorearg *ra, objset_t *os, struct drr_object *drro)
1028 {
1029 	int err;
1030 	dmu_tx_t *tx;
1031 
1032 	err = dmu_object_info(os, drro->drr_object, NULL);
1033 
1034 	if (err != 0 && err != ENOENT)
1035 		return (EINVAL);
1036 
1037 	if (drro->drr_type == DMU_OT_NONE ||
1038 	    drro->drr_type >= DMU_OT_NUMTYPES ||
1039 	    drro->drr_bonustype >= DMU_OT_NUMTYPES ||
1040 	    drro->drr_checksum >= ZIO_CHECKSUM_FUNCTIONS ||
1041 	    drro->drr_compress >= ZIO_COMPRESS_FUNCTIONS ||
1042 	    P2PHASE(drro->drr_blksz, SPA_MINBLOCKSIZE) ||
1043 	    drro->drr_blksz < SPA_MINBLOCKSIZE ||
1044 	    drro->drr_blksz > SPA_MAXBLOCKSIZE ||
1045 	    drro->drr_bonuslen > DN_MAX_BONUSLEN) {
1046 		return (EINVAL);
1047 	}
1048 
1049 	tx = dmu_tx_create(os);
1050 
1051 	if (err == ENOENT) {
1052 		/* currently free, want to be allocated */
1053 		dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
1054 		dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, 1);
1055 		err = dmu_tx_assign(tx, TXG_WAIT);
1056 		if (err) {
1057 			dmu_tx_abort(tx);
1058 			return (err);
1059 		}
1060 		err = dmu_object_claim(os, drro->drr_object,
1061 		    drro->drr_type, drro->drr_blksz,
1062 		    drro->drr_bonustype, drro->drr_bonuslen, tx);
1063 	} else {
1064 		/* currently allocated, want to be allocated */
1065 		dmu_tx_hold_bonus(tx, drro->drr_object);
1066 		/*
1067 		 * We may change blocksize, so need to
1068 		 * hold_write
1069 		 */
1070 		dmu_tx_hold_write(tx, drro->drr_object, 0, 1);
1071 		err = dmu_tx_assign(tx, TXG_WAIT);
1072 		if (err) {
1073 			dmu_tx_abort(tx);
1074 			return (err);
1075 		}
1076 
1077 		err = dmu_object_reclaim(os, drro->drr_object,
1078 		    drro->drr_type, drro->drr_blksz,
1079 		    drro->drr_bonustype, drro->drr_bonuslen, tx);
1080 	}
1081 	if (err) {
1082 		dmu_tx_commit(tx);
1083 		return (EINVAL);
1084 	}
1085 
1086 	dmu_object_set_checksum(os, drro->drr_object, drro->drr_checksum, tx);
1087 	dmu_object_set_compress(os, drro->drr_object, drro->drr_compress, tx);
1088 
1089 	if (drro->drr_bonuslen) {
1090 		dmu_buf_t *db;
1091 		void *data;
1092 		db = dmu_bonus_hold(os, drro->drr_object);
1093 		dmu_buf_will_dirty(db, tx);
1094 
1095 		ASSERT3U(db->db_size, ==, drro->drr_bonuslen);
1096 		data = restore_read(ra, P2ROUNDUP(db->db_size, 8));
1097 		if (data == NULL) {
1098 			dmu_tx_commit(tx);
1099 			return (ra->err);
1100 		}
1101 		bcopy(data, db->db_data, db->db_size);
1102 		if (ra->byteswap) {
1103 			dmu_ot[drro->drr_bonustype].ot_byteswap(db->db_data,
1104 			    drro->drr_bonuslen);
1105 		}
1106 		dmu_buf_rele(db);
1107 	}
1108 	dmu_tx_commit(tx);
1109 	return (0);
1110 }
1111 
1112 /* ARGSUSED */
1113 static int
1114 restore_freeobjects(struct restorearg *ra, objset_t *os,
1115     struct drr_freeobjects *drrfo)
1116 {
1117 	uint64_t obj;
1118 
1119 	if (drrfo->drr_firstobj + drrfo->drr_numobjs < drrfo->drr_firstobj)
1120 		return (EINVAL);
1121 
1122 	for (obj = drrfo->drr_firstobj;
1123 	    obj < drrfo->drr_firstobj + drrfo->drr_numobjs; obj++) {
1124 		dmu_tx_t *tx;
1125 		int err;
1126 
1127 		if (dmu_object_info(os, obj, NULL) != 0)
1128 			continue;
1129 
1130 		tx = dmu_tx_create(os);
1131 		dmu_tx_hold_bonus(tx, obj);
1132 		err = dmu_tx_assign(tx, TXG_WAIT);
1133 		if (err) {
1134 			dmu_tx_abort(tx);
1135 			return (err);
1136 		}
1137 		err = dmu_object_free(os, obj, tx);
1138 		dmu_tx_commit(tx);
1139 		if (err && err != ENOENT)
1140 			return (EINVAL);
1141 	}
1142 	return (0);
1143 }
1144 
1145 static int
1146 restore_write(struct restorearg *ra, objset_t *os,
1147     struct drr_write *drrw)
1148 {
1149 	dmu_tx_t *tx;
1150 	void *data;
1151 	int err;
1152 
1153 	if (drrw->drr_offset + drrw->drr_length < drrw->drr_offset ||
1154 	    drrw->drr_type >= DMU_OT_NUMTYPES)
1155 		return (EINVAL);
1156 
1157 	data = restore_read(ra, drrw->drr_length);
1158 	if (data == NULL)
1159 		return (ra->err);
1160 
1161 	if (dmu_object_info(os, drrw->drr_object, NULL) != 0)
1162 		return (EINVAL);
1163 
1164 	tx = dmu_tx_create(os);
1165 
1166 	dmu_tx_hold_write(tx, drrw->drr_object,
1167 	    drrw->drr_offset, drrw->drr_length);
1168 	err = dmu_tx_assign(tx, TXG_WAIT);
1169 	if (err) {
1170 		dmu_tx_abort(tx);
1171 		return (err);
1172 	}
1173 	if (ra->byteswap)
1174 		dmu_ot[drrw->drr_type].ot_byteswap(data, drrw->drr_length);
1175 	dmu_write(os, drrw->drr_object,
1176 	    drrw->drr_offset, drrw->drr_length, data, tx);
1177 	dmu_tx_commit(tx);
1178 	return (0);
1179 }
1180 
1181 /* ARGSUSED */
1182 static int
1183 restore_free(struct restorearg *ra, objset_t *os,
1184     struct drr_free *drrf)
1185 {
1186 	dmu_tx_t *tx;
1187 	int err;
1188 
1189 	if (drrf->drr_length != -1ULL &&
1190 	    drrf->drr_offset + drrf->drr_length < drrf->drr_offset)
1191 		return (EINVAL);
1192 
1193 	if (dmu_object_info(os, drrf->drr_object, NULL) != 0)
1194 		return (EINVAL);
1195 
1196 	tx = dmu_tx_create(os);
1197 
1198 	dmu_tx_hold_free(tx, drrf->drr_object,
1199 	    drrf->drr_offset, drrf->drr_length);
1200 	err = dmu_tx_assign(tx, TXG_WAIT);
1201 	if (err) {
1202 		dmu_tx_abort(tx);
1203 		return (err);
1204 	}
1205 	dmu_free_range(os, drrf->drr_object,
1206 	    drrf->drr_offset, drrf->drr_length, tx);
1207 	dmu_tx_commit(tx);
1208 	return (0);
1209 }
1210 
1211 int
1212 dmu_recvbackup(struct drr_begin *drrb, uint64_t *sizep,
1213     vnode_t *vp, uint64_t voffset)
1214 {
1215 	struct restorearg ra;
1216 	dmu_replay_record_t *drr;
1217 	char *cp, *tosnap;
1218 	dsl_dir_t *dd = NULL;
1219 	objset_t *os = NULL;
1220 
1221 	bzero(&ra, sizeof (ra));
1222 	ra.vp = vp;
1223 	ra.voff = voffset;
1224 	ra.bufsize = 1<<20;
1225 	ra.buf = kmem_alloc(ra.bufsize, KM_SLEEP);
1226 
1227 	if (drrb->drr_magic == DMU_BACKUP_MAGIC) {
1228 		ra.byteswap = FALSE;
1229 	} else if (drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC)) {
1230 		ra.byteswap = TRUE;
1231 	} else {
1232 		ra.err = EINVAL;
1233 		goto out;
1234 	}
1235 
1236 	if (ra.byteswap) {
1237 		drrb->drr_magic = BSWAP_64(drrb->drr_magic);
1238 		drrb->drr_version = BSWAP_64(drrb->drr_version);
1239 		drrb->drr_creation_time = BSWAP_64(drrb->drr_creation_time);
1240 		drrb->drr_type = BSWAP_32(drrb->drr_type);
1241 		drrb->drr_toguid = BSWAP_64(drrb->drr_toguid);
1242 		drrb->drr_fromguid = BSWAP_64(drrb->drr_fromguid);
1243 	}
1244 
1245 	ASSERT3U(drrb->drr_magic, ==, DMU_BACKUP_MAGIC);
1246 
1247 	tosnap = drrb->drr_toname;
1248 	if (drrb->drr_version != DMU_BACKUP_VERSION ||
1249 	    drrb->drr_type >= DMU_OST_NUMTYPES ||
1250 	    strchr(drrb->drr_toname, '@') == NULL) {
1251 		ra.err = EINVAL;
1252 		goto out;
1253 	}
1254 
1255 	/*
1256 	 * Process the begin in syncing context.
1257 	 */
1258 	if (drrb->drr_fromguid) {
1259 		/* incremental backup */
1260 
1261 		cp = strchr(tosnap, '@');
1262 		*cp = '\0';
1263 		dd = dsl_dir_open(tosnap, FTAG, NULL);
1264 		*cp = '@';
1265 		if (dd == NULL) {
1266 			ra.err = ENOENT;
1267 			goto out;
1268 		}
1269 
1270 		ra.err = dsl_dir_sync_task(dd, replay_incremental_sync,
1271 		    drrb, 1<<20);
1272 	} else {
1273 		/* full backup */
1274 		const char *tail;
1275 
1276 		cp = strchr(tosnap, '@');
1277 		*cp = '\0';
1278 		dd = dsl_dir_open(tosnap, FTAG, &tail);
1279 		*cp = '@';
1280 		if (dd == NULL) {
1281 			ra.err = ENOENT;
1282 			goto out;
1283 		}
1284 		if (tail == NULL) {
1285 			ra.err = EEXIST;
1286 			goto out;
1287 		}
1288 
1289 		ra.err = dsl_dir_sync_task(dd, replay_full_sync,
1290 		    drrb, 1<<20);
1291 	}
1292 	if (ra.err)
1293 		goto out;
1294 
1295 	/*
1296 	 * Open the objset we are modifying.
1297 	 */
1298 
1299 	cp = strchr(tosnap, '@');
1300 	*cp = '\0';
1301 	ra.err = dmu_objset_open(tosnap, DMU_OST_ANY,
1302 	    DS_MODE_PRIMARY | DS_MODE_RESTORE, &os);
1303 	*cp = '@';
1304 	ASSERT3U(ra.err, ==, 0);
1305 
1306 	/*
1307 	 * Read records and process them.
1308 	 */
1309 	while (ra.err == 0 &&
1310 	    NULL != (drr = restore_read(&ra, sizeof (*drr)))) {
1311 		if (issig(JUSTLOOKING)) {
1312 			ra.err = EINTR;
1313 			goto out;
1314 		}
1315 
1316 		if (ra.byteswap)
1317 			backup_byteswap(drr);
1318 
1319 		switch (drr->drr_type) {
1320 		case DRR_OBJECT:
1321 		{
1322 			/*
1323 			 * We need to make a copy of the record header,
1324 			 * because restore_{object,write} may need to
1325 			 * restore_read(), which will invalidate drr.
1326 			 */
1327 			struct drr_object drro = drr->drr_u.drr_object;
1328 			ra.err = restore_object(&ra, os, &drro);
1329 			break;
1330 		}
1331 		case DRR_FREEOBJECTS:
1332 		{
1333 			struct drr_freeobjects drrfo =
1334 			    drr->drr_u.drr_freeobjects;
1335 			ra.err = restore_freeobjects(&ra, os, &drrfo);
1336 			break;
1337 		}
1338 		case DRR_WRITE:
1339 		{
1340 			struct drr_write drrw = drr->drr_u.drr_write;
1341 			ra.err = restore_write(&ra, os, &drrw);
1342 			break;
1343 		}
1344 		case DRR_FREE:
1345 		{
1346 			struct drr_free drrf = drr->drr_u.drr_free;
1347 			ra.err = restore_free(&ra, os, &drrf);
1348 			break;
1349 		}
1350 		case DRR_END:
1351 			/* Need to verify checksum. */
1352 			/*
1353 			 * dd may be the parent of the dd we are
1354 			 * restoring into (eg. if it's a full backup).
1355 			 */
1356 			ra.err = dsl_dir_sync_task(dmu_objset_ds(os)->
1357 			    ds_dir, replay_end_sync, drrb, 1<<20);
1358 			goto out;
1359 		default:
1360 			ra.err = EINVAL;
1361 			goto out;
1362 		}
1363 	}
1364 
1365 out:
1366 	if (os)
1367 		dmu_objset_close(os);
1368 
1369 	/*
1370 	 * Make sure we don't rollback/destroy unless we actually
1371 	 * processed the begin properly.  'os' will only be set if this
1372 	 * is the case.
1373 	 */
1374 	if (ra.err && os && dd && tosnap && strchr(tosnap, '@')) {
1375 		/*
1376 		 * rollback or destroy what we created, so we don't
1377 		 * leave it in the restoring state.
1378 		 */
1379 		txg_wait_synced(dd->dd_pool, 0);
1380 		if (drrb->drr_fromguid) {
1381 			/* incremental: rollback to most recent snapshot */
1382 			(void) dsl_dir_sync_task(dd,
1383 			    dsl_dataset_rollback_sync, NULL, 0);
1384 		} else {
1385 			/* full: destroy whole fs */
1386 			cp = strchr(tosnap, '@');
1387 			*cp = '\0';
1388 			cp = strchr(tosnap, '/');
1389 			if (cp) {
1390 				(void) dsl_dir_sync_task(dd,
1391 				    dsl_dir_destroy_sync, cp+1, 0);
1392 			}
1393 			cp = strchr(tosnap, '\0');
1394 			*cp = '@';
1395 		}
1396 
1397 	}
1398 
1399 	if (dd)
1400 		dsl_dir_close(dd, FTAG);
1401 	kmem_free(ra.buf, ra.bufsize);
1402 	if (sizep)
1403 		*sizep = ra.voff;
1404 	return (ra.err);
1405 }
1406 
1407 /*
1408  * Intent log support: sync the block at <os, object, offset> to disk.
1409  * N.B. and XXX: the caller is responsible for serializing dmu_sync()s
1410  * of the same block, and for making sure that the data isn't changing
1411  * while dmu_sync() is writing it.
1412  *
1413  * Return values:
1414  *
1415  *	EALREADY: this txg has already been synced, so there's nothing to do.
1416  *		The caller should not log the write.
1417  *
1418  *	ENOENT: the block was dbuf_free_range()'d, so there's nothing to do.
1419  *		The caller should not log the write.
1420  *
1421  *	EINPROGRESS: the block is in the process of being synced by the
1422  *		usual mechanism (spa_sync()), so we can't sync it here.
1423  *		The caller should txg_wait_synced() and not log the write.
1424  *
1425  *	EBUSY: another thread is trying to dmu_sync() the same dbuf.
1426  *		(This case cannot arise under the current locking rules.)
1427  *		The caller should txg_wait_synced() and not log the write.
1428  *
1429  *	ESTALE: the block was dirtied or freed while we were writing it,
1430  *		so the data is no longer valid.
1431  *		The caller should txg_wait_synced() and not log the write.
1432  *
1433  *	0: success.  Sets *bp to the blkptr just written, and sets
1434  *		*blkoff to the data's offset within that block.
1435  *		The caller should log this blkptr/blkoff in its lr_write_t.
1436  */
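/*
 * A caller might use dmu_sync() roughly as follows (a sketch based on the
 * return-value contract above, not the actual ZIL code):
 *
 *	error = dmu_sync(os, object, offset, &blkoff, &bp, txg);
 *	switch (error) {
 *	case 0:
 *		record bp and blkoff in the lr_write_t;
 *		break;
 *	case EALREADY:
 *	case ENOENT:
 *		do not log the write;
 *		break;
 *	default:	(EINPROGRESS, EBUSY or ESTALE)
 *		txg_wait_synced(dmu_objset_pool(os), txg);
 *		do not log the write;
 *		break;
 *	}
 */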
1437 int
1438 dmu_sync(objset_t *os, uint64_t object, uint64_t offset, uint64_t *blkoff,
1439     blkptr_t *bp, uint64_t txg)
1440 {
1441 	dsl_pool_t *dp = os->os->os_dsl_dataset->ds_dir->dd_pool;
1442 	tx_state_t *tx = &dp->dp_tx;
1443 	dmu_buf_impl_t *db;
1444 	blkptr_t *blk;
1445 	int err;
1446 
1447 	ASSERT(RW_LOCK_HELD(&tx->tx_suspend));
1448 	ASSERT(BP_IS_HOLE(bp));
1449 	ASSERT(txg != 0);
1450 
1451 	dprintf("dmu_sync txg=%llu, s,o,q %llu %llu %llu\n",
1452 	    txg, tx->tx_synced_txg, tx->tx_open_txg, tx->tx_quiesced_txg);
1453 
1454 	/*
1455 	 * If this txg already synced, there's nothing to do.
1456 	 */
1457 	if (txg <= tx->tx_synced_txg) {
1458 		/*
1459 		 * If we're running ziltest, we need the blkptr regardless.
1460 		 */
1461 		if (txg > spa_freeze_txg(dp->dp_spa)) {
1462 			db = (dmu_buf_impl_t *)dmu_buf_hold(os, object, offset);
1463 			/* if db_blkptr == NULL, this was an empty write */
1464 			if (db->db_blkptr)
1465 				*bp = *db->db_blkptr; /* structure assignment */
1466 			else
1467 				bzero(bp, sizeof (blkptr_t));
1468 			*blkoff = offset - db->db.db_offset;
1469 			ASSERT3U(*blkoff, <, db->db.db_size);
1470 			dmu_buf_rele((dmu_buf_t *)db);
1471 			return (0);
1472 		}
1473 		return (EALREADY);
1474 	}
1475 
1476 	/*
1477 	 * If this txg is in the middle of syncing, just wait for it.
1478 	 */
1479 	if (txg == tx->tx_syncing_txg) {
1480 		ASSERT(txg != tx->tx_open_txg);
1481 		return (EINPROGRESS);
1482 	}
1483 
1484 	db = (dmu_buf_impl_t *)dmu_buf_hold(os, object, offset);
1485 
1486 	mutex_enter(&db->db_mtx);
1487 
1488 	/*
1489 	 * If this dbuf isn't dirty, must have been free_range'd.
1490 	 * There's no need to log writes to freed blocks, so we're done.
1491 	 */
1492 	if (!list_link_active(&db->db_dirty_node[txg&TXG_MASK])) {
1493 		mutex_exit(&db->db_mtx);
1494 		dmu_buf_rele((dmu_buf_t *)db);
1495 		return (ENOENT);
1496 	}
1497 
1498 	blk = db->db_d.db_overridden_by[txg&TXG_MASK];
1499 
1500 	/*
1501 	 * If we already did a dmu_sync() of this dbuf in this txg,
1502 	 * free the old block before writing the new one.
1503 	 */
1504 	if (blk != NULL) {
1505 		ASSERT(blk != IN_DMU_SYNC);
1506 		if (blk == IN_DMU_SYNC) {
1507 			mutex_exit(&db->db_mtx);
1508 			dmu_buf_rele((dmu_buf_t *)db);
1509 			return (EBUSY);
1510 		}
1511 		arc_release(db->db_d.db_data_old[txg&TXG_MASK], db);
1512 		if (!BP_IS_HOLE(blk)) {
1513 			(void) arc_free(NULL, os->os->os_spa, txg, blk,
1514 			    NULL, NULL, ARC_WAIT);
1515 		}
1516 		kmem_free(blk, sizeof (blkptr_t));
1517 	}
1518 
1519 	db->db_d.db_overridden_by[txg&TXG_MASK] = IN_DMU_SYNC;
1520 	mutex_exit(&db->db_mtx);
1521 
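	/*
	 * Write this txg's copy of the data to a freshly allocated block
	 * right now, outside of spa_sync().  Recording the resulting blkptr
	 * in db_overridden_by[] lets the normal syncing path use this block
	 * for the dbuf rather than writing the data a second time (this is
	 * our reading of the override mechanism; see dbuf.c for details).
	 */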
1522 	blk = kmem_alloc(sizeof (blkptr_t), KM_SLEEP);
1523 	blk->blk_birth = 0; /* mark as invalid */
1524 
1525 	err = arc_write(NULL, os->os->os_spa,
1526 	    zio_checksum_select(db->db_dnode->dn_checksum, os->os->os_checksum),
1527 	    zio_compress_select(db->db_dnode->dn_compress, os->os->os_compress),
1528 	    txg, blk, db->db_d.db_data_old[txg&TXG_MASK], NULL, NULL,
1529 	    ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, ARC_WAIT);
1530 	ASSERT(err == 0);
1531 
1532 	if (!BP_IS_HOLE(blk)) {
1533 		blk->blk_fill = 1;
1534 		BP_SET_TYPE(blk, db->db_dnode->dn_type);
1535 		BP_SET_LEVEL(blk, 0);
1536 	}
1537 
1538 	/* copy the block pointer back to caller */
1539 	*bp = *blk; /* structure assignment */
1540 	*blkoff = offset - db->db.db_offset;
1541 	ASSERT3U(*blkoff, <, db->db.db_size);
1542 
1543 	mutex_enter(&db->db_mtx);
1544 	if (db->db_d.db_overridden_by[txg&TXG_MASK] != IN_DMU_SYNC) {
1545 		/* we were dirtied/freed during the sync */
1546 		ASSERT3P(db->db_d.db_overridden_by[txg&TXG_MASK], ==, NULL);
1547 		arc_release(db->db_d.db_data_old[txg&TXG_MASK], db);
1548 		mutex_exit(&db->db_mtx);
1549 		dmu_buf_rele((dmu_buf_t *)db);
1550 		/* Note that this block will not be freed on disk until this txg syncs */
1551 
1552 		/*
1553 		 * XXX can we use ARC_NOWAIT here?
1554 		 * XXX should we be ignoring the return code?
1555 		 */
1556 		if (!BP_IS_HOLE(blk)) {
1557 			(void) arc_free(NULL, os->os->os_spa, txg, blk,
1558 			    NULL, NULL, ARC_WAIT);
1559 		}
1560 		kmem_free(blk, sizeof (blkptr_t));
1561 		return (ESTALE);
1562 	}
1563 
1564 	db->db_d.db_overridden_by[txg&TXG_MASK] = blk;
1565 	mutex_exit(&db->db_mtx);
1566 	dmu_buf_rele((dmu_buf_t *)db);
1567 	ASSERT3U(txg, >, tx->tx_syncing_txg);
1568 	return (0);
1569 }
1570 
1571 uint64_t
1572 dmu_object_max_nonzero_offset(objset_t *os, uint64_t object)
1573 {
1574 	dnode_t *dn = dnode_hold(os->os, object, FTAG);
1575 	uint64_t rv = dnode_max_nonzero_offset(dn);
1576 	dnode_rele(dn, FTAG);
1577 	return (rv);
1578 }
1579 
1580 int
1581 dmu_object_set_blocksize(objset_t *os, uint64_t object, uint64_t size, int ibs,
1582 	dmu_tx_t *tx)
1583 {
1584 	dnode_t *dn = dnode_hold(os->os, object, FTAG);
1585 	int err = dnode_set_blksz(dn, size, ibs, tx);
1586 	dnode_rele(dn, FTAG);
1587 	return (err);
1588 }
1589 
1590 void
1591 dmu_object_set_checksum(objset_t *os, uint64_t object, uint8_t checksum,
1592 	dmu_tx_t *tx)
1593 {
1594 	dnode_t *dn = dnode_hold(os->os, object, FTAG);
1595 	ASSERT(checksum < ZIO_CHECKSUM_FUNCTIONS);
1596 	dn->dn_checksum = checksum;
1597 	dnode_setdirty(dn, tx);
1598 	dnode_rele(dn, FTAG);
1599 }
1600 
1601 void
1602 dmu_object_set_compress(objset_t *os, uint64_t object, uint8_t compress,
1603 	dmu_tx_t *tx)
1604 {
1605 	dnode_t *dn = dnode_hold(os->os, object, FTAG);
1606 	ASSERT(compress < ZIO_COMPRESS_FUNCTIONS);
1607 	dn->dn_compress = compress;
1608 	dnode_setdirty(dn, tx);
1609 	dnode_rele(dn, FTAG);
1610 }
1611 
1612 int
1613 dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole, uint64_t *off)
1614 {
1615 	dnode_t *dn;
1616 	int i, err;
1617 
1618 	dn = dnode_hold(os->os, object, FTAG);
1619 	/*
1620 	 * Sync any current changes before
1621 	 * we go trundling through the block pointers.
1622 	 */
1623 	for (i = 0; i < TXG_SIZE; i++) {
1624 		if (dn->dn_dirtyblksz[i])
1625 			break;
1626 	}
1627 	if (i != TXG_SIZE) {
1628 		dnode_rele(dn, FTAG);
1629 		txg_wait_synced(dmu_objset_pool(os), 0);
1630 		dn = dnode_hold(os->os, object, FTAG);
1631 	}
1632 
1633 	err = dnode_next_offset(dn, hole, off, 1, 1);
1634 	dnode_rele(dn, FTAG);
1635 
1636 	return (err);
1637 }
1638 
1639 void
1640 dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi)
1641 {
1642 	rw_enter(&dn->dn_struct_rwlock, RW_READER);
1643 	mutex_enter(&dn->dn_mtx);
1644 
1645 	doi->doi_data_block_size = dn->dn_datablksz;
1646 	doi->doi_metadata_block_size = dn->dn_indblkshift ?
1647 	    1ULL << dn->dn_indblkshift : 0;
1648 	doi->doi_indirection = dn->dn_nlevels;
1649 	doi->doi_checksum = dn->dn_checksum;
1650 	doi->doi_compress = dn->dn_compress;
1651 	doi->doi_physical_blks = dn->dn_phys->dn_secphys;
1652 	doi->doi_max_block_offset = dn->dn_phys->dn_maxblkid;
1653 	doi->doi_type = dn->dn_type;
1654 	doi->doi_bonus_size = dn->dn_bonuslen;
1655 	doi->doi_bonus_type = dn->dn_bonustype;
1656 
1657 	mutex_exit(&dn->dn_mtx);
1658 	rw_exit(&dn->dn_struct_rwlock);
1659 }
1660 
1661 /*
1662  * Get information on a DMU object.
1663  * If doi is NULL, just indicates whether the object exists.
1664  */
1665 int
1666 dmu_object_info(objset_t *os, uint64_t object, dmu_object_info_t *doi)
1667 {
1668 	dnode_t *dn = dnode_hold(os->os, object, FTAG);
1669 
1670 	if (dn == NULL)
1671 		return (ENOENT);
1672 
1673 	if (doi != NULL)
1674 		dmu_object_info_from_dnode(dn, doi);
1675 
1676 	dnode_rele(dn, FTAG);
1677 	return (0);
1678 }
1679 
1680 /*
1681  * As above, but faster; can be used when you have a held dbuf in hand.
1682  */
1683 void
1684 dmu_object_info_from_db(dmu_buf_t *db, dmu_object_info_t *doi)
1685 {
1686 	dmu_object_info_from_dnode(((dmu_buf_impl_t *)db)->db_dnode, doi);
1687 }
1688 
1689 /*
1690  * Faster still when you only care about the size.
1691  * This is specifically optimized for zfs_getattr().
1692  */
1693 void
1694 dmu_object_size_from_db(dmu_buf_t *db, uint32_t *blksize, u_longlong_t *nblk512)
1695 {
1696 	dnode_t *dn = ((dmu_buf_impl_t *)db)->db_dnode;
1697 
1698 	*blksize = dn->dn_datablksz;
1699 	*nblk512 = dn->dn_phys->dn_secphys + 1;	/* add 1 for dnode space */
1700 }
1701 
1702 void
1703 byteswap_uint64_array(void *vbuf, size_t size)
1704 {
1705 	uint64_t *buf = vbuf;
1706 	size_t count = size >> 3;
1707 	int i;
1708 
1709 	ASSERT((size & 7) == 0);
1710 
1711 	for (i = 0; i < count; i++)
1712 		buf[i] = BSWAP_64(buf[i]);
1713 }
1714 
1715 void
1716 byteswap_uint32_array(void *vbuf, size_t size)
1717 {
1718 	uint32_t *buf = vbuf;
1719 	size_t count = size >> 2;
1720 	int i;
1721 
1722 	ASSERT((size & 3) == 0);
1723 
1724 	for (i = 0; i < count; i++)
1725 		buf[i] = BSWAP_32(buf[i]);
1726 }
1727 
1728 void
1729 byteswap_uint16_array(void *vbuf, size_t size)
1730 {
1731 	uint16_t *buf = vbuf;
1732 	size_t count = size >> 1;
1733 	int i;
1734 
1735 	ASSERT((size & 1) == 0);
1736 
1737 	for (i = 0; i < count; i++)
1738 		buf[i] = BSWAP_16(buf[i]);
1739 }
1740 
1741 /* ARGSUSED */
1742 void
1743 byteswap_uint8_array(void *vbuf, size_t size)
1744 {
1745 }
1746 
1747 void
1748 dmu_init(void)
1749 {
1750 	dbuf_init();
1751 	dnode_init();
1752 	arc_init();
1753 }
1754 
1755 void
1756 dmu_fini(void)
1757 {
1758 	arc_fini();
1759 	dnode_fini();
1760 	dbuf_fini();
1761 }
1762