1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 23 * Copyright 2011 Nexenta Systems, Inc. All rights reserved. 24 * Copyright (c) 2011, 2014 by Delphix. All rights reserved. 25 * Copyright (c) 2014, Joyent, Inc. All rights reserved. 26 * Copyright 2014 HybridCluster. All rights reserved. 27 */ 28 29 #include <sys/dmu.h> 30 #include <sys/dmu_impl.h> 31 #include <sys/dmu_tx.h> 32 #include <sys/dbuf.h> 33 #include <sys/dnode.h> 34 #include <sys/zfs_context.h> 35 #include <sys/dmu_objset.h> 36 #include <sys/dmu_traverse.h> 37 #include <sys/dsl_dataset.h> 38 #include <sys/dsl_dir.h> 39 #include <sys/dsl_prop.h> 40 #include <sys/dsl_pool.h> 41 #include <sys/dsl_synctask.h> 42 #include <sys/zfs_ioctl.h> 43 #include <sys/zap.h> 44 #include <sys/zio_checksum.h> 45 #include <sys/zfs_znode.h> 46 #include <zfs_fletcher.h> 47 #include <sys/avl.h> 48 #include <sys/ddt.h> 49 #include <sys/zfs_onexit.h> 50 #include <sys/dmu_send.h> 51 #include <sys/dsl_destroy.h> 52 #include <sys/blkptr.h> 53 #include <sys/dsl_bookmark.h> 54 #include <sys/zfeature.h> 55 56 /* Set this tunable to TRUE to replace corrupt data with 0x2f5baddb10c */ 57 int zfs_send_corrupt_data = B_FALSE; 58 59 static char *dmu_recv_tag = "dmu_recv_tag"; 60 static const char *recv_clone_name = "%recv"; 61 62 static int 63 dump_bytes(dmu_sendarg_t *dsp, void *buf, int len) 64 { 65 dsl_dataset_t *ds = dsp->dsa_os->os_dsl_dataset; 66 ssize_t resid; /* have to get resid to get detailed errno */ 67 ASSERT0(len % 8); 68 69 fletcher_4_incremental_native(buf, len, &dsp->dsa_zc); 70 dsp->dsa_err = vn_rdwr(UIO_WRITE, dsp->dsa_vp, 71 (caddr_t)buf, len, 72 0, UIO_SYSSPACE, FAPPEND, RLIM64_INFINITY, CRED(), &resid); 73 74 mutex_enter(&ds->ds_sendstream_lock); 75 *dsp->dsa_off += len; 76 mutex_exit(&ds->ds_sendstream_lock); 77 78 return (dsp->dsa_err); 79 } 80 81 static int 82 dump_free(dmu_sendarg_t *dsp, uint64_t object, uint64_t offset, 83 uint64_t length) 84 { 85 struct drr_free *drrf = &(dsp->dsa_drr->drr_u.drr_free); 86 87 /* 88 * When we receive a free record, dbuf_free_range() assumes 89 * that the receiving system doesn't have any dbufs in the range 90 * being freed. This is always true because there is a one-record 91 * constraint: we only send one WRITE record for any given 92 * object+offset. We know that the one-record constraint is 93 * true because we always send data in increasing order by 94 * object,offset. 95 * 96 * If the increasing-order constraint ever changes, we should find 97 * another way to assert that the one-record constraint is still 98 * satisfied. 99 */ 100 ASSERT(object > dsp->dsa_last_data_object || 101 (object == dsp->dsa_last_data_object && 102 offset > dsp->dsa_last_data_offset)); 103 104 /* 105 * If we are doing a non-incremental send, then there can't 106 * be any data in the dataset we're receiving into. Therefore 107 * a free record would simply be a no-op. Save space by not 108 * sending it to begin with. 109 */ 110 if (!dsp->dsa_incremental) 111 return (0); 112 113 if (length != -1ULL && offset + length < offset) 114 length = -1ULL; 115 116 /* 117 * If there is a pending op, but it's not PENDING_FREE, push it out, 118 * since free block aggregation can only be done for blocks of the 119 * same type (i.e., DRR_FREE records can only be aggregated with 120 * other DRR_FREE records. DRR_FREEOBJECTS records can only be 121 * aggregated with other DRR_FREEOBJECTS records. 122 */ 123 if (dsp->dsa_pending_op != PENDING_NONE && 124 dsp->dsa_pending_op != PENDING_FREE) { 125 if (dump_bytes(dsp, dsp->dsa_drr, 126 sizeof (dmu_replay_record_t)) != 0) 127 return (SET_ERROR(EINTR)); 128 dsp->dsa_pending_op = PENDING_NONE; 129 } 130 131 if (dsp->dsa_pending_op == PENDING_FREE) { 132 /* 133 * There should never be a PENDING_FREE if length is -1 134 * (because dump_dnode is the only place where this 135 * function is called with a -1, and only after flushing 136 * any pending record). 137 */ 138 ASSERT(length != -1ULL); 139 /* 140 * Check to see whether this free block can be aggregated 141 * with pending one. 142 */ 143 if (drrf->drr_object == object && drrf->drr_offset + 144 drrf->drr_length == offset) { 145 drrf->drr_length += length; 146 return (0); 147 } else { 148 /* not a continuation. Push out pending record */ 149 if (dump_bytes(dsp, dsp->dsa_drr, 150 sizeof (dmu_replay_record_t)) != 0) 151 return (SET_ERROR(EINTR)); 152 dsp->dsa_pending_op = PENDING_NONE; 153 } 154 } 155 /* create a FREE record and make it pending */ 156 bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t)); 157 dsp->dsa_drr->drr_type = DRR_FREE; 158 drrf->drr_object = object; 159 drrf->drr_offset = offset; 160 drrf->drr_length = length; 161 drrf->drr_toguid = dsp->dsa_toguid; 162 if (length == -1ULL) { 163 if (dump_bytes(dsp, dsp->dsa_drr, 164 sizeof (dmu_replay_record_t)) != 0) 165 return (SET_ERROR(EINTR)); 166 } else { 167 dsp->dsa_pending_op = PENDING_FREE; 168 } 169 170 return (0); 171 } 172 173 static int 174 dump_write(dmu_sendarg_t *dsp, dmu_object_type_t type, 175 uint64_t object, uint64_t offset, int blksz, const blkptr_t *bp, void *data) 176 { 177 struct drr_write *drrw = &(dsp->dsa_drr->drr_u.drr_write); 178 179 /* 180 * We send data in increasing object, offset order. 181 * See comment in dump_free() for details. 182 */ 183 ASSERT(object > dsp->dsa_last_data_object || 184 (object == dsp->dsa_last_data_object && 185 offset > dsp->dsa_last_data_offset)); 186 dsp->dsa_last_data_object = object; 187 dsp->dsa_last_data_offset = offset + blksz - 1; 188 189 /* 190 * If there is any kind of pending aggregation (currently either 191 * a grouping of free objects or free blocks), push it out to 192 * the stream, since aggregation can't be done across operations 193 * of different types. 194 */ 195 if (dsp->dsa_pending_op != PENDING_NONE) { 196 if (dump_bytes(dsp, dsp->dsa_drr, 197 sizeof (dmu_replay_record_t)) != 0) 198 return (SET_ERROR(EINTR)); 199 dsp->dsa_pending_op = PENDING_NONE; 200 } 201 /* write a DATA record */ 202 bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t)); 203 dsp->dsa_drr->drr_type = DRR_WRITE; 204 drrw->drr_object = object; 205 drrw->drr_type = type; 206 drrw->drr_offset = offset; 207 drrw->drr_length = blksz; 208 drrw->drr_toguid = dsp->dsa_toguid; 209 if (bp == NULL || BP_IS_EMBEDDED(bp)) { 210 /* 211 * There's no pre-computed checksum for partial-block 212 * writes or embedded BP's, so (like 213 * fletcher4-checkummed blocks) userland will have to 214 * compute a dedup-capable checksum itself. 215 */ 216 drrw->drr_checksumtype = ZIO_CHECKSUM_OFF; 217 } else { 218 drrw->drr_checksumtype = BP_GET_CHECKSUM(bp); 219 if (zio_checksum_table[drrw->drr_checksumtype].ci_dedup) 220 drrw->drr_checksumflags |= DRR_CHECKSUM_DEDUP; 221 DDK_SET_LSIZE(&drrw->drr_key, BP_GET_LSIZE(bp)); 222 DDK_SET_PSIZE(&drrw->drr_key, BP_GET_PSIZE(bp)); 223 DDK_SET_COMPRESS(&drrw->drr_key, BP_GET_COMPRESS(bp)); 224 drrw->drr_key.ddk_cksum = bp->blk_cksum; 225 } 226 227 if (dump_bytes(dsp, dsp->dsa_drr, sizeof (dmu_replay_record_t)) != 0) 228 return (SET_ERROR(EINTR)); 229 if (dump_bytes(dsp, data, blksz) != 0) 230 return (SET_ERROR(EINTR)); 231 return (0); 232 } 233 234 static int 235 dump_write_embedded(dmu_sendarg_t *dsp, uint64_t object, uint64_t offset, 236 int blksz, const blkptr_t *bp) 237 { 238 char buf[BPE_PAYLOAD_SIZE]; 239 struct drr_write_embedded *drrw = 240 &(dsp->dsa_drr->drr_u.drr_write_embedded); 241 242 if (dsp->dsa_pending_op != PENDING_NONE) { 243 if (dump_bytes(dsp, dsp->dsa_drr, 244 sizeof (dmu_replay_record_t)) != 0) 245 return (EINTR); 246 dsp->dsa_pending_op = PENDING_NONE; 247 } 248 249 ASSERT(BP_IS_EMBEDDED(bp)); 250 251 bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t)); 252 dsp->dsa_drr->drr_type = DRR_WRITE_EMBEDDED; 253 drrw->drr_object = object; 254 drrw->drr_offset = offset; 255 drrw->drr_length = blksz; 256 drrw->drr_toguid = dsp->dsa_toguid; 257 drrw->drr_compression = BP_GET_COMPRESS(bp); 258 drrw->drr_etype = BPE_GET_ETYPE(bp); 259 drrw->drr_lsize = BPE_GET_LSIZE(bp); 260 drrw->drr_psize = BPE_GET_PSIZE(bp); 261 262 decode_embedded_bp_compressed(bp, buf); 263 264 if (dump_bytes(dsp, dsp->dsa_drr, sizeof (dmu_replay_record_t)) != 0) 265 return (EINTR); 266 if (dump_bytes(dsp, buf, P2ROUNDUP(drrw->drr_psize, 8)) != 0) 267 return (EINTR); 268 return (0); 269 } 270 271 static int 272 dump_spill(dmu_sendarg_t *dsp, uint64_t object, int blksz, void *data) 273 { 274 struct drr_spill *drrs = &(dsp->dsa_drr->drr_u.drr_spill); 275 276 if (dsp->dsa_pending_op != PENDING_NONE) { 277 if (dump_bytes(dsp, dsp->dsa_drr, 278 sizeof (dmu_replay_record_t)) != 0) 279 return (SET_ERROR(EINTR)); 280 dsp->dsa_pending_op = PENDING_NONE; 281 } 282 283 /* write a SPILL record */ 284 bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t)); 285 dsp->dsa_drr->drr_type = DRR_SPILL; 286 drrs->drr_object = object; 287 drrs->drr_length = blksz; 288 drrs->drr_toguid = dsp->dsa_toguid; 289 290 if (dump_bytes(dsp, dsp->dsa_drr, sizeof (dmu_replay_record_t))) 291 return (SET_ERROR(EINTR)); 292 if (dump_bytes(dsp, data, blksz)) 293 return (SET_ERROR(EINTR)); 294 return (0); 295 } 296 297 static int 298 dump_freeobjects(dmu_sendarg_t *dsp, uint64_t firstobj, uint64_t numobjs) 299 { 300 struct drr_freeobjects *drrfo = &(dsp->dsa_drr->drr_u.drr_freeobjects); 301 302 /* See comment in dump_free(). */ 303 if (!dsp->dsa_incremental) 304 return (0); 305 306 /* 307 * If there is a pending op, but it's not PENDING_FREEOBJECTS, 308 * push it out, since free block aggregation can only be done for 309 * blocks of the same type (i.e., DRR_FREE records can only be 310 * aggregated with other DRR_FREE records. DRR_FREEOBJECTS records 311 * can only be aggregated with other DRR_FREEOBJECTS records. 312 */ 313 if (dsp->dsa_pending_op != PENDING_NONE && 314 dsp->dsa_pending_op != PENDING_FREEOBJECTS) { 315 if (dump_bytes(dsp, dsp->dsa_drr, 316 sizeof (dmu_replay_record_t)) != 0) 317 return (SET_ERROR(EINTR)); 318 dsp->dsa_pending_op = PENDING_NONE; 319 } 320 if (dsp->dsa_pending_op == PENDING_FREEOBJECTS) { 321 /* 322 * See whether this free object array can be aggregated 323 * with pending one 324 */ 325 if (drrfo->drr_firstobj + drrfo->drr_numobjs == firstobj) { 326 drrfo->drr_numobjs += numobjs; 327 return (0); 328 } else { 329 /* can't be aggregated. Push out pending record */ 330 if (dump_bytes(dsp, dsp->dsa_drr, 331 sizeof (dmu_replay_record_t)) != 0) 332 return (SET_ERROR(EINTR)); 333 dsp->dsa_pending_op = PENDING_NONE; 334 } 335 } 336 337 /* write a FREEOBJECTS record */ 338 bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t)); 339 dsp->dsa_drr->drr_type = DRR_FREEOBJECTS; 340 drrfo->drr_firstobj = firstobj; 341 drrfo->drr_numobjs = numobjs; 342 drrfo->drr_toguid = dsp->dsa_toguid; 343 344 dsp->dsa_pending_op = PENDING_FREEOBJECTS; 345 346 return (0); 347 } 348 349 static int 350 dump_dnode(dmu_sendarg_t *dsp, uint64_t object, dnode_phys_t *dnp) 351 { 352 struct drr_object *drro = &(dsp->dsa_drr->drr_u.drr_object); 353 354 if (dnp == NULL || dnp->dn_type == DMU_OT_NONE) 355 return (dump_freeobjects(dsp, object, 1)); 356 357 if (dsp->dsa_pending_op != PENDING_NONE) { 358 if (dump_bytes(dsp, dsp->dsa_drr, 359 sizeof (dmu_replay_record_t)) != 0) 360 return (SET_ERROR(EINTR)); 361 dsp->dsa_pending_op = PENDING_NONE; 362 } 363 364 /* write an OBJECT record */ 365 bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t)); 366 dsp->dsa_drr->drr_type = DRR_OBJECT; 367 drro->drr_object = object; 368 drro->drr_type = dnp->dn_type; 369 drro->drr_bonustype = dnp->dn_bonustype; 370 drro->drr_blksz = dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT; 371 drro->drr_bonuslen = dnp->dn_bonuslen; 372 drro->drr_checksumtype = dnp->dn_checksum; 373 drro->drr_compress = dnp->dn_compress; 374 drro->drr_toguid = dsp->dsa_toguid; 375 376 if (!(dsp->dsa_featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS) && 377 drro->drr_blksz > SPA_OLD_MAXBLOCKSIZE) 378 drro->drr_blksz = SPA_OLD_MAXBLOCKSIZE; 379 380 if (dump_bytes(dsp, dsp->dsa_drr, sizeof (dmu_replay_record_t)) != 0) 381 return (SET_ERROR(EINTR)); 382 383 if (dump_bytes(dsp, DN_BONUS(dnp), P2ROUNDUP(dnp->dn_bonuslen, 8)) != 0) 384 return (SET_ERROR(EINTR)); 385 386 /* Free anything past the end of the file. */ 387 if (dump_free(dsp, object, (dnp->dn_maxblkid + 1) * 388 (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT), -1ULL) != 0) 389 return (SET_ERROR(EINTR)); 390 if (dsp->dsa_err != 0) 391 return (SET_ERROR(EINTR)); 392 return (0); 393 } 394 395 static boolean_t 396 backup_do_embed(dmu_sendarg_t *dsp, const blkptr_t *bp) 397 { 398 if (!BP_IS_EMBEDDED(bp)) 399 return (B_FALSE); 400 401 /* 402 * Compression function must be legacy, or explicitly enabled. 403 */ 404 if ((BP_GET_COMPRESS(bp) >= ZIO_COMPRESS_LEGACY_FUNCTIONS && 405 !(dsp->dsa_featureflags & DMU_BACKUP_FEATURE_EMBED_DATA_LZ4))) 406 return (B_FALSE); 407 408 /* 409 * Embed type must be explicitly enabled. 410 */ 411 switch (BPE_GET_ETYPE(bp)) { 412 case BP_EMBEDDED_TYPE_DATA: 413 if (dsp->dsa_featureflags & DMU_BACKUP_FEATURE_EMBED_DATA) 414 return (B_TRUE); 415 break; 416 default: 417 return (B_FALSE); 418 } 419 return (B_FALSE); 420 } 421 422 #define BP_SPAN(dnp, level) \ 423 (((uint64_t)dnp->dn_datablkszsec) << (SPA_MINBLOCKSHIFT + \ 424 (level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT))) 425 426 /* ARGSUSED */ 427 static int 428 backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, 429 const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg) 430 { 431 dmu_sendarg_t *dsp = arg; 432 dmu_object_type_t type = bp ? BP_GET_TYPE(bp) : DMU_OT_NONE; 433 int err = 0; 434 435 if (issig(JUSTLOOKING) && issig(FORREAL)) 436 return (SET_ERROR(EINTR)); 437 438 if (zb->zb_object != DMU_META_DNODE_OBJECT && 439 DMU_OBJECT_IS_SPECIAL(zb->zb_object)) { 440 return (0); 441 } else if (zb->zb_level == ZB_ZIL_LEVEL) { 442 /* 443 * If we are sending a non-snapshot (which is allowed on 444 * read-only pools), it may have a ZIL, which must be ignored. 445 */ 446 return (0); 447 } else if (BP_IS_HOLE(bp) && 448 zb->zb_object == DMU_META_DNODE_OBJECT) { 449 uint64_t span = BP_SPAN(dnp, zb->zb_level); 450 uint64_t dnobj = (zb->zb_blkid * span) >> DNODE_SHIFT; 451 err = dump_freeobjects(dsp, dnobj, span >> DNODE_SHIFT); 452 } else if (BP_IS_HOLE(bp)) { 453 uint64_t span = BP_SPAN(dnp, zb->zb_level); 454 err = dump_free(dsp, zb->zb_object, zb->zb_blkid * span, span); 455 } else if (zb->zb_level > 0 || type == DMU_OT_OBJSET) { 456 return (0); 457 } else if (type == DMU_OT_DNODE) { 458 dnode_phys_t *blk; 459 int i; 460 int blksz = BP_GET_LSIZE(bp); 461 uint32_t aflags = ARC_WAIT; 462 arc_buf_t *abuf; 463 464 if (arc_read(NULL, spa, bp, arc_getbuf_func, &abuf, 465 ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, 466 &aflags, zb) != 0) 467 return (SET_ERROR(EIO)); 468 469 blk = abuf->b_data; 470 for (i = 0; i < blksz >> DNODE_SHIFT; i++) { 471 uint64_t dnobj = (zb->zb_blkid << 472 (DNODE_BLOCK_SHIFT - DNODE_SHIFT)) + i; 473 err = dump_dnode(dsp, dnobj, blk+i); 474 if (err != 0) 475 break; 476 } 477 (void) arc_buf_remove_ref(abuf, &abuf); 478 } else if (type == DMU_OT_SA) { 479 uint32_t aflags = ARC_WAIT; 480 arc_buf_t *abuf; 481 int blksz = BP_GET_LSIZE(bp); 482 483 if (arc_read(NULL, spa, bp, arc_getbuf_func, &abuf, 484 ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, 485 &aflags, zb) != 0) 486 return (SET_ERROR(EIO)); 487 488 err = dump_spill(dsp, zb->zb_object, blksz, abuf->b_data); 489 (void) arc_buf_remove_ref(abuf, &abuf); 490 } else if (backup_do_embed(dsp, bp)) { 491 /* it's an embedded level-0 block of a regular object */ 492 int blksz = dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT; 493 err = dump_write_embedded(dsp, zb->zb_object, 494 zb->zb_blkid * blksz, blksz, bp); 495 } else { /* it's a level-0 block of a regular object */ 496 uint32_t aflags = ARC_WAIT; 497 arc_buf_t *abuf; 498 int blksz = BP_GET_LSIZE(bp); 499 uint64_t offset; 500 501 ASSERT3U(blksz, ==, dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT); 502 ASSERT0(zb->zb_level); 503 if (arc_read(NULL, spa, bp, arc_getbuf_func, &abuf, 504 ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, 505 &aflags, zb) != 0) { 506 if (zfs_send_corrupt_data) { 507 /* Send a block filled with 0x"zfs badd bloc" */ 508 abuf = arc_buf_alloc(spa, blksz, &abuf, 509 ARC_BUFC_DATA); 510 uint64_t *ptr; 511 for (ptr = abuf->b_data; 512 (char *)ptr < (char *)abuf->b_data + blksz; 513 ptr++) 514 *ptr = 0x2f5baddb10c; 515 } else { 516 return (SET_ERROR(EIO)); 517 } 518 } 519 520 offset = zb->zb_blkid * blksz; 521 522 if (!(dsp->dsa_featureflags & 523 DMU_BACKUP_FEATURE_LARGE_BLOCKS) && 524 blksz > SPA_OLD_MAXBLOCKSIZE) { 525 char *buf = abuf->b_data; 526 while (blksz > 0 && err == 0) { 527 int n = MIN(blksz, SPA_OLD_MAXBLOCKSIZE); 528 err = dump_write(dsp, type, zb->zb_object, 529 offset, n, NULL, buf); 530 offset += n; 531 buf += n; 532 blksz -= n; 533 } 534 } else { 535 err = dump_write(dsp, type, zb->zb_object, 536 offset, blksz, bp, abuf->b_data); 537 } 538 (void) arc_buf_remove_ref(abuf, &abuf); 539 } 540 541 ASSERT(err == 0 || err == EINTR); 542 return (err); 543 } 544 545 /* 546 * Releases dp using the specified tag. 547 */ 548 static int 549 dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *ds, 550 zfs_bookmark_phys_t *fromzb, boolean_t is_clone, boolean_t embedok, 551 boolean_t large_block_ok, int outfd, vnode_t *vp, offset_t *off) 552 { 553 objset_t *os; 554 dmu_replay_record_t *drr; 555 dmu_sendarg_t *dsp; 556 int err; 557 uint64_t fromtxg = 0; 558 uint64_t featureflags = 0; 559 560 err = dmu_objset_from_ds(ds, &os); 561 if (err != 0) { 562 dsl_pool_rele(dp, tag); 563 return (err); 564 } 565 566 drr = kmem_zalloc(sizeof (dmu_replay_record_t), KM_SLEEP); 567 drr->drr_type = DRR_BEGIN; 568 drr->drr_u.drr_begin.drr_magic = DMU_BACKUP_MAGIC; 569 DMU_SET_STREAM_HDRTYPE(drr->drr_u.drr_begin.drr_versioninfo, 570 DMU_SUBSTREAM); 571 572 #ifdef _KERNEL 573 if (dmu_objset_type(os) == DMU_OST_ZFS) { 574 uint64_t version; 575 if (zfs_get_zplprop(os, ZFS_PROP_VERSION, &version) != 0) { 576 kmem_free(drr, sizeof (dmu_replay_record_t)); 577 dsl_pool_rele(dp, tag); 578 return (SET_ERROR(EINVAL)); 579 } 580 if (version >= ZPL_VERSION_SA) { 581 featureflags |= DMU_BACKUP_FEATURE_SA_SPILL; 582 } 583 } 584 #endif 585 586 if (large_block_ok && ds->ds_large_blocks) 587 featureflags |= DMU_BACKUP_FEATURE_LARGE_BLOCKS; 588 if (embedok && 589 spa_feature_is_active(dp->dp_spa, SPA_FEATURE_EMBEDDED_DATA)) { 590 featureflags |= DMU_BACKUP_FEATURE_EMBED_DATA; 591 if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_LZ4_COMPRESS)) 592 featureflags |= DMU_BACKUP_FEATURE_EMBED_DATA_LZ4; 593 } else { 594 embedok = B_FALSE; 595 } 596 597 DMU_SET_FEATUREFLAGS(drr->drr_u.drr_begin.drr_versioninfo, 598 featureflags); 599 600 drr->drr_u.drr_begin.drr_creation_time = 601 ds->ds_phys->ds_creation_time; 602 drr->drr_u.drr_begin.drr_type = dmu_objset_type(os); 603 if (is_clone) 604 drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_CLONE; 605 drr->drr_u.drr_begin.drr_toguid = ds->ds_phys->ds_guid; 606 if (ds->ds_phys->ds_flags & DS_FLAG_CI_DATASET) 607 drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_CI_DATA; 608 609 if (fromzb != NULL) { 610 drr->drr_u.drr_begin.drr_fromguid = fromzb->zbm_guid; 611 fromtxg = fromzb->zbm_creation_txg; 612 } 613 dsl_dataset_name(ds, drr->drr_u.drr_begin.drr_toname); 614 if (!dsl_dataset_is_snapshot(ds)) { 615 (void) strlcat(drr->drr_u.drr_begin.drr_toname, "@--head--", 616 sizeof (drr->drr_u.drr_begin.drr_toname)); 617 } 618 619 dsp = kmem_zalloc(sizeof (dmu_sendarg_t), KM_SLEEP); 620 621 dsp->dsa_drr = drr; 622 dsp->dsa_vp = vp; 623 dsp->dsa_outfd = outfd; 624 dsp->dsa_proc = curproc; 625 dsp->dsa_os = os; 626 dsp->dsa_off = off; 627 dsp->dsa_toguid = ds->ds_phys->ds_guid; 628 ZIO_SET_CHECKSUM(&dsp->dsa_zc, 0, 0, 0, 0); 629 dsp->dsa_pending_op = PENDING_NONE; 630 dsp->dsa_incremental = (fromzb != NULL); 631 dsp->dsa_featureflags = featureflags; 632 633 mutex_enter(&ds->ds_sendstream_lock); 634 list_insert_head(&ds->ds_sendstreams, dsp); 635 mutex_exit(&ds->ds_sendstream_lock); 636 637 dsl_dataset_long_hold(ds, FTAG); 638 dsl_pool_rele(dp, tag); 639 640 if (dump_bytes(dsp, drr, sizeof (dmu_replay_record_t)) != 0) { 641 err = dsp->dsa_err; 642 goto out; 643 } 644 645 err = traverse_dataset(ds, fromtxg, TRAVERSE_PRE | TRAVERSE_PREFETCH, 646 backup_cb, dsp); 647 648 if (dsp->dsa_pending_op != PENDING_NONE) 649 if (dump_bytes(dsp, drr, sizeof (dmu_replay_record_t)) != 0) 650 err = SET_ERROR(EINTR); 651 652 if (err != 0) { 653 if (err == EINTR && dsp->dsa_err != 0) 654 err = dsp->dsa_err; 655 goto out; 656 } 657 658 bzero(drr, sizeof (dmu_replay_record_t)); 659 drr->drr_type = DRR_END; 660 drr->drr_u.drr_end.drr_checksum = dsp->dsa_zc; 661 drr->drr_u.drr_end.drr_toguid = dsp->dsa_toguid; 662 663 if (dump_bytes(dsp, drr, sizeof (dmu_replay_record_t)) != 0) { 664 err = dsp->dsa_err; 665 goto out; 666 } 667 668 out: 669 mutex_enter(&ds->ds_sendstream_lock); 670 list_remove(&ds->ds_sendstreams, dsp); 671 mutex_exit(&ds->ds_sendstream_lock); 672 673 kmem_free(drr, sizeof (dmu_replay_record_t)); 674 kmem_free(dsp, sizeof (dmu_sendarg_t)); 675 676 dsl_dataset_long_rele(ds, FTAG); 677 678 return (err); 679 } 680 681 int 682 dmu_send_obj(const char *pool, uint64_t tosnap, uint64_t fromsnap, 683 boolean_t embedok, boolean_t large_block_ok, 684 int outfd, vnode_t *vp, offset_t *off) 685 { 686 dsl_pool_t *dp; 687 dsl_dataset_t *ds; 688 dsl_dataset_t *fromds = NULL; 689 int err; 690 691 err = dsl_pool_hold(pool, FTAG, &dp); 692 if (err != 0) 693 return (err); 694 695 err = dsl_dataset_hold_obj(dp, tosnap, FTAG, &ds); 696 if (err != 0) { 697 dsl_pool_rele(dp, FTAG); 698 return (err); 699 } 700 701 if (fromsnap != 0) { 702 zfs_bookmark_phys_t zb; 703 boolean_t is_clone; 704 705 err = dsl_dataset_hold_obj(dp, fromsnap, FTAG, &fromds); 706 if (err != 0) { 707 dsl_dataset_rele(ds, FTAG); 708 dsl_pool_rele(dp, FTAG); 709 return (err); 710 } 711 if (!dsl_dataset_is_before(ds, fromds, 0)) 712 err = SET_ERROR(EXDEV); 713 zb.zbm_creation_time = fromds->ds_phys->ds_creation_time; 714 zb.zbm_creation_txg = fromds->ds_phys->ds_creation_txg; 715 zb.zbm_guid = fromds->ds_phys->ds_guid; 716 is_clone = (fromds->ds_dir != ds->ds_dir); 717 dsl_dataset_rele(fromds, FTAG); 718 err = dmu_send_impl(FTAG, dp, ds, &zb, is_clone, 719 embedok, large_block_ok, outfd, vp, off); 720 } else { 721 err = dmu_send_impl(FTAG, dp, ds, NULL, B_FALSE, 722 embedok, large_block_ok, outfd, vp, off); 723 } 724 dsl_dataset_rele(ds, FTAG); 725 return (err); 726 } 727 728 int 729 dmu_send(const char *tosnap, const char *fromsnap, 730 boolean_t embedok, boolean_t large_block_ok, 731 int outfd, vnode_t *vp, offset_t *off) 732 { 733 dsl_pool_t *dp; 734 dsl_dataset_t *ds; 735 int err; 736 boolean_t owned = B_FALSE; 737 738 if (fromsnap != NULL && strpbrk(fromsnap, "@#") == NULL) 739 return (SET_ERROR(EINVAL)); 740 741 err = dsl_pool_hold(tosnap, FTAG, &dp); 742 if (err != 0) 743 return (err); 744 745 if (strchr(tosnap, '@') == NULL && spa_writeable(dp->dp_spa)) { 746 /* 747 * We are sending a filesystem or volume. Ensure 748 * that it doesn't change by owning the dataset. 749 */ 750 err = dsl_dataset_own(dp, tosnap, FTAG, &ds); 751 owned = B_TRUE; 752 } else { 753 err = dsl_dataset_hold(dp, tosnap, FTAG, &ds); 754 } 755 if (err != 0) { 756 dsl_pool_rele(dp, FTAG); 757 return (err); 758 } 759 760 if (fromsnap != NULL) { 761 zfs_bookmark_phys_t zb; 762 boolean_t is_clone = B_FALSE; 763 int fsnamelen = strchr(tosnap, '@') - tosnap; 764 765 /* 766 * If the fromsnap is in a different filesystem, then 767 * mark the send stream as a clone. 768 */ 769 if (strncmp(tosnap, fromsnap, fsnamelen) != 0 || 770 (fromsnap[fsnamelen] != '@' && 771 fromsnap[fsnamelen] != '#')) { 772 is_clone = B_TRUE; 773 } 774 775 if (strchr(fromsnap, '@')) { 776 dsl_dataset_t *fromds; 777 err = dsl_dataset_hold(dp, fromsnap, FTAG, &fromds); 778 if (err == 0) { 779 if (!dsl_dataset_is_before(ds, fromds, 0)) 780 err = SET_ERROR(EXDEV); 781 zb.zbm_creation_time = 782 fromds->ds_phys->ds_creation_time; 783 zb.zbm_creation_txg = 784 fromds->ds_phys->ds_creation_txg; 785 zb.zbm_guid = fromds->ds_phys->ds_guid; 786 is_clone = (ds->ds_dir != fromds->ds_dir); 787 dsl_dataset_rele(fromds, FTAG); 788 } 789 } else { 790 err = dsl_bookmark_lookup(dp, fromsnap, ds, &zb); 791 } 792 if (err != 0) { 793 dsl_dataset_rele(ds, FTAG); 794 dsl_pool_rele(dp, FTAG); 795 return (err); 796 } 797 err = dmu_send_impl(FTAG, dp, ds, &zb, is_clone, 798 embedok, large_block_ok, outfd, vp, off); 799 } else { 800 err = dmu_send_impl(FTAG, dp, ds, NULL, B_FALSE, 801 embedok, large_block_ok, outfd, vp, off); 802 } 803 if (owned) 804 dsl_dataset_disown(ds, FTAG); 805 else 806 dsl_dataset_rele(ds, FTAG); 807 return (err); 808 } 809 810 int 811 dmu_send_estimate(dsl_dataset_t *ds, dsl_dataset_t *fromds, uint64_t *sizep) 812 { 813 dsl_pool_t *dp = ds->ds_dir->dd_pool; 814 int err; 815 uint64_t size; 816 817 ASSERT(dsl_pool_config_held(dp)); 818 819 /* tosnap must be a snapshot */ 820 if (!dsl_dataset_is_snapshot(ds)) 821 return (SET_ERROR(EINVAL)); 822 823 /* 824 * fromsnap must be an earlier snapshot from the same fs as tosnap, 825 * or the origin's fs. 826 */ 827 if (fromds != NULL && !dsl_dataset_is_before(ds, fromds, 0)) 828 return (SET_ERROR(EXDEV)); 829 830 /* Get uncompressed size estimate of changed data. */ 831 if (fromds == NULL) { 832 size = ds->ds_phys->ds_uncompressed_bytes; 833 } else { 834 uint64_t used, comp; 835 err = dsl_dataset_space_written(fromds, ds, 836 &used, &comp, &size); 837 if (err != 0) 838 return (err); 839 } 840 841 /* 842 * Assume that space (both on-disk and in-stream) is dominated by 843 * data. We will adjust for indirect blocks and the copies property, 844 * but ignore per-object space used (eg, dnodes and DRR_OBJECT records). 845 */ 846 847 /* 848 * Subtract out approximate space used by indirect blocks. 849 * Assume most space is used by data blocks (non-indirect, non-dnode). 850 * Assume all blocks are recordsize. Assume ditto blocks and 851 * internal fragmentation counter out compression. 852 * 853 * Therefore, space used by indirect blocks is sizeof(blkptr_t) per 854 * block, which we observe in practice. 855 */ 856 uint64_t recordsize; 857 err = dsl_prop_get_int_ds(ds, "recordsize", &recordsize); 858 if (err != 0) 859 return (err); 860 size -= size / recordsize * sizeof (blkptr_t); 861 862 /* Add in the space for the record associated with each block. */ 863 size += size / recordsize * sizeof (dmu_replay_record_t); 864 865 *sizep = size; 866 867 return (0); 868 } 869 870 typedef struct dmu_recv_begin_arg { 871 const char *drba_origin; 872 dmu_recv_cookie_t *drba_cookie; 873 cred_t *drba_cred; 874 uint64_t drba_snapobj; 875 } dmu_recv_begin_arg_t; 876 877 static int 878 recv_begin_check_existing_impl(dmu_recv_begin_arg_t *drba, dsl_dataset_t *ds, 879 uint64_t fromguid) 880 { 881 uint64_t val; 882 int error; 883 dsl_pool_t *dp = ds->ds_dir->dd_pool; 884 885 /* temporary clone name must not exist */ 886 error = zap_lookup(dp->dp_meta_objset, 887 ds->ds_dir->dd_phys->dd_child_dir_zapobj, recv_clone_name, 888 8, 1, &val); 889 if (error != ENOENT) 890 return (error == 0 ? EBUSY : error); 891 892 /* new snapshot name must not exist */ 893 error = zap_lookup(dp->dp_meta_objset, 894 ds->ds_phys->ds_snapnames_zapobj, drba->drba_cookie->drc_tosnap, 895 8, 1, &val); 896 if (error != ENOENT) 897 return (error == 0 ? EEXIST : error); 898 899 /* 900 * Check snapshot limit before receiving. We'll recheck again at the 901 * end, but might as well abort before receiving if we're already over 902 * the limit. 903 * 904 * Note that we do not check the file system limit with 905 * dsl_dir_fscount_check because the temporary %clones don't count 906 * against that limit. 907 */ 908 error = dsl_fs_ss_limit_check(ds->ds_dir, 1, ZFS_PROP_SNAPSHOT_LIMIT, 909 NULL, drba->drba_cred); 910 if (error != 0) 911 return (error); 912 913 if (fromguid != 0) { 914 dsl_dataset_t *snap; 915 uint64_t obj = ds->ds_phys->ds_prev_snap_obj; 916 917 /* Find snapshot in this dir that matches fromguid. */ 918 while (obj != 0) { 919 error = dsl_dataset_hold_obj(dp, obj, FTAG, 920 &snap); 921 if (error != 0) 922 return (SET_ERROR(ENODEV)); 923 if (snap->ds_dir != ds->ds_dir) { 924 dsl_dataset_rele(snap, FTAG); 925 return (SET_ERROR(ENODEV)); 926 } 927 if (snap->ds_phys->ds_guid == fromguid) 928 break; 929 obj = snap->ds_phys->ds_prev_snap_obj; 930 dsl_dataset_rele(snap, FTAG); 931 } 932 if (obj == 0) 933 return (SET_ERROR(ENODEV)); 934 935 if (drba->drba_cookie->drc_force) { 936 drba->drba_snapobj = obj; 937 } else { 938 /* 939 * If we are not forcing, there must be no 940 * changes since fromsnap. 941 */ 942 if (dsl_dataset_modified_since_snap(ds, snap)) { 943 dsl_dataset_rele(snap, FTAG); 944 return (SET_ERROR(ETXTBSY)); 945 } 946 drba->drba_snapobj = ds->ds_prev->ds_object; 947 } 948 949 dsl_dataset_rele(snap, FTAG); 950 } else { 951 /* if full, most recent snapshot must be $ORIGIN */ 952 if (ds->ds_phys->ds_prev_snap_txg >= TXG_INITIAL) 953 return (SET_ERROR(ENODEV)); 954 drba->drba_snapobj = ds->ds_phys->ds_prev_snap_obj; 955 } 956 957 return (0); 958 959 } 960 961 static int 962 dmu_recv_begin_check(void *arg, dmu_tx_t *tx) 963 { 964 dmu_recv_begin_arg_t *drba = arg; 965 dsl_pool_t *dp = dmu_tx_pool(tx); 966 struct drr_begin *drrb = drba->drba_cookie->drc_drrb; 967 uint64_t fromguid = drrb->drr_fromguid; 968 int flags = drrb->drr_flags; 969 int error; 970 uint64_t featureflags = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo); 971 dsl_dataset_t *ds; 972 const char *tofs = drba->drba_cookie->drc_tofs; 973 974 /* already checked */ 975 ASSERT3U(drrb->drr_magic, ==, DMU_BACKUP_MAGIC); 976 977 if (DMU_GET_STREAM_HDRTYPE(drrb->drr_versioninfo) == 978 DMU_COMPOUNDSTREAM || 979 drrb->drr_type >= DMU_OST_NUMTYPES || 980 ((flags & DRR_FLAG_CLONE) && drba->drba_origin == NULL)) 981 return (SET_ERROR(EINVAL)); 982 983 /* Verify pool version supports SA if SA_SPILL feature set */ 984 if ((featureflags & DMU_BACKUP_FEATURE_SA_SPILL) && 985 spa_version(dp->dp_spa) < SPA_VERSION_SA) 986 return (SET_ERROR(ENOTSUP)); 987 988 /* 989 * The receiving code doesn't know how to translate a WRITE_EMBEDDED 990 * record to a plan WRITE record, so the pool must have the 991 * EMBEDDED_DATA feature enabled if the stream has WRITE_EMBEDDED 992 * records. Same with WRITE_EMBEDDED records that use LZ4 compression. 993 */ 994 if ((featureflags & DMU_BACKUP_FEATURE_EMBED_DATA) && 995 !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_EMBEDDED_DATA)) 996 return (SET_ERROR(ENOTSUP)); 997 if ((featureflags & DMU_BACKUP_FEATURE_EMBED_DATA_LZ4) && 998 !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LZ4_COMPRESS)) 999 return (SET_ERROR(ENOTSUP)); 1000 1001 /* 1002 * The receiving code doesn't know how to translate large blocks 1003 * to smaller ones, so the pool must have the LARGE_BLOCKS 1004 * feature enabled if the stream has LARGE_BLOCKS. 1005 */ 1006 if ((featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS) && 1007 !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LARGE_BLOCKS)) 1008 return (SET_ERROR(ENOTSUP)); 1009 1010 error = dsl_dataset_hold(dp, tofs, FTAG, &ds); 1011 if (error == 0) { 1012 /* target fs already exists; recv into temp clone */ 1013 1014 /* Can't recv a clone into an existing fs */ 1015 if (flags & DRR_FLAG_CLONE) { 1016 dsl_dataset_rele(ds, FTAG); 1017 return (SET_ERROR(EINVAL)); 1018 } 1019 1020 error = recv_begin_check_existing_impl(drba, ds, fromguid); 1021 dsl_dataset_rele(ds, FTAG); 1022 } else if (error == ENOENT) { 1023 /* target fs does not exist; must be a full backup or clone */ 1024 char buf[MAXNAMELEN]; 1025 1026 /* 1027 * If it's a non-clone incremental, we are missing the 1028 * target fs, so fail the recv. 1029 */ 1030 if (fromguid != 0 && !(flags & DRR_FLAG_CLONE)) 1031 return (SET_ERROR(ENOENT)); 1032 1033 /* Open the parent of tofs */ 1034 ASSERT3U(strlen(tofs), <, MAXNAMELEN); 1035 (void) strlcpy(buf, tofs, strrchr(tofs, '/') - tofs + 1); 1036 error = dsl_dataset_hold(dp, buf, FTAG, &ds); 1037 if (error != 0) 1038 return (error); 1039 1040 /* 1041 * Check filesystem and snapshot limits before receiving. We'll 1042 * recheck snapshot limits again at the end (we create the 1043 * filesystems and increment those counts during begin_sync). 1044 */ 1045 error = dsl_fs_ss_limit_check(ds->ds_dir, 1, 1046 ZFS_PROP_FILESYSTEM_LIMIT, NULL, drba->drba_cred); 1047 if (error != 0) { 1048 dsl_dataset_rele(ds, FTAG); 1049 return (error); 1050 } 1051 1052 error = dsl_fs_ss_limit_check(ds->ds_dir, 1, 1053 ZFS_PROP_SNAPSHOT_LIMIT, NULL, drba->drba_cred); 1054 if (error != 0) { 1055 dsl_dataset_rele(ds, FTAG); 1056 return (error); 1057 } 1058 1059 if (drba->drba_origin != NULL) { 1060 dsl_dataset_t *origin; 1061 error = dsl_dataset_hold(dp, drba->drba_origin, 1062 FTAG, &origin); 1063 if (error != 0) { 1064 dsl_dataset_rele(ds, FTAG); 1065 return (error); 1066 } 1067 if (!dsl_dataset_is_snapshot(origin)) { 1068 dsl_dataset_rele(origin, FTAG); 1069 dsl_dataset_rele(ds, FTAG); 1070 return (SET_ERROR(EINVAL)); 1071 } 1072 if (origin->ds_phys->ds_guid != fromguid) { 1073 dsl_dataset_rele(origin, FTAG); 1074 dsl_dataset_rele(ds, FTAG); 1075 return (SET_ERROR(ENODEV)); 1076 } 1077 dsl_dataset_rele(origin, FTAG); 1078 } 1079 dsl_dataset_rele(ds, FTAG); 1080 error = 0; 1081 } 1082 return (error); 1083 } 1084 1085 static void 1086 dmu_recv_begin_sync(void *arg, dmu_tx_t *tx) 1087 { 1088 dmu_recv_begin_arg_t *drba = arg; 1089 dsl_pool_t *dp = dmu_tx_pool(tx); 1090 struct drr_begin *drrb = drba->drba_cookie->drc_drrb; 1091 const char *tofs = drba->drba_cookie->drc_tofs; 1092 dsl_dataset_t *ds, *newds; 1093 uint64_t dsobj; 1094 int error; 1095 uint64_t crflags; 1096 1097 crflags = (drrb->drr_flags & DRR_FLAG_CI_DATA) ? 1098 DS_FLAG_CI_DATASET : 0; 1099 1100 error = dsl_dataset_hold(dp, tofs, FTAG, &ds); 1101 if (error == 0) { 1102 /* create temporary clone */ 1103 dsl_dataset_t *snap = NULL; 1104 if (drba->drba_snapobj != 0) { 1105 VERIFY0(dsl_dataset_hold_obj(dp, 1106 drba->drba_snapobj, FTAG, &snap)); 1107 } 1108 dsobj = dsl_dataset_create_sync(ds->ds_dir, recv_clone_name, 1109 snap, crflags, drba->drba_cred, tx); 1110 dsl_dataset_rele(snap, FTAG); 1111 dsl_dataset_rele(ds, FTAG); 1112 } else { 1113 dsl_dir_t *dd; 1114 const char *tail; 1115 dsl_dataset_t *origin = NULL; 1116 1117 VERIFY0(dsl_dir_hold(dp, tofs, FTAG, &dd, &tail)); 1118 1119 if (drba->drba_origin != NULL) { 1120 VERIFY0(dsl_dataset_hold(dp, drba->drba_origin, 1121 FTAG, &origin)); 1122 } 1123 1124 /* Create new dataset. */ 1125 dsobj = dsl_dataset_create_sync(dd, 1126 strrchr(tofs, '/') + 1, 1127 origin, crflags, drba->drba_cred, tx); 1128 if (origin != NULL) 1129 dsl_dataset_rele(origin, FTAG); 1130 dsl_dir_rele(dd, FTAG); 1131 drba->drba_cookie->drc_newfs = B_TRUE; 1132 } 1133 VERIFY0(dsl_dataset_own_obj(dp, dsobj, dmu_recv_tag, &newds)); 1134 1135 if ((DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo) & 1136 DMU_BACKUP_FEATURE_LARGE_BLOCKS) && 1137 !newds->ds_large_blocks) { 1138 dsl_dataset_activate_large_blocks_sync_impl(dsobj, tx); 1139 newds->ds_large_blocks = B_TRUE; 1140 } 1141 1142 dmu_buf_will_dirty(newds->ds_dbuf, tx); 1143 newds->ds_phys->ds_flags |= DS_FLAG_INCONSISTENT; 1144 1145 /* 1146 * If we actually created a non-clone, we need to create the 1147 * objset in our new dataset. 1148 */ 1149 if (BP_IS_HOLE(dsl_dataset_get_blkptr(newds))) { 1150 (void) dmu_objset_create_impl(dp->dp_spa, 1151 newds, dsl_dataset_get_blkptr(newds), drrb->drr_type, tx); 1152 } 1153 1154 drba->drba_cookie->drc_ds = newds; 1155 1156 spa_history_log_internal_ds(newds, "receive", tx, ""); 1157 } 1158 1159 /* 1160 * NB: callers *MUST* call dmu_recv_stream() if dmu_recv_begin() 1161 * succeeds; otherwise we will leak the holds on the datasets. 1162 */ 1163 int 1164 dmu_recv_begin(char *tofs, char *tosnap, struct drr_begin *drrb, 1165 boolean_t force, char *origin, dmu_recv_cookie_t *drc) 1166 { 1167 dmu_recv_begin_arg_t drba = { 0 }; 1168 dmu_replay_record_t *drr; 1169 1170 bzero(drc, sizeof (dmu_recv_cookie_t)); 1171 drc->drc_drrb = drrb; 1172 drc->drc_tosnap = tosnap; 1173 drc->drc_tofs = tofs; 1174 drc->drc_force = force; 1175 drc->drc_cred = CRED(); 1176 1177 if (drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC)) 1178 drc->drc_byteswap = B_TRUE; 1179 else if (drrb->drr_magic != DMU_BACKUP_MAGIC) 1180 return (SET_ERROR(EINVAL)); 1181 1182 drr = kmem_zalloc(sizeof (dmu_replay_record_t), KM_SLEEP); 1183 drr->drr_type = DRR_BEGIN; 1184 drr->drr_u.drr_begin = *drc->drc_drrb; 1185 if (drc->drc_byteswap) { 1186 fletcher_4_incremental_byteswap(drr, 1187 sizeof (dmu_replay_record_t), &drc->drc_cksum); 1188 } else { 1189 fletcher_4_incremental_native(drr, 1190 sizeof (dmu_replay_record_t), &drc->drc_cksum); 1191 } 1192 kmem_free(drr, sizeof (dmu_replay_record_t)); 1193 1194 if (drc->drc_byteswap) { 1195 drrb->drr_magic = BSWAP_64(drrb->drr_magic); 1196 drrb->drr_versioninfo = BSWAP_64(drrb->drr_versioninfo); 1197 drrb->drr_creation_time = BSWAP_64(drrb->drr_creation_time); 1198 drrb->drr_type = BSWAP_32(drrb->drr_type); 1199 drrb->drr_toguid = BSWAP_64(drrb->drr_toguid); 1200 drrb->drr_fromguid = BSWAP_64(drrb->drr_fromguid); 1201 } 1202 1203 drba.drba_origin = origin; 1204 drba.drba_cookie = drc; 1205 drba.drba_cred = CRED(); 1206 1207 return (dsl_sync_task(tofs, dmu_recv_begin_check, dmu_recv_begin_sync, 1208 &drba, 5, ZFS_SPACE_CHECK_NORMAL)); 1209 } 1210 1211 struct restorearg { 1212 int err; 1213 boolean_t byteswap; 1214 vnode_t *vp; 1215 char *buf; 1216 uint64_t voff; 1217 int bufsize; /* amount of memory allocated for buf */ 1218 zio_cksum_t cksum; 1219 avl_tree_t *guid_to_ds_map; 1220 }; 1221 1222 typedef struct guid_map_entry { 1223 uint64_t guid; 1224 dsl_dataset_t *gme_ds; 1225 avl_node_t avlnode; 1226 } guid_map_entry_t; 1227 1228 static int 1229 guid_compare(const void *arg1, const void *arg2) 1230 { 1231 const guid_map_entry_t *gmep1 = arg1; 1232 const guid_map_entry_t *gmep2 = arg2; 1233 1234 if (gmep1->guid < gmep2->guid) 1235 return (-1); 1236 else if (gmep1->guid > gmep2->guid) 1237 return (1); 1238 return (0); 1239 } 1240 1241 static void 1242 free_guid_map_onexit(void *arg) 1243 { 1244 avl_tree_t *ca = arg; 1245 void *cookie = NULL; 1246 guid_map_entry_t *gmep; 1247 1248 while ((gmep = avl_destroy_nodes(ca, &cookie)) != NULL) { 1249 dsl_dataset_long_rele(gmep->gme_ds, gmep); 1250 dsl_dataset_rele(gmep->gme_ds, gmep); 1251 kmem_free(gmep, sizeof (guid_map_entry_t)); 1252 } 1253 avl_destroy(ca); 1254 kmem_free(ca, sizeof (avl_tree_t)); 1255 } 1256 1257 static void * 1258 restore_read(struct restorearg *ra, int len, char *buf) 1259 { 1260 int done = 0; 1261 1262 if (buf == NULL) 1263 buf = ra->buf; 1264 1265 /* some things will require 8-byte alignment, so everything must */ 1266 ASSERT0(len % 8); 1267 ASSERT3U(len, <=, ra->bufsize); 1268 1269 while (done < len) { 1270 ssize_t resid; 1271 1272 ra->err = vn_rdwr(UIO_READ, ra->vp, 1273 buf + done, len - done, 1274 ra->voff, UIO_SYSSPACE, FAPPEND, 1275 RLIM64_INFINITY, CRED(), &resid); 1276 1277 if (resid == len - done) 1278 ra->err = SET_ERROR(EINVAL); 1279 ra->voff += len - done - resid; 1280 done = len - resid; 1281 if (ra->err != 0) 1282 return (NULL); 1283 } 1284 1285 ASSERT3U(done, ==, len); 1286 if (ra->byteswap) 1287 fletcher_4_incremental_byteswap(buf, len, &ra->cksum); 1288 else 1289 fletcher_4_incremental_native(buf, len, &ra->cksum); 1290 return (buf); 1291 } 1292 1293 static void 1294 backup_byteswap(dmu_replay_record_t *drr) 1295 { 1296 #define DO64(X) (drr->drr_u.X = BSWAP_64(drr->drr_u.X)) 1297 #define DO32(X) (drr->drr_u.X = BSWAP_32(drr->drr_u.X)) 1298 drr->drr_type = BSWAP_32(drr->drr_type); 1299 drr->drr_payloadlen = BSWAP_32(drr->drr_payloadlen); 1300 switch (drr->drr_type) { 1301 case DRR_BEGIN: 1302 DO64(drr_begin.drr_magic); 1303 DO64(drr_begin.drr_versioninfo); 1304 DO64(drr_begin.drr_creation_time); 1305 DO32(drr_begin.drr_type); 1306 DO32(drr_begin.drr_flags); 1307 DO64(drr_begin.drr_toguid); 1308 DO64(drr_begin.drr_fromguid); 1309 break; 1310 case DRR_OBJECT: 1311 DO64(drr_object.drr_object); 1312 DO32(drr_object.drr_type); 1313 DO32(drr_object.drr_bonustype); 1314 DO32(drr_object.drr_blksz); 1315 DO32(drr_object.drr_bonuslen); 1316 DO64(drr_object.drr_toguid); 1317 break; 1318 case DRR_FREEOBJECTS: 1319 DO64(drr_freeobjects.drr_firstobj); 1320 DO64(drr_freeobjects.drr_numobjs); 1321 DO64(drr_freeobjects.drr_toguid); 1322 break; 1323 case DRR_WRITE: 1324 DO64(drr_write.drr_object); 1325 DO32(drr_write.drr_type); 1326 DO64(drr_write.drr_offset); 1327 DO64(drr_write.drr_length); 1328 DO64(drr_write.drr_toguid); 1329 DO64(drr_write.drr_key.ddk_cksum.zc_word[0]); 1330 DO64(drr_write.drr_key.ddk_cksum.zc_word[1]); 1331 DO64(drr_write.drr_key.ddk_cksum.zc_word[2]); 1332 DO64(drr_write.drr_key.ddk_cksum.zc_word[3]); 1333 DO64(drr_write.drr_key.ddk_prop); 1334 break; 1335 case DRR_WRITE_BYREF: 1336 DO64(drr_write_byref.drr_object); 1337 DO64(drr_write_byref.drr_offset); 1338 DO64(drr_write_byref.drr_length); 1339 DO64(drr_write_byref.drr_toguid); 1340 DO64(drr_write_byref.drr_refguid); 1341 DO64(drr_write_byref.drr_refobject); 1342 DO64(drr_write_byref.drr_refoffset); 1343 DO64(drr_write_byref.drr_key.ddk_cksum.zc_word[0]); 1344 DO64(drr_write_byref.drr_key.ddk_cksum.zc_word[1]); 1345 DO64(drr_write_byref.drr_key.ddk_cksum.zc_word[2]); 1346 DO64(drr_write_byref.drr_key.ddk_cksum.zc_word[3]); 1347 DO64(drr_write_byref.drr_key.ddk_prop); 1348 break; 1349 case DRR_WRITE_EMBEDDED: 1350 DO64(drr_write_embedded.drr_object); 1351 DO64(drr_write_embedded.drr_offset); 1352 DO64(drr_write_embedded.drr_length); 1353 DO64(drr_write_embedded.drr_toguid); 1354 DO32(drr_write_embedded.drr_lsize); 1355 DO32(drr_write_embedded.drr_psize); 1356 break; 1357 case DRR_FREE: 1358 DO64(drr_free.drr_object); 1359 DO64(drr_free.drr_offset); 1360 DO64(drr_free.drr_length); 1361 DO64(drr_free.drr_toguid); 1362 break; 1363 case DRR_SPILL: 1364 DO64(drr_spill.drr_object); 1365 DO64(drr_spill.drr_length); 1366 DO64(drr_spill.drr_toguid); 1367 break; 1368 case DRR_END: 1369 DO64(drr_end.drr_checksum.zc_word[0]); 1370 DO64(drr_end.drr_checksum.zc_word[1]); 1371 DO64(drr_end.drr_checksum.zc_word[2]); 1372 DO64(drr_end.drr_checksum.zc_word[3]); 1373 DO64(drr_end.drr_toguid); 1374 break; 1375 } 1376 #undef DO64 1377 #undef DO32 1378 } 1379 1380 static inline uint8_t 1381 deduce_nblkptr(dmu_object_type_t bonus_type, uint64_t bonus_size) 1382 { 1383 if (bonus_type == DMU_OT_SA) { 1384 return (1); 1385 } else { 1386 return (1 + 1387 ((DN_MAX_BONUSLEN - bonus_size) >> SPA_BLKPTRSHIFT)); 1388 } 1389 } 1390 1391 static int 1392 restore_object(struct restorearg *ra, objset_t *os, struct drr_object *drro) 1393 { 1394 dmu_object_info_t doi; 1395 dmu_tx_t *tx; 1396 void *data = NULL; 1397 uint64_t object; 1398 int err; 1399 1400 if (drro->drr_type == DMU_OT_NONE || 1401 !DMU_OT_IS_VALID(drro->drr_type) || 1402 !DMU_OT_IS_VALID(drro->drr_bonustype) || 1403 drro->drr_checksumtype >= ZIO_CHECKSUM_FUNCTIONS || 1404 drro->drr_compress >= ZIO_COMPRESS_FUNCTIONS || 1405 P2PHASE(drro->drr_blksz, SPA_MINBLOCKSIZE) || 1406 drro->drr_blksz < SPA_MINBLOCKSIZE || 1407 drro->drr_blksz > spa_maxblocksize(dmu_objset_spa(os)) || 1408 drro->drr_bonuslen > DN_MAX_BONUSLEN) { 1409 return (SET_ERROR(EINVAL)); 1410 } 1411 1412 err = dmu_object_info(os, drro->drr_object, &doi); 1413 1414 if (err != 0 && err != ENOENT) 1415 return (SET_ERROR(EINVAL)); 1416 object = err == 0 ? drro->drr_object : DMU_NEW_OBJECT; 1417 1418 if (drro->drr_bonuslen) { 1419 data = restore_read(ra, P2ROUNDUP(drro->drr_bonuslen, 8), NULL); 1420 if (ra->err != 0) 1421 return (ra->err); 1422 } 1423 1424 /* 1425 * If we are losing blkptrs or changing the block size this must 1426 * be a new file instance. We must clear out the previous file 1427 * contents before we can change this type of metadata in the dnode. 1428 */ 1429 if (err == 0) { 1430 int nblkptr; 1431 1432 nblkptr = deduce_nblkptr(drro->drr_bonustype, 1433 drro->drr_bonuslen); 1434 1435 if (drro->drr_blksz != doi.doi_data_block_size || 1436 nblkptr < doi.doi_nblkptr) { 1437 err = dmu_free_long_range(os, drro->drr_object, 1438 0, DMU_OBJECT_END); 1439 if (err != 0) 1440 return (SET_ERROR(EINVAL)); 1441 } 1442 } 1443 1444 tx = dmu_tx_create(os); 1445 dmu_tx_hold_bonus(tx, object); 1446 err = dmu_tx_assign(tx, TXG_WAIT); 1447 if (err != 0) { 1448 dmu_tx_abort(tx); 1449 return (err); 1450 } 1451 1452 if (object == DMU_NEW_OBJECT) { 1453 /* currently free, want to be allocated */ 1454 err = dmu_object_claim(os, drro->drr_object, 1455 drro->drr_type, drro->drr_blksz, 1456 drro->drr_bonustype, drro->drr_bonuslen, tx); 1457 } else if (drro->drr_type != doi.doi_type || 1458 drro->drr_blksz != doi.doi_data_block_size || 1459 drro->drr_bonustype != doi.doi_bonus_type || 1460 drro->drr_bonuslen != doi.doi_bonus_size) { 1461 /* currently allocated, but with different properties */ 1462 err = dmu_object_reclaim(os, drro->drr_object, 1463 drro->drr_type, drro->drr_blksz, 1464 drro->drr_bonustype, drro->drr_bonuslen, tx); 1465 } 1466 if (err != 0) { 1467 dmu_tx_commit(tx); 1468 return (SET_ERROR(EINVAL)); 1469 } 1470 1471 dmu_object_set_checksum(os, drro->drr_object, drro->drr_checksumtype, 1472 tx); 1473 dmu_object_set_compress(os, drro->drr_object, drro->drr_compress, tx); 1474 1475 if (data != NULL) { 1476 dmu_buf_t *db; 1477 1478 VERIFY(0 == dmu_bonus_hold(os, drro->drr_object, FTAG, &db)); 1479 dmu_buf_will_dirty(db, tx); 1480 1481 ASSERT3U(db->db_size, >=, drro->drr_bonuslen); 1482 bcopy(data, db->db_data, drro->drr_bonuslen); 1483 if (ra->byteswap) { 1484 dmu_object_byteswap_t byteswap = 1485 DMU_OT_BYTESWAP(drro->drr_bonustype); 1486 dmu_ot_byteswap[byteswap].ob_func(db->db_data, 1487 drro->drr_bonuslen); 1488 } 1489 dmu_buf_rele(db, FTAG); 1490 } 1491 dmu_tx_commit(tx); 1492 return (0); 1493 } 1494 1495 /* ARGSUSED */ 1496 static int 1497 restore_freeobjects(struct restorearg *ra, objset_t *os, 1498 struct drr_freeobjects *drrfo) 1499 { 1500 uint64_t obj; 1501 1502 if (drrfo->drr_firstobj + drrfo->drr_numobjs < drrfo->drr_firstobj) 1503 return (SET_ERROR(EINVAL)); 1504 1505 for (obj = drrfo->drr_firstobj; 1506 obj < drrfo->drr_firstobj + drrfo->drr_numobjs; 1507 (void) dmu_object_next(os, &obj, FALSE, 0)) { 1508 int err; 1509 1510 if (dmu_object_info(os, obj, NULL) != 0) 1511 continue; 1512 1513 err = dmu_free_long_object(os, obj); 1514 if (err != 0) 1515 return (err); 1516 } 1517 return (0); 1518 } 1519 1520 static int 1521 restore_write(struct restorearg *ra, objset_t *os, 1522 struct drr_write *drrw) 1523 { 1524 dmu_tx_t *tx; 1525 void *data; 1526 int err; 1527 1528 if (drrw->drr_offset + drrw->drr_length < drrw->drr_offset || 1529 !DMU_OT_IS_VALID(drrw->drr_type)) 1530 return (SET_ERROR(EINVAL)); 1531 1532 if (dmu_object_info(os, drrw->drr_object, NULL) != 0) 1533 return (SET_ERROR(EINVAL)); 1534 1535 dmu_buf_t *bonus; 1536 if (dmu_bonus_hold(os, drrw->drr_object, FTAG, &bonus) != 0) 1537 return (SET_ERROR(EINVAL)); 1538 1539 arc_buf_t *abuf = dmu_request_arcbuf(bonus, drrw->drr_length); 1540 1541 data = restore_read(ra, drrw->drr_length, abuf->b_data); 1542 if (data == NULL) { 1543 dmu_return_arcbuf(abuf); 1544 dmu_buf_rele(bonus, FTAG); 1545 return (ra->err); 1546 } 1547 1548 tx = dmu_tx_create(os); 1549 1550 dmu_tx_hold_write(tx, drrw->drr_object, 1551 drrw->drr_offset, drrw->drr_length); 1552 err = dmu_tx_assign(tx, TXG_WAIT); 1553 if (err != 0) { 1554 dmu_return_arcbuf(abuf); 1555 dmu_buf_rele(bonus, FTAG); 1556 dmu_tx_abort(tx); 1557 return (err); 1558 } 1559 if (ra->byteswap) { 1560 dmu_object_byteswap_t byteswap = 1561 DMU_OT_BYTESWAP(drrw->drr_type); 1562 dmu_ot_byteswap[byteswap].ob_func(data, drrw->drr_length); 1563 } 1564 dmu_assign_arcbuf(bonus, drrw->drr_offset, abuf, tx); 1565 dmu_tx_commit(tx); 1566 dmu_buf_rele(bonus, FTAG); 1567 return (0); 1568 } 1569 1570 /* 1571 * Handle a DRR_WRITE_BYREF record. This record is used in dedup'ed 1572 * streams to refer to a copy of the data that is already on the 1573 * system because it came in earlier in the stream. This function 1574 * finds the earlier copy of the data, and uses that copy instead of 1575 * data from the stream to fulfill this write. 1576 */ 1577 static int 1578 restore_write_byref(struct restorearg *ra, objset_t *os, 1579 struct drr_write_byref *drrwbr) 1580 { 1581 dmu_tx_t *tx; 1582 int err; 1583 guid_map_entry_t gmesrch; 1584 guid_map_entry_t *gmep; 1585 avl_index_t where; 1586 objset_t *ref_os = NULL; 1587 dmu_buf_t *dbp; 1588 1589 if (drrwbr->drr_offset + drrwbr->drr_length < drrwbr->drr_offset) 1590 return (SET_ERROR(EINVAL)); 1591 1592 /* 1593 * If the GUID of the referenced dataset is different from the 1594 * GUID of the target dataset, find the referenced dataset. 1595 */ 1596 if (drrwbr->drr_toguid != drrwbr->drr_refguid) { 1597 gmesrch.guid = drrwbr->drr_refguid; 1598 if ((gmep = avl_find(ra->guid_to_ds_map, &gmesrch, 1599 &where)) == NULL) { 1600 return (SET_ERROR(EINVAL)); 1601 } 1602 if (dmu_objset_from_ds(gmep->gme_ds, &ref_os)) 1603 return (SET_ERROR(EINVAL)); 1604 } else { 1605 ref_os = os; 1606 } 1607 1608 err = dmu_buf_hold(ref_os, drrwbr->drr_refobject, 1609 drrwbr->drr_refoffset, FTAG, &dbp, DMU_READ_PREFETCH); 1610 if (err != 0) 1611 return (err); 1612 1613 tx = dmu_tx_create(os); 1614 1615 dmu_tx_hold_write(tx, drrwbr->drr_object, 1616 drrwbr->drr_offset, drrwbr->drr_length); 1617 err = dmu_tx_assign(tx, TXG_WAIT); 1618 if (err != 0) { 1619 dmu_tx_abort(tx); 1620 return (err); 1621 } 1622 dmu_write(os, drrwbr->drr_object, 1623 drrwbr->drr_offset, drrwbr->drr_length, dbp->db_data, tx); 1624 dmu_buf_rele(dbp, FTAG); 1625 dmu_tx_commit(tx); 1626 return (0); 1627 } 1628 1629 static int 1630 restore_write_embedded(struct restorearg *ra, objset_t *os, 1631 struct drr_write_embedded *drrwnp) 1632 { 1633 dmu_tx_t *tx; 1634 int err; 1635 void *data; 1636 1637 if (drrwnp->drr_offset + drrwnp->drr_length < drrwnp->drr_offset) 1638 return (EINVAL); 1639 1640 if (drrwnp->drr_psize > BPE_PAYLOAD_SIZE) 1641 return (EINVAL); 1642 1643 if (drrwnp->drr_etype >= NUM_BP_EMBEDDED_TYPES) 1644 return (EINVAL); 1645 if (drrwnp->drr_compression >= ZIO_COMPRESS_FUNCTIONS) 1646 return (EINVAL); 1647 1648 data = restore_read(ra, P2ROUNDUP(drrwnp->drr_psize, 8), NULL); 1649 if (data == NULL) 1650 return (ra->err); 1651 1652 tx = dmu_tx_create(os); 1653 1654 dmu_tx_hold_write(tx, drrwnp->drr_object, 1655 drrwnp->drr_offset, drrwnp->drr_length); 1656 err = dmu_tx_assign(tx, TXG_WAIT); 1657 if (err != 0) { 1658 dmu_tx_abort(tx); 1659 return (err); 1660 } 1661 1662 dmu_write_embedded(os, drrwnp->drr_object, 1663 drrwnp->drr_offset, data, drrwnp->drr_etype, 1664 drrwnp->drr_compression, drrwnp->drr_lsize, drrwnp->drr_psize, 1665 ra->byteswap ^ ZFS_HOST_BYTEORDER, tx); 1666 1667 dmu_tx_commit(tx); 1668 return (0); 1669 } 1670 1671 static int 1672 restore_spill(struct restorearg *ra, objset_t *os, struct drr_spill *drrs) 1673 { 1674 dmu_tx_t *tx; 1675 void *data; 1676 dmu_buf_t *db, *db_spill; 1677 int err; 1678 1679 if (drrs->drr_length < SPA_MINBLOCKSIZE || 1680 drrs->drr_length > spa_maxblocksize(dmu_objset_spa(os))) 1681 return (SET_ERROR(EINVAL)); 1682 1683 data = restore_read(ra, drrs->drr_length, NULL); 1684 if (data == NULL) 1685 return (ra->err); 1686 1687 if (dmu_object_info(os, drrs->drr_object, NULL) != 0) 1688 return (SET_ERROR(EINVAL)); 1689 1690 VERIFY(0 == dmu_bonus_hold(os, drrs->drr_object, FTAG, &db)); 1691 if ((err = dmu_spill_hold_by_bonus(db, FTAG, &db_spill)) != 0) { 1692 dmu_buf_rele(db, FTAG); 1693 return (err); 1694 } 1695 1696 tx = dmu_tx_create(os); 1697 1698 dmu_tx_hold_spill(tx, db->db_object); 1699 1700 err = dmu_tx_assign(tx, TXG_WAIT); 1701 if (err != 0) { 1702 dmu_buf_rele(db, FTAG); 1703 dmu_buf_rele(db_spill, FTAG); 1704 dmu_tx_abort(tx); 1705 return (err); 1706 } 1707 dmu_buf_will_dirty(db_spill, tx); 1708 1709 if (db_spill->db_size < drrs->drr_length) 1710 VERIFY(0 == dbuf_spill_set_blksz(db_spill, 1711 drrs->drr_length, tx)); 1712 bcopy(data, db_spill->db_data, drrs->drr_length); 1713 1714 dmu_buf_rele(db, FTAG); 1715 dmu_buf_rele(db_spill, FTAG); 1716 1717 dmu_tx_commit(tx); 1718 return (0); 1719 } 1720 1721 /* ARGSUSED */ 1722 static int 1723 restore_free(struct restorearg *ra, objset_t *os, 1724 struct drr_free *drrf) 1725 { 1726 int err; 1727 1728 if (drrf->drr_length != -1ULL && 1729 drrf->drr_offset + drrf->drr_length < drrf->drr_offset) 1730 return (SET_ERROR(EINVAL)); 1731 1732 if (dmu_object_info(os, drrf->drr_object, NULL) != 0) 1733 return (SET_ERROR(EINVAL)); 1734 1735 err = dmu_free_long_range(os, drrf->drr_object, 1736 drrf->drr_offset, drrf->drr_length); 1737 return (err); 1738 } 1739 1740 /* used to destroy the drc_ds on error */ 1741 static void 1742 dmu_recv_cleanup_ds(dmu_recv_cookie_t *drc) 1743 { 1744 char name[MAXNAMELEN]; 1745 dsl_dataset_name(drc->drc_ds, name); 1746 dsl_dataset_disown(drc->drc_ds, dmu_recv_tag); 1747 (void) dsl_destroy_head(name); 1748 } 1749 1750 /* 1751 * NB: callers *must* call dmu_recv_end() if this succeeds. 1752 */ 1753 int 1754 dmu_recv_stream(dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp, 1755 int cleanup_fd, uint64_t *action_handlep) 1756 { 1757 struct restorearg ra = { 0 }; 1758 dmu_replay_record_t *drr; 1759 objset_t *os; 1760 zio_cksum_t pcksum; 1761 int featureflags; 1762 1763 ra.byteswap = drc->drc_byteswap; 1764 ra.cksum = drc->drc_cksum; 1765 ra.vp = vp; 1766 ra.voff = *voffp; 1767 ra.bufsize = SPA_MAXBLOCKSIZE; 1768 ra.buf = kmem_alloc(ra.bufsize, KM_SLEEP); 1769 1770 /* these were verified in dmu_recv_begin */ 1771 ASSERT3U(DMU_GET_STREAM_HDRTYPE(drc->drc_drrb->drr_versioninfo), ==, 1772 DMU_SUBSTREAM); 1773 ASSERT3U(drc->drc_drrb->drr_type, <, DMU_OST_NUMTYPES); 1774 1775 /* 1776 * Open the objset we are modifying. 1777 */ 1778 VERIFY0(dmu_objset_from_ds(drc->drc_ds, &os)); 1779 1780 ASSERT(drc->drc_ds->ds_phys->ds_flags & DS_FLAG_INCONSISTENT); 1781 1782 featureflags = DMU_GET_FEATUREFLAGS(drc->drc_drrb->drr_versioninfo); 1783 1784 /* if this stream is dedup'ed, set up the avl tree for guid mapping */ 1785 if (featureflags & DMU_BACKUP_FEATURE_DEDUP) { 1786 minor_t minor; 1787 1788 if (cleanup_fd == -1) { 1789 ra.err = SET_ERROR(EBADF); 1790 goto out; 1791 } 1792 ra.err = zfs_onexit_fd_hold(cleanup_fd, &minor); 1793 if (ra.err != 0) { 1794 cleanup_fd = -1; 1795 goto out; 1796 } 1797 1798 if (*action_handlep == 0) { 1799 ra.guid_to_ds_map = 1800 kmem_alloc(sizeof (avl_tree_t), KM_SLEEP); 1801 avl_create(ra.guid_to_ds_map, guid_compare, 1802 sizeof (guid_map_entry_t), 1803 offsetof(guid_map_entry_t, avlnode)); 1804 ra.err = zfs_onexit_add_cb(minor, 1805 free_guid_map_onexit, ra.guid_to_ds_map, 1806 action_handlep); 1807 if (ra.err != 0) 1808 goto out; 1809 } else { 1810 ra.err = zfs_onexit_cb_data(minor, *action_handlep, 1811 (void **)&ra.guid_to_ds_map); 1812 if (ra.err != 0) 1813 goto out; 1814 } 1815 1816 drc->drc_guid_to_ds_map = ra.guid_to_ds_map; 1817 } 1818 1819 /* 1820 * Read records and process them. 1821 */ 1822 pcksum = ra.cksum; 1823 while (ra.err == 0 && 1824 NULL != (drr = restore_read(&ra, sizeof (*drr), NULL))) { 1825 if (issig(JUSTLOOKING) && issig(FORREAL)) { 1826 ra.err = SET_ERROR(EINTR); 1827 goto out; 1828 } 1829 1830 if (ra.byteswap) 1831 backup_byteswap(drr); 1832 1833 switch (drr->drr_type) { 1834 case DRR_OBJECT: 1835 { 1836 /* 1837 * We need to make a copy of the record header, 1838 * because restore_{object,write} may need to 1839 * restore_read(), which will invalidate drr. 1840 */ 1841 struct drr_object drro = drr->drr_u.drr_object; 1842 ra.err = restore_object(&ra, os, &drro); 1843 break; 1844 } 1845 case DRR_FREEOBJECTS: 1846 { 1847 struct drr_freeobjects drrfo = 1848 drr->drr_u.drr_freeobjects; 1849 ra.err = restore_freeobjects(&ra, os, &drrfo); 1850 break; 1851 } 1852 case DRR_WRITE: 1853 { 1854 struct drr_write drrw = drr->drr_u.drr_write; 1855 ra.err = restore_write(&ra, os, &drrw); 1856 break; 1857 } 1858 case DRR_WRITE_BYREF: 1859 { 1860 struct drr_write_byref drrwbr = 1861 drr->drr_u.drr_write_byref; 1862 ra.err = restore_write_byref(&ra, os, &drrwbr); 1863 break; 1864 } 1865 case DRR_WRITE_EMBEDDED: 1866 { 1867 struct drr_write_embedded drrwe = 1868 drr->drr_u.drr_write_embedded; 1869 ra.err = restore_write_embedded(&ra, os, &drrwe); 1870 break; 1871 } 1872 case DRR_FREE: 1873 { 1874 struct drr_free drrf = drr->drr_u.drr_free; 1875 ra.err = restore_free(&ra, os, &drrf); 1876 break; 1877 } 1878 case DRR_END: 1879 { 1880 struct drr_end drre = drr->drr_u.drr_end; 1881 /* 1882 * We compare against the *previous* checksum 1883 * value, because the stored checksum is of 1884 * everything before the DRR_END record. 1885 */ 1886 if (!ZIO_CHECKSUM_EQUAL(drre.drr_checksum, pcksum)) 1887 ra.err = SET_ERROR(ECKSUM); 1888 goto out; 1889 } 1890 case DRR_SPILL: 1891 { 1892 struct drr_spill drrs = drr->drr_u.drr_spill; 1893 ra.err = restore_spill(&ra, os, &drrs); 1894 break; 1895 } 1896 default: 1897 ra.err = SET_ERROR(EINVAL); 1898 goto out; 1899 } 1900 pcksum = ra.cksum; 1901 } 1902 ASSERT(ra.err != 0); 1903 1904 out: 1905 if ((featureflags & DMU_BACKUP_FEATURE_DEDUP) && (cleanup_fd != -1)) 1906 zfs_onexit_fd_rele(cleanup_fd); 1907 1908 if (ra.err != 0) { 1909 /* 1910 * destroy what we created, so we don't leave it in the 1911 * inconsistent restoring state. 1912 */ 1913 dmu_recv_cleanup_ds(drc); 1914 } 1915 1916 kmem_free(ra.buf, ra.bufsize); 1917 *voffp = ra.voff; 1918 return (ra.err); 1919 } 1920 1921 static int 1922 dmu_recv_end_check(void *arg, dmu_tx_t *tx) 1923 { 1924 dmu_recv_cookie_t *drc = arg; 1925 dsl_pool_t *dp = dmu_tx_pool(tx); 1926 int error; 1927 1928 ASSERT3P(drc->drc_ds->ds_owner, ==, dmu_recv_tag); 1929 1930 if (!drc->drc_newfs) { 1931 dsl_dataset_t *origin_head; 1932 1933 error = dsl_dataset_hold(dp, drc->drc_tofs, FTAG, &origin_head); 1934 if (error != 0) 1935 return (error); 1936 if (drc->drc_force) { 1937 /* 1938 * We will destroy any snapshots in tofs (i.e. before 1939 * origin_head) that are after the origin (which is 1940 * the snap before drc_ds, because drc_ds can not 1941 * have any snaps of its own). 1942 */ 1943 uint64_t obj = origin_head->ds_phys->ds_prev_snap_obj; 1944 while (obj != drc->drc_ds->ds_phys->ds_prev_snap_obj) { 1945 dsl_dataset_t *snap; 1946 error = dsl_dataset_hold_obj(dp, obj, FTAG, 1947 &snap); 1948 if (error != 0) 1949 return (error); 1950 if (snap->ds_dir != origin_head->ds_dir) 1951 error = SET_ERROR(EINVAL); 1952 if (error == 0) { 1953 error = dsl_destroy_snapshot_check_impl( 1954 snap, B_FALSE); 1955 } 1956 obj = snap->ds_phys->ds_prev_snap_obj; 1957 dsl_dataset_rele(snap, FTAG); 1958 if (error != 0) 1959 return (error); 1960 } 1961 } 1962 error = dsl_dataset_clone_swap_check_impl(drc->drc_ds, 1963 origin_head, drc->drc_force, drc->drc_owner, tx); 1964 if (error != 0) { 1965 dsl_dataset_rele(origin_head, FTAG); 1966 return (error); 1967 } 1968 error = dsl_dataset_snapshot_check_impl(origin_head, 1969 drc->drc_tosnap, tx, B_TRUE, 1, drc->drc_cred); 1970 dsl_dataset_rele(origin_head, FTAG); 1971 if (error != 0) 1972 return (error); 1973 1974 error = dsl_destroy_head_check_impl(drc->drc_ds, 1); 1975 } else { 1976 error = dsl_dataset_snapshot_check_impl(drc->drc_ds, 1977 drc->drc_tosnap, tx, B_TRUE, 1, drc->drc_cred); 1978 } 1979 return (error); 1980 } 1981 1982 static void 1983 dmu_recv_end_sync(void *arg, dmu_tx_t *tx) 1984 { 1985 dmu_recv_cookie_t *drc = arg; 1986 dsl_pool_t *dp = dmu_tx_pool(tx); 1987 1988 spa_history_log_internal_ds(drc->drc_ds, "finish receiving", 1989 tx, "snap=%s", drc->drc_tosnap); 1990 1991 if (!drc->drc_newfs) { 1992 dsl_dataset_t *origin_head; 1993 1994 VERIFY0(dsl_dataset_hold(dp, drc->drc_tofs, FTAG, 1995 &origin_head)); 1996 1997 if (drc->drc_force) { 1998 /* 1999 * Destroy any snapshots of drc_tofs (origin_head) 2000 * after the origin (the snap before drc_ds). 2001 */ 2002 uint64_t obj = origin_head->ds_phys->ds_prev_snap_obj; 2003 while (obj != drc->drc_ds->ds_phys->ds_prev_snap_obj) { 2004 dsl_dataset_t *snap; 2005 VERIFY0(dsl_dataset_hold_obj(dp, obj, FTAG, 2006 &snap)); 2007 ASSERT3P(snap->ds_dir, ==, origin_head->ds_dir); 2008 obj = snap->ds_phys->ds_prev_snap_obj; 2009 dsl_destroy_snapshot_sync_impl(snap, 2010 B_FALSE, tx); 2011 dsl_dataset_rele(snap, FTAG); 2012 } 2013 } 2014 VERIFY3P(drc->drc_ds->ds_prev, ==, 2015 origin_head->ds_prev); 2016 2017 dsl_dataset_clone_swap_sync_impl(drc->drc_ds, 2018 origin_head, tx); 2019 dsl_dataset_snapshot_sync_impl(origin_head, 2020 drc->drc_tosnap, tx); 2021 2022 /* set snapshot's creation time and guid */ 2023 dmu_buf_will_dirty(origin_head->ds_prev->ds_dbuf, tx); 2024 origin_head->ds_prev->ds_phys->ds_creation_time = 2025 drc->drc_drrb->drr_creation_time; 2026 origin_head->ds_prev->ds_phys->ds_guid = 2027 drc->drc_drrb->drr_toguid; 2028 origin_head->ds_prev->ds_phys->ds_flags &= 2029 ~DS_FLAG_INCONSISTENT; 2030 2031 dmu_buf_will_dirty(origin_head->ds_dbuf, tx); 2032 origin_head->ds_phys->ds_flags &= ~DS_FLAG_INCONSISTENT; 2033 2034 dsl_dataset_rele(origin_head, FTAG); 2035 dsl_destroy_head_sync_impl(drc->drc_ds, tx); 2036 2037 if (drc->drc_owner != NULL) 2038 VERIFY3P(origin_head->ds_owner, ==, drc->drc_owner); 2039 } else { 2040 dsl_dataset_t *ds = drc->drc_ds; 2041 2042 dsl_dataset_snapshot_sync_impl(ds, drc->drc_tosnap, tx); 2043 2044 /* set snapshot's creation time and guid */ 2045 dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx); 2046 ds->ds_prev->ds_phys->ds_creation_time = 2047 drc->drc_drrb->drr_creation_time; 2048 ds->ds_prev->ds_phys->ds_guid = drc->drc_drrb->drr_toguid; 2049 ds->ds_prev->ds_phys->ds_flags &= ~DS_FLAG_INCONSISTENT; 2050 2051 dmu_buf_will_dirty(ds->ds_dbuf, tx); 2052 ds->ds_phys->ds_flags &= ~DS_FLAG_INCONSISTENT; 2053 } 2054 drc->drc_newsnapobj = drc->drc_ds->ds_phys->ds_prev_snap_obj; 2055 /* 2056 * Release the hold from dmu_recv_begin. This must be done before 2057 * we return to open context, so that when we free the dataset's dnode, 2058 * we can evict its bonus buffer. 2059 */ 2060 dsl_dataset_disown(drc->drc_ds, dmu_recv_tag); 2061 drc->drc_ds = NULL; 2062 } 2063 2064 static int 2065 add_ds_to_guidmap(const char *name, avl_tree_t *guid_map, uint64_t snapobj) 2066 { 2067 dsl_pool_t *dp; 2068 dsl_dataset_t *snapds; 2069 guid_map_entry_t *gmep; 2070 int err; 2071 2072 ASSERT(guid_map != NULL); 2073 2074 err = dsl_pool_hold(name, FTAG, &dp); 2075 if (err != 0) 2076 return (err); 2077 gmep = kmem_alloc(sizeof (*gmep), KM_SLEEP); 2078 err = dsl_dataset_hold_obj(dp, snapobj, gmep, &snapds); 2079 if (err == 0) { 2080 gmep->guid = snapds->ds_phys->ds_guid; 2081 gmep->gme_ds = snapds; 2082 avl_add(guid_map, gmep); 2083 dsl_dataset_long_hold(snapds, gmep); 2084 } else { 2085 kmem_free(gmep, sizeof (*gmep)); 2086 } 2087 2088 dsl_pool_rele(dp, FTAG); 2089 return (err); 2090 } 2091 2092 static int dmu_recv_end_modified_blocks = 3; 2093 2094 static int 2095 dmu_recv_existing_end(dmu_recv_cookie_t *drc) 2096 { 2097 int error; 2098 char name[MAXNAMELEN]; 2099 2100 #ifdef _KERNEL 2101 /* 2102 * We will be destroying the ds; make sure its origin is unmounted if 2103 * necessary. 2104 */ 2105 dsl_dataset_name(drc->drc_ds, name); 2106 zfs_destroy_unmount_origin(name); 2107 #endif 2108 2109 error = dsl_sync_task(drc->drc_tofs, 2110 dmu_recv_end_check, dmu_recv_end_sync, drc, 2111 dmu_recv_end_modified_blocks, ZFS_SPACE_CHECK_NORMAL); 2112 2113 if (error != 0) 2114 dmu_recv_cleanup_ds(drc); 2115 return (error); 2116 } 2117 2118 static int 2119 dmu_recv_new_end(dmu_recv_cookie_t *drc) 2120 { 2121 int error; 2122 2123 error = dsl_sync_task(drc->drc_tofs, 2124 dmu_recv_end_check, dmu_recv_end_sync, drc, 2125 dmu_recv_end_modified_blocks, ZFS_SPACE_CHECK_NORMAL); 2126 2127 if (error != 0) { 2128 dmu_recv_cleanup_ds(drc); 2129 } else if (drc->drc_guid_to_ds_map != NULL) { 2130 (void) add_ds_to_guidmap(drc->drc_tofs, 2131 drc->drc_guid_to_ds_map, 2132 drc->drc_newsnapobj); 2133 } 2134 return (error); 2135 } 2136 2137 int 2138 dmu_recv_end(dmu_recv_cookie_t *drc, void *owner) 2139 { 2140 drc->drc_owner = owner; 2141 2142 if (drc->drc_newfs) 2143 return (dmu_recv_new_end(drc)); 2144 else 2145 return (dmu_recv_existing_end(drc)); 2146 } 2147 2148 /* 2149 * Return TRUE if this objset is currently being received into. 2150 */ 2151 boolean_t 2152 dmu_objset_is_receiving(objset_t *os) 2153 { 2154 return (os->os_dsl_dataset != NULL && 2155 os->os_dsl_dataset->ds_owner == dmu_recv_tag); 2156 } 2157