1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 23 * Copyright 2011 Nexenta Systems, Inc. All rights reserved. 24 * Copyright (c) 2011, 2014 by Delphix. All rights reserved. 25 * Copyright (c) 2014, Joyent, Inc. All rights reserved. 26 * Copyright 2014 HybridCluster. All rights reserved. 27 */ 28 29 #include <sys/dmu.h> 30 #include <sys/dmu_impl.h> 31 #include <sys/dmu_tx.h> 32 #include <sys/dbuf.h> 33 #include <sys/dnode.h> 34 #include <sys/zfs_context.h> 35 #include <sys/dmu_objset.h> 36 #include <sys/dmu_traverse.h> 37 #include <sys/dsl_dataset.h> 38 #include <sys/dsl_dir.h> 39 #include <sys/dsl_prop.h> 40 #include <sys/dsl_pool.h> 41 #include <sys/dsl_synctask.h> 42 #include <sys/zfs_ioctl.h> 43 #include <sys/zap.h> 44 #include <sys/zio_checksum.h> 45 #include <sys/zfs_znode.h> 46 #include <zfs_fletcher.h> 47 #include <sys/avl.h> 48 #include <sys/ddt.h> 49 #include <sys/zfs_onexit.h> 50 #include <sys/dmu_send.h> 51 #include <sys/dsl_destroy.h> 52 #include <sys/blkptr.h> 53 #include <sys/dsl_bookmark.h> 54 #include <sys/zfeature.h> 55 56 /* Set this tunable to TRUE to replace corrupt data with 0x2f5baddb10c */ 57 int zfs_send_corrupt_data = B_FALSE; 58 59 static char *dmu_recv_tag = "dmu_recv_tag"; 60 static const char *recv_clone_name = "%recv"; 61 62 static int 63 dump_bytes(dmu_sendarg_t *dsp, void *buf, int len) 64 { 65 dsl_dataset_t *ds = dsp->dsa_os->os_dsl_dataset; 66 ssize_t resid; /* have to get resid to get detailed errno */ 67 ASSERT0(len % 8); 68 69 dsp->dsa_err = vn_rdwr(UIO_WRITE, dsp->dsa_vp, 70 (caddr_t)buf, len, 71 0, UIO_SYSSPACE, FAPPEND, RLIM64_INFINITY, CRED(), &resid); 72 73 mutex_enter(&ds->ds_sendstream_lock); 74 *dsp->dsa_off += len; 75 mutex_exit(&ds->ds_sendstream_lock); 76 77 return (dsp->dsa_err); 78 } 79 80 /* 81 * For all record types except BEGIN, fill in the checksum (overlaid in 82 * drr_u.drr_checksum.drr_checksum). The checksum verifies everything 83 * up to the start of the checksum itself. 84 */ 85 static int 86 dump_record(dmu_sendarg_t *dsp, void *payload, int payload_len) 87 { 88 ASSERT3U(offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum), 89 ==, sizeof (dmu_replay_record_t) - sizeof (zio_cksum_t)); 90 fletcher_4_incremental_native(dsp->dsa_drr, 91 offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum), 92 &dsp->dsa_zc); 93 if (dsp->dsa_drr->drr_type != DRR_BEGIN) { 94 ASSERT(ZIO_CHECKSUM_IS_ZERO(&dsp->dsa_drr->drr_u. 
	    drr_checksum.drr_checksum));
		dsp->dsa_drr->drr_u.drr_checksum.drr_checksum = dsp->dsa_zc;
	}
	fletcher_4_incremental_native(&dsp->dsa_drr->
	    drr_u.drr_checksum.drr_checksum,
	    sizeof (zio_cksum_t), &dsp->dsa_zc);
	if (dump_bytes(dsp, dsp->dsa_drr, sizeof (dmu_replay_record_t)) != 0)
		return (SET_ERROR(EINTR));
	if (payload_len != 0) {
		fletcher_4_incremental_native(payload, payload_len,
		    &dsp->dsa_zc);
		if (dump_bytes(dsp, payload, payload_len) != 0)
			return (SET_ERROR(EINTR));
	}
	return (0);
}

static int
dump_free(dmu_sendarg_t *dsp, uint64_t object, uint64_t offset,
    uint64_t length)
{
	struct drr_free *drrf = &(dsp->dsa_drr->drr_u.drr_free);

	/*
	 * When we receive a free record, dbuf_free_range() assumes
	 * that the receiving system doesn't have any dbufs in the range
	 * being freed.  This is always true because there is a one-record
	 * constraint: we only send one WRITE record for any given
	 * object+offset.  We know that the one-record constraint is
	 * true because we always send data in increasing order by
	 * object,offset.
	 *
	 * If the increasing-order constraint ever changes, we should find
	 * another way to assert that the one-record constraint is still
	 * satisfied.
	 */
	ASSERT(object > dsp->dsa_last_data_object ||
	    (object == dsp->dsa_last_data_object &&
	    offset > dsp->dsa_last_data_offset));

	/*
	 * If we are doing a non-incremental send, then there can't
	 * be any data in the dataset we're receiving into.  Therefore
	 * a free record would simply be a no-op.  Save space by not
	 * sending it to begin with.
	 */
	if (!dsp->dsa_incremental)
		return (0);

	if (length != -1ULL && offset + length < offset)
		length = -1ULL;

	/*
	 * If there is a pending op, but it's not PENDING_FREE, push it out,
	 * since free block aggregation can only be done for blocks of the
	 * same type (i.e., DRR_FREE records can only be aggregated with
	 * other DRR_FREE records; DRR_FREEOBJECTS records can only be
	 * aggregated with other DRR_FREEOBJECTS records).
	 */
	if (dsp->dsa_pending_op != PENDING_NONE &&
	    dsp->dsa_pending_op != PENDING_FREE) {
		if (dump_record(dsp, NULL, 0) != 0)
			return (SET_ERROR(EINTR));
		dsp->dsa_pending_op = PENDING_NONE;
	}

	if (dsp->dsa_pending_op == PENDING_FREE) {
		/*
		 * There should never be a PENDING_FREE if length is -1
		 * (because dump_dnode is the only place where this
		 * function is called with a -1, and only after flushing
		 * any pending record).
		 */
		ASSERT(length != -1ULL);
		/*
		 * Check to see whether this free block can be aggregated
		 * with the pending one.
		 */
		if (drrf->drr_object == object && drrf->drr_offset +
		    drrf->drr_length == offset) {
			drrf->drr_length += length;
			return (0);
		} else {
			/* not a continuation.  Push out pending record */
			if (dump_record(dsp, NULL, 0) != 0)
				return (SET_ERROR(EINTR));
			dsp->dsa_pending_op = PENDING_NONE;
		}
	}
	/* create a FREE record and make it pending */
	bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t));
	dsp->dsa_drr->drr_type = DRR_FREE;
	drrf->drr_object = object;
	drrf->drr_offset = offset;
	drrf->drr_length = length;
	drrf->drr_toguid = dsp->dsa_toguid;
	if (length == -1ULL) {
		if (dump_record(dsp, NULL, 0) != 0)
			return (SET_ERROR(EINTR));
	} else {
		dsp->dsa_pending_op = PENDING_FREE;
	}

	return (0);
}
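
/*
 * Illustrative example (added commentary, not part of the original code):
 * because frees are emitted in increasing object/offset order, adjacent
 * holes coalesce in the pending record built above.  Freeing object 5 at
 * offset 0 for 4096 bytes and then at offset 4096 for another 4096 bytes
 * yields a single pending DRR_FREE covering [0, 8192), which is written to
 * the stream only when a record of a different type, or a non-contiguous
 * free, forces it out.
 */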

static int
dump_write(dmu_sendarg_t *dsp, dmu_object_type_t type,
    uint64_t object, uint64_t offset, int blksz, const blkptr_t *bp, void *data)
{
	struct drr_write *drrw = &(dsp->dsa_drr->drr_u.drr_write);

	/*
	 * We send data in increasing object, offset order.
	 * See comment in dump_free() for details.
	 */
	ASSERT(object > dsp->dsa_last_data_object ||
	    (object == dsp->dsa_last_data_object &&
	    offset > dsp->dsa_last_data_offset));
	dsp->dsa_last_data_object = object;
	dsp->dsa_last_data_offset = offset + blksz - 1;

	/*
	 * If there is any kind of pending aggregation (currently either
	 * a grouping of free objects or free blocks), push it out to
	 * the stream, since aggregation can't be done across operations
	 * of different types.
	 */
	if (dsp->dsa_pending_op != PENDING_NONE) {
		if (dump_record(dsp, NULL, 0) != 0)
			return (SET_ERROR(EINTR));
		dsp->dsa_pending_op = PENDING_NONE;
	}
	/* write a WRITE record */
	bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t));
	dsp->dsa_drr->drr_type = DRR_WRITE;
	drrw->drr_object = object;
	drrw->drr_type = type;
	drrw->drr_offset = offset;
	drrw->drr_length = blksz;
	drrw->drr_toguid = dsp->dsa_toguid;
	if (bp == NULL || BP_IS_EMBEDDED(bp)) {
		/*
		 * There's no pre-computed checksum for partial-block
		 * writes or embedded BP's, so (like
		 * fletcher4-checksummed blocks) userland will have to
		 * compute a dedup-capable checksum itself.
		 */
		drrw->drr_checksumtype = ZIO_CHECKSUM_OFF;
	} else {
		drrw->drr_checksumtype = BP_GET_CHECKSUM(bp);
		if (zio_checksum_table[drrw->drr_checksumtype].ci_dedup)
			drrw->drr_checksumflags |= DRR_CHECKSUM_DEDUP;
		DDK_SET_LSIZE(&drrw->drr_key, BP_GET_LSIZE(bp));
		DDK_SET_PSIZE(&drrw->drr_key, BP_GET_PSIZE(bp));
		DDK_SET_COMPRESS(&drrw->drr_key, BP_GET_COMPRESS(bp));
		drrw->drr_key.ddk_cksum = bp->blk_cksum;
	}

	if (dump_record(dsp, data, blksz) != 0)
		return (SET_ERROR(EINTR));
	return (0);
}
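
/*
 * Illustrative note (added commentary, not part of the original code): when
 * the block pointer carries a dedup-capable checksum (e.g. sha256),
 * dump_write() above copies the BP's checksum and logical/physical sizes
 * into drr_key and sets DRR_CHECKSUM_DEDUP.  A deduplicating stream
 * processor can then match identical blocks by key and replace later copies
 * with DRR_WRITE_BYREF records that refer back to the first occurrence.
 */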

static int
dump_write_embedded(dmu_sendarg_t *dsp, uint64_t object, uint64_t offset,
    int blksz, const blkptr_t *bp)
{
	char buf[BPE_PAYLOAD_SIZE];
	struct drr_write_embedded *drrw =
	    &(dsp->dsa_drr->drr_u.drr_write_embedded);

	if (dsp->dsa_pending_op != PENDING_NONE) {
		if (dump_record(dsp, NULL, 0) != 0)
			return (EINTR);
		dsp->dsa_pending_op = PENDING_NONE;
	}

	ASSERT(BP_IS_EMBEDDED(bp));

	bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t));
	dsp->dsa_drr->drr_type = DRR_WRITE_EMBEDDED;
	drrw->drr_object = object;
	drrw->drr_offset = offset;
	drrw->drr_length = blksz;
	drrw->drr_toguid = dsp->dsa_toguid;
	drrw->drr_compression = BP_GET_COMPRESS(bp);
	drrw->drr_etype = BPE_GET_ETYPE(bp);
	drrw->drr_lsize = BPE_GET_LSIZE(bp);
	drrw->drr_psize = BPE_GET_PSIZE(bp);

	decode_embedded_bp_compressed(bp, buf);

	if (dump_record(dsp, buf, P2ROUNDUP(drrw->drr_psize, 8)) != 0)
		return (EINTR);
	return (0);
}

static int
dump_spill(dmu_sendarg_t *dsp, uint64_t object, int blksz, void *data)
{
	struct drr_spill *drrs = &(dsp->dsa_drr->drr_u.drr_spill);

	if (dsp->dsa_pending_op != PENDING_NONE) {
		if (dump_record(dsp, NULL, 0) != 0)
			return (SET_ERROR(EINTR));
		dsp->dsa_pending_op = PENDING_NONE;
	}

	/* write a SPILL record */
	bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t));
	dsp->dsa_drr->drr_type = DRR_SPILL;
	drrs->drr_object = object;
	drrs->drr_length = blksz;
	drrs->drr_toguid = dsp->dsa_toguid;

	if (dump_record(dsp, data, blksz) != 0)
		return (SET_ERROR(EINTR));
	return (0);
}

static int
dump_freeobjects(dmu_sendarg_t *dsp, uint64_t firstobj, uint64_t numobjs)
{
	struct drr_freeobjects *drrfo = &(dsp->dsa_drr->drr_u.drr_freeobjects);

	/* See comment in dump_free(). */
	if (!dsp->dsa_incremental)
		return (0);

	/*
	 * If there is a pending op, but it's not PENDING_FREEOBJECTS,
	 * push it out, since free block aggregation can only be done for
	 * blocks of the same type (i.e., DRR_FREE records can only be
	 * aggregated with other DRR_FREE records; DRR_FREEOBJECTS records
	 * can only be aggregated with other DRR_FREEOBJECTS records).
	 */
	if (dsp->dsa_pending_op != PENDING_NONE &&
	    dsp->dsa_pending_op != PENDING_FREEOBJECTS) {
		if (dump_record(dsp, NULL, 0) != 0)
			return (SET_ERROR(EINTR));
		dsp->dsa_pending_op = PENDING_NONE;
	}
	if (dsp->dsa_pending_op == PENDING_FREEOBJECTS) {
		/*
		 * See whether this free object array can be aggregated
		 * with the pending one.
		 */
		if (drrfo->drr_firstobj + drrfo->drr_numobjs == firstobj) {
			drrfo->drr_numobjs += numobjs;
			return (0);
		} else {
			/* can't be aggregated.
Push out pending record */ 348 if (dump_record(dsp, NULL, 0) != 0) 349 return (SET_ERROR(EINTR)); 350 dsp->dsa_pending_op = PENDING_NONE; 351 } 352 } 353 354 /* write a FREEOBJECTS record */ 355 bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t)); 356 dsp->dsa_drr->drr_type = DRR_FREEOBJECTS; 357 drrfo->drr_firstobj = firstobj; 358 drrfo->drr_numobjs = numobjs; 359 drrfo->drr_toguid = dsp->dsa_toguid; 360 361 dsp->dsa_pending_op = PENDING_FREEOBJECTS; 362 363 return (0); 364 } 365 366 static int 367 dump_dnode(dmu_sendarg_t *dsp, uint64_t object, dnode_phys_t *dnp) 368 { 369 struct drr_object *drro = &(dsp->dsa_drr->drr_u.drr_object); 370 371 if (dnp == NULL || dnp->dn_type == DMU_OT_NONE) 372 return (dump_freeobjects(dsp, object, 1)); 373 374 if (dsp->dsa_pending_op != PENDING_NONE) { 375 if (dump_record(dsp, NULL, 0) != 0) 376 return (SET_ERROR(EINTR)); 377 dsp->dsa_pending_op = PENDING_NONE; 378 } 379 380 /* write an OBJECT record */ 381 bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t)); 382 dsp->dsa_drr->drr_type = DRR_OBJECT; 383 drro->drr_object = object; 384 drro->drr_type = dnp->dn_type; 385 drro->drr_bonustype = dnp->dn_bonustype; 386 drro->drr_blksz = dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT; 387 drro->drr_bonuslen = dnp->dn_bonuslen; 388 drro->drr_checksumtype = dnp->dn_checksum; 389 drro->drr_compress = dnp->dn_compress; 390 drro->drr_toguid = dsp->dsa_toguid; 391 392 if (!(dsp->dsa_featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS) && 393 drro->drr_blksz > SPA_OLD_MAXBLOCKSIZE) 394 drro->drr_blksz = SPA_OLD_MAXBLOCKSIZE; 395 396 if (dump_record(dsp, DN_BONUS(dnp), 397 P2ROUNDUP(dnp->dn_bonuslen, 8)) != 0) { 398 return (SET_ERROR(EINTR)); 399 } 400 401 /* Free anything past the end of the file. */ 402 if (dump_free(dsp, object, (dnp->dn_maxblkid + 1) * 403 (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT), -1ULL) != 0) 404 return (SET_ERROR(EINTR)); 405 if (dsp->dsa_err != 0) 406 return (SET_ERROR(EINTR)); 407 return (0); 408 } 409 410 static boolean_t 411 backup_do_embed(dmu_sendarg_t *dsp, const blkptr_t *bp) 412 { 413 if (!BP_IS_EMBEDDED(bp)) 414 return (B_FALSE); 415 416 /* 417 * Compression function must be legacy, or explicitly enabled. 418 */ 419 if ((BP_GET_COMPRESS(bp) >= ZIO_COMPRESS_LEGACY_FUNCTIONS && 420 !(dsp->dsa_featureflags & DMU_BACKUP_FEATURE_EMBED_DATA_LZ4))) 421 return (B_FALSE); 422 423 /* 424 * Embed type must be explicitly enabled. 425 */ 426 switch (BPE_GET_ETYPE(bp)) { 427 case BP_EMBEDDED_TYPE_DATA: 428 if (dsp->dsa_featureflags & DMU_BACKUP_FEATURE_EMBED_DATA) 429 return (B_TRUE); 430 break; 431 default: 432 return (B_FALSE); 433 } 434 return (B_FALSE); 435 } 436 437 #define BP_SPAN(dnp, level) \ 438 (((uint64_t)dnp->dn_datablkszsec) << (SPA_MINBLOCKSHIFT + \ 439 (level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT))) 440 441 /* ARGSUSED */ 442 static int 443 backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, 444 const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg) 445 { 446 dmu_sendarg_t *dsp = arg; 447 dmu_object_type_t type = bp ? BP_GET_TYPE(bp) : DMU_OT_NONE; 448 int err = 0; 449 450 if (issig(JUSTLOOKING) && issig(FORREAL)) 451 return (SET_ERROR(EINTR)); 452 453 if (zb->zb_object != DMU_META_DNODE_OBJECT && 454 DMU_OBJECT_IS_SPECIAL(zb->zb_object)) { 455 return (0); 456 } else if (zb->zb_level == ZB_ZIL_LEVEL) { 457 /* 458 * If we are sending a non-snapshot (which is allowed on 459 * read-only pools), it may have a ZIL, which must be ignored. 
460 */ 461 return (0); 462 } else if (BP_IS_HOLE(bp) && 463 zb->zb_object == DMU_META_DNODE_OBJECT) { 464 uint64_t span = BP_SPAN(dnp, zb->zb_level); 465 uint64_t dnobj = (zb->zb_blkid * span) >> DNODE_SHIFT; 466 err = dump_freeobjects(dsp, dnobj, span >> DNODE_SHIFT); 467 } else if (BP_IS_HOLE(bp)) { 468 uint64_t span = BP_SPAN(dnp, zb->zb_level); 469 err = dump_free(dsp, zb->zb_object, zb->zb_blkid * span, span); 470 } else if (zb->zb_level > 0 || type == DMU_OT_OBJSET) { 471 return (0); 472 } else if (type == DMU_OT_DNODE) { 473 dnode_phys_t *blk; 474 int i; 475 int blksz = BP_GET_LSIZE(bp); 476 arc_flags_t aflags = ARC_FLAG_WAIT; 477 arc_buf_t *abuf; 478 479 if (arc_read(NULL, spa, bp, arc_getbuf_func, &abuf, 480 ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, 481 &aflags, zb) != 0) 482 return (SET_ERROR(EIO)); 483 484 blk = abuf->b_data; 485 for (i = 0; i < blksz >> DNODE_SHIFT; i++) { 486 uint64_t dnobj = (zb->zb_blkid << 487 (DNODE_BLOCK_SHIFT - DNODE_SHIFT)) + i; 488 err = dump_dnode(dsp, dnobj, blk+i); 489 if (err != 0) 490 break; 491 } 492 (void) arc_buf_remove_ref(abuf, &abuf); 493 } else if (type == DMU_OT_SA) { 494 arc_flags_t aflags = ARC_FLAG_WAIT; 495 arc_buf_t *abuf; 496 int blksz = BP_GET_LSIZE(bp); 497 498 if (arc_read(NULL, spa, bp, arc_getbuf_func, &abuf, 499 ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, 500 &aflags, zb) != 0) 501 return (SET_ERROR(EIO)); 502 503 err = dump_spill(dsp, zb->zb_object, blksz, abuf->b_data); 504 (void) arc_buf_remove_ref(abuf, &abuf); 505 } else if (backup_do_embed(dsp, bp)) { 506 /* it's an embedded level-0 block of a regular object */ 507 int blksz = dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT; 508 err = dump_write_embedded(dsp, zb->zb_object, 509 zb->zb_blkid * blksz, blksz, bp); 510 } else { /* it's a level-0 block of a regular object */ 511 arc_flags_t aflags = ARC_FLAG_WAIT; 512 arc_buf_t *abuf; 513 int blksz = BP_GET_LSIZE(bp); 514 uint64_t offset; 515 516 ASSERT3U(blksz, ==, dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT); 517 ASSERT0(zb->zb_level); 518 if (arc_read(NULL, spa, bp, arc_getbuf_func, &abuf, 519 ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, 520 &aflags, zb) != 0) { 521 if (zfs_send_corrupt_data) { 522 /* Send a block filled with 0x"zfs badd bloc" */ 523 abuf = arc_buf_alloc(spa, blksz, &abuf, 524 ARC_BUFC_DATA); 525 uint64_t *ptr; 526 for (ptr = abuf->b_data; 527 (char *)ptr < (char *)abuf->b_data + blksz; 528 ptr++) 529 *ptr = 0x2f5baddb10cULL; 530 } else { 531 return (SET_ERROR(EIO)); 532 } 533 } 534 535 offset = zb->zb_blkid * blksz; 536 537 if (!(dsp->dsa_featureflags & 538 DMU_BACKUP_FEATURE_LARGE_BLOCKS) && 539 blksz > SPA_OLD_MAXBLOCKSIZE) { 540 char *buf = abuf->b_data; 541 while (blksz > 0 && err == 0) { 542 int n = MIN(blksz, SPA_OLD_MAXBLOCKSIZE); 543 err = dump_write(dsp, type, zb->zb_object, 544 offset, n, NULL, buf); 545 offset += n; 546 buf += n; 547 blksz -= n; 548 } 549 } else { 550 err = dump_write(dsp, type, zb->zb_object, 551 offset, blksz, bp, abuf->b_data); 552 } 553 (void) arc_buf_remove_ref(abuf, &abuf); 554 } 555 556 ASSERT(err == 0 || err == EINTR); 557 return (err); 558 } 559 560 /* 561 * Releases dp using the specified tag. 
562 */ 563 static int 564 dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *ds, 565 zfs_bookmark_phys_t *fromzb, boolean_t is_clone, boolean_t embedok, 566 boolean_t large_block_ok, int outfd, vnode_t *vp, offset_t *off) 567 { 568 objset_t *os; 569 dmu_replay_record_t *drr; 570 dmu_sendarg_t *dsp; 571 int err; 572 uint64_t fromtxg = 0; 573 uint64_t featureflags = 0; 574 575 err = dmu_objset_from_ds(ds, &os); 576 if (err != 0) { 577 dsl_pool_rele(dp, tag); 578 return (err); 579 } 580 581 drr = kmem_zalloc(sizeof (dmu_replay_record_t), KM_SLEEP); 582 drr->drr_type = DRR_BEGIN; 583 drr->drr_u.drr_begin.drr_magic = DMU_BACKUP_MAGIC; 584 DMU_SET_STREAM_HDRTYPE(drr->drr_u.drr_begin.drr_versioninfo, 585 DMU_SUBSTREAM); 586 587 #ifdef _KERNEL 588 if (dmu_objset_type(os) == DMU_OST_ZFS) { 589 uint64_t version; 590 if (zfs_get_zplprop(os, ZFS_PROP_VERSION, &version) != 0) { 591 kmem_free(drr, sizeof (dmu_replay_record_t)); 592 dsl_pool_rele(dp, tag); 593 return (SET_ERROR(EINVAL)); 594 } 595 if (version >= ZPL_VERSION_SA) { 596 featureflags |= DMU_BACKUP_FEATURE_SA_SPILL; 597 } 598 } 599 #endif 600 601 if (large_block_ok && ds->ds_large_blocks) 602 featureflags |= DMU_BACKUP_FEATURE_LARGE_BLOCKS; 603 if (embedok && 604 spa_feature_is_active(dp->dp_spa, SPA_FEATURE_EMBEDDED_DATA)) { 605 featureflags |= DMU_BACKUP_FEATURE_EMBED_DATA; 606 if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_LZ4_COMPRESS)) 607 featureflags |= DMU_BACKUP_FEATURE_EMBED_DATA_LZ4; 608 } else { 609 embedok = B_FALSE; 610 } 611 612 DMU_SET_FEATUREFLAGS(drr->drr_u.drr_begin.drr_versioninfo, 613 featureflags); 614 615 drr->drr_u.drr_begin.drr_creation_time = 616 dsl_dataset_phys(ds)->ds_creation_time; 617 drr->drr_u.drr_begin.drr_type = dmu_objset_type(os); 618 if (is_clone) 619 drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_CLONE; 620 drr->drr_u.drr_begin.drr_toguid = dsl_dataset_phys(ds)->ds_guid; 621 if (dsl_dataset_phys(ds)->ds_flags & DS_FLAG_CI_DATASET) 622 drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_CI_DATA; 623 624 if (fromzb != NULL) { 625 drr->drr_u.drr_begin.drr_fromguid = fromzb->zbm_guid; 626 fromtxg = fromzb->zbm_creation_txg; 627 } 628 dsl_dataset_name(ds, drr->drr_u.drr_begin.drr_toname); 629 if (!ds->ds_is_snapshot) { 630 (void) strlcat(drr->drr_u.drr_begin.drr_toname, "@--head--", 631 sizeof (drr->drr_u.drr_begin.drr_toname)); 632 } 633 634 dsp = kmem_zalloc(sizeof (dmu_sendarg_t), KM_SLEEP); 635 636 dsp->dsa_drr = drr; 637 dsp->dsa_vp = vp; 638 dsp->dsa_outfd = outfd; 639 dsp->dsa_proc = curproc; 640 dsp->dsa_os = os; 641 dsp->dsa_off = off; 642 dsp->dsa_toguid = dsl_dataset_phys(ds)->ds_guid; 643 dsp->dsa_pending_op = PENDING_NONE; 644 dsp->dsa_incremental = (fromzb != NULL); 645 dsp->dsa_featureflags = featureflags; 646 647 mutex_enter(&ds->ds_sendstream_lock); 648 list_insert_head(&ds->ds_sendstreams, dsp); 649 mutex_exit(&ds->ds_sendstream_lock); 650 651 dsl_dataset_long_hold(ds, FTAG); 652 dsl_pool_rele(dp, tag); 653 654 if (dump_record(dsp, NULL, 0) != 0) { 655 err = dsp->dsa_err; 656 goto out; 657 } 658 659 err = traverse_dataset(ds, fromtxg, TRAVERSE_PRE | TRAVERSE_PREFETCH, 660 backup_cb, dsp); 661 662 if (dsp->dsa_pending_op != PENDING_NONE) 663 if (dump_record(dsp, NULL, 0) != 0) 664 err = SET_ERROR(EINTR); 665 666 if (err != 0) { 667 if (err == EINTR && dsp->dsa_err != 0) 668 err = dsp->dsa_err; 669 goto out; 670 } 671 672 bzero(drr, sizeof (dmu_replay_record_t)); 673 drr->drr_type = DRR_END; 674 drr->drr_u.drr_end.drr_checksum = dsp->dsa_zc; 675 drr->drr_u.drr_end.drr_toguid = dsp->dsa_toguid; 676 
677 if (dump_record(dsp, NULL, 0) != 0) { 678 err = dsp->dsa_err; 679 goto out; 680 } 681 682 out: 683 mutex_enter(&ds->ds_sendstream_lock); 684 list_remove(&ds->ds_sendstreams, dsp); 685 mutex_exit(&ds->ds_sendstream_lock); 686 687 kmem_free(drr, sizeof (dmu_replay_record_t)); 688 kmem_free(dsp, sizeof (dmu_sendarg_t)); 689 690 dsl_dataset_long_rele(ds, FTAG); 691 692 return (err); 693 } 694 695 int 696 dmu_send_obj(const char *pool, uint64_t tosnap, uint64_t fromsnap, 697 boolean_t embedok, boolean_t large_block_ok, 698 int outfd, vnode_t *vp, offset_t *off) 699 { 700 dsl_pool_t *dp; 701 dsl_dataset_t *ds; 702 dsl_dataset_t *fromds = NULL; 703 int err; 704 705 err = dsl_pool_hold(pool, FTAG, &dp); 706 if (err != 0) 707 return (err); 708 709 err = dsl_dataset_hold_obj(dp, tosnap, FTAG, &ds); 710 if (err != 0) { 711 dsl_pool_rele(dp, FTAG); 712 return (err); 713 } 714 715 if (fromsnap != 0) { 716 zfs_bookmark_phys_t zb; 717 boolean_t is_clone; 718 719 err = dsl_dataset_hold_obj(dp, fromsnap, FTAG, &fromds); 720 if (err != 0) { 721 dsl_dataset_rele(ds, FTAG); 722 dsl_pool_rele(dp, FTAG); 723 return (err); 724 } 725 if (!dsl_dataset_is_before(ds, fromds, 0)) 726 err = SET_ERROR(EXDEV); 727 zb.zbm_creation_time = 728 dsl_dataset_phys(fromds)->ds_creation_time; 729 zb.zbm_creation_txg = dsl_dataset_phys(fromds)->ds_creation_txg; 730 zb.zbm_guid = dsl_dataset_phys(fromds)->ds_guid; 731 is_clone = (fromds->ds_dir != ds->ds_dir); 732 dsl_dataset_rele(fromds, FTAG); 733 err = dmu_send_impl(FTAG, dp, ds, &zb, is_clone, 734 embedok, large_block_ok, outfd, vp, off); 735 } else { 736 err = dmu_send_impl(FTAG, dp, ds, NULL, B_FALSE, 737 embedok, large_block_ok, outfd, vp, off); 738 } 739 dsl_dataset_rele(ds, FTAG); 740 return (err); 741 } 742 743 int 744 dmu_send(const char *tosnap, const char *fromsnap, 745 boolean_t embedok, boolean_t large_block_ok, 746 int outfd, vnode_t *vp, offset_t *off) 747 { 748 dsl_pool_t *dp; 749 dsl_dataset_t *ds; 750 int err; 751 boolean_t owned = B_FALSE; 752 753 if (fromsnap != NULL && strpbrk(fromsnap, "@#") == NULL) 754 return (SET_ERROR(EINVAL)); 755 756 err = dsl_pool_hold(tosnap, FTAG, &dp); 757 if (err != 0) 758 return (err); 759 760 if (strchr(tosnap, '@') == NULL && spa_writeable(dp->dp_spa)) { 761 /* 762 * We are sending a filesystem or volume. Ensure 763 * that it doesn't change by owning the dataset. 764 */ 765 err = dsl_dataset_own(dp, tosnap, FTAG, &ds); 766 owned = B_TRUE; 767 } else { 768 err = dsl_dataset_hold(dp, tosnap, FTAG, &ds); 769 } 770 if (err != 0) { 771 dsl_pool_rele(dp, FTAG); 772 return (err); 773 } 774 775 if (fromsnap != NULL) { 776 zfs_bookmark_phys_t zb; 777 boolean_t is_clone = B_FALSE; 778 int fsnamelen = strchr(tosnap, '@') - tosnap; 779 780 /* 781 * If the fromsnap is in a different filesystem, then 782 * mark the send stream as a clone. 
783 */ 784 if (strncmp(tosnap, fromsnap, fsnamelen) != 0 || 785 (fromsnap[fsnamelen] != '@' && 786 fromsnap[fsnamelen] != '#')) { 787 is_clone = B_TRUE; 788 } 789 790 if (strchr(fromsnap, '@')) { 791 dsl_dataset_t *fromds; 792 err = dsl_dataset_hold(dp, fromsnap, FTAG, &fromds); 793 if (err == 0) { 794 if (!dsl_dataset_is_before(ds, fromds, 0)) 795 err = SET_ERROR(EXDEV); 796 zb.zbm_creation_time = 797 dsl_dataset_phys(fromds)->ds_creation_time; 798 zb.zbm_creation_txg = 799 dsl_dataset_phys(fromds)->ds_creation_txg; 800 zb.zbm_guid = dsl_dataset_phys(fromds)->ds_guid; 801 is_clone = (ds->ds_dir != fromds->ds_dir); 802 dsl_dataset_rele(fromds, FTAG); 803 } 804 } else { 805 err = dsl_bookmark_lookup(dp, fromsnap, ds, &zb); 806 } 807 if (err != 0) { 808 dsl_dataset_rele(ds, FTAG); 809 dsl_pool_rele(dp, FTAG); 810 return (err); 811 } 812 err = dmu_send_impl(FTAG, dp, ds, &zb, is_clone, 813 embedok, large_block_ok, outfd, vp, off); 814 } else { 815 err = dmu_send_impl(FTAG, dp, ds, NULL, B_FALSE, 816 embedok, large_block_ok, outfd, vp, off); 817 } 818 if (owned) 819 dsl_dataset_disown(ds, FTAG); 820 else 821 dsl_dataset_rele(ds, FTAG); 822 return (err); 823 } 824 825 static int 826 dmu_adjust_send_estimate_for_indirects(dsl_dataset_t *ds, uint64_t size, 827 uint64_t *sizep) 828 { 829 int err; 830 /* 831 * Assume that space (both on-disk and in-stream) is dominated by 832 * data. We will adjust for indirect blocks and the copies property, 833 * but ignore per-object space used (eg, dnodes and DRR_OBJECT records). 834 */ 835 836 /* 837 * Subtract out approximate space used by indirect blocks. 838 * Assume most space is used by data blocks (non-indirect, non-dnode). 839 * Assume all blocks are recordsize. Assume ditto blocks and 840 * internal fragmentation counter out compression. 841 * 842 * Therefore, space used by indirect blocks is sizeof(blkptr_t) per 843 * block, which we observe in practice. 844 */ 845 uint64_t recordsize; 846 err = dsl_prop_get_int_ds(ds, "recordsize", &recordsize); 847 if (err != 0) 848 return (err); 849 size -= size / recordsize * sizeof (blkptr_t); 850 851 /* Add in the space for the record associated with each block. */ 852 size += size / recordsize * sizeof (dmu_replay_record_t); 853 854 *sizep = size; 855 856 return (0); 857 } 858 859 int 860 dmu_send_estimate(dsl_dataset_t *ds, dsl_dataset_t *fromds, uint64_t *sizep) 861 { 862 dsl_pool_t *dp = ds->ds_dir->dd_pool; 863 int err; 864 uint64_t size; 865 866 ASSERT(dsl_pool_config_held(dp)); 867 868 /* tosnap must be a snapshot */ 869 if (!ds->ds_is_snapshot) 870 return (SET_ERROR(EINVAL)); 871 872 /* 873 * fromsnap must be an earlier snapshot from the same fs as tosnap, 874 * or the origin's fs. 875 */ 876 if (fromds != NULL && !dsl_dataset_is_before(ds, fromds, 0)) 877 return (SET_ERROR(EXDEV)); 878 879 /* Get uncompressed size estimate of changed data. 
 */
	if (fromds == NULL) {
		size = dsl_dataset_phys(ds)->ds_uncompressed_bytes;
	} else {
		uint64_t used, comp;
		err = dsl_dataset_space_written(fromds, ds,
		    &used, &comp, &size);
		if (err != 0)
			return (err);
	}

	err = dmu_adjust_send_estimate_for_indirects(ds, size, sizep);
	return (err);
}

/*
 * Simple callback used to traverse the blocks of a snapshot and sum their
 * uncompressed size.
 */
/* ARGSUSED */
static int
dmu_calculate_send_traversal(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
    const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
{
	uint64_t *spaceptr = arg;
	if (bp != NULL && !BP_IS_HOLE(bp)) {
		*spaceptr += BP_GET_UCSIZE(bp);
	}
	return (0);
}

/*
 * Given a destination snapshot and a TXG, calculate the approximate size of a
 * send stream sent from that TXG.  from_txg may be zero, indicating that the
 * whole snapshot will be sent.
 */
int
dmu_send_estimate_from_txg(dsl_dataset_t *ds, uint64_t from_txg,
    uint64_t *sizep)
{
	dsl_pool_t *dp = ds->ds_dir->dd_pool;
	int err;
	uint64_t size = 0;

	ASSERT(dsl_pool_config_held(dp));

	/* tosnap must be a snapshot */
	if (!dsl_dataset_is_snapshot(ds))
		return (SET_ERROR(EINVAL));

	/* verify that from_txg is before the provided snapshot was taken */
	if (from_txg >= dsl_dataset_phys(ds)->ds_creation_txg) {
		return (SET_ERROR(EXDEV));
	}

	/*
	 * traverse the blocks of the snapshot with birth times after
	 * from_txg, summing their uncompressed size
	 */
	err = traverse_dataset(ds, from_txg, TRAVERSE_POST,
	    dmu_calculate_send_traversal, &size);
	if (err)
		return (err);

	err = dmu_adjust_send_estimate_for_indirects(ds, size, sizep);
	return (err);
}
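
/*
 * Worked example (added commentary, illustrative numbers only): for a
 * dataset with a 128K recordsize and 1 GiB of changed data, the traversal
 * above sums roughly 8192 blocks of uncompressed size.
 * dmu_adjust_send_estimate_for_indirects() then subtracts one blkptr_t
 * (128 bytes) per block for indirect-block overhead and adds one
 * dmu_replay_record_t per block for stream-record overhead, so the final
 * estimate differs from the raw sum by only a few megabytes.
 */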

typedef struct dmu_recv_begin_arg {
	const char *drba_origin;
	dmu_recv_cookie_t *drba_cookie;
	cred_t *drba_cred;
	uint64_t drba_snapobj;
} dmu_recv_begin_arg_t;

static int
recv_begin_check_existing_impl(dmu_recv_begin_arg_t *drba, dsl_dataset_t *ds,
    uint64_t fromguid)
{
	uint64_t val;
	int error;
	dsl_pool_t *dp = ds->ds_dir->dd_pool;

	/* temporary clone name must not exist */
	error = zap_lookup(dp->dp_meta_objset,
	    dsl_dir_phys(ds->ds_dir)->dd_child_dir_zapobj, recv_clone_name,
	    8, 1, &val);
	if (error != ENOENT)
		return (error == 0 ? EBUSY : error);

	/* new snapshot name must not exist */
	error = zap_lookup(dp->dp_meta_objset,
	    dsl_dataset_phys(ds)->ds_snapnames_zapobj,
	    drba->drba_cookie->drc_tosnap, 8, 1, &val);
	if (error != ENOENT)
		return (error == 0 ? EEXIST : error);

	/*
	 * Check snapshot limit before receiving.  We'll recheck again at the
	 * end, but might as well abort before receiving if we're already over
	 * the limit.
	 *
	 * Note that we do not check the file system limit with
	 * dsl_dir_fscount_check because the temporary %clones don't count
	 * against that limit.
	 */
	error = dsl_fs_ss_limit_check(ds->ds_dir, 1, ZFS_PROP_SNAPSHOT_LIMIT,
	    NULL, drba->drba_cred);
	if (error != 0)
		return (error);

	if (fromguid != 0) {
		dsl_dataset_t *snap;
		uint64_t obj = dsl_dataset_phys(ds)->ds_prev_snap_obj;

		/* Find snapshot in this dir that matches fromguid. */
		while (obj != 0) {
			error = dsl_dataset_hold_obj(dp, obj, FTAG,
			    &snap);
			if (error != 0)
				return (SET_ERROR(ENODEV));
			if (snap->ds_dir != ds->ds_dir) {
				dsl_dataset_rele(snap, FTAG);
				return (SET_ERROR(ENODEV));
			}
			if (dsl_dataset_phys(snap)->ds_guid == fromguid)
				break;
			obj = dsl_dataset_phys(snap)->ds_prev_snap_obj;
			dsl_dataset_rele(snap, FTAG);
		}
		if (obj == 0)
			return (SET_ERROR(ENODEV));

		if (drba->drba_cookie->drc_force) {
			drba->drba_snapobj = obj;
		} else {
			/*
			 * If we are not forcing, there must be no
			 * changes since fromsnap.
			 */
			if (dsl_dataset_modified_since_snap(ds, snap)) {
				dsl_dataset_rele(snap, FTAG);
				return (SET_ERROR(ETXTBSY));
			}
			drba->drba_snapobj = ds->ds_prev->ds_object;
		}

		dsl_dataset_rele(snap, FTAG);
	} else {
		/* if full, most recent snapshot must be $ORIGIN */
		if (dsl_dataset_phys(ds)->ds_prev_snap_txg >= TXG_INITIAL)
			return (SET_ERROR(ENODEV));
		drba->drba_snapobj = dsl_dataset_phys(ds)->ds_prev_snap_obj;
	}

	return (0);
}

static int
dmu_recv_begin_check(void *arg, dmu_tx_t *tx)
{
	dmu_recv_begin_arg_t *drba = arg;
	dsl_pool_t *dp = dmu_tx_pool(tx);
	struct drr_begin *drrb = drba->drba_cookie->drc_drrb;
	uint64_t fromguid = drrb->drr_fromguid;
	int flags = drrb->drr_flags;
	int error;
	uint64_t featureflags = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo);
	dsl_dataset_t *ds;
	const char *tofs = drba->drba_cookie->drc_tofs;

	/* already checked */
	ASSERT3U(drrb->drr_magic, ==, DMU_BACKUP_MAGIC);

	if (DMU_GET_STREAM_HDRTYPE(drrb->drr_versioninfo) ==
	    DMU_COMPOUNDSTREAM ||
	    drrb->drr_type >= DMU_OST_NUMTYPES ||
	    ((flags & DRR_FLAG_CLONE) && drba->drba_origin == NULL))
		return (SET_ERROR(EINVAL));

	/* Verify pool version supports SA if SA_SPILL feature set */
	if ((featureflags & DMU_BACKUP_FEATURE_SA_SPILL) &&
	    spa_version(dp->dp_spa) < SPA_VERSION_SA)
		return (SET_ERROR(ENOTSUP));

	/*
	 * The receiving code doesn't know how to translate a WRITE_EMBEDDED
	 * record to a plain WRITE record, so the pool must have the
	 * EMBEDDED_DATA feature enabled if the stream has WRITE_EMBEDDED
	 * records.  Same with WRITE_EMBEDDED records that use LZ4 compression.
	 */
	if ((featureflags & DMU_BACKUP_FEATURE_EMBED_DATA) &&
	    !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_EMBEDDED_DATA))
		return (SET_ERROR(ENOTSUP));
	if ((featureflags & DMU_BACKUP_FEATURE_EMBED_DATA_LZ4) &&
	    !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LZ4_COMPRESS))
		return (SET_ERROR(ENOTSUP));

	/*
	 * The receiving code doesn't know how to translate large blocks
	 * to smaller ones, so the pool must have the LARGE_BLOCKS
	 * feature enabled if the stream has LARGE_BLOCKS.
1082 */ 1083 if ((featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS) && 1084 !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LARGE_BLOCKS)) 1085 return (SET_ERROR(ENOTSUP)); 1086 1087 error = dsl_dataset_hold(dp, tofs, FTAG, &ds); 1088 if (error == 0) { 1089 /* target fs already exists; recv into temp clone */ 1090 1091 /* Can't recv a clone into an existing fs */ 1092 if (flags & DRR_FLAG_CLONE) { 1093 dsl_dataset_rele(ds, FTAG); 1094 return (SET_ERROR(EINVAL)); 1095 } 1096 1097 error = recv_begin_check_existing_impl(drba, ds, fromguid); 1098 dsl_dataset_rele(ds, FTAG); 1099 } else if (error == ENOENT) { 1100 /* target fs does not exist; must be a full backup or clone */ 1101 char buf[MAXNAMELEN]; 1102 1103 /* 1104 * If it's a non-clone incremental, we are missing the 1105 * target fs, so fail the recv. 1106 */ 1107 if (fromguid != 0 && !(flags & DRR_FLAG_CLONE)) 1108 return (SET_ERROR(ENOENT)); 1109 1110 /* Open the parent of tofs */ 1111 ASSERT3U(strlen(tofs), <, MAXNAMELEN); 1112 (void) strlcpy(buf, tofs, strrchr(tofs, '/') - tofs + 1); 1113 error = dsl_dataset_hold(dp, buf, FTAG, &ds); 1114 if (error != 0) 1115 return (error); 1116 1117 /* 1118 * Check filesystem and snapshot limits before receiving. We'll 1119 * recheck snapshot limits again at the end (we create the 1120 * filesystems and increment those counts during begin_sync). 1121 */ 1122 error = dsl_fs_ss_limit_check(ds->ds_dir, 1, 1123 ZFS_PROP_FILESYSTEM_LIMIT, NULL, drba->drba_cred); 1124 if (error != 0) { 1125 dsl_dataset_rele(ds, FTAG); 1126 return (error); 1127 } 1128 1129 error = dsl_fs_ss_limit_check(ds->ds_dir, 1, 1130 ZFS_PROP_SNAPSHOT_LIMIT, NULL, drba->drba_cred); 1131 if (error != 0) { 1132 dsl_dataset_rele(ds, FTAG); 1133 return (error); 1134 } 1135 1136 if (drba->drba_origin != NULL) { 1137 dsl_dataset_t *origin; 1138 error = dsl_dataset_hold(dp, drba->drba_origin, 1139 FTAG, &origin); 1140 if (error != 0) { 1141 dsl_dataset_rele(ds, FTAG); 1142 return (error); 1143 } 1144 if (!origin->ds_is_snapshot) { 1145 dsl_dataset_rele(origin, FTAG); 1146 dsl_dataset_rele(ds, FTAG); 1147 return (SET_ERROR(EINVAL)); 1148 } 1149 if (dsl_dataset_phys(origin)->ds_guid != fromguid) { 1150 dsl_dataset_rele(origin, FTAG); 1151 dsl_dataset_rele(ds, FTAG); 1152 return (SET_ERROR(ENODEV)); 1153 } 1154 dsl_dataset_rele(origin, FTAG); 1155 } 1156 dsl_dataset_rele(ds, FTAG); 1157 error = 0; 1158 } 1159 return (error); 1160 } 1161 1162 static void 1163 dmu_recv_begin_sync(void *arg, dmu_tx_t *tx) 1164 { 1165 dmu_recv_begin_arg_t *drba = arg; 1166 dsl_pool_t *dp = dmu_tx_pool(tx); 1167 struct drr_begin *drrb = drba->drba_cookie->drc_drrb; 1168 const char *tofs = drba->drba_cookie->drc_tofs; 1169 dsl_dataset_t *ds, *newds; 1170 uint64_t dsobj; 1171 int error; 1172 uint64_t crflags; 1173 1174 crflags = (drrb->drr_flags & DRR_FLAG_CI_DATA) ? 
1175 DS_FLAG_CI_DATASET : 0; 1176 1177 error = dsl_dataset_hold(dp, tofs, FTAG, &ds); 1178 if (error == 0) { 1179 /* create temporary clone */ 1180 dsl_dataset_t *snap = NULL; 1181 if (drba->drba_snapobj != 0) { 1182 VERIFY0(dsl_dataset_hold_obj(dp, 1183 drba->drba_snapobj, FTAG, &snap)); 1184 } 1185 dsobj = dsl_dataset_create_sync(ds->ds_dir, recv_clone_name, 1186 snap, crflags, drba->drba_cred, tx); 1187 dsl_dataset_rele(snap, FTAG); 1188 dsl_dataset_rele(ds, FTAG); 1189 } else { 1190 dsl_dir_t *dd; 1191 const char *tail; 1192 dsl_dataset_t *origin = NULL; 1193 1194 VERIFY0(dsl_dir_hold(dp, tofs, FTAG, &dd, &tail)); 1195 1196 if (drba->drba_origin != NULL) { 1197 VERIFY0(dsl_dataset_hold(dp, drba->drba_origin, 1198 FTAG, &origin)); 1199 } 1200 1201 /* Create new dataset. */ 1202 dsobj = dsl_dataset_create_sync(dd, 1203 strrchr(tofs, '/') + 1, 1204 origin, crflags, drba->drba_cred, tx); 1205 if (origin != NULL) 1206 dsl_dataset_rele(origin, FTAG); 1207 dsl_dir_rele(dd, FTAG); 1208 drba->drba_cookie->drc_newfs = B_TRUE; 1209 } 1210 VERIFY0(dsl_dataset_own_obj(dp, dsobj, dmu_recv_tag, &newds)); 1211 1212 if ((DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo) & 1213 DMU_BACKUP_FEATURE_LARGE_BLOCKS) && 1214 !newds->ds_large_blocks) { 1215 dsl_dataset_activate_large_blocks_sync_impl(dsobj, tx); 1216 newds->ds_large_blocks = B_TRUE; 1217 } 1218 1219 dmu_buf_will_dirty(newds->ds_dbuf, tx); 1220 dsl_dataset_phys(newds)->ds_flags |= DS_FLAG_INCONSISTENT; 1221 1222 /* 1223 * If we actually created a non-clone, we need to create the 1224 * objset in our new dataset. 1225 */ 1226 if (BP_IS_HOLE(dsl_dataset_get_blkptr(newds))) { 1227 (void) dmu_objset_create_impl(dp->dp_spa, 1228 newds, dsl_dataset_get_blkptr(newds), drrb->drr_type, tx); 1229 } 1230 1231 drba->drba_cookie->drc_ds = newds; 1232 1233 spa_history_log_internal_ds(newds, "receive", tx, ""); 1234 } 1235 1236 /* 1237 * NB: callers *MUST* call dmu_recv_stream() if dmu_recv_begin() 1238 * succeeds; otherwise we will leak the holds on the datasets. 
1239 */ 1240 int 1241 dmu_recv_begin(char *tofs, char *tosnap, struct drr_begin *drrb, 1242 boolean_t force, char *origin, dmu_recv_cookie_t *drc) 1243 { 1244 dmu_recv_begin_arg_t drba = { 0 }; 1245 dmu_replay_record_t *drr; 1246 1247 bzero(drc, sizeof (dmu_recv_cookie_t)); 1248 drc->drc_drrb = drrb; 1249 drc->drc_tosnap = tosnap; 1250 drc->drc_tofs = tofs; 1251 drc->drc_force = force; 1252 drc->drc_cred = CRED(); 1253 1254 if (drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC)) 1255 drc->drc_byteswap = B_TRUE; 1256 else if (drrb->drr_magic != DMU_BACKUP_MAGIC) 1257 return (SET_ERROR(EINVAL)); 1258 1259 drr = kmem_zalloc(sizeof (dmu_replay_record_t), KM_SLEEP); 1260 drr->drr_type = DRR_BEGIN; 1261 drr->drr_u.drr_begin = *drc->drc_drrb; 1262 if (drc->drc_byteswap) { 1263 fletcher_4_incremental_byteswap(drr, 1264 sizeof (dmu_replay_record_t), &drc->drc_cksum); 1265 } else { 1266 fletcher_4_incremental_native(drr, 1267 sizeof (dmu_replay_record_t), &drc->drc_cksum); 1268 } 1269 kmem_free(drr, sizeof (dmu_replay_record_t)); 1270 1271 if (drc->drc_byteswap) { 1272 drrb->drr_magic = BSWAP_64(drrb->drr_magic); 1273 drrb->drr_versioninfo = BSWAP_64(drrb->drr_versioninfo); 1274 drrb->drr_creation_time = BSWAP_64(drrb->drr_creation_time); 1275 drrb->drr_type = BSWAP_32(drrb->drr_type); 1276 drrb->drr_toguid = BSWAP_64(drrb->drr_toguid); 1277 drrb->drr_fromguid = BSWAP_64(drrb->drr_fromguid); 1278 } 1279 1280 drba.drba_origin = origin; 1281 drba.drba_cookie = drc; 1282 drba.drba_cred = CRED(); 1283 1284 return (dsl_sync_task(tofs, dmu_recv_begin_check, dmu_recv_begin_sync, 1285 &drba, 5, ZFS_SPACE_CHECK_NORMAL)); 1286 } 1287 1288 struct restorearg { 1289 objset_t *os; 1290 int err; 1291 boolean_t byteswap; 1292 vnode_t *vp; 1293 uint64_t voff; 1294 int bufsize; /* amount of memory allocated for buf */ 1295 1296 dmu_replay_record_t *drr; 1297 dmu_replay_record_t *next_drr; 1298 char *buf; 1299 zio_cksum_t cksum; 1300 zio_cksum_t prev_cksum; 1301 1302 avl_tree_t *guid_to_ds_map; 1303 }; 1304 1305 typedef struct guid_map_entry { 1306 uint64_t guid; 1307 dsl_dataset_t *gme_ds; 1308 avl_node_t avlnode; 1309 } guid_map_entry_t; 1310 1311 static int 1312 guid_compare(const void *arg1, const void *arg2) 1313 { 1314 const guid_map_entry_t *gmep1 = arg1; 1315 const guid_map_entry_t *gmep2 = arg2; 1316 1317 if (gmep1->guid < gmep2->guid) 1318 return (-1); 1319 else if (gmep1->guid > gmep2->guid) 1320 return (1); 1321 return (0); 1322 } 1323 1324 static void 1325 free_guid_map_onexit(void *arg) 1326 { 1327 avl_tree_t *ca = arg; 1328 void *cookie = NULL; 1329 guid_map_entry_t *gmep; 1330 1331 while ((gmep = avl_destroy_nodes(ca, &cookie)) != NULL) { 1332 dsl_dataset_long_rele(gmep->gme_ds, gmep); 1333 dsl_dataset_rele(gmep->gme_ds, gmep); 1334 kmem_free(gmep, sizeof (guid_map_entry_t)); 1335 } 1336 avl_destroy(ca); 1337 kmem_free(ca, sizeof (avl_tree_t)); 1338 } 1339 1340 static int 1341 restore_read(struct restorearg *ra, int len, void *buf) 1342 { 1343 int done = 0; 1344 1345 /* some things will require 8-byte alignment, so everything must */ 1346 ASSERT0(len % 8); 1347 ASSERT3U(len, <=, ra->bufsize); 1348 1349 while (done < len) { 1350 ssize_t resid; 1351 1352 ra->err = vn_rdwr(UIO_READ, ra->vp, 1353 (char *)buf + done, len - done, 1354 ra->voff, UIO_SYSSPACE, FAPPEND, 1355 RLIM64_INFINITY, CRED(), &resid); 1356 1357 if (resid == len - done) 1358 ra->err = SET_ERROR(EINVAL); 1359 ra->voff += len - done - resid; 1360 done = len - resid; 1361 if (ra->err != 0) 1362 return (ra->err); 1363 } 1364 1365 ASSERT3U(done, 
==, len); 1366 return (0); 1367 } 1368 1369 static void 1370 byteswap_record(dmu_replay_record_t *drr) 1371 { 1372 #define DO64(X) (drr->drr_u.X = BSWAP_64(drr->drr_u.X)) 1373 #define DO32(X) (drr->drr_u.X = BSWAP_32(drr->drr_u.X)) 1374 drr->drr_type = BSWAP_32(drr->drr_type); 1375 drr->drr_payloadlen = BSWAP_32(drr->drr_payloadlen); 1376 1377 switch (drr->drr_type) { 1378 case DRR_BEGIN: 1379 DO64(drr_begin.drr_magic); 1380 DO64(drr_begin.drr_versioninfo); 1381 DO64(drr_begin.drr_creation_time); 1382 DO32(drr_begin.drr_type); 1383 DO32(drr_begin.drr_flags); 1384 DO64(drr_begin.drr_toguid); 1385 DO64(drr_begin.drr_fromguid); 1386 break; 1387 case DRR_OBJECT: 1388 DO64(drr_object.drr_object); 1389 DO32(drr_object.drr_type); 1390 DO32(drr_object.drr_bonustype); 1391 DO32(drr_object.drr_blksz); 1392 DO32(drr_object.drr_bonuslen); 1393 DO64(drr_object.drr_toguid); 1394 break; 1395 case DRR_FREEOBJECTS: 1396 DO64(drr_freeobjects.drr_firstobj); 1397 DO64(drr_freeobjects.drr_numobjs); 1398 DO64(drr_freeobjects.drr_toguid); 1399 break; 1400 case DRR_WRITE: 1401 DO64(drr_write.drr_object); 1402 DO32(drr_write.drr_type); 1403 DO64(drr_write.drr_offset); 1404 DO64(drr_write.drr_length); 1405 DO64(drr_write.drr_toguid); 1406 ZIO_CHECKSUM_BSWAP(&drr->drr_u.drr_write.drr_key.ddk_cksum); 1407 DO64(drr_write.drr_key.ddk_prop); 1408 break; 1409 case DRR_WRITE_BYREF: 1410 DO64(drr_write_byref.drr_object); 1411 DO64(drr_write_byref.drr_offset); 1412 DO64(drr_write_byref.drr_length); 1413 DO64(drr_write_byref.drr_toguid); 1414 DO64(drr_write_byref.drr_refguid); 1415 DO64(drr_write_byref.drr_refobject); 1416 DO64(drr_write_byref.drr_refoffset); 1417 ZIO_CHECKSUM_BSWAP(&drr->drr_u.drr_write_byref. 1418 drr_key.ddk_cksum); 1419 DO64(drr_write_byref.drr_key.ddk_prop); 1420 break; 1421 case DRR_WRITE_EMBEDDED: 1422 DO64(drr_write_embedded.drr_object); 1423 DO64(drr_write_embedded.drr_offset); 1424 DO64(drr_write_embedded.drr_length); 1425 DO64(drr_write_embedded.drr_toguid); 1426 DO32(drr_write_embedded.drr_lsize); 1427 DO32(drr_write_embedded.drr_psize); 1428 break; 1429 case DRR_FREE: 1430 DO64(drr_free.drr_object); 1431 DO64(drr_free.drr_offset); 1432 DO64(drr_free.drr_length); 1433 DO64(drr_free.drr_toguid); 1434 break; 1435 case DRR_SPILL: 1436 DO64(drr_spill.drr_object); 1437 DO64(drr_spill.drr_length); 1438 DO64(drr_spill.drr_toguid); 1439 break; 1440 case DRR_END: 1441 DO64(drr_end.drr_toguid); 1442 ZIO_CHECKSUM_BSWAP(&drr->drr_u.drr_end.drr_checksum); 1443 break; 1444 } 1445 1446 if (drr->drr_type != DRR_BEGIN) { 1447 ZIO_CHECKSUM_BSWAP(&drr->drr_u.drr_checksum.drr_checksum); 1448 } 1449 1450 #undef DO64 1451 #undef DO32 1452 } 1453 1454 static inline uint8_t 1455 deduce_nblkptr(dmu_object_type_t bonus_type, uint64_t bonus_size) 1456 { 1457 if (bonus_type == DMU_OT_SA) { 1458 return (1); 1459 } else { 1460 return (1 + 1461 ((DN_MAX_BONUSLEN - bonus_size) >> SPA_BLKPTRSHIFT)); 1462 } 1463 } 1464 1465 static int 1466 restore_object(struct restorearg *ra, struct drr_object *drro, void *data) 1467 { 1468 dmu_object_info_t doi; 1469 dmu_tx_t *tx; 1470 uint64_t object; 1471 int err; 1472 1473 if (drro->drr_type == DMU_OT_NONE || 1474 !DMU_OT_IS_VALID(drro->drr_type) || 1475 !DMU_OT_IS_VALID(drro->drr_bonustype) || 1476 drro->drr_checksumtype >= ZIO_CHECKSUM_FUNCTIONS || 1477 drro->drr_compress >= ZIO_COMPRESS_FUNCTIONS || 1478 P2PHASE(drro->drr_blksz, SPA_MINBLOCKSIZE) || 1479 drro->drr_blksz < SPA_MINBLOCKSIZE || 1480 drro->drr_blksz > spa_maxblocksize(dmu_objset_spa(ra->os)) || 1481 drro->drr_bonuslen > 
DN_MAX_BONUSLEN) { 1482 return (SET_ERROR(EINVAL)); 1483 } 1484 1485 err = dmu_object_info(ra->os, drro->drr_object, &doi); 1486 1487 if (err != 0 && err != ENOENT) 1488 return (SET_ERROR(EINVAL)); 1489 object = err == 0 ? drro->drr_object : DMU_NEW_OBJECT; 1490 1491 /* 1492 * If we are losing blkptrs or changing the block size this must 1493 * be a new file instance. We must clear out the previous file 1494 * contents before we can change this type of metadata in the dnode. 1495 */ 1496 if (err == 0) { 1497 int nblkptr; 1498 1499 nblkptr = deduce_nblkptr(drro->drr_bonustype, 1500 drro->drr_bonuslen); 1501 1502 if (drro->drr_blksz != doi.doi_data_block_size || 1503 nblkptr < doi.doi_nblkptr) { 1504 err = dmu_free_long_range(ra->os, drro->drr_object, 1505 0, DMU_OBJECT_END); 1506 if (err != 0) 1507 return (SET_ERROR(EINVAL)); 1508 } 1509 } 1510 1511 tx = dmu_tx_create(ra->os); 1512 dmu_tx_hold_bonus(tx, object); 1513 err = dmu_tx_assign(tx, TXG_WAIT); 1514 if (err != 0) { 1515 dmu_tx_abort(tx); 1516 return (err); 1517 } 1518 1519 if (object == DMU_NEW_OBJECT) { 1520 /* currently free, want to be allocated */ 1521 err = dmu_object_claim(ra->os, drro->drr_object, 1522 drro->drr_type, drro->drr_blksz, 1523 drro->drr_bonustype, drro->drr_bonuslen, tx); 1524 } else if (drro->drr_type != doi.doi_type || 1525 drro->drr_blksz != doi.doi_data_block_size || 1526 drro->drr_bonustype != doi.doi_bonus_type || 1527 drro->drr_bonuslen != doi.doi_bonus_size) { 1528 /* currently allocated, but with different properties */ 1529 err = dmu_object_reclaim(ra->os, drro->drr_object, 1530 drro->drr_type, drro->drr_blksz, 1531 drro->drr_bonustype, drro->drr_bonuslen, tx); 1532 } 1533 if (err != 0) { 1534 dmu_tx_commit(tx); 1535 return (SET_ERROR(EINVAL)); 1536 } 1537 1538 dmu_object_set_checksum(ra->os, drro->drr_object, 1539 drro->drr_checksumtype, tx); 1540 dmu_object_set_compress(ra->os, drro->drr_object, 1541 drro->drr_compress, tx); 1542 1543 if (data != NULL) { 1544 dmu_buf_t *db; 1545 1546 VERIFY0(dmu_bonus_hold(ra->os, drro->drr_object, FTAG, &db)); 1547 dmu_buf_will_dirty(db, tx); 1548 1549 ASSERT3U(db->db_size, >=, drro->drr_bonuslen); 1550 bcopy(data, db->db_data, drro->drr_bonuslen); 1551 if (ra->byteswap) { 1552 dmu_object_byteswap_t byteswap = 1553 DMU_OT_BYTESWAP(drro->drr_bonustype); 1554 dmu_ot_byteswap[byteswap].ob_func(db->db_data, 1555 drro->drr_bonuslen); 1556 } 1557 dmu_buf_rele(db, FTAG); 1558 } 1559 dmu_tx_commit(tx); 1560 return (0); 1561 } 1562 1563 /* ARGSUSED */ 1564 static int 1565 restore_freeobjects(struct restorearg *ra, 1566 struct drr_freeobjects *drrfo) 1567 { 1568 uint64_t obj; 1569 1570 if (drrfo->drr_firstobj + drrfo->drr_numobjs < drrfo->drr_firstobj) 1571 return (SET_ERROR(EINVAL)); 1572 1573 for (obj = drrfo->drr_firstobj; 1574 obj < drrfo->drr_firstobj + drrfo->drr_numobjs; 1575 (void) dmu_object_next(ra->os, &obj, FALSE, 0)) { 1576 int err; 1577 1578 if (dmu_object_info(ra->os, obj, NULL) != 0) 1579 continue; 1580 1581 err = dmu_free_long_object(ra->os, obj); 1582 if (err != 0) 1583 return (err); 1584 } 1585 return (0); 1586 } 1587 1588 static int 1589 restore_write(struct restorearg *ra, struct drr_write *drrw, arc_buf_t *abuf) 1590 { 1591 dmu_tx_t *tx; 1592 int err; 1593 1594 if (drrw->drr_offset + drrw->drr_length < drrw->drr_offset || 1595 !DMU_OT_IS_VALID(drrw->drr_type)) 1596 return (SET_ERROR(EINVAL)); 1597 1598 if (dmu_object_info(ra->os, drrw->drr_object, NULL) != 0) 1599 return (SET_ERROR(EINVAL)); 1600 1601 tx = dmu_tx_create(ra->os); 1602 1603 
dmu_tx_hold_write(tx, drrw->drr_object, 1604 drrw->drr_offset, drrw->drr_length); 1605 err = dmu_tx_assign(tx, TXG_WAIT); 1606 if (err != 0) { 1607 dmu_tx_abort(tx); 1608 return (err); 1609 } 1610 if (ra->byteswap) { 1611 dmu_object_byteswap_t byteswap = 1612 DMU_OT_BYTESWAP(drrw->drr_type); 1613 dmu_ot_byteswap[byteswap].ob_func(abuf->b_data, 1614 drrw->drr_length); 1615 } 1616 1617 dmu_buf_t *bonus; 1618 if (dmu_bonus_hold(ra->os, drrw->drr_object, FTAG, &bonus) != 0) 1619 return (SET_ERROR(EINVAL)); 1620 dmu_assign_arcbuf(bonus, drrw->drr_offset, abuf, tx); 1621 dmu_tx_commit(tx); 1622 dmu_buf_rele(bonus, FTAG); 1623 return (0); 1624 } 1625 1626 /* 1627 * Handle a DRR_WRITE_BYREF record. This record is used in dedup'ed 1628 * streams to refer to a copy of the data that is already on the 1629 * system because it came in earlier in the stream. This function 1630 * finds the earlier copy of the data, and uses that copy instead of 1631 * data from the stream to fulfill this write. 1632 */ 1633 static int 1634 restore_write_byref(struct restorearg *ra, struct drr_write_byref *drrwbr) 1635 { 1636 dmu_tx_t *tx; 1637 int err; 1638 guid_map_entry_t gmesrch; 1639 guid_map_entry_t *gmep; 1640 avl_index_t where; 1641 objset_t *ref_os = NULL; 1642 dmu_buf_t *dbp; 1643 1644 if (drrwbr->drr_offset + drrwbr->drr_length < drrwbr->drr_offset) 1645 return (SET_ERROR(EINVAL)); 1646 1647 /* 1648 * If the GUID of the referenced dataset is different from the 1649 * GUID of the target dataset, find the referenced dataset. 1650 */ 1651 if (drrwbr->drr_toguid != drrwbr->drr_refguid) { 1652 gmesrch.guid = drrwbr->drr_refguid; 1653 if ((gmep = avl_find(ra->guid_to_ds_map, &gmesrch, 1654 &where)) == NULL) { 1655 return (SET_ERROR(EINVAL)); 1656 } 1657 if (dmu_objset_from_ds(gmep->gme_ds, &ref_os)) 1658 return (SET_ERROR(EINVAL)); 1659 } else { 1660 ref_os = ra->os; 1661 } 1662 1663 err = dmu_buf_hold(ref_os, drrwbr->drr_refobject, 1664 drrwbr->drr_refoffset, FTAG, &dbp, DMU_READ_PREFETCH); 1665 if (err != 0) 1666 return (err); 1667 1668 tx = dmu_tx_create(ra->os); 1669 1670 dmu_tx_hold_write(tx, drrwbr->drr_object, 1671 drrwbr->drr_offset, drrwbr->drr_length); 1672 err = dmu_tx_assign(tx, TXG_WAIT); 1673 if (err != 0) { 1674 dmu_tx_abort(tx); 1675 return (err); 1676 } 1677 dmu_write(ra->os, drrwbr->drr_object, 1678 drrwbr->drr_offset, drrwbr->drr_length, dbp->db_data, tx); 1679 dmu_buf_rele(dbp, FTAG); 1680 dmu_tx_commit(tx); 1681 return (0); 1682 } 1683 1684 static int 1685 restore_write_embedded(struct restorearg *ra, 1686 struct drr_write_embedded *drrwnp, void *data) 1687 { 1688 dmu_tx_t *tx; 1689 int err; 1690 1691 if (drrwnp->drr_offset + drrwnp->drr_length < drrwnp->drr_offset) 1692 return (EINVAL); 1693 1694 if (drrwnp->drr_psize > BPE_PAYLOAD_SIZE) 1695 return (EINVAL); 1696 1697 if (drrwnp->drr_etype >= NUM_BP_EMBEDDED_TYPES) 1698 return (EINVAL); 1699 if (drrwnp->drr_compression >= ZIO_COMPRESS_FUNCTIONS) 1700 return (EINVAL); 1701 1702 tx = dmu_tx_create(ra->os); 1703 1704 dmu_tx_hold_write(tx, drrwnp->drr_object, 1705 drrwnp->drr_offset, drrwnp->drr_length); 1706 err = dmu_tx_assign(tx, TXG_WAIT); 1707 if (err != 0) { 1708 dmu_tx_abort(tx); 1709 return (err); 1710 } 1711 1712 dmu_write_embedded(ra->os, drrwnp->drr_object, 1713 drrwnp->drr_offset, data, drrwnp->drr_etype, 1714 drrwnp->drr_compression, drrwnp->drr_lsize, drrwnp->drr_psize, 1715 ra->byteswap ^ ZFS_HOST_BYTEORDER, tx); 1716 1717 dmu_tx_commit(tx); 1718 return (0); 1719 } 1720 1721 static int 1722 restore_spill(struct restorearg *ra, struct 
drr_spill *drrs, void *data) 1723 { 1724 dmu_tx_t *tx; 1725 dmu_buf_t *db, *db_spill; 1726 int err; 1727 1728 if (drrs->drr_length < SPA_MINBLOCKSIZE || 1729 drrs->drr_length > spa_maxblocksize(dmu_objset_spa(ra->os))) 1730 return (SET_ERROR(EINVAL)); 1731 1732 if (dmu_object_info(ra->os, drrs->drr_object, NULL) != 0) 1733 return (SET_ERROR(EINVAL)); 1734 1735 VERIFY0(dmu_bonus_hold(ra->os, drrs->drr_object, FTAG, &db)); 1736 if ((err = dmu_spill_hold_by_bonus(db, FTAG, &db_spill)) != 0) { 1737 dmu_buf_rele(db, FTAG); 1738 return (err); 1739 } 1740 1741 tx = dmu_tx_create(ra->os); 1742 1743 dmu_tx_hold_spill(tx, db->db_object); 1744 1745 err = dmu_tx_assign(tx, TXG_WAIT); 1746 if (err != 0) { 1747 dmu_buf_rele(db, FTAG); 1748 dmu_buf_rele(db_spill, FTAG); 1749 dmu_tx_abort(tx); 1750 return (err); 1751 } 1752 dmu_buf_will_dirty(db_spill, tx); 1753 1754 if (db_spill->db_size < drrs->drr_length) 1755 VERIFY(0 == dbuf_spill_set_blksz(db_spill, 1756 drrs->drr_length, tx)); 1757 bcopy(data, db_spill->db_data, drrs->drr_length); 1758 1759 dmu_buf_rele(db, FTAG); 1760 dmu_buf_rele(db_spill, FTAG); 1761 1762 dmu_tx_commit(tx); 1763 return (0); 1764 } 1765 1766 /* ARGSUSED */ 1767 static int 1768 restore_free(struct restorearg *ra, struct drr_free *drrf) 1769 { 1770 int err; 1771 1772 if (drrf->drr_length != -1ULL && 1773 drrf->drr_offset + drrf->drr_length < drrf->drr_offset) 1774 return (SET_ERROR(EINVAL)); 1775 1776 if (dmu_object_info(ra->os, drrf->drr_object, NULL) != 0) 1777 return (SET_ERROR(EINVAL)); 1778 1779 err = dmu_free_long_range(ra->os, drrf->drr_object, 1780 drrf->drr_offset, drrf->drr_length); 1781 return (err); 1782 } 1783 1784 /* used to destroy the drc_ds on error */ 1785 static void 1786 dmu_recv_cleanup_ds(dmu_recv_cookie_t *drc) 1787 { 1788 char name[MAXNAMELEN]; 1789 dsl_dataset_name(drc->drc_ds, name); 1790 dsl_dataset_disown(drc->drc_ds, dmu_recv_tag); 1791 (void) dsl_destroy_head(name); 1792 } 1793 1794 static void 1795 restore_cksum(struct restorearg *ra, int len, void *buf) 1796 { 1797 if (ra->byteswap) { 1798 fletcher_4_incremental_byteswap(buf, len, &ra->cksum); 1799 } else { 1800 fletcher_4_incremental_native(buf, len, &ra->cksum); 1801 } 1802 } 1803 1804 /* 1805 * If len != 0, read payload into buf. 1806 * Read next record's header into ra->next_drr. 1807 * Verify checksum of payload and next record. 1808 */ 1809 static int 1810 restore_read_payload_and_next_header(struct restorearg *ra, int len, void *buf) 1811 { 1812 int err; 1813 1814 if (len != 0) { 1815 ASSERT3U(len, <=, ra->bufsize); 1816 err = restore_read(ra, len, buf); 1817 if (err != 0) 1818 return (err); 1819 restore_cksum(ra, len, buf); 1820 } 1821 1822 ra->prev_cksum = ra->cksum; 1823 1824 err = restore_read(ra, sizeof (*ra->next_drr), ra->next_drr); 1825 if (err != 0) 1826 return (err); 1827 if (ra->next_drr->drr_type == DRR_BEGIN) 1828 return (SET_ERROR(EINVAL)); 1829 1830 /* 1831 * Note: checksum is of everything up to but not including the 1832 * checksum itself. 
1833 */ 1834 ASSERT3U(offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum), 1835 ==, sizeof (dmu_replay_record_t) - sizeof (zio_cksum_t)); 1836 restore_cksum(ra, 1837 offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum), 1838 ra->next_drr); 1839 1840 zio_cksum_t cksum_orig = ra->next_drr->drr_u.drr_checksum.drr_checksum; 1841 zio_cksum_t *cksump = &ra->next_drr->drr_u.drr_checksum.drr_checksum; 1842 1843 if (ra->byteswap) 1844 byteswap_record(ra->next_drr); 1845 1846 if ((!ZIO_CHECKSUM_IS_ZERO(cksump)) && 1847 !ZIO_CHECKSUM_EQUAL(ra->cksum, *cksump)) 1848 return (SET_ERROR(ECKSUM)); 1849 1850 restore_cksum(ra, sizeof (cksum_orig), &cksum_orig); 1851 1852 return (0); 1853 } 1854 1855 static int 1856 restore_process_record(struct restorearg *ra) 1857 { 1858 int err; 1859 1860 switch (ra->drr->drr_type) { 1861 case DRR_OBJECT: 1862 { 1863 struct drr_object *drro = &ra->drr->drr_u.drr_object; 1864 err = restore_read_payload_and_next_header(ra, 1865 P2ROUNDUP(drro->drr_bonuslen, 8), ra->buf); 1866 if (err != 0) 1867 return (err); 1868 return (restore_object(ra, drro, ra->buf)); 1869 } 1870 case DRR_FREEOBJECTS: 1871 { 1872 struct drr_freeobjects *drrfo = 1873 &ra->drr->drr_u.drr_freeobjects; 1874 err = restore_read_payload_and_next_header(ra, 0, NULL); 1875 if (err != 0) 1876 return (err); 1877 return (restore_freeobjects(ra, drrfo)); 1878 } 1879 case DRR_WRITE: 1880 { 1881 struct drr_write *drrw = &ra->drr->drr_u.drr_write; 1882 arc_buf_t *abuf = arc_loan_buf(dmu_objset_spa(ra->os), 1883 drrw->drr_length); 1884 1885 err = restore_read_payload_and_next_header(ra, 1886 drrw->drr_length, abuf->b_data); 1887 if (err != 0) 1888 return (err); 1889 err = restore_write(ra, drrw, abuf); 1890 /* if restore_write() is successful, it consumes the arc_buf */ 1891 if (err != 0) 1892 dmu_return_arcbuf(abuf); 1893 return (err); 1894 } 1895 case DRR_WRITE_BYREF: 1896 { 1897 struct drr_write_byref *drrwbr = 1898 &ra->drr->drr_u.drr_write_byref; 1899 err = restore_read_payload_and_next_header(ra, 0, NULL); 1900 if (err != 0) 1901 return (err); 1902 return (restore_write_byref(ra, drrwbr)); 1903 } 1904 case DRR_WRITE_EMBEDDED: 1905 { 1906 struct drr_write_embedded *drrwe = 1907 &ra->drr->drr_u.drr_write_embedded; 1908 err = restore_read_payload_and_next_header(ra, 1909 P2ROUNDUP(drrwe->drr_psize, 8), ra->buf); 1910 if (err != 0) 1911 return (err); 1912 return (restore_write_embedded(ra, drrwe, ra->buf)); 1913 } 1914 case DRR_FREE: 1915 { 1916 struct drr_free *drrf = &ra->drr->drr_u.drr_free; 1917 err = restore_read_payload_and_next_header(ra, 0, NULL); 1918 if (err != 0) 1919 return (err); 1920 return (restore_free(ra, drrf)); 1921 } 1922 case DRR_END: 1923 { 1924 struct drr_end *drre = &ra->drr->drr_u.drr_end; 1925 if (!ZIO_CHECKSUM_EQUAL(ra->prev_cksum, drre->drr_checksum)) 1926 return (SET_ERROR(EINVAL)); 1927 return (0); 1928 } 1929 case DRR_SPILL: 1930 { 1931 struct drr_spill *drrs = &ra->drr->drr_u.drr_spill; 1932 err = restore_read_payload_and_next_header(ra, 1933 drrs->drr_length, ra->buf); 1934 if (err != 0) 1935 return (err); 1936 return (restore_spill(ra, drrs, ra->buf)); 1937 } 1938 default: 1939 return (SET_ERROR(EINVAL)); 1940 } 1941 } 1942 1943 /* 1944 * NB: callers *must* call dmu_recv_end() if this succeeds. 
 */
int
dmu_recv_stream(dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp,
    int cleanup_fd, uint64_t *action_handlep)
{
	int err = 0;
	struct restorearg ra = { 0 };
	int featureflags;

	ra.byteswap = drc->drc_byteswap;
	ra.cksum = drc->drc_cksum;
	ra.vp = vp;
	ra.voff = *voffp;
	ra.bufsize = SPA_MAXBLOCKSIZE;
	ra.drr = kmem_alloc(sizeof (*ra.drr), KM_SLEEP);
	ra.buf = kmem_alloc(ra.bufsize, KM_SLEEP);
	ra.next_drr = kmem_alloc(sizeof (*ra.next_drr), KM_SLEEP);

	/* these were verified in dmu_recv_begin */
	ASSERT3U(DMU_GET_STREAM_HDRTYPE(drc->drc_drrb->drr_versioninfo), ==,
	    DMU_SUBSTREAM);
	ASSERT3U(drc->drc_drrb->drr_type, <, DMU_OST_NUMTYPES);

	/*
	 * Open the objset we are modifying.
	 */
	VERIFY0(dmu_objset_from_ds(drc->drc_ds, &ra.os));

	ASSERT(dsl_dataset_phys(drc->drc_ds)->ds_flags & DS_FLAG_INCONSISTENT);

	featureflags = DMU_GET_FEATUREFLAGS(drc->drc_drrb->drr_versioninfo);

	/* if this stream is dedup'ed, set up the avl tree for guid mapping */
	if (featureflags & DMU_BACKUP_FEATURE_DEDUP) {
		minor_t minor;

		if (cleanup_fd == -1) {
			err = SET_ERROR(EBADF);
			goto out;
		}
		err = zfs_onexit_fd_hold(cleanup_fd, &minor);
		if (err != 0) {
			cleanup_fd = -1;
			goto out;
		}

		if (*action_handlep == 0) {
			ra.guid_to_ds_map =
			    kmem_alloc(sizeof (avl_tree_t), KM_SLEEP);
			avl_create(ra.guid_to_ds_map, guid_compare,
			    sizeof (guid_map_entry_t),
			    offsetof(guid_map_entry_t, avlnode));
			err = zfs_onexit_add_cb(minor,
			    free_guid_map_onexit, ra.guid_to_ds_map,
			    action_handlep);
			if (err != 0)
				goto out;
		} else {
			err = zfs_onexit_cb_data(minor, *action_handlep,
			    (void **)&ra.guid_to_ds_map);
			if (err != 0)
				goto out;
		}

		drc->drc_guid_to_ds_map = ra.guid_to_ds_map;
	}

	err = restore_read_payload_and_next_header(&ra, 0, NULL);
	if (err != 0)
		goto out;
	for (;;) {
		void *tmp;

		if (issig(JUSTLOOKING) && issig(FORREAL)) {
			err = SET_ERROR(EINTR);
			break;
		}

		tmp = ra.next_drr;
		ra.next_drr = ra.drr;
		ra.drr = tmp;

		/* process ra.drr, read in ra.next_drr */
		err = restore_process_record(&ra);
		if (err != 0)
			break;
		if (ra.drr->drr_type == DRR_END)
			break;
	}

out:
	if ((featureflags & DMU_BACKUP_FEATURE_DEDUP) && (cleanup_fd != -1))
		zfs_onexit_fd_rele(cleanup_fd);

	if (err != 0) {
		/*
		 * Destroy what we created, so we don't leave it in the
		 * inconsistent restoring state.
		 */
		dmu_recv_cleanup_ds(drc);
	}

	kmem_free(ra.drr, sizeof (*ra.drr));
	kmem_free(ra.buf, ra.bufsize);
	kmem_free(ra.next_drr, sizeof (*ra.next_drr));
	*voffp = ra.voff;
	return (err);
}

/*
 * Check phase of the dmu_recv_end() sync task.  For a new filesystem we
 * just verify that it can be snapshotted; for an incremental receive we
 * also verify the clone swap with the existing head (and, if drc_force is
 * set, that the snapshots in between can be destroyed).
 */
static int
dmu_recv_end_check(void *arg, dmu_tx_t *tx)
{
	dmu_recv_cookie_t *drc = arg;
	dsl_pool_t *dp = dmu_tx_pool(tx);
	int error;

	ASSERT3P(drc->drc_ds->ds_owner, ==, dmu_recv_tag);

	if (!drc->drc_newfs) {
		dsl_dataset_t *origin_head;

		error = dsl_dataset_hold(dp, drc->drc_tofs, FTAG, &origin_head);
		if (error != 0)
			return (error);
		if (drc->drc_force) {
			/*
			 * We will destroy any snapshots in tofs (i.e. before
			 * origin_head) that are after the origin (which is
			 * the snap before drc_ds, because drc_ds cannot
			 * have any snaps of its own).
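			 * Here we only verify that each such snapshot can
			 * be destroyed; dmu_recv_end_sync() does the actual
			 * destruction.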
			 */
			uint64_t obj;

			obj = dsl_dataset_phys(origin_head)->ds_prev_snap_obj;
			while (obj !=
			    dsl_dataset_phys(drc->drc_ds)->ds_prev_snap_obj) {
				dsl_dataset_t *snap;
				error = dsl_dataset_hold_obj(dp, obj, FTAG,
				    &snap);
				if (error != 0)
					break;
				if (snap->ds_dir != origin_head->ds_dir)
					error = SET_ERROR(EINVAL);
				if (error == 0) {
					error = dsl_destroy_snapshot_check_impl(
					    snap, B_FALSE);
				}
				obj = dsl_dataset_phys(snap)->ds_prev_snap_obj;
				dsl_dataset_rele(snap, FTAG);
				if (error != 0)
					break;
			}
			if (error != 0) {
				dsl_dataset_rele(origin_head, FTAG);
				return (error);
			}
		}
		error = dsl_dataset_clone_swap_check_impl(drc->drc_ds,
		    origin_head, drc->drc_force, drc->drc_owner, tx);
		if (error != 0) {
			dsl_dataset_rele(origin_head, FTAG);
			return (error);
		}
		error = dsl_dataset_snapshot_check_impl(origin_head,
		    drc->drc_tosnap, tx, B_TRUE, 1, drc->drc_cred);
		dsl_dataset_rele(origin_head, FTAG);
		if (error != 0)
			return (error);

		error = dsl_destroy_head_check_impl(drc->drc_ds, 1);
	} else {
		error = dsl_dataset_snapshot_check_impl(drc->drc_ds,
		    drc->drc_tosnap, tx, B_TRUE, 1, drc->drc_cred);
	}
	return (error);
}

/*
 * Sync phase of the dmu_recv_end() sync task: for an incremental receive,
 * swap the received clone with the existing head and then snapshot it;
 * for a new filesystem, just snapshot it.  Either way, mark the result
 * consistent and release the hold taken in dmu_recv_begin().
 */
static void
dmu_recv_end_sync(void *arg, dmu_tx_t *tx)
{
	dmu_recv_cookie_t *drc = arg;
	dsl_pool_t *dp = dmu_tx_pool(tx);

	spa_history_log_internal_ds(drc->drc_ds, "finish receiving",
	    tx, "snap=%s", drc->drc_tosnap);

	if (!drc->drc_newfs) {
		dsl_dataset_t *origin_head;

		VERIFY0(dsl_dataset_hold(dp, drc->drc_tofs, FTAG,
		    &origin_head));

		if (drc->drc_force) {
			/*
			 * Destroy any snapshots of drc_tofs (origin_head)
			 * after the origin (the snap before drc_ds).
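			 * dmu_recv_end_check() has already verified that
			 * each of these can be destroyed.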
			 */
			uint64_t obj;

			obj = dsl_dataset_phys(origin_head)->ds_prev_snap_obj;
			while (obj !=
			    dsl_dataset_phys(drc->drc_ds)->ds_prev_snap_obj) {
				dsl_dataset_t *snap;
				VERIFY0(dsl_dataset_hold_obj(dp, obj, FTAG,
				    &snap));
				ASSERT3P(snap->ds_dir, ==, origin_head->ds_dir);
				obj = dsl_dataset_phys(snap)->ds_prev_snap_obj;
				dsl_destroy_snapshot_sync_impl(snap,
				    B_FALSE, tx);
				dsl_dataset_rele(snap, FTAG);
			}
		}
		VERIFY3P(drc->drc_ds->ds_prev, ==,
		    origin_head->ds_prev);

		dsl_dataset_clone_swap_sync_impl(drc->drc_ds,
		    origin_head, tx);
		dsl_dataset_snapshot_sync_impl(origin_head,
		    drc->drc_tosnap, tx);

		/* set snapshot's creation time and guid */
		dmu_buf_will_dirty(origin_head->ds_prev->ds_dbuf, tx);
		dsl_dataset_phys(origin_head->ds_prev)->ds_creation_time =
		    drc->drc_drrb->drr_creation_time;
		dsl_dataset_phys(origin_head->ds_prev)->ds_guid =
		    drc->drc_drrb->drr_toguid;
		dsl_dataset_phys(origin_head->ds_prev)->ds_flags &=
		    ~DS_FLAG_INCONSISTENT;

		dmu_buf_will_dirty(origin_head->ds_dbuf, tx);
		dsl_dataset_phys(origin_head)->ds_flags &=
		    ~DS_FLAG_INCONSISTENT;

		dsl_dataset_rele(origin_head, FTAG);
		dsl_destroy_head_sync_impl(drc->drc_ds, tx);

		if (drc->drc_owner != NULL)
			VERIFY3P(origin_head->ds_owner, ==, drc->drc_owner);
	} else {
		dsl_dataset_t *ds = drc->drc_ds;

		dsl_dataset_snapshot_sync_impl(ds, drc->drc_tosnap, tx);

		/* set snapshot's creation time and guid */
		dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
		dsl_dataset_phys(ds->ds_prev)->ds_creation_time =
		    drc->drc_drrb->drr_creation_time;
		dsl_dataset_phys(ds->ds_prev)->ds_guid =
		    drc->drc_drrb->drr_toguid;
		dsl_dataset_phys(ds->ds_prev)->ds_flags &=
		    ~DS_FLAG_INCONSISTENT;

		dmu_buf_will_dirty(ds->ds_dbuf, tx);
		dsl_dataset_phys(ds)->ds_flags &= ~DS_FLAG_INCONSISTENT;
	}
	drc->drc_newsnapobj = dsl_dataset_phys(drc->drc_ds)->ds_prev_snap_obj;
	/*
	 * Release the hold from dmu_recv_begin.  This must be done before
	 * we return to open context, so that when we free the dataset's dnode,
	 * we can evict its bonus buffer.
	 */
	dsl_dataset_disown(drc->drc_ds, dmu_recv_tag);
	drc->drc_ds = NULL;
}

/*
 * Remember the snapshot identified by snapobj in the guid map, so that
 * DRR_WRITE_BYREF records in later streams of a dedup'ed send can find
 * the data they reference.
 */
static int
add_ds_to_guidmap(const char *name, avl_tree_t *guid_map, uint64_t snapobj)
{
	dsl_pool_t *dp;
	dsl_dataset_t *snapds;
	guid_map_entry_t *gmep;
	int err;

	ASSERT(guid_map != NULL);

	err = dsl_pool_hold(name, FTAG, &dp);
	if (err != 0)
		return (err);
	gmep = kmem_alloc(sizeof (*gmep), KM_SLEEP);
	err = dsl_dataset_hold_obj(dp, snapobj, gmep, &snapds);
	if (err == 0) {
		gmep->guid = dsl_dataset_phys(snapds)->ds_guid;
		gmep->gme_ds = snapds;
		avl_add(guid_map, gmep);
		dsl_dataset_long_hold(snapds, gmep);
	} else {
		kmem_free(gmep, sizeof (*gmep));
	}

	dsl_pool_rele(dp, FTAG);
	return (err);
}

static int dmu_recv_end_modified_blocks = 3;

static int
dmu_recv_existing_end(dmu_recv_cookie_t *drc)
{
	int error;
	char name[MAXNAMELEN];

#ifdef _KERNEL
	/*
	 * We will be destroying the ds; make sure its origin is unmounted if
	 * necessary.
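	 * (Kernel-only: userland consumers of libzpool have nothing
	 * mounted.)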
	 */
	dsl_dataset_name(drc->drc_ds, name);
	zfs_destroy_unmount_origin(name);
#endif

	error = dsl_sync_task(drc->drc_tofs,
	    dmu_recv_end_check, dmu_recv_end_sync, drc,
	    dmu_recv_end_modified_blocks, ZFS_SPACE_CHECK_NORMAL);

	if (error != 0)
		dmu_recv_cleanup_ds(drc);
	return (error);
}

static int
dmu_recv_new_end(dmu_recv_cookie_t *drc)
{
	int error;

	error = dsl_sync_task(drc->drc_tofs,
	    dmu_recv_end_check, dmu_recv_end_sync, drc,
	    dmu_recv_end_modified_blocks, ZFS_SPACE_CHECK_NORMAL);

	if (error != 0) {
		dmu_recv_cleanup_ds(drc);
	} else if (drc->drc_guid_to_ds_map != NULL) {
		(void) add_ds_to_guidmap(drc->drc_tofs,
		    drc->drc_guid_to_ds_map,
		    drc->drc_newsnapobj);
	}
	return (error);
}

int
dmu_recv_end(dmu_recv_cookie_t *drc, void *owner)
{
	drc->drc_owner = owner;

	if (drc->drc_newfs)
		return (dmu_recv_new_end(drc));
	else
		return (dmu_recv_existing_end(drc));
}

/*
 * Return TRUE if this objset is currently being received into.
 */
boolean_t
dmu_objset_is_receiving(objset_t *os)
{
	return (os->os_dsl_dataset != NULL &&
	    os->os_dsl_dataset->ds_owner == dmu_recv_tag);
}