1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 23 * Copyright 2011 Nexenta Systems, Inc. All rights reserved. 24 * Copyright (c) 2011, 2014 by Delphix. All rights reserved. 25 * Copyright (c) 2014, Joyent, Inc. All rights reserved. 26 * Copyright 2014 HybridCluster. All rights reserved. 27 */ 28 29 #include <sys/dmu.h> 30 #include <sys/dmu_impl.h> 31 #include <sys/dmu_tx.h> 32 #include <sys/dbuf.h> 33 #include <sys/dnode.h> 34 #include <sys/zfs_context.h> 35 #include <sys/dmu_objset.h> 36 #include <sys/dmu_traverse.h> 37 #include <sys/dsl_dataset.h> 38 #include <sys/dsl_dir.h> 39 #include <sys/dsl_prop.h> 40 #include <sys/dsl_pool.h> 41 #include <sys/dsl_synctask.h> 42 #include <sys/zfs_ioctl.h> 43 #include <sys/zap.h> 44 #include <sys/zio_checksum.h> 45 #include <sys/zfs_znode.h> 46 #include <zfs_fletcher.h> 47 #include <sys/avl.h> 48 #include <sys/ddt.h> 49 #include <sys/zfs_onexit.h> 50 #include <sys/dmu_send.h> 51 #include <sys/dsl_destroy.h> 52 #include <sys/blkptr.h> 53 #include <sys/dsl_bookmark.h> 54 #include <sys/zfeature.h> 55 56 /* Set this tunable to TRUE to replace corrupt data with 0x2f5baddb10c */ 57 int zfs_send_corrupt_data = B_FALSE; 58 59 static char *dmu_recv_tag = "dmu_recv_tag"; 60 static const char *recv_clone_name = "%recv"; 61 62 static int 63 dump_bytes(dmu_sendarg_t *dsp, void *buf, int len) 64 { 65 dsl_dataset_t *ds = dsp->dsa_os->os_dsl_dataset; 66 ssize_t resid; /* have to get resid to get detailed errno */ 67 ASSERT0(len % 8); 68 69 dsp->dsa_err = vn_rdwr(UIO_WRITE, dsp->dsa_vp, 70 (caddr_t)buf, len, 71 0, UIO_SYSSPACE, FAPPEND, RLIM64_INFINITY, CRED(), &resid); 72 73 mutex_enter(&ds->ds_sendstream_lock); 74 *dsp->dsa_off += len; 75 mutex_exit(&ds->ds_sendstream_lock); 76 77 return (dsp->dsa_err); 78 } 79 80 /* 81 * For all record types except BEGIN, fill in the checksum (overlaid in 82 * drr_u.drr_checksum.drr_checksum). The checksum verifies everything 83 * up to the start of the checksum itself. 84 */ 85 static int 86 dump_record(dmu_sendarg_t *dsp, void *payload, int payload_len) 87 { 88 ASSERT3U(offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum), 89 ==, sizeof (dmu_replay_record_t) - sizeof (zio_cksum_t)); 90 fletcher_4_incremental_native(dsp->dsa_drr, 91 offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum), 92 &dsp->dsa_zc); 93 if (dsp->dsa_drr->drr_type != DRR_BEGIN) { 94 ASSERT(ZIO_CHECKSUM_IS_ZERO(&dsp->dsa_drr->drr_u. 
95 drr_checksum.drr_checksum)); 96 dsp->dsa_drr->drr_u.drr_checksum.drr_checksum = dsp->dsa_zc; 97 } 98 fletcher_4_incremental_native(&dsp->dsa_drr-> 99 drr_u.drr_checksum.drr_checksum, 100 sizeof (zio_cksum_t), &dsp->dsa_zc); 101 if (dump_bytes(dsp, dsp->dsa_drr, sizeof (dmu_replay_record_t)) != 0) 102 return (SET_ERROR(EINTR)); 103 if (payload_len != 0) { 104 fletcher_4_incremental_native(payload, payload_len, 105 &dsp->dsa_zc); 106 if (dump_bytes(dsp, payload, payload_len) != 0) 107 return (SET_ERROR(EINTR)); 108 } 109 return (0); 110 } 111 112 static int 113 dump_free(dmu_sendarg_t *dsp, uint64_t object, uint64_t offset, 114 uint64_t length) 115 { 116 struct drr_free *drrf = &(dsp->dsa_drr->drr_u.drr_free); 117 118 /* 119 * When we receive a free record, dbuf_free_range() assumes 120 * that the receiving system doesn't have any dbufs in the range 121 * being freed. This is always true because there is a one-record 122 * constraint: we only send one WRITE record for any given 123 * object+offset. We know that the one-record constraint is 124 * true because we always send data in increasing order by 125 * object,offset. 126 * 127 * If the increasing-order constraint ever changes, we should find 128 * another way to assert that the one-record constraint is still 129 * satisfied. 130 */ 131 ASSERT(object > dsp->dsa_last_data_object || 132 (object == dsp->dsa_last_data_object && 133 offset > dsp->dsa_last_data_offset)); 134 135 /* 136 * If we are doing a non-incremental send, then there can't 137 * be any data in the dataset we're receiving into. Therefore 138 * a free record would simply be a no-op. Save space by not 139 * sending it to begin with. 140 */ 141 if (!dsp->dsa_incremental) 142 return (0); 143 144 if (length != -1ULL && offset + length < offset) 145 length = -1ULL; 146 147 /* 148 * If there is a pending op, but it's not PENDING_FREE, push it out, 149 * since free block aggregation can only be done for blocks of the 150 * same type (i.e., DRR_FREE records can only be aggregated with 151 * other DRR_FREE records. DRR_FREEOBJECTS records can only be 152 * aggregated with other DRR_FREEOBJECTS records. 153 */ 154 if (dsp->dsa_pending_op != PENDING_NONE && 155 dsp->dsa_pending_op != PENDING_FREE) { 156 if (dump_record(dsp, NULL, 0) != 0) 157 return (SET_ERROR(EINTR)); 158 dsp->dsa_pending_op = PENDING_NONE; 159 } 160 161 if (dsp->dsa_pending_op == PENDING_FREE) { 162 /* 163 * There should never be a PENDING_FREE if length is -1 164 * (because dump_dnode is the only place where this 165 * function is called with a -1, and only after flushing 166 * any pending record). 167 */ 168 ASSERT(length != -1ULL); 169 /* 170 * Check to see whether this free block can be aggregated 171 * with pending one. 172 */ 173 if (drrf->drr_object == object && drrf->drr_offset + 174 drrf->drr_length == offset) { 175 drrf->drr_length += length; 176 return (0); 177 } else { 178 /* not a continuation. 
Push out pending record */ 179 if (dump_record(dsp, NULL, 0) != 0) 180 return (SET_ERROR(EINTR)); 181 dsp->dsa_pending_op = PENDING_NONE; 182 } 183 } 184 /* create a FREE record and make it pending */ 185 bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t)); 186 dsp->dsa_drr->drr_type = DRR_FREE; 187 drrf->drr_object = object; 188 drrf->drr_offset = offset; 189 drrf->drr_length = length; 190 drrf->drr_toguid = dsp->dsa_toguid; 191 if (length == -1ULL) { 192 if (dump_record(dsp, NULL, 0) != 0) 193 return (SET_ERROR(EINTR)); 194 } else { 195 dsp->dsa_pending_op = PENDING_FREE; 196 } 197 198 return (0); 199 } 200 201 static int 202 dump_write(dmu_sendarg_t *dsp, dmu_object_type_t type, 203 uint64_t object, uint64_t offset, int blksz, const blkptr_t *bp, void *data) 204 { 205 struct drr_write *drrw = &(dsp->dsa_drr->drr_u.drr_write); 206 207 /* 208 * We send data in increasing object, offset order. 209 * See comment in dump_free() for details. 210 */ 211 ASSERT(object > dsp->dsa_last_data_object || 212 (object == dsp->dsa_last_data_object && 213 offset > dsp->dsa_last_data_offset)); 214 dsp->dsa_last_data_object = object; 215 dsp->dsa_last_data_offset = offset + blksz - 1; 216 217 /* 218 * If there is any kind of pending aggregation (currently either 219 * a grouping of free objects or free blocks), push it out to 220 * the stream, since aggregation can't be done across operations 221 * of different types. 222 */ 223 if (dsp->dsa_pending_op != PENDING_NONE) { 224 if (dump_record(dsp, NULL, 0) != 0) 225 return (SET_ERROR(EINTR)); 226 dsp->dsa_pending_op = PENDING_NONE; 227 } 228 /* write a WRITE record */ 229 bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t)); 230 dsp->dsa_drr->drr_type = DRR_WRITE; 231 drrw->drr_object = object; 232 drrw->drr_type = type; 233 drrw->drr_offset = offset; 234 drrw->drr_length = blksz; 235 drrw->drr_toguid = dsp->dsa_toguid; 236 if (bp == NULL || BP_IS_EMBEDDED(bp)) { 237 /* 238 * There's no pre-computed checksum for partial-block 239 * writes or embedded BP's, so (like 240 * fletcher4-checkummed blocks) userland will have to 241 * compute a dedup-capable checksum itself. 
242 */ 243 drrw->drr_checksumtype = ZIO_CHECKSUM_OFF; 244 } else { 245 drrw->drr_checksumtype = BP_GET_CHECKSUM(bp); 246 if (zio_checksum_table[drrw->drr_checksumtype].ci_dedup) 247 drrw->drr_checksumflags |= DRR_CHECKSUM_DEDUP; 248 DDK_SET_LSIZE(&drrw->drr_key, BP_GET_LSIZE(bp)); 249 DDK_SET_PSIZE(&drrw->drr_key, BP_GET_PSIZE(bp)); 250 DDK_SET_COMPRESS(&drrw->drr_key, BP_GET_COMPRESS(bp)); 251 drrw->drr_key.ddk_cksum = bp->blk_cksum; 252 } 253 254 if (dump_record(dsp, data, blksz) != 0) 255 return (SET_ERROR(EINTR)); 256 return (0); 257 } 258 259 static int 260 dump_write_embedded(dmu_sendarg_t *dsp, uint64_t object, uint64_t offset, 261 int blksz, const blkptr_t *bp) 262 { 263 char buf[BPE_PAYLOAD_SIZE]; 264 struct drr_write_embedded *drrw = 265 &(dsp->dsa_drr->drr_u.drr_write_embedded); 266 267 if (dsp->dsa_pending_op != PENDING_NONE) { 268 if (dump_record(dsp, NULL, 0) != 0) 269 return (EINTR); 270 dsp->dsa_pending_op = PENDING_NONE; 271 } 272 273 ASSERT(BP_IS_EMBEDDED(bp)); 274 275 bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t)); 276 dsp->dsa_drr->drr_type = DRR_WRITE_EMBEDDED; 277 drrw->drr_object = object; 278 drrw->drr_offset = offset; 279 drrw->drr_length = blksz; 280 drrw->drr_toguid = dsp->dsa_toguid; 281 drrw->drr_compression = BP_GET_COMPRESS(bp); 282 drrw->drr_etype = BPE_GET_ETYPE(bp); 283 drrw->drr_lsize = BPE_GET_LSIZE(bp); 284 drrw->drr_psize = BPE_GET_PSIZE(bp); 285 286 decode_embedded_bp_compressed(bp, buf); 287 288 if (dump_record(dsp, buf, P2ROUNDUP(drrw->drr_psize, 8)) != 0) 289 return (EINTR); 290 return (0); 291 } 292 293 static int 294 dump_spill(dmu_sendarg_t *dsp, uint64_t object, int blksz, void *data) 295 { 296 struct drr_spill *drrs = &(dsp->dsa_drr->drr_u.drr_spill); 297 298 if (dsp->dsa_pending_op != PENDING_NONE) { 299 if (dump_record(dsp, NULL, 0) != 0) 300 return (SET_ERROR(EINTR)); 301 dsp->dsa_pending_op = PENDING_NONE; 302 } 303 304 /* write a SPILL record */ 305 bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t)); 306 dsp->dsa_drr->drr_type = DRR_SPILL; 307 drrs->drr_object = object; 308 drrs->drr_length = blksz; 309 drrs->drr_toguid = dsp->dsa_toguid; 310 311 if (dump_record(dsp, data, blksz) != 0) 312 return (SET_ERROR(EINTR)); 313 return (0); 314 } 315 316 static int 317 dump_freeobjects(dmu_sendarg_t *dsp, uint64_t firstobj, uint64_t numobjs) 318 { 319 struct drr_freeobjects *drrfo = &(dsp->dsa_drr->drr_u.drr_freeobjects); 320 321 /* See comment in dump_free(). */ 322 if (!dsp->dsa_incremental) 323 return (0); 324 325 /* 326 * If there is a pending op, but it's not PENDING_FREEOBJECTS, 327 * push it out, since free block aggregation can only be done for 328 * blocks of the same type (i.e., DRR_FREE records can only be 329 * aggregated with other DRR_FREE records. DRR_FREEOBJECTS records 330 * can only be aggregated with other DRR_FREEOBJECTS records. 331 */ 332 if (dsp->dsa_pending_op != PENDING_NONE && 333 dsp->dsa_pending_op != PENDING_FREEOBJECTS) { 334 if (dump_record(dsp, NULL, 0) != 0) 335 return (SET_ERROR(EINTR)); 336 dsp->dsa_pending_op = PENDING_NONE; 337 } 338 if (dsp->dsa_pending_op == PENDING_FREEOBJECTS) { 339 /* 340 * See whether this free object array can be aggregated 341 * with pending one 342 */ 343 if (drrfo->drr_firstobj + drrfo->drr_numobjs == firstobj) { 344 drrfo->drr_numobjs += numobjs; 345 return (0); 346 } else { 347 /* can't be aggregated. 
Push out pending record */ 348 if (dump_record(dsp, NULL, 0) != 0) 349 return (SET_ERROR(EINTR)); 350 dsp->dsa_pending_op = PENDING_NONE; 351 } 352 } 353 354 /* write a FREEOBJECTS record */ 355 bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t)); 356 dsp->dsa_drr->drr_type = DRR_FREEOBJECTS; 357 drrfo->drr_firstobj = firstobj; 358 drrfo->drr_numobjs = numobjs; 359 drrfo->drr_toguid = dsp->dsa_toguid; 360 361 dsp->dsa_pending_op = PENDING_FREEOBJECTS; 362 363 return (0); 364 } 365 366 static int 367 dump_dnode(dmu_sendarg_t *dsp, uint64_t object, dnode_phys_t *dnp) 368 { 369 struct drr_object *drro = &(dsp->dsa_drr->drr_u.drr_object); 370 371 if (dnp == NULL || dnp->dn_type == DMU_OT_NONE) 372 return (dump_freeobjects(dsp, object, 1)); 373 374 if (dsp->dsa_pending_op != PENDING_NONE) { 375 if (dump_record(dsp, NULL, 0) != 0) 376 return (SET_ERROR(EINTR)); 377 dsp->dsa_pending_op = PENDING_NONE; 378 } 379 380 /* write an OBJECT record */ 381 bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t)); 382 dsp->dsa_drr->drr_type = DRR_OBJECT; 383 drro->drr_object = object; 384 drro->drr_type = dnp->dn_type; 385 drro->drr_bonustype = dnp->dn_bonustype; 386 drro->drr_blksz = dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT; 387 drro->drr_bonuslen = dnp->dn_bonuslen; 388 drro->drr_checksumtype = dnp->dn_checksum; 389 drro->drr_compress = dnp->dn_compress; 390 drro->drr_toguid = dsp->dsa_toguid; 391 392 if (!(dsp->dsa_featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS) && 393 drro->drr_blksz > SPA_OLD_MAXBLOCKSIZE) 394 drro->drr_blksz = SPA_OLD_MAXBLOCKSIZE; 395 396 if (dump_record(dsp, DN_BONUS(dnp), 397 P2ROUNDUP(dnp->dn_bonuslen, 8)) != 0) { 398 return (SET_ERROR(EINTR)); 399 } 400 401 /* Free anything past the end of the file. */ 402 if (dump_free(dsp, object, (dnp->dn_maxblkid + 1) * 403 (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT), -1ULL) != 0) 404 return (SET_ERROR(EINTR)); 405 if (dsp->dsa_err != 0) 406 return (SET_ERROR(EINTR)); 407 return (0); 408 } 409 410 static boolean_t 411 backup_do_embed(dmu_sendarg_t *dsp, const blkptr_t *bp) 412 { 413 if (!BP_IS_EMBEDDED(bp)) 414 return (B_FALSE); 415 416 /* 417 * Compression function must be legacy, or explicitly enabled. 418 */ 419 if ((BP_GET_COMPRESS(bp) >= ZIO_COMPRESS_LEGACY_FUNCTIONS && 420 !(dsp->dsa_featureflags & DMU_BACKUP_FEATURE_EMBED_DATA_LZ4))) 421 return (B_FALSE); 422 423 /* 424 * Embed type must be explicitly enabled. 425 */ 426 switch (BPE_GET_ETYPE(bp)) { 427 case BP_EMBEDDED_TYPE_DATA: 428 if (dsp->dsa_featureflags & DMU_BACKUP_FEATURE_EMBED_DATA) 429 return (B_TRUE); 430 break; 431 default: 432 return (B_FALSE); 433 } 434 return (B_FALSE); 435 } 436 437 #define BP_SPAN(dnp, level) \ 438 (((uint64_t)dnp->dn_datablkszsec) << (SPA_MINBLOCKSHIFT + \ 439 (level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT))) 440 441 /* ARGSUSED */ 442 static int 443 backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, 444 const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg) 445 { 446 dmu_sendarg_t *dsp = arg; 447 dmu_object_type_t type = bp ? BP_GET_TYPE(bp) : DMU_OT_NONE; 448 int err = 0; 449 450 if (issig(JUSTLOOKING) && issig(FORREAL)) 451 return (SET_ERROR(EINTR)); 452 453 if (zb->zb_object != DMU_META_DNODE_OBJECT && 454 DMU_OBJECT_IS_SPECIAL(zb->zb_object)) { 455 return (0); 456 } else if (zb->zb_level == ZB_ZIL_LEVEL) { 457 /* 458 * If we are sending a non-snapshot (which is allowed on 459 * read-only pools), it may have a ZIL, which must be ignored. 
460 */ 461 return (0); 462 } else if (BP_IS_HOLE(bp) && 463 zb->zb_object == DMU_META_DNODE_OBJECT) { 464 uint64_t span = BP_SPAN(dnp, zb->zb_level); 465 uint64_t dnobj = (zb->zb_blkid * span) >> DNODE_SHIFT; 466 err = dump_freeobjects(dsp, dnobj, span >> DNODE_SHIFT); 467 } else if (BP_IS_HOLE(bp)) { 468 uint64_t span = BP_SPAN(dnp, zb->zb_level); 469 err = dump_free(dsp, zb->zb_object, zb->zb_blkid * span, span); 470 } else if (zb->zb_level > 0 || type == DMU_OT_OBJSET) { 471 return (0); 472 } else if (type == DMU_OT_DNODE) { 473 dnode_phys_t *blk; 474 int i; 475 int blksz = BP_GET_LSIZE(bp); 476 arc_flags_t aflags = ARC_FLAG_WAIT; 477 arc_buf_t *abuf; 478 479 if (arc_read(NULL, spa, bp, arc_getbuf_func, &abuf, 480 ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, 481 &aflags, zb) != 0) 482 return (SET_ERROR(EIO)); 483 484 blk = abuf->b_data; 485 for (i = 0; i < blksz >> DNODE_SHIFT; i++) { 486 uint64_t dnobj = (zb->zb_blkid << 487 (DNODE_BLOCK_SHIFT - DNODE_SHIFT)) + i; 488 err = dump_dnode(dsp, dnobj, blk+i); 489 if (err != 0) 490 break; 491 } 492 (void) arc_buf_remove_ref(abuf, &abuf); 493 } else if (type == DMU_OT_SA) { 494 arc_flags_t aflags = ARC_FLAG_WAIT; 495 arc_buf_t *abuf; 496 int blksz = BP_GET_LSIZE(bp); 497 498 if (arc_read(NULL, spa, bp, arc_getbuf_func, &abuf, 499 ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, 500 &aflags, zb) != 0) 501 return (SET_ERROR(EIO)); 502 503 err = dump_spill(dsp, zb->zb_object, blksz, abuf->b_data); 504 (void) arc_buf_remove_ref(abuf, &abuf); 505 } else if (backup_do_embed(dsp, bp)) { 506 /* it's an embedded level-0 block of a regular object */ 507 int blksz = dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT; 508 err = dump_write_embedded(dsp, zb->zb_object, 509 zb->zb_blkid * blksz, blksz, bp); 510 } else { /* it's a level-0 block of a regular object */ 511 arc_flags_t aflags = ARC_FLAG_WAIT; 512 arc_buf_t *abuf; 513 int blksz = BP_GET_LSIZE(bp); 514 uint64_t offset; 515 516 ASSERT3U(blksz, ==, dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT); 517 ASSERT0(zb->zb_level); 518 if (arc_read(NULL, spa, bp, arc_getbuf_func, &abuf, 519 ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, 520 &aflags, zb) != 0) { 521 if (zfs_send_corrupt_data) { 522 /* Send a block filled with 0x"zfs badd bloc" */ 523 abuf = arc_buf_alloc(spa, blksz, &abuf, 524 ARC_BUFC_DATA); 525 uint64_t *ptr; 526 for (ptr = abuf->b_data; 527 (char *)ptr < (char *)abuf->b_data + blksz; 528 ptr++) 529 *ptr = 0x2f5baddb10cULL; 530 } else { 531 return (SET_ERROR(EIO)); 532 } 533 } 534 535 offset = zb->zb_blkid * blksz; 536 537 if (!(dsp->dsa_featureflags & 538 DMU_BACKUP_FEATURE_LARGE_BLOCKS) && 539 blksz > SPA_OLD_MAXBLOCKSIZE) { 540 char *buf = abuf->b_data; 541 while (blksz > 0 && err == 0) { 542 int n = MIN(blksz, SPA_OLD_MAXBLOCKSIZE); 543 err = dump_write(dsp, type, zb->zb_object, 544 offset, n, NULL, buf); 545 offset += n; 546 buf += n; 547 blksz -= n; 548 } 549 } else { 550 err = dump_write(dsp, type, zb->zb_object, 551 offset, blksz, bp, abuf->b_data); 552 } 553 (void) arc_buf_remove_ref(abuf, &abuf); 554 } 555 556 ASSERT(err == 0 || err == EINTR); 557 return (err); 558 } 559 560 /* 561 * Releases dp using the specified tag. 
562 */ 563 static int 564 dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *ds, 565 zfs_bookmark_phys_t *fromzb, boolean_t is_clone, boolean_t embedok, 566 boolean_t large_block_ok, int outfd, vnode_t *vp, offset_t *off) 567 { 568 objset_t *os; 569 dmu_replay_record_t *drr; 570 dmu_sendarg_t *dsp; 571 int err; 572 uint64_t fromtxg = 0; 573 uint64_t featureflags = 0; 574 575 err = dmu_objset_from_ds(ds, &os); 576 if (err != 0) { 577 dsl_pool_rele(dp, tag); 578 return (err); 579 } 580 581 drr = kmem_zalloc(sizeof (dmu_replay_record_t), KM_SLEEP); 582 drr->drr_type = DRR_BEGIN; 583 drr->drr_u.drr_begin.drr_magic = DMU_BACKUP_MAGIC; 584 DMU_SET_STREAM_HDRTYPE(drr->drr_u.drr_begin.drr_versioninfo, 585 DMU_SUBSTREAM); 586 587 #ifdef _KERNEL 588 if (dmu_objset_type(os) == DMU_OST_ZFS) { 589 uint64_t version; 590 if (zfs_get_zplprop(os, ZFS_PROP_VERSION, &version) != 0) { 591 kmem_free(drr, sizeof (dmu_replay_record_t)); 592 dsl_pool_rele(dp, tag); 593 return (SET_ERROR(EINVAL)); 594 } 595 if (version >= ZPL_VERSION_SA) { 596 featureflags |= DMU_BACKUP_FEATURE_SA_SPILL; 597 } 598 } 599 #endif 600 601 if (large_block_ok && ds->ds_large_blocks) 602 featureflags |= DMU_BACKUP_FEATURE_LARGE_BLOCKS; 603 if (embedok && 604 spa_feature_is_active(dp->dp_spa, SPA_FEATURE_EMBEDDED_DATA)) { 605 featureflags |= DMU_BACKUP_FEATURE_EMBED_DATA; 606 if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_LZ4_COMPRESS)) 607 featureflags |= DMU_BACKUP_FEATURE_EMBED_DATA_LZ4; 608 } else { 609 embedok = B_FALSE; 610 } 611 612 DMU_SET_FEATUREFLAGS(drr->drr_u.drr_begin.drr_versioninfo, 613 featureflags); 614 615 drr->drr_u.drr_begin.drr_creation_time = 616 dsl_dataset_phys(ds)->ds_creation_time; 617 drr->drr_u.drr_begin.drr_type = dmu_objset_type(os); 618 if (is_clone) 619 drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_CLONE; 620 drr->drr_u.drr_begin.drr_toguid = dsl_dataset_phys(ds)->ds_guid; 621 if (dsl_dataset_phys(ds)->ds_flags & DS_FLAG_CI_DATASET) 622 drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_CI_DATA; 623 624 if (fromzb != NULL) { 625 drr->drr_u.drr_begin.drr_fromguid = fromzb->zbm_guid; 626 fromtxg = fromzb->zbm_creation_txg; 627 } 628 dsl_dataset_name(ds, drr->drr_u.drr_begin.drr_toname); 629 if (!ds->ds_is_snapshot) { 630 (void) strlcat(drr->drr_u.drr_begin.drr_toname, "@--head--", 631 sizeof (drr->drr_u.drr_begin.drr_toname)); 632 } 633 634 dsp = kmem_zalloc(sizeof (dmu_sendarg_t), KM_SLEEP); 635 636 dsp->dsa_drr = drr; 637 dsp->dsa_vp = vp; 638 dsp->dsa_outfd = outfd; 639 dsp->dsa_proc = curproc; 640 dsp->dsa_os = os; 641 dsp->dsa_off = off; 642 dsp->dsa_toguid = dsl_dataset_phys(ds)->ds_guid; 643 dsp->dsa_pending_op = PENDING_NONE; 644 dsp->dsa_incremental = (fromzb != NULL); 645 dsp->dsa_featureflags = featureflags; 646 647 mutex_enter(&ds->ds_sendstream_lock); 648 list_insert_head(&ds->ds_sendstreams, dsp); 649 mutex_exit(&ds->ds_sendstream_lock); 650 651 dsl_dataset_long_hold(ds, FTAG); 652 dsl_pool_rele(dp, tag); 653 654 if (dump_record(dsp, NULL, 0) != 0) { 655 err = dsp->dsa_err; 656 goto out; 657 } 658 659 err = traverse_dataset(ds, fromtxg, TRAVERSE_PRE | TRAVERSE_PREFETCH, 660 backup_cb, dsp); 661 662 if (dsp->dsa_pending_op != PENDING_NONE) 663 if (dump_record(dsp, NULL, 0) != 0) 664 err = SET_ERROR(EINTR); 665 666 if (err != 0) { 667 if (err == EINTR && dsp->dsa_err != 0) 668 err = dsp->dsa_err; 669 goto out; 670 } 671 672 bzero(drr, sizeof (dmu_replay_record_t)); 673 drr->drr_type = DRR_END; 674 drr->drr_u.drr_end.drr_checksum = dsp->dsa_zc; 675 drr->drr_u.drr_end.drr_toguid = dsp->dsa_toguid; 676 
677 if (dump_record(dsp, NULL, 0) != 0) { 678 err = dsp->dsa_err; 679 goto out; 680 } 681 682 out: 683 mutex_enter(&ds->ds_sendstream_lock); 684 list_remove(&ds->ds_sendstreams, dsp); 685 mutex_exit(&ds->ds_sendstream_lock); 686 687 kmem_free(drr, sizeof (dmu_replay_record_t)); 688 kmem_free(dsp, sizeof (dmu_sendarg_t)); 689 690 dsl_dataset_long_rele(ds, FTAG); 691 692 return (err); 693 } 694 695 int 696 dmu_send_obj(const char *pool, uint64_t tosnap, uint64_t fromsnap, 697 boolean_t embedok, boolean_t large_block_ok, 698 int outfd, vnode_t *vp, offset_t *off) 699 { 700 dsl_pool_t *dp; 701 dsl_dataset_t *ds; 702 dsl_dataset_t *fromds = NULL; 703 int err; 704 705 err = dsl_pool_hold(pool, FTAG, &dp); 706 if (err != 0) 707 return (err); 708 709 err = dsl_dataset_hold_obj(dp, tosnap, FTAG, &ds); 710 if (err != 0) { 711 dsl_pool_rele(dp, FTAG); 712 return (err); 713 } 714 715 if (fromsnap != 0) { 716 zfs_bookmark_phys_t zb; 717 boolean_t is_clone; 718 719 err = dsl_dataset_hold_obj(dp, fromsnap, FTAG, &fromds); 720 if (err != 0) { 721 dsl_dataset_rele(ds, FTAG); 722 dsl_pool_rele(dp, FTAG); 723 return (err); 724 } 725 if (!dsl_dataset_is_before(ds, fromds, 0)) 726 err = SET_ERROR(EXDEV); 727 zb.zbm_creation_time = 728 dsl_dataset_phys(fromds)->ds_creation_time; 729 zb.zbm_creation_txg = dsl_dataset_phys(fromds)->ds_creation_txg; 730 zb.zbm_guid = dsl_dataset_phys(fromds)->ds_guid; 731 is_clone = (fromds->ds_dir != ds->ds_dir); 732 dsl_dataset_rele(fromds, FTAG); 733 err = dmu_send_impl(FTAG, dp, ds, &zb, is_clone, 734 embedok, large_block_ok, outfd, vp, off); 735 } else { 736 err = dmu_send_impl(FTAG, dp, ds, NULL, B_FALSE, 737 embedok, large_block_ok, outfd, vp, off); 738 } 739 dsl_dataset_rele(ds, FTAG); 740 return (err); 741 } 742 743 int 744 dmu_send(const char *tosnap, const char *fromsnap, 745 boolean_t embedok, boolean_t large_block_ok, 746 int outfd, vnode_t *vp, offset_t *off) 747 { 748 dsl_pool_t *dp; 749 dsl_dataset_t *ds; 750 int err; 751 boolean_t owned = B_FALSE; 752 753 if (fromsnap != NULL && strpbrk(fromsnap, "@#") == NULL) 754 return (SET_ERROR(EINVAL)); 755 756 err = dsl_pool_hold(tosnap, FTAG, &dp); 757 if (err != 0) 758 return (err); 759 760 if (strchr(tosnap, '@') == NULL && spa_writeable(dp->dp_spa)) { 761 /* 762 * We are sending a filesystem or volume. Ensure 763 * that it doesn't change by owning the dataset. 764 */ 765 err = dsl_dataset_own(dp, tosnap, FTAG, &ds); 766 owned = B_TRUE; 767 } else { 768 err = dsl_dataset_hold(dp, tosnap, FTAG, &ds); 769 } 770 if (err != 0) { 771 dsl_pool_rele(dp, FTAG); 772 return (err); 773 } 774 775 if (fromsnap != NULL) { 776 zfs_bookmark_phys_t zb; 777 boolean_t is_clone = B_FALSE; 778 int fsnamelen = strchr(tosnap, '@') - tosnap; 779 780 /* 781 * If the fromsnap is in a different filesystem, then 782 * mark the send stream as a clone. 
783 */ 784 if (strncmp(tosnap, fromsnap, fsnamelen) != 0 || 785 (fromsnap[fsnamelen] != '@' && 786 fromsnap[fsnamelen] != '#')) { 787 is_clone = B_TRUE; 788 } 789 790 if (strchr(fromsnap, '@')) { 791 dsl_dataset_t *fromds; 792 err = dsl_dataset_hold(dp, fromsnap, FTAG, &fromds); 793 if (err == 0) { 794 if (!dsl_dataset_is_before(ds, fromds, 0)) 795 err = SET_ERROR(EXDEV); 796 zb.zbm_creation_time = 797 dsl_dataset_phys(fromds)->ds_creation_time; 798 zb.zbm_creation_txg = 799 dsl_dataset_phys(fromds)->ds_creation_txg; 800 zb.zbm_guid = dsl_dataset_phys(fromds)->ds_guid; 801 is_clone = (ds->ds_dir != fromds->ds_dir); 802 dsl_dataset_rele(fromds, FTAG); 803 } 804 } else { 805 err = dsl_bookmark_lookup(dp, fromsnap, ds, &zb); 806 } 807 if (err != 0) { 808 dsl_dataset_rele(ds, FTAG); 809 dsl_pool_rele(dp, FTAG); 810 return (err); 811 } 812 err = dmu_send_impl(FTAG, dp, ds, &zb, is_clone, 813 embedok, large_block_ok, outfd, vp, off); 814 } else { 815 err = dmu_send_impl(FTAG, dp, ds, NULL, B_FALSE, 816 embedok, large_block_ok, outfd, vp, off); 817 } 818 if (owned) 819 dsl_dataset_disown(ds, FTAG); 820 else 821 dsl_dataset_rele(ds, FTAG); 822 return (err); 823 } 824 825 static int 826 dmu_adjust_send_estimate_for_indirects(dsl_dataset_t *ds, uint64_t size, 827 uint64_t *sizep) 828 { 829 int err; 830 /* 831 * Assume that space (both on-disk and in-stream) is dominated by 832 * data. We will adjust for indirect blocks and the copies property, 833 * but ignore per-object space used (eg, dnodes and DRR_OBJECT records). 834 */ 835 836 /* 837 * Subtract out approximate space used by indirect blocks. 838 * Assume most space is used by data blocks (non-indirect, non-dnode). 839 * Assume all blocks are recordsize. Assume ditto blocks and 840 * internal fragmentation counter out compression. 841 * 842 * Therefore, space used by indirect blocks is sizeof(blkptr_t) per 843 * block, which we observe in practice. 844 */ 845 uint64_t recordsize; 846 err = dsl_prop_get_int_ds(ds, "recordsize", &recordsize); 847 if (err != 0) 848 return (err); 849 size -= size / recordsize * sizeof (blkptr_t); 850 851 /* Add in the space for the record associated with each block. */ 852 size += size / recordsize * sizeof (dmu_replay_record_t); 853 854 *sizep = size; 855 856 return (0); 857 } 858 859 int 860 dmu_send_estimate(dsl_dataset_t *ds, dsl_dataset_t *fromds, uint64_t *sizep) 861 { 862 dsl_pool_t *dp = ds->ds_dir->dd_pool; 863 int err; 864 uint64_t size; 865 866 ASSERT(dsl_pool_config_held(dp)); 867 868 /* tosnap must be a snapshot */ 869 if (!ds->ds_is_snapshot) 870 return (SET_ERROR(EINVAL)); 871 872 /* 873 * fromsnap must be an earlier snapshot from the same fs as tosnap, 874 * or the origin's fs. 875 */ 876 if (fromds != NULL && !dsl_dataset_is_before(ds, fromds, 0)) 877 return (SET_ERROR(EXDEV)); 878 879 /* Get uncompressed size estimate of changed data. 
*/ 880 if (fromds == NULL) { 881 size = dsl_dataset_phys(ds)->ds_uncompressed_bytes; 882 } else { 883 uint64_t used, comp; 884 err = dsl_dataset_space_written(fromds, ds, 885 &used, &comp, &size); 886 if (err != 0) 887 return (err); 888 } 889 890 err = dmu_adjust_send_estimate_for_indirects(ds, size, sizep); 891 return (err); 892 } 893 894 /* 895 * Simple callback used to traverse the blocks of a snapshot and sum their 896 * uncompressed size 897 */ 898 /* ARGSUSED */ 899 static int 900 dmu_calculate_send_traversal(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, 901 const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg) 902 { 903 uint64_t *spaceptr = arg; 904 if (bp != NULL && !BP_IS_HOLE(bp)) { 905 *spaceptr += BP_GET_UCSIZE(bp); 906 } 907 return (0); 908 } 909 910 /* 911 * Given a desination snapshot and a TXG, calculate the approximate size of a 912 * send stream sent from that TXG. from_txg may be zero, indicating that the 913 * whole snapshot will be sent. 914 */ 915 int 916 dmu_send_estimate_from_txg(dsl_dataset_t *ds, uint64_t from_txg, 917 uint64_t *sizep) 918 { 919 dsl_pool_t *dp = ds->ds_dir->dd_pool; 920 int err; 921 uint64_t size = 0; 922 923 ASSERT(dsl_pool_config_held(dp)); 924 925 /* tosnap must be a snapshot */ 926 if (!dsl_dataset_is_snapshot(ds)) 927 return (SET_ERROR(EINVAL)); 928 929 /* verify that from_txg is before the provided snapshot was taken */ 930 if (from_txg >= dsl_dataset_phys(ds)->ds_creation_txg) { 931 return (SET_ERROR(EXDEV)); 932 } 933 934 /* 935 * traverse the blocks of the snapshot with birth times after 936 * from_txg, summing their uncompressed size 937 */ 938 err = traverse_dataset(ds, from_txg, TRAVERSE_POST, 939 dmu_calculate_send_traversal, &size); 940 if (err) 941 return (err); 942 943 err = dmu_adjust_send_estimate_for_indirects(ds, size, sizep); 944 return (err); 945 } 946 947 typedef struct dmu_recv_begin_arg { 948 const char *drba_origin; 949 dmu_recv_cookie_t *drba_cookie; 950 cred_t *drba_cred; 951 uint64_t drba_snapobj; 952 } dmu_recv_begin_arg_t; 953 954 static int 955 recv_begin_check_existing_impl(dmu_recv_begin_arg_t *drba, dsl_dataset_t *ds, 956 uint64_t fromguid) 957 { 958 uint64_t val; 959 int error; 960 dsl_pool_t *dp = ds->ds_dir->dd_pool; 961 962 /* temporary clone name must not exist */ 963 error = zap_lookup(dp->dp_meta_objset, 964 dsl_dir_phys(ds->ds_dir)->dd_child_dir_zapobj, recv_clone_name, 965 8, 1, &val); 966 if (error != ENOENT) 967 return (error == 0 ? EBUSY : error); 968 969 /* new snapshot name must not exist */ 970 error = zap_lookup(dp->dp_meta_objset, 971 dsl_dataset_phys(ds)->ds_snapnames_zapobj, 972 drba->drba_cookie->drc_tosnap, 8, 1, &val); 973 if (error != ENOENT) 974 return (error == 0 ? EEXIST : error); 975 976 /* 977 * Check snapshot limit before receiving. We'll recheck again at the 978 * end, but might as well abort before receiving if we're already over 979 * the limit. 980 * 981 * Note that we do not check the file system limit with 982 * dsl_dir_fscount_check because the temporary %clones don't count 983 * against that limit. 984 */ 985 error = dsl_fs_ss_limit_check(ds->ds_dir, 1, ZFS_PROP_SNAPSHOT_LIMIT, 986 NULL, drba->drba_cred); 987 if (error != 0) 988 return (error); 989 990 if (fromguid != 0) { 991 dsl_dataset_t *snap; 992 uint64_t obj = dsl_dataset_phys(ds)->ds_prev_snap_obj; 993 994 /* Find snapshot in this dir that matches fromguid. 
*/ 995 while (obj != 0) { 996 error = dsl_dataset_hold_obj(dp, obj, FTAG, 997 &snap); 998 if (error != 0) 999 return (SET_ERROR(ENODEV)); 1000 if (snap->ds_dir != ds->ds_dir) { 1001 dsl_dataset_rele(snap, FTAG); 1002 return (SET_ERROR(ENODEV)); 1003 } 1004 if (dsl_dataset_phys(snap)->ds_guid == fromguid) 1005 break; 1006 obj = dsl_dataset_phys(snap)->ds_prev_snap_obj; 1007 dsl_dataset_rele(snap, FTAG); 1008 } 1009 if (obj == 0) 1010 return (SET_ERROR(ENODEV)); 1011 1012 if (drba->drba_cookie->drc_force) { 1013 drba->drba_snapobj = obj; 1014 } else { 1015 /* 1016 * If we are not forcing, there must be no 1017 * changes since fromsnap. 1018 */ 1019 if (dsl_dataset_modified_since_snap(ds, snap)) { 1020 dsl_dataset_rele(snap, FTAG); 1021 return (SET_ERROR(ETXTBSY)); 1022 } 1023 drba->drba_snapobj = ds->ds_prev->ds_object; 1024 } 1025 1026 dsl_dataset_rele(snap, FTAG); 1027 } else { 1028 /* if full, most recent snapshot must be $ORIGIN */ 1029 if (dsl_dataset_phys(ds)->ds_prev_snap_txg >= TXG_INITIAL) 1030 return (SET_ERROR(ENODEV)); 1031 drba->drba_snapobj = dsl_dataset_phys(ds)->ds_prev_snap_obj; 1032 } 1033 1034 return (0); 1035 1036 } 1037 1038 static int 1039 dmu_recv_begin_check(void *arg, dmu_tx_t *tx) 1040 { 1041 dmu_recv_begin_arg_t *drba = arg; 1042 dsl_pool_t *dp = dmu_tx_pool(tx); 1043 struct drr_begin *drrb = drba->drba_cookie->drc_drrb; 1044 uint64_t fromguid = drrb->drr_fromguid; 1045 int flags = drrb->drr_flags; 1046 int error; 1047 uint64_t featureflags = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo); 1048 dsl_dataset_t *ds; 1049 const char *tofs = drba->drba_cookie->drc_tofs; 1050 1051 /* already checked */ 1052 ASSERT3U(drrb->drr_magic, ==, DMU_BACKUP_MAGIC); 1053 1054 if (DMU_GET_STREAM_HDRTYPE(drrb->drr_versioninfo) == 1055 DMU_COMPOUNDSTREAM || 1056 drrb->drr_type >= DMU_OST_NUMTYPES || 1057 ((flags & DRR_FLAG_CLONE) && drba->drba_origin == NULL)) 1058 return (SET_ERROR(EINVAL)); 1059 1060 /* Verify pool version supports SA if SA_SPILL feature set */ 1061 if ((featureflags & DMU_BACKUP_FEATURE_SA_SPILL) && 1062 spa_version(dp->dp_spa) < SPA_VERSION_SA) 1063 return (SET_ERROR(ENOTSUP)); 1064 1065 /* 1066 * The receiving code doesn't know how to translate a WRITE_EMBEDDED 1067 * record to a plan WRITE record, so the pool must have the 1068 * EMBEDDED_DATA feature enabled if the stream has WRITE_EMBEDDED 1069 * records. Same with WRITE_EMBEDDED records that use LZ4 compression. 1070 */ 1071 if ((featureflags & DMU_BACKUP_FEATURE_EMBED_DATA) && 1072 !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_EMBEDDED_DATA)) 1073 return (SET_ERROR(ENOTSUP)); 1074 if ((featureflags & DMU_BACKUP_FEATURE_EMBED_DATA_LZ4) && 1075 !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LZ4_COMPRESS)) 1076 return (SET_ERROR(ENOTSUP)); 1077 1078 /* 1079 * The receiving code doesn't know how to translate large blocks 1080 * to smaller ones, so the pool must have the LARGE_BLOCKS 1081 * feature enabled if the stream has LARGE_BLOCKS. 
1082 */ 1083 if ((featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS) && 1084 !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LARGE_BLOCKS)) 1085 return (SET_ERROR(ENOTSUP)); 1086 1087 error = dsl_dataset_hold(dp, tofs, FTAG, &ds); 1088 if (error == 0) { 1089 /* target fs already exists; recv into temp clone */ 1090 1091 /* Can't recv a clone into an existing fs */ 1092 if (flags & DRR_FLAG_CLONE) { 1093 dsl_dataset_rele(ds, FTAG); 1094 return (SET_ERROR(EINVAL)); 1095 } 1096 1097 error = recv_begin_check_existing_impl(drba, ds, fromguid); 1098 dsl_dataset_rele(ds, FTAG); 1099 } else if (error == ENOENT) { 1100 /* target fs does not exist; must be a full backup or clone */ 1101 char buf[MAXNAMELEN]; 1102 1103 /* 1104 * If it's a non-clone incremental, we are missing the 1105 * target fs, so fail the recv. 1106 */ 1107 if (fromguid != 0 && !(flags & DRR_FLAG_CLONE)) 1108 return (SET_ERROR(ENOENT)); 1109 1110 /* Open the parent of tofs */ 1111 ASSERT3U(strlen(tofs), <, MAXNAMELEN); 1112 (void) strlcpy(buf, tofs, strrchr(tofs, '/') - tofs + 1); 1113 error = dsl_dataset_hold(dp, buf, FTAG, &ds); 1114 if (error != 0) 1115 return (error); 1116 1117 /* 1118 * Check filesystem and snapshot limits before receiving. We'll 1119 * recheck snapshot limits again at the end (we create the 1120 * filesystems and increment those counts during begin_sync). 1121 */ 1122 error = dsl_fs_ss_limit_check(ds->ds_dir, 1, 1123 ZFS_PROP_FILESYSTEM_LIMIT, NULL, drba->drba_cred); 1124 if (error != 0) { 1125 dsl_dataset_rele(ds, FTAG); 1126 return (error); 1127 } 1128 1129 error = dsl_fs_ss_limit_check(ds->ds_dir, 1, 1130 ZFS_PROP_SNAPSHOT_LIMIT, NULL, drba->drba_cred); 1131 if (error != 0) { 1132 dsl_dataset_rele(ds, FTAG); 1133 return (error); 1134 } 1135 1136 if (drba->drba_origin != NULL) { 1137 dsl_dataset_t *origin; 1138 error = dsl_dataset_hold(dp, drba->drba_origin, 1139 FTAG, &origin); 1140 if (error != 0) { 1141 dsl_dataset_rele(ds, FTAG); 1142 return (error); 1143 } 1144 if (!origin->ds_is_snapshot) { 1145 dsl_dataset_rele(origin, FTAG); 1146 dsl_dataset_rele(ds, FTAG); 1147 return (SET_ERROR(EINVAL)); 1148 } 1149 if (dsl_dataset_phys(origin)->ds_guid != fromguid) { 1150 dsl_dataset_rele(origin, FTAG); 1151 dsl_dataset_rele(ds, FTAG); 1152 return (SET_ERROR(ENODEV)); 1153 } 1154 dsl_dataset_rele(origin, FTAG); 1155 } 1156 dsl_dataset_rele(ds, FTAG); 1157 error = 0; 1158 } 1159 return (error); 1160 } 1161 1162 static void 1163 dmu_recv_begin_sync(void *arg, dmu_tx_t *tx) 1164 { 1165 dmu_recv_begin_arg_t *drba = arg; 1166 dsl_pool_t *dp = dmu_tx_pool(tx); 1167 struct drr_begin *drrb = drba->drba_cookie->drc_drrb; 1168 const char *tofs = drba->drba_cookie->drc_tofs; 1169 dsl_dataset_t *ds, *newds; 1170 uint64_t dsobj; 1171 int error; 1172 uint64_t crflags; 1173 1174 crflags = (drrb->drr_flags & DRR_FLAG_CI_DATA) ? 
1175 DS_FLAG_CI_DATASET : 0; 1176 1177 error = dsl_dataset_hold(dp, tofs, FTAG, &ds); 1178 if (error == 0) { 1179 /* create temporary clone */ 1180 dsl_dataset_t *snap = NULL; 1181 if (drba->drba_snapobj != 0) { 1182 VERIFY0(dsl_dataset_hold_obj(dp, 1183 drba->drba_snapobj, FTAG, &snap)); 1184 } 1185 dsobj = dsl_dataset_create_sync(ds->ds_dir, recv_clone_name, 1186 snap, crflags, drba->drba_cred, tx); 1187 if (drba->drba_snapobj != 0) 1188 dsl_dataset_rele(snap, FTAG); 1189 dsl_dataset_rele(ds, FTAG); 1190 } else { 1191 dsl_dir_t *dd; 1192 const char *tail; 1193 dsl_dataset_t *origin = NULL; 1194 1195 VERIFY0(dsl_dir_hold(dp, tofs, FTAG, &dd, &tail)); 1196 1197 if (drba->drba_origin != NULL) { 1198 VERIFY0(dsl_dataset_hold(dp, drba->drba_origin, 1199 FTAG, &origin)); 1200 } 1201 1202 /* Create new dataset. */ 1203 dsobj = dsl_dataset_create_sync(dd, 1204 strrchr(tofs, '/') + 1, 1205 origin, crflags, drba->drba_cred, tx); 1206 if (origin != NULL) 1207 dsl_dataset_rele(origin, FTAG); 1208 dsl_dir_rele(dd, FTAG); 1209 drba->drba_cookie->drc_newfs = B_TRUE; 1210 } 1211 VERIFY0(dsl_dataset_own_obj(dp, dsobj, dmu_recv_tag, &newds)); 1212 1213 if ((DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo) & 1214 DMU_BACKUP_FEATURE_LARGE_BLOCKS) && 1215 !newds->ds_large_blocks) { 1216 dsl_dataset_activate_large_blocks_sync_impl(dsobj, tx); 1217 newds->ds_large_blocks = B_TRUE; 1218 } 1219 1220 dmu_buf_will_dirty(newds->ds_dbuf, tx); 1221 dsl_dataset_phys(newds)->ds_flags |= DS_FLAG_INCONSISTENT; 1222 1223 /* 1224 * If we actually created a non-clone, we need to create the 1225 * objset in our new dataset. 1226 */ 1227 if (BP_IS_HOLE(dsl_dataset_get_blkptr(newds))) { 1228 (void) dmu_objset_create_impl(dp->dp_spa, 1229 newds, dsl_dataset_get_blkptr(newds), drrb->drr_type, tx); 1230 } 1231 1232 drba->drba_cookie->drc_ds = newds; 1233 1234 spa_history_log_internal_ds(newds, "receive", tx, ""); 1235 } 1236 1237 /* 1238 * NB: callers *MUST* call dmu_recv_stream() if dmu_recv_begin() 1239 * succeeds; otherwise we will leak the holds on the datasets. 
1240 */ 1241 int 1242 dmu_recv_begin(char *tofs, char *tosnap, struct drr_begin *drrb, 1243 boolean_t force, char *origin, dmu_recv_cookie_t *drc) 1244 { 1245 dmu_recv_begin_arg_t drba = { 0 }; 1246 dmu_replay_record_t *drr; 1247 1248 bzero(drc, sizeof (dmu_recv_cookie_t)); 1249 drc->drc_drrb = drrb; 1250 drc->drc_tosnap = tosnap; 1251 drc->drc_tofs = tofs; 1252 drc->drc_force = force; 1253 drc->drc_cred = CRED(); 1254 1255 if (drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC)) 1256 drc->drc_byteswap = B_TRUE; 1257 else if (drrb->drr_magic != DMU_BACKUP_MAGIC) 1258 return (SET_ERROR(EINVAL)); 1259 1260 drr = kmem_zalloc(sizeof (dmu_replay_record_t), KM_SLEEP); 1261 drr->drr_type = DRR_BEGIN; 1262 drr->drr_u.drr_begin = *drc->drc_drrb; 1263 if (drc->drc_byteswap) { 1264 fletcher_4_incremental_byteswap(drr, 1265 sizeof (dmu_replay_record_t), &drc->drc_cksum); 1266 } else { 1267 fletcher_4_incremental_native(drr, 1268 sizeof (dmu_replay_record_t), &drc->drc_cksum); 1269 } 1270 kmem_free(drr, sizeof (dmu_replay_record_t)); 1271 1272 if (drc->drc_byteswap) { 1273 drrb->drr_magic = BSWAP_64(drrb->drr_magic); 1274 drrb->drr_versioninfo = BSWAP_64(drrb->drr_versioninfo); 1275 drrb->drr_creation_time = BSWAP_64(drrb->drr_creation_time); 1276 drrb->drr_type = BSWAP_32(drrb->drr_type); 1277 drrb->drr_toguid = BSWAP_64(drrb->drr_toguid); 1278 drrb->drr_fromguid = BSWAP_64(drrb->drr_fromguid); 1279 } 1280 1281 drba.drba_origin = origin; 1282 drba.drba_cookie = drc; 1283 drba.drba_cred = CRED(); 1284 1285 return (dsl_sync_task(tofs, dmu_recv_begin_check, dmu_recv_begin_sync, 1286 &drba, 5, ZFS_SPACE_CHECK_NORMAL)); 1287 } 1288 1289 struct restorearg { 1290 objset_t *os; 1291 int err; 1292 boolean_t byteswap; 1293 vnode_t *vp; 1294 uint64_t voff; 1295 int bufsize; /* amount of memory allocated for buf */ 1296 1297 dmu_replay_record_t *drr; 1298 dmu_replay_record_t *next_drr; 1299 char *buf; 1300 zio_cksum_t cksum; 1301 zio_cksum_t prev_cksum; 1302 1303 avl_tree_t *guid_to_ds_map; 1304 }; 1305 1306 typedef struct guid_map_entry { 1307 uint64_t guid; 1308 dsl_dataset_t *gme_ds; 1309 avl_node_t avlnode; 1310 } guid_map_entry_t; 1311 1312 static int 1313 guid_compare(const void *arg1, const void *arg2) 1314 { 1315 const guid_map_entry_t *gmep1 = arg1; 1316 const guid_map_entry_t *gmep2 = arg2; 1317 1318 if (gmep1->guid < gmep2->guid) 1319 return (-1); 1320 else if (gmep1->guid > gmep2->guid) 1321 return (1); 1322 return (0); 1323 } 1324 1325 static void 1326 free_guid_map_onexit(void *arg) 1327 { 1328 avl_tree_t *ca = arg; 1329 void *cookie = NULL; 1330 guid_map_entry_t *gmep; 1331 1332 while ((gmep = avl_destroy_nodes(ca, &cookie)) != NULL) { 1333 dsl_dataset_long_rele(gmep->gme_ds, gmep); 1334 dsl_dataset_rele(gmep->gme_ds, gmep); 1335 kmem_free(gmep, sizeof (guid_map_entry_t)); 1336 } 1337 avl_destroy(ca); 1338 kmem_free(ca, sizeof (avl_tree_t)); 1339 } 1340 1341 static int 1342 restore_read(struct restorearg *ra, int len, void *buf) 1343 { 1344 int done = 0; 1345 1346 /* some things will require 8-byte alignment, so everything must */ 1347 ASSERT0(len % 8); 1348 ASSERT3U(len, <=, ra->bufsize); 1349 1350 while (done < len) { 1351 ssize_t resid; 1352 1353 ra->err = vn_rdwr(UIO_READ, ra->vp, 1354 (char *)buf + done, len - done, 1355 ra->voff, UIO_SYSSPACE, FAPPEND, 1356 RLIM64_INFINITY, CRED(), &resid); 1357 1358 if (resid == len - done) 1359 ra->err = SET_ERROR(EINVAL); 1360 ra->voff += len - done - resid; 1361 done = len - resid; 1362 if (ra->err != 0) 1363 return (ra->err); 1364 } 1365 1366 ASSERT3U(done, 
==, len); 1367 return (0); 1368 } 1369 1370 static void 1371 byteswap_record(dmu_replay_record_t *drr) 1372 { 1373 #define DO64(X) (drr->drr_u.X = BSWAP_64(drr->drr_u.X)) 1374 #define DO32(X) (drr->drr_u.X = BSWAP_32(drr->drr_u.X)) 1375 drr->drr_type = BSWAP_32(drr->drr_type); 1376 drr->drr_payloadlen = BSWAP_32(drr->drr_payloadlen); 1377 1378 switch (drr->drr_type) { 1379 case DRR_BEGIN: 1380 DO64(drr_begin.drr_magic); 1381 DO64(drr_begin.drr_versioninfo); 1382 DO64(drr_begin.drr_creation_time); 1383 DO32(drr_begin.drr_type); 1384 DO32(drr_begin.drr_flags); 1385 DO64(drr_begin.drr_toguid); 1386 DO64(drr_begin.drr_fromguid); 1387 break; 1388 case DRR_OBJECT: 1389 DO64(drr_object.drr_object); 1390 DO32(drr_object.drr_type); 1391 DO32(drr_object.drr_bonustype); 1392 DO32(drr_object.drr_blksz); 1393 DO32(drr_object.drr_bonuslen); 1394 DO64(drr_object.drr_toguid); 1395 break; 1396 case DRR_FREEOBJECTS: 1397 DO64(drr_freeobjects.drr_firstobj); 1398 DO64(drr_freeobjects.drr_numobjs); 1399 DO64(drr_freeobjects.drr_toguid); 1400 break; 1401 case DRR_WRITE: 1402 DO64(drr_write.drr_object); 1403 DO32(drr_write.drr_type); 1404 DO64(drr_write.drr_offset); 1405 DO64(drr_write.drr_length); 1406 DO64(drr_write.drr_toguid); 1407 ZIO_CHECKSUM_BSWAP(&drr->drr_u.drr_write.drr_key.ddk_cksum); 1408 DO64(drr_write.drr_key.ddk_prop); 1409 break; 1410 case DRR_WRITE_BYREF: 1411 DO64(drr_write_byref.drr_object); 1412 DO64(drr_write_byref.drr_offset); 1413 DO64(drr_write_byref.drr_length); 1414 DO64(drr_write_byref.drr_toguid); 1415 DO64(drr_write_byref.drr_refguid); 1416 DO64(drr_write_byref.drr_refobject); 1417 DO64(drr_write_byref.drr_refoffset); 1418 ZIO_CHECKSUM_BSWAP(&drr->drr_u.drr_write_byref. 1419 drr_key.ddk_cksum); 1420 DO64(drr_write_byref.drr_key.ddk_prop); 1421 break; 1422 case DRR_WRITE_EMBEDDED: 1423 DO64(drr_write_embedded.drr_object); 1424 DO64(drr_write_embedded.drr_offset); 1425 DO64(drr_write_embedded.drr_length); 1426 DO64(drr_write_embedded.drr_toguid); 1427 DO32(drr_write_embedded.drr_lsize); 1428 DO32(drr_write_embedded.drr_psize); 1429 break; 1430 case DRR_FREE: 1431 DO64(drr_free.drr_object); 1432 DO64(drr_free.drr_offset); 1433 DO64(drr_free.drr_length); 1434 DO64(drr_free.drr_toguid); 1435 break; 1436 case DRR_SPILL: 1437 DO64(drr_spill.drr_object); 1438 DO64(drr_spill.drr_length); 1439 DO64(drr_spill.drr_toguid); 1440 break; 1441 case DRR_END: 1442 DO64(drr_end.drr_toguid); 1443 ZIO_CHECKSUM_BSWAP(&drr->drr_u.drr_end.drr_checksum); 1444 break; 1445 } 1446 1447 if (drr->drr_type != DRR_BEGIN) { 1448 ZIO_CHECKSUM_BSWAP(&drr->drr_u.drr_checksum.drr_checksum); 1449 } 1450 1451 #undef DO64 1452 #undef DO32 1453 } 1454 1455 static inline uint8_t 1456 deduce_nblkptr(dmu_object_type_t bonus_type, uint64_t bonus_size) 1457 { 1458 if (bonus_type == DMU_OT_SA) { 1459 return (1); 1460 } else { 1461 return (1 + 1462 ((DN_MAX_BONUSLEN - bonus_size) >> SPA_BLKPTRSHIFT)); 1463 } 1464 } 1465 1466 static int 1467 restore_object(struct restorearg *ra, struct drr_object *drro, void *data) 1468 { 1469 dmu_object_info_t doi; 1470 dmu_tx_t *tx; 1471 uint64_t object; 1472 int err; 1473 1474 if (drro->drr_type == DMU_OT_NONE || 1475 !DMU_OT_IS_VALID(drro->drr_type) || 1476 !DMU_OT_IS_VALID(drro->drr_bonustype) || 1477 drro->drr_checksumtype >= ZIO_CHECKSUM_FUNCTIONS || 1478 drro->drr_compress >= ZIO_COMPRESS_FUNCTIONS || 1479 P2PHASE(drro->drr_blksz, SPA_MINBLOCKSIZE) || 1480 drro->drr_blksz < SPA_MINBLOCKSIZE || 1481 drro->drr_blksz > spa_maxblocksize(dmu_objset_spa(ra->os)) || 1482 drro->drr_bonuslen > 
DN_MAX_BONUSLEN) { 1483 return (SET_ERROR(EINVAL)); 1484 } 1485 1486 err = dmu_object_info(ra->os, drro->drr_object, &doi); 1487 1488 if (err != 0 && err != ENOENT) 1489 return (SET_ERROR(EINVAL)); 1490 object = err == 0 ? drro->drr_object : DMU_NEW_OBJECT; 1491 1492 /* 1493 * If we are losing blkptrs or changing the block size this must 1494 * be a new file instance. We must clear out the previous file 1495 * contents before we can change this type of metadata in the dnode. 1496 */ 1497 if (err == 0) { 1498 int nblkptr; 1499 1500 nblkptr = deduce_nblkptr(drro->drr_bonustype, 1501 drro->drr_bonuslen); 1502 1503 if (drro->drr_blksz != doi.doi_data_block_size || 1504 nblkptr < doi.doi_nblkptr) { 1505 err = dmu_free_long_range(ra->os, drro->drr_object, 1506 0, DMU_OBJECT_END); 1507 if (err != 0) 1508 return (SET_ERROR(EINVAL)); 1509 } 1510 } 1511 1512 tx = dmu_tx_create(ra->os); 1513 dmu_tx_hold_bonus(tx, object); 1514 err = dmu_tx_assign(tx, TXG_WAIT); 1515 if (err != 0) { 1516 dmu_tx_abort(tx); 1517 return (err); 1518 } 1519 1520 if (object == DMU_NEW_OBJECT) { 1521 /* currently free, want to be allocated */ 1522 err = dmu_object_claim(ra->os, drro->drr_object, 1523 drro->drr_type, drro->drr_blksz, 1524 drro->drr_bonustype, drro->drr_bonuslen, tx); 1525 } else if (drro->drr_type != doi.doi_type || 1526 drro->drr_blksz != doi.doi_data_block_size || 1527 drro->drr_bonustype != doi.doi_bonus_type || 1528 drro->drr_bonuslen != doi.doi_bonus_size) { 1529 /* currently allocated, but with different properties */ 1530 err = dmu_object_reclaim(ra->os, drro->drr_object, 1531 drro->drr_type, drro->drr_blksz, 1532 drro->drr_bonustype, drro->drr_bonuslen, tx); 1533 } 1534 if (err != 0) { 1535 dmu_tx_commit(tx); 1536 return (SET_ERROR(EINVAL)); 1537 } 1538 1539 dmu_object_set_checksum(ra->os, drro->drr_object, 1540 drro->drr_checksumtype, tx); 1541 dmu_object_set_compress(ra->os, drro->drr_object, 1542 drro->drr_compress, tx); 1543 1544 if (data != NULL) { 1545 dmu_buf_t *db; 1546 1547 VERIFY0(dmu_bonus_hold(ra->os, drro->drr_object, FTAG, &db)); 1548 dmu_buf_will_dirty(db, tx); 1549 1550 ASSERT3U(db->db_size, >=, drro->drr_bonuslen); 1551 bcopy(data, db->db_data, drro->drr_bonuslen); 1552 if (ra->byteswap) { 1553 dmu_object_byteswap_t byteswap = 1554 DMU_OT_BYTESWAP(drro->drr_bonustype); 1555 dmu_ot_byteswap[byteswap].ob_func(db->db_data, 1556 drro->drr_bonuslen); 1557 } 1558 dmu_buf_rele(db, FTAG); 1559 } 1560 dmu_tx_commit(tx); 1561 return (0); 1562 } 1563 1564 /* ARGSUSED */ 1565 static int 1566 restore_freeobjects(struct restorearg *ra, 1567 struct drr_freeobjects *drrfo) 1568 { 1569 uint64_t obj; 1570 1571 if (drrfo->drr_firstobj + drrfo->drr_numobjs < drrfo->drr_firstobj) 1572 return (SET_ERROR(EINVAL)); 1573 1574 for (obj = drrfo->drr_firstobj; 1575 obj < drrfo->drr_firstobj + drrfo->drr_numobjs; 1576 (void) dmu_object_next(ra->os, &obj, FALSE, 0)) { 1577 int err; 1578 1579 if (dmu_object_info(ra->os, obj, NULL) != 0) 1580 continue; 1581 1582 err = dmu_free_long_object(ra->os, obj); 1583 if (err != 0) 1584 return (err); 1585 } 1586 return (0); 1587 } 1588 1589 static int 1590 restore_write(struct restorearg *ra, struct drr_write *drrw, arc_buf_t *abuf) 1591 { 1592 dmu_tx_t *tx; 1593 int err; 1594 1595 if (drrw->drr_offset + drrw->drr_length < drrw->drr_offset || 1596 !DMU_OT_IS_VALID(drrw->drr_type)) 1597 return (SET_ERROR(EINVAL)); 1598 1599 if (dmu_object_info(ra->os, drrw->drr_object, NULL) != 0) 1600 return (SET_ERROR(EINVAL)); 1601 1602 tx = dmu_tx_create(ra->os); 1603 1604 
dmu_tx_hold_write(tx, drrw->drr_object, 1605 drrw->drr_offset, drrw->drr_length); 1606 err = dmu_tx_assign(tx, TXG_WAIT); 1607 if (err != 0) { 1608 dmu_tx_abort(tx); 1609 return (err); 1610 } 1611 if (ra->byteswap) { 1612 dmu_object_byteswap_t byteswap = 1613 DMU_OT_BYTESWAP(drrw->drr_type); 1614 dmu_ot_byteswap[byteswap].ob_func(abuf->b_data, 1615 drrw->drr_length); 1616 } 1617 1618 dmu_buf_t *bonus; 1619 if (dmu_bonus_hold(ra->os, drrw->drr_object, FTAG, &bonus) != 0) 1620 return (SET_ERROR(EINVAL)); 1621 dmu_assign_arcbuf(bonus, drrw->drr_offset, abuf, tx); 1622 dmu_tx_commit(tx); 1623 dmu_buf_rele(bonus, FTAG); 1624 return (0); 1625 } 1626 1627 /* 1628 * Handle a DRR_WRITE_BYREF record. This record is used in dedup'ed 1629 * streams to refer to a copy of the data that is already on the 1630 * system because it came in earlier in the stream. This function 1631 * finds the earlier copy of the data, and uses that copy instead of 1632 * data from the stream to fulfill this write. 1633 */ 1634 static int 1635 restore_write_byref(struct restorearg *ra, struct drr_write_byref *drrwbr) 1636 { 1637 dmu_tx_t *tx; 1638 int err; 1639 guid_map_entry_t gmesrch; 1640 guid_map_entry_t *gmep; 1641 avl_index_t where; 1642 objset_t *ref_os = NULL; 1643 dmu_buf_t *dbp; 1644 1645 if (drrwbr->drr_offset + drrwbr->drr_length < drrwbr->drr_offset) 1646 return (SET_ERROR(EINVAL)); 1647 1648 /* 1649 * If the GUID of the referenced dataset is different from the 1650 * GUID of the target dataset, find the referenced dataset. 1651 */ 1652 if (drrwbr->drr_toguid != drrwbr->drr_refguid) { 1653 gmesrch.guid = drrwbr->drr_refguid; 1654 if ((gmep = avl_find(ra->guid_to_ds_map, &gmesrch, 1655 &where)) == NULL) { 1656 return (SET_ERROR(EINVAL)); 1657 } 1658 if (dmu_objset_from_ds(gmep->gme_ds, &ref_os)) 1659 return (SET_ERROR(EINVAL)); 1660 } else { 1661 ref_os = ra->os; 1662 } 1663 1664 err = dmu_buf_hold(ref_os, drrwbr->drr_refobject, 1665 drrwbr->drr_refoffset, FTAG, &dbp, DMU_READ_PREFETCH); 1666 if (err != 0) 1667 return (err); 1668 1669 tx = dmu_tx_create(ra->os); 1670 1671 dmu_tx_hold_write(tx, drrwbr->drr_object, 1672 drrwbr->drr_offset, drrwbr->drr_length); 1673 err = dmu_tx_assign(tx, TXG_WAIT); 1674 if (err != 0) { 1675 dmu_tx_abort(tx); 1676 return (err); 1677 } 1678 dmu_write(ra->os, drrwbr->drr_object, 1679 drrwbr->drr_offset, drrwbr->drr_length, dbp->db_data, tx); 1680 dmu_buf_rele(dbp, FTAG); 1681 dmu_tx_commit(tx); 1682 return (0); 1683 } 1684 1685 static int 1686 restore_write_embedded(struct restorearg *ra, 1687 struct drr_write_embedded *drrwnp, void *data) 1688 { 1689 dmu_tx_t *tx; 1690 int err; 1691 1692 if (drrwnp->drr_offset + drrwnp->drr_length < drrwnp->drr_offset) 1693 return (EINVAL); 1694 1695 if (drrwnp->drr_psize > BPE_PAYLOAD_SIZE) 1696 return (EINVAL); 1697 1698 if (drrwnp->drr_etype >= NUM_BP_EMBEDDED_TYPES) 1699 return (EINVAL); 1700 if (drrwnp->drr_compression >= ZIO_COMPRESS_FUNCTIONS) 1701 return (EINVAL); 1702 1703 tx = dmu_tx_create(ra->os); 1704 1705 dmu_tx_hold_write(tx, drrwnp->drr_object, 1706 drrwnp->drr_offset, drrwnp->drr_length); 1707 err = dmu_tx_assign(tx, TXG_WAIT); 1708 if (err != 0) { 1709 dmu_tx_abort(tx); 1710 return (err); 1711 } 1712 1713 dmu_write_embedded(ra->os, drrwnp->drr_object, 1714 drrwnp->drr_offset, data, drrwnp->drr_etype, 1715 drrwnp->drr_compression, drrwnp->drr_lsize, drrwnp->drr_psize, 1716 ra->byteswap ^ ZFS_HOST_BYTEORDER, tx); 1717 1718 dmu_tx_commit(tx); 1719 return (0); 1720 } 1721 1722 static int 1723 restore_spill(struct restorearg *ra, struct 
drr_spill *drrs, void *data) 1724 { 1725 dmu_tx_t *tx; 1726 dmu_buf_t *db, *db_spill; 1727 int err; 1728 1729 if (drrs->drr_length < SPA_MINBLOCKSIZE || 1730 drrs->drr_length > spa_maxblocksize(dmu_objset_spa(ra->os))) 1731 return (SET_ERROR(EINVAL)); 1732 1733 if (dmu_object_info(ra->os, drrs->drr_object, NULL) != 0) 1734 return (SET_ERROR(EINVAL)); 1735 1736 VERIFY0(dmu_bonus_hold(ra->os, drrs->drr_object, FTAG, &db)); 1737 if ((err = dmu_spill_hold_by_bonus(db, FTAG, &db_spill)) != 0) { 1738 dmu_buf_rele(db, FTAG); 1739 return (err); 1740 } 1741 1742 tx = dmu_tx_create(ra->os); 1743 1744 dmu_tx_hold_spill(tx, db->db_object); 1745 1746 err = dmu_tx_assign(tx, TXG_WAIT); 1747 if (err != 0) { 1748 dmu_buf_rele(db, FTAG); 1749 dmu_buf_rele(db_spill, FTAG); 1750 dmu_tx_abort(tx); 1751 return (err); 1752 } 1753 dmu_buf_will_dirty(db_spill, tx); 1754 1755 if (db_spill->db_size < drrs->drr_length) 1756 VERIFY(0 == dbuf_spill_set_blksz(db_spill, 1757 drrs->drr_length, tx)); 1758 bcopy(data, db_spill->db_data, drrs->drr_length); 1759 1760 dmu_buf_rele(db, FTAG); 1761 dmu_buf_rele(db_spill, FTAG); 1762 1763 dmu_tx_commit(tx); 1764 return (0); 1765 } 1766 1767 /* ARGSUSED */ 1768 static int 1769 restore_free(struct restorearg *ra, struct drr_free *drrf) 1770 { 1771 int err; 1772 1773 if (drrf->drr_length != -1ULL && 1774 drrf->drr_offset + drrf->drr_length < drrf->drr_offset) 1775 return (SET_ERROR(EINVAL)); 1776 1777 if (dmu_object_info(ra->os, drrf->drr_object, NULL) != 0) 1778 return (SET_ERROR(EINVAL)); 1779 1780 err = dmu_free_long_range(ra->os, drrf->drr_object, 1781 drrf->drr_offset, drrf->drr_length); 1782 return (err); 1783 } 1784 1785 /* used to destroy the drc_ds on error */ 1786 static void 1787 dmu_recv_cleanup_ds(dmu_recv_cookie_t *drc) 1788 { 1789 char name[MAXNAMELEN]; 1790 dsl_dataset_name(drc->drc_ds, name); 1791 dsl_dataset_disown(drc->drc_ds, dmu_recv_tag); 1792 (void) dsl_destroy_head(name); 1793 } 1794 1795 static void 1796 restore_cksum(struct restorearg *ra, int len, void *buf) 1797 { 1798 if (ra->byteswap) { 1799 fletcher_4_incremental_byteswap(buf, len, &ra->cksum); 1800 } else { 1801 fletcher_4_incremental_native(buf, len, &ra->cksum); 1802 } 1803 } 1804 1805 /* 1806 * If len != 0, read payload into buf. 1807 * Read next record's header into ra->next_drr. 1808 * Verify checksum of payload and next record. 1809 */ 1810 static int 1811 restore_read_payload_and_next_header(struct restorearg *ra, int len, void *buf) 1812 { 1813 int err; 1814 1815 if (len != 0) { 1816 ASSERT3U(len, <=, ra->bufsize); 1817 err = restore_read(ra, len, buf); 1818 if (err != 0) 1819 return (err); 1820 restore_cksum(ra, len, buf); 1821 } 1822 1823 ra->prev_cksum = ra->cksum; 1824 1825 err = restore_read(ra, sizeof (*ra->next_drr), ra->next_drr); 1826 if (err != 0) 1827 return (err); 1828 if (ra->next_drr->drr_type == DRR_BEGIN) 1829 return (SET_ERROR(EINVAL)); 1830 1831 /* 1832 * Note: checksum is of everything up to but not including the 1833 * checksum itself. 
1834 */ 1835 ASSERT3U(offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum), 1836 ==, sizeof (dmu_replay_record_t) - sizeof (zio_cksum_t)); 1837 restore_cksum(ra, 1838 offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum), 1839 ra->next_drr); 1840 1841 zio_cksum_t cksum_orig = ra->next_drr->drr_u.drr_checksum.drr_checksum; 1842 zio_cksum_t *cksump = &ra->next_drr->drr_u.drr_checksum.drr_checksum; 1843 1844 if (ra->byteswap) 1845 byteswap_record(ra->next_drr); 1846 1847 if ((!ZIO_CHECKSUM_IS_ZERO(cksump)) && 1848 !ZIO_CHECKSUM_EQUAL(ra->cksum, *cksump)) 1849 return (SET_ERROR(ECKSUM)); 1850 1851 restore_cksum(ra, sizeof (cksum_orig), &cksum_orig); 1852 1853 return (0); 1854 } 1855 1856 static int 1857 restore_process_record(struct restorearg *ra) 1858 { 1859 int err; 1860 1861 switch (ra->drr->drr_type) { 1862 case DRR_OBJECT: 1863 { 1864 struct drr_object *drro = &ra->drr->drr_u.drr_object; 1865 err = restore_read_payload_and_next_header(ra, 1866 P2ROUNDUP(drro->drr_bonuslen, 8), ra->buf); 1867 if (err != 0) 1868 return (err); 1869 return (restore_object(ra, drro, ra->buf)); 1870 } 1871 case DRR_FREEOBJECTS: 1872 { 1873 struct drr_freeobjects *drrfo = 1874 &ra->drr->drr_u.drr_freeobjects; 1875 err = restore_read_payload_and_next_header(ra, 0, NULL); 1876 if (err != 0) 1877 return (err); 1878 return (restore_freeobjects(ra, drrfo)); 1879 } 1880 case DRR_WRITE: 1881 { 1882 struct drr_write *drrw = &ra->drr->drr_u.drr_write; 1883 arc_buf_t *abuf = arc_loan_buf(dmu_objset_spa(ra->os), 1884 drrw->drr_length); 1885 1886 err = restore_read_payload_and_next_header(ra, 1887 drrw->drr_length, abuf->b_data); 1888 if (err != 0) 1889 return (err); 1890 err = restore_write(ra, drrw, abuf); 1891 /* if restore_write() is successful, it consumes the arc_buf */ 1892 if (err != 0) 1893 dmu_return_arcbuf(abuf); 1894 return (err); 1895 } 1896 case DRR_WRITE_BYREF: 1897 { 1898 struct drr_write_byref *drrwbr = 1899 &ra->drr->drr_u.drr_write_byref; 1900 err = restore_read_payload_and_next_header(ra, 0, NULL); 1901 if (err != 0) 1902 return (err); 1903 return (restore_write_byref(ra, drrwbr)); 1904 } 1905 case DRR_WRITE_EMBEDDED: 1906 { 1907 struct drr_write_embedded *drrwe = 1908 &ra->drr->drr_u.drr_write_embedded; 1909 err = restore_read_payload_and_next_header(ra, 1910 P2ROUNDUP(drrwe->drr_psize, 8), ra->buf); 1911 if (err != 0) 1912 return (err); 1913 return (restore_write_embedded(ra, drrwe, ra->buf)); 1914 } 1915 case DRR_FREE: 1916 { 1917 struct drr_free *drrf = &ra->drr->drr_u.drr_free; 1918 err = restore_read_payload_and_next_header(ra, 0, NULL); 1919 if (err != 0) 1920 return (err); 1921 return (restore_free(ra, drrf)); 1922 } 1923 case DRR_END: 1924 { 1925 struct drr_end *drre = &ra->drr->drr_u.drr_end; 1926 if (!ZIO_CHECKSUM_EQUAL(ra->prev_cksum, drre->drr_checksum)) 1927 return (SET_ERROR(EINVAL)); 1928 return (0); 1929 } 1930 case DRR_SPILL: 1931 { 1932 struct drr_spill *drrs = &ra->drr->drr_u.drr_spill; 1933 err = restore_read_payload_and_next_header(ra, 1934 drrs->drr_length, ra->buf); 1935 if (err != 0) 1936 return (err); 1937 return (restore_spill(ra, drrs, ra->buf)); 1938 } 1939 default: 1940 return (SET_ERROR(EINVAL)); 1941 } 1942 } 1943 1944 /* 1945 * NB: callers *must* call dmu_recv_end() if this succeeds. 
static int
restore_process_record(struct restorearg *ra)
{
	int err;

	switch (ra->drr->drr_type) {
	case DRR_OBJECT:
	{
		struct drr_object *drro = &ra->drr->drr_u.drr_object;
		err = restore_read_payload_and_next_header(ra,
		    P2ROUNDUP(drro->drr_bonuslen, 8), ra->buf);
		if (err != 0)
			return (err);
		return (restore_object(ra, drro, ra->buf));
	}
	case DRR_FREEOBJECTS:
	{
		struct drr_freeobjects *drrfo =
		    &ra->drr->drr_u.drr_freeobjects;
		err = restore_read_payload_and_next_header(ra, 0, NULL);
		if (err != 0)
			return (err);
		return (restore_freeobjects(ra, drrfo));
	}
	case DRR_WRITE:
	{
		struct drr_write *drrw = &ra->drr->drr_u.drr_write;
		arc_buf_t *abuf = arc_loan_buf(dmu_objset_spa(ra->os),
		    drrw->drr_length);

		err = restore_read_payload_and_next_header(ra,
		    drrw->drr_length, abuf->b_data);
		if (err != 0) {
			/* return the loaned arc buf rather than leaking it */
			dmu_return_arcbuf(abuf);
			return (err);
		}
		err = restore_write(ra, drrw, abuf);
		/* if restore_write() is successful, it consumes the arc_buf */
		if (err != 0)
			dmu_return_arcbuf(abuf);
		return (err);
	}
	case DRR_WRITE_BYREF:
	{
		struct drr_write_byref *drrwbr =
		    &ra->drr->drr_u.drr_write_byref;
		err = restore_read_payload_and_next_header(ra, 0, NULL);
		if (err != 0)
			return (err);
		return (restore_write_byref(ra, drrwbr));
	}
	case DRR_WRITE_EMBEDDED:
	{
		struct drr_write_embedded *drrwe =
		    &ra->drr->drr_u.drr_write_embedded;
		err = restore_read_payload_and_next_header(ra,
		    P2ROUNDUP(drrwe->drr_psize, 8), ra->buf);
		if (err != 0)
			return (err);
		return (restore_write_embedded(ra, drrwe, ra->buf));
	}
	case DRR_FREE:
	{
		struct drr_free *drrf = &ra->drr->drr_u.drr_free;
		err = restore_read_payload_and_next_header(ra, 0, NULL);
		if (err != 0)
			return (err);
		return (restore_free(ra, drrf));
	}
	case DRR_END:
	{
		struct drr_end *drre = &ra->drr->drr_u.drr_end;
		if (!ZIO_CHECKSUM_EQUAL(ra->prev_cksum, drre->drr_checksum))
			return (SET_ERROR(EINVAL));
		return (0);
	}
	case DRR_SPILL:
	{
		struct drr_spill *drrs = &ra->drr->drr_u.drr_spill;
		err = restore_read_payload_and_next_header(ra,
		    drrs->drr_length, ra->buf);
		if (err != 0)
			return (err);
		return (restore_spill(ra, drrs, ra->buf));
	}
	default:
		return (SET_ERROR(EINVAL));
	}
}

/*
 * NB: callers *must* call dmu_recv_end() if this succeeds.
 */
int
dmu_recv_stream(dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp,
    int cleanup_fd, uint64_t *action_handlep)
{
	int err = 0;
	struct restorearg ra = { 0 };
	int featureflags;

	ra.byteswap = drc->drc_byteswap;
	ra.cksum = drc->drc_cksum;
	ra.vp = vp;
	ra.voff = *voffp;
	ra.bufsize = SPA_MAXBLOCKSIZE;
	ra.drr = kmem_alloc(sizeof (*ra.drr), KM_SLEEP);
	ra.buf = kmem_alloc(ra.bufsize, KM_SLEEP);
	ra.next_drr = kmem_alloc(sizeof (*ra.next_drr), KM_SLEEP);

	/* these were verified in dmu_recv_begin */
	ASSERT3U(DMU_GET_STREAM_HDRTYPE(drc->drc_drrb->drr_versioninfo), ==,
	    DMU_SUBSTREAM);
	ASSERT3U(drc->drc_drrb->drr_type, <, DMU_OST_NUMTYPES);

	/*
	 * Open the objset we are modifying.
	 */
	VERIFY0(dmu_objset_from_ds(drc->drc_ds, &ra.os));

	ASSERT(dsl_dataset_phys(drc->drc_ds)->ds_flags & DS_FLAG_INCONSISTENT);

	featureflags = DMU_GET_FEATUREFLAGS(drc->drc_drrb->drr_versioninfo);

	/* if this stream is dedup'ed, set up the avl tree for guid mapping */
	if (featureflags & DMU_BACKUP_FEATURE_DEDUP) {
		minor_t minor;

		if (cleanup_fd == -1) {
			err = SET_ERROR(EBADF);
			goto out;
		}
		err = zfs_onexit_fd_hold(cleanup_fd, &minor);
		if (err != 0) {
			cleanup_fd = -1;
			goto out;
		}

		if (*action_handlep == 0) {
			ra.guid_to_ds_map =
			    kmem_alloc(sizeof (avl_tree_t), KM_SLEEP);
			avl_create(ra.guid_to_ds_map, guid_compare,
			    sizeof (guid_map_entry_t),
			    offsetof(guid_map_entry_t, avlnode));
			err = zfs_onexit_add_cb(minor,
			    free_guid_map_onexit, ra.guid_to_ds_map,
			    action_handlep);
			if (err != 0)
				goto out;
		} else {
			err = zfs_onexit_cb_data(minor, *action_handlep,
			    (void **)&ra.guid_to_ds_map);
			if (err != 0)
				goto out;
		}

		drc->drc_guid_to_ds_map = ra.guid_to_ds_map;
	}

	err = restore_read_payload_and_next_header(&ra, 0, NULL);
	if (err != 0)
		goto out;
	for (;;) {
		void *tmp;

		if (issig(JUSTLOOKING) && issig(FORREAL)) {
			err = SET_ERROR(EINTR);
			break;
		}

		tmp = ra.next_drr;
		ra.next_drr = ra.drr;
		ra.drr = tmp;

		/* process ra.drr, read in ra.next_drr */
		err = restore_process_record(&ra);
		if (err != 0)
			break;
		if (ra.drr->drr_type == DRR_END)
			break;
	}

out:
	if ((featureflags & DMU_BACKUP_FEATURE_DEDUP) && (cleanup_fd != -1))
		zfs_onexit_fd_rele(cleanup_fd);

	if (err != 0) {
		/*
		 * destroy what we created, so we don't leave it in the
		 * inconsistent restoring state.
		 */
		dmu_recv_cleanup_ds(drc);
	}

	kmem_free(ra.drr, sizeof (*ra.drr));
	kmem_free(ra.buf, ra.bufsize);
	kmem_free(ra.next_drr, sizeof (*ra.next_drr));
	*voffp = ra.voff;
	return (err);
}
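/*
 * dmu_recv_end_check() and dmu_recv_end_sync() are the check and sync halves
 * of the sync task that finishes a receive (see dmu_recv_existing_end() and
 * dmu_recv_new_end() below).  When receiving into an existing filesystem, we
 * optionally (drc_force) destroy any snapshots of the target that postdate
 * the origin, swap the received clone with the existing head, and snapshot
 * the result; when receiving into a new filesystem we simply snapshot the
 * received dataset.  In both cases the new snapshot takes its creation time
 * and guid from the stream's BEGIN record and has DS_FLAG_INCONSISTENT
 * cleared.
 */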
static int
dmu_recv_end_check(void *arg, dmu_tx_t *tx)
{
	dmu_recv_cookie_t *drc = arg;
	dsl_pool_t *dp = dmu_tx_pool(tx);
	int error;

	ASSERT3P(drc->drc_ds->ds_owner, ==, dmu_recv_tag);

	if (!drc->drc_newfs) {
		dsl_dataset_t *origin_head;

		error = dsl_dataset_hold(dp, drc->drc_tofs, FTAG, &origin_head);
		if (error != 0)
			return (error);
		if (drc->drc_force) {
			/*
			 * We will destroy any snapshots in tofs (i.e. before
			 * origin_head) that are after the origin (which is
			 * the snap before drc_ds, because drc_ds can not
			 * have any snaps of its own).
			 */
			uint64_t obj;

			obj = dsl_dataset_phys(origin_head)->ds_prev_snap_obj;
			while (obj !=
			    dsl_dataset_phys(drc->drc_ds)->ds_prev_snap_obj) {
				dsl_dataset_t *snap;
				error = dsl_dataset_hold_obj(dp, obj, FTAG,
				    &snap);
				if (error != 0)
					return (error);
				if (snap->ds_dir != origin_head->ds_dir)
					error = SET_ERROR(EINVAL);
				if (error == 0) {
					error = dsl_destroy_snapshot_check_impl(
					    snap, B_FALSE);
				}
				obj = dsl_dataset_phys(snap)->ds_prev_snap_obj;
				dsl_dataset_rele(snap, FTAG);
				if (error != 0)
					return (error);
			}
		}
		error = dsl_dataset_clone_swap_check_impl(drc->drc_ds,
		    origin_head, drc->drc_force, drc->drc_owner, tx);
		if (error != 0) {
			dsl_dataset_rele(origin_head, FTAG);
			return (error);
		}
		error = dsl_dataset_snapshot_check_impl(origin_head,
		    drc->drc_tosnap, tx, B_TRUE, 1, drc->drc_cred);
		dsl_dataset_rele(origin_head, FTAG);
		if (error != 0)
			return (error);

		error = dsl_destroy_head_check_impl(drc->drc_ds, 1);
	} else {
		error = dsl_dataset_snapshot_check_impl(drc->drc_ds,
		    drc->drc_tosnap, tx, B_TRUE, 1, drc->drc_cred);
	}
	return (error);
}
static void
dmu_recv_end_sync(void *arg, dmu_tx_t *tx)
{
	dmu_recv_cookie_t *drc = arg;
	dsl_pool_t *dp = dmu_tx_pool(tx);

	spa_history_log_internal_ds(drc->drc_ds, "finish receiving",
	    tx, "snap=%s", drc->drc_tosnap);

	if (!drc->drc_newfs) {
		dsl_dataset_t *origin_head;

		VERIFY0(dsl_dataset_hold(dp, drc->drc_tofs, FTAG,
		    &origin_head));

		if (drc->drc_force) {
			/*
			 * Destroy any snapshots of drc_tofs (origin_head)
			 * after the origin (the snap before drc_ds).
			 */
			uint64_t obj;

			obj = dsl_dataset_phys(origin_head)->ds_prev_snap_obj;
			while (obj !=
			    dsl_dataset_phys(drc->drc_ds)->ds_prev_snap_obj) {
				dsl_dataset_t *snap;
				VERIFY0(dsl_dataset_hold_obj(dp, obj, FTAG,
				    &snap));
				ASSERT3P(snap->ds_dir, ==, origin_head->ds_dir);
				obj = dsl_dataset_phys(snap)->ds_prev_snap_obj;
				dsl_destroy_snapshot_sync_impl(snap,
				    B_FALSE, tx);
				dsl_dataset_rele(snap, FTAG);
			}
		}
		VERIFY3P(drc->drc_ds->ds_prev, ==,
		    origin_head->ds_prev);

		dsl_dataset_clone_swap_sync_impl(drc->drc_ds,
		    origin_head, tx);
		dsl_dataset_snapshot_sync_impl(origin_head,
		    drc->drc_tosnap, tx);

		/* set snapshot's creation time and guid */
		dmu_buf_will_dirty(origin_head->ds_prev->ds_dbuf, tx);
		dsl_dataset_phys(origin_head->ds_prev)->ds_creation_time =
		    drc->drc_drrb->drr_creation_time;
		dsl_dataset_phys(origin_head->ds_prev)->ds_guid =
		    drc->drc_drrb->drr_toguid;
		dsl_dataset_phys(origin_head->ds_prev)->ds_flags &=
		    ~DS_FLAG_INCONSISTENT;

		dmu_buf_will_dirty(origin_head->ds_dbuf, tx);
		dsl_dataset_phys(origin_head)->ds_flags &=
		    ~DS_FLAG_INCONSISTENT;

		dsl_dataset_rele(origin_head, FTAG);
		dsl_destroy_head_sync_impl(drc->drc_ds, tx);

		if (drc->drc_owner != NULL)
			VERIFY3P(origin_head->ds_owner, ==, drc->drc_owner);
	} else {
		dsl_dataset_t *ds = drc->drc_ds;

		dsl_dataset_snapshot_sync_impl(ds, drc->drc_tosnap, tx);

		/* set snapshot's creation time and guid */
		dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
		dsl_dataset_phys(ds->ds_prev)->ds_creation_time =
		    drc->drc_drrb->drr_creation_time;
		dsl_dataset_phys(ds->ds_prev)->ds_guid =
		    drc->drc_drrb->drr_toguid;
		dsl_dataset_phys(ds->ds_prev)->ds_flags &=
		    ~DS_FLAG_INCONSISTENT;

		dmu_buf_will_dirty(ds->ds_dbuf, tx);
		dsl_dataset_phys(ds)->ds_flags &= ~DS_FLAG_INCONSISTENT;
	}
	drc->drc_newsnapobj = dsl_dataset_phys(drc->drc_ds)->ds_prev_snap_obj;
	/*
	 * Release the hold from dmu_recv_begin.  This must be done before
	 * we return to open context, so that when we free the dataset's dnode,
	 * we can evict its bonus buffer.
	 */
	dsl_dataset_disown(drc->drc_ds, dmu_recv_tag);
	drc->drc_ds = NULL;
}
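/*
 * Remember the guid of the snapshot we just received in the guid map that
 * dmu_recv_stream() sets up for dedup'ed streams, taking a long hold on the
 * snapshot for as long as the map entry exists.
 */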
static int
add_ds_to_guidmap(const char *name, avl_tree_t *guid_map, uint64_t snapobj)
{
	dsl_pool_t *dp;
	dsl_dataset_t *snapds;
	guid_map_entry_t *gmep;
	int err;

	ASSERT(guid_map != NULL);

	err = dsl_pool_hold(name, FTAG, &dp);
	if (err != 0)
		return (err);
	gmep = kmem_alloc(sizeof (*gmep), KM_SLEEP);
	err = dsl_dataset_hold_obj(dp, snapobj, gmep, &snapds);
	if (err == 0) {
		gmep->guid = dsl_dataset_phys(snapds)->ds_guid;
		gmep->gme_ds = snapds;
		avl_add(guid_map, gmep);
		dsl_dataset_long_hold(snapds, gmep);
	} else {
		kmem_free(gmep, sizeof (*gmep));
	}

	dsl_pool_rele(dp, FTAG);
	return (err);
}

static int dmu_recv_end_modified_blocks = 3;

static int
dmu_recv_existing_end(dmu_recv_cookie_t *drc)
{
	int error;
	char name[MAXNAMELEN];

#ifdef _KERNEL
	/*
	 * We will be destroying the ds; make sure its origin is unmounted if
	 * necessary.
	 */
	dsl_dataset_name(drc->drc_ds, name);
	zfs_destroy_unmount_origin(name);
#endif

	error = dsl_sync_task(drc->drc_tofs,
	    dmu_recv_end_check, dmu_recv_end_sync, drc,
	    dmu_recv_end_modified_blocks, ZFS_SPACE_CHECK_NORMAL);

	if (error != 0)
		dmu_recv_cleanup_ds(drc);
	return (error);
}

static int
dmu_recv_new_end(dmu_recv_cookie_t *drc)
{
	int error;

	error = dsl_sync_task(drc->drc_tofs,
	    dmu_recv_end_check, dmu_recv_end_sync, drc,
	    dmu_recv_end_modified_blocks, ZFS_SPACE_CHECK_NORMAL);

	if (error != 0) {
		dmu_recv_cleanup_ds(drc);
	} else if (drc->drc_guid_to_ds_map != NULL) {
		(void) add_ds_to_guidmap(drc->drc_tofs,
		    drc->drc_guid_to_ds_map,
		    drc->drc_newsnapobj);
	}
	return (error);
}

int
dmu_recv_end(dmu_recv_cookie_t *drc, void *owner)
{
	drc->drc_owner = owner;

	if (drc->drc_newfs)
		return (dmu_recv_new_end(drc));
	else
		return (dmu_recv_existing_end(drc));
}

/*
 * Return TRUE if this objset is currently being received into.
 */
boolean_t
dmu_objset_is_receiving(objset_t *os)
{
	return (os->os_dsl_dataset != NULL &&
	    os->os_dsl_dataset->ds_owner == dmu_recv_tag);
}
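
/*
 * Illustrative sketch only (not part of the original interface): the calling
 * sequence a consumer of dmu_recv_stream()/dmu_recv_end() is expected to
 * follow, assuming a cookie already set up by dmu_recv_begin().  The function
 * name and the way the arguments are obtained here are hypothetical.
 */
static int
example_recv_sequence(dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *offp,
    int cleanup_fd, uint64_t *action_handlep, void *owner)
{
	int err;

	/*
	 * Consume the stream.  If this fails, dmu_recv_stream() has already
	 * destroyed the partially received dataset via dmu_recv_cleanup_ds().
	 */
	err = dmu_recv_stream(drc, vp, offp, cleanup_fd, action_handlep);
	if (err != 0)
		return (err);

	/* dmu_recv_stream() succeeded, so dmu_recv_end() must be called. */
	return (dmu_recv_end(drc, owner));
}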