1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 23 * Copyright 2011 Nexenta Systems, Inc. All rights reserved. 24 * Copyright (c) 2011, 2014 by Delphix. All rights reserved. 25 * Copyright (c) 2014, Joyent, Inc. All rights reserved. 26 * Copyright 2014 HybridCluster. All rights reserved. 27 */ 28 29 #include <sys/dmu.h> 30 #include <sys/dmu_impl.h> 31 #include <sys/dmu_tx.h> 32 #include <sys/dbuf.h> 33 #include <sys/dnode.h> 34 #include <sys/zfs_context.h> 35 #include <sys/dmu_objset.h> 36 #include <sys/dmu_traverse.h> 37 #include <sys/dsl_dataset.h> 38 #include <sys/dsl_dir.h> 39 #include <sys/dsl_prop.h> 40 #include <sys/dsl_pool.h> 41 #include <sys/dsl_synctask.h> 42 #include <sys/zfs_ioctl.h> 43 #include <sys/zap.h> 44 #include <sys/zio_checksum.h> 45 #include <sys/zfs_znode.h> 46 #include <zfs_fletcher.h> 47 #include <sys/avl.h> 48 #include <sys/ddt.h> 49 #include <sys/zfs_onexit.h> 50 #include <sys/dmu_send.h> 51 #include <sys/dsl_destroy.h> 52 #include <sys/blkptr.h> 53 #include <sys/dsl_bookmark.h> 54 #include <sys/zfeature.h> 55 56 /* Set this tunable to TRUE to replace corrupt data with 0x2f5baddb10c */ 57 int zfs_send_corrupt_data = B_FALSE; 58 59 static char *dmu_recv_tag = "dmu_recv_tag"; 60 static const char *recv_clone_name = "%recv"; 61 62 static int 63 dump_bytes(dmu_sendarg_t *dsp, void *buf, int len) 64 { 65 dsl_dataset_t *ds = dsp->dsa_os->os_dsl_dataset; 66 ssize_t resid; /* have to get resid to get detailed errno */ 67 ASSERT0(len % 8); 68 69 dsp->dsa_err = vn_rdwr(UIO_WRITE, dsp->dsa_vp, 70 (caddr_t)buf, len, 71 0, UIO_SYSSPACE, FAPPEND, RLIM64_INFINITY, CRED(), &resid); 72 73 mutex_enter(&ds->ds_sendstream_lock); 74 *dsp->dsa_off += len; 75 mutex_exit(&ds->ds_sendstream_lock); 76 77 return (dsp->dsa_err); 78 } 79 80 /* 81 * For all record types except BEGIN, fill in the checksum (overlaid in 82 * drr_u.drr_checksum.drr_checksum). The checksum verifies everything 83 * up to the start of the checksum itself. 84 */ 85 static int 86 dump_record(dmu_sendarg_t *dsp, void *payload, int payload_len) 87 { 88 ASSERT3U(offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum), 89 ==, sizeof (dmu_replay_record_t) - sizeof (zio_cksum_t)); 90 fletcher_4_incremental_native(dsp->dsa_drr, 91 offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum), 92 &dsp->dsa_zc); 93 if (dsp->dsa_drr->drr_type != DRR_BEGIN) { 94 ASSERT(ZIO_CHECKSUM_IS_ZERO(&dsp->dsa_drr->drr_u. 
95 drr_checksum.drr_checksum)); 96 dsp->dsa_drr->drr_u.drr_checksum.drr_checksum = dsp->dsa_zc; 97 } 98 fletcher_4_incremental_native(&dsp->dsa_drr-> 99 drr_u.drr_checksum.drr_checksum, 100 sizeof (zio_cksum_t), &dsp->dsa_zc); 101 if (dump_bytes(dsp, dsp->dsa_drr, sizeof (dmu_replay_record_t)) != 0) 102 return (SET_ERROR(EINTR)); 103 if (payload_len != 0) { 104 fletcher_4_incremental_native(payload, payload_len, 105 &dsp->dsa_zc); 106 if (dump_bytes(dsp, payload, payload_len) != 0) 107 return (SET_ERROR(EINTR)); 108 } 109 return (0); 110 } 111 112 static int 113 dump_free(dmu_sendarg_t *dsp, uint64_t object, uint64_t offset, 114 uint64_t length) 115 { 116 struct drr_free *drrf = &(dsp->dsa_drr->drr_u.drr_free); 117 118 /* 119 * When we receive a free record, dbuf_free_range() assumes 120 * that the receiving system doesn't have any dbufs in the range 121 * being freed. This is always true because there is a one-record 122 * constraint: we only send one WRITE record for any given 123 * object+offset. We know that the one-record constraint is 124 * true because we always send data in increasing order by 125 * object,offset. 126 * 127 * If the increasing-order constraint ever changes, we should find 128 * another way to assert that the one-record constraint is still 129 * satisfied. 130 */ 131 ASSERT(object > dsp->dsa_last_data_object || 132 (object == dsp->dsa_last_data_object && 133 offset > dsp->dsa_last_data_offset)); 134 135 /* 136 * If we are doing a non-incremental send, then there can't 137 * be any data in the dataset we're receiving into. Therefore 138 * a free record would simply be a no-op. Save space by not 139 * sending it to begin with. 140 */ 141 if (!dsp->dsa_incremental) 142 return (0); 143 144 if (length != -1ULL && offset + length < offset) 145 length = -1ULL; 146 147 /* 148 * If there is a pending op, but it's not PENDING_FREE, push it out, 149 * since free block aggregation can only be done for blocks of the 150 * same type (i.e., DRR_FREE records can only be aggregated with 151 * other DRR_FREE records. DRR_FREEOBJECTS records can only be 152 * aggregated with other DRR_FREEOBJECTS records. 153 */ 154 if (dsp->dsa_pending_op != PENDING_NONE && 155 dsp->dsa_pending_op != PENDING_FREE) { 156 if (dump_record(dsp, NULL, 0) != 0) 157 return (SET_ERROR(EINTR)); 158 dsp->dsa_pending_op = PENDING_NONE; 159 } 160 161 if (dsp->dsa_pending_op == PENDING_FREE) { 162 /* 163 * There should never be a PENDING_FREE if length is -1 164 * (because dump_dnode is the only place where this 165 * function is called with a -1, and only after flushing 166 * any pending record). 167 */ 168 ASSERT(length != -1ULL); 169 /* 170 * Check to see whether this free block can be aggregated 171 * with pending one. 172 */ 173 if (drrf->drr_object == object && drrf->drr_offset + 174 drrf->drr_length == offset) { 175 drrf->drr_length += length; 176 return (0); 177 } else { 178 /* not a continuation. 
Push out pending record */
			if (dump_record(dsp, NULL, 0) != 0)
				return (SET_ERROR(EINTR));
			dsp->dsa_pending_op = PENDING_NONE;
		}
	}
	/* create a FREE record and make it pending */
	bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t));
	dsp->dsa_drr->drr_type = DRR_FREE;
	drrf->drr_object = object;
	drrf->drr_offset = offset;
	drrf->drr_length = length;
	drrf->drr_toguid = dsp->dsa_toguid;
	if (length == -1ULL) {
		if (dump_record(dsp, NULL, 0) != 0)
			return (SET_ERROR(EINTR));
	} else {
		dsp->dsa_pending_op = PENDING_FREE;
	}

	return (0);
}

static int
dump_write(dmu_sendarg_t *dsp, dmu_object_type_t type,
    uint64_t object, uint64_t offset, int blksz, const blkptr_t *bp, void *data)
{
	struct drr_write *drrw = &(dsp->dsa_drr->drr_u.drr_write);

	/*
	 * We send data in increasing object, offset order.
	 * See comment in dump_free() for details.
	 */
	ASSERT(object > dsp->dsa_last_data_object ||
	    (object == dsp->dsa_last_data_object &&
	    offset > dsp->dsa_last_data_offset));
	dsp->dsa_last_data_object = object;
	dsp->dsa_last_data_offset = offset + blksz - 1;

	/*
	 * If there is any kind of pending aggregation (currently either
	 * a grouping of free objects or free blocks), push it out to
	 * the stream, since aggregation can't be done across operations
	 * of different types.
	 */
	if (dsp->dsa_pending_op != PENDING_NONE) {
		if (dump_record(dsp, NULL, 0) != 0)
			return (SET_ERROR(EINTR));
		dsp->dsa_pending_op = PENDING_NONE;
	}
	/* write a WRITE record */
	bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t));
	dsp->dsa_drr->drr_type = DRR_WRITE;
	drrw->drr_object = object;
	drrw->drr_type = type;
	drrw->drr_offset = offset;
	drrw->drr_length = blksz;
	drrw->drr_toguid = dsp->dsa_toguid;
	if (bp == NULL || BP_IS_EMBEDDED(bp)) {
		/*
		 * There's no pre-computed checksum for partial-block
		 * writes or embedded BP's, so (like
		 * fletcher4-checksummed blocks) userland will have to
		 * compute a dedup-capable checksum itself.
242 */ 243 drrw->drr_checksumtype = ZIO_CHECKSUM_OFF; 244 } else { 245 drrw->drr_checksumtype = BP_GET_CHECKSUM(bp); 246 if (zio_checksum_table[drrw->drr_checksumtype].ci_dedup) 247 drrw->drr_checksumflags |= DRR_CHECKSUM_DEDUP; 248 DDK_SET_LSIZE(&drrw->drr_key, BP_GET_LSIZE(bp)); 249 DDK_SET_PSIZE(&drrw->drr_key, BP_GET_PSIZE(bp)); 250 DDK_SET_COMPRESS(&drrw->drr_key, BP_GET_COMPRESS(bp)); 251 drrw->drr_key.ddk_cksum = bp->blk_cksum; 252 } 253 254 if (dump_record(dsp, data, blksz) != 0) 255 return (SET_ERROR(EINTR)); 256 return (0); 257 } 258 259 static int 260 dump_write_embedded(dmu_sendarg_t *dsp, uint64_t object, uint64_t offset, 261 int blksz, const blkptr_t *bp) 262 { 263 char buf[BPE_PAYLOAD_SIZE]; 264 struct drr_write_embedded *drrw = 265 &(dsp->dsa_drr->drr_u.drr_write_embedded); 266 267 if (dsp->dsa_pending_op != PENDING_NONE) { 268 if (dump_record(dsp, NULL, 0) != 0) 269 return (EINTR); 270 dsp->dsa_pending_op = PENDING_NONE; 271 } 272 273 ASSERT(BP_IS_EMBEDDED(bp)); 274 275 bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t)); 276 dsp->dsa_drr->drr_type = DRR_WRITE_EMBEDDED; 277 drrw->drr_object = object; 278 drrw->drr_offset = offset; 279 drrw->drr_length = blksz; 280 drrw->drr_toguid = dsp->dsa_toguid; 281 drrw->drr_compression = BP_GET_COMPRESS(bp); 282 drrw->drr_etype = BPE_GET_ETYPE(bp); 283 drrw->drr_lsize = BPE_GET_LSIZE(bp); 284 drrw->drr_psize = BPE_GET_PSIZE(bp); 285 286 decode_embedded_bp_compressed(bp, buf); 287 288 if (dump_record(dsp, buf, P2ROUNDUP(drrw->drr_psize, 8)) != 0) 289 return (EINTR); 290 return (0); 291 } 292 293 static int 294 dump_spill(dmu_sendarg_t *dsp, uint64_t object, int blksz, void *data) 295 { 296 struct drr_spill *drrs = &(dsp->dsa_drr->drr_u.drr_spill); 297 298 if (dsp->dsa_pending_op != PENDING_NONE) { 299 if (dump_record(dsp, NULL, 0) != 0) 300 return (SET_ERROR(EINTR)); 301 dsp->dsa_pending_op = PENDING_NONE; 302 } 303 304 /* write a SPILL record */ 305 bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t)); 306 dsp->dsa_drr->drr_type = DRR_SPILL; 307 drrs->drr_object = object; 308 drrs->drr_length = blksz; 309 drrs->drr_toguid = dsp->dsa_toguid; 310 311 if (dump_record(dsp, data, blksz) != 0) 312 return (SET_ERROR(EINTR)); 313 return (0); 314 } 315 316 static int 317 dump_freeobjects(dmu_sendarg_t *dsp, uint64_t firstobj, uint64_t numobjs) 318 { 319 struct drr_freeobjects *drrfo = &(dsp->dsa_drr->drr_u.drr_freeobjects); 320 321 /* See comment in dump_free(). */ 322 if (!dsp->dsa_incremental) 323 return (0); 324 325 /* 326 * If there is a pending op, but it's not PENDING_FREEOBJECTS, 327 * push it out, since free block aggregation can only be done for 328 * blocks of the same type (i.e., DRR_FREE records can only be 329 * aggregated with other DRR_FREE records. DRR_FREEOBJECTS records 330 * can only be aggregated with other DRR_FREEOBJECTS records. 331 */ 332 if (dsp->dsa_pending_op != PENDING_NONE && 333 dsp->dsa_pending_op != PENDING_FREEOBJECTS) { 334 if (dump_record(dsp, NULL, 0) != 0) 335 return (SET_ERROR(EINTR)); 336 dsp->dsa_pending_op = PENDING_NONE; 337 } 338 if (dsp->dsa_pending_op == PENDING_FREEOBJECTS) { 339 /* 340 * See whether this free object array can be aggregated 341 * with pending one 342 */ 343 if (drrfo->drr_firstobj + drrfo->drr_numobjs == firstobj) { 344 drrfo->drr_numobjs += numobjs; 345 return (0); 346 } else { 347 /* can't be aggregated. 
Push out pending record */ 348 if (dump_record(dsp, NULL, 0) != 0) 349 return (SET_ERROR(EINTR)); 350 dsp->dsa_pending_op = PENDING_NONE; 351 } 352 } 353 354 /* write a FREEOBJECTS record */ 355 bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t)); 356 dsp->dsa_drr->drr_type = DRR_FREEOBJECTS; 357 drrfo->drr_firstobj = firstobj; 358 drrfo->drr_numobjs = numobjs; 359 drrfo->drr_toguid = dsp->dsa_toguid; 360 361 dsp->dsa_pending_op = PENDING_FREEOBJECTS; 362 363 return (0); 364 } 365 366 static int 367 dump_dnode(dmu_sendarg_t *dsp, uint64_t object, dnode_phys_t *dnp) 368 { 369 struct drr_object *drro = &(dsp->dsa_drr->drr_u.drr_object); 370 371 if (dnp == NULL || dnp->dn_type == DMU_OT_NONE) 372 return (dump_freeobjects(dsp, object, 1)); 373 374 if (dsp->dsa_pending_op != PENDING_NONE) { 375 if (dump_record(dsp, NULL, 0) != 0) 376 return (SET_ERROR(EINTR)); 377 dsp->dsa_pending_op = PENDING_NONE; 378 } 379 380 /* write an OBJECT record */ 381 bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t)); 382 dsp->dsa_drr->drr_type = DRR_OBJECT; 383 drro->drr_object = object; 384 drro->drr_type = dnp->dn_type; 385 drro->drr_bonustype = dnp->dn_bonustype; 386 drro->drr_blksz = dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT; 387 drro->drr_bonuslen = dnp->dn_bonuslen; 388 drro->drr_checksumtype = dnp->dn_checksum; 389 drro->drr_compress = dnp->dn_compress; 390 drro->drr_toguid = dsp->dsa_toguid; 391 392 if (!(dsp->dsa_featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS) && 393 drro->drr_blksz > SPA_OLD_MAXBLOCKSIZE) 394 drro->drr_blksz = SPA_OLD_MAXBLOCKSIZE; 395 396 if (dump_record(dsp, DN_BONUS(dnp), 397 P2ROUNDUP(dnp->dn_bonuslen, 8)) != 0) { 398 return (SET_ERROR(EINTR)); 399 } 400 401 /* Free anything past the end of the file. */ 402 if (dump_free(dsp, object, (dnp->dn_maxblkid + 1) * 403 (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT), -1ULL) != 0) 404 return (SET_ERROR(EINTR)); 405 if (dsp->dsa_err != 0) 406 return (SET_ERROR(EINTR)); 407 return (0); 408 } 409 410 static boolean_t 411 backup_do_embed(dmu_sendarg_t *dsp, const blkptr_t *bp) 412 { 413 if (!BP_IS_EMBEDDED(bp)) 414 return (B_FALSE); 415 416 /* 417 * Compression function must be legacy, or explicitly enabled. 418 */ 419 if ((BP_GET_COMPRESS(bp) >= ZIO_COMPRESS_LEGACY_FUNCTIONS && 420 !(dsp->dsa_featureflags & DMU_BACKUP_FEATURE_EMBED_DATA_LZ4))) 421 return (B_FALSE); 422 423 /* 424 * Embed type must be explicitly enabled. 425 */ 426 switch (BPE_GET_ETYPE(bp)) { 427 case BP_EMBEDDED_TYPE_DATA: 428 if (dsp->dsa_featureflags & DMU_BACKUP_FEATURE_EMBED_DATA) 429 return (B_TRUE); 430 break; 431 default: 432 return (B_FALSE); 433 } 434 return (B_FALSE); 435 } 436 437 #define BP_SPAN(dnp, level) \ 438 (((uint64_t)dnp->dn_datablkszsec) << (SPA_MINBLOCKSHIFT + \ 439 (level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT))) 440 441 /* ARGSUSED */ 442 static int 443 backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, 444 const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg) 445 { 446 dmu_sendarg_t *dsp = arg; 447 dmu_object_type_t type = bp ? BP_GET_TYPE(bp) : DMU_OT_NONE; 448 int err = 0; 449 450 if (issig(JUSTLOOKING) && issig(FORREAL)) 451 return (SET_ERROR(EINTR)); 452 453 if (zb->zb_object != DMU_META_DNODE_OBJECT && 454 DMU_OBJECT_IS_SPECIAL(zb->zb_object)) { 455 return (0); 456 } else if (zb->zb_level == ZB_ZIL_LEVEL) { 457 /* 458 * If we are sending a non-snapshot (which is allowed on 459 * read-only pools), it may have a ZIL, which must be ignored. 
460 */ 461 return (0); 462 } else if (BP_IS_HOLE(bp) && 463 zb->zb_object == DMU_META_DNODE_OBJECT) { 464 uint64_t span = BP_SPAN(dnp, zb->zb_level); 465 uint64_t dnobj = (zb->zb_blkid * span) >> DNODE_SHIFT; 466 err = dump_freeobjects(dsp, dnobj, span >> DNODE_SHIFT); 467 } else if (BP_IS_HOLE(bp)) { 468 uint64_t span = BP_SPAN(dnp, zb->zb_level); 469 err = dump_free(dsp, zb->zb_object, zb->zb_blkid * span, span); 470 } else if (zb->zb_level > 0 || type == DMU_OT_OBJSET) { 471 return (0); 472 } else if (type == DMU_OT_DNODE) { 473 dnode_phys_t *blk; 474 int i; 475 int blksz = BP_GET_LSIZE(bp); 476 arc_flags_t aflags = ARC_FLAG_WAIT; 477 arc_buf_t *abuf; 478 479 if (arc_read(NULL, spa, bp, arc_getbuf_func, &abuf, 480 ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, 481 &aflags, zb) != 0) 482 return (SET_ERROR(EIO)); 483 484 blk = abuf->b_data; 485 for (i = 0; i < blksz >> DNODE_SHIFT; i++) { 486 uint64_t dnobj = (zb->zb_blkid << 487 (DNODE_BLOCK_SHIFT - DNODE_SHIFT)) + i; 488 err = dump_dnode(dsp, dnobj, blk+i); 489 if (err != 0) 490 break; 491 } 492 (void) arc_buf_remove_ref(abuf, &abuf); 493 } else if (type == DMU_OT_SA) { 494 arc_flags_t aflags = ARC_FLAG_WAIT; 495 arc_buf_t *abuf; 496 int blksz = BP_GET_LSIZE(bp); 497 498 if (arc_read(NULL, spa, bp, arc_getbuf_func, &abuf, 499 ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, 500 &aflags, zb) != 0) 501 return (SET_ERROR(EIO)); 502 503 err = dump_spill(dsp, zb->zb_object, blksz, abuf->b_data); 504 (void) arc_buf_remove_ref(abuf, &abuf); 505 } else if (backup_do_embed(dsp, bp)) { 506 /* it's an embedded level-0 block of a regular object */ 507 int blksz = dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT; 508 err = dump_write_embedded(dsp, zb->zb_object, 509 zb->zb_blkid * blksz, blksz, bp); 510 } else { /* it's a level-0 block of a regular object */ 511 arc_flags_t aflags = ARC_FLAG_WAIT; 512 arc_buf_t *abuf; 513 int blksz = BP_GET_LSIZE(bp); 514 uint64_t offset; 515 516 ASSERT3U(blksz, ==, dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT); 517 ASSERT0(zb->zb_level); 518 if (arc_read(NULL, spa, bp, arc_getbuf_func, &abuf, 519 ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, 520 &aflags, zb) != 0) { 521 if (zfs_send_corrupt_data) { 522 /* Send a block filled with 0x"zfs badd bloc" */ 523 abuf = arc_buf_alloc(spa, blksz, &abuf, 524 ARC_BUFC_DATA); 525 uint64_t *ptr; 526 for (ptr = abuf->b_data; 527 (char *)ptr < (char *)abuf->b_data + blksz; 528 ptr++) 529 *ptr = 0x2f5baddb10cULL; 530 } else { 531 return (SET_ERROR(EIO)); 532 } 533 } 534 535 offset = zb->zb_blkid * blksz; 536 537 if (!(dsp->dsa_featureflags & 538 DMU_BACKUP_FEATURE_LARGE_BLOCKS) && 539 blksz > SPA_OLD_MAXBLOCKSIZE) { 540 char *buf = abuf->b_data; 541 while (blksz > 0 && err == 0) { 542 int n = MIN(blksz, SPA_OLD_MAXBLOCKSIZE); 543 err = dump_write(dsp, type, zb->zb_object, 544 offset, n, NULL, buf); 545 offset += n; 546 buf += n; 547 blksz -= n; 548 } 549 } else { 550 err = dump_write(dsp, type, zb->zb_object, 551 offset, blksz, bp, abuf->b_data); 552 } 553 (void) arc_buf_remove_ref(abuf, &abuf); 554 } 555 556 ASSERT(err == 0 || err == EINTR); 557 return (err); 558 } 559 560 /* 561 * Releases dp using the specified tag. 
562 */ 563 static int 564 dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *ds, 565 zfs_bookmark_phys_t *fromzb, boolean_t is_clone, boolean_t embedok, 566 boolean_t large_block_ok, int outfd, vnode_t *vp, offset_t *off) 567 { 568 objset_t *os; 569 dmu_replay_record_t *drr; 570 dmu_sendarg_t *dsp; 571 int err; 572 uint64_t fromtxg = 0; 573 uint64_t featureflags = 0; 574 575 err = dmu_objset_from_ds(ds, &os); 576 if (err != 0) { 577 dsl_pool_rele(dp, tag); 578 return (err); 579 } 580 581 drr = kmem_zalloc(sizeof (dmu_replay_record_t), KM_SLEEP); 582 drr->drr_type = DRR_BEGIN; 583 drr->drr_u.drr_begin.drr_magic = DMU_BACKUP_MAGIC; 584 DMU_SET_STREAM_HDRTYPE(drr->drr_u.drr_begin.drr_versioninfo, 585 DMU_SUBSTREAM); 586 587 #ifdef _KERNEL 588 if (dmu_objset_type(os) == DMU_OST_ZFS) { 589 uint64_t version; 590 if (zfs_get_zplprop(os, ZFS_PROP_VERSION, &version) != 0) { 591 kmem_free(drr, sizeof (dmu_replay_record_t)); 592 dsl_pool_rele(dp, tag); 593 return (SET_ERROR(EINVAL)); 594 } 595 if (version >= ZPL_VERSION_SA) { 596 featureflags |= DMU_BACKUP_FEATURE_SA_SPILL; 597 } 598 } 599 #endif 600 601 if (large_block_ok && ds->ds_large_blocks) 602 featureflags |= DMU_BACKUP_FEATURE_LARGE_BLOCKS; 603 if (embedok && 604 spa_feature_is_active(dp->dp_spa, SPA_FEATURE_EMBEDDED_DATA)) { 605 featureflags |= DMU_BACKUP_FEATURE_EMBED_DATA; 606 if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_LZ4_COMPRESS)) 607 featureflags |= DMU_BACKUP_FEATURE_EMBED_DATA_LZ4; 608 } else { 609 embedok = B_FALSE; 610 } 611 612 DMU_SET_FEATUREFLAGS(drr->drr_u.drr_begin.drr_versioninfo, 613 featureflags); 614 615 drr->drr_u.drr_begin.drr_creation_time = 616 dsl_dataset_phys(ds)->ds_creation_time; 617 drr->drr_u.drr_begin.drr_type = dmu_objset_type(os); 618 if (is_clone) 619 drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_CLONE; 620 drr->drr_u.drr_begin.drr_toguid = dsl_dataset_phys(ds)->ds_guid; 621 if (dsl_dataset_phys(ds)->ds_flags & DS_FLAG_CI_DATASET) 622 drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_CI_DATA; 623 624 if (fromzb != NULL) { 625 drr->drr_u.drr_begin.drr_fromguid = fromzb->zbm_guid; 626 fromtxg = fromzb->zbm_creation_txg; 627 } 628 dsl_dataset_name(ds, drr->drr_u.drr_begin.drr_toname); 629 if (!ds->ds_is_snapshot) { 630 (void) strlcat(drr->drr_u.drr_begin.drr_toname, "@--head--", 631 sizeof (drr->drr_u.drr_begin.drr_toname)); 632 } 633 634 dsp = kmem_zalloc(sizeof (dmu_sendarg_t), KM_SLEEP); 635 636 dsp->dsa_drr = drr; 637 dsp->dsa_vp = vp; 638 dsp->dsa_outfd = outfd; 639 dsp->dsa_proc = curproc; 640 dsp->dsa_os = os; 641 dsp->dsa_off = off; 642 dsp->dsa_toguid = dsl_dataset_phys(ds)->ds_guid; 643 dsp->dsa_pending_op = PENDING_NONE; 644 dsp->dsa_incremental = (fromzb != NULL); 645 dsp->dsa_featureflags = featureflags; 646 647 mutex_enter(&ds->ds_sendstream_lock); 648 list_insert_head(&ds->ds_sendstreams, dsp); 649 mutex_exit(&ds->ds_sendstream_lock); 650 651 dsl_dataset_long_hold(ds, FTAG); 652 dsl_pool_rele(dp, tag); 653 654 if (dump_record(dsp, NULL, 0) != 0) { 655 err = dsp->dsa_err; 656 goto out; 657 } 658 659 err = traverse_dataset(ds, fromtxg, TRAVERSE_PRE | TRAVERSE_PREFETCH, 660 backup_cb, dsp); 661 662 if (dsp->dsa_pending_op != PENDING_NONE) 663 if (dump_record(dsp, NULL, 0) != 0) 664 err = SET_ERROR(EINTR); 665 666 if (err != 0) { 667 if (err == EINTR && dsp->dsa_err != 0) 668 err = dsp->dsa_err; 669 goto out; 670 } 671 672 bzero(drr, sizeof (dmu_replay_record_t)); 673 drr->drr_type = DRR_END; 674 drr->drr_u.drr_end.drr_checksum = dsp->dsa_zc; 675 drr->drr_u.drr_end.drr_toguid = dsp->dsa_toguid; 676 
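	/*
	 * Note: the END record carries the running fletcher-4 checksum
	 * (dsa_zc) that dump_record() has accumulated over every record
	 * and payload written so far, which lets the receiver verify the
	 * stream as a whole (see the DRR_END case in
	 * restore_process_record()).
	 */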
677 if (dump_record(dsp, NULL, 0) != 0) { 678 err = dsp->dsa_err; 679 goto out; 680 } 681 682 out: 683 mutex_enter(&ds->ds_sendstream_lock); 684 list_remove(&ds->ds_sendstreams, dsp); 685 mutex_exit(&ds->ds_sendstream_lock); 686 687 kmem_free(drr, sizeof (dmu_replay_record_t)); 688 kmem_free(dsp, sizeof (dmu_sendarg_t)); 689 690 dsl_dataset_long_rele(ds, FTAG); 691 692 return (err); 693 } 694 695 int 696 dmu_send_obj(const char *pool, uint64_t tosnap, uint64_t fromsnap, 697 boolean_t embedok, boolean_t large_block_ok, 698 int outfd, vnode_t *vp, offset_t *off) 699 { 700 dsl_pool_t *dp; 701 dsl_dataset_t *ds; 702 dsl_dataset_t *fromds = NULL; 703 int err; 704 705 err = dsl_pool_hold(pool, FTAG, &dp); 706 if (err != 0) 707 return (err); 708 709 err = dsl_dataset_hold_obj(dp, tosnap, FTAG, &ds); 710 if (err != 0) { 711 dsl_pool_rele(dp, FTAG); 712 return (err); 713 } 714 715 if (fromsnap != 0) { 716 zfs_bookmark_phys_t zb; 717 boolean_t is_clone; 718 719 err = dsl_dataset_hold_obj(dp, fromsnap, FTAG, &fromds); 720 if (err != 0) { 721 dsl_dataset_rele(ds, FTAG); 722 dsl_pool_rele(dp, FTAG); 723 return (err); 724 } 725 if (!dsl_dataset_is_before(ds, fromds, 0)) 726 err = SET_ERROR(EXDEV); 727 zb.zbm_creation_time = 728 dsl_dataset_phys(fromds)->ds_creation_time; 729 zb.zbm_creation_txg = dsl_dataset_phys(fromds)->ds_creation_txg; 730 zb.zbm_guid = dsl_dataset_phys(fromds)->ds_guid; 731 is_clone = (fromds->ds_dir != ds->ds_dir); 732 dsl_dataset_rele(fromds, FTAG); 733 err = dmu_send_impl(FTAG, dp, ds, &zb, is_clone, 734 embedok, large_block_ok, outfd, vp, off); 735 } else { 736 err = dmu_send_impl(FTAG, dp, ds, NULL, B_FALSE, 737 embedok, large_block_ok, outfd, vp, off); 738 } 739 dsl_dataset_rele(ds, FTAG); 740 return (err); 741 } 742 743 int 744 dmu_send(const char *tosnap, const char *fromsnap, 745 boolean_t embedok, boolean_t large_block_ok, 746 int outfd, vnode_t *vp, offset_t *off) 747 { 748 dsl_pool_t *dp; 749 dsl_dataset_t *ds; 750 int err; 751 boolean_t owned = B_FALSE; 752 753 if (fromsnap != NULL && strpbrk(fromsnap, "@#") == NULL) 754 return (SET_ERROR(EINVAL)); 755 756 err = dsl_pool_hold(tosnap, FTAG, &dp); 757 if (err != 0) 758 return (err); 759 760 if (strchr(tosnap, '@') == NULL && spa_writeable(dp->dp_spa)) { 761 /* 762 * We are sending a filesystem or volume. Ensure 763 * that it doesn't change by owning the dataset. 764 */ 765 err = dsl_dataset_own(dp, tosnap, FTAG, &ds); 766 owned = B_TRUE; 767 } else { 768 err = dsl_dataset_hold(dp, tosnap, FTAG, &ds); 769 } 770 if (err != 0) { 771 dsl_pool_rele(dp, FTAG); 772 return (err); 773 } 774 775 if (fromsnap != NULL) { 776 zfs_bookmark_phys_t zb; 777 boolean_t is_clone = B_FALSE; 778 int fsnamelen = strchr(tosnap, '@') - tosnap; 779 780 /* 781 * If the fromsnap is in a different filesystem, then 782 * mark the send stream as a clone. 
783 */ 784 if (strncmp(tosnap, fromsnap, fsnamelen) != 0 || 785 (fromsnap[fsnamelen] != '@' && 786 fromsnap[fsnamelen] != '#')) { 787 is_clone = B_TRUE; 788 } 789 790 if (strchr(fromsnap, '@')) { 791 dsl_dataset_t *fromds; 792 err = dsl_dataset_hold(dp, fromsnap, FTAG, &fromds); 793 if (err == 0) { 794 if (!dsl_dataset_is_before(ds, fromds, 0)) 795 err = SET_ERROR(EXDEV); 796 zb.zbm_creation_time = 797 dsl_dataset_phys(fromds)->ds_creation_time; 798 zb.zbm_creation_txg = 799 dsl_dataset_phys(fromds)->ds_creation_txg; 800 zb.zbm_guid = dsl_dataset_phys(fromds)->ds_guid; 801 is_clone = (ds->ds_dir != fromds->ds_dir); 802 dsl_dataset_rele(fromds, FTAG); 803 } 804 } else { 805 err = dsl_bookmark_lookup(dp, fromsnap, ds, &zb); 806 } 807 if (err != 0) { 808 dsl_dataset_rele(ds, FTAG); 809 dsl_pool_rele(dp, FTAG); 810 return (err); 811 } 812 err = dmu_send_impl(FTAG, dp, ds, &zb, is_clone, 813 embedok, large_block_ok, outfd, vp, off); 814 } else { 815 err = dmu_send_impl(FTAG, dp, ds, NULL, B_FALSE, 816 embedok, large_block_ok, outfd, vp, off); 817 } 818 if (owned) 819 dsl_dataset_disown(ds, FTAG); 820 else 821 dsl_dataset_rele(ds, FTAG); 822 return (err); 823 } 824 825 static int 826 dmu_adjust_send_estimate_for_indirects(dsl_dataset_t *ds, uint64_t size, 827 uint64_t *sizep) 828 { 829 int err; 830 /* 831 * Assume that space (both on-disk and in-stream) is dominated by 832 * data. We will adjust for indirect blocks and the copies property, 833 * but ignore per-object space used (eg, dnodes and DRR_OBJECT records). 834 */ 835 836 /* 837 * Subtract out approximate space used by indirect blocks. 838 * Assume most space is used by data blocks (non-indirect, non-dnode). 839 * Assume all blocks are recordsize. Assume ditto blocks and 840 * internal fragmentation counter out compression. 841 * 842 * Therefore, space used by indirect blocks is sizeof(blkptr_t) per 843 * block, which we observe in practice. 844 */ 845 uint64_t recordsize; 846 err = dsl_prop_get_int_ds(ds, "recordsize", &recordsize); 847 if (err != 0) 848 return (err); 849 size -= size / recordsize * sizeof (blkptr_t); 850 851 /* Add in the space for the record associated with each block. */ 852 size += size / recordsize * sizeof (dmu_replay_record_t); 853 854 *sizep = size; 855 856 return (0); 857 } 858 859 int 860 dmu_send_estimate(dsl_dataset_t *ds, dsl_dataset_t *fromds, uint64_t *sizep) 861 { 862 dsl_pool_t *dp = ds->ds_dir->dd_pool; 863 int err; 864 uint64_t size; 865 866 ASSERT(dsl_pool_config_held(dp)); 867 868 /* tosnap must be a snapshot */ 869 if (!ds->ds_is_snapshot) 870 return (SET_ERROR(EINVAL)); 871 872 /* fromsnap, if provided, must be a snapshot */ 873 if (fromds != NULL && !fromds->ds_is_snapshot) 874 return (SET_ERROR(EINVAL)); 875 876 /* 877 * fromsnap must be an earlier snapshot from the same fs as tosnap, 878 * or the origin's fs. 879 */ 880 if (fromds != NULL && !dsl_dataset_is_before(ds, fromds, 0)) 881 return (SET_ERROR(EXDEV)); 882 883 /* Get uncompressed size estimate of changed data. 
 */
	if (fromds == NULL) {
		size = dsl_dataset_phys(ds)->ds_uncompressed_bytes;
	} else {
		uint64_t used, comp;
		err = dsl_dataset_space_written(fromds, ds,
		    &used, &comp, &size);
		if (err != 0)
			return (err);
	}

	err = dmu_adjust_send_estimate_for_indirects(ds, size, sizep);
	return (err);
}

/*
 * Simple callback used to traverse the blocks of a snapshot and sum their
 * uncompressed size
 */
/* ARGSUSED */
static int
dmu_calculate_send_traversal(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
    const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
{
	uint64_t *spaceptr = arg;
	if (bp != NULL && !BP_IS_HOLE(bp)) {
		*spaceptr += BP_GET_UCSIZE(bp);
	}
	return (0);
}

/*
 * Given a destination snapshot and a TXG, calculate the approximate size of a
 * send stream sent from that TXG. from_txg may be zero, indicating that the
 * whole snapshot will be sent.
 */
int
dmu_send_estimate_from_txg(dsl_dataset_t *ds, uint64_t from_txg,
    uint64_t *sizep)
{
	dsl_pool_t *dp = ds->ds_dir->dd_pool;
	int err;
	uint64_t size = 0;

	ASSERT(dsl_pool_config_held(dp));

	/* tosnap must be a snapshot */
	if (!dsl_dataset_is_snapshot(ds))
		return (SET_ERROR(EINVAL));

	/* verify that from_txg is before the provided snapshot was taken */
	if (from_txg >= dsl_dataset_phys(ds)->ds_creation_txg) {
		return (SET_ERROR(EXDEV));
	}

	/*
	 * traverse the blocks of the snapshot with birth times after
	 * from_txg, summing their uncompressed size
	 */
	err = traverse_dataset(ds, from_txg, TRAVERSE_POST,
	    dmu_calculate_send_traversal, &size);
	if (err)
		return (err);

	err = dmu_adjust_send_estimate_for_indirects(ds, size, sizep);
	return (err);
}

typedef struct dmu_recv_begin_arg {
	const char *drba_origin;
	dmu_recv_cookie_t *drba_cookie;
	cred_t *drba_cred;
	uint64_t drba_snapobj;
} dmu_recv_begin_arg_t;

static int
recv_begin_check_existing_impl(dmu_recv_begin_arg_t *drba, dsl_dataset_t *ds,
    uint64_t fromguid)
{
	uint64_t val;
	int error;
	dsl_pool_t *dp = ds->ds_dir->dd_pool;

	/* temporary clone name must not exist */
	error = zap_lookup(dp->dp_meta_objset,
	    dsl_dir_phys(ds->ds_dir)->dd_child_dir_zapobj, recv_clone_name,
	    8, 1, &val);
	if (error != ENOENT)
		return (error == 0 ? EBUSY : error);

	/* new snapshot name must not exist */
	error = zap_lookup(dp->dp_meta_objset,
	    dsl_dataset_phys(ds)->ds_snapnames_zapobj,
	    drba->drba_cookie->drc_tosnap, 8, 1, &val);
	if (error != ENOENT)
		return (error == 0 ? EEXIST : error);

	/*
	 * Check snapshot limit before receiving. We'll recheck again at the
	 * end, but might as well abort before receiving if we're already over
	 * the limit.
	 *
	 * Note that we do not check the file system limit with
	 * dsl_dir_fscount_check because the temporary %clones don't count
	 * against that limit.
	 */
	error = dsl_fs_ss_limit_check(ds->ds_dir, 1, ZFS_PROP_SNAPSHOT_LIMIT,
	    NULL, drba->drba_cred);
	if (error != 0)
		return (error);

	if (fromguid != 0) {
		dsl_dataset_t *snap;
		uint64_t obj = dsl_dataset_phys(ds)->ds_prev_snap_obj;

		/* Find snapshot in this dir that matches fromguid */
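		/*
		 * If a matching snapshot is found, a forced receive simply
		 * records it as drba_snapobj (the existing fs is effectively
		 * rolled back to it when the receive completes); otherwise
		 * the fs must be unmodified since that snapshot or the
		 * receive fails with ETXTBSY.
		 */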
		while (obj != 0) {
			error = dsl_dataset_hold_obj(dp, obj, FTAG,
			    &snap);
			if (error != 0)
				return (SET_ERROR(ENODEV));
			if (snap->ds_dir != ds->ds_dir) {
				dsl_dataset_rele(snap, FTAG);
				return (SET_ERROR(ENODEV));
			}
			if (dsl_dataset_phys(snap)->ds_guid == fromguid)
				break;
			obj = dsl_dataset_phys(snap)->ds_prev_snap_obj;
			dsl_dataset_rele(snap, FTAG);
		}
		if (obj == 0)
			return (SET_ERROR(ENODEV));

		if (drba->drba_cookie->drc_force) {
			drba->drba_snapobj = obj;
		} else {
			/*
			 * If we are not forcing, there must be no
			 * changes since fromsnap.
			 */
			if (dsl_dataset_modified_since_snap(ds, snap)) {
				dsl_dataset_rele(snap, FTAG);
				return (SET_ERROR(ETXTBSY));
			}
			drba->drba_snapobj = ds->ds_prev->ds_object;
		}

		dsl_dataset_rele(snap, FTAG);
	} else {
		/* if full, then must be forced */
		if (!drba->drba_cookie->drc_force)
			return (SET_ERROR(EEXIST));
		/* start from $ORIGIN@$ORIGIN, if supported */
		drba->drba_snapobj = dp->dp_origin_snap != NULL ?
		    dp->dp_origin_snap->ds_object : 0;
	}

	return (0);

}

static int
dmu_recv_begin_check(void *arg, dmu_tx_t *tx)
{
	dmu_recv_begin_arg_t *drba = arg;
	dsl_pool_t *dp = dmu_tx_pool(tx);
	struct drr_begin *drrb = drba->drba_cookie->drc_drrb;
	uint64_t fromguid = drrb->drr_fromguid;
	int flags = drrb->drr_flags;
	int error;
	uint64_t featureflags = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo);
	dsl_dataset_t *ds;
	const char *tofs = drba->drba_cookie->drc_tofs;

	/* already checked */
	ASSERT3U(drrb->drr_magic, ==, DMU_BACKUP_MAGIC);

	if (DMU_GET_STREAM_HDRTYPE(drrb->drr_versioninfo) ==
	    DMU_COMPOUNDSTREAM ||
	    drrb->drr_type >= DMU_OST_NUMTYPES ||
	    ((flags & DRR_FLAG_CLONE) && drba->drba_origin == NULL))
		return (SET_ERROR(EINVAL));

	/* Verify pool version supports SA if SA_SPILL feature set */
	if ((featureflags & DMU_BACKUP_FEATURE_SA_SPILL) &&
	    spa_version(dp->dp_spa) < SPA_VERSION_SA)
		return (SET_ERROR(ENOTSUP));

	/*
	 * The receiving code doesn't know how to translate a WRITE_EMBEDDED
	 * record to a plain WRITE record, so the pool must have the
	 * EMBEDDED_DATA feature enabled if the stream has WRITE_EMBEDDED
	 * records. Same with WRITE_EMBEDDED records that use LZ4 compression.
	 */
	if ((featureflags & DMU_BACKUP_FEATURE_EMBED_DATA) &&
	    !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_EMBEDDED_DATA))
		return (SET_ERROR(ENOTSUP));
	if ((featureflags & DMU_BACKUP_FEATURE_EMBED_DATA_LZ4) &&
	    !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LZ4_COMPRESS))
		return (SET_ERROR(ENOTSUP));

	/*
	 * The receiving code doesn't know how to translate large blocks
	 * to smaller ones, so the pool must have the LARGE_BLOCKS
	 * feature enabled if the stream has LARGE_BLOCKS.
1088 */ 1089 if ((featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS) && 1090 !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LARGE_BLOCKS)) 1091 return (SET_ERROR(ENOTSUP)); 1092 1093 error = dsl_dataset_hold(dp, tofs, FTAG, &ds); 1094 if (error == 0) { 1095 /* target fs already exists; recv into temp clone */ 1096 1097 /* Can't recv a clone into an existing fs */ 1098 if (flags & DRR_FLAG_CLONE) { 1099 dsl_dataset_rele(ds, FTAG); 1100 return (SET_ERROR(EINVAL)); 1101 } 1102 1103 error = recv_begin_check_existing_impl(drba, ds, fromguid); 1104 dsl_dataset_rele(ds, FTAG); 1105 } else if (error == ENOENT) { 1106 /* target fs does not exist; must be a full backup or clone */ 1107 char buf[MAXNAMELEN]; 1108 1109 /* 1110 * If it's a non-clone incremental, we are missing the 1111 * target fs, so fail the recv. 1112 */ 1113 if (fromguid != 0 && !(flags & DRR_FLAG_CLONE)) 1114 return (SET_ERROR(ENOENT)); 1115 1116 /* Open the parent of tofs */ 1117 ASSERT3U(strlen(tofs), <, MAXNAMELEN); 1118 (void) strlcpy(buf, tofs, strrchr(tofs, '/') - tofs + 1); 1119 error = dsl_dataset_hold(dp, buf, FTAG, &ds); 1120 if (error != 0) 1121 return (error); 1122 1123 /* 1124 * Check filesystem and snapshot limits before receiving. We'll 1125 * recheck snapshot limits again at the end (we create the 1126 * filesystems and increment those counts during begin_sync). 1127 */ 1128 error = dsl_fs_ss_limit_check(ds->ds_dir, 1, 1129 ZFS_PROP_FILESYSTEM_LIMIT, NULL, drba->drba_cred); 1130 if (error != 0) { 1131 dsl_dataset_rele(ds, FTAG); 1132 return (error); 1133 } 1134 1135 error = dsl_fs_ss_limit_check(ds->ds_dir, 1, 1136 ZFS_PROP_SNAPSHOT_LIMIT, NULL, drba->drba_cred); 1137 if (error != 0) { 1138 dsl_dataset_rele(ds, FTAG); 1139 return (error); 1140 } 1141 1142 if (drba->drba_origin != NULL) { 1143 dsl_dataset_t *origin; 1144 error = dsl_dataset_hold(dp, drba->drba_origin, 1145 FTAG, &origin); 1146 if (error != 0) { 1147 dsl_dataset_rele(ds, FTAG); 1148 return (error); 1149 } 1150 if (!origin->ds_is_snapshot) { 1151 dsl_dataset_rele(origin, FTAG); 1152 dsl_dataset_rele(ds, FTAG); 1153 return (SET_ERROR(EINVAL)); 1154 } 1155 if (dsl_dataset_phys(origin)->ds_guid != fromguid) { 1156 dsl_dataset_rele(origin, FTAG); 1157 dsl_dataset_rele(ds, FTAG); 1158 return (SET_ERROR(ENODEV)); 1159 } 1160 dsl_dataset_rele(origin, FTAG); 1161 } 1162 dsl_dataset_rele(ds, FTAG); 1163 error = 0; 1164 } 1165 return (error); 1166 } 1167 1168 static void 1169 dmu_recv_begin_sync(void *arg, dmu_tx_t *tx) 1170 { 1171 dmu_recv_begin_arg_t *drba = arg; 1172 dsl_pool_t *dp = dmu_tx_pool(tx); 1173 struct drr_begin *drrb = drba->drba_cookie->drc_drrb; 1174 const char *tofs = drba->drba_cookie->drc_tofs; 1175 dsl_dataset_t *ds, *newds; 1176 uint64_t dsobj; 1177 int error; 1178 uint64_t crflags; 1179 1180 crflags = (drrb->drr_flags & DRR_FLAG_CI_DATA) ? 
1181 DS_FLAG_CI_DATASET : 0; 1182 1183 error = dsl_dataset_hold(dp, tofs, FTAG, &ds); 1184 if (error == 0) { 1185 /* create temporary clone */ 1186 dsl_dataset_t *snap = NULL; 1187 if (drba->drba_snapobj != 0) { 1188 VERIFY0(dsl_dataset_hold_obj(dp, 1189 drba->drba_snapobj, FTAG, &snap)); 1190 } 1191 dsobj = dsl_dataset_create_sync(ds->ds_dir, recv_clone_name, 1192 snap, crflags, drba->drba_cred, tx); 1193 if (drba->drba_snapobj != 0) 1194 dsl_dataset_rele(snap, FTAG); 1195 dsl_dataset_rele(ds, FTAG); 1196 } else { 1197 dsl_dir_t *dd; 1198 const char *tail; 1199 dsl_dataset_t *origin = NULL; 1200 1201 VERIFY0(dsl_dir_hold(dp, tofs, FTAG, &dd, &tail)); 1202 1203 if (drba->drba_origin != NULL) { 1204 VERIFY0(dsl_dataset_hold(dp, drba->drba_origin, 1205 FTAG, &origin)); 1206 } 1207 1208 /* Create new dataset. */ 1209 dsobj = dsl_dataset_create_sync(dd, 1210 strrchr(tofs, '/') + 1, 1211 origin, crflags, drba->drba_cred, tx); 1212 if (origin != NULL) 1213 dsl_dataset_rele(origin, FTAG); 1214 dsl_dir_rele(dd, FTAG); 1215 drba->drba_cookie->drc_newfs = B_TRUE; 1216 } 1217 VERIFY0(dsl_dataset_own_obj(dp, dsobj, dmu_recv_tag, &newds)); 1218 1219 if ((DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo) & 1220 DMU_BACKUP_FEATURE_LARGE_BLOCKS) && 1221 !newds->ds_large_blocks) { 1222 dsl_dataset_activate_large_blocks_sync_impl(dsobj, tx); 1223 newds->ds_large_blocks = B_TRUE; 1224 } 1225 1226 dmu_buf_will_dirty(newds->ds_dbuf, tx); 1227 dsl_dataset_phys(newds)->ds_flags |= DS_FLAG_INCONSISTENT; 1228 1229 /* 1230 * If we actually created a non-clone, we need to create the 1231 * objset in our new dataset. 1232 */ 1233 if (BP_IS_HOLE(dsl_dataset_get_blkptr(newds))) { 1234 (void) dmu_objset_create_impl(dp->dp_spa, 1235 newds, dsl_dataset_get_blkptr(newds), drrb->drr_type, tx); 1236 } 1237 1238 drba->drba_cookie->drc_ds = newds; 1239 1240 spa_history_log_internal_ds(newds, "receive", tx, ""); 1241 } 1242 1243 /* 1244 * NB: callers *MUST* call dmu_recv_stream() if dmu_recv_begin() 1245 * succeeds; otherwise we will leak the holds on the datasets. 
1246 */ 1247 int 1248 dmu_recv_begin(char *tofs, char *tosnap, struct drr_begin *drrb, 1249 boolean_t force, char *origin, dmu_recv_cookie_t *drc) 1250 { 1251 dmu_recv_begin_arg_t drba = { 0 }; 1252 dmu_replay_record_t *drr; 1253 1254 bzero(drc, sizeof (dmu_recv_cookie_t)); 1255 drc->drc_drrb = drrb; 1256 drc->drc_tosnap = tosnap; 1257 drc->drc_tofs = tofs; 1258 drc->drc_force = force; 1259 drc->drc_cred = CRED(); 1260 1261 if (drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC)) 1262 drc->drc_byteswap = B_TRUE; 1263 else if (drrb->drr_magic != DMU_BACKUP_MAGIC) 1264 return (SET_ERROR(EINVAL)); 1265 1266 drr = kmem_zalloc(sizeof (dmu_replay_record_t), KM_SLEEP); 1267 drr->drr_type = DRR_BEGIN; 1268 drr->drr_u.drr_begin = *drc->drc_drrb; 1269 if (drc->drc_byteswap) { 1270 fletcher_4_incremental_byteswap(drr, 1271 sizeof (dmu_replay_record_t), &drc->drc_cksum); 1272 } else { 1273 fletcher_4_incremental_native(drr, 1274 sizeof (dmu_replay_record_t), &drc->drc_cksum); 1275 } 1276 kmem_free(drr, sizeof (dmu_replay_record_t)); 1277 1278 if (drc->drc_byteswap) { 1279 drrb->drr_magic = BSWAP_64(drrb->drr_magic); 1280 drrb->drr_versioninfo = BSWAP_64(drrb->drr_versioninfo); 1281 drrb->drr_creation_time = BSWAP_64(drrb->drr_creation_time); 1282 drrb->drr_type = BSWAP_32(drrb->drr_type); 1283 drrb->drr_toguid = BSWAP_64(drrb->drr_toguid); 1284 drrb->drr_fromguid = BSWAP_64(drrb->drr_fromguid); 1285 } 1286 1287 drba.drba_origin = origin; 1288 drba.drba_cookie = drc; 1289 drba.drba_cred = CRED(); 1290 1291 return (dsl_sync_task(tofs, dmu_recv_begin_check, dmu_recv_begin_sync, 1292 &drba, 5, ZFS_SPACE_CHECK_NORMAL)); 1293 } 1294 1295 struct restorearg { 1296 objset_t *os; 1297 int err; 1298 boolean_t byteswap; 1299 vnode_t *vp; 1300 uint64_t voff; 1301 int bufsize; /* amount of memory allocated for buf */ 1302 1303 dmu_replay_record_t *drr; 1304 dmu_replay_record_t *next_drr; 1305 char *buf; 1306 zio_cksum_t cksum; 1307 zio_cksum_t prev_cksum; 1308 1309 avl_tree_t *guid_to_ds_map; 1310 }; 1311 1312 typedef struct guid_map_entry { 1313 uint64_t guid; 1314 dsl_dataset_t *gme_ds; 1315 avl_node_t avlnode; 1316 } guid_map_entry_t; 1317 1318 static int 1319 guid_compare(const void *arg1, const void *arg2) 1320 { 1321 const guid_map_entry_t *gmep1 = arg1; 1322 const guid_map_entry_t *gmep2 = arg2; 1323 1324 if (gmep1->guid < gmep2->guid) 1325 return (-1); 1326 else if (gmep1->guid > gmep2->guid) 1327 return (1); 1328 return (0); 1329 } 1330 1331 static void 1332 free_guid_map_onexit(void *arg) 1333 { 1334 avl_tree_t *ca = arg; 1335 void *cookie = NULL; 1336 guid_map_entry_t *gmep; 1337 1338 while ((gmep = avl_destroy_nodes(ca, &cookie)) != NULL) { 1339 dsl_dataset_long_rele(gmep->gme_ds, gmep); 1340 dsl_dataset_rele(gmep->gme_ds, gmep); 1341 kmem_free(gmep, sizeof (guid_map_entry_t)); 1342 } 1343 avl_destroy(ca); 1344 kmem_free(ca, sizeof (avl_tree_t)); 1345 } 1346 1347 static int 1348 restore_read(struct restorearg *ra, int len, void *buf) 1349 { 1350 int done = 0; 1351 1352 /* some things will require 8-byte alignment, so everything must */ 1353 ASSERT0(len % 8); 1354 ASSERT3U(len, <=, ra->bufsize); 1355 1356 while (done < len) { 1357 ssize_t resid; 1358 1359 ra->err = vn_rdwr(UIO_READ, ra->vp, 1360 (char *)buf + done, len - done, 1361 ra->voff, UIO_SYSSPACE, FAPPEND, 1362 RLIM64_INFINITY, CRED(), &resid); 1363 1364 if (resid == len - done) 1365 ra->err = SET_ERROR(EINVAL); 1366 ra->voff += len - done - resid; 1367 done = len - resid; 1368 if (ra->err != 0) 1369 return (ra->err); 1370 } 1371 1372 ASSERT3U(done, 
==, len); 1373 return (0); 1374 } 1375 1376 static void 1377 byteswap_record(dmu_replay_record_t *drr) 1378 { 1379 #define DO64(X) (drr->drr_u.X = BSWAP_64(drr->drr_u.X)) 1380 #define DO32(X) (drr->drr_u.X = BSWAP_32(drr->drr_u.X)) 1381 drr->drr_type = BSWAP_32(drr->drr_type); 1382 drr->drr_payloadlen = BSWAP_32(drr->drr_payloadlen); 1383 1384 switch (drr->drr_type) { 1385 case DRR_BEGIN: 1386 DO64(drr_begin.drr_magic); 1387 DO64(drr_begin.drr_versioninfo); 1388 DO64(drr_begin.drr_creation_time); 1389 DO32(drr_begin.drr_type); 1390 DO32(drr_begin.drr_flags); 1391 DO64(drr_begin.drr_toguid); 1392 DO64(drr_begin.drr_fromguid); 1393 break; 1394 case DRR_OBJECT: 1395 DO64(drr_object.drr_object); 1396 DO32(drr_object.drr_type); 1397 DO32(drr_object.drr_bonustype); 1398 DO32(drr_object.drr_blksz); 1399 DO32(drr_object.drr_bonuslen); 1400 DO64(drr_object.drr_toguid); 1401 break; 1402 case DRR_FREEOBJECTS: 1403 DO64(drr_freeobjects.drr_firstobj); 1404 DO64(drr_freeobjects.drr_numobjs); 1405 DO64(drr_freeobjects.drr_toguid); 1406 break; 1407 case DRR_WRITE: 1408 DO64(drr_write.drr_object); 1409 DO32(drr_write.drr_type); 1410 DO64(drr_write.drr_offset); 1411 DO64(drr_write.drr_length); 1412 DO64(drr_write.drr_toguid); 1413 ZIO_CHECKSUM_BSWAP(&drr->drr_u.drr_write.drr_key.ddk_cksum); 1414 DO64(drr_write.drr_key.ddk_prop); 1415 break; 1416 case DRR_WRITE_BYREF: 1417 DO64(drr_write_byref.drr_object); 1418 DO64(drr_write_byref.drr_offset); 1419 DO64(drr_write_byref.drr_length); 1420 DO64(drr_write_byref.drr_toguid); 1421 DO64(drr_write_byref.drr_refguid); 1422 DO64(drr_write_byref.drr_refobject); 1423 DO64(drr_write_byref.drr_refoffset); 1424 ZIO_CHECKSUM_BSWAP(&drr->drr_u.drr_write_byref. 1425 drr_key.ddk_cksum); 1426 DO64(drr_write_byref.drr_key.ddk_prop); 1427 break; 1428 case DRR_WRITE_EMBEDDED: 1429 DO64(drr_write_embedded.drr_object); 1430 DO64(drr_write_embedded.drr_offset); 1431 DO64(drr_write_embedded.drr_length); 1432 DO64(drr_write_embedded.drr_toguid); 1433 DO32(drr_write_embedded.drr_lsize); 1434 DO32(drr_write_embedded.drr_psize); 1435 break; 1436 case DRR_FREE: 1437 DO64(drr_free.drr_object); 1438 DO64(drr_free.drr_offset); 1439 DO64(drr_free.drr_length); 1440 DO64(drr_free.drr_toguid); 1441 break; 1442 case DRR_SPILL: 1443 DO64(drr_spill.drr_object); 1444 DO64(drr_spill.drr_length); 1445 DO64(drr_spill.drr_toguid); 1446 break; 1447 case DRR_END: 1448 DO64(drr_end.drr_toguid); 1449 ZIO_CHECKSUM_BSWAP(&drr->drr_u.drr_end.drr_checksum); 1450 break; 1451 } 1452 1453 if (drr->drr_type != DRR_BEGIN) { 1454 ZIO_CHECKSUM_BSWAP(&drr->drr_u.drr_checksum.drr_checksum); 1455 } 1456 1457 #undef DO64 1458 #undef DO32 1459 } 1460 1461 static inline uint8_t 1462 deduce_nblkptr(dmu_object_type_t bonus_type, uint64_t bonus_size) 1463 { 1464 if (bonus_type == DMU_OT_SA) { 1465 return (1); 1466 } else { 1467 return (1 + 1468 ((DN_MAX_BONUSLEN - bonus_size) >> SPA_BLKPTRSHIFT)); 1469 } 1470 } 1471 1472 static int 1473 restore_object(struct restorearg *ra, struct drr_object *drro, void *data) 1474 { 1475 dmu_object_info_t doi; 1476 dmu_tx_t *tx; 1477 uint64_t object; 1478 int err; 1479 1480 if (drro->drr_type == DMU_OT_NONE || 1481 !DMU_OT_IS_VALID(drro->drr_type) || 1482 !DMU_OT_IS_VALID(drro->drr_bonustype) || 1483 drro->drr_checksumtype >= ZIO_CHECKSUM_FUNCTIONS || 1484 drro->drr_compress >= ZIO_COMPRESS_FUNCTIONS || 1485 P2PHASE(drro->drr_blksz, SPA_MINBLOCKSIZE) || 1486 drro->drr_blksz < SPA_MINBLOCKSIZE || 1487 drro->drr_blksz > spa_maxblocksize(dmu_objset_spa(ra->os)) || 1488 drro->drr_bonuslen > 
DN_MAX_BONUSLEN) { 1489 return (SET_ERROR(EINVAL)); 1490 } 1491 1492 err = dmu_object_info(ra->os, drro->drr_object, &doi); 1493 1494 if (err != 0 && err != ENOENT) 1495 return (SET_ERROR(EINVAL)); 1496 object = err == 0 ? drro->drr_object : DMU_NEW_OBJECT; 1497 1498 /* 1499 * If we are losing blkptrs or changing the block size this must 1500 * be a new file instance. We must clear out the previous file 1501 * contents before we can change this type of metadata in the dnode. 1502 */ 1503 if (err == 0) { 1504 int nblkptr; 1505 1506 nblkptr = deduce_nblkptr(drro->drr_bonustype, 1507 drro->drr_bonuslen); 1508 1509 if (drro->drr_blksz != doi.doi_data_block_size || 1510 nblkptr < doi.doi_nblkptr) { 1511 err = dmu_free_long_range(ra->os, drro->drr_object, 1512 0, DMU_OBJECT_END); 1513 if (err != 0) 1514 return (SET_ERROR(EINVAL)); 1515 } 1516 } 1517 1518 tx = dmu_tx_create(ra->os); 1519 dmu_tx_hold_bonus(tx, object); 1520 err = dmu_tx_assign(tx, TXG_WAIT); 1521 if (err != 0) { 1522 dmu_tx_abort(tx); 1523 return (err); 1524 } 1525 1526 if (object == DMU_NEW_OBJECT) { 1527 /* currently free, want to be allocated */ 1528 err = dmu_object_claim(ra->os, drro->drr_object, 1529 drro->drr_type, drro->drr_blksz, 1530 drro->drr_bonustype, drro->drr_bonuslen, tx); 1531 } else if (drro->drr_type != doi.doi_type || 1532 drro->drr_blksz != doi.doi_data_block_size || 1533 drro->drr_bonustype != doi.doi_bonus_type || 1534 drro->drr_bonuslen != doi.doi_bonus_size) { 1535 /* currently allocated, but with different properties */ 1536 err = dmu_object_reclaim(ra->os, drro->drr_object, 1537 drro->drr_type, drro->drr_blksz, 1538 drro->drr_bonustype, drro->drr_bonuslen, tx); 1539 } 1540 if (err != 0) { 1541 dmu_tx_commit(tx); 1542 return (SET_ERROR(EINVAL)); 1543 } 1544 1545 dmu_object_set_checksum(ra->os, drro->drr_object, 1546 drro->drr_checksumtype, tx); 1547 dmu_object_set_compress(ra->os, drro->drr_object, 1548 drro->drr_compress, tx); 1549 1550 if (data != NULL) { 1551 dmu_buf_t *db; 1552 1553 VERIFY0(dmu_bonus_hold(ra->os, drro->drr_object, FTAG, &db)); 1554 dmu_buf_will_dirty(db, tx); 1555 1556 ASSERT3U(db->db_size, >=, drro->drr_bonuslen); 1557 bcopy(data, db->db_data, drro->drr_bonuslen); 1558 if (ra->byteswap) { 1559 dmu_object_byteswap_t byteswap = 1560 DMU_OT_BYTESWAP(drro->drr_bonustype); 1561 dmu_ot_byteswap[byteswap].ob_func(db->db_data, 1562 drro->drr_bonuslen); 1563 } 1564 dmu_buf_rele(db, FTAG); 1565 } 1566 dmu_tx_commit(tx); 1567 return (0); 1568 } 1569 1570 /* ARGSUSED */ 1571 static int 1572 restore_freeobjects(struct restorearg *ra, 1573 struct drr_freeobjects *drrfo) 1574 { 1575 uint64_t obj; 1576 1577 if (drrfo->drr_firstobj + drrfo->drr_numobjs < drrfo->drr_firstobj) 1578 return (SET_ERROR(EINVAL)); 1579 1580 for (obj = drrfo->drr_firstobj; 1581 obj < drrfo->drr_firstobj + drrfo->drr_numobjs; 1582 (void) dmu_object_next(ra->os, &obj, FALSE, 0)) { 1583 int err; 1584 1585 if (dmu_object_info(ra->os, obj, NULL) != 0) 1586 continue; 1587 1588 err = dmu_free_long_object(ra->os, obj); 1589 if (err != 0) 1590 return (err); 1591 } 1592 return (0); 1593 } 1594 1595 static int 1596 restore_write(struct restorearg *ra, struct drr_write *drrw, arc_buf_t *abuf) 1597 { 1598 dmu_tx_t *tx; 1599 int err; 1600 1601 if (drrw->drr_offset + drrw->drr_length < drrw->drr_offset || 1602 !DMU_OT_IS_VALID(drrw->drr_type)) 1603 return (SET_ERROR(EINVAL)); 1604 1605 if (dmu_object_info(ra->os, drrw->drr_object, NULL) != 0) 1606 return (SET_ERROR(EINVAL)); 1607 1608 tx = dmu_tx_create(ra->os); 1609 1610 
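	/*
	 * Reserve the target range in this tx; the loaned arc buffer
	 * holding the payload (abuf) is assigned directly to the dbuf via
	 * dmu_assign_arcbuf() below, so on success this function consumes
	 * abuf and the caller must not return it to the ARC.
	 */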
dmu_tx_hold_write(tx, drrw->drr_object, 1611 drrw->drr_offset, drrw->drr_length); 1612 err = dmu_tx_assign(tx, TXG_WAIT); 1613 if (err != 0) { 1614 dmu_tx_abort(tx); 1615 return (err); 1616 } 1617 if (ra->byteswap) { 1618 dmu_object_byteswap_t byteswap = 1619 DMU_OT_BYTESWAP(drrw->drr_type); 1620 dmu_ot_byteswap[byteswap].ob_func(abuf->b_data, 1621 drrw->drr_length); 1622 } 1623 1624 dmu_buf_t *bonus; 1625 if (dmu_bonus_hold(ra->os, drrw->drr_object, FTAG, &bonus) != 0) 1626 return (SET_ERROR(EINVAL)); 1627 dmu_assign_arcbuf(bonus, drrw->drr_offset, abuf, tx); 1628 dmu_tx_commit(tx); 1629 dmu_buf_rele(bonus, FTAG); 1630 return (0); 1631 } 1632 1633 /* 1634 * Handle a DRR_WRITE_BYREF record. This record is used in dedup'ed 1635 * streams to refer to a copy of the data that is already on the 1636 * system because it came in earlier in the stream. This function 1637 * finds the earlier copy of the data, and uses that copy instead of 1638 * data from the stream to fulfill this write. 1639 */ 1640 static int 1641 restore_write_byref(struct restorearg *ra, struct drr_write_byref *drrwbr) 1642 { 1643 dmu_tx_t *tx; 1644 int err; 1645 guid_map_entry_t gmesrch; 1646 guid_map_entry_t *gmep; 1647 avl_index_t where; 1648 objset_t *ref_os = NULL; 1649 dmu_buf_t *dbp; 1650 1651 if (drrwbr->drr_offset + drrwbr->drr_length < drrwbr->drr_offset) 1652 return (SET_ERROR(EINVAL)); 1653 1654 /* 1655 * If the GUID of the referenced dataset is different from the 1656 * GUID of the target dataset, find the referenced dataset. 1657 */ 1658 if (drrwbr->drr_toguid != drrwbr->drr_refguid) { 1659 gmesrch.guid = drrwbr->drr_refguid; 1660 if ((gmep = avl_find(ra->guid_to_ds_map, &gmesrch, 1661 &where)) == NULL) { 1662 return (SET_ERROR(EINVAL)); 1663 } 1664 if (dmu_objset_from_ds(gmep->gme_ds, &ref_os)) 1665 return (SET_ERROR(EINVAL)); 1666 } else { 1667 ref_os = ra->os; 1668 } 1669 1670 err = dmu_buf_hold(ref_os, drrwbr->drr_refobject, 1671 drrwbr->drr_refoffset, FTAG, &dbp, DMU_READ_PREFETCH); 1672 if (err != 0) 1673 return (err); 1674 1675 tx = dmu_tx_create(ra->os); 1676 1677 dmu_tx_hold_write(tx, drrwbr->drr_object, 1678 drrwbr->drr_offset, drrwbr->drr_length); 1679 err = dmu_tx_assign(tx, TXG_WAIT); 1680 if (err != 0) { 1681 dmu_tx_abort(tx); 1682 return (err); 1683 } 1684 dmu_write(ra->os, drrwbr->drr_object, 1685 drrwbr->drr_offset, drrwbr->drr_length, dbp->db_data, tx); 1686 dmu_buf_rele(dbp, FTAG); 1687 dmu_tx_commit(tx); 1688 return (0); 1689 } 1690 1691 static int 1692 restore_write_embedded(struct restorearg *ra, 1693 struct drr_write_embedded *drrwnp, void *data) 1694 { 1695 dmu_tx_t *tx; 1696 int err; 1697 1698 if (drrwnp->drr_offset + drrwnp->drr_length < drrwnp->drr_offset) 1699 return (EINVAL); 1700 1701 if (drrwnp->drr_psize > BPE_PAYLOAD_SIZE) 1702 return (EINVAL); 1703 1704 if (drrwnp->drr_etype >= NUM_BP_EMBEDDED_TYPES) 1705 return (EINVAL); 1706 if (drrwnp->drr_compression >= ZIO_COMPRESS_FUNCTIONS) 1707 return (EINVAL); 1708 1709 tx = dmu_tx_create(ra->os); 1710 1711 dmu_tx_hold_write(tx, drrwnp->drr_object, 1712 drrwnp->drr_offset, drrwnp->drr_length); 1713 err = dmu_tx_assign(tx, TXG_WAIT); 1714 if (err != 0) { 1715 dmu_tx_abort(tx); 1716 return (err); 1717 } 1718 1719 dmu_write_embedded(ra->os, drrwnp->drr_object, 1720 drrwnp->drr_offset, data, drrwnp->drr_etype, 1721 drrwnp->drr_compression, drrwnp->drr_lsize, drrwnp->drr_psize, 1722 ra->byteswap ^ ZFS_HOST_BYTEORDER, tx); 1723 1724 dmu_tx_commit(tx); 1725 return (0); 1726 } 1727 1728 static int 1729 restore_spill(struct restorearg *ra, struct 
drr_spill *drrs, void *data) 1730 { 1731 dmu_tx_t *tx; 1732 dmu_buf_t *db, *db_spill; 1733 int err; 1734 1735 if (drrs->drr_length < SPA_MINBLOCKSIZE || 1736 drrs->drr_length > spa_maxblocksize(dmu_objset_spa(ra->os))) 1737 return (SET_ERROR(EINVAL)); 1738 1739 if (dmu_object_info(ra->os, drrs->drr_object, NULL) != 0) 1740 return (SET_ERROR(EINVAL)); 1741 1742 VERIFY0(dmu_bonus_hold(ra->os, drrs->drr_object, FTAG, &db)); 1743 if ((err = dmu_spill_hold_by_bonus(db, FTAG, &db_spill)) != 0) { 1744 dmu_buf_rele(db, FTAG); 1745 return (err); 1746 } 1747 1748 tx = dmu_tx_create(ra->os); 1749 1750 dmu_tx_hold_spill(tx, db->db_object); 1751 1752 err = dmu_tx_assign(tx, TXG_WAIT); 1753 if (err != 0) { 1754 dmu_buf_rele(db, FTAG); 1755 dmu_buf_rele(db_spill, FTAG); 1756 dmu_tx_abort(tx); 1757 return (err); 1758 } 1759 dmu_buf_will_dirty(db_spill, tx); 1760 1761 if (db_spill->db_size < drrs->drr_length) 1762 VERIFY(0 == dbuf_spill_set_blksz(db_spill, 1763 drrs->drr_length, tx)); 1764 bcopy(data, db_spill->db_data, drrs->drr_length); 1765 1766 dmu_buf_rele(db, FTAG); 1767 dmu_buf_rele(db_spill, FTAG); 1768 1769 dmu_tx_commit(tx); 1770 return (0); 1771 } 1772 1773 /* ARGSUSED */ 1774 static int 1775 restore_free(struct restorearg *ra, struct drr_free *drrf) 1776 { 1777 int err; 1778 uint64_t length = drrf->drr_length; 1779 uint64_t offset = drrf->drr_offset; 1780 1781 if (length != -1ULL && offset + length < offset) 1782 length = -1ULL; 1783 1784 if (dmu_object_info(ra->os, drrf->drr_object, NULL) != 0) 1785 return (SET_ERROR(EINVAL)); 1786 1787 err = dmu_free_long_range(ra->os, drrf->drr_object, 1788 drrf->drr_offset, drrf->drr_length); 1789 return (err); 1790 } 1791 1792 /* used to destroy the drc_ds on error */ 1793 static void 1794 dmu_recv_cleanup_ds(dmu_recv_cookie_t *drc) 1795 { 1796 char name[MAXNAMELEN]; 1797 dsl_dataset_name(drc->drc_ds, name); 1798 dsl_dataset_disown(drc->drc_ds, dmu_recv_tag); 1799 (void) dsl_destroy_head(name); 1800 } 1801 1802 static void 1803 restore_cksum(struct restorearg *ra, int len, void *buf) 1804 { 1805 if (ra->byteswap) { 1806 fletcher_4_incremental_byteswap(buf, len, &ra->cksum); 1807 } else { 1808 fletcher_4_incremental_native(buf, len, &ra->cksum); 1809 } 1810 } 1811 1812 /* 1813 * If len != 0, read payload into buf. 1814 * Read next record's header into ra->next_drr. 1815 * Verify checksum of payload and next record. 1816 */ 1817 static int 1818 restore_read_payload_and_next_header(struct restorearg *ra, int len, void *buf) 1819 { 1820 int err; 1821 1822 if (len != 0) { 1823 ASSERT3U(len, <=, ra->bufsize); 1824 err = restore_read(ra, len, buf); 1825 if (err != 0) 1826 return (err); 1827 restore_cksum(ra, len, buf); 1828 } 1829 1830 ra->prev_cksum = ra->cksum; 1831 1832 err = restore_read(ra, sizeof (*ra->next_drr), ra->next_drr); 1833 if (err != 0) 1834 return (err); 1835 if (ra->next_drr->drr_type == DRR_BEGIN) 1836 return (SET_ERROR(EINVAL)); 1837 1838 /* 1839 * Note: checksum is of everything up to but not including the 1840 * checksum itself. 
1841 */ 1842 ASSERT3U(offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum), 1843 ==, sizeof (dmu_replay_record_t) - sizeof (zio_cksum_t)); 1844 restore_cksum(ra, 1845 offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum), 1846 ra->next_drr); 1847 1848 zio_cksum_t cksum_orig = ra->next_drr->drr_u.drr_checksum.drr_checksum; 1849 zio_cksum_t *cksump = &ra->next_drr->drr_u.drr_checksum.drr_checksum; 1850 1851 if (ra->byteswap) 1852 byteswap_record(ra->next_drr); 1853 1854 if ((!ZIO_CHECKSUM_IS_ZERO(cksump)) && 1855 !ZIO_CHECKSUM_EQUAL(ra->cksum, *cksump)) 1856 return (SET_ERROR(ECKSUM)); 1857 1858 restore_cksum(ra, sizeof (cksum_orig), &cksum_orig); 1859 1860 return (0); 1861 } 1862 1863 static int 1864 restore_process_record(struct restorearg *ra) 1865 { 1866 int err; 1867 1868 switch (ra->drr->drr_type) { 1869 case DRR_OBJECT: 1870 { 1871 struct drr_object *drro = &ra->drr->drr_u.drr_object; 1872 err = restore_read_payload_and_next_header(ra, 1873 P2ROUNDUP(drro->drr_bonuslen, 8), ra->buf); 1874 if (err != 0) 1875 return (err); 1876 return (restore_object(ra, drro, ra->buf)); 1877 } 1878 case DRR_FREEOBJECTS: 1879 { 1880 struct drr_freeobjects *drrfo = 1881 &ra->drr->drr_u.drr_freeobjects; 1882 err = restore_read_payload_and_next_header(ra, 0, NULL); 1883 if (err != 0) 1884 return (err); 1885 return (restore_freeobjects(ra, drrfo)); 1886 } 1887 case DRR_WRITE: 1888 { 1889 struct drr_write *drrw = &ra->drr->drr_u.drr_write; 1890 arc_buf_t *abuf = arc_loan_buf(dmu_objset_spa(ra->os), 1891 drrw->drr_length); 1892 1893 err = restore_read_payload_and_next_header(ra, 1894 drrw->drr_length, abuf->b_data); 1895 if (err != 0) 1896 return (err); 1897 err = restore_write(ra, drrw, abuf); 1898 /* if restore_write() is successful, it consumes the arc_buf */ 1899 if (err != 0) 1900 dmu_return_arcbuf(abuf); 1901 return (err); 1902 } 1903 case DRR_WRITE_BYREF: 1904 { 1905 struct drr_write_byref *drrwbr = 1906 &ra->drr->drr_u.drr_write_byref; 1907 err = restore_read_payload_and_next_header(ra, 0, NULL); 1908 if (err != 0) 1909 return (err); 1910 return (restore_write_byref(ra, drrwbr)); 1911 } 1912 case DRR_WRITE_EMBEDDED: 1913 { 1914 struct drr_write_embedded *drrwe = 1915 &ra->drr->drr_u.drr_write_embedded; 1916 err = restore_read_payload_and_next_header(ra, 1917 P2ROUNDUP(drrwe->drr_psize, 8), ra->buf); 1918 if (err != 0) 1919 return (err); 1920 return (restore_write_embedded(ra, drrwe, ra->buf)); 1921 } 1922 case DRR_FREE: 1923 { 1924 struct drr_free *drrf = &ra->drr->drr_u.drr_free; 1925 err = restore_read_payload_and_next_header(ra, 0, NULL); 1926 if (err != 0) 1927 return (err); 1928 return (restore_free(ra, drrf)); 1929 } 1930 case DRR_END: 1931 { 1932 struct drr_end *drre = &ra->drr->drr_u.drr_end; 1933 if (!ZIO_CHECKSUM_EQUAL(ra->prev_cksum, drre->drr_checksum)) 1934 return (SET_ERROR(EINVAL)); 1935 return (0); 1936 } 1937 case DRR_SPILL: 1938 { 1939 struct drr_spill *drrs = &ra->drr->drr_u.drr_spill; 1940 err = restore_read_payload_and_next_header(ra, 1941 drrs->drr_length, ra->buf); 1942 if (err != 0) 1943 return (err); 1944 return (restore_spill(ra, drrs, ra->buf)); 1945 } 1946 default: 1947 return (SET_ERROR(EINVAL)); 1948 } 1949 } 1950 1951 /* 1952 * NB: callers *must* call dmu_recv_end() if this succeeds. 
/*
 * NB: callers *must* call dmu_recv_end() if this succeeds.
 */
int
dmu_recv_stream(dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp,
    int cleanup_fd, uint64_t *action_handlep)
{
	int err = 0;
	struct restorearg ra = { 0 };
	int featureflags;

	ra.byteswap = drc->drc_byteswap;
	ra.cksum = drc->drc_cksum;
	ra.vp = vp;
	ra.voff = *voffp;
	ra.bufsize = SPA_MAXBLOCKSIZE;
	ra.drr = kmem_alloc(sizeof (*ra.drr), KM_SLEEP);
	ra.buf = kmem_alloc(ra.bufsize, KM_SLEEP);
	ra.next_drr = kmem_alloc(sizeof (*ra.next_drr), KM_SLEEP);

	/* these were verified in dmu_recv_begin */
	ASSERT3U(DMU_GET_STREAM_HDRTYPE(drc->drc_drrb->drr_versioninfo), ==,
	    DMU_SUBSTREAM);
	ASSERT3U(drc->drc_drrb->drr_type, <, DMU_OST_NUMTYPES);

	/*
	 * Open the objset we are modifying.
	 */
	VERIFY0(dmu_objset_from_ds(drc->drc_ds, &ra.os));

	ASSERT(dsl_dataset_phys(drc->drc_ds)->ds_flags & DS_FLAG_INCONSISTENT);

	featureflags = DMU_GET_FEATUREFLAGS(drc->drc_drrb->drr_versioninfo);

	/* if this stream is dedup'ed, set up the avl tree for guid mapping */
	if (featureflags & DMU_BACKUP_FEATURE_DEDUP) {
		minor_t minor;

		if (cleanup_fd == -1) {
			err = SET_ERROR(EBADF);
			goto out;
		}
		err = zfs_onexit_fd_hold(cleanup_fd, &minor);
		if (err != 0) {
			cleanup_fd = -1;
			goto out;
		}

		if (*action_handlep == 0) {
			ra.guid_to_ds_map =
			    kmem_alloc(sizeof (avl_tree_t), KM_SLEEP);
			avl_create(ra.guid_to_ds_map, guid_compare,
			    sizeof (guid_map_entry_t),
			    offsetof(guid_map_entry_t, avlnode));
			err = zfs_onexit_add_cb(minor,
			    free_guid_map_onexit, ra.guid_to_ds_map,
			    action_handlep);
			if (err != 0)
				goto out;
		} else {
			err = zfs_onexit_cb_data(minor, *action_handlep,
			    (void **)&ra.guid_to_ds_map);
			if (err != 0)
				goto out;
		}

		drc->drc_guid_to_ds_map = ra.guid_to_ds_map;
	}

	err = restore_read_payload_and_next_header(&ra, 0, NULL);
	if (err != 0)
		goto out;
	for (;;) {
		void *tmp;

		if (issig(JUSTLOOKING) && issig(FORREAL)) {
			err = SET_ERROR(EINTR);
			break;
		}

		tmp = ra.next_drr;
		ra.next_drr = ra.drr;
		ra.drr = tmp;

		/* process ra.drr, read in ra.next_drr */
		err = restore_process_record(&ra);
		if (err != 0)
			break;
		if (ra.drr->drr_type == DRR_END)
			break;
	}

out:
	if ((featureflags & DMU_BACKUP_FEATURE_DEDUP) && (cleanup_fd != -1))
		zfs_onexit_fd_rele(cleanup_fd);

	if (err != 0) {
		/*
		 * destroy what we created, so we don't leave it in the
		 * inconsistent restoring state.
		 */
		dmu_recv_cleanup_ds(drc);
	}

	kmem_free(ra.drr, sizeof (*ra.drr));
	kmem_free(ra.buf, ra.bufsize);
	kmem_free(ra.next_drr, sizeof (*ra.next_drr));
	*voffp = ra.voff;
	return (err);
}
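/*
 * Typical call sequence (sketch only; see zfs_ioc_recv() for the real
 * caller, and note that the dmu_recv_begin() argument list is elided):
 *
 *	dmu_recv_cookie_t drc;
 *
 *	err = dmu_recv_begin(tofs, tosnap, ..., &drc);
 *	if (err == 0)
 *		err = dmu_recv_stream(&drc, vp, &off, cleanup_fd, &handle);
 *	if (err == 0)
 *		err = dmu_recv_end(&drc, owner);
 *
 * If dmu_recv_stream() fails it tears down the partially received dataset
 * itself; if it succeeds, the caller must invoke dmu_recv_end() to commit
 * (or clean up) the receive.
 */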
static int
dmu_recv_end_check(void *arg, dmu_tx_t *tx)
{
	dmu_recv_cookie_t *drc = arg;
	dsl_pool_t *dp = dmu_tx_pool(tx);
	int error;

	ASSERT3P(drc->drc_ds->ds_owner, ==, dmu_recv_tag);

	if (!drc->drc_newfs) {
		dsl_dataset_t *origin_head;

		error = dsl_dataset_hold(dp, drc->drc_tofs, FTAG, &origin_head);
		if (error != 0)
			return (error);
		if (drc->drc_force) {
			/*
			 * We will destroy any snapshots in tofs (i.e. before
			 * origin_head) that are after the origin (which is
			 * the snap before drc_ds, because drc_ds can not
			 * have any snaps of its own).
			 */
			uint64_t obj;

			obj = dsl_dataset_phys(origin_head)->ds_prev_snap_obj;
			while (obj !=
			    dsl_dataset_phys(drc->drc_ds)->ds_prev_snap_obj) {
				dsl_dataset_t *snap;
				error = dsl_dataset_hold_obj(dp, obj, FTAG,
				    &snap);
				if (error != 0)
					break;
				if (snap->ds_dir != origin_head->ds_dir)
					error = SET_ERROR(EINVAL);
				if (error == 0) {
					error = dsl_destroy_snapshot_check_impl(
					    snap, B_FALSE);
				}
				obj = dsl_dataset_phys(snap)->ds_prev_snap_obj;
				dsl_dataset_rele(snap, FTAG);
				if (error != 0)
					break;
			}
			if (error != 0) {
				dsl_dataset_rele(origin_head, FTAG);
				return (error);
			}
		}
		error = dsl_dataset_clone_swap_check_impl(drc->drc_ds,
		    origin_head, drc->drc_force, drc->drc_owner, tx);
		if (error != 0) {
			dsl_dataset_rele(origin_head, FTAG);
			return (error);
		}
		error = dsl_dataset_snapshot_check_impl(origin_head,
		    drc->drc_tosnap, tx, B_TRUE, 1, drc->drc_cred);
		dsl_dataset_rele(origin_head, FTAG);
		if (error != 0)
			return (error);

		error = dsl_destroy_head_check_impl(drc->drc_ds, 1);
	} else {
		error = dsl_dataset_snapshot_check_impl(drc->drc_ds,
		    drc->drc_tosnap, tx, B_TRUE, 1, drc->drc_cred);
	}
	return (error);
}
static void
dmu_recv_end_sync(void *arg, dmu_tx_t *tx)
{
	dmu_recv_cookie_t *drc = arg;
	dsl_pool_t *dp = dmu_tx_pool(tx);

	spa_history_log_internal_ds(drc->drc_ds, "finish receiving",
	    tx, "snap=%s", drc->drc_tosnap);

	if (!drc->drc_newfs) {
		dsl_dataset_t *origin_head;

		VERIFY0(dsl_dataset_hold(dp, drc->drc_tofs, FTAG,
		    &origin_head));

		if (drc->drc_force) {
			/*
			 * Destroy any snapshots of drc_tofs (origin_head)
			 * after the origin (the snap before drc_ds).
			 */
			uint64_t obj;

			obj = dsl_dataset_phys(origin_head)->ds_prev_snap_obj;
			while (obj !=
			    dsl_dataset_phys(drc->drc_ds)->ds_prev_snap_obj) {
				dsl_dataset_t *snap;
				VERIFY0(dsl_dataset_hold_obj(dp, obj, FTAG,
				    &snap));
				ASSERT3P(snap->ds_dir, ==, origin_head->ds_dir);
				obj = dsl_dataset_phys(snap)->ds_prev_snap_obj;
				dsl_destroy_snapshot_sync_impl(snap,
				    B_FALSE, tx);
				dsl_dataset_rele(snap, FTAG);
			}
		}
		VERIFY3P(drc->drc_ds->ds_prev, ==,
		    origin_head->ds_prev);

		dsl_dataset_clone_swap_sync_impl(drc->drc_ds,
		    origin_head, tx);
		dsl_dataset_snapshot_sync_impl(origin_head,
		    drc->drc_tosnap, tx);

		/* set snapshot's creation time and guid */
		dmu_buf_will_dirty(origin_head->ds_prev->ds_dbuf, tx);
		dsl_dataset_phys(origin_head->ds_prev)->ds_creation_time =
		    drc->drc_drrb->drr_creation_time;
		dsl_dataset_phys(origin_head->ds_prev)->ds_guid =
		    drc->drc_drrb->drr_toguid;
		dsl_dataset_phys(origin_head->ds_prev)->ds_flags &=
		    ~DS_FLAG_INCONSISTENT;

		dmu_buf_will_dirty(origin_head->ds_dbuf, tx);
		dsl_dataset_phys(origin_head)->ds_flags &=
		    ~DS_FLAG_INCONSISTENT;

		dsl_dataset_rele(origin_head, FTAG);
		dsl_destroy_head_sync_impl(drc->drc_ds, tx);

		if (drc->drc_owner != NULL)
			VERIFY3P(origin_head->ds_owner, ==, drc->drc_owner);
	} else {
		dsl_dataset_t *ds = drc->drc_ds;

		dsl_dataset_snapshot_sync_impl(ds, drc->drc_tosnap, tx);

		/* set snapshot's creation time and guid */
		dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
		dsl_dataset_phys(ds->ds_prev)->ds_creation_time =
		    drc->drc_drrb->drr_creation_time;
		dsl_dataset_phys(ds->ds_prev)->ds_guid =
		    drc->drc_drrb->drr_toguid;
		dsl_dataset_phys(ds->ds_prev)->ds_flags &=
		    ~DS_FLAG_INCONSISTENT;

		dmu_buf_will_dirty(ds->ds_dbuf, tx);
		dsl_dataset_phys(ds)->ds_flags &= ~DS_FLAG_INCONSISTENT;
	}
	drc->drc_newsnapobj = dsl_dataset_phys(drc->drc_ds)->ds_prev_snap_obj;
	/*
	 * Release the hold from dmu_recv_begin.  This must be done before
	 * we return to open context, so that when we free the dataset's dnode,
	 * we can evict its bonus buffer.
	 */
	dsl_dataset_disown(drc->drc_ds, dmu_recv_tag);
	drc->drc_ds = NULL;
}

static int
add_ds_to_guidmap(const char *name, avl_tree_t *guid_map, uint64_t snapobj)
{
	dsl_pool_t *dp;
	dsl_dataset_t *snapds;
	guid_map_entry_t *gmep;
	int err;

	ASSERT(guid_map != NULL);

	err = dsl_pool_hold(name, FTAG, &dp);
	if (err != 0)
		return (err);
	gmep = kmem_alloc(sizeof (*gmep), KM_SLEEP);
	err = dsl_dataset_hold_obj(dp, snapobj, gmep, &snapds);
	if (err == 0) {
		gmep->guid = dsl_dataset_phys(snapds)->ds_guid;
		gmep->gme_ds = snapds;
		avl_add(guid_map, gmep);
		dsl_dataset_long_hold(snapds, gmep);
	} else {
		kmem_free(gmep, sizeof (*gmep));
	}

	dsl_pool_rele(dp, FTAG);
	return (err);
}
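/*
 * The map populated by add_ds_to_guidmap() is keyed by the snapshot's
 * ds_guid and consumed when a deduplicated stream replays a DRR_WRITE_BYREF
 * record; restore_write_byref() (earlier in this file) looks the referenced
 * snapshot up roughly like this (sketch only):
 *
 *	guid_map_entry_t gmesrch, *gmep;
 *	avl_index_t where;
 *
 *	gmesrch.guid = drrwbr->drr_refguid;
 *	gmep = avl_find(ra->guid_to_ds_map, &gmesrch, &where);
 *	if (gmep == NULL)
 *		return (SET_ERROR(EINVAL));
 *	then the referenced block is read from gmep->gme_ds.
 */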
static int dmu_recv_end_modified_blocks = 3;

static int
dmu_recv_existing_end(dmu_recv_cookie_t *drc)
{
	int error;
	char name[MAXNAMELEN];

#ifdef _KERNEL
	/*
	 * We will be destroying the ds; make sure its origin is unmounted if
	 * necessary.
	 */
	dsl_dataset_name(drc->drc_ds, name);
	zfs_destroy_unmount_origin(name);
#endif

	error = dsl_sync_task(drc->drc_tofs,
	    dmu_recv_end_check, dmu_recv_end_sync, drc,
	    dmu_recv_end_modified_blocks, ZFS_SPACE_CHECK_NORMAL);

	if (error != 0)
		dmu_recv_cleanup_ds(drc);
	return (error);
}

static int
dmu_recv_new_end(dmu_recv_cookie_t *drc)
{
	int error;

	error = dsl_sync_task(drc->drc_tofs,
	    dmu_recv_end_check, dmu_recv_end_sync, drc,
	    dmu_recv_end_modified_blocks, ZFS_SPACE_CHECK_NORMAL);

	if (error != 0) {
		dmu_recv_cleanup_ds(drc);
	} else if (drc->drc_guid_to_ds_map != NULL) {
		(void) add_ds_to_guidmap(drc->drc_tofs,
		    drc->drc_guid_to_ds_map,
		    drc->drc_newsnapobj);
	}
	return (error);
}

int
dmu_recv_end(dmu_recv_cookie_t *drc, void *owner)
{
	drc->drc_owner = owner;

	if (drc->drc_newfs)
		return (dmu_recv_new_end(drc));
	else
		return (dmu_recv_existing_end(drc));
}

/*
 * Return TRUE if this objset is currently being received into.
 */
boolean_t
dmu_objset_is_receiving(objset_t *os)
{
	return (os->os_dsl_dataset != NULL &&
	    os->os_dsl_dataset->ds_owner == dmu_recv_tag);
}
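/*
 * Example use (sketch): callers elsewhere in the DMU can consult this to
 * skip work that cannot be done safely while a dataset is still being
 * received into (and is therefore still marked inconsistent), e.g.
 *
 *	if (dmu_objset_is_receiving(os))
 *		return;
 *
 * i.e. defer the work until the receive completes and the dataset is
 * consistent again.
 */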