/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#include <sys/dmu.h>
#include <sys/dmu_impl.h>
#include <sys/dmu_tx.h>
#include <sys/dbuf.h>
#include <sys/dnode.h>
#include <sys/zfs_context.h>
#include <sys/dmu_objset.h>
#include <sys/dmu_traverse.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_dir.h>
#include <sys/dsl_prop.h>
#include <sys/dsl_pool.h>
#include <sys/dsl_synctask.h>
#include <sys/zfs_ioctl.h>
#include <sys/zap.h>
#include <sys/zio_checksum.h>
#include <sys/avl.h>
#include <sys/ddt.h>

static char *dmu_recv_tag = "dmu_recv_tag";

/*
 * The list of data whose inclusion in a send stream can be pending from
 * one call to backup_cb to another.  Multiple calls to dump_free() and
 * dump_freeobjects() can be aggregated into a single DRR_FREE or
 * DRR_FREEOBJECTS replay record.
 */
typedef enum {
        PENDING_NONE,
        PENDING_FREE,
        PENDING_FREEOBJECTS
} pendop_t;

struct backuparg {
        dmu_replay_record_t *drr;
        vnode_t *vp;
        offset_t *off;
        objset_t *os;
        zio_cksum_t zc;
        uint64_t toguid;
        int err;
        pendop_t pending_op;
};

static int
dump_bytes(struct backuparg *ba, void *buf, int len)
{
        ssize_t resid; /* have to get resid to get detailed errno */
        ASSERT3U(len % 8, ==, 0);

        fletcher_4_incremental_native(buf, len, &ba->zc);
        ba->err = vn_rdwr(UIO_WRITE, ba->vp,
            (caddr_t)buf, len,
            0, UIO_SYSSPACE, FAPPEND, RLIM64_INFINITY, CRED(), &resid);
        *ba->off += len;
        return (ba->err);
}
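
/*
 * Worked example of the aggregation done by dump_free() below: two
 * calls such as dump_free(ba, 5, 0, 4096) followed by
 * dump_free(ba, 5, 4096, 4096) leave a single pending DRR_FREE record
 * covering object 5, offset 0, length 8192.  The pending record is
 * only pushed to the stream when a non-contiguous or
 * differently-typed record arrives, or when length is -1ULL (the
 * "free to end of object" case), which is written out immediately.
 */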

static int
dump_free(struct backuparg *ba, uint64_t object, uint64_t offset,
    uint64_t length)
{
        struct drr_free *drrf = &(ba->drr->drr_u.drr_free);

        /*
         * If there is a pending op, but it's not PENDING_FREE, push it out,
         * since free block aggregation can only be done for blocks of the
         * same type (i.e., DRR_FREE records can only be aggregated with
         * other DRR_FREE records; DRR_FREEOBJECTS records can only be
         * aggregated with other DRR_FREEOBJECTS records).
         */
        if (ba->pending_op != PENDING_NONE && ba->pending_op != PENDING_FREE) {
                if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)) != 0)
                        return (EINTR);
                ba->pending_op = PENDING_NONE;
        }

        if (ba->pending_op == PENDING_FREE) {
                /*
                 * There should never be a PENDING_FREE if length is -1
                 * (because dump_dnode is the only place where this
                 * function is called with a -1, and only after flushing
                 * any pending record).
                 */
                ASSERT(length != -1ULL);
                /*
                 * Check to see whether this free block can be aggregated
                 * with the pending one.
                 */
                if (drrf->drr_object == object && drrf->drr_offset +
                    drrf->drr_length == offset) {
                        drrf->drr_length += length;
                        return (0);
                } else {
                        /* not a continuation; push out the pending record */
                        if (dump_bytes(ba, ba->drr,
                            sizeof (dmu_replay_record_t)) != 0)
                                return (EINTR);
                        ba->pending_op = PENDING_NONE;
                }
        }
        /* create a FREE record and make it pending */
        bzero(ba->drr, sizeof (dmu_replay_record_t));
        ba->drr->drr_type = DRR_FREE;
        drrf->drr_object = object;
        drrf->drr_offset = offset;
        drrf->drr_length = length;
        drrf->drr_toguid = ba->toguid;
        if (length == -1ULL) {
                if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)) != 0)
                        return (EINTR);
        } else {
                ba->pending_op = PENDING_FREE;
        }

        return (0);
}

static int
dump_data(struct backuparg *ba, dmu_object_type_t type,
    uint64_t object, uint64_t offset, int blksz, const blkptr_t *bp, void *data)
{
        struct drr_write *drrw = &(ba->drr->drr_u.drr_write);

        /*
         * If there is any kind of pending aggregation (currently either
         * a grouping of free objects or free blocks), push it out to
         * the stream, since aggregation can't be done across operations
         * of different types.
         */
        if (ba->pending_op != PENDING_NONE) {
                if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)) != 0)
                        return (EINTR);
                ba->pending_op = PENDING_NONE;
        }
        /* write a DATA record */
        bzero(ba->drr, sizeof (dmu_replay_record_t));
        ba->drr->drr_type = DRR_WRITE;
        drrw->drr_object = object;
        drrw->drr_type = type;
        drrw->drr_offset = offset;
        drrw->drr_length = blksz;
        drrw->drr_toguid = ba->toguid;
        drrw->drr_checksumtype = BP_GET_CHECKSUM(bp);
        if (zio_checksum_table[drrw->drr_checksumtype].ci_dedup)
                drrw->drr_checksumflags |= DRR_CHECKSUM_DEDUP;
        DDK_SET_LSIZE(&drrw->drr_key, BP_GET_LSIZE(bp));
        DDK_SET_PSIZE(&drrw->drr_key, BP_GET_PSIZE(bp));
        DDK_SET_COMPRESS(&drrw->drr_key, BP_GET_COMPRESS(bp));
        drrw->drr_key.ddk_cksum = bp->blk_cksum;

        if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)) != 0)
                return (EINTR);
        if (dump_bytes(ba, data, blksz) != 0)
                return (EINTR);
        return (0);
}
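
/*
 * Note on the dedup fields set above: when the block's checksum
 * function is dedup-capable (ci_dedup), the DRR_WRITE record is
 * flagged DRR_CHECKSUM_DEDUP and carries the block's on-disk
 * checksum and sizes in drr_key.  A dedup-aware consumer of the
 * stream (presumably the stream-dedup logic, which lives outside
 * this file) can then match identical blocks by that key and emit
 * DRR_WRITE_BYREF records instead of resending the data; such
 * records are handled on receive by restore_write_byref() below.
 */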

static int
dump_freeobjects(struct backuparg *ba, uint64_t firstobj, uint64_t numobjs)
{
        struct drr_freeobjects *drrfo = &(ba->drr->drr_u.drr_freeobjects);

        /*
         * If there is a pending op, but it's not PENDING_FREEOBJECTS,
         * push it out, since free block aggregation can only be done for
         * blocks of the same type (i.e., DRR_FREE records can only be
         * aggregated with other DRR_FREE records; DRR_FREEOBJECTS records
         * can only be aggregated with other DRR_FREEOBJECTS records).
         */
        if (ba->pending_op != PENDING_NONE &&
            ba->pending_op != PENDING_FREEOBJECTS) {
                if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)) != 0)
                        return (EINTR);
                ba->pending_op = PENDING_NONE;
        }
        if (ba->pending_op == PENDING_FREEOBJECTS) {
                /*
                 * See whether this free object array can be aggregated
                 * with the pending one.
                 */
                if (drrfo->drr_firstobj + drrfo->drr_numobjs == firstobj) {
                        drrfo->drr_numobjs += numobjs;
                        return (0);
                } else {
                        /* can't be aggregated; push out the pending record */
                        if (dump_bytes(ba, ba->drr,
                            sizeof (dmu_replay_record_t)) != 0)
                                return (EINTR);
                        ba->pending_op = PENDING_NONE;
                }
        }

        /* write a FREEOBJECTS record */
        bzero(ba->drr, sizeof (dmu_replay_record_t));
        ba->drr->drr_type = DRR_FREEOBJECTS;
        drrfo->drr_firstobj = firstobj;
        drrfo->drr_numobjs = numobjs;
        drrfo->drr_toguid = ba->toguid;

        ba->pending_op = PENDING_FREEOBJECTS;

        return (0);
}

static int
dump_dnode(struct backuparg *ba, uint64_t object, dnode_phys_t *dnp)
{
        struct drr_object *drro = &(ba->drr->drr_u.drr_object);

        if (dnp == NULL || dnp->dn_type == DMU_OT_NONE)
                return (dump_freeobjects(ba, object, 1));

        if (ba->pending_op != PENDING_NONE) {
                if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)) != 0)
                        return (EINTR);
                ba->pending_op = PENDING_NONE;
        }

        /* write an OBJECT record */
        bzero(ba->drr, sizeof (dmu_replay_record_t));
        ba->drr->drr_type = DRR_OBJECT;
        drro->drr_object = object;
        drro->drr_type = dnp->dn_type;
        drro->drr_bonustype = dnp->dn_bonustype;
        drro->drr_blksz = dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT;
        drro->drr_bonuslen = dnp->dn_bonuslen;
        drro->drr_checksumtype = dnp->dn_checksum;
        drro->drr_compress = dnp->dn_compress;
        drro->drr_toguid = ba->toguid;

        if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)) != 0)
                return (EINTR);

        if (dump_bytes(ba, DN_BONUS(dnp), P2ROUNDUP(dnp->dn_bonuslen, 8)) != 0)
                return (EINTR);

        /* free anything past the end of the file */
        if (dump_free(ba, object, (dnp->dn_maxblkid + 1) *
            (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT), -1ULL))
                return (EINTR);
        if (ba->err)
                return (EINTR);
        return (0);
}

#define BP_SPAN(dnp, level) \
        (((uint64_t)dnp->dn_datablkszsec) << (SPA_MINBLOCKSHIFT + \
        (level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT)))
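
/*
 * BP_SPAN() computes how many bytes of the object a single block
 * pointer at the given indirection level covers.  A worked example,
 * assuming the usual constants (SPA_MINBLOCKSHIFT == 9 and 128-byte
 * block pointers, i.e. SPA_BLKPTRSHIFT == 7): with 128K data blocks
 * (dn_datablkszsec == 256) and 16K indirect blocks
 * (dn_indblkshift == 14), each indirect level fans out by 128, so
 * BP_SPAN(dnp, 0) == 128K and BP_SPAN(dnp, 1) == 16M.
 */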

/* ARGSUSED */
static int
backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
    const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg)
{
        struct backuparg *ba = arg;
        dmu_object_type_t type = bp ? BP_GET_TYPE(bp) : DMU_OT_NONE;
        int err = 0;

        if (issig(JUSTLOOKING) && issig(FORREAL))
                return (EINTR);

        if (zb->zb_object != DMU_META_DNODE_OBJECT &&
            DMU_OBJECT_IS_SPECIAL(zb->zb_object)) {
                return (0);
        } else if (bp == NULL && zb->zb_object == DMU_META_DNODE_OBJECT) {
                uint64_t span = BP_SPAN(dnp, zb->zb_level);
                uint64_t dnobj = (zb->zb_blkid * span) >> DNODE_SHIFT;
                err = dump_freeobjects(ba, dnobj, span >> DNODE_SHIFT);
        } else if (bp == NULL) {
                uint64_t span = BP_SPAN(dnp, zb->zb_level);
                err = dump_free(ba, zb->zb_object, zb->zb_blkid * span, span);
        } else if (zb->zb_level > 0 || type == DMU_OT_OBJSET) {
                return (0);
        } else if (type == DMU_OT_DNODE) {
                dnode_phys_t *blk;
                int i;
                int blksz = BP_GET_LSIZE(bp);
                uint32_t aflags = ARC_WAIT;
                arc_buf_t *abuf;

                if (arc_read_nolock(NULL, spa, bp,
                    arc_getbuf_func, &abuf, ZIO_PRIORITY_ASYNC_READ,
                    ZIO_FLAG_CANFAIL, &aflags, zb) != 0)
                        return (EIO);

                blk = abuf->b_data;
                for (i = 0; i < blksz >> DNODE_SHIFT; i++) {
                        uint64_t dnobj = (zb->zb_blkid <<
                            (DNODE_BLOCK_SHIFT - DNODE_SHIFT)) + i;
                        err = dump_dnode(ba, dnobj, blk+i);
                        if (err)
                                break;
                }
                (void) arc_buf_remove_ref(abuf, &abuf);
        } else { /* it's a level-0 block of a regular object */
                uint32_t aflags = ARC_WAIT;
                arc_buf_t *abuf;
                int blksz = BP_GET_LSIZE(bp);

                if (arc_read_nolock(NULL, spa, bp,
                    arc_getbuf_func, &abuf, ZIO_PRIORITY_ASYNC_READ,
                    ZIO_FLAG_CANFAIL, &aflags, zb) != 0)
                        return (EIO);

                err = dump_data(ba, type, zb->zb_object, zb->zb_blkid * blksz,
                    blksz, bp, abuf->b_data);
                (void) arc_buf_remove_ref(abuf, &abuf);
        }

        ASSERT(err == 0 || err == EINTR);
        return (err);
}
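
/*
 * Overall layout of the stream written below: a DRR_BEGIN record
 * identifying the source snapshot(s), then the records generated by
 * backup_cb() as traverse_dataset() walks the snapshot (DRR_OBJECT,
 * DRR_WRITE, DRR_FREE, DRR_FREEOBJECTS), and finally a DRR_END record
 * carrying the running fletcher4 checksum accumulated in ba.zc by
 * dump_bytes().
 */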

int
dmu_sendbackup(objset_t *tosnap, objset_t *fromsnap, boolean_t fromorigin,
    vnode_t *vp, offset_t *off)
{
        dsl_dataset_t *ds = tosnap->os_dsl_dataset;
        dsl_dataset_t *fromds = fromsnap ? fromsnap->os_dsl_dataset : NULL;
        dmu_replay_record_t *drr;
        struct backuparg ba;
        int err;
        uint64_t fromtxg = 0;

        /* tosnap must be a snapshot */
        if (ds->ds_phys->ds_next_snap_obj == 0)
                return (EINVAL);

        /* fromsnap must be an earlier snapshot from the same fs as tosnap */
        if (fromds && (ds->ds_dir != fromds->ds_dir ||
            fromds->ds_phys->ds_creation_txg >= ds->ds_phys->ds_creation_txg))
                return (EXDEV);

        if (fromorigin) {
                dsl_pool_t *dp = ds->ds_dir->dd_pool;

                if (fromsnap)
                        return (EINVAL);

                if (dsl_dir_is_clone(ds->ds_dir)) {
                        rw_enter(&dp->dp_config_rwlock, RW_READER);
                        err = dsl_dataset_hold_obj(dp,
                            ds->ds_dir->dd_phys->dd_origin_obj, FTAG, &fromds);
                        rw_exit(&dp->dp_config_rwlock);
                        if (err)
                                return (err);
                } else {
                        fromorigin = B_FALSE;
                }
        }

        drr = kmem_zalloc(sizeof (dmu_replay_record_t), KM_SLEEP);
        drr->drr_type = DRR_BEGIN;
        drr->drr_u.drr_begin.drr_magic = DMU_BACKUP_MAGIC;
        DMU_SET_STREAM_HDRTYPE(drr->drr_u.drr_begin.drr_versioninfo,
            DMU_SUBSTREAM);
        drr->drr_u.drr_begin.drr_creation_time =
            ds->ds_phys->ds_creation_time;
        drr->drr_u.drr_begin.drr_type = tosnap->os_phys->os_type;
        if (fromorigin)
                drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_CLONE;
        drr->drr_u.drr_begin.drr_toguid = ds->ds_phys->ds_guid;
        if (ds->ds_phys->ds_flags & DS_FLAG_CI_DATASET)
                drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_CI_DATA;

        if (fromds)
                drr->drr_u.drr_begin.drr_fromguid = fromds->ds_phys->ds_guid;
        dsl_dataset_name(ds, drr->drr_u.drr_begin.drr_toname);

        if (fromds)
                fromtxg = fromds->ds_phys->ds_creation_txg;
        if (fromorigin)
                dsl_dataset_rele(fromds, FTAG);

        ba.drr = drr;
        ba.vp = vp;
        ba.os = tosnap;
        ba.off = off;
        ba.toguid = ds->ds_phys->ds_guid;
        ZIO_SET_CHECKSUM(&ba.zc, 0, 0, 0, 0);
        ba.pending_op = PENDING_NONE;

        if (dump_bytes(&ba, drr, sizeof (dmu_replay_record_t)) != 0) {
                kmem_free(drr, sizeof (dmu_replay_record_t));
                return (ba.err);
        }

        err = traverse_dataset(ds, fromtxg, TRAVERSE_PRE | TRAVERSE_PREFETCH,
            backup_cb, &ba);

        if (ba.pending_op != PENDING_NONE)
                if (dump_bytes(&ba, drr, sizeof (dmu_replay_record_t)) != 0)
                        err = EINTR;

        if (err) {
                if (err == EINTR && ba.err)
                        err = ba.err;
                kmem_free(drr, sizeof (dmu_replay_record_t));
                return (err);
        }

        bzero(drr, sizeof (dmu_replay_record_t));
        drr->drr_type = DRR_END;
        drr->drr_u.drr_end.drr_checksum = ba.zc;
        drr->drr_u.drr_end.drr_toguid = ba.toguid;

        if (dump_bytes(&ba, drr, sizeof (dmu_replay_record_t)) != 0) {
                kmem_free(drr, sizeof (dmu_replay_record_t));
                return (ba.err);
        }

        kmem_free(drr, sizeof (dmu_replay_record_t));

        return (0);
}

struct recvbeginsyncarg {
        const char *tofs;
        const char *tosnap;
        dsl_dataset_t *origin;
        uint64_t fromguid;
        dmu_objset_type_t type;
        void *tag;
        boolean_t force;
        uint64_t dsflags;
        char clonelastname[MAXNAMELEN];
        dsl_dataset_t *ds; /* the ds to recv into; returned from the syncfunc */
};
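
/*
 * The recv_*_check/recv_*_sync pairs below follow the usual
 * dsl_sync_task pattern: the check function validates the request in
 * syncing context and the sync function then applies it in the same
 * txg.  recv_new_* creates a brand-new dataset to receive into;
 * recv_existing_* creates a temporary clone of an existing
 * filesystem's most recent snapshot and receives into that.
 */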

/* ARGSUSED */
static int
recv_new_check(void *arg1, void *arg2, dmu_tx_t *tx)
{
        dsl_dir_t *dd = arg1;
        struct recvbeginsyncarg *rbsa = arg2;
        objset_t *mos = dd->dd_pool->dp_meta_objset;
        uint64_t val;
        int err;

        err = zap_lookup(mos, dd->dd_phys->dd_child_dir_zapobj,
            strrchr(rbsa->tofs, '/') + 1, sizeof (uint64_t), 1, &val);

        if (err != ENOENT)
                return (err ? err : EEXIST);

        if (rbsa->origin) {
                /* make sure it's a snap in the same pool */
                if (rbsa->origin->ds_dir->dd_pool != dd->dd_pool)
                        return (EXDEV);
                if (!dsl_dataset_is_snapshot(rbsa->origin))
                        return (EINVAL);
                if (rbsa->origin->ds_phys->ds_guid != rbsa->fromguid)
                        return (ENODEV);
        }

        return (0);
}

static void
recv_new_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
{
        dsl_dir_t *dd = arg1;
        struct recvbeginsyncarg *rbsa = arg2;
        uint64_t flags = DS_FLAG_INCONSISTENT | rbsa->dsflags;
        uint64_t dsobj;

        /* Create and open new dataset. */
        dsobj = dsl_dataset_create_sync(dd, strrchr(rbsa->tofs, '/') + 1,
            rbsa->origin, flags, cr, tx);
        VERIFY(0 == dsl_dataset_own_obj(dd->dd_pool, dsobj,
            B_TRUE, dmu_recv_tag, &rbsa->ds));

        if (rbsa->origin == NULL) {
                (void) dmu_objset_create_impl(dd->dd_pool->dp_spa,
                    rbsa->ds, &rbsa->ds->ds_phys->ds_bp, rbsa->type, tx);
        }

        spa_history_internal_log(LOG_DS_REPLAY_FULL_SYNC,
            dd->dd_pool->dp_spa, tx, cr, "dataset = %lld", dsobj);
}

/* ARGSUSED */
static int
recv_existing_check(void *arg1, void *arg2, dmu_tx_t *tx)
{
        dsl_dataset_t *ds = arg1;
        struct recvbeginsyncarg *rbsa = arg2;
        int err;
        uint64_t val;

        /* must not have any changes since most recent snapshot */
        if (!rbsa->force && dsl_dataset_modified_since_lastsnap(ds))
                return (ETXTBSY);

        if (rbsa->fromguid) {
                /* if incremental, most recent snapshot must match fromguid */
                if (ds->ds_prev == NULL)
                        return (ENODEV);

                /*
                 * most recent snapshot must match fromguid, or there are no
                 * changes since the fromguid one
                 */
                if (ds->ds_prev->ds_phys->ds_guid != rbsa->fromguid) {
                        uint64_t birth = ds->ds_prev->ds_phys->ds_bp.blk_birth;
                        uint64_t obj = ds->ds_prev->ds_phys->ds_prev_snap_obj;
                        while (obj != 0) {
                                dsl_dataset_t *snap;
                                err = dsl_dataset_hold_obj(ds->ds_dir->dd_pool,
                                    obj, FTAG, &snap);
                                if (err)
                                        return (ENODEV);
                                if (snap->ds_phys->ds_creation_txg < birth) {
                                        dsl_dataset_rele(snap, FTAG);
                                        return (ENODEV);
                                }
                                if (snap->ds_phys->ds_guid == rbsa->fromguid) {
                                        dsl_dataset_rele(snap, FTAG);
                                        break; /* it's ok */
                                }
                                obj = snap->ds_phys->ds_prev_snap_obj;
                                dsl_dataset_rele(snap, FTAG);
                        }
                        if (obj == 0)
                                return (ENODEV);
                }
        } else {
                /* if full, most recent snapshot must be $ORIGIN */
                if (ds->ds_phys->ds_prev_snap_txg >= TXG_INITIAL)
                        return (ENODEV);
        }

        /* temporary clone name must not exist */
        err = zap_lookup(ds->ds_dir->dd_pool->dp_meta_objset,
            ds->ds_dir->dd_phys->dd_child_dir_zapobj,
            rbsa->clonelastname, 8, 1, &val);
        if (err == 0)
                return (EEXIST);
        if (err != ENOENT)
                return (err);

        /* new snapshot name must not exist */
        err = zap_lookup(ds->ds_dir->dd_pool->dp_meta_objset,
            ds->ds_phys->ds_snapnames_zapobj, rbsa->tosnap, 8, 1, &val);
        if (err == 0)
                return (EEXIST);
        if (err != ENOENT)
                return (err);
        return (0);
}
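
/*
 * The temporary clone created below is a child of the target
 * filesystem named "%<tosnap>" (see the snprintf of clonelastname in
 * dmu_recv_begin()); the stream is restored into this clone, and
 * dmu_recv_existing_end() later swaps its contents into the real
 * dataset.
 */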

/* ARGSUSED */
static void
recv_existing_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
{
        dsl_dataset_t *ohds = arg1;
        struct recvbeginsyncarg *rbsa = arg2;
        dsl_pool_t *dp = ohds->ds_dir->dd_pool;
        dsl_dataset_t *cds;
        uint64_t flags = DS_FLAG_INCONSISTENT | rbsa->dsflags;
        uint64_t dsobj;

        /* create and open the temporary clone */
        dsobj = dsl_dataset_create_sync(ohds->ds_dir, rbsa->clonelastname,
            ohds->ds_prev, flags, cr, tx);
        VERIFY(0 == dsl_dataset_own_obj(dp, dsobj, B_TRUE, dmu_recv_tag, &cds));

        /*
         * If we actually created a non-clone, we need to create the
         * objset in our new dataset.
         */
        if (BP_IS_HOLE(dsl_dataset_get_blkptr(cds))) {
                (void) dmu_objset_create_impl(dp->dp_spa,
                    cds, dsl_dataset_get_blkptr(cds), rbsa->type, tx);
        }

        rbsa->ds = cds;

        spa_history_internal_log(LOG_DS_REPLAY_INC_SYNC,
            dp->dp_spa, tx, cr, "dataset = %lld", dsobj);
}
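
/*
 * dmu_recv_begin() below takes one of two paths.  If the target
 * filesystem already exists, it grabs ds_recvlock and sets up the
 * temporary "%<tosnap>" clone via recv_existing_check/sync; if the
 * target does not exist, it creates a new (possibly cloned) dataset
 * under the parent via recv_new_check/sync.  Either way, the dataset
 * that actually receives the stream is returned in drc->drc_real_ds,
 * marked DS_FLAG_INCONSISTENT until dmu_recv_end().
 */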

/*
 * NB: callers *MUST* call dmu_recv_stream() if dmu_recv_begin()
 * succeeds; otherwise we will leak the holds on the datasets.
 */
int
dmu_recv_begin(char *tofs, char *tosnap, char *top_ds, struct drr_begin *drrb,
    boolean_t force, objset_t *origin, dmu_recv_cookie_t *drc)
{
        int err = 0;
        boolean_t byteswap;
        struct recvbeginsyncarg rbsa = { 0 };
        uint64_t versioninfo;
        int flags;
        dsl_dataset_t *ds;

        if (drrb->drr_magic == DMU_BACKUP_MAGIC)
                byteswap = FALSE;
        else if (drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC))
                byteswap = TRUE;
        else
                return (EINVAL);

        rbsa.tofs = tofs;
        rbsa.tosnap = tosnap;
        rbsa.origin = origin ? origin->os_dsl_dataset : NULL;
        rbsa.fromguid = drrb->drr_fromguid;
        rbsa.type = drrb->drr_type;
        rbsa.tag = FTAG;
        rbsa.dsflags = 0;
        versioninfo = drrb->drr_versioninfo;
        flags = drrb->drr_flags;

        if (byteswap) {
                rbsa.type = BSWAP_32(rbsa.type);
                rbsa.fromguid = BSWAP_64(rbsa.fromguid);
                versioninfo = BSWAP_64(versioninfo);
                flags = BSWAP_32(flags);
        }

        if (DMU_GET_STREAM_HDRTYPE(versioninfo) == DMU_COMPOUNDSTREAM ||
            rbsa.type >= DMU_OST_NUMTYPES ||
            ((flags & DRR_FLAG_CLONE) && origin == NULL))
                return (EINVAL);

        if (flags & DRR_FLAG_CI_DATA)
                rbsa.dsflags = DS_FLAG_CI_DATASET;

        bzero(drc, sizeof (dmu_recv_cookie_t));
        drc->drc_drrb = drrb;
        drc->drc_tosnap = tosnap;
        drc->drc_top_ds = top_ds;
        drc->drc_force = force;

        /*
         * Process the begin in syncing context.
         */

        /* open the dataset we are logically receiving into */
        err = dsl_dataset_hold(tofs, dmu_recv_tag, &ds);
        if (err == 0) {
                /* target fs already exists; recv into temp clone */

                /* Can't recv a clone into an existing fs */
                if (flags & DRR_FLAG_CLONE) {
                        dsl_dataset_rele(ds, dmu_recv_tag);
                        return (EINVAL);
                }

                /* must not have an incremental recv already in progress */
                if (!mutex_tryenter(&ds->ds_recvlock)) {
                        dsl_dataset_rele(ds, dmu_recv_tag);
                        return (EBUSY);
                }

                /* tmp clone name is: tofs/%tosnap */
                (void) snprintf(rbsa.clonelastname, sizeof (rbsa.clonelastname),
                    "%%%s", tosnap);
                rbsa.force = force;
                err = dsl_sync_task_do(ds->ds_dir->dd_pool,
                    recv_existing_check, recv_existing_sync, ds, &rbsa, 5);
                if (err) {
                        mutex_exit(&ds->ds_recvlock);
                        dsl_dataset_rele(ds, dmu_recv_tag);
                        return (err);
                }
                drc->drc_logical_ds = ds;
                drc->drc_real_ds = rbsa.ds;
        } else if (err == ENOENT) {
                /* target fs does not exist; must be a full backup or clone */
                char *cp;

                /*
                 * If it's a non-clone incremental, we are missing the
                 * target fs, so fail the recv.
                 */
                if (rbsa.fromguid && !(flags & DRR_FLAG_CLONE))
                        return (ENOENT);

                /* Open the parent of tofs */
                cp = strrchr(tofs, '/');
                *cp = '\0';
                err = dsl_dataset_hold(tofs, FTAG, &ds);
                *cp = '/';
                if (err)
                        return (err);

                err = dsl_sync_task_do(ds->ds_dir->dd_pool,
                    recv_new_check, recv_new_sync, ds->ds_dir, &rbsa, 5);
                dsl_dataset_rele(ds, FTAG);
                if (err)
                        return (err);
                drc->drc_logical_ds = drc->drc_real_ds = rbsa.ds;
                drc->drc_newfs = B_TRUE;
        }

        return (err);
}

struct restorearg {
        int err;
        int byteswap;
        vnode_t *vp;
        char *buf;
        uint64_t voff;
        int bufsize; /* amount of memory allocated for buf */
        zio_cksum_t cksum;
        avl_tree_t guid_to_ds_map;
};

typedef struct guid_map_entry {
        uint64_t guid;
        dsl_dataset_t *gme_ds;
        avl_node_t avlnode;
} guid_map_entry_t;

static int
guid_compare(const void *arg1, const void *arg2)
{
        const guid_map_entry_t *gmep1 = arg1;
        const guid_map_entry_t *gmep2 = arg2;

        if (gmep1->guid < gmep2->guid)
                return (-1);
        else if (gmep1->guid > gmep2->guid)
                return (1);
        return (0);
}

/*
 * This function is a callback used by dmu_objset_find() (which
 * enumerates the object sets) to build an avl tree that maps guids
 * to datasets.  The resulting table is used when processing DRR_WRITE_BYREF
 * send stream records.  These records, which are used in dedup'ed
 * streams, do not contain data themselves, but refer to a copy
 * of the data block that has already been written because it was
 * earlier in the stream.  That previous copy is identified by the
 * guid of the dataset with the referenced data.
 */
int
find_ds_by_guid(const char *name, void *arg)
{
        avl_tree_t *guid_map = arg;
        dsl_dataset_t *ds, *snapds;
        guid_map_entry_t *gmep;
        dsl_pool_t *dp;
        int err;
        uint64_t lastobj, firstobj;

        if (dsl_dataset_hold(name, FTAG, &ds) != 0)
                return (0);

        dp = ds->ds_dir->dd_pool;
        rw_enter(&dp->dp_config_rwlock, RW_READER);
        firstobj = ds->ds_dir->dd_phys->dd_origin_obj;
        lastobj = ds->ds_phys->ds_prev_snap_obj;

        while (lastobj != firstobj) {
                err = dsl_dataset_hold_obj(dp, lastobj, guid_map, &snapds);
                if (err) {
                        /*
                         * Skip this snapshot and move on.  It's not
                         * clear why this would ever happen, but the
                         * remainder of the snapshot stream can be
                         * processed.
                         */
                        rw_exit(&dp->dp_config_rwlock);
                        dsl_dataset_rele(ds, FTAG);
                        return (0);
                }

                gmep = kmem_alloc(sizeof (guid_map_entry_t), KM_SLEEP);
                gmep->guid = snapds->ds_phys->ds_guid;
                gmep->gme_ds = snapds;
                avl_add(guid_map, gmep);
                lastobj = snapds->ds_phys->ds_prev_snap_obj;
        }

        rw_exit(&dp->dp_config_rwlock);
        dsl_dataset_rele(ds, FTAG);

        return (0);
}

static void *
restore_read(struct restorearg *ra, int len)
{
        void *rv;
        int done = 0;

        /* some things will require 8-byte alignment, so everything must */
        ASSERT3U(len % 8, ==, 0);

        while (done < len) {
                ssize_t resid;

                ra->err = vn_rdwr(UIO_READ, ra->vp,
                    (caddr_t)ra->buf + done, len - done,
                    ra->voff, UIO_SYSSPACE, FAPPEND,
                    RLIM64_INFINITY, CRED(), &resid);

                if (resid == len - done)
                        ra->err = EINVAL;
                ra->voff += len - done - resid;
                done = len - resid;
                if (ra->err)
                        return (NULL);
        }

        ASSERT3U(done, ==, len);
        rv = ra->buf;
        if (ra->byteswap)
                fletcher_4_incremental_byteswap(rv, len, &ra->cksum);
        else
                fletcher_4_incremental_native(rv, len, &ra->cksum);
        return (rv);
}
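
/*
 * Note that restore_read() returns a pointer into ra->buf, so the
 * returned data is only valid until the next call; dmu_recv_stream()
 * copies each record header out of the buffer before acting on it
 * for exactly this reason.
 */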

static void
backup_byteswap(dmu_replay_record_t *drr)
{
#define DO64(X) (drr->drr_u.X = BSWAP_64(drr->drr_u.X))
#define DO32(X) (drr->drr_u.X = BSWAP_32(drr->drr_u.X))
        drr->drr_type = BSWAP_32(drr->drr_type);
        drr->drr_payloadlen = BSWAP_32(drr->drr_payloadlen);
        switch (drr->drr_type) {
        case DRR_BEGIN:
                DO64(drr_begin.drr_magic);
                DO64(drr_begin.drr_versioninfo);
                DO64(drr_begin.drr_creation_time);
                DO32(drr_begin.drr_type);
                DO32(drr_begin.drr_flags);
                DO64(drr_begin.drr_toguid);
                DO64(drr_begin.drr_fromguid);
                break;
        case DRR_OBJECT:
                DO64(drr_object.drr_object);
                /* DO64(drr_object.drr_allocation_txg); */
                DO32(drr_object.drr_type);
                DO32(drr_object.drr_bonustype);
                DO32(drr_object.drr_blksz);
                DO32(drr_object.drr_bonuslen);
                DO64(drr_object.drr_toguid);
                break;
        case DRR_FREEOBJECTS:
                DO64(drr_freeobjects.drr_firstobj);
                DO64(drr_freeobjects.drr_numobjs);
                DO64(drr_freeobjects.drr_toguid);
                break;
        case DRR_WRITE:
                DO64(drr_write.drr_object);
                DO32(drr_write.drr_type);
                DO64(drr_write.drr_offset);
                DO64(drr_write.drr_length);
                DO64(drr_write.drr_toguid);
                DO64(drr_write.drr_key.ddk_cksum.zc_word[0]);
                DO64(drr_write.drr_key.ddk_cksum.zc_word[1]);
                DO64(drr_write.drr_key.ddk_cksum.zc_word[2]);
                DO64(drr_write.drr_key.ddk_cksum.zc_word[3]);
                DO64(drr_write.drr_key.ddk_prop);
                break;
        case DRR_WRITE_BYREF:
                DO64(drr_write_byref.drr_object);
                DO64(drr_write_byref.drr_offset);
                DO64(drr_write_byref.drr_length);
                DO64(drr_write_byref.drr_toguid);
                DO64(drr_write_byref.drr_refguid);
                DO64(drr_write_byref.drr_refobject);
                DO64(drr_write_byref.drr_refoffset);
                DO64(drr_write_byref.drr_key.ddk_cksum.zc_word[0]);
                DO64(drr_write_byref.drr_key.ddk_cksum.zc_word[1]);
                DO64(drr_write_byref.drr_key.ddk_cksum.zc_word[2]);
                DO64(drr_write_byref.drr_key.ddk_cksum.zc_word[3]);
                DO64(drr_write_byref.drr_key.ddk_prop);
                break;
        case DRR_FREE:
                DO64(drr_free.drr_object);
                DO64(drr_free.drr_offset);
                DO64(drr_free.drr_length);
                DO64(drr_free.drr_toguid);
                break;
        case DRR_END:
                DO64(drr_end.drr_checksum.zc_word[0]);
                DO64(drr_end.drr_checksum.zc_word[1]);
                DO64(drr_end.drr_checksum.zc_word[2]);
                DO64(drr_end.drr_checksum.zc_word[3]);
                DO64(drr_end.drr_toguid);
                break;
        }
#undef DO64
#undef DO32
}

static int
restore_object(struct restorearg *ra, objset_t *os, struct drr_object *drro)
{
        int err;
        dmu_tx_t *tx;
        void *data = NULL;

        if (drro->drr_type == DMU_OT_NONE ||
            drro->drr_type >= DMU_OT_NUMTYPES ||
            drro->drr_bonustype >= DMU_OT_NUMTYPES ||
            drro->drr_checksumtype >= ZIO_CHECKSUM_FUNCTIONS ||
            drro->drr_compress >= ZIO_COMPRESS_FUNCTIONS ||
            P2PHASE(drro->drr_blksz, SPA_MINBLOCKSIZE) ||
            drro->drr_blksz < SPA_MINBLOCKSIZE ||
            drro->drr_blksz > SPA_MAXBLOCKSIZE ||
            drro->drr_bonuslen > DN_MAX_BONUSLEN) {
                return (EINVAL);
        }

        err = dmu_object_info(os, drro->drr_object, NULL);

        if (err != 0 && err != ENOENT)
                return (EINVAL);

        if (drro->drr_bonuslen) {
                data = restore_read(ra, P2ROUNDUP(drro->drr_bonuslen, 8));
                if (ra->err)
                        return (ra->err);
        }

        if (err == ENOENT) {
                /* currently free, want to be allocated */
                tx = dmu_tx_create(os);
                dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
                err = dmu_tx_assign(tx, TXG_WAIT);
                if (err) {
                        dmu_tx_abort(tx);
                        return (err);
                }
                err = dmu_object_claim(os, drro->drr_object,
                    drro->drr_type, drro->drr_blksz,
                    drro->drr_bonustype, drro->drr_bonuslen, tx);
                dmu_tx_commit(tx);
        } else {
                /* currently allocated, want to be allocated */
                err = dmu_object_reclaim(os, drro->drr_object,
                    drro->drr_type, drro->drr_blksz,
                    drro->drr_bonustype, drro->drr_bonuslen);
        }
        if (err)
                return (EINVAL);

        tx = dmu_tx_create(os);
        dmu_tx_hold_bonus(tx, drro->drr_object);
        err = dmu_tx_assign(tx, TXG_WAIT);
        if (err) {
                dmu_tx_abort(tx);
                return (err);
        }

        dmu_object_set_checksum(os, drro->drr_object, drro->drr_checksumtype,
            tx);
        dmu_object_set_compress(os, drro->drr_object, drro->drr_compress, tx);

        if (data != NULL) {
                dmu_buf_t *db;

                VERIFY(0 == dmu_bonus_hold(os, drro->drr_object, FTAG, &db));
                dmu_buf_will_dirty(db, tx);

                ASSERT3U(db->db_size, >=, drro->drr_bonuslen);
                bcopy(data, db->db_data, drro->drr_bonuslen);
                if (ra->byteswap) {
                        dmu_ot[drro->drr_bonustype].ot_byteswap(db->db_data,
                            drro->drr_bonuslen);
                }
                dmu_buf_rele(db, FTAG);
        }
        dmu_tx_commit(tx);
        return (0);
}

/* ARGSUSED */
static int
restore_freeobjects(struct restorearg *ra, objset_t *os,
    struct drr_freeobjects *drrfo)
{
        uint64_t obj;

        if (drrfo->drr_firstobj + drrfo->drr_numobjs < drrfo->drr_firstobj)
                return (EINVAL);

        for (obj = drrfo->drr_firstobj;
            obj < drrfo->drr_firstobj + drrfo->drr_numobjs;
            (void) dmu_object_next(os, &obj, FALSE, 0)) {
                int err;

                if (dmu_object_info(os, obj, NULL) != 0)
                        continue;

                err = dmu_free_object(os, obj);
                if (err)
                        return (err);
        }
        return (0);
}
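
/*
 * The restore_* handlers below share a defensive pattern: validate
 * the record against overflow and the current objset state (e.g.,
 * "drr_offset + drr_length < drr_offset" catches wrapping 64-bit
 * arithmetic in a corrupt or malicious stream), read any payload via
 * restore_read(), then apply the change inside its own transaction.
 */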

static int
restore_write(struct restorearg *ra, objset_t *os,
    struct drr_write *drrw)
{
        dmu_tx_t *tx;
        void *data;
        int err;

        if (drrw->drr_offset + drrw->drr_length < drrw->drr_offset ||
            drrw->drr_type >= DMU_OT_NUMTYPES)
                return (EINVAL);

        data = restore_read(ra, drrw->drr_length);
        if (data == NULL)
                return (ra->err);

        if (dmu_object_info(os, drrw->drr_object, NULL) != 0)
                return (EINVAL);

        tx = dmu_tx_create(os);

        dmu_tx_hold_write(tx, drrw->drr_object,
            drrw->drr_offset, drrw->drr_length);
        err = dmu_tx_assign(tx, TXG_WAIT);
        if (err) {
                dmu_tx_abort(tx);
                return (err);
        }
        if (ra->byteswap)
                dmu_ot[drrw->drr_type].ot_byteswap(data, drrw->drr_length);
        dmu_write(os, drrw->drr_object,
            drrw->drr_offset, drrw->drr_length, data, tx);
        dmu_tx_commit(tx);
        return (0);
}

/*
 * Handle a DRR_WRITE_BYREF record.  This record is used in dedup'ed
 * streams to refer to a copy of the data that is already on the
 * system because it came in earlier in the stream.  This function
 * finds the earlier copy of the data, and uses that copy instead of
 * data from the stream to fulfill this write.
 */
static int
restore_write_byref(struct restorearg *ra, objset_t *os,
    struct drr_write_byref *drrwbr)
{
        dmu_tx_t *tx;
        int err;
        guid_map_entry_t gmesrch;
        guid_map_entry_t *gmep;
        avl_index_t where;
        objset_t *ref_os = NULL;
        dmu_buf_t *dbp;

        if (drrwbr->drr_offset + drrwbr->drr_length < drrwbr->drr_offset)
                return (EINVAL);

        /*
         * If the GUID of the referenced dataset is different from the
         * GUID of the target dataset, find the referenced dataset.
         */
        if (drrwbr->drr_toguid != drrwbr->drr_refguid) {
                gmesrch.guid = drrwbr->drr_refguid;
                if ((gmep = avl_find(&ra->guid_to_ds_map, &gmesrch,
                    &where)) == NULL) {
                        return (EINVAL);
                }
                if (dmu_objset_from_ds(gmep->gme_ds, &ref_os))
                        return (EINVAL);
        } else {
                ref_os = os;
        }

        if (err = dmu_buf_hold(ref_os, drrwbr->drr_refobject,
            drrwbr->drr_refoffset, FTAG, &dbp))
                return (err);

        tx = dmu_tx_create(os);

        dmu_tx_hold_write(tx, drrwbr->drr_object,
            drrwbr->drr_offset, drrwbr->drr_length);
        err = dmu_tx_assign(tx, TXG_WAIT);
        if (err) {
                dmu_tx_abort(tx);
                return (err);
        }
        dmu_write(os, drrwbr->drr_object,
            drrwbr->drr_offset, drrwbr->drr_length, dbp->db_data, tx);
        dmu_buf_rele(dbp, FTAG);
        dmu_tx_commit(tx);
        return (0);
}

/* ARGSUSED */
static int
restore_free(struct restorearg *ra, objset_t *os,
    struct drr_free *drrf)
{
        int err;

        if (drrf->drr_length != -1ULL &&
            drrf->drr_offset + drrf->drr_length < drrf->drr_offset)
                return (EINVAL);

        if (dmu_object_info(os, drrf->drr_object, NULL) != 0)
                return (EINVAL);

        err = dmu_free_long_range(os, drrf->drr_object,
            drrf->drr_offset, drrf->drr_length);
        return (err);
}
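
/*
 * The record-processing loop in dmu_recv_stream() keeps two running
 * checksums: ra.cksum is updated by every restore_read(), while
 * pcksum snapshots its value from just before the current record.
 * The DRR_END record stores the stream checksum of everything that
 * precedes it, so it is compared against pcksum, not ra.cksum.
 */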

/*
 * NB: callers *must* call dmu_recv_end() if this succeeds.
 */
int
dmu_recv_stream(dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp)
{
        struct restorearg ra = { 0 };
        dmu_replay_record_t *drr;
        objset_t *os;
        zio_cksum_t pcksum;
        guid_map_entry_t *gmep;
        int featureflags;

        if (drc->drc_drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC))
                ra.byteswap = TRUE;

        {
                /* compute checksum of drr_begin record */
                dmu_replay_record_t *drr;
                drr = kmem_zalloc(sizeof (dmu_replay_record_t), KM_SLEEP);

                drr->drr_type = DRR_BEGIN;
                drr->drr_u.drr_begin = *drc->drc_drrb;
                if (ra.byteswap) {
                        fletcher_4_incremental_byteswap(drr,
                            sizeof (dmu_replay_record_t), &ra.cksum);
                } else {
                        fletcher_4_incremental_native(drr,
                            sizeof (dmu_replay_record_t), &ra.cksum);
                }
                kmem_free(drr, sizeof (dmu_replay_record_t));
        }

        if (ra.byteswap) {
                struct drr_begin *drrb = drc->drc_drrb;
                drrb->drr_magic = BSWAP_64(drrb->drr_magic);
                drrb->drr_versioninfo = BSWAP_64(drrb->drr_versioninfo);
                drrb->drr_creation_time = BSWAP_64(drrb->drr_creation_time);
                drrb->drr_type = BSWAP_32(drrb->drr_type);
                drrb->drr_toguid = BSWAP_64(drrb->drr_toguid);
                drrb->drr_fromguid = BSWAP_64(drrb->drr_fromguid);
        }

        ra.vp = vp;
        ra.voff = *voffp;
        ra.bufsize = 1<<20;
        ra.buf = kmem_alloc(ra.bufsize, KM_SLEEP);

        /* these were verified in dmu_recv_begin */
        ASSERT(DMU_GET_STREAM_HDRTYPE(drc->drc_drrb->drr_versioninfo) ==
            DMU_SUBSTREAM);
        ASSERT(drc->drc_drrb->drr_type < DMU_OST_NUMTYPES);

        /*
         * Open the objset we are modifying.
         */
        VERIFY(dmu_objset_from_ds(drc->drc_real_ds, &os) == 0);

        ASSERT(drc->drc_real_ds->ds_phys->ds_flags & DS_FLAG_INCONSISTENT);

        featureflags = DMU_GET_FEATUREFLAGS(drc->drc_drrb->drr_versioninfo);

        /* if this stream is dedup'ed, set up the avl tree for guid mapping */
        if (featureflags & DMU_BACKUP_FEATURE_DEDUP) {
                avl_create(&ra.guid_to_ds_map, guid_compare,
                    sizeof (guid_map_entry_t),
                    offsetof(guid_map_entry_t, avlnode));
                (void) dmu_objset_find(drc->drc_top_ds, find_ds_by_guid,
                    (void *)&ra.guid_to_ds_map,
                    DS_FIND_CHILDREN);
        }

        /*
         * Read records and process them.
         */
        pcksum = ra.cksum;
        while (ra.err == 0 &&
            NULL != (drr = restore_read(&ra, sizeof (*drr)))) {
                if (issig(JUSTLOOKING) && issig(FORREAL)) {
                        ra.err = EINTR;
                        goto out;
                }

                if (ra.byteswap)
                        backup_byteswap(drr);

                switch (drr->drr_type) {
                case DRR_OBJECT:
                {
                        /*
                         * We need to make a copy of the record header,
                         * because restore_{object,write} may need to
                         * restore_read(), which will invalidate drr.
                         */
                        struct drr_object drro = drr->drr_u.drr_object;
                        ra.err = restore_object(&ra, os, &drro);
                        break;
                }
                case DRR_FREEOBJECTS:
                {
                        struct drr_freeobjects drrfo =
                            drr->drr_u.drr_freeobjects;
                        ra.err = restore_freeobjects(&ra, os, &drrfo);
                        break;
                }
                case DRR_WRITE:
                {
                        struct drr_write drrw = drr->drr_u.drr_write;
                        ra.err = restore_write(&ra, os, &drrw);
                        break;
                }
                case DRR_WRITE_BYREF:
                {
                        struct drr_write_byref drrwbr =
                            drr->drr_u.drr_write_byref;
                        ra.err = restore_write_byref(&ra, os, &drrwbr);
                        break;
                }
                case DRR_FREE:
                {
                        struct drr_free drrf = drr->drr_u.drr_free;
                        ra.err = restore_free(&ra, os, &drrf);
                        break;
                }
                case DRR_END:
                {
                        struct drr_end drre = drr->drr_u.drr_end;
                        /*
                         * We compare against the *previous* checksum
                         * value, because the stored checksum is of
                         * everything before the DRR_END record.
                         */
                        if (!ZIO_CHECKSUM_EQUAL(drre.drr_checksum, pcksum))
                                ra.err = ECKSUM;
                        goto out;
                }
                default:
                        ra.err = EINVAL;
                        goto out;
                }
                pcksum = ra.cksum;
        }
        ASSERT(ra.err != 0);

out:
        if (ra.err != 0) {
                /*
                 * destroy what we created, so we don't leave it in the
                 * inconsistent restoring state.
                 */
                txg_wait_synced(drc->drc_real_ds->ds_dir->dd_pool, 0);

                (void) dsl_dataset_destroy(drc->drc_real_ds, dmu_recv_tag,
                    B_FALSE);
                if (drc->drc_real_ds != drc->drc_logical_ds) {
                        mutex_exit(&drc->drc_logical_ds->ds_recvlock);
                        dsl_dataset_rele(drc->drc_logical_ds, dmu_recv_tag);
                }
        }

        if (featureflags & DMU_BACKUP_FEATURE_DEDUP) {
                void *cookie = NULL;

                while (gmep = avl_destroy_nodes(&ra.guid_to_ds_map, &cookie)) {
                        dsl_dataset_rele(gmep->gme_ds, &ra.guid_to_ds_map);
                        kmem_free(gmep, sizeof (guid_map_entry_t));
                }
                avl_destroy(&ra.guid_to_ds_map);
        }

        kmem_free(ra.buf, ra.bufsize);
        *voffp = ra.voff;
        return (ra.err);
}
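
/*
 * After the stream has been consumed, dmu_recv_end() turns the
 * received contents into the requested snapshot: recv_end_sync()
 * snapshots the dataset, stamps the new snapshot with the creation
 * time and guid from the DRR_BEGIN record, and clears
 * DS_FLAG_INCONSISTENT now that the receive is complete.
 */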

struct recvendsyncarg {
        char *tosnap;
        uint64_t creation_time;
        uint64_t toguid;
};

static int
recv_end_check(void *arg1, void *arg2, dmu_tx_t *tx)
{
        dsl_dataset_t *ds = arg1;
        struct recvendsyncarg *resa = arg2;

        return (dsl_dataset_snapshot_check(ds, resa->tosnap, tx));
}

static void
recv_end_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
{
        dsl_dataset_t *ds = arg1;
        struct recvendsyncarg *resa = arg2;

        dsl_dataset_snapshot_sync(ds, resa->tosnap, cr, tx);

        /* set snapshot's creation time and guid */
        dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
        ds->ds_prev->ds_phys->ds_creation_time = resa->creation_time;
        ds->ds_prev->ds_phys->ds_guid = resa->toguid;
        ds->ds_prev->ds_phys->ds_flags &= ~DS_FLAG_INCONSISTENT;

        dmu_buf_will_dirty(ds->ds_dbuf, tx);
        ds->ds_phys->ds_flags &= ~DS_FLAG_INCONSISTENT;
}
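
/*
 * For a receive into an existing filesystem, the stream was restored
 * into the temporary "%<tosnap>" clone, so dmu_recv_existing_end()
 * must first swap the clone's contents into the logical dataset
 * (dsl_dataset_clone_swap) before snapshotting it; the now-empty
 * temporary clone is destroyed on the way out.
 */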

static int
dmu_recv_existing_end(dmu_recv_cookie_t *drc)
{
        struct recvendsyncarg resa;
        dsl_dataset_t *ds = drc->drc_logical_ds;
        int err;

        /*
         * XXX hack; seems the ds is still dirty and dsl_pool_zil_clean()
         * expects it to have a ds_user_ptr (and zil), but clone_swap()
         * can close it.
         */
        txg_wait_synced(ds->ds_dir->dd_pool, 0);

        if (dsl_dataset_tryown(ds, FALSE, dmu_recv_tag)) {
                err = dsl_dataset_clone_swap(drc->drc_real_ds, ds,
                    drc->drc_force);
                if (err)
                        goto out;
        } else {
                mutex_exit(&ds->ds_recvlock);
                dsl_dataset_rele(ds, dmu_recv_tag);
                (void) dsl_dataset_destroy(drc->drc_real_ds, dmu_recv_tag,
                    B_FALSE);
                return (EBUSY);
        }

        resa.creation_time = drc->drc_drrb->drr_creation_time;
        resa.toguid = drc->drc_drrb->drr_toguid;
        resa.tosnap = drc->drc_tosnap;

        err = dsl_sync_task_do(ds->ds_dir->dd_pool,
            recv_end_check, recv_end_sync, ds, &resa, 3);
        if (err) {
                /* swap back */
                (void) dsl_dataset_clone_swap(drc->drc_real_ds, ds, B_TRUE);
        }

out:
        mutex_exit(&ds->ds_recvlock);
        dsl_dataset_disown(ds, dmu_recv_tag);
        (void) dsl_dataset_destroy(drc->drc_real_ds, dmu_recv_tag, B_FALSE);
        return (err);
}

static int
dmu_recv_new_end(dmu_recv_cookie_t *drc)
{
        struct recvendsyncarg resa;
        dsl_dataset_t *ds = drc->drc_logical_ds;
        int err;

        /*
         * XXX hack; seems the ds is still dirty and dsl_pool_zil_clean()
         * expects it to have a ds_user_ptr (and zil), but clone_swap()
         * can close it.
         */
        txg_wait_synced(ds->ds_dir->dd_pool, 0);

        resa.creation_time = drc->drc_drrb->drr_creation_time;
        resa.toguid = drc->drc_drrb->drr_toguid;
        resa.tosnap = drc->drc_tosnap;

        err = dsl_sync_task_do(ds->ds_dir->dd_pool,
            recv_end_check, recv_end_sync, ds, &resa, 3);
        if (err) {
                /* clean up the fs we just recv'd into */
                (void) dsl_dataset_destroy(ds, dmu_recv_tag, B_FALSE);
        } else {
                /* release the hold from dmu_recv_begin */
                dsl_dataset_disown(ds, dmu_recv_tag);
        }
        return (err);
}

int
dmu_recv_end(dmu_recv_cookie_t *drc)
{
        if (drc->drc_logical_ds != drc->drc_real_ds)
                return (dmu_recv_existing_end(drc));
        else
                return (dmu_recv_new_end(drc));
}