/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#include <sys/dmu.h>
#include <sys/dmu_impl.h>
#include <sys/dmu_tx.h>
#include <sys/dbuf.h>
#include <sys/dnode.h>
#include <sys/zfs_context.h>
#include <sys/dmu_objset.h>
#include <sys/dmu_traverse.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_dir.h>
#include <sys/dsl_prop.h>
#include <sys/dsl_pool.h>
#include <sys/dsl_synctask.h>
#include <sys/zfs_ioctl.h>
#include <sys/zap.h>
#include <sys/zio_checksum.h>
#include <sys/avl.h>

static char *dmu_recv_tag = "dmu_recv_tag";

/*
 * The list of data whose inclusion in a send stream can be pending from
 * one call to backup_cb to another.  Multiple calls to dump_free() and
 * dump_freeobjects() can be aggregated into a single DRR_FREE or
 * DRR_FREEOBJECTS replay record.
 */
typedef enum {
	PENDING_NONE,
	PENDING_FREE,
	PENDING_FREEOBJECTS
} pendop_t;

struct backuparg {
	dmu_replay_record_t *drr;
	vnode_t *vp;
	offset_t *off;
	objset_t *os;
	zio_cksum_t zc;
	uint64_t toguid;
	int err;
	pendop_t pending_op;
};

static int
dump_bytes(struct backuparg *ba, void *buf, int len)
{
	ssize_t resid; /* have to get resid to get detailed errno */
	ASSERT3U(len % 8, ==, 0);

	fletcher_4_incremental_native(buf, len, &ba->zc);
	ba->err = vn_rdwr(UIO_WRITE, ba->vp,
	    (caddr_t)buf, len,
	    0, UIO_SYSSPACE, FAPPEND, RLIM64_INFINITY, CRED(), &resid);
	*ba->off += len;
	return (ba->err);
}

static int
dump_free(struct backuparg *ba, uint64_t object, uint64_t offset,
    uint64_t length)
{
	struct drr_free *drrf = &(ba->drr->drr_u.drr_free);

	/*
	 * If there is a pending op, but it's not PENDING_FREE, push it out,
	 * since free block aggregation can only be done for blocks of the
	 * same type (i.e., DRR_FREE records can only be aggregated with
	 * other DRR_FREE records, and DRR_FREEOBJECTS records can only be
	 * aggregated with other DRR_FREEOBJECTS records).
	 */
	if (ba->pending_op != PENDING_NONE && ba->pending_op != PENDING_FREE) {
		if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)) != 0)
			return (EINTR);
		ba->pending_op = PENDING_NONE;
	}

	if (ba->pending_op == PENDING_FREE) {
		/*
		 * There should never be a PENDING_FREE if length is -1
		 * (because dump_dnode is the only place where this
		 * function is called with a -1, and only after flushing
		 * any pending record).
		 */
		ASSERT(length != -1ULL);
		/*
		 * Check to see whether this free block can be aggregated
		 * with the pending one.
		 */
		if (drrf->drr_object == object && drrf->drr_offset +
		    drrf->drr_length == offset) {
			drrf->drr_length += length;
			return (0);
		} else {
			/* not a continuation.  Push out pending record */
			if (dump_bytes(ba, ba->drr,
			    sizeof (dmu_replay_record_t)) != 0)
				return (EINTR);
			ba->pending_op = PENDING_NONE;
		}
	}
	/* create a FREE record and make it pending */
	bzero(ba->drr, sizeof (dmu_replay_record_t));
	ba->drr->drr_type = DRR_FREE;
	drrf->drr_object = object;
	drrf->drr_offset = offset;
	drrf->drr_length = length;
	drrf->drr_toguid = ba->toguid;
	if (length == -1ULL) {
		if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)) != 0)
			return (EINTR);
	} else {
		ba->pending_op = PENDING_FREE;
	}

	return (0);
}

static int
dump_data(struct backuparg *ba, dmu_object_type_t type,
    uint64_t object, uint64_t offset, int blksz, void *data)
{
	struct drr_write *drrw = &(ba->drr->drr_u.drr_write);

	/*
	 * If there is any kind of pending aggregation (currently either
	 * a grouping of free objects or free blocks), push it out to
	 * the stream, since aggregation can't be done across operations
	 * of different types.
	 */
	if (ba->pending_op != PENDING_NONE) {
		if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)) != 0)
			return (EINTR);
		ba->pending_op = PENDING_NONE;
	}
	/* write a DATA record */
	bzero(ba->drr, sizeof (dmu_replay_record_t));
	ba->drr->drr_type = DRR_WRITE;
	drrw->drr_object = object;
	drrw->drr_type = type;
	drrw->drr_offset = offset;
	drrw->drr_length = blksz;
	drrw->drr_toguid = ba->toguid;

	if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)) != 0)
		return (EINTR);
	if (dump_bytes(ba, data, blksz) != 0)
		return (EINTR);
	return (0);
}

static int
dump_freeobjects(struct backuparg *ba, uint64_t firstobj, uint64_t numobjs)
{
	struct drr_freeobjects *drrfo = &(ba->drr->drr_u.drr_freeobjects);

	/*
	 * If there is a pending op, but it's not PENDING_FREEOBJECTS,
	 * push it out, since free block aggregation can only be done for
	 * blocks of the same type (i.e., DRR_FREE records can only be
	 * aggregated with other DRR_FREE records, and DRR_FREEOBJECTS
	 * records can only be aggregated with other DRR_FREEOBJECTS
	 * records).
	 */
	if (ba->pending_op != PENDING_NONE &&
	    ba->pending_op != PENDING_FREEOBJECTS) {
		if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)) != 0)
			return (EINTR);
		ba->pending_op = PENDING_NONE;
	}
	if (ba->pending_op == PENDING_FREEOBJECTS) {
		/*
		 * See whether this free object array can be aggregated
		 * with the pending one.
		 */
		if (drrfo->drr_firstobj + drrfo->drr_numobjs == firstobj) {
			drrfo->drr_numobjs += numobjs;
			return (0);
		} else {
			/* can't be aggregated.  Push out pending record */
			if (dump_bytes(ba, ba->drr,
			    sizeof (dmu_replay_record_t)) != 0)
				return (EINTR);
			ba->pending_op = PENDING_NONE;
		}
	}

	/* write a FREEOBJECTS record */
	bzero(ba->drr, sizeof (dmu_replay_record_t));
	ba->drr->drr_type = DRR_FREEOBJECTS;
	drrfo->drr_firstobj = firstobj;
	drrfo->drr_numobjs = numobjs;
	drrfo->drr_toguid = ba->toguid;

	ba->pending_op = PENDING_FREEOBJECTS;

	return (0);
}

static int
dump_dnode(struct backuparg *ba, uint64_t object, dnode_phys_t *dnp)
{
	struct drr_object *drro = &(ba->drr->drr_u.drr_object);

	if (dnp == NULL || dnp->dn_type == DMU_OT_NONE)
		return (dump_freeobjects(ba, object, 1));

	if (ba->pending_op != PENDING_NONE) {
		if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)) != 0)
			return (EINTR);
		ba->pending_op = PENDING_NONE;
	}

	/* write an OBJECT record */
	bzero(ba->drr, sizeof (dmu_replay_record_t));
	ba->drr->drr_type = DRR_OBJECT;
	drro->drr_object = object;
	drro->drr_type = dnp->dn_type;
	drro->drr_bonustype = dnp->dn_bonustype;
	drro->drr_blksz = dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT;
	drro->drr_bonuslen = dnp->dn_bonuslen;
	drro->drr_checksumtype = dnp->dn_checksum;
	drro->drr_compress = dnp->dn_compress;
	drro->drr_toguid = ba->toguid;

	if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)) != 0)
		return (EINTR);

	if (dump_bytes(ba, DN_BONUS(dnp), P2ROUNDUP(dnp->dn_bonuslen, 8)) != 0)
		return (EINTR);

	/* free anything past the end of the file */
	if (dump_free(ba, object, (dnp->dn_maxblkid + 1) *
	    (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT), -1ULL))
		return (EINTR);
	if (ba->err)
		return (EINTR);
	return (0);
}

#define	BP_SPAN(dnp, level) \
	(((uint64_t)dnp->dn_datablkszsec) << (SPA_MINBLOCKSHIFT + \
	(level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT)))

/* ARGSUSED */
static int
backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
    const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg)
{
	struct backuparg *ba = arg;
	dmu_object_type_t type = bp ? BP_GET_TYPE(bp) : DMU_OT_NONE;
	int err = 0;

	if (issig(JUSTLOOKING) && issig(FORREAL))
		return (EINTR);

	if (zb->zb_object != DMU_META_DNODE_OBJECT &&
	    DMU_OBJECT_IS_SPECIAL(zb->zb_object)) {
		return (0);
	} else if (bp == NULL && zb->zb_object == DMU_META_DNODE_OBJECT) {
		uint64_t span = BP_SPAN(dnp, zb->zb_level);
		uint64_t dnobj = (zb->zb_blkid * span) >> DNODE_SHIFT;
		err = dump_freeobjects(ba, dnobj, span >> DNODE_SHIFT);
	} else if (bp == NULL) {
		uint64_t span = BP_SPAN(dnp, zb->zb_level);
		err = dump_free(ba, zb->zb_object, zb->zb_blkid * span, span);
	} else if (zb->zb_level > 0 || type == DMU_OT_OBJSET) {
		return (0);
	} else if (type == DMU_OT_DNODE) {
		dnode_phys_t *blk;
		int i;
		int blksz = BP_GET_LSIZE(bp);
		uint32_t aflags = ARC_WAIT;
		arc_buf_t *abuf;

		if (arc_read_nolock(NULL, spa, bp,
		    arc_getbuf_func, &abuf, ZIO_PRIORITY_ASYNC_READ,
		    ZIO_FLAG_CANFAIL, &aflags, zb) != 0)
			return (EIO);

		blk = abuf->b_data;
		for (i = 0; i < blksz >> DNODE_SHIFT; i++) {
			uint64_t dnobj = (zb->zb_blkid <<
			    (DNODE_BLOCK_SHIFT - DNODE_SHIFT)) + i;
			err = dump_dnode(ba, dnobj, blk+i);
			if (err)
				break;
		}
		(void) arc_buf_remove_ref(abuf, &abuf);
	} else { /* it's a level-0 block of a regular object */
		uint32_t aflags = ARC_WAIT;
		arc_buf_t *abuf;
		int blksz = BP_GET_LSIZE(bp);

		if (arc_read_nolock(NULL, spa, bp,
		    arc_getbuf_func, &abuf, ZIO_PRIORITY_ASYNC_READ,
		    ZIO_FLAG_CANFAIL, &aflags, zb) != 0)
			return (EIO);

		err = dump_data(ba, type, zb->zb_object, zb->zb_blkid * blksz,
		    blksz, abuf->b_data);
		(void) arc_buf_remove_ref(abuf, &abuf);
	}

	ASSERT(err == 0 || err == EINTR);
	return (err);
}

int
dmu_sendbackup(objset_t *tosnap, objset_t *fromsnap, boolean_t fromorigin,
    vnode_t *vp, offset_t *off)
{
	dsl_dataset_t *ds = tosnap->os_dsl_dataset;
	dsl_dataset_t *fromds = fromsnap ? fromsnap->os_dsl_dataset : NULL;
	dmu_replay_record_t *drr;
	struct backuparg ba;
	int err;
	uint64_t fromtxg = 0;

	/* tosnap must be a snapshot */
	if (ds->ds_phys->ds_next_snap_obj == 0)
		return (EINVAL);

	/* fromsnap must be an earlier snapshot from the same fs as tosnap */
	if (fromds && (ds->ds_dir != fromds->ds_dir ||
	    fromds->ds_phys->ds_creation_txg >= ds->ds_phys->ds_creation_txg))
		return (EXDEV);

	if (fromorigin) {
		dsl_pool_t *dp = ds->ds_dir->dd_pool;

		if (fromsnap)
			return (EINVAL);

		if (dsl_dir_is_clone(ds->ds_dir)) {
			rw_enter(&dp->dp_config_rwlock, RW_READER);
			err = dsl_dataset_hold_obj(dp,
			    ds->ds_dir->dd_phys->dd_origin_obj, FTAG, &fromds);
			rw_exit(&dp->dp_config_rwlock);
			if (err)
				return (err);
		} else {
			fromorigin = B_FALSE;
		}
	}

	drr = kmem_zalloc(sizeof (dmu_replay_record_t), KM_SLEEP);
	drr->drr_type = DRR_BEGIN;
	drr->drr_u.drr_begin.drr_magic = DMU_BACKUP_MAGIC;
	DMU_SET_STREAM_HDRTYPE(drr->drr_u.drr_begin.drr_versioninfo,
	    DMU_SUBSTREAM);
	drr->drr_u.drr_begin.drr_creation_time =
	    ds->ds_phys->ds_creation_time;
	drr->drr_u.drr_begin.drr_type = tosnap->os_phys->os_type;
	if (fromorigin)
		drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_CLONE;
	drr->drr_u.drr_begin.drr_toguid = ds->ds_phys->ds_guid;
	if (ds->ds_phys->ds_flags & DS_FLAG_CI_DATASET)
		drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_CI_DATA;

	if (fromds)
		drr->drr_u.drr_begin.drr_fromguid = fromds->ds_phys->ds_guid;
	dsl_dataset_name(ds, drr->drr_u.drr_begin.drr_toname);

	if (fromds)
		fromtxg = fromds->ds_phys->ds_creation_txg;
	if (fromorigin)
		dsl_dataset_rele(fromds, FTAG);

	ba.drr = drr;
	ba.vp = vp;
	ba.os = tosnap;
	ba.off = off;
	ba.toguid = ds->ds_phys->ds_guid;
	ZIO_SET_CHECKSUM(&ba.zc, 0, 0, 0, 0);
	ba.pending_op = PENDING_NONE;

	if (dump_bytes(&ba, drr, sizeof (dmu_replay_record_t)) != 0) {
		kmem_free(drr, sizeof (dmu_replay_record_t));
		return (ba.err);
	}

	err = traverse_dataset(ds, fromtxg, TRAVERSE_PRE | TRAVERSE_PREFETCH,
	    backup_cb, &ba);

	if (ba.pending_op != PENDING_NONE)
		if (dump_bytes(&ba, drr, sizeof (dmu_replay_record_t)) != 0)
			err = EINTR;

	if (err) {
		if (err == EINTR && ba.err)
			err = ba.err;
		kmem_free(drr, sizeof (dmu_replay_record_t));
		return (err);
	}

	bzero(drr, sizeof (dmu_replay_record_t));
	drr->drr_type = DRR_END;
	drr->drr_u.drr_end.drr_checksum = ba.zc;
	drr->drr_u.drr_end.drr_toguid = ba.toguid;

	if (dump_bytes(&ba, drr, sizeof (dmu_replay_record_t)) != 0) {
		kmem_free(drr, sizeof (dmu_replay_record_t));
		return (ba.err);
	}

	kmem_free(drr, sizeof (dmu_replay_record_t));

	return (0);
}

struct recvbeginsyncarg {
	const char *tofs;
	const char *tosnap;
	dsl_dataset_t *origin;
	uint64_t fromguid;
	dmu_objset_type_t type;
	void *tag;
	boolean_t force;
	uint64_t dsflags;
	char clonelastname[MAXNAMELEN];
	dsl_dataset_t *ds; /* the ds to recv into; returned from the syncfunc */
};

/* ARGSUSED */
static int
recv_new_check(void *arg1, void *arg2, dmu_tx_t *tx)
{
	dsl_dir_t *dd = arg1;
	struct recvbeginsyncarg *rbsa = arg2;
	objset_t *mos = dd->dd_pool->dp_meta_objset;
	uint64_t val;
	int err;

	err = zap_lookup(mos, dd->dd_phys->dd_child_dir_zapobj,
	    strrchr(rbsa->tofs, '/') + 1, sizeof (uint64_t), 1, &val);

	if (err != ENOENT)
		return (err ? err : EEXIST);

	if (rbsa->origin) {
		/* make sure it's a snap in the same pool */
		if (rbsa->origin->ds_dir->dd_pool != dd->dd_pool)
			return (EXDEV);
		if (!dsl_dataset_is_snapshot(rbsa->origin))
			return (EINVAL);
		if (rbsa->origin->ds_phys->ds_guid != rbsa->fromguid)
			return (ENODEV);
	}

	return (0);
}

static void
recv_new_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
{
	dsl_dir_t *dd = arg1;
	struct recvbeginsyncarg *rbsa = arg2;
	uint64_t flags = DS_FLAG_INCONSISTENT | rbsa->dsflags;
	uint64_t dsobj;

	/* Create and open new dataset. */
	dsobj = dsl_dataset_create_sync(dd, strrchr(rbsa->tofs, '/') + 1,
	    rbsa->origin, flags, cr, tx);
	VERIFY(0 == dsl_dataset_own_obj(dd->dd_pool, dsobj,
	    B_TRUE, dmu_recv_tag, &rbsa->ds));

	if (rbsa->origin == NULL) {
		(void) dmu_objset_create_impl(dd->dd_pool->dp_spa,
		    rbsa->ds, &rbsa->ds->ds_phys->ds_bp, rbsa->type, tx);
	}

	spa_history_internal_log(LOG_DS_REPLAY_FULL_SYNC,
	    dd->dd_pool->dp_spa, tx, cr, "dataset = %lld", dsobj);
}

/* ARGSUSED */
static int
recv_existing_check(void *arg1, void *arg2, dmu_tx_t *tx)
{
	dsl_dataset_t *ds = arg1;
	struct recvbeginsyncarg *rbsa = arg2;
	int err;
	uint64_t val;

	/* must not have any changes since most recent snapshot */
	if (!rbsa->force && dsl_dataset_modified_since_lastsnap(ds))
		return (ETXTBSY);

	if (rbsa->fromguid) {
		/* if incremental, most recent snapshot must match fromguid */
		if (ds->ds_prev == NULL)
			return (ENODEV);

		/*
		 * most recent snapshot must match fromguid, or there are no
		 * changes since the fromguid one
		 */
		if (ds->ds_prev->ds_phys->ds_guid != rbsa->fromguid) {
			uint64_t birth = ds->ds_prev->ds_phys->ds_bp.blk_birth;
			uint64_t obj = ds->ds_prev->ds_phys->ds_prev_snap_obj;
			while (obj != 0) {
				dsl_dataset_t *snap;
				err = dsl_dataset_hold_obj(ds->ds_dir->dd_pool,
				    obj, FTAG, &snap);
				if (err)
					return (ENODEV);
				if (snap->ds_phys->ds_creation_txg < birth) {
					dsl_dataset_rele(snap, FTAG);
					return (ENODEV);
				}
				if (snap->ds_phys->ds_guid == rbsa->fromguid) {
					dsl_dataset_rele(snap, FTAG);
					break; /* it's ok */
				}
				obj = snap->ds_phys->ds_prev_snap_obj;
				dsl_dataset_rele(snap, FTAG);
			}
			if (obj == 0)
				return (ENODEV);
		}
	} else {
		/* if full, most recent snapshot must be $ORIGIN */
		if (ds->ds_phys->ds_prev_snap_txg >= TXG_INITIAL)
			return (ENODEV);
	}

	/* temporary clone name must not exist */
	err = zap_lookup(ds->ds_dir->dd_pool->dp_meta_objset,
	    ds->ds_dir->dd_phys->dd_child_dir_zapobj,
	    rbsa->clonelastname, 8, 1, &val);
	if (err == 0)
		return (EEXIST);
	if (err != ENOENT)
		return (err);

	/* new snapshot name must not exist */
	err = zap_lookup(ds->ds_dir->dd_pool->dp_meta_objset,
	    ds->ds_phys->ds_snapnames_zapobj, rbsa->tosnap, 8, 1, &val);
	if (err == 0)
		return (EEXIST);
	if (err != ENOENT)
		return (err);
	return (0);
}

/* ARGSUSED */
static void
recv_existing_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
{
	dsl_dataset_t *ohds = arg1;
	struct recvbeginsyncarg *rbsa = arg2;
	dsl_pool_t *dp = ohds->ds_dir->dd_pool;
	dsl_dataset_t *cds;
	uint64_t flags = DS_FLAG_INCONSISTENT | rbsa->dsflags;
	uint64_t dsobj;

	/* create and open the temporary clone */
	dsobj = dsl_dataset_create_sync(ohds->ds_dir, rbsa->clonelastname,
	    ohds->ds_prev, flags, cr, tx);
	VERIFY(0 == dsl_dataset_own_obj(dp, dsobj, B_TRUE, dmu_recv_tag, &cds));

	/*
	 * If we actually created a non-clone, we need to create the
	 * objset in our new dataset.
	 */
	if (BP_IS_HOLE(dsl_dataset_get_blkptr(cds))) {
		(void) dmu_objset_create_impl(dp->dp_spa,
		    cds, dsl_dataset_get_blkptr(cds), rbsa->type, tx);
	}

	rbsa->ds = cds;

	spa_history_internal_log(LOG_DS_REPLAY_INC_SYNC,
	    dp->dp_spa, tx, cr, "dataset = %lld", dsobj);
}

/*
 * NB: callers *MUST* call dmu_recv_stream() if dmu_recv_begin()
 * succeeds; otherwise we will leak the holds on the datasets.
 */
int
dmu_recv_begin(char *tofs, char *tosnap, char *top_ds, struct drr_begin *drrb,
    boolean_t force, objset_t *origin, dmu_recv_cookie_t *drc)
{
	int err = 0;
	boolean_t byteswap;
	struct recvbeginsyncarg rbsa = { 0 };
	uint64_t versioninfo;
	int flags;
	dsl_dataset_t *ds;

	if (drrb->drr_magic == DMU_BACKUP_MAGIC)
		byteswap = FALSE;
	else if (drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC))
		byteswap = TRUE;
	else
		return (EINVAL);

	rbsa.tofs = tofs;
	rbsa.tosnap = tosnap;
	rbsa.origin = origin ? origin->os_dsl_dataset : NULL;
	rbsa.fromguid = drrb->drr_fromguid;
	rbsa.type = drrb->drr_type;
	rbsa.tag = FTAG;
	rbsa.dsflags = 0;
	versioninfo = drrb->drr_versioninfo;
	flags = drrb->drr_flags;

	if (byteswap) {
		rbsa.type = BSWAP_32(rbsa.type);
		rbsa.fromguid = BSWAP_64(rbsa.fromguid);
		versioninfo = BSWAP_64(versioninfo);
		flags = BSWAP_32(flags);
	}

	if (DMU_GET_STREAM_HDRTYPE(versioninfo) == DMU_COMPOUNDSTREAM ||
	    rbsa.type >= DMU_OST_NUMTYPES ||
	    ((flags & DRR_FLAG_CLONE) && origin == NULL))
		return (EINVAL);

	if (flags & DRR_FLAG_CI_DATA)
		rbsa.dsflags = DS_FLAG_CI_DATASET;

	bzero(drc, sizeof (dmu_recv_cookie_t));
	drc->drc_drrb = drrb;
	drc->drc_tosnap = tosnap;
	drc->drc_top_ds = top_ds;
	drc->drc_force = force;

	/*
	 * Process the begin in syncing context.
	 */

	/* open the dataset we are logically receiving into */
	err = dsl_dataset_hold(tofs, dmu_recv_tag, &ds);
	if (err == 0) {
		/* target fs already exists; recv into temp clone */

		/* Can't recv a clone into an existing fs */
		if (flags & DRR_FLAG_CLONE) {
			dsl_dataset_rele(ds, dmu_recv_tag);
			return (EINVAL);
		}

		/* must not have an incremental recv already in progress */
		if (!mutex_tryenter(&ds->ds_recvlock)) {
			dsl_dataset_rele(ds, dmu_recv_tag);
			return (EBUSY);
		}

		/* tmp clone name is: tofs/%tosnap */
		(void) snprintf(rbsa.clonelastname, sizeof (rbsa.clonelastname),
		    "%%%s", tosnap);
		rbsa.force = force;
		err = dsl_sync_task_do(ds->ds_dir->dd_pool,
		    recv_existing_check, recv_existing_sync, ds, &rbsa, 5);
		if (err) {
			mutex_exit(&ds->ds_recvlock);
			dsl_dataset_rele(ds, dmu_recv_tag);
			return (err);
		}
		drc->drc_logical_ds = ds;
		drc->drc_real_ds = rbsa.ds;
	} else if (err == ENOENT) {
		/* target fs does not exist; must be a full backup or clone */
		char *cp;

		/*
		 * If it's a non-clone incremental, we are missing the
		 * target fs, so fail the recv.
		 */
		if (rbsa.fromguid && !(flags & DRR_FLAG_CLONE))
			return (ENOENT);

		/* Open the parent of tofs */
		cp = strrchr(tofs, '/');
		*cp = '\0';
		err = dsl_dataset_hold(tofs, FTAG, &ds);
		*cp = '/';
		if (err)
			return (err);

		err = dsl_sync_task_do(ds->ds_dir->dd_pool,
		    recv_new_check, recv_new_sync, ds->ds_dir, &rbsa, 5);
		dsl_dataset_rele(ds, FTAG);
		if (err)
			return (err);
		drc->drc_logical_ds = drc->drc_real_ds = rbsa.ds;
		drc->drc_newfs = B_TRUE;
	}

	return (err);
}

struct restorearg {
	int err;
	int byteswap;
	vnode_t *vp;
	char *buf;
	uint64_t voff;
	int bufsize; /* amount of memory allocated for buf */
	zio_cksum_t cksum;
	avl_tree_t guid_to_ds_map;
};

typedef struct guid_map_entry {
	uint64_t guid;
	dsl_dataset_t *gme_ds;
	avl_node_t avlnode;
} guid_map_entry_t;

static int
guid_compare(const void *arg1, const void *arg2)
{
	const guid_map_entry_t *gmep1 = arg1;
	const guid_map_entry_t *gmep2 = arg2;

	if (gmep1->guid < gmep2->guid)
		return (-1);
	else if (gmep1->guid > gmep2->guid)
		return (1);
	return (0);
}

/*
 * This function is a callback used by dmu_objset_find() (which
 * enumerates the object sets) to build an avl tree that maps guids
 * to datasets.  The resulting table is used when processing DRR_WRITE_BYREF
 * send stream records.  These records, which are used in dedup'ed
 * streams, do not contain data themselves, but refer to a copy
 * of the data block that has already been written because it was
 * earlier in the stream.  That previous copy is identified by the
 * guid of the dataset with the referenced data.
 */
int
find_ds_by_guid(char *name, void *arg)
{
	dsl_dataset_t *ds, *snapds;
	avl_tree_t *guid_map = arg;
	guid_map_entry_t *gmep;
	dsl_pool_t *dp;
	int err;
	uint64_t lastobj, firstobj;

	if (dsl_dataset_hold(name, FTAG, &ds) != 0)
		return (0);

	dp = ds->ds_dir->dd_pool;
	rw_enter(&dp->dp_config_rwlock, RW_READER);
	firstobj = ds->ds_dir->dd_phys->dd_origin_obj;
	lastobj = ds->ds_phys->ds_prev_snap_obj;

	while (lastobj != firstobj) {
		err = dsl_dataset_hold_obj(dp, lastobj, guid_map, &snapds);
		if (err) {
			/*
			 * Skip this snapshot and move on.  It's not
			 * clear why this would ever happen, but the
			 * remainder of the snapshot stream can be
			 * processed.
			 */
			rw_exit(&dp->dp_config_rwlock);
			dsl_dataset_rele(ds, FTAG);
			return (0);
		}

		gmep = kmem_alloc(sizeof (guid_map_entry_t), KM_SLEEP);
		gmep->guid = snapds->ds_phys->ds_guid;
		gmep->gme_ds = snapds;
		avl_add(guid_map, gmep);
		lastobj = snapds->ds_phys->ds_prev_snap_obj;
	}

	rw_exit(&dp->dp_config_rwlock);
	dsl_dataset_rele(ds, FTAG);

	return (0);
}

static void *
restore_read(struct restorearg *ra, int len)
{
	void *rv;
	int done = 0;

	/* some things will require 8-byte alignment, so everything must */
	ASSERT3U(len % 8, ==, 0);

	while (done < len) {
		ssize_t resid;

		ra->err = vn_rdwr(UIO_READ, ra->vp,
		    (caddr_t)ra->buf + done, len - done,
		    ra->voff, UIO_SYSSPACE, FAPPEND,
		    RLIM64_INFINITY, CRED(), &resid);

		if (resid == len - done)
			ra->err = EINVAL;
		ra->voff += len - done - resid;
		done = len - resid;
		if (ra->err)
			return (NULL);
	}

	ASSERT3U(done, ==, len);
	rv = ra->buf;
	if (ra->byteswap)
		fletcher_4_incremental_byteswap(rv, len, &ra->cksum);
	else
		fletcher_4_incremental_native(rv, len, &ra->cksum);
	return (rv);
}

static void
backup_byteswap(dmu_replay_record_t *drr)
{
#define	DO64(X) (drr->drr_u.X = BSWAP_64(drr->drr_u.X))
#define	DO32(X) (drr->drr_u.X = BSWAP_32(drr->drr_u.X))
	drr->drr_type = BSWAP_32(drr->drr_type);
	drr->drr_payloadlen = BSWAP_32(drr->drr_payloadlen);
	switch (drr->drr_type) {
	case DRR_BEGIN:
		DO64(drr_begin.drr_magic);
		DO64(drr_begin.drr_versioninfo);
		DO64(drr_begin.drr_creation_time);
		DO32(drr_begin.drr_type);
		DO32(drr_begin.drr_flags);
		DO64(drr_begin.drr_toguid);
		DO64(drr_begin.drr_fromguid);
		break;
	case DRR_OBJECT:
		DO64(drr_object.drr_object);
		/* DO64(drr_object.drr_allocation_txg); */
		DO32(drr_object.drr_type);
		DO32(drr_object.drr_bonustype);
		DO32(drr_object.drr_blksz);
		DO32(drr_object.drr_bonuslen);
		DO64(drr_object.drr_toguid);
		break;
	case DRR_FREEOBJECTS:
		DO64(drr_freeobjects.drr_firstobj);
		DO64(drr_freeobjects.drr_numobjs);
		DO64(drr_freeobjects.drr_toguid);
		break;
	case DRR_WRITE:
		DO64(drr_write.drr_object);
		DO32(drr_write.drr_type);
		DO64(drr_write.drr_offset);
		DO64(drr_write.drr_length);
		DO64(drr_write.drr_toguid);
		DO64(drr_write.drr_blkcksum.zc_word[0]);
		DO64(drr_write.drr_blkcksum.zc_word[1]);
		DO64(drr_write.drr_blkcksum.zc_word[2]);
		DO64(drr_write.drr_blkcksum.zc_word[3]);
		break;
	case DRR_WRITE_BYREF:
		DO64(drr_write_byref.drr_object);
		DO64(drr_write_byref.drr_offset);
		DO64(drr_write_byref.drr_length);
		DO64(drr_write_byref.drr_toguid);
		DO64(drr_write_byref.drr_refguid);
		DO64(drr_write_byref.drr_refobject);
		DO64(drr_write_byref.drr_refoffset);
		DO64(drr_write_byref.drr_blkcksum.zc_word[0]);
		DO64(drr_write_byref.drr_blkcksum.zc_word[1]);
		DO64(drr_write_byref.drr_blkcksum.zc_word[2]);
		DO64(drr_write_byref.drr_blkcksum.zc_word[3]);
		break;
	case DRR_FREE:
		DO64(drr_free.drr_object);
		DO64(drr_free.drr_offset);
		DO64(drr_free.drr_length);
		DO64(drr_free.drr_toguid);
		break;
	case DRR_END:
		DO64(drr_end.drr_checksum.zc_word[0]);
		DO64(drr_end.drr_checksum.zc_word[1]);
		DO64(drr_end.drr_checksum.zc_word[2]);
		DO64(drr_end.drr_checksum.zc_word[3]);
		DO64(drr_end.drr_toguid);
		break;
	}
#undef DO64
#undef DO32
}

static int
restore_object(struct restorearg *ra, objset_t *os, struct drr_object *drro)
{
	int err;
	dmu_tx_t *tx;
	void *data = NULL;

	if (drro->drr_type == DMU_OT_NONE ||
	    drro->drr_type >= DMU_OT_NUMTYPES ||
	    drro->drr_bonustype >= DMU_OT_NUMTYPES ||
	    drro->drr_checksumtype >= ZIO_CHECKSUM_FUNCTIONS ||
	    drro->drr_compress >= ZIO_COMPRESS_FUNCTIONS ||
	    P2PHASE(drro->drr_blksz, SPA_MINBLOCKSIZE) ||
	    drro->drr_blksz < SPA_MINBLOCKSIZE ||
	    drro->drr_blksz > SPA_MAXBLOCKSIZE ||
	    drro->drr_bonuslen > DN_MAX_BONUSLEN) {
		return (EINVAL);
	}

	err = dmu_object_info(os, drro->drr_object, NULL);

	if (err != 0 && err != ENOENT)
		return (EINVAL);

	if (drro->drr_bonuslen) {
		data = restore_read(ra, P2ROUNDUP(drro->drr_bonuslen, 8));
		if (ra->err)
			return (ra->err);
	}

	if (err == ENOENT) {
		/* currently free, want to be allocated */
		tx = dmu_tx_create(os);
		dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
		err = dmu_tx_assign(tx, TXG_WAIT);
		if (err) {
			dmu_tx_abort(tx);
			return (err);
		}
		err = dmu_object_claim(os, drro->drr_object,
		    drro->drr_type, drro->drr_blksz,
		    drro->drr_bonustype, drro->drr_bonuslen, tx);
		dmu_tx_commit(tx);
	} else {
		/* currently allocated, want to be allocated */
		err = dmu_object_reclaim(os, drro->drr_object,
		    drro->drr_type, drro->drr_blksz,
		    drro->drr_bonustype, drro->drr_bonuslen);
	}
	if (err)
		return (EINVAL);

	tx = dmu_tx_create(os);
	dmu_tx_hold_bonus(tx, drro->drr_object);
	err = dmu_tx_assign(tx, TXG_WAIT);
	if (err) {
		dmu_tx_abort(tx);
		return (err);
	}

	dmu_object_set_checksum(os, drro->drr_object, drro->drr_checksumtype,
	    tx);
	dmu_object_set_compress(os, drro->drr_object, drro->drr_compress, tx);

	if (data != NULL) {
		dmu_buf_t *db;

		VERIFY(0 == dmu_bonus_hold(os, drro->drr_object, FTAG, &db));
		dmu_buf_will_dirty(db, tx);

		ASSERT3U(db->db_size, >=, drro->drr_bonuslen);
		bcopy(data, db->db_data, drro->drr_bonuslen);
		if (ra->byteswap) {
			dmu_ot[drro->drr_bonustype].ot_byteswap(db->db_data,
			    drro->drr_bonuslen);
		}
		dmu_buf_rele(db, FTAG);
	}
	dmu_tx_commit(tx);
	return (0);
}

/* ARGSUSED */
static int
restore_freeobjects(struct restorearg *ra, objset_t *os,
    struct drr_freeobjects *drrfo)
{
	uint64_t obj;

	if (drrfo->drr_firstobj + drrfo->drr_numobjs < drrfo->drr_firstobj)
		return (EINVAL);

	for (obj = drrfo->drr_firstobj;
	    obj < drrfo->drr_firstobj + drrfo->drr_numobjs;
	    (void) dmu_object_next(os, &obj, FALSE, 0)) {
		int err;

		if (dmu_object_info(os, obj, NULL) != 0)
			continue;

		err = dmu_free_object(os, obj);
		if (err)
			return (err);
	}
	return (0);
}

static int
restore_write(struct restorearg *ra, objset_t *os,
    struct drr_write *drrw)
{
	dmu_tx_t *tx;
	void *data;
	int err;

	if (drrw->drr_offset + drrw->drr_length < drrw->drr_offset ||
	    drrw->drr_type >= DMU_OT_NUMTYPES)
		return (EINVAL);

	data = restore_read(ra, drrw->drr_length);
	if (data == NULL)
		return (ra->err);

	if (dmu_object_info(os, drrw->drr_object, NULL) != 0)
		return (EINVAL);

	tx = dmu_tx_create(os);

	dmu_tx_hold_write(tx, drrw->drr_object,
	    drrw->drr_offset, drrw->drr_length);
	err = dmu_tx_assign(tx, TXG_WAIT);
	if (err) {
		dmu_tx_abort(tx);
		return (err);
	}
	if (ra->byteswap)
		dmu_ot[drrw->drr_type].ot_byteswap(data, drrw->drr_length);
	dmu_write(os, drrw->drr_object,
	    drrw->drr_offset, drrw->drr_length, data, tx);
	dmu_tx_commit(tx);
	return (0);
}

/*
 * Handle a DRR_WRITE_BYREF record.  This record is used in dedup'ed
 * streams to refer to a copy of the data that is already on the
 * system because it came in earlier in the stream.  This function
 * finds the earlier copy of the data, and uses that copy instead of
 * data from the stream to fulfill this write.
 */
static int
restore_write_byref(struct restorearg *ra, objset_t *os,
    struct drr_write_byref *drrwbr)
{
	dmu_tx_t *tx;
	int err;
	guid_map_entry_t gmesrch;
	guid_map_entry_t *gmep;
	avl_index_t where;
	objset_t *ref_os = NULL;
	dmu_buf_t *dbp;

	if (drrwbr->drr_offset + drrwbr->drr_length < drrwbr->drr_offset)
		return (EINVAL);

	/*
	 * If the GUID of the referenced dataset is different from the
	 * GUID of the target dataset, find the referenced dataset.
	 */
	if (drrwbr->drr_toguid != drrwbr->drr_refguid) {
		gmesrch.guid = drrwbr->drr_refguid;
		if ((gmep = avl_find(&ra->guid_to_ds_map, &gmesrch,
		    &where)) == NULL) {
			return (EINVAL);
		}
		if (dmu_objset_from_ds(gmep->gme_ds, &ref_os))
			return (EINVAL);
	} else {
		ref_os = os;
	}

	if (err = dmu_buf_hold(ref_os, drrwbr->drr_refobject,
	    drrwbr->drr_refoffset, FTAG, &dbp))
		return (err);

	tx = dmu_tx_create(os);

	dmu_tx_hold_write(tx, drrwbr->drr_object,
	    drrwbr->drr_offset, drrwbr->drr_length);
	err = dmu_tx_assign(tx, TXG_WAIT);
	if (err) {
		dmu_tx_abort(tx);
		return (err);
	}
	dmu_write(os, drrwbr->drr_object,
	    drrwbr->drr_offset, drrwbr->drr_length, dbp->db_data, tx);
	dmu_buf_rele(dbp, FTAG);
	dmu_tx_commit(tx);
	return (0);
}

/* ARGSUSED */
static int
restore_free(struct restorearg *ra, objset_t *os,
    struct drr_free *drrf)
{
	int err;

	if (drrf->drr_length != -1ULL &&
	    drrf->drr_offset + drrf->drr_length < drrf->drr_offset)
		return (EINVAL);

	if (dmu_object_info(os, drrf->drr_object, NULL) != 0)
		return (EINVAL);

	err = dmu_free_long_range(os, drrf->drr_object,
	    drrf->drr_offset, drrf->drr_length);
	return (err);
}

/*
 * NB: callers *must* call dmu_recv_end() if this succeeds.
 */
int
dmu_recv_stream(dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp)
{
	struct restorearg ra = { 0 };
	dmu_replay_record_t *drr;
	objset_t *os;
	zio_cksum_t pcksum;
	guid_map_entry_t *gmep;
	int featureflags;

	if (drc->drc_drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC))
		ra.byteswap = TRUE;

	{
		/* compute checksum of drr_begin record */
		dmu_replay_record_t *drr;
		drr = kmem_zalloc(sizeof (dmu_replay_record_t), KM_SLEEP);

		drr->drr_type = DRR_BEGIN;
		drr->drr_u.drr_begin = *drc->drc_drrb;
		if (ra.byteswap) {
			fletcher_4_incremental_byteswap(drr,
			    sizeof (dmu_replay_record_t), &ra.cksum);
		} else {
			fletcher_4_incremental_native(drr,
			    sizeof (dmu_replay_record_t), &ra.cksum);
		}
		kmem_free(drr, sizeof (dmu_replay_record_t));
	}

	if (ra.byteswap) {
		struct drr_begin *drrb = drc->drc_drrb;
		drrb->drr_magic = BSWAP_64(drrb->drr_magic);
		drrb->drr_versioninfo = BSWAP_64(drrb->drr_versioninfo);
		drrb->drr_creation_time = BSWAP_64(drrb->drr_creation_time);
		drrb->drr_type = BSWAP_32(drrb->drr_type);
		drrb->drr_toguid = BSWAP_64(drrb->drr_toguid);
		drrb->drr_fromguid = BSWAP_64(drrb->drr_fromguid);
	}

	ra.vp = vp;
	ra.voff = *voffp;
	ra.bufsize = 1<<20;
	ra.buf = kmem_alloc(ra.bufsize, KM_SLEEP);

	/* these were verified in dmu_recv_begin */
	ASSERT(DMU_GET_STREAM_HDRTYPE(drc->drc_drrb->drr_versioninfo) ==
	    DMU_SUBSTREAM);
	ASSERT(drc->drc_drrb->drr_type < DMU_OST_NUMTYPES);

	/*
	 * Open the objset we are modifying.
	 */
	VERIFY(dmu_objset_from_ds(drc->drc_real_ds, &os) == 0);

	ASSERT(drc->drc_real_ds->ds_phys->ds_flags & DS_FLAG_INCONSISTENT);

	featureflags = DMU_GET_FEATUREFLAGS(drc->drc_drrb->drr_versioninfo);

	/* if this stream is dedup'ed, set up the avl tree for guid mapping */
	if (featureflags & DMU_BACKUP_FEATURE_DEDUP) {
		avl_create(&ra.guid_to_ds_map, guid_compare,
		    sizeof (guid_map_entry_t),
		    offsetof(guid_map_entry_t, avlnode));
		(void) dmu_objset_find(drc->drc_top_ds, find_ds_by_guid,
		    (void *)&ra.guid_to_ds_map,
		    DS_FIND_CHILDREN);
	}

	/*
	 * Read records and process them.
	 */
	pcksum = ra.cksum;
	while (ra.err == 0 &&
	    NULL != (drr = restore_read(&ra, sizeof (*drr)))) {
		if (issig(JUSTLOOKING) && issig(FORREAL)) {
			ra.err = EINTR;
			goto out;
		}

		if (ra.byteswap)
			backup_byteswap(drr);

		switch (drr->drr_type) {
		case DRR_OBJECT:
		{
			/*
			 * We need to make a copy of the record header,
			 * because restore_{object,write} may need to
			 * restore_read(), which will invalidate drr.
			 */
			struct drr_object drro = drr->drr_u.drr_object;
			ra.err = restore_object(&ra, os, &drro);
			break;
		}
		case DRR_FREEOBJECTS:
		{
			struct drr_freeobjects drrfo =
			    drr->drr_u.drr_freeobjects;
			ra.err = restore_freeobjects(&ra, os, &drrfo);
			break;
		}
		case DRR_WRITE:
		{
			struct drr_write drrw = drr->drr_u.drr_write;
			ra.err = restore_write(&ra, os, &drrw);
			break;
		}
		case DRR_WRITE_BYREF:
		{
			struct drr_write_byref drrwbr =
			    drr->drr_u.drr_write_byref;
			ra.err = restore_write_byref(&ra, os, &drrwbr);
			break;
		}
		case DRR_FREE:
		{
			struct drr_free drrf = drr->drr_u.drr_free;
			ra.err = restore_free(&ra, os, &drrf);
			break;
		}
		case DRR_END:
		{
			struct drr_end drre = drr->drr_u.drr_end;
			/*
			 * We compare against the *previous* checksum
			 * value, because the stored checksum is of
			 * everything before the DRR_END record.
			 */
			if (!ZIO_CHECKSUM_EQUAL(drre.drr_checksum, pcksum))
				ra.err = ECKSUM;
			goto out;
		}
		default:
			ra.err = EINVAL;
			goto out;
		}
		pcksum = ra.cksum;
	}
	ASSERT(ra.err != 0);

out:
	if (ra.err != 0) {
		/*
		 * destroy what we created, so we don't leave it in the
		 * inconsistent restoring state.
		 */
		txg_wait_synced(drc->drc_real_ds->ds_dir->dd_pool, 0);

		(void) dsl_dataset_destroy(drc->drc_real_ds, dmu_recv_tag,
		    B_FALSE);
		if (drc->drc_real_ds != drc->drc_logical_ds) {
			mutex_exit(&drc->drc_logical_ds->ds_recvlock);
			dsl_dataset_rele(drc->drc_logical_ds, dmu_recv_tag);
		}
	}

	if (featureflags & DMU_BACKUP_FEATURE_DEDUP) {
		void *cookie = NULL;

		while (gmep = avl_destroy_nodes(&ra.guid_to_ds_map, &cookie)) {
			dsl_dataset_rele(gmep->gme_ds, &ra.guid_to_ds_map);
			kmem_free(gmep, sizeof (guid_map_entry_t));
		}
		avl_destroy(&ra.guid_to_ds_map);
	}

	kmem_free(ra.buf, ra.bufsize);
	*voffp = ra.voff;
	return (ra.err);
}

struct recvendsyncarg {
	char *tosnap;
	uint64_t creation_time;
	uint64_t toguid;
};

static int
recv_end_check(void *arg1, void *arg2, dmu_tx_t *tx)
{
	dsl_dataset_t *ds = arg1;
	struct recvendsyncarg *resa = arg2;

	return (dsl_dataset_snapshot_check(ds, resa->tosnap, tx));
}

static void
recv_end_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
{
	dsl_dataset_t *ds = arg1;
	struct recvendsyncarg *resa = arg2;

	dsl_dataset_snapshot_sync(ds, resa->tosnap, cr, tx);

	/* set snapshot's creation time and guid */
	dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
	ds->ds_prev->ds_phys->ds_creation_time = resa->creation_time;
	ds->ds_prev->ds_phys->ds_guid = resa->toguid;
	ds->ds_prev->ds_phys->ds_flags &= ~DS_FLAG_INCONSISTENT;

	dmu_buf_will_dirty(ds->ds_dbuf, tx);
	ds->ds_phys->ds_flags &= ~DS_FLAG_INCONSISTENT;
}

static int
dmu_recv_existing_end(dmu_recv_cookie_t *drc)
{
	struct recvendsyncarg resa;
	dsl_dataset_t *ds = drc->drc_logical_ds;
	int err;

	/*
	 * XXX hack; seems the ds is still dirty and dsl_pool_zil_clean()
	 * expects it to have a ds_user_ptr (and zil), but clone_swap()
	 * can close it.
	 */
	txg_wait_synced(ds->ds_dir->dd_pool, 0);

	if (dsl_dataset_tryown(ds, FALSE, dmu_recv_tag)) {
		err = dsl_dataset_clone_swap(drc->drc_real_ds, ds,
		    drc->drc_force);
		if (err)
			goto out;
	} else {
		mutex_exit(&ds->ds_recvlock);
		dsl_dataset_rele(ds, dmu_recv_tag);
		(void) dsl_dataset_destroy(drc->drc_real_ds, dmu_recv_tag,
		    B_FALSE);
		return (EBUSY);
	}

	resa.creation_time = drc->drc_drrb->drr_creation_time;
	resa.toguid = drc->drc_drrb->drr_toguid;
	resa.tosnap = drc->drc_tosnap;

	err = dsl_sync_task_do(ds->ds_dir->dd_pool,
	    recv_end_check, recv_end_sync, ds, &resa, 3);
	if (err) {
		/* swap back */
		(void) dsl_dataset_clone_swap(drc->drc_real_ds, ds, B_TRUE);
	}

out:
	mutex_exit(&ds->ds_recvlock);
	dsl_dataset_disown(ds, dmu_recv_tag);
	(void) dsl_dataset_destroy(drc->drc_real_ds, dmu_recv_tag, B_FALSE);
	return (err);
}

static int
dmu_recv_new_end(dmu_recv_cookie_t *drc)
{
	struct recvendsyncarg resa;
	dsl_dataset_t *ds = drc->drc_logical_ds;
	int err;

	/*
	 * XXX hack; seems the ds is still dirty and dsl_pool_zil_clean()
	 * expects it to have a ds_user_ptr (and zil), but clone_swap()
	 * can close it.
	 */
	txg_wait_synced(ds->ds_dir->dd_pool, 0);

	resa.creation_time = drc->drc_drrb->drr_creation_time;
	resa.toguid = drc->drc_drrb->drr_toguid;
	resa.tosnap = drc->drc_tosnap;

	err = dsl_sync_task_do(ds->ds_dir->dd_pool,
	    recv_end_check, recv_end_sync, ds, &resa, 3);
	if (err) {
		/* clean up the fs we just recv'd into */
		(void) dsl_dataset_destroy(ds, dmu_recv_tag, B_FALSE);
	} else {
		/* release the hold from dmu_recv_begin */
		dsl_dataset_disown(ds, dmu_recv_tag);
	}
	return (err);
}

int
dmu_recv_end(dmu_recv_cookie_t *drc)
{
	if (drc->drc_logical_ds != drc->drc_real_ds)
		return (dmu_recv_existing_end(drc));
	else
		return (dmu_recv_new_end(drc));
}