/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2011 by Delphix. All rights reserved.
 */
/*
 * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
 * Copyright (c) 2011 by Delphix. All rights reserved.
 */

#include <sys/dmu.h>
#include <sys/dmu_impl.h>
#include <sys/dmu_tx.h>
#include <sys/dbuf.h>
#include <sys/dnode.h>
#include <sys/zfs_context.h>
#include <sys/dmu_objset.h>
#include <sys/dmu_traverse.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_dir.h>
#include <sys/dsl_prop.h>
#include <sys/dsl_pool.h>
#include <sys/dsl_synctask.h>
#include <sys/zfs_ioctl.h>
#include <sys/zap.h>
#include <sys/zio_checksum.h>
#include <sys/zfs_znode.h>
#include <zfs_fletcher.h>
#include <sys/avl.h>
#include <sys/ddt.h>
#include <sys/zfs_onexit.h>

/* Set this tunable to TRUE to replace corrupt data with 0x2f5baddb10c */
int zfs_send_corrupt_data = B_FALSE;

static char *dmu_recv_tag = "dmu_recv_tag";

/*
 * The list of data whose inclusion in a send stream can be pending from
 * one call to backup_cb to another.  Multiple calls to dump_free() and
 * dump_freeobjects() can be aggregated into a single DRR_FREE or
 * DRR_FREEOBJECTS replay record.
 */
typedef enum {
	PENDING_NONE,
	PENDING_FREE,
	PENDING_FREEOBJECTS
} pendop_t;

struct backuparg {
	dmu_replay_record_t *drr;
	vnode_t *vp;
	offset_t *off;
	objset_t *os;
	zio_cksum_t zc;
	uint64_t toguid;
	int err;
	pendop_t pending_op;
};

static int
dump_bytes(struct backuparg *ba, void *buf, int len)
{
	ssize_t resid; /* have to get resid to get detailed errno */
	ASSERT3U(len % 8, ==, 0);

	fletcher_4_incremental_native(buf, len, &ba->zc);
	ba->err = vn_rdwr(UIO_WRITE, ba->vp,
	    (caddr_t)buf, len,
	    0, UIO_SYSSPACE, FAPPEND, RLIM64_INFINITY, CRED(), &resid);
	*ba->off += len;
	return (ba->err);
}
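/*
 * Queue a DRR_FREE record for the given range.  Contiguous frees of the same
 * object are merged into the pending record; the record is only written to
 * the stream when a non-mergeable operation arrives, or immediately when
 * length is -1ULL (free to end of object).
 */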
static int
dump_free(struct backuparg *ba, uint64_t object, uint64_t offset,
    uint64_t length)
{
	struct drr_free *drrf = &(ba->drr->drr_u.drr_free);

	/*
	 * If there is a pending op, but it's not PENDING_FREE, push it out,
	 * since free block aggregation can only be done for blocks of the
	 * same type (i.e., DRR_FREE records can only be aggregated with
	 * other DRR_FREE records.  DRR_FREEOBJECTS records can only be
	 * aggregated with other DRR_FREEOBJECTS records).
	 */
	if (ba->pending_op != PENDING_NONE && ba->pending_op != PENDING_FREE) {
		if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)) != 0)
			return (EINTR);
		ba->pending_op = PENDING_NONE;
	}

	if (ba->pending_op == PENDING_FREE) {
		/*
		 * There should never be a PENDING_FREE if length is -1
		 * (because dump_dnode is the only place where this
		 * function is called with a -1, and only after flushing
		 * any pending record).
		 */
		ASSERT(length != -1ULL);
		/*
		 * Check to see whether this free block can be aggregated
		 * with pending one.
		 */
		if (drrf->drr_object == object && drrf->drr_offset +
		    drrf->drr_length == offset) {
			drrf->drr_length += length;
			return (0);
		} else {
			/* not a continuation.  Push out pending record */
			if (dump_bytes(ba, ba->drr,
			    sizeof (dmu_replay_record_t)) != 0)
				return (EINTR);
			ba->pending_op = PENDING_NONE;
		}
	}
	/* create a FREE record and make it pending */
	bzero(ba->drr, sizeof (dmu_replay_record_t));
	ba->drr->drr_type = DRR_FREE;
	drrf->drr_object = object;
	drrf->drr_offset = offset;
	drrf->drr_length = length;
	drrf->drr_toguid = ba->toguid;
	if (length == -1ULL) {
		if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)) != 0)
			return (EINTR);
	} else {
		ba->pending_op = PENDING_FREE;
	}

	return (0);
}
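/*
 * Emit a DRR_WRITE record followed by the block's payload.  Any pending free
 * aggregation is flushed first, and the block pointer's checksum type, sizes,
 * compression and checksum are copied into the record so that dedup-capable
 * receivers can identify the data by reference.
 */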
static int
dump_data(struct backuparg *ba, dmu_object_type_t type,
    uint64_t object, uint64_t offset, int blksz, const blkptr_t *bp, void *data)
{
	struct drr_write *drrw = &(ba->drr->drr_u.drr_write);

	/*
	 * If there is any kind of pending aggregation (currently either
	 * a grouping of free objects or free blocks), push it out to
	 * the stream, since aggregation can't be done across operations
	 * of different types.
	 */
	if (ba->pending_op != PENDING_NONE) {
		if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)) != 0)
			return (EINTR);
		ba->pending_op = PENDING_NONE;
	}
	/* write a DATA record */
	bzero(ba->drr, sizeof (dmu_replay_record_t));
	ba->drr->drr_type = DRR_WRITE;
	drrw->drr_object = object;
	drrw->drr_type = type;
	drrw->drr_offset = offset;
	drrw->drr_length = blksz;
	drrw->drr_toguid = ba->toguid;
	drrw->drr_checksumtype = BP_GET_CHECKSUM(bp);
	if (zio_checksum_table[drrw->drr_checksumtype].ci_dedup)
		drrw->drr_checksumflags |= DRR_CHECKSUM_DEDUP;
	DDK_SET_LSIZE(&drrw->drr_key, BP_GET_LSIZE(bp));
	DDK_SET_PSIZE(&drrw->drr_key, BP_GET_PSIZE(bp));
	DDK_SET_COMPRESS(&drrw->drr_key, BP_GET_COMPRESS(bp));
	drrw->drr_key.ddk_cksum = bp->blk_cksum;

	if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)) != 0)
		return (EINTR);
	if (dump_bytes(ba, data, blksz) != 0)
		return (EINTR);
	return (0);
}

static int
dump_spill(struct backuparg *ba, uint64_t object, int blksz, void *data)
{
	struct drr_spill *drrs = &(ba->drr->drr_u.drr_spill);

	if (ba->pending_op != PENDING_NONE) {
		if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)) != 0)
			return (EINTR);
		ba->pending_op = PENDING_NONE;
	}

	/* write a SPILL record */
	bzero(ba->drr, sizeof (dmu_replay_record_t));
	ba->drr->drr_type = DRR_SPILL;
	drrs->drr_object = object;
	drrs->drr_length = blksz;
	drrs->drr_toguid = ba->toguid;

	if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)))
		return (EINTR);
	if (dump_bytes(ba, data, blksz))
		return (EINTR);
	return (0);
}

static int
dump_freeobjects(struct backuparg *ba, uint64_t firstobj, uint64_t numobjs)
{
	struct drr_freeobjects *drrfo = &(ba->drr->drr_u.drr_freeobjects);

	/*
	 * If there is a pending op, but it's not PENDING_FREEOBJECTS,
	 * push it out, since free block aggregation can only be done for
	 * blocks of the same type (i.e., DRR_FREE records can only be
	 * aggregated with other DRR_FREE records.  DRR_FREEOBJECTS records
	 * can only be aggregated with other DRR_FREEOBJECTS records).
	 */
	if (ba->pending_op != PENDING_NONE &&
	    ba->pending_op != PENDING_FREEOBJECTS) {
		if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)) != 0)
			return (EINTR);
		ba->pending_op = PENDING_NONE;
	}
	if (ba->pending_op == PENDING_FREEOBJECTS) {
		/*
		 * See whether this free object array can be aggregated
		 * with pending one
		 */
		if (drrfo->drr_firstobj + drrfo->drr_numobjs == firstobj) {
			drrfo->drr_numobjs += numobjs;
			return (0);
		} else {
			/* can't be aggregated.  Push out pending record */
			if (dump_bytes(ba, ba->drr,
			    sizeof (dmu_replay_record_t)) != 0)
				return (EINTR);
			ba->pending_op = PENDING_NONE;
		}
	}

	/* write a FREEOBJECTS record */
	bzero(ba->drr, sizeof (dmu_replay_record_t));
	ba->drr->drr_type = DRR_FREEOBJECTS;
	drrfo->drr_firstobj = firstobj;
	drrfo->drr_numobjs = numobjs;
	drrfo->drr_toguid = ba->toguid;

	ba->pending_op = PENDING_FREEOBJECTS;

	return (0);
}
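/*
 * Emit a DRR_OBJECT record describing the dnode, followed by its bonus
 * buffer, then free everything beyond the object's last block.  A freed or
 * DMU_OT_NONE dnode is sent as a one-object DRR_FREEOBJECTS instead.
 */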
static int
dump_dnode(struct backuparg *ba, uint64_t object, dnode_phys_t *dnp)
{
	struct drr_object *drro = &(ba->drr->drr_u.drr_object);

	if (dnp == NULL || dnp->dn_type == DMU_OT_NONE)
		return (dump_freeobjects(ba, object, 1));

	if (ba->pending_op != PENDING_NONE) {
		if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)) != 0)
			return (EINTR);
		ba->pending_op = PENDING_NONE;
	}

	/* write an OBJECT record */
	bzero(ba->drr, sizeof (dmu_replay_record_t));
	ba->drr->drr_type = DRR_OBJECT;
	drro->drr_object = object;
	drro->drr_type = dnp->dn_type;
	drro->drr_bonustype = dnp->dn_bonustype;
	drro->drr_blksz = dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT;
	drro->drr_bonuslen = dnp->dn_bonuslen;
	drro->drr_checksumtype = dnp->dn_checksum;
	drro->drr_compress = dnp->dn_compress;
	drro->drr_toguid = ba->toguid;

	if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)) != 0)
		return (EINTR);

	if (dump_bytes(ba, DN_BONUS(dnp), P2ROUNDUP(dnp->dn_bonuslen, 8)) != 0)
		return (EINTR);

	/* free anything past the end of the file */
	if (dump_free(ba, object, (dnp->dn_maxblkid + 1) *
	    (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT), -1ULL))
		return (EINTR);
	if (ba->err)
		return (EINTR);
	return (0);
}

#define	BP_SPAN(dnp, level) \
	(((uint64_t)dnp->dn_datablkszsec) << (SPA_MINBLOCKSHIFT + \
	(level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT)))
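/*
 * Callback invoked by traverse_dataset() for each block pointer in the
 * snapshot: holes become DRR_FREE/DRR_FREEOBJECTS records, dnode blocks
 * become DRR_OBJECT records, spill blocks become DRR_SPILL records, and
 * ordinary level-0 blocks become DRR_WRITE records.
 */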
/* ARGSUSED */
static int
backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, arc_buf_t *pbuf,
    const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg)
{
	struct backuparg *ba = arg;
	dmu_object_type_t type = bp ? BP_GET_TYPE(bp) : DMU_OT_NONE;
	int err = 0;

	if (issig(JUSTLOOKING) && issig(FORREAL))
		return (EINTR);

	if (zb->zb_object != DMU_META_DNODE_OBJECT &&
	    DMU_OBJECT_IS_SPECIAL(zb->zb_object)) {
		return (0);
	} else if (bp == NULL && zb->zb_object == DMU_META_DNODE_OBJECT) {
		uint64_t span = BP_SPAN(dnp, zb->zb_level);
		uint64_t dnobj = (zb->zb_blkid * span) >> DNODE_SHIFT;
		err = dump_freeobjects(ba, dnobj, span >> DNODE_SHIFT);
	} else if (bp == NULL) {
		uint64_t span = BP_SPAN(dnp, zb->zb_level);
		err = dump_free(ba, zb->zb_object, zb->zb_blkid * span, span);
	} else if (zb->zb_level > 0 || type == DMU_OT_OBJSET) {
		return (0);
	} else if (type == DMU_OT_DNODE) {
		dnode_phys_t *blk;
		int i;
		int blksz = BP_GET_LSIZE(bp);
		uint32_t aflags = ARC_WAIT;
		arc_buf_t *abuf;

		if (dsl_read(NULL, spa, bp, pbuf,
		    arc_getbuf_func, &abuf, ZIO_PRIORITY_ASYNC_READ,
		    ZIO_FLAG_CANFAIL, &aflags, zb) != 0)
			return (EIO);

		blk = abuf->b_data;
		for (i = 0; i < blksz >> DNODE_SHIFT; i++) {
			uint64_t dnobj = (zb->zb_blkid <<
			    (DNODE_BLOCK_SHIFT - DNODE_SHIFT)) + i;
			err = dump_dnode(ba, dnobj, blk+i);
			if (err)
				break;
		}
		(void) arc_buf_remove_ref(abuf, &abuf);
	} else if (type == DMU_OT_SA) {
		uint32_t aflags = ARC_WAIT;
		arc_buf_t *abuf;
		int blksz = BP_GET_LSIZE(bp);

		if (arc_read_nolock(NULL, spa, bp,
		    arc_getbuf_func, &abuf, ZIO_PRIORITY_ASYNC_READ,
		    ZIO_FLAG_CANFAIL, &aflags, zb) != 0)
			return (EIO);

		err = dump_spill(ba, zb->zb_object, blksz, abuf->b_data);
		(void) arc_buf_remove_ref(abuf, &abuf);
	} else { /* it's a level-0 block of a regular object */
		uint32_t aflags = ARC_WAIT;
		arc_buf_t *abuf;
		int blksz = BP_GET_LSIZE(bp);

		if (dsl_read(NULL, spa, bp, pbuf,
		    arc_getbuf_func, &abuf, ZIO_PRIORITY_ASYNC_READ,
		    ZIO_FLAG_CANFAIL, &aflags, zb) != 0) {
			if (zfs_send_corrupt_data) {
				/* Send a block filled with 0x"zfs badd block" */
				abuf = arc_buf_alloc(spa, blksz, &abuf,
				    ARC_BUFC_DATA);
				uint64_t *ptr;
				for (ptr = abuf->b_data;
				    (char *)ptr < (char *)abuf->b_data + blksz;
				    ptr++)
					*ptr = 0x2f5baddb10c;
			} else {
				return (EIO);
			}
		}

		err = dump_data(ba, type, zb->zb_object, zb->zb_blkid * blksz,
		    blksz, bp, abuf->b_data);
		(void) arc_buf_remove_ref(abuf, &abuf);
	}

	ASSERT(err == 0 || err == EINTR);
	return (err);
}
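/*
 * Generate a send stream for the snapshot 'tosnap', optionally incremental
 * from 'fromsnap' (or from the origin snapshot when 'fromorigin' is set),
 * and write it to 'vp'.  The stream consists of a DRR_BEGIN record, the
 * records produced by backup_cb(), and a DRR_END record carrying the
 * running fletcher-4 checksum.
 */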
int
dmu_sendbackup(objset_t *tosnap, objset_t *fromsnap, boolean_t fromorigin,
    vnode_t *vp, offset_t *off)
{
	dsl_dataset_t *ds = tosnap->os_dsl_dataset;
	dsl_dataset_t *fromds = fromsnap ? fromsnap->os_dsl_dataset : NULL;
	dmu_replay_record_t *drr;
	struct backuparg ba;
	int err;
	uint64_t fromtxg = 0;

	/* tosnap must be a snapshot */
	if (ds->ds_phys->ds_next_snap_obj == 0)
		return (EINVAL);

	/* fromsnap must be an earlier snapshot from the same fs as tosnap */
	if (fromds && (ds->ds_dir != fromds->ds_dir ||
	    fromds->ds_phys->ds_creation_txg >= ds->ds_phys->ds_creation_txg))
		return (EXDEV);

	if (fromorigin) {
		dsl_pool_t *dp = ds->ds_dir->dd_pool;

		if (fromsnap)
			return (EINVAL);

		if (dsl_dir_is_clone(ds->ds_dir)) {
			rw_enter(&dp->dp_config_rwlock, RW_READER);
			err = dsl_dataset_hold_obj(dp,
			    ds->ds_dir->dd_phys->dd_origin_obj, FTAG, &fromds);
			rw_exit(&dp->dp_config_rwlock);
			if (err)
				return (err);
		} else {
			fromorigin = B_FALSE;
		}
	}

	drr = kmem_zalloc(sizeof (dmu_replay_record_t), KM_SLEEP);
	drr->drr_type = DRR_BEGIN;
	drr->drr_u.drr_begin.drr_magic = DMU_BACKUP_MAGIC;
	DMU_SET_STREAM_HDRTYPE(drr->drr_u.drr_begin.drr_versioninfo,
	    DMU_SUBSTREAM);

#ifdef _KERNEL
	if (dmu_objset_type(tosnap) == DMU_OST_ZFS) {
		uint64_t version;
		if (zfs_get_zplprop(tosnap, ZFS_PROP_VERSION, &version) != 0)
			return (EINVAL);
		if (version == ZPL_VERSION_SA) {
			DMU_SET_FEATUREFLAGS(
			    drr->drr_u.drr_begin.drr_versioninfo,
			    DMU_BACKUP_FEATURE_SA_SPILL);
		}
	}
#endif

	drr->drr_u.drr_begin.drr_creation_time =
	    ds->ds_phys->ds_creation_time;
	drr->drr_u.drr_begin.drr_type = tosnap->os_phys->os_type;
	if (fromorigin)
		drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_CLONE;
	drr->drr_u.drr_begin.drr_toguid = ds->ds_phys->ds_guid;
	if (ds->ds_phys->ds_flags & DS_FLAG_CI_DATASET)
		drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_CI_DATA;

	if (fromds)
		drr->drr_u.drr_begin.drr_fromguid = fromds->ds_phys->ds_guid;
	dsl_dataset_name(ds, drr->drr_u.drr_begin.drr_toname);

	if (fromds)
		fromtxg = fromds->ds_phys->ds_creation_txg;
	if (fromorigin)
		dsl_dataset_rele(fromds, FTAG);

	ba.drr = drr;
	ba.vp = vp;
	ba.os = tosnap;
	ba.off = off;
	ba.toguid = ds->ds_phys->ds_guid;
	ZIO_SET_CHECKSUM(&ba.zc, 0, 0, 0, 0);
	ba.pending_op = PENDING_NONE;

	if (dump_bytes(&ba, drr, sizeof (dmu_replay_record_t)) != 0) {
		kmem_free(drr, sizeof (dmu_replay_record_t));
		return (ba.err);
	}

	err = traverse_dataset(ds, fromtxg, TRAVERSE_PRE | TRAVERSE_PREFETCH,
	    backup_cb, &ba);

	if (ba.pending_op != PENDING_NONE)
		if (dump_bytes(&ba, drr, sizeof (dmu_replay_record_t)) != 0)
			err = EINTR;

	if (err) {
		if (err == EINTR && ba.err)
			err = ba.err;
		kmem_free(drr, sizeof (dmu_replay_record_t));
		return (err);
	}

	bzero(drr, sizeof (dmu_replay_record_t));
	drr->drr_type = DRR_END;
	drr->drr_u.drr_end.drr_checksum = ba.zc;
	drr->drr_u.drr_end.drr_toguid = ba.toguid;

	if (dump_bytes(&ba, drr, sizeof (dmu_replay_record_t)) != 0) {
		kmem_free(drr, sizeof (dmu_replay_record_t));
		return (ba.err);
	}

	kmem_free(drr, sizeof (dmu_replay_record_t));

	return (0);
}
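/*
 * Estimate the size of the stream that dmu_sendbackup() would generate for
 * the same arguments, without traversing the dataset.  The estimate starts
 * from the uncompressed space accounting (or the space written since
 * 'fromsnap') and is then adjusted for indirect blocks and per-block record
 * headers.
 */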
int
dmu_send_estimate(objset_t *tosnap, objset_t *fromsnap, boolean_t fromorigin,
    uint64_t *sizep)
{
	dsl_dataset_t *ds = tosnap->os_dsl_dataset;
	dsl_dataset_t *fromds = fromsnap ? fromsnap->os_dsl_dataset : NULL;
	dsl_pool_t *dp = ds->ds_dir->dd_pool;
	int err;
	uint64_t size;

	/* tosnap must be a snapshot */
	if (ds->ds_phys->ds_next_snap_obj == 0)
		return (EINVAL);

	/* fromsnap must be an earlier snapshot from the same fs as tosnap */
	if (fromds && (ds->ds_dir != fromds->ds_dir ||
	    fromds->ds_phys->ds_creation_txg >= ds->ds_phys->ds_creation_txg))
		return (EXDEV);

	if (fromorigin) {
		if (fromsnap)
			return (EINVAL);

		if (dsl_dir_is_clone(ds->ds_dir)) {
			rw_enter(&dp->dp_config_rwlock, RW_READER);
			err = dsl_dataset_hold_obj(dp,
			    ds->ds_dir->dd_phys->dd_origin_obj, FTAG, &fromds);
			rw_exit(&dp->dp_config_rwlock);
			if (err)
				return (err);
		} else {
			fromorigin = B_FALSE;
		}
	}

	/* Get uncompressed size estimate of changed data. */
	if (fromds == NULL) {
		size = ds->ds_phys->ds_uncompressed_bytes;
	} else {
		uint64_t used, comp;
		err = dsl_dataset_space_written(fromds, ds,
		    &used, &comp, &size);
		if (fromorigin)
			dsl_dataset_rele(fromds, FTAG);
		if (err)
			return (err);
	}

	/*
	 * Assume that space (both on-disk and in-stream) is dominated by
	 * data.  We will adjust for indirect blocks and the copies property,
	 * but ignore per-object space used (eg, dnodes and DRR_OBJECT records).
	 */

	/*
	 * Subtract out approximate space used by indirect blocks.
	 * Assume most space is used by data blocks (non-indirect, non-dnode).
	 * Assume all blocks are recordsize.  Assume ditto blocks and
	 * internal fragmentation counter out compression.
	 *
	 * Therefore, space used by indirect blocks is sizeof(blkptr_t) per
	 * block, which we observe in practice.
	 */
	uint64_t recordsize;
	rw_enter(&dp->dp_config_rwlock, RW_READER);
	err = dsl_prop_get_ds(ds, "recordsize",
	    sizeof (recordsize), 1, &recordsize, NULL);
	rw_exit(&dp->dp_config_rwlock);
	if (err)
		return (err);
	size -= size / recordsize * sizeof (blkptr_t);

	/* Add in the space for the record associated with each block. */
	size += size / recordsize * sizeof (dmu_replay_record_t);

	*sizep = size;

	return (0);
}

struct recvbeginsyncarg {
	const char *tofs;
	const char *tosnap;
	dsl_dataset_t *origin;
	uint64_t fromguid;
	dmu_objset_type_t type;
	void *tag;
	boolean_t force;
	uint64_t dsflags;
	char clonelastname[MAXNAMELEN];
	dsl_dataset_t *ds; /* the ds to recv into; returned from the syncfunc */
	cred_t *cr;
};
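/*
 * Check/sync pair for receiving a full stream into a dataset that does not
 * exist yet: the check verifies that the target name is free and that any
 * requested clone origin is a snapshot in the same pool with the expected
 * guid; the sync task creates and owns the new, inconsistent dataset.
 */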
/* ARGSUSED */
static int
recv_new_check(void *arg1, void *arg2, dmu_tx_t *tx)
{
	dsl_dir_t *dd = arg1;
	struct recvbeginsyncarg *rbsa = arg2;
	objset_t *mos = dd->dd_pool->dp_meta_objset;
	uint64_t val;
	int err;

	err = zap_lookup(mos, dd->dd_phys->dd_child_dir_zapobj,
	    strrchr(rbsa->tofs, '/') + 1, sizeof (uint64_t), 1, &val);

	if (err != ENOENT)
		return (err ? err : EEXIST);

	if (rbsa->origin) {
		/* make sure it's a snap in the same pool */
		if (rbsa->origin->ds_dir->dd_pool != dd->dd_pool)
			return (EXDEV);
		if (!dsl_dataset_is_snapshot(rbsa->origin))
			return (EINVAL);
		if (rbsa->origin->ds_phys->ds_guid != rbsa->fromguid)
			return (ENODEV);
	}

	return (0);
}

static void
recv_new_sync(void *arg1, void *arg2, dmu_tx_t *tx)
{
	dsl_dir_t *dd = arg1;
	struct recvbeginsyncarg *rbsa = arg2;
	uint64_t flags = DS_FLAG_INCONSISTENT | rbsa->dsflags;
	uint64_t dsobj;

	/* Create and open new dataset. */
	dsobj = dsl_dataset_create_sync(dd, strrchr(rbsa->tofs, '/') + 1,
	    rbsa->origin, flags, rbsa->cr, tx);
	VERIFY(0 == dsl_dataset_own_obj(dd->dd_pool, dsobj,
	    B_TRUE, dmu_recv_tag, &rbsa->ds));

	if (rbsa->origin == NULL) {
		(void) dmu_objset_create_impl(dd->dd_pool->dp_spa,
		    rbsa->ds, &rbsa->ds->ds_phys->ds_bp, rbsa->type, tx);
	}

	spa_history_log_internal(LOG_DS_REPLAY_FULL_SYNC,
	    dd->dd_pool->dp_spa, tx, "dataset = %lld", dsobj);
}

/* ARGSUSED */
static int
recv_existing_check(void *arg1, void *arg2, dmu_tx_t *tx)
{
	dsl_dataset_t *ds = arg1;
	struct recvbeginsyncarg *rbsa = arg2;
	int err;
	uint64_t val;

	/* must not have any changes since most recent snapshot */
	if (!rbsa->force && dsl_dataset_modified_since_lastsnap(ds))
		return (ETXTBSY);

	/* new snapshot name must not exist */
	err = zap_lookup(ds->ds_dir->dd_pool->dp_meta_objset,
	    ds->ds_phys->ds_snapnames_zapobj, rbsa->tosnap, 8, 1, &val);
	if (err == 0)
		return (EEXIST);
	if (err != ENOENT)
		return (err);

	if (rbsa->fromguid) {
		/* if incremental, most recent snapshot must match fromguid */
		if (ds->ds_prev == NULL)
			return (ENODEV);

		/*
		 * most recent snapshot must match fromguid, or there are no
		 * changes since the fromguid one
		 */
		if (ds->ds_prev->ds_phys->ds_guid != rbsa->fromguid) {
			uint64_t birth = ds->ds_prev->ds_phys->ds_bp.blk_birth;
			uint64_t obj = ds->ds_prev->ds_phys->ds_prev_snap_obj;
			while (obj != 0) {
				dsl_dataset_t *snap;
				err = dsl_dataset_hold_obj(ds->ds_dir->dd_pool,
				    obj, FTAG, &snap);
				if (err)
					return (ENODEV);
				if (snap->ds_phys->ds_creation_txg < birth) {
					dsl_dataset_rele(snap, FTAG);
					return (ENODEV);
				}
				if (snap->ds_phys->ds_guid == rbsa->fromguid) {
					dsl_dataset_rele(snap, FTAG);
					break; /* it's ok */
				}
				obj = snap->ds_phys->ds_prev_snap_obj;
				dsl_dataset_rele(snap, FTAG);
			}
			if (obj == 0)
				return (ENODEV);
		}
	} else {
		/* if full, most recent snapshot must be $ORIGIN */
		if (ds->ds_phys->ds_prev_snap_txg >= TXG_INITIAL)
			return (ENODEV);
	}

	/* temporary clone name must not exist */
	err = zap_lookup(ds->ds_dir->dd_pool->dp_meta_objset,
	    ds->ds_dir->dd_phys->dd_child_dir_zapobj,
	    rbsa->clonelastname, 8, 1, &val);
	if (err == 0)
		return (EEXIST);
	if (err != ENOENT)
		return (err);

	return (0);
}
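/*
 * Sync task for receiving into an existing filesystem: create and own a
 * temporary clone of the most recent snapshot.  The stream is restored into
 * the clone and swapped into place at dmu_recv_end() time.
 */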
/* ARGSUSED */
static void
recv_existing_sync(void *arg1, void *arg2, dmu_tx_t *tx)
{
	dsl_dataset_t *ohds = arg1;
	struct recvbeginsyncarg *rbsa = arg2;
	dsl_pool_t *dp = ohds->ds_dir->dd_pool;
	dsl_dataset_t *cds;
	uint64_t flags = DS_FLAG_INCONSISTENT | rbsa->dsflags;
	uint64_t dsobj;

	/* create and open the temporary clone */
	dsobj = dsl_dataset_create_sync(ohds->ds_dir, rbsa->clonelastname,
	    ohds->ds_prev, flags, rbsa->cr, tx);
	VERIFY(0 == dsl_dataset_own_obj(dp, dsobj, B_TRUE, dmu_recv_tag, &cds));

	/*
	 * If we actually created a non-clone, we need to create the
	 * objset in our new dataset.
	 */
	if (BP_IS_HOLE(dsl_dataset_get_blkptr(cds))) {
		(void) dmu_objset_create_impl(dp->dp_spa,
		    cds, dsl_dataset_get_blkptr(cds), rbsa->type, tx);
	}

	rbsa->ds = cds;

	spa_history_log_internal(LOG_DS_REPLAY_INC_SYNC,
	    dp->dp_spa, tx, "dataset = %lld", dsobj);
}

static boolean_t
dmu_recv_verify_features(dsl_dataset_t *ds, struct drr_begin *drrb)
{
	int featureflags;

	featureflags = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo);

	/* Verify pool version supports SA if SA_SPILL feature set */
	return ((featureflags & DMU_BACKUP_FEATURE_SA_SPILL) &&
	    (spa_version(dsl_dataset_get_spa(ds)) < SPA_VERSION_SA));
}
/*
 * NB: callers *MUST* call dmu_recv_stream() if dmu_recv_begin()
 * succeeds; otherwise we will leak the holds on the datasets.
 */
int
dmu_recv_begin(char *tofs, char *tosnap, char *top_ds, struct drr_begin *drrb,
    boolean_t force, objset_t *origin, dmu_recv_cookie_t *drc)
{
	int err = 0;
	boolean_t byteswap;
	struct recvbeginsyncarg rbsa = { 0 };
	uint64_t versioninfo;
	int flags;
	dsl_dataset_t *ds;

	if (drrb->drr_magic == DMU_BACKUP_MAGIC)
		byteswap = FALSE;
	else if (drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC))
		byteswap = TRUE;
	else
		return (EINVAL);

	rbsa.tofs = tofs;
	rbsa.tosnap = tosnap;
	rbsa.origin = origin ? origin->os_dsl_dataset : NULL;
	rbsa.fromguid = drrb->drr_fromguid;
	rbsa.type = drrb->drr_type;
	rbsa.tag = FTAG;
	rbsa.dsflags = 0;
	rbsa.cr = CRED();
	versioninfo = drrb->drr_versioninfo;
	flags = drrb->drr_flags;

	if (byteswap) {
		rbsa.type = BSWAP_32(rbsa.type);
		rbsa.fromguid = BSWAP_64(rbsa.fromguid);
		versioninfo = BSWAP_64(versioninfo);
		flags = BSWAP_32(flags);
	}

	if (DMU_GET_STREAM_HDRTYPE(versioninfo) == DMU_COMPOUNDSTREAM ||
	    rbsa.type >= DMU_OST_NUMTYPES ||
	    ((flags & DRR_FLAG_CLONE) && origin == NULL))
		return (EINVAL);

	if (flags & DRR_FLAG_CI_DATA)
		rbsa.dsflags = DS_FLAG_CI_DATASET;

	bzero(drc, sizeof (dmu_recv_cookie_t));
	drc->drc_drrb = drrb;
	drc->drc_tosnap = tosnap;
	drc->drc_top_ds = top_ds;
	drc->drc_force = force;

	/*
	 * Process the begin in syncing context.
	 */

	/* open the dataset we are logically receiving into */
	err = dsl_dataset_hold(tofs, dmu_recv_tag, &ds);
	if (err == 0) {
		if (dmu_recv_verify_features(ds, drrb)) {
			dsl_dataset_rele(ds, dmu_recv_tag);
			return (ENOTSUP);
		}
		/* target fs already exists; recv into temp clone */

		/* Can't recv a clone into an existing fs */
		if (flags & DRR_FLAG_CLONE) {
			dsl_dataset_rele(ds, dmu_recv_tag);
			return (EINVAL);
		}

		/* must not have an incremental recv already in progress */
		if (!mutex_tryenter(&ds->ds_recvlock)) {
			dsl_dataset_rele(ds, dmu_recv_tag);
			return (EBUSY);
		}

		/* tmp clone name is: tofs/%tosnap" */
		(void) snprintf(rbsa.clonelastname, sizeof (rbsa.clonelastname),
		    "%%%s", tosnap);
		rbsa.force = force;
		err = dsl_sync_task_do(ds->ds_dir->dd_pool,
		    recv_existing_check, recv_existing_sync, ds, &rbsa, 5);
		if (err) {
			mutex_exit(&ds->ds_recvlock);
			dsl_dataset_rele(ds, dmu_recv_tag);
			return (err);
		}
		drc->drc_logical_ds = ds;
		drc->drc_real_ds = rbsa.ds;
	} else if (err == ENOENT) {
		/* target fs does not exist; must be a full backup or clone */
		char *cp;

		/*
		 * If it's a non-clone incremental, we are missing the
		 * target fs, so fail the recv.
		 */
		if (rbsa.fromguid && !(flags & DRR_FLAG_CLONE))
			return (ENOENT);

		/* Open the parent of tofs */
		cp = strrchr(tofs, '/');
		*cp = '\0';
		err = dsl_dataset_hold(tofs, FTAG, &ds);
		*cp = '/';
		if (err)
			return (err);

		if (dmu_recv_verify_features(ds, drrb)) {
			dsl_dataset_rele(ds, FTAG);
			return (ENOTSUP);
		}

		err = dsl_sync_task_do(ds->ds_dir->dd_pool,
		    recv_new_check, recv_new_sync, ds->ds_dir, &rbsa, 5);
		dsl_dataset_rele(ds, FTAG);
		if (err)
			return (err);
		drc->drc_logical_ds = drc->drc_real_ds = rbsa.ds;
		drc->drc_newfs = B_TRUE;
	}

	return (err);
}
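/*
 * State carried through dmu_recv_stream(): the source vnode and offset, a
 * staging buffer for incoming records, the running stream checksum, and
 * (for dedup'ed streams) the guid-to-dataset map.
 */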
struct restorearg {
	int err;
	int byteswap;
	vnode_t *vp;
	char *buf;
	uint64_t voff;
	int bufsize; /* amount of memory allocated for buf */
	zio_cksum_t cksum;
	avl_tree_t *guid_to_ds_map;
};

typedef struct guid_map_entry {
	uint64_t guid;
	dsl_dataset_t *gme_ds;
	avl_node_t avlnode;
} guid_map_entry_t;

static int
guid_compare(const void *arg1, const void *arg2)
{
	const guid_map_entry_t *gmep1 = arg1;
	const guid_map_entry_t *gmep2 = arg2;

	if (gmep1->guid < gmep2->guid)
		return (-1);
	else if (gmep1->guid > gmep2->guid)
		return (1);
	return (0);
}

static void
free_guid_map_onexit(void *arg)
{
	avl_tree_t *ca = arg;
	void *cookie = NULL;
	guid_map_entry_t *gmep;

	while ((gmep = avl_destroy_nodes(ca, &cookie)) != NULL) {
		dsl_dataset_rele(gmep->gme_ds, ca);
		kmem_free(gmep, sizeof (guid_map_entry_t));
	}
	avl_destroy(ca);
	kmem_free(ca, sizeof (avl_tree_t));
}

static void *
restore_read(struct restorearg *ra, int len)
{
	void *rv;
	int done = 0;

	/* some things will require 8-byte alignment, so everything must */
	ASSERT3U(len % 8, ==, 0);

	while (done < len) {
		ssize_t resid;

		ra->err = vn_rdwr(UIO_READ, ra->vp,
		    (caddr_t)ra->buf + done, len - done,
		    ra->voff, UIO_SYSSPACE, FAPPEND,
		    RLIM64_INFINITY, CRED(), &resid);

		if (resid == len - done)
			ra->err = EINVAL;
		ra->voff += len - done - resid;
		done = len - resid;
		if (ra->err)
			return (NULL);
	}

	ASSERT3U(done, ==, len);
	rv = ra->buf;
	if (ra->byteswap)
		fletcher_4_incremental_byteswap(rv, len, &ra->cksum);
	else
		fletcher_4_incremental_native(rv, len, &ra->cksum);
	return (rv);
}
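/*
 * Byteswap a replay record in place.  Used when the stream was generated on
 * a host of the opposite endianness; only the fields of the record type
 * actually present are swapped.
 */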
static void
backup_byteswap(dmu_replay_record_t *drr)
{
#define	DO64(X) (drr->drr_u.X = BSWAP_64(drr->drr_u.X))
#define	DO32(X) (drr->drr_u.X = BSWAP_32(drr->drr_u.X))
	drr->drr_type = BSWAP_32(drr->drr_type);
	drr->drr_payloadlen = BSWAP_32(drr->drr_payloadlen);
	switch (drr->drr_type) {
	case DRR_BEGIN:
		DO64(drr_begin.drr_magic);
		DO64(drr_begin.drr_versioninfo);
		DO64(drr_begin.drr_creation_time);
		DO32(drr_begin.drr_type);
		DO32(drr_begin.drr_flags);
		DO64(drr_begin.drr_toguid);
		DO64(drr_begin.drr_fromguid);
		break;
	case DRR_OBJECT:
		DO64(drr_object.drr_object);
		/* DO64(drr_object.drr_allocation_txg); */
		DO32(drr_object.drr_type);
		DO32(drr_object.drr_bonustype);
		DO32(drr_object.drr_blksz);
		DO32(drr_object.drr_bonuslen);
		DO64(drr_object.drr_toguid);
		break;
	case DRR_FREEOBJECTS:
		DO64(drr_freeobjects.drr_firstobj);
		DO64(drr_freeobjects.drr_numobjs);
		DO64(drr_freeobjects.drr_toguid);
		break;
	case DRR_WRITE:
		DO64(drr_write.drr_object);
		DO32(drr_write.drr_type);
		DO64(drr_write.drr_offset);
		DO64(drr_write.drr_length);
		DO64(drr_write.drr_toguid);
		DO64(drr_write.drr_key.ddk_cksum.zc_word[0]);
		DO64(drr_write.drr_key.ddk_cksum.zc_word[1]);
		DO64(drr_write.drr_key.ddk_cksum.zc_word[2]);
		DO64(drr_write.drr_key.ddk_cksum.zc_word[3]);
		DO64(drr_write.drr_key.ddk_prop);
		break;
	case DRR_WRITE_BYREF:
		DO64(drr_write_byref.drr_object);
		DO64(drr_write_byref.drr_offset);
		DO64(drr_write_byref.drr_length);
		DO64(drr_write_byref.drr_toguid);
		DO64(drr_write_byref.drr_refguid);
		DO64(drr_write_byref.drr_refobject);
		DO64(drr_write_byref.drr_refoffset);
		DO64(drr_write_byref.drr_key.ddk_cksum.zc_word[0]);
		DO64(drr_write_byref.drr_key.ddk_cksum.zc_word[1]);
		DO64(drr_write_byref.drr_key.ddk_cksum.zc_word[2]);
		DO64(drr_write_byref.drr_key.ddk_cksum.zc_word[3]);
		DO64(drr_write_byref.drr_key.ddk_prop);
		break;
	case DRR_FREE:
		DO64(drr_free.drr_object);
		DO64(drr_free.drr_offset);
		DO64(drr_free.drr_length);
		DO64(drr_free.drr_toguid);
		break;
	case DRR_SPILL:
		DO64(drr_spill.drr_object);
		DO64(drr_spill.drr_length);
		DO64(drr_spill.drr_toguid);
		break;
	case DRR_END:
		DO64(drr_end.drr_checksum.zc_word[0]);
		DO64(drr_end.drr_checksum.zc_word[1]);
		DO64(drr_end.drr_checksum.zc_word[2]);
		DO64(drr_end.drr_checksum.zc_word[3]);
		DO64(drr_end.drr_toguid);
		break;
	}
#undef DO64
#undef DO32
}

static int
restore_object(struct restorearg *ra, objset_t *os, struct drr_object *drro)
{
	int err;
	dmu_tx_t *tx;
	void *data = NULL;

	if (drro->drr_type == DMU_OT_NONE ||
	    drro->drr_type >= DMU_OT_NUMTYPES ||
	    drro->drr_bonustype >= DMU_OT_NUMTYPES ||
	    drro->drr_checksumtype >= ZIO_CHECKSUM_FUNCTIONS ||
	    drro->drr_compress >= ZIO_COMPRESS_FUNCTIONS ||
	    P2PHASE(drro->drr_blksz, SPA_MINBLOCKSIZE) ||
	    drro->drr_blksz < SPA_MINBLOCKSIZE ||
	    drro->drr_blksz > SPA_MAXBLOCKSIZE ||
	    drro->drr_bonuslen > DN_MAX_BONUSLEN) {
		return (EINVAL);
	}

	err = dmu_object_info(os, drro->drr_object, NULL);

	if (err != 0 && err != ENOENT)
		return (EINVAL);

	if (drro->drr_bonuslen) {
		data = restore_read(ra, P2ROUNDUP(drro->drr_bonuslen, 8));
		if (ra->err)
			return (ra->err);
	}

	if (err == ENOENT) {
		/* currently free, want to be allocated */
		tx = dmu_tx_create(os);
		dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
		err = dmu_tx_assign(tx, TXG_WAIT);
		if (err) {
			dmu_tx_abort(tx);
			return (err);
		}
		err = dmu_object_claim(os, drro->drr_object,
		    drro->drr_type, drro->drr_blksz,
		    drro->drr_bonustype, drro->drr_bonuslen, tx);
		dmu_tx_commit(tx);
	} else {
		/* currently allocated, want to be allocated */
		err = dmu_object_reclaim(os, drro->drr_object,
		    drro->drr_type, drro->drr_blksz,
		    drro->drr_bonustype, drro->drr_bonuslen);
	}
	if (err) {
		return (EINVAL);
	}

	tx = dmu_tx_create(os);
	dmu_tx_hold_bonus(tx, drro->drr_object);
	err = dmu_tx_assign(tx, TXG_WAIT);
	if (err) {
		dmu_tx_abort(tx);
		return (err);
	}

	dmu_object_set_checksum(os, drro->drr_object, drro->drr_checksumtype,
	    tx);
	dmu_object_set_compress(os, drro->drr_object, drro->drr_compress, tx);

	if (data != NULL) {
		dmu_buf_t *db;

		VERIFY(0 == dmu_bonus_hold(os, drro->drr_object, FTAG, &db));
		dmu_buf_will_dirty(db, tx);

		ASSERT3U(db->db_size, >=, drro->drr_bonuslen);
		bcopy(data, db->db_data, drro->drr_bonuslen);
		if (ra->byteswap) {
			dmu_ot[drro->drr_bonustype].ot_byteswap(db->db_data,
			    drro->drr_bonuslen);
		}
		dmu_buf_rele(db, FTAG);
	}
	dmu_tx_commit(tx);
	return (0);
}
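/*
 * DRR_FREEOBJECTS: free each currently-allocated object in the range
 * [firstobj, firstobj + numobjs).
 */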
/* ARGSUSED */
static int
restore_freeobjects(struct restorearg *ra, objset_t *os,
    struct drr_freeobjects *drrfo)
{
	uint64_t obj;

	if (drrfo->drr_firstobj + drrfo->drr_numobjs < drrfo->drr_firstobj)
		return (EINVAL);

	for (obj = drrfo->drr_firstobj;
	    obj < drrfo->drr_firstobj + drrfo->drr_numobjs;
	    (void) dmu_object_next(os, &obj, FALSE, 0)) {
		int err;

		if (dmu_object_info(os, obj, NULL) != 0)
			continue;

		err = dmu_free_object(os, obj);
		if (err)
			return (err);
	}
	return (0);
}

static int
restore_write(struct restorearg *ra, objset_t *os,
    struct drr_write *drrw)
{
	dmu_tx_t *tx;
	void *data;
	int err;

	if (drrw->drr_offset + drrw->drr_length < drrw->drr_offset ||
	    drrw->drr_type >= DMU_OT_NUMTYPES)
		return (EINVAL);

	data = restore_read(ra, drrw->drr_length);
	if (data == NULL)
		return (ra->err);

	if (dmu_object_info(os, drrw->drr_object, NULL) != 0)
		return (EINVAL);

	tx = dmu_tx_create(os);

	dmu_tx_hold_write(tx, drrw->drr_object,
	    drrw->drr_offset, drrw->drr_length);
	err = dmu_tx_assign(tx, TXG_WAIT);
	if (err) {
		dmu_tx_abort(tx);
		return (err);
	}
	if (ra->byteswap)
		dmu_ot[drrw->drr_type].ot_byteswap(data, drrw->drr_length);
	dmu_write(os, drrw->drr_object,
	    drrw->drr_offset, drrw->drr_length, data, tx);
	dmu_tx_commit(tx);
	return (0);
}

/*
 * Handle a DRR_WRITE_BYREF record.  This record is used in dedup'ed
 * streams to refer to a copy of the data that is already on the
 * system because it came in earlier in the stream.  This function
 * finds the earlier copy of the data, and uses that copy instead of
 * data from the stream to fulfill this write.
 */
static int
restore_write_byref(struct restorearg *ra, objset_t *os,
    struct drr_write_byref *drrwbr)
{
	dmu_tx_t *tx;
	int err;
	guid_map_entry_t gmesrch;
	guid_map_entry_t *gmep;
	avl_index_t where;
	objset_t *ref_os = NULL;
	dmu_buf_t *dbp;

	if (drrwbr->drr_offset + drrwbr->drr_length < drrwbr->drr_offset)
		return (EINVAL);

	/*
	 * If the GUID of the referenced dataset is different from the
	 * GUID of the target dataset, find the referenced dataset.
	 */
	if (drrwbr->drr_toguid != drrwbr->drr_refguid) {
		gmesrch.guid = drrwbr->drr_refguid;
		if ((gmep = avl_find(ra->guid_to_ds_map, &gmesrch,
		    &where)) == NULL) {
			return (EINVAL);
		}
		if (dmu_objset_from_ds(gmep->gme_ds, &ref_os))
			return (EINVAL);
	} else {
		ref_os = os;
	}

	if (err = dmu_buf_hold(ref_os, drrwbr->drr_refobject,
	    drrwbr->drr_refoffset, FTAG, &dbp, DMU_READ_PREFETCH))
		return (err);

	tx = dmu_tx_create(os);

	dmu_tx_hold_write(tx, drrwbr->drr_object,
	    drrwbr->drr_offset, drrwbr->drr_length);
	err = dmu_tx_assign(tx, TXG_WAIT);
	if (err) {
		dmu_tx_abort(tx);
		return (err);
	}
	dmu_write(os, drrwbr->drr_object,
	    drrwbr->drr_offset, drrwbr->drr_length, dbp->db_data, tx);
	dmu_buf_rele(dbp, FTAG);
	dmu_tx_commit(tx);
	return (0);
}

static int
restore_spill(struct restorearg *ra, objset_t *os, struct drr_spill *drrs)
{
	dmu_tx_t *tx;
	void *data;
	dmu_buf_t *db, *db_spill;
	int err;

	if (drrs->drr_length < SPA_MINBLOCKSIZE ||
	    drrs->drr_length > SPA_MAXBLOCKSIZE)
		return (EINVAL);

	data = restore_read(ra, drrs->drr_length);
	if (data == NULL)
		return (ra->err);

	if (dmu_object_info(os, drrs->drr_object, NULL) != 0)
		return (EINVAL);

	VERIFY(0 == dmu_bonus_hold(os, drrs->drr_object, FTAG, &db));
	if ((err = dmu_spill_hold_by_bonus(db, FTAG, &db_spill)) != 0) {
		dmu_buf_rele(db, FTAG);
		return (err);
	}

	tx = dmu_tx_create(os);

	dmu_tx_hold_spill(tx, db->db_object);

	err = dmu_tx_assign(tx, TXG_WAIT);
	if (err) {
		dmu_buf_rele(db, FTAG);
		dmu_buf_rele(db_spill, FTAG);
		dmu_tx_abort(tx);
		return (err);
	}
	dmu_buf_will_dirty(db_spill, tx);

	if (db_spill->db_size < drrs->drr_length)
		VERIFY(0 == dbuf_spill_set_blksz(db_spill,
		    drrs->drr_length, tx));
	bcopy(data, db_spill->db_data, drrs->drr_length);

	dmu_buf_rele(db, FTAG);
	dmu_buf_rele(db_spill, FTAG);

	dmu_tx_commit(tx);
	return (0);
}

/* ARGSUSED */
static int
restore_free(struct restorearg *ra, objset_t *os,
    struct drr_free *drrf)
{
	int err;

	if (drrf->drr_length != -1ULL &&
	    drrf->drr_offset + drrf->drr_length < drrf->drr_offset)
		return (EINVAL);

	if (dmu_object_info(os, drrf->drr_object, NULL) != 0)
		return (EINVAL);

	err = dmu_free_long_range(os, drrf->drr_object,
	    drrf->drr_offset, drrf->drr_length);
	return (err);
}
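/*
 * Read a substream from 'vp' and apply it to the dataset set up by
 * dmu_recv_begin().  Returns 0 only if the stream's final DRR_END checksum
 * matches; on error the half-restored dataset is destroyed.
 */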
/*
 * NB: callers *must* call dmu_recv_end() if this succeeds.
 */
int
dmu_recv_stream(dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp,
    int cleanup_fd, uint64_t *action_handlep)
{
	struct restorearg ra = { 0 };
	dmu_replay_record_t *drr;
	objset_t *os;
	zio_cksum_t pcksum;
	int featureflags;

	if (drc->drc_drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC))
		ra.byteswap = TRUE;

	{
		/* compute checksum of drr_begin record */
		dmu_replay_record_t *drr;
		drr = kmem_zalloc(sizeof (dmu_replay_record_t), KM_SLEEP);

		drr->drr_type = DRR_BEGIN;
		drr->drr_u.drr_begin = *drc->drc_drrb;
		if (ra.byteswap) {
			fletcher_4_incremental_byteswap(drr,
			    sizeof (dmu_replay_record_t), &ra.cksum);
		} else {
			fletcher_4_incremental_native(drr,
			    sizeof (dmu_replay_record_t), &ra.cksum);
		}
		kmem_free(drr, sizeof (dmu_replay_record_t));
	}

	if (ra.byteswap) {
		struct drr_begin *drrb = drc->drc_drrb;
		drrb->drr_magic = BSWAP_64(drrb->drr_magic);
		drrb->drr_versioninfo = BSWAP_64(drrb->drr_versioninfo);
		drrb->drr_creation_time = BSWAP_64(drrb->drr_creation_time);
		drrb->drr_type = BSWAP_32(drrb->drr_type);
		drrb->drr_toguid = BSWAP_64(drrb->drr_toguid);
		drrb->drr_fromguid = BSWAP_64(drrb->drr_fromguid);
	}

	ra.vp = vp;
	ra.voff = *voffp;
	ra.bufsize = 1<<20;
	ra.buf = kmem_alloc(ra.bufsize, KM_SLEEP);

	/* these were verified in dmu_recv_begin */
	ASSERT(DMU_GET_STREAM_HDRTYPE(drc->drc_drrb->drr_versioninfo) ==
	    DMU_SUBSTREAM);
	ASSERT(drc->drc_drrb->drr_type < DMU_OST_NUMTYPES);

	/*
	 * Open the objset we are modifying.
	 */
	VERIFY(dmu_objset_from_ds(drc->drc_real_ds, &os) == 0);

	ASSERT(drc->drc_real_ds->ds_phys->ds_flags & DS_FLAG_INCONSISTENT);

	featureflags = DMU_GET_FEATUREFLAGS(drc->drc_drrb->drr_versioninfo);

	/* if this stream is dedup'ed, set up the avl tree for guid mapping */
	if (featureflags & DMU_BACKUP_FEATURE_DEDUP) {
		minor_t minor;

		if (cleanup_fd == -1) {
			ra.err = EBADF;
			goto out;
		}
		ra.err = zfs_onexit_fd_hold(cleanup_fd, &minor);
		if (ra.err) {
			cleanup_fd = -1;
			goto out;
		}

		if (*action_handlep == 0) {
			ra.guid_to_ds_map =
			    kmem_alloc(sizeof (avl_tree_t), KM_SLEEP);
			avl_create(ra.guid_to_ds_map, guid_compare,
			    sizeof (guid_map_entry_t),
			    offsetof(guid_map_entry_t, avlnode));
			ra.err = zfs_onexit_add_cb(minor,
			    free_guid_map_onexit, ra.guid_to_ds_map,
			    action_handlep);
			if (ra.err)
				goto out;
		} else {
			ra.err = zfs_onexit_cb_data(minor, *action_handlep,
			    (void **)&ra.guid_to_ds_map);
			if (ra.err)
				goto out;
		}

		drc->drc_guid_to_ds_map = ra.guid_to_ds_map;
	}

	/*
	 * Read records and process them.
	 */
	pcksum = ra.cksum;
	while (ra.err == 0 &&
	    NULL != (drr = restore_read(&ra, sizeof (*drr)))) {
		if (issig(JUSTLOOKING) && issig(FORREAL)) {
			ra.err = EINTR;
			goto out;
		}

		if (ra.byteswap)
			backup_byteswap(drr);

		switch (drr->drr_type) {
		case DRR_OBJECT:
		{
			/*
			 * We need to make a copy of the record header,
			 * because restore_{object,write} may need to
			 * restore_read(), which will invalidate drr.
			 */
			struct drr_object drro = drr->drr_u.drr_object;
			ra.err = restore_object(&ra, os, &drro);
			break;
		}
		case DRR_FREEOBJECTS:
		{
			struct drr_freeobjects drrfo =
			    drr->drr_u.drr_freeobjects;
			ra.err = restore_freeobjects(&ra, os, &drrfo);
			break;
		}
		case DRR_WRITE:
		{
			struct drr_write drrw = drr->drr_u.drr_write;
			ra.err = restore_write(&ra, os, &drrw);
			break;
		}
		case DRR_WRITE_BYREF:
		{
			struct drr_write_byref drrwbr =
			    drr->drr_u.drr_write_byref;
			ra.err = restore_write_byref(&ra, os, &drrwbr);
			break;
		}
		case DRR_FREE:
		{
			struct drr_free drrf = drr->drr_u.drr_free;
			ra.err = restore_free(&ra, os, &drrf);
			break;
		}
		case DRR_END:
		{
			struct drr_end drre = drr->drr_u.drr_end;
			/*
			 * We compare against the *previous* checksum
			 * value, because the stored checksum is of
			 * everything before the DRR_END record.
			 */
			if (!ZIO_CHECKSUM_EQUAL(drre.drr_checksum, pcksum))
				ra.err = ECKSUM;
			goto out;
		}
		case DRR_SPILL:
		{
			struct drr_spill drrs = drr->drr_u.drr_spill;
			ra.err = restore_spill(&ra, os, &drrs);
			break;
		}
		default:
			ra.err = EINVAL;
			goto out;
		}
		pcksum = ra.cksum;
	}
	ASSERT(ra.err != 0);

out:
	if ((featureflags & DMU_BACKUP_FEATURE_DEDUP) && (cleanup_fd != -1))
		zfs_onexit_fd_rele(cleanup_fd);

	if (ra.err != 0) {
		/*
		 * destroy what we created, so we don't leave it in the
		 * inconsistent restoring state.
		 */
		txg_wait_synced(drc->drc_real_ds->ds_dir->dd_pool, 0);

		(void) dsl_dataset_destroy(drc->drc_real_ds, dmu_recv_tag,
		    B_FALSE);
		if (drc->drc_real_ds != drc->drc_logical_ds) {
			mutex_exit(&drc->drc_logical_ds->ds_recvlock);
			dsl_dataset_rele(drc->drc_logical_ds, dmu_recv_tag);
		}
	}

	kmem_free(ra.buf, ra.bufsize);
	*voffp = ra.voff;
	return (ra.err);
}
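/*
 * recvendsyncarg carries the snapshot name, creation time and guid used by
 * recv_end_check()/recv_end_sync() to snapshot the received dataset and
 * clear DS_FLAG_INCONSISTENT.
 */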
struct recvendsyncarg {
	char *tosnap;
	uint64_t creation_time;
	uint64_t toguid;
};

static int
recv_end_check(void *arg1, void *arg2, dmu_tx_t *tx)
{
	dsl_dataset_t *ds = arg1;
	struct recvendsyncarg *resa = arg2;

	return (dsl_dataset_snapshot_check(ds, resa->tosnap, tx));
}

static void
recv_end_sync(void *arg1, void *arg2, dmu_tx_t *tx)
{
	dsl_dataset_t *ds = arg1;
	struct recvendsyncarg *resa = arg2;

	dsl_dataset_snapshot_sync(ds, resa->tosnap, tx);

	/* set snapshot's creation time and guid */
	dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
	ds->ds_prev->ds_phys->ds_creation_time = resa->creation_time;
	ds->ds_prev->ds_phys->ds_guid = resa->toguid;
	ds->ds_prev->ds_phys->ds_flags &= ~DS_FLAG_INCONSISTENT;

	dmu_buf_will_dirty(ds->ds_dbuf, tx);
	ds->ds_phys->ds_flags &= ~DS_FLAG_INCONSISTENT;
}

static int
add_ds_to_guidmap(avl_tree_t *guid_map, dsl_dataset_t *ds)
{
	dsl_pool_t *dp = ds->ds_dir->dd_pool;
	uint64_t snapobj = ds->ds_phys->ds_prev_snap_obj;
	dsl_dataset_t *snapds;
	guid_map_entry_t *gmep;
	int err;

	ASSERT(guid_map != NULL);

	rw_enter(&dp->dp_config_rwlock, RW_READER);
	err = dsl_dataset_hold_obj(dp, snapobj, guid_map, &snapds);
	if (err == 0) {
		gmep = kmem_alloc(sizeof (guid_map_entry_t), KM_SLEEP);
		gmep->guid = snapds->ds_phys->ds_guid;
		gmep->gme_ds = snapds;
		avl_add(guid_map, gmep);
	}

	rw_exit(&dp->dp_config_rwlock);
	return (err);
}
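/*
 * Finish an incremental receive: swap the temporary clone's contents into
 * the target filesystem, snapshot it, and destroy the clone.  On failure the
 * swap is undone so the target is left untouched.
 */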
static int
dmu_recv_existing_end(dmu_recv_cookie_t *drc)
{
	struct recvendsyncarg resa;
	dsl_dataset_t *ds = drc->drc_logical_ds;
	int err, myerr;

	/*
	 * XXX hack; seems the ds is still dirty and dsl_pool_zil_clean()
	 * expects it to have a ds_user_ptr (and zil), but clone_swap()
	 * can close it.
	 */
	txg_wait_synced(ds->ds_dir->dd_pool, 0);

	if (dsl_dataset_tryown(ds, FALSE, dmu_recv_tag)) {
		err = dsl_dataset_clone_swap(drc->drc_real_ds, ds,
		    drc->drc_force);
		if (err)
			goto out;
	} else {
		mutex_exit(&ds->ds_recvlock);
		dsl_dataset_rele(ds, dmu_recv_tag);
		(void) dsl_dataset_destroy(drc->drc_real_ds, dmu_recv_tag,
		    B_FALSE);
		return (EBUSY);
	}

	resa.creation_time = drc->drc_drrb->drr_creation_time;
	resa.toguid = drc->drc_drrb->drr_toguid;
	resa.tosnap = drc->drc_tosnap;

	err = dsl_sync_task_do(ds->ds_dir->dd_pool,
	    recv_end_check, recv_end_sync, ds, &resa, 3);
	if (err) {
		/* swap back */
		(void) dsl_dataset_clone_swap(drc->drc_real_ds, ds, B_TRUE);
	}

out:
	mutex_exit(&ds->ds_recvlock);
	if (err == 0 && drc->drc_guid_to_ds_map != NULL)
		(void) add_ds_to_guidmap(drc->drc_guid_to_ds_map, ds);
	dsl_dataset_disown(ds, dmu_recv_tag);
	myerr = dsl_dataset_destroy(drc->drc_real_ds, dmu_recv_tag, B_FALSE);
	ASSERT3U(myerr, ==, 0);
	return (err);
}

static int
dmu_recv_new_end(dmu_recv_cookie_t *drc)
{
	struct recvendsyncarg resa;
	dsl_dataset_t *ds = drc->drc_logical_ds;
	int err;

	/*
	 * XXX hack; seems the ds is still dirty and dsl_pool_zil_clean()
	 * expects it to have a ds_user_ptr (and zil), but clone_swap()
	 * can close it.
	 */
	txg_wait_synced(ds->ds_dir->dd_pool, 0);

	resa.creation_time = drc->drc_drrb->drr_creation_time;
	resa.toguid = drc->drc_drrb->drr_toguid;
	resa.tosnap = drc->drc_tosnap;

	err = dsl_sync_task_do(ds->ds_dir->dd_pool,
	    recv_end_check, recv_end_sync, ds, &resa, 3);
	if (err) {
		/* clean up the fs we just recv'd into */
		(void) dsl_dataset_destroy(ds, dmu_recv_tag, B_FALSE);
	} else {
		if (drc->drc_guid_to_ds_map != NULL)
			(void) add_ds_to_guidmap(drc->drc_guid_to_ds_map, ds);
		/* release the hold from dmu_recv_begin */
		dsl_dataset_disown(ds, dmu_recv_tag);
	}
	return (err);
}

int
dmu_recv_end(dmu_recv_cookie_t *drc)
{
	if (drc->drc_logical_ds != drc->drc_real_ds)
		return (dmu_recv_existing_end(drc));
	else
		return (dmu_recv_new_end(drc));
}