1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 23 */ 24 /* 25 * Copyright 2011 Nexenta Systems, Inc. All rights reserved. 26 * Copyright (c) 2011 by Delphix. All rights reserved. 27 */ 28 29 #include <sys/dmu.h> 30 #include <sys/dmu_impl.h> 31 #include <sys/dmu_tx.h> 32 #include <sys/dbuf.h> 33 #include <sys/dnode.h> 34 #include <sys/zfs_context.h> 35 #include <sys/dmu_objset.h> 36 #include <sys/dmu_traverse.h> 37 #include <sys/dsl_dataset.h> 38 #include <sys/dsl_dir.h> 39 #include <sys/dsl_prop.h> 40 #include <sys/dsl_pool.h> 41 #include <sys/dsl_synctask.h> 42 #include <sys/zfs_ioctl.h> 43 #include <sys/zap.h> 44 #include <sys/zio_checksum.h> 45 #include <sys/zfs_znode.h> 46 #include <zfs_fletcher.h> 47 #include <sys/avl.h> 48 #include <sys/ddt.h> 49 #include <sys/zfs_onexit.h> 50 51 /* Set this tunable to TRUE to replace corrupt data with 0x2f5baddb10c */ 52 int zfs_send_corrupt_data = B_FALSE; 53 54 static char *dmu_recv_tag = "dmu_recv_tag"; 55 56 /* 57 * The list of data whose inclusion in a send stream can be pending from 58 * one call to backup_cb to another. Multiple calls to dump_free() and 59 * dump_freeobjects() can be aggregated into a single DRR_FREE or 60 * DRR_FREEOBJECTS replay record. 61 */ 62 typedef enum { 63 PENDING_NONE, 64 PENDING_FREE, 65 PENDING_FREEOBJECTS 66 } pendop_t; 67 68 struct backuparg { 69 dmu_replay_record_t *drr; 70 vnode_t *vp; 71 offset_t *off; 72 objset_t *os; 73 zio_cksum_t zc; 74 uint64_t toguid; 75 int err; 76 pendop_t pending_op; 77 }; 78 79 static int 80 dump_bytes(struct backuparg *ba, void *buf, int len) 81 { 82 ssize_t resid; /* have to get resid to get detailed errno */ 83 ASSERT3U(len % 8, ==, 0); 84 85 fletcher_4_incremental_native(buf, len, &ba->zc); 86 ba->err = vn_rdwr(UIO_WRITE, ba->vp, 87 (caddr_t)buf, len, 88 0, UIO_SYSSPACE, FAPPEND, RLIM64_INFINITY, CRED(), &resid); 89 *ba->off += len; 90 return (ba->err); 91 } 92 93 static int 94 dump_free(struct backuparg *ba, uint64_t object, uint64_t offset, 95 uint64_t length) 96 { 97 struct drr_free *drrf = &(ba->drr->drr_u.drr_free); 98 99 /* 100 * If there is a pending op, but it's not PENDING_FREE, push it out, 101 * since free block aggregation can only be done for blocks of the 102 * same type (i.e., DRR_FREE records can only be aggregated with 103 * other DRR_FREE records. DRR_FREEOBJECTS records can only be 104 * aggregated with other DRR_FREEOBJECTS records. 105 */ 106 if (ba->pending_op != PENDING_NONE && ba->pending_op != PENDING_FREE) { 107 if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)) != 0) 108 return (EINTR); 109 ba->pending_op = PENDING_NONE; 110 } 111 112 if (ba->pending_op == PENDING_FREE) { 113 /* 114 * There should never be a PENDING_FREE if length is -1 115 * (because dump_dnode is the only place where this 116 * function is called with a -1, and only after flushing 117 * any pending record). 118 */ 119 ASSERT(length != -1ULL); 120 /* 121 * Check to see whether this free block can be aggregated 122 * with pending one. 123 */ 124 if (drrf->drr_object == object && drrf->drr_offset + 125 drrf->drr_length == offset) { 126 drrf->drr_length += length; 127 return (0); 128 } else { 129 /* not a continuation. Push out pending record */ 130 if (dump_bytes(ba, ba->drr, 131 sizeof (dmu_replay_record_t)) != 0) 132 return (EINTR); 133 ba->pending_op = PENDING_NONE; 134 } 135 } 136 /* create a FREE record and make it pending */ 137 bzero(ba->drr, sizeof (dmu_replay_record_t)); 138 ba->drr->drr_type = DRR_FREE; 139 drrf->drr_object = object; 140 drrf->drr_offset = offset; 141 drrf->drr_length = length; 142 drrf->drr_toguid = ba->toguid; 143 if (length == -1ULL) { 144 if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)) != 0) 145 return (EINTR); 146 } else { 147 ba->pending_op = PENDING_FREE; 148 } 149 150 return (0); 151 } 152 153 static int 154 dump_data(struct backuparg *ba, dmu_object_type_t type, 155 uint64_t object, uint64_t offset, int blksz, const blkptr_t *bp, void *data) 156 { 157 struct drr_write *drrw = &(ba->drr->drr_u.drr_write); 158 159 160 /* 161 * If there is any kind of pending aggregation (currently either 162 * a grouping of free objects or free blocks), push it out to 163 * the stream, since aggregation can't be done across operations 164 * of different types. 165 */ 166 if (ba->pending_op != PENDING_NONE) { 167 if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)) != 0) 168 return (EINTR); 169 ba->pending_op = PENDING_NONE; 170 } 171 /* write a DATA record */ 172 bzero(ba->drr, sizeof (dmu_replay_record_t)); 173 ba->drr->drr_type = DRR_WRITE; 174 drrw->drr_object = object; 175 drrw->drr_type = type; 176 drrw->drr_offset = offset; 177 drrw->drr_length = blksz; 178 drrw->drr_toguid = ba->toguid; 179 drrw->drr_checksumtype = BP_GET_CHECKSUM(bp); 180 if (zio_checksum_table[drrw->drr_checksumtype].ci_dedup) 181 drrw->drr_checksumflags |= DRR_CHECKSUM_DEDUP; 182 DDK_SET_LSIZE(&drrw->drr_key, BP_GET_LSIZE(bp)); 183 DDK_SET_PSIZE(&drrw->drr_key, BP_GET_PSIZE(bp)); 184 DDK_SET_COMPRESS(&drrw->drr_key, BP_GET_COMPRESS(bp)); 185 drrw->drr_key.ddk_cksum = bp->blk_cksum; 186 187 if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)) != 0) 188 return (EINTR); 189 if (dump_bytes(ba, data, blksz) != 0) 190 return (EINTR); 191 return (0); 192 } 193 194 static int 195 dump_spill(struct backuparg *ba, uint64_t object, int blksz, void *data) 196 { 197 struct drr_spill *drrs = &(ba->drr->drr_u.drr_spill); 198 199 if (ba->pending_op != PENDING_NONE) { 200 if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)) != 0) 201 return (EINTR); 202 ba->pending_op = PENDING_NONE; 203 } 204 205 /* write a SPILL record */ 206 bzero(ba->drr, sizeof (dmu_replay_record_t)); 207 ba->drr->drr_type = DRR_SPILL; 208 drrs->drr_object = object; 209 drrs->drr_length = blksz; 210 drrs->drr_toguid = ba->toguid; 211 212 if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t))) 213 return (EINTR); 214 if (dump_bytes(ba, data, blksz)) 215 return (EINTR); 216 return (0); 217 } 218 219 static int 220 dump_freeobjects(struct backuparg *ba, uint64_t firstobj, uint64_t numobjs) 221 { 222 struct drr_freeobjects *drrfo = &(ba->drr->drr_u.drr_freeobjects); 223 224 /* 225 * If there is a pending op, but it's not PENDING_FREEOBJECTS, 226 * push it out, since free block aggregation can only be done for 227 * blocks of the same type (i.e., DRR_FREE records can only be 228 * aggregated with other DRR_FREE records. DRR_FREEOBJECTS records 229 * can only be aggregated with other DRR_FREEOBJECTS records. 230 */ 231 if (ba->pending_op != PENDING_NONE && 232 ba->pending_op != PENDING_FREEOBJECTS) { 233 if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)) != 0) 234 return (EINTR); 235 ba->pending_op = PENDING_NONE; 236 } 237 if (ba->pending_op == PENDING_FREEOBJECTS) { 238 /* 239 * See whether this free object array can be aggregated 240 * with pending one 241 */ 242 if (drrfo->drr_firstobj + drrfo->drr_numobjs == firstobj) { 243 drrfo->drr_numobjs += numobjs; 244 return (0); 245 } else { 246 /* can't be aggregated. Push out pending record */ 247 if (dump_bytes(ba, ba->drr, 248 sizeof (dmu_replay_record_t)) != 0) 249 return (EINTR); 250 ba->pending_op = PENDING_NONE; 251 } 252 } 253 254 /* write a FREEOBJECTS record */ 255 bzero(ba->drr, sizeof (dmu_replay_record_t)); 256 ba->drr->drr_type = DRR_FREEOBJECTS; 257 drrfo->drr_firstobj = firstobj; 258 drrfo->drr_numobjs = numobjs; 259 drrfo->drr_toguid = ba->toguid; 260 261 ba->pending_op = PENDING_FREEOBJECTS; 262 263 return (0); 264 } 265 266 static int 267 dump_dnode(struct backuparg *ba, uint64_t object, dnode_phys_t *dnp) 268 { 269 struct drr_object *drro = &(ba->drr->drr_u.drr_object); 270 271 if (dnp == NULL || dnp->dn_type == DMU_OT_NONE) 272 return (dump_freeobjects(ba, object, 1)); 273 274 if (ba->pending_op != PENDING_NONE) { 275 if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)) != 0) 276 return (EINTR); 277 ba->pending_op = PENDING_NONE; 278 } 279 280 /* write an OBJECT record */ 281 bzero(ba->drr, sizeof (dmu_replay_record_t)); 282 ba->drr->drr_type = DRR_OBJECT; 283 drro->drr_object = object; 284 drro->drr_type = dnp->dn_type; 285 drro->drr_bonustype = dnp->dn_bonustype; 286 drro->drr_blksz = dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT; 287 drro->drr_bonuslen = dnp->dn_bonuslen; 288 drro->drr_checksumtype = dnp->dn_checksum; 289 drro->drr_compress = dnp->dn_compress; 290 drro->drr_toguid = ba->toguid; 291 292 if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)) != 0) 293 return (EINTR); 294 295 if (dump_bytes(ba, DN_BONUS(dnp), P2ROUNDUP(dnp->dn_bonuslen, 8)) != 0) 296 return (EINTR); 297 298 /* free anything past the end of the file */ 299 if (dump_free(ba, object, (dnp->dn_maxblkid + 1) * 300 (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT), -1ULL)) 301 return (EINTR); 302 if (ba->err) 303 return (EINTR); 304 return (0); 305 } 306 307 #define BP_SPAN(dnp, level) \ 308 (((uint64_t)dnp->dn_datablkszsec) << (SPA_MINBLOCKSHIFT + \ 309 (level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT))) 310 311 /* ARGSUSED */ 312 static int 313 backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, arc_buf_t *pbuf, 314 const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg) 315 { 316 struct backuparg *ba = arg; 317 dmu_object_type_t type = bp ? BP_GET_TYPE(bp) : DMU_OT_NONE; 318 int err = 0; 319 320 if (issig(JUSTLOOKING) && issig(FORREAL)) 321 return (EINTR); 322 323 if (zb->zb_object != DMU_META_DNODE_OBJECT && 324 DMU_OBJECT_IS_SPECIAL(zb->zb_object)) { 325 return (0); 326 } else if (bp == NULL && zb->zb_object == DMU_META_DNODE_OBJECT) { 327 uint64_t span = BP_SPAN(dnp, zb->zb_level); 328 uint64_t dnobj = (zb->zb_blkid * span) >> DNODE_SHIFT; 329 err = dump_freeobjects(ba, dnobj, span >> DNODE_SHIFT); 330 } else if (bp == NULL) { 331 uint64_t span = BP_SPAN(dnp, zb->zb_level); 332 err = dump_free(ba, zb->zb_object, zb->zb_blkid * span, span); 333 } else if (zb->zb_level > 0 || type == DMU_OT_OBJSET) { 334 return (0); 335 } else if (type == DMU_OT_DNODE) { 336 dnode_phys_t *blk; 337 int i; 338 int blksz = BP_GET_LSIZE(bp); 339 uint32_t aflags = ARC_WAIT; 340 arc_buf_t *abuf; 341 342 if (dsl_read(NULL, spa, bp, pbuf, 343 arc_getbuf_func, &abuf, ZIO_PRIORITY_ASYNC_READ, 344 ZIO_FLAG_CANFAIL, &aflags, zb) != 0) 345 return (EIO); 346 347 blk = abuf->b_data; 348 for (i = 0; i < blksz >> DNODE_SHIFT; i++) { 349 uint64_t dnobj = (zb->zb_blkid << 350 (DNODE_BLOCK_SHIFT - DNODE_SHIFT)) + i; 351 err = dump_dnode(ba, dnobj, blk+i); 352 if (err) 353 break; 354 } 355 (void) arc_buf_remove_ref(abuf, &abuf); 356 } else if (type == DMU_OT_SA) { 357 uint32_t aflags = ARC_WAIT; 358 arc_buf_t *abuf; 359 int blksz = BP_GET_LSIZE(bp); 360 361 if (arc_read_nolock(NULL, spa, bp, 362 arc_getbuf_func, &abuf, ZIO_PRIORITY_ASYNC_READ, 363 ZIO_FLAG_CANFAIL, &aflags, zb) != 0) 364 return (EIO); 365 366 err = dump_spill(ba, zb->zb_object, blksz, abuf->b_data); 367 (void) arc_buf_remove_ref(abuf, &abuf); 368 } else { /* it's a level-0 block of a regular object */ 369 uint32_t aflags = ARC_WAIT; 370 arc_buf_t *abuf; 371 int blksz = BP_GET_LSIZE(bp); 372 373 if (dsl_read(NULL, spa, bp, pbuf, 374 arc_getbuf_func, &abuf, ZIO_PRIORITY_ASYNC_READ, 375 ZIO_FLAG_CANFAIL, &aflags, zb) != 0) { 376 if (zfs_send_corrupt_data) { 377 /* Send a block filled with 0x"zfs badd bloc" */ 378 abuf = arc_buf_alloc(spa, blksz, &abuf, 379 ARC_BUFC_DATA); 380 uint64_t *ptr; 381 for (ptr = abuf->b_data; 382 (char *)ptr < (char *)abuf->b_data + blksz; 383 ptr++) 384 *ptr = 0x2f5baddb10c; 385 } else { 386 return (EIO); 387 } 388 } 389 390 err = dump_data(ba, type, zb->zb_object, zb->zb_blkid * blksz, 391 blksz, bp, abuf->b_data); 392 (void) arc_buf_remove_ref(abuf, &abuf); 393 } 394 395 ASSERT(err == 0 || err == EINTR); 396 return (err); 397 } 398 399 int 400 dmu_sendbackup(objset_t *tosnap, objset_t *fromsnap, boolean_t fromorigin, 401 vnode_t *vp, offset_t *off) 402 { 403 dsl_dataset_t *ds = tosnap->os_dsl_dataset; 404 dsl_dataset_t *fromds = fromsnap ? fromsnap->os_dsl_dataset : NULL; 405 dmu_replay_record_t *drr; 406 struct backuparg ba; 407 int err; 408 uint64_t fromtxg = 0; 409 410 /* tosnap must be a snapshot */ 411 if (ds->ds_phys->ds_next_snap_obj == 0) 412 return (EINVAL); 413 414 /* fromsnap must be an earlier snapshot from the same fs as tosnap */ 415 if (fromds && (ds->ds_dir != fromds->ds_dir || 416 fromds->ds_phys->ds_creation_txg >= ds->ds_phys->ds_creation_txg)) 417 return (EXDEV); 418 419 if (fromorigin) { 420 dsl_pool_t *dp = ds->ds_dir->dd_pool; 421 422 if (fromsnap) 423 return (EINVAL); 424 425 if (dsl_dir_is_clone(ds->ds_dir)) { 426 rw_enter(&dp->dp_config_rwlock, RW_READER); 427 err = dsl_dataset_hold_obj(dp, 428 ds->ds_dir->dd_phys->dd_origin_obj, FTAG, &fromds); 429 rw_exit(&dp->dp_config_rwlock); 430 if (err) 431 return (err); 432 } else { 433 fromorigin = B_FALSE; 434 } 435 } 436 437 438 drr = kmem_zalloc(sizeof (dmu_replay_record_t), KM_SLEEP); 439 drr->drr_type = DRR_BEGIN; 440 drr->drr_u.drr_begin.drr_magic = DMU_BACKUP_MAGIC; 441 DMU_SET_STREAM_HDRTYPE(drr->drr_u.drr_begin.drr_versioninfo, 442 DMU_SUBSTREAM); 443 444 #ifdef _KERNEL 445 if (dmu_objset_type(tosnap) == DMU_OST_ZFS) { 446 uint64_t version; 447 if (zfs_get_zplprop(tosnap, ZFS_PROP_VERSION, &version) != 0) 448 return (EINVAL); 449 if (version == ZPL_VERSION_SA) { 450 DMU_SET_FEATUREFLAGS( 451 drr->drr_u.drr_begin.drr_versioninfo, 452 DMU_BACKUP_FEATURE_SA_SPILL); 453 } 454 } 455 #endif 456 457 drr->drr_u.drr_begin.drr_creation_time = 458 ds->ds_phys->ds_creation_time; 459 drr->drr_u.drr_begin.drr_type = tosnap->os_phys->os_type; 460 if (fromorigin) 461 drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_CLONE; 462 drr->drr_u.drr_begin.drr_toguid = ds->ds_phys->ds_guid; 463 if (ds->ds_phys->ds_flags & DS_FLAG_CI_DATASET) 464 drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_CI_DATA; 465 466 if (fromds) 467 drr->drr_u.drr_begin.drr_fromguid = fromds->ds_phys->ds_guid; 468 dsl_dataset_name(ds, drr->drr_u.drr_begin.drr_toname); 469 470 if (fromds) 471 fromtxg = fromds->ds_phys->ds_creation_txg; 472 if (fromorigin) 473 dsl_dataset_rele(fromds, FTAG); 474 475 ba.drr = drr; 476 ba.vp = vp; 477 ba.os = tosnap; 478 ba.off = off; 479 ba.toguid = ds->ds_phys->ds_guid; 480 ZIO_SET_CHECKSUM(&ba.zc, 0, 0, 0, 0); 481 ba.pending_op = PENDING_NONE; 482 483 if (dump_bytes(&ba, drr, sizeof (dmu_replay_record_t)) != 0) { 484 kmem_free(drr, sizeof (dmu_replay_record_t)); 485 return (ba.err); 486 } 487 488 err = traverse_dataset(ds, fromtxg, TRAVERSE_PRE | TRAVERSE_PREFETCH, 489 backup_cb, &ba); 490 491 if (ba.pending_op != PENDING_NONE) 492 if (dump_bytes(&ba, drr, sizeof (dmu_replay_record_t)) != 0) 493 err = EINTR; 494 495 if (err) { 496 if (err == EINTR && ba.err) 497 err = ba.err; 498 kmem_free(drr, sizeof (dmu_replay_record_t)); 499 return (err); 500 } 501 502 bzero(drr, sizeof (dmu_replay_record_t)); 503 drr->drr_type = DRR_END; 504 drr->drr_u.drr_end.drr_checksum = ba.zc; 505 drr->drr_u.drr_end.drr_toguid = ba.toguid; 506 507 if (dump_bytes(&ba, drr, sizeof (dmu_replay_record_t)) != 0) { 508 kmem_free(drr, sizeof (dmu_replay_record_t)); 509 return (ba.err); 510 } 511 512 kmem_free(drr, sizeof (dmu_replay_record_t)); 513 514 return (0); 515 } 516 517 int 518 dmu_send_estimate(objset_t *tosnap, objset_t *fromsnap, boolean_t fromorigin, 519 uint64_t *sizep) 520 { 521 dsl_dataset_t *ds = tosnap->os_dsl_dataset; 522 dsl_dataset_t *fromds = fromsnap ? fromsnap->os_dsl_dataset : NULL; 523 dsl_pool_t *dp = ds->ds_dir->dd_pool; 524 int err; 525 uint64_t size; 526 527 /* tosnap must be a snapshot */ 528 if (ds->ds_phys->ds_next_snap_obj == 0) 529 return (EINVAL); 530 531 /* fromsnap must be an earlier snapshot from the same fs as tosnap */ 532 if (fromds && (ds->ds_dir != fromds->ds_dir || 533 fromds->ds_phys->ds_creation_txg >= ds->ds_phys->ds_creation_txg)) 534 return (EXDEV); 535 536 if (fromorigin) { 537 if (fromsnap) 538 return (EINVAL); 539 540 if (dsl_dir_is_clone(ds->ds_dir)) { 541 rw_enter(&dp->dp_config_rwlock, RW_READER); 542 err = dsl_dataset_hold_obj(dp, 543 ds->ds_dir->dd_phys->dd_origin_obj, FTAG, &fromds); 544 rw_exit(&dp->dp_config_rwlock); 545 if (err) 546 return (err); 547 } else { 548 fromorigin = B_FALSE; 549 } 550 } 551 552 /* Get uncompressed size estimate of changed data. */ 553 if (fromds == NULL) { 554 size = ds->ds_phys->ds_uncompressed_bytes; 555 } else { 556 uint64_t used, comp; 557 err = dsl_dataset_space_written(fromds, ds, 558 &used, &comp, &size); 559 if (fromorigin) 560 dsl_dataset_rele(fromds, FTAG); 561 if (err) 562 return (err); 563 } 564 565 /* 566 * Assume that space (both on-disk and in-stream) is dominated by 567 * data. We will adjust for indirect blocks and the copies property, 568 * but ignore per-object space used (eg, dnodes and DRR_OBJECT records). 569 */ 570 571 /* 572 * Subtract out approximate space used by indirect blocks. 573 * Assume most space is used by data blocks (non-indirect, non-dnode). 574 * Assume all blocks are recordsize. Assume ditto blocks and 575 * internal fragmentation counter out compression. 576 * 577 * Therefore, space used by indirect blocks is sizeof(blkptr_t) per 578 * block, which we observe in practice. 579 */ 580 uint64_t recordsize; 581 rw_enter(&dp->dp_config_rwlock, RW_READER); 582 err = dsl_prop_get_ds(ds, "recordsize", 583 sizeof (recordsize), 1, &recordsize, NULL); 584 rw_exit(&dp->dp_config_rwlock); 585 if (err) 586 return (err); 587 size -= size / recordsize * sizeof (blkptr_t); 588 589 /* Add in the space for the record associated with each block. */ 590 size += size / recordsize * sizeof (dmu_replay_record_t); 591 592 *sizep = size; 593 594 return (0); 595 } 596 597 struct recvbeginsyncarg { 598 const char *tofs; 599 const char *tosnap; 600 dsl_dataset_t *origin; 601 uint64_t fromguid; 602 dmu_objset_type_t type; 603 void *tag; 604 boolean_t force; 605 uint64_t dsflags; 606 char clonelastname[MAXNAMELEN]; 607 dsl_dataset_t *ds; /* the ds to recv into; returned from the syncfunc */ 608 cred_t *cr; 609 }; 610 611 /* ARGSUSED */ 612 static int 613 recv_new_check(void *arg1, void *arg2, dmu_tx_t *tx) 614 { 615 dsl_dir_t *dd = arg1; 616 struct recvbeginsyncarg *rbsa = arg2; 617 objset_t *mos = dd->dd_pool->dp_meta_objset; 618 uint64_t val; 619 int err; 620 621 err = zap_lookup(mos, dd->dd_phys->dd_child_dir_zapobj, 622 strrchr(rbsa->tofs, '/') + 1, sizeof (uint64_t), 1, &val); 623 624 if (err != ENOENT) 625 return (err ? err : EEXIST); 626 627 if (rbsa->origin) { 628 /* make sure it's a snap in the same pool */ 629 if (rbsa->origin->ds_dir->dd_pool != dd->dd_pool) 630 return (EXDEV); 631 if (!dsl_dataset_is_snapshot(rbsa->origin)) 632 return (EINVAL); 633 if (rbsa->origin->ds_phys->ds_guid != rbsa->fromguid) 634 return (ENODEV); 635 } 636 637 return (0); 638 } 639 640 static void 641 recv_new_sync(void *arg1, void *arg2, dmu_tx_t *tx) 642 { 643 dsl_dir_t *dd = arg1; 644 struct recvbeginsyncarg *rbsa = arg2; 645 uint64_t flags = DS_FLAG_INCONSISTENT | rbsa->dsflags; 646 uint64_t dsobj; 647 648 /* Create and open new dataset. */ 649 dsobj = dsl_dataset_create_sync(dd, strrchr(rbsa->tofs, '/') + 1, 650 rbsa->origin, flags, rbsa->cr, tx); 651 VERIFY(0 == dsl_dataset_own_obj(dd->dd_pool, dsobj, 652 B_TRUE, dmu_recv_tag, &rbsa->ds)); 653 654 if (rbsa->origin == NULL) { 655 (void) dmu_objset_create_impl(dd->dd_pool->dp_spa, 656 rbsa->ds, &rbsa->ds->ds_phys->ds_bp, rbsa->type, tx); 657 } 658 659 spa_history_log_internal(LOG_DS_REPLAY_FULL_SYNC, 660 dd->dd_pool->dp_spa, tx, "dataset = %lld", dsobj); 661 } 662 663 /* ARGSUSED */ 664 static int 665 recv_existing_check(void *arg1, void *arg2, dmu_tx_t *tx) 666 { 667 dsl_dataset_t *ds = arg1; 668 struct recvbeginsyncarg *rbsa = arg2; 669 int err; 670 uint64_t val; 671 672 /* must not have any changes since most recent snapshot */ 673 if (!rbsa->force && dsl_dataset_modified_since_lastsnap(ds)) 674 return (ETXTBSY); 675 676 /* new snapshot name must not exist */ 677 err = zap_lookup(ds->ds_dir->dd_pool->dp_meta_objset, 678 ds->ds_phys->ds_snapnames_zapobj, rbsa->tosnap, 8, 1, &val); 679 if (err == 0) 680 return (EEXIST); 681 if (err != ENOENT) 682 return (err); 683 684 if (rbsa->fromguid) { 685 /* if incremental, most recent snapshot must match fromguid */ 686 if (ds->ds_prev == NULL) 687 return (ENODEV); 688 689 /* 690 * most recent snapshot must match fromguid, or there are no 691 * changes since the fromguid one 692 */ 693 if (ds->ds_prev->ds_phys->ds_guid != rbsa->fromguid) { 694 uint64_t birth = ds->ds_prev->ds_phys->ds_bp.blk_birth; 695 uint64_t obj = ds->ds_prev->ds_phys->ds_prev_snap_obj; 696 while (obj != 0) { 697 dsl_dataset_t *snap; 698 err = dsl_dataset_hold_obj(ds->ds_dir->dd_pool, 699 obj, FTAG, &snap); 700 if (err) 701 return (ENODEV); 702 if (snap->ds_phys->ds_creation_txg < birth) { 703 dsl_dataset_rele(snap, FTAG); 704 return (ENODEV); 705 } 706 if (snap->ds_phys->ds_guid == rbsa->fromguid) { 707 dsl_dataset_rele(snap, FTAG); 708 break; /* it's ok */ 709 } 710 obj = snap->ds_phys->ds_prev_snap_obj; 711 dsl_dataset_rele(snap, FTAG); 712 } 713 if (obj == 0) 714 return (ENODEV); 715 } 716 } else { 717 /* if full, most recent snapshot must be $ORIGIN */ 718 if (ds->ds_phys->ds_prev_snap_txg >= TXG_INITIAL) 719 return (ENODEV); 720 } 721 722 /* temporary clone name must not exist */ 723 err = zap_lookup(ds->ds_dir->dd_pool->dp_meta_objset, 724 ds->ds_dir->dd_phys->dd_child_dir_zapobj, 725 rbsa->clonelastname, 8, 1, &val); 726 if (err == 0) 727 return (EEXIST); 728 if (err != ENOENT) 729 return (err); 730 731 return (0); 732 } 733 734 /* ARGSUSED */ 735 static void 736 recv_existing_sync(void *arg1, void *arg2, dmu_tx_t *tx) 737 { 738 dsl_dataset_t *ohds = arg1; 739 struct recvbeginsyncarg *rbsa = arg2; 740 dsl_pool_t *dp = ohds->ds_dir->dd_pool; 741 dsl_dataset_t *cds; 742 uint64_t flags = DS_FLAG_INCONSISTENT | rbsa->dsflags; 743 uint64_t dsobj; 744 745 /* create and open the temporary clone */ 746 dsobj = dsl_dataset_create_sync(ohds->ds_dir, rbsa->clonelastname, 747 ohds->ds_prev, flags, rbsa->cr, tx); 748 VERIFY(0 == dsl_dataset_own_obj(dp, dsobj, B_TRUE, dmu_recv_tag, &cds)); 749 750 /* 751 * If we actually created a non-clone, we need to create the 752 * objset in our new dataset. 753 */ 754 if (BP_IS_HOLE(dsl_dataset_get_blkptr(cds))) { 755 (void) dmu_objset_create_impl(dp->dp_spa, 756 cds, dsl_dataset_get_blkptr(cds), rbsa->type, tx); 757 } 758 759 rbsa->ds = cds; 760 761 spa_history_log_internal(LOG_DS_REPLAY_INC_SYNC, 762 dp->dp_spa, tx, "dataset = %lld", dsobj); 763 } 764 765 static boolean_t 766 dmu_recv_verify_features(dsl_dataset_t *ds, struct drr_begin *drrb) 767 { 768 int featureflags; 769 770 featureflags = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo); 771 772 /* Verify pool version supports SA if SA_SPILL feature set */ 773 return ((featureflags & DMU_BACKUP_FEATURE_SA_SPILL) && 774 (spa_version(dsl_dataset_get_spa(ds)) < SPA_VERSION_SA)); 775 } 776 777 /* 778 * NB: callers *MUST* call dmu_recv_stream() if dmu_recv_begin() 779 * succeeds; otherwise we will leak the holds on the datasets. 780 */ 781 int 782 dmu_recv_begin(char *tofs, char *tosnap, char *top_ds, struct drr_begin *drrb, 783 boolean_t force, objset_t *origin, dmu_recv_cookie_t *drc) 784 { 785 int err = 0; 786 boolean_t byteswap; 787 struct recvbeginsyncarg rbsa = { 0 }; 788 uint64_t versioninfo; 789 int flags; 790 dsl_dataset_t *ds; 791 792 if (drrb->drr_magic == DMU_BACKUP_MAGIC) 793 byteswap = FALSE; 794 else if (drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC)) 795 byteswap = TRUE; 796 else 797 return (EINVAL); 798 799 rbsa.tofs = tofs; 800 rbsa.tosnap = tosnap; 801 rbsa.origin = origin ? origin->os_dsl_dataset : NULL; 802 rbsa.fromguid = drrb->drr_fromguid; 803 rbsa.type = drrb->drr_type; 804 rbsa.tag = FTAG; 805 rbsa.dsflags = 0; 806 rbsa.cr = CRED(); 807 versioninfo = drrb->drr_versioninfo; 808 flags = drrb->drr_flags; 809 810 if (byteswap) { 811 rbsa.type = BSWAP_32(rbsa.type); 812 rbsa.fromguid = BSWAP_64(rbsa.fromguid); 813 versioninfo = BSWAP_64(versioninfo); 814 flags = BSWAP_32(flags); 815 } 816 817 if (DMU_GET_STREAM_HDRTYPE(versioninfo) == DMU_COMPOUNDSTREAM || 818 rbsa.type >= DMU_OST_NUMTYPES || 819 ((flags & DRR_FLAG_CLONE) && origin == NULL)) 820 return (EINVAL); 821 822 if (flags & DRR_FLAG_CI_DATA) 823 rbsa.dsflags = DS_FLAG_CI_DATASET; 824 825 bzero(drc, sizeof (dmu_recv_cookie_t)); 826 drc->drc_drrb = drrb; 827 drc->drc_tosnap = tosnap; 828 drc->drc_top_ds = top_ds; 829 drc->drc_force = force; 830 831 /* 832 * Process the begin in syncing context. 833 */ 834 835 /* open the dataset we are logically receiving into */ 836 err = dsl_dataset_hold(tofs, dmu_recv_tag, &ds); 837 if (err == 0) { 838 if (dmu_recv_verify_features(ds, drrb)) { 839 dsl_dataset_rele(ds, dmu_recv_tag); 840 return (ENOTSUP); 841 } 842 /* target fs already exists; recv into temp clone */ 843 844 /* Can't recv a clone into an existing fs */ 845 if (flags & DRR_FLAG_CLONE) { 846 dsl_dataset_rele(ds, dmu_recv_tag); 847 return (EINVAL); 848 } 849 850 /* must not have an incremental recv already in progress */ 851 if (!mutex_tryenter(&ds->ds_recvlock)) { 852 dsl_dataset_rele(ds, dmu_recv_tag); 853 return (EBUSY); 854 } 855 856 /* tmp clone name is: tofs/%tosnap" */ 857 (void) snprintf(rbsa.clonelastname, sizeof (rbsa.clonelastname), 858 "%%%s", tosnap); 859 rbsa.force = force; 860 err = dsl_sync_task_do(ds->ds_dir->dd_pool, 861 recv_existing_check, recv_existing_sync, ds, &rbsa, 5); 862 if (err) { 863 mutex_exit(&ds->ds_recvlock); 864 dsl_dataset_rele(ds, dmu_recv_tag); 865 return (err); 866 } 867 drc->drc_logical_ds = ds; 868 drc->drc_real_ds = rbsa.ds; 869 } else if (err == ENOENT) { 870 /* target fs does not exist; must be a full backup or clone */ 871 char *cp; 872 873 /* 874 * If it's a non-clone incremental, we are missing the 875 * target fs, so fail the recv. 876 */ 877 if (rbsa.fromguid && !(flags & DRR_FLAG_CLONE)) 878 return (ENOENT); 879 880 /* Open the parent of tofs */ 881 cp = strrchr(tofs, '/'); 882 *cp = '\0'; 883 err = dsl_dataset_hold(tofs, FTAG, &ds); 884 *cp = '/'; 885 if (err) 886 return (err); 887 888 if (dmu_recv_verify_features(ds, drrb)) { 889 dsl_dataset_rele(ds, FTAG); 890 return (ENOTSUP); 891 } 892 893 err = dsl_sync_task_do(ds->ds_dir->dd_pool, 894 recv_new_check, recv_new_sync, ds->ds_dir, &rbsa, 5); 895 dsl_dataset_rele(ds, FTAG); 896 if (err) 897 return (err); 898 drc->drc_logical_ds = drc->drc_real_ds = rbsa.ds; 899 drc->drc_newfs = B_TRUE; 900 } 901 902 return (err); 903 } 904 905 struct restorearg { 906 int err; 907 int byteswap; 908 vnode_t *vp; 909 char *buf; 910 uint64_t voff; 911 int bufsize; /* amount of memory allocated for buf */ 912 zio_cksum_t cksum; 913 avl_tree_t *guid_to_ds_map; 914 }; 915 916 typedef struct guid_map_entry { 917 uint64_t guid; 918 dsl_dataset_t *gme_ds; 919 avl_node_t avlnode; 920 } guid_map_entry_t; 921 922 static int 923 guid_compare(const void *arg1, const void *arg2) 924 { 925 const guid_map_entry_t *gmep1 = arg1; 926 const guid_map_entry_t *gmep2 = arg2; 927 928 if (gmep1->guid < gmep2->guid) 929 return (-1); 930 else if (gmep1->guid > gmep2->guid) 931 return (1); 932 return (0); 933 } 934 935 static void 936 free_guid_map_onexit(void *arg) 937 { 938 avl_tree_t *ca = arg; 939 void *cookie = NULL; 940 guid_map_entry_t *gmep; 941 942 while ((gmep = avl_destroy_nodes(ca, &cookie)) != NULL) { 943 dsl_dataset_rele(gmep->gme_ds, ca); 944 kmem_free(gmep, sizeof (guid_map_entry_t)); 945 } 946 avl_destroy(ca); 947 kmem_free(ca, sizeof (avl_tree_t)); 948 } 949 950 static void * 951 restore_read(struct restorearg *ra, int len) 952 { 953 void *rv; 954 int done = 0; 955 956 /* some things will require 8-byte alignment, so everything must */ 957 ASSERT3U(len % 8, ==, 0); 958 959 while (done < len) { 960 ssize_t resid; 961 962 ra->err = vn_rdwr(UIO_READ, ra->vp, 963 (caddr_t)ra->buf + done, len - done, 964 ra->voff, UIO_SYSSPACE, FAPPEND, 965 RLIM64_INFINITY, CRED(), &resid); 966 967 if (resid == len - done) 968 ra->err = EINVAL; 969 ra->voff += len - done - resid; 970 done = len - resid; 971 if (ra->err) 972 return (NULL); 973 } 974 975 ASSERT3U(done, ==, len); 976 rv = ra->buf; 977 if (ra->byteswap) 978 fletcher_4_incremental_byteswap(rv, len, &ra->cksum); 979 else 980 fletcher_4_incremental_native(rv, len, &ra->cksum); 981 return (rv); 982 } 983 984 static void 985 backup_byteswap(dmu_replay_record_t *drr) 986 { 987 #define DO64(X) (drr->drr_u.X = BSWAP_64(drr->drr_u.X)) 988 #define DO32(X) (drr->drr_u.X = BSWAP_32(drr->drr_u.X)) 989 drr->drr_type = BSWAP_32(drr->drr_type); 990 drr->drr_payloadlen = BSWAP_32(drr->drr_payloadlen); 991 switch (drr->drr_type) { 992 case DRR_BEGIN: 993 DO64(drr_begin.drr_magic); 994 DO64(drr_begin.drr_versioninfo); 995 DO64(drr_begin.drr_creation_time); 996 DO32(drr_begin.drr_type); 997 DO32(drr_begin.drr_flags); 998 DO64(drr_begin.drr_toguid); 999 DO64(drr_begin.drr_fromguid); 1000 break; 1001 case DRR_OBJECT: 1002 DO64(drr_object.drr_object); 1003 /* DO64(drr_object.drr_allocation_txg); */ 1004 DO32(drr_object.drr_type); 1005 DO32(drr_object.drr_bonustype); 1006 DO32(drr_object.drr_blksz); 1007 DO32(drr_object.drr_bonuslen); 1008 DO64(drr_object.drr_toguid); 1009 break; 1010 case DRR_FREEOBJECTS: 1011 DO64(drr_freeobjects.drr_firstobj); 1012 DO64(drr_freeobjects.drr_numobjs); 1013 DO64(drr_freeobjects.drr_toguid); 1014 break; 1015 case DRR_WRITE: 1016 DO64(drr_write.drr_object); 1017 DO32(drr_write.drr_type); 1018 DO64(drr_write.drr_offset); 1019 DO64(drr_write.drr_length); 1020 DO64(drr_write.drr_toguid); 1021 DO64(drr_write.drr_key.ddk_cksum.zc_word[0]); 1022 DO64(drr_write.drr_key.ddk_cksum.zc_word[1]); 1023 DO64(drr_write.drr_key.ddk_cksum.zc_word[2]); 1024 DO64(drr_write.drr_key.ddk_cksum.zc_word[3]); 1025 DO64(drr_write.drr_key.ddk_prop); 1026 break; 1027 case DRR_WRITE_BYREF: 1028 DO64(drr_write_byref.drr_object); 1029 DO64(drr_write_byref.drr_offset); 1030 DO64(drr_write_byref.drr_length); 1031 DO64(drr_write_byref.drr_toguid); 1032 DO64(drr_write_byref.drr_refguid); 1033 DO64(drr_write_byref.drr_refobject); 1034 DO64(drr_write_byref.drr_refoffset); 1035 DO64(drr_write_byref.drr_key.ddk_cksum.zc_word[0]); 1036 DO64(drr_write_byref.drr_key.ddk_cksum.zc_word[1]); 1037 DO64(drr_write_byref.drr_key.ddk_cksum.zc_word[2]); 1038 DO64(drr_write_byref.drr_key.ddk_cksum.zc_word[3]); 1039 DO64(drr_write_byref.drr_key.ddk_prop); 1040 break; 1041 case DRR_FREE: 1042 DO64(drr_free.drr_object); 1043 DO64(drr_free.drr_offset); 1044 DO64(drr_free.drr_length); 1045 DO64(drr_free.drr_toguid); 1046 break; 1047 case DRR_SPILL: 1048 DO64(drr_spill.drr_object); 1049 DO64(drr_spill.drr_length); 1050 DO64(drr_spill.drr_toguid); 1051 break; 1052 case DRR_END: 1053 DO64(drr_end.drr_checksum.zc_word[0]); 1054 DO64(drr_end.drr_checksum.zc_word[1]); 1055 DO64(drr_end.drr_checksum.zc_word[2]); 1056 DO64(drr_end.drr_checksum.zc_word[3]); 1057 DO64(drr_end.drr_toguid); 1058 break; 1059 } 1060 #undef DO64 1061 #undef DO32 1062 } 1063 1064 static int 1065 restore_object(struct restorearg *ra, objset_t *os, struct drr_object *drro) 1066 { 1067 int err; 1068 dmu_tx_t *tx; 1069 void *data = NULL; 1070 1071 if (drro->drr_type == DMU_OT_NONE || 1072 drro->drr_type >= DMU_OT_NUMTYPES || 1073 drro->drr_bonustype >= DMU_OT_NUMTYPES || 1074 drro->drr_checksumtype >= ZIO_CHECKSUM_FUNCTIONS || 1075 drro->drr_compress >= ZIO_COMPRESS_FUNCTIONS || 1076 P2PHASE(drro->drr_blksz, SPA_MINBLOCKSIZE) || 1077 drro->drr_blksz < SPA_MINBLOCKSIZE || 1078 drro->drr_blksz > SPA_MAXBLOCKSIZE || 1079 drro->drr_bonuslen > DN_MAX_BONUSLEN) { 1080 return (EINVAL); 1081 } 1082 1083 err = dmu_object_info(os, drro->drr_object, NULL); 1084 1085 if (err != 0 && err != ENOENT) 1086 return (EINVAL); 1087 1088 if (drro->drr_bonuslen) { 1089 data = restore_read(ra, P2ROUNDUP(drro->drr_bonuslen, 8)); 1090 if (ra->err) 1091 return (ra->err); 1092 } 1093 1094 if (err == ENOENT) { 1095 /* currently free, want to be allocated */ 1096 tx = dmu_tx_create(os); 1097 dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); 1098 err = dmu_tx_assign(tx, TXG_WAIT); 1099 if (err) { 1100 dmu_tx_abort(tx); 1101 return (err); 1102 } 1103 err = dmu_object_claim(os, drro->drr_object, 1104 drro->drr_type, drro->drr_blksz, 1105 drro->drr_bonustype, drro->drr_bonuslen, tx); 1106 dmu_tx_commit(tx); 1107 } else { 1108 /* currently allocated, want to be allocated */ 1109 err = dmu_object_reclaim(os, drro->drr_object, 1110 drro->drr_type, drro->drr_blksz, 1111 drro->drr_bonustype, drro->drr_bonuslen); 1112 } 1113 if (err) { 1114 return (EINVAL); 1115 } 1116 1117 tx = dmu_tx_create(os); 1118 dmu_tx_hold_bonus(tx, drro->drr_object); 1119 err = dmu_tx_assign(tx, TXG_WAIT); 1120 if (err) { 1121 dmu_tx_abort(tx); 1122 return (err); 1123 } 1124 1125 dmu_object_set_checksum(os, drro->drr_object, drro->drr_checksumtype, 1126 tx); 1127 dmu_object_set_compress(os, drro->drr_object, drro->drr_compress, tx); 1128 1129 if (data != NULL) { 1130 dmu_buf_t *db; 1131 1132 VERIFY(0 == dmu_bonus_hold(os, drro->drr_object, FTAG, &db)); 1133 dmu_buf_will_dirty(db, tx); 1134 1135 ASSERT3U(db->db_size, >=, drro->drr_bonuslen); 1136 bcopy(data, db->db_data, drro->drr_bonuslen); 1137 if (ra->byteswap) { 1138 dmu_ot[drro->drr_bonustype].ot_byteswap(db->db_data, 1139 drro->drr_bonuslen); 1140 } 1141 dmu_buf_rele(db, FTAG); 1142 } 1143 dmu_tx_commit(tx); 1144 return (0); 1145 } 1146 1147 /* ARGSUSED */ 1148 static int 1149 restore_freeobjects(struct restorearg *ra, objset_t *os, 1150 struct drr_freeobjects *drrfo) 1151 { 1152 uint64_t obj; 1153 1154 if (drrfo->drr_firstobj + drrfo->drr_numobjs < drrfo->drr_firstobj) 1155 return (EINVAL); 1156 1157 for (obj = drrfo->drr_firstobj; 1158 obj < drrfo->drr_firstobj + drrfo->drr_numobjs; 1159 (void) dmu_object_next(os, &obj, FALSE, 0)) { 1160 int err; 1161 1162 if (dmu_object_info(os, obj, NULL) != 0) 1163 continue; 1164 1165 err = dmu_free_object(os, obj); 1166 if (err) 1167 return (err); 1168 } 1169 return (0); 1170 } 1171 1172 static int 1173 restore_write(struct restorearg *ra, objset_t *os, 1174 struct drr_write *drrw) 1175 { 1176 dmu_tx_t *tx; 1177 void *data; 1178 int err; 1179 1180 if (drrw->drr_offset + drrw->drr_length < drrw->drr_offset || 1181 drrw->drr_type >= DMU_OT_NUMTYPES) 1182 return (EINVAL); 1183 1184 data = restore_read(ra, drrw->drr_length); 1185 if (data == NULL) 1186 return (ra->err); 1187 1188 if (dmu_object_info(os, drrw->drr_object, NULL) != 0) 1189 return (EINVAL); 1190 1191 tx = dmu_tx_create(os); 1192 1193 dmu_tx_hold_write(tx, drrw->drr_object, 1194 drrw->drr_offset, drrw->drr_length); 1195 err = dmu_tx_assign(tx, TXG_WAIT); 1196 if (err) { 1197 dmu_tx_abort(tx); 1198 return (err); 1199 } 1200 if (ra->byteswap) 1201 dmu_ot[drrw->drr_type].ot_byteswap(data, drrw->drr_length); 1202 dmu_write(os, drrw->drr_object, 1203 drrw->drr_offset, drrw->drr_length, data, tx); 1204 dmu_tx_commit(tx); 1205 return (0); 1206 } 1207 1208 /* 1209 * Handle a DRR_WRITE_BYREF record. This record is used in dedup'ed 1210 * streams to refer to a copy of the data that is already on the 1211 * system because it came in earlier in the stream. This function 1212 * finds the earlier copy of the data, and uses that copy instead of 1213 * data from the stream to fulfill this write. 1214 */ 1215 static int 1216 restore_write_byref(struct restorearg *ra, objset_t *os, 1217 struct drr_write_byref *drrwbr) 1218 { 1219 dmu_tx_t *tx; 1220 int err; 1221 guid_map_entry_t gmesrch; 1222 guid_map_entry_t *gmep; 1223 avl_index_t where; 1224 objset_t *ref_os = NULL; 1225 dmu_buf_t *dbp; 1226 1227 if (drrwbr->drr_offset + drrwbr->drr_length < drrwbr->drr_offset) 1228 return (EINVAL); 1229 1230 /* 1231 * If the GUID of the referenced dataset is different from the 1232 * GUID of the target dataset, find the referenced dataset. 1233 */ 1234 if (drrwbr->drr_toguid != drrwbr->drr_refguid) { 1235 gmesrch.guid = drrwbr->drr_refguid; 1236 if ((gmep = avl_find(ra->guid_to_ds_map, &gmesrch, 1237 &where)) == NULL) { 1238 return (EINVAL); 1239 } 1240 if (dmu_objset_from_ds(gmep->gme_ds, &ref_os)) 1241 return (EINVAL); 1242 } else { 1243 ref_os = os; 1244 } 1245 1246 if (err = dmu_buf_hold(ref_os, drrwbr->drr_refobject, 1247 drrwbr->drr_refoffset, FTAG, &dbp, DMU_READ_PREFETCH)) 1248 return (err); 1249 1250 tx = dmu_tx_create(os); 1251 1252 dmu_tx_hold_write(tx, drrwbr->drr_object, 1253 drrwbr->drr_offset, drrwbr->drr_length); 1254 err = dmu_tx_assign(tx, TXG_WAIT); 1255 if (err) { 1256 dmu_tx_abort(tx); 1257 return (err); 1258 } 1259 dmu_write(os, drrwbr->drr_object, 1260 drrwbr->drr_offset, drrwbr->drr_length, dbp->db_data, tx); 1261 dmu_buf_rele(dbp, FTAG); 1262 dmu_tx_commit(tx); 1263 return (0); 1264 } 1265 1266 static int 1267 restore_spill(struct restorearg *ra, objset_t *os, struct drr_spill *drrs) 1268 { 1269 dmu_tx_t *tx; 1270 void *data; 1271 dmu_buf_t *db, *db_spill; 1272 int err; 1273 1274 if (drrs->drr_length < SPA_MINBLOCKSIZE || 1275 drrs->drr_length > SPA_MAXBLOCKSIZE) 1276 return (EINVAL); 1277 1278 data = restore_read(ra, drrs->drr_length); 1279 if (data == NULL) 1280 return (ra->err); 1281 1282 if (dmu_object_info(os, drrs->drr_object, NULL) != 0) 1283 return (EINVAL); 1284 1285 VERIFY(0 == dmu_bonus_hold(os, drrs->drr_object, FTAG, &db)); 1286 if ((err = dmu_spill_hold_by_bonus(db, FTAG, &db_spill)) != 0) { 1287 dmu_buf_rele(db, FTAG); 1288 return (err); 1289 } 1290 1291 tx = dmu_tx_create(os); 1292 1293 dmu_tx_hold_spill(tx, db->db_object); 1294 1295 err = dmu_tx_assign(tx, TXG_WAIT); 1296 if (err) { 1297 dmu_buf_rele(db, FTAG); 1298 dmu_buf_rele(db_spill, FTAG); 1299 dmu_tx_abort(tx); 1300 return (err); 1301 } 1302 dmu_buf_will_dirty(db_spill, tx); 1303 1304 if (db_spill->db_size < drrs->drr_length) 1305 VERIFY(0 == dbuf_spill_set_blksz(db_spill, 1306 drrs->drr_length, tx)); 1307 bcopy(data, db_spill->db_data, drrs->drr_length); 1308 1309 dmu_buf_rele(db, FTAG); 1310 dmu_buf_rele(db_spill, FTAG); 1311 1312 dmu_tx_commit(tx); 1313 return (0); 1314 } 1315 1316 /* ARGSUSED */ 1317 static int 1318 restore_free(struct restorearg *ra, objset_t *os, 1319 struct drr_free *drrf) 1320 { 1321 int err; 1322 1323 if (drrf->drr_length != -1ULL && 1324 drrf->drr_offset + drrf->drr_length < drrf->drr_offset) 1325 return (EINVAL); 1326 1327 if (dmu_object_info(os, drrf->drr_object, NULL) != 0) 1328 return (EINVAL); 1329 1330 err = dmu_free_long_range(os, drrf->drr_object, 1331 drrf->drr_offset, drrf->drr_length); 1332 return (err); 1333 } 1334 1335 /* 1336 * NB: callers *must* call dmu_recv_end() if this succeeds. 1337 */ 1338 int 1339 dmu_recv_stream(dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp, 1340 int cleanup_fd, uint64_t *action_handlep) 1341 { 1342 struct restorearg ra = { 0 }; 1343 dmu_replay_record_t *drr; 1344 objset_t *os; 1345 zio_cksum_t pcksum; 1346 int featureflags; 1347 1348 if (drc->drc_drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC)) 1349 ra.byteswap = TRUE; 1350 1351 { 1352 /* compute checksum of drr_begin record */ 1353 dmu_replay_record_t *drr; 1354 drr = kmem_zalloc(sizeof (dmu_replay_record_t), KM_SLEEP); 1355 1356 drr->drr_type = DRR_BEGIN; 1357 drr->drr_u.drr_begin = *drc->drc_drrb; 1358 if (ra.byteswap) { 1359 fletcher_4_incremental_byteswap(drr, 1360 sizeof (dmu_replay_record_t), &ra.cksum); 1361 } else { 1362 fletcher_4_incremental_native(drr, 1363 sizeof (dmu_replay_record_t), &ra.cksum); 1364 } 1365 kmem_free(drr, sizeof (dmu_replay_record_t)); 1366 } 1367 1368 if (ra.byteswap) { 1369 struct drr_begin *drrb = drc->drc_drrb; 1370 drrb->drr_magic = BSWAP_64(drrb->drr_magic); 1371 drrb->drr_versioninfo = BSWAP_64(drrb->drr_versioninfo); 1372 drrb->drr_creation_time = BSWAP_64(drrb->drr_creation_time); 1373 drrb->drr_type = BSWAP_32(drrb->drr_type); 1374 drrb->drr_toguid = BSWAP_64(drrb->drr_toguid); 1375 drrb->drr_fromguid = BSWAP_64(drrb->drr_fromguid); 1376 } 1377 1378 ra.vp = vp; 1379 ra.voff = *voffp; 1380 ra.bufsize = 1<<20; 1381 ra.buf = kmem_alloc(ra.bufsize, KM_SLEEP); 1382 1383 /* these were verified in dmu_recv_begin */ 1384 ASSERT(DMU_GET_STREAM_HDRTYPE(drc->drc_drrb->drr_versioninfo) == 1385 DMU_SUBSTREAM); 1386 ASSERT(drc->drc_drrb->drr_type < DMU_OST_NUMTYPES); 1387 1388 /* 1389 * Open the objset we are modifying. 1390 */ 1391 VERIFY(dmu_objset_from_ds(drc->drc_real_ds, &os) == 0); 1392 1393 ASSERT(drc->drc_real_ds->ds_phys->ds_flags & DS_FLAG_INCONSISTENT); 1394 1395 featureflags = DMU_GET_FEATUREFLAGS(drc->drc_drrb->drr_versioninfo); 1396 1397 /* if this stream is dedup'ed, set up the avl tree for guid mapping */ 1398 if (featureflags & DMU_BACKUP_FEATURE_DEDUP) { 1399 minor_t minor; 1400 1401 if (cleanup_fd == -1) { 1402 ra.err = EBADF; 1403 goto out; 1404 } 1405 ra.err = zfs_onexit_fd_hold(cleanup_fd, &minor); 1406 if (ra.err) { 1407 cleanup_fd = -1; 1408 goto out; 1409 } 1410 1411 if (*action_handlep == 0) { 1412 ra.guid_to_ds_map = 1413 kmem_alloc(sizeof (avl_tree_t), KM_SLEEP); 1414 avl_create(ra.guid_to_ds_map, guid_compare, 1415 sizeof (guid_map_entry_t), 1416 offsetof(guid_map_entry_t, avlnode)); 1417 ra.err = zfs_onexit_add_cb(minor, 1418 free_guid_map_onexit, ra.guid_to_ds_map, 1419 action_handlep); 1420 if (ra.err) 1421 goto out; 1422 } else { 1423 ra.err = zfs_onexit_cb_data(minor, *action_handlep, 1424 (void **)&ra.guid_to_ds_map); 1425 if (ra.err) 1426 goto out; 1427 } 1428 1429 drc->drc_guid_to_ds_map = ra.guid_to_ds_map; 1430 } 1431 1432 /* 1433 * Read records and process them. 1434 */ 1435 pcksum = ra.cksum; 1436 while (ra.err == 0 && 1437 NULL != (drr = restore_read(&ra, sizeof (*drr)))) { 1438 if (issig(JUSTLOOKING) && issig(FORREAL)) { 1439 ra.err = EINTR; 1440 goto out; 1441 } 1442 1443 if (ra.byteswap) 1444 backup_byteswap(drr); 1445 1446 switch (drr->drr_type) { 1447 case DRR_OBJECT: 1448 { 1449 /* 1450 * We need to make a copy of the record header, 1451 * because restore_{object,write} may need to 1452 * restore_read(), which will invalidate drr. 1453 */ 1454 struct drr_object drro = drr->drr_u.drr_object; 1455 ra.err = restore_object(&ra, os, &drro); 1456 break; 1457 } 1458 case DRR_FREEOBJECTS: 1459 { 1460 struct drr_freeobjects drrfo = 1461 drr->drr_u.drr_freeobjects; 1462 ra.err = restore_freeobjects(&ra, os, &drrfo); 1463 break; 1464 } 1465 case DRR_WRITE: 1466 { 1467 struct drr_write drrw = drr->drr_u.drr_write; 1468 ra.err = restore_write(&ra, os, &drrw); 1469 break; 1470 } 1471 case DRR_WRITE_BYREF: 1472 { 1473 struct drr_write_byref drrwbr = 1474 drr->drr_u.drr_write_byref; 1475 ra.err = restore_write_byref(&ra, os, &drrwbr); 1476 break; 1477 } 1478 case DRR_FREE: 1479 { 1480 struct drr_free drrf = drr->drr_u.drr_free; 1481 ra.err = restore_free(&ra, os, &drrf); 1482 break; 1483 } 1484 case DRR_END: 1485 { 1486 struct drr_end drre = drr->drr_u.drr_end; 1487 /* 1488 * We compare against the *previous* checksum 1489 * value, because the stored checksum is of 1490 * everything before the DRR_END record. 1491 */ 1492 if (!ZIO_CHECKSUM_EQUAL(drre.drr_checksum, pcksum)) 1493 ra.err = ECKSUM; 1494 goto out; 1495 } 1496 case DRR_SPILL: 1497 { 1498 struct drr_spill drrs = drr->drr_u.drr_spill; 1499 ra.err = restore_spill(&ra, os, &drrs); 1500 break; 1501 } 1502 default: 1503 ra.err = EINVAL; 1504 goto out; 1505 } 1506 pcksum = ra.cksum; 1507 } 1508 ASSERT(ra.err != 0); 1509 1510 out: 1511 if ((featureflags & DMU_BACKUP_FEATURE_DEDUP) && (cleanup_fd != -1)) 1512 zfs_onexit_fd_rele(cleanup_fd); 1513 1514 if (ra.err != 0) { 1515 /* 1516 * destroy what we created, so we don't leave it in the 1517 * inconsistent restoring state. 1518 */ 1519 txg_wait_synced(drc->drc_real_ds->ds_dir->dd_pool, 0); 1520 1521 (void) dsl_dataset_destroy(drc->drc_real_ds, dmu_recv_tag, 1522 B_FALSE); 1523 if (drc->drc_real_ds != drc->drc_logical_ds) { 1524 mutex_exit(&drc->drc_logical_ds->ds_recvlock); 1525 dsl_dataset_rele(drc->drc_logical_ds, dmu_recv_tag); 1526 } 1527 } 1528 1529 kmem_free(ra.buf, ra.bufsize); 1530 *voffp = ra.voff; 1531 return (ra.err); 1532 } 1533 1534 struct recvendsyncarg { 1535 char *tosnap; 1536 uint64_t creation_time; 1537 uint64_t toguid; 1538 }; 1539 1540 static int 1541 recv_end_check(void *arg1, void *arg2, dmu_tx_t *tx) 1542 { 1543 dsl_dataset_t *ds = arg1; 1544 struct recvendsyncarg *resa = arg2; 1545 1546 return (dsl_dataset_snapshot_check(ds, resa->tosnap, tx)); 1547 } 1548 1549 static void 1550 recv_end_sync(void *arg1, void *arg2, dmu_tx_t *tx) 1551 { 1552 dsl_dataset_t *ds = arg1; 1553 struct recvendsyncarg *resa = arg2; 1554 1555 dsl_dataset_snapshot_sync(ds, resa->tosnap, tx); 1556 1557 /* set snapshot's creation time and guid */ 1558 dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx); 1559 ds->ds_prev->ds_phys->ds_creation_time = resa->creation_time; 1560 ds->ds_prev->ds_phys->ds_guid = resa->toguid; 1561 ds->ds_prev->ds_phys->ds_flags &= ~DS_FLAG_INCONSISTENT; 1562 1563 dmu_buf_will_dirty(ds->ds_dbuf, tx); 1564 ds->ds_phys->ds_flags &= ~DS_FLAG_INCONSISTENT; 1565 } 1566 1567 static int 1568 add_ds_to_guidmap(avl_tree_t *guid_map, dsl_dataset_t *ds) 1569 { 1570 dsl_pool_t *dp = ds->ds_dir->dd_pool; 1571 uint64_t snapobj = ds->ds_phys->ds_prev_snap_obj; 1572 dsl_dataset_t *snapds; 1573 guid_map_entry_t *gmep; 1574 int err; 1575 1576 ASSERT(guid_map != NULL); 1577 1578 rw_enter(&dp->dp_config_rwlock, RW_READER); 1579 err = dsl_dataset_hold_obj(dp, snapobj, guid_map, &snapds); 1580 if (err == 0) { 1581 gmep = kmem_alloc(sizeof (guid_map_entry_t), KM_SLEEP); 1582 gmep->guid = snapds->ds_phys->ds_guid; 1583 gmep->gme_ds = snapds; 1584 avl_add(guid_map, gmep); 1585 } 1586 1587 rw_exit(&dp->dp_config_rwlock); 1588 return (err); 1589 } 1590 1591 static int 1592 dmu_recv_existing_end(dmu_recv_cookie_t *drc) 1593 { 1594 struct recvendsyncarg resa; 1595 dsl_dataset_t *ds = drc->drc_logical_ds; 1596 int err; 1597 1598 /* 1599 * XXX hack; seems the ds is still dirty and dsl_pool_zil_clean() 1600 * expects it to have a ds_user_ptr (and zil), but clone_swap() 1601 * can close it. 1602 */ 1603 txg_wait_synced(ds->ds_dir->dd_pool, 0); 1604 1605 if (dsl_dataset_tryown(ds, FALSE, dmu_recv_tag)) { 1606 err = dsl_dataset_clone_swap(drc->drc_real_ds, ds, 1607 drc->drc_force); 1608 if (err) 1609 goto out; 1610 } else { 1611 mutex_exit(&ds->ds_recvlock); 1612 dsl_dataset_rele(ds, dmu_recv_tag); 1613 (void) dsl_dataset_destroy(drc->drc_real_ds, dmu_recv_tag, 1614 B_FALSE); 1615 return (EBUSY); 1616 } 1617 1618 resa.creation_time = drc->drc_drrb->drr_creation_time; 1619 resa.toguid = drc->drc_drrb->drr_toguid; 1620 resa.tosnap = drc->drc_tosnap; 1621 1622 err = dsl_sync_task_do(ds->ds_dir->dd_pool, 1623 recv_end_check, recv_end_sync, ds, &resa, 3); 1624 if (err) { 1625 /* swap back */ 1626 (void) dsl_dataset_clone_swap(drc->drc_real_ds, ds, B_TRUE); 1627 } 1628 1629 out: 1630 mutex_exit(&ds->ds_recvlock); 1631 if (err == 0 && drc->drc_guid_to_ds_map != NULL) 1632 (void) add_ds_to_guidmap(drc->drc_guid_to_ds_map, ds); 1633 dsl_dataset_disown(ds, dmu_recv_tag); 1634 (void) dsl_dataset_destroy(drc->drc_real_ds, dmu_recv_tag, B_FALSE); 1635 return (err); 1636 } 1637 1638 static int 1639 dmu_recv_new_end(dmu_recv_cookie_t *drc) 1640 { 1641 struct recvendsyncarg resa; 1642 dsl_dataset_t *ds = drc->drc_logical_ds; 1643 int err; 1644 1645 /* 1646 * XXX hack; seems the ds is still dirty and dsl_pool_zil_clean() 1647 * expects it to have a ds_user_ptr (and zil), but clone_swap() 1648 * can close it. 1649 */ 1650 txg_wait_synced(ds->ds_dir->dd_pool, 0); 1651 1652 resa.creation_time = drc->drc_drrb->drr_creation_time; 1653 resa.toguid = drc->drc_drrb->drr_toguid; 1654 resa.tosnap = drc->drc_tosnap; 1655 1656 err = dsl_sync_task_do(ds->ds_dir->dd_pool, 1657 recv_end_check, recv_end_sync, ds, &resa, 3); 1658 if (err) { 1659 /* clean up the fs we just recv'd into */ 1660 (void) dsl_dataset_destroy(ds, dmu_recv_tag, B_FALSE); 1661 } else { 1662 if (drc->drc_guid_to_ds_map != NULL) 1663 (void) add_ds_to_guidmap(drc->drc_guid_to_ds_map, ds); 1664 /* release the hold from dmu_recv_begin */ 1665 dsl_dataset_disown(ds, dmu_recv_tag); 1666 } 1667 return (err); 1668 } 1669 1670 int 1671 dmu_recv_end(dmu_recv_cookie_t *drc) 1672 { 1673 if (drc->drc_logical_ds != drc->drc_real_ds) 1674 return (dmu_recv_existing_end(drc)); 1675 else 1676 return (dmu_recv_new_end(drc)); 1677 } 1678