/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
 * Copyright (c) 2013 by Delphix. All rights reserved.
 * Copyright (c) 2012, Joyent, Inc. All rights reserved.
 */

#include <sys/dmu.h>
#include <sys/dmu_impl.h>
#include <sys/dmu_tx.h>
#include <sys/dbuf.h>
#include <sys/dnode.h>
#include <sys/zfs_context.h>
#include <sys/dmu_objset.h>
#include <sys/dmu_traverse.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_dir.h>
#include <sys/dsl_prop.h>
#include <sys/dsl_pool.h>
#include <sys/dsl_synctask.h>
#include <sys/zfs_ioctl.h>
#include <sys/zap.h>
#include <sys/zio_checksum.h>
#include <sys/zfs_znode.h>
#include <zfs_fletcher.h>
#include <sys/avl.h>
#include <sys/ddt.h>
#include <sys/zfs_onexit.h>
#include <sys/dmu_send.h>
#include <sys/dsl_destroy.h>

/* Set this tunable to TRUE to replace corrupt data with 0x2f5baddb10c */
int zfs_send_corrupt_data = B_FALSE;

static char *dmu_recv_tag = "dmu_recv_tag";
static const char *recv_clone_name = "%recv";

/*
 * Write a raw chunk of the send stream to the output vnode, folding it
 * into the running fletcher4 checksum and advancing the caller's offset.
 */
static int
dump_bytes(dmu_sendarg_t *dsp, void *buf, int len)
{
	dsl_dataset_t *ds = dsp->dsa_os->os_dsl_dataset;
	ssize_t resid; /* have to get resid to get detailed errno */
	ASSERT0(len % 8);

	fletcher_4_incremental_native(buf, len, &dsp->dsa_zc);
	dsp->dsa_err = vn_rdwr(UIO_WRITE, dsp->dsa_vp,
	    (caddr_t)buf, len,
	    0, UIO_SYSSPACE, FAPPEND, RLIM64_INFINITY, CRED(), &resid);

	mutex_enter(&ds->ds_sendstream_lock);
	*dsp->dsa_off += len;
	mutex_exit(&ds->ds_sendstream_lock);

	return (dsp->dsa_err);
}
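/*
 * Queue a DRR_FREE record covering the given range of the object.  To
 * keep the stream compact, adjacent free ranges for the same object are
 * coalesced into a single pending record; the record is only written
 * out when something that can't be aggregated with it comes along.
 */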
static int
dump_free(dmu_sendarg_t *dsp, uint64_t object, uint64_t offset,
    uint64_t length)
{
	struct drr_free *drrf = &(dsp->dsa_drr->drr_u.drr_free);

	if (length != -1ULL && offset + length < offset)
		length = -1ULL;

	/*
	 * If there is a pending op, but it's not PENDING_FREE, push it out,
	 * since free block aggregation can only be done for blocks of the
	 * same type (i.e., DRR_FREE records can only be aggregated with
	 * other DRR_FREE records; DRR_FREEOBJECTS records can only be
	 * aggregated with other DRR_FREEOBJECTS records).
	 */
	if (dsp->dsa_pending_op != PENDING_NONE &&
	    dsp->dsa_pending_op != PENDING_FREE) {
		if (dump_bytes(dsp, dsp->dsa_drr,
		    sizeof (dmu_replay_record_t)) != 0)
			return (SET_ERROR(EINTR));
		dsp->dsa_pending_op = PENDING_NONE;
	}

	if (dsp->dsa_pending_op == PENDING_FREE) {
		/*
		 * There should never be a PENDING_FREE if length is -1
		 * (because dump_dnode is the only place where this
		 * function is called with a -1, and only after flushing
		 * any pending record).
		 */
		ASSERT(length != -1ULL);
		/*
		 * Check to see whether this free block can be aggregated
		 * with the pending one.
		 */
		if (drrf->drr_object == object && drrf->drr_offset +
		    drrf->drr_length == offset) {
			drrf->drr_length += length;
			return (0);
		} else {
			/* not a continuation.  Push out pending record */
			if (dump_bytes(dsp, dsp->dsa_drr,
			    sizeof (dmu_replay_record_t)) != 0)
				return (SET_ERROR(EINTR));
			dsp->dsa_pending_op = PENDING_NONE;
		}
	}
	/* create a FREE record and make it pending */
	bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t));
	dsp->dsa_drr->drr_type = DRR_FREE;
	drrf->drr_object = object;
	drrf->drr_offset = offset;
	drrf->drr_length = length;
	drrf->drr_toguid = dsp->dsa_toguid;
	if (length == -1ULL) {
		if (dump_bytes(dsp, dsp->dsa_drr,
		    sizeof (dmu_replay_record_t)) != 0)
			return (SET_ERROR(EINTR));
	} else {
		dsp->dsa_pending_op = PENDING_FREE;
	}

	return (0);
}
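/*
 * Emit a DRR_WRITE record followed by the block's payload.  The block
 * pointer's checksum and DDT key are copied into the record so that a
 * dedup-aware receiver can recognize later DRR_WRITE_BYREF references
 * to this block.
 */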
static int
dump_data(dmu_sendarg_t *dsp, dmu_object_type_t type,
    uint64_t object, uint64_t offset, int blksz, const blkptr_t *bp, void *data)
{
	struct drr_write *drrw = &(dsp->dsa_drr->drr_u.drr_write);

	/*
	 * If there is any kind of pending aggregation (currently either
	 * a grouping of free objects or free blocks), push it out to
	 * the stream, since aggregation can't be done across operations
	 * of different types.
	 */
	if (dsp->dsa_pending_op != PENDING_NONE) {
		if (dump_bytes(dsp, dsp->dsa_drr,
		    sizeof (dmu_replay_record_t)) != 0)
			return (SET_ERROR(EINTR));
		dsp->dsa_pending_op = PENDING_NONE;
	}
	/* write a DATA record */
	bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t));
	dsp->dsa_drr->drr_type = DRR_WRITE;
	drrw->drr_object = object;
	drrw->drr_type = type;
	drrw->drr_offset = offset;
	drrw->drr_length = blksz;
	drrw->drr_toguid = dsp->dsa_toguid;
	drrw->drr_checksumtype = BP_GET_CHECKSUM(bp);
	if (zio_checksum_table[drrw->drr_checksumtype].ci_dedup)
		drrw->drr_checksumflags |= DRR_CHECKSUM_DEDUP;
	DDK_SET_LSIZE(&drrw->drr_key, BP_GET_LSIZE(bp));
	DDK_SET_PSIZE(&drrw->drr_key, BP_GET_PSIZE(bp));
	DDK_SET_COMPRESS(&drrw->drr_key, BP_GET_COMPRESS(bp));
	drrw->drr_key.ddk_cksum = bp->blk_cksum;

	if (dump_bytes(dsp, dsp->dsa_drr, sizeof (dmu_replay_record_t)) != 0)
		return (SET_ERROR(EINTR));
	if (dump_bytes(dsp, data, blksz) != 0)
		return (SET_ERROR(EINTR));
	return (0);
}

static int
dump_spill(dmu_sendarg_t *dsp, uint64_t object, int blksz, void *data)
{
	struct drr_spill *drrs = &(dsp->dsa_drr->drr_u.drr_spill);

	if (dsp->dsa_pending_op != PENDING_NONE) {
		if (dump_bytes(dsp, dsp->dsa_drr,
		    sizeof (dmu_replay_record_t)) != 0)
			return (SET_ERROR(EINTR));
		dsp->dsa_pending_op = PENDING_NONE;
	}

	/* write a SPILL record */
	bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t));
	dsp->dsa_drr->drr_type = DRR_SPILL;
	drrs->drr_object = object;
	drrs->drr_length = blksz;
	drrs->drr_toguid = dsp->dsa_toguid;

	if (dump_bytes(dsp, dsp->dsa_drr, sizeof (dmu_replay_record_t)))
		return (SET_ERROR(EINTR));
	if (dump_bytes(dsp, data, blksz))
		return (SET_ERROR(EINTR));
	return (0);
}
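/*
 * Queue a DRR_FREEOBJECTS record for numobjs objects starting at
 * firstobj.  Like dump_free(), consecutive runs of freed objects are
 * coalesced into a single pending record.
 */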
static int
dump_freeobjects(dmu_sendarg_t *dsp, uint64_t firstobj, uint64_t numobjs)
{
	struct drr_freeobjects *drrfo = &(dsp->dsa_drr->drr_u.drr_freeobjects);

	/*
	 * If there is a pending op, but it's not PENDING_FREEOBJECTS,
	 * push it out, since free block aggregation can only be done for
	 * blocks of the same type (i.e., DRR_FREE records can only be
	 * aggregated with other DRR_FREE records; DRR_FREEOBJECTS records
	 * can only be aggregated with other DRR_FREEOBJECTS records).
	 */
	if (dsp->dsa_pending_op != PENDING_NONE &&
	    dsp->dsa_pending_op != PENDING_FREEOBJECTS) {
		if (dump_bytes(dsp, dsp->dsa_drr,
		    sizeof (dmu_replay_record_t)) != 0)
			return (SET_ERROR(EINTR));
		dsp->dsa_pending_op = PENDING_NONE;
	}
	if (dsp->dsa_pending_op == PENDING_FREEOBJECTS) {
		/*
		 * See whether this free object array can be aggregated
		 * with the pending one.
		 */
		if (drrfo->drr_firstobj + drrfo->drr_numobjs == firstobj) {
			drrfo->drr_numobjs += numobjs;
			return (0);
		} else {
			/* can't be aggregated.  Push out pending record */
			if (dump_bytes(dsp, dsp->dsa_drr,
			    sizeof (dmu_replay_record_t)) != 0)
				return (SET_ERROR(EINTR));
			dsp->dsa_pending_op = PENDING_NONE;
		}
	}

	/* write a FREEOBJECTS record */
	bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t));
	dsp->dsa_drr->drr_type = DRR_FREEOBJECTS;
	drrfo->drr_firstobj = firstobj;
	drrfo->drr_numobjs = numobjs;
	drrfo->drr_toguid = dsp->dsa_toguid;

	dsp->dsa_pending_op = PENDING_FREEOBJECTS;

	return (0);
}

static int
dump_dnode(dmu_sendarg_t *dsp, uint64_t object, dnode_phys_t *dnp)
{
	struct drr_object *drro = &(dsp->dsa_drr->drr_u.drr_object);

	if (dnp == NULL || dnp->dn_type == DMU_OT_NONE)
		return (dump_freeobjects(dsp, object, 1));

	if (dsp->dsa_pending_op != PENDING_NONE) {
		if (dump_bytes(dsp, dsp->dsa_drr,
		    sizeof (dmu_replay_record_t)) != 0)
			return (SET_ERROR(EINTR));
		dsp->dsa_pending_op = PENDING_NONE;
	}

	/* write an OBJECT record */
	bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t));
	dsp->dsa_drr->drr_type = DRR_OBJECT;
	drro->drr_object = object;
	drro->drr_type = dnp->dn_type;
	drro->drr_bonustype = dnp->dn_bonustype;
	drro->drr_blksz = dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT;
	drro->drr_bonuslen = dnp->dn_bonuslen;
	drro->drr_checksumtype = dnp->dn_checksum;
	drro->drr_compress = dnp->dn_compress;
	drro->drr_toguid = dsp->dsa_toguid;

	if (dump_bytes(dsp, dsp->dsa_drr, sizeof (dmu_replay_record_t)) != 0)
		return (SET_ERROR(EINTR));

	if (dump_bytes(dsp, DN_BONUS(dnp), P2ROUNDUP(dnp->dn_bonuslen, 8)) != 0)
		return (SET_ERROR(EINTR));

	/* free anything past the end of the file */
	if (dump_free(dsp, object, (dnp->dn_maxblkid + 1) *
	    (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT), -1ULL))
		return (SET_ERROR(EINTR));
	if (dsp->dsa_err != 0)
		return (SET_ERROR(EINTR));
	return (0);
}

#define	BP_SPAN(dnp, level) \
	(((uint64_t)dnp->dn_datablkszsec) << (SPA_MINBLOCKSHIFT + \
	(level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT)))
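/*
 * Callback invoked by traverse_dataset() for each block pointer in the
 * snapshot.  Translates what the traversal found (a hole, a block of
 * dnodes, a spill block, or an ordinary level-0 data block) into the
 * corresponding stream record(s).
 */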
/* ARGSUSED */
static int
backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
    const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg)
{
	dmu_sendarg_t *dsp = arg;
	dmu_object_type_t type = bp ? BP_GET_TYPE(bp) : DMU_OT_NONE;
	int err = 0;

	if (issig(JUSTLOOKING) && issig(FORREAL))
		return (SET_ERROR(EINTR));

	if (zb->zb_object != DMU_META_DNODE_OBJECT &&
	    DMU_OBJECT_IS_SPECIAL(zb->zb_object)) {
		return (0);
	} else if (bp == NULL && zb->zb_object == DMU_META_DNODE_OBJECT) {
		uint64_t span = BP_SPAN(dnp, zb->zb_level);
		uint64_t dnobj = (zb->zb_blkid * span) >> DNODE_SHIFT;
		err = dump_freeobjects(dsp, dnobj, span >> DNODE_SHIFT);
	} else if (bp == NULL) {
		uint64_t span = BP_SPAN(dnp, zb->zb_level);
		err = dump_free(dsp, zb->zb_object, zb->zb_blkid * span, span);
	} else if (zb->zb_level > 0 || type == DMU_OT_OBJSET) {
		return (0);
	} else if (type == DMU_OT_DNODE) {
		dnode_phys_t *blk;
		int i;
		int blksz = BP_GET_LSIZE(bp);
		uint32_t aflags = ARC_WAIT;
		arc_buf_t *abuf;

		if (arc_read(NULL, spa, bp, arc_getbuf_func, &abuf,
		    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL,
		    &aflags, zb) != 0)
			return (SET_ERROR(EIO));

		blk = abuf->b_data;
		for (i = 0; i < blksz >> DNODE_SHIFT; i++) {
			uint64_t dnobj = (zb->zb_blkid <<
			    (DNODE_BLOCK_SHIFT - DNODE_SHIFT)) + i;
			err = dump_dnode(dsp, dnobj, blk+i);
			if (err != 0)
				break;
		}
		(void) arc_buf_remove_ref(abuf, &abuf);
	} else if (type == DMU_OT_SA) {
		uint32_t aflags = ARC_WAIT;
		arc_buf_t *abuf;
		int blksz = BP_GET_LSIZE(bp);

		if (arc_read(NULL, spa, bp, arc_getbuf_func, &abuf,
		    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL,
		    &aflags, zb) != 0)
			return (SET_ERROR(EIO));

		err = dump_spill(dsp, zb->zb_object, blksz, abuf->b_data);
		(void) arc_buf_remove_ref(abuf, &abuf);
	} else { /* it's a level-0 block of a regular object */
		uint32_t aflags = ARC_WAIT;
		arc_buf_t *abuf;
		int blksz = BP_GET_LSIZE(bp);

		if (arc_read(NULL, spa, bp, arc_getbuf_func, &abuf,
		    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL,
		    &aflags, zb) != 0) {
			if (zfs_send_corrupt_data) {
				/* Send a block filled with 0x"zfs badd bloc" */
				abuf = arc_buf_alloc(spa, blksz, &abuf,
				    ARC_BUFC_DATA);
				uint64_t *ptr;
				for (ptr = abuf->b_data;
				    (char *)ptr < (char *)abuf->b_data + blksz;
				    ptr++)
					*ptr = 0x2f5baddb10c;
			} else {
				return (SET_ERROR(EIO));
			}
		}

		err = dump_data(dsp, type, zb->zb_object, zb->zb_blkid * blksz,
		    blksz, bp, abuf->b_data);
		(void) arc_buf_remove_ref(abuf, &abuf);
	}

	ASSERT(err == 0 || err == EINTR);
	return (err);
}
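/*
 * A send stream is a sequence of dmu_replay_record_t's: a DRR_BEGIN
 * record describing the snapshot, then the records generated by
 * backup_cb() as traverse_dataset() walks the snapshot, and finally a
 * DRR_END record carrying the fletcher4 checksum of everything that
 * preceded it.
 */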
/*
 * Releases dp, ds, and fromds, using the specified tag.
 */
static int
dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *ds,
    dsl_dataset_t *fromds, int outfd, vnode_t *vp, offset_t *off)
{
	objset_t *os;
	dmu_replay_record_t *drr;
	dmu_sendarg_t *dsp;
	int err;
	uint64_t fromtxg = 0;

	if (fromds != NULL && !dsl_dataset_is_before(ds, fromds)) {
		dsl_dataset_rele(fromds, tag);
		dsl_dataset_rele(ds, tag);
		dsl_pool_rele(dp, tag);
		return (SET_ERROR(EXDEV));
	}

	err = dmu_objset_from_ds(ds, &os);
	if (err != 0) {
		if (fromds != NULL)
			dsl_dataset_rele(fromds, tag);
		dsl_dataset_rele(ds, tag);
		dsl_pool_rele(dp, tag);
		return (err);
	}

	drr = kmem_zalloc(sizeof (dmu_replay_record_t), KM_SLEEP);
	drr->drr_type = DRR_BEGIN;
	drr->drr_u.drr_begin.drr_magic = DMU_BACKUP_MAGIC;
	DMU_SET_STREAM_HDRTYPE(drr->drr_u.drr_begin.drr_versioninfo,
	    DMU_SUBSTREAM);

#ifdef _KERNEL
	if (dmu_objset_type(os) == DMU_OST_ZFS) {
		uint64_t version;
		if (zfs_get_zplprop(os, ZFS_PROP_VERSION, &version) != 0) {
			kmem_free(drr, sizeof (dmu_replay_record_t));
			if (fromds != NULL)
				dsl_dataset_rele(fromds, tag);
			dsl_dataset_rele(ds, tag);
			dsl_pool_rele(dp, tag);
			return (SET_ERROR(EINVAL));
		}
		if (version >= ZPL_VERSION_SA) {
			DMU_SET_FEATUREFLAGS(
			    drr->drr_u.drr_begin.drr_versioninfo,
			    DMU_BACKUP_FEATURE_SA_SPILL);
		}
	}
#endif

	drr->drr_u.drr_begin.drr_creation_time =
	    ds->ds_phys->ds_creation_time;
	drr->drr_u.drr_begin.drr_type = dmu_objset_type(os);
	if (fromds != NULL && ds->ds_dir != fromds->ds_dir)
		drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_CLONE;
	drr->drr_u.drr_begin.drr_toguid = ds->ds_phys->ds_guid;
	if (ds->ds_phys->ds_flags & DS_FLAG_CI_DATASET)
		drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_CI_DATA;

	if (fromds != NULL)
		drr->drr_u.drr_begin.drr_fromguid = fromds->ds_phys->ds_guid;
	dsl_dataset_name(ds, drr->drr_u.drr_begin.drr_toname);

	if (fromds != NULL) {
		fromtxg = fromds->ds_phys->ds_creation_txg;
		dsl_dataset_rele(fromds, tag);
		fromds = NULL;
	}

	dsp = kmem_zalloc(sizeof (dmu_sendarg_t), KM_SLEEP);

	dsp->dsa_drr = drr;
	dsp->dsa_vp = vp;
	dsp->dsa_outfd = outfd;
	dsp->dsa_proc = curproc;
	dsp->dsa_os = os;
	dsp->dsa_off = off;
	dsp->dsa_toguid = ds->ds_phys->ds_guid;
	ZIO_SET_CHECKSUM(&dsp->dsa_zc, 0, 0, 0, 0);
	dsp->dsa_pending_op = PENDING_NONE;

	mutex_enter(&ds->ds_sendstream_lock);
	list_insert_head(&ds->ds_sendstreams, dsp);
	mutex_exit(&ds->ds_sendstream_lock);

	/*
	 * Take the long hold and drop the pool before the first write, so
	 * that the unconditional cleanup at the "out" label is balanced.
	 */
	dsl_dataset_long_hold(ds, FTAG);
	dsl_pool_rele(dp, tag);

	if (dump_bytes(dsp, drr, sizeof (dmu_replay_record_t)) != 0) {
		err = dsp->dsa_err;
		goto out;
	}

	err = traverse_dataset(ds, fromtxg, TRAVERSE_PRE | TRAVERSE_PREFETCH,
	    backup_cb, dsp);

	if (dsp->dsa_pending_op != PENDING_NONE)
		if (dump_bytes(dsp, drr, sizeof (dmu_replay_record_t)) != 0)
			err = SET_ERROR(EINTR);

	if (err != 0) {
		if (err == EINTR && dsp->dsa_err != 0)
			err = dsp->dsa_err;
		goto out;
	}

	bzero(drr, sizeof (dmu_replay_record_t));
	drr->drr_type = DRR_END;
	drr->drr_u.drr_end.drr_checksum = dsp->dsa_zc;
	drr->drr_u.drr_end.drr_toguid = dsp->dsa_toguid;

	if (dump_bytes(dsp, drr, sizeof (dmu_replay_record_t)) != 0) {
		err = dsp->dsa_err;
		goto out;
	}

out:
	mutex_enter(&ds->ds_sendstream_lock);
	list_remove(&ds->ds_sendstreams, dsp);
	mutex_exit(&ds->ds_sendstream_lock);

	kmem_free(drr, sizeof (dmu_replay_record_t));
	kmem_free(dsp, sizeof (dmu_sendarg_t));

	dsl_dataset_long_rele(ds, FTAG);
	dsl_dataset_rele(ds, tag);

	return (err);
}
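/*
 * Generate a send stream to the given vnode.  dmu_send_obj() takes the
 * pool name plus the object numbers of the snapshots, for callers that
 * have already resolved names; dmu_send() below takes snapshot names.
 */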
int
dmu_send_obj(const char *pool, uint64_t tosnap, uint64_t fromsnap,
    int outfd, vnode_t *vp, offset_t *off)
{
	dsl_pool_t *dp;
	dsl_dataset_t *ds;
	dsl_dataset_t *fromds = NULL;
	int err;

	err = dsl_pool_hold(pool, FTAG, &dp);
	if (err != 0)
		return (err);

	err = dsl_dataset_hold_obj(dp, tosnap, FTAG, &ds);
	if (err != 0) {
		dsl_pool_rele(dp, FTAG);
		return (err);
	}

	if (fromsnap != 0) {
		err = dsl_dataset_hold_obj(dp, fromsnap, FTAG, &fromds);
		if (err != 0) {
			dsl_dataset_rele(ds, FTAG);
			dsl_pool_rele(dp, FTAG);
			return (err);
		}
	}

	return (dmu_send_impl(FTAG, dp, ds, fromds, outfd, vp, off));
}

int
dmu_send(const char *tosnap, const char *fromsnap,
    int outfd, vnode_t *vp, offset_t *off)
{
	dsl_pool_t *dp;
	dsl_dataset_t *ds;
	dsl_dataset_t *fromds = NULL;
	int err;

	if (strchr(tosnap, '@') == NULL)
		return (SET_ERROR(EINVAL));
	if (fromsnap != NULL && strchr(fromsnap, '@') == NULL)
		return (SET_ERROR(EINVAL));

	err = dsl_pool_hold(tosnap, FTAG, &dp);
	if (err != 0)
		return (err);

	err = dsl_dataset_hold(dp, tosnap, FTAG, &ds);
	if (err != 0) {
		dsl_pool_rele(dp, FTAG);
		return (err);
	}

	if (fromsnap != NULL) {
		err = dsl_dataset_hold(dp, fromsnap, FTAG, &fromds);
		if (err != 0) {
			dsl_dataset_rele(ds, FTAG);
			dsl_pool_rele(dp, FTAG);
			return (err);
		}
	}
	return (dmu_send_impl(FTAG, dp, ds, fromds, outfd, vp, off));
}
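/*
 * Estimate the size of the stream that dmu_send_impl() would generate
 * for this snapshot (or incremental), without actually generating it.
 * The estimate is based on the amount of uncompressed data that changed
 * between the snapshots.
 */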
int
dmu_send_estimate(dsl_dataset_t *ds, dsl_dataset_t *fromds, uint64_t *sizep)
{
	dsl_pool_t *dp = ds->ds_dir->dd_pool;
	int err;
	uint64_t size;

	ASSERT(dsl_pool_config_held(dp));

	/* tosnap must be a snapshot */
	if (!dsl_dataset_is_snapshot(ds))
		return (SET_ERROR(EINVAL));

	/*
	 * fromsnap must be an earlier snapshot from the same fs as tosnap,
	 * or the origin's fs.
	 */
	if (fromds != NULL && !dsl_dataset_is_before(ds, fromds))
		return (SET_ERROR(EXDEV));

	/* Get uncompressed size estimate of changed data. */
	if (fromds == NULL) {
		size = ds->ds_phys->ds_uncompressed_bytes;
	} else {
		uint64_t used, comp;
		err = dsl_dataset_space_written(fromds, ds,
		    &used, &comp, &size);
		if (err != 0)
			return (err);
	}

	/*
	 * Assume that space (both on-disk and in-stream) is dominated by
	 * data.  We will adjust for indirect blocks and the copies property,
	 * but ignore per-object space used (e.g., dnodes and DRR_OBJECT
	 * records).
	 */

	/*
	 * Subtract out approximate space used by indirect blocks.
	 * Assume most space is used by data blocks (non-indirect, non-dnode).
	 * Assume all blocks are recordsize.  Assume ditto blocks and
	 * internal fragmentation cancel out compression.
	 *
	 * Therefore, space used by indirect blocks is sizeof (blkptr_t) per
	 * block, which we observe in practice.  (For example, with the
	 * default 128K recordsize this is 128 bytes per block, roughly
	 * 0.1% of the data size.)
	 */
	uint64_t recordsize;
	err = dsl_prop_get_int_ds(ds, "recordsize", &recordsize);
	if (err != 0)
		return (err);
	size -= size / recordsize * sizeof (blkptr_t);

	/* Add in the space for the record associated with each block. */
	size += size / recordsize * sizeof (dmu_replay_record_t);

	*sizep = size;

	return (0);
}

typedef struct dmu_recv_begin_arg {
	const char *drba_origin;
	dmu_recv_cookie_t *drba_cookie;
	cred_t *drba_cred;
} dmu_recv_begin_arg_t;

static int
recv_begin_check_existing_impl(dmu_recv_begin_arg_t *drba, dsl_dataset_t *ds,
    uint64_t fromguid)
{
	uint64_t val;
	int error;
	dsl_pool_t *dp = ds->ds_dir->dd_pool;

	/* must not have any changes since most recent snapshot */
	if (!drba->drba_cookie->drc_force &&
	    dsl_dataset_modified_since_lastsnap(ds))
		return (SET_ERROR(ETXTBSY));

	/* temporary clone name must not exist */
	error = zap_lookup(dp->dp_meta_objset,
	    ds->ds_dir->dd_phys->dd_child_dir_zapobj, recv_clone_name,
	    8, 1, &val);
	if (error != ENOENT)
		return (error == 0 ? EBUSY : error);

	/* new snapshot name must not exist */
	error = zap_lookup(dp->dp_meta_objset,
	    ds->ds_phys->ds_snapnames_zapobj, drba->drba_cookie->drc_tosnap,
	    8, 1, &val);
	if (error != ENOENT)
		return (error == 0 ? EEXIST : error);

	if (fromguid != 0) {
		/* if incremental, most recent snapshot must match fromguid */
		if (ds->ds_prev == NULL)
			return (SET_ERROR(ENODEV));

		/*
		 * most recent snapshot must match fromguid, or there are no
		 * changes since the fromguid one
		 */
		if (ds->ds_prev->ds_phys->ds_guid != fromguid) {
			uint64_t birth = ds->ds_prev->ds_phys->ds_bp.blk_birth;
			uint64_t obj = ds->ds_prev->ds_phys->ds_prev_snap_obj;
			while (obj != 0) {
				dsl_dataset_t *snap;
				error = dsl_dataset_hold_obj(dp, obj, FTAG,
				    &snap);
				if (error != 0)
					return (SET_ERROR(ENODEV));
				if (snap->ds_phys->ds_creation_txg < birth) {
					dsl_dataset_rele(snap, FTAG);
					return (SET_ERROR(ENODEV));
				}
				if (snap->ds_phys->ds_guid == fromguid) {
					dsl_dataset_rele(snap, FTAG);
					break; /* it's ok */
				}
				obj = snap->ds_phys->ds_prev_snap_obj;
				dsl_dataset_rele(snap, FTAG);
			}
			if (obj == 0)
				return (SET_ERROR(ENODEV));
		}
	} else {
		/* if full, most recent snapshot must be $ORIGIN */
		if (ds->ds_phys->ds_prev_snap_txg >= TXG_INITIAL)
			return (SET_ERROR(ENODEV));
	}

	return (0);
}
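/*
 * Check phase of the receive-begin sync task: validate the stream's
 * DRR_BEGIN record against the state of the destination, both when the
 * target filesystem already exists (incremental) and when it does not
 * (full stream or clone).
 */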
static int
dmu_recv_begin_check(void *arg, dmu_tx_t *tx)
{
	dmu_recv_begin_arg_t *drba = arg;
	dsl_pool_t *dp = dmu_tx_pool(tx);
	struct drr_begin *drrb = drba->drba_cookie->drc_drrb;
	uint64_t fromguid = drrb->drr_fromguid;
	int flags = drrb->drr_flags;
	int error;
	dsl_dataset_t *ds;
	const char *tofs = drba->drba_cookie->drc_tofs;

	/* already checked */
	ASSERT3U(drrb->drr_magic, ==, DMU_BACKUP_MAGIC);

	if (DMU_GET_STREAM_HDRTYPE(drrb->drr_versioninfo) ==
	    DMU_COMPOUNDSTREAM ||
	    drrb->drr_type >= DMU_OST_NUMTYPES ||
	    ((flags & DRR_FLAG_CLONE) && drba->drba_origin == NULL))
		return (SET_ERROR(EINVAL));

	/* Verify pool version supports SA if SA_SPILL feature set */
	if ((DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo) &
	    DMU_BACKUP_FEATURE_SA_SPILL) &&
	    spa_version(dp->dp_spa) < SPA_VERSION_SA) {
		return (SET_ERROR(ENOTSUP));
	}

	error = dsl_dataset_hold(dp, tofs, FTAG, &ds);
	if (error == 0) {
		/* target fs already exists; recv into temp clone */

		/* Can't recv a clone into an existing fs */
		if (flags & DRR_FLAG_CLONE) {
			dsl_dataset_rele(ds, FTAG);
			return (SET_ERROR(EINVAL));
		}

		error = recv_begin_check_existing_impl(drba, ds, fromguid);
		dsl_dataset_rele(ds, FTAG);
	} else if (error == ENOENT) {
		/* target fs does not exist; must be a full backup or clone */
		char buf[MAXNAMELEN];

		/*
		 * If it's a non-clone incremental, we are missing the
		 * target fs, so fail the recv.
		 */
		if (fromguid != 0 && !(flags & DRR_FLAG_CLONE))
			return (SET_ERROR(ENOENT));

		/* Open the parent of tofs */
		ASSERT3U(strlen(tofs), <, MAXNAMELEN);
		(void) strlcpy(buf, tofs, strrchr(tofs, '/') - tofs + 1);
		error = dsl_dataset_hold(dp, buf, FTAG, &ds);
		if (error != 0)
			return (error);

		if (drba->drba_origin != NULL) {
			dsl_dataset_t *origin;
			error = dsl_dataset_hold(dp, drba->drba_origin,
			    FTAG, &origin);
			if (error != 0) {
				dsl_dataset_rele(ds, FTAG);
				return (error);
			}
			if (!dsl_dataset_is_snapshot(origin)) {
				dsl_dataset_rele(origin, FTAG);
				dsl_dataset_rele(ds, FTAG);
				return (SET_ERROR(EINVAL));
			}
			if (origin->ds_phys->ds_guid != fromguid) {
				dsl_dataset_rele(origin, FTAG);
				dsl_dataset_rele(ds, FTAG);
				return (SET_ERROR(ENODEV));
			}
			dsl_dataset_rele(origin, FTAG);
		}
		dsl_dataset_rele(ds, FTAG);
		error = 0;
	}
	return (error);
}

static void
dmu_recv_begin_sync(void *arg, dmu_tx_t *tx)
{
	dmu_recv_begin_arg_t *drba = arg;
	dsl_pool_t *dp = dmu_tx_pool(tx);
	struct drr_begin *drrb = drba->drba_cookie->drc_drrb;
	const char *tofs = drba->drba_cookie->drc_tofs;
	dsl_dataset_t *ds, *newds;
	uint64_t dsobj;
	int error;
	uint64_t crflags;

	crflags = (drrb->drr_flags & DRR_FLAG_CI_DATA) ?
	    DS_FLAG_CI_DATASET : 0;

	error = dsl_dataset_hold(dp, tofs, FTAG, &ds);
	if (error == 0) {
		/* create temporary clone */
		dsobj = dsl_dataset_create_sync(ds->ds_dir, recv_clone_name,
		    ds->ds_prev, crflags, drba->drba_cred, tx);
		dsl_dataset_rele(ds, FTAG);
	} else {
		dsl_dir_t *dd;
		const char *tail;
		dsl_dataset_t *origin = NULL;

		VERIFY0(dsl_dir_hold(dp, tofs, FTAG, &dd, &tail));

		if (drba->drba_origin != NULL) {
			VERIFY0(dsl_dataset_hold(dp, drba->drba_origin,
			    FTAG, &origin));
		}

		/* Create new dataset. */
		dsobj = dsl_dataset_create_sync(dd,
		    strrchr(tofs, '/') + 1,
		    origin, crflags, drba->drba_cred, tx);
		if (origin != NULL)
			dsl_dataset_rele(origin, FTAG);
		dsl_dir_rele(dd, FTAG);
		drba->drba_cookie->drc_newfs = B_TRUE;
	}
	VERIFY0(dsl_dataset_own_obj(dp, dsobj, dmu_recv_tag, &newds));

	dmu_buf_will_dirty(newds->ds_dbuf, tx);
	newds->ds_phys->ds_flags |= DS_FLAG_INCONSISTENT;

	/*
	 * If we actually created a non-clone, we need to create the
	 * objset in our new dataset.
	 */
	if (BP_IS_HOLE(dsl_dataset_get_blkptr(newds))) {
		(void) dmu_objset_create_impl(dp->dp_spa,
		    newds, dsl_dataset_get_blkptr(newds), drrb->drr_type, tx);
	}

	drba->drba_cookie->drc_ds = newds;

	spa_history_log_internal_ds(newds, "receive", tx, "");
}
/*
 * NB: callers *MUST* call dmu_recv_stream() if dmu_recv_begin()
 * succeeds; otherwise we will leak the holds on the datasets.
 */
int
dmu_recv_begin(char *tofs, char *tosnap, struct drr_begin *drrb,
    boolean_t force, char *origin, dmu_recv_cookie_t *drc)
{
	dmu_recv_begin_arg_t drba = { 0 };
	dmu_replay_record_t *drr;

	bzero(drc, sizeof (dmu_recv_cookie_t));
	drc->drc_drrb = drrb;
	drc->drc_tosnap = tosnap;
	drc->drc_tofs = tofs;
	drc->drc_force = force;

	if (drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC))
		drc->drc_byteswap = B_TRUE;
	else if (drrb->drr_magic != DMU_BACKUP_MAGIC)
		return (SET_ERROR(EINVAL));

	drr = kmem_zalloc(sizeof (dmu_replay_record_t), KM_SLEEP);
	drr->drr_type = DRR_BEGIN;
	drr->drr_u.drr_begin = *drc->drc_drrb;
	if (drc->drc_byteswap) {
		fletcher_4_incremental_byteswap(drr,
		    sizeof (dmu_replay_record_t), &drc->drc_cksum);
	} else {
		fletcher_4_incremental_native(drr,
		    sizeof (dmu_replay_record_t), &drc->drc_cksum);
	}
	kmem_free(drr, sizeof (dmu_replay_record_t));

	if (drc->drc_byteswap) {
		drrb->drr_magic = BSWAP_64(drrb->drr_magic);
		drrb->drr_versioninfo = BSWAP_64(drrb->drr_versioninfo);
		drrb->drr_creation_time = BSWAP_64(drrb->drr_creation_time);
		drrb->drr_type = BSWAP_32(drrb->drr_type);
		drrb->drr_toguid = BSWAP_64(drrb->drr_toguid);
		drrb->drr_fromguid = BSWAP_64(drrb->drr_fromguid);
	}

	drba.drba_origin = origin;
	drba.drba_cookie = drc;
	drba.drba_cred = CRED();

	return (dsl_sync_task(tofs, dmu_recv_begin_check, dmu_recv_begin_sync,
	    &drba, 5));
}
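/*
 * The receive ioctl drives the functions in this file in three phases,
 * roughly (error handling and unwinding elided):
 *
 *	dmu_recv_cookie_t drc;
 *	error = dmu_recv_begin(tofs, tosnap, drrb, force, origin, &drc);
 *	error = dmu_recv_stream(&drc, vp, &voff, cleanup_fd, &action_handle);
 *	error = dmu_recv_end(&drc);
 *
 * dmu_recv_begin() sets up the dataset to receive into, dmu_recv_stream()
 * replays the records from the stream, and dmu_recv_end() makes the
 * result visible (or, on failure, the cleanup paths destroy it).
 */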
struct restorearg {
	int err;
	boolean_t byteswap;
	vnode_t *vp;
	char *buf;
	uint64_t voff;
	int bufsize; /* amount of memory allocated for buf */
	zio_cksum_t cksum;
	avl_tree_t *guid_to_ds_map;
};

typedef struct guid_map_entry {
	uint64_t guid;
	dsl_dataset_t *gme_ds;
	avl_node_t avlnode;
} guid_map_entry_t;

static int
guid_compare(const void *arg1, const void *arg2)
{
	const guid_map_entry_t *gmep1 = arg1;
	const guid_map_entry_t *gmep2 = arg2;

	if (gmep1->guid < gmep2->guid)
		return (-1);
	else if (gmep1->guid > gmep2->guid)
		return (1);
	return (0);
}

static void
free_guid_map_onexit(void *arg)
{
	avl_tree_t *ca = arg;
	void *cookie = NULL;
	guid_map_entry_t *gmep;

	while ((gmep = avl_destroy_nodes(ca, &cookie)) != NULL) {
		dsl_dataset_long_rele(gmep->gme_ds, gmep);
		kmem_free(gmep, sizeof (guid_map_entry_t));
	}
	avl_destroy(ca);
	kmem_free(ca, sizeof (avl_tree_t));
}

static void *
restore_read(struct restorearg *ra, int len)
{
	void *rv;
	int done = 0;

	/* some things will require 8-byte alignment, so everything must */
	ASSERT0(len % 8);

	while (done < len) {
		ssize_t resid;

		ra->err = vn_rdwr(UIO_READ, ra->vp,
		    (caddr_t)ra->buf + done, len - done,
		    ra->voff, UIO_SYSSPACE, FAPPEND,
		    RLIM64_INFINITY, CRED(), &resid);

		if (resid == len - done)
			ra->err = SET_ERROR(EINVAL);
		ra->voff += len - done - resid;
		done = len - resid;
		if (ra->err != 0)
			return (NULL);
	}

	ASSERT3U(done, ==, len);
	rv = ra->buf;
	if (ra->byteswap)
		fletcher_4_incremental_byteswap(rv, len, &ra->cksum);
	else
		fletcher_4_incremental_native(rv, len, &ra->cksum);
	return (rv);
}

static void
backup_byteswap(dmu_replay_record_t *drr)
{
#define	DO64(X) (drr->drr_u.X = BSWAP_64(drr->drr_u.X))
#define	DO32(X) (drr->drr_u.X = BSWAP_32(drr->drr_u.X))
	drr->drr_type = BSWAP_32(drr->drr_type);
	drr->drr_payloadlen = BSWAP_32(drr->drr_payloadlen);
	switch (drr->drr_type) {
	case DRR_BEGIN:
		DO64(drr_begin.drr_magic);
		DO64(drr_begin.drr_versioninfo);
		DO64(drr_begin.drr_creation_time);
		DO32(drr_begin.drr_type);
		DO32(drr_begin.drr_flags);
		DO64(drr_begin.drr_toguid);
		DO64(drr_begin.drr_fromguid);
		break;
	case DRR_OBJECT:
		DO64(drr_object.drr_object);
		/* DO64(drr_object.drr_allocation_txg); */
		DO32(drr_object.drr_type);
		DO32(drr_object.drr_bonustype);
		DO32(drr_object.drr_blksz);
		DO32(drr_object.drr_bonuslen);
		DO64(drr_object.drr_toguid);
		break;
	case DRR_FREEOBJECTS:
		DO64(drr_freeobjects.drr_firstobj);
		DO64(drr_freeobjects.drr_numobjs);
		DO64(drr_freeobjects.drr_toguid);
		break;
	case DRR_WRITE:
		DO64(drr_write.drr_object);
		DO32(drr_write.drr_type);
		DO64(drr_write.drr_offset);
		DO64(drr_write.drr_length);
		DO64(drr_write.drr_toguid);
		DO64(drr_write.drr_key.ddk_cksum.zc_word[0]);
		DO64(drr_write.drr_key.ddk_cksum.zc_word[1]);
		DO64(drr_write.drr_key.ddk_cksum.zc_word[2]);
		DO64(drr_write.drr_key.ddk_cksum.zc_word[3]);
		DO64(drr_write.drr_key.ddk_prop);
		break;
	case DRR_WRITE_BYREF:
		DO64(drr_write_byref.drr_object);
		DO64(drr_write_byref.drr_offset);
		DO64(drr_write_byref.drr_length);
		DO64(drr_write_byref.drr_toguid);
		DO64(drr_write_byref.drr_refguid);
		DO64(drr_write_byref.drr_refobject);
		DO64(drr_write_byref.drr_refoffset);
		DO64(drr_write_byref.drr_key.ddk_cksum.zc_word[0]);
		DO64(drr_write_byref.drr_key.ddk_cksum.zc_word[1]);
		DO64(drr_write_byref.drr_key.ddk_cksum.zc_word[2]);
		DO64(drr_write_byref.drr_key.ddk_cksum.zc_word[3]);
		DO64(drr_write_byref.drr_key.ddk_prop);
		break;
	case DRR_FREE:
		DO64(drr_free.drr_object);
		DO64(drr_free.drr_offset);
		DO64(drr_free.drr_length);
		DO64(drr_free.drr_toguid);
		break;
	case DRR_SPILL:
		DO64(drr_spill.drr_object);
		DO64(drr_spill.drr_length);
		DO64(drr_spill.drr_toguid);
		break;
	case DRR_END:
		DO64(drr_end.drr_checksum.zc_word[0]);
		DO64(drr_end.drr_checksum.zc_word[1]);
		DO64(drr_end.drr_checksum.zc_word[2]);
		DO64(drr_end.drr_checksum.zc_word[3]);
		DO64(drr_end.drr_toguid);
		break;
	}
#undef DO64
#undef DO32
}
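/*
 * Process a DRR_OBJECT record: allocate the object if it is currently
 * free, or reclaim it with the new attributes if it already exists,
 * then restore the contents of its bonus buffer.
 */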
static int
restore_object(struct restorearg *ra, objset_t *os, struct drr_object *drro)
{
	int err;
	dmu_tx_t *tx;
	void *data = NULL;

	if (drro->drr_type == DMU_OT_NONE ||
	    !DMU_OT_IS_VALID(drro->drr_type) ||
	    !DMU_OT_IS_VALID(drro->drr_bonustype) ||
	    drro->drr_checksumtype >= ZIO_CHECKSUM_FUNCTIONS ||
	    drro->drr_compress >= ZIO_COMPRESS_FUNCTIONS ||
	    P2PHASE(drro->drr_blksz, SPA_MINBLOCKSIZE) ||
	    drro->drr_blksz < SPA_MINBLOCKSIZE ||
	    drro->drr_blksz > SPA_MAXBLOCKSIZE ||
	    drro->drr_bonuslen > DN_MAX_BONUSLEN) {
		return (SET_ERROR(EINVAL));
	}

	err = dmu_object_info(os, drro->drr_object, NULL);

	if (err != 0 && err != ENOENT)
		return (SET_ERROR(EINVAL));

	if (drro->drr_bonuslen) {
		data = restore_read(ra, P2ROUNDUP(drro->drr_bonuslen, 8));
		if (ra->err != 0)
			return (ra->err);
	}

	if (err == ENOENT) {
		/* currently free, want to be allocated */
		tx = dmu_tx_create(os);
		dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
		err = dmu_tx_assign(tx, TXG_WAIT);
		if (err != 0) {
			dmu_tx_abort(tx);
			return (err);
		}
		err = dmu_object_claim(os, drro->drr_object,
		    drro->drr_type, drro->drr_blksz,
		    drro->drr_bonustype, drro->drr_bonuslen, tx);
		dmu_tx_commit(tx);
	} else {
		/* currently allocated, want to be allocated */
		err = dmu_object_reclaim(os, drro->drr_object,
		    drro->drr_type, drro->drr_blksz,
		    drro->drr_bonustype, drro->drr_bonuslen);
	}
	if (err != 0) {
		return (SET_ERROR(EINVAL));
	}

	tx = dmu_tx_create(os);
	dmu_tx_hold_bonus(tx, drro->drr_object);
	err = dmu_tx_assign(tx, TXG_WAIT);
	if (err != 0) {
		dmu_tx_abort(tx);
		return (err);
	}

	dmu_object_set_checksum(os, drro->drr_object, drro->drr_checksumtype,
	    tx);
	dmu_object_set_compress(os, drro->drr_object, drro->drr_compress, tx);

	if (data != NULL) {
		dmu_buf_t *db;

		VERIFY(0 == dmu_bonus_hold(os, drro->drr_object, FTAG, &db));
		dmu_buf_will_dirty(db, tx);

		ASSERT3U(db->db_size, >=, drro->drr_bonuslen);
		bcopy(data, db->db_data, drro->drr_bonuslen);
		if (ra->byteswap) {
			dmu_object_byteswap_t byteswap =
			    DMU_OT_BYTESWAP(drro->drr_bonustype);
			dmu_ot_byteswap[byteswap].ob_func(db->db_data,
			    drro->drr_bonuslen);
		}
		dmu_buf_rele(db, FTAG);
	}
	dmu_tx_commit(tx);
	return (0);
}

/* ARGSUSED */
static int
restore_freeobjects(struct restorearg *ra, objset_t *os,
    struct drr_freeobjects *drrfo)
{
	uint64_t obj;

	if (drrfo->drr_firstobj + drrfo->drr_numobjs < drrfo->drr_firstobj)
		return (SET_ERROR(EINVAL));

	for (obj = drrfo->drr_firstobj;
	    obj < drrfo->drr_firstobj + drrfo->drr_numobjs;
	    (void) dmu_object_next(os, &obj, FALSE, 0)) {
		int err;

		if (dmu_object_info(os, obj, NULL) != 0)
			continue;

		err = dmu_free_object(os, obj);
		if (err != 0)
			return (err);
	}
	return (0);
}
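/*
 * Process a DRR_WRITE record: read the payload from the stream and
 * write it to the object at the given offset.
 */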
static int
restore_write(struct restorearg *ra, objset_t *os,
    struct drr_write *drrw)
{
	dmu_tx_t *tx;
	void *data;
	int err;

	if (drrw->drr_offset + drrw->drr_length < drrw->drr_offset ||
	    !DMU_OT_IS_VALID(drrw->drr_type))
		return (SET_ERROR(EINVAL));

	data = restore_read(ra, drrw->drr_length);
	if (data == NULL)
		return (ra->err);

	if (dmu_object_info(os, drrw->drr_object, NULL) != 0)
		return (SET_ERROR(EINVAL));

	tx = dmu_tx_create(os);

	dmu_tx_hold_write(tx, drrw->drr_object,
	    drrw->drr_offset, drrw->drr_length);
	err = dmu_tx_assign(tx, TXG_WAIT);
	if (err != 0) {
		dmu_tx_abort(tx);
		return (err);
	}
	if (ra->byteswap) {
		dmu_object_byteswap_t byteswap =
		    DMU_OT_BYTESWAP(drrw->drr_type);
		dmu_ot_byteswap[byteswap].ob_func(data, drrw->drr_length);
	}
	dmu_write(os, drrw->drr_object,
	    drrw->drr_offset, drrw->drr_length, data, tx);
	dmu_tx_commit(tx);
	return (0);
}

/*
 * Handle a DRR_WRITE_BYREF record.  This record is used in dedup'ed
 * streams to refer to a copy of the data that is already on the
 * system because it came in earlier in the stream.  This function
 * finds the earlier copy of the data, and uses that copy instead of
 * data from the stream to fulfill this write.
 */
static int
restore_write_byref(struct restorearg *ra, objset_t *os,
    struct drr_write_byref *drrwbr)
{
	dmu_tx_t *tx;
	int err;
	guid_map_entry_t gmesrch;
	guid_map_entry_t *gmep;
	avl_index_t where;
	objset_t *ref_os = NULL;
	dmu_buf_t *dbp;

	if (drrwbr->drr_offset + drrwbr->drr_length < drrwbr->drr_offset)
		return (SET_ERROR(EINVAL));

	/*
	 * If the GUID of the referenced dataset is different from the
	 * GUID of the target dataset, find the referenced dataset.
	 */
	if (drrwbr->drr_toguid != drrwbr->drr_refguid) {
		gmesrch.guid = drrwbr->drr_refguid;
		if ((gmep = avl_find(ra->guid_to_ds_map, &gmesrch,
		    &where)) == NULL) {
			return (SET_ERROR(EINVAL));
		}
		if (dmu_objset_from_ds(gmep->gme_ds, &ref_os))
			return (SET_ERROR(EINVAL));
	} else {
		ref_os = os;
	}

	if (err = dmu_buf_hold(ref_os, drrwbr->drr_refobject,
	    drrwbr->drr_refoffset, FTAG, &dbp, DMU_READ_PREFETCH))
		return (err);

	tx = dmu_tx_create(os);

	dmu_tx_hold_write(tx, drrwbr->drr_object,
	    drrwbr->drr_offset, drrwbr->drr_length);
	err = dmu_tx_assign(tx, TXG_WAIT);
	if (err != 0) {
		dmu_tx_abort(tx);
		return (err);
	}
	dmu_write(os, drrwbr->drr_object,
	    drrwbr->drr_offset, drrwbr->drr_length, dbp->db_data, tx);
	dmu_buf_rele(dbp, FTAG);
	dmu_tx_commit(tx);
	return (0);
}
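/*
 * Process a DRR_SPILL record: copy the payload into the object's spill
 * block, growing the spill block first if the incoming data is larger
 * than the existing one.
 */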
static int
restore_spill(struct restorearg *ra, objset_t *os, struct drr_spill *drrs)
{
	dmu_tx_t *tx;
	void *data;
	dmu_buf_t *db, *db_spill;
	int err;

	if (drrs->drr_length < SPA_MINBLOCKSIZE ||
	    drrs->drr_length > SPA_MAXBLOCKSIZE)
		return (SET_ERROR(EINVAL));

	data = restore_read(ra, drrs->drr_length);
	if (data == NULL)
		return (ra->err);

	if (dmu_object_info(os, drrs->drr_object, NULL) != 0)
		return (SET_ERROR(EINVAL));

	VERIFY(0 == dmu_bonus_hold(os, drrs->drr_object, FTAG, &db));
	if ((err = dmu_spill_hold_by_bonus(db, FTAG, &db_spill)) != 0) {
		dmu_buf_rele(db, FTAG);
		return (err);
	}

	tx = dmu_tx_create(os);

	dmu_tx_hold_spill(tx, db->db_object);

	err = dmu_tx_assign(tx, TXG_WAIT);
	if (err != 0) {
		dmu_buf_rele(db, FTAG);
		dmu_buf_rele(db_spill, FTAG);
		dmu_tx_abort(tx);
		return (err);
	}
	dmu_buf_will_dirty(db_spill, tx);

	if (db_spill->db_size < drrs->drr_length)
		VERIFY(0 == dbuf_spill_set_blksz(db_spill,
		    drrs->drr_length, tx));
	bcopy(data, db_spill->db_data, drrs->drr_length);

	dmu_buf_rele(db, FTAG);
	dmu_buf_rele(db_spill, FTAG);

	dmu_tx_commit(tx);
	return (0);
}

/* ARGSUSED */
static int
restore_free(struct restorearg *ra, objset_t *os,
    struct drr_free *drrf)
{
	int err;

	if (drrf->drr_length != -1ULL &&
	    drrf->drr_offset + drrf->drr_length < drrf->drr_offset)
		return (SET_ERROR(EINVAL));

	if (dmu_object_info(os, drrf->drr_object, NULL) != 0)
		return (SET_ERROR(EINVAL));

	err = dmu_free_long_range(os, drrf->drr_object,
	    drrf->drr_offset, drrf->drr_length);
	return (err);
}

/* used to destroy the drc_ds on error */
static void
dmu_recv_cleanup_ds(dmu_recv_cookie_t *drc)
{
	char name[MAXNAMELEN];
	dsl_dataset_name(drc->drc_ds, name);
	dsl_dataset_disown(drc->drc_ds, dmu_recv_tag);
	(void) dsl_destroy_head(name);
}

/*
 * NB: callers *must* call dmu_recv_end() if this succeeds.
 */
int
dmu_recv_stream(dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp,
    int cleanup_fd, uint64_t *action_handlep)
{
	struct restorearg ra = { 0 };
	dmu_replay_record_t *drr;
	objset_t *os;
	zio_cksum_t pcksum;
	int featureflags;

	ra.byteswap = drc->drc_byteswap;
	ra.cksum = drc->drc_cksum;
	ra.vp = vp;
	ra.voff = *voffp;
	ra.bufsize = 1<<20;
	ra.buf = kmem_alloc(ra.bufsize, KM_SLEEP);

	/* these were verified in dmu_recv_begin */
	ASSERT3U(DMU_GET_STREAM_HDRTYPE(drc->drc_drrb->drr_versioninfo), ==,
	    DMU_SUBSTREAM);
	ASSERT3U(drc->drc_drrb->drr_type, <, DMU_OST_NUMTYPES);

	/*
	 * Open the objset we are modifying.
	 */
	VERIFY0(dmu_objset_from_ds(drc->drc_ds, &os));

	ASSERT(drc->drc_ds->ds_phys->ds_flags & DS_FLAG_INCONSISTENT);

	featureflags = DMU_GET_FEATUREFLAGS(drc->drc_drrb->drr_versioninfo);

	/* if this stream is dedup'ed, set up the avl tree for guid mapping */
	if (featureflags & DMU_BACKUP_FEATURE_DEDUP) {
		minor_t minor;

		if (cleanup_fd == -1) {
			ra.err = SET_ERROR(EBADF);
			goto out;
		}
		ra.err = zfs_onexit_fd_hold(cleanup_fd, &minor);
		if (ra.err != 0) {
			cleanup_fd = -1;
			goto out;
		}

		if (*action_handlep == 0) {
			ra.guid_to_ds_map =
			    kmem_alloc(sizeof (avl_tree_t), KM_SLEEP);
			avl_create(ra.guid_to_ds_map, guid_compare,
			    sizeof (guid_map_entry_t),
			    offsetof(guid_map_entry_t, avlnode));
			ra.err = zfs_onexit_add_cb(minor,
			    free_guid_map_onexit, ra.guid_to_ds_map,
			    action_handlep);
			if (ra.err != 0)
				goto out;
		} else {
			ra.err = zfs_onexit_cb_data(minor, *action_handlep,
			    (void **)&ra.guid_to_ds_map);
			if (ra.err != 0)
				goto out;
		}

		drc->drc_guid_to_ds_map = ra.guid_to_ds_map;
	}

	/*
	 * Read records and process them.
	 */
	pcksum = ra.cksum;
	while (ra.err == 0 &&
	    NULL != (drr = restore_read(&ra, sizeof (*drr)))) {
		if (issig(JUSTLOOKING) && issig(FORREAL)) {
			ra.err = SET_ERROR(EINTR);
			goto out;
		}

		if (ra.byteswap)
			backup_byteswap(drr);

		switch (drr->drr_type) {
		case DRR_OBJECT:
		{
			/*
			 * We need to make a copy of the record header,
			 * because restore_{object,write} may need to
			 * restore_read(), which will invalidate drr.
			 */
			struct drr_object drro = drr->drr_u.drr_object;
			ra.err = restore_object(&ra, os, &drro);
			break;
		}
		case DRR_FREEOBJECTS:
		{
			struct drr_freeobjects drrfo =
			    drr->drr_u.drr_freeobjects;
			ra.err = restore_freeobjects(&ra, os, &drrfo);
			break;
		}
		case DRR_WRITE:
		{
			struct drr_write drrw = drr->drr_u.drr_write;
			ra.err = restore_write(&ra, os, &drrw);
			break;
		}
		case DRR_WRITE_BYREF:
		{
			struct drr_write_byref drrwbr =
			    drr->drr_u.drr_write_byref;
			ra.err = restore_write_byref(&ra, os, &drrwbr);
			break;
		}
		case DRR_FREE:
		{
			struct drr_free drrf = drr->drr_u.drr_free;
			ra.err = restore_free(&ra, os, &drrf);
			break;
		}
		case DRR_END:
		{
			struct drr_end drre = drr->drr_u.drr_end;
			/*
			 * We compare against the *previous* checksum
			 * value, because the stored checksum is of
			 * everything before the DRR_END record.
			 */
			if (!ZIO_CHECKSUM_EQUAL(drre.drr_checksum, pcksum))
				ra.err = SET_ERROR(ECKSUM);
			goto out;
		}
		case DRR_SPILL:
		{
			struct drr_spill drrs = drr->drr_u.drr_spill;
			ra.err = restore_spill(&ra, os, &drrs);
			break;
		}
		default:
			ra.err = SET_ERROR(EINVAL);
			goto out;
		}
		pcksum = ra.cksum;
	}
	ASSERT(ra.err != 0);

out:
	if ((featureflags & DMU_BACKUP_FEATURE_DEDUP) && (cleanup_fd != -1))
		zfs_onexit_fd_rele(cleanup_fd);

	if (ra.err != 0) {
		/*
		 * destroy what we created, so we don't leave it in the
		 * inconsistent restoring state.
		 */
		dmu_recv_cleanup_ds(drc);
	}

	kmem_free(ra.buf, ra.bufsize);
	*voffp = ra.voff;
	return (ra.err);
}
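/*
 * The receive-end sync task.  For a receive into an existing filesystem,
 * the temporary %recv clone is swapped with the original filesystem head
 * and then destroyed; for a receive that created a new filesystem, we
 * simply snapshot what was received.
 */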
static int
dmu_recv_end_check(void *arg, dmu_tx_t *tx)
{
	dmu_recv_cookie_t *drc = arg;
	dsl_pool_t *dp = dmu_tx_pool(tx);
	int error;

	ASSERT3P(drc->drc_ds->ds_owner, ==, dmu_recv_tag);

	if (!drc->drc_newfs) {
		dsl_dataset_t *origin_head;

		error = dsl_dataset_hold(dp, drc->drc_tofs, FTAG,
		    &origin_head);
		if (error != 0)
			return (error);
		error = dsl_dataset_clone_swap_check_impl(drc->drc_ds,
		    origin_head, drc->drc_force);
		if (error != 0) {
			dsl_dataset_rele(origin_head, FTAG);
			return (error);
		}
		error = dsl_dataset_snapshot_check_impl(origin_head,
		    drc->drc_tosnap, tx);
		dsl_dataset_rele(origin_head, FTAG);
		if (error != 0)
			return (error);

		error = dsl_destroy_head_check_impl(drc->drc_ds, 1);
	} else {
		error = dsl_dataset_snapshot_check_impl(drc->drc_ds,
		    drc->drc_tosnap, tx);
	}
	return (error);
}

static void
dmu_recv_end_sync(void *arg, dmu_tx_t *tx)
{
	dmu_recv_cookie_t *drc = arg;
	dsl_pool_t *dp = dmu_tx_pool(tx);

	spa_history_log_internal_ds(drc->drc_ds, "finish receiving",
	    tx, "snap=%s", drc->drc_tosnap);

	if (!drc->drc_newfs) {
		dsl_dataset_t *origin_head;

		VERIFY0(dsl_dataset_hold(dp, drc->drc_tofs, FTAG,
		    &origin_head));
		dsl_dataset_clone_swap_sync_impl(drc->drc_ds,
		    origin_head, tx);
		dsl_dataset_snapshot_sync_impl(origin_head,
		    drc->drc_tosnap, tx);

		/* set snapshot's creation time and guid */
		dmu_buf_will_dirty(origin_head->ds_prev->ds_dbuf, tx);
		origin_head->ds_prev->ds_phys->ds_creation_time =
		    drc->drc_drrb->drr_creation_time;
		origin_head->ds_prev->ds_phys->ds_guid =
		    drc->drc_drrb->drr_toguid;
		origin_head->ds_prev->ds_phys->ds_flags &=
		    ~DS_FLAG_INCONSISTENT;

		dmu_buf_will_dirty(origin_head->ds_dbuf, tx);
		origin_head->ds_phys->ds_flags &= ~DS_FLAG_INCONSISTENT;

		dsl_dataset_rele(origin_head, FTAG);
		dsl_destroy_head_sync_impl(drc->drc_ds, tx);
	} else {
		dsl_dataset_t *ds = drc->drc_ds;

		dsl_dataset_snapshot_sync_impl(ds, drc->drc_tosnap, tx);

		/* set snapshot's creation time and guid */
		dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
		ds->ds_prev->ds_phys->ds_creation_time =
		    drc->drc_drrb->drr_creation_time;
		ds->ds_prev->ds_phys->ds_guid = drc->drc_drrb->drr_toguid;
		ds->ds_prev->ds_phys->ds_flags &= ~DS_FLAG_INCONSISTENT;

		dmu_buf_will_dirty(ds->ds_dbuf, tx);
		ds->ds_phys->ds_flags &= ~DS_FLAG_INCONSISTENT;
	}
	drc->drc_newsnapobj = drc->drc_ds->ds_phys->ds_prev_snap_obj;
	/*
	 * Release the hold from dmu_recv_begin.  This must be done before
	 * we return to open context, so that when we free the dataset's dnode,
	 * we can evict its bonus buffer.
	 */
	dsl_dataset_disown(drc->drc_ds, dmu_recv_tag);
	drc->drc_ds = NULL;
}

static int
add_ds_to_guidmap(const char *name, avl_tree_t *guid_map, uint64_t snapobj)
{
	dsl_pool_t *dp;
	dsl_dataset_t *snapds;
	guid_map_entry_t *gmep;
	int err;

	ASSERT(guid_map != NULL);

	err = dsl_pool_hold(name, FTAG, &dp);
	if (err != 0)
		return (err);
	err = dsl_dataset_hold_obj(dp, snapobj, FTAG, &snapds);
	if (err == 0) {
		gmep = kmem_alloc(sizeof (guid_map_entry_t), KM_SLEEP);
		gmep->guid = snapds->ds_phys->ds_guid;
		gmep->gme_ds = snapds;
		avl_add(guid_map, gmep);
		dsl_dataset_long_hold(snapds, gmep);
		dsl_dataset_rele(snapds, FTAG);
	}

	dsl_pool_rele(dp, FTAG);
	return (err);
}

static int dmu_recv_end_modified_blocks = 3;

static int
dmu_recv_existing_end(dmu_recv_cookie_t *drc)
{
	int error;
	char name[MAXNAMELEN];

#ifdef _KERNEL
	/*
	 * We will be destroying the ds; make sure its origin is unmounted if
	 * necessary.
	 */
	dsl_dataset_name(drc->drc_ds, name);
	zfs_destroy_unmount_origin(name);
#endif

	error = dsl_sync_task(drc->drc_tofs,
	    dmu_recv_end_check, dmu_recv_end_sync, drc,
	    dmu_recv_end_modified_blocks);

	if (error != 0)
		dmu_recv_cleanup_ds(drc);
	return (error);
}

static int
dmu_recv_new_end(dmu_recv_cookie_t *drc)
{
	int error;

	error = dsl_sync_task(drc->drc_tofs,
	    dmu_recv_end_check, dmu_recv_end_sync, drc,
	    dmu_recv_end_modified_blocks);

	if (error != 0) {
		dmu_recv_cleanup_ds(drc);
	} else if (drc->drc_guid_to_ds_map != NULL) {
		(void) add_ds_to_guidmap(drc->drc_tofs,
		    drc->drc_guid_to_ds_map,
		    drc->drc_newsnapobj);
	}
	return (error);
}

int
dmu_recv_end(dmu_recv_cookie_t *drc)
{
	if (drc->drc_newfs)
		return (dmu_recv_new_end(drc));
	else
		return (dmu_recv_existing_end(drc));
}