/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

#include <sys/dmu.h>
#include <sys/dmu_impl.h>
#include <sys/dmu_tx.h>
#include <sys/dbuf.h>
#include <sys/dnode.h>
#include <sys/zfs_context.h>
#include <sys/dmu_objset.h>
#include <sys/dmu_traverse.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_dir.h>
#include <sys/dsl_pool.h>
#include <sys/dsl_synctask.h>
#include <sys/dmu_zfetch.h>
#include <sys/zfs_ioctl.h>
#include <sys/zap.h>
#include <sys/zio_checksum.h>

const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = {
    { byteswap_uint8_array,  TRUE,  "unallocated" },
    { zap_byteswap,          TRUE,  "object directory" },
    { byteswap_uint64_array, TRUE,  "object array" },
    { byteswap_uint8_array,  TRUE,  "packed nvlist" },
    { byteswap_uint64_array, TRUE,  "packed nvlist size" },
    { byteswap_uint64_array, TRUE,  "bplist" },
    { byteswap_uint64_array, TRUE,  "bplist header" },
    { byteswap_uint64_array, TRUE,  "SPA space map header" },
    { byteswap_uint64_array, TRUE,  "SPA space map" },
    { byteswap_uint64_array, TRUE,  "ZIL intent log" },
    { dnode_buf_byteswap,    TRUE,  "DMU dnode" },
    { dmu_objset_byteswap,   TRUE,  "DMU objset" },
    { byteswap_uint64_array, TRUE,  "DSL directory" },
    { zap_byteswap,          TRUE,  "DSL directory child map" },
    { zap_byteswap,          TRUE,  "DSL dataset snap map" },
    { zap_byteswap,          TRUE,  "DSL props" },
    { byteswap_uint64_array, TRUE,  "DSL dataset" },
    { zfs_znode_byteswap,    TRUE,  "ZFS znode" },
    { zfs_acl_byteswap,      TRUE,  "ZFS ACL" },
    { byteswap_uint8_array,  FALSE, "ZFS plain file" },
    { zap_byteswap,          TRUE,  "ZFS directory" },
    { zap_byteswap,          TRUE,  "ZFS master node" },
    { zap_byteswap,          TRUE,  "ZFS delete queue" },
    { byteswap_uint8_array,  FALSE, "zvol object" },
    { zap_byteswap,          TRUE,  "zvol prop" },
    { byteswap_uint8_array,  FALSE, "other uint8[]" },
    { byteswap_uint64_array, FALSE, "other uint64[]" },
    { zap_byteswap,          TRUE,  "other ZAP" },
    { zap_byteswap,          TRUE,  "persistent error log" },
};

int
dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset,
    void *tag, dmu_buf_t **dbp)
{
    dnode_t *dn;
    uint64_t blkid;
    dmu_buf_impl_t *db;
    int err;

    err = dnode_hold(os->os, object, FTAG, &dn);
    if (err)
        return (err);
    blkid = dbuf_whichblock(dn, offset);
    rw_enter(&dn->dn_struct_rwlock, RW_READER);
    db = dbuf_hold(dn, blkid, tag);
    rw_exit(&dn->dn_struct_rwlock);
    if (db == NULL) {
        err = EIO;
    } else {
        err = dbuf_read(db, NULL, DB_RF_CANFAIL);
        if (err) {
            dbuf_rele(db, tag);
            db = NULL;
        }
    }

    dnode_rele(dn, FTAG);
    *dbp = &db->db;
    return (err);
}

int
dmu_bonus_max(void)
{
    return (DN_MAX_BONUSLEN);
}

/*
 * returns ENOENT, EIO, or 0.
 */
int
dmu_bonus_hold(objset_t *os, uint64_t object, void *tag, dmu_buf_t **dbp)
{
    dnode_t *dn;
    int err, count;
    dmu_buf_impl_t *db;

    err = dnode_hold(os->os, object, FTAG, &dn);
    if (err)
        return (err);

    rw_enter(&dn->dn_struct_rwlock, RW_READER);
    if (dn->dn_bonus == NULL) {
        rw_exit(&dn->dn_struct_rwlock);
        rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
        if (dn->dn_bonus == NULL)
            dn->dn_bonus = dbuf_create_bonus(dn);
    }
    db = dn->dn_bonus;
    rw_exit(&dn->dn_struct_rwlock);
    mutex_enter(&db->db_mtx);
    count = refcount_add(&db->db_holds, tag);
    mutex_exit(&db->db_mtx);
    if (count == 1)
        dnode_add_ref(dn, db);
    dnode_rele(dn, FTAG);

    VERIFY(0 == dbuf_read(db, NULL, DB_RF_MUST_SUCCEED));

    *dbp = &db->db;
    return (0);
}

int
dmu_buf_hold_array(objset_t *os, uint64_t object, uint64_t offset,
    uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp)
{
    dnode_t *dn;
    dmu_buf_t **dbp;
    uint64_t blkid, nblks, i;
    uint32_t flags;
    int err;
    zio_t *zio;

    ASSERT(length <= DMU_MAX_ACCESS);

    if (length == 0) {
        if (numbufsp)
            *numbufsp = 0;
        *dbpp = NULL;
        return (0);
    }

    flags = DB_RF_CANFAIL | DB_RF_NEVERWAIT;
    if (length > zfetch_array_rd_sz)
        flags |= DB_RF_NOPREFETCH;

    err = dnode_hold(os->os, object, FTAG, &dn);
    if (err)
        return (err);

    rw_enter(&dn->dn_struct_rwlock, RW_READER);
    if (dn->dn_datablkshift) {
        int blkshift = dn->dn_datablkshift;
        nblks = (P2ROUNDUP(offset+length, 1ULL<<blkshift) -
            P2ALIGN(offset, 1ULL<<blkshift)) >> blkshift;
    } else {
        ASSERT3U(offset + length, <=, dn->dn_datablksz);
        nblks = 1;
    }
    dbp = kmem_zalloc(sizeof (dmu_buf_t *) * nblks, KM_SLEEP);

    zio = zio_root(dn->dn_objset->os_spa, NULL, NULL, TRUE);
    blkid = dbuf_whichblock(dn, offset);
    for (i = 0; i < nblks; i++) {
        dmu_buf_impl_t *db = dbuf_hold(dn, blkid+i, tag);
        if (db == NULL) {
            rw_exit(&dn->dn_struct_rwlock);
            dmu_buf_rele_array(dbp, nblks, tag);
            dnode_rele(dn, FTAG);
            zio_nowait(zio);
            return (EIO);
        }
        /* initiate async i/o */
        if (read && db->db_state == DB_UNCACHED) {
            rw_exit(&dn->dn_struct_rwlock);
            (void) dbuf_read(db, zio, flags);
            rw_enter(&dn->dn_struct_rwlock, RW_READER);
        }
        dbp[i] = &db->db;
    }
    rw_exit(&dn->dn_struct_rwlock);
    dnode_rele(dn, FTAG);

    /* wait for async i/o */
    err = zio_wait(zio);
    if (err) {
        dmu_buf_rele_array(dbp, nblks, tag);
        return (err);
    }

    /* wait for other io to complete */
    if (read) {
        for (i = 0; i < nblks; i++) {
            dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbp[i];
            mutex_enter(&db->db_mtx);
            while (db->db_state == DB_READ ||
                db->db_state == DB_FILL)
                cv_wait(&db->db_changed, &db->db_mtx);
            if (db->db_state == DB_UNCACHED)
                err = EIO;
            mutex_exit(&db->db_mtx);
            if (err) {
                dmu_buf_rele_array(dbp, nblks, tag);
                return (err);
            }
        }
    }

    *numbufsp = nblks;
    *dbpp = dbp;
    return (0);
}

void
dmu_buf_rele_array(dmu_buf_t **dbp_fake, int numbufs, void *tag)
{
    int i;
    dmu_buf_impl_t **dbp = (dmu_buf_impl_t **)dbp_fake;

    if (numbufs == 0)
        return;
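
    /* drop the hold on each buffer, then free the pointer array itself */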
    for (i = 0; i < numbufs; i++) {
        if (dbp[i])
            dbuf_rele(dbp[i], tag);
    }

    kmem_free(dbp, sizeof (dmu_buf_t *) * numbufs);
}

void
dmu_prefetch(objset_t *os, uint64_t object, uint64_t offset, uint64_t len)
{
    dnode_t *dn;
    uint64_t blkid;
    int nblks, i, err;

    if (len == 0) { /* they're interested in the bonus buffer */
        dn = os->os->os_meta_dnode;

        if (object == 0 || object >= DN_MAX_OBJECT)
            return;

        rw_enter(&dn->dn_struct_rwlock, RW_READER);
        blkid = dbuf_whichblock(dn, object * sizeof (dnode_phys_t));
        dbuf_prefetch(dn, blkid);
        rw_exit(&dn->dn_struct_rwlock);
        return;
    }

    /*
     * XXX - Note, if the dnode for the requested object is not
     * already cached, we will do a *synchronous* read in the
     * dnode_hold() call.  The same is true for any indirects.
     */
    err = dnode_hold(os->os, object, FTAG, &dn);
    if (err != 0)
        return;

    rw_enter(&dn->dn_struct_rwlock, RW_READER);
    if (dn->dn_datablkshift) {
        int blkshift = dn->dn_datablkshift;
        nblks = (P2ROUNDUP(offset+len, 1<<blkshift) -
            P2ALIGN(offset, 1<<blkshift)) >> blkshift;
    } else {
        nblks = (offset < dn->dn_datablksz);
    }

    if (nblks != 0) {
        blkid = dbuf_whichblock(dn, offset);
        for (i = 0; i < nblks; i++)
            dbuf_prefetch(dn, blkid+i);
    }

    rw_exit(&dn->dn_struct_rwlock);

    dnode_rele(dn, FTAG);
}

int
dmu_free_range(objset_t *os, uint64_t object, uint64_t offset,
    uint64_t size, dmu_tx_t *tx)
{
    dnode_t *dn;
    int err = dnode_hold(os->os, object, FTAG, &dn);
    if (err)
        return (err);
    ASSERT(offset < UINT64_MAX);
    ASSERT(size == -1ULL || size <= UINT64_MAX - offset);
    dnode_free_range(dn, offset, size, tx);
    dnode_rele(dn, FTAG);
    return (0);
}

int
dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
    void *buf)
{
    dnode_t *dn;
    dmu_buf_t **dbp;
    int numbufs, i, err;

    /*
     * Deal with odd block sizes, where there can't be data past the
     * first block.
     */
    err = dnode_hold(os->os, object, FTAG, &dn);
    if (err)
        return (err);
    if (dn->dn_datablkshift == 0) {
        int newsz = offset > dn->dn_datablksz ? 0 :
            MIN(size, dn->dn_datablksz - offset);
        bzero((char *)buf + newsz, size - newsz);
        size = newsz;
    }
    dnode_rele(dn, FTAG);

    while (size > 0) {
        uint64_t mylen = MIN(size, DMU_MAX_ACCESS / 2);
        int err;

        /*
         * NB: we could do this block-at-a-time, but it's nice
         * to be reading in parallel.
         */
        err = dmu_buf_hold_array(os, object, offset, mylen,
            TRUE, FTAG, &numbufs, &dbp);
        if (err)
            return (err);

        for (i = 0; i < numbufs; i++) {
            int tocpy;
            int bufoff;
            dmu_buf_t *db = dbp[i];

            ASSERT(size > 0);

            bufoff = offset - db->db_offset;
            tocpy = (int)MIN(db->db_size - bufoff, size);

            bcopy((char *)db->db_data + bufoff, buf, tocpy);

            offset += tocpy;
            size -= tocpy;
            buf = (char *)buf + tocpy;
        }
        dmu_buf_rele_array(dbp, numbufs, FTAG);
    }
    return (0);
}

void
dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
    const void *buf, dmu_tx_t *tx)
{
    dmu_buf_t **dbp;
    int numbufs, i;

    VERIFY(0 == dmu_buf_hold_array(os, object, offset, size,
        FALSE, FTAG, &numbufs, &dbp));

    for (i = 0; i < numbufs; i++) {
        int tocpy;
        int bufoff;
        dmu_buf_t *db = dbp[i];

        ASSERT(size > 0);

        bufoff = offset - db->db_offset;
        tocpy = (int)MIN(db->db_size - bufoff, size);

        ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size);

        if (tocpy == db->db_size)
            dmu_buf_will_fill(db, tx);
        else
            dmu_buf_will_dirty(db, tx);

        bcopy(buf, (char *)db->db_data + bufoff, tocpy);

        if (tocpy == db->db_size)
            dmu_buf_fill_done(db, tx);

        offset += tocpy;
        size -= tocpy;
        buf = (char *)buf + tocpy;
    }
    dmu_buf_rele_array(dbp, numbufs, FTAG);
}

#ifdef _KERNEL
int
dmu_write_uio(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
    uio_t *uio, dmu_tx_t *tx)
{
    dmu_buf_t **dbp;
    int numbufs, i;
    int err = 0;

    err = dmu_buf_hold_array(os, object, offset, size,
        FALSE, FTAG, &numbufs, &dbp);
    if (err)
        return (err);

    for (i = 0; i < numbufs; i++) {
        int tocpy;
        int bufoff;
        dmu_buf_t *db = dbp[i];

        ASSERT(size > 0);

        bufoff = offset - db->db_offset;
        tocpy = (int)MIN(db->db_size - bufoff, size);

        ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size);

        if (tocpy == db->db_size)
            dmu_buf_will_fill(db, tx);
        else
            dmu_buf_will_dirty(db, tx);

        /*
         * XXX uiomove could block forever (eg. nfs-backed
         * pages).  There needs to be a uiolockdown() function
         * to lock the pages in memory, so that uiomove won't
         * block.
         */
        err = uiomove((char *)db->db_data + bufoff, tocpy,
            UIO_WRITE, uio);

        if (tocpy == db->db_size)
            dmu_buf_fill_done(db, tx);

        if (err)
            break;

        offset += tocpy;
        size -= tocpy;
    }
    dmu_buf_rele_array(dbp, numbufs, FTAG);
    return (err);
}
#endif

/*
 * XXX move send/recv stuff to its own new file!
 */
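
/*
 * State carried through the backup (send) traversal callbacks: the scratch
 * replay record, the destination vnode, the snapshot's objset, the running
 * stream checksum, and the most recent write error from dump_bytes().
 */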
struct backuparg {
    dmu_replay_record_t *drr;
    vnode_t *vp;
    objset_t *os;
    zio_cksum_t zc;
    int err;
};

static int
dump_bytes(struct backuparg *ba, void *buf, int len)
{
    ssize_t resid; /* have to get resid to get detailed errno */
    ASSERT3U(len % 8, ==, 0);

    fletcher_4_incremental_native(buf, len, &ba->zc);
    ba->err = vn_rdwr(UIO_WRITE, ba->vp,
        (caddr_t)buf, len,
        0, UIO_SYSSPACE, FAPPEND, RLIM64_INFINITY, CRED(), &resid);
    return (ba->err);
}

static int
dump_free(struct backuparg *ba, uint64_t object, uint64_t offset,
    uint64_t length)
{
    /* write a FREE record */
    bzero(ba->drr, sizeof (dmu_replay_record_t));
    ba->drr->drr_type = DRR_FREE;
    ba->drr->drr_u.drr_free.drr_object = object;
    ba->drr->drr_u.drr_free.drr_offset = offset;
    ba->drr->drr_u.drr_free.drr_length = length;

    if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)))
        return (EINTR);
    return (0);
}

static int
dump_data(struct backuparg *ba, dmu_object_type_t type,
    uint64_t object, uint64_t offset, int blksz, void *data)
{
    /* write a DATA record */
    bzero(ba->drr, sizeof (dmu_replay_record_t));
    ba->drr->drr_type = DRR_WRITE;
    ba->drr->drr_u.drr_write.drr_object = object;
    ba->drr->drr_u.drr_write.drr_type = type;
    ba->drr->drr_u.drr_write.drr_offset = offset;
    ba->drr->drr_u.drr_write.drr_length = blksz;

    if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)))
        return (EINTR);
    if (dump_bytes(ba, data, blksz))
        return (EINTR);
    return (0);
}

static int
dump_freeobjects(struct backuparg *ba, uint64_t firstobj, uint64_t numobjs)
{
    /* write a FREEOBJECTS record */
    bzero(ba->drr, sizeof (dmu_replay_record_t));
    ba->drr->drr_type = DRR_FREEOBJECTS;
    ba->drr->drr_u.drr_freeobjects.drr_firstobj = firstobj;
    ba->drr->drr_u.drr_freeobjects.drr_numobjs = numobjs;

    if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)))
        return (EINTR);
    return (0);
}

static int
dump_dnode(struct backuparg *ba, uint64_t object, dnode_phys_t *dnp)
{
    if (dnp == NULL || dnp->dn_type == DMU_OT_NONE)
        return (dump_freeobjects(ba, object, 1));

    /* write an OBJECT record */
    bzero(ba->drr, sizeof (dmu_replay_record_t));
    ba->drr->drr_type = DRR_OBJECT;
    ba->drr->drr_u.drr_object.drr_object = object;
    ba->drr->drr_u.drr_object.drr_type = dnp->dn_type;
    ba->drr->drr_u.drr_object.drr_bonustype = dnp->dn_bonustype;
    ba->drr->drr_u.drr_object.drr_blksz =
        dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT;
    ba->drr->drr_u.drr_object.drr_bonuslen = dnp->dn_bonuslen;
    ba->drr->drr_u.drr_object.drr_checksum = dnp->dn_checksum;
    ba->drr->drr_u.drr_object.drr_compress = dnp->dn_compress;

    if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)))
        return (EINTR);

    if (dump_bytes(ba, DN_BONUS(dnp), P2ROUNDUP(dnp->dn_bonuslen, 8)))
        return (EINTR);

    /* free anything past the end of the file */
    if (dump_free(ba, object, (dnp->dn_maxblkid + 1) *
        (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT), -1ULL))
        return (EINTR);
    if (ba->err)
        return (EINTR);
    return (0);
}

#define BP_SPAN(dnp, level) \
    (((uint64_t)dnp->dn_datablkszsec) << (SPA_MINBLOCKSHIFT + \
    (level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT)))

static int
backup_cb(traverse_blk_cache_t *bc, spa_t *spa, void *arg)
{
    struct backuparg *ba = arg;
    uint64_t object = bc->bc_bookmark.zb_object;
    int level = bc->bc_bookmark.zb_level;
    uint64_t blkid = bc->bc_bookmark.zb_blkid;
    blkptr_t *bp = bc->bc_blkptr.blk_birth ? &bc->bc_blkptr : NULL;
    dmu_object_type_t type = bp ? BP_GET_TYPE(bp) : DMU_OT_NONE;
    void *data = bc->bc_data;
    int err = 0;

    if (issig(JUSTLOOKING) && issig(FORREAL))
        return (EINTR);

    ASSERT(data || bp == NULL);

    if (bp == NULL && object == 0) {
        uint64_t span = BP_SPAN(bc->bc_dnode, level);
        uint64_t dnobj = (blkid * span) >> DNODE_SHIFT;
        err = dump_freeobjects(ba, dnobj, span >> DNODE_SHIFT);
    } else if (bp == NULL) {
        uint64_t span = BP_SPAN(bc->bc_dnode, level);
        err = dump_free(ba, object, blkid * span, span);
    } else if (data && level == 0 && type == DMU_OT_DNODE) {
        dnode_phys_t *blk = data;
        int i;
        int blksz = BP_GET_LSIZE(bp);

        for (i = 0; i < blksz >> DNODE_SHIFT; i++) {
            uint64_t dnobj =
                (blkid << (DNODE_BLOCK_SHIFT - DNODE_SHIFT)) + i;
            err = dump_dnode(ba, dnobj, blk+i);
            if (err)
                break;
        }
    } else if (level == 0 &&
        type != DMU_OT_DNODE && type != DMU_OT_OBJSET) {
        int blksz = BP_GET_LSIZE(bp);
        if (data == NULL) {
            arc_buf_t *abuf;
            zbookmark_t zb;

            zb.zb_objset = ba->os->os->os_dsl_dataset->ds_object;
            zb.zb_object = object;
            zb.zb_level = level;
            zb.zb_blkid = blkid;
            (void) arc_read(NULL, spa, bp,
                dmu_ot[type].ot_byteswap, arc_getbuf_func, &abuf,
                ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_MUSTSUCCEED,
                ARC_WAIT, &zb);

            if (abuf) {
                err = dump_data(ba, type, object, blkid * blksz,
                    blksz, abuf->b_data);
                (void) arc_buf_remove_ref(abuf, &abuf);
            }
        } else {
            err = dump_data(ba, type, object, blkid * blksz,
                blksz, data);
        }
    }

    ASSERT(err == 0 || err == EINTR);
    return (err);
}

int
dmu_sendbackup(objset_t *tosnap, objset_t *fromsnap, vnode_t *vp)
{
    dsl_dataset_t *ds = tosnap->os->os_dsl_dataset;
    dsl_dataset_t *fromds = fromsnap ? fromsnap->os->os_dsl_dataset : NULL;
    dmu_replay_record_t *drr;
    struct backuparg ba;
    int err;

    /* tosnap must be a snapshot */
    if (ds->ds_phys->ds_next_snap_obj == 0)
        return (EINVAL);

    /* fromsnap must be an earlier snapshot from the same fs as tosnap */
    if (fromds && (ds->ds_dir != fromds->ds_dir ||
        fromds->ds_phys->ds_creation_txg >= ds->ds_phys->ds_creation_txg))
        return (EXDEV);

    drr = kmem_zalloc(sizeof (dmu_replay_record_t), KM_SLEEP);
    drr->drr_type = DRR_BEGIN;
    drr->drr_u.drr_begin.drr_magic = DMU_BACKUP_MAGIC;
    drr->drr_u.drr_begin.drr_version = DMU_BACKUP_VERSION;
    drr->drr_u.drr_begin.drr_creation_time =
        ds->ds_phys->ds_creation_time;
    drr->drr_u.drr_begin.drr_type = tosnap->os->os_phys->os_type;
    drr->drr_u.drr_begin.drr_toguid = ds->ds_phys->ds_guid;
    if (fromds)
        drr->drr_u.drr_begin.drr_fromguid = fromds->ds_phys->ds_guid;
    dsl_dataset_name(ds, drr->drr_u.drr_begin.drr_toname);

    ba.drr = drr;
    ba.vp = vp;
    ba.os = tosnap;
    ZIO_SET_CHECKSUM(&ba.zc, 0, 0, 0, 0);

    if (dump_bytes(&ba, drr, sizeof (dmu_replay_record_t))) {
        kmem_free(drr, sizeof (dmu_replay_record_t));
        return (ba.err);
    }

    err = traverse_dsl_dataset(ds,
        fromds ? fromds->ds_phys->ds_creation_txg : 0,
        ADVANCE_PRE | ADVANCE_HOLES | ADVANCE_DATA | ADVANCE_NOLOCK,
        backup_cb, &ba);

    if (err) {
        if (err == EINTR && ba.err)
            err = ba.err;
        return (err);
    }

    bzero(drr, sizeof (dmu_replay_record_t));
    drr->drr_type = DRR_END;
    drr->drr_u.drr_end.drr_checksum = ba.zc;

    if (dump_bytes(&ba, drr, sizeof (dmu_replay_record_t)))
        return (ba.err);

    kmem_free(drr, sizeof (dmu_replay_record_t));

    return (0);
}

struct restorearg {
    int err;
    int byteswap;
    vnode_t *vp;
    char *buf;
    uint64_t voff;
    int buflen; /* number of valid bytes in buf */
    int bufoff; /* next offset to read */
    int bufsize; /* amount of memory allocated for buf */
    zio_cksum_t zc;
};

/* ARGSUSED */
static int
replay_incremental_check(void *arg1, void *arg2, dmu_tx_t *tx)
{
    dsl_dataset_t *ds = arg1;
    struct drr_begin *drrb = arg2;
    const char *snapname;
    int err;
    uint64_t val;

    /* must already be a snapshot of this fs */
    if (ds->ds_phys->ds_prev_snap_obj == 0)
        return (ENODEV);

    /* most recent snapshot must match fromguid */
    if (ds->ds_prev->ds_phys->ds_guid != drrb->drr_fromguid)
        return (ENODEV);
    /* must not have any changes since most recent snapshot */
    if (ds->ds_phys->ds_bp.blk_birth >
        ds->ds_prev->ds_phys->ds_creation_txg)
        return (ETXTBSY);

    /* new snapshot name must not exist */
    snapname = strrchr(drrb->drr_toname, '@');
    if (snapname == NULL)
        return (EEXIST);

    snapname++;
    err = zap_lookup(ds->ds_dir->dd_pool->dp_meta_objset,
        ds->ds_phys->ds_snapnames_zapobj, snapname, 8, 1, &val);
    if (err == 0)
        return (EEXIST);
    if (err != ENOENT)
        return (err);

    return (0);
}

/* ARGSUSED */
static void
replay_incremental_sync(void *arg1, void *arg2, dmu_tx_t *tx)
{
    dsl_dataset_t *ds = arg1;
    dmu_buf_will_dirty(ds->ds_dbuf, tx);
    ds->ds_phys->ds_flags |= DS_FLAG_INCONSISTENT;
}

/* ARGSUSED */
static int
replay_full_check(void *arg1, void *arg2, dmu_tx_t *tx)
{
    dsl_dir_t *dd = arg1;
    struct drr_begin *drrb = arg2;
    objset_t *mos = dd->dd_pool->dp_meta_objset;
    char *cp;
    uint64_t val;
    int err;

    cp = strchr(drrb->drr_toname, '@');
    *cp = '\0';
    err = zap_lookup(mos, dd->dd_phys->dd_child_dir_zapobj,
        strrchr(drrb->drr_toname, '/') + 1,
        sizeof (uint64_t), 1, &val);
    *cp = '@';

    if (err != ENOENT)
        return (err ? err : EEXIST);

    return (0);
}

static void
replay_full_sync(void *arg1, void *arg2, dmu_tx_t *tx)
{
    dsl_dir_t *dd = arg1;
    struct drr_begin *drrb = arg2;
    char *cp;
    dsl_dataset_t *ds;
    uint64_t dsobj;

    cp = strchr(drrb->drr_toname, '@');
    *cp = '\0';
    dsobj = dsl_dataset_create_sync(dd, strrchr(drrb->drr_toname, '/') + 1,
        NULL, tx);
    *cp = '@';

    VERIFY(0 == dsl_dataset_open_obj(dd->dd_pool, dsobj, NULL,
        DS_MODE_EXCLUSIVE, FTAG, &ds));

    (void) dmu_objset_create_impl(dsl_dataset_get_spa(ds),
        ds, drrb->drr_type, tx);

    dmu_buf_will_dirty(ds->ds_dbuf, tx);
    ds->ds_phys->ds_flags |= DS_FLAG_INCONSISTENT;

    dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG);
}

static int
replay_end_check(void *arg1, void *arg2, dmu_tx_t *tx)
{
    objset_t *os = arg1;
    struct drr_begin *drrb = arg2;
    char *snapname;

    /* XXX verify that drr_toname is in dd */

    snapname = strchr(drrb->drr_toname, '@');
    if (snapname == NULL)
        return (EINVAL);
    snapname++;

    return (dsl_dataset_snapshot_check(os, snapname, tx));
}

static void
replay_end_sync(void *arg1, void *arg2, dmu_tx_t *tx)
{
    objset_t *os = arg1;
    struct drr_begin *drrb = arg2;
    char *snapname;
    dsl_dataset_t *ds, *hds;

    snapname = strchr(drrb->drr_toname, '@') + 1;

    dsl_dataset_snapshot_sync(os, snapname, tx);

    /* set snapshot's creation time and guid */
    hds = os->os->os_dsl_dataset;
    VERIFY(0 == dsl_dataset_open_obj(hds->ds_dir->dd_pool,
        hds->ds_phys->ds_prev_snap_obj, NULL,
        DS_MODE_PRIMARY | DS_MODE_READONLY | DS_MODE_INCONSISTENT,
        FTAG, &ds));

    dmu_buf_will_dirty(ds->ds_dbuf, tx);
    ds->ds_phys->ds_creation_time = drrb->drr_creation_time;
    ds->ds_phys->ds_guid = drrb->drr_toguid;
    ds->ds_phys->ds_flags &= ~DS_FLAG_INCONSISTENT;

    dsl_dataset_close(ds, DS_MODE_PRIMARY, FTAG);

    dmu_buf_will_dirty(hds->ds_dbuf, tx);
    hds->ds_phys->ds_flags &= ~DS_FLAG_INCONSISTENT;
}

void *
restore_read(struct restorearg *ra, int len)
{
    void *rv;

    /* some things will require 8-byte alignment, so everything must */
    ASSERT3U(len % 8, ==, 0);

    while (ra->buflen - ra->bufoff < len) {
        ssize_t resid;
        int leftover = ra->buflen - ra->bufoff;

        (void) memmove(ra->buf, ra->buf + ra->bufoff, leftover);
        ra->err = vn_rdwr(UIO_READ, ra->vp,
            (caddr_t)ra->buf + leftover, ra->bufsize - leftover,
            ra->voff, UIO_SYSSPACE, FAPPEND,
            RLIM64_INFINITY, CRED(), &resid);

        ra->voff += ra->bufsize - leftover - resid;
        ra->buflen = ra->bufsize - resid;
        ra->bufoff = 0;
        if (resid == ra->bufsize - leftover)
            ra->err = EINVAL;
        if (ra->err)
            return (NULL);
        /* Could compute checksum here? */
    }

    ASSERT3U(ra->bufoff % 8, ==, 0);
    ASSERT3U(ra->buflen - ra->bufoff, >=, len);
    rv = ra->buf + ra->bufoff;
    ra->bufoff += len;
    if (ra->byteswap)
        fletcher_4_incremental_byteswap(rv, len, &ra->zc);
    else
        fletcher_4_incremental_native(rv, len, &ra->zc);
    return (rv);
}

static void
backup_byteswap(dmu_replay_record_t *drr)
{
#define DO64(X) (drr->drr_u.X = BSWAP_64(drr->drr_u.X))
#define DO32(X) (drr->drr_u.X = BSWAP_32(drr->drr_u.X))
    drr->drr_type = BSWAP_32(drr->drr_type);
    switch (drr->drr_type) {
    case DRR_BEGIN:
        DO64(drr_begin.drr_magic);
        DO64(drr_begin.drr_version);
        DO64(drr_begin.drr_creation_time);
        DO32(drr_begin.drr_type);
        DO64(drr_begin.drr_toguid);
        DO64(drr_begin.drr_fromguid);
        break;
    case DRR_OBJECT:
        DO64(drr_object.drr_object);
        /* DO64(drr_object.drr_allocation_txg); */
        DO32(drr_object.drr_type);
        DO32(drr_object.drr_bonustype);
        DO32(drr_object.drr_blksz);
        DO32(drr_object.drr_bonuslen);
        break;
    case DRR_FREEOBJECTS:
        DO64(drr_freeobjects.drr_firstobj);
        DO64(drr_freeobjects.drr_numobjs);
        break;
    case DRR_WRITE:
        DO64(drr_write.drr_object);
        DO32(drr_write.drr_type);
        DO64(drr_write.drr_offset);
        DO64(drr_write.drr_length);
        break;
    case DRR_FREE:
        DO64(drr_free.drr_object);
        DO64(drr_free.drr_offset);
        DO64(drr_free.drr_length);
        break;
    case DRR_END:
        DO64(drr_end.drr_checksum.zc_word[0]);
        DO64(drr_end.drr_checksum.zc_word[1]);
        DO64(drr_end.drr_checksum.zc_word[2]);
        DO64(drr_end.drr_checksum.zc_word[3]);
        break;
    }
#undef DO64
#undef DO32
}

static int
restore_object(struct restorearg *ra, objset_t *os, struct drr_object *drro)
{
    int err;
    dmu_tx_t *tx;

    err = dmu_object_info(os, drro->drr_object, NULL);

    if (err != 0 && err != ENOENT)
        return (EINVAL);

    if (drro->drr_type == DMU_OT_NONE ||
        drro->drr_type >= DMU_OT_NUMTYPES ||
        drro->drr_bonustype >= DMU_OT_NUMTYPES ||
        drro->drr_checksum >= ZIO_CHECKSUM_FUNCTIONS ||
        drro->drr_compress >= ZIO_COMPRESS_FUNCTIONS ||
        P2PHASE(drro->drr_blksz, SPA_MINBLOCKSIZE) ||
        drro->drr_blksz < SPA_MINBLOCKSIZE ||
        drro->drr_blksz > SPA_MAXBLOCKSIZE ||
        drro->drr_bonuslen > DN_MAX_BONUSLEN) {
        return (EINVAL);
    }

    tx = dmu_tx_create(os);

    if (err == ENOENT) {
        /* currently free, want to be allocated */
        dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
        dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, 1);
        err = dmu_tx_assign(tx, TXG_WAIT);
        if (err) {
            dmu_tx_abort(tx);
            return (err);
        }
        err = dmu_object_claim(os, drro->drr_object,
            drro->drr_type, drro->drr_blksz,
            drro->drr_bonustype, drro->drr_bonuslen, tx);
    } else {
        /* currently allocated, want to be allocated */
        dmu_tx_hold_bonus(tx, drro->drr_object);
        /*
         * We may change blocksize, so need to
         * hold_write
         */
        dmu_tx_hold_write(tx, drro->drr_object, 0, 1);
        err = dmu_tx_assign(tx, TXG_WAIT);
        if (err) {
            dmu_tx_abort(tx);
            return (err);
        }

        err = dmu_object_reclaim(os, drro->drr_object,
            drro->drr_type, drro->drr_blksz,
            drro->drr_bonustype, drro->drr_bonuslen, tx);
    }
    if (err) {
        dmu_tx_commit(tx);
        return (EINVAL);
    }

    dmu_object_set_checksum(os, drro->drr_object, drro->drr_checksum, tx);
    dmu_object_set_compress(os, drro->drr_object, drro->drr_compress, tx);
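
    /*
     * If the object carries bonus data, the next (8-byte padded) chunk of
     * the stream holds it; copy it into the bonus buffer, byteswapping if
     * the stream was written on a host with the opposite byte order.
     */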
    if (drro->drr_bonuslen) {
        dmu_buf_t *db;
        void *data;
        VERIFY(0 == dmu_bonus_hold(os, drro->drr_object, FTAG, &db));
        dmu_buf_will_dirty(db, tx);

        ASSERT3U(db->db_size, ==, drro->drr_bonuslen);
        data = restore_read(ra, P2ROUNDUP(db->db_size, 8));
        if (data == NULL) {
            dmu_tx_commit(tx);
            return (ra->err);
        }
        bcopy(data, db->db_data, db->db_size);
        if (ra->byteswap) {
            dmu_ot[drro->drr_bonustype].ot_byteswap(db->db_data,
                drro->drr_bonuslen);
        }
        dmu_buf_rele(db, FTAG);
    }
    dmu_tx_commit(tx);
    return (0);
}

/* ARGSUSED */
static int
restore_freeobjects(struct restorearg *ra, objset_t *os,
    struct drr_freeobjects *drrfo)
{
    uint64_t obj;

    if (drrfo->drr_firstobj + drrfo->drr_numobjs < drrfo->drr_firstobj)
        return (EINVAL);

    for (obj = drrfo->drr_firstobj;
        obj < drrfo->drr_firstobj + drrfo->drr_numobjs; obj++) {
        dmu_tx_t *tx;
        int err;

        if (dmu_object_info(os, obj, NULL) != 0)
            continue;

        tx = dmu_tx_create(os);
        dmu_tx_hold_bonus(tx, obj);
        err = dmu_tx_assign(tx, TXG_WAIT);
        if (err) {
            dmu_tx_abort(tx);
            return (err);
        }
        err = dmu_object_free(os, obj, tx);
        dmu_tx_commit(tx);
        if (err && err != ENOENT)
            return (EINVAL);
    }
    return (0);
}

static int
restore_write(struct restorearg *ra, objset_t *os,
    struct drr_write *drrw)
{
    dmu_tx_t *tx;
    void *data;
    int err;

    if (drrw->drr_offset + drrw->drr_length < drrw->drr_offset ||
        drrw->drr_type >= DMU_OT_NUMTYPES)
        return (EINVAL);

    data = restore_read(ra, drrw->drr_length);
    if (data == NULL)
        return (ra->err);

    if (dmu_object_info(os, drrw->drr_object, NULL) != 0)
        return (EINVAL);

    tx = dmu_tx_create(os);

    dmu_tx_hold_write(tx, drrw->drr_object,
        drrw->drr_offset, drrw->drr_length);
    err = dmu_tx_assign(tx, TXG_WAIT);
    if (err) {
        dmu_tx_abort(tx);
        return (err);
    }
    if (ra->byteswap)
        dmu_ot[drrw->drr_type].ot_byteswap(data, drrw->drr_length);
    dmu_write(os, drrw->drr_object,
        drrw->drr_offset, drrw->drr_length, data, tx);
    dmu_tx_commit(tx);
    return (0);
}

/* ARGSUSED */
static int
restore_free(struct restorearg *ra, objset_t *os,
    struct drr_free *drrf)
{
    dmu_tx_t *tx;
    int err;

    if (drrf->drr_length != -1ULL &&
        drrf->drr_offset + drrf->drr_length < drrf->drr_offset)
        return (EINVAL);

    if (dmu_object_info(os, drrf->drr_object, NULL) != 0)
        return (EINVAL);

    tx = dmu_tx_create(os);

    dmu_tx_hold_free(tx, drrf->drr_object,
        drrf->drr_offset, drrf->drr_length);
    err = dmu_tx_assign(tx, TXG_WAIT);
    if (err) {
        dmu_tx_abort(tx);
        return (err);
    }
    err = dmu_free_range(os, drrf->drr_object,
        drrf->drr_offset, drrf->drr_length, tx);
    dmu_tx_commit(tx);
    return (err);
}

int
dmu_recvbackup(char *tosnap, struct drr_begin *drrb, uint64_t *sizep,
    vnode_t *vp, uint64_t voffset)
{
    struct restorearg ra;
    dmu_replay_record_t *drr;
    char *cp;
    objset_t *os = NULL;
    zio_cksum_t pzc;

    bzero(&ra, sizeof (ra));
    ra.vp = vp;
    ra.voff = voffset;
    ra.bufsize = 1<<20;
    ra.buf = kmem_alloc(ra.bufsize, KM_SLEEP);

    if (drrb->drr_magic == DMU_BACKUP_MAGIC) {
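        /* stream was written in this host's native byte order */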
        ra.byteswap = FALSE;
    } else if (drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC)) {
        ra.byteswap = TRUE;
    } else {
        ra.err = EINVAL;
        goto out;
    }

    /*
     * NB: this assumes that struct drr_begin will be the largest in
     * dmu_replay_record_t's drr_u, and thus we don't need to pad it
     * with zeros to make it the same length as we wrote out.
     */
    ((dmu_replay_record_t *)ra.buf)->drr_type = DRR_BEGIN;
    ((dmu_replay_record_t *)ra.buf)->drr_pad = 0;
    ((dmu_replay_record_t *)ra.buf)->drr_u.drr_begin = *drrb;
    if (ra.byteswap) {
        fletcher_4_incremental_byteswap(ra.buf,
            sizeof (dmu_replay_record_t), &ra.zc);
    } else {
        fletcher_4_incremental_native(ra.buf,
            sizeof (dmu_replay_record_t), &ra.zc);
    }
    (void) strcpy(drrb->drr_toname, tosnap); /* for the sync funcs */

    if (ra.byteswap) {
        drrb->drr_magic = BSWAP_64(drrb->drr_magic);
        drrb->drr_version = BSWAP_64(drrb->drr_version);
        drrb->drr_creation_time = BSWAP_64(drrb->drr_creation_time);
        drrb->drr_type = BSWAP_32(drrb->drr_type);
        drrb->drr_toguid = BSWAP_64(drrb->drr_toguid);
        drrb->drr_fromguid = BSWAP_64(drrb->drr_fromguid);
    }

    ASSERT3U(drrb->drr_magic, ==, DMU_BACKUP_MAGIC);

    if (drrb->drr_version != DMU_BACKUP_VERSION ||
        drrb->drr_type >= DMU_OST_NUMTYPES ||
        strchr(drrb->drr_toname, '@') == NULL) {
        ra.err = EINVAL;
        goto out;
    }

    /*
     * Process the begin in syncing context.
     */
    if (drrb->drr_fromguid) {
        /* incremental backup */
        dsl_dataset_t *ds = NULL;

        cp = strchr(tosnap, '@');
        *cp = '\0';
        ra.err = dsl_dataset_open(tosnap, DS_MODE_EXCLUSIVE, FTAG, &ds);
        *cp = '@';
        if (ra.err)
            goto out;

        ra.err = dsl_sync_task_do(ds->ds_dir->dd_pool,
            replay_incremental_check, replay_incremental_sync,
            ds, drrb, 1);
        dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG);
    } else {
        /* full backup */
        dsl_dir_t *dd = NULL;
        const char *tail;

        /* can't restore full backup into topmost fs, for now */
        if (strrchr(drrb->drr_toname, '/') == NULL) {
            ra.err = EINVAL;
            goto out;
        }

        cp = strchr(tosnap, '@');
        *cp = '\0';
        ra.err = dsl_dir_open(tosnap, FTAG, &dd, &tail);
        *cp = '@';
        if (ra.err)
            goto out;
        if (tail == NULL) {
            ra.err = EEXIST;
            goto out;
        }

        ra.err = dsl_sync_task_do(dd->dd_pool, replay_full_check,
            replay_full_sync, dd, drrb, 5);
        dsl_dir_close(dd, FTAG);
    }
    if (ra.err)
        goto out;

    /*
     * Open the objset we are modifying.
     */
    cp = strchr(tosnap, '@');
    *cp = '\0';
    ra.err = dmu_objset_open(tosnap, DMU_OST_ANY,
        DS_MODE_PRIMARY | DS_MODE_INCONSISTENT, &os);
    *cp = '@';
    ASSERT3U(ra.err, ==, 0);

    /*
     * Read records and process them.
     */
    pzc = ra.zc;
    while (ra.err == 0 &&
        NULL != (drr = restore_read(&ra, sizeof (*drr)))) {
        if (issig(JUSTLOOKING) && issig(FORREAL)) {
            ra.err = EINTR;
            goto out;
        }

        if (ra.byteswap)
            backup_byteswap(drr);

        switch (drr->drr_type) {
        case DRR_OBJECT:
        {
            /*
             * We need to make a copy of the record header,
             * because restore_{object,write} may need to
             * restore_read(), which will invalidate drr.
             */
            struct drr_object drro = drr->drr_u.drr_object;
            ra.err = restore_object(&ra, os, &drro);
            break;
        }
        case DRR_FREEOBJECTS:
        {
            struct drr_freeobjects drrfo =
                drr->drr_u.drr_freeobjects;
            ra.err = restore_freeobjects(&ra, os, &drrfo);
            break;
        }
        case DRR_WRITE:
        {
            struct drr_write drrw = drr->drr_u.drr_write;
            ra.err = restore_write(&ra, os, &drrw);
            break;
        }
        case DRR_FREE:
        {
            struct drr_free drrf = drr->drr_u.drr_free;
            ra.err = restore_free(&ra, os, &drrf);
            break;
        }
        case DRR_END:
        {
            struct drr_end drre = drr->drr_u.drr_end;
            /*
             * We compare against the *previous* checksum
             * value, because the stored checksum is of
             * everything before the DRR_END record.
             */
            if (drre.drr_checksum.zc_word[0] != 0 &&
                ((drre.drr_checksum.zc_word[0] - pzc.zc_word[0]) |
                (drre.drr_checksum.zc_word[1] - pzc.zc_word[1]) |
                (drre.drr_checksum.zc_word[2] - pzc.zc_word[2]) |
                (drre.drr_checksum.zc_word[3] - pzc.zc_word[3]))) {
                ra.err = ECKSUM;
                goto out;
            }

            ra.err = dsl_sync_task_do(dmu_objset_ds(os)->ds_dir->dd_pool,
                replay_end_check, replay_end_sync,
                os, drrb, 3);
            goto out;
        }
        default:
            ra.err = EINVAL;
            goto out;
        }
        pzc = ra.zc;
    }

out:
    if (os)
        dmu_objset_close(os);

    /*
     * Make sure we don't rollback/destroy unless we actually
     * processed the begin properly.  'os' will only be set if this
     * is the case.
     */
    if (ra.err && os && tosnap && strchr(tosnap, '@')) {
        /*
         * rollback or destroy what we created, so we don't
         * leave it in the restoring state.
         */
        dsl_dataset_t *ds;
        int err;

        cp = strchr(tosnap, '@');
        *cp = '\0';
        err = dsl_dataset_open(tosnap,
            DS_MODE_EXCLUSIVE | DS_MODE_INCONSISTENT,
            FTAG, &ds);
        if (err == 0) {
            txg_wait_synced(ds->ds_dir->dd_pool, 0);
            if (drrb->drr_fromguid) {
                /* incremental: rollback to most recent snap */
                (void) dsl_dataset_rollback(ds);
                dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG);
            } else {
                /* full: destroy whole fs */
                dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG);
                (void) dsl_dataset_destroy(tosnap);
            }
        }
        *cp = '@';
    }

    kmem_free(ra.buf, ra.bufsize);
    if (sizep)
        *sizep = ra.voff;
    return (ra.err);
}

typedef struct {
    uint64_t txg;
    dmu_buf_impl_t *db;
    dmu_sync_cb_t *done;
    void *arg;
} dmu_sync_cbin_t;

typedef union {
    dmu_sync_cbin_t data;
    blkptr_t blk;
} dmu_sync_cbarg_t;

/* ARGSUSED */
static void
dmu_sync_done(zio_t *zio, arc_buf_t *buf, void *varg)
{
    dmu_sync_cbin_t *in = (dmu_sync_cbin_t *)varg;
    dmu_buf_impl_t *db = in->db;
    uint64_t txg = in->txg;
    dmu_sync_cb_t *done = in->done;
    void *arg = in->arg;
    blkptr_t *blk = (blkptr_t *)varg;

    if (!BP_IS_HOLE(zio->io_bp)) {
        zio->io_bp->blk_fill = 1;
        BP_SET_TYPE(zio->io_bp, db->db_dnode->dn_type);
        BP_SET_LEVEL(zio->io_bp, 0);
    }

    *blk = *zio->io_bp; /* structure assignment */

    mutex_enter(&db->db_mtx);
    ASSERT(db->db_d.db_overridden_by[txg&TXG_MASK] == IN_DMU_SYNC);
    db->db_d.db_overridden_by[txg&TXG_MASK] = blk;
    cv_broadcast(&db->db_changed);
    mutex_exit(&db->db_mtx);

    if (done)
        done(&(db->db), arg);
}

/*
 * Intent log support: sync the block associated with db to disk.
 * N.B. and XXX: the caller is responsible for making sure that the
 * data isn't changing while dmu_sync() is writing it.
 *
 * Return values:
 *
 *	EEXIST: this txg has already been synced, so there's nothing to do.
 *		The caller should not log the write.
 *
 *	ENOENT: the block was dbuf_free_range()'d, so there's nothing to do.
 *		The caller should not log the write.
 *
 *	EALREADY: this block is already in the process of being synced.
 *		The caller should track its progress (somehow).
 *
 *	EINPROGRESS: the IO has been initiated.
 *		The caller should log this blkptr in the callback.
 *
 *	0: completed.  Sets *bp to the blkptr just written.
 *		The caller should log this blkptr immediately.
 */
int
dmu_sync(zio_t *pio, dmu_buf_t *db_fake,
    blkptr_t *bp, uint64_t txg, dmu_sync_cb_t *done, void *arg)
{
    dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
    objset_impl_t *os = db->db_objset;
    dsl_pool_t *dp = os->os_dsl_dataset->ds_dir->dd_pool;
    tx_state_t *tx = &dp->dp_tx;
    dmu_sync_cbin_t *in;
    blkptr_t *blk;
    zbookmark_t zb;
    uint32_t arc_flag;
    int err;

    ASSERT(BP_IS_HOLE(bp));
    ASSERT(txg != 0);

    dprintf("dmu_sync txg=%llu, s,o,q %llu %llu %llu\n",
        txg, tx->tx_synced_txg, tx->tx_open_txg, tx->tx_quiesced_txg);

    /*
     * XXX - would be nice if we could do this without suspending...
     */
    txg_suspend(dp);

    /*
     * If this txg already synced, there's nothing to do.
     */
    if (txg <= tx->tx_synced_txg) {
        txg_resume(dp);
        /*
         * If we're running ziltest, we need the blkptr regardless.
         */
        if (txg > spa_freeze_txg(dp->dp_spa)) {
            /* if db_blkptr == NULL, this was an empty write */
            if (db->db_blkptr)
                *bp = *db->db_blkptr; /* structure assignment */
            return (0);
        }
        return (EEXIST);
    }

    mutex_enter(&db->db_mtx);

    blk = db->db_d.db_overridden_by[txg&TXG_MASK];
    if (blk == IN_DMU_SYNC) {
        /*
         * We have already issued a sync write for this buffer.
         */
        mutex_exit(&db->db_mtx);
        txg_resume(dp);
        return (EALREADY);
    } else if (blk != NULL) {
        /*
         * This buffer had already been synced.  It could not
         * have been dirtied since, or we would have cleared blk.
         */
        *bp = *blk; /* structure assignment */
        mutex_exit(&db->db_mtx);
        txg_resume(dp);
        return (0);
    }

    if (txg == tx->tx_syncing_txg) {
        while (db->db_data_pending) {
            /*
             * IO is in-progress.  Wait for it to finish.
             * XXX - would be nice to be able to somehow "attach"
             * this zio to the parent zio passed in.
             */
            cv_wait(&db->db_changed, &db->db_mtx);
            ASSERT(db->db_data_pending ||
                (db->db_blkptr && db->db_blkptr->blk_birth == txg));
        }

        if (db->db_blkptr && db->db_blkptr->blk_birth == txg) {
            /*
             * IO is already completed.
             */
            *bp = *db->db_blkptr; /* structure assignment */
            mutex_exit(&db->db_mtx);
            txg_resume(dp);
            return (0);
        }
    }

    if (db->db_d.db_data_old[txg&TXG_MASK] == NULL) {
        /*
         * This dbuf isn't dirty, must have been free_range'd.
         * There's no need to log writes to freed blocks, so we're done.
         */
        mutex_exit(&db->db_mtx);
        txg_resume(dp);
        return (ENOENT);
    }

    ASSERT(db->db_d.db_overridden_by[txg&TXG_MASK] == NULL);
    db->db_d.db_overridden_by[txg&TXG_MASK] = IN_DMU_SYNC;
    /*
     * XXX - a little ugly to stash the blkptr in the callback
     * buffer.  We always need to make sure the following is true:
     * ASSERT(sizeof(blkptr_t) >= sizeof(dmu_sync_cbin_t));
     */
    in = kmem_alloc(sizeof (blkptr_t), KM_SLEEP);
    in->db = db;
    in->txg = txg;
    in->done = done;
    in->arg = arg;
    mutex_exit(&db->db_mtx);
    txg_resume(dp);

    arc_flag = pio == NULL ? ARC_WAIT : ARC_NOWAIT;
    zb.zb_objset = os->os_dsl_dataset->ds_object;
    zb.zb_object = db->db.db_object;
    zb.zb_level = db->db_level;
    zb.zb_blkid = db->db_blkid;
    err = arc_write(pio, os->os_spa,
        zio_checksum_select(db->db_dnode->dn_checksum, os->os_checksum),
        zio_compress_select(db->db_dnode->dn_compress, os->os_compress),
        dmu_get_replication_level(os->os_spa, &zb, db->db_dnode->dn_type),
        txg, bp, db->db_d.db_data_old[txg&TXG_MASK], dmu_sync_done, in,
        ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, arc_flag, &zb);
    ASSERT(err == 0);

    return (arc_flag == ARC_NOWAIT ? EINPROGRESS : 0);
}

uint64_t
dmu_object_max_nonzero_offset(objset_t *os, uint64_t object)
{
    dnode_t *dn;

    /* XXX assumes dnode_hold will not get an i/o error */
    (void) dnode_hold(os->os, object, FTAG, &dn);
    uint64_t rv = dnode_max_nonzero_offset(dn);
    dnode_rele(dn, FTAG);
    return (rv);
}

int
dmu_object_set_blocksize(objset_t *os, uint64_t object, uint64_t size, int ibs,
    dmu_tx_t *tx)
{
    dnode_t *dn;
    int err;

    err = dnode_hold(os->os, object, FTAG, &dn);
    if (err)
        return (err);
    err = dnode_set_blksz(dn, size, ibs, tx);
    dnode_rele(dn, FTAG);
    return (err);
}

void
dmu_object_set_checksum(objset_t *os, uint64_t object, uint8_t checksum,
    dmu_tx_t *tx)
{
    dnode_t *dn;

    /* XXX assumes dnode_hold will not get an i/o error */
    (void) dnode_hold(os->os, object, FTAG, &dn);
    ASSERT(checksum < ZIO_CHECKSUM_FUNCTIONS);
    dn->dn_checksum = checksum;
    dnode_setdirty(dn, tx);
    dnode_rele(dn, FTAG);
}

void
dmu_object_set_compress(objset_t *os, uint64_t object, uint8_t compress,
    dmu_tx_t *tx)
{
    dnode_t *dn;

    /* XXX assumes dnode_hold will not get an i/o error */
    (void) dnode_hold(os->os, object, FTAG, &dn);
    ASSERT(compress < ZIO_COMPRESS_FUNCTIONS);
    dn->dn_compress = compress;
    dnode_setdirty(dn, tx);
    dnode_rele(dn, FTAG);
}

/*
 * XXX - eventually, this should take into account per-dataset (or
 * even per-object?) user requests for higher levels of replication.
 */
int
dmu_get_replication_level(spa_t *spa, zbookmark_t *zb, dmu_object_type_t ot)
{
    int ncopies = 1;

    if (dmu_ot[ot].ot_metadata)
        ncopies++;
    if (zb->zb_level != 0)
        ncopies++;
    if (zb->zb_objset == 0 && zb->zb_object == 0)
        ncopies++;
    return (MIN(ncopies, spa_max_replication(spa)));
}

int
dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole, uint64_t *off)
{
    dnode_t *dn;
    int i, err;

    err = dnode_hold(os->os, object, FTAG, &dn);
    if (err)
        return (err);
    /*
     * Sync any current changes before
     * we go trundling through the block pointers.
     */
    for (i = 0; i < TXG_SIZE; i++) {
        if (list_link_active(&dn->dn_dirty_link[i]))
            break;
    }
    if (i != TXG_SIZE) {
        dnode_rele(dn, FTAG);
        txg_wait_synced(dmu_objset_pool(os), 0);
        err = dnode_hold(os->os, object, FTAG, &dn);
        if (err)
            return (err);
    }

    err = dnode_next_offset(dn, hole, off, 1, 1);
    dnode_rele(dn, FTAG);

    return (err);
}

void
dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi)
{
    rw_enter(&dn->dn_struct_rwlock, RW_READER);
    mutex_enter(&dn->dn_mtx);

    doi->doi_data_block_size = dn->dn_datablksz;
    doi->doi_metadata_block_size = dn->dn_indblkshift ?
        1ULL << dn->dn_indblkshift : 0;
    doi->doi_indirection = dn->dn_nlevels;
    doi->doi_checksum = dn->dn_checksum;
    doi->doi_compress = dn->dn_compress;
    doi->doi_physical_blks = (DN_USED_BYTES(dn->dn_phys) +
        SPA_MINBLOCKSIZE/2) >> SPA_MINBLOCKSHIFT;
    doi->doi_max_block_offset = dn->dn_phys->dn_maxblkid;
    doi->doi_type = dn->dn_type;
    doi->doi_bonus_size = dn->dn_bonuslen;
    doi->doi_bonus_type = dn->dn_bonustype;

    mutex_exit(&dn->dn_mtx);
    rw_exit(&dn->dn_struct_rwlock);
}

/*
 * Get information on a DMU object.
 * If doi is NULL, just indicates whether the object exists.
 */
int
dmu_object_info(objset_t *os, uint64_t object, dmu_object_info_t *doi)
{
    dnode_t *dn;
    int err = dnode_hold(os->os, object, FTAG, &dn);

    if (err)
        return (err);

    if (doi != NULL)
        dmu_object_info_from_dnode(dn, doi);

    dnode_rele(dn, FTAG);
    return (0);
}

/*
 * As above, but faster; can be used when you have a held dbuf in hand.
 */
void
dmu_object_info_from_db(dmu_buf_t *db, dmu_object_info_t *doi)
{
    dmu_object_info_from_dnode(((dmu_buf_impl_t *)db)->db_dnode, doi);
}

/*
 * Faster still when you only care about the size.
 * This is specifically optimized for zfs_getattr().
 */
void
dmu_object_size_from_db(dmu_buf_t *db, uint32_t *blksize, u_longlong_t *nblk512)
{
    dnode_t *dn = ((dmu_buf_impl_t *)db)->db_dnode;

    *blksize = dn->dn_datablksz;
    /* add 1 for dnode space */
    *nblk512 = ((DN_USED_BYTES(dn->dn_phys) + SPA_MINBLOCKSIZE/2) >>
        SPA_MINBLOCKSHIFT) + 1;
}

/*
 * Given a bookmark, return the name of the dataset, object, and range in
 * human-readable format.
 */
int
spa_bookmark_name(spa_t *spa, zbookmark_t *zb, char *dsname, size_t dslen,
    char *objname, size_t objlen, char *range, size_t rangelen)
{
    dsl_pool_t *dp;
    dsl_dataset_t *ds = NULL;
    objset_t *os = NULL;
    dnode_t *dn = NULL;
    int err, shift;

    if (dslen < MAXNAMELEN || objlen < 32 || rangelen < 64)
        return (ENOSPC);

    dp = spa_get_dsl(spa);
    if (zb->zb_objset != 0) {
        rw_enter(&dp->dp_config_rwlock, RW_READER);
        err = dsl_dataset_open_obj(dp, zb->zb_objset,
            NULL, DS_MODE_NONE, FTAG, &ds);
        if (err) {
            rw_exit(&dp->dp_config_rwlock);
            return (err);
        }
        dsl_dataset_name(ds, dsname);
        dsl_dataset_close(ds, DS_MODE_NONE, FTAG);
        rw_exit(&dp->dp_config_rwlock);

        err = dmu_objset_open(dsname, DMU_OST_ANY, DS_MODE_NONE, &os);
        if (err)
            goto out;

    } else {
        dsl_dataset_name(NULL, dsname);
        os = dp->dp_meta_objset;
    }

    if (zb->zb_object == DMU_META_DNODE_OBJECT) {
        (void) strncpy(objname, "mdn", objlen);
    } else {
        (void) snprintf(objname, objlen, "%lld",
            (longlong_t)zb->zb_object);
    }

    err = dnode_hold(os->os, zb->zb_object, FTAG, &dn);
    if (err)
        goto out;

    shift = (dn->dn_datablkshift?dn->dn_datablkshift:SPA_MAXBLOCKSHIFT) +
        zb->zb_level * (dn->dn_indblkshift - SPA_BLKPTRSHIFT);
    (void) snprintf(range, rangelen, "%llu-%llu",
        (u_longlong_t)(zb->zb_blkid << shift),
        (u_longlong_t)((zb->zb_blkid+1) << shift));

out:
    if (dn)
        dnode_rele(dn, FTAG);
    if (os && os != dp->dp_meta_objset)
        dmu_objset_close(os);
    return (err);
}

void
byteswap_uint64_array(void *vbuf, size_t size)
{
    uint64_t *buf = vbuf;
    size_t count = size >> 3;
    int i;

    ASSERT((size & 7) == 0);

    for (i = 0; i < count; i++)
        buf[i] = BSWAP_64(buf[i]);
}

void
byteswap_uint32_array(void *vbuf, size_t size)
{
    uint32_t *buf = vbuf;
    size_t count = size >> 2;
    int i;

    ASSERT((size & 3) == 0);

    for (i = 0; i < count; i++)
        buf[i] = BSWAP_32(buf[i]);
}

void
byteswap_uint16_array(void *vbuf, size_t size)
{
    uint16_t *buf = vbuf;
    size_t count = size >> 1;
    int i;

    ASSERT((size & 1) == 0);

    for (i = 0; i < count; i++)
        buf[i] = BSWAP_16(buf[i]);
}

/* ARGSUSED */
void
byteswap_uint8_array(void *vbuf, size_t size)
{
}

void
dmu_init(void)
{
    dbuf_init();
    dnode_init();
    arc_init();
}

void
dmu_fini(void)
{
    arc_fini();
    dnode_fini();
    dbuf_fini();
}