/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M% %I% %E% SMI"

#include <sys/dmu.h>
#include <sys/dmu_impl.h>
#include <sys/dmu_tx.h>
#include <sys/dbuf.h>
#include <sys/dnode.h>
#include <sys/zfs_context.h>
#include <sys/dmu_objset.h>
#include <sys/dmu_traverse.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_dir.h>
#include <sys/dsl_pool.h>
#include <sys/dsl_synctask.h>
#include <sys/dmu_zfetch.h>
#include <sys/zfs_ioctl.h>
#include <sys/zap.h>
#include <sys/zio_checksum.h>

const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = {
	{ byteswap_uint8_array,		TRUE,	"unallocated" },
	{ zap_byteswap,			TRUE,	"object directory" },
	{ byteswap_uint64_array,	TRUE,	"object array" },
	{ byteswap_uint8_array,		TRUE,	"packed nvlist" },
	{ byteswap_uint64_array,	TRUE,	"packed nvlist size" },
	{ byteswap_uint64_array,	TRUE,	"bplist" },
	{ byteswap_uint64_array,	TRUE,	"bplist header" },
	{ byteswap_uint64_array,	TRUE,	"SPA space map header" },
	{ byteswap_uint64_array,	TRUE,	"SPA space map" },
	{ byteswap_uint64_array,	TRUE,	"ZIL intent log" },
	{ dnode_buf_byteswap,		TRUE,	"DMU dnode" },
	{ dmu_objset_byteswap,		TRUE,	"DMU objset" },
	{ byteswap_uint64_array,	TRUE,	"DSL directory" },
	{ zap_byteswap,			TRUE,	"DSL directory child map" },
	{ zap_byteswap,			TRUE,	"DSL dataset snap map" },
	{ zap_byteswap,			TRUE,	"DSL props" },
	{ byteswap_uint64_array,	TRUE,	"DSL dataset" },
	{ zfs_znode_byteswap,		TRUE,	"ZFS znode" },
	{ zfs_acl_byteswap,		TRUE,	"ZFS ACL" },
	{ byteswap_uint8_array,		FALSE,	"ZFS plain file" },
	{ zap_byteswap,			TRUE,	"ZFS directory" },
	{ zap_byteswap,			TRUE,	"ZFS master node" },
	{ zap_byteswap,			TRUE,	"ZFS delete queue" },
	{ byteswap_uint8_array,		FALSE,	"zvol object" },
	{ zap_byteswap,			TRUE,	"zvol prop" },
	{ byteswap_uint8_array,		FALSE,	"other uint8[]" },
	{ byteswap_uint64_array,	FALSE,	"other uint64[]" },
	{ zap_byteswap,			TRUE,	"other ZAP" },
	{ zap_byteswap,			TRUE,	"persistent error log" },
};

int
dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset,
    void *tag, dmu_buf_t **dbp)
{
	dnode_t *dn;
	uint64_t blkid;
	dmu_buf_impl_t *db;
	int err;

	err = dnode_hold(os->os, object, FTAG, &dn);
	if (err)
		return (err);
	blkid = dbuf_whichblock(dn, offset);
	rw_enter(&dn->dn_struct_rwlock, RW_READER);
	db = dbuf_hold(dn, blkid, tag);
	rw_exit(&dn->dn_struct_rwlock);
	if (db == NULL) {
		err = EIO;
	} else {
		err = dbuf_read(db, NULL, DB_RF_CANFAIL);
		if (err) {
			dbuf_rele(db, tag);
			db = NULL;
		}
	}

	dnode_rele(dn, FTAG);
	*dbp = &db->db;
	return (err);
}

int
dmu_bonus_max(void)
{
	return (DN_MAX_BONUSLEN);
}

/*
 * returns ENOENT, EIO, or 0.
 */
int
dmu_bonus_hold(objset_t *os, uint64_t object, void *tag, dmu_buf_t **dbp)
{
	dnode_t *dn;
	int err, count;
	dmu_buf_impl_t *db;

	err = dnode_hold(os->os, object, FTAG, &dn);
	if (err)
		return (err);

	rw_enter(&dn->dn_struct_rwlock, RW_READER);
	if (dn->dn_bonus == NULL) {
		rw_exit(&dn->dn_struct_rwlock);
		rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
		if (dn->dn_bonus == NULL)
			dn->dn_bonus = dbuf_create_bonus(dn);
	}
	db = dn->dn_bonus;
	rw_exit(&dn->dn_struct_rwlock);
	mutex_enter(&db->db_mtx);
	count = refcount_add(&db->db_holds, tag);
	mutex_exit(&db->db_mtx);
	if (count == 1)
		dnode_add_ref(dn, db);
	dnode_rele(dn, FTAG);

	VERIFY(0 == dbuf_read(db, NULL, DB_RF_MUST_SUCCEED));

	*dbp = &db->db;
	return (0);
}

int
dmu_buf_hold_array(objset_t *os, uint64_t object, uint64_t offset,
    uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp)
{
	dnode_t *dn;
	dmu_buf_t **dbp;
	uint64_t blkid, nblks, i;
	uint32_t flags;
	int err;
	zio_t *zio;

	ASSERT(length <= DMU_MAX_ACCESS);

	if (length == 0) {
		if (numbufsp)
			*numbufsp = 0;
		*dbpp = NULL;
		return (0);
	}

	flags = DB_RF_CANFAIL | DB_RF_NEVERWAIT;
	if (length > zfetch_array_rd_sz)
		flags |= DB_RF_NOPREFETCH;

	err = dnode_hold(os->os, object, FTAG, &dn);
	if (err)
		return (err);

	rw_enter(&dn->dn_struct_rwlock, RW_READER);
	if (dn->dn_datablkshift) {
		int blkshift = dn->dn_datablkshift;
		nblks = (P2ROUNDUP(offset+length, 1ULL<<blkshift) -
		    P2ALIGN(offset, 1ULL<<blkshift)) >> blkshift;
	} else {
		ASSERT3U(offset + length, <=, dn->dn_datablksz);
		nblks = 1;
	}
	dbp = kmem_zalloc(sizeof (dmu_buf_t *) * nblks, KM_SLEEP);

	zio = zio_root(dn->dn_objset->os_spa, NULL, NULL, TRUE);
	blkid = dbuf_whichblock(dn, offset);
	for (i = 0; i < nblks; i++) {
		dmu_buf_impl_t *db = dbuf_hold(dn, blkid+i, tag);
		if (db == NULL) {
			rw_exit(&dn->dn_struct_rwlock);
			dmu_buf_rele_array(dbp, nblks, tag);
			dnode_rele(dn, FTAG);
			zio_nowait(zio);
			return (EIO);
		}
		/* initiate async i/o */
		if (read && db->db_state == DB_UNCACHED) {
			rw_exit(&dn->dn_struct_rwlock);
			(void) dbuf_read(db, zio, flags);
			rw_enter(&dn->dn_struct_rwlock, RW_READER);
		}
		dbp[i] = &db->db;
	}
	rw_exit(&dn->dn_struct_rwlock);
	dnode_rele(dn, FTAG);

	/* wait for async i/o */
	err = zio_wait(zio);
	if (err) {
		dmu_buf_rele_array(dbp, nblks, tag);
		return (err);
	}

	/* wait for other io to complete */
	if (read) {
		for (i = 0; i < nblks; i++) {
			dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbp[i];
			mutex_enter(&db->db_mtx);
			while (db->db_state == DB_READ ||
			    db->db_state == DB_FILL)
				cv_wait(&db->db_changed, &db->db_mtx);
			if (db->db_state == DB_UNCACHED)
				err = EIO;
			mutex_exit(&db->db_mtx);
			if (err) {
				dmu_buf_rele_array(dbp, nblks, tag);
				return (err);
			}
		}
	}

	*numbufsp = nblks;
	*dbpp = dbp;
	return (0);
}

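/*
 * Release the holds obtained from dmu_buf_hold_array() on each dbuf in the
 * array, then free the array itself.
 */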
void
dmu_buf_rele_array(dmu_buf_t **dbp_fake, int numbufs, void *tag)
{
	int i;
	dmu_buf_impl_t **dbp = (dmu_buf_impl_t **)dbp_fake;

	if (numbufs == 0)
		return;

	for (i = 0; i < numbufs; i++) {
		if (dbp[i])
			dbuf_rele(dbp[i], tag);
	}

	kmem_free(dbp, sizeof (dmu_buf_t *) * numbufs);
}

void
dmu_prefetch(objset_t *os, uint64_t object, uint64_t offset, uint64_t len)
{
	dnode_t *dn;
	uint64_t blkid;
	int nblks, i, err;

	if (len == 0) { /* they're interested in the bonus buffer */
		dn = os->os->os_meta_dnode;

		if (object == 0 || object >= DN_MAX_OBJECT)
			return;

		rw_enter(&dn->dn_struct_rwlock, RW_READER);
		blkid = dbuf_whichblock(dn, object * sizeof (dnode_phys_t));
		dbuf_prefetch(dn, blkid);
		rw_exit(&dn->dn_struct_rwlock);
		return;
	}

	/*
	 * XXX - Note, if the dnode for the requested object is not
	 * already cached, we will do a *synchronous* read in the
	 * dnode_hold() call.  The same is true for any indirects.
	 */
	err = dnode_hold(os->os, object, FTAG, &dn);
	if (err != 0)
		return;

	rw_enter(&dn->dn_struct_rwlock, RW_READER);
	if (dn->dn_datablkshift) {
		int blkshift = dn->dn_datablkshift;
		nblks = (P2ROUNDUP(offset+len, 1<<blkshift) -
		    P2ALIGN(offset, 1<<blkshift)) >> blkshift;
	} else {
		nblks = (offset < dn->dn_datablksz);
	}

	if (nblks != 0) {
		blkid = dbuf_whichblock(dn, offset);
		for (i = 0; i < nblks; i++)
			dbuf_prefetch(dn, blkid+i);
	}

	rw_exit(&dn->dn_struct_rwlock);

	dnode_rele(dn, FTAG);
}

int
dmu_free_range(objset_t *os, uint64_t object, uint64_t offset,
    uint64_t size, dmu_tx_t *tx)
{
	dnode_t *dn;
	int err = dnode_hold(os->os, object, FTAG, &dn);
	if (err)
		return (err);
	ASSERT(offset < UINT64_MAX);
	ASSERT(size == -1ULL || size <= UINT64_MAX - offset);
	dnode_free_range(dn, offset, size, tx);
	dnode_rele(dn, FTAG);
	return (0);
}

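/*
 * Read 'size' bytes starting at 'offset' from the given object into 'buf'.
 * For objects with a single (possibly odd-sized) block, any part of the
 * request beyond the end of that block is zero-filled.
 */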
int
dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
    void *buf)
{
	dnode_t *dn;
	dmu_buf_t **dbp;
	int numbufs, i, err;

	/*
	 * Deal with odd block sizes, where there can't be data past the
	 * first block.
	 */
	err = dnode_hold(os->os, object, FTAG, &dn);
	if (err)
		return (err);
	if (dn->dn_datablkshift == 0) {
		int newsz = offset > dn->dn_datablksz ? 0 :
		    MIN(size, dn->dn_datablksz - offset);
		bzero((char *)buf + newsz, size - newsz);
		size = newsz;
	}
	dnode_rele(dn, FTAG);

	while (size > 0) {
		uint64_t mylen = MIN(size, DMU_MAX_ACCESS / 2);
		int err;

		/*
		 * NB: we could do this block-at-a-time, but it's nice
		 * to be reading in parallel.
		 */
		err = dmu_buf_hold_array(os, object, offset, mylen,
		    TRUE, FTAG, &numbufs, &dbp);
		if (err)
			return (err);

		for (i = 0; i < numbufs; i++) {
			int tocpy;
			int bufoff;
			dmu_buf_t *db = dbp[i];

			ASSERT(size > 0);

			bufoff = offset - db->db_offset;
			tocpy = (int)MIN(db->db_size - bufoff, size);

			bcopy((char *)db->db_data + bufoff, buf, tocpy);

			offset += tocpy;
			size -= tocpy;
			buf = (char *)buf + tocpy;
		}
		dmu_buf_rele_array(dbp, numbufs, FTAG);
	}
	return (0);
}

void
dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
    const void *buf, dmu_tx_t *tx)
{
	dmu_buf_t **dbp;
	int numbufs, i;

	VERIFY(0 == dmu_buf_hold_array(os, object, offset, size,
	    FALSE, FTAG, &numbufs, &dbp));

	for (i = 0; i < numbufs; i++) {
		int tocpy;
		int bufoff;
		dmu_buf_t *db = dbp[i];

		ASSERT(size > 0);

		bufoff = offset - db->db_offset;
		tocpy = (int)MIN(db->db_size - bufoff, size);

		ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size);

		if (tocpy == db->db_size)
			dmu_buf_will_fill(db, tx);
		else
			dmu_buf_will_dirty(db, tx);

		bcopy(buf, (char *)db->db_data + bufoff, tocpy);

		if (tocpy == db->db_size)
			dmu_buf_fill_done(db, tx);

		offset += tocpy;
		size -= tocpy;
		buf = (char *)buf + tocpy;
	}
	dmu_buf_rele_array(dbp, numbufs, FTAG);
}

#ifdef _KERNEL
int
dmu_write_uio(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
    uio_t *uio, dmu_tx_t *tx)
{
	dmu_buf_t **dbp;
	int numbufs, i;
	int err = 0;

	err = dmu_buf_hold_array(os, object, offset, size,
	    FALSE, FTAG, &numbufs, &dbp);
	if (err)
		return (err);

	for (i = 0; i < numbufs; i++) {
		int tocpy;
		int bufoff;
		dmu_buf_t *db = dbp[i];

		ASSERT(size > 0);

		bufoff = offset - db->db_offset;
		tocpy = (int)MIN(db->db_size - bufoff, size);

		ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size);

		if (tocpy == db->db_size)
			dmu_buf_will_fill(db, tx);
		else
			dmu_buf_will_dirty(db, tx);

		/*
		 * XXX uiomove could block forever (eg. nfs-backed
		 * pages).  There needs to be a uiolockdown() function
		 * to lock the pages in memory, so that uiomove won't
		 * block.
		 */
		err = uiomove((char *)db->db_data + bufoff, tocpy,
		    UIO_WRITE, uio);

		if (tocpy == db->db_size)
			dmu_buf_fill_done(db, tx);

		if (err)
			break;

		offset += tocpy;
		size -= tocpy;
	}
	dmu_buf_rele_array(dbp, numbufs, FTAG);
	return (err);
}
#endif

/*
 * XXX move send/recv stuff to its own new file!
 */

struct backuparg {
	dmu_replay_record_t *drr;
	vnode_t *vp;
	objset_t *os;
	zio_cksum_t zc;
	int err;
};

static int
dump_bytes(struct backuparg *ba, void *buf, int len)
{
	ssize_t resid; /* have to get resid to get detailed errno */
	ASSERT3U(len % 8, ==, 0);

	fletcher_4_incremental_native(buf, len, &ba->zc);
	ba->err = vn_rdwr(UIO_WRITE, ba->vp,
	    (caddr_t)buf, len,
	    0, UIO_SYSSPACE, FAPPEND, RLIM64_INFINITY, CRED(), &resid);
	return (ba->err);
}

static int
dump_free(struct backuparg *ba, uint64_t object, uint64_t offset,
    uint64_t length)
{
	/* write a FREE record */
	bzero(ba->drr, sizeof (dmu_replay_record_t));
	ba->drr->drr_type = DRR_FREE;
	ba->drr->drr_u.drr_free.drr_object = object;
	ba->drr->drr_u.drr_free.drr_offset = offset;
	ba->drr->drr_u.drr_free.drr_length = length;

	if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)))
		return (EINTR);
	return (0);
}

static int
dump_data(struct backuparg *ba, dmu_object_type_t type,
    uint64_t object, uint64_t offset, int blksz, void *data)
{
	/* write a DATA record */
	bzero(ba->drr, sizeof (dmu_replay_record_t));
	ba->drr->drr_type = DRR_WRITE;
	ba->drr->drr_u.drr_write.drr_object = object;
	ba->drr->drr_u.drr_write.drr_type = type;
	ba->drr->drr_u.drr_write.drr_offset = offset;
	ba->drr->drr_u.drr_write.drr_length = blksz;

	if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)))
		return (EINTR);
	if (dump_bytes(ba, data, blksz))
		return (EINTR);
	return (0);
}

static int
dump_freeobjects(struct backuparg *ba, uint64_t firstobj, uint64_t numobjs)
{
	/* write a FREEOBJECTS record */
	bzero(ba->drr, sizeof (dmu_replay_record_t));
	ba->drr->drr_type = DRR_FREEOBJECTS;
	ba->drr->drr_u.drr_freeobjects.drr_firstobj = firstobj;
	ba->drr->drr_u.drr_freeobjects.drr_numobjs = numobjs;

	if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)))
		return (EINTR);
	return (0);
}

static int
dump_dnode(struct backuparg *ba, uint64_t object, dnode_phys_t *dnp)
{
	if (dnp == NULL || dnp->dn_type == DMU_OT_NONE)
		return (dump_freeobjects(ba, object, 1));

	/* write an OBJECT record */
	bzero(ba->drr, sizeof (dmu_replay_record_t));
	ba->drr->drr_type = DRR_OBJECT;
	ba->drr->drr_u.drr_object.drr_object = object;
	ba->drr->drr_u.drr_object.drr_type = dnp->dn_type;
	ba->drr->drr_u.drr_object.drr_bonustype = dnp->dn_bonustype;
	ba->drr->drr_u.drr_object.drr_blksz =
	    dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT;
	ba->drr->drr_u.drr_object.drr_bonuslen = dnp->dn_bonuslen;
	ba->drr->drr_u.drr_object.drr_checksum = dnp->dn_checksum;
	ba->drr->drr_u.drr_object.drr_compress = dnp->dn_compress;

	if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)))
		return (EINTR);

	if (dump_bytes(ba, DN_BONUS(dnp), P2ROUNDUP(dnp->dn_bonuslen, 8)))
		return (EINTR);

	/* free anything past the end of the file */
	if (dump_free(ba, object, (dnp->dn_maxblkid + 1) *
	    (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT), -1ULL))
		return (EINTR);
	if (ba->err)
		return (EINTR);
	return (0);
}

#define	BP_SPAN(dnp, level) \
	(((uint64_t)dnp->dn_datablkszsec) << (SPA_MINBLOCKSHIFT + \
	(level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT)))

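/*
 * Callback invoked by traverse_dsl_dataset() for each block visited;
 * emits the corresponding backup stream record (FREEOBJECTS, FREE,
 * OBJECT, or WRITE).
 */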
static int
backup_cb(traverse_blk_cache_t *bc, spa_t *spa, void *arg)
{
	struct backuparg *ba = arg;
	uint64_t object = bc->bc_bookmark.zb_object;
	int level = bc->bc_bookmark.zb_level;
	uint64_t blkid = bc->bc_bookmark.zb_blkid;
	blkptr_t *bp = bc->bc_blkptr.blk_birth ? &bc->bc_blkptr : NULL;
	dmu_object_type_t type = bp ? BP_GET_TYPE(bp) : DMU_OT_NONE;
	void *data = bc->bc_data;
	int err = 0;

	if (issig(JUSTLOOKING) && issig(FORREAL))
		return (EINTR);

	ASSERT(data || bp == NULL);

	if (bp == NULL && object == 0) {
		uint64_t span = BP_SPAN(bc->bc_dnode, level);
		uint64_t dnobj = (blkid * span) >> DNODE_SHIFT;
		err = dump_freeobjects(ba, dnobj, span >> DNODE_SHIFT);
	} else if (bp == NULL) {
		uint64_t span = BP_SPAN(bc->bc_dnode, level);
		err = dump_free(ba, object, blkid * span, span);
	} else if (data && level == 0 && type == DMU_OT_DNODE) {
		dnode_phys_t *blk = data;
		int i;
		int blksz = BP_GET_LSIZE(bp);

		for (i = 0; i < blksz >> DNODE_SHIFT; i++) {
			uint64_t dnobj =
			    (blkid << (DNODE_BLOCK_SHIFT - DNODE_SHIFT)) + i;
			err = dump_dnode(ba, dnobj, blk+i);
			if (err)
				break;
		}
	} else if (level == 0 &&
	    type != DMU_OT_DNODE && type != DMU_OT_OBJSET) {
		int blksz = BP_GET_LSIZE(bp);
		if (data == NULL) {
			arc_buf_t *abuf;
			zbookmark_t zb;

			zb.zb_objset = ba->os->os->os_dsl_dataset->ds_object;
			zb.zb_object = object;
			zb.zb_level = level;
			zb.zb_blkid = blkid;
			(void) arc_read(NULL, spa, bp,
			    dmu_ot[type].ot_byteswap, arc_getbuf_func, &abuf,
			    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_MUSTSUCCEED,
			    ARC_WAIT, &zb);

			if (abuf) {
				err = dump_data(ba, type, object, blkid * blksz,
				    blksz, abuf->b_data);
				(void) arc_buf_remove_ref(abuf, &abuf);
			}
		} else {
			err = dump_data(ba, type, object, blkid * blksz,
			    blksz, data);
		}
	}

	ASSERT(err == 0 || err == EINTR);
	return (err);
}

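/*
 * Write a backup stream for the snapshot 'tosnap' to the given vnode.
 * If 'fromsnap' is non-NULL, generate an incremental stream relative to
 * that earlier snapshot instead of a full stream.
 */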
int
dmu_sendbackup(objset_t *tosnap, objset_t *fromsnap, vnode_t *vp)
{
	dsl_dataset_t *ds = tosnap->os->os_dsl_dataset;
	dsl_dataset_t *fromds = fromsnap ? fromsnap->os->os_dsl_dataset : NULL;
	dmu_replay_record_t *drr;
	struct backuparg ba;
	int err;

	/* tosnap must be a snapshot */
	if (ds->ds_phys->ds_next_snap_obj == 0)
		return (EINVAL);

	/* fromsnap must be an earlier snapshot from the same fs as tosnap */
	if (fromds && (ds->ds_dir != fromds->ds_dir ||
	    fromds->ds_phys->ds_creation_txg >=
	    ds->ds_phys->ds_creation_txg))
		return (EXDEV);

	drr = kmem_zalloc(sizeof (dmu_replay_record_t), KM_SLEEP);
	drr->drr_type = DRR_BEGIN;
	drr->drr_u.drr_begin.drr_magic = DMU_BACKUP_MAGIC;
	drr->drr_u.drr_begin.drr_version = DMU_BACKUP_VERSION;
	drr->drr_u.drr_begin.drr_creation_time =
	    ds->ds_phys->ds_creation_time;
	drr->drr_u.drr_begin.drr_type = tosnap->os->os_phys->os_type;
	drr->drr_u.drr_begin.drr_toguid = ds->ds_phys->ds_guid;
	if (fromds)
		drr->drr_u.drr_begin.drr_fromguid = fromds->ds_phys->ds_guid;
	dsl_dataset_name(ds, drr->drr_u.drr_begin.drr_toname);

	ba.drr = drr;
	ba.vp = vp;
	ba.os = tosnap;
	ZIO_SET_CHECKSUM(&ba.zc, 0, 0, 0, 0);

	if (dump_bytes(&ba, drr, sizeof (dmu_replay_record_t))) {
		kmem_free(drr, sizeof (dmu_replay_record_t));
		return (ba.err);
	}

	err = traverse_dsl_dataset(ds,
	    fromds ? fromds->ds_phys->ds_creation_txg : 0,
	    ADVANCE_PRE | ADVANCE_HOLES | ADVANCE_DATA | ADVANCE_NOLOCK,
	    backup_cb, &ba);

	if (err) {
		if (err == EINTR && ba.err)
			err = ba.err;
		return (err);
	}

	bzero(drr, sizeof (dmu_replay_record_t));
	drr->drr_type = DRR_END;
	drr->drr_u.drr_end.drr_checksum = ba.zc;

	if (dump_bytes(&ba, drr, sizeof (dmu_replay_record_t)))
		return (ba.err);

	kmem_free(drr, sizeof (dmu_replay_record_t));

	return (0);
}

struct restorearg {
	int err;
	int byteswap;
	vnode_t *vp;
	char *buf;
	uint64_t voff;
	int buflen; /* number of valid bytes in buf */
	int bufoff; /* next offset to read */
	int bufsize; /* amount of memory allocated for buf */
	zio_cksum_t zc;
};

/* ARGSUSED */
static int
replay_incremental_check(void *arg1, void *arg2, dmu_tx_t *tx)
{
	dsl_dataset_t *ds = arg1;
	struct drr_begin *drrb = arg2;
	const char *snapname;
	int err;
	uint64_t val;

	/* must already be a snapshot of this fs */
	if (ds->ds_phys->ds_prev_snap_obj == 0)
		return (ENODEV);

	/* most recent snapshot must match fromguid */
	if (ds->ds_prev->ds_phys->ds_guid != drrb->drr_fromguid)
		return (ENODEV);
	/* must not have any changes since most recent snapshot */
	if (ds->ds_phys->ds_bp.blk_birth >
	    ds->ds_prev->ds_phys->ds_creation_txg)
		return (ETXTBSY);

	/* new snapshot name must not exist */
	snapname = strrchr(drrb->drr_toname, '@');
	if (snapname == NULL)
		return (EEXIST);

	snapname++;
	err = zap_lookup(ds->ds_dir->dd_pool->dp_meta_objset,
	    ds->ds_phys->ds_snapnames_zapobj, snapname, 8, 1, &val);
	if (err == 0)
		return (EEXIST);
	if (err != ENOENT)
		return (err);

	return (0);
}

/* ARGSUSED */
static void
replay_incremental_sync(void *arg1, void *arg2, dmu_tx_t *tx)
{
	dsl_dataset_t *ds = arg1;
	dmu_buf_will_dirty(ds->ds_dbuf, tx);
	ds->ds_phys->ds_flags |= DS_FLAG_INCONSISTENT;
}

/* ARGSUSED */
static int
replay_full_check(void *arg1, void *arg2, dmu_tx_t *tx)
{
	dsl_dir_t *dd = arg1;
	struct drr_begin *drrb = arg2;
	objset_t *mos = dd->dd_pool->dp_meta_objset;
	char *cp;
	uint64_t val;
	int err;

	cp = strchr(drrb->drr_toname, '@');
	*cp = '\0';
	err = zap_lookup(mos, dd->dd_phys->dd_child_dir_zapobj,
	    strrchr(drrb->drr_toname, '/') + 1,
	    sizeof (uint64_t), 1, &val);
	*cp = '@';

	if (err != ENOENT)
		return (err ? err : EEXIST);

	return (0);
}

static void
replay_full_sync(void *arg1, void *arg2, dmu_tx_t *tx)
{
	dsl_dir_t *dd = arg1;
	struct drr_begin *drrb = arg2;
	char *cp;
	dsl_dataset_t *ds;
	uint64_t dsobj;

	cp = strchr(drrb->drr_toname, '@');
	*cp = '\0';
	dsobj = dsl_dataset_create_sync(dd, strrchr(drrb->drr_toname, '/') + 1,
	    NULL, tx);
	*cp = '@';

	VERIFY(0 == dsl_dataset_open_obj(dd->dd_pool, dsobj, NULL,
	    DS_MODE_EXCLUSIVE, FTAG, &ds));

	(void) dmu_objset_create_impl(dsl_dataset_get_spa(ds),
	    ds, drrb->drr_type, tx);

	dmu_buf_will_dirty(ds->ds_dbuf, tx);
	ds->ds_phys->ds_flags |= DS_FLAG_INCONSISTENT;

	dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG);
}

static int
replay_end_check(void *arg1, void *arg2, dmu_tx_t *tx)
{
	objset_t *os = arg1;
	struct drr_begin *drrb = arg2;
	char *snapname;

	/* XXX verify that drr_toname is in dd */

	snapname = strchr(drrb->drr_toname, '@');
	if (snapname == NULL)
		return (EINVAL);
	snapname++;

	return (dsl_dataset_snapshot_check(os, snapname, tx));
}

static void
replay_end_sync(void *arg1, void *arg2, dmu_tx_t *tx)
{
	objset_t *os = arg1;
	struct drr_begin *drrb = arg2;
	char *snapname;
	dsl_dataset_t *ds, *hds;

	snapname = strchr(drrb->drr_toname, '@') + 1;

	dsl_dataset_snapshot_sync(os, snapname, tx);

	/* set snapshot's creation time and guid */
	hds = os->os->os_dsl_dataset;
	VERIFY(0 == dsl_dataset_open_obj(hds->ds_dir->dd_pool,
	    hds->ds_phys->ds_prev_snap_obj, NULL,
	    DS_MODE_PRIMARY | DS_MODE_READONLY | DS_MODE_INCONSISTENT,
	    FTAG, &ds));

	dmu_buf_will_dirty(ds->ds_dbuf, tx);
	ds->ds_phys->ds_creation_time = drrb->drr_creation_time;
	ds->ds_phys->ds_guid = drrb->drr_toguid;
	ds->ds_phys->ds_flags &= ~DS_FLAG_INCONSISTENT;

	dsl_dataset_close(ds, DS_MODE_PRIMARY, FTAG);

	dmu_buf_will_dirty(hds->ds_dbuf, tx);
	hds->ds_phys->ds_flags &= ~DS_FLAG_INCONSISTENT;
}

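/*
 * Return a pointer to the next 'len' bytes of the backup stream, refilling
 * ra->buf from the vnode as needed and folding the bytes into the running
 * checksum.  Returns NULL and sets ra->err on failure.
 */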
void *
restore_read(struct restorearg *ra, int len)
{
	void *rv;

	/* some things will require 8-byte alignment, so everything must */
	ASSERT3U(len % 8, ==, 0);

	while (ra->buflen - ra->bufoff < len) {
		ssize_t resid;
		int leftover = ra->buflen - ra->bufoff;

		(void) memmove(ra->buf, ra->buf + ra->bufoff, leftover);
		ra->err = vn_rdwr(UIO_READ, ra->vp,
		    (caddr_t)ra->buf + leftover, ra->bufsize - leftover,
		    ra->voff, UIO_SYSSPACE, FAPPEND,
		    RLIM64_INFINITY, CRED(), &resid);

		ra->voff += ra->bufsize - leftover - resid;
		ra->buflen = ra->bufsize - resid;
		ra->bufoff = 0;
		if (resid == ra->bufsize - leftover)
			ra->err = EINVAL;
		if (ra->err)
			return (NULL);
		/* Could compute checksum here? */
	}

	ASSERT3U(ra->bufoff % 8, ==, 0);
	ASSERT3U(ra->buflen - ra->bufoff, >=, len);
	rv = ra->buf + ra->bufoff;
	ra->bufoff += len;
	if (ra->byteswap)
		fletcher_4_incremental_byteswap(rv, len, &ra->zc);
	else
		fletcher_4_incremental_native(rv, len, &ra->zc);
	return (rv);
}

static void
backup_byteswap(dmu_replay_record_t *drr)
{
#define	DO64(X) (drr->drr_u.X = BSWAP_64(drr->drr_u.X))
#define	DO32(X) (drr->drr_u.X = BSWAP_32(drr->drr_u.X))
	drr->drr_type = BSWAP_32(drr->drr_type);
	switch (drr->drr_type) {
	case DRR_BEGIN:
		DO64(drr_begin.drr_magic);
		DO64(drr_begin.drr_version);
		DO64(drr_begin.drr_creation_time);
		DO32(drr_begin.drr_type);
		DO64(drr_begin.drr_toguid);
		DO64(drr_begin.drr_fromguid);
		break;
	case DRR_OBJECT:
		DO64(drr_object.drr_object);
		/* DO64(drr_object.drr_allocation_txg); */
		DO32(drr_object.drr_type);
		DO32(drr_object.drr_bonustype);
		DO32(drr_object.drr_blksz);
		DO32(drr_object.drr_bonuslen);
		break;
	case DRR_FREEOBJECTS:
		DO64(drr_freeobjects.drr_firstobj);
		DO64(drr_freeobjects.drr_numobjs);
		break;
	case DRR_WRITE:
		DO64(drr_write.drr_object);
		DO32(drr_write.drr_type);
		DO64(drr_write.drr_offset);
		DO64(drr_write.drr_length);
		break;
	case DRR_FREE:
		DO64(drr_free.drr_object);
		DO64(drr_free.drr_offset);
		DO64(drr_free.drr_length);
		break;
	case DRR_END:
		DO64(drr_end.drr_checksum.zc_word[0]);
		DO64(drr_end.drr_checksum.zc_word[1]);
		DO64(drr_end.drr_checksum.zc_word[2]);
		DO64(drr_end.drr_checksum.zc_word[3]);
		break;
	}
#undef DO64
#undef DO32
}

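/*
 * Process a DRR_OBJECT record: allocate or reclaim the target object with
 * the requested block size, bonus type/length, checksum and compression,
 * then restore its bonus buffer contents from the stream.
 */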
static int
restore_object(struct restorearg *ra, objset_t *os, struct drr_object *drro)
{
	int err;
	dmu_tx_t *tx;

	err = dmu_object_info(os, drro->drr_object, NULL);

	if (err != 0 && err != ENOENT)
		return (EINVAL);

	if (drro->drr_type == DMU_OT_NONE ||
	    drro->drr_type >= DMU_OT_NUMTYPES ||
	    drro->drr_bonustype >= DMU_OT_NUMTYPES ||
	    drro->drr_checksum >= ZIO_CHECKSUM_FUNCTIONS ||
	    drro->drr_compress >= ZIO_COMPRESS_FUNCTIONS ||
	    P2PHASE(drro->drr_blksz, SPA_MINBLOCKSIZE) ||
	    drro->drr_blksz < SPA_MINBLOCKSIZE ||
	    drro->drr_blksz > SPA_MAXBLOCKSIZE ||
	    drro->drr_bonuslen > DN_MAX_BONUSLEN) {
		return (EINVAL);
	}

	tx = dmu_tx_create(os);

	if (err == ENOENT) {
		/* currently free, want to be allocated */
		dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
		dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, 1);
		err = dmu_tx_assign(tx, TXG_WAIT);
		if (err) {
			dmu_tx_abort(tx);
			return (err);
		}
		err = dmu_object_claim(os, drro->drr_object,
		    drro->drr_type, drro->drr_blksz,
		    drro->drr_bonustype, drro->drr_bonuslen, tx);
	} else {
		/* currently allocated, want to be allocated */
		dmu_tx_hold_bonus(tx, drro->drr_object);
		/*
		 * We may change blocksize, so need to
		 * hold_write
		 */
		dmu_tx_hold_write(tx, drro->drr_object, 0, 1);
		err = dmu_tx_assign(tx, TXG_WAIT);
		if (err) {
			dmu_tx_abort(tx);
			return (err);
		}

		err = dmu_object_reclaim(os, drro->drr_object,
		    drro->drr_type, drro->drr_blksz,
		    drro->drr_bonustype, drro->drr_bonuslen, tx);
	}
	if (err) {
		dmu_tx_commit(tx);
		return (EINVAL);
	}

	dmu_object_set_checksum(os, drro->drr_object, drro->drr_checksum, tx);
	dmu_object_set_compress(os, drro->drr_object, drro->drr_compress, tx);

	if (drro->drr_bonuslen) {
		dmu_buf_t *db;
		void *data;
		VERIFY(0 == dmu_bonus_hold(os, drro->drr_object, FTAG, &db));
		dmu_buf_will_dirty(db, tx);

		ASSERT3U(db->db_size, ==, drro->drr_bonuslen);
		data = restore_read(ra, P2ROUNDUP(db->db_size, 8));
		if (data == NULL) {
			dmu_tx_commit(tx);
			return (ra->err);
		}
		bcopy(data, db->db_data, db->db_size);
		if (ra->byteswap) {
			dmu_ot[drro->drr_bonustype].ot_byteswap(db->db_data,
			    drro->drr_bonuslen);
		}
		dmu_buf_rele(db, FTAG);
	}
	dmu_tx_commit(tx);
	return (0);
}

/* ARGSUSED */
static int
restore_freeobjects(struct restorearg *ra, objset_t *os,
    struct drr_freeobjects *drrfo)
{
	uint64_t obj;

	if (drrfo->drr_firstobj + drrfo->drr_numobjs < drrfo->drr_firstobj)
		return (EINVAL);

	for (obj = drrfo->drr_firstobj;
	    obj < drrfo->drr_firstobj + drrfo->drr_numobjs; obj++) {
		dmu_tx_t *tx;
		int err;

		if (dmu_object_info(os, obj, NULL) != 0)
			continue;

		tx = dmu_tx_create(os);
		dmu_tx_hold_bonus(tx, obj);
		err = dmu_tx_assign(tx, TXG_WAIT);
		if (err) {
			dmu_tx_abort(tx);
			return (err);
		}
		err = dmu_object_free(os, obj, tx);
		dmu_tx_commit(tx);
		if (err && err != ENOENT)
			return (EINVAL);
	}
	return (0);
}

static int
restore_write(struct restorearg *ra, objset_t *os,
    struct drr_write *drrw)
{
	dmu_tx_t *tx;
	void *data;
	int err;

	if (drrw->drr_offset + drrw->drr_length < drrw->drr_offset ||
	    drrw->drr_type >= DMU_OT_NUMTYPES)
		return (EINVAL);

	data = restore_read(ra, drrw->drr_length);
	if (data == NULL)
		return (ra->err);

	if (dmu_object_info(os, drrw->drr_object, NULL) != 0)
		return (EINVAL);

	tx = dmu_tx_create(os);

	dmu_tx_hold_write(tx, drrw->drr_object,
	    drrw->drr_offset, drrw->drr_length);
	err = dmu_tx_assign(tx, TXG_WAIT);
	if (err) {
		dmu_tx_abort(tx);
		return (err);
	}
	if (ra->byteswap)
		dmu_ot[drrw->drr_type].ot_byteswap(data, drrw->drr_length);
	dmu_write(os, drrw->drr_object,
	    drrw->drr_offset, drrw->drr_length, data, tx);
	dmu_tx_commit(tx);
	return (0);
}

/* ARGSUSED */
static int
restore_free(struct restorearg *ra, objset_t *os,
    struct drr_free *drrf)
{
	dmu_tx_t *tx;
	int err;

	if (drrf->drr_length != -1ULL &&
	    drrf->drr_offset + drrf->drr_length < drrf->drr_offset)
		return (EINVAL);

	if (dmu_object_info(os, drrf->drr_object, NULL) != 0)
		return (EINVAL);

	tx = dmu_tx_create(os);

	dmu_tx_hold_free(tx, drrf->drr_object,
	    drrf->drr_offset, drrf->drr_length);
	err = dmu_tx_assign(tx, TXG_WAIT);
	if (err) {
		dmu_tx_abort(tx);
		return (err);
	}
	err = dmu_free_range(os, drrf->drr_object,
	    drrf->drr_offset, drrf->drr_length, tx);
	dmu_tx_commit(tx);
	return (err);
}

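/*
 * Receive a backup stream from the given vnode and apply it, creating a
 * new dataset (full stream) or updating an existing one (incremental
 * stream).  If sizep is non-NULL, it is set to the vnode offset reached
 * after processing the stream.
 */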
int
dmu_recvbackup(char *tosnap, struct drr_begin *drrb, uint64_t *sizep,
    vnode_t *vp, uint64_t voffset)
{
	struct restorearg ra;
	dmu_replay_record_t *drr;
	char *cp;
	objset_t *os = NULL;
	zio_cksum_t pzc;

	bzero(&ra, sizeof (ra));
	ra.vp = vp;
	ra.voff = voffset;
	ra.bufsize = 1<<20;
	ra.buf = kmem_alloc(ra.bufsize, KM_SLEEP);

	if (drrb->drr_magic == DMU_BACKUP_MAGIC) {
		ra.byteswap = FALSE;
	} else if (drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC)) {
		ra.byteswap = TRUE;
	} else {
		ra.err = EINVAL;
		goto out;
	}

	/*
	 * NB: this assumes that struct drr_begin will be the largest in
	 * dmu_replay_record_t's drr_u, and thus we don't need to pad it
	 * with zeros to make it the same length as we wrote out.
	 */
	((dmu_replay_record_t *)ra.buf)->drr_type = DRR_BEGIN;
	((dmu_replay_record_t *)ra.buf)->drr_pad = 0;
	((dmu_replay_record_t *)ra.buf)->drr_u.drr_begin = *drrb;
	if (ra.byteswap) {
		fletcher_4_incremental_byteswap(ra.buf,
		    sizeof (dmu_replay_record_t), &ra.zc);
	} else {
		fletcher_4_incremental_native(ra.buf,
		    sizeof (dmu_replay_record_t), &ra.zc);
	}
	(void) strcpy(drrb->drr_toname, tosnap); /* for the sync funcs */

	if (ra.byteswap) {
		drrb->drr_magic = BSWAP_64(drrb->drr_magic);
		drrb->drr_version = BSWAP_64(drrb->drr_version);
		drrb->drr_creation_time = BSWAP_64(drrb->drr_creation_time);
		drrb->drr_type = BSWAP_32(drrb->drr_type);
		drrb->drr_toguid = BSWAP_64(drrb->drr_toguid);
		drrb->drr_fromguid = BSWAP_64(drrb->drr_fromguid);
	}

	ASSERT3U(drrb->drr_magic, ==, DMU_BACKUP_MAGIC);

	if (drrb->drr_version != DMU_BACKUP_VERSION ||
	    drrb->drr_type >= DMU_OST_NUMTYPES ||
	    strchr(drrb->drr_toname, '@') == NULL) {
		ra.err = EINVAL;
		goto out;
	}

	/*
	 * Process the begin in syncing context.
	 */
	if (drrb->drr_fromguid) {
		/* incremental backup */
		dsl_dataset_t *ds = NULL;

		cp = strchr(tosnap, '@');
		*cp = '\0';
		ra.err = dsl_dataset_open(tosnap, DS_MODE_EXCLUSIVE, FTAG, &ds);
		*cp = '@';
		if (ra.err)
			goto out;

		ra.err = dsl_sync_task_do(ds->ds_dir->dd_pool,
		    replay_incremental_check, replay_incremental_sync,
		    ds, drrb, 1);
		dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG);
	} else {
		/* full backup */
		dsl_dir_t *dd = NULL;
		const char *tail;

		/* can't restore full backup into topmost fs, for now */
		if (strrchr(drrb->drr_toname, '/') == NULL) {
			ra.err = EINVAL;
			goto out;
		}

		cp = strchr(tosnap, '@');
		*cp = '\0';
		ra.err = dsl_dir_open(tosnap, FTAG, &dd, &tail);
		*cp = '@';
		if (ra.err)
			goto out;
		if (tail == NULL) {
			ra.err = EEXIST;
			goto out;
		}

		ra.err = dsl_sync_task_do(dd->dd_pool, replay_full_check,
		    replay_full_sync, dd, drrb, 5);
		dsl_dir_close(dd, FTAG);
	}
	if (ra.err)
		goto out;

	/*
	 * Open the objset we are modifying.
	 */

	cp = strchr(tosnap, '@');
	*cp = '\0';
	ra.err = dmu_objset_open(tosnap, DMU_OST_ANY,
	    DS_MODE_PRIMARY | DS_MODE_INCONSISTENT, &os);
	*cp = '@';
	ASSERT3U(ra.err, ==, 0);

	/*
	 * Read records and process them.
	 */
	pzc = ra.zc;
	while (ra.err == 0 &&
	    NULL != (drr = restore_read(&ra, sizeof (*drr)))) {
		if (issig(JUSTLOOKING) && issig(FORREAL)) {
			ra.err = EINTR;
			goto out;
		}

		if (ra.byteswap)
			backup_byteswap(drr);

		switch (drr->drr_type) {
1283 */ 1284 struct drr_object drro = drr->drr_u.drr_object; 1285 ra.err = restore_object(&ra, os, &drro); 1286 break; 1287 } 1288 case DRR_FREEOBJECTS: 1289 { 1290 struct drr_freeobjects drrfo = 1291 drr->drr_u.drr_freeobjects; 1292 ra.err = restore_freeobjects(&ra, os, &drrfo); 1293 break; 1294 } 1295 case DRR_WRITE: 1296 { 1297 struct drr_write drrw = drr->drr_u.drr_write; 1298 ra.err = restore_write(&ra, os, &drrw); 1299 break; 1300 } 1301 case DRR_FREE: 1302 { 1303 struct drr_free drrf = drr->drr_u.drr_free; 1304 ra.err = restore_free(&ra, os, &drrf); 1305 break; 1306 } 1307 case DRR_END: 1308 { 1309 struct drr_end drre = drr->drr_u.drr_end; 1310 /* 1311 * We compare against the *previous* checksum 1312 * value, because the stored checksum is of 1313 * everything before the DRR_END record. 1314 */ 1315 if (drre.drr_checksum.zc_word[0] != 0 && 1316 ((drre.drr_checksum.zc_word[0] - pzc.zc_word[0]) | 1317 (drre.drr_checksum.zc_word[1] - pzc.zc_word[1]) | 1318 (drre.drr_checksum.zc_word[2] - pzc.zc_word[2]) | 1319 (drre.drr_checksum.zc_word[3] - pzc.zc_word[3]))) { 1320 ra.err = ECKSUM; 1321 goto out; 1322 } 1323 1324 ra.err = dsl_sync_task_do(dmu_objset_ds(os)-> 1325 ds_dir->dd_pool, replay_end_check, replay_end_sync, 1326 os, drrb, 3); 1327 goto out; 1328 } 1329 default: 1330 ra.err = EINVAL; 1331 goto out; 1332 } 1333 pzc = ra.zc; 1334 } 1335 1336 out: 1337 if (os) 1338 dmu_objset_close(os); 1339 1340 /* 1341 * Make sure we don't rollback/destroy unless we actually 1342 * processed the begin properly. 'os' will only be set if this 1343 * is the case. 1344 */ 1345 if (ra.err && os && tosnap && strchr(tosnap, '@')) { 1346 /* 1347 * rollback or destroy what we created, so we don't 1348 * leave it in the restoring state. 1349 */ 1350 dsl_dataset_t *ds; 1351 int err; 1352 1353 cp = strchr(tosnap, '@'); 1354 *cp = '\0'; 1355 err = dsl_dataset_open(tosnap, 1356 DS_MODE_EXCLUSIVE | DS_MODE_INCONSISTENT, 1357 FTAG, &ds); 1358 if (err == 0) { 1359 txg_wait_synced(ds->ds_dir->dd_pool, 0); 1360 if (drrb->drr_fromguid) { 1361 /* incremental: rollback to most recent snap */ 1362 (void) dsl_dataset_rollback(ds); 1363 dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG); 1364 } else { 1365 /* full: destroy whole fs */ 1366 dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG); 1367 (void) dsl_dataset_destroy(tosnap); 1368 } 1369 } 1370 *cp = '@'; 1371 } 1372 1373 kmem_free(ra.buf, ra.bufsize); 1374 if (sizep) 1375 *sizep = ra.voff; 1376 return (ra.err); 1377 } 1378 1379 /* 1380 * Intent log support: sync the block at <os, object, offset> to disk. 1381 * N.B. and XXX: the caller is responsible for serializing dmu_sync()s 1382 * of the same block, and for making sure that the data isn't changing 1383 * while dmu_sync() is writing it. 1384 * 1385 * Return values: 1386 * 1387 * EALREADY: this txg has already been synced, so there's nothing to to. 1388 * The caller should not log the write. 1389 * 1390 * ENOENT: the block was dbuf_free_range()'d, so there's nothing to do. 1391 * The caller should not log the write. 1392 * 1393 * EINPROGRESS: the block is in the process of being synced by the 1394 * usual mechanism (spa_sync()), so we can't sync it here. 1395 * The caller should txg_wait_synced() and not log the write. 1396 * 1397 * EBUSY: another thread is trying to dmu_sync() the same dbuf. 1398 * (This case cannot arise under the current locking rules.) 1399 * The caller should txg_wait_synced() and not log the write. 
/*
 * Intent log support: sync the block at <os, object, offset> to disk.
 * N.B. and XXX: the caller is responsible for serializing dmu_sync()s
 * of the same block, and for making sure that the data isn't changing
 * while dmu_sync() is writing it.
 *
 * Return values:
 *
 *	EALREADY: this txg has already been synced, so there's nothing to do.
 *		The caller should not log the write.
 *
 *	ENOENT: the block was dbuf_free_range()'d, so there's nothing to do.
 *		The caller should not log the write.
 *
 *	EINPROGRESS: the block is in the process of being synced by the
 *		usual mechanism (spa_sync()), so we can't sync it here.
 *		The caller should txg_wait_synced() and not log the write.
 *
 *	EBUSY: another thread is trying to dmu_sync() the same dbuf.
 *		(This case cannot arise under the current locking rules.)
 *		The caller should txg_wait_synced() and not log the write.
 *
 *	ESTALE: the block was dirtied or freed while we were writing it,
 *		so the data is no longer valid.
 *		The caller should txg_wait_synced() and not log the write.
 *
 *	0: success.  Sets *bp to the blkptr just written, and sets
 *		*blkoff to the data's offset within that block.
 *		The caller should log this blkptr/blkoff in its lr_write_t.
 */
int
dmu_sync(objset_t *os, uint64_t object, uint64_t offset, uint64_t *blkoff,
    blkptr_t *bp, uint64_t txg)
{
	objset_impl_t *osi = os->os;
	dsl_pool_t *dp = osi->os_dsl_dataset->ds_dir->dd_pool;
	tx_state_t *tx = &dp->dp_tx;
	dmu_buf_impl_t *db;
	blkptr_t *blk;
	int err;
	zbookmark_t zb;

	ASSERT(RW_LOCK_HELD(&tx->tx_suspend));
	ASSERT(BP_IS_HOLE(bp));
	ASSERT(txg != 0);

	dprintf("dmu_sync txg=%llu, s,o,q %llu %llu %llu\n",
	    txg, tx->tx_synced_txg, tx->tx_open_txg, tx->tx_quiesced_txg);

	/*
	 * XXX why is this routine using dmu_buf_*() and casting between
	 * dmu_buf_impl_t and dmu_buf_t?
	 */

	/*
	 * If this txg already synced, there's nothing to do.
	 */
	if (txg <= tx->tx_synced_txg) {
		/*
		 * If we're running ziltest, we need the blkptr regardless.
		 */
		if (txg > spa_freeze_txg(dp->dp_spa)) {
			err = dmu_buf_hold(os, object, offset,
			    FTAG, (dmu_buf_t **)&db);
			if (err)
				return (err);
			/* if db_blkptr == NULL, this was an empty write */
			if (db->db_blkptr)
				*bp = *db->db_blkptr; /* structure assignment */
			else
				bzero(bp, sizeof (blkptr_t));
			*blkoff = offset - db->db.db_offset;
			ASSERT3U(*blkoff, <, db->db.db_size);
			dmu_buf_rele((dmu_buf_t *)db, FTAG);
			return (0);
		}
		return (EALREADY);
	}

	/*
	 * If this txg is in the middle of syncing, just wait for it.
	 */
	if (txg == tx->tx_syncing_txg) {
		ASSERT(txg != tx->tx_open_txg);
		return (EINPROGRESS);
	}

	err = dmu_buf_hold(os, object, offset, FTAG, (dmu_buf_t **)&db);
	if (err)
		return (err);

	mutex_enter(&db->db_mtx);

	/*
	 * If this dbuf isn't dirty, must have been free_range'd.
	 * There's no need to log writes to freed blocks, so we're done.
	 */
	if (!list_link_active(&db->db_dirty_node[txg&TXG_MASK])) {
		mutex_exit(&db->db_mtx);
		dmu_buf_rele((dmu_buf_t *)db, FTAG);
		return (ENOENT);
	}

	blk = db->db_d.db_overridden_by[txg&TXG_MASK];

	/*
	 * If we already did a dmu_sync() of this dbuf in this txg,
	 * free the old block before writing the new one.
	 */
	if (blk != NULL) {
		ASSERT(blk != IN_DMU_SYNC);
		if (blk == IN_DMU_SYNC) {
			mutex_exit(&db->db_mtx);
			dmu_buf_rele((dmu_buf_t *)db, FTAG);
			return (EBUSY);
		}
		arc_release(db->db_d.db_data_old[txg&TXG_MASK], db);
		if (!BP_IS_HOLE(blk)) {
			(void) arc_free(NULL, osi->os_spa, txg, blk,
			    NULL, NULL, ARC_WAIT);
		}
		kmem_free(blk, sizeof (blkptr_t));
	}

	db->db_d.db_overridden_by[txg&TXG_MASK] = IN_DMU_SYNC;
	mutex_exit(&db->db_mtx);

	blk = kmem_alloc(sizeof (blkptr_t), KM_SLEEP);
	blk->blk_birth = 0; /* mark as invalid */

	zb.zb_objset = osi->os_dsl_dataset->ds_object;
	zb.zb_object = db->db.db_object;
	zb.zb_level = db->db_level;
	zb.zb_blkid = db->db_blkid;
	err = arc_write(NULL, osi->os_spa,
	    zio_checksum_select(db->db_dnode->dn_checksum, osi->os_checksum),
	    zio_compress_select(db->db_dnode->dn_compress, osi->os_compress),
	    dmu_get_replication_level(osi->os_spa, &zb, db->db_dnode->dn_type),
	    txg, blk, db->db_d.db_data_old[txg&TXG_MASK], NULL, NULL,
	    ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, ARC_WAIT, &zb);
	ASSERT(err == 0);

	if (!BP_IS_HOLE(blk)) {
		blk->blk_fill = 1;
		BP_SET_TYPE(blk, db->db_dnode->dn_type);
		BP_SET_LEVEL(blk, 0);
	}

	/* copy the block pointer back to caller */
	*bp = *blk; /* structure assignment */
	*blkoff = offset - db->db.db_offset;
	ASSERT3U(*blkoff, <, db->db.db_size);

	mutex_enter(&db->db_mtx);
	if (db->db_d.db_overridden_by[txg&TXG_MASK] != IN_DMU_SYNC) {
		/* we were dirtied/freed during the sync */
		ASSERT3P(db->db_d.db_overridden_by[txg&TXG_MASK], ==, NULL);
		arc_release(db->db_d.db_data_old[txg&TXG_MASK], db);
		mutex_exit(&db->db_mtx);
		dmu_buf_rele((dmu_buf_t *)db, FTAG);
		/* Note that this block does not free on disk until txg syncs */

		/*
		 * XXX can we use ARC_NOWAIT here?
		 * XXX should we be ignoring the return code?
		 */
		if (!BP_IS_HOLE(blk)) {
			(void) arc_free(NULL, osi->os_spa, txg, blk,
			    NULL, NULL, ARC_WAIT);
		}
		kmem_free(blk, sizeof (blkptr_t));
		return (ESTALE);
	}

	db->db_d.db_overridden_by[txg&TXG_MASK] = blk;
	mutex_exit(&db->db_mtx);
	dmu_buf_rele((dmu_buf_t *)db, FTAG);
	ASSERT3U(txg, >, tx->tx_syncing_txg);
	return (0);
}

uint64_t
dmu_object_max_nonzero_offset(objset_t *os, uint64_t object)
{
	dnode_t *dn;

	/* XXX assumes dnode_hold will not get an i/o error */
	(void) dnode_hold(os->os, object, FTAG, &dn);
	uint64_t rv = dnode_max_nonzero_offset(dn);
	dnode_rele(dn, FTAG);
	return (rv);
}

int
dmu_object_set_blocksize(objset_t *os, uint64_t object, uint64_t size, int ibs,
    dmu_tx_t *tx)
{
	dnode_t *dn;
	int err;

	err = dnode_hold(os->os, object, FTAG, &dn);
	if (err)
		return (err);
	err = dnode_set_blksz(dn, size, ibs, tx);
	dnode_rele(dn, FTAG);
	return (err);
}

void
dmu_object_set_checksum(objset_t *os, uint64_t object, uint8_t checksum,
    dmu_tx_t *tx)
{
	dnode_t *dn;

	/* XXX assumes dnode_hold will not get an i/o error */
	(void) dnode_hold(os->os, object, FTAG, &dn);
	ASSERT(checksum < ZIO_CHECKSUM_FUNCTIONS);
	dn->dn_checksum = checksum;
	dnode_setdirty(dn, tx);
	dnode_rele(dn, FTAG);
}

void
dmu_object_set_compress(objset_t *os, uint64_t object, uint8_t compress,
    dmu_tx_t *tx)
{
	dnode_t *dn;

	/* XXX assumes dnode_hold will not get an i/o error */
	(void) dnode_hold(os->os, object, FTAG, &dn);
	ASSERT(compress < ZIO_COMPRESS_FUNCTIONS);
	dn->dn_compress = compress;
	dnode_setdirty(dn, tx);
	dnode_rele(dn, FTAG);
}

/*
 * XXX - eventually, this should take into account per-dataset (or
 * even per-object?) user requests for higher levels of replication.
 */
int
dmu_get_replication_level(spa_t *spa, zbookmark_t *zb, dmu_object_type_t ot)
{
	int ncopies = 1;

	if (dmu_ot[ot].ot_metadata)
		ncopies++;
	if (zb->zb_level != 0)
		ncopies++;
	if (zb->zb_objset == 0 && zb->zb_object == 0)
		ncopies++;
	return (MIN(ncopies, spa_max_replication(spa)));
}

int
dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole, uint64_t *off)
{
	dnode_t *dn;
	int i, err;

	err = dnode_hold(os->os, object, FTAG, &dn);
	if (err)
		return (err);
	/*
	 * Sync any current changes before
	 * we go trundling through the block pointers.
	 */
	for (i = 0; i < TXG_SIZE; i++) {
		if (list_link_active(&dn->dn_dirty_link[i]))
			break;
	}
	if (i != TXG_SIZE) {
		dnode_rele(dn, FTAG);
		txg_wait_synced(dmu_objset_pool(os), 0);
		err = dnode_hold(os->os, object, FTAG, &dn);
		if (err)
			return (err);
	}

	err = dnode_next_offset(dn, hole, off, 1, 1);
	dnode_rele(dn, FTAG);

	return (err);
}

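/*
 * Fill in a dmu_object_info_t from the given in-core dnode.
 */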
void
dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi)
{
	rw_enter(&dn->dn_struct_rwlock, RW_READER);
	mutex_enter(&dn->dn_mtx);

	doi->doi_data_block_size = dn->dn_datablksz;
	doi->doi_metadata_block_size = dn->dn_indblkshift ?
	    1ULL << dn->dn_indblkshift : 0;
	doi->doi_indirection = dn->dn_nlevels;
	doi->doi_checksum = dn->dn_checksum;
	doi->doi_compress = dn->dn_compress;
	doi->doi_physical_blks = (DN_USED_BYTES(dn->dn_phys) +
	    SPA_MINBLOCKSIZE/2) >> SPA_MINBLOCKSHIFT;
	doi->doi_max_block_offset = dn->dn_phys->dn_maxblkid;
	doi->doi_type = dn->dn_type;
	doi->doi_bonus_size = dn->dn_bonuslen;
	doi->doi_bonus_type = dn->dn_bonustype;

	mutex_exit(&dn->dn_mtx);
	rw_exit(&dn->dn_struct_rwlock);
}

/*
 * Get information on a DMU object.
 * If doi is NULL, just indicates whether the object exists.
 */
int
dmu_object_info(objset_t *os, uint64_t object, dmu_object_info_t *doi)
{
	dnode_t *dn;
	int err = dnode_hold(os->os, object, FTAG, &dn);

	if (err)
		return (err);

	if (doi != NULL)
		dmu_object_info_from_dnode(dn, doi);

	dnode_rele(dn, FTAG);
	return (0);
}

/*
 * As above, but faster; can be used when you have a held dbuf in hand.
 */
void
dmu_object_info_from_db(dmu_buf_t *db, dmu_object_info_t *doi)
{
	dmu_object_info_from_dnode(((dmu_buf_impl_t *)db)->db_dnode, doi);
}

/*
 * Faster still when you only care about the size.
 * This is specifically optimized for zfs_getattr().
 */
void
dmu_object_size_from_db(dmu_buf_t *db, uint32_t *blksize, u_longlong_t *nblk512)
{
	dnode_t *dn = ((dmu_buf_impl_t *)db)->db_dnode;

	*blksize = dn->dn_datablksz;
	/* add 1 for dnode space */
	*nblk512 = ((DN_USED_BYTES(dn->dn_phys) + SPA_MINBLOCKSIZE/2) >>
	    SPA_MINBLOCKSHIFT) + 1;
}

/*
 * Given a bookmark, return the name of the dataset, object, and range in
 * human-readable format.
 */
int
spa_bookmark_name(spa_t *spa, zbookmark_t *zb, char *dsname, size_t dslen,
    char *objname, size_t objlen, char *range, size_t rangelen)
{
	dsl_pool_t *dp;
	dsl_dataset_t *ds = NULL;
	objset_t *os = NULL;
	dnode_t *dn = NULL;
	int err, shift;

	if (dslen < MAXNAMELEN || objlen < 32 || rangelen < 64)
		return (ENOSPC);

	dp = spa_get_dsl(spa);
	if (zb->zb_objset != 0) {
		rw_enter(&dp->dp_config_rwlock, RW_READER);
		err = dsl_dataset_open_obj(dp, zb->zb_objset,
		    NULL, DS_MODE_NONE, FTAG, &ds);
		if (err) {
			rw_exit(&dp->dp_config_rwlock);
			return (err);
		}
		dsl_dataset_name(ds, dsname);
		dsl_dataset_close(ds, DS_MODE_NONE, FTAG);
		rw_exit(&dp->dp_config_rwlock);

		err = dmu_objset_open(dsname, DMU_OST_ANY, DS_MODE_NONE, &os);
		if (err)
			goto out;
	} else {
		dsl_dataset_name(NULL, dsname);
		os = dp->dp_meta_objset;
	}

	if (zb->zb_object == DMU_META_DNODE_OBJECT) {
		(void) strncpy(objname, "mdn", objlen);
	} else {
		(void) snprintf(objname, objlen, "%lld",
		    (longlong_t)zb->zb_object);
	}

	err = dnode_hold(os->os, zb->zb_object, FTAG, &dn);
	if (err)
		goto out;

	shift = (dn->dn_datablkshift ? dn->dn_datablkshift :
	    SPA_MAXBLOCKSHIFT) +
	    zb->zb_level * (dn->dn_indblkshift - SPA_BLKPTRSHIFT);
	(void) snprintf(range, rangelen, "%llu-%llu",
	    (u_longlong_t)(zb->zb_blkid << shift),
	    (u_longlong_t)((zb->zb_blkid+1) << shift));

out:
	if (dn)
		dnode_rele(dn, FTAG);
	if (os && os != dp->dp_meta_objset)
		dmu_objset_close(os);
	return (err);
}

void
byteswap_uint64_array(void *vbuf, size_t size)
{
	uint64_t *buf = vbuf;
	size_t count = size >> 3;
	int i;

	ASSERT((size & 7) == 0);

	for (i = 0; i < count; i++)
		buf[i] = BSWAP_64(buf[i]);
}

void
byteswap_uint32_array(void *vbuf, size_t size)
{
	uint32_t *buf = vbuf;
	size_t count = size >> 2;
	int i;

	ASSERT((size & 3) == 0);

	for (i = 0; i < count; i++)
		buf[i] = BSWAP_32(buf[i]);
}

void
byteswap_uint16_array(void *vbuf, size_t size)
{
	uint16_t *buf = vbuf;
	size_t count = size >> 1;
	int i;

	ASSERT((size & 1) == 0);

	for (i = 0; i < count; i++)
		buf[i] = BSWAP_16(buf[i]);
}

/* ARGSUSED */
void
byteswap_uint8_array(void *vbuf, size_t size)
{
}

void
dmu_init(void)
{
	dbuf_init();
	dnode_init();
	arc_init();
}

void
dmu_fini(void)
{
	arc_fini();
	dnode_fini();
	dbuf_fini();
}