/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License"). You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

#include <sys/dmu.h>
#include <sys/dmu_impl.h>
#include <sys/dmu_tx.h>
#include <sys/dbuf.h>
#include <sys/dnode.h>
#include <sys/zfs_context.h>
#include <sys/dmu_objset.h>
#include <sys/dmu_traverse.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_dir.h>
#include <sys/dsl_pool.h>
#include <sys/dmu_zfetch.h>
#include <sys/zfs_ioctl.h>
#include <sys/zap.h>

const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = {
	{ byteswap_uint8_array, TRUE, "unallocated" },
	{ zap_byteswap, TRUE, "object directory" },
	{ byteswap_uint64_array, TRUE, "object array" },
	{ byteswap_uint8_array, TRUE, "packed nvlist" },
	{ byteswap_uint64_array, TRUE, "packed nvlist size" },
	{ byteswap_uint64_array, TRUE, "bplist" },
	{ byteswap_uint64_array, TRUE, "bplist header" },
	{ byteswap_uint64_array, TRUE, "SPA space map header" },
	{ byteswap_uint64_array, TRUE, "SPA space map" },
	{ byteswap_uint64_array, TRUE, "ZIL intent log" },
	{ dnode_buf_byteswap, TRUE, "DMU dnode" },
	{ dmu_objset_byteswap, TRUE, "DMU objset" },
	{ byteswap_uint64_array, TRUE, "DSL directory" },
	{ zap_byteswap, TRUE, "DSL directory child map" },
	{ zap_byteswap, TRUE, "DSL dataset snap map" },
	{ zap_byteswap, TRUE, "DSL props" },
	{ byteswap_uint64_array, TRUE, "DSL dataset" },
	{ zfs_znode_byteswap, TRUE, "ZFS znode" },
	{ zfs_acl_byteswap, TRUE, "ZFS ACL" },
	{ byteswap_uint8_array, FALSE, "ZFS plain file" },
	{ zap_byteswap, TRUE, "ZFS directory" },
	{ zap_byteswap, TRUE, "ZFS master node" },
	{ zap_byteswap, TRUE, "ZFS delete queue" },
	{ byteswap_uint8_array, FALSE, "zvol object" },
	{ zap_byteswap, TRUE, "zvol prop" },
	{ byteswap_uint8_array, FALSE, "other uint8[]" },
	{ byteswap_uint64_array, FALSE, "other uint64[]" },
	{ zap_byteswap, TRUE, "other ZAP" },
};

static int
dmu_buf_read_array_impl(dmu_buf_impl_t **dbp, int numbufs, uint32_t flags)
{
	int i, err = 0;
	dnode_t *dn;
	zio_t *zio;
	int canfail;
	uint64_t rd_sz;

	if (numbufs == 0)
		return (0);

	rd_sz = numbufs * dbp[0]->db.db_size;
	ASSERT(rd_sz <= DMU_MAX_ACCESS);

	dn = dbp[0]->db_dnode;
	if (flags & DB_RF_CANFAIL) {
		canfail = 1;
	} else {
		canfail = 0;
	}
	zio = zio_root(dn->dn_objset->os_spa, NULL, NULL, canfail);

	/* don't prefetch if the read is large */
	if (rd_sz >= zfetch_array_rd_sz) {
		flags |= DB_RF_NOPREFETCH;
	}

	/* initiate async reads */
	rw_enter(&dn->dn_struct_rwlock, RW_READER);
	for (i = 0; i < numbufs; i++) {
		if (dbp[i]->db_state == DB_UNCACHED)
			dbuf_read_impl(dbp[i], zio, flags);
	}
	rw_exit(&dn->dn_struct_rwlock);
	err = zio_wait(zio);

	if (err)
		return (err);

	/* wait for other io to complete */
	for (i = 0; i < numbufs; i++) {
		mutex_enter(&dbp[i]->db_mtx);
		while (dbp[i]->db_state == DB_READ ||
		    dbp[i]->db_state == DB_FILL)
			cv_wait(&dbp[i]->db_changed, &dbp[i]->db_mtx);
		ASSERT(dbp[i]->db_state == DB_CACHED);
		mutex_exit(&dbp[i]->db_mtx);
	}

	return (0);
}

void
dmu_buf_read_array(dmu_buf_t **dbp_fake, int numbufs)
{
	dmu_buf_impl_t **dbp = (dmu_buf_impl_t **)dbp_fake;
	int err;

	err = dmu_buf_read_array_impl(dbp, numbufs, DB_RF_MUST_SUCCEED);
	ASSERT(err == 0);
}

int
dmu_buf_read_array_canfail(dmu_buf_t **dbp_fake, int numbufs)
{
	dmu_buf_impl_t **dbp = (dmu_buf_impl_t **)dbp_fake;

	return (dmu_buf_read_array_impl(dbp, numbufs, DB_RF_CANFAIL));
}

dmu_buf_t *
dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset)
{
	dnode_t *dn;
	uint64_t blkid;
	dmu_buf_impl_t *db;

	/* dataset_verify(dd); */

	dn = dnode_hold(os->os, object, FTAG);
	blkid = dbuf_whichblock(dn, offset);
	rw_enter(&dn->dn_struct_rwlock, RW_READER);
	db = dbuf_hold(dn, blkid);
	rw_exit(&dn->dn_struct_rwlock);
	dnode_rele(dn, FTAG);
	return (&db->db);
}

dmu_buf_t *
dmu_bonus_hold(objset_t *os, uint64_t object)
{
	return (dmu_bonus_hold_tag(os, object, NULL));
}

int
dmu_bonus_max(void)
{
	return (DN_MAX_BONUSLEN);
}

/*
 * Returns held bonus buffer if the object exists, NULL if it doesn't.
 */
dmu_buf_t *
dmu_bonus_hold_tag(objset_t *os, uint64_t object, void *tag)
{
	dnode_t *dn = dnode_hold(os->os, object, FTAG);
	dmu_buf_impl_t *db;

	if (dn == NULL)
		return (NULL);

	db = dbuf_hold_bonus(dn, tag);
	/* XXX - hack: hold the first block if this is a ZAP object */
	if (dmu_ot[dn->dn_type].ot_byteswap == zap_byteswap) {
		rw_enter(&dn->dn_struct_rwlock, RW_READER);
		dn->dn_db0 = dbuf_hold(dn, 0);
		rw_exit(&dn->dn_struct_rwlock);
	}
	dnode_rele(dn, FTAG);
	return (&db->db);
}

static dmu_buf_t **
dbuf_hold_array(dnode_t *dn,
    uint64_t offset, uint64_t length, int *numbufsp)
{
	dmu_buf_t **dbp;
	uint64_t blkid, nblks, i;

	if (length == 0) {
		if (numbufsp)
			*numbufsp = 0;
		return (NULL);
	}

	rw_enter(&dn->dn_struct_rwlock, RW_READER);
	if (dn->dn_datablkshift) {
		int blkshift = dn->dn_datablkshift;
		nblks = (P2ROUNDUP(offset+length, 1ULL<<blkshift) -
		    P2ALIGN(offset, 1ULL<<blkshift)) >> blkshift;
	} else {
		ASSERT3U(offset + length, <=, dn->dn_datablksz);
		nblks = 1;
	}
	dbp = kmem_alloc(sizeof (dmu_buf_t *) * nblks, KM_SLEEP);

	blkid = dbuf_whichblock(dn, offset);
	for (i = 0; i < nblks; i++) {
		dmu_buf_impl_t *dbuf;
		dbuf = dbuf_hold(dn, blkid+i);
		dbp[i] = &dbuf->db;
	}
	rw_exit(&dn->dn_struct_rwlock);

	if (numbufsp)
		*numbufsp = nblks;
	return (dbp);
}

dmu_buf_t **
dmu_buf_hold_array(objset_t *os, uint64_t object,
    uint64_t offset, uint64_t length, int *numbufsp)
{
	dnode_t *dn;
	dmu_buf_t **dbp;

	ASSERT(length <= DMU_MAX_ACCESS);

	if (length == 0) {
		if (numbufsp)
			*numbufsp = 0;
		return (NULL);
	}

	dn = dnode_hold(os->os, object, FTAG);
	dbp = dbuf_hold_array(dn, offset, length, numbufsp);
	dnode_rele(dn, FTAG);

	return (dbp);
}

void
dmu_buf_add_ref(dmu_buf_t *dbuf, void *tag)
{
	dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf;
	dbuf_add_ref(db, tag);
}

void
dmu_buf_remove_ref(dmu_buf_t *dbuf, void *tag)
{
	dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf;
	dbuf_remove_ref(db, tag);
}

void
dmu_buf_rele(dmu_buf_t *dbuf_fake)
{
	dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf_fake;

	/* XXX - hack: release the first block if this is a ZAP object */
	if (db->db_blkid == DB_BONUS_BLKID &&
	    dmu_ot[db->db_dnode->dn_type].ot_byteswap == zap_byteswap)
		dbuf_rele(db->db_dnode->dn_db0);
	dbuf_rele(db);
}

void
dmu_buf_rele_tag(dmu_buf_t *dbuf_fake, void *tag)
{
	dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf_fake;

	/* XXX - hack: release the first block if this is a ZAP object */
	if (db->db_blkid == DB_BONUS_BLKID &&
	    dmu_ot[db->db_dnode->dn_type].ot_byteswap == zap_byteswap)
		dbuf_rele(db->db_dnode->dn_db0);
	dbuf_remove_ref(db, tag);
}

void
dmu_buf_rele_array(dmu_buf_t **dbp_fake, int numbufs)
{
	int i;
	dmu_buf_impl_t **dbp = (dmu_buf_impl_t **)dbp_fake;

	if (numbufs == 0)
		return;

	ASSERT((numbufs * dbp[0]->db.db_size) <= DMU_MAX_ACCESS);

	for (i = 0; i < numbufs; i++)
		dbuf_rele(dbp[i]);

	kmem_free(dbp, sizeof (dmu_buf_t *) * numbufs);
}

void
dmu_prefetch(objset_t *os, uint64_t object, uint64_t offset, uint64_t len)
{
	dnode_t *dn;
	uint64_t blkid;
	int nblks, i;

	if (len == 0) { /* they're interested in the bonus buffer */
		dn = os->os->os_meta_dnode;

		if (object == 0 || object >= DN_MAX_OBJECT)
			return;

		rw_enter(&dn->dn_struct_rwlock, RW_READER);
		blkid = dbuf_whichblock(dn, object * sizeof (dnode_phys_t));
		dbuf_prefetch(dn, blkid);
		rw_exit(&dn->dn_struct_rwlock);
		return;
	}

	/*
	 * XXX - Note, if the dnode for the requested object is not
	 * already cached, we will do a *synchronous* read in the
	 * dnode_hold() call.  The same is true for any indirects.
	 */
	dn = dnode_hold(os->os, object, FTAG);
	if (dn == NULL)
		return;

	rw_enter(&dn->dn_struct_rwlock, RW_READER);
	if (dn->dn_datablkshift) {
		int blkshift = dn->dn_datablkshift;
		nblks = (P2ROUNDUP(offset+len, 1<<blkshift) -
		    P2ALIGN(offset, 1<<blkshift)) >> blkshift;
	} else {
		nblks = (offset < dn->dn_datablksz);
	}

	if (nblks != 0) {
		blkid = dbuf_whichblock(dn, offset);
		for (i = 0; i < nblks; i++)
			dbuf_prefetch(dn, blkid+i);
	}

	rw_exit(&dn->dn_struct_rwlock);

	dnode_rele(dn, FTAG);
}

void
dmu_free_range(objset_t *os, uint64_t object, uint64_t offset,
    uint64_t size, dmu_tx_t *tx)
{
	dnode_t *dn = dnode_hold(os->os, object, FTAG);
	ASSERT(offset < UINT64_MAX);
	ASSERT(size == -1ULL || size <= UINT64_MAX - offset);
	dnode_free_range(dn, offset, size, tx);
	dnode_rele(dn, FTAG);
}

static int
dmu_read_impl(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
    void *buf, uint32_t flags)
{
	dnode_t *dn;
	dmu_buf_t **dbp;
	int numbufs, i;

	dn = dnode_hold(os->os, object, FTAG);

	if (dn->dn_datablkshift == 0) {
		int newsz = offset > dn->dn_datablksz ? 0 :
		    MIN(size, dn->dn_datablksz - offset);
		bzero((char *)buf + newsz, size - newsz);
		size = newsz;
	}

	dnode_rele(dn, FTAG);

	if (size == 0)
		return (0);

	while (size > 0) {
		uint64_t mylen = MIN(size, DMU_MAX_ACCESS / 2);
		int err;

		/*
		 * NB: we could do this block-at-a-time, but it's nice
		 * to be reading in parallel.
		 */
		dbp = dmu_buf_hold_array(os, object, offset, mylen, &numbufs);
		err = dmu_buf_read_array_impl((dmu_buf_impl_t **)dbp, numbufs,
		    flags);
		if (err) {
			dmu_buf_rele_array(dbp, numbufs);
			return (err);
		}

		for (i = 0; i < numbufs; i++) {
			int tocpy;
			int bufoff;
			dmu_buf_t *db = dbp[i];

			ASSERT(size > 0);

			bufoff = offset - db->db_offset;
			tocpy = (int)MIN(db->db_size - bufoff, size);

			bcopy((char *)db->db_data + bufoff, buf, tocpy);

			offset += tocpy;
			size -= tocpy;
			buf = (char *)buf + tocpy;
		}
		dmu_buf_rele_array(dbp, numbufs);
	}
	return (0);
}

void
dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
    void *buf)
{
	int err;

	err = dmu_read_impl(os, object, offset, size, buf, DB_RF_MUST_SUCCEED);
	ASSERT3U(err, ==, 0);
}

int
dmu_read_canfail(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
    void *buf)
{
	return (dmu_read_impl(os, object, offset, size, buf, DB_RF_CANFAIL));
}

void
dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
    const void *buf, dmu_tx_t *tx)
{
	dmu_buf_t **dbp;
	int numbufs, i;

	dbp = dmu_buf_hold_array(os, object, offset, size, &numbufs);

	for (i = 0; i < numbufs; i++) {
		int tocpy;
		int bufoff;
		dmu_buf_t *db = dbp[i];

		ASSERT(size > 0);

		bufoff = offset - db->db_offset;
		tocpy = (int)MIN(db->db_size - bufoff, size);

		ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size);

		if (tocpy == db->db_size)
			dmu_buf_will_fill(db, tx);
		else
			dmu_buf_will_dirty(db, tx);

		bcopy(buf, (char *)db->db_data + bufoff, tocpy);

		if (tocpy == db->db_size)
			dmu_buf_fill_done(db, tx);

		offset += tocpy;
		size -= tocpy;
		buf = (char *)buf + tocpy;
	}
	dmu_buf_rele_array(dbp, numbufs);
}

#ifdef _KERNEL
int
dmu_write_uio(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
    uio_t *uio, dmu_tx_t *tx)
{
	dmu_buf_t **dbp;
	int numbufs, i;
	int err = 0;

	dbp = dmu_buf_hold_array(os, object, offset, size, &numbufs);

	for (i = 0; i < numbufs; i++) {
		int tocpy;
		int bufoff;
		dmu_buf_t *db = dbp[i];

		ASSERT(size > 0);

		bufoff = offset - db->db_offset;
		tocpy = (int)MIN(db->db_size - bufoff, size);

		ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size);

		if (tocpy == db->db_size)
			dmu_buf_will_fill(db, tx);
		else
			dmu_buf_will_dirty(db, tx);

		/*
		 * XXX uiomove could block forever (eg. nfs-backed
		 * pages).  There needs to be a uiolockdown() function
		 * to lock the pages in memory, so that uiomove won't
		 * block.
		 */
		err = uiomove((char *)db->db_data + bufoff, tocpy,
		    UIO_WRITE, uio);

		if (tocpy == db->db_size)
			dmu_buf_fill_done(db, tx);

		if (err)
			break;

		offset += tocpy;
		size -= tocpy;
	}
	dmu_buf_rele_array(dbp, numbufs);
	return (err);
}
#endif

struct backuparg {
	dmu_replay_record_t *drr;
	vnode_t *vp;
	objset_t *os;
	int err;
};

static int
dump_bytes(struct backuparg *ba, void *buf, int len)
{
	ssize_t resid; /* have to get resid to get detailed errno */
	/* Need to compute checksum here */
	ASSERT3U(len % 8, ==, 0);
	ba->err = vn_rdwr(UIO_WRITE, ba->vp,
	    (caddr_t)buf, len,
	    0, UIO_SYSSPACE, FAPPEND, RLIM_INFINITY, CRED(), &resid);
	return (ba->err);
}

static int
dump_free(struct backuparg *ba, uint64_t object, uint64_t offset,
    uint64_t length)
{
	/* write a FREE record */
	bzero(ba->drr, sizeof (dmu_replay_record_t));
	ba->drr->drr_type = DRR_FREE;
	ba->drr->drr_u.drr_free.drr_object = object;
	ba->drr->drr_u.drr_free.drr_offset = offset;
	ba->drr->drr_u.drr_free.drr_length = length;

	if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)))
		return (EINTR);
	return (0);
}

static int
dump_data(struct backuparg *ba, dmu_object_type_t type,
    uint64_t object, uint64_t offset, int blksz, void *data)
{
	/* write a DATA record */
	bzero(ba->drr, sizeof (dmu_replay_record_t));
	ba->drr->drr_type = DRR_WRITE;
	ba->drr->drr_u.drr_write.drr_object = object;
	ba->drr->drr_u.drr_write.drr_type = type;
	ba->drr->drr_u.drr_write.drr_offset = offset;
	ba->drr->drr_u.drr_write.drr_length = blksz;

	if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)))
		return (EINTR);
	if (dump_bytes(ba, data, blksz))
		return (EINTR);
	return (0);
}

static int
dump_freeobjects(struct backuparg *ba, uint64_t firstobj, uint64_t numobjs)
{
	/* write a FREEOBJECTS record */
	bzero(ba->drr, sizeof (dmu_replay_record_t));
	ba->drr->drr_type = DRR_FREEOBJECTS;
	ba->drr->drr_u.drr_freeobjects.drr_firstobj = firstobj;
	ba->drr->drr_u.drr_freeobjects.drr_numobjs = numobjs;

	if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)))
		return (EINTR);
	return (0);
}

static int
dump_dnode(struct backuparg *ba, uint64_t object, dnode_phys_t *dnp)
{
	if (dnp == NULL || dnp->dn_type == DMU_OT_NONE)
		return (dump_freeobjects(ba, object, 1));

	/* write an OBJECT record */
	bzero(ba->drr, sizeof (dmu_replay_record_t));
	ba->drr->drr_type = DRR_OBJECT;
	ba->drr->drr_u.drr_object.drr_object = object;
	ba->drr->drr_u.drr_object.drr_type = dnp->dn_type;
	ba->drr->drr_u.drr_object.drr_bonustype = dnp->dn_bonustype;
	ba->drr->drr_u.drr_object.drr_blksz =
	    dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT;
	ba->drr->drr_u.drr_object.drr_bonuslen = dnp->dn_bonuslen;
	ba->drr->drr_u.drr_object.drr_checksum = dnp->dn_checksum;
	ba->drr->drr_u.drr_object.drr_compress = dnp->dn_compress;

	if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)))
		return (EINTR);

	if (dump_bytes(ba, DN_BONUS(dnp), P2ROUNDUP(dnp->dn_bonuslen, 8)))
		return (EINTR);

	/* free anything past the end of the file */
	if (dump_free(ba, object, (dnp->dn_maxblkid + 1) *
	    (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT), -1ULL))
		return (EINTR);
	if (ba->err)
		return (EINTR);
	return (0);
}

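/*
 * Illustration (editorial note, not from the original sources): BP_SPAN()
 * below gives the number of bytes of object data covered by one block
 * pointer at a given indirection level.  As a hypothetical worked example,
 * assume SPA_MINBLOCKSHIFT == 9 and SPA_BLKPTRSHIFT == 7 (a 128-byte
 * blkptr_t), and pick 128K data blocks (dn_datablkszsec == 256) with 16K
 * indirect blocks (dn_indblkshift == 14).  Each indirect block then holds
 * 1 << (14 - 7) == 128 block pointers, so:
 *
 *	BP_SPAN(dnp, 0) == 256 << 9		== 128K
 *	BP_SPAN(dnp, 1) == 256 << (9 + 7)	== 16M
 *	BP_SPAN(dnp, 2) == 256 << (9 + 14)	== 2G
 *
 * backup_cb() uses this span to turn a hole at any level into the
 * equivalent FREE (or FREEOBJECTS) record in the backup stream.
 */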
#define	BP_SPAN(dnp, level) \
	(((uint64_t)dnp->dn_datablkszsec) << (SPA_MINBLOCKSHIFT + \
	(level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT)))

static int
backup_cb(traverse_blk_cache_t *bc, spa_t *spa, void *arg)
{
	struct backuparg *ba = arg;
	uint64_t object = bc->bc_bookmark.zb_object;
	int level = bc->bc_bookmark.zb_level;
	uint64_t blkid = bc->bc_bookmark.zb_blkid;
	blkptr_t *bp = bc->bc_blkptr.blk_birth ? &bc->bc_blkptr : NULL;
	dmu_object_type_t type = bp ? BP_GET_TYPE(bp) : DMU_OT_NONE;
	void *data = bc->bc_data;
	int err = 0;

	if (issig(JUSTLOOKING))
		return (EINTR);

	ASSERT(data || bp == NULL);

	if (bp == NULL && object == 0) {
		uint64_t span = BP_SPAN(bc->bc_dnode, level);
		uint64_t dnobj = (blkid * span) >> DNODE_SHIFT;
		err = dump_freeobjects(ba, dnobj, span >> DNODE_SHIFT);
	} else if (bp == NULL) {
		uint64_t span = BP_SPAN(bc->bc_dnode, level);
		err = dump_free(ba, object, blkid * span, span);
	} else if (data && level == 0 && type == DMU_OT_DNODE) {
		dnode_phys_t *blk = data;
		int i;
		int blksz = BP_GET_LSIZE(bp);

		for (i = 0; i < blksz >> DNODE_SHIFT; i++) {
			uint64_t dnobj =
			    (blkid << (DNODE_BLOCK_SHIFT - DNODE_SHIFT)) + i;
			err = dump_dnode(ba, dnobj, blk+i);
			if (err)
				break;
		}
	} else if (level == 0 &&
	    type != DMU_OT_DNODE && type != DMU_OT_OBJSET) {
		int blksz = BP_GET_LSIZE(bp);
		if (data == NULL) {
			arc_buf_t *abuf;

			(void) arc_read(NULL, spa, bp,
			    dmu_ot[type].ot_byteswap, arc_getbuf_func, &abuf,
			    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_MUSTSUCCEED,
			    ARC_WAIT);

			if (abuf) {
				err = dump_data(ba, type, object, blkid * blksz,
				    blksz, abuf->b_data);
				arc_buf_free(abuf, &abuf);
			}
		} else {
			err = dump_data(ba, type, object, blkid * blksz,
			    blksz, data);
		}
	}

	ASSERT(err == 0 || err == EINTR);
	return (err);
}

int
dmu_sendbackup(objset_t *tosnap, objset_t *fromsnap, vnode_t *vp)
{
	dsl_dataset_t *ds = tosnap->os->os_dsl_dataset;
	dsl_dataset_t *fromds = fromsnap ? fromsnap->os->os_dsl_dataset : NULL;
	dmu_replay_record_t *drr;
	struct backuparg ba;
	int err;

	/* tosnap must be a snapshot */
	if (ds->ds_phys->ds_next_snap_obj == 0)
		return (EINVAL);

	/* fromsnap must be an earlier snapshot from the same fs as tosnap */
	if (fromds && (ds->ds_dir != fromds->ds_dir ||
	    fromds->ds_phys->ds_creation_txg >=
	    ds->ds_phys->ds_creation_txg))
		return (EXDEV);

	drr = kmem_zalloc(sizeof (dmu_replay_record_t), KM_SLEEP);
	drr->drr_type = DRR_BEGIN;
	drr->drr_u.drr_begin.drr_magic = DMU_BACKUP_MAGIC;
	drr->drr_u.drr_begin.drr_version = DMU_BACKUP_VERSION;
	drr->drr_u.drr_begin.drr_creation_time =
	    ds->ds_phys->ds_creation_time;
	drr->drr_u.drr_begin.drr_type = tosnap->os->os_phys->os_type;
	drr->drr_u.drr_begin.drr_toguid = ds->ds_phys->ds_guid;
	if (fromds)
		drr->drr_u.drr_begin.drr_fromguid = fromds->ds_phys->ds_guid;
	dsl_dataset_name(ds, drr->drr_u.drr_begin.drr_toname);

	ba.drr = drr;
	ba.vp = vp;
	ba.os = tosnap;

	if (dump_bytes(&ba, drr, sizeof (dmu_replay_record_t))) {
		kmem_free(drr, sizeof (dmu_replay_record_t));
		return (ba.err);
	}

	err = traverse_dsl_dataset(ds,
	    fromds ? fromds->ds_phys->ds_creation_txg : 0,
	    ADVANCE_PRE | ADVANCE_HOLES | ADVANCE_DATA | ADVANCE_NOLOCK,
	    backup_cb, &ba);

	if (err) {
		if (err == EINTR && ba.err)
			err = ba.err;
		return (err);
	}

	bzero(drr, sizeof (dmu_replay_record_t));
	drr->drr_type = DRR_END;

	if (dump_bytes(&ba, drr, sizeof (dmu_replay_record_t)))
		return (ba.err);

	kmem_free(drr, sizeof (dmu_replay_record_t));

	return (0);
}

struct restorearg {
	int err;
	int byteswap;
	vnode_t *vp;
	char *buf;
	uint64_t voff;
	int buflen; /* number of valid bytes in buf */
	int bufoff; /* next offset to read */
	int bufsize; /* amount of memory allocated for buf */
};

static int
replay_incremental_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx)
{
	struct drr_begin *drrb = arg;
	dsl_dataset_t *ds = NULL;
	dsl_dataset_t *ds_prev = NULL;
	const char *snapname;
	int err = EINVAL;
	uint64_t val;

	/* this must be a filesystem */
	if (dd->dd_phys->dd_head_dataset_obj == 0)
		goto die;

	ds = dsl_dataset_open_obj(dd->dd_pool,
	    dd->dd_phys->dd_head_dataset_obj,
	    NULL, DS_MODE_EXCLUSIVE, FTAG);

	if (ds == NULL) {
		err = EBUSY;
		goto die;
	}

	/* must already be a snapshot of this fs */
	if (ds->ds_phys->ds_prev_snap_obj == 0) {
		err = ENODEV;
		goto die;
	}

	/* most recent snapshot must match fromguid */
	ds_prev = dsl_dataset_open_obj(dd->dd_pool,
	    ds->ds_phys->ds_prev_snap_obj, NULL,
	    DS_MODE_STANDARD | DS_MODE_READONLY, FTAG);
	if (ds_prev->ds_phys->ds_guid != drrb->drr_fromguid) {
		err = ENODEV;
		goto die;
	}

	/* must not have any changes since most recent snapshot */
	if (ds->ds_phys->ds_bp.blk_birth >
	    ds_prev->ds_phys->ds_creation_txg) {
		err = ETXTBSY;
		goto die;
	}

	/* new snapshot name must not exist */
	snapname = strrchr(drrb->drr_toname, '@');
	if (snapname == NULL) {
		err = EEXIST;
		goto die;
	}
	snapname++;
	err = zap_lookup(dd->dd_pool->dp_meta_objset,
	    ds->ds_phys->ds_snapnames_zapobj, snapname, 8, 1, &val);
	if (err != ENOENT) {
		if (err == 0)
			err = EEXIST;
		dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG);
		dsl_dataset_close(ds_prev, DS_MODE_STANDARD, FTAG);
		return (err);
	}

	dsl_dataset_close(ds_prev, DS_MODE_STANDARD, FTAG);

	/* The point of no (unsuccessful) return. */

	dmu_buf_will_dirty(ds->ds_dbuf, tx);
	ds->ds_phys->ds_restoring = TRUE;

	dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG);
	return (0);

die:
	if (ds_prev)
		dsl_dataset_close(ds_prev, DS_MODE_STANDARD, FTAG);
	if (ds)
		dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG);
	return (err);
}

static int
replay_full_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx)
{
	struct drr_begin *drrb = arg;
	int err;
	char *fsfullname, *fslastname, *cp;
	dsl_dataset_t *ds;

	fsfullname = kmem_alloc(MAXNAMELEN, KM_SLEEP);
	(void) strncpy(fsfullname, drrb->drr_toname, MAXNAMELEN);
	cp = strchr(fsfullname, '@');
	if (cp == NULL) {
		kmem_free(fsfullname, MAXNAMELEN);
		return (EINVAL);
	}
	*cp = '\0';
	fslastname = strrchr(fsfullname, '/');
	if (fslastname == NULL) {
		kmem_free(fsfullname, MAXNAMELEN);
		return (EINVAL);
	}
	fslastname++;

	err = dsl_dataset_create_sync(dd, fsfullname, fslastname, NULL, tx);
	if (err) {
		kmem_free(fsfullname, MAXNAMELEN);
		return (err);
	}

	/* the point of no (unsuccessful) return */

	err = dsl_dataset_open_spa(dd->dd_pool->dp_spa, fsfullname,
	    DS_MODE_EXCLUSIVE, FTAG, &ds);
	ASSERT3U(err, ==, 0);
	kmem_free(fsfullname, MAXNAMELEN);

	(void) dmu_objset_create_impl(dsl_dataset_get_spa(ds),
	    ds, drrb->drr_type, tx);

	dmu_buf_will_dirty(ds->ds_dbuf, tx);
	ds->ds_phys->ds_restoring = TRUE;

	dsl_dataset_close(ds, DS_MODE_EXCLUSIVE, FTAG);
	return (0);
}

static int
replay_end_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx)
{
	struct drr_begin *drrb = arg;
	int err;
	char *snapname;
	dsl_dataset_t *ds;

	/* XXX verify that drr_toname is in dd */

	snapname = strchr(drrb->drr_toname, '@');
	if (snapname == NULL)
		return (EINVAL);
	snapname++;

	/* create snapshot */
	err = dsl_dataset_snapshot_sync(dd, snapname, tx);
	if (err)
		return (err);

	/* set snapshot's creation time and guid */
	err = dsl_dataset_open_spa(dd->dd_pool->dp_spa, drrb->drr_toname,
	    DS_MODE_PRIMARY | DS_MODE_READONLY | DS_MODE_RESTORE, FTAG, &ds);
	ASSERT3U(err, ==, 0);

	dmu_buf_will_dirty(ds->ds_dbuf, tx);
	ds->ds_phys->ds_creation_time = drrb->drr_creation_time;
	ds->ds_phys->ds_guid = drrb->drr_toguid;
	ds->ds_phys->ds_restoring = FALSE;

	dsl_dataset_close(ds, DS_MODE_PRIMARY, FTAG);

	ds = dsl_dataset_open_obj(dd->dd_pool,
	    dd->dd_phys->dd_head_dataset_obj,
	    NULL, DS_MODE_STANDARD | DS_MODE_RESTORE, FTAG);
	dmu_buf_will_dirty(ds->ds_dbuf, tx);
	ds->ds_phys->ds_restoring = FALSE;
	dsl_dataset_close(ds, DS_MODE_STANDARD, FTAG);

	return (0);
}

void *
restore_read(struct restorearg *ra, int len)
{
	void *rv;

	/* some things will require 8-byte alignment, so everything must */
	ASSERT3U(len % 8, ==, 0);

	while (ra->buflen - ra->bufoff < len) {
		ssize_t resid;
		int leftover = ra->buflen - ra->bufoff;

		(void) memmove(ra->buf, ra->buf + ra->bufoff, leftover);
		ra->err = vn_rdwr(UIO_READ, ra->vp,
		    (caddr_t)ra->buf + leftover, ra->bufsize - leftover,
		    ra->voff, UIO_SYSSPACE, FAPPEND,
		    RLIM_INFINITY, CRED(), &resid);

		/* Need to compute checksum */

		ra->voff += ra->bufsize - leftover - resid;
		ra->buflen = ra->bufsize - resid;
		ra->bufoff = 0;
		if (resid == ra->bufsize - leftover)
			ra->err = EINVAL;
		if (ra->err)
			return (NULL);
	}

	ASSERT3U(ra->bufoff % 8, ==, 0);
	ASSERT3U(ra->buflen - ra->bufoff, >=, len);
	rv = ra->buf + ra->bufoff;
	ra->bufoff += len;
	return (rv);
}

static void
backup_byteswap(dmu_replay_record_t *drr)
{
#define	DO64(X) (drr->drr_u.X = BSWAP_64(drr->drr_u.X))
#define	DO32(X) (drr->drr_u.X = BSWAP_32(drr->drr_u.X))
	drr->drr_type = BSWAP_32(drr->drr_type);
	switch (drr->drr_type) {
	case DRR_BEGIN:
		DO64(drr_begin.drr_magic);
		DO64(drr_begin.drr_version);
		DO64(drr_begin.drr_creation_time);
		DO32(drr_begin.drr_type);
		DO64(drr_begin.drr_toguid);
		DO64(drr_begin.drr_fromguid);
		break;
	case DRR_OBJECT:
		DO64(drr_object.drr_object);
		/* DO64(drr_object.drr_allocation_txg); */
		DO32(drr_object.drr_type);
		DO32(drr_object.drr_bonustype);
		DO32(drr_object.drr_blksz);
		DO32(drr_object.drr_bonuslen);
		break;
	case DRR_FREEOBJECTS:
		DO64(drr_freeobjects.drr_firstobj);
		DO64(drr_freeobjects.drr_numobjs);
		break;
	case DRR_WRITE:
		DO64(drr_write.drr_object);
		DO32(drr_write.drr_type);
		DO64(drr_write.drr_offset);
		DO64(drr_write.drr_length);
		break;
	case DRR_FREE:
		DO64(drr_free.drr_object);
		DO64(drr_free.drr_offset);
		DO64(drr_free.drr_length);
		break;
	case DRR_END:
		DO64(drr_end.drr_checksum);
		break;
	}
#undef DO64
#undef DO32
}

static int
restore_object(struct restorearg *ra, objset_t *os, struct drr_object *drro)
{
	int err;
	dmu_tx_t *tx;

	err = dmu_object_info(os, drro->drr_object, NULL);

	if (err != 0 && err != ENOENT)
		return (EINVAL);

	if (drro->drr_type == DMU_OT_NONE ||
	    drro->drr_type >= DMU_OT_NUMTYPES ||
	    drro->drr_bonustype >= DMU_OT_NUMTYPES ||
	    drro->drr_checksum >= ZIO_CHECKSUM_FUNCTIONS ||
	    drro->drr_compress >= ZIO_COMPRESS_FUNCTIONS ||
	    P2PHASE(drro->drr_blksz, SPA_MINBLOCKSIZE) ||
	    drro->drr_blksz < SPA_MINBLOCKSIZE ||
	    drro->drr_blksz > SPA_MAXBLOCKSIZE ||
	    drro->drr_bonuslen > DN_MAX_BONUSLEN) {
		return (EINVAL);
	}

	tx = dmu_tx_create(os);

	if (err == ENOENT) {
		/* currently free, want to be allocated */
		dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
		dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, 1);
		err = dmu_tx_assign(tx, TXG_WAIT);
		if (err) {
			dmu_tx_abort(tx);
			return (err);
		}
		err = dmu_object_claim(os, drro->drr_object,
		    drro->drr_type, drro->drr_blksz,
		    drro->drr_bonustype, drro->drr_bonuslen, tx);
	} else {
		/* currently allocated, want to be allocated */
		dmu_tx_hold_bonus(tx, drro->drr_object);
		/*
		 * We may change blocksize, so need to
		 * hold_write
		 */
		dmu_tx_hold_write(tx, drro->drr_object, 0, 1);
		err = dmu_tx_assign(tx, TXG_WAIT);
		if (err) {
			dmu_tx_abort(tx);
			return (err);
		}

		err = dmu_object_reclaim(os, drro->drr_object,
		    drro->drr_type, drro->drr_blksz,
		    drro->drr_bonustype, drro->drr_bonuslen, tx);
	}
	if (err) {
		dmu_tx_commit(tx);
		return (EINVAL);
	}

	dmu_object_set_checksum(os, drro->drr_object, drro->drr_checksum, tx);
	dmu_object_set_compress(os, drro->drr_object, drro->drr_compress, tx);

	if (drro->drr_bonuslen) {
		dmu_buf_t *db;
		void *data;
		db = dmu_bonus_hold(os, drro->drr_object);
		dmu_buf_will_dirty(db, tx);

		ASSERT3U(db->db_size, ==, drro->drr_bonuslen);
		data = restore_read(ra, P2ROUNDUP(db->db_size, 8));
		if (data == NULL) {
			dmu_tx_commit(tx);
			return (ra->err);
		}
		bcopy(data, db->db_data, db->db_size);
		if (ra->byteswap) {
			dmu_ot[drro->drr_bonustype].ot_byteswap(db->db_data,
			    drro->drr_bonuslen);
		}
		dmu_buf_rele(db);
	}
	dmu_tx_commit(tx);
	return (0);
}

/* ARGSUSED */
static int
restore_freeobjects(struct restorearg *ra, objset_t *os,
    struct drr_freeobjects *drrfo)
{
	uint64_t obj;

	if (drrfo->drr_firstobj + drrfo->drr_numobjs < drrfo->drr_firstobj)
		return (EINVAL);

	for (obj = drrfo->drr_firstobj;
	    obj < drrfo->drr_firstobj + drrfo->drr_numobjs; obj++) {
		dmu_tx_t *tx;
		int err;

		if (dmu_object_info(os, obj, NULL) != 0)
			continue;

		tx = dmu_tx_create(os);
		dmu_tx_hold_bonus(tx, obj);
		err = dmu_tx_assign(tx, TXG_WAIT);
		if (err) {
			dmu_tx_abort(tx);
			return (err);
		}
		err = dmu_object_free(os, obj, tx);
		dmu_tx_commit(tx);
		if (err && err != ENOENT)
			return (EINVAL);
	}
	return (0);
}

static int
restore_write(struct restorearg *ra, objset_t *os,
    struct drr_write *drrw)
{
	dmu_tx_t *tx;
	void *data;
	int err;

	if (drrw->drr_offset + drrw->drr_length < drrw->drr_offset ||
	    drrw->drr_type >= DMU_OT_NUMTYPES)
		return (EINVAL);

	data = restore_read(ra, drrw->drr_length);
	if (data == NULL)
		return (ra->err);

	if (dmu_object_info(os, drrw->drr_object, NULL) != 0)
		return (EINVAL);

	tx = dmu_tx_create(os);

	dmu_tx_hold_write(tx, drrw->drr_object,
	    drrw->drr_offset, drrw->drr_length);
	err = dmu_tx_assign(tx, TXG_WAIT);
	if (err) {
		dmu_tx_abort(tx);
		return (err);
	}
	if (ra->byteswap)
		dmu_ot[drrw->drr_type].ot_byteswap(data, drrw->drr_length);
	dmu_write(os, drrw->drr_object,
	    drrw->drr_offset, drrw->drr_length, data, tx);
	dmu_tx_commit(tx);
	return (0);
}

/* ARGSUSED */
static int
restore_free(struct restorearg *ra, objset_t *os,
    struct drr_free *drrf)
{
	dmu_tx_t *tx;
	int err;

	if (drrf->drr_length != -1ULL &&
	    drrf->drr_offset + drrf->drr_length < drrf->drr_offset)
		return (EINVAL);

	if (dmu_object_info(os, drrf->drr_object, NULL) != 0)
		return (EINVAL);

	tx = dmu_tx_create(os);

	dmu_tx_hold_free(tx, drrf->drr_object,
	    drrf->drr_offset, drrf->drr_length);
	err = dmu_tx_assign(tx, TXG_WAIT);
	if (err) {
		dmu_tx_abort(tx);
		return (err);
	}
	dmu_free_range(os, drrf->drr_object,
	    drrf->drr_offset, drrf->drr_length, tx);
	dmu_tx_commit(tx);
	return (0);
}

int
dmu_recvbackup(struct drr_begin *drrb, uint64_t *sizep,
    vnode_t *vp, uint64_t voffset)
{
	struct restorearg ra;
	dmu_replay_record_t *drr;
	char *cp, *tosnap;
	dsl_dir_t *dd = NULL;
	objset_t *os = NULL;

	bzero(&ra, sizeof (ra));
	ra.vp = vp;
	ra.voff = voffset;
	ra.bufsize = 1<<20;
	ra.buf = kmem_alloc(ra.bufsize, KM_SLEEP);

	if (drrb->drr_magic == DMU_BACKUP_MAGIC) {
		ra.byteswap = FALSE;
	} else if (drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC)) {
		ra.byteswap = TRUE;
	} else {
		ra.err = EINVAL;
		goto out;
	}

	if (ra.byteswap) {
		drrb->drr_magic = BSWAP_64(drrb->drr_magic);
		drrb->drr_version = BSWAP_64(drrb->drr_version);
		drrb->drr_creation_time = BSWAP_64(drrb->drr_creation_time);
		drrb->drr_type = BSWAP_32(drrb->drr_type);
		drrb->drr_toguid = BSWAP_64(drrb->drr_toguid);
		drrb->drr_fromguid = BSWAP_64(drrb->drr_fromguid);
	}

	ASSERT3U(drrb->drr_magic, ==, DMU_BACKUP_MAGIC);

	tosnap = drrb->drr_toname;
	if (drrb->drr_version != DMU_BACKUP_VERSION ||
	    drrb->drr_type >= DMU_OST_NUMTYPES ||
	    strchr(drrb->drr_toname, '@') == NULL) {
		ra.err = EINVAL;
		goto out;
	}

	/*
	 * Process the begin in syncing context.
	 */
	if (drrb->drr_fromguid) {
		/* incremental backup */

		cp = strchr(tosnap, '@');
		*cp = '\0';
		dd = dsl_dir_open(tosnap, FTAG, NULL);
		*cp = '@';
		if (dd == NULL) {
			ra.err = ENOENT;
			goto out;
		}

		ra.err = dsl_dir_sync_task(dd, replay_incremental_sync,
		    drrb, 1<<20);
	} else {
		/* full backup */
		const char *tail;

		cp = strchr(tosnap, '@');
		*cp = '\0';
		dd = dsl_dir_open(tosnap, FTAG, &tail);
		*cp = '@';
		if (dd == NULL) {
			ra.err = ENOENT;
			goto out;
		}
		if (tail == NULL) {
			ra.err = EEXIST;
			goto out;
		}

		ra.err = dsl_dir_sync_task(dd, replay_full_sync,
		    drrb, 1<<20);
	}
	if (ra.err)
		goto out;

	/*
	 * Open the objset we are modifying.
	 */

	cp = strchr(tosnap, '@');
	*cp = '\0';
	ra.err = dmu_objset_open(tosnap, DMU_OST_ANY,
	    DS_MODE_PRIMARY | DS_MODE_RESTORE, &os);
	*cp = '@';
	ASSERT3U(ra.err, ==, 0);

	/*
	 * Read records and process them.
	 */
	while (ra.err == 0 &&
	    NULL != (drr = restore_read(&ra, sizeof (*drr)))) {
		if (issig(JUSTLOOKING)) {
			ra.err = EINTR;
			goto out;
		}

		if (ra.byteswap)
			backup_byteswap(drr);

		switch (drr->drr_type) {
		case DRR_OBJECT:
		{
			/*
			 * We need to make a copy of the record header,
			 * because restore_{object,write} may need to
			 * restore_read(), which will invalidate drr.
			 */
			struct drr_object drro = drr->drr_u.drr_object;
			ra.err = restore_object(&ra, os, &drro);
			break;
		}
		case DRR_FREEOBJECTS:
		{
			struct drr_freeobjects drrfo =
			    drr->drr_u.drr_freeobjects;
			ra.err = restore_freeobjects(&ra, os, &drrfo);
			break;
		}
		case DRR_WRITE:
		{
			struct drr_write drrw = drr->drr_u.drr_write;
			ra.err = restore_write(&ra, os, &drrw);
			break;
		}
		case DRR_FREE:
		{
			struct drr_free drrf = drr->drr_u.drr_free;
			ra.err = restore_free(&ra, os, &drrf);
			break;
		}
		case DRR_END:
			/* Need to verify checksum. */
			/*
			 * dd may be the parent of the dd we are
			 * restoring into (eg. if it's a full backup).
			 */
			ra.err = dsl_dir_sync_task(dmu_objset_ds(os)->
			    ds_dir, replay_end_sync, drrb, 1<<20);
			goto out;
		default:
			ra.err = EINVAL;
			goto out;
		}
	}

out:
	if (os)
		dmu_objset_close(os);

	/*
	 * Make sure we don't rollback/destroy unless we actually
	 * processed the begin properly.  'os' will only be set if this
	 * is the case.
	 */
	if (ra.err && os && dd && tosnap && strchr(tosnap, '@')) {
		/*
		 * rollback or destroy what we created, so we don't
		 * leave it in the restoring state.
		 */
		txg_wait_synced(dd->dd_pool, 0);
		if (drrb->drr_fromguid) {
			/* incremental: rollback to most recent snapshot */
			(void) dsl_dir_sync_task(dd,
			    dsl_dataset_rollback_sync, NULL, 0);
		} else {
			/* full: destroy whole fs */
			cp = strchr(tosnap, '@');
			*cp = '\0';
			cp = strchr(tosnap, '/');
			if (cp) {
				(void) dsl_dir_sync_task(dd,
				    dsl_dir_destroy_sync, cp+1, 0);
			}
			cp = strchr(tosnap, '\0');
			*cp = '@';
		}

	}

	if (dd)
		dsl_dir_close(dd, FTAG);
	kmem_free(ra.buf, ra.bufsize);
	if (sizep)
		*sizep = ra.voff;
	return (ra.err);
}

/*
 * Intent log support: sync the block at <os, object, offset> to disk.
 * N.B. and XXX: the caller is responsible for serializing dmu_sync()s
 * of the same block, and for making sure that the data isn't changing
 * while dmu_sync() is writing it.
 *
 * Return values:
 *
 *	EALREADY: this txg has already been synced, so there's nothing to do.
 *		The caller should not log the write.
 *
 *	ENOENT: the block was dbuf_free_range()'d, so there's nothing to do.
 *		The caller should not log the write.
 *
 *	EINPROGRESS: the block is in the process of being synced by the
 *		usual mechanism (spa_sync()), so we can't sync it here.
 *		The caller should txg_wait_synced() and not log the write.
 *
 *	EBUSY: another thread is trying to dmu_sync() the same dbuf.
 *		(This case cannot arise under the current locking rules.)
 *		The caller should txg_wait_synced() and not log the write.
 *
 *	ESTALE: the block was dirtied or freed while we were writing it,
 *		so the data is no longer valid.
 *		The caller should txg_wait_synced() and not log the write.
 *
 *	0: success.  Sets *bp to the blkptr just written, and sets
 *		*blkoff to the data's offset within that block.
 *		The caller should log this blkptr/blkoff in its lr_write_t.
 */
int
dmu_sync(objset_t *os, uint64_t object, uint64_t offset, uint64_t *blkoff,
    blkptr_t *bp, uint64_t txg)
{
	dsl_pool_t *dp = os->os->os_dsl_dataset->ds_dir->dd_pool;
	tx_state_t *tx = &dp->dp_tx;
	dmu_buf_impl_t *db;
	blkptr_t *blk;
	int err;

	ASSERT(RW_LOCK_HELD(&tx->tx_suspend));
	ASSERT(BP_IS_HOLE(bp));
	ASSERT(txg != 0);

	dprintf("dmu_sync txg=%llu, s,o,q %llu %llu %llu\n",
	    txg, tx->tx_synced_txg, tx->tx_open_txg, tx->tx_quiesced_txg);

	/*
	 * If this txg already synced, there's nothing to do.
	 */
	if (txg <= tx->tx_synced_txg) {
		/*
		 * If we're running ziltest, we need the blkptr regardless.
		 */
		if (txg > spa_freeze_txg(dp->dp_spa)) {
			db = (dmu_buf_impl_t *)dmu_buf_hold(os, object, offset);
			/* if db_blkptr == NULL, this was an empty write */
			if (db->db_blkptr)
				*bp = *db->db_blkptr; /* structure assignment */
			else
				bzero(bp, sizeof (blkptr_t));
			*blkoff = offset - db->db.db_offset;
			ASSERT3U(*blkoff, <, db->db.db_size);
			dmu_buf_rele((dmu_buf_t *)db);
			return (0);
		}
		return (EALREADY);
	}

	/*
	 * If this txg is in the middle of syncing, just wait for it.
	 */
	if (txg == tx->tx_syncing_txg) {
		ASSERT(txg != tx->tx_open_txg);
		return (EINPROGRESS);
	}

	db = (dmu_buf_impl_t *)dmu_buf_hold(os, object, offset);

	mutex_enter(&db->db_mtx);

	/*
	 * If this dbuf isn't dirty, must have been free_range'd.
	 * There's no need to log writes to freed blocks, so we're done.
	 */
	if (!list_link_active(&db->db_dirty_node[txg&TXG_MASK])) {
		mutex_exit(&db->db_mtx);
		dmu_buf_rele((dmu_buf_t *)db);
		return (ENOENT);
	}

	blk = db->db_d.db_overridden_by[txg&TXG_MASK];

	/*
	 * If we already did a dmu_sync() of this dbuf in this txg,
	 * free the old block before writing the new one.
	 */
	if (blk != NULL) {
		ASSERT(blk != IN_DMU_SYNC);
		if (blk == IN_DMU_SYNC) {
			mutex_exit(&db->db_mtx);
			dmu_buf_rele((dmu_buf_t *)db);
			return (EBUSY);
		}
		arc_release(db->db_d.db_data_old[txg&TXG_MASK], db);
		if (!BP_IS_HOLE(blk)) {
			(void) arc_free(NULL, os->os->os_spa, txg, blk,
			    NULL, NULL, ARC_WAIT);
		}
		kmem_free(blk, sizeof (blkptr_t));
	}

	db->db_d.db_overridden_by[txg&TXG_MASK] = IN_DMU_SYNC;
	mutex_exit(&db->db_mtx);

	blk = kmem_alloc(sizeof (blkptr_t), KM_SLEEP);
	blk->blk_birth = 0; /* mark as invalid */

	err = arc_write(NULL, os->os->os_spa,
	    zio_checksum_select(db->db_dnode->dn_checksum, os->os->os_checksum),
	    zio_compress_select(db->db_dnode->dn_compress, os->os->os_compress),
	    txg, blk, db->db_d.db_data_old[txg&TXG_MASK], NULL, NULL,
	    ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, ARC_WAIT);
	ASSERT(err == 0);

	if (!BP_IS_HOLE(blk)) {
		blk->blk_fill = 1;
		BP_SET_TYPE(blk, db->db_dnode->dn_type);
		BP_SET_LEVEL(blk, 0);
	}

	/* copy the block pointer back to caller */
	*bp = *blk; /* structure assignment */
	*blkoff = offset - db->db.db_offset;
	ASSERT3U(*blkoff, <, db->db.db_size);

	mutex_enter(&db->db_mtx);
	if (db->db_d.db_overridden_by[txg&TXG_MASK] != IN_DMU_SYNC) {
		/* we were dirtied/freed during the sync */
		ASSERT3P(db->db_d.db_overridden_by[txg&TXG_MASK], ==, NULL);
		arc_release(db->db_d.db_data_old[txg&TXG_MASK], db);
		mutex_exit(&db->db_mtx);
		dmu_buf_rele((dmu_buf_t *)db);
		/* Note that this block is not freed on disk until the txg syncs */

		/*
		 * XXX can we use ARC_NOWAIT here?
		 * XXX should we be ignoring the return code?
		 */
		if (!BP_IS_HOLE(blk)) {
			(void) arc_free(NULL, os->os->os_spa, txg, blk,
			    NULL, NULL, ARC_WAIT);
		}
		kmem_free(blk, sizeof (blkptr_t));
		return (ESTALE);
	}

	db->db_d.db_overridden_by[txg&TXG_MASK] = blk;
	mutex_exit(&db->db_mtx);
	dmu_buf_rele((dmu_buf_t *)db);
	ASSERT3U(txg, >, tx->tx_syncing_txg);
	return (0);
}

uint64_t
dmu_object_max_nonzero_offset(objset_t *os, uint64_t object)
{
	dnode_t *dn = dnode_hold(os->os, object, FTAG);
	uint64_t rv = dnode_max_nonzero_offset(dn);
	dnode_rele(dn, FTAG);
	return (rv);
}

int
dmu_object_set_blocksize(objset_t *os, uint64_t object, uint64_t size, int ibs,
    dmu_tx_t *tx)
{
	dnode_t *dn = dnode_hold(os->os, object, FTAG);
	int err = dnode_set_blksz(dn, size, ibs, tx);
	dnode_rele(dn, FTAG);
	return (err);
}

void
dmu_object_set_checksum(objset_t *os, uint64_t object, uint8_t checksum,
    dmu_tx_t *tx)
{
	dnode_t *dn = dnode_hold(os->os, object, FTAG);
	ASSERT(checksum < ZIO_CHECKSUM_FUNCTIONS);
	dn->dn_checksum = checksum;
	dnode_setdirty(dn, tx);
	dnode_rele(dn, FTAG);
}

void
dmu_object_set_compress(objset_t *os, uint64_t object, uint8_t compress,
    dmu_tx_t *tx)
{
	dnode_t *dn = dnode_hold(os->os, object, FTAG);
	ASSERT(compress < ZIO_COMPRESS_FUNCTIONS);
	dn->dn_compress = compress;
	dnode_setdirty(dn, tx);
	dnode_rele(dn, FTAG);
}

int
dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole, uint64_t *off)
{
	dnode_t *dn;
	int i, err;

	dn = dnode_hold(os->os, object, FTAG);
	/*
	 * Sync any current changes before
	 * we go trundling through the block pointers.
	 */
	for (i = 0; i < TXG_SIZE; i++) {
		if (dn->dn_dirtyblksz[i])
			break;
	}
	if (i != TXG_SIZE) {
		dnode_rele(dn, FTAG);
		txg_wait_synced(dmu_objset_pool(os), 0);
		dn = dnode_hold(os->os, object, FTAG);
	}

	err = dnode_next_offset(dn, hole, off, 1, 1);
	dnode_rele(dn, FTAG);

	return (err);
}

void
dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi)
{
	rw_enter(&dn->dn_struct_rwlock, RW_READER);
	mutex_enter(&dn->dn_mtx);

	doi->doi_data_block_size = dn->dn_datablksz;
	doi->doi_metadata_block_size = dn->dn_indblkshift ?
	    1ULL << dn->dn_indblkshift : 0;
	doi->doi_indirection = dn->dn_nlevels;
	doi->doi_checksum = dn->dn_checksum;
	doi->doi_compress = dn->dn_compress;
	doi->doi_physical_blks = dn->dn_phys->dn_secphys;
	doi->doi_max_block_offset = dn->dn_phys->dn_maxblkid;
	doi->doi_type = dn->dn_type;
	doi->doi_bonus_size = dn->dn_bonuslen;
	doi->doi_bonus_type = dn->dn_bonustype;

	mutex_exit(&dn->dn_mtx);
	rw_exit(&dn->dn_struct_rwlock);
}

/*
 * Get information on a DMU object.
 * If doi is NULL, just indicates whether the object exists.
 */
int
dmu_object_info(objset_t *os, uint64_t object, dmu_object_info_t *doi)
{
	dnode_t *dn = dnode_hold(os->os, object, FTAG);

	if (dn == NULL)
		return (ENOENT);

	if (doi != NULL)
		dmu_object_info_from_dnode(dn, doi);

	dnode_rele(dn, FTAG);
	return (0);
}

/*
 * As above, but faster; can be used when you have a held dbuf in hand.
 */
void
dmu_object_info_from_db(dmu_buf_t *db, dmu_object_info_t *doi)
{
	dmu_object_info_from_dnode(((dmu_buf_impl_t *)db)->db_dnode, doi);
}

/*
 * Faster still when you only care about the size.
 * This is specifically optimized for zfs_getattr().
 */
void
dmu_object_size_from_db(dmu_buf_t *db, uint32_t *blksize, u_longlong_t *nblk512)
{
	dnode_t *dn = ((dmu_buf_impl_t *)db)->db_dnode;

	*blksize = dn->dn_datablksz;
	*nblk512 = dn->dn_phys->dn_secphys + 1;	/* add 1 for dnode space */
}

void
byteswap_uint64_array(void *vbuf, size_t size)
{
	uint64_t *buf = vbuf;
	size_t count = size >> 3;
	int i;

	ASSERT((size & 7) == 0);

	for (i = 0; i < count; i++)
		buf[i] = BSWAP_64(buf[i]);
}

void
byteswap_uint32_array(void *vbuf, size_t size)
{
	uint32_t *buf = vbuf;
	size_t count = size >> 2;
	int i;

	ASSERT((size & 3) == 0);

	for (i = 0; i < count; i++)
		buf[i] = BSWAP_32(buf[i]);
}

void
byteswap_uint16_array(void *vbuf, size_t size)
{
	uint16_t *buf = vbuf;
	size_t count = size >> 1;
	int i;

	ASSERT((size & 1) == 0);

	for (i = 0; i < count; i++)
		buf[i] = BSWAP_16(buf[i]);
}

/* ARGSUSED */
void
byteswap_uint8_array(void *vbuf, size_t size)
{
}

void
dmu_init(void)
{
	dbuf_init();
	dnode_init();
	arc_init();
}

void
dmu_fini(void)
{
	arc_fini();
	dnode_fini();
	dbuf_fini();
}
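/*
 * Illustrative sketch (editorial note, not from the original sources): a
 * typical consumer of this interface reads with dmu_read_canfail() and
 * writes inside a transaction, following the same pattern restore_write()
 * uses above.  The object number, offset, and buffer size here are
 * hypothetical.
 *
 *	char buf[512];
 *	dmu_tx_t *tx;
 *	int err;
 *
 *	err = dmu_read_canfail(os, object, 0, sizeof (buf), buf);
 *	if (err)
 *		return (err);
 *
 *	tx = dmu_tx_create(os);
 *	dmu_tx_hold_write(tx, object, 0, sizeof (buf));
 *	err = dmu_tx_assign(tx, TXG_WAIT);
 *	if (err) {
 *		dmu_tx_abort(tx);
 *		return (err);
 *	}
 *	dmu_write(os, object, 0, sizeof (buf), buf, tx);
 *	dmu_tx_commit(tx);
 */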