/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M% %I% %E% SMI"

#include <sys/dmu.h>
#include <sys/dmu_impl.h>
#include <sys/dmu_tx.h>
#include <sys/dbuf.h>
#include <sys/dnode.h>
#include <sys/zfs_context.h>
#include <sys/dmu_objset.h>
#include <sys/dmu_traverse.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_dir.h>
#include <sys/dsl_pool.h>
#include <sys/dsl_synctask.h>
#include <sys/dsl_prop.h>
#include <sys/dmu_zfetch.h>
#include <sys/zfs_ioctl.h>
#include <sys/zap.h>
#include <sys/zio_checksum.h>
#ifdef _KERNEL
#include <sys/vmsystm.h>
#endif

const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = {
	{ byteswap_uint8_array,		TRUE,	"unallocated" },
	{ zap_byteswap,			TRUE,	"object directory" },
	{ byteswap_uint64_array,	TRUE,	"object array" },
	{ byteswap_uint8_array,		TRUE,	"packed nvlist" },
	{ byteswap_uint64_array,	TRUE,	"packed nvlist size" },
	{ byteswap_uint64_array,	TRUE,	"bplist" },
	{ byteswap_uint64_array,	TRUE,	"bplist header" },
	{ byteswap_uint64_array,	TRUE,	"SPA space map header" },
	{ byteswap_uint64_array,	TRUE,	"SPA space map" },
	{ byteswap_uint64_array,	TRUE,	"ZIL intent log" },
	{ dnode_buf_byteswap,		TRUE,	"DMU dnode" },
	{ dmu_objset_byteswap,		TRUE,	"DMU objset" },
	{ byteswap_uint64_array,	TRUE,	"DSL directory" },
	{ zap_byteswap,			TRUE,	"DSL directory child map" },
	{ zap_byteswap,			TRUE,	"DSL dataset snap map" },
	{ zap_byteswap,			TRUE,	"DSL props" },
	{ byteswap_uint64_array,	TRUE,	"DSL dataset" },
	{ zfs_znode_byteswap,		TRUE,	"ZFS znode" },
	{ zfs_oldacl_byteswap,		TRUE,	"ZFS V0 ACL" },
	{ byteswap_uint8_array,		FALSE,	"ZFS plain file" },
	{ zap_byteswap,			TRUE,	"ZFS directory" },
	{ zap_byteswap,			TRUE,	"ZFS master node" },
	{ zap_byteswap,			TRUE,	"ZFS delete queue" },
	{ byteswap_uint8_array,		FALSE,	"zvol object" },
	{ zap_byteswap,			TRUE,	"zvol prop" },
	{ byteswap_uint8_array,		FALSE,	"other uint8[]" },
	{ byteswap_uint64_array,	FALSE,	"other uint64[]" },
	{ zap_byteswap,			TRUE,	"other ZAP" },
	{ zap_byteswap,			TRUE,	"persistent error log" },
	{ byteswap_uint8_array,		TRUE,	"SPA history" },
	{ byteswap_uint64_array,	TRUE,	"SPA history offsets" },
	{ zap_byteswap,			TRUE,	"Pool properties" },
	{ zap_byteswap,			TRUE,	"DSL permissions" },
	{ zfs_acl_byteswap,		TRUE,	"ZFS ACL" },
	{ byteswap_uint8_array,		TRUE,	"ZFS SYSACL" },
	{ byteswap_uint8_array,		TRUE,	"FUID table" },
	{ byteswap_uint64_array,	TRUE,	"FUID table size" },
};
int
dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset,
    void *tag, dmu_buf_t **dbp)
{
	dnode_t *dn;
	uint64_t blkid;
	dmu_buf_impl_t *db;
	int err;

	err = dnode_hold(os->os, object, FTAG, &dn);
	if (err)
		return (err);
	blkid = dbuf_whichblock(dn, offset);
	rw_enter(&dn->dn_struct_rwlock, RW_READER);
	db = dbuf_hold(dn, blkid, tag);
	rw_exit(&dn->dn_struct_rwlock);
	if (db == NULL) {
		err = EIO;
	} else {
		err = dbuf_read(db, NULL, DB_RF_CANFAIL);
		if (err) {
			dbuf_rele(db, tag);
			db = NULL;
		}
	}

	dnode_rele(dn, FTAG);
	*dbp = &db->db;
	return (err);
}

int
dmu_bonus_max(void)
{
	return (DN_MAX_BONUSLEN);
}

int
dmu_set_bonus(dmu_buf_t *db, int newsize, dmu_tx_t *tx)
{
	dnode_t *dn = ((dmu_buf_impl_t *)db)->db_dnode;

	if (dn->dn_bonus != (dmu_buf_impl_t *)db)
		return (EINVAL);
	if (newsize < 0 || newsize > db->db_size)
		return (EINVAL);
	dnode_setbonuslen(dn, newsize, tx);
	return (0);
}

/*
 * returns ENOENT, EIO, or 0.
 */
int
dmu_bonus_hold(objset_t *os, uint64_t object, void *tag, dmu_buf_t **dbp)
{
	dnode_t *dn;
	dmu_buf_impl_t *db;
	int error;

	error = dnode_hold(os->os, object, FTAG, &dn);
	if (error)
		return (error);

	rw_enter(&dn->dn_struct_rwlock, RW_READER);
	if (dn->dn_bonus == NULL) {
		rw_exit(&dn->dn_struct_rwlock);
		rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
		if (dn->dn_bonus == NULL)
			dbuf_create_bonus(dn);
	}
	db = dn->dn_bonus;
	rw_exit(&dn->dn_struct_rwlock);

	/* as long as the bonus buf is held, the dnode will be held */
	if (refcount_add(&db->db_holds, tag) == 1)
		VERIFY(dnode_add_ref(dn, db));

	dnode_rele(dn, FTAG);

	VERIFY(0 == dbuf_read(db, NULL, DB_RF_MUST_SUCCEED));

	*dbp = &db->db;
	return (0);
}
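/*
 * Editor's note -- illustrative sketch, not part of the original file:
 * a typical consumer of dmu_bonus_hold() holds the bonus buffer to get
 * at the per-object metadata stored there, then drops the hold with
 * dmu_buf_rele().  The znode_phys_t payload below is just one possible
 * interpretation of the bonus data; callers must know what their object
 * type keeps in it.
 *
 *	dmu_buf_t *db;
 *	if (dmu_bonus_hold(os, object, FTAG, &db) == 0) {
 *		znode_phys_t *zp = db->db_data;
 *		... consult fields of *zp ...
 *		dmu_buf_rele(db, FTAG);
 *	}
 */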
/*
 * Note: longer-term, we should modify all of the dmu_buf_*() interfaces
 * to take a held dnode rather than <os, object> -- the lookup is wasteful,
 * and can induce severe lock contention when writing to several files
 * whose dnodes are in the same block.
 */
static int
dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset,
    uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp)
{
	dmu_buf_t **dbp;
	uint64_t blkid, nblks, i;
	uint32_t flags;
	int err;
	zio_t *zio;

	ASSERT(length <= DMU_MAX_ACCESS);

	flags = DB_RF_CANFAIL | DB_RF_NEVERWAIT;
	if (length > zfetch_array_rd_sz)
		flags |= DB_RF_NOPREFETCH;

	rw_enter(&dn->dn_struct_rwlock, RW_READER);
	if (dn->dn_datablkshift) {
		int blkshift = dn->dn_datablkshift;
		nblks = (P2ROUNDUP(offset+length, 1ULL<<blkshift) -
		    P2ALIGN(offset, 1ULL<<blkshift)) >> blkshift;
	} else {
		if (offset + length > dn->dn_datablksz) {
			zfs_panic_recover("zfs: accessing past end of object "
			    "%llx/%llx (size=%u access=%llu+%llu)",
			    (longlong_t)dn->dn_objset->
			    os_dsl_dataset->ds_object,
			    (longlong_t)dn->dn_object, dn->dn_datablksz,
			    (longlong_t)offset, (longlong_t)length);
			return (EIO);
		}
		nblks = 1;
	}
	dbp = kmem_zalloc(sizeof (dmu_buf_t *) * nblks, KM_SLEEP);

	zio = zio_root(dn->dn_objset->os_spa, NULL, NULL, TRUE);
	blkid = dbuf_whichblock(dn, offset);
	for (i = 0; i < nblks; i++) {
		dmu_buf_impl_t *db = dbuf_hold(dn, blkid+i, tag);
		if (db == NULL) {
			rw_exit(&dn->dn_struct_rwlock);
			dmu_buf_rele_array(dbp, nblks, tag);
			zio_nowait(zio);
			return (EIO);
		}
		/* initiate async i/o */
		if (read) {
			rw_exit(&dn->dn_struct_rwlock);
			(void) dbuf_read(db, zio, flags);
			rw_enter(&dn->dn_struct_rwlock, RW_READER);
		}
		dbp[i] = &db->db;
	}
	rw_exit(&dn->dn_struct_rwlock);

	/* wait for async i/o */
	err = zio_wait(zio);
	if (err) {
		dmu_buf_rele_array(dbp, nblks, tag);
		return (err);
	}

	/* wait for other io to complete */
	if (read) {
		for (i = 0; i < nblks; i++) {
			dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbp[i];
			mutex_enter(&db->db_mtx);
			while (db->db_state == DB_READ ||
			    db->db_state == DB_FILL)
				cv_wait(&db->db_changed, &db->db_mtx);
			if (db->db_state == DB_UNCACHED)
				err = EIO;
			mutex_exit(&db->db_mtx);
			if (err) {
				dmu_buf_rele_array(dbp, nblks, tag);
				return (err);
			}
		}
	}

	*numbufsp = nblks;
	*dbpp = dbp;
	return (0);
}

static int
dmu_buf_hold_array(objset_t *os, uint64_t object, uint64_t offset,
    uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp)
{
	dnode_t *dn;
	int err;

	err = dnode_hold(os->os, object, FTAG, &dn);
	if (err)
		return (err);

	err = dmu_buf_hold_array_by_dnode(dn, offset, length, read, tag,
	    numbufsp, dbpp);

	dnode_rele(dn, FTAG);

	return (err);
}

int
dmu_buf_hold_array_by_bonus(dmu_buf_t *db, uint64_t offset,
    uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp)
{
	dnode_t *dn = ((dmu_buf_impl_t *)db)->db_dnode;
	int err;

	err = dmu_buf_hold_array_by_dnode(dn, offset, length, read, tag,
	    numbufsp, dbpp);

	return (err);
}

void
dmu_buf_rele_array(dmu_buf_t **dbp_fake, int numbufs, void *tag)
{
	int i;
	dmu_buf_impl_t **dbp = (dmu_buf_impl_t **)dbp_fake;

	if (numbufs == 0)
		return;

	for (i = 0; i < numbufs; i++) {
		if (dbp[i])
			dbuf_rele(dbp[i], tag);
	}

	kmem_free(dbp, sizeof (dmu_buf_t *) * numbufs);
}
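/*
 * Editor's note -- illustrative sketch, not part of the original file:
 * the hold-array/copy/release pattern used by dmu_read() and dmu_write()
 * below.  An array of dbufs covering [offset, offset + length) is held,
 * each buffer's overlap with the request is copied, and the whole array
 * is released in one call.
 *
 *	dmu_buf_t **dbp;
 *	int numbufs, i;
 *
 *	if (dmu_buf_hold_array_by_bonus(bonus_db, offset, length,
 *	    TRUE, FTAG, &numbufs, &dbp) == 0) {
 *		for (i = 0; i < numbufs; i++) {
 *			... copy to/from dbp[i]->db_data ...
 *		}
 *		dmu_buf_rele_array(dbp, numbufs, FTAG);
 *	}
 */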
void
dmu_prefetch(objset_t *os, uint64_t object, uint64_t offset, uint64_t len)
{
	dnode_t *dn;
	uint64_t blkid;
	int nblks, i, err;

	if (zfs_prefetch_disable)
		return;

	if (len == 0) { /* they're interested in the bonus buffer */
		dn = os->os->os_meta_dnode;

		if (object == 0 || object >= DN_MAX_OBJECT)
			return;

		rw_enter(&dn->dn_struct_rwlock, RW_READER);
		blkid = dbuf_whichblock(dn, object * sizeof (dnode_phys_t));
		dbuf_prefetch(dn, blkid);
		rw_exit(&dn->dn_struct_rwlock);
		return;
	}

	/*
	 * XXX - Note, if the dnode for the requested object is not
	 * already cached, we will do a *synchronous* read in the
	 * dnode_hold() call.  The same is true for any indirects.
	 */
	err = dnode_hold(os->os, object, FTAG, &dn);
	if (err != 0)
		return;

	rw_enter(&dn->dn_struct_rwlock, RW_READER);
	if (dn->dn_datablkshift) {
		int blkshift = dn->dn_datablkshift;
		nblks = (P2ROUNDUP(offset+len, 1<<blkshift) -
		    P2ALIGN(offset, 1<<blkshift)) >> blkshift;
	} else {
		nblks = (offset < dn->dn_datablksz);
	}

	if (nblks != 0) {
		blkid = dbuf_whichblock(dn, offset);
		for (i = 0; i < nblks; i++)
			dbuf_prefetch(dn, blkid+i);
	}

	rw_exit(&dn->dn_struct_rwlock);

	dnode_rele(dn, FTAG);
}
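/*
 * Editor's note -- illustrative sketch, not part of the original file:
 * a caller that detects sequential access might prefetch the range it
 * expects to need next and issue the actual reads later.  A length of
 * zero asks for the object's dnode/bonus area rather than file data.
 *
 *	dmu_prefetch(os, object, next_offset, read_ahead_bytes);
 *	dmu_prefetch(os, object, 0, 0);		(prefetch just the dnode)
 */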
int
dmu_free_range(objset_t *os, uint64_t object, uint64_t offset,
    uint64_t size, dmu_tx_t *tx)
{
	dnode_t *dn;
	int err = dnode_hold(os->os, object, FTAG, &dn);
	if (err)
		return (err);
	ASSERT(offset < UINT64_MAX);
	ASSERT(size == -1ULL || size <= UINT64_MAX - offset);
	dnode_free_range(dn, offset, size, tx);
	dnode_rele(dn, FTAG);
	return (0);
}

int
dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
    void *buf)
{
	dnode_t *dn;
	dmu_buf_t **dbp;
	int numbufs, i, err;

	err = dnode_hold(os->os, object, FTAG, &dn);
	if (err)
		return (err);

	/*
	 * Deal with odd block sizes, where there can't be data past the first
	 * block.  If we ever do the tail block optimization, we will need to
	 * handle that here as well.
	 */
	if (dn->dn_datablkshift == 0) {
		int newsz = offset > dn->dn_datablksz ? 0 :
		    MIN(size, dn->dn_datablksz - offset);
		bzero((char *)buf + newsz, size - newsz);
		size = newsz;
	}

	while (size > 0) {
		uint64_t mylen = MIN(size, DMU_MAX_ACCESS / 2);

		/*
		 * NB: we could do this block-at-a-time, but it's nice
		 * to be reading in parallel.
		 */
		err = dmu_buf_hold_array_by_dnode(dn, offset, mylen,
		    TRUE, FTAG, &numbufs, &dbp);
		if (err)
			break;

		for (i = 0; i < numbufs; i++) {
			int tocpy;
			int bufoff;
			dmu_buf_t *db = dbp[i];

			ASSERT(size > 0);

			bufoff = offset - db->db_offset;
			tocpy = (int)MIN(db->db_size - bufoff, size);

			bcopy((char *)db->db_data + bufoff, buf, tocpy);

			offset += tocpy;
			size -= tocpy;
			buf = (char *)buf + tocpy;
		}
		dmu_buf_rele_array(dbp, numbufs, FTAG);
	}
	dnode_rele(dn, FTAG);
	return (err);
}

void
dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
    const void *buf, dmu_tx_t *tx)
{
	dmu_buf_t **dbp;
	int numbufs, i;

	if (size == 0)
		return;

	VERIFY(0 == dmu_buf_hold_array(os, object, offset, size,
	    FALSE, FTAG, &numbufs, &dbp));

	for (i = 0; i < numbufs; i++) {
		int tocpy;
		int bufoff;
		dmu_buf_t *db = dbp[i];

		ASSERT(size > 0);

		bufoff = offset - db->db_offset;
		tocpy = (int)MIN(db->db_size - bufoff, size);

		ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size);

		if (tocpy == db->db_size)
			dmu_buf_will_fill(db, tx);
		else
			dmu_buf_will_dirty(db, tx);

		bcopy(buf, (char *)db->db_data + bufoff, tocpy);

		if (tocpy == db->db_size)
			dmu_buf_fill_done(db, tx);

		offset += tocpy;
		size -= tocpy;
		buf = (char *)buf + tocpy;
	}
	dmu_buf_rele_array(dbp, numbufs, FTAG);
}
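/*
 * Editor's note -- illustrative sketch, not part of the original file:
 * dmu_write() must be called with an assigned transaction that has
 * reserved the range being written.  The usual shape, assuming the
 * standard dmu_tx interfaces from sys/dmu_tx.h:
 *
 *	dmu_tx_t *tx = dmu_tx_create(os);
 *	dmu_tx_hold_write(tx, object, offset, size);
 *	err = dmu_tx_assign(tx, TXG_WAIT);
 *	if (err) {
 *		dmu_tx_abort(tx);
 *	} else {
 *		dmu_write(os, object, offset, size, buf, tx);
 *		dmu_tx_commit(tx);
 *	}
 */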
#ifdef _KERNEL
int
dmu_read_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size)
{
	dmu_buf_t **dbp;
	int numbufs, i, err;

	/*
	 * NB: we could do this block-at-a-time, but it's nice
	 * to be reading in parallel.
	 */
	err = dmu_buf_hold_array(os, object, uio->uio_loffset, size, TRUE, FTAG,
	    &numbufs, &dbp);
	if (err)
		return (err);

	for (i = 0; i < numbufs; i++) {
		int tocpy;
		int bufoff;
		dmu_buf_t *db = dbp[i];

		ASSERT(size > 0);

		bufoff = uio->uio_loffset - db->db_offset;
		tocpy = (int)MIN(db->db_size - bufoff, size);

		err = uiomove((char *)db->db_data + bufoff, tocpy,
		    UIO_READ, uio);
		if (err)
			break;

		size -= tocpy;
	}
	dmu_buf_rele_array(dbp, numbufs, FTAG);

	return (err);
}

int
dmu_write_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size,
    dmu_tx_t *tx)
{
	dmu_buf_t **dbp;
	int numbufs, i;
	int err = 0;

	if (size == 0)
		return (0);

	err = dmu_buf_hold_array(os, object, uio->uio_loffset, size,
	    FALSE, FTAG, &numbufs, &dbp);
	if (err)
		return (err);

	for (i = 0; i < numbufs; i++) {
		int tocpy;
		int bufoff;
		dmu_buf_t *db = dbp[i];

		ASSERT(size > 0);

		bufoff = uio->uio_loffset - db->db_offset;
		tocpy = (int)MIN(db->db_size - bufoff, size);

		ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size);

		if (tocpy == db->db_size)
			dmu_buf_will_fill(db, tx);
		else
			dmu_buf_will_dirty(db, tx);

		/*
		 * XXX uiomove could block forever (eg. nfs-backed
		 * pages).  There needs to be a uiolockdown() function
		 * to lock the pages in memory, so that uiomove won't
		 * block.
		 */
		err = uiomove((char *)db->db_data + bufoff, tocpy,
		    UIO_WRITE, uio);

		if (tocpy == db->db_size)
			dmu_buf_fill_done(db, tx);

		if (err)
			break;

		size -= tocpy;
	}
	dmu_buf_rele_array(dbp, numbufs, FTAG);
	return (err);
}

int
dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
    page_t *pp, dmu_tx_t *tx)
{
	dmu_buf_t **dbp;
	int numbufs, i;
	int err;

	if (size == 0)
		return (0);

	err = dmu_buf_hold_array(os, object, offset, size,
	    FALSE, FTAG, &numbufs, &dbp);
	if (err)
		return (err);

	for (i = 0; i < numbufs; i++) {
		int tocpy, copied, thiscpy;
		int bufoff;
		dmu_buf_t *db = dbp[i];
		caddr_t va;

		ASSERT(size > 0);
		ASSERT3U(db->db_size, >=, PAGESIZE);

		bufoff = offset - db->db_offset;
		tocpy = (int)MIN(db->db_size - bufoff, size);

		ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size);

		if (tocpy == db->db_size)
			dmu_buf_will_fill(db, tx);
		else
			dmu_buf_will_dirty(db, tx);

		for (copied = 0; copied < tocpy; copied += PAGESIZE) {
			ASSERT3U(pp->p_offset, ==, db->db_offset + bufoff);
			thiscpy = MIN(PAGESIZE, tocpy - copied);
			va = ppmapin(pp, PROT_READ, (caddr_t)-1);
			bcopy(va, (char *)db->db_data + bufoff, thiscpy);
			ppmapout(va);
			pp = pp->p_next;
			bufoff += PAGESIZE;
		}

		if (tocpy == db->db_size)
			dmu_buf_fill_done(db, tx);

		if (err)
			break;

		offset += tocpy;
		size -= tocpy;
	}
	dmu_buf_rele_array(dbp, numbufs, FTAG);
	return (err);
}
#endif

typedef struct {
	dbuf_dirty_record_t	*dr;
	dmu_sync_cb_t		*done;
	void			*arg;
} dmu_sync_arg_t;

/* ARGSUSED */
static void
dmu_sync_done(zio_t *zio, arc_buf_t *buf, void *varg)
{
	dmu_sync_arg_t *in = varg;
	dbuf_dirty_record_t *dr = in->dr;
	dmu_buf_impl_t *db = dr->dr_dbuf;
	dmu_sync_cb_t *done = in->done;

	if (!BP_IS_HOLE(zio->io_bp)) {
		zio->io_bp->blk_fill = 1;
		BP_SET_TYPE(zio->io_bp, db->db_dnode->dn_type);
		BP_SET_LEVEL(zio->io_bp, 0);
	}

	mutex_enter(&db->db_mtx);
	ASSERT(dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC);
	dr->dt.dl.dr_overridden_by = *zio->io_bp; /* structure assignment */
	dr->dt.dl.dr_override_state = DR_OVERRIDDEN;
	cv_broadcast(&db->db_changed);
	mutex_exit(&db->db_mtx);

	if (done)
		done(&(db->db), in->arg);

	kmem_free(in, sizeof (dmu_sync_arg_t));
}
/*
 * Intent log support: sync the block associated with db to disk.
 * N.B. and XXX: the caller is responsible for making sure that the
 * data isn't changing while dmu_sync() is writing it.
 *
 * Return values:
 *
 *	EEXIST: this txg has already been synced, so there's nothing to do.
 *		The caller should not log the write.
 *
 *	ENOENT: the block was dbuf_free_range()'d, so there's nothing to do.
 *		The caller should not log the write.
 *
 *	EALREADY: this block is already in the process of being synced.
 *		The caller should track its progress (somehow).
 *
 *	EINPROGRESS: the IO has been initiated.
 *		The caller should log this blkptr in the callback.
 *
 *	0: completed.  Sets *bp to the blkptr just written.
 *		The caller should log this blkptr immediately.
 */
int
dmu_sync(zio_t *pio, dmu_buf_t *db_fake,
    blkptr_t *bp, uint64_t txg, dmu_sync_cb_t *done, void *arg)
{
	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
	objset_impl_t *os = db->db_objset;
	dsl_pool_t *dp = os->os_dsl_dataset->ds_dir->dd_pool;
	tx_state_t *tx = &dp->dp_tx;
	dbuf_dirty_record_t *dr;
	dmu_sync_arg_t *in;
	zbookmark_t zb;
	zio_t *zio;
	int zio_flags;
	int err;

	ASSERT(BP_IS_HOLE(bp));
	ASSERT(txg != 0);

	dprintf("dmu_sync txg=%llu, s,o,q %llu %llu %llu\n",
	    txg, tx->tx_synced_txg, tx->tx_open_txg, tx->tx_quiesced_txg);

	/*
	 * XXX - would be nice if we could do this without suspending...
	 */
	txg_suspend(dp);

	/*
	 * If this txg already synced, there's nothing to do.
	 */
	if (txg <= tx->tx_synced_txg) {
		txg_resume(dp);
		/*
		 * If we're running ziltest, we need the blkptr regardless.
		 */
		if (txg > spa_freeze_txg(dp->dp_spa)) {
			/* if db_blkptr == NULL, this was an empty write */
			if (db->db_blkptr)
				*bp = *db->db_blkptr; /* structure assignment */
			return (0);
		}
		return (EEXIST);
	}

	mutex_enter(&db->db_mtx);

	if (txg == tx->tx_syncing_txg) {
		while (db->db_data_pending) {
			/*
			 * IO is in-progress.  Wait for it to finish.
			 * XXX - would be nice to be able to somehow "attach"
			 * this zio to the parent zio passed in.
			 */
			cv_wait(&db->db_changed, &db->db_mtx);
			if (!db->db_data_pending &&
			    db->db_blkptr && BP_IS_HOLE(db->db_blkptr)) {
				/*
				 * IO was compressed away
				 */
				*bp = *db->db_blkptr; /* structure assignment */
				mutex_exit(&db->db_mtx);
				txg_resume(dp);
				return (0);
			}
			ASSERT(db->db_data_pending ||
			    (db->db_blkptr && db->db_blkptr->blk_birth == txg));
		}

		if (db->db_blkptr && db->db_blkptr->blk_birth == txg) {
			/*
			 * IO is already completed.
			 */
			*bp = *db->db_blkptr; /* structure assignment */
			mutex_exit(&db->db_mtx);
			txg_resume(dp);
			return (0);
		}
	}

	dr = db->db_last_dirty;
	while (dr && dr->dr_txg > txg)
		dr = dr->dr_next;
	if (dr == NULL || dr->dr_txg < txg) {
		/*
		 * This dbuf isn't dirty, must have been free_range'd.
		 * There's no need to log writes to freed blocks, so we're done.
		 */
		mutex_exit(&db->db_mtx);
		txg_resume(dp);
		return (ENOENT);
	}

	ASSERT(dr->dr_txg == txg);
	if (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC) {
		/*
		 * We have already issued a sync write for this buffer.
		 */
		mutex_exit(&db->db_mtx);
		txg_resume(dp);
		return (EALREADY);
	} else if (dr->dt.dl.dr_override_state == DR_OVERRIDDEN) {
		/*
		 * This buffer has already been synced.  It could not
		 * have been dirtied since, or we would have cleared the state.
		 */
		*bp = dr->dt.dl.dr_overridden_by; /* structure assignment */
		mutex_exit(&db->db_mtx);
		txg_resume(dp);
		return (0);
	}

	dr->dt.dl.dr_override_state = DR_IN_DMU_SYNC;
	in = kmem_alloc(sizeof (dmu_sync_arg_t), KM_SLEEP);
	in->dr = dr;
	in->done = done;
	in->arg = arg;
	mutex_exit(&db->db_mtx);
	txg_resume(dp);

	zb.zb_objset = os->os_dsl_dataset->ds_object;
	zb.zb_object = db->db.db_object;
	zb.zb_level = db->db_level;
	zb.zb_blkid = db->db_blkid;
	zio_flags = ZIO_FLAG_MUSTSUCCEED;
	if (dmu_ot[db->db_dnode->dn_type].ot_metadata || zb.zb_level != 0)
		zio_flags |= ZIO_FLAG_METADATA;
	zio = arc_write(pio, os->os_spa,
	    zio_checksum_select(db->db_dnode->dn_checksum, os->os_checksum),
	    zio_compress_select(db->db_dnode->dn_compress, os->os_compress),
	    dmu_get_replication_level(os, &zb, db->db_dnode->dn_type),
	    txg, bp, dr->dt.dl.dr_data, NULL, dmu_sync_done, in,
	    ZIO_PRIORITY_SYNC_WRITE, zio_flags, &zb);

	if (pio) {
		zio_nowait(zio);
		err = EINPROGRESS;
	} else {
		err = zio_wait(zio);
		ASSERT(err == 0);
	}
	return (err);
}
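/*
 * Editor's note -- illustrative sketch, not part of the original file:
 * how an intent-log caller might dispatch on dmu_sync()'s return values,
 * following the contract documented above.  The lr/zil_sync_done names
 * are hypothetical.
 *
 *	error = dmu_sync(pio, db, &lr->lr_blkptr, txg, zil_sync_done, lr);
 *	switch (error) {
 *	case 0:			log the blkptr now
 *	case EINPROGRESS:	blkptr will be logged from zil_sync_done()
 *	case EEXIST:
 *	case ENOENT:		do not log the write
 *	case EALREADY:		another sync of this block is in flight
 *	}
 */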
int
dmu_object_set_blocksize(objset_t *os, uint64_t object, uint64_t size, int ibs,
    dmu_tx_t *tx)
{
	dnode_t *dn;
	int err;

	err = dnode_hold(os->os, object, FTAG, &dn);
	if (err)
		return (err);
	err = dnode_set_blksz(dn, size, ibs, tx);
	dnode_rele(dn, FTAG);
	return (err);
}

void
dmu_object_set_checksum(objset_t *os, uint64_t object, uint8_t checksum,
    dmu_tx_t *tx)
{
	dnode_t *dn;

	/* XXX assumes dnode_hold will not get an i/o error */
	(void) dnode_hold(os->os, object, FTAG, &dn);
	ASSERT(checksum < ZIO_CHECKSUM_FUNCTIONS);
	dn->dn_checksum = checksum;
	dnode_setdirty(dn, tx);
	dnode_rele(dn, FTAG);
}

void
dmu_object_set_compress(objset_t *os, uint64_t object, uint8_t compress,
    dmu_tx_t *tx)
{
	dnode_t *dn;

	/* XXX assumes dnode_hold will not get an i/o error */
	(void) dnode_hold(os->os, object, FTAG, &dn);
	ASSERT(compress < ZIO_COMPRESS_FUNCTIONS);
	dn->dn_compress = compress;
	dnode_setdirty(dn, tx);
	dnode_rele(dn, FTAG);
}

int
dmu_get_replication_level(objset_impl_t *os,
    zbookmark_t *zb, dmu_object_type_t ot)
{
	int ncopies = os->os_copies;

	/* If it's the mos, it should have max copies set. */
	ASSERT(zb->zb_objset != 0 ||
	    ncopies == spa_max_replication(os->os_spa));

	if (dmu_ot[ot].ot_metadata || zb->zb_level != 0)
		ncopies++;
	return (MIN(ncopies, spa_max_replication(os->os_spa)));
}

int
dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole, uint64_t *off)
{
	dnode_t *dn;
	int i, err;

	err = dnode_hold(os->os, object, FTAG, &dn);
	if (err)
		return (err);
	/*
	 * Sync any current changes before
	 * we go trundling through the block pointers.
	 */
	for (i = 0; i < TXG_SIZE; i++) {
		if (list_link_active(&dn->dn_dirty_link[i]))
			break;
	}
	if (i != TXG_SIZE) {
		dnode_rele(dn, FTAG);
		txg_wait_synced(dmu_objset_pool(os), 0);
		err = dnode_hold(os->os, object, FTAG, &dn);
		if (err)
			return (err);
	}

	err = dnode_next_offset(dn, hole, off, 1, 1, 0);
	dnode_rele(dn, FTAG);

	return (err);
}
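/*
 * Editor's note -- illustrative sketch, not part of the original file:
 * dmu_offset_next() is the primitive behind hole/data seeking.  A caller
 * looking for the next hole at or after "start" might do something like:
 *
 *	uint64_t off = start;
 *	error = dmu_offset_next(os, object, B_TRUE, &off);
 *	if (error == 0)
 *		... "off" now points at the next hole ...
 *	else
 *		... non-zero error (commonly ESRCH) means none was found ...
 */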
void
dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi)
{
	rw_enter(&dn->dn_struct_rwlock, RW_READER);
	mutex_enter(&dn->dn_mtx);

	doi->doi_data_block_size = dn->dn_datablksz;
	doi->doi_metadata_block_size = dn->dn_indblkshift ?
	    1ULL << dn->dn_indblkshift : 0;
	doi->doi_indirection = dn->dn_nlevels;
	doi->doi_checksum = dn->dn_checksum;
	doi->doi_compress = dn->dn_compress;
	doi->doi_physical_blks = (DN_USED_BYTES(dn->dn_phys) +
	    SPA_MINBLOCKSIZE/2) >> SPA_MINBLOCKSHIFT;
	doi->doi_max_block_offset = dn->dn_phys->dn_maxblkid;
	doi->doi_type = dn->dn_type;
	doi->doi_bonus_size = dn->dn_bonuslen;
	doi->doi_bonus_type = dn->dn_bonustype;

	mutex_exit(&dn->dn_mtx);
	rw_exit(&dn->dn_struct_rwlock);
}

/*
 * Get information on a DMU object.
 * If doi is NULL, just indicates whether the object exists.
 */
int
dmu_object_info(objset_t *os, uint64_t object, dmu_object_info_t *doi)
{
	dnode_t *dn;
	int err = dnode_hold(os->os, object, FTAG, &dn);

	if (err)
		return (err);

	if (doi != NULL)
		dmu_object_info_from_dnode(dn, doi);

	dnode_rele(dn, FTAG);
	return (0);
}

/*
 * As above, but faster; can be used when you have a held dbuf in hand.
 */
void
dmu_object_info_from_db(dmu_buf_t *db, dmu_object_info_t *doi)
{
	dmu_object_info_from_dnode(((dmu_buf_impl_t *)db)->db_dnode, doi);
}

/*
 * Faster still when you only care about the size.
 * This is specifically optimized for zfs_getattr().
 */
void
dmu_object_size_from_db(dmu_buf_t *db, uint32_t *blksize, u_longlong_t *nblk512)
{
	dnode_t *dn = ((dmu_buf_impl_t *)db)->db_dnode;

	*blksize = dn->dn_datablksz;
	/* add 1 for dnode space */
	*nblk512 = ((DN_USED_BYTES(dn->dn_phys) + SPA_MINBLOCKSIZE/2) >>
	    SPA_MINBLOCKSHIFT) + 1;
}

void
byteswap_uint64_array(void *vbuf, size_t size)
{
	uint64_t *buf = vbuf;
	size_t count = size >> 3;
	int i;

	ASSERT((size & 7) == 0);

	for (i = 0; i < count; i++)
		buf[i] = BSWAP_64(buf[i]);
}

void
byteswap_uint32_array(void *vbuf, size_t size)
{
	uint32_t *buf = vbuf;
	size_t count = size >> 2;
	int i;

	ASSERT((size & 3) == 0);

	for (i = 0; i < count; i++)
		buf[i] = BSWAP_32(buf[i]);
}

void
byteswap_uint16_array(void *vbuf, size_t size)
{
	uint16_t *buf = vbuf;
	size_t count = size >> 1;
	int i;

	ASSERT((size & 1) == 0);

	for (i = 0; i < count; i++)
		buf[i] = BSWAP_16(buf[i]);
}

/* ARGSUSED */
void
byteswap_uint8_array(void *vbuf, size_t size)
{
}

void
dmu_init(void)
{
	dbuf_init();
	dnode_init();
	arc_init();
}

void
dmu_fini(void)
{
	arc_fini();
	dnode_fini();
	dbuf_fini();
}