1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2010 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 #include <sys/zfs_context.h> 27 #include <sys/dmu.h> 28 #include <sys/dmu_impl.h> 29 #include <sys/dbuf.h> 30 #include <sys/dmu_objset.h> 31 #include <sys/dsl_dataset.h> 32 #include <sys/dsl_dir.h> 33 #include <sys/dmu_tx.h> 34 #include <sys/spa.h> 35 #include <sys/zio.h> 36 #include <sys/dmu_zfetch.h> 37 38 static void dbuf_destroy(dmu_buf_impl_t *db); 39 static int dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx); 40 static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx); 41 42 /* 43 * Global data structures and functions for the dbuf cache. 
44 */ 45 static kmem_cache_t *dbuf_cache; 46 47 /* ARGSUSED */ 48 static int 49 dbuf_cons(void *vdb, void *unused, int kmflag) 50 { 51 dmu_buf_impl_t *db = vdb; 52 bzero(db, sizeof (dmu_buf_impl_t)); 53 54 mutex_init(&db->db_mtx, NULL, MUTEX_DEFAULT, NULL); 55 cv_init(&db->db_changed, NULL, CV_DEFAULT, NULL); 56 refcount_create(&db->db_holds); 57 return (0); 58 } 59 60 /* ARGSUSED */ 61 static void 62 dbuf_dest(void *vdb, void *unused) 63 { 64 dmu_buf_impl_t *db = vdb; 65 mutex_destroy(&db->db_mtx); 66 cv_destroy(&db->db_changed); 67 refcount_destroy(&db->db_holds); 68 } 69 70 /* 71 * dbuf hash table routines 72 */ 73 static dbuf_hash_table_t dbuf_hash_table; 74 75 static uint64_t dbuf_hash_count; 76 77 static uint64_t 78 dbuf_hash(void *os, uint64_t obj, uint8_t lvl, uint64_t blkid) 79 { 80 uintptr_t osv = (uintptr_t)os; 81 uint64_t crc = -1ULL; 82 83 ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY); 84 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (lvl)) & 0xFF]; 85 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (osv >> 6)) & 0xFF]; 86 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 0)) & 0xFF]; 87 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 8)) & 0xFF]; 88 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 0)) & 0xFF]; 89 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 8)) & 0xFF]; 90 91 crc ^= (osv>>14) ^ (obj>>16) ^ (blkid>>16); 92 93 return (crc); 94 } 95 96 #define DBUF_HASH(os, obj, level, blkid) dbuf_hash(os, obj, level, blkid); 97 98 #define DBUF_EQUAL(dbuf, os, obj, level, blkid) \ 99 ((dbuf)->db.db_object == (obj) && \ 100 (dbuf)->db_objset == (os) && \ 101 (dbuf)->db_level == (level) && \ 102 (dbuf)->db_blkid == (blkid)) 103 104 dmu_buf_impl_t * 105 dbuf_find(dnode_t *dn, uint8_t level, uint64_t blkid) 106 { 107 dbuf_hash_table_t *h = &dbuf_hash_table; 108 objset_t *os = dn->dn_objset; 109 uint64_t obj = dn->dn_object; 110 uint64_t hv = DBUF_HASH(os, obj, level, blkid); 111 uint64_t idx = hv & h->hash_table_mask; 112 dmu_buf_impl_t *db; 113 114 
mutex_enter(DBUF_HASH_MUTEX(h, idx)); 115 for (db = h->hash_table[idx]; db != NULL; db = db->db_hash_next) { 116 if (DBUF_EQUAL(db, os, obj, level, blkid)) { 117 mutex_enter(&db->db_mtx); 118 if (db->db_state != DB_EVICTING) { 119 mutex_exit(DBUF_HASH_MUTEX(h, idx)); 120 return (db); 121 } 122 mutex_exit(&db->db_mtx); 123 } 124 } 125 mutex_exit(DBUF_HASH_MUTEX(h, idx)); 126 return (NULL); 127 } 128 129 /* 130 * Insert an entry into the hash table. If there is already an element 131 * equal to elem in the hash table, then the already existing element 132 * will be returned and the new element will not be inserted. 133 * Otherwise returns NULL. 134 */ 135 static dmu_buf_impl_t * 136 dbuf_hash_insert(dmu_buf_impl_t *db) 137 { 138 dbuf_hash_table_t *h = &dbuf_hash_table; 139 objset_t *os = db->db_objset; 140 uint64_t obj = db->db.db_object; 141 int level = db->db_level; 142 uint64_t blkid = db->db_blkid; 143 uint64_t hv = DBUF_HASH(os, obj, level, blkid); 144 uint64_t idx = hv & h->hash_table_mask; 145 dmu_buf_impl_t *dbf; 146 147 mutex_enter(DBUF_HASH_MUTEX(h, idx)); 148 for (dbf = h->hash_table[idx]; dbf != NULL; dbf = dbf->db_hash_next) { 149 if (DBUF_EQUAL(dbf, os, obj, level, blkid)) { 150 mutex_enter(&dbf->db_mtx); 151 if (dbf->db_state != DB_EVICTING) { 152 mutex_exit(DBUF_HASH_MUTEX(h, idx)); 153 return (dbf); 154 } 155 mutex_exit(&dbf->db_mtx); 156 } 157 } 158 159 mutex_enter(&db->db_mtx); 160 db->db_hash_next = h->hash_table[idx]; 161 h->hash_table[idx] = db; 162 mutex_exit(DBUF_HASH_MUTEX(h, idx)); 163 atomic_add_64(&dbuf_hash_count, 1); 164 165 return (NULL); 166 } 167 168 /* 169 * Remove an entry from the hash table. This operation will 170 * fail if there are any existing holds on the db. 
171 */ 172 static void 173 dbuf_hash_remove(dmu_buf_impl_t *db) 174 { 175 dbuf_hash_table_t *h = &dbuf_hash_table; 176 uint64_t hv = DBUF_HASH(db->db_objset, db->db.db_object, 177 db->db_level, db->db_blkid); 178 uint64_t idx = hv & h->hash_table_mask; 179 dmu_buf_impl_t *dbf, **dbp; 180 181 /* 182 * We musn't hold db_mtx to maintin lock ordering: 183 * DBUF_HASH_MUTEX > db_mtx. 184 */ 185 ASSERT(refcount_is_zero(&db->db_holds)); 186 ASSERT(db->db_state == DB_EVICTING); 187 ASSERT(!MUTEX_HELD(&db->db_mtx)); 188 189 mutex_enter(DBUF_HASH_MUTEX(h, idx)); 190 dbp = &h->hash_table[idx]; 191 while ((dbf = *dbp) != db) { 192 dbp = &dbf->db_hash_next; 193 ASSERT(dbf != NULL); 194 } 195 *dbp = db->db_hash_next; 196 db->db_hash_next = NULL; 197 mutex_exit(DBUF_HASH_MUTEX(h, idx)); 198 atomic_add_64(&dbuf_hash_count, -1); 199 } 200 201 static arc_evict_func_t dbuf_do_evict; 202 203 static void 204 dbuf_evict_user(dmu_buf_impl_t *db) 205 { 206 ASSERT(MUTEX_HELD(&db->db_mtx)); 207 208 if (db->db_level != 0 || db->db_evict_func == NULL) 209 return; 210 211 if (db->db_user_data_ptr_ptr) 212 *db->db_user_data_ptr_ptr = db->db.db_data; 213 db->db_evict_func(&db->db, db->db_user_ptr); 214 db->db_user_ptr = NULL; 215 db->db_user_data_ptr_ptr = NULL; 216 db->db_evict_func = NULL; 217 } 218 219 void 220 dbuf_evict(dmu_buf_impl_t *db) 221 { 222 ASSERT(MUTEX_HELD(&db->db_mtx)); 223 ASSERT(db->db_buf == NULL); 224 ASSERT(db->db_data_pending == NULL); 225 226 dbuf_clear(db); 227 dbuf_destroy(db); 228 } 229 230 void 231 dbuf_init(void) 232 { 233 uint64_t hsize = 1ULL << 16; 234 dbuf_hash_table_t *h = &dbuf_hash_table; 235 int i; 236 237 /* 238 * The hash table is big enough to fill all of physical memory 239 * with an average 4K block size. The table will take up 240 * totalmem*sizeof(void*)/4K (i.e. 2MB/GB with 8-byte pointers). 
	 */
	while (hsize * 4096 < physmem * PAGESIZE)
		hsize <<= 1;

retry:
	h->hash_table_mask = hsize - 1;
	h->hash_table = kmem_zalloc(hsize * sizeof (void *), KM_NOSLEEP);
	if (h->hash_table == NULL) {
		/* XXX - we should really return an error instead of assert */
		ASSERT(hsize > (1ULL << 10));
		/* halve the table and retry until the allocation fits */
		hsize >>= 1;
		goto retry;
	}

	dbuf_cache = kmem_cache_create("dmu_buf_impl_t",
	    sizeof (dmu_buf_impl_t),
	    0, dbuf_cons, dbuf_dest, NULL, NULL, NULL, 0);

	for (i = 0; i < DBUF_MUTEXES; i++)
		mutex_init(&h->hash_mutexes[i], NULL, MUTEX_DEFAULT, NULL);
}

/*
 * Tear down the dbuf hash table and kmem cache created by dbuf_init().
 */
void
dbuf_fini(void)
{
	dbuf_hash_table_t *h = &dbuf_hash_table;
	int i;

	for (i = 0; i < DBUF_MUTEXES; i++)
		mutex_destroy(&h->hash_mutexes[i]);
	kmem_free(h->hash_table, (h->hash_table_mask + 1) * sizeof (void *));
	kmem_cache_destroy(dbuf_cache);
}

/*
 * Other stuff.
 */

#ifdef ZFS_DEBUG
/*
 * Debug-only consistency checks on a dbuf: object/level/blkid agreement
 * with its dnode, dirty-record back-pointers, db_blkptr placement, and
 * "undirtied holes must be zero-filled" (see comment below).  Enabled
 * only when zfs_flags has ZFS_DEBUG_DBUF_VERIFY set.  Caller holds db_mtx.
 */
static void
dbuf_verify(dmu_buf_impl_t *db)
{
	dnode_t *dn = db->db_dnode;
	dbuf_dirty_record_t *dr;

	ASSERT(MUTEX_HELD(&db->db_mtx));

	if (!(zfs_flags & ZFS_DEBUG_DBUF_VERIFY))
		return;

	ASSERT(db->db_objset != NULL);
	if (dn == NULL) {
		ASSERT(db->db_parent == NULL);
		ASSERT(db->db_blkptr == NULL);
	} else {
		ASSERT3U(db->db.db_object, ==, dn->dn_object);
		ASSERT3P(db->db_objset, ==, dn->dn_objset);
		ASSERT3U(db->db_level, <, dn->dn_nlevels);
		ASSERT(db->db_blkid == DB_BONUS_BLKID ||
		    list_head(&dn->dn_dbufs));
	}
	if (db->db_blkid == DB_BONUS_BLKID) {
		ASSERT(dn != NULL);
		ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
		ASSERT3U(db->db.db_offset, ==, DB_BONUS_BLKID);
	} else {
		ASSERT3U(db->db.db_offset, ==, db->db_blkid * db->db.db_size);
	}

	for (dr = db->db_data_pending; dr != NULL; dr = dr->dr_next)
		ASSERT(dr->dr_dbuf == db);

	for (dr = db->db_last_dirty; dr != NULL; dr = dr->dr_next)
		ASSERT(dr->dr_dbuf == db);

	/*
	 * We can't assert that db_size matches dn_datablksz because it
	 * can be momentarily different when another thread is doing
	 * dnode_set_blksz().
	 */
	if (db->db_level == 0 && db->db.db_object == DMU_META_DNODE_OBJECT) {
		dr = db->db_data_pending;
		/*
		 * It should only be modified in syncing context, so
		 * make sure we only have one copy of the data.
		 */
		ASSERT(dr == NULL || dr->dt.dl.dr_data == db->db_buf);
	}

	/* verify db->db_blkptr */
	if (db->db_blkptr) {
		if (db->db_parent == dn->dn_dbuf) {
			/* db is pointed to by the dnode */
			/* ASSERT3U(db->db_blkid, <, dn->dn_nblkptr); */
			if (DMU_OBJECT_IS_SPECIAL(db->db.db_object))
				ASSERT(db->db_parent == NULL);
			else
				ASSERT(db->db_parent != NULL);
			ASSERT3P(db->db_blkptr, ==,
			    &dn->dn_phys->dn_blkptr[db->db_blkid]);
		} else {
			/* db is pointed to by an indirect block */
			int epb = db->db_parent->db.db_size >> SPA_BLKPTRSHIFT;
			ASSERT3U(db->db_parent->db_level, ==, db->db_level+1);
			ASSERT3U(db->db_parent->db.db_object, ==,
			    db->db.db_object);
			/*
			 * dnode_grow_indblksz() can make this fail if we don't
			 * have the struct_rwlock.  XXX indblksz no longer
			 * grows.  safe to do this now?
			 */
			if (RW_WRITE_HELD(&db->db_dnode->dn_struct_rwlock)) {
				ASSERT3P(db->db_blkptr, ==,
				    ((blkptr_t *)db->db_parent->db.db_data +
				    db->db_blkid % epb));
			}
		}
	}
	if ((db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr)) &&
	    db->db.db_data && db->db_blkid != DB_BONUS_BLKID &&
	    db->db_state != DB_FILL && !dn->dn_free_txg) {
		/*
		 * If the blkptr isn't set but they have nonzero data,
		 * it had better be dirty, otherwise we'll lose that
		 * data when we evict this buffer.
		 */
		if (db->db_dirtycnt == 0) {
			uint64_t *buf = db->db.db_data;
			int i;

			/* db_size >> 3: scan the buffer as 8-byte words */
			for (i = 0; i < db->db.db_size >> 3; i++) {
				ASSERT(buf[i] == 0);
			}
		}
	}
}
#endif

/*
 * Re-publish the current data pointer through the user's
 * db_user_data_ptr_ptr hook (level-0 buffers only).  Caller holds db_mtx.
 */
static void
dbuf_update_data(dmu_buf_impl_t *db)
{
	ASSERT(MUTEX_HELD(&db->db_mtx));
	if (db->db_level == 0 && db->db_user_data_ptr_ptr) {
		ASSERT(!refcount_is_zero(&db->db_holds));
		*db->db_user_data_ptr_ptr = db->db.db_data;
	}
}

/*
 * Attach (buf != NULL) or detach (buf == NULL) an ARC buffer as this
 * dbuf's data.  Detaching runs the user eviction callback and, unless
 * the dbuf is DB_NOFILL, marks it DB_UNCACHED.  Caller holds db_mtx.
 */
static void
dbuf_set_data(dmu_buf_impl_t *db, arc_buf_t *buf)
{
	ASSERT(MUTEX_HELD(&db->db_mtx));
	ASSERT(db->db_buf == NULL || !arc_has_callback(db->db_buf));
	db->db_buf = buf;
	if (buf != NULL) {
		ASSERT(buf->b_data != NULL);
		db->db.db_data = buf->b_data;
		if (!arc_released(buf))
			arc_set_callback(buf, dbuf_do_evict, db);
		dbuf_update_data(db);
	} else {
		dbuf_evict_user(db);
		db->db.db_data = NULL;
		if (db->db_state != DB_NOFILL)
			db->db_state = DB_UNCACHED;
	}
}

/*
 * Loan out an arc_buf for read.  Return the loaned arc_buf.
 */
arc_buf_t *
dbuf_loan_arcbuf(dmu_buf_impl_t *db)
{
	arc_buf_t *abuf;

	mutex_enter(&db->db_mtx);
	if (arc_released(db->db_buf) || refcount_count(&db->db_holds) > 1) {
		/* buffer is shared or released: hand back a private copy */
		int blksz = db->db.db_size;
		mutex_exit(&db->db_mtx);
		abuf = arc_loan_buf(db->db_dnode->dn_objset->os_spa, blksz);
		bcopy(db->db.db_data, abuf->b_data, blksz);
	} else {
		/* sole holder: loan out the dbuf's own buffer directly */
		abuf = db->db_buf;
		arc_loan_inuse_buf(abuf, db);
		dbuf_set_data(db, NULL);
		mutex_exit(&db->db_mtx);
	}
	return (abuf);
}

/*
 * Return the block number within the object that contains byte `offset'.
 * Objects with dn_datablkshift == 0 have a single (variable-size) block.
 */
uint64_t
dbuf_whichblock(dnode_t *dn, uint64_t offset)
{
	if (dn->dn_datablkshift) {
		return (offset >> dn->dn_datablkshift);
	} else {
		ASSERT3U(offset, <, dn->dn_datablksz);
		return (0);
	}
}

/*
 * ARC read completion callback: install the buffer (or zero-fill it if
 * the block was freed while the read was in flight), move the dbuf to
 * DB_CACHED/DB_UNCACHED, wake waiters, and drop the read's hold.
 */
static void
dbuf_read_done(zio_t *zio, arc_buf_t *buf, void *vdb)
{
	dmu_buf_impl_t *db = vdb;

	mutex_enter(&db->db_mtx);
	ASSERT3U(db->db_state, ==, DB_READ);
	/*
	 * All reads are synchronous, so we must have a hold on the dbuf
	 */
	ASSERT(refcount_count(&db->db_holds) > 0);
	ASSERT(db->db_buf == NULL);
	ASSERT(db->db.db_data == NULL);
	if (db->db_level == 0 && db->db_freed_in_flight) {
		/* we were freed in flight; disregard any error */
		arc_release(buf, db);
		bzero(buf->b_data, db->db.db_size);
		arc_buf_freeze(buf);
		db->db_freed_in_flight = FALSE;
		dbuf_set_data(db, buf);
		db->db_state = DB_CACHED;
	} else if (zio == NULL || zio->io_error == 0) {
		dbuf_set_data(db, buf);
		db->db_state = DB_CACHED;
	} else {
		/* read failed: drop the ARC buffer and stay uncached */
		ASSERT(db->db_blkid != DB_BONUS_BLKID);
		ASSERT3P(db->db_buf, ==, NULL);
		VERIFY(arc_buf_remove_ref(buf, db) == 1);
		db->db_state = DB_UNCACHED;
	}
	cv_broadcast(&db->db_changed);
	mutex_exit(&db->db_mtx);
	dbuf_rele(db, NULL);
}

/*
 * Start a read of this dbuf's data.  Bonus buffers and holes are
 * satisfied immediately (DB_CACHED); everything else is issued as an
 * ARC read with dbuf_read_done() as the callback.  Called with db_mtx
 * held; ALWAYS returns with db_mtx dropped.  May set DB_RF_CACHED in
 * *flags if the data was already resident.
 */
static void
dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags)
{
	dnode_t *dn = db->db_dnode;
	zbookmark_t zb;
	uint32_t aflags = ARC_NOWAIT;
	arc_buf_t *pbuf;

	ASSERT(!refcount_is_zero(&db->db_holds));
	/* We need the struct_rwlock to prevent db_blkptr from changing. */
	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
	ASSERT(MUTEX_HELD(&db->db_mtx));
	ASSERT(db->db_state == DB_UNCACHED);
	ASSERT(db->db_buf == NULL);

	if (db->db_blkid == DB_BONUS_BLKID) {
		int bonuslen = MIN(dn->dn_bonuslen, dn->dn_phys->dn_bonuslen);

		ASSERT3U(bonuslen, <=, db->db.db_size);
		db->db.db_data = zio_buf_alloc(DN_MAX_BONUSLEN);
		arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
		if (bonuslen < DN_MAX_BONUSLEN)
			bzero(db->db.db_data, DN_MAX_BONUSLEN);
		if (bonuslen)
			bcopy(DN_BONUS(dn->dn_phys), db->db.db_data, bonuslen);
		dbuf_update_data(db);
		db->db_state = DB_CACHED;
		mutex_exit(&db->db_mtx);
		return;
	}

	/*
	 * Recheck BP_IS_HOLE() after dnode_block_freed() in case dnode_sync()
	 * processes the delete record and clears the bp while we are waiting
	 * for the dn_mtx (resulting in a "no" from block_freed).
	 */
	if (db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr) ||
	    (db->db_level == 0 && (dnode_block_freed(dn, db->db_blkid) ||
	    BP_IS_HOLE(db->db_blkptr)))) {
		arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);

		dbuf_set_data(db, arc_buf_alloc(dn->dn_objset->os_spa,
		    db->db.db_size, db, type));
		bzero(db->db.db_data, db->db.db_size);
		db->db_state = DB_CACHED;
		*flags |= DB_RF_CACHED;
		mutex_exit(&db->db_mtx);
		return;
	}

	db->db_state = DB_READ;
	mutex_exit(&db->db_mtx);

	if (DBUF_IS_L2CACHEABLE(db))
		aflags |= ARC_L2CACHE;

	SET_BOOKMARK(&zb, db->db_objset->os_dsl_dataset ?
	    db->db_objset->os_dsl_dataset->ds_object : DMU_META_OBJSET,
	    db->db.db_object, db->db_level, db->db_blkid);

	dbuf_add_ref(db, NULL);
	/* ZIO_FLAG_CANFAIL callers have to check the parent zio's error */

	if (db->db_parent)
		pbuf = db->db_parent->db_buf;
	else
		pbuf = db->db_objset->os_phys_buf;

	(void) arc_read(zio, dn->dn_objset->os_spa, db->db_blkptr, pbuf,
	    dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ,
	    (*flags & DB_RF_CANFAIL) ? ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED,
	    &aflags, &zb);
	if (aflags & ARC_CACHED)
		*flags |= DB_RF_CACHED;
}

/*
 * Read this dbuf's data, honoring the DB_RF_* flags.  Returns 0 on
 * success or EIO if the dbuf is DB_NOFILL or the read failed.  If the
 * caller supplied a parent zio the read is merely issued on it and the
 * caller must wait (and check errors) itself; otherwise we wait here.
 */
int
dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
{
	int err = 0;
	int havepzio = (zio != NULL);
	int prefetch;

	/*
	 * We don't have to hold the mutex to check db_state because it
	 * can't be freed while we have a hold on the buffer.
	 */
	ASSERT(!refcount_is_zero(&db->db_holds));

	if (db->db_state == DB_NOFILL)
		return (EIO);

	if ((flags & DB_RF_HAVESTRUCT) == 0)
		rw_enter(&db->db_dnode->dn_struct_rwlock, RW_READER);

	prefetch = db->db_level == 0 && db->db_blkid != DB_BONUS_BLKID &&
	    (flags & DB_RF_NOPREFETCH) == 0 && db->db_dnode != NULL &&
	    DBUF_IS_CACHEABLE(db);

	mutex_enter(&db->db_mtx);
	if (db->db_state == DB_CACHED) {
		mutex_exit(&db->db_mtx);
		if (prefetch)
			dmu_zfetch(&db->db_dnode->dn_zfetch, db->db.db_offset,
			    db->db.db_size, TRUE);
		if ((flags & DB_RF_HAVESTRUCT) == 0)
			rw_exit(&db->db_dnode->dn_struct_rwlock);
	} else if (db->db_state == DB_UNCACHED) {
		if (zio == NULL) {
			zio = zio_root(db->db_dnode->dn_objset->os_spa,
			    NULL, NULL, ZIO_FLAG_CANFAIL);
		}
		dbuf_read_impl(db, zio, &flags);

		/* dbuf_read_impl has dropped db_mtx for us */

		if (prefetch)
			dmu_zfetch(&db->db_dnode->dn_zfetch, db->db.db_offset,
			    db->db.db_size, flags & DB_RF_CACHED);

		if ((flags & DB_RF_HAVESTRUCT) == 0)
			rw_exit(&db->db_dnode->dn_struct_rwlock);

		if (!havepzio)
			err = zio_wait(zio);
	} else {
		/* another thread's read/fill is in progress */
		mutex_exit(&db->db_mtx);
		if (prefetch)
			dmu_zfetch(&db->db_dnode->dn_zfetch, db->db.db_offset,
			    db->db.db_size, TRUE);
		if ((flags & DB_RF_HAVESTRUCT) == 0)
			rw_exit(&db->db_dnode->dn_struct_rwlock);

		mutex_enter(&db->db_mtx);
		if ((flags & DB_RF_NEVERWAIT) == 0) {
			while (db->db_state == DB_READ ||
			    db->db_state == DB_FILL) {
				ASSERT(db->db_state == DB_READ ||
				    (flags & DB_RF_HAVESTRUCT) == 0);
				cv_wait(&db->db_changed, &db->db_mtx);
			}
			if (db->db_state == DB_UNCACHED)
				err = EIO;
		}
		mutex_exit(&db->db_mtx);
	}

	ASSERT(err || havepzio || db->db_state == DB_CACHED);
	return (err);
}

/*
 * Prepare the dbuf for being completely overwritten: give it a fresh
 * (uninitialized) ARC buffer and mark it DB_FILL, without reading the
 * old contents from disk.  Waits out any in-flight read or fill first.
 */
static void
dbuf_noread(dmu_buf_impl_t *db)
{
	ASSERT(!refcount_is_zero(&db->db_holds));
	ASSERT(db->db_blkid != DB_BONUS_BLKID);
	mutex_enter(&db->db_mtx);
	while (db->db_state == DB_READ || db->db_state == DB_FILL)
		cv_wait(&db->db_changed, &db->db_mtx);
	if (db->db_state == DB_UNCACHED) {
		arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);

		ASSERT(db->db_buf == NULL);
		ASSERT(db->db.db_data == NULL);
		dbuf_set_data(db, arc_buf_alloc(db->db_dnode->dn_objset->os_spa,
		    db->db.db_size, db, type));
		db->db_state = DB_FILL;
	} else if (db->db_state == DB_NOFILL) {
		dbuf_set_data(db, NULL);
	} else {
		ASSERT3U(db->db_state, ==, DB_CACHED);
	}
	mutex_exit(&db->db_mtx);
}

/*
 * This is our just-in-time copy function.  It makes a copy of
 * buffers, that have been modified in a previous transaction
 * group, before we modify them in the current active group.
 *
 * This function is used in two places: when we are dirtying a
 * buffer for the first time in a txg, and when we are freeing
 * a range in a dnode that includes this buffer.
 *
 * Note that when we are called from dbuf_free_range() we do
 * not put a hold on the buffer, we just traverse the active
 * dbuf list for the dnode.
 */
static void
dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg)
{
	dbuf_dirty_record_t *dr = db->db_last_dirty;

	ASSERT(MUTEX_HELD(&db->db_mtx));
	ASSERT(db->db.db_data != NULL);
	ASSERT(db->db_level == 0);
	ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT);

	/* nothing to do unless the latest dirty record shares our data */
	if (dr == NULL ||
	    (dr->dt.dl.dr_data !=
	    ((db->db_blkid == DB_BONUS_BLKID) ? db->db.db_data : db->db_buf)))
		return;

	/*
	 * If the last dirty record for this dbuf has not yet synced
	 * and its referencing the dbuf data, either:
	 *	reset the reference to point to a new copy,
	 * or (if there a no active holders)
	 *	just null out the current db_data pointer.
	 */
	ASSERT(dr->dr_txg >= txg - 2);
	if (db->db_blkid == DB_BONUS_BLKID) {
		/* Note that the data bufs here are zio_bufs */
		dr->dt.dl.dr_data = zio_buf_alloc(DN_MAX_BONUSLEN);
		arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
		bcopy(db->db.db_data, dr->dt.dl.dr_data, DN_MAX_BONUSLEN);
	} else if (refcount_count(&db->db_holds) > db->db_dirtycnt) {
		/* active holders beyond the dirty records: give dr a copy */
		int size = db->db.db_size;
		arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
		dr->dt.dl.dr_data = arc_buf_alloc(
		    db->db_dnode->dn_objset->os_spa, size, db, type);
		bcopy(db->db.db_data, dr->dt.dl.dr_data->b_data, size);
	} else {
		/* no extra holders: dr keeps the buffer, dbuf lets go */
		dbuf_set_data(db, NULL);
	}
}

/*
 * Undo a dmu_sync()-style override on this dirty record: free the
 * already-written block and return the record to DR_NOT_OVERRIDDEN.
 * Caller holds db_mtx.
 */
void
dbuf_unoverride(dbuf_dirty_record_t *dr)
{
	dmu_buf_impl_t *db = dr->dr_dbuf;
	blkptr_t *bp = &dr->dt.dl.dr_overridden_by;
	uint64_t txg = dr->dr_txg;

	ASSERT(MUTEX_HELD(&db->db_mtx));
	ASSERT(dr->dt.dl.dr_override_state != DR_IN_DMU_SYNC);
	ASSERT(db->db_level == 0);

	if (db->db_blkid == DB_BONUS_BLKID ||
	    dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN)
		return;

	ASSERT(db->db_data_pending != dr);

	/* free this block */
	if (!BP_IS_HOLE(bp))
		dsl_free(spa_get_dsl(db->db_dnode->dn_objset->os_spa), txg, bp);

	dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
	/*
	 * Release the already-written buffer, so we leave it in
	 * a consistent dirty state.  Note that all callers are
	 * modifying the buffer, so they will immediately do
	 * another (redundant) arc_release().  Therefore, leave
	 * the buf thawed to save the effort of freezing &
	 * immediately re-thawing it.
	 */
	arc_release(dr->dt.dl.dr_data, db);
}

/*
 * Evict (if its unreferenced) or clear (if its referenced) any level-0
 * data blocks in the free range, so that any future readers will find
 * empty blocks.  Also, if we happen across any level-1 dbufs in the
 * range that have not already been marked dirty, mark them dirty so
 * they stay in memory.
 */
void
dbuf_free_range(dnode_t *dn, uint64_t start, uint64_t end, dmu_tx_t *tx)
{
	dmu_buf_impl_t *db, *db_next;
	uint64_t txg = tx->tx_txg;
	int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
	uint64_t first_l1 = start >> epbs;
	uint64_t last_l1 = end >> epbs;

	if (end > dn->dn_maxblkid) {
		end = dn->dn_maxblkid;
		last_l1 = end >> epbs;
	}
	dprintf_dnode(dn, "start=%llu end=%llu\n", start, end);
	mutex_enter(&dn->dn_dbufs_mtx);
	for (db = list_head(&dn->dn_dbufs); db; db = db_next) {
		db_next = list_next(&dn->dn_dbufs, db);
		ASSERT(db->db_blkid != DB_BONUS_BLKID);

		if (db->db_level == 1 &&
		    db->db_blkid >= first_l1 && db->db_blkid <= last_l1) {
			mutex_enter(&db->db_mtx);
			if (db->db_last_dirty &&
			    db->db_last_dirty->dr_txg < txg) {
				/* keep this L1 resident by dirtying it */
				dbuf_add_ref(db, FTAG);
				mutex_exit(&db->db_mtx);
				dbuf_will_dirty(db, tx);
				dbuf_rele(db, FTAG);
			} else {
				mutex_exit(&db->db_mtx);
			}
		}

		if (db->db_level != 0)
			continue;
		dprintf_dbuf(db, "found buf %s\n", "");
		if (db->db_blkid < start || db->db_blkid > end)
			continue;

		/* found a level 0 buffer in the range */
		if (dbuf_undirty(db, tx))
			continue;

		mutex_enter(&db->db_mtx);
		if (db->db_state == DB_UNCACHED ||
		    db->db_state == DB_NOFILL ||
		    db->db_state == DB_EVICTING) {
			ASSERT(db->db.db_data == NULL);
			mutex_exit(&db->db_mtx);
			continue;
		}
		if (db->db_state == DB_READ || db->db_state == DB_FILL) {
			/* will be handled in dbuf_read_done or dbuf_rele */
			db->db_freed_in_flight = TRUE;
			mutex_exit(&db->db_mtx);
			continue;
		}
		if (refcount_count(&db->db_holds) == 0) {
			ASSERT(db->db_buf);
			dbuf_clear(db);
			continue;
		}
		/* The dbuf is referenced */

		if (db->db_last_dirty != NULL) {
			dbuf_dirty_record_t *dr = db->db_last_dirty;

			if (dr->dr_txg == txg) {
				/*
				 * This buffer is "in-use", re-adjust the file
				 * size to reflect that this buffer may
				 * contain new data when we sync.
				 */
				if (db->db_blkid > dn->dn_maxblkid)
					dn->dn_maxblkid = db->db_blkid;
				dbuf_unoverride(dr);
			} else {
				/*
				 * This dbuf is not dirty in the open context.
				 * Either uncache it (if its not referenced in
				 * the open context) or reset its contents to
				 * empty.
				 */
				dbuf_fix_old_data(db, txg);
			}
		}
		/* clear the contents if its cached */
		if (db->db_state == DB_CACHED) {
			ASSERT(db->db.db_data != NULL);
			arc_release(db->db_buf, db);
			bzero(db->db.db_data, db->db.db_size);
			arc_buf_freeze(db->db_buf);
		}

		mutex_exit(&db->db_mtx);
	}
	mutex_exit(&dn->dn_dbufs_mtx);
}

/*
 * Return TRUE if freeing this block would actually release space,
 * i.e. the block exists (has a birth txg) and is not pinned by a
 * snapshot.  Caller holds db_mtx.
 */
static int
dbuf_block_freeable(dmu_buf_impl_t *db)
{
	dsl_dataset_t *ds = db->db_objset->os_dsl_dataset;
	uint64_t birth_txg = 0;

	/*
	 * We don't need any locking to protect db_blkptr:
	 * If it's syncing, then db_last_dirty will be set
	 * so we'll ignore db_blkptr.
	 */
	ASSERT(MUTEX_HELD(&db->db_mtx));
	if (db->db_last_dirty)
		birth_txg = db->db_last_dirty->dr_txg;
	else if (db->db_blkptr)
		birth_txg = db->db_blkptr->blk_birth;

	/* If we don't exist or are in a snapshot, we can't be freed */
	if (birth_txg)
		return (ds == NULL ||
		    dsl_dataset_block_freeable(ds, birth_txg));
	else
		return (FALSE);
}

/*
 * Resize this dbuf's data buffer to `size' bytes, copying the old
 * contents and zeroing any growth.  The dbuf is dirtied in `tx' and the
 * new buffer becomes the current txg's dirty data.  Caller holds the
 * dnode's struct_rwlock as writer.
 */
void
dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx)
{
	arc_buf_t *buf, *obuf;
	int osize = db->db.db_size;
	arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);

	ASSERT(db->db_blkid != DB_BONUS_BLKID);

	/* XXX does *this* func really need the lock? */
	ASSERT(RW_WRITE_HELD(&db->db_dnode->dn_struct_rwlock));

	/*
	 * This call to dbuf_will_dirty() with the dn_struct_rwlock held
	 * is OK, because there can be no other references to the db
	 * when we are changing its size, so no concurrent DB_FILL can
	 * be happening.
	 */
	/*
	 * XXX we should be doing a dbuf_read, checking the return
	 * value and returning that up to our callers
	 */
	dbuf_will_dirty(db, tx);

	/* create the data buffer for the new block */
	buf = arc_buf_alloc(db->db_dnode->dn_objset->os_spa, size, db, type);

	/* copy old block data to the new block */
	obuf = db->db_buf;
	bcopy(obuf->b_data, buf->b_data, MIN(osize, size));
	/* zero the remainder */
	if (size > osize)
		bzero((uint8_t *)buf->b_data + osize, size - osize);

	mutex_enter(&db->db_mtx);
	dbuf_set_data(db, buf);
	VERIFY(arc_buf_remove_ref(obuf, db) == 1);
	db->db.db_size = size;

	if (db->db_level == 0) {
		ASSERT3U(db->db_last_dirty->dr_txg, ==, tx->tx_txg);
		db->db_last_dirty->dt.dl.dr_data = buf;
	}
	mutex_exit(&db->db_mtx);

	dnode_willuse_space(db->db_dnode, size-osize, tx);
}

/*
 * Mark this dbuf dirty in transaction `tx': create (or find) the
 * dbuf_dirty_record_t for tx's txg, make a just-in-time copy of data
 * still referenced by an older txg, account for the space, and
 * recursively dirty the parent indirect (or attach to the dnode's
 * dirty-record list at the top level).  Returns the dirty record.
 */
dbuf_dirty_record_t *
dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
{
	dnode_t *dn = db->db_dnode;
	objset_t *os = dn->dn_objset;
	dbuf_dirty_record_t **drp, *dr;
	int drop_struct_lock = FALSE;
	boolean_t do_free_accounting = B_FALSE;
	int txgoff = tx->tx_txg & TXG_MASK;

	ASSERT(tx->tx_txg != 0);
	ASSERT(!refcount_is_zero(&db->db_holds));
	DMU_TX_DIRTY_BUF(tx, db);

	/*
	 * Shouldn't dirty a regular buffer in syncing context.  Private
	 * objects may be dirtied in syncing context, but only if they
	 * were already pre-dirtied in open context.
	 */
	ASSERT(!dmu_tx_is_syncing(tx) ||
	    BP_IS_HOLE(dn->dn_objset->os_rootbp) ||
	    DMU_OBJECT_IS_SPECIAL(dn->dn_object) ||
	    dn->dn_objset->os_dsl_dataset == NULL);
	/*
	 * We make this assert for private objects as well, but after we
	 * check if we're already dirty.  They are allowed to re-dirty
	 * in syncing context.
	 */
	ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT ||
	    dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx ==
	    (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN));

	mutex_enter(&db->db_mtx);
	/*
	 * XXX make this true for indirects too?  The problem is that
	 * transactions created with dmu_tx_create_assigned() from
	 * syncing context don't bother holding ahead.
	 */
	ASSERT(db->db_level != 0 ||
	    db->db_state == DB_CACHED || db->db_state == DB_FILL ||
	    db->db_state == DB_NOFILL);

	mutex_enter(&dn->dn_mtx);
	/*
	 * Don't set dirtyctx to SYNC if we're just modifying this as we
	 * initialize the objset.
	 */
	if (dn->dn_dirtyctx == DN_UNDIRTIED &&
	    !BP_IS_HOLE(dn->dn_objset->os_rootbp)) {
		dn->dn_dirtyctx =
		    (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN);
		ASSERT(dn->dn_dirtyctx_firstset == NULL);
		dn->dn_dirtyctx_firstset = kmem_alloc(1, KM_SLEEP);
	}
	mutex_exit(&dn->dn_mtx);

	/*
	 * If this buffer is already dirty, we're done.
	 */
	drp = &db->db_last_dirty;
	ASSERT(*drp == NULL || (*drp)->dr_txg <= tx->tx_txg ||
	    db->db.db_object == DMU_META_DNODE_OBJECT);
	/* walk the (txg-descending) dirty list to our txg's position */
	while ((dr = *drp) != NULL && dr->dr_txg > tx->tx_txg)
		drp = &dr->dr_next;
	if (dr && dr->dr_txg == tx->tx_txg) {
		if (db->db_level == 0 && db->db_blkid != DB_BONUS_BLKID) {
			/*
			 * If this buffer has already been written out,
			 * we now need to reset its state.
			 */
			dbuf_unoverride(dr);
			if (db->db.db_object != DMU_META_DNODE_OBJECT &&
			    db->db_state != DB_NOFILL)
				arc_buf_thaw(db->db_buf);
		}
		mutex_exit(&db->db_mtx);
		return (dr);
	}

	/*
	 * Only valid if not already dirty.
	 */
	ASSERT(dn->dn_object == 0 ||
	    dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx ==
	    (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN));

	ASSERT3U(dn->dn_nlevels, >, db->db_level);
	ASSERT((dn->dn_phys->dn_nlevels == 0 && db->db_level == 0) ||
	    dn->dn_phys->dn_nlevels > db->db_level ||
	    dn->dn_next_nlevels[txgoff] > db->db_level ||
	    dn->dn_next_nlevels[(tx->tx_txg-1) & TXG_MASK] > db->db_level ||
	    dn->dn_next_nlevels[(tx->tx_txg-2) & TXG_MASK] > db->db_level);

	/*
	 * We should only be dirtying in syncing context if it's the
	 * mos or we're initializing the os or it's a special object.
	 * However, we are allowed to dirty in syncing context provided
	 * we already dirtied it in open context.  Hence we must make
	 * this assertion only if we're not already dirty.
	 */
	ASSERT(!dmu_tx_is_syncing(tx) || DMU_OBJECT_IS_SPECIAL(dn->dn_object) ||
	    os->os_dsl_dataset == NULL || BP_IS_HOLE(os->os_rootbp));
	ASSERT(db->db.db_size != 0);

	dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);

	if (db->db_blkid != DB_BONUS_BLKID) {
		/*
		 * Update the accounting.
		 * Note: we delay "free accounting" until after we drop
		 * the db_mtx.  This keeps us from grabbing other locks
		 * (and possibly deadlocking) in bp_get_dsize() while
		 * also holding the db_mtx.
		 */
		dnode_willuse_space(dn, db->db.db_size, tx);
		do_free_accounting = dbuf_block_freeable(db);
	}

	/*
	 * If this buffer is dirty in an old transaction group we need
	 * to make a copy of it so that the changes we make in this
	 * transaction group won't leak out when we sync the older txg.
	 */
	dr = kmem_zalloc(sizeof (dbuf_dirty_record_t), KM_SLEEP);
	if (db->db_level == 0) {
		void *data_old = db->db_buf;

		if (db->db_state != DB_NOFILL) {
			if (db->db_blkid == DB_BONUS_BLKID) {
				dbuf_fix_old_data(db, tx->tx_txg);
				data_old = db->db.db_data;
			} else if (db->db.db_object != DMU_META_DNODE_OBJECT) {
				/*
				 * Release the data buffer from the cache so
				 * that we can modify it without impacting
				 * possible other users of this cached data
				 * block.  Note that indirect blocks and
				 * private objects are not released until the
				 * syncing state (since they are only modified
				 * then).
				 */
				arc_release(db->db_buf, db);
				dbuf_fix_old_data(db, tx->tx_txg);
				data_old = db->db_buf;
			}
			ASSERT(data_old != NULL);
		}
		dr->dt.dl.dr_data = data_old;
	} else {
		mutex_init(&dr->dt.di.dr_mtx, NULL, MUTEX_DEFAULT, NULL);
		list_create(&dr->dt.di.dr_children,
		    sizeof (dbuf_dirty_record_t),
		    offsetof(dbuf_dirty_record_t, dr_dirty_node));
	}
	dr->dr_dbuf = db;
	dr->dr_txg = tx->tx_txg;
	dr->dr_next = *drp;
	*drp = dr;

	/*
	 * We could have been freed_in_flight between the dbuf_noread
	 * and dbuf_dirty.  We win, as though the dbuf_noread() had
	 * happened after the free.
	 */
	if (db->db_level == 0 && db->db_blkid != DB_BONUS_BLKID) {
		mutex_enter(&dn->dn_mtx);
		dnode_clear_range(dn, db->db_blkid, 1, tx);
		mutex_exit(&dn->dn_mtx);
		db->db_freed_in_flight = FALSE;
	}

	/*
	 * This buffer is now part of this txg
	 */
	dbuf_add_ref(db, (void *)(uintptr_t)tx->tx_txg);
	db->db_dirtycnt += 1;
	/* at most TXG_CONCURRENT_STATES (3) txgs can be dirty at once */
	ASSERT3U(db->db_dirtycnt, <=, 3);

	mutex_exit(&db->db_mtx);

	if (db->db_blkid == DB_BONUS_BLKID) {
		mutex_enter(&dn->dn_mtx);
		ASSERT(!list_link_active(&dr->dr_dirty_node));
		list_insert_tail(&dn->dn_dirty_records[txgoff], dr);
		mutex_exit(&dn->dn_mtx);
		dnode_setdirty(dn, tx);
		return (dr);
	} else if (do_free_accounting) {
		blkptr_t *bp = db->db_blkptr;
		int64_t willfree = (bp && !BP_IS_HOLE(bp)) ?
		    bp_get_dsize(os->os_spa, bp) : db->db.db_size;
		/*
		 * This is only a guess -- if the dbuf is dirty
		 * in a previous txg, we don't know how much
		 * space it will use on disk yet.  We should
		 * really have the struct_rwlock to access
		 * db_blkptr, but since this is just a guess,
		 * it's OK if we get an odd answer.
1115 */ 1116 dnode_willuse_space(dn, -willfree, tx); 1117 } 1118 1119 if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) { 1120 rw_enter(&dn->dn_struct_rwlock, RW_READER); 1121 drop_struct_lock = TRUE; 1122 } 1123 1124 if (db->db_level == 0) { 1125 dnode_new_blkid(dn, db->db_blkid, tx, drop_struct_lock); 1126 ASSERT(dn->dn_maxblkid >= db->db_blkid); 1127 } 1128 1129 if (db->db_level+1 < dn->dn_nlevels) { 1130 dmu_buf_impl_t *parent = db->db_parent; 1131 dbuf_dirty_record_t *di; 1132 int parent_held = FALSE; 1133 1134 if (db->db_parent == NULL || db->db_parent == dn->dn_dbuf) { 1135 int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; 1136 1137 parent = dbuf_hold_level(dn, db->db_level+1, 1138 db->db_blkid >> epbs, FTAG); 1139 parent_held = TRUE; 1140 } 1141 if (drop_struct_lock) 1142 rw_exit(&dn->dn_struct_rwlock); 1143 ASSERT3U(db->db_level+1, ==, parent->db_level); 1144 di = dbuf_dirty(parent, tx); 1145 if (parent_held) 1146 dbuf_rele(parent, FTAG); 1147 1148 mutex_enter(&db->db_mtx); 1149 /* possible race with dbuf_undirty() */ 1150 if (db->db_last_dirty == dr || 1151 dn->dn_object == DMU_META_DNODE_OBJECT) { 1152 mutex_enter(&di->dt.di.dr_mtx); 1153 ASSERT3U(di->dr_txg, ==, tx->tx_txg); 1154 ASSERT(!list_link_active(&dr->dr_dirty_node)); 1155 list_insert_tail(&di->dt.di.dr_children, dr); 1156 mutex_exit(&di->dt.di.dr_mtx); 1157 dr->dr_parent = di; 1158 } 1159 mutex_exit(&db->db_mtx); 1160 } else { 1161 ASSERT(db->db_level+1 == dn->dn_nlevels); 1162 ASSERT(db->db_blkid < dn->dn_nblkptr); 1163 ASSERT(db->db_parent == NULL || 1164 db->db_parent == db->db_dnode->dn_dbuf); 1165 mutex_enter(&dn->dn_mtx); 1166 ASSERT(!list_link_active(&dr->dr_dirty_node)); 1167 list_insert_tail(&dn->dn_dirty_records[txgoff], dr); 1168 mutex_exit(&dn->dn_mtx); 1169 if (drop_struct_lock) 1170 rw_exit(&dn->dn_struct_rwlock); 1171 } 1172 1173 dnode_setdirty(dn, tx); 1174 return (dr); 1175 } 1176 1177 static int 1178 dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) 1179 { 1180 dnode_t *dn = 
db->db_dnode; 1181 uint64_t txg = tx->tx_txg; 1182 dbuf_dirty_record_t *dr, **drp; 1183 1184 ASSERT(txg != 0); 1185 ASSERT(db->db_blkid != DB_BONUS_BLKID); 1186 1187 mutex_enter(&db->db_mtx); 1188 /* 1189 * If this buffer is not dirty, we're done. 1190 */ 1191 for (drp = &db->db_last_dirty; (dr = *drp) != NULL; drp = &dr->dr_next) 1192 if (dr->dr_txg <= txg) 1193 break; 1194 if (dr == NULL || dr->dr_txg < txg) { 1195 mutex_exit(&db->db_mtx); 1196 return (0); 1197 } 1198 ASSERT(dr->dr_txg == txg); 1199 ASSERT(dr->dr_dbuf == db); 1200 1201 /* 1202 * If this buffer is currently held, we cannot undirty 1203 * it, since one of the current holders may be in the 1204 * middle of an update. Note that users of dbuf_undirty() 1205 * should not place a hold on the dbuf before the call. 1206 */ 1207 if (refcount_count(&db->db_holds) > db->db_dirtycnt) { 1208 mutex_exit(&db->db_mtx); 1209 /* Make sure we don't toss this buffer at sync phase */ 1210 mutex_enter(&dn->dn_mtx); 1211 dnode_clear_range(dn, db->db_blkid, 1, tx); 1212 mutex_exit(&dn->dn_mtx); 1213 return (0); 1214 } 1215 1216 dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size); 1217 1218 ASSERT(db->db.db_size != 0); 1219 1220 /* XXX would be nice to fix up dn_towrite_space[] */ 1221 1222 *drp = dr->dr_next; 1223 1224 if (dr->dr_parent) { 1225 mutex_enter(&dr->dr_parent->dt.di.dr_mtx); 1226 list_remove(&dr->dr_parent->dt.di.dr_children, dr); 1227 mutex_exit(&dr->dr_parent->dt.di.dr_mtx); 1228 } else if (db->db_level+1 == dn->dn_nlevels) { 1229 ASSERT(db->db_blkptr == NULL || db->db_parent == dn->dn_dbuf); 1230 mutex_enter(&dn->dn_mtx); 1231 list_remove(&dn->dn_dirty_records[txg & TXG_MASK], dr); 1232 mutex_exit(&dn->dn_mtx); 1233 } 1234 1235 if (db->db_level == 0) { 1236 if (db->db_state != DB_NOFILL) { 1237 dbuf_unoverride(dr); 1238 1239 ASSERT(db->db_buf != NULL); 1240 ASSERT(dr->dt.dl.dr_data != NULL); 1241 if (dr->dt.dl.dr_data != db->db_buf) 1242 VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data, 1243 db) == 
1); 1244 } 1245 } else { 1246 ASSERT(db->db_buf != NULL); 1247 ASSERT(list_head(&dr->dt.di.dr_children) == NULL); 1248 mutex_destroy(&dr->dt.di.dr_mtx); 1249 list_destroy(&dr->dt.di.dr_children); 1250 } 1251 kmem_free(dr, sizeof (dbuf_dirty_record_t)); 1252 1253 ASSERT(db->db_dirtycnt > 0); 1254 db->db_dirtycnt -= 1; 1255 1256 if (refcount_remove(&db->db_holds, (void *)(uintptr_t)txg) == 0) { 1257 arc_buf_t *buf = db->db_buf; 1258 1259 ASSERT(db->db_state == DB_NOFILL || arc_released(buf)); 1260 dbuf_set_data(db, NULL); 1261 VERIFY(arc_buf_remove_ref(buf, db) == 1); 1262 dbuf_evict(db); 1263 return (1); 1264 } 1265 1266 mutex_exit(&db->db_mtx); 1267 return (0); 1268 } 1269 1270 #pragma weak dmu_buf_will_dirty = dbuf_will_dirty 1271 void 1272 dbuf_will_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) 1273 { 1274 int rf = DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH; 1275 1276 ASSERT(tx->tx_txg != 0); 1277 ASSERT(!refcount_is_zero(&db->db_holds)); 1278 1279 if (RW_WRITE_HELD(&db->db_dnode->dn_struct_rwlock)) 1280 rf |= DB_RF_HAVESTRUCT; 1281 (void) dbuf_read(db, NULL, rf); 1282 (void) dbuf_dirty(db, tx); 1283 } 1284 1285 void 1286 dmu_buf_will_not_fill(dmu_buf_t *db_fake, dmu_tx_t *tx) 1287 { 1288 dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; 1289 1290 db->db_state = DB_NOFILL; 1291 1292 dmu_buf_will_fill(db_fake, tx); 1293 } 1294 1295 void 1296 dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx) 1297 { 1298 dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; 1299 1300 ASSERT(db->db_blkid != DB_BONUS_BLKID); 1301 ASSERT(tx->tx_txg != 0); 1302 ASSERT(db->db_level == 0); 1303 ASSERT(!refcount_is_zero(&db->db_holds)); 1304 1305 ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT || 1306 dmu_tx_private_ok(tx)); 1307 1308 dbuf_noread(db); 1309 (void) dbuf_dirty(db, tx); 1310 } 1311 1312 #pragma weak dmu_buf_fill_done = dbuf_fill_done 1313 /* ARGSUSED */ 1314 void 1315 dbuf_fill_done(dmu_buf_impl_t *db, dmu_tx_t *tx) 1316 { 1317 mutex_enter(&db->db_mtx); 1318 DBUF_VERIFY(db); 1319 1320 if 
(db->db_state == DB_FILL) { 1321 if (db->db_level == 0 && db->db_freed_in_flight) { 1322 ASSERT(db->db_blkid != DB_BONUS_BLKID); 1323 /* we were freed while filling */ 1324 /* XXX dbuf_undirty? */ 1325 bzero(db->db.db_data, db->db.db_size); 1326 db->db_freed_in_flight = FALSE; 1327 } 1328 db->db_state = DB_CACHED; 1329 cv_broadcast(&db->db_changed); 1330 } 1331 mutex_exit(&db->db_mtx); 1332 } 1333 1334 /* 1335 * Directly assign a provided arc buf to a given dbuf if it's not referenced 1336 * by anybody except our caller. Otherwise copy arcbuf's contents to dbuf. 1337 */ 1338 void 1339 dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx) 1340 { 1341 ASSERT(!refcount_is_zero(&db->db_holds)); 1342 ASSERT(db->db_dnode->dn_object != DMU_META_DNODE_OBJECT); 1343 ASSERT(db->db_blkid != DB_BONUS_BLKID); 1344 ASSERT(db->db_level == 0); 1345 ASSERT(DBUF_GET_BUFC_TYPE(db) == ARC_BUFC_DATA); 1346 ASSERT(buf != NULL); 1347 ASSERT(arc_buf_size(buf) == db->db.db_size); 1348 ASSERT(tx->tx_txg != 0); 1349 1350 arc_return_buf(buf, db); 1351 ASSERT(arc_released(buf)); 1352 1353 mutex_enter(&db->db_mtx); 1354 1355 while (db->db_state == DB_READ || db->db_state == DB_FILL) 1356 cv_wait(&db->db_changed, &db->db_mtx); 1357 1358 ASSERT(db->db_state == DB_CACHED || db->db_state == DB_UNCACHED); 1359 1360 if (db->db_state == DB_CACHED && 1361 refcount_count(&db->db_holds) - 1 > db->db_dirtycnt) { 1362 mutex_exit(&db->db_mtx); 1363 (void) dbuf_dirty(db, tx); 1364 bcopy(buf->b_data, db->db.db_data, db->db.db_size); 1365 VERIFY(arc_buf_remove_ref(buf, db) == 1); 1366 xuio_stat_wbuf_copied(); 1367 return; 1368 } 1369 1370 xuio_stat_wbuf_nocopy(); 1371 if (db->db_state == DB_CACHED) { 1372 dbuf_dirty_record_t *dr = db->db_last_dirty; 1373 1374 ASSERT(db->db_buf != NULL); 1375 if (dr != NULL && dr->dr_txg == tx->tx_txg) { 1376 ASSERT(dr->dt.dl.dr_data == db->db_buf); 1377 if (!arc_released(db->db_buf)) { 1378 ASSERT(dr->dt.dl.dr_override_state == 1379 DR_OVERRIDDEN); 1380 
arc_release(db->db_buf, db); 1381 } 1382 dr->dt.dl.dr_data = buf; 1383 VERIFY(arc_buf_remove_ref(db->db_buf, db) == 1); 1384 } else if (dr == NULL || dr->dt.dl.dr_data != db->db_buf) { 1385 arc_release(db->db_buf, db); 1386 VERIFY(arc_buf_remove_ref(db->db_buf, db) == 1); 1387 } 1388 db->db_buf = NULL; 1389 } 1390 ASSERT(db->db_buf == NULL); 1391 dbuf_set_data(db, buf); 1392 db->db_state = DB_FILL; 1393 mutex_exit(&db->db_mtx); 1394 (void) dbuf_dirty(db, tx); 1395 dbuf_fill_done(db, tx); 1396 } 1397 1398 /* 1399 * "Clear" the contents of this dbuf. This will mark the dbuf 1400 * EVICTING and clear *most* of its references. Unfortunetely, 1401 * when we are not holding the dn_dbufs_mtx, we can't clear the 1402 * entry in the dn_dbufs list. We have to wait until dbuf_destroy() 1403 * in this case. For callers from the DMU we will usually see: 1404 * dbuf_clear()->arc_buf_evict()->dbuf_do_evict()->dbuf_destroy() 1405 * For the arc callback, we will usually see: 1406 * dbuf_do_evict()->dbuf_clear();dbuf_destroy() 1407 * Sometimes, though, we will get a mix of these two: 1408 * DMU: dbuf_clear()->arc_buf_evict() 1409 * ARC: dbuf_do_evict()->dbuf_destroy() 1410 */ 1411 void 1412 dbuf_clear(dmu_buf_impl_t *db) 1413 { 1414 dnode_t *dn = db->db_dnode; 1415 dmu_buf_impl_t *parent = db->db_parent; 1416 dmu_buf_impl_t *dndb = dn->dn_dbuf; 1417 int dbuf_gone = FALSE; 1418 1419 ASSERT(MUTEX_HELD(&db->db_mtx)); 1420 ASSERT(refcount_is_zero(&db->db_holds)); 1421 1422 dbuf_evict_user(db); 1423 1424 if (db->db_state == DB_CACHED) { 1425 ASSERT(db->db.db_data != NULL); 1426 if (db->db_blkid == DB_BONUS_BLKID) { 1427 zio_buf_free(db->db.db_data, DN_MAX_BONUSLEN); 1428 arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER); 1429 } 1430 db->db.db_data = NULL; 1431 db->db_state = DB_UNCACHED; 1432 } 1433 1434 ASSERT(db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL); 1435 ASSERT(db->db_data_pending == NULL); 1436 1437 db->db_state = DB_EVICTING; 1438 db->db_blkptr = NULL; 1439 1440 if 
(db->db_blkid != DB_BONUS_BLKID && MUTEX_HELD(&dn->dn_dbufs_mtx)) { 1441 list_remove(&dn->dn_dbufs, db); 1442 dnode_rele(dn, db); 1443 db->db_dnode = NULL; 1444 } 1445 1446 if (db->db_buf) 1447 dbuf_gone = arc_buf_evict(db->db_buf); 1448 1449 if (!dbuf_gone) 1450 mutex_exit(&db->db_mtx); 1451 1452 /* 1453 * If this dbuf is referened from an indirect dbuf, 1454 * decrement the ref count on the indirect dbuf. 1455 */ 1456 if (parent && parent != dndb) 1457 dbuf_rele(parent, db); 1458 } 1459 1460 static int 1461 dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse, 1462 dmu_buf_impl_t **parentp, blkptr_t **bpp) 1463 { 1464 int nlevels, epbs; 1465 1466 *parentp = NULL; 1467 *bpp = NULL; 1468 1469 ASSERT(blkid != DB_BONUS_BLKID); 1470 1471 if (dn->dn_phys->dn_nlevels == 0) 1472 nlevels = 1; 1473 else 1474 nlevels = dn->dn_phys->dn_nlevels; 1475 1476 epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; 1477 1478 ASSERT3U(level * epbs, <, 64); 1479 ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); 1480 if (level >= nlevels || 1481 (blkid > (dn->dn_phys->dn_maxblkid >> (level * epbs)))) { 1482 /* the buffer has no parent yet */ 1483 return (ENOENT); 1484 } else if (level < nlevels-1) { 1485 /* this block is referenced from an indirect block */ 1486 int err = dbuf_hold_impl(dn, level+1, 1487 blkid >> epbs, fail_sparse, NULL, parentp); 1488 if (err) 1489 return (err); 1490 err = dbuf_read(*parentp, NULL, 1491 (DB_RF_HAVESTRUCT | DB_RF_NOPREFETCH | DB_RF_CANFAIL)); 1492 if (err) { 1493 dbuf_rele(*parentp, NULL); 1494 *parentp = NULL; 1495 return (err); 1496 } 1497 *bpp = ((blkptr_t *)(*parentp)->db.db_data) + 1498 (blkid & ((1ULL << epbs) - 1)); 1499 return (0); 1500 } else { 1501 /* the block is referenced from the dnode */ 1502 ASSERT3U(level, ==, nlevels-1); 1503 ASSERT(dn->dn_phys->dn_nblkptr == 0 || 1504 blkid < dn->dn_phys->dn_nblkptr); 1505 if (dn->dn_dbuf) { 1506 dbuf_add_ref(dn->dn_dbuf, NULL); 1507 *parentp = dn->dn_dbuf; 1508 } 1509 *bpp = 
&dn->dn_phys->dn_blkptr[blkid]; 1510 return (0); 1511 } 1512 } 1513 1514 static dmu_buf_impl_t * 1515 dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid, 1516 dmu_buf_impl_t *parent, blkptr_t *blkptr) 1517 { 1518 objset_t *os = dn->dn_objset; 1519 dmu_buf_impl_t *db, *odb; 1520 1521 ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); 1522 ASSERT(dn->dn_type != DMU_OT_NONE); 1523 1524 db = kmem_cache_alloc(dbuf_cache, KM_SLEEP); 1525 1526 db->db_objset = os; 1527 db->db.db_object = dn->dn_object; 1528 db->db_level = level; 1529 db->db_blkid = blkid; 1530 db->db_last_dirty = NULL; 1531 db->db_dirtycnt = 0; 1532 db->db_dnode = dn; 1533 db->db_parent = parent; 1534 db->db_blkptr = blkptr; 1535 1536 db->db_user_ptr = NULL; 1537 db->db_user_data_ptr_ptr = NULL; 1538 db->db_evict_func = NULL; 1539 db->db_immediate_evict = 0; 1540 db->db_freed_in_flight = 0; 1541 1542 if (blkid == DB_BONUS_BLKID) { 1543 ASSERT3P(parent, ==, dn->dn_dbuf); 1544 db->db.db_size = DN_MAX_BONUSLEN - 1545 (dn->dn_nblkptr-1) * sizeof (blkptr_t); 1546 ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen); 1547 db->db.db_offset = DB_BONUS_BLKID; 1548 db->db_state = DB_UNCACHED; 1549 /* the bonus dbuf is not placed in the hash table */ 1550 arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER); 1551 return (db); 1552 } else { 1553 int blocksize = 1554 db->db_level ? 1<<dn->dn_indblkshift : dn->dn_datablksz; 1555 db->db.db_size = blocksize; 1556 db->db.db_offset = db->db_blkid * blocksize; 1557 } 1558 1559 /* 1560 * Hold the dn_dbufs_mtx while we get the new dbuf 1561 * in the hash table *and* added to the dbufs list. 1562 * This prevents a possible deadlock with someone 1563 * trying to look up this dbuf before its added to the 1564 * dn_dbufs list. 
1565 */ 1566 mutex_enter(&dn->dn_dbufs_mtx); 1567 db->db_state = DB_EVICTING; 1568 if ((odb = dbuf_hash_insert(db)) != NULL) { 1569 /* someone else inserted it first */ 1570 kmem_cache_free(dbuf_cache, db); 1571 mutex_exit(&dn->dn_dbufs_mtx); 1572 return (odb); 1573 } 1574 list_insert_head(&dn->dn_dbufs, db); 1575 db->db_state = DB_UNCACHED; 1576 mutex_exit(&dn->dn_dbufs_mtx); 1577 arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER); 1578 1579 if (parent && parent != dn->dn_dbuf) 1580 dbuf_add_ref(parent, db); 1581 1582 ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT || 1583 refcount_count(&dn->dn_holds) > 0); 1584 (void) refcount_add(&dn->dn_holds, db); 1585 1586 dprintf_dbuf(db, "db=%p\n", db); 1587 1588 return (db); 1589 } 1590 1591 static int 1592 dbuf_do_evict(void *private) 1593 { 1594 arc_buf_t *buf = private; 1595 dmu_buf_impl_t *db = buf->b_private; 1596 1597 if (!MUTEX_HELD(&db->db_mtx)) 1598 mutex_enter(&db->db_mtx); 1599 1600 ASSERT(refcount_is_zero(&db->db_holds)); 1601 1602 if (db->db_state != DB_EVICTING) { 1603 ASSERT(db->db_state == DB_CACHED); 1604 DBUF_VERIFY(db); 1605 db->db_buf = NULL; 1606 dbuf_evict(db); 1607 } else { 1608 mutex_exit(&db->db_mtx); 1609 dbuf_destroy(db); 1610 } 1611 return (0); 1612 } 1613 1614 static void 1615 dbuf_destroy(dmu_buf_impl_t *db) 1616 { 1617 ASSERT(refcount_is_zero(&db->db_holds)); 1618 1619 if (db->db_blkid != DB_BONUS_BLKID) { 1620 /* 1621 * If this dbuf is still on the dn_dbufs list, 1622 * remove it from that list. 
1623 */ 1624 if (db->db_dnode) { 1625 dnode_t *dn = db->db_dnode; 1626 1627 mutex_enter(&dn->dn_dbufs_mtx); 1628 list_remove(&dn->dn_dbufs, db); 1629 mutex_exit(&dn->dn_dbufs_mtx); 1630 1631 dnode_rele(dn, db); 1632 db->db_dnode = NULL; 1633 } 1634 dbuf_hash_remove(db); 1635 } 1636 db->db_parent = NULL; 1637 db->db_buf = NULL; 1638 1639 ASSERT(!list_link_active(&db->db_link)); 1640 ASSERT(db->db.db_data == NULL); 1641 ASSERT(db->db_hash_next == NULL); 1642 ASSERT(db->db_blkptr == NULL); 1643 ASSERT(db->db_data_pending == NULL); 1644 1645 kmem_cache_free(dbuf_cache, db); 1646 arc_space_return(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER); 1647 } 1648 1649 void 1650 dbuf_prefetch(dnode_t *dn, uint64_t blkid) 1651 { 1652 dmu_buf_impl_t *db = NULL; 1653 blkptr_t *bp = NULL; 1654 1655 ASSERT(blkid != DB_BONUS_BLKID); 1656 ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); 1657 1658 if (dnode_block_freed(dn, blkid)) 1659 return; 1660 1661 /* dbuf_find() returns with db_mtx held */ 1662 if (db = dbuf_find(dn, 0, blkid)) { 1663 if (refcount_count(&db->db_holds) > 0) { 1664 /* 1665 * This dbuf is active. We assume that it is 1666 * already CACHED, or else about to be either 1667 * read or filled. 1668 */ 1669 mutex_exit(&db->db_mtx); 1670 return; 1671 } 1672 mutex_exit(&db->db_mtx); 1673 db = NULL; 1674 } 1675 1676 if (dbuf_findbp(dn, 0, blkid, TRUE, &db, &bp) == 0) { 1677 if (bp && !BP_IS_HOLE(bp)) { 1678 arc_buf_t *pbuf; 1679 dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset; 1680 uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH; 1681 zbookmark_t zb; 1682 1683 SET_BOOKMARK(&zb, ds ? 
ds->ds_object : DMU_META_OBJSET, 1684 dn->dn_object, 0, blkid); 1685 1686 if (db) 1687 pbuf = db->db_buf; 1688 else 1689 pbuf = dn->dn_objset->os_phys_buf; 1690 1691 (void) arc_read(NULL, dn->dn_objset->os_spa, 1692 bp, pbuf, NULL, NULL, ZIO_PRIORITY_ASYNC_READ, 1693 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, 1694 &aflags, &zb); 1695 } 1696 if (db) 1697 dbuf_rele(db, NULL); 1698 } 1699 } 1700 1701 /* 1702 * Returns with db_holds incremented, and db_mtx not held. 1703 * Note: dn_struct_rwlock must be held. 1704 */ 1705 int 1706 dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, int fail_sparse, 1707 void *tag, dmu_buf_impl_t **dbp) 1708 { 1709 dmu_buf_impl_t *db, *parent = NULL; 1710 1711 ASSERT(blkid != DB_BONUS_BLKID); 1712 ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); 1713 ASSERT3U(dn->dn_nlevels, >, level); 1714 1715 *dbp = NULL; 1716 top: 1717 /* dbuf_find() returns with db_mtx held */ 1718 db = dbuf_find(dn, level, blkid); 1719 1720 if (db == NULL) { 1721 blkptr_t *bp = NULL; 1722 int err; 1723 1724 ASSERT3P(parent, ==, NULL); 1725 err = dbuf_findbp(dn, level, blkid, fail_sparse, &parent, &bp); 1726 if (fail_sparse) { 1727 if (err == 0 && bp && BP_IS_HOLE(bp)) 1728 err = ENOENT; 1729 if (err) { 1730 if (parent) 1731 dbuf_rele(parent, NULL); 1732 return (err); 1733 } 1734 } 1735 if (err && err != ENOENT) 1736 return (err); 1737 db = dbuf_create(dn, level, blkid, parent, bp); 1738 } 1739 1740 if (db->db_buf && refcount_is_zero(&db->db_holds)) { 1741 arc_buf_add_ref(db->db_buf, db); 1742 if (db->db_buf->b_data == NULL) { 1743 dbuf_clear(db); 1744 if (parent) { 1745 dbuf_rele(parent, NULL); 1746 parent = NULL; 1747 } 1748 goto top; 1749 } 1750 ASSERT3P(db->db.db_data, ==, db->db_buf->b_data); 1751 } 1752 1753 ASSERT(db->db_buf == NULL || arc_referenced(db->db_buf)); 1754 1755 /* 1756 * If this buffer is currently syncing out, and we are are 1757 * still referencing it from db_data, we need to make a copy 1758 * of it in case we decide we want to dirty it 
again in this txg. 1759 */ 1760 if (db->db_level == 0 && db->db_blkid != DB_BONUS_BLKID && 1761 dn->dn_object != DMU_META_DNODE_OBJECT && 1762 db->db_state == DB_CACHED && db->db_data_pending) { 1763 dbuf_dirty_record_t *dr = db->db_data_pending; 1764 1765 if (dr->dt.dl.dr_data == db->db_buf) { 1766 arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); 1767 1768 dbuf_set_data(db, 1769 arc_buf_alloc(db->db_dnode->dn_objset->os_spa, 1770 db->db.db_size, db, type)); 1771 bcopy(dr->dt.dl.dr_data->b_data, db->db.db_data, 1772 db->db.db_size); 1773 } 1774 } 1775 1776 (void) refcount_add(&db->db_holds, tag); 1777 dbuf_update_data(db); 1778 DBUF_VERIFY(db); 1779 mutex_exit(&db->db_mtx); 1780 1781 /* NOTE: we can't rele the parent until after we drop the db_mtx */ 1782 if (parent) 1783 dbuf_rele(parent, NULL); 1784 1785 ASSERT3P(db->db_dnode, ==, dn); 1786 ASSERT3U(db->db_blkid, ==, blkid); 1787 ASSERT3U(db->db_level, ==, level); 1788 *dbp = db; 1789 1790 return (0); 1791 } 1792 1793 dmu_buf_impl_t * 1794 dbuf_hold(dnode_t *dn, uint64_t blkid, void *tag) 1795 { 1796 dmu_buf_impl_t *db; 1797 int err = dbuf_hold_impl(dn, 0, blkid, FALSE, tag, &db); 1798 return (err ? NULL : db); 1799 } 1800 1801 dmu_buf_impl_t * 1802 dbuf_hold_level(dnode_t *dn, int level, uint64_t blkid, void *tag) 1803 { 1804 dmu_buf_impl_t *db; 1805 int err = dbuf_hold_impl(dn, level, blkid, FALSE, tag, &db); 1806 return (err ? 
NULL : db); 1807 } 1808 1809 void 1810 dbuf_create_bonus(dnode_t *dn) 1811 { 1812 ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock)); 1813 1814 ASSERT(dn->dn_bonus == NULL); 1815 dn->dn_bonus = dbuf_create(dn, 0, DB_BONUS_BLKID, dn->dn_dbuf, NULL); 1816 } 1817 1818 #pragma weak dmu_buf_add_ref = dbuf_add_ref 1819 void 1820 dbuf_add_ref(dmu_buf_impl_t *db, void *tag) 1821 { 1822 int64_t holds = refcount_add(&db->db_holds, tag); 1823 ASSERT(holds > 1); 1824 } 1825 1826 #pragma weak dmu_buf_rele = dbuf_rele 1827 void 1828 dbuf_rele(dmu_buf_impl_t *db, void *tag) 1829 { 1830 mutex_enter(&db->db_mtx); 1831 dbuf_rele_and_unlock(db, tag); 1832 } 1833 1834 /* 1835 * dbuf_rele() for an already-locked dbuf. This is necessary to allow 1836 * db_dirtycnt and db_holds to be updated atomically. 1837 */ 1838 void 1839 dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag) 1840 { 1841 int64_t holds; 1842 1843 ASSERT(MUTEX_HELD(&db->db_mtx)); 1844 DBUF_VERIFY(db); 1845 1846 holds = refcount_remove(&db->db_holds, tag); 1847 ASSERT(holds >= 0); 1848 1849 /* 1850 * We can't freeze indirects if there is a possibility that they 1851 * may be modified in the current syncing context. 1852 */ 1853 if (db->db_buf && holds == (db->db_level == 0 ? db->db_dirtycnt : 0)) 1854 arc_buf_freeze(db->db_buf); 1855 1856 if (holds == db->db_dirtycnt && 1857 db->db_level == 0 && db->db_immediate_evict) 1858 dbuf_evict_user(db); 1859 1860 if (holds == 0) { 1861 if (db->db_blkid == DB_BONUS_BLKID) { 1862 mutex_exit(&db->db_mtx); 1863 dnode_rele(db->db_dnode, db); 1864 } else if (db->db_buf == NULL) { 1865 /* 1866 * This is a special case: we never associated this 1867 * dbuf with any data allocated from the ARC. 1868 */ 1869 ASSERT(db->db_state == DB_UNCACHED || 1870 db->db_state == DB_NOFILL); 1871 dbuf_evict(db); 1872 } else if (arc_released(db->db_buf)) { 1873 arc_buf_t *buf = db->db_buf; 1874 /* 1875 * This dbuf has anonymous data associated with it. 
1876 */ 1877 dbuf_set_data(db, NULL); 1878 VERIFY(arc_buf_remove_ref(buf, db) == 1); 1879 dbuf_evict(db); 1880 } else { 1881 VERIFY(arc_buf_remove_ref(db->db_buf, db) == 0); 1882 if (!DBUF_IS_CACHEABLE(db)) 1883 dbuf_clear(db); 1884 else 1885 mutex_exit(&db->db_mtx); 1886 } 1887 } else { 1888 mutex_exit(&db->db_mtx); 1889 } 1890 } 1891 1892 #pragma weak dmu_buf_refcount = dbuf_refcount 1893 uint64_t 1894 dbuf_refcount(dmu_buf_impl_t *db) 1895 { 1896 return (refcount_count(&db->db_holds)); 1897 } 1898 1899 void * 1900 dmu_buf_set_user(dmu_buf_t *db_fake, void *user_ptr, void *user_data_ptr_ptr, 1901 dmu_buf_evict_func_t *evict_func) 1902 { 1903 return (dmu_buf_update_user(db_fake, NULL, user_ptr, 1904 user_data_ptr_ptr, evict_func)); 1905 } 1906 1907 void * 1908 dmu_buf_set_user_ie(dmu_buf_t *db_fake, void *user_ptr, void *user_data_ptr_ptr, 1909 dmu_buf_evict_func_t *evict_func) 1910 { 1911 dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; 1912 1913 db->db_immediate_evict = TRUE; 1914 return (dmu_buf_update_user(db_fake, NULL, user_ptr, 1915 user_data_ptr_ptr, evict_func)); 1916 } 1917 1918 void * 1919 dmu_buf_update_user(dmu_buf_t *db_fake, void *old_user_ptr, void *user_ptr, 1920 void *user_data_ptr_ptr, dmu_buf_evict_func_t *evict_func) 1921 { 1922 dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; 1923 ASSERT(db->db_level == 0); 1924 1925 ASSERT((user_ptr == NULL) == (evict_func == NULL)); 1926 1927 mutex_enter(&db->db_mtx); 1928 1929 if (db->db_user_ptr == old_user_ptr) { 1930 db->db_user_ptr = user_ptr; 1931 db->db_user_data_ptr_ptr = user_data_ptr_ptr; 1932 db->db_evict_func = evict_func; 1933 1934 dbuf_update_data(db); 1935 } else { 1936 old_user_ptr = db->db_user_ptr; 1937 } 1938 1939 mutex_exit(&db->db_mtx); 1940 return (old_user_ptr); 1941 } 1942 1943 void * 1944 dmu_buf_get_user(dmu_buf_t *db_fake) 1945 { 1946 dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; 1947 ASSERT(!refcount_is_zero(&db->db_holds)); 1948 1949 return (db->db_user_ptr); 1950 } 1951 1952 
boolean_t 1953 dmu_buf_freeable(dmu_buf_t *dbuf) 1954 { 1955 boolean_t res = B_FALSE; 1956 dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf; 1957 1958 if (db->db_blkptr) 1959 res = dsl_dataset_block_freeable(db->db_objset->os_dsl_dataset, 1960 db->db_blkptr->blk_birth); 1961 1962 return (res); 1963 } 1964 1965 static void 1966 dbuf_check_blkptr(dnode_t *dn, dmu_buf_impl_t *db) 1967 { 1968 /* ASSERT(dmu_tx_is_syncing(tx) */ 1969 ASSERT(MUTEX_HELD(&db->db_mtx)); 1970 1971 if (db->db_blkptr != NULL) 1972 return; 1973 1974 if (db->db_level == dn->dn_phys->dn_nlevels-1) { 1975 /* 1976 * This buffer was allocated at a time when there was 1977 * no available blkptrs from the dnode, or it was 1978 * inappropriate to hook it in (i.e., nlevels mis-match). 1979 */ 1980 ASSERT(db->db_blkid < dn->dn_phys->dn_nblkptr); 1981 ASSERT(db->db_parent == NULL); 1982 db->db_parent = dn->dn_dbuf; 1983 db->db_blkptr = &dn->dn_phys->dn_blkptr[db->db_blkid]; 1984 DBUF_VERIFY(db); 1985 } else { 1986 dmu_buf_impl_t *parent = db->db_parent; 1987 int epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; 1988 1989 ASSERT(dn->dn_phys->dn_nlevels > 1); 1990 if (parent == NULL) { 1991 mutex_exit(&db->db_mtx); 1992 rw_enter(&dn->dn_struct_rwlock, RW_READER); 1993 (void) dbuf_hold_impl(dn, db->db_level+1, 1994 db->db_blkid >> epbs, FALSE, db, &parent); 1995 rw_exit(&dn->dn_struct_rwlock); 1996 mutex_enter(&db->db_mtx); 1997 db->db_parent = parent; 1998 } 1999 db->db_blkptr = (blkptr_t *)parent->db.db_data + 2000 (db->db_blkid & ((1ULL << epbs) - 1)); 2001 DBUF_VERIFY(db); 2002 } 2003 } 2004 2005 static void 2006 dbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx) 2007 { 2008 dmu_buf_impl_t *db = dr->dr_dbuf; 2009 dnode_t *dn = db->db_dnode; 2010 zio_t *zio; 2011 2012 ASSERT(dmu_tx_is_syncing(tx)); 2013 2014 dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr); 2015 2016 mutex_enter(&db->db_mtx); 2017 2018 ASSERT(db->db_level > 0); 2019 DBUF_VERIFY(db); 2020 2021 if (db->db_buf == NULL) { 
2022 mutex_exit(&db->db_mtx); 2023 (void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED); 2024 mutex_enter(&db->db_mtx); 2025 } 2026 ASSERT3U(db->db_state, ==, DB_CACHED); 2027 ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift); 2028 ASSERT(db->db_buf != NULL); 2029 2030 dbuf_check_blkptr(dn, db); 2031 2032 db->db_data_pending = dr; 2033 2034 mutex_exit(&db->db_mtx); 2035 dbuf_write(dr, db->db_buf, tx); 2036 2037 zio = dr->dr_zio; 2038 mutex_enter(&dr->dt.di.dr_mtx); 2039 dbuf_sync_list(&dr->dt.di.dr_children, tx); 2040 ASSERT(list_head(&dr->dt.di.dr_children) == NULL); 2041 mutex_exit(&dr->dt.di.dr_mtx); 2042 zio_nowait(zio); 2043 } 2044 2045 static void 2046 dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx) 2047 { 2048 arc_buf_t **datap = &dr->dt.dl.dr_data; 2049 dmu_buf_impl_t *db = dr->dr_dbuf; 2050 dnode_t *dn = db->db_dnode; 2051 objset_t *os = dn->dn_objset; 2052 uint64_t txg = tx->tx_txg; 2053 2054 ASSERT(dmu_tx_is_syncing(tx)); 2055 2056 dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr); 2057 2058 mutex_enter(&db->db_mtx); 2059 /* 2060 * To be synced, we must be dirtied. But we 2061 * might have been freed after the dirty. 2062 */ 2063 if (db->db_state == DB_UNCACHED) { 2064 /* This buffer has been freed since it was dirtied */ 2065 ASSERT(db->db.db_data == NULL); 2066 } else if (db->db_state == DB_FILL) { 2067 /* This buffer was freed and is now being re-filled */ 2068 ASSERT(db->db.db_data != dr->dt.dl.dr_data); 2069 } else { 2070 ASSERT(db->db_state == DB_CACHED || db->db_state == DB_NOFILL); 2071 } 2072 DBUF_VERIFY(db); 2073 2074 /* 2075 * If this is a bonus buffer, simply copy the bonus data into the 2076 * dnode. It will be written out when the dnode is synced (and it 2077 * will be synced, since it must have been dirty for dbuf_sync to 2078 * be called). 
2079 */ 2080 if (db->db_blkid == DB_BONUS_BLKID) { 2081 dbuf_dirty_record_t **drp; 2082 2083 ASSERT(*datap != NULL); 2084 ASSERT3U(db->db_level, ==, 0); 2085 ASSERT3U(dn->dn_phys->dn_bonuslen, <=, DN_MAX_BONUSLEN); 2086 bcopy(*datap, DN_BONUS(dn->dn_phys), dn->dn_phys->dn_bonuslen); 2087 if (*datap != db->db.db_data) { 2088 zio_buf_free(*datap, DN_MAX_BONUSLEN); 2089 arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER); 2090 } 2091 db->db_data_pending = NULL; 2092 drp = &db->db_last_dirty; 2093 while (*drp != dr) 2094 drp = &(*drp)->dr_next; 2095 ASSERT(dr->dr_next == NULL); 2096 ASSERT(dr->dr_dbuf == db); 2097 *drp = dr->dr_next; 2098 kmem_free(dr, sizeof (dbuf_dirty_record_t)); 2099 ASSERT(db->db_dirtycnt > 0); 2100 db->db_dirtycnt -= 1; 2101 dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg); 2102 return; 2103 } 2104 2105 /* 2106 * This function may have dropped the db_mtx lock allowing a dmu_sync 2107 * operation to sneak in. As a result, we need to ensure that we 2108 * don't check the dr_override_state until we have returned from 2109 * dbuf_check_blkptr. 2110 */ 2111 dbuf_check_blkptr(dn, db); 2112 2113 /* 2114 * If this buffer is in the middle of an immdiate write, 2115 * wait for the synchronous IO to complete. 2116 */ 2117 while (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC) { 2118 ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT); 2119 cv_wait(&db->db_changed, &db->db_mtx); 2120 ASSERT(dr->dt.dl.dr_override_state != DR_NOT_OVERRIDDEN); 2121 } 2122 2123 if (db->db_state != DB_NOFILL && 2124 dn->dn_object != DMU_META_DNODE_OBJECT && 2125 refcount_count(&db->db_holds) > 1 && 2126 dr->dt.dl.dr_override_state != DR_OVERRIDDEN && 2127 *datap == db->db_buf) { 2128 /* 2129 * If this buffer is currently "in use" (i.e., there 2130 * are active holds and db_data still references it), 2131 * then make a copy before we start the write so that 2132 * any modifications from the open txg will not leak 2133 * into this write. 
2134 * 2135 * NOTE: this copy does not need to be made for 2136 * objects only modified in the syncing context (e.g. 2137 * DNONE_DNODE blocks). 2138 */ 2139 int blksz = arc_buf_size(*datap); 2140 arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); 2141 *datap = arc_buf_alloc(os->os_spa, blksz, db, type); 2142 bcopy(db->db.db_data, (*datap)->b_data, blksz); 2143 } 2144 db->db_data_pending = dr; 2145 2146 mutex_exit(&db->db_mtx); 2147 2148 dbuf_write(dr, *datap, tx); 2149 2150 ASSERT(!list_link_active(&dr->dr_dirty_node)); 2151 if (dn->dn_object == DMU_META_DNODE_OBJECT) 2152 list_insert_tail(&dn->dn_dirty_records[txg&TXG_MASK], dr); 2153 else 2154 zio_nowait(dr->dr_zio); 2155 } 2156 2157 void 2158 dbuf_sync_list(list_t *list, dmu_tx_t *tx) 2159 { 2160 dbuf_dirty_record_t *dr; 2161 2162 while (dr = list_head(list)) { 2163 if (dr->dr_zio != NULL) { 2164 /* 2165 * If we find an already initialized zio then we 2166 * are processing the meta-dnode, and we have finished. 2167 * The dbufs for all dnodes are put back on the list 2168 * during processing, so that we can zio_wait() 2169 * these IOs after initiating all child IOs. 
			 */
			ASSERT3U(dr->dr_dbuf->db.db_object, ==,
			    DMU_META_DNODE_OBJECT);
			break;
		}
		list_remove(list, dr);
		/* Indirect blocks and leaf blocks sync via different paths. */
		if (dr->dr_dbuf->db_level > 0)
			dbuf_sync_indirect(dr, tx);
		else
			dbuf_sync_leaf(dr, tx);
	}
}

/*
 * "Ready" callback for a dbuf write zio: the new block pointer has been
 * filled in (db->db_blkptr == zio->io_bp), so update the dnode's space
 * accounting and compute the fill count stored in the block pointer.
 */
/* ARGSUSED */
static void
dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
{
	dmu_buf_impl_t *db = vdb;
	blkptr_t *bp = zio->io_bp;
	blkptr_t *bp_orig = &zio->io_bp_orig;
	dnode_t *dn = db->db_dnode;
	spa_t *spa = zio->io_spa;
	int64_t delta;
	uint64_t fill = 0;
	int i;

	ASSERT(db->db_blkptr == bp);

	/*
	 * Charge the dnode for the change in allocated size between the
	 * new and original block pointers.  io_prev_space_delta records
	 * what has already been charged, so re-invocation only applies
	 * the difference.
	 */
	delta = bp_get_dsize_sync(spa, bp) - bp_get_dsize_sync(spa, bp_orig);
	dnode_diduse_space(dn, delta - zio->io_prev_space_delta);
	zio->io_prev_space_delta = delta;

	if (BP_IS_HOLE(bp)) {
		/* A hole has no fill count to maintain. */
		ASSERT(bp->blk_fill == 0);
		return;
	}

	ASSERT(BP_GET_TYPE(bp) == dn->dn_type);
	ASSERT(BP_GET_LEVEL(bp) == db->db_level);

	mutex_enter(&db->db_mtx);

	if (db->db_level == 0) {
		mutex_enter(&dn->dn_mtx);
		/* Grow the dnode's max block id if this write extends it. */
		if (db->db_blkid > dn->dn_phys->dn_maxblkid)
			dn->dn_phys->dn_maxblkid = db->db_blkid;
		mutex_exit(&dn->dn_mtx);

		if (dn->dn_type == DMU_OT_DNODE) {
			/* Dnode block: fill = number of allocated dnodes. */
			dnode_phys_t *dnp = db->db.db_data;
			for (i = db->db.db_size >> DNODE_SHIFT; i > 0;
			    i--, dnp++) {
				if (dnp->dn_type != DMU_OT_NONE)
					fill++;
			}
		} else {
			/* Ordinary data block: fill is simply 1. */
			fill = 1;
		}
	} else {
		/* Indirect block: fill = sum of children's fill counts. */
		blkptr_t *ibp = db->db.db_data;
		ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
		for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, ibp++) {
			if (BP_IS_HOLE(ibp))
				continue;
			fill += ibp->blk_fill;
		}
	}

	bp->blk_fill = fill;

	mutex_exit(&db->db_mtx);
}

/*
 * "Done" callback for a dbuf write zio: perform dataset block accounting
 * (kill the old bp, birth the new one) unless this was a rewrite, unlink
 * and free the dirty record being synced, and release the hold taken for
 * this write (continues past this span with dbuf_rele_and_unlock()).
 */
/* ARGSUSED */
static void
dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
{
	dmu_buf_impl_t *db = vdb;
	blkptr_t *bp = zio->io_bp;
	blkptr_t *bp_orig = &zio->io_bp_orig;
	dnode_t *dn = db->db_dnode;
	objset_t *os = dn->dn_objset;
	uint64_t txg = zio->io_txg;
	dbuf_dirty_record_t **drp, *dr;

	ASSERT3U(zio->io_error, ==, 0);
	ASSERT(db->db_blkptr == bp);

	if (zio->io_flags & ZIO_FLAG_IO_REWRITE) {
		/* A rewrite must land on the same block pointer. */
		ASSERT(BP_EQUAL(bp, bp_orig));
	} else {
		/* Account the old block as freed, the new one as born. */
		dsl_dataset_t *ds = os->os_dsl_dataset;
		dmu_tx_t *tx = os->os_synctx;

		(void) dsl_dataset_block_kill(ds, bp_orig, tx, B_TRUE);
		dsl_dataset_block_born(ds, bp, tx);
	}

	mutex_enter(&db->db_mtx);

	DBUF_VERIFY(db);

	/* Locate and unlink the dirty record this write was syncing. */
	drp = &db->db_last_dirty;
	while ((dr = *drp) != db->db_data_pending)
		drp = &dr->dr_next;
	ASSERT(!list_link_active(&dr->dr_dirty_node));
	ASSERT(dr->dr_txg == txg);
	ASSERT(dr->dr_dbuf == db);
	ASSERT(dr->dr_next == NULL);
	*drp = dr->dr_next;

	if (db->db_level == 0) {
		ASSERT(db->db_blkid != DB_BONUS_BLKID);
		ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN);
		if (db->db_state != DB_NOFILL) {
			if (dr->dt.dl.dr_data != db->db_buf)
				/* Drop the private copy made for the write. */
				VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data,
				    db) == 1);
			else if (!arc_released(db->db_buf))
				arc_set_callback(db->db_buf, dbuf_do_evict, db);
		}
	} else {
		ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
		ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
		if (!BP_IS_HOLE(db->db_blkptr)) {
			int epbs =
			    dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
			ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==,
			    db->db.db_size);
			ASSERT3U(dn->dn_phys->dn_maxblkid
			    >> (db->db_level * epbs), >=, db->db_blkid);
			arc_set_callback(db->db_buf, dbuf_do_evict, db);
		}
		mutex_destroy(&dr->dt.di.dr_mtx);
		list_destroy(&dr->dt.di.dr_children);
	}
	kmem_free(dr, sizeof (dbuf_dirty_record_t));

	/* Wake any thread waiting on this dbuf's state change. */
	cv_broadcast(&db->db_changed);
	ASSERT(db->db_dirtycnt > 0);
	db->db_dirtycnt -= 1;
	db->db_data_pending = NULL;
	dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg);
}

/* Nofill writes pass the dbuf itself as io_private. */
static void
dbuf_write_nofill_ready(zio_t *zio)
{
	dbuf_write_ready(zio, NULL, zio->io_private);
}

static void
dbuf_write_nofill_done(zio_t *zio)
{
	dbuf_write_done(zio, NULL, zio->io_private);
}

/* Override writes pass the dirty record as io_private. */
static void
dbuf_write_override_ready(zio_t *zio)
{
	dbuf_dirty_record_t *dr = zio->io_private;
	dmu_buf_impl_t *db = dr->dr_dbuf;

	dbuf_write_ready(zio, NULL, db);
}

static void
dbuf_write_override_done(zio_t *zio)
{
	dbuf_dirty_record_t *dr = zio->io_private;
	dmu_buf_impl_t *db = dr->dr_dbuf;
	blkptr_t *obp = &dr->dt.dl.dr_overridden_by;

	mutex_enter(&db->db_mtx);
	if (!BP_EQUAL(zio->io_bp, obp)) {
		/*
		 * The write did not land on the overriding block pointer:
		 * free the overriding bp (unless it is a hole) and release
		 * the dirty record's buffer back to the ARC.
		 */
		if (!BP_IS_HOLE(obp))
			dsl_free(spa_get_dsl(zio->io_spa), zio->io_txg, obp);
		arc_release(dr->dt.dl.dr_data, db);
	}
	mutex_exit(&db->db_mtx);

	dbuf_write_done(zio, NULL, db);
}

/*
 * Create (but do not issue) the write zio for a dirty record, chaining
 * it under the parent dbuf's zio (or the dnode's zio for a top-level
 * block).  The resulting zio is stored in dr->dr_zio; the caller decides
 * when to zio_nowait() it.
 */
static void
dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
{
	dmu_buf_impl_t *db = dr->dr_dbuf;
	dnode_t *dn = db->db_dnode;
	objset_t *os = dn->dn_objset;
	dmu_buf_impl_t *parent = db->db_parent;
	uint64_t txg = tx->tx_txg;
	zbookmark_t zb;
	zio_prop_t zp;
	zio_t *zio;

	if (db->db_state != DB_NOFILL) {
		if (db->db_level > 0 || dn->dn_type == DMU_OT_DNODE) {
			/*
			 * Private object buffers are released here rather
			 * than in dbuf_dirty() since they are only modified
			 * in the syncing context and we don't want the
			 * overhead of making multiple copies of the data.
			 */
			if (BP_IS_HOLE(db->db_blkptr)) {
				arc_buf_thaw(data);
			} else {
				arc_release(data, db);
			}
		}
	}

	/* Chain this write under its parent's pending zio, if any. */
	if (parent != dn->dn_dbuf) {
		ASSERT(parent && parent->db_data_pending);
		ASSERT(db->db_level == parent->db_level-1);
		ASSERT(arc_released(parent->db_buf));
		zio = parent->db_data_pending->dr_zio;
	} else {
		/* Top-level block: the bp lives in the dnode itself. */
		ASSERT(db->db_level == dn->dn_phys->dn_nlevels-1);
		ASSERT3P(db->db_blkptr, ==,
		    &dn->dn_phys->dn_blkptr[db->db_blkid]);
		zio = dn->dn_zio;
	}

	ASSERT(db->db_level == 0 || data == db->db_buf);
	ASSERT3U(db->db_blkptr->blk_birth, <=, txg);
	ASSERT(zio);

	SET_BOOKMARK(&zb, os->os_dsl_dataset ?
	    os->os_dsl_dataset->ds_object : DMU_META_OBJSET,
	    db->db.db_object, db->db_level, db->db_blkid);

	dmu_write_policy(os, dn, db->db_level,
	    db->db_state == DB_NOFILL ? WP_NOFILL : 0, &zp);

	if (db->db_level == 0 && dr->dt.dl.dr_override_state == DR_OVERRIDDEN) {
		/*
		 * Overridden leaf: issue the write but immediately override
		 * it with the pre-existing (dr_overridden_by) block pointer.
		 * The state flips to DR_NOT_OVERRIDDEN under db_mtx before
		 * the override is applied.
		 */
		ASSERT(db->db_state != DB_NOFILL);
		dr->dr_zio = zio_write(zio, os->os_spa, txg,
		    db->db_blkptr, data->b_data, arc_buf_size(data), &zp,
		    dbuf_write_override_ready, dbuf_write_override_done, dr,
		    ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
		mutex_enter(&db->db_mtx);
		dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
		zio_write_override(dr->dr_zio, &dr->dt.dl.dr_overridden_by,
		    dr->dt.dl.dr_copies);
		mutex_exit(&db->db_mtx);
	} else if (db->db_state == DB_NOFILL) {
		/* No data payload; checksumming must be off. */
		ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF);
		dr->dr_zio = zio_write(zio, os->os_spa, txg,
		    db->db_blkptr, NULL, db->db.db_size, &zp,
		    dbuf_write_nofill_ready, dbuf_write_nofill_done, db,
		    ZIO_PRIORITY_ASYNC_WRITE,
		    ZIO_FLAG_MUSTSUCCEED | ZIO_FLAG_NODATA, &zb);
	} else {
		/* Normal case: write the buffer through the ARC. */
		ASSERT(arc_released(data));
		dr->dr_zio = arc_write(zio, os->os_spa, txg,
		    db->db_blkptr, data, DBUF_IS_L2CACHEABLE(db), &zp,
		    dbuf_write_ready, dbuf_write_done, db,
		    ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
	}
}