/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
 */

#include <sys/zfs_context.h>
#include <sys/dmu.h>
#include <sys/dmu_impl.h>
#include <sys/dbuf.h>
#include <sys/dmu_objset.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_dir.h>
#include <sys/dmu_tx.h>
#include <sys/spa.h>
#include <sys/zio.h>
#include <sys/dmu_zfetch.h>
#include <sys/sa.h>
#include <sys/sa_impl.h>

static void dbuf_destroy(dmu_buf_impl_t *db);
static int dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx);

/*
 * Global data structures and functions for the dbuf cache.
 */
static kmem_cache_t *dbuf_cache;

/* ARGSUSED */
static int
dbuf_cons(void *vdb, void *unused, int kmflag)
{
	dmu_buf_impl_t *db = vdb;
	bzero(db, sizeof (dmu_buf_impl_t));

	mutex_init(&db->db_mtx, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&db->db_changed, NULL, CV_DEFAULT, NULL);
	refcount_create(&db->db_holds);
	return (0);
}

/* ARGSUSED */
static void
dbuf_dest(void *vdb, void *unused)
{
	dmu_buf_impl_t *db = vdb;
	mutex_destroy(&db->db_mtx);
	cv_destroy(&db->db_changed);
	refcount_destroy(&db->db_holds);
}

/*
 * dbuf hash table routines
 */
static dbuf_hash_table_t dbuf_hash_table;

static uint64_t dbuf_hash_count;

static uint64_t
dbuf_hash(void *os, uint64_t obj, uint8_t lvl, uint64_t blkid)
{
	uintptr_t osv = (uintptr_t)os;
	uint64_t crc = -1ULL;

	ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (lvl)) & 0xFF];
	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (osv >> 6)) & 0xFF];
	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 0)) & 0xFF];
	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 8)) & 0xFF];
	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 0)) & 0xFF];
	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 8)) & 0xFF];

	crc ^= (osv>>14) ^ (obj>>16) ^ (blkid>>16);

	return (crc);
}

#define	DBUF_HASH(os, obj, level, blkid) dbuf_hash(os, obj, level, blkid)

#define	DBUF_EQUAL(dbuf, os, obj, level, blkid)		\
	((dbuf)->db.db_object == (obj) &&		\
	(dbuf)->db_objset == (os) &&			\
	(dbuf)->db_level == (level) &&			\
	(dbuf)->db_blkid == (blkid))

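/*
 * Look up a dbuf in the hash table for the given dnode, level and block id.
 * On a hit (and if the dbuf is not being evicted) the dbuf is returned with
 * db_mtx held; otherwise NULL is returned.
 */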
dmu_buf_impl_t *
dbuf_find(dnode_t *dn, uint8_t level, uint64_t blkid)
{
	dbuf_hash_table_t *h = &dbuf_hash_table;
	objset_t *os = dn->dn_objset;
	uint64_t obj = dn->dn_object;
	uint64_t hv = DBUF_HASH(os, obj, level, blkid);
	uint64_t idx = hv & h->hash_table_mask;
	dmu_buf_impl_t *db;

	mutex_enter(DBUF_HASH_MUTEX(h, idx));
	for (db = h->hash_table[idx]; db != NULL; db = db->db_hash_next) {
		if (DBUF_EQUAL(db, os, obj, level, blkid)) {
			mutex_enter(&db->db_mtx);
			if (db->db_state != DB_EVICTING) {
				mutex_exit(DBUF_HASH_MUTEX(h, idx));
				return (db);
			}
			mutex_exit(&db->db_mtx);
		}
	}
	mutex_exit(DBUF_HASH_MUTEX(h, idx));
	return (NULL);
}

/*
 * Insert an entry into the hash table.  If there is already an element
 * equal to elem in the hash table, then the already existing element
 * will be returned and the new element will not be inserted.
 * Otherwise returns NULL.
 */
static dmu_buf_impl_t *
dbuf_hash_insert(dmu_buf_impl_t *db)
{
	dbuf_hash_table_t *h = &dbuf_hash_table;
	objset_t *os = db->db_objset;
	uint64_t obj = db->db.db_object;
	int level = db->db_level;
	uint64_t blkid = db->db_blkid;
	uint64_t hv = DBUF_HASH(os, obj, level, blkid);
	uint64_t idx = hv & h->hash_table_mask;
	dmu_buf_impl_t *dbf;

	mutex_enter(DBUF_HASH_MUTEX(h, idx));
	for (dbf = h->hash_table[idx]; dbf != NULL; dbf = dbf->db_hash_next) {
		if (DBUF_EQUAL(dbf, os, obj, level, blkid)) {
			mutex_enter(&dbf->db_mtx);
			if (dbf->db_state != DB_EVICTING) {
				mutex_exit(DBUF_HASH_MUTEX(h, idx));
				return (dbf);
			}
			mutex_exit(&dbf->db_mtx);
		}
	}

	mutex_enter(&db->db_mtx);
	db->db_hash_next = h->hash_table[idx];
	h->hash_table[idx] = db;
	mutex_exit(DBUF_HASH_MUTEX(h, idx));
	atomic_add_64(&dbuf_hash_count, 1);

	return (NULL);
}

/*
 * Remove an entry from the hash table.  This operation will
 * fail if there are any existing holds on the db.
 */
static void
dbuf_hash_remove(dmu_buf_impl_t *db)
{
	dbuf_hash_table_t *h = &dbuf_hash_table;
	uint64_t hv = DBUF_HASH(db->db_objset, db->db.db_object,
	    db->db_level, db->db_blkid);
	uint64_t idx = hv & h->hash_table_mask;
	dmu_buf_impl_t *dbf, **dbp;

	/*
	 * We mustn't hold db_mtx, to maintain lock ordering:
	 * DBUF_HASH_MUTEX > db_mtx.
	 */
	ASSERT(refcount_is_zero(&db->db_holds));
	ASSERT(db->db_state == DB_EVICTING);
	ASSERT(!MUTEX_HELD(&db->db_mtx));

	mutex_enter(DBUF_HASH_MUTEX(h, idx));
	dbp = &h->hash_table[idx];
	while ((dbf = *dbp) != db) {
		dbp = &dbf->db_hash_next;
		ASSERT(dbf != NULL);
	}
	*dbp = db->db_hash_next;
	db->db_hash_next = NULL;
	mutex_exit(DBUF_HASH_MUTEX(h, idx));
	atomic_add_64(&dbuf_hash_count, -1);
}

static arc_evict_func_t dbuf_do_evict;

static void
dbuf_evict_user(dmu_buf_impl_t *db)
{
	ASSERT(MUTEX_HELD(&db->db_mtx));

	if (db->db_level != 0 || db->db_evict_func == NULL)
		return;

	if (db->db_user_data_ptr_ptr)
		*db->db_user_data_ptr_ptr = db->db.db_data;
	db->db_evict_func(&db->db, db->db_user_ptr);
	db->db_user_ptr = NULL;
	db->db_user_data_ptr_ptr = NULL;
	db->db_evict_func = NULL;
}

boolean_t
dbuf_is_metadata(dmu_buf_impl_t *db)
{
	if (db->db_level > 0) {
		return (B_TRUE);
	} else {
		boolean_t is_metadata;

		DB_DNODE_ENTER(db);
		is_metadata = dmu_ot[DB_DNODE(db)->dn_type].ot_metadata;
		DB_DNODE_EXIT(db);

		return (is_metadata);
	}
}

void
dbuf_evict(dmu_buf_impl_t *db)
{
	ASSERT(MUTEX_HELD(&db->db_mtx));
	ASSERT(db->db_buf == NULL);
	ASSERT(db->db_data_pending == NULL);

	dbuf_clear(db);
	dbuf_destroy(db);
}

void
dbuf_init(void)
{
	uint64_t hsize = 1ULL << 16;
	dbuf_hash_table_t *h = &dbuf_hash_table;
	int i;

	/*
	 * The hash table is big enough to fill all of physical memory
	 * with an average 4K block size.  The table will take up
	 * totalmem*sizeof(void*)/4K (i.e. 2MB/GB with 8-byte pointers).
	 */
	while (hsize * 4096 < physmem * PAGESIZE)
		hsize <<= 1;

retry:
	h->hash_table_mask = hsize - 1;
	h->hash_table = kmem_zalloc(hsize * sizeof (void *), KM_NOSLEEP);
	if (h->hash_table == NULL) {
		/* XXX - we should really return an error instead of assert */
		ASSERT(hsize > (1ULL << 10));
		hsize >>= 1;
		goto retry;
	}

	dbuf_cache = kmem_cache_create("dmu_buf_impl_t",
	    sizeof (dmu_buf_impl_t),
	    0, dbuf_cons, dbuf_dest, NULL, NULL, NULL, 0);

	for (i = 0; i < DBUF_MUTEXES; i++)
		mutex_init(&h->hash_mutexes[i], NULL, MUTEX_DEFAULT, NULL);
}

void
dbuf_fini(void)
{
	dbuf_hash_table_t *h = &dbuf_hash_table;
	int i;

	for (i = 0; i < DBUF_MUTEXES; i++)
		mutex_destroy(&h->hash_mutexes[i]);
	kmem_free(h->hash_table, (h->hash_table_mask + 1) * sizeof (void *));
	kmem_cache_destroy(dbuf_cache);
}

/*
 * Other stuff.
 */

#ifdef ZFS_DEBUG
static void
dbuf_verify(dmu_buf_impl_t *db)
{
	dnode_t *dn;
	dbuf_dirty_record_t *dr;

	ASSERT(MUTEX_HELD(&db->db_mtx));

	if (!(zfs_flags & ZFS_DEBUG_DBUF_VERIFY))
		return;

	ASSERT(db->db_objset != NULL);
	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);
	if (dn == NULL) {
		ASSERT(db->db_parent == NULL);
		ASSERT(db->db_blkptr == NULL);
	} else {
		ASSERT3U(db->db.db_object, ==, dn->dn_object);
		ASSERT3P(db->db_objset, ==, dn->dn_objset);
		ASSERT3U(db->db_level, <, dn->dn_nlevels);
		ASSERT(db->db_blkid == DMU_BONUS_BLKID ||
		    db->db_blkid == DMU_SPILL_BLKID ||
		    !list_is_empty(&dn->dn_dbufs));
	}
	if (db->db_blkid == DMU_BONUS_BLKID) {
		ASSERT(dn != NULL);
		ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
		ASSERT3U(db->db.db_offset, ==, DMU_BONUS_BLKID);
	} else if (db->db_blkid == DMU_SPILL_BLKID) {
		ASSERT(dn != NULL);
		ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
		ASSERT3U(db->db.db_offset, ==, 0);
	} else {
		ASSERT3U(db->db.db_offset, ==, db->db_blkid * db->db.db_size);
	}

	for (dr = db->db_data_pending; dr != NULL; dr = dr->dr_next)
		ASSERT(dr->dr_dbuf == db);

	for (dr = db->db_last_dirty; dr != NULL; dr = dr->dr_next)
		ASSERT(dr->dr_dbuf == db);

	/*
	 * We can't assert that db_size matches dn_datablksz because it
	 * can be momentarily different when another thread is doing
	 * dnode_set_blksz().
	 */
	if (db->db_level == 0 && db->db.db_object == DMU_META_DNODE_OBJECT) {
		dr = db->db_data_pending;
		/*
		 * It should only be modified in syncing context, so
		 * make sure we only have one copy of the data.
		 */
		ASSERT(dr == NULL || dr->dt.dl.dr_data == db->db_buf);
	}

	/* verify db->db_blkptr */
	if (db->db_blkptr) {
		if (db->db_parent == dn->dn_dbuf) {
			/* db is pointed to by the dnode */
			/* ASSERT3U(db->db_blkid, <, dn->dn_nblkptr); */
			if (DMU_OBJECT_IS_SPECIAL(db->db.db_object))
				ASSERT(db->db_parent == NULL);
			else
				ASSERT(db->db_parent != NULL);
			if (db->db_blkid != DMU_SPILL_BLKID)
				ASSERT3P(db->db_blkptr, ==,
				    &dn->dn_phys->dn_blkptr[db->db_blkid]);
		} else {
			/* db is pointed to by an indirect block */
			int epb = db->db_parent->db.db_size >> SPA_BLKPTRSHIFT;
			ASSERT3U(db->db_parent->db_level, ==, db->db_level+1);
			ASSERT3U(db->db_parent->db.db_object, ==,
			    db->db.db_object);
			/*
			 * dnode_grow_indblksz() can make this fail if we don't
			 * have the struct_rwlock.  XXX indblksz no longer
			 * grows.  safe to do this now?
			 */
			if (RW_WRITE_HELD(&dn->dn_struct_rwlock)) {
				ASSERT3P(db->db_blkptr, ==,
				    ((blkptr_t *)db->db_parent->db.db_data +
				    db->db_blkid % epb));
			}
		}
	}
	if ((db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr)) &&
	    (db->db_buf == NULL || db->db_buf->b_data) &&
	    db->db.db_data && db->db_blkid != DMU_BONUS_BLKID &&
	    db->db_state != DB_FILL && !dn->dn_free_txg) {
		/*
		 * If the blkptr isn't set but they have nonzero data,
		 * it had better be dirty, otherwise we'll lose that
		 * data when we evict this buffer.
		 */
		if (db->db_dirtycnt == 0) {
			uint64_t *buf = db->db.db_data;
			int i;

			for (i = 0; i < db->db.db_size >> 3; i++) {
				ASSERT(buf[i] == 0);
			}
		}
	}
	DB_DNODE_EXIT(db);
}
#endif

static void
dbuf_update_data(dmu_buf_impl_t *db)
{
	ASSERT(MUTEX_HELD(&db->db_mtx));
	if (db->db_level == 0 && db->db_user_data_ptr_ptr) {
		ASSERT(!refcount_is_zero(&db->db_holds));
		*db->db_user_data_ptr_ptr = db->db.db_data;
	}
}

static void
dbuf_set_data(dmu_buf_impl_t *db, arc_buf_t *buf)
{
	ASSERT(MUTEX_HELD(&db->db_mtx));
	ASSERT(db->db_buf == NULL || !arc_has_callback(db->db_buf));
	db->db_buf = buf;
	if (buf != NULL) {
		ASSERT(buf->b_data != NULL);
		db->db.db_data = buf->b_data;
		if (!arc_released(buf))
			arc_set_callback(buf, dbuf_do_evict, db);
		dbuf_update_data(db);
	} else {
		dbuf_evict_user(db);
		db->db.db_data = NULL;
		if (db->db_state != DB_NOFILL)
			db->db_state = DB_UNCACHED;
	}
}

/*
 * Loan out an arc_buf for read.  Return the loaned arc_buf.
 */
arc_buf_t *
dbuf_loan_arcbuf(dmu_buf_impl_t *db)
{
	arc_buf_t *abuf;

	mutex_enter(&db->db_mtx);
	if (arc_released(db->db_buf) || refcount_count(&db->db_holds) > 1) {
		int blksz = db->db.db_size;
		spa_t *spa;

		mutex_exit(&db->db_mtx);
		DB_GET_SPA(&spa, db);
		abuf = arc_loan_buf(spa, blksz);
		bcopy(db->db.db_data, abuf->b_data, blksz);
	} else {
		abuf = db->db_buf;
		arc_loan_inuse_buf(abuf, db);
		dbuf_set_data(db, NULL);
		mutex_exit(&db->db_mtx);
	}
	return (abuf);
}

uint64_t
dbuf_whichblock(dnode_t *dn, uint64_t offset)
{
	if (dn->dn_datablkshift) {
		return (offset >> dn->dn_datablkshift);
	} else {
		ASSERT3U(offset, <, dn->dn_datablksz);
		return (0);
	}
}

static void
dbuf_read_done(zio_t *zio, arc_buf_t *buf, void *vdb)
{
	dmu_buf_impl_t *db = vdb;

	mutex_enter(&db->db_mtx);
	ASSERT3U(db->db_state, ==, DB_READ);
	/*
	 * All reads are synchronous, so we must have a hold on the dbuf
	 */
	ASSERT(refcount_count(&db->db_holds) > 0);
	ASSERT(db->db_buf == NULL);
	ASSERT(db->db.db_data == NULL);
	if (db->db_level == 0 && db->db_freed_in_flight) {
		/* we were freed in flight; disregard any error */
		arc_release(buf, db);
		bzero(buf->b_data, db->db.db_size);
		arc_buf_freeze(buf);
		db->db_freed_in_flight = FALSE;
		dbuf_set_data(db, buf);
		db->db_state = DB_CACHED;
	} else if (zio == NULL || zio->io_error == 0) {
		dbuf_set_data(db, buf);
		db->db_state = DB_CACHED;
	} else {
		ASSERT(db->db_blkid != DMU_BONUS_BLKID);
		ASSERT3P(db->db_buf, ==, NULL);
		VERIFY(arc_buf_remove_ref(buf, db) == 1);
		db->db_state = DB_UNCACHED;
	}
	cv_broadcast(&db->db_changed);
	dbuf_rele_and_unlock(db, NULL);
}

static void
dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags)
{
	dnode_t *dn;
	spa_t *spa;
	zbookmark_t zb;
	uint32_t aflags = ARC_NOWAIT;
	arc_buf_t *pbuf;

	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);
	ASSERT(!refcount_is_zero(&db->db_holds));
	/* We need the struct_rwlock to prevent db_blkptr from changing. */
	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
	ASSERT(MUTEX_HELD(&db->db_mtx));
	ASSERT(db->db_state == DB_UNCACHED);
	ASSERT(db->db_buf == NULL);

	if (db->db_blkid == DMU_BONUS_BLKID) {
		int bonuslen = MIN(dn->dn_bonuslen, dn->dn_phys->dn_bonuslen);

		ASSERT3U(bonuslen, <=, db->db.db_size);
		db->db.db_data = zio_buf_alloc(DN_MAX_BONUSLEN);
		arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
		if (bonuslen < DN_MAX_BONUSLEN)
			bzero(db->db.db_data, DN_MAX_BONUSLEN);
		if (bonuslen)
			bcopy(DN_BONUS(dn->dn_phys), db->db.db_data, bonuslen);
		DB_DNODE_EXIT(db);
		dbuf_update_data(db);
		db->db_state = DB_CACHED;
		mutex_exit(&db->db_mtx);
		return;
	}

	/*
	 * Recheck BP_IS_HOLE() after dnode_block_freed() in case dnode_sync()
	 * processes the delete record and clears the bp while we are waiting
	 * for the dn_mtx (resulting in a "no" from block_freed).
	 */
	if (db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr) ||
	    (db->db_level == 0 && (dnode_block_freed(dn, db->db_blkid) ||
	    BP_IS_HOLE(db->db_blkptr)))) {
		arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);

		dbuf_set_data(db, arc_buf_alloc(dn->dn_objset->os_spa,
		    db->db.db_size, db, type));
		DB_DNODE_EXIT(db);
		bzero(db->db.db_data, db->db.db_size);
		db->db_state = DB_CACHED;
		*flags |= DB_RF_CACHED;
		mutex_exit(&db->db_mtx);
		return;
	}

	spa = dn->dn_objset->os_spa;
	DB_DNODE_EXIT(db);

	db->db_state = DB_READ;
	mutex_exit(&db->db_mtx);

	if (DBUF_IS_L2CACHEABLE(db))
		aflags |= ARC_L2CACHE;

	SET_BOOKMARK(&zb, db->db_objset->os_dsl_dataset ?
	    db->db_objset->os_dsl_dataset->ds_object : DMU_META_OBJSET,
	    db->db.db_object, db->db_level, db->db_blkid);

	dbuf_add_ref(db, NULL);
	/* ZIO_FLAG_CANFAIL callers have to check the parent zio's error */

	if (db->db_parent)
		pbuf = db->db_parent->db_buf;
	else
		pbuf = db->db_objset->os_phys_buf;

	(void) dsl_read(zio, spa, db->db_blkptr, pbuf,
	    dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ,
	    (*flags & DB_RF_CANFAIL) ? ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED,
	    &aflags, &zb);
	if (aflags & ARC_CACHED)
		*flags |= DB_RF_CACHED;
}

int
dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
{
	int err = 0;
	int havepzio = (zio != NULL);
	int prefetch;
	dnode_t *dn;

	/*
	 * We don't have to hold the mutex to check db_state because it
	 * can't be freed while we have a hold on the buffer.
	 */
	ASSERT(!refcount_is_zero(&db->db_holds));

	if (db->db_state == DB_NOFILL)
		return (EIO);

	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);
	if ((flags & DB_RF_HAVESTRUCT) == 0)
		rw_enter(&dn->dn_struct_rwlock, RW_READER);

	prefetch = db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
	    (flags & DB_RF_NOPREFETCH) == 0 && dn != NULL &&
	    DBUF_IS_CACHEABLE(db);

	mutex_enter(&db->db_mtx);
	if (db->db_state == DB_CACHED) {
		mutex_exit(&db->db_mtx);
		if (prefetch)
			dmu_zfetch(&dn->dn_zfetch, db->db.db_offset,
			    db->db.db_size, TRUE);
		if ((flags & DB_RF_HAVESTRUCT) == 0)
			rw_exit(&dn->dn_struct_rwlock);
		DB_DNODE_EXIT(db);
	} else if (db->db_state == DB_UNCACHED) {
		spa_t *spa = dn->dn_objset->os_spa;

		if (zio == NULL)
			zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
		dbuf_read_impl(db, zio, &flags);

		/* dbuf_read_impl has dropped db_mtx for us */

		if (prefetch)
			dmu_zfetch(&dn->dn_zfetch, db->db.db_offset,
			    db->db.db_size, flags & DB_RF_CACHED);

		if ((flags & DB_RF_HAVESTRUCT) == 0)
			rw_exit(&dn->dn_struct_rwlock);
		DB_DNODE_EXIT(db);

		if (!havepzio)
			err = zio_wait(zio);
	} else {
		mutex_exit(&db->db_mtx);
		if (prefetch)
			dmu_zfetch(&dn->dn_zfetch, db->db.db_offset,
			    db->db.db_size, TRUE);
		if ((flags & DB_RF_HAVESTRUCT) == 0)
			rw_exit(&dn->dn_struct_rwlock);
		DB_DNODE_EXIT(db);

		mutex_enter(&db->db_mtx);
		if ((flags & DB_RF_NEVERWAIT) == 0) {
			while (db->db_state == DB_READ ||
			    db->db_state == DB_FILL) {
				ASSERT(db->db_state == DB_READ ||
				    (flags & DB_RF_HAVESTRUCT) == 0);
				cv_wait(&db->db_changed, &db->db_mtx);
			}
			if (db->db_state == DB_UNCACHED)
				err = EIO;
		}
		mutex_exit(&db->db_mtx);
	}

	ASSERT(err || havepzio || db->db_state == DB_CACHED);
	return (err);
}

static void
dbuf_noread(dmu_buf_impl_t *db)
{
	ASSERT(!refcount_is_zero(&db->db_holds));
	ASSERT(db->db_blkid != DMU_BONUS_BLKID);
	mutex_enter(&db->db_mtx);
	while (db->db_state == DB_READ || db->db_state == DB_FILL)
		cv_wait(&db->db_changed, &db->db_mtx);
	if (db->db_state == DB_UNCACHED) {
		arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
		spa_t *spa;

		ASSERT(db->db_buf == NULL);
		ASSERT(db->db.db_data == NULL);
		DB_GET_SPA(&spa, db);
		dbuf_set_data(db, arc_buf_alloc(spa, db->db.db_size, db, type));
		db->db_state = DB_FILL;
	} else if (db->db_state == DB_NOFILL) {
		dbuf_set_data(db, NULL);
	} else {
		ASSERT3U(db->db_state, ==, DB_CACHED);
	}
	mutex_exit(&db->db_mtx);
}

/*
 * This is our just-in-time copy function.  It makes a copy of
 * buffers that have been modified in a previous transaction
 * group before we modify them in the current active group.
 *
 * This function is used in two places: when we are dirtying a
 * buffer for the first time in a txg, and when we are freeing
 * a range in a dnode that includes this buffer.
 *
 * Note that when we are called from dbuf_free_range() we do
 * not put a hold on the buffer, we just traverse the active
 * dbuf list for the dnode.
 */
static void
dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg)
{
	dbuf_dirty_record_t *dr = db->db_last_dirty;

	ASSERT(MUTEX_HELD(&db->db_mtx));
	ASSERT(db->db.db_data != NULL);
	ASSERT(db->db_level == 0);
	ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT);

	if (dr == NULL ||
	    (dr->dt.dl.dr_data !=
	    ((db->db_blkid == DMU_BONUS_BLKID) ? db->db.db_data : db->db_buf)))
		return;

	/*
	 * If the last dirty record for this dbuf has not yet synced
	 * and it's referencing the dbuf data, either:
	 *	reset the reference to point to a new copy,
	 * or (if there are no active holders)
	 *	just null out the current db_data pointer.
	 */
	ASSERT(dr->dr_txg >= txg - 2);
	if (db->db_blkid == DMU_BONUS_BLKID) {
		/* Note that the data bufs here are zio_bufs */
		dr->dt.dl.dr_data = zio_buf_alloc(DN_MAX_BONUSLEN);
		arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
		bcopy(db->db.db_data, dr->dt.dl.dr_data, DN_MAX_BONUSLEN);
	} else if (refcount_count(&db->db_holds) > db->db_dirtycnt) {
		int size = db->db.db_size;
		arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
		spa_t *spa;

		DB_GET_SPA(&spa, db);
		dr->dt.dl.dr_data = arc_buf_alloc(spa, size, db, type);
		bcopy(db->db.db_data, dr->dt.dl.dr_data->b_data, size);
	} else {
		dbuf_set_data(db, NULL);
	}
}

void
dbuf_unoverride(dbuf_dirty_record_t *dr)
{
	dmu_buf_impl_t *db = dr->dr_dbuf;
	blkptr_t *bp = &dr->dt.dl.dr_overridden_by;
	uint64_t txg = dr->dr_txg;

	ASSERT(MUTEX_HELD(&db->db_mtx));
	ASSERT(dr->dt.dl.dr_override_state != DR_IN_DMU_SYNC);
	ASSERT(db->db_level == 0);

	if (db->db_blkid == DMU_BONUS_BLKID ||
	    dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN)
		return;

	ASSERT(db->db_data_pending != dr);

	/* free this block */
	if (!BP_IS_HOLE(bp)) {
		spa_t *spa;

		DB_GET_SPA(&spa, db);
		zio_free(spa, txg, bp);
	}
	dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
	/*
	 * Release the already-written buffer, so we leave it in
	 * a consistent dirty state.  Note that all callers are
	 * modifying the buffer, so they will immediately do
	 * another (redundant) arc_release().  Therefore, leave
	 * the buf thawed to save the effort of freezing &
	 * immediately re-thawing it.
	 */
	arc_release(dr->dt.dl.dr_data, db);
}

/*
 * Evict (if it's unreferenced) or clear (if it's referenced) any level-0
 * data blocks in the free range, so that any future readers will find
 * empty blocks.  Also, if we happen across any level-1 dbufs in the
 * range that have not already been marked dirty, mark them dirty so
 * they stay in memory.
 */
void
dbuf_free_range(dnode_t *dn, uint64_t start, uint64_t end, dmu_tx_t *tx)
{
	dmu_buf_impl_t *db, *db_next;
	uint64_t txg = tx->tx_txg;
	int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
	uint64_t first_l1 = start >> epbs;
	uint64_t last_l1 = end >> epbs;

	if (end > dn->dn_maxblkid && (end != DMU_SPILL_BLKID)) {
		end = dn->dn_maxblkid;
		last_l1 = end >> epbs;
	}
	dprintf_dnode(dn, "start=%llu end=%llu\n", start, end);
	mutex_enter(&dn->dn_dbufs_mtx);
	for (db = list_head(&dn->dn_dbufs); db; db = db_next) {
		db_next = list_next(&dn->dn_dbufs, db);
		ASSERT(db->db_blkid != DMU_BONUS_BLKID);

		if (db->db_level == 1 &&
		    db->db_blkid >= first_l1 && db->db_blkid <= last_l1) {
			mutex_enter(&db->db_mtx);
			if (db->db_last_dirty &&
			    db->db_last_dirty->dr_txg < txg) {
				dbuf_add_ref(db, FTAG);
				mutex_exit(&db->db_mtx);
				dbuf_will_dirty(db, tx);
				dbuf_rele(db, FTAG);
			} else {
				mutex_exit(&db->db_mtx);
			}
		}

		if (db->db_level != 0)
			continue;
		dprintf_dbuf(db, "found buf %s\n", "");
		if (db->db_blkid < start || db->db_blkid > end)
			continue;

		/* found a level 0 buffer in the range */
		if (dbuf_undirty(db, tx))
			continue;

		mutex_enter(&db->db_mtx);
		if (db->db_state == DB_UNCACHED ||
		    db->db_state == DB_NOFILL ||
		    db->db_state == DB_EVICTING) {
			ASSERT(db->db.db_data == NULL);
			mutex_exit(&db->db_mtx);
			continue;
		}
		if (db->db_state == DB_READ || db->db_state == DB_FILL) {
			/* will be handled in dbuf_read_done or dbuf_rele */
			db->db_freed_in_flight = TRUE;
			mutex_exit(&db->db_mtx);
			continue;
		}
		if (refcount_count(&db->db_holds) == 0) {
			ASSERT(db->db_buf);
			dbuf_clear(db);
			continue;
		}
		/* The dbuf is referenced */

		if (db->db_last_dirty != NULL) {
			dbuf_dirty_record_t *dr = db->db_last_dirty;

			if (dr->dr_txg == txg) {
				/*
				 * This buffer is "in-use", re-adjust the file
				 * size to reflect that this buffer may
				 * contain new data when we sync.
				 */
				if (db->db_blkid != DMU_SPILL_BLKID &&
				    db->db_blkid > dn->dn_maxblkid)
					dn->dn_maxblkid = db->db_blkid;
				dbuf_unoverride(dr);
			} else {
				/*
				 * This dbuf is not dirty in the open context.
				 * Either uncache it (if it's not referenced in
				 * the open context) or reset its contents to
				 * empty.
				 */
				dbuf_fix_old_data(db, txg);
			}
		}
		/* clear the contents if it's cached */
		if (db->db_state == DB_CACHED) {
			ASSERT(db->db.db_data != NULL);
			arc_release(db->db_buf, db);
			bzero(db->db.db_data, db->db.db_size);
			arc_buf_freeze(db->db_buf);
		}

		mutex_exit(&db->db_mtx);
	}
	mutex_exit(&dn->dn_dbufs_mtx);
}

static int
dbuf_block_freeable(dmu_buf_impl_t *db)
{
	dsl_dataset_t *ds = db->db_objset->os_dsl_dataset;
	uint64_t birth_txg = 0;

	/*
	 * We don't need any locking to protect db_blkptr:
	 * If it's syncing, then db_last_dirty will be set
	 * so we'll ignore db_blkptr.
	 */
	ASSERT(MUTEX_HELD(&db->db_mtx));
	if (db->db_last_dirty)
		birth_txg = db->db_last_dirty->dr_txg;
	else if (db->db_blkptr)
		birth_txg = db->db_blkptr->blk_birth;

	/*
	 * If we don't exist or are in a snapshot, we can't be freed.
	 * Don't pass the bp to dsl_dataset_block_freeable() since we
	 * are holding the db_mtx lock and might deadlock if we are
	 * prefetching a dedup-ed block.
	 */
	if (birth_txg)
		return (ds == NULL ||
		    dsl_dataset_block_freeable(ds, NULL, birth_txg));
	else
		return (FALSE);
}

void
dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx)
{
	arc_buf_t *buf, *obuf;
	int osize = db->db.db_size;
	arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
	dnode_t *dn;

	ASSERT(db->db_blkid != DMU_BONUS_BLKID);

	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);

	/* XXX does *this* func really need the lock? */
	ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));

	/*
	 * This call to dbuf_will_dirty() with the dn_struct_rwlock held
	 * is OK, because there can be no other references to the db
	 * when we are changing its size, so no concurrent DB_FILL can
	 * be happening.
	 */
	/*
	 * XXX we should be doing a dbuf_read, checking the return
	 * value and returning that up to our callers
	 */
	dbuf_will_dirty(db, tx);

	/* create the data buffer for the new block */
	buf = arc_buf_alloc(dn->dn_objset->os_spa, size, db, type);

	/* copy old block data to the new block */
	obuf = db->db_buf;
	bcopy(obuf->b_data, buf->b_data, MIN(osize, size));
	/* zero the remainder */
	if (size > osize)
		bzero((uint8_t *)buf->b_data + osize, size - osize);

	mutex_enter(&db->db_mtx);
	dbuf_set_data(db, buf);
	VERIFY(arc_buf_remove_ref(obuf, db) == 1);
	db->db.db_size = size;

	if (db->db_level == 0) {
		ASSERT3U(db->db_last_dirty->dr_txg, ==, tx->tx_txg);
		db->db_last_dirty->dt.dl.dr_data = buf;
	}
	mutex_exit(&db->db_mtx);

	dnode_willuse_space(dn, size-osize, tx);
	DB_DNODE_EXIT(db);
}

void
dbuf_release_bp(dmu_buf_impl_t *db)
{
	objset_t *os;
	zbookmark_t zb;

	DB_GET_OBJSET(&os, db);
	ASSERT(dsl_pool_sync_context(dmu_objset_pool(os)));
	ASSERT(arc_released(os->os_phys_buf) ||
	    list_link_active(&os->os_dsl_dataset->ds_synced_link));
	ASSERT(db->db_parent == NULL || arc_released(db->db_parent->db_buf));

	zb.zb_objset = os->os_dsl_dataset ?
	    os->os_dsl_dataset->ds_object : 0;
	zb.zb_object = db->db.db_object;
	zb.zb_level = db->db_level;
	zb.zb_blkid = db->db_blkid;
	(void) arc_release_bp(db->db_buf, db,
	    db->db_blkptr, os->os_spa, &zb);
}

dbuf_dirty_record_t *
dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
{
	dnode_t *dn;
	objset_t *os;
	dbuf_dirty_record_t **drp, *dr;
	int drop_struct_lock = FALSE;
	boolean_t do_free_accounting = B_FALSE;
	int txgoff = tx->tx_txg & TXG_MASK;

	ASSERT(tx->tx_txg != 0);
	ASSERT(!refcount_is_zero(&db->db_holds));
	DMU_TX_DIRTY_BUF(tx, db);

	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);
	/*
	 * Shouldn't dirty a regular buffer in syncing context.  Private
	 * objects may be dirtied in syncing context, but only if they
	 * were already pre-dirtied in open context.
	 */
	ASSERT(!dmu_tx_is_syncing(tx) ||
	    BP_IS_HOLE(dn->dn_objset->os_rootbp) ||
	    DMU_OBJECT_IS_SPECIAL(dn->dn_object) ||
	    dn->dn_objset->os_dsl_dataset == NULL);
	/*
	 * We make this assert for private objects as well, but after we
	 * check if we're already dirty.  They are allowed to re-dirty
	 * in syncing context.
	 */
	ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT ||
	    dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx ==
	    (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN));

	mutex_enter(&db->db_mtx);
	/*
	 * XXX make this true for indirects too?  The problem is that
	 * transactions created with dmu_tx_create_assigned() from
	 * syncing context don't bother holding ahead.
	 */
	ASSERT(db->db_level != 0 ||
	    db->db_state == DB_CACHED || db->db_state == DB_FILL ||
	    db->db_state == DB_NOFILL);

	mutex_enter(&dn->dn_mtx);
	/*
	 * Don't set dirtyctx to SYNC if we're just modifying this as we
	 * initialize the objset.
	 */
	if (dn->dn_dirtyctx == DN_UNDIRTIED &&
	    !BP_IS_HOLE(dn->dn_objset->os_rootbp)) {
		dn->dn_dirtyctx =
		    (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN);
		ASSERT(dn->dn_dirtyctx_firstset == NULL);
		dn->dn_dirtyctx_firstset = kmem_alloc(1, KM_SLEEP);
	}
	mutex_exit(&dn->dn_mtx);

	if (db->db_blkid == DMU_SPILL_BLKID)
		dn->dn_have_spill = B_TRUE;

	/*
	 * If this buffer is already dirty, we're done.
	 */
	drp = &db->db_last_dirty;
	ASSERT(*drp == NULL || (*drp)->dr_txg <= tx->tx_txg ||
	    db->db.db_object == DMU_META_DNODE_OBJECT);
	while ((dr = *drp) != NULL && dr->dr_txg > tx->tx_txg)
		drp = &dr->dr_next;
	if (dr && dr->dr_txg == tx->tx_txg) {
		DB_DNODE_EXIT(db);

		if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID) {
			/*
			 * If this buffer has already been written out,
			 * we now need to reset its state.
			 */
			dbuf_unoverride(dr);
			if (db->db.db_object != DMU_META_DNODE_OBJECT &&
			    db->db_state != DB_NOFILL)
				arc_buf_thaw(db->db_buf);
		}
		mutex_exit(&db->db_mtx);
		return (dr);
	}

	/*
	 * Only valid if not already dirty.
	 */
	ASSERT(dn->dn_object == 0 ||
	    dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx ==
	    (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN));

	ASSERT3U(dn->dn_nlevels, >, db->db_level);
	ASSERT((dn->dn_phys->dn_nlevels == 0 && db->db_level == 0) ||
	    dn->dn_phys->dn_nlevels > db->db_level ||
	    dn->dn_next_nlevels[txgoff] > db->db_level ||
	    dn->dn_next_nlevels[(tx->tx_txg-1) & TXG_MASK] > db->db_level ||
	    dn->dn_next_nlevels[(tx->tx_txg-2) & TXG_MASK] > db->db_level);

	/*
	 * We should only be dirtying in syncing context if it's the
	 * mos or we're initializing the os or it's a special object.
	 * However, we are allowed to dirty in syncing context provided
	 * we already dirtied it in open context.  Hence we must make
	 * this assertion only if we're not already dirty.
	 */
	os = dn->dn_objset;
	ASSERT(!dmu_tx_is_syncing(tx) || DMU_OBJECT_IS_SPECIAL(dn->dn_object) ||
	    os->os_dsl_dataset == NULL || BP_IS_HOLE(os->os_rootbp));
	ASSERT(db->db.db_size != 0);

	dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);

	if (db->db_blkid != DMU_BONUS_BLKID) {
		/*
		 * Update the accounting.
		 * Note: we delay "free accounting" until after we drop
		 * the db_mtx.  This keeps us from grabbing other locks
		 * (and possibly deadlocking) in bp_get_dsize() while
		 * also holding the db_mtx.
		 */
		dnode_willuse_space(dn, db->db.db_size, tx);
		do_free_accounting = dbuf_block_freeable(db);
	}

	/*
	 * If this buffer is dirty in an old transaction group we need
	 * to make a copy of it so that the changes we make in this
	 * transaction group won't leak out when we sync the older txg.
	 */
	dr = kmem_zalloc(sizeof (dbuf_dirty_record_t), KM_SLEEP);
	if (db->db_level == 0) {
		void *data_old = db->db_buf;

		if (db->db_state != DB_NOFILL) {
			if (db->db_blkid == DMU_BONUS_BLKID) {
				dbuf_fix_old_data(db, tx->tx_txg);
				data_old = db->db.db_data;
			} else if (db->db.db_object != DMU_META_DNODE_OBJECT) {
				/*
				 * Release the data buffer from the cache so
				 * that we can modify it without impacting
				 * possible other users of this cached data
				 * block.  Note that indirect blocks and
				 * private objects are not released until the
				 * syncing state (since they are only modified
				 * then).
				 */
				arc_release(db->db_buf, db);
				dbuf_fix_old_data(db, tx->tx_txg);
				data_old = db->db_buf;
			}
			ASSERT(data_old != NULL);
		}
		dr->dt.dl.dr_data = data_old;
	} else {
		mutex_init(&dr->dt.di.dr_mtx, NULL, MUTEX_DEFAULT, NULL);
		list_create(&dr->dt.di.dr_children,
		    sizeof (dbuf_dirty_record_t),
		    offsetof(dbuf_dirty_record_t, dr_dirty_node));
	}
	dr->dr_dbuf = db;
	dr->dr_txg = tx->tx_txg;
	dr->dr_next = *drp;
	*drp = dr;

	/*
	 * We could have been freed_in_flight between the dbuf_noread
	 * and dbuf_dirty.  We win, as though the dbuf_noread() had
	 * happened after the free.
	 */
	if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
	    db->db_blkid != DMU_SPILL_BLKID) {
		mutex_enter(&dn->dn_mtx);
		dnode_clear_range(dn, db->db_blkid, 1, tx);
		mutex_exit(&dn->dn_mtx);
		db->db_freed_in_flight = FALSE;
	}

	/*
	 * This buffer is now part of this txg
	 */
	dbuf_add_ref(db, (void *)(uintptr_t)tx->tx_txg);
	db->db_dirtycnt += 1;
	ASSERT3U(db->db_dirtycnt, <=, 3);

	mutex_exit(&db->db_mtx);

	if (db->db_blkid == DMU_BONUS_BLKID ||
	    db->db_blkid == DMU_SPILL_BLKID) {
		mutex_enter(&dn->dn_mtx);
		ASSERT(!list_link_active(&dr->dr_dirty_node));
		list_insert_tail(&dn->dn_dirty_records[txgoff], dr);
		mutex_exit(&dn->dn_mtx);
		dnode_setdirty(dn, tx);
		DB_DNODE_EXIT(db);
		return (dr);
	} else if (do_free_accounting) {
		blkptr_t *bp = db->db_blkptr;
		int64_t willfree = (bp && !BP_IS_HOLE(bp)) ?
		    bp_get_dsize(os->os_spa, bp) : db->db.db_size;
		/*
		 * This is only a guess -- if the dbuf is dirty
		 * in a previous txg, we don't know how much
		 * space it will use on disk yet.  We should
		 * really have the struct_rwlock to access
		 * db_blkptr, but since this is just a guess,
		 * it's OK if we get an odd answer.
		 */
		ddt_prefetch(os->os_spa, bp);
		dnode_willuse_space(dn, -willfree, tx);
	}

	if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) {
		rw_enter(&dn->dn_struct_rwlock, RW_READER);
		drop_struct_lock = TRUE;
	}

	if (db->db_level == 0) {
		dnode_new_blkid(dn, db->db_blkid, tx, drop_struct_lock);
		ASSERT(dn->dn_maxblkid >= db->db_blkid);
	}

	if (db->db_level+1 < dn->dn_nlevels) {
		dmu_buf_impl_t *parent = db->db_parent;
		dbuf_dirty_record_t *di;
		int parent_held = FALSE;

		if (db->db_parent == NULL || db->db_parent == dn->dn_dbuf) {
			int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;

			parent = dbuf_hold_level(dn, db->db_level+1,
			    db->db_blkid >> epbs, FTAG);
			ASSERT(parent != NULL);
			parent_held = TRUE;
		}
		if (drop_struct_lock)
			rw_exit(&dn->dn_struct_rwlock);
		ASSERT3U(db->db_level+1, ==, parent->db_level);
		di = dbuf_dirty(parent, tx);
		if (parent_held)
			dbuf_rele(parent, FTAG);

		mutex_enter(&db->db_mtx);
		/* possible race with dbuf_undirty() */
		if (db->db_last_dirty == dr ||
		    dn->dn_object == DMU_META_DNODE_OBJECT) {
			mutex_enter(&di->dt.di.dr_mtx);
			ASSERT3U(di->dr_txg, ==, tx->tx_txg);
			ASSERT(!list_link_active(&dr->dr_dirty_node));
			list_insert_tail(&di->dt.di.dr_children, dr);
			mutex_exit(&di->dt.di.dr_mtx);
			dr->dr_parent = di;
		}
		mutex_exit(&db->db_mtx);
	} else {
		ASSERT(db->db_level+1 == dn->dn_nlevels);
		ASSERT(db->db_blkid < dn->dn_nblkptr);
		ASSERT(db->db_parent == NULL || db->db_parent == dn->dn_dbuf);
		mutex_enter(&dn->dn_mtx);
		ASSERT(!list_link_active(&dr->dr_dirty_node));
		list_insert_tail(&dn->dn_dirty_records[txgoff], dr);
		mutex_exit(&dn->dn_mtx);
		if (drop_struct_lock)
			rw_exit(&dn->dn_struct_rwlock);
	}

	dnode_setdirty(dn, tx);
	DB_DNODE_EXIT(db);
	return (dr);
}

static int
dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
{
	dnode_t *dn;
	uint64_t txg = tx->tx_txg;
	dbuf_dirty_record_t *dr, **drp;

	ASSERT(txg != 0);
	ASSERT(db->db_blkid != DMU_BONUS_BLKID);

	mutex_enter(&db->db_mtx);
	/*
	 * If this buffer is not dirty, we're done.
	 */
	for (drp = &db->db_last_dirty; (dr = *drp) != NULL; drp = &dr->dr_next)
		if (dr->dr_txg <= txg)
			break;
	if (dr == NULL || dr->dr_txg < txg) {
		mutex_exit(&db->db_mtx);
		return (0);
	}
	ASSERT(dr->dr_txg == txg);
	ASSERT(dr->dr_dbuf == db);

	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);

	/*
	 * If this buffer is currently held, we cannot undirty
	 * it, since one of the current holders may be in the
	 * middle of an update.  Note that users of dbuf_undirty()
	 * should not place a hold on the dbuf before the call.
	 * Also note: we can get here with a spill block, so
	 * test for that similar to how dbuf_dirty does.
	 */
	if (refcount_count(&db->db_holds) > db->db_dirtycnt) {
		mutex_exit(&db->db_mtx);
		/* Make sure we don't toss this buffer at sync phase */
		if (db->db_blkid != DMU_SPILL_BLKID) {
			mutex_enter(&dn->dn_mtx);
			dnode_clear_range(dn, db->db_blkid, 1, tx);
			mutex_exit(&dn->dn_mtx);
		}
		DB_DNODE_EXIT(db);
		return (0);
	}

	dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);

	ASSERT(db->db.db_size != 0);

	/* XXX would be nice to fix up dn_towrite_space[] */

	*drp = dr->dr_next;

	/*
	 * Note that there are three places in dbuf_dirty()
	 * where this dirty record may be put on a list.
	 * Make sure to do a list_remove corresponding to
	 * every one of those list_insert calls.
	 */
	if (dr->dr_parent) {
		mutex_enter(&dr->dr_parent->dt.di.dr_mtx);
		list_remove(&dr->dr_parent->dt.di.dr_children, dr);
		mutex_exit(&dr->dr_parent->dt.di.dr_mtx);
	} else if (db->db_blkid == DMU_SPILL_BLKID ||
	    db->db_level+1 == dn->dn_nlevels) {
		ASSERT(db->db_blkptr == NULL || db->db_parent == dn->dn_dbuf);
		mutex_enter(&dn->dn_mtx);
		list_remove(&dn->dn_dirty_records[txg & TXG_MASK], dr);
		mutex_exit(&dn->dn_mtx);
	}
	DB_DNODE_EXIT(db);

	if (db->db_level == 0) {
		if (db->db_state != DB_NOFILL) {
			dbuf_unoverride(dr);

			ASSERT(db->db_buf != NULL);
			ASSERT(dr->dt.dl.dr_data != NULL);
			if (dr->dt.dl.dr_data != db->db_buf)
				VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data,
				    db) == 1);
		}
	} else {
		ASSERT(db->db_buf != NULL);
		ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
		mutex_destroy(&dr->dt.di.dr_mtx);
		list_destroy(&dr->dt.di.dr_children);
	}
	kmem_free(dr, sizeof (dbuf_dirty_record_t));

	ASSERT(db->db_dirtycnt > 0);
	db->db_dirtycnt -= 1;

	if (refcount_remove(&db->db_holds, (void *)(uintptr_t)txg) == 0) {
		arc_buf_t *buf = db->db_buf;

		ASSERT(db->db_state == DB_NOFILL || arc_released(buf));
		dbuf_set_data(db, NULL);
		VERIFY(arc_buf_remove_ref(buf, db) == 1);
		dbuf_evict(db);
		return (1);
	}

	mutex_exit(&db->db_mtx);
	return (0);
}

#pragma weak dmu_buf_will_dirty = dbuf_will_dirty
void
dbuf_will_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
{
	int rf = DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH;

	ASSERT(tx->tx_txg != 0);
	ASSERT(!refcount_is_zero(&db->db_holds));

	DB_DNODE_ENTER(db);
	if (RW_WRITE_HELD(&DB_DNODE(db)->dn_struct_rwlock))
		rf |= DB_RF_HAVESTRUCT;
	DB_DNODE_EXIT(db);
	(void) dbuf_read(db, NULL, rf);
	(void) dbuf_dirty(db, tx);
}

void
dmu_buf_will_not_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
{
	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;

	db->db_state = DB_NOFILL;

	dmu_buf_will_fill(db_fake, tx);
}

void
dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
{
	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;

	ASSERT(db->db_blkid != DMU_BONUS_BLKID);
	ASSERT(tx->tx_txg != 0);
	ASSERT(db->db_level == 0);
	ASSERT(!refcount_is_zero(&db->db_holds));

	ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT ||
	    dmu_tx_private_ok(tx));

	dbuf_noread(db);
	(void) dbuf_dirty(db, tx);
}

#pragma weak dmu_buf_fill_done = dbuf_fill_done
/* ARGSUSED */
void
dbuf_fill_done(dmu_buf_impl_t *db, dmu_tx_t *tx)
{
	mutex_enter(&db->db_mtx);
	DBUF_VERIFY(db);

	if (db->db_state == DB_FILL) {
		if (db->db_level == 0 && db->db_freed_in_flight) {
			ASSERT(db->db_blkid != DMU_BONUS_BLKID);
			/* we were freed while filling */
			/* XXX dbuf_undirty? */
			bzero(db->db.db_data, db->db.db_size);
			db->db_freed_in_flight = FALSE;
		}
		db->db_state = DB_CACHED;
		cv_broadcast(&db->db_changed);
	}
	mutex_exit(&db->db_mtx);
}

/*
 * Directly assign a provided arc buf to a given dbuf if it's not referenced
 * by anybody except our caller.  Otherwise copy arcbuf's contents to dbuf.
 */
void
dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx)
{
	ASSERT(!refcount_is_zero(&db->db_holds));
	ASSERT(db->db_blkid != DMU_BONUS_BLKID);
	ASSERT(db->db_level == 0);
	ASSERT(DBUF_GET_BUFC_TYPE(db) == ARC_BUFC_DATA);
	ASSERT(buf != NULL);
	ASSERT(arc_buf_size(buf) == db->db.db_size);
	ASSERT(tx->tx_txg != 0);

	arc_return_buf(buf, db);
	ASSERT(arc_released(buf));

	mutex_enter(&db->db_mtx);

	while (db->db_state == DB_READ || db->db_state == DB_FILL)
		cv_wait(&db->db_changed, &db->db_mtx);

	ASSERT(db->db_state == DB_CACHED || db->db_state == DB_UNCACHED);

	if (db->db_state == DB_CACHED &&
	    refcount_count(&db->db_holds) - 1 > db->db_dirtycnt) {
		mutex_exit(&db->db_mtx);
		(void) dbuf_dirty(db, tx);
		bcopy(buf->b_data, db->db.db_data, db->db.db_size);
		VERIFY(arc_buf_remove_ref(buf, db) == 1);
		xuio_stat_wbuf_copied();
		return;
	}

	xuio_stat_wbuf_nocopy();
	if (db->db_state == DB_CACHED) {
		dbuf_dirty_record_t *dr = db->db_last_dirty;

		ASSERT(db->db_buf != NULL);
		if (dr != NULL && dr->dr_txg == tx->tx_txg) {
			ASSERT(dr->dt.dl.dr_data == db->db_buf);
			if (!arc_released(db->db_buf)) {
				ASSERT(dr->dt.dl.dr_override_state ==
				    DR_OVERRIDDEN);
				arc_release(db->db_buf, db);
			}
			dr->dt.dl.dr_data = buf;
			VERIFY(arc_buf_remove_ref(db->db_buf, db) == 1);
		} else if (dr == NULL || dr->dt.dl.dr_data != db->db_buf) {
			arc_release(db->db_buf, db);
			VERIFY(arc_buf_remove_ref(db->db_buf, db) == 1);
		}
		db->db_buf = NULL;
	}
	ASSERT(db->db_buf == NULL);
	dbuf_set_data(db, buf);
	db->db_state = DB_FILL;
	mutex_exit(&db->db_mtx);
	(void) dbuf_dirty(db, tx);
	dbuf_fill_done(db, tx);
}

/*
 * "Clear" the contents of this dbuf.  This will mark the dbuf
 * EVICTING and clear *most* of its references.  Unfortunately,
 * when we are not holding the dn_dbufs_mtx, we can't clear the
 * entry in the dn_dbufs list.  We have to wait until dbuf_destroy()
 * in this case.  For callers from the DMU we will usually see:
 *	dbuf_clear()->arc_buf_evict()->dbuf_do_evict()->dbuf_destroy()
 * For the arc callback, we will usually see:
 *	dbuf_do_evict()->dbuf_clear();dbuf_destroy()
 * Sometimes, though, we will get a mix of these two:
 *	DMU: dbuf_clear()->arc_buf_evict()
 *	ARC: dbuf_do_evict()->dbuf_destroy()
 */
void
dbuf_clear(dmu_buf_impl_t *db)
{
	dnode_t *dn;
	dmu_buf_impl_t *parent = db->db_parent;
	dmu_buf_impl_t *dndb;
	int dbuf_gone = FALSE;

	ASSERT(MUTEX_HELD(&db->db_mtx));
	ASSERT(refcount_is_zero(&db->db_holds));

	dbuf_evict_user(db);

	if (db->db_state == DB_CACHED) {
		ASSERT(db->db.db_data != NULL);
		if (db->db_blkid == DMU_BONUS_BLKID) {
			zio_buf_free(db->db.db_data, DN_MAX_BONUSLEN);
			arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
		}
		db->db.db_data = NULL;
		db->db_state = DB_UNCACHED;
	}

	ASSERT(db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL);
	ASSERT(db->db_data_pending == NULL);

	db->db_state = DB_EVICTING;
	db->db_blkptr = NULL;

	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);
	dndb = dn->dn_dbuf;
	if (db->db_blkid != DMU_BONUS_BLKID && MUTEX_HELD(&dn->dn_dbufs_mtx)) {
		list_remove(&dn->dn_dbufs, db);
		(void) atomic_dec_32_nv(&dn->dn_dbufs_count);
		membar_producer();
		DB_DNODE_EXIT(db);
		/*
		 * Decrementing the dbuf count means that the hold corresponding
		 * to the removed dbuf is no longer discounted in dnode_move(),
		 * so the dnode cannot be moved until after we release the hold.
		 * The membar_producer() ensures visibility of the decremented
		 * value in dnode_move(), since DB_DNODE_EXIT doesn't actually
		 * release any lock.
		 */
		dnode_rele(dn, db);
		db->db_dnode_handle = NULL;
	} else {
		DB_DNODE_EXIT(db);
	}

	if (db->db_buf)
		dbuf_gone = arc_buf_evict(db->db_buf);

	if (!dbuf_gone)
		mutex_exit(&db->db_mtx);

	/*
	 * If this dbuf is referenced from an indirect dbuf,
	 * decrement the ref count on the indirect dbuf.
	 */
	if (parent && parent != dndb)
		dbuf_rele(parent, db);
}

static int
dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse,
    dmu_buf_impl_t **parentp, blkptr_t **bpp)
{
	int nlevels, epbs;

	*parentp = NULL;
	*bpp = NULL;

	ASSERT(blkid != DMU_BONUS_BLKID);

	if (blkid == DMU_SPILL_BLKID) {
		mutex_enter(&dn->dn_mtx);
		if (dn->dn_have_spill &&
		    (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR))
			*bpp = &dn->dn_phys->dn_spill;
		else
			*bpp = NULL;
		dbuf_add_ref(dn->dn_dbuf, NULL);
		*parentp = dn->dn_dbuf;
		mutex_exit(&dn->dn_mtx);
		return (0);
	}

	if (dn->dn_phys->dn_nlevels == 0)
		nlevels = 1;
	else
		nlevels = dn->dn_phys->dn_nlevels;

	epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;

	ASSERT3U(level * epbs, <, 64);
	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
	if (level >= nlevels ||
	    (blkid > (dn->dn_phys->dn_maxblkid >> (level * epbs)))) {
		/* the buffer has no parent yet */
		return (ENOENT);
	} else if (level < nlevels-1) {
		/* this block is referenced from an indirect block */
		int err = dbuf_hold_impl(dn, level+1,
		    blkid >> epbs, fail_sparse, NULL, parentp);
		if (err)
			return (err);
		err = dbuf_read(*parentp, NULL,
		    (DB_RF_HAVESTRUCT | DB_RF_NOPREFETCH | DB_RF_CANFAIL));
		if (err) {
			dbuf_rele(*parentp, NULL);
			*parentp = NULL;
			return (err);
		}
		*bpp = ((blkptr_t *)(*parentp)->db.db_data) +
		    (blkid & ((1ULL << epbs) - 1));
		return (0);
	} else {
		/* the block is referenced from the dnode */
		ASSERT3U(level, ==, nlevels-1);
		ASSERT(dn->dn_phys->dn_nblkptr == 0 ||
		    blkid < dn->dn_phys->dn_nblkptr);
		if (dn->dn_dbuf) {
			dbuf_add_ref(dn->dn_dbuf, NULL);
			*parentp = dn->dn_dbuf;
		}
		*bpp = &dn->dn_phys->dn_blkptr[blkid];
		return (0);
	}
}

static dmu_buf_impl_t *
dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid,
    dmu_buf_impl_t *parent, blkptr_t *blkptr)
{
	objset_t *os = dn->dn_objset;
	dmu_buf_impl_t *db, *odb;

	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
	ASSERT(dn->dn_type != DMU_OT_NONE);

	db = kmem_cache_alloc(dbuf_cache, KM_SLEEP);

	db->db_objset = os;
	db->db.db_object = dn->dn_object;
	db->db_level = level;
	db->db_blkid = blkid;
	db->db_last_dirty = NULL;
	db->db_dirtycnt = 0;
	db->db_dnode_handle = dn->dn_handle;
	db->db_parent = parent;
	db->db_blkptr = blkptr;

	db->db_user_ptr = NULL;
	db->db_user_data_ptr_ptr = NULL;
	db->db_evict_func = NULL;
	db->db_immediate_evict = 0;
	db->db_freed_in_flight = 0;

	if (blkid == DMU_BONUS_BLKID) {
		ASSERT3P(parent, ==, dn->dn_dbuf);
		db->db.db_size = DN_MAX_BONUSLEN -
		    (dn->dn_nblkptr-1) * sizeof (blkptr_t);
		ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
		db->db.db_offset = DMU_BONUS_BLKID;
		db->db_state = DB_UNCACHED;
		/* the bonus dbuf is not placed in the hash table */
		arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
		return (db);
	} else if (blkid == DMU_SPILL_BLKID) {
		db->db.db_size = (blkptr != NULL) ?
		    BP_GET_LSIZE(blkptr) : SPA_MINBLOCKSIZE;
		db->db.db_offset = 0;
	} else {
		int blocksize =
		    db->db_level ? 1<<dn->dn_indblkshift : dn->dn_datablksz;
1<<dn->dn_indblkshift : dn->dn_datablksz; 1700 db->db.db_size = blocksize; 1701 db->db.db_offset = db->db_blkid * blocksize; 1702 } 1703 1704 /* 1705 * Hold the dn_dbufs_mtx while we get the new dbuf 1706 * in the hash table *and* added to the dbufs list. 1707 * This prevents a possible deadlock with someone 1708 * trying to look up this dbuf before its added to the 1709 * dn_dbufs list. 1710 */ 1711 mutex_enter(&dn->dn_dbufs_mtx); 1712 db->db_state = DB_EVICTING; 1713 if ((odb = dbuf_hash_insert(db)) != NULL) { 1714 /* someone else inserted it first */ 1715 kmem_cache_free(dbuf_cache, db); 1716 mutex_exit(&dn->dn_dbufs_mtx); 1717 return (odb); 1718 } 1719 list_insert_head(&dn->dn_dbufs, db); 1720 db->db_state = DB_UNCACHED; 1721 mutex_exit(&dn->dn_dbufs_mtx); 1722 arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER); 1723 1724 if (parent && parent != dn->dn_dbuf) 1725 dbuf_add_ref(parent, db); 1726 1727 ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT || 1728 refcount_count(&dn->dn_holds) > 0); 1729 (void) refcount_add(&dn->dn_holds, db); 1730 (void) atomic_inc_32_nv(&dn->dn_dbufs_count); 1731 1732 dprintf_dbuf(db, "db=%p\n", db); 1733 1734 return (db); 1735 } 1736 1737 static int 1738 dbuf_do_evict(void *private) 1739 { 1740 arc_buf_t *buf = private; 1741 dmu_buf_impl_t *db = buf->b_private; 1742 1743 if (!MUTEX_HELD(&db->db_mtx)) 1744 mutex_enter(&db->db_mtx); 1745 1746 ASSERT(refcount_is_zero(&db->db_holds)); 1747 1748 if (db->db_state != DB_EVICTING) { 1749 ASSERT(db->db_state == DB_CACHED); 1750 DBUF_VERIFY(db); 1751 db->db_buf = NULL; 1752 dbuf_evict(db); 1753 } else { 1754 mutex_exit(&db->db_mtx); 1755 dbuf_destroy(db); 1756 } 1757 return (0); 1758 } 1759 1760 static void 1761 dbuf_destroy(dmu_buf_impl_t *db) 1762 { 1763 ASSERT(refcount_is_zero(&db->db_holds)); 1764 1765 if (db->db_blkid != DMU_BONUS_BLKID) { 1766 /* 1767 * If this dbuf is still on the dn_dbufs list, 1768 * remove it from that list. 1769 */ 1770 if (db->db_dnode_handle != NULL) { 1771 dnode_t *dn; 1772 1773 DB_DNODE_ENTER(db); 1774 dn = DB_DNODE(db); 1775 mutex_enter(&dn->dn_dbufs_mtx); 1776 list_remove(&dn->dn_dbufs, db); 1777 (void) atomic_dec_32_nv(&dn->dn_dbufs_count); 1778 mutex_exit(&dn->dn_dbufs_mtx); 1779 DB_DNODE_EXIT(db); 1780 /* 1781 * Decrementing the dbuf count means that the hold 1782 * corresponding to the removed dbuf is no longer 1783 * discounted in dnode_move(), so the dnode cannot be 1784 * moved until after we release the hold. 1785 */ 1786 dnode_rele(dn, db); 1787 db->db_dnode_handle = NULL; 1788 } 1789 dbuf_hash_remove(db); 1790 } 1791 db->db_parent = NULL; 1792 db->db_buf = NULL; 1793 1794 ASSERT(!list_link_active(&db->db_link)); 1795 ASSERT(db->db.db_data == NULL); 1796 ASSERT(db->db_hash_next == NULL); 1797 ASSERT(db->db_blkptr == NULL); 1798 ASSERT(db->db_data_pending == NULL); 1799 1800 kmem_cache_free(dbuf_cache, db); 1801 arc_space_return(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER); 1802 } 1803 1804 void 1805 dbuf_prefetch(dnode_t *dn, uint64_t blkid) 1806 { 1807 dmu_buf_impl_t *db = NULL; 1808 blkptr_t *bp = NULL; 1809 1810 ASSERT(blkid != DMU_BONUS_BLKID); 1811 ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); 1812 1813 if (dnode_block_freed(dn, blkid)) 1814 return; 1815 1816 /* dbuf_find() returns with db_mtx held */ 1817 if (db = dbuf_find(dn, 0, blkid)) { 1818 /* 1819 * This dbuf is already in the cache. We assume that 1820 * it is already CACHED, or else about to be either 1821 * read or filled. 
1822 */ 1823 mutex_exit(&db->db_mtx); 1824 return; 1825 } 1826 1827 if (dbuf_findbp(dn, 0, blkid, TRUE, &db, &bp) == 0) { 1828 if (bp && !BP_IS_HOLE(bp)) { 1829 int priority = dn->dn_type == DMU_OT_DDT_ZAP ? 1830 ZIO_PRIORITY_DDT_PREFETCH : ZIO_PRIORITY_ASYNC_READ; 1831 arc_buf_t *pbuf; 1832 dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset; 1833 uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH; 1834 zbookmark_t zb; 1835 1836 SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET, 1837 dn->dn_object, 0, blkid); 1838 1839 if (db) 1840 pbuf = db->db_buf; 1841 else 1842 pbuf = dn->dn_objset->os_phys_buf; 1843 1844 (void) dsl_read(NULL, dn->dn_objset->os_spa, 1845 bp, pbuf, NULL, NULL, priority, 1846 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, 1847 &aflags, &zb); 1848 } 1849 if (db) 1850 dbuf_rele(db, NULL); 1851 } 1852 } 1853 1854 /* 1855 * Returns with db_holds incremented, and db_mtx not held. 1856 * Note: dn_struct_rwlock must be held. 1857 */ 1858 int 1859 dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, int fail_sparse, 1860 void *tag, dmu_buf_impl_t **dbp) 1861 { 1862 dmu_buf_impl_t *db, *parent = NULL; 1863 1864 ASSERT(blkid != DMU_BONUS_BLKID); 1865 ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); 1866 ASSERT3U(dn->dn_nlevels, >, level); 1867 1868 *dbp = NULL; 1869 top: 1870 /* dbuf_find() returns with db_mtx held */ 1871 db = dbuf_find(dn, level, blkid); 1872 1873 if (db == NULL) { 1874 blkptr_t *bp = NULL; 1875 int err; 1876 1877 ASSERT3P(parent, ==, NULL); 1878 err = dbuf_findbp(dn, level, blkid, fail_sparse, &parent, &bp); 1879 if (fail_sparse) { 1880 if (err == 0 && bp && BP_IS_HOLE(bp)) 1881 err = ENOENT; 1882 if (err) { 1883 if (parent) 1884 dbuf_rele(parent, NULL); 1885 return (err); 1886 } 1887 } 1888 if (err && err != ENOENT) 1889 return (err); 1890 db = dbuf_create(dn, level, blkid, parent, bp); 1891 } 1892 1893 if (db->db_buf && refcount_is_zero(&db->db_holds)) { 1894 arc_buf_add_ref(db->db_buf, db); 1895 if (db->db_buf->b_data == NULL) { 1896 dbuf_clear(db); 1897 if (parent) { 1898 dbuf_rele(parent, NULL); 1899 parent = NULL; 1900 } 1901 goto top; 1902 } 1903 ASSERT3P(db->db.db_data, ==, db->db_buf->b_data); 1904 } 1905 1906 ASSERT(db->db_buf == NULL || arc_referenced(db->db_buf)); 1907 1908 /* 1909 * If this buffer is currently syncing out, and we are are 1910 * still referencing it from db_data, we need to make a copy 1911 * of it in case we decide we want to dirty it again in this txg. 
/*
 * Returns with db_holds incremented, and db_mtx not held.
 * Note: dn_struct_rwlock must be held.
 */
int
dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, int fail_sparse,
    void *tag, dmu_buf_impl_t **dbp)
{
	dmu_buf_impl_t *db, *parent = NULL;

	ASSERT(blkid != DMU_BONUS_BLKID);
	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
	ASSERT3U(dn->dn_nlevels, >, level);

	*dbp = NULL;
top:
	/* dbuf_find() returns with db_mtx held */
	db = dbuf_find(dn, level, blkid);

	if (db == NULL) {
		blkptr_t *bp = NULL;
		int err;

		ASSERT3P(parent, ==, NULL);
		err = dbuf_findbp(dn, level, blkid, fail_sparse, &parent, &bp);
		if (fail_sparse) {
			if (err == 0 && bp && BP_IS_HOLE(bp))
				err = ENOENT;
			if (err) {
				if (parent)
					dbuf_rele(parent, NULL);
				return (err);
			}
		}
		if (err && err != ENOENT)
			return (err);
		db = dbuf_create(dn, level, blkid, parent, bp);
	}

	if (db->db_buf && refcount_is_zero(&db->db_holds)) {
		arc_buf_add_ref(db->db_buf, db);
		if (db->db_buf->b_data == NULL) {
			dbuf_clear(db);
			if (parent) {
				dbuf_rele(parent, NULL);
				parent = NULL;
			}
			goto top;
		}
		ASSERT3P(db->db.db_data, ==, db->db_buf->b_data);
	}

	ASSERT(db->db_buf == NULL || arc_referenced(db->db_buf));

	/*
	 * If this buffer is currently syncing out, and we are
	 * still referencing it from db_data, we need to make a copy
	 * of it in case we decide we want to dirty it again in this txg.
	 */
	if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
	    dn->dn_object != DMU_META_DNODE_OBJECT &&
	    db->db_state == DB_CACHED && db->db_data_pending) {
		dbuf_dirty_record_t *dr = db->db_data_pending;

		if (dr->dt.dl.dr_data == db->db_buf) {
			arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);

			dbuf_set_data(db,
			    arc_buf_alloc(dn->dn_objset->os_spa,
			    db->db.db_size, db, type));
			bcopy(dr->dt.dl.dr_data->b_data, db->db.db_data,
			    db->db.db_size);
		}
	}

	(void) refcount_add(&db->db_holds, tag);
	dbuf_update_data(db);
	DBUF_VERIFY(db);
	mutex_exit(&db->db_mtx);

	/* NOTE: we can't rele the parent until after we drop the db_mtx */
	if (parent)
		dbuf_rele(parent, NULL);

	ASSERT3P(DB_DNODE(db), ==, dn);
	ASSERT3U(db->db_blkid, ==, blkid);
	ASSERT3U(db->db_level, ==, level);
	*dbp = db;

	return (0);
}

dmu_buf_impl_t *
dbuf_hold(dnode_t *dn, uint64_t blkid, void *tag)
{
	dmu_buf_impl_t *db;
	int err = dbuf_hold_impl(dn, 0, blkid, FALSE, tag, &db);
	return (err ? NULL : db);
}

dmu_buf_impl_t *
dbuf_hold_level(dnode_t *dn, int level, uint64_t blkid, void *tag)
{
	dmu_buf_impl_t *db;
	int err = dbuf_hold_impl(dn, level, blkid, FALSE, tag, &db);
	return (err ? NULL : db);
}

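/*
 * Illustrative sketch (under #if 0, never compiled) of the hold/release
 * contract documented above for dbuf_hold_impl(): take dn_struct_rwlock,
 * obtain a hold, then drop it with dbuf_rele().  The helper name is
 * hypothetical; error handling is reduced to the bare minimum.
 */
#if 0
static int
dbuf_hold_example_sketch(dnode_t *dn, uint64_t blkid, void *tag)
{
	dmu_buf_impl_t *db;
	int err;

	rw_enter(&dn->dn_struct_rwlock, RW_READER);
	err = dbuf_hold_impl(dn, 0, blkid, FALSE, tag, &db);
	rw_exit(&dn->dn_struct_rwlock);
	if (err != 0)
		return (err);

	/* db_holds has been incremented and db_mtx is not held here. */
	dbuf_rele(db, tag);
	return (0);
}
#endif
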
void
dbuf_create_bonus(dnode_t *dn)
{
	ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));

	ASSERT(dn->dn_bonus == NULL);
	dn->dn_bonus = dbuf_create(dn, 0, DMU_BONUS_BLKID, dn->dn_dbuf, NULL);
}

int
dbuf_spill_set_blksz(dmu_buf_t *db_fake, uint64_t blksz, dmu_tx_t *tx)
{
	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
	dnode_t *dn;

	if (db->db_blkid != DMU_SPILL_BLKID)
		return (ENOTSUP);
	if (blksz == 0)
		blksz = SPA_MINBLOCKSIZE;
	if (blksz > SPA_MAXBLOCKSIZE)
		blksz = SPA_MAXBLOCKSIZE;
	else
		blksz = P2ROUNDUP(blksz, SPA_MINBLOCKSIZE);

	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);
	rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
	dbuf_new_size(db, blksz, tx);
	rw_exit(&dn->dn_struct_rwlock);
	DB_DNODE_EXIT(db);

	return (0);
}

void
dbuf_rm_spill(dnode_t *dn, dmu_tx_t *tx)
{
	dbuf_free_range(dn, DMU_SPILL_BLKID, DMU_SPILL_BLKID, tx);
}

#pragma weak dmu_buf_add_ref = dbuf_add_ref
void
dbuf_add_ref(dmu_buf_impl_t *db, void *tag)
{
	int64_t holds = refcount_add(&db->db_holds, tag);
	ASSERT(holds > 1);
}

/*
 * If you call dbuf_rele() you had better not be referencing the dnode handle
 * unless you have some other direct or indirect hold on the dnode. (An
 * indirect hold is a hold on one of the dnode's dbufs, including the bonus
 * buffer.)  Without that, the dbuf_rele() could lead to a dnode_rele()
 * followed by the dnode's parent dbuf evicting its dnode handles.
 */
#pragma weak dmu_buf_rele = dbuf_rele
void
dbuf_rele(dmu_buf_impl_t *db, void *tag)
{
	mutex_enter(&db->db_mtx);
	dbuf_rele_and_unlock(db, tag);
}

/*
 * dbuf_rele() for an already-locked dbuf.  This is necessary to allow
 * db_dirtycnt and db_holds to be updated atomically.
 */
void
dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag)
{
	int64_t holds;

	ASSERT(MUTEX_HELD(&db->db_mtx));
	DBUF_VERIFY(db);

	/*
	 * Remove the reference to the dbuf before removing its hold on the
	 * dnode so we can guarantee in dnode_move() that a referenced bonus
	 * buffer has a corresponding dnode hold.
	 */
	holds = refcount_remove(&db->db_holds, tag);
	ASSERT(holds >= 0);

	/*
	 * We can't freeze indirects if there is a possibility that they
	 * may be modified in the current syncing context.
	 */
	if (db->db_buf && holds == (db->db_level == 0 ? db->db_dirtycnt : 0))
		arc_buf_freeze(db->db_buf);

	if (holds == db->db_dirtycnt &&
	    db->db_level == 0 && db->db_immediate_evict)
		dbuf_evict_user(db);

	if (holds == 0) {
		if (db->db_blkid == DMU_BONUS_BLKID) {
			mutex_exit(&db->db_mtx);

			/*
			 * If the dnode moves here, we cannot cross this
			 * barrier until the move completes.
			 */
			DB_DNODE_ENTER(db);
			(void) atomic_dec_32_nv(&DB_DNODE(db)->dn_dbufs_count);
			DB_DNODE_EXIT(db);
			/*
			 * The bonus buffer's dnode hold is no longer
			 * discounted in dnode_move(). The dnode cannot move
			 * until after the dnode_rele().
			 */
			dnode_rele(DB_DNODE(db), db);
		} else if (db->db_buf == NULL) {
			/*
			 * This is a special case: we never associated this
			 * dbuf with any data allocated from the ARC.
			 */
			ASSERT(db->db_state == DB_UNCACHED ||
			    db->db_state == DB_NOFILL);
			dbuf_evict(db);
		} else if (arc_released(db->db_buf)) {
			arc_buf_t *buf = db->db_buf;
			/*
			 * This dbuf has anonymous data associated with it.
			 */
			dbuf_set_data(db, NULL);
			VERIFY(arc_buf_remove_ref(buf, db) == 1);
			dbuf_evict(db);
		} else {
			VERIFY(arc_buf_remove_ref(db->db_buf, db) == 0);
			if (!DBUF_IS_CACHEABLE(db))
				dbuf_clear(db);
			else
				mutex_exit(&db->db_mtx);
		}
	} else {
		mutex_exit(&db->db_mtx);
	}
}

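/*
 * Illustrative sketch (under #if 0, never compiled) of the rule spelled
 * out above dbuf_rele(): any use of the dnode reached through the dbuf's
 * handle must happen before the last hold on that dbuf is dropped, unless
 * some other hold keeps the dnode alive.  The helper name is hypothetical.
 */
#if 0
static uint64_t
dbuf_rele_object_sketch(dmu_buf_impl_t *db, void *tag)
{
	uint64_t object;

	DB_DNODE_ENTER(db);
	object = DB_DNODE(db)->dn_object;	/* safe: db is still held */
	DB_DNODE_EXIT(db);

	dbuf_rele(db, tag);		/* db and its handle may now be gone */
	return (object);
}
#endif
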
#pragma weak dmu_buf_refcount = dbuf_refcount
uint64_t
dbuf_refcount(dmu_buf_impl_t *db)
{
	return (refcount_count(&db->db_holds));
}

void *
dmu_buf_set_user(dmu_buf_t *db_fake, void *user_ptr, void *user_data_ptr_ptr,
    dmu_buf_evict_func_t *evict_func)
{
	return (dmu_buf_update_user(db_fake, NULL, user_ptr,
	    user_data_ptr_ptr, evict_func));
}

void *
dmu_buf_set_user_ie(dmu_buf_t *db_fake, void *user_ptr,
    void *user_data_ptr_ptr, dmu_buf_evict_func_t *evict_func)
{
	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;

	db->db_immediate_evict = TRUE;
	return (dmu_buf_update_user(db_fake, NULL, user_ptr,
	    user_data_ptr_ptr, evict_func));
}

void *
dmu_buf_update_user(dmu_buf_t *db_fake, void *old_user_ptr, void *user_ptr,
    void *user_data_ptr_ptr, dmu_buf_evict_func_t *evict_func)
{
	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
	ASSERT(db->db_level == 0);

	ASSERT((user_ptr == NULL) == (evict_func == NULL));

	mutex_enter(&db->db_mtx);

	if (db->db_user_ptr == old_user_ptr) {
		db->db_user_ptr = user_ptr;
		db->db_user_data_ptr_ptr = user_data_ptr_ptr;
		db->db_evict_func = evict_func;

		dbuf_update_data(db);
	} else {
		old_user_ptr = db->db_user_ptr;
	}

	mutex_exit(&db->db_mtx);
	return (old_user_ptr);
}

void *
dmu_buf_get_user(dmu_buf_t *db_fake)
{
	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
	ASSERT(!refcount_is_zero(&db->db_holds));

	return (db->db_user_ptr);
}

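/*
 * Illustrative sketch (under #if 0, never compiled) of a consumer of the
 * user-data interface above, loosely modeled on how dnode and SA handles
 * hang off their backing dbufs.  The structure, callback and helper names
 * are hypothetical.
 */
#if 0
typedef struct user_sketch {
	void	*us_state;
} user_sketch_t;

static void
user_sketch_evict(dmu_buf_t *db, void *user_ptr)
{
	user_sketch_t *us = user_ptr;

	/* The dbuf is being evicted; tear down the attached state. */
	kmem_free(us, sizeof (user_sketch_t));
}

static void
user_sketch_attach(dmu_buf_t *db)
{
	user_sketch_t *us = kmem_zalloc(sizeof (user_sketch_t), KM_SLEEP);

	/* A non-NULL return means someone else attached state first. */
	if (dmu_buf_set_user(db, us, NULL, user_sketch_evict) != NULL)
		kmem_free(us, sizeof (user_sketch_t));
}
#endif
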
boolean_t
dmu_buf_freeable(dmu_buf_t *dbuf)
{
	boolean_t res = B_FALSE;
	dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf;

	if (db->db_blkptr)
		res = dsl_dataset_block_freeable(db->db_objset->os_dsl_dataset,
		    db->db_blkptr, db->db_blkptr->blk_birth);

	return (res);
}

static void
dbuf_check_blkptr(dnode_t *dn, dmu_buf_impl_t *db)
{
	/* ASSERT(dmu_tx_is_syncing(tx)) */
	ASSERT(MUTEX_HELD(&db->db_mtx));

	if (db->db_blkptr != NULL)
		return;

	if (db->db_blkid == DMU_SPILL_BLKID) {
		db->db_blkptr = &dn->dn_phys->dn_spill;
		BP_ZERO(db->db_blkptr);
		return;
	}
	if (db->db_level == dn->dn_phys->dn_nlevels-1) {
		/*
		 * This buffer was allocated at a time when there were
		 * no available blkptrs from the dnode, or it was
		 * inappropriate to hook it in (i.e., nlevels mismatch).
		 */
		ASSERT(db->db_blkid < dn->dn_phys->dn_nblkptr);
		ASSERT(db->db_parent == NULL);
		db->db_parent = dn->dn_dbuf;
		db->db_blkptr = &dn->dn_phys->dn_blkptr[db->db_blkid];
		DBUF_VERIFY(db);
	} else {
		dmu_buf_impl_t *parent = db->db_parent;
		int epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;

		ASSERT(dn->dn_phys->dn_nlevels > 1);
		if (parent == NULL) {
			mutex_exit(&db->db_mtx);
			rw_enter(&dn->dn_struct_rwlock, RW_READER);
			(void) dbuf_hold_impl(dn, db->db_level+1,
			    db->db_blkid >> epbs, FALSE, db, &parent);
			rw_exit(&dn->dn_struct_rwlock);
			mutex_enter(&db->db_mtx);
			db->db_parent = parent;
		}
		db->db_blkptr = (blkptr_t *)parent->db.db_data +
		    (db->db_blkid & ((1ULL << epbs) - 1));
		DBUF_VERIFY(db);
	}
}

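/*
 * Worked example of the parent/child arithmetic used in
 * dbuf_check_blkptr() above, with made-up numbers: for a 16K indirect
 * block (dn_indblkshift == 14) and SPA_BLKPTRSHIFT == 7, epbs == 7, so
 * each indirect block maps 128 children.  Child blkid 300 then lives in
 * parent blkid 300 >> 7 == 2, at slot 300 & 127 == 44.  The helper below
 * (under #if 0, never compiled) is hypothetical and only restates the
 * pointer math.
 */
#if 0
static blkptr_t *
dbuf_child_blkptr_sketch(dmu_buf_impl_t *parent, uint64_t blkid, int epbs)
{
	return ((blkptr_t *)parent->db.db_data +
	    (blkid & ((1ULL << epbs) - 1)));
}
#endif
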
static void
dbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
{
	dmu_buf_impl_t *db = dr->dr_dbuf;
	dnode_t *dn;
	zio_t *zio;

	ASSERT(dmu_tx_is_syncing(tx));

	dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr);

	mutex_enter(&db->db_mtx);

	ASSERT(db->db_level > 0);
	DBUF_VERIFY(db);

	if (db->db_buf == NULL) {
		mutex_exit(&db->db_mtx);
		(void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED);
		mutex_enter(&db->db_mtx);
	}
	ASSERT3U(db->db_state, ==, DB_CACHED);
	ASSERT(db->db_buf != NULL);

	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);
	ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
	dbuf_check_blkptr(dn, db);
	DB_DNODE_EXIT(db);

	db->db_data_pending = dr;

	mutex_exit(&db->db_mtx);
	dbuf_write(dr, db->db_buf, tx);

	zio = dr->dr_zio;
	mutex_enter(&dr->dt.di.dr_mtx);
	dbuf_sync_list(&dr->dt.di.dr_children, tx);
	ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
	mutex_exit(&dr->dt.di.dr_mtx);
	zio_nowait(zio);
}

static void
dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
{
	arc_buf_t **datap = &dr->dt.dl.dr_data;
	dmu_buf_impl_t *db = dr->dr_dbuf;
	dnode_t *dn;
	objset_t *os;
	uint64_t txg = tx->tx_txg;

	ASSERT(dmu_tx_is_syncing(tx));

	dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr);

	mutex_enter(&db->db_mtx);
	/*
	 * To be synced, we must be dirtied.  But we
	 * might have been freed after the dirty.
	 */
	if (db->db_state == DB_UNCACHED) {
		/* This buffer has been freed since it was dirtied */
		ASSERT(db->db.db_data == NULL);
	} else if (db->db_state == DB_FILL) {
		/* This buffer was freed and is now being re-filled */
		ASSERT(db->db.db_data != dr->dt.dl.dr_data);
	} else {
		ASSERT(db->db_state == DB_CACHED || db->db_state == DB_NOFILL);
	}
	DBUF_VERIFY(db);

	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);

	if (db->db_blkid == DMU_SPILL_BLKID) {
		mutex_enter(&dn->dn_mtx);
		dn->dn_phys->dn_flags |= DNODE_FLAG_SPILL_BLKPTR;
		mutex_exit(&dn->dn_mtx);
	}

	/*
	 * If this is a bonus buffer, simply copy the bonus data into the
	 * dnode.  It will be written out when the dnode is synced (and it
	 * will be synced, since it must have been dirty for dbuf_sync to
	 * be called).
	 */
	if (db->db_blkid == DMU_BONUS_BLKID) {
		dbuf_dirty_record_t **drp;

		ASSERT(*datap != NULL);
		ASSERT3U(db->db_level, ==, 0);
		ASSERT3U(dn->dn_phys->dn_bonuslen, <=, DN_MAX_BONUSLEN);
		bcopy(*datap, DN_BONUS(dn->dn_phys), dn->dn_phys->dn_bonuslen);
		DB_DNODE_EXIT(db);

		if (*datap != db->db.db_data) {
			zio_buf_free(*datap, DN_MAX_BONUSLEN);
			arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
		}
		db->db_data_pending = NULL;
		drp = &db->db_last_dirty;
		while (*drp != dr)
			drp = &(*drp)->dr_next;
		ASSERT(dr->dr_next == NULL);
		ASSERT(dr->dr_dbuf == db);
		*drp = dr->dr_next;
		kmem_free(dr, sizeof (dbuf_dirty_record_t));
		ASSERT(db->db_dirtycnt > 0);
		db->db_dirtycnt -= 1;
		dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg);
		return;
	}

	os = dn->dn_objset;

	/*
	 * This function may have dropped the db_mtx lock allowing a dmu_sync
	 * operation to sneak in. As a result, we need to ensure that we
	 * don't check the dr_override_state until we have returned from
	 * dbuf_check_blkptr.
	 */
	dbuf_check_blkptr(dn, db);

	/*
	 * If this buffer is in the middle of an immediate write,
	 * wait for the synchronous IO to complete.
	 */
	while (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC) {
		ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT);
		cv_wait(&db->db_changed, &db->db_mtx);
		ASSERT(dr->dt.dl.dr_override_state != DR_NOT_OVERRIDDEN);
	}

	if (db->db_state != DB_NOFILL &&
	    dn->dn_object != DMU_META_DNODE_OBJECT &&
	    refcount_count(&db->db_holds) > 1 &&
	    dr->dt.dl.dr_override_state != DR_OVERRIDDEN &&
	    *datap == db->db_buf) {
		/*
		 * If this buffer is currently "in use" (i.e., there
		 * are active holds and db_data still references it),
		 * then make a copy before we start the write so that
		 * any modifications from the open txg will not leak
		 * into this write.
		 *
		 * NOTE: this copy does not need to be made for
		 * objects only modified in the syncing context (e.g.
		 * DNODE blocks).
		 */
		int blksz = arc_buf_size(*datap);
		arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
		*datap = arc_buf_alloc(os->os_spa, blksz, db, type);
		bcopy(db->db.db_data, (*datap)->b_data, blksz);
	}
	db->db_data_pending = dr;

	mutex_exit(&db->db_mtx);

	dbuf_write(dr, *datap, tx);

	ASSERT(!list_link_active(&dr->dr_dirty_node));
	if (dn->dn_object == DMU_META_DNODE_OBJECT) {
		list_insert_tail(&dn->dn_dirty_records[txg&TXG_MASK], dr);
		DB_DNODE_EXIT(db);
	} else {
		/*
		 * Although zio_nowait() does not "wait for an IO", it does
		 * initiate the IO.  If this is an empty write it seems
		 * plausible that the IO could actually be completed before
		 * the nowait returns.  We need to DB_DNODE_EXIT() first in
		 * case zio_nowait() invalidates the dbuf.
		 */
		DB_DNODE_EXIT(db);
		zio_nowait(dr->dr_zio);
	}
}

void
dbuf_sync_list(list_t *list, dmu_tx_t *tx)
{
	dbuf_dirty_record_t *dr;

	while (dr = list_head(list)) {
		if (dr->dr_zio != NULL) {
			/*
			 * If we find an already initialized zio then we
			 * are processing the meta-dnode, and we have finished.
			 * The dbufs for all dnodes are put back on the list
			 * during processing, so that we can zio_wait()
			 * these IOs after initiating all child IOs.
			 */
			ASSERT3U(dr->dr_dbuf->db.db_object, ==,
			    DMU_META_DNODE_OBJECT);
			break;
		}
		list_remove(list, dr);
		if (dr->dr_dbuf->db_level > 0)
			dbuf_sync_indirect(dr, tx);
		else
			dbuf_sync_leaf(dr, tx);
	}
}

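/*
 * Rough sketch (under #if 0, never compiled) of the shape of the call
 * into dbuf_sync_list() from the syncing context: a dnode's dirty records
 * are kept per-txg in dn_dirty_records[txg & TXG_MASK].  The real caller
 * lives in dnode_sync(), not here; the helper name is hypothetical.
 */
#if 0
static void
dbuf_sync_list_caller_sketch(dnode_t *dn, dmu_tx_t *tx)
{
	list_t *list = &dn->dn_dirty_records[tx->tx_txg & TXG_MASK];

	dbuf_sync_list(list, tx);
}
#endif
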
/* ARGSUSED */
static void
dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
{
	dmu_buf_impl_t *db = vdb;
	dnode_t *dn;
	blkptr_t *bp = zio->io_bp;
	blkptr_t *bp_orig = &zio->io_bp_orig;
	spa_t *spa = zio->io_spa;
	int64_t delta;
	uint64_t fill = 0;
	int i;

	ASSERT(db->db_blkptr == bp);

	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);
	delta = bp_get_dsize_sync(spa, bp) - bp_get_dsize_sync(spa, bp_orig);
	dnode_diduse_space(dn, delta - zio->io_prev_space_delta);
	zio->io_prev_space_delta = delta;

	if (BP_IS_HOLE(bp)) {
		ASSERT(bp->blk_fill == 0);
		DB_DNODE_EXIT(db);
		return;
	}

	ASSERT((db->db_blkid != DMU_SPILL_BLKID &&
	    BP_GET_TYPE(bp) == dn->dn_type) ||
	    (db->db_blkid == DMU_SPILL_BLKID &&
	    BP_GET_TYPE(bp) == dn->dn_bonustype));
	ASSERT(BP_GET_LEVEL(bp) == db->db_level);

	mutex_enter(&db->db_mtx);

#ifdef ZFS_DEBUG
	if (db->db_blkid == DMU_SPILL_BLKID) {
		ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR);
		ASSERT(!(BP_IS_HOLE(db->db_blkptr)) &&
		    db->db_blkptr == &dn->dn_phys->dn_spill);
	}
#endif

	if (db->db_level == 0) {
		mutex_enter(&dn->dn_mtx);
		if (db->db_blkid > dn->dn_phys->dn_maxblkid &&
		    db->db_blkid != DMU_SPILL_BLKID)
			dn->dn_phys->dn_maxblkid = db->db_blkid;
		mutex_exit(&dn->dn_mtx);

		if (dn->dn_type == DMU_OT_DNODE) {
			dnode_phys_t *dnp = db->db.db_data;
			for (i = db->db.db_size >> DNODE_SHIFT; i > 0;
			    i--, dnp++) {
				if (dnp->dn_type != DMU_OT_NONE)
					fill++;
			}
		} else {
			fill = 1;
		}
	} else {
		blkptr_t *ibp = db->db.db_data;
		ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
		for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, ibp++) {
			if (BP_IS_HOLE(ibp))
				continue;
			fill += ibp->blk_fill;
		}
	}
	DB_DNODE_EXIT(db);

	bp->blk_fill = fill;

	mutex_exit(&db->db_mtx);
}

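/*
 * Worked example (under #if 0, never compiled) of the blk_fill accounting
 * in dbuf_write_ready() above, with made-up numbers: an indirect block
 * whose non-hole children carry fill counts 3 and 5 ends up with
 * blk_fill == 8, while an ordinary level-0 data block always contributes
 * 1.  The helper below is hypothetical and only restates the summation
 * loop.
 */
#if 0
static uint64_t
dbuf_fill_sum_sketch(const blkptr_t *ibp, int nbps)
{
	uint64_t fill = 0;
	int i;

	for (i = 0; i < nbps; i++, ibp++) {
		if (BP_IS_HOLE(ibp))
			continue;
		fill += ibp->blk_fill;
	}
	return (fill);
}
#endif
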
/* ARGSUSED */
static void
dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
{
	dmu_buf_impl_t *db = vdb;
	blkptr_t *bp = zio->io_bp;
	blkptr_t *bp_orig = &zio->io_bp_orig;
	uint64_t txg = zio->io_txg;
	dbuf_dirty_record_t **drp, *dr;

	ASSERT3U(zio->io_error, ==, 0);
	ASSERT(db->db_blkptr == bp);

	if (zio->io_flags & ZIO_FLAG_IO_REWRITE) {
		ASSERT(BP_EQUAL(bp, bp_orig));
	} else {
		objset_t *os;
		dsl_dataset_t *ds;
		dmu_tx_t *tx;

		DB_GET_OBJSET(&os, db);
		ds = os->os_dsl_dataset;
		tx = os->os_synctx;

		(void) dsl_dataset_block_kill(ds, bp_orig, tx, B_TRUE);
		dsl_dataset_block_born(ds, bp, tx);
	}

	mutex_enter(&db->db_mtx);

	DBUF_VERIFY(db);

	drp = &db->db_last_dirty;
	while ((dr = *drp) != db->db_data_pending)
		drp = &dr->dr_next;
	ASSERT(!list_link_active(&dr->dr_dirty_node));
	ASSERT(dr->dr_txg == txg);
	ASSERT(dr->dr_dbuf == db);
	ASSERT(dr->dr_next == NULL);
	*drp = dr->dr_next;

#ifdef ZFS_DEBUG
	if (db->db_blkid == DMU_SPILL_BLKID) {
		dnode_t *dn;

		DB_DNODE_ENTER(db);
		dn = DB_DNODE(db);
		ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR);
		ASSERT(!(BP_IS_HOLE(db->db_blkptr)) &&
		    db->db_blkptr == &dn->dn_phys->dn_spill);
		DB_DNODE_EXIT(db);
	}
#endif

	if (db->db_level == 0) {
		ASSERT(db->db_blkid != DMU_BONUS_BLKID);
		ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN);
		if (db->db_state != DB_NOFILL) {
			if (dr->dt.dl.dr_data != db->db_buf)
				VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data,
				    db) == 1);
			else if (!arc_released(db->db_buf))
				arc_set_callback(db->db_buf, dbuf_do_evict, db);
		}
	} else {
		dnode_t *dn;

		DB_DNODE_ENTER(db);
		dn = DB_DNODE(db);
		ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
		ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
		if (!BP_IS_HOLE(db->db_blkptr)) {
			int epbs =
			    dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
			ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==,
			    db->db.db_size);
			ASSERT3U(dn->dn_phys->dn_maxblkid
			    >> (db->db_level * epbs), >=, db->db_blkid);
			arc_set_callback(db->db_buf, dbuf_do_evict, db);
		}
		DB_DNODE_EXIT(db);
		mutex_destroy(&dr->dt.di.dr_mtx);
		list_destroy(&dr->dt.di.dr_children);
	}
	kmem_free(dr, sizeof (dbuf_dirty_record_t));

	cv_broadcast(&db->db_changed);
	ASSERT(db->db_dirtycnt > 0);
	db->db_dirtycnt -= 1;
	db->db_data_pending = NULL;
	dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg);
}

static void
dbuf_write_nofill_ready(zio_t *zio)
{
	dbuf_write_ready(zio, NULL, zio->io_private);
}

static void
dbuf_write_nofill_done(zio_t *zio)
{
	dbuf_write_done(zio, NULL, zio->io_private);
}

static void
dbuf_write_override_ready(zio_t *zio)
{
	dbuf_dirty_record_t *dr = zio->io_private;
	dmu_buf_impl_t *db = dr->dr_dbuf;

	dbuf_write_ready(zio, NULL, db);
}

static void
dbuf_write_override_done(zio_t *zio)
{
	dbuf_dirty_record_t *dr = zio->io_private;
	dmu_buf_impl_t *db = dr->dr_dbuf;
	blkptr_t *obp = &dr->dt.dl.dr_overridden_by;

	mutex_enter(&db->db_mtx);
	if (!BP_EQUAL(zio->io_bp, obp)) {
		if (!BP_IS_HOLE(obp))
			dsl_free(spa_get_dsl(zio->io_spa), zio->io_txg, obp);
		arc_release(dr->dt.dl.dr_data, db);
	}
	mutex_exit(&db->db_mtx);

	dbuf_write_done(zio, NULL, db);
}

static void
dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
{
	dmu_buf_impl_t *db = dr->dr_dbuf;
	dnode_t *dn;
	objset_t *os;
	dmu_buf_impl_t *parent = db->db_parent;
	uint64_t txg = tx->tx_txg;
	zbookmark_t zb;
	zio_prop_t zp;
	zio_t *zio;
	int wp_flag = 0;

	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);
	os = dn->dn_objset;

	if (db->db_state != DB_NOFILL) {
		if (db->db_level > 0 || dn->dn_type == DMU_OT_DNODE) {
			/*
			 * Private object buffers are released here rather
			 * than in dbuf_dirty() since they are only modified
			 * in the syncing context and we don't want the
			 * overhead of making multiple copies of the data.
			 */
			if (BP_IS_HOLE(db->db_blkptr)) {
				arc_buf_thaw(data);
			} else {
				dbuf_release_bp(db);
			}
		}
	}

	if (parent != dn->dn_dbuf) {
		ASSERT(parent && parent->db_data_pending);
		ASSERT(db->db_level == parent->db_level-1);
		ASSERT(arc_released(parent->db_buf));
		zio = parent->db_data_pending->dr_zio;
	} else {
		ASSERT((db->db_level == dn->dn_phys->dn_nlevels-1 &&
		    db->db_blkid != DMU_SPILL_BLKID) ||
		    (db->db_blkid == DMU_SPILL_BLKID && db->db_level == 0));
		if (db->db_blkid != DMU_SPILL_BLKID)
			ASSERT3P(db->db_blkptr, ==,
			    &dn->dn_phys->dn_blkptr[db->db_blkid]);
		zio = dn->dn_zio;
	}

	ASSERT(db->db_level == 0 || data == db->db_buf);
	ASSERT3U(db->db_blkptr->blk_birth, <=, txg);
	ASSERT(zio);

	SET_BOOKMARK(&zb, os->os_dsl_dataset ?
	    os->os_dsl_dataset->ds_object : DMU_META_OBJSET,
	    db->db.db_object, db->db_level, db->db_blkid);

	if (db->db_blkid == DMU_SPILL_BLKID)
		wp_flag = WP_SPILL;
	wp_flag |= (db->db_state == DB_NOFILL) ? WP_NOFILL : 0;

	dmu_write_policy(os, dn, db->db_level, wp_flag, &zp);
	DB_DNODE_EXIT(db);

	if (db->db_level == 0 && dr->dt.dl.dr_override_state == DR_OVERRIDDEN) {
		ASSERT(db->db_state != DB_NOFILL);
		dr->dr_zio = zio_write(zio, os->os_spa, txg,
		    db->db_blkptr, data->b_data, arc_buf_size(data), &zp,
		    dbuf_write_override_ready, dbuf_write_override_done, dr,
		    ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
		mutex_enter(&db->db_mtx);
		dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
		zio_write_override(dr->dr_zio, &dr->dt.dl.dr_overridden_by,
		    dr->dt.dl.dr_copies);
		mutex_exit(&db->db_mtx);
	} else if (db->db_state == DB_NOFILL) {
		ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF);
		dr->dr_zio = zio_write(zio, os->os_spa, txg,
		    db->db_blkptr, NULL, db->db.db_size, &zp,
		    dbuf_write_nofill_ready, dbuf_write_nofill_done, db,
		    ZIO_PRIORITY_ASYNC_WRITE,
		    ZIO_FLAG_MUSTSUCCEED | ZIO_FLAG_NODATA, &zb);
	} else {
		ASSERT(arc_released(data));
		dr->dr_zio = arc_write(zio, os->os_spa, txg,
		    db->db_blkptr, data, DBUF_IS_L2CACHEABLE(db), &zp,
		    dbuf_write_ready, dbuf_write_done, db,
		    ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
	}
}