/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
 * Copyright (c) 2012 by Delphix. All rights reserved.
 */

#include <sys/zfs_context.h>
#include <sys/dmu.h>
#include <sys/dmu_impl.h>
#include <sys/dbuf.h>
#include <sys/dmu_objset.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_dir.h>
#include <sys/dmu_tx.h>
#include <sys/spa.h>
#include <sys/zio.h>
#include <sys/dmu_zfetch.h>
#include <sys/sa.h>
#include <sys/sa_impl.h>

static void dbuf_destroy(dmu_buf_impl_t *db);
static int dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx);

/*
 * Global data structures and functions for the dbuf cache.
 */
static kmem_cache_t *dbuf_cache;

/* ARGSUSED */
static int
dbuf_cons(void *vdb, void *unused, int kmflag)
{
	dmu_buf_impl_t *db = vdb;
	bzero(db, sizeof (dmu_buf_impl_t));

	mutex_init(&db->db_mtx, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&db->db_changed, NULL, CV_DEFAULT, NULL);
	refcount_create(&db->db_holds);
	return (0);
}

/* ARGSUSED */
static void
dbuf_dest(void *vdb, void *unused)
{
	dmu_buf_impl_t *db = vdb;
	mutex_destroy(&db->db_mtx);
	cv_destroy(&db->db_changed);
	refcount_destroy(&db->db_holds);
}

/*
 * dbuf hash table routines
 */
static dbuf_hash_table_t dbuf_hash_table;

static uint64_t dbuf_hash_count;

static uint64_t
dbuf_hash(void *os, uint64_t obj, uint8_t lvl, uint64_t blkid)
{
	uintptr_t osv = (uintptr_t)os;
	uint64_t crc = -1ULL;

	ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (lvl)) & 0xFF];
	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (osv >> 6)) & 0xFF];
	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 0)) & 0xFF];
	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 8)) & 0xFF];
	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 0)) & 0xFF];
	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 8)) & 0xFF];

	crc ^= (osv >> 14) ^ (obj >> 16) ^ (blkid >> 16);

	return (crc);
}

#define	DBUF_HASH(os, obj, level, blkid) dbuf_hash(os, obj, level, blkid)

#define	DBUF_EQUAL(dbuf, os, obj, level, blkid)		\
	((dbuf)->db.db_object == (obj) &&		\
	(dbuf)->db_objset == (os) &&			\
	(dbuf)->db_level == (level) &&			\
	(dbuf)->db_blkid == (blkid))
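
/*
 * A dbuf is identified by the tuple (objset, object, level, blkid).
 * Lookups hash that tuple and mask the result into the table; a sketch
 * of the bucket computation used by dbuf_find() and dbuf_hash_insert()
 * below:
 *
 *	uint64_t hv = DBUF_HASH(os, obj, level, blkid);
 *	uint64_t idx = hv & dbuf_hash_table.hash_table_mask;
 *	walk dbuf_hash_table.hash_table[idx] via db_hash_next,
 *	comparing candidates with DBUF_EQUAL().
 *
 * Each bucket is protected by one of the DBUF_MUTEXES hash mutexes
 * (DBUF_HASH_MUTEX(h, idx)).
 */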

dmu_buf_impl_t *
dbuf_find(dnode_t *dn, uint8_t level, uint64_t blkid)
{
	dbuf_hash_table_t *h = &dbuf_hash_table;
	objset_t *os = dn->dn_objset;
	uint64_t obj = dn->dn_object;
	uint64_t hv = DBUF_HASH(os, obj, level, blkid);
	uint64_t idx = hv & h->hash_table_mask;
	dmu_buf_impl_t *db;

	mutex_enter(DBUF_HASH_MUTEX(h, idx));
	for (db = h->hash_table[idx]; db != NULL; db = db->db_hash_next) {
		if (DBUF_EQUAL(db, os, obj, level, blkid)) {
			mutex_enter(&db->db_mtx);
			if (db->db_state != DB_EVICTING) {
				mutex_exit(DBUF_HASH_MUTEX(h, idx));
				return (db);
			}
			mutex_exit(&db->db_mtx);
		}
	}
	mutex_exit(DBUF_HASH_MUTEX(h, idx));
	return (NULL);
}

/*
 * Insert an entry into the hash table.  If there is already an element
 * equal to elem in the hash table, then the already existing element
 * will be returned and the new element will not be inserted.
 * Otherwise returns NULL.
 */
static dmu_buf_impl_t *
dbuf_hash_insert(dmu_buf_impl_t *db)
{
	dbuf_hash_table_t *h = &dbuf_hash_table;
	objset_t *os = db->db_objset;
	uint64_t obj = db->db.db_object;
	int level = db->db_level;
	uint64_t blkid = db->db_blkid;
	uint64_t hv = DBUF_HASH(os, obj, level, blkid);
	uint64_t idx = hv & h->hash_table_mask;
	dmu_buf_impl_t *dbf;

	mutex_enter(DBUF_HASH_MUTEX(h, idx));
	for (dbf = h->hash_table[idx]; dbf != NULL; dbf = dbf->db_hash_next) {
		if (DBUF_EQUAL(dbf, os, obj, level, blkid)) {
			mutex_enter(&dbf->db_mtx);
			if (dbf->db_state != DB_EVICTING) {
				mutex_exit(DBUF_HASH_MUTEX(h, idx));
				return (dbf);
			}
			mutex_exit(&dbf->db_mtx);
		}
	}

	mutex_enter(&db->db_mtx);
	db->db_hash_next = h->hash_table[idx];
	h->hash_table[idx] = db;
	mutex_exit(DBUF_HASH_MUTEX(h, idx));
	atomic_add_64(&dbuf_hash_count, 1);

	return (NULL);
}
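
/*
 * dbuf_create() relies on the insert-or-return-existing behavior above:
 * if dbuf_hash_insert() returns a previously inserted dbuf, the caller
 * frees its newly constructed dbuf and uses the existing one instead
 * (see the "someone else inserted it first" case in dbuf_create()).
 */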

/*
 * Remove an entry from the hash table.  This operation will
 * fail if there are any existing holds on the db.
 */
static void
dbuf_hash_remove(dmu_buf_impl_t *db)
{
	dbuf_hash_table_t *h = &dbuf_hash_table;
	uint64_t hv = DBUF_HASH(db->db_objset, db->db.db_object,
	    db->db_level, db->db_blkid);
	uint64_t idx = hv & h->hash_table_mask;
	dmu_buf_impl_t *dbf, **dbp;

	/*
	 * We mustn't hold db_mtx, to maintain lock ordering:
	 * DBUF_HASH_MUTEX > db_mtx.
	 */
	ASSERT(refcount_is_zero(&db->db_holds));
	ASSERT(db->db_state == DB_EVICTING);
	ASSERT(!MUTEX_HELD(&db->db_mtx));

	mutex_enter(DBUF_HASH_MUTEX(h, idx));
	dbp = &h->hash_table[idx];
	while ((dbf = *dbp) != db) {
		dbp = &dbf->db_hash_next;
		ASSERT(dbf != NULL);
	}
	*dbp = db->db_hash_next;
	db->db_hash_next = NULL;
	mutex_exit(DBUF_HASH_MUTEX(h, idx));
	atomic_add_64(&dbuf_hash_count, -1);
}

static arc_evict_func_t dbuf_do_evict;

static void
dbuf_evict_user(dmu_buf_impl_t *db)
{
	ASSERT(MUTEX_HELD(&db->db_mtx));

	if (db->db_level != 0 || db->db_evict_func == NULL)
		return;

	if (db->db_user_data_ptr_ptr)
		*db->db_user_data_ptr_ptr = db->db.db_data;
	db->db_evict_func(&db->db, db->db_user_ptr);
	db->db_user_ptr = NULL;
	db->db_user_data_ptr_ptr = NULL;
	db->db_evict_func = NULL;
}

boolean_t
dbuf_is_metadata(dmu_buf_impl_t *db)
{
	if (db->db_level > 0) {
		return (B_TRUE);
	} else {
		boolean_t is_metadata;

		DB_DNODE_ENTER(db);
		is_metadata = DMU_OT_IS_METADATA(DB_DNODE(db)->dn_type);
		DB_DNODE_EXIT(db);

		return (is_metadata);
	}
}

void
dbuf_evict(dmu_buf_impl_t *db)
{
	ASSERT(MUTEX_HELD(&db->db_mtx));
	ASSERT(db->db_buf == NULL);
	ASSERT(db->db_data_pending == NULL);

	dbuf_clear(db);
	dbuf_destroy(db);
}

void
dbuf_init(void)
{
	uint64_t hsize = 1ULL << 16;
	dbuf_hash_table_t *h = &dbuf_hash_table;
	int i;

	/*
	 * The hash table is big enough to fill all of physical memory
	 * with an average 4K block size.  The table will take up
	 * totalmem * sizeof (void *) / 4K (i.e. 2MB/GB with 8-byte pointers).
	 */
	while (hsize * 4096 < physmem * PAGESIZE)
		hsize <<= 1;

retry:
	h->hash_table_mask = hsize - 1;
	h->hash_table = kmem_zalloc(hsize * sizeof (void *), KM_NOSLEEP);
	if (h->hash_table == NULL) {
		/* XXX - we should really return an error instead of assert */
		ASSERT(hsize > (1ULL << 10));
		hsize >>= 1;
		goto retry;
	}

	dbuf_cache = kmem_cache_create("dmu_buf_impl_t",
	    sizeof (dmu_buf_impl_t), 0, dbuf_cons, dbuf_dest,
	    NULL, NULL, NULL, 0);

	for (i = 0; i < DBUF_MUTEXES; i++)
		mutex_init(&h->hash_mutexes[i], NULL, MUTEX_DEFAULT, NULL);
}
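
/*
 * A worked example of the sizing above (a sketch, assuming 8-byte
 * pointers and 4K pages): with 16 GB of physical memory the loop stops
 * at hsize = 4M buckets (4M * 4096 = 16 GB), so the bucket array takes
 * 4M * 8 bytes = 32 MB, matching the 2MB/GB estimate in the comment.
 */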

void
dbuf_fini(void)
{
	dbuf_hash_table_t *h = &dbuf_hash_table;
	int i;

	for (i = 0; i < DBUF_MUTEXES; i++)
		mutex_destroy(&h->hash_mutexes[i]);
	kmem_free(h->hash_table, (h->hash_table_mask + 1) * sizeof (void *));
	kmem_cache_destroy(dbuf_cache);
}

/*
 * Other stuff.
 */

#ifdef ZFS_DEBUG
static void
dbuf_verify(dmu_buf_impl_t *db)
{
	dnode_t *dn;
	dbuf_dirty_record_t *dr;

	ASSERT(MUTEX_HELD(&db->db_mtx));

	if (!(zfs_flags & ZFS_DEBUG_DBUF_VERIFY))
		return;

	ASSERT(db->db_objset != NULL);
	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);
	if (dn == NULL) {
		ASSERT(db->db_parent == NULL);
		ASSERT(db->db_blkptr == NULL);
	} else {
		ASSERT3U(db->db.db_object, ==, dn->dn_object);
		ASSERT3P(db->db_objset, ==, dn->dn_objset);
		ASSERT3U(db->db_level, <, dn->dn_nlevels);
		ASSERT(db->db_blkid == DMU_BONUS_BLKID ||
		    db->db_blkid == DMU_SPILL_BLKID ||
		    !list_is_empty(&dn->dn_dbufs));
	}
	if (db->db_blkid == DMU_BONUS_BLKID) {
		ASSERT(dn != NULL);
		ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
		ASSERT3U(db->db.db_offset, ==, DMU_BONUS_BLKID);
	} else if (db->db_blkid == DMU_SPILL_BLKID) {
		ASSERT(dn != NULL);
		ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
		ASSERT0(db->db.db_offset);
	} else {
		ASSERT3U(db->db.db_offset, ==, db->db_blkid * db->db.db_size);
	}

	for (dr = db->db_data_pending; dr != NULL; dr = dr->dr_next)
		ASSERT(dr->dr_dbuf == db);

	for (dr = db->db_last_dirty; dr != NULL; dr = dr->dr_next)
		ASSERT(dr->dr_dbuf == db);

	/*
	 * We can't assert that db_size matches dn_datablksz because it
	 * can be momentarily different when another thread is doing
	 * dnode_set_blksz().
	 */
	if (db->db_level == 0 && db->db.db_object == DMU_META_DNODE_OBJECT) {
		dr = db->db_data_pending;
		/*
		 * It should only be modified in syncing context, so
		 * make sure we only have one copy of the data.
		 */
		ASSERT(dr == NULL || dr->dt.dl.dr_data == db->db_buf);
	}

	/* verify db->db_blkptr */
	if (db->db_blkptr) {
		if (db->db_parent == dn->dn_dbuf) {
			/* db is pointed to by the dnode */
			/* ASSERT3U(db->db_blkid, <, dn->dn_nblkptr); */
			if (DMU_OBJECT_IS_SPECIAL(db->db.db_object))
				ASSERT(db->db_parent == NULL);
			else
				ASSERT(db->db_parent != NULL);
			if (db->db_blkid != DMU_SPILL_BLKID)
				ASSERT3P(db->db_blkptr, ==,
				    &dn->dn_phys->dn_blkptr[db->db_blkid]);
		} else {
			/* db is pointed to by an indirect block */
			int epb = db->db_parent->db.db_size >> SPA_BLKPTRSHIFT;
			ASSERT3U(db->db_parent->db_level, ==, db->db_level+1);
			ASSERT3U(db->db_parent->db.db_object, ==,
			    db->db.db_object);
			/*
			 * dnode_grow_indblksz() can make this fail if we don't
			 * have the struct_rwlock.  XXX indblksz no longer
			 * grows.  safe to do this now?
			 */
			if (RW_WRITE_HELD(&dn->dn_struct_rwlock)) {
				ASSERT3P(db->db_blkptr, ==,
				    ((blkptr_t *)db->db_parent->db.db_data +
				    db->db_blkid % epb));
			}
		}
	}
	if ((db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr)) &&
	    (db->db_buf == NULL || db->db_buf->b_data) &&
	    db->db.db_data && db->db_blkid != DMU_BONUS_BLKID &&
	    db->db_state != DB_FILL && !dn->dn_free_txg) {
		/*
		 * If the blkptr isn't set but the dbuf has nonzero data,
		 * it had better be dirty, otherwise we'll lose that
		 * data when we evict this buffer.
		 */
		if (db->db_dirtycnt == 0) {
			uint64_t *buf = db->db.db_data;
			int i;

			for (i = 0; i < db->db.db_size >> 3; i++) {
				ASSERT(buf[i] == 0);
			}
		}
	}
	DB_DNODE_EXIT(db);
}
#endif

static void
dbuf_update_data(dmu_buf_impl_t *db)
{
	ASSERT(MUTEX_HELD(&db->db_mtx));
	if (db->db_level == 0 && db->db_user_data_ptr_ptr) {
		ASSERT(!refcount_is_zero(&db->db_holds));
		*db->db_user_data_ptr_ptr = db->db.db_data;
	}
}

static void
dbuf_set_data(dmu_buf_impl_t *db, arc_buf_t *buf)
{
	ASSERT(MUTEX_HELD(&db->db_mtx));
	ASSERT(db->db_buf == NULL || !arc_has_callback(db->db_buf));
	db->db_buf = buf;
	if (buf != NULL) {
		ASSERT(buf->b_data != NULL);
		db->db.db_data = buf->b_data;
		if (!arc_released(buf))
			arc_set_callback(buf, dbuf_do_evict, db);
		dbuf_update_data(db);
	} else {
		dbuf_evict_user(db);
		db->db.db_data = NULL;
		if (db->db_state != DB_NOFILL)
			db->db_state = DB_UNCACHED;
	}
}

/*
 * Loan out an arc_buf for read.  Return the loaned arc_buf.
 */
arc_buf_t *
dbuf_loan_arcbuf(dmu_buf_impl_t *db)
{
	arc_buf_t *abuf;

	mutex_enter(&db->db_mtx);
	if (arc_released(db->db_buf) || refcount_count(&db->db_holds) > 1) {
		int blksz = db->db.db_size;
		spa_t *spa;

		mutex_exit(&db->db_mtx);
		DB_GET_SPA(&spa, db);
		abuf = arc_loan_buf(spa, blksz);
		bcopy(db->db.db_data, abuf->b_data, blksz);
	} else {
		abuf = db->db_buf;
		arc_loan_inuse_buf(abuf, db);
		dbuf_set_data(db, NULL);
		mutex_exit(&db->db_mtx);
	}
	return (abuf);
}

uint64_t
dbuf_whichblock(dnode_t *dn, uint64_t offset)
{
	if (dn->dn_datablkshift) {
		return (offset >> dn->dn_datablkshift);
	} else {
		ASSERT3U(offset, <, dn->dn_datablksz);
		return (0);
	}
}

static void
dbuf_read_done(zio_t *zio, arc_buf_t *buf, void *vdb)
{
	dmu_buf_impl_t *db = vdb;

	mutex_enter(&db->db_mtx);
	ASSERT3U(db->db_state, ==, DB_READ);
	/*
	 * All reads are synchronous, so we must have a hold on the dbuf
	 */
	ASSERT(refcount_count(&db->db_holds) > 0);
	ASSERT(db->db_buf == NULL);
	ASSERT(db->db.db_data == NULL);
	if (db->db_level == 0 && db->db_freed_in_flight) {
		/* we were freed in flight; disregard any error */
		arc_release(buf, db);
		bzero(buf->b_data, db->db.db_size);
		arc_buf_freeze(buf);
		db->db_freed_in_flight = FALSE;
		dbuf_set_data(db, buf);
		db->db_state = DB_CACHED;
	} else if (zio == NULL || zio->io_error == 0) {
		dbuf_set_data(db, buf);
		db->db_state = DB_CACHED;
	} else {
		ASSERT(db->db_blkid != DMU_BONUS_BLKID);
		ASSERT3P(db->db_buf, ==, NULL);
		VERIFY(arc_buf_remove_ref(buf, db) == 1);
		db->db_state = DB_UNCACHED;
	}
	cv_broadcast(&db->db_changed);
	dbuf_rele_and_unlock(db, NULL);
}
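
/*
 * Note on the read path: dbuf_read_impl() (below) transitions a dbuf to
 * DB_READ and issues an arc_read() with dbuf_read_done() as the callback;
 * dbuf_read_done() then moves the dbuf to DB_CACHED on success (or when
 * the block was freed in flight and simply zeroed) or back to DB_UNCACHED
 * on error, and wakes any waiters sleeping on db_changed.
 */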

static void
dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags)
{
	dnode_t *dn;
	spa_t *spa;
	zbookmark_t zb;
	uint32_t aflags = ARC_NOWAIT;

	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);
	ASSERT(!refcount_is_zero(&db->db_holds));
	/* We need the struct_rwlock to prevent db_blkptr from changing. */
	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
	ASSERT(MUTEX_HELD(&db->db_mtx));
	ASSERT(db->db_state == DB_UNCACHED);
	ASSERT(db->db_buf == NULL);

	if (db->db_blkid == DMU_BONUS_BLKID) {
		int bonuslen = MIN(dn->dn_bonuslen, dn->dn_phys->dn_bonuslen);

		ASSERT3U(bonuslen, <=, db->db.db_size);
		db->db.db_data = zio_buf_alloc(DN_MAX_BONUSLEN);
		arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
		if (bonuslen < DN_MAX_BONUSLEN)
			bzero(db->db.db_data, DN_MAX_BONUSLEN);
		if (bonuslen)
			bcopy(DN_BONUS(dn->dn_phys), db->db.db_data, bonuslen);
		DB_DNODE_EXIT(db);
		dbuf_update_data(db);
		db->db_state = DB_CACHED;
		mutex_exit(&db->db_mtx);
		return;
	}

	/*
	 * Recheck BP_IS_HOLE() after dnode_block_freed() in case dnode_sync()
	 * processes the delete record and clears the bp while we are waiting
	 * for the dn_mtx (resulting in a "no" from block_freed).
	 */
	if (db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr) ||
	    (db->db_level == 0 && (dnode_block_freed(dn, db->db_blkid) ||
	    BP_IS_HOLE(db->db_blkptr)))) {
		arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);

		dbuf_set_data(db, arc_buf_alloc(dn->dn_objset->os_spa,
		    db->db.db_size, db, type));
		DB_DNODE_EXIT(db);
		bzero(db->db.db_data, db->db.db_size);
		db->db_state = DB_CACHED;
		*flags |= DB_RF_CACHED;
		mutex_exit(&db->db_mtx);
		return;
	}

	spa = dn->dn_objset->os_spa;
	DB_DNODE_EXIT(db);

	db->db_state = DB_READ;
	mutex_exit(&db->db_mtx);

	if (DBUF_IS_L2CACHEABLE(db))
		aflags |= ARC_L2CACHE;

	SET_BOOKMARK(&zb, db->db_objset->os_dsl_dataset ?
	    db->db_objset->os_dsl_dataset->ds_object : DMU_META_OBJSET,
	    db->db.db_object, db->db_level, db->db_blkid);

	dbuf_add_ref(db, NULL);

	(void) arc_read(zio, spa, db->db_blkptr,
	    dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ,
	    (*flags & DB_RF_CANFAIL) ? ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED,
	    &aflags, &zb);
	if (aflags & ARC_CACHED)
		*flags |= DB_RF_CACHED;
}
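
/*
 * Typical callers of dbuf_read() pass a flag mask describing what they
 * already hold and how failures should be handled.  For example, a
 * condensed sketch of what dbuf_will_dirty() (below) does, where dn is
 * the dbuf's dnode obtained via DB_DNODE_ENTER()/DB_DNODE():
 *
 *	int rf = DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH;
 *	if (RW_WRITE_HELD(&dn->dn_struct_rwlock))
 *		rf |= DB_RF_HAVESTRUCT;
 *	(void) dbuf_read(db, NULL, rf);
 */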

int
dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
{
	int err = 0;
	int havepzio = (zio != NULL);
	int prefetch;
	dnode_t *dn;

	/*
	 * We don't have to hold the mutex to check db_state because it
	 * can't be freed while we have a hold on the buffer.
	 */
	ASSERT(!refcount_is_zero(&db->db_holds));

	if (db->db_state == DB_NOFILL)
		return (EIO);

	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);
	if ((flags & DB_RF_HAVESTRUCT) == 0)
		rw_enter(&dn->dn_struct_rwlock, RW_READER);

	prefetch = db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
	    (flags & DB_RF_NOPREFETCH) == 0 && dn != NULL &&
	    DBUF_IS_CACHEABLE(db);

	mutex_enter(&db->db_mtx);
	if (db->db_state == DB_CACHED) {
		mutex_exit(&db->db_mtx);
		if (prefetch)
			dmu_zfetch(&dn->dn_zfetch, db->db.db_offset,
			    db->db.db_size, TRUE);
		if ((flags & DB_RF_HAVESTRUCT) == 0)
			rw_exit(&dn->dn_struct_rwlock);
		DB_DNODE_EXIT(db);
	} else if (db->db_state == DB_UNCACHED) {
		spa_t *spa = dn->dn_objset->os_spa;

		if (zio == NULL)
			zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
		dbuf_read_impl(db, zio, &flags);

		/* dbuf_read_impl has dropped db_mtx for us */

		if (prefetch)
			dmu_zfetch(&dn->dn_zfetch, db->db.db_offset,
			    db->db.db_size, flags & DB_RF_CACHED);

		if ((flags & DB_RF_HAVESTRUCT) == 0)
			rw_exit(&dn->dn_struct_rwlock);
		DB_DNODE_EXIT(db);

		if (!havepzio)
			err = zio_wait(zio);
	} else {
		mutex_exit(&db->db_mtx);
		if (prefetch)
			dmu_zfetch(&dn->dn_zfetch, db->db.db_offset,
			    db->db.db_size, TRUE);
		if ((flags & DB_RF_HAVESTRUCT) == 0)
			rw_exit(&dn->dn_struct_rwlock);
		DB_DNODE_EXIT(db);

		mutex_enter(&db->db_mtx);
		if ((flags & DB_RF_NEVERWAIT) == 0) {
			while (db->db_state == DB_READ ||
			    db->db_state == DB_FILL) {
				ASSERT(db->db_state == DB_READ ||
				    (flags & DB_RF_HAVESTRUCT) == 0);
				cv_wait(&db->db_changed, &db->db_mtx);
			}
			if (db->db_state == DB_UNCACHED)
				err = EIO;
		}
		mutex_exit(&db->db_mtx);
	}

	ASSERT(err || havepzio || db->db_state == DB_CACHED);
	return (err);
}

static void
dbuf_noread(dmu_buf_impl_t *db)
{
	ASSERT(!refcount_is_zero(&db->db_holds));
	ASSERT(db->db_blkid != DMU_BONUS_BLKID);
	mutex_enter(&db->db_mtx);
	while (db->db_state == DB_READ || db->db_state == DB_FILL)
		cv_wait(&db->db_changed, &db->db_mtx);
	if (db->db_state == DB_UNCACHED) {
		arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
		spa_t *spa;

		ASSERT(db->db_buf == NULL);
		ASSERT(db->db.db_data == NULL);
		DB_GET_SPA(&spa, db);
		dbuf_set_data(db, arc_buf_alloc(spa, db->db.db_size, db, type));
		db->db_state = DB_FILL;
	} else if (db->db_state == DB_NOFILL) {
		dbuf_set_data(db, NULL);
	} else {
		ASSERT3U(db->db_state, ==, DB_CACHED);
	}
	mutex_exit(&db->db_mtx);
}

/*
 * This is our just-in-time copy function.  It makes a copy of buffers
 * that have been modified in a previous transaction group before we
 * modify them in the current active group.
 *
 * This function is used in two places: when we are dirtying a buffer
 * for the first time in a txg, and when we are freeing a range in a
 * dnode that includes this buffer.
 *
 * Note that when we are called from dbuf_free_range() we do not put a
 * hold on the buffer; we just traverse the active dbuf list for the
 * dnode.
 */
static void
dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg)
{
	dbuf_dirty_record_t *dr = db->db_last_dirty;

	ASSERT(MUTEX_HELD(&db->db_mtx));
	ASSERT(db->db.db_data != NULL);
	ASSERT(db->db_level == 0);
	ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT);

	if (dr == NULL ||
	    (dr->dt.dl.dr_data !=
	    ((db->db_blkid == DMU_BONUS_BLKID) ? db->db.db_data : db->db_buf)))
		return;

	/*
	 * If the last dirty record for this dbuf has not yet synced
	 * and it's referencing the dbuf data, either:
	 *	reset the reference to point to a new copy,
	 * or (if there are no active holders)
	 *	just null out the current db_data pointer.
	 */
	ASSERT(dr->dr_txg >= txg - 2);
	if (db->db_blkid == DMU_BONUS_BLKID) {
		/* Note that the data bufs here are zio_bufs */
		dr->dt.dl.dr_data = zio_buf_alloc(DN_MAX_BONUSLEN);
		arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
		bcopy(db->db.db_data, dr->dt.dl.dr_data, DN_MAX_BONUSLEN);
	} else if (refcount_count(&db->db_holds) > db->db_dirtycnt) {
		int size = db->db.db_size;
		arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
		spa_t *spa;

		DB_GET_SPA(&spa, db);
		dr->dt.dl.dr_data = arc_buf_alloc(spa, size, db, type);
		bcopy(db->db.db_data, dr->dt.dl.dr_data->b_data, size);
	} else {
		dbuf_set_data(db, NULL);
	}
}

void
dbuf_unoverride(dbuf_dirty_record_t *dr)
{
	dmu_buf_impl_t *db = dr->dr_dbuf;
	blkptr_t *bp = &dr->dt.dl.dr_overridden_by;
	uint64_t txg = dr->dr_txg;

	ASSERT(MUTEX_HELD(&db->db_mtx));
	ASSERT(dr->dt.dl.dr_override_state != DR_IN_DMU_SYNC);
	ASSERT(db->db_level == 0);

	if (db->db_blkid == DMU_BONUS_BLKID ||
	    dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN)
		return;

	ASSERT(db->db_data_pending != dr);

	/* free this block */
	if (!BP_IS_HOLE(bp) && !dr->dt.dl.dr_nopwrite) {
		spa_t *spa;

		DB_GET_SPA(&spa, db);
		zio_free(spa, txg, bp);
	}
	dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
	dr->dt.dl.dr_nopwrite = B_FALSE;

	/*
	 * Release the already-written buffer, so we leave it in
	 * a consistent dirty state.  Note that all callers are
	 * modifying the buffer, so they will immediately do
	 * another (redundant) arc_release().  Therefore, leave
	 * the buf thawed to save the effort of freezing &
	 * immediately re-thawing it.
	 */
	arc_release(dr->dt.dl.dr_data, db);
}
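
/*
 * In other words, dbuf_unoverride() undoes the effect of an override
 * (e.g. from dmu_sync()): it frees the block that was written early
 * (unless it was a hole or a nopwrite), returns the dirty record to
 * DR_NOT_OVERRIDDEN, and leaves the data buffer in a normal dirty state
 * so it will be written again by the usual syncing path.
 */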

/*
 * Evict (if it's unreferenced) or clear (if it's referenced) any level-0
 * data blocks in the free range, so that any future readers will find
 * empty blocks.  Also, if we happen across any level-1 dbufs in the
 * range that have not already been marked dirty, mark them dirty so
 * they stay in memory.
 */
void
dbuf_free_range(dnode_t *dn, uint64_t start, uint64_t end, dmu_tx_t *tx)
{
	dmu_buf_impl_t *db, *db_next;
	uint64_t txg = tx->tx_txg;
	int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
	uint64_t first_l1 = start >> epbs;
	uint64_t last_l1 = end >> epbs;

	if (end > dn->dn_maxblkid && (end != DMU_SPILL_BLKID)) {
		end = dn->dn_maxblkid;
		last_l1 = end >> epbs;
	}
	dprintf_dnode(dn, "start=%llu end=%llu\n", start, end);
	mutex_enter(&dn->dn_dbufs_mtx);
	for (db = list_head(&dn->dn_dbufs); db; db = db_next) {
		db_next = list_next(&dn->dn_dbufs, db);
		ASSERT(db->db_blkid != DMU_BONUS_BLKID);

		if (db->db_level == 1 &&
		    db->db_blkid >= first_l1 && db->db_blkid <= last_l1) {
			mutex_enter(&db->db_mtx);
			if (db->db_last_dirty &&
			    db->db_last_dirty->dr_txg < txg) {
				dbuf_add_ref(db, FTAG);
				mutex_exit(&db->db_mtx);
				dbuf_will_dirty(db, tx);
				dbuf_rele(db, FTAG);
			} else {
				mutex_exit(&db->db_mtx);
			}
		}

		if (db->db_level != 0)
			continue;
		dprintf_dbuf(db, "found buf %s\n", "");
		if (db->db_blkid < start || db->db_blkid > end)
			continue;

		/* found a level 0 buffer in the range */
		if (dbuf_undirty(db, tx))
			continue;

		mutex_enter(&db->db_mtx);
		if (db->db_state == DB_UNCACHED ||
		    db->db_state == DB_NOFILL ||
		    db->db_state == DB_EVICTING) {
			ASSERT(db->db.db_data == NULL);
			mutex_exit(&db->db_mtx);
			continue;
		}
		if (db->db_state == DB_READ || db->db_state == DB_FILL) {
			/* will be handled in dbuf_read_done or dbuf_rele */
			db->db_freed_in_flight = TRUE;
			mutex_exit(&db->db_mtx);
			continue;
		}
		if (refcount_count(&db->db_holds) == 0) {
			ASSERT(db->db_buf);
			dbuf_clear(db);
			continue;
		}
		/* The dbuf is referenced */

		if (db->db_last_dirty != NULL) {
			dbuf_dirty_record_t *dr = db->db_last_dirty;

			if (dr->dr_txg == txg) {
				/*
				 * This buffer is "in-use", re-adjust the file
				 * size to reflect that this buffer may
				 * contain new data when we sync.
				 */
				if (db->db_blkid != DMU_SPILL_BLKID &&
				    db->db_blkid > dn->dn_maxblkid)
					dn->dn_maxblkid = db->db_blkid;
				dbuf_unoverride(dr);
			} else {
				/*
				 * This dbuf is not dirty in the open context.
				 * Either uncache it (if it's not referenced in
				 * the open context) or reset its contents to
				 * empty.
				 */
				dbuf_fix_old_data(db, txg);
			}
		}
		/* clear the contents if it's cached */
		if (db->db_state == DB_CACHED) {
			ASSERT(db->db.db_data != NULL);
			arc_release(db->db_buf, db);
			bzero(db->db.db_data, db->db.db_size);
			arc_buf_freeze(db->db_buf);
		}

		mutex_exit(&db->db_mtx);
	}
	mutex_exit(&dn->dn_dbufs_mtx);
}

static int
dbuf_block_freeable(dmu_buf_impl_t *db)
{
	dsl_dataset_t *ds = db->db_objset->os_dsl_dataset;
	uint64_t birth_txg = 0;

	/*
	 * We don't need any locking to protect db_blkptr:
	 * If it's syncing, then db_last_dirty will be set
	 * so we'll ignore db_blkptr.
	 */
	ASSERT(MUTEX_HELD(&db->db_mtx));
	if (db->db_last_dirty)
		birth_txg = db->db_last_dirty->dr_txg;
	else if (db->db_blkptr)
		birth_txg = db->db_blkptr->blk_birth;

	/*
	 * If we don't exist or are in a snapshot, we can't be freed.
	 * Don't pass the bp to dsl_dataset_block_freeable() since we
	 * are holding the db_mtx lock and might deadlock if we are
	 * prefetching a dedup-ed block.
	 */
	if (birth_txg)
		return (ds == NULL ||
		    dsl_dataset_block_freeable(ds, NULL, birth_txg));
	else
		return (FALSE);
}

void
dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx)
{
	arc_buf_t *buf, *obuf;
	int osize = db->db.db_size;
	arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
	dnode_t *dn;

	ASSERT(db->db_blkid != DMU_BONUS_BLKID);

	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);

	/* XXX does *this* func really need the lock? */
	ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));

	/*
	 * This call to dbuf_will_dirty() with the dn_struct_rwlock held
	 * is OK, because there can be no other references to the db
	 * when we are changing its size, so no concurrent DB_FILL can
	 * be happening.
	 */
	/*
	 * XXX we should be doing a dbuf_read, checking the return
	 * value and returning that up to our callers
	 */
	dbuf_will_dirty(db, tx);

	/* create the data buffer for the new block */
	buf = arc_buf_alloc(dn->dn_objset->os_spa, size, db, type);

	/* copy old block data to the new block */
	obuf = db->db_buf;
	bcopy(obuf->b_data, buf->b_data, MIN(osize, size));
	/* zero the remainder */
	if (size > osize)
		bzero((uint8_t *)buf->b_data + osize, size - osize);

	mutex_enter(&db->db_mtx);
	dbuf_set_data(db, buf);
	VERIFY(arc_buf_remove_ref(obuf, db) == 1);
	db->db.db_size = size;

	if (db->db_level == 0) {
		ASSERT3U(db->db_last_dirty->dr_txg, ==, tx->tx_txg);
		db->db_last_dirty->dt.dl.dr_data = buf;
	}
	mutex_exit(&db->db_mtx);

	dnode_willuse_space(dn, size - osize, tx);
	DB_DNODE_EXIT(db);
}

void
dbuf_release_bp(dmu_buf_impl_t *db)
{
	objset_t *os;

	DB_GET_OBJSET(&os, db);
	ASSERT(dsl_pool_sync_context(dmu_objset_pool(os)));
	ASSERT(arc_released(os->os_phys_buf) ||
	    list_link_active(&os->os_dsl_dataset->ds_synced_link));
	ASSERT(db->db_parent == NULL || arc_released(db->db_parent->db_buf));

	(void) arc_release(db->db_buf, db);
}

dbuf_dirty_record_t *
dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
{
	dnode_t *dn;
	objset_t *os;
	dbuf_dirty_record_t **drp, *dr;
	int drop_struct_lock = FALSE;
	boolean_t do_free_accounting = B_FALSE;
	int txgoff = tx->tx_txg & TXG_MASK;

	ASSERT(tx->tx_txg != 0);
	ASSERT(!refcount_is_zero(&db->db_holds));
	DMU_TX_DIRTY_BUF(tx, db);

	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);
	/*
	 * Shouldn't dirty a regular buffer in syncing context.  Private
	 * objects may be dirtied in syncing context, but only if they
	 * were already pre-dirtied in open context.
	 */
	ASSERT(!dmu_tx_is_syncing(tx) ||
	    BP_IS_HOLE(dn->dn_objset->os_rootbp) ||
	    DMU_OBJECT_IS_SPECIAL(dn->dn_object) ||
	    dn->dn_objset->os_dsl_dataset == NULL);
	/*
	 * We make this assert for private objects as well, but after we
	 * check if we're already dirty.  They are allowed to re-dirty
	 * in syncing context.
	 */
	ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT ||
	    dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx ==
	    (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN));

	mutex_enter(&db->db_mtx);
	/*
	 * XXX make this true for indirects too?  The problem is that
	 * transactions created with dmu_tx_create_assigned() from
	 * syncing context don't bother holding ahead.
	 */
	ASSERT(db->db_level != 0 ||
	    db->db_state == DB_CACHED || db->db_state == DB_FILL ||
	    db->db_state == DB_NOFILL);

	mutex_enter(&dn->dn_mtx);
	/*
	 * Don't set dirtyctx to SYNC if we're just modifying this as we
	 * initialize the objset.
	 */
	if (dn->dn_dirtyctx == DN_UNDIRTIED &&
	    !BP_IS_HOLE(dn->dn_objset->os_rootbp)) {
		dn->dn_dirtyctx =
		    (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN);
		ASSERT(dn->dn_dirtyctx_firstset == NULL);
		dn->dn_dirtyctx_firstset = kmem_alloc(1, KM_SLEEP);
	}
	mutex_exit(&dn->dn_mtx);

	if (db->db_blkid == DMU_SPILL_BLKID)
		dn->dn_have_spill = B_TRUE;

	/*
	 * If this buffer is already dirty, we're done.
	 */
	drp = &db->db_last_dirty;
	ASSERT(*drp == NULL || (*drp)->dr_txg <= tx->tx_txg ||
	    db->db.db_object == DMU_META_DNODE_OBJECT);
	while ((dr = *drp) != NULL && dr->dr_txg > tx->tx_txg)
		drp = &dr->dr_next;
	if (dr && dr->dr_txg == tx->tx_txg) {
		DB_DNODE_EXIT(db);

		if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID) {
			/*
			 * If this buffer has already been written out,
			 * we now need to reset its state.
			 */
			dbuf_unoverride(dr);
			if (db->db.db_object != DMU_META_DNODE_OBJECT &&
			    db->db_state != DB_NOFILL)
				arc_buf_thaw(db->db_buf);
		}
		mutex_exit(&db->db_mtx);
		return (dr);
	}

	/*
	 * Only valid if not already dirty.
	 */
	ASSERT(dn->dn_object == 0 ||
	    dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx ==
	    (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN));

	ASSERT3U(dn->dn_nlevels, >, db->db_level);
	ASSERT((dn->dn_phys->dn_nlevels == 0 && db->db_level == 0) ||
	    dn->dn_phys->dn_nlevels > db->db_level ||
	    dn->dn_next_nlevels[txgoff] > db->db_level ||
	    dn->dn_next_nlevels[(tx->tx_txg-1) & TXG_MASK] > db->db_level ||
	    dn->dn_next_nlevels[(tx->tx_txg-2) & TXG_MASK] > db->db_level);

	/*
	 * We should only be dirtying in syncing context if it's the
	 * mos or we're initializing the os or it's a special object.
	 * However, we are allowed to dirty in syncing context provided
	 * we already dirtied it in open context.  Hence we must make
	 * this assertion only if we're not already dirty.
	 */
	os = dn->dn_objset;
	ASSERT(!dmu_tx_is_syncing(tx) || DMU_OBJECT_IS_SPECIAL(dn->dn_object) ||
	    os->os_dsl_dataset == NULL || BP_IS_HOLE(os->os_rootbp));
	ASSERT(db->db.db_size != 0);

	dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);

	if (db->db_blkid != DMU_BONUS_BLKID) {
		/*
		 * Update the accounting.
		 * Note: we delay "free accounting" until after we drop
		 * the db_mtx.  This keeps us from grabbing other locks
		 * (and possibly deadlocking) in bp_get_dsize() while
		 * also holding the db_mtx.
		 */
		dnode_willuse_space(dn, db->db.db_size, tx);
		do_free_accounting = dbuf_block_freeable(db);
	}

	/*
	 * If this buffer is dirty in an old transaction group we need
	 * to make a copy of it so that the changes we make in this
	 * transaction group won't leak out when we sync the older txg.
	 */
	dr = kmem_zalloc(sizeof (dbuf_dirty_record_t), KM_SLEEP);
	if (db->db_level == 0) {
		void *data_old = db->db_buf;

		if (db->db_state != DB_NOFILL) {
			if (db->db_blkid == DMU_BONUS_BLKID) {
				dbuf_fix_old_data(db, tx->tx_txg);
				data_old = db->db.db_data;
			} else if (db->db.db_object != DMU_META_DNODE_OBJECT) {
				/*
				 * Release the data buffer from the cache so
				 * that we can modify it without impacting
				 * possible other users of this cached data
				 * block.  Note that indirect blocks and
				 * private objects are not released until the
				 * syncing state (since they are only modified
				 * then).
				 */
				arc_release(db->db_buf, db);
				dbuf_fix_old_data(db, tx->tx_txg);
				data_old = db->db_buf;
			}
			ASSERT(data_old != NULL);
		}
		dr->dt.dl.dr_data = data_old;
	} else {
		mutex_init(&dr->dt.di.dr_mtx, NULL, MUTEX_DEFAULT, NULL);
		list_create(&dr->dt.di.dr_children,
		    sizeof (dbuf_dirty_record_t),
		    offsetof(dbuf_dirty_record_t, dr_dirty_node));
	}
	dr->dr_dbuf = db;
	dr->dr_txg = tx->tx_txg;
	dr->dr_next = *drp;
	*drp = dr;

	/*
	 * We could have been freed_in_flight between the dbuf_noread
	 * and dbuf_dirty.  We win, as though the dbuf_noread() had
	 * happened after the free.
	 */
	if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
	    db->db_blkid != DMU_SPILL_BLKID) {
		mutex_enter(&dn->dn_mtx);
		dnode_clear_range(dn, db->db_blkid, 1, tx);
		mutex_exit(&dn->dn_mtx);
		db->db_freed_in_flight = FALSE;
	}

	/*
	 * This buffer is now part of this txg
	 */
	dbuf_add_ref(db, (void *)(uintptr_t)tx->tx_txg);
	db->db_dirtycnt += 1;
	ASSERT3U(db->db_dirtycnt, <=, 3);

	mutex_exit(&db->db_mtx);

	if (db->db_blkid == DMU_BONUS_BLKID ||
	    db->db_blkid == DMU_SPILL_BLKID) {
		mutex_enter(&dn->dn_mtx);
		ASSERT(!list_link_active(&dr->dr_dirty_node));
		list_insert_tail(&dn->dn_dirty_records[txgoff], dr);
		mutex_exit(&dn->dn_mtx);
		dnode_setdirty(dn, tx);
		DB_DNODE_EXIT(db);
		return (dr);
	} else if (do_free_accounting) {
		blkptr_t *bp = db->db_blkptr;
		int64_t willfree = (bp && !BP_IS_HOLE(bp)) ?
		    bp_get_dsize(os->os_spa, bp) : db->db.db_size;
		/*
		 * This is only a guess -- if the dbuf is dirty
		 * in a previous txg, we don't know how much
		 * space it will use on disk yet.  We should
		 * really have the struct_rwlock to access
		 * db_blkptr, but since this is just a guess,
		 * it's OK if we get an odd answer.
		 */
		ddt_prefetch(os->os_spa, bp);
		dnode_willuse_space(dn, -willfree, tx);
	}

	if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) {
		rw_enter(&dn->dn_struct_rwlock, RW_READER);
		drop_struct_lock = TRUE;
	}

	if (db->db_level == 0) {
		dnode_new_blkid(dn, db->db_blkid, tx, drop_struct_lock);
		ASSERT(dn->dn_maxblkid >= db->db_blkid);
	}

	if (db->db_level+1 < dn->dn_nlevels) {
		dmu_buf_impl_t *parent = db->db_parent;
		dbuf_dirty_record_t *di;
		int parent_held = FALSE;

		if (db->db_parent == NULL || db->db_parent == dn->dn_dbuf) {
			int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;

			parent = dbuf_hold_level(dn, db->db_level+1,
			    db->db_blkid >> epbs, FTAG);
			ASSERT(parent != NULL);
			parent_held = TRUE;
		}
		if (drop_struct_lock)
			rw_exit(&dn->dn_struct_rwlock);
		ASSERT3U(db->db_level+1, ==, parent->db_level);
		di = dbuf_dirty(parent, tx);
		if (parent_held)
			dbuf_rele(parent, FTAG);

		mutex_enter(&db->db_mtx);
		/* possible race with dbuf_undirty() */
		if (db->db_last_dirty == dr ||
		    dn->dn_object == DMU_META_DNODE_OBJECT) {
			mutex_enter(&di->dt.di.dr_mtx);
			ASSERT3U(di->dr_txg, ==, tx->tx_txg);
			ASSERT(!list_link_active(&dr->dr_dirty_node));
			list_insert_tail(&di->dt.di.dr_children, dr);
			mutex_exit(&di->dt.di.dr_mtx);
			dr->dr_parent = di;
		}
		mutex_exit(&db->db_mtx);
	} else {
		ASSERT(db->db_level+1 == dn->dn_nlevels);
		ASSERT(db->db_blkid < dn->dn_nblkptr);
		ASSERT(db->db_parent == NULL || db->db_parent == dn->dn_dbuf);
		mutex_enter(&dn->dn_mtx);
		ASSERT(!list_link_active(&dr->dr_dirty_node));
		list_insert_tail(&dn->dn_dirty_records[txgoff], dr);
		mutex_exit(&dn->dn_mtx);
		if (drop_struct_lock)
			rw_exit(&dn->dn_struct_rwlock);
	}

	dnode_setdirty(dn, tx);
	DB_DNODE_EXIT(db);
	return (dr);
}
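
/*
 * A note on dirty-record bookkeeping (summarizing dbuf_dirty() above and
 * dbuf_undirty() below): dbuf_dirty() allocates one dbuf_dirty_record_t
 * per txg in which a dbuf is modified and links it onto db_last_dirty.
 * The record is also inserted in one of three places -- the dnode's
 * dn_dirty_records list for bonus and spill blocks, that same list for
 * blocks at the top level of the indirect tree, or the parent indirect
 * dbuf's dr_children list otherwise -- and dbuf_undirty() must perform
 * the matching list_remove() for whichever insertion was done.
 */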

static int
dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
{
	dnode_t *dn;
	uint64_t txg = tx->tx_txg;
	dbuf_dirty_record_t *dr, **drp;

	ASSERT(txg != 0);
	ASSERT(db->db_blkid != DMU_BONUS_BLKID);

	mutex_enter(&db->db_mtx);
	/*
	 * If this buffer is not dirty, we're done.
	 */
	for (drp = &db->db_last_dirty; (dr = *drp) != NULL; drp = &dr->dr_next)
		if (dr->dr_txg <= txg)
			break;
	if (dr == NULL || dr->dr_txg < txg) {
		mutex_exit(&db->db_mtx);
		return (0);
	}
	ASSERT(dr->dr_txg == txg);
	ASSERT(dr->dr_dbuf == db);

	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);

	/*
	 * If this buffer is currently held, we cannot undirty
	 * it, since one of the current holders may be in the
	 * middle of an update.  Note that users of dbuf_undirty()
	 * should not place a hold on the dbuf before the call.
	 * Also note: we can get here with a spill block, so
	 * test for that similar to how dbuf_dirty does.
	 */
	if (refcount_count(&db->db_holds) > db->db_dirtycnt) {
		mutex_exit(&db->db_mtx);
		/* Make sure we don't toss this buffer at sync phase */
		if (db->db_blkid != DMU_SPILL_BLKID) {
			mutex_enter(&dn->dn_mtx);
			dnode_clear_range(dn, db->db_blkid, 1, tx);
			mutex_exit(&dn->dn_mtx);
		}
		DB_DNODE_EXIT(db);
		return (0);
	}

	dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);

	ASSERT(db->db.db_size != 0);

	/* XXX would be nice to fix up dn_towrite_space[] */

	*drp = dr->dr_next;

	/*
	 * Note that there are three places in dbuf_dirty()
	 * where this dirty record may be put on a list.
	 * Make sure to do a list_remove corresponding to
	 * every one of those list_insert calls.
	 */
	if (dr->dr_parent) {
		mutex_enter(&dr->dr_parent->dt.di.dr_mtx);
		list_remove(&dr->dr_parent->dt.di.dr_children, dr);
		mutex_exit(&dr->dr_parent->dt.di.dr_mtx);
	} else if (db->db_blkid == DMU_SPILL_BLKID ||
	    db->db_level+1 == dn->dn_nlevels) {
		ASSERT(db->db_blkptr == NULL || db->db_parent == dn->dn_dbuf);
		mutex_enter(&dn->dn_mtx);
		list_remove(&dn->dn_dirty_records[txg & TXG_MASK], dr);
		mutex_exit(&dn->dn_mtx);
	}
	DB_DNODE_EXIT(db);

	if (db->db_level == 0) {
		if (db->db_state != DB_NOFILL) {
			dbuf_unoverride(dr);

			ASSERT(db->db_buf != NULL);
			ASSERT(dr->dt.dl.dr_data != NULL);
			if (dr->dt.dl.dr_data != db->db_buf)
				VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data,
				    db) == 1);
		}
	} else {
		ASSERT(db->db_buf != NULL);
		ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
		mutex_destroy(&dr->dt.di.dr_mtx);
		list_destroy(&dr->dt.di.dr_children);
	}
	kmem_free(dr, sizeof (dbuf_dirty_record_t));

	ASSERT(db->db_dirtycnt > 0);
	db->db_dirtycnt -= 1;

	if (refcount_remove(&db->db_holds, (void *)(uintptr_t)txg) == 0) {
		arc_buf_t *buf = db->db_buf;

		ASSERT(db->db_state == DB_NOFILL || arc_released(buf));
		dbuf_set_data(db, NULL);
		VERIFY(arc_buf_remove_ref(buf, db) == 1);
		dbuf_evict(db);
		return (1);
	}

	mutex_exit(&db->db_mtx);
	return (0);
}

#pragma weak dmu_buf_will_dirty = dbuf_will_dirty
void
dbuf_will_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
{
	int rf = DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH;

	ASSERT(tx->tx_txg != 0);
	ASSERT(!refcount_is_zero(&db->db_holds));

	DB_DNODE_ENTER(db);
	if (RW_WRITE_HELD(&DB_DNODE(db)->dn_struct_rwlock))
		rf |= DB_RF_HAVESTRUCT;
	DB_DNODE_EXIT(db);
	(void) dbuf_read(db, NULL, rf);
	(void) dbuf_dirty(db, tx);
}

void
dmu_buf_will_not_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
{
	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;

	db->db_state = DB_NOFILL;

	dmu_buf_will_fill(db_fake, tx);
}

void
dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
{
	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;

	ASSERT(db->db_blkid != DMU_BONUS_BLKID);
	ASSERT(tx->tx_txg != 0);
	ASSERT(db->db_level == 0);
	ASSERT(!refcount_is_zero(&db->db_holds));

	ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT ||
	    dmu_tx_private_ok(tx));

	dbuf_noread(db);
	(void) dbuf_dirty(db, tx);
}

#pragma weak dmu_buf_fill_done = dbuf_fill_done
/* ARGSUSED */
void
dbuf_fill_done(dmu_buf_impl_t *db, dmu_tx_t *tx)
{
	mutex_enter(&db->db_mtx);
	DBUF_VERIFY(db);

	if (db->db_state == DB_FILL) {
		if (db->db_level == 0 && db->db_freed_in_flight) {
			ASSERT(db->db_blkid != DMU_BONUS_BLKID);
			/* we were freed while filling */
			/* XXX dbuf_undirty? */
			bzero(db->db.db_data, db->db.db_size);
			db->db_freed_in_flight = FALSE;
		}
		db->db_state = DB_CACHED;
		cv_broadcast(&db->db_changed);
	}
	mutex_exit(&db->db_mtx);
}

/*
 * Directly assign a provided arc buf to a given dbuf if it's not referenced
 * by anybody except our caller.  Otherwise copy arcbuf's contents to dbuf.
 */
void
dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx)
{
	ASSERT(!refcount_is_zero(&db->db_holds));
	ASSERT(db->db_blkid != DMU_BONUS_BLKID);
	ASSERT(db->db_level == 0);
	ASSERT(DBUF_GET_BUFC_TYPE(db) == ARC_BUFC_DATA);
	ASSERT(buf != NULL);
	ASSERT(arc_buf_size(buf) == db->db.db_size);
	ASSERT(tx->tx_txg != 0);

	arc_return_buf(buf, db);
	ASSERT(arc_released(buf));

	mutex_enter(&db->db_mtx);

	while (db->db_state == DB_READ || db->db_state == DB_FILL)
		cv_wait(&db->db_changed, &db->db_mtx);

	ASSERT(db->db_state == DB_CACHED || db->db_state == DB_UNCACHED);

	if (db->db_state == DB_CACHED &&
	    refcount_count(&db->db_holds) - 1 > db->db_dirtycnt) {
		mutex_exit(&db->db_mtx);
		(void) dbuf_dirty(db, tx);
		bcopy(buf->b_data, db->db.db_data, db->db.db_size);
		VERIFY(arc_buf_remove_ref(buf, db) == 1);
		xuio_stat_wbuf_copied();
		return;
	}

	xuio_stat_wbuf_nocopy();
	if (db->db_state == DB_CACHED) {
		dbuf_dirty_record_t *dr = db->db_last_dirty;

		ASSERT(db->db_buf != NULL);
		if (dr != NULL && dr->dr_txg == tx->tx_txg) {
			ASSERT(dr->dt.dl.dr_data == db->db_buf);
			if (!arc_released(db->db_buf)) {
				ASSERT(dr->dt.dl.dr_override_state ==
				    DR_OVERRIDDEN);
				arc_release(db->db_buf, db);
			}
			dr->dt.dl.dr_data = buf;
			VERIFY(arc_buf_remove_ref(db->db_buf, db) == 1);
		} else if (dr == NULL || dr->dt.dl.dr_data != db->db_buf) {
			arc_release(db->db_buf, db);
			VERIFY(arc_buf_remove_ref(db->db_buf, db) == 1);
		}
		db->db_buf = NULL;
	}
	ASSERT(db->db_buf == NULL);
	dbuf_set_data(db, buf);
	db->db_state = DB_FILL;
	mutex_exit(&db->db_mtx);
	(void) dbuf_dirty(db, tx);
	dbuf_fill_done(db, tx);
}
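
/*
 * dbuf_loan_arcbuf() (above) and dbuf_assign_arcbuf() together support a
 * copy-avoiding write path: a caller borrows an anonymous arc_buf, fills
 * it, and hands it back here.  If the dbuf is uncontended the loaned
 * buffer simply becomes the dbuf's data (the case counted by
 * xuio_stat_wbuf_nocopy()); otherwise the contents are copied into the
 * existing buffer and the loaned buffer is released.
 */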

/*
 * "Clear" the contents of this dbuf.  This will mark the dbuf
 * EVICTING and clear *most* of its references.  Unfortunately,
 * when we are not holding the dn_dbufs_mtx, we can't clear the
 * entry in the dn_dbufs list.  We have to wait until dbuf_destroy()
 * in this case.  For callers from the DMU we will usually see:
 *	dbuf_clear()->arc_buf_evict()->dbuf_do_evict()->dbuf_destroy()
 * For the arc callback, we will usually see:
 *	dbuf_do_evict()->dbuf_clear();dbuf_destroy()
 * Sometimes, though, we will get a mix of these two:
 *	DMU: dbuf_clear()->arc_buf_evict()
 *	ARC: dbuf_do_evict()->dbuf_destroy()
 */
void
dbuf_clear(dmu_buf_impl_t *db)
{
	dnode_t *dn;
	dmu_buf_impl_t *parent = db->db_parent;
	dmu_buf_impl_t *dndb;
	int dbuf_gone = FALSE;

	ASSERT(MUTEX_HELD(&db->db_mtx));
	ASSERT(refcount_is_zero(&db->db_holds));

	dbuf_evict_user(db);

	if (db->db_state == DB_CACHED) {
		ASSERT(db->db.db_data != NULL);
		if (db->db_blkid == DMU_BONUS_BLKID) {
			zio_buf_free(db->db.db_data, DN_MAX_BONUSLEN);
			arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
		}
		db->db.db_data = NULL;
		db->db_state = DB_UNCACHED;
	}

	ASSERT(db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL);
	ASSERT(db->db_data_pending == NULL);

	db->db_state = DB_EVICTING;
	db->db_blkptr = NULL;

	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);
	dndb = dn->dn_dbuf;
	if (db->db_blkid != DMU_BONUS_BLKID && MUTEX_HELD(&dn->dn_dbufs_mtx)) {
		list_remove(&dn->dn_dbufs, db);
		(void) atomic_dec_32_nv(&dn->dn_dbufs_count);
		membar_producer();
		DB_DNODE_EXIT(db);
		/*
		 * Decrementing the dbuf count means that the hold
		 * corresponding to the removed dbuf is no longer discounted
		 * in dnode_move(), so the dnode cannot be moved until after
		 * we release the hold.  The membar_producer() ensures
		 * visibility of the decremented value in dnode_move(), since
		 * DB_DNODE_EXIT doesn't actually release any lock.
		 */
		dnode_rele(dn, db);
		db->db_dnode_handle = NULL;
	} else {
		DB_DNODE_EXIT(db);
	}

	if (db->db_buf)
		dbuf_gone = arc_buf_evict(db->db_buf);

	if (!dbuf_gone)
		mutex_exit(&db->db_mtx);

	/*
	 * If this dbuf is referenced from an indirect dbuf,
	 * decrement the ref count on the indirect dbuf.
	 */
	if (parent && parent != dndb)
		dbuf_rele(parent, db);
}

static int
dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse,
    dmu_buf_impl_t **parentp, blkptr_t **bpp)
{
	int nlevels, epbs;

	*parentp = NULL;
	*bpp = NULL;

	ASSERT(blkid != DMU_BONUS_BLKID);

	if (blkid == DMU_SPILL_BLKID) {
		mutex_enter(&dn->dn_mtx);
		if (dn->dn_have_spill &&
		    (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR))
			*bpp = &dn->dn_phys->dn_spill;
		else
			*bpp = NULL;
		dbuf_add_ref(dn->dn_dbuf, NULL);
		*parentp = dn->dn_dbuf;
		mutex_exit(&dn->dn_mtx);
		return (0);
	}

	if (dn->dn_phys->dn_nlevels == 0)
		nlevels = 1;
	else
		nlevels = dn->dn_phys->dn_nlevels;

	epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;

	ASSERT3U(level * epbs, <, 64);
	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
	if (level >= nlevels ||
	    (blkid > (dn->dn_phys->dn_maxblkid >> (level * epbs)))) {
		/* the buffer has no parent yet */
		return (ENOENT);
	} else if (level < nlevels-1) {
		/* this block is referenced from an indirect block */
		int err = dbuf_hold_impl(dn, level+1,
		    blkid >> epbs, fail_sparse, NULL, parentp);
		if (err)
			return (err);
		err = dbuf_read(*parentp, NULL,
		    (DB_RF_HAVESTRUCT | DB_RF_NOPREFETCH | DB_RF_CANFAIL));
		if (err) {
			dbuf_rele(*parentp, NULL);
			*parentp = NULL;
			return (err);
		}
		*bpp = ((blkptr_t *)(*parentp)->db.db_data) +
		    (blkid & ((1ULL << epbs) - 1));
		return (0);
	} else {
		/* the block is referenced from the dnode */
		ASSERT3U(level, ==, nlevels-1);
		ASSERT(dn->dn_phys->dn_nblkptr == 0 ||
		    blkid < dn->dn_phys->dn_nblkptr);
		if (dn->dn_dbuf) {
			dbuf_add_ref(dn->dn_dbuf, NULL);
			*parentp = dn->dn_dbuf;
		}
		*bpp = &dn->dn_phys->dn_blkptr[blkid];
		return (0);
	}
}

static dmu_buf_impl_t *
dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid,
    dmu_buf_impl_t *parent, blkptr_t *blkptr)
{
	objset_t *os = dn->dn_objset;
	dmu_buf_impl_t *db, *odb;

	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
	ASSERT(dn->dn_type != DMU_OT_NONE);

	db = kmem_cache_alloc(dbuf_cache, KM_SLEEP);

	db->db_objset = os;
	db->db.db_object = dn->dn_object;
	db->db_level = level;
	db->db_blkid = blkid;
	db->db_last_dirty = NULL;
	db->db_dirtycnt = 0;
	db->db_dnode_handle = dn->dn_handle;
	db->db_parent = parent;
	db->db_blkptr = blkptr;

	db->db_user_ptr = NULL;
	db->db_user_data_ptr_ptr = NULL;
	db->db_evict_func = NULL;
	db->db_immediate_evict = 0;
	db->db_freed_in_flight = 0;

	if (blkid == DMU_BONUS_BLKID) {
		ASSERT3P(parent, ==, dn->dn_dbuf);
		db->db.db_size = DN_MAX_BONUSLEN -
		    (dn->dn_nblkptr-1) * sizeof (blkptr_t);
		ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
		db->db.db_offset = DMU_BONUS_BLKID;
		db->db_state = DB_UNCACHED;
		/* the bonus dbuf is not placed in the hash table */
		arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
		return (db);
	} else if (blkid == DMU_SPILL_BLKID) {
		db->db.db_size = (blkptr != NULL) ?
		    BP_GET_LSIZE(blkptr) : SPA_MINBLOCKSIZE;
		db->db.db_offset = 0;
	} else {
		int blocksize =
		    db->db_level ? 1 << dn->dn_indblkshift : dn->dn_datablksz;
		db->db.db_size = blocksize;
		db->db.db_offset = db->db_blkid * blocksize;
	}

	/*
	 * Hold the dn_dbufs_mtx while we get the new dbuf
	 * in the hash table *and* added to the dbufs list.
	 * This prevents a possible deadlock with someone
	 * trying to look up this dbuf before it's added to the
	 * dn_dbufs list.
	 */
	mutex_enter(&dn->dn_dbufs_mtx);
	db->db_state = DB_EVICTING;
	if ((odb = dbuf_hash_insert(db)) != NULL) {
		/* someone else inserted it first */
		kmem_cache_free(dbuf_cache, db);
		mutex_exit(&dn->dn_dbufs_mtx);
		return (odb);
	}
	list_insert_head(&dn->dn_dbufs, db);
	db->db_state = DB_UNCACHED;
	mutex_exit(&dn->dn_dbufs_mtx);
	arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);

	if (parent && parent != dn->dn_dbuf)
		dbuf_add_ref(parent, db);

	ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT ||
	    refcount_count(&dn->dn_holds) > 0);
	(void) refcount_add(&dn->dn_holds, db);
	(void) atomic_inc_32_nv(&dn->dn_dbufs_count);

	dprintf_dbuf(db, "db=%p\n", db);

	return (db);
}

static int
dbuf_do_evict(void *private)
{
	arc_buf_t *buf = private;
	dmu_buf_impl_t *db = buf->b_private;

	if (!MUTEX_HELD(&db->db_mtx))
		mutex_enter(&db->db_mtx);

	ASSERT(refcount_is_zero(&db->db_holds));

	if (db->db_state != DB_EVICTING) {
		ASSERT(db->db_state == DB_CACHED);
		DBUF_VERIFY(db);
		db->db_buf = NULL;
		dbuf_evict(db);
	} else {
		mutex_exit(&db->db_mtx);
		dbuf_destroy(db);
	}
	return (0);
}

static void
dbuf_destroy(dmu_buf_impl_t *db)
{
	ASSERT(refcount_is_zero(&db->db_holds));

	if (db->db_blkid != DMU_BONUS_BLKID) {
		/*
		 * If this dbuf is still on the dn_dbufs list,
		 * remove it from that list.
		 */
		if (db->db_dnode_handle != NULL) {
			dnode_t *dn;

			DB_DNODE_ENTER(db);
			dn = DB_DNODE(db);
			mutex_enter(&dn->dn_dbufs_mtx);
			list_remove(&dn->dn_dbufs, db);
			(void) atomic_dec_32_nv(&dn->dn_dbufs_count);
			mutex_exit(&dn->dn_dbufs_mtx);
			DB_DNODE_EXIT(db);
			/*
			 * Decrementing the dbuf count means that the hold
			 * corresponding to the removed dbuf is no longer
			 * discounted in dnode_move(), so the dnode cannot be
			 * moved until after we release the hold.
			 */
			dnode_rele(dn, db);
			db->db_dnode_handle = NULL;
		}
		dbuf_hash_remove(db);
	}
	db->db_parent = NULL;
	db->db_buf = NULL;

	ASSERT(!list_link_active(&db->db_link));
	ASSERT(db->db.db_data == NULL);
	ASSERT(db->db_hash_next == NULL);
	ASSERT(db->db_blkptr == NULL);
	ASSERT(db->db_data_pending == NULL);

	kmem_cache_free(dbuf_cache, db);
	arc_space_return(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
}
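
/*
 * dbuf_prefetch() (below) never creates or fills a dbuf itself: if the
 * block is not already cached as a dbuf, it issues a speculative,
 * callback-free arc_read() (ZIO_FLAG_SPECULATIVE | ZIO_FLAG_CANFAIL with
 * ARC_PREFETCH) so the block simply lands in the ARC, where a later
 * dbuf_read() of the same block can find it without going to disk.
 */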

void
dbuf_prefetch(dnode_t *dn, uint64_t blkid)
{
	dmu_buf_impl_t *db = NULL;
	blkptr_t *bp = NULL;

	ASSERT(blkid != DMU_BONUS_BLKID);
	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));

	if (dnode_block_freed(dn, blkid))
		return;

	/* dbuf_find() returns with db_mtx held */
	if ((db = dbuf_find(dn, 0, blkid)) != NULL) {
		/*
		 * This dbuf is already in the cache.  We assume that
		 * it is already CACHED, or else about to be either
		 * read or filled.
		 */
		mutex_exit(&db->db_mtx);
		return;
	}

	if (dbuf_findbp(dn, 0, blkid, TRUE, &db, &bp) == 0) {
		if (bp && !BP_IS_HOLE(bp)) {
			int priority = dn->dn_type == DMU_OT_DDT_ZAP ?
			    ZIO_PRIORITY_DDT_PREFETCH : ZIO_PRIORITY_ASYNC_READ;
			dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
			uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH;
			zbookmark_t zb;

			SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET,
			    dn->dn_object, 0, blkid);

			(void) arc_read(NULL, dn->dn_objset->os_spa,
			    bp, NULL, NULL, priority,
			    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
			    &aflags, &zb);
		}
		if (db)
			dbuf_rele(db, NULL);
	}
}

/*
 * Returns with db_holds incremented, and db_mtx not held.
 * Note: dn_struct_rwlock must be held.
 */
int
dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, int fail_sparse,
    void *tag, dmu_buf_impl_t **dbp)
{
	dmu_buf_impl_t *db, *parent = NULL;

	ASSERT(blkid != DMU_BONUS_BLKID);
	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
	ASSERT3U(dn->dn_nlevels, >, level);

	*dbp = NULL;
top:
	/* dbuf_find() returns with db_mtx held */
	db = dbuf_find(dn, level, blkid);

	if (db == NULL) {
		blkptr_t *bp = NULL;
		int err;

		ASSERT3P(parent, ==, NULL);
		err = dbuf_findbp(dn, level, blkid, fail_sparse, &parent, &bp);
		if (fail_sparse) {
			if (err == 0 && bp && BP_IS_HOLE(bp))
				err = ENOENT;
			if (err) {
				if (parent)
					dbuf_rele(parent, NULL);
				return (err);
			}
		}
		if (err && err != ENOENT)
			return (err);
		db = dbuf_create(dn, level, blkid, parent, bp);
	}

	if (db->db_buf && refcount_is_zero(&db->db_holds)) {
		arc_buf_add_ref(db->db_buf, db);
		if (db->db_buf->b_data == NULL) {
			dbuf_clear(db);
			if (parent) {
				dbuf_rele(parent, NULL);
				parent = NULL;
			}
			goto top;
		}
		ASSERT3P(db->db.db_data, ==, db->db_buf->b_data);
	}

	ASSERT(db->db_buf == NULL || arc_referenced(db->db_buf));

	/*
	 * If this buffer is currently syncing out, and we are still
	 * referencing it from db_data, we need to make a copy of it in
	 * case we decide we want to dirty it again in this txg.
	 */
	if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
	    dn->dn_object != DMU_META_DNODE_OBJECT &&
	    db->db_state == DB_CACHED && db->db_data_pending) {
		dbuf_dirty_record_t *dr = db->db_data_pending;

		if (dr->dt.dl.dr_data == db->db_buf) {
			arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);

			dbuf_set_data(db,
			    arc_buf_alloc(dn->dn_objset->os_spa,
			    db->db.db_size, db, type));
			bcopy(dr->dt.dl.dr_data->b_data, db->db.db_data,
			    db->db.db_size);
		}
	}

	(void) refcount_add(&db->db_holds, tag);
	dbuf_update_data(db);
	DBUF_VERIFY(db);
	mutex_exit(&db->db_mtx);

	/* NOTE: we can't rele the parent until after we drop the db_mtx */
	if (parent)
		dbuf_rele(parent, NULL);

	ASSERT3P(DB_DNODE(db), ==, dn);
	ASSERT3U(db->db_blkid, ==, blkid);
	ASSERT3U(db->db_level, ==, level);
	*dbp = db;

	return (0);
}
dmu_buf_impl_t *
dbuf_hold(dnode_t *dn, uint64_t blkid, void *tag)
{
	dmu_buf_impl_t *db;
	int err = dbuf_hold_impl(dn, 0, blkid, FALSE, tag, &db);
	return (err ? NULL : db);
}

dmu_buf_impl_t *
dbuf_hold_level(dnode_t *dn, int level, uint64_t blkid, void *tag)
{
	dmu_buf_impl_t *db;
	int err = dbuf_hold_impl(dn, level, blkid, FALSE, tag, &db);
	return (err ? NULL : db);
}

void
dbuf_create_bonus(dnode_t *dn)
{
	ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));

	ASSERT(dn->dn_bonus == NULL);
	dn->dn_bonus = dbuf_create(dn, 0, DMU_BONUS_BLKID, dn->dn_dbuf, NULL);
}

int
dbuf_spill_set_blksz(dmu_buf_t *db_fake, uint64_t blksz, dmu_tx_t *tx)
{
	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
	dnode_t *dn;

	if (db->db_blkid != DMU_SPILL_BLKID)
		return (ENOTSUP);
	if (blksz == 0)
		blksz = SPA_MINBLOCKSIZE;
	if (blksz > SPA_MAXBLOCKSIZE)
		blksz = SPA_MAXBLOCKSIZE;
	else
		blksz = P2ROUNDUP(blksz, SPA_MINBLOCKSIZE);

	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);
	rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
	dbuf_new_size(db, blksz, tx);
	rw_exit(&dn->dn_struct_rwlock);
	DB_DNODE_EXIT(db);

	return (0);
}

void
dbuf_rm_spill(dnode_t *dn, dmu_tx_t *tx)
{
	dbuf_free_range(dn, DMU_SPILL_BLKID, DMU_SPILL_BLKID, tx);
}

#pragma weak dmu_buf_add_ref = dbuf_add_ref
void
dbuf_add_ref(dmu_buf_impl_t *db, void *tag)
{
	int64_t holds = refcount_add(&db->db_holds, tag);
	ASSERT(holds > 1);
}

/*
 * If you call dbuf_rele() you had better not be referencing the dnode handle
 * unless you have some other direct or indirect hold on the dnode. (An indirect
 * hold is a hold on one of the dnode's dbufs, including the bonus buffer.)
 * Without that, the dbuf_rele() could lead to a dnode_rele() followed by the
 * dnode's parent dbuf evicting its dnode handles.
 */
#pragma weak dmu_buf_rele = dbuf_rele
void
dbuf_rele(dmu_buf_impl_t *db, void *tag)
{
	mutex_enter(&db->db_mtx);
	dbuf_rele_and_unlock(db, tag);
}

/*
 * dbuf_rele() for an already-locked dbuf. This is necessary to allow
 * db_dirtycnt and db_holds to be updated atomically.
 */
void
dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag)
{
	int64_t holds;

	ASSERT(MUTEX_HELD(&db->db_mtx));
	DBUF_VERIFY(db);

	/*
	 * Remove the reference to the dbuf before removing its hold on the
	 * dnode so we can guarantee in dnode_move() that a referenced bonus
	 * buffer has a corresponding dnode hold.
	 */
	holds = refcount_remove(&db->db_holds, tag);
	ASSERT(holds >= 0);

	/*
	 * We can't freeze indirects if there is a possibility that they
	 * may be modified in the current syncing context.
	 */
	if (db->db_buf && holds == (db->db_level == 0 ? db->db_dirtycnt : 0))
		arc_buf_freeze(db->db_buf);

	if (holds == db->db_dirtycnt &&
	    db->db_level == 0 && db->db_immediate_evict)
		dbuf_evict_user(db);

	if (holds == 0) {
		if (db->db_blkid == DMU_BONUS_BLKID) {
			mutex_exit(&db->db_mtx);

			/*
			 * If the dnode moves here, we cannot cross this barrier
			 * until the move completes.
			 */
			DB_DNODE_ENTER(db);
			(void) atomic_dec_32_nv(&DB_DNODE(db)->dn_dbufs_count);
			DB_DNODE_EXIT(db);
			/*
			 * The bonus buffer's dnode hold is no longer discounted
			 * in dnode_move(). The dnode cannot move until after
			 * the dnode_rele.
			 */
			dnode_rele(DB_DNODE(db), db);
		} else if (db->db_buf == NULL) {
			/*
			 * This is a special case: we never associated this
			 * dbuf with any data allocated from the ARC.
			 */
			ASSERT(db->db_state == DB_UNCACHED ||
			    db->db_state == DB_NOFILL);
			dbuf_evict(db);
		} else if (arc_released(db->db_buf)) {
			arc_buf_t *buf = db->db_buf;
			/*
			 * This dbuf has anonymous data associated with it.
			 */
			dbuf_set_data(db, NULL);
			VERIFY(arc_buf_remove_ref(buf, db) == 1);
			dbuf_evict(db);
		} else {
			VERIFY(arc_buf_remove_ref(db->db_buf, db) == 0);

			/*
			 * A dbuf will be eligible for eviction if either the
			 * 'primarycache' property is set or a duplicate
			 * copy of this buffer is already cached in the arc.
			 *
			 * In the case of the 'primarycache' property, a buffer
			 * is considered for eviction if it matches the
			 * criteria set in the property.
			 *
			 * To decide if our buffer is considered a
			 * duplicate, we must call into the arc to determine
			 * if multiple buffers are referencing the same
			 * block on-disk. If so, then we simply evict
			 * ourselves.
			 */
			if (!DBUF_IS_CACHEABLE(db) ||
			    arc_buf_eviction_needed(db->db_buf))
				dbuf_clear(db);
			else
				mutex_exit(&db->db_mtx);
		}
	} else {
		mutex_exit(&db->db_mtx);
	}
}

#pragma weak dmu_buf_refcount = dbuf_refcount
uint64_t
dbuf_refcount(dmu_buf_impl_t *db)
{
	return (refcount_count(&db->db_holds));
}

void *
dmu_buf_set_user(dmu_buf_t *db_fake, void *user_ptr, void *user_data_ptr_ptr,
    dmu_buf_evict_func_t *evict_func)
{
	return (dmu_buf_update_user(db_fake, NULL, user_ptr,
	    user_data_ptr_ptr, evict_func));
}

void *
dmu_buf_set_user_ie(dmu_buf_t *db_fake, void *user_ptr, void *user_data_ptr_ptr,
    dmu_buf_evict_func_t *evict_func)
{
	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;

	db->db_immediate_evict = TRUE;
	return (dmu_buf_update_user(db_fake, NULL, user_ptr,
	    user_data_ptr_ptr, evict_func));
}

void *
dmu_buf_update_user(dmu_buf_t *db_fake, void *old_user_ptr, void *user_ptr,
    void *user_data_ptr_ptr, dmu_buf_evict_func_t *evict_func)
{
	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
	ASSERT(db->db_level == 0);

	ASSERT((user_ptr == NULL) == (evict_func == NULL));

	mutex_enter(&db->db_mtx);

	if (db->db_user_ptr == old_user_ptr) {
		db->db_user_ptr = user_ptr;
		db->db_user_data_ptr_ptr = user_data_ptr_ptr;
		db->db_evict_func = evict_func;

		dbuf_update_data(db);
	} else {
		old_user_ptr = db->db_user_ptr;
	}

	mutex_exit(&db->db_mtx);
	return (old_user_ptr);
}

void *
dmu_buf_get_user(dmu_buf_t *db_fake)
{
	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
	ASSERT(!refcount_is_zero(&db->db_holds));

	return (db->db_user_ptr);
}

boolean_t
dmu_buf_freeable(dmu_buf_t *dbuf)
{
	boolean_t res = B_FALSE;
	dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf;

	if (db->db_blkptr)
		res = dsl_dataset_block_freeable(db->db_objset->os_dsl_dataset,
		    db->db_blkptr, db->db_blkptr->blk_birth);

	return (res);
}

blkptr_t *
dmu_buf_get_blkptr(dmu_buf_t *db)
{
	dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db;
	return (dbi->db_blkptr);
}

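/*
 * Example for the user-data interfaces above (an illustrative sketch, not
 * taken from this file; the type "my_obj_t" and the names "my_evict" and
 * "my_attach" are hypothetical): a consumer attaches an in-core structure
 * to a held dbuf and relies on the evict callback for teardown.  If another
 * thread attached first, dmu_buf_set_user() returns the existing user
 * pointer and the loser frees its own copy.  The mo_data field is the
 * cached data pointer kept current through user_data_ptr_ptr.
 *
 *	static void
 *	my_evict(dmu_buf_t *db, void *user_ptr)
 *	{
 *		kmem_free(user_ptr, sizeof (my_obj_t));
 *	}
 *
 *	static my_obj_t *
 *	my_attach(dmu_buf_t *db)
 *	{
 *		my_obj_t *obj = kmem_zalloc(sizeof (my_obj_t), KM_SLEEP);
 *		my_obj_t *winner;
 *
 *		obj->mo_db = db;
 *		winner = dmu_buf_set_user(db, obj, &obj->mo_data, my_evict);
 *		if (winner != NULL) {
 *			kmem_free(obj, sizeof (my_obj_t));
 *			return (winner);
 *		}
 *		return (obj);
 *	}
 */
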
static void
dbuf_check_blkptr(dnode_t *dn, dmu_buf_impl_t *db)
{
	/* ASSERT(dmu_tx_is_syncing(tx)) */
	ASSERT(MUTEX_HELD(&db->db_mtx));

	if (db->db_blkptr != NULL)
		return;

	if (db->db_blkid == DMU_SPILL_BLKID) {
		db->db_blkptr = &dn->dn_phys->dn_spill;
		BP_ZERO(db->db_blkptr);
		return;
	}
	if (db->db_level == dn->dn_phys->dn_nlevels-1) {
		/*
		 * This buffer was allocated at a time when there were
		 * no available blkptrs from the dnode, or it was
		 * inappropriate to hook it in (i.e., nlevels mis-match).
		 */
		ASSERT(db->db_blkid < dn->dn_phys->dn_nblkptr);
		ASSERT(db->db_parent == NULL);
		db->db_parent = dn->dn_dbuf;
		db->db_blkptr = &dn->dn_phys->dn_blkptr[db->db_blkid];
		DBUF_VERIFY(db);
	} else {
		dmu_buf_impl_t *parent = db->db_parent;
		int epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;

		ASSERT(dn->dn_phys->dn_nlevels > 1);
		if (parent == NULL) {
			mutex_exit(&db->db_mtx);
			rw_enter(&dn->dn_struct_rwlock, RW_READER);
			(void) dbuf_hold_impl(dn, db->db_level+1,
			    db->db_blkid >> epbs, FALSE, db, &parent);
			rw_exit(&dn->dn_struct_rwlock);
			mutex_enter(&db->db_mtx);
			db->db_parent = parent;
		}
		db->db_blkptr = (blkptr_t *)parent->db.db_data +
		    (db->db_blkid & ((1ULL << epbs) - 1));
		DBUF_VERIFY(db);
	}
}

static void
dbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
{
	dmu_buf_impl_t *db = dr->dr_dbuf;
	dnode_t *dn;
	zio_t *zio;

	ASSERT(dmu_tx_is_syncing(tx));

	dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr);

	mutex_enter(&db->db_mtx);

	ASSERT(db->db_level > 0);
	DBUF_VERIFY(db);

	if (db->db_buf == NULL) {
		mutex_exit(&db->db_mtx);
		(void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED);
		mutex_enter(&db->db_mtx);
	}
	ASSERT3U(db->db_state, ==, DB_CACHED);
	ASSERT(db->db_buf != NULL);

	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);
	ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
	dbuf_check_blkptr(dn, db);
	DB_DNODE_EXIT(db);

	db->db_data_pending = dr;

	mutex_exit(&db->db_mtx);
	dbuf_write(dr, db->db_buf, tx);

	zio = dr->dr_zio;
	mutex_enter(&dr->dt.di.dr_mtx);
	dbuf_sync_list(&dr->dt.di.dr_children, tx);
	ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
	mutex_exit(&dr->dt.di.dr_mtx);
	zio_nowait(zio);
}

static void
dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
{
	arc_buf_t **datap = &dr->dt.dl.dr_data;
	dmu_buf_impl_t *db = dr->dr_dbuf;
	dnode_t *dn;
	objset_t *os;
	uint64_t txg = tx->tx_txg;

	ASSERT(dmu_tx_is_syncing(tx));

	dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr);

	mutex_enter(&db->db_mtx);
	/*
	 * To be synced, we must be dirtied. But we
	 * might have been freed after the dirty.
	 */
	if (db->db_state == DB_UNCACHED) {
		/* This buffer has been freed since it was dirtied */
		ASSERT(db->db.db_data == NULL);
	} else if (db->db_state == DB_FILL) {
		/* This buffer was freed and is now being re-filled */
		ASSERT(db->db.db_data != dr->dt.dl.dr_data);
	} else {
		ASSERT(db->db_state == DB_CACHED || db->db_state == DB_NOFILL);
	}
	DBUF_VERIFY(db);

	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);

	if (db->db_blkid == DMU_SPILL_BLKID) {
		mutex_enter(&dn->dn_mtx);
		dn->dn_phys->dn_flags |= DNODE_FLAG_SPILL_BLKPTR;
		mutex_exit(&dn->dn_mtx);
	}

	/*
	 * If this is a bonus buffer, simply copy the bonus data into the
	 * dnode. It will be written out when the dnode is synced (and it
	 * will be synced, since it must have been dirty for dbuf_sync to
	 * be called).
	 */
	if (db->db_blkid == DMU_BONUS_BLKID) {
		dbuf_dirty_record_t **drp;

		ASSERT(*datap != NULL);
		ASSERT0(db->db_level);
		ASSERT3U(dn->dn_phys->dn_bonuslen, <=, DN_MAX_BONUSLEN);
		bcopy(*datap, DN_BONUS(dn->dn_phys), dn->dn_phys->dn_bonuslen);
		DB_DNODE_EXIT(db);

		if (*datap != db->db.db_data) {
			zio_buf_free(*datap, DN_MAX_BONUSLEN);
			arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
		}
		db->db_data_pending = NULL;
		drp = &db->db_last_dirty;
		while (*drp != dr)
			drp = &(*drp)->dr_next;
		ASSERT(dr->dr_next == NULL);
		ASSERT(dr->dr_dbuf == db);
		*drp = dr->dr_next;
		kmem_free(dr, sizeof (dbuf_dirty_record_t));
		ASSERT(db->db_dirtycnt > 0);
		db->db_dirtycnt -= 1;
		dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg);
		return;
	}

	os = dn->dn_objset;

	/*
	 * This function may have dropped the db_mtx lock allowing a dmu_sync
	 * operation to sneak in. As a result, we need to ensure that we
	 * don't check the dr_override_state until we have returned from
	 * dbuf_check_blkptr.
	 */
	dbuf_check_blkptr(dn, db);

	/*
	 * If this buffer is in the middle of an immediate write,
	 * wait for the synchronous IO to complete.
	 */
	while (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC) {
		ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT);
		cv_wait(&db->db_changed, &db->db_mtx);
		ASSERT(dr->dt.dl.dr_override_state != DR_NOT_OVERRIDDEN);
	}

	if (db->db_state != DB_NOFILL &&
	    dn->dn_object != DMU_META_DNODE_OBJECT &&
	    refcount_count(&db->db_holds) > 1 &&
	    dr->dt.dl.dr_override_state != DR_OVERRIDDEN &&
	    *datap == db->db_buf) {
		/*
		 * If this buffer is currently "in use" (i.e., there
		 * are active holds and db_data still references it),
		 * then make a copy before we start the write so that
		 * any modifications from the open txg will not leak
		 * into this write.
		 *
		 * NOTE: this copy does not need to be made for
		 * objects only modified in the syncing context (e.g.
		 * DMU_OT_DNODE blocks).
		 */
		int blksz = arc_buf_size(*datap);
		arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
		*datap = arc_buf_alloc(os->os_spa, blksz, db, type);
		bcopy(db->db.db_data, (*datap)->b_data, blksz);
	}
	db->db_data_pending = dr;

	mutex_exit(&db->db_mtx);

	dbuf_write(dr, *datap, tx);

	ASSERT(!list_link_active(&dr->dr_dirty_node));
	if (dn->dn_object == DMU_META_DNODE_OBJECT) {
		list_insert_tail(&dn->dn_dirty_records[txg&TXG_MASK], dr);
		DB_DNODE_EXIT(db);
	} else {
		/*
		 * Although zio_nowait() does not "wait for an IO", it does
		 * initiate the IO. If this is an empty write it seems plausible
		 * that the IO could actually be completed before the nowait
		 * returns. We need to DB_DNODE_EXIT() first in case
		 * zio_nowait() invalidates the dbuf.
		 */
		DB_DNODE_EXIT(db);
		zio_nowait(dr->dr_zio);
	}
}

void
dbuf_sync_list(list_t *list, dmu_tx_t *tx)
{
	dbuf_dirty_record_t *dr;

	while (dr = list_head(list)) {
		if (dr->dr_zio != NULL) {
			/*
			 * If we find an already initialized zio then we
			 * are processing the meta-dnode, and we have finished.
			 * The dbufs for all dnodes are put back on the list
			 * during processing, so that we can zio_wait()
			 * these IOs after initiating all child IOs.
			 */
			ASSERT3U(dr->dr_dbuf->db.db_object, ==,
			    DMU_META_DNODE_OBJECT);
			break;
		}
		list_remove(list, dr);
		if (dr->dr_dbuf->db_level > 0)
			dbuf_sync_indirect(dr, tx);
		else
			dbuf_sync_leaf(dr, tx);
	}
}

/* ARGSUSED */
static void
dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
{
	dmu_buf_impl_t *db = vdb;
	dnode_t *dn;
	blkptr_t *bp = zio->io_bp;
	blkptr_t *bp_orig = &zio->io_bp_orig;
	spa_t *spa = zio->io_spa;
	int64_t delta;
	uint64_t fill = 0;
	int i;

	ASSERT(db->db_blkptr == bp);

	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);
	delta = bp_get_dsize_sync(spa, bp) - bp_get_dsize_sync(spa, bp_orig);
	dnode_diduse_space(dn, delta - zio->io_prev_space_delta);
	zio->io_prev_space_delta = delta;

	if (BP_IS_HOLE(bp)) {
		ASSERT(bp->blk_fill == 0);
		DB_DNODE_EXIT(db);
		return;
	}

	ASSERT((db->db_blkid != DMU_SPILL_BLKID &&
	    BP_GET_TYPE(bp) == dn->dn_type) ||
	    (db->db_blkid == DMU_SPILL_BLKID &&
	    BP_GET_TYPE(bp) == dn->dn_bonustype));
	ASSERT(BP_GET_LEVEL(bp) == db->db_level);

	mutex_enter(&db->db_mtx);

#ifdef ZFS_DEBUG
	if (db->db_blkid == DMU_SPILL_BLKID) {
		ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR);
		ASSERT(!(BP_IS_HOLE(db->db_blkptr)) &&
		    db->db_blkptr == &dn->dn_phys->dn_spill);
	}
#endif

	if (db->db_level == 0) {
		mutex_enter(&dn->dn_mtx);
		if (db->db_blkid > dn->dn_phys->dn_maxblkid &&
		    db->db_blkid != DMU_SPILL_BLKID)
			dn->dn_phys->dn_maxblkid = db->db_blkid;
		mutex_exit(&dn->dn_mtx);

		if (dn->dn_type == DMU_OT_DNODE) {
			dnode_phys_t *dnp = db->db.db_data;
			for (i = db->db.db_size >> DNODE_SHIFT; i > 0;
			    i--, dnp++) {
				if (dnp->dn_type != DMU_OT_NONE)
					fill++;
			}
		} else {
			fill = 1;
		}
	} else {
		blkptr_t *ibp = db->db.db_data;
		ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
		for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, ibp++) {
			if (BP_IS_HOLE(ibp))
				continue;
			fill += ibp->blk_fill;
		}
	}
	DB_DNODE_EXIT(db);

	bp->blk_fill = fill;

	mutex_exit(&db->db_mtx);
}

/* ARGSUSED */
static void
dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
{
	dmu_buf_impl_t *db = vdb;
	blkptr_t *bp = zio->io_bp;
	blkptr_t *bp_orig = &zio->io_bp_orig;
	uint64_t txg = zio->io_txg;
	dbuf_dirty_record_t **drp, *dr;

	ASSERT0(zio->io_error);
	ASSERT(db->db_blkptr == bp);

	/*
	 * For nopwrites and rewrites we ensure that the bp matches our
	 * original and bypass all the accounting.
	 */
	if (zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE)) {
		ASSERT(BP_EQUAL(bp, bp_orig));
	} else {
		objset_t *os;
		dsl_dataset_t *ds;
		dmu_tx_t *tx;

		DB_GET_OBJSET(&os, db);
		ds = os->os_dsl_dataset;
		tx = os->os_synctx;

		(void) dsl_dataset_block_kill(ds, bp_orig, tx, B_TRUE);
		dsl_dataset_block_born(ds, bp, tx);
	}

	mutex_enter(&db->db_mtx);

	DBUF_VERIFY(db);

	drp = &db->db_last_dirty;
	while ((dr = *drp) != db->db_data_pending)
		drp = &dr->dr_next;
	ASSERT(!list_link_active(&dr->dr_dirty_node));
	ASSERT(dr->dr_txg == txg);
	ASSERT(dr->dr_dbuf == db);
	ASSERT(dr->dr_next == NULL);
	*drp = dr->dr_next;

#ifdef ZFS_DEBUG
	if (db->db_blkid == DMU_SPILL_BLKID) {
		dnode_t *dn;

		DB_DNODE_ENTER(db);
		dn = DB_DNODE(db);
		ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR);
		ASSERT(!(BP_IS_HOLE(db->db_blkptr)) &&
		    db->db_blkptr == &dn->dn_phys->dn_spill);
		DB_DNODE_EXIT(db);
	}
#endif

	if (db->db_level == 0) {
		ASSERT(db->db_blkid != DMU_BONUS_BLKID);
		ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN);
		if (db->db_state != DB_NOFILL) {
			if (dr->dt.dl.dr_data != db->db_buf)
				VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data,
				    db) == 1);
			else if (!arc_released(db->db_buf))
				arc_set_callback(db->db_buf, dbuf_do_evict, db);
		}
	} else {
		dnode_t *dn;

		DB_DNODE_ENTER(db);
		dn = DB_DNODE(db);
		ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
		ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
		if (!BP_IS_HOLE(db->db_blkptr)) {
			int epbs =
			    dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
			ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==,
			    db->db.db_size);
			ASSERT3U(dn->dn_phys->dn_maxblkid
			    >> (db->db_level * epbs), >=, db->db_blkid);
			arc_set_callback(db->db_buf, dbuf_do_evict, db);
		}
		DB_DNODE_EXIT(db);
		mutex_destroy(&dr->dt.di.dr_mtx);
		list_destroy(&dr->dt.di.dr_children);
	}
	kmem_free(dr, sizeof (dbuf_dirty_record_t));

	cv_broadcast(&db->db_changed);
	ASSERT(db->db_dirtycnt > 0);
	db->db_dirtycnt -= 1;
	db->db_data_pending = NULL;
	dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg);
}

static void
dbuf_write_nofill_ready(zio_t *zio)
{
	dbuf_write_ready(zio, NULL, zio->io_private);
}

static void
dbuf_write_nofill_done(zio_t *zio)
{
	dbuf_write_done(zio, NULL, zio->io_private);
}

static void
dbuf_write_override_ready(zio_t *zio)
{
	dbuf_dirty_record_t *dr = zio->io_private;
	dmu_buf_impl_t *db = dr->dr_dbuf;

	dbuf_write_ready(zio, NULL, db);
}

static void
dbuf_write_override_done(zio_t *zio)
{
	dbuf_dirty_record_t *dr = zio->io_private;
	dmu_buf_impl_t *db = dr->dr_dbuf;
	blkptr_t *obp = &dr->dt.dl.dr_overridden_by;

	mutex_enter(&db->db_mtx);
	if (!BP_EQUAL(zio->io_bp, obp)) {
		if (!BP_IS_HOLE(obp))
			dsl_free(spa_get_dsl(zio->io_spa), zio->io_txg, obp);
		arc_release(dr->dt.dl.dr_data, db);
	}
	mutex_exit(&db->db_mtx);

	dbuf_write_done(zio, NULL, db);
}

static void
dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
{
	dmu_buf_impl_t *db = dr->dr_dbuf;
	dnode_t *dn;
	objset_t *os;
	dmu_buf_impl_t *parent = db->db_parent;
	uint64_t txg = tx->tx_txg;
	zbookmark_t zb;
	zio_prop_t zp;
	zio_t *zio;
	int wp_flag = 0;

	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);
	os = dn->dn_objset;

	if (db->db_state != DB_NOFILL) {
		if (db->db_level > 0 || dn->dn_type == DMU_OT_DNODE) {
			/*
			 * Private object buffers are released here rather
			 * than in dbuf_dirty() since they are only modified
			 * in the syncing context and we don't want the
			 * overhead of making multiple copies of the data.
			 */
			if (BP_IS_HOLE(db->db_blkptr)) {
				arc_buf_thaw(data);
			} else {
				dbuf_release_bp(db);
			}
		}
	}

	if (parent != dn->dn_dbuf) {
		ASSERT(parent && parent->db_data_pending);
		ASSERT(db->db_level == parent->db_level-1);
		ASSERT(arc_released(parent->db_buf));
		zio = parent->db_data_pending->dr_zio;
	} else {
		ASSERT((db->db_level == dn->dn_phys->dn_nlevels-1 &&
		    db->db_blkid != DMU_SPILL_BLKID) ||
		    (db->db_blkid == DMU_SPILL_BLKID && db->db_level == 0));
		if (db->db_blkid != DMU_SPILL_BLKID)
			ASSERT3P(db->db_blkptr, ==,
			    &dn->dn_phys->dn_blkptr[db->db_blkid]);
		zio = dn->dn_zio;
	}

	ASSERT(db->db_level == 0 || data == db->db_buf);
	ASSERT3U(db->db_blkptr->blk_birth, <=, txg);
	ASSERT(zio);

	SET_BOOKMARK(&zb, os->os_dsl_dataset ?
	    os->os_dsl_dataset->ds_object : DMU_META_OBJSET,
	    db->db.db_object, db->db_level, db->db_blkid);

	if (db->db_blkid == DMU_SPILL_BLKID)
		wp_flag = WP_SPILL;
	wp_flag |= (db->db_state == DB_NOFILL) ? WP_NOFILL : 0;

	dmu_write_policy(os, dn, db->db_level, wp_flag, &zp);
	DB_DNODE_EXIT(db);

	if (db->db_level == 0 && dr->dt.dl.dr_override_state == DR_OVERRIDDEN) {
		ASSERT(db->db_state != DB_NOFILL);
		dr->dr_zio = zio_write(zio, os->os_spa, txg,
		    db->db_blkptr, data->b_data, arc_buf_size(data), &zp,
		    dbuf_write_override_ready, dbuf_write_override_done, dr,
		    ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
		mutex_enter(&db->db_mtx);
		dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
		zio_write_override(dr->dr_zio, &dr->dt.dl.dr_overridden_by,
		    dr->dt.dl.dr_copies, dr->dt.dl.dr_nopwrite);
		mutex_exit(&db->db_mtx);
	} else if (db->db_state == DB_NOFILL) {
		ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF);
		dr->dr_zio = zio_write(zio, os->os_spa, txg,
		    db->db_blkptr, NULL, db->db.db_size, &zp,
		    dbuf_write_nofill_ready, dbuf_write_nofill_done, db,
		    ZIO_PRIORITY_ASYNC_WRITE,
		    ZIO_FLAG_MUSTSUCCEED | ZIO_FLAG_NODATA, &zb);
	} else {
		ASSERT(arc_released(data));
		dr->dr_zio = arc_write(zio, os->os_spa, txg,
		    db->db_blkptr, data, DBUF_IS_L2CACHEABLE(db), &zp,
		    dbuf_write_ready, dbuf_write_done, db,
		    ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
	}
}