/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
 * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
 * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
 * Copyright (c) 2013, Joyent, Inc. All rights reserved.
 */

#include <sys/zfs_context.h>
#include <sys/dmu.h>
#include <sys/dmu_send.h>
#include <sys/dmu_impl.h>
#include <sys/dbuf.h>
#include <sys/dmu_objset.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_dir.h>
#include <sys/dmu_tx.h>
#include <sys/spa.h>
#include <sys/zio.h>
#include <sys/dmu_zfetch.h>
#include <sys/sa.h>
#include <sys/sa_impl.h>
#include <sys/zfeature.h>
#include <sys/blkptr.h>
#include <sys/range_tree.h>

/*
 * Number of times that zfs_free_range() took the slow path while doing
 * a zfs receive.  A nonzero value indicates a potential performance problem.
 */
uint64_t zfs_free_range_recv_miss;

static void dbuf_destroy(dmu_buf_impl_t *db);
static boolean_t dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx);

/*
 * Global data structures and functions for the dbuf cache.
 */
static kmem_cache_t *dbuf_cache;

/* ARGSUSED */
static int
dbuf_cons(void *vdb, void *unused, int kmflag)
{
	dmu_buf_impl_t *db = vdb;
	bzero(db, sizeof (dmu_buf_impl_t));

	mutex_init(&db->db_mtx, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&db->db_changed, NULL, CV_DEFAULT, NULL);
	refcount_create(&db->db_holds);
	return (0);
}

/* ARGSUSED */
static void
dbuf_dest(void *vdb, void *unused)
{
	dmu_buf_impl_t *db = vdb;
	mutex_destroy(&db->db_mtx);
	cv_destroy(&db->db_changed);
	refcount_destroy(&db->db_holds);
}

/*
 * dbuf hash table routines
 */
static dbuf_hash_table_t dbuf_hash_table;

static uint64_t dbuf_hash_count;

static uint64_t
dbuf_hash(void *os, uint64_t obj, uint8_t lvl, uint64_t blkid)
{
	uintptr_t osv = (uintptr_t)os;
	uint64_t crc = -1ULL;

	ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (lvl)) & 0xFF];
	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (osv >> 6)) & 0xFF];
	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 0)) & 0xFF];
	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 8)) & 0xFF];
	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 0)) & 0xFF];
	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 8)) & 0xFF];

	crc ^= (osv>>14) ^ (obj>>16) ^ (blkid>>16);

	return (crc);
}

#define	DBUF_HASH(os, obj, level, blkid) dbuf_hash(os, obj, level, blkid)

#define	DBUF_EQUAL(dbuf, os, obj, level, blkid)		\
	((dbuf)->db.db_object == (obj) &&		\
	(dbuf)->db_objset == (os) &&			\
	(dbuf)->db_level == (level) &&			\
	(dbuf)->db_blkid == (blkid))

dmu_buf_impl_t *
dbuf_find(dnode_t *dn, uint8_t level, uint64_t blkid)
{
	dbuf_hash_table_t *h = &dbuf_hash_table;
	objset_t *os = dn->dn_objset;
	uint64_t obj = dn->dn_object;
	uint64_t hv = DBUF_HASH(os, obj, level, blkid);
	uint64_t idx = hv & h->hash_table_mask;
	dmu_buf_impl_t *db;

	mutex_enter(DBUF_HASH_MUTEX(h, idx));
	for (db = h->hash_table[idx]; db != NULL; db = db->db_hash_next) {
		if (DBUF_EQUAL(db, os, obj, level, blkid)) {
			mutex_enter(&db->db_mtx);
			if (db->db_state != DB_EVICTING) {
				mutex_exit(DBUF_HASH_MUTEX(h, idx));
				return (db);
			}
			mutex_exit(&db->db_mtx);
		}
	}
	mutex_exit(DBUF_HASH_MUTEX(h, idx));
	return (NULL);
}

/*
 * Insert an entry into the hash table.  If there is already an element
 * equal to elem in the hash table, then the already existing element
 * will be returned and the new element will not be inserted.
 * Otherwise returns NULL.
 */
static dmu_buf_impl_t *
dbuf_hash_insert(dmu_buf_impl_t *db)
{
	dbuf_hash_table_t *h = &dbuf_hash_table;
	objset_t *os = db->db_objset;
	uint64_t obj = db->db.db_object;
	int level = db->db_level;
	uint64_t blkid = db->db_blkid;
	uint64_t hv = DBUF_HASH(os, obj, level, blkid);
	uint64_t idx = hv & h->hash_table_mask;
	dmu_buf_impl_t *dbf;

	mutex_enter(DBUF_HASH_MUTEX(h, idx));
	for (dbf = h->hash_table[idx]; dbf != NULL; dbf = dbf->db_hash_next) {
		if (DBUF_EQUAL(dbf, os, obj, level, blkid)) {
			mutex_enter(&dbf->db_mtx);
			if (dbf->db_state != DB_EVICTING) {
				mutex_exit(DBUF_HASH_MUTEX(h, idx));
				return (dbf);
			}
			mutex_exit(&dbf->db_mtx);
		}
	}

	mutex_enter(&db->db_mtx);
	db->db_hash_next = h->hash_table[idx];
	h->hash_table[idx] = db;
	mutex_exit(DBUF_HASH_MUTEX(h, idx));
	atomic_add_64(&dbuf_hash_count, 1);

	return (NULL);
}

/*
 * Remove an entry from the hash table.  It must be in the EVICTING state.
 */
static void
dbuf_hash_remove(dmu_buf_impl_t *db)
{
	dbuf_hash_table_t *h = &dbuf_hash_table;
	uint64_t hv = DBUF_HASH(db->db_objset, db->db.db_object,
	    db->db_level, db->db_blkid);
	uint64_t idx = hv & h->hash_table_mask;
	dmu_buf_impl_t *dbf, **dbp;

	/*
	 * We mustn't hold db_mtx to maintain lock ordering:
	 * DBUF_HASH_MUTEX > db_mtx.
	 */
	ASSERT(refcount_is_zero(&db->db_holds));
	ASSERT(db->db_state == DB_EVICTING);
	ASSERT(!MUTEX_HELD(&db->db_mtx));

	mutex_enter(DBUF_HASH_MUTEX(h, idx));
	dbp = &h->hash_table[idx];
	while ((dbf = *dbp) != db) {
		dbp = &dbf->db_hash_next;
		ASSERT(dbf != NULL);
	}
	*dbp = db->db_hash_next;
	db->db_hash_next = NULL;
	mutex_exit(DBUF_HASH_MUTEX(h, idx));
	atomic_add_64(&dbuf_hash_count, -1);
}

static arc_evict_func_t dbuf_do_evict;

static void
dbuf_evict_user(dmu_buf_impl_t *db)
{
	ASSERT(MUTEX_HELD(&db->db_mtx));

	if (db->db_level != 0 || db->db_evict_func == NULL)
		return;

	if (db->db_user_data_ptr_ptr)
		*db->db_user_data_ptr_ptr = db->db.db_data;
	db->db_evict_func(&db->db, db->db_user_ptr);
	db->db_user_ptr = NULL;
	db->db_user_data_ptr_ptr = NULL;
	db->db_evict_func = NULL;
}

boolean_t
dbuf_is_metadata(dmu_buf_impl_t *db)
{
	if (db->db_level > 0) {
		return (B_TRUE);
	} else {
		boolean_t is_metadata;

		DB_DNODE_ENTER(db);
		is_metadata = DMU_OT_IS_METADATA(DB_DNODE(db)->dn_type);
		DB_DNODE_EXIT(db);

		return (is_metadata);
	}
}

void
dbuf_evict(dmu_buf_impl_t *db)
{
	ASSERT(MUTEX_HELD(&db->db_mtx));
	ASSERT(db->db_buf == NULL);
	ASSERT(db->db_data_pending == NULL);

	dbuf_clear(db);
	dbuf_destroy(db);
}

void
dbuf_init(void)
{
	uint64_t hsize = 1ULL << 16;
	dbuf_hash_table_t *h = &dbuf_hash_table;
	int i;

	/*
	 * The hash table is big enough to fill all of physical memory
	 * with an average 4K block size.  The table will take up
	 * totalmem*sizeof(void*)/4K (i.e. 2MB/GB with 8-byte pointers).
	 */
	while (hsize * 4096 < physmem * PAGESIZE)
		hsize <<= 1;

retry:
	h->hash_table_mask = hsize - 1;
	h->hash_table = kmem_zalloc(hsize * sizeof (void *), KM_NOSLEEP);
	if (h->hash_table == NULL) {
		/* XXX - we should really return an error instead of assert */
		ASSERT(hsize > (1ULL << 10));
		hsize >>= 1;
		goto retry;
	}

	dbuf_cache = kmem_cache_create("dmu_buf_impl_t",
	    sizeof (dmu_buf_impl_t),
	    0, dbuf_cons, dbuf_dest, NULL, NULL, NULL, 0);

	for (i = 0; i < DBUF_MUTEXES; i++)
		mutex_init(&h->hash_mutexes[i], NULL, MUTEX_DEFAULT, NULL);
}

void
dbuf_fini(void)
{
	dbuf_hash_table_t *h = &dbuf_hash_table;
	int i;

	for (i = 0; i < DBUF_MUTEXES; i++)
		mutex_destroy(&h->hash_mutexes[i]);
	kmem_free(h->hash_table, (h->hash_table_mask + 1) * sizeof (void *));
	kmem_cache_destroy(dbuf_cache);
}

/*
 * Other stuff.
 */

#ifdef ZFS_DEBUG
static void
dbuf_verify(dmu_buf_impl_t *db)
{
	dnode_t *dn;
	dbuf_dirty_record_t *dr;

	ASSERT(MUTEX_HELD(&db->db_mtx));

	if (!(zfs_flags & ZFS_DEBUG_DBUF_VERIFY))
		return;

	ASSERT(db->db_objset != NULL);
	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);
	if (dn == NULL) {
		ASSERT(db->db_parent == NULL);
		ASSERT(db->db_blkptr == NULL);
	} else {
		ASSERT3U(db->db.db_object, ==, dn->dn_object);
		ASSERT3P(db->db_objset, ==, dn->dn_objset);
		ASSERT3U(db->db_level, <, dn->dn_nlevels);
		ASSERT(db->db_blkid == DMU_BONUS_BLKID ||
		    db->db_blkid == DMU_SPILL_BLKID ||
		    !list_is_empty(&dn->dn_dbufs));
	}
	if (db->db_blkid == DMU_BONUS_BLKID) {
		ASSERT(dn != NULL);
		ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
		ASSERT3U(db->db.db_offset, ==, DMU_BONUS_BLKID);
	} else if (db->db_blkid == DMU_SPILL_BLKID) {
		ASSERT(dn != NULL);
		ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
		ASSERT0(db->db.db_offset);
	} else {
		ASSERT3U(db->db.db_offset, ==, db->db_blkid * db->db.db_size);
	}

	for (dr = db->db_data_pending; dr != NULL; dr = dr->dr_next)
		ASSERT(dr->dr_dbuf == db);

	for (dr = db->db_last_dirty; dr != NULL; dr = dr->dr_next)
		ASSERT(dr->dr_dbuf == db);

	/*
	 * We can't assert that db_size matches dn_datablksz because it
	 * can be momentarily different when another thread is doing
	 * dnode_set_blksz().
	 */
	if (db->db_level == 0 && db->db.db_object == DMU_META_DNODE_OBJECT) {
		dr = db->db_data_pending;
		/*
		 * It should only be modified in syncing context, so
		 * make sure we only have one copy of the data.
		 */
		ASSERT(dr == NULL || dr->dt.dl.dr_data == db->db_buf);
	}

	/* verify db->db_blkptr */
	if (db->db_blkptr) {
		if (db->db_parent == dn->dn_dbuf) {
			/* db is pointed to by the dnode */
			/* ASSERT3U(db->db_blkid, <, dn->dn_nblkptr); */
			if (DMU_OBJECT_IS_SPECIAL(db->db.db_object))
				ASSERT(db->db_parent == NULL);
			else
				ASSERT(db->db_parent != NULL);
			if (db->db_blkid != DMU_SPILL_BLKID)
				ASSERT3P(db->db_blkptr, ==,
				    &dn->dn_phys->dn_blkptr[db->db_blkid]);
		} else {
			/* db is pointed to by an indirect block */
			int epb = db->db_parent->db.db_size >> SPA_BLKPTRSHIFT;
			ASSERT3U(db->db_parent->db_level, ==, db->db_level+1);
			ASSERT3U(db->db_parent->db.db_object, ==,
			    db->db.db_object);
			/*
			 * dnode_grow_indblksz() can make this fail if we don't
			 * have the struct_rwlock.  XXX indblksz no longer
			 * grows.  safe to do this now?
			 */
			if (RW_WRITE_HELD(&dn->dn_struct_rwlock)) {
				ASSERT3P(db->db_blkptr, ==,
				    ((blkptr_t *)db->db_parent->db.db_data +
				    db->db_blkid % epb));
			}
		}
	}
	if ((db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr)) &&
	    (db->db_buf == NULL || db->db_buf->b_data) &&
	    db->db.db_data && db->db_blkid != DMU_BONUS_BLKID &&
	    db->db_state != DB_FILL && !dn->dn_free_txg) {
		/*
		 * If the blkptr isn't set but they have nonzero data,
		 * it had better be dirty, otherwise we'll lose that
		 * data when we evict this buffer.
		 */
		if (db->db_dirtycnt == 0) {
			uint64_t *buf = db->db.db_data;
			int i;

			for (i = 0; i < db->db.db_size >> 3; i++) {
				ASSERT(buf[i] == 0);
			}
		}
	}
	DB_DNODE_EXIT(db);
}
#endif

static void
dbuf_update_data(dmu_buf_impl_t *db)
{
	ASSERT(MUTEX_HELD(&db->db_mtx));
	if (db->db_level == 0 && db->db_user_data_ptr_ptr) {
		ASSERT(!refcount_is_zero(&db->db_holds));
		*db->db_user_data_ptr_ptr = db->db.db_data;
	}
}

static void
dbuf_set_data(dmu_buf_impl_t *db, arc_buf_t *buf)
{
	ASSERT(MUTEX_HELD(&db->db_mtx));
	db->db_buf = buf;
	if (buf != NULL) {
		ASSERT(buf->b_data != NULL);
		db->db.db_data = buf->b_data;
		if (!arc_released(buf))
			arc_set_callback(buf, dbuf_do_evict, db);
		dbuf_update_data(db);
	} else {
		dbuf_evict_user(db);
		db->db.db_data = NULL;
		if (db->db_state != DB_NOFILL)
			db->db_state = DB_UNCACHED;
	}
}

/*
 * Loan out an arc_buf for read.  Return the loaned arc_buf.
 */
arc_buf_t *
dbuf_loan_arcbuf(dmu_buf_impl_t *db)
{
	arc_buf_t *abuf;

	mutex_enter(&db->db_mtx);
	if (arc_released(db->db_buf) || refcount_count(&db->db_holds) > 1) {
		int blksz = db->db.db_size;
		spa_t *spa = db->db_objset->os_spa;

		mutex_exit(&db->db_mtx);
		abuf = arc_loan_buf(spa, blksz);
		bcopy(db->db.db_data, abuf->b_data, blksz);
	} else {
		abuf = db->db_buf;
		arc_loan_inuse_buf(abuf, db);
		dbuf_set_data(db, NULL);
		mutex_exit(&db->db_mtx);
	}
	return (abuf);
}

uint64_t
dbuf_whichblock(dnode_t *dn, uint64_t offset)
{
	if (dn->dn_datablkshift) {
		return (offset >> dn->dn_datablkshift);
	} else {
		ASSERT3U(offset, <, dn->dn_datablksz);
		return (0);
	}
}

static void
dbuf_read_done(zio_t *zio, arc_buf_t *buf, void *vdb)
{
	dmu_buf_impl_t *db = vdb;

	mutex_enter(&db->db_mtx);
	ASSERT3U(db->db_state, ==, DB_READ);
	/*
	 * All reads are synchronous, so we must have a hold on the dbuf
	 */
	ASSERT(refcount_count(&db->db_holds) > 0);
	ASSERT(db->db_buf == NULL);
	ASSERT(db->db.db_data == NULL);
	if (db->db_level == 0 && db->db_freed_in_flight) {
		/* we were freed in flight; disregard any error */
		arc_release(buf, db);
		bzero(buf->b_data, db->db.db_size);
		arc_buf_freeze(buf);
		db->db_freed_in_flight = FALSE;
		dbuf_set_data(db, buf);
		db->db_state = DB_CACHED;
	} else if (zio == NULL || zio->io_error == 0) {
		dbuf_set_data(db, buf);
		db->db_state = DB_CACHED;
	} else {
		ASSERT(db->db_blkid != DMU_BONUS_BLKID);
		ASSERT3P(db->db_buf, ==, NULL);
		VERIFY(arc_buf_remove_ref(buf, db));
		db->db_state = DB_UNCACHED;
	}
	cv_broadcast(&db->db_changed);
	dbuf_rele_and_unlock(db, NULL);
}

static void
dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags)
{
	dnode_t *dn;
	zbookmark_phys_t zb;
	uint32_t aflags = ARC_NOWAIT;

	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);
	ASSERT(!refcount_is_zero(&db->db_holds));
	/* We need the struct_rwlock to prevent db_blkptr from changing. */
	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
	ASSERT(MUTEX_HELD(&db->db_mtx));
	ASSERT(db->db_state == DB_UNCACHED);
	ASSERT(db->db_buf == NULL);

	if (db->db_blkid == DMU_BONUS_BLKID) {
		int bonuslen = MIN(dn->dn_bonuslen, dn->dn_phys->dn_bonuslen);

		ASSERT3U(bonuslen, <=, db->db.db_size);
		db->db.db_data = zio_buf_alloc(DN_MAX_BONUSLEN);
		arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
		if (bonuslen < DN_MAX_BONUSLEN)
			bzero(db->db.db_data, DN_MAX_BONUSLEN);
		if (bonuslen)
			bcopy(DN_BONUS(dn->dn_phys), db->db.db_data, bonuslen);
		DB_DNODE_EXIT(db);
		dbuf_update_data(db);
		db->db_state = DB_CACHED;
		mutex_exit(&db->db_mtx);
		return;
	}

	/*
	 * Recheck BP_IS_HOLE() after dnode_block_freed() in case dnode_sync()
	 * processes the delete record and clears the bp while we are waiting
	 * for the dn_mtx (resulting in a "no" from block_freed).
	 */
	if (db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr) ||
	    (db->db_level == 0 && (dnode_block_freed(dn, db->db_blkid) ||
	    BP_IS_HOLE(db->db_blkptr)))) {
		arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);

		DB_DNODE_EXIT(db);
		dbuf_set_data(db, arc_buf_alloc(db->db_objset->os_spa,
		    db->db.db_size, db, type));
		bzero(db->db.db_data, db->db.db_size);
		db->db_state = DB_CACHED;
		*flags |= DB_RF_CACHED;
		mutex_exit(&db->db_mtx);
		return;
	}

	DB_DNODE_EXIT(db);

	db->db_state = DB_READ;
	mutex_exit(&db->db_mtx);

	if (DBUF_IS_L2CACHEABLE(db))
		aflags |= ARC_L2CACHE;
	if (DBUF_IS_L2COMPRESSIBLE(db))
		aflags |= ARC_L2COMPRESS;

	SET_BOOKMARK(&zb, db->db_objset->os_dsl_dataset ?
	    db->db_objset->os_dsl_dataset->ds_object : DMU_META_OBJSET,
	    db->db.db_object, db->db_level, db->db_blkid);

	dbuf_add_ref(db, NULL);

	(void) arc_read(zio, db->db_objset->os_spa, db->db_blkptr,
	    dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ,
	    (*flags & DB_RF_CANFAIL) ? ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED,
	    &aflags, &zb);
	if (aflags & ARC_CACHED)
		*flags |= DB_RF_CACHED;
}

int
dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
{
	int err = 0;
	boolean_t havepzio = (zio != NULL);
	boolean_t prefetch;
	dnode_t *dn;

	/*
	 * We don't have to hold the mutex to check db_state because it
	 * can't be freed while we have a hold on the buffer.
	 */
	ASSERT(!refcount_is_zero(&db->db_holds));

	if (db->db_state == DB_NOFILL)
		return (SET_ERROR(EIO));

	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);
	if ((flags & DB_RF_HAVESTRUCT) == 0)
		rw_enter(&dn->dn_struct_rwlock, RW_READER);

	prefetch = db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
	    (flags & DB_RF_NOPREFETCH) == 0 && dn != NULL &&
	    DBUF_IS_CACHEABLE(db);

	mutex_enter(&db->db_mtx);
	if (db->db_state == DB_CACHED) {
		mutex_exit(&db->db_mtx);
		if (prefetch)
			dmu_zfetch(&dn->dn_zfetch, db->db.db_offset,
			    db->db.db_size, TRUE);
		if ((flags & DB_RF_HAVESTRUCT) == 0)
			rw_exit(&dn->dn_struct_rwlock);
		DB_DNODE_EXIT(db);
	} else if (db->db_state == DB_UNCACHED) {
		spa_t *spa = dn->dn_objset->os_spa;

		if (zio == NULL)
			zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
		dbuf_read_impl(db, zio, &flags);

		/* dbuf_read_impl has dropped db_mtx for us */

		if (prefetch)
			dmu_zfetch(&dn->dn_zfetch, db->db.db_offset,
			    db->db.db_size, flags & DB_RF_CACHED);

		if ((flags & DB_RF_HAVESTRUCT) == 0)
			rw_exit(&dn->dn_struct_rwlock);
		DB_DNODE_EXIT(db);

		if (!havepzio)
			err = zio_wait(zio);
	} else {
		/*
		 * Another reader came in while the dbuf was in flight
		 * between UNCACHED and CACHED.  Either a writer will finish
		 * writing the buffer (sending the dbuf to CACHED) or the
		 * first reader's request will reach the read_done callback
		 * and send the dbuf to CACHED.  Otherwise, a failure
		 * occurred and the dbuf went to UNCACHED.
		 */
		mutex_exit(&db->db_mtx);
		if (prefetch)
			dmu_zfetch(&dn->dn_zfetch, db->db.db_offset,
			    db->db.db_size, TRUE);
		if ((flags & DB_RF_HAVESTRUCT) == 0)
			rw_exit(&dn->dn_struct_rwlock);
		DB_DNODE_EXIT(db);

		/* Skip the wait per the caller's request. */
		mutex_enter(&db->db_mtx);
		if ((flags & DB_RF_NEVERWAIT) == 0) {
			while (db->db_state == DB_READ ||
			    db->db_state == DB_FILL) {
				ASSERT(db->db_state == DB_READ ||
				    (flags & DB_RF_HAVESTRUCT) == 0);
				cv_wait(&db->db_changed, &db->db_mtx);
			}
			if (db->db_state == DB_UNCACHED)
				err = SET_ERROR(EIO);
		}
		mutex_exit(&db->db_mtx);
	}

	ASSERT(err || havepzio || db->db_state == DB_CACHED);
	return (err);
}

static void
dbuf_noread(dmu_buf_impl_t *db)
{
	ASSERT(!refcount_is_zero(&db->db_holds));
	ASSERT(db->db_blkid != DMU_BONUS_BLKID);
	mutex_enter(&db->db_mtx);
	while (db->db_state == DB_READ || db->db_state == DB_FILL)
		cv_wait(&db->db_changed, &db->db_mtx);
	if (db->db_state == DB_UNCACHED) {
		arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
		spa_t *spa = db->db_objset->os_spa;

		ASSERT(db->db_buf == NULL);
		ASSERT(db->db.db_data == NULL);
		dbuf_set_data(db, arc_buf_alloc(spa, db->db.db_size, db, type));
		db->db_state = DB_FILL;
	} else if (db->db_state == DB_NOFILL) {
		dbuf_set_data(db, NULL);
	} else {
		ASSERT3U(db->db_state, ==, DB_CACHED);
	}
	mutex_exit(&db->db_mtx);
}

/*
 * This is our just-in-time copy function.  It makes a copy of
 * buffers, that have been modified in a previous transaction
 * group, before we modify them in the current active group.
 *
 * This function is used in two places: when we are dirtying a
 * buffer for the first time in a txg, and when we are freeing
 * a range in a dnode that includes this buffer.
 *
 * Note that when we are called from dbuf_free_range() we do
 * not put a hold on the buffer, we just traverse the active
 * dbuf list for the dnode.
 */
static void
dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg)
{
	dbuf_dirty_record_t *dr = db->db_last_dirty;

	ASSERT(MUTEX_HELD(&db->db_mtx));
	ASSERT(db->db.db_data != NULL);
	ASSERT(db->db_level == 0);
	ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT);

	if (dr == NULL ||
	    (dr->dt.dl.dr_data !=
	    ((db->db_blkid == DMU_BONUS_BLKID) ? db->db.db_data : db->db_buf)))
		return;

	/*
	 * If the last dirty record for this dbuf has not yet synced
	 * and it's referencing the dbuf data, either:
	 *	reset the reference to point to a new copy,
	 * or (if there are no active holders)
	 *	just null out the current db_data pointer.
	 */
	ASSERT(dr->dr_txg >= txg - 2);
	if (db->db_blkid == DMU_BONUS_BLKID) {
		/* Note that the data bufs here are zio_bufs */
		dr->dt.dl.dr_data = zio_buf_alloc(DN_MAX_BONUSLEN);
		arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
		bcopy(db->db.db_data, dr->dt.dl.dr_data, DN_MAX_BONUSLEN);
	} else if (refcount_count(&db->db_holds) > db->db_dirtycnt) {
		int size = db->db.db_size;
		arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
		spa_t *spa = db->db_objset->os_spa;

		dr->dt.dl.dr_data = arc_buf_alloc(spa, size, db, type);
		bcopy(db->db.db_data, dr->dt.dl.dr_data->b_data, size);
	} else {
		dbuf_set_data(db, NULL);
	}
}

void
dbuf_unoverride(dbuf_dirty_record_t *dr)
{
	dmu_buf_impl_t *db = dr->dr_dbuf;
	blkptr_t *bp = &dr->dt.dl.dr_overridden_by;
	uint64_t txg = dr->dr_txg;

	ASSERT(MUTEX_HELD(&db->db_mtx));
	ASSERT(dr->dt.dl.dr_override_state != DR_IN_DMU_SYNC);
	ASSERT(db->db_level == 0);

	if (db->db_blkid == DMU_BONUS_BLKID ||
	    dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN)
		return;

	ASSERT(db->db_data_pending != dr);

	/* free this block */
	if (!BP_IS_HOLE(bp) && !dr->dt.dl.dr_nopwrite)
		zio_free(db->db_objset->os_spa, txg, bp);

	dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
	dr->dt.dl.dr_nopwrite = B_FALSE;

	/*
	 * Release the already-written buffer, so we leave it in
	 * a consistent dirty state.  Note that all callers are
	 * modifying the buffer, so they will immediately do
	 * another (redundant) arc_release().  Therefore, leave
	 * the buf thawed to save the effort of freezing &
	 * immediately re-thawing it.
	 */
	arc_release(dr->dt.dl.dr_data, db);
}

/*
 * Evict (if it's unreferenced) or clear (if it's referenced) any level-0
 * data blocks in the free range, so that any future readers will find
 * empty blocks.
 *
 * This is a no-op if the dataset is in the middle of an incremental
 * receive; see comment below for details.
 */
void
dbuf_free_range(dnode_t *dn, uint64_t start, uint64_t end, dmu_tx_t *tx)
{
	dmu_buf_impl_t *db, *db_next;
	uint64_t txg = tx->tx_txg;

	if (end > dn->dn_maxblkid && (end != DMU_SPILL_BLKID))
		end = dn->dn_maxblkid;
	dprintf_dnode(dn, "start=%llu end=%llu\n", start, end);

	mutex_enter(&dn->dn_dbufs_mtx);
	if (start >= dn->dn_unlisted_l0_blkid * dn->dn_datablksz) {
		/* There can't be any dbufs in this range; no need to search. */
		mutex_exit(&dn->dn_dbufs_mtx);
		return;
	} else if (dmu_objset_is_receiving(dn->dn_objset)) {
		/*
		 * If we are receiving, we expect there to be no dbufs in
		 * the range to be freed, because receive modifies each
		 * block at most once, and in offset order.  If this is
		 * not the case, it can lead to performance problems,
		 * so note that we unexpectedly took the slow path.
		 */
		atomic_inc_64(&zfs_free_range_recv_miss);
	}

	for (db = list_head(&dn->dn_dbufs); db != NULL; db = db_next) {
		db_next = list_next(&dn->dn_dbufs, db);
		ASSERT(db->db_blkid != DMU_BONUS_BLKID);

		if (db->db_level != 0)
			continue;
		if (db->db_blkid < start || db->db_blkid > end)
			continue;

		/* found a level 0 buffer in the range */
		mutex_enter(&db->db_mtx);
		if (dbuf_undirty(db, tx)) {
			/* mutex has been dropped and dbuf destroyed */
			continue;
		}

		if (db->db_state == DB_UNCACHED ||
		    db->db_state == DB_NOFILL ||
		    db->db_state == DB_EVICTING) {
			ASSERT(db->db.db_data == NULL);
			mutex_exit(&db->db_mtx);
			continue;
		}
		if (db->db_state == DB_READ || db->db_state == DB_FILL) {
			/* will be handled in dbuf_read_done or dbuf_rele */
			db->db_freed_in_flight = TRUE;
			mutex_exit(&db->db_mtx);
			continue;
		}
		if (refcount_count(&db->db_holds) == 0) {
			ASSERT(db->db_buf);
			dbuf_clear(db);
			continue;
		}
		/* The dbuf is referenced */

		if (db->db_last_dirty != NULL) {
			dbuf_dirty_record_t *dr = db->db_last_dirty;

			if (dr->dr_txg == txg) {
				/*
				 * This buffer is "in-use", re-adjust the file
				 * size to reflect that this buffer may
				 * contain new data when we sync.
				 */
				if (db->db_blkid != DMU_SPILL_BLKID &&
				    db->db_blkid > dn->dn_maxblkid)
					dn->dn_maxblkid = db->db_blkid;
				dbuf_unoverride(dr);
			} else {
				/*
				 * This dbuf is not dirty in the open context.
				 * Either uncache it (if it's not referenced in
				 * the open context) or reset its contents to
				 * empty.
				 */
				dbuf_fix_old_data(db, txg);
			}
		}
		/* clear the contents if it's cached */
		if (db->db_state == DB_CACHED) {
			ASSERT(db->db.db_data != NULL);
			arc_release(db->db_buf, db);
			bzero(db->db.db_data, db->db.db_size);
			arc_buf_freeze(db->db_buf);
		}

		mutex_exit(&db->db_mtx);
	}
	mutex_exit(&dn->dn_dbufs_mtx);
}

static int
dbuf_block_freeable(dmu_buf_impl_t *db)
{
	dsl_dataset_t *ds = db->db_objset->os_dsl_dataset;
	uint64_t birth_txg = 0;

	/*
	 * We don't need any locking to protect db_blkptr:
	 * If it's syncing, then db_last_dirty will be set
	 * so we'll ignore db_blkptr.
	 *
	 * This logic ensures that only block births for
	 * filled blocks are considered.
	 */
	ASSERT(MUTEX_HELD(&db->db_mtx));
	if (db->db_last_dirty && (db->db_blkptr == NULL ||
	    !BP_IS_HOLE(db->db_blkptr))) {
		birth_txg = db->db_last_dirty->dr_txg;
	} else if (db->db_blkptr != NULL && !BP_IS_HOLE(db->db_blkptr)) {
		birth_txg = db->db_blkptr->blk_birth;
	}

	/*
	 * If this block doesn't exist or is in a snapshot, it can't be freed.
	 * Don't pass the bp to dsl_dataset_block_freeable() since we
	 * are holding the db_mtx lock and might deadlock if we are
	 * prefetching a dedup-ed block.
	 */
	if (birth_txg != 0)
		return (ds == NULL ||
		    dsl_dataset_block_freeable(ds, NULL, birth_txg));
	else
		return (B_FALSE);
}

void
dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx)
{
	arc_buf_t *buf, *obuf;
	int osize = db->db.db_size;
	arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
	dnode_t *dn;

	ASSERT(db->db_blkid != DMU_BONUS_BLKID);

	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);

	/* XXX does *this* func really need the lock? */
	ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));

	/*
	 * This call to dmu_buf_will_dirty() with the dn_struct_rwlock held
	 * is OK, because there can be no other references to the db
	 * when we are changing its size, so no concurrent DB_FILL can
	 * be happening.
	 */
	/*
	 * XXX we should be doing a dbuf_read, checking the return
	 * value and returning that up to our callers
	 */
	dmu_buf_will_dirty(&db->db, tx);

	/* create the data buffer for the new block */
	buf = arc_buf_alloc(dn->dn_objset->os_spa, size, db, type);

	/* copy old block data to the new block */
	obuf = db->db_buf;
	bcopy(obuf->b_data, buf->b_data, MIN(osize, size));
	/* zero the remainder */
	if (size > osize)
		bzero((uint8_t *)buf->b_data + osize, size - osize);

	mutex_enter(&db->db_mtx);
	dbuf_set_data(db, buf);
	VERIFY(arc_buf_remove_ref(obuf, db));
	db->db.db_size = size;

	if (db->db_level == 0) {
		ASSERT3U(db->db_last_dirty->dr_txg, ==, tx->tx_txg);
		db->db_last_dirty->dt.dl.dr_data = buf;
	}
	mutex_exit(&db->db_mtx);

	dnode_willuse_space(dn, size-osize, tx);
	DB_DNODE_EXIT(db);
}

void
dbuf_release_bp(dmu_buf_impl_t *db)
{
	objset_t *os = db->db_objset;

	ASSERT(dsl_pool_sync_context(dmu_objset_pool(os)));
	ASSERT(arc_released(os->os_phys_buf) ||
	    list_link_active(&os->os_dsl_dataset->ds_synced_link));
	ASSERT(db->db_parent == NULL || arc_released(db->db_parent->db_buf));

	(void) arc_release(db->db_buf, db);
}

dbuf_dirty_record_t *
dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
{
	dnode_t *dn;
	objset_t *os;
	dbuf_dirty_record_t **drp, *dr;
	int drop_struct_lock = FALSE;
	boolean_t do_free_accounting = B_FALSE;
	int txgoff = tx->tx_txg & TXG_MASK;

	ASSERT(tx->tx_txg != 0);
	ASSERT(!refcount_is_zero(&db->db_holds));
	DMU_TX_DIRTY_BUF(tx, db);

	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);
	/*
	 * Shouldn't dirty a regular buffer in syncing context.  Private
	 * objects may be dirtied in syncing context, but only if they
	 * were already pre-dirtied in open context.
	 */
	ASSERT(!dmu_tx_is_syncing(tx) ||
	    BP_IS_HOLE(dn->dn_objset->os_rootbp) ||
	    DMU_OBJECT_IS_SPECIAL(dn->dn_object) ||
	    dn->dn_objset->os_dsl_dataset == NULL);
	/*
	 * We make this assert for private objects as well, but after we
	 * check if we're already dirty.  They are allowed to re-dirty
	 * in syncing context.
	 */
	ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT ||
	    dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx ==
	    (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN));

	mutex_enter(&db->db_mtx);
	/*
	 * XXX make this true for indirects too?  The problem is that
	 * transactions created with dmu_tx_create_assigned() from
	 * syncing context don't bother holding ahead.
	 */
	ASSERT(db->db_level != 0 ||
	    db->db_state == DB_CACHED || db->db_state == DB_FILL ||
	    db->db_state == DB_NOFILL);

	mutex_enter(&dn->dn_mtx);
	/*
	 * Don't set dirtyctx to SYNC if we're just modifying this as we
	 * initialize the objset.
	 */
	if (dn->dn_dirtyctx == DN_UNDIRTIED &&
	    !BP_IS_HOLE(dn->dn_objset->os_rootbp)) {
		dn->dn_dirtyctx =
		    (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN);
		ASSERT(dn->dn_dirtyctx_firstset == NULL);
		dn->dn_dirtyctx_firstset = kmem_alloc(1, KM_SLEEP);
	}
	mutex_exit(&dn->dn_mtx);

	if (db->db_blkid == DMU_SPILL_BLKID)
		dn->dn_have_spill = B_TRUE;

	/*
	 * If this buffer is already dirty, we're done.
	 */
	drp = &db->db_last_dirty;
	ASSERT(*drp == NULL || (*drp)->dr_txg <= tx->tx_txg ||
	    db->db.db_object == DMU_META_DNODE_OBJECT);
	while ((dr = *drp) != NULL && dr->dr_txg > tx->tx_txg)
		drp = &dr->dr_next;
	if (dr && dr->dr_txg == tx->tx_txg) {
		DB_DNODE_EXIT(db);

		if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID) {
			/*
			 * If this buffer has already been written out,
			 * we now need to reset its state.
			 */
			dbuf_unoverride(dr);
			if (db->db.db_object != DMU_META_DNODE_OBJECT &&
			    db->db_state != DB_NOFILL)
				arc_buf_thaw(db->db_buf);
		}
		mutex_exit(&db->db_mtx);
		return (dr);
	}

	/*
	 * Only valid if not already dirty.
	 */
	ASSERT(dn->dn_object == 0 ||
	    dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx ==
	    (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN));

	ASSERT3U(dn->dn_nlevels, >, db->db_level);
	ASSERT((dn->dn_phys->dn_nlevels == 0 && db->db_level == 0) ||
	    dn->dn_phys->dn_nlevels > db->db_level ||
	    dn->dn_next_nlevels[txgoff] > db->db_level ||
	    dn->dn_next_nlevels[(tx->tx_txg-1) & TXG_MASK] > db->db_level ||
	    dn->dn_next_nlevels[(tx->tx_txg-2) & TXG_MASK] > db->db_level);

	/*
	 * We should only be dirtying in syncing context if it's the
	 * mos or we're initializing the os or it's a special object.
	 * However, we are allowed to dirty in syncing context provided
	 * we already dirtied it in open context.  Hence we must make
	 * this assertion only if we're not already dirty.
	 */
	os = dn->dn_objset;
	ASSERT(!dmu_tx_is_syncing(tx) || DMU_OBJECT_IS_SPECIAL(dn->dn_object) ||
	    os->os_dsl_dataset == NULL || BP_IS_HOLE(os->os_rootbp));
	ASSERT(db->db.db_size != 0);

	dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);

	if (db->db_blkid != DMU_BONUS_BLKID) {
		/*
		 * Update the accounting.
		 * Note: we delay "free accounting" until after we drop
		 * the db_mtx.  This keeps us from grabbing other locks
		 * (and possibly deadlocking) in bp_get_dsize() while
		 * also holding the db_mtx.
		 */
		dnode_willuse_space(dn, db->db.db_size, tx);
		do_free_accounting = dbuf_block_freeable(db);
	}

	/*
	 * If this buffer is dirty in an old transaction group we need
	 * to make a copy of it so that the changes we make in this
	 * transaction group won't leak out when we sync the older txg.
	 */
	dr = kmem_zalloc(sizeof (dbuf_dirty_record_t), KM_SLEEP);
	if (db->db_level == 0) {
		void *data_old = db->db_buf;

		if (db->db_state != DB_NOFILL) {
			if (db->db_blkid == DMU_BONUS_BLKID) {
				dbuf_fix_old_data(db, tx->tx_txg);
				data_old = db->db.db_data;
			} else if (db->db.db_object != DMU_META_DNODE_OBJECT) {
				/*
				 * Release the data buffer from the cache so
				 * that we can modify it without impacting
				 * possible other users of this cached data
				 * block.  Note that indirect blocks and
				 * private objects are not released until the
				 * syncing state (since they are only modified
				 * then).
				 */
				arc_release(db->db_buf, db);
				dbuf_fix_old_data(db, tx->tx_txg);
				data_old = db->db_buf;
			}
			ASSERT(data_old != NULL);
		}
		dr->dt.dl.dr_data = data_old;
	} else {
		mutex_init(&dr->dt.di.dr_mtx, NULL, MUTEX_DEFAULT, NULL);
		list_create(&dr->dt.di.dr_children,
		    sizeof (dbuf_dirty_record_t),
		    offsetof(dbuf_dirty_record_t, dr_dirty_node));
	}
	if (db->db_blkid != DMU_BONUS_BLKID && os->os_dsl_dataset != NULL)
		dr->dr_accounted = db->db.db_size;
	dr->dr_dbuf = db;
	dr->dr_txg = tx->tx_txg;
	dr->dr_next = *drp;
	*drp = dr;

	/*
	 * We could have been freed_in_flight between the dbuf_noread
	 * and dbuf_dirty.  We win, as though the dbuf_noread() had
	 * happened after the free.
	 */
	if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
	    db->db_blkid != DMU_SPILL_BLKID) {
		mutex_enter(&dn->dn_mtx);
		if (dn->dn_free_ranges[txgoff] != NULL) {
			range_tree_clear(dn->dn_free_ranges[txgoff],
			    db->db_blkid, 1);
		}
		mutex_exit(&dn->dn_mtx);
		db->db_freed_in_flight = FALSE;
	}

	/*
	 * This buffer is now part of this txg
	 */
	dbuf_add_ref(db, (void *)(uintptr_t)tx->tx_txg);
	db->db_dirtycnt += 1;
	ASSERT3U(db->db_dirtycnt, <=, 3);

	mutex_exit(&db->db_mtx);

	if (db->db_blkid == DMU_BONUS_BLKID ||
	    db->db_blkid == DMU_SPILL_BLKID) {
		mutex_enter(&dn->dn_mtx);
		ASSERT(!list_link_active(&dr->dr_dirty_node));
		list_insert_tail(&dn->dn_dirty_records[txgoff], dr);
		mutex_exit(&dn->dn_mtx);
		dnode_setdirty(dn, tx);
		DB_DNODE_EXIT(db);
		return (dr);
	} else if (do_free_accounting) {
		blkptr_t *bp = db->db_blkptr;
		int64_t willfree = (bp && !BP_IS_HOLE(bp)) ?
		    bp_get_dsize(os->os_spa, bp) : db->db.db_size;
		/*
		 * This is only a guess -- if the dbuf is dirty
		 * in a previous txg, we don't know how much
		 * space it will use on disk yet.  We should
		 * really have the struct_rwlock to access
		 * db_blkptr, but since this is just a guess,
		 * it's OK if we get an odd answer.
		 */
		ddt_prefetch(os->os_spa, bp);
		dnode_willuse_space(dn, -willfree, tx);
	}

	if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) {
		rw_enter(&dn->dn_struct_rwlock, RW_READER);
		drop_struct_lock = TRUE;
	}

	if (db->db_level == 0) {
		dnode_new_blkid(dn, db->db_blkid, tx, drop_struct_lock);
		ASSERT(dn->dn_maxblkid >= db->db_blkid);
	}

	if (db->db_level+1 < dn->dn_nlevels) {
		dmu_buf_impl_t *parent = db->db_parent;
		dbuf_dirty_record_t *di;
		int parent_held = FALSE;

		if (db->db_parent == NULL || db->db_parent == dn->dn_dbuf) {
			int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;

			parent = dbuf_hold_level(dn, db->db_level+1,
			    db->db_blkid >> epbs, FTAG);
			ASSERT(parent != NULL);
			parent_held = TRUE;
		}
		if (drop_struct_lock)
			rw_exit(&dn->dn_struct_rwlock);
		ASSERT3U(db->db_level+1, ==, parent->db_level);
		di = dbuf_dirty(parent, tx);
		if (parent_held)
			dbuf_rele(parent, FTAG);

		mutex_enter(&db->db_mtx);
		/*
		 * Since we've dropped the mutex, it's possible that
		 * dbuf_undirty() might have changed this out from under us.
		 */
		if (db->db_last_dirty == dr ||
		    dn->dn_object == DMU_META_DNODE_OBJECT) {
			mutex_enter(&di->dt.di.dr_mtx);
			ASSERT3U(di->dr_txg, ==, tx->tx_txg);
			ASSERT(!list_link_active(&dr->dr_dirty_node));
			list_insert_tail(&di->dt.di.dr_children, dr);
			mutex_exit(&di->dt.di.dr_mtx);
			dr->dr_parent = di;
		}
		mutex_exit(&db->db_mtx);
	} else {
		ASSERT(db->db_level+1 == dn->dn_nlevels);
		ASSERT(db->db_blkid < dn->dn_nblkptr);
		ASSERT(db->db_parent == NULL || db->db_parent == dn->dn_dbuf);
		mutex_enter(&dn->dn_mtx);
		ASSERT(!list_link_active(&dr->dr_dirty_node));
		list_insert_tail(&dn->dn_dirty_records[txgoff], dr);
		mutex_exit(&dn->dn_mtx);
		if (drop_struct_lock)
			rw_exit(&dn->dn_struct_rwlock);
	}

	dnode_setdirty(dn, tx);
	DB_DNODE_EXIT(db);
	return (dr);
}

/*
 * Undirty a buffer in the transaction group referenced by the given
 * transaction.  Return whether this evicted the dbuf.
 */
static boolean_t
dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
{
	dnode_t *dn;
	uint64_t txg = tx->tx_txg;
	dbuf_dirty_record_t *dr, **drp;

	ASSERT(txg != 0);
	ASSERT(db->db_blkid != DMU_BONUS_BLKID);
	ASSERT0(db->db_level);
	ASSERT(MUTEX_HELD(&db->db_mtx));

	/*
	 * If this buffer is not dirty, we're done.
	 */
	for (drp = &db->db_last_dirty; (dr = *drp) != NULL; drp = &dr->dr_next)
		if (dr->dr_txg <= txg)
			break;
	if (dr == NULL || dr->dr_txg < txg)
		return (B_FALSE);
	ASSERT(dr->dr_txg == txg);
	ASSERT(dr->dr_dbuf == db);

	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);

	dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);

	ASSERT(db->db.db_size != 0);

	/*
	 * Any space we accounted for in dp_dirty_* will be cleaned up by
	 * dsl_pool_sync().  This is relatively rare so the discrepancy
	 * is not a big deal.
	 */

	*drp = dr->dr_next;

	/*
	 * Note that there are three places in dbuf_dirty()
	 * where this dirty record may be put on a list.
	 * Make sure to do a list_remove corresponding to
	 * every one of those list_insert calls.
	 */
	if (dr->dr_parent) {
		mutex_enter(&dr->dr_parent->dt.di.dr_mtx);
		list_remove(&dr->dr_parent->dt.di.dr_children, dr);
		mutex_exit(&dr->dr_parent->dt.di.dr_mtx);
	} else if (db->db_blkid == DMU_SPILL_BLKID ||
	    db->db_level+1 == dn->dn_nlevels) {
		ASSERT(db->db_blkptr == NULL || db->db_parent == dn->dn_dbuf);
		mutex_enter(&dn->dn_mtx);
		list_remove(&dn->dn_dirty_records[txg & TXG_MASK], dr);
		mutex_exit(&dn->dn_mtx);
	}
	DB_DNODE_EXIT(db);

	if (db->db_state != DB_NOFILL) {
		dbuf_unoverride(dr);

		ASSERT(db->db_buf != NULL);
		ASSERT(dr->dt.dl.dr_data != NULL);
		if (dr->dt.dl.dr_data != db->db_buf)
			VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data, db));
	}

	if (db->db_level != 0) {
		mutex_destroy(&dr->dt.di.dr_mtx);
		list_destroy(&dr->dt.di.dr_children);
	}

	kmem_free(dr, sizeof (dbuf_dirty_record_t));

	ASSERT(db->db_dirtycnt > 0);
	db->db_dirtycnt -= 1;

	if (refcount_remove(&db->db_holds, (void *)(uintptr_t)txg) == 0) {
		arc_buf_t *buf = db->db_buf;

		ASSERT(db->db_state == DB_NOFILL || arc_released(buf));
		dbuf_set_data(db, NULL);
		VERIFY(arc_buf_remove_ref(buf, db));
		dbuf_evict(db);
		return (B_TRUE);
	}

	return (B_FALSE);
}

void
dmu_buf_will_dirty(dmu_buf_t *db_fake, dmu_tx_t *tx)
{
	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
	int rf = DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH;

	ASSERT(tx->tx_txg != 0);
	ASSERT(!refcount_is_zero(&db->db_holds));

	DB_DNODE_ENTER(db);
	if (RW_WRITE_HELD(&DB_DNODE(db)->dn_struct_rwlock))
		rf |= DB_RF_HAVESTRUCT;
	DB_DNODE_EXIT(db);
	(void) dbuf_read(db, NULL, rf);
	(void) dbuf_dirty(db, tx);
}

void
dmu_buf_will_not_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
{
	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;

	db->db_state = DB_NOFILL;

	dmu_buf_will_fill(db_fake, tx);
}

void
dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
{
	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;

	ASSERT(db->db_blkid != DMU_BONUS_BLKID);
	ASSERT(tx->tx_txg != 0);
	ASSERT(db->db_level == 0);
	ASSERT(!refcount_is_zero(&db->db_holds));

	ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT ||
	    dmu_tx_private_ok(tx));

	dbuf_noread(db);
	(void) dbuf_dirty(db, tx);
}

#pragma weak dmu_buf_fill_done = dbuf_fill_done
/* ARGSUSED */
void
dbuf_fill_done(dmu_buf_impl_t *db, dmu_tx_t *tx)
{
	mutex_enter(&db->db_mtx);
	DBUF_VERIFY(db);

	if (db->db_state == DB_FILL) {
		if (db->db_level == 0 && db->db_freed_in_flight) {
			ASSERT(db->db_blkid != DMU_BONUS_BLKID);
			/* we were freed while filling */
			/* XXX dbuf_undirty? */
			bzero(db->db.db_data, db->db.db_size);
			db->db_freed_in_flight = FALSE;
		}
		db->db_state = DB_CACHED;
		cv_broadcast(&db->db_changed);
	}
	mutex_exit(&db->db_mtx);
}

void
dmu_buf_write_embedded(dmu_buf_t *dbuf, void *data,
    bp_embedded_type_t etype, enum zio_compress comp,
    int uncompressed_size, int compressed_size, int byteorder,
    dmu_tx_t *tx)
{
	dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf;
	struct dirty_leaf *dl;
	dmu_object_type_t type;

	DB_DNODE_ENTER(db);
	type = DB_DNODE(db)->dn_type;
	DB_DNODE_EXIT(db);

	ASSERT0(db->db_level);
	ASSERT(db->db_blkid != DMU_BONUS_BLKID);

	dmu_buf_will_not_fill(dbuf, tx);

	ASSERT3U(db->db_last_dirty->dr_txg, ==, tx->tx_txg);
	dl = &db->db_last_dirty->dt.dl;
	encode_embedded_bp_compressed(&dl->dr_overridden_by,
	    data, comp, uncompressed_size, compressed_size);
	BPE_SET_ETYPE(&dl->dr_overridden_by, etype);
	BP_SET_TYPE(&dl->dr_overridden_by, type);
	BP_SET_LEVEL(&dl->dr_overridden_by, 0);
	BP_SET_BYTEORDER(&dl->dr_overridden_by, byteorder);

	dl->dr_override_state = DR_OVERRIDDEN;
	dl->dr_overridden_by.blk_birth = db->db_last_dirty->dr_txg;
}

/*
 * Directly assign a provided arc buf to a given dbuf if it's not referenced
 * by anybody except our caller.  Otherwise copy arcbuf's contents to dbuf.
 */
void
dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx)
{
	ASSERT(!refcount_is_zero(&db->db_holds));
	ASSERT(db->db_blkid != DMU_BONUS_BLKID);
	ASSERT(db->db_level == 0);
	ASSERT(DBUF_GET_BUFC_TYPE(db) == ARC_BUFC_DATA);
	ASSERT(buf != NULL);
	ASSERT(arc_buf_size(buf) == db->db.db_size);
	ASSERT(tx->tx_txg != 0);

	arc_return_buf(buf, db);
	ASSERT(arc_released(buf));

	mutex_enter(&db->db_mtx);

	while (db->db_state == DB_READ || db->db_state == DB_FILL)
		cv_wait(&db->db_changed, &db->db_mtx);

	ASSERT(db->db_state == DB_CACHED || db->db_state == DB_UNCACHED);

	if (db->db_state == DB_CACHED &&
	    refcount_count(&db->db_holds) - 1 > db->db_dirtycnt) {
		mutex_exit(&db->db_mtx);
		(void) dbuf_dirty(db, tx);
		bcopy(buf->b_data, db->db.db_data, db->db.db_size);
		VERIFY(arc_buf_remove_ref(buf, db));
		xuio_stat_wbuf_copied();
		return;
	}

	xuio_stat_wbuf_nocopy();
	if (db->db_state == DB_CACHED) {
		dbuf_dirty_record_t *dr = db->db_last_dirty;

		ASSERT(db->db_buf != NULL);
		if (dr != NULL && dr->dr_txg == tx->tx_txg) {
			ASSERT(dr->dt.dl.dr_data == db->db_buf);
			if (!arc_released(db->db_buf)) {
				ASSERT(dr->dt.dl.dr_override_state ==
				    DR_OVERRIDDEN);
				arc_release(db->db_buf, db);
			}
			dr->dt.dl.dr_data = buf;
			VERIFY(arc_buf_remove_ref(db->db_buf, db));
		} else if (dr == NULL || dr->dt.dl.dr_data != db->db_buf) {
			arc_release(db->db_buf, db);
			VERIFY(arc_buf_remove_ref(db->db_buf, db));
		}
		db->db_buf = NULL;
	}
	ASSERT(db->db_buf == NULL);
	dbuf_set_data(db, buf);
	db->db_state = DB_FILL;
	mutex_exit(&db->db_mtx);
	(void) dbuf_dirty(db, tx);
	dmu_buf_fill_done(&db->db, tx);
}

/*
 * "Clear" the contents of this dbuf.  This will mark the dbuf
 * EVICTING and clear *most* of its references.  Unfortunately,
 * when we are not holding the dn_dbufs_mtx, we can't clear the
 * entry in the dn_dbufs list.  We have to wait until dbuf_destroy()
 * in this case.  For callers from the DMU we will usually see:
 *	dbuf_clear()->arc_clear_callback()->dbuf_do_evict()->dbuf_destroy()
 * For the arc callback, we will usually see:
 *	dbuf_do_evict()->dbuf_clear();dbuf_destroy()
 * Sometimes, though, we will get a mix of these two:
 *	DMU: dbuf_clear()->arc_clear_callback()
 *	ARC: dbuf_do_evict()->dbuf_destroy()
 *
 * This routine will dissociate the dbuf from the arc, by calling
 * arc_clear_callback(), but will not evict the data from the ARC.
 */
void
dbuf_clear(dmu_buf_impl_t *db)
{
	dnode_t *dn;
	dmu_buf_impl_t *parent = db->db_parent;
	dmu_buf_impl_t *dndb;
	boolean_t dbuf_gone = B_FALSE;

	ASSERT(MUTEX_HELD(&db->db_mtx));
	ASSERT(refcount_is_zero(&db->db_holds));

	dbuf_evict_user(db);

	if (db->db_state == DB_CACHED) {
		ASSERT(db->db.db_data != NULL);
		if (db->db_blkid == DMU_BONUS_BLKID) {
			zio_buf_free(db->db.db_data, DN_MAX_BONUSLEN);
			arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
		}
		db->db.db_data = NULL;
		db->db_state = DB_UNCACHED;
	}

	ASSERT(db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL);
	ASSERT(db->db_data_pending == NULL);

	db->db_state = DB_EVICTING;
	db->db_blkptr = NULL;

	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);
	dndb = dn->dn_dbuf;
	if (db->db_blkid != DMU_BONUS_BLKID && MUTEX_HELD(&dn->dn_dbufs_mtx)) {
		list_remove(&dn->dn_dbufs, db);
		(void) atomic_dec_32_nv(&dn->dn_dbufs_count);
		membar_producer();
		DB_DNODE_EXIT(db);
		/*
		 * Decrementing the dbuf count means that the hold corresponding
		 * to the removed dbuf is no longer discounted in dnode_move(),
		 * so the dnode cannot be moved until after we release the hold.
		 * The membar_producer() ensures visibility of the decremented
		 * value in dnode_move(), since DB_DNODE_EXIT doesn't actually
		 * release any lock.
		 */
		dnode_rele(dn, db);
		db->db_dnode_handle = NULL;
	} else {
		DB_DNODE_EXIT(db);
	}

	if (db->db_buf)
		dbuf_gone = arc_clear_callback(db->db_buf);

	if (!dbuf_gone)
		mutex_exit(&db->db_mtx);

	/*
	 * If this dbuf is referenced from an indirect dbuf,
	 * decrement the ref count on the indirect dbuf.
	 */
	if (parent && parent != dndb)
		dbuf_rele(parent, db);
}

static int
dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse,
    dmu_buf_impl_t **parentp, blkptr_t **bpp)
{
	int nlevels, epbs;

	*parentp = NULL;
	*bpp = NULL;

	ASSERT(blkid != DMU_BONUS_BLKID);

	if (blkid == DMU_SPILL_BLKID) {
		mutex_enter(&dn->dn_mtx);
		if (dn->dn_have_spill &&
		    (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR))
			*bpp = &dn->dn_phys->dn_spill;
		else
			*bpp = NULL;
		dbuf_add_ref(dn->dn_dbuf, NULL);
		*parentp = dn->dn_dbuf;
		mutex_exit(&dn->dn_mtx);
		return (0);
	}

	if (dn->dn_phys->dn_nlevels == 0)
		nlevels = 1;
	else
		nlevels = dn->dn_phys->dn_nlevels;

	epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;

	ASSERT3U(level * epbs, <, 64);
	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
	if (level >= nlevels ||
	    (blkid > (dn->dn_phys->dn_maxblkid >> (level * epbs)))) {
		/* the buffer has no parent yet */
		return (SET_ERROR(ENOENT));
	} else if (level < nlevels-1) {
		/* this block is referenced from an indirect block */
		int err = dbuf_hold_impl(dn, level+1,
		    blkid >> epbs, fail_sparse, NULL, parentp);
		if (err)
			return (err);
		err = dbuf_read(*parentp, NULL,
		    (DB_RF_HAVESTRUCT | DB_RF_NOPREFETCH | DB_RF_CANFAIL));
		if (err) {
			dbuf_rele(*parentp, NULL);
			*parentp = NULL;
			return (err);
		}
		*bpp = ((blkptr_t *)(*parentp)->db.db_data) +
		    (blkid & ((1ULL << epbs) - 1));
		return (0);
	} else {
		/* the block is referenced from the dnode */
		ASSERT3U(level, ==, nlevels-1);
		ASSERT(dn->dn_phys->dn_nblkptr == 0 ||
		    blkid < dn->dn_phys->dn_nblkptr);
		if (dn->dn_dbuf) {
			dbuf_add_ref(dn->dn_dbuf, NULL);
			*parentp = dn->dn_dbuf;
		}
		*bpp = &dn->dn_phys->dn_blkptr[blkid];
		return (0);
	}
}

static dmu_buf_impl_t *
dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid,
    dmu_buf_impl_t *parent, blkptr_t *blkptr)
{
	objset_t *os = dn->dn_objset;
	dmu_buf_impl_t *db, *odb;

	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
	ASSERT(dn->dn_type != DMU_OT_NONE);

	db = kmem_cache_alloc(dbuf_cache, KM_SLEEP);

	db->db_objset = os;
	db->db.db_object = dn->dn_object;
	db->db_level = level;
	db->db_blkid = blkid;
	db->db_last_dirty = NULL;
	db->db_dirtycnt = 0;
	db->db_dnode_handle = dn->dn_handle;
	db->db_parent = parent;
	db->db_blkptr = blkptr;

	db->db_user_ptr = NULL;
	db->db_user_data_ptr_ptr = NULL;
	db->db_evict_func = NULL;
	db->db_immediate_evict = 0;
	db->db_freed_in_flight = 0;

	if (blkid == DMU_BONUS_BLKID) {
		ASSERT3P(parent, ==, dn->dn_dbuf);
		db->db.db_size = DN_MAX_BONUSLEN -
		    (dn->dn_nblkptr-1) * sizeof (blkptr_t);
		ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
		db->db.db_offset = DMU_BONUS_BLKID;
		db->db_state = DB_UNCACHED;
		/* the bonus dbuf is not placed in the hash table */
		arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
		return (db);
	} else if (blkid == DMU_SPILL_BLKID) {
		db->db.db_size = (blkptr != NULL) ?
		    BP_GET_LSIZE(blkptr) : SPA_MINBLOCKSIZE;
		db->db.db_offset = 0;
	} else {
		int blocksize =
		    db->db_level ? 1 << dn->dn_indblkshift : dn->dn_datablksz;
		db->db.db_size = blocksize;
		db->db.db_offset = db->db_blkid * blocksize;
	}

	/*
	 * Hold the dn_dbufs_mtx while we get the new dbuf
	 * in the hash table *and* added to the dbufs list.
	 * This prevents a possible deadlock with someone
	 * trying to look up this dbuf before it's added to the
	 * dn_dbufs list.
	 */
	mutex_enter(&dn->dn_dbufs_mtx);
	db->db_state = DB_EVICTING;
	if ((odb = dbuf_hash_insert(db)) != NULL) {
		/* someone else inserted it first */
		kmem_cache_free(dbuf_cache, db);
		mutex_exit(&dn->dn_dbufs_mtx);
		return (odb);
	}
	list_insert_head(&dn->dn_dbufs, db);
	if (db->db_level == 0 && db->db_blkid >=
	    dn->dn_unlisted_l0_blkid)
		dn->dn_unlisted_l0_blkid = db->db_blkid + 1;
	db->db_state = DB_UNCACHED;
	mutex_exit(&dn->dn_dbufs_mtx);
	arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);

	if (parent && parent != dn->dn_dbuf)
		dbuf_add_ref(parent, db);

	ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT ||
	    refcount_count(&dn->dn_holds) > 0);
	(void) refcount_add(&dn->dn_holds, db);
	(void) atomic_inc_32_nv(&dn->dn_dbufs_count);

	dprintf_dbuf(db, "db=%p\n", db);

	return (db);
}

static int
dbuf_do_evict(void *private)
{
	dmu_buf_impl_t *db = private;

	if (!MUTEX_HELD(&db->db_mtx))
		mutex_enter(&db->db_mtx);

	ASSERT(refcount_is_zero(&db->db_holds));

	if (db->db_state != DB_EVICTING) {
		ASSERT(db->db_state == DB_CACHED);
		DBUF_VERIFY(db);
		db->db_buf = NULL;
		dbuf_evict(db);
	} else {
		mutex_exit(&db->db_mtx);
		dbuf_destroy(db);
	}
	return (0);
}

static void
dbuf_destroy(dmu_buf_impl_t *db)
{
	ASSERT(refcount_is_zero(&db->db_holds));

	if (db->db_blkid != DMU_BONUS_BLKID) {
		/*
		 * If this dbuf is still on the dn_dbufs list,
		 * remove it from that list.
		 */
		if (db->db_dnode_handle != NULL) {
			dnode_t *dn;

			DB_DNODE_ENTER(db);
			dn = DB_DNODE(db);
			mutex_enter(&dn->dn_dbufs_mtx);
			list_remove(&dn->dn_dbufs, db);
			(void) atomic_dec_32_nv(&dn->dn_dbufs_count);
			mutex_exit(&dn->dn_dbufs_mtx);
			DB_DNODE_EXIT(db);
			/*
			 * Decrementing the dbuf count means that the hold
			 * corresponding to the removed dbuf is no longer
			 * discounted in dnode_move(), so the dnode cannot be
			 * moved until after we release the hold.
			 */
			dnode_rele(dn, db);
			db->db_dnode_handle = NULL;
		}
		dbuf_hash_remove(db);
	}
	db->db_parent = NULL;
	db->db_buf = NULL;

	ASSERT(!list_link_active(&db->db_link));
	ASSERT(db->db.db_data == NULL);
	ASSERT(db->db_hash_next == NULL);
	ASSERT(db->db_blkptr == NULL);
	ASSERT(db->db_data_pending == NULL);

	kmem_cache_free(dbuf_cache, db);
	arc_space_return(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
}

void
dbuf_prefetch(dnode_t *dn, uint64_t blkid, zio_priority_t prio)
{
	dmu_buf_impl_t *db = NULL;
	blkptr_t *bp = NULL;

	ASSERT(blkid != DMU_BONUS_BLKID);
	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));

	if (dnode_block_freed(dn, blkid))
		return;

	/* dbuf_find() returns with db_mtx held */
	if (db = dbuf_find(dn, 0, blkid)) {
		/*
		 * This dbuf is already in the cache.  We assume that
		 * it is already CACHED, or else about to be either
		 * read or filled.
		 */
		mutex_exit(&db->db_mtx);
		return;
	}

	if (dbuf_findbp(dn, 0, blkid, TRUE, &db, &bp) == 0) {
		if (bp && !BP_IS_HOLE(bp) && !BP_IS_EMBEDDED(bp)) {
			dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
			uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH;
			zbookmark_phys_t zb;

			SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET,
			    dn->dn_object, 0, blkid);

			(void) arc_read(NULL, dn->dn_objset->os_spa,
			    bp, NULL, NULL, prio,
			    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
			    &aflags, &zb);
		}
		if (db)
			dbuf_rele(db, NULL);
	}
}

/*
 * Returns with db_holds incremented, and db_mtx not held.
 * Note: dn_struct_rwlock must be held.
 */
int
dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, int fail_sparse,
    void *tag, dmu_buf_impl_t **dbp)
{
	dmu_buf_impl_t *db, *parent = NULL;

	ASSERT(blkid != DMU_BONUS_BLKID);
	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
	ASSERT3U(dn->dn_nlevels, >, level);

	*dbp = NULL;
top:
	/* dbuf_find() returns with db_mtx held */
	db = dbuf_find(dn, level, blkid);

	if (db == NULL) {
		blkptr_t *bp = NULL;
		int err;

		ASSERT3P(parent, ==, NULL);
		err = dbuf_findbp(dn, level, blkid, fail_sparse, &parent, &bp);
		if (fail_sparse) {
			if (err == 0 && bp && BP_IS_HOLE(bp))
				err = SET_ERROR(ENOENT);
			if (err) {
				if (parent)
					dbuf_rele(parent, NULL);
				return (err);
			}
		}
		if (err && err != ENOENT)
			return (err);
		db = dbuf_create(dn, level, blkid, parent, bp);
	}

	if (db->db_buf && refcount_is_zero(&db->db_holds)) {
		arc_buf_add_ref(db->db_buf, db);
		if (db->db_buf->b_data == NULL) {
			dbuf_clear(db);
			if (parent) {
				dbuf_rele(parent, NULL);
				parent = NULL;
			}
			goto top;
		}
		ASSERT3P(db->db.db_data, ==, db->db_buf->b_data);
	}

	ASSERT(db->db_buf == NULL || arc_referenced(db->db_buf));

	/*
	 * If this buffer is currently syncing out, and we are
	 * still referencing it from db_data, we need to make a copy
	 * of it in case we decide we want to dirty it again in this txg.
	 */
	if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
	    dn->dn_object != DMU_META_DNODE_OBJECT &&
	    db->db_state == DB_CACHED && db->db_data_pending) {
		dbuf_dirty_record_t *dr = db->db_data_pending;

		if (dr->dt.dl.dr_data == db->db_buf) {
			arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);

			dbuf_set_data(db,
			    arc_buf_alloc(dn->dn_objset->os_spa,
			    db->db.db_size, db, type));
			bcopy(dr->dt.dl.dr_data->b_data, db->db.db_data,
			    db->db.db_size);
		}
	}

	(void) refcount_add(&db->db_holds, tag);
	dbuf_update_data(db);
	DBUF_VERIFY(db);
	mutex_exit(&db->db_mtx);

	/* NOTE: we can't rele the parent until after we drop the db_mtx */
	if (parent)
		dbuf_rele(parent, NULL);

	ASSERT3P(DB_DNODE(db), ==, dn);
	ASSERT3U(db->db_blkid, ==, blkid);
	ASSERT3U(db->db_level, ==, level);
	*dbp = db;

	return (0);
}

dmu_buf_impl_t *
dbuf_hold(dnode_t *dn, uint64_t blkid, void *tag)
{
	dmu_buf_impl_t *db;
	int err = dbuf_hold_impl(dn, 0, blkid, FALSE, tag, &db);
	return (err ? NULL : db);
NULL : db); 1978 } 1979 1980 dmu_buf_impl_t * 1981 dbuf_hold_level(dnode_t *dn, int level, uint64_t blkid, void *tag) 1982 { 1983 dmu_buf_impl_t *db; 1984 int err = dbuf_hold_impl(dn, level, blkid, FALSE, tag, &db); 1985 return (err ? NULL : db); 1986 } 1987 1988 void 1989 dbuf_create_bonus(dnode_t *dn) 1990 { 1991 ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock)); 1992 1993 ASSERT(dn->dn_bonus == NULL); 1994 dn->dn_bonus = dbuf_create(dn, 0, DMU_BONUS_BLKID, dn->dn_dbuf, NULL); 1995 } 1996 1997 int 1998 dbuf_spill_set_blksz(dmu_buf_t *db_fake, uint64_t blksz, dmu_tx_t *tx) 1999 { 2000 dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; 2001 dnode_t *dn; 2002 2003 if (db->db_blkid != DMU_SPILL_BLKID) 2004 return (SET_ERROR(ENOTSUP)); 2005 if (blksz == 0) 2006 blksz = SPA_MINBLOCKSIZE; 2007 if (blksz > SPA_MAXBLOCKSIZE) 2008 blksz = SPA_MAXBLOCKSIZE; 2009 else 2010 blksz = P2ROUNDUP(blksz, SPA_MINBLOCKSIZE); 2011 2012 DB_DNODE_ENTER(db); 2013 dn = DB_DNODE(db); 2014 rw_enter(&dn->dn_struct_rwlock, RW_WRITER); 2015 dbuf_new_size(db, blksz, tx); 2016 rw_exit(&dn->dn_struct_rwlock); 2017 DB_DNODE_EXIT(db); 2018 2019 return (0); 2020 } 2021 2022 void 2023 dbuf_rm_spill(dnode_t *dn, dmu_tx_t *tx) 2024 { 2025 dbuf_free_range(dn, DMU_SPILL_BLKID, DMU_SPILL_BLKID, tx); 2026 } 2027 2028 #pragma weak dmu_buf_add_ref = dbuf_add_ref 2029 void 2030 dbuf_add_ref(dmu_buf_impl_t *db, void *tag) 2031 { 2032 int64_t holds = refcount_add(&db->db_holds, tag); 2033 ASSERT(holds > 1); 2034 } 2035 2036 /* 2037 * If you call dbuf_rele() you had better not be referencing the dnode handle 2038 * unless you have some other direct or indirect hold on the dnode. (An indirect 2039 * hold is a hold on one of the dnode's dbufs, including the bonus buffer.) 2040 * Without that, the dbuf_rele() could lead to a dnode_rele() followed by the 2041 * dnode's parent dbuf evicting its dnode handles. 2042 */ 2043 void 2044 dbuf_rele(dmu_buf_impl_t *db, void *tag) 2045 { 2046 mutex_enter(&db->db_mtx); 2047 dbuf_rele_and_unlock(db, tag); 2048 } 2049 2050 void 2051 dmu_buf_rele(dmu_buf_t *db, void *tag) 2052 { 2053 dbuf_rele((dmu_buf_impl_t *)db, tag); 2054 } 2055 2056 /* 2057 * dbuf_rele() for an already-locked dbuf. This is necessary to allow 2058 * db_dirtycnt and db_holds to be updated atomically. 2059 */ 2060 void 2061 dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag) 2062 { 2063 int64_t holds; 2064 2065 ASSERT(MUTEX_HELD(&db->db_mtx)); 2066 DBUF_VERIFY(db); 2067 2068 /* 2069 * Remove the reference to the dbuf before removing its hold on the 2070 * dnode so we can guarantee in dnode_move() that a referenced bonus 2071 * buffer has a corresponding dnode hold. 2072 */ 2073 holds = refcount_remove(&db->db_holds, tag); 2074 ASSERT(holds >= 0); 2075 2076 /* 2077 * We can't freeze indirects if there is a possibility that they 2078 * may be modified in the current syncing context. 2079 */ 2080 if (db->db_buf && holds == (db->db_level == 0 ? db->db_dirtycnt : 0)) 2081 arc_buf_freeze(db->db_buf); 2082 2083 if (holds == db->db_dirtycnt && 2084 db->db_level == 0 && db->db_immediate_evict) 2085 dbuf_evict_user(db); 2086 2087 if (holds == 0) { 2088 if (db->db_blkid == DMU_BONUS_BLKID) { 2089 mutex_exit(&db->db_mtx); 2090 2091 /* 2092 * If the dnode moves here, we cannot cross this barrier 2093 * until the move completes. 2094 */ 2095 DB_DNODE_ENTER(db); 2096 (void) atomic_dec_32_nv(&DB_DNODE(db)->dn_dbufs_count); 2097 DB_DNODE_EXIT(db); 2098 /* 2099 * The bonus buffer's dnode hold is no longer discounted 2100 * in dnode_move(). 
The dnode cannot move until after 2101 * the dnode_rele(). 2102 */ 2103 dnode_rele(DB_DNODE(db), db); 2104 } else if (db->db_buf == NULL) { 2105 /* 2106 * This is a special case: we never associated this 2107 * dbuf with any data allocated from the ARC. 2108 */ 2109 ASSERT(db->db_state == DB_UNCACHED || 2110 db->db_state == DB_NOFILL); 2111 dbuf_evict(db); 2112 } else if (arc_released(db->db_buf)) { 2113 arc_buf_t *buf = db->db_buf; 2114 /* 2115 * This dbuf has anonymous data associated with it. 2116 */ 2117 dbuf_set_data(db, NULL); 2118 VERIFY(arc_buf_remove_ref(buf, db)); 2119 dbuf_evict(db); 2120 } else { 2121 VERIFY(!arc_buf_remove_ref(db->db_buf, db)); 2122 2123 /* 2124 * A dbuf will be eligible for eviction if either the 2125 * 'primarycache' property is set or a duplicate 2126 * copy of this buffer is already cached in the arc. 2127 * 2128 * In the case of the 'primarycache' a buffer 2129 * is considered for eviction if it matches the 2130 * criteria set in the property. 2131 * 2132 * To decide if our buffer is considered a 2133 * duplicate, we must call into the arc to determine 2134 * if multiple buffers are referencing the same 2135 * block on-disk. If so, then we simply evict 2136 * ourselves. 2137 */ 2138 if (!DBUF_IS_CACHEABLE(db)) { 2139 if (db->db_blkptr != NULL && 2140 !BP_IS_HOLE(db->db_blkptr) && 2141 !BP_IS_EMBEDDED(db->db_blkptr)) { 2142 spa_t *spa = 2143 dmu_objset_spa(db->db_objset); 2144 blkptr_t bp = *db->db_blkptr; 2145 dbuf_clear(db); 2146 arc_freed(spa, &bp); 2147 } else { 2148 dbuf_clear(db); 2149 } 2150 } else if (arc_buf_eviction_needed(db->db_buf)) { 2151 dbuf_clear(db); 2152 } else { 2153 mutex_exit(&db->db_mtx); 2154 } 2155 } 2156 } else { 2157 mutex_exit(&db->db_mtx); 2158 } 2159 } 2160 2161 #pragma weak dmu_buf_refcount = dbuf_refcount 2162 uint64_t 2163 dbuf_refcount(dmu_buf_impl_t *db) 2164 { 2165 return (refcount_count(&db->db_holds)); 2166 } 2167 2168 void * 2169 dmu_buf_set_user(dmu_buf_t *db_fake, void *user_ptr, void *user_data_ptr_ptr, 2170 dmu_buf_evict_func_t *evict_func) 2171 { 2172 return (dmu_buf_update_user(db_fake, NULL, user_ptr, 2173 user_data_ptr_ptr, evict_func)); 2174 } 2175 2176 void * 2177 dmu_buf_set_user_ie(dmu_buf_t *db_fake, void *user_ptr, void *user_data_ptr_ptr, 2178 dmu_buf_evict_func_t *evict_func) 2179 { 2180 dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; 2181 2182 db->db_immediate_evict = TRUE; 2183 return (dmu_buf_update_user(db_fake, NULL, user_ptr, 2184 user_data_ptr_ptr, evict_func)); 2185 } 2186 2187 void * 2188 dmu_buf_update_user(dmu_buf_t *db_fake, void *old_user_ptr, void *user_ptr, 2189 void *user_data_ptr_ptr, dmu_buf_evict_func_t *evict_func) 2190 { 2191 dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; 2192 ASSERT(db->db_level == 0); 2193 2194 ASSERT((user_ptr == NULL) == (evict_func == NULL)); 2195 2196 mutex_enter(&db->db_mtx); 2197 2198 if (db->db_user_ptr == old_user_ptr) { 2199 db->db_user_ptr = user_ptr; 2200 db->db_user_data_ptr_ptr = user_data_ptr_ptr; 2201 db->db_evict_func = evict_func; 2202 2203 dbuf_update_data(db); 2204 } else { 2205 old_user_ptr = db->db_user_ptr; 2206 } 2207 2208 mutex_exit(&db->db_mtx); 2209 return (old_user_ptr); 2210 } 2211 2212 void * 2213 dmu_buf_get_user(dmu_buf_t *db_fake) 2214 { 2215 dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; 2216 ASSERT(!refcount_is_zero(&db->db_holds)); 2217 2218 return (db->db_user_ptr); 2219 } 2220 2221 boolean_t 2222 dmu_buf_freeable(dmu_buf_t *dbuf) 2223 { 2224 boolean_t res = B_FALSE; 2225 dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf; 2226 2227 
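/*
 * A buffer with no on-disk block pointer has never been written, so there
 * is nothing to free; otherwise ask the DSL whether a block with this
 * birth txg would actually release space when freed (i.e., it was born
 * after the most recent snapshot) rather than remain referenced by one.
 */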
if (db->db_blkptr)
2228 res = dsl_dataset_block_freeable(db->db_objset->os_dsl_dataset,
2229 db->db_blkptr, db->db_blkptr->blk_birth);
2230
2231 return (res);
2232 }
2233
2234 blkptr_t *
2235 dmu_buf_get_blkptr(dmu_buf_t *db)
2236 {
2237 dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db;
2238 return (dbi->db_blkptr);
2239 }
2240
2241 static void
2242 dbuf_check_blkptr(dnode_t *dn, dmu_buf_impl_t *db)
2243 {
2244 /* ASSERT(dmu_tx_is_syncing(tx)) */
2245 ASSERT(MUTEX_HELD(&db->db_mtx));
2246
2247 if (db->db_blkptr != NULL)
2248 return;
2249
2250 if (db->db_blkid == DMU_SPILL_BLKID) {
2251 db->db_blkptr = &dn->dn_phys->dn_spill;
2252 BP_ZERO(db->db_blkptr);
2253 return;
2254 }
2255 if (db->db_level == dn->dn_phys->dn_nlevels-1) {
2256 /*
2257 * This buffer was allocated at a time when there were
2258 * no available blkptrs from the dnode, or it was
2259 * inappropriate to hook it in (i.e., nlevels mismatch).
2260 */
2261 ASSERT(db->db_blkid < dn->dn_phys->dn_nblkptr);
2262 ASSERT(db->db_parent == NULL);
2263 db->db_parent = dn->dn_dbuf;
2264 db->db_blkptr = &dn->dn_phys->dn_blkptr[db->db_blkid];
2265 DBUF_VERIFY(db);
2266 } else {
2267 dmu_buf_impl_t *parent = db->db_parent;
2268 int epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
2269
2270 ASSERT(dn->dn_phys->dn_nlevels > 1);
2271 if (parent == NULL) {
2272 mutex_exit(&db->db_mtx);
2273 rw_enter(&dn->dn_struct_rwlock, RW_READER);
2274 (void) dbuf_hold_impl(dn, db->db_level+1,
2275 db->db_blkid >> epbs, FALSE, db, &parent);
2276 rw_exit(&dn->dn_struct_rwlock);
2277 mutex_enter(&db->db_mtx);
2278 db->db_parent = parent;
2279 }
2280 db->db_blkptr = (blkptr_t *)parent->db.db_data +
2281 (db->db_blkid & ((1ULL << epbs) - 1));
2282 DBUF_VERIFY(db);
2283 }
2284 }
2285
2286 static void
2287 dbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
2288 {
2289 dmu_buf_impl_t *db = dr->dr_dbuf;
2290 dnode_t *dn;
2291 zio_t *zio;
2292
2293 ASSERT(dmu_tx_is_syncing(tx));
2294
2295 dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr);
2296
2297 mutex_enter(&db->db_mtx);
2298
2299 ASSERT(db->db_level > 0);
2300 DBUF_VERIFY(db);
2301
2302 /* Read the block if it hasn't been read yet. */
2303 if (db->db_buf == NULL) {
2304 mutex_exit(&db->db_mtx);
2305 (void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED);
2306 mutex_enter(&db->db_mtx);
2307 }
2308 ASSERT3U(db->db_state, ==, DB_CACHED);
2309 ASSERT(db->db_buf != NULL);
2310
2311 DB_DNODE_ENTER(db);
2312 dn = DB_DNODE(db);
2313 /* Indirect block size must match what the dnode thinks it is. */
2314 ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
2315 dbuf_check_blkptr(dn, db);
2316 DB_DNODE_EXIT(db);
2317
2318 /* Provide the pending dirty record to child dbufs */
2319 db->db_data_pending = dr;
2320
2321 mutex_exit(&db->db_mtx);
2322 dbuf_write(dr, db->db_buf, tx);
2323
2324 zio = dr->dr_zio;
2325 mutex_enter(&dr->dt.di.dr_mtx);
2326 dbuf_sync_list(&dr->dt.di.dr_children, tx);
2327 ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
2328 mutex_exit(&dr->dt.di.dr_mtx);
2329 zio_nowait(zio);
2330 }
2331
2332 static void
2333 dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
2334 {
2335 arc_buf_t **datap = &dr->dt.dl.dr_data;
2336 dmu_buf_impl_t *db = dr->dr_dbuf;
2337 dnode_t *dn;
2338 objset_t *os;
2339 uint64_t txg = tx->tx_txg;
2340
2341 ASSERT(dmu_tx_is_syncing(tx));
2342
2343 dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr);
2344
2345 mutex_enter(&db->db_mtx);
2346 /*
2347 * To be synced, we must be dirtied. But we
2348 * might have been freed after being dirtied.
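* A free in the meantime leaves the dbuf DB_UNCACHED (its db_data gone),
* or in DB_FILL if it is already being rewritten with data distinct from
* the dirty record we are syncing; both cases are tolerated below.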
2349 */
2350 if (db->db_state == DB_UNCACHED) {
2351 /* This buffer has been freed since it was dirtied */
2352 ASSERT(db->db.db_data == NULL);
2353 } else if (db->db_state == DB_FILL) {
2354 /* This buffer was freed and is now being re-filled */
2355 ASSERT(db->db.db_data != dr->dt.dl.dr_data);
2356 } else {
2357 ASSERT(db->db_state == DB_CACHED || db->db_state == DB_NOFILL);
2358 }
2359 DBUF_VERIFY(db);
2360
2361 DB_DNODE_ENTER(db);
2362 dn = DB_DNODE(db);
2363
2364 if (db->db_blkid == DMU_SPILL_BLKID) {
2365 mutex_enter(&dn->dn_mtx);
2366 dn->dn_phys->dn_flags |= DNODE_FLAG_SPILL_BLKPTR;
2367 mutex_exit(&dn->dn_mtx);
2368 }
2369
2370 /*
2371 * If this is a bonus buffer, simply copy the bonus data into the
2372 * dnode. It will be written out when the dnode is synced (and it
2373 * will be synced, since it must have been dirty for dbuf_sync to
2374 * be called).
2375 */
2376 if (db->db_blkid == DMU_BONUS_BLKID) {
2377 dbuf_dirty_record_t **drp;
2378
2379 ASSERT(*datap != NULL);
2380 ASSERT0(db->db_level);
2381 ASSERT3U(dn->dn_phys->dn_bonuslen, <=, DN_MAX_BONUSLEN);
2382 bcopy(*datap, DN_BONUS(dn->dn_phys), dn->dn_phys->dn_bonuslen);
2383 DB_DNODE_EXIT(db);
2384
2385 if (*datap != db->db.db_data) {
2386 zio_buf_free(*datap, DN_MAX_BONUSLEN);
2387 arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
2388 }
2389 db->db_data_pending = NULL;
2390 drp = &db->db_last_dirty;
2391 while (*drp != dr)
2392 drp = &(*drp)->dr_next;
2393 ASSERT(dr->dr_next == NULL);
2394 ASSERT(dr->dr_dbuf == db);
2395 *drp = dr->dr_next;
2396 kmem_free(dr, sizeof (dbuf_dirty_record_t));
2397 ASSERT(db->db_dirtycnt > 0);
2398 db->db_dirtycnt -= 1;
2399 dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg);
2400 return;
2401 }
2402
2403 os = dn->dn_objset;
2404
2405 /*
2406 * This function may have dropped the db_mtx lock allowing a dmu_sync
2407 * operation to sneak in. As a result, we need to ensure that we
2408 * don't check the dr_override_state until we have returned from
2409 * dbuf_check_blkptr.
2410 */
2411 dbuf_check_blkptr(dn, db);
2412
2413 /*
2414 * If this buffer is in the middle of an immediate write,
2415 * wait for the synchronous IO to complete.
2416 */
2417 while (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC) {
2418 ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT);
2419 cv_wait(&db->db_changed, &db->db_mtx);
2420 ASSERT(dr->dt.dl.dr_override_state != DR_NOT_OVERRIDDEN);
2421 }
2422
2423 if (db->db_state != DB_NOFILL &&
2424 dn->dn_object != DMU_META_DNODE_OBJECT &&
2425 refcount_count(&db->db_holds) > 1 &&
2426 dr->dt.dl.dr_override_state != DR_OVERRIDDEN &&
2427 *datap == db->db_buf) {
2428 /*
2429 * If this buffer is currently "in use" (i.e., there
2430 * are active holds and db_data still references it),
2431 * then make a copy before we start the write so that
2432 * any modifications from the open txg will not leak
2433 * into this write.
2434 *
2435 * NOTE: this copy does not need to be made for
2436 * objects only modified in the syncing context (e.g.
2437 * DMU_OT_DNODE blocks).
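* Such objects are only ever modified by the syncing thread itself,
* so there is no open-context writer to leak changes into this write.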
2438 */ 2439 int blksz = arc_buf_size(*datap); 2440 arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); 2441 *datap = arc_buf_alloc(os->os_spa, blksz, db, type); 2442 bcopy(db->db.db_data, (*datap)->b_data, blksz); 2443 } 2444 db->db_data_pending = dr; 2445 2446 mutex_exit(&db->db_mtx); 2447 2448 dbuf_write(dr, *datap, tx); 2449 2450 ASSERT(!list_link_active(&dr->dr_dirty_node)); 2451 if (dn->dn_object == DMU_META_DNODE_OBJECT) { 2452 list_insert_tail(&dn->dn_dirty_records[txg&TXG_MASK], dr); 2453 DB_DNODE_EXIT(db); 2454 } else { 2455 /* 2456 * Although zio_nowait() does not "wait for an IO", it does 2457 * initiate the IO. If this is an empty write it seems plausible 2458 * that the IO could actually be completed before the nowait 2459 * returns. We need to DB_DNODE_EXIT() first in case 2460 * zio_nowait() invalidates the dbuf. 2461 */ 2462 DB_DNODE_EXIT(db); 2463 zio_nowait(dr->dr_zio); 2464 } 2465 } 2466 2467 void 2468 dbuf_sync_list(list_t *list, dmu_tx_t *tx) 2469 { 2470 dbuf_dirty_record_t *dr; 2471 2472 while (dr = list_head(list)) { 2473 if (dr->dr_zio != NULL) { 2474 /* 2475 * If we find an already initialized zio then we 2476 * are processing the meta-dnode, and we have finished. 2477 * The dbufs for all dnodes are put back on the list 2478 * during processing, so that we can zio_wait() 2479 * these IOs after initiating all child IOs. 2480 */ 2481 ASSERT3U(dr->dr_dbuf->db.db_object, ==, 2482 DMU_META_DNODE_OBJECT); 2483 break; 2484 } 2485 list_remove(list, dr); 2486 if (dr->dr_dbuf->db_level > 0) 2487 dbuf_sync_indirect(dr, tx); 2488 else 2489 dbuf_sync_leaf(dr, tx); 2490 } 2491 } 2492 2493 /* ARGSUSED */ 2494 static void 2495 dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb) 2496 { 2497 dmu_buf_impl_t *db = vdb; 2498 dnode_t *dn; 2499 blkptr_t *bp = zio->io_bp; 2500 blkptr_t *bp_orig = &zio->io_bp_orig; 2501 spa_t *spa = zio->io_spa; 2502 int64_t delta; 2503 uint64_t fill = 0; 2504 int i; 2505 2506 ASSERT3P(db->db_blkptr, ==, bp); 2507 2508 DB_DNODE_ENTER(db); 2509 dn = DB_DNODE(db); 2510 delta = bp_get_dsize_sync(spa, bp) - bp_get_dsize_sync(spa, bp_orig); 2511 dnode_diduse_space(dn, delta - zio->io_prev_space_delta); 2512 zio->io_prev_space_delta = delta; 2513 2514 if (bp->blk_birth != 0) { 2515 ASSERT((db->db_blkid != DMU_SPILL_BLKID && 2516 BP_GET_TYPE(bp) == dn->dn_type) || 2517 (db->db_blkid == DMU_SPILL_BLKID && 2518 BP_GET_TYPE(bp) == dn->dn_bonustype) || 2519 BP_IS_EMBEDDED(bp)); 2520 ASSERT(BP_GET_LEVEL(bp) == db->db_level); 2521 } 2522 2523 mutex_enter(&db->db_mtx); 2524 2525 #ifdef ZFS_DEBUG 2526 if (db->db_blkid == DMU_SPILL_BLKID) { 2527 ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR); 2528 ASSERT(!(BP_IS_HOLE(db->db_blkptr)) && 2529 db->db_blkptr == &dn->dn_phys->dn_spill); 2530 } 2531 #endif 2532 2533 if (db->db_level == 0) { 2534 mutex_enter(&dn->dn_mtx); 2535 if (db->db_blkid > dn->dn_phys->dn_maxblkid && 2536 db->db_blkid != DMU_SPILL_BLKID) 2537 dn->dn_phys->dn_maxblkid = db->db_blkid; 2538 mutex_exit(&dn->dn_mtx); 2539 2540 if (dn->dn_type == DMU_OT_DNODE) { 2541 dnode_phys_t *dnp = db->db.db_data; 2542 for (i = db->db.db_size >> DNODE_SHIFT; i > 0; 2543 i--, dnp++) { 2544 if (dnp->dn_type != DMU_OT_NONE) 2545 fill++; 2546 } 2547 } else { 2548 if (BP_IS_HOLE(bp)) { 2549 fill = 0; 2550 } else { 2551 fill = 1; 2552 } 2553 } 2554 } else { 2555 blkptr_t *ibp = db->db.db_data; 2556 ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift); 2557 for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, ibp++) { 2558 if (BP_IS_HOLE(ibp)) 2559 continue; 
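/*
 * Accumulate the fill counts of all non-hole children; the sum
 * becomes this indirect block's blk_fill below.
 */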
2560 fill += BP_GET_FILL(ibp); 2561 } 2562 } 2563 DB_DNODE_EXIT(db); 2564 2565 if (!BP_IS_EMBEDDED(bp)) 2566 bp->blk_fill = fill; 2567 2568 mutex_exit(&db->db_mtx); 2569 } 2570 2571 /* 2572 * The SPA will call this callback several times for each zio - once 2573 * for every physical child i/o (zio->io_phys_children times). This 2574 * allows the DMU to monitor the progress of each logical i/o. For example, 2575 * there may be 2 copies of an indirect block, or many fragments of a RAID-Z 2576 * block. There may be a long delay before all copies/fragments are completed, 2577 * so this callback allows us to retire dirty space gradually, as the physical 2578 * i/os complete. 2579 */ 2580 /* ARGSUSED */ 2581 static void 2582 dbuf_write_physdone(zio_t *zio, arc_buf_t *buf, void *arg) 2583 { 2584 dmu_buf_impl_t *db = arg; 2585 objset_t *os = db->db_objset; 2586 dsl_pool_t *dp = dmu_objset_pool(os); 2587 dbuf_dirty_record_t *dr; 2588 int delta = 0; 2589 2590 dr = db->db_data_pending; 2591 ASSERT3U(dr->dr_txg, ==, zio->io_txg); 2592 2593 /* 2594 * The callback will be called io_phys_children times. Retire one 2595 * portion of our dirty space each time we are called. Any rounding 2596 * error will be cleaned up by dsl_pool_sync()'s call to 2597 * dsl_pool_undirty_space(). 2598 */ 2599 delta = dr->dr_accounted / zio->io_phys_children; 2600 dsl_pool_undirty_space(dp, delta, zio->io_txg); 2601 } 2602 2603 /* ARGSUSED */ 2604 static void 2605 dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb) 2606 { 2607 dmu_buf_impl_t *db = vdb; 2608 blkptr_t *bp_orig = &zio->io_bp_orig; 2609 blkptr_t *bp = db->db_blkptr; 2610 objset_t *os = db->db_objset; 2611 dmu_tx_t *tx = os->os_synctx; 2612 dbuf_dirty_record_t **drp, *dr; 2613 2614 ASSERT0(zio->io_error); 2615 ASSERT(db->db_blkptr == bp); 2616 2617 /* 2618 * For nopwrites and rewrites we ensure that the bp matches our 2619 * original and bypass all the accounting. 
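* (In both cases the block pointer is unchanged from what was already
* on disk, so there is no new birth or kill to charge to the dataset.)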
2620 */ 2621 if (zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE)) { 2622 ASSERT(BP_EQUAL(bp, bp_orig)); 2623 } else { 2624 dsl_dataset_t *ds = os->os_dsl_dataset; 2625 (void) dsl_dataset_block_kill(ds, bp_orig, tx, B_TRUE); 2626 dsl_dataset_block_born(ds, bp, tx); 2627 } 2628 2629 mutex_enter(&db->db_mtx); 2630 2631 DBUF_VERIFY(db); 2632 2633 drp = &db->db_last_dirty; 2634 while ((dr = *drp) != db->db_data_pending) 2635 drp = &dr->dr_next; 2636 ASSERT(!list_link_active(&dr->dr_dirty_node)); 2637 ASSERT(dr->dr_dbuf == db); 2638 ASSERT(dr->dr_next == NULL); 2639 *drp = dr->dr_next; 2640 2641 #ifdef ZFS_DEBUG 2642 if (db->db_blkid == DMU_SPILL_BLKID) { 2643 dnode_t *dn; 2644 2645 DB_DNODE_ENTER(db); 2646 dn = DB_DNODE(db); 2647 ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR); 2648 ASSERT(!(BP_IS_HOLE(db->db_blkptr)) && 2649 db->db_blkptr == &dn->dn_phys->dn_spill); 2650 DB_DNODE_EXIT(db); 2651 } 2652 #endif 2653 2654 if (db->db_level == 0) { 2655 ASSERT(db->db_blkid != DMU_BONUS_BLKID); 2656 ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN); 2657 if (db->db_state != DB_NOFILL) { 2658 if (dr->dt.dl.dr_data != db->db_buf) 2659 VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data, 2660 db)); 2661 else if (!arc_released(db->db_buf)) 2662 arc_set_callback(db->db_buf, dbuf_do_evict, db); 2663 } 2664 } else { 2665 dnode_t *dn; 2666 2667 DB_DNODE_ENTER(db); 2668 dn = DB_DNODE(db); 2669 ASSERT(list_head(&dr->dt.di.dr_children) == NULL); 2670 ASSERT3U(db->db.db_size, ==, 1 << dn->dn_phys->dn_indblkshift); 2671 if (!BP_IS_HOLE(db->db_blkptr)) { 2672 int epbs = 2673 dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; 2674 ASSERT3U(db->db_blkid, <=, 2675 dn->dn_phys->dn_maxblkid >> (db->db_level * epbs)); 2676 ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==, 2677 db->db.db_size); 2678 if (!arc_released(db->db_buf)) 2679 arc_set_callback(db->db_buf, dbuf_do_evict, db); 2680 } 2681 DB_DNODE_EXIT(db); 2682 mutex_destroy(&dr->dt.di.dr_mtx); 2683 list_destroy(&dr->dt.di.dr_children); 2684 } 2685 kmem_free(dr, sizeof (dbuf_dirty_record_t)); 2686 2687 cv_broadcast(&db->db_changed); 2688 ASSERT(db->db_dirtycnt > 0); 2689 db->db_dirtycnt -= 1; 2690 db->db_data_pending = NULL; 2691 dbuf_rele_and_unlock(db, (void *)(uintptr_t)tx->tx_txg); 2692 } 2693 2694 static void 2695 dbuf_write_nofill_ready(zio_t *zio) 2696 { 2697 dbuf_write_ready(zio, NULL, zio->io_private); 2698 } 2699 2700 static void 2701 dbuf_write_nofill_done(zio_t *zio) 2702 { 2703 dbuf_write_done(zio, NULL, zio->io_private); 2704 } 2705 2706 static void 2707 dbuf_write_override_ready(zio_t *zio) 2708 { 2709 dbuf_dirty_record_t *dr = zio->io_private; 2710 dmu_buf_impl_t *db = dr->dr_dbuf; 2711 2712 dbuf_write_ready(zio, NULL, db); 2713 } 2714 2715 static void 2716 dbuf_write_override_done(zio_t *zio) 2717 { 2718 dbuf_dirty_record_t *dr = zio->io_private; 2719 dmu_buf_impl_t *db = dr->dr_dbuf; 2720 blkptr_t *obp = &dr->dt.dl.dr_overridden_by; 2721 2722 mutex_enter(&db->db_mtx); 2723 if (!BP_EQUAL(zio->io_bp, obp)) { 2724 if (!BP_IS_HOLE(obp)) 2725 dsl_free(spa_get_dsl(zio->io_spa), zio->io_txg, obp); 2726 arc_release(dr->dt.dl.dr_data, db); 2727 } 2728 mutex_exit(&db->db_mtx); 2729 2730 dbuf_write_done(zio, NULL, db); 2731 } 2732 2733 /* Issue I/O to commit a dirty buffer to disk. 
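 * The dirty record's zio is created as a child of the parent dbuf's
 * pending zio (or of the dnode's zio), so ancestors complete only after
 * all of their children have. Depending on the dirty state this is an
 * override write supplied by open context, a NOFILL write, or a normal
 * arc_write() of db_buf.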
*/ 2734 static void 2735 dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx) 2736 { 2737 dmu_buf_impl_t *db = dr->dr_dbuf; 2738 dnode_t *dn; 2739 objset_t *os; 2740 dmu_buf_impl_t *parent = db->db_parent; 2741 uint64_t txg = tx->tx_txg; 2742 zbookmark_phys_t zb; 2743 zio_prop_t zp; 2744 zio_t *zio; 2745 int wp_flag = 0; 2746 2747 DB_DNODE_ENTER(db); 2748 dn = DB_DNODE(db); 2749 os = dn->dn_objset; 2750 2751 if (db->db_state != DB_NOFILL) { 2752 if (db->db_level > 0 || dn->dn_type == DMU_OT_DNODE) { 2753 /* 2754 * Private object buffers are released here rather 2755 * than in dbuf_dirty() since they are only modified 2756 * in the syncing context and we don't want the 2757 * overhead of making multiple copies of the data. 2758 */ 2759 if (BP_IS_HOLE(db->db_blkptr)) { 2760 arc_buf_thaw(data); 2761 } else { 2762 dbuf_release_bp(db); 2763 } 2764 } 2765 } 2766 2767 if (parent != dn->dn_dbuf) { 2768 /* Our parent is an indirect block. */ 2769 /* We have a dirty parent that has been scheduled for write. */ 2770 ASSERT(parent && parent->db_data_pending); 2771 /* Our parent's buffer is one level closer to the dnode. */ 2772 ASSERT(db->db_level == parent->db_level-1); 2773 /* 2774 * We're about to modify our parent's db_data by modifying 2775 * our block pointer, so the parent must be released. 2776 */ 2777 ASSERT(arc_released(parent->db_buf)); 2778 zio = parent->db_data_pending->dr_zio; 2779 } else { 2780 /* Our parent is the dnode itself. */ 2781 ASSERT((db->db_level == dn->dn_phys->dn_nlevels-1 && 2782 db->db_blkid != DMU_SPILL_BLKID) || 2783 (db->db_blkid == DMU_SPILL_BLKID && db->db_level == 0)); 2784 if (db->db_blkid != DMU_SPILL_BLKID) 2785 ASSERT3P(db->db_blkptr, ==, 2786 &dn->dn_phys->dn_blkptr[db->db_blkid]); 2787 zio = dn->dn_zio; 2788 } 2789 2790 ASSERT(db->db_level == 0 || data == db->db_buf); 2791 ASSERT3U(db->db_blkptr->blk_birth, <=, txg); 2792 ASSERT(zio); 2793 2794 SET_BOOKMARK(&zb, os->os_dsl_dataset ? 2795 os->os_dsl_dataset->ds_object : DMU_META_OBJSET, 2796 db->db.db_object, db->db_level, db->db_blkid); 2797 2798 if (db->db_blkid == DMU_SPILL_BLKID) 2799 wp_flag = WP_SPILL; 2800 wp_flag |= (db->db_state == DB_NOFILL) ? WP_NOFILL : 0; 2801 2802 dmu_write_policy(os, dn, db->db_level, wp_flag, &zp); 2803 DB_DNODE_EXIT(db); 2804 2805 if (db->db_level == 0 && 2806 dr->dt.dl.dr_override_state == DR_OVERRIDDEN) { 2807 /* 2808 * The BP for this block has been provided by open context 2809 * (by dmu_sync() or dmu_buf_write_embedded()). 2810 */ 2811 void *contents = (data != NULL) ? 
data->b_data : NULL; 2812 2813 dr->dr_zio = zio_write(zio, os->os_spa, txg, 2814 db->db_blkptr, contents, db->db.db_size, &zp, 2815 dbuf_write_override_ready, NULL, dbuf_write_override_done, 2816 dr, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb); 2817 mutex_enter(&db->db_mtx); 2818 dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN; 2819 zio_write_override(dr->dr_zio, &dr->dt.dl.dr_overridden_by, 2820 dr->dt.dl.dr_copies, dr->dt.dl.dr_nopwrite); 2821 mutex_exit(&db->db_mtx); 2822 } else if (db->db_state == DB_NOFILL) { 2823 ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF || 2824 zp.zp_checksum == ZIO_CHECKSUM_NOPARITY); 2825 dr->dr_zio = zio_write(zio, os->os_spa, txg, 2826 db->db_blkptr, NULL, db->db.db_size, &zp, 2827 dbuf_write_nofill_ready, NULL, dbuf_write_nofill_done, db, 2828 ZIO_PRIORITY_ASYNC_WRITE, 2829 ZIO_FLAG_MUSTSUCCEED | ZIO_FLAG_NODATA, &zb); 2830 } else { 2831 ASSERT(arc_released(data)); 2832 dr->dr_zio = arc_write(zio, os->os_spa, txg, 2833 db->db_blkptr, data, DBUF_IS_L2CACHEABLE(db), 2834 DBUF_IS_L2COMPRESSIBLE(db), &zp, dbuf_write_ready, 2835 dbuf_write_physdone, dbuf_write_done, db, 2836 ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb); 2837 } 2838 } 2839
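/*
 * Illustrative sketch (not part of the compiled code): the typical caller
 * pattern for the hold/release interfaces above, roughly what dmu_buf_hold()
 * does once it has a dnode. FTAG is the usual stack-local hold tag; error
 * handling is abbreviated.
 *
 *	dmu_buf_impl_t *db;
 *	int err;
 *
 *	rw_enter(&dn->dn_struct_rwlock, RW_READER);
 *	err = dbuf_hold_impl(dn, 0, blkid, FALSE, FTAG, &db);
 *	rw_exit(&dn->dn_struct_rwlock);
 *	if (err == 0) {
 *		err = dbuf_read(db, NULL, DB_RF_CANFAIL);
 *		if (err == 0) {
 *			... read or modify db->db.db_data ...
 *		}
 *		dbuf_rele(db, FTAG);
 *	}
 */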