/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
 * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
 * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
 * Copyright (c) 2013, Joyent, Inc. All rights reserved.
 * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
 */

#include <sys/zfs_context.h>
#include <sys/dmu.h>
#include <sys/dmu_send.h>
#include <sys/dmu_impl.h>
#include <sys/dbuf.h>
#include <sys/dmu_objset.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_dir.h>
#include <sys/dmu_tx.h>
#include <sys/spa.h>
#include <sys/zio.h>
#include <sys/dmu_zfetch.h>
#include <sys/sa.h>
#include <sys/sa_impl.h>
#include <sys/zfeature.h>
#include <sys/blkptr.h>
#include <sys/range_tree.h>

/*
 * Number of times that zfs_free_range() took the slow path while doing
 * a zfs receive.  A nonzero value indicates a potential performance problem.
 */
uint64_t zfs_free_range_recv_miss;

static void dbuf_destroy(dmu_buf_impl_t *db);
static boolean_t dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx);

#ifndef __lint
extern inline void dmu_buf_init_user(dmu_buf_user_t *dbu,
    dmu_buf_evict_func_t *evict_func, dmu_buf_t **clear_on_evict_dbufp);
#endif /* ! __lint */

/*
 * Global data structures and functions for the dbuf cache.
 */
static kmem_cache_t *dbuf_cache;
static taskq_t *dbu_evict_taskq;

/* ARGSUSED */
static int
dbuf_cons(void *vdb, void *unused, int kmflag)
{
	dmu_buf_impl_t *db = vdb;
	bzero(db, sizeof (dmu_buf_impl_t));

	mutex_init(&db->db_mtx, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&db->db_changed, NULL, CV_DEFAULT, NULL);
	refcount_create(&db->db_holds);

	return (0);
}

/* ARGSUSED */
static void
dbuf_dest(void *vdb, void *unused)
{
	dmu_buf_impl_t *db = vdb;
	mutex_destroy(&db->db_mtx);
	cv_destroy(&db->db_changed);
	refcount_destroy(&db->db_holds);
}

/*
 * dbuf hash table routines
 */
static dbuf_hash_table_t dbuf_hash_table;

static uint64_t dbuf_hash_count;

static uint64_t
dbuf_hash(void *os, uint64_t obj, uint8_t lvl, uint64_t blkid)
{
	uintptr_t osv = (uintptr_t)os;
	uint64_t crc = -1ULL;

	ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (lvl)) & 0xFF];
	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (osv >> 6)) & 0xFF];
	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 0)) & 0xFF];
	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 8)) & 0xFF];
	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 0)) & 0xFF];
	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 8)) & 0xFF];

	crc ^= (osv >> 14) ^ (obj >> 16) ^ (blkid >> 16);

	return (crc);
}

#define	DBUF_HASH(os, obj, level, blkid) dbuf_hash(os, obj, level, blkid)

#define	DBUF_EQUAL(dbuf, os, obj, level, blkid)		\
	((dbuf)->db.db_object == (obj) &&		\
	(dbuf)->db_objset == (os) &&			\
	(dbuf)->db_level == (level) &&			\
	(dbuf)->db_blkid == (blkid))

dmu_buf_impl_t *
dbuf_find(dnode_t *dn, uint8_t level, uint64_t blkid)
{
	dbuf_hash_table_t *h = &dbuf_hash_table;
	objset_t *os = dn->dn_objset;
	uint64_t obj = dn->dn_object;
	uint64_t hv = DBUF_HASH(os, obj, level, blkid);
	uint64_t idx = hv & h->hash_table_mask;
	dmu_buf_impl_t *db;

	mutex_enter(DBUF_HASH_MUTEX(h, idx));
	for (db = h->hash_table[idx]; db != NULL; db = db->db_hash_next) {
		if (DBUF_EQUAL(db, os, obj, level, blkid)) {
			mutex_enter(&db->db_mtx);
			if (db->db_state != DB_EVICTING) {
				mutex_exit(DBUF_HASH_MUTEX(h, idx));
				return (db);
			}
			mutex_exit(&db->db_mtx);
		}
	}
	mutex_exit(DBUF_HASH_MUTEX(h, idx));
	return (NULL);
}

/*
 * Insert an entry into the hash table.  If there is already an element
 * equal to elem in the hash table, then the already existing element
 * will be returned and the new element will not be inserted.
 * Otherwise returns NULL.
 */
static dmu_buf_impl_t *
dbuf_hash_insert(dmu_buf_impl_t *db)
{
	dbuf_hash_table_t *h = &dbuf_hash_table;
	objset_t *os = db->db_objset;
	uint64_t obj = db->db.db_object;
	int level = db->db_level;
	uint64_t blkid = db->db_blkid;
	uint64_t hv = DBUF_HASH(os, obj, level, blkid);
	uint64_t idx = hv & h->hash_table_mask;
	dmu_buf_impl_t *dbf;

	mutex_enter(DBUF_HASH_MUTEX(h, idx));
	for (dbf = h->hash_table[idx]; dbf != NULL; dbf = dbf->db_hash_next) {
		if (DBUF_EQUAL(dbf, os, obj, level, blkid)) {
			mutex_enter(&dbf->db_mtx);
			if (dbf->db_state != DB_EVICTING) {
				mutex_exit(DBUF_HASH_MUTEX(h, idx));
				return (dbf);
			}
			mutex_exit(&dbf->db_mtx);
		}
	}

	mutex_enter(&db->db_mtx);
	db->db_hash_next = h->hash_table[idx];
	h->hash_table[idx] = db;
	mutex_exit(DBUF_HASH_MUTEX(h, idx));
	atomic_inc_64(&dbuf_hash_count);

	return (NULL);
}

/*
 * Remove an entry from the hash table.  It must be in the EVICTING state.
 */
static void
dbuf_hash_remove(dmu_buf_impl_t *db)
{
	dbuf_hash_table_t *h = &dbuf_hash_table;
	uint64_t hv = DBUF_HASH(db->db_objset, db->db.db_object,
	    db->db_level, db->db_blkid);
	uint64_t idx = hv & h->hash_table_mask;
	dmu_buf_impl_t *dbf, **dbp;

	/*
	 * We mustn't hold db_mtx to maintain lock ordering:
	 * DBUF_HASH_MUTEX > db_mtx.
	 */
	ASSERT(refcount_is_zero(&db->db_holds));
	ASSERT(db->db_state == DB_EVICTING);
	ASSERT(!MUTEX_HELD(&db->db_mtx));

	mutex_enter(DBUF_HASH_MUTEX(h, idx));
	dbp = &h->hash_table[idx];
	while ((dbf = *dbp) != db) {
		dbp = &dbf->db_hash_next;
		ASSERT(dbf != NULL);
	}
	*dbp = db->db_hash_next;
	db->db_hash_next = NULL;
	mutex_exit(DBUF_HASH_MUTEX(h, idx));
	atomic_dec_64(&dbuf_hash_count);
}

static arc_evict_func_t dbuf_do_evict;

typedef enum {
	DBVU_EVICTING,
	DBVU_NOT_EVICTING
} dbvu_verify_type_t;

static void
dbuf_verify_user(dmu_buf_impl_t *db, dbvu_verify_type_t verify_type)
{
#ifdef ZFS_DEBUG
	int64_t holds;

	if (db->db_user == NULL)
		return;

	/* Only data blocks support the attachment of user data. */
	ASSERT(db->db_level == 0);

	/* Clients must resolve a dbuf before attaching user data. */
	ASSERT(db->db.db_data != NULL);
	ASSERT3U(db->db_state, ==, DB_CACHED);

	holds = refcount_count(&db->db_holds);
	if (verify_type == DBVU_EVICTING) {
		/*
		 * Immediate eviction occurs when holds == dirtycnt.
		 * For normal eviction buffers, holds is zero on
		 * eviction, except when dbuf_fix_old_data() calls
		 * dbuf_clear_data().  However, the hold count can grow
		 * during eviction even though db_mtx is held (see
		 * dmu_bonus_hold() for an example), so we can only
		 * test the generic invariant that holds >= dirtycnt.
		 */
		ASSERT3U(holds, >=, db->db_dirtycnt);
	} else {
		if (db->db_immediate_evict == TRUE)
			ASSERT3U(holds, >=, db->db_dirtycnt);
		else
			ASSERT3U(holds, >, 0);
	}
#endif
}

static void
dbuf_evict_user(dmu_buf_impl_t *db)
{
	dmu_buf_user_t *dbu = db->db_user;

	ASSERT(MUTEX_HELD(&db->db_mtx));

	if (dbu == NULL)
		return;

	dbuf_verify_user(db, DBVU_EVICTING);
	db->db_user = NULL;

#ifdef ZFS_DEBUG
	if (dbu->dbu_clear_on_evict_dbufp != NULL)
		*dbu->dbu_clear_on_evict_dbufp = NULL;
#endif

	/*
	 * Invoke the callback from a taskq to avoid lock order reversals
	 * and limit stack depth.
	 */
	taskq_dispatch_ent(dbu_evict_taskq, dbu->dbu_evict_func, dbu, 0,
	    &dbu->dbu_tqent);
}

boolean_t
dbuf_is_metadata(dmu_buf_impl_t *db)
{
	if (db->db_level > 0) {
		return (B_TRUE);
	} else {
		boolean_t is_metadata;

		DB_DNODE_ENTER(db);
		is_metadata = DMU_OT_IS_METADATA(DB_DNODE(db)->dn_type);
		DB_DNODE_EXIT(db);

		return (is_metadata);
	}
}

void
dbuf_evict(dmu_buf_impl_t *db)
{
	ASSERT(MUTEX_HELD(&db->db_mtx));
	ASSERT(db->db_buf == NULL);
	ASSERT(db->db_data_pending == NULL);

	dbuf_clear(db);
	dbuf_destroy(db);
}

void
dbuf_init(void)
{
	uint64_t hsize = 1ULL << 16;
	dbuf_hash_table_t *h = &dbuf_hash_table;
	int i;

	/*
	 * The hash table is big enough to fill all of physical memory
	 * with an average 4K block size.  The table will take up
	 * totalmem*sizeof(void*)/4K (i.e. 2MB/GB with 8-byte pointers).
	 */
	while (hsize * 4096 < physmem * PAGESIZE)
		hsize <<= 1;

retry:
	h->hash_table_mask = hsize - 1;
	h->hash_table = kmem_zalloc(hsize * sizeof (void *), KM_NOSLEEP);
	if (h->hash_table == NULL) {
		/* XXX - we should really return an error instead of assert */
		ASSERT(hsize > (1ULL << 10));
		hsize >>= 1;
		goto retry;
	}

	dbuf_cache = kmem_cache_create("dmu_buf_impl_t",
	    sizeof (dmu_buf_impl_t),
	    0, dbuf_cons, dbuf_dest, NULL, NULL, NULL, 0);

	for (i = 0; i < DBUF_MUTEXES; i++)
		mutex_init(&h->hash_mutexes[i], NULL, MUTEX_DEFAULT, NULL);

	/*
	 * All entries are queued via taskq_dispatch_ent(), so min/maxalloc
	 * configuration is not required.
	 */
	dbu_evict_taskq = taskq_create("dbu_evict", 1, minclsyspri, 0, 0, 0);
}

void
dbuf_fini(void)
{
	dbuf_hash_table_t *h = &dbuf_hash_table;
	int i;

	for (i = 0; i < DBUF_MUTEXES; i++)
		mutex_destroy(&h->hash_mutexes[i]);
	kmem_free(h->hash_table, (h->hash_table_mask + 1) * sizeof (void *));
	kmem_cache_destroy(dbuf_cache);
	taskq_destroy(dbu_evict_taskq);
}

/*
 * Other stuff.
 */

#ifdef ZFS_DEBUG
static void
dbuf_verify(dmu_buf_impl_t *db)
{
	dnode_t *dn;
	dbuf_dirty_record_t *dr;

	ASSERT(MUTEX_HELD(&db->db_mtx));

	if (!(zfs_flags & ZFS_DEBUG_DBUF_VERIFY))
		return;

	ASSERT(db->db_objset != NULL);
	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);
	if (dn == NULL) {
		ASSERT(db->db_parent == NULL);
		ASSERT(db->db_blkptr == NULL);
	} else {
		ASSERT3U(db->db.db_object, ==, dn->dn_object);
		ASSERT3P(db->db_objset, ==, dn->dn_objset);
		ASSERT3U(db->db_level, <, dn->dn_nlevels);
		ASSERT(db->db_blkid == DMU_BONUS_BLKID ||
		    db->db_blkid == DMU_SPILL_BLKID ||
		    !avl_is_empty(&dn->dn_dbufs));
	}
	if (db->db_blkid == DMU_BONUS_BLKID) {
		ASSERT(dn != NULL);
		ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
		ASSERT3U(db->db.db_offset, ==, DMU_BONUS_BLKID);
	} else if (db->db_blkid == DMU_SPILL_BLKID) {
		ASSERT(dn != NULL);
		ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
		ASSERT0(db->db.db_offset);
	} else {
		ASSERT3U(db->db.db_offset, ==, db->db_blkid * db->db.db_size);
	}

	for (dr = db->db_data_pending; dr != NULL; dr = dr->dr_next)
		ASSERT(dr->dr_dbuf == db);

	for (dr = db->db_last_dirty; dr != NULL; dr = dr->dr_next)
		ASSERT(dr->dr_dbuf == db);

	/*
	 * We can't assert that db_size matches dn_datablksz because it
	 * can be momentarily different when another thread is doing
	 * dnode_set_blksz().
	 */
	if (db->db_level == 0 && db->db.db_object == DMU_META_DNODE_OBJECT) {
		dr = db->db_data_pending;
		/*
		 * It should only be modified in syncing context, so
		 * make sure we only have one copy of the data.
		 */
		ASSERT(dr == NULL || dr->dt.dl.dr_data == db->db_buf);
	}

	/* verify db->db_blkptr */
	if (db->db_blkptr) {
		if (db->db_parent == dn->dn_dbuf) {
			/* db is pointed to by the dnode */
			/* ASSERT3U(db->db_blkid, <, dn->dn_nblkptr); */
			if (DMU_OBJECT_IS_SPECIAL(db->db.db_object))
				ASSERT(db->db_parent == NULL);
			else
				ASSERT(db->db_parent != NULL);
			if (db->db_blkid != DMU_SPILL_BLKID)
				ASSERT3P(db->db_blkptr, ==,
				    &dn->dn_phys->dn_blkptr[db->db_blkid]);
		} else {
			/* db is pointed to by an indirect block */
			int epb = db->db_parent->db.db_size >> SPA_BLKPTRSHIFT;
			ASSERT3U(db->db_parent->db_level, ==, db->db_level+1);
			ASSERT3U(db->db_parent->db.db_object, ==,
			    db->db.db_object);
			/*
			 * dnode_grow_indblksz() can make this fail if we don't
			 * have the struct_rwlock.  XXX indblksz no longer
			 * grows.  safe to do this now?
			 */
			if (RW_WRITE_HELD(&dn->dn_struct_rwlock)) {
				ASSERT3P(db->db_blkptr, ==,
				    ((blkptr_t *)db->db_parent->db.db_data +
				    db->db_blkid % epb));
			}
		}
	}
	if ((db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr)) &&
	    (db->db_buf == NULL || db->db_buf->b_data) &&
	    db->db.db_data && db->db_blkid != DMU_BONUS_BLKID &&
	    db->db_state != DB_FILL && !dn->dn_free_txg) {
		/*
		 * If the blkptr isn't set but they have nonzero data,
		 * it had better be dirty, otherwise we'll lose that
		 * data when we evict this buffer.
		 */
		if (db->db_dirtycnt == 0) {
			uint64_t *buf = db->db.db_data;
			int i;

			for (i = 0; i < db->db.db_size >> 3; i++) {
				ASSERT(buf[i] == 0);
			}
		}
	}
	DB_DNODE_EXIT(db);
}
#endif

static void
dbuf_clear_data(dmu_buf_impl_t *db)
{
	ASSERT(MUTEX_HELD(&db->db_mtx));
	dbuf_evict_user(db);
	db->db_buf = NULL;
	db->db.db_data = NULL;
	if (db->db_state != DB_NOFILL)
		db->db_state = DB_UNCACHED;
}

static void
dbuf_set_data(dmu_buf_impl_t *db, arc_buf_t *buf)
{
	ASSERT(MUTEX_HELD(&db->db_mtx));
	ASSERT(buf != NULL);

	db->db_buf = buf;
	ASSERT(buf->b_data != NULL);
	db->db.db_data = buf->b_data;
	if (!arc_released(buf))
		arc_set_callback(buf, dbuf_do_evict, db);
}

/*
 * Loan out an arc_buf for read.  Return the loaned arc_buf.
 */
arc_buf_t *
dbuf_loan_arcbuf(dmu_buf_impl_t *db)
{
	arc_buf_t *abuf;

	mutex_enter(&db->db_mtx);
	if (arc_released(db->db_buf) || refcount_count(&db->db_holds) > 1) {
		int blksz = db->db.db_size;
		spa_t *spa = db->db_objset->os_spa;

		mutex_exit(&db->db_mtx);
		abuf = arc_loan_buf(spa, blksz);
		bcopy(db->db.db_data, abuf->b_data, blksz);
	} else {
		abuf = db->db_buf;
		arc_loan_inuse_buf(abuf, db);
		dbuf_clear_data(db);
		mutex_exit(&db->db_mtx);
	}
	return (abuf);
}

uint64_t
dbuf_whichblock(dnode_t *dn, uint64_t offset)
{
	if (dn->dn_datablkshift) {
		return (offset >> dn->dn_datablkshift);
	} else {
		ASSERT3U(offset, <, dn->dn_datablksz);
		return (0);
	}
}

static void
dbuf_read_done(zio_t *zio, arc_buf_t *buf, void *vdb)
{
	dmu_buf_impl_t *db = vdb;

	mutex_enter(&db->db_mtx);
	ASSERT3U(db->db_state, ==, DB_READ);
	/*
	 * All reads are synchronous, so we must have a hold on the dbuf
	 */
	ASSERT(refcount_count(&db->db_holds) > 0);
	ASSERT(db->db_buf == NULL);
	ASSERT(db->db.db_data == NULL);
	if (db->db_level == 0 && db->db_freed_in_flight) {
		/* we were freed in flight; disregard any error */
		arc_release(buf, db);
		bzero(buf->b_data, db->db.db_size);
		arc_buf_freeze(buf);
		db->db_freed_in_flight = FALSE;
		dbuf_set_data(db, buf);
		db->db_state = DB_CACHED;
	} else if (zio == NULL || zio->io_error == 0) {
		dbuf_set_data(db, buf);
		db->db_state = DB_CACHED;
	} else {
		ASSERT(db->db_blkid != DMU_BONUS_BLKID);
		ASSERT3P(db->db_buf, ==, NULL);
		VERIFY(arc_buf_remove_ref(buf, db));
		db->db_state = DB_UNCACHED;
	}
	cv_broadcast(&db->db_changed);
	dbuf_rele_and_unlock(db, NULL);
}

static void
dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags)
{
	dnode_t *dn;
	zbookmark_phys_t zb;
	arc_flags_t aflags = ARC_FLAG_NOWAIT;

	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);
	ASSERT(!refcount_is_zero(&db->db_holds));
	/* We need the struct_rwlock to prevent db_blkptr from changing. */
	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
	ASSERT(MUTEX_HELD(&db->db_mtx));
	ASSERT(db->db_state == DB_UNCACHED);
	ASSERT(db->db_buf == NULL);

	if (db->db_blkid == DMU_BONUS_BLKID) {
		int bonuslen = MIN(dn->dn_bonuslen, dn->dn_phys->dn_bonuslen);

		ASSERT3U(bonuslen, <=, db->db.db_size);
		db->db.db_data = zio_buf_alloc(DN_MAX_BONUSLEN);
		arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
		if (bonuslen < DN_MAX_BONUSLEN)
			bzero(db->db.db_data, DN_MAX_BONUSLEN);
		if (bonuslen)
			bcopy(DN_BONUS(dn->dn_phys), db->db.db_data, bonuslen);
		DB_DNODE_EXIT(db);
		db->db_state = DB_CACHED;
		mutex_exit(&db->db_mtx);
		return;
	}

	/*
	 * Recheck BP_IS_HOLE() after dnode_block_freed() in case dnode_sync()
	 * processes the delete record and clears the bp while we are waiting
	 * for the dn_mtx (resulting in a "no" from block_freed).
	 */
	if (db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr) ||
	    (db->db_level == 0 && (dnode_block_freed(dn, db->db_blkid) ||
	    BP_IS_HOLE(db->db_blkptr)))) {
		arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);

		DB_DNODE_EXIT(db);
		dbuf_set_data(db, arc_buf_alloc(db->db_objset->os_spa,
		    db->db.db_size, db, type));
		bzero(db->db.db_data, db->db.db_size);
		db->db_state = DB_CACHED;
		*flags |= DB_RF_CACHED;
		mutex_exit(&db->db_mtx);
		return;
	}

	DB_DNODE_EXIT(db);

	db->db_state = DB_READ;
	mutex_exit(&db->db_mtx);

	if (DBUF_IS_L2CACHEABLE(db))
		aflags |= ARC_FLAG_L2CACHE;
	if (DBUF_IS_L2COMPRESSIBLE(db))
		aflags |= ARC_FLAG_L2COMPRESS;

	SET_BOOKMARK(&zb, db->db_objset->os_dsl_dataset ?
	    db->db_objset->os_dsl_dataset->ds_object : DMU_META_OBJSET,
	    db->db.db_object, db->db_level, db->db_blkid);

	dbuf_add_ref(db, NULL);

	(void) arc_read(zio, db->db_objset->os_spa, db->db_blkptr,
	    dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ,
	    (*flags & DB_RF_CANFAIL) ? ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED,
	    &aflags, &zb);
	if (aflags & ARC_FLAG_CACHED)
		*flags |= DB_RF_CACHED;
}

int
dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
{
	int err = 0;
	boolean_t havepzio = (zio != NULL);
	boolean_t prefetch;
	dnode_t *dn;

	/*
	 * We don't have to hold the mutex to check db_state because it
	 * can't be freed while we have a hold on the buffer.
	 */
	ASSERT(!refcount_is_zero(&db->db_holds));

	if (db->db_state == DB_NOFILL)
		return (SET_ERROR(EIO));

	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);
	if ((flags & DB_RF_HAVESTRUCT) == 0)
		rw_enter(&dn->dn_struct_rwlock, RW_READER);

	prefetch = db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
	    (flags & DB_RF_NOPREFETCH) == 0 && dn != NULL &&
	    DBUF_IS_CACHEABLE(db);

	mutex_enter(&db->db_mtx);
	if (db->db_state == DB_CACHED) {
		mutex_exit(&db->db_mtx);
		if (prefetch)
			dmu_zfetch(&dn->dn_zfetch, db->db.db_offset,
			    db->db.db_size, TRUE);
		if ((flags & DB_RF_HAVESTRUCT) == 0)
			rw_exit(&dn->dn_struct_rwlock);
		DB_DNODE_EXIT(db);
	} else if (db->db_state == DB_UNCACHED) {
		spa_t *spa = dn->dn_objset->os_spa;

		if (zio == NULL)
			zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
		dbuf_read_impl(db, zio, &flags);

		/* dbuf_read_impl has dropped db_mtx for us */

		if (prefetch)
			dmu_zfetch(&dn->dn_zfetch, db->db.db_offset,
			    db->db.db_size, flags & DB_RF_CACHED);

		if ((flags & DB_RF_HAVESTRUCT) == 0)
			rw_exit(&dn->dn_struct_rwlock);
		DB_DNODE_EXIT(db);

		if (!havepzio)
			err = zio_wait(zio);
	} else {
		/*
		 * Another reader came in while the dbuf was in flight
		 * between UNCACHED and CACHED.  Either a writer will finish
		 * writing the buffer (sending the dbuf to CACHED) or the
		 * first reader's request will reach the read_done callback
		 * and send the dbuf to CACHED.  Otherwise, a failure
		 * occurred and the dbuf went to UNCACHED.
		 */
		mutex_exit(&db->db_mtx);
		if (prefetch)
			dmu_zfetch(&dn->dn_zfetch, db->db.db_offset,
			    db->db.db_size, TRUE);
		if ((flags & DB_RF_HAVESTRUCT) == 0)
			rw_exit(&dn->dn_struct_rwlock);
		DB_DNODE_EXIT(db);

		/* Skip the wait per the caller's request. */
		mutex_enter(&db->db_mtx);
		if ((flags & DB_RF_NEVERWAIT) == 0) {
			while (db->db_state == DB_READ ||
			    db->db_state == DB_FILL) {
				ASSERT(db->db_state == DB_READ ||
				    (flags & DB_RF_HAVESTRUCT) == 0);
				DTRACE_PROBE2(blocked__read, dmu_buf_impl_t *,
				    db, zio_t *, zio);
				cv_wait(&db->db_changed, &db->db_mtx);
			}
			if (db->db_state == DB_UNCACHED)
				err = SET_ERROR(EIO);
		}
		mutex_exit(&db->db_mtx);
	}

	ASSERT(err || havepzio || db->db_state == DB_CACHED);
	return (err);
}

static void
dbuf_noread(dmu_buf_impl_t *db)
{
	ASSERT(!refcount_is_zero(&db->db_holds));
	ASSERT(db->db_blkid != DMU_BONUS_BLKID);
	mutex_enter(&db->db_mtx);
	while (db->db_state == DB_READ || db->db_state == DB_FILL)
		cv_wait(&db->db_changed, &db->db_mtx);
	if (db->db_state == DB_UNCACHED) {
		arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
		spa_t *spa = db->db_objset->os_spa;

		ASSERT(db->db_buf == NULL);
		ASSERT(db->db.db_data == NULL);
		dbuf_set_data(db, arc_buf_alloc(spa, db->db.db_size, db, type));
		db->db_state = DB_FILL;
	} else if (db->db_state == DB_NOFILL) {
		dbuf_clear_data(db);
	} else {
		ASSERT3U(db->db_state, ==, DB_CACHED);
	}
	mutex_exit(&db->db_mtx);
}

/*
 * This is our just-in-time copy function.  It makes a copy of buffers
 * that have been modified in a previous transaction group before we
 * modify them in the current active group.
 *
 * This function is used in two places: when we are dirtying a
 * buffer for the first time in a txg, and when we are freeing
 * a range in a dnode that includes this buffer.
 *
 * Note that when we are called from dbuf_free_range() we do
 * not put a hold on the buffer, we just traverse the active
 * dbuf list for the dnode.
 */
static void
dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg)
{
	dbuf_dirty_record_t *dr = db->db_last_dirty;

	ASSERT(MUTEX_HELD(&db->db_mtx));
	ASSERT(db->db.db_data != NULL);
	ASSERT(db->db_level == 0);
	ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT);

	if (dr == NULL ||
	    (dr->dt.dl.dr_data !=
	    ((db->db_blkid == DMU_BONUS_BLKID) ? db->db.db_data : db->db_buf)))
		return;

	/*
	 * If the last dirty record for this dbuf has not yet synced
	 * and it's referencing the dbuf data, either:
	 *	reset the reference to point to a new copy,
	 * or (if there are no active holders)
	 *	just null out the current db_data pointer.
	 */
	ASSERT(dr->dr_txg >= txg - 2);
	if (db->db_blkid == DMU_BONUS_BLKID) {
		/* Note that the data bufs here are zio_bufs */
		dr->dt.dl.dr_data = zio_buf_alloc(DN_MAX_BONUSLEN);
		arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
		bcopy(db->db.db_data, dr->dt.dl.dr_data, DN_MAX_BONUSLEN);
	} else if (refcount_count(&db->db_holds) > db->db_dirtycnt) {
		int size = db->db.db_size;
		arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
		spa_t *spa = db->db_objset->os_spa;

		dr->dt.dl.dr_data = arc_buf_alloc(spa, size, db, type);
		bcopy(db->db.db_data, dr->dt.dl.dr_data->b_data, size);
	} else {
		dbuf_clear_data(db);
	}
}

void
dbuf_unoverride(dbuf_dirty_record_t *dr)
{
	dmu_buf_impl_t *db = dr->dr_dbuf;
	blkptr_t *bp = &dr->dt.dl.dr_overridden_by;
	uint64_t txg = dr->dr_txg;

	ASSERT(MUTEX_HELD(&db->db_mtx));
	ASSERT(dr->dt.dl.dr_override_state != DR_IN_DMU_SYNC);
	ASSERT(db->db_level == 0);

	if (db->db_blkid == DMU_BONUS_BLKID ||
	    dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN)
		return;

	ASSERT(db->db_data_pending != dr);

	/* free this block */
	if (!BP_IS_HOLE(bp) && !dr->dt.dl.dr_nopwrite)
		zio_free(db->db_objset->os_spa, txg, bp);

	dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
	dr->dt.dl.dr_nopwrite = B_FALSE;

	/*
	 * Release the already-written buffer, so we leave it in
	 * a consistent dirty state.  Note that all callers are
	 * modifying the buffer, so they will immediately do
	 * another (redundant) arc_release().  Therefore, leave
	 * the buf thawed to save the effort of freezing &
	 * immediately re-thawing it.
	 */
	arc_release(dr->dt.dl.dr_data, db);
}

/*
 * Evict (if it's unreferenced) or clear (if it's referenced) any level-0
 * data blocks in the free range, so that any future readers will find
 * empty blocks.
 *
 * This is a no-op if the dataset is in the middle of an incremental
 * receive; see comment below for details.
 */
void
dbuf_free_range(dnode_t *dn, uint64_t start_blkid, uint64_t end_blkid,
    dmu_tx_t *tx)
{
	dmu_buf_impl_t db_search;
	dmu_buf_impl_t *db, *db_next;
	uint64_t txg = tx->tx_txg;
	avl_index_t where;

	if (end_blkid > dn->dn_maxblkid && (end_blkid != DMU_SPILL_BLKID))
		end_blkid = dn->dn_maxblkid;
	dprintf_dnode(dn, "start=%llu end=%llu\n", start_blkid, end_blkid);

	db_search.db_level = 0;
	db_search.db_blkid = start_blkid;
	db_search.db_state = DB_SEARCH;

	mutex_enter(&dn->dn_dbufs_mtx);
	if (start_blkid >= dn->dn_unlisted_l0_blkid) {
		/* There can't be any dbufs in this range; no need to search. */
#ifdef DEBUG
		db = avl_find(&dn->dn_dbufs, &db_search, &where);
		ASSERT3P(db, ==, NULL);
		db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER);
		ASSERT(db == NULL || db->db_level > 0);
#endif
		mutex_exit(&dn->dn_dbufs_mtx);
		return;
	} else if (dmu_objset_is_receiving(dn->dn_objset)) {
		/*
		 * If we are receiving, we expect there to be no dbufs in
		 * the range to be freed, because receive modifies each
		 * block at most once, and in offset order.  If this is
		 * not the case, it can lead to performance problems,
		 * so note that we unexpectedly took the slow path.
		 */
		atomic_inc_64(&zfs_free_range_recv_miss);
	}

	db = avl_find(&dn->dn_dbufs, &db_search, &where);
	ASSERT3P(db, ==, NULL);
	db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER);

	for (; db != NULL; db = db_next) {
		db_next = AVL_NEXT(&dn->dn_dbufs, db);
		ASSERT(db->db_blkid != DMU_BONUS_BLKID);

		if (db->db_level != 0 || db->db_blkid > end_blkid) {
			break;
		}
		ASSERT3U(db->db_blkid, >=, start_blkid);

		/* found a level 0 buffer in the range */
		mutex_enter(&db->db_mtx);
		if (dbuf_undirty(db, tx)) {
			/* mutex has been dropped and dbuf destroyed */
			continue;
		}

		if (db->db_state == DB_UNCACHED ||
		    db->db_state == DB_NOFILL ||
		    db->db_state == DB_EVICTING) {
			ASSERT(db->db.db_data == NULL);
			mutex_exit(&db->db_mtx);
			continue;
		}
		if (db->db_state == DB_READ || db->db_state == DB_FILL) {
			/* will be handled in dbuf_read_done or dbuf_rele */
			db->db_freed_in_flight = TRUE;
			mutex_exit(&db->db_mtx);
			continue;
		}
		if (refcount_count(&db->db_holds) == 0) {
			ASSERT(db->db_buf);
			dbuf_clear(db);
			continue;
		}
		/* The dbuf is referenced */

		if (db->db_last_dirty != NULL) {
			dbuf_dirty_record_t *dr = db->db_last_dirty;

			if (dr->dr_txg == txg) {
				/*
				 * This buffer is "in-use", re-adjust the file
				 * size to reflect that this buffer may
				 * contain new data when we sync.
				 */
				if (db->db_blkid != DMU_SPILL_BLKID &&
				    db->db_blkid > dn->dn_maxblkid)
					dn->dn_maxblkid = db->db_blkid;
				dbuf_unoverride(dr);
			} else {
				/*
				 * This dbuf is not dirty in the open context.
				 * Either uncache it (if it's not referenced in
				 * the open context) or reset its contents to
				 * empty.
				 */
				dbuf_fix_old_data(db, txg);
			}
		}
		/* clear the contents if it's cached */
		if (db->db_state == DB_CACHED) {
			ASSERT(db->db.db_data != NULL);
			arc_release(db->db_buf, db);
			bzero(db->db.db_data, db->db.db_size);
			arc_buf_freeze(db->db_buf);
		}

		mutex_exit(&db->db_mtx);
	}
	mutex_exit(&dn->dn_dbufs_mtx);
}

static int
dbuf_block_freeable(dmu_buf_impl_t *db)
{
	dsl_dataset_t *ds = db->db_objset->os_dsl_dataset;
	uint64_t birth_txg = 0;

	/*
	 * We don't need any locking to protect db_blkptr:
	 * If it's syncing, then db_last_dirty will be set
	 * so we'll ignore db_blkptr.
	 *
	 * This logic ensures that only block births for
	 * filled blocks are considered.
	 */
	ASSERT(MUTEX_HELD(&db->db_mtx));
	if (db->db_last_dirty && (db->db_blkptr == NULL ||
	    !BP_IS_HOLE(db->db_blkptr))) {
		birth_txg = db->db_last_dirty->dr_txg;
	} else if (db->db_blkptr != NULL && !BP_IS_HOLE(db->db_blkptr)) {
		birth_txg = db->db_blkptr->blk_birth;
	}

	/*
	 * If this block doesn't exist or is in a snapshot, it can't be freed.
	 * Don't pass the bp to dsl_dataset_block_freeable() since we
	 * are holding the db_mtx lock and might deadlock if we are
	 * prefetching a dedup-ed block.
	 */
	if (birth_txg != 0)
		return (ds == NULL ||
		    dsl_dataset_block_freeable(ds, NULL, birth_txg));
	else
		return (B_FALSE);
}

void
dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx)
{
	arc_buf_t *buf, *obuf;
	int osize = db->db.db_size;
	arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
	dnode_t *dn;

	ASSERT(db->db_blkid != DMU_BONUS_BLKID);

	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);

	/* XXX does *this* func really need the lock? */
	ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));

	/*
	 * This call to dmu_buf_will_dirty() with the dn_struct_rwlock held
	 * is OK, because there can be no other references to the db
	 * when we are changing its size, so no concurrent DB_FILL can
	 * be happening.
	 */
	/*
	 * XXX we should be doing a dbuf_read, checking the return
	 * value and returning that up to our callers
	 */
	dmu_buf_will_dirty(&db->db, tx);

	/* create the data buffer for the new block */
	buf = arc_buf_alloc(dn->dn_objset->os_spa, size, db, type);

	/* copy old block data to the new block */
	obuf = db->db_buf;
	bcopy(obuf->b_data, buf->b_data, MIN(osize, size));
	/* zero the remainder */
	if (size > osize)
		bzero((uint8_t *)buf->b_data + osize, size - osize);

	mutex_enter(&db->db_mtx);
	dbuf_set_data(db, buf);
	VERIFY(arc_buf_remove_ref(obuf, db));
	db->db.db_size = size;

	if (db->db_level == 0) {
		ASSERT3U(db->db_last_dirty->dr_txg, ==, tx->tx_txg);
		db->db_last_dirty->dt.dl.dr_data = buf;
	}
	mutex_exit(&db->db_mtx);

	dnode_willuse_space(dn, size-osize, tx);
	DB_DNODE_EXIT(db);
}

void
dbuf_release_bp(dmu_buf_impl_t *db)
{
	objset_t *os = db->db_objset;

	ASSERT(dsl_pool_sync_context(dmu_objset_pool(os)));
	ASSERT(arc_released(os->os_phys_buf) ||
	    list_link_active(&os->os_dsl_dataset->ds_synced_link));
	ASSERT(db->db_parent == NULL || arc_released(db->db_parent->db_buf));

	(void) arc_release(db->db_buf, db);
}

dbuf_dirty_record_t *
dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
{
	dnode_t *dn;
	objset_t *os;
	dbuf_dirty_record_t **drp, *dr;
	int drop_struct_lock = FALSE;
	boolean_t do_free_accounting = B_FALSE;
	int txgoff = tx->tx_txg & TXG_MASK;

	ASSERT(tx->tx_txg != 0);
	ASSERT(!refcount_is_zero(&db->db_holds));
	DMU_TX_DIRTY_BUF(tx, db);

	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);
	/*
	 * Shouldn't dirty a regular buffer in syncing context.  Private
	 * objects may be dirtied in syncing context, but only if they
	 * were already pre-dirtied in open context.
	 */
	ASSERT(!dmu_tx_is_syncing(tx) ||
	    BP_IS_HOLE(dn->dn_objset->os_rootbp) ||
	    DMU_OBJECT_IS_SPECIAL(dn->dn_object) ||
	    dn->dn_objset->os_dsl_dataset == NULL);
	/*
	 * We make this assert for private objects as well, but after we
	 * check if we're already dirty.  They are allowed to re-dirty
	 * in syncing context.
	 */
	ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT ||
	    dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx ==
	    (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN));

	mutex_enter(&db->db_mtx);
	/*
	 * XXX make this true for indirects too?  The problem is that
	 * transactions created with dmu_tx_create_assigned() from
	 * syncing context don't bother holding ahead.
	 */
	ASSERT(db->db_level != 0 ||
	    db->db_state == DB_CACHED || db->db_state == DB_FILL ||
	    db->db_state == DB_NOFILL);

	mutex_enter(&dn->dn_mtx);
	/*
	 * Don't set dirtyctx to SYNC if we're just modifying this as we
	 * initialize the objset.
	 */
	if (dn->dn_dirtyctx == DN_UNDIRTIED &&
	    !BP_IS_HOLE(dn->dn_objset->os_rootbp)) {
		dn->dn_dirtyctx =
		    (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN);
		ASSERT(dn->dn_dirtyctx_firstset == NULL);
		dn->dn_dirtyctx_firstset = kmem_alloc(1, KM_SLEEP);
	}
	mutex_exit(&dn->dn_mtx);

	if (db->db_blkid == DMU_SPILL_BLKID)
		dn->dn_have_spill = B_TRUE;

	/*
	 * If this buffer is already dirty, we're done.
	 */
	drp = &db->db_last_dirty;
	ASSERT(*drp == NULL || (*drp)->dr_txg <= tx->tx_txg ||
	    db->db.db_object == DMU_META_DNODE_OBJECT);
	while ((dr = *drp) != NULL && dr->dr_txg > tx->tx_txg)
		drp = &dr->dr_next;
	if (dr && dr->dr_txg == tx->tx_txg) {
		DB_DNODE_EXIT(db);

		if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID) {
			/*
			 * If this buffer has already been written out,
			 * we now need to reset its state.
			 */
			dbuf_unoverride(dr);
			if (db->db.db_object != DMU_META_DNODE_OBJECT &&
			    db->db_state != DB_NOFILL)
				arc_buf_thaw(db->db_buf);
		}
		mutex_exit(&db->db_mtx);
		return (dr);
	}

	/*
	 * Only valid if not already dirty.
	 */
	ASSERT(dn->dn_object == 0 ||
	    dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx ==
	    (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN));

	ASSERT3U(dn->dn_nlevels, >, db->db_level);
	ASSERT((dn->dn_phys->dn_nlevels == 0 && db->db_level == 0) ||
	    dn->dn_phys->dn_nlevels > db->db_level ||
	    dn->dn_next_nlevels[txgoff] > db->db_level ||
	    dn->dn_next_nlevels[(tx->tx_txg-1) & TXG_MASK] > db->db_level ||
	    dn->dn_next_nlevels[(tx->tx_txg-2) & TXG_MASK] > db->db_level);

	/*
	 * We should only be dirtying in syncing context if it's the
	 * mos or we're initializing the os or it's a special object.
	 * However, we are allowed to dirty in syncing context provided
	 * we already dirtied it in open context.  Hence we must make
	 * this assertion only if we're not already dirty.
	 */
	os = dn->dn_objset;
	ASSERT(!dmu_tx_is_syncing(tx) || DMU_OBJECT_IS_SPECIAL(dn->dn_object) ||
	    os->os_dsl_dataset == NULL || BP_IS_HOLE(os->os_rootbp));
	ASSERT(db->db.db_size != 0);

	dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);

	if (db->db_blkid != DMU_BONUS_BLKID) {
		/*
		 * Update the accounting.
		 * Note: we delay "free accounting" until after we drop
		 * the db_mtx.  This keeps us from grabbing other locks
		 * (and possibly deadlocking) in bp_get_dsize() while
		 * also holding the db_mtx.
		 */
		dnode_willuse_space(dn, db->db.db_size, tx);
		do_free_accounting = dbuf_block_freeable(db);
	}

	/*
	 * If this buffer is dirty in an old transaction group we need
	 * to make a copy of it so that the changes we make in this
	 * transaction group won't leak out when we sync the older txg.
	 */
	dr = kmem_zalloc(sizeof (dbuf_dirty_record_t), KM_SLEEP);
	if (db->db_level == 0) {
		void *data_old = db->db_buf;

		if (db->db_state != DB_NOFILL) {
			if (db->db_blkid == DMU_BONUS_BLKID) {
				dbuf_fix_old_data(db, tx->tx_txg);
				data_old = db->db.db_data;
			} else if (db->db.db_object != DMU_META_DNODE_OBJECT) {
				/*
				 * Release the data buffer from the cache so
				 * that we can modify it without impacting
				 * possible other users of this cached data
				 * block.  Note that indirect blocks and
				 * private objects are not released until the
				 * syncing state (since they are only modified
				 * then).
				 */
				arc_release(db->db_buf, db);
				dbuf_fix_old_data(db, tx->tx_txg);
				data_old = db->db_buf;
			}
			ASSERT(data_old != NULL);
		}
		dr->dt.dl.dr_data = data_old;
	} else {
		mutex_init(&dr->dt.di.dr_mtx, NULL, MUTEX_DEFAULT, NULL);
		list_create(&dr->dt.di.dr_children,
		    sizeof (dbuf_dirty_record_t),
		    offsetof(dbuf_dirty_record_t, dr_dirty_node));
	}
	if (db->db_blkid != DMU_BONUS_BLKID && os->os_dsl_dataset != NULL)
		dr->dr_accounted = db->db.db_size;
	dr->dr_dbuf = db;
	dr->dr_txg = tx->tx_txg;
	dr->dr_next = *drp;
	*drp = dr;

	/*
	 * We could have been freed_in_flight between the dbuf_noread
	 * and dbuf_dirty.  We win, as though the dbuf_noread() had
	 * happened after the free.
	 */
	if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
	    db->db_blkid != DMU_SPILL_BLKID) {
		mutex_enter(&dn->dn_mtx);
		if (dn->dn_free_ranges[txgoff] != NULL) {
			range_tree_clear(dn->dn_free_ranges[txgoff],
			    db->db_blkid, 1);
		}
		mutex_exit(&dn->dn_mtx);
		db->db_freed_in_flight = FALSE;
	}

	/*
	 * This buffer is now part of this txg
	 */
	dbuf_add_ref(db, (void *)(uintptr_t)tx->tx_txg);
	db->db_dirtycnt += 1;
	ASSERT3U(db->db_dirtycnt, <=, 3);

	mutex_exit(&db->db_mtx);

	if (db->db_blkid == DMU_BONUS_BLKID ||
	    db->db_blkid == DMU_SPILL_BLKID) {
		mutex_enter(&dn->dn_mtx);
		ASSERT(!list_link_active(&dr->dr_dirty_node));
		list_insert_tail(&dn->dn_dirty_records[txgoff], dr);
		mutex_exit(&dn->dn_mtx);
		dnode_setdirty(dn, tx);
		DB_DNODE_EXIT(db);
		return (dr);
	} else if (do_free_accounting) {
		blkptr_t *bp = db->db_blkptr;
		int64_t willfree = (bp && !BP_IS_HOLE(bp)) ?
		    bp_get_dsize(os->os_spa, bp) : db->db.db_size;
		/*
		 * This is only a guess -- if the dbuf is dirty
		 * in a previous txg, we don't know how much
		 * space it will use on disk yet.  We should
		 * really have the struct_rwlock to access
		 * db_blkptr, but since this is just a guess,
		 * it's OK if we get an odd answer.
		 */
		ddt_prefetch(os->os_spa, bp);
		dnode_willuse_space(dn, -willfree, tx);
	}

	if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) {
		rw_enter(&dn->dn_struct_rwlock, RW_READER);
		drop_struct_lock = TRUE;
	}

	if (db->db_level == 0) {
		dnode_new_blkid(dn, db->db_blkid, tx, drop_struct_lock);
		ASSERT(dn->dn_maxblkid >= db->db_blkid);
	}

	if (db->db_level+1 < dn->dn_nlevels) {
		dmu_buf_impl_t *parent = db->db_parent;
		dbuf_dirty_record_t *di;
		int parent_held = FALSE;

		if (db->db_parent == NULL || db->db_parent == dn->dn_dbuf) {
			int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;

			parent = dbuf_hold_level(dn, db->db_level+1,
			    db->db_blkid >> epbs, FTAG);
			ASSERT(parent != NULL);
			parent_held = TRUE;
		}
		if (drop_struct_lock)
			rw_exit(&dn->dn_struct_rwlock);
		ASSERT3U(db->db_level+1, ==, parent->db_level);
		di = dbuf_dirty(parent, tx);
		if (parent_held)
			dbuf_rele(parent, FTAG);

		mutex_enter(&db->db_mtx);
		/*
		 * Since we've dropped the mutex, it's possible that
		 * dbuf_undirty() might have changed this out from under us.
		 */
		if (db->db_last_dirty == dr ||
		    dn->dn_object == DMU_META_DNODE_OBJECT) {
			mutex_enter(&di->dt.di.dr_mtx);
			ASSERT3U(di->dr_txg, ==, tx->tx_txg);
			ASSERT(!list_link_active(&dr->dr_dirty_node));
			list_insert_tail(&di->dt.di.dr_children, dr);
			mutex_exit(&di->dt.di.dr_mtx);
			dr->dr_parent = di;
		}
		mutex_exit(&db->db_mtx);
	} else {
		ASSERT(db->db_level+1 == dn->dn_nlevels);
		ASSERT(db->db_blkid < dn->dn_nblkptr);
		ASSERT(db->db_parent == NULL || db->db_parent == dn->dn_dbuf);
		mutex_enter(&dn->dn_mtx);
		ASSERT(!list_link_active(&dr->dr_dirty_node));
		list_insert_tail(&dn->dn_dirty_records[txgoff], dr);
		mutex_exit(&dn->dn_mtx);
		if (drop_struct_lock)
			rw_exit(&dn->dn_struct_rwlock);
	}

	dnode_setdirty(dn, tx);
	DB_DNODE_EXIT(db);
	return (dr);
}

/*
 * Undirty a buffer in the transaction group referenced by the given
 * transaction.  Return whether this evicted the dbuf.
 */
static boolean_t
dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
{
	dnode_t *dn;
	uint64_t txg = tx->tx_txg;
	dbuf_dirty_record_t *dr, **drp;

	ASSERT(txg != 0);
	ASSERT(db->db_blkid != DMU_BONUS_BLKID);
	ASSERT0(db->db_level);
	ASSERT(MUTEX_HELD(&db->db_mtx));

	/*
	 * If this buffer is not dirty, we're done.
	 */
	for (drp = &db->db_last_dirty; (dr = *drp) != NULL; drp = &dr->dr_next)
		if (dr->dr_txg <= txg)
			break;
	if (dr == NULL || dr->dr_txg < txg)
		return (B_FALSE);
	ASSERT(dr->dr_txg == txg);
	ASSERT(dr->dr_dbuf == db);

	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);

	dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);

	ASSERT(db->db.db_size != 0);

	/*
	 * Any space we accounted for in dp_dirty_* will be cleaned up by
	 * dsl_pool_sync().  This is relatively rare so the discrepancy
	 * is not a big deal.
	 */

	*drp = dr->dr_next;

	/*
	 * Note that there are three places in dbuf_dirty()
	 * where this dirty record may be put on a list.
	 * Make sure to do a list_remove corresponding to
	 * every one of those list_insert calls.
	 */
	if (dr->dr_parent) {
		mutex_enter(&dr->dr_parent->dt.di.dr_mtx);
		list_remove(&dr->dr_parent->dt.di.dr_children, dr);
		mutex_exit(&dr->dr_parent->dt.di.dr_mtx);
	} else if (db->db_blkid == DMU_SPILL_BLKID ||
	    db->db_level+1 == dn->dn_nlevels) {
		ASSERT(db->db_blkptr == NULL || db->db_parent == dn->dn_dbuf);
		mutex_enter(&dn->dn_mtx);
		list_remove(&dn->dn_dirty_records[txg & TXG_MASK], dr);
		mutex_exit(&dn->dn_mtx);
	}
	DB_DNODE_EXIT(db);

	if (db->db_state != DB_NOFILL) {
		dbuf_unoverride(dr);

		ASSERT(db->db_buf != NULL);
		ASSERT(dr->dt.dl.dr_data != NULL);
		if (dr->dt.dl.dr_data != db->db_buf)
			VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data, db));
	}

	if (db->db_level != 0) {
		mutex_destroy(&dr->dt.di.dr_mtx);
		list_destroy(&dr->dt.di.dr_children);
	}

	kmem_free(dr, sizeof (dbuf_dirty_record_t));

	ASSERT(db->db_dirtycnt > 0);
	db->db_dirtycnt -= 1;

	if (refcount_remove(&db->db_holds, (void *)(uintptr_t)txg) == 0) {
		arc_buf_t *buf = db->db_buf;

		ASSERT(db->db_state == DB_NOFILL || arc_released(buf));
		dbuf_clear_data(db);
		VERIFY(arc_buf_remove_ref(buf, db));
		dbuf_evict(db);
		return (B_TRUE);
	}

	return (B_FALSE);
}

void
dmu_buf_will_dirty(dmu_buf_t *db_fake, dmu_tx_t *tx)
{
	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
	int rf = DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH;

	ASSERT(tx->tx_txg != 0);
	ASSERT(!refcount_is_zero(&db->db_holds));

	DB_DNODE_ENTER(db);
	if (RW_WRITE_HELD(&DB_DNODE(db)->dn_struct_rwlock))
		rf |= DB_RF_HAVESTRUCT;
	DB_DNODE_EXIT(db);
	(void) dbuf_read(db, NULL, rf);
	(void) dbuf_dirty(db, tx);
}

void
dmu_buf_will_not_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
{
	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;

	db->db_state = DB_NOFILL;

	dmu_buf_will_fill(db_fake, tx);
}

void
dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
{
	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;

	ASSERT(db->db_blkid != DMU_BONUS_BLKID);
	ASSERT(tx->tx_txg != 0);
	ASSERT(db->db_level == 0);
	ASSERT(!refcount_is_zero(&db->db_holds));

	ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT ||
	    dmu_tx_private_ok(tx));

	dbuf_noread(db);
	(void) dbuf_dirty(db, tx);
}

#pragma weak dmu_buf_fill_done = dbuf_fill_done
/* ARGSUSED */
void
dbuf_fill_done(dmu_buf_impl_t *db, dmu_tx_t *tx)
{
	mutex_enter(&db->db_mtx);
	DBUF_VERIFY(db);

	if (db->db_state == DB_FILL) {
		if (db->db_level == 0 && db->db_freed_in_flight) {
			ASSERT(db->db_blkid != DMU_BONUS_BLKID);
			/* we were freed while filling */
			/* XXX dbuf_undirty? */
			bzero(db->db.db_data, db->db.db_size);
			db->db_freed_in_flight = FALSE;
		}
		db->db_state = DB_CACHED;
		cv_broadcast(&db->db_changed);
	}
	mutex_exit(&db->db_mtx);
}

void
dmu_buf_write_embedded(dmu_buf_t *dbuf, void *data,
    bp_embedded_type_t etype, enum zio_compress comp,
    int uncompressed_size, int compressed_size, int byteorder,
    dmu_tx_t *tx)
{
	dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf;
	struct dirty_leaf *dl;
	dmu_object_type_t type;

	DB_DNODE_ENTER(db);
	type = DB_DNODE(db)->dn_type;
	DB_DNODE_EXIT(db);

	ASSERT0(db->db_level);
	ASSERT(db->db_blkid != DMU_BONUS_BLKID);

	dmu_buf_will_not_fill(dbuf, tx);

	ASSERT3U(db->db_last_dirty->dr_txg, ==, tx->tx_txg);
	dl = &db->db_last_dirty->dt.dl;
	encode_embedded_bp_compressed(&dl->dr_overridden_by,
	    data, comp, uncompressed_size, compressed_size);
	BPE_SET_ETYPE(&dl->dr_overridden_by, etype);
	BP_SET_TYPE(&dl->dr_overridden_by, type);
	BP_SET_LEVEL(&dl->dr_overridden_by, 0);
	BP_SET_BYTEORDER(&dl->dr_overridden_by, byteorder);

	dl->dr_override_state = DR_OVERRIDDEN;
	dl->dr_overridden_by.blk_birth = db->db_last_dirty->dr_txg;
}

/*
 * Directly assign a provided arc buf to a given dbuf if it's not referenced
 * by anybody except our caller.  Otherwise copy arcbuf's contents to dbuf.
 */
void
dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx)
{
	ASSERT(!refcount_is_zero(&db->db_holds));
	ASSERT(db->db_blkid != DMU_BONUS_BLKID);
	ASSERT(db->db_level == 0);
	ASSERT(DBUF_GET_BUFC_TYPE(db) == ARC_BUFC_DATA);
	ASSERT(buf != NULL);
	ASSERT(arc_buf_size(buf) == db->db.db_size);
	ASSERT(tx->tx_txg != 0);

	arc_return_buf(buf, db);
	ASSERT(arc_released(buf));

	mutex_enter(&db->db_mtx);

	while (db->db_state == DB_READ || db->db_state == DB_FILL)
		cv_wait(&db->db_changed, &db->db_mtx);

	ASSERT(db->db_state == DB_CACHED || db->db_state == DB_UNCACHED);

	if (db->db_state == DB_CACHED &&
	    refcount_count(&db->db_holds) - 1 > db->db_dirtycnt) {
		mutex_exit(&db->db_mtx);
		(void) dbuf_dirty(db, tx);
		bcopy(buf->b_data, db->db.db_data, db->db.db_size);
		VERIFY(arc_buf_remove_ref(buf, db));
		xuio_stat_wbuf_copied();
		return;
	}

	xuio_stat_wbuf_nocopy();
	if (db->db_state == DB_CACHED) {
		dbuf_dirty_record_t *dr = db->db_last_dirty;

		ASSERT(db->db_buf != NULL);
		if (dr != NULL && dr->dr_txg == tx->tx_txg) {
			ASSERT(dr->dt.dl.dr_data == db->db_buf);
			if (!arc_released(db->db_buf)) {
				ASSERT(dr->dt.dl.dr_override_state ==
				    DR_OVERRIDDEN);
				arc_release(db->db_buf, db);
			}
			dr->dt.dl.dr_data = buf;
			VERIFY(arc_buf_remove_ref(db->db_buf, db));
		} else if (dr == NULL || dr->dt.dl.dr_data != db->db_buf) {
			arc_release(db->db_buf, db);
			VERIFY(arc_buf_remove_ref(db->db_buf, db));
		}
		db->db_buf = NULL;
	}
	ASSERT(db->db_buf == NULL);
	dbuf_set_data(db, buf);
	db->db_state = DB_FILL;
	mutex_exit(&db->db_mtx);
	(void) dbuf_dirty(db, tx);
	dmu_buf_fill_done(&db->db, tx);
}

/*
 * "Clear" the contents of this dbuf.  This will mark the dbuf
 * EVICTING and clear *most* of its references.  Unfortunately,
 * when we are not holding the dn_dbufs_mtx, we can't clear the
 * entry in the dn_dbufs list.
 * We have to wait until dbuf_destroy() in this case.
 * For callers from the DMU we will usually see:
 *	dbuf_clear()->arc_clear_callback()->dbuf_do_evict()->dbuf_destroy()
 * For the arc callback, we will usually see:
 *	dbuf_do_evict()->dbuf_clear();dbuf_destroy()
 * Sometimes, though, we will get a mix of these two:
 *	DMU: dbuf_clear()->arc_clear_callback()
 *	ARC: dbuf_do_evict()->dbuf_destroy()
 *
 * This routine will dissociate the dbuf from the arc, by calling
 * arc_clear_callback(), but will not evict the data from the ARC.
 */
void
dbuf_clear(dmu_buf_impl_t *db)
{
	dnode_t *dn;
	dmu_buf_impl_t *parent = db->db_parent;
	dmu_buf_impl_t *dndb;
	boolean_t dbuf_gone = B_FALSE;

	ASSERT(MUTEX_HELD(&db->db_mtx));
	ASSERT(refcount_is_zero(&db->db_holds));

	dbuf_evict_user(db);

	if (db->db_state == DB_CACHED) {
		ASSERT(db->db.db_data != NULL);
		if (db->db_blkid == DMU_BONUS_BLKID) {
			zio_buf_free(db->db.db_data, DN_MAX_BONUSLEN);
			arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
		}
		db->db.db_data = NULL;
		db->db_state = DB_UNCACHED;
	}

	ASSERT(db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL);
	ASSERT(db->db_data_pending == NULL);

	db->db_state = DB_EVICTING;
	db->db_blkptr = NULL;

	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);
	dndb = dn->dn_dbuf;
	if (db->db_blkid != DMU_BONUS_BLKID && MUTEX_HELD(&dn->dn_dbufs_mtx)) {
		avl_remove(&dn->dn_dbufs, db);
		atomic_dec_32(&dn->dn_dbufs_count);
		membar_producer();
		DB_DNODE_EXIT(db);
		/*
		 * Decrementing the dbuf count means that the hold corresponding
		 * to the removed dbuf is no longer discounted in dnode_move(),
		 * so the dnode cannot be moved until after we release the hold.
		 * The membar_producer() ensures visibility of the decremented
		 * value in dnode_move(), since DB_DNODE_EXIT doesn't actually
		 * release any lock.
		 */
		dnode_rele(dn, db);
		db->db_dnode_handle = NULL;
	} else {
		DB_DNODE_EXIT(db);
	}

	if (db->db_buf)
		dbuf_gone = arc_clear_callback(db->db_buf);

	if (!dbuf_gone)
		mutex_exit(&db->db_mtx);

	/*
	 * If this dbuf is referenced from an indirect dbuf,
	 * decrement the ref count on the indirect dbuf.
	 */
	if (parent && parent != dndb)
		dbuf_rele(parent, db);
}

static int
dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse,
    dmu_buf_impl_t **parentp, blkptr_t **bpp)
{
	int nlevels, epbs;

	*parentp = NULL;
	*bpp = NULL;

	ASSERT(blkid != DMU_BONUS_BLKID);

	if (blkid == DMU_SPILL_BLKID) {
		mutex_enter(&dn->dn_mtx);
		if (dn->dn_have_spill &&
		    (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR))
			*bpp = &dn->dn_phys->dn_spill;
		else
			*bpp = NULL;
		dbuf_add_ref(dn->dn_dbuf, NULL);
		*parentp = dn->dn_dbuf;
		mutex_exit(&dn->dn_mtx);
		return (0);
	}

	if (dn->dn_phys->dn_nlevels == 0)
		nlevels = 1;
	else
		nlevels = dn->dn_phys->dn_nlevels;

	epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;

	ASSERT3U(level * epbs, <, 64);
	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
	if (level >= nlevels ||
	    (blkid > (dn->dn_phys->dn_maxblkid >> (level * epbs)))) {
		/* the buffer has no parent yet */
		return (SET_ERROR(ENOENT));
	} else if (level < nlevels-1) {
		/* this block is referenced from an indirect block */
		int err = dbuf_hold_impl(dn, level+1,
		    blkid >> epbs, fail_sparse, NULL, parentp);
		if (err)
			return (err);
		err = dbuf_read(*parentp, NULL,
		    (DB_RF_HAVESTRUCT | DB_RF_NOPREFETCH | DB_RF_CANFAIL));
		if (err) {
			dbuf_rele(*parentp, NULL);
			*parentp = NULL;
			return (err);
		}
		*bpp = ((blkptr_t *)(*parentp)->db.db_data) +
		    (blkid & ((1ULL << epbs) - 1));
		return (0);
	} else {
		/* the block is referenced from the dnode */
		ASSERT3U(level, ==, nlevels-1);
		ASSERT(dn->dn_phys->dn_nblkptr == 0 ||
		    blkid < dn->dn_phys->dn_nblkptr);
		if (dn->dn_dbuf) {
			dbuf_add_ref(dn->dn_dbuf, NULL);
			*parentp = dn->dn_dbuf;
		}
		*bpp = &dn->dn_phys->dn_blkptr[blkid];
		return (0);
	}
}

static dmu_buf_impl_t *
dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid,
    dmu_buf_impl_t *parent, blkptr_t *blkptr)
{
	objset_t *os = dn->dn_objset;
	dmu_buf_impl_t *db, *odb;

	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
	ASSERT(dn->dn_type != DMU_OT_NONE);

	db = kmem_cache_alloc(dbuf_cache, KM_SLEEP);

	db->db_objset = os;
	db->db.db_object = dn->dn_object;
	db->db_level = level;
	db->db_blkid = blkid;
	db->db_last_dirty = NULL;
	db->db_dirtycnt = 0;
	db->db_dnode_handle = dn->dn_handle;
	db->db_parent = parent;
	db->db_blkptr = blkptr;

	db->db_user = NULL;
	db->db_immediate_evict = 0;
	db->db_freed_in_flight = 0;

	if (blkid == DMU_BONUS_BLKID) {
		ASSERT3P(parent, ==, dn->dn_dbuf);
		db->db.db_size = DN_MAX_BONUSLEN -
		    (dn->dn_nblkptr-1) * sizeof (blkptr_t);
		ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
		db->db.db_offset = DMU_BONUS_BLKID;
		db->db_state = DB_UNCACHED;
		/* the bonus dbuf is not placed in the hash table */
		arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
		return (db);
	} else if (blkid == DMU_SPILL_BLKID) {
		db->db.db_size = (blkptr != NULL) ?
		    BP_GET_LSIZE(blkptr) : SPA_MINBLOCKSIZE;
		db->db.db_offset = 0;
	} else {
		int blocksize = db->db_level ?
		    1 << dn->dn_indblkshift : dn->dn_datablksz;
		db->db.db_size = blocksize;
		db->db.db_offset = db->db_blkid * blocksize;
	}

	/*
	 * Hold the dn_dbufs_mtx while we get the new dbuf
	 * in the hash table *and* added to the dbufs list.
	 * This prevents a possible deadlock with someone
	 * trying to look up this dbuf before it's added to the
	 * dn_dbufs list.
	 */
	mutex_enter(&dn->dn_dbufs_mtx);
	db->db_state = DB_EVICTING;
	if ((odb = dbuf_hash_insert(db)) != NULL) {
		/* someone else inserted it first */
		kmem_cache_free(dbuf_cache, db);
		mutex_exit(&dn->dn_dbufs_mtx);
		return (odb);
	}
	avl_add(&dn->dn_dbufs, db);
	if (db->db_level == 0 && db->db_blkid >=
	    dn->dn_unlisted_l0_blkid)
		dn->dn_unlisted_l0_blkid = db->db_blkid + 1;
	db->db_state = DB_UNCACHED;
	mutex_exit(&dn->dn_dbufs_mtx);
	arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);

	if (parent && parent != dn->dn_dbuf)
		dbuf_add_ref(parent, db);

	ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT ||
	    refcount_count(&dn->dn_holds) > 0);
	(void) refcount_add(&dn->dn_holds, db);
	atomic_inc_32(&dn->dn_dbufs_count);

	dprintf_dbuf(db, "db=%p\n", db);

	return (db);
}

static int
dbuf_do_evict(void *private)
{
	dmu_buf_impl_t *db = private;

	if (!MUTEX_HELD(&db->db_mtx))
		mutex_enter(&db->db_mtx);

	ASSERT(refcount_is_zero(&db->db_holds));

	if (db->db_state != DB_EVICTING) {
		ASSERT(db->db_state == DB_CACHED);
		DBUF_VERIFY(db);
		db->db_buf = NULL;
		dbuf_evict(db);
	} else {
		mutex_exit(&db->db_mtx);
		dbuf_destroy(db);
	}
	return (0);
}

static void
dbuf_destroy(dmu_buf_impl_t *db)
{
	ASSERT(refcount_is_zero(&db->db_holds));

	if (db->db_blkid != DMU_BONUS_BLKID) {
		/*
		 * If this dbuf is still on the dn_dbufs list,
		 * remove it from that list.
		 */
		if (db->db_dnode_handle != NULL) {
			dnode_t *dn;

			DB_DNODE_ENTER(db);
			dn = DB_DNODE(db);
			mutex_enter(&dn->dn_dbufs_mtx);
			avl_remove(&dn->dn_dbufs, db);
			atomic_dec_32(&dn->dn_dbufs_count);
			mutex_exit(&dn->dn_dbufs_mtx);
			DB_DNODE_EXIT(db);
			/*
			 * Decrementing the dbuf count means that the hold
			 * corresponding to the removed dbuf is no longer
			 * discounted in dnode_move(), so the dnode cannot be
			 * moved until after we release the hold.
			 */
			dnode_rele(dn, db);
			db->db_dnode_handle = NULL;
		}
		dbuf_hash_remove(db);
	}
	db->db_parent = NULL;
	db->db_buf = NULL;

	ASSERT(db->db.db_data == NULL);
	ASSERT(db->db_hash_next == NULL);
	ASSERT(db->db_blkptr == NULL);
	ASSERT(db->db_data_pending == NULL);

	kmem_cache_free(dbuf_cache, db);
	arc_space_return(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
}

void
dbuf_prefetch(dnode_t *dn, uint64_t blkid, zio_priority_t prio)
{
	dmu_buf_impl_t *db = NULL;
	blkptr_t *bp = NULL;

	ASSERT(blkid != DMU_BONUS_BLKID);
	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));

	if (dnode_block_freed(dn, blkid))
		return;

	/* dbuf_find() returns with db_mtx held */
	if ((db = dbuf_find(dn, 0, blkid)) != NULL) {
		/*
		 * This dbuf is already in the cache.  We assume that
		 * it is already CACHED, or else about to be either
		 * read or filled.
1933 */ 1934 mutex_exit(&db->db_mtx); 1935 return; 1936 } 1937 1938 if (dbuf_findbp(dn, 0, blkid, TRUE, &db, &bp) == 0) { 1939 if (bp && !BP_IS_HOLE(bp) && !BP_IS_EMBEDDED(bp)) { 1940 dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset; 1941 arc_flags_t aflags = 1942 ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH; 1943 zbookmark_phys_t zb; 1944 1945 SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET, 1946 dn->dn_object, 0, blkid); 1947 1948 (void) arc_read(NULL, dn->dn_objset->os_spa, 1949 bp, NULL, NULL, prio, 1950 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, 1951 &aflags, &zb); 1952 } 1953 if (db) 1954 dbuf_rele(db, NULL); 1955 } 1956 } 1957 1958 /* 1959 * Returns with db_holds incremented, and db_mtx not held. 1960 * Note: dn_struct_rwlock must be held. 1961 */ 1962 int 1963 dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, int fail_sparse, 1964 void *tag, dmu_buf_impl_t **dbp) 1965 { 1966 dmu_buf_impl_t *db, *parent = NULL; 1967 1968 ASSERT(blkid != DMU_BONUS_BLKID); 1969 ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); 1970 ASSERT3U(dn->dn_nlevels, >, level); 1971 1972 *dbp = NULL; 1973 top: 1974 /* dbuf_find() returns with db_mtx held */ 1975 db = dbuf_find(dn, level, blkid); 1976 1977 if (db == NULL) { 1978 blkptr_t *bp = NULL; 1979 int err; 1980 1981 ASSERT3P(parent, ==, NULL); 1982 err = dbuf_findbp(dn, level, blkid, fail_sparse, &parent, &bp); 1983 if (fail_sparse) { 1984 if (err == 0 && bp && BP_IS_HOLE(bp)) 1985 err = SET_ERROR(ENOENT); 1986 if (err) { 1987 if (parent) 1988 dbuf_rele(parent, NULL); 1989 return (err); 1990 } 1991 } 1992 if (err && err != ENOENT) 1993 return (err); 1994 db = dbuf_create(dn, level, blkid, parent, bp); 1995 } 1996 1997 if (db->db_buf && refcount_is_zero(&db->db_holds)) { 1998 arc_buf_add_ref(db->db_buf, db); 1999 if (db->db_buf->b_data == NULL) { 2000 dbuf_clear(db); 2001 if (parent) { 2002 dbuf_rele(parent, NULL); 2003 parent = NULL; 2004 } 2005 goto top; 2006 } 2007 ASSERT3P(db->db.db_data, ==, db->db_buf->b_data); 2008 } 2009 2010 ASSERT(db->db_buf == NULL || arc_referenced(db->db_buf)); 2011 2012 /* 2013 * If this buffer is currently syncing out, and we are 2014 * still referencing it from db_data, we need to make a copy 2015 * of it in case we decide we want to dirty it again in this txg. 2016 */ 2017 if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID && 2018 dn->dn_object != DMU_META_DNODE_OBJECT && 2019 db->db_state == DB_CACHED && db->db_data_pending) { 2020 dbuf_dirty_record_t *dr = db->db_data_pending; 2021 2022 if (dr->dt.dl.dr_data == db->db_buf) { 2023 arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); 2024 2025 dbuf_set_data(db, 2026 arc_buf_alloc(dn->dn_objset->os_spa, 2027 db->db.db_size, db, type)); 2028 bcopy(dr->dt.dl.dr_data->b_data, db->db.db_data, 2029 db->db.db_size); 2030 } 2031 } 2032 2033 (void) refcount_add(&db->db_holds, tag); 2034 DBUF_VERIFY(db); 2035 mutex_exit(&db->db_mtx); 2036 2037 /* NOTE: we can't rele the parent until after we drop the db_mtx */ 2038 if (parent) 2039 dbuf_rele(parent, NULL); 2040 2041 ASSERT3P(DB_DNODE(db), ==, dn); 2042 ASSERT3U(db->db_blkid, ==, blkid); 2043 ASSERT3U(db->db_level, ==, level); 2044 *dbp = db; 2045 2046 return (0); 2047 } 2048 2049 dmu_buf_impl_t * 2050 dbuf_hold(dnode_t *dn, uint64_t blkid, void *tag) 2051 { 2052 dmu_buf_impl_t *db; 2053 int err = dbuf_hold_impl(dn, 0, blkid, FALSE, tag, &db); 2054 return (err ?
NULL : db); 2055 } 2056 2057 dmu_buf_impl_t * 2058 dbuf_hold_level(dnode_t *dn, int level, uint64_t blkid, void *tag) 2059 { 2060 dmu_buf_impl_t *db; 2061 int err = dbuf_hold_impl(dn, level, blkid, FALSE, tag, &db); 2062 return (err ? NULL : db); 2063 } 2064 2065 void 2066 dbuf_create_bonus(dnode_t *dn) 2067 { 2068 ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock)); 2069 2070 ASSERT(dn->dn_bonus == NULL); 2071 dn->dn_bonus = dbuf_create(dn, 0, DMU_BONUS_BLKID, dn->dn_dbuf, NULL); 2072 } 2073 2074 int 2075 dbuf_spill_set_blksz(dmu_buf_t *db_fake, uint64_t blksz, dmu_tx_t *tx) 2076 { 2077 dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; 2078 dnode_t *dn; 2079 2080 if (db->db_blkid != DMU_SPILL_BLKID) 2081 return (SET_ERROR(ENOTSUP)); 2082 if (blksz == 0) 2083 blksz = SPA_MINBLOCKSIZE; 2084 ASSERT3U(blksz, <=, spa_maxblocksize(dmu_objset_spa(db->db_objset))); 2085 blksz = P2ROUNDUP(blksz, SPA_MINBLOCKSIZE); 2086 2087 DB_DNODE_ENTER(db); 2088 dn = DB_DNODE(db); 2089 rw_enter(&dn->dn_struct_rwlock, RW_WRITER); 2090 dbuf_new_size(db, blksz, tx); 2091 rw_exit(&dn->dn_struct_rwlock); 2092 DB_DNODE_EXIT(db); 2093 2094 return (0); 2095 } 2096 2097 void 2098 dbuf_rm_spill(dnode_t *dn, dmu_tx_t *tx) 2099 { 2100 dbuf_free_range(dn, DMU_SPILL_BLKID, DMU_SPILL_BLKID, tx); 2101 } 2102 2103 #pragma weak dmu_buf_add_ref = dbuf_add_ref 2104 void 2105 dbuf_add_ref(dmu_buf_impl_t *db, void *tag) 2106 { 2107 int64_t holds = refcount_add(&db->db_holds, tag); 2108 ASSERT(holds > 1); 2109 } 2110 2111 /* 2112 * If you call dbuf_rele() you had better not be referencing the dnode handle 2113 * unless you have some other direct or indirect hold on the dnode. (An indirect 2114 * hold is a hold on one of the dnode's dbufs, including the bonus buffer.) 2115 * Without that, the dbuf_rele() could lead to a dnode_rele() followed by the 2116 * dnode's parent dbuf evicting its dnode handles. 2117 */ 2118 void 2119 dbuf_rele(dmu_buf_impl_t *db, void *tag) 2120 { 2121 mutex_enter(&db->db_mtx); 2122 dbuf_rele_and_unlock(db, tag); 2123 } 2124 2125 void 2126 dmu_buf_rele(dmu_buf_t *db, void *tag) 2127 { 2128 dbuf_rele((dmu_buf_impl_t *)db, tag); 2129 } 2130 2131 /* 2132 * dbuf_rele() for an already-locked dbuf. This is necessary to allow 2133 * db_dirtycnt and db_holds to be updated atomically. 2134 */ 2135 void 2136 dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag) 2137 { 2138 int64_t holds; 2139 2140 ASSERT(MUTEX_HELD(&db->db_mtx)); 2141 DBUF_VERIFY(db); 2142 2143 /* 2144 * Remove the reference to the dbuf before removing its hold on the 2145 * dnode so we can guarantee in dnode_move() that a referenced bonus 2146 * buffer has a corresponding dnode hold. 2147 */ 2148 holds = refcount_remove(&db->db_holds, tag); 2149 ASSERT(holds >= 0); 2150 2151 /* 2152 * We can't freeze indirects if there is a possibility that they 2153 * may be modified in the current syncing context. 2154 */ 2155 if (db->db_buf && holds == (db->db_level == 0 ? db->db_dirtycnt : 0)) 2156 arc_buf_freeze(db->db_buf); 2157 2158 if (holds == db->db_dirtycnt && 2159 db->db_level == 0 && db->db_immediate_evict) 2160 dbuf_evict_user(db); 2161 2162 if (holds == 0) { 2163 if (db->db_blkid == DMU_BONUS_BLKID) { 2164 mutex_exit(&db->db_mtx); 2165 2166 /* 2167 * If the dnode moves here, we cannot cross this barrier 2168 * until the move completes. 2169 */ 2170 DB_DNODE_ENTER(db); 2171 atomic_dec_32(&DB_DNODE(db)->dn_dbufs_count); 2172 DB_DNODE_EXIT(db); 2173 /* 2174 * The bonus buffer's dnode hold is no longer discounted 2175 * in dnode_move(). 
The dnode cannot move until after 2176 * the dnode_rele(). 2177 */ 2178 dnode_rele(DB_DNODE(db), db); 2179 } else if (db->db_buf == NULL) { 2180 /* 2181 * This is a special case: we never associated this 2182 * dbuf with any data allocated from the ARC. 2183 */ 2184 ASSERT(db->db_state == DB_UNCACHED || 2185 db->db_state == DB_NOFILL); 2186 dbuf_evict(db); 2187 } else if (arc_released(db->db_buf)) { 2188 arc_buf_t *buf = db->db_buf; 2189 /* 2190 * This dbuf has anonymous data associated with it. 2191 */ 2192 dbuf_clear_data(db); 2193 VERIFY(arc_buf_remove_ref(buf, db)); 2194 dbuf_evict(db); 2195 } else { 2196 VERIFY(!arc_buf_remove_ref(db->db_buf, db)); 2197 2198 /* 2199 * A dbuf will be eligible for eviction if either the 2200 * 'primarycache' property is set or a duplicate 2201 * copy of this buffer is already cached in the arc. 2202 * 2203 * In the case of the 'primarycache' a buffer 2204 * is considered for eviction if it matches the 2205 * criteria set in the property. 2206 * 2207 * To decide if our buffer is considered a 2208 * duplicate, we must call into the arc to determine 2209 * if multiple buffers are referencing the same 2210 * block on-disk. If so, then we simply evict 2211 * ourselves. 2212 */ 2213 if (!DBUF_IS_CACHEABLE(db)) { 2214 if (db->db_blkptr != NULL && 2215 !BP_IS_HOLE(db->db_blkptr) && 2216 !BP_IS_EMBEDDED(db->db_blkptr)) { 2217 spa_t *spa = 2218 dmu_objset_spa(db->db_objset); 2219 blkptr_t bp = *db->db_blkptr; 2220 dbuf_clear(db); 2221 arc_freed(spa, &bp); 2222 } else { 2223 dbuf_clear(db); 2224 } 2225 } else if (db->db_objset->os_evicting || 2226 arc_buf_eviction_needed(db->db_buf)) { 2227 dbuf_clear(db); 2228 } else { 2229 mutex_exit(&db->db_mtx); 2230 } 2231 } 2232 } else { 2233 mutex_exit(&db->db_mtx); 2234 } 2235 } 2236 2237 #pragma weak dmu_buf_refcount = dbuf_refcount 2238 uint64_t 2239 dbuf_refcount(dmu_buf_impl_t *db) 2240 { 2241 return (refcount_count(&db->db_holds)); 2242 } 2243 2244 void * 2245 dmu_buf_replace_user(dmu_buf_t *db_fake, dmu_buf_user_t *old_user, 2246 dmu_buf_user_t *new_user) 2247 { 2248 dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; 2249 2250 mutex_enter(&db->db_mtx); 2251 dbuf_verify_user(db, DBVU_NOT_EVICTING); 2252 if (db->db_user == old_user) 2253 db->db_user = new_user; 2254 else 2255 old_user = db->db_user; 2256 dbuf_verify_user(db, DBVU_NOT_EVICTING); 2257 mutex_exit(&db->db_mtx); 2258 2259 return (old_user); 2260 } 2261 2262 void * 2263 dmu_buf_set_user(dmu_buf_t *db_fake, dmu_buf_user_t *user) 2264 { 2265 return (dmu_buf_replace_user(db_fake, NULL, user)); 2266 } 2267 2268 void * 2269 dmu_buf_set_user_ie(dmu_buf_t *db_fake, dmu_buf_user_t *user) 2270 { 2271 dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; 2272 2273 db->db_immediate_evict = TRUE; 2274 return (dmu_buf_set_user(db_fake, user)); 2275 } 2276 2277 void * 2278 dmu_buf_remove_user(dmu_buf_t *db_fake, dmu_buf_user_t *user) 2279 { 2280 return (dmu_buf_replace_user(db_fake, user, NULL)); 2281 } 2282 2283 void * 2284 dmu_buf_get_user(dmu_buf_t *db_fake) 2285 { 2286 dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; 2287 2288 dbuf_verify_user(db, DBVU_NOT_EVICTING); 2289 return (db->db_user); 2290 } 2291 2292 void 2293 dmu_buf_user_evict_wait() 2294 { 2295 taskq_wait(dbu_evict_taskq); 2296 } 2297 2298 boolean_t 2299 dmu_buf_freeable(dmu_buf_t *dbuf) 2300 { 2301 boolean_t res = B_FALSE; 2302 dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf; 2303 2304 if (db->db_blkptr) 2305 res = dsl_dataset_block_freeable(db->db_objset->os_dsl_dataset, 2306 db->db_blkptr, db->db_blkptr->blk_birth); 
2307 2308 return (res); 2309 } 2310 2311 blkptr_t * 2312 dmu_buf_get_blkptr(dmu_buf_t *db) 2313 { 2314 dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db; 2315 return (dbi->db_blkptr); 2316 } 2317 2318 static void 2319 dbuf_check_blkptr(dnode_t *dn, dmu_buf_impl_t *db) 2320 { 2321 /* ASSERT(dmu_tx_is_syncing(tx)) */ 2322 ASSERT(MUTEX_HELD(&db->db_mtx)); 2323 2324 if (db->db_blkptr != NULL) 2325 return; 2326 2327 if (db->db_blkid == DMU_SPILL_BLKID) { 2328 db->db_blkptr = &dn->dn_phys->dn_spill; 2329 BP_ZERO(db->db_blkptr); 2330 return; 2331 } 2332 if (db->db_level == dn->dn_phys->dn_nlevels-1) { 2333 /* 2334 * This buffer was allocated at a time when there were 2335 * no blkptrs available in the dnode, or it was 2336 * inappropriate to hook it in (i.e., nlevels mismatch). 2337 */ 2338 ASSERT(db->db_blkid < dn->dn_phys->dn_nblkptr); 2339 ASSERT(db->db_parent == NULL); 2340 db->db_parent = dn->dn_dbuf; 2341 db->db_blkptr = &dn->dn_phys->dn_blkptr[db->db_blkid]; 2342 DBUF_VERIFY(db); 2343 } else { 2344 dmu_buf_impl_t *parent = db->db_parent; 2345 int epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; 2346 2347 ASSERT(dn->dn_phys->dn_nlevels > 1); 2348 if (parent == NULL) { 2349 mutex_exit(&db->db_mtx); 2350 rw_enter(&dn->dn_struct_rwlock, RW_READER); 2351 (void) dbuf_hold_impl(dn, db->db_level+1, 2352 db->db_blkid >> epbs, FALSE, db, &parent); 2353 rw_exit(&dn->dn_struct_rwlock); 2354 mutex_enter(&db->db_mtx); 2355 db->db_parent = parent; 2356 } 2357 db->db_blkptr = (blkptr_t *)parent->db.db_data + 2358 (db->db_blkid & ((1ULL << epbs) - 1)); 2359 DBUF_VERIFY(db); 2360 } 2361 } 2362 2363 static void 2364 dbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx) 2365 { 2366 dmu_buf_impl_t *db = dr->dr_dbuf; 2367 dnode_t *dn; 2368 zio_t *zio; 2369 2370 ASSERT(dmu_tx_is_syncing(tx)); 2371 2372 dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr); 2373 2374 mutex_enter(&db->db_mtx); 2375 2376 ASSERT(db->db_level > 0); 2377 DBUF_VERIFY(db); 2378 2379 /* Read the block if it hasn't been read yet. */ 2380 if (db->db_buf == NULL) { 2381 mutex_exit(&db->db_mtx); 2382 (void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED); 2383 mutex_enter(&db->db_mtx); 2384 } 2385 ASSERT3U(db->db_state, ==, DB_CACHED); 2386 ASSERT(db->db_buf != NULL); 2387 2388 DB_DNODE_ENTER(db); 2389 dn = DB_DNODE(db); 2390 /* Indirect block size must match what the dnode thinks it is. */ 2391 ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift); 2392 dbuf_check_blkptr(dn, db); 2393 DB_DNODE_EXIT(db); 2394 2395 /* Provide the pending dirty record to child dbufs */ 2396 db->db_data_pending = dr; 2397 2398 mutex_exit(&db->db_mtx); 2399 dbuf_write(dr, db->db_buf, tx); 2400 2401 zio = dr->dr_zio; 2402 mutex_enter(&dr->dt.di.dr_mtx); 2403 dbuf_sync_list(&dr->dt.di.dr_children, tx); 2404 ASSERT(list_head(&dr->dt.di.dr_children) == NULL); 2405 mutex_exit(&dr->dt.di.dr_mtx); 2406 zio_nowait(zio); 2407 } 2408 2409 static void 2410 dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx) 2411 { 2412 arc_buf_t **datap = &dr->dt.dl.dr_data; 2413 dmu_buf_impl_t *db = dr->dr_dbuf; 2414 dnode_t *dn; 2415 objset_t *os; 2416 uint64_t txg = tx->tx_txg; 2417 2418 ASSERT(dmu_tx_is_syncing(tx)); 2419 2420 dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr); 2421 2422 mutex_enter(&db->db_mtx); 2423 /* 2424 * To be synced, we must be dirtied. But we 2425 * might have been freed after the dirty.
2426 */ 2427 if (db->db_state == DB_UNCACHED) { 2428 /* This buffer has been freed since it was dirtied */ 2429 ASSERT(db->db.db_data == NULL); 2430 } else if (db->db_state == DB_FILL) { 2431 /* This buffer was freed and is now being re-filled */ 2432 ASSERT(db->db.db_data != dr->dt.dl.dr_data); 2433 } else { 2434 ASSERT(db->db_state == DB_CACHED || db->db_state == DB_NOFILL); 2435 } 2436 DBUF_VERIFY(db); 2437 2438 DB_DNODE_ENTER(db); 2439 dn = DB_DNODE(db); 2440 2441 if (db->db_blkid == DMU_SPILL_BLKID) { 2442 mutex_enter(&dn->dn_mtx); 2443 dn->dn_phys->dn_flags |= DNODE_FLAG_SPILL_BLKPTR; 2444 mutex_exit(&dn->dn_mtx); 2445 } 2446 2447 /* 2448 * If this is a bonus buffer, simply copy the bonus data into the 2449 * dnode. It will be written out when the dnode is synced (and it 2450 * will be synced, since it must have been dirty for dbuf_sync to 2451 * be called). 2452 */ 2453 if (db->db_blkid == DMU_BONUS_BLKID) { 2454 dbuf_dirty_record_t **drp; 2455 2456 ASSERT(*datap != NULL); 2457 ASSERT0(db->db_level); 2458 ASSERT3U(dn->dn_phys->dn_bonuslen, <=, DN_MAX_BONUSLEN); 2459 bcopy(*datap, DN_BONUS(dn->dn_phys), dn->dn_phys->dn_bonuslen); 2460 DB_DNODE_EXIT(db); 2461 2462 if (*datap != db->db.db_data) { 2463 zio_buf_free(*datap, DN_MAX_BONUSLEN); 2464 arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER); 2465 } 2466 db->db_data_pending = NULL; 2467 drp = &db->db_last_dirty; 2468 while (*drp != dr) 2469 drp = &(*drp)->dr_next; 2470 ASSERT(dr->dr_next == NULL); 2471 ASSERT(dr->dr_dbuf == db); 2472 *drp = dr->dr_next; 2473 kmem_free(dr, sizeof (dbuf_dirty_record_t)); 2474 ASSERT(db->db_dirtycnt > 0); 2475 db->db_dirtycnt -= 1; 2476 dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg); 2477 return; 2478 } 2479 2480 os = dn->dn_objset; 2481 2482 /* 2483 * This function may have dropped the db_mtx lock allowing a dmu_sync 2484 * operation to sneak in. As a result, we need to ensure that we 2485 * don't check the dr_override_state until we have returned from 2486 * dbuf_check_blkptr. 2487 */ 2488 dbuf_check_blkptr(dn, db); 2489 2490 /* 2491 * If this buffer is in the middle of an immediate write, 2492 * wait for the synchronous IO to complete. 2493 */ 2494 while (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC) { 2495 ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT); 2496 cv_wait(&db->db_changed, &db->db_mtx); 2497 ASSERT(dr->dt.dl.dr_override_state != DR_NOT_OVERRIDDEN); 2498 } 2499 2500 if (db->db_state != DB_NOFILL && 2501 dn->dn_object != DMU_META_DNODE_OBJECT && 2502 refcount_count(&db->db_holds) > 1 && 2503 dr->dt.dl.dr_override_state != DR_OVERRIDDEN && 2504 *datap == db->db_buf) { 2505 /* 2506 * If this buffer is currently "in use" (i.e., there 2507 * are active holds and db_data still references it), 2508 * then make a copy before we start the write so that 2509 * any modifications from the open txg will not leak 2510 * into this write. 2511 * 2512 * NOTE: this copy does not need to be made for 2513 * objects only modified in the syncing context (e.g. 2514 * DNODE blocks).
2515 */ 2516 int blksz = arc_buf_size(*datap); 2517 arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); 2518 *datap = arc_buf_alloc(os->os_spa, blksz, db, type); 2519 bcopy(db->db.db_data, (*datap)->b_data, blksz); 2520 } 2521 db->db_data_pending = dr; 2522 2523 mutex_exit(&db->db_mtx); 2524 2525 dbuf_write(dr, *datap, tx); 2526 2527 ASSERT(!list_link_active(&dr->dr_dirty_node)); 2528 if (dn->dn_object == DMU_META_DNODE_OBJECT) { 2529 list_insert_tail(&dn->dn_dirty_records[txg&TXG_MASK], dr); 2530 DB_DNODE_EXIT(db); 2531 } else { 2532 /* 2533 * Although zio_nowait() does not "wait for an IO", it does 2534 * initiate the IO. If this is an empty write it seems plausible 2535 * that the IO could actually be completed before the nowait 2536 * returns. We need to DB_DNODE_EXIT() first in case 2537 * zio_nowait() invalidates the dbuf. 2538 */ 2539 DB_DNODE_EXIT(db); 2540 zio_nowait(dr->dr_zio); 2541 } 2542 } 2543 2544 void 2545 dbuf_sync_list(list_t *list, dmu_tx_t *tx) 2546 { 2547 dbuf_dirty_record_t *dr; 2548 2549 while (dr = list_head(list)) { 2550 if (dr->dr_zio != NULL) { 2551 /* 2552 * If we find an already initialized zio then we 2553 * are processing the meta-dnode, and we have finished. 2554 * The dbufs for all dnodes are put back on the list 2555 * during processing, so that we can zio_wait() 2556 * these IOs after initiating all child IOs. 2557 */ 2558 ASSERT3U(dr->dr_dbuf->db.db_object, ==, 2559 DMU_META_DNODE_OBJECT); 2560 break; 2561 } 2562 list_remove(list, dr); 2563 if (dr->dr_dbuf->db_level > 0) 2564 dbuf_sync_indirect(dr, tx); 2565 else 2566 dbuf_sync_leaf(dr, tx); 2567 } 2568 } 2569 2570 /* ARGSUSED */ 2571 static void 2572 dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb) 2573 { 2574 dmu_buf_impl_t *db = vdb; 2575 dnode_t *dn; 2576 blkptr_t *bp = zio->io_bp; 2577 blkptr_t *bp_orig = &zio->io_bp_orig; 2578 spa_t *spa = zio->io_spa; 2579 int64_t delta; 2580 uint64_t fill = 0; 2581 int i; 2582 2583 ASSERT3P(db->db_blkptr, ==, bp); 2584 2585 DB_DNODE_ENTER(db); 2586 dn = DB_DNODE(db); 2587 delta = bp_get_dsize_sync(spa, bp) - bp_get_dsize_sync(spa, bp_orig); 2588 dnode_diduse_space(dn, delta - zio->io_prev_space_delta); 2589 zio->io_prev_space_delta = delta; 2590 2591 if (bp->blk_birth != 0) { 2592 ASSERT((db->db_blkid != DMU_SPILL_BLKID && 2593 BP_GET_TYPE(bp) == dn->dn_type) || 2594 (db->db_blkid == DMU_SPILL_BLKID && 2595 BP_GET_TYPE(bp) == dn->dn_bonustype) || 2596 BP_IS_EMBEDDED(bp)); 2597 ASSERT(BP_GET_LEVEL(bp) == db->db_level); 2598 } 2599 2600 mutex_enter(&db->db_mtx); 2601 2602 #ifdef ZFS_DEBUG 2603 if (db->db_blkid == DMU_SPILL_BLKID) { 2604 ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR); 2605 ASSERT(!(BP_IS_HOLE(db->db_blkptr)) && 2606 db->db_blkptr == &dn->dn_phys->dn_spill); 2607 } 2608 #endif 2609 2610 if (db->db_level == 0) { 2611 mutex_enter(&dn->dn_mtx); 2612 if (db->db_blkid > dn->dn_phys->dn_maxblkid && 2613 db->db_blkid != DMU_SPILL_BLKID) 2614 dn->dn_phys->dn_maxblkid = db->db_blkid; 2615 mutex_exit(&dn->dn_mtx); 2616 2617 if (dn->dn_type == DMU_OT_DNODE) { 2618 dnode_phys_t *dnp = db->db.db_data; 2619 for (i = db->db.db_size >> DNODE_SHIFT; i > 0; 2620 i--, dnp++) { 2621 if (dnp->dn_type != DMU_OT_NONE) 2622 fill++; 2623 } 2624 } else { 2625 if (BP_IS_HOLE(bp)) { 2626 fill = 0; 2627 } else { 2628 fill = 1; 2629 } 2630 } 2631 } else { 2632 blkptr_t *ibp = db->db.db_data; 2633 ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift); 2634 for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, ibp++) { 2635 if (BP_IS_HOLE(ibp)) 2636 continue; 
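/* Accumulate the fill counts of all the non-hole child block pointers. */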
2637 fill += BP_GET_FILL(ibp); 2638 } 2639 } 2640 DB_DNODE_EXIT(db); 2641 2642 if (!BP_IS_EMBEDDED(bp)) 2643 bp->blk_fill = fill; 2644 2645 mutex_exit(&db->db_mtx); 2646 } 2647 2648 /* 2649 * The SPA will call this callback several times for each zio - once 2650 * for every physical child i/o (zio->io_phys_children times). This 2651 * allows the DMU to monitor the progress of each logical i/o. For example, 2652 * there may be 2 copies of an indirect block, or many fragments of a RAID-Z 2653 * block. There may be a long delay before all copies/fragments are completed, 2654 * so this callback allows us to retire dirty space gradually, as the physical 2655 * i/os complete. 2656 */ 2657 /* ARGSUSED */ 2658 static void 2659 dbuf_write_physdone(zio_t *zio, arc_buf_t *buf, void *arg) 2660 { 2661 dmu_buf_impl_t *db = arg; 2662 objset_t *os = db->db_objset; 2663 dsl_pool_t *dp = dmu_objset_pool(os); 2664 dbuf_dirty_record_t *dr; 2665 int delta = 0; 2666 2667 dr = db->db_data_pending; 2668 ASSERT3U(dr->dr_txg, ==, zio->io_txg); 2669 2670 /* 2671 * The callback will be called io_phys_children times. Retire one 2672 * portion of our dirty space each time we are called. Any rounding 2673 * error will be cleaned up by dsl_pool_sync()'s call to 2674 * dsl_pool_undirty_space(). 2675 */ 2676 delta = dr->dr_accounted / zio->io_phys_children; 2677 dsl_pool_undirty_space(dp, delta, zio->io_txg); 2678 } 2679 2680 /* ARGSUSED */ 2681 static void 2682 dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb) 2683 { 2684 dmu_buf_impl_t *db = vdb; 2685 blkptr_t *bp_orig = &zio->io_bp_orig; 2686 blkptr_t *bp = db->db_blkptr; 2687 objset_t *os = db->db_objset; 2688 dmu_tx_t *tx = os->os_synctx; 2689 dbuf_dirty_record_t **drp, *dr; 2690 2691 ASSERT0(zio->io_error); 2692 ASSERT(db->db_blkptr == bp); 2693 2694 /* 2695 * For nopwrites and rewrites we ensure that the bp matches our 2696 * original and bypass all the accounting. 
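 * Since BP_EQUAL(bp, bp_orig) holds in those cases, no new block was born and there is no old block to kill.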
2697 */ 2698 if (zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE)) { 2699 ASSERT(BP_EQUAL(bp, bp_orig)); 2700 } else { 2701 dsl_dataset_t *ds = os->os_dsl_dataset; 2702 (void) dsl_dataset_block_kill(ds, bp_orig, tx, B_TRUE); 2703 dsl_dataset_block_born(ds, bp, tx); 2704 } 2705 2706 mutex_enter(&db->db_mtx); 2707 2708 DBUF_VERIFY(db); 2709 2710 drp = &db->db_last_dirty; 2711 while ((dr = *drp) != db->db_data_pending) 2712 drp = &dr->dr_next; 2713 ASSERT(!list_link_active(&dr->dr_dirty_node)); 2714 ASSERT(dr->dr_dbuf == db); 2715 ASSERT(dr->dr_next == NULL); 2716 *drp = dr->dr_next; 2717 2718 #ifdef ZFS_DEBUG 2719 if (db->db_blkid == DMU_SPILL_BLKID) { 2720 dnode_t *dn; 2721 2722 DB_DNODE_ENTER(db); 2723 dn = DB_DNODE(db); 2724 ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR); 2725 ASSERT(!(BP_IS_HOLE(db->db_blkptr)) && 2726 db->db_blkptr == &dn->dn_phys->dn_spill); 2727 DB_DNODE_EXIT(db); 2728 } 2729 #endif 2730 2731 if (db->db_level == 0) { 2732 ASSERT(db->db_blkid != DMU_BONUS_BLKID); 2733 ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN); 2734 if (db->db_state != DB_NOFILL) { 2735 if (dr->dt.dl.dr_data != db->db_buf) 2736 VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data, 2737 db)); 2738 else if (!arc_released(db->db_buf)) 2739 arc_set_callback(db->db_buf, dbuf_do_evict, db); 2740 } 2741 } else { 2742 dnode_t *dn; 2743 2744 DB_DNODE_ENTER(db); 2745 dn = DB_DNODE(db); 2746 ASSERT(list_head(&dr->dt.di.dr_children) == NULL); 2747 ASSERT3U(db->db.db_size, ==, 1 << dn->dn_phys->dn_indblkshift); 2748 if (!BP_IS_HOLE(db->db_blkptr)) { 2749 int epbs = 2750 dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; 2751 ASSERT3U(db->db_blkid, <=, 2752 dn->dn_phys->dn_maxblkid >> (db->db_level * epbs)); 2753 ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==, 2754 db->db.db_size); 2755 if (!arc_released(db->db_buf)) 2756 arc_set_callback(db->db_buf, dbuf_do_evict, db); 2757 } 2758 DB_DNODE_EXIT(db); 2759 mutex_destroy(&dr->dt.di.dr_mtx); 2760 list_destroy(&dr->dt.di.dr_children); 2761 } 2762 kmem_free(dr, sizeof (dbuf_dirty_record_t)); 2763 2764 cv_broadcast(&db->db_changed); 2765 ASSERT(db->db_dirtycnt > 0); 2766 db->db_dirtycnt -= 1; 2767 db->db_data_pending = NULL; 2768 dbuf_rele_and_unlock(db, (void *)(uintptr_t)tx->tx_txg); 2769 } 2770 2771 static void 2772 dbuf_write_nofill_ready(zio_t *zio) 2773 { 2774 dbuf_write_ready(zio, NULL, zio->io_private); 2775 } 2776 2777 static void 2778 dbuf_write_nofill_done(zio_t *zio) 2779 { 2780 dbuf_write_done(zio, NULL, zio->io_private); 2781 } 2782 2783 static void 2784 dbuf_write_override_ready(zio_t *zio) 2785 { 2786 dbuf_dirty_record_t *dr = zio->io_private; 2787 dmu_buf_impl_t *db = dr->dr_dbuf; 2788 2789 dbuf_write_ready(zio, NULL, db); 2790 } 2791 2792 static void 2793 dbuf_write_override_done(zio_t *zio) 2794 { 2795 dbuf_dirty_record_t *dr = zio->io_private; 2796 dmu_buf_impl_t *db = dr->dr_dbuf; 2797 blkptr_t *obp = &dr->dt.dl.dr_overridden_by; 2798 2799 mutex_enter(&db->db_mtx); 2800 if (!BP_EQUAL(zio->io_bp, obp)) { 2801 if (!BP_IS_HOLE(obp)) 2802 dsl_free(spa_get_dsl(zio->io_spa), zio->io_txg, obp); 2803 arc_release(dr->dt.dl.dr_data, db); 2804 } 2805 mutex_exit(&db->db_mtx); 2806 2807 dbuf_write_done(zio, NULL, db); 2808 } 2809 2810 /* Issue I/O to commit a dirty buffer to disk. 
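 * The dirty record's zio is created as a child of the parent dbuf's pending write (or of the dnode's zio) and takes one of three forms: an override write whose BP was supplied in open context (dmu_sync() or dmu_buf_write_embedded()), a NOFILL write, or a normal arc_write().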
*/ 2811 static void 2812 dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx) 2813 { 2814 dmu_buf_impl_t *db = dr->dr_dbuf; 2815 dnode_t *dn; 2816 objset_t *os; 2817 dmu_buf_impl_t *parent = db->db_parent; 2818 uint64_t txg = tx->tx_txg; 2819 zbookmark_phys_t zb; 2820 zio_prop_t zp; 2821 zio_t *zio; 2822 int wp_flag = 0; 2823 2824 DB_DNODE_ENTER(db); 2825 dn = DB_DNODE(db); 2826 os = dn->dn_objset; 2827 2828 if (db->db_state != DB_NOFILL) { 2829 if (db->db_level > 0 || dn->dn_type == DMU_OT_DNODE) { 2830 /* 2831 * Private object buffers are released here rather 2832 * than in dbuf_dirty() since they are only modified 2833 * in the syncing context and we don't want the 2834 * overhead of making multiple copies of the data. 2835 */ 2836 if (BP_IS_HOLE(db->db_blkptr)) { 2837 arc_buf_thaw(data); 2838 } else { 2839 dbuf_release_bp(db); 2840 } 2841 } 2842 } 2843 2844 if (parent != dn->dn_dbuf) { 2845 /* Our parent is an indirect block. */ 2846 /* We have a dirty parent that has been scheduled for write. */ 2847 ASSERT(parent && parent->db_data_pending); 2848 /* Our parent's buffer is one level closer to the dnode. */ 2849 ASSERT(db->db_level == parent->db_level-1); 2850 /* 2851 * We're about to modify our parent's db_data by modifying 2852 * our block pointer, so the parent must be released. 2853 */ 2854 ASSERT(arc_released(parent->db_buf)); 2855 zio = parent->db_data_pending->dr_zio; 2856 } else { 2857 /* Our parent is the dnode itself. */ 2858 ASSERT((db->db_level == dn->dn_phys->dn_nlevels-1 && 2859 db->db_blkid != DMU_SPILL_BLKID) || 2860 (db->db_blkid == DMU_SPILL_BLKID && db->db_level == 0)); 2861 if (db->db_blkid != DMU_SPILL_BLKID) 2862 ASSERT3P(db->db_blkptr, ==, 2863 &dn->dn_phys->dn_blkptr[db->db_blkid]); 2864 zio = dn->dn_zio; 2865 } 2866 2867 ASSERT(db->db_level == 0 || data == db->db_buf); 2868 ASSERT3U(db->db_blkptr->blk_birth, <=, txg); 2869 ASSERT(zio); 2870 2871 SET_BOOKMARK(&zb, os->os_dsl_dataset ? 2872 os->os_dsl_dataset->ds_object : DMU_META_OBJSET, 2873 db->db.db_object, db->db_level, db->db_blkid); 2874 2875 if (db->db_blkid == DMU_SPILL_BLKID) 2876 wp_flag = WP_SPILL; 2877 wp_flag |= (db->db_state == DB_NOFILL) ? WP_NOFILL : 0; 2878 2879 dmu_write_policy(os, dn, db->db_level, wp_flag, &zp); 2880 DB_DNODE_EXIT(db); 2881 2882 if (db->db_level == 0 && 2883 dr->dt.dl.dr_override_state == DR_OVERRIDDEN) { 2884 /* 2885 * The BP for this block has been provided by open context 2886 * (by dmu_sync() or dmu_buf_write_embedded()). 2887 */ 2888 void *contents = (data != NULL) ? 
data->b_data : NULL; 2889 2890 dr->dr_zio = zio_write(zio, os->os_spa, txg, 2891 db->db_blkptr, contents, db->db.db_size, &zp, 2892 dbuf_write_override_ready, NULL, dbuf_write_override_done, 2893 dr, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb); 2894 mutex_enter(&db->db_mtx); 2895 dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN; 2896 zio_write_override(dr->dr_zio, &dr->dt.dl.dr_overridden_by, 2897 dr->dt.dl.dr_copies, dr->dt.dl.dr_nopwrite); 2898 mutex_exit(&db->db_mtx); 2899 } else if (db->db_state == DB_NOFILL) { 2900 ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF || 2901 zp.zp_checksum == ZIO_CHECKSUM_NOPARITY); 2902 dr->dr_zio = zio_write(zio, os->os_spa, txg, 2903 db->db_blkptr, NULL, db->db.db_size, &zp, 2904 dbuf_write_nofill_ready, NULL, dbuf_write_nofill_done, db, 2905 ZIO_PRIORITY_ASYNC_WRITE, 2906 ZIO_FLAG_MUSTSUCCEED | ZIO_FLAG_NODATA, &zb); 2907 } else { 2908 ASSERT(arc_released(data)); 2909 dr->dr_zio = arc_write(zio, os->os_spa, txg, 2910 db->db_blkptr, data, DBUF_IS_L2CACHEABLE(db), 2911 DBUF_IS_L2COMPRESSIBLE(db), &zp, dbuf_write_ready, 2912 dbuf_write_physdone, dbuf_write_done, db, 2913 ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb); 2914 } 2915 } 2916