1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 23 * Copyright 2011 Nexenta Systems, Inc. All rights reserved. 24 * Copyright (c) 2012, 2015 by Delphix. All rights reserved. 25 * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. 26 * Copyright (c) 2013, Joyent, Inc. All rights reserved. 27 * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. 28 */ 29 30 #include <sys/zfs_context.h> 31 #include <sys/dmu.h> 32 #include <sys/dmu_send.h> 33 #include <sys/dmu_impl.h> 34 #include <sys/dbuf.h> 35 #include <sys/dmu_objset.h> 36 #include <sys/dsl_dataset.h> 37 #include <sys/dsl_dir.h> 38 #include <sys/dmu_tx.h> 39 #include <sys/spa.h> 40 #include <sys/zio.h> 41 #include <sys/dmu_zfetch.h> 42 #include <sys/sa.h> 43 #include <sys/sa_impl.h> 44 #include <sys/zfeature.h> 45 #include <sys/blkptr.h> 46 #include <sys/range_tree.h> 47 48 /* 49 * Number of times that zfs_free_range() took the slow path while doing 50 * a zfs receive. A nonzero value indicates a potential performance problem. 51 */ 52 uint64_t zfs_free_range_recv_miss; 53 54 static void dbuf_destroy(dmu_buf_impl_t *db); 55 static boolean_t dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx); 56 static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx); 57 58 #ifndef __lint 59 extern inline void dmu_buf_init_user(dmu_buf_user_t *dbu, 60 dmu_buf_evict_func_t *evict_func_sync, 61 dmu_buf_evict_func_t *evict_func_async, 62 dmu_buf_t **clear_on_evict_dbufp); 63 #endif /* ! __lint */ 64 65 /* 66 * Global data structures and functions for the dbuf cache. 
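* dbuf_cache is the kmem cache from which dmu_buf_impl_t structures are allocated, and dbu_evict_taskq runs the asynchronous dbuf user eviction callbacks dispatched from dbuf_evict_user().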
67 */ 68 static kmem_cache_t *dbuf_cache; 69 static taskq_t *dbu_evict_taskq; 70 71 /* ARGSUSED */ 72 static int 73 dbuf_cons(void *vdb, void *unused, int kmflag) 74 { 75 dmu_buf_impl_t *db = vdb; 76 bzero(db, sizeof (dmu_buf_impl_t)); 77 78 mutex_init(&db->db_mtx, NULL, MUTEX_DEFAULT, NULL); 79 cv_init(&db->db_changed, NULL, CV_DEFAULT, NULL); 80 refcount_create(&db->db_holds); 81 82 return (0); 83 } 84 85 /* ARGSUSED */ 86 static void 87 dbuf_dest(void *vdb, void *unused) 88 { 89 dmu_buf_impl_t *db = vdb; 90 mutex_destroy(&db->db_mtx); 91 cv_destroy(&db->db_changed); 92 refcount_destroy(&db->db_holds); 93 } 94 95 /* 96 * dbuf hash table routines 97 */ 98 static dbuf_hash_table_t dbuf_hash_table; 99 100 static uint64_t dbuf_hash_count; 101 102 static uint64_t 103 dbuf_hash(void *os, uint64_t obj, uint8_t lvl, uint64_t blkid) 104 { 105 uintptr_t osv = (uintptr_t)os; 106 uint64_t crc = -1ULL; 107 108 ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY); 109 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (lvl)) & 0xFF]; 110 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (osv >> 6)) & 0xFF]; 111 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 0)) & 0xFF]; 112 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 8)) & 0xFF]; 113 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 0)) & 0xFF]; 114 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 8)) & 0xFF]; 115 116 crc ^= (osv>>14) ^ (obj>>16) ^ (blkid>>16); 117 118 return (crc); 119 } 120 121 #define DBUF_HASH(os, obj, level, blkid) dbuf_hash(os, obj, level, blkid); 122 123 #define DBUF_EQUAL(dbuf, os, obj, level, blkid) \ 124 ((dbuf)->db.db_object == (obj) && \ 125 (dbuf)->db_objset == (os) && \ 126 (dbuf)->db_level == (level) && \ 127 (dbuf)->db_blkid == (blkid)) 128 129 dmu_buf_impl_t * 130 dbuf_find(objset_t *os, uint64_t obj, uint8_t level, uint64_t blkid) 131 { 132 dbuf_hash_table_t *h = &dbuf_hash_table; 133 uint64_t hv = DBUF_HASH(os, obj, level, blkid); 134 uint64_t idx = hv & h->hash_table_mask; 135 dmu_buf_impl_t *db; 136 137 mutex_enter(DBUF_HASH_MUTEX(h, idx)); 138 for (db = h->hash_table[idx]; db != NULL; db = db->db_hash_next) { 139 if (DBUF_EQUAL(db, os, obj, level, blkid)) { 140 mutex_enter(&db->db_mtx); 141 if (db->db_state != DB_EVICTING) { 142 mutex_exit(DBUF_HASH_MUTEX(h, idx)); 143 return (db); 144 } 145 mutex_exit(&db->db_mtx); 146 } 147 } 148 mutex_exit(DBUF_HASH_MUTEX(h, idx)); 149 return (NULL); 150 } 151 152 static dmu_buf_impl_t * 153 dbuf_find_bonus(objset_t *os, uint64_t object) 154 { 155 dnode_t *dn; 156 dmu_buf_impl_t *db = NULL; 157 158 if (dnode_hold(os, object, FTAG, &dn) == 0) { 159 rw_enter(&dn->dn_struct_rwlock, RW_READER); 160 if (dn->dn_bonus != NULL) { 161 db = dn->dn_bonus; 162 mutex_enter(&db->db_mtx); 163 } 164 rw_exit(&dn->dn_struct_rwlock); 165 dnode_rele(dn, FTAG); 166 } 167 return (db); 168 } 169 170 /* 171 * Insert an entry into the hash table. If there is already an element 172 * equal to elem in the hash table, then the already existing element 173 * will be returned and the new element will not be inserted. 174 * Otherwise returns NULL. 
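* Note that an existing element, when found, is returned with its db_mtx held; on a NULL return the newly inserted db is likewise left with its db_mtx held.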
175 */ 176 static dmu_buf_impl_t * 177 dbuf_hash_insert(dmu_buf_impl_t *db) 178 { 179 dbuf_hash_table_t *h = &dbuf_hash_table; 180 objset_t *os = db->db_objset; 181 uint64_t obj = db->db.db_object; 182 int level = db->db_level; 183 uint64_t blkid = db->db_blkid; 184 uint64_t hv = DBUF_HASH(os, obj, level, blkid); 185 uint64_t idx = hv & h->hash_table_mask; 186 dmu_buf_impl_t *dbf; 187 188 mutex_enter(DBUF_HASH_MUTEX(h, idx)); 189 for (dbf = h->hash_table[idx]; dbf != NULL; dbf = dbf->db_hash_next) { 190 if (DBUF_EQUAL(dbf, os, obj, level, blkid)) { 191 mutex_enter(&dbf->db_mtx); 192 if (dbf->db_state != DB_EVICTING) { 193 mutex_exit(DBUF_HASH_MUTEX(h, idx)); 194 return (dbf); 195 } 196 mutex_exit(&dbf->db_mtx); 197 } 198 } 199 200 mutex_enter(&db->db_mtx); 201 db->db_hash_next = h->hash_table[idx]; 202 h->hash_table[idx] = db; 203 mutex_exit(DBUF_HASH_MUTEX(h, idx)); 204 atomic_inc_64(&dbuf_hash_count); 205 206 return (NULL); 207 } 208 209 /* 210 * Remove an entry from the hash table. It must be in the EVICTING state. 211 */ 212 static void 213 dbuf_hash_remove(dmu_buf_impl_t *db) 214 { 215 dbuf_hash_table_t *h = &dbuf_hash_table; 216 uint64_t hv = DBUF_HASH(db->db_objset, db->db.db_object, 217 db->db_level, db->db_blkid); 218 uint64_t idx = hv & h->hash_table_mask; 219 dmu_buf_impl_t *dbf, **dbp; 220 221 /* 222 * We mustn't hold db_mtx to maintain lock ordering: 223 * DBUF_HASH_MUTEX > db_mtx. 224 */ 225 ASSERT(refcount_is_zero(&db->db_holds)); 226 ASSERT(db->db_state == DB_EVICTING); 227 ASSERT(!MUTEX_HELD(&db->db_mtx)); 228 229 mutex_enter(DBUF_HASH_MUTEX(h, idx)); 230 dbp = &h->hash_table[idx]; 231 while ((dbf = *dbp) != db) { 232 dbp = &dbf->db_hash_next; 233 ASSERT(dbf != NULL); 234 } 235 *dbp = db->db_hash_next; 236 db->db_hash_next = NULL; 237 mutex_exit(DBUF_HASH_MUTEX(h, idx)); 238 atomic_dec_64(&dbuf_hash_count); 239 } 240 241 static arc_evict_func_t dbuf_do_evict; 242 243 typedef enum { 244 DBVU_EVICTING, 245 DBVU_NOT_EVICTING 246 } dbvu_verify_type_t; 247 248 static void 249 dbuf_verify_user(dmu_buf_impl_t *db, dbvu_verify_type_t verify_type) 250 { 251 #ifdef ZFS_DEBUG 252 int64_t holds; 253 254 if (db->db_user == NULL) 255 return; 256 257 /* Only data blocks support the attachment of user data. */ 258 ASSERT(db->db_level == 0); 259 260 /* Clients must resolve a dbuf before attaching user data. */ 261 ASSERT(db->db.db_data != NULL); 262 ASSERT3U(db->db_state, ==, DB_CACHED); 263 264 holds = refcount_count(&db->db_holds); 265 if (verify_type == DBVU_EVICTING) { 266 /* 267 * Immediate eviction occurs when holds == dirtycnt. 268 * For normal eviction buffers, holds is zero on 269 * eviction, except when dbuf_fix_old_data() calls 270 * dbuf_clear_data(). However, the hold count can grow 271 * during eviction even though db_mtx is held (see 272 * dmu_bonus_hold() for an example), so we can only 273 * test the generic invariant that holds >= dirtycnt.
274 */ 275 ASSERT3U(holds, >=, db->db_dirtycnt); 276 } else { 277 if (db->db_user_immediate_evict == TRUE) 278 ASSERT3U(holds, >=, db->db_dirtycnt); 279 else 280 ASSERT3U(holds, >, 0); 281 } 282 #endif 283 } 284 285 static void 286 dbuf_evict_user(dmu_buf_impl_t *db) 287 { 288 dmu_buf_user_t *dbu = db->db_user; 289 290 ASSERT(MUTEX_HELD(&db->db_mtx)); 291 292 if (dbu == NULL) 293 return; 294 295 dbuf_verify_user(db, DBVU_EVICTING); 296 db->db_user = NULL; 297 298 #ifdef ZFS_DEBUG 299 if (dbu->dbu_clear_on_evict_dbufp != NULL) 300 *dbu->dbu_clear_on_evict_dbufp = NULL; 301 #endif 302 303 /* 304 * There are two eviction callbacks - one that we call synchronously 305 * and one that we invoke via a taskq. The async one is useful for 306 * avoiding lock order reversals and limiting stack depth. 307 * 308 * Note that if we have a sync callback but no async callback, 309 * it's likely that the sync callback will free the structure 310 * containing the dbu. In that case we need to take care to not 311 * dereference dbu after calling the sync evict func. 312 */ 313 boolean_t has_async = (dbu->dbu_evict_func_async != NULL); 314 315 if (dbu->dbu_evict_func_sync != NULL) 316 dbu->dbu_evict_func_sync(dbu); 317 318 if (has_async) { 319 taskq_dispatch_ent(dbu_evict_taskq, dbu->dbu_evict_func_async, 320 dbu, 0, &dbu->dbu_tqent); 321 } 322 } 323 324 boolean_t 325 dbuf_is_metadata(dmu_buf_impl_t *db) 326 { 327 if (db->db_level > 0) { 328 return (B_TRUE); 329 } else { 330 boolean_t is_metadata; 331 332 DB_DNODE_ENTER(db); 333 is_metadata = DMU_OT_IS_METADATA(DB_DNODE(db)->dn_type); 334 DB_DNODE_EXIT(db); 335 336 return (is_metadata); 337 } 338 } 339 340 void 341 dbuf_evict(dmu_buf_impl_t *db) 342 { 343 ASSERT(MUTEX_HELD(&db->db_mtx)); 344 ASSERT(db->db_buf == NULL); 345 ASSERT(db->db_data_pending == NULL); 346 347 dbuf_clear(db); 348 dbuf_destroy(db); 349 } 350 351 void 352 dbuf_init(void) 353 { 354 uint64_t hsize = 1ULL << 16; 355 dbuf_hash_table_t *h = &dbuf_hash_table; 356 int i; 357 358 /* 359 * The hash table is big enough to fill all of physical memory 360 * with an average 4K block size. The table will take up 361 * totalmem*sizeof(void*)/4K (i.e. 2MB/GB with 8-byte pointers). 362 */ 363 while (hsize * 4096 < physmem * PAGESIZE) 364 hsize <<= 1; 365 366 retry: 367 h->hash_table_mask = hsize - 1; 368 h->hash_table = kmem_zalloc(hsize * sizeof (void *), KM_NOSLEEP); 369 if (h->hash_table == NULL) { 370 /* XXX - we should really return an error instead of assert */ 371 ASSERT(hsize > (1ULL << 10)); 372 hsize >>= 1; 373 goto retry; 374 } 375 376 dbuf_cache = kmem_cache_create("dmu_buf_impl_t", 377 sizeof (dmu_buf_impl_t), 378 0, dbuf_cons, dbuf_dest, NULL, NULL, NULL, 0); 379 380 for (i = 0; i < DBUF_MUTEXES; i++) 381 mutex_init(&h->hash_mutexes[i], NULL, MUTEX_DEFAULT, NULL); 382 383 /* 384 * All entries are queued via taskq_dispatch_ent(), so min/maxalloc 385 * configuration is not required. 386 */ 387 dbu_evict_taskq = taskq_create("dbu_evict", 1, minclsyspri, 0, 0, 0); 388 } 389 390 void 391 dbuf_fini(void) 392 { 393 dbuf_hash_table_t *h = &dbuf_hash_table; 394 int i; 395 396 for (i = 0; i < DBUF_MUTEXES; i++) 397 mutex_destroy(&h->hash_mutexes[i]); 398 kmem_free(h->hash_table, (h->hash_table_mask + 1) * sizeof (void *)); 399 kmem_cache_destroy(dbuf_cache); 400 taskq_destroy(dbu_evict_taskq); 401 } 402 403 /* 404 * Other stuff. 
405 */ 406 407 #ifdef ZFS_DEBUG 408 static void 409 dbuf_verify(dmu_buf_impl_t *db) 410 { 411 dnode_t *dn; 412 dbuf_dirty_record_t *dr; 413 414 ASSERT(MUTEX_HELD(&db->db_mtx)); 415 416 if (!(zfs_flags & ZFS_DEBUG_DBUF_VERIFY)) 417 return; 418 419 ASSERT(db->db_objset != NULL); 420 DB_DNODE_ENTER(db); 421 dn = DB_DNODE(db); 422 if (dn == NULL) { 423 ASSERT(db->db_parent == NULL); 424 ASSERT(db->db_blkptr == NULL); 425 } else { 426 ASSERT3U(db->db.db_object, ==, dn->dn_object); 427 ASSERT3P(db->db_objset, ==, dn->dn_objset); 428 ASSERT3U(db->db_level, <, dn->dn_nlevels); 429 ASSERT(db->db_blkid == DMU_BONUS_BLKID || 430 db->db_blkid == DMU_SPILL_BLKID || 431 !avl_is_empty(&dn->dn_dbufs)); 432 } 433 if (db->db_blkid == DMU_BONUS_BLKID) { 434 ASSERT(dn != NULL); 435 ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen); 436 ASSERT3U(db->db.db_offset, ==, DMU_BONUS_BLKID); 437 } else if (db->db_blkid == DMU_SPILL_BLKID) { 438 ASSERT(dn != NULL); 439 ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen); 440 ASSERT0(db->db.db_offset); 441 } else { 442 ASSERT3U(db->db.db_offset, ==, db->db_blkid * db->db.db_size); 443 } 444 445 for (dr = db->db_data_pending; dr != NULL; dr = dr->dr_next) 446 ASSERT(dr->dr_dbuf == db); 447 448 for (dr = db->db_last_dirty; dr != NULL; dr = dr->dr_next) 449 ASSERT(dr->dr_dbuf == db); 450 451 /* 452 * We can't assert that db_size matches dn_datablksz because it 453 * can be momentarily different when another thread is doing 454 * dnode_set_blksz(). 455 */ 456 if (db->db_level == 0 && db->db.db_object == DMU_META_DNODE_OBJECT) { 457 dr = db->db_data_pending; 458 /* 459 * It should only be modified in syncing context, so 460 * make sure we only have one copy of the data. 461 */ 462 ASSERT(dr == NULL || dr->dt.dl.dr_data == db->db_buf); 463 } 464 465 /* verify db->db_blkptr */ 466 if (db->db_blkptr) { 467 if (db->db_parent == dn->dn_dbuf) { 468 /* db is pointed to by the dnode */ 469 /* ASSERT3U(db->db_blkid, <, dn->dn_nblkptr); */ 470 if (DMU_OBJECT_IS_SPECIAL(db->db.db_object)) 471 ASSERT(db->db_parent == NULL); 472 else 473 ASSERT(db->db_parent != NULL); 474 if (db->db_blkid != DMU_SPILL_BLKID) 475 ASSERT3P(db->db_blkptr, ==, 476 &dn->dn_phys->dn_blkptr[db->db_blkid]); 477 } else { 478 /* db is pointed to by an indirect block */ 479 int epb = db->db_parent->db.db_size >> SPA_BLKPTRSHIFT; 480 ASSERT3U(db->db_parent->db_level, ==, db->db_level+1); 481 ASSERT3U(db->db_parent->db.db_object, ==, 482 db->db.db_object); 483 /* 484 * dnode_grow_indblksz() can make this fail if we don't 485 * have the struct_rwlock. XXX indblksz no longer 486 * grows. safe to do this now? 487 */ 488 if (RW_WRITE_HELD(&dn->dn_struct_rwlock)) { 489 ASSERT3P(db->db_blkptr, ==, 490 ((blkptr_t *)db->db_parent->db.db_data + 491 db->db_blkid % epb)); 492 } 493 } 494 } 495 if ((db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr)) && 496 (db->db_buf == NULL || db->db_buf->b_data) && 497 db->db.db_data && db->db_blkid != DMU_BONUS_BLKID && 498 db->db_state != DB_FILL && !dn->dn_free_txg) { 499 /* 500 * If the blkptr isn't set but they have nonzero data, 501 * it had better be dirty, otherwise we'll lose that 502 * data when we evict this buffer. 
503 */ 504 if (db->db_dirtycnt == 0) { 505 uint64_t *buf = db->db.db_data; 506 int i; 507 508 for (i = 0; i < db->db.db_size >> 3; i++) { 509 ASSERT(buf[i] == 0); 510 } 511 } 512 } 513 DB_DNODE_EXIT(db); 514 } 515 #endif 516 517 static void 518 dbuf_clear_data(dmu_buf_impl_t *db) 519 { 520 ASSERT(MUTEX_HELD(&db->db_mtx)); 521 dbuf_evict_user(db); 522 db->db_buf = NULL; 523 db->db.db_data = NULL; 524 if (db->db_state != DB_NOFILL) 525 db->db_state = DB_UNCACHED; 526 } 527 528 static void 529 dbuf_set_data(dmu_buf_impl_t *db, arc_buf_t *buf) 530 { 531 ASSERT(MUTEX_HELD(&db->db_mtx)); 532 ASSERT(buf != NULL); 533 534 db->db_buf = buf; 535 ASSERT(buf->b_data != NULL); 536 db->db.db_data = buf->b_data; 537 if (!arc_released(buf)) 538 arc_set_callback(buf, dbuf_do_evict, db); 539 } 540 541 /* 542 * Loan out an arc_buf for read. Return the loaned arc_buf. 543 */ 544 arc_buf_t * 545 dbuf_loan_arcbuf(dmu_buf_impl_t *db) 546 { 547 arc_buf_t *abuf; 548 549 mutex_enter(&db->db_mtx); 550 if (arc_released(db->db_buf) || refcount_count(&db->db_holds) > 1) { 551 int blksz = db->db.db_size; 552 spa_t *spa = db->db_objset->os_spa; 553 554 mutex_exit(&db->db_mtx); 555 abuf = arc_loan_buf(spa, blksz); 556 bcopy(db->db.db_data, abuf->b_data, blksz); 557 } else { 558 abuf = db->db_buf; 559 arc_loan_inuse_buf(abuf, db); 560 dbuf_clear_data(db); 561 mutex_exit(&db->db_mtx); 562 } 563 return (abuf); 564 } 565 566 /* 567 * Calculate which level n block references the data at the level 0 offset 568 * provided. 569 */ 570 uint64_t 571 dbuf_whichblock(dnode_t *dn, int64_t level, uint64_t offset) 572 { 573 if (dn->dn_datablkshift != 0 && dn->dn_indblkshift != 0) { 574 /* 575 * The level n blkid is equal to the level 0 blkid divided by 576 * the number of level 0s in a level n block. 577 * 578 * The level 0 blkid is offset >> datablkshift = 579 * offset / 2^datablkshift. 580 * 581 * The number of level 0s in a level n is the number of block 582 * pointers in an indirect block, raised to the power of level. 583 * This is 2^(indblkshift - SPA_BLKPTRSHIFT)^level = 584 * 2^(level*(indblkshift - SPA_BLKPTRSHIFT)). 
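* For example, with 128K indirect blocks (indblkshift = 17), and since SPA_BLKPTRSHIFT is 7 (a blkptr_t is 128 bytes), each indirect block holds 2^(17 - 7) = 1024 block pointers, so a level 2 block covers 1024^2 = 1048576 level 0 blocks.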
585 * 586 * Thus, the level n blkid is: offset / 587 * ((2^datablkshift)*(2^(level*(indblkshift - SPA_BLKPTRSHIFT))) 588 * = offset / 2^(datablkshift + level * 589 * (indblkshift - SPA_BLKPTRSHIFT)) 590 * = offset >> (datablkshift + level * 591 * (indblkshift - SPA_BLKPTRSHIFT)) 592 */ 593 return (offset >> (dn->dn_datablkshift + level * 594 (dn->dn_indblkshift - SPA_BLKPTRSHIFT))); 595 } else { 596 ASSERT3U(offset, <, dn->dn_datablksz); 597 return (0); 598 } 599 } 600 601 static void 602 dbuf_read_done(zio_t *zio, arc_buf_t *buf, void *vdb) 603 { 604 dmu_buf_impl_t *db = vdb; 605 606 mutex_enter(&db->db_mtx); 607 ASSERT3U(db->db_state, ==, DB_READ); 608 /* 609 * All reads are synchronous, so we must have a hold on the dbuf 610 */ 611 ASSERT(refcount_count(&db->db_holds) > 0); 612 ASSERT(db->db_buf == NULL); 613 ASSERT(db->db.db_data == NULL); 614 if (db->db_level == 0 && db->db_freed_in_flight) { 615 /* we were freed in flight; disregard any error */ 616 arc_release(buf, db); 617 bzero(buf->b_data, db->db.db_size); 618 arc_buf_freeze(buf); 619 db->db_freed_in_flight = FALSE; 620 dbuf_set_data(db, buf); 621 db->db_state = DB_CACHED; 622 } else if (zio == NULL || zio->io_error == 0) { 623 dbuf_set_data(db, buf); 624 db->db_state = DB_CACHED; 625 } else { 626 ASSERT(db->db_blkid != DMU_BONUS_BLKID); 627 ASSERT3P(db->db_buf, ==, NULL); 628 VERIFY(arc_buf_remove_ref(buf, db)); 629 db->db_state = DB_UNCACHED; 630 } 631 cv_broadcast(&db->db_changed); 632 dbuf_rele_and_unlock(db, NULL); 633 } 634 635 static void 636 dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags) 637 { 638 dnode_t *dn; 639 zbookmark_phys_t zb; 640 arc_flags_t aflags = ARC_FLAG_NOWAIT; 641 642 DB_DNODE_ENTER(db); 643 dn = DB_DNODE(db); 644 ASSERT(!refcount_is_zero(&db->db_holds)); 645 /* We need the struct_rwlock to prevent db_blkptr from changing. */ 646 ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); 647 ASSERT(MUTEX_HELD(&db->db_mtx)); 648 ASSERT(db->db_state == DB_UNCACHED); 649 ASSERT(db->db_buf == NULL); 650 651 if (db->db_blkid == DMU_BONUS_BLKID) { 652 int bonuslen = MIN(dn->dn_bonuslen, dn->dn_phys->dn_bonuslen); 653 654 ASSERT3U(bonuslen, <=, db->db.db_size); 655 db->db.db_data = zio_buf_alloc(DN_MAX_BONUSLEN); 656 arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER); 657 if (bonuslen < DN_MAX_BONUSLEN) 658 bzero(db->db.db_data, DN_MAX_BONUSLEN); 659 if (bonuslen) 660 bcopy(DN_BONUS(dn->dn_phys), db->db.db_data, bonuslen); 661 DB_DNODE_EXIT(db); 662 db->db_state = DB_CACHED; 663 mutex_exit(&db->db_mtx); 664 return; 665 } 666 667 /* 668 * Recheck BP_IS_HOLE() after dnode_block_freed() in case dnode_sync() 669 * processes the delete record and clears the bp while we are waiting 670 * for the dn_mtx (resulting in a "no" from block_freed). 671 */ 672 if (db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr) || 673 (db->db_level == 0 && (dnode_block_freed(dn, db->db_blkid) || 674 BP_IS_HOLE(db->db_blkptr)))) { 675 arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); 676 677 DB_DNODE_EXIT(db); 678 dbuf_set_data(db, arc_buf_alloc(db->db_objset->os_spa, 679 db->db.db_size, db, type)); 680 bzero(db->db.db_data, db->db.db_size); 681 db->db_state = DB_CACHED; 682 *flags |= DB_RF_CACHED; 683 mutex_exit(&db->db_mtx); 684 return; 685 } 686 687 DB_DNODE_EXIT(db); 688 689 db->db_state = DB_READ; 690 mutex_exit(&db->db_mtx); 691 692 if (DBUF_IS_L2CACHEABLE(db)) 693 aflags |= ARC_FLAG_L2CACHE; 694 if (DBUF_IS_L2COMPRESSIBLE(db)) 695 aflags |= ARC_FLAG_L2COMPRESS; 696 697 SET_BOOKMARK(&zb, db->db_objset->os_dsl_dataset ? 
698 db->db_objset->os_dsl_dataset->ds_object : DMU_META_OBJSET, 699 db->db.db_object, db->db_level, db->db_blkid); 700 701 dbuf_add_ref(db, NULL); 702 703 (void) arc_read(zio, db->db_objset->os_spa, db->db_blkptr, 704 dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ, 705 (*flags & DB_RF_CANFAIL) ? ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED, 706 &aflags, &zb); 707 if (aflags & ARC_FLAG_CACHED) 708 *flags |= DB_RF_CACHED; 709 } 710 711 int 712 dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags) 713 { 714 int err = 0; 715 boolean_t havepzio = (zio != NULL); 716 boolean_t prefetch; 717 dnode_t *dn; 718 719 /* 720 * We don't have to hold the mutex to check db_state because it 721 * can't be freed while we have a hold on the buffer. 722 */ 723 ASSERT(!refcount_is_zero(&db->db_holds)); 724 725 if (db->db_state == DB_NOFILL) 726 return (SET_ERROR(EIO)); 727 728 DB_DNODE_ENTER(db); 729 dn = DB_DNODE(db); 730 if ((flags & DB_RF_HAVESTRUCT) == 0) 731 rw_enter(&dn->dn_struct_rwlock, RW_READER); 732 733 prefetch = db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID && 734 (flags & DB_RF_NOPREFETCH) == 0 && dn != NULL && 735 DBUF_IS_CACHEABLE(db); 736 737 mutex_enter(&db->db_mtx); 738 if (db->db_state == DB_CACHED) { 739 mutex_exit(&db->db_mtx); 740 if (prefetch) 741 dmu_zfetch(&dn->dn_zfetch, db->db.db_offset, 742 db->db.db_size, TRUE); 743 if ((flags & DB_RF_HAVESTRUCT) == 0) 744 rw_exit(&dn->dn_struct_rwlock); 745 DB_DNODE_EXIT(db); 746 } else if (db->db_state == DB_UNCACHED) { 747 spa_t *spa = dn->dn_objset->os_spa; 748 749 if (zio == NULL) 750 zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); 751 dbuf_read_impl(db, zio, &flags); 752 753 /* dbuf_read_impl has dropped db_mtx for us */ 754 755 if (prefetch) 756 dmu_zfetch(&dn->dn_zfetch, db->db.db_offset, 757 db->db.db_size, flags & DB_RF_CACHED); 758 759 if ((flags & DB_RF_HAVESTRUCT) == 0) 760 rw_exit(&dn->dn_struct_rwlock); 761 DB_DNODE_EXIT(db); 762 763 if (!havepzio) 764 err = zio_wait(zio); 765 } else { 766 /* 767 * Another reader came in while the dbuf was in flight 768 * between UNCACHED and CACHED. Either a writer will finish 769 * writing the buffer (sending the dbuf to CACHED) or the 770 * first reader's request will reach the read_done callback 771 * and send the dbuf to CACHED. Otherwise, a failure 772 * occurred and the dbuf went to UNCACHED. 773 */ 774 mutex_exit(&db->db_mtx); 775 if (prefetch) 776 dmu_zfetch(&dn->dn_zfetch, db->db.db_offset, 777 db->db.db_size, TRUE); 778 if ((flags & DB_RF_HAVESTRUCT) == 0) 779 rw_exit(&dn->dn_struct_rwlock); 780 DB_DNODE_EXIT(db); 781 782 /* Skip the wait per the caller's request. 
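* Otherwise, wait for the in-flight read or fill to finish, then check whether it left the dbuf CACHED or failed back to UNCACHED.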
*/ 783 mutex_enter(&db->db_mtx); 784 if ((flags & DB_RF_NEVERWAIT) == 0) { 785 while (db->db_state == DB_READ || 786 db->db_state == DB_FILL) { 787 ASSERT(db->db_state == DB_READ || 788 (flags & DB_RF_HAVESTRUCT) == 0); 789 DTRACE_PROBE2(blocked__read, dmu_buf_impl_t *, 790 db, zio_t *, zio); 791 cv_wait(&db->db_changed, &db->db_mtx); 792 } 793 if (db->db_state == DB_UNCACHED) 794 err = SET_ERROR(EIO); 795 } 796 mutex_exit(&db->db_mtx); 797 } 798 799 ASSERT(err || havepzio || db->db_state == DB_CACHED); 800 return (err); 801 } 802 803 static void 804 dbuf_noread(dmu_buf_impl_t *db) 805 { 806 ASSERT(!refcount_is_zero(&db->db_holds)); 807 ASSERT(db->db_blkid != DMU_BONUS_BLKID); 808 mutex_enter(&db->db_mtx); 809 while (db->db_state == DB_READ || db->db_state == DB_FILL) 810 cv_wait(&db->db_changed, &db->db_mtx); 811 if (db->db_state == DB_UNCACHED) { 812 arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); 813 spa_t *spa = db->db_objset->os_spa; 814 815 ASSERT(db->db_buf == NULL); 816 ASSERT(db->db.db_data == NULL); 817 dbuf_set_data(db, arc_buf_alloc(spa, db->db.db_size, db, type)); 818 db->db_state = DB_FILL; 819 } else if (db->db_state == DB_NOFILL) { 820 dbuf_clear_data(db); 821 } else { 822 ASSERT3U(db->db_state, ==, DB_CACHED); 823 } 824 mutex_exit(&db->db_mtx); 825 } 826 827 /* 828 * This is our just-in-time copy function. It makes a copy of 829 * buffers that have been modified in a previous transaction 830 * group, before we modify them in the current active group. 831 * 832 * This function is used in two places: when we are dirtying a 833 * buffer for the first time in a txg, and when we are freeing 834 * a range in a dnode that includes this buffer. 835 * 836 * Note that when we are called from dbuf_free_range() we do 837 * not put a hold on the buffer, we just traverse the active 838 * dbuf list for the dnode. 839 */ 840 static void 841 dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg) 842 { 843 dbuf_dirty_record_t *dr = db->db_last_dirty; 844 845 ASSERT(MUTEX_HELD(&db->db_mtx)); 846 ASSERT(db->db.db_data != NULL); 847 ASSERT(db->db_level == 0); 848 ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT); 849 850 if (dr == NULL || 851 (dr->dt.dl.dr_data != 852 ((db->db_blkid == DMU_BONUS_BLKID) ? db->db.db_data : db->db_buf))) 853 return; 854 855 /* 856 * If the last dirty record for this dbuf has not yet synced 857 * and it's referencing the dbuf data, either: 858 * reset the reference to point to a new copy, 859 * or (if there are no active holders) 860 * just null out the current db_data pointer.
861 */ 862 ASSERT(dr->dr_txg >= txg - 2); 863 if (db->db_blkid == DMU_BONUS_BLKID) { 864 /* Note that the data bufs here are zio_bufs */ 865 dr->dt.dl.dr_data = zio_buf_alloc(DN_MAX_BONUSLEN); 866 arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER); 867 bcopy(db->db.db_data, dr->dt.dl.dr_data, DN_MAX_BONUSLEN); 868 } else if (refcount_count(&db->db_holds) > db->db_dirtycnt) { 869 int size = db->db.db_size; 870 arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); 871 spa_t *spa = db->db_objset->os_spa; 872 873 dr->dt.dl.dr_data = arc_buf_alloc(spa, size, db, type); 874 bcopy(db->db.db_data, dr->dt.dl.dr_data->b_data, size); 875 } else { 876 dbuf_clear_data(db); 877 } 878 } 879 880 void 881 dbuf_unoverride(dbuf_dirty_record_t *dr) 882 { 883 dmu_buf_impl_t *db = dr->dr_dbuf; 884 blkptr_t *bp = &dr->dt.dl.dr_overridden_by; 885 uint64_t txg = dr->dr_txg; 886 887 ASSERT(MUTEX_HELD(&db->db_mtx)); 888 ASSERT(dr->dt.dl.dr_override_state != DR_IN_DMU_SYNC); 889 ASSERT(db->db_level == 0); 890 891 if (db->db_blkid == DMU_BONUS_BLKID || 892 dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN) 893 return; 894 895 ASSERT(db->db_data_pending != dr); 896 897 /* free this block */ 898 if (!BP_IS_HOLE(bp) && !dr->dt.dl.dr_nopwrite) 899 zio_free(db->db_objset->os_spa, txg, bp); 900 901 dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN; 902 dr->dt.dl.dr_nopwrite = B_FALSE; 903 904 /* 905 * Release the already-written buffer, so we leave it in 906 * a consistent dirty state. Note that all callers are 907 * modifying the buffer, so they will immediately do 908 * another (redundant) arc_release(). Therefore, leave 909 * the buf thawed to save the effort of freezing & 910 * immediately re-thawing it. 911 */ 912 arc_release(dr->dt.dl.dr_data, db); 913 } 914 915 /* 916 * Evict (if it's unreferenced) or clear (if it's referenced) any level-0 917 * data blocks in the free range, so that any future readers will find 918 * empty blocks. 919 * 920 * This is a no-op if the dataset is in the middle of an incremental 921 * receive; see comment below for details. 922 */ 923 void 924 dbuf_free_range(dnode_t *dn, uint64_t start_blkid, uint64_t end_blkid, 925 dmu_tx_t *tx) 926 { 927 dmu_buf_impl_t db_search; 928 dmu_buf_impl_t *db, *db_next; 929 uint64_t txg = tx->tx_txg; 930 avl_index_t where; 931 932 if (end_blkid > dn->dn_maxblkid && (end_blkid != DMU_SPILL_BLKID)) 933 end_blkid = dn->dn_maxblkid; 934 dprintf_dnode(dn, "start=%llu end=%llu\n", start_blkid, end_blkid); 935 936 db_search.db_level = 0; 937 db_search.db_blkid = start_blkid; 938 db_search.db_state = DB_SEARCH; 939 940 mutex_enter(&dn->dn_dbufs_mtx); 941 if (start_blkid >= dn->dn_unlisted_l0_blkid) { 942 /* There can't be any dbufs in this range; no need to search. */ 943 #ifdef DEBUG 944 db = avl_find(&dn->dn_dbufs, &db_search, &where); 945 ASSERT3P(db, ==, NULL); 946 db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER); 947 ASSERT(db == NULL || db->db_level > 0); 948 #endif 949 mutex_exit(&dn->dn_dbufs_mtx); 950 return; 951 } else if (dmu_objset_is_receiving(dn->dn_objset)) { 952 /* 953 * If we are receiving, we expect there to be no dbufs in 954 * the range to be freed, because receive modifies each 955 * block at most once, and in offset order. If this is 956 * not the case, it can lead to performance problems, 957 * so note that we unexpectedly took the slow path.
958 */ 959 atomic_inc_64(&zfs_free_range_recv_miss); 960 } 961 962 db = avl_find(&dn->dn_dbufs, &db_search, &where); 963 ASSERT3P(db, ==, NULL); 964 db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER); 965 966 for (; db != NULL; db = db_next) { 967 db_next = AVL_NEXT(&dn->dn_dbufs, db); 968 ASSERT(db->db_blkid != DMU_BONUS_BLKID); 969 970 if (db->db_level != 0 || db->db_blkid > end_blkid) { 971 break; 972 } 973 ASSERT3U(db->db_blkid, >=, start_blkid); 974 975 /* found a level 0 buffer in the range */ 976 mutex_enter(&db->db_mtx); 977 if (dbuf_undirty(db, tx)) { 978 /* mutex has been dropped and dbuf destroyed */ 979 continue; 980 } 981 982 if (db->db_state == DB_UNCACHED || 983 db->db_state == DB_NOFILL || 984 db->db_state == DB_EVICTING) { 985 ASSERT(db->db.db_data == NULL); 986 mutex_exit(&db->db_mtx); 987 continue; 988 } 989 if (db->db_state == DB_READ || db->db_state == DB_FILL) { 990 /* will be handled in dbuf_read_done or dbuf_rele */ 991 db->db_freed_in_flight = TRUE; 992 mutex_exit(&db->db_mtx); 993 continue; 994 } 995 if (refcount_count(&db->db_holds) == 0) { 996 ASSERT(db->db_buf); 997 dbuf_clear(db); 998 continue; 999 } 1000 /* The dbuf is referenced */ 1001 1002 if (db->db_last_dirty != NULL) { 1003 dbuf_dirty_record_t *dr = db->db_last_dirty; 1004 1005 if (dr->dr_txg == txg) { 1006 /* 1007 * This buffer is "in-use", re-adjust the file 1008 * size to reflect that this buffer may 1009 * contain new data when we sync. 1010 */ 1011 if (db->db_blkid != DMU_SPILL_BLKID && 1012 db->db_blkid > dn->dn_maxblkid) 1013 dn->dn_maxblkid = db->db_blkid; 1014 dbuf_unoverride(dr); 1015 } else { 1016 /* 1017 * This dbuf is not dirty in the open context. 1018 * Either uncache it (if it's not referenced in 1019 * the open context) or reset its contents to 1020 * empty. 1021 */ 1022 dbuf_fix_old_data(db, txg); 1023 } 1024 } 1025 /* clear the contents if it's cached */ 1026 if (db->db_state == DB_CACHED) { 1027 ASSERT(db->db.db_data != NULL); 1028 arc_release(db->db_buf, db); 1029 bzero(db->db.db_data, db->db.db_size); 1030 arc_buf_freeze(db->db_buf); 1031 } 1032 1033 mutex_exit(&db->db_mtx); 1034 } 1035 mutex_exit(&dn->dn_dbufs_mtx); 1036 } 1037 1038 static int 1039 dbuf_block_freeable(dmu_buf_impl_t *db) 1040 { 1041 dsl_dataset_t *ds = db->db_objset->os_dsl_dataset; 1042 uint64_t birth_txg = 0; 1043 1044 /* 1045 * We don't need any locking to protect db_blkptr: 1046 * If it's syncing, then db_last_dirty will be set 1047 * so we'll ignore db_blkptr. 1048 * 1049 * This logic ensures that only block births for 1050 * filled blocks are considered. 1051 */ 1052 ASSERT(MUTEX_HELD(&db->db_mtx)); 1053 if (db->db_last_dirty && (db->db_blkptr == NULL || 1054 !BP_IS_HOLE(db->db_blkptr))) { 1055 birth_txg = db->db_last_dirty->dr_txg; 1056 } else if (db->db_blkptr != NULL && !BP_IS_HOLE(db->db_blkptr)) { 1057 birth_txg = db->db_blkptr->blk_birth; 1058 } 1059 1060 /* 1061 * If this block doesn't exist or is in a snapshot, it can't be freed. 1062 * Don't pass the bp to dsl_dataset_block_freeable() since we 1063 * are holding the db_mtx lock and might deadlock if we are 1064 * prefetching a dedup-ed block.
1065 */ 1066 if (birth_txg != 0) 1067 return (ds == NULL || 1068 dsl_dataset_block_freeable(ds, NULL, birth_txg)); 1069 else 1070 return (B_FALSE); 1071 } 1072 1073 void 1074 dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx) 1075 { 1076 arc_buf_t *buf, *obuf; 1077 int osize = db->db.db_size; 1078 arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); 1079 dnode_t *dn; 1080 1081 ASSERT(db->db_blkid != DMU_BONUS_BLKID); 1082 1083 DB_DNODE_ENTER(db); 1084 dn = DB_DNODE(db); 1085 1086 /* XXX does *this* func really need the lock? */ 1087 ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock)); 1088 1089 /* 1090 * This call to dmu_buf_will_dirty() with the dn_struct_rwlock held 1091 * is OK, because there can be no other references to the db 1092 * when we are changing its size, so no concurrent DB_FILL can 1093 * be happening. 1094 */ 1095 /* 1096 * XXX we should be doing a dbuf_read, checking the return 1097 * value and returning that up to our callers 1098 */ 1099 dmu_buf_will_dirty(&db->db, tx); 1100 1101 /* create the data buffer for the new block */ 1102 buf = arc_buf_alloc(dn->dn_objset->os_spa, size, db, type); 1103 1104 /* copy old block data to the new block */ 1105 obuf = db->db_buf; 1106 bcopy(obuf->b_data, buf->b_data, MIN(osize, size)); 1107 /* zero the remainder */ 1108 if (size > osize) 1109 bzero((uint8_t *)buf->b_data + osize, size - osize); 1110 1111 mutex_enter(&db->db_mtx); 1112 dbuf_set_data(db, buf); 1113 VERIFY(arc_buf_remove_ref(obuf, db)); 1114 db->db.db_size = size; 1115 1116 if (db->db_level == 0) { 1117 ASSERT3U(db->db_last_dirty->dr_txg, ==, tx->tx_txg); 1118 db->db_last_dirty->dt.dl.dr_data = buf; 1119 } 1120 mutex_exit(&db->db_mtx); 1121 1122 dnode_willuse_space(dn, size-osize, tx); 1123 DB_DNODE_EXIT(db); 1124 } 1125 1126 void 1127 dbuf_release_bp(dmu_buf_impl_t *db) 1128 { 1129 objset_t *os = db->db_objset; 1130 1131 ASSERT(dsl_pool_sync_context(dmu_objset_pool(os))); 1132 ASSERT(arc_released(os->os_phys_buf) || 1133 list_link_active(&os->os_dsl_dataset->ds_synced_link)); 1134 ASSERT(db->db_parent == NULL || arc_released(db->db_parent->db_buf)); 1135 1136 (void) arc_release(db->db_buf, db); 1137 } 1138 1139 dbuf_dirty_record_t * 1140 dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) 1141 { 1142 dnode_t *dn; 1143 objset_t *os; 1144 dbuf_dirty_record_t **drp, *dr; 1145 int drop_struct_lock = FALSE; 1146 boolean_t do_free_accounting = B_FALSE; 1147 int txgoff = tx->tx_txg & TXG_MASK; 1148 1149 ASSERT(tx->tx_txg != 0); 1150 ASSERT(!refcount_is_zero(&db->db_holds)); 1151 DMU_TX_DIRTY_BUF(tx, db); 1152 1153 DB_DNODE_ENTER(db); 1154 dn = DB_DNODE(db); 1155 /* 1156 * Shouldn't dirty a regular buffer in syncing context. Private 1157 * objects may be dirtied in syncing context, but only if they 1158 * were already pre-dirtied in open context. 1159 */ 1160 ASSERT(!dmu_tx_is_syncing(tx) || 1161 BP_IS_HOLE(dn->dn_objset->os_rootbp) || 1162 DMU_OBJECT_IS_SPECIAL(dn->dn_object) || 1163 dn->dn_objset->os_dsl_dataset == NULL); 1164 /* 1165 * We make this assert for private objects as well, but after we 1166 * check if we're already dirty. They are allowed to re-dirty 1167 * in syncing context. 1168 */ 1169 ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT || 1170 dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx == 1171 (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN)); 1172 1173 mutex_enter(&db->db_mtx); 1174 /* 1175 * XXX make this true for indirects too? 
The problem is that 1176 * transactions created with dmu_tx_create_assigned() from 1177 * syncing context don't bother holding ahead. 1178 */ 1179 ASSERT(db->db_level != 0 || 1180 db->db_state == DB_CACHED || db->db_state == DB_FILL || 1181 db->db_state == DB_NOFILL); 1182 1183 mutex_enter(&dn->dn_mtx); 1184 /* 1185 * Don't set dirtyctx to SYNC if we're just modifying this as we 1186 * initialize the objset. 1187 */ 1188 if (dn->dn_dirtyctx == DN_UNDIRTIED && 1189 !BP_IS_HOLE(dn->dn_objset->os_rootbp)) { 1190 dn->dn_dirtyctx = 1191 (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN); 1192 ASSERT(dn->dn_dirtyctx_firstset == NULL); 1193 dn->dn_dirtyctx_firstset = kmem_alloc(1, KM_SLEEP); 1194 } 1195 mutex_exit(&dn->dn_mtx); 1196 1197 if (db->db_blkid == DMU_SPILL_BLKID) 1198 dn->dn_have_spill = B_TRUE; 1199 1200 /* 1201 * If this buffer is already dirty, we're done. 1202 */ 1203 drp = &db->db_last_dirty; 1204 ASSERT(*drp == NULL || (*drp)->dr_txg <= tx->tx_txg || 1205 db->db.db_object == DMU_META_DNODE_OBJECT); 1206 while ((dr = *drp) != NULL && dr->dr_txg > tx->tx_txg) 1207 drp = &dr->dr_next; 1208 if (dr && dr->dr_txg == tx->tx_txg) { 1209 DB_DNODE_EXIT(db); 1210 1211 if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID) { 1212 /* 1213 * If this buffer has already been written out, 1214 * we now need to reset its state. 1215 */ 1216 dbuf_unoverride(dr); 1217 if (db->db.db_object != DMU_META_DNODE_OBJECT && 1218 db->db_state != DB_NOFILL) 1219 arc_buf_thaw(db->db_buf); 1220 } 1221 mutex_exit(&db->db_mtx); 1222 return (dr); 1223 } 1224 1225 /* 1226 * Only valid if not already dirty. 1227 */ 1228 ASSERT(dn->dn_object == 0 || 1229 dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx == 1230 (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN)); 1231 1232 ASSERT3U(dn->dn_nlevels, >, db->db_level); 1233 ASSERT((dn->dn_phys->dn_nlevels == 0 && db->db_level == 0) || 1234 dn->dn_phys->dn_nlevels > db->db_level || 1235 dn->dn_next_nlevels[txgoff] > db->db_level || 1236 dn->dn_next_nlevels[(tx->tx_txg-1) & TXG_MASK] > db->db_level || 1237 dn->dn_next_nlevels[(tx->tx_txg-2) & TXG_MASK] > db->db_level); 1238 1239 /* 1240 * We should only be dirtying in syncing context if it's the 1241 * mos or we're initializing the os or it's a special object. 1242 * However, we are allowed to dirty in syncing context provided 1243 * we already dirtied it in open context. Hence we must make 1244 * this assertion only if we're not already dirty. 1245 */ 1246 os = dn->dn_objset; 1247 ASSERT(!dmu_tx_is_syncing(tx) || DMU_OBJECT_IS_SPECIAL(dn->dn_object) || 1248 os->os_dsl_dataset == NULL || BP_IS_HOLE(os->os_rootbp)); 1249 ASSERT(db->db.db_size != 0); 1250 1251 dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size); 1252 1253 if (db->db_blkid != DMU_BONUS_BLKID) { 1254 /* 1255 * Update the accounting. 1256 * Note: we delay "free accounting" until after we drop 1257 * the db_mtx. This keeps us from grabbing other locks 1258 * (and possibly deadlocking) in bp_get_dsize() while 1259 * also holding the db_mtx. 1260 */ 1261 dnode_willuse_space(dn, db->db.db_size, tx); 1262 do_free_accounting = dbuf_block_freeable(db); 1263 } 1264 1265 /* 1266 * If this buffer is dirty in an old transaction group we need 1267 * to make a copy of it so that the changes we make in this 1268 * transaction group won't leak out when we sync the older txg. 
1269 */ 1270 dr = kmem_zalloc(sizeof (dbuf_dirty_record_t), KM_SLEEP); 1271 if (db->db_level == 0) { 1272 void *data_old = db->db_buf; 1273 1274 if (db->db_state != DB_NOFILL) { 1275 if (db->db_blkid == DMU_BONUS_BLKID) { 1276 dbuf_fix_old_data(db, tx->tx_txg); 1277 data_old = db->db.db_data; 1278 } else if (db->db.db_object != DMU_META_DNODE_OBJECT) { 1279 /* 1280 * Release the data buffer from the cache so 1281 * that we can modify it without impacting 1282 * possible other users of this cached data 1283 * block. Note that indirect blocks and 1284 * private objects are not released until the 1285 * syncing state (since they are only modified 1286 * then). 1287 */ 1288 arc_release(db->db_buf, db); 1289 dbuf_fix_old_data(db, tx->tx_txg); 1290 data_old = db->db_buf; 1291 } 1292 ASSERT(data_old != NULL); 1293 } 1294 dr->dt.dl.dr_data = data_old; 1295 } else { 1296 mutex_init(&dr->dt.di.dr_mtx, NULL, MUTEX_DEFAULT, NULL); 1297 list_create(&dr->dt.di.dr_children, 1298 sizeof (dbuf_dirty_record_t), 1299 offsetof(dbuf_dirty_record_t, dr_dirty_node)); 1300 } 1301 if (db->db_blkid != DMU_BONUS_BLKID && os->os_dsl_dataset != NULL) 1302 dr->dr_accounted = db->db.db_size; 1303 dr->dr_dbuf = db; 1304 dr->dr_txg = tx->tx_txg; 1305 dr->dr_next = *drp; 1306 *drp = dr; 1307 1308 /* 1309 * We could have been freed_in_flight between the dbuf_noread 1310 * and dbuf_dirty. We win, as though the dbuf_noread() had 1311 * happened after the free. 1312 */ 1313 if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID && 1314 db->db_blkid != DMU_SPILL_BLKID) { 1315 mutex_enter(&dn->dn_mtx); 1316 if (dn->dn_free_ranges[txgoff] != NULL) { 1317 range_tree_clear(dn->dn_free_ranges[txgoff], 1318 db->db_blkid, 1); 1319 } 1320 mutex_exit(&dn->dn_mtx); 1321 db->db_freed_in_flight = FALSE; 1322 } 1323 1324 /* 1325 * This buffer is now part of this txg 1326 */ 1327 dbuf_add_ref(db, (void *)(uintptr_t)tx->tx_txg); 1328 db->db_dirtycnt += 1; 1329 ASSERT3U(db->db_dirtycnt, <=, 3); 1330 1331 mutex_exit(&db->db_mtx); 1332 1333 if (db->db_blkid == DMU_BONUS_BLKID || 1334 db->db_blkid == DMU_SPILL_BLKID) { 1335 mutex_enter(&dn->dn_mtx); 1336 ASSERT(!list_link_active(&dr->dr_dirty_node)); 1337 list_insert_tail(&dn->dn_dirty_records[txgoff], dr); 1338 mutex_exit(&dn->dn_mtx); 1339 dnode_setdirty(dn, tx); 1340 DB_DNODE_EXIT(db); 1341 return (dr); 1342 } else if (do_free_accounting) { 1343 blkptr_t *bp = db->db_blkptr; 1344 int64_t willfree = (bp && !BP_IS_HOLE(bp)) ? 1345 bp_get_dsize(os->os_spa, bp) : db->db.db_size; 1346 /* 1347 * This is only a guess -- if the dbuf is dirty 1348 * in a previous txg, we don't know how much 1349 * space it will use on disk yet. We should 1350 * really have the struct_rwlock to access 1351 * db_blkptr, but since this is just a guess, 1352 * it's OK if we get an odd answer. 
1353 */ 1354 ddt_prefetch(os->os_spa, bp); 1355 dnode_willuse_space(dn, -willfree, tx); 1356 } 1357 1358 if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) { 1359 rw_enter(&dn->dn_struct_rwlock, RW_READER); 1360 drop_struct_lock = TRUE; 1361 } 1362 1363 if (db->db_level == 0) { 1364 dnode_new_blkid(dn, db->db_blkid, tx, drop_struct_lock); 1365 ASSERT(dn->dn_maxblkid >= db->db_blkid); 1366 } 1367 1368 if (db->db_level+1 < dn->dn_nlevels) { 1369 dmu_buf_impl_t *parent = db->db_parent; 1370 dbuf_dirty_record_t *di; 1371 int parent_held = FALSE; 1372 1373 if (db->db_parent == NULL || db->db_parent == dn->dn_dbuf) { 1374 int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; 1375 1376 parent = dbuf_hold_level(dn, db->db_level+1, 1377 db->db_blkid >> epbs, FTAG); 1378 ASSERT(parent != NULL); 1379 parent_held = TRUE; 1380 } 1381 if (drop_struct_lock) 1382 rw_exit(&dn->dn_struct_rwlock); 1383 ASSERT3U(db->db_level+1, ==, parent->db_level); 1384 di = dbuf_dirty(parent, tx); 1385 if (parent_held) 1386 dbuf_rele(parent, FTAG); 1387 1388 mutex_enter(&db->db_mtx); 1389 /* 1390 * Since we've dropped the mutex, it's possible that 1391 * dbuf_undirty() might have changed this out from under us. 1392 */ 1393 if (db->db_last_dirty == dr || 1394 dn->dn_object == DMU_META_DNODE_OBJECT) { 1395 mutex_enter(&di->dt.di.dr_mtx); 1396 ASSERT3U(di->dr_txg, ==, tx->tx_txg); 1397 ASSERT(!list_link_active(&dr->dr_dirty_node)); 1398 list_insert_tail(&di->dt.di.dr_children, dr); 1399 mutex_exit(&di->dt.di.dr_mtx); 1400 dr->dr_parent = di; 1401 } 1402 mutex_exit(&db->db_mtx); 1403 } else { 1404 ASSERT(db->db_level+1 == dn->dn_nlevels); 1405 ASSERT(db->db_blkid < dn->dn_nblkptr); 1406 ASSERT(db->db_parent == NULL || db->db_parent == dn->dn_dbuf); 1407 mutex_enter(&dn->dn_mtx); 1408 ASSERT(!list_link_active(&dr->dr_dirty_node)); 1409 list_insert_tail(&dn->dn_dirty_records[txgoff], dr); 1410 mutex_exit(&dn->dn_mtx); 1411 if (drop_struct_lock) 1412 rw_exit(&dn->dn_struct_rwlock); 1413 } 1414 1415 dnode_setdirty(dn, tx); 1416 DB_DNODE_EXIT(db); 1417 return (dr); 1418 } 1419 1420 /* 1421 * Undirty a buffer in the transaction group referenced by the given 1422 * transaction. Return whether this evicted the dbuf. 1423 */ 1424 static boolean_t 1425 dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) 1426 { 1427 dnode_t *dn; 1428 uint64_t txg = tx->tx_txg; 1429 dbuf_dirty_record_t *dr, **drp; 1430 1431 ASSERT(txg != 0); 1432 1433 /* 1434 * Due to our use of dn_nlevels below, this can only be called 1435 * in open context, unless we are operating on the MOS. 1436 * From syncing context, dn_nlevels may be different from the 1437 * dn_nlevels used when dbuf was dirtied. 1438 */ 1439 ASSERT(db->db_objset == 1440 dmu_objset_pool(db->db_objset)->dp_meta_objset || 1441 txg != spa_syncing_txg(dmu_objset_spa(db->db_objset))); 1442 ASSERT(db->db_blkid != DMU_BONUS_BLKID); 1443 ASSERT0(db->db_level); 1444 ASSERT(MUTEX_HELD(&db->db_mtx)); 1445 1446 /* 1447 * If this buffer is not dirty, we're done. 
1448 */ 1449 for (drp = &db->db_last_dirty; (dr = *drp) != NULL; drp = &dr->dr_next) 1450 if (dr->dr_txg <= txg) 1451 break; 1452 if (dr == NULL || dr->dr_txg < txg) 1453 return (B_FALSE); 1454 ASSERT(dr->dr_txg == txg); 1455 ASSERT(dr->dr_dbuf == db); 1456 1457 DB_DNODE_ENTER(db); 1458 dn = DB_DNODE(db); 1459 1460 dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size); 1461 1462 ASSERT(db->db.db_size != 0); 1463 1464 dsl_pool_undirty_space(dmu_objset_pool(dn->dn_objset), 1465 dr->dr_accounted, txg); 1466 1467 *drp = dr->dr_next; 1468 1469 /* 1470 * Note that there are three places in dbuf_dirty() 1471 * where this dirty record may be put on a list. 1472 * Make sure to do a list_remove corresponding to 1473 * every one of those list_insert calls. 1474 */ 1475 if (dr->dr_parent) { 1476 mutex_enter(&dr->dr_parent->dt.di.dr_mtx); 1477 list_remove(&dr->dr_parent->dt.di.dr_children, dr); 1478 mutex_exit(&dr->dr_parent->dt.di.dr_mtx); 1479 } else if (db->db_blkid == DMU_SPILL_BLKID || 1480 db->db_level + 1 == dn->dn_nlevels) { 1481 ASSERT(db->db_blkptr == NULL || db->db_parent == dn->dn_dbuf); 1482 mutex_enter(&dn->dn_mtx); 1483 list_remove(&dn->dn_dirty_records[txg & TXG_MASK], dr); 1484 mutex_exit(&dn->dn_mtx); 1485 } 1486 DB_DNODE_EXIT(db); 1487 1488 if (db->db_state != DB_NOFILL) { 1489 dbuf_unoverride(dr); 1490 1491 ASSERT(db->db_buf != NULL); 1492 ASSERT(dr->dt.dl.dr_data != NULL); 1493 if (dr->dt.dl.dr_data != db->db_buf) 1494 VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data, db)); 1495 } 1496 1497 kmem_free(dr, sizeof (dbuf_dirty_record_t)); 1498 1499 ASSERT(db->db_dirtycnt > 0); 1500 db->db_dirtycnt -= 1; 1501 1502 if (refcount_remove(&db->db_holds, (void *)(uintptr_t)txg) == 0) { 1503 arc_buf_t *buf = db->db_buf; 1504 1505 ASSERT(db->db_state == DB_NOFILL || arc_released(buf)); 1506 dbuf_clear_data(db); 1507 VERIFY(arc_buf_remove_ref(buf, db)); 1508 dbuf_evict(db); 1509 return (B_TRUE); 1510 } 1511 1512 return (B_FALSE); 1513 } 1514 1515 void 1516 dmu_buf_will_dirty(dmu_buf_t *db_fake, dmu_tx_t *tx) 1517 { 1518 dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; 1519 int rf = DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH; 1520 1521 ASSERT(tx->tx_txg != 0); 1522 ASSERT(!refcount_is_zero(&db->db_holds)); 1523 1524 DB_DNODE_ENTER(db); 1525 if (RW_WRITE_HELD(&DB_DNODE(db)->dn_struct_rwlock)) 1526 rf |= DB_RF_HAVESTRUCT; 1527 DB_DNODE_EXIT(db); 1528 (void) dbuf_read(db, NULL, rf); 1529 (void) dbuf_dirty(db, tx); 1530 } 1531 1532 void 1533 dmu_buf_will_not_fill(dmu_buf_t *db_fake, dmu_tx_t *tx) 1534 { 1535 dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; 1536 1537 db->db_state = DB_NOFILL; 1538 1539 dmu_buf_will_fill(db_fake, tx); 1540 } 1541 1542 void 1543 dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx) 1544 { 1545 dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; 1546 1547 ASSERT(db->db_blkid != DMU_BONUS_BLKID); 1548 ASSERT(tx->tx_txg != 0); 1549 ASSERT(db->db_level == 0); 1550 ASSERT(!refcount_is_zero(&db->db_holds)); 1551 1552 ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT || 1553 dmu_tx_private_ok(tx)); 1554 1555 dbuf_noread(db); 1556 (void) dbuf_dirty(db, tx); 1557 } 1558 1559 #pragma weak dmu_buf_fill_done = dbuf_fill_done 1560 /* ARGSUSED */ 1561 void 1562 dbuf_fill_done(dmu_buf_impl_t *db, dmu_tx_t *tx) 1563 { 1564 mutex_enter(&db->db_mtx); 1565 DBUF_VERIFY(db); 1566 1567 if (db->db_state == DB_FILL) { 1568 if (db->db_level == 0 && db->db_freed_in_flight) { 1569 ASSERT(db->db_blkid != DMU_BONUS_BLKID); 1570 /* we were freed while filling */ 1571 /* XXX dbuf_undirty? 
*/ 1572 bzero(db->db.db_data, db->db.db_size); 1573 db->db_freed_in_flight = FALSE; 1574 } 1575 db->db_state = DB_CACHED; 1576 cv_broadcast(&db->db_changed); 1577 } 1578 mutex_exit(&db->db_mtx); 1579 } 1580 1581 void 1582 dmu_buf_write_embedded(dmu_buf_t *dbuf, void *data, 1583 bp_embedded_type_t etype, enum zio_compress comp, 1584 int uncompressed_size, int compressed_size, int byteorder, 1585 dmu_tx_t *tx) 1586 { 1587 dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf; 1588 struct dirty_leaf *dl; 1589 dmu_object_type_t type; 1590 1591 if (etype == BP_EMBEDDED_TYPE_DATA) { 1592 ASSERT(spa_feature_is_active(dmu_objset_spa(db->db_objset), 1593 SPA_FEATURE_EMBEDDED_DATA)); 1594 } 1595 1596 DB_DNODE_ENTER(db); 1597 type = DB_DNODE(db)->dn_type; 1598 DB_DNODE_EXIT(db); 1599 1600 ASSERT0(db->db_level); 1601 ASSERT(db->db_blkid != DMU_BONUS_BLKID); 1602 1603 dmu_buf_will_not_fill(dbuf, tx); 1604 1605 ASSERT3U(db->db_last_dirty->dr_txg, ==, tx->tx_txg); 1606 dl = &db->db_last_dirty->dt.dl; 1607 encode_embedded_bp_compressed(&dl->dr_overridden_by, 1608 data, comp, uncompressed_size, compressed_size); 1609 BPE_SET_ETYPE(&dl->dr_overridden_by, etype); 1610 BP_SET_TYPE(&dl->dr_overridden_by, type); 1611 BP_SET_LEVEL(&dl->dr_overridden_by, 0); 1612 BP_SET_BYTEORDER(&dl->dr_overridden_by, byteorder); 1613 1614 dl->dr_override_state = DR_OVERRIDDEN; 1615 dl->dr_overridden_by.blk_birth = db->db_last_dirty->dr_txg; 1616 } 1617 1618 /* 1619 * Directly assign a provided arc buf to a given dbuf if it's not referenced 1620 * by anybody except our caller. Otherwise copy arcbuf's contents to dbuf. 1621 */ 1622 void 1623 dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx) 1624 { 1625 ASSERT(!refcount_is_zero(&db->db_holds)); 1626 ASSERT(db->db_blkid != DMU_BONUS_BLKID); 1627 ASSERT(db->db_level == 0); 1628 ASSERT(DBUF_GET_BUFC_TYPE(db) == ARC_BUFC_DATA); 1629 ASSERT(buf != NULL); 1630 ASSERT(arc_buf_size(buf) == db->db.db_size); 1631 ASSERT(tx->tx_txg != 0); 1632 1633 arc_return_buf(buf, db); 1634 ASSERT(arc_released(buf)); 1635 1636 mutex_enter(&db->db_mtx); 1637 1638 while (db->db_state == DB_READ || db->db_state == DB_FILL) 1639 cv_wait(&db->db_changed, &db->db_mtx); 1640 1641 ASSERT(db->db_state == DB_CACHED || db->db_state == DB_UNCACHED); 1642 1643 if (db->db_state == DB_CACHED && 1644 refcount_count(&db->db_holds) - 1 > db->db_dirtycnt) { 1645 mutex_exit(&db->db_mtx); 1646 (void) dbuf_dirty(db, tx); 1647 bcopy(buf->b_data, db->db.db_data, db->db.db_size); 1648 VERIFY(arc_buf_remove_ref(buf, db)); 1649 xuio_stat_wbuf_copied(); 1650 return; 1651 } 1652 1653 xuio_stat_wbuf_nocopy(); 1654 if (db->db_state == DB_CACHED) { 1655 dbuf_dirty_record_t *dr = db->db_last_dirty; 1656 1657 ASSERT(db->db_buf != NULL); 1658 if (dr != NULL && dr->dr_txg == tx->tx_txg) { 1659 ASSERT(dr->dt.dl.dr_data == db->db_buf); 1660 if (!arc_released(db->db_buf)) { 1661 ASSERT(dr->dt.dl.dr_override_state == 1662 DR_OVERRIDDEN); 1663 arc_release(db->db_buf, db); 1664 } 1665 dr->dt.dl.dr_data = buf; 1666 VERIFY(arc_buf_remove_ref(db->db_buf, db)); 1667 } else if (dr == NULL || dr->dt.dl.dr_data != db->db_buf) { 1668 arc_release(db->db_buf, db); 1669 VERIFY(arc_buf_remove_ref(db->db_buf, db)); 1670 } 1671 db->db_buf = NULL; 1672 } 1673 ASSERT(db->db_buf == NULL); 1674 dbuf_set_data(db, buf); 1675 db->db_state = DB_FILL; 1676 mutex_exit(&db->db_mtx); 1677 (void) dbuf_dirty(db, tx); 1678 dmu_buf_fill_done(&db->db, tx); 1679 } 1680 1681 /* 1682 * "Clear" the contents of this dbuf. 
This will mark the dbuf 1683 * EVICTING and clear *most* of its references. Unfortunately, 1684 * when we are not holding the dn_dbufs_mtx, we can't clear the 1685 * entry in the dn_dbufs list. We have to wait until dbuf_destroy() 1686 * in this case. For callers from the DMU we will usually see: 1687 * dbuf_clear()->arc_clear_callback()->dbuf_do_evict()->dbuf_destroy() 1688 * For the arc callback, we will usually see: 1689 * dbuf_do_evict()->dbuf_clear();dbuf_destroy() 1690 * Sometimes, though, we will get a mix of these two: 1691 * DMU: dbuf_clear()->arc_clear_callback() 1692 * ARC: dbuf_do_evict()->dbuf_destroy() 1693 * 1694 * This routine will dissociate the dbuf from the arc, by calling 1695 * arc_clear_callback(), but will not evict the data from the ARC. 1696 */ 1697 void 1698 dbuf_clear(dmu_buf_impl_t *db) 1699 { 1700 dnode_t *dn; 1701 dmu_buf_impl_t *parent = db->db_parent; 1702 dmu_buf_impl_t *dndb; 1703 boolean_t dbuf_gone = B_FALSE; 1704 1705 ASSERT(MUTEX_HELD(&db->db_mtx)); 1706 ASSERT(refcount_is_zero(&db->db_holds)); 1707 1708 dbuf_evict_user(db); 1709 1710 if (db->db_state == DB_CACHED) { 1711 ASSERT(db->db.db_data != NULL); 1712 if (db->db_blkid == DMU_BONUS_BLKID) { 1713 zio_buf_free(db->db.db_data, DN_MAX_BONUSLEN); 1714 arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER); 1715 } 1716 db->db.db_data = NULL; 1717 db->db_state = DB_UNCACHED; 1718 } 1719 1720 ASSERT(db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL); 1721 ASSERT(db->db_data_pending == NULL); 1722 1723 db->db_state = DB_EVICTING; 1724 db->db_blkptr = NULL; 1725 1726 DB_DNODE_ENTER(db); 1727 dn = DB_DNODE(db); 1728 dndb = dn->dn_dbuf; 1729 if (db->db_blkid != DMU_BONUS_BLKID && MUTEX_HELD(&dn->dn_dbufs_mtx)) { 1730 avl_remove(&dn->dn_dbufs, db); 1731 atomic_dec_32(&dn->dn_dbufs_count); 1732 membar_producer(); 1733 DB_DNODE_EXIT(db); 1734 /* 1735 * Decrementing the dbuf count means that the hold corresponding 1736 * to the removed dbuf is no longer discounted in dnode_move(), 1737 * so the dnode cannot be moved until after we release the hold. 1738 * The membar_producer() ensures visibility of the decremented 1739 * value in dnode_move(), since DB_DNODE_EXIT doesn't actually 1740 * release any lock. 1741 */ 1742 dnode_rele(dn, db); 1743 db->db_dnode_handle = NULL; 1744 } else { 1745 DB_DNODE_EXIT(db); 1746 } 1747 1748 if (db->db_buf) 1749 dbuf_gone = arc_clear_callback(db->db_buf); 1750 1751 if (!dbuf_gone) 1752 mutex_exit(&db->db_mtx); 1753 1754 /* 1755 * If this dbuf is referenced from an indirect dbuf, 1756 * decrement the ref count on the indirect dbuf. 1757 */ 1758 if (parent && parent != dndb) 1759 dbuf_rele(parent, db); 1760 } 1761 1762 /* 1763 * Note: While bpp will always be updated if the function returns success, 1764 * parentp will not be updated if the dnode does not have dn_dbuf filled in; 1765 * this happens when the dnode is the meta-dnode, or a userused or groupused 1766 * object. 
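* Callers must therefore be prepared for *parentp to remain NULL even when 0 is returned.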
1767 */ 1768 static int 1769 dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse, 1770 dmu_buf_impl_t **parentp, blkptr_t **bpp) 1771 { 1772 int nlevels, epbs; 1773 1774 *parentp = NULL; 1775 *bpp = NULL; 1776 1777 ASSERT(blkid != DMU_BONUS_BLKID); 1778 1779 if (blkid == DMU_SPILL_BLKID) { 1780 mutex_enter(&dn->dn_mtx); 1781 if (dn->dn_have_spill && 1782 (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR)) 1783 *bpp = &dn->dn_phys->dn_spill; 1784 else 1785 *bpp = NULL; 1786 dbuf_add_ref(dn->dn_dbuf, NULL); 1787 *parentp = dn->dn_dbuf; 1788 mutex_exit(&dn->dn_mtx); 1789 return (0); 1790 } 1791 1792 if (dn->dn_phys->dn_nlevels == 0) 1793 nlevels = 1; 1794 else 1795 nlevels = dn->dn_phys->dn_nlevels; 1796 1797 epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; 1798 1799 ASSERT3U(level * epbs, <, 64); 1800 ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); 1801 if (level >= nlevels || 1802 (blkid > (dn->dn_phys->dn_maxblkid >> (level * epbs)))) { 1803 /* the buffer has no parent yet */ 1804 return (SET_ERROR(ENOENT)); 1805 } else if (level < nlevels-1) { 1806 /* this block is referenced from an indirect block */ 1807 int err = dbuf_hold_impl(dn, level+1, 1808 blkid >> epbs, fail_sparse, FALSE, NULL, parentp); 1809 if (err) 1810 return (err); 1811 err = dbuf_read(*parentp, NULL, 1812 (DB_RF_HAVESTRUCT | DB_RF_NOPREFETCH | DB_RF_CANFAIL)); 1813 if (err) { 1814 dbuf_rele(*parentp, NULL); 1815 *parentp = NULL; 1816 return (err); 1817 } 1818 *bpp = ((blkptr_t *)(*parentp)->db.db_data) + 1819 (blkid & ((1ULL << epbs) - 1)); 1820 return (0); 1821 } else { 1822 /* the block is referenced from the dnode */ 1823 ASSERT3U(level, ==, nlevels-1); 1824 ASSERT(dn->dn_phys->dn_nblkptr == 0 || 1825 blkid < dn->dn_phys->dn_nblkptr); 1826 if (dn->dn_dbuf) { 1827 dbuf_add_ref(dn->dn_dbuf, NULL); 1828 *parentp = dn->dn_dbuf; 1829 } 1830 *bpp = &dn->dn_phys->dn_blkptr[blkid]; 1831 return (0); 1832 } 1833 } 1834 1835 static dmu_buf_impl_t * 1836 dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid, 1837 dmu_buf_impl_t *parent, blkptr_t *blkptr) 1838 { 1839 objset_t *os = dn->dn_objset; 1840 dmu_buf_impl_t *db, *odb; 1841 1842 ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); 1843 ASSERT(dn->dn_type != DMU_OT_NONE); 1844 1845 db = kmem_cache_alloc(dbuf_cache, KM_SLEEP); 1846 1847 db->db_objset = os; 1848 db->db.db_object = dn->dn_object; 1849 db->db_level = level; 1850 db->db_blkid = blkid; 1851 db->db_last_dirty = NULL; 1852 db->db_dirtycnt = 0; 1853 db->db_dnode_handle = dn->dn_handle; 1854 db->db_parent = parent; 1855 db->db_blkptr = blkptr; 1856 1857 db->db_user = NULL; 1858 db->db_user_immediate_evict = FALSE; 1859 db->db_freed_in_flight = FALSE; 1860 db->db_pending_evict = FALSE; 1861 1862 if (blkid == DMU_BONUS_BLKID) { 1863 ASSERT3P(parent, ==, dn->dn_dbuf); 1864 db->db.db_size = DN_MAX_BONUSLEN - 1865 (dn->dn_nblkptr-1) * sizeof (blkptr_t); 1866 ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen); 1867 db->db.db_offset = DMU_BONUS_BLKID; 1868 db->db_state = DB_UNCACHED; 1869 /* the bonus dbuf is not placed in the hash table */ 1870 arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER); 1871 return (db); 1872 } else if (blkid == DMU_SPILL_BLKID) { 1873 db->db.db_size = (blkptr != NULL) ? 1874 BP_GET_LSIZE(blkptr) : SPA_MINBLOCKSIZE; 1875 db->db.db_offset = 0; 1876 } else { 1877 int blocksize = 1878 db->db_level ? 
1 << dn->dn_indblkshift : dn->dn_datablksz; 1879 db->db.db_size = blocksize; 1880 db->db.db_offset = db->db_blkid * blocksize; 1881 } 1882 1883 /* 1884 * Hold the dn_dbufs_mtx while we insert the new dbuf into 1885 * the hash table *and* add it to the dn_dbufs list. 1886 * This prevents a possible deadlock with someone 1887 * trying to look up this dbuf before it's added to the 1888 * dn_dbufs list. 1889 */ 1890 mutex_enter(&dn->dn_dbufs_mtx); 1891 db->db_state = DB_EVICTING; 1892 if ((odb = dbuf_hash_insert(db)) != NULL) { 1893 /* someone else inserted it first */ 1894 kmem_cache_free(dbuf_cache, db); 1895 mutex_exit(&dn->dn_dbufs_mtx); 1896 return (odb); 1897 } 1898 avl_add(&dn->dn_dbufs, db); 1899 if (db->db_level == 0 && db->db_blkid >= 1900 dn->dn_unlisted_l0_blkid) 1901 dn->dn_unlisted_l0_blkid = db->db_blkid + 1; 1902 db->db_state = DB_UNCACHED; 1903 mutex_exit(&dn->dn_dbufs_mtx); 1904 arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER); 1905 1906 if (parent && parent != dn->dn_dbuf) 1907 dbuf_add_ref(parent, db); 1908 1909 ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT || 1910 refcount_count(&dn->dn_holds) > 0); 1911 (void) refcount_add(&dn->dn_holds, db); 1912 atomic_inc_32(&dn->dn_dbufs_count); 1913 1914 dprintf_dbuf(db, "db=%p\n", db); 1915 1916 return (db); 1917 } 1918 1919 static int 1920 dbuf_do_evict(void *private) 1921 { 1922 dmu_buf_impl_t *db = private; 1923 1924 if (!MUTEX_HELD(&db->db_mtx)) 1925 mutex_enter(&db->db_mtx); 1926 1927 ASSERT(refcount_is_zero(&db->db_holds)); 1928 1929 if (db->db_state != DB_EVICTING) { 1930 ASSERT(db->db_state == DB_CACHED); 1931 DBUF_VERIFY(db); 1932 db->db_buf = NULL; 1933 dbuf_evict(db); 1934 } else { 1935 mutex_exit(&db->db_mtx); 1936 dbuf_destroy(db); 1937 } 1938 return (0); 1939 } 1940 1941 static void 1942 dbuf_destroy(dmu_buf_impl_t *db) 1943 { 1944 ASSERT(refcount_is_zero(&db->db_holds)); 1945 1946 if (db->db_blkid != DMU_BONUS_BLKID) { 1947 /* 1948 * If this dbuf is still on the dn_dbufs list, 1949 * remove it from that list. 1950 */ 1951 if (db->db_dnode_handle != NULL) { 1952 dnode_t *dn; 1953 1954 DB_DNODE_ENTER(db); 1955 dn = DB_DNODE(db); 1956 mutex_enter(&dn->dn_dbufs_mtx); 1957 avl_remove(&dn->dn_dbufs, db); 1958 atomic_dec_32(&dn->dn_dbufs_count); 1959 mutex_exit(&dn->dn_dbufs_mtx); 1960 DB_DNODE_EXIT(db); 1961 /* 1962 * Decrementing the dbuf count means that the hold 1963 * corresponding to the removed dbuf is no longer 1964 * discounted in dnode_move(), so the dnode cannot be 1965 * moved until after we release the hold. 1966 */ 1967 dnode_rele(dn, db); 1968 db->db_dnode_handle = NULL; 1969 } 1970 dbuf_hash_remove(db); 1971 } 1972 db->db_parent = NULL; 1973 db->db_buf = NULL; 1974 1975 ASSERT(db->db.db_data == NULL); 1976 ASSERT(db->db_hash_next == NULL); 1977 ASSERT(db->db_blkptr == NULL); 1978 ASSERT(db->db_data_pending == NULL); 1979 1980 kmem_cache_free(dbuf_cache, db); 1981 arc_space_return(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER); 1982 } 1983 1984 typedef struct dbuf_prefetch_arg { 1985 spa_t *dpa_spa; /* The spa to issue the prefetch in. */ 1986 zbookmark_phys_t dpa_zb; /* The target block to prefetch. */ 1987 int dpa_epbs; /* Entries (blkptr_t's) Per Block Shift. */ 1988 int dpa_curlevel; /* The current level that we're reading */ 1989 zio_priority_t dpa_prio; /* The priority I/Os should be issued at. */ 1990 zio_t *dpa_zio; /* The parent zio_t for all prefetches. */ 1991 arc_flags_t dpa_aflags; /* Flags to pass to the final prefetch.
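(these are OR'd with ARC_FLAG_NOWAIT and ARC_FLAG_PREFETCH by dbuf_issue_final_prefetch().)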
*/ 1992 } dbuf_prefetch_arg_t; 1993 1994 /* 1995 * Actually issue the prefetch read for the block given. 1996 */ 1997 static void 1998 dbuf_issue_final_prefetch(dbuf_prefetch_arg_t *dpa, blkptr_t *bp) 1999 { 2000 if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp)) 2001 return; 2002 2003 arc_flags_t aflags = 2004 dpa->dpa_aflags | ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH; 2005 2006 ASSERT3U(dpa->dpa_curlevel, ==, BP_GET_LEVEL(bp)); 2007 ASSERT3U(dpa->dpa_curlevel, ==, dpa->dpa_zb.zb_level); 2008 ASSERT(dpa->dpa_zio != NULL); 2009 (void) arc_read(dpa->dpa_zio, dpa->dpa_spa, bp, NULL, NULL, 2010 dpa->dpa_prio, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, 2011 &aflags, &dpa->dpa_zb); 2012 } 2013 2014 /* 2015 * Called when an indirect block above our prefetch target is read in. This 2016 * will either read in the next indirect block down the tree or issue the actual 2017 * prefetch if the next block down is our target. 2018 */ 2019 static void 2020 dbuf_prefetch_indirect_done(zio_t *zio, arc_buf_t *abuf, void *private) 2021 { 2022 dbuf_prefetch_arg_t *dpa = private; 2023 2024 ASSERT3S(dpa->dpa_zb.zb_level, <, dpa->dpa_curlevel); 2025 ASSERT3S(dpa->dpa_curlevel, >, 0); 2026 if (zio != NULL) { 2027 ASSERT3S(BP_GET_LEVEL(zio->io_bp), ==, dpa->dpa_curlevel); 2028 ASSERT3U(BP_GET_LSIZE(zio->io_bp), ==, zio->io_size); 2029 ASSERT3P(zio->io_spa, ==, dpa->dpa_spa); 2030 } 2031 2032 dpa->dpa_curlevel--; 2033 2034 uint64_t nextblkid = dpa->dpa_zb.zb_blkid >> 2035 (dpa->dpa_epbs * (dpa->dpa_curlevel - dpa->dpa_zb.zb_level)); 2036 blkptr_t *bp = ((blkptr_t *)abuf->b_data) + 2037 P2PHASE(nextblkid, 1ULL << dpa->dpa_epbs); 2038 if (BP_IS_HOLE(bp) || (zio != NULL && zio->io_error != 0)) { 2039 kmem_free(dpa, sizeof (*dpa)); 2040 } else if (dpa->dpa_curlevel == dpa->dpa_zb.zb_level) { 2041 ASSERT3U(nextblkid, ==, dpa->dpa_zb.zb_blkid); 2042 dbuf_issue_final_prefetch(dpa, bp); 2043 kmem_free(dpa, sizeof (*dpa)); 2044 } else { 2045 arc_flags_t iter_aflags = ARC_FLAG_NOWAIT; 2046 zbookmark_phys_t zb; 2047 2048 ASSERT3U(dpa->dpa_curlevel, ==, BP_GET_LEVEL(bp)); 2049 2050 SET_BOOKMARK(&zb, dpa->dpa_zb.zb_objset, 2051 dpa->dpa_zb.zb_object, dpa->dpa_curlevel, nextblkid); 2052 2053 (void) arc_read(dpa->dpa_zio, dpa->dpa_spa, 2054 bp, dbuf_prefetch_indirect_done, dpa, dpa->dpa_prio, 2055 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, 2056 &iter_aflags, &zb); 2057 } 2058 (void) arc_buf_remove_ref(abuf, private); 2059 } 2060 2061 /* 2062 * Issue prefetch reads for the given block on the given level. If the indirect 2063 * blocks above that block are not in memory, we will read them in 2064 * asynchronously. As a result, this call never blocks waiting for a read to 2065 * complete. 2066 */ 2067 void 2068 dbuf_prefetch(dnode_t *dn, int64_t level, uint64_t blkid, zio_priority_t prio, 2069 arc_flags_t aflags) 2070 { 2071 blkptr_t bp; 2072 int epbs, nlevels, curlevel; 2073 uint64_t curblkid; 2074 2075 ASSERT(blkid != DMU_BONUS_BLKID); 2076 ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); 2077 2078 if (dnode_block_freed(dn, blkid)) 2079 return; 2080 2081 /* 2082 * This dnode hasn't been written to disk yet, so there's nothing to 2083 * prefetch. 
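 * (Such a dnode still has dn_phys->dn_nlevels == 0 and
 * dn_phys->dn_nblkptr == 0, so the checks below simply return.)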
2084 */ 2085 nlevels = dn->dn_phys->dn_nlevels; 2086 if (level >= nlevels || dn->dn_phys->dn_nblkptr == 0) 2087 return; 2088 2089 epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; 2090 if (dn->dn_phys->dn_maxblkid < blkid << (epbs * level)) 2091 return; 2092 2093 dmu_buf_impl_t *db = dbuf_find(dn->dn_objset, dn->dn_object, 2094 level, blkid); 2095 if (db != NULL) { 2096 mutex_exit(&db->db_mtx); 2097 /* 2098 * This dbuf already exists. It is either CACHED, or 2099 * (we assume) about to be read or filled. 2100 */ 2101 return; 2102 } 2103 2104 /* 2105 * Find the closest ancestor (indirect block) of the target block 2106 * that is present in the cache. In this indirect block, we will 2107 * find the bp that is at curlevel, curblkid. 2108 */ 2109 curlevel = level; 2110 curblkid = blkid; 2111 while (curlevel < nlevels - 1) { 2112 int parent_level = curlevel + 1; 2113 uint64_t parent_blkid = curblkid >> epbs; 2114 dmu_buf_impl_t *db; 2115 2116 if (dbuf_hold_impl(dn, parent_level, parent_blkid, 2117 FALSE, TRUE, FTAG, &db) == 0) { 2118 blkptr_t *bpp = db->db_buf->b_data; 2119 bp = bpp[P2PHASE(curblkid, 1 << epbs)]; 2120 dbuf_rele(db, FTAG); 2121 break; 2122 } 2123 2124 curlevel = parent_level; 2125 curblkid = parent_blkid; 2126 } 2127 2128 if (curlevel == nlevels - 1) { 2129 /* No cached indirect blocks found. */ 2130 ASSERT3U(curblkid, <, dn->dn_phys->dn_nblkptr); 2131 bp = dn->dn_phys->dn_blkptr[curblkid]; 2132 } 2133 if (BP_IS_HOLE(&bp)) 2134 return; 2135 2136 ASSERT3U(curlevel, ==, BP_GET_LEVEL(&bp)); 2137 2138 zio_t *pio = zio_root(dmu_objset_spa(dn->dn_objset), NULL, NULL, 2139 ZIO_FLAG_CANFAIL); 2140 2141 dbuf_prefetch_arg_t *dpa = kmem_zalloc(sizeof (*dpa), KM_SLEEP); 2142 dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset; 2143 SET_BOOKMARK(&dpa->dpa_zb, ds != NULL ? ds->ds_object : DMU_META_OBJSET, 2144 dn->dn_object, level, blkid); 2145 dpa->dpa_curlevel = curlevel; 2146 dpa->dpa_prio = prio; 2147 dpa->dpa_aflags = aflags; 2148 dpa->dpa_spa = dn->dn_objset->os_spa; 2149 dpa->dpa_epbs = epbs; 2150 dpa->dpa_zio = pio; 2151 2152 /* 2153 * If we have the indirect just above us, no need to do the asynchronous 2154 * prefetch chain; we'll just run the last step ourselves. If we're at 2155 * a higher level, though, we want to issue the prefetches for all the 2156 * indirect blocks asynchronously, so we can go on with whatever we were 2157 * doing. 2158 */ 2159 if (curlevel == level) { 2160 ASSERT3U(curblkid, ==, blkid); 2161 dbuf_issue_final_prefetch(dpa, &bp); 2162 kmem_free(dpa, sizeof (*dpa)); 2163 } else { 2164 arc_flags_t iter_aflags = ARC_FLAG_NOWAIT; 2165 zbookmark_phys_t zb; 2166 2167 SET_BOOKMARK(&zb, ds != NULL ? ds->ds_object : DMU_META_OBJSET, 2168 dn->dn_object, curlevel, curblkid); 2169 (void) arc_read(dpa->dpa_zio, dpa->dpa_spa, 2170 &bp, dbuf_prefetch_indirect_done, dpa, prio, 2171 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, 2172 &iter_aflags, &zb); 2173 } 2174 /* 2175 * We use pio here instead of dpa_zio since it's possible that 2176 * dpa may have already been freed. 2177 */ 2178 zio_nowait(pio); 2179 } 2180 2181 /* 2182 * Returns with db_holds incremented, and db_mtx not held. 2183 * Note: dn_struct_rwlock must be held. 
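 *
 * A minimal (hypothetical) caller sketch:
 *
 *	dmu_buf_impl_t *db;
 *	int err;
 *
 *	rw_enter(&dn->dn_struct_rwlock, RW_READER);
 *	err = dbuf_hold_impl(dn, 0, blkid, FALSE, FALSE, FTAG, &db);
 *	rw_exit(&dn->dn_struct_rwlock);
 *	if (err == 0) {
 *		... use db->db.db_data ...
 *		dbuf_rele(db, FTAG);
 *	}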
2184 */ 2185 int 2186 dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, 2187 boolean_t fail_sparse, boolean_t fail_uncached, 2188 void *tag, dmu_buf_impl_t **dbp) 2189 { 2190 dmu_buf_impl_t *db, *parent = NULL; 2191 2192 ASSERT(blkid != DMU_BONUS_BLKID); 2193 ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); 2194 ASSERT3U(dn->dn_nlevels, >, level); 2195 2196 *dbp = NULL; 2197 top: 2198 /* dbuf_find() returns with db_mtx held */ 2199 db = dbuf_find(dn->dn_objset, dn->dn_object, level, blkid); 2200 2201 if (db == NULL) { 2202 blkptr_t *bp = NULL; 2203 int err; 2204 2205 if (fail_uncached) 2206 return (SET_ERROR(ENOENT)); 2207 2208 ASSERT3P(parent, ==, NULL); 2209 err = dbuf_findbp(dn, level, blkid, fail_sparse, &parent, &bp); 2210 if (fail_sparse) { 2211 if (err == 0 && bp && BP_IS_HOLE(bp)) 2212 err = SET_ERROR(ENOENT); 2213 if (err) { 2214 if (parent) 2215 dbuf_rele(parent, NULL); 2216 return (err); 2217 } 2218 } 2219 if (err && err != ENOENT) 2220 return (err); 2221 db = dbuf_create(dn, level, blkid, parent, bp); 2222 } 2223 2224 if (fail_uncached && db->db_state != DB_CACHED) { 2225 mutex_exit(&db->db_mtx); 2226 return (SET_ERROR(ENOENT)); 2227 } 2228 2229 if (db->db_buf && refcount_is_zero(&db->db_holds)) { 2230 arc_buf_add_ref(db->db_buf, db); 2231 if (db->db_buf->b_data == NULL) { 2232 dbuf_clear(db); 2233 if (parent) { 2234 dbuf_rele(parent, NULL); 2235 parent = NULL; 2236 } 2237 goto top; 2238 } 2239 ASSERT3P(db->db.db_data, ==, db->db_buf->b_data); 2240 } 2241 2242 ASSERT(db->db_buf == NULL || arc_referenced(db->db_buf)); 2243 2244 /* 2245 * If this buffer is currently syncing out, and we are 2246 * still referencing it from db_data, we need to make a copy 2247 * of it in case we decide we want to dirty it again in this txg. 2248 */ 2249 if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID && 2250 dn->dn_object != DMU_META_DNODE_OBJECT && 2251 db->db_state == DB_CACHED && db->db_data_pending) { 2252 dbuf_dirty_record_t *dr = db->db_data_pending; 2253 2254 if (dr->dt.dl.dr_data == db->db_buf) { 2255 arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); 2256 2257 dbuf_set_data(db, 2258 arc_buf_alloc(dn->dn_objset->os_spa, 2259 db->db.db_size, db, type)); 2260 bcopy(dr->dt.dl.dr_data->b_data, db->db.db_data, 2261 db->db.db_size); 2262 } 2263 } 2264 2265 (void) refcount_add(&db->db_holds, tag); 2266 DBUF_VERIFY(db); 2267 mutex_exit(&db->db_mtx); 2268 2269 /* NOTE: we can't rele the parent until after we drop the db_mtx */ 2270 if (parent) 2271 dbuf_rele(parent, NULL); 2272 2273 ASSERT3P(DB_DNODE(db), ==, dn); 2274 ASSERT3U(db->db_blkid, ==, blkid); 2275 ASSERT3U(db->db_level, ==, level); 2276 *dbp = db; 2277 2278 return (0); 2279 } 2280 2281 dmu_buf_impl_t * 2282 dbuf_hold(dnode_t *dn, uint64_t blkid, void *tag) 2283 { 2284 return (dbuf_hold_level(dn, 0, blkid, tag)); 2285 } 2286 2287 dmu_buf_impl_t * 2288 dbuf_hold_level(dnode_t *dn, int level, uint64_t blkid, void *tag) 2289 { 2290 dmu_buf_impl_t *db; 2291 int err = dbuf_hold_impl(dn, level, blkid, FALSE, FALSE, tag, &db); 2292 return (err ?
NULL : db); 2293 } 2294 2295 void 2296 dbuf_create_bonus(dnode_t *dn) 2297 { 2298 ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock)); 2299 2300 ASSERT(dn->dn_bonus == NULL); 2301 dn->dn_bonus = dbuf_create(dn, 0, DMU_BONUS_BLKID, dn->dn_dbuf, NULL); 2302 } 2303 2304 int 2305 dbuf_spill_set_blksz(dmu_buf_t *db_fake, uint64_t blksz, dmu_tx_t *tx) 2306 { 2307 dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; 2308 dnode_t *dn; 2309 2310 if (db->db_blkid != DMU_SPILL_BLKID) 2311 return (SET_ERROR(ENOTSUP)); 2312 if (blksz == 0) 2313 blksz = SPA_MINBLOCKSIZE; 2314 ASSERT3U(blksz, <=, spa_maxblocksize(dmu_objset_spa(db->db_objset))); 2315 blksz = P2ROUNDUP(blksz, SPA_MINBLOCKSIZE); 2316 2317 DB_DNODE_ENTER(db); 2318 dn = DB_DNODE(db); 2319 rw_enter(&dn->dn_struct_rwlock, RW_WRITER); 2320 dbuf_new_size(db, blksz, tx); 2321 rw_exit(&dn->dn_struct_rwlock); 2322 DB_DNODE_EXIT(db); 2323 2324 return (0); 2325 } 2326 2327 void 2328 dbuf_rm_spill(dnode_t *dn, dmu_tx_t *tx) 2329 { 2330 dbuf_free_range(dn, DMU_SPILL_BLKID, DMU_SPILL_BLKID, tx); 2331 } 2332 2333 #pragma weak dmu_buf_add_ref = dbuf_add_ref 2334 void 2335 dbuf_add_ref(dmu_buf_impl_t *db, void *tag) 2336 { 2337 int64_t holds = refcount_add(&db->db_holds, tag); 2338 ASSERT(holds > 1); 2339 } 2340 2341 #pragma weak dmu_buf_try_add_ref = dbuf_try_add_ref 2342 boolean_t 2343 dbuf_try_add_ref(dmu_buf_t *db_fake, objset_t *os, uint64_t obj, uint64_t blkid, 2344 void *tag) 2345 { 2346 dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; 2347 dmu_buf_impl_t *found_db; 2348 boolean_t result = B_FALSE; 2349 2350 if (db->db_blkid == DMU_BONUS_BLKID) 2351 found_db = dbuf_find_bonus(os, obj); 2352 else 2353 found_db = dbuf_find(os, obj, 0, blkid); 2354 2355 if (found_db != NULL) { 2356 if (db == found_db && dbuf_refcount(db) > db->db_dirtycnt) { 2357 (void) refcount_add(&db->db_holds, tag); 2358 result = B_TRUE; 2359 } 2360 mutex_exit(&db->db_mtx); 2361 } 2362 return (result); 2363 } 2364 2365 /* 2366 * If you call dbuf_rele() you had better not be referencing the dnode handle 2367 * unless you have some other direct or indirect hold on the dnode. (An indirect 2368 * hold is a hold on one of the dnode's dbufs, including the bonus buffer.) 2369 * Without that, the dbuf_rele() could lead to a dnode_rele() followed by the 2370 * dnode's parent dbuf evicting its dnode handles. 2371 */ 2372 void 2373 dbuf_rele(dmu_buf_impl_t *db, void *tag) 2374 { 2375 mutex_enter(&db->db_mtx); 2376 dbuf_rele_and_unlock(db, tag); 2377 } 2378 2379 void 2380 dmu_buf_rele(dmu_buf_t *db, void *tag) 2381 { 2382 dbuf_rele((dmu_buf_impl_t *)db, tag); 2383 } 2384 2385 /* 2386 * dbuf_rele() for an already-locked dbuf. This is necessary to allow 2387 * db_dirtycnt and db_holds to be updated atomically. 2388 */ 2389 void 2390 dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag) 2391 { 2392 int64_t holds; 2393 2394 ASSERT(MUTEX_HELD(&db->db_mtx)); 2395 DBUF_VERIFY(db); 2396 2397 /* 2398 * Remove the reference to the dbuf before removing its hold on the 2399 * dnode so we can guarantee in dnode_move() that a referenced bonus 2400 * buffer has a corresponding dnode hold. 2401 */ 2402 holds = refcount_remove(&db->db_holds, tag); 2403 ASSERT(holds >= 0); 2404 2405 /* 2406 * We can't freeze indirects if there is a possibility that they 2407 * may be modified in the current syncing context. 2408 */ 2409 if (db->db_buf && holds == (db->db_level == 0 ? 
db->db_dirtycnt : 0)) 2410 arc_buf_freeze(db->db_buf); 2411 2412 if (holds == db->db_dirtycnt && 2413 db->db_level == 0 && db->db_user_immediate_evict) 2414 dbuf_evict_user(db); 2415 2416 if (holds == 0) { 2417 if (db->db_blkid == DMU_BONUS_BLKID) { 2418 dnode_t *dn; 2419 boolean_t evict_dbuf = db->db_pending_evict; 2420 2421 /* 2422 * If the dnode moves here, we cannot cross this 2423 * barrier until the move completes. 2424 */ 2425 DB_DNODE_ENTER(db); 2426 2427 dn = DB_DNODE(db); 2428 atomic_dec_32(&dn->dn_dbufs_count); 2429 2430 /* 2431 * Decrementing the dbuf count means that the bonus 2432 * buffer's dnode hold is no longer discounted in 2433 * dnode_move(). The dnode cannot move until after 2434 * the dnode_rele() below. 2435 */ 2436 DB_DNODE_EXIT(db); 2437 2438 /* 2439 * Do not reference db after its lock is dropped. 2440 * Another thread may evict it. 2441 */ 2442 mutex_exit(&db->db_mtx); 2443 2444 if (evict_dbuf) 2445 dnode_evict_bonus(dn); 2446 2447 dnode_rele(dn, db); 2448 } else if (db->db_buf == NULL) { 2449 /* 2450 * This is a special case: we never associated this 2451 * dbuf with any data allocated from the ARC. 2452 */ 2453 ASSERT(db->db_state == DB_UNCACHED || 2454 db->db_state == DB_NOFILL); 2455 dbuf_evict(db); 2456 } else if (arc_released(db->db_buf)) { 2457 arc_buf_t *buf = db->db_buf; 2458 /* 2459 * This dbuf has anonymous data associated with it. 2460 */ 2461 dbuf_clear_data(db); 2462 VERIFY(arc_buf_remove_ref(buf, db)); 2463 dbuf_evict(db); 2464 } else { 2465 VERIFY(!arc_buf_remove_ref(db->db_buf, db)); 2466 2467 /* 2468 * A dbuf will be eligible for eviction if either the 2469 * 'primarycache' property is set or a duplicate 2470 * copy of this buffer is already cached in the arc. 2471 * 2472 * In the case of the 'primarycache' a buffer 2473 * is considered for eviction if it matches the 2474 * criteria set in the property. 2475 * 2476 * To decide if our buffer is considered a 2477 * duplicate, we must call into the arc to determine 2478 * if multiple buffers are referencing the same 2479 * block on-disk. If so, then we simply evict 2480 * ourselves. 
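 *
 * Concretely, the code below clears a non-cacheable dbuf right away
 * (calling arc_freed() as well when its block pointer names a real,
 * non-embedded on-disk block); a cacheable dbuf is only cleared when
 * eviction is pending or the ARC reports that its buffer should be
 * evicted.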
2481 */ 2482 if (!DBUF_IS_CACHEABLE(db)) { 2483 if (db->db_blkptr != NULL && 2484 !BP_IS_HOLE(db->db_blkptr) && 2485 !BP_IS_EMBEDDED(db->db_blkptr)) { 2486 spa_t *spa = 2487 dmu_objset_spa(db->db_objset); 2488 blkptr_t bp = *db->db_blkptr; 2489 dbuf_clear(db); 2490 arc_freed(spa, &bp); 2491 } else { 2492 dbuf_clear(db); 2493 } 2494 } else if (db->db_pending_evict || 2495 arc_buf_eviction_needed(db->db_buf)) { 2496 dbuf_clear(db); 2497 } else { 2498 mutex_exit(&db->db_mtx); 2499 } 2500 } 2501 } else { 2502 mutex_exit(&db->db_mtx); 2503 } 2504 } 2505 2506 #pragma weak dmu_buf_refcount = dbuf_refcount 2507 uint64_t 2508 dbuf_refcount(dmu_buf_impl_t *db) 2509 { 2510 return (refcount_count(&db->db_holds)); 2511 } 2512 2513 void * 2514 dmu_buf_replace_user(dmu_buf_t *db_fake, dmu_buf_user_t *old_user, 2515 dmu_buf_user_t *new_user) 2516 { 2517 dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; 2518 2519 mutex_enter(&db->db_mtx); 2520 dbuf_verify_user(db, DBVU_NOT_EVICTING); 2521 if (db->db_user == old_user) 2522 db->db_user = new_user; 2523 else 2524 old_user = db->db_user; 2525 dbuf_verify_user(db, DBVU_NOT_EVICTING); 2526 mutex_exit(&db->db_mtx); 2527 2528 return (old_user); 2529 } 2530 2531 void * 2532 dmu_buf_set_user(dmu_buf_t *db_fake, dmu_buf_user_t *user) 2533 { 2534 return (dmu_buf_replace_user(db_fake, NULL, user)); 2535 } 2536 2537 void * 2538 dmu_buf_set_user_ie(dmu_buf_t *db_fake, dmu_buf_user_t *user) 2539 { 2540 dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; 2541 2542 db->db_user_immediate_evict = TRUE; 2543 return (dmu_buf_set_user(db_fake, user)); 2544 } 2545 2546 void * 2547 dmu_buf_remove_user(dmu_buf_t *db_fake, dmu_buf_user_t *user) 2548 { 2549 return (dmu_buf_replace_user(db_fake, user, NULL)); 2550 } 2551 2552 void * 2553 dmu_buf_get_user(dmu_buf_t *db_fake) 2554 { 2555 dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; 2556 2557 dbuf_verify_user(db, DBVU_NOT_EVICTING); 2558 return (db->db_user); 2559 } 2560 2561 void 2562 dmu_buf_user_evict_wait() 2563 { 2564 taskq_wait(dbu_evict_taskq); 2565 } 2566 2567 boolean_t 2568 dmu_buf_freeable(dmu_buf_t *dbuf) 2569 { 2570 boolean_t res = B_FALSE; 2571 dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf; 2572 2573 if (db->db_blkptr) 2574 res = dsl_dataset_block_freeable(db->db_objset->os_dsl_dataset, 2575 db->db_blkptr, db->db_blkptr->blk_birth); 2576 2577 return (res); 2578 } 2579 2580 blkptr_t * 2581 dmu_buf_get_blkptr(dmu_buf_t *db) 2582 { 2583 dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db; 2584 return (dbi->db_blkptr); 2585 } 2586 2587 static void 2588 dbuf_check_blkptr(dnode_t *dn, dmu_buf_impl_t *db) 2589 { 2590 /* ASSERT(dmu_tx_is_syncing(tx)) */ 2591 ASSERT(MUTEX_HELD(&db->db_mtx)); 2592 2593 if (db->db_blkptr != NULL) 2594 return; 2595 2596 if (db->db_blkid == DMU_SPILL_BLKID) { 2597 db->db_blkptr = &dn->dn_phys->dn_spill; 2598 BP_ZERO(db->db_blkptr); 2599 return; 2600 } 2601 if (db->db_level == dn->dn_phys->dn_nlevels-1) { 2602 /* 2603 * This buffer was allocated at a time when there were 2604 * no available blkptrs from the dnode, or it was 2605 * inappropriate to hook it in (i.e., nlevels mismatch).
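 * It now sits directly under the dnode, so hook it up below by
 * pointing db_blkptr at the matching entry in dn_phys->dn_blkptr[].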
2606 */ 2607 ASSERT(db->db_blkid < dn->dn_phys->dn_nblkptr); 2608 ASSERT(db->db_parent == NULL); 2609 db->db_parent = dn->dn_dbuf; 2610 db->db_blkptr = &dn->dn_phys->dn_blkptr[db->db_blkid]; 2611 DBUF_VERIFY(db); 2612 } else { 2613 dmu_buf_impl_t *parent = db->db_parent; 2614 int epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; 2615 2616 ASSERT(dn->dn_phys->dn_nlevels > 1); 2617 if (parent == NULL) { 2618 mutex_exit(&db->db_mtx); 2619 rw_enter(&dn->dn_struct_rwlock, RW_READER); 2620 parent = dbuf_hold_level(dn, db->db_level + 1, 2621 db->db_blkid >> epbs, db); 2622 rw_exit(&dn->dn_struct_rwlock); 2623 mutex_enter(&db->db_mtx); 2624 db->db_parent = parent; 2625 } 2626 db->db_blkptr = (blkptr_t *)parent->db.db_data + 2627 (db->db_blkid & ((1ULL << epbs) - 1)); 2628 DBUF_VERIFY(db); 2629 } 2630 } 2631 2632 static void 2633 dbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx) 2634 { 2635 dmu_buf_impl_t *db = dr->dr_dbuf; 2636 dnode_t *dn; 2637 zio_t *zio; 2638 2639 ASSERT(dmu_tx_is_syncing(tx)); 2640 2641 dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr); 2642 2643 mutex_enter(&db->db_mtx); 2644 2645 ASSERT(db->db_level > 0); 2646 DBUF_VERIFY(db); 2647 2648 /* Read the block if it hasn't been read yet. */ 2649 if (db->db_buf == NULL) { 2650 mutex_exit(&db->db_mtx); 2651 (void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED); 2652 mutex_enter(&db->db_mtx); 2653 } 2654 ASSERT3U(db->db_state, ==, DB_CACHED); 2655 ASSERT(db->db_buf != NULL); 2656 2657 DB_DNODE_ENTER(db); 2658 dn = DB_DNODE(db); 2659 /* Indirect block size must match what the dnode thinks it is. */ 2660 ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift); 2661 dbuf_check_blkptr(dn, db); 2662 DB_DNODE_EXIT(db); 2663 2664 /* Provide the pending dirty record to child dbufs */ 2665 db->db_data_pending = dr; 2666 2667 mutex_exit(&db->db_mtx); 2668 dbuf_write(dr, db->db_buf, tx); 2669 2670 zio = dr->dr_zio; 2671 mutex_enter(&dr->dt.di.dr_mtx); 2672 dbuf_sync_list(&dr->dt.di.dr_children, db->db_level - 1, tx); 2673 ASSERT(list_head(&dr->dt.di.dr_children) == NULL); 2674 mutex_exit(&dr->dt.di.dr_mtx); 2675 zio_nowait(zio); 2676 } 2677 2678 static void 2679 dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx) 2680 { 2681 arc_buf_t **datap = &dr->dt.dl.dr_data; 2682 dmu_buf_impl_t *db = dr->dr_dbuf; 2683 dnode_t *dn; 2684 objset_t *os; 2685 uint64_t txg = tx->tx_txg; 2686 2687 ASSERT(dmu_tx_is_syncing(tx)); 2688 2689 dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr); 2690 2691 mutex_enter(&db->db_mtx); 2692 /* 2693 * To be synced, we must be dirtied. But we 2694 * might have been freed after the dirty. 2695 */ 2696 if (db->db_state == DB_UNCACHED) { 2697 /* This buffer has been freed since it was dirtied */ 2698 ASSERT(db->db.db_data == NULL); 2699 } else if (db->db_state == DB_FILL) { 2700 /* This buffer was freed and is now being re-filled */ 2701 ASSERT(db->db.db_data != dr->dt.dl.dr_data); 2702 } else { 2703 ASSERT(db->db_state == DB_CACHED || db->db_state == DB_NOFILL); 2704 } 2705 DBUF_VERIFY(db); 2706 2707 DB_DNODE_ENTER(db); 2708 dn = DB_DNODE(db); 2709 2710 if (db->db_blkid == DMU_SPILL_BLKID) { 2711 mutex_enter(&dn->dn_mtx); 2712 dn->dn_phys->dn_flags |= DNODE_FLAG_SPILL_BLKPTR; 2713 mutex_exit(&dn->dn_mtx); 2714 } 2715 2716 /* 2717 * If this is a bonus buffer, simply copy the bonus data into the 2718 * dnode. It will be written out when the dnode is synced (and it 2719 * will be synced, since it must have been dirty for dbuf_sync to 2720 * be called). 
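 * Once the copy is done we unlink and free the dirty record and
 * drop the hold that was taken on the dbuf for this txg; a bonus
 * dbuf never goes through dbuf_write().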
2721 */ 2722 if (db->db_blkid == DMU_BONUS_BLKID) { 2723 dbuf_dirty_record_t **drp; 2724 2725 ASSERT(*datap != NULL); 2726 ASSERT0(db->db_level); 2727 ASSERT3U(dn->dn_phys->dn_bonuslen, <=, DN_MAX_BONUSLEN); 2728 bcopy(*datap, DN_BONUS(dn->dn_phys), dn->dn_phys->dn_bonuslen); 2729 DB_DNODE_EXIT(db); 2730 2731 if (*datap != db->db.db_data) { 2732 zio_buf_free(*datap, DN_MAX_BONUSLEN); 2733 arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER); 2734 } 2735 db->db_data_pending = NULL; 2736 drp = &db->db_last_dirty; 2737 while (*drp != dr) 2738 drp = &(*drp)->dr_next; 2739 ASSERT(dr->dr_next == NULL); 2740 ASSERT(dr->dr_dbuf == db); 2741 *drp = dr->dr_next; 2742 kmem_free(dr, sizeof (dbuf_dirty_record_t)); 2743 ASSERT(db->db_dirtycnt > 0); 2744 db->db_dirtycnt -= 1; 2745 dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg); 2746 return; 2747 } 2748 2749 os = dn->dn_objset; 2750 2751 /* 2752 * This function may have dropped the db_mtx lock allowing a dmu_sync 2753 * operation to sneak in. As a result, we need to ensure that we 2754 * don't check the dr_override_state until we have returned from 2755 * dbuf_check_blkptr. 2756 */ 2757 dbuf_check_blkptr(dn, db); 2758 2759 /* 2760 * If this buffer is in the middle of an immediate write, 2761 * wait for the synchronous IO to complete. 2762 */ 2763 while (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC) { 2764 ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT); 2765 cv_wait(&db->db_changed, &db->db_mtx); 2766 ASSERT(dr->dt.dl.dr_override_state != DR_NOT_OVERRIDDEN); 2767 } 2768 2769 if (db->db_state != DB_NOFILL && 2770 dn->dn_object != DMU_META_DNODE_OBJECT && 2771 refcount_count(&db->db_holds) > 1 && 2772 dr->dt.dl.dr_override_state != DR_OVERRIDDEN && 2773 *datap == db->db_buf) { 2774 /* 2775 * If this buffer is currently "in use" (i.e., there 2776 * are active holds and db_data still references it), 2777 * then make a copy before we start the write so that 2778 * any modifications from the open txg will not leak 2779 * into this write. 2780 * 2781 * NOTE: this copy does not need to be made for 2782 * objects only modified in the syncing context (e.g. 2783 * DMU_OT_DNODE blocks). 2784 */ 2785 int blksz = arc_buf_size(*datap); 2786 arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); 2787 *datap = arc_buf_alloc(os->os_spa, blksz, db, type); 2788 bcopy(db->db.db_data, (*datap)->b_data, blksz); 2789 } 2790 db->db_data_pending = dr; 2791 2792 mutex_exit(&db->db_mtx); 2793 2794 dbuf_write(dr, *datap, tx); 2795 2796 ASSERT(!list_link_active(&dr->dr_dirty_node)); 2797 if (dn->dn_object == DMU_META_DNODE_OBJECT) { 2798 list_insert_tail(&dn->dn_dirty_records[txg&TXG_MASK], dr); 2799 DB_DNODE_EXIT(db); 2800 } else { 2801 /* 2802 * Although zio_nowait() does not "wait for an IO", it does 2803 * initiate the IO. If this is an empty write it seems plausible 2804 * that the IO could actually be completed before the nowait 2805 * returns. We need to DB_DNODE_EXIT() first in case 2806 * zio_nowait() invalidates the dbuf. 2807 */ 2808 DB_DNODE_EXIT(db); 2809 zio_nowait(dr->dr_zio); 2810 } 2811 } 2812 2813 void 2814 dbuf_sync_list(list_t *list, int level, dmu_tx_t *tx) 2815 { 2816 dbuf_dirty_record_t *dr; 2817 2818 while (dr = list_head(list)) { 2819 if (dr->dr_zio != NULL) { 2820 /* 2821 * If we find an already initialized zio then we 2822 * are processing the meta-dnode, and we have finished. 2823 * The dbufs for all dnodes are put back on the list 2824 * during processing, so that we can zio_wait() 2825 * these IOs after initiating all child IOs.
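 * (Only meta-dnode dbufs are put back on the list with dr_zio already
 * set; see the DMU_META_DNODE_OBJECT case at the end of
 * dbuf_sync_leaf().)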
2826 */ 2827 ASSERT3U(dr->dr_dbuf->db.db_object, ==, 2828 DMU_META_DNODE_OBJECT); 2829 break; 2830 } 2831 if (dr->dr_dbuf->db_blkid != DMU_BONUS_BLKID && 2832 dr->dr_dbuf->db_blkid != DMU_SPILL_BLKID) { 2833 VERIFY3U(dr->dr_dbuf->db_level, ==, level); 2834 } 2835 list_remove(list, dr); 2836 if (dr->dr_dbuf->db_level > 0) 2837 dbuf_sync_indirect(dr, tx); 2838 else 2839 dbuf_sync_leaf(dr, tx); 2840 } 2841 } 2842 2843 /* ARGSUSED */ 2844 static void 2845 dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb) 2846 { 2847 dmu_buf_impl_t *db = vdb; 2848 dnode_t *dn; 2849 blkptr_t *bp = zio->io_bp; 2850 blkptr_t *bp_orig = &zio->io_bp_orig; 2851 spa_t *spa = zio->io_spa; 2852 int64_t delta; 2853 uint64_t fill = 0; 2854 int i; 2855 2856 ASSERT3P(db->db_blkptr, ==, bp); 2857 2858 DB_DNODE_ENTER(db); 2859 dn = DB_DNODE(db); 2860 delta = bp_get_dsize_sync(spa, bp) - bp_get_dsize_sync(spa, bp_orig); 2861 dnode_diduse_space(dn, delta - zio->io_prev_space_delta); 2862 zio->io_prev_space_delta = delta; 2863 2864 if (bp->blk_birth != 0) { 2865 ASSERT((db->db_blkid != DMU_SPILL_BLKID && 2866 BP_GET_TYPE(bp) == dn->dn_type) || 2867 (db->db_blkid == DMU_SPILL_BLKID && 2868 BP_GET_TYPE(bp) == dn->dn_bonustype) || 2869 BP_IS_EMBEDDED(bp)); 2870 ASSERT(BP_GET_LEVEL(bp) == db->db_level); 2871 } 2872 2873 mutex_enter(&db->db_mtx); 2874 2875 #ifdef ZFS_DEBUG 2876 if (db->db_blkid == DMU_SPILL_BLKID) { 2877 ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR); 2878 ASSERT(!(BP_IS_HOLE(db->db_blkptr)) && 2879 db->db_blkptr == &dn->dn_phys->dn_spill); 2880 } 2881 #endif 2882 2883 if (db->db_level == 0) { 2884 mutex_enter(&dn->dn_mtx); 2885 if (db->db_blkid > dn->dn_phys->dn_maxblkid && 2886 db->db_blkid != DMU_SPILL_BLKID) 2887 dn->dn_phys->dn_maxblkid = db->db_blkid; 2888 mutex_exit(&dn->dn_mtx); 2889 2890 if (dn->dn_type == DMU_OT_DNODE) { 2891 dnode_phys_t *dnp = db->db.db_data; 2892 for (i = db->db.db_size >> DNODE_SHIFT; i > 0; 2893 i--, dnp++) { 2894 if (dnp->dn_type != DMU_OT_NONE) 2895 fill++; 2896 } 2897 } else { 2898 if (BP_IS_HOLE(bp)) { 2899 fill = 0; 2900 } else { 2901 fill = 1; 2902 } 2903 } 2904 } else { 2905 blkptr_t *ibp = db->db.db_data; 2906 ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift); 2907 for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, ibp++) { 2908 if (BP_IS_HOLE(ibp)) 2909 continue; 2910 fill += BP_GET_FILL(ibp); 2911 } 2912 } 2913 DB_DNODE_EXIT(db); 2914 2915 if (!BP_IS_EMBEDDED(bp)) 2916 bp->blk_fill = fill; 2917 2918 mutex_exit(&db->db_mtx); 2919 } 2920 2921 /* 2922 * The SPA will call this callback several times for each zio - once 2923 * for every physical child i/o (zio->io_phys_children times). This 2924 * allows the DMU to monitor the progress of each logical i/o. For example, 2925 * there may be 2 copies of an indirect block, or many fragments of a RAID-Z 2926 * block. There may be a long delay before all copies/fragments are completed, 2927 * so this callback allows us to retire dirty space gradually, as the physical 2928 * i/os complete. 2929 */ 2930 /* ARGSUSED */ 2931 static void 2932 dbuf_write_physdone(zio_t *zio, arc_buf_t *buf, void *arg) 2933 { 2934 dmu_buf_impl_t *db = arg; 2935 objset_t *os = db->db_objset; 2936 dsl_pool_t *dp = dmu_objset_pool(os); 2937 dbuf_dirty_record_t *dr; 2938 int delta = 0; 2939 2940 dr = db->db_data_pending; 2941 ASSERT3U(dr->dr_txg, ==, zio->io_txg); 2942 2943 /* 2944 * The callback will be called io_phys_children times. Retire one 2945 * portion of our dirty space each time we are called. 
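 * For example, if dr_accounted is 128K and the zio has two physical
 * children, each call retires 64K of dirty space.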
Any rounding 2946 * error will be cleaned up by dsl_pool_sync()'s call to 2947 * dsl_pool_undirty_space(). 2948 */ 2949 delta = dr->dr_accounted / zio->io_phys_children; 2950 dsl_pool_undirty_space(dp, delta, zio->io_txg); 2951 } 2952 2953 /* ARGSUSED */ 2954 static void 2955 dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb) 2956 { 2957 dmu_buf_impl_t *db = vdb; 2958 blkptr_t *bp_orig = &zio->io_bp_orig; 2959 blkptr_t *bp = db->db_blkptr; 2960 objset_t *os = db->db_objset; 2961 dmu_tx_t *tx = os->os_synctx; 2962 dbuf_dirty_record_t **drp, *dr; 2963 2964 ASSERT0(zio->io_error); 2965 ASSERT(db->db_blkptr == bp); 2966 2967 /* 2968 * For nopwrites and rewrites we ensure that the bp matches our 2969 * original and bypass all the accounting. 2970 */ 2971 if (zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE)) { 2972 ASSERT(BP_EQUAL(bp, bp_orig)); 2973 } else { 2974 dsl_dataset_t *ds = os->os_dsl_dataset; 2975 (void) dsl_dataset_block_kill(ds, bp_orig, tx, B_TRUE); 2976 dsl_dataset_block_born(ds, bp, tx); 2977 } 2978 2979 mutex_enter(&db->db_mtx); 2980 2981 DBUF_VERIFY(db); 2982 2983 drp = &db->db_last_dirty; 2984 while ((dr = *drp) != db->db_data_pending) 2985 drp = &dr->dr_next; 2986 ASSERT(!list_link_active(&dr->dr_dirty_node)); 2987 ASSERT(dr->dr_dbuf == db); 2988 ASSERT(dr->dr_next == NULL); 2989 *drp = dr->dr_next; 2990 2991 #ifdef ZFS_DEBUG 2992 if (db->db_blkid == DMU_SPILL_BLKID) { 2993 dnode_t *dn; 2994 2995 DB_DNODE_ENTER(db); 2996 dn = DB_DNODE(db); 2997 ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR); 2998 ASSERT(!(BP_IS_HOLE(db->db_blkptr)) && 2999 db->db_blkptr == &dn->dn_phys->dn_spill); 3000 DB_DNODE_EXIT(db); 3001 } 3002 #endif 3003 3004 if (db->db_level == 0) { 3005 ASSERT(db->db_blkid != DMU_BONUS_BLKID); 3006 ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN); 3007 if (db->db_state != DB_NOFILL) { 3008 if (dr->dt.dl.dr_data != db->db_buf) 3009 VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data, 3010 db)); 3011 else if (!arc_released(db->db_buf)) 3012 arc_set_callback(db->db_buf, dbuf_do_evict, db); 3013 } 3014 } else { 3015 dnode_t *dn; 3016 3017 DB_DNODE_ENTER(db); 3018 dn = DB_DNODE(db); 3019 ASSERT(list_head(&dr->dt.di.dr_children) == NULL); 3020 ASSERT3U(db->db.db_size, ==, 1 << dn->dn_phys->dn_indblkshift); 3021 if (!BP_IS_HOLE(db->db_blkptr)) { 3022 int epbs = 3023 dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; 3024 ASSERT3U(db->db_blkid, <=, 3025 dn->dn_phys->dn_maxblkid >> (db->db_level * epbs)); 3026 ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==, 3027 db->db.db_size); 3028 if (!arc_released(db->db_buf)) 3029 arc_set_callback(db->db_buf, dbuf_do_evict, db); 3030 } 3031 DB_DNODE_EXIT(db); 3032 mutex_destroy(&dr->dt.di.dr_mtx); 3033 list_destroy(&dr->dt.di.dr_children); 3034 } 3035 kmem_free(dr, sizeof (dbuf_dirty_record_t)); 3036 3037 cv_broadcast(&db->db_changed); 3038 ASSERT(db->db_dirtycnt > 0); 3039 db->db_dirtycnt -= 1; 3040 db->db_data_pending = NULL; 3041 dbuf_rele_and_unlock(db, (void *)(uintptr_t)tx->tx_txg); 3042 } 3043 3044 static void 3045 dbuf_write_nofill_ready(zio_t *zio) 3046 { 3047 dbuf_write_ready(zio, NULL, zio->io_private); 3048 } 3049 3050 static void 3051 dbuf_write_nofill_done(zio_t *zio) 3052 { 3053 dbuf_write_done(zio, NULL, zio->io_private); 3054 } 3055 3056 static void 3057 dbuf_write_override_ready(zio_t *zio) 3058 { 3059 dbuf_dirty_record_t *dr = zio->io_private; 3060 dmu_buf_impl_t *db = dr->dr_dbuf; 3061 3062 dbuf_write_ready(zio, NULL, db); 3063 } 3064 3065 static void 3066 dbuf_write_override_done(zio_t *zio) 3067 { 3068 
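	/*
	 * The override write has completed.  If the BP actually written
	 * differs from the BP supplied in open context (dr_overridden_by),
	 * free that earlier block (unless it is a hole) and release the
	 * ARC buffer before the normal write-done processing.
	 */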
dbuf_dirty_record_t *dr = zio->io_private; 3069 dmu_buf_impl_t *db = dr->dr_dbuf; 3070 blkptr_t *obp = &dr->dt.dl.dr_overridden_by; 3071 3072 mutex_enter(&db->db_mtx); 3073 if (!BP_EQUAL(zio->io_bp, obp)) { 3074 if (!BP_IS_HOLE(obp)) 3075 dsl_free(spa_get_dsl(zio->io_spa), zio->io_txg, obp); 3076 arc_release(dr->dt.dl.dr_data, db); 3077 } 3078 mutex_exit(&db->db_mtx); 3079 3080 dbuf_write_done(zio, NULL, db); 3081 } 3082 3083 /* Issue I/O to commit a dirty buffer to disk. */ 3084 static void 3085 dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx) 3086 { 3087 dmu_buf_impl_t *db = dr->dr_dbuf; 3088 dnode_t *dn; 3089 objset_t *os; 3090 dmu_buf_impl_t *parent = db->db_parent; 3091 uint64_t txg = tx->tx_txg; 3092 zbookmark_phys_t zb; 3093 zio_prop_t zp; 3094 zio_t *zio; 3095 int wp_flag = 0; 3096 3097 DB_DNODE_ENTER(db); 3098 dn = DB_DNODE(db); 3099 os = dn->dn_objset; 3100 3101 if (db->db_state != DB_NOFILL) { 3102 if (db->db_level > 0 || dn->dn_type == DMU_OT_DNODE) { 3103 /* 3104 * Private object buffers are released here rather 3105 * than in dbuf_dirty() since they are only modified 3106 * in the syncing context and we don't want the 3107 * overhead of making multiple copies of the data. 3108 */ 3109 if (BP_IS_HOLE(db->db_blkptr)) { 3110 arc_buf_thaw(data); 3111 } else { 3112 dbuf_release_bp(db); 3113 } 3114 } 3115 } 3116 3117 if (parent != dn->dn_dbuf) { 3118 /* Our parent is an indirect block. */ 3119 /* We have a dirty parent that has been scheduled for write. */ 3120 ASSERT(parent && parent->db_data_pending); 3121 /* Our parent's buffer is one level closer to the dnode. */ 3122 ASSERT(db->db_level == parent->db_level-1); 3123 /* 3124 * We're about to modify our parent's db_data by modifying 3125 * our block pointer, so the parent must be released. 3126 */ 3127 ASSERT(arc_released(parent->db_buf)); 3128 zio = parent->db_data_pending->dr_zio; 3129 } else { 3130 /* Our parent is the dnode itself. */ 3131 ASSERT((db->db_level == dn->dn_phys->dn_nlevels-1 && 3132 db->db_blkid != DMU_SPILL_BLKID) || 3133 (db->db_blkid == DMU_SPILL_BLKID && db->db_level == 0)); 3134 if (db->db_blkid != DMU_SPILL_BLKID) 3135 ASSERT3P(db->db_blkptr, ==, 3136 &dn->dn_phys->dn_blkptr[db->db_blkid]); 3137 zio = dn->dn_zio; 3138 } 3139 3140 ASSERT(db->db_level == 0 || data == db->db_buf); 3141 ASSERT3U(db->db_blkptr->blk_birth, <=, txg); 3142 ASSERT(zio); 3143 3144 SET_BOOKMARK(&zb, os->os_dsl_dataset ? 3145 os->os_dsl_dataset->ds_object : DMU_META_OBJSET, 3146 db->db.db_object, db->db_level, db->db_blkid); 3147 3148 if (db->db_blkid == DMU_SPILL_BLKID) 3149 wp_flag = WP_SPILL; 3150 wp_flag |= (db->db_state == DB_NOFILL) ? WP_NOFILL : 0; 3151 3152 dmu_write_policy(os, dn, db->db_level, wp_flag, &zp); 3153 DB_DNODE_EXIT(db); 3154 3155 if (db->db_level == 0 && 3156 dr->dt.dl.dr_override_state == DR_OVERRIDDEN) { 3157 /* 3158 * The BP for this block has been provided by open context 3159 * (by dmu_sync() or dmu_buf_write_embedded()). 3160 */ 3161 void *contents = (data != NULL) ? 
data->b_data : NULL; 3162 3163 dr->dr_zio = zio_write(zio, os->os_spa, txg, 3164 db->db_blkptr, contents, db->db.db_size, &zp, 3165 dbuf_write_override_ready, NULL, dbuf_write_override_done, 3166 dr, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb); 3167 mutex_enter(&db->db_mtx); 3168 dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN; 3169 zio_write_override(dr->dr_zio, &dr->dt.dl.dr_overridden_by, 3170 dr->dt.dl.dr_copies, dr->dt.dl.dr_nopwrite); 3171 mutex_exit(&db->db_mtx); 3172 } else if (db->db_state == DB_NOFILL) { 3173 ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF || 3174 zp.zp_checksum == ZIO_CHECKSUM_NOPARITY); 3175 dr->dr_zio = zio_write(zio, os->os_spa, txg, 3176 db->db_blkptr, NULL, db->db.db_size, &zp, 3177 dbuf_write_nofill_ready, NULL, dbuf_write_nofill_done, db, 3178 ZIO_PRIORITY_ASYNC_WRITE, 3179 ZIO_FLAG_MUSTSUCCEED | ZIO_FLAG_NODATA, &zb); 3180 } else { 3181 ASSERT(arc_released(data)); 3182 dr->dr_zio = arc_write(zio, os->os_spa, txg, 3183 db->db_blkptr, data, DBUF_IS_L2CACHEABLE(db), 3184 DBUF_IS_L2COMPRESSIBLE(db), &zp, dbuf_write_ready, 3185 dbuf_write_physdone, dbuf_write_done, db, 3186 ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb); 3187 } 3188 } 3189