1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 23 * Copyright 2011 Nexenta Systems, Inc. All rights reserved. 24 * Copyright (c) 2012, 2014 by Delphix. All rights reserved. 25 * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. 26 * Copyright (c) 2013, Joyent, Inc. All rights reserved. 27 */ 28 29 #include <sys/zfs_context.h> 30 #include <sys/dmu.h> 31 #include <sys/dmu_send.h> 32 #include <sys/dmu_impl.h> 33 #include <sys/dbuf.h> 34 #include <sys/dmu_objset.h> 35 #include <sys/dsl_dataset.h> 36 #include <sys/dsl_dir.h> 37 #include <sys/dmu_tx.h> 38 #include <sys/spa.h> 39 #include <sys/zio.h> 40 #include <sys/dmu_zfetch.h> 41 #include <sys/sa.h> 42 #include <sys/sa_impl.h> 43 #include <sys/range_tree.h> 44 45 /* 46 * Number of times that zfs_free_range() took the slow path while doing 47 * a zfs receive. A nonzero value indicates a potential performance problem. 48 */ 49 uint64_t zfs_free_range_recv_miss; 50 51 static void dbuf_destroy(dmu_buf_impl_t *db); 52 static boolean_t dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx); 53 static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx); 54 55 /* 56 * Global data structures and functions for the dbuf cache. 
57 */ 58 static kmem_cache_t *dbuf_cache; 59 60 /* ARGSUSED */ 61 static int 62 dbuf_cons(void *vdb, void *unused, int kmflag) 63 { 64 dmu_buf_impl_t *db = vdb; 65 bzero(db, sizeof (dmu_buf_impl_t)); 66 67 mutex_init(&db->db_mtx, NULL, MUTEX_DEFAULT, NULL); 68 cv_init(&db->db_changed, NULL, CV_DEFAULT, NULL); 69 refcount_create(&db->db_holds); 70 return (0); 71 } 72 73 /* ARGSUSED */ 74 static void 75 dbuf_dest(void *vdb, void *unused) 76 { 77 dmu_buf_impl_t *db = vdb; 78 mutex_destroy(&db->db_mtx); 79 cv_destroy(&db->db_changed); 80 refcount_destroy(&db->db_holds); 81 } 82 83 /* 84 * dbuf hash table routines 85 */ 86 static dbuf_hash_table_t dbuf_hash_table; 87 88 static uint64_t dbuf_hash_count; 89 90 static uint64_t 91 dbuf_hash(void *os, uint64_t obj, uint8_t lvl, uint64_t blkid) 92 { 93 uintptr_t osv = (uintptr_t)os; 94 uint64_t crc = -1ULL; 95 96 ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY); 97 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (lvl)) & 0xFF]; 98 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (osv >> 6)) & 0xFF]; 99 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 0)) & 0xFF]; 100 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 8)) & 0xFF]; 101 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 0)) & 0xFF]; 102 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 8)) & 0xFF]; 103 104 crc ^= (osv>>14) ^ (obj>>16) ^ (blkid>>16); 105 106 return (crc); 107 } 108 109 #define DBUF_HASH(os, obj, level, blkid) dbuf_hash(os, obj, level, blkid); 110 111 #define DBUF_EQUAL(dbuf, os, obj, level, blkid) \ 112 ((dbuf)->db.db_object == (obj) && \ 113 (dbuf)->db_objset == (os) && \ 114 (dbuf)->db_level == (level) && \ 115 (dbuf)->db_blkid == (blkid)) 116 117 dmu_buf_impl_t * 118 dbuf_find(dnode_t *dn, uint8_t level, uint64_t blkid) 119 { 120 dbuf_hash_table_t *h = &dbuf_hash_table; 121 objset_t *os = dn->dn_objset; 122 uint64_t obj = dn->dn_object; 123 uint64_t hv = DBUF_HASH(os, obj, level, blkid); 124 uint64_t idx = hv & h->hash_table_mask; 125 dmu_buf_impl_t *db; 126 127 mutex_enter(DBUF_HASH_MUTEX(h, idx)); 128 for (db = h->hash_table[idx]; db != NULL; db = db->db_hash_next) { 129 if (DBUF_EQUAL(db, os, obj, level, blkid)) { 130 mutex_enter(&db->db_mtx); 131 if (db->db_state != DB_EVICTING) { 132 mutex_exit(DBUF_HASH_MUTEX(h, idx)); 133 return (db); 134 } 135 mutex_exit(&db->db_mtx); 136 } 137 } 138 mutex_exit(DBUF_HASH_MUTEX(h, idx)); 139 return (NULL); 140 } 141 142 /* 143 * Insert an entry into the hash table. If there is already an element 144 * equal to elem in the hash table, then the already existing element 145 * will be returned and the new element will not be inserted. 146 * Otherwise returns NULL. 
 */
static dmu_buf_impl_t *
dbuf_hash_insert(dmu_buf_impl_t *db)
{
    dbuf_hash_table_t *h = &dbuf_hash_table;
    objset_t *os = db->db_objset;
    uint64_t obj = db->db.db_object;
    int level = db->db_level;
    uint64_t blkid = db->db_blkid;
    uint64_t hv = DBUF_HASH(os, obj, level, blkid);
    uint64_t idx = hv & h->hash_table_mask;
    dmu_buf_impl_t *dbf;

    mutex_enter(DBUF_HASH_MUTEX(h, idx));
    for (dbf = h->hash_table[idx]; dbf != NULL; dbf = dbf->db_hash_next) {
        if (DBUF_EQUAL(dbf, os, obj, level, blkid)) {
            mutex_enter(&dbf->db_mtx);
            if (dbf->db_state != DB_EVICTING) {
                mutex_exit(DBUF_HASH_MUTEX(h, idx));
                return (dbf);
            }
            mutex_exit(&dbf->db_mtx);
        }
    }

    mutex_enter(&db->db_mtx);
    db->db_hash_next = h->hash_table[idx];
    h->hash_table[idx] = db;
    mutex_exit(DBUF_HASH_MUTEX(h, idx));
    atomic_add_64(&dbuf_hash_count, 1);

    return (NULL);
}

/*
 * Remove an entry from the hash table. This operation will
 * fail if there are any existing holds on the db.
 */
static void
dbuf_hash_remove(dmu_buf_impl_t *db)
{
    dbuf_hash_table_t *h = &dbuf_hash_table;
    uint64_t hv = DBUF_HASH(db->db_objset, db->db.db_object,
        db->db_level, db->db_blkid);
    uint64_t idx = hv & h->hash_table_mask;
    dmu_buf_impl_t *dbf, **dbp;

    /*
     * We mustn't hold db_mtx to maintain lock ordering:
     * DBUF_HASH_MUTEX > db_mtx.
     */
    ASSERT(refcount_is_zero(&db->db_holds));
    ASSERT(db->db_state == DB_EVICTING);
    ASSERT(!MUTEX_HELD(&db->db_mtx));

    mutex_enter(DBUF_HASH_MUTEX(h, idx));
    dbp = &h->hash_table[idx];
    while ((dbf = *dbp) != db) {
        dbp = &dbf->db_hash_next;
        ASSERT(dbf != NULL);
    }
    *dbp = db->db_hash_next;
    db->db_hash_next = NULL;
    mutex_exit(DBUF_HASH_MUTEX(h, idx));
    atomic_add_64(&dbuf_hash_count, -1);
}

static arc_evict_func_t dbuf_do_evict;

static void
dbuf_evict_user(dmu_buf_impl_t *db)
{
    ASSERT(MUTEX_HELD(&db->db_mtx));

    if (db->db_level != 0 || db->db_evict_func == NULL)
        return;

    if (db->db_user_data_ptr_ptr)
        *db->db_user_data_ptr_ptr = db->db.db_data;
    db->db_evict_func(&db->db, db->db_user_ptr);
    db->db_user_ptr = NULL;
    db->db_user_data_ptr_ptr = NULL;
    db->db_evict_func = NULL;
}

boolean_t
dbuf_is_metadata(dmu_buf_impl_t *db)
{
    if (db->db_level > 0) {
        return (B_TRUE);
    } else {
        boolean_t is_metadata;

        DB_DNODE_ENTER(db);
        is_metadata = DMU_OT_IS_METADATA(DB_DNODE(db)->dn_type);
        DB_DNODE_EXIT(db);

        return (is_metadata);
    }
}

void
dbuf_evict(dmu_buf_impl_t *db)
{
    ASSERT(MUTEX_HELD(&db->db_mtx));
    ASSERT(db->db_buf == NULL);
    ASSERT(db->db_data_pending == NULL);

    dbuf_clear(db);
    dbuf_destroy(db);
}

void
dbuf_init(void)
{
    uint64_t hsize = 1ULL << 16;
    dbuf_hash_table_t *h = &dbuf_hash_table;
    int i;

    /*
     * The hash table is big enough to fill all of physical memory
     * with an average 4K block size. The table will take up
     * totalmem*sizeof(void*)/4K (i.e. 2MB/GB with 8-byte pointers).
270 */ 271 while (hsize * 4096 < physmem * PAGESIZE) 272 hsize <<= 1; 273 274 retry: 275 h->hash_table_mask = hsize - 1; 276 h->hash_table = kmem_zalloc(hsize * sizeof (void *), KM_NOSLEEP); 277 if (h->hash_table == NULL) { 278 /* XXX - we should really return an error instead of assert */ 279 ASSERT(hsize > (1ULL << 10)); 280 hsize >>= 1; 281 goto retry; 282 } 283 284 dbuf_cache = kmem_cache_create("dmu_buf_impl_t", 285 sizeof (dmu_buf_impl_t), 286 0, dbuf_cons, dbuf_dest, NULL, NULL, NULL, 0); 287 288 for (i = 0; i < DBUF_MUTEXES; i++) 289 mutex_init(&h->hash_mutexes[i], NULL, MUTEX_DEFAULT, NULL); 290 } 291 292 void 293 dbuf_fini(void) 294 { 295 dbuf_hash_table_t *h = &dbuf_hash_table; 296 int i; 297 298 for (i = 0; i < DBUF_MUTEXES; i++) 299 mutex_destroy(&h->hash_mutexes[i]); 300 kmem_free(h->hash_table, (h->hash_table_mask + 1) * sizeof (void *)); 301 kmem_cache_destroy(dbuf_cache); 302 } 303 304 /* 305 * Other stuff. 306 */ 307 308 #ifdef ZFS_DEBUG 309 static void 310 dbuf_verify(dmu_buf_impl_t *db) 311 { 312 dnode_t *dn; 313 dbuf_dirty_record_t *dr; 314 315 ASSERT(MUTEX_HELD(&db->db_mtx)); 316 317 if (!(zfs_flags & ZFS_DEBUG_DBUF_VERIFY)) 318 return; 319 320 ASSERT(db->db_objset != NULL); 321 DB_DNODE_ENTER(db); 322 dn = DB_DNODE(db); 323 if (dn == NULL) { 324 ASSERT(db->db_parent == NULL); 325 ASSERT(db->db_blkptr == NULL); 326 } else { 327 ASSERT3U(db->db.db_object, ==, dn->dn_object); 328 ASSERT3P(db->db_objset, ==, dn->dn_objset); 329 ASSERT3U(db->db_level, <, dn->dn_nlevels); 330 ASSERT(db->db_blkid == DMU_BONUS_BLKID || 331 db->db_blkid == DMU_SPILL_BLKID || 332 !list_is_empty(&dn->dn_dbufs)); 333 } 334 if (db->db_blkid == DMU_BONUS_BLKID) { 335 ASSERT(dn != NULL); 336 ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen); 337 ASSERT3U(db->db.db_offset, ==, DMU_BONUS_BLKID); 338 } else if (db->db_blkid == DMU_SPILL_BLKID) { 339 ASSERT(dn != NULL); 340 ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen); 341 ASSERT0(db->db.db_offset); 342 } else { 343 ASSERT3U(db->db.db_offset, ==, db->db_blkid * db->db.db_size); 344 } 345 346 for (dr = db->db_data_pending; dr != NULL; dr = dr->dr_next) 347 ASSERT(dr->dr_dbuf == db); 348 349 for (dr = db->db_last_dirty; dr != NULL; dr = dr->dr_next) 350 ASSERT(dr->dr_dbuf == db); 351 352 /* 353 * We can't assert that db_size matches dn_datablksz because it 354 * can be momentarily different when another thread is doing 355 * dnode_set_blksz(). 356 */ 357 if (db->db_level == 0 && db->db.db_object == DMU_META_DNODE_OBJECT) { 358 dr = db->db_data_pending; 359 /* 360 * It should only be modified in syncing context, so 361 * make sure we only have one copy of the data. 362 */ 363 ASSERT(dr == NULL || dr->dt.dl.dr_data == db->db_buf); 364 } 365 366 /* verify db->db_blkptr */ 367 if (db->db_blkptr) { 368 if (db->db_parent == dn->dn_dbuf) { 369 /* db is pointed to by the dnode */ 370 /* ASSERT3U(db->db_blkid, <, dn->dn_nblkptr); */ 371 if (DMU_OBJECT_IS_SPECIAL(db->db.db_object)) 372 ASSERT(db->db_parent == NULL); 373 else 374 ASSERT(db->db_parent != NULL); 375 if (db->db_blkid != DMU_SPILL_BLKID) 376 ASSERT3P(db->db_blkptr, ==, 377 &dn->dn_phys->dn_blkptr[db->db_blkid]); 378 } else { 379 /* db is pointed to by an indirect block */ 380 int epb = db->db_parent->db.db_size >> SPA_BLKPTRSHIFT; 381 ASSERT3U(db->db_parent->db_level, ==, db->db_level+1); 382 ASSERT3U(db->db_parent->db.db_object, ==, 383 db->db.db_object); 384 /* 385 * dnode_grow_indblksz() can make this fail if we don't 386 * have the struct_rwlock. XXX indblksz no longer 387 * grows. 
safe to do this now? 388 */ 389 if (RW_WRITE_HELD(&dn->dn_struct_rwlock)) { 390 ASSERT3P(db->db_blkptr, ==, 391 ((blkptr_t *)db->db_parent->db.db_data + 392 db->db_blkid % epb)); 393 } 394 } 395 } 396 if ((db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr)) && 397 (db->db_buf == NULL || db->db_buf->b_data) && 398 db->db.db_data && db->db_blkid != DMU_BONUS_BLKID && 399 db->db_state != DB_FILL && !dn->dn_free_txg) { 400 /* 401 * If the blkptr isn't set but they have nonzero data, 402 * it had better be dirty, otherwise we'll lose that 403 * data when we evict this buffer. 404 */ 405 if (db->db_dirtycnt == 0) { 406 uint64_t *buf = db->db.db_data; 407 int i; 408 409 for (i = 0; i < db->db.db_size >> 3; i++) { 410 ASSERT(buf[i] == 0); 411 } 412 } 413 } 414 DB_DNODE_EXIT(db); 415 } 416 #endif 417 418 static void 419 dbuf_update_data(dmu_buf_impl_t *db) 420 { 421 ASSERT(MUTEX_HELD(&db->db_mtx)); 422 if (db->db_level == 0 && db->db_user_data_ptr_ptr) { 423 ASSERT(!refcount_is_zero(&db->db_holds)); 424 *db->db_user_data_ptr_ptr = db->db.db_data; 425 } 426 } 427 428 static void 429 dbuf_set_data(dmu_buf_impl_t *db, arc_buf_t *buf) 430 { 431 ASSERT(MUTEX_HELD(&db->db_mtx)); 432 ASSERT(db->db_buf == NULL || !arc_has_callback(db->db_buf)); 433 db->db_buf = buf; 434 if (buf != NULL) { 435 ASSERT(buf->b_data != NULL); 436 db->db.db_data = buf->b_data; 437 if (!arc_released(buf)) 438 arc_set_callback(buf, dbuf_do_evict, db); 439 dbuf_update_data(db); 440 } else { 441 dbuf_evict_user(db); 442 db->db.db_data = NULL; 443 if (db->db_state != DB_NOFILL) 444 db->db_state = DB_UNCACHED; 445 } 446 } 447 448 /* 449 * Loan out an arc_buf for read. Return the loaned arc_buf. 450 */ 451 arc_buf_t * 452 dbuf_loan_arcbuf(dmu_buf_impl_t *db) 453 { 454 arc_buf_t *abuf; 455 456 mutex_enter(&db->db_mtx); 457 if (arc_released(db->db_buf) || refcount_count(&db->db_holds) > 1) { 458 int blksz = db->db.db_size; 459 spa_t *spa = db->db_objset->os_spa; 460 461 mutex_exit(&db->db_mtx); 462 abuf = arc_loan_buf(spa, blksz); 463 bcopy(db->db.db_data, abuf->b_data, blksz); 464 } else { 465 abuf = db->db_buf; 466 arc_loan_inuse_buf(abuf, db); 467 dbuf_set_data(db, NULL); 468 mutex_exit(&db->db_mtx); 469 } 470 return (abuf); 471 } 472 473 uint64_t 474 dbuf_whichblock(dnode_t *dn, uint64_t offset) 475 { 476 if (dn->dn_datablkshift) { 477 return (offset >> dn->dn_datablkshift); 478 } else { 479 ASSERT3U(offset, <, dn->dn_datablksz); 480 return (0); 481 } 482 } 483 484 static void 485 dbuf_read_done(zio_t *zio, arc_buf_t *buf, void *vdb) 486 { 487 dmu_buf_impl_t *db = vdb; 488 489 mutex_enter(&db->db_mtx); 490 ASSERT3U(db->db_state, ==, DB_READ); 491 /* 492 * All reads are synchronous, so we must have a hold on the dbuf 493 */ 494 ASSERT(refcount_count(&db->db_holds) > 0); 495 ASSERT(db->db_buf == NULL); 496 ASSERT(db->db.db_data == NULL); 497 if (db->db_level == 0 && db->db_freed_in_flight) { 498 /* we were freed in flight; disregard any error */ 499 arc_release(buf, db); 500 bzero(buf->b_data, db->db.db_size); 501 arc_buf_freeze(buf); 502 db->db_freed_in_flight = FALSE; 503 dbuf_set_data(db, buf); 504 db->db_state = DB_CACHED; 505 } else if (zio == NULL || zio->io_error == 0) { 506 dbuf_set_data(db, buf); 507 db->db_state = DB_CACHED; 508 } else { 509 ASSERT(db->db_blkid != DMU_BONUS_BLKID); 510 ASSERT3P(db->db_buf, ==, NULL); 511 VERIFY(arc_buf_remove_ref(buf, db)); 512 db->db_state = DB_UNCACHED; 513 } 514 cv_broadcast(&db->db_changed); 515 dbuf_rele_and_unlock(db, NULL); 516 } 517 518 static void 519 dbuf_read_impl(dmu_buf_impl_t 
*db, zio_t *zio, uint32_t *flags) 520 { 521 dnode_t *dn; 522 zbookmark_t zb; 523 uint32_t aflags = ARC_NOWAIT; 524 525 DB_DNODE_ENTER(db); 526 dn = DB_DNODE(db); 527 ASSERT(!refcount_is_zero(&db->db_holds)); 528 /* We need the struct_rwlock to prevent db_blkptr from changing. */ 529 ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); 530 ASSERT(MUTEX_HELD(&db->db_mtx)); 531 ASSERT(db->db_state == DB_UNCACHED); 532 ASSERT(db->db_buf == NULL); 533 534 if (db->db_blkid == DMU_BONUS_BLKID) { 535 int bonuslen = MIN(dn->dn_bonuslen, dn->dn_phys->dn_bonuslen); 536 537 ASSERT3U(bonuslen, <=, db->db.db_size); 538 db->db.db_data = zio_buf_alloc(DN_MAX_BONUSLEN); 539 arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER); 540 if (bonuslen < DN_MAX_BONUSLEN) 541 bzero(db->db.db_data, DN_MAX_BONUSLEN); 542 if (bonuslen) 543 bcopy(DN_BONUS(dn->dn_phys), db->db.db_data, bonuslen); 544 DB_DNODE_EXIT(db); 545 dbuf_update_data(db); 546 db->db_state = DB_CACHED; 547 mutex_exit(&db->db_mtx); 548 return; 549 } 550 551 /* 552 * Recheck BP_IS_HOLE() after dnode_block_freed() in case dnode_sync() 553 * processes the delete record and clears the bp while we are waiting 554 * for the dn_mtx (resulting in a "no" from block_freed). 555 */ 556 if (db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr) || 557 (db->db_level == 0 && (dnode_block_freed(dn, db->db_blkid) || 558 BP_IS_HOLE(db->db_blkptr)))) { 559 arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); 560 561 DB_DNODE_EXIT(db); 562 dbuf_set_data(db, arc_buf_alloc(db->db_objset->os_spa, 563 db->db.db_size, db, type)); 564 bzero(db->db.db_data, db->db.db_size); 565 db->db_state = DB_CACHED; 566 *flags |= DB_RF_CACHED; 567 mutex_exit(&db->db_mtx); 568 return; 569 } 570 571 DB_DNODE_EXIT(db); 572 573 db->db_state = DB_READ; 574 mutex_exit(&db->db_mtx); 575 576 if (DBUF_IS_L2CACHEABLE(db)) 577 aflags |= ARC_L2CACHE; 578 if (DBUF_IS_L2COMPRESSIBLE(db)) 579 aflags |= ARC_L2COMPRESS; 580 581 SET_BOOKMARK(&zb, db->db_objset->os_dsl_dataset ? 582 db->db_objset->os_dsl_dataset->ds_object : DMU_META_OBJSET, 583 db->db.db_object, db->db_level, db->db_blkid); 584 585 dbuf_add_ref(db, NULL); 586 587 (void) arc_read(zio, db->db_objset->os_spa, db->db_blkptr, 588 dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ, 589 (*flags & DB_RF_CANFAIL) ? ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED, 590 &aflags, &zb); 591 if (aflags & ARC_CACHED) 592 *flags |= DB_RF_CACHED; 593 } 594 595 int 596 dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags) 597 { 598 int err = 0; 599 boolean_t havepzio = (zio != NULL); 600 boolean_t prefetch; 601 dnode_t *dn; 602 603 /* 604 * We don't have to hold the mutex to check db_state because it 605 * can't be freed while we have a hold on the buffer. 
606 */ 607 ASSERT(!refcount_is_zero(&db->db_holds)); 608 609 if (db->db_state == DB_NOFILL) 610 return (SET_ERROR(EIO)); 611 612 DB_DNODE_ENTER(db); 613 dn = DB_DNODE(db); 614 if ((flags & DB_RF_HAVESTRUCT) == 0) 615 rw_enter(&dn->dn_struct_rwlock, RW_READER); 616 617 prefetch = db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID && 618 (flags & DB_RF_NOPREFETCH) == 0 && dn != NULL && 619 DBUF_IS_CACHEABLE(db); 620 621 mutex_enter(&db->db_mtx); 622 if (db->db_state == DB_CACHED) { 623 mutex_exit(&db->db_mtx); 624 if (prefetch) 625 dmu_zfetch(&dn->dn_zfetch, db->db.db_offset, 626 db->db.db_size, TRUE); 627 if ((flags & DB_RF_HAVESTRUCT) == 0) 628 rw_exit(&dn->dn_struct_rwlock); 629 DB_DNODE_EXIT(db); 630 } else if (db->db_state == DB_UNCACHED) { 631 spa_t *spa = dn->dn_objset->os_spa; 632 633 if (zio == NULL) 634 zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); 635 dbuf_read_impl(db, zio, &flags); 636 637 /* dbuf_read_impl has dropped db_mtx for us */ 638 639 if (prefetch) 640 dmu_zfetch(&dn->dn_zfetch, db->db.db_offset, 641 db->db.db_size, flags & DB_RF_CACHED); 642 643 if ((flags & DB_RF_HAVESTRUCT) == 0) 644 rw_exit(&dn->dn_struct_rwlock); 645 DB_DNODE_EXIT(db); 646 647 if (!havepzio) 648 err = zio_wait(zio); 649 } else { 650 /* 651 * Another reader came in while the dbuf was in flight 652 * between UNCACHED and CACHED. Either a writer will finish 653 * writing the buffer (sending the dbuf to CACHED) or the 654 * first reader's request will reach the read_done callback 655 * and send the dbuf to CACHED. Otherwise, a failure 656 * occurred and the dbuf went to UNCACHED. 657 */ 658 mutex_exit(&db->db_mtx); 659 if (prefetch) 660 dmu_zfetch(&dn->dn_zfetch, db->db.db_offset, 661 db->db.db_size, TRUE); 662 if ((flags & DB_RF_HAVESTRUCT) == 0) 663 rw_exit(&dn->dn_struct_rwlock); 664 DB_DNODE_EXIT(db); 665 666 /* Skip the wait per the caller's request. */ 667 mutex_enter(&db->db_mtx); 668 if ((flags & DB_RF_NEVERWAIT) == 0) { 669 while (db->db_state == DB_READ || 670 db->db_state == DB_FILL) { 671 ASSERT(db->db_state == DB_READ || 672 (flags & DB_RF_HAVESTRUCT) == 0); 673 cv_wait(&db->db_changed, &db->db_mtx); 674 } 675 if (db->db_state == DB_UNCACHED) 676 err = SET_ERROR(EIO); 677 } 678 mutex_exit(&db->db_mtx); 679 } 680 681 ASSERT(err || havepzio || db->db_state == DB_CACHED); 682 return (err); 683 } 684 685 static void 686 dbuf_noread(dmu_buf_impl_t *db) 687 { 688 ASSERT(!refcount_is_zero(&db->db_holds)); 689 ASSERT(db->db_blkid != DMU_BONUS_BLKID); 690 mutex_enter(&db->db_mtx); 691 while (db->db_state == DB_READ || db->db_state == DB_FILL) 692 cv_wait(&db->db_changed, &db->db_mtx); 693 if (db->db_state == DB_UNCACHED) { 694 arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); 695 spa_t *spa = db->db_objset->os_spa; 696 697 ASSERT(db->db_buf == NULL); 698 ASSERT(db->db.db_data == NULL); 699 dbuf_set_data(db, arc_buf_alloc(spa, db->db.db_size, db, type)); 700 db->db_state = DB_FILL; 701 } else if (db->db_state == DB_NOFILL) { 702 dbuf_set_data(db, NULL); 703 } else { 704 ASSERT3U(db->db_state, ==, DB_CACHED); 705 } 706 mutex_exit(&db->db_mtx); 707 } 708 709 /* 710 * This is our just-in-time copy function. It makes a copy of 711 * buffers, that have been modified in a previous transaction 712 * group, before we modify them in the current active group. 713 * 714 * This function is used in two places: when we are dirtying a 715 * buffer for the first time in a txg, and when we are freeing 716 * a range in a dnode that includes this buffer. 
 *
 * Note that when we are called from dbuf_free_range() we do
 * not put a hold on the buffer, we just traverse the active
 * dbuf list for the dnode.
 */
static void
dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg)
{
    dbuf_dirty_record_t *dr = db->db_last_dirty;

    ASSERT(MUTEX_HELD(&db->db_mtx));
    ASSERT(db->db.db_data != NULL);
    ASSERT(db->db_level == 0);
    ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT);

    if (dr == NULL ||
        (dr->dt.dl.dr_data !=
        ((db->db_blkid == DMU_BONUS_BLKID) ? db->db.db_data : db->db_buf)))
        return;

    /*
     * If the last dirty record for this dbuf has not yet synced
     * and it's referencing the dbuf data, either:
     * reset the reference to point to a new copy,
     * or (if there are no active holders)
     * just null out the current db_data pointer.
     */
    ASSERT(dr->dr_txg >= txg - 2);
    if (db->db_blkid == DMU_BONUS_BLKID) {
        /* Note that the data bufs here are zio_bufs */
        dr->dt.dl.dr_data = zio_buf_alloc(DN_MAX_BONUSLEN);
        arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
        bcopy(db->db.db_data, dr->dt.dl.dr_data, DN_MAX_BONUSLEN);
    } else if (refcount_count(&db->db_holds) > db->db_dirtycnt) {
        int size = db->db.db_size;
        arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
        spa_t *spa = db->db_objset->os_spa;

        dr->dt.dl.dr_data = arc_buf_alloc(spa, size, db, type);
        bcopy(db->db.db_data, dr->dt.dl.dr_data->b_data, size);
    } else {
        dbuf_set_data(db, NULL);
    }
}

void
dbuf_unoverride(dbuf_dirty_record_t *dr)
{
    dmu_buf_impl_t *db = dr->dr_dbuf;
    blkptr_t *bp = &dr->dt.dl.dr_overridden_by;
    uint64_t txg = dr->dr_txg;

    ASSERT(MUTEX_HELD(&db->db_mtx));
    ASSERT(dr->dt.dl.dr_override_state != DR_IN_DMU_SYNC);
    ASSERT(db->db_level == 0);

    if (db->db_blkid == DMU_BONUS_BLKID ||
        dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN)
        return;

    ASSERT(db->db_data_pending != dr);

    /* free this block */
    if (!BP_IS_HOLE(bp) && !dr->dt.dl.dr_nopwrite)
        zio_free(db->db_objset->os_spa, txg, bp);

    dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
    dr->dt.dl.dr_nopwrite = B_FALSE;

    /*
     * Release the already-written buffer, so we leave it in
     * a consistent dirty state. Note that all callers are
     * modifying the buffer, so they will immediately do
     * another (redundant) arc_release(). Therefore, leave
     * the buf thawed to save the effort of freezing &
     * immediately re-thawing it.
     */
    arc_release(dr->dt.dl.dr_data, db);
}

/*
 * Evict (if it's unreferenced) or clear (if it's referenced) any level-0
 * data blocks in the free range, so that any future readers will find
 * empty blocks.
 *
 * This is a no-op if the dataset is in the middle of an incremental
 * receive; see comment below for details.
 */
void
dbuf_free_range(dnode_t *dn, uint64_t start, uint64_t end, dmu_tx_t *tx)
{
    dmu_buf_impl_t *db, *db_next;
    uint64_t txg = tx->tx_txg;

    if (end > dn->dn_maxblkid && (end != DMU_SPILL_BLKID))
        end = dn->dn_maxblkid;
    dprintf_dnode(dn, "start=%llu end=%llu\n", start, end);

    mutex_enter(&dn->dn_dbufs_mtx);
    if (start >= dn->dn_unlisted_l0_blkid * dn->dn_datablksz) {
        /* There can't be any dbufs in this range; no need to search. */
        mutex_exit(&dn->dn_dbufs_mtx);
        return;
    } else if (dmu_objset_is_receiving(dn->dn_objset)) {
        /*
         * If we are receiving, we expect there to be no dbufs in
         * the range to be freed, because receive modifies each
         * block at most once, and in offset order. If this is
         * not the case, it can lead to performance problems,
         * so note that we unexpectedly took the slow path.
         */
        atomic_inc_64(&zfs_free_range_recv_miss);
    }

    for (db = list_head(&dn->dn_dbufs); db != NULL; db = db_next) {
        db_next = list_next(&dn->dn_dbufs, db);
        ASSERT(db->db_blkid != DMU_BONUS_BLKID);

        if (db->db_level != 0)
            continue;
        if (db->db_blkid < start || db->db_blkid > end)
            continue;

        /* found a level 0 buffer in the range */
        mutex_enter(&db->db_mtx);
        if (dbuf_undirty(db, tx)) {
            /* mutex has been dropped and dbuf destroyed */
            continue;
        }

        if (db->db_state == DB_UNCACHED ||
            db->db_state == DB_NOFILL ||
            db->db_state == DB_EVICTING) {
            ASSERT(db->db.db_data == NULL);
            mutex_exit(&db->db_mtx);
            continue;
        }
        if (db->db_state == DB_READ || db->db_state == DB_FILL) {
            /* will be handled in dbuf_read_done or dbuf_rele */
            db->db_freed_in_flight = TRUE;
            mutex_exit(&db->db_mtx);
            continue;
        }
        if (refcount_count(&db->db_holds) == 0) {
            ASSERT(db->db_buf);
            dbuf_clear(db);
            continue;
        }
        /* The dbuf is referenced */

        if (db->db_last_dirty != NULL) {
            dbuf_dirty_record_t *dr = db->db_last_dirty;

            if (dr->dr_txg == txg) {
                /*
                 * This buffer is "in-use", re-adjust the file
                 * size to reflect that this buffer may
                 * contain new data when we sync.
                 */
                if (db->db_blkid != DMU_SPILL_BLKID &&
                    db->db_blkid > dn->dn_maxblkid)
                    dn->dn_maxblkid = db->db_blkid;
                dbuf_unoverride(dr);
            } else {
                /*
                 * This dbuf is not dirty in the open context.
                 * Either uncache it (if it's not referenced in
                 * the open context) or reset its contents to
                 * empty.
                 */
                dbuf_fix_old_data(db, txg);
            }
        }
        /* clear the contents if it's cached */
        if (db->db_state == DB_CACHED) {
            ASSERT(db->db.db_data != NULL);
            arc_release(db->db_buf, db);
            bzero(db->db.db_data, db->db.db_size);
            arc_buf_freeze(db->db_buf);
        }

        mutex_exit(&db->db_mtx);
    }
    mutex_exit(&dn->dn_dbufs_mtx);
}

static int
dbuf_block_freeable(dmu_buf_impl_t *db)
{
    dsl_dataset_t *ds = db->db_objset->os_dsl_dataset;
    uint64_t birth_txg = 0;

    /*
     * We don't need any locking to protect db_blkptr:
     * If it's syncing, then db_last_dirty will be set
     * so we'll ignore db_blkptr.
     *
     * This logic ensures that only block births for
     * filled blocks are considered.
     */
    ASSERT(MUTEX_HELD(&db->db_mtx));
    if (db->db_last_dirty && (db->db_blkptr == NULL ||
        !BP_IS_HOLE(db->db_blkptr))) {
        birth_txg = db->db_last_dirty->dr_txg;
    } else if (db->db_blkptr != NULL && !BP_IS_HOLE(db->db_blkptr)) {
        birth_txg = db->db_blkptr->blk_birth;
    }

    /*
     * If this block doesn't exist or is in a snapshot, it can't be freed.
     * Don't pass the bp to dsl_dataset_block_freeable() since we
     * are holding the db_mtx lock and might deadlock if we are
     * prefetching a dedup-ed block.
930 */ 931 if (birth_txg != 0) 932 return (ds == NULL || 933 dsl_dataset_block_freeable(ds, NULL, birth_txg)); 934 else 935 return (B_FALSE); 936 } 937 938 void 939 dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx) 940 { 941 arc_buf_t *buf, *obuf; 942 int osize = db->db.db_size; 943 arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); 944 dnode_t *dn; 945 946 ASSERT(db->db_blkid != DMU_BONUS_BLKID); 947 948 DB_DNODE_ENTER(db); 949 dn = DB_DNODE(db); 950 951 /* XXX does *this* func really need the lock? */ 952 ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock)); 953 954 /* 955 * This call to dmu_buf_will_dirty() with the dn_struct_rwlock held 956 * is OK, because there can be no other references to the db 957 * when we are changing its size, so no concurrent DB_FILL can 958 * be happening. 959 */ 960 /* 961 * XXX we should be doing a dbuf_read, checking the return 962 * value and returning that up to our callers 963 */ 964 dmu_buf_will_dirty(&db->db, tx); 965 966 /* create the data buffer for the new block */ 967 buf = arc_buf_alloc(dn->dn_objset->os_spa, size, db, type); 968 969 /* copy old block data to the new block */ 970 obuf = db->db_buf; 971 bcopy(obuf->b_data, buf->b_data, MIN(osize, size)); 972 /* zero the remainder */ 973 if (size > osize) 974 bzero((uint8_t *)buf->b_data + osize, size - osize); 975 976 mutex_enter(&db->db_mtx); 977 dbuf_set_data(db, buf); 978 VERIFY(arc_buf_remove_ref(obuf, db)); 979 db->db.db_size = size; 980 981 if (db->db_level == 0) { 982 ASSERT3U(db->db_last_dirty->dr_txg, ==, tx->tx_txg); 983 db->db_last_dirty->dt.dl.dr_data = buf; 984 } 985 mutex_exit(&db->db_mtx); 986 987 dnode_willuse_space(dn, size-osize, tx); 988 DB_DNODE_EXIT(db); 989 } 990 991 void 992 dbuf_release_bp(dmu_buf_impl_t *db) 993 { 994 objset_t *os = db->db_objset; 995 996 ASSERT(dsl_pool_sync_context(dmu_objset_pool(os))); 997 ASSERT(arc_released(os->os_phys_buf) || 998 list_link_active(&os->os_dsl_dataset->ds_synced_link)); 999 ASSERT(db->db_parent == NULL || arc_released(db->db_parent->db_buf)); 1000 1001 (void) arc_release(db->db_buf, db); 1002 } 1003 1004 dbuf_dirty_record_t * 1005 dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) 1006 { 1007 dnode_t *dn; 1008 objset_t *os; 1009 dbuf_dirty_record_t **drp, *dr; 1010 int drop_struct_lock = FALSE; 1011 boolean_t do_free_accounting = B_FALSE; 1012 int txgoff = tx->tx_txg & TXG_MASK; 1013 1014 ASSERT(tx->tx_txg != 0); 1015 ASSERT(!refcount_is_zero(&db->db_holds)); 1016 DMU_TX_DIRTY_BUF(tx, db); 1017 1018 DB_DNODE_ENTER(db); 1019 dn = DB_DNODE(db); 1020 /* 1021 * Shouldn't dirty a regular buffer in syncing context. Private 1022 * objects may be dirtied in syncing context, but only if they 1023 * were already pre-dirtied in open context. 1024 */ 1025 ASSERT(!dmu_tx_is_syncing(tx) || 1026 BP_IS_HOLE(dn->dn_objset->os_rootbp) || 1027 DMU_OBJECT_IS_SPECIAL(dn->dn_object) || 1028 dn->dn_objset->os_dsl_dataset == NULL); 1029 /* 1030 * We make this assert for private objects as well, but after we 1031 * check if we're already dirty. They are allowed to re-dirty 1032 * in syncing context. 1033 */ 1034 ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT || 1035 dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx == 1036 (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN)); 1037 1038 mutex_enter(&db->db_mtx); 1039 /* 1040 * XXX make this true for indirects too? The problem is that 1041 * transactions created with dmu_tx_create_assigned() from 1042 * syncing context don't bother holding ahead. 
1043 */ 1044 ASSERT(db->db_level != 0 || 1045 db->db_state == DB_CACHED || db->db_state == DB_FILL || 1046 db->db_state == DB_NOFILL); 1047 1048 mutex_enter(&dn->dn_mtx); 1049 /* 1050 * Don't set dirtyctx to SYNC if we're just modifying this as we 1051 * initialize the objset. 1052 */ 1053 if (dn->dn_dirtyctx == DN_UNDIRTIED && 1054 !BP_IS_HOLE(dn->dn_objset->os_rootbp)) { 1055 dn->dn_dirtyctx = 1056 (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN); 1057 ASSERT(dn->dn_dirtyctx_firstset == NULL); 1058 dn->dn_dirtyctx_firstset = kmem_alloc(1, KM_SLEEP); 1059 } 1060 mutex_exit(&dn->dn_mtx); 1061 1062 if (db->db_blkid == DMU_SPILL_BLKID) 1063 dn->dn_have_spill = B_TRUE; 1064 1065 /* 1066 * If this buffer is already dirty, we're done. 1067 */ 1068 drp = &db->db_last_dirty; 1069 ASSERT(*drp == NULL || (*drp)->dr_txg <= tx->tx_txg || 1070 db->db.db_object == DMU_META_DNODE_OBJECT); 1071 while ((dr = *drp) != NULL && dr->dr_txg > tx->tx_txg) 1072 drp = &dr->dr_next; 1073 if (dr && dr->dr_txg == tx->tx_txg) { 1074 DB_DNODE_EXIT(db); 1075 1076 if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID) { 1077 /* 1078 * If this buffer has already been written out, 1079 * we now need to reset its state. 1080 */ 1081 dbuf_unoverride(dr); 1082 if (db->db.db_object != DMU_META_DNODE_OBJECT && 1083 db->db_state != DB_NOFILL) 1084 arc_buf_thaw(db->db_buf); 1085 } 1086 mutex_exit(&db->db_mtx); 1087 return (dr); 1088 } 1089 1090 /* 1091 * Only valid if not already dirty. 1092 */ 1093 ASSERT(dn->dn_object == 0 || 1094 dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx == 1095 (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN)); 1096 1097 ASSERT3U(dn->dn_nlevels, >, db->db_level); 1098 ASSERT((dn->dn_phys->dn_nlevels == 0 && db->db_level == 0) || 1099 dn->dn_phys->dn_nlevels > db->db_level || 1100 dn->dn_next_nlevels[txgoff] > db->db_level || 1101 dn->dn_next_nlevels[(tx->tx_txg-1) & TXG_MASK] > db->db_level || 1102 dn->dn_next_nlevels[(tx->tx_txg-2) & TXG_MASK] > db->db_level); 1103 1104 /* 1105 * We should only be dirtying in syncing context if it's the 1106 * mos or we're initializing the os or it's a special object. 1107 * However, we are allowed to dirty in syncing context provided 1108 * we already dirtied it in open context. Hence we must make 1109 * this assertion only if we're not already dirty. 1110 */ 1111 os = dn->dn_objset; 1112 ASSERT(!dmu_tx_is_syncing(tx) || DMU_OBJECT_IS_SPECIAL(dn->dn_object) || 1113 os->os_dsl_dataset == NULL || BP_IS_HOLE(os->os_rootbp)); 1114 ASSERT(db->db.db_size != 0); 1115 1116 dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size); 1117 1118 if (db->db_blkid != DMU_BONUS_BLKID) { 1119 /* 1120 * Update the accounting. 1121 * Note: we delay "free accounting" until after we drop 1122 * the db_mtx. This keeps us from grabbing other locks 1123 * (and possibly deadlocking) in bp_get_dsize() while 1124 * also holding the db_mtx. 1125 */ 1126 dnode_willuse_space(dn, db->db.db_size, tx); 1127 do_free_accounting = dbuf_block_freeable(db); 1128 } 1129 1130 /* 1131 * If this buffer is dirty in an old transaction group we need 1132 * to make a copy of it so that the changes we make in this 1133 * transaction group won't leak out when we sync the older txg. 
1134 */ 1135 dr = kmem_zalloc(sizeof (dbuf_dirty_record_t), KM_SLEEP); 1136 if (db->db_level == 0) { 1137 void *data_old = db->db_buf; 1138 1139 if (db->db_state != DB_NOFILL) { 1140 if (db->db_blkid == DMU_BONUS_BLKID) { 1141 dbuf_fix_old_data(db, tx->tx_txg); 1142 data_old = db->db.db_data; 1143 } else if (db->db.db_object != DMU_META_DNODE_OBJECT) { 1144 /* 1145 * Release the data buffer from the cache so 1146 * that we can modify it without impacting 1147 * possible other users of this cached data 1148 * block. Note that indirect blocks and 1149 * private objects are not released until the 1150 * syncing state (since they are only modified 1151 * then). 1152 */ 1153 arc_release(db->db_buf, db); 1154 dbuf_fix_old_data(db, tx->tx_txg); 1155 data_old = db->db_buf; 1156 } 1157 ASSERT(data_old != NULL); 1158 } 1159 dr->dt.dl.dr_data = data_old; 1160 } else { 1161 mutex_init(&dr->dt.di.dr_mtx, NULL, MUTEX_DEFAULT, NULL); 1162 list_create(&dr->dt.di.dr_children, 1163 sizeof (dbuf_dirty_record_t), 1164 offsetof(dbuf_dirty_record_t, dr_dirty_node)); 1165 } 1166 if (db->db_blkid != DMU_BONUS_BLKID && os->os_dsl_dataset != NULL) 1167 dr->dr_accounted = db->db.db_size; 1168 dr->dr_dbuf = db; 1169 dr->dr_txg = tx->tx_txg; 1170 dr->dr_next = *drp; 1171 *drp = dr; 1172 1173 /* 1174 * We could have been freed_in_flight between the dbuf_noread 1175 * and dbuf_dirty. We win, as though the dbuf_noread() had 1176 * happened after the free. 1177 */ 1178 if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID && 1179 db->db_blkid != DMU_SPILL_BLKID) { 1180 mutex_enter(&dn->dn_mtx); 1181 if (dn->dn_free_ranges[txgoff] != NULL) { 1182 range_tree_clear(dn->dn_free_ranges[txgoff], 1183 db->db_blkid, 1); 1184 } 1185 mutex_exit(&dn->dn_mtx); 1186 db->db_freed_in_flight = FALSE; 1187 } 1188 1189 /* 1190 * This buffer is now part of this txg 1191 */ 1192 dbuf_add_ref(db, (void *)(uintptr_t)tx->tx_txg); 1193 db->db_dirtycnt += 1; 1194 ASSERT3U(db->db_dirtycnt, <=, 3); 1195 1196 mutex_exit(&db->db_mtx); 1197 1198 if (db->db_blkid == DMU_BONUS_BLKID || 1199 db->db_blkid == DMU_SPILL_BLKID) { 1200 mutex_enter(&dn->dn_mtx); 1201 ASSERT(!list_link_active(&dr->dr_dirty_node)); 1202 list_insert_tail(&dn->dn_dirty_records[txgoff], dr); 1203 mutex_exit(&dn->dn_mtx); 1204 dnode_setdirty(dn, tx); 1205 DB_DNODE_EXIT(db); 1206 return (dr); 1207 } else if (do_free_accounting) { 1208 blkptr_t *bp = db->db_blkptr; 1209 int64_t willfree = (bp && !BP_IS_HOLE(bp)) ? 1210 bp_get_dsize(os->os_spa, bp) : db->db.db_size; 1211 /* 1212 * This is only a guess -- if the dbuf is dirty 1213 * in a previous txg, we don't know how much 1214 * space it will use on disk yet. We should 1215 * really have the struct_rwlock to access 1216 * db_blkptr, but since this is just a guess, 1217 * it's OK if we get an odd answer. 
1218 */ 1219 ddt_prefetch(os->os_spa, bp); 1220 dnode_willuse_space(dn, -willfree, tx); 1221 } 1222 1223 if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) { 1224 rw_enter(&dn->dn_struct_rwlock, RW_READER); 1225 drop_struct_lock = TRUE; 1226 } 1227 1228 if (db->db_level == 0) { 1229 dnode_new_blkid(dn, db->db_blkid, tx, drop_struct_lock); 1230 ASSERT(dn->dn_maxblkid >= db->db_blkid); 1231 } 1232 1233 if (db->db_level+1 < dn->dn_nlevels) { 1234 dmu_buf_impl_t *parent = db->db_parent; 1235 dbuf_dirty_record_t *di; 1236 int parent_held = FALSE; 1237 1238 if (db->db_parent == NULL || db->db_parent == dn->dn_dbuf) { 1239 int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; 1240 1241 parent = dbuf_hold_level(dn, db->db_level+1, 1242 db->db_blkid >> epbs, FTAG); 1243 ASSERT(parent != NULL); 1244 parent_held = TRUE; 1245 } 1246 if (drop_struct_lock) 1247 rw_exit(&dn->dn_struct_rwlock); 1248 ASSERT3U(db->db_level+1, ==, parent->db_level); 1249 di = dbuf_dirty(parent, tx); 1250 if (parent_held) 1251 dbuf_rele(parent, FTAG); 1252 1253 mutex_enter(&db->db_mtx); 1254 /* 1255 * Since we've dropped the mutex, it's possible that 1256 * dbuf_undirty() might have changed this out from under us. 1257 */ 1258 if (db->db_last_dirty == dr || 1259 dn->dn_object == DMU_META_DNODE_OBJECT) { 1260 mutex_enter(&di->dt.di.dr_mtx); 1261 ASSERT3U(di->dr_txg, ==, tx->tx_txg); 1262 ASSERT(!list_link_active(&dr->dr_dirty_node)); 1263 list_insert_tail(&di->dt.di.dr_children, dr); 1264 mutex_exit(&di->dt.di.dr_mtx); 1265 dr->dr_parent = di; 1266 } 1267 mutex_exit(&db->db_mtx); 1268 } else { 1269 ASSERT(db->db_level+1 == dn->dn_nlevels); 1270 ASSERT(db->db_blkid < dn->dn_nblkptr); 1271 ASSERT(db->db_parent == NULL || db->db_parent == dn->dn_dbuf); 1272 mutex_enter(&dn->dn_mtx); 1273 ASSERT(!list_link_active(&dr->dr_dirty_node)); 1274 list_insert_tail(&dn->dn_dirty_records[txgoff], dr); 1275 mutex_exit(&dn->dn_mtx); 1276 if (drop_struct_lock) 1277 rw_exit(&dn->dn_struct_rwlock); 1278 } 1279 1280 dnode_setdirty(dn, tx); 1281 DB_DNODE_EXIT(db); 1282 return (dr); 1283 } 1284 1285 /* 1286 * Undirty a buffer in the transaction group referenced by the given 1287 * transaction. Return whether this evicted the dbuf. 1288 */ 1289 static boolean_t 1290 dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) 1291 { 1292 dnode_t *dn; 1293 uint64_t txg = tx->tx_txg; 1294 dbuf_dirty_record_t *dr, **drp; 1295 1296 ASSERT(txg != 0); 1297 ASSERT(db->db_blkid != DMU_BONUS_BLKID); 1298 ASSERT0(db->db_level); 1299 ASSERT(MUTEX_HELD(&db->db_mtx)); 1300 1301 /* 1302 * If this buffer is not dirty, we're done. 1303 */ 1304 for (drp = &db->db_last_dirty; (dr = *drp) != NULL; drp = &dr->dr_next) 1305 if (dr->dr_txg <= txg) 1306 break; 1307 if (dr == NULL || dr->dr_txg < txg) 1308 return (B_FALSE); 1309 ASSERT(dr->dr_txg == txg); 1310 ASSERT(dr->dr_dbuf == db); 1311 1312 DB_DNODE_ENTER(db); 1313 dn = DB_DNODE(db); 1314 1315 dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size); 1316 1317 ASSERT(db->db.db_size != 0); 1318 1319 /* 1320 * Any space we accounted for in dp_dirty_* will be cleaned up by 1321 * dsl_pool_sync(). This is relatively rare so the discrepancy 1322 * is not a big deal. 1323 */ 1324 1325 *drp = dr->dr_next; 1326 1327 /* 1328 * Note that there are three places in dbuf_dirty() 1329 * where this dirty record may be put on a list. 1330 * Make sure to do a list_remove corresponding to 1331 * every one of those list_insert calls. 
1332 */ 1333 if (dr->dr_parent) { 1334 mutex_enter(&dr->dr_parent->dt.di.dr_mtx); 1335 list_remove(&dr->dr_parent->dt.di.dr_children, dr); 1336 mutex_exit(&dr->dr_parent->dt.di.dr_mtx); 1337 } else if (db->db_blkid == DMU_SPILL_BLKID || 1338 db->db_level+1 == dn->dn_nlevels) { 1339 ASSERT(db->db_blkptr == NULL || db->db_parent == dn->dn_dbuf); 1340 mutex_enter(&dn->dn_mtx); 1341 list_remove(&dn->dn_dirty_records[txg & TXG_MASK], dr); 1342 mutex_exit(&dn->dn_mtx); 1343 } 1344 DB_DNODE_EXIT(db); 1345 1346 if (db->db_state != DB_NOFILL) { 1347 dbuf_unoverride(dr); 1348 1349 ASSERT(db->db_buf != NULL); 1350 ASSERT(dr->dt.dl.dr_data != NULL); 1351 if (dr->dt.dl.dr_data != db->db_buf) 1352 VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data, db)); 1353 } 1354 kmem_free(dr, sizeof (dbuf_dirty_record_t)); 1355 1356 ASSERT(db->db_dirtycnt > 0); 1357 db->db_dirtycnt -= 1; 1358 1359 if (refcount_remove(&db->db_holds, (void *)(uintptr_t)txg) == 0) { 1360 arc_buf_t *buf = db->db_buf; 1361 1362 ASSERT(db->db_state == DB_NOFILL || arc_released(buf)); 1363 dbuf_set_data(db, NULL); 1364 VERIFY(arc_buf_remove_ref(buf, db)); 1365 dbuf_evict(db); 1366 return (B_TRUE); 1367 } 1368 1369 return (B_FALSE); 1370 } 1371 1372 void 1373 dmu_buf_will_dirty(dmu_buf_t *db_fake, dmu_tx_t *tx) 1374 { 1375 dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; 1376 int rf = DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH; 1377 1378 ASSERT(tx->tx_txg != 0); 1379 ASSERT(!refcount_is_zero(&db->db_holds)); 1380 1381 DB_DNODE_ENTER(db); 1382 if (RW_WRITE_HELD(&DB_DNODE(db)->dn_struct_rwlock)) 1383 rf |= DB_RF_HAVESTRUCT; 1384 DB_DNODE_EXIT(db); 1385 (void) dbuf_read(db, NULL, rf); 1386 (void) dbuf_dirty(db, tx); 1387 } 1388 1389 void 1390 dmu_buf_will_not_fill(dmu_buf_t *db_fake, dmu_tx_t *tx) 1391 { 1392 dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; 1393 1394 db->db_state = DB_NOFILL; 1395 1396 dmu_buf_will_fill(db_fake, tx); 1397 } 1398 1399 void 1400 dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx) 1401 { 1402 dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; 1403 1404 ASSERT(db->db_blkid != DMU_BONUS_BLKID); 1405 ASSERT(tx->tx_txg != 0); 1406 ASSERT(db->db_level == 0); 1407 ASSERT(!refcount_is_zero(&db->db_holds)); 1408 1409 ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT || 1410 dmu_tx_private_ok(tx)); 1411 1412 dbuf_noread(db); 1413 (void) dbuf_dirty(db, tx); 1414 } 1415 1416 #pragma weak dmu_buf_fill_done = dbuf_fill_done 1417 /* ARGSUSED */ 1418 void 1419 dbuf_fill_done(dmu_buf_impl_t *db, dmu_tx_t *tx) 1420 { 1421 mutex_enter(&db->db_mtx); 1422 DBUF_VERIFY(db); 1423 1424 if (db->db_state == DB_FILL) { 1425 if (db->db_level == 0 && db->db_freed_in_flight) { 1426 ASSERT(db->db_blkid != DMU_BONUS_BLKID); 1427 /* we were freed while filling */ 1428 /* XXX dbuf_undirty? */ 1429 bzero(db->db.db_data, db->db.db_size); 1430 db->db_freed_in_flight = FALSE; 1431 } 1432 db->db_state = DB_CACHED; 1433 cv_broadcast(&db->db_changed); 1434 } 1435 mutex_exit(&db->db_mtx); 1436 } 1437 1438 /* 1439 * Directly assign a provided arc buf to a given dbuf if it's not referenced 1440 * by anybody except our caller. Otherwise copy arcbuf's contents to dbuf. 
1441 */ 1442 void 1443 dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx) 1444 { 1445 ASSERT(!refcount_is_zero(&db->db_holds)); 1446 ASSERT(db->db_blkid != DMU_BONUS_BLKID); 1447 ASSERT(db->db_level == 0); 1448 ASSERT(DBUF_GET_BUFC_TYPE(db) == ARC_BUFC_DATA); 1449 ASSERT(buf != NULL); 1450 ASSERT(arc_buf_size(buf) == db->db.db_size); 1451 ASSERT(tx->tx_txg != 0); 1452 1453 arc_return_buf(buf, db); 1454 ASSERT(arc_released(buf)); 1455 1456 mutex_enter(&db->db_mtx); 1457 1458 while (db->db_state == DB_READ || db->db_state == DB_FILL) 1459 cv_wait(&db->db_changed, &db->db_mtx); 1460 1461 ASSERT(db->db_state == DB_CACHED || db->db_state == DB_UNCACHED); 1462 1463 if (db->db_state == DB_CACHED && 1464 refcount_count(&db->db_holds) - 1 > db->db_dirtycnt) { 1465 mutex_exit(&db->db_mtx); 1466 (void) dbuf_dirty(db, tx); 1467 bcopy(buf->b_data, db->db.db_data, db->db.db_size); 1468 VERIFY(arc_buf_remove_ref(buf, db)); 1469 xuio_stat_wbuf_copied(); 1470 return; 1471 } 1472 1473 xuio_stat_wbuf_nocopy(); 1474 if (db->db_state == DB_CACHED) { 1475 dbuf_dirty_record_t *dr = db->db_last_dirty; 1476 1477 ASSERT(db->db_buf != NULL); 1478 if (dr != NULL && dr->dr_txg == tx->tx_txg) { 1479 ASSERT(dr->dt.dl.dr_data == db->db_buf); 1480 if (!arc_released(db->db_buf)) { 1481 ASSERT(dr->dt.dl.dr_override_state == 1482 DR_OVERRIDDEN); 1483 arc_release(db->db_buf, db); 1484 } 1485 dr->dt.dl.dr_data = buf; 1486 VERIFY(arc_buf_remove_ref(db->db_buf, db)); 1487 } else if (dr == NULL || dr->dt.dl.dr_data != db->db_buf) { 1488 arc_release(db->db_buf, db); 1489 VERIFY(arc_buf_remove_ref(db->db_buf, db)); 1490 } 1491 db->db_buf = NULL; 1492 } 1493 ASSERT(db->db_buf == NULL); 1494 dbuf_set_data(db, buf); 1495 db->db_state = DB_FILL; 1496 mutex_exit(&db->db_mtx); 1497 (void) dbuf_dirty(db, tx); 1498 dmu_buf_fill_done(&db->db, tx); 1499 } 1500 1501 /* 1502 * "Clear" the contents of this dbuf. This will mark the dbuf 1503 * EVICTING and clear *most* of its references. Unfortunately, 1504 * when we are not holding the dn_dbufs_mtx, we can't clear the 1505 * entry in the dn_dbufs list. We have to wait until dbuf_destroy() 1506 * in this case. 
For callers from the DMU we will usually see: 1507 * dbuf_clear()->arc_buf_evict()->dbuf_do_evict()->dbuf_destroy() 1508 * For the arc callback, we will usually see: 1509 * dbuf_do_evict()->dbuf_clear();dbuf_destroy() 1510 * Sometimes, though, we will get a mix of these two: 1511 * DMU: dbuf_clear()->arc_buf_evict() 1512 * ARC: dbuf_do_evict()->dbuf_destroy() 1513 */ 1514 void 1515 dbuf_clear(dmu_buf_impl_t *db) 1516 { 1517 dnode_t *dn; 1518 dmu_buf_impl_t *parent = db->db_parent; 1519 dmu_buf_impl_t *dndb; 1520 int dbuf_gone = FALSE; 1521 1522 ASSERT(MUTEX_HELD(&db->db_mtx)); 1523 ASSERT(refcount_is_zero(&db->db_holds)); 1524 1525 dbuf_evict_user(db); 1526 1527 if (db->db_state == DB_CACHED) { 1528 ASSERT(db->db.db_data != NULL); 1529 if (db->db_blkid == DMU_BONUS_BLKID) { 1530 zio_buf_free(db->db.db_data, DN_MAX_BONUSLEN); 1531 arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER); 1532 } 1533 db->db.db_data = NULL; 1534 db->db_state = DB_UNCACHED; 1535 } 1536 1537 ASSERT(db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL); 1538 ASSERT(db->db_data_pending == NULL); 1539 1540 db->db_state = DB_EVICTING; 1541 db->db_blkptr = NULL; 1542 1543 DB_DNODE_ENTER(db); 1544 dn = DB_DNODE(db); 1545 dndb = dn->dn_dbuf; 1546 if (db->db_blkid != DMU_BONUS_BLKID && MUTEX_HELD(&dn->dn_dbufs_mtx)) { 1547 list_remove(&dn->dn_dbufs, db); 1548 (void) atomic_dec_32_nv(&dn->dn_dbufs_count); 1549 membar_producer(); 1550 DB_DNODE_EXIT(db); 1551 /* 1552 * Decrementing the dbuf count means that the hold corresponding 1553 * to the removed dbuf is no longer discounted in dnode_move(), 1554 * so the dnode cannot be moved until after we release the hold. 1555 * The membar_producer() ensures visibility of the decremented 1556 * value in dnode_move(), since DB_DNODE_EXIT doesn't actually 1557 * release any lock. 1558 */ 1559 dnode_rele(dn, db); 1560 db->db_dnode_handle = NULL; 1561 } else { 1562 DB_DNODE_EXIT(db); 1563 } 1564 1565 if (db->db_buf) 1566 dbuf_gone = arc_buf_evict(db->db_buf); 1567 1568 if (!dbuf_gone) 1569 mutex_exit(&db->db_mtx); 1570 1571 /* 1572 * If this dbuf is referenced from an indirect dbuf, 1573 * decrement the ref count on the indirect dbuf. 
1574 */ 1575 if (parent && parent != dndb) 1576 dbuf_rele(parent, db); 1577 } 1578 1579 static int 1580 dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse, 1581 dmu_buf_impl_t **parentp, blkptr_t **bpp) 1582 { 1583 int nlevels, epbs; 1584 1585 *parentp = NULL; 1586 *bpp = NULL; 1587 1588 ASSERT(blkid != DMU_BONUS_BLKID); 1589 1590 if (blkid == DMU_SPILL_BLKID) { 1591 mutex_enter(&dn->dn_mtx); 1592 if (dn->dn_have_spill && 1593 (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR)) 1594 *bpp = &dn->dn_phys->dn_spill; 1595 else 1596 *bpp = NULL; 1597 dbuf_add_ref(dn->dn_dbuf, NULL); 1598 *parentp = dn->dn_dbuf; 1599 mutex_exit(&dn->dn_mtx); 1600 return (0); 1601 } 1602 1603 if (dn->dn_phys->dn_nlevels == 0) 1604 nlevels = 1; 1605 else 1606 nlevels = dn->dn_phys->dn_nlevels; 1607 1608 epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; 1609 1610 ASSERT3U(level * epbs, <, 64); 1611 ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); 1612 if (level >= nlevels || 1613 (blkid > (dn->dn_phys->dn_maxblkid >> (level * epbs)))) { 1614 /* the buffer has no parent yet */ 1615 return (SET_ERROR(ENOENT)); 1616 } else if (level < nlevels-1) { 1617 /* this block is referenced from an indirect block */ 1618 int err = dbuf_hold_impl(dn, level+1, 1619 blkid >> epbs, fail_sparse, NULL, parentp); 1620 if (err) 1621 return (err); 1622 err = dbuf_read(*parentp, NULL, 1623 (DB_RF_HAVESTRUCT | DB_RF_NOPREFETCH | DB_RF_CANFAIL)); 1624 if (err) { 1625 dbuf_rele(*parentp, NULL); 1626 *parentp = NULL; 1627 return (err); 1628 } 1629 *bpp = ((blkptr_t *)(*parentp)->db.db_data) + 1630 (blkid & ((1ULL << epbs) - 1)); 1631 return (0); 1632 } else { 1633 /* the block is referenced from the dnode */ 1634 ASSERT3U(level, ==, nlevels-1); 1635 ASSERT(dn->dn_phys->dn_nblkptr == 0 || 1636 blkid < dn->dn_phys->dn_nblkptr); 1637 if (dn->dn_dbuf) { 1638 dbuf_add_ref(dn->dn_dbuf, NULL); 1639 *parentp = dn->dn_dbuf; 1640 } 1641 *bpp = &dn->dn_phys->dn_blkptr[blkid]; 1642 return (0); 1643 } 1644 } 1645 1646 static dmu_buf_impl_t * 1647 dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid, 1648 dmu_buf_impl_t *parent, blkptr_t *blkptr) 1649 { 1650 objset_t *os = dn->dn_objset; 1651 dmu_buf_impl_t *db, *odb; 1652 1653 ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); 1654 ASSERT(dn->dn_type != DMU_OT_NONE); 1655 1656 db = kmem_cache_alloc(dbuf_cache, KM_SLEEP); 1657 1658 db->db_objset = os; 1659 db->db.db_object = dn->dn_object; 1660 db->db_level = level; 1661 db->db_blkid = blkid; 1662 db->db_last_dirty = NULL; 1663 db->db_dirtycnt = 0; 1664 db->db_dnode_handle = dn->dn_handle; 1665 db->db_parent = parent; 1666 db->db_blkptr = blkptr; 1667 1668 db->db_user_ptr = NULL; 1669 db->db_user_data_ptr_ptr = NULL; 1670 db->db_evict_func = NULL; 1671 db->db_immediate_evict = 0; 1672 db->db_freed_in_flight = 0; 1673 1674 if (blkid == DMU_BONUS_BLKID) { 1675 ASSERT3P(parent, ==, dn->dn_dbuf); 1676 db->db.db_size = DN_MAX_BONUSLEN - 1677 (dn->dn_nblkptr-1) * sizeof (blkptr_t); 1678 ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen); 1679 db->db.db_offset = DMU_BONUS_BLKID; 1680 db->db_state = DB_UNCACHED; 1681 /* the bonus dbuf is not placed in the hash table */ 1682 arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER); 1683 return (db); 1684 } else if (blkid == DMU_SPILL_BLKID) { 1685 db->db.db_size = (blkptr != NULL) ? 1686 BP_GET_LSIZE(blkptr) : SPA_MINBLOCKSIZE; 1687 db->db.db_offset = 0; 1688 } else { 1689 int blocksize = 1690 db->db_level ? 
    1 << dn->dn_indblkshift : dn->dn_datablksz;
        db->db.db_size = blocksize;
        db->db.db_offset = db->db_blkid * blocksize;
    }

    /*
     * Hold the dn_dbufs_mtx while we get the new dbuf
     * in the hash table *and* added to the dbufs list.
     * This prevents a possible deadlock with someone
     * trying to look up this dbuf before it's added to the
     * dn_dbufs list.
     */
    mutex_enter(&dn->dn_dbufs_mtx);
    db->db_state = DB_EVICTING;
    if ((odb = dbuf_hash_insert(db)) != NULL) {
        /* someone else inserted it first */
        kmem_cache_free(dbuf_cache, db);
        mutex_exit(&dn->dn_dbufs_mtx);
        return (odb);
    }
    list_insert_head(&dn->dn_dbufs, db);
    if (db->db_level == 0 && db->db_blkid >=
        dn->dn_unlisted_l0_blkid)
        dn->dn_unlisted_l0_blkid = db->db_blkid + 1;
    db->db_state = DB_UNCACHED;
    mutex_exit(&dn->dn_dbufs_mtx);
    arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);

    if (parent && parent != dn->dn_dbuf)
        dbuf_add_ref(parent, db);

    ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT ||
        refcount_count(&dn->dn_holds) > 0);
    (void) refcount_add(&dn->dn_holds, db);
    (void) atomic_inc_32_nv(&dn->dn_dbufs_count);

    dprintf_dbuf(db, "db=%p\n", db);

    return (db);
}

static int
dbuf_do_evict(void *private)
{
    arc_buf_t *buf = private;
    dmu_buf_impl_t *db = buf->b_private;

    if (!MUTEX_HELD(&db->db_mtx))
        mutex_enter(&db->db_mtx);

    ASSERT(refcount_is_zero(&db->db_holds));

    if (db->db_state != DB_EVICTING) {
        ASSERT(db->db_state == DB_CACHED);
        DBUF_VERIFY(db);
        db->db_buf = NULL;
        dbuf_evict(db);
    } else {
        mutex_exit(&db->db_mtx);
        dbuf_destroy(db);
    }
    return (0);
}

static void
dbuf_destroy(dmu_buf_impl_t *db)
{
    ASSERT(refcount_is_zero(&db->db_holds));

    if (db->db_blkid != DMU_BONUS_BLKID) {
        /*
         * If this dbuf is still on the dn_dbufs list,
         * remove it from that list.
         */
        if (db->db_dnode_handle != NULL) {
            dnode_t *dn;

            DB_DNODE_ENTER(db);
            dn = DB_DNODE(db);
            mutex_enter(&dn->dn_dbufs_mtx);
            list_remove(&dn->dn_dbufs, db);
            (void) atomic_dec_32_nv(&dn->dn_dbufs_count);
            mutex_exit(&dn->dn_dbufs_mtx);
            DB_DNODE_EXIT(db);
            /*
             * Decrementing the dbuf count means that the hold
             * corresponding to the removed dbuf is no longer
             * discounted in dnode_move(), so the dnode cannot be
             * moved until after we release the hold.
             */
            dnode_rele(dn, db);
            db->db_dnode_handle = NULL;
        }
        dbuf_hash_remove(db);
    }
    db->db_parent = NULL;
    db->db_buf = NULL;

    ASSERT(!list_link_active(&db->db_link));
    ASSERT(db->db.db_data == NULL);
    ASSERT(db->db_hash_next == NULL);
    ASSERT(db->db_blkptr == NULL);
    ASSERT(db->db_data_pending == NULL);

    kmem_cache_free(dbuf_cache, db);
    arc_space_return(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
}

void
dbuf_prefetch(dnode_t *dn, uint64_t blkid, zio_priority_t prio)
{
    dmu_buf_impl_t *db = NULL;
    blkptr_t *bp = NULL;

    ASSERT(blkid != DMU_BONUS_BLKID);
    ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));

    if (dnode_block_freed(dn, blkid))
        return;

    /* dbuf_find() returns with db_mtx held */
    if (db = dbuf_find(dn, 0, blkid)) {
        /*
         * This dbuf is already in the cache. We assume that
         * it is already CACHED, or else about to be either
         * read or filled.
         */
        mutex_exit(&db->db_mtx);
        return;
    }

    if (dbuf_findbp(dn, 0, blkid, TRUE, &db, &bp) == 0) {
        if (bp && !BP_IS_HOLE(bp)) {
            dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
            uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH;
            zbookmark_t zb;

            SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET,
                dn->dn_object, 0, blkid);

            (void) arc_read(NULL, dn->dn_objset->os_spa,
                bp, NULL, NULL, prio,
                ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
                &aflags, &zb);
        }
        if (db)
            dbuf_rele(db, NULL);
    }
}

/*
 * Returns with db_holds incremented, and db_mtx not held.
 * Note: dn_struct_rwlock must be held.
 */
int
dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, int fail_sparse,
    void *tag, dmu_buf_impl_t **dbp)
{
    dmu_buf_impl_t *db, *parent = NULL;

    ASSERT(blkid != DMU_BONUS_BLKID);
    ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
    ASSERT3U(dn->dn_nlevels, >, level);

    *dbp = NULL;
top:
    /* dbuf_find() returns with db_mtx held */
    db = dbuf_find(dn, level, blkid);

    if (db == NULL) {
        blkptr_t *bp = NULL;
        int err;

        ASSERT3P(parent, ==, NULL);
        err = dbuf_findbp(dn, level, blkid, fail_sparse, &parent, &bp);
        if (fail_sparse) {
            if (err == 0 && bp && BP_IS_HOLE(bp))
                err = SET_ERROR(ENOENT);
            if (err) {
                if (parent)
                    dbuf_rele(parent, NULL);
                return (err);
            }
        }
        if (err && err != ENOENT)
            return (err);
        db = dbuf_create(dn, level, blkid, parent, bp);
    }

    if (db->db_buf && refcount_is_zero(&db->db_holds)) {
        arc_buf_add_ref(db->db_buf, db);
        if (db->db_buf->b_data == NULL) {
            dbuf_clear(db);
            if (parent) {
                dbuf_rele(parent, NULL);
                parent = NULL;
            }
            goto top;
        }
        ASSERT3P(db->db.db_data, ==, db->db_buf->b_data);
    }

    ASSERT(db->db_buf == NULL || arc_referenced(db->db_buf));

    /*
     * If this buffer is currently syncing out, and we are
     * still referencing it from db_data, we need to make a copy
     * of it in case we decide we want to dirty it again in this txg.
     */
    if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
        dn->dn_object != DMU_META_DNODE_OBJECT &&
        db->db_state == DB_CACHED && db->db_data_pending) {
        dbuf_dirty_record_t *dr = db->db_data_pending;

        if (dr->dt.dl.dr_data == db->db_buf) {
            arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);

            dbuf_set_data(db,
                arc_buf_alloc(dn->dn_objset->os_spa,
                db->db.db_size, db, type));
            bcopy(dr->dt.dl.dr_data->b_data, db->db.db_data,
                db->db.db_size);
        }
    }

    (void) refcount_add(&db->db_holds, tag);
    dbuf_update_data(db);
    DBUF_VERIFY(db);
    mutex_exit(&db->db_mtx);

    /* NOTE: we can't rele the parent until after we drop the db_mtx */
    if (parent)
        dbuf_rele(parent, NULL);

    ASSERT3P(DB_DNODE(db), ==, dn);
    ASSERT3U(db->db_blkid, ==, blkid);
    ASSERT3U(db->db_level, ==, level);
    *dbp = db;

    return (0);
}

dmu_buf_impl_t *
dbuf_hold(dnode_t *dn, uint64_t blkid, void *tag)
{
    dmu_buf_impl_t *db;
    int err = dbuf_hold_impl(dn, 0, blkid, FALSE, tag, &db);
    return (err ?

dmu_buf_impl_t *
dbuf_hold_level(dnode_t *dn, int level, uint64_t blkid, void *tag)
{
	dmu_buf_impl_t *db;
	int err = dbuf_hold_impl(dn, level, blkid, FALSE, tag, &db);
	return (err ? NULL : db);
}

void
dbuf_create_bonus(dnode_t *dn)
{
	ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));

	ASSERT(dn->dn_bonus == NULL);
	dn->dn_bonus = dbuf_create(dn, 0, DMU_BONUS_BLKID, dn->dn_dbuf, NULL);
}

int
dbuf_spill_set_blksz(dmu_buf_t *db_fake, uint64_t blksz, dmu_tx_t *tx)
{
	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
	dnode_t *dn;

	if (db->db_blkid != DMU_SPILL_BLKID)
		return (SET_ERROR(ENOTSUP));
	if (blksz == 0)
		blksz = SPA_MINBLOCKSIZE;
	if (blksz > SPA_MAXBLOCKSIZE)
		blksz = SPA_MAXBLOCKSIZE;
	else
		blksz = P2ROUNDUP(blksz, SPA_MINBLOCKSIZE);

	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);
	rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
	dbuf_new_size(db, blksz, tx);
	rw_exit(&dn->dn_struct_rwlock);
	DB_DNODE_EXIT(db);

	return (0);
}

void
dbuf_rm_spill(dnode_t *dn, dmu_tx_t *tx)
{
	dbuf_free_range(dn, DMU_SPILL_BLKID, DMU_SPILL_BLKID, tx);
}

#pragma weak dmu_buf_add_ref = dbuf_add_ref
void
dbuf_add_ref(dmu_buf_impl_t *db, void *tag)
{
	int64_t holds = refcount_add(&db->db_holds, tag);
	ASSERT(holds > 1);
}

/*
 * If you call dbuf_rele() you had better not be referencing the dnode handle
 * unless you have some other direct or indirect hold on the dnode.  (An
 * indirect hold is a hold on one of the dnode's dbufs, including the bonus
 * buffer.)  Without that, the dbuf_rele() could lead to a dnode_rele()
 * followed by the dnode's parent dbuf evicting its dnode handles.
 */
void
dbuf_rele(dmu_buf_impl_t *db, void *tag)
{
	mutex_enter(&db->db_mtx);
	dbuf_rele_and_unlock(db, tag);
}

void
dmu_buf_rele(dmu_buf_t *db, void *tag)
{
	dbuf_rele((dmu_buf_impl_t *)db, tag);
}
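
/*
 * Example (illustrative sketch, not part of the original source): the
 * comment above dbuf_rele() means a caller must not reach through a dbuf
 * to its dnode after dropping its last hold.  Copy out anything needed
 * from the dnode while the hold is still outstanding:
 *
 *	DB_DNODE_ENTER(db);
 *	object = DB_DNODE(db)->dn_object;	// use the dnode now ...
 *	DB_DNODE_EXIT(db);
 *	dbuf_rele(db, tag);			// ... not after the rele
 *
 * Dereferencing DB_DNODE(db) after the dbuf_rele() would only be safe if
 * some other direct or indirect dnode hold were still in place.
 */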

/*
 * dbuf_rele() for an already-locked dbuf.  This is necessary to allow
 * db_dirtycnt and db_holds to be updated atomically.
 */
void
dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag)
{
	int64_t holds;

	ASSERT(MUTEX_HELD(&db->db_mtx));
	DBUF_VERIFY(db);

	/*
	 * Remove the reference to the dbuf before removing its hold on the
	 * dnode so we can guarantee in dnode_move() that a referenced bonus
	 * buffer has a corresponding dnode hold.
	 */
	holds = refcount_remove(&db->db_holds, tag);
	ASSERT(holds >= 0);

	/*
	 * We can't freeze indirects if there is a possibility that they
	 * may be modified in the current syncing context.
	 */
	if (db->db_buf && holds == (db->db_level == 0 ? db->db_dirtycnt : 0))
		arc_buf_freeze(db->db_buf);

	if (holds == db->db_dirtycnt &&
	    db->db_level == 0 && db->db_immediate_evict)
		dbuf_evict_user(db);

	if (holds == 0) {
		if (db->db_blkid == DMU_BONUS_BLKID) {
			mutex_exit(&db->db_mtx);

			/*
			 * If the dnode moves here, we cannot cross this
			 * barrier until the move completes.
			 */
			DB_DNODE_ENTER(db);
			(void) atomic_dec_32_nv(&DB_DNODE(db)->dn_dbufs_count);
			DB_DNODE_EXIT(db);
			/*
			 * The bonus buffer's dnode hold is no longer
			 * discounted in dnode_move().  The dnode cannot move
			 * until after the dnode_rele().
			 */
			dnode_rele(DB_DNODE(db), db);
		} else if (db->db_buf == NULL) {
			/*
			 * This is a special case: we never associated this
			 * dbuf with any data allocated from the ARC.
			 */
			ASSERT(db->db_state == DB_UNCACHED ||
			    db->db_state == DB_NOFILL);
			dbuf_evict(db);
		} else if (arc_released(db->db_buf)) {
			arc_buf_t *buf = db->db_buf;
			/*
			 * This dbuf has anonymous data associated with it.
			 */
			dbuf_set_data(db, NULL);
			VERIFY(arc_buf_remove_ref(buf, db));
			dbuf_evict(db);
		} else {
			VERIFY(!arc_buf_remove_ref(db->db_buf, db));

			/*
			 * A dbuf will be eligible for eviction if either the
			 * 'primarycache' property is set or a duplicate
			 * copy of this buffer is already cached in the arc.
			 *
			 * In the case of the 'primarycache' property, a
			 * buffer is considered for eviction if it matches
			 * the criteria set in the property.
			 *
			 * To decide if our buffer is considered a
			 * duplicate, we must call into the arc to determine
			 * if multiple buffers are referencing the same
			 * block on-disk.  If so, then we simply evict
			 * ourselves.
			 */
			if (!DBUF_IS_CACHEABLE(db) ||
			    arc_buf_eviction_needed(db->db_buf))
				dbuf_clear(db);
			else
				mutex_exit(&db->db_mtx);
		}
	} else {
		mutex_exit(&db->db_mtx);
	}
}

#pragma weak dmu_buf_refcount = dbuf_refcount
uint64_t
dbuf_refcount(dmu_buf_impl_t *db)
{
	return (refcount_count(&db->db_holds));
}

void *
dmu_buf_set_user(dmu_buf_t *db_fake, void *user_ptr, void *user_data_ptr_ptr,
    dmu_buf_evict_func_t *evict_func)
{
	return (dmu_buf_update_user(db_fake, NULL, user_ptr,
	    user_data_ptr_ptr, evict_func));
}

void *
dmu_buf_set_user_ie(dmu_buf_t *db_fake, void *user_ptr, void *user_data_ptr_ptr,
    dmu_buf_evict_func_t *evict_func)
{
	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;

	db->db_immediate_evict = TRUE;
	return (dmu_buf_update_user(db_fake, NULL, user_ptr,
	    user_data_ptr_ptr, evict_func));
}

void *
dmu_buf_update_user(dmu_buf_t *db_fake, void *old_user_ptr, void *user_ptr,
    void *user_data_ptr_ptr, dmu_buf_evict_func_t *evict_func)
{
	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
	ASSERT(db->db_level == 0);

	ASSERT((user_ptr == NULL) == (evict_func == NULL));

	mutex_enter(&db->db_mtx);

	if (db->db_user_ptr == old_user_ptr) {
		db->db_user_ptr = user_ptr;
		db->db_user_data_ptr_ptr = user_data_ptr_ptr;
		db->db_evict_func = evict_func;

		dbuf_update_data(db);
	} else {
		old_user_ptr = db->db_user_ptr;
	}

	mutex_exit(&db->db_mtx);
	return (old_user_ptr);
}

void *
dmu_buf_get_user(dmu_buf_t *db_fake)
{
	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
	ASSERT(!refcount_is_zero(&db->db_holds));

	return (db->db_user_ptr);
}

boolean_t
dmu_buf_freeable(dmu_buf_t *dbuf)
{
	boolean_t res = B_FALSE;
	dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf;

	if (db->db_blkptr)
		res = dsl_dataset_block_freeable(db->db_objset->os_dsl_dataset,
		    db->db_blkptr, db->db_blkptr->blk_birth);

	return (res);
}
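
/*
 * Example (illustrative sketch, not part of the original source): a DMU
 * consumer can attach per-buffer state with the user-data hooks above.
 * dmu_buf_update_user() only installs the new pointer when the current
 * value matches old_user_ptr and otherwise returns the existing pointer,
 * so concurrent initializers can race safely.  "my_state_t", "ms",
 * "ms_data" and "my_evict_cb" below are hypothetical names:
 *
 *	my_state_t *ms = kmem_zalloc(sizeof (*ms), KM_SLEEP);
 *	my_state_t *winner = dmu_buf_update_user(db, NULL, ms,
 *	    &ms->ms_data, my_evict_cb);
 *	if (winner != NULL) {
 *		kmem_free(ms, sizeof (*ms));	// lost the race
 *		ms = winner;
 *	}
 *
 * Here ms_data is a void * that tracks db->db.db_data (kept current by
 * dbuf_update_data()), and my_evict_cb() is invoked from
 * dbuf_evict_user() when the dbuf is evicted, which is when the state
 * should be torn down.
 */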

blkptr_t *
dmu_buf_get_blkptr(dmu_buf_t *db)
{
	dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db;
	return (dbi->db_blkptr);
}

static void
dbuf_check_blkptr(dnode_t *dn, dmu_buf_impl_t *db)
{
	/* ASSERT(dmu_tx_is_syncing(tx)) */
	ASSERT(MUTEX_HELD(&db->db_mtx));

	if (db->db_blkptr != NULL)
		return;

	if (db->db_blkid == DMU_SPILL_BLKID) {
		db->db_blkptr = &dn->dn_phys->dn_spill;
		BP_ZERO(db->db_blkptr);
		return;
	}
	if (db->db_level == dn->dn_phys->dn_nlevels-1) {
		/*
		 * This buffer was allocated at a time when there were
		 * no blkptrs available from the dnode, or it was
		 * inappropriate to hook it in (i.e., nlevels mismatch).
		 */
		ASSERT(db->db_blkid < dn->dn_phys->dn_nblkptr);
		ASSERT(db->db_parent == NULL);
		db->db_parent = dn->dn_dbuf;
		db->db_blkptr = &dn->dn_phys->dn_blkptr[db->db_blkid];
		DBUF_VERIFY(db);
	} else {
		dmu_buf_impl_t *parent = db->db_parent;
		int epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;

		ASSERT(dn->dn_phys->dn_nlevels > 1);
		if (parent == NULL) {
			mutex_exit(&db->db_mtx);
			rw_enter(&dn->dn_struct_rwlock, RW_READER);
			(void) dbuf_hold_impl(dn, db->db_level+1,
			    db->db_blkid >> epbs, FALSE, db, &parent);
			rw_exit(&dn->dn_struct_rwlock);
			mutex_enter(&db->db_mtx);
			db->db_parent = parent;
		}
		db->db_blkptr = (blkptr_t *)parent->db.db_data +
		    (db->db_blkid & ((1ULL << epbs) - 1));
		DBUF_VERIFY(db);
	}
}

static void
dbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
{
	dmu_buf_impl_t *db = dr->dr_dbuf;
	dnode_t *dn;
	zio_t *zio;

	ASSERT(dmu_tx_is_syncing(tx));

	dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr);

	mutex_enter(&db->db_mtx);

	ASSERT(db->db_level > 0);
	DBUF_VERIFY(db);

	/* Read the block if it hasn't been read yet. */
	if (db->db_buf == NULL) {
		mutex_exit(&db->db_mtx);
		(void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED);
		mutex_enter(&db->db_mtx);
	}
	ASSERT3U(db->db_state, ==, DB_CACHED);
	ASSERT(db->db_buf != NULL);

	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);
	/* Indirect block size must match what the dnode thinks it is. */
	ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
	dbuf_check_blkptr(dn, db);
	DB_DNODE_EXIT(db);

	/* Provide the pending dirty record to child dbufs */
	db->db_data_pending = dr;

	mutex_exit(&db->db_mtx);
	dbuf_write(dr, db->db_buf, tx);

	zio = dr->dr_zio;
	mutex_enter(&dr->dt.di.dr_mtx);
	dbuf_sync_list(&dr->dt.di.dr_children, tx);
	ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
	mutex_exit(&dr->dt.di.dr_mtx);
	zio_nowait(zio);
}
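
/*
 * Worked example (illustrative, not part of the original source) of the
 * indirect-block indexing used in dbuf_check_blkptr() above.  With the
 * common dn_indblkshift of 14 (16K indirect blocks) and SPA_BLKPTRSHIFT
 * of 7 (128-byte block pointers):
 *
 *	epbs = 14 - 7 = 7, so each indirect block holds 2^7 = 128 blkptrs.
 *
 * A level-0 block with blkid 300 therefore lives under the level-1
 * indirect with blkid 300 >> 7 = 2, in slot 300 & 127 = 44 of that
 * indirect's blkptr array.
 */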

static void
dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
{
	arc_buf_t **datap = &dr->dt.dl.dr_data;
	dmu_buf_impl_t *db = dr->dr_dbuf;
	dnode_t *dn;
	objset_t *os;
	uint64_t txg = tx->tx_txg;

	ASSERT(dmu_tx_is_syncing(tx));

	dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr);

	mutex_enter(&db->db_mtx);
	/*
	 * To be synced, we must be dirtied.  But we might have been
	 * freed after being dirtied.
	 */
	if (db->db_state == DB_UNCACHED) {
		/* This buffer has been freed since it was dirtied */
		ASSERT(db->db.db_data == NULL);
	} else if (db->db_state == DB_FILL) {
		/* This buffer was freed and is now being re-filled */
		ASSERT(db->db.db_data != dr->dt.dl.dr_data);
	} else {
		ASSERT(db->db_state == DB_CACHED || db->db_state == DB_NOFILL);
	}
	DBUF_VERIFY(db);

	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);

	if (db->db_blkid == DMU_SPILL_BLKID) {
		mutex_enter(&dn->dn_mtx);
		dn->dn_phys->dn_flags |= DNODE_FLAG_SPILL_BLKPTR;
		mutex_exit(&dn->dn_mtx);
	}

	/*
	 * If this is a bonus buffer, simply copy the bonus data into the
	 * dnode.  It will be written out when the dnode is synced (and it
	 * will be synced, since it must have been dirty for dbuf_sync to
	 * be called).
	 */
	if (db->db_blkid == DMU_BONUS_BLKID) {
		dbuf_dirty_record_t **drp;

		ASSERT(*datap != NULL);
		ASSERT0(db->db_level);
		ASSERT3U(dn->dn_phys->dn_bonuslen, <=, DN_MAX_BONUSLEN);
		bcopy(*datap, DN_BONUS(dn->dn_phys), dn->dn_phys->dn_bonuslen);
		DB_DNODE_EXIT(db);

		if (*datap != db->db.db_data) {
			zio_buf_free(*datap, DN_MAX_BONUSLEN);
			arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
		}
		db->db_data_pending = NULL;
		drp = &db->db_last_dirty;
		while (*drp != dr)
			drp = &(*drp)->dr_next;
		ASSERT(dr->dr_next == NULL);
		ASSERT(dr->dr_dbuf == db);
		*drp = dr->dr_next;
		kmem_free(dr, sizeof (dbuf_dirty_record_t));
		ASSERT(db->db_dirtycnt > 0);
		db->db_dirtycnt -= 1;
		dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg);
		return;
	}

	os = dn->dn_objset;

	/*
	 * This function may have dropped the db_mtx lock allowing a dmu_sync
	 * operation to sneak in.  As a result, we need to ensure that we
	 * don't check the dr_override_state until we have returned from
	 * dbuf_check_blkptr.
	 */
	dbuf_check_blkptr(dn, db);

	/*
	 * If this buffer is in the middle of an immediate write,
	 * wait for the synchronous IO to complete.
	 */
	while (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC) {
		ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT);
		cv_wait(&db->db_changed, &db->db_mtx);
		ASSERT(dr->dt.dl.dr_override_state != DR_NOT_OVERRIDDEN);
	}

	if (db->db_state != DB_NOFILL &&
	    dn->dn_object != DMU_META_DNODE_OBJECT &&
	    refcount_count(&db->db_holds) > 1 &&
	    dr->dt.dl.dr_override_state != DR_OVERRIDDEN &&
	    *datap == db->db_buf) {
		/*
		 * If this buffer is currently "in use" (i.e., there
		 * are active holds and db_data still references it),
		 * then make a copy before we start the write so that
		 * any modifications from the open txg will not leak
		 * into this write.
		 *
		 * NOTE: this copy does not need to be made for
		 * objects only modified in the syncing context (e.g.
		 * DMU_OT_DNODE blocks).
		 */
		int blksz = arc_buf_size(*datap);
		arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
		*datap = arc_buf_alloc(os->os_spa, blksz, db, type);
		bcopy(db->db.db_data, (*datap)->b_data, blksz);
	}
	db->db_data_pending = dr;

	mutex_exit(&db->db_mtx);

	dbuf_write(dr, *datap, tx);

	ASSERT(!list_link_active(&dr->dr_dirty_node));
	if (dn->dn_object == DMU_META_DNODE_OBJECT) {
		list_insert_tail(&dn->dn_dirty_records[txg&TXG_MASK], dr);
		DB_DNODE_EXIT(db);
	} else {
		/*
		 * Although zio_nowait() does not "wait for an IO", it does
		 * initiate the IO.  If this is an empty write, the IO may
		 * even complete before zio_nowait() returns, so we need to
		 * DB_DNODE_EXIT() first in case zio_nowait() invalidates
		 * the dbuf.
		 */
		DB_DNODE_EXIT(db);
		zio_nowait(dr->dr_zio);
	}
}
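
/*
 * Aside (illustrative sketch, not part of the original source): the bonus
 * case above removes the dirty record with the classic pointer-to-pointer
 * walk, which unlinks a node from a singly linked list without tracking a
 * separate "previous" node.  In generic form:
 *
 *	node_t **npp = &head;
 *	while (*npp != target)
 *		npp = &(*npp)->next;
 *	*npp = target->next;		// works for head and interior nodes
 *
 * db_last_dirty plays the role of "head" and dr_next the role of "next";
 * node_t, head and target above are hypothetical names.
 */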

void
dbuf_sync_list(list_t *list, dmu_tx_t *tx)
{
	dbuf_dirty_record_t *dr;

	while (dr = list_head(list)) {
		if (dr->dr_zio != NULL) {
			/*
			 * If we find an already initialized zio then we
			 * are processing the meta-dnode, and we have finished.
			 * The dbufs for all dnodes are put back on the list
			 * during processing, so that we can zio_wait()
			 * these IOs after initiating all child IOs.
			 */
			ASSERT3U(dr->dr_dbuf->db.db_object, ==,
			    DMU_META_DNODE_OBJECT);
			break;
		}
		list_remove(list, dr);
		if (dr->dr_dbuf->db_level > 0)
			dbuf_sync_indirect(dr, tx);
		else
			dbuf_sync_leaf(dr, tx);
	}
}

/* ARGSUSED */
static void
dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
{
	dmu_buf_impl_t *db = vdb;
	dnode_t *dn;
	blkptr_t *bp = zio->io_bp;
	blkptr_t *bp_orig = &zio->io_bp_orig;
	spa_t *spa = zio->io_spa;
	int64_t delta;
	uint64_t fill = 0;
	int i;

	ASSERT(db->db_blkptr == bp);

	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);
	delta = bp_get_dsize_sync(spa, bp) - bp_get_dsize_sync(spa, bp_orig);
	dnode_diduse_space(dn, delta - zio->io_prev_space_delta);
	zio->io_prev_space_delta = delta;

	if (bp->blk_birth != 0) {
		ASSERT((db->db_blkid != DMU_SPILL_BLKID &&
		    BP_GET_TYPE(bp) == dn->dn_type) ||
		    (db->db_blkid == DMU_SPILL_BLKID &&
		    BP_GET_TYPE(bp) == dn->dn_bonustype));
		ASSERT(BP_GET_LEVEL(bp) == db->db_level);
	}

	mutex_enter(&db->db_mtx);

#ifdef ZFS_DEBUG
	if (db->db_blkid == DMU_SPILL_BLKID) {
		ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR);
		ASSERT(!(BP_IS_HOLE(db->db_blkptr)) &&
		    db->db_blkptr == &dn->dn_phys->dn_spill);
	}
#endif

	if (db->db_level == 0) {
		mutex_enter(&dn->dn_mtx);
		if (db->db_blkid > dn->dn_phys->dn_maxblkid &&
		    db->db_blkid != DMU_SPILL_BLKID)
			dn->dn_phys->dn_maxblkid = db->db_blkid;
		mutex_exit(&dn->dn_mtx);

		if (dn->dn_type == DMU_OT_DNODE) {
			dnode_phys_t *dnp = db->db.db_data;
			for (i = db->db.db_size >> DNODE_SHIFT; i > 0;
			    i--, dnp++) {
				if (dnp->dn_type != DMU_OT_NONE)
					fill++;
			}
		} else {
			if (BP_IS_HOLE(bp)) {
				fill = 0;
			} else {
				fill = 1;
			}
		}
	} else {
		blkptr_t *ibp = db->db.db_data;
		ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
		for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, ibp++) {
			if (BP_IS_HOLE(ibp))
				continue;
			fill += ibp->blk_fill;
		}
	}
	DB_DNODE_EXIT(db);

	bp->blk_fill = fill;

	mutex_exit(&db->db_mtx);
}
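
/*
 * Worked example (illustrative, not part of the original source) of the
 * blk_fill accounting above.  A level-0 data block contributes a fill of
 * 1 (or 0 if it is a hole); a level-0 DMU_OT_DNODE block contributes the
 * number of allocated dnodes it contains; and an indirect block sums the
 * fill of its non-hole children.  Assuming 512-byte dnodes in 16K blocks,
 * a 16K level-1 indirect over three fully populated dnode blocks
 * (32 dnodes each) and 125 holes would get blk_fill = 3 * 32 = 96.
 */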

/*
 * The SPA will call this callback several times for each zio - once
 * for every physical child i/o (zio->io_phys_children times).  This
 * allows the DMU to monitor the progress of each logical i/o.  For example,
 * there may be 2 copies of an indirect block, or many fragments of a RAID-Z
 * block.  There may be a long delay before all copies/fragments are
 * completed, so this callback allows us to retire dirty space gradually,
 * as the physical i/os complete.
 */
/* ARGSUSED */
static void
dbuf_write_physdone(zio_t *zio, arc_buf_t *buf, void *arg)
{
	dmu_buf_impl_t *db = arg;
	objset_t *os = db->db_objset;
	dsl_pool_t *dp = dmu_objset_pool(os);
	dbuf_dirty_record_t *dr;
	int delta = 0;

	dr = db->db_data_pending;
	ASSERT3U(dr->dr_txg, ==, zio->io_txg);

	/*
	 * The callback will be called io_phys_children times.  Retire one
	 * portion of our dirty space each time we are called.  Any rounding
	 * error will be cleaned up by dsl_pool_sync()'s call to
	 * dsl_pool_undirty_space().
	 */
	delta = dr->dr_accounted / zio->io_phys_children;
	dsl_pool_undirty_space(dp, delta, zio->io_txg);
}
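
/*
 * Worked example (illustrative, not part of the original source): if a
 * dirty record accounted for 131072 bytes (dr_accounted) and the logical
 * zio has 3 physical children (e.g. three copies of a metadata block),
 * each physdone callback retires 131072 / 3 = 43690 bytes.  After all
 * three callbacks, 131070 bytes have been undirtied; the 2-byte rounding
 * remainder is cleaned up by dsl_pool_sync()'s final call to
 * dsl_pool_undirty_space(), as noted above.
 */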

/* ARGSUSED */
static void
dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
{
	dmu_buf_impl_t *db = vdb;
	blkptr_t *bp_orig = &zio->io_bp_orig;
	blkptr_t *bp = db->db_blkptr;
	objset_t *os = db->db_objset;
	dmu_tx_t *tx = os->os_synctx;
	dbuf_dirty_record_t **drp, *dr;

	ASSERT0(zio->io_error);
	ASSERT(db->db_blkptr == bp);

	/*
	 * For nopwrites and rewrites we ensure that the bp matches our
	 * original and bypass all the accounting.
	 */
	if (zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE)) {
		ASSERT(BP_EQUAL(bp, bp_orig));
	} else {
		dsl_dataset_t *ds = os->os_dsl_dataset;
		(void) dsl_dataset_block_kill(ds, bp_orig, tx, B_TRUE);
		dsl_dataset_block_born(ds, bp, tx);
	}

	mutex_enter(&db->db_mtx);

	DBUF_VERIFY(db);

	drp = &db->db_last_dirty;
	while ((dr = *drp) != db->db_data_pending)
		drp = &dr->dr_next;
	ASSERT(!list_link_active(&dr->dr_dirty_node));
	ASSERT(dr->dr_dbuf == db);
	ASSERT(dr->dr_next == NULL);
	*drp = dr->dr_next;

#ifdef ZFS_DEBUG
	if (db->db_blkid == DMU_SPILL_BLKID) {
		dnode_t *dn;

		DB_DNODE_ENTER(db);
		dn = DB_DNODE(db);
		ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR);
		ASSERT(!(BP_IS_HOLE(db->db_blkptr)) &&
		    db->db_blkptr == &dn->dn_phys->dn_spill);
		DB_DNODE_EXIT(db);
	}
#endif

	if (db->db_level == 0) {
		ASSERT(db->db_blkid != DMU_BONUS_BLKID);
		ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN);
		if (db->db_state != DB_NOFILL) {
			if (dr->dt.dl.dr_data != db->db_buf)
				VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data,
				    db));
			else if (!arc_released(db->db_buf))
				arc_set_callback(db->db_buf, dbuf_do_evict, db);
		}
	} else {
		dnode_t *dn;

		DB_DNODE_ENTER(db);
		dn = DB_DNODE(db);
		ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
		ASSERT3U(db->db.db_size, ==, 1 << dn->dn_phys->dn_indblkshift);
		if (!BP_IS_HOLE(db->db_blkptr)) {
			int epbs =
			    dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
			ASSERT3U(db->db_blkid, <=,
			    dn->dn_phys->dn_maxblkid >> (db->db_level * epbs));
			ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==,
			    db->db.db_size);
			arc_set_callback(db->db_buf, dbuf_do_evict, db);
		}
		DB_DNODE_EXIT(db);
		mutex_destroy(&dr->dt.di.dr_mtx);
		list_destroy(&dr->dt.di.dr_children);
	}
	kmem_free(dr, sizeof (dbuf_dirty_record_t));

	cv_broadcast(&db->db_changed);
	ASSERT(db->db_dirtycnt > 0);
	db->db_dirtycnt -= 1;
	db->db_data_pending = NULL;
	dbuf_rele_and_unlock(db, (void *)(uintptr_t)tx->tx_txg);
}

static void
dbuf_write_nofill_ready(zio_t *zio)
{
	dbuf_write_ready(zio, NULL, zio->io_private);
}

static void
dbuf_write_nofill_done(zio_t *zio)
{
	dbuf_write_done(zio, NULL, zio->io_private);
}

static void
dbuf_write_override_ready(zio_t *zio)
{
	dbuf_dirty_record_t *dr = zio->io_private;
	dmu_buf_impl_t *db = dr->dr_dbuf;

	dbuf_write_ready(zio, NULL, db);
}

static void
dbuf_write_override_done(zio_t *zio)
{
	dbuf_dirty_record_t *dr = zio->io_private;
	dmu_buf_impl_t *db = dr->dr_dbuf;
	blkptr_t *obp = &dr->dt.dl.dr_overridden_by;

	mutex_enter(&db->db_mtx);
	if (!BP_EQUAL(zio->io_bp, obp)) {
		if (!BP_IS_HOLE(obp))
			dsl_free(spa_get_dsl(zio->io_spa), zio->io_txg, obp);
		arc_release(dr->dt.dl.dr_data, db);
	}
	mutex_exit(&db->db_mtx);

	dbuf_write_done(zio, NULL, db);
}

/* Issue I/O to commit a dirty buffer to disk. */
static void
dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
{
	dmu_buf_impl_t *db = dr->dr_dbuf;
	dnode_t *dn;
	objset_t *os;
	dmu_buf_impl_t *parent = db->db_parent;
	uint64_t txg = tx->tx_txg;
	zbookmark_t zb;
	zio_prop_t zp;
	zio_t *zio;
	int wp_flag = 0;

	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);
	os = dn->dn_objset;

	if (db->db_state != DB_NOFILL) {
		if (db->db_level > 0 || dn->dn_type == DMU_OT_DNODE) {
			/*
			 * Private object buffers are released here rather
			 * than in dbuf_dirty() since they are only modified
			 * in the syncing context and we don't want the
			 * overhead of making multiple copies of the data.
			 */
			if (BP_IS_HOLE(db->db_blkptr)) {
				arc_buf_thaw(data);
			} else {
				dbuf_release_bp(db);
			}
		}
	}

	if (parent != dn->dn_dbuf) {
		/* Our parent is an indirect block. */
		/* We have a dirty parent that has been scheduled for write. */
		ASSERT(parent && parent->db_data_pending);
		/* Our parent's buffer is one level closer to the dnode. */
		ASSERT(db->db_level == parent->db_level-1);
		/*
		 * We're about to modify our parent's db_data by modifying
		 * our block pointer, so the parent must be released.
		 */
		ASSERT(arc_released(parent->db_buf));
		zio = parent->db_data_pending->dr_zio;
	} else {
		/* Our parent is the dnode itself. */
		ASSERT((db->db_level == dn->dn_phys->dn_nlevels-1 &&
		    db->db_blkid != DMU_SPILL_BLKID) ||
		    (db->db_blkid == DMU_SPILL_BLKID && db->db_level == 0));
		if (db->db_blkid != DMU_SPILL_BLKID)
			ASSERT3P(db->db_blkptr, ==,
			    &dn->dn_phys->dn_blkptr[db->db_blkid]);
		zio = dn->dn_zio;
	}

	ASSERT(db->db_level == 0 || data == db->db_buf);
	ASSERT3U(db->db_blkptr->blk_birth, <=, txg);
	ASSERT(zio);

	SET_BOOKMARK(&zb, os->os_dsl_dataset ?
	    os->os_dsl_dataset->ds_object : DMU_META_OBJSET,
	    db->db.db_object, db->db_level, db->db_blkid);

	if (db->db_blkid == DMU_SPILL_BLKID)
		wp_flag = WP_SPILL;
	wp_flag |= (db->db_state == DB_NOFILL) ? WP_NOFILL : 0;

	dmu_write_policy(os, dn, db->db_level, wp_flag, &zp);
	DB_DNODE_EXIT(db);

	if (db->db_level == 0 && dr->dt.dl.dr_override_state == DR_OVERRIDDEN) {
		ASSERT(db->db_state != DB_NOFILL);
		dr->dr_zio = zio_write(zio, os->os_spa, txg,
		    db->db_blkptr, data->b_data, arc_buf_size(data), &zp,
		    dbuf_write_override_ready, NULL, dbuf_write_override_done,
		    dr, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
		mutex_enter(&db->db_mtx);
		dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
		zio_write_override(dr->dr_zio, &dr->dt.dl.dr_overridden_by,
		    dr->dt.dl.dr_copies, dr->dt.dl.dr_nopwrite);
		mutex_exit(&db->db_mtx);
	} else if (db->db_state == DB_NOFILL) {
		ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF ||
		    zp.zp_checksum == ZIO_CHECKSUM_NOPARITY);
		dr->dr_zio = zio_write(zio, os->os_spa, txg,
		    db->db_blkptr, NULL, db->db.db_size, &zp,
		    dbuf_write_nofill_ready, NULL, dbuf_write_nofill_done, db,
		    ZIO_PRIORITY_ASYNC_WRITE,
		    ZIO_FLAG_MUSTSUCCEED | ZIO_FLAG_NODATA, &zb);
	} else {
		ASSERT(arc_released(data));
		dr->dr_zio = arc_write(zio, os->os_spa, txg,
		    db->db_blkptr, data, DBUF_IS_L2CACHEABLE(db),
		    DBUF_IS_L2COMPRESSIBLE(db), &zp, dbuf_write_ready,
		    dbuf_write_physdone, dbuf_write_done, db,
		    ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
	}
}
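
/*
 * Summary (illustrative, not part of the original source) of how
 * dbuf_write() dispatches the actual I/O:
 *
 *	if (override)        zio_write() of the caller-supplied data, then
 *	                     zio_write_override() to install the block
 *	                     pointer that dmu_sync() already wrote
 *	                     (dr_overridden_by);
 *	else if (DB_NOFILL)  zio_write() with no data (ZIO_FLAG_NODATA),
 *	                     which only updates block pointers;
 *	else                 arc_write(), letting the ARC own the buffer and
 *	                     drive dbuf_write_ready()/physdone()/done().
 *
 * In every case dr->dr_zio is created as a child of either the parent
 * indirect block's zio or dn_zio, so a parent's I/O completes only after
 * its children's.
 */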