1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 23 * Copyright 2011 Nexenta Systems, Inc. All rights reserved. 24 * Copyright (c) 2012 by Delphix. All rights reserved. 25 */ 26 27 #include <sys/zfs_context.h> 28 #include <sys/dmu.h> 29 #include <sys/dmu_impl.h> 30 #include <sys/dbuf.h> 31 #include <sys/dmu_objset.h> 32 #include <sys/dsl_dataset.h> 33 #include <sys/dsl_dir.h> 34 #include <sys/dmu_tx.h> 35 #include <sys/spa.h> 36 #include <sys/zio.h> 37 #include <sys/dmu_zfetch.h> 38 #include <sys/sa.h> 39 #include <sys/sa_impl.h> 40 41 static void dbuf_destroy(dmu_buf_impl_t *db); 42 static int dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx); 43 static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx); 44 45 /* 46 * Global data structures and functions for the dbuf cache. 47 */ 48 static kmem_cache_t *dbuf_cache; 49 50 /* ARGSUSED */ 51 static int 52 dbuf_cons(void *vdb, void *unused, int kmflag) 53 { 54 dmu_buf_impl_t *db = vdb; 55 bzero(db, sizeof (dmu_buf_impl_t)); 56 57 mutex_init(&db->db_mtx, NULL, MUTEX_DEFAULT, NULL); 58 cv_init(&db->db_changed, NULL, CV_DEFAULT, NULL); 59 refcount_create(&db->db_holds); 60 return (0); 61 } 62 63 /* ARGSUSED */ 64 static void 65 dbuf_dest(void *vdb, void *unused) 66 { 67 dmu_buf_impl_t *db = vdb; 68 mutex_destroy(&db->db_mtx); 69 cv_destroy(&db->db_changed); 70 refcount_destroy(&db->db_holds); 71 } 72 73 /* 74 * dbuf hash table routines 75 */ 76 static dbuf_hash_table_t dbuf_hash_table; 77 78 static uint64_t dbuf_hash_count; 79 80 static uint64_t 81 dbuf_hash(void *os, uint64_t obj, uint8_t lvl, uint64_t blkid) 82 { 83 uintptr_t osv = (uintptr_t)os; 84 uint64_t crc = -1ULL; 85 86 ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY); 87 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (lvl)) & 0xFF]; 88 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (osv >> 6)) & 0xFF]; 89 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 0)) & 0xFF]; 90 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 8)) & 0xFF]; 91 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 0)) & 0xFF]; 92 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 8)) & 0xFF]; 93 94 crc ^= (osv>>14) ^ (obj>>16) ^ (blkid>>16); 95 96 return (crc); 97 } 98 99 #define DBUF_HASH(os, obj, level, blkid) dbuf_hash(os, obj, level, blkid); 100 101 #define DBUF_EQUAL(dbuf, os, obj, level, blkid) \ 102 ((dbuf)->db.db_object == (obj) && \ 103 (dbuf)->db_objset == (os) && \ 104 (dbuf)->db_level == (level) && \ 105 (dbuf)->db_blkid == (blkid)) 106 107 dmu_buf_impl_t * 108 dbuf_find(dnode_t *dn, uint8_t level, uint64_t blkid) 109 { 110 dbuf_hash_table_t *h = &dbuf_hash_table; 111 objset_t *os = dn->dn_objset; 112 uint64_t obj = 
dn->dn_object;
	uint64_t hv = DBUF_HASH(os, obj, level, blkid);
	uint64_t idx = hv & h->hash_table_mask;
	dmu_buf_impl_t *db;

	mutex_enter(DBUF_HASH_MUTEX(h, idx));
	for (db = h->hash_table[idx]; db != NULL; db = db->db_hash_next) {
		if (DBUF_EQUAL(db, os, obj, level, blkid)) {
			mutex_enter(&db->db_mtx);
			if (db->db_state != DB_EVICTING) {
				mutex_exit(DBUF_HASH_MUTEX(h, idx));
				return (db);
			}
			mutex_exit(&db->db_mtx);
		}
	}
	mutex_exit(DBUF_HASH_MUTEX(h, idx));
	return (NULL);
}

/*
 * Insert an entry into the hash table.  If there is already an element
 * equal to elem in the hash table, then the already existing element
 * will be returned and the new element will not be inserted.
 * Otherwise returns NULL.
 */
static dmu_buf_impl_t *
dbuf_hash_insert(dmu_buf_impl_t *db)
{
	dbuf_hash_table_t *h = &dbuf_hash_table;
	objset_t *os = db->db_objset;
	uint64_t obj = db->db.db_object;
	int level = db->db_level;
	uint64_t blkid = db->db_blkid;
	uint64_t hv = DBUF_HASH(os, obj, level, blkid);
	uint64_t idx = hv & h->hash_table_mask;
	dmu_buf_impl_t *dbf;

	mutex_enter(DBUF_HASH_MUTEX(h, idx));
	for (dbf = h->hash_table[idx]; dbf != NULL; dbf = dbf->db_hash_next) {
		if (DBUF_EQUAL(dbf, os, obj, level, blkid)) {
			mutex_enter(&dbf->db_mtx);
			if (dbf->db_state != DB_EVICTING) {
				mutex_exit(DBUF_HASH_MUTEX(h, idx));
				return (dbf);
			}
			mutex_exit(&dbf->db_mtx);
		}
	}

	mutex_enter(&db->db_mtx);
	db->db_hash_next = h->hash_table[idx];
	h->hash_table[idx] = db;
	mutex_exit(DBUF_HASH_MUTEX(h, idx));
	atomic_add_64(&dbuf_hash_count, 1);

	return (NULL);
}

/*
 * Remove an entry from the hash table.  This operation will
 * fail if there are any existing holds on the db.
 */
static void
dbuf_hash_remove(dmu_buf_impl_t *db)
{
	dbuf_hash_table_t *h = &dbuf_hash_table;
	uint64_t hv = DBUF_HASH(db->db_objset, db->db.db_object,
	    db->db_level, db->db_blkid);
	uint64_t idx = hv & h->hash_table_mask;
	dmu_buf_impl_t *dbf, **dbp;

	/*
	 * We mustn't hold db_mtx to maintain lock ordering:
	 * DBUF_HASH_MUTEX > db_mtx.
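	 * (That is, DBUF_HASH_MUTEX must be acquired before db_mtx,
	 * so this function must be entered without db_mtx held.)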
187 */ 188 ASSERT(refcount_is_zero(&db->db_holds)); 189 ASSERT(db->db_state == DB_EVICTING); 190 ASSERT(!MUTEX_HELD(&db->db_mtx)); 191 192 mutex_enter(DBUF_HASH_MUTEX(h, idx)); 193 dbp = &h->hash_table[idx]; 194 while ((dbf = *dbp) != db) { 195 dbp = &dbf->db_hash_next; 196 ASSERT(dbf != NULL); 197 } 198 *dbp = db->db_hash_next; 199 db->db_hash_next = NULL; 200 mutex_exit(DBUF_HASH_MUTEX(h, idx)); 201 atomic_add_64(&dbuf_hash_count, -1); 202 } 203 204 static arc_evict_func_t dbuf_do_evict; 205 206 static void 207 dbuf_evict_user(dmu_buf_impl_t *db) 208 { 209 ASSERT(MUTEX_HELD(&db->db_mtx)); 210 211 if (db->db_level != 0 || db->db_evict_func == NULL) 212 return; 213 214 if (db->db_user_data_ptr_ptr) 215 *db->db_user_data_ptr_ptr = db->db.db_data; 216 db->db_evict_func(&db->db, db->db_user_ptr); 217 db->db_user_ptr = NULL; 218 db->db_user_data_ptr_ptr = NULL; 219 db->db_evict_func = NULL; 220 } 221 222 boolean_t 223 dbuf_is_metadata(dmu_buf_impl_t *db) 224 { 225 if (db->db_level > 0) { 226 return (B_TRUE); 227 } else { 228 boolean_t is_metadata; 229 230 DB_DNODE_ENTER(db); 231 is_metadata = DMU_OT_IS_METADATA(DB_DNODE(db)->dn_type); 232 DB_DNODE_EXIT(db); 233 234 return (is_metadata); 235 } 236 } 237 238 void 239 dbuf_evict(dmu_buf_impl_t *db) 240 { 241 ASSERT(MUTEX_HELD(&db->db_mtx)); 242 ASSERT(db->db_buf == NULL); 243 ASSERT(db->db_data_pending == NULL); 244 245 dbuf_clear(db); 246 dbuf_destroy(db); 247 } 248 249 void 250 dbuf_init(void) 251 { 252 uint64_t hsize = 1ULL << 16; 253 dbuf_hash_table_t *h = &dbuf_hash_table; 254 int i; 255 256 /* 257 * The hash table is big enough to fill all of physical memory 258 * with an average 4K block size. The table will take up 259 * totalmem*sizeof(void*)/4K (i.e. 2MB/GB with 8-byte pointers). 260 */ 261 while (hsize * 4096 < physmem * PAGESIZE) 262 hsize <<= 1; 263 264 retry: 265 h->hash_table_mask = hsize - 1; 266 h->hash_table = kmem_zalloc(hsize * sizeof (void *), KM_NOSLEEP); 267 if (h->hash_table == NULL) { 268 /* XXX - we should really return an error instead of assert */ 269 ASSERT(hsize > (1ULL << 10)); 270 hsize >>= 1; 271 goto retry; 272 } 273 274 dbuf_cache = kmem_cache_create("dmu_buf_impl_t", 275 sizeof (dmu_buf_impl_t), 276 0, dbuf_cons, dbuf_dest, NULL, NULL, NULL, 0); 277 278 for (i = 0; i < DBUF_MUTEXES; i++) 279 mutex_init(&h->hash_mutexes[i], NULL, MUTEX_DEFAULT, NULL); 280 } 281 282 void 283 dbuf_fini(void) 284 { 285 dbuf_hash_table_t *h = &dbuf_hash_table; 286 int i; 287 288 for (i = 0; i < DBUF_MUTEXES; i++) 289 mutex_destroy(&h->hash_mutexes[i]); 290 kmem_free(h->hash_table, (h->hash_table_mask + 1) * sizeof (void *)); 291 kmem_cache_destroy(dbuf_cache); 292 } 293 294 /* 295 * Other stuff. 
296 */ 297 298 #ifdef ZFS_DEBUG 299 static void 300 dbuf_verify(dmu_buf_impl_t *db) 301 { 302 dnode_t *dn; 303 dbuf_dirty_record_t *dr; 304 305 ASSERT(MUTEX_HELD(&db->db_mtx)); 306 307 if (!(zfs_flags & ZFS_DEBUG_DBUF_VERIFY)) 308 return; 309 310 ASSERT(db->db_objset != NULL); 311 DB_DNODE_ENTER(db); 312 dn = DB_DNODE(db); 313 if (dn == NULL) { 314 ASSERT(db->db_parent == NULL); 315 ASSERT(db->db_blkptr == NULL); 316 } else { 317 ASSERT3U(db->db.db_object, ==, dn->dn_object); 318 ASSERT3P(db->db_objset, ==, dn->dn_objset); 319 ASSERT3U(db->db_level, <, dn->dn_nlevels); 320 ASSERT(db->db_blkid == DMU_BONUS_BLKID || 321 db->db_blkid == DMU_SPILL_BLKID || 322 !list_is_empty(&dn->dn_dbufs)); 323 } 324 if (db->db_blkid == DMU_BONUS_BLKID) { 325 ASSERT(dn != NULL); 326 ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen); 327 ASSERT3U(db->db.db_offset, ==, DMU_BONUS_BLKID); 328 } else if (db->db_blkid == DMU_SPILL_BLKID) { 329 ASSERT(dn != NULL); 330 ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen); 331 ASSERT0(db->db.db_offset); 332 } else { 333 ASSERT3U(db->db.db_offset, ==, db->db_blkid * db->db.db_size); 334 } 335 336 for (dr = db->db_data_pending; dr != NULL; dr = dr->dr_next) 337 ASSERT(dr->dr_dbuf == db); 338 339 for (dr = db->db_last_dirty; dr != NULL; dr = dr->dr_next) 340 ASSERT(dr->dr_dbuf == db); 341 342 /* 343 * We can't assert that db_size matches dn_datablksz because it 344 * can be momentarily different when another thread is doing 345 * dnode_set_blksz(). 346 */ 347 if (db->db_level == 0 && db->db.db_object == DMU_META_DNODE_OBJECT) { 348 dr = db->db_data_pending; 349 /* 350 * It should only be modified in syncing context, so 351 * make sure we only have one copy of the data. 352 */ 353 ASSERT(dr == NULL || dr->dt.dl.dr_data == db->db_buf); 354 } 355 356 /* verify db->db_blkptr */ 357 if (db->db_blkptr) { 358 if (db->db_parent == dn->dn_dbuf) { 359 /* db is pointed to by the dnode */ 360 /* ASSERT3U(db->db_blkid, <, dn->dn_nblkptr); */ 361 if (DMU_OBJECT_IS_SPECIAL(db->db.db_object)) 362 ASSERT(db->db_parent == NULL); 363 else 364 ASSERT(db->db_parent != NULL); 365 if (db->db_blkid != DMU_SPILL_BLKID) 366 ASSERT3P(db->db_blkptr, ==, 367 &dn->dn_phys->dn_blkptr[db->db_blkid]); 368 } else { 369 /* db is pointed to by an indirect block */ 370 int epb = db->db_parent->db.db_size >> SPA_BLKPTRSHIFT; 371 ASSERT3U(db->db_parent->db_level, ==, db->db_level+1); 372 ASSERT3U(db->db_parent->db.db_object, ==, 373 db->db.db_object); 374 /* 375 * dnode_grow_indblksz() can make this fail if we don't 376 * have the struct_rwlock. XXX indblksz no longer 377 * grows. safe to do this now? 378 */ 379 if (RW_WRITE_HELD(&dn->dn_struct_rwlock)) { 380 ASSERT3P(db->db_blkptr, ==, 381 ((blkptr_t *)db->db_parent->db.db_data + 382 db->db_blkid % epb)); 383 } 384 } 385 } 386 if ((db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr)) && 387 (db->db_buf == NULL || db->db_buf->b_data) && 388 db->db.db_data && db->db_blkid != DMU_BONUS_BLKID && 389 db->db_state != DB_FILL && !dn->dn_free_txg) { 390 /* 391 * If the blkptr isn't set but they have nonzero data, 392 * it had better be dirty, otherwise we'll lose that 393 * data when we evict this buffer. 
394 */ 395 if (db->db_dirtycnt == 0) { 396 uint64_t *buf = db->db.db_data; 397 int i; 398 399 for (i = 0; i < db->db.db_size >> 3; i++) { 400 ASSERT(buf[i] == 0); 401 } 402 } 403 } 404 DB_DNODE_EXIT(db); 405 } 406 #endif 407 408 static void 409 dbuf_update_data(dmu_buf_impl_t *db) 410 { 411 ASSERT(MUTEX_HELD(&db->db_mtx)); 412 if (db->db_level == 0 && db->db_user_data_ptr_ptr) { 413 ASSERT(!refcount_is_zero(&db->db_holds)); 414 *db->db_user_data_ptr_ptr = db->db.db_data; 415 } 416 } 417 418 static void 419 dbuf_set_data(dmu_buf_impl_t *db, arc_buf_t *buf) 420 { 421 ASSERT(MUTEX_HELD(&db->db_mtx)); 422 ASSERT(db->db_buf == NULL || !arc_has_callback(db->db_buf)); 423 db->db_buf = buf; 424 if (buf != NULL) { 425 ASSERT(buf->b_data != NULL); 426 db->db.db_data = buf->b_data; 427 if (!arc_released(buf)) 428 arc_set_callback(buf, dbuf_do_evict, db); 429 dbuf_update_data(db); 430 } else { 431 dbuf_evict_user(db); 432 db->db.db_data = NULL; 433 if (db->db_state != DB_NOFILL) 434 db->db_state = DB_UNCACHED; 435 } 436 } 437 438 /* 439 * Loan out an arc_buf for read. Return the loaned arc_buf. 440 */ 441 arc_buf_t * 442 dbuf_loan_arcbuf(dmu_buf_impl_t *db) 443 { 444 arc_buf_t *abuf; 445 446 mutex_enter(&db->db_mtx); 447 if (arc_released(db->db_buf) || refcount_count(&db->db_holds) > 1) { 448 int blksz = db->db.db_size; 449 spa_t *spa; 450 451 mutex_exit(&db->db_mtx); 452 DB_GET_SPA(&spa, db); 453 abuf = arc_loan_buf(spa, blksz); 454 bcopy(db->db.db_data, abuf->b_data, blksz); 455 } else { 456 abuf = db->db_buf; 457 arc_loan_inuse_buf(abuf, db); 458 dbuf_set_data(db, NULL); 459 mutex_exit(&db->db_mtx); 460 } 461 return (abuf); 462 } 463 464 uint64_t 465 dbuf_whichblock(dnode_t *dn, uint64_t offset) 466 { 467 if (dn->dn_datablkshift) { 468 return (offset >> dn->dn_datablkshift); 469 } else { 470 ASSERT3U(offset, <, dn->dn_datablksz); 471 return (0); 472 } 473 } 474 475 static void 476 dbuf_read_done(zio_t *zio, arc_buf_t *buf, void *vdb) 477 { 478 dmu_buf_impl_t *db = vdb; 479 480 mutex_enter(&db->db_mtx); 481 ASSERT3U(db->db_state, ==, DB_READ); 482 /* 483 * All reads are synchronous, so we must have a hold on the dbuf 484 */ 485 ASSERT(refcount_count(&db->db_holds) > 0); 486 ASSERT(db->db_buf == NULL); 487 ASSERT(db->db.db_data == NULL); 488 if (db->db_level == 0 && db->db_freed_in_flight) { 489 /* we were freed in flight; disregard any error */ 490 arc_release(buf, db); 491 bzero(buf->b_data, db->db.db_size); 492 arc_buf_freeze(buf); 493 db->db_freed_in_flight = FALSE; 494 dbuf_set_data(db, buf); 495 db->db_state = DB_CACHED; 496 } else if (zio == NULL || zio->io_error == 0) { 497 dbuf_set_data(db, buf); 498 db->db_state = DB_CACHED; 499 } else { 500 ASSERT(db->db_blkid != DMU_BONUS_BLKID); 501 ASSERT3P(db->db_buf, ==, NULL); 502 VERIFY(arc_buf_remove_ref(buf, db) == 1); 503 db->db_state = DB_UNCACHED; 504 } 505 cv_broadcast(&db->db_changed); 506 dbuf_rele_and_unlock(db, NULL); 507 } 508 509 static void 510 dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags) 511 { 512 dnode_t *dn; 513 spa_t *spa; 514 zbookmark_t zb; 515 uint32_t aflags = ARC_NOWAIT; 516 arc_buf_t *pbuf; 517 518 DB_DNODE_ENTER(db); 519 dn = DB_DNODE(db); 520 ASSERT(!refcount_is_zero(&db->db_holds)); 521 /* We need the struct_rwlock to prevent db_blkptr from changing. 
*/ 522 ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); 523 ASSERT(MUTEX_HELD(&db->db_mtx)); 524 ASSERT(db->db_state == DB_UNCACHED); 525 ASSERT(db->db_buf == NULL); 526 527 if (db->db_blkid == DMU_BONUS_BLKID) { 528 int bonuslen = MIN(dn->dn_bonuslen, dn->dn_phys->dn_bonuslen); 529 530 ASSERT3U(bonuslen, <=, db->db.db_size); 531 db->db.db_data = zio_buf_alloc(DN_MAX_BONUSLEN); 532 arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER); 533 if (bonuslen < DN_MAX_BONUSLEN) 534 bzero(db->db.db_data, DN_MAX_BONUSLEN); 535 if (bonuslen) 536 bcopy(DN_BONUS(dn->dn_phys), db->db.db_data, bonuslen); 537 DB_DNODE_EXIT(db); 538 dbuf_update_data(db); 539 db->db_state = DB_CACHED; 540 mutex_exit(&db->db_mtx); 541 return; 542 } 543 544 /* 545 * Recheck BP_IS_HOLE() after dnode_block_freed() in case dnode_sync() 546 * processes the delete record and clears the bp while we are waiting 547 * for the dn_mtx (resulting in a "no" from block_freed). 548 */ 549 if (db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr) || 550 (db->db_level == 0 && (dnode_block_freed(dn, db->db_blkid) || 551 BP_IS_HOLE(db->db_blkptr)))) { 552 arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); 553 554 dbuf_set_data(db, arc_buf_alloc(dn->dn_objset->os_spa, 555 db->db.db_size, db, type)); 556 DB_DNODE_EXIT(db); 557 bzero(db->db.db_data, db->db.db_size); 558 db->db_state = DB_CACHED; 559 *flags |= DB_RF_CACHED; 560 mutex_exit(&db->db_mtx); 561 return; 562 } 563 564 spa = dn->dn_objset->os_spa; 565 DB_DNODE_EXIT(db); 566 567 db->db_state = DB_READ; 568 mutex_exit(&db->db_mtx); 569 570 if (DBUF_IS_L2CACHEABLE(db)) 571 aflags |= ARC_L2CACHE; 572 573 SET_BOOKMARK(&zb, db->db_objset->os_dsl_dataset ? 574 db->db_objset->os_dsl_dataset->ds_object : DMU_META_OBJSET, 575 db->db.db_object, db->db_level, db->db_blkid); 576 577 dbuf_add_ref(db, NULL); 578 /* ZIO_FLAG_CANFAIL callers have to check the parent zio's error */ 579 580 if (db->db_parent) 581 pbuf = db->db_parent->db_buf; 582 else 583 pbuf = db->db_objset->os_phys_buf; 584 585 (void) dsl_read(zio, spa, db->db_blkptr, pbuf, 586 dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ, 587 (*flags & DB_RF_CANFAIL) ? ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED, 588 &aflags, &zb); 589 if (aflags & ARC_CACHED) 590 *flags |= DB_RF_CACHED; 591 } 592 593 int 594 dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags) 595 { 596 int err = 0; 597 int havepzio = (zio != NULL); 598 int prefetch; 599 dnode_t *dn; 600 601 /* 602 * We don't have to hold the mutex to check db_state because it 603 * can't be freed while we have a hold on the buffer. 
604 */ 605 ASSERT(!refcount_is_zero(&db->db_holds)); 606 607 if (db->db_state == DB_NOFILL) 608 return (EIO); 609 610 DB_DNODE_ENTER(db); 611 dn = DB_DNODE(db); 612 if ((flags & DB_RF_HAVESTRUCT) == 0) 613 rw_enter(&dn->dn_struct_rwlock, RW_READER); 614 615 prefetch = db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID && 616 (flags & DB_RF_NOPREFETCH) == 0 && dn != NULL && 617 DBUF_IS_CACHEABLE(db); 618 619 mutex_enter(&db->db_mtx); 620 if (db->db_state == DB_CACHED) { 621 mutex_exit(&db->db_mtx); 622 if (prefetch) 623 dmu_zfetch(&dn->dn_zfetch, db->db.db_offset, 624 db->db.db_size, TRUE); 625 if ((flags & DB_RF_HAVESTRUCT) == 0) 626 rw_exit(&dn->dn_struct_rwlock); 627 DB_DNODE_EXIT(db); 628 } else if (db->db_state == DB_UNCACHED) { 629 spa_t *spa = dn->dn_objset->os_spa; 630 631 if (zio == NULL) 632 zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); 633 dbuf_read_impl(db, zio, &flags); 634 635 /* dbuf_read_impl has dropped db_mtx for us */ 636 637 if (prefetch) 638 dmu_zfetch(&dn->dn_zfetch, db->db.db_offset, 639 db->db.db_size, flags & DB_RF_CACHED); 640 641 if ((flags & DB_RF_HAVESTRUCT) == 0) 642 rw_exit(&dn->dn_struct_rwlock); 643 DB_DNODE_EXIT(db); 644 645 if (!havepzio) 646 err = zio_wait(zio); 647 } else { 648 mutex_exit(&db->db_mtx); 649 if (prefetch) 650 dmu_zfetch(&dn->dn_zfetch, db->db.db_offset, 651 db->db.db_size, TRUE); 652 if ((flags & DB_RF_HAVESTRUCT) == 0) 653 rw_exit(&dn->dn_struct_rwlock); 654 DB_DNODE_EXIT(db); 655 656 mutex_enter(&db->db_mtx); 657 if ((flags & DB_RF_NEVERWAIT) == 0) { 658 while (db->db_state == DB_READ || 659 db->db_state == DB_FILL) { 660 ASSERT(db->db_state == DB_READ || 661 (flags & DB_RF_HAVESTRUCT) == 0); 662 cv_wait(&db->db_changed, &db->db_mtx); 663 } 664 if (db->db_state == DB_UNCACHED) 665 err = EIO; 666 } 667 mutex_exit(&db->db_mtx); 668 } 669 670 ASSERT(err || havepzio || db->db_state == DB_CACHED); 671 return (err); 672 } 673 674 static void 675 dbuf_noread(dmu_buf_impl_t *db) 676 { 677 ASSERT(!refcount_is_zero(&db->db_holds)); 678 ASSERT(db->db_blkid != DMU_BONUS_BLKID); 679 mutex_enter(&db->db_mtx); 680 while (db->db_state == DB_READ || db->db_state == DB_FILL) 681 cv_wait(&db->db_changed, &db->db_mtx); 682 if (db->db_state == DB_UNCACHED) { 683 arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); 684 spa_t *spa; 685 686 ASSERT(db->db_buf == NULL); 687 ASSERT(db->db.db_data == NULL); 688 DB_GET_SPA(&spa, db); 689 dbuf_set_data(db, arc_buf_alloc(spa, db->db.db_size, db, type)); 690 db->db_state = DB_FILL; 691 } else if (db->db_state == DB_NOFILL) { 692 dbuf_set_data(db, NULL); 693 } else { 694 ASSERT3U(db->db_state, ==, DB_CACHED); 695 } 696 mutex_exit(&db->db_mtx); 697 } 698 699 /* 700 * This is our just-in-time copy function. It makes a copy of 701 * buffers, that have been modified in a previous transaction 702 * group, before we modify them in the current active group. 703 * 704 * This function is used in two places: when we are dirtying a 705 * buffer for the first time in a txg, and when we are freeing 706 * a range in a dnode that includes this buffer. 707 * 708 * Note that when we are called from dbuf_free_range() we do 709 * not put a hold on the buffer, we just traverse the active 710 * dbuf list for the dnode. 
 */
static void
dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg)
{
	dbuf_dirty_record_t *dr = db->db_last_dirty;

	ASSERT(MUTEX_HELD(&db->db_mtx));
	ASSERT(db->db.db_data != NULL);
	ASSERT(db->db_level == 0);
	ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT);

	if (dr == NULL ||
	    (dr->dt.dl.dr_data !=
	    ((db->db_blkid == DMU_BONUS_BLKID) ? db->db.db_data : db->db_buf)))
		return;

	/*
	 * If the last dirty record for this dbuf has not yet synced
	 * and it's referencing the dbuf data, either:
	 *	reset the reference to point to a new copy,
	 * or (if there are no active holders)
	 *	just null out the current db_data pointer.
	 */
	ASSERT(dr->dr_txg >= txg - 2);
	if (db->db_blkid == DMU_BONUS_BLKID) {
		/* Note that the data bufs here are zio_bufs */
		dr->dt.dl.dr_data = zio_buf_alloc(DN_MAX_BONUSLEN);
		arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
		bcopy(db->db.db_data, dr->dt.dl.dr_data, DN_MAX_BONUSLEN);
	} else if (refcount_count(&db->db_holds) > db->db_dirtycnt) {
		int size = db->db.db_size;
		arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
		spa_t *spa;

		DB_GET_SPA(&spa, db);
		dr->dt.dl.dr_data = arc_buf_alloc(spa, size, db, type);
		bcopy(db->db.db_data, dr->dt.dl.dr_data->b_data, size);
	} else {
		dbuf_set_data(db, NULL);
	}
}

void
dbuf_unoverride(dbuf_dirty_record_t *dr)
{
	dmu_buf_impl_t *db = dr->dr_dbuf;
	blkptr_t *bp = &dr->dt.dl.dr_overridden_by;
	uint64_t txg = dr->dr_txg;

	ASSERT(MUTEX_HELD(&db->db_mtx));
	ASSERT(dr->dt.dl.dr_override_state != DR_IN_DMU_SYNC);
	ASSERT(db->db_level == 0);

	if (db->db_blkid == DMU_BONUS_BLKID ||
	    dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN)
		return;

	ASSERT(db->db_data_pending != dr);

	/* free this block */
	if (!BP_IS_HOLE(bp)) {
		spa_t *spa;

		DB_GET_SPA(&spa, db);
		zio_free(spa, txg, bp);
	}
	dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
	/*
	 * Release the already-written buffer, so we leave it in
	 * a consistent dirty state.  Note that all callers are
	 * modifying the buffer, so they will immediately do
	 * another (redundant) arc_release().  Therefore, leave
	 * the buf thawed to save the effort of freezing &
	 * immediately re-thawing it.
	 */
	arc_release(dr->dt.dl.dr_data, db);
}

/*
 * Evict (if it's unreferenced) or clear (if it's referenced) any level-0
 * data blocks in the free range, so that any future readers will find
 * empty blocks.  Also, if we happen across any level-1 dbufs in the
 * range that have not already been marked dirty, mark them dirty so
 * they stay in memory.
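 * (Dirtying those level-1 dbufs keeps them from being evicted while the
 * free is still pending.)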
795 */ 796 void 797 dbuf_free_range(dnode_t *dn, uint64_t start, uint64_t end, dmu_tx_t *tx) 798 { 799 dmu_buf_impl_t *db, *db_next; 800 uint64_t txg = tx->tx_txg; 801 int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; 802 uint64_t first_l1 = start >> epbs; 803 uint64_t last_l1 = end >> epbs; 804 805 if (end > dn->dn_maxblkid && (end != DMU_SPILL_BLKID)) { 806 end = dn->dn_maxblkid; 807 last_l1 = end >> epbs; 808 } 809 dprintf_dnode(dn, "start=%llu end=%llu\n", start, end); 810 mutex_enter(&dn->dn_dbufs_mtx); 811 for (db = list_head(&dn->dn_dbufs); db; db = db_next) { 812 db_next = list_next(&dn->dn_dbufs, db); 813 ASSERT(db->db_blkid != DMU_BONUS_BLKID); 814 815 if (db->db_level == 1 && 816 db->db_blkid >= first_l1 && db->db_blkid <= last_l1) { 817 mutex_enter(&db->db_mtx); 818 if (db->db_last_dirty && 819 db->db_last_dirty->dr_txg < txg) { 820 dbuf_add_ref(db, FTAG); 821 mutex_exit(&db->db_mtx); 822 dbuf_will_dirty(db, tx); 823 dbuf_rele(db, FTAG); 824 } else { 825 mutex_exit(&db->db_mtx); 826 } 827 } 828 829 if (db->db_level != 0) 830 continue; 831 dprintf_dbuf(db, "found buf %s\n", ""); 832 if (db->db_blkid < start || db->db_blkid > end) 833 continue; 834 835 /* found a level 0 buffer in the range */ 836 if (dbuf_undirty(db, tx)) 837 continue; 838 839 mutex_enter(&db->db_mtx); 840 if (db->db_state == DB_UNCACHED || 841 db->db_state == DB_NOFILL || 842 db->db_state == DB_EVICTING) { 843 ASSERT(db->db.db_data == NULL); 844 mutex_exit(&db->db_mtx); 845 continue; 846 } 847 if (db->db_state == DB_READ || db->db_state == DB_FILL) { 848 /* will be handled in dbuf_read_done or dbuf_rele */ 849 db->db_freed_in_flight = TRUE; 850 mutex_exit(&db->db_mtx); 851 continue; 852 } 853 if (refcount_count(&db->db_holds) == 0) { 854 ASSERT(db->db_buf); 855 dbuf_clear(db); 856 continue; 857 } 858 /* The dbuf is referenced */ 859 860 if (db->db_last_dirty != NULL) { 861 dbuf_dirty_record_t *dr = db->db_last_dirty; 862 863 if (dr->dr_txg == txg) { 864 /* 865 * This buffer is "in-use", re-adjust the file 866 * size to reflect that this buffer may 867 * contain new data when we sync. 868 */ 869 if (db->db_blkid != DMU_SPILL_BLKID && 870 db->db_blkid > dn->dn_maxblkid) 871 dn->dn_maxblkid = db->db_blkid; 872 dbuf_unoverride(dr); 873 } else { 874 /* 875 * This dbuf is not dirty in the open context. 876 * Either uncache it (if its not referenced in 877 * the open context) or reset its contents to 878 * empty. 879 */ 880 dbuf_fix_old_data(db, txg); 881 } 882 } 883 /* clear the contents if its cached */ 884 if (db->db_state == DB_CACHED) { 885 ASSERT(db->db.db_data != NULL); 886 arc_release(db->db_buf, db); 887 bzero(db->db.db_data, db->db.db_size); 888 arc_buf_freeze(db->db_buf); 889 } 890 891 mutex_exit(&db->db_mtx); 892 } 893 mutex_exit(&dn->dn_dbufs_mtx); 894 } 895 896 static int 897 dbuf_block_freeable(dmu_buf_impl_t *db) 898 { 899 dsl_dataset_t *ds = db->db_objset->os_dsl_dataset; 900 uint64_t birth_txg = 0; 901 902 /* 903 * We don't need any locking to protect db_blkptr: 904 * If it's syncing, then db_last_dirty will be set 905 * so we'll ignore db_blkptr. 906 */ 907 ASSERT(MUTEX_HELD(&db->db_mtx)); 908 if (db->db_last_dirty) 909 birth_txg = db->db_last_dirty->dr_txg; 910 else if (db->db_blkptr) 911 birth_txg = db->db_blkptr->blk_birth; 912 913 /* 914 * If we don't exist or are in a snapshot, we can't be freed. 915 * Don't pass the bp to dsl_dataset_block_freeable() since we 916 * are holding the db_mtx lock and might deadlock if we are 917 * prefetching a dedup-ed block. 
918 */ 919 if (birth_txg) 920 return (ds == NULL || 921 dsl_dataset_block_freeable(ds, NULL, birth_txg)); 922 else 923 return (FALSE); 924 } 925 926 void 927 dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx) 928 { 929 arc_buf_t *buf, *obuf; 930 int osize = db->db.db_size; 931 arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); 932 dnode_t *dn; 933 934 ASSERT(db->db_blkid != DMU_BONUS_BLKID); 935 936 DB_DNODE_ENTER(db); 937 dn = DB_DNODE(db); 938 939 /* XXX does *this* func really need the lock? */ 940 ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock)); 941 942 /* 943 * This call to dbuf_will_dirty() with the dn_struct_rwlock held 944 * is OK, because there can be no other references to the db 945 * when we are changing its size, so no concurrent DB_FILL can 946 * be happening. 947 */ 948 /* 949 * XXX we should be doing a dbuf_read, checking the return 950 * value and returning that up to our callers 951 */ 952 dbuf_will_dirty(db, tx); 953 954 /* create the data buffer for the new block */ 955 buf = arc_buf_alloc(dn->dn_objset->os_spa, size, db, type); 956 957 /* copy old block data to the new block */ 958 obuf = db->db_buf; 959 bcopy(obuf->b_data, buf->b_data, MIN(osize, size)); 960 /* zero the remainder */ 961 if (size > osize) 962 bzero((uint8_t *)buf->b_data + osize, size - osize); 963 964 mutex_enter(&db->db_mtx); 965 dbuf_set_data(db, buf); 966 VERIFY(arc_buf_remove_ref(obuf, db) == 1); 967 db->db.db_size = size; 968 969 if (db->db_level == 0) { 970 ASSERT3U(db->db_last_dirty->dr_txg, ==, tx->tx_txg); 971 db->db_last_dirty->dt.dl.dr_data = buf; 972 } 973 mutex_exit(&db->db_mtx); 974 975 dnode_willuse_space(dn, size-osize, tx); 976 DB_DNODE_EXIT(db); 977 } 978 979 void 980 dbuf_release_bp(dmu_buf_impl_t *db) 981 { 982 objset_t *os; 983 zbookmark_t zb; 984 985 DB_GET_OBJSET(&os, db); 986 ASSERT(dsl_pool_sync_context(dmu_objset_pool(os))); 987 ASSERT(arc_released(os->os_phys_buf) || 988 list_link_active(&os->os_dsl_dataset->ds_synced_link)); 989 ASSERT(db->db_parent == NULL || arc_released(db->db_parent->db_buf)); 990 991 zb.zb_objset = os->os_dsl_dataset ? 992 os->os_dsl_dataset->ds_object : 0; 993 zb.zb_object = db->db.db_object; 994 zb.zb_level = db->db_level; 995 zb.zb_blkid = db->db_blkid; 996 (void) arc_release_bp(db->db_buf, db, 997 db->db_blkptr, os->os_spa, &zb); 998 } 999 1000 dbuf_dirty_record_t * 1001 dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) 1002 { 1003 dnode_t *dn; 1004 objset_t *os; 1005 dbuf_dirty_record_t **drp, *dr; 1006 int drop_struct_lock = FALSE; 1007 boolean_t do_free_accounting = B_FALSE; 1008 int txgoff = tx->tx_txg & TXG_MASK; 1009 1010 ASSERT(tx->tx_txg != 0); 1011 ASSERT(!refcount_is_zero(&db->db_holds)); 1012 DMU_TX_DIRTY_BUF(tx, db); 1013 1014 DB_DNODE_ENTER(db); 1015 dn = DB_DNODE(db); 1016 /* 1017 * Shouldn't dirty a regular buffer in syncing context. Private 1018 * objects may be dirtied in syncing context, but only if they 1019 * were already pre-dirtied in open context. 1020 */ 1021 ASSERT(!dmu_tx_is_syncing(tx) || 1022 BP_IS_HOLE(dn->dn_objset->os_rootbp) || 1023 DMU_OBJECT_IS_SPECIAL(dn->dn_object) || 1024 dn->dn_objset->os_dsl_dataset == NULL); 1025 /* 1026 * We make this assert for private objects as well, but after we 1027 * check if we're already dirty. They are allowed to re-dirty 1028 * in syncing context. 1029 */ 1030 ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT || 1031 dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx == 1032 (dmu_tx_is_syncing(tx) ? 
DN_DIRTY_SYNC : DN_DIRTY_OPEN)); 1033 1034 mutex_enter(&db->db_mtx); 1035 /* 1036 * XXX make this true for indirects too? The problem is that 1037 * transactions created with dmu_tx_create_assigned() from 1038 * syncing context don't bother holding ahead. 1039 */ 1040 ASSERT(db->db_level != 0 || 1041 db->db_state == DB_CACHED || db->db_state == DB_FILL || 1042 db->db_state == DB_NOFILL); 1043 1044 mutex_enter(&dn->dn_mtx); 1045 /* 1046 * Don't set dirtyctx to SYNC if we're just modifying this as we 1047 * initialize the objset. 1048 */ 1049 if (dn->dn_dirtyctx == DN_UNDIRTIED && 1050 !BP_IS_HOLE(dn->dn_objset->os_rootbp)) { 1051 dn->dn_dirtyctx = 1052 (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN); 1053 ASSERT(dn->dn_dirtyctx_firstset == NULL); 1054 dn->dn_dirtyctx_firstset = kmem_alloc(1, KM_SLEEP); 1055 } 1056 mutex_exit(&dn->dn_mtx); 1057 1058 if (db->db_blkid == DMU_SPILL_BLKID) 1059 dn->dn_have_spill = B_TRUE; 1060 1061 /* 1062 * If this buffer is already dirty, we're done. 1063 */ 1064 drp = &db->db_last_dirty; 1065 ASSERT(*drp == NULL || (*drp)->dr_txg <= tx->tx_txg || 1066 db->db.db_object == DMU_META_DNODE_OBJECT); 1067 while ((dr = *drp) != NULL && dr->dr_txg > tx->tx_txg) 1068 drp = &dr->dr_next; 1069 if (dr && dr->dr_txg == tx->tx_txg) { 1070 DB_DNODE_EXIT(db); 1071 1072 if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID) { 1073 /* 1074 * If this buffer has already been written out, 1075 * we now need to reset its state. 1076 */ 1077 dbuf_unoverride(dr); 1078 if (db->db.db_object != DMU_META_DNODE_OBJECT && 1079 db->db_state != DB_NOFILL) 1080 arc_buf_thaw(db->db_buf); 1081 } 1082 mutex_exit(&db->db_mtx); 1083 return (dr); 1084 } 1085 1086 /* 1087 * Only valid if not already dirty. 1088 */ 1089 ASSERT(dn->dn_object == 0 || 1090 dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx == 1091 (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN)); 1092 1093 ASSERT3U(dn->dn_nlevels, >, db->db_level); 1094 ASSERT((dn->dn_phys->dn_nlevels == 0 && db->db_level == 0) || 1095 dn->dn_phys->dn_nlevels > db->db_level || 1096 dn->dn_next_nlevels[txgoff] > db->db_level || 1097 dn->dn_next_nlevels[(tx->tx_txg-1) & TXG_MASK] > db->db_level || 1098 dn->dn_next_nlevels[(tx->tx_txg-2) & TXG_MASK] > db->db_level); 1099 1100 /* 1101 * We should only be dirtying in syncing context if it's the 1102 * mos or we're initializing the os or it's a special object. 1103 * However, we are allowed to dirty in syncing context provided 1104 * we already dirtied it in open context. Hence we must make 1105 * this assertion only if we're not already dirty. 1106 */ 1107 os = dn->dn_objset; 1108 ASSERT(!dmu_tx_is_syncing(tx) || DMU_OBJECT_IS_SPECIAL(dn->dn_object) || 1109 os->os_dsl_dataset == NULL || BP_IS_HOLE(os->os_rootbp)); 1110 ASSERT(db->db.db_size != 0); 1111 1112 dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size); 1113 1114 if (db->db_blkid != DMU_BONUS_BLKID) { 1115 /* 1116 * Update the accounting. 1117 * Note: we delay "free accounting" until after we drop 1118 * the db_mtx. This keeps us from grabbing other locks 1119 * (and possibly deadlocking) in bp_get_dsize() while 1120 * also holding the db_mtx. 1121 */ 1122 dnode_willuse_space(dn, db->db.db_size, tx); 1123 do_free_accounting = dbuf_block_freeable(db); 1124 } 1125 1126 /* 1127 * If this buffer is dirty in an old transaction group we need 1128 * to make a copy of it so that the changes we make in this 1129 * transaction group won't leak out when we sync the older txg. 
1130 */ 1131 dr = kmem_zalloc(sizeof (dbuf_dirty_record_t), KM_SLEEP); 1132 if (db->db_level == 0) { 1133 void *data_old = db->db_buf; 1134 1135 if (db->db_state != DB_NOFILL) { 1136 if (db->db_blkid == DMU_BONUS_BLKID) { 1137 dbuf_fix_old_data(db, tx->tx_txg); 1138 data_old = db->db.db_data; 1139 } else if (db->db.db_object != DMU_META_DNODE_OBJECT) { 1140 /* 1141 * Release the data buffer from the cache so 1142 * that we can modify it without impacting 1143 * possible other users of this cached data 1144 * block. Note that indirect blocks and 1145 * private objects are not released until the 1146 * syncing state (since they are only modified 1147 * then). 1148 */ 1149 arc_release(db->db_buf, db); 1150 dbuf_fix_old_data(db, tx->tx_txg); 1151 data_old = db->db_buf; 1152 } 1153 ASSERT(data_old != NULL); 1154 } 1155 dr->dt.dl.dr_data = data_old; 1156 } else { 1157 mutex_init(&dr->dt.di.dr_mtx, NULL, MUTEX_DEFAULT, NULL); 1158 list_create(&dr->dt.di.dr_children, 1159 sizeof (dbuf_dirty_record_t), 1160 offsetof(dbuf_dirty_record_t, dr_dirty_node)); 1161 } 1162 dr->dr_dbuf = db; 1163 dr->dr_txg = tx->tx_txg; 1164 dr->dr_next = *drp; 1165 *drp = dr; 1166 1167 /* 1168 * We could have been freed_in_flight between the dbuf_noread 1169 * and dbuf_dirty. We win, as though the dbuf_noread() had 1170 * happened after the free. 1171 */ 1172 if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID && 1173 db->db_blkid != DMU_SPILL_BLKID) { 1174 mutex_enter(&dn->dn_mtx); 1175 dnode_clear_range(dn, db->db_blkid, 1, tx); 1176 mutex_exit(&dn->dn_mtx); 1177 db->db_freed_in_flight = FALSE; 1178 } 1179 1180 /* 1181 * This buffer is now part of this txg 1182 */ 1183 dbuf_add_ref(db, (void *)(uintptr_t)tx->tx_txg); 1184 db->db_dirtycnt += 1; 1185 ASSERT3U(db->db_dirtycnt, <=, 3); 1186 1187 mutex_exit(&db->db_mtx); 1188 1189 if (db->db_blkid == DMU_BONUS_BLKID || 1190 db->db_blkid == DMU_SPILL_BLKID) { 1191 mutex_enter(&dn->dn_mtx); 1192 ASSERT(!list_link_active(&dr->dr_dirty_node)); 1193 list_insert_tail(&dn->dn_dirty_records[txgoff], dr); 1194 mutex_exit(&dn->dn_mtx); 1195 dnode_setdirty(dn, tx); 1196 DB_DNODE_EXIT(db); 1197 return (dr); 1198 } else if (do_free_accounting) { 1199 blkptr_t *bp = db->db_blkptr; 1200 int64_t willfree = (bp && !BP_IS_HOLE(bp)) ? 1201 bp_get_dsize(os->os_spa, bp) : db->db.db_size; 1202 /* 1203 * This is only a guess -- if the dbuf is dirty 1204 * in a previous txg, we don't know how much 1205 * space it will use on disk yet. We should 1206 * really have the struct_rwlock to access 1207 * db_blkptr, but since this is just a guess, 1208 * it's OK if we get an odd answer. 
1209 */ 1210 ddt_prefetch(os->os_spa, bp); 1211 dnode_willuse_space(dn, -willfree, tx); 1212 } 1213 1214 if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) { 1215 rw_enter(&dn->dn_struct_rwlock, RW_READER); 1216 drop_struct_lock = TRUE; 1217 } 1218 1219 if (db->db_level == 0) { 1220 dnode_new_blkid(dn, db->db_blkid, tx, drop_struct_lock); 1221 ASSERT(dn->dn_maxblkid >= db->db_blkid); 1222 } 1223 1224 if (db->db_level+1 < dn->dn_nlevels) { 1225 dmu_buf_impl_t *parent = db->db_parent; 1226 dbuf_dirty_record_t *di; 1227 int parent_held = FALSE; 1228 1229 if (db->db_parent == NULL || db->db_parent == dn->dn_dbuf) { 1230 int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; 1231 1232 parent = dbuf_hold_level(dn, db->db_level+1, 1233 db->db_blkid >> epbs, FTAG); 1234 ASSERT(parent != NULL); 1235 parent_held = TRUE; 1236 } 1237 if (drop_struct_lock) 1238 rw_exit(&dn->dn_struct_rwlock); 1239 ASSERT3U(db->db_level+1, ==, parent->db_level); 1240 di = dbuf_dirty(parent, tx); 1241 if (parent_held) 1242 dbuf_rele(parent, FTAG); 1243 1244 mutex_enter(&db->db_mtx); 1245 /* possible race with dbuf_undirty() */ 1246 if (db->db_last_dirty == dr || 1247 dn->dn_object == DMU_META_DNODE_OBJECT) { 1248 mutex_enter(&di->dt.di.dr_mtx); 1249 ASSERT3U(di->dr_txg, ==, tx->tx_txg); 1250 ASSERT(!list_link_active(&dr->dr_dirty_node)); 1251 list_insert_tail(&di->dt.di.dr_children, dr); 1252 mutex_exit(&di->dt.di.dr_mtx); 1253 dr->dr_parent = di; 1254 } 1255 mutex_exit(&db->db_mtx); 1256 } else { 1257 ASSERT(db->db_level+1 == dn->dn_nlevels); 1258 ASSERT(db->db_blkid < dn->dn_nblkptr); 1259 ASSERT(db->db_parent == NULL || db->db_parent == dn->dn_dbuf); 1260 mutex_enter(&dn->dn_mtx); 1261 ASSERT(!list_link_active(&dr->dr_dirty_node)); 1262 list_insert_tail(&dn->dn_dirty_records[txgoff], dr); 1263 mutex_exit(&dn->dn_mtx); 1264 if (drop_struct_lock) 1265 rw_exit(&dn->dn_struct_rwlock); 1266 } 1267 1268 dnode_setdirty(dn, tx); 1269 DB_DNODE_EXIT(db); 1270 return (dr); 1271 } 1272 1273 static int 1274 dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) 1275 { 1276 dnode_t *dn; 1277 uint64_t txg = tx->tx_txg; 1278 dbuf_dirty_record_t *dr, **drp; 1279 1280 ASSERT(txg != 0); 1281 ASSERT(db->db_blkid != DMU_BONUS_BLKID); 1282 1283 mutex_enter(&db->db_mtx); 1284 /* 1285 * If this buffer is not dirty, we're done. 1286 */ 1287 for (drp = &db->db_last_dirty; (dr = *drp) != NULL; drp = &dr->dr_next) 1288 if (dr->dr_txg <= txg) 1289 break; 1290 if (dr == NULL || dr->dr_txg < txg) { 1291 mutex_exit(&db->db_mtx); 1292 return (0); 1293 } 1294 ASSERT(dr->dr_txg == txg); 1295 ASSERT(dr->dr_dbuf == db); 1296 1297 DB_DNODE_ENTER(db); 1298 dn = DB_DNODE(db); 1299 1300 /* 1301 * If this buffer is currently held, we cannot undirty 1302 * it, since one of the current holders may be in the 1303 * middle of an update. Note that users of dbuf_undirty() 1304 * should not place a hold on the dbuf before the call. 1305 * Also note: we can get here with a spill block, so 1306 * test for that similar to how dbuf_dirty does. 
1307 */ 1308 if (refcount_count(&db->db_holds) > db->db_dirtycnt) { 1309 mutex_exit(&db->db_mtx); 1310 /* Make sure we don't toss this buffer at sync phase */ 1311 if (db->db_blkid != DMU_SPILL_BLKID) { 1312 mutex_enter(&dn->dn_mtx); 1313 dnode_clear_range(dn, db->db_blkid, 1, tx); 1314 mutex_exit(&dn->dn_mtx); 1315 } 1316 DB_DNODE_EXIT(db); 1317 return (0); 1318 } 1319 1320 dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size); 1321 1322 ASSERT(db->db.db_size != 0); 1323 1324 /* XXX would be nice to fix up dn_towrite_space[] */ 1325 1326 *drp = dr->dr_next; 1327 1328 /* 1329 * Note that there are three places in dbuf_dirty() 1330 * where this dirty record may be put on a list. 1331 * Make sure to do a list_remove corresponding to 1332 * every one of those list_insert calls. 1333 */ 1334 if (dr->dr_parent) { 1335 mutex_enter(&dr->dr_parent->dt.di.dr_mtx); 1336 list_remove(&dr->dr_parent->dt.di.dr_children, dr); 1337 mutex_exit(&dr->dr_parent->dt.di.dr_mtx); 1338 } else if (db->db_blkid == DMU_SPILL_BLKID || 1339 db->db_level+1 == dn->dn_nlevels) { 1340 ASSERT(db->db_blkptr == NULL || db->db_parent == dn->dn_dbuf); 1341 mutex_enter(&dn->dn_mtx); 1342 list_remove(&dn->dn_dirty_records[txg & TXG_MASK], dr); 1343 mutex_exit(&dn->dn_mtx); 1344 } 1345 DB_DNODE_EXIT(db); 1346 1347 if (db->db_level == 0) { 1348 if (db->db_state != DB_NOFILL) { 1349 dbuf_unoverride(dr); 1350 1351 ASSERT(db->db_buf != NULL); 1352 ASSERT(dr->dt.dl.dr_data != NULL); 1353 if (dr->dt.dl.dr_data != db->db_buf) 1354 VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data, 1355 db) == 1); 1356 } 1357 } else { 1358 ASSERT(db->db_buf != NULL); 1359 ASSERT(list_head(&dr->dt.di.dr_children) == NULL); 1360 mutex_destroy(&dr->dt.di.dr_mtx); 1361 list_destroy(&dr->dt.di.dr_children); 1362 } 1363 kmem_free(dr, sizeof (dbuf_dirty_record_t)); 1364 1365 ASSERT(db->db_dirtycnt > 0); 1366 db->db_dirtycnt -= 1; 1367 1368 if (refcount_remove(&db->db_holds, (void *)(uintptr_t)txg) == 0) { 1369 arc_buf_t *buf = db->db_buf; 1370 1371 ASSERT(db->db_state == DB_NOFILL || arc_released(buf)); 1372 dbuf_set_data(db, NULL); 1373 VERIFY(arc_buf_remove_ref(buf, db) == 1); 1374 dbuf_evict(db); 1375 return (1); 1376 } 1377 1378 mutex_exit(&db->db_mtx); 1379 return (0); 1380 } 1381 1382 #pragma weak dmu_buf_will_dirty = dbuf_will_dirty 1383 void 1384 dbuf_will_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) 1385 { 1386 int rf = DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH; 1387 1388 ASSERT(tx->tx_txg != 0); 1389 ASSERT(!refcount_is_zero(&db->db_holds)); 1390 1391 DB_DNODE_ENTER(db); 1392 if (RW_WRITE_HELD(&DB_DNODE(db)->dn_struct_rwlock)) 1393 rf |= DB_RF_HAVESTRUCT; 1394 DB_DNODE_EXIT(db); 1395 (void) dbuf_read(db, NULL, rf); 1396 (void) dbuf_dirty(db, tx); 1397 } 1398 1399 void 1400 dmu_buf_will_not_fill(dmu_buf_t *db_fake, dmu_tx_t *tx) 1401 { 1402 dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; 1403 1404 db->db_state = DB_NOFILL; 1405 1406 dmu_buf_will_fill(db_fake, tx); 1407 } 1408 1409 void 1410 dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx) 1411 { 1412 dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; 1413 1414 ASSERT(db->db_blkid != DMU_BONUS_BLKID); 1415 ASSERT(tx->tx_txg != 0); 1416 ASSERT(db->db_level == 0); 1417 ASSERT(!refcount_is_zero(&db->db_holds)); 1418 1419 ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT || 1420 dmu_tx_private_ok(tx)); 1421 1422 dbuf_noread(db); 1423 (void) dbuf_dirty(db, tx); 1424 } 1425 1426 #pragma weak dmu_buf_fill_done = dbuf_fill_done 1427 /* ARGSUSED */ 1428 void 1429 dbuf_fill_done(dmu_buf_impl_t *db, dmu_tx_t *tx) 1430 { 
1431 mutex_enter(&db->db_mtx); 1432 DBUF_VERIFY(db); 1433 1434 if (db->db_state == DB_FILL) { 1435 if (db->db_level == 0 && db->db_freed_in_flight) { 1436 ASSERT(db->db_blkid != DMU_BONUS_BLKID); 1437 /* we were freed while filling */ 1438 /* XXX dbuf_undirty? */ 1439 bzero(db->db.db_data, db->db.db_size); 1440 db->db_freed_in_flight = FALSE; 1441 } 1442 db->db_state = DB_CACHED; 1443 cv_broadcast(&db->db_changed); 1444 } 1445 mutex_exit(&db->db_mtx); 1446 } 1447 1448 /* 1449 * Directly assign a provided arc buf to a given dbuf if it's not referenced 1450 * by anybody except our caller. Otherwise copy arcbuf's contents to dbuf. 1451 */ 1452 void 1453 dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx) 1454 { 1455 ASSERT(!refcount_is_zero(&db->db_holds)); 1456 ASSERT(db->db_blkid != DMU_BONUS_BLKID); 1457 ASSERT(db->db_level == 0); 1458 ASSERT(DBUF_GET_BUFC_TYPE(db) == ARC_BUFC_DATA); 1459 ASSERT(buf != NULL); 1460 ASSERT(arc_buf_size(buf) == db->db.db_size); 1461 ASSERT(tx->tx_txg != 0); 1462 1463 arc_return_buf(buf, db); 1464 ASSERT(arc_released(buf)); 1465 1466 mutex_enter(&db->db_mtx); 1467 1468 while (db->db_state == DB_READ || db->db_state == DB_FILL) 1469 cv_wait(&db->db_changed, &db->db_mtx); 1470 1471 ASSERT(db->db_state == DB_CACHED || db->db_state == DB_UNCACHED); 1472 1473 if (db->db_state == DB_CACHED && 1474 refcount_count(&db->db_holds) - 1 > db->db_dirtycnt) { 1475 mutex_exit(&db->db_mtx); 1476 (void) dbuf_dirty(db, tx); 1477 bcopy(buf->b_data, db->db.db_data, db->db.db_size); 1478 VERIFY(arc_buf_remove_ref(buf, db) == 1); 1479 xuio_stat_wbuf_copied(); 1480 return; 1481 } 1482 1483 xuio_stat_wbuf_nocopy(); 1484 if (db->db_state == DB_CACHED) { 1485 dbuf_dirty_record_t *dr = db->db_last_dirty; 1486 1487 ASSERT(db->db_buf != NULL); 1488 if (dr != NULL && dr->dr_txg == tx->tx_txg) { 1489 ASSERT(dr->dt.dl.dr_data == db->db_buf); 1490 if (!arc_released(db->db_buf)) { 1491 ASSERT(dr->dt.dl.dr_override_state == 1492 DR_OVERRIDDEN); 1493 arc_release(db->db_buf, db); 1494 } 1495 dr->dt.dl.dr_data = buf; 1496 VERIFY(arc_buf_remove_ref(db->db_buf, db) == 1); 1497 } else if (dr == NULL || dr->dt.dl.dr_data != db->db_buf) { 1498 arc_release(db->db_buf, db); 1499 VERIFY(arc_buf_remove_ref(db->db_buf, db) == 1); 1500 } 1501 db->db_buf = NULL; 1502 } 1503 ASSERT(db->db_buf == NULL); 1504 dbuf_set_data(db, buf); 1505 db->db_state = DB_FILL; 1506 mutex_exit(&db->db_mtx); 1507 (void) dbuf_dirty(db, tx); 1508 dbuf_fill_done(db, tx); 1509 } 1510 1511 /* 1512 * "Clear" the contents of this dbuf. This will mark the dbuf 1513 * EVICTING and clear *most* of its references. Unfortunetely, 1514 * when we are not holding the dn_dbufs_mtx, we can't clear the 1515 * entry in the dn_dbufs list. We have to wait until dbuf_destroy() 1516 * in this case. 
For callers from the DMU we will usually see: 1517 * dbuf_clear()->arc_buf_evict()->dbuf_do_evict()->dbuf_destroy() 1518 * For the arc callback, we will usually see: 1519 * dbuf_do_evict()->dbuf_clear();dbuf_destroy() 1520 * Sometimes, though, we will get a mix of these two: 1521 * DMU: dbuf_clear()->arc_buf_evict() 1522 * ARC: dbuf_do_evict()->dbuf_destroy() 1523 */ 1524 void 1525 dbuf_clear(dmu_buf_impl_t *db) 1526 { 1527 dnode_t *dn; 1528 dmu_buf_impl_t *parent = db->db_parent; 1529 dmu_buf_impl_t *dndb; 1530 int dbuf_gone = FALSE; 1531 1532 ASSERT(MUTEX_HELD(&db->db_mtx)); 1533 ASSERT(refcount_is_zero(&db->db_holds)); 1534 1535 dbuf_evict_user(db); 1536 1537 if (db->db_state == DB_CACHED) { 1538 ASSERT(db->db.db_data != NULL); 1539 if (db->db_blkid == DMU_BONUS_BLKID) { 1540 zio_buf_free(db->db.db_data, DN_MAX_BONUSLEN); 1541 arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER); 1542 } 1543 db->db.db_data = NULL; 1544 db->db_state = DB_UNCACHED; 1545 } 1546 1547 ASSERT(db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL); 1548 ASSERT(db->db_data_pending == NULL); 1549 1550 db->db_state = DB_EVICTING; 1551 db->db_blkptr = NULL; 1552 1553 DB_DNODE_ENTER(db); 1554 dn = DB_DNODE(db); 1555 dndb = dn->dn_dbuf; 1556 if (db->db_blkid != DMU_BONUS_BLKID && MUTEX_HELD(&dn->dn_dbufs_mtx)) { 1557 list_remove(&dn->dn_dbufs, db); 1558 (void) atomic_dec_32_nv(&dn->dn_dbufs_count); 1559 membar_producer(); 1560 DB_DNODE_EXIT(db); 1561 /* 1562 * Decrementing the dbuf count means that the hold corresponding 1563 * to the removed dbuf is no longer discounted in dnode_move(), 1564 * so the dnode cannot be moved until after we release the hold. 1565 * The membar_producer() ensures visibility of the decremented 1566 * value in dnode_move(), since DB_DNODE_EXIT doesn't actually 1567 * release any lock. 1568 */ 1569 dnode_rele(dn, db); 1570 db->db_dnode_handle = NULL; 1571 } else { 1572 DB_DNODE_EXIT(db); 1573 } 1574 1575 if (db->db_buf) 1576 dbuf_gone = arc_buf_evict(db->db_buf); 1577 1578 if (!dbuf_gone) 1579 mutex_exit(&db->db_mtx); 1580 1581 /* 1582 * If this dbuf is referenced from an indirect dbuf, 1583 * decrement the ref count on the indirect dbuf. 
1584 */ 1585 if (parent && parent != dndb) 1586 dbuf_rele(parent, db); 1587 } 1588 1589 static int 1590 dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse, 1591 dmu_buf_impl_t **parentp, blkptr_t **bpp) 1592 { 1593 int nlevels, epbs; 1594 1595 *parentp = NULL; 1596 *bpp = NULL; 1597 1598 ASSERT(blkid != DMU_BONUS_BLKID); 1599 1600 if (blkid == DMU_SPILL_BLKID) { 1601 mutex_enter(&dn->dn_mtx); 1602 if (dn->dn_have_spill && 1603 (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR)) 1604 *bpp = &dn->dn_phys->dn_spill; 1605 else 1606 *bpp = NULL; 1607 dbuf_add_ref(dn->dn_dbuf, NULL); 1608 *parentp = dn->dn_dbuf; 1609 mutex_exit(&dn->dn_mtx); 1610 return (0); 1611 } 1612 1613 if (dn->dn_phys->dn_nlevels == 0) 1614 nlevels = 1; 1615 else 1616 nlevels = dn->dn_phys->dn_nlevels; 1617 1618 epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; 1619 1620 ASSERT3U(level * epbs, <, 64); 1621 ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); 1622 if (level >= nlevels || 1623 (blkid > (dn->dn_phys->dn_maxblkid >> (level * epbs)))) { 1624 /* the buffer has no parent yet */ 1625 return (ENOENT); 1626 } else if (level < nlevels-1) { 1627 /* this block is referenced from an indirect block */ 1628 int err = dbuf_hold_impl(dn, level+1, 1629 blkid >> epbs, fail_sparse, NULL, parentp); 1630 if (err) 1631 return (err); 1632 err = dbuf_read(*parentp, NULL, 1633 (DB_RF_HAVESTRUCT | DB_RF_NOPREFETCH | DB_RF_CANFAIL)); 1634 if (err) { 1635 dbuf_rele(*parentp, NULL); 1636 *parentp = NULL; 1637 return (err); 1638 } 1639 *bpp = ((blkptr_t *)(*parentp)->db.db_data) + 1640 (blkid & ((1ULL << epbs) - 1)); 1641 return (0); 1642 } else { 1643 /* the block is referenced from the dnode */ 1644 ASSERT3U(level, ==, nlevels-1); 1645 ASSERT(dn->dn_phys->dn_nblkptr == 0 || 1646 blkid < dn->dn_phys->dn_nblkptr); 1647 if (dn->dn_dbuf) { 1648 dbuf_add_ref(dn->dn_dbuf, NULL); 1649 *parentp = dn->dn_dbuf; 1650 } 1651 *bpp = &dn->dn_phys->dn_blkptr[blkid]; 1652 return (0); 1653 } 1654 } 1655 1656 static dmu_buf_impl_t * 1657 dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid, 1658 dmu_buf_impl_t *parent, blkptr_t *blkptr) 1659 { 1660 objset_t *os = dn->dn_objset; 1661 dmu_buf_impl_t *db, *odb; 1662 1663 ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); 1664 ASSERT(dn->dn_type != DMU_OT_NONE); 1665 1666 db = kmem_cache_alloc(dbuf_cache, KM_SLEEP); 1667 1668 db->db_objset = os; 1669 db->db.db_object = dn->dn_object; 1670 db->db_level = level; 1671 db->db_blkid = blkid; 1672 db->db_last_dirty = NULL; 1673 db->db_dirtycnt = 0; 1674 db->db_dnode_handle = dn->dn_handle; 1675 db->db_parent = parent; 1676 db->db_blkptr = blkptr; 1677 1678 db->db_user_ptr = NULL; 1679 db->db_user_data_ptr_ptr = NULL; 1680 db->db_evict_func = NULL; 1681 db->db_immediate_evict = 0; 1682 db->db_freed_in_flight = 0; 1683 1684 if (blkid == DMU_BONUS_BLKID) { 1685 ASSERT3P(parent, ==, dn->dn_dbuf); 1686 db->db.db_size = DN_MAX_BONUSLEN - 1687 (dn->dn_nblkptr-1) * sizeof (blkptr_t); 1688 ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen); 1689 db->db.db_offset = DMU_BONUS_BLKID; 1690 db->db_state = DB_UNCACHED; 1691 /* the bonus dbuf is not placed in the hash table */ 1692 arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER); 1693 return (db); 1694 } else if (blkid == DMU_SPILL_BLKID) { 1695 db->db.db_size = (blkptr != NULL) ? 1696 BP_GET_LSIZE(blkptr) : SPA_MINBLOCKSIZE; 1697 db->db.db_offset = 0; 1698 } else { 1699 int blocksize = 1700 db->db_level ? 
1<<dn->dn_indblkshift : dn->dn_datablksz; 1701 db->db.db_size = blocksize; 1702 db->db.db_offset = db->db_blkid * blocksize; 1703 } 1704 1705 /* 1706 * Hold the dn_dbufs_mtx while we get the new dbuf 1707 * in the hash table *and* added to the dbufs list. 1708 * This prevents a possible deadlock with someone 1709 * trying to look up this dbuf before its added to the 1710 * dn_dbufs list. 1711 */ 1712 mutex_enter(&dn->dn_dbufs_mtx); 1713 db->db_state = DB_EVICTING; 1714 if ((odb = dbuf_hash_insert(db)) != NULL) { 1715 /* someone else inserted it first */ 1716 kmem_cache_free(dbuf_cache, db); 1717 mutex_exit(&dn->dn_dbufs_mtx); 1718 return (odb); 1719 } 1720 list_insert_head(&dn->dn_dbufs, db); 1721 db->db_state = DB_UNCACHED; 1722 mutex_exit(&dn->dn_dbufs_mtx); 1723 arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER); 1724 1725 if (parent && parent != dn->dn_dbuf) 1726 dbuf_add_ref(parent, db); 1727 1728 ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT || 1729 refcount_count(&dn->dn_holds) > 0); 1730 (void) refcount_add(&dn->dn_holds, db); 1731 (void) atomic_inc_32_nv(&dn->dn_dbufs_count); 1732 1733 dprintf_dbuf(db, "db=%p\n", db); 1734 1735 return (db); 1736 } 1737 1738 static int 1739 dbuf_do_evict(void *private) 1740 { 1741 arc_buf_t *buf = private; 1742 dmu_buf_impl_t *db = buf->b_private; 1743 1744 if (!MUTEX_HELD(&db->db_mtx)) 1745 mutex_enter(&db->db_mtx); 1746 1747 ASSERT(refcount_is_zero(&db->db_holds)); 1748 1749 if (db->db_state != DB_EVICTING) { 1750 ASSERT(db->db_state == DB_CACHED); 1751 DBUF_VERIFY(db); 1752 db->db_buf = NULL; 1753 dbuf_evict(db); 1754 } else { 1755 mutex_exit(&db->db_mtx); 1756 dbuf_destroy(db); 1757 } 1758 return (0); 1759 } 1760 1761 static void 1762 dbuf_destroy(dmu_buf_impl_t *db) 1763 { 1764 ASSERT(refcount_is_zero(&db->db_holds)); 1765 1766 if (db->db_blkid != DMU_BONUS_BLKID) { 1767 /* 1768 * If this dbuf is still on the dn_dbufs list, 1769 * remove it from that list. 1770 */ 1771 if (db->db_dnode_handle != NULL) { 1772 dnode_t *dn; 1773 1774 DB_DNODE_ENTER(db); 1775 dn = DB_DNODE(db); 1776 mutex_enter(&dn->dn_dbufs_mtx); 1777 list_remove(&dn->dn_dbufs, db); 1778 (void) atomic_dec_32_nv(&dn->dn_dbufs_count); 1779 mutex_exit(&dn->dn_dbufs_mtx); 1780 DB_DNODE_EXIT(db); 1781 /* 1782 * Decrementing the dbuf count means that the hold 1783 * corresponding to the removed dbuf is no longer 1784 * discounted in dnode_move(), so the dnode cannot be 1785 * moved until after we release the hold. 1786 */ 1787 dnode_rele(dn, db); 1788 db->db_dnode_handle = NULL; 1789 } 1790 dbuf_hash_remove(db); 1791 } 1792 db->db_parent = NULL; 1793 db->db_buf = NULL; 1794 1795 ASSERT(!list_link_active(&db->db_link)); 1796 ASSERT(db->db.db_data == NULL); 1797 ASSERT(db->db_hash_next == NULL); 1798 ASSERT(db->db_blkptr == NULL); 1799 ASSERT(db->db_data_pending == NULL); 1800 1801 kmem_cache_free(dbuf_cache, db); 1802 arc_space_return(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER); 1803 } 1804 1805 void 1806 dbuf_prefetch(dnode_t *dn, uint64_t blkid) 1807 { 1808 dmu_buf_impl_t *db = NULL; 1809 blkptr_t *bp = NULL; 1810 1811 ASSERT(blkid != DMU_BONUS_BLKID); 1812 ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); 1813 1814 if (dnode_block_freed(dn, blkid)) 1815 return; 1816 1817 /* dbuf_find() returns with db_mtx held */ 1818 if (db = dbuf_find(dn, 0, blkid)) { 1819 /* 1820 * This dbuf is already in the cache. We assume that 1821 * it is already CACHED, or else about to be either 1822 * read or filled. 
		 */
		mutex_exit(&db->db_mtx);
		return;
	}

	if (dbuf_findbp(dn, 0, blkid, TRUE, &db, &bp) == 0) {
		if (bp && !BP_IS_HOLE(bp)) {
			int priority = dn->dn_type == DMU_OT_DDT_ZAP ?
			    ZIO_PRIORITY_DDT_PREFETCH : ZIO_PRIORITY_ASYNC_READ;
			arc_buf_t *pbuf;
			dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
			uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH;
			zbookmark_t zb;

			SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET,
			    dn->dn_object, 0, blkid);

			if (db)
				pbuf = db->db_buf;
			else
				pbuf = dn->dn_objset->os_phys_buf;

			(void) dsl_read(NULL, dn->dn_objset->os_spa,
			    bp, pbuf, NULL, NULL, priority,
			    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
			    &aflags, &zb);
		}
		if (db)
			dbuf_rele(db, NULL);
	}
}

/*
 * Returns with db_holds incremented, and db_mtx not held.
 * Note: dn_struct_rwlock must be held.
 */
int
dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, int fail_sparse,
    void *tag, dmu_buf_impl_t **dbp)
{
	dmu_buf_impl_t *db, *parent = NULL;

	ASSERT(blkid != DMU_BONUS_BLKID);
	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
	ASSERT3U(dn->dn_nlevels, >, level);

	*dbp = NULL;
top:
	/* dbuf_find() returns with db_mtx held */
	db = dbuf_find(dn, level, blkid);

	if (db == NULL) {
		blkptr_t *bp = NULL;
		int err;

		ASSERT3P(parent, ==, NULL);
		err = dbuf_findbp(dn, level, blkid, fail_sparse, &parent, &bp);
		if (fail_sparse) {
			if (err == 0 && bp && BP_IS_HOLE(bp))
				err = ENOENT;
			if (err) {
				if (parent)
					dbuf_rele(parent, NULL);
				return (err);
			}
		}
		if (err && err != ENOENT)
			return (err);
		db = dbuf_create(dn, level, blkid, parent, bp);
	}

	if (db->db_buf && refcount_is_zero(&db->db_holds)) {
		arc_buf_add_ref(db->db_buf, db);
		if (db->db_buf->b_data == NULL) {
			dbuf_clear(db);
			if (parent) {
				dbuf_rele(parent, NULL);
				parent = NULL;
			}
			goto top;
		}
		ASSERT3P(db->db.db_data, ==, db->db_buf->b_data);
	}

	ASSERT(db->db_buf == NULL || arc_referenced(db->db_buf));

	/*
	 * If this buffer is currently syncing out, and we are
	 * still referencing it from db_data, we need to make a copy
	 * of it in case we decide we want to dirty it again in this txg.
1913 */ 1914 if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID && 1915 dn->dn_object != DMU_META_DNODE_OBJECT && 1916 db->db_state == DB_CACHED && db->db_data_pending) { 1917 dbuf_dirty_record_t *dr = db->db_data_pending; 1918 1919 if (dr->dt.dl.dr_data == db->db_buf) { 1920 arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); 1921 1922 dbuf_set_data(db, 1923 arc_buf_alloc(dn->dn_objset->os_spa, 1924 db->db.db_size, db, type)); 1925 bcopy(dr->dt.dl.dr_data->b_data, db->db.db_data, 1926 db->db.db_size); 1927 } 1928 } 1929 1930 (void) refcount_add(&db->db_holds, tag); 1931 dbuf_update_data(db); 1932 DBUF_VERIFY(db); 1933 mutex_exit(&db->db_mtx); 1934 1935 /* NOTE: we can't rele the parent until after we drop the db_mtx */ 1936 if (parent) 1937 dbuf_rele(parent, NULL); 1938 1939 ASSERT3P(DB_DNODE(db), ==, dn); 1940 ASSERT3U(db->db_blkid, ==, blkid); 1941 ASSERT3U(db->db_level, ==, level); 1942 *dbp = db; 1943 1944 return (0); 1945 } 1946 1947 dmu_buf_impl_t * 1948 dbuf_hold(dnode_t *dn, uint64_t blkid, void *tag) 1949 { 1950 dmu_buf_impl_t *db; 1951 int err = dbuf_hold_impl(dn, 0, blkid, FALSE, tag, &db); 1952 return (err ? NULL : db); 1953 } 1954 1955 dmu_buf_impl_t * 1956 dbuf_hold_level(dnode_t *dn, int level, uint64_t blkid, void *tag) 1957 { 1958 dmu_buf_impl_t *db; 1959 int err = dbuf_hold_impl(dn, level, blkid, FALSE, tag, &db); 1960 return (err ? NULL : db); 1961 } 1962 1963 void 1964 dbuf_create_bonus(dnode_t *dn) 1965 { 1966 ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock)); 1967 1968 ASSERT(dn->dn_bonus == NULL); 1969 dn->dn_bonus = dbuf_create(dn, 0, DMU_BONUS_BLKID, dn->dn_dbuf, NULL); 1970 } 1971 1972 int 1973 dbuf_spill_set_blksz(dmu_buf_t *db_fake, uint64_t blksz, dmu_tx_t *tx) 1974 { 1975 dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; 1976 dnode_t *dn; 1977 1978 if (db->db_blkid != DMU_SPILL_BLKID) 1979 return (ENOTSUP); 1980 if (blksz == 0) 1981 blksz = SPA_MINBLOCKSIZE; 1982 if (blksz > SPA_MAXBLOCKSIZE) 1983 blksz = SPA_MAXBLOCKSIZE; 1984 else 1985 blksz = P2ROUNDUP(blksz, SPA_MINBLOCKSIZE); 1986 1987 DB_DNODE_ENTER(db); 1988 dn = DB_DNODE(db); 1989 rw_enter(&dn->dn_struct_rwlock, RW_WRITER); 1990 dbuf_new_size(db, blksz, tx); 1991 rw_exit(&dn->dn_struct_rwlock); 1992 DB_DNODE_EXIT(db); 1993 1994 return (0); 1995 } 1996 1997 void 1998 dbuf_rm_spill(dnode_t *dn, dmu_tx_t *tx) 1999 { 2000 dbuf_free_range(dn, DMU_SPILL_BLKID, DMU_SPILL_BLKID, tx); 2001 } 2002 2003 #pragma weak dmu_buf_add_ref = dbuf_add_ref 2004 void 2005 dbuf_add_ref(dmu_buf_impl_t *db, void *tag) 2006 { 2007 int64_t holds = refcount_add(&db->db_holds, tag); 2008 ASSERT(holds > 1); 2009 } 2010 2011 /* 2012 * If you call dbuf_rele() you had better not be referencing the dnode handle 2013 * unless you have some other direct or indirect hold on the dnode. (An indirect 2014 * hold is a hold on one of the dnode's dbufs, including the bonus buffer.) 2015 * Without that, the dbuf_rele() could lead to a dnode_rele() followed by the 2016 * dnode's parent dbuf evicting its dnode handles. 2017 */ 2018 #pragma weak dmu_buf_rele = dbuf_rele 2019 void 2020 dbuf_rele(dmu_buf_impl_t *db, void *tag) 2021 { 2022 mutex_enter(&db->db_mtx); 2023 dbuf_rele_and_unlock(db, tag); 2024 } 2025 2026 /* 2027 * dbuf_rele() for an already-locked dbuf. This is necessary to allow 2028 * db_dirtycnt and db_holds to be updated atomically. 
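 *
 * For illustration, the expected calling pattern (this is exactly what
 * dbuf_rele() above does; any other caller must likewise own db_mtx on
 * entry, and the mutex is dropped before returning):
 *
 *	mutex_enter(&db->db_mtx);
 *	dbuf_rele_and_unlock(db, tag);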
2029 */ 2030 void 2031 dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag) 2032 { 2033 int64_t holds; 2034 2035 ASSERT(MUTEX_HELD(&db->db_mtx)); 2036 DBUF_VERIFY(db); 2037 2038 /* 2039 * Remove the reference to the dbuf before removing its hold on the 2040 * dnode so we can guarantee in dnode_move() that a referenced bonus 2041 * buffer has a corresponding dnode hold. 2042 */ 2043 holds = refcount_remove(&db->db_holds, tag); 2044 ASSERT(holds >= 0); 2045 2046 /* 2047 * We can't freeze indirects if there is a possibility that they 2048 * may be modified in the current syncing context. 2049 */ 2050 if (db->db_buf && holds == (db->db_level == 0 ? db->db_dirtycnt : 0)) 2051 arc_buf_freeze(db->db_buf); 2052 2053 if (holds == db->db_dirtycnt && 2054 db->db_level == 0 && db->db_immediate_evict) 2055 dbuf_evict_user(db); 2056 2057 if (holds == 0) { 2058 if (db->db_blkid == DMU_BONUS_BLKID) { 2059 mutex_exit(&db->db_mtx); 2060 2061 /* 2062 * If the dnode moves here, we cannot cross this barrier 2063 * until the move completes. 2064 */ 2065 DB_DNODE_ENTER(db); 2066 (void) atomic_dec_32_nv(&DB_DNODE(db)->dn_dbufs_count); 2067 DB_DNODE_EXIT(db); 2068 /* 2069 * The bonus buffer's dnode hold is no longer discounted 2070 * in dnode_move(). The dnode cannot move until after 2071 * the dnode_rele(). 2072 */ 2073 dnode_rele(DB_DNODE(db), db); 2074 } else if (db->db_buf == NULL) { 2075 /* 2076 * This is a special case: we never associated this 2077 * dbuf with any data allocated from the ARC. 2078 */ 2079 ASSERT(db->db_state == DB_UNCACHED || 2080 db->db_state == DB_NOFILL); 2081 dbuf_evict(db); 2082 } else if (arc_released(db->db_buf)) { 2083 arc_buf_t *buf = db->db_buf; 2084 /* 2085 * This dbuf has anonymous data associated with it. 2086 */ 2087 dbuf_set_data(db, NULL); 2088 VERIFY(arc_buf_remove_ref(buf, db) == 1); 2089 dbuf_evict(db); 2090 } else { 2091 VERIFY(arc_buf_remove_ref(db->db_buf, db) == 0); 2092 2093 /* 2094 * A dbuf will be eligible for eviction if either the 2095 * 'primarycache' property is set or a duplicate 2096 * copy of this buffer is already cached in the arc. 2097 * 2098 * In the case of the 'primarycache' a buffer 2099 * is considered for eviction if it matches the 2100 * criteria set in the property. 2101 * 2102 * To decide if our buffer is considered a 2103 * duplicate, we must call into the arc to determine 2104 * if multiple buffers are referencing the same 2105 * block on-disk. If so, then we simply evict 2106 * ourselves. 
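			 * The check below captures both cases:
			 * !DBUF_IS_CACHEABLE() reflects the 'primarycache'
			 * policy, while arc_buf_eviction_needed() reports a
			 * duplicate ARC buffer for the same on-disk block.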
2107 			 */
2108 			if (!DBUF_IS_CACHEABLE(db) ||
2109 			    arc_buf_eviction_needed(db->db_buf))
2110 				dbuf_clear(db);
2111 			else
2112 				mutex_exit(&db->db_mtx);
2113 		}
2114 	} else {
2115 		mutex_exit(&db->db_mtx);
2116 	}
2117 }
2118 
2119 #pragma weak dmu_buf_refcount = dbuf_refcount
2120 uint64_t
2121 dbuf_refcount(dmu_buf_impl_t *db)
2122 {
2123 	return (refcount_count(&db->db_holds));
2124 }
2125 
2126 void *
2127 dmu_buf_set_user(dmu_buf_t *db_fake, void *user_ptr, void *user_data_ptr_ptr,
2128     dmu_buf_evict_func_t *evict_func)
2129 {
2130 	return (dmu_buf_update_user(db_fake, NULL, user_ptr,
2131 	    user_data_ptr_ptr, evict_func));
2132 }
2133 
2134 void *
2135 dmu_buf_set_user_ie(dmu_buf_t *db_fake, void *user_ptr, void *user_data_ptr_ptr,
2136     dmu_buf_evict_func_t *evict_func)
2137 {
2138 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
2139 
2140 	db->db_immediate_evict = TRUE;
2141 	return (dmu_buf_update_user(db_fake, NULL, user_ptr,
2142 	    user_data_ptr_ptr, evict_func));
2143 }
2144 
2145 void *
2146 dmu_buf_update_user(dmu_buf_t *db_fake, void *old_user_ptr, void *user_ptr,
2147     void *user_data_ptr_ptr, dmu_buf_evict_func_t *evict_func)
2148 {
2149 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
2150 	ASSERT(db->db_level == 0);
2151 
2152 	ASSERT((user_ptr == NULL) == (evict_func == NULL));
2153 
2154 	mutex_enter(&db->db_mtx);
2155 
2156 	if (db->db_user_ptr == old_user_ptr) {
2157 		db->db_user_ptr = user_ptr;
2158 		db->db_user_data_ptr_ptr = user_data_ptr_ptr;
2159 		db->db_evict_func = evict_func;
2160 
2161 		dbuf_update_data(db);
2162 	} else {
2163 		old_user_ptr = db->db_user_ptr;
2164 	}
2165 
2166 	mutex_exit(&db->db_mtx);
2167 	return (old_user_ptr);
2168 }
2169 
2170 void *
2171 dmu_buf_get_user(dmu_buf_t *db_fake)
2172 {
2173 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
2174 	ASSERT(!refcount_is_zero(&db->db_holds));
2175 
2176 	return (db->db_user_ptr);
2177 }
2178 
2179 boolean_t
2180 dmu_buf_freeable(dmu_buf_t *dbuf)
2181 {
2182 	boolean_t res = B_FALSE;
2183 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf;
2184 
2185 	if (db->db_blkptr)
2186 		res = dsl_dataset_block_freeable(db->db_objset->os_dsl_dataset,
2187 		    db->db_blkptr, db->db_blkptr->blk_birth);
2188 
2189 	return (res);
2190 }
2191 
2192 static void
2193 dbuf_check_blkptr(dnode_t *dn, dmu_buf_impl_t *db)
2194 {
2195 	/* ASSERT(dmu_tx_is_syncing(tx)) */
2196 	ASSERT(MUTEX_HELD(&db->db_mtx));
2197 
2198 	if (db->db_blkptr != NULL)
2199 		return;
2200 
2201 	if (db->db_blkid == DMU_SPILL_BLKID) {
2202 		db->db_blkptr = &dn->dn_phys->dn_spill;
2203 		BP_ZERO(db->db_blkptr);
2204 		return;
2205 	}
2206 	if (db->db_level == dn->dn_phys->dn_nlevels-1) {
2207 		/*
2208 		 * This buffer was allocated at a time when there were
2209 		 * no available blkptrs from the dnode, or it was
2210 		 * inappropriate to hook it in (i.e., nlevels mismatch).
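		 * Now that this dbuf sits at the top level of the on-disk
		 * tree, hook it up to the blkptr array embedded in the
		 * dnode itself, as done below.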
2211 */ 2212 ASSERT(db->db_blkid < dn->dn_phys->dn_nblkptr); 2213 ASSERT(db->db_parent == NULL); 2214 db->db_parent = dn->dn_dbuf; 2215 db->db_blkptr = &dn->dn_phys->dn_blkptr[db->db_blkid]; 2216 DBUF_VERIFY(db); 2217 } else { 2218 dmu_buf_impl_t *parent = db->db_parent; 2219 int epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; 2220 2221 ASSERT(dn->dn_phys->dn_nlevels > 1); 2222 if (parent == NULL) { 2223 mutex_exit(&db->db_mtx); 2224 rw_enter(&dn->dn_struct_rwlock, RW_READER); 2225 (void) dbuf_hold_impl(dn, db->db_level+1, 2226 db->db_blkid >> epbs, FALSE, db, &parent); 2227 rw_exit(&dn->dn_struct_rwlock); 2228 mutex_enter(&db->db_mtx); 2229 db->db_parent = parent; 2230 } 2231 db->db_blkptr = (blkptr_t *)parent->db.db_data + 2232 (db->db_blkid & ((1ULL << epbs) - 1)); 2233 DBUF_VERIFY(db); 2234 } 2235 } 2236 2237 static void 2238 dbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx) 2239 { 2240 dmu_buf_impl_t *db = dr->dr_dbuf; 2241 dnode_t *dn; 2242 zio_t *zio; 2243 2244 ASSERT(dmu_tx_is_syncing(tx)); 2245 2246 dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr); 2247 2248 mutex_enter(&db->db_mtx); 2249 2250 ASSERT(db->db_level > 0); 2251 DBUF_VERIFY(db); 2252 2253 if (db->db_buf == NULL) { 2254 mutex_exit(&db->db_mtx); 2255 (void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED); 2256 mutex_enter(&db->db_mtx); 2257 } 2258 ASSERT3U(db->db_state, ==, DB_CACHED); 2259 ASSERT(db->db_buf != NULL); 2260 2261 DB_DNODE_ENTER(db); 2262 dn = DB_DNODE(db); 2263 ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift); 2264 dbuf_check_blkptr(dn, db); 2265 DB_DNODE_EXIT(db); 2266 2267 db->db_data_pending = dr; 2268 2269 mutex_exit(&db->db_mtx); 2270 dbuf_write(dr, db->db_buf, tx); 2271 2272 zio = dr->dr_zio; 2273 mutex_enter(&dr->dt.di.dr_mtx); 2274 dbuf_sync_list(&dr->dt.di.dr_children, tx); 2275 ASSERT(list_head(&dr->dt.di.dr_children) == NULL); 2276 mutex_exit(&dr->dt.di.dr_mtx); 2277 zio_nowait(zio); 2278 } 2279 2280 static void 2281 dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx) 2282 { 2283 arc_buf_t **datap = &dr->dt.dl.dr_data; 2284 dmu_buf_impl_t *db = dr->dr_dbuf; 2285 dnode_t *dn; 2286 objset_t *os; 2287 uint64_t txg = tx->tx_txg; 2288 2289 ASSERT(dmu_tx_is_syncing(tx)); 2290 2291 dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr); 2292 2293 mutex_enter(&db->db_mtx); 2294 /* 2295 * To be synced, we must be dirtied. But we 2296 * might have been freed after the dirty. 2297 */ 2298 if (db->db_state == DB_UNCACHED) { 2299 /* This buffer has been freed since it was dirtied */ 2300 ASSERT(db->db.db_data == NULL); 2301 } else if (db->db_state == DB_FILL) { 2302 /* This buffer was freed and is now being re-filled */ 2303 ASSERT(db->db.db_data != dr->dt.dl.dr_data); 2304 } else { 2305 ASSERT(db->db_state == DB_CACHED || db->db_state == DB_NOFILL); 2306 } 2307 DBUF_VERIFY(db); 2308 2309 DB_DNODE_ENTER(db); 2310 dn = DB_DNODE(db); 2311 2312 if (db->db_blkid == DMU_SPILL_BLKID) { 2313 mutex_enter(&dn->dn_mtx); 2314 dn->dn_phys->dn_flags |= DNODE_FLAG_SPILL_BLKPTR; 2315 mutex_exit(&dn->dn_mtx); 2316 } 2317 2318 /* 2319 * If this is a bonus buffer, simply copy the bonus data into the 2320 * dnode. It will be written out when the dnode is synced (and it 2321 * will be synced, since it must have been dirty for dbuf_sync to 2322 * be called). 
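	 * No separate i/o is issued for the bonus buffer: the bcopy()
	 * below plus freeing the dirty record completes its sync, after
	 * which we drop the hold that was taken for this txg's dirty
	 * record.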
2323 	 */
2324 	if (db->db_blkid == DMU_BONUS_BLKID) {
2325 		dbuf_dirty_record_t **drp;
2326 
2327 		ASSERT(*datap != NULL);
2328 		ASSERT0(db->db_level);
2329 		ASSERT3U(dn->dn_phys->dn_bonuslen, <=, DN_MAX_BONUSLEN);
2330 		bcopy(*datap, DN_BONUS(dn->dn_phys), dn->dn_phys->dn_bonuslen);
2331 		DB_DNODE_EXIT(db);
2332 
2333 		if (*datap != db->db.db_data) {
2334 			zio_buf_free(*datap, DN_MAX_BONUSLEN);
2335 			arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
2336 		}
2337 		db->db_data_pending = NULL;
2338 		drp = &db->db_last_dirty;
2339 		while (*drp != dr)
2340 			drp = &(*drp)->dr_next;
2341 		ASSERT(dr->dr_next == NULL);
2342 		ASSERT(dr->dr_dbuf == db);
2343 		*drp = dr->dr_next;
2344 		kmem_free(dr, sizeof (dbuf_dirty_record_t));
2345 		ASSERT(db->db_dirtycnt > 0);
2346 		db->db_dirtycnt -= 1;
2347 		dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg);
2348 		return;
2349 	}
2350 
2351 	os = dn->dn_objset;
2352 
2353 	/*
2354 	 * This function may have dropped the db_mtx lock allowing a dmu_sync
2355 	 * operation to sneak in. As a result, we need to ensure that we
2356 	 * don't check the dr_override_state until we have returned from
2357 	 * dbuf_check_blkptr.
2358 	 */
2359 	dbuf_check_blkptr(dn, db);
2360 
2361 	/*
2362 	 * If this buffer is in the middle of an immediate write,
2363 	 * wait for the synchronous IO to complete.
2364 	 */
2365 	while (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC) {
2366 		ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT);
2367 		cv_wait(&db->db_changed, &db->db_mtx);
2368 		ASSERT(dr->dt.dl.dr_override_state != DR_NOT_OVERRIDDEN);
2369 	}
2370 
2371 	if (db->db_state != DB_NOFILL &&
2372 	    dn->dn_object != DMU_META_DNODE_OBJECT &&
2373 	    refcount_count(&db->db_holds) > 1 &&
2374 	    dr->dt.dl.dr_override_state != DR_OVERRIDDEN &&
2375 	    *datap == db->db_buf) {
2376 		/*
2377 		 * If this buffer is currently "in use" (i.e., there
2378 		 * are active holds and db_data still references it),
2379 		 * then make a copy before we start the write so that
2380 		 * any modifications from the open txg will not leak
2381 		 * into this write.
2382 		 *
2383 		 * NOTE: this copy does not need to be made for
2384 		 * objects only modified in the syncing context (e.g.
2385 		 * DMU_OT_DNODE blocks).
2386 		 */
2387 		int blksz = arc_buf_size(*datap);
2388 		arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
2389 		*datap = arc_buf_alloc(os->os_spa, blksz, db, type);
2390 		bcopy(db->db.db_data, (*datap)->b_data, blksz);
2391 	}
2392 	db->db_data_pending = dr;
2393 
2394 	mutex_exit(&db->db_mtx);
2395 
2396 	dbuf_write(dr, *datap, tx);
2397 
2398 	ASSERT(!list_link_active(&dr->dr_dirty_node));
2399 	if (dn->dn_object == DMU_META_DNODE_OBJECT) {
2400 		list_insert_tail(&dn->dn_dirty_records[txg&TXG_MASK], dr);
2401 		DB_DNODE_EXIT(db);
2402 	} else {
2403 		/*
2404 		 * Although zio_nowait() does not "wait for an IO", it does
2405 		 * initiate the IO. If this is an empty write, it seems plausible
2406 		 * that the IO could actually be completed before the nowait
2407 		 * returns. We need to DB_DNODE_EXIT() first in case
2408 		 * zio_nowait() invalidates the dbuf.
2409 		 */
2410 		DB_DNODE_EXIT(db);
2411 		zio_nowait(dr->dr_zio);
2412 	}
2413 }
2414 
2415 void
2416 dbuf_sync_list(list_t *list, dmu_tx_t *tx)
2417 {
2418 	dbuf_dirty_record_t *dr;
2419 
2420 	while ((dr = list_head(list)) != NULL) {
2421 		if (dr->dr_zio != NULL) {
2422 			/*
2423 			 * If we find an already initialized zio then we
2424 			 * are processing the meta-dnode, and we have finished.
2425 			 * The dbufs for all dnodes are put back on the list
2426 			 * during processing, so that we can zio_wait()
2427 			 * these IOs after initiating all child IOs.
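			 * Everything else is dispatched from here:
			 * indirect blocks via dbuf_sync_indirect(), which
			 * recurses over dr_children, and level-0 blocks
			 * via dbuf_sync_leaf().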
2428 */ 2429 ASSERT3U(dr->dr_dbuf->db.db_object, ==, 2430 DMU_META_DNODE_OBJECT); 2431 break; 2432 } 2433 list_remove(list, dr); 2434 if (dr->dr_dbuf->db_level > 0) 2435 dbuf_sync_indirect(dr, tx); 2436 else 2437 dbuf_sync_leaf(dr, tx); 2438 } 2439 } 2440 2441 /* ARGSUSED */ 2442 static void 2443 dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb) 2444 { 2445 dmu_buf_impl_t *db = vdb; 2446 dnode_t *dn; 2447 blkptr_t *bp = zio->io_bp; 2448 blkptr_t *bp_orig = &zio->io_bp_orig; 2449 spa_t *spa = zio->io_spa; 2450 int64_t delta; 2451 uint64_t fill = 0; 2452 int i; 2453 2454 ASSERT(db->db_blkptr == bp); 2455 2456 DB_DNODE_ENTER(db); 2457 dn = DB_DNODE(db); 2458 delta = bp_get_dsize_sync(spa, bp) - bp_get_dsize_sync(spa, bp_orig); 2459 dnode_diduse_space(dn, delta - zio->io_prev_space_delta); 2460 zio->io_prev_space_delta = delta; 2461 2462 if (BP_IS_HOLE(bp)) { 2463 ASSERT(bp->blk_fill == 0); 2464 DB_DNODE_EXIT(db); 2465 return; 2466 } 2467 2468 ASSERT((db->db_blkid != DMU_SPILL_BLKID && 2469 BP_GET_TYPE(bp) == dn->dn_type) || 2470 (db->db_blkid == DMU_SPILL_BLKID && 2471 BP_GET_TYPE(bp) == dn->dn_bonustype)); 2472 ASSERT(BP_GET_LEVEL(bp) == db->db_level); 2473 2474 mutex_enter(&db->db_mtx); 2475 2476 #ifdef ZFS_DEBUG 2477 if (db->db_blkid == DMU_SPILL_BLKID) { 2478 ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR); 2479 ASSERT(!(BP_IS_HOLE(db->db_blkptr)) && 2480 db->db_blkptr == &dn->dn_phys->dn_spill); 2481 } 2482 #endif 2483 2484 if (db->db_level == 0) { 2485 mutex_enter(&dn->dn_mtx); 2486 if (db->db_blkid > dn->dn_phys->dn_maxblkid && 2487 db->db_blkid != DMU_SPILL_BLKID) 2488 dn->dn_phys->dn_maxblkid = db->db_blkid; 2489 mutex_exit(&dn->dn_mtx); 2490 2491 if (dn->dn_type == DMU_OT_DNODE) { 2492 dnode_phys_t *dnp = db->db.db_data; 2493 for (i = db->db.db_size >> DNODE_SHIFT; i > 0; 2494 i--, dnp++) { 2495 if (dnp->dn_type != DMU_OT_NONE) 2496 fill++; 2497 } 2498 } else { 2499 fill = 1; 2500 } 2501 } else { 2502 blkptr_t *ibp = db->db.db_data; 2503 ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift); 2504 for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, ibp++) { 2505 if (BP_IS_HOLE(ibp)) 2506 continue; 2507 fill += ibp->blk_fill; 2508 } 2509 } 2510 DB_DNODE_EXIT(db); 2511 2512 bp->blk_fill = fill; 2513 2514 mutex_exit(&db->db_mtx); 2515 } 2516 2517 /* ARGSUSED */ 2518 static void 2519 dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb) 2520 { 2521 dmu_buf_impl_t *db = vdb; 2522 blkptr_t *bp = zio->io_bp; 2523 blkptr_t *bp_orig = &zio->io_bp_orig; 2524 uint64_t txg = zio->io_txg; 2525 dbuf_dirty_record_t **drp, *dr; 2526 2527 ASSERT0(zio->io_error); 2528 ASSERT(db->db_blkptr == bp); 2529 2530 if (zio->io_flags & ZIO_FLAG_IO_REWRITE) { 2531 ASSERT(BP_EQUAL(bp, bp_orig)); 2532 } else { 2533 objset_t *os; 2534 dsl_dataset_t *ds; 2535 dmu_tx_t *tx; 2536 2537 DB_GET_OBJSET(&os, db); 2538 ds = os->os_dsl_dataset; 2539 tx = os->os_synctx; 2540 2541 (void) dsl_dataset_block_kill(ds, bp_orig, tx, B_TRUE); 2542 dsl_dataset_block_born(ds, bp, tx); 2543 } 2544 2545 mutex_enter(&db->db_mtx); 2546 2547 DBUF_VERIFY(db); 2548 2549 drp = &db->db_last_dirty; 2550 while ((dr = *drp) != db->db_data_pending) 2551 drp = &dr->dr_next; 2552 ASSERT(!list_link_active(&dr->dr_dirty_node)); 2553 ASSERT(dr->dr_txg == txg); 2554 ASSERT(dr->dr_dbuf == db); 2555 ASSERT(dr->dr_next == NULL); 2556 *drp = dr->dr_next; 2557 2558 #ifdef ZFS_DEBUG 2559 if (db->db_blkid == DMU_SPILL_BLKID) { 2560 dnode_t *dn; 2561 2562 DB_DNODE_ENTER(db); 2563 dn = DB_DNODE(db); 2564 ASSERT(dn->dn_phys->dn_flags & 
DNODE_FLAG_SPILL_BLKPTR); 2565 ASSERT(!(BP_IS_HOLE(db->db_blkptr)) && 2566 db->db_blkptr == &dn->dn_phys->dn_spill); 2567 DB_DNODE_EXIT(db); 2568 } 2569 #endif 2570 2571 if (db->db_level == 0) { 2572 ASSERT(db->db_blkid != DMU_BONUS_BLKID); 2573 ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN); 2574 if (db->db_state != DB_NOFILL) { 2575 if (dr->dt.dl.dr_data != db->db_buf) 2576 VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data, 2577 db) == 1); 2578 else if (!arc_released(db->db_buf)) 2579 arc_set_callback(db->db_buf, dbuf_do_evict, db); 2580 } 2581 } else { 2582 dnode_t *dn; 2583 2584 DB_DNODE_ENTER(db); 2585 dn = DB_DNODE(db); 2586 ASSERT(list_head(&dr->dt.di.dr_children) == NULL); 2587 ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift); 2588 if (!BP_IS_HOLE(db->db_blkptr)) { 2589 int epbs = 2590 dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; 2591 ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==, 2592 db->db.db_size); 2593 ASSERT3U(dn->dn_phys->dn_maxblkid 2594 >> (db->db_level * epbs), >=, db->db_blkid); 2595 arc_set_callback(db->db_buf, dbuf_do_evict, db); 2596 } 2597 DB_DNODE_EXIT(db); 2598 mutex_destroy(&dr->dt.di.dr_mtx); 2599 list_destroy(&dr->dt.di.dr_children); 2600 } 2601 kmem_free(dr, sizeof (dbuf_dirty_record_t)); 2602 2603 cv_broadcast(&db->db_changed); 2604 ASSERT(db->db_dirtycnt > 0); 2605 db->db_dirtycnt -= 1; 2606 db->db_data_pending = NULL; 2607 dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg); 2608 } 2609 2610 static void 2611 dbuf_write_nofill_ready(zio_t *zio) 2612 { 2613 dbuf_write_ready(zio, NULL, zio->io_private); 2614 } 2615 2616 static void 2617 dbuf_write_nofill_done(zio_t *zio) 2618 { 2619 dbuf_write_done(zio, NULL, zio->io_private); 2620 } 2621 2622 static void 2623 dbuf_write_override_ready(zio_t *zio) 2624 { 2625 dbuf_dirty_record_t *dr = zio->io_private; 2626 dmu_buf_impl_t *db = dr->dr_dbuf; 2627 2628 dbuf_write_ready(zio, NULL, db); 2629 } 2630 2631 static void 2632 dbuf_write_override_done(zio_t *zio) 2633 { 2634 dbuf_dirty_record_t *dr = zio->io_private; 2635 dmu_buf_impl_t *db = dr->dr_dbuf; 2636 blkptr_t *obp = &dr->dt.dl.dr_overridden_by; 2637 2638 mutex_enter(&db->db_mtx); 2639 if (!BP_EQUAL(zio->io_bp, obp)) { 2640 if (!BP_IS_HOLE(obp)) 2641 dsl_free(spa_get_dsl(zio->io_spa), zio->io_txg, obp); 2642 arc_release(dr->dt.dl.dr_data, db); 2643 } 2644 mutex_exit(&db->db_mtx); 2645 2646 dbuf_write_done(zio, NULL, db); 2647 } 2648 2649 static void 2650 dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx) 2651 { 2652 dmu_buf_impl_t *db = dr->dr_dbuf; 2653 dnode_t *dn; 2654 objset_t *os; 2655 dmu_buf_impl_t *parent = db->db_parent; 2656 uint64_t txg = tx->tx_txg; 2657 zbookmark_t zb; 2658 zio_prop_t zp; 2659 zio_t *zio; 2660 int wp_flag = 0; 2661 2662 DB_DNODE_ENTER(db); 2663 dn = DB_DNODE(db); 2664 os = dn->dn_objset; 2665 2666 if (db->db_state != DB_NOFILL) { 2667 if (db->db_level > 0 || dn->dn_type == DMU_OT_DNODE) { 2668 /* 2669 * Private object buffers are released here rather 2670 * than in dbuf_dirty() since they are only modified 2671 * in the syncing context and we don't want the 2672 * overhead of making multiple copies of the data. 
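			 * "Release" here means arc_buf_thaw() when the
			 * block pointer is still a hole, and
			 * dbuf_release_bp() otherwise, as done just below.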
2673 */ 2674 if (BP_IS_HOLE(db->db_blkptr)) { 2675 arc_buf_thaw(data); 2676 } else { 2677 dbuf_release_bp(db); 2678 } 2679 } 2680 } 2681 2682 if (parent != dn->dn_dbuf) { 2683 ASSERT(parent && parent->db_data_pending); 2684 ASSERT(db->db_level == parent->db_level-1); 2685 ASSERT(arc_released(parent->db_buf)); 2686 zio = parent->db_data_pending->dr_zio; 2687 } else { 2688 ASSERT((db->db_level == dn->dn_phys->dn_nlevels-1 && 2689 db->db_blkid != DMU_SPILL_BLKID) || 2690 (db->db_blkid == DMU_SPILL_BLKID && db->db_level == 0)); 2691 if (db->db_blkid != DMU_SPILL_BLKID) 2692 ASSERT3P(db->db_blkptr, ==, 2693 &dn->dn_phys->dn_blkptr[db->db_blkid]); 2694 zio = dn->dn_zio; 2695 } 2696 2697 ASSERT(db->db_level == 0 || data == db->db_buf); 2698 ASSERT3U(db->db_blkptr->blk_birth, <=, txg); 2699 ASSERT(zio); 2700 2701 SET_BOOKMARK(&zb, os->os_dsl_dataset ? 2702 os->os_dsl_dataset->ds_object : DMU_META_OBJSET, 2703 db->db.db_object, db->db_level, db->db_blkid); 2704 2705 if (db->db_blkid == DMU_SPILL_BLKID) 2706 wp_flag = WP_SPILL; 2707 wp_flag |= (db->db_state == DB_NOFILL) ? WP_NOFILL : 0; 2708 2709 dmu_write_policy(os, dn, db->db_level, wp_flag, &zp); 2710 DB_DNODE_EXIT(db); 2711 2712 if (db->db_level == 0 && dr->dt.dl.dr_override_state == DR_OVERRIDDEN) { 2713 ASSERT(db->db_state != DB_NOFILL); 2714 dr->dr_zio = zio_write(zio, os->os_spa, txg, 2715 db->db_blkptr, data->b_data, arc_buf_size(data), &zp, 2716 dbuf_write_override_ready, dbuf_write_override_done, dr, 2717 ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb); 2718 mutex_enter(&db->db_mtx); 2719 dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN; 2720 zio_write_override(dr->dr_zio, &dr->dt.dl.dr_overridden_by, 2721 dr->dt.dl.dr_copies); 2722 mutex_exit(&db->db_mtx); 2723 } else if (db->db_state == DB_NOFILL) { 2724 ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF); 2725 dr->dr_zio = zio_write(zio, os->os_spa, txg, 2726 db->db_blkptr, NULL, db->db.db_size, &zp, 2727 dbuf_write_nofill_ready, dbuf_write_nofill_done, db, 2728 ZIO_PRIORITY_ASYNC_WRITE, 2729 ZIO_FLAG_MUSTSUCCEED | ZIO_FLAG_NODATA, &zb); 2730 } else { 2731 ASSERT(arc_released(data)); 2732 dr->dr_zio = arc_write(zio, os->os_spa, txg, 2733 db->db_blkptr, data, DBUF_IS_L2CACHEABLE(db), &zp, 2734 dbuf_write_ready, dbuf_write_done, db, 2735 ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb); 2736 } 2737 } 2738
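
/*
 * Illustrative (non-normative) sketch of the consumer-side hold/rele
 * protocol exported above through the weak dmu_buf_* aliases.  The
 * buffer in the sketch stands for whatever a real caller obtained
 * through the DMU (e.g. a bonus or data buffer hold); it is not an
 * interface defined in this file:
 *
 *	dmu_buf_t *dbp;			already held by some dmu hold routine
 *
 *	dmu_buf_add_ref(dbp, FTAG);	take an extra hold (dbuf_add_ref)
 *	... read or dirty the buffer under the usual DMU/tx rules ...
 *	dmu_buf_rele(dbp, FTAG);	drop it again (dbuf_rele)
 *
 * As noted above dbuf_rele(), the release must not be the caller's only
 * direct or indirect hold on the dnode if the dnode handle will still be
 * referenced afterwards.
 */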