1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 #include <sys/zfs_context.h> 27 #include <sys/dmu.h> 28 #include <sys/dmu_impl.h> 29 #include <sys/dbuf.h> 30 #include <sys/dmu_objset.h> 31 #include <sys/dsl_dataset.h> 32 #include <sys/dsl_dir.h> 33 #include <sys/dmu_tx.h> 34 #include <sys/spa.h> 35 #include <sys/zio.h> 36 #include <sys/dmu_zfetch.h> 37 38 static void dbuf_destroy(dmu_buf_impl_t *db); 39 static int dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx); 40 static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx); 41 static arc_done_func_t dbuf_write_ready; 42 static arc_done_func_t dbuf_write_done; 43 44 /* 45 * Global data structures and functions for the dbuf cache. 46 */ 47 static kmem_cache_t *dbuf_cache; 48 49 /* ARGSUSED */ 50 static int 51 dbuf_cons(void *vdb, void *unused, int kmflag) 52 { 53 dmu_buf_impl_t *db = vdb; 54 bzero(db, sizeof (dmu_buf_impl_t)); 55 56 mutex_init(&db->db_mtx, NULL, MUTEX_DEFAULT, NULL); 57 cv_init(&db->db_changed, NULL, CV_DEFAULT, NULL); 58 refcount_create(&db->db_holds); 59 return (0); 60 } 61 62 /* ARGSUSED */ 63 static void 64 dbuf_dest(void *vdb, void *unused) 65 { 66 dmu_buf_impl_t *db = vdb; 67 mutex_destroy(&db->db_mtx); 68 cv_destroy(&db->db_changed); 69 refcount_destroy(&db->db_holds); 70 } 71 72 /* 73 * dbuf hash table routines 74 */ 75 static dbuf_hash_table_t dbuf_hash_table; 76 77 static uint64_t dbuf_hash_count; 78 79 static uint64_t 80 dbuf_hash(void *os, uint64_t obj, uint8_t lvl, uint64_t blkid) 81 { 82 uintptr_t osv = (uintptr_t)os; 83 uint64_t crc = -1ULL; 84 85 ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY); 86 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (lvl)) & 0xFF]; 87 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (osv >> 6)) & 0xFF]; 88 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 0)) & 0xFF]; 89 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 8)) & 0xFF]; 90 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 0)) & 0xFF]; 91 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 8)) & 0xFF]; 92 93 crc ^= (osv>>14) ^ (obj>>16) ^ (blkid>>16); 94 95 return (crc); 96 } 97 98 #define DBUF_HASH(os, obj, level, blkid) dbuf_hash(os, obj, level, blkid); 99 100 #define DBUF_EQUAL(dbuf, os, obj, level, blkid) \ 101 ((dbuf)->db.db_object == (obj) && \ 102 (dbuf)->db_objset == (os) && \ 103 (dbuf)->db_level == (level) && \ 104 (dbuf)->db_blkid == (blkid)) 105 106 dmu_buf_impl_t * 107 dbuf_find(dnode_t *dn, uint8_t level, uint64_t blkid) 108 { 109 dbuf_hash_table_t *h = &dbuf_hash_table; 110 objset_impl_t *os = dn->dn_objset; 111 uint64_t obj = dn->dn_object; 112 uint64_t hv = DBUF_HASH(os, obj, level, 
blkid); 113 uint64_t idx = hv & h->hash_table_mask; 114 dmu_buf_impl_t *db; 115 116 mutex_enter(DBUF_HASH_MUTEX(h, idx)); 117 for (db = h->hash_table[idx]; db != NULL; db = db->db_hash_next) { 118 if (DBUF_EQUAL(db, os, obj, level, blkid)) { 119 mutex_enter(&db->db_mtx); 120 if (db->db_state != DB_EVICTING) { 121 mutex_exit(DBUF_HASH_MUTEX(h, idx)); 122 return (db); 123 } 124 mutex_exit(&db->db_mtx); 125 } 126 } 127 mutex_exit(DBUF_HASH_MUTEX(h, idx)); 128 return (NULL); 129 } 130 131 /* 132 * Insert an entry into the hash table. If there is already an element 133 * equal to elem in the hash table, then the already existing element 134 * will be returned and the new element will not be inserted. 135 * Otherwise returns NULL. 136 */ 137 static dmu_buf_impl_t * 138 dbuf_hash_insert(dmu_buf_impl_t *db) 139 { 140 dbuf_hash_table_t *h = &dbuf_hash_table; 141 objset_impl_t *os = db->db_objset; 142 uint64_t obj = db->db.db_object; 143 int level = db->db_level; 144 uint64_t blkid = db->db_blkid; 145 uint64_t hv = DBUF_HASH(os, obj, level, blkid); 146 uint64_t idx = hv & h->hash_table_mask; 147 dmu_buf_impl_t *dbf; 148 149 mutex_enter(DBUF_HASH_MUTEX(h, idx)); 150 for (dbf = h->hash_table[idx]; dbf != NULL; dbf = dbf->db_hash_next) { 151 if (DBUF_EQUAL(dbf, os, obj, level, blkid)) { 152 mutex_enter(&dbf->db_mtx); 153 if (dbf->db_state != DB_EVICTING) { 154 mutex_exit(DBUF_HASH_MUTEX(h, idx)); 155 return (dbf); 156 } 157 mutex_exit(&dbf->db_mtx); 158 } 159 } 160 161 mutex_enter(&db->db_mtx); 162 db->db_hash_next = h->hash_table[idx]; 163 h->hash_table[idx] = db; 164 mutex_exit(DBUF_HASH_MUTEX(h, idx)); 165 atomic_add_64(&dbuf_hash_count, 1); 166 167 return (NULL); 168 } 169 170 /* 171 * Remove an entry from the hash table. This operation will 172 * fail if there are any existing holds on the db. 173 */ 174 static void 175 dbuf_hash_remove(dmu_buf_impl_t *db) 176 { 177 dbuf_hash_table_t *h = &dbuf_hash_table; 178 uint64_t hv = DBUF_HASH(db->db_objset, db->db.db_object, 179 db->db_level, db->db_blkid); 180 uint64_t idx = hv & h->hash_table_mask; 181 dmu_buf_impl_t *dbf, **dbp; 182 183 /* 184 * We musn't hold db_mtx to maintin lock ordering: 185 * DBUF_HASH_MUTEX > db_mtx. 
186 */ 187 ASSERT(refcount_is_zero(&db->db_holds)); 188 ASSERT(db->db_state == DB_EVICTING); 189 ASSERT(!MUTEX_HELD(&db->db_mtx)); 190 191 mutex_enter(DBUF_HASH_MUTEX(h, idx)); 192 dbp = &h->hash_table[idx]; 193 while ((dbf = *dbp) != db) { 194 dbp = &dbf->db_hash_next; 195 ASSERT(dbf != NULL); 196 } 197 *dbp = db->db_hash_next; 198 db->db_hash_next = NULL; 199 mutex_exit(DBUF_HASH_MUTEX(h, idx)); 200 atomic_add_64(&dbuf_hash_count, -1); 201 } 202 203 static arc_evict_func_t dbuf_do_evict; 204 205 static void 206 dbuf_evict_user(dmu_buf_impl_t *db) 207 { 208 ASSERT(MUTEX_HELD(&db->db_mtx)); 209 210 if (db->db_level != 0 || db->db_evict_func == NULL) 211 return; 212 213 if (db->db_user_data_ptr_ptr) 214 *db->db_user_data_ptr_ptr = db->db.db_data; 215 db->db_evict_func(&db->db, db->db_user_ptr); 216 db->db_user_ptr = NULL; 217 db->db_user_data_ptr_ptr = NULL; 218 db->db_evict_func = NULL; 219 } 220 221 void 222 dbuf_evict(dmu_buf_impl_t *db) 223 { 224 ASSERT(MUTEX_HELD(&db->db_mtx)); 225 ASSERT(db->db_buf == NULL); 226 ASSERT(db->db_data_pending == NULL); 227 228 dbuf_clear(db); 229 dbuf_destroy(db); 230 } 231 232 void 233 dbuf_init(void) 234 { 235 uint64_t hsize = 1ULL << 16; 236 dbuf_hash_table_t *h = &dbuf_hash_table; 237 int i; 238 239 /* 240 * The hash table is big enough to fill all of physical memory 241 * with an average 4K block size. The table will take up 242 * totalmem*sizeof(void*)/4K (i.e. 2MB/GB with 8-byte pointers). 243 */ 244 while (hsize * 4096 < physmem * PAGESIZE) 245 hsize <<= 1; 246 247 retry: 248 h->hash_table_mask = hsize - 1; 249 h->hash_table = kmem_zalloc(hsize * sizeof (void *), KM_NOSLEEP); 250 if (h->hash_table == NULL) { 251 /* XXX - we should really return an error instead of assert */ 252 ASSERT(hsize > (1ULL << 10)); 253 hsize >>= 1; 254 goto retry; 255 } 256 257 dbuf_cache = kmem_cache_create("dmu_buf_impl_t", 258 sizeof (dmu_buf_impl_t), 259 0, dbuf_cons, dbuf_dest, NULL, NULL, NULL, 0); 260 261 for (i = 0; i < DBUF_MUTEXES; i++) 262 mutex_init(&h->hash_mutexes[i], NULL, MUTEX_DEFAULT, NULL); 263 } 264 265 void 266 dbuf_fini(void) 267 { 268 dbuf_hash_table_t *h = &dbuf_hash_table; 269 int i; 270 271 for (i = 0; i < DBUF_MUTEXES; i++) 272 mutex_destroy(&h->hash_mutexes[i]); 273 kmem_free(h->hash_table, (h->hash_table_mask + 1) * sizeof (void *)); 274 kmem_cache_destroy(dbuf_cache); 275 } 276 277 /* 278 * Other stuff. 279 */ 280 281 #ifdef ZFS_DEBUG 282 static void 283 dbuf_verify(dmu_buf_impl_t *db) 284 { 285 dnode_t *dn = db->db_dnode; 286 287 ASSERT(MUTEX_HELD(&db->db_mtx)); 288 289 if (!(zfs_flags & ZFS_DEBUG_DBUF_VERIFY)) 290 return; 291 292 ASSERT(db->db_objset != NULL); 293 if (dn == NULL) { 294 ASSERT(db->db_parent == NULL); 295 ASSERT(db->db_blkptr == NULL); 296 } else { 297 ASSERT3U(db->db.db_object, ==, dn->dn_object); 298 ASSERT3P(db->db_objset, ==, dn->dn_objset); 299 ASSERT3U(db->db_level, <, dn->dn_nlevels); 300 ASSERT(db->db_blkid == DB_BONUS_BLKID || 301 list_head(&dn->dn_dbufs)); 302 } 303 if (db->db_blkid == DB_BONUS_BLKID) { 304 ASSERT(dn != NULL); 305 ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen); 306 ASSERT3U(db->db.db_offset, ==, DB_BONUS_BLKID); 307 } else { 308 ASSERT3U(db->db.db_offset, ==, db->db_blkid * db->db.db_size); 309 } 310 311 /* 312 * We can't assert that db_size matches dn_datablksz because it 313 * can be momentarily different when another thread is doing 314 * dnode_set_blksz(). 
315 */ 316 if (db->db_level == 0 && db->db.db_object == DMU_META_DNODE_OBJECT) { 317 dbuf_dirty_record_t *dr = db->db_data_pending; 318 /* 319 * It should only be modified in syncing context, so 320 * make sure we only have one copy of the data. 321 */ 322 ASSERT(dr == NULL || dr->dt.dl.dr_data == db->db_buf); 323 } 324 325 /* verify db->db_blkptr */ 326 if (db->db_blkptr) { 327 if (db->db_parent == dn->dn_dbuf) { 328 /* db is pointed to by the dnode */ 329 /* ASSERT3U(db->db_blkid, <, dn->dn_nblkptr); */ 330 if (db->db.db_object == DMU_META_DNODE_OBJECT) 331 ASSERT(db->db_parent == NULL); 332 else 333 ASSERT(db->db_parent != NULL); 334 ASSERT3P(db->db_blkptr, ==, 335 &dn->dn_phys->dn_blkptr[db->db_blkid]); 336 } else { 337 /* db is pointed to by an indirect block */ 338 int epb = db->db_parent->db.db_size >> SPA_BLKPTRSHIFT; 339 ASSERT3U(db->db_parent->db_level, ==, db->db_level+1); 340 ASSERT3U(db->db_parent->db.db_object, ==, 341 db->db.db_object); 342 /* 343 * dnode_grow_indblksz() can make this fail if we don't 344 * have the struct_rwlock. XXX indblksz no longer 345 * grows. safe to do this now? 346 */ 347 if (RW_WRITE_HELD(&db->db_dnode->dn_struct_rwlock)) { 348 ASSERT3P(db->db_blkptr, ==, 349 ((blkptr_t *)db->db_parent->db.db_data + 350 db->db_blkid % epb)); 351 } 352 } 353 } 354 if ((db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr)) && 355 db->db.db_data && db->db_blkid != DB_BONUS_BLKID && 356 db->db_state != DB_FILL && !dn->dn_free_txg) { 357 /* 358 * If the blkptr isn't set but they have nonzero data, 359 * it had better be dirty, otherwise we'll lose that 360 * data when we evict this buffer. 361 */ 362 if (db->db_dirtycnt == 0) { 363 uint64_t *buf = db->db.db_data; 364 int i; 365 366 for (i = 0; i < db->db.db_size >> 3; i++) { 367 ASSERT(buf[i] == 0); 368 } 369 } 370 } 371 } 372 #endif 373 374 static void 375 dbuf_update_data(dmu_buf_impl_t *db) 376 { 377 ASSERT(MUTEX_HELD(&db->db_mtx)); 378 if (db->db_level == 0 && db->db_user_data_ptr_ptr) { 379 ASSERT(!refcount_is_zero(&db->db_holds)); 380 *db->db_user_data_ptr_ptr = db->db.db_data; 381 } 382 } 383 384 static void 385 dbuf_set_data(dmu_buf_impl_t *db, arc_buf_t *buf) 386 { 387 ASSERT(MUTEX_HELD(&db->db_mtx)); 388 ASSERT(db->db_buf == NULL || !arc_has_callback(db->db_buf)); 389 db->db_buf = buf; 390 if (buf != NULL) { 391 ASSERT(buf->b_data != NULL); 392 db->db.db_data = buf->b_data; 393 if (!arc_released(buf)) 394 arc_set_callback(buf, dbuf_do_evict, db); 395 dbuf_update_data(db); 396 } else { 397 dbuf_evict_user(db); 398 db->db.db_data = NULL; 399 db->db_state = DB_UNCACHED; 400 } 401 } 402 403 uint64_t 404 dbuf_whichblock(dnode_t *dn, uint64_t offset) 405 { 406 if (dn->dn_datablkshift) { 407 return (offset >> dn->dn_datablkshift); 408 } else { 409 ASSERT3U(offset, <, dn->dn_datablksz); 410 return (0); 411 } 412 } 413 414 static void 415 dbuf_read_done(zio_t *zio, arc_buf_t *buf, void *vdb) 416 { 417 dmu_buf_impl_t *db = vdb; 418 419 mutex_enter(&db->db_mtx); 420 ASSERT3U(db->db_state, ==, DB_READ); 421 /* 422 * All reads are synchronous, so we must have a hold on the dbuf 423 */ 424 ASSERT(refcount_count(&db->db_holds) > 0); 425 ASSERT(db->db_buf == NULL); 426 ASSERT(db->db.db_data == NULL); 427 if (db->db_level == 0 && db->db_freed_in_flight) { 428 /* we were freed in flight; disregard any error */ 429 arc_release(buf, db); 430 bzero(buf->b_data, db->db.db_size); 431 arc_buf_freeze(buf); 432 db->db_freed_in_flight = FALSE; 433 dbuf_set_data(db, buf); 434 db->db_state = DB_CACHED; 435 } else if (zio == NULL || 
zio->io_error == 0) { 436 dbuf_set_data(db, buf); 437 db->db_state = DB_CACHED; 438 } else { 439 ASSERT(db->db_blkid != DB_BONUS_BLKID); 440 ASSERT3P(db->db_buf, ==, NULL); 441 VERIFY(arc_buf_remove_ref(buf, db) == 1); 442 db->db_state = DB_UNCACHED; 443 } 444 cv_broadcast(&db->db_changed); 445 mutex_exit(&db->db_mtx); 446 dbuf_rele(db, NULL); 447 } 448 449 static void 450 dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags) 451 { 452 dnode_t *dn = db->db_dnode; 453 zbookmark_t zb; 454 uint32_t aflags = ARC_NOWAIT; 455 arc_buf_t *pbuf; 456 457 ASSERT(!refcount_is_zero(&db->db_holds)); 458 /* We need the struct_rwlock to prevent db_blkptr from changing. */ 459 ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); 460 ASSERT(MUTEX_HELD(&db->db_mtx)); 461 ASSERT(db->db_state == DB_UNCACHED); 462 ASSERT(db->db_buf == NULL); 463 464 if (db->db_blkid == DB_BONUS_BLKID) { 465 int bonuslen = dn->dn_bonuslen; 466 467 ASSERT3U(bonuslen, <=, db->db.db_size); 468 db->db.db_data = zio_buf_alloc(DN_MAX_BONUSLEN); 469 arc_space_consume(DN_MAX_BONUSLEN); 470 if (bonuslen < DN_MAX_BONUSLEN) 471 bzero(db->db.db_data, DN_MAX_BONUSLEN); 472 bcopy(DN_BONUS(dn->dn_phys), db->db.db_data, 473 bonuslen); 474 dbuf_update_data(db); 475 db->db_state = DB_CACHED; 476 mutex_exit(&db->db_mtx); 477 return; 478 } 479 480 /* 481 * Recheck BP_IS_HOLE() after dnode_block_freed() in case dnode_sync() 482 * processes the delete record and clears the bp while we are waiting 483 * for the dn_mtx (resulting in a "no" from block_freed). 484 */ 485 if (db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr) || 486 (db->db_level == 0 && (dnode_block_freed(dn, db->db_blkid) || 487 BP_IS_HOLE(db->db_blkptr)))) { 488 arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); 489 490 dbuf_set_data(db, arc_buf_alloc(dn->dn_objset->os_spa, 491 db->db.db_size, db, type)); 492 bzero(db->db.db_data, db->db.db_size); 493 db->db_state = DB_CACHED; 494 *flags |= DB_RF_CACHED; 495 mutex_exit(&db->db_mtx); 496 return; 497 } 498 499 db->db_state = DB_READ; 500 mutex_exit(&db->db_mtx); 501 502 if (DBUF_IS_L2CACHEABLE(db)) 503 aflags |= ARC_L2CACHE; 504 505 zb.zb_objset = db->db_objset->os_dsl_dataset ? 506 db->db_objset->os_dsl_dataset->ds_object : 0; 507 zb.zb_object = db->db.db_object; 508 zb.zb_level = db->db_level; 509 zb.zb_blkid = db->db_blkid; 510 511 dbuf_add_ref(db, NULL); 512 /* ZIO_FLAG_CANFAIL callers have to check the parent zio's error */ 513 514 if (db->db_parent) 515 pbuf = db->db_parent->db_buf; 516 else 517 pbuf = db->db_objset->os_phys_buf; 518 519 (void) arc_read(zio, dn->dn_objset->os_spa, db->db_blkptr, pbuf, 520 dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ, 521 (*flags & DB_RF_CANFAIL) ? ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED, 522 &aflags, &zb); 523 if (aflags & ARC_CACHED) 524 *flags |= DB_RF_CACHED; 525 } 526 527 int 528 dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags) 529 { 530 int err = 0; 531 int havepzio = (zio != NULL); 532 int prefetch; 533 534 /* 535 * We don't have to hold the mutex to check db_state because it 536 * can't be freed while we have a hold on the buffer. 
537 */ 538 ASSERT(!refcount_is_zero(&db->db_holds)); 539 540 if ((flags & DB_RF_HAVESTRUCT) == 0) 541 rw_enter(&db->db_dnode->dn_struct_rwlock, RW_READER); 542 543 prefetch = db->db_level == 0 && db->db_blkid != DB_BONUS_BLKID && 544 (flags & DB_RF_NOPREFETCH) == 0 && db->db_dnode != NULL && 545 DBUF_IS_CACHEABLE(db); 546 547 mutex_enter(&db->db_mtx); 548 if (db->db_state == DB_CACHED) { 549 mutex_exit(&db->db_mtx); 550 if (prefetch) 551 dmu_zfetch(&db->db_dnode->dn_zfetch, db->db.db_offset, 552 db->db.db_size, TRUE); 553 if ((flags & DB_RF_HAVESTRUCT) == 0) 554 rw_exit(&db->db_dnode->dn_struct_rwlock); 555 } else if (db->db_state == DB_UNCACHED) { 556 if (zio == NULL) { 557 zio = zio_root(db->db_dnode->dn_objset->os_spa, 558 NULL, NULL, ZIO_FLAG_CANFAIL); 559 } 560 dbuf_read_impl(db, zio, &flags); 561 562 /* dbuf_read_impl has dropped db_mtx for us */ 563 564 if (prefetch) 565 dmu_zfetch(&db->db_dnode->dn_zfetch, db->db.db_offset, 566 db->db.db_size, flags & DB_RF_CACHED); 567 568 if ((flags & DB_RF_HAVESTRUCT) == 0) 569 rw_exit(&db->db_dnode->dn_struct_rwlock); 570 571 if (!havepzio) 572 err = zio_wait(zio); 573 } else { 574 mutex_exit(&db->db_mtx); 575 if (prefetch) 576 dmu_zfetch(&db->db_dnode->dn_zfetch, db->db.db_offset, 577 db->db.db_size, TRUE); 578 if ((flags & DB_RF_HAVESTRUCT) == 0) 579 rw_exit(&db->db_dnode->dn_struct_rwlock); 580 581 mutex_enter(&db->db_mtx); 582 if ((flags & DB_RF_NEVERWAIT) == 0) { 583 while (db->db_state == DB_READ || 584 db->db_state == DB_FILL) { 585 ASSERT(db->db_state == DB_READ || 586 (flags & DB_RF_HAVESTRUCT) == 0); 587 cv_wait(&db->db_changed, &db->db_mtx); 588 } 589 if (db->db_state == DB_UNCACHED) 590 err = EIO; 591 } 592 mutex_exit(&db->db_mtx); 593 } 594 595 ASSERT(err || havepzio || db->db_state == DB_CACHED); 596 return (err); 597 } 598 599 static void 600 dbuf_noread(dmu_buf_impl_t *db) 601 { 602 ASSERT(!refcount_is_zero(&db->db_holds)); 603 ASSERT(db->db_blkid != DB_BONUS_BLKID); 604 mutex_enter(&db->db_mtx); 605 while (db->db_state == DB_READ || db->db_state == DB_FILL) 606 cv_wait(&db->db_changed, &db->db_mtx); 607 if (db->db_state == DB_UNCACHED) { 608 arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); 609 610 ASSERT(db->db_buf == NULL); 611 ASSERT(db->db.db_data == NULL); 612 dbuf_set_data(db, arc_buf_alloc(db->db_dnode->dn_objset->os_spa, 613 db->db.db_size, db, type)); 614 db->db_state = DB_FILL; 615 } else { 616 ASSERT3U(db->db_state, ==, DB_CACHED); 617 } 618 mutex_exit(&db->db_mtx); 619 } 620 621 /* 622 * This is our just-in-time copy function. It makes a copy of 623 * buffers, that have been modified in a previous transaction 624 * group, before we modify them in the current active group. 625 * 626 * This function is used in two places: when we are dirtying a 627 * buffer for the first time in a txg, and when we are freeing 628 * a range in a dnode that includes this buffer. 629 * 630 * Note that when we are called from dbuf_free_range() we do 631 * not put a hold on the buffer, we just traverse the active 632 * dbuf list for the dnode. 633 */ 634 static void 635 dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg) 636 { 637 dbuf_dirty_record_t *dr = db->db_last_dirty; 638 639 ASSERT(MUTEX_HELD(&db->db_mtx)); 640 ASSERT(db->db.db_data != NULL); 641 ASSERT(db->db_level == 0); 642 ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT); 643 644 if (dr == NULL || 645 (dr->dt.dl.dr_data != 646 ((db->db_blkid == DB_BONUS_BLKID) ? 
db->db.db_data : db->db_buf))) 647 return; 648 649 /* 650 * If the last dirty record for this dbuf has not yet synced 651 * and its referencing the dbuf data, either: 652 * reset the reference to point to a new copy, 653 * or (if there a no active holders) 654 * just null out the current db_data pointer. 655 */ 656 ASSERT(dr->dr_txg >= txg - 2); 657 if (db->db_blkid == DB_BONUS_BLKID) { 658 /* Note that the data bufs here are zio_bufs */ 659 dr->dt.dl.dr_data = zio_buf_alloc(DN_MAX_BONUSLEN); 660 arc_space_consume(DN_MAX_BONUSLEN); 661 bcopy(db->db.db_data, dr->dt.dl.dr_data, DN_MAX_BONUSLEN); 662 } else if (refcount_count(&db->db_holds) > db->db_dirtycnt) { 663 int size = db->db.db_size; 664 arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); 665 dr->dt.dl.dr_data = arc_buf_alloc( 666 db->db_dnode->dn_objset->os_spa, size, db, type); 667 bcopy(db->db.db_data, dr->dt.dl.dr_data->b_data, size); 668 } else { 669 dbuf_set_data(db, NULL); 670 } 671 } 672 673 void 674 dbuf_unoverride(dbuf_dirty_record_t *dr) 675 { 676 dmu_buf_impl_t *db = dr->dr_dbuf; 677 uint64_t txg = dr->dr_txg; 678 679 ASSERT(MUTEX_HELD(&db->db_mtx)); 680 ASSERT(dr->dt.dl.dr_override_state != DR_IN_DMU_SYNC); 681 ASSERT(db->db_level == 0); 682 683 if (db->db_blkid == DB_BONUS_BLKID || 684 dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN) 685 return; 686 687 /* free this block */ 688 if (!BP_IS_HOLE(&dr->dt.dl.dr_overridden_by)) { 689 /* XXX can get silent EIO here */ 690 (void) dsl_free(NULL, 691 spa_get_dsl(db->db_dnode->dn_objset->os_spa), 692 txg, &dr->dt.dl.dr_overridden_by, NULL, NULL, ARC_WAIT); 693 } 694 dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN; 695 /* 696 * Release the already-written buffer, so we leave it in 697 * a consistent dirty state. Note that all callers are 698 * modifying the buffer, so they will immediately do 699 * another (redundant) arc_release(). Therefore, leave 700 * the buf thawed to save the effort of freezing & 701 * immediately re-thawing it. 702 */ 703 arc_release(dr->dt.dl.dr_data, db); 704 } 705 706 /* 707 * Evict (if its unreferenced) or clear (if its referenced) any level-0 708 * data blocks in the free range, so that any future readers will find 709 * empty blocks. Also, if we happen accross any level-1 dbufs in the 710 * range that have not already been marked dirty, mark them dirty so 711 * they stay in memory. 
712 */ 713 void 714 dbuf_free_range(dnode_t *dn, uint64_t start, uint64_t end, dmu_tx_t *tx) 715 { 716 dmu_buf_impl_t *db, *db_next; 717 uint64_t txg = tx->tx_txg; 718 int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; 719 uint64_t first_l1 = start >> epbs; 720 uint64_t last_l1 = end >> epbs; 721 722 if (end > dn->dn_maxblkid) { 723 end = dn->dn_maxblkid; 724 last_l1 = end >> epbs; 725 } 726 dprintf_dnode(dn, "start=%llu end=%llu\n", start, end); 727 mutex_enter(&dn->dn_dbufs_mtx); 728 for (db = list_head(&dn->dn_dbufs); db; db = db_next) { 729 db_next = list_next(&dn->dn_dbufs, db); 730 ASSERT(db->db_blkid != DB_BONUS_BLKID); 731 732 if (db->db_level == 1 && 733 db->db_blkid >= first_l1 && db->db_blkid <= last_l1) { 734 mutex_enter(&db->db_mtx); 735 if (db->db_last_dirty && 736 db->db_last_dirty->dr_txg < txg) { 737 dbuf_add_ref(db, FTAG); 738 mutex_exit(&db->db_mtx); 739 dbuf_will_dirty(db, tx); 740 dbuf_rele(db, FTAG); 741 } else { 742 mutex_exit(&db->db_mtx); 743 } 744 } 745 746 if (db->db_level != 0) 747 continue; 748 dprintf_dbuf(db, "found buf %s\n", ""); 749 if (db->db_blkid < start || db->db_blkid > end) 750 continue; 751 752 /* found a level 0 buffer in the range */ 753 if (dbuf_undirty(db, tx)) 754 continue; 755 756 mutex_enter(&db->db_mtx); 757 if (db->db_state == DB_UNCACHED || 758 db->db_state == DB_EVICTING) { 759 ASSERT(db->db.db_data == NULL); 760 mutex_exit(&db->db_mtx); 761 continue; 762 } 763 if (db->db_state == DB_READ || db->db_state == DB_FILL) { 764 /* will be handled in dbuf_read_done or dbuf_rele */ 765 db->db_freed_in_flight = TRUE; 766 mutex_exit(&db->db_mtx); 767 continue; 768 } 769 if (refcount_count(&db->db_holds) == 0) { 770 ASSERT(db->db_buf); 771 dbuf_clear(db); 772 continue; 773 } 774 /* The dbuf is referenced */ 775 776 if (db->db_last_dirty != NULL) { 777 dbuf_dirty_record_t *dr = db->db_last_dirty; 778 779 if (dr->dr_txg == txg) { 780 /* 781 * This buffer is "in-use", re-adjust the file 782 * size to reflect that this buffer may 783 * contain new data when we sync. 784 */ 785 if (db->db_blkid > dn->dn_maxblkid) 786 dn->dn_maxblkid = db->db_blkid; 787 dbuf_unoverride(dr); 788 } else { 789 /* 790 * This dbuf is not dirty in the open context. 791 * Either uncache it (if its not referenced in 792 * the open context) or reset its contents to 793 * empty. 794 */ 795 dbuf_fix_old_data(db, txg); 796 } 797 } 798 /* clear the contents if its cached */ 799 if (db->db_state == DB_CACHED) { 800 ASSERT(db->db.db_data != NULL); 801 arc_release(db->db_buf, db); 802 bzero(db->db.db_data, db->db.db_size); 803 arc_buf_freeze(db->db_buf); 804 } 805 806 mutex_exit(&db->db_mtx); 807 } 808 mutex_exit(&dn->dn_dbufs_mtx); 809 } 810 811 static int 812 dbuf_block_freeable(dmu_buf_impl_t *db) 813 { 814 dsl_dataset_t *ds = db->db_objset->os_dsl_dataset; 815 uint64_t birth_txg = 0; 816 817 /* 818 * We don't need any locking to protect db_blkptr: 819 * If it's syncing, then db_last_dirty will be set 820 * so we'll ignore db_blkptr. 
821 */ 822 ASSERT(MUTEX_HELD(&db->db_mtx)); 823 if (db->db_last_dirty) 824 birth_txg = db->db_last_dirty->dr_txg; 825 else if (db->db_blkptr) 826 birth_txg = db->db_blkptr->blk_birth; 827 828 /* If we don't exist or are in a snapshot, we can't be freed */ 829 if (birth_txg) 830 return (ds == NULL || 831 dsl_dataset_block_freeable(ds, birth_txg)); 832 else 833 return (FALSE); 834 } 835 836 void 837 dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx) 838 { 839 arc_buf_t *buf, *obuf; 840 int osize = db->db.db_size; 841 arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); 842 843 ASSERT(db->db_blkid != DB_BONUS_BLKID); 844 845 /* XXX does *this* func really need the lock? */ 846 ASSERT(RW_WRITE_HELD(&db->db_dnode->dn_struct_rwlock)); 847 848 /* 849 * This call to dbuf_will_dirty() with the dn_struct_rwlock held 850 * is OK, because there can be no other references to the db 851 * when we are changing its size, so no concurrent DB_FILL can 852 * be happening. 853 */ 854 /* 855 * XXX we should be doing a dbuf_read, checking the return 856 * value and returning that up to our callers 857 */ 858 dbuf_will_dirty(db, tx); 859 860 /* create the data buffer for the new block */ 861 buf = arc_buf_alloc(db->db_dnode->dn_objset->os_spa, size, db, type); 862 863 /* copy old block data to the new block */ 864 obuf = db->db_buf; 865 bcopy(obuf->b_data, buf->b_data, MIN(osize, size)); 866 /* zero the remainder */ 867 if (size > osize) 868 bzero((uint8_t *)buf->b_data + osize, size - osize); 869 870 mutex_enter(&db->db_mtx); 871 dbuf_set_data(db, buf); 872 VERIFY(arc_buf_remove_ref(obuf, db) == 1); 873 db->db.db_size = size; 874 875 if (db->db_level == 0) { 876 ASSERT3U(db->db_last_dirty->dr_txg, ==, tx->tx_txg); 877 db->db_last_dirty->dt.dl.dr_data = buf; 878 } 879 mutex_exit(&db->db_mtx); 880 881 dnode_willuse_space(db->db_dnode, size-osize, tx); 882 } 883 884 dbuf_dirty_record_t * 885 dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) 886 { 887 dnode_t *dn = db->db_dnode; 888 objset_impl_t *os = dn->dn_objset; 889 dbuf_dirty_record_t **drp, *dr; 890 int drop_struct_lock = FALSE; 891 boolean_t do_free_accounting = B_FALSE; 892 int txgoff = tx->tx_txg & TXG_MASK; 893 894 ASSERT(tx->tx_txg != 0); 895 ASSERT(!refcount_is_zero(&db->db_holds)); 896 DMU_TX_DIRTY_BUF(tx, db); 897 898 /* 899 * Shouldn't dirty a regular buffer in syncing context. Private 900 * objects may be dirtied in syncing context, but only if they 901 * were already pre-dirtied in open context. 902 * XXX We may want to prohibit dirtying in syncing context even 903 * if they did pre-dirty. 904 */ 905 ASSERT(!dmu_tx_is_syncing(tx) || 906 BP_IS_HOLE(dn->dn_objset->os_rootbp) || 907 dn->dn_object == DMU_META_DNODE_OBJECT || 908 dn->dn_objset->os_dsl_dataset == NULL || 909 dsl_dir_is_private(dn->dn_objset->os_dsl_dataset->ds_dir)); 910 911 /* 912 * We make this assert for private objects as well, but after we 913 * check if we're already dirty. They are allowed to re-dirty 914 * in syncing context. 915 */ 916 ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT || 917 dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx == 918 (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN)); 919 920 mutex_enter(&db->db_mtx); 921 /* 922 * XXX make this true for indirects too? The problem is that 923 * transactions created with dmu_tx_create_assigned() from 924 * syncing context don't bother holding ahead. 
925 */ 926 ASSERT(db->db_level != 0 || 927 db->db_state == DB_CACHED || db->db_state == DB_FILL); 928 929 mutex_enter(&dn->dn_mtx); 930 /* 931 * Don't set dirtyctx to SYNC if we're just modifying this as we 932 * initialize the objset. 933 */ 934 if (dn->dn_dirtyctx == DN_UNDIRTIED && 935 !BP_IS_HOLE(dn->dn_objset->os_rootbp)) { 936 dn->dn_dirtyctx = 937 (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN); 938 ASSERT(dn->dn_dirtyctx_firstset == NULL); 939 dn->dn_dirtyctx_firstset = kmem_alloc(1, KM_SLEEP); 940 } 941 mutex_exit(&dn->dn_mtx); 942 943 /* 944 * If this buffer is already dirty, we're done. 945 */ 946 drp = &db->db_last_dirty; 947 ASSERT(*drp == NULL || (*drp)->dr_txg <= tx->tx_txg || 948 db->db.db_object == DMU_META_DNODE_OBJECT); 949 while ((dr = *drp) != NULL && dr->dr_txg > tx->tx_txg) 950 drp = &dr->dr_next; 951 if (dr && dr->dr_txg == tx->tx_txg) { 952 if (db->db_level == 0 && db->db_blkid != DB_BONUS_BLKID) { 953 /* 954 * If this buffer has already been written out, 955 * we now need to reset its state. 956 */ 957 dbuf_unoverride(dr); 958 if (db->db.db_object != DMU_META_DNODE_OBJECT) 959 arc_buf_thaw(db->db_buf); 960 } 961 mutex_exit(&db->db_mtx); 962 return (dr); 963 } 964 965 /* 966 * Only valid if not already dirty. 967 */ 968 ASSERT(dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx == 969 (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN)); 970 971 ASSERT3U(dn->dn_nlevels, >, db->db_level); 972 ASSERT((dn->dn_phys->dn_nlevels == 0 && db->db_level == 0) || 973 dn->dn_phys->dn_nlevels > db->db_level || 974 dn->dn_next_nlevels[txgoff] > db->db_level || 975 dn->dn_next_nlevels[(tx->tx_txg-1) & TXG_MASK] > db->db_level || 976 dn->dn_next_nlevels[(tx->tx_txg-2) & TXG_MASK] > db->db_level); 977 978 /* 979 * We should only be dirtying in syncing context if it's the 980 * mos, a spa os, or we're initializing the os. However, we are 981 * allowed to dirty in syncing context provided we already 982 * dirtied it in open context. Hence we must make this 983 * assertion only if we're not already dirty. 984 */ 985 ASSERT(!dmu_tx_is_syncing(tx) || 986 os->os_dsl_dataset == NULL || 987 !dsl_dir_is_private(os->os_dsl_dataset->ds_dir) || 988 !BP_IS_HOLE(os->os_rootbp)); 989 ASSERT(db->db.db_size != 0); 990 991 dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size); 992 993 if (db->db_blkid != DB_BONUS_BLKID) { 994 /* 995 * Update the accounting. 996 * Note: we delay "free accounting" until after we drop 997 * the db_mtx. This keeps us from grabbing other locks 998 * (and possibly deadlocking) in bp_get_dasize() while 999 * also holding the db_mtx. 1000 */ 1001 dnode_willuse_space(dn, db->db.db_size, tx); 1002 do_free_accounting = dbuf_block_freeable(db); 1003 } 1004 1005 /* 1006 * If this buffer is dirty in an old transaction group we need 1007 * to make a copy of it so that the changes we make in this 1008 * transaction group won't leak out when we sync the older txg. 1009 */ 1010 dr = kmem_zalloc(sizeof (dbuf_dirty_record_t), KM_SLEEP); 1011 if (db->db_level == 0) { 1012 void *data_old = db->db_buf; 1013 1014 if (db->db_blkid == DB_BONUS_BLKID) { 1015 dbuf_fix_old_data(db, tx->tx_txg); 1016 data_old = db->db.db_data; 1017 } else if (db->db.db_object != DMU_META_DNODE_OBJECT) { 1018 /* 1019 * Release the data buffer from the cache so that we 1020 * can modify it without impacting possible other users 1021 * of this cached data block. 
Note that indirect 1022 * blocks and private objects are not released until the 1023 * syncing state (since they are only modified then). 1024 */ 1025 arc_release(db->db_buf, db); 1026 dbuf_fix_old_data(db, tx->tx_txg); 1027 data_old = db->db_buf; 1028 } 1029 ASSERT(data_old != NULL); 1030 dr->dt.dl.dr_data = data_old; 1031 } else { 1032 mutex_init(&dr->dt.di.dr_mtx, NULL, MUTEX_DEFAULT, NULL); 1033 list_create(&dr->dt.di.dr_children, 1034 sizeof (dbuf_dirty_record_t), 1035 offsetof(dbuf_dirty_record_t, dr_dirty_node)); 1036 } 1037 dr->dr_dbuf = db; 1038 dr->dr_txg = tx->tx_txg; 1039 dr->dr_next = *drp; 1040 *drp = dr; 1041 1042 /* 1043 * We could have been freed_in_flight between the dbuf_noread 1044 * and dbuf_dirty. We win, as though the dbuf_noread() had 1045 * happened after the free. 1046 */ 1047 if (db->db_level == 0 && db->db_blkid != DB_BONUS_BLKID) { 1048 mutex_enter(&dn->dn_mtx); 1049 dnode_clear_range(dn, db->db_blkid, 1, tx); 1050 mutex_exit(&dn->dn_mtx); 1051 db->db_freed_in_flight = FALSE; 1052 } 1053 1054 /* 1055 * This buffer is now part of this txg 1056 */ 1057 dbuf_add_ref(db, (void *)(uintptr_t)tx->tx_txg); 1058 db->db_dirtycnt += 1; 1059 ASSERT3U(db->db_dirtycnt, <=, 3); 1060 1061 mutex_exit(&db->db_mtx); 1062 1063 if (db->db_blkid == DB_BONUS_BLKID) { 1064 mutex_enter(&dn->dn_mtx); 1065 ASSERT(!list_link_active(&dr->dr_dirty_node)); 1066 list_insert_tail(&dn->dn_dirty_records[txgoff], dr); 1067 mutex_exit(&dn->dn_mtx); 1068 dnode_setdirty(dn, tx); 1069 return (dr); 1070 } else if (do_free_accounting) { 1071 blkptr_t *bp = db->db_blkptr; 1072 int64_t willfree = (bp && !BP_IS_HOLE(bp)) ? 1073 bp_get_dasize(os->os_spa, bp) : db->db.db_size; 1074 /* 1075 * This is only a guess -- if the dbuf is dirty 1076 * in a previous txg, we don't know how much 1077 * space it will use on disk yet. We should 1078 * really have the struct_rwlock to access 1079 * db_blkptr, but since this is just a guess, 1080 * it's OK if we get an odd answer. 
1081 */ 1082 dnode_willuse_space(dn, -willfree, tx); 1083 } 1084 1085 if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) { 1086 rw_enter(&dn->dn_struct_rwlock, RW_READER); 1087 drop_struct_lock = TRUE; 1088 } 1089 1090 if (db->db_level == 0) { 1091 dnode_new_blkid(dn, db->db_blkid, tx, drop_struct_lock); 1092 ASSERT(dn->dn_maxblkid >= db->db_blkid); 1093 } 1094 1095 if (db->db_level+1 < dn->dn_nlevels) { 1096 dmu_buf_impl_t *parent = db->db_parent; 1097 dbuf_dirty_record_t *di; 1098 int parent_held = FALSE; 1099 1100 if (db->db_parent == NULL || db->db_parent == dn->dn_dbuf) { 1101 int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; 1102 1103 parent = dbuf_hold_level(dn, db->db_level+1, 1104 db->db_blkid >> epbs, FTAG); 1105 parent_held = TRUE; 1106 } 1107 if (drop_struct_lock) 1108 rw_exit(&dn->dn_struct_rwlock); 1109 ASSERT3U(db->db_level+1, ==, parent->db_level); 1110 di = dbuf_dirty(parent, tx); 1111 if (parent_held) 1112 dbuf_rele(parent, FTAG); 1113 1114 mutex_enter(&db->db_mtx); 1115 /* possible race with dbuf_undirty() */ 1116 if (db->db_last_dirty == dr || 1117 dn->dn_object == DMU_META_DNODE_OBJECT) { 1118 mutex_enter(&di->dt.di.dr_mtx); 1119 ASSERT3U(di->dr_txg, ==, tx->tx_txg); 1120 ASSERT(!list_link_active(&dr->dr_dirty_node)); 1121 list_insert_tail(&di->dt.di.dr_children, dr); 1122 mutex_exit(&di->dt.di.dr_mtx); 1123 dr->dr_parent = di; 1124 } 1125 mutex_exit(&db->db_mtx); 1126 } else { 1127 ASSERT(db->db_level+1 == dn->dn_nlevels); 1128 ASSERT(db->db_blkid < dn->dn_nblkptr); 1129 ASSERT(db->db_parent == NULL || 1130 db->db_parent == db->db_dnode->dn_dbuf); 1131 mutex_enter(&dn->dn_mtx); 1132 ASSERT(!list_link_active(&dr->dr_dirty_node)); 1133 list_insert_tail(&dn->dn_dirty_records[txgoff], dr); 1134 mutex_exit(&dn->dn_mtx); 1135 if (drop_struct_lock) 1136 rw_exit(&dn->dn_struct_rwlock); 1137 } 1138 1139 dnode_setdirty(dn, tx); 1140 return (dr); 1141 } 1142 1143 static int 1144 dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) 1145 { 1146 dnode_t *dn = db->db_dnode; 1147 uint64_t txg = tx->tx_txg; 1148 dbuf_dirty_record_t *dr, **drp; 1149 1150 ASSERT(txg != 0); 1151 ASSERT(db->db_blkid != DB_BONUS_BLKID); 1152 1153 mutex_enter(&db->db_mtx); 1154 1155 /* 1156 * If this buffer is not dirty, we're done. 1157 */ 1158 for (drp = &db->db_last_dirty; (dr = *drp) != NULL; drp = &dr->dr_next) 1159 if (dr->dr_txg <= txg) 1160 break; 1161 if (dr == NULL || dr->dr_txg < txg) { 1162 mutex_exit(&db->db_mtx); 1163 return (0); 1164 } 1165 ASSERT(dr->dr_txg == txg); 1166 1167 /* 1168 * If this buffer is currently held, we cannot undirty 1169 * it, since one of the current holders may be in the 1170 * middle of an update. Note that users of dbuf_undirty() 1171 * should not place a hold on the dbuf before the call. 
1172 */ 1173 if (refcount_count(&db->db_holds) > db->db_dirtycnt) { 1174 mutex_exit(&db->db_mtx); 1175 /* Make sure we don't toss this buffer at sync phase */ 1176 mutex_enter(&dn->dn_mtx); 1177 dnode_clear_range(dn, db->db_blkid, 1, tx); 1178 mutex_exit(&dn->dn_mtx); 1179 return (0); 1180 } 1181 1182 dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size); 1183 1184 ASSERT(db->db.db_size != 0); 1185 1186 /* XXX would be nice to fix up dn_towrite_space[] */ 1187 1188 *drp = dr->dr_next; 1189 1190 if (dr->dr_parent) { 1191 mutex_enter(&dr->dr_parent->dt.di.dr_mtx); 1192 list_remove(&dr->dr_parent->dt.di.dr_children, dr); 1193 mutex_exit(&dr->dr_parent->dt.di.dr_mtx); 1194 } else if (db->db_level+1 == dn->dn_nlevels) { 1195 ASSERT(db->db_blkptr == NULL || db->db_parent == dn->dn_dbuf); 1196 mutex_enter(&dn->dn_mtx); 1197 list_remove(&dn->dn_dirty_records[txg & TXG_MASK], dr); 1198 mutex_exit(&dn->dn_mtx); 1199 } 1200 1201 if (db->db_level == 0) { 1202 dbuf_unoverride(dr); 1203 1204 ASSERT(db->db_buf != NULL); 1205 ASSERT(dr->dt.dl.dr_data != NULL); 1206 if (dr->dt.dl.dr_data != db->db_buf) 1207 VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data, db) == 1); 1208 } else { 1209 ASSERT(db->db_buf != NULL); 1210 ASSERT(list_head(&dr->dt.di.dr_children) == NULL); 1211 mutex_destroy(&dr->dt.di.dr_mtx); 1212 list_destroy(&dr->dt.di.dr_children); 1213 } 1214 kmem_free(dr, sizeof (dbuf_dirty_record_t)); 1215 1216 ASSERT(db->db_dirtycnt > 0); 1217 db->db_dirtycnt -= 1; 1218 1219 if (refcount_remove(&db->db_holds, (void *)(uintptr_t)txg) == 0) { 1220 arc_buf_t *buf = db->db_buf; 1221 1222 ASSERT(arc_released(buf)); 1223 dbuf_set_data(db, NULL); 1224 VERIFY(arc_buf_remove_ref(buf, db) == 1); 1225 dbuf_evict(db); 1226 return (1); 1227 } 1228 1229 mutex_exit(&db->db_mtx); 1230 return (0); 1231 } 1232 1233 #pragma weak dmu_buf_will_dirty = dbuf_will_dirty 1234 void 1235 dbuf_will_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) 1236 { 1237 int rf = DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH; 1238 1239 ASSERT(tx->tx_txg != 0); 1240 ASSERT(!refcount_is_zero(&db->db_holds)); 1241 1242 if (RW_WRITE_HELD(&db->db_dnode->dn_struct_rwlock)) 1243 rf |= DB_RF_HAVESTRUCT; 1244 (void) dbuf_read(db, NULL, rf); 1245 (void) dbuf_dirty(db, tx); 1246 } 1247 1248 void 1249 dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx) 1250 { 1251 dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; 1252 1253 ASSERT(db->db_blkid != DB_BONUS_BLKID); 1254 ASSERT(tx->tx_txg != 0); 1255 ASSERT(db->db_level == 0); 1256 ASSERT(!refcount_is_zero(&db->db_holds)); 1257 1258 ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT || 1259 dmu_tx_private_ok(tx)); 1260 1261 dbuf_noread(db); 1262 (void) dbuf_dirty(db, tx); 1263 } 1264 1265 #pragma weak dmu_buf_fill_done = dbuf_fill_done 1266 /* ARGSUSED */ 1267 void 1268 dbuf_fill_done(dmu_buf_impl_t *db, dmu_tx_t *tx) 1269 { 1270 mutex_enter(&db->db_mtx); 1271 DBUF_VERIFY(db); 1272 1273 if (db->db_state == DB_FILL) { 1274 if (db->db_level == 0 && db->db_freed_in_flight) { 1275 ASSERT(db->db_blkid != DB_BONUS_BLKID); 1276 /* we were freed while filling */ 1277 /* XXX dbuf_undirty? */ 1278 bzero(db->db.db_data, db->db.db_size); 1279 db->db_freed_in_flight = FALSE; 1280 } 1281 db->db_state = DB_CACHED; 1282 cv_broadcast(&db->db_changed); 1283 } 1284 mutex_exit(&db->db_mtx); 1285 } 1286 1287 /* 1288 * "Clear" the contents of this dbuf. This will mark the dbuf 1289 * EVICTING and clear *most* of its references. Unfortunetely, 1290 * when we are not holding the dn_dbufs_mtx, we can't clear the 1291 * entry in the dn_dbufs list. 
We have to wait until dbuf_destroy() 1292 * in this case. For callers from the DMU we will usually see: 1293 * dbuf_clear()->arc_buf_evict()->dbuf_do_evict()->dbuf_destroy() 1294 * For the arc callback, we will usually see: 1295 * dbuf_do_evict()->dbuf_clear();dbuf_destroy() 1296 * Sometimes, though, we will get a mix of these two: 1297 * DMU: dbuf_clear()->arc_buf_evict() 1298 * ARC: dbuf_do_evict()->dbuf_destroy() 1299 */ 1300 void 1301 dbuf_clear(dmu_buf_impl_t *db) 1302 { 1303 dnode_t *dn = db->db_dnode; 1304 dmu_buf_impl_t *parent = db->db_parent; 1305 dmu_buf_impl_t *dndb = dn->dn_dbuf; 1306 int dbuf_gone = FALSE; 1307 1308 ASSERT(MUTEX_HELD(&db->db_mtx)); 1309 ASSERT(refcount_is_zero(&db->db_holds)); 1310 1311 dbuf_evict_user(db); 1312 1313 if (db->db_state == DB_CACHED) { 1314 ASSERT(db->db.db_data != NULL); 1315 if (db->db_blkid == DB_BONUS_BLKID) { 1316 zio_buf_free(db->db.db_data, DN_MAX_BONUSLEN); 1317 arc_space_return(DN_MAX_BONUSLEN); 1318 } 1319 db->db.db_data = NULL; 1320 db->db_state = DB_UNCACHED; 1321 } 1322 1323 ASSERT3U(db->db_state, ==, DB_UNCACHED); 1324 ASSERT(db->db_data_pending == NULL); 1325 1326 db->db_state = DB_EVICTING; 1327 db->db_blkptr = NULL; 1328 1329 if (db->db_blkid != DB_BONUS_BLKID && MUTEX_HELD(&dn->dn_dbufs_mtx)) { 1330 list_remove(&dn->dn_dbufs, db); 1331 dnode_rele(dn, db); 1332 db->db_dnode = NULL; 1333 } 1334 1335 if (db->db_buf) 1336 dbuf_gone = arc_buf_evict(db->db_buf); 1337 1338 if (!dbuf_gone) 1339 mutex_exit(&db->db_mtx); 1340 1341 /* 1342 * If this dbuf is referened from an indirect dbuf, 1343 * decrement the ref count on the indirect dbuf. 1344 */ 1345 if (parent && parent != dndb) 1346 dbuf_rele(parent, db); 1347 } 1348 1349 static int 1350 dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse, 1351 dmu_buf_impl_t **parentp, blkptr_t **bpp) 1352 { 1353 int nlevels, epbs; 1354 1355 *parentp = NULL; 1356 *bpp = NULL; 1357 1358 ASSERT(blkid != DB_BONUS_BLKID); 1359 1360 if (dn->dn_phys->dn_nlevels == 0) 1361 nlevels = 1; 1362 else 1363 nlevels = dn->dn_phys->dn_nlevels; 1364 1365 epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; 1366 1367 ASSERT3U(level * epbs, <, 64); 1368 ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); 1369 if (level >= nlevels || 1370 (blkid > (dn->dn_phys->dn_maxblkid >> (level * epbs)))) { 1371 /* the buffer has no parent yet */ 1372 return (ENOENT); 1373 } else if (level < nlevels-1) { 1374 /* this block is referenced from an indirect block */ 1375 int err = dbuf_hold_impl(dn, level+1, 1376 blkid >> epbs, fail_sparse, NULL, parentp); 1377 if (err) 1378 return (err); 1379 err = dbuf_read(*parentp, NULL, 1380 (DB_RF_HAVESTRUCT | DB_RF_NOPREFETCH | DB_RF_CANFAIL)); 1381 if (err) { 1382 dbuf_rele(*parentp, NULL); 1383 *parentp = NULL; 1384 return (err); 1385 } 1386 *bpp = ((blkptr_t *)(*parentp)->db.db_data) + 1387 (blkid & ((1ULL << epbs) - 1)); 1388 return (0); 1389 } else { 1390 /* the block is referenced from the dnode */ 1391 ASSERT3U(level, ==, nlevels-1); 1392 ASSERT(dn->dn_phys->dn_nblkptr == 0 || 1393 blkid < dn->dn_phys->dn_nblkptr); 1394 if (dn->dn_dbuf) { 1395 dbuf_add_ref(dn->dn_dbuf, NULL); 1396 *parentp = dn->dn_dbuf; 1397 } 1398 *bpp = &dn->dn_phys->dn_blkptr[blkid]; 1399 return (0); 1400 } 1401 } 1402 1403 static dmu_buf_impl_t * 1404 dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid, 1405 dmu_buf_impl_t *parent, blkptr_t *blkptr) 1406 { 1407 objset_impl_t *os = dn->dn_objset; 1408 dmu_buf_impl_t *db, *odb; 1409 1410 ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); 1411 ASSERT(dn->dn_type != 
DMU_OT_NONE); 1412 1413 db = kmem_cache_alloc(dbuf_cache, KM_SLEEP); 1414 1415 db->db_objset = os; 1416 db->db.db_object = dn->dn_object; 1417 db->db_level = level; 1418 db->db_blkid = blkid; 1419 db->db_last_dirty = NULL; 1420 db->db_dirtycnt = 0; 1421 db->db_dnode = dn; 1422 db->db_parent = parent; 1423 db->db_blkptr = blkptr; 1424 1425 db->db_user_ptr = NULL; 1426 db->db_user_data_ptr_ptr = NULL; 1427 db->db_evict_func = NULL; 1428 db->db_immediate_evict = 0; 1429 db->db_freed_in_flight = 0; 1430 1431 if (blkid == DB_BONUS_BLKID) { 1432 ASSERT3P(parent, ==, dn->dn_dbuf); 1433 db->db.db_size = DN_MAX_BONUSLEN - 1434 (dn->dn_nblkptr-1) * sizeof (blkptr_t); 1435 ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen); 1436 db->db.db_offset = DB_BONUS_BLKID; 1437 db->db_state = DB_UNCACHED; 1438 /* the bonus dbuf is not placed in the hash table */ 1439 arc_space_consume(sizeof (dmu_buf_impl_t)); 1440 return (db); 1441 } else { 1442 int blocksize = 1443 db->db_level ? 1<<dn->dn_indblkshift : dn->dn_datablksz; 1444 db->db.db_size = blocksize; 1445 db->db.db_offset = db->db_blkid * blocksize; 1446 } 1447 1448 /* 1449 * Hold the dn_dbufs_mtx while we get the new dbuf 1450 * in the hash table *and* added to the dbufs list. 1451 * This prevents a possible deadlock with someone 1452 * trying to look up this dbuf before its added to the 1453 * dn_dbufs list. 1454 */ 1455 mutex_enter(&dn->dn_dbufs_mtx); 1456 db->db_state = DB_EVICTING; 1457 if ((odb = dbuf_hash_insert(db)) != NULL) { 1458 /* someone else inserted it first */ 1459 kmem_cache_free(dbuf_cache, db); 1460 mutex_exit(&dn->dn_dbufs_mtx); 1461 return (odb); 1462 } 1463 list_insert_head(&dn->dn_dbufs, db); 1464 db->db_state = DB_UNCACHED; 1465 mutex_exit(&dn->dn_dbufs_mtx); 1466 arc_space_consume(sizeof (dmu_buf_impl_t)); 1467 1468 if (parent && parent != dn->dn_dbuf) 1469 dbuf_add_ref(parent, db); 1470 1471 ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT || 1472 refcount_count(&dn->dn_holds) > 0); 1473 (void) refcount_add(&dn->dn_holds, db); 1474 1475 dprintf_dbuf(db, "db=%p\n", db); 1476 1477 return (db); 1478 } 1479 1480 static int 1481 dbuf_do_evict(void *private) 1482 { 1483 arc_buf_t *buf = private; 1484 dmu_buf_impl_t *db = buf->b_private; 1485 1486 if (!MUTEX_HELD(&db->db_mtx)) 1487 mutex_enter(&db->db_mtx); 1488 1489 ASSERT(refcount_is_zero(&db->db_holds)); 1490 1491 if (db->db_state != DB_EVICTING) { 1492 ASSERT(db->db_state == DB_CACHED); 1493 DBUF_VERIFY(db); 1494 db->db_buf = NULL; 1495 dbuf_evict(db); 1496 } else { 1497 mutex_exit(&db->db_mtx); 1498 dbuf_destroy(db); 1499 } 1500 return (0); 1501 } 1502 1503 static void 1504 dbuf_destroy(dmu_buf_impl_t *db) 1505 { 1506 ASSERT(refcount_is_zero(&db->db_holds)); 1507 1508 if (db->db_blkid != DB_BONUS_BLKID) { 1509 /* 1510 * If this dbuf is still on the dn_dbufs list, 1511 * remove it from that list. 
1512 */ 1513 if (db->db_dnode) { 1514 dnode_t *dn = db->db_dnode; 1515 1516 mutex_enter(&dn->dn_dbufs_mtx); 1517 list_remove(&dn->dn_dbufs, db); 1518 mutex_exit(&dn->dn_dbufs_mtx); 1519 1520 dnode_rele(dn, db); 1521 db->db_dnode = NULL; 1522 } 1523 dbuf_hash_remove(db); 1524 } 1525 db->db_parent = NULL; 1526 db->db_buf = NULL; 1527 1528 ASSERT(!list_link_active(&db->db_link)); 1529 ASSERT(db->db.db_data == NULL); 1530 ASSERT(db->db_hash_next == NULL); 1531 ASSERT(db->db_blkptr == NULL); 1532 ASSERT(db->db_data_pending == NULL); 1533 1534 kmem_cache_free(dbuf_cache, db); 1535 arc_space_return(sizeof (dmu_buf_impl_t)); 1536 } 1537 1538 void 1539 dbuf_prefetch(dnode_t *dn, uint64_t blkid) 1540 { 1541 dmu_buf_impl_t *db = NULL; 1542 blkptr_t *bp = NULL; 1543 1544 ASSERT(blkid != DB_BONUS_BLKID); 1545 ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); 1546 1547 if (dnode_block_freed(dn, blkid)) 1548 return; 1549 1550 /* dbuf_find() returns with db_mtx held */ 1551 if (db = dbuf_find(dn, 0, blkid)) { 1552 if (refcount_count(&db->db_holds) > 0) { 1553 /* 1554 * This dbuf is active. We assume that it is 1555 * already CACHED, or else about to be either 1556 * read or filled. 1557 */ 1558 mutex_exit(&db->db_mtx); 1559 return; 1560 } 1561 mutex_exit(&db->db_mtx); 1562 db = NULL; 1563 } 1564 1565 if (dbuf_findbp(dn, 0, blkid, TRUE, &db, &bp) == 0) { 1566 if (bp && !BP_IS_HOLE(bp)) { 1567 arc_buf_t *pbuf; 1568 uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH; 1569 zbookmark_t zb; 1570 zb.zb_objset = dn->dn_objset->os_dsl_dataset ? 1571 dn->dn_objset->os_dsl_dataset->ds_object : 0; 1572 zb.zb_object = dn->dn_object; 1573 zb.zb_level = 0; 1574 zb.zb_blkid = blkid; 1575 1576 if (db) 1577 pbuf = db->db_buf; 1578 else 1579 pbuf = dn->dn_objset->os_phys_buf; 1580 1581 (void) arc_read(NULL, dn->dn_objset->os_spa, 1582 bp, pbuf, NULL, NULL, ZIO_PRIORITY_ASYNC_READ, 1583 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, 1584 &aflags, &zb); 1585 } 1586 if (db) 1587 dbuf_rele(db, NULL); 1588 } 1589 } 1590 1591 /* 1592 * Returns with db_holds incremented, and db_mtx not held. 1593 * Note: dn_struct_rwlock must be held. 
1594 */ 1595 int 1596 dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, int fail_sparse, 1597 void *tag, dmu_buf_impl_t **dbp) 1598 { 1599 dmu_buf_impl_t *db, *parent = NULL; 1600 1601 ASSERT(blkid != DB_BONUS_BLKID); 1602 ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); 1603 ASSERT3U(dn->dn_nlevels, >, level); 1604 1605 *dbp = NULL; 1606 top: 1607 /* dbuf_find() returns with db_mtx held */ 1608 db = dbuf_find(dn, level, blkid); 1609 1610 if (db == NULL) { 1611 blkptr_t *bp = NULL; 1612 int err; 1613 1614 ASSERT3P(parent, ==, NULL); 1615 err = dbuf_findbp(dn, level, blkid, fail_sparse, &parent, &bp); 1616 if (fail_sparse) { 1617 if (err == 0 && bp && BP_IS_HOLE(bp)) 1618 err = ENOENT; 1619 if (err) { 1620 if (parent) 1621 dbuf_rele(parent, NULL); 1622 return (err); 1623 } 1624 } 1625 if (err && err != ENOENT) 1626 return (err); 1627 db = dbuf_create(dn, level, blkid, parent, bp); 1628 } 1629 1630 if (db->db_buf && refcount_is_zero(&db->db_holds)) { 1631 arc_buf_add_ref(db->db_buf, db); 1632 if (db->db_buf->b_data == NULL) { 1633 dbuf_clear(db); 1634 if (parent) { 1635 dbuf_rele(parent, NULL); 1636 parent = NULL; 1637 } 1638 goto top; 1639 } 1640 ASSERT3P(db->db.db_data, ==, db->db_buf->b_data); 1641 } 1642 1643 ASSERT(db->db_buf == NULL || arc_referenced(db->db_buf)); 1644 1645 /* 1646 * If this buffer is currently syncing out, and we are are 1647 * still referencing it from db_data, we need to make a copy 1648 * of it in case we decide we want to dirty it again in this txg. 1649 */ 1650 if (db->db_level == 0 && db->db_blkid != DB_BONUS_BLKID && 1651 dn->dn_object != DMU_META_DNODE_OBJECT && 1652 db->db_state == DB_CACHED && db->db_data_pending) { 1653 dbuf_dirty_record_t *dr = db->db_data_pending; 1654 1655 if (dr->dt.dl.dr_data == db->db_buf) { 1656 arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); 1657 1658 dbuf_set_data(db, 1659 arc_buf_alloc(db->db_dnode->dn_objset->os_spa, 1660 db->db.db_size, db, type)); 1661 bcopy(dr->dt.dl.dr_data->b_data, db->db.db_data, 1662 db->db.db_size); 1663 } 1664 } 1665 1666 (void) refcount_add(&db->db_holds, tag); 1667 dbuf_update_data(db); 1668 DBUF_VERIFY(db); 1669 mutex_exit(&db->db_mtx); 1670 1671 /* NOTE: we can't rele the parent until after we drop the db_mtx */ 1672 if (parent) 1673 dbuf_rele(parent, NULL); 1674 1675 ASSERT3P(db->db_dnode, ==, dn); 1676 ASSERT3U(db->db_blkid, ==, blkid); 1677 ASSERT3U(db->db_level, ==, level); 1678 *dbp = db; 1679 1680 return (0); 1681 } 1682 1683 dmu_buf_impl_t * 1684 dbuf_hold(dnode_t *dn, uint64_t blkid, void *tag) 1685 { 1686 dmu_buf_impl_t *db; 1687 int err = dbuf_hold_impl(dn, 0, blkid, FALSE, tag, &db); 1688 return (err ? NULL : db); 1689 } 1690 1691 dmu_buf_impl_t * 1692 dbuf_hold_level(dnode_t *dn, int level, uint64_t blkid, void *tag) 1693 { 1694 dmu_buf_impl_t *db; 1695 int err = dbuf_hold_impl(dn, level, blkid, FALSE, tag, &db); 1696 return (err ? 
NULL : db); 1697 } 1698 1699 void 1700 dbuf_create_bonus(dnode_t *dn) 1701 { 1702 ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock)); 1703 1704 ASSERT(dn->dn_bonus == NULL); 1705 dn->dn_bonus = dbuf_create(dn, 0, DB_BONUS_BLKID, dn->dn_dbuf, NULL); 1706 } 1707 1708 #pragma weak dmu_buf_add_ref = dbuf_add_ref 1709 void 1710 dbuf_add_ref(dmu_buf_impl_t *db, void *tag) 1711 { 1712 int64_t holds = refcount_add(&db->db_holds, tag); 1713 ASSERT(holds > 1); 1714 } 1715 1716 #pragma weak dmu_buf_rele = dbuf_rele 1717 void 1718 dbuf_rele(dmu_buf_impl_t *db, void *tag) 1719 { 1720 int64_t holds; 1721 1722 mutex_enter(&db->db_mtx); 1723 DBUF_VERIFY(db); 1724 1725 holds = refcount_remove(&db->db_holds, tag); 1726 ASSERT(holds >= 0); 1727 1728 /* 1729 * We can't freeze indirects if there is a possibility that they 1730 * may be modified in the current syncing context. 1731 */ 1732 if (db->db_buf && holds == (db->db_level == 0 ? db->db_dirtycnt : 0)) 1733 arc_buf_freeze(db->db_buf); 1734 1735 if (holds == db->db_dirtycnt && 1736 db->db_level == 0 && db->db_immediate_evict) 1737 dbuf_evict_user(db); 1738 1739 if (holds == 0) { 1740 if (db->db_blkid == DB_BONUS_BLKID) { 1741 mutex_exit(&db->db_mtx); 1742 dnode_rele(db->db_dnode, db); 1743 } else if (db->db_buf == NULL) { 1744 /* 1745 * This is a special case: we never associated this 1746 * dbuf with any data allocated from the ARC. 1747 */ 1748 ASSERT3U(db->db_state, ==, DB_UNCACHED); 1749 dbuf_evict(db); 1750 } else if (arc_released(db->db_buf)) { 1751 arc_buf_t *buf = db->db_buf; 1752 /* 1753 * This dbuf has anonymous data associated with it. 1754 */ 1755 dbuf_set_data(db, NULL); 1756 VERIFY(arc_buf_remove_ref(buf, db) == 1); 1757 dbuf_evict(db); 1758 } else { 1759 VERIFY(arc_buf_remove_ref(db->db_buf, db) == 0); 1760 if (!DBUF_IS_CACHEABLE(db)) 1761 dbuf_clear(db); 1762 else 1763 mutex_exit(&db->db_mtx); 1764 } 1765 } else { 1766 mutex_exit(&db->db_mtx); 1767 } 1768 } 1769 1770 #pragma weak dmu_buf_refcount = dbuf_refcount 1771 uint64_t 1772 dbuf_refcount(dmu_buf_impl_t *db) 1773 { 1774 return (refcount_count(&db->db_holds)); 1775 } 1776 1777 void * 1778 dmu_buf_set_user(dmu_buf_t *db_fake, void *user_ptr, void *user_data_ptr_ptr, 1779 dmu_buf_evict_func_t *evict_func) 1780 { 1781 return (dmu_buf_update_user(db_fake, NULL, user_ptr, 1782 user_data_ptr_ptr, evict_func)); 1783 } 1784 1785 void * 1786 dmu_buf_set_user_ie(dmu_buf_t *db_fake, void *user_ptr, void *user_data_ptr_ptr, 1787 dmu_buf_evict_func_t *evict_func) 1788 { 1789 dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; 1790 1791 db->db_immediate_evict = TRUE; 1792 return (dmu_buf_update_user(db_fake, NULL, user_ptr, 1793 user_data_ptr_ptr, evict_func)); 1794 } 1795 1796 void * 1797 dmu_buf_update_user(dmu_buf_t *db_fake, void *old_user_ptr, void *user_ptr, 1798 void *user_data_ptr_ptr, dmu_buf_evict_func_t *evict_func) 1799 { 1800 dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; 1801 ASSERT(db->db_level == 0); 1802 1803 ASSERT((user_ptr == NULL) == (evict_func == NULL)); 1804 1805 mutex_enter(&db->db_mtx); 1806 1807 if (db->db_user_ptr == old_user_ptr) { 1808 db->db_user_ptr = user_ptr; 1809 db->db_user_data_ptr_ptr = user_data_ptr_ptr; 1810 db->db_evict_func = evict_func; 1811 1812 dbuf_update_data(db); 1813 } else { 1814 old_user_ptr = db->db_user_ptr; 1815 } 1816 1817 mutex_exit(&db->db_mtx); 1818 return (old_user_ptr); 1819 } 1820 1821 void * 1822 dmu_buf_get_user(dmu_buf_t *db_fake) 1823 { 1824 dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; 1825 ASSERT(!refcount_is_zero(&db->db_holds)); 1826 1827 
return (db->db_user_ptr); 1828 } 1829 1830 static void 1831 dbuf_check_blkptr(dnode_t *dn, dmu_buf_impl_t *db) 1832 { 1833 /* ASSERT(dmu_tx_is_syncing(tx) */ 1834 ASSERT(MUTEX_HELD(&db->db_mtx)); 1835 1836 if (db->db_blkptr != NULL) 1837 return; 1838 1839 if (db->db_level == dn->dn_phys->dn_nlevels-1) { 1840 /* 1841 * This buffer was allocated at a time when there was 1842 * no available blkptrs from the dnode, or it was 1843 * inappropriate to hook it in (i.e., nlevels mis-match). 1844 */ 1845 ASSERT(db->db_blkid < dn->dn_phys->dn_nblkptr); 1846 ASSERT(db->db_parent == NULL); 1847 db->db_parent = dn->dn_dbuf; 1848 db->db_blkptr = &dn->dn_phys->dn_blkptr[db->db_blkid]; 1849 DBUF_VERIFY(db); 1850 } else { 1851 dmu_buf_impl_t *parent = db->db_parent; 1852 int epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; 1853 1854 ASSERT(dn->dn_phys->dn_nlevels > 1); 1855 if (parent == NULL) { 1856 mutex_exit(&db->db_mtx); 1857 rw_enter(&dn->dn_struct_rwlock, RW_READER); 1858 (void) dbuf_hold_impl(dn, db->db_level+1, 1859 db->db_blkid >> epbs, FALSE, db, &parent); 1860 rw_exit(&dn->dn_struct_rwlock); 1861 mutex_enter(&db->db_mtx); 1862 db->db_parent = parent; 1863 } 1864 db->db_blkptr = (blkptr_t *)parent->db.db_data + 1865 (db->db_blkid & ((1ULL << epbs) - 1)); 1866 DBUF_VERIFY(db); 1867 } 1868 } 1869 1870 static void 1871 dbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx) 1872 { 1873 dmu_buf_impl_t *db = dr->dr_dbuf; 1874 dnode_t *dn = db->db_dnode; 1875 zio_t *zio; 1876 1877 ASSERT(dmu_tx_is_syncing(tx)); 1878 1879 dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr); 1880 1881 mutex_enter(&db->db_mtx); 1882 1883 ASSERT(db->db_level > 0); 1884 DBUF_VERIFY(db); 1885 1886 if (db->db_buf == NULL) { 1887 mutex_exit(&db->db_mtx); 1888 (void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED); 1889 mutex_enter(&db->db_mtx); 1890 } 1891 ASSERT3U(db->db_state, ==, DB_CACHED); 1892 ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift); 1893 ASSERT(db->db_buf != NULL); 1894 1895 dbuf_check_blkptr(dn, db); 1896 1897 db->db_data_pending = dr; 1898 1899 mutex_exit(&db->db_mtx); 1900 dbuf_write(dr, db->db_buf, tx); 1901 1902 zio = dr->dr_zio; 1903 mutex_enter(&dr->dt.di.dr_mtx); 1904 dbuf_sync_list(&dr->dt.di.dr_children, tx); 1905 ASSERT(list_head(&dr->dt.di.dr_children) == NULL); 1906 mutex_exit(&dr->dt.di.dr_mtx); 1907 zio_nowait(zio); 1908 } 1909 1910 static void 1911 dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx) 1912 { 1913 arc_buf_t **datap = &dr->dt.dl.dr_data; 1914 dmu_buf_impl_t *db = dr->dr_dbuf; 1915 dnode_t *dn = db->db_dnode; 1916 objset_impl_t *os = dn->dn_objset; 1917 uint64_t txg = tx->tx_txg; 1918 int blksz; 1919 1920 ASSERT(dmu_tx_is_syncing(tx)); 1921 1922 dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr); 1923 1924 mutex_enter(&db->db_mtx); 1925 /* 1926 * To be synced, we must be dirtied. But we 1927 * might have been freed after the dirty. 1928 */ 1929 if (db->db_state == DB_UNCACHED) { 1930 /* This buffer has been freed since it was dirtied */ 1931 ASSERT(db->db.db_data == NULL); 1932 } else if (db->db_state == DB_FILL) { 1933 /* This buffer was freed and is now being re-filled */ 1934 ASSERT(db->db.db_data != dr->dt.dl.dr_data); 1935 } else { 1936 ASSERT3U(db->db_state, ==, DB_CACHED); 1937 } 1938 DBUF_VERIFY(db); 1939 1940 /* 1941 * If this is a bonus buffer, simply copy the bonus data into the 1942 * dnode. It will be written out when the dnode is synced (and it 1943 * will be synced, since it must have been dirty for dbuf_sync to 1944 * be called). 
static void
dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
{
	arc_buf_t **datap = &dr->dt.dl.dr_data;
	dmu_buf_impl_t *db = dr->dr_dbuf;
	dnode_t *dn = db->db_dnode;
	objset_impl_t *os = dn->dn_objset;
	uint64_t txg = tx->tx_txg;
	int blksz;

	ASSERT(dmu_tx_is_syncing(tx));

	dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr);

	mutex_enter(&db->db_mtx);
	/*
	 * To be synced, we must be dirtied.  But we
	 * might have been freed after the dirty.
	 */
	if (db->db_state == DB_UNCACHED) {
		/* This buffer has been freed since it was dirtied */
		ASSERT(db->db.db_data == NULL);
	} else if (db->db_state == DB_FILL) {
		/* This buffer was freed and is now being re-filled */
		ASSERT(db->db.db_data != dr->dt.dl.dr_data);
	} else {
		ASSERT3U(db->db_state, ==, DB_CACHED);
	}
	DBUF_VERIFY(db);

	/*
	 * If this is a bonus buffer, simply copy the bonus data into the
	 * dnode.  It will be written out when the dnode is synced (and it
	 * will be synced, since it must have been dirty for dbuf_sync to
	 * be called).
	 */
	if (db->db_blkid == DB_BONUS_BLKID) {
		dbuf_dirty_record_t **drp;

		ASSERT(*datap != NULL);
		ASSERT3U(db->db_level, ==, 0);
		ASSERT3U(dn->dn_phys->dn_bonuslen, <=, DN_MAX_BONUSLEN);
		bcopy(*datap, DN_BONUS(dn->dn_phys), dn->dn_phys->dn_bonuslen);
		if (*datap != db->db.db_data) {
			zio_buf_free(*datap, DN_MAX_BONUSLEN);
			arc_space_return(DN_MAX_BONUSLEN);
		}
		db->db_data_pending = NULL;
		drp = &db->db_last_dirty;
		while (*drp != dr)
			drp = &(*drp)->dr_next;
		ASSERT(dr->dr_next == NULL);
		*drp = dr->dr_next;
		kmem_free(dr, sizeof (dbuf_dirty_record_t));
		ASSERT(db->db_dirtycnt > 0);
		db->db_dirtycnt -= 1;
		mutex_exit(&db->db_mtx);
		dbuf_rele(db, (void *)(uintptr_t)txg);
		return;
	}

	/*
	 * This function may have dropped the db_mtx lock allowing a dmu_sync
	 * operation to sneak in.  As a result, we need to ensure that we
	 * don't check the dr_override_state until we have returned from
	 * dbuf_check_blkptr.
	 */
	dbuf_check_blkptr(dn, db);

	/*
	 * If this buffer is in the middle of an immediate write,
	 * wait for the synchronous IO to complete.
	 */
	while (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC) {
		ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT);
		cv_wait(&db->db_changed, &db->db_mtx);
		ASSERT(dr->dt.dl.dr_override_state != DR_NOT_OVERRIDDEN);
	}

	/*
	 * If this dbuf has already been written out via an immediate write,
	 * just complete the write by copying over the new block pointer and
	 * updating the accounting via the write-completion functions.
	 */
	if (dr->dt.dl.dr_override_state == DR_OVERRIDDEN) {
		zio_t zio_fake;

		zio_fake.io_private = &db;
		zio_fake.io_error = 0;
		zio_fake.io_bp = db->db_blkptr;
		zio_fake.io_bp_orig = *db->db_blkptr;
		zio_fake.io_txg = txg;
		zio_fake.io_flags = 0;

		*db->db_blkptr = dr->dt.dl.dr_overridden_by;
		dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
		db->db_data_pending = dr;
		dr->dr_zio = &zio_fake;
		mutex_exit(&db->db_mtx);

		ASSERT(!DVA_EQUAL(BP_IDENTITY(zio_fake.io_bp),
		    BP_IDENTITY(&zio_fake.io_bp_orig)) ||
		    BP_IS_HOLE(zio_fake.io_bp));

		if (BP_IS_OLDER(&zio_fake.io_bp_orig, txg))
			(void) dsl_dataset_block_kill(os->os_dsl_dataset,
			    &zio_fake.io_bp_orig, dn->dn_zio, tx);

		dbuf_write_ready(&zio_fake, db->db_buf, db);
		dbuf_write_done(&zio_fake, db->db_buf, db);

		return;
	}

	blksz = arc_buf_size(*datap);

	if (dn->dn_object != DMU_META_DNODE_OBJECT) {
		/*
		 * If this buffer is currently "in use" (i.e., there are
		 * active holds and db_data still references it), then make
		 * a copy before we start the write so that any modifications
		 * from the open txg will not leak into this write.
		 *
		 * NOTE: this copy does not need to be made for objects only
		 * modified in the syncing context (e.g. DNODE blocks).
		 */
		if (refcount_count(&db->db_holds) > 1 && *datap == db->db_buf) {
			arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
			*datap = arc_buf_alloc(os->os_spa, blksz, db, type);
			bcopy(db->db.db_data, (*datap)->b_data, blksz);
		}
	}

	ASSERT(*datap != NULL);
	db->db_data_pending = dr;

	mutex_exit(&db->db_mtx);

	dbuf_write(dr, *datap, tx);

	ASSERT(!list_link_active(&dr->dr_dirty_node));
	if (dn->dn_object == DMU_META_DNODE_OBJECT)
		list_insert_tail(&dn->dn_dirty_records[txg&TXG_MASK], dr);
	else
		zio_nowait(dr->dr_zio);
}

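/*
 * Sync the dirty records on the given list for the syncing txg.  This is
 * called from dnode_sync() when pushing out a dnode's dirty dbufs, and
 * recursively from dbuf_sync_indirect() above for the children of each
 * dirty indirect block.
 */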
void
dbuf_sync_list(list_t *list, dmu_tx_t *tx)
{
	dbuf_dirty_record_t *dr;

	while (dr = list_head(list)) {
		if (dr->dr_zio != NULL) {
			/*
			 * If we find an already initialized zio then we
			 * are processing the meta-dnode, and we have finished.
			 * The dbufs for all dnodes are put back on the list
			 * during processing, so that we can zio_wait()
			 * these IOs after initiating all child IOs.
			 */
			ASSERT3U(dr->dr_dbuf->db.db_object, ==,
			    DMU_META_DNODE_OBJECT);
			break;
		}
		list_remove(list, dr);
		if (dr->dr_dbuf->db_level > 0)
			dbuf_sync_indirect(dr, tx);
		else
			dbuf_sync_leaf(dr, tx);
	}
}

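/*
 * Issue the write for a dirty record: set up the block pointer, bookmark,
 * and write properties, then hand the buffer to arc_write().  The
 * resulting zio is parented to the zio of the parent dbuf (or the dnode),
 * so an indirect block's write does not complete until all of its
 * children's writes have.
 */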
static void
dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
{
	dmu_buf_impl_t *db = dr->dr_dbuf;
	dnode_t *dn = db->db_dnode;
	objset_impl_t *os = dn->dn_objset;
	dmu_buf_impl_t *parent = db->db_parent;
	uint64_t txg = tx->tx_txg;
	zbookmark_t zb;
	writeprops_t wp = { 0 };
	zio_t *zio;

	if (!BP_IS_HOLE(db->db_blkptr) &&
	    (db->db_level > 0 || dn->dn_type == DMU_OT_DNODE)) {
		/*
		 * Private object buffers are released here rather
		 * than in dbuf_dirty() since they are only modified
		 * in the syncing context and we don't want the
		 * overhead of making multiple copies of the data.
		 */
		arc_release(data, db);
	} else {
		ASSERT(arc_released(data));
		/* XXX why do we need to thaw here? */
		arc_buf_thaw(data);
	}

	if (parent != dn->dn_dbuf) {
		ASSERT(parent && parent->db_data_pending);
		ASSERT(db->db_level == parent->db_level-1);
		ASSERT(arc_released(parent->db_buf));
		zio = parent->db_data_pending->dr_zio;
	} else {
		ASSERT(db->db_level == dn->dn_phys->dn_nlevels-1);
		ASSERT3P(db->db_blkptr, ==,
		    &dn->dn_phys->dn_blkptr[db->db_blkid]);
		zio = dn->dn_zio;
	}

	ASSERT(db->db_level == 0 || data == db->db_buf);
	ASSERT3U(db->db_blkptr->blk_birth, <=, txg);
	ASSERT(zio);

	zb.zb_objset = os->os_dsl_dataset ? os->os_dsl_dataset->ds_object : 0;
	zb.zb_object = db->db.db_object;
	zb.zb_level = db->db_level;
	zb.zb_blkid = db->db_blkid;

	wp.wp_type = dn->dn_type;
	wp.wp_level = db->db_level;
	wp.wp_copies = os->os_copies;
	wp.wp_dncompress = dn->dn_compress;
	wp.wp_oscompress = os->os_compress;
	wp.wp_dnchecksum = dn->dn_checksum;
	wp.wp_oschecksum = os->os_checksum;

	if (BP_IS_OLDER(db->db_blkptr, txg))
		(void) dsl_dataset_block_kill(
		    os->os_dsl_dataset, db->db_blkptr, zio, tx);

	dr->dr_zio = arc_write(zio, os->os_spa, &wp,
	    DBUF_IS_L2CACHEABLE(db), txg, db->db_blkptr,
	    data, dbuf_write_ready, dbuf_write_done, db,
	    ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
}

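/*
 * arc_write() completion callbacks.  The "ready" callback runs once the
 * block pointer has been filled in and handles space accounting and
 * fill-count propagation into the parent; the "done" callback runs when
 * the I/O has completed and retires the dirty record.  Note that
 * dbuf_sync_leaf() above also calls these directly, with a fake zio, for
 * buffers already written via dmu_sync().
 */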
/* ARGSUSED */
static void
dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
{
	dmu_buf_impl_t *db = vdb;
	dnode_t *dn = db->db_dnode;
	objset_impl_t *os = dn->dn_objset;
	blkptr_t *bp = zio->io_bp;
	blkptr_t *bp_orig = &zio->io_bp_orig;
	uint64_t fill = 0;
	int old_size, new_size, i;

	ASSERT(db->db_blkptr == bp);

	dprintf_dbuf_bp(db, bp_orig, "bp_orig: %s", "");

	old_size = bp_get_dasize(os->os_spa, bp_orig);
	new_size = bp_get_dasize(os->os_spa, bp);

	dnode_diduse_space(dn, new_size - old_size);

	if (BP_IS_HOLE(bp)) {
		dsl_dataset_t *ds = os->os_dsl_dataset;
		dmu_tx_t *tx = os->os_synctx;

		if (bp_orig->blk_birth == tx->tx_txg)
			(void) dsl_dataset_block_kill(ds, bp_orig, zio, tx);
		ASSERT3U(bp->blk_fill, ==, 0);
		return;
	}

	ASSERT(BP_GET_TYPE(bp) == dn->dn_type);
	ASSERT(BP_GET_LEVEL(bp) == db->db_level);

	mutex_enter(&db->db_mtx);

	if (db->db_level == 0) {
		mutex_enter(&dn->dn_mtx);
		if (db->db_blkid > dn->dn_phys->dn_maxblkid)
			dn->dn_phys->dn_maxblkid = db->db_blkid;
		mutex_exit(&dn->dn_mtx);

		if (dn->dn_type == DMU_OT_DNODE) {
			dnode_phys_t *dnp = db->db.db_data;
			for (i = db->db.db_size >> DNODE_SHIFT; i > 0;
			    i--, dnp++) {
				if (dnp->dn_type != DMU_OT_NONE)
					fill++;
			}
		} else {
			fill = 1;
		}
	} else {
		blkptr_t *ibp = db->db.db_data;
		ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
		for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, ibp++) {
			if (BP_IS_HOLE(ibp))
				continue;
			ASSERT3U(BP_GET_LSIZE(ibp), ==,
			    db->db_level == 1 ? dn->dn_datablksz :
			    (1<<dn->dn_phys->dn_indblkshift));
			fill += ibp->blk_fill;
		}
	}

	bp->blk_fill = fill;

	mutex_exit(&db->db_mtx);

	if (zio->io_flags & ZIO_FLAG_IO_REWRITE) {
		ASSERT(DVA_EQUAL(BP_IDENTITY(bp), BP_IDENTITY(bp_orig)));
	} else {
		dsl_dataset_t *ds = os->os_dsl_dataset;
		dmu_tx_t *tx = os->os_synctx;

		if (bp_orig->blk_birth == tx->tx_txg)
			(void) dsl_dataset_block_kill(ds, bp_orig, zio, tx);
		dsl_dataset_block_born(ds, bp, tx);
	}
}

/* ARGSUSED */
static void
dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
{
	dmu_buf_impl_t *db = vdb;
	uint64_t txg = zio->io_txg;
	dbuf_dirty_record_t **drp, *dr;

	ASSERT3U(zio->io_error, ==, 0);

	mutex_enter(&db->db_mtx);

	drp = &db->db_last_dirty;
	while ((dr = *drp) != db->db_data_pending)
		drp = &dr->dr_next;
	ASSERT(!list_link_active(&dr->dr_dirty_node));
	ASSERT(dr->dr_txg == txg);
	ASSERT(dr->dr_next == NULL);
	*drp = dr->dr_next;

	if (db->db_level == 0) {
		ASSERT(db->db_blkid != DB_BONUS_BLKID);
		ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN);

		if (dr->dt.dl.dr_data != db->db_buf)
			VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data, db) == 1);
		else if (!BP_IS_HOLE(db->db_blkptr))
			arc_set_callback(db->db_buf, dbuf_do_evict, db);
		else
			ASSERT(arc_released(db->db_buf));
	} else {
		dnode_t *dn = db->db_dnode;

		ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
		ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
		if (!BP_IS_HOLE(db->db_blkptr)) {
			int epbs =
			    dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
			ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==,
			    db->db.db_size);
			ASSERT3U(dn->dn_phys->dn_maxblkid
			    >> (db->db_level * epbs), >=, db->db_blkid);
			arc_set_callback(db->db_buf, dbuf_do_evict, db);
		}
		mutex_destroy(&dr->dt.di.dr_mtx);
		list_destroy(&dr->dt.di.dr_children);
	}
	kmem_free(dr, sizeof (dbuf_dirty_record_t));

	cv_broadcast(&db->db_changed);
	ASSERT(db->db_dirtycnt > 0);
	db->db_dirtycnt -= 1;
	db->db_data_pending = NULL;
	mutex_exit(&db->db_mtx);

	dprintf_dbuf_bp(db, zio->io_bp, "bp: %s", "");

	dbuf_rele(db, (void *)(uintptr_t)txg);
}