1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2006 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 #pragma ident "%Z%%M% %I% %E% SMI" 27 28 #include <sys/zfs_context.h> 29 #include <sys/dmu.h> 30 #include <sys/dmu_impl.h> 31 #include <sys/dbuf.h> 32 #include <sys/dmu_objset.h> 33 #include <sys/dsl_dataset.h> 34 #include <sys/dsl_dir.h> 35 #include <sys/dmu_tx.h> 36 #include <sys/spa.h> 37 #include <sys/zio.h> 38 #include <sys/dmu_zfetch.h> 39 40 static void dbuf_destroy(dmu_buf_impl_t *db); 41 static int dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx); 42 static arc_done_func_t dbuf_write_done; 43 44 int zfs_mdcomp_disable = 0; 45 46 /* 47 * Global data structures and functions for the dbuf cache. 48 */ 49 taskq_t *dbuf_tq; 50 static kmem_cache_t *dbuf_cache; 51 52 /* ARGSUSED */ 53 static int 54 dbuf_cons(void *vdb, void *unused, int kmflag) 55 { 56 dmu_buf_impl_t *db = vdb; 57 bzero(db, sizeof (dmu_buf_impl_t)); 58 59 mutex_init(&db->db_mtx, NULL, MUTEX_DEFAULT, NULL); 60 cv_init(&db->db_changed, NULL, CV_DEFAULT, NULL); 61 refcount_create(&db->db_holds); 62 return (0); 63 } 64 65 /* ARGSUSED */ 66 static void 67 dbuf_dest(void *vdb, void *unused) 68 { 69 dmu_buf_impl_t *db = vdb; 70 mutex_destroy(&db->db_mtx); 71 cv_destroy(&db->db_changed); 72 refcount_destroy(&db->db_holds); 73 } 74 75 /* 76 * dbuf hash table routines 77 */ 78 static dbuf_hash_table_t dbuf_hash_table; 79 80 static uint64_t dbuf_hash_count; 81 82 static uint64_t 83 dbuf_hash(void *os, uint64_t obj, uint8_t lvl, uint64_t blkid) 84 { 85 uintptr_t osv = (uintptr_t)os; 86 uint64_t crc = -1ULL; 87 88 ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY); 89 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (lvl)) & 0xFF]; 90 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (osv >> 6)) & 0xFF]; 91 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 0)) & 0xFF]; 92 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 8)) & 0xFF]; 93 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 0)) & 0xFF]; 94 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 8)) & 0xFF]; 95 96 crc ^= (osv>>14) ^ (obj>>16) ^ (blkid>>16); 97 98 return (crc); 99 } 100 101 #define DBUF_HASH(os, obj, level, blkid) dbuf_hash(os, obj, level, blkid); 102 103 #define DBUF_EQUAL(dbuf, os, obj, level, blkid) \ 104 ((dbuf)->db.db_object == (obj) && \ 105 (dbuf)->db_objset == (os) && \ 106 (dbuf)->db_level == (level) && \ 107 (dbuf)->db_blkid == (blkid)) 108 109 dmu_buf_impl_t * 110 dbuf_find(dnode_t *dn, uint8_t level, uint64_t blkid) 111 { 112 dbuf_hash_table_t *h = &dbuf_hash_table; 113 objset_impl_t *os = dn->dn_objset; 114 uint64_t obj = dn->dn_object; 115 uint64_t hv = DBUF_HASH(os, obj, level, blkid); 116 uint64_t idx = hv & 
h->hash_table_mask; 117 dmu_buf_impl_t *db; 118 119 mutex_enter(DBUF_HASH_MUTEX(h, idx)); 120 for (db = h->hash_table[idx]; db != NULL; db = db->db_hash_next) { 121 if (DBUF_EQUAL(db, os, obj, level, blkid)) { 122 mutex_enter(&db->db_mtx); 123 if (db->db_state != DB_EVICTING) { 124 mutex_exit(DBUF_HASH_MUTEX(h, idx)); 125 return (db); 126 } 127 mutex_exit(&db->db_mtx); 128 } 129 } 130 mutex_exit(DBUF_HASH_MUTEX(h, idx)); 131 return (NULL); 132 } 133 134 /* 135 * Insert an entry into the hash table. If there is already an element 136 * equal to elem in the hash table, then the already existing element 137 * will be returned and the new element will not be inserted. 138 * Otherwise returns NULL. 139 */ 140 static dmu_buf_impl_t * 141 dbuf_hash_insert(dmu_buf_impl_t *db) 142 { 143 dbuf_hash_table_t *h = &dbuf_hash_table; 144 objset_impl_t *os = db->db_objset; 145 uint64_t obj = db->db.db_object; 146 int level = db->db_level; 147 uint64_t blkid = db->db_blkid; 148 uint64_t hv = DBUF_HASH(os, obj, level, blkid); 149 uint64_t idx = hv & h->hash_table_mask; 150 dmu_buf_impl_t *dbf; 151 152 mutex_enter(DBUF_HASH_MUTEX(h, idx)); 153 for (dbf = h->hash_table[idx]; dbf != NULL; dbf = dbf->db_hash_next) { 154 if (DBUF_EQUAL(dbf, os, obj, level, blkid)) { 155 mutex_enter(&dbf->db_mtx); 156 if (dbf->db_state != DB_EVICTING) { 157 mutex_exit(DBUF_HASH_MUTEX(h, idx)); 158 return (dbf); 159 } 160 mutex_exit(&dbf->db_mtx); 161 } 162 } 163 164 mutex_enter(&db->db_mtx); 165 db->db_hash_next = h->hash_table[idx]; 166 h->hash_table[idx] = db; 167 mutex_exit(DBUF_HASH_MUTEX(h, idx)); 168 atomic_add_64(&dbuf_hash_count, 1); 169 170 return (NULL); 171 } 172 173 /* 174 * Remove an entry from the hash table. This operation will 175 * fail if there are any existing holds on the db. 176 */ 177 static void 178 dbuf_hash_remove(dmu_buf_impl_t *db) 179 { 180 dbuf_hash_table_t *h = &dbuf_hash_table; 181 uint64_t hv = DBUF_HASH(db->db_objset, db->db.db_object, 182 db->db_level, db->db_blkid); 183 uint64_t idx = hv & h->hash_table_mask; 184 dmu_buf_impl_t *dbf, **dbp; 185 186 /* 187 * We musn't hold db_mtx to maintin lock ordering: 188 * DBUF_HASH_MUTEX > db_mtx. 
189 */ 190 ASSERT(refcount_is_zero(&db->db_holds)); 191 ASSERT(db->db_state == DB_EVICTING); 192 ASSERT(!MUTEX_HELD(&db->db_mtx)); 193 194 mutex_enter(DBUF_HASH_MUTEX(h, idx)); 195 dbp = &h->hash_table[idx]; 196 while ((dbf = *dbp) != db) { 197 dbp = &dbf->db_hash_next; 198 ASSERT(dbf != NULL); 199 } 200 *dbp = db->db_hash_next; 201 db->db_hash_next = NULL; 202 mutex_exit(DBUF_HASH_MUTEX(h, idx)); 203 atomic_add_64(&dbuf_hash_count, -1); 204 } 205 206 static arc_evict_func_t dbuf_do_evict; 207 208 static void 209 dbuf_evict_user(dmu_buf_impl_t *db) 210 { 211 ASSERT(MUTEX_HELD(&db->db_mtx)); 212 213 if (db->db_level != 0 || db->db_d.db_evict_func == NULL) 214 return; 215 216 if (db->db_d.db_user_data_ptr_ptr) 217 *db->db_d.db_user_data_ptr_ptr = db->db.db_data; 218 db->db_d.db_evict_func(&db->db, db->db_d.db_user_ptr); 219 db->db_d.db_user_ptr = NULL; 220 db->db_d.db_user_data_ptr_ptr = NULL; 221 db->db_d.db_evict_func = NULL; 222 } 223 224 void 225 dbuf_evict(dmu_buf_impl_t *db) 226 { 227 int i; 228 229 ASSERT(MUTEX_HELD(&db->db_mtx)); 230 ASSERT(db->db_buf == NULL); 231 232 #ifdef ZFS_DEBUG 233 for (i = 0; i < TXG_SIZE; i++) { 234 ASSERT(!list_link_active(&db->db_dirty_node[i])); 235 ASSERT(db->db_level != 0 || db->db_d.db_data_old[i] == NULL); 236 } 237 #endif 238 dbuf_clear(db); 239 dbuf_destroy(db); 240 } 241 242 void 243 dbuf_init(void) 244 { 245 uint64_t hsize = 1ULL << 16; 246 dbuf_hash_table_t *h = &dbuf_hash_table; 247 int i; 248 249 /* 250 * The hash table is big enough to fill all of physical memory 251 * with an average 4K block size. The table will take up 252 * totalmem*sizeof(void*)/4K (i.e. 2MB/GB with 8-byte pointers). 253 */ 254 while (hsize * 4096 < physmem * PAGESIZE) 255 hsize <<= 1; 256 257 retry: 258 h->hash_table_mask = hsize - 1; 259 h->hash_table = kmem_zalloc(hsize * sizeof (void *), KM_NOSLEEP); 260 if (h->hash_table == NULL) { 261 /* XXX - we should really return an error instead of assert */ 262 ASSERT(hsize > (1ULL << 10)); 263 hsize >>= 1; 264 goto retry; 265 } 266 267 dbuf_cache = kmem_cache_create("dmu_buf_impl_t", 268 sizeof (dmu_buf_impl_t), 269 0, dbuf_cons, dbuf_dest, NULL, NULL, NULL, 0); 270 dbuf_tq = taskq_create("dbuf_tq", 8, maxclsyspri, 50, INT_MAX, 271 TASKQ_PREPOPULATE); 272 273 for (i = 0; i < DBUF_MUTEXES; i++) 274 mutex_init(&h->hash_mutexes[i], NULL, MUTEX_DEFAULT, NULL); 275 } 276 277 void 278 dbuf_fini(void) 279 { 280 dbuf_hash_table_t *h = &dbuf_hash_table; 281 int i; 282 283 taskq_destroy(dbuf_tq); 284 dbuf_tq = NULL; 285 286 for (i = 0; i < DBUF_MUTEXES; i++) 287 mutex_destroy(&h->hash_mutexes[i]); 288 kmem_free(h->hash_table, (h->hash_table_mask + 1) * sizeof (void *)); 289 kmem_cache_destroy(dbuf_cache); 290 } 291 292 /* 293 * Other stuff. 
294 */ 295 296 #ifdef ZFS_DEBUG 297 static void 298 dbuf_verify(dmu_buf_impl_t *db) 299 { 300 int i; 301 dnode_t *dn = db->db_dnode; 302 303 ASSERT(MUTEX_HELD(&db->db_mtx)); 304 305 if (!(zfs_flags & ZFS_DEBUG_DBUF_VERIFY)) 306 return; 307 308 ASSERT(db->db_objset != NULL); 309 if (dn == NULL) { 310 ASSERT(db->db_parent == NULL); 311 ASSERT(db->db_blkptr == NULL); 312 } else { 313 ASSERT3U(db->db.db_object, ==, dn->dn_object); 314 ASSERT3P(db->db_objset, ==, dn->dn_objset); 315 ASSERT3U(db->db_level, <, dn->dn_nlevels); 316 ASSERT(db->db_blkid == DB_BONUS_BLKID || 317 list_head(&dn->dn_dbufs)); 318 } 319 if (db->db_blkid == DB_BONUS_BLKID) { 320 ASSERT(dn != NULL); 321 ASSERT3U(db->db.db_size, ==, dn->dn_bonuslen); 322 ASSERT3U(db->db.db_offset, ==, DB_BONUS_BLKID); 323 } else { 324 ASSERT3U(db->db.db_offset, ==, db->db_blkid * db->db.db_size); 325 } 326 327 if (db->db_level == 0) { 328 /* we can be momentarily larger in dnode_set_blksz() */ 329 if (db->db_blkid != DB_BONUS_BLKID && dn) { 330 ASSERT3U(db->db.db_size, >=, dn->dn_datablksz); 331 } 332 if (db->db.db_object == DMU_META_DNODE_OBJECT) { 333 for (i = 0; i < TXG_SIZE; i++) { 334 /* 335 * it should only be modified in syncing 336 * context, so make sure we only have 337 * one copy of the data. 338 */ 339 ASSERT(db->db_d.db_data_old[i] == NULL || 340 db->db_d.db_data_old[i] == db->db_buf); 341 } 342 } 343 } 344 345 /* verify db->db_blkptr */ 346 if (db->db_blkptr) { 347 if (db->db_parent == dn->dn_dbuf) { 348 /* db is pointed to by the dnode */ 349 /* ASSERT3U(db->db_blkid, <, dn->dn_nblkptr); */ 350 if (db->db.db_object == DMU_META_DNODE_OBJECT) 351 ASSERT(db->db_parent == NULL); 352 else 353 ASSERT(db->db_parent != NULL); 354 ASSERT3P(db->db_blkptr, ==, 355 &dn->dn_phys->dn_blkptr[db->db_blkid]); 356 } else { 357 /* db is pointed to by an indirect block */ 358 int epb = db->db_parent->db.db_size >> SPA_BLKPTRSHIFT; 359 ASSERT3U(db->db_parent->db_level, ==, db->db_level+1); 360 ASSERT3U(db->db_parent->db.db_object, ==, 361 db->db.db_object); 362 /* 363 * dnode_grow_indblksz() can make this fail if we don't 364 * have the struct_rwlock. XXX indblksz no longer 365 * grows. safe to do this now? 366 */ 367 if (RW_WRITE_HELD(&db->db_dnode->dn_struct_rwlock)) { 368 ASSERT3P(db->db_blkptr, ==, 369 ((blkptr_t *)db->db_parent->db.db_data + 370 db->db_blkid % epb)); 371 } 372 } 373 } 374 if ((db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr)) && 375 db->db.db_data && db->db_blkid != DB_BONUS_BLKID && 376 db->db_state != DB_FILL && !dn->dn_free_txg) { 377 /* 378 * If the blkptr isn't set but they have nonzero data, 379 * it had better be dirty, otherwise we'll lose that 380 * data when we evict this buffer. 
381 */ 382 if (db->db_dirtycnt == 0) { 383 uint64_t *buf = db->db.db_data; 384 int i; 385 386 for (i = 0; i < db->db.db_size >> 3; i++) { 387 ASSERT(buf[i] == 0); 388 } 389 } 390 } 391 } 392 #endif 393 394 static void 395 dbuf_update_data(dmu_buf_impl_t *db) 396 { 397 ASSERT(MUTEX_HELD(&db->db_mtx)); 398 if (db->db_level == 0 && db->db_d.db_user_data_ptr_ptr) { 399 ASSERT(!refcount_is_zero(&db->db_holds)); 400 *db->db_d.db_user_data_ptr_ptr = db->db.db_data; 401 } 402 } 403 404 static void 405 dbuf_set_data(dmu_buf_impl_t *db, arc_buf_t *buf) 406 { 407 ASSERT(MUTEX_HELD(&db->db_mtx)); 408 ASSERT(db->db_buf == NULL || !arc_has_callback(db->db_buf)); 409 db->db_buf = buf; 410 if (buf != NULL) { 411 ASSERT(buf->b_data != NULL); 412 db->db.db_data = buf->b_data; 413 if (!arc_released(buf)) 414 arc_set_callback(buf, dbuf_do_evict, db); 415 dbuf_update_data(db); 416 } else { 417 dbuf_evict_user(db); 418 db->db.db_data = NULL; 419 db->db_state = DB_UNCACHED; 420 } 421 } 422 423 uint64_t 424 dbuf_whichblock(dnode_t *dn, uint64_t offset) 425 { 426 if (dn->dn_datablkshift) { 427 return (offset >> dn->dn_datablkshift); 428 } else { 429 ASSERT3U(offset, <, dn->dn_datablksz); 430 return (0); 431 } 432 } 433 434 static void 435 dbuf_read_done(zio_t *zio, arc_buf_t *buf, void *vdb) 436 { 437 dmu_buf_impl_t *db = vdb; 438 439 mutex_enter(&db->db_mtx); 440 ASSERT3U(db->db_state, ==, DB_READ); 441 /* 442 * All reads are synchronous, so we must have a hold on the dbuf 443 */ 444 ASSERT(refcount_count(&db->db_holds) > 0); 445 ASSERT(db->db_buf == NULL); 446 ASSERT(db->db.db_data == NULL); 447 if (db->db_level == 0 && db->db_d.db_freed_in_flight) { 448 /* we were freed in flight; disregard any error */ 449 arc_release(buf, db); 450 bzero(buf->b_data, db->db.db_size); 451 db->db_d.db_freed_in_flight = FALSE; 452 dbuf_set_data(db, buf); 453 db->db_state = DB_CACHED; 454 } else if (zio == NULL || zio->io_error == 0) { 455 dbuf_set_data(db, buf); 456 db->db_state = DB_CACHED; 457 } else { 458 ASSERT(db->db_blkid != DB_BONUS_BLKID); 459 ASSERT3P(db->db_buf, ==, NULL); 460 VERIFY(arc_buf_remove_ref(buf, db) == 1); 461 db->db_state = DB_UNCACHED; 462 } 463 cv_broadcast(&db->db_changed); 464 mutex_exit(&db->db_mtx); 465 dbuf_rele(db, NULL); 466 } 467 468 static void 469 dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags) 470 { 471 blkptr_t *bp; 472 zbookmark_t zb; 473 uint32_t aflags = ARC_NOWAIT; 474 475 ASSERT(!refcount_is_zero(&db->db_holds)); 476 /* We need the struct_rwlock to prevent db_blkptr from changing. 
*/ 477 ASSERT(RW_LOCK_HELD(&db->db_dnode->dn_struct_rwlock)); 478 ASSERT(MUTEX_HELD(&db->db_mtx)); 479 ASSERT(db->db_state == DB_UNCACHED); 480 ASSERT(db->db_buf == NULL); 481 482 if (db->db_blkid == DB_BONUS_BLKID) { 483 ASSERT3U(db->db_dnode->dn_bonuslen, ==, db->db.db_size); 484 db->db.db_data = zio_buf_alloc(DN_MAX_BONUSLEN); 485 if (db->db.db_size < DN_MAX_BONUSLEN) 486 bzero(db->db.db_data, DN_MAX_BONUSLEN); 487 bcopy(DN_BONUS(db->db_dnode->dn_phys), db->db.db_data, 488 db->db.db_size); 489 dbuf_update_data(db); 490 db->db_state = DB_CACHED; 491 mutex_exit(&db->db_mtx); 492 return; 493 } 494 495 if (db->db_level == 0 && dnode_block_freed(db->db_dnode, db->db_blkid)) 496 bp = NULL; 497 else 498 bp = db->db_blkptr; 499 500 if (bp == NULL) 501 dprintf_dbuf(db, "blkptr: %s\n", "NULL"); 502 else 503 dprintf_dbuf_bp(db, bp, "%s", "blkptr:"); 504 505 if (bp == NULL || BP_IS_HOLE(bp)) { 506 ASSERT(bp == NULL || BP_IS_HOLE(bp)); 507 dbuf_set_data(db, arc_buf_alloc(db->db_dnode->dn_objset->os_spa, 508 db->db.db_size, db)); 509 bzero(db->db.db_data, db->db.db_size); 510 db->db_state = DB_CACHED; 511 *flags |= DB_RF_CACHED; 512 mutex_exit(&db->db_mtx); 513 return; 514 } 515 516 db->db_state = DB_READ; 517 mutex_exit(&db->db_mtx); 518 519 zb.zb_objset = db->db_objset->os_dsl_dataset ? 520 db->db_objset->os_dsl_dataset->ds_object : 0; 521 zb.zb_object = db->db.db_object; 522 zb.zb_level = db->db_level; 523 zb.zb_blkid = db->db_blkid; 524 525 dbuf_add_ref(db, NULL); 526 /* ZIO_FLAG_CANFAIL callers have to check the parent zio's error */ 527 (void) arc_read(zio, db->db_dnode->dn_objset->os_spa, bp, 528 db->db_level > 0 ? byteswap_uint64_array : 529 dmu_ot[db->db_dnode->dn_type].ot_byteswap, 530 dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ, 531 (*flags & DB_RF_CANFAIL) ? ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED, 532 &aflags, &zb); 533 if (aflags & ARC_CACHED) 534 *flags |= DB_RF_CACHED; 535 } 536 537 int 538 dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags) 539 { 540 int err = 0; 541 int havepzio = (zio != NULL); 542 int prefetch; 543 544 /* 545 * We don't have to hold the mutex to check db_state because it 546 * can't be freed while we have a hold on the buffer. 
547 */ 548 ASSERT(!refcount_is_zero(&db->db_holds)); 549 550 if ((flags & DB_RF_HAVESTRUCT) == 0) 551 rw_enter(&db->db_dnode->dn_struct_rwlock, RW_READER); 552 553 prefetch = db->db_level == 0 && db->db_blkid != DB_BONUS_BLKID && 554 (flags & DB_RF_NOPREFETCH) == 0 && db->db_dnode != NULL; 555 556 mutex_enter(&db->db_mtx); 557 if (db->db_state == DB_CACHED) { 558 mutex_exit(&db->db_mtx); 559 if (prefetch) 560 dmu_zfetch(&db->db_dnode->dn_zfetch, db->db.db_offset, 561 db->db.db_size, TRUE); 562 if ((flags & DB_RF_HAVESTRUCT) == 0) 563 rw_exit(&db->db_dnode->dn_struct_rwlock); 564 } else if (db->db_state == DB_UNCACHED) { 565 if (zio == NULL) { 566 zio = zio_root(db->db_dnode->dn_objset->os_spa, 567 NULL, NULL, ZIO_FLAG_CANFAIL); 568 } 569 dbuf_read_impl(db, zio, &flags); 570 571 /* dbuf_read_impl has dropped db_mtx for us */ 572 573 if (prefetch) 574 dmu_zfetch(&db->db_dnode->dn_zfetch, db->db.db_offset, 575 db->db.db_size, flags & DB_RF_CACHED); 576 577 if ((flags & DB_RF_HAVESTRUCT) == 0) 578 rw_exit(&db->db_dnode->dn_struct_rwlock); 579 580 if (!havepzio) 581 err = zio_wait(zio); 582 } else { 583 mutex_exit(&db->db_mtx); 584 if (prefetch) 585 dmu_zfetch(&db->db_dnode->dn_zfetch, db->db.db_offset, 586 db->db.db_size, TRUE); 587 if ((flags & DB_RF_HAVESTRUCT) == 0) 588 rw_exit(&db->db_dnode->dn_struct_rwlock); 589 590 mutex_enter(&db->db_mtx); 591 if ((flags & DB_RF_NEVERWAIT) == 0) { 592 while (db->db_state == DB_READ || 593 db->db_state == DB_FILL) { 594 ASSERT(db->db_state == DB_READ || 595 (flags & DB_RF_HAVESTRUCT) == 0); 596 cv_wait(&db->db_changed, &db->db_mtx); 597 } 598 if (db->db_state == DB_UNCACHED) 599 err = EIO; 600 } 601 mutex_exit(&db->db_mtx); 602 } 603 604 ASSERT(err || havepzio || db->db_state == DB_CACHED); 605 return (err); 606 } 607 608 static void 609 dbuf_noread(dmu_buf_impl_t *db) 610 { 611 ASSERT(!refcount_is_zero(&db->db_holds)); 612 ASSERT(db->db_blkid != DB_BONUS_BLKID); 613 mutex_enter(&db->db_mtx); 614 while (db->db_state == DB_READ || db->db_state == DB_FILL) 615 cv_wait(&db->db_changed, &db->db_mtx); 616 if (db->db_state == DB_UNCACHED) { 617 ASSERT(db->db_buf == NULL); 618 ASSERT(db->db.db_data == NULL); 619 dbuf_set_data(db, arc_buf_alloc(db->db_dnode->dn_objset->os_spa, 620 db->db.db_size, db)); 621 db->db_state = DB_FILL; 622 } else { 623 ASSERT3U(db->db_state, ==, DB_CACHED); 624 } 625 mutex_exit(&db->db_mtx); 626 } 627 628 /* 629 * This is our just-in-time copy function. It makes a copy of 630 * buffers, that have been modified in a previous transaction 631 * group, before we modify them in the current active group. 632 * 633 * This function is used in two places: when we are dirtying a 634 * buffer for the first time in a txg, and when we are freeing 635 * a range in a dnode that includes this buffer. 636 * 637 * Note that when we are called from dbuf_free_range() we do 638 * not put a hold on the buffer, we just traverse the active 639 * dbuf list for the dnode. 
640 */ 641 static void 642 dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg) 643 { 644 arc_buf_t **quiescing, **syncing; 645 646 ASSERT(MUTEX_HELD(&db->db_mtx)); 647 ASSERT(db->db.db_data != NULL); 648 ASSERT(db->db_blkid != DB_BONUS_BLKID); 649 650 quiescing = (arc_buf_t **)&db->db_d.db_data_old[(txg-1)&TXG_MASK]; 651 syncing = (arc_buf_t **)&db->db_d.db_data_old[(txg-2)&TXG_MASK]; 652 653 /* 654 * If this buffer is referenced from the current quiescing 655 * transaction group: either make a copy and reset the reference 656 * to point to the copy, or (if there a no active holders) just 657 * null out the current db_data pointer. 658 */ 659 if (*quiescing == db->db_buf) { 660 /* 661 * If the quiescing txg is "dirty", then we better not 662 * be referencing the same buffer from the syncing txg. 663 */ 664 ASSERT(*syncing != db->db_buf); 665 if (refcount_count(&db->db_holds) > db->db_dirtycnt) { 666 int size = db->db.db_size; 667 *quiescing = arc_buf_alloc( 668 db->db_dnode->dn_objset->os_spa, size, db); 669 bcopy(db->db.db_data, (*quiescing)->b_data, size); 670 } else { 671 dbuf_set_data(db, NULL); 672 } 673 return; 674 } 675 676 /* 677 * If this buffer is referenced from the current syncing 678 * transaction group: either 679 * 1 - make a copy and reset the reference, or 680 * 2 - if there are no holders, just null the current db_data. 681 */ 682 if (*syncing == db->db_buf) { 683 ASSERT3P(*quiescing, ==, NULL); 684 ASSERT3U(db->db_dirtycnt, ==, 1); 685 if (refcount_count(&db->db_holds) > db->db_dirtycnt) { 686 int size = db->db.db_size; 687 /* we can't copy if we have already started a write */ 688 ASSERT(*syncing != db->db_data_pending); 689 *syncing = arc_buf_alloc( 690 db->db_dnode->dn_objset->os_spa, size, db); 691 bcopy(db->db.db_data, (*syncing)->b_data, size); 692 } else { 693 dbuf_set_data(db, NULL); 694 } 695 } 696 } 697 698 /* 699 * This is the "bonus buffer" version of the above routine 700 */ 701 static void 702 dbuf_fix_old_bonus_data(dmu_buf_impl_t *db, uint64_t txg) 703 { 704 void **quiescing, **syncing; 705 706 ASSERT(MUTEX_HELD(&db->db_mtx)); 707 ASSERT(db->db.db_data != NULL); 708 ASSERT(db->db_blkid == DB_BONUS_BLKID); 709 710 quiescing = &db->db_d.db_data_old[(txg-1)&TXG_MASK]; 711 syncing = &db->db_d.db_data_old[(txg-2)&TXG_MASK]; 712 713 if (*quiescing == db->db.db_data) { 714 ASSERT(*syncing != db->db.db_data); 715 *quiescing = zio_buf_alloc(DN_MAX_BONUSLEN); 716 bcopy(db->db.db_data, *quiescing, DN_MAX_BONUSLEN); 717 } else if (*syncing == db->db.db_data) { 718 ASSERT3P(*quiescing, ==, NULL); 719 ASSERT3U(db->db_dirtycnt, ==, 1); 720 *syncing = zio_buf_alloc(DN_MAX_BONUSLEN); 721 bcopy(db->db.db_data, *syncing, DN_MAX_BONUSLEN); 722 } 723 } 724 725 void 726 dbuf_unoverride(dmu_buf_impl_t *db, uint64_t txg) 727 { 728 ASSERT(db->db_blkid != DB_BONUS_BLKID); 729 ASSERT(MUTEX_HELD(&db->db_mtx)); 730 ASSERT(db->db_d.db_overridden_by[txg&TXG_MASK] != IN_DMU_SYNC); 731 732 if (db->db_d.db_overridden_by[txg&TXG_MASK] != NULL) { 733 /* free this block */ 734 ASSERT(list_link_active(&db->db_dirty_node[txg&TXG_MASK]) || 735 db->db_dnode->dn_free_txg == txg); 736 if (!BP_IS_HOLE(db->db_d.db_overridden_by[txg&TXG_MASK])) { 737 /* XXX can get silent EIO here */ 738 (void) arc_free(NULL, db->db_dnode->dn_objset->os_spa, 739 txg, db->db_d.db_overridden_by[txg&TXG_MASK], 740 NULL, NULL, ARC_WAIT); 741 } 742 kmem_free(db->db_d.db_overridden_by[txg&TXG_MASK], 743 sizeof (blkptr_t)); 744 db->db_d.db_overridden_by[txg&TXG_MASK] = NULL; 745 /* release the already-written buffer */ 746 
arc_release(db->db_d.db_data_old[txg&TXG_MASK], db); 747 } 748 } 749 750 void 751 dbuf_free_range(dnode_t *dn, uint64_t blkid, uint64_t nblks, dmu_tx_t *tx) 752 { 753 dmu_buf_impl_t *db, *db_next; 754 uint64_t txg = tx->tx_txg; 755 756 dprintf_dnode(dn, "blkid=%llu nblks=%llu\n", blkid, nblks); 757 mutex_enter(&dn->dn_dbufs_mtx); 758 for (db = list_head(&dn->dn_dbufs); db; db = db_next) { 759 db_next = list_next(&dn->dn_dbufs, db); 760 ASSERT(db->db_blkid != DB_BONUS_BLKID); 761 if (db->db_level != 0) 762 continue; 763 dprintf_dbuf(db, "found buf %s\n", ""); 764 if (db->db_blkid < blkid || 765 db->db_blkid >= blkid+nblks) 766 continue; 767 768 /* found a level 0 buffer in the range */ 769 if (dbuf_undirty(db, tx)) 770 continue; 771 772 mutex_enter(&db->db_mtx); 773 if (db->db_state == DB_UNCACHED || 774 db->db_state == DB_EVICTING) { 775 ASSERT(db->db.db_data == NULL); 776 mutex_exit(&db->db_mtx); 777 continue; 778 } 779 if (db->db_state == DB_READ || db->db_state == DB_FILL) { 780 /* will be handled in dbuf_read_done or dbuf_rele */ 781 db->db_d.db_freed_in_flight = TRUE; 782 mutex_exit(&db->db_mtx); 783 continue; 784 } 785 if (refcount_count(&db->db_holds) == 0) { 786 ASSERT(db->db_buf); 787 dbuf_clear(db); 788 continue; 789 } 790 /* The dbuf is CACHED and referenced */ 791 792 if (!list_link_active(&db->db_dirty_node[txg & TXG_MASK])) { 793 /* 794 * This dbuf is not currently dirty. Either 795 * uncache it (if its not referenced in the open 796 * context) or reset its contents to empty. 797 */ 798 dbuf_fix_old_data(db, txg); 799 } else { 800 if (db->db_d.db_overridden_by[txg & TXG_MASK] != NULL) { 801 /* 802 * This dbuf is overridden. Clear that state. 803 */ 804 dbuf_unoverride(db, txg); 805 } 806 if (db->db_blkid > dn->dn_maxblkid) 807 dn->dn_maxblkid = db->db_blkid; 808 } 809 /* fill in with appropriate data */ 810 if (db->db_state == DB_CACHED) { 811 ASSERT(db->db.db_data != NULL); 812 arc_release(db->db_buf, db); 813 bzero(db->db.db_data, db->db.db_size); 814 } 815 816 mutex_exit(&db->db_mtx); 817 } 818 mutex_exit(&dn->dn_dbufs_mtx); 819 } 820 821 static int 822 dbuf_new_block(dmu_buf_impl_t *db) 823 { 824 dsl_dataset_t *ds = db->db_objset->os_dsl_dataset; 825 uint64_t birth_txg = 0; 826 827 /* Don't count meta-objects */ 828 if (ds == NULL) 829 return (FALSE); 830 831 /* 832 * We don't need any locking to protect db_blkptr: 833 * If it's syncing, then db_dirtied will be set so we'll 834 * ignore db_blkptr. 835 */ 836 ASSERT(MUTEX_HELD(&db->db_mtx)); /* XXX strictly necessary? */ 837 /* If we have been dirtied since the last snapshot, its not new */ 838 if (db->db_dirtied) 839 birth_txg = db->db_dirtied; 840 else if (db->db_blkptr) 841 birth_txg = db->db_blkptr->blk_birth; 842 843 if (birth_txg) 844 return (!dsl_dataset_block_freeable(ds, birth_txg)); 845 else 846 return (TRUE); 847 } 848 849 void 850 dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx) 851 { 852 arc_buf_t *buf, *obuf; 853 int osize = db->db.db_size; 854 855 ASSERT(db->db_blkid != DB_BONUS_BLKID); 856 857 /* XXX does *this* func really need the lock? */ 858 ASSERT(RW_WRITE_HELD(&db->db_dnode->dn_struct_rwlock)); 859 860 /* 861 * This call to dbuf_will_dirty() with the dn_struct_rwlock held 862 * is OK, because there can be no other references to the db 863 * when we are changing its size, so no concurrent DB_FILL can 864 * be happening. 
865 */ 866 /* 867 * XXX we should be doing a dbuf_read, checking the return 868 * value and returning that up to our callers 869 */ 870 dbuf_will_dirty(db, tx); 871 872 /* create the data buffer for the new block */ 873 buf = arc_buf_alloc(db->db_dnode->dn_objset->os_spa, size, db); 874 875 /* copy old block data to the new block */ 876 obuf = db->db_buf; 877 bcopy(obuf->b_data, buf->b_data, MIN(osize, size)); 878 /* zero the remainder */ 879 if (size > osize) 880 bzero((uint8_t *)buf->b_data + osize, size - osize); 881 882 mutex_enter(&db->db_mtx); 883 dbuf_set_data(db, buf); 884 VERIFY(arc_buf_remove_ref(obuf, db) == 1); 885 db->db.db_size = size; 886 887 if (db->db_level == 0) 888 db->db_d.db_data_old[tx->tx_txg&TXG_MASK] = buf; 889 mutex_exit(&db->db_mtx); 890 891 dnode_willuse_space(db->db_dnode, size-osize, tx); 892 } 893 894 void 895 dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) 896 { 897 dnode_t *dn = db->db_dnode; 898 objset_impl_t *os = dn->dn_objset; 899 int drop_struct_lock = FALSE; 900 int txgoff = tx->tx_txg & TXG_MASK; 901 902 ASSERT(tx->tx_txg != 0); 903 ASSERT(!refcount_is_zero(&db->db_holds)); 904 DMU_TX_DIRTY_BUF(tx, db); 905 906 /* 907 * Shouldn't dirty a regular buffer in syncing context. Private 908 * objects may be dirtied in syncing context, but only if they 909 * were already pre-dirtied in open context. 910 * XXX We may want to prohibit dirtying in syncing context even 911 * if they did pre-dirty. 912 */ 913 ASSERT(!(dmu_tx_is_syncing(tx) && 914 !BP_IS_HOLE(&dn->dn_objset->os_rootbp) && 915 dn->dn_object != DMU_META_DNODE_OBJECT && 916 dn->dn_objset->os_dsl_dataset != NULL && 917 !dsl_dir_is_private( 918 dn->dn_objset->os_dsl_dataset->ds_dir))); 919 920 /* 921 * We make this assert for private objects as well, but after we 922 * check if we're already dirty. They are allowed to re-dirty 923 * in syncing context. 924 */ 925 ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT || 926 dn->dn_dirtyctx == DN_UNDIRTIED || 927 dn->dn_dirtyctx == 928 (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN)); 929 930 mutex_enter(&db->db_mtx); 931 /* XXX make this true for indirects too? */ 932 ASSERT(db->db_level != 0 || db->db_state == DB_CACHED || 933 db->db_state == DB_FILL); 934 935 /* 936 * If this buffer is currently part of an "overridden" region, 937 * we now need to remove it from that region. 938 */ 939 if (db->db_level == 0 && db->db_blkid != DB_BONUS_BLKID && 940 db->db_d.db_overridden_by[txgoff] != NULL) { 941 dbuf_unoverride(db, tx->tx_txg); 942 } 943 944 mutex_enter(&dn->dn_mtx); 945 /* 946 * Don't set dirtyctx to SYNC if we're just modifying this as we 947 * initialize the objset. 948 */ 949 if (dn->dn_dirtyctx == DN_UNDIRTIED && 950 !BP_IS_HOLE(&dn->dn_objset->os_rootbp)) { 951 dn->dn_dirtyctx = 952 (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN); 953 ASSERT(dn->dn_dirtyctx_firstset == NULL); 954 dn->dn_dirtyctx_firstset = kmem_alloc(1, KM_SLEEP); 955 } 956 mutex_exit(&dn->dn_mtx); 957 958 /* 959 * If this buffer is already dirty, we're done. 960 */ 961 if (list_link_active(&db->db_dirty_node[txgoff])) { 962 mutex_exit(&db->db_mtx); 963 return; 964 } 965 966 /* 967 * Only valid if not already dirty. 968 */ 969 ASSERT(dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx == 970 (dmu_tx_is_syncing(tx) ? 
DN_DIRTY_SYNC : DN_DIRTY_OPEN)); 971 972 ASSERT3U(dn->dn_nlevels, >, db->db_level); 973 ASSERT((dn->dn_phys->dn_nlevels == 0 && db->db_level == 0) || 974 dn->dn_phys->dn_nlevels > db->db_level || 975 dn->dn_next_nlevels[txgoff] > db->db_level || 976 dn->dn_next_nlevels[(tx->tx_txg-1) & TXG_MASK] > db->db_level || 977 dn->dn_next_nlevels[(tx->tx_txg-2) & TXG_MASK] > db->db_level); 978 979 /* 980 * We should only be dirtying in syncing context if it's the 981 * mos, a spa os, or we're initializing the os. However, we are 982 * allowed to dirty in syncing context provided we already 983 * dirtied it in open context. Hence we must make this 984 * assertion only if we're not already dirty. 985 */ 986 ASSERT(!dmu_tx_is_syncing(tx) || 987 os->os_dsl_dataset == NULL || 988 !dsl_dir_is_private(os->os_dsl_dataset->ds_dir) || 989 !BP_IS_HOLE(&os->os_rootbp)); 990 ASSERT(db->db.db_size != 0); 991 992 dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size); 993 994 /* 995 * If this buffer is dirty in an old transaction group we need 996 * to make a copy of it so that the changes we make in this 997 * transaction group won't leak out when we sync the older txg. 998 */ 999 if (db->db_blkid == DB_BONUS_BLKID) { 1000 ASSERT(db->db.db_data != NULL); 1001 ASSERT(db->db_d.db_data_old[txgoff] == NULL); 1002 dbuf_fix_old_bonus_data(db, tx->tx_txg); 1003 db->db_d.db_data_old[txgoff] = db->db.db_data; 1004 } else if (db->db_level == 0) { 1005 /* 1006 * Release the data buffer from the cache so that we 1007 * can modify it without impacting possible other users 1008 * of this cached data block. Note that indirect blocks 1009 * and private objects are not released until the syncing 1010 * state (since they are only modified then). 1011 */ 1012 ASSERT(db->db_buf != NULL); 1013 ASSERT(db->db_d.db_data_old[txgoff] == NULL); 1014 if (db->db.db_object != DMU_META_DNODE_OBJECT) { 1015 arc_release(db->db_buf, db); 1016 dbuf_fix_old_data(db, tx->tx_txg); 1017 ASSERT(db->db_buf != NULL); 1018 } 1019 db->db_d.db_data_old[txgoff] = db->db_buf; 1020 } 1021 1022 mutex_enter(&dn->dn_mtx); 1023 /* 1024 * We could have been freed_in_flight between the dbuf_noread 1025 * and dbuf_dirty. We win, as though the dbuf_noread() had 1026 * happened after the free. 1027 */ 1028 if (db->db_level == 0 && db->db_blkid != DB_BONUS_BLKID) { 1029 dnode_clear_range(dn, db->db_blkid, 1, tx); 1030 db->db_d.db_freed_in_flight = FALSE; 1031 } 1032 1033 db->db_dirtied = tx->tx_txg; 1034 list_insert_tail(&dn->dn_dirty_dbufs[txgoff], db); 1035 mutex_exit(&dn->dn_mtx); 1036 1037 if (db->db_blkid != DB_BONUS_BLKID) { 1038 /* 1039 * Update the accounting. 1040 */ 1041 if (!dbuf_new_block(db) && db->db_blkptr) { 1042 /* 1043 * This is only a guess -- if the dbuf is dirty 1044 * in a previous txg, we don't know how much 1045 * space it will use on disk yet. We should 1046 * really have the struct_rwlock to access 1047 * db_blkptr, but since this is just a guess, 1048 * it's OK if we get an odd answer. 
1049 */ 1050 dnode_willuse_space(dn, 1051 -bp_get_dasize(os->os_spa, db->db_blkptr), tx); 1052 } 1053 dnode_willuse_space(dn, db->db.db_size, tx); 1054 } 1055 1056 /* 1057 * This buffer is now part of this txg 1058 */ 1059 dbuf_add_ref(db, (void *)(uintptr_t)tx->tx_txg); 1060 db->db_dirtycnt += 1; 1061 ASSERT3U(db->db_dirtycnt, <=, 3); 1062 1063 mutex_exit(&db->db_mtx); 1064 1065 if (db->db_blkid == DB_BONUS_BLKID) { 1066 dnode_setdirty(dn, tx); 1067 return; 1068 } 1069 1070 if (db->db_level == 0) { 1071 dnode_new_blkid(dn, db->db_blkid, tx); 1072 ASSERT(dn->dn_maxblkid >= db->db_blkid); 1073 } 1074 1075 if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) { 1076 rw_enter(&dn->dn_struct_rwlock, RW_READER); 1077 drop_struct_lock = TRUE; 1078 } 1079 1080 if (db->db_level+1 < dn->dn_nlevels) { 1081 int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; 1082 dmu_buf_impl_t *parent; 1083 parent = dbuf_hold_level(dn, db->db_level+1, 1084 db->db_blkid >> epbs, FTAG); 1085 if (drop_struct_lock) 1086 rw_exit(&dn->dn_struct_rwlock); 1087 dbuf_dirty(parent, tx); 1088 dbuf_rele(parent, FTAG); 1089 } else { 1090 if (drop_struct_lock) 1091 rw_exit(&dn->dn_struct_rwlock); 1092 } 1093 1094 dnode_setdirty(dn, tx); 1095 } 1096 1097 static int 1098 dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) 1099 { 1100 dnode_t *dn = db->db_dnode; 1101 int txgoff = tx->tx_txg & TXG_MASK; 1102 int64_t holds; 1103 1104 ASSERT(tx->tx_txg != 0); 1105 ASSERT(db->db_blkid != DB_BONUS_BLKID); 1106 1107 mutex_enter(&db->db_mtx); 1108 1109 /* 1110 * If this buffer is not dirty, we're done. 1111 */ 1112 if (!list_link_active(&db->db_dirty_node[txgoff])) { 1113 mutex_exit(&db->db_mtx); 1114 return (0); 1115 } 1116 1117 /* 1118 * If this buffer is currently held, we cannot undirty 1119 * it, since one of the current holders may be in the 1120 * middle of an update. Note that users of dbuf_undirty() 1121 * should not place a hold on the dbuf before the call. 1122 */ 1123 if (refcount_count(&db->db_holds) > db->db_dirtycnt) { 1124 mutex_exit(&db->db_mtx); 1125 /* Make sure we don't toss this buffer at sync phase */ 1126 mutex_enter(&dn->dn_mtx); 1127 dnode_clear_range(dn, db->db_blkid, 1, tx); 1128 mutex_exit(&dn->dn_mtx); 1129 return (0); 1130 } 1131 1132 dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size); 1133 1134 dbuf_unoverride(db, tx->tx_txg); 1135 1136 ASSERT(db->db.db_size != 0); 1137 if (db->db_level == 0) { 1138 ASSERT(db->db_buf != NULL); 1139 ASSERT(db->db_d.db_data_old[txgoff] != NULL); 1140 if (db->db_d.db_data_old[txgoff] != db->db_buf) 1141 VERIFY(arc_buf_remove_ref( 1142 db->db_d.db_data_old[txgoff], db) == 1); 1143 db->db_d.db_data_old[txgoff] = NULL; 1144 } 1145 1146 /* XXX would be nice to fix up dn_towrite_space[] */ 1147 /* XXX undo db_dirtied? but how? 
*/ 1148 /* db->db_dirtied = tx->tx_txg; */ 1149 1150 mutex_enter(&dn->dn_mtx); 1151 list_remove(&dn->dn_dirty_dbufs[txgoff], db); 1152 mutex_exit(&dn->dn_mtx); 1153 1154 ASSERT(db->db_dirtycnt > 0); 1155 db->db_dirtycnt -= 1; 1156 1157 if ((holds = refcount_remove(&db->db_holds, 1158 (void *)(uintptr_t)tx->tx_txg)) == 0) { 1159 arc_buf_t *buf = db->db_buf; 1160 1161 ASSERT(arc_released(buf)); 1162 dbuf_set_data(db, NULL); 1163 VERIFY(arc_buf_remove_ref(buf, db) == 1); 1164 dbuf_evict(db); 1165 return (1); 1166 } 1167 ASSERT(holds > 0); 1168 1169 mutex_exit(&db->db_mtx); 1170 return (0); 1171 } 1172 1173 #pragma weak dmu_buf_will_dirty = dbuf_will_dirty 1174 void 1175 dbuf_will_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) 1176 { 1177 int rf = DB_RF_MUST_SUCCEED; 1178 1179 ASSERT(tx->tx_txg != 0); 1180 ASSERT(!refcount_is_zero(&db->db_holds)); 1181 1182 if (RW_WRITE_HELD(&db->db_dnode->dn_struct_rwlock)) 1183 rf |= DB_RF_HAVESTRUCT; 1184 (void) dbuf_read(db, NULL, rf); 1185 dbuf_dirty(db, tx); 1186 } 1187 1188 void 1189 dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx) 1190 { 1191 dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; 1192 1193 ASSERT(db->db_blkid != DB_BONUS_BLKID); 1194 ASSERT(tx->tx_txg != 0); 1195 ASSERT(db->db_level == 0); 1196 ASSERT(!refcount_is_zero(&db->db_holds)); 1197 1198 ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT || 1199 dmu_tx_private_ok(tx)); 1200 1201 dbuf_noread(db); 1202 dbuf_dirty(db, tx); 1203 } 1204 1205 #pragma weak dmu_buf_fill_done = dbuf_fill_done 1206 /* ARGSUSED */ 1207 void 1208 dbuf_fill_done(dmu_buf_impl_t *db, dmu_tx_t *tx) 1209 { 1210 mutex_enter(&db->db_mtx); 1211 DBUF_VERIFY(db); 1212 1213 if (db->db_state == DB_FILL) { 1214 if (db->db_level == 0 && db->db_d.db_freed_in_flight) { 1215 ASSERT(db->db_blkid != DB_BONUS_BLKID); 1216 /* we were freed while filling */ 1217 /* XXX dbuf_undirty? */ 1218 bzero(db->db.db_data, db->db.db_size); 1219 db->db_d.db_freed_in_flight = FALSE; 1220 } 1221 db->db_state = DB_CACHED; 1222 cv_broadcast(&db->db_changed); 1223 } 1224 mutex_exit(&db->db_mtx); 1225 } 1226 1227 /* 1228 * "Clear" the contents of this dbuf. This will mark the dbuf 1229 * EVICTING and clear *most* of its references. Unfortunetely, 1230 * when we are not holding the dn_dbufs_mtx, we can't clear the 1231 * entry in the dn_dbufs list. We have to wait until dbuf_destroy() 1232 * in this case. 
For callers from the DMU we will usually see: 1233 * dbuf_clear()->arc_buf_evict()->dbuf_do_evict()->dbuf_destroy() 1234 * For the arc callback, we will usually see: 1235 * dbuf_do_evict()->dbuf_clear();dbuf_destroy() 1236 * Sometimes, though, we will get a mix of these two: 1237 * DMU: dbuf_clear()->arc_buf_evict() 1238 * ARC: dbuf_do_evict()->dbuf_destroy() 1239 */ 1240 void 1241 dbuf_clear(dmu_buf_impl_t *db) 1242 { 1243 dnode_t *dn = db->db_dnode; 1244 dmu_buf_impl_t *parent = db->db_parent; 1245 dmu_buf_impl_t *dndb = dn->dn_dbuf; 1246 int dbuf_gone = FALSE; 1247 1248 ASSERT(MUTEX_HELD(&db->db_mtx)); 1249 ASSERT(refcount_is_zero(&db->db_holds)); 1250 1251 dbuf_evict_user(db); 1252 1253 if (db->db_state == DB_CACHED) { 1254 ASSERT(db->db.db_data != NULL); 1255 if (db->db_blkid == DB_BONUS_BLKID) 1256 zio_buf_free(db->db.db_data, DN_MAX_BONUSLEN); 1257 db->db.db_data = NULL; 1258 db->db_state = DB_UNCACHED; 1259 } 1260 1261 ASSERT3U(db->db_state, ==, DB_UNCACHED); 1262 ASSERT(db->db_data_pending == NULL); 1263 1264 db->db_state = DB_EVICTING; 1265 db->db_blkptr = NULL; 1266 1267 if (db->db_blkid != DB_BONUS_BLKID && MUTEX_HELD(&dn->dn_dbufs_mtx)) { 1268 list_remove(&dn->dn_dbufs, db); 1269 dnode_rele(dn, db); 1270 } 1271 1272 if (db->db_buf) 1273 dbuf_gone = arc_buf_evict(db->db_buf); 1274 1275 if (!dbuf_gone) 1276 mutex_exit(&db->db_mtx); 1277 1278 /* 1279 * If this dbuf is referened from an indirect dbuf, 1280 * decrement the ref count on the indirect dbuf. 1281 */ 1282 if (parent && parent != dndb) 1283 dbuf_rele(parent, db); 1284 } 1285 1286 static int 1287 dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse, 1288 dmu_buf_impl_t **parentp, blkptr_t **bpp) 1289 { 1290 int nlevels, epbs; 1291 1292 *parentp = NULL; 1293 *bpp = NULL; 1294 1295 ASSERT(blkid != DB_BONUS_BLKID); 1296 1297 if (dn->dn_phys->dn_nlevels == 0) 1298 nlevels = 1; 1299 else 1300 nlevels = dn->dn_phys->dn_nlevels; 1301 1302 epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; 1303 1304 ASSERT3U(level * epbs, <, 64); 1305 ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); 1306 if (level >= nlevels || 1307 (blkid > (dn->dn_phys->dn_maxblkid >> (level * epbs)))) { 1308 /* the buffer has no parent yet */ 1309 return (ENOENT); 1310 } else if (level < nlevels-1) { 1311 /* this block is referenced from an indirect block */ 1312 int err = dbuf_hold_impl(dn, level+1, 1313 blkid >> epbs, fail_sparse, NULL, parentp); 1314 if (err) 1315 return (err); 1316 err = dbuf_read(*parentp, NULL, 1317 (DB_RF_HAVESTRUCT | DB_RF_NOPREFETCH | DB_RF_CANFAIL)); 1318 if (err) { 1319 dbuf_rele(*parentp, NULL); 1320 *parentp = NULL; 1321 return (err); 1322 } 1323 *bpp = ((blkptr_t *)(*parentp)->db.db_data) + 1324 (blkid & ((1ULL << epbs) - 1)); 1325 return (0); 1326 } else { 1327 /* the block is referenced from the dnode */ 1328 ASSERT3U(level, ==, nlevels-1); 1329 ASSERT(dn->dn_phys->dn_nblkptr == 0 || 1330 blkid < dn->dn_phys->dn_nblkptr); 1331 if (dn->dn_dbuf) { 1332 dbuf_add_ref(dn->dn_dbuf, NULL); 1333 *parentp = dn->dn_dbuf; 1334 } 1335 *bpp = &dn->dn_phys->dn_blkptr[blkid]; 1336 return (0); 1337 } 1338 } 1339 1340 static dmu_buf_impl_t * 1341 dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid, 1342 dmu_buf_impl_t *parent, blkptr_t *blkptr) 1343 { 1344 objset_impl_t *os = dn->dn_objset; 1345 dmu_buf_impl_t *db, *odb; 1346 1347 ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); 1348 ASSERT(dn->dn_type != DMU_OT_NONE); 1349 1350 db = kmem_cache_alloc(dbuf_cache, KM_SLEEP); 1351 1352 db->db_objset = os; 1353 db->db.db_object = dn->dn_object; 
1354 db->db_level = level; 1355 db->db_blkid = blkid; 1356 db->db_dirtied = 0; 1357 db->db_dirtycnt = 0; 1358 db->db_dnode = dn; 1359 db->db_parent = parent; 1360 db->db_blkptr = blkptr; 1361 1362 bzero(&db->db_d, sizeof (db->db_d)); 1363 1364 if (blkid == DB_BONUS_BLKID) { 1365 ASSERT3P(parent, ==, dn->dn_dbuf); 1366 db->db.db_size = dn->dn_bonuslen; 1367 db->db.db_offset = DB_BONUS_BLKID; 1368 db->db_state = DB_UNCACHED; 1369 /* the bonus dbuf is not placed in the hash table */ 1370 return (db); 1371 } else { 1372 int blocksize = 1373 db->db_level ? 1<<dn->dn_indblkshift : dn->dn_datablksz; 1374 db->db.db_size = blocksize; 1375 db->db.db_offset = db->db_blkid * blocksize; 1376 } 1377 1378 /* 1379 * Hold the dn_dbufs_mtx while we get the new dbuf 1380 * in the hash table *and* added to the dbufs list. 1381 * This prevents a possible deadlock with someone 1382 * trying to look up this dbuf before its added to the 1383 * dn_dbufs list. 1384 */ 1385 mutex_enter(&dn->dn_dbufs_mtx); 1386 db->db_state = DB_EVICTING; 1387 if ((odb = dbuf_hash_insert(db)) != NULL) { 1388 /* someone else inserted it first */ 1389 kmem_cache_free(dbuf_cache, db); 1390 mutex_exit(&dn->dn_dbufs_mtx); 1391 return (odb); 1392 } 1393 list_insert_head(&dn->dn_dbufs, db); 1394 db->db_state = DB_UNCACHED; 1395 mutex_exit(&dn->dn_dbufs_mtx); 1396 1397 if (parent && parent != dn->dn_dbuf) 1398 dbuf_add_ref(parent, db); 1399 1400 ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT || 1401 refcount_count(&dn->dn_holds) > 0); 1402 (void) refcount_add(&dn->dn_holds, db); 1403 1404 dprintf_dbuf(db, "db=%p\n", db); 1405 1406 return (db); 1407 } 1408 1409 static int 1410 dbuf_do_evict(void *private) 1411 { 1412 arc_buf_t *buf = private; 1413 dmu_buf_impl_t *db = buf->b_private; 1414 1415 if (!MUTEX_HELD(&db->db_mtx)) 1416 mutex_enter(&db->db_mtx); 1417 1418 ASSERT(refcount_is_zero(&db->db_holds)); 1419 1420 if (db->db_state != DB_EVICTING) { 1421 ASSERT(db->db_state == DB_CACHED); 1422 DBUF_VERIFY(db); 1423 db->db_buf = NULL; 1424 dbuf_evict(db); 1425 } else { 1426 mutex_exit(&db->db_mtx); 1427 dbuf_destroy(db); 1428 } 1429 return (0); 1430 } 1431 1432 static void 1433 dbuf_destroy(dmu_buf_impl_t *db) 1434 { 1435 ASSERT(refcount_is_zero(&db->db_holds)); 1436 1437 if (db->db_blkid != DB_BONUS_BLKID) { 1438 dnode_t *dn = db->db_dnode; 1439 1440 /* 1441 * If this dbuf is still on the dn_dbufs list, 1442 * remove it from that list. 1443 */ 1444 if (list_link_active(&db->db_link)) { 1445 mutex_enter(&dn->dn_dbufs_mtx); 1446 list_remove(&dn->dn_dbufs, db); 1447 mutex_exit(&dn->dn_dbufs_mtx); 1448 1449 dnode_rele(dn, db); 1450 } 1451 dbuf_hash_remove(db); 1452 } 1453 db->db_parent = NULL; 1454 db->db_dnode = NULL; 1455 db->db_buf = NULL; 1456 1457 ASSERT(db->db.db_data == NULL); 1458 ASSERT(db->db_hash_next == NULL); 1459 ASSERT(db->db_blkptr == NULL); 1460 ASSERT(db->db_data_pending == NULL); 1461 1462 kmem_cache_free(dbuf_cache, db); 1463 } 1464 1465 void 1466 dbuf_prefetch(dnode_t *dn, uint64_t blkid) 1467 { 1468 dmu_buf_impl_t *db = NULL; 1469 blkptr_t *bp = NULL; 1470 1471 ASSERT(blkid != DB_BONUS_BLKID); 1472 ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); 1473 1474 if (dnode_block_freed(dn, blkid)) 1475 return; 1476 1477 /* dbuf_find() returns with db_mtx held */ 1478 if (db = dbuf_find(dn, 0, blkid)) { 1479 if (refcount_count(&db->db_holds) > 0) { 1480 /* 1481 * This dbuf is active. We assume that it is 1482 * already CACHED, or else about to be either 1483 * read or filled. 
1484 */ 1485 mutex_exit(&db->db_mtx); 1486 return; 1487 } 1488 mutex_exit(&db->db_mtx); 1489 db = NULL; 1490 } 1491 1492 if (dbuf_findbp(dn, 0, blkid, TRUE, &db, &bp) == 0) { 1493 if (bp && !BP_IS_HOLE(bp)) { 1494 uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH; 1495 zbookmark_t zb; 1496 zb.zb_objset = dn->dn_objset->os_dsl_dataset ? 1497 dn->dn_objset->os_dsl_dataset->ds_object : 0; 1498 zb.zb_object = dn->dn_object; 1499 zb.zb_level = 0; 1500 zb.zb_blkid = blkid; 1501 1502 (void) arc_read(NULL, dn->dn_objset->os_spa, bp, 1503 dmu_ot[dn->dn_type].ot_byteswap, 1504 NULL, NULL, ZIO_PRIORITY_ASYNC_READ, 1505 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, 1506 &aflags, &zb); 1507 } 1508 if (db) 1509 dbuf_rele(db, NULL); 1510 } 1511 } 1512 1513 /* 1514 * Returns with db_holds incremented, and db_mtx not held. 1515 * Note: dn_struct_rwlock must be held. 1516 */ 1517 int 1518 dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, int fail_sparse, 1519 void *tag, dmu_buf_impl_t **dbp) 1520 { 1521 dmu_buf_impl_t *db, *parent = NULL; 1522 1523 ASSERT(blkid != DB_BONUS_BLKID); 1524 ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); 1525 ASSERT3U(dn->dn_nlevels, >, level); 1526 1527 *dbp = NULL; 1528 top: 1529 /* dbuf_find() returns with db_mtx held */ 1530 db = dbuf_find(dn, level, blkid); 1531 1532 if (db == NULL) { 1533 blkptr_t *bp = NULL; 1534 int err; 1535 1536 ASSERT3P(parent, ==, NULL); 1537 err = dbuf_findbp(dn, level, blkid, fail_sparse, &parent, &bp); 1538 if (fail_sparse) { 1539 if (err == 0 && bp && BP_IS_HOLE(bp)) 1540 err = ENOENT; 1541 if (err) { 1542 if (parent) 1543 dbuf_rele(parent, NULL); 1544 return (err); 1545 } 1546 } 1547 if (err && err != ENOENT) 1548 return (err); 1549 db = dbuf_create(dn, level, blkid, parent, bp); 1550 } 1551 1552 if (db->db_buf && refcount_is_zero(&db->db_holds)) { 1553 arc_buf_add_ref(db->db_buf, db); 1554 if (db->db_buf->b_data == NULL) { 1555 dbuf_clear(db); 1556 if (parent) { 1557 dbuf_rele(parent, NULL); 1558 parent = NULL; 1559 } 1560 goto top; 1561 } 1562 ASSERT3P(db->db.db_data, ==, db->db_buf->b_data); 1563 } 1564 1565 ASSERT(db->db_buf == NULL || arc_referenced(db->db_buf)); 1566 1567 /* 1568 * If this buffer is currently syncing out, and we are 1569 * are still referencing it from db_data, we need to make 1570 * a copy of it in case we decide we want to dirty it 1571 * again in this txg. 1572 */ 1573 if (db->db_level == 0 && db->db_state == DB_CACHED && 1574 dn->dn_object != DMU_META_DNODE_OBJECT && 1575 db->db_data_pending == db->db_buf) { 1576 int size = (db->db_blkid == DB_BONUS_BLKID) ? 1577 DN_MAX_BONUSLEN : db->db.db_size; 1578 1579 dbuf_set_data(db, arc_buf_alloc(db->db_dnode->dn_objset->os_spa, 1580 size, db)); 1581 bcopy(db->db_data_pending->b_data, db->db.db_data, 1582 db->db.db_size); 1583 } 1584 1585 (void) refcount_add(&db->db_holds, tag); 1586 dbuf_update_data(db); 1587 DBUF_VERIFY(db); 1588 mutex_exit(&db->db_mtx); 1589 1590 /* NOTE: we can't rele the parent until after we drop the db_mtx */ 1591 if (parent) 1592 dbuf_rele(parent, NULL); 1593 1594 ASSERT3P(db->db_dnode, ==, dn); 1595 ASSERT3U(db->db_blkid, ==, blkid); 1596 ASSERT3U(db->db_level, ==, level); 1597 *dbp = db; 1598 1599 return (0); 1600 } 1601 1602 dmu_buf_impl_t * 1603 dbuf_hold(dnode_t *dn, uint64_t blkid, void *tag) 1604 { 1605 dmu_buf_impl_t *db; 1606 int err = dbuf_hold_impl(dn, 0, blkid, FALSE, tag, &db); 1607 return (err ? 
NULL : db); 1608 } 1609 1610 dmu_buf_impl_t * 1611 dbuf_hold_level(dnode_t *dn, int level, uint64_t blkid, void *tag) 1612 { 1613 dmu_buf_impl_t *db; 1614 int err = dbuf_hold_impl(dn, level, blkid, FALSE, tag, &db); 1615 return (err ? NULL : db); 1616 } 1617 1618 dmu_buf_impl_t * 1619 dbuf_create_bonus(dnode_t *dn) 1620 { 1621 dmu_buf_impl_t *db = dn->dn_bonus; 1622 1623 ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock)); 1624 1625 ASSERT(dn->dn_bonus == NULL); 1626 db = dbuf_create(dn, 0, DB_BONUS_BLKID, dn->dn_dbuf, NULL); 1627 return (db); 1628 } 1629 1630 #pragma weak dmu_buf_add_ref = dbuf_add_ref 1631 void 1632 dbuf_add_ref(dmu_buf_impl_t *db, void *tag) 1633 { 1634 int64_t holds = refcount_add(&db->db_holds, tag); 1635 ASSERT(holds > 1); 1636 } 1637 1638 #pragma weak dmu_buf_rele = dbuf_rele 1639 void 1640 dbuf_rele(dmu_buf_impl_t *db, void *tag) 1641 { 1642 int64_t holds; 1643 1644 mutex_enter(&db->db_mtx); 1645 DBUF_VERIFY(db); 1646 1647 holds = refcount_remove(&db->db_holds, tag); 1648 ASSERT(holds >= 0); 1649 1650 if (holds == db->db_dirtycnt && 1651 db->db_level == 0 && db->db_d.db_immediate_evict) 1652 dbuf_evict_user(db); 1653 1654 if (holds == 0) { 1655 if (db->db_blkid == DB_BONUS_BLKID) { 1656 mutex_exit(&db->db_mtx); 1657 dnode_rele(db->db_dnode, db); 1658 } else if (db->db_buf == NULL) { 1659 /* 1660 * This is a special case: we never associated this 1661 * dbuf with any data allocated from the ARC. 1662 */ 1663 ASSERT3U(db->db_state, ==, DB_UNCACHED); 1664 dbuf_evict(db); 1665 } else if (arc_released(db->db_buf)) { 1666 arc_buf_t *buf = db->db_buf; 1667 /* 1668 * This dbuf has anonymous data associated with it. 1669 */ 1670 dbuf_set_data(db, NULL); 1671 VERIFY(arc_buf_remove_ref(buf, db) == 1); 1672 dbuf_evict(db); 1673 } else { 1674 VERIFY(arc_buf_remove_ref(db->db_buf, db) == 0); 1675 mutex_exit(&db->db_mtx); 1676 } 1677 } else { 1678 mutex_exit(&db->db_mtx); 1679 } 1680 } 1681 1682 #pragma weak dmu_buf_refcount = dbuf_refcount 1683 uint64_t 1684 dbuf_refcount(dmu_buf_impl_t *db) 1685 { 1686 return (refcount_count(&db->db_holds)); 1687 } 1688 1689 void * 1690 dmu_buf_set_user(dmu_buf_t *db_fake, void *user_ptr, void *user_data_ptr_ptr, 1691 dmu_buf_evict_func_t *evict_func) 1692 { 1693 return (dmu_buf_update_user(db_fake, NULL, user_ptr, 1694 user_data_ptr_ptr, evict_func)); 1695 } 1696 1697 void * 1698 dmu_buf_set_user_ie(dmu_buf_t *db_fake, void *user_ptr, void *user_data_ptr_ptr, 1699 dmu_buf_evict_func_t *evict_func) 1700 { 1701 dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; 1702 1703 db->db_d.db_immediate_evict = TRUE; 1704 return (dmu_buf_update_user(db_fake, NULL, user_ptr, 1705 user_data_ptr_ptr, evict_func)); 1706 } 1707 1708 void * 1709 dmu_buf_update_user(dmu_buf_t *db_fake, void *old_user_ptr, void *user_ptr, 1710 void *user_data_ptr_ptr, dmu_buf_evict_func_t *evict_func) 1711 { 1712 dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; 1713 ASSERT(db->db_level == 0); 1714 1715 ASSERT((user_ptr == NULL) == (evict_func == NULL)); 1716 1717 mutex_enter(&db->db_mtx); 1718 1719 if (db->db_d.db_user_ptr == old_user_ptr) { 1720 db->db_d.db_user_ptr = user_ptr; 1721 db->db_d.db_user_data_ptr_ptr = user_data_ptr_ptr; 1722 db->db_d.db_evict_func = evict_func; 1723 1724 dbuf_update_data(db); 1725 } else { 1726 old_user_ptr = db->db_d.db_user_ptr; 1727 } 1728 1729 mutex_exit(&db->db_mtx); 1730 return (old_user_ptr); 1731 } 1732 1733 void * 1734 dmu_buf_get_user(dmu_buf_t *db_fake) 1735 { 1736 dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; 1737 
ASSERT(!refcount_is_zero(&db->db_holds)); 1738 1739 return (db->db_d.db_user_ptr); 1740 } 1741 1742 void 1743 dbuf_sync(dmu_buf_impl_t *db, zio_t *zio, dmu_tx_t *tx) 1744 { 1745 arc_buf_t **data; 1746 uint64_t txg = tx->tx_txg; 1747 dnode_t *dn = db->db_dnode; 1748 objset_impl_t *os = dn->dn_objset; 1749 int epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; 1750 int checksum, compress; 1751 zbookmark_t zb; 1752 int blksz; 1753 1754 ASSERT(dmu_tx_is_syncing(tx)); 1755 1756 dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr); 1757 1758 mutex_enter(&db->db_mtx); 1759 /* 1760 * To be synced, we must be dirtied. But we 1761 * might have been freed after the dirty. 1762 */ 1763 if (db->db_state == DB_UNCACHED) { 1764 /* This buffer has been freed since it was dirtied */ 1765 ASSERT(db->db.db_data == NULL); 1766 } else if (db->db_state == DB_FILL) { 1767 /* This buffer was freed and is now being re-filled */ 1768 ASSERT(db->db.db_data != db->db_d.db_data_old[txg&TXG_MASK]); 1769 } else { 1770 ASSERT3U(db->db_state, ==, DB_CACHED); 1771 } 1772 DBUF_VERIFY(db); 1773 1774 /* 1775 * Don't need a lock on db_dirty (dn_mtx), because it can't 1776 * be modified yet. 1777 */ 1778 1779 if (db->db_blkid == DB_BONUS_BLKID) { 1780 void **datap = &db->db_d.db_data_old[txg&TXG_MASK]; 1781 /* 1782 * Simply copy the bonus data into the dnode. It will 1783 * be written out when the dnode is synced (and it will 1784 * be synced, since it must have been dirty for dbuf_sync 1785 * to be called). 1786 */ 1787 /* 1788 * Use dn_phys->dn_bonuslen since db.db_size is the length 1789 * of the bonus buffer in the open transaction rather than 1790 * the syncing transaction. 1791 */ 1792 ASSERT(*datap != NULL); 1793 ASSERT3U(db->db_level, ==, 0); 1794 ASSERT3U(dn->dn_phys->dn_bonuslen, <=, DN_MAX_BONUSLEN); 1795 bcopy(*datap, DN_BONUS(dn->dn_phys), dn->dn_phys->dn_bonuslen); 1796 if (*datap != db->db.db_data) 1797 zio_buf_free(*datap, DN_MAX_BONUSLEN); 1798 db->db_d.db_data_old[txg&TXG_MASK] = NULL; 1799 db->db_data_pending = NULL; 1800 if (db->db_dirtied == txg) 1801 db->db_dirtied = 0; 1802 ASSERT(db->db_dirtycnt > 0); 1803 db->db_dirtycnt -= 1; 1804 mutex_exit(&db->db_mtx); 1805 dbuf_rele(db, (void *)(uintptr_t)txg); 1806 return; 1807 } 1808 1809 if (db->db_level == 0) { 1810 data = (arc_buf_t **)&db->db_d.db_data_old[txg&TXG_MASK]; 1811 blksz = arc_buf_size(*data); 1812 1813 /* 1814 * This buffer is in the middle of an immdiate write. 1815 * Wait for the synchronous IO to complete. 1816 */ 1817 while (db->db_d.db_overridden_by[txg&TXG_MASK] == IN_DMU_SYNC) { 1818 ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT); 1819 cv_wait(&db->db_changed, &db->db_mtx); 1820 ASSERT(db->db_d.db_overridden_by[txg&TXG_MASK]); 1821 } 1822 /* 1823 * If this buffer is currently "in use" (i.e., there are 1824 * active holds and db_data still references it), then make 1825 * a copy before we start the write so that any modifications 1826 * from the open txg will not leak into this write. 1827 * 1828 * NOTE: this copy does not need to be made for objects only 1829 * modified in the syncing context (e.g. DNONE_DNODE blocks) 1830 * or if there is no actual write involved (bonus blocks). 
1831 */ 1832 if (dn->dn_object != DMU_META_DNODE_OBJECT && 1833 db->db_d.db_overridden_by[txg&TXG_MASK] == NULL) { 1834 if (refcount_count(&db->db_holds) > 1 && 1835 *data == db->db_buf) { 1836 *data = arc_buf_alloc(os->os_spa, blksz, db); 1837 bcopy(db->db.db_data, (*data)->b_data, blksz); 1838 } 1839 db->db_data_pending = *data; 1840 } else if (dn->dn_object == DMU_META_DNODE_OBJECT) { 1841 /* 1842 * Private object buffers are released here rather 1843 * than in dbuf_dirty() since they are only modified 1844 * in the syncing context and we don't want the 1845 * overhead of making multiple copies of the data. 1846 */ 1847 arc_release(db->db_buf, db); 1848 } 1849 } else { 1850 data = &db->db_buf; 1851 if (*data == NULL) { 1852 /* 1853 * This can happen if we dirty and then free 1854 * the level-0 data blocks in the same txg. So 1855 * this indirect remains unchanged. 1856 */ 1857 if (db->db_dirtied == txg) 1858 db->db_dirtied = 0; 1859 ASSERT(db->db_dirtycnt > 0); 1860 db->db_dirtycnt -= 1; 1861 mutex_exit(&db->db_mtx); 1862 dbuf_rele(db, (void *)(uintptr_t)txg); 1863 return; 1864 } 1865 blksz = db->db.db_size; 1866 ASSERT3U(blksz, ==, 1<<dn->dn_phys->dn_indblkshift); 1867 } 1868 1869 ASSERT(*data != NULL); 1870 1871 if (db->db_level > 0 && !arc_released(db->db_buf)) { 1872 /* 1873 * This indirect buffer was marked dirty, but 1874 * never modified (if it had been modified, then 1875 * we would have released the buffer). There is 1876 * no reason to write anything. 1877 */ 1878 db->db_data_pending = NULL; 1879 if (db->db_dirtied == txg) 1880 db->db_dirtied = 0; 1881 ASSERT(db->db_dirtycnt > 0); 1882 db->db_dirtycnt -= 1; 1883 mutex_exit(&db->db_mtx); 1884 dbuf_rele(db, (void *)(uintptr_t)txg); 1885 return; 1886 } else if (db->db_blkptr == NULL && 1887 db->db_level == dn->dn_phys->dn_nlevels-1 && 1888 db->db_blkid < dn->dn_phys->dn_nblkptr) { 1889 /* 1890 * This buffer was allocated at a time when there was 1891 * no available blkptrs from the dnode, or it was 1892 * inappropriate to hook it in (i.e., nlevels mis-match). 1893 */ 1894 ASSERT(db->db_blkptr == NULL); 1895 ASSERT(db->db_parent == NULL); 1896 db->db_parent = dn->dn_dbuf; 1897 db->db_blkptr = &dn->dn_phys->dn_blkptr[db->db_blkid]; 1898 DBUF_VERIFY(db); 1899 mutex_exit(&db->db_mtx); 1900 } else if (db->db_blkptr == NULL) { 1901 dmu_buf_impl_t *parent = db->db_parent; 1902 1903 mutex_exit(&db->db_mtx); 1904 ASSERT(dn->dn_phys->dn_nlevels > 1); 1905 if (parent == NULL) { 1906 rw_enter(&dn->dn_struct_rwlock, RW_READER); 1907 (void) dbuf_hold_impl(dn, db->db_level+1, 1908 db->db_blkid >> epbs, FALSE, FTAG, &parent); 1909 rw_exit(&dn->dn_struct_rwlock); 1910 dbuf_add_ref(parent, db); 1911 db->db_parent = parent; 1912 dbuf_rele(parent, FTAG); 1913 } 1914 (void) dbuf_read(parent, NULL, DB_RF_MUST_SUCCEED); 1915 } else { 1916 mutex_exit(&db->db_mtx); 1917 } 1918 1919 ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT || db->db_parent != NULL); 1920 1921 if (db->db_level > 0 && 1922 db->db_blkid > dn->dn_phys->dn_maxblkid >> (db->db_level * epbs)) { 1923 /* 1924 * Don't write indirect blocks past EOF. 1925 * We get these when we truncate a file *after* dirtying 1926 * blocks in the truncate range (we undirty the level 0 1927 * blocks in dbuf_free_range(), but not the indirects). 1928 */ 1929 #ifdef ZFS_DEBUG 1930 /* 1931 * Verify that this indirect block is empty. 
	if (db->db_level > 0 &&
	    db->db_blkid > dn->dn_phys->dn_maxblkid >> (db->db_level * epbs)) {
		/*
		 * Don't write indirect blocks past EOF.
		 * We get these when we truncate a file *after* dirtying
		 * blocks in the truncate range (we undirty the level 0
		 * blocks in dbuf_free_range(), but not the indirects).
		 */
#ifdef ZFS_DEBUG
		/*
		 * Verify that this indirect block is empty.
		 */
		blkptr_t *bplist;
		int i;

		mutex_enter(&db->db_mtx);
		bplist = db->db.db_data;
		for (i = 0; i < (1 << epbs); i++) {
			if (!BP_IS_HOLE(&bplist[i])) {
				panic("data past EOF: "
				    "db=%p level=%d id=%llu i=%d\n",
				    db, db->db_level,
				    (u_longlong_t)db->db_blkid, i);
			}
		}
		mutex_exit(&db->db_mtx);
#endif
		ASSERT(db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr));
		mutex_enter(&db->db_mtx);
		db->db_dirtycnt -= 1;
		mutex_exit(&db->db_mtx);
		dbuf_rele(db, (void *)(uintptr_t)txg);
		return;
	}

	if (db->db_parent != dn->dn_dbuf) {
		dmu_buf_impl_t *parent = db->db_parent;

		mutex_enter(&db->db_mtx);
		ASSERT(db->db_level == parent->db_level-1);
		ASSERT(list_link_active(&parent->db_dirty_node[txg&TXG_MASK]));
		/*
		 * We may have read this indirect block after we dirtied it,
		 * in which case it was never released from the cache, so
		 * release it now.
		 */
		arc_release(parent->db_buf, db->db_parent);

		db->db_blkptr = (blkptr_t *)parent->db.db_data +
		    (db->db_blkid & ((1ULL << epbs) - 1));
		DBUF_VERIFY(db);
		mutex_exit(&db->db_mtx);
#ifdef ZFS_DEBUG
	} else {
		/*
		 * We don't need to dnode_setdirty(dn) because if we got
		 * here then the parent is already dirty.
		 */
		ASSERT(db->db_level == dn->dn_phys->dn_nlevels-1);
		ASSERT3P(db->db_blkptr, ==,
		    &dn->dn_phys->dn_blkptr[db->db_blkid]);
#endif
	}
	ASSERT(db->db_parent == NULL || arc_released(db->db_parent->db_buf));

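	/*
	 * What follows is the "override" case for level-0 blocks: if
	 * db_overridden_by[] holds a block pointer, the data has already
	 * been written out synchronously (by dmu_sync(), typically on
	 * behalf of an immediate write).  Rather than issuing another
	 * write, we adopt that block pointer: account for the space
	 * delta, record the new block as born and the old one as killed,
	 * and copy the override bp into place.
	 */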
	if (db->db_level == 0 &&
	    db->db_d.db_overridden_by[txg&TXG_MASK] != NULL) {
		arc_buf_t **old =
		    (arc_buf_t **)&db->db_d.db_data_old[txg&TXG_MASK];
		blkptr_t **bpp = &db->db_d.db_overridden_by[txg&TXG_MASK];
		int old_size = bp_get_dasize(os->os_spa, db->db_blkptr);
		int new_size = bp_get_dasize(os->os_spa, *bpp);

		ASSERT(db->db_blkid != DB_BONUS_BLKID);

		dnode_diduse_space(dn, new_size-old_size);
		mutex_enter(&dn->dn_mtx);
		if (db->db_blkid > dn->dn_phys->dn_maxblkid)
			dn->dn_phys->dn_maxblkid = db->db_blkid;
		mutex_exit(&dn->dn_mtx);

		dsl_dataset_block_born(os->os_dsl_dataset, *bpp, tx);
		if (!BP_IS_HOLE(db->db_blkptr))
			dsl_dataset_block_kill(os->os_dsl_dataset,
			    db->db_blkptr, os->os_synctx);

		mutex_enter(&db->db_mtx);
		*db->db_blkptr = **bpp;
		kmem_free(*bpp, sizeof (blkptr_t));
		*bpp = NULL;

		if (*old != db->db_buf)
			VERIFY(arc_buf_remove_ref(*old, db) == 1);
		else if (!BP_IS_HOLE(db->db_blkptr))
			arc_set_callback(db->db_buf, dbuf_do_evict, db);
		else
			ASSERT(arc_released(db->db_buf));
		*old = NULL;
		db->db_data_pending = NULL;

		cv_broadcast(&db->db_changed);

		ASSERT(db->db_dirtycnt > 0);
		db->db_dirtycnt -= 1;
		mutex_exit(&db->db_mtx);
		dbuf_rele(db, (void *)(uintptr_t)txg);
		return;
	}

	if (db->db_level > 0) {
		/*
		 * XXX -- we should design a compression algorithm
		 * that specializes in arrays of bps.
		 */
		checksum = ZIO_CHECKSUM_FLETCHER_4;
		if (zfs_mdcomp_disable)
			compress = ZIO_COMPRESS_EMPTY;
		else
			compress = ZIO_COMPRESS_LZJB;
	} else {
		/*
		 * Allow dnode settings to override objset settings,
		 * except for metadata checksums.
		 */
		if (dmu_ot[dn->dn_type].ot_metadata) {
			checksum = os->os_md_checksum;
			compress = zio_compress_select(dn->dn_compress,
			    os->os_md_compress);
		} else {
			checksum = zio_checksum_select(dn->dn_checksum,
			    os->os_checksum);
			compress = zio_compress_select(dn->dn_compress,
			    os->os_compress);
		}
	}
#ifdef ZFS_DEBUG
	if (db->db_parent) {
		ASSERT(list_link_active(
		    &db->db_parent->db_dirty_node[txg&TXG_MASK]));
		ASSERT(db->db_parent == dn->dn_dbuf ||
		    db->db_parent->db_level > 0);
		if (dn->dn_object == DMU_META_DNODE_OBJECT || db->db_level > 0)
			ASSERT(*data == db->db_buf);
	}
#endif
	ASSERT3U(db->db_blkptr->blk_birth, <=, tx->tx_txg);
	zb.zb_objset = os->os_dsl_dataset ? os->os_dsl_dataset->ds_object : 0;
	zb.zb_object = db->db.db_object;
	zb.zb_level = db->db_level;
	zb.zb_blkid = db->db_blkid;

	(void) arc_write(zio, os->os_spa, checksum, compress,
	    dmu_get_replication_level(os->os_spa, &zb, dn->dn_type), txg,
	    db->db_blkptr, *data, dbuf_write_done, db,
	    ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, ARC_NOWAIT, &zb);
	/*
	 * We can't access db after arc_write, since it could finish
	 * and be freed, and we have no locks on it.
	 */
}

/*
 * dbuf_arg carries an objset and a copy of a block pointer to the
 * dbuf_do_born()/dbuf_do_kill() taskq callbacks below; the callback
 * frees it when it is done.
 */
struct dbuf_arg {
	objset_impl_t *os;
	blkptr_t bp;
};

static void
dbuf_do_born(void *arg)
{
	struct dbuf_arg *da = arg;
	dsl_dataset_block_born(da->os->os_dsl_dataset,
	    &da->bp, da->os->os_synctx);
	kmem_free(da, sizeof (struct dbuf_arg));
}

static void
dbuf_do_kill(void *arg)
{
	struct dbuf_arg *da = arg;
	dsl_dataset_block_kill(da->os->os_dsl_dataset,
	    &da->bp, da->os->os_synctx);
	kmem_free(da, sizeof (struct dbuf_arg));
}

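/*
 * arc_write() completion callback (see the arc_write() call in
 * dbuf_sync() above).  By the time we get here the block has been
 * written, so this is all bookkeeping: charge the space delta to the
 * dnode, bump dn_maxblkid if needed, compute blk_fill and set the
 * type and level on the block pointer, wake anyone waiting on
 * db_changed, and -- if the block was actually reallocated -- dispatch
 * the born/kill accounting for the new and old locations to dbuf_tq.
 */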
/* ARGSUSED */
static void
dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
{
	dmu_buf_impl_t *db = vdb;
	dnode_t *dn = db->db_dnode;
	objset_impl_t *os = dn->dn_objset;
	uint64_t txg = zio->io_txg;
	uint64_t fill = 0;
	int i;
	int old_size, new_size;

	ASSERT3U(zio->io_error, ==, 0);

	dprintf_dbuf_bp(db, &zio->io_bp_orig, "bp_orig: %s", "");

	old_size = bp_get_dasize(os->os_spa, &zio->io_bp_orig);
	new_size = bp_get_dasize(os->os_spa, zio->io_bp);

	dnode_diduse_space(dn, new_size-old_size);

	mutex_enter(&db->db_mtx);

	ASSERT(db->db_d.db_overridden_by[txg&TXG_MASK] == NULL);

	if (db->db_dirtied == txg)
		db->db_dirtied = 0;

	if (db->db_level == 0) {
		arc_buf_t **old =
		    (arc_buf_t **)&db->db_d.db_data_old[txg&TXG_MASK];

		ASSERT(db->db_blkid != DB_BONUS_BLKID);

		if (*old != db->db_buf)
			VERIFY(arc_buf_remove_ref(*old, db) == 1);
		else if (!BP_IS_HOLE(db->db_blkptr))
			arc_set_callback(db->db_buf, dbuf_do_evict, db);
		else
			ASSERT(arc_released(db->db_buf));
		*old = NULL;
		db->db_data_pending = NULL;

		mutex_enter(&dn->dn_mtx);
		if (db->db_blkid > dn->dn_phys->dn_maxblkid &&
		    !BP_IS_HOLE(db->db_blkptr))
			dn->dn_phys->dn_maxblkid = db->db_blkid;
		mutex_exit(&dn->dn_mtx);

		if (dn->dn_type == DMU_OT_DNODE) {
			dnode_phys_t *dnp = db->db.db_data;
			for (i = db->db.db_size >> DNODE_SHIFT; i > 0;
			    i--, dnp++) {
				if (dnp->dn_type != DMU_OT_NONE)
					fill++;
			}
		} else {
			if (!BP_IS_HOLE(db->db_blkptr))
				fill = 1;
		}
	} else {
		blkptr_t *bp = db->db.db_data;
		ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
		if (!BP_IS_HOLE(db->db_blkptr)) {
			int epbs =
			    dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
			ASSERT3U(BP_GET_LSIZE(zio->io_bp), ==, db->db.db_size);
			ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==,
			    db->db.db_size);
			ASSERT3U(dn->dn_phys->dn_maxblkid
			    >> (db->db_level * epbs), >=, db->db_blkid);
			arc_set_callback(db->db_buf, dbuf_do_evict, db);
		}
		for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, bp++) {
			if (BP_IS_HOLE(bp))
				continue;
			ASSERT3U(BP_GET_LSIZE(bp), ==,
			    db->db_level == 1 ? dn->dn_datablksz :
			    (1<<dn->dn_phys->dn_indblkshift));
			fill += bp->blk_fill;
		}
	}

	if (!BP_IS_HOLE(db->db_blkptr)) {
		db->db_blkptr->blk_fill = fill;
		BP_SET_TYPE(db->db_blkptr, dn->dn_type);
		BP_SET_LEVEL(db->db_blkptr, db->db_level);
	} else {
		ASSERT3U(fill, ==, 0);
		ASSERT3U(db->db_blkptr->blk_fill, ==, 0);
	}

	dprintf_dbuf_bp(db, db->db_blkptr,
	    "wrote %llu bytes to blkptr:", zio->io_size);

	ASSERT(db->db_parent == NULL ||
	    list_link_active(&db->db_parent->db_dirty_node[txg&TXG_MASK]));
	cv_broadcast(&db->db_changed);
	ASSERT(db->db_dirtycnt > 0);
	db->db_dirtycnt -= 1;
	mutex_exit(&db->db_mtx);

	/* We must do this after we've set the bp's type and level */
	if (!DVA_EQUAL(BP_IDENTITY(zio->io_bp),
	    BP_IDENTITY(&zio->io_bp_orig))) {
		struct dbuf_arg *da;
		da = kmem_alloc(sizeof (struct dbuf_arg), KM_SLEEP);
		da->os = os;
		da->bp = *zio->io_bp;
		(void) taskq_dispatch(dbuf_tq, dbuf_do_born, da, 0);
		if (!BP_IS_HOLE(&zio->io_bp_orig)) {
			da = kmem_alloc(sizeof (struct dbuf_arg), KM_SLEEP);
			da->os = os;
			da->bp = zio->io_bp_orig;
			(void) taskq_dispatch(dbuf_tq, dbuf_do_kill, da, 0);
		}
	}

	dbuf_rele(db, (void *)(uintptr_t)txg);
}
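
/*
 * A worked example of the blk_fill accounting above (illustrative
 * numbers): consider a level-1 indirect block whose 128 child blkptrs
 * include three that are not holes, each pointing to a level-0 data
 * block with blk_fill == 1.  dbuf_write_done() for that indirect sums
 * the children and records blk_fill = 3 in its own block pointer.
 * For a level-0 block of the meta-dnode (DMU_OT_DNODE), blk_fill
 * instead counts the dnodes in the block whose dn_type != DMU_OT_NONE.
 */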