1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2006 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 #pragma ident "%Z%%M% %I% %E% SMI" 27 28 #include <sys/zfs_context.h> 29 #include <sys/dmu.h> 30 #include <sys/dmu_impl.h> 31 #include <sys/dbuf.h> 32 #include <sys/dmu_objset.h> 33 #include <sys/dsl_dataset.h> 34 #include <sys/dsl_dir.h> 35 #include <sys/dmu_tx.h> 36 #include <sys/spa.h> 37 #include <sys/zio.h> 38 #include <sys/dmu_zfetch.h> 39 40 static void dbuf_destroy(dmu_buf_impl_t *db); 41 static int dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx); 42 static arc_done_func_t dbuf_write_done; 43 44 int zfs_mdcomp_disable = 0; 45 46 /* 47 * Global data structures and functions for the dbuf cache. 48 */ 49 taskq_t *dbuf_tq; 50 static kmem_cache_t *dbuf_cache; 51 52 /* ARGSUSED */ 53 static int 54 dbuf_cons(void *vdb, void *unused, int kmflag) 55 { 56 dmu_buf_impl_t *db = vdb; 57 bzero(db, sizeof (dmu_buf_impl_t)); 58 59 mutex_init(&db->db_mtx, NULL, MUTEX_DEFAULT, NULL); 60 cv_init(&db->db_changed, NULL, CV_DEFAULT, NULL); 61 refcount_create(&db->db_holds); 62 return (0); 63 } 64 65 /* ARGSUSED */ 66 static void 67 dbuf_dest(void *vdb, void *unused) 68 { 69 dmu_buf_impl_t *db = vdb; 70 mutex_destroy(&db->db_mtx); 71 cv_destroy(&db->db_changed); 72 refcount_destroy(&db->db_holds); 73 } 74 75 /* 76 * dbuf hash table routines 77 */ 78 static dbuf_hash_table_t dbuf_hash_table; 79 80 static uint64_t dbuf_hash_count; 81 82 static uint64_t 83 dbuf_hash(void *os, uint64_t obj, uint8_t lvl, uint64_t blkid) 84 { 85 uintptr_t osv = (uintptr_t)os; 86 uint64_t crc = -1ULL; 87 88 ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY); 89 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (lvl)) & 0xFF]; 90 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (osv >> 6)) & 0xFF]; 91 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 0)) & 0xFF]; 92 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 8)) & 0xFF]; 93 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 0)) & 0xFF]; 94 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 8)) & 0xFF]; 95 96 crc ^= (osv>>14) ^ (obj>>16) ^ (blkid>>16); 97 98 return (crc); 99 } 100 101 #define DBUF_HASH(os, obj, level, blkid) dbuf_hash(os, obj, level, blkid); 102 103 #define DBUF_EQUAL(dbuf, os, obj, level, blkid) \ 104 ((dbuf)->db.db_object == (obj) && \ 105 (dbuf)->db_objset == (os) && \ 106 (dbuf)->db_level == (level) && \ 107 (dbuf)->db_blkid == (blkid)) 108 109 dmu_buf_impl_t * 110 dbuf_find(dnode_t *dn, uint8_t level, uint64_t blkid) 111 { 112 dbuf_hash_table_t *h = &dbuf_hash_table; 113 objset_impl_t *os = dn->dn_objset; 114 uint64_t obj = dn->dn_object; 115 uint64_t hv = DBUF_HASH(os, obj, level, blkid); 116 uint64_t idx = hv & 
	    h->hash_table_mask;
	dmu_buf_impl_t *db;

	mutex_enter(DBUF_HASH_MUTEX(h, idx));
	for (db = h->hash_table[idx]; db != NULL; db = db->db_hash_next) {
		if (DBUF_EQUAL(db, os, obj, level, blkid)) {
			mutex_enter(&db->db_mtx);
			if (db->db_state != DB_EVICTING) {
				mutex_exit(DBUF_HASH_MUTEX(h, idx));
				return (db);
			}
			mutex_exit(&db->db_mtx);
		}
	}
	mutex_exit(DBUF_HASH_MUTEX(h, idx));
	return (NULL);
}

/*
 * Insert an entry into the hash table.  If there is already an element
 * equal to elem in the hash table, then the already existing element
 * will be returned and the new element will not be inserted.
 * Otherwise returns NULL.
 */
static dmu_buf_impl_t *
dbuf_hash_insert(dmu_buf_impl_t *db)
{
	dbuf_hash_table_t *h = &dbuf_hash_table;
	objset_impl_t *os = db->db_objset;
	uint64_t obj = db->db.db_object;
	int level = db->db_level;
	uint64_t blkid = db->db_blkid;
	uint64_t hv = DBUF_HASH(os, obj, level, blkid);
	uint64_t idx = hv & h->hash_table_mask;
	dmu_buf_impl_t *dbf;

	mutex_enter(DBUF_HASH_MUTEX(h, idx));
	for (dbf = h->hash_table[idx]; dbf != NULL; dbf = dbf->db_hash_next) {
		if (DBUF_EQUAL(dbf, os, obj, level, blkid)) {
			mutex_enter(&dbf->db_mtx);
			if (dbf->db_state != DB_EVICTING) {
				mutex_exit(DBUF_HASH_MUTEX(h, idx));
				return (dbf);
			}
			mutex_exit(&dbf->db_mtx);
		}
	}

	mutex_enter(&db->db_mtx);
	db->db_hash_next = h->hash_table[idx];
	h->hash_table[idx] = db;
	mutex_exit(DBUF_HASH_MUTEX(h, idx));
	atomic_add_64(&dbuf_hash_count, 1);

	return (NULL);
}

/*
 * Remove an entry from the hash table.  This operation will
 * fail if there are any existing holds on the db.
 */
static void
dbuf_hash_remove(dmu_buf_impl_t *db)
{
	dbuf_hash_table_t *h = &dbuf_hash_table;
	uint64_t hv = DBUF_HASH(db->db_objset, db->db.db_object,
	    db->db_level, db->db_blkid);
	uint64_t idx = hv & h->hash_table_mask;
	dmu_buf_impl_t *dbf, **dbp;

	/*
	 * We mustn't hold db_mtx to maintain lock ordering:
	 * DBUF_HASH_MUTEX > db_mtx.
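	 * (dbuf_find() and dbuf_hash_insert() take db_mtx while already
	 * holding the hash mutex, so acquiring the two locks in the
	 * opposite order here could deadlock.)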
189 */ 190 ASSERT(refcount_is_zero(&db->db_holds)); 191 ASSERT(db->db_state == DB_EVICTING); 192 ASSERT(!MUTEX_HELD(&db->db_mtx)); 193 194 mutex_enter(DBUF_HASH_MUTEX(h, idx)); 195 dbp = &h->hash_table[idx]; 196 while ((dbf = *dbp) != db) { 197 dbp = &dbf->db_hash_next; 198 ASSERT(dbf != NULL); 199 } 200 *dbp = db->db_hash_next; 201 db->db_hash_next = NULL; 202 mutex_exit(DBUF_HASH_MUTEX(h, idx)); 203 atomic_add_64(&dbuf_hash_count, -1); 204 } 205 206 static arc_evict_func_t dbuf_do_evict; 207 208 static void 209 dbuf_evict_user(dmu_buf_impl_t *db) 210 { 211 ASSERT(MUTEX_HELD(&db->db_mtx)); 212 213 if (db->db_level != 0 || db->db_d.db_evict_func == NULL) 214 return; 215 216 if (db->db_d.db_user_data_ptr_ptr) 217 *db->db_d.db_user_data_ptr_ptr = db->db.db_data; 218 db->db_d.db_evict_func(&db->db, db->db_d.db_user_ptr); 219 db->db_d.db_user_ptr = NULL; 220 db->db_d.db_user_data_ptr_ptr = NULL; 221 db->db_d.db_evict_func = NULL; 222 } 223 224 void 225 dbuf_evict(dmu_buf_impl_t *db) 226 { 227 int i; 228 229 ASSERT(MUTEX_HELD(&db->db_mtx)); 230 ASSERT(db->db_buf == NULL); 231 232 #ifdef ZFS_DEBUG 233 for (i = 0; i < TXG_SIZE; i++) { 234 ASSERT(!list_link_active(&db->db_dirty_node[i])); 235 ASSERT(db->db_level != 0 || db->db_d.db_data_old[i] == NULL); 236 } 237 #endif 238 dbuf_clear(db); 239 dbuf_destroy(db); 240 } 241 242 void 243 dbuf_init(void) 244 { 245 uint64_t hsize = 1ULL << 16; 246 dbuf_hash_table_t *h = &dbuf_hash_table; 247 int i; 248 249 /* 250 * The hash table is big enough to fill all of physical memory 251 * with an average 4K block size. The table will take up 252 * totalmem*sizeof(void*)/4K (i.e. 2MB/GB with 8-byte pointers). 253 */ 254 while (hsize * 4096 < physmem * PAGESIZE) 255 hsize <<= 1; 256 257 retry: 258 h->hash_table_mask = hsize - 1; 259 h->hash_table = kmem_zalloc(hsize * sizeof (void *), KM_NOSLEEP); 260 if (h->hash_table == NULL) { 261 /* XXX - we should really return an error instead of assert */ 262 ASSERT(hsize > (1ULL << 10)); 263 hsize >>= 1; 264 goto retry; 265 } 266 267 dbuf_cache = kmem_cache_create("dmu_buf_impl_t", 268 sizeof (dmu_buf_impl_t), 269 0, dbuf_cons, dbuf_dest, NULL, NULL, NULL, 0); 270 dbuf_tq = taskq_create("dbuf_tq", 8, maxclsyspri, 50, INT_MAX, 271 TASKQ_PREPOPULATE); 272 273 for (i = 0; i < DBUF_MUTEXES; i++) 274 mutex_init(&h->hash_mutexes[i], NULL, MUTEX_DEFAULT, NULL); 275 } 276 277 void 278 dbuf_fini(void) 279 { 280 dbuf_hash_table_t *h = &dbuf_hash_table; 281 int i; 282 283 taskq_destroy(dbuf_tq); 284 dbuf_tq = NULL; 285 286 for (i = 0; i < DBUF_MUTEXES; i++) 287 mutex_destroy(&h->hash_mutexes[i]); 288 kmem_free(h->hash_table, (h->hash_table_mask + 1) * sizeof (void *)); 289 kmem_cache_destroy(dbuf_cache); 290 } 291 292 /* 293 * Other stuff. 
294 */ 295 296 #ifdef ZFS_DEBUG 297 static void 298 dbuf_verify(dmu_buf_impl_t *db) 299 { 300 int i; 301 dnode_t *dn = db->db_dnode; 302 303 ASSERT(MUTEX_HELD(&db->db_mtx)); 304 305 if (!(zfs_flags & ZFS_DEBUG_DBUF_VERIFY)) 306 return; 307 308 ASSERT(db->db_objset != NULL); 309 if (dn == NULL) { 310 ASSERT(db->db_parent == NULL); 311 ASSERT(db->db_blkptr == NULL); 312 } else { 313 ASSERT3U(db->db.db_object, ==, dn->dn_object); 314 ASSERT3P(db->db_objset, ==, dn->dn_objset); 315 ASSERT3U(db->db_level, <, dn->dn_nlevels); 316 ASSERT(db->db_blkid == DB_BONUS_BLKID || 317 list_head(&dn->dn_dbufs)); 318 } 319 if (db->db_blkid == DB_BONUS_BLKID) { 320 ASSERT(dn != NULL); 321 ASSERT3U(db->db.db_size, ==, dn->dn_bonuslen); 322 ASSERT3U(db->db.db_offset, ==, DB_BONUS_BLKID); 323 } else { 324 ASSERT3U(db->db.db_offset, ==, db->db_blkid * db->db.db_size); 325 } 326 327 if (db->db_level == 0) { 328 /* we can be momentarily larger in dnode_set_blksz() */ 329 if (db->db_blkid != DB_BONUS_BLKID && dn) { 330 ASSERT3U(db->db.db_size, >=, dn->dn_datablksz); 331 } 332 if (db->db.db_object == DMU_META_DNODE_OBJECT) { 333 for (i = 0; i < TXG_SIZE; i++) { 334 /* 335 * it should only be modified in syncing 336 * context, so make sure we only have 337 * one copy of the data. 338 */ 339 ASSERT(db->db_d.db_data_old[i] == NULL || 340 db->db_d.db_data_old[i] == db->db_buf); 341 } 342 } 343 } 344 345 /* verify db->db_blkptr */ 346 if (db->db_blkptr) { 347 if (db->db_parent == dn->dn_dbuf) { 348 /* db is pointed to by the dnode */ 349 /* ASSERT3U(db->db_blkid, <, dn->dn_nblkptr); */ 350 if (db->db.db_object == DMU_META_DNODE_OBJECT) 351 ASSERT(db->db_parent == NULL); 352 else 353 ASSERT(db->db_parent != NULL); 354 ASSERT3P(db->db_blkptr, ==, 355 &dn->dn_phys->dn_blkptr[db->db_blkid]); 356 } else { 357 /* db is pointed to by an indirect block */ 358 int epb = db->db_parent->db.db_size >> SPA_BLKPTRSHIFT; 359 ASSERT3U(db->db_parent->db_level, ==, db->db_level+1); 360 ASSERT3U(db->db_parent->db.db_object, ==, 361 db->db.db_object); 362 /* 363 * dnode_grow_indblksz() can make this fail if we don't 364 * have the struct_rwlock. XXX indblksz no longer 365 * grows. safe to do this now? 366 */ 367 if (RW_WRITE_HELD(&db->db_dnode->dn_struct_rwlock)) { 368 ASSERT3P(db->db_blkptr, ==, 369 ((blkptr_t *)db->db_parent->db.db_data + 370 db->db_blkid % epb)); 371 } 372 } 373 } 374 if ((db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr)) && 375 db->db.db_data && db->db_blkid != DB_BONUS_BLKID && 376 db->db_state != DB_FILL && !dn->dn_free_txg) { 377 /* 378 * If the blkptr isn't set but they have nonzero data, 379 * it had better be dirty, otherwise we'll lose that 380 * data when we evict this buffer. 
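		 * (If it is not dirty, the loop below verifies that the
		 * buffer contains nothing but zeros.)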
381 */ 382 if (db->db_dirtycnt == 0) { 383 uint64_t *buf = db->db.db_data; 384 int i; 385 386 for (i = 0; i < db->db.db_size >> 3; i++) { 387 ASSERT(buf[i] == 0); 388 } 389 } 390 } 391 } 392 #endif 393 394 static void 395 dbuf_update_data(dmu_buf_impl_t *db) 396 { 397 ASSERT(MUTEX_HELD(&db->db_mtx)); 398 if (db->db_level == 0 && db->db_d.db_user_data_ptr_ptr) { 399 ASSERT(!refcount_is_zero(&db->db_holds)); 400 *db->db_d.db_user_data_ptr_ptr = db->db.db_data; 401 } 402 } 403 404 static void 405 dbuf_set_data(dmu_buf_impl_t *db, arc_buf_t *buf) 406 { 407 ASSERT(MUTEX_HELD(&db->db_mtx)); 408 ASSERT(db->db_buf == NULL || !arc_has_callback(db->db_buf)); 409 db->db_buf = buf; 410 if (buf != NULL) { 411 ASSERT(buf->b_data != NULL); 412 db->db.db_data = buf->b_data; 413 if (!arc_released(buf)) 414 arc_set_callback(buf, dbuf_do_evict, db); 415 dbuf_update_data(db); 416 } else { 417 dbuf_evict_user(db); 418 db->db.db_data = NULL; 419 db->db_state = DB_UNCACHED; 420 } 421 } 422 423 uint64_t 424 dbuf_whichblock(dnode_t *dn, uint64_t offset) 425 { 426 if (dn->dn_datablkshift) { 427 return (offset >> dn->dn_datablkshift); 428 } else { 429 ASSERT3U(offset, <, dn->dn_datablksz); 430 return (0); 431 } 432 } 433 434 static void 435 dbuf_read_done(zio_t *zio, arc_buf_t *buf, void *vdb) 436 { 437 dmu_buf_impl_t *db = vdb; 438 439 mutex_enter(&db->db_mtx); 440 ASSERT3U(db->db_state, ==, DB_READ); 441 /* 442 * All reads are synchronous, so we must have a hold on the dbuf 443 */ 444 ASSERT(refcount_count(&db->db_holds) > 0); 445 ASSERT(db->db_buf == NULL); 446 ASSERT(db->db.db_data == NULL); 447 if (db->db_level == 0 && db->db_d.db_freed_in_flight) { 448 /* we were freed in flight; disregard any error */ 449 arc_release(buf, db); 450 bzero(buf->b_data, db->db.db_size); 451 arc_buf_freeze(buf); 452 db->db_d.db_freed_in_flight = FALSE; 453 dbuf_set_data(db, buf); 454 db->db_state = DB_CACHED; 455 } else if (zio == NULL || zio->io_error == 0) { 456 dbuf_set_data(db, buf); 457 db->db_state = DB_CACHED; 458 } else { 459 ASSERT(db->db_blkid != DB_BONUS_BLKID); 460 ASSERT3P(db->db_buf, ==, NULL); 461 VERIFY(arc_buf_remove_ref(buf, db) == 1); 462 db->db_state = DB_UNCACHED; 463 } 464 cv_broadcast(&db->db_changed); 465 mutex_exit(&db->db_mtx); 466 dbuf_rele(db, NULL); 467 } 468 469 static void 470 dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags) 471 { 472 blkptr_t *bp; 473 zbookmark_t zb; 474 uint32_t aflags = ARC_NOWAIT; 475 476 ASSERT(!refcount_is_zero(&db->db_holds)); 477 /* We need the struct_rwlock to prevent db_blkptr from changing. 
*/ 478 ASSERT(RW_LOCK_HELD(&db->db_dnode->dn_struct_rwlock)); 479 ASSERT(MUTEX_HELD(&db->db_mtx)); 480 ASSERT(db->db_state == DB_UNCACHED); 481 ASSERT(db->db_buf == NULL); 482 483 if (db->db_blkid == DB_BONUS_BLKID) { 484 ASSERT3U(db->db_dnode->dn_bonuslen, ==, db->db.db_size); 485 db->db.db_data = zio_buf_alloc(DN_MAX_BONUSLEN); 486 if (db->db.db_size < DN_MAX_BONUSLEN) 487 bzero(db->db.db_data, DN_MAX_BONUSLEN); 488 bcopy(DN_BONUS(db->db_dnode->dn_phys), db->db.db_data, 489 db->db.db_size); 490 dbuf_update_data(db); 491 db->db_state = DB_CACHED; 492 mutex_exit(&db->db_mtx); 493 return; 494 } 495 496 if (db->db_level == 0 && dnode_block_freed(db->db_dnode, db->db_blkid)) 497 bp = NULL; 498 else 499 bp = db->db_blkptr; 500 501 if (bp == NULL) 502 dprintf_dbuf(db, "blkptr: %s\n", "NULL"); 503 else 504 dprintf_dbuf_bp(db, bp, "%s", "blkptr:"); 505 506 if (bp == NULL || BP_IS_HOLE(bp)) { 507 arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); 508 509 ASSERT(bp == NULL || BP_IS_HOLE(bp)); 510 dbuf_set_data(db, arc_buf_alloc(db->db_dnode->dn_objset->os_spa, 511 db->db.db_size, db, type)); 512 bzero(db->db.db_data, db->db.db_size); 513 db->db_state = DB_CACHED; 514 *flags |= DB_RF_CACHED; 515 mutex_exit(&db->db_mtx); 516 return; 517 } 518 519 db->db_state = DB_READ; 520 mutex_exit(&db->db_mtx); 521 522 zb.zb_objset = db->db_objset->os_dsl_dataset ? 523 db->db_objset->os_dsl_dataset->ds_object : 0; 524 zb.zb_object = db->db.db_object; 525 zb.zb_level = db->db_level; 526 zb.zb_blkid = db->db_blkid; 527 528 dbuf_add_ref(db, NULL); 529 /* ZIO_FLAG_CANFAIL callers have to check the parent zio's error */ 530 (void) arc_read(zio, db->db_dnode->dn_objset->os_spa, bp, 531 db->db_level > 0 ? byteswap_uint64_array : 532 dmu_ot[db->db_dnode->dn_type].ot_byteswap, 533 dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ, 534 (*flags & DB_RF_CANFAIL) ? ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED, 535 &aflags, &zb); 536 if (aflags & ARC_CACHED) 537 *flags |= DB_RF_CACHED; 538 } 539 540 int 541 dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags) 542 { 543 int err = 0; 544 int havepzio = (zio != NULL); 545 int prefetch; 546 547 /* 548 * We don't have to hold the mutex to check db_state because it 549 * can't be freed while we have a hold on the buffer. 
550 */ 551 ASSERT(!refcount_is_zero(&db->db_holds)); 552 553 if ((flags & DB_RF_HAVESTRUCT) == 0) 554 rw_enter(&db->db_dnode->dn_struct_rwlock, RW_READER); 555 556 prefetch = db->db_level == 0 && db->db_blkid != DB_BONUS_BLKID && 557 (flags & DB_RF_NOPREFETCH) == 0 && db->db_dnode != NULL; 558 559 mutex_enter(&db->db_mtx); 560 if (db->db_state == DB_CACHED) { 561 mutex_exit(&db->db_mtx); 562 if (prefetch) 563 dmu_zfetch(&db->db_dnode->dn_zfetch, db->db.db_offset, 564 db->db.db_size, TRUE); 565 if ((flags & DB_RF_HAVESTRUCT) == 0) 566 rw_exit(&db->db_dnode->dn_struct_rwlock); 567 } else if (db->db_state == DB_UNCACHED) { 568 if (zio == NULL) { 569 zio = zio_root(db->db_dnode->dn_objset->os_spa, 570 NULL, NULL, ZIO_FLAG_CANFAIL); 571 } 572 dbuf_read_impl(db, zio, &flags); 573 574 /* dbuf_read_impl has dropped db_mtx for us */ 575 576 if (prefetch) 577 dmu_zfetch(&db->db_dnode->dn_zfetch, db->db.db_offset, 578 db->db.db_size, flags & DB_RF_CACHED); 579 580 if ((flags & DB_RF_HAVESTRUCT) == 0) 581 rw_exit(&db->db_dnode->dn_struct_rwlock); 582 583 if (!havepzio) 584 err = zio_wait(zio); 585 } else { 586 mutex_exit(&db->db_mtx); 587 if (prefetch) 588 dmu_zfetch(&db->db_dnode->dn_zfetch, db->db.db_offset, 589 db->db.db_size, TRUE); 590 if ((flags & DB_RF_HAVESTRUCT) == 0) 591 rw_exit(&db->db_dnode->dn_struct_rwlock); 592 593 mutex_enter(&db->db_mtx); 594 if ((flags & DB_RF_NEVERWAIT) == 0) { 595 while (db->db_state == DB_READ || 596 db->db_state == DB_FILL) { 597 ASSERT(db->db_state == DB_READ || 598 (flags & DB_RF_HAVESTRUCT) == 0); 599 cv_wait(&db->db_changed, &db->db_mtx); 600 } 601 if (db->db_state == DB_UNCACHED) 602 err = EIO; 603 } 604 mutex_exit(&db->db_mtx); 605 } 606 607 ASSERT(err || havepzio || db->db_state == DB_CACHED); 608 return (err); 609 } 610 611 static void 612 dbuf_noread(dmu_buf_impl_t *db) 613 { 614 ASSERT(!refcount_is_zero(&db->db_holds)); 615 ASSERT(db->db_blkid != DB_BONUS_BLKID); 616 mutex_enter(&db->db_mtx); 617 while (db->db_state == DB_READ || db->db_state == DB_FILL) 618 cv_wait(&db->db_changed, &db->db_mtx); 619 if (db->db_state == DB_UNCACHED) { 620 arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); 621 622 ASSERT(db->db_buf == NULL); 623 ASSERT(db->db.db_data == NULL); 624 dbuf_set_data(db, arc_buf_alloc(db->db_dnode->dn_objset->os_spa, 625 db->db.db_size, db, type)); 626 db->db_state = DB_FILL; 627 } else { 628 ASSERT3U(db->db_state, ==, DB_CACHED); 629 } 630 mutex_exit(&db->db_mtx); 631 } 632 633 /* 634 * This is our just-in-time copy function. It makes a copy of 635 * buffers, that have been modified in a previous transaction 636 * group, before we modify them in the current active group. 637 * 638 * This function is used in two places: when we are dirtying a 639 * buffer for the first time in a txg, and when we are freeing 640 * a range in a dnode that includes this buffer. 641 * 642 * Note that when we are called from dbuf_free_range() we do 643 * not put a hold on the buffer, we just traverse the active 644 * dbuf list for the dnode. 
 */
static void
dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg)
{
	arc_buf_t **quiescing, **syncing;
	arc_buf_contents_t type;

	ASSERT(MUTEX_HELD(&db->db_mtx));
	ASSERT(db->db.db_data != NULL);
	ASSERT(db->db_blkid != DB_BONUS_BLKID);

	quiescing = &db->db_d.db_data_old[(txg-1)&TXG_MASK];
	syncing = &db->db_d.db_data_old[(txg-2)&TXG_MASK];

	/*
	 * If this buffer is referenced from the current quiescing
	 * transaction group: either make a copy and reset the reference
	 * to point to the copy, or (if there are no active holders) just
	 * null out the current db_data pointer.
	 */
	if (*quiescing == db->db_buf) {
		/*
		 * If the quiescing txg is "dirty", then we better not
		 * be referencing the same buffer from the syncing txg.
		 */
		ASSERT(*syncing != db->db_buf);
		if (refcount_count(&db->db_holds) > db->db_dirtycnt) {
			int size = db->db.db_size;
			type = DBUF_GET_BUFC_TYPE(db);
			*quiescing = arc_buf_alloc(
			    db->db_dnode->dn_objset->os_spa, size, db, type);
			bcopy(db->db.db_data, (*quiescing)->b_data, size);
		} else {
			dbuf_set_data(db, NULL);
		}
		return;
	}

	/*
	 * If this buffer is referenced from the current syncing
	 * transaction group: either
	 *	1 - make a copy and reset the reference, or
	 *	2 - if there are no holders, just null the current db_data.
	 */
	if (*syncing == db->db_buf) {
		ASSERT3P(*quiescing, ==, NULL);
		ASSERT3U(db->db_dirtycnt, ==, 1);
		if (refcount_count(&db->db_holds) > db->db_dirtycnt) {
			int size = db->db.db_size;
			type = DBUF_GET_BUFC_TYPE(db);
			/* we can't copy if we have already started a write */
			ASSERT(*syncing != db->db_data_pending);
			*syncing = arc_buf_alloc(
			    db->db_dnode->dn_objset->os_spa, size, db, type);
			bcopy(db->db.db_data, (*syncing)->b_data, size);
		} else {
			dbuf_set_data(db, NULL);
		}
	}
}

/*
 * This is the "bonus buffer" version of the above routine
 */
static void
dbuf_fix_old_bonus_data(dmu_buf_impl_t *db, uint64_t txg)
{
	arc_buf_t **quiescing, **syncing;

	ASSERT(MUTEX_HELD(&db->db_mtx));
	ASSERT(db->db.db_data != NULL);
	ASSERT(db->db_blkid == DB_BONUS_BLKID);

	quiescing = &db->db_d.db_data_old[(txg-1)&TXG_MASK];
	syncing = &db->db_d.db_data_old[(txg-2)&TXG_MASK];

	if (*quiescing == db->db.db_data) {
		ASSERT(*syncing != db->db.db_data);
		*quiescing = zio_buf_alloc(DN_MAX_BONUSLEN);
		bcopy(db->db.db_data, *quiescing, DN_MAX_BONUSLEN);
	} else if (*syncing == db->db.db_data) {
		ASSERT3P(*quiescing, ==, NULL);
		ASSERT3U(db->db_dirtycnt, ==, 1);
		*syncing = zio_buf_alloc(DN_MAX_BONUSLEN);
		bcopy(db->db.db_data, *syncing, DN_MAX_BONUSLEN);
	}
}

void
dbuf_unoverride(dmu_buf_impl_t *db, uint64_t txg)
{
	ASSERT(db->db_blkid != DB_BONUS_BLKID);
	ASSERT(MUTEX_HELD(&db->db_mtx));
	ASSERT(db->db_d.db_overridden_by[txg&TXG_MASK] != IN_DMU_SYNC);

	if (db->db_d.db_overridden_by[txg&TXG_MASK] != NULL) {
		/* free this block */
		ASSERT(list_link_active(&db->db_dirty_node[txg&TXG_MASK]) ||
		    db->db_dnode->dn_free_txg == txg);
		if (!BP_IS_HOLE(db->db_d.db_overridden_by[txg&TXG_MASK])) {
			/* XXX can get silent EIO here */
			(void) arc_free(NULL, db->db_dnode->dn_objset->os_spa,
			    txg, db->db_d.db_overridden_by[txg&TXG_MASK],
			    NULL, NULL, ARC_WAIT);
		}
		kmem_free(db->db_d.db_overridden_by[txg&TXG_MASK],
		    sizeof (blkptr_t));
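		/* Forget the override so the block is written normally at sync. */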
		db->db_d.db_overridden_by[txg&TXG_MASK] = NULL;
		/*
		 * Release the already-written buffer, so we leave it in
		 * a consistent dirty state.  Note that all callers are
		 * modifying the buffer, so they will immediately do
		 * another (redundant) arc_release().  Therefore, leave
		 * the buf thawed to save the effort of freezing &
		 * immediately re-thawing it.
		 */
		arc_release(db->db_d.db_data_old[txg&TXG_MASK], db);
	}
}

void
dbuf_free_range(dnode_t *dn, uint64_t blkid, uint64_t nblks, dmu_tx_t *tx)
{
	dmu_buf_impl_t *db, *db_next;
	uint64_t txg = tx->tx_txg;

	dprintf_dnode(dn, "blkid=%llu nblks=%llu\n", blkid, nblks);
	mutex_enter(&dn->dn_dbufs_mtx);
	for (db = list_head(&dn->dn_dbufs); db; db = db_next) {
		db_next = list_next(&dn->dn_dbufs, db);
		ASSERT(db->db_blkid != DB_BONUS_BLKID);
		if (db->db_level != 0)
			continue;
		dprintf_dbuf(db, "found buf %s\n", "");
		if (db->db_blkid < blkid ||
		    db->db_blkid >= blkid+nblks)
			continue;

		/* found a level 0 buffer in the range */
		if (dbuf_undirty(db, tx))
			continue;

		mutex_enter(&db->db_mtx);
		if (db->db_state == DB_UNCACHED ||
		    db->db_state == DB_EVICTING) {
			ASSERT(db->db.db_data == NULL);
			mutex_exit(&db->db_mtx);
			continue;
		}
		if (db->db_state == DB_READ || db->db_state == DB_FILL) {
			/* will be handled in dbuf_read_done or dbuf_rele */
			db->db_d.db_freed_in_flight = TRUE;
			mutex_exit(&db->db_mtx);
			continue;
		}
		if (refcount_count(&db->db_holds) == 0) {
			ASSERT(db->db_buf);
			dbuf_clear(db);
			continue;
		}
		/* The dbuf is CACHED and referenced */

		if (!list_link_active(&db->db_dirty_node[txg & TXG_MASK])) {
			/*
			 * This dbuf is not currently dirty.  Either
			 * uncache it (if it's not referenced in the open
			 * context) or reset its contents to empty.
			 */
			dbuf_fix_old_data(db, txg);
		} else {
			if (db->db_d.db_overridden_by[txg & TXG_MASK] != NULL) {
				/*
				 * This dbuf is overridden.  Clear that state.
				 */
				dbuf_unoverride(db, txg);
			}
			if (db->db_blkid > dn->dn_maxblkid)
				dn->dn_maxblkid = db->db_blkid;
		}
		/* fill in with appropriate data */
		if (db->db_state == DB_CACHED) {
			ASSERT(db->db.db_data != NULL);
			arc_release(db->db_buf, db);
			bzero(db->db.db_data, db->db.db_size);
			arc_buf_freeze(db->db_buf);
		}

		mutex_exit(&db->db_mtx);
	}
	mutex_exit(&dn->dn_dbufs_mtx);
}

static int
dbuf_new_block(dmu_buf_impl_t *db)
{
	dsl_dataset_t *ds = db->db_objset->os_dsl_dataset;
	uint64_t birth_txg = 0;

	/* Don't count meta-objects */
	if (ds == NULL)
		return (FALSE);

	/*
	 * We don't need any locking to protect db_blkptr:
	 * If it's syncing, then db_dirtied will be set so we'll
	 * ignore db_blkptr.
	 */
	ASSERT(MUTEX_HELD(&db->db_mtx)); /* XXX strictly necessary?
*/ 853 /* If we have been dirtied since the last snapshot, its not new */ 854 if (db->db_dirtied) 855 birth_txg = db->db_dirtied; 856 else if (db->db_blkptr) 857 birth_txg = db->db_blkptr->blk_birth; 858 859 if (birth_txg) 860 return (!dsl_dataset_block_freeable(ds, birth_txg)); 861 else 862 return (TRUE); 863 } 864 865 void 866 dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx) 867 { 868 arc_buf_t *buf, *obuf; 869 int osize = db->db.db_size; 870 arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); 871 872 ASSERT(db->db_blkid != DB_BONUS_BLKID); 873 874 /* XXX does *this* func really need the lock? */ 875 ASSERT(RW_WRITE_HELD(&db->db_dnode->dn_struct_rwlock)); 876 877 /* 878 * This call to dbuf_will_dirty() with the dn_struct_rwlock held 879 * is OK, because there can be no other references to the db 880 * when we are changing its size, so no concurrent DB_FILL can 881 * be happening. 882 */ 883 /* 884 * XXX we should be doing a dbuf_read, checking the return 885 * value and returning that up to our callers 886 */ 887 dbuf_will_dirty(db, tx); 888 889 /* create the data buffer for the new block */ 890 buf = arc_buf_alloc(db->db_dnode->dn_objset->os_spa, size, db, type); 891 892 /* copy old block data to the new block */ 893 obuf = db->db_buf; 894 bcopy(obuf->b_data, buf->b_data, MIN(osize, size)); 895 /* zero the remainder */ 896 if (size > osize) 897 bzero((uint8_t *)buf->b_data + osize, size - osize); 898 899 mutex_enter(&db->db_mtx); 900 dbuf_set_data(db, buf); 901 VERIFY(arc_buf_remove_ref(obuf, db) == 1); 902 db->db.db_size = size; 903 904 if (db->db_level == 0) 905 db->db_d.db_data_old[tx->tx_txg&TXG_MASK] = buf; 906 mutex_exit(&db->db_mtx); 907 908 dnode_willuse_space(db->db_dnode, size-osize, tx); 909 } 910 911 void 912 dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) 913 { 914 dnode_t *dn = db->db_dnode; 915 objset_impl_t *os = dn->dn_objset; 916 int drop_struct_lock = FALSE; 917 int txgoff = tx->tx_txg & TXG_MASK; 918 919 ASSERT(tx->tx_txg != 0); 920 ASSERT(!refcount_is_zero(&db->db_holds)); 921 DMU_TX_DIRTY_BUF(tx, db); 922 923 /* 924 * Shouldn't dirty a regular buffer in syncing context. Private 925 * objects may be dirtied in syncing context, but only if they 926 * were already pre-dirtied in open context. 927 * XXX We may want to prohibit dirtying in syncing context even 928 * if they did pre-dirty. 929 */ 930 ASSERT(!(dmu_tx_is_syncing(tx) && 931 !BP_IS_HOLE(&dn->dn_objset->os_rootbp) && 932 dn->dn_object != DMU_META_DNODE_OBJECT && 933 dn->dn_objset->os_dsl_dataset != NULL && 934 !dsl_dir_is_private( 935 dn->dn_objset->os_dsl_dataset->ds_dir))); 936 937 /* 938 * We make this assert for private objects as well, but after we 939 * check if we're already dirty. They are allowed to re-dirty 940 * in syncing context. 941 */ 942 ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT || 943 dn->dn_dirtyctx == DN_UNDIRTIED || 944 dn->dn_dirtyctx == 945 (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN)); 946 947 mutex_enter(&db->db_mtx); 948 /* XXX make this true for indirects too? */ 949 ASSERT(db->db_level != 0 || db->db_state == DB_CACHED || 950 db->db_state == DB_FILL); 951 952 /* 953 * If this buffer is currently part of an "overridden" region, 954 * we now need to remove it from that region. 
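	 * (dbuf_unoverride() frees the copy of the block that was already
	 * written out and releases the in-core data so it can be modified
	 * again.)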
955 */ 956 if (db->db_level == 0 && db->db_blkid != DB_BONUS_BLKID && 957 db->db_d.db_overridden_by[txgoff] != NULL) { 958 dbuf_unoverride(db, tx->tx_txg); 959 } 960 961 mutex_enter(&dn->dn_mtx); 962 /* 963 * Don't set dirtyctx to SYNC if we're just modifying this as we 964 * initialize the objset. 965 */ 966 if (dn->dn_dirtyctx == DN_UNDIRTIED && 967 !BP_IS_HOLE(&dn->dn_objset->os_rootbp)) { 968 dn->dn_dirtyctx = 969 (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN); 970 ASSERT(dn->dn_dirtyctx_firstset == NULL); 971 dn->dn_dirtyctx_firstset = kmem_alloc(1, KM_SLEEP); 972 } 973 mutex_exit(&dn->dn_mtx); 974 975 /* 976 * If this buffer is already dirty, we're done. 977 */ 978 if (list_link_active(&db->db_dirty_node[txgoff])) { 979 if (db->db_blkid != DB_BONUS_BLKID && db->db_level == 0 && 980 db->db.db_object != DMU_META_DNODE_OBJECT) 981 arc_buf_thaw(db->db_buf); 982 983 mutex_exit(&db->db_mtx); 984 return; 985 } 986 987 /* 988 * Only valid if not already dirty. 989 */ 990 ASSERT(dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx == 991 (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN)); 992 993 ASSERT3U(dn->dn_nlevels, >, db->db_level); 994 ASSERT((dn->dn_phys->dn_nlevels == 0 && db->db_level == 0) || 995 dn->dn_phys->dn_nlevels > db->db_level || 996 dn->dn_next_nlevels[txgoff] > db->db_level || 997 dn->dn_next_nlevels[(tx->tx_txg-1) & TXG_MASK] > db->db_level || 998 dn->dn_next_nlevels[(tx->tx_txg-2) & TXG_MASK] > db->db_level); 999 1000 /* 1001 * We should only be dirtying in syncing context if it's the 1002 * mos, a spa os, or we're initializing the os. However, we are 1003 * allowed to dirty in syncing context provided we already 1004 * dirtied it in open context. Hence we must make this 1005 * assertion only if we're not already dirty. 1006 */ 1007 ASSERT(!dmu_tx_is_syncing(tx) || 1008 os->os_dsl_dataset == NULL || 1009 !dsl_dir_is_private(os->os_dsl_dataset->ds_dir) || 1010 !BP_IS_HOLE(&os->os_rootbp)); 1011 ASSERT(db->db.db_size != 0); 1012 1013 dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size); 1014 1015 /* 1016 * If this buffer is dirty in an old transaction group we need 1017 * to make a copy of it so that the changes we make in this 1018 * transaction group won't leak out when we sync the older txg. 1019 */ 1020 if (db->db_blkid == DB_BONUS_BLKID) { 1021 ASSERT(db->db.db_data != NULL); 1022 ASSERT(db->db_d.db_data_old[txgoff] == NULL); 1023 dbuf_fix_old_bonus_data(db, tx->tx_txg); 1024 db->db_d.db_data_old[txgoff] = db->db.db_data; 1025 } else if (db->db_level == 0) { 1026 /* 1027 * Release the data buffer from the cache so that we 1028 * can modify it without impacting possible other users 1029 * of this cached data block. Note that indirect blocks 1030 * and private objects are not released until the syncing 1031 * state (since they are only modified then). 1032 */ 1033 ASSERT(db->db_buf != NULL); 1034 ASSERT(db->db_d.db_data_old[txgoff] == NULL); 1035 if (db->db.db_object != DMU_META_DNODE_OBJECT) { 1036 arc_release(db->db_buf, db); 1037 dbuf_fix_old_data(db, tx->tx_txg); 1038 ASSERT(db->db_buf != NULL); 1039 } 1040 db->db_d.db_data_old[txgoff] = db->db_buf; 1041 } 1042 1043 mutex_enter(&dn->dn_mtx); 1044 /* 1045 * We could have been freed_in_flight between the dbuf_noread 1046 * and dbuf_dirty. We win, as though the dbuf_noread() had 1047 * happened after the free. 
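	 * (So below we clear the pending free record for this block rather
	 * than letting the sync phase discard the data we are dirtying.)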
1048 */ 1049 if (db->db_level == 0 && db->db_blkid != DB_BONUS_BLKID) { 1050 dnode_clear_range(dn, db->db_blkid, 1, tx); 1051 db->db_d.db_freed_in_flight = FALSE; 1052 } 1053 1054 db->db_dirtied = tx->tx_txg; 1055 list_insert_tail(&dn->dn_dirty_dbufs[txgoff], db); 1056 mutex_exit(&dn->dn_mtx); 1057 1058 if (db->db_blkid != DB_BONUS_BLKID) { 1059 /* 1060 * Update the accounting. 1061 */ 1062 if (!dbuf_new_block(db) && db->db_blkptr) { 1063 /* 1064 * This is only a guess -- if the dbuf is dirty 1065 * in a previous txg, we don't know how much 1066 * space it will use on disk yet. We should 1067 * really have the struct_rwlock to access 1068 * db_blkptr, but since this is just a guess, 1069 * it's OK if we get an odd answer. 1070 */ 1071 dnode_willuse_space(dn, 1072 -bp_get_dasize(os->os_spa, db->db_blkptr), tx); 1073 } 1074 dnode_willuse_space(dn, db->db.db_size, tx); 1075 } 1076 1077 /* 1078 * This buffer is now part of this txg 1079 */ 1080 dbuf_add_ref(db, (void *)(uintptr_t)tx->tx_txg); 1081 db->db_dirtycnt += 1; 1082 ASSERT3U(db->db_dirtycnt, <=, 3); 1083 1084 mutex_exit(&db->db_mtx); 1085 1086 if (db->db_blkid == DB_BONUS_BLKID) { 1087 dnode_setdirty(dn, tx); 1088 return; 1089 } 1090 1091 if (db->db_level == 0) { 1092 dnode_new_blkid(dn, db->db_blkid, tx); 1093 ASSERT(dn->dn_maxblkid >= db->db_blkid); 1094 } 1095 1096 if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) { 1097 rw_enter(&dn->dn_struct_rwlock, RW_READER); 1098 drop_struct_lock = TRUE; 1099 } 1100 1101 if (db->db_level+1 < dn->dn_nlevels) { 1102 int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; 1103 dmu_buf_impl_t *parent; 1104 parent = dbuf_hold_level(dn, db->db_level+1, 1105 db->db_blkid >> epbs, FTAG); 1106 if (drop_struct_lock) 1107 rw_exit(&dn->dn_struct_rwlock); 1108 dbuf_dirty(parent, tx); 1109 dbuf_rele(parent, FTAG); 1110 } else { 1111 if (drop_struct_lock) 1112 rw_exit(&dn->dn_struct_rwlock); 1113 } 1114 1115 dnode_setdirty(dn, tx); 1116 } 1117 1118 static int 1119 dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) 1120 { 1121 dnode_t *dn = db->db_dnode; 1122 int txgoff = tx->tx_txg & TXG_MASK; 1123 int64_t holds; 1124 1125 ASSERT(tx->tx_txg != 0); 1126 ASSERT(db->db_blkid != DB_BONUS_BLKID); 1127 1128 mutex_enter(&db->db_mtx); 1129 1130 /* 1131 * If this buffer is not dirty, we're done. 1132 */ 1133 if (!list_link_active(&db->db_dirty_node[txgoff])) { 1134 mutex_exit(&db->db_mtx); 1135 return (0); 1136 } 1137 1138 /* 1139 * If this buffer is currently held, we cannot undirty 1140 * it, since one of the current holders may be in the 1141 * middle of an update. Note that users of dbuf_undirty() 1142 * should not place a hold on the dbuf before the call. 1143 */ 1144 if (refcount_count(&db->db_holds) > db->db_dirtycnt) { 1145 mutex_exit(&db->db_mtx); 1146 /* Make sure we don't toss this buffer at sync phase */ 1147 mutex_enter(&dn->dn_mtx); 1148 dnode_clear_range(dn, db->db_blkid, 1, tx); 1149 mutex_exit(&dn->dn_mtx); 1150 return (0); 1151 } 1152 1153 dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size); 1154 1155 dbuf_unoverride(db, tx->tx_txg); 1156 1157 ASSERT(db->db.db_size != 0); 1158 if (db->db_level == 0) { 1159 ASSERT(db->db_buf != NULL); 1160 ASSERT(db->db_d.db_data_old[txgoff] != NULL); 1161 if (db->db_d.db_data_old[txgoff] != db->db_buf) 1162 VERIFY(arc_buf_remove_ref( 1163 db->db_d.db_data_old[txgoff], db) == 1); 1164 db->db_d.db_data_old[txgoff] = NULL; 1165 } 1166 1167 /* XXX would be nice to fix up dn_towrite_space[] */ 1168 /* XXX undo db_dirtied? but how? 
 */
	/* db->db_dirtied = tx->tx_txg; */

	mutex_enter(&dn->dn_mtx);
	list_remove(&dn->dn_dirty_dbufs[txgoff], db);
	mutex_exit(&dn->dn_mtx);

	ASSERT(db->db_dirtycnt > 0);
	db->db_dirtycnt -= 1;

	if ((holds = refcount_remove(&db->db_holds,
	    (void *)(uintptr_t)tx->tx_txg)) == 0) {
		arc_buf_t *buf = db->db_buf;

		ASSERT(arc_released(buf));
		dbuf_set_data(db, NULL);
		VERIFY(arc_buf_remove_ref(buf, db) == 1);
		dbuf_evict(db);
		return (1);
	}
	ASSERT(holds > 0);

	mutex_exit(&db->db_mtx);
	return (0);
}

#pragma weak dmu_buf_will_dirty = dbuf_will_dirty
void
dbuf_will_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
{
	int rf = DB_RF_MUST_SUCCEED;

	ASSERT(tx->tx_txg != 0);
	ASSERT(!refcount_is_zero(&db->db_holds));

	if (RW_WRITE_HELD(&db->db_dnode->dn_struct_rwlock))
		rf |= DB_RF_HAVESTRUCT;
	(void) dbuf_read(db, NULL, rf);
	dbuf_dirty(db, tx);
}

void
dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
{
	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;

	ASSERT(db->db_blkid != DB_BONUS_BLKID);
	ASSERT(tx->tx_txg != 0);
	ASSERT(db->db_level == 0);
	ASSERT(!refcount_is_zero(&db->db_holds));

	ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT ||
	    dmu_tx_private_ok(tx));

	dbuf_noread(db);
	dbuf_dirty(db, tx);
}

#pragma weak dmu_buf_fill_done = dbuf_fill_done
/* ARGSUSED */
void
dbuf_fill_done(dmu_buf_impl_t *db, dmu_tx_t *tx)
{
	mutex_enter(&db->db_mtx);
	DBUF_VERIFY(db);

	if (db->db_state == DB_FILL) {
		if (db->db_level == 0 && db->db_d.db_freed_in_flight) {
			ASSERT(db->db_blkid != DB_BONUS_BLKID);
			/* we were freed while filling */
			/* XXX dbuf_undirty? */
			bzero(db->db.db_data, db->db.db_size);
			db->db_d.db_freed_in_flight = FALSE;
		}
		db->db_state = DB_CACHED;
		cv_broadcast(&db->db_changed);
	}
	mutex_exit(&db->db_mtx);
}

/*
 * "Clear" the contents of this dbuf.  This will mark the dbuf
 * EVICTING and clear *most* of its references.  Unfortunately,
 * when we are not holding the dn_dbufs_mtx, we can't clear the
 * entry in the dn_dbufs list.  We have to wait until dbuf_destroy()
 * in this case.
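 * (dbuf_destroy() notices that the dbuf is still linked onto dn_dbufs
 * and removes it at that point.)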
 * For callers from the DMU we will usually see:
 *	dbuf_clear()->arc_buf_evict()->dbuf_do_evict()->dbuf_destroy()
 * For the arc callback, we will usually see:
 *	dbuf_do_evict()->dbuf_clear();dbuf_destroy()
 * Sometimes, though, we will get a mix of these two:
 *	DMU: dbuf_clear()->arc_buf_evict()
 *	ARC: dbuf_do_evict()->dbuf_destroy()
 */
void
dbuf_clear(dmu_buf_impl_t *db)
{
	dnode_t *dn = db->db_dnode;
	dmu_buf_impl_t *parent = db->db_parent;
	dmu_buf_impl_t *dndb = dn->dn_dbuf;
	int dbuf_gone = FALSE;

	ASSERT(MUTEX_HELD(&db->db_mtx));
	ASSERT(refcount_is_zero(&db->db_holds));

	dbuf_evict_user(db);

	if (db->db_state == DB_CACHED) {
		ASSERT(db->db.db_data != NULL);
		if (db->db_blkid == DB_BONUS_BLKID)
			zio_buf_free(db->db.db_data, DN_MAX_BONUSLEN);
		db->db.db_data = NULL;
		db->db_state = DB_UNCACHED;
	}

	ASSERT3U(db->db_state, ==, DB_UNCACHED);
	ASSERT(db->db_data_pending == NULL);

	db->db_state = DB_EVICTING;
	db->db_blkptr = NULL;

	if (db->db_blkid != DB_BONUS_BLKID && MUTEX_HELD(&dn->dn_dbufs_mtx)) {
		list_remove(&dn->dn_dbufs, db);
		dnode_rele(dn, db);
	}

	if (db->db_buf)
		dbuf_gone = arc_buf_evict(db->db_buf);

	if (!dbuf_gone)
		mutex_exit(&db->db_mtx);

	/*
	 * If this dbuf is referenced from an indirect dbuf,
	 * decrement the ref count on the indirect dbuf.
	 */
	if (parent && parent != dndb)
		dbuf_rele(parent, db);
}

static int
dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse,
    dmu_buf_impl_t **parentp, blkptr_t **bpp)
{
	int nlevels, epbs;

	*parentp = NULL;
	*bpp = NULL;

	ASSERT(blkid != DB_BONUS_BLKID);

	if (dn->dn_phys->dn_nlevels == 0)
		nlevels = 1;
	else
		nlevels = dn->dn_phys->dn_nlevels;

	epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;

	ASSERT3U(level * epbs, <, 64);
	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
	if (level >= nlevels ||
	    (blkid > (dn->dn_phys->dn_maxblkid >> (level * epbs)))) {
		/* the buffer has no parent yet */
		return (ENOENT);
	} else if (level < nlevels-1) {
		/* this block is referenced from an indirect block */
		int err = dbuf_hold_impl(dn, level+1,
		    blkid >> epbs, fail_sparse, NULL, parentp);
		if (err)
			return (err);
		err = dbuf_read(*parentp, NULL,
		    (DB_RF_HAVESTRUCT | DB_RF_NOPREFETCH | DB_RF_CANFAIL));
		if (err) {
			dbuf_rele(*parentp, NULL);
			*parentp = NULL;
			return (err);
		}
		*bpp = ((blkptr_t *)(*parentp)->db.db_data) +
		    (blkid & ((1ULL << epbs) - 1));
		return (0);
	} else {
		/* the block is referenced from the dnode */
		ASSERT3U(level, ==, nlevels-1);
		ASSERT(dn->dn_phys->dn_nblkptr == 0 ||
		    blkid < dn->dn_phys->dn_nblkptr);
		if (dn->dn_dbuf) {
			dbuf_add_ref(dn->dn_dbuf, NULL);
			*parentp = dn->dn_dbuf;
		}
		*bpp = &dn->dn_phys->dn_blkptr[blkid];
		return (0);
	}
}

static dmu_buf_impl_t *
dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid,
    dmu_buf_impl_t *parent, blkptr_t *blkptr)
{
	objset_impl_t *os = dn->dn_objset;
	dmu_buf_impl_t *db, *odb;

	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
	ASSERT(dn->dn_type != DMU_OT_NONE);

	db = kmem_cache_alloc(dbuf_cache, KM_SLEEP);

	db->db_objset = os;
	db->db.db_object = dn->dn_object;
1375 db->db_level = level; 1376 db->db_blkid = blkid; 1377 db->db_dirtied = 0; 1378 db->db_dirtycnt = 0; 1379 db->db_dnode = dn; 1380 db->db_parent = parent; 1381 db->db_blkptr = blkptr; 1382 1383 bzero(&db->db_d, sizeof (db->db_d)); 1384 1385 if (blkid == DB_BONUS_BLKID) { 1386 ASSERT3P(parent, ==, dn->dn_dbuf); 1387 db->db.db_size = dn->dn_bonuslen; 1388 db->db.db_offset = DB_BONUS_BLKID; 1389 db->db_state = DB_UNCACHED; 1390 /* the bonus dbuf is not placed in the hash table */ 1391 return (db); 1392 } else { 1393 int blocksize = 1394 db->db_level ? 1<<dn->dn_indblkshift : dn->dn_datablksz; 1395 db->db.db_size = blocksize; 1396 db->db.db_offset = db->db_blkid * blocksize; 1397 } 1398 1399 /* 1400 * Hold the dn_dbufs_mtx while we get the new dbuf 1401 * in the hash table *and* added to the dbufs list. 1402 * This prevents a possible deadlock with someone 1403 * trying to look up this dbuf before its added to the 1404 * dn_dbufs list. 1405 */ 1406 mutex_enter(&dn->dn_dbufs_mtx); 1407 db->db_state = DB_EVICTING; 1408 if ((odb = dbuf_hash_insert(db)) != NULL) { 1409 /* someone else inserted it first */ 1410 kmem_cache_free(dbuf_cache, db); 1411 mutex_exit(&dn->dn_dbufs_mtx); 1412 return (odb); 1413 } 1414 list_insert_head(&dn->dn_dbufs, db); 1415 db->db_state = DB_UNCACHED; 1416 mutex_exit(&dn->dn_dbufs_mtx); 1417 1418 if (parent && parent != dn->dn_dbuf) 1419 dbuf_add_ref(parent, db); 1420 1421 ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT || 1422 refcount_count(&dn->dn_holds) > 0); 1423 (void) refcount_add(&dn->dn_holds, db); 1424 1425 dprintf_dbuf(db, "db=%p\n", db); 1426 1427 return (db); 1428 } 1429 1430 static int 1431 dbuf_do_evict(void *private) 1432 { 1433 arc_buf_t *buf = private; 1434 dmu_buf_impl_t *db = buf->b_private; 1435 1436 if (!MUTEX_HELD(&db->db_mtx)) 1437 mutex_enter(&db->db_mtx); 1438 1439 ASSERT(refcount_is_zero(&db->db_holds)); 1440 1441 if (db->db_state != DB_EVICTING) { 1442 ASSERT(db->db_state == DB_CACHED); 1443 DBUF_VERIFY(db); 1444 db->db_buf = NULL; 1445 dbuf_evict(db); 1446 } else { 1447 mutex_exit(&db->db_mtx); 1448 dbuf_destroy(db); 1449 } 1450 return (0); 1451 } 1452 1453 static void 1454 dbuf_destroy(dmu_buf_impl_t *db) 1455 { 1456 ASSERT(refcount_is_zero(&db->db_holds)); 1457 1458 if (db->db_blkid != DB_BONUS_BLKID) { 1459 dnode_t *dn = db->db_dnode; 1460 1461 /* 1462 * If this dbuf is still on the dn_dbufs list, 1463 * remove it from that list. 1464 */ 1465 if (list_link_active(&db->db_link)) { 1466 mutex_enter(&dn->dn_dbufs_mtx); 1467 list_remove(&dn->dn_dbufs, db); 1468 mutex_exit(&dn->dn_dbufs_mtx); 1469 1470 dnode_rele(dn, db); 1471 } 1472 dbuf_hash_remove(db); 1473 } 1474 db->db_parent = NULL; 1475 db->db_dnode = NULL; 1476 db->db_buf = NULL; 1477 1478 ASSERT(db->db.db_data == NULL); 1479 ASSERT(db->db_hash_next == NULL); 1480 ASSERT(db->db_blkptr == NULL); 1481 ASSERT(db->db_data_pending == NULL); 1482 1483 kmem_cache_free(dbuf_cache, db); 1484 } 1485 1486 void 1487 dbuf_prefetch(dnode_t *dn, uint64_t blkid) 1488 { 1489 dmu_buf_impl_t *db = NULL; 1490 blkptr_t *bp = NULL; 1491 1492 ASSERT(blkid != DB_BONUS_BLKID); 1493 ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); 1494 1495 if (dnode_block_freed(dn, blkid)) 1496 return; 1497 1498 /* dbuf_find() returns with db_mtx held */ 1499 if (db = dbuf_find(dn, 0, blkid)) { 1500 if (refcount_count(&db->db_holds) > 0) { 1501 /* 1502 * This dbuf is active. We assume that it is 1503 * already CACHED, or else about to be either 1504 * read or filled. 
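			 * (Either way there is nothing useful for a
			 * prefetch to do, so just drop the mutex and return.)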
			 */
			mutex_exit(&db->db_mtx);
			return;
		}
		mutex_exit(&db->db_mtx);
		db = NULL;
	}

	if (dbuf_findbp(dn, 0, blkid, TRUE, &db, &bp) == 0) {
		if (bp && !BP_IS_HOLE(bp)) {
			uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH;
			zbookmark_t zb;
			zb.zb_objset = dn->dn_objset->os_dsl_dataset ?
			    dn->dn_objset->os_dsl_dataset->ds_object : 0;
			zb.zb_object = dn->dn_object;
			zb.zb_level = 0;
			zb.zb_blkid = blkid;

			(void) arc_read(NULL, dn->dn_objset->os_spa, bp,
			    dmu_ot[dn->dn_type].ot_byteswap,
			    NULL, NULL, ZIO_PRIORITY_ASYNC_READ,
			    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
			    &aflags, &zb);
		}
		if (db)
			dbuf_rele(db, NULL);
	}
}

/*
 * Returns with db_holds incremented, and db_mtx not held.
 * Note: dn_struct_rwlock must be held.
 */
int
dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, int fail_sparse,
    void *tag, dmu_buf_impl_t **dbp)
{
	dmu_buf_impl_t *db, *parent = NULL;

	ASSERT(blkid != DB_BONUS_BLKID);
	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
	ASSERT3U(dn->dn_nlevels, >, level);

	*dbp = NULL;
top:
	/* dbuf_find() returns with db_mtx held */
	db = dbuf_find(dn, level, blkid);

	if (db == NULL) {
		blkptr_t *bp = NULL;
		int err;

		ASSERT3P(parent, ==, NULL);
		err = dbuf_findbp(dn, level, blkid, fail_sparse, &parent, &bp);
		if (fail_sparse) {
			if (err == 0 && bp && BP_IS_HOLE(bp))
				err = ENOENT;
			if (err) {
				if (parent)
					dbuf_rele(parent, NULL);
				return (err);
			}
		}
		if (err && err != ENOENT)
			return (err);
		db = dbuf_create(dn, level, blkid, parent, bp);
	}

	if (db->db_buf && refcount_is_zero(&db->db_holds)) {
		arc_buf_add_ref(db->db_buf, db);
		if (db->db_buf->b_data == NULL) {
			dbuf_clear(db);
			if (parent) {
				dbuf_rele(parent, NULL);
				parent = NULL;
			}
			goto top;
		}
		ASSERT3P(db->db.db_data, ==, db->db_buf->b_data);
	}

	ASSERT(db->db_buf == NULL || arc_referenced(db->db_buf));

	/*
	 * If this buffer is currently syncing out, and we are
	 * still referencing it from db_data, we need to make
	 * a copy of it in case we decide we want to dirty it
	 * again in this txg.
	 */
	if (db->db_level == 0 && db->db_state == DB_CACHED &&
	    dn->dn_object != DMU_META_DNODE_OBJECT &&
	    db->db_data_pending == db->db_buf) {
		int size = (db->db_blkid == DB_BONUS_BLKID) ?
		    DN_MAX_BONUSLEN : db->db.db_size;
		arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);

		dbuf_set_data(db, arc_buf_alloc(db->db_dnode->dn_objset->os_spa,
		    size, db, type));
		bcopy(db->db_data_pending->b_data, db->db.db_data,
		    db->db.db_size);
	}

	(void) refcount_add(&db->db_holds, tag);
	dbuf_update_data(db);
	DBUF_VERIFY(db);
	mutex_exit(&db->db_mtx);

	/* NOTE: we can't rele the parent until after we drop the db_mtx */
	if (parent)
		dbuf_rele(parent, NULL);

	ASSERT3P(db->db_dnode, ==, dn);
	ASSERT3U(db->db_blkid, ==, blkid);
	ASSERT3U(db->db_level, ==, level);
	*dbp = db;

	return (0);
}

dmu_buf_impl_t *
dbuf_hold(dnode_t *dn, uint64_t blkid, void *tag)
{
	dmu_buf_impl_t *db;
	int err = dbuf_hold_impl(dn, 0, blkid, FALSE, tag, &db);
	return (err ?
NULL : db); 1630 } 1631 1632 dmu_buf_impl_t * 1633 dbuf_hold_level(dnode_t *dn, int level, uint64_t blkid, void *tag) 1634 { 1635 dmu_buf_impl_t *db; 1636 int err = dbuf_hold_impl(dn, level, blkid, FALSE, tag, &db); 1637 return (err ? NULL : db); 1638 } 1639 1640 dmu_buf_impl_t * 1641 dbuf_create_bonus(dnode_t *dn) 1642 { 1643 dmu_buf_impl_t *db = dn->dn_bonus; 1644 1645 ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock)); 1646 1647 ASSERT(dn->dn_bonus == NULL); 1648 db = dbuf_create(dn, 0, DB_BONUS_BLKID, dn->dn_dbuf, NULL); 1649 return (db); 1650 } 1651 1652 #pragma weak dmu_buf_add_ref = dbuf_add_ref 1653 void 1654 dbuf_add_ref(dmu_buf_impl_t *db, void *tag) 1655 { 1656 int64_t holds = refcount_add(&db->db_holds, tag); 1657 ASSERT(holds > 1); 1658 } 1659 1660 #pragma weak dmu_buf_rele = dbuf_rele 1661 void 1662 dbuf_rele(dmu_buf_impl_t *db, void *tag) 1663 { 1664 int64_t holds; 1665 1666 mutex_enter(&db->db_mtx); 1667 DBUF_VERIFY(db); 1668 1669 holds = refcount_remove(&db->db_holds, tag); 1670 ASSERT(holds >= 0); 1671 1672 if (db->db_buf && holds == db->db_dirtycnt) 1673 arc_buf_freeze(db->db_buf); 1674 1675 if (holds == db->db_dirtycnt && 1676 db->db_level == 0 && db->db_d.db_immediate_evict) 1677 dbuf_evict_user(db); 1678 1679 if (holds == 0) { 1680 if (db->db_blkid == DB_BONUS_BLKID) { 1681 mutex_exit(&db->db_mtx); 1682 dnode_rele(db->db_dnode, db); 1683 } else if (db->db_buf == NULL) { 1684 /* 1685 * This is a special case: we never associated this 1686 * dbuf with any data allocated from the ARC. 1687 */ 1688 ASSERT3U(db->db_state, ==, DB_UNCACHED); 1689 dbuf_evict(db); 1690 } else if (arc_released(db->db_buf)) { 1691 arc_buf_t *buf = db->db_buf; 1692 /* 1693 * This dbuf has anonymous data associated with it. 1694 */ 1695 dbuf_set_data(db, NULL); 1696 VERIFY(arc_buf_remove_ref(buf, db) == 1); 1697 dbuf_evict(db); 1698 } else { 1699 VERIFY(arc_buf_remove_ref(db->db_buf, db) == 0); 1700 mutex_exit(&db->db_mtx); 1701 } 1702 } else { 1703 mutex_exit(&db->db_mtx); 1704 } 1705 } 1706 1707 #pragma weak dmu_buf_refcount = dbuf_refcount 1708 uint64_t 1709 dbuf_refcount(dmu_buf_impl_t *db) 1710 { 1711 return (refcount_count(&db->db_holds)); 1712 } 1713 1714 void * 1715 dmu_buf_set_user(dmu_buf_t *db_fake, void *user_ptr, void *user_data_ptr_ptr, 1716 dmu_buf_evict_func_t *evict_func) 1717 { 1718 return (dmu_buf_update_user(db_fake, NULL, user_ptr, 1719 user_data_ptr_ptr, evict_func)); 1720 } 1721 1722 void * 1723 dmu_buf_set_user_ie(dmu_buf_t *db_fake, void *user_ptr, void *user_data_ptr_ptr, 1724 dmu_buf_evict_func_t *evict_func) 1725 { 1726 dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; 1727 1728 db->db_d.db_immediate_evict = TRUE; 1729 return (dmu_buf_update_user(db_fake, NULL, user_ptr, 1730 user_data_ptr_ptr, evict_func)); 1731 } 1732 1733 void * 1734 dmu_buf_update_user(dmu_buf_t *db_fake, void *old_user_ptr, void *user_ptr, 1735 void *user_data_ptr_ptr, dmu_buf_evict_func_t *evict_func) 1736 { 1737 dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; 1738 ASSERT(db->db_level == 0); 1739 1740 ASSERT((user_ptr == NULL) == (evict_func == NULL)); 1741 1742 mutex_enter(&db->db_mtx); 1743 1744 if (db->db_d.db_user_ptr == old_user_ptr) { 1745 db->db_d.db_user_ptr = user_ptr; 1746 db->db_d.db_user_data_ptr_ptr = user_data_ptr_ptr; 1747 db->db_d.db_evict_func = evict_func; 1748 1749 dbuf_update_data(db); 1750 } else { 1751 old_user_ptr = db->db_d.db_user_ptr; 1752 } 1753 1754 mutex_exit(&db->db_mtx); 1755 return (old_user_ptr); 1756 } 1757 1758 void * 1759 dmu_buf_get_user(dmu_buf_t *db_fake) 1760 { 1761 
	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
	ASSERT(!refcount_is_zero(&db->db_holds));

	return (db->db_d.db_user_ptr);
}

void
dbuf_sync(dmu_buf_impl_t *db, zio_t *zio, dmu_tx_t *tx)
{
	arc_buf_t **data;
	uint64_t txg = tx->tx_txg;
	dnode_t *dn = db->db_dnode;
	objset_impl_t *os = dn->dn_objset;
	int epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
	int checksum, compress;
	zbookmark_t zb;
	int blksz;
	arc_buf_contents_t type;

	ASSERT(dmu_tx_is_syncing(tx));

	dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr);

	mutex_enter(&db->db_mtx);
	/*
	 * To be synced, we must be dirtied.  But we
	 * might have been freed after the dirty.
	 */
	if (db->db_state == DB_UNCACHED) {
		/* This buffer has been freed since it was dirtied */
		ASSERT(db->db.db_data == NULL);
	} else if (db->db_state == DB_FILL) {
		/* This buffer was freed and is now being re-filled */
		ASSERT(db->db.db_data != db->db_d.db_data_old[txg&TXG_MASK]);
	} else {
		ASSERT3U(db->db_state, ==, DB_CACHED);
	}
	DBUF_VERIFY(db);

	/*
	 * Don't need a lock on db_dirty (dn_mtx), because it can't
	 * be modified yet.
	 */

	if (db->db_blkid == DB_BONUS_BLKID) {
		arc_buf_t **datap = &db->db_d.db_data_old[txg&TXG_MASK];
		/*
		 * Simply copy the bonus data into the dnode.  It will
		 * be written out when the dnode is synced (and it will
		 * be synced, since it must have been dirty for dbuf_sync
		 * to be called).
		 */
		/*
		 * Use dn_phys->dn_bonuslen since db.db_size is the length
		 * of the bonus buffer in the open transaction rather than
		 * the syncing transaction.
		 */
		ASSERT(*datap != NULL);
		ASSERT3U(db->db_level, ==, 0);
		ASSERT3U(dn->dn_phys->dn_bonuslen, <=, DN_MAX_BONUSLEN);
		bcopy(*datap, DN_BONUS(dn->dn_phys), dn->dn_phys->dn_bonuslen);
		if (*datap != db->db.db_data)
			zio_buf_free(*datap, DN_MAX_BONUSLEN);
		db->db_d.db_data_old[txg&TXG_MASK] = NULL;
		db->db_data_pending = NULL;
		if (db->db_dirtied == txg)
			db->db_dirtied = 0;
		ASSERT(db->db_dirtycnt > 0);
		db->db_dirtycnt -= 1;
		mutex_exit(&db->db_mtx);
		dbuf_rele(db, (void *)(uintptr_t)txg);
		return;
	}

	if (db->db_level == 0) {
		type = DBUF_GET_BUFC_TYPE(db);
		data = &db->db_d.db_data_old[txg&TXG_MASK];
		blksz = arc_buf_size(*data);

		/*
		 * This buffer is in the middle of an immediate write.
		 * Wait for the synchronous IO to complete.
		 */
		while (db->db_d.db_overridden_by[txg&TXG_MASK] == IN_DMU_SYNC) {
			ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT);
			cv_wait(&db->db_changed, &db->db_mtx);
			ASSERT(db->db_d.db_overridden_by[txg&TXG_MASK]);
		}
		/*
		 * If this buffer is currently "in use" (i.e., there are
		 * active holds and db_data still references it), then make
		 * a copy before we start the write so that any modifications
		 * from the open txg will not leak into this write.
		 *
		 * NOTE: this copy does not need to be made for objects only
		 * modified in the syncing context (e.g. DNONE_DNODE blocks)
		 * or if there is no actual write involved (bonus blocks).
1858 */ 1859 if (dn->dn_object != DMU_META_DNODE_OBJECT && 1860 db->db_d.db_overridden_by[txg&TXG_MASK] == NULL) { 1861 if (refcount_count(&db->db_holds) > 1 && 1862 *data == db->db_buf) { 1863 *data = arc_buf_alloc(os->os_spa, blksz, db, 1864 type); 1865 bcopy(db->db.db_data, (*data)->b_data, blksz); 1866 } 1867 db->db_data_pending = *data; 1868 } else if (dn->dn_object == DMU_META_DNODE_OBJECT) { 1869 /* 1870 * Private object buffers are released here rather 1871 * than in dbuf_dirty() since they are only modified 1872 * in the syncing context and we don't want the 1873 * overhead of making multiple copies of the data. 1874 */ 1875 arc_release(db->db_buf, db); 1876 } 1877 } else { 1878 data = &db->db_buf; 1879 if (*data == NULL) { 1880 /* 1881 * This can happen if we dirty and then free 1882 * the level-0 data blocks in the same txg. So 1883 * this indirect remains unchanged. 1884 */ 1885 if (db->db_dirtied == txg) 1886 db->db_dirtied = 0; 1887 ASSERT(db->db_dirtycnt > 0); 1888 db->db_dirtycnt -= 1; 1889 mutex_exit(&db->db_mtx); 1890 dbuf_rele(db, (void *)(uintptr_t)txg); 1891 return; 1892 } 1893 blksz = db->db.db_size; 1894 ASSERT3U(blksz, ==, 1<<dn->dn_phys->dn_indblkshift); 1895 } 1896 1897 ASSERT(*data != NULL); 1898 1899 if (db->db_level > 0 && !arc_released(db->db_buf)) { 1900 /* 1901 * This indirect buffer was marked dirty, but 1902 * never modified (if it had been modified, then 1903 * we would have released the buffer). There is 1904 * no reason to write anything. 1905 */ 1906 db->db_data_pending = NULL; 1907 if (db->db_dirtied == txg) 1908 db->db_dirtied = 0; 1909 ASSERT(db->db_dirtycnt > 0); 1910 db->db_dirtycnt -= 1; 1911 mutex_exit(&db->db_mtx); 1912 dbuf_rele(db, (void *)(uintptr_t)txg); 1913 return; 1914 } else if (db->db_blkptr == NULL && 1915 db->db_level == dn->dn_phys->dn_nlevels-1 && 1916 db->db_blkid < dn->dn_phys->dn_nblkptr) { 1917 /* 1918 * This buffer was allocated at a time when there was 1919 * no available blkptrs from the dnode, or it was 1920 * inappropriate to hook it in (i.e., nlevels mis-match). 1921 */ 1922 ASSERT(db->db_blkptr == NULL); 1923 ASSERT(db->db_parent == NULL); 1924 db->db_parent = dn->dn_dbuf; 1925 db->db_blkptr = &dn->dn_phys->dn_blkptr[db->db_blkid]; 1926 DBUF_VERIFY(db); 1927 mutex_exit(&db->db_mtx); 1928 } else if (db->db_blkptr == NULL) { 1929 dmu_buf_impl_t *parent = db->db_parent; 1930 1931 mutex_exit(&db->db_mtx); 1932 ASSERT(dn->dn_phys->dn_nlevels > 1); 1933 if (parent == NULL) { 1934 rw_enter(&dn->dn_struct_rwlock, RW_READER); 1935 (void) dbuf_hold_impl(dn, db->db_level+1, 1936 db->db_blkid >> epbs, FALSE, FTAG, &parent); 1937 rw_exit(&dn->dn_struct_rwlock); 1938 dbuf_add_ref(parent, db); 1939 db->db_parent = parent; 1940 dbuf_rele(parent, FTAG); 1941 } 1942 (void) dbuf_read(parent, NULL, DB_RF_MUST_SUCCEED); 1943 } else { 1944 mutex_exit(&db->db_mtx); 1945 } 1946 1947 ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT || db->db_parent != NULL); 1948 1949 if (db->db_level > 0 && 1950 db->db_blkid > dn->dn_phys->dn_maxblkid >> (db->db_level * epbs)) { 1951 /* 1952 * Don't write indirect blocks past EOF. 1953 * We get these when we truncate a file *after* dirtying 1954 * blocks in the truncate range (we undirty the level 0 1955 * blocks in dbuf_free_range(), but not the indirects). 1956 */ 1957 #ifdef ZFS_DEBUG 1958 /* 1959 * Verify that this indirect block is empty. 
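		 * (i.e., every block pointer it holds must be a hole;
		 * anything else is data past EOF and we panic below.)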
		 */
		blkptr_t *bplist;
		int i;

		mutex_enter(&db->db_mtx);
		bplist = db->db.db_data;
		for (i = 0; i < (1 << epbs); i++) {
			if (!BP_IS_HOLE(&bplist[i])) {
				panic("data past EOF: "
				    "db=%p level=%d id=%llu i=%d\n",
				    db, db->db_level,
				    (u_longlong_t)db->db_blkid, i);
			}
		}
		mutex_exit(&db->db_mtx);
#endif
		ASSERT(db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr));
		mutex_enter(&db->db_mtx);
		db->db_dirtycnt -= 1;
		mutex_exit(&db->db_mtx);
		dbuf_rele(db, (void *)(uintptr_t)txg);
		return;
	}

	if (db->db_parent != dn->dn_dbuf) {
		dmu_buf_impl_t *parent = db->db_parent;

		mutex_enter(&db->db_mtx);
		ASSERT(db->db_level == parent->db_level-1);
		ASSERT(list_link_active(&parent->db_dirty_node[txg&TXG_MASK]));
		/*
		 * We may have read this indirect block after we dirtied it,
		 * and so never released it from the cache.
		 */
		arc_release(parent->db_buf, parent);

		db->db_blkptr = (blkptr_t *)parent->db.db_data +
		    (db->db_blkid & ((1ULL << epbs) - 1));
		DBUF_VERIFY(db);
		mutex_exit(&db->db_mtx);
#ifdef ZFS_DEBUG
	} else {
		/*
		 * We don't need to dnode_setdirty(dn) because if we got
		 * here then the parent is already dirty.
		 */
		ASSERT(db->db_level == dn->dn_phys->dn_nlevels-1);
		ASSERT3P(db->db_blkptr, ==,
		    &dn->dn_phys->dn_blkptr[db->db_blkid]);
#endif
	}
	ASSERT(db->db_parent == NULL || arc_released(db->db_parent->db_buf));

	if (db->db_level == 0 &&
	    db->db_d.db_overridden_by[txg&TXG_MASK] != NULL) {
		arc_buf_t **old = &db->db_d.db_data_old[txg&TXG_MASK];
		blkptr_t **bpp = &db->db_d.db_overridden_by[txg&TXG_MASK];
		int old_size = bp_get_dasize(os->os_spa, db->db_blkptr);
		int new_size = bp_get_dasize(os->os_spa, *bpp);

		ASSERT(db->db_blkid != DB_BONUS_BLKID);

		dnode_diduse_space(dn, new_size-old_size);
		mutex_enter(&dn->dn_mtx);
		if (db->db_blkid > dn->dn_phys->dn_maxblkid)
			dn->dn_phys->dn_maxblkid = db->db_blkid;
		mutex_exit(&dn->dn_mtx);

		dsl_dataset_block_born(os->os_dsl_dataset, *bpp, tx);
		if (!BP_IS_HOLE(db->db_blkptr))
			dsl_dataset_block_kill(os->os_dsl_dataset,
			    db->db_blkptr, os->os_synctx);

		mutex_enter(&db->db_mtx);
		*db->db_blkptr = **bpp;
		kmem_free(*bpp, sizeof (blkptr_t));
		*bpp = NULL;

		if (*old != db->db_buf)
			VERIFY(arc_buf_remove_ref(*old, db) == 1);
		else if (!BP_IS_HOLE(db->db_blkptr))
			arc_set_callback(db->db_buf, dbuf_do_evict, db);
		else
			ASSERT(arc_released(db->db_buf));
		*old = NULL;
		db->db_data_pending = NULL;

		cv_broadcast(&db->db_changed);

		ASSERT(db->db_dirtycnt > 0);
		db->db_dirtycnt -= 1;
		mutex_exit(&db->db_mtx);
		dbuf_rele(db, (void *)(uintptr_t)txg);
		return;
	}

	if (db->db_level > 0) {
		/*
		 * XXX -- we should design a compression algorithm
		 * that specializes in arrays of bps.
		 */
		checksum = ZIO_CHECKSUM_FLETCHER_4;
		if (zfs_mdcomp_disable)
			compress = ZIO_COMPRESS_EMPTY;
		else
			compress = ZIO_COMPRESS_LZJB;
	} else {
		/*
		 * Allow dnode settings to override objset settings,
		 * except for metadata checksums.
		 */
		if (dmu_ot[dn->dn_type].ot_metadata) {
			checksum = os->os_md_checksum;
			compress = zio_compress_select(dn->dn_compress,
			    os->os_md_compress);
		} else {
			checksum = zio_checksum_select(dn->dn_checksum,
			    os->os_checksum);
			compress = zio_compress_select(dn->dn_compress,
			    os->os_compress);
		}
	}
#ifdef ZFS_DEBUG
	if (db->db_parent) {
		ASSERT(list_link_active(
		    &db->db_parent->db_dirty_node[txg&TXG_MASK]));
		ASSERT(db->db_parent == dn->dn_dbuf ||
		    db->db_parent->db_level > 0);
		if (dn->dn_object == DMU_META_DNODE_OBJECT || db->db_level > 0)
			ASSERT(*data == db->db_buf);
	}
#endif
	ASSERT3U(db->db_blkptr->blk_birth, <=, tx->tx_txg);
	zb.zb_objset = os->os_dsl_dataset ? os->os_dsl_dataset->ds_object : 0;
	zb.zb_object = db->db.db_object;
	zb.zb_level = db->db_level;
	zb.zb_blkid = db->db_blkid;

	(void) arc_write(zio, os->os_spa, checksum, compress,
	    dmu_get_replication_level(os->os_spa, &zb, dn->dn_type), txg,
	    db->db_blkptr, *data, dbuf_write_done, db,
	    ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, ARC_NOWAIT, &zb);
	/*
	 * We can't access db after arc_write, since it could finish
	 * and be freed, and we have no locks on it.
	 */
}

struct dbuf_arg {
	objset_impl_t *os;
	blkptr_t bp;
};

static void
dbuf_do_born(void *arg)
{
	struct dbuf_arg *da = arg;
	dsl_dataset_block_born(da->os->os_dsl_dataset,
	    &da->bp, da->os->os_synctx);
	kmem_free(da, sizeof (struct dbuf_arg));
}

static void
dbuf_do_kill(void *arg)
{
	struct dbuf_arg *da = arg;
	dsl_dataset_block_kill(da->os->os_dsl_dataset,
	    &da->bp, da->os->os_synctx);
	kmem_free(da, sizeof (struct dbuf_arg));
}

/* ARGSUSED */
static void
dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
{
	dmu_buf_impl_t *db = vdb;
	dnode_t *dn = db->db_dnode;
	objset_impl_t *os = dn->dn_objset;
	uint64_t txg = zio->io_txg;
	uint64_t fill = 0;
	int i;
	int old_size, new_size;

	ASSERT3U(zio->io_error, ==, 0);

	dprintf_dbuf_bp(db, &zio->io_bp_orig, "bp_orig: %s", "");

	old_size = bp_get_dasize(os->os_spa, &zio->io_bp_orig);
	new_size = bp_get_dasize(os->os_spa, zio->io_bp);

	dnode_diduse_space(dn, new_size-old_size);

	mutex_enter(&db->db_mtx);

	ASSERT(db->db_d.db_overridden_by[txg&TXG_MASK] == NULL);

	if (db->db_dirtied == txg)
		db->db_dirtied = 0;

	if (db->db_level == 0) {
		arc_buf_t **old = &db->db_d.db_data_old[txg&TXG_MASK];

		ASSERT(db->db_blkid != DB_BONUS_BLKID);

		if (*old != db->db_buf)
			VERIFY(arc_buf_remove_ref(*old, db) == 1);
		else if (!BP_IS_HOLE(db->db_blkptr))
			arc_set_callback(db->db_buf, dbuf_do_evict, db);
		else
			ASSERT(arc_released(db->db_buf));
		*old = NULL;
		db->db_data_pending = NULL;

		mutex_enter(&dn->dn_mtx);
		if (db->db_blkid > dn->dn_phys->dn_maxblkid &&
		    !BP_IS_HOLE(db->db_blkptr))
			dn->dn_phys->dn_maxblkid = db->db_blkid;
		mutex_exit(&dn->dn_mtx);

		if (dn->dn_type == DMU_OT_DNODE) {
			dnode_phys_t *dnp = db->db.db_data;
			for (i = db->db.db_size >> DNODE_SHIFT; i > 0;
			    i--, dnp++) {
				if (dnp->dn_type != DMU_OT_NONE)
					fill++;
			}
		} else {
			if (!BP_IS_HOLE(db->db_blkptr))
				fill = 1;
		}
	} else {
		blkptr_t *bp = db->db.db_data;
		ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
		if (!BP_IS_HOLE(db->db_blkptr)) {
			int epbs =
			    dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
			ASSERT3U(BP_GET_LSIZE(zio->io_bp), ==, db->db.db_size);
			ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==,
			    db->db.db_size);
			ASSERT3U(dn->dn_phys->dn_maxblkid
			    >> (db->db_level * epbs), >=, db->db_blkid);
			arc_set_callback(db->db_buf, dbuf_do_evict, db);
		}
		for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, bp++) {
			if (BP_IS_HOLE(bp))
				continue;
			ASSERT3U(BP_GET_LSIZE(bp), ==,
			    db->db_level == 1 ? dn->dn_datablksz :
			    (1<<dn->dn_phys->dn_indblkshift));
			fill += bp->blk_fill;
		}
	}

	if (!BP_IS_HOLE(db->db_blkptr)) {
		db->db_blkptr->blk_fill = fill;
		BP_SET_TYPE(db->db_blkptr, dn->dn_type);
		BP_SET_LEVEL(db->db_blkptr, db->db_level);
	} else {
		ASSERT3U(fill, ==, 0);
		ASSERT3U(db->db_blkptr->blk_fill, ==, 0);
	}

	dprintf_dbuf_bp(db, db->db_blkptr,
	    "wrote %llu bytes to blkptr:", zio->io_size);

	ASSERT(db->db_parent == NULL ||
	    list_link_active(&db->db_parent->db_dirty_node[txg&TXG_MASK]));
	cv_broadcast(&db->db_changed);
	ASSERT(db->db_dirtycnt > 0);
	db->db_dirtycnt -= 1;
	mutex_exit(&db->db_mtx);

	/* We must do this after we've set the bp's type and level */
	if (!DVA_EQUAL(BP_IDENTITY(zio->io_bp),
	    BP_IDENTITY(&zio->io_bp_orig))) {
		struct dbuf_arg *da;
		da = kmem_alloc(sizeof (struct dbuf_arg), KM_SLEEP);
		da->os = os;
		da->bp = *zio->io_bp;
		(void) taskq_dispatch(dbuf_tq, dbuf_do_born, da, 0);
		if (!BP_IS_HOLE(&zio->io_bp_orig)) {
			da = kmem_alloc(sizeof (struct dbuf_arg), KM_SLEEP);
			da->os = os;
			da->bp = zio->io_bp_orig;
			(void) taskq_dispatch(dbuf_tq, dbuf_do_kill, da, 0);
		}
	}

	dbuf_rele(db, (void *)(uintptr_t)txg);
}