/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

#include <sys/zfs_context.h>
#include <sys/dmu.h>
#include <sys/dmu_impl.h>
#include <sys/dbuf.h>
#include <sys/dmu_objset.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_dir.h>
#include <sys/dmu_tx.h>
#include <sys/spa.h>
#include <sys/zio.h>
#include <sys/dmu_zfetch.h>

static void dbuf_destroy(dmu_buf_impl_t *db);
static int dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
static arc_done_func_t dbuf_write_done;

/*
 * Global data structures and functions for the dbuf cache.
 */
taskq_t *dbuf_tq;
static kmem_cache_t *dbuf_cache;

/* ARGSUSED */
static int
dbuf_cons(void *vdb, void *unused, int kmflag)
{
	dmu_buf_impl_t *db = vdb;
	bzero(db, sizeof (dmu_buf_impl_t));

	mutex_init(&db->db_mtx, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&db->db_changed, NULL, CV_DEFAULT, NULL);
	refcount_create(&db->db_holds);
	return (0);
}

/* ARGSUSED */
static void
dbuf_dest(void *vdb, void *unused)
{
	dmu_buf_impl_t *db = vdb;
	mutex_destroy(&db->db_mtx);
	cv_destroy(&db->db_changed);
	refcount_destroy(&db->db_holds);
}

/*
 * dbuf hash table routines
 */
static dbuf_hash_table_t dbuf_hash_table;

static uint64_t dbuf_hash_count;

static uint64_t
dbuf_hash(void *os, uint64_t obj, uint8_t lvl, uint64_t blkid)
{
	uintptr_t osv = (uintptr_t)os;
	uint64_t crc = -1ULL;

	ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (lvl)) & 0xFF];
	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (osv >> 6)) & 0xFF];
	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 0)) & 0xFF];
	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 8)) & 0xFF];
	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 0)) & 0xFF];
	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 8)) & 0xFF];

	crc ^= (osv>>14) ^ (obj>>16) ^ (blkid>>16);

	return (crc);
}

#define	DBUF_HASH(os, obj, level, blkid) dbuf_hash(os, obj, level, blkid)

#define	DBUF_EQUAL(dbuf, os, obj, level, blkid)		\
	((dbuf)->db.db_object == (obj) &&		\
	(dbuf)->db_objset == (os) &&			\
	(dbuf)->db_level == (level) &&			\
	(dbuf)->db_blkid == (blkid))

dmu_buf_impl_t *
dbuf_find(dnode_t *dn, uint8_t level, uint64_t blkid)
{
	dbuf_hash_table_t *h = &dbuf_hash_table;
	objset_impl_t *os = dn->dn_objset;
	uint64_t obj = dn->dn_object;
	uint64_t hv = DBUF_HASH(os, obj, level, blkid);
	uint64_t idx = hv & h->hash_table_mask;
	dmu_buf_impl_t *db;

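	/*
	 * Walk this bucket's hash chain under the bucket mutex; a match
	 * is returned only if it still has active holds, and it is
	 * returned with its db_mtx held.
	 */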
	mutex_enter(DBUF_HASH_MUTEX(h, idx));
	for (db = h->hash_table[idx]; db != NULL; db = db->db_hash_next) {
		if (DBUF_EQUAL(db, os, obj, level, blkid)) {
			mutex_enter(&db->db_mtx);
			if (!refcount_is_zero(&db->db_holds)) {
				mutex_exit(DBUF_HASH_MUTEX(h, idx));
				return (db);
			}
			mutex_exit(&db->db_mtx);
		}
	}
	mutex_exit(DBUF_HASH_MUTEX(h, idx));
	return (NULL);
}

/*
 * Insert an entry into the hash table.  If there is already an element
 * equal to elem in the hash table, then the already existing element
 * will be returned and the new element will not be inserted.
 * Otherwise returns NULL.
 */
static dmu_buf_impl_t *
dbuf_hash_insert(dmu_buf_impl_t *db)
{
	dbuf_hash_table_t *h = &dbuf_hash_table;
	objset_impl_t *os = db->db_objset;
	uint64_t obj = db->db.db_object;
	int level = db->db_level;
	uint64_t blkid = db->db_blkid;
	uint64_t hv = DBUF_HASH(os, obj, level, blkid);
	uint64_t idx = hv & h->hash_table_mask;
	dmu_buf_impl_t *dbf;

	mutex_enter(DBUF_HASH_MUTEX(h, idx));
	for (dbf = h->hash_table[idx]; dbf != NULL; dbf = dbf->db_hash_next) {
		if (DBUF_EQUAL(dbf, os, obj, level, blkid)) {
			mutex_enter(&dbf->db_mtx);
			if (!refcount_is_zero(&dbf->db_holds)) {
				mutex_exit(DBUF_HASH_MUTEX(h, idx));
				return (dbf);
			}
			mutex_exit(&dbf->db_mtx);
		}
	}

	mutex_enter(&db->db_mtx);
	db->db_hash_next = h->hash_table[idx];
	h->hash_table[idx] = db;
	mutex_exit(DBUF_HASH_MUTEX(h, idx));
	atomic_add_64(&dbuf_hash_count, 1);

	return (NULL);
}

/*
 * Remove an entry from the hash table.  This operation will
 * fail if there are any existing holds on the db.
 */
static void
dbuf_hash_remove(dmu_buf_impl_t *db)
{
	dbuf_hash_table_t *h = &dbuf_hash_table;
	uint64_t hv = DBUF_HASH(db->db_objset, db->db.db_object,
	    db->db_level, db->db_blkid);
	uint64_t idx = hv & h->hash_table_mask;
	dmu_buf_impl_t *dbf, **dbp;

	/*
	 * We mustn't hold db_mtx, to maintain lock ordering:
	 * DBUF_HASH_MUTEX > db_mtx.
	 */
	ASSERT(refcount_is_zero(&db->db_holds));
	ASSERT(db->db_dnode != NULL);
	ASSERT(!MUTEX_HELD(&db->db_mtx));

	mutex_enter(DBUF_HASH_MUTEX(h, idx));
	dbp = &h->hash_table[idx];
	while ((dbf = *dbp) != db) {
		dbp = &dbf->db_hash_next;
		ASSERT(dbf != NULL);
	}
	*dbp = db->db_hash_next;
	db->db_hash_next = NULL;
	mutex_exit(DBUF_HASH_MUTEX(h, idx));
	atomic_add_64(&dbuf_hash_count, -1);
}

static int dbuf_evictable(dmu_buf_impl_t *db);
static void dbuf_clear(dmu_buf_impl_t *db);

void
dbuf_evict(dmu_buf_impl_t *db)
{
	int err;

	ASSERT(MUTEX_HELD(&db->db_mtx));
	err = dbuf_evictable(db);
	ASSERT(err == TRUE);
	dbuf_clear(db);
	dbuf_destroy(db);
}

static void
dbuf_evict_user(dmu_buf_impl_t *db)
{
	ASSERT(MUTEX_HELD(&db->db_mtx));

	if (db->db_level != 0 || db->db_d.db_evict_func == NULL)
		return;

	if (db->db_d.db_user_data_ptr_ptr)
		*db->db_d.db_user_data_ptr_ptr = db->db.db_data;
	db->db_d.db_evict_func(&db->db, db->db_d.db_user_ptr);
	db->db_d.db_user_ptr = NULL;
	db->db_d.db_user_data_ptr_ptr = NULL;
	db->db_d.db_evict_func = NULL;
}

void
dbuf_init(void)
{
	uint64_t hsize = 1;
	dbuf_hash_table_t *h = &dbuf_hash_table;
	int i;

	/*
	 * The hash table is big enough to fill all of physical memory
	 * with an average 64k block size.  The table will take up
	 * totalmem*sizeof(void*)/64k bytes (i.e. 128KB/GB with 8-byte
	 * pointers).
	 */
	while (hsize * 65536 < physmem * PAGESIZE)
		hsize <<= 1;

	h->hash_table_mask = hsize - 1;
	h->hash_table = kmem_zalloc(hsize * sizeof (void *), KM_SLEEP);

	dbuf_cache = kmem_cache_create("dmu_buf_impl_t",
	    sizeof (dmu_buf_impl_t),
	    0, dbuf_cons, dbuf_dest, NULL, NULL, NULL, 0);
	dbuf_tq = taskq_create("dbuf_tq", 8, maxclsyspri, 50, INT_MAX,
	    TASKQ_PREPOPULATE);

	for (i = 0; i < DBUF_MUTEXES; i++)
		mutex_init(&h->hash_mutexes[i], NULL, MUTEX_DEFAULT, NULL);
}

void
dbuf_fini(void)
{
	dbuf_hash_table_t *h = &dbuf_hash_table;
	int i;

	taskq_destroy(dbuf_tq);
	dbuf_tq = NULL;

	for (i = 0; i < DBUF_MUTEXES; i++)
		mutex_destroy(&h->hash_mutexes[i]);
	kmem_free(h->hash_table, (h->hash_table_mask + 1) * sizeof (void *));
	kmem_cache_destroy(dbuf_cache);
}

/*
 * Other stuff.
 */

#ifdef ZFS_DEBUG
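/*
 * Sanity-check a dbuf against its dnode, parent, and blkptr.  Only active
 * when ZFS_DEBUG_DBUF_VERIFY is set in zfs_flags; caller must hold db_mtx.
 */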
static void
dbuf_verify(dmu_buf_impl_t *db)
{
	int i;
	dnode_t *dn = db->db_dnode;

	ASSERT(MUTEX_HELD(&db->db_mtx));

	if (!(zfs_flags & ZFS_DEBUG_DBUF_VERIFY))
		return;

	ASSERT(db->db_objset != NULL);
	if (dn == NULL) {
		ASSERT(db->db_parent == NULL);
		ASSERT(db->db_blkptr == NULL);
	} else {
		ASSERT3U(db->db.db_object, ==, dn->dn_object);
		ASSERT3P(db->db_objset, ==, dn->dn_objset);
		ASSERT(list_head(&dn->dn_dbufs));
		ASSERT3U(db->db_level, <, dn->dn_nlevels);
	}
	if (db->db_blkid == DB_BONUS_BLKID) {
		ASSERT(dn != NULL);
		ASSERT3U(db->db.db_size, ==, dn->dn_bonuslen);
		ASSERT3U(db->db.db_offset, ==, DB_BONUS_BLKID);
	} else {
		ASSERT3U(db->db.db_offset, ==, db->db_blkid * db->db.db_size);
	}

	if (db->db_level == 0) {
		void **udpp = db->db_d.db_user_data_ptr_ptr;
		/* we can be momentarily larger in dnode_set_blksz() */
		if (db->db_blkid != DB_BONUS_BLKID && dn) {
			ASSERT3U(db->db.db_size, >=, dn->dn_datablksz);
		}
		if (udpp) {
			ASSERT((refcount_is_zero(&db->db_holds) &&
			    *udpp == NULL) ||
			    (!refcount_is_zero(&db->db_holds) &&
			    *udpp == db->db.db_data));
		}

		if (IS_DNODE_DNODE(db->db.db_object)) {
			for (i = 0; i < TXG_SIZE; i++) {
				/*
				 * it should only be modified in syncing
				 * context, so make sure we only have
				 * one copy of the data.
				 */
				ASSERT(db->db_d.db_data_old[i] == NULL ||
				    db->db_d.db_data_old[i] == db->db_buf);
			}
		}
	}

	/* verify db->db_blkptr */
	if (db->db_blkptr) {
		if (db->db_parent == dn->dn_dbuf) {
			/* db is pointed to by the dnode */
			/* ASSERT3U(db->db_blkid, <, dn->dn_nblkptr); */
			if (IS_DNODE_DNODE(db->db.db_object))
				ASSERT(db->db_parent == NULL);
			else
				ASSERT(db->db_parent != NULL);
			ASSERT3P(db->db_blkptr, ==,
			    &dn->dn_phys->dn_blkptr[db->db_blkid]);
		} else {
			/* db is pointed to by an indirect block */
			int epb = db->db_parent->db.db_size >> SPA_BLKPTRSHIFT;
			ASSERT3U(db->db_parent->db_level, ==, db->db_level+1);
			ASSERT3U(db->db_parent->db.db_object, ==,
			    db->db.db_object);
			/*
			 * dnode_grow_indblksz() can make this fail if we don't
			 * have the struct_rwlock.  XXX indblksz no longer
			 * grows.  safe to do this now?
			 */
			if (RW_WRITE_HELD(&db->db_dnode->dn_struct_rwlock)) {
				ASSERT3P(db->db_blkptr, ==,
				    ((blkptr_t *)db->db_parent->db.db_data +
				    db->db_blkid % epb));
			}
		}
	}
	if ((db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr)) &&
	    db->db.db_data && db->db_blkid != DB_BONUS_BLKID &&
	    db->db_state != DB_FILL && !dn->dn_free_txg) {
		/*
		 * If the blkptr isn't set but they have nonzero data,
		 * it had better be dirty, otherwise we'll lose that
		 * data when we evict this buffer.
		 */
		if (db->db_dirtycnt == 0) {
			uint64_t *buf = db->db.db_data;
			int i;

			for (i = 0; i < db->db.db_size >> 3; i++) {
				ASSERT(buf[i] == 0);
			}
		}
	}
}
#endif

static void
dbuf_update_data(dmu_buf_impl_t *db)
{
	ASSERT(MUTEX_HELD(&db->db_mtx));
	if (db->db_level == 0 && db->db_d.db_user_data_ptr_ptr) {
		ASSERT(!refcount_is_zero(&db->db_holds));
		*db->db_d.db_user_data_ptr_ptr = db->db.db_data;
	}
}

static void
dbuf_set_data(dmu_buf_impl_t *db, arc_buf_t *buf)
{
	ASSERT(MUTEX_HELD(&db->db_mtx));
	ASSERT(buf->b_data != NULL);
	db->db_buf = buf;
	db->db.db_data = buf->b_data;
	dbuf_update_data(db);
}

uint64_t
dbuf_whichblock(dnode_t *dn, uint64_t offset)
{
	if (dn->dn_datablkshift) {
		return (offset >> dn->dn_datablkshift);
	} else {
		ASSERT3U(offset, <, dn->dn_datablksz);
		return (0);
	}
}

static void
dbuf_read_done(zio_t *zio, arc_buf_t *buf, void *vdb)
{
	dmu_buf_impl_t *db = vdb;

	mutex_enter(&db->db_mtx);
	ASSERT3U(db->db_state, ==, DB_READ);
	/*
	 * All reads are synchronous, so we must have a hold on the dbuf
	 */
	ASSERT(refcount_count(&db->db_holds) > 0);
	ASSERT(db->db.db_data == NULL);
	if (db->db_level == 0 && db->db_d.db_freed_in_flight) {
		/* we were freed in flight; disregard any error */
		arc_release(buf, db);
		bzero(buf->b_data, db->db.db_size);
		db->db_d.db_freed_in_flight = FALSE;
		dbuf_set_data(db, buf);
		db->db_state = DB_CACHED;
	} else if (zio == NULL || zio->io_error == 0) {
		dbuf_set_data(db, buf);
		db->db_state = DB_CACHED;
	} else {
		ASSERT(db->db_blkid != DB_BONUS_BLKID);
		arc_buf_free(buf, db);
		db->db_state = DB_UNCACHED;
		ASSERT3P(db->db_buf, ==, NULL);
	}
	cv_broadcast(&db->db_changed);
	mutex_exit(&db->db_mtx);
}

void
dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
{
	arc_buf_t *buf;
	blkptr_t *bp;

	ASSERT(!refcount_is_zero(&db->db_holds));
	/* We need the struct_rwlock to prevent db_blkptr from changing. */
	ASSERT(RW_LOCK_HELD(&db->db_dnode->dn_struct_rwlock));

	/*
	 * prefetch only data blocks (level 0) -- don't prefetch indirect
	 * blocks
	 */
	if ((db->db_level > 0) || (db->db_blkid == DB_BONUS_BLKID)) {
		flags |= DB_RF_NOPREFETCH;
	}

	if (((flags & DB_RF_NOPREFETCH) == 0) && (db->db_dnode != NULL)) {
		dmu_zfetch(&db->db_dnode->dn_zfetch, db->db.db_offset,
		    db->db.db_size);
	}

	if (db->db_state == DB_CACHED) {
		ASSERT(db->db.db_data != NULL);
		return;
	}

	mutex_enter(&db->db_mtx);

	if (db->db_state != DB_UNCACHED) {
		mutex_exit(&db->db_mtx);
		return;
	}

	ASSERT3U(db->db_state, ==, DB_UNCACHED);

	if (db->db_blkid == DB_BONUS_BLKID) {
		ASSERT3U(db->db_dnode->dn_bonuslen, ==, db->db.db_size);
		buf = arc_buf_alloc(db->db_dnode->dn_objset->os_spa,
		    DN_MAX_BONUSLEN, db);
		if (db->db.db_size < DN_MAX_BONUSLEN)
			bzero(buf->b_data, DN_MAX_BONUSLEN);
		bcopy(DN_BONUS(db->db_dnode->dn_phys), buf->b_data,
		    db->db.db_size);
		dbuf_set_data(db, buf);
		db->db_state = DB_CACHED;
		mutex_exit(&db->db_mtx);
		return;
	}

	if (db->db_level == 0 && dnode_block_freed(db->db_dnode, db->db_blkid))
		bp = NULL;
	else
		bp = db->db_blkptr;

	if (bp == NULL)
		dprintf_dbuf(db, "blkptr: %s\n", "NULL");
	else
		dprintf_dbuf_bp(db, bp, "%s", "blkptr:");

	if (bp == NULL || BP_IS_HOLE(bp)) {
		ASSERT(bp == NULL || BP_IS_HOLE(bp));
		dbuf_set_data(db, arc_buf_alloc(db->db_dnode->dn_objset->os_spa,
		    db->db.db_size, db));
		bzero(db->db.db_data, db->db.db_size);
		db->db_state = DB_CACHED;
		mutex_exit(&db->db_mtx);
		return;
	}

	db->db_state = DB_READ;
	mutex_exit(&db->db_mtx);

	/* ZIO_FLAG_CANFAIL callers have to check the parent zio's error */
	(void) arc_read(zio, db->db_dnode->dn_objset->os_spa, bp,
	    db->db_level > 0 ? byteswap_uint64_array :
	    dmu_ot[db->db_dnode->dn_type].ot_byteswap,
	    dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ,
	    (flags & DB_RF_CANFAIL) ? ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED,
	    ARC_NOWAIT);
}

static int
dbuf_read_generic(dmu_buf_impl_t *db, uint32_t flags)
{
	zio_t *zio;
	int err;

	/*
	 * We don't have to hold the mutex to check db_state because it
	 * can't be freed while we have a hold on the buffer.
	 */
	ASSERT(!refcount_is_zero(&db->db_holds));
	if (db->db_state == DB_CACHED)
		return (0);

	if (db->db_state == DB_UNCACHED) {
		zio = zio_root(db->db_dnode->dn_objset->os_spa, NULL, NULL,
		    ZIO_FLAG_CANFAIL);
		if ((flags & DB_RF_HAVESTRUCT) == 0)
			rw_enter(&db->db_dnode->dn_struct_rwlock, RW_READER);
		dbuf_read_impl(db, zio, flags);
		if ((flags & DB_RF_HAVESTRUCT) == 0)
			rw_exit(&db->db_dnode->dn_struct_rwlock);
		err = zio_wait(zio);
		if (err)
			return (err);
	}

	mutex_enter(&db->db_mtx);
	while (db->db_state == DB_READ || db->db_state == DB_FILL) {
		ASSERT(db->db_state == DB_READ ||
		    (flags & DB_RF_HAVESTRUCT) == 0);
		cv_wait(&db->db_changed, &db->db_mtx);
	}
	ASSERT3U(db->db_state, ==, DB_CACHED);
	mutex_exit(&db->db_mtx);

	return (0);
}

#pragma weak dmu_buf_read = dbuf_read
void
dbuf_read(dmu_buf_impl_t *db)
{
	int err;

	err = dbuf_read_generic(db, DB_RF_MUST_SUCCEED);
	ASSERT(err == 0);
}

#pragma weak dmu_buf_read_canfail = dbuf_read_canfail
int
dbuf_read_canfail(dmu_buf_impl_t *db)
{
	return (dbuf_read_generic(db, DB_RF_CANFAIL));
}

void
dbuf_read_havestruct(dmu_buf_impl_t *db)
{
	int err;

	ASSERT(RW_LOCK_HELD(&db->db_dnode->dn_struct_rwlock));
	err = dbuf_read_generic(db, (DB_RF_HAVESTRUCT | DB_RF_NOPREFETCH));
	ASSERT(err == 0);
}

static void
dbuf_noread(dmu_buf_impl_t *db)
{
	ASSERT(!refcount_is_zero(&db->db_holds));
	mutex_enter(&db->db_mtx);
	while (db->db_state == DB_READ || db->db_state == DB_FILL)
		cv_wait(&db->db_changed, &db->db_mtx);
	if (db->db_state == DB_UNCACHED) {
		int blksz = (db->db_blkid == DB_BONUS_BLKID) ?
		    DN_MAX_BONUSLEN : db->db.db_size;
		ASSERT(db->db.db_data == NULL);
		dbuf_set_data(db, arc_buf_alloc(db->db_dnode->dn_objset->os_spa,
		    blksz, db));
		db->db_state = DB_FILL;
	} else {
		ASSERT3U(db->db_state, ==, DB_CACHED);
	}
	mutex_exit(&db->db_mtx);
}

/*
 * This is our just-in-time copy function.  It makes a copy of
 * buffers that have been modified in a previous transaction
 * group, before we modify them in the current active group.
 *
 * This function is used in two places: when we are dirtying a
 * buffer for the first time in a txg, and when we are freeing
 * a range in a dnode that includes this buffer.
 *
 * Note that when we are called from dbuf_free_range() we do
 * not put a hold on the buffer, we just traverse the active
 * dbuf list for the dnode.
 */
static void
dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg)
{
	arc_buf_t **quiescing, **syncing;
	int size = (db->db_blkid == DB_BONUS_BLKID) ?
	    DN_MAX_BONUSLEN : db->db.db_size;

	ASSERT(MUTEX_HELD(&db->db_mtx));
	ASSERT(db->db.db_data != NULL);

	quiescing = &db->db_d.db_data_old[(txg-1)&TXG_MASK];
	syncing = &db->db_d.db_data_old[(txg-2)&TXG_MASK];

	/*
	 * If this buffer is referenced from the current quiescing
	 * transaction group: either make a copy and reset the reference
	 * to point to the copy, or (if there are no active holders)
	 * just null out the current db_data pointer.
	 */
	if (*quiescing == db->db_buf) {
		/*
		 * If the quiescing txg is "dirty", then we better not
		 * be referencing the same buffer from the syncing txg.
		 */
		ASSERT(*syncing != db->db_buf);
		if (refcount_count(&db->db_holds) > db->db_dirtycnt) {
			*quiescing = arc_buf_alloc(
			    db->db_dnode->dn_objset->os_spa, size, db);
			bcopy(db->db.db_data, (*quiescing)->b_data, size);
		} else {
			db->db.db_data = NULL;
			db->db_buf = NULL;
			db->db_state = DB_UNCACHED;
		}
		return;
	}

	/*
	 * If this buffer is referenced from the current syncing
	 * transaction group: either
	 *	1 - make a copy and reset the reference, or
	 *	2 - if there are no holders, just null the current db_data.
	 */
	if (*syncing == db->db_buf) {
		ASSERT3P(*quiescing, ==, NULL);
		ASSERT3U(db->db_dirtycnt, ==, 1);
		if (refcount_count(&db->db_holds) > db->db_dirtycnt) {
			/* we can't copy if we have already started a write */
			ASSERT(*syncing != db->db_data_pending);
			*syncing = arc_buf_alloc(
			    db->db_dnode->dn_objset->os_spa, size, db);
			bcopy(db->db.db_data, (*syncing)->b_data, size);
		} else {
			db->db.db_data = NULL;
			db->db_buf = NULL;
			db->db_state = DB_UNCACHED;
		}
	}
}

void
dbuf_unoverride(dmu_buf_impl_t *db, uint64_t txg)
{
	ASSERT(MUTEX_HELD(&db->db_mtx));
	if (db->db_d.db_overridden_by[txg&TXG_MASK] == IN_DMU_SYNC) {
		db->db_d.db_overridden_by[txg&TXG_MASK] = NULL;
	} else if (db->db_d.db_overridden_by[txg&TXG_MASK] != NULL) {
		/* free this block */
		ASSERT(list_link_active(&db->db_dirty_node[txg&TXG_MASK]) ||
		    db->db_dnode->dn_free_txg == txg);
		if (!BP_IS_HOLE(db->db_d.db_overridden_by[txg&TXG_MASK])) {
			/* XXX can get silent EIO here */
			(void) arc_free(NULL, db->db_dnode->dn_objset->os_spa,
			    txg, db->db_d.db_overridden_by[txg&TXG_MASK],
			    NULL, NULL, ARC_WAIT);
		}
		kmem_free(db->db_d.db_overridden_by[txg&TXG_MASK],
		    sizeof (blkptr_t));
		db->db_d.db_overridden_by[txg&TXG_MASK] = NULL;
		/* release the already-written buffer */
		arc_release(db->db_d.db_data_old[txg&TXG_MASK], db);
	}
}

void
dbuf_free_range(dnode_t *dn, uint64_t blkid, uint64_t nblks, dmu_tx_t *tx)
{
	dmu_buf_impl_t *db, *db_next;
	uint64_t txg = tx->tx_txg;

	dprintf_dnode(dn, "blkid=%llu nblks=%llu\n", blkid, nblks);
	mutex_enter(&dn->dn_dbufs_mtx);
	for (db = list_head(&dn->dn_dbufs); db; db = db_next) {
		db_next = list_next(&dn->dn_dbufs, db);
		if ((db->db_level != 0) || (db->db_blkid == DB_BONUS_BLKID))
			continue;
		dprintf_dbuf(db, "found buf %s\n", "");
		if (db->db_blkid < blkid ||
		    db->db_blkid >= blkid+nblks)
			continue;

		/* found a level 0 buffer in the range */
		if (dbuf_undirty(db, tx))
			continue;

		mutex_enter(&db->db_mtx);
		if (db->db_state == DB_UNCACHED) {
			ASSERT(db->db.db_data == NULL);
			mutex_exit(&db->db_mtx);
			continue;
		}
		if (db->db_state == DB_READ) {
			/* this will be handled in dbuf_read_done() */
			db->db_d.db_freed_in_flight = TRUE;
			mutex_exit(&db->db_mtx);
			continue;
		}
		if (db->db_state == DB_FILL) {
			/* this will be handled in dbuf_rele() */
			db->db_d.db_freed_in_flight = TRUE;
			mutex_exit(&db->db_mtx);
			continue;
		}

		/* make a copy of the data if necessary */
		dbuf_fix_old_data(db, txg);

		if (db->db.db_data) {
			/* fill in with appropriate data */
			arc_release(db->db_buf, db);
			bzero(db->db.db_data, db->db.db_size);
		}
		mutex_exit(&db->db_mtx);
	}
	mutex_exit(&dn->dn_dbufs_mtx);
}

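/*
 * Determine, for space accounting, whether writing this dbuf will consume
 * a net-new block on disk.  Returns FALSE for meta-objects; otherwise TRUE
 * unless the existing block (born at db_dirtied, or at the blkptr's birth
 * txg) is freeable relative to the most recent snapshot.
 */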
static int
dbuf_new_block(dmu_buf_impl_t *db, dmu_tx_t *tx)
{
	dsl_dataset_t *ds = db->db_objset->os_dsl_dataset;
	uint64_t birth_txg = 0;

	/* Don't count meta-objects */
	if (ds == NULL)
		return (FALSE);

	/*
	 * We don't need any locking to protect db_blkptr:
	 * If it's syncing, then db_dirtied will be set so we'll
	 * ignore db_blkptr.
	 */
	ASSERT(MUTEX_HELD(&db->db_mtx)); /* XXX strictly necessary? */
	/* If we have been dirtied since the last snapshot, it's not new */
	if (db->db_dirtied)
		birth_txg = db->db_dirtied;
	else if (db->db_blkptr)
		birth_txg = db->db_blkptr->blk_birth;

	if (birth_txg)
		return (!dsl_dataset_block_freeable(ds, birth_txg, tx));
	else
		return (TRUE);
}

void
dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx)
{
	arc_buf_t *buf, *obuf;
	int osize = db->db.db_size;

	/* XXX does *this* func really need the lock? */
	ASSERT(RW_WRITE_HELD(&db->db_dnode->dn_struct_rwlock));

	/*
	 * This call to dbuf_will_dirty() with the dn_struct_rwlock held
	 * is OK, because there can be no other references to the db
	 * when we are changing its size, so no concurrent DB_FILL can
	 * be happening.
	 */
	/* Make a copy of the data if necessary */
	dbuf_will_dirty(db, tx);

	/* create the data buffer for the new block */
	buf = arc_buf_alloc(db->db_dnode->dn_objset->os_spa, size, db);

	/* copy old block data to the new block */
	obuf = db->db_buf;
	bcopy(obuf->b_data, buf->b_data, MIN(osize, size));
	/* zero the remainder */
	if (size > osize)
		bzero((uint8_t *)buf->b_data + osize, size - osize);

	mutex_enter(&db->db_mtx);
	/* ASSERT3U(refcount_count(&db->db_holds), ==, 1); */
	dbuf_set_data(db, buf);
	arc_buf_free(obuf, db);
	db->db.db_size = size;

	/* fix up the dirty info */
	if (db->db_level == 0)
		db->db_d.db_data_old[tx->tx_txg&TXG_MASK] = buf;
	mutex_exit(&db->db_mtx);

	dnode_willuse_space(db->db_dnode, size-osize, tx);
}

void
dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
{
	dnode_t *dn = db->db_dnode;
	objset_impl_t *os = dn->dn_objset;
	int drop_struct_lock = FALSE;
	int txgoff = tx->tx_txg & TXG_MASK;

	ASSERT(tx->tx_txg != 0);
	ASSERT(!refcount_is_zero(&db->db_holds));
	DMU_TX_DIRTY_BUF(tx, db);

	/*
	 * Shouldn't dirty a regular buffer in syncing context.  Private
	 * objects may be dirtied in syncing context, but only if they
	 * were already pre-dirtied in open context.
	 * XXX We may want to prohibit dirtying in syncing context even
	 * if they did pre-dirty.
	 */
	ASSERT(!(dmu_tx_is_syncing(tx) &&
	    !BP_IS_HOLE(&dn->dn_objset->os_rootbp) &&
	    !(dn->dn_object & DMU_PRIVATE_OBJECT) &&
	    dn->dn_objset->os_dsl_dataset != NULL &&
	    !dsl_dir_is_private(
	    dn->dn_objset->os_dsl_dataset->ds_dir)));

	/*
	 * We make this assert for private objects as well, but after we
	 * check if we're already dirty.  They are allowed to re-dirty
	 * in syncing context.
	 */
	ASSERT(dn->dn_object & DMU_PRIVATE_OBJECT ||
	    dn->dn_dirtyctx == DN_UNDIRTIED ||
	    dn->dn_dirtyctx ==
	    (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN));

	mutex_enter(&db->db_mtx);
	/* XXX make this true for indirects too? */
	ASSERT(db->db_level != 0 || db->db_state == DB_CACHED ||
	    db->db_state == DB_FILL);

	/*
	 * If this buffer is currently part of an "overridden" region,
	 * we now need to remove it from that region.
	 */
	if (db->db_level == 0 && db->db_blkid != DB_BONUS_BLKID &&
	    db->db_d.db_overridden_by[txgoff] != NULL) {
		dbuf_unoverride(db, tx->tx_txg);
	}

	mutex_enter(&dn->dn_mtx);
	/*
	 * Don't set dirtyctx to SYNC if we're just modifying this as we
	 * initialize the objset.
	 */
	if (dn->dn_dirtyctx == DN_UNDIRTIED &&
	    !BP_IS_HOLE(&dn->dn_objset->os_rootbp)) {
		dn->dn_dirtyctx =
		    (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN);
		ASSERT(dn->dn_dirtyctx_firstset == NULL);
		dn->dn_dirtyctx_firstset = kmem_alloc(1, KM_SLEEP);
	}
	mutex_exit(&dn->dn_mtx);

	/*
	 * If this buffer is already dirty, we're done.
	 */
	if (list_link_active(&db->db_dirty_node[txgoff])) {
		mutex_exit(&db->db_mtx);
		return;
	}

	/*
	 * Only valid if not already dirty.
	 */
	ASSERT(dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx ==
	    (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN));

	ASSERT3U(dn->dn_nlevels, >, db->db_level);
	ASSERT((dn->dn_phys->dn_nlevels == 0 && db->db_level == 0) ||
	    dn->dn_phys->dn_nlevels > db->db_level ||
	    dn->dn_next_nlevels[txgoff] > db->db_level ||
	    dn->dn_next_nlevels[(tx->tx_txg-1) & TXG_MASK] > db->db_level ||
	    dn->dn_next_nlevels[(tx->tx_txg-2) & TXG_MASK] > db->db_level);

	/*
	 * We should only be dirtying in syncing context if it's the
	 * mos, a spa os, or we're initializing the os.  However, we are
	 * allowed to dirty in syncing context provided we already
	 * dirtied it in open context.  Hence we must make this
	 * assertion only if we're not already dirty.
	 */
	ASSERT(!dmu_tx_is_syncing(tx) ||
	    os->os_dsl_dataset == NULL ||
	    !dsl_dir_is_private(os->os_dsl_dataset->ds_dir) ||
	    !BP_IS_HOLE(&os->os_rootbp));
	ASSERT(db->db.db_size != 0);

	dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);

	if (db->db_level == 0) {
		/*
		 * Release the data buffer from the cache so that we
		 * can modify it without impacting possible other users
		 * of this cached data block.  Note that indirect blocks
		 * and private objects are not released until the syncing
		 * state (since they are only modified then).
		 *
		 * If this buffer is dirty in an old transaction group we need
		 * to make a copy of it so that the changes we make in this
		 * transaction group won't leak out when we sync the older txg.
		 */
		ASSERT(db->db_buf != NULL);
		ASSERT(db->db.db_data != NULL);
		ASSERT(db->db_d.db_data_old[txgoff] == NULL);
		if (!(db->db.db_object & DMU_PRIVATE_OBJECT)) {
			arc_release(db->db_buf, db);
			dbuf_fix_old_data(db, tx->tx_txg);
			ASSERT(db->db_buf != NULL);
		}
		db->db_d.db_data_old[txgoff] = db->db_buf;
	}

	mutex_enter(&dn->dn_mtx);
	/*
	 * We could have been freed_in_flight between the dbuf_noread
	 * and dbuf_dirty.  We win, as though the dbuf_noread() had
	 * happened after the free.
	 */
	if (db->db_level == 0 && db->db_blkid != DB_BONUS_BLKID) {
		dnode_clear_range(dn, db->db_blkid, 1, tx);
		db->db_d.db_freed_in_flight = FALSE;
	}

	db->db_dirtied = tx->tx_txg;
	list_insert_tail(&dn->dn_dirty_dbufs[txgoff], db);
	mutex_exit(&dn->dn_mtx);

	/*
	 * If writing this buffer will consume a new block on disk,
	 * then update the accounting.
	 */
	if (db->db_blkid != DB_BONUS_BLKID) {
		if (!dbuf_new_block(db, tx) && db->db_blkptr) {
			/*
			 * This is only a guess -- if the dbuf is dirty
			 * in a previous txg, we don't know how much
			 * space it will use on disk yet.  We should
			 * really have the struct_rwlock to access
			 * db_blkptr, but since this is just a guess,
			 * it's OK if we get an odd answer.
			 */
			dnode_willuse_space(dn,
			    -BP_GET_ASIZE(db->db_blkptr), tx);
		}
		dnode_willuse_space(dn, db->db.db_size, tx);
	}

	/*
	 * This buffer is now part of this txg
	 */
	dbuf_add_ref(db, (void *)(uintptr_t)tx->tx_txg);
	db->db_dirtycnt += 1;
	ASSERT3U(db->db_dirtycnt, <=, 3);

	mutex_exit(&db->db_mtx);

	if (db->db_blkid == DB_BONUS_BLKID) {
		dnode_setdirty(dn, tx);
		return;
	}

	if (db->db_level == 0)
		dnode_new_blkid(dn, db->db_blkid, tx);

	if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) {
		rw_enter(&dn->dn_struct_rwlock, RW_READER);
		drop_struct_lock = TRUE;
	}

	if (db->db_level < dn->dn_nlevels-1) {
		int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
		dmu_buf_impl_t *parent;
		parent = dbuf_hold_level(dn, db->db_level+1,
		    db->db_blkid >> epbs, FTAG);
		if (drop_struct_lock)
			rw_exit(&dn->dn_struct_rwlock);
		dbuf_dirty(parent, tx);
		dbuf_remove_ref(parent, FTAG);
	} else {
		if (drop_struct_lock)
			rw_exit(&dn->dn_struct_rwlock);
	}

	dnode_setdirty(dn, tx);
}

static int
dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
{
	dnode_t *dn = db->db_dnode;
	int txgoff = tx->tx_txg & TXG_MASK;

	ASSERT(tx->tx_txg != 0);

	mutex_enter(&db->db_mtx);

	/*
	 * If this buffer is not dirty, we're done.
	 */
	if (!list_link_active(&db->db_dirty_node[txgoff])) {
		mutex_exit(&db->db_mtx);
		return (0);
	}

	/*
	 * If this buffer is currently held, we cannot undirty
	 * it, since one of the current holders may be in the
	 * middle of an update.  Note that users of dbuf_undirty()
	 * should not place a hold on the dbuf before the call.
	 * XXX - this check assumes we are being called from
	 * dbuf_free_range(), perhaps we should move it there?
	 */
	if (refcount_count(&db->db_holds) > db->db_dirtycnt) {
		mutex_exit(&db->db_mtx);
		mutex_enter(&dn->dn_mtx);
		dnode_clear_range(dn, db->db_blkid, 1, tx);
		mutex_exit(&dn->dn_mtx);
		return (0);
	}

	dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);

	dbuf_unoverride(db, tx->tx_txg);

	ASSERT(db->db.db_size != 0);
	if (db->db_level == 0) {
		ASSERT(db->db_buf != NULL);
		ASSERT(db->db_d.db_data_old[txgoff] != NULL);
		if (db->db_d.db_data_old[txgoff] != db->db_buf)
			arc_buf_free(db->db_d.db_data_old[txgoff], db);
		db->db_d.db_data_old[txgoff] = NULL;
	}

	/* XXX would be nice to fix up dn_towrite_space[] */
	/* XXX undo db_dirtied? but how? */
	/* db->db_dirtied = tx->tx_txg; */

	mutex_enter(&dn->dn_mtx);
	list_remove(&dn->dn_dirty_dbufs[txgoff], db);
	mutex_exit(&dn->dn_mtx);

	ASSERT(db->db_dirtycnt > 0);
	db->db_dirtycnt -= 1;

	if (refcount_remove(&db->db_holds,
	    (void *)(uintptr_t)tx->tx_txg) == 0) {
		/* make dbuf_verify() happy */
		if (db->db.db_data)
			bzero(db->db.db_data, db->db.db_size);

		dbuf_evict(db);
		return (1);
	}

	mutex_exit(&db->db_mtx);
	return (0);
}

#pragma weak dmu_buf_will_dirty = dbuf_will_dirty
void
dbuf_will_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
{
	int rf = DB_RF_MUST_SUCCEED;

	ASSERT(tx->tx_txg != 0);
	ASSERT(!refcount_is_zero(&db->db_holds));

	if (RW_WRITE_HELD(&db->db_dnode->dn_struct_rwlock))
		rf |= DB_RF_HAVESTRUCT;
	(void) dbuf_read_generic(db, rf);
	dbuf_dirty(db, tx);
}

#pragma weak dmu_buf_will_fill = dbuf_will_fill
void
dbuf_will_fill(dmu_buf_impl_t *db, dmu_tx_t *tx)
{
	ASSERT(tx->tx_txg != 0);
	ASSERT(db->db_level == 0);
	ASSERT(!refcount_is_zero(&db->db_holds));

	ASSERT(!(db->db.db_object & DMU_PRIVATE_OBJECT) ||
	    dmu_tx_private_ok(tx));

	dbuf_noread(db);
	dbuf_dirty(db, tx);
}

#pragma weak dmu_buf_fill_done = dbuf_fill_done
/* ARGSUSED */
void
dbuf_fill_done(dmu_buf_impl_t *db, dmu_tx_t *tx)
{
	mutex_enter(&db->db_mtx);
	DBUF_VERIFY(db);

	if (db->db_state == DB_FILL) {
		if (db->db_level == 0 && db->db_d.db_freed_in_flight) {
			/* we were freed while filling */
			/* XXX dbuf_undirty? */
			bzero(db->db.db_data, db->db.db_size);
			db->db_d.db_freed_in_flight = FALSE;
		}
		db->db_state = DB_CACHED;
		cv_broadcast(&db->db_changed);
	}
	mutex_exit(&db->db_mtx);
}


static void
dbuf_clear(dmu_buf_impl_t *db)
{
	dnode_t *dn = db->db_dnode;

	ASSERT(MUTEX_HELD(&dn->dn_dbufs_mtx));
	ASSERT(MUTEX_HELD(&db->db_mtx));
	ASSERT(refcount_is_zero(&db->db_holds));

	if (db->db_state == DB_CACHED) {
		ASSERT(db->db_buf != NULL);
		arc_buf_free(db->db_buf, db);
		db->db.db_data = NULL;
		db->db_buf = NULL;
		db->db_state = DB_UNCACHED;
	}

	ASSERT3U(db->db_state, ==, DB_UNCACHED);
	ASSERT(db->db_buf == NULL);
	ASSERT(db->db_data_pending == NULL);

	mutex_exit(&db->db_mtx);

	/*
	 * If this dbuf is referenced from an indirect dbuf,
	 * decrement the ref count on the indirect dbuf.
	 */
	if (db->db_parent && db->db_parent != dn->dn_dbuf)
		dbuf_remove_ref(db->db_parent, db);

	/* remove from dn_dbufs */
	list_remove(&dn->dn_dbufs, db);

	dnode_rele(dn, db);

	dbuf_hash_remove(db);

	db->db_dnode = NULL;
	db->db_parent = NULL;
	db->db_blkptr = NULL;
}

static int
dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse,
    dmu_buf_impl_t **parentp, blkptr_t **bpp)
{
	int nlevels, epbs;

	if (dn->dn_phys->dn_nlevels == 0)
		nlevels = 1;
	else
		nlevels = dn->dn_phys->dn_nlevels;

	epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;

	ASSERT3U(level * epbs, <, 64);
	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
	if (blkid == DB_BONUS_BLKID) {
		/* this is the bonus buffer */
		*parentp = NULL;
		*bpp = NULL;
		return (0);
	} else if (level >= nlevels ||
	    (blkid > (dn->dn_phys->dn_maxblkid >> (level * epbs)))) {
		/* the buffer has no parent yet */
		*parentp = NULL;
		*bpp = NULL;
		return (ENOENT);
	} else if (level < nlevels-1) {
		/* this block is referenced from an indirect block */
		int err = dbuf_hold_impl(dn, level+1,
		    blkid >> epbs, fail_sparse, NULL, parentp);
		if (err)
			return (err);
		dbuf_read_havestruct(*parentp);
		*bpp = ((blkptr_t *)(*parentp)->db.db_data) +
		    (blkid & ((1ULL << epbs) - 1));
		return (0);
	} else {
		/* the block is referenced from the dnode */
		ASSERT3U(level, ==, nlevels-1);
		ASSERT(dn->dn_phys->dn_nblkptr == 0 ||
		    blkid < dn->dn_phys->dn_nblkptr);
		*parentp = dn->dn_dbuf;
		*bpp = &dn->dn_phys->dn_blkptr[blkid];
		return (0);
	}
}

static dmu_buf_impl_t *
dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid,
    dmu_buf_impl_t *parent, blkptr_t *blkptr)
{
	objset_impl_t *os = dn->dn_objset;
	dmu_buf_impl_t *db, *odb;

	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
	ASSERT(dn->dn_type != DMU_OT_NONE);

	db = kmem_cache_alloc(dbuf_cache, KM_SLEEP);

	db->db_objset = os;
	db->db.db_object = dn->dn_object;
	db->db_level = level;
	db->db_blkid = blkid;
	db->db_state = DB_UNCACHED;

	if (db->db_blkid == DB_BONUS_BLKID) {
		db->db.db_size = dn->dn_bonuslen;
		db->db.db_offset = DB_BONUS_BLKID;
	} else {
		int blocksize =
		    db->db_level ? 1<<dn->dn_indblkshift : dn->dn_datablksz;
		db->db.db_size = blocksize;
		db->db.db_offset = db->db_blkid * blocksize;
	}

	db->db_dirtied = 0;
	db->db_dirtycnt = 0;

	bzero(&db->db_d, sizeof (db->db_d));

	/*
	 * Hold the dn_dbufs_mtx while we get the new dbuf
	 * in the hash table *and* added to the dbufs list.
	 * This prevents a possible deadlock with someone
	 * trying to look up this dbuf before it's added to the
	 * dn_dbufs list.
	 */
	mutex_enter(&dn->dn_dbufs_mtx);
	if ((odb = dbuf_hash_insert(db)) != NULL) {
		/* someone else inserted it first */
		kmem_cache_free(dbuf_cache, db);
		mutex_exit(&dn->dn_dbufs_mtx);
		return (odb);
	}
	list_insert_head(&dn->dn_dbufs, db);
	mutex_exit(&dn->dn_dbufs_mtx);

	if (parent && parent != dn->dn_dbuf)
		dbuf_add_ref(parent, db);

	(void) refcount_add(&dn->dn_holds, db);

	db->db_dnode = dn;
	db->db_parent = parent;
	db->db_blkptr = blkptr;

	dprintf_dbuf(db, "db=%p\n", db);

	return (db);
}

static int
dbuf_evictable(dmu_buf_impl_t *db)
{
	int i;

	ASSERT(MUTEX_HELD(&db->db_mtx));
	DBUF_VERIFY(db);

	if (db->db_state != DB_UNCACHED && db->db_state != DB_CACHED)
		return (FALSE);

	if (!refcount_is_zero(&db->db_holds))
		return (FALSE);

#ifdef ZFS_DEBUG
	for (i = 0; i < TXG_SIZE; i++) {
		ASSERT(!list_link_active(&db->db_dirty_node[i]));
		ASSERT(db->db_level != 0 || db->db_d.db_data_old[i] == NULL);
	}
#endif

	/*
	 * Now we know we want to free it.
	 * This call must be done last, since it has side effects -
	 * calling the db_evict_func().
	 */
	dbuf_evict_user(db);
	return (TRUE);
}

static void
dbuf_destroy(dmu_buf_impl_t *db)
{
	ASSERT(refcount_is_zero(&db->db_holds));

	ASSERT(db->db.db_data == NULL);
	ASSERT(db->db_dnode == NULL);
	ASSERT(db->db_parent == NULL);
	ASSERT(db->db_hash_next == NULL);
	ASSERT(db->db_blkptr == NULL);
	ASSERT(db->db_data_pending == NULL);

	kmem_cache_free(dbuf_cache, db);
}

void
dbuf_prefetch(dnode_t *dn, uint64_t blkid)
{
	dmu_buf_impl_t *db, *parent = NULL;
	blkptr_t *bp = NULL;

	ASSERT(blkid != DB_BONUS_BLKID);
	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));

	if (dnode_block_freed(dn, blkid))
		return;

	/* dbuf_find() returns with db_mtx held */
	if (db = dbuf_find(dn, 0, blkid)) {
		/*
		 * This dbuf is already in the cache.  We assume that
		 * it is already CACHED, or else about to be either
		 * read or filled.
		 */
		mutex_exit(&db->db_mtx);
		return;
	}

	if (dbuf_findbp(dn, 0, blkid, TRUE, &parent, &bp) == 0) {
		if (bp && !BP_IS_HOLE(bp)) {
			(void) arc_read(NULL, dn->dn_objset->os_spa, bp,
			    dmu_ot[dn->dn_type].ot_byteswap,
			    NULL, NULL, ZIO_PRIORITY_ASYNC_READ,
			    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
			    (ARC_NOWAIT | ARC_PREFETCH));
		}
		if (parent && parent != dn->dn_dbuf)
			dbuf_rele(parent);
	}
}

/*
 * Returns with db_holds incremented, and db_mtx not held.
 * Note: dn_struct_rwlock must be held.
 */
int
dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, int fail_sparse,
    void *tag, dmu_buf_impl_t **dbp)
{
	dmu_buf_impl_t *db, *parent = NULL;

	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
	ASSERT3U(dn->dn_nlevels, >, level);

	*dbp = NULL;

	/* dbuf_find() returns with db_mtx held */
	db = dbuf_find(dn, level, blkid);

	if (db == NULL) {
		blkptr_t *bp = NULL;
		int err;

		err = dbuf_findbp(dn, level, blkid, fail_sparse, &parent, &bp);
		if (fail_sparse) {
			if (err == 0 && bp && BP_IS_HOLE(bp))
				err = ENOENT;
			if (err) {
				if (parent && parent != dn->dn_dbuf)
					dbuf_rele(parent);
				return (err);
			}
		}
		db = dbuf_create(dn, level, blkid, parent, bp);
	}

	/*
	 * If this buffer is currently syncing out, and we are
	 * still referencing it from db_data, we need to make
	 * a copy of it in case we decide we want to dirty it
	 * again in this txg.
	 */
	if (db->db_level == 0 && db->db_state == DB_CACHED &&
	    !(dn->dn_object & DMU_PRIVATE_OBJECT) &&
	    db->db_data_pending == db->db_buf) {
		int size = (db->db_blkid == DB_BONUS_BLKID) ?
		    DN_MAX_BONUSLEN : db->db.db_size;

		dbuf_set_data(db, arc_buf_alloc(db->db_dnode->dn_objset->os_spa,
		    size, db));
		bcopy(db->db_data_pending->b_data, db->db.db_data,
		    db->db.db_size);
	}

	dbuf_add_ref(db, tag);
	dbuf_update_data(db);
	DBUF_VERIFY(db);
	mutex_exit(&db->db_mtx);

	/* NOTE: we can't rele the parent until after we drop the db_mtx */
	if (parent && parent != dn->dn_dbuf)
		dbuf_rele(parent);

	ASSERT3P(db->db_dnode, ==, dn);
	ASSERT3U(db->db_blkid, ==, blkid);
	ASSERT3U(db->db_level, ==, level);
	*dbp = db;

	return (0);
}

dmu_buf_impl_t *
dbuf_hold(dnode_t *dn, uint64_t blkid)
{
	dmu_buf_impl_t *db;
	(void) dbuf_hold_impl(dn, 0, blkid, FALSE, NULL, &db);
	return (db);
}

dmu_buf_impl_t *
dbuf_hold_level(dnode_t *dn, int level, uint64_t blkid, void *tag)
{
	dmu_buf_impl_t *db;
	(void) dbuf_hold_impl(dn, level, blkid, FALSE, tag, &db);
	return (db);
}

dmu_buf_impl_t *
dbuf_hold_bonus(dnode_t *dn, void *tag)
{
	dmu_buf_impl_t *db;
	rw_enter(&dn->dn_struct_rwlock, RW_READER);
	(void) dbuf_hold_impl(dn, 0, DB_BONUS_BLKID, FALSE, tag, &db);
	rw_exit(&dn->dn_struct_rwlock);
	return (db);
}

void
dbuf_add_ref(dmu_buf_impl_t *db, void *tag)
{
	(void) refcount_add(&db->db_holds, tag);
	/* dprintf_dbuf(db, "adding ref %p; holds up to %lld\n", tag, holds); */
}

void
dbuf_remove_ref(dmu_buf_impl_t *db, void *tag)
{
	int64_t holds;
	dnode_t *dn = db->db_dnode;
	int need_mutex;

	ASSERT(dn != NULL);
	need_mutex = !MUTEX_HELD(&dn->dn_dbufs_mtx);

	if (need_mutex) {
		dnode_add_ref(dn, FTAG);
		mutex_enter(&dn->dn_dbufs_mtx);
	}

	mutex_enter(&db->db_mtx);
	DBUF_VERIFY(db);

	holds = refcount_remove(&db->db_holds, tag);

	if (holds == 0) {
		ASSERT3U(db->db_state, !=, DB_FILL);
		if (db->db_level == 0 &&
		    db->db_d.db_user_data_ptr_ptr != NULL)
			*db->db_d.db_user_data_ptr_ptr = NULL;
		dbuf_evict(db);
	} else {
		if (holds == db->db_dirtycnt &&
		    db->db_level == 0 && db->db_d.db_immediate_evict)
			dbuf_evict_user(db);
		mutex_exit(&db->db_mtx);
	}

	if (need_mutex) {
		mutex_exit(&dn->dn_dbufs_mtx);
		dnode_rele(dn, FTAG);
	}
}

void
dbuf_rele(dmu_buf_impl_t *db)
{
	dbuf_remove_ref(db, NULL);
}

#pragma weak dmu_buf_refcount = dbuf_refcount
uint64_t
dbuf_refcount(dmu_buf_impl_t *db)
{
	return (refcount_count(&db->db_holds));
}

void *
dmu_buf_set_user(dmu_buf_t *db_fake, void *user_ptr, void *user_data_ptr_ptr,
    dmu_buf_evict_func_t *evict_func)
{
	return (dmu_buf_update_user(db_fake, NULL, user_ptr,
	    user_data_ptr_ptr, evict_func));
}

void *
dmu_buf_set_user_ie(dmu_buf_t *db_fake, void *user_ptr, void *user_data_ptr_ptr,
    dmu_buf_evict_func_t *evict_func)
{
	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;

	db->db_d.db_immediate_evict = TRUE;
	return (dmu_buf_update_user(db_fake, NULL, user_ptr,
	    user_data_ptr_ptr, evict_func));
}

void *
dmu_buf_update_user(dmu_buf_t *db_fake, void *old_user_ptr, void *user_ptr,
    void *user_data_ptr_ptr, dmu_buf_evict_func_t *evict_func)
{
	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
	ASSERT(db->db_level == 0);

	ASSERT((user_ptr == NULL) == (evict_func == NULL));

	mutex_enter(&db->db_mtx);

	if (db->db_d.db_user_ptr == old_user_ptr) {
		db->db_d.db_user_ptr = user_ptr;
		db->db_d.db_user_data_ptr_ptr = user_data_ptr_ptr;
		db->db_d.db_evict_func = evict_func;

		dbuf_update_data(db);
	} else {
		old_user_ptr = db->db_d.db_user_ptr;
	}

	mutex_exit(&db->db_mtx);
	return (old_user_ptr);
}

void *
dmu_buf_get_user(dmu_buf_t *db_fake)
{
	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
	ASSERT(!refcount_is_zero(&db->db_holds));

	return (db->db_d.db_user_ptr);
}

void
dbuf_sync(dmu_buf_impl_t *db, zio_t *zio, dmu_tx_t *tx)
{
	arc_buf_t **data;
	uint64_t txg = tx->tx_txg;
	dnode_t *dn = db->db_dnode;
	objset_impl_t *os = dn->dn_objset;
	int epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
	int blksz;

	ASSERT(dmu_tx_is_syncing(tx));

	dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr);

	mutex_enter(&db->db_mtx);
	/*
	 * To be synced, we must be dirtied.  But we
	 * might have been freed after the dirty.
	 */
	if (db->db_state == DB_UNCACHED) {
		/* This buffer has been freed since it was dirtied */
		ASSERT(db->db.db_data == NULL);
	} else if (db->db_state == DB_FILL) {
		/* This buffer was freed and is now being re-filled */
		ASSERT(db->db.db_data != db->db_d.db_data_old[txg&TXG_MASK]);
	} else {
		ASSERT3U(db->db_state, ==, DB_CACHED);
	}
	DBUF_VERIFY(db);

	/*
	 * Don't need a lock on db_dirty (dn_mtx), because it can't
	 * be modified yet.
	 */

	if (db->db_level == 0) {
		data = &db->db_d.db_data_old[txg&TXG_MASK];
		blksz = arc_buf_size(*data);
		/*
		 * If this buffer is currently "in use" (i.e., there are
		 * active holds and db_data still references it), then make
		 * a copy before we start the write so that any modifications
		 * from the open txg will not leak into this write.
		 *
		 * NOTE: this copy does not need to be made for objects only
		 * modified in the syncing context (e.g. DNODE_DNODE blocks)
		 * or if there is no actual write involved (bonus blocks).
		 */
		if (!(dn->dn_object & DMU_PRIVATE_OBJECT) &&
		    db->db_d.db_overridden_by[txg&TXG_MASK] == NULL &&
		    db->db_blkid != DB_BONUS_BLKID) {
			if (refcount_count(&db->db_holds) > 1 &&
			    *data == db->db_buf) {
				*data = arc_buf_alloc(
				    db->db_dnode->dn_objset->os_spa, blksz, db);
				bcopy(db->db.db_data, (*data)->b_data, blksz);
			}
			db->db_data_pending = *data;
		} else if (dn->dn_object & DMU_PRIVATE_OBJECT) {
			/*
			 * Private object buffers are released here rather
			 * than in dbuf_dirty() since they are only modified
			 * in the syncing context and we don't want the
			 * overhead of making multiple copies of the data.
			 */
			arc_release(db->db_buf, db);
		}
	} else {
		data = &db->db_buf;
		if (*data == NULL) {
			/*
			 * This can happen if we dirty and then free
			 * the level-0 data blocks in the same txg.  So
			 * this indirect remains unchanged.
			 */
			if (db->db_dirtied == txg)
				db->db_dirtied = 0;
			ASSERT(db->db_dirtycnt > 0);
			db->db_dirtycnt -= 1;
			mutex_exit(&db->db_mtx);
			dbuf_remove_ref(db, (void *)(uintptr_t)txg);
			return;
		}
		blksz = db->db.db_size;
		ASSERT3U(blksz, ==, 1<<dn->dn_phys->dn_indblkshift);
	}

	ASSERT(*data != NULL);

	if (db->db_blkid == DB_BONUS_BLKID) {
		/*
		 * Simply copy the bonus data into the dnode.  It will
		 * be written out when the dnode is synced (and it will
		 * be synced, since it must have been dirty for dbuf_sync
		 * to be called).  The bonus data will be byte swapped
		 * in dnode_byteswap.
		 */
		/*
		 * Use dn_phys->dn_bonuslen since db.db_size is the length
		 * of the bonus buffer in the open transaction rather than
		 * the syncing transaction.
		 */
		ASSERT3U(db->db_level, ==, 0);
		ASSERT3U(dn->dn_phys->dn_bonuslen, <=, blksz);
		bcopy((*data)->b_data, DN_BONUS(dn->dn_phys),
		    dn->dn_phys->dn_bonuslen);
		if (*data != db->db_buf)
			arc_buf_free(*data, db);
		db->db_d.db_data_old[txg&TXG_MASK] = NULL;
		db->db_data_pending = NULL;
		if (db->db_dirtied == txg)
			db->db_dirtied = 0;
		ASSERT(db->db_dirtycnt > 0);
		db->db_dirtycnt -= 1;
		mutex_exit(&db->db_mtx);
		dbuf_remove_ref(db, (void *)(uintptr_t)txg);
		return;
	} else if (db->db_level > 0 && !arc_released(db->db_buf)) {
		/*
		 * This indirect buffer was marked dirty, but
		 * never modified (if it had been modified, then
		 * we would have released the buffer).  There is
		 * no reason to write anything.
		 */
		db->db_data_pending = NULL;
		if (db->db_dirtied == txg)
			db->db_dirtied = 0;
		ASSERT(db->db_dirtycnt > 0);
		db->db_dirtycnt -= 1;
		mutex_exit(&db->db_mtx);
		dbuf_remove_ref(db, (void *)(uintptr_t)txg);
		return;
	} else if (db->db_blkptr == NULL &&
	    db->db_level == dn->dn_phys->dn_nlevels-1 &&
	    db->db_blkid < dn->dn_phys->dn_nblkptr) {
		/*
		 * This buffer was allocated at a time when there was
		 * no available blkptrs from the dnode, or it was
		 * inappropriate to hook it in (i.e., nlevels mis-match).
		 */
		ASSERT(db->db_blkptr == NULL);
		ASSERT(db->db_parent == NULL);
		db->db_parent = dn->dn_dbuf;
		db->db_blkptr = &dn->dn_phys->dn_blkptr[db->db_blkid];
		DBUF_VERIFY(db);
		mutex_exit(&db->db_mtx);
	} else if (db->db_blkptr == NULL) {
		dmu_buf_impl_t *parent = db->db_parent;

		mutex_exit(&db->db_mtx);
		ASSERT(dn->dn_phys->dn_nlevels > 1);
		if (parent == NULL) {
			rw_enter(&dn->dn_struct_rwlock, RW_READER);
			(void) dbuf_hold_impl(dn, db->db_level+1,
			    db->db_blkid >> epbs, FALSE, NULL, &parent);
			rw_exit(&dn->dn_struct_rwlock);
			dbuf_add_ref(parent, db);
			db->db_parent = parent;
			dbuf_rele(parent);
		}
		dbuf_read(parent);
	} else {
		mutex_exit(&db->db_mtx);
	}

	ASSERT(IS_DNODE_DNODE(dn->dn_object) || db->db_parent != NULL);

	if (db->db_level > 0 &&
	    db->db_blkid > dn->dn_phys->dn_maxblkid >> (db->db_level * epbs)) {
		/*
		 * Don't write indirect blocks past EOF.
		 * We get these when we truncate a file *after* dirtying
		 * blocks in the truncate range (we undirty the level 0
		 * blocks in dbuf_free_range(), but not the indirects).
		 */
#ifdef ZFS_DEBUG
		/*
		 * Verify that this indirect block is empty.
		 */
		blkptr_t *bplist;
		int i;

		mutex_enter(&db->db_mtx);
		bplist = db->db.db_data;
		for (i = 0; i < (1 << epbs); i++) {
			if (!BP_IS_HOLE(&bplist[i])) {
				panic("data past EOF: "
				    "db=%p level=%d id=%llu i=%d\n",
				    db, db->db_level,
				    (u_longlong_t)db->db_blkid, i);
			}
		}
		mutex_exit(&db->db_mtx);
#endif
		ASSERT(db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr));
		mutex_enter(&db->db_mtx);
		db->db_dirtycnt -= 1;
		mutex_exit(&db->db_mtx);
		dbuf_remove_ref(db, (void *)(uintptr_t)txg);
		return;
	}

	if (db->db_parent != dn->dn_dbuf) {
		dmu_buf_impl_t *parent = db->db_parent;

		mutex_enter(&db->db_mtx);
		ASSERT(db->db_level == parent->db_level-1);
		ASSERT(list_link_active(&parent->db_dirty_node[txg&TXG_MASK]));
		/*
		 * We may have read this block after we dirtied it,
		 * so never released it from the cache.
		 */
		arc_release(parent->db_buf, parent);

		db->db_blkptr = (blkptr_t *)parent->db.db_data +
		    (db->db_blkid & ((1ULL << epbs) - 1));
		DBUF_VERIFY(db);
		mutex_exit(&db->db_mtx);
	}
	ASSERT(db->db_parent == NULL || arc_released(db->db_parent->db_buf));

#ifdef ZFS_DEBUG
	if (db->db_parent == dn->dn_dbuf) {
		/*
		 * We don't need to dnode_setdirty(dn) because if we got
		 * here then the parent is already dirty.
		 */
		ASSERT(db->db_level == dn->dn_phys->dn_nlevels-1);
		ASSERT3P(db->db_blkptr, ==,
		    &dn->dn_phys->dn_blkptr[db->db_blkid]);
	}
#endif
	if (db->db_level == 0 &&
	    db->db_d.db_overridden_by[txg&TXG_MASK] != NULL) {
		arc_buf_t **old = &db->db_d.db_data_old[txg&TXG_MASK];
		blkptr_t **bpp = &db->db_d.db_overridden_by[txg&TXG_MASK];
		int old_size = BP_GET_ASIZE(db->db_blkptr);
		int new_size = BP_GET_ASIZE(*bpp);

		ASSERT(db->db_blkid != DB_BONUS_BLKID);

		dnode_diduse_space(dn, new_size-old_size);
		mutex_enter(&dn->dn_mtx);
		if (db->db_blkid > dn->dn_phys->dn_maxblkid)
			dn->dn_phys->dn_maxblkid = db->db_blkid;
		mutex_exit(&dn->dn_mtx);

		dsl_dataset_block_born(os->os_dsl_dataset, *bpp, tx);
		if (!BP_IS_HOLE(db->db_blkptr))
			dsl_dataset_block_kill(os->os_dsl_dataset,
			    db->db_blkptr, os->os_synctx);

		mutex_enter(&db->db_mtx);
		*db->db_blkptr = **bpp;
		kmem_free(*bpp, sizeof (blkptr_t));
		*bpp = NULL;

		if (*old != db->db_buf)
			arc_buf_free(*old, db);
		*old = NULL;
		db->db_data_pending = NULL;

		cv_broadcast(&db->db_changed);

		ASSERT(db->db_dirtycnt > 0);
		db->db_dirtycnt -= 1;
		mutex_exit(&db->db_mtx);
		dbuf_remove_ref(db, (void *)(uintptr_t)txg);
	} else {
		int checksum, compress;

		if (db->db_level > 0) {
			/*
			 * XXX -- we should design a compression algorithm
			 * that specializes in arrays of bps.
			 */
			checksum = ZIO_CHECKSUM_FLETCHER_4;
			/* XXX - disable compression for now */
			compress = ZIO_COMPRESS_OFF;
		} else {
			/*
			 * Allow dnode settings to override objset settings,
			 * except for metadata checksums.
			 */
			if (dmu_ot[dn->dn_type].ot_metadata) {
				checksum = os->os_md_checksum;
				compress = zio_compress_select(dn->dn_compress,
				    os->os_md_compress);
			} else {
				checksum = zio_checksum_select(dn->dn_checksum,
				    os->os_checksum);
				compress = zio_compress_select(dn->dn_compress,
				    os->os_compress);
			}
		}
#ifdef ZFS_DEBUG
		if (db->db_parent) {
			ASSERT(list_link_active(
			    &db->db_parent->db_dirty_node[txg&TXG_MASK]));
			ASSERT(db->db_parent == dn->dn_dbuf ||
			    db->db_parent->db_level > 0);
			if (dn->dn_object & DMU_PRIVATE_OBJECT ||
			    db->db_level > 0)
				ASSERT(*data == db->db_buf);
		}
#endif
		ASSERT3U(db->db_blkptr->blk_birth, <=, tx->tx_txg);
		(void) arc_write(zio, os->os_spa, checksum, compress, txg,
		    db->db_blkptr, *data, dbuf_write_done, db,
		    ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, ARC_NOWAIT);
		/*
		 * We can't access db after arc_write, since it could finish
		 * and be freed, and we have no locks on it.
		 */
	}
}

struct dbuf_arg {
	objset_impl_t *os;
	blkptr_t bp;
};

static void
dbuf_do_born(void *arg)
{
	struct dbuf_arg *da = arg;
	dsl_dataset_block_born(da->os->os_dsl_dataset,
	    &da->bp, da->os->os_synctx);
	kmem_free(da, sizeof (struct dbuf_arg));
}

static void
dbuf_do_kill(void *arg)
{
	struct dbuf_arg *da = arg;
	dsl_dataset_block_kill(da->os->os_dsl_dataset,
	    &da->bp, da->os->os_synctx);
	kmem_free(da, sizeof (struct dbuf_arg));
}

/* ARGSUSED */
static void
dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
{
	dmu_buf_impl_t *db = vdb;
	dnode_t *dn = db->db_dnode;
	objset_impl_t *os = dn->dn_objset;
	uint64_t txg = zio->io_txg;
	uint64_t fill = 0;
	int i;
	int old_size, new_size;

	ASSERT3U(zio->io_error, ==, 0);

	dprintf_dbuf_bp(db, &zio->io_bp_orig, "bp_orig: %s", "");

	old_size = BP_GET_ASIZE(&zio->io_bp_orig);
	new_size = BP_GET_ASIZE(zio->io_bp);

	dnode_diduse_space(dn, new_size-old_size);

	mutex_enter(&db->db_mtx);

	if (db->db_dirtied == txg)
		db->db_dirtied = 0;

	if (db->db_level == 0) {
		arc_buf_t **old = &db->db_d.db_data_old[txg&TXG_MASK];

		ASSERT(db->db_blkid != DB_BONUS_BLKID);

		if (*old != db->db_buf)
			arc_buf_free(*old, db);
		*old = NULL;
		db->db_data_pending = NULL;

		mutex_enter(&dn->dn_mtx);
		if (db->db_blkid > dn->dn_phys->dn_maxblkid &&
		    !BP_IS_HOLE(db->db_blkptr))
			dn->dn_phys->dn_maxblkid = db->db_blkid;
		mutex_exit(&dn->dn_mtx);

		if (dn->dn_type == DMU_OT_DNODE) {
			dnode_phys_t *dnp = db->db.db_data;
			for (i = db->db.db_size >> DNODE_SHIFT; i > 0;
			    i--, dnp++) {
				if (dnp->dn_type != DMU_OT_NONE)
					fill++;
			}
		} else {
			if (!BP_IS_HOLE(db->db_blkptr))
				fill = 1;
		}
	} else {
		blkptr_t *bp = db->db.db_data;
		ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
		if (!BP_IS_HOLE(db->db_blkptr)) {
			int epbs =
			    dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
			ASSERT3U(BP_GET_LSIZE(zio->io_bp), ==, db->db.db_size);
			ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==,
			    db->db.db_size);
			ASSERT3U(dn->dn_phys->dn_maxblkid
			    >> (db->db_level * epbs), >=, db->db_blkid);
		}
		for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, bp++) {
			if (BP_IS_HOLE(bp))
				continue;
			ASSERT3U(BP_GET_LSIZE(bp), ==,
			    db->db_level == 1 ? dn->dn_datablksz :
			    (1<<dn->dn_phys->dn_indblkshift));
			fill += bp->blk_fill;
		}
	}

	if (!BP_IS_HOLE(db->db_blkptr)) {
		db->db_blkptr->blk_fill = fill;
		BP_SET_TYPE(db->db_blkptr, dn->dn_type);
		BP_SET_LEVEL(db->db_blkptr, db->db_level);
	} else {
		ASSERT3U(fill, ==, 0);
		ASSERT3U(db->db_blkptr->blk_fill, ==, 0);
	}

	dprintf_dbuf_bp(db, db->db_blkptr,
	    "wrote %llu bytes to blkptr:", zio->io_size);

	ASSERT(db->db_parent == NULL ||
	    list_link_active(&db->db_parent->db_dirty_node[txg&TXG_MASK]));
	cv_broadcast(&db->db_changed);
	ASSERT(db->db_dirtycnt > 0);
	db->db_dirtycnt -= 1;
	mutex_exit(&db->db_mtx);

	/* We must do this after we've set the bp's type and level */
	if (!DVA_EQUAL(BP_IDENTITY(zio->io_bp),
	    BP_IDENTITY(&zio->io_bp_orig))) {
		struct dbuf_arg *da;
		da = kmem_alloc(sizeof (struct dbuf_arg), KM_SLEEP);
		da->os = os;
		da->bp = *zio->io_bp;
		(void) taskq_dispatch(dbuf_tq, dbuf_do_born, da, 0);
		if (!BP_IS_HOLE(&zio->io_bp_orig)) {
			da = kmem_alloc(sizeof (struct dbuf_arg), KM_SLEEP);
			da->os = os;
			da->bp = zio->io_bp_orig;
			(void) taskq_dispatch(dbuf_tq, dbuf_do_kill, da, 0);
		}
	}

	dbuf_remove_ref(db, (void *)(uintptr_t)txg);
}