1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License, Version 1.0 only 6 * (the "License"). You may not use this file except in compliance 7 * with the License. 8 * 9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 10 * or http://www.opensolaris.org/os/licensing. 11 * See the License for the specific language governing permissions 12 * and limitations under the License. 13 * 14 * When distributing Covered Code, include this CDDL HEADER in each 15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 16 * If applicable, add the following below this CDDL HEADER, with the 17 * fields enclosed by brackets "[]" replaced with your own identifying 18 * information: Portions Copyright [yyyy] [name of copyright owner] 19 * 20 * CDDL HEADER END 21 */ 22 /* 23 * Copyright 2005 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 #pragma ident "%Z%%M% %I% %E% SMI" 28 29 #include <sys/zfs_context.h> 30 #include <sys/dmu.h> 31 #include <sys/dmu_impl.h> 32 #include <sys/dbuf.h> 33 #include <sys/dmu_objset.h> 34 #include <sys/dsl_dataset.h> 35 #include <sys/dsl_dir.h> 36 #include <sys/dmu_tx.h> 37 #include <sys/spa.h> 38 #include <sys/zio.h> 39 #include <sys/dmu_zfetch.h> 40 41 static void dbuf_destroy(dmu_buf_impl_t *db); 42 static void dbuf_verify(dmu_buf_impl_t *db); 43 static void dbuf_evict_user(dmu_buf_impl_t *db); 44 static int dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx); 45 static arc_done_func_t dbuf_read_done; 46 static arc_done_func_t dbuf_write_done; 47 48 /* 49 * Global data structures and functions for the dbuf cache. 50 */ 51 taskq_t *dbuf_tq; 52 static kmem_cache_t *dbuf_cache; 53 54 /* ARGSUSED */ 55 static int 56 dbuf_cons(void *vdb, void *unused, int kmflag) 57 { 58 dmu_buf_impl_t *db = vdb; 59 bzero(db, sizeof (dmu_buf_impl_t)); 60 61 mutex_init(&db->db_mtx, NULL, MUTEX_DEFAULT, NULL); 62 cv_init(&db->db_changed, NULL, CV_DEFAULT, NULL); 63 refcount_create(&db->db_holds); 64 return (0); 65 } 66 67 /* ARGSUSED */ 68 static void 69 dbuf_dest(void *vdb, void *unused) 70 { 71 dmu_buf_impl_t *db = vdb; 72 mutex_destroy(&db->db_mtx); 73 cv_destroy(&db->db_changed); 74 refcount_destroy(&db->db_holds); 75 } 76 77 /* 78 * dbuf hash table routines 79 */ 80 static dbuf_hash_table_t dbuf_hash_table; 81 82 static uint64_t dbuf_hash_count; 83 84 static uint64_t 85 dbuf_hash(void *os, uint64_t obj, uint8_t lvl, uint64_t blkid) 86 { 87 uintptr_t osv = (uintptr_t)os; 88 uint64_t crc = -1ULL; 89 90 ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY); 91 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (lvl)) & 0xFF]; 92 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (osv >> 6)) & 0xFF]; 93 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 0)) & 0xFF]; 94 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 8)) & 0xFF]; 95 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 0)) & 0xFF]; 96 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 8)) & 0xFF]; 97 98 crc ^= (osv>>14) ^ (obj>>16) ^ (blkid>>16); 99 100 return (crc); 101 } 102 103 #define DBUF_HASH(os, obj, level, blkid) dbuf_hash(os, obj, level, blkid); 104 105 #define DBUF_EQUAL(dbuf, os, obj, level, blkid) \ 106 ((dbuf)->db.db_object == (obj) && \ 107 (dbuf)->db_objset == (os) && \ 108 (dbuf)->db_level == (level) && \ 109 (dbuf)->db_blkid == (blkid)) 110 111 dmu_buf_impl_t * 112 dbuf_find(dnode_t *dn, uint8_t level, uint64_t blkid) 113 { 114 dbuf_hash_table_t *h = &dbuf_hash_table; 115 objset_impl_t 
*os = dn->dn_objset; 116 uint64_t obj = dn->dn_object; 117 uint64_t hv = DBUF_HASH(os, obj, level, blkid); 118 uint64_t idx = hv & h->hash_table_mask; 119 dmu_buf_impl_t *db; 120 121 mutex_enter(DBUF_HASH_MUTEX(h, idx)); 122 for (db = h->hash_table[idx]; db != NULL; db = db->db_hash_next) { 123 if (DBUF_EQUAL(db, os, obj, level, blkid)) { 124 mutex_enter(&db->db_mtx); 125 if (!refcount_is_zero(&db->db_holds)) { 126 mutex_exit(DBUF_HASH_MUTEX(h, idx)); 127 return (db); 128 } 129 mutex_exit(&db->db_mtx); 130 } 131 } 132 mutex_exit(DBUF_HASH_MUTEX(h, idx)); 133 return (NULL); 134 } 135 136 /* 137 * Insert an entry into the hash table. If there is already an element 138 * equal to elem in the hash table, then the already existing element 139 * will be returned and the new element will not be inserted. 140 * Otherwise returns NULL. 141 */ 142 static dmu_buf_impl_t * 143 dbuf_hash_insert(dmu_buf_impl_t *db) 144 { 145 dbuf_hash_table_t *h = &dbuf_hash_table; 146 objset_impl_t *os = db->db_objset; 147 uint64_t obj = db->db.db_object; 148 int level = db->db_level; 149 uint64_t blkid = db->db_blkid; 150 uint64_t hv = DBUF_HASH(os, obj, level, blkid); 151 uint64_t idx = hv & h->hash_table_mask; 152 dmu_buf_impl_t *dbf; 153 154 mutex_enter(DBUF_HASH_MUTEX(h, idx)); 155 for (dbf = h->hash_table[idx]; dbf != NULL; dbf = dbf->db_hash_next) { 156 if (DBUF_EQUAL(dbf, os, obj, level, blkid)) { 157 mutex_enter(&dbf->db_mtx); 158 if (!refcount_is_zero(&dbf->db_holds)) { 159 mutex_exit(DBUF_HASH_MUTEX(h, idx)); 160 return (dbf); 161 } 162 mutex_exit(&dbf->db_mtx); 163 } 164 } 165 166 mutex_enter(&db->db_mtx); 167 db->db_hash_next = h->hash_table[idx]; 168 h->hash_table[idx] = db; 169 mutex_exit(DBUF_HASH_MUTEX(h, idx)); 170 atomic_add_64(&dbuf_hash_count, 1); 171 172 return (NULL); 173 } 174 175 /* 176 * Remove an entry from the hash table. This operation will 177 * fail if there are any existing holds on the db. 178 */ 179 static void 180 dbuf_hash_remove(dmu_buf_impl_t *db) 181 { 182 dbuf_hash_table_t *h = &dbuf_hash_table; 183 uint64_t hv = DBUF_HASH(db->db_objset, db->db.db_object, 184 db->db_level, db->db_blkid); 185 uint64_t idx = hv & h->hash_table_mask; 186 dmu_buf_impl_t *dbf, **dbp; 187 188 /* 189 * We mustn't hold db_mtx to maintain lock ordering: 190 * DBUF_HASH_MUTEX > db_mtx.
191 */ 192 ASSERT(refcount_is_zero(&db->db_holds)); 193 ASSERT(db->db_dnode != NULL); 194 ASSERT(!MUTEX_HELD(&db->db_mtx)); 195 196 mutex_enter(DBUF_HASH_MUTEX(h, idx)); 197 dbp = &h->hash_table[idx]; 198 while ((dbf = *dbp) != db) { 199 dbp = &dbf->db_hash_next; 200 ASSERT(dbf != NULL); 201 } 202 *dbp = db->db_hash_next; 203 db->db_hash_next = NULL; 204 mutex_exit(DBUF_HASH_MUTEX(h, idx)); 205 atomic_add_64(&dbuf_hash_count, -1); 206 } 207 208 static int dbuf_evictable(dmu_buf_impl_t *db); 209 static void dbuf_clear(dmu_buf_impl_t *db); 210 211 void 212 dbuf_evict(dmu_buf_impl_t *db) 213 { 214 int err; 215 216 ASSERT(MUTEX_HELD(&db->db_mtx)); 217 err = dbuf_evictable(db); 218 ASSERT(err == TRUE); 219 dbuf_clear(db); 220 dbuf_destroy(db); 221 } 222 223 static void 224 dbuf_evict_user(dmu_buf_impl_t *db) 225 { 226 ASSERT(MUTEX_HELD(&db->db_mtx)); 227 228 if (db->db_level != 0 || db->db_d.db_evict_func == NULL) 229 return; 230 231 if (db->db_d.db_user_data_ptr_ptr) 232 *db->db_d.db_user_data_ptr_ptr = db->db.db_data; 233 db->db_d.db_evict_func(&db->db, db->db_d.db_user_ptr); 234 db->db_d.db_user_ptr = NULL; 235 db->db_d.db_user_data_ptr_ptr = NULL; 236 db->db_d.db_evict_func = NULL; 237 } 238 239 void 240 dbuf_init(void) 241 { 242 uint64_t hsize = 1; 243 dbuf_hash_table_t *h = &dbuf_hash_table; 244 int i; 245 246 /* 247 * The hash table is big enough to fill all of physical memory 248 * with an average 64k block size. The table will take up 249 * totalmem*sizeof(void*)/64k bytes (i.e. 128KB/GB with 8-byte 250 * pointers). 251 */ 252 while (hsize * 65536 < physmem * PAGESIZE) 253 hsize <<= 1; 254 255 h->hash_table_mask = hsize - 1; 256 h->hash_table = kmem_zalloc(hsize * sizeof (void *), KM_SLEEP); 257 258 dbuf_cache = kmem_cache_create("dmu_buf_impl_t", 259 sizeof (dmu_buf_impl_t), 260 0, dbuf_cons, dbuf_dest, NULL, NULL, NULL, 0); 261 dbuf_tq = taskq_create("dbuf_tq", 8, maxclsyspri, 50, INT_MAX, 262 TASKQ_PREPOPULATE); 263 264 for (i = 0; i < DBUF_MUTEXES; i++) 265 mutex_init(&h->hash_mutexes[i], NULL, MUTEX_DEFAULT, NULL); 266 } 267 268 void 269 dbuf_fini(void) 270 { 271 dbuf_hash_table_t *h = &dbuf_hash_table; 272 int i; 273 274 taskq_destroy(dbuf_tq); 275 dbuf_tq = NULL; 276 277 for (i = 0; i < DBUF_MUTEXES; i++) 278 mutex_destroy(&h->hash_mutexes[i]); 279 kmem_free(h->hash_table, (h->hash_table_mask + 1) * sizeof (void *)); 280 kmem_cache_destroy(dbuf_cache); 281 } 282 283 /* 284 * Other stuff. 
285 */ 286 287 static void 288 dbuf_verify(dmu_buf_impl_t *db) 289 { 290 #ifdef ZFS_DEBUG 291 int i; 292 dnode_t *dn = db->db_dnode; 293 294 ASSERT(MUTEX_HELD(&db->db_mtx)); 295 296 if (!(zfs_flags & ZFS_DEBUG_DBUF_VERIFY)) 297 return; 298 299 ASSERT(db->db_objset != NULL); 300 if (dn == NULL) { 301 ASSERT(db->db_parent == NULL); 302 ASSERT(db->db_blkptr == NULL); 303 } else { 304 ASSERT3U(db->db.db_object, ==, dn->dn_object); 305 ASSERT3P(db->db_objset, ==, dn->dn_objset); 306 ASSERT(list_head(&dn->dn_dbufs)); 307 ASSERT3U(db->db_level, <, dn->dn_nlevels); 308 } 309 if (db->db_blkid == DB_BONUS_BLKID) { 310 ASSERT(dn != NULL); 311 ASSERT3U(db->db.db_size, ==, dn->dn_bonuslen); 312 ASSERT3U(db->db.db_offset, ==, DB_BONUS_BLKID); 313 } else { 314 ASSERT3U(db->db.db_offset, ==, db->db_blkid * db->db.db_size); 315 } 316 317 if (db->db_level == 0) { 318 void **udpp = db->db_d.db_user_data_ptr_ptr; 319 /* we can be momentarily larger in dnode_set_blksz() */ 320 if (db->db_blkid != DB_BONUS_BLKID && dn) { 321 ASSERT3U(db->db.db_size, >=, dn->dn_datablksz); 322 } 323 if (udpp) { 324 ASSERT((refcount_is_zero(&db->db_holds) && 325 *udpp == NULL) || 326 (!refcount_is_zero(&db->db_holds) && 327 *udpp == db->db.db_data)); 328 } 329 330 if (IS_DNODE_DNODE(db->db.db_object)) { 331 for (i = 0; i < TXG_SIZE; i++) { 332 /* 333 * it should only be modified in syncing 334 * context, so make sure we only have 335 * one copy of the data. 336 */ 337 ASSERT(db->db_d.db_data_old[i] == NULL || 338 db->db_d.db_data_old[i] == db->db_buf); 339 } 340 } 341 } 342 343 /* verify db->db_blkptr */ 344 if (db->db_blkptr) { 345 if (db->db_parent == dn->dn_dbuf) { 346 /* db is pointed to by the dnode */ 347 /* ASSERT3U(db->db_blkid, <, dn->dn_nblkptr); */ 348 if (IS_DNODE_DNODE(db->db.db_object)) 349 ASSERT(db->db_parent == NULL); 350 else 351 ASSERT(db->db_parent != NULL); 352 ASSERT3P(db->db_blkptr, ==, 353 &dn->dn_phys->dn_blkptr[db->db_blkid]); 354 } else { 355 /* db is pointed to by an indirect block */ 356 int epb = db->db_parent->db.db_size >> SPA_BLKPTRSHIFT; 357 ASSERT3U(db->db_parent->db_level, ==, db->db_level+1); 358 ASSERT3U(db->db_parent->db.db_object, ==, 359 db->db.db_object); 360 /* 361 * dnode_grow_indblksz() can make this fail if we don't 362 * have the struct_rwlock. XXX indblksz no longer 363 * grows. safe to do this now? 364 */ 365 if (RW_WRITE_HELD(&db->db_dnode->dn_struct_rwlock)) { 366 ASSERT3P(db->db_blkptr, ==, 367 ((blkptr_t *)db->db_parent->db.db_data + 368 db->db_blkid % epb)); 369 } 370 } 371 } 372 if ((db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr)) && 373 db->db.db_data && db->db_blkid != DB_BONUS_BLKID && 374 db->db_state != DB_FILL && !dn->dn_free_txg) { 375 /* 376 * If the blkptr isn't set but they have nonzero data, 377 * it had better be dirty, otherwise we'll lose that 378 * data when we evict this buffer. 
379 */ 380 if (db->db_dirtycnt == 0) { 381 uint64_t *buf = db->db.db_data; 382 int i; 383 384 for (i = 0; i < db->db.db_size >> 3; i++) { 385 ASSERT(buf[i] == 0); 386 } 387 } 388 } 389 #endif 390 } 391 392 static void 393 dbuf_update_data(dmu_buf_impl_t *db) 394 { 395 ASSERT(MUTEX_HELD(&db->db_mtx)); 396 if (db->db_level == 0 && db->db_d.db_user_data_ptr_ptr) { 397 ASSERT(!refcount_is_zero(&db->db_holds)); 398 *db->db_d.db_user_data_ptr_ptr = db->db.db_data; 399 } 400 } 401 402 static void 403 dbuf_set_data(dmu_buf_impl_t *db, arc_buf_t *buf) 404 { 405 ASSERT(MUTEX_HELD(&db->db_mtx)); 406 ASSERT(buf->b_data != NULL); 407 db->db_buf = buf; 408 db->db.db_data = buf->b_data; 409 dbuf_update_data(db); 410 } 411 412 uint64_t 413 dbuf_whichblock(dnode_t *dn, uint64_t offset) 414 { 415 if (dn->dn_datablkshift) { 416 return (offset >> dn->dn_datablkshift); 417 } else { 418 ASSERT3U(offset, <, dn->dn_datablksz); 419 return (0); 420 } 421 } 422 423 static void 424 dbuf_read_done(zio_t *zio, arc_buf_t *buf, void *vdb) 425 { 426 dmu_buf_impl_t *db = vdb; 427 428 mutex_enter(&db->db_mtx); 429 ASSERT3U(db->db_state, ==, DB_READ); 430 /* 431 * All reads are synchronous, so we must have a hold on the dbuf 432 */ 433 ASSERT(refcount_count(&db->db_holds) > 0); 434 ASSERT(db->db.db_data == NULL); 435 if (db->db_level == 0 && db->db_d.db_freed_in_flight) { 436 /* we were freed in flight; disregard any error */ 437 arc_release(buf, db); 438 bzero(buf->b_data, db->db.db_size); 439 db->db_d.db_freed_in_flight = FALSE; 440 dbuf_set_data(db, buf); 441 db->db_state = DB_CACHED; 442 } else if (zio == NULL || zio->io_error == 0) { 443 dbuf_set_data(db, buf); 444 db->db_state = DB_CACHED; 445 } else { 446 ASSERT(db->db_blkid != DB_BONUS_BLKID); 447 arc_buf_free(buf, db); 448 db->db_state = DB_UNCACHED; 449 ASSERT3P(db->db_buf, ==, NULL); 450 } 451 cv_broadcast(&db->db_changed); 452 mutex_exit(&db->db_mtx); 453 } 454 455 void 456 dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags) 457 { 458 arc_buf_t *buf; 459 blkptr_t *bp; 460 461 ASSERT(!refcount_is_zero(&db->db_holds)); 462 /* We need the struct_rwlock to prevent db_blkptr from changing. 
*/ 463 ASSERT(RW_LOCK_HELD(&db->db_dnode->dn_struct_rwlock)); 464 465 /* 466 * prefetch only data blocks (level 0) -- don't prefetch indirect 467 * blocks 468 */ 469 if ((db->db_level > 0) || (db->db_blkid == DB_BONUS_BLKID)) { 470 flags |= DB_RF_NOPREFETCH; 471 } 472 473 if (((flags & DB_RF_NOPREFETCH) == 0) && (db->db_dnode != NULL)) { 474 dmu_zfetch(&db->db_dnode->dn_zfetch, db->db.db_offset, 475 db->db.db_size); 476 } 477 478 if (db->db_state == DB_CACHED) { 479 ASSERT(db->db.db_data != NULL); 480 return; 481 } 482 483 mutex_enter(&db->db_mtx); 484 485 if (db->db_state != DB_UNCACHED) { 486 mutex_exit(&db->db_mtx); 487 return; 488 } 489 490 ASSERT3U(db->db_state, ==, DB_UNCACHED); 491 492 if (db->db_blkid == DB_BONUS_BLKID) { 493 ASSERT3U(db->db_dnode->dn_bonuslen, ==, db->db.db_size); 494 buf = arc_buf_alloc(db->db_dnode->dn_objset->os_spa, 495 DN_MAX_BONUSLEN, db); 496 if (db->db.db_size < DN_MAX_BONUSLEN) 497 bzero(buf->b_data, DN_MAX_BONUSLEN); 498 bcopy(DN_BONUS(db->db_dnode->dn_phys), buf->b_data, 499 db->db.db_size); 500 dbuf_set_data(db, buf); 501 db->db_state = DB_CACHED; 502 mutex_exit(&db->db_mtx); 503 return; 504 } 505 506 if (db->db_level == 0 && dnode_block_freed(db->db_dnode, db->db_blkid)) 507 bp = NULL; 508 else 509 bp = db->db_blkptr; 510 511 if (bp == NULL) 512 dprintf_dbuf(db, "blkptr: %s\n", "NULL"); 513 else 514 dprintf_dbuf_bp(db, bp, "%s", "blkptr:"); 515 516 if (bp == NULL || BP_IS_HOLE(bp)) { 517 ASSERT(bp == NULL || BP_IS_HOLE(bp)); 518 dbuf_set_data(db, arc_buf_alloc(db->db_dnode->dn_objset->os_spa, 519 db->db.db_size, db)); 520 bzero(db->db.db_data, db->db.db_size); 521 db->db_state = DB_CACHED; 522 mutex_exit(&db->db_mtx); 523 return; 524 } 525 526 db->db_state = DB_READ; 527 mutex_exit(&db->db_mtx); 528 529 /* ZIO_FLAG_CANFAIL callers have to check the parent zio's error */ 530 (void) arc_read(zio, db->db_dnode->dn_objset->os_spa, bp, 531 db->db_level > 0 ? byteswap_uint64_array : 532 dmu_ot[db->db_dnode->dn_type].ot_byteswap, 533 dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ, 534 (flags & DB_RF_CANFAIL) ? ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED, 535 ARC_NOWAIT); 536 } 537 538 static int 539 dbuf_read_generic(dmu_buf_impl_t *db, uint32_t flags) 540 { 541 zio_t *zio; 542 int err; 543 544 /* 545 * We don't have to hold the mutex to check db_state because it 546 * can't be freed while we have a hold on the buffer. 
547 */ 548 ASSERT(!refcount_is_zero(&db->db_holds)); 549 if (db->db_state == DB_CACHED) 550 return (0); 551 552 if (db->db_state == DB_UNCACHED) { 553 zio = zio_root(db->db_dnode->dn_objset->os_spa, NULL, NULL, 554 ZIO_FLAG_CANFAIL); 555 if ((flags & DB_RF_HAVESTRUCT) == 0) 556 rw_enter(&db->db_dnode->dn_struct_rwlock, RW_READER); 557 dbuf_read_impl(db, zio, flags); 558 if ((flags & DB_RF_HAVESTRUCT) == 0) 559 rw_exit(&db->db_dnode->dn_struct_rwlock); 560 err = zio_wait(zio); 561 if (err) 562 return (err); 563 } 564 565 mutex_enter(&db->db_mtx); 566 while (db->db_state == DB_READ || db->db_state == DB_FILL) { 567 ASSERT(db->db_state == DB_READ || 568 (flags & DB_RF_HAVESTRUCT) == 0); 569 cv_wait(&db->db_changed, &db->db_mtx); 570 } 571 ASSERT3U(db->db_state, ==, DB_CACHED); 572 mutex_exit(&db->db_mtx); 573 574 return (0); 575 } 576 577 #pragma weak dmu_buf_read = dbuf_read 578 void 579 dbuf_read(dmu_buf_impl_t *db) 580 { 581 int err; 582 583 err = dbuf_read_generic(db, DB_RF_MUST_SUCCEED); 584 ASSERT(err == 0); 585 } 586 587 #pragma weak dmu_buf_read_canfail = dbuf_read_canfail 588 int 589 dbuf_read_canfail(dmu_buf_impl_t *db) 590 { 591 return (dbuf_read_generic(db, DB_RF_CANFAIL)); 592 } 593 594 void 595 dbuf_read_havestruct(dmu_buf_impl_t *db) 596 { 597 int err; 598 599 ASSERT(RW_LOCK_HELD(&db->db_dnode->dn_struct_rwlock)); 600 err = dbuf_read_generic(db, (DB_RF_HAVESTRUCT | DB_RF_NOPREFETCH)); 601 ASSERT(err == 0); 602 } 603 604 static void 605 dbuf_noread(dmu_buf_impl_t *db) 606 { 607 ASSERT(!refcount_is_zero(&db->db_holds)); 608 mutex_enter(&db->db_mtx); 609 while (db->db_state == DB_READ || db->db_state == DB_FILL) 610 cv_wait(&db->db_changed, &db->db_mtx); 611 if (db->db_state == DB_UNCACHED) { 612 int blksz = (db->db_blkid == DB_BONUS_BLKID) ? 613 DN_MAX_BONUSLEN : db->db.db_size; 614 ASSERT(db->db.db_data == NULL); 615 dbuf_set_data(db, arc_buf_alloc(db->db_dnode->dn_objset->os_spa, 616 blksz, db)); 617 db->db_state = DB_FILL; 618 } else { 619 ASSERT3U(db->db_state, ==, DB_CACHED); 620 } 621 mutex_exit(&db->db_mtx); 622 } 623 624 /* 625 * This is our just-in-time copy function. It makes a copy of 626 * buffers, that have been modified in a previous transaction 627 * group, before we modify them in the current active group. 628 * 629 * This function is used in two places: when we are dirtying a 630 * buffer for the first time in a txg, and when we are freeing 631 * a range in a dnode that includes this buffer. 632 * 633 * Note that when we are called from dbuf_free_range() we do 634 * not put a hold on the buffer, we just traverse the active 635 * dbuf list for the dnode. 636 */ 637 static void 638 dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg) 639 { 640 arc_buf_t **quiescing, **syncing; 641 int size = (db->db_blkid == DB_BONUS_BLKID) ? 642 DN_MAX_BONUSLEN : db->db.db_size; 643 644 ASSERT(MUTEX_HELD(&db->db_mtx)); 645 ASSERT(db->db.db_data != NULL); 646 647 quiescing = &db->db_d.db_data_old[(txg-1)&TXG_MASK]; 648 syncing = &db->db_d.db_data_old[(txg-2)&TXG_MASK]; 649 650 /* 651 * If this buffer is referenced from the current quiescing 652 * transaction group: either make a copy and reset the reference 653 * to point to the copy, or (if there a no active holders) just 654 * null out the current db_data pointer. 655 */ 656 if (*quiescing == db->db_buf) { 657 /* 658 * If the quiescing txg is "dirty", then we better not 659 * be referencing the same buffer from the syncing txg. 
660 */ 661 ASSERT(*syncing != db->db_buf); 662 if (refcount_count(&db->db_holds) > db->db_dirtycnt) { 663 *quiescing = arc_buf_alloc( 664 db->db_dnode->dn_objset->os_spa, size, db); 665 bcopy(db->db.db_data, (*quiescing)->b_data, size); 666 } else { 667 db->db.db_data = NULL; 668 db->db_buf = NULL; 669 db->db_state = DB_UNCACHED; 670 } 671 return; 672 } 673 674 /* 675 * If this buffer is referenced from the current syncing 676 * transaction group: either 677 * 1 - make a copy and reset the reference, or 678 * 2 - if there are no holders, just null the current db_data. 679 */ 680 if (*syncing == db->db_buf) { 681 ASSERT3P(*quiescing, ==, NULL); 682 ASSERT3U(db->db_dirtycnt, ==, 1); 683 if (refcount_count(&db->db_holds) > db->db_dirtycnt) { 684 /* we can't copy if we have already started a write */ 685 ASSERT(*syncing != db->db_data_pending); 686 *syncing = arc_buf_alloc( 687 db->db_dnode->dn_objset->os_spa, size, db); 688 bcopy(db->db.db_data, (*syncing)->b_data, size); 689 } else { 690 db->db.db_data = NULL; 691 db->db_buf = NULL; 692 db->db_state = DB_UNCACHED; 693 } 694 } 695 } 696 697 void 698 dbuf_unoverride(dmu_buf_impl_t *db, uint64_t txg) 699 { 700 ASSERT(MUTEX_HELD(&db->db_mtx)); 701 if (db->db_d.db_overridden_by[txg&TXG_MASK] == IN_DMU_SYNC) { 702 db->db_d.db_overridden_by[txg&TXG_MASK] = NULL; 703 } else if (db->db_d.db_overridden_by[txg&TXG_MASK] != NULL) { 704 /* free this block */ 705 ASSERT(list_link_active(&db->db_dirty_node[txg&TXG_MASK]) || 706 db->db_dnode->dn_free_txg == txg); 707 if (!BP_IS_HOLE(db->db_d.db_overridden_by[txg&TXG_MASK])) { 708 /* XXX can get silent EIO here */ 709 (void) arc_free(NULL, db->db_dnode->dn_objset->os_spa, 710 txg, db->db_d.db_overridden_by[txg&TXG_MASK], 711 NULL, NULL, ARC_WAIT); 712 } 713 kmem_free(db->db_d.db_overridden_by[txg&TXG_MASK], 714 sizeof (blkptr_t)); 715 db->db_d.db_overridden_by[txg&TXG_MASK] = NULL; 716 /* release the already-written buffer */ 717 arc_release(db->db_d.db_data_old[txg&TXG_MASK], db); 718 } 719 } 720 721 void 722 dbuf_free_range(dnode_t *dn, uint64_t blkid, uint64_t nblks, dmu_tx_t *tx) 723 { 724 dmu_buf_impl_t *db, *db_next; 725 uint64_t txg = tx->tx_txg; 726 727 dprintf_dnode(dn, "blkid=%llu nblks=%llu\n", blkid, nblks); 728 mutex_enter(&dn->dn_dbufs_mtx); 729 for (db = list_head(&dn->dn_dbufs); db; db = db_next) { 730 db_next = list_next(&dn->dn_dbufs, db); 731 if ((db->db_level != 0) || (db->db_blkid == DB_BONUS_BLKID)) 732 continue; 733 dprintf_dbuf(db, "found buf %s\n", ""); 734 if (db->db_blkid < blkid || 735 db->db_blkid >= blkid+nblks) 736 continue; 737 738 /* found a level 0 buffer in the range */ 739 if (dbuf_undirty(db, tx)) 740 continue; 741 742 mutex_enter(&db->db_mtx); 743 if (db->db_state == DB_UNCACHED) { 744 ASSERT(db->db.db_data == NULL); 745 mutex_exit(&db->db_mtx); 746 continue; 747 } 748 if (db->db_state == DB_READ) { 749 /* this will be handled in dbuf_read_done() */ 750 db->db_d.db_freed_in_flight = TRUE; 751 mutex_exit(&db->db_mtx); 752 continue; 753 } 754 if (db->db_state == DB_FILL) { 755 /* this will be handled in dbuf_rele() */ 756 db->db_d.db_freed_in_flight = TRUE; 757 mutex_exit(&db->db_mtx); 758 continue; 759 } 760 761 /* make a copy of the data if necessary */ 762 dbuf_fix_old_data(db, txg); 763 764 if (db->db.db_data) { 765 /* fill in with appropriate data */ 766 arc_release(db->db_buf, db); 767 bzero(db->db.db_data, db->db.db_size); 768 } 769 mutex_exit(&db->db_mtx); 770 } 771 mutex_exit(&dn->dn_dbufs_mtx); 772 } 773 774 static int 775 dbuf_new_block(dmu_buf_impl_t *db, 
dmu_tx_t *tx) 776 { 777 dsl_dataset_t *ds = db->db_objset->os_dsl_dataset; 778 uint64_t birth_txg = 0; 779 780 /* Don't count meta-objects */ 781 if (ds == NULL) 782 return (FALSE); 783 784 /* 785 * We don't need any locking to protect db_blkptr: 786 * If it's syncing, then db_dirtied will be set so we'll 787 * ignore db_blkptr. 788 */ 789 ASSERT(MUTEX_HELD(&db->db_mtx)); /* XXX strictly necessary? */ 790 /* If we have been dirtied since the last snapshot, its not new */ 791 if (db->db_dirtied) 792 birth_txg = db->db_dirtied; 793 else if (db->db_blkptr) 794 birth_txg = db->db_blkptr->blk_birth; 795 796 if (birth_txg) 797 return (!dsl_dataset_block_freeable(ds, birth_txg, tx)); 798 else 799 return (TRUE); 800 } 801 802 void 803 dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx) 804 { 805 arc_buf_t *buf, *obuf; 806 int osize = db->db.db_size; 807 808 /* XXX does *this* func really need the lock? */ 809 ASSERT(RW_WRITE_HELD(&db->db_dnode->dn_struct_rwlock)); 810 811 ASSERT3U(osize, <=, size); 812 if (osize == size) 813 return; 814 815 /* 816 * This call to dbuf_will_dirty() with the dn_struct_rwlock held 817 * is OK, because there can be no other references to the db 818 * when we are changing its size, so no concurrent DB_FILL can 819 * be happening. 820 */ 821 /* Make a copy of the data if necessary */ 822 dbuf_will_dirty(db, tx); 823 824 /* create the data buffer for the new block */ 825 buf = arc_buf_alloc(db->db_dnode->dn_objset->os_spa, size, db); 826 827 /* copy old block data to the new block */ 828 obuf = db->db_buf; 829 bcopy(obuf->b_data, buf->b_data, osize); 830 /* zero the remainder */ 831 bzero((uint8_t *)buf->b_data + osize, size - osize); 832 833 mutex_enter(&db->db_mtx); 834 /* ASSERT3U(refcount_count(&db->db_holds), ==, 1); */ 835 dbuf_set_data(db, buf); 836 arc_buf_free(obuf, db); 837 db->db.db_size = size; 838 839 /* fix up the dirty info */ 840 if (db->db_level == 0) 841 db->db_d.db_data_old[tx->tx_txg&TXG_MASK] = buf; 842 mutex_exit(&db->db_mtx); 843 844 dnode_willuse_space(db->db_dnode, size-osize, tx); 845 } 846 847 void 848 dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) 849 { 850 dnode_t *dn = db->db_dnode; 851 objset_impl_t *os = dn->dn_objset; 852 int drop_struct_lock = FALSE; 853 int txgoff = tx->tx_txg & TXG_MASK; 854 855 ASSERT(tx->tx_txg != 0); 856 ASSERT(!refcount_is_zero(&db->db_holds)); 857 dmu_tx_dirty_buf(tx, db); 858 859 /* 860 * Shouldn't dirty a regular buffer in syncing context. Private 861 * objects may be dirtied in syncing context, but only if they 862 * were already pre-dirtied in open context. 863 * XXX We may want to prohibit dirtying in syncing context even 864 * if they did pre-dirty. 865 */ 866 ASSERT(!(dmu_tx_is_syncing(tx) && 867 !BP_IS_HOLE(&dn->dn_objset->os_rootbp) && 868 !(dn->dn_object & DMU_PRIVATE_OBJECT) && 869 dn->dn_objset->os_dsl_dataset != NULL && 870 !dsl_dir_is_private( 871 dn->dn_objset->os_dsl_dataset->ds_dir))); 872 873 /* 874 * We make this assert for private objects as well, but after we 875 * check if we're already dirty. They are allowed to re-dirty 876 * in syncing context. 877 */ 878 ASSERT(dn->dn_object & DMU_PRIVATE_OBJECT || 879 dn->dn_dirtyctx == DN_UNDIRTIED || 880 dn->dn_dirtyctx == 881 (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN)); 882 883 mutex_enter(&db->db_mtx); 884 /* XXX make this true for indirects too? 
*/ 885 ASSERT(db->db_level != 0 || db->db_state == DB_CACHED || 886 db->db_state == DB_FILL); 887 888 /* 889 * If this buffer is currently part of an "overridden" region, 890 * we now need to remove it from that region. 891 */ 892 if (db->db_level == 0 && db->db_blkid != DB_BONUS_BLKID && 893 db->db_d.db_overridden_by[txgoff] != NULL) { 894 dbuf_unoverride(db, tx->tx_txg); 895 } 896 897 mutex_enter(&dn->dn_mtx); 898 /* 899 * Don't set dirtyctx to SYNC if we're just modifying this as we 900 * initialize the objset. 901 */ 902 if (dn->dn_dirtyctx == DN_UNDIRTIED && 903 !BP_IS_HOLE(&dn->dn_objset->os_rootbp)) { 904 dn->dn_dirtyctx = 905 (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN); 906 ASSERT(dn->dn_dirtyctx_firstset == NULL); 907 dn->dn_dirtyctx_firstset = kmem_alloc(1, KM_SLEEP); 908 } 909 mutex_exit(&dn->dn_mtx); 910 911 /* 912 * If this buffer is already dirty, we're done. 913 */ 914 if (list_link_active(&db->db_dirty_node[txgoff])) { 915 mutex_exit(&db->db_mtx); 916 return; 917 } 918 919 /* 920 * Only valid if not already dirty. 921 */ 922 ASSERT(dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx == 923 (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN)); 924 925 ASSERT3U(dn->dn_nlevels, >, db->db_level); 926 ASSERT((dn->dn_phys->dn_nlevels == 0 && db->db_level == 0) || 927 dn->dn_phys->dn_nlevels > db->db_level || 928 dn->dn_next_nlevels[txgoff] > db->db_level || 929 dn->dn_next_nlevels[(tx->tx_txg-1) & TXG_MASK] > db->db_level || 930 dn->dn_next_nlevels[(tx->tx_txg-2) & TXG_MASK] > db->db_level); 931 932 /* 933 * We should only be dirtying in syncing context if it's the 934 * mos, a spa os, or we're initializing the os. However, we are 935 * allowed to dirty in syncing context provided we already 936 * dirtied it in open context. Hence we must make this 937 * assertion only if we're not already dirty. 938 */ 939 ASSERT(!dmu_tx_is_syncing(tx) || 940 os->os_dsl_dataset == NULL || 941 !dsl_dir_is_private(os->os_dsl_dataset->ds_dir) || 942 !BP_IS_HOLE(&os->os_rootbp)); 943 ASSERT(db->db.db_size != 0); 944 945 dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size); 946 947 if (db->db_level == 0) { 948 /* 949 * Release the data buffer from the cache so that we 950 * can modify it without impacting possible other users 951 * of this cached data block. Note that indirect blocks 952 * and private objects are not released until the syncing 953 * state (since they are only modified then). 954 * 955 * If this buffer is dirty in an old transaction group we need 956 * to make a copy of it so that the changes we make in this 957 * transaction group won't leak out when we sync the older txg. 958 */ 959 ASSERT(db->db_buf != NULL); 960 ASSERT(db->db.db_data != NULL); 961 ASSERT(db->db_d.db_data_old[txgoff] == NULL); 962 if (!(db->db.db_object & DMU_PRIVATE_OBJECT)) { 963 arc_release(db->db_buf, db); 964 dbuf_fix_old_data(db, tx->tx_txg); 965 ASSERT(db->db_buf != NULL); 966 } 967 db->db_d.db_data_old[txgoff] = db->db_buf; 968 } 969 970 mutex_enter(&dn->dn_mtx); 971 /* 972 * We could have been freed_in_flight between the dbuf_noread 973 * and dbuf_dirty. We win, as though the dbuf_noread() had 974 * happened after the free. 
975 */ 976 if (db->db_level == 0 && db->db_blkid != DB_BONUS_BLKID) { 977 dnode_clear_range(dn, db->db_blkid, 1, tx); 978 db->db_d.db_freed_in_flight = FALSE; 979 } 980 981 db->db_dirtied = tx->tx_txg; 982 list_insert_tail(&dn->dn_dirty_dbufs[txgoff], db); 983 mutex_exit(&dn->dn_mtx); 984 985 /* 986 * If writing this buffer will consume a new block on disk, 987 * then update the accounting. 988 */ 989 if (db->db_blkid != DB_BONUS_BLKID) { 990 if (!dbuf_new_block(db, tx) && db->db_blkptr) { 991 /* 992 * This is only a guess -- if the dbuf is dirty 993 * in a previous txg, we don't know how much 994 * space it will use on disk yet. We should 995 * really have the struct_rwlock to access 996 * db_blkptr, but since this is just a guess, 997 * it's OK if we get an odd answer. 998 */ 999 dnode_willuse_space(dn, 1000 -BP_GET_ASIZE(db->db_blkptr), tx); 1001 } 1002 dnode_willuse_space(dn, db->db.db_size, tx); 1003 } 1004 1005 /* 1006 * This buffer is now part of this txg 1007 */ 1008 dbuf_add_ref(db, (void *)(uintptr_t)tx->tx_txg); 1009 db->db_dirtycnt += 1; 1010 ASSERT3U(db->db_dirtycnt, <=, 3); 1011 1012 mutex_exit(&db->db_mtx); 1013 1014 if (db->db_blkid == DB_BONUS_BLKID) { 1015 dnode_setdirty(dn, tx); 1016 return; 1017 } 1018 1019 if (db->db_level == 0) 1020 dnode_new_blkid(dn, db->db_blkid, tx); 1021 1022 if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) { 1023 rw_enter(&dn->dn_struct_rwlock, RW_READER); 1024 drop_struct_lock = TRUE; 1025 } 1026 1027 if (db->db_level < dn->dn_nlevels-1) { 1028 int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; 1029 dmu_buf_impl_t *parent; 1030 parent = dbuf_hold_level(dn, db->db_level+1, 1031 db->db_blkid >> epbs, FTAG); 1032 if (drop_struct_lock) 1033 rw_exit(&dn->dn_struct_rwlock); 1034 dbuf_dirty(parent, tx); 1035 dbuf_remove_ref(parent, FTAG); 1036 } else { 1037 if (drop_struct_lock) 1038 rw_exit(&dn->dn_struct_rwlock); 1039 } 1040 1041 dnode_setdirty(dn, tx); 1042 } 1043 1044 static int 1045 dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) 1046 { 1047 dnode_t *dn = db->db_dnode; 1048 int txgoff = tx->tx_txg & TXG_MASK; 1049 1050 ASSERT(tx->tx_txg != 0); 1051 1052 mutex_enter(&db->db_mtx); 1053 1054 /* 1055 * If this buffer is not dirty, we're done. 1056 */ 1057 if (!list_link_active(&db->db_dirty_node[txgoff])) { 1058 mutex_exit(&db->db_mtx); 1059 return (0); 1060 } 1061 1062 /* 1063 * If this buffer is currently held, we cannot undirty 1064 * it, since one of the current holders may be in the 1065 * middle of an update. Note that users of dbuf_undirty() 1066 * should not place a hold on the dbuf before the call. 1067 * XXX - this check assumes we are being called from 1068 * dbuf_free_range(), perhaps we should move it there? 1069 */ 1070 if (refcount_count(&db->db_holds) > db->db_dirtycnt) { 1071 mutex_exit(&db->db_mtx); 1072 mutex_enter(&dn->dn_mtx); 1073 dnode_clear_range(dn, db->db_blkid, 1, tx); 1074 mutex_exit(&dn->dn_mtx); 1075 return (0); 1076 } 1077 1078 dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size); 1079 1080 dbuf_unoverride(db, tx->tx_txg); 1081 1082 ASSERT(db->db.db_size != 0); 1083 if (db->db_level == 0) { 1084 ASSERT(db->db_buf != NULL); 1085 ASSERT(db->db_d.db_data_old[txgoff] != NULL); 1086 if (db->db_d.db_data_old[txgoff] != db->db_buf) 1087 arc_buf_free(db->db_d.db_data_old[txgoff], db); 1088 db->db_d.db_data_old[txgoff] = NULL; 1089 } 1090 1091 /* XXX would be nice to fix up dn_towrite_space[] */ 1092 /* XXX undo db_dirtied? but how?
*/ 1093 /* db->db_dirtied = tx->tx_txg; */ 1094 1095 mutex_enter(&dn->dn_mtx); 1096 list_remove(&dn->dn_dirty_dbufs[txgoff], db); 1097 mutex_exit(&dn->dn_mtx); 1098 1099 ASSERT(db->db_dirtycnt > 0); 1100 db->db_dirtycnt -= 1; 1101 1102 if (refcount_remove(&db->db_holds, 1103 (void *)(uintptr_t)tx->tx_txg) == 0) { 1104 /* make dbuf_verify() happy */ 1105 if (db->db.db_data) 1106 bzero(db->db.db_data, db->db.db_size); 1107 1108 dbuf_evict(db); 1109 return (1); 1110 } 1111 1112 mutex_exit(&db->db_mtx); 1113 return (0); 1114 } 1115 1116 #pragma weak dmu_buf_will_dirty = dbuf_will_dirty 1117 void 1118 dbuf_will_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) 1119 { 1120 int rf = DB_RF_MUST_SUCCEED; 1121 1122 ASSERT(tx->tx_txg != 0); 1123 ASSERT(!refcount_is_zero(&db->db_holds)); 1124 1125 if (RW_WRITE_HELD(&db->db_dnode->dn_struct_rwlock)) 1126 rf |= DB_RF_HAVESTRUCT; 1127 (void) dbuf_read_generic(db, rf); 1128 dbuf_dirty(db, tx); 1129 } 1130 1131 #pragma weak dmu_buf_will_fill = dbuf_will_fill 1132 void 1133 dbuf_will_fill(dmu_buf_impl_t *db, dmu_tx_t *tx) 1134 { 1135 ASSERT(tx->tx_txg != 0); 1136 ASSERT(db->db_level == 0); 1137 ASSERT(!refcount_is_zero(&db->db_holds)); 1138 1139 ASSERT(!(db->db.db_object & DMU_PRIVATE_OBJECT) || 1140 dmu_tx_private_ok(tx)); 1141 1142 dbuf_noread(db); 1143 dbuf_dirty(db, tx); 1144 } 1145 1146 #pragma weak dmu_buf_fill_done = dbuf_fill_done 1147 /* ARGSUSED */ 1148 void 1149 dbuf_fill_done(dmu_buf_impl_t *db, dmu_tx_t *tx) 1150 { 1151 mutex_enter(&db->db_mtx); 1152 dbuf_verify(db); 1153 1154 if (db->db_state == DB_FILL) { 1155 if (db->db_level == 0 && db->db_d.db_freed_in_flight) { 1156 /* we were freed while filling */ 1157 /* XXX dbuf_undirty? */ 1158 bzero(db->db.db_data, db->db.db_size); 1159 db->db_d.db_freed_in_flight = FALSE; 1160 } 1161 db->db_state = DB_CACHED; 1162 cv_broadcast(&db->db_changed); 1163 } 1164 mutex_exit(&db->db_mtx); 1165 } 1166 1167 1168 static void 1169 dbuf_clear(dmu_buf_impl_t *db) 1170 { 1171 dnode_t *dn = db->db_dnode; 1172 1173 ASSERT(MUTEX_HELD(&dn->dn_dbufs_mtx)); 1174 ASSERT(MUTEX_HELD(&db->db_mtx)); 1175 ASSERT(refcount_is_zero(&db->db_holds)); 1176 1177 if (db->db_state == DB_CACHED) { 1178 ASSERT(db->db_buf != NULL); 1179 arc_buf_free(db->db_buf, db); 1180 db->db.db_data = NULL; 1181 db->db_buf = NULL; 1182 db->db_state = DB_UNCACHED; 1183 } 1184 1185 ASSERT3U(db->db_state, ==, DB_UNCACHED); 1186 ASSERT(db->db_buf == NULL); 1187 ASSERT(db->db_data_pending == NULL); 1188 1189 mutex_exit(&db->db_mtx); 1190 1191 /* 1192 * If this dbuf is referenced from an indirect dbuf, 1193 * decrement the ref count on the indirect dbuf.
1194 */ 1195 if (db->db_parent && db->db_parent != dn->dn_dbuf) 1196 dbuf_remove_ref(db->db_parent, db); 1197 1198 /* remove from dn_dbufs */ 1199 list_remove(&dn->dn_dbufs, db); 1200 1201 dnode_rele(dn, db); 1202 1203 dbuf_hash_remove(db); 1204 1205 db->db_dnode = NULL; 1206 db->db_parent = NULL; 1207 db->db_blkptr = NULL; 1208 } 1209 1210 static int 1211 dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse, 1212 dmu_buf_impl_t **parentp, blkptr_t **bpp) 1213 { 1214 int nlevels, epbs; 1215 1216 if (dn->dn_phys->dn_nlevels == 0) 1217 nlevels = 1; 1218 else 1219 nlevels = dn->dn_phys->dn_nlevels; 1220 1221 epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; 1222 1223 ASSERT3U(level * epbs, <, 64); 1224 ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); 1225 if (blkid == DB_BONUS_BLKID) { 1226 /* this is the bonus buffer */ 1227 *parentp = NULL; 1228 *bpp = NULL; 1229 return (0); 1230 } else if (level >= nlevels || 1231 (blkid > (dn->dn_phys->dn_maxblkid >> (level * epbs)))) { 1232 /* the buffer has no parent yet */ 1233 *parentp = NULL; 1234 *bpp = NULL; 1235 return (ENOENT); 1236 } else if (level < nlevels-1) { 1237 /* this block is referenced from an indirect block */ 1238 int err = dbuf_hold_impl(dn, level+1, 1239 blkid >> epbs, fail_sparse, NULL, parentp); 1240 if (err) 1241 return (err); 1242 dbuf_read_havestruct(*parentp); 1243 *bpp = ((blkptr_t *)(*parentp)->db.db_data) + 1244 (blkid & ((1ULL << epbs) - 1)); 1245 return (0); 1246 } else { 1247 /* the block is referenced from the dnode */ 1248 ASSERT3U(level, ==, nlevels-1); 1249 ASSERT(dn->dn_phys->dn_nblkptr == 0 || 1250 blkid < dn->dn_phys->dn_nblkptr); 1251 *parentp = dn->dn_dbuf; 1252 *bpp = &dn->dn_phys->dn_blkptr[blkid]; 1253 return (0); 1254 } 1255 } 1256 1257 static dmu_buf_impl_t * 1258 dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid, 1259 dmu_buf_impl_t *parent, blkptr_t *blkptr) 1260 { 1261 objset_impl_t *os = dn->dn_objset; 1262 dmu_buf_impl_t *db, *odb; 1263 1264 ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); 1265 ASSERT(dn->dn_type != DMU_OT_NONE); 1266 1267 db = kmem_cache_alloc(dbuf_cache, KM_SLEEP); 1268 1269 db->db_objset = os; 1270 db->db.db_object = dn->dn_object; 1271 db->db_level = level; 1272 db->db_blkid = blkid; 1273 db->db_state = DB_UNCACHED; 1274 1275 if (db->db_blkid == DB_BONUS_BLKID) { 1276 db->db.db_size = dn->dn_bonuslen; 1277 db->db.db_offset = DB_BONUS_BLKID; 1278 } else { 1279 int blocksize = 1280 db->db_level ? 1<<dn->dn_indblkshift : dn->dn_datablksz; 1281 db->db.db_size = blocksize; 1282 db->db.db_offset = db->db_blkid * blocksize; 1283 } 1284 1285 db->db_dirtied = 0; 1286 db->db_dirtycnt = 0; 1287 1288 bzero(&db->db_d, sizeof (db->db_d)); 1289 1290 /* 1291 * Hold the dn_dbufs_mtx while we get the new dbuf 1292 * in the hash table *and* added to the dbufs list. 1293 * This prevents a possible deadlock with someone 1294 * trying to look up this dbuf before its added to the 1295 * dn_dbufs list. 
1296 */ 1297 mutex_enter(&dn->dn_dbufs_mtx); 1298 if ((odb = dbuf_hash_insert(db)) != NULL) { 1299 /* someone else inserted it first */ 1300 kmem_cache_free(dbuf_cache, db); 1301 mutex_exit(&dn->dn_dbufs_mtx); 1302 return (odb); 1303 } 1304 list_insert_head(&dn->dn_dbufs, db); 1305 mutex_exit(&dn->dn_dbufs_mtx); 1306 1307 if (parent && parent != dn->dn_dbuf) 1308 dbuf_add_ref(parent, db); 1309 1310 (void) refcount_add(&dn->dn_holds, db); 1311 1312 db->db_dnode = dn; 1313 db->db_parent = parent; 1314 db->db_blkptr = blkptr; 1315 1316 dprintf_dbuf(db, "db=%p\n", db); 1317 1318 return (db); 1319 } 1320 1321 static int 1322 dbuf_evictable(dmu_buf_impl_t *db) 1323 { 1324 int i; 1325 1326 ASSERT(MUTEX_HELD(&db->db_mtx)); 1327 dbuf_verify(db); 1328 1329 if (db->db_state != DB_UNCACHED && db->db_state != DB_CACHED) 1330 return (FALSE); 1331 1332 if (!refcount_is_zero(&db->db_holds)) 1333 return (FALSE); 1334 1335 #ifdef ZFS_DEBUG 1336 for (i = 0; i < TXG_SIZE; i++) { 1337 ASSERT(!list_link_active(&db->db_dirty_node[i])); 1338 ASSERT(db->db_level != 0 || db->db_d.db_data_old[i] == NULL); 1339 } 1340 #endif 1341 1342 /* 1343 * Now we know we want to free it. 1344 * This call must be done last, since it has side effects - 1345 * calling the db_evict_func(). 1346 */ 1347 dbuf_evict_user(db); 1348 return (TRUE); 1349 } 1350 1351 static void 1352 dbuf_destroy(dmu_buf_impl_t *db) 1353 { 1354 ASSERT(refcount_is_zero(&db->db_holds)); 1355 1356 ASSERT(db->db.db_data == NULL); 1357 ASSERT(db->db_dnode == NULL); 1358 ASSERT(db->db_parent == NULL); 1359 ASSERT(db->db_hash_next == NULL); 1360 ASSERT(db->db_blkptr == NULL); 1361 ASSERT(db->db_data_pending == NULL); 1362 1363 kmem_cache_free(dbuf_cache, db); 1364 } 1365 1366 void 1367 dbuf_prefetch(dnode_t *dn, uint64_t blkid) 1368 { 1369 dmu_buf_impl_t *db, *parent = NULL; 1370 blkptr_t *bp = NULL; 1371 1372 ASSERT(blkid != DB_BONUS_BLKID); 1373 ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); 1374 1375 if (dnode_block_freed(dn, blkid)) 1376 return; 1377 1378 /* dbuf_find() returns with db_mtx held */ 1379 if (db = dbuf_find(dn, 0, blkid)) { 1380 /* 1381 * This dbuf is already in the cache. We assume that 1382 * it is already CACHED, or else about to be either 1383 * read or filled. 1384 */ 1385 mutex_exit(&db->db_mtx); 1386 return; 1387 } 1388 1389 if (dbuf_findbp(dn, 0, blkid, TRUE, &parent, &bp) == 0) { 1390 if (bp && !BP_IS_HOLE(bp)) { 1391 (void) arc_read(NULL, dn->dn_objset->os_spa, bp, 1392 dmu_ot[dn->dn_type].ot_byteswap, 1393 NULL, NULL, ZIO_PRIORITY_ASYNC_READ, 1394 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, 1395 (ARC_NOWAIT | ARC_PREFETCH)); 1396 } 1397 if (parent && parent != dn->dn_dbuf) 1398 dbuf_rele(parent); 1399 } 1400 } 1401 1402 /* 1403 * Returns with db_holds incremented, and db_mtx not held. 1404 * Note: dn_struct_rwlock must be held. 
1405 */ 1406 int 1407 dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, int fail_sparse, 1408 void *tag, dmu_buf_impl_t **dbp) 1409 { 1410 dmu_buf_impl_t *db, *parent = NULL; 1411 1412 ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); 1413 ASSERT3U(dn->dn_nlevels, >, level); 1414 1415 *dbp = NULL; 1416 1417 /* dbuf_find() returns with db_mtx held */ 1418 db = dbuf_find(dn, level, blkid); 1419 1420 if (db == NULL) { 1421 blkptr_t *bp = NULL; 1422 int err; 1423 1424 err = dbuf_findbp(dn, level, blkid, fail_sparse, &parent, &bp); 1425 if (fail_sparse) { 1426 if (err == 0 && bp && BP_IS_HOLE(bp)) 1427 err = ENOENT; 1428 if (err) { 1429 if (parent && parent != dn->dn_dbuf) 1430 dbuf_rele(parent); 1431 return (err); 1432 } 1433 } 1434 db = dbuf_create(dn, level, blkid, parent, bp); 1435 } 1436 1437 /* 1438 * If this buffer is currently syncing out, and we are 1439 * are still referencing it from db_data, we need to make 1440 * a copy of it in case we decide we want to dirty it 1441 * again in this txg. 1442 */ 1443 if (db->db_level == 0 && db->db_state == DB_CACHED && 1444 !(dn->dn_object & DMU_PRIVATE_OBJECT) && 1445 db->db_data_pending == db->db_buf) { 1446 int size = (db->db_blkid == DB_BONUS_BLKID) ? 1447 DN_MAX_BONUSLEN : db->db.db_size; 1448 1449 dbuf_set_data(db, arc_buf_alloc(db->db_dnode->dn_objset->os_spa, 1450 size, db)); 1451 bcopy(db->db_data_pending->b_data, db->db.db_data, 1452 db->db.db_size); 1453 } 1454 1455 dbuf_add_ref(db, tag); 1456 dbuf_update_data(db); 1457 dbuf_verify(db); 1458 mutex_exit(&db->db_mtx); 1459 1460 /* NOTE: we can't rele the parent until after we drop the db_mtx */ 1461 if (parent && parent != dn->dn_dbuf) 1462 dbuf_rele(parent); 1463 1464 ASSERT3P(db->db_dnode, ==, dn); 1465 ASSERT3U(db->db_blkid, ==, blkid); 1466 ASSERT3U(db->db_level, ==, level); 1467 *dbp = db; 1468 1469 return (0); 1470 } 1471 1472 dmu_buf_impl_t * 1473 dbuf_hold(dnode_t *dn, uint64_t blkid) 1474 { 1475 dmu_buf_impl_t *db; 1476 (void) dbuf_hold_impl(dn, 0, blkid, FALSE, NULL, &db); 1477 return (db); 1478 } 1479 1480 dmu_buf_impl_t * 1481 dbuf_hold_level(dnode_t *dn, int level, uint64_t blkid, void *tag) 1482 { 1483 dmu_buf_impl_t *db; 1484 (void) dbuf_hold_impl(dn, level, blkid, FALSE, tag, &db); 1485 return (db); 1486 } 1487 1488 dmu_buf_impl_t * 1489 dbuf_hold_bonus(dnode_t *dn, void *tag) 1490 { 1491 dmu_buf_impl_t *db; 1492 rw_enter(&dn->dn_struct_rwlock, RW_READER); 1493 (void) dbuf_hold_impl(dn, 0, DB_BONUS_BLKID, FALSE, tag, &db); 1494 rw_exit(&dn->dn_struct_rwlock); 1495 return (db); 1496 } 1497 1498 void 1499 dbuf_add_ref(dmu_buf_impl_t *db, void *tag) 1500 { 1501 (void) refcount_add(&db->db_holds, tag); 1502 /* dprintf_dbuf(db, "adding ref %p; holds up to %lld\n", tag, holds); */ 1503 } 1504 1505 void 1506 dbuf_remove_ref(dmu_buf_impl_t *db, void *tag) 1507 { 1508 int64_t holds; 1509 dnode_t *dn = db->db_dnode; 1510 int need_mutex; 1511 1512 ASSERT(dn != NULL); 1513 need_mutex = !MUTEX_HELD(&dn->dn_dbufs_mtx); 1514 1515 if (need_mutex) { 1516 dnode_add_ref(dn, FTAG); 1517 mutex_enter(&dn->dn_dbufs_mtx); 1518 } 1519 1520 mutex_enter(&db->db_mtx); 1521 dbuf_verify(db); 1522 1523 holds = refcount_remove(&db->db_holds, tag); 1524 1525 if (holds == 0) { 1526 ASSERT3U(db->db_state, !=, DB_FILL); 1527 if (db->db_level == 0 && 1528 db->db_d.db_user_data_ptr_ptr != NULL) 1529 *db->db_d.db_user_data_ptr_ptr = NULL; 1530 dbuf_evict(db); 1531 } else { 1532 if (holds == db->db_dirtycnt && 1533 db->db_level == 0 && db->db_d.db_immediate_evict) 1534 dbuf_evict_user(db); 1535 
mutex_exit(&db->db_mtx); 1536 } 1537 1538 if (need_mutex) { 1539 mutex_exit(&dn->dn_dbufs_mtx); 1540 dnode_rele(dn, FTAG); 1541 } 1542 } 1543 1544 void 1545 dbuf_rele(dmu_buf_impl_t *db) 1546 { 1547 dbuf_remove_ref(db, NULL); 1548 } 1549 1550 #pragma weak dmu_buf_refcount = dbuf_refcount 1551 uint64_t 1552 dbuf_refcount(dmu_buf_impl_t *db) 1553 { 1554 return (refcount_count(&db->db_holds)); 1555 } 1556 1557 void * 1558 dmu_buf_set_user(dmu_buf_t *db_fake, void *user_ptr, void *user_data_ptr_ptr, 1559 dmu_buf_evict_func_t *evict_func) 1560 { 1561 return (dmu_buf_update_user(db_fake, NULL, user_ptr, 1562 user_data_ptr_ptr, evict_func)); 1563 } 1564 1565 void * 1566 dmu_buf_set_user_ie(dmu_buf_t *db_fake, void *user_ptr, void *user_data_ptr_ptr, 1567 dmu_buf_evict_func_t *evict_func) 1568 { 1569 dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; 1570 1571 db->db_d.db_immediate_evict = TRUE; 1572 return (dmu_buf_update_user(db_fake, NULL, user_ptr, 1573 user_data_ptr_ptr, evict_func)); 1574 } 1575 1576 void * 1577 dmu_buf_update_user(dmu_buf_t *db_fake, void *old_user_ptr, void *user_ptr, 1578 void *user_data_ptr_ptr, dmu_buf_evict_func_t *evict_func) 1579 { 1580 dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; 1581 ASSERT(db->db_level == 0); 1582 1583 ASSERT((user_ptr == NULL) == (evict_func == NULL)); 1584 1585 mutex_enter(&db->db_mtx); 1586 1587 if (db->db_d.db_user_ptr == old_user_ptr) { 1588 db->db_d.db_user_ptr = user_ptr; 1589 db->db_d.db_user_data_ptr_ptr = user_data_ptr_ptr; 1590 db->db_d.db_evict_func = evict_func; 1591 1592 dbuf_update_data(db); 1593 } else { 1594 old_user_ptr = db->db_d.db_user_ptr; 1595 } 1596 1597 mutex_exit(&db->db_mtx); 1598 return (old_user_ptr); 1599 } 1600 1601 void * 1602 dmu_buf_get_user(dmu_buf_t *db_fake) 1603 { 1604 dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; 1605 ASSERT(!refcount_is_zero(&db->db_holds)); 1606 1607 return (db->db_d.db_user_ptr); 1608 } 1609 1610 void 1611 dbuf_sync(dmu_buf_impl_t *db, zio_t *zio, dmu_tx_t *tx) 1612 { 1613 arc_buf_t **data; 1614 uint64_t txg = tx->tx_txg; 1615 dnode_t *dn = db->db_dnode; 1616 objset_impl_t *os = dn->dn_objset; 1617 int blksz; 1618 1619 ASSERT(dmu_tx_is_syncing(tx)); 1620 1621 dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr); 1622 1623 mutex_enter(&db->db_mtx); 1624 /* 1625 * To be synced, we must be dirtied. But we 1626 * might have been freed after the dirty. 1627 */ 1628 if (db->db_state == DB_UNCACHED) { 1629 /* This buffer has been freed since it was dirtied */ 1630 ASSERT(db->db.db_data == NULL); 1631 } else if (db->db_state == DB_FILL) { 1632 /* This buffer was freed and is now being re-filled */ 1633 ASSERT(db->db.db_data != db->db_d.db_data_old[txg&TXG_MASK]); 1634 } else { 1635 ASSERT3U(db->db_state, ==, DB_CACHED); 1636 } 1637 dbuf_verify(db); 1638 1639 /* 1640 * Don't need a lock on db_dirty (dn_mtx), because it can't 1641 * be modified yet. 1642 */ 1643 1644 if (db->db_level == 0) { 1645 data = &db->db_d.db_data_old[txg&TXG_MASK]; 1646 blksz = arc_buf_size(*data); 1647 /* 1648 * If this buffer is currently "in use" (i.e., there are 1649 * active holds and db_data still references it), then make 1650 * a copy before we start the write so that any modifications 1651 * from the open txg will not leak into this write. 1652 * 1653 * NOTE: this copy does not need to be made for objects only 1654 * modified in the syncing context (e.g. DNONE_DNODE blocks) 1655 * or if there is no actual write involved (bonus blocks). 
1656 */ 1657 if (!(dn->dn_object & DMU_PRIVATE_OBJECT) && 1658 db->db_d.db_overridden_by[txg&TXG_MASK] == NULL && 1659 db->db_blkid != DB_BONUS_BLKID) { 1660 if (refcount_count(&db->db_holds) > 1 && 1661 *data == db->db_buf) { 1662 *data = arc_buf_alloc( 1663 db->db_dnode->dn_objset->os_spa, blksz, db); 1664 bcopy(db->db.db_data, (*data)->b_data, blksz); 1665 } 1666 db->db_data_pending = *data; 1667 } else if (dn->dn_object & DMU_PRIVATE_OBJECT) { 1668 /* 1669 * Private object buffers are released here rather 1670 * than in dbuf_dirty() since they are only modified 1671 * in the syncing context and we don't want the 1672 * overhead of making multiple copies of the data. 1673 */ 1674 arc_release(db->db_buf, db); 1675 } 1676 } else { 1677 data = &db->db_buf; 1678 if (*data == NULL) { 1679 /* 1680 * This can happen if we dirty and then free 1681 * the level-0 data blocks in the same txg. So 1682 * this indirect remains unchanged. 1683 */ 1684 if (db->db_dirtied == txg) 1685 db->db_dirtied = 0; 1686 ASSERT(db->db_dirtycnt > 0); 1687 db->db_dirtycnt -= 1; 1688 mutex_exit(&db->db_mtx); 1689 dbuf_remove_ref(db, (void *)(uintptr_t)txg); 1690 return; 1691 } 1692 blksz = db->db.db_size; 1693 ASSERT3U(blksz, ==, 1<<dn->dn_phys->dn_indblkshift); 1694 } 1695 1696 ASSERT(*data != NULL); 1697 1698 if (db->db_blkid == DB_BONUS_BLKID) { 1699 /* 1700 * Simply copy the bonus data into the dnode. It will 1701 * be written out when the dnode is synced (and it will 1702 * be synced, since it must have been dirty for dbuf_sync 1703 * to be called). The bonus data will be byte swapped 1704 * in dnode_byteswap. 1705 */ 1706 /* 1707 * Use dn_phys->dn_bonuslen since db.db_size is the length 1708 * of the bonus buffer in the open transaction rather than 1709 * the syncing transaction. 1710 */ 1711 ASSERT3U(db->db_level, ==, 0); 1712 ASSERT3U(dn->dn_phys->dn_bonuslen, <=, blksz); 1713 bcopy((*data)->b_data, DN_BONUS(dn->dn_phys), 1714 dn->dn_phys->dn_bonuslen); 1715 if (*data != db->db_buf) 1716 arc_buf_free(*data, db); 1717 db->db_d.db_data_old[txg&TXG_MASK] = NULL; 1718 db->db_data_pending = NULL; 1719 if (db->db_dirtied == txg) 1720 db->db_dirtied = 0; 1721 ASSERT(db->db_dirtycnt > 0); 1722 db->db_dirtycnt -= 1; 1723 mutex_exit(&db->db_mtx); 1724 dbuf_remove_ref(db, (void *)(uintptr_t)txg); 1725 return; 1726 } else if (db->db_level > 0 && !arc_released(db->db_buf)) { 1727 /* 1728 * This indirect buffer was marked dirty, but 1729 * never modified (if it had been modified, then 1730 * we would have released the buffer). There is 1731 * no reason to write anything. 1732 */ 1733 db->db_data_pending = NULL; 1734 if (db->db_dirtied == txg) 1735 db->db_dirtied = 0; 1736 ASSERT(db->db_dirtycnt > 0); 1737 db->db_dirtycnt -= 1; 1738 mutex_exit(&db->db_mtx); 1739 dbuf_remove_ref(db, (void *)(uintptr_t)txg); 1740 return; 1741 } else if (db->db_blkptr == NULL && 1742 db->db_level == dn->dn_phys->dn_nlevels-1 && 1743 db->db_blkid < dn->dn_phys->dn_nblkptr) { 1744 /* 1745 * This buffer was allocated at a time when there was 1746 * no available blkptrs from the dnode, or it was 1747 * inappropriate to hook it in (i.e., nlevels mis-match). 
1748 */ 1749 ASSERT(db->db_blkptr == NULL); 1750 ASSERT(db->db_parent == NULL); 1751 db->db_parent = dn->dn_dbuf; 1752 db->db_blkptr = &dn->dn_phys->dn_blkptr[db->db_blkid]; 1753 dbuf_verify(db); 1754 mutex_exit(&db->db_mtx); 1755 } else if (db->db_blkptr == NULL) { 1756 dmu_buf_impl_t *parent = db->db_parent; 1757 int epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; 1758 1759 mutex_exit(&db->db_mtx); 1760 ASSERT(dn->dn_phys->dn_nlevels > 1); 1761 if (parent == NULL) { 1762 rw_enter(&dn->dn_struct_rwlock, RW_READER); 1763 (void) dbuf_hold_impl(dn, db->db_level+1, 1764 db->db_blkid >> epbs, FALSE, NULL, &parent); 1765 rw_exit(&dn->dn_struct_rwlock); 1766 dbuf_add_ref(parent, db); 1767 db->db_parent = parent; 1768 dbuf_rele(parent); 1769 } 1770 dbuf_read(parent); 1771 } else { 1772 mutex_exit(&db->db_mtx); 1773 } 1774 1775 ASSERT(IS_DNODE_DNODE(dn->dn_object) || db->db_parent != NULL); 1776 1777 if (db->db_parent != dn->dn_dbuf) { 1778 dmu_buf_impl_t *parent = db->db_parent; 1779 int epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; 1780 1781 mutex_enter(&db->db_mtx); 1782 ASSERT(db->db_level == parent->db_level-1); 1783 ASSERT(list_link_active(&parent->db_dirty_node[txg&TXG_MASK])); 1784 /* 1785 * We may have read this block after we dirtied it, 1786 * so never released it from the cache. 1787 */ 1788 arc_release(parent->db_buf, parent); 1789 1790 db->db_blkptr = (blkptr_t *)parent->db.db_data + 1791 (db->db_blkid & ((1ULL << epbs) - 1)); 1792 dbuf_verify(db); 1793 mutex_exit(&db->db_mtx); 1794 } 1795 ASSERT(db->db_parent == NULL || arc_released(db->db_parent->db_buf)); 1796 1797 #ifdef ZFS_DEBUG 1798 if (db->db_parent == dn->dn_dbuf) { 1799 /* 1800 * We don't need to dnode_setdirty(dn) because if we got 1801 * here then the parent is already dirty. 1802 */ 1803 ASSERT(db->db_level == dn->dn_phys->dn_nlevels-1); 1804 ASSERT3P(db->db_blkptr, ==, 1805 &dn->dn_phys->dn_blkptr[db->db_blkid]); 1806 } 1807 #endif 1808 if (db->db_level == 0 && 1809 db->db_d.db_overridden_by[txg&TXG_MASK] != NULL) { 1810 arc_buf_t **old = &db->db_d.db_data_old[txg&TXG_MASK]; 1811 blkptr_t **bpp = &db->db_d.db_overridden_by[txg&TXG_MASK]; 1812 int old_size = BP_GET_ASIZE(db->db_blkptr); 1813 int new_size = BP_GET_ASIZE(*bpp); 1814 1815 ASSERT(db->db_blkid != DB_BONUS_BLKID); 1816 1817 dnode_diduse_space(dn, new_size-old_size); 1818 mutex_enter(&dn->dn_mtx); 1819 if (db->db_blkid > dn->dn_phys->dn_maxblkid) 1820 dn->dn_phys->dn_maxblkid = db->db_blkid; 1821 mutex_exit(&dn->dn_mtx); 1822 1823 dsl_dataset_block_born(os->os_dsl_dataset, *bpp, tx); 1824 if (!BP_IS_HOLE(db->db_blkptr)) 1825 dsl_dataset_block_kill(os->os_dsl_dataset, 1826 db->db_blkptr, os->os_synctx); 1827 1828 mutex_enter(&db->db_mtx); 1829 *db->db_blkptr = **bpp; 1830 kmem_free(*bpp, sizeof (blkptr_t)); 1831 *bpp = NULL; 1832 1833 if (*old != db->db_buf) 1834 arc_buf_free(*old, db); 1835 *old = NULL; 1836 db->db_data_pending = NULL; 1837 1838 cv_broadcast(&db->db_changed); 1839 1840 ASSERT(db->db_dirtycnt > 0); 1841 db->db_dirtycnt -= 1; 1842 mutex_exit(&db->db_mtx); 1843 dbuf_remove_ref(db, (void *)(uintptr_t)txg); 1844 } else { 1845 int checksum, compress; 1846 1847 if (db->db_level > 0) { 1848 /* 1849 * XXX -- we should design a compression algorithm 1850 * that specializes in arrays of bps. 1851 */ 1852 checksum = ZIO_CHECKSUM_FLETCHER_4; 1853 compress = ZIO_COMPRESS_LZJB; 1854 } else { 1855 /* 1856 * Allow dnode settings to override objset settings, 1857 * except for metadata checksums. 
1858 */ 1859 if (dmu_ot[dn->dn_type].ot_metadata) { 1860 checksum = os->os_md_checksum; 1861 compress = zio_compress_select(dn->dn_compress, 1862 os->os_md_compress); 1863 } else { 1864 checksum = zio_checksum_select(dn->dn_checksum, 1865 os->os_checksum); 1866 compress = zio_compress_select(dn->dn_compress, 1867 os->os_compress); 1868 } 1869 } 1870 #ifdef ZFS_DEBUG 1871 if (db->db_parent) { 1872 ASSERT(list_link_active( 1873 &db->db_parent->db_dirty_node[txg&TXG_MASK])); 1874 ASSERT(db->db_parent == dn->dn_dbuf || 1875 db->db_parent->db_level > 0); 1876 if (dn->dn_object & DMU_PRIVATE_OBJECT || 1877 db->db_level > 0) 1878 ASSERT(*data == db->db_buf); 1879 } 1880 #endif 1881 ASSERT3U(db->db_blkptr->blk_birth, <=, tx->tx_txg); 1882 (void) arc_write(zio, os->os_spa, checksum, compress, txg, 1883 db->db_blkptr, *data, dbuf_write_done, db, 1884 ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, ARC_NOWAIT); 1885 /* 1886 * We can't access db after arc_write, since it could finish 1887 * and be freed, and we have no locks on it. 1888 */ 1889 } 1890 } 1891 1892 struct dbuf_arg { 1893 objset_impl_t *os; 1894 blkptr_t bp; 1895 }; 1896 1897 static void 1898 dbuf_do_born(void *arg) 1899 { 1900 struct dbuf_arg *da = arg; 1901 dsl_dataset_block_born(da->os->os_dsl_dataset, 1902 &da->bp, da->os->os_synctx); 1903 kmem_free(da, sizeof (struct dbuf_arg)); 1904 } 1905 1906 static void 1907 dbuf_do_kill(void *arg) 1908 { 1909 struct dbuf_arg *da = arg; 1910 dsl_dataset_block_kill(da->os->os_dsl_dataset, 1911 &da->bp, da->os->os_synctx); 1912 kmem_free(da, sizeof (struct dbuf_arg)); 1913 } 1914 1915 /* ARGSUSED */ 1916 static void 1917 dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb) 1918 { 1919 dmu_buf_impl_t *db = vdb; 1920 dnode_t *dn = db->db_dnode; 1921 objset_impl_t *os = dn->dn_objset; 1922 uint64_t txg = zio->io_txg; 1923 uint64_t fill = 0; 1924 int i; 1925 int old_size, new_size; 1926 1927 ASSERT3U(zio->io_error, ==, 0); 1928 1929 dprintf_dbuf_bp(db, &zio->io_bp_orig, "bp_orig: %s", ""); 1930 1931 old_size = BP_GET_ASIZE(&zio->io_bp_orig); 1932 new_size = BP_GET_ASIZE(zio->io_bp); 1933 1934 dnode_diduse_space(dn, new_size-old_size); 1935 1936 mutex_enter(&db->db_mtx); 1937 1938 if (db->db_dirtied == txg) 1939 db->db_dirtied = 0; 1940 1941 if (db->db_level == 0) { 1942 arc_buf_t **old = &db->db_d.db_data_old[txg&TXG_MASK]; 1943 1944 ASSERT(db->db_blkid != DB_BONUS_BLKID); 1945 1946 if (*old != db->db_buf) 1947 arc_buf_free(*old, db); 1948 *old = NULL; 1949 db->db_data_pending = NULL; 1950 1951 mutex_enter(&dn->dn_mtx); 1952 if (db->db_blkid > dn->dn_phys->dn_maxblkid && 1953 !BP_IS_HOLE(db->db_blkptr)) 1954 dn->dn_phys->dn_maxblkid = db->db_blkid; 1955 mutex_exit(&dn->dn_mtx); 1956 1957 if (dn->dn_type == DMU_OT_DNODE) { 1958 dnode_phys_t *dnp = db->db.db_data; 1959 for (i = db->db.db_size >> DNODE_SHIFT; i > 0; 1960 i--, dnp++) { 1961 if (dnp->dn_type != DMU_OT_NONE) 1962 fill++; 1963 } 1964 } else { 1965 if (!BP_IS_HOLE(db->db_blkptr)) 1966 fill = 1; 1967 } 1968 } else { 1969 blkptr_t *bp = db->db.db_data; 1970 ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift); 1971 if (!BP_IS_HOLE(db->db_blkptr)) { 1972 ASSERT3U(BP_GET_LSIZE(zio->io_bp), ==, db->db.db_size); 1973 ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==, 1974 db->db.db_size); 1975 } 1976 for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, bp++) { 1977 if (BP_IS_HOLE(bp)) 1978 continue; 1979 ASSERT3U(BP_GET_LSIZE(bp), ==, 1980 db->db_level == 1 ? 
dn->dn_datablksz : 1981 (1<<dn->dn_phys->dn_indblkshift)); 1982 fill += bp->blk_fill; 1983 } 1984 } 1985 1986 if (!BP_IS_HOLE(db->db_blkptr)) { 1987 db->db_blkptr->blk_fill = fill; 1988 BP_SET_TYPE(db->db_blkptr, dn->dn_type); 1989 BP_SET_LEVEL(db->db_blkptr, db->db_level); 1990 } else { 1991 ASSERT3U(fill, ==, 0); 1992 ASSERT3U(db->db_blkptr->blk_fill, ==, 0); 1993 } 1994 1995 dprintf_dbuf_bp(db, db->db_blkptr, 1996 "wrote %llu bytes to blkptr:", zio->io_size); 1997 1998 ASSERT(db->db_parent == NULL || 1999 list_link_active(&db->db_parent->db_dirty_node[txg&TXG_MASK])); 2000 cv_broadcast(&db->db_changed); 2001 ASSERT(db->db_dirtycnt > 0); 2002 db->db_dirtycnt -= 1; 2003 mutex_exit(&db->db_mtx); 2004 2005 /* We must do this after we've set the bp's type and level */ 2006 if (!DVA_EQUAL(BP_IDENTITY(zio->io_bp), 2007 BP_IDENTITY(&zio->io_bp_orig))) { 2008 struct dbuf_arg *da; 2009 da = kmem_alloc(sizeof (struct dbuf_arg), KM_SLEEP); 2010 da->os = os; 2011 da->bp = *zio->io_bp; 2012 (void) taskq_dispatch(dbuf_tq, dbuf_do_born, da, 0); 2013 if (!BP_IS_HOLE(&zio->io_bp_orig)) { 2014 da = kmem_alloc(sizeof (struct dbuf_arg), KM_SLEEP); 2015 da->os = os; 2016 da->bp = zio->io_bp_orig; 2017 (void) taskq_dispatch(dbuf_tq, dbuf_do_kill, da, 0); 2018 } 2019 } 2020 2021 dbuf_remove_ref(db, (void *)(uintptr_t)txg); 2022 } 2023
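
The sizing comment in dbuf_init() ("the table will take up totalmem*sizeof(void*)/64k bytes, i.e. 128KB/GB with 8-byte pointers") can be checked with a small user-space sketch of the same power-of-two growth loop. The physmem_bytes value below is a made-up stand-in for physmem * PAGESIZE, so the printed numbers are purely illustrative.

/*
 * Standalone sketch of the dbuf_init() hash-table sizing rule:
 * grow hsize in powers of two until hsize * 64K covers physical memory.
 */
#include <stdio.h>
#include <stdint.h>

int
main(void)
{
	uint64_t physmem_bytes = 8ULL << 30;	/* assume an 8 GB machine */
	uint64_t hsize = 1;

	while (hsize * 65536 < physmem_bytes)
		hsize <<= 1;

	/* With 8-byte pointers this works out to ~128 KB of table per GB. */
	printf("buckets: %llu, table size: %llu KB\n",
	    (unsigned long long)hsize,
	    (unsigned long long)(hsize * sizeof (void *) >> 10));
	return (0);
}

For the assumed 8 GB machine this prints 131072 buckets and a 1024 KB table, matching the 128KB/GB figure in the comment.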
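The bucket-selection scheme used by dbuf_find() and dbuf_hash_insert() (dbuf_hash() folded through hash_table_mask) can also be exercised outside the kernel. The sketch below rebuilds a reflected CRC-64 table locally; the polynomial constant is the ECMA-182 value that ZFS_CRC64_POLY is commonly defined as, and the objset pointer, object number, and mask are hypothetical inputs, not values from dbuf.c.

/*
 * User-space sketch of the dbuf_hash() folding order:
 * level, objset pointer, object (2 bytes), blkid (2 bytes), then an
 * XOR of the higher-order bits, masked down to a bucket index.
 */
#include <stdio.h>
#include <stdint.h>

#define	CRC64_POLY	0xC96C5795D7870F42ULL	/* assumed ZFS_CRC64_POLY */

static uint64_t crc64_table[256];

static void
crc64_init(void)
{
	int i, j;

	for (i = 0; i < 256; i++) {
		uint64_t c = (uint64_t)i;
		for (j = 0; j < 8; j++)
			c = (c & 1) ? (c >> 1) ^ CRC64_POLY : c >> 1;
		crc64_table[i] = c;
	}
}

static uint64_t
dbuf_hash_sketch(uintptr_t osv, uint64_t obj, uint8_t lvl, uint64_t blkid)
{
	uint64_t crc = -1ULL;

	crc = (crc >> 8) ^ crc64_table[(crc ^ (lvl)) & 0xFF];
	crc = (crc >> 8) ^ crc64_table[(crc ^ (osv >> 6)) & 0xFF];
	crc = (crc >> 8) ^ crc64_table[(crc ^ (obj >> 0)) & 0xFF];
	crc = (crc >> 8) ^ crc64_table[(crc ^ (obj >> 8)) & 0xFF];
	crc = (crc >> 8) ^ crc64_table[(crc ^ (blkid >> 0)) & 0xFF];
	crc = (crc >> 8) ^ crc64_table[(crc ^ (blkid >> 8)) & 0xFF];

	crc ^= (osv >> 14) ^ (obj >> 16) ^ (blkid >> 16);

	return (crc);
}

int
main(void)
{
	uintptr_t osv = (uintptr_t)0xdeadbeef;	/* hypothetical objset ptr */
	uint64_t mask = 131072 - 1;		/* 2^17 buckets, as above */

	crc64_init();
	printf("bucket = %llu\n", (unsigned long long)
	    (dbuf_hash_sketch(osv, 5, 0, 42) & mask));
	return (0);
}

Note that dbuf.c only asserts zfs_crc64_table[128] == ZFS_CRC64_POLY and does not define the table itself, so this reconstruction is a best-effort illustration of the folding, not a byte-for-byte reproduction of the kernel's hash values.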