1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License, Version 1.0 only 6 * (the "License"). You may not use this file except in compliance 7 * with the License. 8 * 9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 10 * or http://www.opensolaris.org/os/licensing. 11 * See the License for the specific language governing permissions 12 * and limitations under the License. 13 * 14 * When distributing Covered Code, include this CDDL HEADER in each 15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 16 * If applicable, add the following below this CDDL HEADER, with the 17 * fields enclosed by brackets "[]" replaced with your own identifying 18 * information: Portions Copyright [yyyy] [name of copyright owner] 19 * 20 * CDDL HEADER END 21 */ 22 /* 23 * Copyright 2006 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 #pragma ident "%Z%%M% %I% %E% SMI" 28 29 #include <sys/zfs_context.h> 30 #include <sys/dmu.h> 31 #include <sys/dmu_impl.h> 32 #include <sys/dbuf.h> 33 #include <sys/dmu_objset.h> 34 #include <sys/dsl_dataset.h> 35 #include <sys/dsl_dir.h> 36 #include <sys/dmu_tx.h> 37 #include <sys/spa.h> 38 #include <sys/zio.h> 39 #include <sys/dmu_zfetch.h> 40 41 static void dbuf_destroy(dmu_buf_impl_t *db); 42 static int dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx); 43 static arc_done_func_t dbuf_write_done; 44 45 /* 46 * Global data structures and functions for the dbuf cache. 47 */ 48 taskq_t *dbuf_tq; 49 static kmem_cache_t *dbuf_cache; 50 51 /* ARGSUSED */ 52 static int 53 dbuf_cons(void *vdb, void *unused, int kmflag) 54 { 55 dmu_buf_impl_t *db = vdb; 56 bzero(db, sizeof (dmu_buf_impl_t)); 57 58 mutex_init(&db->db_mtx, NULL, MUTEX_DEFAULT, NULL); 59 cv_init(&db->db_changed, NULL, CV_DEFAULT, NULL); 60 refcount_create(&db->db_holds); 61 return (0); 62 } 63 64 /* ARGSUSED */ 65 static void 66 dbuf_dest(void *vdb, void *unused) 67 { 68 dmu_buf_impl_t *db = vdb; 69 mutex_destroy(&db->db_mtx); 70 cv_destroy(&db->db_changed); 71 refcount_destroy(&db->db_holds); 72 } 73 74 /* 75 * dbuf hash table routines 76 */ 77 static dbuf_hash_table_t dbuf_hash_table; 78 79 static uint64_t dbuf_hash_count; 80 81 static uint64_t 82 dbuf_hash(void *os, uint64_t obj, uint8_t lvl, uint64_t blkid) 83 { 84 uintptr_t osv = (uintptr_t)os; 85 uint64_t crc = -1ULL; 86 87 ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY); 88 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (lvl)) & 0xFF]; 89 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (osv >> 6)) & 0xFF]; 90 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 0)) & 0xFF]; 91 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 8)) & 0xFF]; 92 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 0)) & 0xFF]; 93 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 8)) & 0xFF]; 94 95 crc ^= (osv>>14) ^ (obj>>16) ^ (blkid>>16); 96 97 return (crc); 98 } 99 100 #define DBUF_HASH(os, obj, level, blkid) dbuf_hash(os, obj, level, blkid); 101 102 #define DBUF_EQUAL(dbuf, os, obj, level, blkid) \ 103 ((dbuf)->db.db_object == (obj) && \ 104 (dbuf)->db_objset == (os) && \ 105 (dbuf)->db_level == (level) && \ 106 (dbuf)->db_blkid == (blkid)) 107 108 dmu_buf_impl_t * 109 dbuf_find(dnode_t *dn, uint8_t level, uint64_t blkid) 110 { 111 dbuf_hash_table_t *h = &dbuf_hash_table; 112 objset_impl_t *os = dn->dn_objset; 113 uint64_t obj = dn->dn_object; 114 uint64_t hv = DBUF_HASH(os, obj, level, blkid); 115 uint64_t idx = hv & h->hash_table_mask; 116 dmu_buf_impl_t *db; 117 118 mutex_enter(DBUF_HASH_MUTEX(h, idx)); 119 for (db = h->hash_table[idx]; db != NULL; db = db->db_hash_next) { 120 if (DBUF_EQUAL(db, os, obj, level, blkid)) { 121 mutex_enter(&db->db_mtx); 122 if (!refcount_is_zero(&db->db_holds)) { 123 mutex_exit(DBUF_HASH_MUTEX(h, idx)); 124 return (db); 125 } 126 mutex_exit(&db->db_mtx); 127 } 128 } 129 mutex_exit(DBUF_HASH_MUTEX(h, idx)); 130 return (NULL); 131 } 132 133 /* 134 * Insert an entry into the hash table. If there is already an element 135 * equal to elem in the hash table, then the already existing element 136 * will be returned and the new element will not be inserted. 137 * Otherwise returns NULL. 138 */ 139 static dmu_buf_impl_t * 140 dbuf_hash_insert(dmu_buf_impl_t *db) 141 { 142 dbuf_hash_table_t *h = &dbuf_hash_table; 143 objset_impl_t *os = db->db_objset; 144 uint64_t obj = db->db.db_object; 145 int level = db->db_level; 146 uint64_t blkid = db->db_blkid; 147 uint64_t hv = DBUF_HASH(os, obj, level, blkid); 148 uint64_t idx = hv & h->hash_table_mask; 149 dmu_buf_impl_t *dbf; 150 151 mutex_enter(DBUF_HASH_MUTEX(h, idx)); 152 for (dbf = h->hash_table[idx]; dbf != NULL; dbf = dbf->db_hash_next) { 153 if (DBUF_EQUAL(dbf, os, obj, level, blkid)) { 154 mutex_enter(&dbf->db_mtx); 155 if (!refcount_is_zero(&dbf->db_holds)) { 156 mutex_exit(DBUF_HASH_MUTEX(h, idx)); 157 return (dbf); 158 } 159 mutex_exit(&dbf->db_mtx); 160 } 161 } 162 163 mutex_enter(&db->db_mtx); 164 db->db_hash_next = h->hash_table[idx]; 165 h->hash_table[idx] = db; 166 mutex_exit(DBUF_HASH_MUTEX(h, idx)); 167 atomic_add_64(&dbuf_hash_count, 1); 168 169 return (NULL); 170 } 171 172 /* 173 * Remove an entry from the hash table. This operation will 174 * fail if there are any existing holds on the db. 175 */ 176 static void 177 dbuf_hash_remove(dmu_buf_impl_t *db) 178 { 179 dbuf_hash_table_t *h = &dbuf_hash_table; 180 uint64_t hv = DBUF_HASH(db->db_objset, db->db.db_object, 181 db->db_level, db->db_blkid); 182 uint64_t idx = hv & h->hash_table_mask; 183 dmu_buf_impl_t *dbf, **dbp; 184 185 /* 186 * We musn't hold db_mtx to maintin lock ordering: 187 * DBUF_HASH_MUTEX > db_mtx. 188 */ 189 ASSERT(refcount_is_zero(&db->db_holds)); 190 ASSERT(db->db_dnode != NULL); 191 ASSERT(!MUTEX_HELD(&db->db_mtx)); 192 193 mutex_enter(DBUF_HASH_MUTEX(h, idx)); 194 dbp = &h->hash_table[idx]; 195 while ((dbf = *dbp) != db) { 196 dbp = &dbf->db_hash_next; 197 ASSERT(dbf != NULL); 198 } 199 *dbp = db->db_hash_next; 200 db->db_hash_next = NULL; 201 mutex_exit(DBUF_HASH_MUTEX(h, idx)); 202 atomic_add_64(&dbuf_hash_count, -1); 203 } 204 205 static int dbuf_evictable(dmu_buf_impl_t *db); 206 static void dbuf_clear(dmu_buf_impl_t *db); 207 208 void 209 dbuf_evict(dmu_buf_impl_t *db) 210 { 211 int err; 212 213 ASSERT(MUTEX_HELD(&db->db_mtx)); 214 err = dbuf_evictable(db); 215 ASSERT(err == TRUE); 216 dbuf_clear(db); 217 dbuf_destroy(db); 218 } 219 220 static void 221 dbuf_evict_user(dmu_buf_impl_t *db) 222 { 223 ASSERT(MUTEX_HELD(&db->db_mtx)); 224 225 if (db->db_level != 0 || db->db_d.db_evict_func == NULL) 226 return; 227 228 if (db->db_d.db_user_data_ptr_ptr) 229 *db->db_d.db_user_data_ptr_ptr = db->db.db_data; 230 db->db_d.db_evict_func(&db->db, db->db_d.db_user_ptr); 231 db->db_d.db_user_ptr = NULL; 232 db->db_d.db_user_data_ptr_ptr = NULL; 233 db->db_d.db_evict_func = NULL; 234 } 235 236 void 237 dbuf_init(void) 238 { 239 uint64_t hsize = 1; 240 dbuf_hash_table_t *h = &dbuf_hash_table; 241 int i; 242 243 /* 244 * The hash table is big enough to fill all of physical memory 245 * with an average 64k block size. The table will take up 246 * totalmem*sizeof(void*)/64k bytes (i.e. 128KB/GB with 8-byte 247 * pointers). 248 */ 249 while (hsize * 65536 < physmem * PAGESIZE) 250 hsize <<= 1; 251 252 h->hash_table_mask = hsize - 1; 253 h->hash_table = kmem_zalloc(hsize * sizeof (void *), KM_SLEEP); 254 255 dbuf_cache = kmem_cache_create("dmu_buf_impl_t", 256 sizeof (dmu_buf_impl_t), 257 0, dbuf_cons, dbuf_dest, NULL, NULL, NULL, 0); 258 dbuf_tq = taskq_create("dbuf_tq", 8, maxclsyspri, 50, INT_MAX, 259 TASKQ_PREPOPULATE); 260 261 for (i = 0; i < DBUF_MUTEXES; i++) 262 mutex_init(&h->hash_mutexes[i], NULL, MUTEX_DEFAULT, NULL); 263 } 264 265 void 266 dbuf_fini(void) 267 { 268 dbuf_hash_table_t *h = &dbuf_hash_table; 269 int i; 270 271 taskq_destroy(dbuf_tq); 272 dbuf_tq = NULL; 273 274 for (i = 0; i < DBUF_MUTEXES; i++) 275 mutex_destroy(&h->hash_mutexes[i]); 276 kmem_free(h->hash_table, (h->hash_table_mask + 1) * sizeof (void *)); 277 kmem_cache_destroy(dbuf_cache); 278 } 279 280 /* 281 * Other stuff. 282 */ 283 284 #ifdef ZFS_DEBUG 285 static void 286 dbuf_verify(dmu_buf_impl_t *db) 287 { 288 int i; 289 dnode_t *dn = db->db_dnode; 290 291 ASSERT(MUTEX_HELD(&db->db_mtx)); 292 293 if (!(zfs_flags & ZFS_DEBUG_DBUF_VERIFY)) 294 return; 295 296 ASSERT(db->db_objset != NULL); 297 if (dn == NULL) { 298 ASSERT(db->db_parent == NULL); 299 ASSERT(db->db_blkptr == NULL); 300 } else { 301 ASSERT3U(db->db.db_object, ==, dn->dn_object); 302 ASSERT3P(db->db_objset, ==, dn->dn_objset); 303 ASSERT(list_head(&dn->dn_dbufs)); 304 ASSERT3U(db->db_level, <, dn->dn_nlevels); 305 } 306 if (db->db_blkid == DB_BONUS_BLKID) { 307 ASSERT(dn != NULL); 308 ASSERT3U(db->db.db_size, ==, dn->dn_bonuslen); 309 ASSERT3U(db->db.db_offset, ==, DB_BONUS_BLKID); 310 } else { 311 ASSERT3U(db->db.db_offset, ==, db->db_blkid * db->db.db_size); 312 } 313 314 if (db->db_level == 0) { 315 void **udpp = db->db_d.db_user_data_ptr_ptr; 316 /* we can be momentarily larger in dnode_set_blksz() */ 317 if (db->db_blkid != DB_BONUS_BLKID && dn) { 318 ASSERT3U(db->db.db_size, >=, dn->dn_datablksz); 319 } 320 if (udpp) { 321 ASSERT((refcount_is_zero(&db->db_holds) && 322 *udpp == NULL) || 323 (!refcount_is_zero(&db->db_holds) && 324 *udpp == db->db.db_data)); 325 } 326 327 if (IS_DNODE_DNODE(db->db.db_object)) { 328 for (i = 0; i < TXG_SIZE; i++) { 329 /* 330 * it should only be modified in syncing 331 * context, so make sure we only have 332 * one copy of the data. 333 */ 334 ASSERT(db->db_d.db_data_old[i] == NULL || 335 db->db_d.db_data_old[i] == db->db_buf); 336 } 337 } 338 } 339 340 /* verify db->db_blkptr */ 341 if (db->db_blkptr) { 342 if (db->db_parent == dn->dn_dbuf) { 343 /* db is pointed to by the dnode */ 344 /* ASSERT3U(db->db_blkid, <, dn->dn_nblkptr); */ 345 if (IS_DNODE_DNODE(db->db.db_object)) 346 ASSERT(db->db_parent == NULL); 347 else 348 ASSERT(db->db_parent != NULL); 349 ASSERT3P(db->db_blkptr, ==, 350 &dn->dn_phys->dn_blkptr[db->db_blkid]); 351 } else { 352 /* db is pointed to by an indirect block */ 353 int epb = db->db_parent->db.db_size >> SPA_BLKPTRSHIFT; 354 ASSERT3U(db->db_parent->db_level, ==, db->db_level+1); 355 ASSERT3U(db->db_parent->db.db_object, ==, 356 db->db.db_object); 357 /* 358 * dnode_grow_indblksz() can make this fail if we don't 359 * have the struct_rwlock. XXX indblksz no longer 360 * grows. safe to do this now? 361 */ 362 if (RW_WRITE_HELD(&db->db_dnode->dn_struct_rwlock)) { 363 ASSERT3P(db->db_blkptr, ==, 364 ((blkptr_t *)db->db_parent->db.db_data + 365 db->db_blkid % epb)); 366 } 367 } 368 } 369 if ((db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr)) && 370 db->db.db_data && db->db_blkid != DB_BONUS_BLKID && 371 db->db_state != DB_FILL && !dn->dn_free_txg) { 372 /* 373 * If the blkptr isn't set but they have nonzero data, 374 * it had better be dirty, otherwise we'll lose that 375 * data when we evict this buffer. 376 */ 377 if (db->db_dirtycnt == 0) { 378 uint64_t *buf = db->db.db_data; 379 int i; 380 381 for (i = 0; i < db->db.db_size >> 3; i++) { 382 ASSERT(buf[i] == 0); 383 } 384 } 385 } 386 } 387 #endif 388 389 static void 390 dbuf_update_data(dmu_buf_impl_t *db) 391 { 392 ASSERT(MUTEX_HELD(&db->db_mtx)); 393 if (db->db_level == 0 && db->db_d.db_user_data_ptr_ptr) { 394 ASSERT(!refcount_is_zero(&db->db_holds)); 395 *db->db_d.db_user_data_ptr_ptr = db->db.db_data; 396 } 397 } 398 399 static void 400 dbuf_set_data(dmu_buf_impl_t *db, arc_buf_t *buf) 401 { 402 ASSERT(MUTEX_HELD(&db->db_mtx)); 403 ASSERT(buf->b_data != NULL); 404 db->db_buf = buf; 405 db->db.db_data = buf->b_data; 406 dbuf_update_data(db); 407 } 408 409 uint64_t 410 dbuf_whichblock(dnode_t *dn, uint64_t offset) 411 { 412 if (dn->dn_datablkshift) { 413 return (offset >> dn->dn_datablkshift); 414 } else { 415 ASSERT3U(offset, <, dn->dn_datablksz); 416 return (0); 417 } 418 } 419 420 static void 421 dbuf_read_done(zio_t *zio, arc_buf_t *buf, void *vdb) 422 { 423 dmu_buf_impl_t *db = vdb; 424 425 mutex_enter(&db->db_mtx); 426 ASSERT3U(db->db_state, ==, DB_READ); 427 /* 428 * All reads are synchronous, so we must have a hold on the dbuf 429 */ 430 ASSERT(refcount_count(&db->db_holds) > 0); 431 ASSERT(db->db.db_data == NULL); 432 if (db->db_level == 0 && db->db_d.db_freed_in_flight) { 433 /* we were freed in flight; disregard any error */ 434 arc_release(buf, db); 435 bzero(buf->b_data, db->db.db_size); 436 db->db_d.db_freed_in_flight = FALSE; 437 dbuf_set_data(db, buf); 438 db->db_state = DB_CACHED; 439 } else if (zio == NULL || zio->io_error == 0) { 440 dbuf_set_data(db, buf); 441 db->db_state = DB_CACHED; 442 } else { 443 ASSERT(db->db_blkid != DB_BONUS_BLKID); 444 arc_buf_free(buf, db); 445 db->db_state = DB_UNCACHED; 446 ASSERT3P(db->db_buf, ==, NULL); 447 } 448 cv_broadcast(&db->db_changed); 449 mutex_exit(&db->db_mtx); 450 } 451 452 void 453 dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags) 454 { 455 arc_buf_t *buf; 456 blkptr_t *bp; 457 458 ASSERT(!refcount_is_zero(&db->db_holds)); 459 /* We need the struct_rwlock to prevent db_blkptr from changing. */ 460 ASSERT(RW_LOCK_HELD(&db->db_dnode->dn_struct_rwlock)); 461 462 /* 463 * prefetch only data blocks (level 0) -- don't prefetch indirect 464 * blocks 465 */ 466 if ((db->db_level > 0) || (db->db_blkid == DB_BONUS_BLKID)) { 467 flags |= DB_RF_NOPREFETCH; 468 } 469 470 if (((flags & DB_RF_NOPREFETCH) == 0) && (db->db_dnode != NULL)) { 471 dmu_zfetch(&db->db_dnode->dn_zfetch, db->db.db_offset, 472 db->db.db_size); 473 } 474 475 if (db->db_state == DB_CACHED) { 476 ASSERT(db->db.db_data != NULL); 477 return; 478 } 479 480 mutex_enter(&db->db_mtx); 481 482 if (db->db_state != DB_UNCACHED) { 483 mutex_exit(&db->db_mtx); 484 return; 485 } 486 487 ASSERT3U(db->db_state, ==, DB_UNCACHED); 488 489 if (db->db_blkid == DB_BONUS_BLKID) { 490 ASSERT3U(db->db_dnode->dn_bonuslen, ==, db->db.db_size); 491 buf = arc_buf_alloc(db->db_dnode->dn_objset->os_spa, 492 DN_MAX_BONUSLEN, db); 493 if (db->db.db_size < DN_MAX_BONUSLEN) 494 bzero(buf->b_data, DN_MAX_BONUSLEN); 495 bcopy(DN_BONUS(db->db_dnode->dn_phys), buf->b_data, 496 db->db.db_size); 497 dbuf_set_data(db, buf); 498 db->db_state = DB_CACHED; 499 mutex_exit(&db->db_mtx); 500 return; 501 } 502 503 if (db->db_level == 0 && dnode_block_freed(db->db_dnode, db->db_blkid)) 504 bp = NULL; 505 else 506 bp = db->db_blkptr; 507 508 if (bp == NULL) 509 dprintf_dbuf(db, "blkptr: %s\n", "NULL"); 510 else 511 dprintf_dbuf_bp(db, bp, "%s", "blkptr:"); 512 513 if (bp == NULL || BP_IS_HOLE(bp)) { 514 ASSERT(bp == NULL || BP_IS_HOLE(bp)); 515 dbuf_set_data(db, arc_buf_alloc(db->db_dnode->dn_objset->os_spa, 516 db->db.db_size, db)); 517 bzero(db->db.db_data, db->db.db_size); 518 db->db_state = DB_CACHED; 519 mutex_exit(&db->db_mtx); 520 return; 521 } 522 523 db->db_state = DB_READ; 524 mutex_exit(&db->db_mtx); 525 526 /* ZIO_FLAG_CANFAIL callers have to check the parent zio's error */ 527 (void) arc_read(zio, db->db_dnode->dn_objset->os_spa, bp, 528 db->db_level > 0 ? byteswap_uint64_array : 529 dmu_ot[db->db_dnode->dn_type].ot_byteswap, 530 dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ, 531 (flags & DB_RF_CANFAIL) ? ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED, 532 ARC_NOWAIT); 533 } 534 535 static int 536 dbuf_read_generic(dmu_buf_impl_t *db, uint32_t flags) 537 { 538 zio_t *zio; 539 int err; 540 541 /* 542 * We don't have to hold the mutex to check db_state because it 543 * can't be freed while we have a hold on the buffer. 544 */ 545 ASSERT(!refcount_is_zero(&db->db_holds)); 546 if (db->db_state == DB_CACHED) 547 return (0); 548 549 if (db->db_state == DB_UNCACHED) { 550 zio = zio_root(db->db_dnode->dn_objset->os_spa, NULL, NULL, 551 ZIO_FLAG_CANFAIL); 552 if ((flags & DB_RF_HAVESTRUCT) == 0) 553 rw_enter(&db->db_dnode->dn_struct_rwlock, RW_READER); 554 dbuf_read_impl(db, zio, flags); 555 if ((flags & DB_RF_HAVESTRUCT) == 0) 556 rw_exit(&db->db_dnode->dn_struct_rwlock); 557 err = zio_wait(zio); 558 if (err) 559 return (err); 560 } 561 562 mutex_enter(&db->db_mtx); 563 while (db->db_state == DB_READ || db->db_state == DB_FILL) { 564 ASSERT(db->db_state == DB_READ || 565 (flags & DB_RF_HAVESTRUCT) == 0); 566 cv_wait(&db->db_changed, &db->db_mtx); 567 } 568 ASSERT3U(db->db_state, ==, DB_CACHED); 569 mutex_exit(&db->db_mtx); 570 571 return (0); 572 } 573 574 #pragma weak dmu_buf_read = dbuf_read 575 void 576 dbuf_read(dmu_buf_impl_t *db) 577 { 578 int err; 579 580 err = dbuf_read_generic(db, DB_RF_MUST_SUCCEED); 581 ASSERT(err == 0); 582 } 583 584 #pragma weak dmu_buf_read_canfail = dbuf_read_canfail 585 int 586 dbuf_read_canfail(dmu_buf_impl_t *db) 587 { 588 return (dbuf_read_generic(db, DB_RF_CANFAIL)); 589 } 590 591 void 592 dbuf_read_havestruct(dmu_buf_impl_t *db) 593 { 594 int err; 595 596 ASSERT(RW_LOCK_HELD(&db->db_dnode->dn_struct_rwlock)); 597 err = dbuf_read_generic(db, (DB_RF_HAVESTRUCT | DB_RF_NOPREFETCH)); 598 ASSERT(err == 0); 599 } 600 601 static void 602 dbuf_noread(dmu_buf_impl_t *db) 603 { 604 ASSERT(!refcount_is_zero(&db->db_holds)); 605 mutex_enter(&db->db_mtx); 606 while (db->db_state == DB_READ || db->db_state == DB_FILL) 607 cv_wait(&db->db_changed, &db->db_mtx); 608 if (db->db_state == DB_UNCACHED) { 609 int blksz = (db->db_blkid == DB_BONUS_BLKID) ? 610 DN_MAX_BONUSLEN : db->db.db_size; 611 ASSERT(db->db.db_data == NULL); 612 dbuf_set_data(db, arc_buf_alloc(db->db_dnode->dn_objset->os_spa, 613 blksz, db)); 614 db->db_state = DB_FILL; 615 } else { 616 ASSERT3U(db->db_state, ==, DB_CACHED); 617 } 618 mutex_exit(&db->db_mtx); 619 } 620 621 /* 622 * This is our just-in-time copy function. It makes a copy of 623 * buffers, that have been modified in a previous transaction 624 * group, before we modify them in the current active group. 625 * 626 * This function is used in two places: when we are dirtying a 627 * buffer for the first time in a txg, and when we are freeing 628 * a range in a dnode that includes this buffer. 629 * 630 * Note that when we are called from dbuf_free_range() we do 631 * not put a hold on the buffer, we just traverse the active 632 * dbuf list for the dnode. 633 */ 634 static void 635 dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg) 636 { 637 arc_buf_t **quiescing, **syncing; 638 int size = (db->db_blkid == DB_BONUS_BLKID) ? 639 DN_MAX_BONUSLEN : db->db.db_size; 640 641 ASSERT(MUTEX_HELD(&db->db_mtx)); 642 ASSERT(db->db.db_data != NULL); 643 644 quiescing = &db->db_d.db_data_old[(txg-1)&TXG_MASK]; 645 syncing = &db->db_d.db_data_old[(txg-2)&TXG_MASK]; 646 647 /* 648 * If this buffer is referenced from the current quiescing 649 * transaction group: either make a copy and reset the reference 650 * to point to the copy, or (if there a no active holders) just 651 * null out the current db_data pointer. 652 */ 653 if (*quiescing == db->db_buf) { 654 /* 655 * If the quiescing txg is "dirty", then we better not 656 * be referencing the same buffer from the syncing txg. 657 */ 658 ASSERT(*syncing != db->db_buf); 659 if (refcount_count(&db->db_holds) > db->db_dirtycnt) { 660 *quiescing = arc_buf_alloc( 661 db->db_dnode->dn_objset->os_spa, size, db); 662 bcopy(db->db.db_data, (*quiescing)->b_data, size); 663 } else { 664 db->db.db_data = NULL; 665 db->db_buf = NULL; 666 db->db_state = DB_UNCACHED; 667 } 668 return; 669 } 670 671 /* 672 * If this buffer is referenced from the current syncing 673 * transaction group: either 674 * 1 - make a copy and reset the reference, or 675 * 2 - if there are no holders, just null the current db_data. 676 */ 677 if (*syncing == db->db_buf) { 678 ASSERT3P(*quiescing, ==, NULL); 679 ASSERT3U(db->db_dirtycnt, ==, 1); 680 if (refcount_count(&db->db_holds) > db->db_dirtycnt) { 681 /* we can't copy if we have already started a write */ 682 ASSERT(*syncing != db->db_data_pending); 683 *syncing = arc_buf_alloc( 684 db->db_dnode->dn_objset->os_spa, size, db); 685 bcopy(db->db.db_data, (*syncing)->b_data, size); 686 } else { 687 db->db.db_data = NULL; 688 db->db_buf = NULL; 689 db->db_state = DB_UNCACHED; 690 } 691 } 692 } 693 694 void 695 dbuf_unoverride(dmu_buf_impl_t *db, uint64_t txg) 696 { 697 ASSERT(MUTEX_HELD(&db->db_mtx)); 698 if (db->db_d.db_overridden_by[txg&TXG_MASK] == IN_DMU_SYNC) { 699 db->db_d.db_overridden_by[txg&TXG_MASK] = NULL; 700 } else if (db->db_d.db_overridden_by[txg&TXG_MASK] != NULL) { 701 /* free this block */ 702 ASSERT(list_link_active(&db->db_dirty_node[txg&TXG_MASK]) || 703 db->db_dnode->dn_free_txg == txg); 704 if (!BP_IS_HOLE(db->db_d.db_overridden_by[txg&TXG_MASK])) { 705 /* XXX can get silent EIO here */ 706 (void) arc_free(NULL, db->db_dnode->dn_objset->os_spa, 707 txg, db->db_d.db_overridden_by[txg&TXG_MASK], 708 NULL, NULL, ARC_WAIT); 709 } 710 kmem_free(db->db_d.db_overridden_by[txg&TXG_MASK], 711 sizeof (blkptr_t)); 712 db->db_d.db_overridden_by[txg&TXG_MASK] = NULL; 713 /* release the already-written buffer */ 714 arc_release(db->db_d.db_data_old[txg&TXG_MASK], db); 715 } 716 } 717 718 void 719 dbuf_free_range(dnode_t *dn, uint64_t blkid, uint64_t nblks, dmu_tx_t *tx) 720 { 721 dmu_buf_impl_t *db, *db_next; 722 uint64_t txg = tx->tx_txg; 723 724 dprintf_dnode(dn, "blkid=%llu nblks=%llu\n", blkid, nblks); 725 mutex_enter(&dn->dn_dbufs_mtx); 726 for (db = list_head(&dn->dn_dbufs); db; db = db_next) { 727 db_next = list_next(&dn->dn_dbufs, db); 728 if ((db->db_level != 0) || (db->db_blkid == DB_BONUS_BLKID)) 729 continue; 730 dprintf_dbuf(db, "found buf %s\n", ""); 731 if (db->db_blkid < blkid || 732 db->db_blkid >= blkid+nblks) 733 continue; 734 735 /* found a level 0 buffer in the range */ 736 if (dbuf_undirty(db, tx)) 737 continue; 738 739 mutex_enter(&db->db_mtx); 740 if (db->db_state == DB_UNCACHED) { 741 ASSERT(db->db.db_data == NULL); 742 mutex_exit(&db->db_mtx); 743 continue; 744 } 745 if (db->db_state == DB_READ) { 746 /* this will be handled in dbuf_read_done() */ 747 db->db_d.db_freed_in_flight = TRUE; 748 mutex_exit(&db->db_mtx); 749 continue; 750 } 751 if (db->db_state == DB_FILL) { 752 /* this will be handled in dbuf_rele() */ 753 db->db_d.db_freed_in_flight = TRUE; 754 mutex_exit(&db->db_mtx); 755 continue; 756 } 757 758 /* make a copy of the data if necessary */ 759 dbuf_fix_old_data(db, txg); 760 761 if (db->db.db_data) { 762 /* fill in with appropriate data */ 763 arc_release(db->db_buf, db); 764 bzero(db->db.db_data, db->db.db_size); 765 } 766 mutex_exit(&db->db_mtx); 767 } 768 mutex_exit(&dn->dn_dbufs_mtx); 769 } 770 771 static int 772 dbuf_new_block(dmu_buf_impl_t *db, dmu_tx_t *tx) 773 { 774 dsl_dataset_t *ds = db->db_objset->os_dsl_dataset; 775 uint64_t birth_txg = 0; 776 777 /* Don't count meta-objects */ 778 if (ds == NULL) 779 return (FALSE); 780 781 /* 782 * We don't need any locking to protect db_blkptr: 783 * If it's syncing, then db_dirtied will be set so we'll 784 * ignore db_blkptr. 785 */ 786 ASSERT(MUTEX_HELD(&db->db_mtx)); /* XXX strictly necessary? */ 787 /* If we have been dirtied since the last snapshot, its not new */ 788 if (db->db_dirtied) 789 birth_txg = db->db_dirtied; 790 else if (db->db_blkptr) 791 birth_txg = db->db_blkptr->blk_birth; 792 793 if (birth_txg) 794 return (!dsl_dataset_block_freeable(ds, birth_txg, tx)); 795 else 796 return (TRUE); 797 } 798 799 void 800 dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx) 801 { 802 arc_buf_t *buf, *obuf; 803 int osize = db->db.db_size; 804 805 /* XXX does *this* func really need the lock? */ 806 ASSERT(RW_WRITE_HELD(&db->db_dnode->dn_struct_rwlock)); 807 808 ASSERT3U(osize, <=, size); 809 if (osize == size) 810 return; 811 812 /* 813 * This call to dbuf_will_dirty() with the dn_struct_rwlock held 814 * is OK, because there can be no other references to the db 815 * when we are changing its size, so no concurrent DB_FILL can 816 * be happening. 817 */ 818 /* Make a copy of the data if necessary */ 819 dbuf_will_dirty(db, tx); 820 821 /* create the data buffer for the new block */ 822 buf = arc_buf_alloc(db->db_dnode->dn_objset->os_spa, size, db); 823 824 /* copy old block data to the new block */ 825 obuf = db->db_buf; 826 bcopy(obuf->b_data, buf->b_data, osize); 827 /* zero the remainder */ 828 bzero((uint8_t *)buf->b_data + osize, size - osize); 829 830 mutex_enter(&db->db_mtx); 831 /* ASSERT3U(refcount_count(&db->db_holds), ==, 1); */ 832 dbuf_set_data(db, buf); 833 arc_buf_free(obuf, db); 834 db->db.db_size = size; 835 836 /* fix up the dirty info */ 837 if (db->db_level == 0) 838 db->db_d.db_data_old[tx->tx_txg&TXG_MASK] = buf; 839 mutex_exit(&db->db_mtx); 840 841 dnode_willuse_space(db->db_dnode, size-osize, tx); 842 } 843 844 void 845 dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) 846 { 847 dnode_t *dn = db->db_dnode; 848 objset_impl_t *os = dn->dn_objset; 849 int drop_struct_lock = FALSE; 850 int txgoff = tx->tx_txg & TXG_MASK; 851 852 ASSERT(tx->tx_txg != 0); 853 ASSERT(!refcount_is_zero(&db->db_holds)); 854 DMU_TX_DIRTY_BUF(tx, db); 855 856 /* 857 * Shouldn't dirty a regular buffer in syncing context. Private 858 * objects may be dirtied in syncing context, but only if they 859 * were already pre-dirtied in open context. 860 * XXX We may want to prohibit dirtying in syncing context even 861 * if they did pre-dirty. 862 */ 863 ASSERT(!(dmu_tx_is_syncing(tx) && 864 !BP_IS_HOLE(&dn->dn_objset->os_rootbp) && 865 !(dn->dn_object & DMU_PRIVATE_OBJECT) && 866 dn->dn_objset->os_dsl_dataset != NULL && 867 !dsl_dir_is_private( 868 dn->dn_objset->os_dsl_dataset->ds_dir))); 869 870 /* 871 * We make this assert for private objects as well, but after we 872 * check if we're already dirty. They are allowed to re-dirty 873 * in syncing context. 874 */ 875 ASSERT(dn->dn_object & DMU_PRIVATE_OBJECT || 876 dn->dn_dirtyctx == DN_UNDIRTIED || 877 dn->dn_dirtyctx == 878 (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN)); 879 880 mutex_enter(&db->db_mtx); 881 /* XXX make this true for indirects too? */ 882 ASSERT(db->db_level != 0 || db->db_state == DB_CACHED || 883 db->db_state == DB_FILL); 884 885 /* 886 * If this buffer is currently part of an "overridden" region, 887 * we now need to remove it from that region. 888 */ 889 if (db->db_level == 0 && db->db_blkid != DB_BONUS_BLKID && 890 db->db_d.db_overridden_by[txgoff] != NULL) { 891 dbuf_unoverride(db, tx->tx_txg); 892 } 893 894 mutex_enter(&dn->dn_mtx); 895 /* 896 * Don't set dirtyctx to SYNC if we're just modifying this as we 897 * initialize the objset. 898 */ 899 if (dn->dn_dirtyctx == DN_UNDIRTIED && 900 !BP_IS_HOLE(&dn->dn_objset->os_rootbp)) { 901 dn->dn_dirtyctx = 902 (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN); 903 ASSERT(dn->dn_dirtyctx_firstset == NULL); 904 dn->dn_dirtyctx_firstset = kmem_alloc(1, KM_SLEEP); 905 } 906 mutex_exit(&dn->dn_mtx); 907 908 /* 909 * If this buffer is already dirty, we're done. 910 */ 911 if (list_link_active(&db->db_dirty_node[txgoff])) { 912 mutex_exit(&db->db_mtx); 913 return; 914 } 915 916 /* 917 * Only valid if not already dirty. 918 */ 919 ASSERT(dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx == 920 (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN)); 921 922 ASSERT3U(dn->dn_nlevels, >, db->db_level); 923 ASSERT((dn->dn_phys->dn_nlevels == 0 && db->db_level == 0) || 924 dn->dn_phys->dn_nlevels > db->db_level || 925 dn->dn_next_nlevels[txgoff] > db->db_level || 926 dn->dn_next_nlevels[(tx->tx_txg-1) & TXG_MASK] > db->db_level || 927 dn->dn_next_nlevels[(tx->tx_txg-2) & TXG_MASK] > db->db_level); 928 929 /* 930 * We should only be dirtying in syncing context if it's the 931 * mos, a spa os, or we're initializing the os. However, we are 932 * allowed to dirty in syncing context provided we already 933 * dirtied it in open context. Hence we must make this 934 * assertion only if we're not already dirty. 935 */ 936 ASSERT(!dmu_tx_is_syncing(tx) || 937 os->os_dsl_dataset == NULL || 938 !dsl_dir_is_private(os->os_dsl_dataset->ds_dir) || 939 !BP_IS_HOLE(&os->os_rootbp)); 940 ASSERT(db->db.db_size != 0); 941 942 dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size); 943 944 if (db->db_level == 0) { 945 /* 946 * Release the data buffer from the cache so that we 947 * can modify it without impacting possible other users 948 * of this cached data block. Note that indirect blocks 949 * and private objects are not released until the syncing 950 * state (since they are only modified then). 951 * 952 * If this buffer is dirty in an old transaction group we need 953 * to make a copy of it so that the changes we make in this 954 * transaction group won't leak out when we sync the older txg. 955 */ 956 ASSERT(db->db_buf != NULL); 957 ASSERT(db->db.db_data != NULL); 958 ASSERT(db->db_d.db_data_old[txgoff] == NULL); 959 if (!(db->db.db_object & DMU_PRIVATE_OBJECT)) { 960 arc_release(db->db_buf, db); 961 dbuf_fix_old_data(db, tx->tx_txg); 962 ASSERT(db->db_buf != NULL); 963 } 964 db->db_d.db_data_old[txgoff] = db->db_buf; 965 } 966 967 mutex_enter(&dn->dn_mtx); 968 /* 969 * We could have been freed_in_flight between the dbuf_noread 970 * and dbuf_dirty. We win, as though the dbuf_noread() had 971 * happened after the free. 972 */ 973 if (db->db_level == 0 && db->db_blkid != DB_BONUS_BLKID) { 974 dnode_clear_range(dn, db->db_blkid, 1, tx); 975 db->db_d.db_freed_in_flight = FALSE; 976 } 977 978 db->db_dirtied = tx->tx_txg; 979 list_insert_tail(&dn->dn_dirty_dbufs[txgoff], db); 980 mutex_exit(&dn->dn_mtx); 981 982 /* 983 * If writting this buffer will consume a new block on disk, 984 * then update the accounting. 985 */ 986 if (db->db_blkid != DB_BONUS_BLKID) { 987 if (!dbuf_new_block(db, tx) && db->db_blkptr) { 988 /* 989 * This is only a guess -- if the dbuf is dirty 990 * in a previous txg, we don't know how much 991 * space it will use on disk yet. We should 992 * really have the struct_rwlock to access 993 * db_blkptr, but since this is just a guess, 994 * it's OK if we get an odd answer. 995 */ 996 dnode_willuse_space(dn, 997 -BP_GET_ASIZE(db->db_blkptr), tx); 998 } 999 dnode_willuse_space(dn, db->db.db_size, tx); 1000 } 1001 1002 /* 1003 * This buffer is now part of this txg 1004 */ 1005 dbuf_add_ref(db, (void *)(uintptr_t)tx->tx_txg); 1006 db->db_dirtycnt += 1; 1007 ASSERT3U(db->db_dirtycnt, <=, 3); 1008 1009 mutex_exit(&db->db_mtx); 1010 1011 if (db->db_blkid == DB_BONUS_BLKID) { 1012 dnode_setdirty(dn, tx); 1013 return; 1014 } 1015 1016 if (db->db_level == 0) 1017 dnode_new_blkid(dn, db->db_blkid, tx); 1018 1019 if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) { 1020 rw_enter(&dn->dn_struct_rwlock, RW_READER); 1021 drop_struct_lock = TRUE; 1022 } 1023 1024 if (db->db_level < dn->dn_nlevels-1) { 1025 int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; 1026 dmu_buf_impl_t *parent; 1027 parent = dbuf_hold_level(dn, db->db_level+1, 1028 db->db_blkid >> epbs, FTAG); 1029 if (drop_struct_lock) 1030 rw_exit(&dn->dn_struct_rwlock); 1031 dbuf_dirty(parent, tx); 1032 dbuf_remove_ref(parent, FTAG); 1033 } else { 1034 if (drop_struct_lock) 1035 rw_exit(&dn->dn_struct_rwlock); 1036 } 1037 1038 dnode_setdirty(dn, tx); 1039 } 1040 1041 static int 1042 dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) 1043 { 1044 dnode_t *dn = db->db_dnode; 1045 int txgoff = tx->tx_txg & TXG_MASK; 1046 1047 ASSERT(tx->tx_txg != 0); 1048 1049 mutex_enter(&db->db_mtx); 1050 1051 /* 1052 * If this buffer is not dirty, we're done. 1053 */ 1054 if (!list_link_active(&db->db_dirty_node[txgoff])) { 1055 mutex_exit(&db->db_mtx); 1056 return (0); 1057 } 1058 1059 /* 1060 * If this buffer is currently held, we cannot undirty 1061 * it, since one of the current holders may be in the 1062 * middle of an update. Note that users of dbuf_undirty() 1063 * should not place a hold on the dbuf before the call. 1064 * XXX - this check assumes we are being called from 1065 * dbuf_free_range(), perhaps we should move it there? 1066 */ 1067 if (refcount_count(&db->db_holds) > db->db_dirtycnt) { 1068 mutex_exit(&db->db_mtx); 1069 mutex_enter(&dn->dn_mtx); 1070 dnode_clear_range(dn, db->db_blkid, 1, tx); 1071 mutex_exit(&dn->dn_mtx); 1072 return (0); 1073 } 1074 1075 dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size); 1076 1077 dbuf_unoverride(db, tx->tx_txg); 1078 1079 ASSERT(db->db.db_size != 0); 1080 if (db->db_level == 0) { 1081 ASSERT(db->db_buf != NULL); 1082 ASSERT(db->db_d.db_data_old[txgoff] != NULL); 1083 if (db->db_d.db_data_old[txgoff] != db->db_buf) 1084 arc_buf_free(db->db_d.db_data_old[txgoff], db); 1085 db->db_d.db_data_old[txgoff] = NULL; 1086 } 1087 1088 /* XXX would be nice to fix up dn_towrite_space[] */ 1089 /* XXX undo db_dirtied? but how? */ 1090 /* db->db_dirtied = tx->tx_txg; */ 1091 1092 mutex_enter(&dn->dn_mtx); 1093 list_remove(&dn->dn_dirty_dbufs[txgoff], db); 1094 mutex_exit(&dn->dn_mtx); 1095 1096 ASSERT(db->db_dirtycnt > 0); 1097 db->db_dirtycnt -= 1; 1098 1099 if (refcount_remove(&db->db_holds, 1100 (void *)(uintptr_t)tx->tx_txg) == 0) { 1101 /* make duf_verify() happy */ 1102 if (db->db.db_data) 1103 bzero(db->db.db_data, db->db.db_size); 1104 1105 dbuf_evict(db); 1106 return (1); 1107 } 1108 1109 mutex_exit(&db->db_mtx); 1110 return (0); 1111 } 1112 1113 #pragma weak dmu_buf_will_dirty = dbuf_will_dirty 1114 void 1115 dbuf_will_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) 1116 { 1117 int rf = DB_RF_MUST_SUCCEED; 1118 1119 ASSERT(tx->tx_txg != 0); 1120 ASSERT(!refcount_is_zero(&db->db_holds)); 1121 1122 if (RW_WRITE_HELD(&db->db_dnode->dn_struct_rwlock)) 1123 rf |= DB_RF_HAVESTRUCT; 1124 (void) dbuf_read_generic(db, rf); 1125 dbuf_dirty(db, tx); 1126 } 1127 1128 #pragma weak dmu_buf_will_fill = dbuf_will_fill 1129 void 1130 dbuf_will_fill(dmu_buf_impl_t *db, dmu_tx_t *tx) 1131 { 1132 ASSERT(tx->tx_txg != 0); 1133 ASSERT(db->db_level == 0); 1134 ASSERT(!refcount_is_zero(&db->db_holds)); 1135 1136 ASSERT(!(db->db.db_object & DMU_PRIVATE_OBJECT) || 1137 dmu_tx_private_ok(tx)); 1138 1139 dbuf_noread(db); 1140 dbuf_dirty(db, tx); 1141 } 1142 1143 #pragma weak dmu_buf_fill_done = dbuf_fill_done 1144 /* ARGSUSED */ 1145 void 1146 dbuf_fill_done(dmu_buf_impl_t *db, dmu_tx_t *tx) 1147 { 1148 mutex_enter(&db->db_mtx); 1149 DBUF_VERIFY(db); 1150 1151 if (db->db_state == DB_FILL) { 1152 if (db->db_level == 0 && db->db_d.db_freed_in_flight) { 1153 /* we were freed while filling */ 1154 /* XXX dbuf_undirty? */ 1155 bzero(db->db.db_data, db->db.db_size); 1156 db->db_d.db_freed_in_flight = FALSE; 1157 } 1158 db->db_state = DB_CACHED; 1159 cv_broadcast(&db->db_changed); 1160 } 1161 mutex_exit(&db->db_mtx); 1162 } 1163 1164 1165 static void 1166 dbuf_clear(dmu_buf_impl_t *db) 1167 { 1168 dnode_t *dn = db->db_dnode; 1169 1170 ASSERT(MUTEX_HELD(&dn->dn_dbufs_mtx)); 1171 ASSERT(MUTEX_HELD(&db->db_mtx)); 1172 ASSERT(refcount_is_zero(&db->db_holds)); 1173 1174 if (db->db_state == DB_CACHED) { 1175 ASSERT(db->db_buf != NULL); 1176 arc_buf_free(db->db_buf, db); 1177 db->db.db_data = NULL; 1178 db->db_buf = NULL; 1179 db->db_state = DB_UNCACHED; 1180 } 1181 1182 ASSERT3U(db->db_state, ==, DB_UNCACHED); 1183 ASSERT(db->db_buf == NULL); 1184 ASSERT(db->db_data_pending == NULL); 1185 1186 mutex_exit(&db->db_mtx); 1187 1188 /* 1189 * If this dbuf is referened from an indirect dbuf, 1190 * decrement the ref count on the indirect dbuf. 1191 */ 1192 if (db->db_parent && db->db_parent != dn->dn_dbuf) 1193 dbuf_remove_ref(db->db_parent, db); 1194 1195 /* remove from dn_dbufs */ 1196 list_remove(&dn->dn_dbufs, db); 1197 1198 dnode_rele(dn, db); 1199 1200 dbuf_hash_remove(db); 1201 1202 db->db_dnode = NULL; 1203 db->db_parent = NULL; 1204 db->db_blkptr = NULL; 1205 } 1206 1207 static int 1208 dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse, 1209 dmu_buf_impl_t **parentp, blkptr_t **bpp) 1210 { 1211 int nlevels, epbs; 1212 1213 if (dn->dn_phys->dn_nlevels == 0) 1214 nlevels = 1; 1215 else 1216 nlevels = dn->dn_phys->dn_nlevels; 1217 1218 epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; 1219 1220 ASSERT3U(level * epbs, <, 64); 1221 ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); 1222 if (blkid == DB_BONUS_BLKID) { 1223 /* this is the bonus buffer */ 1224 *parentp = NULL; 1225 *bpp = NULL; 1226 return (0); 1227 } else if (level >= nlevels || 1228 (blkid > (dn->dn_phys->dn_maxblkid >> (level * epbs)))) { 1229 /* the buffer has no parent yet */ 1230 *parentp = NULL; 1231 *bpp = NULL; 1232 return (ENOENT); 1233 } else if (level < nlevels-1) { 1234 /* this block is referenced from an indirect block */ 1235 int err = dbuf_hold_impl(dn, level+1, 1236 blkid >> epbs, fail_sparse, NULL, parentp); 1237 if (err) 1238 return (err); 1239 dbuf_read_havestruct(*parentp); 1240 *bpp = ((blkptr_t *)(*parentp)->db.db_data) + 1241 (blkid & ((1ULL << epbs) - 1)); 1242 return (0); 1243 } else { 1244 /* the block is referenced from the dnode */ 1245 ASSERT3U(level, ==, nlevels-1); 1246 ASSERT(dn->dn_phys->dn_nblkptr == 0 || 1247 blkid < dn->dn_phys->dn_nblkptr); 1248 *parentp = dn->dn_dbuf; 1249 *bpp = &dn->dn_phys->dn_blkptr[blkid]; 1250 return (0); 1251 } 1252 } 1253 1254 static dmu_buf_impl_t * 1255 dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid, 1256 dmu_buf_impl_t *parent, blkptr_t *blkptr) 1257 { 1258 objset_impl_t *os = dn->dn_objset; 1259 dmu_buf_impl_t *db, *odb; 1260 1261 ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); 1262 ASSERT(dn->dn_type != DMU_OT_NONE); 1263 1264 db = kmem_cache_alloc(dbuf_cache, KM_SLEEP); 1265 1266 db->db_objset = os; 1267 db->db.db_object = dn->dn_object; 1268 db->db_level = level; 1269 db->db_blkid = blkid; 1270 db->db_state = DB_UNCACHED; 1271 1272 if (db->db_blkid == DB_BONUS_BLKID) { 1273 db->db.db_size = dn->dn_bonuslen; 1274 db->db.db_offset = DB_BONUS_BLKID; 1275 } else { 1276 int blocksize = 1277 db->db_level ? 1<<dn->dn_indblkshift : dn->dn_datablksz; 1278 db->db.db_size = blocksize; 1279 db->db.db_offset = db->db_blkid * blocksize; 1280 } 1281 1282 db->db_dirtied = 0; 1283 db->db_dirtycnt = 0; 1284 1285 bzero(&db->db_d, sizeof (db->db_d)); 1286 1287 /* 1288 * Hold the dn_dbufs_mtx while we get the new dbuf 1289 * in the hash table *and* added to the dbufs list. 1290 * This prevents a possible deadlock with someone 1291 * trying to look up this dbuf before its added to the 1292 * dn_dbufs list. 1293 */ 1294 mutex_enter(&dn->dn_dbufs_mtx); 1295 if ((odb = dbuf_hash_insert(db)) != NULL) { 1296 /* someone else inserted it first */ 1297 kmem_cache_free(dbuf_cache, db); 1298 mutex_exit(&dn->dn_dbufs_mtx); 1299 return (odb); 1300 } 1301 list_insert_head(&dn->dn_dbufs, db); 1302 mutex_exit(&dn->dn_dbufs_mtx); 1303 1304 if (parent && parent != dn->dn_dbuf) 1305 dbuf_add_ref(parent, db); 1306 1307 (void) refcount_add(&dn->dn_holds, db); 1308 1309 db->db_dnode = dn; 1310 db->db_parent = parent; 1311 db->db_blkptr = blkptr; 1312 1313 dprintf_dbuf(db, "db=%p\n", db); 1314 1315 return (db); 1316 } 1317 1318 static int 1319 dbuf_evictable(dmu_buf_impl_t *db) 1320 { 1321 int i; 1322 1323 ASSERT(MUTEX_HELD(&db->db_mtx)); 1324 DBUF_VERIFY(db); 1325 1326 if (db->db_state != DB_UNCACHED && db->db_state != DB_CACHED) 1327 return (FALSE); 1328 1329 if (!refcount_is_zero(&db->db_holds)) 1330 return (FALSE); 1331 1332 #ifdef ZFS_DEBUG 1333 for (i = 0; i < TXG_SIZE; i++) { 1334 ASSERT(!list_link_active(&db->db_dirty_node[i])); 1335 ASSERT(db->db_level != 0 || db->db_d.db_data_old[i] == NULL); 1336 } 1337 #endif 1338 1339 /* 1340 * Now we know we want to free it. 1341 * This call must be done last, since it has side effects - 1342 * calling the db_evict_func(). 1343 */ 1344 dbuf_evict_user(db); 1345 return (TRUE); 1346 } 1347 1348 static void 1349 dbuf_destroy(dmu_buf_impl_t *db) 1350 { 1351 ASSERT(refcount_is_zero(&db->db_holds)); 1352 1353 ASSERT(db->db.db_data == NULL); 1354 ASSERT(db->db_dnode == NULL); 1355 ASSERT(db->db_parent == NULL); 1356 ASSERT(db->db_hash_next == NULL); 1357 ASSERT(db->db_blkptr == NULL); 1358 ASSERT(db->db_data_pending == NULL); 1359 1360 kmem_cache_free(dbuf_cache, db); 1361 } 1362 1363 void 1364 dbuf_prefetch(dnode_t *dn, uint64_t blkid) 1365 { 1366 dmu_buf_impl_t *db, *parent = NULL; 1367 blkptr_t *bp = NULL; 1368 1369 ASSERT(blkid != DB_BONUS_BLKID); 1370 ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); 1371 1372 if (dnode_block_freed(dn, blkid)) 1373 return; 1374 1375 /* dbuf_find() returns with db_mtx held */ 1376 if (db = dbuf_find(dn, 0, blkid)) { 1377 /* 1378 * This dbuf is already in the cache. We assume that 1379 * it is already CACHED, or else about to be either 1380 * read or filled. 1381 */ 1382 mutex_exit(&db->db_mtx); 1383 return; 1384 } 1385 1386 if (dbuf_findbp(dn, 0, blkid, TRUE, &parent, &bp) == 0) { 1387 if (bp && !BP_IS_HOLE(bp)) { 1388 (void) arc_read(NULL, dn->dn_objset->os_spa, bp, 1389 dmu_ot[dn->dn_type].ot_byteswap, 1390 NULL, NULL, ZIO_PRIORITY_ASYNC_READ, 1391 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, 1392 (ARC_NOWAIT | ARC_PREFETCH)); 1393 } 1394 if (parent && parent != dn->dn_dbuf) 1395 dbuf_rele(parent); 1396 } 1397 } 1398 1399 /* 1400 * Returns with db_holds incremented, and db_mtx not held. 1401 * Note: dn_struct_rwlock must be held. 1402 */ 1403 int 1404 dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, int fail_sparse, 1405 void *tag, dmu_buf_impl_t **dbp) 1406 { 1407 dmu_buf_impl_t *db, *parent = NULL; 1408 1409 ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); 1410 ASSERT3U(dn->dn_nlevels, >, level); 1411 1412 *dbp = NULL; 1413 1414 /* dbuf_find() returns with db_mtx held */ 1415 db = dbuf_find(dn, level, blkid); 1416 1417 if (db == NULL) { 1418 blkptr_t *bp = NULL; 1419 int err; 1420 1421 err = dbuf_findbp(dn, level, blkid, fail_sparse, &parent, &bp); 1422 if (fail_sparse) { 1423 if (err == 0 && bp && BP_IS_HOLE(bp)) 1424 err = ENOENT; 1425 if (err) { 1426 if (parent && parent != dn->dn_dbuf) 1427 dbuf_rele(parent); 1428 return (err); 1429 } 1430 } 1431 db = dbuf_create(dn, level, blkid, parent, bp); 1432 } 1433 1434 /* 1435 * If this buffer is currently syncing out, and we are 1436 * are still referencing it from db_data, we need to make 1437 * a copy of it in case we decide we want to dirty it 1438 * again in this txg. 1439 */ 1440 if (db->db_level == 0 && db->db_state == DB_CACHED && 1441 !(dn->dn_object & DMU_PRIVATE_OBJECT) && 1442 db->db_data_pending == db->db_buf) { 1443 int size = (db->db_blkid == DB_BONUS_BLKID) ? 1444 DN_MAX_BONUSLEN : db->db.db_size; 1445 1446 dbuf_set_data(db, arc_buf_alloc(db->db_dnode->dn_objset->os_spa, 1447 size, db)); 1448 bcopy(db->db_data_pending->b_data, db->db.db_data, 1449 db->db.db_size); 1450 } 1451 1452 dbuf_add_ref(db, tag); 1453 dbuf_update_data(db); 1454 DBUF_VERIFY(db); 1455 mutex_exit(&db->db_mtx); 1456 1457 /* NOTE: we can't rele the parent until after we drop the db_mtx */ 1458 if (parent && parent != dn->dn_dbuf) 1459 dbuf_rele(parent); 1460 1461 ASSERT3P(db->db_dnode, ==, dn); 1462 ASSERT3U(db->db_blkid, ==, blkid); 1463 ASSERT3U(db->db_level, ==, level); 1464 *dbp = db; 1465 1466 return (0); 1467 } 1468 1469 dmu_buf_impl_t * 1470 dbuf_hold(dnode_t *dn, uint64_t blkid) 1471 { 1472 dmu_buf_impl_t *db; 1473 (void) dbuf_hold_impl(dn, 0, blkid, FALSE, NULL, &db); 1474 return (db); 1475 } 1476 1477 dmu_buf_impl_t * 1478 dbuf_hold_level(dnode_t *dn, int level, uint64_t blkid, void *tag) 1479 { 1480 dmu_buf_impl_t *db; 1481 (void) dbuf_hold_impl(dn, level, blkid, FALSE, tag, &db); 1482 return (db); 1483 } 1484 1485 dmu_buf_impl_t * 1486 dbuf_hold_bonus(dnode_t *dn, void *tag) 1487 { 1488 dmu_buf_impl_t *db; 1489 rw_enter(&dn->dn_struct_rwlock, RW_READER); 1490 (void) dbuf_hold_impl(dn, 0, DB_BONUS_BLKID, FALSE, tag, &db); 1491 rw_exit(&dn->dn_struct_rwlock); 1492 return (db); 1493 } 1494 1495 void 1496 dbuf_add_ref(dmu_buf_impl_t *db, void *tag) 1497 { 1498 (void) refcount_add(&db->db_holds, tag); 1499 /* dprintf_dbuf(db, "adding ref %p; holds up to %lld\n", tag, holds); */ 1500 } 1501 1502 void 1503 dbuf_remove_ref(dmu_buf_impl_t *db, void *tag) 1504 { 1505 int64_t holds; 1506 dnode_t *dn = db->db_dnode; 1507 int need_mutex; 1508 1509 ASSERT(dn != NULL); 1510 need_mutex = !MUTEX_HELD(&dn->dn_dbufs_mtx); 1511 1512 if (need_mutex) { 1513 dnode_add_ref(dn, FTAG); 1514 mutex_enter(&dn->dn_dbufs_mtx); 1515 } 1516 1517 mutex_enter(&db->db_mtx); 1518 DBUF_VERIFY(db); 1519 1520 holds = refcount_remove(&db->db_holds, tag); 1521 1522 if (holds == 0) { 1523 ASSERT3U(db->db_state, !=, DB_FILL); 1524 if (db->db_level == 0 && 1525 db->db_d.db_user_data_ptr_ptr != NULL) 1526 *db->db_d.db_user_data_ptr_ptr = NULL; 1527 dbuf_evict(db); 1528 } else { 1529 if (holds == db->db_dirtycnt && 1530 db->db_level == 0 && db->db_d.db_immediate_evict) 1531 dbuf_evict_user(db); 1532 mutex_exit(&db->db_mtx); 1533 } 1534 1535 if (need_mutex) { 1536 mutex_exit(&dn->dn_dbufs_mtx); 1537 dnode_rele(dn, FTAG); 1538 } 1539 } 1540 1541 void 1542 dbuf_rele(dmu_buf_impl_t *db) 1543 { 1544 dbuf_remove_ref(db, NULL); 1545 } 1546 1547 #pragma weak dmu_buf_refcount = dbuf_refcount 1548 uint64_t 1549 dbuf_refcount(dmu_buf_impl_t *db) 1550 { 1551 return (refcount_count(&db->db_holds)); 1552 } 1553 1554 void * 1555 dmu_buf_set_user(dmu_buf_t *db_fake, void *user_ptr, void *user_data_ptr_ptr, 1556 dmu_buf_evict_func_t *evict_func) 1557 { 1558 return (dmu_buf_update_user(db_fake, NULL, user_ptr, 1559 user_data_ptr_ptr, evict_func)); 1560 } 1561 1562 void * 1563 dmu_buf_set_user_ie(dmu_buf_t *db_fake, void *user_ptr, void *user_data_ptr_ptr, 1564 dmu_buf_evict_func_t *evict_func) 1565 { 1566 dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; 1567 1568 db->db_d.db_immediate_evict = TRUE; 1569 return (dmu_buf_update_user(db_fake, NULL, user_ptr, 1570 user_data_ptr_ptr, evict_func)); 1571 } 1572 1573 void * 1574 dmu_buf_update_user(dmu_buf_t *db_fake, void *old_user_ptr, void *user_ptr, 1575 void *user_data_ptr_ptr, dmu_buf_evict_func_t *evict_func) 1576 { 1577 dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; 1578 ASSERT(db->db_level == 0); 1579 1580 ASSERT((user_ptr == NULL) == (evict_func == NULL)); 1581 1582 mutex_enter(&db->db_mtx); 1583 1584 if (db->db_d.db_user_ptr == old_user_ptr) { 1585 db->db_d.db_user_ptr = user_ptr; 1586 db->db_d.db_user_data_ptr_ptr = user_data_ptr_ptr; 1587 db->db_d.db_evict_func = evict_func; 1588 1589 dbuf_update_data(db); 1590 } else { 1591 old_user_ptr = db->db_d.db_user_ptr; 1592 } 1593 1594 mutex_exit(&db->db_mtx); 1595 return (old_user_ptr); 1596 } 1597 1598 void * 1599 dmu_buf_get_user(dmu_buf_t *db_fake) 1600 { 1601 dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; 1602 ASSERT(!refcount_is_zero(&db->db_holds)); 1603 1604 return (db->db_d.db_user_ptr); 1605 } 1606 1607 void 1608 dbuf_sync(dmu_buf_impl_t *db, zio_t *zio, dmu_tx_t *tx) 1609 { 1610 arc_buf_t **data; 1611 uint64_t txg = tx->tx_txg; 1612 dnode_t *dn = db->db_dnode; 1613 objset_impl_t *os = dn->dn_objset; 1614 int epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; 1615 int blksz; 1616 1617 ASSERT(dmu_tx_is_syncing(tx)); 1618 1619 dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr); 1620 1621 mutex_enter(&db->db_mtx); 1622 /* 1623 * To be synced, we must be dirtied. But we 1624 * might have been freed after the dirty. 1625 */ 1626 if (db->db_state == DB_UNCACHED) { 1627 /* This buffer has been freed since it was dirtied */ 1628 ASSERT(db->db.db_data == NULL); 1629 } else if (db->db_state == DB_FILL) { 1630 /* This buffer was freed and is now being re-filled */ 1631 ASSERT(db->db.db_data != db->db_d.db_data_old[txg&TXG_MASK]); 1632 } else { 1633 ASSERT3U(db->db_state, ==, DB_CACHED); 1634 } 1635 DBUF_VERIFY(db); 1636 1637 /* 1638 * Don't need a lock on db_dirty (dn_mtx), because it can't 1639 * be modified yet. 1640 */ 1641 1642 if (db->db_level == 0) { 1643 data = &db->db_d.db_data_old[txg&TXG_MASK]; 1644 blksz = arc_buf_size(*data); 1645 /* 1646 * If this buffer is currently "in use" (i.e., there are 1647 * active holds and db_data still references it), then make 1648 * a copy before we start the write so that any modifications 1649 * from the open txg will not leak into this write. 1650 * 1651 * NOTE: this copy does not need to be made for objects only 1652 * modified in the syncing context (e.g. DNONE_DNODE blocks) 1653 * or if there is no actual write involved (bonus blocks). 1654 */ 1655 if (!(dn->dn_object & DMU_PRIVATE_OBJECT) && 1656 db->db_d.db_overridden_by[txg&TXG_MASK] == NULL && 1657 db->db_blkid != DB_BONUS_BLKID) { 1658 if (refcount_count(&db->db_holds) > 1 && 1659 *data == db->db_buf) { 1660 *data = arc_buf_alloc( 1661 db->db_dnode->dn_objset->os_spa, blksz, db); 1662 bcopy(db->db.db_data, (*data)->b_data, blksz); 1663 } 1664 db->db_data_pending = *data; 1665 } else if (dn->dn_object & DMU_PRIVATE_OBJECT) { 1666 /* 1667 * Private object buffers are released here rather 1668 * than in dbuf_dirty() since they are only modified 1669 * in the syncing context and we don't want the 1670 * overhead of making multiple copies of the data. 1671 */ 1672 arc_release(db->db_buf, db); 1673 } 1674 } else { 1675 data = &db->db_buf; 1676 if (*data == NULL) { 1677 /* 1678 * This can happen if we dirty and then free 1679 * the level-0 data blocks in the same txg. So 1680 * this indirect remains unchanged. 1681 */ 1682 if (db->db_dirtied == txg) 1683 db->db_dirtied = 0; 1684 ASSERT(db->db_dirtycnt > 0); 1685 db->db_dirtycnt -= 1; 1686 mutex_exit(&db->db_mtx); 1687 dbuf_remove_ref(db, (void *)(uintptr_t)txg); 1688 return; 1689 } 1690 blksz = db->db.db_size; 1691 ASSERT3U(blksz, ==, 1<<dn->dn_phys->dn_indblkshift); 1692 } 1693 1694 ASSERT(*data != NULL); 1695 1696 if (db->db_blkid == DB_BONUS_BLKID) { 1697 /* 1698 * Simply copy the bonus data into the dnode. It will 1699 * be written out when the dnode is synced (and it will 1700 * be synced, since it must have been dirty for dbuf_sync 1701 * to be called). The bonus data will be byte swapped 1702 * in dnode_byteswap. 1703 */ 1704 /* 1705 * Use dn_phys->dn_bonuslen since db.db_size is the length 1706 * of the bonus buffer in the open transaction rather than 1707 * the syncing transaction. 1708 */ 1709 ASSERT3U(db->db_level, ==, 0); 1710 ASSERT3U(dn->dn_phys->dn_bonuslen, <=, blksz); 1711 bcopy((*data)->b_data, DN_BONUS(dn->dn_phys), 1712 dn->dn_phys->dn_bonuslen); 1713 if (*data != db->db_buf) 1714 arc_buf_free(*data, db); 1715 db->db_d.db_data_old[txg&TXG_MASK] = NULL; 1716 db->db_data_pending = NULL; 1717 if (db->db_dirtied == txg) 1718 db->db_dirtied = 0; 1719 ASSERT(db->db_dirtycnt > 0); 1720 db->db_dirtycnt -= 1; 1721 mutex_exit(&db->db_mtx); 1722 dbuf_remove_ref(db, (void *)(uintptr_t)txg); 1723 return; 1724 } else if (db->db_level > 0 && !arc_released(db->db_buf)) { 1725 /* 1726 * This indirect buffer was marked dirty, but 1727 * never modified (if it had been modified, then 1728 * we would have released the buffer). There is 1729 * no reason to write anything. 1730 */ 1731 db->db_data_pending = NULL; 1732 if (db->db_dirtied == txg) 1733 db->db_dirtied = 0; 1734 ASSERT(db->db_dirtycnt > 0); 1735 db->db_dirtycnt -= 1; 1736 mutex_exit(&db->db_mtx); 1737 dbuf_remove_ref(db, (void *)(uintptr_t)txg); 1738 return; 1739 } else if (db->db_blkptr == NULL && 1740 db->db_level == dn->dn_phys->dn_nlevels-1 && 1741 db->db_blkid < dn->dn_phys->dn_nblkptr) { 1742 /* 1743 * This buffer was allocated at a time when there was 1744 * no available blkptrs from the dnode, or it was 1745 * inappropriate to hook it in (i.e., nlevels mis-match). 1746 */ 1747 ASSERT(db->db_blkptr == NULL); 1748 ASSERT(db->db_parent == NULL); 1749 db->db_parent = dn->dn_dbuf; 1750 db->db_blkptr = &dn->dn_phys->dn_blkptr[db->db_blkid]; 1751 DBUF_VERIFY(db); 1752 mutex_exit(&db->db_mtx); 1753 } else if (db->db_blkptr == NULL) { 1754 dmu_buf_impl_t *parent = db->db_parent; 1755 1756 mutex_exit(&db->db_mtx); 1757 ASSERT(dn->dn_phys->dn_nlevels > 1); 1758 if (parent == NULL) { 1759 rw_enter(&dn->dn_struct_rwlock, RW_READER); 1760 (void) dbuf_hold_impl(dn, db->db_level+1, 1761 db->db_blkid >> epbs, FALSE, NULL, &parent); 1762 rw_exit(&dn->dn_struct_rwlock); 1763 dbuf_add_ref(parent, db); 1764 db->db_parent = parent; 1765 dbuf_rele(parent); 1766 } 1767 dbuf_read(parent); 1768 } else { 1769 mutex_exit(&db->db_mtx); 1770 } 1771 1772 ASSERT(IS_DNODE_DNODE(dn->dn_object) || db->db_parent != NULL); 1773 1774 if (db->db_level > 0 && 1775 db->db_blkid > dn->dn_phys->dn_maxblkid >> (db->db_level * epbs)) { 1776 /* 1777 * Don't write indirect blocks past EOF. 1778 * We get these when we truncate a file *after* dirtying 1779 * blocks in the truncate range (we undirty the level 0 1780 * blocks in dbuf_free_range(), but not the indirects). 1781 */ 1782 #ifdef ZFS_DEBUG 1783 /* 1784 * Verify that this indirect block is empty. 1785 */ 1786 blkptr_t *bplist; 1787 int i; 1788 1789 mutex_enter(&db->db_mtx); 1790 bplist = db->db.db_data; 1791 for (i = 0; i < (1 << epbs); i++) { 1792 if (!BP_IS_HOLE(&bplist[i])) { 1793 panic("data past EOF: " 1794 "db=%p level=%d id=%llu i=%d\n", 1795 db, db->db_level, 1796 (u_longlong_t)db->db_blkid, i); 1797 } 1798 } 1799 mutex_exit(&db->db_mtx); 1800 #endif 1801 ASSERT(db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr)); 1802 mutex_enter(&db->db_mtx); 1803 db->db_dirtycnt -= 1; 1804 mutex_exit(&db->db_mtx); 1805 dbuf_remove_ref(db, (void *)(uintptr_t)txg); 1806 return; 1807 } 1808 1809 if (db->db_parent != dn->dn_dbuf) { 1810 dmu_buf_impl_t *parent = db->db_parent; 1811 1812 mutex_enter(&db->db_mtx); 1813 ASSERT(db->db_level == parent->db_level-1); 1814 ASSERT(list_link_active(&parent->db_dirty_node[txg&TXG_MASK])); 1815 /* 1816 * We may have read this block after we dirtied it, 1817 * so never released it from the cache. 1818 */ 1819 arc_release(parent->db_buf, parent); 1820 1821 db->db_blkptr = (blkptr_t *)parent->db.db_data + 1822 (db->db_blkid & ((1ULL << epbs) - 1)); 1823 DBUF_VERIFY(db); 1824 mutex_exit(&db->db_mtx); 1825 } 1826 ASSERT(db->db_parent == NULL || arc_released(db->db_parent->db_buf)); 1827 1828 #ifdef ZFS_DEBUG 1829 if (db->db_parent == dn->dn_dbuf) { 1830 /* 1831 * We don't need to dnode_setdirty(dn) because if we got 1832 * here then the parent is already dirty. 1833 */ 1834 ASSERT(db->db_level == dn->dn_phys->dn_nlevels-1); 1835 ASSERT3P(db->db_blkptr, ==, 1836 &dn->dn_phys->dn_blkptr[db->db_blkid]); 1837 } 1838 #endif 1839 if (db->db_level == 0 && 1840 db->db_d.db_overridden_by[txg&TXG_MASK] != NULL) { 1841 arc_buf_t **old = &db->db_d.db_data_old[txg&TXG_MASK]; 1842 blkptr_t **bpp = &db->db_d.db_overridden_by[txg&TXG_MASK]; 1843 int old_size = BP_GET_ASIZE(db->db_blkptr); 1844 int new_size = BP_GET_ASIZE(*bpp); 1845 1846 ASSERT(db->db_blkid != DB_BONUS_BLKID); 1847 1848 dnode_diduse_space(dn, new_size-old_size); 1849 mutex_enter(&dn->dn_mtx); 1850 if (db->db_blkid > dn->dn_phys->dn_maxblkid) 1851 dn->dn_phys->dn_maxblkid = db->db_blkid; 1852 mutex_exit(&dn->dn_mtx); 1853 1854 dsl_dataset_block_born(os->os_dsl_dataset, *bpp, tx); 1855 if (!BP_IS_HOLE(db->db_blkptr)) 1856 dsl_dataset_block_kill(os->os_dsl_dataset, 1857 db->db_blkptr, os->os_synctx); 1858 1859 mutex_enter(&db->db_mtx); 1860 *db->db_blkptr = **bpp; 1861 kmem_free(*bpp, sizeof (blkptr_t)); 1862 *bpp = NULL; 1863 1864 if (*old != db->db_buf) 1865 arc_buf_free(*old, db); 1866 *old = NULL; 1867 db->db_data_pending = NULL; 1868 1869 cv_broadcast(&db->db_changed); 1870 1871 ASSERT(db->db_dirtycnt > 0); 1872 db->db_dirtycnt -= 1; 1873 mutex_exit(&db->db_mtx); 1874 dbuf_remove_ref(db, (void *)(uintptr_t)txg); 1875 } else { 1876 int checksum, compress; 1877 1878 if (db->db_level > 0) { 1879 /* 1880 * XXX -- we should design a compression algorithm 1881 * that specializes in arrays of bps. 1882 */ 1883 checksum = ZIO_CHECKSUM_FLETCHER_4; 1884 /* XXX - disable compresssion for now */ 1885 compress = ZIO_COMPRESS_OFF; 1886 } else { 1887 /* 1888 * Allow dnode settings to override objset settings, 1889 * except for metadata checksums. 1890 */ 1891 if (dmu_ot[dn->dn_type].ot_metadata) { 1892 checksum = os->os_md_checksum; 1893 compress = zio_compress_select(dn->dn_compress, 1894 os->os_md_compress); 1895 } else { 1896 checksum = zio_checksum_select(dn->dn_checksum, 1897 os->os_checksum); 1898 compress = zio_compress_select(dn->dn_compress, 1899 os->os_compress); 1900 } 1901 } 1902 #ifdef ZFS_DEBUG 1903 if (db->db_parent) { 1904 ASSERT(list_link_active( 1905 &db->db_parent->db_dirty_node[txg&TXG_MASK])); 1906 ASSERT(db->db_parent == dn->dn_dbuf || 1907 db->db_parent->db_level > 0); 1908 if (dn->dn_object & DMU_PRIVATE_OBJECT || 1909 db->db_level > 0) 1910 ASSERT(*data == db->db_buf); 1911 } 1912 #endif 1913 ASSERT3U(db->db_blkptr->blk_birth, <=, tx->tx_txg); 1914 (void) arc_write(zio, os->os_spa, checksum, compress, txg, 1915 db->db_blkptr, *data, dbuf_write_done, db, 1916 ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, ARC_NOWAIT); 1917 /* 1918 * We can't access db after arc_write, since it could finish 1919 * and be freed, and we have no locks on it. 1920 */ 1921 } 1922 } 1923 1924 struct dbuf_arg { 1925 objset_impl_t *os; 1926 blkptr_t bp; 1927 }; 1928 1929 static void 1930 dbuf_do_born(void *arg) 1931 { 1932 struct dbuf_arg *da = arg; 1933 dsl_dataset_block_born(da->os->os_dsl_dataset, 1934 &da->bp, da->os->os_synctx); 1935 kmem_free(da, sizeof (struct dbuf_arg)); 1936 } 1937 1938 static void 1939 dbuf_do_kill(void *arg) 1940 { 1941 struct dbuf_arg *da = arg; 1942 dsl_dataset_block_kill(da->os->os_dsl_dataset, 1943 &da->bp, da->os->os_synctx); 1944 kmem_free(da, sizeof (struct dbuf_arg)); 1945 } 1946 1947 /* ARGSUSED */ 1948 static void 1949 dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb) 1950 { 1951 dmu_buf_impl_t *db = vdb; 1952 dnode_t *dn = db->db_dnode; 1953 objset_impl_t *os = dn->dn_objset; 1954 uint64_t txg = zio->io_txg; 1955 uint64_t fill = 0; 1956 int i; 1957 int old_size, new_size; 1958 1959 ASSERT3U(zio->io_error, ==, 0); 1960 1961 dprintf_dbuf_bp(db, &zio->io_bp_orig, "bp_orig: %s", ""); 1962 1963 old_size = BP_GET_ASIZE(&zio->io_bp_orig); 1964 new_size = BP_GET_ASIZE(zio->io_bp); 1965 1966 dnode_diduse_space(dn, new_size-old_size); 1967 1968 mutex_enter(&db->db_mtx); 1969 1970 if (db->db_dirtied == txg) 1971 db->db_dirtied = 0; 1972 1973 if (db->db_level == 0) { 1974 arc_buf_t **old = &db->db_d.db_data_old[txg&TXG_MASK]; 1975 1976 ASSERT(db->db_blkid != DB_BONUS_BLKID); 1977 1978 if (*old != db->db_buf) 1979 arc_buf_free(*old, db); 1980 *old = NULL; 1981 db->db_data_pending = NULL; 1982 1983 mutex_enter(&dn->dn_mtx); 1984 if (db->db_blkid > dn->dn_phys->dn_maxblkid && 1985 !BP_IS_HOLE(db->db_blkptr)) 1986 dn->dn_phys->dn_maxblkid = db->db_blkid; 1987 mutex_exit(&dn->dn_mtx); 1988 1989 if (dn->dn_type == DMU_OT_DNODE) { 1990 dnode_phys_t *dnp = db->db.db_data; 1991 for (i = db->db.db_size >> DNODE_SHIFT; i > 0; 1992 i--, dnp++) { 1993 if (dnp->dn_type != DMU_OT_NONE) 1994 fill++; 1995 } 1996 } else { 1997 if (!BP_IS_HOLE(db->db_blkptr)) 1998 fill = 1; 1999 } 2000 } else { 2001 blkptr_t *bp = db->db.db_data; 2002 ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift); 2003 if (!BP_IS_HOLE(db->db_blkptr)) { 2004 int epbs = 2005 dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; 2006 ASSERT3U(BP_GET_LSIZE(zio->io_bp), ==, db->db.db_size); 2007 ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==, 2008 db->db.db_size); 2009 ASSERT3U(dn->dn_phys->dn_maxblkid 2010 >> (db->db_level * epbs), >=, db->db_blkid); 2011 } 2012 for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, bp++) { 2013 if (BP_IS_HOLE(bp)) 2014 continue; 2015 ASSERT3U(BP_GET_LSIZE(bp), ==, 2016 db->db_level == 1 ? dn->dn_datablksz : 2017 (1<<dn->dn_phys->dn_indblkshift)); 2018 fill += bp->blk_fill; 2019 } 2020 } 2021 2022 if (!BP_IS_HOLE(db->db_blkptr)) { 2023 db->db_blkptr->blk_fill = fill; 2024 BP_SET_TYPE(db->db_blkptr, dn->dn_type); 2025 BP_SET_LEVEL(db->db_blkptr, db->db_level); 2026 } else { 2027 ASSERT3U(fill, ==, 0); 2028 ASSERT3U(db->db_blkptr->blk_fill, ==, 0); 2029 } 2030 2031 dprintf_dbuf_bp(db, db->db_blkptr, 2032 "wrote %llu bytes to blkptr:", zio->io_size); 2033 2034 ASSERT(db->db_parent == NULL || 2035 list_link_active(&db->db_parent->db_dirty_node[txg&TXG_MASK])); 2036 cv_broadcast(&db->db_changed); 2037 ASSERT(db->db_dirtycnt > 0); 2038 db->db_dirtycnt -= 1; 2039 mutex_exit(&db->db_mtx); 2040 2041 /* We must do this after we've set the bp's type and level */ 2042 if (!DVA_EQUAL(BP_IDENTITY(zio->io_bp), 2043 BP_IDENTITY(&zio->io_bp_orig))) { 2044 struct dbuf_arg *da; 2045 da = kmem_alloc(sizeof (struct dbuf_arg), KM_SLEEP); 2046 da->os = os; 2047 da->bp = *zio->io_bp; 2048 (void) taskq_dispatch(dbuf_tq, dbuf_do_born, da, 0); 2049 if (!BP_IS_HOLE(&zio->io_bp_orig)) { 2050 da = kmem_alloc(sizeof (struct dbuf_arg), KM_SLEEP); 2051 da->os = os; 2052 da->bp = zio->io_bp_orig; 2053 (void) taskq_dispatch(dbuf_tq, dbuf_do_kill, da, 0); 2054 } 2055 } 2056 2057 dbuf_remove_ref(db, (void *)(uintptr_t)txg); 2058 } 2059