/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

#include <sys/zfs_context.h>
#include <sys/dmu.h>
#include <sys/dmu_impl.h>
#include <sys/dbuf.h>
#include <sys/dmu_objset.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_dir.h>
#include <sys/dmu_tx.h>
#include <sys/spa.h>
#include <sys/zio.h>
#include <sys/dmu_zfetch.h>

static void dbuf_destroy(dmu_buf_impl_t *db);
static int dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx);
static arc_done_func_t dbuf_write_ready;
static arc_done_func_t dbuf_write_done;

/*
 * Global data structures and functions for the dbuf cache.
 */
static kmem_cache_t *dbuf_cache;

/* ARGSUSED */
static int
dbuf_cons(void *vdb, void *unused, int kmflag)
{
	dmu_buf_impl_t *db = vdb;
	bzero(db, sizeof (dmu_buf_impl_t));

	mutex_init(&db->db_mtx, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&db->db_changed, NULL, CV_DEFAULT, NULL);
	refcount_create(&db->db_holds);
	return (0);
}

/* ARGSUSED */
static void
dbuf_dest(void *vdb, void *unused)
{
	dmu_buf_impl_t *db = vdb;
	mutex_destroy(&db->db_mtx);
	cv_destroy(&db->db_changed);
	refcount_destroy(&db->db_holds);
}

/*
 * dbuf hash table routines
 */
static dbuf_hash_table_t dbuf_hash_table;

static uint64_t dbuf_hash_count;

static uint64_t
dbuf_hash(void *os, uint64_t obj, uint8_t lvl, uint64_t blkid)
{
	uintptr_t osv = (uintptr_t)os;
	uint64_t crc = -1ULL;

	ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (lvl)) & 0xFF];
	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (osv >> 6)) & 0xFF];
	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 0)) & 0xFF];
	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 8)) & 0xFF];
	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 0)) & 0xFF];
	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 8)) & 0xFF];

	crc ^= (osv >> 14) ^ (obj >> 16) ^ (blkid >> 16);

	return (crc);
}

#define	DBUF_HASH(os, obj, level, blkid) dbuf_hash(os, obj, level, blkid)

#define	DBUF_EQUAL(dbuf, os, obj, level, blkid)		\
	((dbuf)->db.db_object == (obj) &&		\
	(dbuf)->db_objset == (os) &&			\
	(dbuf)->db_level == (level) &&			\
	(dbuf)->db_blkid == (blkid))

dmu_buf_impl_t *
dbuf_find(dnode_t *dn, uint8_t level, uint64_t blkid)
{
	dbuf_hash_table_t *h = &dbuf_hash_table;
	objset_impl_t *os = dn->dn_objset;
	uint64_t obj = dn->dn_object;
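	/*
	 * Hash the (objset, object, level, blkid) tuple and mask it down
	 * to a bucket index; DBUF_HASH_MUTEX serializes access to that
	 * bucket's collision chain.
	 */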
	uint64_t hv = DBUF_HASH(os, obj, level, blkid);
	uint64_t idx = hv & h->hash_table_mask;
	dmu_buf_impl_t *db;

	mutex_enter(DBUF_HASH_MUTEX(h, idx));
	for (db = h->hash_table[idx]; db != NULL; db = db->db_hash_next) {
		if (DBUF_EQUAL(db, os, obj, level, blkid)) {
			mutex_enter(&db->db_mtx);
			if (db->db_state != DB_EVICTING) {
				mutex_exit(DBUF_HASH_MUTEX(h, idx));
				return (db);
			}
			mutex_exit(&db->db_mtx);
		}
	}
	mutex_exit(DBUF_HASH_MUTEX(h, idx));
	return (NULL);
}

/*
 * Insert an entry into the hash table.  If there is already an element
 * equal to elem in the hash table, then the already existing element
 * will be returned and the new element will not be inserted.
 * Otherwise returns NULL.
 */
static dmu_buf_impl_t *
dbuf_hash_insert(dmu_buf_impl_t *db)
{
	dbuf_hash_table_t *h = &dbuf_hash_table;
	objset_impl_t *os = db->db_objset;
	uint64_t obj = db->db.db_object;
	int level = db->db_level;
	uint64_t blkid = db->db_blkid;
	uint64_t hv = DBUF_HASH(os, obj, level, blkid);
	uint64_t idx = hv & h->hash_table_mask;
	dmu_buf_impl_t *dbf;

	mutex_enter(DBUF_HASH_MUTEX(h, idx));
	for (dbf = h->hash_table[idx]; dbf != NULL; dbf = dbf->db_hash_next) {
		if (DBUF_EQUAL(dbf, os, obj, level, blkid)) {
			mutex_enter(&dbf->db_mtx);
			if (dbf->db_state != DB_EVICTING) {
				mutex_exit(DBUF_HASH_MUTEX(h, idx));
				return (dbf);
			}
			mutex_exit(&dbf->db_mtx);
		}
	}

	mutex_enter(&db->db_mtx);
	db->db_hash_next = h->hash_table[idx];
	h->hash_table[idx] = db;
	mutex_exit(DBUF_HASH_MUTEX(h, idx));
	atomic_add_64(&dbuf_hash_count, 1);

	return (NULL);
}

/*
 * Remove an entry from the hash table.  This operation will
 * fail if there are any existing holds on the db.
 */
static void
dbuf_hash_remove(dmu_buf_impl_t *db)
{
	dbuf_hash_table_t *h = &dbuf_hash_table;
	uint64_t hv = DBUF_HASH(db->db_objset, db->db.db_object,
	    db->db_level, db->db_blkid);
	uint64_t idx = hv & h->hash_table_mask;
	dmu_buf_impl_t *dbf, **dbp;

	/*
	 * We mustn't hold db_mtx, to maintain lock ordering:
	 * DBUF_HASH_MUTEX > db_mtx.
	 */
	ASSERT(refcount_is_zero(&db->db_holds));
	ASSERT(db->db_state == DB_EVICTING);
	ASSERT(!MUTEX_HELD(&db->db_mtx));

	mutex_enter(DBUF_HASH_MUTEX(h, idx));
	dbp = &h->hash_table[idx];
	while ((dbf = *dbp) != db) {
		dbp = &dbf->db_hash_next;
		ASSERT(dbf != NULL);
	}
	*dbp = db->db_hash_next;
	db->db_hash_next = NULL;
	mutex_exit(DBUF_HASH_MUTEX(h, idx));
	atomic_add_64(&dbuf_hash_count, -1);
}

static arc_evict_func_t dbuf_do_evict;

static void
dbuf_evict_user(dmu_buf_impl_t *db)
{
	ASSERT(MUTEX_HELD(&db->db_mtx));

	if (db->db_level != 0 || db->db_evict_func == NULL)
		return;

	if (db->db_user_data_ptr_ptr)
		*db->db_user_data_ptr_ptr = db->db.db_data;
	db->db_evict_func(&db->db, db->db_user_ptr);
	db->db_user_ptr = NULL;
	db->db_user_data_ptr_ptr = NULL;
	db->db_evict_func = NULL;
}

void
dbuf_evict(dmu_buf_impl_t *db)
{
	ASSERT(MUTEX_HELD(&db->db_mtx));
	ASSERT(db->db_buf == NULL);
	ASSERT(db->db_data_pending == NULL);

	dbuf_clear(db);
	dbuf_destroy(db);
}

void
dbuf_init(void)
{
	uint64_t hsize = 1ULL << 16;
	dbuf_hash_table_t *h = &dbuf_hash_table;
	int i;

	/*
	 * The hash table is big enough to fill all of physical memory
	 * with an average 4K block size.  The table will take up
	 * totalmem*sizeof(void*)/4K (i.e. 2MB/GB with 8-byte pointers).
	 */
	while (hsize * 4096 < physmem * PAGESIZE)
		hsize <<= 1;

retry:
	h->hash_table_mask = hsize - 1;
	h->hash_table = kmem_zalloc(hsize * sizeof (void *), KM_NOSLEEP);
	if (h->hash_table == NULL) {
		/* XXX - we should really return an error instead of assert */
		ASSERT(hsize > (1ULL << 10));
		hsize >>= 1;
		goto retry;
	}

	dbuf_cache = kmem_cache_create("dmu_buf_impl_t",
	    sizeof (dmu_buf_impl_t),
	    0, dbuf_cons, dbuf_dest, NULL, NULL, NULL, 0);

	for (i = 0; i < DBUF_MUTEXES; i++)
		mutex_init(&h->hash_mutexes[i], NULL, MUTEX_DEFAULT, NULL);
}

void
dbuf_fini(void)
{
	dbuf_hash_table_t *h = &dbuf_hash_table;
	int i;

	for (i = 0; i < DBUF_MUTEXES; i++)
		mutex_destroy(&h->hash_mutexes[i]);
	kmem_free(h->hash_table, (h->hash_table_mask + 1) * sizeof (void *));
	kmem_cache_destroy(dbuf_cache);
}

/*
 * Other stuff.
 */

#ifdef ZFS_DEBUG
static void
dbuf_verify(dmu_buf_impl_t *db)
{
	dnode_t *dn = db->db_dnode;

	ASSERT(MUTEX_HELD(&db->db_mtx));

	if (!(zfs_flags & ZFS_DEBUG_DBUF_VERIFY))
		return;

	ASSERT(db->db_objset != NULL);
	if (dn == NULL) {
		ASSERT(db->db_parent == NULL);
		ASSERT(db->db_blkptr == NULL);
	} else {
		ASSERT3U(db->db.db_object, ==, dn->dn_object);
		ASSERT3P(db->db_objset, ==, dn->dn_objset);
		ASSERT3U(db->db_level, <, dn->dn_nlevels);
		ASSERT(db->db_blkid == DB_BONUS_BLKID ||
		    list_head(&dn->dn_dbufs));
	}
	if (db->db_blkid == DB_BONUS_BLKID) {
		ASSERT(dn != NULL);
		ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
		ASSERT3U(db->db.db_offset, ==, DB_BONUS_BLKID);
	} else {
		ASSERT3U(db->db.db_offset, ==, db->db_blkid * db->db.db_size);
	}

	if (db->db_level == 0) {
		/* we can be momentarily larger in dnode_set_blksz() */
		if (db->db_blkid != DB_BONUS_BLKID && dn) {
			ASSERT3U(db->db.db_size, >=, dn->dn_datablksz);
		}
		if (db->db.db_object == DMU_META_DNODE_OBJECT) {
			dbuf_dirty_record_t *dr = db->db_data_pending;
			/*
			 * it should only be modified in syncing
			 * context, so make sure we only have
			 * one copy of the data.
			 */
			ASSERT(dr == NULL || dr->dt.dl.dr_data == db->db_buf);
		}
	}

	/* verify db->db_blkptr */
	if (db->db_blkptr) {
		if (db->db_parent == dn->dn_dbuf) {
			/* db is pointed to by the dnode */
			/* ASSERT3U(db->db_blkid, <, dn->dn_nblkptr); */
			if (db->db.db_object == DMU_META_DNODE_OBJECT)
				ASSERT(db->db_parent == NULL);
			else
				ASSERT(db->db_parent != NULL);
			ASSERT3P(db->db_blkptr, ==,
			    &dn->dn_phys->dn_blkptr[db->db_blkid]);
		} else {
			/* db is pointed to by an indirect block */
			int epb = db->db_parent->db.db_size >> SPA_BLKPTRSHIFT;
			ASSERT3U(db->db_parent->db_level, ==, db->db_level+1);
			ASSERT3U(db->db_parent->db.db_object, ==,
			    db->db.db_object);
			/*
			 * dnode_grow_indblksz() can make this fail if we don't
			 * have the struct_rwlock.  XXX indblksz no longer
			 * grows.  safe to do this now?
			 */
			if (RW_WRITE_HELD(&db->db_dnode->dn_struct_rwlock)) {
				ASSERT3P(db->db_blkptr, ==,
				    ((blkptr_t *)db->db_parent->db.db_data +
				    db->db_blkid % epb));
			}
		}
	}
	if ((db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr)) &&
	    db->db.db_data && db->db_blkid != DB_BONUS_BLKID &&
	    db->db_state != DB_FILL && !dn->dn_free_txg) {
		/*
		 * If the blkptr isn't set but they have nonzero data,
		 * it had better be dirty, otherwise we'll lose that
		 * data when we evict this buffer.
		 */
		if (db->db_dirtycnt == 0) {
			uint64_t *buf = db->db.db_data;
			int i;

			for (i = 0; i < db->db.db_size >> 3; i++) {
				ASSERT(buf[i] == 0);
			}
		}
	}
}
#endif

static void
dbuf_update_data(dmu_buf_impl_t *db)
{
	ASSERT(MUTEX_HELD(&db->db_mtx));
	if (db->db_level == 0 && db->db_user_data_ptr_ptr) {
		ASSERT(!refcount_is_zero(&db->db_holds));
		*db->db_user_data_ptr_ptr = db->db.db_data;
	}
}

static void
dbuf_set_data(dmu_buf_impl_t *db, arc_buf_t *buf)
{
	ASSERT(MUTEX_HELD(&db->db_mtx));
	ASSERT(db->db_buf == NULL || !arc_has_callback(db->db_buf));
	db->db_buf = buf;
	if (buf != NULL) {
		ASSERT(buf->b_data != NULL);
		db->db.db_data = buf->b_data;
		if (!arc_released(buf))
			arc_set_callback(buf, dbuf_do_evict, db);
		dbuf_update_data(db);
	} else {
		dbuf_evict_user(db);
		db->db.db_data = NULL;
		db->db_state = DB_UNCACHED;
	}
}

uint64_t
dbuf_whichblock(dnode_t *dn, uint64_t offset)
{
	if (dn->dn_datablkshift) {
		return (offset >> dn->dn_datablkshift);
	} else {
		ASSERT3U(offset, <, dn->dn_datablksz);
		return (0);
	}
}

static void
dbuf_read_done(zio_t *zio, arc_buf_t *buf, void *vdb)
{
	dmu_buf_impl_t *db = vdb;

	mutex_enter(&db->db_mtx);
	ASSERT3U(db->db_state, ==, DB_READ);
	/*
	 * All reads are synchronous, so we must have a hold on the dbuf
	 */
	ASSERT(refcount_count(&db->db_holds) > 0);
	ASSERT(db->db_buf == NULL);
	ASSERT(db->db.db_data == NULL);
	if (db->db_level == 0 && db->db_freed_in_flight) {
		/* we were freed in flight; disregard any error */
		arc_release(buf, db);
		bzero(buf->b_data, db->db.db_size);
		arc_buf_freeze(buf);
		db->db_freed_in_flight = FALSE;
		dbuf_set_data(db, buf);
		db->db_state = DB_CACHED;
	} else if (zio == NULL || zio->io_error == 0) {
		dbuf_set_data(db, buf);
		db->db_state = DB_CACHED;
	} else {
		ASSERT(db->db_blkid != DB_BONUS_BLKID);
		ASSERT3P(db->db_buf, ==, NULL);
		VERIFY(arc_buf_remove_ref(buf, db) == 1);
		db->db_state = DB_UNCACHED;
	}
	cv_broadcast(&db->db_changed);
	mutex_exit(&db->db_mtx);
	dbuf_rele(db, NULL);
}

static void
dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags)
{
	dnode_t *dn = db->db_dnode;
	zbookmark_t zb;
	uint32_t aflags = ARC_NOWAIT;
	arc_buf_t *pbuf;

	ASSERT(!refcount_is_zero(&db->db_holds));
	/*
	 * We need the struct_rwlock to prevent db_blkptr from changing.
	 */
	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
	ASSERT(MUTEX_HELD(&db->db_mtx));
	ASSERT(db->db_state == DB_UNCACHED);
	ASSERT(db->db_buf == NULL);

	if (db->db_blkid == DB_BONUS_BLKID) {
		int bonuslen = dn->dn_bonuslen;

		ASSERT3U(bonuslen, <=, db->db.db_size);
		db->db.db_data = zio_buf_alloc(DN_MAX_BONUSLEN);
		arc_space_consume(DN_MAX_BONUSLEN);
		if (bonuslen < DN_MAX_BONUSLEN)
			bzero(db->db.db_data, DN_MAX_BONUSLEN);
		bcopy(DN_BONUS(dn->dn_phys), db->db.db_data,
		    bonuslen);
		dbuf_update_data(db);
		db->db_state = DB_CACHED;
		mutex_exit(&db->db_mtx);
		return;
	}

	if (db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr) ||
	    (db->db_level == 0 && dnode_block_freed(dn, db->db_blkid))) {
		arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);

		dbuf_set_data(db, arc_buf_alloc(dn->dn_objset->os_spa,
		    db->db.db_size, db, type));
		bzero(db->db.db_data, db->db.db_size);
		db->db_state = DB_CACHED;
		*flags |= DB_RF_CACHED;
		mutex_exit(&db->db_mtx);
		return;
	}

	db->db_state = DB_READ;
	mutex_exit(&db->db_mtx);

	if (DBUF_IS_L2CACHEABLE(db))
		aflags |= ARC_L2CACHE;

	zb.zb_objset = db->db_objset->os_dsl_dataset ?
	    db->db_objset->os_dsl_dataset->ds_object : 0;
	zb.zb_object = db->db.db_object;
	zb.zb_level = db->db_level;
	zb.zb_blkid = db->db_blkid;

	dbuf_add_ref(db, NULL);
	/* ZIO_FLAG_CANFAIL callers have to check the parent zio's error */

	if (db->db_parent)
		pbuf = db->db_parent->db_buf;
	else
		pbuf = db->db_objset->os_phys_buf;

	(void) arc_read(zio, dn->dn_objset->os_spa, db->db_blkptr, pbuf,
	    dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ,
	    (*flags & DB_RF_CANFAIL) ? ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED,
	    &aflags, &zb);
	if (aflags & ARC_CACHED)
		*flags |= DB_RF_CACHED;
}

int
dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
{
	int err = 0;
	int havepzio = (zio != NULL);
	int prefetch;

	/*
	 * We don't have to hold the mutex to check db_state because it
	 * can't be freed while we have a hold on the buffer.
	 */
	ASSERT(!refcount_is_zero(&db->db_holds));

	if ((flags & DB_RF_HAVESTRUCT) == 0)
		rw_enter(&db->db_dnode->dn_struct_rwlock, RW_READER);

	prefetch = db->db_level == 0 && db->db_blkid != DB_BONUS_BLKID &&
	    (flags & DB_RF_NOPREFETCH) == 0 && db->db_dnode != NULL &&
	    DBUF_IS_CACHEABLE(db);

	mutex_enter(&db->db_mtx);
	if (db->db_state == DB_CACHED) {
		mutex_exit(&db->db_mtx);
		if (prefetch)
			dmu_zfetch(&db->db_dnode->dn_zfetch, db->db.db_offset,
			    db->db.db_size, TRUE);
		if ((flags & DB_RF_HAVESTRUCT) == 0)
			rw_exit(&db->db_dnode->dn_struct_rwlock);
	} else if (db->db_state == DB_UNCACHED) {
		if (zio == NULL) {
			zio = zio_root(db->db_dnode->dn_objset->os_spa,
			    NULL, NULL, ZIO_FLAG_CANFAIL);
		}
		dbuf_read_impl(db, zio, &flags);

		/* dbuf_read_impl has dropped db_mtx for us */

		if (prefetch)
			dmu_zfetch(&db->db_dnode->dn_zfetch, db->db.db_offset,
			    db->db.db_size, flags & DB_RF_CACHED);

		if ((flags & DB_RF_HAVESTRUCT) == 0)
			rw_exit(&db->db_dnode->dn_struct_rwlock);

		if (!havepzio)
			err = zio_wait(zio);
	} else {
		mutex_exit(&db->db_mtx);
		if (prefetch)
			dmu_zfetch(&db->db_dnode->dn_zfetch, db->db.db_offset,
			    db->db.db_size, TRUE);
		if ((flags & DB_RF_HAVESTRUCT) == 0)
			rw_exit(&db->db_dnode->dn_struct_rwlock);

		mutex_enter(&db->db_mtx);
		if ((flags & DB_RF_NEVERWAIT) == 0) {
			while (db->db_state == DB_READ ||
			    db->db_state == DB_FILL) {
				ASSERT(db->db_state == DB_READ ||
				    (flags & DB_RF_HAVESTRUCT) == 0);
				cv_wait(&db->db_changed, &db->db_mtx);
			}
			if (db->db_state == DB_UNCACHED)
				err = EIO;
		}
		mutex_exit(&db->db_mtx);
	}

	ASSERT(err || havepzio || db->db_state == DB_CACHED);
	return (err);
}

static void
dbuf_noread(dmu_buf_impl_t *db)
{
	ASSERT(!refcount_is_zero(&db->db_holds));
	ASSERT(db->db_blkid != DB_BONUS_BLKID);
	mutex_enter(&db->db_mtx);
	while (db->db_state == DB_READ || db->db_state == DB_FILL)
		cv_wait(&db->db_changed, &db->db_mtx);
	if (db->db_state == DB_UNCACHED) {
		arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);

		ASSERT(db->db_buf == NULL);
		ASSERT(db->db.db_data == NULL);
		dbuf_set_data(db, arc_buf_alloc(db->db_dnode->dn_objset->os_spa,
		    db->db.db_size, db, type));
		db->db_state = DB_FILL;
	} else {
		ASSERT3U(db->db_state, ==, DB_CACHED);
	}
	mutex_exit(&db->db_mtx);
}

/*
 * This is our just-in-time copy function.  It makes a copy of buffers
 * that have been modified in a previous transaction group, before we
 * modify them in the current active group.
 *
 * This function is used in two places: when we are dirtying a
 * buffer for the first time in a txg, and when we are freeing
 * a range in a dnode that includes this buffer.
 *
 * Note that when we are called from dbuf_free_range() we do
 * not put a hold on the buffer, we just traverse the active
 * dbuf list for the dnode.
 */
static void
dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg)
{
	dbuf_dirty_record_t *dr = db->db_last_dirty;

	ASSERT(MUTEX_HELD(&db->db_mtx));
	ASSERT(db->db.db_data != NULL);
	ASSERT(db->db_level == 0);
	ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT);

	if (dr == NULL ||
	    (dr->dt.dl.dr_data !=
	    ((db->db_blkid == DB_BONUS_BLKID) ?
	    db->db.db_data : db->db_buf)))
		return;

	/*
	 * If the last dirty record for this dbuf has not yet synced
	 * and it's referencing the dbuf data, either:
	 * reset the reference to point to a new copy,
	 * or (if there are no active holders)
	 * just null out the current db_data pointer.
	 */
	ASSERT(dr->dr_txg >= txg - 2);
	if (db->db_blkid == DB_BONUS_BLKID) {
		/* Note that the data bufs here are zio_bufs */
		dr->dt.dl.dr_data = zio_buf_alloc(DN_MAX_BONUSLEN);
		arc_space_consume(DN_MAX_BONUSLEN);
		bcopy(db->db.db_data, dr->dt.dl.dr_data, DN_MAX_BONUSLEN);
	} else if (refcount_count(&db->db_holds) > db->db_dirtycnt) {
		int size = db->db.db_size;
		arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
		dr->dt.dl.dr_data = arc_buf_alloc(
		    db->db_dnode->dn_objset->os_spa, size, db, type);
		bcopy(db->db.db_data, dr->dt.dl.dr_data->b_data, size);
	} else {
		dbuf_set_data(db, NULL);
	}
}

void
dbuf_unoverride(dbuf_dirty_record_t *dr)
{
	dmu_buf_impl_t *db = dr->dr_dbuf;
	uint64_t txg = dr->dr_txg;

	ASSERT(MUTEX_HELD(&db->db_mtx));
	ASSERT(dr->dt.dl.dr_override_state != DR_IN_DMU_SYNC);
	ASSERT(db->db_level == 0);

	if (db->db_blkid == DB_BONUS_BLKID ||
	    dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN)
		return;

	/* free this block */
	if (!BP_IS_HOLE(&dr->dt.dl.dr_overridden_by)) {
		/* XXX can get silent EIO here */
		(void) dsl_free(NULL,
		    spa_get_dsl(db->db_dnode->dn_objset->os_spa),
		    txg, &dr->dt.dl.dr_overridden_by, NULL, NULL, ARC_WAIT);
	}
	dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
	/*
	 * Release the already-written buffer, so we leave it in
	 * a consistent dirty state.  Note that all callers are
	 * modifying the buffer, so they will immediately do
	 * another (redundant) arc_release().  Therefore, leave
	 * the buf thawed to save the effort of freezing &
	 * immediately re-thawing it.
	 */
	arc_release(dr->dt.dl.dr_data, db);
}

/*
 * Evict (if it's unreferenced) or clear (if it's referenced) any level-0
 * data blocks in the free range, so that any future readers will find
 * empty blocks.  Also, if we happen across any level-1 dbufs in the
 * range that have not already been marked dirty, mark them dirty so
 * they stay in memory.
 */
void
dbuf_free_range(dnode_t *dn, uint64_t start, uint64_t end, dmu_tx_t *tx)
{
	dmu_buf_impl_t *db, *db_next;
	uint64_t txg = tx->tx_txg;
	int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
	uint64_t first_l1 = start >> epbs;
	uint64_t last_l1 = end >> epbs;

	if (end > dn->dn_maxblkid) {
		end = dn->dn_maxblkid;
		last_l1 = end >> epbs;
	}
	dprintf_dnode(dn, "start=%llu end=%llu\n", start, end);
	mutex_enter(&dn->dn_dbufs_mtx);
	for (db = list_head(&dn->dn_dbufs); db; db = db_next) {
		db_next = list_next(&dn->dn_dbufs, db);
		ASSERT(db->db_blkid != DB_BONUS_BLKID);

		if (db->db_level == 1 &&
		    db->db_blkid >= first_l1 && db->db_blkid <= last_l1) {
			mutex_enter(&db->db_mtx);
			if (db->db_last_dirty &&
			    db->db_last_dirty->dr_txg < txg) {
				dbuf_add_ref(db, FTAG);
				mutex_exit(&db->db_mtx);
				dbuf_will_dirty(db, tx);
				dbuf_rele(db, FTAG);
			} else {
				mutex_exit(&db->db_mtx);
			}
		}

		if (db->db_level != 0)
			continue;
		dprintf_dbuf(db, "found buf %s\n", "");
		if (db->db_blkid < start || db->db_blkid > end)
			continue;

		/* found a level 0 buffer in the range */
		if (dbuf_undirty(db, tx))
			continue;

		mutex_enter(&db->db_mtx);
		if (db->db_state == DB_UNCACHED ||
		    db->db_state == DB_EVICTING) {
			ASSERT(db->db.db_data == NULL);
			mutex_exit(&db->db_mtx);
			continue;
		}
		if (db->db_state == DB_READ || db->db_state == DB_FILL) {
			/* will be handled in dbuf_read_done or dbuf_rele */
			db->db_freed_in_flight = TRUE;
			mutex_exit(&db->db_mtx);
			continue;
		}
		if (refcount_count(&db->db_holds) == 0) {
			ASSERT(db->db_buf);
			dbuf_clear(db);
			continue;
		}
		/* The dbuf is referenced */

		if (db->db_last_dirty != NULL) {
			dbuf_dirty_record_t *dr = db->db_last_dirty;

			if (dr->dr_txg == txg) {
				/*
				 * This buffer is "in-use", re-adjust the file
				 * size to reflect that this buffer may
				 * contain new data when we sync.
				 */
				if (db->db_blkid > dn->dn_maxblkid)
					dn->dn_maxblkid = db->db_blkid;
				dbuf_unoverride(dr);
			} else {
				/*
				 * This dbuf is not dirty in the open context.
				 * Either uncache it (if it's not referenced in
				 * the open context) or reset its contents to
				 * empty.
				 */
				dbuf_fix_old_data(db, txg);
			}
		}
		/* clear the contents if it's cached */
		if (db->db_state == DB_CACHED) {
			ASSERT(db->db.db_data != NULL);
			arc_release(db->db_buf, db);
			bzero(db->db.db_data, db->db.db_size);
			arc_buf_freeze(db->db_buf);
		}

		mutex_exit(&db->db_mtx);
	}
	mutex_exit(&dn->dn_dbufs_mtx);
}

static int
dbuf_block_freeable(dmu_buf_impl_t *db)
{
	dsl_dataset_t *ds = db->db_objset->os_dsl_dataset;
	uint64_t birth_txg = 0;

	/*
	 * We don't need any locking to protect db_blkptr:
	 * If it's syncing, then db_last_dirty will be set
	 * so we'll ignore db_blkptr.
	 */
	ASSERT(MUTEX_HELD(&db->db_mtx));
	if (db->db_last_dirty)
		birth_txg = db->db_last_dirty->dr_txg;
	else if (db->db_blkptr)
		birth_txg = db->db_blkptr->blk_birth;

	/* If we don't exist or are in a snapshot, we can't be freed */
	if (birth_txg)
		return (ds == NULL ||
		    dsl_dataset_block_freeable(ds, birth_txg));
	else
		return (FALSE);
}

void
dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx)
{
	arc_buf_t *buf, *obuf;
	int osize = db->db.db_size;
	arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);

	ASSERT(db->db_blkid != DB_BONUS_BLKID);

	/* XXX does *this* func really need the lock? */
	ASSERT(RW_WRITE_HELD(&db->db_dnode->dn_struct_rwlock));

	/*
	 * This call to dbuf_will_dirty() with the dn_struct_rwlock held
	 * is OK, because there can be no other references to the db
	 * when we are changing its size, so no concurrent DB_FILL can
	 * be happening.
	 */
	/*
	 * XXX we should be doing a dbuf_read, checking the return
	 * value and returning that up to our callers
	 */
	dbuf_will_dirty(db, tx);

	/* create the data buffer for the new block */
	buf = arc_buf_alloc(db->db_dnode->dn_objset->os_spa, size, db, type);

	/* copy old block data to the new block */
	obuf = db->db_buf;
	bcopy(obuf->b_data, buf->b_data, MIN(osize, size));
	/* zero the remainder */
	if (size > osize)
		bzero((uint8_t *)buf->b_data + osize, size - osize);

	mutex_enter(&db->db_mtx);
	dbuf_set_data(db, buf);
	VERIFY(arc_buf_remove_ref(obuf, db) == 1);
	db->db.db_size = size;

	if (db->db_level == 0) {
		ASSERT3U(db->db_last_dirty->dr_txg, ==, tx->tx_txg);
		db->db_last_dirty->dt.dl.dr_data = buf;
	}
	mutex_exit(&db->db_mtx);

	dnode_willuse_space(db->db_dnode, size-osize, tx);
}

dbuf_dirty_record_t *
dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
{
	dnode_t *dn = db->db_dnode;
	objset_impl_t *os = dn->dn_objset;
	dbuf_dirty_record_t **drp, *dr;
	int drop_struct_lock = FALSE;
	int txgoff = tx->tx_txg & TXG_MASK;

	ASSERT(tx->tx_txg != 0);
	ASSERT(!refcount_is_zero(&db->db_holds));
	DMU_TX_DIRTY_BUF(tx, db);

	/*
	 * Shouldn't dirty a regular buffer in syncing context.  Private
	 * objects may be dirtied in syncing context, but only if they
	 * were already pre-dirtied in open context.
	 * XXX We may want to prohibit dirtying in syncing context even
	 * if they did pre-dirty.
	 */
	ASSERT(!dmu_tx_is_syncing(tx) ||
	    BP_IS_HOLE(dn->dn_objset->os_rootbp) ||
	    dn->dn_object == DMU_META_DNODE_OBJECT ||
	    dn->dn_objset->os_dsl_dataset == NULL ||
	    dsl_dir_is_private(dn->dn_objset->os_dsl_dataset->ds_dir));

	/*
	 * We make this assert for private objects as well, but after we
	 * check if we're already dirty.  They are allowed to re-dirty
	 * in syncing context.
	 */
	ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT ||
	    dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx ==
	    (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN));

	mutex_enter(&db->db_mtx);
	/*
	 * XXX make this true for indirects too?  The problem is that
	 * transactions created with dmu_tx_create_assigned() from
	 * syncing context don't bother holding ahead.
	 */
	ASSERT(db->db_level != 0 ||
	    db->db_state == DB_CACHED || db->db_state == DB_FILL);

	mutex_enter(&dn->dn_mtx);
	/*
	 * Don't set dirtyctx to SYNC if we're just modifying this as we
	 * initialize the objset.
	 */
	if (dn->dn_dirtyctx == DN_UNDIRTIED &&
	    !BP_IS_HOLE(dn->dn_objset->os_rootbp)) {
		dn->dn_dirtyctx =
		    (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN);
		ASSERT(dn->dn_dirtyctx_firstset == NULL);
		dn->dn_dirtyctx_firstset = kmem_alloc(1, KM_SLEEP);
	}
	mutex_exit(&dn->dn_mtx);

	/*
	 * If this buffer is already dirty, we're done.
	 */
	drp = &db->db_last_dirty;
	ASSERT(*drp == NULL || (*drp)->dr_txg <= tx->tx_txg ||
	    db->db.db_object == DMU_META_DNODE_OBJECT);
	while ((dr = *drp) != NULL && dr->dr_txg > tx->tx_txg)
		drp = &dr->dr_next;
	if (dr && dr->dr_txg == tx->tx_txg) {
		if (db->db_level == 0 && db->db_blkid != DB_BONUS_BLKID) {
			/*
			 * If this buffer has already been written out,
			 * we now need to reset its state.
			 */
			dbuf_unoverride(dr);
			if (db->db.db_object != DMU_META_DNODE_OBJECT)
				arc_buf_thaw(db->db_buf);
		}
		mutex_exit(&db->db_mtx);
		return (dr);
	}

	/*
	 * Only valid if not already dirty.
	 */
	ASSERT(dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx ==
	    (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN));

	ASSERT3U(dn->dn_nlevels, >, db->db_level);
	ASSERT((dn->dn_phys->dn_nlevels == 0 && db->db_level == 0) ||
	    dn->dn_phys->dn_nlevels > db->db_level ||
	    dn->dn_next_nlevels[txgoff] > db->db_level ||
	    dn->dn_next_nlevels[(tx->tx_txg-1) & TXG_MASK] > db->db_level ||
	    dn->dn_next_nlevels[(tx->tx_txg-2) & TXG_MASK] > db->db_level);

	/*
	 * We should only be dirtying in syncing context if it's the
	 * mos, a spa os, or we're initializing the os.  However, we are
	 * allowed to dirty in syncing context provided we already
	 * dirtied it in open context.  Hence we must make this
	 * assertion only if we're not already dirty.
	 */
	ASSERT(!dmu_tx_is_syncing(tx) ||
	    os->os_dsl_dataset == NULL ||
	    !dsl_dir_is_private(os->os_dsl_dataset->ds_dir) ||
	    !BP_IS_HOLE(os->os_rootbp));
	ASSERT(db->db.db_size != 0);

	dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);

	if (db->db_blkid != DB_BONUS_BLKID) {
		/*
		 * Update the accounting.
		 */
		if (dbuf_block_freeable(db)) {
			blkptr_t *bp = db->db_blkptr;
			int64_t willfree = (bp && !BP_IS_HOLE(bp)) ?
			    bp_get_dasize(os->os_spa, bp) : db->db.db_size;
			/*
			 * This is only a guess -- if the dbuf is dirty
			 * in a previous txg, we don't know how much
			 * space it will use on disk yet.  We should
			 * really have the struct_rwlock to access
			 * db_blkptr, but since this is just a guess,
			 * it's OK if we get an odd answer.
			 */
			dnode_willuse_space(dn, -willfree, tx);
		}
		dnode_willuse_space(dn, db->db.db_size, tx);
	}

	/*
	 * If this buffer is dirty in an old transaction group we need
	 * to make a copy of it so that the changes we make in this
	 * transaction group won't leak out when we sync the older txg.
	 */
	dr = kmem_zalloc(sizeof (dbuf_dirty_record_t), KM_SLEEP);
	if (db->db_level == 0) {
		void *data_old = db->db_buf;

		if (db->db_blkid == DB_BONUS_BLKID) {
			dbuf_fix_old_data(db, tx->tx_txg);
			data_old = db->db.db_data;
		} else if (db->db.db_object != DMU_META_DNODE_OBJECT) {
			/*
			 * Release the data buffer from the cache so that we
			 * can modify it without impacting possible other users
			 * of this cached data block.  Note that indirect
			 * blocks and private objects are not released until the
			 * syncing state (since they are only modified then).
			 */
			arc_release(db->db_buf, db);
			dbuf_fix_old_data(db, tx->tx_txg);
			data_old = db->db_buf;
		}
		ASSERT(data_old != NULL);
		dr->dt.dl.dr_data = data_old;
	} else {
		mutex_init(&dr->dt.di.dr_mtx, NULL, MUTEX_DEFAULT, NULL);
		list_create(&dr->dt.di.dr_children,
		    sizeof (dbuf_dirty_record_t),
		    offsetof(dbuf_dirty_record_t, dr_dirty_node));
	}
	dr->dr_dbuf = db;
	dr->dr_txg = tx->tx_txg;
	dr->dr_next = *drp;
	*drp = dr;

	/*
	 * We could have been freed_in_flight between the dbuf_noread
	 * and dbuf_dirty.  We win, as though the dbuf_noread() had
	 * happened after the free.
	 */
	if (db->db_level == 0 && db->db_blkid != DB_BONUS_BLKID) {
		mutex_enter(&dn->dn_mtx);
		dnode_clear_range(dn, db->db_blkid, 1, tx);
		mutex_exit(&dn->dn_mtx);
		db->db_freed_in_flight = FALSE;
	}

	/*
	 * This buffer is now part of this txg
	 */
	dbuf_add_ref(db, (void *)(uintptr_t)tx->tx_txg);
	db->db_dirtycnt += 1;
	ASSERT3U(db->db_dirtycnt, <=, 3);

	mutex_exit(&db->db_mtx);

	if (db->db_blkid == DB_BONUS_BLKID) {
		mutex_enter(&dn->dn_mtx);
		ASSERT(!list_link_active(&dr->dr_dirty_node));
		list_insert_tail(&dn->dn_dirty_records[txgoff], dr);
		mutex_exit(&dn->dn_mtx);
		dnode_setdirty(dn, tx);
		return (dr);
	}

	if (db->db_level == 0) {
		dnode_new_blkid(dn, db->db_blkid, tx);
		ASSERT(dn->dn_maxblkid >= db->db_blkid);
	}

	if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) {
		rw_enter(&dn->dn_struct_rwlock, RW_READER);
		drop_struct_lock = TRUE;
	}

	if (db->db_level+1 < dn->dn_nlevels) {
		dmu_buf_impl_t *parent = db->db_parent;
		dbuf_dirty_record_t *di;
		int parent_held = FALSE;

		if (db->db_parent == NULL || db->db_parent == dn->dn_dbuf) {
			int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;

			parent = dbuf_hold_level(dn, db->db_level+1,
			    db->db_blkid >> epbs, FTAG);
			parent_held = TRUE;
		}
		if (drop_struct_lock)
			rw_exit(&dn->dn_struct_rwlock);
		ASSERT3U(db->db_level+1, ==, parent->db_level);
		di = dbuf_dirty(parent, tx);
		if (parent_held)
			dbuf_rele(parent, FTAG);

		mutex_enter(&db->db_mtx);
		/* possible race with dbuf_undirty() */
		if (db->db_last_dirty == dr ||
		    dn->dn_object == DMU_META_DNODE_OBJECT) {
			mutex_enter(&di->dt.di.dr_mtx);
			ASSERT3U(di->dr_txg, ==, tx->tx_txg);
			ASSERT(!list_link_active(&dr->dr_dirty_node));
			list_insert_tail(&di->dt.di.dr_children, dr);
			mutex_exit(&di->dt.di.dr_mtx);
			dr->dr_parent = di;
		}
		mutex_exit(&db->db_mtx);
	} else {
		ASSERT(db->db_level+1 == dn->dn_nlevels);
		ASSERT(db->db_blkid < dn->dn_nblkptr);
		ASSERT(db->db_parent == NULL ||
		    db->db_parent == db->db_dnode->dn_dbuf);
		mutex_enter(&dn->dn_mtx);
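		/*
		 * This dbuf is at the top level, so it has no dbuf parent;
		 * link its dirty record directly onto the dnode's per-txg
		 * dirty list instead of onto a parent's dr_children list.
		 */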
		ASSERT(!list_link_active(&dr->dr_dirty_node));
		list_insert_tail(&dn->dn_dirty_records[txgoff], dr);
		mutex_exit(&dn->dn_mtx);
		if (drop_struct_lock)
			rw_exit(&dn->dn_struct_rwlock);
	}

	dnode_setdirty(dn, tx);
	return (dr);
}

static int
dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
{
	dnode_t *dn = db->db_dnode;
	uint64_t txg = tx->tx_txg;
	dbuf_dirty_record_t *dr, **drp;

	ASSERT(txg != 0);
	ASSERT(db->db_blkid != DB_BONUS_BLKID);

	mutex_enter(&db->db_mtx);

	/*
	 * If this buffer is not dirty, we're done.
	 */
	for (drp = &db->db_last_dirty; (dr = *drp) != NULL; drp = &dr->dr_next)
		if (dr->dr_txg <= txg)
			break;
	if (dr == NULL || dr->dr_txg < txg) {
		mutex_exit(&db->db_mtx);
		return (0);
	}
	ASSERT(dr->dr_txg == txg);

	/*
	 * If this buffer is currently held, we cannot undirty
	 * it, since one of the current holders may be in the
	 * middle of an update.  Note that users of dbuf_undirty()
	 * should not place a hold on the dbuf before the call.
	 */
	if (refcount_count(&db->db_holds) > db->db_dirtycnt) {
		mutex_exit(&db->db_mtx);
		/* Make sure we don't toss this buffer at sync phase */
		mutex_enter(&dn->dn_mtx);
		dnode_clear_range(dn, db->db_blkid, 1, tx);
		mutex_exit(&dn->dn_mtx);
		return (0);
	}

	dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);

	ASSERT(db->db.db_size != 0);

	/* XXX would be nice to fix up dn_towrite_space[] */

	*drp = dr->dr_next;

	if (dr->dr_parent) {
		mutex_enter(&dr->dr_parent->dt.di.dr_mtx);
		list_remove(&dr->dr_parent->dt.di.dr_children, dr);
		mutex_exit(&dr->dr_parent->dt.di.dr_mtx);
	} else if (db->db_level+1 == dn->dn_nlevels) {
		ASSERT(db->db_blkptr == NULL || db->db_parent == dn->dn_dbuf);
		mutex_enter(&dn->dn_mtx);
		list_remove(&dn->dn_dirty_records[txg & TXG_MASK], dr);
		mutex_exit(&dn->dn_mtx);
	}

	if (db->db_level == 0) {
		dbuf_unoverride(dr);

		ASSERT(db->db_buf != NULL);
		ASSERT(dr->dt.dl.dr_data != NULL);
		if (dr->dt.dl.dr_data != db->db_buf)
			VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data, db) == 1);
	} else {
		ASSERT(db->db_buf != NULL);
		ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
		mutex_destroy(&dr->dt.di.dr_mtx);
		list_destroy(&dr->dt.di.dr_children);
	}
	kmem_free(dr, sizeof (dbuf_dirty_record_t));

	ASSERT(db->db_dirtycnt > 0);
	db->db_dirtycnt -= 1;

	if (refcount_remove(&db->db_holds, (void *)(uintptr_t)txg) == 0) {
		arc_buf_t *buf = db->db_buf;

		ASSERT(arc_released(buf));
		dbuf_set_data(db, NULL);
		VERIFY(arc_buf_remove_ref(buf, db) == 1);
		dbuf_evict(db);
		return (1);
	}

	mutex_exit(&db->db_mtx);
	return (0);
}

#pragma weak dmu_buf_will_dirty = dbuf_will_dirty
void
dbuf_will_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
{
	int rf = DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH;

	ASSERT(tx->tx_txg != 0);
	ASSERT(!refcount_is_zero(&db->db_holds));

	if (RW_WRITE_HELD(&db->db_dnode->dn_struct_rwlock))
		rf |= DB_RF_HAVESTRUCT;
	(void) dbuf_read(db, NULL, rf);
	(void) dbuf_dirty(db, tx);
}

void
dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
{
	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;

	ASSERT(db->db_blkid != DB_BONUS_BLKID);
	ASSERT(tx->tx_txg != 0);
	ASSERT(db->db_level == 0);
	ASSERT(!refcount_is_zero(&db->db_holds));

	ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT ||
	    dmu_tx_private_ok(tx));

	dbuf_noread(db);
	(void) dbuf_dirty(db, tx);
}

#pragma weak dmu_buf_fill_done = dbuf_fill_done
/* ARGSUSED */
void
dbuf_fill_done(dmu_buf_impl_t *db, dmu_tx_t *tx)
{
	mutex_enter(&db->db_mtx);
	DBUF_VERIFY(db);

	if (db->db_state == DB_FILL) {
		if (db->db_level == 0 && db->db_freed_in_flight) {
			ASSERT(db->db_blkid != DB_BONUS_BLKID);
			/* we were freed while filling */
			/* XXX dbuf_undirty? */
			bzero(db->db.db_data, db->db.db_size);
			db->db_freed_in_flight = FALSE;
		}
		db->db_state = DB_CACHED;
		cv_broadcast(&db->db_changed);
	}
	mutex_exit(&db->db_mtx);
}

/*
 * "Clear" the contents of this dbuf.  This will mark the dbuf
 * EVICTING and clear *most* of its references.  Unfortunately,
 * when we are not holding the dn_dbufs_mtx, we can't clear the
 * entry in the dn_dbufs list.  We have to wait until dbuf_destroy()
 * in this case.  For callers from the DMU we will usually see:
 *	dbuf_clear()->arc_buf_evict()->dbuf_do_evict()->dbuf_destroy()
 * For the arc callback, we will usually see:
 *	dbuf_do_evict()->dbuf_clear();dbuf_destroy()
 * Sometimes, though, we will get a mix of these two:
 *	DMU: dbuf_clear()->arc_buf_evict()
 *	ARC: dbuf_do_evict()->dbuf_destroy()
 */
void
dbuf_clear(dmu_buf_impl_t *db)
{
	dnode_t *dn = db->db_dnode;
	dmu_buf_impl_t *parent = db->db_parent;
	dmu_buf_impl_t *dndb = dn->dn_dbuf;
	int dbuf_gone = FALSE;

	ASSERT(MUTEX_HELD(&db->db_mtx));
	ASSERT(refcount_is_zero(&db->db_holds));

	dbuf_evict_user(db);

	if (db->db_state == DB_CACHED) {
		ASSERT(db->db.db_data != NULL);
		if (db->db_blkid == DB_BONUS_BLKID) {
			zio_buf_free(db->db.db_data, DN_MAX_BONUSLEN);
			arc_space_return(DN_MAX_BONUSLEN);
		}
		db->db.db_data = NULL;
		db->db_state = DB_UNCACHED;
	}

	ASSERT3U(db->db_state, ==, DB_UNCACHED);
	ASSERT(db->db_data_pending == NULL);

	db->db_state = DB_EVICTING;
	db->db_blkptr = NULL;

	if (db->db_blkid != DB_BONUS_BLKID && MUTEX_HELD(&dn->dn_dbufs_mtx)) {
		list_remove(&dn->dn_dbufs, db);
		dnode_rele(dn, db);
		db->db_dnode = NULL;
	}

	if (db->db_buf)
		dbuf_gone = arc_buf_evict(db->db_buf);

	if (!dbuf_gone)
		mutex_exit(&db->db_mtx);

	/*
	 * If this dbuf is referenced from an indirect dbuf,
	 * decrement the ref count on the indirect dbuf.
	 */
	if (parent && parent != dndb)
		dbuf_rele(parent, db);
}

static int
dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse,
    dmu_buf_impl_t **parentp, blkptr_t **bpp)
{
	int nlevels, epbs;

	*parentp = NULL;
	*bpp = NULL;

	ASSERT(blkid != DB_BONUS_BLKID);

	if (dn->dn_phys->dn_nlevels == 0)
		nlevels = 1;
	else
		nlevels = dn->dn_phys->dn_nlevels;

	epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;

	ASSERT3U(level * epbs, <, 64);
	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
	if (level >= nlevels ||
	    (blkid > (dn->dn_phys->dn_maxblkid >> (level * epbs)))) {
		/* the buffer has no parent yet */
		return (ENOENT);
	} else if (level < nlevels-1) {
		/* this block is referenced from an indirect block */
		int err = dbuf_hold_impl(dn, level+1,
		    blkid >> epbs, fail_sparse, NULL, parentp);
		if (err)
			return (err);
		err = dbuf_read(*parentp, NULL,
		    (DB_RF_HAVESTRUCT | DB_RF_NOPREFETCH | DB_RF_CANFAIL));
		if (err) {
			dbuf_rele(*parentp, NULL);
			*parentp = NULL;
			return (err);
		}
		*bpp = ((blkptr_t *)(*parentp)->db.db_data) +
		    (blkid & ((1ULL << epbs) - 1));
		return (0);
	} else {
		/* the block is referenced from the dnode */
		ASSERT3U(level, ==, nlevels-1);
		ASSERT(dn->dn_phys->dn_nblkptr == 0 ||
		    blkid < dn->dn_phys->dn_nblkptr);
		if (dn->dn_dbuf) {
			dbuf_add_ref(dn->dn_dbuf, NULL);
			*parentp = dn->dn_dbuf;
		}
		*bpp = &dn->dn_phys->dn_blkptr[blkid];
		return (0);
	}
}

static dmu_buf_impl_t *
dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid,
    dmu_buf_impl_t *parent, blkptr_t *blkptr)
{
	objset_impl_t *os = dn->dn_objset;
	dmu_buf_impl_t *db, *odb;

	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
	ASSERT(dn->dn_type != DMU_OT_NONE);

	db = kmem_cache_alloc(dbuf_cache, KM_SLEEP);

	db->db_objset = os;
	db->db.db_object = dn->dn_object;
	db->db_level = level;
	db->db_blkid = blkid;
	db->db_last_dirty = NULL;
	db->db_dirtycnt = 0;
	db->db_dnode = dn;
	db->db_parent = parent;
	db->db_blkptr = blkptr;

	db->db_user_ptr = NULL;
	db->db_user_data_ptr_ptr = NULL;
	db->db_evict_func = NULL;
	db->db_immediate_evict = 0;
	db->db_freed_in_flight = 0;

	if (blkid == DB_BONUS_BLKID) {
		ASSERT3P(parent, ==, dn->dn_dbuf);
		db->db.db_size = DN_MAX_BONUSLEN -
		    (dn->dn_nblkptr-1) * sizeof (blkptr_t);
		ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
		db->db.db_offset = DB_BONUS_BLKID;
		db->db_state = DB_UNCACHED;
		/* the bonus dbuf is not placed in the hash table */
		arc_space_consume(sizeof (dmu_buf_impl_t));
		return (db);
	} else {
		int blocksize =
		    db->db_level ? 1<<dn->dn_indblkshift : dn->dn_datablksz;
		db->db.db_size = blocksize;
		db->db.db_offset = db->db_blkid * blocksize;
	}

	/*
	 * Hold the dn_dbufs_mtx while we get the new dbuf
	 * in the hash table *and* added to the dbufs list.
	 * This prevents a possible deadlock with someone
	 * trying to look up this dbuf before it's added to the
	 * dn_dbufs list.
	 */
	mutex_enter(&dn->dn_dbufs_mtx);
	db->db_state = DB_EVICTING;
	if ((odb = dbuf_hash_insert(db)) != NULL) {
		/* someone else inserted it first */
		kmem_cache_free(dbuf_cache, db);
		mutex_exit(&dn->dn_dbufs_mtx);
		return (odb);
	}
	list_insert_head(&dn->dn_dbufs, db);
	db->db_state = DB_UNCACHED;
	mutex_exit(&dn->dn_dbufs_mtx);
	arc_space_consume(sizeof (dmu_buf_impl_t));

	if (parent && parent != dn->dn_dbuf)
		dbuf_add_ref(parent, db);

	ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT ||
	    refcount_count(&dn->dn_holds) > 0);
	(void) refcount_add(&dn->dn_holds, db);

	dprintf_dbuf(db, "db=%p\n", db);

	return (db);
}

static int
dbuf_do_evict(void *private)
{
	arc_buf_t *buf = private;
	dmu_buf_impl_t *db = buf->b_private;

	if (!MUTEX_HELD(&db->db_mtx))
		mutex_enter(&db->db_mtx);

	ASSERT(refcount_is_zero(&db->db_holds));

	if (db->db_state != DB_EVICTING) {
		ASSERT(db->db_state == DB_CACHED);
		DBUF_VERIFY(db);
		db->db_buf = NULL;
		dbuf_evict(db);
	} else {
		mutex_exit(&db->db_mtx);
		dbuf_destroy(db);
	}
	return (0);
}

static void
dbuf_destroy(dmu_buf_impl_t *db)
{
	ASSERT(refcount_is_zero(&db->db_holds));

	if (db->db_blkid != DB_BONUS_BLKID) {
		/*
		 * If this dbuf is still on the dn_dbufs list,
		 * remove it from that list.
		 */
		if (db->db_dnode) {
			dnode_t *dn = db->db_dnode;

			mutex_enter(&dn->dn_dbufs_mtx);
			list_remove(&dn->dn_dbufs, db);
			mutex_exit(&dn->dn_dbufs_mtx);

			dnode_rele(dn, db);
			db->db_dnode = NULL;
		}
		dbuf_hash_remove(db);
	}
	db->db_parent = NULL;
	db->db_buf = NULL;

	ASSERT(!list_link_active(&db->db_link));
	ASSERT(db->db.db_data == NULL);
	ASSERT(db->db_hash_next == NULL);
	ASSERT(db->db_blkptr == NULL);
	ASSERT(db->db_data_pending == NULL);

	kmem_cache_free(dbuf_cache, db);
	arc_space_return(sizeof (dmu_buf_impl_t));
}

void
dbuf_prefetch(dnode_t *dn, uint64_t blkid)
{
	dmu_buf_impl_t *db = NULL;
	blkptr_t *bp = NULL;

	ASSERT(blkid != DB_BONUS_BLKID);
	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));

	if (dnode_block_freed(dn, blkid))
		return;

	/* dbuf_find() returns with db_mtx held */
	if (db = dbuf_find(dn, 0, blkid)) {
		if (refcount_count(&db->db_holds) > 0) {
			/*
			 * This dbuf is active.  We assume that it is
			 * already CACHED, or else about to be either
			 * read or filled.
			 */
			mutex_exit(&db->db_mtx);
			return;
		}
		mutex_exit(&db->db_mtx);
		db = NULL;
	}

	if (dbuf_findbp(dn, 0, blkid, TRUE, &db, &bp) == 0) {
		if (bp && !BP_IS_HOLE(bp)) {
			arc_buf_t *pbuf;
			uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH;
			zbookmark_t zb;
			zb.zb_objset = dn->dn_objset->os_dsl_dataset ?
			    dn->dn_objset->os_dsl_dataset->ds_object : 0;
			zb.zb_object = dn->dn_object;
			zb.zb_level = 0;
			zb.zb_blkid = blkid;

			if (db)
				pbuf = db->db_buf;
			else
				pbuf = dn->dn_objset->os_phys_buf;

			(void) arc_read(NULL, dn->dn_objset->os_spa,
			    bp, pbuf, NULL, NULL, ZIO_PRIORITY_ASYNC_READ,
			    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
			    &aflags, &zb);
		}
		if (db)
			dbuf_rele(db, NULL);
	}
}

/*
 * Returns with db_holds incremented, and db_mtx not held.
 * Note: dn_struct_rwlock must be held.
 */
int
dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, int fail_sparse,
    void *tag, dmu_buf_impl_t **dbp)
{
	dmu_buf_impl_t *db, *parent = NULL;

	ASSERT(blkid != DB_BONUS_BLKID);
	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
	ASSERT3U(dn->dn_nlevels, >, level);

	*dbp = NULL;
top:
	/* dbuf_find() returns with db_mtx held */
	db = dbuf_find(dn, level, blkid);

	if (db == NULL) {
		blkptr_t *bp = NULL;
		int err;

		ASSERT3P(parent, ==, NULL);
		err = dbuf_findbp(dn, level, blkid, fail_sparse, &parent, &bp);
		if (fail_sparse) {
			if (err == 0 && bp && BP_IS_HOLE(bp))
				err = ENOENT;
			if (err) {
				if (parent)
					dbuf_rele(parent, NULL);
				return (err);
			}
		}
		if (err && err != ENOENT)
			return (err);
		db = dbuf_create(dn, level, blkid, parent, bp);
	}

	if (db->db_buf && refcount_is_zero(&db->db_holds)) {
		arc_buf_add_ref(db->db_buf, db);
		if (db->db_buf->b_data == NULL) {
			dbuf_clear(db);
			if (parent) {
				dbuf_rele(parent, NULL);
				parent = NULL;
			}
			goto top;
		}
		ASSERT3P(db->db.db_data, ==, db->db_buf->b_data);
	}

	ASSERT(db->db_buf == NULL || arc_referenced(db->db_buf));

	/*
	 * If this buffer is currently syncing out, and we are
	 * still referencing it from db_data, we need to make a copy
	 * of it in case we decide we want to dirty it again in this txg.
	 */
	if (db->db_level == 0 && db->db_blkid != DB_BONUS_BLKID &&
	    dn->dn_object != DMU_META_DNODE_OBJECT &&
	    db->db_state == DB_CACHED && db->db_data_pending) {
		dbuf_dirty_record_t *dr = db->db_data_pending;

		if (dr->dt.dl.dr_data == db->db_buf) {
			arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);

			dbuf_set_data(db,
			    arc_buf_alloc(db->db_dnode->dn_objset->os_spa,
			    db->db.db_size, db, type));
			bcopy(dr->dt.dl.dr_data->b_data, db->db.db_data,
			    db->db.db_size);
		}
	}

	(void) refcount_add(&db->db_holds, tag);
	dbuf_update_data(db);
	DBUF_VERIFY(db);
	mutex_exit(&db->db_mtx);

	/* NOTE: we can't rele the parent until after we drop the db_mtx */
	if (parent)
		dbuf_rele(parent, NULL);

	ASSERT3P(db->db_dnode, ==, dn);
	ASSERT3U(db->db_blkid, ==, blkid);
	ASSERT3U(db->db_level, ==, level);
	*dbp = db;

	return (0);
}

dmu_buf_impl_t *
dbuf_hold(dnode_t *dn, uint64_t blkid, void *tag)
{
	dmu_buf_impl_t *db;
	int err = dbuf_hold_impl(dn, 0, blkid, FALSE, tag, &db);
	return (err ? NULL : db);
}

dmu_buf_impl_t *
dbuf_hold_level(dnode_t *dn, int level, uint64_t blkid, void *tag)
{
	dmu_buf_impl_t *db;
	int err = dbuf_hold_impl(dn, level, blkid, FALSE, tag, &db);
	return (err ?
	    NULL : db);
}

void
dbuf_create_bonus(dnode_t *dn)
{
	ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));

	ASSERT(dn->dn_bonus == NULL);
	dn->dn_bonus = dbuf_create(dn, 0, DB_BONUS_BLKID, dn->dn_dbuf, NULL);
}

#pragma weak dmu_buf_add_ref = dbuf_add_ref
void
dbuf_add_ref(dmu_buf_impl_t *db, void *tag)
{
	int64_t holds = refcount_add(&db->db_holds, tag);
	ASSERT(holds > 1);
}

#pragma weak dmu_buf_rele = dbuf_rele
void
dbuf_rele(dmu_buf_impl_t *db, void *tag)
{
	int64_t holds;

	mutex_enter(&db->db_mtx);
	DBUF_VERIFY(db);

	holds = refcount_remove(&db->db_holds, tag);
	ASSERT(holds >= 0);

	/*
	 * We can't freeze indirects if there is a possibility that they
	 * may be modified in the current syncing context.
	 */
	if (db->db_buf && holds == (db->db_level == 0 ? db->db_dirtycnt : 0))
		arc_buf_freeze(db->db_buf);

	if (holds == db->db_dirtycnt &&
	    db->db_level == 0 && db->db_immediate_evict)
		dbuf_evict_user(db);

	if (holds == 0) {
		if (db->db_blkid == DB_BONUS_BLKID) {
			mutex_exit(&db->db_mtx);
			dnode_rele(db->db_dnode, db);
		} else if (db->db_buf == NULL) {
			/*
			 * This is a special case: we never associated this
			 * dbuf with any data allocated from the ARC.
			 */
			ASSERT3U(db->db_state, ==, DB_UNCACHED);
			dbuf_evict(db);
		} else if (arc_released(db->db_buf)) {
			arc_buf_t *buf = db->db_buf;
			/*
			 * This dbuf has anonymous data associated with it.
			 */
			dbuf_set_data(db, NULL);
			VERIFY(arc_buf_remove_ref(buf, db) == 1);
			dbuf_evict(db);
		} else {
			VERIFY(arc_buf_remove_ref(db->db_buf, db) == 0);
			if (!DBUF_IS_CACHEABLE(db))
				dbuf_clear(db);
			else
				mutex_exit(&db->db_mtx);
		}
	} else {
		mutex_exit(&db->db_mtx);
	}
}

#pragma weak dmu_buf_refcount = dbuf_refcount
uint64_t
dbuf_refcount(dmu_buf_impl_t *db)
{
	return (refcount_count(&db->db_holds));
}

void *
dmu_buf_set_user(dmu_buf_t *db_fake, void *user_ptr, void *user_data_ptr_ptr,
    dmu_buf_evict_func_t *evict_func)
{
	return (dmu_buf_update_user(db_fake, NULL, user_ptr,
	    user_data_ptr_ptr, evict_func));
}

void *
dmu_buf_set_user_ie(dmu_buf_t *db_fake, void *user_ptr, void *user_data_ptr_ptr,
    dmu_buf_evict_func_t *evict_func)
{
	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;

	db->db_immediate_evict = TRUE;
	return (dmu_buf_update_user(db_fake, NULL, user_ptr,
	    user_data_ptr_ptr, evict_func));
}

void *
dmu_buf_update_user(dmu_buf_t *db_fake, void *old_user_ptr, void *user_ptr,
    void *user_data_ptr_ptr, dmu_buf_evict_func_t *evict_func)
{
	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
	ASSERT(db->db_level == 0);

	ASSERT((user_ptr == NULL) == (evict_func == NULL));

	mutex_enter(&db->db_mtx);

	if (db->db_user_ptr == old_user_ptr) {
		db->db_user_ptr = user_ptr;
		db->db_user_data_ptr_ptr = user_data_ptr_ptr;
		db->db_evict_func = evict_func;

		dbuf_update_data(db);
	} else {
		old_user_ptr = db->db_user_ptr;
	}

	mutex_exit(&db->db_mtx);
	return (old_user_ptr);
}

void *
dmu_buf_get_user(dmu_buf_t *db_fake)
{
	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
	ASSERT(!refcount_is_zero(&db->db_holds));

	return (db->db_user_ptr);
}

static void
dbuf_check_blkptr(dnode_t *dn, dmu_buf_impl_t *db)
{
	/* ASSERT(dmu_tx_is_syncing(tx) */
	ASSERT(MUTEX_HELD(&db->db_mtx));

	if (db->db_blkptr != NULL)
		return;

	if (db->db_level == dn->dn_phys->dn_nlevels-1) {
		/*
		 * This buffer was allocated at a time when there were
		 * no blkptrs available from the dnode, or it was
		 * inappropriate to hook it in (i.e., nlevels mis-match).
		 */
		ASSERT(db->db_blkid < dn->dn_phys->dn_nblkptr);
		ASSERT(db->db_parent == NULL);
		db->db_parent = dn->dn_dbuf;
		db->db_blkptr = &dn->dn_phys->dn_blkptr[db->db_blkid];
		DBUF_VERIFY(db);
	} else {
		dmu_buf_impl_t *parent = db->db_parent;
		int epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;

		ASSERT(dn->dn_phys->dn_nlevels > 1);
		if (parent == NULL) {
			mutex_exit(&db->db_mtx);
			rw_enter(&dn->dn_struct_rwlock, RW_READER);
			(void) dbuf_hold_impl(dn, db->db_level+1,
			    db->db_blkid >> epbs, FALSE, db, &parent);
			rw_exit(&dn->dn_struct_rwlock);
			mutex_enter(&db->db_mtx);
			db->db_parent = parent;
		}
		db->db_blkptr = (blkptr_t *)parent->db.db_data +
		    (db->db_blkid & ((1ULL << epbs) - 1));
		DBUF_VERIFY(db);
	}
}

static void
dbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
{
	dmu_buf_impl_t *db = dr->dr_dbuf;
	dnode_t *dn = db->db_dnode;
	zio_t *zio;

	ASSERT(dmu_tx_is_syncing(tx));

	dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr);

	mutex_enter(&db->db_mtx);

	ASSERT(db->db_level > 0);
	DBUF_VERIFY(db);

	if (db->db_buf == NULL) {
		mutex_exit(&db->db_mtx);
		(void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED);
		mutex_enter(&db->db_mtx);
	}
	ASSERT3U(db->db_state, ==, DB_CACHED);
	ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
	ASSERT(db->db_buf != NULL);

	dbuf_check_blkptr(dn, db);

	db->db_data_pending = dr;

	mutex_exit(&db->db_mtx);
	dbuf_write(dr, db->db_buf, tx);

	zio = dr->dr_zio;
	mutex_enter(&dr->dt.di.dr_mtx);
	dbuf_sync_list(&dr->dt.di.dr_children, tx);
	ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
	mutex_exit(&dr->dt.di.dr_mtx);
	zio_nowait(zio);
}

static void
dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
{
	arc_buf_t **datap = &dr->dt.dl.dr_data;
	dmu_buf_impl_t *db = dr->dr_dbuf;
	dnode_t *dn = db->db_dnode;
	objset_impl_t *os = dn->dn_objset;
	uint64_t txg = tx->tx_txg;
	int blksz;

	ASSERT(dmu_tx_is_syncing(tx));

	dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr);

	mutex_enter(&db->db_mtx);
	/*
	 * To be synced, we must be dirtied.  But we
	 * might have been freed after the dirty.
	 */
	if (db->db_state == DB_UNCACHED) {
		/* This buffer has been freed since it was dirtied */
		ASSERT(db->db.db_data == NULL);
	} else if (db->db_state == DB_FILL) {
		/* This buffer was freed and is now being re-filled */
		ASSERT(db->db.db_data != dr->dt.dl.dr_data);
	} else {
		ASSERT3U(db->db_state, ==, DB_CACHED);
	}
	DBUF_VERIFY(db);

	/*
	 * If this is a bonus buffer, simply copy the bonus data into the
	 * dnode.  It will be written out when the dnode is synced (and it
	 * will be synced, since it must have been dirty for dbuf_sync to
	 * be called).

static void
dbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
{
	dmu_buf_impl_t *db = dr->dr_dbuf;
	dnode_t *dn = db->db_dnode;
	zio_t *zio;

	ASSERT(dmu_tx_is_syncing(tx));

	dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr);

	mutex_enter(&db->db_mtx);

	ASSERT(db->db_level > 0);
	DBUF_VERIFY(db);

	if (db->db_buf == NULL) {
		mutex_exit(&db->db_mtx);
		(void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED);
		mutex_enter(&db->db_mtx);
	}
	ASSERT3U(db->db_state, ==, DB_CACHED);
	ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
	ASSERT(db->db_buf != NULL);

	dbuf_check_blkptr(dn, db);

	db->db_data_pending = dr;

	mutex_exit(&db->db_mtx);
	dbuf_write(dr, db->db_buf, tx);

	zio = dr->dr_zio;
	mutex_enter(&dr->dt.di.dr_mtx);
	dbuf_sync_list(&dr->dt.di.dr_children, tx);
	ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
	mutex_exit(&dr->dt.di.dr_mtx);
	zio_nowait(zio);
}

static void
dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
{
	arc_buf_t **datap = &dr->dt.dl.dr_data;
	dmu_buf_impl_t *db = dr->dr_dbuf;
	dnode_t *dn = db->db_dnode;
	objset_impl_t *os = dn->dn_objset;
	uint64_t txg = tx->tx_txg;
	int blksz;

	ASSERT(dmu_tx_is_syncing(tx));

	dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr);

	mutex_enter(&db->db_mtx);
	/*
	 * To be synced, we must be dirtied.  But we
	 * might have been freed after the dirty.
	 */
	if (db->db_state == DB_UNCACHED) {
		/* This buffer has been freed since it was dirtied */
		ASSERT(db->db.db_data == NULL);
	} else if (db->db_state == DB_FILL) {
		/* This buffer was freed and is now being re-filled */
		ASSERT(db->db.db_data != dr->dt.dl.dr_data);
	} else {
		ASSERT3U(db->db_state, ==, DB_CACHED);
	}
	DBUF_VERIFY(db);

	/*
	 * If this is a bonus buffer, simply copy the bonus data into the
	 * dnode.  It will be written out when the dnode is synced (and it
	 * will be synced, since it must have been dirty for dbuf_sync to
	 * be called).
	 */
	if (db->db_blkid == DB_BONUS_BLKID) {
		dbuf_dirty_record_t **drp;

		ASSERT(*datap != NULL);
		ASSERT3U(db->db_level, ==, 0);
		ASSERT3U(dn->dn_phys->dn_bonuslen, <=, DN_MAX_BONUSLEN);
		bcopy(*datap, DN_BONUS(dn->dn_phys), dn->dn_phys->dn_bonuslen);
		if (*datap != db->db.db_data) {
			zio_buf_free(*datap, DN_MAX_BONUSLEN);
			arc_space_return(DN_MAX_BONUSLEN);
		}
		db->db_data_pending = NULL;
		drp = &db->db_last_dirty;
		while (*drp != dr)
			drp = &(*drp)->dr_next;
		ASSERT(dr->dr_next == NULL);
		*drp = dr->dr_next;
		kmem_free(dr, sizeof (dbuf_dirty_record_t));
		ASSERT(db->db_dirtycnt > 0);
		db->db_dirtycnt -= 1;
		mutex_exit(&db->db_mtx);
		dbuf_rele(db, (void *)(uintptr_t)txg);
		return;
	}

	/*
	 * The call below (dbuf_check_blkptr()) may drop and reacquire the
	 * db_mtx lock, allowing a dmu_sync operation to sneak in.  As a
	 * result, we need to ensure that we don't check the
	 * dr_override_state until we have returned from dbuf_check_blkptr.
	 */
	dbuf_check_blkptr(dn, db);

	/*
	 * If this buffer is in the middle of an immediate write,
	 * wait for the synchronous IO to complete.
	 */
	while (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC) {
		ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT);
		cv_wait(&db->db_changed, &db->db_mtx);
		ASSERT(dr->dt.dl.dr_override_state != DR_NOT_OVERRIDDEN);
	}

	/*
	 * If this dbuf has already been written out via an immediate write,
	 * just complete the write by copying over the new block pointer and
	 * updating the accounting via the write-completion functions.
	 */
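	/*
	 * The zio used below is a stack-allocated "fake" that is never
	 * issued to the ZIO pipeline; only the fields that
	 * dbuf_write_ready() and dbuf_write_done() actually consume
	 * (io_private, io_error, io_bp, io_bp_orig, io_txg) are filled in,
	 * so the normal write-completion path can be reused for data that
	 * was already written by the immediate (dmu_sync) write.
	 */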
	if (dr->dt.dl.dr_override_state == DR_OVERRIDDEN) {
		zio_t zio_fake;

		zio_fake.io_private = &db;
		zio_fake.io_error = 0;
		zio_fake.io_bp = db->db_blkptr;
		zio_fake.io_bp_orig = *db->db_blkptr;
		zio_fake.io_txg = txg;

		*db->db_blkptr = dr->dt.dl.dr_overridden_by;
		dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
		db->db_data_pending = dr;
		dr->dr_zio = &zio_fake;
		mutex_exit(&db->db_mtx);

		if (BP_IS_OLDER(&zio_fake.io_bp_orig, txg))
			(void) dsl_dataset_block_kill(os->os_dsl_dataset,
			    &zio_fake.io_bp_orig, dn->dn_zio, tx);

		dbuf_write_ready(&zio_fake, db->db_buf, db);
		dbuf_write_done(&zio_fake, db->db_buf, db);

		return;
	}

	blksz = arc_buf_size(*datap);

	if (dn->dn_object != DMU_META_DNODE_OBJECT) {
		/*
		 * If this buffer is currently "in use" (i.e., there are
		 * active holds and db_data still references it), then make
		 * a copy before we start the write so that any modifications
		 * from the open txg will not leak into this write.
		 *
		 * NOTE: this copy does not need to be made for objects only
		 * modified in the syncing context (e.g. DNODE blocks).
		 */
		if (refcount_count(&db->db_holds) > 1 && *datap == db->db_buf) {
			arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
			*datap = arc_buf_alloc(os->os_spa, blksz, db, type);
			bcopy(db->db.db_data, (*datap)->b_data, blksz);
		}
	}

	ASSERT(*datap != NULL);
	db->db_data_pending = dr;

	mutex_exit(&db->db_mtx);

	dbuf_write(dr, *datap, tx);

	ASSERT(!list_link_active(&dr->dr_dirty_node));
	if (dn->dn_object == DMU_META_DNODE_OBJECT)
		list_insert_tail(&dn->dn_dirty_records[txg&TXG_MASK], dr);
	else
		zio_nowait(dr->dr_zio);
}

void
dbuf_sync_list(list_t *list, dmu_tx_t *tx)
{
	dbuf_dirty_record_t *dr;

	while (dr = list_head(list)) {
		if (dr->dr_zio != NULL) {
			/*
			 * If we find an already initialized zio then we
			 * are processing the meta-dnode, and we have finished.
			 * The dbufs for all dnodes are put back on the list
			 * during processing, so that we can zio_wait()
			 * these IOs after initiating all child IOs.
			 */
			ASSERT3U(dr->dr_dbuf->db.db_object, ==,
			    DMU_META_DNODE_OBJECT);
			break;
		}
		list_remove(list, dr);
		if (dr->dr_dbuf->db_level > 0)
			dbuf_sync_indirect(dr, tx);
		else
			dbuf_sync_leaf(dr, tx);
	}
}
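
/*
 * Rough sketch of how callers drive this list during sync.  The real
 * callers are dnode_sync() and dmu_objset_sync() in other files; the loop
 * below is illustrative only, not a copy of their code:
 *
 *	dbuf_dirty_record_t *dr;
 *	list_t *list = &dn->dn_dirty_records[txg & TXG_MASK];
 *
 *	dbuf_sync_list(list, tx);
 *
 *	// Meta-dnode only: its level-0 dirty records are re-queued above
 *	// with dr_zio already set, so they can be waited on once all
 *	// child IOs have been initiated.
 *	while ((dr = list_head(list)) != NULL) {
 *		list_remove(list, dr);
 *		zio_wait(dr->dr_zio);
 *	}
 *
 * For ordinary objects the first pass empties the list entirely.
 */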

static void
dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
{
	dmu_buf_impl_t *db = dr->dr_dbuf;
	dnode_t *dn = db->db_dnode;
	objset_impl_t *os = dn->dn_objset;
	dmu_buf_impl_t *parent = db->db_parent;
	uint64_t txg = tx->tx_txg;
	zbookmark_t zb;
	writeprops_t wp = { 0 };
	zio_t *zio;
	int zio_flags;

	if (!BP_IS_HOLE(db->db_blkptr) &&
	    (db->db_level > 0 || dn->dn_type == DMU_OT_DNODE)) {
		/*
		 * Private object buffers are released here rather
		 * than in dbuf_dirty() since they are only modified
		 * in the syncing context and we don't want the
		 * overhead of making multiple copies of the data.
		 */
		arc_release(data, db);
	} else {
		ASSERT(arc_released(data));
		/* XXX why do we need to thaw here? */
		arc_buf_thaw(data);
	}

	if (parent != dn->dn_dbuf) {
		ASSERT(parent && parent->db_data_pending);
		ASSERT(db->db_level == parent->db_level-1);
		ASSERT(arc_released(parent->db_buf));
		zio = parent->db_data_pending->dr_zio;
	} else {
		ASSERT(db->db_level == dn->dn_phys->dn_nlevels-1);
		ASSERT3P(db->db_blkptr, ==,
		    &dn->dn_phys->dn_blkptr[db->db_blkid]);
		zio = dn->dn_zio;
	}

	ASSERT(db->db_level == 0 || data == db->db_buf);
	ASSERT3U(db->db_blkptr->blk_birth, <=, txg);
	ASSERT(zio);

	zb.zb_objset = os->os_dsl_dataset ?
	    os->os_dsl_dataset->ds_object : 0;
	zb.zb_object = db->db.db_object;
	zb.zb_level = db->db_level;
	zb.zb_blkid = db->db_blkid;

	zio_flags = ZIO_FLAG_MUSTSUCCEED;
	if (dmu_ot[dn->dn_type].ot_metadata || zb.zb_level != 0)
		zio_flags |= ZIO_FLAG_METADATA;
	wp.wp_type = dn->dn_type;
	wp.wp_level = db->db_level;
	wp.wp_copies = os->os_copies;
	wp.wp_dncompress = dn->dn_compress;
	wp.wp_oscompress = os->os_compress;
	wp.wp_dnchecksum = dn->dn_checksum;
	wp.wp_oschecksum = os->os_checksum;

	if (BP_IS_OLDER(db->db_blkptr, txg))
		(void) dsl_dataset_block_kill(
		    os->os_dsl_dataset, db->db_blkptr, zio, tx);

	dr->dr_zio = arc_write(zio, os->os_spa, &wp,
	    DBUF_IS_L2CACHEABLE(db), txg, db->db_blkptr,
	    data, dbuf_write_ready, dbuf_write_done, db,
	    ZIO_PRIORITY_ASYNC_WRITE, zio_flags, &zb);
}
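
/*
 * Notes on the write issued above, useful when reading the callbacks that
 * follow: the zio handed to arc_write() as the parent is either the
 * in-flight write zio of the parent indirect block
 * (parent->db_data_pending->dr_zio) or the dnode's dn_zio, so the parent
 * I/O cannot complete before this child does.  The child itself is not
 * issued here; it is stored in dr->dr_zio and later kicked off with
 * zio_nowait() by dbuf_sync_leaf() or dbuf_sync_indirect().
 * dbuf_write_ready() runs once the new block pointer has been filled in,
 * and dbuf_write_done() runs when the write has fully completed.
 */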

/* ARGSUSED */
static void
dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
{
	dmu_buf_impl_t *db = vdb;
	dnode_t *dn = db->db_dnode;
	objset_impl_t *os = dn->dn_objset;
	blkptr_t *bp_orig = &zio->io_bp_orig;
	uint64_t fill = 0;
	int old_size, new_size, i;

	dprintf_dbuf_bp(db, bp_orig, "bp_orig: %s", "");

	old_size = bp_get_dasize(os->os_spa, bp_orig);
	new_size = bp_get_dasize(os->os_spa, zio->io_bp);

	dnode_diduse_space(dn, new_size-old_size);

	if (BP_IS_HOLE(zio->io_bp)) {
		dsl_dataset_t *ds = os->os_dsl_dataset;
		dmu_tx_t *tx = os->os_synctx;

		if (bp_orig->blk_birth == tx->tx_txg)
			(void) dsl_dataset_block_kill(ds, bp_orig, NULL, tx);
		ASSERT3U(db->db_blkptr->blk_fill, ==, 0);
		return;
	}

	mutex_enter(&db->db_mtx);

	if (db->db_level == 0) {
		mutex_enter(&dn->dn_mtx);
		if (db->db_blkid > dn->dn_phys->dn_maxblkid)
			dn->dn_phys->dn_maxblkid = db->db_blkid;
		mutex_exit(&dn->dn_mtx);

		if (dn->dn_type == DMU_OT_DNODE) {
			dnode_phys_t *dnp = db->db.db_data;
			for (i = db->db.db_size >> DNODE_SHIFT; i > 0;
			    i--, dnp++) {
				if (dnp->dn_type != DMU_OT_NONE)
					fill++;
			}
		} else {
			fill = 1;
		}
	} else {
		blkptr_t *bp = db->db.db_data;
		ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
		for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, bp++) {
			if (BP_IS_HOLE(bp))
				continue;
			ASSERT3U(BP_GET_LSIZE(bp), ==,
			    db->db_level == 1 ? dn->dn_datablksz :
			    (1<<dn->dn_phys->dn_indblkshift));
			fill += bp->blk_fill;
		}
	}

	db->db_blkptr->blk_fill = fill;
	BP_SET_TYPE(db->db_blkptr, dn->dn_type);
	BP_SET_LEVEL(db->db_blkptr, db->db_level);

	mutex_exit(&db->db_mtx);

	/* We must do this after we've set the bp's type and level */
	if (!DVA_EQUAL(BP_IDENTITY(zio->io_bp), BP_IDENTITY(bp_orig))) {
		dsl_dataset_t *ds = os->os_dsl_dataset;
		dmu_tx_t *tx = os->os_synctx;

		if (bp_orig->blk_birth == tx->tx_txg)
			(void) dsl_dataset_block_kill(ds, bp_orig, NULL, tx);
		dsl_dataset_block_born(ds, zio->io_bp, tx);
	}
}

/* ARGSUSED */
static void
dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
{
	dmu_buf_impl_t *db = vdb;
	uint64_t txg = zio->io_txg;
	dbuf_dirty_record_t **drp, *dr;

	ASSERT3U(zio->io_error, ==, 0);

	mutex_enter(&db->db_mtx);

	drp = &db->db_last_dirty;
	while ((dr = *drp) != db->db_data_pending)
		drp = &dr->dr_next;
	ASSERT(!list_link_active(&dr->dr_dirty_node));
	ASSERT(dr->dr_txg == txg);
	ASSERT(dr->dr_next == NULL);
	*drp = dr->dr_next;

	if (db->db_level == 0) {
		ASSERT(db->db_blkid != DB_BONUS_BLKID);
		ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN);

		if (dr->dt.dl.dr_data != db->db_buf)
			VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data, db) == 1);
		else if (!BP_IS_HOLE(db->db_blkptr))
			arc_set_callback(db->db_buf, dbuf_do_evict, db);
		else
			ASSERT(arc_released(db->db_buf));
	} else {
		dnode_t *dn = db->db_dnode;

		ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
		ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
		if (!BP_IS_HOLE(db->db_blkptr)) {
			int epbs =
			    dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
			ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==,
			    db->db.db_size);
			ASSERT3U(dn->dn_phys->dn_maxblkid
			    >> (db->db_level * epbs), >=, db->db_blkid);
			arc_set_callback(db->db_buf, dbuf_do_evict, db);
		}
		mutex_destroy(&dr->dt.di.dr_mtx);
		list_destroy(&dr->dt.di.dr_children);
	}
	kmem_free(dr, sizeof (dbuf_dirty_record_t));

	cv_broadcast(&db->db_changed);
	ASSERT(db->db_dirtycnt > 0);
	db->db_dirtycnt -= 1;
	db->db_data_pending = NULL;
	mutex_exit(&db->db_mtx);

	dprintf_dbuf_bp(db, zio->io_bp, "bp: %s", "");

	dbuf_rele(db, (void *)(uintptr_t)txg);
}