1fa9e4066Sahrens /* 2fa9e4066Sahrens * CDDL HEADER START 3fa9e4066Sahrens * 4fa9e4066Sahrens * The contents of this file are subject to the terms of the 5f65e61c0Sahrens * Common Development and Distribution License (the "License"). 6f65e61c0Sahrens * You may not use this file except in compliance with the License. 7fa9e4066Sahrens * 8fa9e4066Sahrens * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9fa9e4066Sahrens * or http://www.opensolaris.org/os/licensing. 10fa9e4066Sahrens * See the License for the specific language governing permissions 11fa9e4066Sahrens * and limitations under the License. 12fa9e4066Sahrens * 13fa9e4066Sahrens * When distributing Covered Code, include this CDDL HEADER in each 14fa9e4066Sahrens * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15fa9e4066Sahrens * If applicable, add the following below this CDDL HEADER, with the 16fa9e4066Sahrens * fields enclosed by brackets "[]" replaced with your own identifying 17fa9e4066Sahrens * information: Portions Copyright [yyyy] [name of copyright owner] 18fa9e4066Sahrens * 19fa9e4066Sahrens * CDDL HEADER END 20fa9e4066Sahrens */ 21fa9e4066Sahrens /* 2206e0070dSMark Shellenbaum * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 233f2366c2SGordon Ross * Copyright 2011 Nexenta Systems, Inc. All rights reserved. 2446e1baa6SMatthew Ahrens * Copyright (c) 2012, 2015 by Delphix. All rights reserved. 25aad02571SSaso Kiselkov * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. 26810e43b2SBill Pijewski * Copyright (c) 2013, Joyent, Inc. All rights reserved. 27bc9014e6SJustin Gibbs * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. 
28*c3d26abcSMatthew Ahrens * Copyright (c) 2014 Integros [integros.com] 29fa9e4066Sahrens */ 30fa9e4066Sahrens 31fa9e4066Sahrens #include <sys/zfs_context.h> 32fa9e4066Sahrens #include <sys/dmu.h> 332f3d8780SMatthew Ahrens #include <sys/dmu_send.h> 34fa9e4066Sahrens #include <sys/dmu_impl.h> 35fa9e4066Sahrens #include <sys/dbuf.h> 36fa9e4066Sahrens #include <sys/dmu_objset.h> 37fa9e4066Sahrens #include <sys/dsl_dataset.h> 38fa9e4066Sahrens #include <sys/dsl_dir.h> 39fa9e4066Sahrens #include <sys/dmu_tx.h> 40fa9e4066Sahrens #include <sys/spa.h> 41fa9e4066Sahrens #include <sys/zio.h> 42fa9e4066Sahrens #include <sys/dmu_zfetch.h> 430a586ceaSMark Shellenbaum #include <sys/sa.h> 440a586ceaSMark Shellenbaum #include <sys/sa_impl.h> 455d7b4d43SMatthew Ahrens #include <sys/zfeature.h> 465d7b4d43SMatthew Ahrens #include <sys/blkptr.h> 47bf16b11eSMatthew Ahrens #include <sys/range_tree.h> 48fa9e4066Sahrens 49713d6c20SMatthew Ahrens /* 50713d6c20SMatthew Ahrens * Number of times that zfs_free_range() took the slow path while doing 51713d6c20SMatthew Ahrens * a zfs receive. A nonzero value indicates a potential performance problem. 52713d6c20SMatthew Ahrens */ 53713d6c20SMatthew Ahrens uint64_t zfs_free_range_recv_miss; 54713d6c20SMatthew Ahrens 55fa9e4066Sahrens static void dbuf_destroy(dmu_buf_impl_t *db); 563b2aab18SMatthew Ahrens static boolean_t dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx); 57088f3894Sahrens static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx); 58fa9e4066Sahrens 59bc9014e6SJustin Gibbs #ifndef __lint 60bc9014e6SJustin Gibbs extern inline void dmu_buf_init_user(dmu_buf_user_t *dbu, 61bc9014e6SJustin Gibbs dmu_buf_evict_func_t *evict_func, dmu_buf_t **clear_on_evict_dbufp); 62bc9014e6SJustin Gibbs #endif /* ! __lint */ 63bc9014e6SJustin Gibbs 64fa9e4066Sahrens /* 65fa9e4066Sahrens * Global data structures and functions for the dbuf cache. 
66fa9e4066Sahrens */ 67fa9e4066Sahrens static kmem_cache_t *dbuf_cache; 68bc9014e6SJustin Gibbs static taskq_t *dbu_evict_taskq; 69fa9e4066Sahrens 70fa9e4066Sahrens /* ARGSUSED */ 71fa9e4066Sahrens static int 72fa9e4066Sahrens dbuf_cons(void *vdb, void *unused, int kmflag) 73fa9e4066Sahrens { 74fa9e4066Sahrens dmu_buf_impl_t *db = vdb; 75fa9e4066Sahrens bzero(db, sizeof (dmu_buf_impl_t)); 76fa9e4066Sahrens 77fa9e4066Sahrens mutex_init(&db->db_mtx, NULL, MUTEX_DEFAULT, NULL); 78fa9e4066Sahrens cv_init(&db->db_changed, NULL, CV_DEFAULT, NULL); 79fa9e4066Sahrens refcount_create(&db->db_holds); 800f6d88adSAlex Reece 81fa9e4066Sahrens return (0); 82fa9e4066Sahrens } 83fa9e4066Sahrens 84fa9e4066Sahrens /* ARGSUSED */ 85fa9e4066Sahrens static void 86fa9e4066Sahrens dbuf_dest(void *vdb, void *unused) 87fa9e4066Sahrens { 88fa9e4066Sahrens dmu_buf_impl_t *db = vdb; 89fa9e4066Sahrens mutex_destroy(&db->db_mtx); 90fa9e4066Sahrens cv_destroy(&db->db_changed); 91fa9e4066Sahrens refcount_destroy(&db->db_holds); 92fa9e4066Sahrens } 93fa9e4066Sahrens 94fa9e4066Sahrens /* 95fa9e4066Sahrens * dbuf hash table routines 96fa9e4066Sahrens */ 97fa9e4066Sahrens static dbuf_hash_table_t dbuf_hash_table; 98fa9e4066Sahrens 99fa9e4066Sahrens static uint64_t dbuf_hash_count; 100fa9e4066Sahrens 101fa9e4066Sahrens static uint64_t 102fa9e4066Sahrens dbuf_hash(void *os, uint64_t obj, uint8_t lvl, uint64_t blkid) 103fa9e4066Sahrens { 104fa9e4066Sahrens uintptr_t osv = (uintptr_t)os; 105fa9e4066Sahrens uint64_t crc = -1ULL; 106fa9e4066Sahrens 107fa9e4066Sahrens ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY); 108fa9e4066Sahrens crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (lvl)) & 0xFF]; 109fa9e4066Sahrens crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (osv >> 6)) & 0xFF]; 110fa9e4066Sahrens crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 0)) & 0xFF]; 111fa9e4066Sahrens crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 8)) & 0xFF]; 112fa9e4066Sahrens crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 0)) & 
0xFF]; 113fa9e4066Sahrens crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 8)) & 0xFF]; 114fa9e4066Sahrens 115fa9e4066Sahrens crc ^= (osv>>14) ^ (obj>>16) ^ (blkid>>16); 116fa9e4066Sahrens 117fa9e4066Sahrens return (crc); 118fa9e4066Sahrens } 119fa9e4066Sahrens 120fa9e4066Sahrens #define DBUF_HASH(os, obj, level, blkid) dbuf_hash(os, obj, level, blkid); 121fa9e4066Sahrens 122fa9e4066Sahrens #define DBUF_EQUAL(dbuf, os, obj, level, blkid) \ 123fa9e4066Sahrens ((dbuf)->db.db_object == (obj) && \ 124fa9e4066Sahrens (dbuf)->db_objset == (os) && \ 125fa9e4066Sahrens (dbuf)->db_level == (level) && \ 126fa9e4066Sahrens (dbuf)->db_blkid == (blkid)) 127fa9e4066Sahrens 128fa9e4066Sahrens dmu_buf_impl_t * 129e57a022bSJustin T. Gibbs dbuf_find(objset_t *os, uint64_t obj, uint8_t level, uint64_t blkid) 130fa9e4066Sahrens { 131fa9e4066Sahrens dbuf_hash_table_t *h = &dbuf_hash_table; 132fa9e4066Sahrens uint64_t hv = DBUF_HASH(os, obj, level, blkid); 133fa9e4066Sahrens uint64_t idx = hv & h->hash_table_mask; 134fa9e4066Sahrens dmu_buf_impl_t *db; 135fa9e4066Sahrens 136fa9e4066Sahrens mutex_enter(DBUF_HASH_MUTEX(h, idx)); 137fa9e4066Sahrens for (db = h->hash_table[idx]; db != NULL; db = db->db_hash_next) { 138fa9e4066Sahrens if (DBUF_EQUAL(db, os, obj, level, blkid)) { 139fa9e4066Sahrens mutex_enter(&db->db_mtx); 140ea8dc4b6Seschrock if (db->db_state != DB_EVICTING) { 141fa9e4066Sahrens mutex_exit(DBUF_HASH_MUTEX(h, idx)); 142fa9e4066Sahrens return (db); 143fa9e4066Sahrens } 144fa9e4066Sahrens mutex_exit(&db->db_mtx); 145fa9e4066Sahrens } 146fa9e4066Sahrens } 147fa9e4066Sahrens mutex_exit(DBUF_HASH_MUTEX(h, idx)); 148fa9e4066Sahrens return (NULL); 149fa9e4066Sahrens } 150fa9e4066Sahrens 151e57a022bSJustin T. Gibbs static dmu_buf_impl_t * 152e57a022bSJustin T. Gibbs dbuf_find_bonus(objset_t *os, uint64_t object) 153e57a022bSJustin T. Gibbs { 154e57a022bSJustin T. Gibbs dnode_t *dn; 155e57a022bSJustin T. Gibbs dmu_buf_impl_t *db = NULL; 156e57a022bSJustin T. 
Gibbs 157e57a022bSJustin T. Gibbs if (dnode_hold(os, object, FTAG, &dn) == 0) { 158e57a022bSJustin T. Gibbs rw_enter(&dn->dn_struct_rwlock, RW_READER); 159e57a022bSJustin T. Gibbs if (dn->dn_bonus != NULL) { 160e57a022bSJustin T. Gibbs db = dn->dn_bonus; 161e57a022bSJustin T. Gibbs mutex_enter(&db->db_mtx); 162e57a022bSJustin T. Gibbs } 163e57a022bSJustin T. Gibbs rw_exit(&dn->dn_struct_rwlock); 164e57a022bSJustin T. Gibbs dnode_rele(dn, FTAG); 165e57a022bSJustin T. Gibbs } 166e57a022bSJustin T. Gibbs return (db); 167e57a022bSJustin T. Gibbs } 168e57a022bSJustin T. Gibbs 169fa9e4066Sahrens /* 170fa9e4066Sahrens * Insert an entry into the hash table. If there is already an element 171fa9e4066Sahrens * equal to elem in the hash table, then the already existing element 172fa9e4066Sahrens * will be returned and the new element will not be inserted. 173fa9e4066Sahrens * Otherwise returns NULL. 174fa9e4066Sahrens */ 175fa9e4066Sahrens static dmu_buf_impl_t * 176fa9e4066Sahrens dbuf_hash_insert(dmu_buf_impl_t *db) 177fa9e4066Sahrens { 178fa9e4066Sahrens dbuf_hash_table_t *h = &dbuf_hash_table; 179503ad85cSMatthew Ahrens objset_t *os = db->db_objset; 180fa9e4066Sahrens uint64_t obj = db->db.db_object; 181fa9e4066Sahrens int level = db->db_level; 182fa9e4066Sahrens uint64_t blkid = db->db_blkid; 183fa9e4066Sahrens uint64_t hv = DBUF_HASH(os, obj, level, blkid); 184fa9e4066Sahrens uint64_t idx = hv & h->hash_table_mask; 185fa9e4066Sahrens dmu_buf_impl_t *dbf; 186fa9e4066Sahrens 187fa9e4066Sahrens mutex_enter(DBUF_HASH_MUTEX(h, idx)); 188fa9e4066Sahrens for (dbf = h->hash_table[idx]; dbf != NULL; dbf = dbf->db_hash_next) { 189fa9e4066Sahrens if (DBUF_EQUAL(dbf, os, obj, level, blkid)) { 190fa9e4066Sahrens mutex_enter(&dbf->db_mtx); 191ea8dc4b6Seschrock if (dbf->db_state != DB_EVICTING) { 192fa9e4066Sahrens mutex_exit(DBUF_HASH_MUTEX(h, idx)); 193fa9e4066Sahrens return (dbf); 194fa9e4066Sahrens } 195fa9e4066Sahrens mutex_exit(&dbf->db_mtx); 196fa9e4066Sahrens } 
197fa9e4066Sahrens } 198fa9e4066Sahrens 199fa9e4066Sahrens mutex_enter(&db->db_mtx); 200fa9e4066Sahrens db->db_hash_next = h->hash_table[idx]; 201fa9e4066Sahrens h->hash_table[idx] = db; 202fa9e4066Sahrens mutex_exit(DBUF_HASH_MUTEX(h, idx)); 2031a5e258fSJosef 'Jeff' Sipek atomic_inc_64(&dbuf_hash_count); 204fa9e4066Sahrens 205fa9e4066Sahrens return (NULL); 206fa9e4066Sahrens } 207fa9e4066Sahrens 208fa9e4066Sahrens /* 209bbfa8ea8SMatthew Ahrens * Remove an entry from the hash table. It must be in the EVICTING state. 210fa9e4066Sahrens */ 211fa9e4066Sahrens static void 212fa9e4066Sahrens dbuf_hash_remove(dmu_buf_impl_t *db) 213fa9e4066Sahrens { 214fa9e4066Sahrens dbuf_hash_table_t *h = &dbuf_hash_table; 215fa9e4066Sahrens uint64_t hv = DBUF_HASH(db->db_objset, db->db.db_object, 216fa9e4066Sahrens db->db_level, db->db_blkid); 217fa9e4066Sahrens uint64_t idx = hv & h->hash_table_mask; 218fa9e4066Sahrens dmu_buf_impl_t *dbf, **dbp; 219fa9e4066Sahrens 220fa9e4066Sahrens /* 221bbfa8ea8SMatthew Ahrens * We musn't hold db_mtx to maintain lock ordering: 222fa9e4066Sahrens * DBUF_HASH_MUTEX > db_mtx. 
223fa9e4066Sahrens */ 224fa9e4066Sahrens ASSERT(refcount_is_zero(&db->db_holds)); 225ea8dc4b6Seschrock ASSERT(db->db_state == DB_EVICTING); 226fa9e4066Sahrens ASSERT(!MUTEX_HELD(&db->db_mtx)); 227fa9e4066Sahrens 228fa9e4066Sahrens mutex_enter(DBUF_HASH_MUTEX(h, idx)); 229fa9e4066Sahrens dbp = &h->hash_table[idx]; 230fa9e4066Sahrens while ((dbf = *dbp) != db) { 231fa9e4066Sahrens dbp = &dbf->db_hash_next; 232fa9e4066Sahrens ASSERT(dbf != NULL); 233fa9e4066Sahrens } 234fa9e4066Sahrens *dbp = db->db_hash_next; 235fa9e4066Sahrens db->db_hash_next = NULL; 236fa9e4066Sahrens mutex_exit(DBUF_HASH_MUTEX(h, idx)); 2371a5e258fSJosef 'Jeff' Sipek atomic_dec_64(&dbuf_hash_count); 238fa9e4066Sahrens } 239fa9e4066Sahrens 240ea8dc4b6Seschrock static arc_evict_func_t dbuf_do_evict; 241fa9e4066Sahrens 242bc9014e6SJustin Gibbs typedef enum { 243bc9014e6SJustin Gibbs DBVU_EVICTING, 244bc9014e6SJustin Gibbs DBVU_NOT_EVICTING 245bc9014e6SJustin Gibbs } dbvu_verify_type_t; 246bc9014e6SJustin Gibbs 247bc9014e6SJustin Gibbs static void 248bc9014e6SJustin Gibbs dbuf_verify_user(dmu_buf_impl_t *db, dbvu_verify_type_t verify_type) 249bc9014e6SJustin Gibbs { 250bc9014e6SJustin Gibbs #ifdef ZFS_DEBUG 251bc9014e6SJustin Gibbs int64_t holds; 252bc9014e6SJustin Gibbs 253bc9014e6SJustin Gibbs if (db->db_user == NULL) 254bc9014e6SJustin Gibbs return; 255bc9014e6SJustin Gibbs 256bc9014e6SJustin Gibbs /* Only data blocks support the attachment of user data. */ 257bc9014e6SJustin Gibbs ASSERT(db->db_level == 0); 258bc9014e6SJustin Gibbs 259bc9014e6SJustin Gibbs /* Clients must resolve a dbuf before attaching user data. */ 260bc9014e6SJustin Gibbs ASSERT(db->db.db_data != NULL); 261bc9014e6SJustin Gibbs ASSERT3U(db->db_state, ==, DB_CACHED); 262bc9014e6SJustin Gibbs 263bc9014e6SJustin Gibbs holds = refcount_count(&db->db_holds); 264bc9014e6SJustin Gibbs if (verify_type == DBVU_EVICTING) { 265bc9014e6SJustin Gibbs /* 266bc9014e6SJustin Gibbs * Immediate eviction occurs when holds == dirtycnt. 
267bc9014e6SJustin Gibbs * For normal eviction buffers, holds is zero on 268bc9014e6SJustin Gibbs * eviction, except when dbuf_fix_old_data() calls 269bc9014e6SJustin Gibbs * dbuf_clear_data(). However, the hold count can grow 270bc9014e6SJustin Gibbs * during eviction even though db_mtx is held (see 271bc9014e6SJustin Gibbs * dmu_bonus_hold() for an example), so we can only 272bc9014e6SJustin Gibbs * test the generic invariant that holds >= dirtycnt. 273bc9014e6SJustin Gibbs */ 274bc9014e6SJustin Gibbs ASSERT3U(holds, >=, db->db_dirtycnt); 275bc9014e6SJustin Gibbs } else { 276d2058105SJustin T. Gibbs if (db->db_user_immediate_evict == TRUE) 277bc9014e6SJustin Gibbs ASSERT3U(holds, >=, db->db_dirtycnt); 278bc9014e6SJustin Gibbs else 279bc9014e6SJustin Gibbs ASSERT3U(holds, >, 0); 280bc9014e6SJustin Gibbs } 281bc9014e6SJustin Gibbs #endif 282bc9014e6SJustin Gibbs } 283bc9014e6SJustin Gibbs 284fa9e4066Sahrens static void 285fa9e4066Sahrens dbuf_evict_user(dmu_buf_impl_t *db) 286fa9e4066Sahrens { 287bc9014e6SJustin Gibbs dmu_buf_user_t *dbu = db->db_user; 288bc9014e6SJustin Gibbs 289fa9e4066Sahrens ASSERT(MUTEX_HELD(&db->db_mtx)); 290fa9e4066Sahrens 291bc9014e6SJustin Gibbs if (dbu == NULL) 292fa9e4066Sahrens return; 293fa9e4066Sahrens 294bc9014e6SJustin Gibbs dbuf_verify_user(db, DBVU_EVICTING); 295bc9014e6SJustin Gibbs db->db_user = NULL; 296bc9014e6SJustin Gibbs 297bc9014e6SJustin Gibbs #ifdef ZFS_DEBUG 298bc9014e6SJustin Gibbs if (dbu->dbu_clear_on_evict_dbufp != NULL) 299bc9014e6SJustin Gibbs *dbu->dbu_clear_on_evict_dbufp = NULL; 300bc9014e6SJustin Gibbs #endif 301bc9014e6SJustin Gibbs 302bc9014e6SJustin Gibbs /* 303bc9014e6SJustin Gibbs * Invoke the callback from a taskq to avoid lock order reversals 304bc9014e6SJustin Gibbs * and limit stack depth. 
305bc9014e6SJustin Gibbs */ 306bc9014e6SJustin Gibbs taskq_dispatch_ent(dbu_evict_taskq, dbu->dbu_evict_func, dbu, 0, 307bc9014e6SJustin Gibbs &dbu->dbu_tqent); 308fa9e4066Sahrens } 309fa9e4066Sahrens 310744947dcSTom Erickson boolean_t 311744947dcSTom Erickson dbuf_is_metadata(dmu_buf_impl_t *db) 312744947dcSTom Erickson { 313744947dcSTom Erickson if (db->db_level > 0) { 314744947dcSTom Erickson return (B_TRUE); 315744947dcSTom Erickson } else { 316744947dcSTom Erickson boolean_t is_metadata; 317744947dcSTom Erickson 318744947dcSTom Erickson DB_DNODE_ENTER(db); 319ad135b5dSChristopher Siden is_metadata = DMU_OT_IS_METADATA(DB_DNODE(db)->dn_type); 320744947dcSTom Erickson DB_DNODE_EXIT(db); 321744947dcSTom Erickson 322744947dcSTom Erickson return (is_metadata); 323744947dcSTom Erickson } 324744947dcSTom Erickson } 325744947dcSTom Erickson 326fa9e4066Sahrens void 327ea8dc4b6Seschrock dbuf_evict(dmu_buf_impl_t *db) 328ea8dc4b6Seschrock { 329ea8dc4b6Seschrock ASSERT(MUTEX_HELD(&db->db_mtx)); 330ea8dc4b6Seschrock ASSERT(db->db_buf == NULL); 331c717a561Smaybee ASSERT(db->db_data_pending == NULL); 332ea8dc4b6Seschrock 333ea8dc4b6Seschrock dbuf_clear(db); 334ea8dc4b6Seschrock dbuf_destroy(db); 335ea8dc4b6Seschrock } 336ea8dc4b6Seschrock 337ea8dc4b6Seschrock void 338fa9e4066Sahrens dbuf_init(void) 339fa9e4066Sahrens { 340ea8dc4b6Seschrock uint64_t hsize = 1ULL << 16; 341fa9e4066Sahrens dbuf_hash_table_t *h = &dbuf_hash_table; 342fa9e4066Sahrens int i; 343fa9e4066Sahrens 344fa9e4066Sahrens /* 345fa9e4066Sahrens * The hash table is big enough to fill all of physical memory 346ea8dc4b6Seschrock * with an average 4K block size. The table will take up 347ea8dc4b6Seschrock * totalmem*sizeof(void*)/4K (i.e. 2MB/GB with 8-byte pointers). 
348fa9e4066Sahrens */ 349ea8dc4b6Seschrock while (hsize * 4096 < physmem * PAGESIZE) 350fa9e4066Sahrens hsize <<= 1; 351fa9e4066Sahrens 352ea8dc4b6Seschrock retry: 353fa9e4066Sahrens h->hash_table_mask = hsize - 1; 354ea8dc4b6Seschrock h->hash_table = kmem_zalloc(hsize * sizeof (void *), KM_NOSLEEP); 355ea8dc4b6Seschrock if (h->hash_table == NULL) { 356ea8dc4b6Seschrock /* XXX - we should really return an error instead of assert */ 357ea8dc4b6Seschrock ASSERT(hsize > (1ULL << 10)); 358ea8dc4b6Seschrock hsize >>= 1; 359ea8dc4b6Seschrock goto retry; 360ea8dc4b6Seschrock } 361fa9e4066Sahrens 362fa9e4066Sahrens dbuf_cache = kmem_cache_create("dmu_buf_impl_t", 363fa9e4066Sahrens sizeof (dmu_buf_impl_t), 364fa9e4066Sahrens 0, dbuf_cons, dbuf_dest, NULL, NULL, NULL, 0); 365fa9e4066Sahrens 366fa9e4066Sahrens for (i = 0; i < DBUF_MUTEXES; i++) 367fa9e4066Sahrens mutex_init(&h->hash_mutexes[i], NULL, MUTEX_DEFAULT, NULL); 368bc9014e6SJustin Gibbs 369bc9014e6SJustin Gibbs /* 370bc9014e6SJustin Gibbs * All entries are queued via taskq_dispatch_ent(), so min/maxalloc 371bc9014e6SJustin Gibbs * configuration is not required. 372bc9014e6SJustin Gibbs */ 373bc9014e6SJustin Gibbs dbu_evict_taskq = taskq_create("dbu_evict", 1, minclsyspri, 0, 0, 0); 374fa9e4066Sahrens } 375fa9e4066Sahrens 376fa9e4066Sahrens void 377fa9e4066Sahrens dbuf_fini(void) 378fa9e4066Sahrens { 379fa9e4066Sahrens dbuf_hash_table_t *h = &dbuf_hash_table; 380fa9e4066Sahrens int i; 381fa9e4066Sahrens 382fa9e4066Sahrens for (i = 0; i < DBUF_MUTEXES; i++) 383fa9e4066Sahrens mutex_destroy(&h->hash_mutexes[i]); 384fa9e4066Sahrens kmem_free(h->hash_table, (h->hash_table_mask + 1) * sizeof (void *)); 385fa9e4066Sahrens kmem_cache_destroy(dbuf_cache); 386bc9014e6SJustin Gibbs taskq_destroy(dbu_evict_taskq); 387fa9e4066Sahrens } 388fa9e4066Sahrens 389fa9e4066Sahrens /* 390fa9e4066Sahrens * Other stuff. 
391fa9e4066Sahrens */ 392fa9e4066Sahrens 3939c9dc39aSek110237 #ifdef ZFS_DEBUG 394fa9e4066Sahrens static void 395fa9e4066Sahrens dbuf_verify(dmu_buf_impl_t *db) 396fa9e4066Sahrens { 397744947dcSTom Erickson dnode_t *dn; 398b24ab676SJeff Bonwick dbuf_dirty_record_t *dr; 399fa9e4066Sahrens 400fa9e4066Sahrens ASSERT(MUTEX_HELD(&db->db_mtx)); 401fa9e4066Sahrens 402fa9e4066Sahrens if (!(zfs_flags & ZFS_DEBUG_DBUF_VERIFY)) 403fa9e4066Sahrens return; 404fa9e4066Sahrens 405fa9e4066Sahrens ASSERT(db->db_objset != NULL); 406744947dcSTom Erickson DB_DNODE_ENTER(db); 407744947dcSTom Erickson dn = DB_DNODE(db); 408fa9e4066Sahrens if (dn == NULL) { 409fa9e4066Sahrens ASSERT(db->db_parent == NULL); 410fa9e4066Sahrens ASSERT(db->db_blkptr == NULL); 411fa9e4066Sahrens } else { 412fa9e4066Sahrens ASSERT3U(db->db.db_object, ==, dn->dn_object); 413fa9e4066Sahrens ASSERT3P(db->db_objset, ==, dn->dn_objset); 414fa9e4066Sahrens ASSERT3U(db->db_level, <, dn->dn_nlevels); 415744947dcSTom Erickson ASSERT(db->db_blkid == DMU_BONUS_BLKID || 416744947dcSTom Erickson db->db_blkid == DMU_SPILL_BLKID || 4170f6d88adSAlex Reece !avl_is_empty(&dn->dn_dbufs)); 418fa9e4066Sahrens } 4190a586ceaSMark Shellenbaum if (db->db_blkid == DMU_BONUS_BLKID) { 420fa9e4066Sahrens ASSERT(dn != NULL); 4211934e92fSmaybee ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen); 4220a586ceaSMark Shellenbaum ASSERT3U(db->db.db_offset, ==, DMU_BONUS_BLKID); 4230a586ceaSMark Shellenbaum } else if (db->db_blkid == DMU_SPILL_BLKID) { 4240a586ceaSMark Shellenbaum ASSERT(dn != NULL); 4250a586ceaSMark Shellenbaum ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen); 426fb09f5aaSMadhav Suresh ASSERT0(db->db.db_offset); 427fa9e4066Sahrens } else { 428fa9e4066Sahrens ASSERT3U(db->db.db_offset, ==, db->db_blkid * db->db.db_size); 429fa9e4066Sahrens } 430fa9e4066Sahrens 431b24ab676SJeff Bonwick for (dr = db->db_data_pending; dr != NULL; dr = dr->dr_next) 432b24ab676SJeff Bonwick ASSERT(dr->dr_dbuf == db); 433b24ab676SJeff Bonwick 434b24ab676SJeff 
Bonwick for (dr = db->db_last_dirty; dr != NULL; dr = dr->dr_next) 435b24ab676SJeff Bonwick ASSERT(dr->dr_dbuf == db); 436b24ab676SJeff Bonwick 43788b7b0f2SMatthew Ahrens /* 43888b7b0f2SMatthew Ahrens * We can't assert that db_size matches dn_datablksz because it 43988b7b0f2SMatthew Ahrens * can be momentarily different when another thread is doing 44088b7b0f2SMatthew Ahrens * dnode_set_blksz(). 44188b7b0f2SMatthew Ahrens */ 44288b7b0f2SMatthew Ahrens if (db->db_level == 0 && db->db.db_object == DMU_META_DNODE_OBJECT) { 443b24ab676SJeff Bonwick dr = db->db_data_pending; 444fa9e4066Sahrens /* 44588b7b0f2SMatthew Ahrens * It should only be modified in syncing context, so 44688b7b0f2SMatthew Ahrens * make sure we only have one copy of the data. 447fa9e4066Sahrens */ 448c717a561Smaybee ASSERT(dr == NULL || dr->dt.dl.dr_data == db->db_buf); 449fa9e4066Sahrens } 450fa9e4066Sahrens 451fa9e4066Sahrens /* verify db->db_blkptr */ 452fa9e4066Sahrens if (db->db_blkptr) { 453fa9e4066Sahrens if (db->db_parent == dn->dn_dbuf) { 454fa9e4066Sahrens /* db is pointed to by the dnode */ 455fa9e4066Sahrens /* ASSERT3U(db->db_blkid, <, dn->dn_nblkptr); */ 45614843421SMatthew Ahrens if (DMU_OBJECT_IS_SPECIAL(db->db.db_object)) 457fa9e4066Sahrens ASSERT(db->db_parent == NULL); 458fa9e4066Sahrens else 459fa9e4066Sahrens ASSERT(db->db_parent != NULL); 4600a586ceaSMark Shellenbaum if (db->db_blkid != DMU_SPILL_BLKID) 461fa9e4066Sahrens ASSERT3P(db->db_blkptr, ==, 462fa9e4066Sahrens &dn->dn_phys->dn_blkptr[db->db_blkid]); 463fa9e4066Sahrens } else { 464fa9e4066Sahrens /* db is pointed to by an indirect block */ 465fa9e4066Sahrens int epb = db->db_parent->db.db_size >> SPA_BLKPTRSHIFT; 466fa9e4066Sahrens ASSERT3U(db->db_parent->db_level, ==, db->db_level+1); 467fa9e4066Sahrens ASSERT3U(db->db_parent->db.db_object, ==, 468fa9e4066Sahrens db->db.db_object); 469fa9e4066Sahrens /* 470fa9e4066Sahrens * dnode_grow_indblksz() can make this fail if we don't 471fa9e4066Sahrens * have the struct_rwlock. 
XXX indblksz no longer 472fa9e4066Sahrens * grows. safe to do this now? 473fa9e4066Sahrens */ 474744947dcSTom Erickson if (RW_WRITE_HELD(&dn->dn_struct_rwlock)) { 475fa9e4066Sahrens ASSERT3P(db->db_blkptr, ==, 476fa9e4066Sahrens ((blkptr_t *)db->db_parent->db.db_data + 477fa9e4066Sahrens db->db_blkid % epb)); 478fa9e4066Sahrens } 479fa9e4066Sahrens } 480fa9e4066Sahrens } 481fa9e4066Sahrens if ((db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr)) && 4823f9d6ad7SLin Ling (db->db_buf == NULL || db->db_buf->b_data) && 4830a586ceaSMark Shellenbaum db->db.db_data && db->db_blkid != DMU_BONUS_BLKID && 484fa9e4066Sahrens db->db_state != DB_FILL && !dn->dn_free_txg) { 485fa9e4066Sahrens /* 486fa9e4066Sahrens * If the blkptr isn't set but they have nonzero data, 487fa9e4066Sahrens * it had better be dirty, otherwise we'll lose that 488fa9e4066Sahrens * data when we evict this buffer. 489fa9e4066Sahrens */ 490fa9e4066Sahrens if (db->db_dirtycnt == 0) { 491fa9e4066Sahrens uint64_t *buf = db->db.db_data; 492fa9e4066Sahrens int i; 493fa9e4066Sahrens 494fa9e4066Sahrens for (i = 0; i < db->db.db_size >> 3; i++) { 495fa9e4066Sahrens ASSERT(buf[i] == 0); 496fa9e4066Sahrens } 497fa9e4066Sahrens } 498fa9e4066Sahrens } 499744947dcSTom Erickson DB_DNODE_EXIT(db); 500fa9e4066Sahrens } 5019c9dc39aSek110237 #endif 502fa9e4066Sahrens 503fa9e4066Sahrens static void 504bc9014e6SJustin Gibbs dbuf_clear_data(dmu_buf_impl_t *db) 505fa9e4066Sahrens { 506fa9e4066Sahrens ASSERT(MUTEX_HELD(&db->db_mtx)); 507ea8dc4b6Seschrock dbuf_evict_user(db); 508bc9014e6SJustin Gibbs db->db_buf = NULL; 509ea8dc4b6Seschrock db->db.db_data = NULL; 51082c9918fSTim Haley if (db->db_state != DB_NOFILL) 511ea8dc4b6Seschrock db->db_state = DB_UNCACHED; 512ea8dc4b6Seschrock } 513bc9014e6SJustin Gibbs 514bc9014e6SJustin Gibbs static void 515bc9014e6SJustin Gibbs dbuf_set_data(dmu_buf_impl_t *db, arc_buf_t *buf) 516bc9014e6SJustin Gibbs { 517bc9014e6SJustin Gibbs ASSERT(MUTEX_HELD(&db->db_mtx)); 518bc9014e6SJustin Gibbs 
ASSERT(buf != NULL); 519bc9014e6SJustin Gibbs 520bc9014e6SJustin Gibbs db->db_buf = buf; 521bc9014e6SJustin Gibbs ASSERT(buf->b_data != NULL); 522bc9014e6SJustin Gibbs db->db.db_data = buf->b_data; 523bc9014e6SJustin Gibbs if (!arc_released(buf)) 524bc9014e6SJustin Gibbs arc_set_callback(buf, dbuf_do_evict, db); 525fa9e4066Sahrens } 526fa9e4066Sahrens 527c242f9a0Schunli zhang - Sun Microsystems - Irvine United States /* 528c242f9a0Schunli zhang - Sun Microsystems - Irvine United States * Loan out an arc_buf for read. Return the loaned arc_buf. 529c242f9a0Schunli zhang - Sun Microsystems - Irvine United States */ 530c242f9a0Schunli zhang - Sun Microsystems - Irvine United States arc_buf_t * 531c242f9a0Schunli zhang - Sun Microsystems - Irvine United States dbuf_loan_arcbuf(dmu_buf_impl_t *db) 532c242f9a0Schunli zhang - Sun Microsystems - Irvine United States { 533c242f9a0Schunli zhang - Sun Microsystems - Irvine United States arc_buf_t *abuf; 534c242f9a0Schunli zhang - Sun Microsystems - Irvine United States 535c242f9a0Schunli zhang - Sun Microsystems - Irvine United States mutex_enter(&db->db_mtx); 536c242f9a0Schunli zhang - Sun Microsystems - Irvine United States if (arc_released(db->db_buf) || refcount_count(&db->db_holds) > 1) { 537c242f9a0Schunli zhang - Sun Microsystems - Irvine United States int blksz = db->db.db_size; 53843466aaeSMax Grossman spa_t *spa = db->db_objset->os_spa; 539744947dcSTom Erickson 540c242f9a0Schunli zhang - Sun Microsystems - Irvine United States mutex_exit(&db->db_mtx); 541744947dcSTom Erickson abuf = arc_loan_buf(spa, blksz); 542c242f9a0Schunli zhang - Sun Microsystems - Irvine United States bcopy(db->db.db_data, abuf->b_data, blksz); 543c242f9a0Schunli zhang - Sun Microsystems - Irvine United States } else { 544c242f9a0Schunli zhang - Sun Microsystems - Irvine United States abuf = db->db_buf; 545c242f9a0Schunli zhang - Sun Microsystems - Irvine United States arc_loan_inuse_buf(abuf, db); 546bc9014e6SJustin Gibbs dbuf_clear_data(db); 
547c242f9a0Schunli zhang - Sun Microsystems - Irvine United States mutex_exit(&db->db_mtx); 548c242f9a0Schunli zhang - Sun Microsystems - Irvine United States } 549c242f9a0Schunli zhang - Sun Microsystems - Irvine United States return (abuf); 550c242f9a0Schunli zhang - Sun Microsystems - Irvine United States } 551c242f9a0Schunli zhang - Sun Microsystems - Irvine United States 552a2cdcdd2SPaul Dagnelie /* 553a2cdcdd2SPaul Dagnelie * Calculate which level n block references the data at the level 0 offset 554a2cdcdd2SPaul Dagnelie * provided. 555a2cdcdd2SPaul Dagnelie */ 556fa9e4066Sahrens uint64_t 557a2cdcdd2SPaul Dagnelie dbuf_whichblock(dnode_t *dn, int64_t level, uint64_t offset) 558fa9e4066Sahrens { 559a2cdcdd2SPaul Dagnelie if (dn->dn_datablkshift != 0 && dn->dn_indblkshift != 0) { 560a2cdcdd2SPaul Dagnelie /* 561a2cdcdd2SPaul Dagnelie * The level n blkid is equal to the level 0 blkid divided by 562a2cdcdd2SPaul Dagnelie * the number of level 0s in a level n block. 563a2cdcdd2SPaul Dagnelie * 564a2cdcdd2SPaul Dagnelie * The level 0 blkid is offset >> datablkshift = 565a2cdcdd2SPaul Dagnelie * offset / 2^datablkshift. 566a2cdcdd2SPaul Dagnelie * 567a2cdcdd2SPaul Dagnelie * The number of level 0s in a level n is the number of block 568a2cdcdd2SPaul Dagnelie * pointers in an indirect block, raised to the power of level. 569a2cdcdd2SPaul Dagnelie * This is 2^(indblkshift - SPA_BLKPTRSHIFT)^level = 570a2cdcdd2SPaul Dagnelie * 2^(level*(indblkshift - SPA_BLKPTRSHIFT)). 
571a2cdcdd2SPaul Dagnelie * 572a2cdcdd2SPaul Dagnelie * Thus, the level n blkid is: offset / 573a2cdcdd2SPaul Dagnelie * ((2^datablkshift)*(2^(level*(indblkshift - SPA_BLKPTRSHIFT))) 574a2cdcdd2SPaul Dagnelie * = offset / 2^(datablkshift + level * 575a2cdcdd2SPaul Dagnelie * (indblkshift - SPA_BLKPTRSHIFT)) 576a2cdcdd2SPaul Dagnelie * = offset >> (datablkshift + level * 577a2cdcdd2SPaul Dagnelie * (indblkshift - SPA_BLKPTRSHIFT)) 578a2cdcdd2SPaul Dagnelie */ 579a2cdcdd2SPaul Dagnelie return (offset >> (dn->dn_datablkshift + level * 580a2cdcdd2SPaul Dagnelie (dn->dn_indblkshift - SPA_BLKPTRSHIFT))); 581fa9e4066Sahrens } else { 582fa9e4066Sahrens ASSERT3U(offset, <, dn->dn_datablksz); 583fa9e4066Sahrens return (0); 584fa9e4066Sahrens } 585fa9e4066Sahrens } 586fa9e4066Sahrens 587fa9e4066Sahrens static void 588fa9e4066Sahrens dbuf_read_done(zio_t *zio, arc_buf_t *buf, void *vdb) 589fa9e4066Sahrens { 590fa9e4066Sahrens dmu_buf_impl_t *db = vdb; 591fa9e4066Sahrens 592fa9e4066Sahrens mutex_enter(&db->db_mtx); 593fa9e4066Sahrens ASSERT3U(db->db_state, ==, DB_READ); 594fa9e4066Sahrens /* 595fa9e4066Sahrens * All reads are synchronous, so we must have a hold on the dbuf 596fa9e4066Sahrens */ 597fa9e4066Sahrens ASSERT(refcount_count(&db->db_holds) > 0); 598ea8dc4b6Seschrock ASSERT(db->db_buf == NULL); 599fa9e4066Sahrens ASSERT(db->db.db_data == NULL); 600c717a561Smaybee if (db->db_level == 0 && db->db_freed_in_flight) { 601fa9e4066Sahrens /* we were freed in flight; disregard any error */ 602fa9e4066Sahrens arc_release(buf, db); 603fa9e4066Sahrens bzero(buf->b_data, db->db.db_size); 6046b4acc8bSahrens arc_buf_freeze(buf); 605c717a561Smaybee db->db_freed_in_flight = FALSE; 606fa9e4066Sahrens dbuf_set_data(db, buf); 607fa9e4066Sahrens db->db_state = DB_CACHED; 608fa9e4066Sahrens } else if (zio == NULL || zio->io_error == 0) { 609fa9e4066Sahrens dbuf_set_data(db, buf); 610fa9e4066Sahrens db->db_state = DB_CACHED; 611fa9e4066Sahrens } else { 6120a586ceaSMark Shellenbaum 
ASSERT(db->db_blkid != DMU_BONUS_BLKID); 613fa9e4066Sahrens ASSERT3P(db->db_buf, ==, NULL); 6143b2aab18SMatthew Ahrens VERIFY(arc_buf_remove_ref(buf, db)); 615ea8dc4b6Seschrock db->db_state = DB_UNCACHED; 616fa9e4066Sahrens } 617fa9e4066Sahrens cv_broadcast(&db->db_changed); 6183f9d6ad7SLin Ling dbuf_rele_and_unlock(db, NULL); 619fa9e4066Sahrens } 620fa9e4066Sahrens 621ea8dc4b6Seschrock static void 622cf6106c8SMatthew Ahrens dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags) 623fa9e4066Sahrens { 624744947dcSTom Erickson dnode_t *dn; 6257802d7bfSMatthew Ahrens zbookmark_phys_t zb; 6267adb730bSGeorge Wilson arc_flags_t aflags = ARC_FLAG_NOWAIT; 627fa9e4066Sahrens 628744947dcSTom Erickson DB_DNODE_ENTER(db); 629744947dcSTom Erickson dn = DB_DNODE(db); 630fa9e4066Sahrens ASSERT(!refcount_is_zero(&db->db_holds)); 631fa9e4066Sahrens /* We need the struct_rwlock to prevent db_blkptr from changing. */ 632088f3894Sahrens ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); 633ea8dc4b6Seschrock ASSERT(MUTEX_HELD(&db->db_mtx)); 634ea8dc4b6Seschrock ASSERT(db->db_state == DB_UNCACHED); 635ea8dc4b6Seschrock ASSERT(db->db_buf == NULL); 636fa9e4066Sahrens 6370a586ceaSMark Shellenbaum if (db->db_blkid == DMU_BONUS_BLKID) { 638cf04dda1SMark Maybee int bonuslen = MIN(dn->dn_bonuslen, dn->dn_phys->dn_bonuslen); 6391934e92fSmaybee 6401934e92fSmaybee ASSERT3U(bonuslen, <=, db->db.db_size); 641ea8dc4b6Seschrock db->db.db_data = zio_buf_alloc(DN_MAX_BONUSLEN); 6425a98e54bSBrendan Gregg - Sun Microsystems arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER); 6431934e92fSmaybee if (bonuslen < DN_MAX_BONUSLEN) 644ea8dc4b6Seschrock bzero(db->db.db_data, DN_MAX_BONUSLEN); 645cf04dda1SMark Maybee if (bonuslen) 646cf04dda1SMark Maybee bcopy(DN_BONUS(dn->dn_phys), db->db.db_data, bonuslen); 647744947dcSTom Erickson DB_DNODE_EXIT(db); 648fa9e4066Sahrens db->db_state = DB_CACHED; 649fa9e4066Sahrens mutex_exit(&db->db_mtx); 650fa9e4066Sahrens return; 651fa9e4066Sahrens } 652fa9e4066Sahrens 
6531c8564a7SMark Maybee /* 6541c8564a7SMark Maybee * Recheck BP_IS_HOLE() after dnode_block_freed() in case dnode_sync() 6551c8564a7SMark Maybee * processes the delete record and clears the bp while we are waiting 6561c8564a7SMark Maybee * for the dn_mtx (resulting in a "no" from block_freed). 6571c8564a7SMark Maybee */ 658088f3894Sahrens if (db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr) || 6591c8564a7SMark Maybee (db->db_level == 0 && (dnode_block_freed(dn, db->db_blkid) || 6601c8564a7SMark Maybee BP_IS_HOLE(db->db_blkptr)))) { 661ad23a2dbSjohansen arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); 662ad23a2dbSjohansen 663744947dcSTom Erickson DB_DNODE_EXIT(db); 66443466aaeSMax Grossman dbuf_set_data(db, arc_buf_alloc(db->db_objset->os_spa, 66543466aaeSMax Grossman db->db.db_size, db, type)); 666fa9e4066Sahrens bzero(db->db.db_data, db->db.db_size); 667fa9e4066Sahrens db->db_state = DB_CACHED; 668fa9e4066Sahrens mutex_exit(&db->db_mtx); 669fa9e4066Sahrens return; 670fa9e4066Sahrens } 671fa9e4066Sahrens 672744947dcSTom Erickson DB_DNODE_EXIT(db); 673744947dcSTom Erickson 674fa9e4066Sahrens db->db_state = DB_READ; 675fa9e4066Sahrens mutex_exit(&db->db_mtx); 676fa9e4066Sahrens 6773baa08fcSek110237 if (DBUF_IS_L2CACHEABLE(db)) 6787adb730bSGeorge Wilson aflags |= ARC_FLAG_L2CACHE; 679aad02571SSaso Kiselkov if (DBUF_IS_L2COMPRESSIBLE(db)) 6807adb730bSGeorge Wilson aflags |= ARC_FLAG_L2COMPRESS; 6813baa08fcSek110237 682b24ab676SJeff Bonwick SET_BOOKMARK(&zb, db->db_objset->os_dsl_dataset ? 683b24ab676SJeff Bonwick db->db_objset->os_dsl_dataset->ds_object : DMU_META_OBJSET, 684b24ab676SJeff Bonwick db->db.db_object, db->db_level, db->db_blkid); 685ea8dc4b6Seschrock 686ea8dc4b6Seschrock dbuf_add_ref(db, NULL); 687088f3894Sahrens 68843466aaeSMax Grossman (void) arc_read(zio, db->db_objset->os_spa, db->db_blkptr, 689fa9e4066Sahrens dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ, 690cf6106c8SMatthew Ahrens (flags & DB_RF_CANFAIL) ? 
ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED, 69113506d1eSmaybee &aflags, &zb); 692fa9e4066Sahrens } 693fa9e4066Sahrens 694ea8dc4b6Seschrock int 695ea8dc4b6Seschrock dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags) 696fa9e4066Sahrens { 697ea8dc4b6Seschrock int err = 0; 69843466aaeSMax Grossman boolean_t havepzio = (zio != NULL); 69943466aaeSMax Grossman boolean_t prefetch; 700744947dcSTom Erickson dnode_t *dn; 701fa9e4066Sahrens 702fa9e4066Sahrens /* 703fa9e4066Sahrens * We don't have to hold the mutex to check db_state because it 704fa9e4066Sahrens * can't be freed while we have a hold on the buffer. 705fa9e4066Sahrens */ 706fa9e4066Sahrens ASSERT(!refcount_is_zero(&db->db_holds)); 707fa9e4066Sahrens 70882c9918fSTim Haley if (db->db_state == DB_NOFILL) 709be6fd75aSMatthew Ahrens return (SET_ERROR(EIO)); 71082c9918fSTim Haley 711744947dcSTom Erickson DB_DNODE_ENTER(db); 712744947dcSTom Erickson dn = DB_DNODE(db); 713fa9e4066Sahrens if ((flags & DB_RF_HAVESTRUCT) == 0) 714744947dcSTom Erickson rw_enter(&dn->dn_struct_rwlock, RW_READER); 715fa9e4066Sahrens 7160a586ceaSMark Shellenbaum prefetch = db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID && 717744947dcSTom Erickson (flags & DB_RF_NOPREFETCH) == 0 && dn != NULL && 7183baa08fcSek110237 DBUF_IS_CACHEABLE(db); 71913506d1eSmaybee 720fa9e4066Sahrens mutex_enter(&db->db_mtx); 721ea8dc4b6Seschrock if (db->db_state == DB_CACHED) { 722ea8dc4b6Seschrock mutex_exit(&db->db_mtx); 72313506d1eSmaybee if (prefetch) 724cf6106c8SMatthew Ahrens dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1); 725ea8dc4b6Seschrock if ((flags & DB_RF_HAVESTRUCT) == 0) 726744947dcSTom Erickson rw_exit(&dn->dn_struct_rwlock); 727744947dcSTom Erickson DB_DNODE_EXIT(db); 728ea8dc4b6Seschrock } else if (db->db_state == DB_UNCACHED) { 729744947dcSTom Erickson spa_t *spa = dn->dn_objset->os_spa; 730744947dcSTom Erickson 731744947dcSTom Erickson if (zio == NULL) 732744947dcSTom Erickson zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); 
733cf6106c8SMatthew Ahrens dbuf_read_impl(db, zio, flags); 73413506d1eSmaybee 735ea8dc4b6Seschrock /* dbuf_read_impl has dropped db_mtx for us */ 736ea8dc4b6Seschrock 73713506d1eSmaybee if (prefetch) 738cf6106c8SMatthew Ahrens dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1); 739ea8dc4b6Seschrock 740ea8dc4b6Seschrock if ((flags & DB_RF_HAVESTRUCT) == 0) 741744947dcSTom Erickson rw_exit(&dn->dn_struct_rwlock); 742744947dcSTom Erickson DB_DNODE_EXIT(db); 743ea8dc4b6Seschrock 744ea8dc4b6Seschrock if (!havepzio) 745ea8dc4b6Seschrock err = zio_wait(zio); 746ea8dc4b6Seschrock } else { 7473e30c24aSWill Andrews /* 7483e30c24aSWill Andrews * Another reader came in while the dbuf was in flight 7493e30c24aSWill Andrews * between UNCACHED and CACHED. Either a writer will finish 7503e30c24aSWill Andrews * writing the buffer (sending the dbuf to CACHED) or the 7513e30c24aSWill Andrews * first reader's request will reach the read_done callback 7523e30c24aSWill Andrews * and send the dbuf to CACHED. Otherwise, a failure 7533e30c24aSWill Andrews * occurred and the dbuf went to UNCACHED. 7543e30c24aSWill Andrews */ 75513506d1eSmaybee mutex_exit(&db->db_mtx); 75613506d1eSmaybee if (prefetch) 757cf6106c8SMatthew Ahrens dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1); 758ea8dc4b6Seschrock if ((flags & DB_RF_HAVESTRUCT) == 0) 759744947dcSTom Erickson rw_exit(&dn->dn_struct_rwlock); 760744947dcSTom Erickson DB_DNODE_EXIT(db); 76113506d1eSmaybee 7623e30c24aSWill Andrews /* Skip the wait per the caller's request. */ 76313506d1eSmaybee mutex_enter(&db->db_mtx); 764ea8dc4b6Seschrock if ((flags & DB_RF_NEVERWAIT) == 0) { 765ea8dc4b6Seschrock while (db->db_state == DB_READ || 766ea8dc4b6Seschrock db->db_state == DB_FILL) { 767fa9e4066Sahrens ASSERT(db->db_state == DB_READ || 768fa9e4066Sahrens (flags & DB_RF_HAVESTRUCT) == 0); 769f6164ad6SAdam H. Leventhal DTRACE_PROBE2(blocked__read, dmu_buf_impl_t *, 770f6164ad6SAdam H. 
Leventhal db, zio_t *, zio); 771fa9e4066Sahrens cv_wait(&db->db_changed, &db->db_mtx); 772fa9e4066Sahrens } 773ea8dc4b6Seschrock if (db->db_state == DB_UNCACHED) 774be6fd75aSMatthew Ahrens err = SET_ERROR(EIO); 775ea8dc4b6Seschrock } 776fa9e4066Sahrens mutex_exit(&db->db_mtx); 777fa9e4066Sahrens } 778fa9e4066Sahrens 779ea8dc4b6Seschrock ASSERT(err || havepzio || db->db_state == DB_CACHED); 780ea8dc4b6Seschrock return (err); 781fa9e4066Sahrens } 782fa9e4066Sahrens 783fa9e4066Sahrens static void 784fa9e4066Sahrens dbuf_noread(dmu_buf_impl_t *db) 785fa9e4066Sahrens { 786fa9e4066Sahrens ASSERT(!refcount_is_zero(&db->db_holds)); 7870a586ceaSMark Shellenbaum ASSERT(db->db_blkid != DMU_BONUS_BLKID); 788fa9e4066Sahrens mutex_enter(&db->db_mtx); 789fa9e4066Sahrens while (db->db_state == DB_READ || db->db_state == DB_FILL) 790fa9e4066Sahrens cv_wait(&db->db_changed, &db->db_mtx); 791fa9e4066Sahrens if (db->db_state == DB_UNCACHED) { 792ad23a2dbSjohansen arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); 79343466aaeSMax Grossman spa_t *spa = db->db_objset->os_spa; 794ad23a2dbSjohansen 795ea8dc4b6Seschrock ASSERT(db->db_buf == NULL); 796fa9e4066Sahrens ASSERT(db->db.db_data == NULL); 797744947dcSTom Erickson dbuf_set_data(db, arc_buf_alloc(spa, db->db.db_size, db, type)); 798fa9e4066Sahrens db->db_state = DB_FILL; 79982c9918fSTim Haley } else if (db->db_state == DB_NOFILL) { 800bc9014e6SJustin Gibbs dbuf_clear_data(db); 801fa9e4066Sahrens } else { 802fa9e4066Sahrens ASSERT3U(db->db_state, ==, DB_CACHED); 803fa9e4066Sahrens } 804fa9e4066Sahrens mutex_exit(&db->db_mtx); 805fa9e4066Sahrens } 806fa9e4066Sahrens 807fa9e4066Sahrens /* 808fa9e4066Sahrens * This is our just-in-time copy function. It makes a copy of 809fa9e4066Sahrens * buffers, that have been modified in a previous transaction 810fa9e4066Sahrens * group, before we modify them in the current active group. 
811fa9e4066Sahrens * 812fa9e4066Sahrens * This function is used in two places: when we are dirtying a 813fa9e4066Sahrens * buffer for the first time in a txg, and when we are freeing 814fa9e4066Sahrens * a range in a dnode that includes this buffer. 815fa9e4066Sahrens * 816fa9e4066Sahrens * Note that when we are called from dbuf_free_range() we do 817fa9e4066Sahrens * not put a hold on the buffer, we just traverse the active 818fa9e4066Sahrens * dbuf list for the dnode. 819fa9e4066Sahrens */ 820fa9e4066Sahrens static void 821fa9e4066Sahrens dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg) 822fa9e4066Sahrens { 823c717a561Smaybee dbuf_dirty_record_t *dr = db->db_last_dirty; 824fa9e4066Sahrens 825fa9e4066Sahrens ASSERT(MUTEX_HELD(&db->db_mtx)); 826fa9e4066Sahrens ASSERT(db->db.db_data != NULL); 827c717a561Smaybee ASSERT(db->db_level == 0); 828c717a561Smaybee ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT); 829fa9e4066Sahrens 8304d31c452Smaybee if (dr == NULL || 8314d31c452Smaybee (dr->dt.dl.dr_data != 8320a586ceaSMark Shellenbaum ((db->db_blkid == DMU_BONUS_BLKID) ? db->db.db_data : db->db_buf))) 833fa9e4066Sahrens return; 834fa9e4066Sahrens 835fa9e4066Sahrens /* 836c717a561Smaybee * If the last dirty record for this dbuf has not yet synced 837c717a561Smaybee * and its referencing the dbuf data, either: 838c717a561Smaybee * reset the reference to point to a new copy, 839c717a561Smaybee * or (if there a no active holders) 840c717a561Smaybee * just null out the current db_data pointer. 
841fa9e4066Sahrens */ 842c717a561Smaybee ASSERT(dr->dr_txg >= txg - 2); 8430a586ceaSMark Shellenbaum if (db->db_blkid == DMU_BONUS_BLKID) { 844c717a561Smaybee /* Note that the data bufs here are zio_bufs */ 845c717a561Smaybee dr->dt.dl.dr_data = zio_buf_alloc(DN_MAX_BONUSLEN); 8465a98e54bSBrendan Gregg - Sun Microsystems arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER); 847c717a561Smaybee bcopy(db->db.db_data, dr->dt.dl.dr_data, DN_MAX_BONUSLEN); 848c717a561Smaybee } else if (refcount_count(&db->db_holds) > db->db_dirtycnt) { 849ea8dc4b6Seschrock int size = db->db.db_size; 850c717a561Smaybee arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); 85143466aaeSMax Grossman spa_t *spa = db->db_objset->os_spa; 852744947dcSTom Erickson 853744947dcSTom Erickson dr->dt.dl.dr_data = arc_buf_alloc(spa, size, db, type); 854c717a561Smaybee bcopy(db->db.db_data, dr->dt.dl.dr_data->b_data, size); 855fa9e4066Sahrens } else { 856bc9014e6SJustin Gibbs dbuf_clear_data(db); 857fa9e4066Sahrens } 858fa9e4066Sahrens } 859ea8dc4b6Seschrock 860fa9e4066Sahrens void 861c717a561Smaybee dbuf_unoverride(dbuf_dirty_record_t *dr) 862fa9e4066Sahrens { 863c717a561Smaybee dmu_buf_impl_t *db = dr->dr_dbuf; 864b24ab676SJeff Bonwick blkptr_t *bp = &dr->dt.dl.dr_overridden_by; 865c717a561Smaybee uint64_t txg = dr->dr_txg; 866c5c6ffa0Smaybee 867c717a561Smaybee ASSERT(MUTEX_HELD(&db->db_mtx)); 868c717a561Smaybee ASSERT(dr->dt.dl.dr_override_state != DR_IN_DMU_SYNC); 869c717a561Smaybee ASSERT(db->db_level == 0); 870c717a561Smaybee 8710a586ceaSMark Shellenbaum if (db->db_blkid == DMU_BONUS_BLKID || 872c717a561Smaybee dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN) 873c717a561Smaybee return; 874c717a561Smaybee 875b24ab676SJeff Bonwick ASSERT(db->db_data_pending != dr); 876b24ab676SJeff Bonwick 877fa9e4066Sahrens /* free this block */ 87843466aaeSMax Grossman if (!BP_IS_HOLE(bp) && !dr->dt.dl.dr_nopwrite) 87943466aaeSMax Grossman zio_free(db->db_objset->os_spa, txg, bp); 880b24ab676SJeff Bonwick 
881c717a561Smaybee dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN; 88280901aeaSGeorge Wilson dr->dt.dl.dr_nopwrite = B_FALSE; 88380901aeaSGeorge Wilson 8846b4acc8bSahrens /* 8856b4acc8bSahrens * Release the already-written buffer, so we leave it in 8866b4acc8bSahrens * a consistent dirty state. Note that all callers are 8876b4acc8bSahrens * modifying the buffer, so they will immediately do 8886b4acc8bSahrens * another (redundant) arc_release(). Therefore, leave 8896b4acc8bSahrens * the buf thawed to save the effort of freezing & 8906b4acc8bSahrens * immediately re-thawing it. 8916b4acc8bSahrens */ 892c717a561Smaybee arc_release(dr->dt.dl.dr_data, db); 893fa9e4066Sahrens } 894fa9e4066Sahrens 895cdb0ab79Smaybee /* 896cdb0ab79Smaybee * Evict (if its unreferenced) or clear (if its referenced) any level-0 897cdb0ab79Smaybee * data blocks in the free range, so that any future readers will find 89843466aaeSMax Grossman * empty blocks. 8992f3d8780SMatthew Ahrens * 9002f3d8780SMatthew Ahrens * This is a no-op if the dataset is in the middle of an incremental 9012f3d8780SMatthew Ahrens * receive; see comment below for details. 
902cdb0ab79Smaybee */ 903fa9e4066Sahrens void 9040f6d88adSAlex Reece dbuf_free_range(dnode_t *dn, uint64_t start_blkid, uint64_t end_blkid, 9050f6d88adSAlex Reece dmu_tx_t *tx) 906fa9e4066Sahrens { 907bc9014e6SJustin Gibbs dmu_buf_impl_t db_search; 908bc9014e6SJustin Gibbs dmu_buf_impl_t *db, *db_next; 909fa9e4066Sahrens uint64_t txg = tx->tx_txg; 9100f6d88adSAlex Reece avl_index_t where; 911fa9e4066Sahrens 9120f6d88adSAlex Reece if (end_blkid > dn->dn_maxblkid && (end_blkid != DMU_SPILL_BLKID)) 9130f6d88adSAlex Reece end_blkid = dn->dn_maxblkid; 9140f6d88adSAlex Reece dprintf_dnode(dn, "start=%llu end=%llu\n", start_blkid, end_blkid); 9150f6d88adSAlex Reece 9160f6d88adSAlex Reece db_search.db_level = 0; 9170f6d88adSAlex Reece db_search.db_blkid = start_blkid; 91886bb58aeSAlex Reece db_search.db_state = DB_SEARCH; 9192f3d8780SMatthew Ahrens 920713d6c20SMatthew Ahrens mutex_enter(&dn->dn_dbufs_mtx); 9210f6d88adSAlex Reece if (start_blkid >= dn->dn_unlisted_l0_blkid) { 922713d6c20SMatthew Ahrens /* There can't be any dbufs in this range; no need to search. */ 9230f6d88adSAlex Reece #ifdef DEBUG 9240f6d88adSAlex Reece db = avl_find(&dn->dn_dbufs, &db_search, &where); 9250f6d88adSAlex Reece ASSERT3P(db, ==, NULL); 9260f6d88adSAlex Reece db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER); 9270f6d88adSAlex Reece ASSERT(db == NULL || db->db_level > 0); 9280f6d88adSAlex Reece #endif 929713d6c20SMatthew Ahrens mutex_exit(&dn->dn_dbufs_mtx); 9302f3d8780SMatthew Ahrens return; 931713d6c20SMatthew Ahrens } else if (dmu_objset_is_receiving(dn->dn_objset)) { 932713d6c20SMatthew Ahrens /* 933713d6c20SMatthew Ahrens * If we are receiving, we expect there to be no dbufs in 934713d6c20SMatthew Ahrens * the range to be freed, because receive modifies each 935713d6c20SMatthew Ahrens * block at most once, and in offset order. 
If this is 936713d6c20SMatthew Ahrens * not the case, it can lead to performance problems, 937713d6c20SMatthew Ahrens * so note that we unexpectedly took the slow path. 938713d6c20SMatthew Ahrens */ 939713d6c20SMatthew Ahrens atomic_inc_64(&zfs_free_range_recv_miss); 9402f3d8780SMatthew Ahrens } 9412f3d8780SMatthew Ahrens 9420f6d88adSAlex Reece db = avl_find(&dn->dn_dbufs, &db_search, &where); 9430f6d88adSAlex Reece ASSERT3P(db, ==, NULL); 9440f6d88adSAlex Reece db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER); 9450f6d88adSAlex Reece 9460f6d88adSAlex Reece for (; db != NULL; db = db_next) { 9470f6d88adSAlex Reece db_next = AVL_NEXT(&dn->dn_dbufs, db); 9480a586ceaSMark Shellenbaum ASSERT(db->db_blkid != DMU_BONUS_BLKID); 949cdb0ab79Smaybee 9500f6d88adSAlex Reece if (db->db_level != 0 || db->db_blkid > end_blkid) { 9510f6d88adSAlex Reece break; 9520f6d88adSAlex Reece } 9530f6d88adSAlex Reece ASSERT3U(db->db_blkid, >=, start_blkid); 954fa9e4066Sahrens 955fa9e4066Sahrens /* found a level 0 buffer in the range */ 956fa9e4066Sahrens mutex_enter(&db->db_mtx); 9573b2aab18SMatthew Ahrens if (dbuf_undirty(db, tx)) { 9583b2aab18SMatthew Ahrens /* mutex has been dropped and dbuf destroyed */ 9593b2aab18SMatthew Ahrens continue; 9603b2aab18SMatthew Ahrens } 9613b2aab18SMatthew Ahrens 962ea8dc4b6Seschrock if (db->db_state == DB_UNCACHED || 96382c9918fSTim Haley db->db_state == DB_NOFILL || 964ea8dc4b6Seschrock db->db_state == DB_EVICTING) { 965fa9e4066Sahrens ASSERT(db->db.db_data == NULL); 966fa9e4066Sahrens mutex_exit(&db->db_mtx); 967fa9e4066Sahrens continue; 968fa9e4066Sahrens } 969c543ec06Sahrens if (db->db_state == DB_READ || db->db_state == DB_FILL) { 970c543ec06Sahrens /* will be handled in dbuf_read_done or dbuf_rele */ 971c717a561Smaybee db->db_freed_in_flight = TRUE; 972fa9e4066Sahrens mutex_exit(&db->db_mtx); 973fa9e4066Sahrens continue; 974fa9e4066Sahrens } 975ea8dc4b6Seschrock if (refcount_count(&db->db_holds) == 0) { 976ea8dc4b6Seschrock ASSERT(db->db_buf); 
977ea8dc4b6Seschrock dbuf_clear(db); 978ea8dc4b6Seschrock continue; 979ea8dc4b6Seschrock } 980c717a561Smaybee /* The dbuf is referenced */ 981fa9e4066Sahrens 982c717a561Smaybee if (db->db_last_dirty != NULL) { 983c717a561Smaybee dbuf_dirty_record_t *dr = db->db_last_dirty; 984c717a561Smaybee 985c717a561Smaybee if (dr->dr_txg == txg) { 986ea8dc4b6Seschrock /* 987c717a561Smaybee * This buffer is "in-use", re-adjust the file 988c717a561Smaybee * size to reflect that this buffer may 989c717a561Smaybee * contain new data when we sync. 990ea8dc4b6Seschrock */ 99106e0070dSMark Shellenbaum if (db->db_blkid != DMU_SPILL_BLKID && 99206e0070dSMark Shellenbaum db->db_blkid > dn->dn_maxblkid) 99344eda4d7Smaybee dn->dn_maxblkid = db->db_blkid; 994c717a561Smaybee dbuf_unoverride(dr); 995c717a561Smaybee } else { 996c717a561Smaybee /* 997c717a561Smaybee * This dbuf is not dirty in the open context. 998c717a561Smaybee * Either uncache it (if its not referenced in 999c717a561Smaybee * the open context) or reset its contents to 1000c717a561Smaybee * empty. 
1001c717a561Smaybee */ 1002c717a561Smaybee dbuf_fix_old_data(db, txg); 100344eda4d7Smaybee } 1004c717a561Smaybee } 1005c717a561Smaybee /* clear the contents if its cached */ 1006ea8dc4b6Seschrock if (db->db_state == DB_CACHED) { 1007ea8dc4b6Seschrock ASSERT(db->db.db_data != NULL); 1008fa9e4066Sahrens arc_release(db->db_buf, db); 1009fa9e4066Sahrens bzero(db->db.db_data, db->db.db_size); 10106b4acc8bSahrens arc_buf_freeze(db->db_buf); 1011fa9e4066Sahrens } 1012ea8dc4b6Seschrock 1013fa9e4066Sahrens mutex_exit(&db->db_mtx); 1014fa9e4066Sahrens } 1015fa9e4066Sahrens mutex_exit(&dn->dn_dbufs_mtx); 1016fa9e4066Sahrens } 1017fa9e4066Sahrens 1018fa9e4066Sahrens static int 10191934e92fSmaybee dbuf_block_freeable(dmu_buf_impl_t *db) 1020fa9e4066Sahrens { 1021fa9e4066Sahrens dsl_dataset_t *ds = db->db_objset->os_dsl_dataset; 1022fa9e4066Sahrens uint64_t birth_txg = 0; 1023fa9e4066Sahrens 1024fa9e4066Sahrens /* 1025fa9e4066Sahrens * We don't need any locking to protect db_blkptr: 1026c717a561Smaybee * If it's syncing, then db_last_dirty will be set 1027c717a561Smaybee * so we'll ignore db_blkptr. 102843466aaeSMax Grossman * 102943466aaeSMax Grossman * This logic ensures that only block births for 103043466aaeSMax Grossman * filled blocks are considered. 1031fa9e4066Sahrens */ 1032c717a561Smaybee ASSERT(MUTEX_HELD(&db->db_mtx)); 103343466aaeSMax Grossman if (db->db_last_dirty && (db->db_blkptr == NULL || 103443466aaeSMax Grossman !BP_IS_HOLE(db->db_blkptr))) { 1035c717a561Smaybee birth_txg = db->db_last_dirty->dr_txg; 103643466aaeSMax Grossman } else if (db->db_blkptr != NULL && !BP_IS_HOLE(db->db_blkptr)) { 1037fa9e4066Sahrens birth_txg = db->db_blkptr->blk_birth; 103843466aaeSMax Grossman } 1039fa9e4066Sahrens 1040837b568bSGeorge Wilson /* 104143466aaeSMax Grossman * If this block don't exist or is in a snapshot, it can't be freed. 
1042837b568bSGeorge Wilson * Don't pass the bp to dsl_dataset_block_freeable() since we 1043837b568bSGeorge Wilson * are holding the db_mtx lock and might deadlock if we are 1044837b568bSGeorge Wilson * prefetching a dedup-ed block. 1045837b568bSGeorge Wilson */ 104643466aaeSMax Grossman if (birth_txg != 0) 10471934e92fSmaybee return (ds == NULL || 1048837b568bSGeorge Wilson dsl_dataset_block_freeable(ds, NULL, birth_txg)); 1049fa9e4066Sahrens else 105043466aaeSMax Grossman return (B_FALSE); 1051fa9e4066Sahrens } 1052fa9e4066Sahrens 1053fa9e4066Sahrens void 1054fa9e4066Sahrens dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx) 1055fa9e4066Sahrens { 1056fa9e4066Sahrens arc_buf_t *buf, *obuf; 1057fa9e4066Sahrens int osize = db->db.db_size; 1058ad23a2dbSjohansen arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); 1059744947dcSTom Erickson dnode_t *dn; 1060fa9e4066Sahrens 10610a586ceaSMark Shellenbaum ASSERT(db->db_blkid != DMU_BONUS_BLKID); 1062ea8dc4b6Seschrock 1063744947dcSTom Erickson DB_DNODE_ENTER(db); 1064744947dcSTom Erickson dn = DB_DNODE(db); 1065744947dcSTom Erickson 1066fa9e4066Sahrens /* XXX does *this* func really need the lock? */ 1067744947dcSTom Erickson ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock)); 1068fa9e4066Sahrens 1069fa9e4066Sahrens /* 107043466aaeSMax Grossman * This call to dmu_buf_will_dirty() with the dn_struct_rwlock held 1071fa9e4066Sahrens * is OK, because there can be no other references to the db 1072fa9e4066Sahrens * when we are changing its size, so no concurrent DB_FILL can 1073fa9e4066Sahrens * be happening. 
1074fa9e4066Sahrens */ 1075ea8dc4b6Seschrock /* 1076ea8dc4b6Seschrock * XXX we should be doing a dbuf_read, checking the return 1077ea8dc4b6Seschrock * value and returning that up to our callers 1078ea8dc4b6Seschrock */ 107943466aaeSMax Grossman dmu_buf_will_dirty(&db->db, tx); 1080fa9e4066Sahrens 1081fa9e4066Sahrens /* create the data buffer for the new block */ 1082744947dcSTom Erickson buf = arc_buf_alloc(dn->dn_objset->os_spa, size, db, type); 1083fa9e4066Sahrens 1084fa9e4066Sahrens /* copy old block data to the new block */ 1085fa9e4066Sahrens obuf = db->db_buf; 1086f65e61c0Sahrens bcopy(obuf->b_data, buf->b_data, MIN(osize, size)); 1087fa9e4066Sahrens /* zero the remainder */ 1088f65e61c0Sahrens if (size > osize) 1089fa9e4066Sahrens bzero((uint8_t *)buf->b_data + osize, size - osize); 1090fa9e4066Sahrens 1091fa9e4066Sahrens mutex_enter(&db->db_mtx); 1092fa9e4066Sahrens dbuf_set_data(db, buf); 10933b2aab18SMatthew Ahrens VERIFY(arc_buf_remove_ref(obuf, db)); 1094fa9e4066Sahrens db->db.db_size = size; 1095fa9e4066Sahrens 1096c717a561Smaybee if (db->db_level == 0) { 1097c717a561Smaybee ASSERT3U(db->db_last_dirty->dr_txg, ==, tx->tx_txg); 1098c717a561Smaybee db->db_last_dirty->dt.dl.dr_data = buf; 1099c717a561Smaybee } 1100fa9e4066Sahrens mutex_exit(&db->db_mtx); 1101fa9e4066Sahrens 1102744947dcSTom Erickson dnode_willuse_space(dn, size-osize, tx); 1103744947dcSTom Erickson DB_DNODE_EXIT(db); 1104fa9e4066Sahrens } 1105fa9e4066Sahrens 11063f9d6ad7SLin Ling void 11073f9d6ad7SLin Ling dbuf_release_bp(dmu_buf_impl_t *db) 11083f9d6ad7SLin Ling { 110943466aaeSMax Grossman objset_t *os = db->db_objset; 11103f9d6ad7SLin Ling 11113f9d6ad7SLin Ling ASSERT(dsl_pool_sync_context(dmu_objset_pool(os))); 11123f9d6ad7SLin Ling ASSERT(arc_released(os->os_phys_buf) || 11133f9d6ad7SLin Ling list_link_active(&os->os_dsl_dataset->ds_synced_link)); 11143f9d6ad7SLin Ling ASSERT(db->db_parent == NULL || arc_released(db->db_parent->db_buf)); 11153f9d6ad7SLin Ling 11161b912ec7SGeorge 
Wilson (void) arc_release(db->db_buf, db); 11173f9d6ad7SLin Ling } 11183f9d6ad7SLin Ling 11190f2e7d03SMatthew Ahrens /* 11200f2e7d03SMatthew Ahrens * We already have a dirty record for this TXG, and we are being 11210f2e7d03SMatthew Ahrens * dirtied again. 11220f2e7d03SMatthew Ahrens */ 11230f2e7d03SMatthew Ahrens static void 11240f2e7d03SMatthew Ahrens dbuf_redirty(dbuf_dirty_record_t *dr) 11250f2e7d03SMatthew Ahrens { 11260f2e7d03SMatthew Ahrens dmu_buf_impl_t *db = dr->dr_dbuf; 11270f2e7d03SMatthew Ahrens 11280f2e7d03SMatthew Ahrens ASSERT(MUTEX_HELD(&db->db_mtx)); 11290f2e7d03SMatthew Ahrens 11300f2e7d03SMatthew Ahrens if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID) { 11310f2e7d03SMatthew Ahrens /* 11320f2e7d03SMatthew Ahrens * If this buffer has already been written out, 11330f2e7d03SMatthew Ahrens * we now need to reset its state. 11340f2e7d03SMatthew Ahrens */ 11350f2e7d03SMatthew Ahrens dbuf_unoverride(dr); 11360f2e7d03SMatthew Ahrens if (db->db.db_object != DMU_META_DNODE_OBJECT && 11370f2e7d03SMatthew Ahrens db->db_state != DB_NOFILL) { 11380f2e7d03SMatthew Ahrens /* Already released on initial dirty, so just thaw. 
*/ 11390f2e7d03SMatthew Ahrens ASSERT(arc_released(db->db_buf)); 11400f2e7d03SMatthew Ahrens arc_buf_thaw(db->db_buf); 11410f2e7d03SMatthew Ahrens } 11420f2e7d03SMatthew Ahrens } 11430f2e7d03SMatthew Ahrens } 11440f2e7d03SMatthew Ahrens 1145c717a561Smaybee dbuf_dirty_record_t * 1146fa9e4066Sahrens dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) 1147fa9e4066Sahrens { 1148744947dcSTom Erickson dnode_t *dn; 1149744947dcSTom Erickson objset_t *os; 1150c717a561Smaybee dbuf_dirty_record_t **drp, *dr; 1151fa9e4066Sahrens int drop_struct_lock = FALSE; 1152d3469faaSMark Maybee boolean_t do_free_accounting = B_FALSE; 1153fa9e4066Sahrens int txgoff = tx->tx_txg & TXG_MASK; 1154fa9e4066Sahrens 1155fa9e4066Sahrens ASSERT(tx->tx_txg != 0); 1156fa9e4066Sahrens ASSERT(!refcount_is_zero(&db->db_holds)); 11579c9dc39aSek110237 DMU_TX_DIRTY_BUF(tx, db); 1158fa9e4066Sahrens 1159744947dcSTom Erickson DB_DNODE_ENTER(db); 1160744947dcSTom Erickson dn = DB_DNODE(db); 1161fa9e4066Sahrens /* 1162fa9e4066Sahrens * Shouldn't dirty a regular buffer in syncing context. Private 1163fa9e4066Sahrens * objects may be dirtied in syncing context, but only if they 1164fa9e4066Sahrens * were already pre-dirtied in open context. 1165fa9e4066Sahrens */ 1166c717a561Smaybee ASSERT(!dmu_tx_is_syncing(tx) || 1167c717a561Smaybee BP_IS_HOLE(dn->dn_objset->os_rootbp) || 116814843421SMatthew Ahrens DMU_OBJECT_IS_SPECIAL(dn->dn_object) || 116914843421SMatthew Ahrens dn->dn_objset->os_dsl_dataset == NULL); 1170fa9e4066Sahrens /* 1171fa9e4066Sahrens * We make this assert for private objects as well, but after we 1172fa9e4066Sahrens * check if we're already dirty. They are allowed to re-dirty 1173fa9e4066Sahrens * in syncing context. 1174fa9e4066Sahrens */ 1175ea8dc4b6Seschrock ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT || 1176c717a561Smaybee dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx == 1177fa9e4066Sahrens (dmu_tx_is_syncing(tx) ? 
DN_DIRTY_SYNC : DN_DIRTY_OPEN)); 1178fa9e4066Sahrens 1179fa9e4066Sahrens mutex_enter(&db->db_mtx); 1180fa9e4066Sahrens /* 1181c717a561Smaybee * XXX make this true for indirects too? The problem is that 1182c717a561Smaybee * transactions created with dmu_tx_create_assigned() from 1183c717a561Smaybee * syncing context don't bother holding ahead. 1184fa9e4066Sahrens */ 1185c717a561Smaybee ASSERT(db->db_level != 0 || 118682c9918fSTim Haley db->db_state == DB_CACHED || db->db_state == DB_FILL || 118782c9918fSTim Haley db->db_state == DB_NOFILL); 1188fa9e4066Sahrens 1189fa9e4066Sahrens mutex_enter(&dn->dn_mtx); 1190fa9e4066Sahrens /* 1191fa9e4066Sahrens * Don't set dirtyctx to SYNC if we're just modifying this as we 1192fa9e4066Sahrens * initialize the objset. 1193fa9e4066Sahrens */ 1194fa9e4066Sahrens if (dn->dn_dirtyctx == DN_UNDIRTIED && 1195c717a561Smaybee !BP_IS_HOLE(dn->dn_objset->os_rootbp)) { 1196fa9e4066Sahrens dn->dn_dirtyctx = 1197fa9e4066Sahrens (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN); 1198fa9e4066Sahrens ASSERT(dn->dn_dirtyctx_firstset == NULL); 1199fa9e4066Sahrens dn->dn_dirtyctx_firstset = kmem_alloc(1, KM_SLEEP); 1200fa9e4066Sahrens } 1201fa9e4066Sahrens mutex_exit(&dn->dn_mtx); 1202fa9e4066Sahrens 12030a586ceaSMark Shellenbaum if (db->db_blkid == DMU_SPILL_BLKID) 12040a586ceaSMark Shellenbaum dn->dn_have_spill = B_TRUE; 12050a586ceaSMark Shellenbaum 1206fa9e4066Sahrens /* 1207fa9e4066Sahrens * If this buffer is already dirty, we're done. 
1208fa9e4066Sahrens */ 1209c717a561Smaybee drp = &db->db_last_dirty; 1210c717a561Smaybee ASSERT(*drp == NULL || (*drp)->dr_txg <= tx->tx_txg || 1211c717a561Smaybee db->db.db_object == DMU_META_DNODE_OBJECT); 12127e2186e3Sbonwick while ((dr = *drp) != NULL && dr->dr_txg > tx->tx_txg) 12137e2186e3Sbonwick drp = &dr->dr_next; 12147e2186e3Sbonwick if (dr && dr->dr_txg == tx->tx_txg) { 1215744947dcSTom Erickson DB_DNODE_EXIT(db); 1216744947dcSTom Erickson 12170f2e7d03SMatthew Ahrens dbuf_redirty(dr); 1218fa9e4066Sahrens mutex_exit(&db->db_mtx); 12197e2186e3Sbonwick return (dr); 1220fa9e4066Sahrens } 1221fa9e4066Sahrens 1222fa9e4066Sahrens /* 1223fa9e4066Sahrens * Only valid if not already dirty. 1224fa9e4066Sahrens */ 122514843421SMatthew Ahrens ASSERT(dn->dn_object == 0 || 122614843421SMatthew Ahrens dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx == 1227fa9e4066Sahrens (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN)); 1228fa9e4066Sahrens 1229fa9e4066Sahrens ASSERT3U(dn->dn_nlevels, >, db->db_level); 1230fa9e4066Sahrens ASSERT((dn->dn_phys->dn_nlevels == 0 && db->db_level == 0) || 1231fa9e4066Sahrens dn->dn_phys->dn_nlevels > db->db_level || 1232fa9e4066Sahrens dn->dn_next_nlevels[txgoff] > db->db_level || 1233fa9e4066Sahrens dn->dn_next_nlevels[(tx->tx_txg-1) & TXG_MASK] > db->db_level || 1234fa9e4066Sahrens dn->dn_next_nlevels[(tx->tx_txg-2) & TXG_MASK] > db->db_level); 1235fa9e4066Sahrens 1236fa9e4066Sahrens /* 1237fa9e4066Sahrens * We should only be dirtying in syncing context if it's the 123814843421SMatthew Ahrens * mos or we're initializing the os or it's a special object. 123914843421SMatthew Ahrens * However, we are allowed to dirty in syncing context provided 124014843421SMatthew Ahrens * we already dirtied it in open context. Hence we must make 124114843421SMatthew Ahrens * this assertion only if we're not already dirty. 
1242fa9e4066Sahrens */ 1243744947dcSTom Erickson os = dn->dn_objset; 124414843421SMatthew Ahrens ASSERT(!dmu_tx_is_syncing(tx) || DMU_OBJECT_IS_SPECIAL(dn->dn_object) || 124514843421SMatthew Ahrens os->os_dsl_dataset == NULL || BP_IS_HOLE(os->os_rootbp)); 1246fa9e4066Sahrens ASSERT(db->db.db_size != 0); 1247fa9e4066Sahrens 1248fa9e4066Sahrens dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size); 1249fa9e4066Sahrens 12500a586ceaSMark Shellenbaum if (db->db_blkid != DMU_BONUS_BLKID) { 12511934e92fSmaybee /* 12521934e92fSmaybee * Update the accounting. 1253d3469faaSMark Maybee * Note: we delay "free accounting" until after we drop 1254d3469faaSMark Maybee * the db_mtx. This keeps us from grabbing other locks 1255b24ab676SJeff Bonwick * (and possibly deadlocking) in bp_get_dsize() while 1256d3469faaSMark Maybee * also holding the db_mtx. 12571934e92fSmaybee */ 12581934e92fSmaybee dnode_willuse_space(dn, db->db.db_size, tx); 1259d3469faaSMark Maybee do_free_accounting = dbuf_block_freeable(db); 12601934e92fSmaybee } 12611934e92fSmaybee 1262ea8dc4b6Seschrock /* 1263ea8dc4b6Seschrock * If this buffer is dirty in an old transaction group we need 1264ea8dc4b6Seschrock * to make a copy of it so that the changes we make in this 1265ea8dc4b6Seschrock * transaction group won't leak out when we sync the older txg. 
1266ea8dc4b6Seschrock */ 1267c717a561Smaybee dr = kmem_zalloc(sizeof (dbuf_dirty_record_t), KM_SLEEP); 1268c717a561Smaybee if (db->db_level == 0) { 1269c717a561Smaybee void *data_old = db->db_buf; 1270c717a561Smaybee 127182c9918fSTim Haley if (db->db_state != DB_NOFILL) { 12720a586ceaSMark Shellenbaum if (db->db_blkid == DMU_BONUS_BLKID) { 1273c717a561Smaybee dbuf_fix_old_data(db, tx->tx_txg); 1274c717a561Smaybee data_old = db->db.db_data; 1275c717a561Smaybee } else if (db->db.db_object != DMU_META_DNODE_OBJECT) { 1276fa9e4066Sahrens /* 127782c9918fSTim Haley * Release the data buffer from the cache so 127882c9918fSTim Haley * that we can modify it without impacting 127982c9918fSTim Haley * possible other users of this cached data 128082c9918fSTim Haley * block. Note that indirect blocks and 128182c9918fSTim Haley * private objects are not released until the 128282c9918fSTim Haley * syncing state (since they are only modified 128382c9918fSTim Haley * then). 1284fa9e4066Sahrens */ 1285fa9e4066Sahrens arc_release(db->db_buf, db); 1286fa9e4066Sahrens dbuf_fix_old_data(db, tx->tx_txg); 1287c717a561Smaybee data_old = db->db_buf; 1288fa9e4066Sahrens } 1289c717a561Smaybee ASSERT(data_old != NULL); 129082c9918fSTim Haley } 1291c717a561Smaybee dr->dt.dl.dr_data = data_old; 1292c717a561Smaybee } else { 1293c717a561Smaybee mutex_init(&dr->dt.di.dr_mtx, NULL, MUTEX_DEFAULT, NULL); 1294c717a561Smaybee list_create(&dr->dt.di.dr_children, 1295c717a561Smaybee sizeof (dbuf_dirty_record_t), 1296c717a561Smaybee offsetof(dbuf_dirty_record_t, dr_dirty_node)); 1297fa9e4066Sahrens } 129869962b56SMatthew Ahrens if (db->db_blkid != DMU_BONUS_BLKID && os->os_dsl_dataset != NULL) 129969962b56SMatthew Ahrens dr->dr_accounted = db->db.db_size; 1300c717a561Smaybee dr->dr_dbuf = db; 1301c717a561Smaybee dr->dr_txg = tx->tx_txg; 1302c717a561Smaybee dr->dr_next = *drp; 1303c717a561Smaybee *drp = dr; 1304fa9e4066Sahrens 1305fa9e4066Sahrens /* 1306fa9e4066Sahrens * We could have been freed_in_flight 
between the dbuf_noread 1307fa9e4066Sahrens * and dbuf_dirty. We win, as though the dbuf_noread() had 1308fa9e4066Sahrens * happened after the free. 1309fa9e4066Sahrens */ 13100a586ceaSMark Shellenbaum if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID && 13110a586ceaSMark Shellenbaum db->db_blkid != DMU_SPILL_BLKID) { 1312c717a561Smaybee mutex_enter(&dn->dn_mtx); 1313bf16b11eSMatthew Ahrens if (dn->dn_free_ranges[txgoff] != NULL) { 1314bf16b11eSMatthew Ahrens range_tree_clear(dn->dn_free_ranges[txgoff], 1315bf16b11eSMatthew Ahrens db->db_blkid, 1); 1316bf16b11eSMatthew Ahrens } 1317fa9e4066Sahrens mutex_exit(&dn->dn_mtx); 1318c717a561Smaybee db->db_freed_in_flight = FALSE; 1319c717a561Smaybee } 1320fa9e4066Sahrens 1321fa9e4066Sahrens /* 1322fa9e4066Sahrens * This buffer is now part of this txg 1323fa9e4066Sahrens */ 1324fa9e4066Sahrens dbuf_add_ref(db, (void *)(uintptr_t)tx->tx_txg); 1325fa9e4066Sahrens db->db_dirtycnt += 1; 1326fa9e4066Sahrens ASSERT3U(db->db_dirtycnt, <=, 3); 1327fa9e4066Sahrens 1328fa9e4066Sahrens mutex_exit(&db->db_mtx); 1329fa9e4066Sahrens 13300a586ceaSMark Shellenbaum if (db->db_blkid == DMU_BONUS_BLKID || 13310a586ceaSMark Shellenbaum db->db_blkid == DMU_SPILL_BLKID) { 1332c717a561Smaybee mutex_enter(&dn->dn_mtx); 1333c717a561Smaybee ASSERT(!list_link_active(&dr->dr_dirty_node)); 1334c717a561Smaybee list_insert_tail(&dn->dn_dirty_records[txgoff], dr); 1335c717a561Smaybee mutex_exit(&dn->dn_mtx); 1336fa9e4066Sahrens dnode_setdirty(dn, tx); 1337744947dcSTom Erickson DB_DNODE_EXIT(db); 1338c717a561Smaybee return (dr); 1339d3469faaSMark Maybee } else if (do_free_accounting) { 1340d3469faaSMark Maybee blkptr_t *bp = db->db_blkptr; 1341d3469faaSMark Maybee int64_t willfree = (bp && !BP_IS_HOLE(bp)) ? 
1342b24ab676SJeff Bonwick bp_get_dsize(os->os_spa, bp) : db->db.db_size; 1343d3469faaSMark Maybee /* 1344d3469faaSMark Maybee * This is only a guess -- if the dbuf is dirty 1345d3469faaSMark Maybee * in a previous txg, we don't know how much 1346d3469faaSMark Maybee * space it will use on disk yet. We should 1347d3469faaSMark Maybee * really have the struct_rwlock to access 1348d3469faaSMark Maybee * db_blkptr, but since this is just a guess, 1349d3469faaSMark Maybee * it's OK if we get an odd answer. 1350d3469faaSMark Maybee */ 1351837b568bSGeorge Wilson ddt_prefetch(os->os_spa, bp); 1352d3469faaSMark Maybee dnode_willuse_space(dn, -willfree, tx); 1353fa9e4066Sahrens } 1354fa9e4066Sahrens 1355fa9e4066Sahrens if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) { 1356fa9e4066Sahrens rw_enter(&dn->dn_struct_rwlock, RW_READER); 1357fa9e4066Sahrens drop_struct_lock = TRUE; 1358fa9e4066Sahrens } 1359fa9e4066Sahrens 13608346f03fSJonathan W Adams if (db->db_level == 0) { 13618346f03fSJonathan W Adams dnode_new_blkid(dn, db->db_blkid, tx, drop_struct_lock); 13628346f03fSJonathan W Adams ASSERT(dn->dn_maxblkid >= db->db_blkid); 13638346f03fSJonathan W Adams } 13648346f03fSJonathan W Adams 136544eda4d7Smaybee if (db->db_level+1 < dn->dn_nlevels) { 1366c717a561Smaybee dmu_buf_impl_t *parent = db->db_parent; 1367c717a561Smaybee dbuf_dirty_record_t *di; 1368c717a561Smaybee int parent_held = FALSE; 1369c717a561Smaybee 1370c717a561Smaybee if (db->db_parent == NULL || db->db_parent == dn->dn_dbuf) { 1371fa9e4066Sahrens int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; 1372c717a561Smaybee 1373fa9e4066Sahrens parent = dbuf_hold_level(dn, db->db_level+1, 1374fa9e4066Sahrens db->db_blkid >> epbs, FTAG); 137501025c89SJohn Harres ASSERT(parent != NULL); 1376c717a561Smaybee parent_held = TRUE; 1377c717a561Smaybee } 1378fa9e4066Sahrens if (drop_struct_lock) 1379fa9e4066Sahrens rw_exit(&dn->dn_struct_rwlock); 1380c717a561Smaybee ASSERT3U(db->db_level+1, ==, parent->db_level); 1381c717a561Smaybee di 
= dbuf_dirty(parent, tx); 1382c717a561Smaybee if (parent_held) 1383ea8dc4b6Seschrock dbuf_rele(parent, FTAG); 1384c717a561Smaybee 1385c717a561Smaybee mutex_enter(&db->db_mtx); 138669962b56SMatthew Ahrens /* 138769962b56SMatthew Ahrens * Since we've dropped the mutex, it's possible that 138869962b56SMatthew Ahrens * dbuf_undirty() might have changed this out from under us. 138969962b56SMatthew Ahrens */ 1390c717a561Smaybee if (db->db_last_dirty == dr || 1391c717a561Smaybee dn->dn_object == DMU_META_DNODE_OBJECT) { 1392c717a561Smaybee mutex_enter(&di->dt.di.dr_mtx); 1393c717a561Smaybee ASSERT3U(di->dr_txg, ==, tx->tx_txg); 1394c717a561Smaybee ASSERT(!list_link_active(&dr->dr_dirty_node)); 1395c717a561Smaybee list_insert_tail(&di->dt.di.dr_children, dr); 1396c717a561Smaybee mutex_exit(&di->dt.di.dr_mtx); 1397c717a561Smaybee dr->dr_parent = di; 1398c717a561Smaybee } 1399c717a561Smaybee mutex_exit(&db->db_mtx); 1400fa9e4066Sahrens } else { 1401c717a561Smaybee ASSERT(db->db_level+1 == dn->dn_nlevels); 1402c717a561Smaybee ASSERT(db->db_blkid < dn->dn_nblkptr); 1403744947dcSTom Erickson ASSERT(db->db_parent == NULL || db->db_parent == dn->dn_dbuf); 1404c717a561Smaybee mutex_enter(&dn->dn_mtx); 1405c717a561Smaybee ASSERT(!list_link_active(&dr->dr_dirty_node)); 1406c717a561Smaybee list_insert_tail(&dn->dn_dirty_records[txgoff], dr); 1407c717a561Smaybee mutex_exit(&dn->dn_mtx); 1408fa9e4066Sahrens if (drop_struct_lock) 1409fa9e4066Sahrens rw_exit(&dn->dn_struct_rwlock); 1410fa9e4066Sahrens } 1411fa9e4066Sahrens 1412fa9e4066Sahrens dnode_setdirty(dn, tx); 1413744947dcSTom Erickson DB_DNODE_EXIT(db); 1414c717a561Smaybee return (dr); 1415fa9e4066Sahrens } 1416fa9e4066Sahrens 14173b2aab18SMatthew Ahrens /* 14183e30c24aSWill Andrews * Undirty a buffer in the transaction group referenced by the given 14193e30c24aSWill Andrews * transaction. Return whether this evicted the dbuf. 
14203b2aab18SMatthew Ahrens */ 14213b2aab18SMatthew Ahrens static boolean_t 1422fa9e4066Sahrens dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) 1423fa9e4066Sahrens { 1424744947dcSTom Erickson dnode_t *dn; 1425c717a561Smaybee uint64_t txg = tx->tx_txg; 142617f17c2dSbonwick dbuf_dirty_record_t *dr, **drp; 1427fa9e4066Sahrens 1428c717a561Smaybee ASSERT(txg != 0); 142946e1baa6SMatthew Ahrens 143046e1baa6SMatthew Ahrens /* 143146e1baa6SMatthew Ahrens * Due to our use of dn_nlevels below, this can only be called 143246e1baa6SMatthew Ahrens * in open context, unless we are operating on the MOS. 143346e1baa6SMatthew Ahrens * From syncing context, dn_nlevels may be different from the 143446e1baa6SMatthew Ahrens * dn_nlevels used when dbuf was dirtied. 143546e1baa6SMatthew Ahrens */ 143646e1baa6SMatthew Ahrens ASSERT(db->db_objset == 143746e1baa6SMatthew Ahrens dmu_objset_pool(db->db_objset)->dp_meta_objset || 143846e1baa6SMatthew Ahrens txg != spa_syncing_txg(dmu_objset_spa(db->db_objset))); 14390a586ceaSMark Shellenbaum ASSERT(db->db_blkid != DMU_BONUS_BLKID); 14403b2aab18SMatthew Ahrens ASSERT0(db->db_level); 14413b2aab18SMatthew Ahrens ASSERT(MUTEX_HELD(&db->db_mtx)); 1442fa9e4066Sahrens 1443fa9e4066Sahrens /* 1444fa9e4066Sahrens * If this buffer is not dirty, we're done. 
1445fa9e4066Sahrens */ 144617f17c2dSbonwick for (drp = &db->db_last_dirty; (dr = *drp) != NULL; drp = &dr->dr_next) 1447c717a561Smaybee if (dr->dr_txg <= txg) 1448c717a561Smaybee break; 14493b2aab18SMatthew Ahrens if (dr == NULL || dr->dr_txg < txg) 14503b2aab18SMatthew Ahrens return (B_FALSE); 1451c717a561Smaybee ASSERT(dr->dr_txg == txg); 1452b24ab676SJeff Bonwick ASSERT(dr->dr_dbuf == db); 1453fa9e4066Sahrens 1454744947dcSTom Erickson DB_DNODE_ENTER(db); 1455744947dcSTom Erickson dn = DB_DNODE(db); 1456744947dcSTom Erickson 1457fa9e4066Sahrens dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size); 1458fa9e4066Sahrens 1459fa9e4066Sahrens ASSERT(db->db.db_size != 0); 1460fa9e4066Sahrens 146146e1baa6SMatthew Ahrens dsl_pool_undirty_space(dmu_objset_pool(dn->dn_objset), 146246e1baa6SMatthew Ahrens dr->dr_accounted, txg); 1463fa9e4066Sahrens 146417f17c2dSbonwick *drp = dr->dr_next; 1465c717a561Smaybee 14663f2366c2SGordon Ross /* 14673f2366c2SGordon Ross * Note that there are three places in dbuf_dirty() 14683f2366c2SGordon Ross * where this dirty record may be put on a list. 14693f2366c2SGordon Ross * Make sure to do a list_remove corresponding to 14703f2366c2SGordon Ross * every one of those list_insert calls. 
14713f2366c2SGordon Ross */ 1472c717a561Smaybee if (dr->dr_parent) { 1473c717a561Smaybee mutex_enter(&dr->dr_parent->dt.di.dr_mtx); 1474c717a561Smaybee list_remove(&dr->dr_parent->dt.di.dr_children, dr); 1475c717a561Smaybee mutex_exit(&dr->dr_parent->dt.di.dr_mtx); 14763f2366c2SGordon Ross } else if (db->db_blkid == DMU_SPILL_BLKID || 14773f2366c2SGordon Ross db->db_level + 1 == dn->dn_nlevels) { 1478cdb0ab79Smaybee ASSERT(db->db_blkptr == NULL || db->db_parent == dn->dn_dbuf); 1479fa9e4066Sahrens mutex_enter(&dn->dn_mtx); 1480c717a561Smaybee list_remove(&dn->dn_dirty_records[txg & TXG_MASK], dr); 1481fa9e4066Sahrens mutex_exit(&dn->dn_mtx); 1482c717a561Smaybee } 1483744947dcSTom Erickson DB_DNODE_EXIT(db); 1484c717a561Smaybee 148582c9918fSTim Haley if (db->db_state != DB_NOFILL) { 1486c717a561Smaybee dbuf_unoverride(dr); 1487c717a561Smaybee 1488c717a561Smaybee ASSERT(db->db_buf != NULL); 1489c717a561Smaybee ASSERT(dr->dt.dl.dr_data != NULL); 1490c717a561Smaybee if (dr->dt.dl.dr_data != db->db_buf) 14913b2aab18SMatthew Ahrens VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data, db)); 1492c717a561Smaybee } 1493d2b3cbbdSJorgen Lundman 1494c717a561Smaybee kmem_free(dr, sizeof (dbuf_dirty_record_t)); 1495fa9e4066Sahrens 1496fa9e4066Sahrens ASSERT(db->db_dirtycnt > 0); 1497fa9e4066Sahrens db->db_dirtycnt -= 1; 1498fa9e4066Sahrens 1499c717a561Smaybee if (refcount_remove(&db->db_holds, (void *)(uintptr_t)txg) == 0) { 1500ea8dc4b6Seschrock arc_buf_t *buf = db->db_buf; 1501fa9e4066Sahrens 1502b24ab676SJeff Bonwick ASSERT(db->db_state == DB_NOFILL || arc_released(buf)); 1503bc9014e6SJustin Gibbs dbuf_clear_data(db); 15043b2aab18SMatthew Ahrens VERIFY(arc_buf_remove_ref(buf, db)); 1505fa9e4066Sahrens dbuf_evict(db); 15063b2aab18SMatthew Ahrens return (B_TRUE); 1507fa9e4066Sahrens } 1508fa9e4066Sahrens 15093b2aab18SMatthew Ahrens return (B_FALSE); 1510fa9e4066Sahrens } 1511fa9e4066Sahrens 1512fa9e4066Sahrens void 151343466aaeSMax Grossman dmu_buf_will_dirty(dmu_buf_t *db_fake, 
dmu_tx_t *tx) 1514fa9e4066Sahrens { 151543466aaeSMax Grossman dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; 15161ab7f2deSmaybee int rf = DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH; 1517fa9e4066Sahrens 1518fa9e4066Sahrens ASSERT(tx->tx_txg != 0); 1519fa9e4066Sahrens ASSERT(!refcount_is_zero(&db->db_holds)); 1520fa9e4066Sahrens 15210f2e7d03SMatthew Ahrens /* 15220f2e7d03SMatthew Ahrens * Quick check for dirtyness. For already dirty blocks, this 15230f2e7d03SMatthew Ahrens * reduces runtime of this function by >90%, and overall performance 15240f2e7d03SMatthew Ahrens * by 50% for some workloads (e.g. file deletion with indirect blocks 15250f2e7d03SMatthew Ahrens * cached). 15260f2e7d03SMatthew Ahrens */ 15270f2e7d03SMatthew Ahrens mutex_enter(&db->db_mtx); 15280f2e7d03SMatthew Ahrens dbuf_dirty_record_t *dr; 15290f2e7d03SMatthew Ahrens for (dr = db->db_last_dirty; 15300f2e7d03SMatthew Ahrens dr != NULL && dr->dr_txg >= tx->tx_txg; dr = dr->dr_next) { 15310f2e7d03SMatthew Ahrens /* 15320f2e7d03SMatthew Ahrens * It's possible that it is already dirty but not cached, 15330f2e7d03SMatthew Ahrens * because there are some calls to dbuf_dirty() that don't 15340f2e7d03SMatthew Ahrens * go through dmu_buf_will_dirty(). 15350f2e7d03SMatthew Ahrens */ 15360f2e7d03SMatthew Ahrens if (dr->dr_txg == tx->tx_txg && db->db_state == DB_CACHED) { 15370f2e7d03SMatthew Ahrens /* This dbuf is already dirty and cached. 
*/ 15380f2e7d03SMatthew Ahrens dbuf_redirty(dr); 15390f2e7d03SMatthew Ahrens mutex_exit(&db->db_mtx); 15400f2e7d03SMatthew Ahrens return; 15410f2e7d03SMatthew Ahrens } 15420f2e7d03SMatthew Ahrens } 15430f2e7d03SMatthew Ahrens mutex_exit(&db->db_mtx); 15440f2e7d03SMatthew Ahrens 1545744947dcSTom Erickson DB_DNODE_ENTER(db); 1546744947dcSTom Erickson if (RW_WRITE_HELD(&DB_DNODE(db)->dn_struct_rwlock)) 1547fa9e4066Sahrens rf |= DB_RF_HAVESTRUCT; 1548744947dcSTom Erickson DB_DNODE_EXIT(db); 1549ea8dc4b6Seschrock (void) dbuf_read(db, NULL, rf); 1550c717a561Smaybee (void) dbuf_dirty(db, tx); 1551fa9e4066Sahrens } 1552fa9e4066Sahrens 1553fa9e4066Sahrens void 155482c9918fSTim Haley dmu_buf_will_not_fill(dmu_buf_t *db_fake, dmu_tx_t *tx) 155582c9918fSTim Haley { 155682c9918fSTim Haley dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; 155782c9918fSTim Haley 155882c9918fSTim Haley db->db_state = DB_NOFILL; 155982c9918fSTim Haley 156082c9918fSTim Haley dmu_buf_will_fill(db_fake, tx); 156182c9918fSTim Haley } 156282c9918fSTim Haley 156382c9918fSTim Haley void 1564ea8dc4b6Seschrock dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx) 1565fa9e4066Sahrens { 1566ea8dc4b6Seschrock dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; 1567ea8dc4b6Seschrock 15680a586ceaSMark Shellenbaum ASSERT(db->db_blkid != DMU_BONUS_BLKID); 1569fa9e4066Sahrens ASSERT(tx->tx_txg != 0); 1570fa9e4066Sahrens ASSERT(db->db_level == 0); 1571fa9e4066Sahrens ASSERT(!refcount_is_zero(&db->db_holds)); 1572fa9e4066Sahrens 1573ea8dc4b6Seschrock ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT || 1574fa9e4066Sahrens dmu_tx_private_ok(tx)); 1575fa9e4066Sahrens 1576fa9e4066Sahrens dbuf_noread(db); 1577c717a561Smaybee (void) dbuf_dirty(db, tx); 1578fa9e4066Sahrens } 1579fa9e4066Sahrens 1580fa9e4066Sahrens #pragma weak dmu_buf_fill_done = dbuf_fill_done 1581fa9e4066Sahrens /* ARGSUSED */ 1582fa9e4066Sahrens void 1583fa9e4066Sahrens dbuf_fill_done(dmu_buf_impl_t *db, dmu_tx_t *tx) 1584fa9e4066Sahrens { 1585fa9e4066Sahrens 
mutex_enter(&db->db_mtx); 15869c9dc39aSek110237 DBUF_VERIFY(db); 1587fa9e4066Sahrens 1588fa9e4066Sahrens if (db->db_state == DB_FILL) { 1589c717a561Smaybee if (db->db_level == 0 && db->db_freed_in_flight) { 15900a586ceaSMark Shellenbaum ASSERT(db->db_blkid != DMU_BONUS_BLKID); 1591fa9e4066Sahrens /* we were freed while filling */ 1592fa9e4066Sahrens /* XXX dbuf_undirty? */ 1593fa9e4066Sahrens bzero(db->db.db_data, db->db.db_size); 1594c717a561Smaybee db->db_freed_in_flight = FALSE; 1595fa9e4066Sahrens } 1596fa9e4066Sahrens db->db_state = DB_CACHED; 1597fa9e4066Sahrens cv_broadcast(&db->db_changed); 1598fa9e4066Sahrens } 1599fa9e4066Sahrens mutex_exit(&db->db_mtx); 1600fa9e4066Sahrens } 1601fa9e4066Sahrens 16025d7b4d43SMatthew Ahrens void 16035d7b4d43SMatthew Ahrens dmu_buf_write_embedded(dmu_buf_t *dbuf, void *data, 16045d7b4d43SMatthew Ahrens bp_embedded_type_t etype, enum zio_compress comp, 16055d7b4d43SMatthew Ahrens int uncompressed_size, int compressed_size, int byteorder, 16065d7b4d43SMatthew Ahrens dmu_tx_t *tx) 16075d7b4d43SMatthew Ahrens { 16085d7b4d43SMatthew Ahrens dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf; 16095d7b4d43SMatthew Ahrens struct dirty_leaf *dl; 16105d7b4d43SMatthew Ahrens dmu_object_type_t type; 16115d7b4d43SMatthew Ahrens 1612ca0cc391SMatthew Ahrens if (etype == BP_EMBEDDED_TYPE_DATA) { 1613ca0cc391SMatthew Ahrens ASSERT(spa_feature_is_active(dmu_objset_spa(db->db_objset), 1614ca0cc391SMatthew Ahrens SPA_FEATURE_EMBEDDED_DATA)); 1615ca0cc391SMatthew Ahrens } 1616ca0cc391SMatthew Ahrens 16175d7b4d43SMatthew Ahrens DB_DNODE_ENTER(db); 16185d7b4d43SMatthew Ahrens type = DB_DNODE(db)->dn_type; 16195d7b4d43SMatthew Ahrens DB_DNODE_EXIT(db); 16205d7b4d43SMatthew Ahrens 16215d7b4d43SMatthew Ahrens ASSERT0(db->db_level); 16225d7b4d43SMatthew Ahrens ASSERT(db->db_blkid != DMU_BONUS_BLKID); 16235d7b4d43SMatthew Ahrens 16245d7b4d43SMatthew Ahrens dmu_buf_will_not_fill(dbuf, tx); 16255d7b4d43SMatthew Ahrens 16265d7b4d43SMatthew Ahrens 
ASSERT3U(db->db_last_dirty->dr_txg, ==, tx->tx_txg); 16275d7b4d43SMatthew Ahrens dl = &db->db_last_dirty->dt.dl; 16285d7b4d43SMatthew Ahrens encode_embedded_bp_compressed(&dl->dr_overridden_by, 16295d7b4d43SMatthew Ahrens data, comp, uncompressed_size, compressed_size); 16305d7b4d43SMatthew Ahrens BPE_SET_ETYPE(&dl->dr_overridden_by, etype); 16315d7b4d43SMatthew Ahrens BP_SET_TYPE(&dl->dr_overridden_by, type); 16325d7b4d43SMatthew Ahrens BP_SET_LEVEL(&dl->dr_overridden_by, 0); 16335d7b4d43SMatthew Ahrens BP_SET_BYTEORDER(&dl->dr_overridden_by, byteorder); 16345d7b4d43SMatthew Ahrens 16355d7b4d43SMatthew Ahrens dl->dr_override_state = DR_OVERRIDDEN; 16365d7b4d43SMatthew Ahrens dl->dr_overridden_by.blk_birth = db->db_last_dirty->dr_txg; 16375d7b4d43SMatthew Ahrens } 16385d7b4d43SMatthew Ahrens 1639ea8dc4b6Seschrock /* 16402fdbea25SAleksandr Guzovskiy * Directly assign a provided arc buf to a given dbuf if it's not referenced 16412fdbea25SAleksandr Guzovskiy * by anybody except our caller. Otherwise copy arcbuf's contents to dbuf. 
16422fdbea25SAleksandr Guzovskiy */ 16432fdbea25SAleksandr Guzovskiy void 16442fdbea25SAleksandr Guzovskiy dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx) 16452fdbea25SAleksandr Guzovskiy { 16462fdbea25SAleksandr Guzovskiy ASSERT(!refcount_is_zero(&db->db_holds)); 16470a586ceaSMark Shellenbaum ASSERT(db->db_blkid != DMU_BONUS_BLKID); 16482fdbea25SAleksandr Guzovskiy ASSERT(db->db_level == 0); 16492fdbea25SAleksandr Guzovskiy ASSERT(DBUF_GET_BUFC_TYPE(db) == ARC_BUFC_DATA); 16502fdbea25SAleksandr Guzovskiy ASSERT(buf != NULL); 16512fdbea25SAleksandr Guzovskiy ASSERT(arc_buf_size(buf) == db->db.db_size); 16522fdbea25SAleksandr Guzovskiy ASSERT(tx->tx_txg != 0); 16532fdbea25SAleksandr Guzovskiy 16542fdbea25SAleksandr Guzovskiy arc_return_buf(buf, db); 16552fdbea25SAleksandr Guzovskiy ASSERT(arc_released(buf)); 16562fdbea25SAleksandr Guzovskiy 16572fdbea25SAleksandr Guzovskiy mutex_enter(&db->db_mtx); 16582fdbea25SAleksandr Guzovskiy 16592fdbea25SAleksandr Guzovskiy while (db->db_state == DB_READ || db->db_state == DB_FILL) 16602fdbea25SAleksandr Guzovskiy cv_wait(&db->db_changed, &db->db_mtx); 16612fdbea25SAleksandr Guzovskiy 16622fdbea25SAleksandr Guzovskiy ASSERT(db->db_state == DB_CACHED || db->db_state == DB_UNCACHED); 16632fdbea25SAleksandr Guzovskiy 16642fdbea25SAleksandr Guzovskiy if (db->db_state == DB_CACHED && 16652fdbea25SAleksandr Guzovskiy refcount_count(&db->db_holds) - 1 > db->db_dirtycnt) { 16662fdbea25SAleksandr Guzovskiy mutex_exit(&db->db_mtx); 16672fdbea25SAleksandr Guzovskiy (void) dbuf_dirty(db, tx); 16682fdbea25SAleksandr Guzovskiy bcopy(buf->b_data, db->db.db_data, db->db.db_size); 16693b2aab18SMatthew Ahrens VERIFY(arc_buf_remove_ref(buf, db)); 1670c242f9a0Schunli zhang - Sun Microsystems - Irvine United States xuio_stat_wbuf_copied(); 16712fdbea25SAleksandr Guzovskiy return; 16722fdbea25SAleksandr Guzovskiy } 16732fdbea25SAleksandr Guzovskiy 1674c242f9a0Schunli zhang - Sun Microsystems - Irvine United States 
xuio_stat_wbuf_nocopy(); 16752fdbea25SAleksandr Guzovskiy if (db->db_state == DB_CACHED) { 16762fdbea25SAleksandr Guzovskiy dbuf_dirty_record_t *dr = db->db_last_dirty; 16772fdbea25SAleksandr Guzovskiy 16782fdbea25SAleksandr Guzovskiy ASSERT(db->db_buf != NULL); 16792fdbea25SAleksandr Guzovskiy if (dr != NULL && dr->dr_txg == tx->tx_txg) { 16802fdbea25SAleksandr Guzovskiy ASSERT(dr->dt.dl.dr_data == db->db_buf); 16812fdbea25SAleksandr Guzovskiy if (!arc_released(db->db_buf)) { 16822fdbea25SAleksandr Guzovskiy ASSERT(dr->dt.dl.dr_override_state == 16832fdbea25SAleksandr Guzovskiy DR_OVERRIDDEN); 16842fdbea25SAleksandr Guzovskiy arc_release(db->db_buf, db); 16852fdbea25SAleksandr Guzovskiy } 16862fdbea25SAleksandr Guzovskiy dr->dt.dl.dr_data = buf; 16873b2aab18SMatthew Ahrens VERIFY(arc_buf_remove_ref(db->db_buf, db)); 16882fdbea25SAleksandr Guzovskiy } else if (dr == NULL || dr->dt.dl.dr_data != db->db_buf) { 16892fdbea25SAleksandr Guzovskiy arc_release(db->db_buf, db); 16903b2aab18SMatthew Ahrens VERIFY(arc_buf_remove_ref(db->db_buf, db)); 16912fdbea25SAleksandr Guzovskiy } 16922fdbea25SAleksandr Guzovskiy db->db_buf = NULL; 16932fdbea25SAleksandr Guzovskiy } 16942fdbea25SAleksandr Guzovskiy ASSERT(db->db_buf == NULL); 16952fdbea25SAleksandr Guzovskiy dbuf_set_data(db, buf); 16962fdbea25SAleksandr Guzovskiy db->db_state = DB_FILL; 16972fdbea25SAleksandr Guzovskiy mutex_exit(&db->db_mtx); 16982fdbea25SAleksandr Guzovskiy (void) dbuf_dirty(db, tx); 169943466aaeSMax Grossman dmu_buf_fill_done(&db->db, tx); 17002fdbea25SAleksandr Guzovskiy } 17012fdbea25SAleksandr Guzovskiy 17022fdbea25SAleksandr Guzovskiy /* 1703ea8dc4b6Seschrock * "Clear" the contents of this dbuf. This will mark the dbuf 170469962b56SMatthew Ahrens * EVICTING and clear *most* of its references. Unfortunately, 1705ea8dc4b6Seschrock * when we are not holding the dn_dbufs_mtx, we can't clear the 1706ea8dc4b6Seschrock * entry in the dn_dbufs list. 
We have to wait until dbuf_destroy() 1707ea8dc4b6Seschrock * in this case. For callers from the DMU we will usually see: 1708bbfa8ea8SMatthew Ahrens * dbuf_clear()->arc_clear_callback()->dbuf_do_evict()->dbuf_destroy() 1709ea8dc4b6Seschrock * For the arc callback, we will usually see: 1710ea8dc4b6Seschrock * dbuf_do_evict()->dbuf_clear();dbuf_destroy() 1711ea8dc4b6Seschrock * Sometimes, though, we will get a mix of these two: 1712bbfa8ea8SMatthew Ahrens * DMU: dbuf_clear()->arc_clear_callback() 1713ea8dc4b6Seschrock * ARC: dbuf_do_evict()->dbuf_destroy() 1714bbfa8ea8SMatthew Ahrens * 1715bbfa8ea8SMatthew Ahrens * This routine will dissociate the dbuf from the arc, by calling 1716bbfa8ea8SMatthew Ahrens * arc_clear_callback(), but will not evict the data from the ARC. 1717ea8dc4b6Seschrock */ 1718ea8dc4b6Seschrock void 1719fa9e4066Sahrens dbuf_clear(dmu_buf_impl_t *db) 1720fa9e4066Sahrens { 1721744947dcSTom Erickson dnode_t *dn; 1722ea8dc4b6Seschrock dmu_buf_impl_t *parent = db->db_parent; 1723744947dcSTom Erickson dmu_buf_impl_t *dndb; 1724bbfa8ea8SMatthew Ahrens boolean_t dbuf_gone = B_FALSE; 1725fa9e4066Sahrens 1726fa9e4066Sahrens ASSERT(MUTEX_HELD(&db->db_mtx)); 1727fa9e4066Sahrens ASSERT(refcount_is_zero(&db->db_holds)); 1728fa9e4066Sahrens 1729ea8dc4b6Seschrock dbuf_evict_user(db); 1730ea8dc4b6Seschrock 1731fa9e4066Sahrens if (db->db_state == DB_CACHED) { 1732ea8dc4b6Seschrock ASSERT(db->db.db_data != NULL); 17330a586ceaSMark Shellenbaum if (db->db_blkid == DMU_BONUS_BLKID) { 1734ea8dc4b6Seschrock zio_buf_free(db->db.db_data, DN_MAX_BONUSLEN); 17355a98e54bSBrendan Gregg - Sun Microsystems arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER); 17360e8c6158Smaybee } 1737fa9e4066Sahrens db->db.db_data = NULL; 1738fa9e4066Sahrens db->db_state = DB_UNCACHED; 1739fa9e4066Sahrens } 1740fa9e4066Sahrens 174182c9918fSTim Haley ASSERT(db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL); 1742fa9e4066Sahrens ASSERT(db->db_data_pending == NULL); 1743fa9e4066Sahrens 
1744ea8dc4b6Seschrock db->db_state = DB_EVICTING; 1745ea8dc4b6Seschrock db->db_blkptr = NULL; 1746ea8dc4b6Seschrock 1747744947dcSTom Erickson DB_DNODE_ENTER(db); 1748744947dcSTom Erickson dn = DB_DNODE(db); 1749744947dcSTom Erickson dndb = dn->dn_dbuf; 17500a586ceaSMark Shellenbaum if (db->db_blkid != DMU_BONUS_BLKID && MUTEX_HELD(&dn->dn_dbufs_mtx)) { 17510f6d88adSAlex Reece avl_remove(&dn->dn_dbufs, db); 1752640c1670SJosef 'Jeff' Sipek atomic_dec_32(&dn->dn_dbufs_count); 1753744947dcSTom Erickson membar_producer(); 1754744947dcSTom Erickson DB_DNODE_EXIT(db); 1755744947dcSTom Erickson /* 1756744947dcSTom Erickson * Decrementing the dbuf count means that the hold corresponding 1757744947dcSTom Erickson * to the removed dbuf is no longer discounted in dnode_move(), 1758744947dcSTom Erickson * so the dnode cannot be moved until after we release the hold. 1759744947dcSTom Erickson * The membar_producer() ensures visibility of the decremented 1760744947dcSTom Erickson * value in dnode_move(), since DB_DNODE_EXIT doesn't actually 1761744947dcSTom Erickson * release any lock. 1762744947dcSTom Erickson */ 1763ea8dc4b6Seschrock dnode_rele(dn, db); 1764744947dcSTom Erickson db->db_dnode_handle = NULL; 1765744947dcSTom Erickson } else { 1766744947dcSTom Erickson DB_DNODE_EXIT(db); 1767ea8dc4b6Seschrock } 1768ea8dc4b6Seschrock 1769ea8dc4b6Seschrock if (db->db_buf) 1770bbfa8ea8SMatthew Ahrens dbuf_gone = arc_clear_callback(db->db_buf); 1771ea8dc4b6Seschrock 1772ea8dc4b6Seschrock if (!dbuf_gone) 1773fa9e4066Sahrens mutex_exit(&db->db_mtx); 1774fa9e4066Sahrens 1775fa9e4066Sahrens /* 1776744947dcSTom Erickson * If this dbuf is referenced from an indirect dbuf, 1777fa9e4066Sahrens * decrement the ref count on the indirect dbuf. 
1778fa9e4066Sahrens */ 1779c543ec06Sahrens if (parent && parent != dndb) 1780ea8dc4b6Seschrock dbuf_rele(parent, db); 1781fa9e4066Sahrens } 1782fa9e4066Sahrens 1783a2cdcdd2SPaul Dagnelie /* 1784a2cdcdd2SPaul Dagnelie * Note: While bpp will always be updated if the function returns success, 1785a2cdcdd2SPaul Dagnelie * parentp will not be updated if the dnode does not have dn_dbuf filled in; 1786a2cdcdd2SPaul Dagnelie * this happens when the dnode is the meta-dnode, or a userused or groupused 1787a2cdcdd2SPaul Dagnelie * object. 1788a2cdcdd2SPaul Dagnelie */ 1789fa9e4066Sahrens static int 1790fa9e4066Sahrens dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse, 1791fa9e4066Sahrens dmu_buf_impl_t **parentp, blkptr_t **bpp) 1792fa9e4066Sahrens { 1793fa9e4066Sahrens int nlevels, epbs; 1794fa9e4066Sahrens 17950b69c2f0Sahrens *parentp = NULL; 17960b69c2f0Sahrens *bpp = NULL; 17970b69c2f0Sahrens 17980a586ceaSMark Shellenbaum ASSERT(blkid != DMU_BONUS_BLKID); 17990a586ceaSMark Shellenbaum 18000a586ceaSMark Shellenbaum if (blkid == DMU_SPILL_BLKID) { 18010a586ceaSMark Shellenbaum mutex_enter(&dn->dn_mtx); 180206e0070dSMark Shellenbaum if (dn->dn_have_spill && 180306e0070dSMark Shellenbaum (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR)) 18040a586ceaSMark Shellenbaum *bpp = &dn->dn_phys->dn_spill; 18050a586ceaSMark Shellenbaum else 18060a586ceaSMark Shellenbaum *bpp = NULL; 18070a586ceaSMark Shellenbaum dbuf_add_ref(dn->dn_dbuf, NULL); 18080a586ceaSMark Shellenbaum *parentp = dn->dn_dbuf; 18090a586ceaSMark Shellenbaum mutex_exit(&dn->dn_mtx); 18100a586ceaSMark Shellenbaum return (0); 18110a586ceaSMark Shellenbaum } 1812ea8dc4b6Seschrock 1813fa9e4066Sahrens if (dn->dn_phys->dn_nlevels == 0) 1814fa9e4066Sahrens nlevels = 1; 1815fa9e4066Sahrens else 1816fa9e4066Sahrens nlevels = dn->dn_phys->dn_nlevels; 1817fa9e4066Sahrens 1818fa9e4066Sahrens epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; 1819fa9e4066Sahrens 1820fa9e4066Sahrens ASSERT3U(level * epbs, <, 64); 
1821fa9e4066Sahrens ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); 1822ea8dc4b6Seschrock if (level >= nlevels || 1823fa9e4066Sahrens (blkid > (dn->dn_phys->dn_maxblkid >> (level * epbs)))) { 1824fa9e4066Sahrens /* the buffer has no parent yet */ 1825be6fd75aSMatthew Ahrens return (SET_ERROR(ENOENT)); 1826fa9e4066Sahrens } else if (level < nlevels-1) { 1827fa9e4066Sahrens /* this block is referenced from an indirect block */ 1828fa9e4066Sahrens int err = dbuf_hold_impl(dn, level+1, 1829a2cdcdd2SPaul Dagnelie blkid >> epbs, fail_sparse, FALSE, NULL, parentp); 1830fa9e4066Sahrens if (err) 1831fa9e4066Sahrens return (err); 1832ea8dc4b6Seschrock err = dbuf_read(*parentp, NULL, 1833ea8dc4b6Seschrock (DB_RF_HAVESTRUCT | DB_RF_NOPREFETCH | DB_RF_CANFAIL)); 1834c543ec06Sahrens if (err) { 1835c543ec06Sahrens dbuf_rele(*parentp, NULL); 1836c543ec06Sahrens *parentp = NULL; 1837c543ec06Sahrens return (err); 1838c543ec06Sahrens } 1839fa9e4066Sahrens *bpp = ((blkptr_t *)(*parentp)->db.db_data) + 1840fa9e4066Sahrens (blkid & ((1ULL << epbs) - 1)); 1841c543ec06Sahrens return (0); 1842fa9e4066Sahrens } else { 1843fa9e4066Sahrens /* the block is referenced from the dnode */ 1844fa9e4066Sahrens ASSERT3U(level, ==, nlevels-1); 1845fa9e4066Sahrens ASSERT(dn->dn_phys->dn_nblkptr == 0 || 1846fa9e4066Sahrens blkid < dn->dn_phys->dn_nblkptr); 1847c543ec06Sahrens if (dn->dn_dbuf) { 1848c543ec06Sahrens dbuf_add_ref(dn->dn_dbuf, NULL); 1849fa9e4066Sahrens *parentp = dn->dn_dbuf; 1850c543ec06Sahrens } 1851fa9e4066Sahrens *bpp = &dn->dn_phys->dn_blkptr[blkid]; 1852fa9e4066Sahrens return (0); 1853fa9e4066Sahrens } 1854fa9e4066Sahrens } 1855fa9e4066Sahrens 1856fa9e4066Sahrens static dmu_buf_impl_t * 1857fa9e4066Sahrens dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid, 1858fa9e4066Sahrens dmu_buf_impl_t *parent, blkptr_t *blkptr) 1859fa9e4066Sahrens { 1860503ad85cSMatthew Ahrens objset_t *os = dn->dn_objset; 1861fa9e4066Sahrens dmu_buf_impl_t *db, *odb; 1862fa9e4066Sahrens 1863fa9e4066Sahrens 
ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); 1864fa9e4066Sahrens ASSERT(dn->dn_type != DMU_OT_NONE); 1865fa9e4066Sahrens 1866fa9e4066Sahrens db = kmem_cache_alloc(dbuf_cache, KM_SLEEP); 1867fa9e4066Sahrens 1868fa9e4066Sahrens db->db_objset = os; 1869fa9e4066Sahrens db->db.db_object = dn->dn_object; 1870fa9e4066Sahrens db->db_level = level; 1871fa9e4066Sahrens db->db_blkid = blkid; 1872c717a561Smaybee db->db_last_dirty = NULL; 1873ea8dc4b6Seschrock db->db_dirtycnt = 0; 1874744947dcSTom Erickson db->db_dnode_handle = dn->dn_handle; 1875ea8dc4b6Seschrock db->db_parent = parent; 1876ea8dc4b6Seschrock db->db_blkptr = blkptr; 1877fa9e4066Sahrens 1878bc9014e6SJustin Gibbs db->db_user = NULL; 1879d2058105SJustin T. Gibbs db->db_user_immediate_evict = FALSE; 1880d2058105SJustin T. Gibbs db->db_freed_in_flight = FALSE; 1881d2058105SJustin T. Gibbs db->db_pending_evict = FALSE; 1882ea8dc4b6Seschrock 18830a586ceaSMark Shellenbaum if (blkid == DMU_BONUS_BLKID) { 1884ea8dc4b6Seschrock ASSERT3P(parent, ==, dn->dn_dbuf); 18851934e92fSmaybee db->db.db_size = DN_MAX_BONUSLEN - 18861934e92fSmaybee (dn->dn_nblkptr-1) * sizeof (blkptr_t); 18871934e92fSmaybee ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen); 18880a586ceaSMark Shellenbaum db->db.db_offset = DMU_BONUS_BLKID; 1889ea8dc4b6Seschrock db->db_state = DB_UNCACHED; 1890ea8dc4b6Seschrock /* the bonus dbuf is not placed in the hash table */ 18915a98e54bSBrendan Gregg - Sun Microsystems arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER); 1892ea8dc4b6Seschrock return (db); 18930a586ceaSMark Shellenbaum } else if (blkid == DMU_SPILL_BLKID) { 18940a586ceaSMark Shellenbaum db->db.db_size = (blkptr != NULL) ? 18950a586ceaSMark Shellenbaum BP_GET_LSIZE(blkptr) : SPA_MINBLOCKSIZE; 18960a586ceaSMark Shellenbaum db->db.db_offset = 0; 1897fa9e4066Sahrens } else { 1898fa9e4066Sahrens int blocksize = 1899fa9e4066Sahrens db->db_level ? 
1 << dn->dn_indblkshift : dn->dn_datablksz; 1900fa9e4066Sahrens db->db.db_size = blocksize; 1901fa9e4066Sahrens db->db.db_offset = db->db_blkid * blocksize; 1902fa9e4066Sahrens } 1903fa9e4066Sahrens 1904fa9e4066Sahrens /* 1905fa9e4066Sahrens * Hold the dn_dbufs_mtx while we get the new dbuf 1906fa9e4066Sahrens * in the hash table *and* added to the dbufs list. 1907fa9e4066Sahrens * This prevents a possible deadlock with someone 1908fa9e4066Sahrens * trying to look up this dbuf before its added to the 1909fa9e4066Sahrens * dn_dbufs list. 1910fa9e4066Sahrens */ 1911fa9e4066Sahrens mutex_enter(&dn->dn_dbufs_mtx); 1912ea8dc4b6Seschrock db->db_state = DB_EVICTING; 1913fa9e4066Sahrens if ((odb = dbuf_hash_insert(db)) != NULL) { 1914fa9e4066Sahrens /* someone else inserted it first */ 1915fa9e4066Sahrens kmem_cache_free(dbuf_cache, db); 1916fa9e4066Sahrens mutex_exit(&dn->dn_dbufs_mtx); 1917fa9e4066Sahrens return (odb); 1918fa9e4066Sahrens } 19190f6d88adSAlex Reece avl_add(&dn->dn_dbufs, db); 1920713d6c20SMatthew Ahrens if (db->db_level == 0 && db->db_blkid >= 1921713d6c20SMatthew Ahrens dn->dn_unlisted_l0_blkid) 1922713d6c20SMatthew Ahrens dn->dn_unlisted_l0_blkid = db->db_blkid + 1; 1923ea8dc4b6Seschrock db->db_state = DB_UNCACHED; 1924fa9e4066Sahrens mutex_exit(&dn->dn_dbufs_mtx); 19255a98e54bSBrendan Gregg - Sun Microsystems arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER); 1926fa9e4066Sahrens 1927fa9e4066Sahrens if (parent && parent != dn->dn_dbuf) 1928fa9e4066Sahrens dbuf_add_ref(parent, db); 1929fa9e4066Sahrens 1930ea8dc4b6Seschrock ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT || 1931ea8dc4b6Seschrock refcount_count(&dn->dn_holds) > 0); 1932fa9e4066Sahrens (void) refcount_add(&dn->dn_holds, db); 1933640c1670SJosef 'Jeff' Sipek atomic_inc_32(&dn->dn_dbufs_count); 1934fa9e4066Sahrens 1935fa9e4066Sahrens dprintf_dbuf(db, "db=%p\n", db); 1936fa9e4066Sahrens 1937fa9e4066Sahrens return (db); 1938fa9e4066Sahrens } 1939fa9e4066Sahrens 1940fa9e4066Sahrens static 
/*
 * NOTE(review): this is a blame-annotated listing; the leading
 * "NNNNhash author" token on each record is annotation residue, not C
 * code.  Only comments are added here; all other text is unchanged.
 */
/*
 * dbuf_do_evict(): eviction callback for a dbuf whose hold count is zero
 * (asserted below).  Takes db_mtx if the caller did not.  A dbuf not yet
 * in DB_EVICTING must be DB_CACHED and is evicted via dbuf_evict();
 * one already in DB_EVICTING is torn down with dbuf_destroy().
 * Always returns 0.
 */
int
1941ea8dc4b6Seschrock dbuf_do_evict(void *private)
1942fa9e4066Sahrens {
1943bbfa8ea8SMatthew Ahrens 	dmu_buf_impl_t *db = private;
1944fa9e4066Sahrens
1945ea8dc4b6Seschrock 	if (!MUTEX_HELD(&db->db_mtx))
1946ea8dc4b6Seschrock 		mutex_enter(&db->db_mtx);
1947ea8dc4b6Seschrock
1948ea8dc4b6Seschrock 	ASSERT(refcount_is_zero(&db->db_holds));
1949ea8dc4b6Seschrock
1950ea8dc4b6Seschrock 	if (db->db_state != DB_EVICTING) {
1951ea8dc4b6Seschrock 		ASSERT(db->db_state == DB_CACHED);
19529c9dc39aSek110237 		DBUF_VERIFY(db);
1953ea8dc4b6Seschrock 		db->db_buf = NULL;
1954ea8dc4b6Seschrock 		dbuf_evict(db);
1955ea8dc4b6Seschrock 	} else {
1956ea8dc4b6Seschrock 		mutex_exit(&db->db_mtx);
1957ea8dc4b6Seschrock 		dbuf_destroy(db);
1958fa9e4066Sahrens 	}
1959ea8dc4b6Seschrock 	return (0);
1960fa9e4066Sahrens }
1961fa9e4066Sahrens
/*
 * dbuf_destroy(): final teardown of a dbuf with no holds.  For non-bonus
 * dbufs: unlink from the dnode's dn_dbufs AVL tree and the global hash,
 * then drop the dnode hold (see the dnode_move() comment below for the
 * ordering constraint).  Frees the dbuf back to dbuf_cache and returns
 * its accounting to the ARC.
 */
1962fa9e4066Sahrens static void
1963fa9e4066Sahrens dbuf_destroy(dmu_buf_impl_t *db)
1964fa9e4066Sahrens {
1965fa9e4066Sahrens 	ASSERT(refcount_is_zero(&db->db_holds));
1966fa9e4066Sahrens
19670a586ceaSMark Shellenbaum 	if (db->db_blkid != DMU_BONUS_BLKID) {
1968ea8dc4b6Seschrock 		/*
1969ea8dc4b6Seschrock 		 * If this dbuf is still on the dn_dbufs list,
1970ea8dc4b6Seschrock 		 * remove it from that list.
1971ea8dc4b6Seschrock 		 */
1972744947dcSTom Erickson 		if (db->db_dnode_handle != NULL) {
1973744947dcSTom Erickson 			dnode_t *dn;
19741934e92fSmaybee
1975744947dcSTom Erickson 			DB_DNODE_ENTER(db);
1976744947dcSTom Erickson 			dn = DB_DNODE(db);
19771934e92fSmaybee 			mutex_enter(&dn->dn_dbufs_mtx);
19780f6d88adSAlex Reece 			avl_remove(&dn->dn_dbufs, db);
1979640c1670SJosef 'Jeff' Sipek 			atomic_dec_32(&dn->dn_dbufs_count);
1980ea8dc4b6Seschrock 			mutex_exit(&dn->dn_dbufs_mtx);
1981744947dcSTom Erickson 			DB_DNODE_EXIT(db);
1982744947dcSTom Erickson 			/*
1983744947dcSTom Erickson 			 * Decrementing the dbuf count means that the hold
1984744947dcSTom Erickson 			 * corresponding to the removed dbuf is no longer
1985744947dcSTom Erickson 			 * discounted in dnode_move(), so the dnode cannot be
1986744947dcSTom Erickson 			 * moved until after we release the hold.
1987744947dcSTom Erickson 			 */
1988ea8dc4b6Seschrock 			dnode_rele(dn, db);
1989744947dcSTom Erickson 			db->db_dnode_handle = NULL;
1990ea8dc4b6Seschrock 		}
1991ea8dc4b6Seschrock 		dbuf_hash_remove(db);
1992ea8dc4b6Seschrock 	}
1993ea8dc4b6Seschrock 	db->db_parent = NULL;
1994ea8dc4b6Seschrock 	db->db_buf = NULL;
1995ea8dc4b6Seschrock
1996fa9e4066Sahrens 	ASSERT(db->db.db_data == NULL);
1997fa9e4066Sahrens 	ASSERT(db->db_hash_next == NULL);
1998fa9e4066Sahrens 	ASSERT(db->db_blkptr == NULL);
1999fa9e4066Sahrens 	ASSERT(db->db_data_pending == NULL);
2000fa9e4066Sahrens
2001fa9e4066Sahrens 	kmem_cache_free(dbuf_cache, db);
20025a98e54bSBrendan Gregg - Sun Microsystems 	arc_space_return(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
2003fa9e4066Sahrens }
2004fa9e4066Sahrens
/*
 * dbuf_prefetch_arg_t: state threaded through the asynchronous prefetch
 * chain (dbuf_prefetch -> dbuf_prefetch_indirect_done ->
 * dbuf_issue_final_prefetch).  Heap-allocated; freed by whichever
 * callback finishes or aborts the chain.
 */
2005a2cdcdd2SPaul Dagnelie typedef struct dbuf_prefetch_arg {
2006a2cdcdd2SPaul Dagnelie 	spa_t *dpa_spa; /* The spa to issue the prefetch in. */
2007a2cdcdd2SPaul Dagnelie 	zbookmark_phys_t dpa_zb; /* The target block to prefetch. */
2008a2cdcdd2SPaul Dagnelie 	int dpa_epbs; /* Entries (blkptr_t's) Per Block Shift.
 */
2009a2cdcdd2SPaul Dagnelie 	int dpa_curlevel; /* The current level that we're reading */
2010a2cdcdd2SPaul Dagnelie 	zio_priority_t dpa_prio; /* The priority I/Os should be issued at. */
2011a2cdcdd2SPaul Dagnelie 	zio_t *dpa_zio; /* The parent zio_t for all prefetches. */
2012a2cdcdd2SPaul Dagnelie 	arc_flags_t dpa_aflags; /* Flags to pass to the final prefetch. */
2013a2cdcdd2SPaul Dagnelie } dbuf_prefetch_arg_t;
2014a2cdcdd2SPaul Dagnelie
2015a2cdcdd2SPaul Dagnelie /*
2016a2cdcdd2SPaul Dagnelie  * Actually issue the prefetch read for the block given.
 * Holes and embedded block pointers carry no on-disk data, so they are
 * skipped.  The read is fire-and-forget (NOWAIT, no done callback) and
 * is parented to dpa_zio so the caller can wait on the whole chain.
2017a2cdcdd2SPaul Dagnelie  */
2018a2cdcdd2SPaul Dagnelie static void
2019a2cdcdd2SPaul Dagnelie dbuf_issue_final_prefetch(dbuf_prefetch_arg_t *dpa, blkptr_t *bp)
2020fa9e4066Sahrens {
2021a2cdcdd2SPaul Dagnelie 	if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp))
2022a2cdcdd2SPaul Dagnelie 		return;
2023a2cdcdd2SPaul Dagnelie
2024a2cdcdd2SPaul Dagnelie 	arc_flags_t aflags =
2025a2cdcdd2SPaul Dagnelie 	    dpa->dpa_aflags | ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH;
2026a2cdcdd2SPaul Dagnelie
2027a2cdcdd2SPaul Dagnelie 	ASSERT3U(dpa->dpa_curlevel, ==, BP_GET_LEVEL(bp));
2028a2cdcdd2SPaul Dagnelie 	ASSERT3U(dpa->dpa_curlevel, ==, dpa->dpa_zb.zb_level);
2029a2cdcdd2SPaul Dagnelie 	ASSERT(dpa->dpa_zio != NULL);
2030a2cdcdd2SPaul Dagnelie 	(void) arc_read(dpa->dpa_zio, dpa->dpa_spa, bp, NULL, NULL,
2031a2cdcdd2SPaul Dagnelie 	    dpa->dpa_prio, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
2032a2cdcdd2SPaul Dagnelie 	    &aflags, &dpa->dpa_zb);
2033a2cdcdd2SPaul Dagnelie }
2034a2cdcdd2SPaul Dagnelie
2035a2cdcdd2SPaul Dagnelie /*
2036a2cdcdd2SPaul Dagnelie  * Called when an indirect block above our prefetch target is read in. This
2037a2cdcdd2SPaul Dagnelie  * will either read in the next indirect block down the tree or issue the actual
2038a2cdcdd2SPaul Dagnelie  * prefetch if the next block down is our target.
2039a2cdcdd2SPaul Dagnelie  */
2040a2cdcdd2SPaul Dagnelie static void
2041a2cdcdd2SPaul Dagnelie dbuf_prefetch_indirect_done(zio_t *zio, arc_buf_t *abuf, void *private)
2042a2cdcdd2SPaul Dagnelie {
2043a2cdcdd2SPaul Dagnelie 	dbuf_prefetch_arg_t *dpa = private;
2044a2cdcdd2SPaul Dagnelie
	/* zio is NULL when invoked for an already-cached indirect block. */
2045a2cdcdd2SPaul Dagnelie 	ASSERT3S(dpa->dpa_zb.zb_level, <, dpa->dpa_curlevel);
2046a2cdcdd2SPaul Dagnelie 	ASSERT3S(dpa->dpa_curlevel, >, 0);
2047a2cdcdd2SPaul Dagnelie 	if (zio != NULL) {
2048a2cdcdd2SPaul Dagnelie 		ASSERT3S(BP_GET_LEVEL(zio->io_bp), ==, dpa->dpa_curlevel);
2049a2cdcdd2SPaul Dagnelie 		ASSERT3U(BP_GET_LSIZE(zio->io_bp), ==, zio->io_size);
2050a2cdcdd2SPaul Dagnelie 		ASSERT3P(zio->io_spa, ==, dpa->dpa_spa);
2051a2cdcdd2SPaul Dagnelie 	}
2052a2cdcdd2SPaul Dagnelie
2053a2cdcdd2SPaul Dagnelie 	dpa->dpa_curlevel--;
2054a2cdcdd2SPaul Dagnelie
	/*
	 * Locate, within the just-read indirect block, the blkptr that
	 * covers the target block one level further down.
	 */
2055a2cdcdd2SPaul Dagnelie 	uint64_t nextblkid = dpa->dpa_zb.zb_blkid >>
2056a2cdcdd2SPaul Dagnelie 	    (dpa->dpa_epbs * (dpa->dpa_curlevel - dpa->dpa_zb.zb_level));
2057a2cdcdd2SPaul Dagnelie 	blkptr_t *bp = ((blkptr_t *)abuf->b_data) +
2058a2cdcdd2SPaul Dagnelie 	    P2PHASE(nextblkid, 1ULL << dpa->dpa_epbs);
	/* Abort the chain on a hole or a read error; dpa is freed here. */
2059a2cdcdd2SPaul Dagnelie 	if (BP_IS_HOLE(bp) || (zio != NULL && zio->io_error != 0)) {
2060a2cdcdd2SPaul Dagnelie 		kmem_free(dpa, sizeof (*dpa));
2061a2cdcdd2SPaul Dagnelie 	} else if (dpa->dpa_curlevel == dpa->dpa_zb.zb_level) {
2062a2cdcdd2SPaul Dagnelie 		ASSERT3U(nextblkid, ==, dpa->dpa_zb.zb_blkid);
2063a2cdcdd2SPaul Dagnelie 		dbuf_issue_final_prefetch(dpa, bp);
2064a2cdcdd2SPaul Dagnelie 		kmem_free(dpa, sizeof (*dpa));
2065a2cdcdd2SPaul Dagnelie 	} else {
2066a2cdcdd2SPaul Dagnelie 		arc_flags_t iter_aflags = ARC_FLAG_NOWAIT;
2067a2cdcdd2SPaul Dagnelie 		zbookmark_phys_t zb;
2068a2cdcdd2SPaul Dagnelie
2069a2cdcdd2SPaul Dagnelie 		ASSERT3U(dpa->dpa_curlevel, ==, BP_GET_LEVEL(bp));
2070a2cdcdd2SPaul Dagnelie
2071a2cdcdd2SPaul Dagnelie 		SET_BOOKMARK(&zb, dpa->dpa_zb.zb_objset,
2072a2cdcdd2SPaul Dagnelie 		    dpa->dpa_zb.zb_object, dpa->dpa_curlevel,
nextblkid);
2073a2cdcdd2SPaul Dagnelie
		/* Recurse: read the next indirect block down, same callback. */
2074a2cdcdd2SPaul Dagnelie 		(void) arc_read(dpa->dpa_zio, dpa->dpa_spa,
2075a2cdcdd2SPaul Dagnelie 		    bp, dbuf_prefetch_indirect_done, dpa, dpa->dpa_prio,
2076a2cdcdd2SPaul Dagnelie 		    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
2077a2cdcdd2SPaul Dagnelie 		    &iter_aflags, &zb);
2078a2cdcdd2SPaul Dagnelie 	}
2079a2cdcdd2SPaul Dagnelie 	(void) arc_buf_remove_ref(abuf, private);
2080a2cdcdd2SPaul Dagnelie }
2081a2cdcdd2SPaul Dagnelie
2082a2cdcdd2SPaul Dagnelie /*
2083a2cdcdd2SPaul Dagnelie  * Issue prefetch reads for the given block on the given level. If the indirect
2084a2cdcdd2SPaul Dagnelie  * blocks above that block are not in memory, we will read them in
2085a2cdcdd2SPaul Dagnelie  * asynchronously. As a result, this call never blocks waiting for a read to
2086a2cdcdd2SPaul Dagnelie  * complete.
 * Caller must hold dn_struct_rwlock (asserted).  blkid must not name the
 * bonus block.
2087a2cdcdd2SPaul Dagnelie  */
2088a2cdcdd2SPaul Dagnelie void
2089a2cdcdd2SPaul Dagnelie dbuf_prefetch(dnode_t *dn, int64_t level, uint64_t blkid, zio_priority_t prio,
2090a2cdcdd2SPaul Dagnelie     arc_flags_t aflags)
2091a2cdcdd2SPaul Dagnelie {
2092a2cdcdd2SPaul Dagnelie 	blkptr_t bp;
2093a2cdcdd2SPaul Dagnelie 	int epbs, nlevels, curlevel;
2094a2cdcdd2SPaul Dagnelie 	uint64_t curblkid;
2095fa9e4066Sahrens
20960a586ceaSMark Shellenbaum 	ASSERT(blkid != DMU_BONUS_BLKID);
2097fa9e4066Sahrens 	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
2098fa9e4066Sahrens
	/* Nothing to do past the end of the object or for freed blocks. */
2099cf6106c8SMatthew Ahrens 	if (blkid > dn->dn_maxblkid)
2100cf6106c8SMatthew Ahrens 		return;
2101cf6106c8SMatthew Ahrens
2102fa9e4066Sahrens 	if (dnode_block_freed(dn, blkid))
2103fa9e4066Sahrens 		return;
2104fa9e4066Sahrens
2105fa9e4066Sahrens 	/*
2106a2cdcdd2SPaul Dagnelie 	 * This dnode hasn't been written to disk yet, so there's nothing to
2107a2cdcdd2SPaul Dagnelie 	 * prefetch.
2108fa9e4066Sahrens 	 */
2109a2cdcdd2SPaul Dagnelie 	nlevels = dn->dn_phys->dn_nlevels;
2110a2cdcdd2SPaul Dagnelie 	if (level >= nlevels || dn->dn_phys->dn_nblkptr == 0)
2111a2cdcdd2SPaul Dagnelie 		return;
2112a2cdcdd2SPaul Dagnelie
2113a2cdcdd2SPaul Dagnelie 	epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
2114a2cdcdd2SPaul Dagnelie 	if (dn->dn_phys->dn_maxblkid < blkid << (epbs * level))
2115a2cdcdd2SPaul Dagnelie 		return;
2116a2cdcdd2SPaul Dagnelie
2117a2cdcdd2SPaul Dagnelie 	dmu_buf_impl_t *db = dbuf_find(dn->dn_objset, dn->dn_object,
2118a2cdcdd2SPaul Dagnelie 	    level, blkid);
2119a2cdcdd2SPaul Dagnelie 	if (db != NULL) {
2120fa9e4066Sahrens 		mutex_exit(&db->db_mtx);
2121a2cdcdd2SPaul Dagnelie 		/*
2122a2cdcdd2SPaul Dagnelie 		 * This dbuf already exists. It is either CACHED, or
2123a2cdcdd2SPaul Dagnelie 		 * (we assume) about to be read or filled.
2124a2cdcdd2SPaul Dagnelie 		 */
2125fa9e4066Sahrens 		return;
2126fa9e4066Sahrens 	}
2127fa9e4066Sahrens
2128a2cdcdd2SPaul Dagnelie 	/*
2129a2cdcdd2SPaul Dagnelie 	 * Find the closest ancestor (indirect block) of the target block
2130a2cdcdd2SPaul Dagnelie 	 * that is present in the cache. In this indirect block, we will
2131a2cdcdd2SPaul Dagnelie 	 * find the bp that is at curlevel, curblkid.
2132a2cdcdd2SPaul Dagnelie 	 */
2133a2cdcdd2SPaul Dagnelie 	curlevel = level;
2134a2cdcdd2SPaul Dagnelie 	curblkid = blkid;
2135a2cdcdd2SPaul Dagnelie 	while (curlevel < nlevels - 1) {
2136a2cdcdd2SPaul Dagnelie 		int parent_level = curlevel + 1;
2137a2cdcdd2SPaul Dagnelie 		uint64_t parent_blkid = curblkid >> epbs;
2138a2cdcdd2SPaul Dagnelie 		dmu_buf_impl_t *db;
2139a2cdcdd2SPaul Dagnelie
		/* fail_uncached=TRUE: only succeeds for a cached parent. */
2140a2cdcdd2SPaul Dagnelie 		if (dbuf_hold_impl(dn, parent_level, parent_blkid,
2141a2cdcdd2SPaul Dagnelie 		    FALSE, TRUE, FTAG, &db) == 0) {
2142a2cdcdd2SPaul Dagnelie 			blkptr_t *bpp = db->db_buf->b_data;
2143a2cdcdd2SPaul Dagnelie 			bp = bpp[P2PHASE(curblkid, 1 << epbs)];
2144a2cdcdd2SPaul Dagnelie 			dbuf_rele(db, FTAG);
2145a2cdcdd2SPaul Dagnelie 			break;
2146a2cdcdd2SPaul Dagnelie 		}
2147a2cdcdd2SPaul Dagnelie
2148a2cdcdd2SPaul Dagnelie 		curlevel = parent_level;
2149a2cdcdd2SPaul Dagnelie 		curblkid = parent_blkid;
2150a2cdcdd2SPaul Dagnelie 	}
2151a2cdcdd2SPaul Dagnelie
2152a2cdcdd2SPaul Dagnelie 	if (curlevel == nlevels - 1) {
2153a2cdcdd2SPaul Dagnelie 		/* No cached indirect blocks found. */
2154a2cdcdd2SPaul Dagnelie 		ASSERT3U(curblkid, <, dn->dn_phys->dn_nblkptr);
2155a2cdcdd2SPaul Dagnelie 		bp = dn->dn_phys->dn_blkptr[curblkid];
2156a2cdcdd2SPaul Dagnelie 	}
2157a2cdcdd2SPaul Dagnelie 	if (BP_IS_HOLE(&bp))
2158a2cdcdd2SPaul Dagnelie 		return;
2159a2cdcdd2SPaul Dagnelie
2160a2cdcdd2SPaul Dagnelie 	ASSERT3U(curlevel, ==, BP_GET_LEVEL(&bp));
2161a2cdcdd2SPaul Dagnelie
2162a2cdcdd2SPaul Dagnelie 	zio_t *pio = zio_root(dmu_objset_spa(dn->dn_objset), NULL, NULL,
2163a2cdcdd2SPaul Dagnelie 	    ZIO_FLAG_CANFAIL);
2164a2cdcdd2SPaul Dagnelie
2165a2cdcdd2SPaul Dagnelie 	dbuf_prefetch_arg_t *dpa = kmem_zalloc(sizeof (*dpa), KM_SLEEP);
2166b24ab676SJeff Bonwick 	dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
2167a2cdcdd2SPaul Dagnelie 	SET_BOOKMARK(&dpa->dpa_zb, ds != NULL ?
ds->ds_object : DMU_META_OBJSET,
2168a2cdcdd2SPaul Dagnelie 	    dn->dn_object, level, blkid);
2169a2cdcdd2SPaul Dagnelie 	dpa->dpa_curlevel = curlevel;
2170a2cdcdd2SPaul Dagnelie 	dpa->dpa_prio = prio;
2171a2cdcdd2SPaul Dagnelie 	dpa->dpa_aflags = aflags;
2172a2cdcdd2SPaul Dagnelie 	dpa->dpa_spa = dn->dn_objset->os_spa;
2173a2cdcdd2SPaul Dagnelie 	dpa->dpa_epbs = epbs;
2174a2cdcdd2SPaul Dagnelie 	dpa->dpa_zio = pio;
2175a2cdcdd2SPaul Dagnelie
2176a2cdcdd2SPaul Dagnelie 	/*
2177a2cdcdd2SPaul Dagnelie 	 * If we have the indirect just above us, no need to do the asynchronous
2178a2cdcdd2SPaul Dagnelie 	 * prefetch chain; we'll just run the last step ourselves. If we're at
2179a2cdcdd2SPaul Dagnelie 	 * a higher level, though, we want to issue the prefetches for all the
2180a2cdcdd2SPaul Dagnelie 	 * indirect blocks asynchronously, so we can go on with whatever we were
2181a2cdcdd2SPaul Dagnelie 	 * doing.
2182a2cdcdd2SPaul Dagnelie 	 */
2183a2cdcdd2SPaul Dagnelie 	if (curlevel == level) {
2184a2cdcdd2SPaul Dagnelie 		ASSERT3U(curblkid, ==, blkid);
2185a2cdcdd2SPaul Dagnelie 		dbuf_issue_final_prefetch(dpa, &bp);
2186a2cdcdd2SPaul Dagnelie 		kmem_free(dpa, sizeof (*dpa));
2187a2cdcdd2SPaul Dagnelie 	} else {
2188a2cdcdd2SPaul Dagnelie 		arc_flags_t iter_aflags = ARC_FLAG_NOWAIT;
21897802d7bfSMatthew Ahrens 		zbookmark_phys_t zb;
2190b24ab676SJeff Bonwick
2191a2cdcdd2SPaul Dagnelie 		SET_BOOKMARK(&zb, ds != NULL ? ds->ds_object : DMU_META_OBJSET,
2192a2cdcdd2SPaul Dagnelie 		    dn->dn_object, curlevel, curblkid);
2193a2cdcdd2SPaul Dagnelie 		(void) arc_read(dpa->dpa_zio, dpa->dpa_spa,
2194a2cdcdd2SPaul Dagnelie 		    &bp, dbuf_prefetch_indirect_done, dpa, prio,
2195fa9e4066Sahrens 		    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
2196a2cdcdd2SPaul Dagnelie 		    &iter_aflags, &zb);
2197fa9e4066Sahrens 	}
2198a2cdcdd2SPaul Dagnelie 	/*
2199a2cdcdd2SPaul Dagnelie 	 * We use pio here instead of dpa_zio since it's possible that
2200a2cdcdd2SPaul Dagnelie 	 * dpa may have already been freed.
2201a2cdcdd2SPaul Dagnelie 	 */
2202a2cdcdd2SPaul Dagnelie 	zio_nowait(pio);
2203fa9e4066Sahrens }
2204fa9e4066Sahrens
2205fa9e4066Sahrens /*
2206fa9e4066Sahrens  * Returns with db_holds incremented, and db_mtx not held.
2207fa9e4066Sahrens  * Note: dn_struct_rwlock must be held.
 * fail_sparse: return ENOENT instead of creating a dbuf over a hole.
 * fail_uncached: return ENOENT unless the dbuf is already DB_CACHED.
2208fa9e4066Sahrens  */
2209fa9e4066Sahrens int
2210a2cdcdd2SPaul Dagnelie dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid,
2211a2cdcdd2SPaul Dagnelie     boolean_t fail_sparse, boolean_t fail_uncached,
2212fa9e4066Sahrens     void *tag, dmu_buf_impl_t **dbp)
2213fa9e4066Sahrens {
2214fa9e4066Sahrens 	dmu_buf_impl_t *db, *parent = NULL;
2215fa9e4066Sahrens
22160a586ceaSMark Shellenbaum 	ASSERT(blkid != DMU_BONUS_BLKID);
2217fa9e4066Sahrens 	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
2218fa9e4066Sahrens 	ASSERT3U(dn->dn_nlevels, >, level);
2219fa9e4066Sahrens
2220fa9e4066Sahrens 	*dbp = NULL;
2221ea8dc4b6Seschrock top:
2222fa9e4066Sahrens 	/* dbuf_find() returns with db_mtx held */
2223e57a022bSJustin T. Gibbs 	db = dbuf_find(dn->dn_objset, dn->dn_object, level, blkid);
2224fa9e4066Sahrens
2225fa9e4066Sahrens 	if (db == NULL) {
2226fa9e4066Sahrens 		blkptr_t *bp = NULL;
2227fa9e4066Sahrens 		int err;
2228fa9e4066Sahrens
2229a2cdcdd2SPaul Dagnelie 		if (fail_uncached)
2230a2cdcdd2SPaul Dagnelie 			return (SET_ERROR(ENOENT));
2231a2cdcdd2SPaul Dagnelie
2232c543ec06Sahrens 		ASSERT3P(parent, ==, NULL);
2233fa9e4066Sahrens 		err = dbuf_findbp(dn, level, blkid, fail_sparse, &parent, &bp);
2234fa9e4066Sahrens 		if (fail_sparse) {
2235fa9e4066Sahrens 			if (err == 0 && bp && BP_IS_HOLE(bp))
2236be6fd75aSMatthew Ahrens 				err = SET_ERROR(ENOENT);
2237fa9e4066Sahrens 			if (err) {
2238c543ec06Sahrens 				if (parent)
2239ea8dc4b6Seschrock 					dbuf_rele(parent, NULL);
2240fa9e4066Sahrens 				return (err);
2241fa9e4066Sahrens 			}
2242fa9e4066Sahrens 		}
2243ea8dc4b6Seschrock 		if (err && err != ENOENT)
2244ea8dc4b6Seschrock 			return (err);
2245fa9e4066Sahrens 		db = dbuf_create(dn, level, blkid, parent, bp);
2246fa9e4066Sahrens 	}
2247fa9e4066Sahrens
2248a2cdcdd2SPaul Dagnelie 	if (fail_uncached && db->db_state != DB_CACHED) {
2249a2cdcdd2SPaul Dagnelie 		mutex_exit(&db->db_mtx);
2250a2cdcdd2SPaul Dagnelie 		return (SET_ERROR(ENOENT));
2251a2cdcdd2SPaul Dagnelie 	}
2252a2cdcdd2SPaul Dagnelie
	/*
	 * Re-take an ARC reference on the buffer; if the ARC evicted the
	 * data out from under us (b_data == NULL), clear the dbuf and
	 * retry the lookup from the top.
	 */
2253ea8dc4b6Seschrock 	if (db->db_buf && refcount_is_zero(&db->db_holds)) {
2254ea8dc4b6Seschrock 		arc_buf_add_ref(db->db_buf, db);
2255ea8dc4b6Seschrock 		if (db->db_buf->b_data == NULL) {
2256ea8dc4b6Seschrock 			dbuf_clear(db);
2257c543ec06Sahrens 			if (parent) {
2258c543ec06Sahrens 				dbuf_rele(parent, NULL);
2259c543ec06Sahrens 				parent = NULL;
2260c543ec06Sahrens 			}
2261ea8dc4b6Seschrock 			goto top;
2262ea8dc4b6Seschrock 		}
2263ea8dc4b6Seschrock 		ASSERT3P(db->db.db_data, ==, db->db_buf->b_data);
2264ea8dc4b6Seschrock 	}
2265ea8dc4b6Seschrock
2266ea8dc4b6Seschrock 	ASSERT(db->db_buf == NULL || arc_referenced(db->db_buf));
2267ea8dc4b6Seschrock
2268fa9e4066Sahrens 	/*
2269c717a561Smaybee 	 * If this buffer is currently syncing out, and we are are
2270c717a561Smaybee 	 * still referencing it from db_data, we need to make a copy
2271c717a561Smaybee 	 * of it in case we decide we want to dirty it again in this txg.
2272fa9e4066Sahrens 	 */
22730a586ceaSMark Shellenbaum 	if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
2274ea8dc4b6Seschrock 	    dn->dn_object != DMU_META_DNODE_OBJECT &&
2275c717a561Smaybee 	    db->db_state == DB_CACHED && db->db_data_pending) {
2276c717a561Smaybee 		dbuf_dirty_record_t *dr = db->db_data_pending;
2277c717a561Smaybee
2278c717a561Smaybee 		if (dr->dt.dl.dr_data == db->db_buf) {
2279ad23a2dbSjohansen 			arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
2280fa9e4066Sahrens
2281c717a561Smaybee 			dbuf_set_data(db,
2282744947dcSTom Erickson 			    arc_buf_alloc(dn->dn_objset->os_spa,
2283c717a561Smaybee 			    db->db.db_size, db, type));
2284c717a561Smaybee 			bcopy(dr->dt.dl.dr_data->b_data, db->db.db_data,
2285fa9e4066Sahrens 			    db->db.db_size);
2286fa9e4066Sahrens 		}
2287c717a561Smaybee 	}
2288fa9e4066Sahrens
2289ea8dc4b6Seschrock 	(void) refcount_add(&db->db_holds, tag);
22909c9dc39aSek110237 	DBUF_VERIFY(db);
2291fa9e4066Sahrens 	mutex_exit(&db->db_mtx);
2292fa9e4066Sahrens
2293fa9e4066Sahrens 	/* NOTE: we can't rele the parent until after we drop the db_mtx */
2294c543ec06Sahrens 	if (parent)
2295ea8dc4b6Seschrock 		dbuf_rele(parent, NULL);
2296fa9e4066Sahrens
2297744947dcSTom Erickson 	ASSERT3P(DB_DNODE(db), ==, dn);
2298fa9e4066Sahrens 	ASSERT3U(db->db_blkid, ==, blkid);
2299fa9e4066Sahrens 	ASSERT3U(db->db_level, ==, level);
2300fa9e4066Sahrens 	*dbp = db;
2301fa9e4066Sahrens
2302fa9e4066Sahrens 	return (0);
2303fa9e4066Sahrens }
2304fa9e4066Sahrens
/* dbuf_hold(): convenience wrapper — hold a level-0 dbuf. */
2305fa9e4066Sahrens dmu_buf_impl_t *
2306ea8dc4b6Seschrock dbuf_hold(dnode_t *dn, uint64_t blkid, void *tag)
2307fa9e4066Sahrens {
2308a2cdcdd2SPaul Dagnelie 	return (dbuf_hold_level(dn, 0, blkid, tag));
2309fa9e4066Sahrens }
2310fa9e4066Sahrens
/*
 * dbuf_hold_level(): hold a dbuf at an arbitrary level; returns NULL on
 * any error from dbuf_hold_impl() (the errno itself is discarded).
 */
2311fa9e4066Sahrens dmu_buf_impl_t *
2312fa9e4066Sahrens dbuf_hold_level(dnode_t *dn, int level, uint64_t blkid, void *tag)
2313fa9e4066Sahrens {
2314fa9e4066Sahrens 	dmu_buf_impl_t *db;
2315a2cdcdd2SPaul Dagnelie 	int err = dbuf_hold_impl(dn, level, blkid, FALSE, FALSE, tag, &db);
2316ea8dc4b6Seschrock 	return (err ? NULL : db);
2317fa9e4066Sahrens }
2318fa9e4066Sahrens
/*
 * dbuf_create_bonus(): instantiate the dnode's bonus-buffer dbuf.
 * Caller must hold dn_struct_rwlock as writer, and no bonus dbuf may
 * already exist (both asserted).
 */
23191934e92fSmaybee void
2320ea8dc4b6Seschrock dbuf_create_bonus(dnode_t *dn)
2321fa9e4066Sahrens {
2322ea8dc4b6Seschrock 	ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));
2323ea8dc4b6Seschrock
2324ea8dc4b6Seschrock 	ASSERT(dn->dn_bonus == NULL);
23250a586ceaSMark Shellenbaum 	dn->dn_bonus = dbuf_create(dn, 0, DMU_BONUS_BLKID, dn->dn_dbuf, NULL);
23260a586ceaSMark Shellenbaum }
23270a586ceaSMark Shellenbaum
/*
 * dbuf_spill_set_blksz(): resize a spill-block dbuf.  A zero blksz means
 * SPA_MINBLOCKSIZE; the size is rounded up to a SPA_MINBLOCKSIZE
 * multiple.  Returns ENOTSUP if db is not the spill block, else 0.
 */
23280a586ceaSMark Shellenbaum int
23290a586ceaSMark Shellenbaum dbuf_spill_set_blksz(dmu_buf_t *db_fake, uint64_t blksz, dmu_tx_t *tx)
23300a586ceaSMark Shellenbaum {
23310a586ceaSMark Shellenbaum 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
2332744947dcSTom Erickson 	dnode_t *dn;
2333744947dcSTom Erickson
23340a586ceaSMark Shellenbaum 	if (db->db_blkid != DMU_SPILL_BLKID)
2335be6fd75aSMatthew Ahrens 		return (SET_ERROR(ENOTSUP));
23360a586ceaSMark Shellenbaum 	if (blksz == 0)
23370a586ceaSMark Shellenbaum 		blksz = SPA_MINBLOCKSIZE;
2338b5152584SMatthew Ahrens 	ASSERT3U(blksz, <=, spa_maxblocksize(dmu_objset_spa(db->db_objset)));
23390a586ceaSMark Shellenbaum 	blksz = P2ROUNDUP(blksz, SPA_MINBLOCKSIZE);
23400a586ceaSMark Shellenbaum
2341744947dcSTom Erickson 	DB_DNODE_ENTER(db);
2342744947dcSTom Erickson 	dn = DB_DNODE(db);
2343744947dcSTom Erickson 	rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
23440a586ceaSMark Shellenbaum 	dbuf_new_size(db, blksz, tx);
2345744947dcSTom Erickson 	rw_exit(&dn->dn_struct_rwlock);
2346744947dcSTom Erickson 	DB_DNODE_EXIT(db);
23470a586ceaSMark Shellenbaum
23480a586ceaSMark Shellenbaum 	return (0);
23490a586ceaSMark Shellenbaum }
23500a586ceaSMark Shellenbaum
/* dbuf_rm_spill(): free the dnode's spill block in this tx. */
23510a586ceaSMark Shellenbaum void
23520a586ceaSMark Shellenbaum dbuf_rm_spill(dnode_t *dn, dmu_tx_t *tx)
23530a586ceaSMark Shellenbaum {
23540a586ceaSMark Shellenbaum 	dbuf_free_range(dn, DMU_SPILL_BLKID, DMU_SPILL_BLKID, tx);
2355fa9e4066Sahrens }
2356fa9e4066Sahrens
/*
 * dbuf_add_ref(): add a hold to an already-held dbuf (the resulting
 * count must exceed 1 — a zero-to-one transition must go through
 * dbuf_hold_impl() instead).
 */
2357ea8dc4b6Seschrock #pragma weak dmu_buf_add_ref = dbuf_add_ref
2358fa9e4066Sahrens void
2359fa9e4066Sahrens dbuf_add_ref(dmu_buf_impl_t *db, void *tag)
2360fa9e4066Sahrens {
2361ea8dc4b6Seschrock 	int64_t holds = refcount_add(&db->db_holds, tag);
2362ea8dc4b6Seschrock 	ASSERT(holds > 1);
2363fa9e4066Sahrens }
2364fa9e4066Sahrens
/*
 * dbuf_try_add_ref(): conditionally add a hold — succeeds only if the
 * (os, obj, blkid) lookup still resolves to this same dbuf and it has
 * more holds than dirty records.  Returns B_TRUE when the hold was
 * taken.  Note: dbuf_find()/dbuf_find_bonus() return with db_mtx held,
 * which is why mutex_exit() appears below without a visible enter.
 */
2365e57a022bSJustin T. Gibbs #pragma weak dmu_buf_try_add_ref = dbuf_try_add_ref
2366e57a022bSJustin T. Gibbs boolean_t
2367e57a022bSJustin T. Gibbs dbuf_try_add_ref(dmu_buf_t *db_fake, objset_t *os, uint64_t obj, uint64_t blkid,
2368e57a022bSJustin T. Gibbs     void *tag)
2369e57a022bSJustin T. Gibbs {
2370e57a022bSJustin T. Gibbs 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
2371e57a022bSJustin T. Gibbs 	dmu_buf_impl_t *found_db;
2372e57a022bSJustin T. Gibbs 	boolean_t result = B_FALSE;
2373e57a022bSJustin T. Gibbs
2374e57a022bSJustin T. Gibbs 	if (db->db_blkid == DMU_BONUS_BLKID)
2375e57a022bSJustin T. Gibbs 		found_db = dbuf_find_bonus(os, obj);
2376e57a022bSJustin T. Gibbs 	else
2377e57a022bSJustin T. Gibbs 		found_db = dbuf_find(os, obj, 0, blkid);
2378e57a022bSJustin T. Gibbs
2379e57a022bSJustin T. Gibbs 	if (found_db != NULL) {
2380e57a022bSJustin T. Gibbs 		if (db == found_db && dbuf_refcount(db) > db->db_dirtycnt) {
2381e57a022bSJustin T. Gibbs 			(void) refcount_add(&db->db_holds, tag);
2382e57a022bSJustin T. Gibbs 			result = B_TRUE;
2383e57a022bSJustin T. Gibbs 		}
2384e57a022bSJustin T. Gibbs 		mutex_exit(&db->db_mtx);
2385e57a022bSJustin T. Gibbs 	}
2386e57a022bSJustin T. Gibbs 	return (result);
2387e57a022bSJustin T. Gibbs }
2388e57a022bSJustin T. Gibbs
2389744947dcSTom Erickson /*
2390744947dcSTom Erickson  * If you call dbuf_rele() you had better not be referencing the dnode handle
2391744947dcSTom Erickson  * unless you have some other direct or indirect hold on the dnode. (An indirect
2392744947dcSTom Erickson  * hold is a hold on one of the dnode's dbufs, including the bonus buffer.)
2393744947dcSTom Erickson  * Without that, the dbuf_rele() could lead to a dnode_rele() followed by the
2394744947dcSTom Erickson  * dnode's parent dbuf evicting its dnode handles.
2395744947dcSTom Erickson  */
/* dbuf_rele(): release a hold; locks db_mtx and delegates. */
2396fa9e4066Sahrens void
2397ea8dc4b6Seschrock dbuf_rele(dmu_buf_impl_t *db, void *tag)
2398fa9e4066Sahrens {
2399b24ab676SJeff Bonwick 	mutex_enter(&db->db_mtx);
2400b24ab676SJeff Bonwick 	dbuf_rele_and_unlock(db, tag);
2401b24ab676SJeff Bonwick }
2402b24ab676SJeff Bonwick
/* dmu_buf_rele(): public wrapper around dbuf_rele(). */
240343466aaeSMax Grossman void
240443466aaeSMax Grossman dmu_buf_rele(dmu_buf_t *db, void *tag)
240543466aaeSMax Grossman {
240643466aaeSMax Grossman 	dbuf_rele((dmu_buf_impl_t *)db, tag);
240743466aaeSMax Grossman }
240843466aaeSMax Grossman
2409b24ab676SJeff Bonwick /*
2410b24ab676SJeff Bonwick  * dbuf_rele() for an already-locked dbuf. This is necessary to allow
2411b24ab676SJeff Bonwick  * db_dirtycnt and db_holds to be updated atomically.
 * Drops db_mtx before returning on every path.
2412b24ab676SJeff Bonwick  */
2413b24ab676SJeff Bonwick void
2414b24ab676SJeff Bonwick dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag)
2415b24ab676SJeff Bonwick {
2416fa9e4066Sahrens 	int64_t holds;
2417fa9e4066Sahrens
2418b24ab676SJeff Bonwick 	ASSERT(MUTEX_HELD(&db->db_mtx));
24199c9dc39aSek110237 	DBUF_VERIFY(db);
2420fa9e4066Sahrens
2421744947dcSTom Erickson 	/*
2422744947dcSTom Erickson 	 * Remove the reference to the dbuf before removing its hold on the
2423744947dcSTom Erickson 	 * dnode so we can guarantee in dnode_move() that a referenced bonus
2424744947dcSTom Erickson 	 * buffer has a corresponding dnode hold.
2425744947dcSTom Erickson 	 */
2426fa9e4066Sahrens 	holds = refcount_remove(&db->db_holds, tag);
2427ea8dc4b6Seschrock 	ASSERT(holds >= 0);
2428fa9e4066Sahrens
2429c717a561Smaybee 	/*
2430c717a561Smaybee 	 * We can't freeze indirects if there is a possibility that they
2431c717a561Smaybee 	 * may be modified in the current syncing context.
2432c717a561Smaybee 	 */
2433c717a561Smaybee 	if (db->db_buf && holds == (db->db_level == 0 ?
db->db_dirtycnt : 0))
24346b4acc8bSahrens 		arc_buf_freeze(db->db_buf);
24356b4acc8bSahrens
2436fa9e4066Sahrens 	if (holds == db->db_dirtycnt &&
2437d2058105SJustin T. Gibbs 	    db->db_level == 0 && db->db_user_immediate_evict)
2438fa9e4066Sahrens 		dbuf_evict_user(db);
2439ea8dc4b6Seschrock
	/* Last hold dropped: decide how (or whether) to evict the dbuf. */
2440ea8dc4b6Seschrock 	if (holds == 0) {
24410a586ceaSMark Shellenbaum 		if (db->db_blkid == DMU_BONUS_BLKID) {
2442cd485b49SJustin T. Gibbs 			dnode_t *dn;
2443d2058105SJustin T. Gibbs 			boolean_t evict_dbuf = db->db_pending_evict;
2444cd485b49SJustin T. Gibbs
2445cd485b49SJustin T. Gibbs 			/*
2446cd485b49SJustin T. Gibbs 			 * If the dnode moves here, we cannot cross this
2447cd485b49SJustin T. Gibbs 			 * barrier until the move completes.
2448cd485b49SJustin T. Gibbs 			 */
2449cd485b49SJustin T. Gibbs 			DB_DNODE_ENTER(db);
2450cd485b49SJustin T. Gibbs
2451cd485b49SJustin T. Gibbs 			dn = DB_DNODE(db);
2452cd485b49SJustin T. Gibbs 			atomic_dec_32(&dn->dn_dbufs_count);
2453cd485b49SJustin T. Gibbs
2454cd485b49SJustin T. Gibbs 			/*
2455cd485b49SJustin T. Gibbs 			 * Decrementing the dbuf count means that the bonus
2456cd485b49SJustin T. Gibbs 			 * buffer's dnode hold is no longer discounted in
2457cd485b49SJustin T. Gibbs 			 * dnode_move(). The dnode cannot move until after
2458d2058105SJustin T. Gibbs 			 * the dnode_rele() below.
2459cd485b49SJustin T. Gibbs 			 */
2460cd485b49SJustin T. Gibbs 			DB_DNODE_EXIT(db);
2461cd485b49SJustin T. Gibbs
2462cd485b49SJustin T. Gibbs 			/*
2463cd485b49SJustin T. Gibbs 			 * Do not reference db after its lock is dropped.
2464cd485b49SJustin T. Gibbs 			 * Another thread may evict it.
2465cd485b49SJustin T. Gibbs 			 */
2466ea8dc4b6Seschrock 			mutex_exit(&db->db_mtx);
2467744947dcSTom Erickson
2468d2058105SJustin T. Gibbs 			if (evict_dbuf)
2469cd485b49SJustin T. Gibbs 				dnode_evict_bonus(dn);
2470d2058105SJustin T. Gibbs
2471d2058105SJustin T. Gibbs 			dnode_rele(dn, db);
2472ea8dc4b6Seschrock 		} else if (db->db_buf == NULL) {
2473ea8dc4b6Seschrock 			/*
2474ea8dc4b6Seschrock 			 * This is a special case: we never associated this
2475ea8dc4b6Seschrock 			 * dbuf with any data allocated from the ARC.
2476ea8dc4b6Seschrock 			 */
247782c9918fSTim Haley 			ASSERT(db->db_state == DB_UNCACHED ||
247782c9918fSTim Haley 			    db->db_state == DB_NOFILL);
2479ea8dc4b6Seschrock 			dbuf_evict(db);
2480ea8dc4b6Seschrock 		} else if (arc_released(db->db_buf)) {
2481ea8dc4b6Seschrock 			arc_buf_t *buf = db->db_buf;
2482ea8dc4b6Seschrock 			/*
2483ea8dc4b6Seschrock 			 * This dbuf has anonymous data associated with it.
2484ea8dc4b6Seschrock 			 */
2485bc9014e6SJustin Gibbs 			dbuf_clear_data(db);
24863b2aab18SMatthew Ahrens 			VERIFY(arc_buf_remove_ref(buf, db));
2487ea8dc4b6Seschrock 			dbuf_evict(db);
2488ea8dc4b6Seschrock 		} else {
24893b2aab18SMatthew Ahrens 			VERIFY(!arc_buf_remove_ref(db->db_buf, db));
24909253d63dSGeorge Wilson
24919253d63dSGeorge Wilson 			/*
24929253d63dSGeorge Wilson 			 * A dbuf will be eligible for eviction if either the
24939253d63dSGeorge Wilson 			 * 'primarycache' property is set or a duplicate
24949253d63dSGeorge Wilson 			 * copy of this buffer is already cached in the arc.
24959253d63dSGeorge Wilson 			 *
24969253d63dSGeorge Wilson 			 * In the case of the 'primarycache' a buffer
24979253d63dSGeorge Wilson 			 * is considered for eviction if it matches the
24989253d63dSGeorge Wilson 			 * criteria set in the property.
24999253d63dSGeorge Wilson 			 *
25009253d63dSGeorge Wilson 			 * To decide if our buffer is considered a
25019253d63dSGeorge Wilson 			 * duplicate, we must call into the arc to determine
25029253d63dSGeorge Wilson 			 * if multiple buffers are referencing the same
25039253d63dSGeorge Wilson 			 * block on-disk. If so, then we simply evict
25049253d63dSGeorge Wilson 			 * ourselves.
25059253d63dSGeorge Wilson 			 */
2506bbfa8ea8SMatthew Ahrens 			if (!DBUF_IS_CACHEABLE(db)) {
2507bbfa8ea8SMatthew Ahrens 				if (db->db_blkptr != NULL &&
2508bbfa8ea8SMatthew Ahrens 				    !BP_IS_HOLE(db->db_blkptr) &&
2509bbfa8ea8SMatthew Ahrens 				    !BP_IS_EMBEDDED(db->db_blkptr)) {
2510bbfa8ea8SMatthew Ahrens 					spa_t *spa =
2511bbfa8ea8SMatthew Ahrens 					    dmu_objset_spa(db->db_objset);
					/* Copy the bp: db is gone after dbuf_clear(). */
2512bbfa8ea8SMatthew Ahrens 					blkptr_t bp = *db->db_blkptr;
25133baa08fcSek110237 					dbuf_clear(db);
2514bbfa8ea8SMatthew Ahrens 					arc_freed(spa, &bp);
2515bbfa8ea8SMatthew Ahrens 				} else {
2516bbfa8ea8SMatthew Ahrens 					dbuf_clear(db);
2517bbfa8ea8SMatthew Ahrens 				}
2518d2058105SJustin T. Gibbs 			} else if (db->db_pending_evict ||
2519bc9014e6SJustin Gibbs 			    arc_buf_eviction_needed(db->db_buf)) {
2520bbfa8ea8SMatthew Ahrens 				dbuf_clear(db);
2521bbfa8ea8SMatthew Ahrens 			} else {
2522fa9e4066Sahrens 				mutex_exit(&db->db_mtx);
2523fa9e4066Sahrens 			}
2524bbfa8ea8SMatthew Ahrens 		}
2525ea8dc4b6Seschrock 	} else {
2526ea8dc4b6Seschrock 		mutex_exit(&db->db_mtx);
2527fa9e4066Sahrens 	}
2528fa9e4066Sahrens }
2529fa9e4066Sahrens
/* dbuf_refcount(): current number of holds on the dbuf. */
2530fa9e4066Sahrens #pragma weak dmu_buf_refcount = dbuf_refcount
2531fa9e4066Sahrens uint64_t
2532fa9e4066Sahrens dbuf_refcount(dmu_buf_impl_t *db)
2533fa9e4066Sahrens {
2534fa9e4066Sahrens 	return (refcount_count(&db->db_holds));
2535fa9e4066Sahrens }
2536fa9e4066Sahrens
/*
 * dmu_buf_replace_user(): atomically (under db_mtx) swap the dbuf's user
 * record from old_user to new_user, but only if the current user is
 * old_user.  Returns the user that was in place before the call — equal
 * to old_user on success, the actual current user on failure.
 */
2537fa9e4066Sahrens void *
2538bc9014e6SJustin Gibbs dmu_buf_replace_user(dmu_buf_t *db_fake, dmu_buf_user_t *old_user,
2539bc9014e6SJustin Gibbs     dmu_buf_user_t *new_user)
2540fa9e4066Sahrens {
2541bc9014e6SJustin Gibbs 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
2542bc9014e6SJustin Gibbs
2543bc9014e6SJustin Gibbs 	mutex_enter(&db->db_mtx);
2544bc9014e6SJustin Gibbs 	dbuf_verify_user(db, DBVU_NOT_EVICTING);
2545bc9014e6SJustin Gibbs 	if (db->db_user == old_user)
2546bc9014e6SJustin Gibbs 		db->db_user = new_user;
2547bc9014e6SJustin Gibbs 	else
2548bc9014e6SJustin Gibbs 		old_user = db->db_user;
2549bc9014e6SJustin Gibbs 	dbuf_verify_user(db,
DBVU_NOT_EVICTING);
2550bc9014e6SJustin Gibbs 	mutex_exit(&db->db_mtx);
2551bc9014e6SJustin Gibbs
2552bc9014e6SJustin Gibbs 	return (old_user);
2553fa9e4066Sahrens }
2554fa9e4066Sahrens
/* dmu_buf_set_user(): install a user record iff none is set (CAS on NULL). */
2555fa9e4066Sahrens void *
2556bc9014e6SJustin Gibbs dmu_buf_set_user(dmu_buf_t *db_fake, dmu_buf_user_t *user)
2557bc9014e6SJustin Gibbs {
2558bc9014e6SJustin Gibbs 	return (dmu_buf_replace_user(db_fake, NULL, user));
2559bc9014e6SJustin Gibbs }
2560bc9014e6SJustin Gibbs
/*
 * dmu_buf_set_user_ie(): like dmu_buf_set_user(), but also marks the
 * dbuf for immediate user eviction when its last hold is dropped.
 */
2561bc9014e6SJustin Gibbs void *
2562bc9014e6SJustin Gibbs dmu_buf_set_user_ie(dmu_buf_t *db_fake, dmu_buf_user_t *user)
2563fa9e4066Sahrens {
2564fa9e4066Sahrens 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
2565fa9e4066Sahrens
2566d2058105SJustin T. Gibbs 	db->db_user_immediate_evict = TRUE;
2567bc9014e6SJustin Gibbs 	return (dmu_buf_set_user(db_fake, user));
2568fa9e4066Sahrens }
2569fa9e4066Sahrens
/* dmu_buf_remove_user(): clear the user record iff it is 'user'. */
2570fa9e4066Sahrens void *
2571bc9014e6SJustin Gibbs dmu_buf_remove_user(dmu_buf_t *db_fake, dmu_buf_user_t *user)
2572fa9e4066Sahrens {
2573bc9014e6SJustin Gibbs 	return (dmu_buf_replace_user(db_fake, user, NULL));
2574fa9e4066Sahrens }
2575fa9e4066Sahrens
/* dmu_buf_get_user(): read the current user record (unlocked read). */
2576fa9e4066Sahrens void *
2577fa9e4066Sahrens dmu_buf_get_user(dmu_buf_t *db_fake)
2578fa9e4066Sahrens {
2579fa9e4066Sahrens 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
2580fa9e4066Sahrens
2581bc9014e6SJustin Gibbs 	dbuf_verify_user(db, DBVU_NOT_EVICTING);
2582bc9014e6SJustin Gibbs 	return (db->db_user);
2583bc9014e6SJustin Gibbs }
2584bc9014e6SJustin Gibbs
/* dmu_buf_user_evict_wait(): drain the user-eviction taskq. */
2585bc9014e6SJustin Gibbs void
2586bc9014e6SJustin Gibbs dmu_buf_user_evict_wait()
2587bc9014e6SJustin Gibbs {
2588bc9014e6SJustin Gibbs 	taskq_wait(dbu_evict_taskq);
2589fa9e4066Sahrens }
2590fa9e4066Sahrens
/*
 * dmu_buf_freeable(): whether this dbuf's on-disk block could be freed
 * (i.e. it has a block pointer whose birth txg the dataset considers
 * freeable).  B_FALSE when the dbuf has no block pointer.
 */
25913d692628SSanjeev Bagewadi boolean_t
25923d692628SSanjeev Bagewadi dmu_buf_freeable(dmu_buf_t *dbuf)
25933d692628SSanjeev Bagewadi {
25943d692628SSanjeev Bagewadi 	boolean_t res = B_FALSE;
25953d692628SSanjeev Bagewadi 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf;
25963d692628SSanjeev Bagewadi
25973d692628SSanjeev Bagewadi 	if (db->db_blkptr)
25983d692628SSanjeev Bagewadi 		res = dsl_dataset_block_freeable(db->db_objset->os_dsl_dataset,
2599c7cd2421SGeorge Wilson 		    db->db_blkptr, db->db_blkptr->blk_birth);
26003d692628SSanjeev Bagewadi
26013d692628SSanjeev Bagewadi 	return (res);
26023d692628SSanjeev Bagewadi }
26033d692628SSanjeev Bagewadi
/* dmu_buf_get_blkptr(): accessor for the dbuf's block pointer (may be NULL). */
260480901aeaSGeorge Wilson blkptr_t *
260580901aeaSGeorge Wilson dmu_buf_get_blkptr(dmu_buf_t *db)
260680901aeaSGeorge Wilson {
260780901aeaSGeorge Wilson 	dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db;
260880901aeaSGeorge Wilson 	return (dbi->db_blkptr);
260980901aeaSGeorge Wilson }
261080901aeaSGeorge Wilson
/*
 * dbuf_check_blkptr(): ensure db->db_blkptr points at the correct slot —
 * the dnode's spill blkptr, one of the dnode's embedded blkptrs (when db
 * is at the top indirection level), or a slot inside the parent indirect
 * block's data (holding the parent if it was not yet attached).
 * Called with db_mtx held; may drop and re-take it to hold the parent.
 */
2611c717a561Smaybee static void
2612c717a561Smaybee dbuf_check_blkptr(dnode_t *dn, dmu_buf_impl_t *db)
2613fa9e4066Sahrens {
2614c717a561Smaybee 	/* ASSERT(dmu_tx_is_syncing(tx) */
2615c717a561Smaybee 	ASSERT(MUTEX_HELD(&db->db_mtx));
2616c717a561Smaybee
2617c717a561Smaybee 	if (db->db_blkptr != NULL)
2618c717a561Smaybee 		return;
2619c717a561Smaybee
26200a586ceaSMark Shellenbaum 	if (db->db_blkid == DMU_SPILL_BLKID) {
26210a586ceaSMark Shellenbaum 		db->db_blkptr = &dn->dn_phys->dn_spill;
26220a586ceaSMark Shellenbaum 		BP_ZERO(db->db_blkptr);
26230a586ceaSMark Shellenbaum 		return;
26240a586ceaSMark Shellenbaum 	}
2625c717a561Smaybee 	if (db->db_level == dn->dn_phys->dn_nlevels-1) {
2626c717a561Smaybee 		/*
2627c717a561Smaybee 		 * This buffer was allocated at a time when there was
2628c717a561Smaybee 		 * no available blkptrs from the dnode, or it was
2629c717a561Smaybee 		 * inappropriate to hook it in (i.e., nlevels mis-match).
2630c717a561Smaybee 		 */
2631c717a561Smaybee 		ASSERT(db->db_blkid < dn->dn_phys->dn_nblkptr);
2632c717a561Smaybee 		ASSERT(db->db_parent == NULL);
2633c717a561Smaybee 		db->db_parent = dn->dn_dbuf;
2634c717a561Smaybee 		db->db_blkptr = &dn->dn_phys->dn_blkptr[db->db_blkid];
2635c717a561Smaybee 		DBUF_VERIFY(db);
2636c717a561Smaybee 	} else {
2637c717a561Smaybee 		dmu_buf_impl_t *parent = db->db_parent;
2638c717a561Smaybee 		int epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
2639c717a561Smaybee
2640c717a561Smaybee 		ASSERT(dn->dn_phys->dn_nlevels > 1);
2641c717a561Smaybee 		if (parent == NULL) {
2642c717a561Smaybee 			mutex_exit(&db->db_mtx);
2643c717a561Smaybee 			rw_enter(&dn->dn_struct_rwlock, RW_READER);
2644a2cdcdd2SPaul Dagnelie 			parent = dbuf_hold_level(dn, db->db_level + 1,
2645a2cdcdd2SPaul Dagnelie 			    db->db_blkid >> epbs, db);
2646c717a561Smaybee 			rw_exit(&dn->dn_struct_rwlock);
2647c717a561Smaybee 			mutex_enter(&db->db_mtx);
2648c717a561Smaybee 			db->db_parent = parent;
2649c717a561Smaybee 		}
2650c717a561Smaybee 		db->db_blkptr = (blkptr_t *)parent->db.db_data +
2651c717a561Smaybee 		    (db->db_blkid & ((1ULL << epbs) - 1));
2652c717a561Smaybee 		DBUF_VERIFY(db);
2653c717a561Smaybee 	}
2654c717a561Smaybee }
2655c717a561Smaybee
2656c717a561Smaybee static void
2657c717a561Smaybee dbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
2658c717a561Smaybee {
2659c717a561Smaybee 	dmu_buf_impl_t *db = dr->dr_dbuf;
2660744947dcSTom Erickson 	dnode_t *dn;
2661c717a561Smaybee 	zio_t *zio;
2662c717a561Smaybee
2663c717a561Smaybee 	ASSERT(dmu_tx_is_syncing(tx));
2664c717a561Smaybee
2665c717a561Smaybee 	dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr);
2666c717a561Smaybee
2667c717a561Smaybee 	mutex_enter(&db->db_mtx);
2668c717a561Smaybee
2669c717a561Smaybee 	ASSERT(db->db_level > 0);
2670c717a561Smaybee 	DBUF_VERIFY(db);
2671c717a561Smaybee
26723e30c24aSWill Andrews 	/* Read the block if it hasn't been read yet.
*/ 2673c717a561Smaybee if (db->db_buf == NULL) { 2674c717a561Smaybee mutex_exit(&db->db_mtx); 2675c717a561Smaybee (void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED); 2676c717a561Smaybee mutex_enter(&db->db_mtx); 2677c717a561Smaybee } 2678c717a561Smaybee ASSERT3U(db->db_state, ==, DB_CACHED); 2679c717a561Smaybee ASSERT(db->db_buf != NULL); 2680c717a561Smaybee 2681744947dcSTom Erickson DB_DNODE_ENTER(db); 2682744947dcSTom Erickson dn = DB_DNODE(db); 26833e30c24aSWill Andrews /* Indirect block size must match what the dnode thinks it is. */ 2684744947dcSTom Erickson ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift); 2685c717a561Smaybee dbuf_check_blkptr(dn, db); 2686744947dcSTom Erickson DB_DNODE_EXIT(db); 2687c717a561Smaybee 26883e30c24aSWill Andrews /* Provide the pending dirty record to child dbufs */ 2689c717a561Smaybee db->db_data_pending = dr; 2690c717a561Smaybee 2691af2c4821Smaybee mutex_exit(&db->db_mtx); 2692088f3894Sahrens dbuf_write(dr, db->db_buf, tx); 2693c717a561Smaybee 2694c717a561Smaybee zio = dr->dr_zio; 2695c717a561Smaybee mutex_enter(&dr->dt.di.dr_mtx); 269646e1baa6SMatthew Ahrens dbuf_sync_list(&dr->dt.di.dr_children, db->db_level - 1, tx); 2697c717a561Smaybee ASSERT(list_head(&dr->dt.di.dr_children) == NULL); 2698c717a561Smaybee mutex_exit(&dr->dt.di.dr_mtx); 2699c717a561Smaybee zio_nowait(zio); 2700c717a561Smaybee } 2701c717a561Smaybee 2702c717a561Smaybee static void 2703c717a561Smaybee dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx) 2704c717a561Smaybee { 2705c717a561Smaybee arc_buf_t **datap = &dr->dt.dl.dr_data; 2706c717a561Smaybee dmu_buf_impl_t *db = dr->dr_dbuf; 2707744947dcSTom Erickson dnode_t *dn; 2708744947dcSTom Erickson objset_t *os; 2709c717a561Smaybee uint64_t txg = tx->tx_txg; 2710fa9e4066Sahrens 2711fa9e4066Sahrens ASSERT(dmu_tx_is_syncing(tx)); 2712fa9e4066Sahrens 2713fa9e4066Sahrens dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr); 2714fa9e4066Sahrens 2715fa9e4066Sahrens mutex_enter(&db->db_mtx); 
2716fa9e4066Sahrens /* 2717fa9e4066Sahrens * To be synced, we must be dirtied. But we 2718fa9e4066Sahrens * might have been freed after the dirty. 2719fa9e4066Sahrens */ 2720fa9e4066Sahrens if (db->db_state == DB_UNCACHED) { 2721fa9e4066Sahrens /* This buffer has been freed since it was dirtied */ 2722fa9e4066Sahrens ASSERT(db->db.db_data == NULL); 2723fa9e4066Sahrens } else if (db->db_state == DB_FILL) { 2724fa9e4066Sahrens /* This buffer was freed and is now being re-filled */ 2725c717a561Smaybee ASSERT(db->db.db_data != dr->dt.dl.dr_data); 2726fa9e4066Sahrens } else { 272782c9918fSTim Haley ASSERT(db->db_state == DB_CACHED || db->db_state == DB_NOFILL); 2728fa9e4066Sahrens } 27299c9dc39aSek110237 DBUF_VERIFY(db); 2730fa9e4066Sahrens 2731744947dcSTom Erickson DB_DNODE_ENTER(db); 2732744947dcSTom Erickson dn = DB_DNODE(db); 2733744947dcSTom Erickson 27340a586ceaSMark Shellenbaum if (db->db_blkid == DMU_SPILL_BLKID) { 27350a586ceaSMark Shellenbaum mutex_enter(&dn->dn_mtx); 27360a586ceaSMark Shellenbaum dn->dn_phys->dn_flags |= DNODE_FLAG_SPILL_BLKPTR; 27370a586ceaSMark Shellenbaum mutex_exit(&dn->dn_mtx); 27380a586ceaSMark Shellenbaum } 27390a586ceaSMark Shellenbaum 2740fa9e4066Sahrens /* 2741c717a561Smaybee * If this is a bonus buffer, simply copy the bonus data into the 2742c717a561Smaybee * dnode. It will be written out when the dnode is synced (and it 2743c717a561Smaybee * will be synced, since it must have been dirty for dbuf_sync to 2744c717a561Smaybee * be called). 
2745fa9e4066Sahrens */ 27460a586ceaSMark Shellenbaum if (db->db_blkid == DMU_BONUS_BLKID) { 2747c717a561Smaybee dbuf_dirty_record_t **drp; 27481934e92fSmaybee 2749ea8dc4b6Seschrock ASSERT(*datap != NULL); 2750fb09f5aaSMadhav Suresh ASSERT0(db->db_level); 2751ea8dc4b6Seschrock ASSERT3U(dn->dn_phys->dn_bonuslen, <=, DN_MAX_BONUSLEN); 2752ea8dc4b6Seschrock bcopy(*datap, DN_BONUS(dn->dn_phys), dn->dn_phys->dn_bonuslen); 2753744947dcSTom Erickson DB_DNODE_EXIT(db); 2754744947dcSTom Erickson 27550e8c6158Smaybee if (*datap != db->db.db_data) { 2756ea8dc4b6Seschrock zio_buf_free(*datap, DN_MAX_BONUSLEN); 27575a98e54bSBrendan Gregg - Sun Microsystems arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER); 27580e8c6158Smaybee } 2759ea8dc4b6Seschrock db->db_data_pending = NULL; 2760c717a561Smaybee drp = &db->db_last_dirty; 2761c717a561Smaybee while (*drp != dr) 2762c717a561Smaybee drp = &(*drp)->dr_next; 276317f17c2dSbonwick ASSERT(dr->dr_next == NULL); 2764b24ab676SJeff Bonwick ASSERT(dr->dr_dbuf == db); 276517f17c2dSbonwick *drp = dr->dr_next; 2766c717a561Smaybee kmem_free(dr, sizeof (dbuf_dirty_record_t)); 2767ea8dc4b6Seschrock ASSERT(db->db_dirtycnt > 0); 2768ea8dc4b6Seschrock db->db_dirtycnt -= 1; 2769b24ab676SJeff Bonwick dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg); 2770ea8dc4b6Seschrock return; 2771ea8dc4b6Seschrock } 2772ea8dc4b6Seschrock 2773744947dcSTom Erickson os = dn->dn_objset; 2774744947dcSTom Erickson 2775c5c6ffa0Smaybee /* 2776f82bfe17Sgw25295 * This function may have dropped the db_mtx lock allowing a dmu_sync 2777f82bfe17Sgw25295 * operation to sneak in. As a result, we need to ensure that we 2778f82bfe17Sgw25295 * don't check the dr_override_state until we have returned from 2779f82bfe17Sgw25295 * dbuf_check_blkptr. 
2780f82bfe17Sgw25295 */ 2781f82bfe17Sgw25295 dbuf_check_blkptr(dn, db); 2782f82bfe17Sgw25295 2783f82bfe17Sgw25295 /* 2784744947dcSTom Erickson * If this buffer is in the middle of an immediate write, 2785c717a561Smaybee * wait for the synchronous IO to complete. 2786c5c6ffa0Smaybee */ 2787c717a561Smaybee while (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC) { 2788c5c6ffa0Smaybee ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT); 2789c5c6ffa0Smaybee cv_wait(&db->db_changed, &db->db_mtx); 2790c717a561Smaybee ASSERT(dr->dt.dl.dr_override_state != DR_NOT_OVERRIDDEN); 2791c5c6ffa0Smaybee } 2792c717a561Smaybee 2793ab69d62fSMatthew Ahrens if (db->db_state != DB_NOFILL && 2794ab69d62fSMatthew Ahrens dn->dn_object != DMU_META_DNODE_OBJECT && 2795ab69d62fSMatthew Ahrens refcount_count(&db->db_holds) > 1 && 2796b24ab676SJeff Bonwick dr->dt.dl.dr_override_state != DR_OVERRIDDEN && 2797ab69d62fSMatthew Ahrens *datap == db->db_buf) { 2798fa9e4066Sahrens /* 279982c9918fSTim Haley * If this buffer is currently "in use" (i.e., there 280082c9918fSTim Haley * are active holds and db_data still references it), 280182c9918fSTim Haley * then make a copy before we start the write so that 280282c9918fSTim Haley * any modifications from the open txg will not leak 280382c9918fSTim Haley * into this write. 2804fa9e4066Sahrens * 280582c9918fSTim Haley * NOTE: this copy does not need to be made for 280682c9918fSTim Haley * objects only modified in the syncing context (e.g. 280782c9918fSTim Haley * DNONE_DNODE blocks). 
2808fa9e4066Sahrens */ 2809ab69d62fSMatthew Ahrens int blksz = arc_buf_size(*datap); 2810ab69d62fSMatthew Ahrens arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); 2811ab69d62fSMatthew Ahrens *datap = arc_buf_alloc(os->os_spa, blksz, db, type); 2812c717a561Smaybee bcopy(db->db.db_data, (*datap)->b_data, blksz); 2813fa9e4066Sahrens } 2814c717a561Smaybee db->db_data_pending = dr; 2815fa9e4066Sahrens 2816fa9e4066Sahrens mutex_exit(&db->db_mtx); 2817fa9e4066Sahrens 2818088f3894Sahrens dbuf_write(dr, *datap, tx); 2819c717a561Smaybee 2820c717a561Smaybee ASSERT(!list_link_active(&dr->dr_dirty_node)); 2821744947dcSTom Erickson if (dn->dn_object == DMU_META_DNODE_OBJECT) { 2822c717a561Smaybee list_insert_tail(&dn->dn_dirty_records[txg&TXG_MASK], dr); 2823744947dcSTom Erickson DB_DNODE_EXIT(db); 2824744947dcSTom Erickson } else { 2825744947dcSTom Erickson /* 2826744947dcSTom Erickson * Although zio_nowait() does not "wait for an IO", it does 2827744947dcSTom Erickson * initiate the IO. If this is an empty write it seems plausible 2828744947dcSTom Erickson * that the IO could actually be completed before the nowait 2829744947dcSTom Erickson * returns. We need to DB_DNODE_EXIT() first in case 2830744947dcSTom Erickson * zio_nowait() invalidates the dbuf. 2831744947dcSTom Erickson */ 2832744947dcSTom Erickson DB_DNODE_EXIT(db); 2833c717a561Smaybee zio_nowait(dr->dr_zio); 2834fa9e4066Sahrens } 2835744947dcSTom Erickson } 2836c717a561Smaybee 2837c717a561Smaybee void 283846e1baa6SMatthew Ahrens dbuf_sync_list(list_t *list, int level, dmu_tx_t *tx) 2839c717a561Smaybee { 2840c717a561Smaybee dbuf_dirty_record_t *dr; 2841c717a561Smaybee 2842c717a561Smaybee while (dr = list_head(list)) { 2843c717a561Smaybee if (dr->dr_zio != NULL) { 2844c717a561Smaybee /* 2845c717a561Smaybee * If we find an already initialized zio then we 2846c717a561Smaybee * are processing the meta-dnode, and we have finished. 
2847c717a561Smaybee * The dbufs for all dnodes are put back on the list 2848c717a561Smaybee * during processing, so that we can zio_wait() 2849c717a561Smaybee * these IOs after initiating all child IOs. 2850c717a561Smaybee */ 2851c717a561Smaybee ASSERT3U(dr->dr_dbuf->db.db_object, ==, 2852c717a561Smaybee DMU_META_DNODE_OBJECT); 2853c717a561Smaybee break; 2854fa9e4066Sahrens } 285546e1baa6SMatthew Ahrens if (dr->dr_dbuf->db_blkid != DMU_BONUS_BLKID && 285646e1baa6SMatthew Ahrens dr->dr_dbuf->db_blkid != DMU_SPILL_BLKID) { 285746e1baa6SMatthew Ahrens VERIFY3U(dr->dr_dbuf->db_level, ==, level); 285846e1baa6SMatthew Ahrens } 2859c717a561Smaybee list_remove(list, dr); 2860c717a561Smaybee if (dr->dr_dbuf->db_level > 0) 2861c717a561Smaybee dbuf_sync_indirect(dr, tx); 2862c717a561Smaybee else 2863c717a561Smaybee dbuf_sync_leaf(dr, tx); 2864c717a561Smaybee } 2865c717a561Smaybee } 2866c717a561Smaybee 2867fa9e4066Sahrens /* ARGSUSED */ 2868fa9e4066Sahrens static void 2869c717a561Smaybee dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb) 2870fa9e4066Sahrens { 2871fa9e4066Sahrens dmu_buf_impl_t *db = vdb; 2872744947dcSTom Erickson dnode_t *dn; 2873e14bb325SJeff Bonwick blkptr_t *bp = zio->io_bp; 2874c717a561Smaybee blkptr_t *bp_orig = &zio->io_bp_orig; 2875b24ab676SJeff Bonwick spa_t *spa = zio->io_spa; 2876b24ab676SJeff Bonwick int64_t delta; 2877fa9e4066Sahrens uint64_t fill = 0; 2878b24ab676SJeff Bonwick int i; 2879fa9e4066Sahrens 28805d7b4d43SMatthew Ahrens ASSERT3P(db->db_blkptr, ==, bp); 2881e14bb325SJeff Bonwick 2882744947dcSTom Erickson DB_DNODE_ENTER(db); 2883744947dcSTom Erickson dn = DB_DNODE(db); 2884b24ab676SJeff Bonwick delta = bp_get_dsize_sync(spa, bp) - bp_get_dsize_sync(spa, bp_orig); 2885b24ab676SJeff Bonwick dnode_diduse_space(dn, delta - zio->io_prev_space_delta); 2886b24ab676SJeff Bonwick zio->io_prev_space_delta = delta; 2887fa9e4066Sahrens 288843466aaeSMax Grossman if (bp->blk_birth != 0) { 28890a586ceaSMark Shellenbaum ASSERT((db->db_blkid != 
DMU_SPILL_BLKID && 28900a586ceaSMark Shellenbaum BP_GET_TYPE(bp) == dn->dn_type) || 28910a586ceaSMark Shellenbaum (db->db_blkid == DMU_SPILL_BLKID && 28925d7b4d43SMatthew Ahrens BP_GET_TYPE(bp) == dn->dn_bonustype) || 28935d7b4d43SMatthew Ahrens BP_IS_EMBEDDED(bp)); 2894e14bb325SJeff Bonwick ASSERT(BP_GET_LEVEL(bp) == db->db_level); 289543466aaeSMax Grossman } 2896e14bb325SJeff Bonwick 2897fa9e4066Sahrens mutex_enter(&db->db_mtx); 2898fa9e4066Sahrens 28990a586ceaSMark Shellenbaum #ifdef ZFS_DEBUG 29000a586ceaSMark Shellenbaum if (db->db_blkid == DMU_SPILL_BLKID) { 29010a586ceaSMark Shellenbaum ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR); 29020a586ceaSMark Shellenbaum ASSERT(!(BP_IS_HOLE(db->db_blkptr)) && 29030a586ceaSMark Shellenbaum db->db_blkptr == &dn->dn_phys->dn_spill); 29040a586ceaSMark Shellenbaum } 29050a586ceaSMark Shellenbaum #endif 29060a586ceaSMark Shellenbaum 2907fa9e4066Sahrens if (db->db_level == 0) { 2908fa9e4066Sahrens mutex_enter(&dn->dn_mtx); 29090a586ceaSMark Shellenbaum if (db->db_blkid > dn->dn_phys->dn_maxblkid && 29100a586ceaSMark Shellenbaum db->db_blkid != DMU_SPILL_BLKID) 2911fa9e4066Sahrens dn->dn_phys->dn_maxblkid = db->db_blkid; 2912fa9e4066Sahrens mutex_exit(&dn->dn_mtx); 2913fa9e4066Sahrens 2914fa9e4066Sahrens if (dn->dn_type == DMU_OT_DNODE) { 2915fa9e4066Sahrens dnode_phys_t *dnp = db->db.db_data; 2916fa9e4066Sahrens for (i = db->db.db_size >> DNODE_SHIFT; i > 0; 2917fa9e4066Sahrens i--, dnp++) { 2918fa9e4066Sahrens if (dnp->dn_type != DMU_OT_NONE) 2919fa9e4066Sahrens fill++; 2920fa9e4066Sahrens } 2921fa9e4066Sahrens } else { 292243466aaeSMax Grossman if (BP_IS_HOLE(bp)) { 292343466aaeSMax Grossman fill = 0; 292443466aaeSMax Grossman } else { 2925fa9e4066Sahrens fill = 1; 2926fa9e4066Sahrens } 292743466aaeSMax Grossman } 2928fa9e4066Sahrens } else { 2929e14bb325SJeff Bonwick blkptr_t *ibp = db->db.db_data; 2930fa9e4066Sahrens ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift); 2931e14bb325SJeff Bonwick for (i 
= db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, ibp++) { 2932e14bb325SJeff Bonwick if (BP_IS_HOLE(ibp)) 2933fa9e4066Sahrens continue; 29345d7b4d43SMatthew Ahrens fill += BP_GET_FILL(ibp); 2935fa9e4066Sahrens } 2936fa9e4066Sahrens } 2937744947dcSTom Erickson DB_DNODE_EXIT(db); 2938fa9e4066Sahrens 29395d7b4d43SMatthew Ahrens if (!BP_IS_EMBEDDED(bp)) 2940e14bb325SJeff Bonwick bp->blk_fill = fill; 2941fa9e4066Sahrens 2942fa9e4066Sahrens mutex_exit(&db->db_mtx); 2943fa9e4066Sahrens } 2944fa9e4066Sahrens 294569962b56SMatthew Ahrens /* 294669962b56SMatthew Ahrens * The SPA will call this callback several times for each zio - once 294769962b56SMatthew Ahrens * for every physical child i/o (zio->io_phys_children times). This 294869962b56SMatthew Ahrens * allows the DMU to monitor the progress of each logical i/o. For example, 294969962b56SMatthew Ahrens * there may be 2 copies of an indirect block, or many fragments of a RAID-Z 295069962b56SMatthew Ahrens * block. There may be a long delay before all copies/fragments are completed, 295169962b56SMatthew Ahrens * so this callback allows us to retire dirty space gradually, as the physical 295269962b56SMatthew Ahrens * i/os complete. 295369962b56SMatthew Ahrens */ 295469962b56SMatthew Ahrens /* ARGSUSED */ 295569962b56SMatthew Ahrens static void 295669962b56SMatthew Ahrens dbuf_write_physdone(zio_t *zio, arc_buf_t *buf, void *arg) 295769962b56SMatthew Ahrens { 295869962b56SMatthew Ahrens dmu_buf_impl_t *db = arg; 295969962b56SMatthew Ahrens objset_t *os = db->db_objset; 296069962b56SMatthew Ahrens dsl_pool_t *dp = dmu_objset_pool(os); 296169962b56SMatthew Ahrens dbuf_dirty_record_t *dr; 296269962b56SMatthew Ahrens int delta = 0; 296369962b56SMatthew Ahrens 296469962b56SMatthew Ahrens dr = db->db_data_pending; 296569962b56SMatthew Ahrens ASSERT3U(dr->dr_txg, ==, zio->io_txg); 296669962b56SMatthew Ahrens 296769962b56SMatthew Ahrens /* 296869962b56SMatthew Ahrens * The callback will be called io_phys_children times. 
Retire one 296969962b56SMatthew Ahrens * portion of our dirty space each time we are called. Any rounding 297069962b56SMatthew Ahrens * error will be cleaned up by dsl_pool_sync()'s call to 297169962b56SMatthew Ahrens * dsl_pool_undirty_space(). 297269962b56SMatthew Ahrens */ 297369962b56SMatthew Ahrens delta = dr->dr_accounted / zio->io_phys_children; 297469962b56SMatthew Ahrens dsl_pool_undirty_space(dp, delta, zio->io_txg); 297569962b56SMatthew Ahrens } 297669962b56SMatthew Ahrens 2977c717a561Smaybee /* ARGSUSED */ 2978c717a561Smaybee static void 2979c717a561Smaybee dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb) 2980c717a561Smaybee { 2981c717a561Smaybee dmu_buf_impl_t *db = vdb; 2982b24ab676SJeff Bonwick blkptr_t *bp_orig = &zio->io_bp_orig; 298343466aaeSMax Grossman blkptr_t *bp = db->db_blkptr; 298443466aaeSMax Grossman objset_t *os = db->db_objset; 298543466aaeSMax Grossman dmu_tx_t *tx = os->os_synctx; 2986c717a561Smaybee dbuf_dirty_record_t **drp, *dr; 2987c717a561Smaybee 2988fb09f5aaSMadhav Suresh ASSERT0(zio->io_error); 2989b24ab676SJeff Bonwick ASSERT(db->db_blkptr == bp); 2990b24ab676SJeff Bonwick 299180901aeaSGeorge Wilson /* 299280901aeaSGeorge Wilson * For nopwrites and rewrites we ensure that the bp matches our 299380901aeaSGeorge Wilson * original and bypass all the accounting. 
299480901aeaSGeorge Wilson */ 299580901aeaSGeorge Wilson if (zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE)) { 2996b24ab676SJeff Bonwick ASSERT(BP_EQUAL(bp, bp_orig)); 2997b24ab676SJeff Bonwick } else { 299843466aaeSMax Grossman dsl_dataset_t *ds = os->os_dsl_dataset; 2999b24ab676SJeff Bonwick (void) dsl_dataset_block_kill(ds, bp_orig, tx, B_TRUE); 3000b24ab676SJeff Bonwick dsl_dataset_block_born(ds, bp, tx); 3001b24ab676SJeff Bonwick } 3002c717a561Smaybee 3003c717a561Smaybee mutex_enter(&db->db_mtx); 3004c717a561Smaybee 3005b24ab676SJeff Bonwick DBUF_VERIFY(db); 3006b24ab676SJeff Bonwick 3007c717a561Smaybee drp = &db->db_last_dirty; 300817f17c2dSbonwick while ((dr = *drp) != db->db_data_pending) 300917f17c2dSbonwick drp = &dr->dr_next; 301017f17c2dSbonwick ASSERT(!list_link_active(&dr->dr_dirty_node)); 3011b24ab676SJeff Bonwick ASSERT(dr->dr_dbuf == db); 301217f17c2dSbonwick ASSERT(dr->dr_next == NULL); 301317f17c2dSbonwick *drp = dr->dr_next; 3014c717a561Smaybee 30150a586ceaSMark Shellenbaum #ifdef ZFS_DEBUG 30160a586ceaSMark Shellenbaum if (db->db_blkid == DMU_SPILL_BLKID) { 3017744947dcSTom Erickson dnode_t *dn; 3018744947dcSTom Erickson 3019744947dcSTom Erickson DB_DNODE_ENTER(db); 3020744947dcSTom Erickson dn = DB_DNODE(db); 30210a586ceaSMark Shellenbaum ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR); 30220a586ceaSMark Shellenbaum ASSERT(!(BP_IS_HOLE(db->db_blkptr)) && 30230a586ceaSMark Shellenbaum db->db_blkptr == &dn->dn_phys->dn_spill); 3024744947dcSTom Erickson DB_DNODE_EXIT(db); 30250a586ceaSMark Shellenbaum } 30260a586ceaSMark Shellenbaum #endif 30270a586ceaSMark Shellenbaum 3028c717a561Smaybee if (db->db_level == 0) { 30290a586ceaSMark Shellenbaum ASSERT(db->db_blkid != DMU_BONUS_BLKID); 3030c717a561Smaybee ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN); 303182c9918fSTim Haley if (db->db_state != DB_NOFILL) { 3032c717a561Smaybee if (dr->dt.dl.dr_data != db->db_buf) 303382c9918fSTim Haley 
VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data, 30343b2aab18SMatthew Ahrens db)); 3035b24ab676SJeff Bonwick else if (!arc_released(db->db_buf)) 3036c717a561Smaybee arc_set_callback(db->db_buf, dbuf_do_evict, db); 303782c9918fSTim Haley } 3038c717a561Smaybee } else { 3039744947dcSTom Erickson dnode_t *dn; 3040744947dcSTom Erickson 3041744947dcSTom Erickson DB_DNODE_ENTER(db); 3042744947dcSTom Erickson dn = DB_DNODE(db); 3043c717a561Smaybee ASSERT(list_head(&dr->dt.di.dr_children) == NULL); 3044c717a561Smaybee ASSERT3U(db->db.db_size, ==, 1 << dn->dn_phys->dn_indblkshift); 3045c717a561Smaybee if (!BP_IS_HOLE(db->db_blkptr)) { 3046c717a561Smaybee int epbs = 3047c717a561Smaybee dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; 304843466aaeSMax Grossman ASSERT3U(db->db_blkid, <=, 304943466aaeSMax Grossman dn->dn_phys->dn_maxblkid >> (db->db_level * epbs)); 3050c717a561Smaybee ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==, 3051c717a561Smaybee db->db.db_size); 30525d7b4d43SMatthew Ahrens if (!arc_released(db->db_buf)) 3053c717a561Smaybee arc_set_callback(db->db_buf, dbuf_do_evict, db); 3054c717a561Smaybee } 3055744947dcSTom Erickson DB_DNODE_EXIT(db); 3056c25056deSgw25295 mutex_destroy(&dr->dt.di.dr_mtx); 3057c25056deSgw25295 list_destroy(&dr->dt.di.dr_children); 3058c717a561Smaybee } 3059c717a561Smaybee kmem_free(dr, sizeof (dbuf_dirty_record_t)); 3060c717a561Smaybee 3061c717a561Smaybee cv_broadcast(&db->db_changed); 3062c717a561Smaybee ASSERT(db->db_dirtycnt > 0); 3063c717a561Smaybee db->db_dirtycnt -= 1; 3064c717a561Smaybee db->db_data_pending = NULL; 306543466aaeSMax Grossman dbuf_rele_and_unlock(db, (void *)(uintptr_t)tx->tx_txg); 3066b24ab676SJeff Bonwick } 3067b24ab676SJeff Bonwick 3068b24ab676SJeff Bonwick static void 3069b24ab676SJeff Bonwick dbuf_write_nofill_ready(zio_t *zio) 3070b24ab676SJeff Bonwick { 3071b24ab676SJeff Bonwick dbuf_write_ready(zio, NULL, zio->io_private); 3072b24ab676SJeff Bonwick } 3073b24ab676SJeff Bonwick 3074b24ab676SJeff Bonwick static void 
3075b24ab676SJeff Bonwick dbuf_write_nofill_done(zio_t *zio) 3076b24ab676SJeff Bonwick { 3077b24ab676SJeff Bonwick dbuf_write_done(zio, NULL, zio->io_private); 3078b24ab676SJeff Bonwick } 3079b24ab676SJeff Bonwick 3080b24ab676SJeff Bonwick static void 3081b24ab676SJeff Bonwick dbuf_write_override_ready(zio_t *zio) 3082b24ab676SJeff Bonwick { 3083b24ab676SJeff Bonwick dbuf_dirty_record_t *dr = zio->io_private; 3084b24ab676SJeff Bonwick dmu_buf_impl_t *db = dr->dr_dbuf; 3085b24ab676SJeff Bonwick 3086b24ab676SJeff Bonwick dbuf_write_ready(zio, NULL, db); 3087b24ab676SJeff Bonwick } 3088b24ab676SJeff Bonwick 3089b24ab676SJeff Bonwick static void 3090b24ab676SJeff Bonwick dbuf_write_override_done(zio_t *zio) 3091b24ab676SJeff Bonwick { 3092b24ab676SJeff Bonwick dbuf_dirty_record_t *dr = zio->io_private; 3093b24ab676SJeff Bonwick dmu_buf_impl_t *db = dr->dr_dbuf; 3094b24ab676SJeff Bonwick blkptr_t *obp = &dr->dt.dl.dr_overridden_by; 3095b24ab676SJeff Bonwick 3096b24ab676SJeff Bonwick mutex_enter(&db->db_mtx); 3097b24ab676SJeff Bonwick if (!BP_EQUAL(zio->io_bp, obp)) { 3098b24ab676SJeff Bonwick if (!BP_IS_HOLE(obp)) 3099b24ab676SJeff Bonwick dsl_free(spa_get_dsl(zio->io_spa), zio->io_txg, obp); 3100b24ab676SJeff Bonwick arc_release(dr->dt.dl.dr_data, db); 3101b24ab676SJeff Bonwick } 3102c717a561Smaybee mutex_exit(&db->db_mtx); 3103c717a561Smaybee 3104b24ab676SJeff Bonwick dbuf_write_done(zio, NULL, db); 3105b24ab676SJeff Bonwick } 3106c717a561Smaybee 31073e30c24aSWill Andrews /* Issue I/O to commit a dirty buffer to disk. 
*/ 3108b24ab676SJeff Bonwick static void 3109b24ab676SJeff Bonwick dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx) 3110b24ab676SJeff Bonwick { 3111b24ab676SJeff Bonwick dmu_buf_impl_t *db = dr->dr_dbuf; 3112744947dcSTom Erickson dnode_t *dn; 3113744947dcSTom Erickson objset_t *os; 3114b24ab676SJeff Bonwick dmu_buf_impl_t *parent = db->db_parent; 3115b24ab676SJeff Bonwick uint64_t txg = tx->tx_txg; 31167802d7bfSMatthew Ahrens zbookmark_phys_t zb; 3117b24ab676SJeff Bonwick zio_prop_t zp; 3118b24ab676SJeff Bonwick zio_t *zio; 31190a586ceaSMark Shellenbaum int wp_flag = 0; 3120b24ab676SJeff Bonwick 3121744947dcSTom Erickson DB_DNODE_ENTER(db); 3122744947dcSTom Erickson dn = DB_DNODE(db); 3123744947dcSTom Erickson os = dn->dn_objset; 3124744947dcSTom Erickson 3125b24ab676SJeff Bonwick if (db->db_state != DB_NOFILL) { 3126b24ab676SJeff Bonwick if (db->db_level > 0 || dn->dn_type == DMU_OT_DNODE) { 3127b24ab676SJeff Bonwick /* 3128b24ab676SJeff Bonwick * Private object buffers are released here rather 3129b24ab676SJeff Bonwick * than in dbuf_dirty() since they are only modified 3130b24ab676SJeff Bonwick * in the syncing context and we don't want the 3131b24ab676SJeff Bonwick * overhead of making multiple copies of the data. 3132b24ab676SJeff Bonwick */ 3133b24ab676SJeff Bonwick if (BP_IS_HOLE(db->db_blkptr)) { 3134b24ab676SJeff Bonwick arc_buf_thaw(data); 3135b24ab676SJeff Bonwick } else { 31363f9d6ad7SLin Ling dbuf_release_bp(db); 3137b24ab676SJeff Bonwick } 3138b24ab676SJeff Bonwick } 3139b24ab676SJeff Bonwick } 3140b24ab676SJeff Bonwick 3141b24ab676SJeff Bonwick if (parent != dn->dn_dbuf) { 31423e30c24aSWill Andrews /* Our parent is an indirect block. */ 31433e30c24aSWill Andrews /* We have a dirty parent that has been scheduled for write. */ 3144b24ab676SJeff Bonwick ASSERT(parent && parent->db_data_pending); 31453e30c24aSWill Andrews /* Our parent's buffer is one level closer to the dnode. 
*/ 3146b24ab676SJeff Bonwick ASSERT(db->db_level == parent->db_level-1); 31473e30c24aSWill Andrews /* 31483e30c24aSWill Andrews * We're about to modify our parent's db_data by modifying 31493e30c24aSWill Andrews * our block pointer, so the parent must be released. 31503e30c24aSWill Andrews */ 3151b24ab676SJeff Bonwick ASSERT(arc_released(parent->db_buf)); 3152b24ab676SJeff Bonwick zio = parent->db_data_pending->dr_zio; 3153b24ab676SJeff Bonwick } else { 31543e30c24aSWill Andrews /* Our parent is the dnode itself. */ 31550a586ceaSMark Shellenbaum ASSERT((db->db_level == dn->dn_phys->dn_nlevels-1 && 31560a586ceaSMark Shellenbaum db->db_blkid != DMU_SPILL_BLKID) || 31570a586ceaSMark Shellenbaum (db->db_blkid == DMU_SPILL_BLKID && db->db_level == 0)); 31580a586ceaSMark Shellenbaum if (db->db_blkid != DMU_SPILL_BLKID) 3159b24ab676SJeff Bonwick ASSERT3P(db->db_blkptr, ==, 3160b24ab676SJeff Bonwick &dn->dn_phys->dn_blkptr[db->db_blkid]); 3161b24ab676SJeff Bonwick zio = dn->dn_zio; 3162b24ab676SJeff Bonwick } 3163b24ab676SJeff Bonwick 3164b24ab676SJeff Bonwick ASSERT(db->db_level == 0 || data == db->db_buf); 3165b24ab676SJeff Bonwick ASSERT3U(db->db_blkptr->blk_birth, <=, txg); 3166b24ab676SJeff Bonwick ASSERT(zio); 3167b24ab676SJeff Bonwick 3168b24ab676SJeff Bonwick SET_BOOKMARK(&zb, os->os_dsl_dataset ? 3169b24ab676SJeff Bonwick os->os_dsl_dataset->ds_object : DMU_META_OBJSET, 3170b24ab676SJeff Bonwick db->db.db_object, db->db_level, db->db_blkid); 3171b24ab676SJeff Bonwick 31720a586ceaSMark Shellenbaum if (db->db_blkid == DMU_SPILL_BLKID) 31730a586ceaSMark Shellenbaum wp_flag = WP_SPILL; 31740a586ceaSMark Shellenbaum wp_flag |= (db->db_state == DB_NOFILL) ? 
WP_NOFILL : 0; 31750a586ceaSMark Shellenbaum 31760a586ceaSMark Shellenbaum dmu_write_policy(os, dn, db->db_level, wp_flag, &zp); 3177744947dcSTom Erickson DB_DNODE_EXIT(db); 3178b24ab676SJeff Bonwick 31795d7b4d43SMatthew Ahrens if (db->db_level == 0 && 31805d7b4d43SMatthew Ahrens dr->dt.dl.dr_override_state == DR_OVERRIDDEN) { 31815d7b4d43SMatthew Ahrens /* 31825d7b4d43SMatthew Ahrens * The BP for this block has been provided by open context 31835d7b4d43SMatthew Ahrens * (by dmu_sync() or dmu_buf_write_embedded()). 31845d7b4d43SMatthew Ahrens */ 31855d7b4d43SMatthew Ahrens void *contents = (data != NULL) ? data->b_data : NULL; 31865d7b4d43SMatthew Ahrens 3187b24ab676SJeff Bonwick dr->dr_zio = zio_write(zio, os->os_spa, txg, 31885d7b4d43SMatthew Ahrens db->db_blkptr, contents, db->db.db_size, &zp, 318969962b56SMatthew Ahrens dbuf_write_override_ready, NULL, dbuf_write_override_done, 319069962b56SMatthew Ahrens dr, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb); 3191b24ab676SJeff Bonwick mutex_enter(&db->db_mtx); 3192b24ab676SJeff Bonwick dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN; 3193b24ab676SJeff Bonwick zio_write_override(dr->dr_zio, &dr->dt.dl.dr_overridden_by, 319480901aeaSGeorge Wilson dr->dt.dl.dr_copies, dr->dt.dl.dr_nopwrite); 3195b24ab676SJeff Bonwick mutex_exit(&db->db_mtx); 3196b24ab676SJeff Bonwick } else if (db->db_state == DB_NOFILL) { 3197810e43b2SBill Pijewski ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF || 3198810e43b2SBill Pijewski zp.zp_checksum == ZIO_CHECKSUM_NOPARITY); 3199b24ab676SJeff Bonwick dr->dr_zio = zio_write(zio, os->os_spa, txg, 3200b24ab676SJeff Bonwick db->db_blkptr, NULL, db->db.db_size, &zp, 320169962b56SMatthew Ahrens dbuf_write_nofill_ready, NULL, dbuf_write_nofill_done, db, 3202b24ab676SJeff Bonwick ZIO_PRIORITY_ASYNC_WRITE, 3203b24ab676SJeff Bonwick ZIO_FLAG_MUSTSUCCEED | ZIO_FLAG_NODATA, &zb); 3204b24ab676SJeff Bonwick } else { 3205b24ab676SJeff Bonwick ASSERT(arc_released(data)); 3206b24ab676SJeff Bonwick dr->dr_zio 
= arc_write(zio, os->os_spa, txg, 3207aad02571SSaso Kiselkov db->db_blkptr, data, DBUF_IS_L2CACHEABLE(db), 3208aad02571SSaso Kiselkov DBUF_IS_L2COMPRESSIBLE(db), &zp, dbuf_write_ready, 320969962b56SMatthew Ahrens dbuf_write_physdone, dbuf_write_done, db, 321069962b56SMatthew Ahrens ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb); 3211b24ab676SJeff Bonwick } 3212fa9e4066Sahrens } 3213