1fa9e4066Sahrens /* 2fa9e4066Sahrens * CDDL HEADER START 3fa9e4066Sahrens * 4fa9e4066Sahrens * The contents of this file are subject to the terms of the 5f65e61c0Sahrens * Common Development and Distribution License (the "License"). 6f65e61c0Sahrens * You may not use this file except in compliance with the License. 7fa9e4066Sahrens * 8fa9e4066Sahrens * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9fa9e4066Sahrens * or http://www.opensolaris.org/os/licensing. 10fa9e4066Sahrens * See the License for the specific language governing permissions 11fa9e4066Sahrens * and limitations under the License. 12fa9e4066Sahrens * 13fa9e4066Sahrens * When distributing Covered Code, include this CDDL HEADER in each 14fa9e4066Sahrens * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15fa9e4066Sahrens * If applicable, add the following below this CDDL HEADER, with the 16fa9e4066Sahrens * fields enclosed by brackets "[]" replaced with your own identifying 17fa9e4066Sahrens * information: Portions Copyright [yyyy] [name of copyright owner] 18fa9e4066Sahrens * 19fa9e4066Sahrens * CDDL HEADER END 20fa9e4066Sahrens */ 21fa9e4066Sahrens /* 2206e0070dSMark Shellenbaum * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 233f2366c2SGordon Ross * Copyright 2011 Nexenta Systems, Inc. All rights reserved. 2446e1baa6SMatthew Ahrens * Copyright (c) 2012, 2015 by Delphix. All rights reserved. 25aad02571SSaso Kiselkov * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. 26810e43b2SBill Pijewski * Copyright (c) 2013, Joyent, Inc. All rights reserved. 27bc9014e6SJustin Gibbs * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. 
28c3d26abcSMatthew Ahrens * Copyright (c) 2014 Integros [integros.com] 29fa9e4066Sahrens */ 30fa9e4066Sahrens 31fa9e4066Sahrens #include <sys/zfs_context.h> 32fa9e4066Sahrens #include <sys/dmu.h> 332f3d8780SMatthew Ahrens #include <sys/dmu_send.h> 34fa9e4066Sahrens #include <sys/dmu_impl.h> 35fa9e4066Sahrens #include <sys/dbuf.h> 36fa9e4066Sahrens #include <sys/dmu_objset.h> 37fa9e4066Sahrens #include <sys/dsl_dataset.h> 38fa9e4066Sahrens #include <sys/dsl_dir.h> 39fa9e4066Sahrens #include <sys/dmu_tx.h> 40fa9e4066Sahrens #include <sys/spa.h> 41fa9e4066Sahrens #include <sys/zio.h> 42fa9e4066Sahrens #include <sys/dmu_zfetch.h> 430a586ceaSMark Shellenbaum #include <sys/sa.h> 440a586ceaSMark Shellenbaum #include <sys/sa_impl.h> 455d7b4d43SMatthew Ahrens #include <sys/zfeature.h> 465d7b4d43SMatthew Ahrens #include <sys/blkptr.h> 47bf16b11eSMatthew Ahrens #include <sys/range_tree.h> 48fa9e4066Sahrens 49713d6c20SMatthew Ahrens /* 50713d6c20SMatthew Ahrens * Number of times that zfs_free_range() took the slow path while doing 51713d6c20SMatthew Ahrens * a zfs receive. A nonzero value indicates a potential performance problem. 52713d6c20SMatthew Ahrens */ 53713d6c20SMatthew Ahrens uint64_t zfs_free_range_recv_miss; 54713d6c20SMatthew Ahrens 55fa9e4066Sahrens static void dbuf_destroy(dmu_buf_impl_t *db); 563b2aab18SMatthew Ahrens static boolean_t dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx); 57088f3894Sahrens static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx); 58fa9e4066Sahrens 59bc9014e6SJustin Gibbs #ifndef __lint 60bc9014e6SJustin Gibbs extern inline void dmu_buf_init_user(dmu_buf_user_t *dbu, 61bc9014e6SJustin Gibbs dmu_buf_evict_func_t *evict_func, dmu_buf_t **clear_on_evict_dbufp); 62bc9014e6SJustin Gibbs #endif /* ! __lint */ 63bc9014e6SJustin Gibbs 64fa9e4066Sahrens /* 65fa9e4066Sahrens * Global data structures and functions for the dbuf cache. 
66fa9e4066Sahrens */ 67fa9e4066Sahrens static kmem_cache_t *dbuf_cache; 68bc9014e6SJustin Gibbs static taskq_t *dbu_evict_taskq; 69fa9e4066Sahrens 70fa9e4066Sahrens /* ARGSUSED */ 71fa9e4066Sahrens static int 72fa9e4066Sahrens dbuf_cons(void *vdb, void *unused, int kmflag) 73fa9e4066Sahrens { 74fa9e4066Sahrens dmu_buf_impl_t *db = vdb; 75fa9e4066Sahrens bzero(db, sizeof (dmu_buf_impl_t)); 76fa9e4066Sahrens 77fa9e4066Sahrens mutex_init(&db->db_mtx, NULL, MUTEX_DEFAULT, NULL); 78fa9e4066Sahrens cv_init(&db->db_changed, NULL, CV_DEFAULT, NULL); 79fa9e4066Sahrens refcount_create(&db->db_holds); 800f6d88adSAlex Reece 81fa9e4066Sahrens return (0); 82fa9e4066Sahrens } 83fa9e4066Sahrens 84fa9e4066Sahrens /* ARGSUSED */ 85fa9e4066Sahrens static void 86fa9e4066Sahrens dbuf_dest(void *vdb, void *unused) 87fa9e4066Sahrens { 88fa9e4066Sahrens dmu_buf_impl_t *db = vdb; 89fa9e4066Sahrens mutex_destroy(&db->db_mtx); 90fa9e4066Sahrens cv_destroy(&db->db_changed); 91fa9e4066Sahrens refcount_destroy(&db->db_holds); 92fa9e4066Sahrens } 93fa9e4066Sahrens 94fa9e4066Sahrens /* 95fa9e4066Sahrens * dbuf hash table routines 96fa9e4066Sahrens */ 97fa9e4066Sahrens static dbuf_hash_table_t dbuf_hash_table; 98fa9e4066Sahrens 99fa9e4066Sahrens static uint64_t dbuf_hash_count; 100fa9e4066Sahrens 101fa9e4066Sahrens static uint64_t 102fa9e4066Sahrens dbuf_hash(void *os, uint64_t obj, uint8_t lvl, uint64_t blkid) 103fa9e4066Sahrens { 104fa9e4066Sahrens uintptr_t osv = (uintptr_t)os; 105fa9e4066Sahrens uint64_t crc = -1ULL; 106fa9e4066Sahrens 107fa9e4066Sahrens ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY); 108fa9e4066Sahrens crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (lvl)) & 0xFF]; 109fa9e4066Sahrens crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (osv >> 6)) & 0xFF]; 110fa9e4066Sahrens crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 0)) & 0xFF]; 111fa9e4066Sahrens crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 8)) & 0xFF]; 112fa9e4066Sahrens crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 0)) & 
0xFF]; 113fa9e4066Sahrens crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 8)) & 0xFF]; 114fa9e4066Sahrens 115fa9e4066Sahrens crc ^= (osv>>14) ^ (obj>>16) ^ (blkid>>16); 116fa9e4066Sahrens 117fa9e4066Sahrens return (crc); 118fa9e4066Sahrens } 119fa9e4066Sahrens 120fa9e4066Sahrens #define DBUF_HASH(os, obj, level, blkid) dbuf_hash(os, obj, level, blkid); 121fa9e4066Sahrens 122fa9e4066Sahrens #define DBUF_EQUAL(dbuf, os, obj, level, blkid) \ 123fa9e4066Sahrens ((dbuf)->db.db_object == (obj) && \ 124fa9e4066Sahrens (dbuf)->db_objset == (os) && \ 125fa9e4066Sahrens (dbuf)->db_level == (level) && \ 126fa9e4066Sahrens (dbuf)->db_blkid == (blkid)) 127fa9e4066Sahrens 128fa9e4066Sahrens dmu_buf_impl_t * 129e57a022bSJustin T. Gibbs dbuf_find(objset_t *os, uint64_t obj, uint8_t level, uint64_t blkid) 130fa9e4066Sahrens { 131fa9e4066Sahrens dbuf_hash_table_t *h = &dbuf_hash_table; 132fa9e4066Sahrens uint64_t hv = DBUF_HASH(os, obj, level, blkid); 133fa9e4066Sahrens uint64_t idx = hv & h->hash_table_mask; 134fa9e4066Sahrens dmu_buf_impl_t *db; 135fa9e4066Sahrens 136fa9e4066Sahrens mutex_enter(DBUF_HASH_MUTEX(h, idx)); 137fa9e4066Sahrens for (db = h->hash_table[idx]; db != NULL; db = db->db_hash_next) { 138fa9e4066Sahrens if (DBUF_EQUAL(db, os, obj, level, blkid)) { 139fa9e4066Sahrens mutex_enter(&db->db_mtx); 140ea8dc4b6Seschrock if (db->db_state != DB_EVICTING) { 141fa9e4066Sahrens mutex_exit(DBUF_HASH_MUTEX(h, idx)); 142fa9e4066Sahrens return (db); 143fa9e4066Sahrens } 144fa9e4066Sahrens mutex_exit(&db->db_mtx); 145fa9e4066Sahrens } 146fa9e4066Sahrens } 147fa9e4066Sahrens mutex_exit(DBUF_HASH_MUTEX(h, idx)); 148fa9e4066Sahrens return (NULL); 149fa9e4066Sahrens } 150fa9e4066Sahrens 151e57a022bSJustin T. Gibbs static dmu_buf_impl_t * 152e57a022bSJustin T. Gibbs dbuf_find_bonus(objset_t *os, uint64_t object) 153e57a022bSJustin T. Gibbs { 154e57a022bSJustin T. Gibbs dnode_t *dn; 155e57a022bSJustin T. Gibbs dmu_buf_impl_t *db = NULL; 156e57a022bSJustin T. 
Gibbs 157e57a022bSJustin T. Gibbs if (dnode_hold(os, object, FTAG, &dn) == 0) { 158e57a022bSJustin T. Gibbs rw_enter(&dn->dn_struct_rwlock, RW_READER); 159e57a022bSJustin T. Gibbs if (dn->dn_bonus != NULL) { 160e57a022bSJustin T. Gibbs db = dn->dn_bonus; 161e57a022bSJustin T. Gibbs mutex_enter(&db->db_mtx); 162e57a022bSJustin T. Gibbs } 163e57a022bSJustin T. Gibbs rw_exit(&dn->dn_struct_rwlock); 164e57a022bSJustin T. Gibbs dnode_rele(dn, FTAG); 165e57a022bSJustin T. Gibbs } 166e57a022bSJustin T. Gibbs return (db); 167e57a022bSJustin T. Gibbs } 168e57a022bSJustin T. Gibbs 169fa9e4066Sahrens /* 170fa9e4066Sahrens * Insert an entry into the hash table. If there is already an element 171fa9e4066Sahrens * equal to elem in the hash table, then the already existing element 172fa9e4066Sahrens * will be returned and the new element will not be inserted. 173fa9e4066Sahrens * Otherwise returns NULL. 174fa9e4066Sahrens */ 175fa9e4066Sahrens static dmu_buf_impl_t * 176fa9e4066Sahrens dbuf_hash_insert(dmu_buf_impl_t *db) 177fa9e4066Sahrens { 178fa9e4066Sahrens dbuf_hash_table_t *h = &dbuf_hash_table; 179503ad85cSMatthew Ahrens objset_t *os = db->db_objset; 180fa9e4066Sahrens uint64_t obj = db->db.db_object; 181fa9e4066Sahrens int level = db->db_level; 182fa9e4066Sahrens uint64_t blkid = db->db_blkid; 183fa9e4066Sahrens uint64_t hv = DBUF_HASH(os, obj, level, blkid); 184fa9e4066Sahrens uint64_t idx = hv & h->hash_table_mask; 185fa9e4066Sahrens dmu_buf_impl_t *dbf; 186fa9e4066Sahrens 187fa9e4066Sahrens mutex_enter(DBUF_HASH_MUTEX(h, idx)); 188fa9e4066Sahrens for (dbf = h->hash_table[idx]; dbf != NULL; dbf = dbf->db_hash_next) { 189fa9e4066Sahrens if (DBUF_EQUAL(dbf, os, obj, level, blkid)) { 190fa9e4066Sahrens mutex_enter(&dbf->db_mtx); 191ea8dc4b6Seschrock if (dbf->db_state != DB_EVICTING) { 192fa9e4066Sahrens mutex_exit(DBUF_HASH_MUTEX(h, idx)); 193fa9e4066Sahrens return (dbf); 194fa9e4066Sahrens } 195fa9e4066Sahrens mutex_exit(&dbf->db_mtx); 196fa9e4066Sahrens } 
197fa9e4066Sahrens } 198fa9e4066Sahrens 199fa9e4066Sahrens mutex_enter(&db->db_mtx); 200fa9e4066Sahrens db->db_hash_next = h->hash_table[idx]; 201fa9e4066Sahrens h->hash_table[idx] = db; 202fa9e4066Sahrens mutex_exit(DBUF_HASH_MUTEX(h, idx)); 2031a5e258fSJosef 'Jeff' Sipek atomic_inc_64(&dbuf_hash_count); 204fa9e4066Sahrens 205fa9e4066Sahrens return (NULL); 206fa9e4066Sahrens } 207fa9e4066Sahrens 208fa9e4066Sahrens /* 209bbfa8ea8SMatthew Ahrens * Remove an entry from the hash table. It must be in the EVICTING state. 210fa9e4066Sahrens */ 211fa9e4066Sahrens static void 212fa9e4066Sahrens dbuf_hash_remove(dmu_buf_impl_t *db) 213fa9e4066Sahrens { 214fa9e4066Sahrens dbuf_hash_table_t *h = &dbuf_hash_table; 215fa9e4066Sahrens uint64_t hv = DBUF_HASH(db->db_objset, db->db.db_object, 216fa9e4066Sahrens db->db_level, db->db_blkid); 217fa9e4066Sahrens uint64_t idx = hv & h->hash_table_mask; 218fa9e4066Sahrens dmu_buf_impl_t *dbf, **dbp; 219fa9e4066Sahrens 220fa9e4066Sahrens /* 221bbfa8ea8SMatthew Ahrens * We musn't hold db_mtx to maintain lock ordering: 222fa9e4066Sahrens * DBUF_HASH_MUTEX > db_mtx. 
223fa9e4066Sahrens */ 224fa9e4066Sahrens ASSERT(refcount_is_zero(&db->db_holds)); 225ea8dc4b6Seschrock ASSERT(db->db_state == DB_EVICTING); 226fa9e4066Sahrens ASSERT(!MUTEX_HELD(&db->db_mtx)); 227fa9e4066Sahrens 228fa9e4066Sahrens mutex_enter(DBUF_HASH_MUTEX(h, idx)); 229fa9e4066Sahrens dbp = &h->hash_table[idx]; 230fa9e4066Sahrens while ((dbf = *dbp) != db) { 231fa9e4066Sahrens dbp = &dbf->db_hash_next; 232fa9e4066Sahrens ASSERT(dbf != NULL); 233fa9e4066Sahrens } 234fa9e4066Sahrens *dbp = db->db_hash_next; 235fa9e4066Sahrens db->db_hash_next = NULL; 236fa9e4066Sahrens mutex_exit(DBUF_HASH_MUTEX(h, idx)); 2371a5e258fSJosef 'Jeff' Sipek atomic_dec_64(&dbuf_hash_count); 238fa9e4066Sahrens } 239fa9e4066Sahrens 240ea8dc4b6Seschrock static arc_evict_func_t dbuf_do_evict; 241fa9e4066Sahrens 242bc9014e6SJustin Gibbs typedef enum { 243bc9014e6SJustin Gibbs DBVU_EVICTING, 244bc9014e6SJustin Gibbs DBVU_NOT_EVICTING 245bc9014e6SJustin Gibbs } dbvu_verify_type_t; 246bc9014e6SJustin Gibbs 247bc9014e6SJustin Gibbs static void 248bc9014e6SJustin Gibbs dbuf_verify_user(dmu_buf_impl_t *db, dbvu_verify_type_t verify_type) 249bc9014e6SJustin Gibbs { 250bc9014e6SJustin Gibbs #ifdef ZFS_DEBUG 251bc9014e6SJustin Gibbs int64_t holds; 252bc9014e6SJustin Gibbs 253bc9014e6SJustin Gibbs if (db->db_user == NULL) 254bc9014e6SJustin Gibbs return; 255bc9014e6SJustin Gibbs 256bc9014e6SJustin Gibbs /* Only data blocks support the attachment of user data. */ 257bc9014e6SJustin Gibbs ASSERT(db->db_level == 0); 258bc9014e6SJustin Gibbs 259bc9014e6SJustin Gibbs /* Clients must resolve a dbuf before attaching user data. */ 260bc9014e6SJustin Gibbs ASSERT(db->db.db_data != NULL); 261bc9014e6SJustin Gibbs ASSERT3U(db->db_state, ==, DB_CACHED); 262bc9014e6SJustin Gibbs 263bc9014e6SJustin Gibbs holds = refcount_count(&db->db_holds); 264bc9014e6SJustin Gibbs if (verify_type == DBVU_EVICTING) { 265bc9014e6SJustin Gibbs /* 266bc9014e6SJustin Gibbs * Immediate eviction occurs when holds == dirtycnt. 
267bc9014e6SJustin Gibbs * For normal eviction buffers, holds is zero on 268bc9014e6SJustin Gibbs * eviction, except when dbuf_fix_old_data() calls 269bc9014e6SJustin Gibbs * dbuf_clear_data(). However, the hold count can grow 270bc9014e6SJustin Gibbs * during eviction even though db_mtx is held (see 271bc9014e6SJustin Gibbs * dmu_bonus_hold() for an example), so we can only 272bc9014e6SJustin Gibbs * test the generic invariant that holds >= dirtycnt. 273bc9014e6SJustin Gibbs */ 274bc9014e6SJustin Gibbs ASSERT3U(holds, >=, db->db_dirtycnt); 275bc9014e6SJustin Gibbs } else { 276d2058105SJustin T. Gibbs if (db->db_user_immediate_evict == TRUE) 277bc9014e6SJustin Gibbs ASSERT3U(holds, >=, db->db_dirtycnt); 278bc9014e6SJustin Gibbs else 279bc9014e6SJustin Gibbs ASSERT3U(holds, >, 0); 280bc9014e6SJustin Gibbs } 281bc9014e6SJustin Gibbs #endif 282bc9014e6SJustin Gibbs } 283bc9014e6SJustin Gibbs 284fa9e4066Sahrens static void 285fa9e4066Sahrens dbuf_evict_user(dmu_buf_impl_t *db) 286fa9e4066Sahrens { 287bc9014e6SJustin Gibbs dmu_buf_user_t *dbu = db->db_user; 288bc9014e6SJustin Gibbs 289fa9e4066Sahrens ASSERT(MUTEX_HELD(&db->db_mtx)); 290fa9e4066Sahrens 291bc9014e6SJustin Gibbs if (dbu == NULL) 292fa9e4066Sahrens return; 293fa9e4066Sahrens 294bc9014e6SJustin Gibbs dbuf_verify_user(db, DBVU_EVICTING); 295bc9014e6SJustin Gibbs db->db_user = NULL; 296bc9014e6SJustin Gibbs 297bc9014e6SJustin Gibbs #ifdef ZFS_DEBUG 298bc9014e6SJustin Gibbs if (dbu->dbu_clear_on_evict_dbufp != NULL) 299bc9014e6SJustin Gibbs *dbu->dbu_clear_on_evict_dbufp = NULL; 300bc9014e6SJustin Gibbs #endif 301bc9014e6SJustin Gibbs 302bc9014e6SJustin Gibbs /* 303bc9014e6SJustin Gibbs * Invoke the callback from a taskq to avoid lock order reversals 304bc9014e6SJustin Gibbs * and limit stack depth. 
305bc9014e6SJustin Gibbs */ 306bc9014e6SJustin Gibbs taskq_dispatch_ent(dbu_evict_taskq, dbu->dbu_evict_func, dbu, 0, 307bc9014e6SJustin Gibbs &dbu->dbu_tqent); 308fa9e4066Sahrens } 309fa9e4066Sahrens 310744947dcSTom Erickson boolean_t 311744947dcSTom Erickson dbuf_is_metadata(dmu_buf_impl_t *db) 312744947dcSTom Erickson { 313744947dcSTom Erickson if (db->db_level > 0) { 314744947dcSTom Erickson return (B_TRUE); 315744947dcSTom Erickson } else { 316744947dcSTom Erickson boolean_t is_metadata; 317744947dcSTom Erickson 318744947dcSTom Erickson DB_DNODE_ENTER(db); 319ad135b5dSChristopher Siden is_metadata = DMU_OT_IS_METADATA(DB_DNODE(db)->dn_type); 320744947dcSTom Erickson DB_DNODE_EXIT(db); 321744947dcSTom Erickson 322744947dcSTom Erickson return (is_metadata); 323744947dcSTom Erickson } 324744947dcSTom Erickson } 325744947dcSTom Erickson 326fa9e4066Sahrens void 327ea8dc4b6Seschrock dbuf_evict(dmu_buf_impl_t *db) 328ea8dc4b6Seschrock { 329ea8dc4b6Seschrock ASSERT(MUTEX_HELD(&db->db_mtx)); 330ea8dc4b6Seschrock ASSERT(db->db_buf == NULL); 331c717a561Smaybee ASSERT(db->db_data_pending == NULL); 332ea8dc4b6Seschrock 333ea8dc4b6Seschrock dbuf_clear(db); 334ea8dc4b6Seschrock dbuf_destroy(db); 335ea8dc4b6Seschrock } 336ea8dc4b6Seschrock 337ea8dc4b6Seschrock void 338fa9e4066Sahrens dbuf_init(void) 339fa9e4066Sahrens { 340ea8dc4b6Seschrock uint64_t hsize = 1ULL << 16; 341fa9e4066Sahrens dbuf_hash_table_t *h = &dbuf_hash_table; 342fa9e4066Sahrens int i; 343fa9e4066Sahrens 344fa9e4066Sahrens /* 345fa9e4066Sahrens * The hash table is big enough to fill all of physical memory 346ea8dc4b6Seschrock * with an average 4K block size. The table will take up 347ea8dc4b6Seschrock * totalmem*sizeof(void*)/4K (i.e. 2MB/GB with 8-byte pointers). 
348fa9e4066Sahrens */ 349ea8dc4b6Seschrock while (hsize * 4096 < physmem * PAGESIZE) 350fa9e4066Sahrens hsize <<= 1; 351fa9e4066Sahrens 352ea8dc4b6Seschrock retry: 353fa9e4066Sahrens h->hash_table_mask = hsize - 1; 354ea8dc4b6Seschrock h->hash_table = kmem_zalloc(hsize * sizeof (void *), KM_NOSLEEP); 355ea8dc4b6Seschrock if (h->hash_table == NULL) { 356ea8dc4b6Seschrock /* XXX - we should really return an error instead of assert */ 357ea8dc4b6Seschrock ASSERT(hsize > (1ULL << 10)); 358ea8dc4b6Seschrock hsize >>= 1; 359ea8dc4b6Seschrock goto retry; 360ea8dc4b6Seschrock } 361fa9e4066Sahrens 362fa9e4066Sahrens dbuf_cache = kmem_cache_create("dmu_buf_impl_t", 363fa9e4066Sahrens sizeof (dmu_buf_impl_t), 364fa9e4066Sahrens 0, dbuf_cons, dbuf_dest, NULL, NULL, NULL, 0); 365fa9e4066Sahrens 366fa9e4066Sahrens for (i = 0; i < DBUF_MUTEXES; i++) 367fa9e4066Sahrens mutex_init(&h->hash_mutexes[i], NULL, MUTEX_DEFAULT, NULL); 368bc9014e6SJustin Gibbs 369bc9014e6SJustin Gibbs /* 370bc9014e6SJustin Gibbs * All entries are queued via taskq_dispatch_ent(), so min/maxalloc 371bc9014e6SJustin Gibbs * configuration is not required. 372bc9014e6SJustin Gibbs */ 373bc9014e6SJustin Gibbs dbu_evict_taskq = taskq_create("dbu_evict", 1, minclsyspri, 0, 0, 0); 374fa9e4066Sahrens } 375fa9e4066Sahrens 376fa9e4066Sahrens void 377fa9e4066Sahrens dbuf_fini(void) 378fa9e4066Sahrens { 379fa9e4066Sahrens dbuf_hash_table_t *h = &dbuf_hash_table; 380fa9e4066Sahrens int i; 381fa9e4066Sahrens 382fa9e4066Sahrens for (i = 0; i < DBUF_MUTEXES; i++) 383fa9e4066Sahrens mutex_destroy(&h->hash_mutexes[i]); 384fa9e4066Sahrens kmem_free(h->hash_table, (h->hash_table_mask + 1) * sizeof (void *)); 385fa9e4066Sahrens kmem_cache_destroy(dbuf_cache); 386bc9014e6SJustin Gibbs taskq_destroy(dbu_evict_taskq); 387fa9e4066Sahrens } 388fa9e4066Sahrens 389fa9e4066Sahrens /* 390fa9e4066Sahrens * Other stuff. 
391fa9e4066Sahrens */ 392fa9e4066Sahrens 3939c9dc39aSek110237 #ifdef ZFS_DEBUG 394fa9e4066Sahrens static void 395fa9e4066Sahrens dbuf_verify(dmu_buf_impl_t *db) 396fa9e4066Sahrens { 397744947dcSTom Erickson dnode_t *dn; 398b24ab676SJeff Bonwick dbuf_dirty_record_t *dr; 399fa9e4066Sahrens 400fa9e4066Sahrens ASSERT(MUTEX_HELD(&db->db_mtx)); 401fa9e4066Sahrens 402fa9e4066Sahrens if (!(zfs_flags & ZFS_DEBUG_DBUF_VERIFY)) 403fa9e4066Sahrens return; 404fa9e4066Sahrens 405fa9e4066Sahrens ASSERT(db->db_objset != NULL); 406744947dcSTom Erickson DB_DNODE_ENTER(db); 407744947dcSTom Erickson dn = DB_DNODE(db); 408fa9e4066Sahrens if (dn == NULL) { 409fa9e4066Sahrens ASSERT(db->db_parent == NULL); 410fa9e4066Sahrens ASSERT(db->db_blkptr == NULL); 411fa9e4066Sahrens } else { 412fa9e4066Sahrens ASSERT3U(db->db.db_object, ==, dn->dn_object); 413fa9e4066Sahrens ASSERT3P(db->db_objset, ==, dn->dn_objset); 414fa9e4066Sahrens ASSERT3U(db->db_level, <, dn->dn_nlevels); 415744947dcSTom Erickson ASSERT(db->db_blkid == DMU_BONUS_BLKID || 416744947dcSTom Erickson db->db_blkid == DMU_SPILL_BLKID || 4170f6d88adSAlex Reece !avl_is_empty(&dn->dn_dbufs)); 418fa9e4066Sahrens } 4190a586ceaSMark Shellenbaum if (db->db_blkid == DMU_BONUS_BLKID) { 420fa9e4066Sahrens ASSERT(dn != NULL); 4211934e92fSmaybee ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen); 4220a586ceaSMark Shellenbaum ASSERT3U(db->db.db_offset, ==, DMU_BONUS_BLKID); 4230a586ceaSMark Shellenbaum } else if (db->db_blkid == DMU_SPILL_BLKID) { 4240a586ceaSMark Shellenbaum ASSERT(dn != NULL); 4250a586ceaSMark Shellenbaum ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen); 426fb09f5aaSMadhav Suresh ASSERT0(db->db.db_offset); 427fa9e4066Sahrens } else { 428fa9e4066Sahrens ASSERT3U(db->db.db_offset, ==, db->db_blkid * db->db.db_size); 429fa9e4066Sahrens } 430fa9e4066Sahrens 431b24ab676SJeff Bonwick for (dr = db->db_data_pending; dr != NULL; dr = dr->dr_next) 432b24ab676SJeff Bonwick ASSERT(dr->dr_dbuf == db); 433b24ab676SJeff Bonwick 434b24ab676SJeff 
Bonwick for (dr = db->db_last_dirty; dr != NULL; dr = dr->dr_next) 435b24ab676SJeff Bonwick ASSERT(dr->dr_dbuf == db); 436b24ab676SJeff Bonwick 43788b7b0f2SMatthew Ahrens /* 43888b7b0f2SMatthew Ahrens * We can't assert that db_size matches dn_datablksz because it 43988b7b0f2SMatthew Ahrens * can be momentarily different when another thread is doing 44088b7b0f2SMatthew Ahrens * dnode_set_blksz(). 44188b7b0f2SMatthew Ahrens */ 44288b7b0f2SMatthew Ahrens if (db->db_level == 0 && db->db.db_object == DMU_META_DNODE_OBJECT) { 443b24ab676SJeff Bonwick dr = db->db_data_pending; 444fa9e4066Sahrens /* 44588b7b0f2SMatthew Ahrens * It should only be modified in syncing context, so 44688b7b0f2SMatthew Ahrens * make sure we only have one copy of the data. 447fa9e4066Sahrens */ 448c717a561Smaybee ASSERT(dr == NULL || dr->dt.dl.dr_data == db->db_buf); 449fa9e4066Sahrens } 450fa9e4066Sahrens 451fa9e4066Sahrens /* verify db->db_blkptr */ 452fa9e4066Sahrens if (db->db_blkptr) { 453fa9e4066Sahrens if (db->db_parent == dn->dn_dbuf) { 454fa9e4066Sahrens /* db is pointed to by the dnode */ 455fa9e4066Sahrens /* ASSERT3U(db->db_blkid, <, dn->dn_nblkptr); */ 45614843421SMatthew Ahrens if (DMU_OBJECT_IS_SPECIAL(db->db.db_object)) 457fa9e4066Sahrens ASSERT(db->db_parent == NULL); 458fa9e4066Sahrens else 459fa9e4066Sahrens ASSERT(db->db_parent != NULL); 4600a586ceaSMark Shellenbaum if (db->db_blkid != DMU_SPILL_BLKID) 461fa9e4066Sahrens ASSERT3P(db->db_blkptr, ==, 462fa9e4066Sahrens &dn->dn_phys->dn_blkptr[db->db_blkid]); 463fa9e4066Sahrens } else { 464fa9e4066Sahrens /* db is pointed to by an indirect block */ 465fa9e4066Sahrens int epb = db->db_parent->db.db_size >> SPA_BLKPTRSHIFT; 466fa9e4066Sahrens ASSERT3U(db->db_parent->db_level, ==, db->db_level+1); 467fa9e4066Sahrens ASSERT3U(db->db_parent->db.db_object, ==, 468fa9e4066Sahrens db->db.db_object); 469fa9e4066Sahrens /* 470fa9e4066Sahrens * dnode_grow_indblksz() can make this fail if we don't 471fa9e4066Sahrens * have the struct_rwlock. 
XXX indblksz no longer 472fa9e4066Sahrens * grows. safe to do this now? 473fa9e4066Sahrens */ 474744947dcSTom Erickson if (RW_WRITE_HELD(&dn->dn_struct_rwlock)) { 475fa9e4066Sahrens ASSERT3P(db->db_blkptr, ==, 476fa9e4066Sahrens ((blkptr_t *)db->db_parent->db.db_data + 477fa9e4066Sahrens db->db_blkid % epb)); 478fa9e4066Sahrens } 479fa9e4066Sahrens } 480fa9e4066Sahrens } 481fa9e4066Sahrens if ((db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr)) && 4823f9d6ad7SLin Ling (db->db_buf == NULL || db->db_buf->b_data) && 4830a586ceaSMark Shellenbaum db->db.db_data && db->db_blkid != DMU_BONUS_BLKID && 484fa9e4066Sahrens db->db_state != DB_FILL && !dn->dn_free_txg) { 485fa9e4066Sahrens /* 486fa9e4066Sahrens * If the blkptr isn't set but they have nonzero data, 487fa9e4066Sahrens * it had better be dirty, otherwise we'll lose that 488fa9e4066Sahrens * data when we evict this buffer. 489fa9e4066Sahrens */ 490fa9e4066Sahrens if (db->db_dirtycnt == 0) { 491fa9e4066Sahrens uint64_t *buf = db->db.db_data; 492fa9e4066Sahrens int i; 493fa9e4066Sahrens 494fa9e4066Sahrens for (i = 0; i < db->db.db_size >> 3; i++) { 495fa9e4066Sahrens ASSERT(buf[i] == 0); 496fa9e4066Sahrens } 497fa9e4066Sahrens } 498fa9e4066Sahrens } 499744947dcSTom Erickson DB_DNODE_EXIT(db); 500fa9e4066Sahrens } 5019c9dc39aSek110237 #endif 502fa9e4066Sahrens 503fa9e4066Sahrens static void 504bc9014e6SJustin Gibbs dbuf_clear_data(dmu_buf_impl_t *db) 505fa9e4066Sahrens { 506fa9e4066Sahrens ASSERT(MUTEX_HELD(&db->db_mtx)); 507ea8dc4b6Seschrock dbuf_evict_user(db); 508bc9014e6SJustin Gibbs db->db_buf = NULL; 509ea8dc4b6Seschrock db->db.db_data = NULL; 51082c9918fSTim Haley if (db->db_state != DB_NOFILL) 511ea8dc4b6Seschrock db->db_state = DB_UNCACHED; 512ea8dc4b6Seschrock } 513bc9014e6SJustin Gibbs 514bc9014e6SJustin Gibbs static void 515bc9014e6SJustin Gibbs dbuf_set_data(dmu_buf_impl_t *db, arc_buf_t *buf) 516bc9014e6SJustin Gibbs { 517bc9014e6SJustin Gibbs ASSERT(MUTEX_HELD(&db->db_mtx)); 518bc9014e6SJustin Gibbs 
ASSERT(buf != NULL); 519bc9014e6SJustin Gibbs 520bc9014e6SJustin Gibbs db->db_buf = buf; 521bc9014e6SJustin Gibbs ASSERT(buf->b_data != NULL); 522bc9014e6SJustin Gibbs db->db.db_data = buf->b_data; 523bc9014e6SJustin Gibbs if (!arc_released(buf)) 524bc9014e6SJustin Gibbs arc_set_callback(buf, dbuf_do_evict, db); 525fa9e4066Sahrens } 526fa9e4066Sahrens 527c242f9a0Schunli zhang - Sun Microsystems - Irvine United States /* 528c242f9a0Schunli zhang - Sun Microsystems - Irvine United States * Loan out an arc_buf for read. Return the loaned arc_buf. 529c242f9a0Schunli zhang - Sun Microsystems - Irvine United States */ 530c242f9a0Schunli zhang - Sun Microsystems - Irvine United States arc_buf_t * 531c242f9a0Schunli zhang - Sun Microsystems - Irvine United States dbuf_loan_arcbuf(dmu_buf_impl_t *db) 532c242f9a0Schunli zhang - Sun Microsystems - Irvine United States { 533c242f9a0Schunli zhang - Sun Microsystems - Irvine United States arc_buf_t *abuf; 534c242f9a0Schunli zhang - Sun Microsystems - Irvine United States 535c242f9a0Schunli zhang - Sun Microsystems - Irvine United States mutex_enter(&db->db_mtx); 536c242f9a0Schunli zhang - Sun Microsystems - Irvine United States if (arc_released(db->db_buf) || refcount_count(&db->db_holds) > 1) { 537c242f9a0Schunli zhang - Sun Microsystems - Irvine United States int blksz = db->db.db_size; 53843466aaeSMax Grossman spa_t *spa = db->db_objset->os_spa; 539744947dcSTom Erickson 540c242f9a0Schunli zhang - Sun Microsystems - Irvine United States mutex_exit(&db->db_mtx); 541744947dcSTom Erickson abuf = arc_loan_buf(spa, blksz); 542c242f9a0Schunli zhang - Sun Microsystems - Irvine United States bcopy(db->db.db_data, abuf->b_data, blksz); 543c242f9a0Schunli zhang - Sun Microsystems - Irvine United States } else { 544c242f9a0Schunli zhang - Sun Microsystems - Irvine United States abuf = db->db_buf; 545c242f9a0Schunli zhang - Sun Microsystems - Irvine United States arc_loan_inuse_buf(abuf, db); 546bc9014e6SJustin Gibbs dbuf_clear_data(db); 
547c242f9a0Schunli zhang - Sun Microsystems - Irvine United States mutex_exit(&db->db_mtx); 548c242f9a0Schunli zhang - Sun Microsystems - Irvine United States } 549c242f9a0Schunli zhang - Sun Microsystems - Irvine United States return (abuf); 550c242f9a0Schunli zhang - Sun Microsystems - Irvine United States } 551c242f9a0Schunli zhang - Sun Microsystems - Irvine United States 552a2cdcdd2SPaul Dagnelie /* 553a2cdcdd2SPaul Dagnelie * Calculate which level n block references the data at the level 0 offset 554a2cdcdd2SPaul Dagnelie * provided. 555a2cdcdd2SPaul Dagnelie */ 556fa9e4066Sahrens uint64_t 557a2cdcdd2SPaul Dagnelie dbuf_whichblock(dnode_t *dn, int64_t level, uint64_t offset) 558fa9e4066Sahrens { 559a2cdcdd2SPaul Dagnelie if (dn->dn_datablkshift != 0 && dn->dn_indblkshift != 0) { 560a2cdcdd2SPaul Dagnelie /* 561a2cdcdd2SPaul Dagnelie * The level n blkid is equal to the level 0 blkid divided by 562a2cdcdd2SPaul Dagnelie * the number of level 0s in a level n block. 563a2cdcdd2SPaul Dagnelie * 564a2cdcdd2SPaul Dagnelie * The level 0 blkid is offset >> datablkshift = 565a2cdcdd2SPaul Dagnelie * offset / 2^datablkshift. 566a2cdcdd2SPaul Dagnelie * 567a2cdcdd2SPaul Dagnelie * The number of level 0s in a level n is the number of block 568a2cdcdd2SPaul Dagnelie * pointers in an indirect block, raised to the power of level. 569a2cdcdd2SPaul Dagnelie * This is 2^(indblkshift - SPA_BLKPTRSHIFT)^level = 570a2cdcdd2SPaul Dagnelie * 2^(level*(indblkshift - SPA_BLKPTRSHIFT)). 
571a2cdcdd2SPaul Dagnelie * 572a2cdcdd2SPaul Dagnelie * Thus, the level n blkid is: offset / 573a2cdcdd2SPaul Dagnelie * ((2^datablkshift)*(2^(level*(indblkshift - SPA_BLKPTRSHIFT))) 574a2cdcdd2SPaul Dagnelie * = offset / 2^(datablkshift + level * 575a2cdcdd2SPaul Dagnelie * (indblkshift - SPA_BLKPTRSHIFT)) 576a2cdcdd2SPaul Dagnelie * = offset >> (datablkshift + level * 577a2cdcdd2SPaul Dagnelie * (indblkshift - SPA_BLKPTRSHIFT)) 578a2cdcdd2SPaul Dagnelie */ 579a2cdcdd2SPaul Dagnelie return (offset >> (dn->dn_datablkshift + level * 580a2cdcdd2SPaul Dagnelie (dn->dn_indblkshift - SPA_BLKPTRSHIFT))); 581fa9e4066Sahrens } else { 582fa9e4066Sahrens ASSERT3U(offset, <, dn->dn_datablksz); 583fa9e4066Sahrens return (0); 584fa9e4066Sahrens } 585fa9e4066Sahrens } 586fa9e4066Sahrens 587fa9e4066Sahrens static void 588fa9e4066Sahrens dbuf_read_done(zio_t *zio, arc_buf_t *buf, void *vdb) 589fa9e4066Sahrens { 590fa9e4066Sahrens dmu_buf_impl_t *db = vdb; 591fa9e4066Sahrens 592fa9e4066Sahrens mutex_enter(&db->db_mtx); 593fa9e4066Sahrens ASSERT3U(db->db_state, ==, DB_READ); 594fa9e4066Sahrens /* 595fa9e4066Sahrens * All reads are synchronous, so we must have a hold on the dbuf 596fa9e4066Sahrens */ 597fa9e4066Sahrens ASSERT(refcount_count(&db->db_holds) > 0); 598ea8dc4b6Seschrock ASSERT(db->db_buf == NULL); 599fa9e4066Sahrens ASSERT(db->db.db_data == NULL); 600c717a561Smaybee if (db->db_level == 0 && db->db_freed_in_flight) { 601fa9e4066Sahrens /* we were freed in flight; disregard any error */ 602fa9e4066Sahrens arc_release(buf, db); 603fa9e4066Sahrens bzero(buf->b_data, db->db.db_size); 6046b4acc8bSahrens arc_buf_freeze(buf); 605c717a561Smaybee db->db_freed_in_flight = FALSE; 606fa9e4066Sahrens dbuf_set_data(db, buf); 607fa9e4066Sahrens db->db_state = DB_CACHED; 608fa9e4066Sahrens } else if (zio == NULL || zio->io_error == 0) { 609fa9e4066Sahrens dbuf_set_data(db, buf); 610fa9e4066Sahrens db->db_state = DB_CACHED; 611fa9e4066Sahrens } else { 6120a586ceaSMark Shellenbaum 
		/*
		 * NOTE(review): this is the tail of dbuf_read_done(); the
		 * branch condition lives above this chunk.  The buffer is
		 * dropped and the dbuf returned to DB_UNCACHED — presumably
		 * the read-failure path; confirm against the full function.
		 */
		ASSERT(db->db_blkid != DMU_BONUS_BLKID);
		ASSERT3P(db->db_buf, ==, NULL);
		VERIFY(arc_buf_remove_ref(buf, db));
		db->db_state = DB_UNCACHED;
	}
	/* Wake any threads blocked in dbuf_read()/dbuf_noread(). */
	cv_broadcast(&db->db_changed);
	dbuf_rele_and_unlock(db, NULL);
}

/*
 * Fill in the contents of 'db'.
 *
 * Entry: db_mtx held, dn_struct_rwlock held (keeps db_blkptr from
 * changing), db_state == DB_UNCACHED, no ARC buffer attached.
 * db_mtx is always dropped before returning.
 *
 * Three cases are handled:
 *  - bonus buffer: copy the bonus data out of dn_phys; -> DB_CACHED.
 *  - hole / freed block: attach a zero-filled ARC buffer; -> DB_CACHED.
 *  - real block: move to DB_READ and issue an asynchronous arc_read();
 *    the dbuf_read_done() callback finishes the state transition.
 *
 * 'zio' is the parent zio the read is attached to (may be NULL);
 * DB_RF_CANFAIL in 'flags' selects ZIO_FLAG_CANFAIL over MUSTSUCCEED.
 */
static void
dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
{
	dnode_t *dn;
	zbookmark_phys_t zb;
	arc_flags_t aflags = ARC_FLAG_NOWAIT;

	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);
	ASSERT(!refcount_is_zero(&db->db_holds));
	/* We need the struct_rwlock to prevent db_blkptr from changing. */
	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
	ASSERT(MUTEX_HELD(&db->db_mtx));
	ASSERT(db->db_state == DB_UNCACHED);
	ASSERT(db->db_buf == NULL);

	if (db->db_blkid == DMU_BONUS_BLKID) {
		/* Valid bonus bytes may be smaller than the dbuf. */
		int bonuslen = MIN(dn->dn_bonuslen, dn->dn_phys->dn_bonuslen);

		ASSERT3U(bonuslen, <=, db->db.db_size);
		/*
		 * Bonus buffers come from zio_buf_alloc(), not the ARC;
		 * the max-size allocation is charged to ARC accounting.
		 */
		db->db.db_data = zio_buf_alloc(DN_MAX_BONUSLEN);
		arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
		if (bonuslen < DN_MAX_BONUSLEN)
			bzero(db->db.db_data, DN_MAX_BONUSLEN);
		if (bonuslen)
			bcopy(DN_BONUS(dn->dn_phys), db->db.db_data, bonuslen);
		DB_DNODE_EXIT(db);
		db->db_state = DB_CACHED;
		mutex_exit(&db->db_mtx);
		return;
	}

	/*
	 * Recheck BP_IS_HOLE() after dnode_block_freed() in case dnode_sync()
	 * processes the delete record and clears the bp while we are waiting
	 * for the dn_mtx (resulting in a "no" from block_freed).
	 */
	if (db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr) ||
	    (db->db_level == 0 && (dnode_block_freed(dn, db->db_blkid) ||
	    BP_IS_HOLE(db->db_blkptr)))) {
		arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);

		/* Hole or freed: no I/O needed, hand back zeroed data. */
		DB_DNODE_EXIT(db);
		dbuf_set_data(db, arc_buf_alloc(db->db_objset->os_spa,
		    db->db.db_size, db, type));
		bzero(db->db.db_data, db->db.db_size);
		db->db_state = DB_CACHED;
		mutex_exit(&db->db_mtx);
		return;
	}

	DB_DNODE_EXIT(db);

	db->db_state = DB_READ;
	mutex_exit(&db->db_mtx);

	if (DBUF_IS_L2CACHEABLE(db))
		aflags |= ARC_FLAG_L2CACHE;
	if (DBUF_IS_L2COMPRESSIBLE(db))
		aflags |= ARC_FLAG_L2COMPRESS;

	SET_BOOKMARK(&zb, db->db_objset->os_dsl_dataset ?
	    db->db_objset->os_dsl_dataset->ds_object : DMU_META_OBJSET,
	    db->db.db_object, db->db_level, db->db_blkid);

	/* Hold for the in-flight read; released by dbuf_read_done(). */
	dbuf_add_ref(db, NULL);

	(void) arc_read(zio, db->db_objset->os_spa, db->db_blkptr,
	    dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ,
	    (flags & DB_RF_CANFAIL) ? ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED,
	    &aflags, &zb);
}

/*
 * Bring the dbuf's contents into memory.
 *
 * If 'zio' is non-NULL the read is attached to it and the caller is
 * responsible for waiting on it; otherwise a root zio is created and
 * waited on here.  Returns 0 on success, EIO for a NOFILL dbuf or a
 * failed read (unless DB_RF_NEVERWAIT skipped the wait).
 *
 * flags: DB_RF_HAVESTRUCT (caller already holds dn_struct_rwlock),
 * DB_RF_NOPREFETCH, DB_RF_NEVERWAIT, DB_RF_CANFAIL (see dbuf_read_impl).
 */
int
dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
{
	int err = 0;
	boolean_t havepzio = (zio != NULL);
	boolean_t prefetch;
	dnode_t *dn;

	/*
	 * We don't have to hold the mutex to check db_state because it
	 * can't be freed while we have a hold on the buffer.
	 */
	ASSERT(!refcount_is_zero(&db->db_holds));

	if (db->db_state == DB_NOFILL)
		return (SET_ERROR(EIO));

	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);
	if ((flags & DB_RF_HAVESTRUCT) == 0)
		rw_enter(&dn->dn_struct_rwlock, RW_READER);

	/* Prefetch only makes sense for cacheable level-0 data blocks. */
	prefetch = db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
	    (flags & DB_RF_NOPREFETCH) == 0 && dn != NULL &&
	    DBUF_IS_CACHEABLE(db);

	mutex_enter(&db->db_mtx);
	if (db->db_state == DB_CACHED) {
		/* Fast path: data already resident. */
		mutex_exit(&db->db_mtx);
		if (prefetch)
			dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE);
		if ((flags & DB_RF_HAVESTRUCT) == 0)
			rw_exit(&dn->dn_struct_rwlock);
		DB_DNODE_EXIT(db);
	} else if (db->db_state == DB_UNCACHED) {
		spa_t *spa = dn->dn_objset->os_spa;

		if (zio == NULL)
			zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
		dbuf_read_impl(db, zio, flags);

		/* dbuf_read_impl has dropped db_mtx for us */

		if (prefetch)
			dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE);

		if ((flags & DB_RF_HAVESTRUCT) == 0)
			rw_exit(&dn->dn_struct_rwlock);
		DB_DNODE_EXIT(db);

		/* Only wait if we created the root zio ourselves. */
		if (!havepzio)
			err = zio_wait(zio);
	} else {
		/*
		 * Another reader came in while the dbuf was in flight
		 * between UNCACHED and CACHED.  Either a writer will finish
		 * writing the buffer (sending the dbuf to CACHED) or the
		 * first reader's request will reach the read_done callback
		 * and send the dbuf to CACHED.  Otherwise, a failure
		 * occurred and the dbuf went to UNCACHED.
		 */
		mutex_exit(&db->db_mtx);
		if (prefetch)
			dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE);
		if ((flags & DB_RF_HAVESTRUCT) == 0)
			rw_exit(&dn->dn_struct_rwlock);
		DB_DNODE_EXIT(db);

		/* Skip the wait per the caller's request. */
		mutex_enter(&db->db_mtx);
		if ((flags & DB_RF_NEVERWAIT) == 0) {
			while (db->db_state == DB_READ ||
			    db->db_state == DB_FILL) {
				ASSERT(db->db_state == DB_READ ||
				    (flags & DB_RF_HAVESTRUCT) == 0);
				DTRACE_PROBE2(blocked__read, dmu_buf_impl_t *,
				    db, zio_t *, zio);
				cv_wait(&db->db_changed, &db->db_mtx);
			}
			/* UNCACHED after a READ wait means the read failed. */
			if (db->db_state == DB_UNCACHED)
				err = SET_ERROR(EIO);
		}
		mutex_exit(&db->db_mtx);
	}

	ASSERT(err || havepzio || db->db_state == DB_CACHED);
	return (err);
}

/*
 * Prepare the dbuf to be entirely overwritten without reading the old
 * contents from disk: wait out any in-flight READ/FILL, then attach a
 * fresh (uninitialized) ARC buffer and move to DB_FILL.  For NOFILL
 * dbufs, just detach the data pointer.  Not valid for the bonus buffer.
 */
static void
dbuf_noread(dmu_buf_impl_t *db)
{
	ASSERT(!refcount_is_zero(&db->db_holds));
	ASSERT(db->db_blkid != DMU_BONUS_BLKID);
	mutex_enter(&db->db_mtx);
	while (db->db_state == DB_READ || db->db_state == DB_FILL)
		cv_wait(&db->db_changed, &db->db_mtx);
	if (db->db_state == DB_UNCACHED) {
		arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
		spa_t *spa = db->db_objset->os_spa;

		ASSERT(db->db_buf == NULL);
		ASSERT(db->db.db_data == NULL);
		dbuf_set_data(db, arc_buf_alloc(spa, db->db.db_size, db, type));
		db->db_state = DB_FILL;
	} else if (db->db_state == DB_NOFILL) {
		dbuf_clear_data(db);
	} else {
		ASSERT3U(db->db_state, ==, DB_CACHED);
	}
	mutex_exit(&db->db_mtx);
}

/*
 * This is our just-in-time copy function.  It makes a copy of
 * buffers, that have been modified in a previous transaction
 * group, before we modify them in the current active group.
 *
 * This function is used in two places: when we are dirtying a
 * buffer for the first time in a txg, and when we are freeing
 * a range in a dnode that includes this buffer.
 *
 * Note that when we are called from dbuf_free_range() we do
 * not put a hold on the buffer, we just traverse the active
 * dbuf list for the dnode.
 */
static void
dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg)
{
	dbuf_dirty_record_t *dr = db->db_last_dirty;

	ASSERT(MUTEX_HELD(&db->db_mtx));
	ASSERT(db->db.db_data != NULL);
	ASSERT(db->db_level == 0);
	ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT);

	/* Nothing to do unless the latest dirty record shares our buffer. */
	if (dr == NULL ||
	    (dr->dt.dl.dr_data !=
	    ((db->db_blkid == DMU_BONUS_BLKID) ? db->db.db_data : db->db_buf)))
		return;

	/*
	 * If the last dirty record for this dbuf has not yet synced
	 * and it's referencing the dbuf data, either:
	 *	reset the reference to point to a new copy,
	 * or (if there are no active holders)
	 *	just null out the current db_data pointer.
	 */
	ASSERT(dr->dr_txg >= txg - 2);
	if (db->db_blkid == DMU_BONUS_BLKID) {
		/* Note that the data bufs here are zio_bufs */
		dr->dt.dl.dr_data = zio_buf_alloc(DN_MAX_BONUSLEN);
		arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
		bcopy(db->db.db_data, dr->dt.dl.dr_data, DN_MAX_BONUSLEN);
	} else if (refcount_count(&db->db_holds) > db->db_dirtycnt) {
		/* Active holders: the dirty record gets its own copy. */
		int size = db->db.db_size;
		arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
		spa_t *spa = db->db_objset->os_spa;

		dr->dt.dl.dr_data = arc_buf_alloc(spa, size, db, type);
		bcopy(db->db.db_data, dr->dt.dl.dr_data->b_data, size);
	} else {
		/* No other holders: the dirty record keeps the buffer. */
		dbuf_clear_data(db);
	}
}

/*
 * Undo a dmu_sync()-style override on a dirty record: free the
 * already-written block on disk (unless it is a hole or a nopwrite)
 * and return the record to DR_NOT_OVERRIDDEN.  No-op for the bonus
 * buffer or if the record was never overridden.
 */
void
dbuf_unoverride(dbuf_dirty_record_t *dr)
{
	dmu_buf_impl_t *db = dr->dr_dbuf;
	blkptr_t *bp = &dr->dt.dl.dr_overridden_by;
	uint64_t txg = dr->dr_txg;

	ASSERT(MUTEX_HELD(&db->db_mtx));
	ASSERT(dr->dt.dl.dr_override_state != DR_IN_DMU_SYNC);
	ASSERT(db->db_level == 0);

	if (db->db_blkid == DMU_BONUS_BLKID ||
	    dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN)
		return;

	ASSERT(db->db_data_pending != dr);

	/* free this block */
	if (!BP_IS_HOLE(bp) && !dr->dt.dl.dr_nopwrite)
		zio_free(db->db_objset->os_spa, txg, bp);

	dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
	dr->dt.dl.dr_nopwrite = B_FALSE;

	/*
	 * Release the already-written buffer, so we leave it in
	 * a consistent dirty state.  Note that all callers are
	 * modifying the buffer, so they will immediately do
	 * another (redundant) arc_release().  Therefore, leave
	 * the buf thawed to save the effort of freezing &
	 * immediately re-thawing it.
	 */
	arc_release(dr->dt.dl.dr_data, db);
}

/*
 * Evict (if it's unreferenced) or clear (if it's referenced) any level-0
 * data blocks in the free range, so that any future readers will find
 * empty blocks.
 *
 * This is a no-op if the dataset is in the middle of an incremental
 * receive; see comment below for details.
 */
void
dbuf_free_range(dnode_t *dn, uint64_t start_blkid, uint64_t end_blkid,
    dmu_tx_t *tx)
{
	dmu_buf_impl_t db_search;
	dmu_buf_impl_t *db, *db_next;
	uint64_t txg = tx->tx_txg;
	avl_index_t where;
	boolean_t freespill =
	    (start_blkid == DMU_SPILL_BLKID || end_blkid == DMU_SPILL_BLKID);

	/* Don't walk past the last allocated block (spill ids excepted). */
	if (end_blkid > dn->dn_maxblkid && !freespill)
		end_blkid = dn->dn_maxblkid;
	dprintf_dnode(dn, "start=%llu end=%llu\n", start_blkid, end_blkid);

	/* Stack-local search key for the dn_dbufs AVL tree. */
	db_search.db_level = 0;
	db_search.db_blkid = start_blkid;
	db_search.db_state = DB_SEARCH;

	mutex_enter(&dn->dn_dbufs_mtx);
	if (start_blkid >= dn->dn_unlisted_l0_blkid && !freespill) {
		/* There can't be any dbufs in this range; no need to search. */
#ifdef DEBUG
		db = avl_find(&dn->dn_dbufs, &db_search, &where);
		ASSERT3P(db, ==, NULL);
		db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER);
		ASSERT(db == NULL || db->db_level > 0);
#endif
		mutex_exit(&dn->dn_dbufs_mtx);
		return;
	} else if (dmu_objset_is_receiving(dn->dn_objset)) {
		/*
		 * If we are receiving, we expect there to be no dbufs in
		 * the range to be freed, because receive modifies each
		 * block at most once, and in offset order.  If this is
		 * not the case, it can lead to performance problems,
		 * so note that we unexpectedly took the slow path.
		 */
		atomic_inc_64(&zfs_free_range_recv_miss);
	}

	db = avl_find(&dn->dn_dbufs, &db_search, &where);
	ASSERT3P(db, ==, NULL);
	db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER);

	for (; db != NULL; db = db_next) {
		/* Capture the successor first; db may be destroyed below. */
		db_next = AVL_NEXT(&dn->dn_dbufs, db);
		ASSERT(db->db_blkid != DMU_BONUS_BLKID);

		if (db->db_level != 0 || db->db_blkid > end_blkid) {
			break;
		}
		ASSERT3U(db->db_blkid, >=, start_blkid);

		/* found a level 0 buffer in the range */
		mutex_enter(&db->db_mtx);
		if (dbuf_undirty(db, tx)) {
			/* mutex has been dropped and dbuf destroyed */
			continue;
		}

		if (db->db_state == DB_UNCACHED ||
		    db->db_state == DB_NOFILL ||
		    db->db_state == DB_EVICTING) {
			/* No data to clear. */
			ASSERT(db->db.db_data == NULL);
			mutex_exit(&db->db_mtx);
			continue;
		}
		if (db->db_state == DB_READ || db->db_state == DB_FILL) {
			/* will be handled in dbuf_read_done or dbuf_rele */
			db->db_freed_in_flight = TRUE;
			mutex_exit(&db->db_mtx);
			continue;
		}
		if (refcount_count(&db->db_holds) == 0) {
			/* Unreferenced: evict outright. */
			ASSERT(db->db_buf);
			dbuf_clear(db);
			continue;
		}
		/* The dbuf is referenced */

		if (db->db_last_dirty != NULL) {
			dbuf_dirty_record_t *dr = db->db_last_dirty;

			if (dr->dr_txg == txg) {
				/*
				 * This buffer is "in-use", re-adjust the file
				 * size to reflect that this buffer may
				 * contain new data when we sync.
				 */
				if (db->db_blkid != DMU_SPILL_BLKID &&
				    db->db_blkid > dn->dn_maxblkid)
					dn->dn_maxblkid = db->db_blkid;
				dbuf_unoverride(dr);
			} else {
				/*
				 * This dbuf is not dirty in the open context.
				 * Either uncache it (if its not referenced in
				 * the open context) or reset its contents to
				 * empty.
				 */
				dbuf_fix_old_data(db, txg);
			}
		}
		/* clear the contents if its cached */
		if (db->db_state == DB_CACHED) {
			ASSERT(db->db.db_data != NULL);
			arc_release(db->db_buf, db);
			bzero(db->db.db_data, db->db.db_size);
			arc_buf_freeze(db->db_buf);
		}

		mutex_exit(&db->db_mtx);
	}
	mutex_exit(&dn->dn_dbufs_mtx);
}

/*
 * Return nonzero if freeing this dbuf's current block would actually
 * release space (the block exists and its birth txg is freeable in
 * this dataset).  Called with db_mtx held.
 */
static int
dbuf_block_freeable(dmu_buf_impl_t *db)
{
	dsl_dataset_t *ds = db->db_objset->os_dsl_dataset;
	uint64_t birth_txg = 0;

	/*
	 * We don't need any locking to protect db_blkptr:
	 * If it's syncing, then db_last_dirty will be set
	 * so we'll ignore db_blkptr.
	 *
	 * This logic ensures that only block births for
	 * filled blocks are considered.
	 */
	ASSERT(MUTEX_HELD(&db->db_mtx));
	if (db->db_last_dirty && (db->db_blkptr == NULL ||
	    !BP_IS_HOLE(db->db_blkptr))) {
		birth_txg = db->db_last_dirty->dr_txg;
	} else if (db->db_blkptr != NULL && !BP_IS_HOLE(db->db_blkptr)) {
		birth_txg = db->db_blkptr->blk_birth;
	}

	/*
	 * If this block doesn't exist or is in a snapshot, it can't be freed.
	 * Don't pass the bp to dsl_dataset_block_freeable() since we
	 * are holding the db_mtx lock and might deadlock if we are
	 * prefetching a dedup-ed block.
	 */
	if (birth_txg != 0)
		return (ds == NULL ||
		    dsl_dataset_block_freeable(ds, NULL, birth_txg));
	else
		return (B_FALSE);
}

/*
 * Resize a level-0 (non-bonus) dbuf to 'size' bytes: dirty it, attach a
 * new ARC buffer of the requested size, copy MIN(old, new) bytes of the
 * existing data across, and zero-fill any growth.  Caller must hold the
 * dnode's struct_rwlock as writer.
 */
void
dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx)
{
	arc_buf_t *buf, *obuf;
	int osize = db->db.db_size;
	arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
	dnode_t *dn;

	ASSERT(db->db_blkid != DMU_BONUS_BLKID);

	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);

	/* XXX does *this* func really need the lock? */
	ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));

	/*
	 * This call to dmu_buf_will_dirty() with the dn_struct_rwlock held
	 * is OK, because there can be no other references to the db
	 * when we are changing its size, so no concurrent DB_FILL can
	 * be happening.
	 */
	/*
	 * XXX we should be doing a dbuf_read, checking the return
	 * value and returning that up to our callers
	 */
	dmu_buf_will_dirty(&db->db, tx);

	/* create the data buffer for the new block */
	buf = arc_buf_alloc(dn->dn_objset->os_spa, size, db, type);

	/* copy old block data to the new block */
	obuf = db->db_buf;
	bcopy(obuf->b_data, buf->b_data, MIN(osize, size));
	/* zero the remainder */
	if (size > osize)
		bzero((uint8_t *)buf->b_data + osize, size - osize);

	mutex_enter(&db->db_mtx);
	dbuf_set_data(db, buf);
	VERIFY(arc_buf_remove_ref(obuf, db));
	db->db.db_size = size;

	if (db->db_level == 0) {
		/* Point the current dirty record at the new buffer too. */
		ASSERT3U(db->db_last_dirty->dr_txg, ==, tx->tx_txg);
		db->db_last_dirty->dt.dl.dr_data = buf;
	}
	mutex_exit(&db->db_mtx);

	/* Account for the size change in this tx. */
	dnode_willuse_space(dn, size-osize, tx);
	DB_DNODE_EXIT(db);
}

/*
 * Release the dbuf's ARC buffer so its block pointer may be rewritten.
 * Only valid in syncing context, when the objset's phys buf has already
 * been released (or the dataset is on the synced list).
 */
void
dbuf_release_bp(dmu_buf_impl_t *db)
{
	objset_t *os = db->db_objset;

	ASSERT(dsl_pool_sync_context(dmu_objset_pool(os)));
	ASSERT(arc_released(os->os_phys_buf) ||
	    list_link_active(&os->os_dsl_dataset->ds_synced_link));
	ASSERT(db->db_parent == NULL || arc_released(db->db_parent->db_buf));

	(void) arc_release(db->db_buf, db);
}

/*
 * We already have a dirty record for this TXG, and we are being
 * dirtied again.
 */
static void
dbuf_redirty(dbuf_dirty_record_t *dr)
{
	dmu_buf_impl_t *db = dr->dr_dbuf;

	ASSERT(MUTEX_HELD(&db->db_mtx));

	if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID) {
		/*
		 * If this buffer has already been written out,
		 * we now need to reset its state.
		 */
		dbuf_unoverride(dr);
		if (db->db.db_object != DMU_META_DNODE_OBJECT &&
		    db->db_state != DB_NOFILL) {
			/* Already released on initial dirty, so just thaw.
*/ 11410f2e7d03SMatthew Ahrens ASSERT(arc_released(db->db_buf)); 11420f2e7d03SMatthew Ahrens arc_buf_thaw(db->db_buf); 11430f2e7d03SMatthew Ahrens } 11440f2e7d03SMatthew Ahrens } 11450f2e7d03SMatthew Ahrens } 11460f2e7d03SMatthew Ahrens 1147c717a561Smaybee dbuf_dirty_record_t * 1148fa9e4066Sahrens dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) 1149fa9e4066Sahrens { 1150744947dcSTom Erickson dnode_t *dn; 1151744947dcSTom Erickson objset_t *os; 1152c717a561Smaybee dbuf_dirty_record_t **drp, *dr; 1153fa9e4066Sahrens int drop_struct_lock = FALSE; 1154d3469faaSMark Maybee boolean_t do_free_accounting = B_FALSE; 1155fa9e4066Sahrens int txgoff = tx->tx_txg & TXG_MASK; 1156fa9e4066Sahrens 1157fa9e4066Sahrens ASSERT(tx->tx_txg != 0); 1158fa9e4066Sahrens ASSERT(!refcount_is_zero(&db->db_holds)); 11599c9dc39aSek110237 DMU_TX_DIRTY_BUF(tx, db); 1160fa9e4066Sahrens 1161744947dcSTom Erickson DB_DNODE_ENTER(db); 1162744947dcSTom Erickson dn = DB_DNODE(db); 1163fa9e4066Sahrens /* 1164fa9e4066Sahrens * Shouldn't dirty a regular buffer in syncing context. Private 1165fa9e4066Sahrens * objects may be dirtied in syncing context, but only if they 1166fa9e4066Sahrens * were already pre-dirtied in open context. 1167fa9e4066Sahrens */ 1168c717a561Smaybee ASSERT(!dmu_tx_is_syncing(tx) || 1169c717a561Smaybee BP_IS_HOLE(dn->dn_objset->os_rootbp) || 117014843421SMatthew Ahrens DMU_OBJECT_IS_SPECIAL(dn->dn_object) || 117114843421SMatthew Ahrens dn->dn_objset->os_dsl_dataset == NULL); 1172fa9e4066Sahrens /* 1173fa9e4066Sahrens * We make this assert for private objects as well, but after we 1174fa9e4066Sahrens * check if we're already dirty. They are allowed to re-dirty 1175fa9e4066Sahrens * in syncing context. 1176fa9e4066Sahrens */ 1177ea8dc4b6Seschrock ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT || 1178c717a561Smaybee dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx == 1179fa9e4066Sahrens (dmu_tx_is_syncing(tx) ? 
DN_DIRTY_SYNC : DN_DIRTY_OPEN)); 1180fa9e4066Sahrens 1181fa9e4066Sahrens mutex_enter(&db->db_mtx); 1182fa9e4066Sahrens /* 1183c717a561Smaybee * XXX make this true for indirects too? The problem is that 1184c717a561Smaybee * transactions created with dmu_tx_create_assigned() from 1185c717a561Smaybee * syncing context don't bother holding ahead. 1186fa9e4066Sahrens */ 1187c717a561Smaybee ASSERT(db->db_level != 0 || 118882c9918fSTim Haley db->db_state == DB_CACHED || db->db_state == DB_FILL || 118982c9918fSTim Haley db->db_state == DB_NOFILL); 1190fa9e4066Sahrens 1191fa9e4066Sahrens mutex_enter(&dn->dn_mtx); 1192fa9e4066Sahrens /* 1193fa9e4066Sahrens * Don't set dirtyctx to SYNC if we're just modifying this as we 1194fa9e4066Sahrens * initialize the objset. 1195fa9e4066Sahrens */ 1196fa9e4066Sahrens if (dn->dn_dirtyctx == DN_UNDIRTIED && 1197c717a561Smaybee !BP_IS_HOLE(dn->dn_objset->os_rootbp)) { 1198fa9e4066Sahrens dn->dn_dirtyctx = 1199fa9e4066Sahrens (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN); 1200fa9e4066Sahrens ASSERT(dn->dn_dirtyctx_firstset == NULL); 1201fa9e4066Sahrens dn->dn_dirtyctx_firstset = kmem_alloc(1, KM_SLEEP); 1202fa9e4066Sahrens } 1203fa9e4066Sahrens mutex_exit(&dn->dn_mtx); 1204fa9e4066Sahrens 12050a586ceaSMark Shellenbaum if (db->db_blkid == DMU_SPILL_BLKID) 12060a586ceaSMark Shellenbaum dn->dn_have_spill = B_TRUE; 12070a586ceaSMark Shellenbaum 1208fa9e4066Sahrens /* 1209fa9e4066Sahrens * If this buffer is already dirty, we're done. 
1210fa9e4066Sahrens */ 1211c717a561Smaybee drp = &db->db_last_dirty; 1212c717a561Smaybee ASSERT(*drp == NULL || (*drp)->dr_txg <= tx->tx_txg || 1213c717a561Smaybee db->db.db_object == DMU_META_DNODE_OBJECT); 12147e2186e3Sbonwick while ((dr = *drp) != NULL && dr->dr_txg > tx->tx_txg) 12157e2186e3Sbonwick drp = &dr->dr_next; 12167e2186e3Sbonwick if (dr && dr->dr_txg == tx->tx_txg) { 1217744947dcSTom Erickson DB_DNODE_EXIT(db); 1218744947dcSTom Erickson 12190f2e7d03SMatthew Ahrens dbuf_redirty(dr); 1220fa9e4066Sahrens mutex_exit(&db->db_mtx); 12217e2186e3Sbonwick return (dr); 1222fa9e4066Sahrens } 1223fa9e4066Sahrens 1224fa9e4066Sahrens /* 1225fa9e4066Sahrens * Only valid if not already dirty. 1226fa9e4066Sahrens */ 122714843421SMatthew Ahrens ASSERT(dn->dn_object == 0 || 122814843421SMatthew Ahrens dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx == 1229fa9e4066Sahrens (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN)); 1230fa9e4066Sahrens 1231fa9e4066Sahrens ASSERT3U(dn->dn_nlevels, >, db->db_level); 1232fa9e4066Sahrens ASSERT((dn->dn_phys->dn_nlevels == 0 && db->db_level == 0) || 1233fa9e4066Sahrens dn->dn_phys->dn_nlevels > db->db_level || 1234fa9e4066Sahrens dn->dn_next_nlevels[txgoff] > db->db_level || 1235fa9e4066Sahrens dn->dn_next_nlevels[(tx->tx_txg-1) & TXG_MASK] > db->db_level || 1236fa9e4066Sahrens dn->dn_next_nlevels[(tx->tx_txg-2) & TXG_MASK] > db->db_level); 1237fa9e4066Sahrens 1238fa9e4066Sahrens /* 1239fa9e4066Sahrens * We should only be dirtying in syncing context if it's the 124014843421SMatthew Ahrens * mos or we're initializing the os or it's a special object. 124114843421SMatthew Ahrens * However, we are allowed to dirty in syncing context provided 124214843421SMatthew Ahrens * we already dirtied it in open context. Hence we must make 124314843421SMatthew Ahrens * this assertion only if we're not already dirty. 
1244fa9e4066Sahrens */ 1245744947dcSTom Erickson os = dn->dn_objset; 124614843421SMatthew Ahrens ASSERT(!dmu_tx_is_syncing(tx) || DMU_OBJECT_IS_SPECIAL(dn->dn_object) || 124714843421SMatthew Ahrens os->os_dsl_dataset == NULL || BP_IS_HOLE(os->os_rootbp)); 1248fa9e4066Sahrens ASSERT(db->db.db_size != 0); 1249fa9e4066Sahrens 1250fa9e4066Sahrens dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size); 1251fa9e4066Sahrens 12520a586ceaSMark Shellenbaum if (db->db_blkid != DMU_BONUS_BLKID) { 12531934e92fSmaybee /* 12541934e92fSmaybee * Update the accounting. 1255d3469faaSMark Maybee * Note: we delay "free accounting" until after we drop 1256d3469faaSMark Maybee * the db_mtx. This keeps us from grabbing other locks 1257b24ab676SJeff Bonwick * (and possibly deadlocking) in bp_get_dsize() while 1258d3469faaSMark Maybee * also holding the db_mtx. 12591934e92fSmaybee */ 12601934e92fSmaybee dnode_willuse_space(dn, db->db.db_size, tx); 1261d3469faaSMark Maybee do_free_accounting = dbuf_block_freeable(db); 12621934e92fSmaybee } 12631934e92fSmaybee 1264ea8dc4b6Seschrock /* 1265ea8dc4b6Seschrock * If this buffer is dirty in an old transaction group we need 1266ea8dc4b6Seschrock * to make a copy of it so that the changes we make in this 1267ea8dc4b6Seschrock * transaction group won't leak out when we sync the older txg. 
1268ea8dc4b6Seschrock */ 1269c717a561Smaybee dr = kmem_zalloc(sizeof (dbuf_dirty_record_t), KM_SLEEP); 1270c717a561Smaybee if (db->db_level == 0) { 1271c717a561Smaybee void *data_old = db->db_buf; 1272c717a561Smaybee 127382c9918fSTim Haley if (db->db_state != DB_NOFILL) { 12740a586ceaSMark Shellenbaum if (db->db_blkid == DMU_BONUS_BLKID) { 1275c717a561Smaybee dbuf_fix_old_data(db, tx->tx_txg); 1276c717a561Smaybee data_old = db->db.db_data; 1277c717a561Smaybee } else if (db->db.db_object != DMU_META_DNODE_OBJECT) { 1278fa9e4066Sahrens /* 127982c9918fSTim Haley * Release the data buffer from the cache so 128082c9918fSTim Haley * that we can modify it without impacting 128182c9918fSTim Haley * possible other users of this cached data 128282c9918fSTim Haley * block. Note that indirect blocks and 128382c9918fSTim Haley * private objects are not released until the 128482c9918fSTim Haley * syncing state (since they are only modified 128582c9918fSTim Haley * then). 1286fa9e4066Sahrens */ 1287fa9e4066Sahrens arc_release(db->db_buf, db); 1288fa9e4066Sahrens dbuf_fix_old_data(db, tx->tx_txg); 1289c717a561Smaybee data_old = db->db_buf; 1290fa9e4066Sahrens } 1291c717a561Smaybee ASSERT(data_old != NULL); 129282c9918fSTim Haley } 1293c717a561Smaybee dr->dt.dl.dr_data = data_old; 1294c717a561Smaybee } else { 1295c717a561Smaybee mutex_init(&dr->dt.di.dr_mtx, NULL, MUTEX_DEFAULT, NULL); 1296c717a561Smaybee list_create(&dr->dt.di.dr_children, 1297c717a561Smaybee sizeof (dbuf_dirty_record_t), 1298c717a561Smaybee offsetof(dbuf_dirty_record_t, dr_dirty_node)); 1299fa9e4066Sahrens } 130069962b56SMatthew Ahrens if (db->db_blkid != DMU_BONUS_BLKID && os->os_dsl_dataset != NULL) 130169962b56SMatthew Ahrens dr->dr_accounted = db->db.db_size; 1302c717a561Smaybee dr->dr_dbuf = db; 1303c717a561Smaybee dr->dr_txg = tx->tx_txg; 1304c717a561Smaybee dr->dr_next = *drp; 1305c717a561Smaybee *drp = dr; 1306fa9e4066Sahrens 1307fa9e4066Sahrens /* 1308fa9e4066Sahrens * We could have been freed_in_flight 
between the dbuf_noread 1309fa9e4066Sahrens * and dbuf_dirty. We win, as though the dbuf_noread() had 1310fa9e4066Sahrens * happened after the free. 1311fa9e4066Sahrens */ 13120a586ceaSMark Shellenbaum if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID && 13130a586ceaSMark Shellenbaum db->db_blkid != DMU_SPILL_BLKID) { 1314c717a561Smaybee mutex_enter(&dn->dn_mtx); 1315bf16b11eSMatthew Ahrens if (dn->dn_free_ranges[txgoff] != NULL) { 1316bf16b11eSMatthew Ahrens range_tree_clear(dn->dn_free_ranges[txgoff], 1317bf16b11eSMatthew Ahrens db->db_blkid, 1); 1318bf16b11eSMatthew Ahrens } 1319fa9e4066Sahrens mutex_exit(&dn->dn_mtx); 1320c717a561Smaybee db->db_freed_in_flight = FALSE; 1321c717a561Smaybee } 1322fa9e4066Sahrens 1323fa9e4066Sahrens /* 1324fa9e4066Sahrens * This buffer is now part of this txg 1325fa9e4066Sahrens */ 1326fa9e4066Sahrens dbuf_add_ref(db, (void *)(uintptr_t)tx->tx_txg); 1327fa9e4066Sahrens db->db_dirtycnt += 1; 1328fa9e4066Sahrens ASSERT3U(db->db_dirtycnt, <=, 3); 1329fa9e4066Sahrens 1330fa9e4066Sahrens mutex_exit(&db->db_mtx); 1331fa9e4066Sahrens 13320a586ceaSMark Shellenbaum if (db->db_blkid == DMU_BONUS_BLKID || 13330a586ceaSMark Shellenbaum db->db_blkid == DMU_SPILL_BLKID) { 1334c717a561Smaybee mutex_enter(&dn->dn_mtx); 1335c717a561Smaybee ASSERT(!list_link_active(&dr->dr_dirty_node)); 1336c717a561Smaybee list_insert_tail(&dn->dn_dirty_records[txgoff], dr); 1337c717a561Smaybee mutex_exit(&dn->dn_mtx); 1338fa9e4066Sahrens dnode_setdirty(dn, tx); 1339744947dcSTom Erickson DB_DNODE_EXIT(db); 1340c717a561Smaybee return (dr); 1341d3469faaSMark Maybee } else if (do_free_accounting) { 1342d3469faaSMark Maybee blkptr_t *bp = db->db_blkptr; 1343d3469faaSMark Maybee int64_t willfree = (bp && !BP_IS_HOLE(bp)) ? 
1344b24ab676SJeff Bonwick bp_get_dsize(os->os_spa, bp) : db->db.db_size; 1345d3469faaSMark Maybee /* 1346d3469faaSMark Maybee * This is only a guess -- if the dbuf is dirty 1347d3469faaSMark Maybee * in a previous txg, we don't know how much 1348d3469faaSMark Maybee * space it will use on disk yet. We should 1349d3469faaSMark Maybee * really have the struct_rwlock to access 1350d3469faaSMark Maybee * db_blkptr, but since this is just a guess, 1351d3469faaSMark Maybee * it's OK if we get an odd answer. 1352d3469faaSMark Maybee */ 1353837b568bSGeorge Wilson ddt_prefetch(os->os_spa, bp); 1354d3469faaSMark Maybee dnode_willuse_space(dn, -willfree, tx); 1355fa9e4066Sahrens } 1356fa9e4066Sahrens 1357fa9e4066Sahrens if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) { 1358fa9e4066Sahrens rw_enter(&dn->dn_struct_rwlock, RW_READER); 1359fa9e4066Sahrens drop_struct_lock = TRUE; 1360fa9e4066Sahrens } 1361fa9e4066Sahrens 13628346f03fSJonathan W Adams if (db->db_level == 0) { 13638346f03fSJonathan W Adams dnode_new_blkid(dn, db->db_blkid, tx, drop_struct_lock); 13648346f03fSJonathan W Adams ASSERT(dn->dn_maxblkid >= db->db_blkid); 13658346f03fSJonathan W Adams } 13668346f03fSJonathan W Adams 136744eda4d7Smaybee if (db->db_level+1 < dn->dn_nlevels) { 1368c717a561Smaybee dmu_buf_impl_t *parent = db->db_parent; 1369c717a561Smaybee dbuf_dirty_record_t *di; 1370c717a561Smaybee int parent_held = FALSE; 1371c717a561Smaybee 1372c717a561Smaybee if (db->db_parent == NULL || db->db_parent == dn->dn_dbuf) { 1373fa9e4066Sahrens int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; 1374c717a561Smaybee 1375fa9e4066Sahrens parent = dbuf_hold_level(dn, db->db_level+1, 1376fa9e4066Sahrens db->db_blkid >> epbs, FTAG); 137701025c89SJohn Harres ASSERT(parent != NULL); 1378c717a561Smaybee parent_held = TRUE; 1379c717a561Smaybee } 1380fa9e4066Sahrens if (drop_struct_lock) 1381fa9e4066Sahrens rw_exit(&dn->dn_struct_rwlock); 1382c717a561Smaybee ASSERT3U(db->db_level+1, ==, parent->db_level); 1383c717a561Smaybee di 
= dbuf_dirty(parent, tx); 1384c717a561Smaybee if (parent_held) 1385ea8dc4b6Seschrock dbuf_rele(parent, FTAG); 1386c717a561Smaybee 1387c717a561Smaybee mutex_enter(&db->db_mtx); 138869962b56SMatthew Ahrens /* 138969962b56SMatthew Ahrens * Since we've dropped the mutex, it's possible that 139069962b56SMatthew Ahrens * dbuf_undirty() might have changed this out from under us. 139169962b56SMatthew Ahrens */ 1392c717a561Smaybee if (db->db_last_dirty == dr || 1393c717a561Smaybee dn->dn_object == DMU_META_DNODE_OBJECT) { 1394c717a561Smaybee mutex_enter(&di->dt.di.dr_mtx); 1395c717a561Smaybee ASSERT3U(di->dr_txg, ==, tx->tx_txg); 1396c717a561Smaybee ASSERT(!list_link_active(&dr->dr_dirty_node)); 1397c717a561Smaybee list_insert_tail(&di->dt.di.dr_children, dr); 1398c717a561Smaybee mutex_exit(&di->dt.di.dr_mtx); 1399c717a561Smaybee dr->dr_parent = di; 1400c717a561Smaybee } 1401c717a561Smaybee mutex_exit(&db->db_mtx); 1402fa9e4066Sahrens } else { 1403c717a561Smaybee ASSERT(db->db_level+1 == dn->dn_nlevels); 1404c717a561Smaybee ASSERT(db->db_blkid < dn->dn_nblkptr); 1405744947dcSTom Erickson ASSERT(db->db_parent == NULL || db->db_parent == dn->dn_dbuf); 1406c717a561Smaybee mutex_enter(&dn->dn_mtx); 1407c717a561Smaybee ASSERT(!list_link_active(&dr->dr_dirty_node)); 1408c717a561Smaybee list_insert_tail(&dn->dn_dirty_records[txgoff], dr); 1409c717a561Smaybee mutex_exit(&dn->dn_mtx); 1410fa9e4066Sahrens if (drop_struct_lock) 1411fa9e4066Sahrens rw_exit(&dn->dn_struct_rwlock); 1412fa9e4066Sahrens } 1413fa9e4066Sahrens 1414fa9e4066Sahrens dnode_setdirty(dn, tx); 1415744947dcSTom Erickson DB_DNODE_EXIT(db); 1416c717a561Smaybee return (dr); 1417fa9e4066Sahrens } 1418fa9e4066Sahrens 14193b2aab18SMatthew Ahrens /* 14203e30c24aSWill Andrews * Undirty a buffer in the transaction group referenced by the given 14213e30c24aSWill Andrews * transaction. Return whether this evicted the dbuf. 
/*
 * Undirty a buffer in the transaction group referenced by the given
 * transaction.  Return whether this evicted the dbuf.
 *
 * Caller must hold db->db_mtx (asserted below).  Only valid for level-0,
 * non-bonus buffers.  On success (B_TRUE) the dbuf has been evicted and
 * must no longer be touched by the caller.
 */
static boolean_t
dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
{
	dnode_t *dn;
	uint64_t txg = tx->tx_txg;
	dbuf_dirty_record_t *dr, **drp;

	ASSERT(txg != 0);

	/*
	 * Due to our use of dn_nlevels below, this can only be called
	 * in open context, unless we are operating on the MOS.
	 * From syncing context, dn_nlevels may be different from the
	 * dn_nlevels used when dbuf was dirtied.
	 */
	ASSERT(db->db_objset ==
	    dmu_objset_pool(db->db_objset)->dp_meta_objset ||
	    txg != spa_syncing_txg(dmu_objset_spa(db->db_objset)));
	ASSERT(db->db_blkid != DMU_BONUS_BLKID);
	ASSERT0(db->db_level);
	ASSERT(MUTEX_HELD(&db->db_mtx));

	/*
	 * If this buffer is not dirty, we're done.  Walk the dirty-record
	 * list (newest first) looking for the record for this txg.
	 */
	for (drp = &db->db_last_dirty; (dr = *drp) != NULL; drp = &dr->dr_next)
		if (dr->dr_txg <= txg)
			break;
	if (dr == NULL || dr->dr_txg < txg)
		return (B_FALSE);
	ASSERT(dr->dr_txg == txg);
	ASSERT(dr->dr_dbuf == db);

	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);

	dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);

	ASSERT(db->db.db_size != 0);

	/* Give back the dirty-space accounting charged by dbuf_dirty(). */
	dsl_pool_undirty_space(dmu_objset_pool(dn->dn_objset),
	    dr->dr_accounted, txg);

	/* Unlink the record from db_last_dirty's singly-linked chain. */
	*drp = dr->dr_next;

	/*
	 * Note that there are three places in dbuf_dirty()
	 * where this dirty record may be put on a list.
	 * Make sure to do a list_remove corresponding to
	 * every one of those list_insert calls.
	 */
	if (dr->dr_parent) {
		mutex_enter(&dr->dr_parent->dt.di.dr_mtx);
		list_remove(&dr->dr_parent->dt.di.dr_children, dr);
		mutex_exit(&dr->dr_parent->dt.di.dr_mtx);
	} else if (db->db_blkid == DMU_SPILL_BLKID ||
	    db->db_level + 1 == dn->dn_nlevels) {
		ASSERT(db->db_blkptr == NULL || db->db_parent == dn->dn_dbuf);
		mutex_enter(&dn->dn_mtx);
		list_remove(&dn->dn_dirty_records[txg & TXG_MASK], dr);
		mutex_exit(&dn->dn_mtx);
	}
	DB_DNODE_EXIT(db);

	if (db->db_state != DB_NOFILL) {
		dbuf_unoverride(dr);

		ASSERT(db->db_buf != NULL);
		ASSERT(dr->dt.dl.dr_data != NULL);
		/* Release the private copy made by dbuf_fix_old_data(). */
		if (dr->dt.dl.dr_data != db->db_buf)
			VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data, db));
	}

	kmem_free(dr, sizeof (dbuf_dirty_record_t));

	ASSERT(db->db_dirtycnt > 0);
	db->db_dirtycnt -= 1;

	/*
	 * Drop the hold that dbuf_dirty() took for this txg.  If that was
	 * the last hold, evict the dbuf and report the eviction.
	 */
	if (refcount_remove(&db->db_holds, (void *)(uintptr_t)txg) == 0) {
		arc_buf_t *buf = db->db_buf;

		ASSERT(db->db_state == DB_NOFILL || arc_released(buf));
		dbuf_clear_data(db);
		VERIFY(arc_buf_remove_ref(buf, db));
		dbuf_evict(db);
		return (B_TRUE);
	}

	return (B_FALSE);
}
/*
 * Mark this buffer dirty in the given transaction, reading it in first
 * (dbuf_read) if it is not already cached.  Public wrapper around
 * dbuf_dirty() used by most DMU consumers.
 */
void
dmu_buf_will_dirty(dmu_buf_t *db_fake, dmu_tx_t *tx)
{
	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
	int rf = DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH;

	ASSERT(tx->tx_txg != 0);
	ASSERT(!refcount_is_zero(&db->db_holds));

	/*
	 * Quick check for dirtyness.  For already dirty blocks, this
	 * reduces runtime of this function by >90%, and overall performance
	 * by 50% for some workloads (e.g. file deletion with indirect blocks
	 * cached).
	 */
	mutex_enter(&db->db_mtx);
	dbuf_dirty_record_t *dr;
	for (dr = db->db_last_dirty;
	    dr != NULL && dr->dr_txg >= tx->tx_txg; dr = dr->dr_next) {
		/*
		 * It's possible that it is already dirty but not cached,
		 * because there are some calls to dbuf_dirty() that don't
		 * go through dmu_buf_will_dirty().
		 */
		if (dr->dr_txg == tx->tx_txg && db->db_state == DB_CACHED) {
			/* This dbuf is already dirty and cached. */
			dbuf_redirty(dr);
			mutex_exit(&db->db_mtx);
			return;
		}
	}
	mutex_exit(&db->db_mtx);

	/* Tell dbuf_read() if we already hold the struct_rwlock as writer. */
	DB_DNODE_ENTER(db);
	if (RW_WRITE_HELD(&DB_DNODE(db)->dn_struct_rwlock))
		rf |= DB_RF_HAVESTRUCT;
	DB_DNODE_EXIT(db);
	(void) dbuf_read(db, NULL, rf);
	(void) dbuf_dirty(db, tx);
}

/*
 * Like dmu_buf_will_fill(), but additionally marks the buffer DB_NOFILL:
 * its contents will not be read or filled (e.g. it is being overridden).
 */
void
dmu_buf_will_not_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
{
	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;

	db->db_state = DB_NOFILL;

	dmu_buf_will_fill(db_fake, tx);
}

/*
 * Prepare this buffer to be completely overwritten by the caller: dirty
 * it without reading the old contents from disk (via dbuf_noread() --
 * presumably valid only because the caller promises to rewrite the whole
 * buffer and then call dmu_buf_fill_done(); confirm against dbuf_noread()).
 */
void
dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
{
	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;

	ASSERT(db->db_blkid != DMU_BONUS_BLKID);
	ASSERT(tx->tx_txg != 0);
	ASSERT(db->db_level == 0);
	ASSERT(!refcount_is_zero(&db->db_holds));

	ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT ||
	    dmu_tx_private_ok(tx));

	dbuf_noread(db);
	(void) dbuf_dirty(db, tx);
}
#pragma weak dmu_buf_fill_done = dbuf_fill_done
/* ARGSUSED */
/*
 * Complete a fill begun by dmu_buf_will_fill(): transition the buffer
 * from DB_FILL to DB_CACHED and wake any waiters blocked on db_changed.
 * If the block was freed while it was being filled, zero it instead of
 * exposing the stale fill data.
 */
void
dbuf_fill_done(dmu_buf_impl_t *db, dmu_tx_t *tx)
{
	mutex_enter(&db->db_mtx);
	DBUF_VERIFY(db);

	if (db->db_state == DB_FILL) {
		if (db->db_level == 0 && db->db_freed_in_flight) {
			ASSERT(db->db_blkid != DMU_BONUS_BLKID);
			/* we were freed while filling */
			/* XXX dbuf_undirty? */
			bzero(db->db.db_data, db->db.db_size);
			db->db_freed_in_flight = FALSE;
		}
		db->db_state = DB_CACHED;
		/* Wake threads in dbuf_assign_arcbuf()/dbuf_read() et al. */
		cv_broadcast(&db->db_changed);
	}
	mutex_exit(&db->db_mtx);
}

/*
 * Write already-compressed data directly into the dirty record's
 * "overridden" block pointer as an embedded BP (data stored in the bp
 * itself rather than in an allocated block).  Requires the
 * SPA_FEATURE_EMBEDDED_DATA feature to be active for data embeds.
 */
void
dmu_buf_write_embedded(dmu_buf_t *dbuf, void *data,
    bp_embedded_type_t etype, enum zio_compress comp,
    int uncompressed_size, int compressed_size, int byteorder,
    dmu_tx_t *tx)
{
	dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf;
	struct dirty_leaf *dl;
	dmu_object_type_t type;

	if (etype == BP_EMBEDDED_TYPE_DATA) {
		ASSERT(spa_feature_is_active(dmu_objset_spa(db->db_objset),
		    SPA_FEATURE_EMBEDDED_DATA));
	}

	DB_DNODE_ENTER(db);
	type = DB_DNODE(db)->dn_type;
	DB_DNODE_EXIT(db);

	ASSERT0(db->db_level);
	ASSERT(db->db_blkid != DMU_BONUS_BLKID);

	/* Dirty the buffer as DB_NOFILL; this creates db_last_dirty. */
	dmu_buf_will_not_fill(dbuf, tx);

	ASSERT3U(db->db_last_dirty->dr_txg, ==, tx->tx_txg);
	dl = &db->db_last_dirty->dt.dl;
	encode_embedded_bp_compressed(&dl->dr_overridden_by,
	    data, comp, uncompressed_size, compressed_size);
	BPE_SET_ETYPE(&dl->dr_overridden_by, etype);
	BP_SET_TYPE(&dl->dr_overridden_by, type);
	BP_SET_LEVEL(&dl->dr_overridden_by, 0);
	BP_SET_BYTEORDER(&dl->dr_overridden_by, byteorder);

	dl->dr_override_state = DR_OVERRIDDEN;
	dl->dr_overridden_by.blk_birth = db->db_last_dirty->dr_txg;
}
/*
 * Directly assign a provided arc buf to a given dbuf if it's not referenced
 * by anybody except our caller. Otherwise copy arcbuf's contents to dbuf.
 *
 * In either case ownership of 'buf' passes to this function: it is either
 * installed as the dbuf's data (zero-copy path) or released after its
 * contents are copied (copy path).
 */
void
dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx)
{
	ASSERT(!refcount_is_zero(&db->db_holds));
	ASSERT(db->db_blkid != DMU_BONUS_BLKID);
	ASSERT(db->db_level == 0);
	ASSERT(DBUF_GET_BUFC_TYPE(db) == ARC_BUFC_DATA);
	ASSERT(buf != NULL);
	ASSERT(arc_buf_size(buf) == db->db.db_size);
	ASSERT(tx->tx_txg != 0);

	arc_return_buf(buf, db);
	ASSERT(arc_released(buf));

	mutex_enter(&db->db_mtx);

	/* Wait out any in-flight read or fill before touching db_state. */
	while (db->db_state == DB_READ || db->db_state == DB_FILL)
		cv_wait(&db->db_changed, &db->db_mtx);

	ASSERT(db->db_state == DB_CACHED || db->db_state == DB_UNCACHED);

	/*
	 * Copy path: other holders (beyond our caller and the dirty
	 * records) reference the cached data, so we cannot swap buffers
	 * out from under them; bcopy into the existing buffer instead.
	 */
	if (db->db_state == DB_CACHED &&
	    refcount_count(&db->db_holds) - 1 > db->db_dirtycnt) {
		mutex_exit(&db->db_mtx);
		(void) dbuf_dirty(db, tx);
		bcopy(buf->b_data, db->db.db_data, db->db.db_size);
		VERIFY(arc_buf_remove_ref(buf, db));
		xuio_stat_wbuf_copied();
		return;
	}

	/* Zero-copy path: retire the old buffer and install 'buf'. */
	xuio_stat_wbuf_nocopy();
	if (db->db_state == DB_CACHED) {
		dbuf_dirty_record_t *dr = db->db_last_dirty;

		ASSERT(db->db_buf != NULL);
		if (dr != NULL && dr->dr_txg == tx->tx_txg) {
			/* Current-txg dirty record points at the old buf. */
			ASSERT(dr->dt.dl.dr_data == db->db_buf);
			if (!arc_released(db->db_buf)) {
				ASSERT(dr->dt.dl.dr_override_state ==
				    DR_OVERRIDDEN);
				arc_release(db->db_buf, db);
			}
			dr->dt.dl.dr_data = buf;
			VERIFY(arc_buf_remove_ref(db->db_buf, db));
		} else if (dr == NULL || dr->dt.dl.dr_data != db->db_buf) {
			arc_release(db->db_buf, db);
			VERIFY(arc_buf_remove_ref(db->db_buf, db));
		}
		db->db_buf = NULL;
	}
	ASSERT(db->db_buf == NULL);
	dbuf_set_data(db, buf);
	db->db_state = DB_FILL;
	mutex_exit(&db->db_mtx);
	(void) dbuf_dirty(db, tx);
	dmu_buf_fill_done(&db->db, tx);
}
/*
 * "Clear" the contents of this dbuf.  This will mark the dbuf
 * EVICTING and clear *most* of its references.  Unfortunately,
 * when we are not holding the dn_dbufs_mtx, we can't clear the
 * entry in the dn_dbufs list.  We have to wait until dbuf_destroy()
 * in this case.  For callers from the DMU we will usually see:
 *	dbuf_clear()->arc_clear_callback()->dbuf_do_evict()->dbuf_destroy()
 * For the arc callback, we will usually see:
 *	dbuf_do_evict()->dbuf_clear();dbuf_destroy()
 * Sometimes, though, we will get a mix of these two:
 *	DMU: dbuf_clear()->arc_clear_callback()
 *	ARC: dbuf_do_evict()->dbuf_destroy()
 *
 * This routine will dissociate the dbuf from the arc, by calling
 * arc_clear_callback(), but will not evict the data from the ARC.
 *
 * Caller must hold db->db_mtx with no remaining holds on the dbuf;
 * the mutex is dropped here unless arc_clear_callback() consumed it.
 */
void
dbuf_clear(dmu_buf_impl_t *db)
{
	dnode_t *dn;
	dmu_buf_impl_t *parent = db->db_parent;
	dmu_buf_impl_t *dndb;
	boolean_t dbuf_gone = B_FALSE;

	ASSERT(MUTEX_HELD(&db->db_mtx));
	ASSERT(refcount_is_zero(&db->db_holds));

	dbuf_evict_user(db);

	if (db->db_state == DB_CACHED) {
		ASSERT(db->db.db_data != NULL);
		/* Bonus buffers are zio-allocated, not ARC buffers. */
		if (db->db_blkid == DMU_BONUS_BLKID) {
			zio_buf_free(db->db.db_data, DN_MAX_BONUSLEN);
			arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
		}
		db->db.db_data = NULL;
		db->db_state = DB_UNCACHED;
	}

	ASSERT(db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL);
	ASSERT(db->db_data_pending == NULL);

	db->db_state = DB_EVICTING;
	db->db_blkptr = NULL;

	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);
	dndb = dn->dn_dbuf;
	if (db->db_blkid != DMU_BONUS_BLKID && MUTEX_HELD(&dn->dn_dbufs_mtx)) {
		avl_remove(&dn->dn_dbufs, db);
		atomic_dec_32(&dn->dn_dbufs_count);
		membar_producer();
		DB_DNODE_EXIT(db);
		/*
		 * Decrementing the dbuf count means that the hold corresponding
		 * to the removed dbuf is no longer discounted in dnode_move(),
		 * so the dnode cannot be moved until after we release the hold.
		 * The membar_producer() ensures visibility of the decremented
		 * value in dnode_move(), since DB_DNODE_EXIT doesn't actually
		 * release any lock.
		 */
		dnode_rele(dn, db);
		db->db_dnode_handle = NULL;
	} else {
		DB_DNODE_EXIT(db);
	}

	if (db->db_buf)
		dbuf_gone = arc_clear_callback(db->db_buf);

	/* If the callback already destroyed the dbuf, db_mtx is gone too. */
	if (!dbuf_gone)
		mutex_exit(&db->db_mtx);

	/*
	 * If this dbuf is referenced from an indirect dbuf,
	 * decrement the ref count on the indirect dbuf.
	 */
	if (parent && parent != dndb)
		dbuf_rele(parent, db);
}
1780fa9e4066Sahrens */ 1781c543ec06Sahrens if (parent && parent != dndb) 1782ea8dc4b6Seschrock dbuf_rele(parent, db); 1783fa9e4066Sahrens } 1784fa9e4066Sahrens 1785a2cdcdd2SPaul Dagnelie /* 1786a2cdcdd2SPaul Dagnelie * Note: While bpp will always be updated if the function returns success, 1787a2cdcdd2SPaul Dagnelie * parentp will not be updated if the dnode does not have dn_dbuf filled in; 1788a2cdcdd2SPaul Dagnelie * this happens when the dnode is the meta-dnode, or a userused or groupused 1789a2cdcdd2SPaul Dagnelie * object. 1790a2cdcdd2SPaul Dagnelie */ 1791fa9e4066Sahrens static int 1792fa9e4066Sahrens dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse, 1793fa9e4066Sahrens dmu_buf_impl_t **parentp, blkptr_t **bpp) 1794fa9e4066Sahrens { 1795fa9e4066Sahrens int nlevels, epbs; 1796fa9e4066Sahrens 17970b69c2f0Sahrens *parentp = NULL; 17980b69c2f0Sahrens *bpp = NULL; 17990b69c2f0Sahrens 18000a586ceaSMark Shellenbaum ASSERT(blkid != DMU_BONUS_BLKID); 18010a586ceaSMark Shellenbaum 18020a586ceaSMark Shellenbaum if (blkid == DMU_SPILL_BLKID) { 18030a586ceaSMark Shellenbaum mutex_enter(&dn->dn_mtx); 180406e0070dSMark Shellenbaum if (dn->dn_have_spill && 180506e0070dSMark Shellenbaum (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR)) 18060a586ceaSMark Shellenbaum *bpp = &dn->dn_phys->dn_spill; 18070a586ceaSMark Shellenbaum else 18080a586ceaSMark Shellenbaum *bpp = NULL; 18090a586ceaSMark Shellenbaum dbuf_add_ref(dn->dn_dbuf, NULL); 18100a586ceaSMark Shellenbaum *parentp = dn->dn_dbuf; 18110a586ceaSMark Shellenbaum mutex_exit(&dn->dn_mtx); 18120a586ceaSMark Shellenbaum return (0); 18130a586ceaSMark Shellenbaum } 1814ea8dc4b6Seschrock 1815fa9e4066Sahrens if (dn->dn_phys->dn_nlevels == 0) 1816fa9e4066Sahrens nlevels = 1; 1817fa9e4066Sahrens else 1818fa9e4066Sahrens nlevels = dn->dn_phys->dn_nlevels; 1819fa9e4066Sahrens 1820fa9e4066Sahrens epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; 1821fa9e4066Sahrens 1822fa9e4066Sahrens ASSERT3U(level * epbs, <, 64); 
1823fa9e4066Sahrens ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); 1824ea8dc4b6Seschrock if (level >= nlevels || 1825fa9e4066Sahrens (blkid > (dn->dn_phys->dn_maxblkid >> (level * epbs)))) { 1826fa9e4066Sahrens /* the buffer has no parent yet */ 1827be6fd75aSMatthew Ahrens return (SET_ERROR(ENOENT)); 1828fa9e4066Sahrens } else if (level < nlevels-1) { 1829fa9e4066Sahrens /* this block is referenced from an indirect block */ 1830fa9e4066Sahrens int err = dbuf_hold_impl(dn, level+1, 1831a2cdcdd2SPaul Dagnelie blkid >> epbs, fail_sparse, FALSE, NULL, parentp); 1832fa9e4066Sahrens if (err) 1833fa9e4066Sahrens return (err); 1834ea8dc4b6Seschrock err = dbuf_read(*parentp, NULL, 1835ea8dc4b6Seschrock (DB_RF_HAVESTRUCT | DB_RF_NOPREFETCH | DB_RF_CANFAIL)); 1836c543ec06Sahrens if (err) { 1837c543ec06Sahrens dbuf_rele(*parentp, NULL); 1838c543ec06Sahrens *parentp = NULL; 1839c543ec06Sahrens return (err); 1840c543ec06Sahrens } 1841fa9e4066Sahrens *bpp = ((blkptr_t *)(*parentp)->db.db_data) + 1842fa9e4066Sahrens (blkid & ((1ULL << epbs) - 1)); 1843c543ec06Sahrens return (0); 1844fa9e4066Sahrens } else { 1845fa9e4066Sahrens /* the block is referenced from the dnode */ 1846fa9e4066Sahrens ASSERT3U(level, ==, nlevels-1); 1847fa9e4066Sahrens ASSERT(dn->dn_phys->dn_nblkptr == 0 || 1848fa9e4066Sahrens blkid < dn->dn_phys->dn_nblkptr); 1849c543ec06Sahrens if (dn->dn_dbuf) { 1850c543ec06Sahrens dbuf_add_ref(dn->dn_dbuf, NULL); 1851fa9e4066Sahrens *parentp = dn->dn_dbuf; 1852c543ec06Sahrens } 1853fa9e4066Sahrens *bpp = &dn->dn_phys->dn_blkptr[blkid]; 1854fa9e4066Sahrens return (0); 1855fa9e4066Sahrens } 1856fa9e4066Sahrens } 1857fa9e4066Sahrens 1858fa9e4066Sahrens static dmu_buf_impl_t * 1859fa9e4066Sahrens dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid, 1860fa9e4066Sahrens dmu_buf_impl_t *parent, blkptr_t *blkptr) 1861fa9e4066Sahrens { 1862503ad85cSMatthew Ahrens objset_t *os = dn->dn_objset; 1863fa9e4066Sahrens dmu_buf_impl_t *db, *odb; 1864fa9e4066Sahrens 1865fa9e4066Sahrens 
/*
 * Allocate and initialize a new dbuf for block 'blkid' at 'level' of the
 * given dnode, inserting it into the dbuf hash table and the dnode's
 * dn_dbufs tree.  If another thread races us and inserts first, our copy
 * is freed and the winner's dbuf is returned instead.  Bonus dbufs are a
 * special case: they are never hashed.
 */
static dmu_buf_impl_t *
dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid,
    dmu_buf_impl_t *parent, blkptr_t *blkptr)
{
	objset_t *os = dn->dn_objset;
	dmu_buf_impl_t *db, *odb;

	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
	ASSERT(dn->dn_type != DMU_OT_NONE);

	db = kmem_cache_alloc(dbuf_cache, KM_SLEEP);

	db->db_objset = os;
	db->db.db_object = dn->dn_object;
	db->db_level = level;
	db->db_blkid = blkid;
	db->db_last_dirty = NULL;
	db->db_dirtycnt = 0;
	db->db_dnode_handle = dn->dn_handle;
	db->db_parent = parent;
	db->db_blkptr = blkptr;

	db->db_user = NULL;
	db->db_user_immediate_evict = FALSE;
	db->db_freed_in_flight = FALSE;
	db->db_pending_evict = FALSE;

	if (blkid == DMU_BONUS_BLKID) {
		ASSERT3P(parent, ==, dn->dn_dbuf);
		/* Bonus size excludes the space used by extra blkptrs. */
		db->db.db_size = DN_MAX_BONUSLEN -
		    (dn->dn_nblkptr-1) * sizeof (blkptr_t);
		ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
		db->db.db_offset = DMU_BONUS_BLKID;
		db->db_state = DB_UNCACHED;
		/* the bonus dbuf is not placed in the hash table */
		arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
		return (db);
	} else if (blkid == DMU_SPILL_BLKID) {
		db->db.db_size = (blkptr != NULL) ?
		    BP_GET_LSIZE(blkptr) : SPA_MINBLOCKSIZE;
		db->db.db_offset = 0;
	} else {
		int blocksize =
		    db->db_level ? 1 << dn->dn_indblkshift : dn->dn_datablksz;
		db->db.db_size = blocksize;
		db->db.db_offset = db->db_blkid * blocksize;
	}

	/*
	 * Hold the dn_dbufs_mtx while we get the new dbuf
	 * in the hash table *and* added to the dbufs list.
	 * This prevents a possible deadlock with someone
	 * trying to look up this dbuf before its added to the
	 * dn_dbufs list.
	 */
	mutex_enter(&dn->dn_dbufs_mtx);
	db->db_state = DB_EVICTING;
	if ((odb = dbuf_hash_insert(db)) != NULL) {
		/* someone else inserted it first */
		kmem_cache_free(dbuf_cache, db);
		mutex_exit(&dn->dn_dbufs_mtx);
		return (odb);
	}
	avl_add(&dn->dn_dbufs, db);
	if (db->db_level == 0 && db->db_blkid >=
	    dn->dn_unlisted_l0_blkid)
		dn->dn_unlisted_l0_blkid = db->db_blkid + 1;
	db->db_state = DB_UNCACHED;
	mutex_exit(&dn->dn_dbufs_mtx);
	arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);

	if (parent && parent != dn->dn_dbuf)
		dbuf_add_ref(parent, db);

	ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT ||
	    refcount_count(&dn->dn_holds) > 0);
	(void) refcount_add(&dn->dn_holds, db);
	atomic_inc_32(&dn->dn_dbufs_count);

	dprintf_dbuf(db, "db=%p\n", db);

	return (db);
}
int 1943ea8dc4b6Seschrock dbuf_do_evict(void *private) 1944fa9e4066Sahrens { 1945bbfa8ea8SMatthew Ahrens dmu_buf_impl_t *db = private; 1946fa9e4066Sahrens 1947ea8dc4b6Seschrock if (!MUTEX_HELD(&db->db_mtx)) 1948ea8dc4b6Seschrock mutex_enter(&db->db_mtx); 1949ea8dc4b6Seschrock 1950ea8dc4b6Seschrock ASSERT(refcount_is_zero(&db->db_holds)); 1951ea8dc4b6Seschrock 1952ea8dc4b6Seschrock if (db->db_state != DB_EVICTING) { 1953ea8dc4b6Seschrock ASSERT(db->db_state == DB_CACHED); 19549c9dc39aSek110237 DBUF_VERIFY(db); 1955ea8dc4b6Seschrock db->db_buf = NULL; 1956ea8dc4b6Seschrock dbuf_evict(db); 1957ea8dc4b6Seschrock } else { 1958ea8dc4b6Seschrock mutex_exit(&db->db_mtx); 1959ea8dc4b6Seschrock dbuf_destroy(db); 1960fa9e4066Sahrens } 1961ea8dc4b6Seschrock return (0); 1962fa9e4066Sahrens } 1963fa9e4066Sahrens 1964fa9e4066Sahrens static void 1965fa9e4066Sahrens dbuf_destroy(dmu_buf_impl_t *db) 1966fa9e4066Sahrens { 1967fa9e4066Sahrens ASSERT(refcount_is_zero(&db->db_holds)); 1968fa9e4066Sahrens 19690a586ceaSMark Shellenbaum if (db->db_blkid != DMU_BONUS_BLKID) { 1970ea8dc4b6Seschrock /* 1971ea8dc4b6Seschrock * If this dbuf is still on the dn_dbufs list, 1972ea8dc4b6Seschrock * remove it from that list. 
1973ea8dc4b6Seschrock */ 1974744947dcSTom Erickson if (db->db_dnode_handle != NULL) { 1975744947dcSTom Erickson dnode_t *dn; 19761934e92fSmaybee 1977744947dcSTom Erickson DB_DNODE_ENTER(db); 1978744947dcSTom Erickson dn = DB_DNODE(db); 19791934e92fSmaybee mutex_enter(&dn->dn_dbufs_mtx); 19800f6d88adSAlex Reece avl_remove(&dn->dn_dbufs, db); 1981640c1670SJosef 'Jeff' Sipek atomic_dec_32(&dn->dn_dbufs_count); 1982ea8dc4b6Seschrock mutex_exit(&dn->dn_dbufs_mtx); 1983744947dcSTom Erickson DB_DNODE_EXIT(db); 1984744947dcSTom Erickson /* 1985744947dcSTom Erickson * Decrementing the dbuf count means that the hold 1986744947dcSTom Erickson * corresponding to the removed dbuf is no longer 1987744947dcSTom Erickson * discounted in dnode_move(), so the dnode cannot be 1988744947dcSTom Erickson * moved until after we release the hold. 1989744947dcSTom Erickson */ 1990ea8dc4b6Seschrock dnode_rele(dn, db); 1991744947dcSTom Erickson db->db_dnode_handle = NULL; 1992ea8dc4b6Seschrock } 1993ea8dc4b6Seschrock dbuf_hash_remove(db); 1994ea8dc4b6Seschrock } 1995ea8dc4b6Seschrock db->db_parent = NULL; 1996ea8dc4b6Seschrock db->db_buf = NULL; 1997ea8dc4b6Seschrock 1998fa9e4066Sahrens ASSERT(db->db.db_data == NULL); 1999fa9e4066Sahrens ASSERT(db->db_hash_next == NULL); 2000fa9e4066Sahrens ASSERT(db->db_blkptr == NULL); 2001fa9e4066Sahrens ASSERT(db->db_data_pending == NULL); 2002fa9e4066Sahrens 2003fa9e4066Sahrens kmem_cache_free(dbuf_cache, db); 20045a98e54bSBrendan Gregg - Sun Microsystems arc_space_return(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER); 2005fa9e4066Sahrens } 2006fa9e4066Sahrens 2007a2cdcdd2SPaul Dagnelie typedef struct dbuf_prefetch_arg { 2008a2cdcdd2SPaul Dagnelie spa_t *dpa_spa; /* The spa to issue the prefetch in. */ 2009a2cdcdd2SPaul Dagnelie zbookmark_phys_t dpa_zb; /* The target block to prefetch. */ 2010a2cdcdd2SPaul Dagnelie int dpa_epbs; /* Entries (blkptr_t's) Per Block Shift. 
*/ 2011a2cdcdd2SPaul Dagnelie int dpa_curlevel; /* The current level that we're reading */ 2012a2cdcdd2SPaul Dagnelie zio_priority_t dpa_prio; /* The priority I/Os should be issued at. */ 2013a2cdcdd2SPaul Dagnelie zio_t *dpa_zio; /* The parent zio_t for all prefetches. */ 2014a2cdcdd2SPaul Dagnelie arc_flags_t dpa_aflags; /* Flags to pass to the final prefetch. */ 2015a2cdcdd2SPaul Dagnelie } dbuf_prefetch_arg_t; 2016a2cdcdd2SPaul Dagnelie 2017a2cdcdd2SPaul Dagnelie /* 2018a2cdcdd2SPaul Dagnelie * Actually issue the prefetch read for the block given. 2019a2cdcdd2SPaul Dagnelie */ 2020a2cdcdd2SPaul Dagnelie static void 2021a2cdcdd2SPaul Dagnelie dbuf_issue_final_prefetch(dbuf_prefetch_arg_t *dpa, blkptr_t *bp) 2022fa9e4066Sahrens { 2023a2cdcdd2SPaul Dagnelie if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp)) 2024a2cdcdd2SPaul Dagnelie return; 2025a2cdcdd2SPaul Dagnelie 2026a2cdcdd2SPaul Dagnelie arc_flags_t aflags = 2027a2cdcdd2SPaul Dagnelie dpa->dpa_aflags | ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH; 2028a2cdcdd2SPaul Dagnelie 2029a2cdcdd2SPaul Dagnelie ASSERT3U(dpa->dpa_curlevel, ==, BP_GET_LEVEL(bp)); 2030a2cdcdd2SPaul Dagnelie ASSERT3U(dpa->dpa_curlevel, ==, dpa->dpa_zb.zb_level); 2031a2cdcdd2SPaul Dagnelie ASSERT(dpa->dpa_zio != NULL); 2032a2cdcdd2SPaul Dagnelie (void) arc_read(dpa->dpa_zio, dpa->dpa_spa, bp, NULL, NULL, 2033a2cdcdd2SPaul Dagnelie dpa->dpa_prio, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, 2034a2cdcdd2SPaul Dagnelie &aflags, &dpa->dpa_zb); 2035a2cdcdd2SPaul Dagnelie } 2036a2cdcdd2SPaul Dagnelie 2037a2cdcdd2SPaul Dagnelie /* 2038a2cdcdd2SPaul Dagnelie * Called when an indirect block above our prefetch target is read in. This 2039a2cdcdd2SPaul Dagnelie * will either read in the next indirect block down the tree or issue the actual 2040a2cdcdd2SPaul Dagnelie * prefetch if the next block down is our target. 
2041a2cdcdd2SPaul Dagnelie */ 2042a2cdcdd2SPaul Dagnelie static void 2043a2cdcdd2SPaul Dagnelie dbuf_prefetch_indirect_done(zio_t *zio, arc_buf_t *abuf, void *private) 2044a2cdcdd2SPaul Dagnelie { 2045a2cdcdd2SPaul Dagnelie dbuf_prefetch_arg_t *dpa = private; 2046a2cdcdd2SPaul Dagnelie 2047a2cdcdd2SPaul Dagnelie ASSERT3S(dpa->dpa_zb.zb_level, <, dpa->dpa_curlevel); 2048a2cdcdd2SPaul Dagnelie ASSERT3S(dpa->dpa_curlevel, >, 0); 2049a2cdcdd2SPaul Dagnelie if (zio != NULL) { 2050a2cdcdd2SPaul Dagnelie ASSERT3S(BP_GET_LEVEL(zio->io_bp), ==, dpa->dpa_curlevel); 2051a2cdcdd2SPaul Dagnelie ASSERT3U(BP_GET_LSIZE(zio->io_bp), ==, zio->io_size); 2052a2cdcdd2SPaul Dagnelie ASSERT3P(zio->io_spa, ==, dpa->dpa_spa); 2053a2cdcdd2SPaul Dagnelie } 2054a2cdcdd2SPaul Dagnelie 2055a2cdcdd2SPaul Dagnelie dpa->dpa_curlevel--; 2056a2cdcdd2SPaul Dagnelie 2057a2cdcdd2SPaul Dagnelie uint64_t nextblkid = dpa->dpa_zb.zb_blkid >> 2058a2cdcdd2SPaul Dagnelie (dpa->dpa_epbs * (dpa->dpa_curlevel - dpa->dpa_zb.zb_level)); 2059a2cdcdd2SPaul Dagnelie blkptr_t *bp = ((blkptr_t *)abuf->b_data) + 2060a2cdcdd2SPaul Dagnelie P2PHASE(nextblkid, 1ULL << dpa->dpa_epbs); 2061a2cdcdd2SPaul Dagnelie if (BP_IS_HOLE(bp) || (zio != NULL && zio->io_error != 0)) { 2062a2cdcdd2SPaul Dagnelie kmem_free(dpa, sizeof (*dpa)); 2063a2cdcdd2SPaul Dagnelie } else if (dpa->dpa_curlevel == dpa->dpa_zb.zb_level) { 2064a2cdcdd2SPaul Dagnelie ASSERT3U(nextblkid, ==, dpa->dpa_zb.zb_blkid); 2065a2cdcdd2SPaul Dagnelie dbuf_issue_final_prefetch(dpa, bp); 2066a2cdcdd2SPaul Dagnelie kmem_free(dpa, sizeof (*dpa)); 2067a2cdcdd2SPaul Dagnelie } else { 2068a2cdcdd2SPaul Dagnelie arc_flags_t iter_aflags = ARC_FLAG_NOWAIT; 2069a2cdcdd2SPaul Dagnelie zbookmark_phys_t zb; 2070a2cdcdd2SPaul Dagnelie 2071a2cdcdd2SPaul Dagnelie ASSERT3U(dpa->dpa_curlevel, ==, BP_GET_LEVEL(bp)); 2072a2cdcdd2SPaul Dagnelie 2073a2cdcdd2SPaul Dagnelie SET_BOOKMARK(&zb, dpa->dpa_zb.zb_objset, 2074a2cdcdd2SPaul Dagnelie dpa->dpa_zb.zb_object, dpa->dpa_curlevel, 
nextblkid); 2075a2cdcdd2SPaul Dagnelie 2076a2cdcdd2SPaul Dagnelie (void) arc_read(dpa->dpa_zio, dpa->dpa_spa, 2077a2cdcdd2SPaul Dagnelie bp, dbuf_prefetch_indirect_done, dpa, dpa->dpa_prio, 2078a2cdcdd2SPaul Dagnelie ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, 2079a2cdcdd2SPaul Dagnelie &iter_aflags, &zb); 2080a2cdcdd2SPaul Dagnelie } 2081a2cdcdd2SPaul Dagnelie (void) arc_buf_remove_ref(abuf, private); 2082a2cdcdd2SPaul Dagnelie } 2083a2cdcdd2SPaul Dagnelie 2084a2cdcdd2SPaul Dagnelie /* 2085a2cdcdd2SPaul Dagnelie * Issue prefetch reads for the given block on the given level. If the indirect 2086a2cdcdd2SPaul Dagnelie * blocks above that block are not in memory, we will read them in 2087a2cdcdd2SPaul Dagnelie * asynchronously. As a result, this call never blocks waiting for a read to 2088a2cdcdd2SPaul Dagnelie * complete. 2089a2cdcdd2SPaul Dagnelie */ 2090a2cdcdd2SPaul Dagnelie void 2091a2cdcdd2SPaul Dagnelie dbuf_prefetch(dnode_t *dn, int64_t level, uint64_t blkid, zio_priority_t prio, 2092a2cdcdd2SPaul Dagnelie arc_flags_t aflags) 2093a2cdcdd2SPaul Dagnelie { 2094a2cdcdd2SPaul Dagnelie blkptr_t bp; 2095a2cdcdd2SPaul Dagnelie int epbs, nlevels, curlevel; 2096a2cdcdd2SPaul Dagnelie uint64_t curblkid; 2097fa9e4066Sahrens 20980a586ceaSMark Shellenbaum ASSERT(blkid != DMU_BONUS_BLKID); 2099fa9e4066Sahrens ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); 2100fa9e4066Sahrens 2101cf6106c8SMatthew Ahrens if (blkid > dn->dn_maxblkid) 2102cf6106c8SMatthew Ahrens return; 2103cf6106c8SMatthew Ahrens 2104fa9e4066Sahrens if (dnode_block_freed(dn, blkid)) 2105fa9e4066Sahrens return; 2106fa9e4066Sahrens 2107fa9e4066Sahrens /* 2108a2cdcdd2SPaul Dagnelie * This dnode hasn't been written to disk yet, so there's nothing to 2109a2cdcdd2SPaul Dagnelie * prefetch. 
2110fa9e4066Sahrens */ 2111a2cdcdd2SPaul Dagnelie nlevels = dn->dn_phys->dn_nlevels; 2112a2cdcdd2SPaul Dagnelie if (level >= nlevels || dn->dn_phys->dn_nblkptr == 0) 2113a2cdcdd2SPaul Dagnelie return; 2114a2cdcdd2SPaul Dagnelie 2115a2cdcdd2SPaul Dagnelie epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; 2116a2cdcdd2SPaul Dagnelie if (dn->dn_phys->dn_maxblkid < blkid << (epbs * level)) 2117a2cdcdd2SPaul Dagnelie return; 2118a2cdcdd2SPaul Dagnelie 2119a2cdcdd2SPaul Dagnelie dmu_buf_impl_t *db = dbuf_find(dn->dn_objset, dn->dn_object, 2120a2cdcdd2SPaul Dagnelie level, blkid); 2121a2cdcdd2SPaul Dagnelie if (db != NULL) { 2122fa9e4066Sahrens mutex_exit(&db->db_mtx); 2123a2cdcdd2SPaul Dagnelie /* 2124a2cdcdd2SPaul Dagnelie * This dbuf already exists. It is either CACHED, or 2125a2cdcdd2SPaul Dagnelie * (we assume) about to be read or filled. 2126a2cdcdd2SPaul Dagnelie */ 2127fa9e4066Sahrens return; 2128fa9e4066Sahrens } 2129fa9e4066Sahrens 2130a2cdcdd2SPaul Dagnelie /* 2131a2cdcdd2SPaul Dagnelie * Find the closest ancestor (indirect block) of the target block 2132a2cdcdd2SPaul Dagnelie * that is present in the cache. In this indirect block, we will 2133a2cdcdd2SPaul Dagnelie * find the bp that is at curlevel, curblkid. 
2134a2cdcdd2SPaul Dagnelie */ 2135a2cdcdd2SPaul Dagnelie curlevel = level; 2136a2cdcdd2SPaul Dagnelie curblkid = blkid; 2137a2cdcdd2SPaul Dagnelie while (curlevel < nlevels - 1) { 2138a2cdcdd2SPaul Dagnelie int parent_level = curlevel + 1; 2139a2cdcdd2SPaul Dagnelie uint64_t parent_blkid = curblkid >> epbs; 2140a2cdcdd2SPaul Dagnelie dmu_buf_impl_t *db; 2141a2cdcdd2SPaul Dagnelie 2142a2cdcdd2SPaul Dagnelie if (dbuf_hold_impl(dn, parent_level, parent_blkid, 2143a2cdcdd2SPaul Dagnelie FALSE, TRUE, FTAG, &db) == 0) { 2144a2cdcdd2SPaul Dagnelie blkptr_t *bpp = db->db_buf->b_data; 2145a2cdcdd2SPaul Dagnelie bp = bpp[P2PHASE(curblkid, 1 << epbs)]; 2146a2cdcdd2SPaul Dagnelie dbuf_rele(db, FTAG); 2147a2cdcdd2SPaul Dagnelie break; 2148a2cdcdd2SPaul Dagnelie } 2149a2cdcdd2SPaul Dagnelie 2150a2cdcdd2SPaul Dagnelie curlevel = parent_level; 2151a2cdcdd2SPaul Dagnelie curblkid = parent_blkid; 2152a2cdcdd2SPaul Dagnelie } 2153a2cdcdd2SPaul Dagnelie 2154a2cdcdd2SPaul Dagnelie if (curlevel == nlevels - 1) { 2155a2cdcdd2SPaul Dagnelie /* No cached indirect blocks found. */ 2156a2cdcdd2SPaul Dagnelie ASSERT3U(curblkid, <, dn->dn_phys->dn_nblkptr); 2157a2cdcdd2SPaul Dagnelie bp = dn->dn_phys->dn_blkptr[curblkid]; 2158a2cdcdd2SPaul Dagnelie } 2159a2cdcdd2SPaul Dagnelie if (BP_IS_HOLE(&bp)) 2160a2cdcdd2SPaul Dagnelie return; 2161a2cdcdd2SPaul Dagnelie 2162a2cdcdd2SPaul Dagnelie ASSERT3U(curlevel, ==, BP_GET_LEVEL(&bp)); 2163a2cdcdd2SPaul Dagnelie 2164a2cdcdd2SPaul Dagnelie zio_t *pio = zio_root(dmu_objset_spa(dn->dn_objset), NULL, NULL, 2165a2cdcdd2SPaul Dagnelie ZIO_FLAG_CANFAIL); 2166a2cdcdd2SPaul Dagnelie 2167a2cdcdd2SPaul Dagnelie dbuf_prefetch_arg_t *dpa = kmem_zalloc(sizeof (*dpa), KM_SLEEP); 2168b24ab676SJeff Bonwick dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset; 2169a2cdcdd2SPaul Dagnelie SET_BOOKMARK(&dpa->dpa_zb, ds != NULL ? 
ds->ds_object : DMU_META_OBJSET, 2170a2cdcdd2SPaul Dagnelie dn->dn_object, level, blkid); 2171a2cdcdd2SPaul Dagnelie dpa->dpa_curlevel = curlevel; 2172a2cdcdd2SPaul Dagnelie dpa->dpa_prio = prio; 2173a2cdcdd2SPaul Dagnelie dpa->dpa_aflags = aflags; 2174a2cdcdd2SPaul Dagnelie dpa->dpa_spa = dn->dn_objset->os_spa; 2175a2cdcdd2SPaul Dagnelie dpa->dpa_epbs = epbs; 2176a2cdcdd2SPaul Dagnelie dpa->dpa_zio = pio; 2177a2cdcdd2SPaul Dagnelie 2178a2cdcdd2SPaul Dagnelie /* 2179a2cdcdd2SPaul Dagnelie * If we have the indirect just above us, no need to do the asynchronous 2180a2cdcdd2SPaul Dagnelie * prefetch chain; we'll just run the last step ourselves. If we're at 2181a2cdcdd2SPaul Dagnelie * a higher level, though, we want to issue the prefetches for all the 2182a2cdcdd2SPaul Dagnelie * indirect blocks asynchronously, so we can go on with whatever we were 2183a2cdcdd2SPaul Dagnelie * doing. 2184a2cdcdd2SPaul Dagnelie */ 2185a2cdcdd2SPaul Dagnelie if (curlevel == level) { 2186a2cdcdd2SPaul Dagnelie ASSERT3U(curblkid, ==, blkid); 2187a2cdcdd2SPaul Dagnelie dbuf_issue_final_prefetch(dpa, &bp); 2188a2cdcdd2SPaul Dagnelie kmem_free(dpa, sizeof (*dpa)); 2189a2cdcdd2SPaul Dagnelie } else { 2190a2cdcdd2SPaul Dagnelie arc_flags_t iter_aflags = ARC_FLAG_NOWAIT; 21917802d7bfSMatthew Ahrens zbookmark_phys_t zb; 2192b24ab676SJeff Bonwick 2193a2cdcdd2SPaul Dagnelie SET_BOOKMARK(&zb, ds != NULL ? ds->ds_object : DMU_META_OBJSET, 2194a2cdcdd2SPaul Dagnelie dn->dn_object, curlevel, curblkid); 2195a2cdcdd2SPaul Dagnelie (void) arc_read(dpa->dpa_zio, dpa->dpa_spa, 2196a2cdcdd2SPaul Dagnelie &bp, dbuf_prefetch_indirect_done, dpa, prio, 2197fa9e4066Sahrens ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, 2198a2cdcdd2SPaul Dagnelie &iter_aflags, &zb); 2199fa9e4066Sahrens } 2200a2cdcdd2SPaul Dagnelie /* 2201a2cdcdd2SPaul Dagnelie * We use pio here instead of dpa_zio since it's possible that 2202a2cdcdd2SPaul Dagnelie * dpa may have already been freed. 
2203a2cdcdd2SPaul Dagnelie */ 2204a2cdcdd2SPaul Dagnelie zio_nowait(pio); 2205fa9e4066Sahrens } 2206fa9e4066Sahrens 2207fa9e4066Sahrens /* 2208fa9e4066Sahrens * Returns with db_holds incremented, and db_mtx not held. 2209fa9e4066Sahrens * Note: dn_struct_rwlock must be held. 2210fa9e4066Sahrens */ 2211fa9e4066Sahrens int 2212a2cdcdd2SPaul Dagnelie dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, 2213a2cdcdd2SPaul Dagnelie boolean_t fail_sparse, boolean_t fail_uncached, 2214fa9e4066Sahrens void *tag, dmu_buf_impl_t **dbp) 2215fa9e4066Sahrens { 2216fa9e4066Sahrens dmu_buf_impl_t *db, *parent = NULL; 2217fa9e4066Sahrens 22180a586ceaSMark Shellenbaum ASSERT(blkid != DMU_BONUS_BLKID); 2219fa9e4066Sahrens ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); 2220fa9e4066Sahrens ASSERT3U(dn->dn_nlevels, >, level); 2221fa9e4066Sahrens 2222fa9e4066Sahrens *dbp = NULL; 2223ea8dc4b6Seschrock top: 2224fa9e4066Sahrens /* dbuf_find() returns with db_mtx held */ 2225e57a022bSJustin T. Gibbs db = dbuf_find(dn->dn_objset, dn->dn_object, level, blkid); 2226fa9e4066Sahrens 2227fa9e4066Sahrens if (db == NULL) { 2228fa9e4066Sahrens blkptr_t *bp = NULL; 2229fa9e4066Sahrens int err; 2230fa9e4066Sahrens 2231a2cdcdd2SPaul Dagnelie if (fail_uncached) 2232a2cdcdd2SPaul Dagnelie return (SET_ERROR(ENOENT)); 2233a2cdcdd2SPaul Dagnelie 2234c543ec06Sahrens ASSERT3P(parent, ==, NULL); 2235fa9e4066Sahrens err = dbuf_findbp(dn, level, blkid, fail_sparse, &parent, &bp); 2236fa9e4066Sahrens if (fail_sparse) { 2237fa9e4066Sahrens if (err == 0 && bp && BP_IS_HOLE(bp)) 2238be6fd75aSMatthew Ahrens err = SET_ERROR(ENOENT); 2239fa9e4066Sahrens if (err) { 2240c543ec06Sahrens if (parent) 2241ea8dc4b6Seschrock dbuf_rele(parent, NULL); 2242fa9e4066Sahrens return (err); 2243fa9e4066Sahrens } 2244fa9e4066Sahrens } 2245ea8dc4b6Seschrock if (err && err != ENOENT) 2246ea8dc4b6Seschrock return (err); 2247fa9e4066Sahrens db = dbuf_create(dn, level, blkid, parent, bp); 2248fa9e4066Sahrens } 2249fa9e4066Sahrens 
2250a2cdcdd2SPaul Dagnelie if (fail_uncached && db->db_state != DB_CACHED) { 2251a2cdcdd2SPaul Dagnelie mutex_exit(&db->db_mtx); 2252a2cdcdd2SPaul Dagnelie return (SET_ERROR(ENOENT)); 2253a2cdcdd2SPaul Dagnelie } 2254a2cdcdd2SPaul Dagnelie 2255ea8dc4b6Seschrock if (db->db_buf && refcount_is_zero(&db->db_holds)) { 2256ea8dc4b6Seschrock arc_buf_add_ref(db->db_buf, db); 2257ea8dc4b6Seschrock if (db->db_buf->b_data == NULL) { 2258ea8dc4b6Seschrock dbuf_clear(db); 2259c543ec06Sahrens if (parent) { 2260c543ec06Sahrens dbuf_rele(parent, NULL); 2261c543ec06Sahrens parent = NULL; 2262c543ec06Sahrens } 2263ea8dc4b6Seschrock goto top; 2264ea8dc4b6Seschrock } 2265ea8dc4b6Seschrock ASSERT3P(db->db.db_data, ==, db->db_buf->b_data); 2266ea8dc4b6Seschrock } 2267ea8dc4b6Seschrock 2268ea8dc4b6Seschrock ASSERT(db->db_buf == NULL || arc_referenced(db->db_buf)); 2269ea8dc4b6Seschrock 2270fa9e4066Sahrens /* 2271c717a561Smaybee * If this buffer is currently syncing out, and we are are 2272c717a561Smaybee * still referencing it from db_data, we need to make a copy 2273c717a561Smaybee * of it in case we decide we want to dirty it again in this txg. 
2274fa9e4066Sahrens */ 22750a586ceaSMark Shellenbaum if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID && 2276ea8dc4b6Seschrock dn->dn_object != DMU_META_DNODE_OBJECT && 2277c717a561Smaybee db->db_state == DB_CACHED && db->db_data_pending) { 2278c717a561Smaybee dbuf_dirty_record_t *dr = db->db_data_pending; 2279c717a561Smaybee 2280c717a561Smaybee if (dr->dt.dl.dr_data == db->db_buf) { 2281ad23a2dbSjohansen arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); 2282fa9e4066Sahrens 2283c717a561Smaybee dbuf_set_data(db, 2284744947dcSTom Erickson arc_buf_alloc(dn->dn_objset->os_spa, 2285c717a561Smaybee db->db.db_size, db, type)); 2286c717a561Smaybee bcopy(dr->dt.dl.dr_data->b_data, db->db.db_data, 2287fa9e4066Sahrens db->db.db_size); 2288fa9e4066Sahrens } 2289c717a561Smaybee } 2290fa9e4066Sahrens 2291ea8dc4b6Seschrock (void) refcount_add(&db->db_holds, tag); 22929c9dc39aSek110237 DBUF_VERIFY(db); 2293fa9e4066Sahrens mutex_exit(&db->db_mtx); 2294fa9e4066Sahrens 2295fa9e4066Sahrens /* NOTE: we can't rele the parent until after we drop the db_mtx */ 2296c543ec06Sahrens if (parent) 2297ea8dc4b6Seschrock dbuf_rele(parent, NULL); 2298fa9e4066Sahrens 2299744947dcSTom Erickson ASSERT3P(DB_DNODE(db), ==, dn); 2300fa9e4066Sahrens ASSERT3U(db->db_blkid, ==, blkid); 2301fa9e4066Sahrens ASSERT3U(db->db_level, ==, level); 2302fa9e4066Sahrens *dbp = db; 2303fa9e4066Sahrens 2304fa9e4066Sahrens return (0); 2305fa9e4066Sahrens } 2306fa9e4066Sahrens 2307fa9e4066Sahrens dmu_buf_impl_t * 2308ea8dc4b6Seschrock dbuf_hold(dnode_t *dn, uint64_t blkid, void *tag) 2309fa9e4066Sahrens { 2310a2cdcdd2SPaul Dagnelie return (dbuf_hold_level(dn, 0, blkid, tag)); 2311fa9e4066Sahrens } 2312fa9e4066Sahrens 2313fa9e4066Sahrens dmu_buf_impl_t * 2314fa9e4066Sahrens dbuf_hold_level(dnode_t *dn, int level, uint64_t blkid, void *tag) 2315fa9e4066Sahrens { 2316fa9e4066Sahrens dmu_buf_impl_t *db; 2317a2cdcdd2SPaul Dagnelie int err = dbuf_hold_impl(dn, level, blkid, FALSE, FALSE, tag, &db); 
2318ea8dc4b6Seschrock return (err ? NULL : db); 2319fa9e4066Sahrens } 2320fa9e4066Sahrens 23211934e92fSmaybee void 2322ea8dc4b6Seschrock dbuf_create_bonus(dnode_t *dn) 2323fa9e4066Sahrens { 2324ea8dc4b6Seschrock ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock)); 2325ea8dc4b6Seschrock 2326ea8dc4b6Seschrock ASSERT(dn->dn_bonus == NULL); 23270a586ceaSMark Shellenbaum dn->dn_bonus = dbuf_create(dn, 0, DMU_BONUS_BLKID, dn->dn_dbuf, NULL); 23280a586ceaSMark Shellenbaum } 23290a586ceaSMark Shellenbaum 23300a586ceaSMark Shellenbaum int 23310a586ceaSMark Shellenbaum dbuf_spill_set_blksz(dmu_buf_t *db_fake, uint64_t blksz, dmu_tx_t *tx) 23320a586ceaSMark Shellenbaum { 23330a586ceaSMark Shellenbaum dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; 2334744947dcSTom Erickson dnode_t *dn; 2335744947dcSTom Erickson 23360a586ceaSMark Shellenbaum if (db->db_blkid != DMU_SPILL_BLKID) 2337be6fd75aSMatthew Ahrens return (SET_ERROR(ENOTSUP)); 23380a586ceaSMark Shellenbaum if (blksz == 0) 23390a586ceaSMark Shellenbaum blksz = SPA_MINBLOCKSIZE; 2340b5152584SMatthew Ahrens ASSERT3U(blksz, <=, spa_maxblocksize(dmu_objset_spa(db->db_objset))); 23410a586ceaSMark Shellenbaum blksz = P2ROUNDUP(blksz, SPA_MINBLOCKSIZE); 23420a586ceaSMark Shellenbaum 2343744947dcSTom Erickson DB_DNODE_ENTER(db); 2344744947dcSTom Erickson dn = DB_DNODE(db); 2345744947dcSTom Erickson rw_enter(&dn->dn_struct_rwlock, RW_WRITER); 23460a586ceaSMark Shellenbaum dbuf_new_size(db, blksz, tx); 2347744947dcSTom Erickson rw_exit(&dn->dn_struct_rwlock); 2348744947dcSTom Erickson DB_DNODE_EXIT(db); 23490a586ceaSMark Shellenbaum 23500a586ceaSMark Shellenbaum return (0); 23510a586ceaSMark Shellenbaum } 23520a586ceaSMark Shellenbaum 23530a586ceaSMark Shellenbaum void 23540a586ceaSMark Shellenbaum dbuf_rm_spill(dnode_t *dn, dmu_tx_t *tx) 23550a586ceaSMark Shellenbaum { 23560a586ceaSMark Shellenbaum dbuf_free_range(dn, DMU_SPILL_BLKID, DMU_SPILL_BLKID, tx); 2357fa9e4066Sahrens } 2358fa9e4066Sahrens 2359ea8dc4b6Seschrock #pragma weak 
dmu_buf_add_ref = dbuf_add_ref 2360fa9e4066Sahrens void 2361fa9e4066Sahrens dbuf_add_ref(dmu_buf_impl_t *db, void *tag) 2362fa9e4066Sahrens { 2363ea8dc4b6Seschrock int64_t holds = refcount_add(&db->db_holds, tag); 2364ea8dc4b6Seschrock ASSERT(holds > 1); 2365fa9e4066Sahrens } 2366fa9e4066Sahrens 2367e57a022bSJustin T. Gibbs #pragma weak dmu_buf_try_add_ref = dbuf_try_add_ref 2368e57a022bSJustin T. Gibbs boolean_t 2369e57a022bSJustin T. Gibbs dbuf_try_add_ref(dmu_buf_t *db_fake, objset_t *os, uint64_t obj, uint64_t blkid, 2370e57a022bSJustin T. Gibbs void *tag) 2371e57a022bSJustin T. Gibbs { 2372e57a022bSJustin T. Gibbs dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; 2373e57a022bSJustin T. Gibbs dmu_buf_impl_t *found_db; 2374e57a022bSJustin T. Gibbs boolean_t result = B_FALSE; 2375e57a022bSJustin T. Gibbs 2376e57a022bSJustin T. Gibbs if (db->db_blkid == DMU_BONUS_BLKID) 2377e57a022bSJustin T. Gibbs found_db = dbuf_find_bonus(os, obj); 2378e57a022bSJustin T. Gibbs else 2379e57a022bSJustin T. Gibbs found_db = dbuf_find(os, obj, 0, blkid); 2380e57a022bSJustin T. Gibbs 2381e57a022bSJustin T. Gibbs if (found_db != NULL) { 2382e57a022bSJustin T. Gibbs if (db == found_db && dbuf_refcount(db) > db->db_dirtycnt) { 2383e57a022bSJustin T. Gibbs (void) refcount_add(&db->db_holds, tag); 2384e57a022bSJustin T. Gibbs result = B_TRUE; 2385e57a022bSJustin T. Gibbs } 2386e57a022bSJustin T. Gibbs mutex_exit(&db->db_mtx); 2387e57a022bSJustin T. Gibbs } 2388e57a022bSJustin T. Gibbs return (result); 2389e57a022bSJustin T. Gibbs } 2390e57a022bSJustin T. Gibbs 2391744947dcSTom Erickson /* 2392744947dcSTom Erickson * If you call dbuf_rele() you had better not be referencing the dnode handle 2393744947dcSTom Erickson * unless you have some other direct or indirect hold on the dnode. (An indirect 2394744947dcSTom Erickson * hold is a hold on one of the dnode's dbufs, including the bonus buffer.) 
2395744947dcSTom Erickson * Without that, the dbuf_rele() could lead to a dnode_rele() followed by the 2396744947dcSTom Erickson * dnode's parent dbuf evicting its dnode handles. 2397744947dcSTom Erickson */ 2398fa9e4066Sahrens void 2399ea8dc4b6Seschrock dbuf_rele(dmu_buf_impl_t *db, void *tag) 2400fa9e4066Sahrens { 2401b24ab676SJeff Bonwick mutex_enter(&db->db_mtx); 2402b24ab676SJeff Bonwick dbuf_rele_and_unlock(db, tag); 2403b24ab676SJeff Bonwick } 2404b24ab676SJeff Bonwick 240543466aaeSMax Grossman void 240643466aaeSMax Grossman dmu_buf_rele(dmu_buf_t *db, void *tag) 240743466aaeSMax Grossman { 240843466aaeSMax Grossman dbuf_rele((dmu_buf_impl_t *)db, tag); 240943466aaeSMax Grossman } 241043466aaeSMax Grossman 2411b24ab676SJeff Bonwick /* 2412b24ab676SJeff Bonwick * dbuf_rele() for an already-locked dbuf. This is necessary to allow 2413b24ab676SJeff Bonwick * db_dirtycnt and db_holds to be updated atomically. 2414b24ab676SJeff Bonwick */ 2415b24ab676SJeff Bonwick void 2416b24ab676SJeff Bonwick dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag) 2417b24ab676SJeff Bonwick { 2418fa9e4066Sahrens int64_t holds; 2419fa9e4066Sahrens 2420b24ab676SJeff Bonwick ASSERT(MUTEX_HELD(&db->db_mtx)); 24219c9dc39aSek110237 DBUF_VERIFY(db); 2422fa9e4066Sahrens 2423744947dcSTom Erickson /* 2424744947dcSTom Erickson * Remove the reference to the dbuf before removing its hold on the 2425744947dcSTom Erickson * dnode so we can guarantee in dnode_move() that a referenced bonus 2426744947dcSTom Erickson * buffer has a corresponding dnode hold. 2427744947dcSTom Erickson */ 2428fa9e4066Sahrens holds = refcount_remove(&db->db_holds, tag); 2429ea8dc4b6Seschrock ASSERT(holds >= 0); 2430fa9e4066Sahrens 2431c717a561Smaybee /* 2432c717a561Smaybee * We can't freeze indirects if there is a possibility that they 2433c717a561Smaybee * may be modified in the current syncing context. 2434c717a561Smaybee */ 2435c717a561Smaybee if (db->db_buf && holds == (db->db_level == 0 ? 
db->db_dirtycnt : 0)) 24366b4acc8bSahrens arc_buf_freeze(db->db_buf); 24376b4acc8bSahrens 2438fa9e4066Sahrens if (holds == db->db_dirtycnt && 2439d2058105SJustin T. Gibbs db->db_level == 0 && db->db_user_immediate_evict) 2440fa9e4066Sahrens dbuf_evict_user(db); 2441ea8dc4b6Seschrock 2442ea8dc4b6Seschrock if (holds == 0) { 24430a586ceaSMark Shellenbaum if (db->db_blkid == DMU_BONUS_BLKID) { 2444cd485b49SJustin T. Gibbs dnode_t *dn; 2445d2058105SJustin T. Gibbs boolean_t evict_dbuf = db->db_pending_evict; 2446cd485b49SJustin T. Gibbs 2447cd485b49SJustin T. Gibbs /* 2448cd485b49SJustin T. Gibbs * If the dnode moves here, we cannot cross this 2449cd485b49SJustin T. Gibbs * barrier until the move completes. 2450cd485b49SJustin T. Gibbs */ 2451cd485b49SJustin T. Gibbs DB_DNODE_ENTER(db); 2452cd485b49SJustin T. Gibbs 2453cd485b49SJustin T. Gibbs dn = DB_DNODE(db); 2454cd485b49SJustin T. Gibbs atomic_dec_32(&dn->dn_dbufs_count); 2455cd485b49SJustin T. Gibbs 2456cd485b49SJustin T. Gibbs /* 2457cd485b49SJustin T. Gibbs * Decrementing the dbuf count means that the bonus 2458cd485b49SJustin T. Gibbs * buffer's dnode hold is no longer discounted in 2459cd485b49SJustin T. Gibbs * dnode_move(). The dnode cannot move until after 2460d2058105SJustin T. Gibbs * the dnode_rele() below. 2461cd485b49SJustin T. Gibbs */ 2462cd485b49SJustin T. Gibbs DB_DNODE_EXIT(db); 2463cd485b49SJustin T. Gibbs 2464cd485b49SJustin T. Gibbs /* 2465cd485b49SJustin T. Gibbs * Do not reference db after its lock is dropped. 2466cd485b49SJustin T. Gibbs * Another thread may evict it. 2467cd485b49SJustin T. Gibbs */ 2468ea8dc4b6Seschrock mutex_exit(&db->db_mtx); 2469744947dcSTom Erickson 2470d2058105SJustin T. Gibbs if (evict_dbuf) 2471cd485b49SJustin T. Gibbs dnode_evict_bonus(dn); 2472d2058105SJustin T. Gibbs 2473d2058105SJustin T. 
Gibbs dnode_rele(dn, db); 2474ea8dc4b6Seschrock } else if (db->db_buf == NULL) { 2475ea8dc4b6Seschrock /* 2476ea8dc4b6Seschrock * This is a special case: we never associated this 2477ea8dc4b6Seschrock * dbuf with any data allocated from the ARC. 2478ea8dc4b6Seschrock */ 247982c9918fSTim Haley ASSERT(db->db_state == DB_UNCACHED || 248082c9918fSTim Haley db->db_state == DB_NOFILL); 2481ea8dc4b6Seschrock dbuf_evict(db); 2482ea8dc4b6Seschrock } else if (arc_released(db->db_buf)) { 2483ea8dc4b6Seschrock arc_buf_t *buf = db->db_buf; 2484ea8dc4b6Seschrock /* 2485ea8dc4b6Seschrock * This dbuf has anonymous data associated with it. 2486ea8dc4b6Seschrock */ 2487bc9014e6SJustin Gibbs dbuf_clear_data(db); 24883b2aab18SMatthew Ahrens VERIFY(arc_buf_remove_ref(buf, db)); 2489ea8dc4b6Seschrock dbuf_evict(db); 2490ea8dc4b6Seschrock } else { 24913b2aab18SMatthew Ahrens VERIFY(!arc_buf_remove_ref(db->db_buf, db)); 24929253d63dSGeorge Wilson 24939253d63dSGeorge Wilson /* 24949253d63dSGeorge Wilson * A dbuf will be eligible for eviction if either the 24959253d63dSGeorge Wilson * 'primarycache' property is set or a duplicate 24969253d63dSGeorge Wilson * copy of this buffer is already cached in the arc. 24979253d63dSGeorge Wilson * 24989253d63dSGeorge Wilson * In the case of the 'primarycache' a buffer 24999253d63dSGeorge Wilson * is considered for eviction if it matches the 25009253d63dSGeorge Wilson * criteria set in the property. 25019253d63dSGeorge Wilson * 25029253d63dSGeorge Wilson * To decide if our buffer is considered a 25039253d63dSGeorge Wilson * duplicate, we must call into the arc to determine 25049253d63dSGeorge Wilson * if multiple buffers are referencing the same 25059253d63dSGeorge Wilson * block on-disk. If so, then we simply evict 25069253d63dSGeorge Wilson * ourselves. 
25079253d63dSGeorge Wilson */ 2508bbfa8ea8SMatthew Ahrens if (!DBUF_IS_CACHEABLE(db)) { 2509bbfa8ea8SMatthew Ahrens if (db->db_blkptr != NULL && 2510bbfa8ea8SMatthew Ahrens !BP_IS_HOLE(db->db_blkptr) && 2511bbfa8ea8SMatthew Ahrens !BP_IS_EMBEDDED(db->db_blkptr)) { 2512bbfa8ea8SMatthew Ahrens spa_t *spa = 2513bbfa8ea8SMatthew Ahrens dmu_objset_spa(db->db_objset); 2514bbfa8ea8SMatthew Ahrens blkptr_t bp = *db->db_blkptr; 25153baa08fcSek110237 dbuf_clear(db); 2516bbfa8ea8SMatthew Ahrens arc_freed(spa, &bp); 2517bbfa8ea8SMatthew Ahrens } else { 2518bbfa8ea8SMatthew Ahrens dbuf_clear(db); 2519bbfa8ea8SMatthew Ahrens } 2520d2058105SJustin T. Gibbs } else if (db->db_pending_evict || 2521bc9014e6SJustin Gibbs arc_buf_eviction_needed(db->db_buf)) { 2522bbfa8ea8SMatthew Ahrens dbuf_clear(db); 2523bbfa8ea8SMatthew Ahrens } else { 2524fa9e4066Sahrens mutex_exit(&db->db_mtx); 2525fa9e4066Sahrens } 2526bbfa8ea8SMatthew Ahrens } 2527ea8dc4b6Seschrock } else { 2528ea8dc4b6Seschrock mutex_exit(&db->db_mtx); 2529fa9e4066Sahrens } 2530fa9e4066Sahrens } 2531fa9e4066Sahrens 2532fa9e4066Sahrens #pragma weak dmu_buf_refcount = dbuf_refcount 2533fa9e4066Sahrens uint64_t 2534fa9e4066Sahrens dbuf_refcount(dmu_buf_impl_t *db) 2535fa9e4066Sahrens { 2536fa9e4066Sahrens return (refcount_count(&db->db_holds)); 2537fa9e4066Sahrens } 2538fa9e4066Sahrens 2539fa9e4066Sahrens void * 2540bc9014e6SJustin Gibbs dmu_buf_replace_user(dmu_buf_t *db_fake, dmu_buf_user_t *old_user, 2541bc9014e6SJustin Gibbs dmu_buf_user_t *new_user) 2542fa9e4066Sahrens { 2543bc9014e6SJustin Gibbs dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; 2544bc9014e6SJustin Gibbs 2545bc9014e6SJustin Gibbs mutex_enter(&db->db_mtx); 2546bc9014e6SJustin Gibbs dbuf_verify_user(db, DBVU_NOT_EVICTING); 2547bc9014e6SJustin Gibbs if (db->db_user == old_user) 2548bc9014e6SJustin Gibbs db->db_user = new_user; 2549bc9014e6SJustin Gibbs else 2550bc9014e6SJustin Gibbs old_user = db->db_user; 2551bc9014e6SJustin Gibbs dbuf_verify_user(db, 
DBVU_NOT_EVICTING); 2552bc9014e6SJustin Gibbs mutex_exit(&db->db_mtx); 2553bc9014e6SJustin Gibbs 2554bc9014e6SJustin Gibbs return (old_user); 2555fa9e4066Sahrens } 2556fa9e4066Sahrens 2557fa9e4066Sahrens void * 2558bc9014e6SJustin Gibbs dmu_buf_set_user(dmu_buf_t *db_fake, dmu_buf_user_t *user) 2559bc9014e6SJustin Gibbs { 2560bc9014e6SJustin Gibbs return (dmu_buf_replace_user(db_fake, NULL, user)); 2561bc9014e6SJustin Gibbs } 2562bc9014e6SJustin Gibbs 2563bc9014e6SJustin Gibbs void * 2564bc9014e6SJustin Gibbs dmu_buf_set_user_ie(dmu_buf_t *db_fake, dmu_buf_user_t *user) 2565fa9e4066Sahrens { 2566fa9e4066Sahrens dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; 2567fa9e4066Sahrens 2568d2058105SJustin T. Gibbs db->db_user_immediate_evict = TRUE; 2569bc9014e6SJustin Gibbs return (dmu_buf_set_user(db_fake, user)); 2570fa9e4066Sahrens } 2571fa9e4066Sahrens 2572fa9e4066Sahrens void * 2573bc9014e6SJustin Gibbs dmu_buf_remove_user(dmu_buf_t *db_fake, dmu_buf_user_t *user) 2574fa9e4066Sahrens { 2575bc9014e6SJustin Gibbs return (dmu_buf_replace_user(db_fake, user, NULL)); 2576fa9e4066Sahrens } 2577fa9e4066Sahrens 2578fa9e4066Sahrens void * 2579fa9e4066Sahrens dmu_buf_get_user(dmu_buf_t *db_fake) 2580fa9e4066Sahrens { 2581fa9e4066Sahrens dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; 2582fa9e4066Sahrens 2583bc9014e6SJustin Gibbs dbuf_verify_user(db, DBVU_NOT_EVICTING); 2584bc9014e6SJustin Gibbs return (db->db_user); 2585bc9014e6SJustin Gibbs } 2586bc9014e6SJustin Gibbs 2587bc9014e6SJustin Gibbs void 2588bc9014e6SJustin Gibbs dmu_buf_user_evict_wait() 2589bc9014e6SJustin Gibbs { 2590bc9014e6SJustin Gibbs taskq_wait(dbu_evict_taskq); 2591fa9e4066Sahrens } 2592fa9e4066Sahrens 25933d692628SSanjeev Bagewadi boolean_t 25943d692628SSanjeev Bagewadi dmu_buf_freeable(dmu_buf_t *dbuf) 25953d692628SSanjeev Bagewadi { 25963d692628SSanjeev Bagewadi boolean_t res = B_FALSE; 25973d692628SSanjeev Bagewadi dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf; 25983d692628SSanjeev Bagewadi 
25993d692628SSanjeev Bagewadi if (db->db_blkptr) 26003d692628SSanjeev Bagewadi res = dsl_dataset_block_freeable(db->db_objset->os_dsl_dataset, 2601c7cd2421SGeorge Wilson db->db_blkptr, db->db_blkptr->blk_birth); 26023d692628SSanjeev Bagewadi 26033d692628SSanjeev Bagewadi return (res); 26043d692628SSanjeev Bagewadi } 26053d692628SSanjeev Bagewadi 260680901aeaSGeorge Wilson blkptr_t * 260780901aeaSGeorge Wilson dmu_buf_get_blkptr(dmu_buf_t *db) 260880901aeaSGeorge Wilson { 260980901aeaSGeorge Wilson dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db; 261080901aeaSGeorge Wilson return (dbi->db_blkptr); 261180901aeaSGeorge Wilson } 261280901aeaSGeorge Wilson 2613c717a561Smaybee static void 2614c717a561Smaybee dbuf_check_blkptr(dnode_t *dn, dmu_buf_impl_t *db) 2615fa9e4066Sahrens { 2616c717a561Smaybee /* ASSERT(dmu_tx_is_syncing(tx) */ 2617c717a561Smaybee ASSERT(MUTEX_HELD(&db->db_mtx)); 2618c717a561Smaybee 2619c717a561Smaybee if (db->db_blkptr != NULL) 2620c717a561Smaybee return; 2621c717a561Smaybee 26220a586ceaSMark Shellenbaum if (db->db_blkid == DMU_SPILL_BLKID) { 26230a586ceaSMark Shellenbaum db->db_blkptr = &dn->dn_phys->dn_spill; 26240a586ceaSMark Shellenbaum BP_ZERO(db->db_blkptr); 26250a586ceaSMark Shellenbaum return; 26260a586ceaSMark Shellenbaum } 2627c717a561Smaybee if (db->db_level == dn->dn_phys->dn_nlevels-1) { 2628c717a561Smaybee /* 2629c717a561Smaybee * This buffer was allocated at a time when there was 2630c717a561Smaybee * no available blkptrs from the dnode, or it was 2631c717a561Smaybee * inappropriate to hook it in (i.e., nlevels mis-match). 
2632c717a561Smaybee */ 2633c717a561Smaybee ASSERT(db->db_blkid < dn->dn_phys->dn_nblkptr); 2634c717a561Smaybee ASSERT(db->db_parent == NULL); 2635c717a561Smaybee db->db_parent = dn->dn_dbuf; 2636c717a561Smaybee db->db_blkptr = &dn->dn_phys->dn_blkptr[db->db_blkid]; 2637c717a561Smaybee DBUF_VERIFY(db); 2638c717a561Smaybee } else { 2639c717a561Smaybee dmu_buf_impl_t *parent = db->db_parent; 2640c717a561Smaybee int epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; 2641c717a561Smaybee 2642c717a561Smaybee ASSERT(dn->dn_phys->dn_nlevels > 1); 2643c717a561Smaybee if (parent == NULL) { 2644c717a561Smaybee mutex_exit(&db->db_mtx); 2645c717a561Smaybee rw_enter(&dn->dn_struct_rwlock, RW_READER); 2646a2cdcdd2SPaul Dagnelie parent = dbuf_hold_level(dn, db->db_level + 1, 2647a2cdcdd2SPaul Dagnelie db->db_blkid >> epbs, db); 2648c717a561Smaybee rw_exit(&dn->dn_struct_rwlock); 2649c717a561Smaybee mutex_enter(&db->db_mtx); 2650c717a561Smaybee db->db_parent = parent; 2651c717a561Smaybee } 2652c717a561Smaybee db->db_blkptr = (blkptr_t *)parent->db.db_data + 2653c717a561Smaybee (db->db_blkid & ((1ULL << epbs) - 1)); 2654c717a561Smaybee DBUF_VERIFY(db); 2655c717a561Smaybee } 2656c717a561Smaybee } 2657c717a561Smaybee 2658c717a561Smaybee static void 2659c717a561Smaybee dbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx) 2660c717a561Smaybee { 2661c717a561Smaybee dmu_buf_impl_t *db = dr->dr_dbuf; 2662744947dcSTom Erickson dnode_t *dn; 2663c717a561Smaybee zio_t *zio; 2664c717a561Smaybee 2665c717a561Smaybee ASSERT(dmu_tx_is_syncing(tx)); 2666c717a561Smaybee 2667c717a561Smaybee dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr); 2668c717a561Smaybee 2669c717a561Smaybee mutex_enter(&db->db_mtx); 2670c717a561Smaybee 2671c717a561Smaybee ASSERT(db->db_level > 0); 2672c717a561Smaybee DBUF_VERIFY(db); 2673c717a561Smaybee 26743e30c24aSWill Andrews /* Read the block if it hasn't been read yet. 
*/ 2675c717a561Smaybee if (db->db_buf == NULL) { 2676c717a561Smaybee mutex_exit(&db->db_mtx); 2677c717a561Smaybee (void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED); 2678c717a561Smaybee mutex_enter(&db->db_mtx); 2679c717a561Smaybee } 2680c717a561Smaybee ASSERT3U(db->db_state, ==, DB_CACHED); 2681c717a561Smaybee ASSERT(db->db_buf != NULL); 2682c717a561Smaybee 2683744947dcSTom Erickson DB_DNODE_ENTER(db); 2684744947dcSTom Erickson dn = DB_DNODE(db); 26853e30c24aSWill Andrews /* Indirect block size must match what the dnode thinks it is. */ 2686744947dcSTom Erickson ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift); 2687c717a561Smaybee dbuf_check_blkptr(dn, db); 2688744947dcSTom Erickson DB_DNODE_EXIT(db); 2689c717a561Smaybee 26903e30c24aSWill Andrews /* Provide the pending dirty record to child dbufs */ 2691c717a561Smaybee db->db_data_pending = dr; 2692c717a561Smaybee 2693af2c4821Smaybee mutex_exit(&db->db_mtx); 2694088f3894Sahrens dbuf_write(dr, db->db_buf, tx); 2695c717a561Smaybee 2696c717a561Smaybee zio = dr->dr_zio; 2697c717a561Smaybee mutex_enter(&dr->dt.di.dr_mtx); 269846e1baa6SMatthew Ahrens dbuf_sync_list(&dr->dt.di.dr_children, db->db_level - 1, tx); 2699c717a561Smaybee ASSERT(list_head(&dr->dt.di.dr_children) == NULL); 2700c717a561Smaybee mutex_exit(&dr->dt.di.dr_mtx); 2701c717a561Smaybee zio_nowait(zio); 2702c717a561Smaybee } 2703c717a561Smaybee 2704c717a561Smaybee static void 2705c717a561Smaybee dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx) 2706c717a561Smaybee { 2707c717a561Smaybee arc_buf_t **datap = &dr->dt.dl.dr_data; 2708c717a561Smaybee dmu_buf_impl_t *db = dr->dr_dbuf; 2709744947dcSTom Erickson dnode_t *dn; 2710744947dcSTom Erickson objset_t *os; 2711c717a561Smaybee uint64_t txg = tx->tx_txg; 2712fa9e4066Sahrens 2713fa9e4066Sahrens ASSERT(dmu_tx_is_syncing(tx)); 2714fa9e4066Sahrens 2715fa9e4066Sahrens dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr); 2716fa9e4066Sahrens 2717fa9e4066Sahrens mutex_enter(&db->db_mtx); 
2718fa9e4066Sahrens /* 2719fa9e4066Sahrens * To be synced, we must be dirtied. But we 2720fa9e4066Sahrens * might have been freed after the dirty. 2721fa9e4066Sahrens */ 2722fa9e4066Sahrens if (db->db_state == DB_UNCACHED) { 2723fa9e4066Sahrens /* This buffer has been freed since it was dirtied */ 2724fa9e4066Sahrens ASSERT(db->db.db_data == NULL); 2725fa9e4066Sahrens } else if (db->db_state == DB_FILL) { 2726fa9e4066Sahrens /* This buffer was freed and is now being re-filled */ 2727c717a561Smaybee ASSERT(db->db.db_data != dr->dt.dl.dr_data); 2728fa9e4066Sahrens } else { 272982c9918fSTim Haley ASSERT(db->db_state == DB_CACHED || db->db_state == DB_NOFILL); 2730fa9e4066Sahrens } 27319c9dc39aSek110237 DBUF_VERIFY(db); 2732fa9e4066Sahrens 2733744947dcSTom Erickson DB_DNODE_ENTER(db); 2734744947dcSTom Erickson dn = DB_DNODE(db); 2735744947dcSTom Erickson 27360a586ceaSMark Shellenbaum if (db->db_blkid == DMU_SPILL_BLKID) { 27370a586ceaSMark Shellenbaum mutex_enter(&dn->dn_mtx); 27380a586ceaSMark Shellenbaum dn->dn_phys->dn_flags |= DNODE_FLAG_SPILL_BLKPTR; 27390a586ceaSMark Shellenbaum mutex_exit(&dn->dn_mtx); 27400a586ceaSMark Shellenbaum } 27410a586ceaSMark Shellenbaum 2742fa9e4066Sahrens /* 2743c717a561Smaybee * If this is a bonus buffer, simply copy the bonus data into the 2744c717a561Smaybee * dnode. It will be written out when the dnode is synced (and it 2745c717a561Smaybee * will be synced, since it must have been dirty for dbuf_sync to 2746c717a561Smaybee * be called). 
2747fa9e4066Sahrens */ 27480a586ceaSMark Shellenbaum if (db->db_blkid == DMU_BONUS_BLKID) { 2749c717a561Smaybee dbuf_dirty_record_t **drp; 27501934e92fSmaybee 2751ea8dc4b6Seschrock ASSERT(*datap != NULL); 2752fb09f5aaSMadhav Suresh ASSERT0(db->db_level); 2753ea8dc4b6Seschrock ASSERT3U(dn->dn_phys->dn_bonuslen, <=, DN_MAX_BONUSLEN); 2754ea8dc4b6Seschrock bcopy(*datap, DN_BONUS(dn->dn_phys), dn->dn_phys->dn_bonuslen); 2755744947dcSTom Erickson DB_DNODE_EXIT(db); 2756744947dcSTom Erickson 27570e8c6158Smaybee if (*datap != db->db.db_data) { 2758ea8dc4b6Seschrock zio_buf_free(*datap, DN_MAX_BONUSLEN); 27595a98e54bSBrendan Gregg - Sun Microsystems arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER); 27600e8c6158Smaybee } 2761ea8dc4b6Seschrock db->db_data_pending = NULL; 2762c717a561Smaybee drp = &db->db_last_dirty; 2763c717a561Smaybee while (*drp != dr) 2764c717a561Smaybee drp = &(*drp)->dr_next; 276517f17c2dSbonwick ASSERT(dr->dr_next == NULL); 2766b24ab676SJeff Bonwick ASSERT(dr->dr_dbuf == db); 276717f17c2dSbonwick *drp = dr->dr_next; 2768c717a561Smaybee kmem_free(dr, sizeof (dbuf_dirty_record_t)); 2769ea8dc4b6Seschrock ASSERT(db->db_dirtycnt > 0); 2770ea8dc4b6Seschrock db->db_dirtycnt -= 1; 2771b24ab676SJeff Bonwick dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg); 2772ea8dc4b6Seschrock return; 2773ea8dc4b6Seschrock } 2774ea8dc4b6Seschrock 2775744947dcSTom Erickson os = dn->dn_objset; 2776744947dcSTom Erickson 2777c5c6ffa0Smaybee /* 2778f82bfe17Sgw25295 * This function may have dropped the db_mtx lock allowing a dmu_sync 2779f82bfe17Sgw25295 * operation to sneak in. As a result, we need to ensure that we 2780f82bfe17Sgw25295 * don't check the dr_override_state until we have returned from 2781f82bfe17Sgw25295 * dbuf_check_blkptr. 
2782f82bfe17Sgw25295 */ 2783f82bfe17Sgw25295 dbuf_check_blkptr(dn, db); 2784f82bfe17Sgw25295 2785f82bfe17Sgw25295 /* 2786744947dcSTom Erickson * If this buffer is in the middle of an immediate write, 2787c717a561Smaybee * wait for the synchronous IO to complete. 2788c5c6ffa0Smaybee */ 2789c717a561Smaybee while (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC) { 2790c5c6ffa0Smaybee ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT); 2791c5c6ffa0Smaybee cv_wait(&db->db_changed, &db->db_mtx); 2792c717a561Smaybee ASSERT(dr->dt.dl.dr_override_state != DR_NOT_OVERRIDDEN); 2793c5c6ffa0Smaybee } 2794c717a561Smaybee 2795ab69d62fSMatthew Ahrens if (db->db_state != DB_NOFILL && 2796ab69d62fSMatthew Ahrens dn->dn_object != DMU_META_DNODE_OBJECT && 2797ab69d62fSMatthew Ahrens refcount_count(&db->db_holds) > 1 && 2798b24ab676SJeff Bonwick dr->dt.dl.dr_override_state != DR_OVERRIDDEN && 2799ab69d62fSMatthew Ahrens *datap == db->db_buf) { 2800fa9e4066Sahrens /* 280182c9918fSTim Haley * If this buffer is currently "in use" (i.e., there 280282c9918fSTim Haley * are active holds and db_data still references it), 280382c9918fSTim Haley * then make a copy before we start the write so that 280482c9918fSTim Haley * any modifications from the open txg will not leak 280582c9918fSTim Haley * into this write. 2806fa9e4066Sahrens * 280782c9918fSTim Haley * NOTE: this copy does not need to be made for 280882c9918fSTim Haley * objects only modified in the syncing context (e.g. 280982c9918fSTim Haley * DNONE_DNODE blocks). 
2810fa9e4066Sahrens */ 2811ab69d62fSMatthew Ahrens int blksz = arc_buf_size(*datap); 2812ab69d62fSMatthew Ahrens arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); 2813ab69d62fSMatthew Ahrens *datap = arc_buf_alloc(os->os_spa, blksz, db, type); 2814c717a561Smaybee bcopy(db->db.db_data, (*datap)->b_data, blksz); 2815fa9e4066Sahrens } 2816c717a561Smaybee db->db_data_pending = dr; 2817fa9e4066Sahrens 2818fa9e4066Sahrens mutex_exit(&db->db_mtx); 2819fa9e4066Sahrens 2820088f3894Sahrens dbuf_write(dr, *datap, tx); 2821c717a561Smaybee 2822c717a561Smaybee ASSERT(!list_link_active(&dr->dr_dirty_node)); 2823744947dcSTom Erickson if (dn->dn_object == DMU_META_DNODE_OBJECT) { 2824c717a561Smaybee list_insert_tail(&dn->dn_dirty_records[txg&TXG_MASK], dr); 2825744947dcSTom Erickson DB_DNODE_EXIT(db); 2826744947dcSTom Erickson } else { 2827744947dcSTom Erickson /* 2828744947dcSTom Erickson * Although zio_nowait() does not "wait for an IO", it does 2829744947dcSTom Erickson * initiate the IO. If this is an empty write it seems plausible 2830744947dcSTom Erickson * that the IO could actually be completed before the nowait 2831744947dcSTom Erickson * returns. We need to DB_DNODE_EXIT() first in case 2832744947dcSTom Erickson * zio_nowait() invalidates the dbuf. 2833744947dcSTom Erickson */ 2834744947dcSTom Erickson DB_DNODE_EXIT(db); 2835c717a561Smaybee zio_nowait(dr->dr_zio); 2836fa9e4066Sahrens } 2837744947dcSTom Erickson } 2838c717a561Smaybee 2839c717a561Smaybee void 284046e1baa6SMatthew Ahrens dbuf_sync_list(list_t *list, int level, dmu_tx_t *tx) 2841c717a561Smaybee { 2842c717a561Smaybee dbuf_dirty_record_t *dr; 2843c717a561Smaybee 2844c717a561Smaybee while (dr = list_head(list)) { 2845c717a561Smaybee if (dr->dr_zio != NULL) { 2846c717a561Smaybee /* 2847c717a561Smaybee * If we find an already initialized zio then we 2848c717a561Smaybee * are processing the meta-dnode, and we have finished. 
2849c717a561Smaybee * The dbufs for all dnodes are put back on the list 2850c717a561Smaybee * during processing, so that we can zio_wait() 2851c717a561Smaybee * these IOs after initiating all child IOs. 2852c717a561Smaybee */ 2853c717a561Smaybee ASSERT3U(dr->dr_dbuf->db.db_object, ==, 2854c717a561Smaybee DMU_META_DNODE_OBJECT); 2855c717a561Smaybee break; 2856fa9e4066Sahrens } 285746e1baa6SMatthew Ahrens if (dr->dr_dbuf->db_blkid != DMU_BONUS_BLKID && 285846e1baa6SMatthew Ahrens dr->dr_dbuf->db_blkid != DMU_SPILL_BLKID) { 285946e1baa6SMatthew Ahrens VERIFY3U(dr->dr_dbuf->db_level, ==, level); 286046e1baa6SMatthew Ahrens } 2861c717a561Smaybee list_remove(list, dr); 2862c717a561Smaybee if (dr->dr_dbuf->db_level > 0) 2863c717a561Smaybee dbuf_sync_indirect(dr, tx); 2864c717a561Smaybee else 2865c717a561Smaybee dbuf_sync_leaf(dr, tx); 2866c717a561Smaybee } 2867c717a561Smaybee } 2868c717a561Smaybee 2869fa9e4066Sahrens /* ARGSUSED */ 2870fa9e4066Sahrens static void 2871c717a561Smaybee dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb) 2872fa9e4066Sahrens { 2873fa9e4066Sahrens dmu_buf_impl_t *db = vdb; 2874744947dcSTom Erickson dnode_t *dn; 2875e14bb325SJeff Bonwick blkptr_t *bp = zio->io_bp; 2876c717a561Smaybee blkptr_t *bp_orig = &zio->io_bp_orig; 2877b24ab676SJeff Bonwick spa_t *spa = zio->io_spa; 2878b24ab676SJeff Bonwick int64_t delta; 2879fa9e4066Sahrens uint64_t fill = 0; 2880b24ab676SJeff Bonwick int i; 2881fa9e4066Sahrens 2882*11ceac77SAlex Reece ASSERT3P(db->db_blkptr, !=, NULL); 2883*11ceac77SAlex Reece ASSERT3P(&db->db_data_pending->dr_bp_copy, ==, bp); 2884e14bb325SJeff Bonwick 2885744947dcSTom Erickson DB_DNODE_ENTER(db); 2886744947dcSTom Erickson dn = DB_DNODE(db); 2887b24ab676SJeff Bonwick delta = bp_get_dsize_sync(spa, bp) - bp_get_dsize_sync(spa, bp_orig); 2888b24ab676SJeff Bonwick dnode_diduse_space(dn, delta - zio->io_prev_space_delta); 2889b24ab676SJeff Bonwick zio->io_prev_space_delta = delta; 2890fa9e4066Sahrens 289143466aaeSMax Grossman if 
(bp->blk_birth != 0) { 28920a586ceaSMark Shellenbaum ASSERT((db->db_blkid != DMU_SPILL_BLKID && 28930a586ceaSMark Shellenbaum BP_GET_TYPE(bp) == dn->dn_type) || 28940a586ceaSMark Shellenbaum (db->db_blkid == DMU_SPILL_BLKID && 28955d7b4d43SMatthew Ahrens BP_GET_TYPE(bp) == dn->dn_bonustype) || 28965d7b4d43SMatthew Ahrens BP_IS_EMBEDDED(bp)); 2897e14bb325SJeff Bonwick ASSERT(BP_GET_LEVEL(bp) == db->db_level); 289843466aaeSMax Grossman } 2899e14bb325SJeff Bonwick 2900fa9e4066Sahrens mutex_enter(&db->db_mtx); 2901fa9e4066Sahrens 29020a586ceaSMark Shellenbaum #ifdef ZFS_DEBUG 29030a586ceaSMark Shellenbaum if (db->db_blkid == DMU_SPILL_BLKID) { 29040a586ceaSMark Shellenbaum ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR); 2905*11ceac77SAlex Reece ASSERT(!(BP_IS_HOLE(bp)) && 29060a586ceaSMark Shellenbaum db->db_blkptr == &dn->dn_phys->dn_spill); 29070a586ceaSMark Shellenbaum } 29080a586ceaSMark Shellenbaum #endif 29090a586ceaSMark Shellenbaum 2910fa9e4066Sahrens if (db->db_level == 0) { 2911fa9e4066Sahrens mutex_enter(&dn->dn_mtx); 29120a586ceaSMark Shellenbaum if (db->db_blkid > dn->dn_phys->dn_maxblkid && 29130a586ceaSMark Shellenbaum db->db_blkid != DMU_SPILL_BLKID) 2914fa9e4066Sahrens dn->dn_phys->dn_maxblkid = db->db_blkid; 2915fa9e4066Sahrens mutex_exit(&dn->dn_mtx); 2916fa9e4066Sahrens 2917fa9e4066Sahrens if (dn->dn_type == DMU_OT_DNODE) { 2918fa9e4066Sahrens dnode_phys_t *dnp = db->db.db_data; 2919fa9e4066Sahrens for (i = db->db.db_size >> DNODE_SHIFT; i > 0; 2920fa9e4066Sahrens i--, dnp++) { 2921fa9e4066Sahrens if (dnp->dn_type != DMU_OT_NONE) 2922fa9e4066Sahrens fill++; 2923fa9e4066Sahrens } 2924fa9e4066Sahrens } else { 292543466aaeSMax Grossman if (BP_IS_HOLE(bp)) { 292643466aaeSMax Grossman fill = 0; 292743466aaeSMax Grossman } else { 2928fa9e4066Sahrens fill = 1; 2929fa9e4066Sahrens } 293043466aaeSMax Grossman } 2931fa9e4066Sahrens } else { 2932e14bb325SJeff Bonwick blkptr_t *ibp = db->db.db_data; 2933fa9e4066Sahrens ASSERT3U(db->db.db_size, ==, 
1<<dn->dn_phys->dn_indblkshift); 2934e14bb325SJeff Bonwick for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, ibp++) { 2935e14bb325SJeff Bonwick if (BP_IS_HOLE(ibp)) 2936fa9e4066Sahrens continue; 29375d7b4d43SMatthew Ahrens fill += BP_GET_FILL(ibp); 2938fa9e4066Sahrens } 2939fa9e4066Sahrens } 2940744947dcSTom Erickson DB_DNODE_EXIT(db); 2941fa9e4066Sahrens 29425d7b4d43SMatthew Ahrens if (!BP_IS_EMBEDDED(bp)) 2943e14bb325SJeff Bonwick bp->blk_fill = fill; 2944fa9e4066Sahrens 2945fa9e4066Sahrens mutex_exit(&db->db_mtx); 2946*11ceac77SAlex Reece 2947*11ceac77SAlex Reece rw_enter(&dn->dn_struct_rwlock, RW_WRITER); 2948*11ceac77SAlex Reece *db->db_blkptr = *bp; 2949*11ceac77SAlex Reece rw_exit(&dn->dn_struct_rwlock); 2950fa9e4066Sahrens } 2951fa9e4066Sahrens 295269962b56SMatthew Ahrens /* 295369962b56SMatthew Ahrens * The SPA will call this callback several times for each zio - once 295469962b56SMatthew Ahrens * for every physical child i/o (zio->io_phys_children times). This 295569962b56SMatthew Ahrens * allows the DMU to monitor the progress of each logical i/o. For example, 295669962b56SMatthew Ahrens * there may be 2 copies of an indirect block, or many fragments of a RAID-Z 295769962b56SMatthew Ahrens * block. There may be a long delay before all copies/fragments are completed, 295869962b56SMatthew Ahrens * so this callback allows us to retire dirty space gradually, as the physical 295969962b56SMatthew Ahrens * i/os complete. 
296069962b56SMatthew Ahrens */ 296169962b56SMatthew Ahrens /* ARGSUSED */ 296269962b56SMatthew Ahrens static void 296369962b56SMatthew Ahrens dbuf_write_physdone(zio_t *zio, arc_buf_t *buf, void *arg) 296469962b56SMatthew Ahrens { 296569962b56SMatthew Ahrens dmu_buf_impl_t *db = arg; 296669962b56SMatthew Ahrens objset_t *os = db->db_objset; 296769962b56SMatthew Ahrens dsl_pool_t *dp = dmu_objset_pool(os); 296869962b56SMatthew Ahrens dbuf_dirty_record_t *dr; 296969962b56SMatthew Ahrens int delta = 0; 297069962b56SMatthew Ahrens 297169962b56SMatthew Ahrens dr = db->db_data_pending; 297269962b56SMatthew Ahrens ASSERT3U(dr->dr_txg, ==, zio->io_txg); 297369962b56SMatthew Ahrens 297469962b56SMatthew Ahrens /* 297569962b56SMatthew Ahrens * The callback will be called io_phys_children times. Retire one 297669962b56SMatthew Ahrens * portion of our dirty space each time we are called. Any rounding 297769962b56SMatthew Ahrens * error will be cleaned up by dsl_pool_sync()'s call to 297869962b56SMatthew Ahrens * dsl_pool_undirty_space(). 
297969962b56SMatthew Ahrens */ 298069962b56SMatthew Ahrens delta = dr->dr_accounted / zio->io_phys_children; 298169962b56SMatthew Ahrens dsl_pool_undirty_space(dp, delta, zio->io_txg); 298269962b56SMatthew Ahrens } 298369962b56SMatthew Ahrens 2984c717a561Smaybee /* ARGSUSED */ 2985c717a561Smaybee static void 2986c717a561Smaybee dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb) 2987c717a561Smaybee { 2988c717a561Smaybee dmu_buf_impl_t *db = vdb; 2989b24ab676SJeff Bonwick blkptr_t *bp_orig = &zio->io_bp_orig; 299043466aaeSMax Grossman blkptr_t *bp = db->db_blkptr; 299143466aaeSMax Grossman objset_t *os = db->db_objset; 299243466aaeSMax Grossman dmu_tx_t *tx = os->os_synctx; 2993c717a561Smaybee dbuf_dirty_record_t **drp, *dr; 2994c717a561Smaybee 2995fb09f5aaSMadhav Suresh ASSERT0(zio->io_error); 2996b24ab676SJeff Bonwick ASSERT(db->db_blkptr == bp); 2997b24ab676SJeff Bonwick 299880901aeaSGeorge Wilson /* 299980901aeaSGeorge Wilson * For nopwrites and rewrites we ensure that the bp matches our 300080901aeaSGeorge Wilson * original and bypass all the accounting. 
300180901aeaSGeorge Wilson */ 300280901aeaSGeorge Wilson if (zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE)) { 3003b24ab676SJeff Bonwick ASSERT(BP_EQUAL(bp, bp_orig)); 3004b24ab676SJeff Bonwick } else { 300543466aaeSMax Grossman dsl_dataset_t *ds = os->os_dsl_dataset; 3006b24ab676SJeff Bonwick (void) dsl_dataset_block_kill(ds, bp_orig, tx, B_TRUE); 3007b24ab676SJeff Bonwick dsl_dataset_block_born(ds, bp, tx); 3008b24ab676SJeff Bonwick } 3009c717a561Smaybee 3010c717a561Smaybee mutex_enter(&db->db_mtx); 3011c717a561Smaybee 3012b24ab676SJeff Bonwick DBUF_VERIFY(db); 3013b24ab676SJeff Bonwick 3014c717a561Smaybee drp = &db->db_last_dirty; 301517f17c2dSbonwick while ((dr = *drp) != db->db_data_pending) 301617f17c2dSbonwick drp = &dr->dr_next; 301717f17c2dSbonwick ASSERT(!list_link_active(&dr->dr_dirty_node)); 3018b24ab676SJeff Bonwick ASSERT(dr->dr_dbuf == db); 301917f17c2dSbonwick ASSERT(dr->dr_next == NULL); 302017f17c2dSbonwick *drp = dr->dr_next; 3021c717a561Smaybee 30220a586ceaSMark Shellenbaum #ifdef ZFS_DEBUG 30230a586ceaSMark Shellenbaum if (db->db_blkid == DMU_SPILL_BLKID) { 3024744947dcSTom Erickson dnode_t *dn; 3025744947dcSTom Erickson 3026744947dcSTom Erickson DB_DNODE_ENTER(db); 3027744947dcSTom Erickson dn = DB_DNODE(db); 30280a586ceaSMark Shellenbaum ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR); 30290a586ceaSMark Shellenbaum ASSERT(!(BP_IS_HOLE(db->db_blkptr)) && 30300a586ceaSMark Shellenbaum db->db_blkptr == &dn->dn_phys->dn_spill); 3031744947dcSTom Erickson DB_DNODE_EXIT(db); 30320a586ceaSMark Shellenbaum } 30330a586ceaSMark Shellenbaum #endif 30340a586ceaSMark Shellenbaum 3035c717a561Smaybee if (db->db_level == 0) { 30360a586ceaSMark Shellenbaum ASSERT(db->db_blkid != DMU_BONUS_BLKID); 3037c717a561Smaybee ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN); 303882c9918fSTim Haley if (db->db_state != DB_NOFILL) { 3039c717a561Smaybee if (dr->dt.dl.dr_data != db->db_buf) 304082c9918fSTim Haley 
VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data, 30413b2aab18SMatthew Ahrens db)); 3042b24ab676SJeff Bonwick else if (!arc_released(db->db_buf)) 3043c717a561Smaybee arc_set_callback(db->db_buf, dbuf_do_evict, db); 304482c9918fSTim Haley } 3045c717a561Smaybee } else { 3046744947dcSTom Erickson dnode_t *dn; 3047744947dcSTom Erickson 3048744947dcSTom Erickson DB_DNODE_ENTER(db); 3049744947dcSTom Erickson dn = DB_DNODE(db); 3050c717a561Smaybee ASSERT(list_head(&dr->dt.di.dr_children) == NULL); 3051c717a561Smaybee ASSERT3U(db->db.db_size, ==, 1 << dn->dn_phys->dn_indblkshift); 3052c717a561Smaybee if (!BP_IS_HOLE(db->db_blkptr)) { 3053c717a561Smaybee int epbs = 3054c717a561Smaybee dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; 305543466aaeSMax Grossman ASSERT3U(db->db_blkid, <=, 305643466aaeSMax Grossman dn->dn_phys->dn_maxblkid >> (db->db_level * epbs)); 3057c717a561Smaybee ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==, 3058c717a561Smaybee db->db.db_size); 30595d7b4d43SMatthew Ahrens if (!arc_released(db->db_buf)) 3060c717a561Smaybee arc_set_callback(db->db_buf, dbuf_do_evict, db); 3061c717a561Smaybee } 3062744947dcSTom Erickson DB_DNODE_EXIT(db); 3063c25056deSgw25295 mutex_destroy(&dr->dt.di.dr_mtx); 3064c25056deSgw25295 list_destroy(&dr->dt.di.dr_children); 3065c717a561Smaybee } 3066c717a561Smaybee kmem_free(dr, sizeof (dbuf_dirty_record_t)); 3067c717a561Smaybee 3068c717a561Smaybee cv_broadcast(&db->db_changed); 3069c717a561Smaybee ASSERT(db->db_dirtycnt > 0); 3070c717a561Smaybee db->db_dirtycnt -= 1; 3071c717a561Smaybee db->db_data_pending = NULL; 307243466aaeSMax Grossman dbuf_rele_and_unlock(db, (void *)(uintptr_t)tx->tx_txg); 3073b24ab676SJeff Bonwick } 3074b24ab676SJeff Bonwick 3075b24ab676SJeff Bonwick static void 3076b24ab676SJeff Bonwick dbuf_write_nofill_ready(zio_t *zio) 3077b24ab676SJeff Bonwick { 3078b24ab676SJeff Bonwick dbuf_write_ready(zio, NULL, zio->io_private); 3079b24ab676SJeff Bonwick } 3080b24ab676SJeff Bonwick 3081b24ab676SJeff Bonwick static void 
3082b24ab676SJeff Bonwick dbuf_write_nofill_done(zio_t *zio) 3083b24ab676SJeff Bonwick { 3084b24ab676SJeff Bonwick dbuf_write_done(zio, NULL, zio->io_private); 3085b24ab676SJeff Bonwick } 3086b24ab676SJeff Bonwick 3087b24ab676SJeff Bonwick static void 3088b24ab676SJeff Bonwick dbuf_write_override_ready(zio_t *zio) 3089b24ab676SJeff Bonwick { 3090b24ab676SJeff Bonwick dbuf_dirty_record_t *dr = zio->io_private; 3091b24ab676SJeff Bonwick dmu_buf_impl_t *db = dr->dr_dbuf; 3092b24ab676SJeff Bonwick 3093b24ab676SJeff Bonwick dbuf_write_ready(zio, NULL, db); 3094b24ab676SJeff Bonwick } 3095b24ab676SJeff Bonwick 3096b24ab676SJeff Bonwick static void 3097b24ab676SJeff Bonwick dbuf_write_override_done(zio_t *zio) 3098b24ab676SJeff Bonwick { 3099b24ab676SJeff Bonwick dbuf_dirty_record_t *dr = zio->io_private; 3100b24ab676SJeff Bonwick dmu_buf_impl_t *db = dr->dr_dbuf; 3101b24ab676SJeff Bonwick blkptr_t *obp = &dr->dt.dl.dr_overridden_by; 3102b24ab676SJeff Bonwick 3103b24ab676SJeff Bonwick mutex_enter(&db->db_mtx); 3104b24ab676SJeff Bonwick if (!BP_EQUAL(zio->io_bp, obp)) { 3105b24ab676SJeff Bonwick if (!BP_IS_HOLE(obp)) 3106b24ab676SJeff Bonwick dsl_free(spa_get_dsl(zio->io_spa), zio->io_txg, obp); 3107b24ab676SJeff Bonwick arc_release(dr->dt.dl.dr_data, db); 3108b24ab676SJeff Bonwick } 3109c717a561Smaybee mutex_exit(&db->db_mtx); 3110c717a561Smaybee 3111b24ab676SJeff Bonwick dbuf_write_done(zio, NULL, db); 3112b24ab676SJeff Bonwick } 3113c717a561Smaybee 31143e30c24aSWill Andrews /* Issue I/O to commit a dirty buffer to disk. 
*/ 3115b24ab676SJeff Bonwick static void 3116b24ab676SJeff Bonwick dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx) 3117b24ab676SJeff Bonwick { 3118b24ab676SJeff Bonwick dmu_buf_impl_t *db = dr->dr_dbuf; 3119744947dcSTom Erickson dnode_t *dn; 3120744947dcSTom Erickson objset_t *os; 3121b24ab676SJeff Bonwick dmu_buf_impl_t *parent = db->db_parent; 3122b24ab676SJeff Bonwick uint64_t txg = tx->tx_txg; 31237802d7bfSMatthew Ahrens zbookmark_phys_t zb; 3124b24ab676SJeff Bonwick zio_prop_t zp; 3125b24ab676SJeff Bonwick zio_t *zio; 31260a586ceaSMark Shellenbaum int wp_flag = 0; 3127b24ab676SJeff Bonwick 3128*11ceac77SAlex Reece ASSERT(dmu_tx_is_syncing(tx)); 3129*11ceac77SAlex Reece 3130744947dcSTom Erickson DB_DNODE_ENTER(db); 3131744947dcSTom Erickson dn = DB_DNODE(db); 3132744947dcSTom Erickson os = dn->dn_objset; 3133744947dcSTom Erickson 3134b24ab676SJeff Bonwick if (db->db_state != DB_NOFILL) { 3135b24ab676SJeff Bonwick if (db->db_level > 0 || dn->dn_type == DMU_OT_DNODE) { 3136b24ab676SJeff Bonwick /* 3137b24ab676SJeff Bonwick * Private object buffers are released here rather 3138b24ab676SJeff Bonwick * than in dbuf_dirty() since they are only modified 3139b24ab676SJeff Bonwick * in the syncing context and we don't want the 3140b24ab676SJeff Bonwick * overhead of making multiple copies of the data. 3141b24ab676SJeff Bonwick */ 3142b24ab676SJeff Bonwick if (BP_IS_HOLE(db->db_blkptr)) { 3143b24ab676SJeff Bonwick arc_buf_thaw(data); 3144b24ab676SJeff Bonwick } else { 31453f9d6ad7SLin Ling dbuf_release_bp(db); 3146b24ab676SJeff Bonwick } 3147b24ab676SJeff Bonwick } 3148b24ab676SJeff Bonwick } 3149b24ab676SJeff Bonwick 3150b24ab676SJeff Bonwick if (parent != dn->dn_dbuf) { 31513e30c24aSWill Andrews /* Our parent is an indirect block. */ 31523e30c24aSWill Andrews /* We have a dirty parent that has been scheduled for write. 
*/ 3153b24ab676SJeff Bonwick ASSERT(parent && parent->db_data_pending); 31543e30c24aSWill Andrews /* Our parent's buffer is one level closer to the dnode. */ 3155b24ab676SJeff Bonwick ASSERT(db->db_level == parent->db_level-1); 31563e30c24aSWill Andrews /* 31573e30c24aSWill Andrews * We're about to modify our parent's db_data by modifying 31583e30c24aSWill Andrews * our block pointer, so the parent must be released. 31593e30c24aSWill Andrews */ 3160b24ab676SJeff Bonwick ASSERT(arc_released(parent->db_buf)); 3161b24ab676SJeff Bonwick zio = parent->db_data_pending->dr_zio; 3162b24ab676SJeff Bonwick } else { 31633e30c24aSWill Andrews /* Our parent is the dnode itself. */ 31640a586ceaSMark Shellenbaum ASSERT((db->db_level == dn->dn_phys->dn_nlevels-1 && 31650a586ceaSMark Shellenbaum db->db_blkid != DMU_SPILL_BLKID) || 31660a586ceaSMark Shellenbaum (db->db_blkid == DMU_SPILL_BLKID && db->db_level == 0)); 31670a586ceaSMark Shellenbaum if (db->db_blkid != DMU_SPILL_BLKID) 3168b24ab676SJeff Bonwick ASSERT3P(db->db_blkptr, ==, 3169b24ab676SJeff Bonwick &dn->dn_phys->dn_blkptr[db->db_blkid]); 3170b24ab676SJeff Bonwick zio = dn->dn_zio; 3171b24ab676SJeff Bonwick } 3172b24ab676SJeff Bonwick 3173b24ab676SJeff Bonwick ASSERT(db->db_level == 0 || data == db->db_buf); 3174b24ab676SJeff Bonwick ASSERT3U(db->db_blkptr->blk_birth, <=, txg); 3175b24ab676SJeff Bonwick ASSERT(zio); 3176b24ab676SJeff Bonwick 3177b24ab676SJeff Bonwick SET_BOOKMARK(&zb, os->os_dsl_dataset ? 3178b24ab676SJeff Bonwick os->os_dsl_dataset->ds_object : DMU_META_OBJSET, 3179b24ab676SJeff Bonwick db->db.db_object, db->db_level, db->db_blkid); 3180b24ab676SJeff Bonwick 31810a586ceaSMark Shellenbaum if (db->db_blkid == DMU_SPILL_BLKID) 31820a586ceaSMark Shellenbaum wp_flag = WP_SPILL; 31830a586ceaSMark Shellenbaum wp_flag |= (db->db_state == DB_NOFILL) ? 
WP_NOFILL : 0; 31840a586ceaSMark Shellenbaum 31850a586ceaSMark Shellenbaum dmu_write_policy(os, dn, db->db_level, wp_flag, &zp); 3186744947dcSTom Erickson DB_DNODE_EXIT(db); 3187b24ab676SJeff Bonwick 3188*11ceac77SAlex Reece /* 3189*11ceac77SAlex Reece * We copy the blkptr now (rather than when we instantiate the dirty 3190*11ceac77SAlex Reece * record), because its value can change between open context and 3191*11ceac77SAlex Reece * syncing context. We do not need to hold dn_struct_rwlock to read 3192*11ceac77SAlex Reece * db_blkptr because we are in syncing context. 3193*11ceac77SAlex Reece */ 3194*11ceac77SAlex Reece dr->dr_bp_copy = *db->db_blkptr; 3195*11ceac77SAlex Reece 31965d7b4d43SMatthew Ahrens if (db->db_level == 0 && 31975d7b4d43SMatthew Ahrens dr->dt.dl.dr_override_state == DR_OVERRIDDEN) { 31985d7b4d43SMatthew Ahrens /* 31995d7b4d43SMatthew Ahrens * The BP for this block has been provided by open context 32005d7b4d43SMatthew Ahrens * (by dmu_sync() or dmu_buf_write_embedded()). 32015d7b4d43SMatthew Ahrens */ 32025d7b4d43SMatthew Ahrens void *contents = (data != NULL) ? 
data->b_data : NULL; 32035d7b4d43SMatthew Ahrens 3204b24ab676SJeff Bonwick dr->dr_zio = zio_write(zio, os->os_spa, txg, 3205*11ceac77SAlex Reece &dr->dr_bp_copy, contents, db->db.db_size, &zp, 320669962b56SMatthew Ahrens dbuf_write_override_ready, NULL, dbuf_write_override_done, 320769962b56SMatthew Ahrens dr, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb); 3208b24ab676SJeff Bonwick mutex_enter(&db->db_mtx); 3209b24ab676SJeff Bonwick dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN; 3210b24ab676SJeff Bonwick zio_write_override(dr->dr_zio, &dr->dt.dl.dr_overridden_by, 321180901aeaSGeorge Wilson dr->dt.dl.dr_copies, dr->dt.dl.dr_nopwrite); 3212b24ab676SJeff Bonwick mutex_exit(&db->db_mtx); 3213b24ab676SJeff Bonwick } else if (db->db_state == DB_NOFILL) { 3214810e43b2SBill Pijewski ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF || 3215810e43b2SBill Pijewski zp.zp_checksum == ZIO_CHECKSUM_NOPARITY); 3216b24ab676SJeff Bonwick dr->dr_zio = zio_write(zio, os->os_spa, txg, 3217*11ceac77SAlex Reece &dr->dr_bp_copy, NULL, db->db.db_size, &zp, 321869962b56SMatthew Ahrens dbuf_write_nofill_ready, NULL, dbuf_write_nofill_done, db, 3219b24ab676SJeff Bonwick ZIO_PRIORITY_ASYNC_WRITE, 3220b24ab676SJeff Bonwick ZIO_FLAG_MUSTSUCCEED | ZIO_FLAG_NODATA, &zb); 3221b24ab676SJeff Bonwick } else { 3222b24ab676SJeff Bonwick ASSERT(arc_released(data)); 3223b24ab676SJeff Bonwick dr->dr_zio = arc_write(zio, os->os_spa, txg, 3224*11ceac77SAlex Reece &dr->dr_bp_copy, data, DBUF_IS_L2CACHEABLE(db), 3225aad02571SSaso Kiselkov DBUF_IS_L2COMPRESSIBLE(db), &zp, dbuf_write_ready, 322669962b56SMatthew Ahrens dbuf_write_physdone, dbuf_write_done, db, 322769962b56SMatthew Ahrens ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb); 3228b24ab676SJeff Bonwick } 3229fa9e4066Sahrens } 3230