1*fa9e4066Sahrens /* 2*fa9e4066Sahrens * CDDL HEADER START 3*fa9e4066Sahrens * 4*fa9e4066Sahrens * The contents of this file are subject to the terms of the 5*fa9e4066Sahrens * Common Development and Distribution License, Version 1.0 only 6*fa9e4066Sahrens * (the "License"). You may not use this file except in compliance 7*fa9e4066Sahrens * with the License. 8*fa9e4066Sahrens * 9*fa9e4066Sahrens * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 10*fa9e4066Sahrens * or http://www.opensolaris.org/os/licensing. 11*fa9e4066Sahrens * See the License for the specific language governing permissions 12*fa9e4066Sahrens * and limitations under the License. 13*fa9e4066Sahrens * 14*fa9e4066Sahrens * When distributing Covered Code, include this CDDL HEADER in each 15*fa9e4066Sahrens * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 16*fa9e4066Sahrens * If applicable, add the following below this CDDL HEADER, with the 17*fa9e4066Sahrens * fields enclosed by brackets "[]" replaced with your own identifying 18*fa9e4066Sahrens * information: Portions Copyright [yyyy] [name of copyright owner] 19*fa9e4066Sahrens * 20*fa9e4066Sahrens * CDDL HEADER END 21*fa9e4066Sahrens */ 22*fa9e4066Sahrens /* 23*fa9e4066Sahrens * Copyright 2005 Sun Microsystems, Inc. All rights reserved. 24*fa9e4066Sahrens * Use is subject to license terms. 
25*fa9e4066Sahrens */ 26*fa9e4066Sahrens 27*fa9e4066Sahrens #pragma ident "%Z%%M% %I% %E% SMI" 28*fa9e4066Sahrens 29*fa9e4066Sahrens #include <sys/zfs_context.h> 30*fa9e4066Sahrens #include <sys/dmu.h> 31*fa9e4066Sahrens #include <sys/dmu_impl.h> 32*fa9e4066Sahrens #include <sys/dbuf.h> 33*fa9e4066Sahrens #include <sys/dmu_objset.h> 34*fa9e4066Sahrens #include <sys/dsl_dataset.h> 35*fa9e4066Sahrens #include <sys/dsl_dir.h> 36*fa9e4066Sahrens #include <sys/dmu_tx.h> 37*fa9e4066Sahrens #include <sys/spa.h> 38*fa9e4066Sahrens #include <sys/zio.h> 39*fa9e4066Sahrens #include <sys/dmu_zfetch.h> 40*fa9e4066Sahrens 41*fa9e4066Sahrens static void dbuf_destroy(dmu_buf_impl_t *db); 42*fa9e4066Sahrens static void dbuf_verify(dmu_buf_impl_t *db); 43*fa9e4066Sahrens static void dbuf_evict_user(dmu_buf_impl_t *db); 44*fa9e4066Sahrens static int dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx); 45*fa9e4066Sahrens static arc_done_func_t dbuf_read_done; 46*fa9e4066Sahrens static arc_done_func_t dbuf_write_done; 47*fa9e4066Sahrens 48*fa9e4066Sahrens /* 49*fa9e4066Sahrens * Global data structures and functions for the dbuf cache. 
50*fa9e4066Sahrens */ 51*fa9e4066Sahrens taskq_t *dbuf_tq; 52*fa9e4066Sahrens static kmem_cache_t *dbuf_cache; 53*fa9e4066Sahrens 54*fa9e4066Sahrens /* ARGSUSED */ 55*fa9e4066Sahrens static int 56*fa9e4066Sahrens dbuf_cons(void *vdb, void *unused, int kmflag) 57*fa9e4066Sahrens { 58*fa9e4066Sahrens dmu_buf_impl_t *db = vdb; 59*fa9e4066Sahrens bzero(db, sizeof (dmu_buf_impl_t)); 60*fa9e4066Sahrens 61*fa9e4066Sahrens mutex_init(&db->db_mtx, NULL, MUTEX_DEFAULT, NULL); 62*fa9e4066Sahrens cv_init(&db->db_changed, NULL, CV_DEFAULT, NULL); 63*fa9e4066Sahrens refcount_create(&db->db_holds); 64*fa9e4066Sahrens return (0); 65*fa9e4066Sahrens } 66*fa9e4066Sahrens 67*fa9e4066Sahrens /* ARGSUSED */ 68*fa9e4066Sahrens static void 69*fa9e4066Sahrens dbuf_dest(void *vdb, void *unused) 70*fa9e4066Sahrens { 71*fa9e4066Sahrens dmu_buf_impl_t *db = vdb; 72*fa9e4066Sahrens mutex_destroy(&db->db_mtx); 73*fa9e4066Sahrens cv_destroy(&db->db_changed); 74*fa9e4066Sahrens refcount_destroy(&db->db_holds); 75*fa9e4066Sahrens } 76*fa9e4066Sahrens 77*fa9e4066Sahrens /* 78*fa9e4066Sahrens * dbuf hash table routines 79*fa9e4066Sahrens */ 80*fa9e4066Sahrens static dbuf_hash_table_t dbuf_hash_table; 81*fa9e4066Sahrens 82*fa9e4066Sahrens static uint64_t dbuf_hash_count; 83*fa9e4066Sahrens 84*fa9e4066Sahrens static uint64_t 85*fa9e4066Sahrens dbuf_hash(void *os, uint64_t obj, uint8_t lvl, uint64_t blkid) 86*fa9e4066Sahrens { 87*fa9e4066Sahrens uintptr_t osv = (uintptr_t)os; 88*fa9e4066Sahrens uint64_t crc = -1ULL; 89*fa9e4066Sahrens 90*fa9e4066Sahrens ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY); 91*fa9e4066Sahrens crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (lvl)) & 0xFF]; 92*fa9e4066Sahrens crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (osv >> 6)) & 0xFF]; 93*fa9e4066Sahrens crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 0)) & 0xFF]; 94*fa9e4066Sahrens crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 8)) & 0xFF]; 95*fa9e4066Sahrens crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 0)) & 0xFF]; 
96*fa9e4066Sahrens crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 8)) & 0xFF]; 97*fa9e4066Sahrens 98*fa9e4066Sahrens crc ^= (osv>>14) ^ (obj>>16) ^ (blkid>>16); 99*fa9e4066Sahrens 100*fa9e4066Sahrens return (crc); 101*fa9e4066Sahrens } 102*fa9e4066Sahrens 103*fa9e4066Sahrens #define DBUF_HASH(os, obj, level, blkid) dbuf_hash(os, obj, level, blkid); 104*fa9e4066Sahrens 105*fa9e4066Sahrens #define DBUF_EQUAL(dbuf, os, obj, level, blkid) \ 106*fa9e4066Sahrens ((dbuf)->db.db_object == (obj) && \ 107*fa9e4066Sahrens (dbuf)->db_objset == (os) && \ 108*fa9e4066Sahrens (dbuf)->db_level == (level) && \ 109*fa9e4066Sahrens (dbuf)->db_blkid == (blkid)) 110*fa9e4066Sahrens 111*fa9e4066Sahrens dmu_buf_impl_t * 112*fa9e4066Sahrens dbuf_find(dnode_t *dn, uint8_t level, uint64_t blkid) 113*fa9e4066Sahrens { 114*fa9e4066Sahrens dbuf_hash_table_t *h = &dbuf_hash_table; 115*fa9e4066Sahrens objset_impl_t *os = dn->dn_objset; 116*fa9e4066Sahrens uint64_t obj = dn->dn_object; 117*fa9e4066Sahrens uint64_t hv = DBUF_HASH(os, obj, level, blkid); 118*fa9e4066Sahrens uint64_t idx = hv & h->hash_table_mask; 119*fa9e4066Sahrens dmu_buf_impl_t *db; 120*fa9e4066Sahrens 121*fa9e4066Sahrens mutex_enter(DBUF_HASH_MUTEX(h, idx)); 122*fa9e4066Sahrens for (db = h->hash_table[idx]; db != NULL; db = db->db_hash_next) { 123*fa9e4066Sahrens if (DBUF_EQUAL(db, os, obj, level, blkid)) { 124*fa9e4066Sahrens mutex_enter(&db->db_mtx); 125*fa9e4066Sahrens if (!refcount_is_zero(&db->db_holds)) { 126*fa9e4066Sahrens mutex_exit(DBUF_HASH_MUTEX(h, idx)); 127*fa9e4066Sahrens return (db); 128*fa9e4066Sahrens } 129*fa9e4066Sahrens mutex_exit(&db->db_mtx); 130*fa9e4066Sahrens } 131*fa9e4066Sahrens } 132*fa9e4066Sahrens mutex_exit(DBUF_HASH_MUTEX(h, idx)); 133*fa9e4066Sahrens return (NULL); 134*fa9e4066Sahrens } 135*fa9e4066Sahrens 136*fa9e4066Sahrens /* 137*fa9e4066Sahrens * Insert an entry into the hash table. 
If there is already an element 138*fa9e4066Sahrens * equal to elem in the hash table, then the already existing element 139*fa9e4066Sahrens * will be returned and the new element will not be inserted. 140*fa9e4066Sahrens * Otherwise returns NULL. 141*fa9e4066Sahrens */ 142*fa9e4066Sahrens static dmu_buf_impl_t * 143*fa9e4066Sahrens dbuf_hash_insert(dmu_buf_impl_t *db) 144*fa9e4066Sahrens { 145*fa9e4066Sahrens dbuf_hash_table_t *h = &dbuf_hash_table; 146*fa9e4066Sahrens objset_impl_t *os = db->db_objset; 147*fa9e4066Sahrens uint64_t obj = db->db.db_object; 148*fa9e4066Sahrens int level = db->db_level; 149*fa9e4066Sahrens uint64_t blkid = db->db_blkid; 150*fa9e4066Sahrens uint64_t hv = DBUF_HASH(os, obj, level, blkid); 151*fa9e4066Sahrens uint64_t idx = hv & h->hash_table_mask; 152*fa9e4066Sahrens dmu_buf_impl_t *dbf; 153*fa9e4066Sahrens 154*fa9e4066Sahrens mutex_enter(DBUF_HASH_MUTEX(h, idx)); 155*fa9e4066Sahrens for (dbf = h->hash_table[idx]; dbf != NULL; dbf = dbf->db_hash_next) { 156*fa9e4066Sahrens if (DBUF_EQUAL(dbf, os, obj, level, blkid)) { 157*fa9e4066Sahrens mutex_enter(&dbf->db_mtx); 158*fa9e4066Sahrens if (!refcount_is_zero(&dbf->db_holds)) { 159*fa9e4066Sahrens mutex_exit(DBUF_HASH_MUTEX(h, idx)); 160*fa9e4066Sahrens return (dbf); 161*fa9e4066Sahrens } 162*fa9e4066Sahrens mutex_exit(&dbf->db_mtx); 163*fa9e4066Sahrens } 164*fa9e4066Sahrens } 165*fa9e4066Sahrens 166*fa9e4066Sahrens mutex_enter(&db->db_mtx); 167*fa9e4066Sahrens db->db_hash_next = h->hash_table[idx]; 168*fa9e4066Sahrens h->hash_table[idx] = db; 169*fa9e4066Sahrens mutex_exit(DBUF_HASH_MUTEX(h, idx)); 170*fa9e4066Sahrens atomic_add_64(&dbuf_hash_count, 1); 171*fa9e4066Sahrens 172*fa9e4066Sahrens return (NULL); 173*fa9e4066Sahrens } 174*fa9e4066Sahrens 175*fa9e4066Sahrens /* 176*fa9e4066Sahrens * Remove an entry from the hash table. This operation will 177*fa9e4066Sahrens * fail if there are any existing holds on the db. 
178*fa9e4066Sahrens */ 179*fa9e4066Sahrens static void 180*fa9e4066Sahrens dbuf_hash_remove(dmu_buf_impl_t *db) 181*fa9e4066Sahrens { 182*fa9e4066Sahrens dbuf_hash_table_t *h = &dbuf_hash_table; 183*fa9e4066Sahrens uint64_t hv = DBUF_HASH(db->db_objset, db->db.db_object, 184*fa9e4066Sahrens db->db_level, db->db_blkid); 185*fa9e4066Sahrens uint64_t idx = hv & h->hash_table_mask; 186*fa9e4066Sahrens dmu_buf_impl_t *dbf, **dbp; 187*fa9e4066Sahrens 188*fa9e4066Sahrens /* 189*fa9e4066Sahrens * We musn't hold db_mtx to maintin lock ordering: 190*fa9e4066Sahrens * DBUF_HASH_MUTEX > db_mtx. 191*fa9e4066Sahrens */ 192*fa9e4066Sahrens ASSERT(refcount_is_zero(&db->db_holds)); 193*fa9e4066Sahrens ASSERT(db->db_dnode != NULL); 194*fa9e4066Sahrens ASSERT(!MUTEX_HELD(&db->db_mtx)); 195*fa9e4066Sahrens 196*fa9e4066Sahrens mutex_enter(DBUF_HASH_MUTEX(h, idx)); 197*fa9e4066Sahrens dbp = &h->hash_table[idx]; 198*fa9e4066Sahrens while ((dbf = *dbp) != db) { 199*fa9e4066Sahrens dbp = &dbf->db_hash_next; 200*fa9e4066Sahrens ASSERT(dbf != NULL); 201*fa9e4066Sahrens } 202*fa9e4066Sahrens *dbp = db->db_hash_next; 203*fa9e4066Sahrens db->db_hash_next = NULL; 204*fa9e4066Sahrens mutex_exit(DBUF_HASH_MUTEX(h, idx)); 205*fa9e4066Sahrens atomic_add_64(&dbuf_hash_count, -1); 206*fa9e4066Sahrens } 207*fa9e4066Sahrens 208*fa9e4066Sahrens static int dbuf_evictable(dmu_buf_impl_t *db); 209*fa9e4066Sahrens static void dbuf_clear(dmu_buf_impl_t *db); 210*fa9e4066Sahrens 211*fa9e4066Sahrens void 212*fa9e4066Sahrens dbuf_evict(dmu_buf_impl_t *db) 213*fa9e4066Sahrens { 214*fa9e4066Sahrens int err; 215*fa9e4066Sahrens 216*fa9e4066Sahrens ASSERT(MUTEX_HELD(&db->db_mtx)); 217*fa9e4066Sahrens err = dbuf_evictable(db); 218*fa9e4066Sahrens ASSERT(err == TRUE); 219*fa9e4066Sahrens dbuf_clear(db); 220*fa9e4066Sahrens dbuf_destroy(db); 221*fa9e4066Sahrens } 222*fa9e4066Sahrens 223*fa9e4066Sahrens static void 224*fa9e4066Sahrens dbuf_evict_user(dmu_buf_impl_t *db) 225*fa9e4066Sahrens { 226*fa9e4066Sahrens 
ASSERT(MUTEX_HELD(&db->db_mtx)); 227*fa9e4066Sahrens 228*fa9e4066Sahrens if (db->db_level != 0 || db->db_d.db_evict_func == NULL) 229*fa9e4066Sahrens return; 230*fa9e4066Sahrens 231*fa9e4066Sahrens if (db->db_d.db_user_data_ptr_ptr) 232*fa9e4066Sahrens *db->db_d.db_user_data_ptr_ptr = db->db.db_data; 233*fa9e4066Sahrens db->db_d.db_evict_func(&db->db, db->db_d.db_user_ptr); 234*fa9e4066Sahrens db->db_d.db_user_ptr = NULL; 235*fa9e4066Sahrens db->db_d.db_user_data_ptr_ptr = NULL; 236*fa9e4066Sahrens db->db_d.db_evict_func = NULL; 237*fa9e4066Sahrens } 238*fa9e4066Sahrens 239*fa9e4066Sahrens void 240*fa9e4066Sahrens dbuf_init(void) 241*fa9e4066Sahrens { 242*fa9e4066Sahrens uint64_t hsize = 1; 243*fa9e4066Sahrens dbuf_hash_table_t *h = &dbuf_hash_table; 244*fa9e4066Sahrens int i; 245*fa9e4066Sahrens 246*fa9e4066Sahrens /* 247*fa9e4066Sahrens * The hash table is big enough to fill all of physical memory 248*fa9e4066Sahrens * with an average 64k block size. The table will take up 249*fa9e4066Sahrens * totalmem*sizeof(void*)/64k bytes (i.e. 128KB/GB with 8-byte 250*fa9e4066Sahrens * pointers). 
251*fa9e4066Sahrens */ 252*fa9e4066Sahrens while (hsize * 65536 < physmem * PAGESIZE) 253*fa9e4066Sahrens hsize <<= 1; 254*fa9e4066Sahrens 255*fa9e4066Sahrens h->hash_table_mask = hsize - 1; 256*fa9e4066Sahrens h->hash_table = kmem_zalloc(hsize * sizeof (void *), KM_SLEEP); 257*fa9e4066Sahrens 258*fa9e4066Sahrens dbuf_cache = kmem_cache_create("dmu_buf_impl_t", 259*fa9e4066Sahrens sizeof (dmu_buf_impl_t), 260*fa9e4066Sahrens 0, dbuf_cons, dbuf_dest, NULL, NULL, NULL, 0); 261*fa9e4066Sahrens dbuf_tq = taskq_create("dbuf_tq", 8, maxclsyspri, 50, INT_MAX, 262*fa9e4066Sahrens TASKQ_PREPOPULATE); 263*fa9e4066Sahrens 264*fa9e4066Sahrens for (i = 0; i < DBUF_MUTEXES; i++) 265*fa9e4066Sahrens mutex_init(&h->hash_mutexes[i], NULL, MUTEX_DEFAULT, NULL); 266*fa9e4066Sahrens } 267*fa9e4066Sahrens 268*fa9e4066Sahrens void 269*fa9e4066Sahrens dbuf_fini(void) 270*fa9e4066Sahrens { 271*fa9e4066Sahrens dbuf_hash_table_t *h = &dbuf_hash_table; 272*fa9e4066Sahrens int i; 273*fa9e4066Sahrens 274*fa9e4066Sahrens taskq_destroy(dbuf_tq); 275*fa9e4066Sahrens dbuf_tq = NULL; 276*fa9e4066Sahrens 277*fa9e4066Sahrens for (i = 0; i < DBUF_MUTEXES; i++) 278*fa9e4066Sahrens mutex_destroy(&h->hash_mutexes[i]); 279*fa9e4066Sahrens kmem_free(h->hash_table, (h->hash_table_mask + 1) * sizeof (void *)); 280*fa9e4066Sahrens kmem_cache_destroy(dbuf_cache); 281*fa9e4066Sahrens } 282*fa9e4066Sahrens 283*fa9e4066Sahrens /* 284*fa9e4066Sahrens * Other stuff. 
285*fa9e4066Sahrens */ 286*fa9e4066Sahrens 287*fa9e4066Sahrens static void 288*fa9e4066Sahrens dbuf_verify(dmu_buf_impl_t *db) 289*fa9e4066Sahrens { 290*fa9e4066Sahrens #ifdef ZFS_DEBUG 291*fa9e4066Sahrens int i; 292*fa9e4066Sahrens dnode_t *dn = db->db_dnode; 293*fa9e4066Sahrens 294*fa9e4066Sahrens ASSERT(MUTEX_HELD(&db->db_mtx)); 295*fa9e4066Sahrens 296*fa9e4066Sahrens if (!(zfs_flags & ZFS_DEBUG_DBUF_VERIFY)) 297*fa9e4066Sahrens return; 298*fa9e4066Sahrens 299*fa9e4066Sahrens ASSERT(db->db_objset != NULL); 300*fa9e4066Sahrens if (dn == NULL) { 301*fa9e4066Sahrens ASSERT(db->db_parent == NULL); 302*fa9e4066Sahrens ASSERT(db->db_blkptr == NULL); 303*fa9e4066Sahrens } else { 304*fa9e4066Sahrens ASSERT3U(db->db.db_object, ==, dn->dn_object); 305*fa9e4066Sahrens ASSERT3P(db->db_objset, ==, dn->dn_objset); 306*fa9e4066Sahrens ASSERT(list_head(&dn->dn_dbufs)); 307*fa9e4066Sahrens ASSERT3U(db->db_level, <, dn->dn_nlevels); 308*fa9e4066Sahrens } 309*fa9e4066Sahrens if (db->db_blkid == DB_BONUS_BLKID) { 310*fa9e4066Sahrens ASSERT(dn != NULL); 311*fa9e4066Sahrens ASSERT3U(db->db.db_size, ==, dn->dn_bonuslen); 312*fa9e4066Sahrens ASSERT3U(db->db.db_offset, ==, DB_BONUS_BLKID); 313*fa9e4066Sahrens } else { 314*fa9e4066Sahrens ASSERT3U(db->db.db_offset, ==, db->db_blkid * db->db.db_size); 315*fa9e4066Sahrens } 316*fa9e4066Sahrens 317*fa9e4066Sahrens if (db->db_level == 0) { 318*fa9e4066Sahrens void **udpp = db->db_d.db_user_data_ptr_ptr; 319*fa9e4066Sahrens /* we can be momentarily larger in dnode_set_blksz() */ 320*fa9e4066Sahrens if (db->db_blkid != DB_BONUS_BLKID && dn) { 321*fa9e4066Sahrens ASSERT3U(db->db.db_size, >=, dn->dn_datablksz); 322*fa9e4066Sahrens } 323*fa9e4066Sahrens if (udpp) { 324*fa9e4066Sahrens ASSERT((refcount_is_zero(&db->db_holds) && 325*fa9e4066Sahrens *udpp == NULL) || 326*fa9e4066Sahrens (!refcount_is_zero(&db->db_holds) && 327*fa9e4066Sahrens *udpp == db->db.db_data)); 328*fa9e4066Sahrens } 329*fa9e4066Sahrens 330*fa9e4066Sahrens if 
(IS_DNODE_DNODE(db->db.db_object)) { 331*fa9e4066Sahrens for (i = 0; i < TXG_SIZE; i++) { 332*fa9e4066Sahrens /* 333*fa9e4066Sahrens * it should only be modified in syncing 334*fa9e4066Sahrens * context, so make sure we only have 335*fa9e4066Sahrens * one copy of the data. 336*fa9e4066Sahrens */ 337*fa9e4066Sahrens ASSERT(db->db_d.db_data_old[i] == NULL || 338*fa9e4066Sahrens db->db_d.db_data_old[i] == db->db_buf); 339*fa9e4066Sahrens } 340*fa9e4066Sahrens } 341*fa9e4066Sahrens } 342*fa9e4066Sahrens 343*fa9e4066Sahrens /* verify db->db_blkptr */ 344*fa9e4066Sahrens if (db->db_blkptr) { 345*fa9e4066Sahrens if (db->db_parent == dn->dn_dbuf) { 346*fa9e4066Sahrens /* db is pointed to by the dnode */ 347*fa9e4066Sahrens /* ASSERT3U(db->db_blkid, <, dn->dn_nblkptr); */ 348*fa9e4066Sahrens if (IS_DNODE_DNODE(db->db.db_object)) 349*fa9e4066Sahrens ASSERT(db->db_parent == NULL); 350*fa9e4066Sahrens else 351*fa9e4066Sahrens ASSERT(db->db_parent != NULL); 352*fa9e4066Sahrens ASSERT3P(db->db_blkptr, ==, 353*fa9e4066Sahrens &dn->dn_phys->dn_blkptr[db->db_blkid]); 354*fa9e4066Sahrens } else { 355*fa9e4066Sahrens /* db is pointed to by an indirect block */ 356*fa9e4066Sahrens int epb = db->db_parent->db.db_size >> SPA_BLKPTRSHIFT; 357*fa9e4066Sahrens ASSERT3U(db->db_parent->db_level, ==, db->db_level+1); 358*fa9e4066Sahrens ASSERT3U(db->db_parent->db.db_object, ==, 359*fa9e4066Sahrens db->db.db_object); 360*fa9e4066Sahrens /* 361*fa9e4066Sahrens * dnode_grow_indblksz() can make this fail if we don't 362*fa9e4066Sahrens * have the struct_rwlock. XXX indblksz no longer 363*fa9e4066Sahrens * grows. safe to do this now? 
364*fa9e4066Sahrens */ 365*fa9e4066Sahrens if (RW_WRITE_HELD(&db->db_dnode->dn_struct_rwlock)) { 366*fa9e4066Sahrens ASSERT3P(db->db_blkptr, ==, 367*fa9e4066Sahrens ((blkptr_t *)db->db_parent->db.db_data + 368*fa9e4066Sahrens db->db_blkid % epb)); 369*fa9e4066Sahrens } 370*fa9e4066Sahrens } 371*fa9e4066Sahrens } 372*fa9e4066Sahrens if ((db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr)) && 373*fa9e4066Sahrens db->db.db_data && db->db_blkid != DB_BONUS_BLKID && 374*fa9e4066Sahrens db->db_state != DB_FILL && !dn->dn_free_txg) { 375*fa9e4066Sahrens /* 376*fa9e4066Sahrens * If the blkptr isn't set but they have nonzero data, 377*fa9e4066Sahrens * it had better be dirty, otherwise we'll lose that 378*fa9e4066Sahrens * data when we evict this buffer. 379*fa9e4066Sahrens */ 380*fa9e4066Sahrens if (db->db_dirtycnt == 0) { 381*fa9e4066Sahrens uint64_t *buf = db->db.db_data; 382*fa9e4066Sahrens int i; 383*fa9e4066Sahrens 384*fa9e4066Sahrens for (i = 0; i < db->db.db_size >> 3; i++) { 385*fa9e4066Sahrens ASSERT(buf[i] == 0); 386*fa9e4066Sahrens } 387*fa9e4066Sahrens } 388*fa9e4066Sahrens } 389*fa9e4066Sahrens #endif 390*fa9e4066Sahrens } 391*fa9e4066Sahrens 392*fa9e4066Sahrens static void 393*fa9e4066Sahrens dbuf_update_data(dmu_buf_impl_t *db) 394*fa9e4066Sahrens { 395*fa9e4066Sahrens ASSERT(MUTEX_HELD(&db->db_mtx)); 396*fa9e4066Sahrens if (db->db_level == 0 && db->db_d.db_user_data_ptr_ptr) { 397*fa9e4066Sahrens ASSERT(!refcount_is_zero(&db->db_holds)); 398*fa9e4066Sahrens *db->db_d.db_user_data_ptr_ptr = db->db.db_data; 399*fa9e4066Sahrens } 400*fa9e4066Sahrens } 401*fa9e4066Sahrens 402*fa9e4066Sahrens static void 403*fa9e4066Sahrens dbuf_set_data(dmu_buf_impl_t *db, arc_buf_t *buf) 404*fa9e4066Sahrens { 405*fa9e4066Sahrens ASSERT(MUTEX_HELD(&db->db_mtx)); 406*fa9e4066Sahrens ASSERT(buf->b_data != NULL); 407*fa9e4066Sahrens db->db_buf = buf; 408*fa9e4066Sahrens db->db.db_data = buf->b_data; 409*fa9e4066Sahrens dbuf_update_data(db); 410*fa9e4066Sahrens } 
411*fa9e4066Sahrens 412*fa9e4066Sahrens uint64_t 413*fa9e4066Sahrens dbuf_whichblock(dnode_t *dn, uint64_t offset) 414*fa9e4066Sahrens { 415*fa9e4066Sahrens if (dn->dn_datablkshift) { 416*fa9e4066Sahrens return (offset >> dn->dn_datablkshift); 417*fa9e4066Sahrens } else { 418*fa9e4066Sahrens ASSERT3U(offset, <, dn->dn_datablksz); 419*fa9e4066Sahrens return (0); 420*fa9e4066Sahrens } 421*fa9e4066Sahrens } 422*fa9e4066Sahrens 423*fa9e4066Sahrens static void 424*fa9e4066Sahrens dbuf_read_done(zio_t *zio, arc_buf_t *buf, void *vdb) 425*fa9e4066Sahrens { 426*fa9e4066Sahrens dmu_buf_impl_t *db = vdb; 427*fa9e4066Sahrens 428*fa9e4066Sahrens mutex_enter(&db->db_mtx); 429*fa9e4066Sahrens ASSERT3U(db->db_state, ==, DB_READ); 430*fa9e4066Sahrens /* 431*fa9e4066Sahrens * All reads are synchronous, so we must have a hold on the dbuf 432*fa9e4066Sahrens */ 433*fa9e4066Sahrens ASSERT(refcount_count(&db->db_holds) > 0); 434*fa9e4066Sahrens ASSERT(db->db.db_data == NULL); 435*fa9e4066Sahrens if (db->db_level == 0 && db->db_d.db_freed_in_flight) { 436*fa9e4066Sahrens /* we were freed in flight; disregard any error */ 437*fa9e4066Sahrens arc_release(buf, db); 438*fa9e4066Sahrens bzero(buf->b_data, db->db.db_size); 439*fa9e4066Sahrens db->db_d.db_freed_in_flight = FALSE; 440*fa9e4066Sahrens dbuf_set_data(db, buf); 441*fa9e4066Sahrens db->db_state = DB_CACHED; 442*fa9e4066Sahrens } else if (zio == NULL || zio->io_error == 0) { 443*fa9e4066Sahrens dbuf_set_data(db, buf); 444*fa9e4066Sahrens db->db_state = DB_CACHED; 445*fa9e4066Sahrens } else { 446*fa9e4066Sahrens ASSERT(db->db_blkid != DB_BONUS_BLKID); 447*fa9e4066Sahrens arc_buf_free(buf, db); 448*fa9e4066Sahrens db->db_state = DB_UNCACHED; 449*fa9e4066Sahrens ASSERT3P(db->db_buf, ==, NULL); 450*fa9e4066Sahrens } 451*fa9e4066Sahrens cv_broadcast(&db->db_changed); 452*fa9e4066Sahrens mutex_exit(&db->db_mtx); 453*fa9e4066Sahrens } 454*fa9e4066Sahrens 455*fa9e4066Sahrens void 456*fa9e4066Sahrens dbuf_read_impl(dmu_buf_impl_t *db, zio_t 
*zio, uint32_t flags) 457*fa9e4066Sahrens { 458*fa9e4066Sahrens arc_buf_t *buf; 459*fa9e4066Sahrens blkptr_t *bp; 460*fa9e4066Sahrens 461*fa9e4066Sahrens ASSERT(!refcount_is_zero(&db->db_holds)); 462*fa9e4066Sahrens /* We need the struct_rwlock to prevent db_blkptr from changing. */ 463*fa9e4066Sahrens ASSERT(RW_LOCK_HELD(&db->db_dnode->dn_struct_rwlock)); 464*fa9e4066Sahrens 465*fa9e4066Sahrens /* 466*fa9e4066Sahrens * prefetch only data blocks (level 0) -- don't prefetch indirect 467*fa9e4066Sahrens * blocks 468*fa9e4066Sahrens */ 469*fa9e4066Sahrens if ((db->db_level > 0) || (db->db_blkid == DB_BONUS_BLKID)) { 470*fa9e4066Sahrens flags |= DB_RF_NOPREFETCH; 471*fa9e4066Sahrens } 472*fa9e4066Sahrens 473*fa9e4066Sahrens if (((flags & DB_RF_NOPREFETCH) == 0) && (db->db_dnode != NULL)) { 474*fa9e4066Sahrens dmu_zfetch(&db->db_dnode->dn_zfetch, db->db.db_offset, 475*fa9e4066Sahrens db->db.db_size); 476*fa9e4066Sahrens } 477*fa9e4066Sahrens 478*fa9e4066Sahrens if (db->db_state == DB_CACHED) { 479*fa9e4066Sahrens ASSERT(db->db.db_data != NULL); 480*fa9e4066Sahrens return; 481*fa9e4066Sahrens } 482*fa9e4066Sahrens 483*fa9e4066Sahrens mutex_enter(&db->db_mtx); 484*fa9e4066Sahrens 485*fa9e4066Sahrens if (db->db_state != DB_UNCACHED) { 486*fa9e4066Sahrens mutex_exit(&db->db_mtx); 487*fa9e4066Sahrens return; 488*fa9e4066Sahrens } 489*fa9e4066Sahrens 490*fa9e4066Sahrens ASSERT3U(db->db_state, ==, DB_UNCACHED); 491*fa9e4066Sahrens 492*fa9e4066Sahrens if (db->db_blkid == DB_BONUS_BLKID) { 493*fa9e4066Sahrens ASSERT3U(db->db_dnode->dn_bonuslen, ==, db->db.db_size); 494*fa9e4066Sahrens buf = arc_buf_alloc(db->db_dnode->dn_objset->os_spa, 495*fa9e4066Sahrens DN_MAX_BONUSLEN, db); 496*fa9e4066Sahrens if (db->db.db_size < DN_MAX_BONUSLEN) 497*fa9e4066Sahrens bzero(buf->b_data, DN_MAX_BONUSLEN); 498*fa9e4066Sahrens bcopy(DN_BONUS(db->db_dnode->dn_phys), buf->b_data, 499*fa9e4066Sahrens db->db.db_size); 500*fa9e4066Sahrens dbuf_set_data(db, buf); 501*fa9e4066Sahrens db->db_state = 
DB_CACHED; 502*fa9e4066Sahrens mutex_exit(&db->db_mtx); 503*fa9e4066Sahrens return; 504*fa9e4066Sahrens } 505*fa9e4066Sahrens 506*fa9e4066Sahrens if (db->db_level == 0 && dnode_block_freed(db->db_dnode, db->db_blkid)) 507*fa9e4066Sahrens bp = NULL; 508*fa9e4066Sahrens else 509*fa9e4066Sahrens bp = db->db_blkptr; 510*fa9e4066Sahrens 511*fa9e4066Sahrens if (bp == NULL) 512*fa9e4066Sahrens dprintf_dbuf(db, "blkptr: %s\n", "NULL"); 513*fa9e4066Sahrens else 514*fa9e4066Sahrens dprintf_dbuf_bp(db, bp, "%s", "blkptr:"); 515*fa9e4066Sahrens 516*fa9e4066Sahrens if (bp == NULL || BP_IS_HOLE(bp)) { 517*fa9e4066Sahrens ASSERT(bp == NULL || BP_IS_HOLE(bp)); 518*fa9e4066Sahrens dbuf_set_data(db, arc_buf_alloc(db->db_dnode->dn_objset->os_spa, 519*fa9e4066Sahrens db->db.db_size, db)); 520*fa9e4066Sahrens bzero(db->db.db_data, db->db.db_size); 521*fa9e4066Sahrens db->db_state = DB_CACHED; 522*fa9e4066Sahrens mutex_exit(&db->db_mtx); 523*fa9e4066Sahrens return; 524*fa9e4066Sahrens } 525*fa9e4066Sahrens 526*fa9e4066Sahrens db->db_state = DB_READ; 527*fa9e4066Sahrens mutex_exit(&db->db_mtx); 528*fa9e4066Sahrens 529*fa9e4066Sahrens /* ZIO_FLAG_CANFAIL callers have to check the parent zio's error */ 530*fa9e4066Sahrens (void) arc_read(zio, db->db_dnode->dn_objset->os_spa, bp, 531*fa9e4066Sahrens db->db_level > 0 ? byteswap_uint64_array : 532*fa9e4066Sahrens dmu_ot[db->db_dnode->dn_type].ot_byteswap, 533*fa9e4066Sahrens dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ, 534*fa9e4066Sahrens (flags & DB_RF_CANFAIL) ? 
ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED, 535*fa9e4066Sahrens ARC_NOWAIT); 536*fa9e4066Sahrens } 537*fa9e4066Sahrens 538*fa9e4066Sahrens static int 539*fa9e4066Sahrens dbuf_read_generic(dmu_buf_impl_t *db, uint32_t flags) 540*fa9e4066Sahrens { 541*fa9e4066Sahrens zio_t *zio; 542*fa9e4066Sahrens int err; 543*fa9e4066Sahrens 544*fa9e4066Sahrens /* 545*fa9e4066Sahrens * We don't have to hold the mutex to check db_state because it 546*fa9e4066Sahrens * can't be freed while we have a hold on the buffer. 547*fa9e4066Sahrens */ 548*fa9e4066Sahrens ASSERT(!refcount_is_zero(&db->db_holds)); 549*fa9e4066Sahrens if (db->db_state == DB_CACHED) 550*fa9e4066Sahrens return (0); 551*fa9e4066Sahrens 552*fa9e4066Sahrens if (db->db_state == DB_UNCACHED) { 553*fa9e4066Sahrens zio = zio_root(db->db_dnode->dn_objset->os_spa, NULL, NULL, 554*fa9e4066Sahrens ZIO_FLAG_CANFAIL); 555*fa9e4066Sahrens if ((flags & DB_RF_HAVESTRUCT) == 0) 556*fa9e4066Sahrens rw_enter(&db->db_dnode->dn_struct_rwlock, RW_READER); 557*fa9e4066Sahrens dbuf_read_impl(db, zio, flags); 558*fa9e4066Sahrens if ((flags & DB_RF_HAVESTRUCT) == 0) 559*fa9e4066Sahrens rw_exit(&db->db_dnode->dn_struct_rwlock); 560*fa9e4066Sahrens err = zio_wait(zio); 561*fa9e4066Sahrens if (err) 562*fa9e4066Sahrens return (err); 563*fa9e4066Sahrens } 564*fa9e4066Sahrens 565*fa9e4066Sahrens mutex_enter(&db->db_mtx); 566*fa9e4066Sahrens while (db->db_state == DB_READ || db->db_state == DB_FILL) { 567*fa9e4066Sahrens ASSERT(db->db_state == DB_READ || 568*fa9e4066Sahrens (flags & DB_RF_HAVESTRUCT) == 0); 569*fa9e4066Sahrens cv_wait(&db->db_changed, &db->db_mtx); 570*fa9e4066Sahrens } 571*fa9e4066Sahrens ASSERT3U(db->db_state, ==, DB_CACHED); 572*fa9e4066Sahrens mutex_exit(&db->db_mtx); 573*fa9e4066Sahrens 574*fa9e4066Sahrens return (0); 575*fa9e4066Sahrens } 576*fa9e4066Sahrens 577*fa9e4066Sahrens #pragma weak dmu_buf_read = dbuf_read 578*fa9e4066Sahrens void 579*fa9e4066Sahrens dbuf_read(dmu_buf_impl_t *db) 580*fa9e4066Sahrens { 
581*fa9e4066Sahrens int err; 582*fa9e4066Sahrens 583*fa9e4066Sahrens err = dbuf_read_generic(db, DB_RF_MUST_SUCCEED); 584*fa9e4066Sahrens ASSERT(err == 0); 585*fa9e4066Sahrens } 586*fa9e4066Sahrens 587*fa9e4066Sahrens #pragma weak dmu_buf_read_canfail = dbuf_read_canfail 588*fa9e4066Sahrens int 589*fa9e4066Sahrens dbuf_read_canfail(dmu_buf_impl_t *db) 590*fa9e4066Sahrens { 591*fa9e4066Sahrens return (dbuf_read_generic(db, DB_RF_CANFAIL)); 592*fa9e4066Sahrens } 593*fa9e4066Sahrens 594*fa9e4066Sahrens void 595*fa9e4066Sahrens dbuf_read_havestruct(dmu_buf_impl_t *db) 596*fa9e4066Sahrens { 597*fa9e4066Sahrens int err; 598*fa9e4066Sahrens 599*fa9e4066Sahrens ASSERT(RW_LOCK_HELD(&db->db_dnode->dn_struct_rwlock)); 600*fa9e4066Sahrens err = dbuf_read_generic(db, (DB_RF_HAVESTRUCT | DB_RF_NOPREFETCH)); 601*fa9e4066Sahrens ASSERT(err == 0); 602*fa9e4066Sahrens } 603*fa9e4066Sahrens 604*fa9e4066Sahrens static void 605*fa9e4066Sahrens dbuf_noread(dmu_buf_impl_t *db) 606*fa9e4066Sahrens { 607*fa9e4066Sahrens ASSERT(!refcount_is_zero(&db->db_holds)); 608*fa9e4066Sahrens mutex_enter(&db->db_mtx); 609*fa9e4066Sahrens while (db->db_state == DB_READ || db->db_state == DB_FILL) 610*fa9e4066Sahrens cv_wait(&db->db_changed, &db->db_mtx); 611*fa9e4066Sahrens if (db->db_state == DB_UNCACHED) { 612*fa9e4066Sahrens int blksz = (db->db_blkid == DB_BONUS_BLKID) ? 613*fa9e4066Sahrens DN_MAX_BONUSLEN : db->db.db_size; 614*fa9e4066Sahrens ASSERT(db->db.db_data == NULL); 615*fa9e4066Sahrens dbuf_set_data(db, arc_buf_alloc(db->db_dnode->dn_objset->os_spa, 616*fa9e4066Sahrens blksz, db)); 617*fa9e4066Sahrens db->db_state = DB_FILL; 618*fa9e4066Sahrens } else { 619*fa9e4066Sahrens ASSERT3U(db->db_state, ==, DB_CACHED); 620*fa9e4066Sahrens } 621*fa9e4066Sahrens mutex_exit(&db->db_mtx); 622*fa9e4066Sahrens } 623*fa9e4066Sahrens 624*fa9e4066Sahrens /* 625*fa9e4066Sahrens * This is our just-in-time copy function. 
It makes a copy of 626*fa9e4066Sahrens * buffers, that have been modified in a previous transaction 627*fa9e4066Sahrens * group, before we modify them in the current active group. 628*fa9e4066Sahrens * 629*fa9e4066Sahrens * This function is used in two places: when we are dirtying a 630*fa9e4066Sahrens * buffer for the first time in a txg, and when we are freeing 631*fa9e4066Sahrens * a range in a dnode that includes this buffer. 632*fa9e4066Sahrens * 633*fa9e4066Sahrens * Note that when we are called from dbuf_free_range() we do 634*fa9e4066Sahrens * not put a hold on the buffer, we just traverse the active 635*fa9e4066Sahrens * dbuf list for the dnode. 636*fa9e4066Sahrens */ 637*fa9e4066Sahrens static void 638*fa9e4066Sahrens dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg) 639*fa9e4066Sahrens { 640*fa9e4066Sahrens arc_buf_t **quiescing, **syncing; 641*fa9e4066Sahrens int size = (db->db_blkid == DB_BONUS_BLKID) ? 642*fa9e4066Sahrens DN_MAX_BONUSLEN : db->db.db_size; 643*fa9e4066Sahrens 644*fa9e4066Sahrens ASSERT(MUTEX_HELD(&db->db_mtx)); 645*fa9e4066Sahrens ASSERT(db->db.db_data != NULL); 646*fa9e4066Sahrens 647*fa9e4066Sahrens quiescing = &db->db_d.db_data_old[(txg-1)&TXG_MASK]; 648*fa9e4066Sahrens syncing = &db->db_d.db_data_old[(txg-2)&TXG_MASK]; 649*fa9e4066Sahrens 650*fa9e4066Sahrens /* 651*fa9e4066Sahrens * If this buffer is referenced from the current quiescing 652*fa9e4066Sahrens * transaction group: either make a copy and reset the reference 653*fa9e4066Sahrens * to point to the copy, or (if there a no active holders) just 654*fa9e4066Sahrens * null out the current db_data pointer. 655*fa9e4066Sahrens */ 656*fa9e4066Sahrens if (*quiescing == db->db_buf) { 657*fa9e4066Sahrens /* 658*fa9e4066Sahrens * If the quiescing txg is "dirty", then we better not 659*fa9e4066Sahrens * be referencing the same buffer from the syncing txg. 
660*fa9e4066Sahrens */ 661*fa9e4066Sahrens ASSERT(*syncing != db->db_buf); 662*fa9e4066Sahrens if (refcount_count(&db->db_holds) > db->db_dirtycnt) { 663*fa9e4066Sahrens *quiescing = arc_buf_alloc( 664*fa9e4066Sahrens db->db_dnode->dn_objset->os_spa, size, db); 665*fa9e4066Sahrens bcopy(db->db.db_data, (*quiescing)->b_data, size); 666*fa9e4066Sahrens } else { 667*fa9e4066Sahrens db->db.db_data = NULL; 668*fa9e4066Sahrens db->db_buf = NULL; 669*fa9e4066Sahrens db->db_state = DB_UNCACHED; 670*fa9e4066Sahrens } 671*fa9e4066Sahrens return; 672*fa9e4066Sahrens } 673*fa9e4066Sahrens 674*fa9e4066Sahrens /* 675*fa9e4066Sahrens * If this buffer is referenced from the current syncing 676*fa9e4066Sahrens * transaction group: either 677*fa9e4066Sahrens * 1 - make a copy and reset the reference, or 678*fa9e4066Sahrens * 2 - if there are no holders, just null the current db_data. 679*fa9e4066Sahrens */ 680*fa9e4066Sahrens if (*syncing == db->db_buf) { 681*fa9e4066Sahrens ASSERT3P(*quiescing, ==, NULL); 682*fa9e4066Sahrens ASSERT3U(db->db_dirtycnt, ==, 1); 683*fa9e4066Sahrens if (refcount_count(&db->db_holds) > db->db_dirtycnt) { 684*fa9e4066Sahrens /* we can't copy if we have already started a write */ 685*fa9e4066Sahrens ASSERT(*syncing != db->db_data_pending); 686*fa9e4066Sahrens *syncing = arc_buf_alloc( 687*fa9e4066Sahrens db->db_dnode->dn_objset->os_spa, size, db); 688*fa9e4066Sahrens bcopy(db->db.db_data, (*syncing)->b_data, size); 689*fa9e4066Sahrens } else { 690*fa9e4066Sahrens db->db.db_data = NULL; 691*fa9e4066Sahrens db->db_buf = NULL; 692*fa9e4066Sahrens db->db_state = DB_UNCACHED; 693*fa9e4066Sahrens } 694*fa9e4066Sahrens } 695*fa9e4066Sahrens } 696*fa9e4066Sahrens 697*fa9e4066Sahrens void 698*fa9e4066Sahrens dbuf_unoverride(dmu_buf_impl_t *db, uint64_t txg) 699*fa9e4066Sahrens { 700*fa9e4066Sahrens ASSERT(MUTEX_HELD(&db->db_mtx)); 701*fa9e4066Sahrens if (db->db_d.db_overridden_by[txg&TXG_MASK] == IN_DMU_SYNC) { 702*fa9e4066Sahrens 
db->db_d.db_overridden_by[txg&TXG_MASK] = NULL; 703*fa9e4066Sahrens } else if (db->db_d.db_overridden_by[txg&TXG_MASK] != NULL) { 704*fa9e4066Sahrens /* free this block */ 705*fa9e4066Sahrens ASSERT(list_link_active(&db->db_dirty_node[txg&TXG_MASK]) || 706*fa9e4066Sahrens db->db_dnode->dn_free_txg == txg); 707*fa9e4066Sahrens if (!BP_IS_HOLE(db->db_d.db_overridden_by[txg&TXG_MASK])) { 708*fa9e4066Sahrens /* XXX can get silent EIO here */ 709*fa9e4066Sahrens (void) arc_free(NULL, db->db_dnode->dn_objset->os_spa, 710*fa9e4066Sahrens txg, db->db_d.db_overridden_by[txg&TXG_MASK], 711*fa9e4066Sahrens NULL, NULL, ARC_WAIT); 712*fa9e4066Sahrens } 713*fa9e4066Sahrens kmem_free(db->db_d.db_overridden_by[txg&TXG_MASK], 714*fa9e4066Sahrens sizeof (blkptr_t)); 715*fa9e4066Sahrens db->db_d.db_overridden_by[txg&TXG_MASK] = NULL; 716*fa9e4066Sahrens /* release the already-written buffer */ 717*fa9e4066Sahrens arc_release(db->db_d.db_data_old[txg&TXG_MASK], db); 718*fa9e4066Sahrens } 719*fa9e4066Sahrens } 720*fa9e4066Sahrens 721*fa9e4066Sahrens void 722*fa9e4066Sahrens dbuf_free_range(dnode_t *dn, uint64_t blkid, uint64_t nblks, dmu_tx_t *tx) 723*fa9e4066Sahrens { 724*fa9e4066Sahrens dmu_buf_impl_t *db, *db_next; 725*fa9e4066Sahrens uint64_t txg = tx->tx_txg; 726*fa9e4066Sahrens 727*fa9e4066Sahrens dprintf_dnode(dn, "blkid=%llu nblks=%llu\n", blkid, nblks); 728*fa9e4066Sahrens mutex_enter(&dn->dn_dbufs_mtx); 729*fa9e4066Sahrens for (db = list_head(&dn->dn_dbufs); db; db = db_next) { 730*fa9e4066Sahrens db_next = list_next(&dn->dn_dbufs, db); 731*fa9e4066Sahrens if ((db->db_level != 0) || (db->db_blkid == DB_BONUS_BLKID)) 732*fa9e4066Sahrens continue; 733*fa9e4066Sahrens dprintf_dbuf(db, "found buf %s\n", ""); 734*fa9e4066Sahrens if (db->db_blkid < blkid || 735*fa9e4066Sahrens db->db_blkid >= blkid+nblks) 736*fa9e4066Sahrens continue; 737*fa9e4066Sahrens 738*fa9e4066Sahrens /* found a level 0 buffer in the range */ 739*fa9e4066Sahrens if (dbuf_undirty(db, tx)) 740*fa9e4066Sahrens 
continue; 741*fa9e4066Sahrens 742*fa9e4066Sahrens mutex_enter(&db->db_mtx); 743*fa9e4066Sahrens if (db->db_state == DB_UNCACHED) { 744*fa9e4066Sahrens ASSERT(db->db.db_data == NULL); 745*fa9e4066Sahrens mutex_exit(&db->db_mtx); 746*fa9e4066Sahrens continue; 747*fa9e4066Sahrens } 748*fa9e4066Sahrens if (db->db_state == DB_READ) { 749*fa9e4066Sahrens /* this will be handled in dbuf_read_done() */ 750*fa9e4066Sahrens db->db_d.db_freed_in_flight = TRUE; 751*fa9e4066Sahrens mutex_exit(&db->db_mtx); 752*fa9e4066Sahrens continue; 753*fa9e4066Sahrens } 754*fa9e4066Sahrens if (db->db_state == DB_FILL) { 755*fa9e4066Sahrens /* this will be handled in dbuf_rele() */ 756*fa9e4066Sahrens db->db_d.db_freed_in_flight = TRUE; 757*fa9e4066Sahrens mutex_exit(&db->db_mtx); 758*fa9e4066Sahrens continue; 759*fa9e4066Sahrens } 760*fa9e4066Sahrens 761*fa9e4066Sahrens /* make a copy of the data if necessary */ 762*fa9e4066Sahrens dbuf_fix_old_data(db, txg); 763*fa9e4066Sahrens 764*fa9e4066Sahrens if (db->db.db_data) { 765*fa9e4066Sahrens /* fill in with appropriate data */ 766*fa9e4066Sahrens arc_release(db->db_buf, db); 767*fa9e4066Sahrens bzero(db->db.db_data, db->db.db_size); 768*fa9e4066Sahrens } 769*fa9e4066Sahrens mutex_exit(&db->db_mtx); 770*fa9e4066Sahrens } 771*fa9e4066Sahrens mutex_exit(&dn->dn_dbufs_mtx); 772*fa9e4066Sahrens } 773*fa9e4066Sahrens 774*fa9e4066Sahrens static int 775*fa9e4066Sahrens dbuf_new_block(dmu_buf_impl_t *db, dmu_tx_t *tx) 776*fa9e4066Sahrens { 777*fa9e4066Sahrens dsl_dataset_t *ds = db->db_objset->os_dsl_dataset; 778*fa9e4066Sahrens uint64_t birth_txg = 0; 779*fa9e4066Sahrens 780*fa9e4066Sahrens /* Don't count meta-objects */ 781*fa9e4066Sahrens if (ds == NULL) 782*fa9e4066Sahrens return (FALSE); 783*fa9e4066Sahrens 784*fa9e4066Sahrens /* 785*fa9e4066Sahrens * We don't need any locking to protect db_blkptr: 786*fa9e4066Sahrens * If it's syncing, then db_dirtied will be set so we'll 787*fa9e4066Sahrens * ignore db_blkptr. 
788*fa9e4066Sahrens */ 789*fa9e4066Sahrens ASSERT(MUTEX_HELD(&db->db_mtx)); /* XXX strictly necessary? */ 790*fa9e4066Sahrens /* If we have been dirtied since the last snapshot, its not new */ 791*fa9e4066Sahrens if (db->db_dirtied) 792*fa9e4066Sahrens birth_txg = db->db_dirtied; 793*fa9e4066Sahrens else if (db->db_blkptr) 794*fa9e4066Sahrens birth_txg = db->db_blkptr->blk_birth; 795*fa9e4066Sahrens 796*fa9e4066Sahrens if (birth_txg) 797*fa9e4066Sahrens return (!dsl_dataset_block_freeable(ds, birth_txg, tx)); 798*fa9e4066Sahrens else 799*fa9e4066Sahrens return (TRUE); 800*fa9e4066Sahrens } 801*fa9e4066Sahrens 802*fa9e4066Sahrens void 803*fa9e4066Sahrens dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx) 804*fa9e4066Sahrens { 805*fa9e4066Sahrens arc_buf_t *buf, *obuf; 806*fa9e4066Sahrens int osize = db->db.db_size; 807*fa9e4066Sahrens 808*fa9e4066Sahrens /* XXX does *this* func really need the lock? */ 809*fa9e4066Sahrens ASSERT(RW_WRITE_HELD(&db->db_dnode->dn_struct_rwlock)); 810*fa9e4066Sahrens 811*fa9e4066Sahrens ASSERT3U(osize, <=, size); 812*fa9e4066Sahrens if (osize == size) 813*fa9e4066Sahrens return; 814*fa9e4066Sahrens 815*fa9e4066Sahrens /* 816*fa9e4066Sahrens * This call to dbuf_will_dirty() with the dn_struct_rwlock held 817*fa9e4066Sahrens * is OK, because there can be no other references to the db 818*fa9e4066Sahrens * when we are changing its size, so no concurrent DB_FILL can 819*fa9e4066Sahrens * be happening. 
820*fa9e4066Sahrens */ 821*fa9e4066Sahrens /* Make a copy of the data if necessary */ 822*fa9e4066Sahrens dbuf_will_dirty(db, tx); 823*fa9e4066Sahrens 824*fa9e4066Sahrens /* create the data buffer for the new block */ 825*fa9e4066Sahrens buf = arc_buf_alloc(db->db_dnode->dn_objset->os_spa, size, db); 826*fa9e4066Sahrens 827*fa9e4066Sahrens /* copy old block data to the new block */ 828*fa9e4066Sahrens obuf = db->db_buf; 829*fa9e4066Sahrens bcopy(obuf->b_data, buf->b_data, osize); 830*fa9e4066Sahrens /* zero the remainder */ 831*fa9e4066Sahrens bzero((uint8_t *)buf->b_data + osize, size - osize); 832*fa9e4066Sahrens 833*fa9e4066Sahrens mutex_enter(&db->db_mtx); 834*fa9e4066Sahrens /* ASSERT3U(refcount_count(&db->db_holds), ==, 1); */ 835*fa9e4066Sahrens dbuf_set_data(db, buf); 836*fa9e4066Sahrens arc_buf_free(obuf, db); 837*fa9e4066Sahrens db->db.db_size = size; 838*fa9e4066Sahrens 839*fa9e4066Sahrens /* fix up the dirty info */ 840*fa9e4066Sahrens if (db->db_level == 0) 841*fa9e4066Sahrens db->db_d.db_data_old[tx->tx_txg&TXG_MASK] = buf; 842*fa9e4066Sahrens mutex_exit(&db->db_mtx); 843*fa9e4066Sahrens 844*fa9e4066Sahrens dnode_willuse_space(db->db_dnode, size-osize, tx); 845*fa9e4066Sahrens } 846*fa9e4066Sahrens 847*fa9e4066Sahrens void 848*fa9e4066Sahrens dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) 849*fa9e4066Sahrens { 850*fa9e4066Sahrens dnode_t *dn = db->db_dnode; 851*fa9e4066Sahrens objset_impl_t *os = dn->dn_objset; 852*fa9e4066Sahrens int drop_struct_lock = FALSE; 853*fa9e4066Sahrens int txgoff = tx->tx_txg & TXG_MASK; 854*fa9e4066Sahrens 855*fa9e4066Sahrens ASSERT(tx->tx_txg != 0); 856*fa9e4066Sahrens ASSERT(!refcount_is_zero(&db->db_holds)); 857*fa9e4066Sahrens dmu_tx_dirty_buf(tx, db); 858*fa9e4066Sahrens 859*fa9e4066Sahrens /* 860*fa9e4066Sahrens * Shouldn't dirty a regular buffer in syncing context. Private 861*fa9e4066Sahrens * objects may be dirtied in syncing context, but only if they 862*fa9e4066Sahrens * were already pre-dirtied in open context. 
863*fa9e4066Sahrens * XXX We may want to prohibit dirtying in syncing context even 864*fa9e4066Sahrens * if they did pre-dirty. 865*fa9e4066Sahrens */ 866*fa9e4066Sahrens ASSERT(!(dmu_tx_is_syncing(tx) && 867*fa9e4066Sahrens !BP_IS_HOLE(&dn->dn_objset->os_rootbp) && 868*fa9e4066Sahrens !(dn->dn_object & DMU_PRIVATE_OBJECT) && 869*fa9e4066Sahrens dn->dn_objset->os_dsl_dataset != NULL && 870*fa9e4066Sahrens !dsl_dir_is_private( 871*fa9e4066Sahrens dn->dn_objset->os_dsl_dataset->ds_dir))); 872*fa9e4066Sahrens 873*fa9e4066Sahrens /* 874*fa9e4066Sahrens * We make this assert for private objects as well, but after we 875*fa9e4066Sahrens * check if we're already dirty. They are allowed to re-dirty 876*fa9e4066Sahrens * in syncing context. 877*fa9e4066Sahrens */ 878*fa9e4066Sahrens ASSERT(dn->dn_object & DMU_PRIVATE_OBJECT || 879*fa9e4066Sahrens dn->dn_dirtyctx == DN_UNDIRTIED || 880*fa9e4066Sahrens dn->dn_dirtyctx == 881*fa9e4066Sahrens (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN)); 882*fa9e4066Sahrens 883*fa9e4066Sahrens mutex_enter(&db->db_mtx); 884*fa9e4066Sahrens /* XXX make this true for indirects too? */ 885*fa9e4066Sahrens ASSERT(db->db_level != 0 || db->db_state == DB_CACHED || 886*fa9e4066Sahrens db->db_state == DB_FILL); 887*fa9e4066Sahrens 888*fa9e4066Sahrens /* 889*fa9e4066Sahrens * If this buffer is currently part of an "overridden" region, 890*fa9e4066Sahrens * we now need to remove it from that region. 891*fa9e4066Sahrens */ 892*fa9e4066Sahrens if (db->db_level == 0 && db->db_blkid != DB_BONUS_BLKID && 893*fa9e4066Sahrens db->db_d.db_overridden_by[txgoff] != NULL) { 894*fa9e4066Sahrens dbuf_unoverride(db, tx->tx_txg); 895*fa9e4066Sahrens } 896*fa9e4066Sahrens 897*fa9e4066Sahrens mutex_enter(&dn->dn_mtx); 898*fa9e4066Sahrens /* 899*fa9e4066Sahrens * Don't set dirtyctx to SYNC if we're just modifying this as we 900*fa9e4066Sahrens * initialize the objset. 
901*fa9e4066Sahrens */ 902*fa9e4066Sahrens if (dn->dn_dirtyctx == DN_UNDIRTIED && 903*fa9e4066Sahrens !BP_IS_HOLE(&dn->dn_objset->os_rootbp)) { 904*fa9e4066Sahrens dn->dn_dirtyctx = 905*fa9e4066Sahrens (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN); 906*fa9e4066Sahrens ASSERT(dn->dn_dirtyctx_firstset == NULL); 907*fa9e4066Sahrens dn->dn_dirtyctx_firstset = kmem_alloc(1, KM_SLEEP); 908*fa9e4066Sahrens } 909*fa9e4066Sahrens mutex_exit(&dn->dn_mtx); 910*fa9e4066Sahrens 911*fa9e4066Sahrens /* 912*fa9e4066Sahrens * If this buffer is already dirty, we're done. 913*fa9e4066Sahrens */ 914*fa9e4066Sahrens if (list_link_active(&db->db_dirty_node[txgoff])) { 915*fa9e4066Sahrens mutex_exit(&db->db_mtx); 916*fa9e4066Sahrens return; 917*fa9e4066Sahrens } 918*fa9e4066Sahrens 919*fa9e4066Sahrens /* 920*fa9e4066Sahrens * Only valid if not already dirty. 921*fa9e4066Sahrens */ 922*fa9e4066Sahrens ASSERT(dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx == 923*fa9e4066Sahrens (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN)); 924*fa9e4066Sahrens 925*fa9e4066Sahrens ASSERT3U(dn->dn_nlevels, >, db->db_level); 926*fa9e4066Sahrens ASSERT((dn->dn_phys->dn_nlevels == 0 && db->db_level == 0) || 927*fa9e4066Sahrens dn->dn_phys->dn_nlevels > db->db_level || 928*fa9e4066Sahrens dn->dn_next_nlevels[txgoff] > db->db_level || 929*fa9e4066Sahrens dn->dn_next_nlevels[(tx->tx_txg-1) & TXG_MASK] > db->db_level || 930*fa9e4066Sahrens dn->dn_next_nlevels[(tx->tx_txg-2) & TXG_MASK] > db->db_level); 931*fa9e4066Sahrens 932*fa9e4066Sahrens /* 933*fa9e4066Sahrens * We should only be dirtying in syncing context if it's the 934*fa9e4066Sahrens * mos, a spa os, or we're initializing the os. However, we are 935*fa9e4066Sahrens * allowed to dirty in syncing context provided we already 936*fa9e4066Sahrens * dirtied it in open context. Hence we must make this 937*fa9e4066Sahrens * assertion only if we're not already dirty. 
938*fa9e4066Sahrens */ 939*fa9e4066Sahrens ASSERT(!dmu_tx_is_syncing(tx) || 940*fa9e4066Sahrens os->os_dsl_dataset == NULL || 941*fa9e4066Sahrens !dsl_dir_is_private(os->os_dsl_dataset->ds_dir) || 942*fa9e4066Sahrens !BP_IS_HOLE(&os->os_rootbp)); 943*fa9e4066Sahrens ASSERT(db->db.db_size != 0); 944*fa9e4066Sahrens 945*fa9e4066Sahrens dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size); 946*fa9e4066Sahrens 947*fa9e4066Sahrens if (db->db_level == 0) { 948*fa9e4066Sahrens /* 949*fa9e4066Sahrens * Release the data buffer from the cache so that we 950*fa9e4066Sahrens * can modify it without impacting possible other users 951*fa9e4066Sahrens * of this cached data block. Note that indirect blocks 952*fa9e4066Sahrens * and private objects are not released until the syncing 953*fa9e4066Sahrens * state (since they are only modified then). 954*fa9e4066Sahrens * 955*fa9e4066Sahrens * If this buffer is dirty in an old transaction group we need 956*fa9e4066Sahrens * to make a copy of it so that the changes we make in this 957*fa9e4066Sahrens * transaction group won't leak out when we sync the older txg. 958*fa9e4066Sahrens */ 959*fa9e4066Sahrens ASSERT(db->db_buf != NULL); 960*fa9e4066Sahrens ASSERT(db->db.db_data != NULL); 961*fa9e4066Sahrens ASSERT(db->db_d.db_data_old[txgoff] == NULL); 962*fa9e4066Sahrens if (!(db->db.db_object & DMU_PRIVATE_OBJECT)) { 963*fa9e4066Sahrens arc_release(db->db_buf, db); 964*fa9e4066Sahrens dbuf_fix_old_data(db, tx->tx_txg); 965*fa9e4066Sahrens ASSERT(db->db_buf != NULL); 966*fa9e4066Sahrens } 967*fa9e4066Sahrens db->db_d.db_data_old[txgoff] = db->db_buf; 968*fa9e4066Sahrens } 969*fa9e4066Sahrens 970*fa9e4066Sahrens mutex_enter(&dn->dn_mtx); 971*fa9e4066Sahrens /* 972*fa9e4066Sahrens * We could have been freed_in_flight between the dbuf_noread 973*fa9e4066Sahrens * and dbuf_dirty. We win, as though the dbuf_noread() had 974*fa9e4066Sahrens * happened after the free. 
975*fa9e4066Sahrens */ 976*fa9e4066Sahrens if (db->db_level == 0 && db->db_blkid != DB_BONUS_BLKID) { 977*fa9e4066Sahrens dnode_clear_range(dn, db->db_blkid, 1, tx); 978*fa9e4066Sahrens db->db_d.db_freed_in_flight = FALSE; 979*fa9e4066Sahrens } 980*fa9e4066Sahrens 981*fa9e4066Sahrens db->db_dirtied = tx->tx_txg; 982*fa9e4066Sahrens list_insert_tail(&dn->dn_dirty_dbufs[txgoff], db); 983*fa9e4066Sahrens mutex_exit(&dn->dn_mtx); 984*fa9e4066Sahrens 985*fa9e4066Sahrens /* 986*fa9e4066Sahrens * If writting this buffer will consume a new block on disk, 987*fa9e4066Sahrens * then update the accounting. 988*fa9e4066Sahrens */ 989*fa9e4066Sahrens if (db->db_blkid != DB_BONUS_BLKID) { 990*fa9e4066Sahrens if (!dbuf_new_block(db, tx) && db->db_blkptr) { 991*fa9e4066Sahrens /* 992*fa9e4066Sahrens * This is only a guess -- if the dbuf is dirty 993*fa9e4066Sahrens * in a previous txg, we don't know how much 994*fa9e4066Sahrens * space it will use on disk yet. We should 995*fa9e4066Sahrens * really have the struct_rwlock to access 996*fa9e4066Sahrens * db_blkptr, but since this is just a guess, 997*fa9e4066Sahrens * it's OK if we get an odd answer. 
998*fa9e4066Sahrens */ 999*fa9e4066Sahrens dnode_willuse_space(dn, 1000*fa9e4066Sahrens -BP_GET_ASIZE(db->db_blkptr), tx); 1001*fa9e4066Sahrens } 1002*fa9e4066Sahrens dnode_willuse_space(dn, db->db.db_size, tx); 1003*fa9e4066Sahrens } 1004*fa9e4066Sahrens 1005*fa9e4066Sahrens /* 1006*fa9e4066Sahrens * This buffer is now part of this txg 1007*fa9e4066Sahrens */ 1008*fa9e4066Sahrens dbuf_add_ref(db, (void *)(uintptr_t)tx->tx_txg); 1009*fa9e4066Sahrens db->db_dirtycnt += 1; 1010*fa9e4066Sahrens ASSERT3U(db->db_dirtycnt, <=, 3); 1011*fa9e4066Sahrens 1012*fa9e4066Sahrens mutex_exit(&db->db_mtx); 1013*fa9e4066Sahrens 1014*fa9e4066Sahrens if (db->db_blkid == DB_BONUS_BLKID) { 1015*fa9e4066Sahrens dnode_setdirty(dn, tx); 1016*fa9e4066Sahrens return; 1017*fa9e4066Sahrens } 1018*fa9e4066Sahrens 1019*fa9e4066Sahrens if (db->db_level == 0) 1020*fa9e4066Sahrens dnode_new_blkid(dn, db->db_blkid, tx); 1021*fa9e4066Sahrens 1022*fa9e4066Sahrens if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) { 1023*fa9e4066Sahrens rw_enter(&dn->dn_struct_rwlock, RW_READER); 1024*fa9e4066Sahrens drop_struct_lock = TRUE; 1025*fa9e4066Sahrens } 1026*fa9e4066Sahrens 1027*fa9e4066Sahrens if (db->db_level < dn->dn_nlevels-1) { 1028*fa9e4066Sahrens int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; 1029*fa9e4066Sahrens dmu_buf_impl_t *parent; 1030*fa9e4066Sahrens parent = dbuf_hold_level(dn, db->db_level+1, 1031*fa9e4066Sahrens db->db_blkid >> epbs, FTAG); 1032*fa9e4066Sahrens if (drop_struct_lock) 1033*fa9e4066Sahrens rw_exit(&dn->dn_struct_rwlock); 1034*fa9e4066Sahrens dbuf_dirty(parent, tx); 1035*fa9e4066Sahrens dbuf_remove_ref(parent, FTAG); 1036*fa9e4066Sahrens } else { 1037*fa9e4066Sahrens if (drop_struct_lock) 1038*fa9e4066Sahrens rw_exit(&dn->dn_struct_rwlock); 1039*fa9e4066Sahrens } 1040*fa9e4066Sahrens 1041*fa9e4066Sahrens dnode_setdirty(dn, tx); 1042*fa9e4066Sahrens } 1043*fa9e4066Sahrens 1044*fa9e4066Sahrens static int 1045*fa9e4066Sahrens dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) 
1046*fa9e4066Sahrens { 1047*fa9e4066Sahrens dnode_t *dn = db->db_dnode; 1048*fa9e4066Sahrens int txgoff = tx->tx_txg & TXG_MASK; 1049*fa9e4066Sahrens 1050*fa9e4066Sahrens ASSERT(tx->tx_txg != 0); 1051*fa9e4066Sahrens 1052*fa9e4066Sahrens mutex_enter(&db->db_mtx); 1053*fa9e4066Sahrens 1054*fa9e4066Sahrens /* 1055*fa9e4066Sahrens * If this buffer is not dirty, we're done. 1056*fa9e4066Sahrens */ 1057*fa9e4066Sahrens if (!list_link_active(&db->db_dirty_node[txgoff])) { 1058*fa9e4066Sahrens mutex_exit(&db->db_mtx); 1059*fa9e4066Sahrens return (0); 1060*fa9e4066Sahrens } 1061*fa9e4066Sahrens 1062*fa9e4066Sahrens /* 1063*fa9e4066Sahrens * If this buffer is currently held, we cannot undirty 1064*fa9e4066Sahrens * it, since one of the current holders may be in the 1065*fa9e4066Sahrens * middle of an update. Note that users of dbuf_undirty() 1066*fa9e4066Sahrens * should not place a hold on the dbuf before the call. 1067*fa9e4066Sahrens * XXX - this check assumes we are being called from 1068*fa9e4066Sahrens * dbuf_free_range(), perhaps we should move it there? 
1069*fa9e4066Sahrens */ 1070*fa9e4066Sahrens if (refcount_count(&db->db_holds) > db->db_dirtycnt) { 1071*fa9e4066Sahrens mutex_exit(&db->db_mtx); 1072*fa9e4066Sahrens mutex_enter(&dn->dn_mtx); 1073*fa9e4066Sahrens dnode_clear_range(dn, db->db_blkid, 1, tx); 1074*fa9e4066Sahrens mutex_exit(&dn->dn_mtx); 1075*fa9e4066Sahrens return (0); 1076*fa9e4066Sahrens } 1077*fa9e4066Sahrens 1078*fa9e4066Sahrens dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size); 1079*fa9e4066Sahrens 1080*fa9e4066Sahrens dbuf_unoverride(db, tx->tx_txg); 1081*fa9e4066Sahrens 1082*fa9e4066Sahrens ASSERT(db->db.db_size != 0); 1083*fa9e4066Sahrens if (db->db_level == 0) { 1084*fa9e4066Sahrens ASSERT(db->db_buf != NULL); 1085*fa9e4066Sahrens ASSERT(db->db_d.db_data_old[txgoff] != NULL); 1086*fa9e4066Sahrens if (db->db_d.db_data_old[txgoff] != db->db_buf) 1087*fa9e4066Sahrens arc_buf_free(db->db_d.db_data_old[txgoff], db); 1088*fa9e4066Sahrens db->db_d.db_data_old[txgoff] = NULL; 1089*fa9e4066Sahrens } 1090*fa9e4066Sahrens 1091*fa9e4066Sahrens /* XXX would be nice to fix up dn_towrite_space[] */ 1092*fa9e4066Sahrens /* XXX undo db_dirtied? but how? 
*/ 1093*fa9e4066Sahrens /* db->db_dirtied = tx->tx_txg; */ 1094*fa9e4066Sahrens 1095*fa9e4066Sahrens mutex_enter(&dn->dn_mtx); 1096*fa9e4066Sahrens list_remove(&dn->dn_dirty_dbufs[txgoff], db); 1097*fa9e4066Sahrens mutex_exit(&dn->dn_mtx); 1098*fa9e4066Sahrens 1099*fa9e4066Sahrens ASSERT(db->db_dirtycnt > 0); 1100*fa9e4066Sahrens db->db_dirtycnt -= 1; 1101*fa9e4066Sahrens 1102*fa9e4066Sahrens if (refcount_remove(&db->db_holds, 1103*fa9e4066Sahrens (void *)(uintptr_t)tx->tx_txg) == 0) { 1104*fa9e4066Sahrens /* make duf_verify() happy */ 1105*fa9e4066Sahrens if (db->db.db_data) 1106*fa9e4066Sahrens bzero(db->db.db_data, db->db.db_size); 1107*fa9e4066Sahrens 1108*fa9e4066Sahrens dbuf_evict(db); 1109*fa9e4066Sahrens return (1); 1110*fa9e4066Sahrens } 1111*fa9e4066Sahrens 1112*fa9e4066Sahrens mutex_exit(&db->db_mtx); 1113*fa9e4066Sahrens return (0); 1114*fa9e4066Sahrens } 1115*fa9e4066Sahrens 1116*fa9e4066Sahrens #pragma weak dmu_buf_will_dirty = dbuf_will_dirty 1117*fa9e4066Sahrens void 1118*fa9e4066Sahrens dbuf_will_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) 1119*fa9e4066Sahrens { 1120*fa9e4066Sahrens int rf = DB_RF_MUST_SUCCEED; 1121*fa9e4066Sahrens 1122*fa9e4066Sahrens ASSERT(tx->tx_txg != 0); 1123*fa9e4066Sahrens ASSERT(!refcount_is_zero(&db->db_holds)); 1124*fa9e4066Sahrens 1125*fa9e4066Sahrens if (RW_WRITE_HELD(&db->db_dnode->dn_struct_rwlock)) 1126*fa9e4066Sahrens rf |= DB_RF_HAVESTRUCT; 1127*fa9e4066Sahrens (void) dbuf_read_generic(db, rf); 1128*fa9e4066Sahrens dbuf_dirty(db, tx); 1129*fa9e4066Sahrens } 1130*fa9e4066Sahrens 1131*fa9e4066Sahrens #pragma weak dmu_buf_will_fill = dbuf_will_fill 1132*fa9e4066Sahrens void 1133*fa9e4066Sahrens dbuf_will_fill(dmu_buf_impl_t *db, dmu_tx_t *tx) 1134*fa9e4066Sahrens { 1135*fa9e4066Sahrens ASSERT(tx->tx_txg != 0); 1136*fa9e4066Sahrens ASSERT(db->db_level == 0); 1137*fa9e4066Sahrens ASSERT(!refcount_is_zero(&db->db_holds)); 1138*fa9e4066Sahrens 1139*fa9e4066Sahrens ASSERT(!(db->db.db_object & DMU_PRIVATE_OBJECT) || 
1140*fa9e4066Sahrens dmu_tx_private_ok(tx)); 1141*fa9e4066Sahrens 1142*fa9e4066Sahrens dbuf_noread(db); 1143*fa9e4066Sahrens dbuf_dirty(db, tx); 1144*fa9e4066Sahrens } 1145*fa9e4066Sahrens 1146*fa9e4066Sahrens #pragma weak dmu_buf_fill_done = dbuf_fill_done 1147*fa9e4066Sahrens /* ARGSUSED */ 1148*fa9e4066Sahrens void 1149*fa9e4066Sahrens dbuf_fill_done(dmu_buf_impl_t *db, dmu_tx_t *tx) 1150*fa9e4066Sahrens { 1151*fa9e4066Sahrens mutex_enter(&db->db_mtx); 1152*fa9e4066Sahrens dbuf_verify(db); 1153*fa9e4066Sahrens 1154*fa9e4066Sahrens if (db->db_state == DB_FILL) { 1155*fa9e4066Sahrens if (db->db_level == 0 && db->db_d.db_freed_in_flight) { 1156*fa9e4066Sahrens /* we were freed while filling */ 1157*fa9e4066Sahrens /* XXX dbuf_undirty? */ 1158*fa9e4066Sahrens bzero(db->db.db_data, db->db.db_size); 1159*fa9e4066Sahrens db->db_d.db_freed_in_flight = FALSE; 1160*fa9e4066Sahrens } 1161*fa9e4066Sahrens db->db_state = DB_CACHED; 1162*fa9e4066Sahrens cv_broadcast(&db->db_changed); 1163*fa9e4066Sahrens } 1164*fa9e4066Sahrens mutex_exit(&db->db_mtx); 1165*fa9e4066Sahrens } 1166*fa9e4066Sahrens 1167*fa9e4066Sahrens 1168*fa9e4066Sahrens static void 1169*fa9e4066Sahrens dbuf_clear(dmu_buf_impl_t *db) 1170*fa9e4066Sahrens { 1171*fa9e4066Sahrens dnode_t *dn = db->db_dnode; 1172*fa9e4066Sahrens 1173*fa9e4066Sahrens ASSERT(MUTEX_HELD(&dn->dn_dbufs_mtx)); 1174*fa9e4066Sahrens ASSERT(MUTEX_HELD(&db->db_mtx)); 1175*fa9e4066Sahrens ASSERT(refcount_is_zero(&db->db_holds)); 1176*fa9e4066Sahrens 1177*fa9e4066Sahrens if (db->db_state == DB_CACHED) { 1178*fa9e4066Sahrens ASSERT(db->db_buf != NULL); 1179*fa9e4066Sahrens arc_buf_free(db->db_buf, db); 1180*fa9e4066Sahrens db->db.db_data = NULL; 1181*fa9e4066Sahrens db->db_buf = NULL; 1182*fa9e4066Sahrens db->db_state = DB_UNCACHED; 1183*fa9e4066Sahrens } 1184*fa9e4066Sahrens 1185*fa9e4066Sahrens ASSERT3U(db->db_state, ==, DB_UNCACHED); 1186*fa9e4066Sahrens ASSERT(db->db_buf == NULL); 1187*fa9e4066Sahrens ASSERT(db->db_data_pending == NULL); 
1188*fa9e4066Sahrens 1189*fa9e4066Sahrens mutex_exit(&db->db_mtx); 1190*fa9e4066Sahrens 1191*fa9e4066Sahrens /* 1192*fa9e4066Sahrens * If this dbuf is referened from an indirect dbuf, 1193*fa9e4066Sahrens * decrement the ref count on the indirect dbuf. 1194*fa9e4066Sahrens */ 1195*fa9e4066Sahrens if (db->db_parent && db->db_parent != dn->dn_dbuf) 1196*fa9e4066Sahrens dbuf_remove_ref(db->db_parent, db); 1197*fa9e4066Sahrens 1198*fa9e4066Sahrens /* remove from dn_dbufs */ 1199*fa9e4066Sahrens list_remove(&dn->dn_dbufs, db); 1200*fa9e4066Sahrens 1201*fa9e4066Sahrens dnode_rele(dn, db); 1202*fa9e4066Sahrens 1203*fa9e4066Sahrens dbuf_hash_remove(db); 1204*fa9e4066Sahrens 1205*fa9e4066Sahrens db->db_dnode = NULL; 1206*fa9e4066Sahrens db->db_parent = NULL; 1207*fa9e4066Sahrens db->db_blkptr = NULL; 1208*fa9e4066Sahrens } 1209*fa9e4066Sahrens 1210*fa9e4066Sahrens static int 1211*fa9e4066Sahrens dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse, 1212*fa9e4066Sahrens dmu_buf_impl_t **parentp, blkptr_t **bpp) 1213*fa9e4066Sahrens { 1214*fa9e4066Sahrens int nlevels, epbs; 1215*fa9e4066Sahrens 1216*fa9e4066Sahrens if (dn->dn_phys->dn_nlevels == 0) 1217*fa9e4066Sahrens nlevels = 1; 1218*fa9e4066Sahrens else 1219*fa9e4066Sahrens nlevels = dn->dn_phys->dn_nlevels; 1220*fa9e4066Sahrens 1221*fa9e4066Sahrens epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; 1222*fa9e4066Sahrens 1223*fa9e4066Sahrens ASSERT3U(level * epbs, <, 64); 1224*fa9e4066Sahrens ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); 1225*fa9e4066Sahrens if (blkid == DB_BONUS_BLKID) { 1226*fa9e4066Sahrens /* this is the bonus buffer */ 1227*fa9e4066Sahrens *parentp = NULL; 1228*fa9e4066Sahrens *bpp = NULL; 1229*fa9e4066Sahrens return (0); 1230*fa9e4066Sahrens } else if (level >= nlevels || 1231*fa9e4066Sahrens (blkid > (dn->dn_phys->dn_maxblkid >> (level * epbs)))) { 1232*fa9e4066Sahrens /* the buffer has no parent yet */ 1233*fa9e4066Sahrens *parentp = NULL; 1234*fa9e4066Sahrens *bpp = NULL; 
1235*fa9e4066Sahrens return (ENOENT); 1236*fa9e4066Sahrens } else if (level < nlevels-1) { 1237*fa9e4066Sahrens /* this block is referenced from an indirect block */ 1238*fa9e4066Sahrens int err = dbuf_hold_impl(dn, level+1, 1239*fa9e4066Sahrens blkid >> epbs, fail_sparse, NULL, parentp); 1240*fa9e4066Sahrens if (err) 1241*fa9e4066Sahrens return (err); 1242*fa9e4066Sahrens dbuf_read_havestruct(*parentp); 1243*fa9e4066Sahrens *bpp = ((blkptr_t *)(*parentp)->db.db_data) + 1244*fa9e4066Sahrens (blkid & ((1ULL << epbs) - 1)); 1245*fa9e4066Sahrens return (0); 1246*fa9e4066Sahrens } else { 1247*fa9e4066Sahrens /* the block is referenced from the dnode */ 1248*fa9e4066Sahrens ASSERT3U(level, ==, nlevels-1); 1249*fa9e4066Sahrens ASSERT(dn->dn_phys->dn_nblkptr == 0 || 1250*fa9e4066Sahrens blkid < dn->dn_phys->dn_nblkptr); 1251*fa9e4066Sahrens *parentp = dn->dn_dbuf; 1252*fa9e4066Sahrens *bpp = &dn->dn_phys->dn_blkptr[blkid]; 1253*fa9e4066Sahrens return (0); 1254*fa9e4066Sahrens } 1255*fa9e4066Sahrens } 1256*fa9e4066Sahrens 1257*fa9e4066Sahrens static dmu_buf_impl_t * 1258*fa9e4066Sahrens dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid, 1259*fa9e4066Sahrens dmu_buf_impl_t *parent, blkptr_t *blkptr) 1260*fa9e4066Sahrens { 1261*fa9e4066Sahrens objset_impl_t *os = dn->dn_objset; 1262*fa9e4066Sahrens dmu_buf_impl_t *db, *odb; 1263*fa9e4066Sahrens 1264*fa9e4066Sahrens ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); 1265*fa9e4066Sahrens ASSERT(dn->dn_type != DMU_OT_NONE); 1266*fa9e4066Sahrens 1267*fa9e4066Sahrens db = kmem_cache_alloc(dbuf_cache, KM_SLEEP); 1268*fa9e4066Sahrens 1269*fa9e4066Sahrens db->db_objset = os; 1270*fa9e4066Sahrens db->db.db_object = dn->dn_object; 1271*fa9e4066Sahrens db->db_level = level; 1272*fa9e4066Sahrens db->db_blkid = blkid; 1273*fa9e4066Sahrens db->db_state = DB_UNCACHED; 1274*fa9e4066Sahrens 1275*fa9e4066Sahrens if (db->db_blkid == DB_BONUS_BLKID) { 1276*fa9e4066Sahrens db->db.db_size = dn->dn_bonuslen; 1277*fa9e4066Sahrens db->db.db_offset = 
DB_BONUS_BLKID; 1278*fa9e4066Sahrens } else { 1279*fa9e4066Sahrens int blocksize = 1280*fa9e4066Sahrens db->db_level ? 1<<dn->dn_indblkshift : dn->dn_datablksz; 1281*fa9e4066Sahrens db->db.db_size = blocksize; 1282*fa9e4066Sahrens db->db.db_offset = db->db_blkid * blocksize; 1283*fa9e4066Sahrens } 1284*fa9e4066Sahrens 1285*fa9e4066Sahrens db->db_dirtied = 0; 1286*fa9e4066Sahrens db->db_dirtycnt = 0; 1287*fa9e4066Sahrens 1288*fa9e4066Sahrens bzero(&db->db_d, sizeof (db->db_d)); 1289*fa9e4066Sahrens 1290*fa9e4066Sahrens /* 1291*fa9e4066Sahrens * Hold the dn_dbufs_mtx while we get the new dbuf 1292*fa9e4066Sahrens * in the hash table *and* added to the dbufs list. 1293*fa9e4066Sahrens * This prevents a possible deadlock with someone 1294*fa9e4066Sahrens * trying to look up this dbuf before its added to the 1295*fa9e4066Sahrens * dn_dbufs list. 1296*fa9e4066Sahrens */ 1297*fa9e4066Sahrens mutex_enter(&dn->dn_dbufs_mtx); 1298*fa9e4066Sahrens if ((odb = dbuf_hash_insert(db)) != NULL) { 1299*fa9e4066Sahrens /* someone else inserted it first */ 1300*fa9e4066Sahrens kmem_cache_free(dbuf_cache, db); 1301*fa9e4066Sahrens mutex_exit(&dn->dn_dbufs_mtx); 1302*fa9e4066Sahrens return (odb); 1303*fa9e4066Sahrens } 1304*fa9e4066Sahrens list_insert_head(&dn->dn_dbufs, db); 1305*fa9e4066Sahrens mutex_exit(&dn->dn_dbufs_mtx); 1306*fa9e4066Sahrens 1307*fa9e4066Sahrens if (parent && parent != dn->dn_dbuf) 1308*fa9e4066Sahrens dbuf_add_ref(parent, db); 1309*fa9e4066Sahrens 1310*fa9e4066Sahrens (void) refcount_add(&dn->dn_holds, db); 1311*fa9e4066Sahrens 1312*fa9e4066Sahrens db->db_dnode = dn; 1313*fa9e4066Sahrens db->db_parent = parent; 1314*fa9e4066Sahrens db->db_blkptr = blkptr; 1315*fa9e4066Sahrens 1316*fa9e4066Sahrens dprintf_dbuf(db, "db=%p\n", db); 1317*fa9e4066Sahrens 1318*fa9e4066Sahrens return (db); 1319*fa9e4066Sahrens } 1320*fa9e4066Sahrens 1321*fa9e4066Sahrens static int 1322*fa9e4066Sahrens dbuf_evictable(dmu_buf_impl_t *db) 1323*fa9e4066Sahrens { 1324*fa9e4066Sahrens int i; 
1325*fa9e4066Sahrens 1326*fa9e4066Sahrens ASSERT(MUTEX_HELD(&db->db_mtx)); 1327*fa9e4066Sahrens dbuf_verify(db); 1328*fa9e4066Sahrens 1329*fa9e4066Sahrens if (db->db_state != DB_UNCACHED && db->db_state != DB_CACHED) 1330*fa9e4066Sahrens return (FALSE); 1331*fa9e4066Sahrens 1332*fa9e4066Sahrens if (!refcount_is_zero(&db->db_holds)) 1333*fa9e4066Sahrens return (FALSE); 1334*fa9e4066Sahrens 1335*fa9e4066Sahrens #ifdef ZFS_DEBUG 1336*fa9e4066Sahrens for (i = 0; i < TXG_SIZE; i++) { 1337*fa9e4066Sahrens ASSERT(!list_link_active(&db->db_dirty_node[i])); 1338*fa9e4066Sahrens ASSERT(db->db_level != 0 || db->db_d.db_data_old[i] == NULL); 1339*fa9e4066Sahrens } 1340*fa9e4066Sahrens #endif 1341*fa9e4066Sahrens 1342*fa9e4066Sahrens /* 1343*fa9e4066Sahrens * Now we know we want to free it. 1344*fa9e4066Sahrens * This call must be done last, since it has side effects - 1345*fa9e4066Sahrens * calling the db_evict_func(). 1346*fa9e4066Sahrens */ 1347*fa9e4066Sahrens dbuf_evict_user(db); 1348*fa9e4066Sahrens return (TRUE); 1349*fa9e4066Sahrens } 1350*fa9e4066Sahrens 1351*fa9e4066Sahrens static void 1352*fa9e4066Sahrens dbuf_destroy(dmu_buf_impl_t *db) 1353*fa9e4066Sahrens { 1354*fa9e4066Sahrens ASSERT(refcount_is_zero(&db->db_holds)); 1355*fa9e4066Sahrens 1356*fa9e4066Sahrens ASSERT(db->db.db_data == NULL); 1357*fa9e4066Sahrens ASSERT(db->db_dnode == NULL); 1358*fa9e4066Sahrens ASSERT(db->db_parent == NULL); 1359*fa9e4066Sahrens ASSERT(db->db_hash_next == NULL); 1360*fa9e4066Sahrens ASSERT(db->db_blkptr == NULL); 1361*fa9e4066Sahrens ASSERT(db->db_data_pending == NULL); 1362*fa9e4066Sahrens 1363*fa9e4066Sahrens kmem_cache_free(dbuf_cache, db); 1364*fa9e4066Sahrens } 1365*fa9e4066Sahrens 1366*fa9e4066Sahrens void 1367*fa9e4066Sahrens dbuf_prefetch(dnode_t *dn, uint64_t blkid) 1368*fa9e4066Sahrens { 1369*fa9e4066Sahrens dmu_buf_impl_t *db, *parent = NULL; 1370*fa9e4066Sahrens blkptr_t *bp = NULL; 1371*fa9e4066Sahrens 1372*fa9e4066Sahrens ASSERT(blkid != DB_BONUS_BLKID); 
1373*fa9e4066Sahrens ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); 1374*fa9e4066Sahrens 1375*fa9e4066Sahrens if (dnode_block_freed(dn, blkid)) 1376*fa9e4066Sahrens return; 1377*fa9e4066Sahrens 1378*fa9e4066Sahrens /* dbuf_find() returns with db_mtx held */ 1379*fa9e4066Sahrens if (db = dbuf_find(dn, 0, blkid)) { 1380*fa9e4066Sahrens /* 1381*fa9e4066Sahrens * This dbuf is already in the cache. We assume that 1382*fa9e4066Sahrens * it is already CACHED, or else about to be either 1383*fa9e4066Sahrens * read or filled. 1384*fa9e4066Sahrens */ 1385*fa9e4066Sahrens mutex_exit(&db->db_mtx); 1386*fa9e4066Sahrens return; 1387*fa9e4066Sahrens } 1388*fa9e4066Sahrens 1389*fa9e4066Sahrens if (dbuf_findbp(dn, 0, blkid, TRUE, &parent, &bp) == 0) { 1390*fa9e4066Sahrens if (bp && !BP_IS_HOLE(bp)) { 1391*fa9e4066Sahrens (void) arc_read(NULL, dn->dn_objset->os_spa, bp, 1392*fa9e4066Sahrens dmu_ot[dn->dn_type].ot_byteswap, 1393*fa9e4066Sahrens NULL, NULL, ZIO_PRIORITY_ASYNC_READ, 1394*fa9e4066Sahrens ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, 1395*fa9e4066Sahrens (ARC_NOWAIT | ARC_PREFETCH)); 1396*fa9e4066Sahrens } 1397*fa9e4066Sahrens if (parent && parent != dn->dn_dbuf) 1398*fa9e4066Sahrens dbuf_rele(parent); 1399*fa9e4066Sahrens } 1400*fa9e4066Sahrens } 1401*fa9e4066Sahrens 1402*fa9e4066Sahrens /* 1403*fa9e4066Sahrens * Returns with db_holds incremented, and db_mtx not held. 1404*fa9e4066Sahrens * Note: dn_struct_rwlock must be held. 
 */
int
dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, int fail_sparse,
    void *tag, dmu_buf_impl_t **dbp)
{
	dmu_buf_impl_t *db, *parent = NULL;

	/* Caller must hold the dnode's structure lock (reader or writer). */
	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
	ASSERT3U(dn->dn_nlevels, >, level);

	*dbp = NULL;

	/* dbuf_find() returns with db_mtx held */
	db = dbuf_find(dn, level, blkid);

	if (db == NULL) {
		blkptr_t *bp = NULL;
		int err;

		/*
		 * Not in the dbuf cache: locate the parent dbuf and block
		 * pointer, then create a new dbuf hooked under them.
		 */
		err = dbuf_findbp(dn, level, blkid, fail_sparse, &parent, &bp);
		if (fail_sparse) {
			/* A hole counts as "not there" when fail_sparse. */
			if (err == 0 && bp && BP_IS_HOLE(bp))
				err = ENOENT;
			if (err) {
				if (parent && parent != dn->dn_dbuf)
					dbuf_rele(parent);
				return (err);
			}
		}
		db = dbuf_create(dn, level, blkid, parent, bp);
	}

	/*
	 * If this buffer is currently syncing out, and we are
	 * still referencing it from db_data, we need to make
	 * a copy of it in case we decide we want to dirty it
	 * again in this txg.
	 */
	if (db->db_level == 0 && db->db_state == DB_CACHED &&
	    !(dn->dn_object & DMU_PRIVATE_OBJECT) &&
	    db->db_data_pending == db->db_buf) {
		/*
		 * Bonus buffers are copied at their maximum possible size;
		 * everything else at its current size.
		 */
		int size = (db->db_blkid == DB_BONUS_BLKID) ?
		    DN_MAX_BONUSLEN : db->db.db_size;

		dbuf_set_data(db, arc_buf_alloc(db->db_dnode->dn_objset->os_spa,
		    size, db));
		bcopy(db->db_data_pending->b_data, db->db.db_data,
		    db->db.db_size);
	}

	dbuf_add_ref(db, tag);
	dbuf_update_data(db);
	dbuf_verify(db);
	mutex_exit(&db->db_mtx);

	/* NOTE: we can't rele the parent until after we drop the db_mtx */
	if (parent && parent != dn->dn_dbuf)
		dbuf_rele(parent);

	ASSERT3P(db->db_dnode, ==, dn);
	ASSERT3U(db->db_blkid, ==, blkid);
	ASSERT3U(db->db_level, ==, level);
	*dbp = db;

	return (0);
}

/*
 * Hold the level-0 dbuf for the given block, with no tag and
 * fail_sparse disabled (holes are returned, not ENOENT).
 * Thin convenience wrapper around dbuf_hold_impl().
 */
dmu_buf_impl_t *
dbuf_hold(dnode_t *dn, uint64_t blkid)
{
	dmu_buf_impl_t *db;
	(void) dbuf_hold_impl(dn, 0, blkid, FALSE, NULL, &db);
	return (db);
}

/*
 * As dbuf_hold(), but at an arbitrary indirection level and with a
 * caller-supplied hold tag.
 */
dmu_buf_impl_t *
dbuf_hold_level(dnode_t *dn, int level, uint64_t blkid, void *tag)
{
	dmu_buf_impl_t *db;
	(void) dbuf_hold_impl(dn, level, blkid, FALSE, tag, &db);
	return (db);
}

/*
 * Hold the dnode's bonus buffer (blkid DB_BONUS_BLKID), taking the
 * dnode structure lock as reader for the duration of the hold call.
 */
dmu_buf_impl_t *
dbuf_hold_bonus(dnode_t *dn, void *tag)
{
	dmu_buf_impl_t *db;
	rw_enter(&dn->dn_struct_rwlock, RW_READER);
	(void) dbuf_hold_impl(dn, 0, DB_BONUS_BLKID, FALSE, tag, &db);
	rw_exit(&dn->dn_struct_rwlock);
	return (db);
}

/*
 * Add a tagged hold on the dbuf, preventing it from being evicted.
 */
void
dbuf_add_ref(dmu_buf_impl_t *db, void *tag)
{
	(void) refcount_add(&db->db_holds, tag);
	/* dprintf_dbuf(db, "adding ref %p; holds up to %lld\n", tag, holds); */
}

/*
 * Drop a tagged hold on the dbuf.  When the last hold is dropped, the
 * dbuf is evicted; when the only remaining holds are dirty-txg holds
 * and immediate-evict is set, the user's evict callback fires.
 *
 * Acquires dn_dbufs_mtx if not already held, keeping a dnode hold
 * across that window so the dnode can't disappear underneath us.
 */
void
dbuf_remove_ref(dmu_buf_impl_t *db, void *tag)
{
	int64_t holds;
	dnode_t *dn = db->db_dnode;
	int need_mutex;

	ASSERT(dn != NULL);
	need_mutex = !MUTEX_HELD(&dn->dn_dbufs_mtx);

	if (need_mutex) {
		dnode_add_ref(dn, FTAG);
		mutex_enter(&dn->dn_dbufs_mtx);
	}

	mutex_enter(&db->db_mtx);
	dbuf_verify(db);

	holds = refcount_remove(&db->db_holds, tag);

	if (holds == 0) {
		/* Last hold gone: clear the user data pointer and evict. */
		ASSERT3U(db->db_state, !=, DB_FILL);
		if (db->db_level == 0 &&
		    db->db_d.db_user_data_ptr_ptr != NULL)
			*db->db_d.db_user_data_ptr_ptr = NULL;
		/* NOTE(review): dbuf_evict() appears to consume db_mtx */
		dbuf_evict(db);
	} else {
		/*
		 * Only dirty-txg holds remain: run the user's evict
		 * callback now if immediate-evict was requested.
		 */
		if (holds == db->db_dirtycnt &&
		    db->db_level == 0 && db->db_d.db_immediate_evict)
			dbuf_evict_user(db);
		mutex_exit(&db->db_mtx);
	}

	if (need_mutex) {
		mutex_exit(&dn->dn_dbufs_mtx);
		dnode_rele(dn, FTAG);
	}
}

/*
 * Drop an untagged hold (the counterpart of dbuf_hold()).
 */
void
dbuf_rele(dmu_buf_impl_t *db)
{
	dbuf_remove_ref(db, NULL);
}

#pragma weak dmu_buf_refcount = dbuf_refcount
/*
 * Return the current number of holds on the dbuf.
 */
uint64_t
dbuf_refcount(dmu_buf_impl_t *db)
{
	return (refcount_count(&db->db_holds));
}

/*
 * Unconditionally install user data on the dbuf (old_user_ptr == NULL,
 * so this only succeeds if no user data is currently set -- see
 * dmu_buf_update_user() for the return-value contract).
 */
void *
dmu_buf_set_user(dmu_buf_t *db_fake, void *user_ptr, void *user_data_ptr_ptr,
    dmu_buf_evict_func_t *evict_func)
{
	return (dmu_buf_update_user(db_fake, NULL, user_ptr,
	    user_data_ptr_ptr, evict_func));
}

/*
 * As dmu_buf_set_user(), but also marks the dbuf for immediate
 * eviction of the user data when only dirty-txg holds remain
 * (see dbuf_remove_ref()).
 */
void *
dmu_buf_set_user_ie(dmu_buf_t *db_fake, void *user_ptr, void *user_data_ptr_ptr,
    dmu_buf_evict_func_t *evict_func)
{
	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;

	db->db_d.db_immediate_evict = TRUE;
	return (dmu_buf_update_user(db_fake, NULL, user_ptr,
	    user_data_ptr_ptr, evict_func));
}

/*
 * Compare-and-swap style update of the dbuf's user data: if the
 * current user pointer equals old_user_ptr, install user_ptr,
 * user_data_ptr_ptr and evict_func and return old_user_ptr (i.e. the
 * value the caller passed in, signalling success); otherwise change
 * nothing and return the current (conflicting) user pointer.
 * Level-0 dbufs only.
 */
void *
dmu_buf_update_user(dmu_buf_t *db_fake, void *old_user_ptr, void *user_ptr,
    void *user_data_ptr_ptr, dmu_buf_evict_func_t *evict_func)
{
	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
	ASSERT(db->db_level == 0);

	/* A user pointer and its evict callback come and go together. */
	ASSERT((user_ptr == NULL) == (evict_func == NULL));

	mutex_enter(&db->db_mtx);

	if (db->db_d.db_user_ptr == old_user_ptr) {
		db->db_d.db_user_ptr = user_ptr;
		db->db_d.db_user_data_ptr_ptr = user_data_ptr_ptr;
		db->db_d.db_evict_func = evict_func;

		dbuf_update_data(db);
	} else {
		old_user_ptr = db->db_d.db_user_ptr;
	}

	mutex_exit(&db->db_mtx);
	return (old_user_ptr);
}

/*
 * Return the user data currently attached to the dbuf (may be NULL).
 * The caller must already have a hold on the dbuf.
 */
void *
dmu_buf_get_user(dmu_buf_t *db_fake)
{
	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
	ASSERT(!refcount_is_zero(&db->db_holds));

	return (db->db_d.db_user_ptr);
}

/*
 * Push this dbuf's dirty data for txg out to stable storage.  Called
 * only from syncing context (asserted below).  Depending on the dbuf's
 * situation this either: copies bonus data into the dnode; drops the
 * dirty state of an untouched indirect; hooks the dbuf's block pointer
 * into its (possibly newly-acquired) parent; applies an "overridden"
 * block pointer directly; or issues the actual arc_write().  Each early
 * exit decrements db_dirtycnt and drops the txg's hold on the dbuf; the
 * arc_write() path defers that to dbuf_write_done().
 */
void
dbuf_sync(dmu_buf_impl_t *db, zio_t *zio, dmu_tx_t *tx)
{
	arc_buf_t **data;
	uint64_t txg = tx->tx_txg;
	dnode_t *dn = db->db_dnode;
	objset_impl_t *os = dn->dn_objset;
	int blksz;

	ASSERT(dmu_tx_is_syncing(tx));

	dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr);

	mutex_enter(&db->db_mtx);
	/*
	 * To be synced, we must be dirtied.  But we
	 * might have been freed after the dirty.
	 */
	if (db->db_state == DB_UNCACHED) {
		/* This buffer has been freed since it was dirtied */
		ASSERT(db->db.db_data == NULL);
	} else if (db->db_state == DB_FILL) {
		/* This buffer was freed and is now being re-filled */
		ASSERT(db->db.db_data != db->db_d.db_data_old[txg&TXG_MASK]);
	} else {
		ASSERT3U(db->db_state, ==, DB_CACHED);
	}
	dbuf_verify(db);

	/*
	 * Don't need a lock on db_dirty (dn_mtx), because it can't
	 * be modified yet.
	 */

	if (db->db_level == 0) {
		data = &db->db_d.db_data_old[txg&TXG_MASK];
		blksz = arc_buf_size(*data);
		/*
		 * If this buffer is currently "in use" (i.e., there are
		 * active holds and db_data still references it), then make
		 * a copy before we start the write so that any modifications
		 * from the open txg will not leak into this write.
		 *
		 * NOTE: this copy does not need to be made for objects only
		 * modified in the syncing context (e.g. DNONE_DNODE blocks)
		 * or if there is no actual write involved (bonus blocks).
		 */
		if (!(dn->dn_object & DMU_PRIVATE_OBJECT) &&
		    db->db_d.db_overridden_by[txg&TXG_MASK] == NULL &&
		    db->db_blkid != DB_BONUS_BLKID) {
			if (refcount_count(&db->db_holds) > 1 &&
			    *data == db->db_buf) {
				*data = arc_buf_alloc(
				    db->db_dnode->dn_objset->os_spa, blksz, db);
				bcopy(db->db.db_data, (*data)->b_data, blksz);
			}
			db->db_data_pending = *data;
		} else if (dn->dn_object & DMU_PRIVATE_OBJECT) {
			/*
			 * Private object buffers are released here rather
			 * than in dbuf_dirty() since they are only modified
			 * in the syncing context and we don't want the
			 * overhead of making multiple copies of the data.
			 */
			arc_release(db->db_buf, db);
		}
	} else {
		/* Indirect block: the buffer to write is db_buf itself. */
		data = &db->db_buf;
		if (*data == NULL) {
			/*
			 * This can happen if we dirty and then free
			 * the level-0 data blocks in the same txg. So
			 * this indirect remains unchanged.
			 */
			if (db->db_dirtied == txg)
				db->db_dirtied = 0;
			ASSERT(db->db_dirtycnt > 0);
			db->db_dirtycnt -= 1;
			mutex_exit(&db->db_mtx);
			dbuf_remove_ref(db, (void *)(uintptr_t)txg);
			return;
		}
		blksz = db->db.db_size;
		ASSERT3U(blksz, ==, 1<<dn->dn_phys->dn_indblkshift);
	}

	ASSERT(*data != NULL);

	if (db->db_blkid == DB_BONUS_BLKID) {
		/*
		 * Simply copy the bonus data into the dnode.  It will
		 * be written out when the dnode is synced (and it will
		 * be synced, since it must have been dirty for dbuf_sync
		 * to be called).  The bonus data will be byte swapped
		 * in dnode_byteswap.
		 */
		/*
		 * Use dn_phys->dn_bonuslen since db.db_size is the length
		 * of the bonus buffer in the open transaction rather than
		 * the syncing transaction.
		 */
		ASSERT3U(db->db_level, ==, 0);
		ASSERT3U(dn->dn_phys->dn_bonuslen, <=, blksz);
		bcopy((*data)->b_data, DN_BONUS(dn->dn_phys),
		    dn->dn_phys->dn_bonuslen);
		if (*data != db->db_buf)
			arc_buf_free(*data, db);
		db->db_d.db_data_old[txg&TXG_MASK] = NULL;
		db->db_data_pending = NULL;
		if (db->db_dirtied == txg)
			db->db_dirtied = 0;
		ASSERT(db->db_dirtycnt > 0);
		db->db_dirtycnt -= 1;
		mutex_exit(&db->db_mtx);
		dbuf_remove_ref(db, (void *)(uintptr_t)txg);
		return;
	} else if (db->db_level > 0 && !arc_released(db->db_buf)) {
		/*
		 * This indirect buffer was marked dirty, but
		 * never modified (if it had been modified, then
		 * we would have released the buffer).  There is
		 * no reason to write anything.
		 */
		db->db_data_pending = NULL;
		if (db->db_dirtied == txg)
			db->db_dirtied = 0;
		ASSERT(db->db_dirtycnt > 0);
		db->db_dirtycnt -= 1;
		mutex_exit(&db->db_mtx);
		dbuf_remove_ref(db, (void *)(uintptr_t)txg);
		return;
	} else if (db->db_blkptr == NULL &&
	    db->db_level == dn->dn_phys->dn_nlevels-1 &&
	    db->db_blkid < dn->dn_phys->dn_nblkptr) {
		/*
		 * This buffer was allocated at a time when there was
		 * no available blkptrs from the dnode, or it was
		 * inappropriate to hook it in (i.e., nlevels mis-match).
		 */
		ASSERT(db->db_blkptr == NULL);
		ASSERT(db->db_parent == NULL);
		db->db_parent = dn->dn_dbuf;
		db->db_blkptr = &dn->dn_phys->dn_blkptr[db->db_blkid];
		dbuf_verify(db);
		mutex_exit(&db->db_mtx);
	} else if (db->db_blkptr == NULL) {
		/* No blkptr yet: hold and read in the parent indirect. */
		dmu_buf_impl_t *parent = db->db_parent;
		int epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;

		mutex_exit(&db->db_mtx);
		ASSERT(dn->dn_phys->dn_nlevels > 1);
		if (parent == NULL) {
			rw_enter(&dn->dn_struct_rwlock, RW_READER);
			(void) dbuf_hold_impl(dn, db->db_level+1,
			    db->db_blkid >> epbs, FALSE, NULL, &parent);
			rw_exit(&dn->dn_struct_rwlock);
			/* Swap the anonymous hold for one tagged with db. */
			dbuf_add_ref(parent, db);
			db->db_parent = parent;
			dbuf_rele(parent);
		}
		dbuf_read(parent);
	} else {
		mutex_exit(&db->db_mtx);
	}

	ASSERT(IS_DNODE_DNODE(dn->dn_object) || db->db_parent != NULL);

	if (db->db_parent != dn->dn_dbuf) {
		/*
		 * Point db_blkptr at our slot inside the parent
		 * indirect block's array of blkptrs.
		 */
		dmu_buf_impl_t *parent = db->db_parent;
		int epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;

		mutex_enter(&db->db_mtx);
		ASSERT(db->db_level == parent->db_level-1);
		ASSERT(list_link_active(&parent->db_dirty_node[txg&TXG_MASK]));
		/*
		 * We may have read this block after we dirtied it,
		 * so never released it from the cache.
		 */
		arc_release(parent->db_buf, parent);

		db->db_blkptr = (blkptr_t *)parent->db.db_data +
		    (db->db_blkid & ((1ULL << epbs) - 1));
		dbuf_verify(db);
		mutex_exit(&db->db_mtx);
	}
	ASSERT(db->db_parent == NULL || arc_released(db->db_parent->db_buf));

#ifdef ZFS_DEBUG
	if (db->db_parent == dn->dn_dbuf) {
		/*
		 * We don't need to dnode_setdirty(dn) because if we got
		 * here then the parent is already dirty.
		 */
		ASSERT(db->db_level == dn->dn_phys->dn_nlevels-1);
		ASSERT3P(db->db_blkptr, ==,
		    &dn->dn_phys->dn_blkptr[db->db_blkid]);
	}
#endif
	if (db->db_level == 0 &&
	    db->db_d.db_overridden_by[txg&TXG_MASK] != NULL) {
		/*
		 * An "overridden" block pointer was supplied for this
		 * txg: account for the space delta, record block
		 * birth/death, and install it directly -- no write.
		 */
		arc_buf_t **old = &db->db_d.db_data_old[txg&TXG_MASK];
		blkptr_t **bpp = &db->db_d.db_overridden_by[txg&TXG_MASK];
		int old_size = BP_GET_ASIZE(db->db_blkptr);
		int new_size = BP_GET_ASIZE(*bpp);

		ASSERT(db->db_blkid != DB_BONUS_BLKID);

		dnode_diduse_space(dn, new_size-old_size);
		mutex_enter(&dn->dn_mtx);
		if (db->db_blkid > dn->dn_phys->dn_maxblkid)
			dn->dn_phys->dn_maxblkid = db->db_blkid;
		mutex_exit(&dn->dn_mtx);

		dsl_dataset_block_born(os->os_dsl_dataset, *bpp, tx);
		if (!BP_IS_HOLE(db->db_blkptr))
			dsl_dataset_block_kill(os->os_dsl_dataset,
			    db->db_blkptr, os->os_synctx);

		mutex_enter(&db->db_mtx);
		*db->db_blkptr = **bpp;
		kmem_free(*bpp, sizeof (blkptr_t));
		*bpp = NULL;

		if (*old != db->db_buf)
			arc_buf_free(*old, db);
		*old = NULL;
		db->db_data_pending = NULL;

		/* Wake anyone waiting for this dirty data to settle. */
		cv_broadcast(&db->db_changed);

		ASSERT(db->db_dirtycnt > 0);
		db->db_dirtycnt -= 1;
		mutex_exit(&db->db_mtx);
		dbuf_remove_ref(db, (void *)(uintptr_t)txg);
	} else {
		int checksum, compress;

		if (db->db_level > 0) {
			/*
			 * XXX -- we should design a compression algorithm
			 * that specializes in arrays of bps.
			 */
			checksum = ZIO_CHECKSUM_FLETCHER_4;
			compress = ZIO_COMPRESS_LZJB;
		} else {
			/*
			 * Allow dnode settings to override objset settings,
			 * except for metadata checksums.
			 */
			if (dmu_ot[dn->dn_type].ot_metadata) {
				checksum = os->os_md_checksum;
				compress = zio_compress_select(dn->dn_compress,
				    os->os_md_compress);
			} else {
				checksum = zio_checksum_select(dn->dn_checksum,
				    os->os_checksum);
				compress = zio_compress_select(dn->dn_compress,
				    os->os_compress);
			}
		}
#ifdef ZFS_DEBUG
		if (db->db_parent) {
			ASSERT(list_link_active(
			    &db->db_parent->db_dirty_node[txg&TXG_MASK]));
			ASSERT(db->db_parent == dn->dn_dbuf ||
			    db->db_parent->db_level > 0);
			if (dn->dn_object & DMU_PRIVATE_OBJECT ||
			    db->db_level > 0)
				ASSERT(*data == db->db_buf);
		}
#endif
		ASSERT3U(db->db_blkptr->blk_birth, <=, tx->tx_txg);
		/* dbuf_write_done() finishes the bookkeeping for us. */
		(void) arc_write(zio, os->os_spa, checksum, compress, txg,
		    db->db_blkptr, *data, dbuf_write_done, db,
		    ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, ARC_NOWAIT);
		/*
		 * We can't access db after arc_write, since it could finish
		 * and be freed, and we have no locks on it.
		 */
	}
}

/*
 * Argument bundle for the dbuf_do_born/dbuf_do_kill taskq callbacks:
 * the objset plus a by-value copy of the block pointer in question.
 */
struct dbuf_arg {
	objset_impl_t *os;
	blkptr_t bp;
};

/*
 * Taskq callback: record the birth of a block against the dataset,
 * then free the argument bundle.
 */
static void
dbuf_do_born(void *arg)
{
	struct dbuf_arg *da = arg;
	dsl_dataset_block_born(da->os->os_dsl_dataset,
	    &da->bp, da->os->os_synctx);
	kmem_free(da, sizeof (struct dbuf_arg));
}

/*
 * Taskq callback: record the death of a block against the dataset,
 * then free the argument bundle.
 */
static void
dbuf_do_kill(void *arg)
{
	struct dbuf_arg *da = arg;
	dsl_dataset_block_kill(da->os->os_dsl_dataset,
	    &da->bp, da->os->os_synctx);
	kmem_free(da, sizeof (struct dbuf_arg));
}

/*
 * arc_write() completion callback for dbuf_sync().  Updates the
 * dnode's space accounting and dn_maxblkid, computes the new blk_fill
 * for the written block pointer (non-free dnodes for a DNODE block,
 * sum of child fills for an indirect, 1/0 otherwise), stamps the
 * blkptr's type and level, dispatches block born/kill bookkeeping to
 * dbuf_tq if the block moved, and finally drops the txg's dirty hold.
 */
/* ARGSUSED */
static void
dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
{
	dmu_buf_impl_t *db = vdb;
	dnode_t *dn = db->db_dnode;
	objset_impl_t *os = dn->dn_objset;
	uint64_t txg = zio->io_txg;
	uint64_t fill = 0;
	int i;
	int old_size, new_size;

	ASSERT3U(zio->io_error, ==, 0);

	dprintf_dbuf_bp(db, &zio->io_bp_orig, "bp_orig: %s", "");

	old_size = BP_GET_ASIZE(&zio->io_bp_orig);
	new_size = BP_GET_ASIZE(zio->io_bp);

	dnode_diduse_space(dn, new_size-old_size);

	mutex_enter(&db->db_mtx);

	if (db->db_dirtied == txg)
		db->db_dirtied = 0;

	if (db->db_level == 0) {
		arc_buf_t **old = &db->db_d.db_data_old[txg&TXG_MASK];

		ASSERT(db->db_blkid != DB_BONUS_BLKID);

		/* Free the write-time copy if one was made in dbuf_sync(). */
		if (*old != db->db_buf)
			arc_buf_free(*old, db);
		*old = NULL;
		db->db_data_pending = NULL;

		mutex_enter(&dn->dn_mtx);
		if (db->db_blkid > dn->dn_phys->dn_maxblkid &&
		    !BP_IS_HOLE(db->db_blkptr))
			dn->dn_phys->dn_maxblkid = db->db_blkid;
		mutex_exit(&dn->dn_mtx);

		if (dn->dn_type == DMU_OT_DNODE) {
			/* fill = number of allocated dnodes in this block */
			dnode_phys_t *dnp = db->db.db_data;
			for (i = db->db.db_size >> DNODE_SHIFT; i > 0;
			    i--, dnp++) {
				if (dnp->dn_type != DMU_OT_NONE)
					fill++;
			}
		} else {
			if (!BP_IS_HOLE(db->db_blkptr))
				fill = 1;
		}
	} else {
		/* Indirect: fill = sum of the children's fill counts. */
		blkptr_t *bp = db->db.db_data;
		ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
		if (!BP_IS_HOLE(db->db_blkptr)) {
			ASSERT3U(BP_GET_LSIZE(zio->io_bp), ==, db->db.db_size);
			ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==,
			    db->db.db_size);
		}
		for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, bp++) {
			if (BP_IS_HOLE(bp))
				continue;
			ASSERT3U(BP_GET_LSIZE(bp), ==,
			    db->db_level == 1 ? dn->dn_datablksz :
			    (1<<dn->dn_phys->dn_indblkshift));
			fill += bp->blk_fill;
		}
	}

	if (!BP_IS_HOLE(db->db_blkptr)) {
		db->db_blkptr->blk_fill = fill;
		BP_SET_TYPE(db->db_blkptr, dn->dn_type);
		BP_SET_LEVEL(db->db_blkptr, db->db_level);
	} else {
		ASSERT3U(fill, ==, 0);
		ASSERT3U(db->db_blkptr->blk_fill, ==, 0);
	}

	dprintf_dbuf_bp(db, db->db_blkptr,
	    "wrote %llu bytes to blkptr:", zio->io_size);

	ASSERT(db->db_parent == NULL ||
	    list_link_active(&db->db_parent->db_dirty_node[txg&TXG_MASK]));
	cv_broadcast(&db->db_changed);
	ASSERT(db->db_dirtycnt > 0);
	db->db_dirtycnt -= 1;
	mutex_exit(&db->db_mtx);

	/* We must do this after we've set the bp's type and level */
	if (!DVA_EQUAL(BP_IDENTITY(zio->io_bp),
	    BP_IDENTITY(&zio->io_bp_orig))) {
		/*
		 * The block was rewritten to a new location: queue the
		 * birth of the new block (and the death of the old one,
		 * if it existed) to the dbuf taskq.
		 */
		struct dbuf_arg *da;
		da = kmem_alloc(sizeof (struct dbuf_arg), KM_SLEEP);
		da->os = os;
		da->bp = *zio->io_bp;
		(void) taskq_dispatch(dbuf_tq, dbuf_do_born, da, 0);
		if (!BP_IS_HOLE(&zio->io_bp_orig)) {
			da = kmem_alloc(sizeof (struct dbuf_arg), KM_SLEEP);
			da->os = os;
			da->bp = zio->io_bp_orig;
			(void) taskq_dispatch(dbuf_tq, dbuf_do_kill, da, 0);
		}
	}

	dbuf_remove_ref(db, (void *)(uintptr_t)txg);
}