/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
24fa9e4066Sahrens */ 25fa9e4066Sahrens 26fa9e4066Sahrens #pragma ident "%Z%%M% %I% %E% SMI" 27fa9e4066Sahrens 28fa9e4066Sahrens #include <sys/dmu.h> 29fa9e4066Sahrens #include <sys/dmu_impl.h> 30fa9e4066Sahrens #include <sys/dmu_tx.h> 31fa9e4066Sahrens #include <sys/dbuf.h> 32fa9e4066Sahrens #include <sys/dnode.h> 33fa9e4066Sahrens #include <sys/zfs_context.h> 34fa9e4066Sahrens #include <sys/dmu_objset.h> 35fa9e4066Sahrens #include <sys/dmu_traverse.h> 36fa9e4066Sahrens #include <sys/dsl_dataset.h> 37fa9e4066Sahrens #include <sys/dsl_dir.h> 38fa9e4066Sahrens #include <sys/dsl_pool.h> 391d452cf5Sahrens #include <sys/dsl_synctask.h> 40a2eea2e1Sahrens #include <sys/dsl_prop.h> 41fa9e4066Sahrens #include <sys/dmu_zfetch.h> 42fa9e4066Sahrens #include <sys/zfs_ioctl.h> 43fa9e4066Sahrens #include <sys/zap.h> 44ea8dc4b6Seschrock #include <sys/zio_checksum.h> 4544eda4d7Smaybee #ifdef _KERNEL 4644eda4d7Smaybee #include <sys/vmsystm.h> 4744eda4d7Smaybee #endif 48fa9e4066Sahrens 49fa9e4066Sahrens const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = { 50fa9e4066Sahrens { byteswap_uint8_array, TRUE, "unallocated" }, 51fa9e4066Sahrens { zap_byteswap, TRUE, "object directory" }, 52fa9e4066Sahrens { byteswap_uint64_array, TRUE, "object array" }, 53fa9e4066Sahrens { byteswap_uint8_array, TRUE, "packed nvlist" }, 54fa9e4066Sahrens { byteswap_uint64_array, TRUE, "packed nvlist size" }, 55fa9e4066Sahrens { byteswap_uint64_array, TRUE, "bplist" }, 56fa9e4066Sahrens { byteswap_uint64_array, TRUE, "bplist header" }, 57fa9e4066Sahrens { byteswap_uint64_array, TRUE, "SPA space map header" }, 58fa9e4066Sahrens { byteswap_uint64_array, TRUE, "SPA space map" }, 59fa9e4066Sahrens { byteswap_uint64_array, TRUE, "ZIL intent log" }, 60fa9e4066Sahrens { dnode_buf_byteswap, TRUE, "DMU dnode" }, 61fa9e4066Sahrens { dmu_objset_byteswap, TRUE, "DMU objset" }, 62fa9e4066Sahrens { byteswap_uint64_array, TRUE, "DSL directory" }, 63fa9e4066Sahrens { zap_byteswap, TRUE, "DSL directory child map"}, 
64fa9e4066Sahrens { zap_byteswap, TRUE, "DSL dataset snap map" }, 65fa9e4066Sahrens { zap_byteswap, TRUE, "DSL props" }, 66fa9e4066Sahrens { byteswap_uint64_array, TRUE, "DSL dataset" }, 67fa9e4066Sahrens { zfs_znode_byteswap, TRUE, "ZFS znode" }, 68da6c28aaSamw { zfs_oldacl_byteswap, TRUE, "ZFS V0 ACL" }, 69fa9e4066Sahrens { byteswap_uint8_array, FALSE, "ZFS plain file" }, 70fa9e4066Sahrens { zap_byteswap, TRUE, "ZFS directory" }, 71fa9e4066Sahrens { zap_byteswap, TRUE, "ZFS master node" }, 72fa9e4066Sahrens { zap_byteswap, TRUE, "ZFS delete queue" }, 73fa9e4066Sahrens { byteswap_uint8_array, FALSE, "zvol object" }, 74fa9e4066Sahrens { zap_byteswap, TRUE, "zvol prop" }, 75fa9e4066Sahrens { byteswap_uint8_array, FALSE, "other uint8[]" }, 76fa9e4066Sahrens { byteswap_uint64_array, FALSE, "other uint64[]" }, 77fa9e4066Sahrens { zap_byteswap, TRUE, "other ZAP" }, 78ea8dc4b6Seschrock { zap_byteswap, TRUE, "persistent error log" }, 7906eeb2adSek110237 { byteswap_uint8_array, TRUE, "SPA history" }, 8006eeb2adSek110237 { byteswap_uint64_array, TRUE, "SPA history offsets" }, 81b1b8ab34Slling { zap_byteswap, TRUE, "Pool properties" }, 82da6c28aaSamw { zap_byteswap, TRUE, "DSL permissions" }, 83da6c28aaSamw { zfs_acl_byteswap, TRUE, "ZFS ACL" }, 84da6c28aaSamw { byteswap_uint8_array, TRUE, "ZFS SYSACL" }, 85da6c28aaSamw { byteswap_uint8_array, TRUE, "FUID table" }, 86add89791Smarks { byteswap_uint64_array, TRUE, "FUID table size" }, 87fa9e4066Sahrens }; 88fa9e4066Sahrens 89fa9e4066Sahrens int 90ea8dc4b6Seschrock dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset, 91ea8dc4b6Seschrock void *tag, dmu_buf_t **dbp) 92fa9e4066Sahrens { 93fa9e4066Sahrens dnode_t *dn; 94fa9e4066Sahrens uint64_t blkid; 95fa9e4066Sahrens dmu_buf_impl_t *db; 96ea8dc4b6Seschrock int err; 97fa9e4066Sahrens 98ea8dc4b6Seschrock err = dnode_hold(os->os, object, FTAG, &dn); 99ea8dc4b6Seschrock if (err) 100ea8dc4b6Seschrock return (err); 101fa9e4066Sahrens blkid = dbuf_whichblock(dn, offset); 
102fa9e4066Sahrens rw_enter(&dn->dn_struct_rwlock, RW_READER); 103ea8dc4b6Seschrock db = dbuf_hold(dn, blkid, tag); 104fa9e4066Sahrens rw_exit(&dn->dn_struct_rwlock); 105ea8dc4b6Seschrock if (db == NULL) { 106ea8dc4b6Seschrock err = EIO; 107ea8dc4b6Seschrock } else { 108ea8dc4b6Seschrock err = dbuf_read(db, NULL, DB_RF_CANFAIL); 109ea8dc4b6Seschrock if (err) { 110ea8dc4b6Seschrock dbuf_rele(db, tag); 111ea8dc4b6Seschrock db = NULL; 112ea8dc4b6Seschrock } 113fa9e4066Sahrens } 114fa9e4066Sahrens 115ea8dc4b6Seschrock dnode_rele(dn, FTAG); 116ea8dc4b6Seschrock *dbp = &db->db; 117ea8dc4b6Seschrock return (err); 118fa9e4066Sahrens } 119fa9e4066Sahrens 120fa9e4066Sahrens int 121fa9e4066Sahrens dmu_bonus_max(void) 122fa9e4066Sahrens { 123fa9e4066Sahrens return (DN_MAX_BONUSLEN); 124fa9e4066Sahrens } 125fa9e4066Sahrens 1261934e92fSmaybee int 1271934e92fSmaybee dmu_set_bonus(dmu_buf_t *db, int newsize, dmu_tx_t *tx) 1281934e92fSmaybee { 1291934e92fSmaybee dnode_t *dn = ((dmu_buf_impl_t *)db)->db_dnode; 1301934e92fSmaybee 1311934e92fSmaybee if (dn->dn_bonus != (dmu_buf_impl_t *)db) 1321934e92fSmaybee return (EINVAL); 1331934e92fSmaybee if (newsize < 0 || newsize > db->db_size) 1341934e92fSmaybee return (EINVAL); 1351934e92fSmaybee dnode_setbonuslen(dn, newsize, tx); 1361934e92fSmaybee return (0); 1371934e92fSmaybee } 1381934e92fSmaybee 139fa9e4066Sahrens /* 140ea8dc4b6Seschrock * returns ENOENT, EIO, or 0. 
141fa9e4066Sahrens */ 142ea8dc4b6Seschrock int 143ea8dc4b6Seschrock dmu_bonus_hold(objset_t *os, uint64_t object, void *tag, dmu_buf_t **dbp) 144fa9e4066Sahrens { 145ea8dc4b6Seschrock dnode_t *dn; 146fa9e4066Sahrens dmu_buf_impl_t *db; 1471934e92fSmaybee int error; 148fa9e4066Sahrens 1491934e92fSmaybee error = dnode_hold(os->os, object, FTAG, &dn); 1501934e92fSmaybee if (error) 1511934e92fSmaybee return (error); 152fa9e4066Sahrens 153fa9e4066Sahrens rw_enter(&dn->dn_struct_rwlock, RW_READER); 154ea8dc4b6Seschrock if (dn->dn_bonus == NULL) { 155fa9e4066Sahrens rw_exit(&dn->dn_struct_rwlock); 156ea8dc4b6Seschrock rw_enter(&dn->dn_struct_rwlock, RW_WRITER); 157ea8dc4b6Seschrock if (dn->dn_bonus == NULL) 1581934e92fSmaybee dbuf_create_bonus(dn); 159fa9e4066Sahrens } 160ea8dc4b6Seschrock db = dn->dn_bonus; 161ea8dc4b6Seschrock rw_exit(&dn->dn_struct_rwlock); 1621934e92fSmaybee 1631934e92fSmaybee /* as long as the bonus buf is held, the dnode will be held */ 1641934e92fSmaybee if (refcount_add(&db->db_holds, tag) == 1) 1651934e92fSmaybee VERIFY(dnode_add_ref(dn, db)); 1661934e92fSmaybee 167fa9e4066Sahrens dnode_rele(dn, FTAG); 168ea8dc4b6Seschrock 169ea8dc4b6Seschrock VERIFY(0 == dbuf_read(db, NULL, DB_RF_MUST_SUCCEED)); 170ea8dc4b6Seschrock 171ea8dc4b6Seschrock *dbp = &db->db; 172ea8dc4b6Seschrock return (0); 173fa9e4066Sahrens } 174fa9e4066Sahrens 17513506d1eSmaybee /* 17613506d1eSmaybee * Note: longer-term, we should modify all of the dmu_buf_*() interfaces 17713506d1eSmaybee * to take a held dnode rather than <os, object> -- the lookup is wasteful, 17813506d1eSmaybee * and can induce severe lock contention when writing to several files 17913506d1eSmaybee * whose dnodes are in the same block. 
18013506d1eSmaybee */ 18113506d1eSmaybee static int 18213506d1eSmaybee dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, 183ea8dc4b6Seschrock uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp) 184fa9e4066Sahrens { 185fa9e4066Sahrens dmu_buf_t **dbp; 186fa9e4066Sahrens uint64_t blkid, nblks, i; 187ea8dc4b6Seschrock uint32_t flags; 188ea8dc4b6Seschrock int err; 189ea8dc4b6Seschrock zio_t *zio; 190ea8dc4b6Seschrock 191ea8dc4b6Seschrock ASSERT(length <= DMU_MAX_ACCESS); 192fa9e4066Sahrens 193ea8dc4b6Seschrock flags = DB_RF_CANFAIL | DB_RF_NEVERWAIT; 194e1930233Sbonwick if (length > zfetch_array_rd_sz) 195ea8dc4b6Seschrock flags |= DB_RF_NOPREFETCH; 196ea8dc4b6Seschrock 197fa9e4066Sahrens rw_enter(&dn->dn_struct_rwlock, RW_READER); 198fa9e4066Sahrens if (dn->dn_datablkshift) { 199fa9e4066Sahrens int blkshift = dn->dn_datablkshift; 200fa9e4066Sahrens nblks = (P2ROUNDUP(offset+length, 1ULL<<blkshift) - 201fa9e4066Sahrens P2ALIGN(offset, 1ULL<<blkshift)) >> blkshift; 202fa9e4066Sahrens } else { 2030125049cSahrens if (offset + length > dn->dn_datablksz) { 2040125049cSahrens zfs_panic_recover("zfs: accessing past end of object " 2050125049cSahrens "%llx/%llx (size=%u access=%llu+%llu)", 2060125049cSahrens (longlong_t)dn->dn_objset-> 2070125049cSahrens os_dsl_dataset->ds_object, 2080125049cSahrens (longlong_t)dn->dn_object, dn->dn_datablksz, 2090125049cSahrens (longlong_t)offset, (longlong_t)length); 2100125049cSahrens return (EIO); 2110125049cSahrens } 212fa9e4066Sahrens nblks = 1; 213fa9e4066Sahrens } 214ea8dc4b6Seschrock dbp = kmem_zalloc(sizeof (dmu_buf_t *) * nblks, KM_SLEEP); 215fa9e4066Sahrens 216ea8dc4b6Seschrock zio = zio_root(dn->dn_objset->os_spa, NULL, NULL, TRUE); 217fa9e4066Sahrens blkid = dbuf_whichblock(dn, offset); 218fa9e4066Sahrens for (i = 0; i < nblks; i++) { 219ea8dc4b6Seschrock dmu_buf_impl_t *db = dbuf_hold(dn, blkid+i, tag); 220ea8dc4b6Seschrock if (db == NULL) { 221ea8dc4b6Seschrock rw_exit(&dn->dn_struct_rwlock); 
222ea8dc4b6Seschrock dmu_buf_rele_array(dbp, nblks, tag); 223ea8dc4b6Seschrock zio_nowait(zio); 224ea8dc4b6Seschrock return (EIO); 225ea8dc4b6Seschrock } 226ea8dc4b6Seschrock /* initiate async i/o */ 22713506d1eSmaybee if (read) { 228ea8dc4b6Seschrock rw_exit(&dn->dn_struct_rwlock); 229ea8dc4b6Seschrock (void) dbuf_read(db, zio, flags); 230ea8dc4b6Seschrock rw_enter(&dn->dn_struct_rwlock, RW_READER); 231ea8dc4b6Seschrock } 232ea8dc4b6Seschrock dbp[i] = &db->db; 233fa9e4066Sahrens } 234fa9e4066Sahrens rw_exit(&dn->dn_struct_rwlock); 235fa9e4066Sahrens 236ea8dc4b6Seschrock /* wait for async i/o */ 237ea8dc4b6Seschrock err = zio_wait(zio); 238ea8dc4b6Seschrock if (err) { 239ea8dc4b6Seschrock dmu_buf_rele_array(dbp, nblks, tag); 240ea8dc4b6Seschrock return (err); 241ea8dc4b6Seschrock } 242ea8dc4b6Seschrock 243ea8dc4b6Seschrock /* wait for other io to complete */ 244ea8dc4b6Seschrock if (read) { 245ea8dc4b6Seschrock for (i = 0; i < nblks; i++) { 246ea8dc4b6Seschrock dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbp[i]; 247ea8dc4b6Seschrock mutex_enter(&db->db_mtx); 248ea8dc4b6Seschrock while (db->db_state == DB_READ || 249ea8dc4b6Seschrock db->db_state == DB_FILL) 250ea8dc4b6Seschrock cv_wait(&db->db_changed, &db->db_mtx); 251ea8dc4b6Seschrock if (db->db_state == DB_UNCACHED) 252ea8dc4b6Seschrock err = EIO; 253ea8dc4b6Seschrock mutex_exit(&db->db_mtx); 254ea8dc4b6Seschrock if (err) { 255ea8dc4b6Seschrock dmu_buf_rele_array(dbp, nblks, tag); 256ea8dc4b6Seschrock return (err); 257ea8dc4b6Seschrock } 258ea8dc4b6Seschrock } 259ea8dc4b6Seschrock } 260ea8dc4b6Seschrock 261ea8dc4b6Seschrock *numbufsp = nblks; 262ea8dc4b6Seschrock *dbpp = dbp; 263ea8dc4b6Seschrock return (0); 264fa9e4066Sahrens } 265fa9e4066Sahrens 266a2eea2e1Sahrens static int 26713506d1eSmaybee dmu_buf_hold_array(objset_t *os, uint64_t object, uint64_t offset, 26813506d1eSmaybee uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp) 26913506d1eSmaybee { 27013506d1eSmaybee dnode_t *dn; 
27113506d1eSmaybee int err; 27213506d1eSmaybee 27313506d1eSmaybee err = dnode_hold(os->os, object, FTAG, &dn); 27413506d1eSmaybee if (err) 27513506d1eSmaybee return (err); 27613506d1eSmaybee 27713506d1eSmaybee err = dmu_buf_hold_array_by_dnode(dn, offset, length, read, tag, 27813506d1eSmaybee numbufsp, dbpp); 27913506d1eSmaybee 28013506d1eSmaybee dnode_rele(dn, FTAG); 28113506d1eSmaybee 28213506d1eSmaybee return (err); 28313506d1eSmaybee } 28413506d1eSmaybee 28513506d1eSmaybee int 28613506d1eSmaybee dmu_buf_hold_array_by_bonus(dmu_buf_t *db, uint64_t offset, 28713506d1eSmaybee uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp) 28813506d1eSmaybee { 28913506d1eSmaybee dnode_t *dn = ((dmu_buf_impl_t *)db)->db_dnode; 29013506d1eSmaybee int err; 29113506d1eSmaybee 29213506d1eSmaybee err = dmu_buf_hold_array_by_dnode(dn, offset, length, read, tag, 29313506d1eSmaybee numbufsp, dbpp); 29413506d1eSmaybee 29513506d1eSmaybee return (err); 29613506d1eSmaybee } 29713506d1eSmaybee 298fa9e4066Sahrens void 299ea8dc4b6Seschrock dmu_buf_rele_array(dmu_buf_t **dbp_fake, int numbufs, void *tag) 300fa9e4066Sahrens { 301fa9e4066Sahrens int i; 302fa9e4066Sahrens dmu_buf_impl_t **dbp = (dmu_buf_impl_t **)dbp_fake; 303fa9e4066Sahrens 304fa9e4066Sahrens if (numbufs == 0) 305fa9e4066Sahrens return; 306fa9e4066Sahrens 307ea8dc4b6Seschrock for (i = 0; i < numbufs; i++) { 308ea8dc4b6Seschrock if (dbp[i]) 309ea8dc4b6Seschrock dbuf_rele(dbp[i], tag); 310ea8dc4b6Seschrock } 311fa9e4066Sahrens 312fa9e4066Sahrens kmem_free(dbp, sizeof (dmu_buf_t *) * numbufs); 313fa9e4066Sahrens } 314fa9e4066Sahrens 315fa9e4066Sahrens void 316fa9e4066Sahrens dmu_prefetch(objset_t *os, uint64_t object, uint64_t offset, uint64_t len) 317fa9e4066Sahrens { 318fa9e4066Sahrens dnode_t *dn; 319fa9e4066Sahrens uint64_t blkid; 320ea8dc4b6Seschrock int nblks, i, err; 321fa9e4066Sahrens 322416e0cd8Sek110237 if (zfs_prefetch_disable) 323416e0cd8Sek110237 return; 324416e0cd8Sek110237 325fa9e4066Sahrens if 
(len == 0) { /* they're interested in the bonus buffer */ 326fa9e4066Sahrens dn = os->os->os_meta_dnode; 327fa9e4066Sahrens 328fa9e4066Sahrens if (object == 0 || object >= DN_MAX_OBJECT) 329fa9e4066Sahrens return; 330fa9e4066Sahrens 331fa9e4066Sahrens rw_enter(&dn->dn_struct_rwlock, RW_READER); 332fa9e4066Sahrens blkid = dbuf_whichblock(dn, object * sizeof (dnode_phys_t)); 333fa9e4066Sahrens dbuf_prefetch(dn, blkid); 334fa9e4066Sahrens rw_exit(&dn->dn_struct_rwlock); 335fa9e4066Sahrens return; 336fa9e4066Sahrens } 337fa9e4066Sahrens 338fa9e4066Sahrens /* 339fa9e4066Sahrens * XXX - Note, if the dnode for the requested object is not 340fa9e4066Sahrens * already cached, we will do a *synchronous* read in the 341fa9e4066Sahrens * dnode_hold() call. The same is true for any indirects. 342fa9e4066Sahrens */ 343ea8dc4b6Seschrock err = dnode_hold(os->os, object, FTAG, &dn); 344ea8dc4b6Seschrock if (err != 0) 345fa9e4066Sahrens return; 346fa9e4066Sahrens 347fa9e4066Sahrens rw_enter(&dn->dn_struct_rwlock, RW_READER); 348fa9e4066Sahrens if (dn->dn_datablkshift) { 349fa9e4066Sahrens int blkshift = dn->dn_datablkshift; 350fa9e4066Sahrens nblks = (P2ROUNDUP(offset+len, 1<<blkshift) - 351fa9e4066Sahrens P2ALIGN(offset, 1<<blkshift)) >> blkshift; 352fa9e4066Sahrens } else { 353fa9e4066Sahrens nblks = (offset < dn->dn_datablksz); 354fa9e4066Sahrens } 355fa9e4066Sahrens 356fa9e4066Sahrens if (nblks != 0) { 357fa9e4066Sahrens blkid = dbuf_whichblock(dn, offset); 358fa9e4066Sahrens for (i = 0; i < nblks; i++) 359fa9e4066Sahrens dbuf_prefetch(dn, blkid+i); 360fa9e4066Sahrens } 361fa9e4066Sahrens 362fa9e4066Sahrens rw_exit(&dn->dn_struct_rwlock); 363fa9e4066Sahrens 364fa9e4066Sahrens dnode_rele(dn, FTAG); 365fa9e4066Sahrens } 366fa9e4066Sahrens 367*cdb0ab79Smaybee static int 368*cdb0ab79Smaybee get_next_chunk(dnode_t *dn, uint64_t *offset, uint64_t limit) 369*cdb0ab79Smaybee { 370*cdb0ab79Smaybee uint64_t len = limit - *offset; 371*cdb0ab79Smaybee uint64_t chunk_len = dn->dn_datablksz * 
DMU_MAX_DELETEBLKCNT; 372*cdb0ab79Smaybee uint64_t dn_used; 373*cdb0ab79Smaybee int err; 374*cdb0ab79Smaybee 375*cdb0ab79Smaybee ASSERT(limit <= *offset); 376*cdb0ab79Smaybee 377*cdb0ab79Smaybee dn_used = dn->dn_phys->dn_used << 378*cdb0ab79Smaybee (dn->dn_phys->dn_flags & DNODE_FLAG_USED_BYTES ? 0 : DEV_BSHIFT); 379*cdb0ab79Smaybee if (len <= chunk_len || dn_used <= chunk_len) { 380*cdb0ab79Smaybee *offset = limit; 381*cdb0ab79Smaybee return (0); 382*cdb0ab79Smaybee } 383*cdb0ab79Smaybee 384*cdb0ab79Smaybee while (*offset > limit) { 385*cdb0ab79Smaybee uint64_t initial_offset = *offset; 386*cdb0ab79Smaybee uint64_t delta; 387*cdb0ab79Smaybee 388*cdb0ab79Smaybee /* skip over allocated data */ 389*cdb0ab79Smaybee err = dnode_next_offset(dn, 390*cdb0ab79Smaybee DNODE_FIND_HOLE|DNODE_FIND_BACKWARDS, offset, 1, 1, 0); 391*cdb0ab79Smaybee if (err == ESRCH) 392*cdb0ab79Smaybee *offset = limit; 393*cdb0ab79Smaybee else if (err) 394*cdb0ab79Smaybee return (err); 395*cdb0ab79Smaybee 396*cdb0ab79Smaybee ASSERT3U(*offset, <=, initial_offset); 397*cdb0ab79Smaybee delta = initial_offset - *offset; 398*cdb0ab79Smaybee if (delta >= chunk_len) { 399*cdb0ab79Smaybee *offset += delta - chunk_len; 400*cdb0ab79Smaybee return (0); 401*cdb0ab79Smaybee } 402*cdb0ab79Smaybee chunk_len -= delta; 403*cdb0ab79Smaybee 404*cdb0ab79Smaybee /* skip over unallocated data */ 405*cdb0ab79Smaybee err = dnode_next_offset(dn, 406*cdb0ab79Smaybee DNODE_FIND_BACKWARDS, offset, 1, 1, 0); 407*cdb0ab79Smaybee if (err == ESRCH) 408*cdb0ab79Smaybee *offset = limit; 409*cdb0ab79Smaybee else if (err) 410*cdb0ab79Smaybee return (err); 411*cdb0ab79Smaybee 412*cdb0ab79Smaybee if (*offset < limit) 413*cdb0ab79Smaybee *offset = limit; 414*cdb0ab79Smaybee ASSERT3U(*offset, <, initial_offset); 415*cdb0ab79Smaybee } 416*cdb0ab79Smaybee return (0); 417*cdb0ab79Smaybee } 418*cdb0ab79Smaybee 419*cdb0ab79Smaybee static int 420*cdb0ab79Smaybee dmu_free_long_range_impl(objset_t *os, dnode_t *dn, uint64_t offset, 
421*cdb0ab79Smaybee uint64_t length, boolean_t free_dnode) 422*cdb0ab79Smaybee { 423*cdb0ab79Smaybee dmu_tx_t *tx; 424*cdb0ab79Smaybee uint64_t object_size, start, end, len; 425*cdb0ab79Smaybee boolean_t trunc = (length == DMU_OBJECT_END); 426*cdb0ab79Smaybee int align, err; 427*cdb0ab79Smaybee 428*cdb0ab79Smaybee align = 1 << dn->dn_datablkshift; 429*cdb0ab79Smaybee ASSERT(align > 0); 430*cdb0ab79Smaybee object_size = align == 1 ? dn->dn_datablksz : 431*cdb0ab79Smaybee (dn->dn_maxblkid + 1) << dn->dn_datablkshift; 432*cdb0ab79Smaybee 433*cdb0ab79Smaybee if (trunc || (end = offset + length) > object_size) 434*cdb0ab79Smaybee end = object_size; 435*cdb0ab79Smaybee if (end <= offset) 436*cdb0ab79Smaybee return (0); 437*cdb0ab79Smaybee length = end - offset; 438*cdb0ab79Smaybee 439*cdb0ab79Smaybee while (length) { 440*cdb0ab79Smaybee start = end; 441*cdb0ab79Smaybee err = get_next_chunk(dn, &start, offset); 442*cdb0ab79Smaybee if (err) 443*cdb0ab79Smaybee return (err); 444*cdb0ab79Smaybee len = trunc ? DMU_OBJECT_END : end - start; 445*cdb0ab79Smaybee 446*cdb0ab79Smaybee tx = dmu_tx_create(os); 447*cdb0ab79Smaybee dmu_tx_hold_free(tx, dn->dn_object, start, len); 448*cdb0ab79Smaybee err = dmu_tx_assign(tx, TXG_WAIT); 449*cdb0ab79Smaybee if (err) { 450*cdb0ab79Smaybee dmu_tx_abort(tx); 451*cdb0ab79Smaybee return (err); 452*cdb0ab79Smaybee } 453*cdb0ab79Smaybee 454*cdb0ab79Smaybee dnode_free_range(dn, start, trunc ? 
-1 : len, tx); 455*cdb0ab79Smaybee 456*cdb0ab79Smaybee if (start == 0 && trunc && free_dnode) 457*cdb0ab79Smaybee dnode_free(dn, tx); 458*cdb0ab79Smaybee 459*cdb0ab79Smaybee length -= end - start; 460*cdb0ab79Smaybee 461*cdb0ab79Smaybee dmu_tx_commit(tx); 462*cdb0ab79Smaybee end = start; 463*cdb0ab79Smaybee trunc = FALSE; 464*cdb0ab79Smaybee } 465*cdb0ab79Smaybee return (0); 466*cdb0ab79Smaybee } 467*cdb0ab79Smaybee 468*cdb0ab79Smaybee int 469*cdb0ab79Smaybee dmu_free_long_range(objset_t *os, uint64_t object, 470*cdb0ab79Smaybee uint64_t offset, uint64_t length) 471*cdb0ab79Smaybee { 472*cdb0ab79Smaybee dnode_t *dn; 473*cdb0ab79Smaybee int err; 474*cdb0ab79Smaybee 475*cdb0ab79Smaybee err = dnode_hold(os->os, object, FTAG, &dn); 476*cdb0ab79Smaybee if (err != 0) 477*cdb0ab79Smaybee return (err); 478*cdb0ab79Smaybee err = dmu_free_long_range_impl(os, dn, offset, length, FALSE); 479*cdb0ab79Smaybee dnode_rele(dn, FTAG); 480*cdb0ab79Smaybee return (err); 481*cdb0ab79Smaybee } 482*cdb0ab79Smaybee 483*cdb0ab79Smaybee int 484*cdb0ab79Smaybee dmu_free_object(objset_t *os, uint64_t object) 485*cdb0ab79Smaybee { 486*cdb0ab79Smaybee dnode_t *dn; 487*cdb0ab79Smaybee dmu_tx_t *tx; 488*cdb0ab79Smaybee int err; 489*cdb0ab79Smaybee 490*cdb0ab79Smaybee err = dnode_hold_impl(os->os, object, DNODE_MUST_BE_ALLOCATED, 491*cdb0ab79Smaybee FTAG, &dn); 492*cdb0ab79Smaybee if (err != 0) 493*cdb0ab79Smaybee return (err); 494*cdb0ab79Smaybee if (dn->dn_nlevels == 1) { 495*cdb0ab79Smaybee tx = dmu_tx_create(os); 496*cdb0ab79Smaybee dmu_tx_hold_bonus(tx, object); 497*cdb0ab79Smaybee dmu_tx_hold_free(tx, dn->dn_object, 0, DMU_OBJECT_END); 498*cdb0ab79Smaybee err = dmu_tx_assign(tx, TXG_WAIT); 499*cdb0ab79Smaybee if (err == 0) { 500*cdb0ab79Smaybee dnode_free_range(dn, 0, DMU_OBJECT_END, tx); 501*cdb0ab79Smaybee dnode_free(dn, tx); 502*cdb0ab79Smaybee dmu_tx_commit(tx); 503*cdb0ab79Smaybee } else { 504*cdb0ab79Smaybee dmu_tx_abort(tx); 505*cdb0ab79Smaybee } 506*cdb0ab79Smaybee } else { 
507*cdb0ab79Smaybee err = dmu_free_long_range_impl(os, dn, 0, DMU_OBJECT_END, TRUE); 508*cdb0ab79Smaybee } 509*cdb0ab79Smaybee dnode_rele(dn, FTAG); 510*cdb0ab79Smaybee return (err); 511*cdb0ab79Smaybee } 512*cdb0ab79Smaybee 513ea8dc4b6Seschrock int 514fa9e4066Sahrens dmu_free_range(objset_t *os, uint64_t object, uint64_t offset, 515fa9e4066Sahrens uint64_t size, dmu_tx_t *tx) 516fa9e4066Sahrens { 517ea8dc4b6Seschrock dnode_t *dn; 518ea8dc4b6Seschrock int err = dnode_hold(os->os, object, FTAG, &dn); 519ea8dc4b6Seschrock if (err) 520ea8dc4b6Seschrock return (err); 521fa9e4066Sahrens ASSERT(offset < UINT64_MAX); 522fa9e4066Sahrens ASSERT(size == -1ULL || size <= UINT64_MAX - offset); 523fa9e4066Sahrens dnode_free_range(dn, offset, size, tx); 524fa9e4066Sahrens dnode_rele(dn, FTAG); 525ea8dc4b6Seschrock return (0); 526fa9e4066Sahrens } 527fa9e4066Sahrens 528ea8dc4b6Seschrock int 529ea8dc4b6Seschrock dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, 530ea8dc4b6Seschrock void *buf) 531fa9e4066Sahrens { 532fa9e4066Sahrens dnode_t *dn; 533fa9e4066Sahrens dmu_buf_t **dbp; 534ea8dc4b6Seschrock int numbufs, i, err; 535fa9e4066Sahrens 536ea8dc4b6Seschrock err = dnode_hold(os->os, object, FTAG, &dn); 537ea8dc4b6Seschrock if (err) 538ea8dc4b6Seschrock return (err); 539feb08c6bSbillm 540feb08c6bSbillm /* 541feb08c6bSbillm * Deal with odd block sizes, where there can't be data past the first 542feb08c6bSbillm * block. If we ever do the tail block optimization, we will need to 543feb08c6bSbillm * handle that here as well. 544feb08c6bSbillm */ 545fa9e4066Sahrens if (dn->dn_datablkshift == 0) { 546fa9e4066Sahrens int newsz = offset > dn->dn_datablksz ? 
0 : 547fa9e4066Sahrens MIN(size, dn->dn_datablksz - offset); 548fa9e4066Sahrens bzero((char *)buf + newsz, size - newsz); 549fa9e4066Sahrens size = newsz; 550fa9e4066Sahrens } 551fa9e4066Sahrens 552fa9e4066Sahrens while (size > 0) { 553fa9e4066Sahrens uint64_t mylen = MIN(size, DMU_MAX_ACCESS / 2); 554fa9e4066Sahrens 555fa9e4066Sahrens /* 556fa9e4066Sahrens * NB: we could do this block-at-a-time, but it's nice 557fa9e4066Sahrens * to be reading in parallel. 558fa9e4066Sahrens */ 559a2eea2e1Sahrens err = dmu_buf_hold_array_by_dnode(dn, offset, mylen, 560ea8dc4b6Seschrock TRUE, FTAG, &numbufs, &dbp); 561ea8dc4b6Seschrock if (err) 5621934e92fSmaybee break; 563fa9e4066Sahrens 564fa9e4066Sahrens for (i = 0; i < numbufs; i++) { 565fa9e4066Sahrens int tocpy; 566fa9e4066Sahrens int bufoff; 567fa9e4066Sahrens dmu_buf_t *db = dbp[i]; 568fa9e4066Sahrens 569fa9e4066Sahrens ASSERT(size > 0); 570fa9e4066Sahrens 571fa9e4066Sahrens bufoff = offset - db->db_offset; 572fa9e4066Sahrens tocpy = (int)MIN(db->db_size - bufoff, size); 573fa9e4066Sahrens 574fa9e4066Sahrens bcopy((char *)db->db_data + bufoff, buf, tocpy); 575fa9e4066Sahrens 576fa9e4066Sahrens offset += tocpy; 577fa9e4066Sahrens size -= tocpy; 578fa9e4066Sahrens buf = (char *)buf + tocpy; 579fa9e4066Sahrens } 580ea8dc4b6Seschrock dmu_buf_rele_array(dbp, numbufs, FTAG); 581fa9e4066Sahrens } 582a2eea2e1Sahrens dnode_rele(dn, FTAG); 5831934e92fSmaybee return (err); 584fa9e4066Sahrens } 585fa9e4066Sahrens 586fa9e4066Sahrens void 587fa9e4066Sahrens dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, 588fa9e4066Sahrens const void *buf, dmu_tx_t *tx) 589fa9e4066Sahrens { 590fa9e4066Sahrens dmu_buf_t **dbp; 591fa9e4066Sahrens int numbufs, i; 592fa9e4066Sahrens 59313506d1eSmaybee if (size == 0) 59413506d1eSmaybee return; 59513506d1eSmaybee 596ea8dc4b6Seschrock VERIFY(0 == dmu_buf_hold_array(os, object, offset, size, 597ea8dc4b6Seschrock FALSE, FTAG, &numbufs, &dbp)); 598fa9e4066Sahrens 599fa9e4066Sahrens for (i 
= 0; i < numbufs; i++) { 600fa9e4066Sahrens int tocpy; 601fa9e4066Sahrens int bufoff; 602fa9e4066Sahrens dmu_buf_t *db = dbp[i]; 603fa9e4066Sahrens 604fa9e4066Sahrens ASSERT(size > 0); 605fa9e4066Sahrens 606fa9e4066Sahrens bufoff = offset - db->db_offset; 607fa9e4066Sahrens tocpy = (int)MIN(db->db_size - bufoff, size); 608fa9e4066Sahrens 609fa9e4066Sahrens ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size); 610fa9e4066Sahrens 611fa9e4066Sahrens if (tocpy == db->db_size) 612fa9e4066Sahrens dmu_buf_will_fill(db, tx); 613fa9e4066Sahrens else 614fa9e4066Sahrens dmu_buf_will_dirty(db, tx); 615fa9e4066Sahrens 616fa9e4066Sahrens bcopy(buf, (char *)db->db_data + bufoff, tocpy); 617fa9e4066Sahrens 618fa9e4066Sahrens if (tocpy == db->db_size) 619fa9e4066Sahrens dmu_buf_fill_done(db, tx); 620fa9e4066Sahrens 621fa9e4066Sahrens offset += tocpy; 622fa9e4066Sahrens size -= tocpy; 623fa9e4066Sahrens buf = (char *)buf + tocpy; 624fa9e4066Sahrens } 625ea8dc4b6Seschrock dmu_buf_rele_array(dbp, numbufs, FTAG); 626fa9e4066Sahrens } 627fa9e4066Sahrens 628fa9e4066Sahrens #ifdef _KERNEL 629fa9e4066Sahrens int 630feb08c6bSbillm dmu_read_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size) 631feb08c6bSbillm { 632feb08c6bSbillm dmu_buf_t **dbp; 633feb08c6bSbillm int numbufs, i, err; 634feb08c6bSbillm 635feb08c6bSbillm /* 636feb08c6bSbillm * NB: we could do this block-at-a-time, but it's nice 637feb08c6bSbillm * to be reading in parallel. 
638feb08c6bSbillm */ 639feb08c6bSbillm err = dmu_buf_hold_array(os, object, uio->uio_loffset, size, TRUE, FTAG, 640feb08c6bSbillm &numbufs, &dbp); 641feb08c6bSbillm if (err) 642feb08c6bSbillm return (err); 643feb08c6bSbillm 644feb08c6bSbillm for (i = 0; i < numbufs; i++) { 645feb08c6bSbillm int tocpy; 646feb08c6bSbillm int bufoff; 647feb08c6bSbillm dmu_buf_t *db = dbp[i]; 648feb08c6bSbillm 649feb08c6bSbillm ASSERT(size > 0); 650feb08c6bSbillm 651feb08c6bSbillm bufoff = uio->uio_loffset - db->db_offset; 652feb08c6bSbillm tocpy = (int)MIN(db->db_size - bufoff, size); 653feb08c6bSbillm 654feb08c6bSbillm err = uiomove((char *)db->db_data + bufoff, tocpy, 655feb08c6bSbillm UIO_READ, uio); 656feb08c6bSbillm if (err) 657feb08c6bSbillm break; 658feb08c6bSbillm 659feb08c6bSbillm size -= tocpy; 660feb08c6bSbillm } 661feb08c6bSbillm dmu_buf_rele_array(dbp, numbufs, FTAG); 662feb08c6bSbillm 663feb08c6bSbillm return (err); 664feb08c6bSbillm } 665feb08c6bSbillm 666feb08c6bSbillm int 667feb08c6bSbillm dmu_write_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size, 668feb08c6bSbillm dmu_tx_t *tx) 669fa9e4066Sahrens { 670fa9e4066Sahrens dmu_buf_t **dbp; 671fa9e4066Sahrens int numbufs, i; 672fa9e4066Sahrens int err = 0; 673fa9e4066Sahrens 67413506d1eSmaybee if (size == 0) 67513506d1eSmaybee return (0); 67613506d1eSmaybee 677feb08c6bSbillm err = dmu_buf_hold_array(os, object, uio->uio_loffset, size, 678ea8dc4b6Seschrock FALSE, FTAG, &numbufs, &dbp); 679ea8dc4b6Seschrock if (err) 680ea8dc4b6Seschrock return (err); 681fa9e4066Sahrens 682fa9e4066Sahrens for (i = 0; i < numbufs; i++) { 683fa9e4066Sahrens int tocpy; 684fa9e4066Sahrens int bufoff; 685fa9e4066Sahrens dmu_buf_t *db = dbp[i]; 686fa9e4066Sahrens 687fa9e4066Sahrens ASSERT(size > 0); 688fa9e4066Sahrens 689feb08c6bSbillm bufoff = uio->uio_loffset - db->db_offset; 690fa9e4066Sahrens tocpy = (int)MIN(db->db_size - bufoff, size); 691fa9e4066Sahrens 692fa9e4066Sahrens ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size); 
693fa9e4066Sahrens 694fa9e4066Sahrens if (tocpy == db->db_size) 695fa9e4066Sahrens dmu_buf_will_fill(db, tx); 696fa9e4066Sahrens else 697fa9e4066Sahrens dmu_buf_will_dirty(db, tx); 698fa9e4066Sahrens 699fa9e4066Sahrens /* 700fa9e4066Sahrens * XXX uiomove could block forever (eg. nfs-backed 701fa9e4066Sahrens * pages). There needs to be a uiolockdown() function 702fa9e4066Sahrens * to lock the pages in memory, so that uiomove won't 703fa9e4066Sahrens * block. 704fa9e4066Sahrens */ 705fa9e4066Sahrens err = uiomove((char *)db->db_data + bufoff, tocpy, 706fa9e4066Sahrens UIO_WRITE, uio); 707fa9e4066Sahrens 708fa9e4066Sahrens if (tocpy == db->db_size) 709fa9e4066Sahrens dmu_buf_fill_done(db, tx); 710fa9e4066Sahrens 711fa9e4066Sahrens if (err) 712fa9e4066Sahrens break; 713fa9e4066Sahrens 714fa9e4066Sahrens size -= tocpy; 715fa9e4066Sahrens } 716ea8dc4b6Seschrock dmu_buf_rele_array(dbp, numbufs, FTAG); 717fa9e4066Sahrens return (err); 718fa9e4066Sahrens } 71944eda4d7Smaybee 72044eda4d7Smaybee int 72144eda4d7Smaybee dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, 72244eda4d7Smaybee page_t *pp, dmu_tx_t *tx) 72344eda4d7Smaybee { 72444eda4d7Smaybee dmu_buf_t **dbp; 72544eda4d7Smaybee int numbufs, i; 72644eda4d7Smaybee int err; 72744eda4d7Smaybee 72844eda4d7Smaybee if (size == 0) 72944eda4d7Smaybee return (0); 73044eda4d7Smaybee 73144eda4d7Smaybee err = dmu_buf_hold_array(os, object, offset, size, 73244eda4d7Smaybee FALSE, FTAG, &numbufs, &dbp); 73344eda4d7Smaybee if (err) 73444eda4d7Smaybee return (err); 73544eda4d7Smaybee 73644eda4d7Smaybee for (i = 0; i < numbufs; i++) { 73744eda4d7Smaybee int tocpy, copied, thiscpy; 73844eda4d7Smaybee int bufoff; 73944eda4d7Smaybee dmu_buf_t *db = dbp[i]; 74044eda4d7Smaybee caddr_t va; 74144eda4d7Smaybee 74244eda4d7Smaybee ASSERT(size > 0); 74344eda4d7Smaybee ASSERT3U(db->db_size, >=, PAGESIZE); 74444eda4d7Smaybee 74544eda4d7Smaybee bufoff = offset - db->db_offset; 74644eda4d7Smaybee tocpy = 
(int)MIN(db->db_size - bufoff, size); 74744eda4d7Smaybee 74844eda4d7Smaybee ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size); 74944eda4d7Smaybee 75044eda4d7Smaybee if (tocpy == db->db_size) 75144eda4d7Smaybee dmu_buf_will_fill(db, tx); 75244eda4d7Smaybee else 75344eda4d7Smaybee dmu_buf_will_dirty(db, tx); 75444eda4d7Smaybee 75544eda4d7Smaybee for (copied = 0; copied < tocpy; copied += PAGESIZE) { 75644eda4d7Smaybee ASSERT3U(pp->p_offset, ==, db->db_offset + bufoff); 75744eda4d7Smaybee thiscpy = MIN(PAGESIZE, tocpy - copied); 75844eda4d7Smaybee va = ppmapin(pp, PROT_READ, (caddr_t)-1); 75944eda4d7Smaybee bcopy(va, (char *)db->db_data + bufoff, thiscpy); 76044eda4d7Smaybee ppmapout(va); 76144eda4d7Smaybee pp = pp->p_next; 76244eda4d7Smaybee bufoff += PAGESIZE; 76344eda4d7Smaybee } 76444eda4d7Smaybee 76544eda4d7Smaybee if (tocpy == db->db_size) 76644eda4d7Smaybee dmu_buf_fill_done(db, tx); 76744eda4d7Smaybee 76844eda4d7Smaybee if (err) 76944eda4d7Smaybee break; 77044eda4d7Smaybee 77144eda4d7Smaybee offset += tocpy; 77244eda4d7Smaybee size -= tocpy; 77344eda4d7Smaybee } 77444eda4d7Smaybee dmu_buf_rele_array(dbp, numbufs, FTAG); 77544eda4d7Smaybee return (err); 77644eda4d7Smaybee } 777fa9e4066Sahrens #endif 778fa9e4066Sahrens 779c5c6ffa0Smaybee typedef struct { 780c717a561Smaybee dbuf_dirty_record_t *dr; 781c5c6ffa0Smaybee dmu_sync_cb_t *done; 782c5c6ffa0Smaybee void *arg; 783c717a561Smaybee } dmu_sync_arg_t; 784c5c6ffa0Smaybee 785c5c6ffa0Smaybee /* ARGSUSED */ 786c5c6ffa0Smaybee static void 787c5c6ffa0Smaybee dmu_sync_done(zio_t *zio, arc_buf_t *buf, void *varg) 788c5c6ffa0Smaybee { 789c717a561Smaybee dmu_sync_arg_t *in = varg; 790c717a561Smaybee dbuf_dirty_record_t *dr = in->dr; 791c717a561Smaybee dmu_buf_impl_t *db = dr->dr_dbuf; 792c5c6ffa0Smaybee dmu_sync_cb_t *done = in->done; 793c5c6ffa0Smaybee 794c5c6ffa0Smaybee if (!BP_IS_HOLE(zio->io_bp)) { 795c5c6ffa0Smaybee zio->io_bp->blk_fill = 1; 796c5c6ffa0Smaybee BP_SET_TYPE(zio->io_bp, db->db_dnode->dn_type); 
797c5c6ffa0Smaybee BP_SET_LEVEL(zio->io_bp, 0); 798c5c6ffa0Smaybee } 799c5c6ffa0Smaybee 800c5c6ffa0Smaybee mutex_enter(&db->db_mtx); 801c717a561Smaybee ASSERT(dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC); 802c717a561Smaybee dr->dt.dl.dr_overridden_by = *zio->io_bp; /* structure assignment */ 803c717a561Smaybee dr->dt.dl.dr_override_state = DR_OVERRIDDEN; 804c5c6ffa0Smaybee cv_broadcast(&db->db_changed); 805c5c6ffa0Smaybee mutex_exit(&db->db_mtx); 806c5c6ffa0Smaybee 807c5c6ffa0Smaybee if (done) 808c717a561Smaybee done(&(db->db), in->arg); 809c717a561Smaybee 810c717a561Smaybee kmem_free(in, sizeof (dmu_sync_arg_t)); 811c5c6ffa0Smaybee } 812c5c6ffa0Smaybee 813fa9e4066Sahrens /* 814c5c6ffa0Smaybee * Intent log support: sync the block associated with db to disk. 815c5c6ffa0Smaybee * N.B. and XXX: the caller is responsible for making sure that the 816c5c6ffa0Smaybee * data isn't changing while dmu_sync() is writing it. 817fa9e4066Sahrens * 818fa9e4066Sahrens * Return values: 819fa9e4066Sahrens * 820c5c6ffa0Smaybee * EEXIST: this txg has already been synced, so there's nothing to to. 821fa9e4066Sahrens * The caller should not log the write. 822fa9e4066Sahrens * 823fa9e4066Sahrens * ENOENT: the block was dbuf_free_range()'d, so there's nothing to do. 824fa9e4066Sahrens * The caller should not log the write. 825fa9e4066Sahrens * 826c5c6ffa0Smaybee * EALREADY: this block is already in the process of being synced. 827c5c6ffa0Smaybee * The caller should track its progress (somehow). 828fa9e4066Sahrens * 829c5c6ffa0Smaybee * EINPROGRESS: the IO has been initiated. 830c5c6ffa0Smaybee * The caller should log this blkptr in the callback. 831fa9e4066Sahrens * 832c5c6ffa0Smaybee * 0: completed. Sets *bp to the blkptr just written. 833c5c6ffa0Smaybee * The caller should log this blkptr immediately. 
834fa9e4066Sahrens */ 835fa9e4066Sahrens int 836c5c6ffa0Smaybee dmu_sync(zio_t *pio, dmu_buf_t *db_fake, 837c5c6ffa0Smaybee blkptr_t *bp, uint64_t txg, dmu_sync_cb_t *done, void *arg) 838fa9e4066Sahrens { 839c5c6ffa0Smaybee dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; 840c5c6ffa0Smaybee objset_impl_t *os = db->db_objset; 841c5c6ffa0Smaybee dsl_pool_t *dp = os->os_dsl_dataset->ds_dir->dd_pool; 842fa9e4066Sahrens tx_state_t *tx = &dp->dp_tx; 843c717a561Smaybee dbuf_dirty_record_t *dr; 844c717a561Smaybee dmu_sync_arg_t *in; 845ea8dc4b6Seschrock zbookmark_t zb; 846c717a561Smaybee zio_t *zio; 8479bc11082Sek110237 int zio_flags; 848c5c6ffa0Smaybee int err; 849fa9e4066Sahrens 850fa9e4066Sahrens ASSERT(BP_IS_HOLE(bp)); 851fa9e4066Sahrens ASSERT(txg != 0); 852fa9e4066Sahrens 853c5c6ffa0Smaybee 854fa9e4066Sahrens dprintf("dmu_sync txg=%llu, s,o,q %llu %llu %llu\n", 855fa9e4066Sahrens txg, tx->tx_synced_txg, tx->tx_open_txg, tx->tx_quiesced_txg); 856fa9e4066Sahrens 857fa9e4066Sahrens /* 858c5c6ffa0Smaybee * XXX - would be nice if we could do this without suspending... 859ea8dc4b6Seschrock */ 860c5c6ffa0Smaybee txg_suspend(dp); 861ea8dc4b6Seschrock 862ea8dc4b6Seschrock /* 863fa9e4066Sahrens * If this txg already synced, there's nothing to do. 864fa9e4066Sahrens */ 865fa9e4066Sahrens if (txg <= tx->tx_synced_txg) { 866c5c6ffa0Smaybee txg_resume(dp); 867fa9e4066Sahrens /* 868fa9e4066Sahrens * If we're running ziltest, we need the blkptr regardless. 
869fa9e4066Sahrens */ 870fa9e4066Sahrens if (txg > spa_freeze_txg(dp->dp_spa)) { 871fa9e4066Sahrens /* if db_blkptr == NULL, this was an empty write */ 872fa9e4066Sahrens if (db->db_blkptr) 873fa9e4066Sahrens *bp = *db->db_blkptr; /* structure assignment */ 874fa9e4066Sahrens return (0); 875fa9e4066Sahrens } 876c5c6ffa0Smaybee return (EEXIST); 877fa9e4066Sahrens } 878fa9e4066Sahrens 879fa9e4066Sahrens mutex_enter(&db->db_mtx); 880fa9e4066Sahrens 881c5c6ffa0Smaybee if (txg == tx->tx_syncing_txg) { 882c5c6ffa0Smaybee while (db->db_data_pending) { 883c5c6ffa0Smaybee /* 884c5c6ffa0Smaybee * IO is in-progress. Wait for it to finish. 885c5c6ffa0Smaybee * XXX - would be nice to be able to somehow "attach" 886c5c6ffa0Smaybee * this zio to the parent zio passed in. 887c5c6ffa0Smaybee */ 888c5c6ffa0Smaybee cv_wait(&db->db_changed, &db->db_mtx); 88913506d1eSmaybee if (!db->db_data_pending && 89013506d1eSmaybee db->db_blkptr && BP_IS_HOLE(db->db_blkptr)) { 89113506d1eSmaybee /* 89213506d1eSmaybee * IO was compressed away 89313506d1eSmaybee */ 89413506d1eSmaybee *bp = *db->db_blkptr; /* structure assignment */ 89513506d1eSmaybee mutex_exit(&db->db_mtx); 89613506d1eSmaybee txg_resume(dp); 89713506d1eSmaybee return (0); 89813506d1eSmaybee } 899c5c6ffa0Smaybee ASSERT(db->db_data_pending || 900c5c6ffa0Smaybee (db->db_blkptr && db->db_blkptr->blk_birth == txg)); 901c5c6ffa0Smaybee } 902c5c6ffa0Smaybee 903c5c6ffa0Smaybee if (db->db_blkptr && db->db_blkptr->blk_birth == txg) { 904c5c6ffa0Smaybee /* 905c5c6ffa0Smaybee * IO is already completed. 
906c5c6ffa0Smaybee */ 907c5c6ffa0Smaybee *bp = *db->db_blkptr; /* structure assignment */ 908c5c6ffa0Smaybee mutex_exit(&db->db_mtx); 909c5c6ffa0Smaybee txg_resume(dp); 910c5c6ffa0Smaybee return (0); 911c5c6ffa0Smaybee } 912c5c6ffa0Smaybee } 913c5c6ffa0Smaybee 914c717a561Smaybee dr = db->db_last_dirty; 915c717a561Smaybee while (dr && dr->dr_txg > txg) 916c717a561Smaybee dr = dr->dr_next; 917c717a561Smaybee if (dr == NULL || dr->dr_txg < txg) { 918c5c6ffa0Smaybee /* 919c5c6ffa0Smaybee * This dbuf isn't dirty, must have been free_range'd. 920fa9e4066Sahrens * There's no need to log writes to freed blocks, so we're done. 921fa9e4066Sahrens */ 922fa9e4066Sahrens mutex_exit(&db->db_mtx); 923c5c6ffa0Smaybee txg_resume(dp); 924fa9e4066Sahrens return (ENOENT); 925fa9e4066Sahrens } 926fa9e4066Sahrens 927c717a561Smaybee ASSERT(dr->dr_txg == txg); 928c717a561Smaybee if (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC) { 929c5c6ffa0Smaybee /* 930c717a561Smaybee * We have already issued a sync write for this buffer. 931c5c6ffa0Smaybee */ 932c717a561Smaybee mutex_exit(&db->db_mtx); 933c717a561Smaybee txg_resume(dp); 934c717a561Smaybee return (EALREADY); 935c717a561Smaybee } else if (dr->dt.dl.dr_override_state == DR_OVERRIDDEN) { 936c717a561Smaybee /* 937c717a561Smaybee * This buffer has already been synced. It could not 938c717a561Smaybee * have been dirtied since, or we would have cleared the state. 
939c717a561Smaybee */ 940c717a561Smaybee *bp = dr->dt.dl.dr_overridden_by; /* structure assignment */ 941c717a561Smaybee mutex_exit(&db->db_mtx); 942c717a561Smaybee txg_resume(dp); 943c717a561Smaybee return (0); 944c717a561Smaybee } 945c717a561Smaybee 946c717a561Smaybee dr->dt.dl.dr_override_state = DR_IN_DMU_SYNC; 947c717a561Smaybee in = kmem_alloc(sizeof (dmu_sync_arg_t), KM_SLEEP); 948c717a561Smaybee in->dr = dr; 949c5c6ffa0Smaybee in->done = done; 950c5c6ffa0Smaybee in->arg = arg; 951fa9e4066Sahrens mutex_exit(&db->db_mtx); 952c5c6ffa0Smaybee txg_resume(dp); 953fa9e4066Sahrens 954c5c6ffa0Smaybee zb.zb_objset = os->os_dsl_dataset->ds_object; 955ea8dc4b6Seschrock zb.zb_object = db->db.db_object; 956ea8dc4b6Seschrock zb.zb_level = db->db_level; 957ea8dc4b6Seschrock zb.zb_blkid = db->db_blkid; 9589bc11082Sek110237 zio_flags = ZIO_FLAG_MUSTSUCCEED; 9599bc11082Sek110237 if (dmu_ot[db->db_dnode->dn_type].ot_metadata || zb.zb_level != 0) 9609bc11082Sek110237 zio_flags |= ZIO_FLAG_METADATA; 961c717a561Smaybee zio = arc_write(pio, os->os_spa, 962c5c6ffa0Smaybee zio_checksum_select(db->db_dnode->dn_checksum, os->os_checksum), 963c5c6ffa0Smaybee zio_compress_select(db->db_dnode->dn_compress, os->os_compress), 964d0ad202dSahrens dmu_get_replication_level(os, &zb, db->db_dnode->dn_type), 965c717a561Smaybee txg, bp, dr->dt.dl.dr_data, NULL, dmu_sync_done, in, 9669bc11082Sek110237 ZIO_PRIORITY_SYNC_WRITE, zio_flags, &zb); 967fa9e4066Sahrens 968c717a561Smaybee if (pio) { 969c717a561Smaybee zio_nowait(zio); 970c717a561Smaybee err = EINPROGRESS; 971c717a561Smaybee } else { 972c717a561Smaybee err = zio_wait(zio); 973c717a561Smaybee ASSERT(err == 0); 974c717a561Smaybee } 975c717a561Smaybee return (err); 976fa9e4066Sahrens } 977fa9e4066Sahrens 978fa9e4066Sahrens int 979fa9e4066Sahrens dmu_object_set_blocksize(objset_t *os, uint64_t object, uint64_t size, int ibs, 980fa9e4066Sahrens dmu_tx_t *tx) 981fa9e4066Sahrens { 982ea8dc4b6Seschrock dnode_t *dn; 983ea8dc4b6Seschrock int err; 
984ea8dc4b6Seschrock 985ea8dc4b6Seschrock err = dnode_hold(os->os, object, FTAG, &dn); 986ea8dc4b6Seschrock if (err) 987ea8dc4b6Seschrock return (err); 988ea8dc4b6Seschrock err = dnode_set_blksz(dn, size, ibs, tx); 989fa9e4066Sahrens dnode_rele(dn, FTAG); 990fa9e4066Sahrens return (err); 991fa9e4066Sahrens } 992fa9e4066Sahrens 993fa9e4066Sahrens void 994fa9e4066Sahrens dmu_object_set_checksum(objset_t *os, uint64_t object, uint8_t checksum, 995fa9e4066Sahrens dmu_tx_t *tx) 996fa9e4066Sahrens { 997ea8dc4b6Seschrock dnode_t *dn; 998ea8dc4b6Seschrock 999ea8dc4b6Seschrock /* XXX assumes dnode_hold will not get an i/o error */ 1000ea8dc4b6Seschrock (void) dnode_hold(os->os, object, FTAG, &dn); 1001fa9e4066Sahrens ASSERT(checksum < ZIO_CHECKSUM_FUNCTIONS); 1002fa9e4066Sahrens dn->dn_checksum = checksum; 1003fa9e4066Sahrens dnode_setdirty(dn, tx); 1004fa9e4066Sahrens dnode_rele(dn, FTAG); 1005fa9e4066Sahrens } 1006fa9e4066Sahrens 1007fa9e4066Sahrens void 1008fa9e4066Sahrens dmu_object_set_compress(objset_t *os, uint64_t object, uint8_t compress, 1009fa9e4066Sahrens dmu_tx_t *tx) 1010fa9e4066Sahrens { 1011ea8dc4b6Seschrock dnode_t *dn; 1012ea8dc4b6Seschrock 1013ea8dc4b6Seschrock /* XXX assumes dnode_hold will not get an i/o error */ 1014ea8dc4b6Seschrock (void) dnode_hold(os->os, object, FTAG, &dn); 1015fa9e4066Sahrens ASSERT(compress < ZIO_COMPRESS_FUNCTIONS); 1016fa9e4066Sahrens dn->dn_compress = compress; 1017fa9e4066Sahrens dnode_setdirty(dn, tx); 1018fa9e4066Sahrens dnode_rele(dn, FTAG); 1019fa9e4066Sahrens } 1020fa9e4066Sahrens 102144cd46caSbillm int 1022d0ad202dSahrens dmu_get_replication_level(objset_impl_t *os, 1023d0ad202dSahrens zbookmark_t *zb, dmu_object_type_t ot) 102444cd46caSbillm { 1025d0ad202dSahrens int ncopies = os->os_copies; 102644cd46caSbillm 1027d0ad202dSahrens /* If it's the mos, it should have max copies set. 
*/ 1028d0ad202dSahrens ASSERT(zb->zb_objset != 0 || 1029d0ad202dSahrens ncopies == spa_max_replication(os->os_spa)); 1030d0ad202dSahrens 1031d0ad202dSahrens if (dmu_ot[ot].ot_metadata || zb->zb_level != 0) 103244cd46caSbillm ncopies++; 1033d0ad202dSahrens return (MIN(ncopies, spa_max_replication(os->os_spa))); 103444cd46caSbillm } 103544cd46caSbillm 1036fa9e4066Sahrens int 1037fa9e4066Sahrens dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole, uint64_t *off) 1038fa9e4066Sahrens { 1039fa9e4066Sahrens dnode_t *dn; 1040fa9e4066Sahrens int i, err; 1041fa9e4066Sahrens 1042ea8dc4b6Seschrock err = dnode_hold(os->os, object, FTAG, &dn); 1043ea8dc4b6Seschrock if (err) 1044ea8dc4b6Seschrock return (err); 1045fa9e4066Sahrens /* 1046fa9e4066Sahrens * Sync any current changes before 1047fa9e4066Sahrens * we go trundling through the block pointers. 1048fa9e4066Sahrens */ 1049fa9e4066Sahrens for (i = 0; i < TXG_SIZE; i++) { 1050c543ec06Sahrens if (list_link_active(&dn->dn_dirty_link[i])) 1051fa9e4066Sahrens break; 1052fa9e4066Sahrens } 1053fa9e4066Sahrens if (i != TXG_SIZE) { 1054fa9e4066Sahrens dnode_rele(dn, FTAG); 1055fa9e4066Sahrens txg_wait_synced(dmu_objset_pool(os), 0); 1056ea8dc4b6Seschrock err = dnode_hold(os->os, object, FTAG, &dn); 1057ea8dc4b6Seschrock if (err) 1058ea8dc4b6Seschrock return (err); 1059fa9e4066Sahrens } 1060fa9e4066Sahrens 1061*cdb0ab79Smaybee err = dnode_next_offset(dn, (hole ? DNODE_FIND_HOLE : 0), off, 1, 1, 0); 1062fa9e4066Sahrens dnode_rele(dn, FTAG); 1063fa9e4066Sahrens 1064fa9e4066Sahrens return (err); 1065fa9e4066Sahrens } 1066fa9e4066Sahrens 1067fa9e4066Sahrens void 1068fa9e4066Sahrens dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi) 1069fa9e4066Sahrens { 1070fa9e4066Sahrens rw_enter(&dn->dn_struct_rwlock, RW_READER); 1071fa9e4066Sahrens mutex_enter(&dn->dn_mtx); 1072fa9e4066Sahrens 1073fa9e4066Sahrens doi->doi_data_block_size = dn->dn_datablksz; 1074fa9e4066Sahrens doi->doi_metadata_block_size = dn->dn_indblkshift ? 
1075fa9e4066Sahrens 1ULL << dn->dn_indblkshift : 0; 1076fa9e4066Sahrens doi->doi_indirection = dn->dn_nlevels; 1077fa9e4066Sahrens doi->doi_checksum = dn->dn_checksum; 1078fa9e4066Sahrens doi->doi_compress = dn->dn_compress; 107999653d4eSeschrock doi->doi_physical_blks = (DN_USED_BYTES(dn->dn_phys) + 108099653d4eSeschrock SPA_MINBLOCKSIZE/2) >> SPA_MINBLOCKSHIFT; 1081fa9e4066Sahrens doi->doi_max_block_offset = dn->dn_phys->dn_maxblkid; 1082fa9e4066Sahrens doi->doi_type = dn->dn_type; 1083fa9e4066Sahrens doi->doi_bonus_size = dn->dn_bonuslen; 1084fa9e4066Sahrens doi->doi_bonus_type = dn->dn_bonustype; 1085fa9e4066Sahrens 1086fa9e4066Sahrens mutex_exit(&dn->dn_mtx); 1087fa9e4066Sahrens rw_exit(&dn->dn_struct_rwlock); 1088fa9e4066Sahrens } 1089fa9e4066Sahrens 1090fa9e4066Sahrens /* 1091fa9e4066Sahrens * Get information on a DMU object. 1092fa9e4066Sahrens * If doi is NULL, just indicates whether the object exists. 1093fa9e4066Sahrens */ 1094fa9e4066Sahrens int 1095fa9e4066Sahrens dmu_object_info(objset_t *os, uint64_t object, dmu_object_info_t *doi) 1096fa9e4066Sahrens { 1097ea8dc4b6Seschrock dnode_t *dn; 1098ea8dc4b6Seschrock int err = dnode_hold(os->os, object, FTAG, &dn); 1099fa9e4066Sahrens 1100ea8dc4b6Seschrock if (err) 1101ea8dc4b6Seschrock return (err); 1102fa9e4066Sahrens 1103fa9e4066Sahrens if (doi != NULL) 1104fa9e4066Sahrens dmu_object_info_from_dnode(dn, doi); 1105fa9e4066Sahrens 1106fa9e4066Sahrens dnode_rele(dn, FTAG); 1107fa9e4066Sahrens return (0); 1108fa9e4066Sahrens } 1109fa9e4066Sahrens 1110fa9e4066Sahrens /* 1111fa9e4066Sahrens * As above, but faster; can be used when you have a held dbuf in hand. 
1112fa9e4066Sahrens */ 1113fa9e4066Sahrens void 1114fa9e4066Sahrens dmu_object_info_from_db(dmu_buf_t *db, dmu_object_info_t *doi) 1115fa9e4066Sahrens { 1116fa9e4066Sahrens dmu_object_info_from_dnode(((dmu_buf_impl_t *)db)->db_dnode, doi); 1117fa9e4066Sahrens } 1118fa9e4066Sahrens 1119fa9e4066Sahrens /* 1120fa9e4066Sahrens * Faster still when you only care about the size. 1121fa9e4066Sahrens * This is specifically optimized for zfs_getattr(). 1122fa9e4066Sahrens */ 1123fa9e4066Sahrens void 1124fa9e4066Sahrens dmu_object_size_from_db(dmu_buf_t *db, uint32_t *blksize, u_longlong_t *nblk512) 1125fa9e4066Sahrens { 1126fa9e4066Sahrens dnode_t *dn = ((dmu_buf_impl_t *)db)->db_dnode; 1127fa9e4066Sahrens 1128fa9e4066Sahrens *blksize = dn->dn_datablksz; 112999653d4eSeschrock /* add 1 for dnode space */ 113099653d4eSeschrock *nblk512 = ((DN_USED_BYTES(dn->dn_phys) + SPA_MINBLOCKSIZE/2) >> 113199653d4eSeschrock SPA_MINBLOCKSHIFT) + 1; 1132fa9e4066Sahrens } 1133fa9e4066Sahrens 1134fa9e4066Sahrens void 1135fa9e4066Sahrens byteswap_uint64_array(void *vbuf, size_t size) 1136fa9e4066Sahrens { 1137fa9e4066Sahrens uint64_t *buf = vbuf; 1138fa9e4066Sahrens size_t count = size >> 3; 1139fa9e4066Sahrens int i; 1140fa9e4066Sahrens 1141fa9e4066Sahrens ASSERT((size & 7) == 0); 1142fa9e4066Sahrens 1143fa9e4066Sahrens for (i = 0; i < count; i++) 1144fa9e4066Sahrens buf[i] = BSWAP_64(buf[i]); 1145fa9e4066Sahrens } 1146fa9e4066Sahrens 1147fa9e4066Sahrens void 1148fa9e4066Sahrens byteswap_uint32_array(void *vbuf, size_t size) 1149fa9e4066Sahrens { 1150fa9e4066Sahrens uint32_t *buf = vbuf; 1151fa9e4066Sahrens size_t count = size >> 2; 1152fa9e4066Sahrens int i; 1153fa9e4066Sahrens 1154fa9e4066Sahrens ASSERT((size & 3) == 0); 1155fa9e4066Sahrens 1156fa9e4066Sahrens for (i = 0; i < count; i++) 1157fa9e4066Sahrens buf[i] = BSWAP_32(buf[i]); 1158fa9e4066Sahrens } 1159fa9e4066Sahrens 1160fa9e4066Sahrens void 1161fa9e4066Sahrens byteswap_uint16_array(void *vbuf, size_t size) 1162fa9e4066Sahrens { 
1163fa9e4066Sahrens uint16_t *buf = vbuf; 1164fa9e4066Sahrens size_t count = size >> 1; 1165fa9e4066Sahrens int i; 1166fa9e4066Sahrens 1167fa9e4066Sahrens ASSERT((size & 1) == 0); 1168fa9e4066Sahrens 1169fa9e4066Sahrens for (i = 0; i < count; i++) 1170fa9e4066Sahrens buf[i] = BSWAP_16(buf[i]); 1171fa9e4066Sahrens } 1172fa9e4066Sahrens 1173fa9e4066Sahrens /* ARGSUSED */ 1174fa9e4066Sahrens void 1175fa9e4066Sahrens byteswap_uint8_array(void *vbuf, size_t size) 1176fa9e4066Sahrens { 1177fa9e4066Sahrens } 1178fa9e4066Sahrens 1179fa9e4066Sahrens void 1180fa9e4066Sahrens dmu_init(void) 1181fa9e4066Sahrens { 1182fa9e4066Sahrens dbuf_init(); 1183fa9e4066Sahrens dnode_init(); 1184fa9e4066Sahrens arc_init(); 1185fa94a07fSbrendan l2arc_init(); 1186fa9e4066Sahrens } 1187fa9e4066Sahrens 1188fa9e4066Sahrens void 1189fa9e4066Sahrens dmu_fini(void) 1190fa9e4066Sahrens { 1191fa9e4066Sahrens arc_fini(); 1192fa9e4066Sahrens dnode_fini(); 1193fa9e4066Sahrens dbuf_fini(); 1194fa94a07fSbrendan l2arc_fini(); 1195fa9e4066Sahrens } 1196