/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
24fa9e4066Sahrens */ 25fa9e4066Sahrens 26fa9e4066Sahrens #pragma ident "%Z%%M% %I% %E% SMI" 27fa9e4066Sahrens 28fa9e4066Sahrens #include <sys/dmu.h> 29fa9e4066Sahrens #include <sys/dmu_impl.h> 30fa9e4066Sahrens #include <sys/dmu_tx.h> 31fa9e4066Sahrens #include <sys/dbuf.h> 32fa9e4066Sahrens #include <sys/dnode.h> 33fa9e4066Sahrens #include <sys/zfs_context.h> 34fa9e4066Sahrens #include <sys/dmu_objset.h> 35fa9e4066Sahrens #include <sys/dmu_traverse.h> 36fa9e4066Sahrens #include <sys/dsl_dataset.h> 37fa9e4066Sahrens #include <sys/dsl_dir.h> 38fa9e4066Sahrens #include <sys/dsl_pool.h> 391d452cf5Sahrens #include <sys/dsl_synctask.h> 40a2eea2e1Sahrens #include <sys/dsl_prop.h> 41fa9e4066Sahrens #include <sys/dmu_zfetch.h> 42fa9e4066Sahrens #include <sys/zfs_ioctl.h> 43fa9e4066Sahrens #include <sys/zap.h> 44ea8dc4b6Seschrock #include <sys/zio_checksum.h> 4544eda4d7Smaybee #ifdef _KERNEL 4644eda4d7Smaybee #include <sys/vmsystm.h> 4744eda4d7Smaybee #endif 48fa9e4066Sahrens 49fa9e4066Sahrens const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = { 50fa9e4066Sahrens { byteswap_uint8_array, TRUE, "unallocated" }, 51fa9e4066Sahrens { zap_byteswap, TRUE, "object directory" }, 52fa9e4066Sahrens { byteswap_uint64_array, TRUE, "object array" }, 53fa9e4066Sahrens { byteswap_uint8_array, TRUE, "packed nvlist" }, 54fa9e4066Sahrens { byteswap_uint64_array, TRUE, "packed nvlist size" }, 55fa9e4066Sahrens { byteswap_uint64_array, TRUE, "bplist" }, 56fa9e4066Sahrens { byteswap_uint64_array, TRUE, "bplist header" }, 57fa9e4066Sahrens { byteswap_uint64_array, TRUE, "SPA space map header" }, 58fa9e4066Sahrens { byteswap_uint64_array, TRUE, "SPA space map" }, 59fa9e4066Sahrens { byteswap_uint64_array, TRUE, "ZIL intent log" }, 60fa9e4066Sahrens { dnode_buf_byteswap, TRUE, "DMU dnode" }, 61fa9e4066Sahrens { dmu_objset_byteswap, TRUE, "DMU objset" }, 62fa9e4066Sahrens { byteswap_uint64_array, TRUE, "DSL directory" }, 63fa9e4066Sahrens { zap_byteswap, TRUE, "DSL directory child map"}, 
64fa9e4066Sahrens { zap_byteswap, TRUE, "DSL dataset snap map" }, 65fa9e4066Sahrens { zap_byteswap, TRUE, "DSL props" }, 66fa9e4066Sahrens { byteswap_uint64_array, TRUE, "DSL dataset" }, 67fa9e4066Sahrens { zfs_znode_byteswap, TRUE, "ZFS znode" }, 68fa9e4066Sahrens { zfs_acl_byteswap, TRUE, "ZFS ACL" }, 69fa9e4066Sahrens { byteswap_uint8_array, FALSE, "ZFS plain file" }, 70fa9e4066Sahrens { zap_byteswap, TRUE, "ZFS directory" }, 71fa9e4066Sahrens { zap_byteswap, TRUE, "ZFS master node" }, 72fa9e4066Sahrens { zap_byteswap, TRUE, "ZFS delete queue" }, 73fa9e4066Sahrens { byteswap_uint8_array, FALSE, "zvol object" }, 74fa9e4066Sahrens { zap_byteswap, TRUE, "zvol prop" }, 75fa9e4066Sahrens { byteswap_uint8_array, FALSE, "other uint8[]" }, 76fa9e4066Sahrens { byteswap_uint64_array, FALSE, "other uint64[]" }, 77fa9e4066Sahrens { zap_byteswap, TRUE, "other ZAP" }, 78ea8dc4b6Seschrock { zap_byteswap, TRUE, "persistent error log" }, 7906eeb2adSek110237 { byteswap_uint8_array, TRUE, "SPA history" }, 8006eeb2adSek110237 { byteswap_uint64_array, TRUE, "SPA history offsets" }, 81b1b8ab34Slling { zap_byteswap, TRUE, "Pool properties" }, 82ecd6cf80Smarks { zap_byteswap, TRUE, "DSL permissions" } 83fa9e4066Sahrens }; 84fa9e4066Sahrens 85fa9e4066Sahrens int 86ea8dc4b6Seschrock dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset, 87ea8dc4b6Seschrock void *tag, dmu_buf_t **dbp) 88fa9e4066Sahrens { 89fa9e4066Sahrens dnode_t *dn; 90fa9e4066Sahrens uint64_t blkid; 91fa9e4066Sahrens dmu_buf_impl_t *db; 92ea8dc4b6Seschrock int err; 93fa9e4066Sahrens 94ea8dc4b6Seschrock err = dnode_hold(os->os, object, FTAG, &dn); 95ea8dc4b6Seschrock if (err) 96ea8dc4b6Seschrock return (err); 97fa9e4066Sahrens blkid = dbuf_whichblock(dn, offset); 98fa9e4066Sahrens rw_enter(&dn->dn_struct_rwlock, RW_READER); 99ea8dc4b6Seschrock db = dbuf_hold(dn, blkid, tag); 100fa9e4066Sahrens rw_exit(&dn->dn_struct_rwlock); 101ea8dc4b6Seschrock if (db == NULL) { 102ea8dc4b6Seschrock err = EIO; 103ea8dc4b6Seschrock 
} else { 104ea8dc4b6Seschrock err = dbuf_read(db, NULL, DB_RF_CANFAIL); 105ea8dc4b6Seschrock if (err) { 106ea8dc4b6Seschrock dbuf_rele(db, tag); 107ea8dc4b6Seschrock db = NULL; 108ea8dc4b6Seschrock } 109fa9e4066Sahrens } 110fa9e4066Sahrens 111ea8dc4b6Seschrock dnode_rele(dn, FTAG); 112ea8dc4b6Seschrock *dbp = &db->db; 113ea8dc4b6Seschrock return (err); 114fa9e4066Sahrens } 115fa9e4066Sahrens 116fa9e4066Sahrens int 117fa9e4066Sahrens dmu_bonus_max(void) 118fa9e4066Sahrens { 119fa9e4066Sahrens return (DN_MAX_BONUSLEN); 120fa9e4066Sahrens } 121fa9e4066Sahrens 122*1934e92fSmaybee int 123*1934e92fSmaybee dmu_set_bonus(dmu_buf_t *db, int newsize, dmu_tx_t *tx) 124*1934e92fSmaybee { 125*1934e92fSmaybee dnode_t *dn = ((dmu_buf_impl_t *)db)->db_dnode; 126*1934e92fSmaybee 127*1934e92fSmaybee if (dn->dn_bonus != (dmu_buf_impl_t *)db) 128*1934e92fSmaybee return (EINVAL); 129*1934e92fSmaybee if (newsize < 0 || newsize > db->db_size) 130*1934e92fSmaybee return (EINVAL); 131*1934e92fSmaybee dnode_setbonuslen(dn, newsize, tx); 132*1934e92fSmaybee return (0); 133*1934e92fSmaybee } 134*1934e92fSmaybee 135fa9e4066Sahrens /* 136ea8dc4b6Seschrock * returns ENOENT, EIO, or 0. 
137fa9e4066Sahrens */ 138ea8dc4b6Seschrock int 139ea8dc4b6Seschrock dmu_bonus_hold(objset_t *os, uint64_t object, void *tag, dmu_buf_t **dbp) 140fa9e4066Sahrens { 141ea8dc4b6Seschrock dnode_t *dn; 142fa9e4066Sahrens dmu_buf_impl_t *db; 143*1934e92fSmaybee int error; 144fa9e4066Sahrens 145*1934e92fSmaybee error = dnode_hold(os->os, object, FTAG, &dn); 146*1934e92fSmaybee if (error) 147*1934e92fSmaybee return (error); 148fa9e4066Sahrens 149fa9e4066Sahrens rw_enter(&dn->dn_struct_rwlock, RW_READER); 150ea8dc4b6Seschrock if (dn->dn_bonus == NULL) { 151fa9e4066Sahrens rw_exit(&dn->dn_struct_rwlock); 152ea8dc4b6Seschrock rw_enter(&dn->dn_struct_rwlock, RW_WRITER); 153ea8dc4b6Seschrock if (dn->dn_bonus == NULL) 154*1934e92fSmaybee dbuf_create_bonus(dn); 155fa9e4066Sahrens } 156ea8dc4b6Seschrock db = dn->dn_bonus; 157ea8dc4b6Seschrock rw_exit(&dn->dn_struct_rwlock); 158*1934e92fSmaybee 159*1934e92fSmaybee /* as long as the bonus buf is held, the dnode will be held */ 160*1934e92fSmaybee if (refcount_add(&db->db_holds, tag) == 1) 161*1934e92fSmaybee VERIFY(dnode_add_ref(dn, db)); 162*1934e92fSmaybee 163fa9e4066Sahrens dnode_rele(dn, FTAG); 164ea8dc4b6Seschrock 165ea8dc4b6Seschrock VERIFY(0 == dbuf_read(db, NULL, DB_RF_MUST_SUCCEED)); 166ea8dc4b6Seschrock 167ea8dc4b6Seschrock *dbp = &db->db; 168ea8dc4b6Seschrock return (0); 169fa9e4066Sahrens } 170fa9e4066Sahrens 17113506d1eSmaybee /* 17213506d1eSmaybee * Note: longer-term, we should modify all of the dmu_buf_*() interfaces 17313506d1eSmaybee * to take a held dnode rather than <os, object> -- the lookup is wasteful, 17413506d1eSmaybee * and can induce severe lock contention when writing to several files 17513506d1eSmaybee * whose dnodes are in the same block. 
17613506d1eSmaybee */ 17713506d1eSmaybee static int 17813506d1eSmaybee dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, 179ea8dc4b6Seschrock uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp) 180fa9e4066Sahrens { 181fa9e4066Sahrens dmu_buf_t **dbp; 182fa9e4066Sahrens uint64_t blkid, nblks, i; 183ea8dc4b6Seschrock uint32_t flags; 184ea8dc4b6Seschrock int err; 185ea8dc4b6Seschrock zio_t *zio; 186ea8dc4b6Seschrock 187ea8dc4b6Seschrock ASSERT(length <= DMU_MAX_ACCESS); 188fa9e4066Sahrens 189ea8dc4b6Seschrock flags = DB_RF_CANFAIL | DB_RF_NEVERWAIT; 190e1930233Sbonwick if (length > zfetch_array_rd_sz) 191ea8dc4b6Seschrock flags |= DB_RF_NOPREFETCH; 192ea8dc4b6Seschrock 193fa9e4066Sahrens rw_enter(&dn->dn_struct_rwlock, RW_READER); 194fa9e4066Sahrens if (dn->dn_datablkshift) { 195fa9e4066Sahrens int blkshift = dn->dn_datablkshift; 196fa9e4066Sahrens nblks = (P2ROUNDUP(offset+length, 1ULL<<blkshift) - 197fa9e4066Sahrens P2ALIGN(offset, 1ULL<<blkshift)) >> blkshift; 198fa9e4066Sahrens } else { 1990125049cSahrens if (offset + length > dn->dn_datablksz) { 2000125049cSahrens zfs_panic_recover("zfs: accessing past end of object " 2010125049cSahrens "%llx/%llx (size=%u access=%llu+%llu)", 2020125049cSahrens (longlong_t)dn->dn_objset-> 2030125049cSahrens os_dsl_dataset->ds_object, 2040125049cSahrens (longlong_t)dn->dn_object, dn->dn_datablksz, 2050125049cSahrens (longlong_t)offset, (longlong_t)length); 2060125049cSahrens return (EIO); 2070125049cSahrens } 208fa9e4066Sahrens nblks = 1; 209fa9e4066Sahrens } 210ea8dc4b6Seschrock dbp = kmem_zalloc(sizeof (dmu_buf_t *) * nblks, KM_SLEEP); 211fa9e4066Sahrens 212ea8dc4b6Seschrock zio = zio_root(dn->dn_objset->os_spa, NULL, NULL, TRUE); 213fa9e4066Sahrens blkid = dbuf_whichblock(dn, offset); 214fa9e4066Sahrens for (i = 0; i < nblks; i++) { 215ea8dc4b6Seschrock dmu_buf_impl_t *db = dbuf_hold(dn, blkid+i, tag); 216ea8dc4b6Seschrock if (db == NULL) { 217ea8dc4b6Seschrock rw_exit(&dn->dn_struct_rwlock); 
218ea8dc4b6Seschrock dmu_buf_rele_array(dbp, nblks, tag); 219ea8dc4b6Seschrock zio_nowait(zio); 220ea8dc4b6Seschrock return (EIO); 221ea8dc4b6Seschrock } 222ea8dc4b6Seschrock /* initiate async i/o */ 22313506d1eSmaybee if (read) { 224ea8dc4b6Seschrock rw_exit(&dn->dn_struct_rwlock); 225ea8dc4b6Seschrock (void) dbuf_read(db, zio, flags); 226ea8dc4b6Seschrock rw_enter(&dn->dn_struct_rwlock, RW_READER); 227ea8dc4b6Seschrock } 228ea8dc4b6Seschrock dbp[i] = &db->db; 229fa9e4066Sahrens } 230fa9e4066Sahrens rw_exit(&dn->dn_struct_rwlock); 231fa9e4066Sahrens 232ea8dc4b6Seschrock /* wait for async i/o */ 233ea8dc4b6Seschrock err = zio_wait(zio); 234ea8dc4b6Seschrock if (err) { 235ea8dc4b6Seschrock dmu_buf_rele_array(dbp, nblks, tag); 236ea8dc4b6Seschrock return (err); 237ea8dc4b6Seschrock } 238ea8dc4b6Seschrock 239ea8dc4b6Seschrock /* wait for other io to complete */ 240ea8dc4b6Seschrock if (read) { 241ea8dc4b6Seschrock for (i = 0; i < nblks; i++) { 242ea8dc4b6Seschrock dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbp[i]; 243ea8dc4b6Seschrock mutex_enter(&db->db_mtx); 244ea8dc4b6Seschrock while (db->db_state == DB_READ || 245ea8dc4b6Seschrock db->db_state == DB_FILL) 246ea8dc4b6Seschrock cv_wait(&db->db_changed, &db->db_mtx); 247ea8dc4b6Seschrock if (db->db_state == DB_UNCACHED) 248ea8dc4b6Seschrock err = EIO; 249ea8dc4b6Seschrock mutex_exit(&db->db_mtx); 250ea8dc4b6Seschrock if (err) { 251ea8dc4b6Seschrock dmu_buf_rele_array(dbp, nblks, tag); 252ea8dc4b6Seschrock return (err); 253ea8dc4b6Seschrock } 254ea8dc4b6Seschrock } 255ea8dc4b6Seschrock } 256ea8dc4b6Seschrock 257ea8dc4b6Seschrock *numbufsp = nblks; 258ea8dc4b6Seschrock *dbpp = dbp; 259ea8dc4b6Seschrock return (0); 260fa9e4066Sahrens } 261fa9e4066Sahrens 262a2eea2e1Sahrens static int 26313506d1eSmaybee dmu_buf_hold_array(objset_t *os, uint64_t object, uint64_t offset, 26413506d1eSmaybee uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp) 26513506d1eSmaybee { 26613506d1eSmaybee dnode_t *dn; 
26713506d1eSmaybee int err; 26813506d1eSmaybee 26913506d1eSmaybee err = dnode_hold(os->os, object, FTAG, &dn); 27013506d1eSmaybee if (err) 27113506d1eSmaybee return (err); 27213506d1eSmaybee 27313506d1eSmaybee err = dmu_buf_hold_array_by_dnode(dn, offset, length, read, tag, 27413506d1eSmaybee numbufsp, dbpp); 27513506d1eSmaybee 27613506d1eSmaybee dnode_rele(dn, FTAG); 27713506d1eSmaybee 27813506d1eSmaybee return (err); 27913506d1eSmaybee } 28013506d1eSmaybee 28113506d1eSmaybee int 28213506d1eSmaybee dmu_buf_hold_array_by_bonus(dmu_buf_t *db, uint64_t offset, 28313506d1eSmaybee uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp) 28413506d1eSmaybee { 28513506d1eSmaybee dnode_t *dn = ((dmu_buf_impl_t *)db)->db_dnode; 28613506d1eSmaybee int err; 28713506d1eSmaybee 28813506d1eSmaybee err = dmu_buf_hold_array_by_dnode(dn, offset, length, read, tag, 28913506d1eSmaybee numbufsp, dbpp); 29013506d1eSmaybee 29113506d1eSmaybee return (err); 29213506d1eSmaybee } 29313506d1eSmaybee 294fa9e4066Sahrens void 295ea8dc4b6Seschrock dmu_buf_rele_array(dmu_buf_t **dbp_fake, int numbufs, void *tag) 296fa9e4066Sahrens { 297fa9e4066Sahrens int i; 298fa9e4066Sahrens dmu_buf_impl_t **dbp = (dmu_buf_impl_t **)dbp_fake; 299fa9e4066Sahrens 300fa9e4066Sahrens if (numbufs == 0) 301fa9e4066Sahrens return; 302fa9e4066Sahrens 303ea8dc4b6Seschrock for (i = 0; i < numbufs; i++) { 304ea8dc4b6Seschrock if (dbp[i]) 305ea8dc4b6Seschrock dbuf_rele(dbp[i], tag); 306ea8dc4b6Seschrock } 307fa9e4066Sahrens 308fa9e4066Sahrens kmem_free(dbp, sizeof (dmu_buf_t *) * numbufs); 309fa9e4066Sahrens } 310fa9e4066Sahrens 311fa9e4066Sahrens void 312fa9e4066Sahrens dmu_prefetch(objset_t *os, uint64_t object, uint64_t offset, uint64_t len) 313fa9e4066Sahrens { 314fa9e4066Sahrens dnode_t *dn; 315fa9e4066Sahrens uint64_t blkid; 316ea8dc4b6Seschrock int nblks, i, err; 317fa9e4066Sahrens 318416e0cd8Sek110237 if (zfs_prefetch_disable) 319416e0cd8Sek110237 return; 320416e0cd8Sek110237 321fa9e4066Sahrens if 
(len == 0) { /* they're interested in the bonus buffer */ 322fa9e4066Sahrens dn = os->os->os_meta_dnode; 323fa9e4066Sahrens 324fa9e4066Sahrens if (object == 0 || object >= DN_MAX_OBJECT) 325fa9e4066Sahrens return; 326fa9e4066Sahrens 327fa9e4066Sahrens rw_enter(&dn->dn_struct_rwlock, RW_READER); 328fa9e4066Sahrens blkid = dbuf_whichblock(dn, object * sizeof (dnode_phys_t)); 329fa9e4066Sahrens dbuf_prefetch(dn, blkid); 330fa9e4066Sahrens rw_exit(&dn->dn_struct_rwlock); 331fa9e4066Sahrens return; 332fa9e4066Sahrens } 333fa9e4066Sahrens 334fa9e4066Sahrens /* 335fa9e4066Sahrens * XXX - Note, if the dnode for the requested object is not 336fa9e4066Sahrens * already cached, we will do a *synchronous* read in the 337fa9e4066Sahrens * dnode_hold() call. The same is true for any indirects. 338fa9e4066Sahrens */ 339ea8dc4b6Seschrock err = dnode_hold(os->os, object, FTAG, &dn); 340ea8dc4b6Seschrock if (err != 0) 341fa9e4066Sahrens return; 342fa9e4066Sahrens 343fa9e4066Sahrens rw_enter(&dn->dn_struct_rwlock, RW_READER); 344fa9e4066Sahrens if (dn->dn_datablkshift) { 345fa9e4066Sahrens int blkshift = dn->dn_datablkshift; 346fa9e4066Sahrens nblks = (P2ROUNDUP(offset+len, 1<<blkshift) - 347fa9e4066Sahrens P2ALIGN(offset, 1<<blkshift)) >> blkshift; 348fa9e4066Sahrens } else { 349fa9e4066Sahrens nblks = (offset < dn->dn_datablksz); 350fa9e4066Sahrens } 351fa9e4066Sahrens 352fa9e4066Sahrens if (nblks != 0) { 353fa9e4066Sahrens blkid = dbuf_whichblock(dn, offset); 354fa9e4066Sahrens for (i = 0; i < nblks; i++) 355fa9e4066Sahrens dbuf_prefetch(dn, blkid+i); 356fa9e4066Sahrens } 357fa9e4066Sahrens 358fa9e4066Sahrens rw_exit(&dn->dn_struct_rwlock); 359fa9e4066Sahrens 360fa9e4066Sahrens dnode_rele(dn, FTAG); 361fa9e4066Sahrens } 362fa9e4066Sahrens 363ea8dc4b6Seschrock int 364fa9e4066Sahrens dmu_free_range(objset_t *os, uint64_t object, uint64_t offset, 365fa9e4066Sahrens uint64_t size, dmu_tx_t *tx) 366fa9e4066Sahrens { 367ea8dc4b6Seschrock dnode_t *dn; 368ea8dc4b6Seschrock int err = 
dnode_hold(os->os, object, FTAG, &dn); 369ea8dc4b6Seschrock if (err) 370ea8dc4b6Seschrock return (err); 371fa9e4066Sahrens ASSERT(offset < UINT64_MAX); 372fa9e4066Sahrens ASSERT(size == -1ULL || size <= UINT64_MAX - offset); 373fa9e4066Sahrens dnode_free_range(dn, offset, size, tx); 374fa9e4066Sahrens dnode_rele(dn, FTAG); 375ea8dc4b6Seschrock return (0); 376fa9e4066Sahrens } 377fa9e4066Sahrens 378ea8dc4b6Seschrock int 379ea8dc4b6Seschrock dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, 380ea8dc4b6Seschrock void *buf) 381fa9e4066Sahrens { 382fa9e4066Sahrens dnode_t *dn; 383fa9e4066Sahrens dmu_buf_t **dbp; 384ea8dc4b6Seschrock int numbufs, i, err; 385fa9e4066Sahrens 386ea8dc4b6Seschrock err = dnode_hold(os->os, object, FTAG, &dn); 387ea8dc4b6Seschrock if (err) 388ea8dc4b6Seschrock return (err); 389feb08c6bSbillm 390feb08c6bSbillm /* 391feb08c6bSbillm * Deal with odd block sizes, where there can't be data past the first 392feb08c6bSbillm * block. If we ever do the tail block optimization, we will need to 393feb08c6bSbillm * handle that here as well. 394feb08c6bSbillm */ 395fa9e4066Sahrens if (dn->dn_datablkshift == 0) { 396fa9e4066Sahrens int newsz = offset > dn->dn_datablksz ? 0 : 397fa9e4066Sahrens MIN(size, dn->dn_datablksz - offset); 398fa9e4066Sahrens bzero((char *)buf + newsz, size - newsz); 399fa9e4066Sahrens size = newsz; 400fa9e4066Sahrens } 401fa9e4066Sahrens 402fa9e4066Sahrens while (size > 0) { 403fa9e4066Sahrens uint64_t mylen = MIN(size, DMU_MAX_ACCESS / 2); 404fa9e4066Sahrens 405fa9e4066Sahrens /* 406fa9e4066Sahrens * NB: we could do this block-at-a-time, but it's nice 407fa9e4066Sahrens * to be reading in parallel. 
408fa9e4066Sahrens */ 409a2eea2e1Sahrens err = dmu_buf_hold_array_by_dnode(dn, offset, mylen, 410ea8dc4b6Seschrock TRUE, FTAG, &numbufs, &dbp); 411ea8dc4b6Seschrock if (err) 412*1934e92fSmaybee break; 413fa9e4066Sahrens 414fa9e4066Sahrens for (i = 0; i < numbufs; i++) { 415fa9e4066Sahrens int tocpy; 416fa9e4066Sahrens int bufoff; 417fa9e4066Sahrens dmu_buf_t *db = dbp[i]; 418fa9e4066Sahrens 419fa9e4066Sahrens ASSERT(size > 0); 420fa9e4066Sahrens 421fa9e4066Sahrens bufoff = offset - db->db_offset; 422fa9e4066Sahrens tocpy = (int)MIN(db->db_size - bufoff, size); 423fa9e4066Sahrens 424fa9e4066Sahrens bcopy((char *)db->db_data + bufoff, buf, tocpy); 425fa9e4066Sahrens 426fa9e4066Sahrens offset += tocpy; 427fa9e4066Sahrens size -= tocpy; 428fa9e4066Sahrens buf = (char *)buf + tocpy; 429fa9e4066Sahrens } 430ea8dc4b6Seschrock dmu_buf_rele_array(dbp, numbufs, FTAG); 431fa9e4066Sahrens } 432a2eea2e1Sahrens dnode_rele(dn, FTAG); 433*1934e92fSmaybee return (err); 434fa9e4066Sahrens } 435fa9e4066Sahrens 436fa9e4066Sahrens void 437fa9e4066Sahrens dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, 438fa9e4066Sahrens const void *buf, dmu_tx_t *tx) 439fa9e4066Sahrens { 440fa9e4066Sahrens dmu_buf_t **dbp; 441fa9e4066Sahrens int numbufs, i; 442fa9e4066Sahrens 44313506d1eSmaybee if (size == 0) 44413506d1eSmaybee return; 44513506d1eSmaybee 446ea8dc4b6Seschrock VERIFY(0 == dmu_buf_hold_array(os, object, offset, size, 447ea8dc4b6Seschrock FALSE, FTAG, &numbufs, &dbp)); 448fa9e4066Sahrens 449fa9e4066Sahrens for (i = 0; i < numbufs; i++) { 450fa9e4066Sahrens int tocpy; 451fa9e4066Sahrens int bufoff; 452fa9e4066Sahrens dmu_buf_t *db = dbp[i]; 453fa9e4066Sahrens 454fa9e4066Sahrens ASSERT(size > 0); 455fa9e4066Sahrens 456fa9e4066Sahrens bufoff = offset - db->db_offset; 457fa9e4066Sahrens tocpy = (int)MIN(db->db_size - bufoff, size); 458fa9e4066Sahrens 459fa9e4066Sahrens ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size); 460fa9e4066Sahrens 461fa9e4066Sahrens if 
(tocpy == db->db_size) 462fa9e4066Sahrens dmu_buf_will_fill(db, tx); 463fa9e4066Sahrens else 464fa9e4066Sahrens dmu_buf_will_dirty(db, tx); 465fa9e4066Sahrens 466fa9e4066Sahrens bcopy(buf, (char *)db->db_data + bufoff, tocpy); 467fa9e4066Sahrens 468fa9e4066Sahrens if (tocpy == db->db_size) 469fa9e4066Sahrens dmu_buf_fill_done(db, tx); 470fa9e4066Sahrens 471fa9e4066Sahrens offset += tocpy; 472fa9e4066Sahrens size -= tocpy; 473fa9e4066Sahrens buf = (char *)buf + tocpy; 474fa9e4066Sahrens } 475ea8dc4b6Seschrock dmu_buf_rele_array(dbp, numbufs, FTAG); 476fa9e4066Sahrens } 477fa9e4066Sahrens 478fa9e4066Sahrens #ifdef _KERNEL 479fa9e4066Sahrens int 480feb08c6bSbillm dmu_read_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size) 481feb08c6bSbillm { 482feb08c6bSbillm dmu_buf_t **dbp; 483feb08c6bSbillm int numbufs, i, err; 484feb08c6bSbillm 485feb08c6bSbillm /* 486feb08c6bSbillm * NB: we could do this block-at-a-time, but it's nice 487feb08c6bSbillm * to be reading in parallel. 488feb08c6bSbillm */ 489feb08c6bSbillm err = dmu_buf_hold_array(os, object, uio->uio_loffset, size, TRUE, FTAG, 490feb08c6bSbillm &numbufs, &dbp); 491feb08c6bSbillm if (err) 492feb08c6bSbillm return (err); 493feb08c6bSbillm 494feb08c6bSbillm for (i = 0; i < numbufs; i++) { 495feb08c6bSbillm int tocpy; 496feb08c6bSbillm int bufoff; 497feb08c6bSbillm dmu_buf_t *db = dbp[i]; 498feb08c6bSbillm 499feb08c6bSbillm ASSERT(size > 0); 500feb08c6bSbillm 501feb08c6bSbillm bufoff = uio->uio_loffset - db->db_offset; 502feb08c6bSbillm tocpy = (int)MIN(db->db_size - bufoff, size); 503feb08c6bSbillm 504feb08c6bSbillm err = uiomove((char *)db->db_data + bufoff, tocpy, 505feb08c6bSbillm UIO_READ, uio); 506feb08c6bSbillm if (err) 507feb08c6bSbillm break; 508feb08c6bSbillm 509feb08c6bSbillm size -= tocpy; 510feb08c6bSbillm } 511feb08c6bSbillm dmu_buf_rele_array(dbp, numbufs, FTAG); 512feb08c6bSbillm 513feb08c6bSbillm return (err); 514feb08c6bSbillm } 515feb08c6bSbillm 516feb08c6bSbillm int 517feb08c6bSbillm 
dmu_write_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size, 518feb08c6bSbillm dmu_tx_t *tx) 519fa9e4066Sahrens { 520fa9e4066Sahrens dmu_buf_t **dbp; 521fa9e4066Sahrens int numbufs, i; 522fa9e4066Sahrens int err = 0; 523fa9e4066Sahrens 52413506d1eSmaybee if (size == 0) 52513506d1eSmaybee return (0); 52613506d1eSmaybee 527feb08c6bSbillm err = dmu_buf_hold_array(os, object, uio->uio_loffset, size, 528ea8dc4b6Seschrock FALSE, FTAG, &numbufs, &dbp); 529ea8dc4b6Seschrock if (err) 530ea8dc4b6Seschrock return (err); 531fa9e4066Sahrens 532fa9e4066Sahrens for (i = 0; i < numbufs; i++) { 533fa9e4066Sahrens int tocpy; 534fa9e4066Sahrens int bufoff; 535fa9e4066Sahrens dmu_buf_t *db = dbp[i]; 536fa9e4066Sahrens 537fa9e4066Sahrens ASSERT(size > 0); 538fa9e4066Sahrens 539feb08c6bSbillm bufoff = uio->uio_loffset - db->db_offset; 540fa9e4066Sahrens tocpy = (int)MIN(db->db_size - bufoff, size); 541fa9e4066Sahrens 542fa9e4066Sahrens ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size); 543fa9e4066Sahrens 544fa9e4066Sahrens if (tocpy == db->db_size) 545fa9e4066Sahrens dmu_buf_will_fill(db, tx); 546fa9e4066Sahrens else 547fa9e4066Sahrens dmu_buf_will_dirty(db, tx); 548fa9e4066Sahrens 549fa9e4066Sahrens /* 550fa9e4066Sahrens * XXX uiomove could block forever (eg. nfs-backed 551fa9e4066Sahrens * pages). There needs to be a uiolockdown() function 552fa9e4066Sahrens * to lock the pages in memory, so that uiomove won't 553fa9e4066Sahrens * block. 
554fa9e4066Sahrens */ 555fa9e4066Sahrens err = uiomove((char *)db->db_data + bufoff, tocpy, 556fa9e4066Sahrens UIO_WRITE, uio); 557fa9e4066Sahrens 558fa9e4066Sahrens if (tocpy == db->db_size) 559fa9e4066Sahrens dmu_buf_fill_done(db, tx); 560fa9e4066Sahrens 561fa9e4066Sahrens if (err) 562fa9e4066Sahrens break; 563fa9e4066Sahrens 564fa9e4066Sahrens size -= tocpy; 565fa9e4066Sahrens } 566ea8dc4b6Seschrock dmu_buf_rele_array(dbp, numbufs, FTAG); 567fa9e4066Sahrens return (err); 568fa9e4066Sahrens } 56944eda4d7Smaybee 57044eda4d7Smaybee int 57144eda4d7Smaybee dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, 57244eda4d7Smaybee page_t *pp, dmu_tx_t *tx) 57344eda4d7Smaybee { 57444eda4d7Smaybee dmu_buf_t **dbp; 57544eda4d7Smaybee int numbufs, i; 57644eda4d7Smaybee int err; 57744eda4d7Smaybee 57844eda4d7Smaybee if (size == 0) 57944eda4d7Smaybee return (0); 58044eda4d7Smaybee 58144eda4d7Smaybee err = dmu_buf_hold_array(os, object, offset, size, 58244eda4d7Smaybee FALSE, FTAG, &numbufs, &dbp); 58344eda4d7Smaybee if (err) 58444eda4d7Smaybee return (err); 58544eda4d7Smaybee 58644eda4d7Smaybee for (i = 0; i < numbufs; i++) { 58744eda4d7Smaybee int tocpy, copied, thiscpy; 58844eda4d7Smaybee int bufoff; 58944eda4d7Smaybee dmu_buf_t *db = dbp[i]; 59044eda4d7Smaybee caddr_t va; 59144eda4d7Smaybee 59244eda4d7Smaybee ASSERT(size > 0); 59344eda4d7Smaybee ASSERT3U(db->db_size, >=, PAGESIZE); 59444eda4d7Smaybee 59544eda4d7Smaybee bufoff = offset - db->db_offset; 59644eda4d7Smaybee tocpy = (int)MIN(db->db_size - bufoff, size); 59744eda4d7Smaybee 59844eda4d7Smaybee ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size); 59944eda4d7Smaybee 60044eda4d7Smaybee if (tocpy == db->db_size) 60144eda4d7Smaybee dmu_buf_will_fill(db, tx); 60244eda4d7Smaybee else 60344eda4d7Smaybee dmu_buf_will_dirty(db, tx); 60444eda4d7Smaybee 60544eda4d7Smaybee for (copied = 0; copied < tocpy; copied += PAGESIZE) { 60644eda4d7Smaybee ASSERT3U(pp->p_offset, ==, db->db_offset + bufoff); 
60744eda4d7Smaybee thiscpy = MIN(PAGESIZE, tocpy - copied); 60844eda4d7Smaybee va = ppmapin(pp, PROT_READ, (caddr_t)-1); 60944eda4d7Smaybee bcopy(va, (char *)db->db_data + bufoff, thiscpy); 61044eda4d7Smaybee ppmapout(va); 61144eda4d7Smaybee pp = pp->p_next; 61244eda4d7Smaybee bufoff += PAGESIZE; 61344eda4d7Smaybee } 61444eda4d7Smaybee 61544eda4d7Smaybee if (tocpy == db->db_size) 61644eda4d7Smaybee dmu_buf_fill_done(db, tx); 61744eda4d7Smaybee 61844eda4d7Smaybee if (err) 61944eda4d7Smaybee break; 62044eda4d7Smaybee 62144eda4d7Smaybee offset += tocpy; 62244eda4d7Smaybee size -= tocpy; 62344eda4d7Smaybee } 62444eda4d7Smaybee dmu_buf_rele_array(dbp, numbufs, FTAG); 62544eda4d7Smaybee return (err); 62644eda4d7Smaybee } 627fa9e4066Sahrens #endif 628fa9e4066Sahrens 629c5c6ffa0Smaybee typedef struct { 630c717a561Smaybee dbuf_dirty_record_t *dr; 631c5c6ffa0Smaybee dmu_sync_cb_t *done; 632c5c6ffa0Smaybee void *arg; 633c717a561Smaybee } dmu_sync_arg_t; 634c5c6ffa0Smaybee 635c5c6ffa0Smaybee /* ARGSUSED */ 636c5c6ffa0Smaybee static void 637c5c6ffa0Smaybee dmu_sync_done(zio_t *zio, arc_buf_t *buf, void *varg) 638c5c6ffa0Smaybee { 639c717a561Smaybee dmu_sync_arg_t *in = varg; 640c717a561Smaybee dbuf_dirty_record_t *dr = in->dr; 641c717a561Smaybee dmu_buf_impl_t *db = dr->dr_dbuf; 642c5c6ffa0Smaybee dmu_sync_cb_t *done = in->done; 643c5c6ffa0Smaybee 644c5c6ffa0Smaybee if (!BP_IS_HOLE(zio->io_bp)) { 645c5c6ffa0Smaybee zio->io_bp->blk_fill = 1; 646c5c6ffa0Smaybee BP_SET_TYPE(zio->io_bp, db->db_dnode->dn_type); 647c5c6ffa0Smaybee BP_SET_LEVEL(zio->io_bp, 0); 648c5c6ffa0Smaybee } 649c5c6ffa0Smaybee 650c5c6ffa0Smaybee mutex_enter(&db->db_mtx); 651c717a561Smaybee ASSERT(dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC); 652c717a561Smaybee dr->dt.dl.dr_overridden_by = *zio->io_bp; /* structure assignment */ 653c717a561Smaybee dr->dt.dl.dr_override_state = DR_OVERRIDDEN; 654c5c6ffa0Smaybee cv_broadcast(&db->db_changed); 655c5c6ffa0Smaybee mutex_exit(&db->db_mtx); 656c5c6ffa0Smaybee 
657c5c6ffa0Smaybee if (done) 658c717a561Smaybee done(&(db->db), in->arg); 659c717a561Smaybee 660c717a561Smaybee kmem_free(in, sizeof (dmu_sync_arg_t)); 661c5c6ffa0Smaybee } 662c5c6ffa0Smaybee 663fa9e4066Sahrens /* 664c5c6ffa0Smaybee * Intent log support: sync the block associated with db to disk. 665c5c6ffa0Smaybee * N.B. and XXX: the caller is responsible for making sure that the 666c5c6ffa0Smaybee * data isn't changing while dmu_sync() is writing it. 667fa9e4066Sahrens * 668fa9e4066Sahrens * Return values: 669fa9e4066Sahrens * 670c5c6ffa0Smaybee * EEXIST: this txg has already been synced, so there's nothing to to. 671fa9e4066Sahrens * The caller should not log the write. 672fa9e4066Sahrens * 673fa9e4066Sahrens * ENOENT: the block was dbuf_free_range()'d, so there's nothing to do. 674fa9e4066Sahrens * The caller should not log the write. 675fa9e4066Sahrens * 676c5c6ffa0Smaybee * EALREADY: this block is already in the process of being synced. 677c5c6ffa0Smaybee * The caller should track its progress (somehow). 678fa9e4066Sahrens * 679c5c6ffa0Smaybee * EINPROGRESS: the IO has been initiated. 680c5c6ffa0Smaybee * The caller should log this blkptr in the callback. 681fa9e4066Sahrens * 682c5c6ffa0Smaybee * 0: completed. Sets *bp to the blkptr just written. 683c5c6ffa0Smaybee * The caller should log this blkptr immediately. 
684fa9e4066Sahrens */ 685fa9e4066Sahrens int 686c5c6ffa0Smaybee dmu_sync(zio_t *pio, dmu_buf_t *db_fake, 687c5c6ffa0Smaybee blkptr_t *bp, uint64_t txg, dmu_sync_cb_t *done, void *arg) 688fa9e4066Sahrens { 689c5c6ffa0Smaybee dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; 690c5c6ffa0Smaybee objset_impl_t *os = db->db_objset; 691c5c6ffa0Smaybee dsl_pool_t *dp = os->os_dsl_dataset->ds_dir->dd_pool; 692fa9e4066Sahrens tx_state_t *tx = &dp->dp_tx; 693c717a561Smaybee dbuf_dirty_record_t *dr; 694c717a561Smaybee dmu_sync_arg_t *in; 695ea8dc4b6Seschrock zbookmark_t zb; 696c717a561Smaybee zio_t *zio; 6979bc11082Sek110237 int zio_flags; 698c5c6ffa0Smaybee int err; 699fa9e4066Sahrens 700fa9e4066Sahrens ASSERT(BP_IS_HOLE(bp)); 701fa9e4066Sahrens ASSERT(txg != 0); 702fa9e4066Sahrens 703c5c6ffa0Smaybee 704fa9e4066Sahrens dprintf("dmu_sync txg=%llu, s,o,q %llu %llu %llu\n", 705fa9e4066Sahrens txg, tx->tx_synced_txg, tx->tx_open_txg, tx->tx_quiesced_txg); 706fa9e4066Sahrens 707fa9e4066Sahrens /* 708c5c6ffa0Smaybee * XXX - would be nice if we could do this without suspending... 709ea8dc4b6Seschrock */ 710c5c6ffa0Smaybee txg_suspend(dp); 711ea8dc4b6Seschrock 712ea8dc4b6Seschrock /* 713fa9e4066Sahrens * If this txg already synced, there's nothing to do. 714fa9e4066Sahrens */ 715fa9e4066Sahrens if (txg <= tx->tx_synced_txg) { 716c5c6ffa0Smaybee txg_resume(dp); 717fa9e4066Sahrens /* 718fa9e4066Sahrens * If we're running ziltest, we need the blkptr regardless. 
719fa9e4066Sahrens */ 720fa9e4066Sahrens if (txg > spa_freeze_txg(dp->dp_spa)) { 721fa9e4066Sahrens /* if db_blkptr == NULL, this was an empty write */ 722fa9e4066Sahrens if (db->db_blkptr) 723fa9e4066Sahrens *bp = *db->db_blkptr; /* structure assignment */ 724fa9e4066Sahrens return (0); 725fa9e4066Sahrens } 726c5c6ffa0Smaybee return (EEXIST); 727fa9e4066Sahrens } 728fa9e4066Sahrens 729fa9e4066Sahrens mutex_enter(&db->db_mtx); 730fa9e4066Sahrens 731c5c6ffa0Smaybee if (txg == tx->tx_syncing_txg) { 732c5c6ffa0Smaybee while (db->db_data_pending) { 733c5c6ffa0Smaybee /* 734c5c6ffa0Smaybee * IO is in-progress. Wait for it to finish. 735c5c6ffa0Smaybee * XXX - would be nice to be able to somehow "attach" 736c5c6ffa0Smaybee * this zio to the parent zio passed in. 737c5c6ffa0Smaybee */ 738c5c6ffa0Smaybee cv_wait(&db->db_changed, &db->db_mtx); 73913506d1eSmaybee if (!db->db_data_pending && 74013506d1eSmaybee db->db_blkptr && BP_IS_HOLE(db->db_blkptr)) { 74113506d1eSmaybee /* 74213506d1eSmaybee * IO was compressed away 74313506d1eSmaybee */ 74413506d1eSmaybee *bp = *db->db_blkptr; /* structure assignment */ 74513506d1eSmaybee mutex_exit(&db->db_mtx); 74613506d1eSmaybee txg_resume(dp); 74713506d1eSmaybee return (0); 74813506d1eSmaybee } 749c5c6ffa0Smaybee ASSERT(db->db_data_pending || 750c5c6ffa0Smaybee (db->db_blkptr && db->db_blkptr->blk_birth == txg)); 751c5c6ffa0Smaybee } 752c5c6ffa0Smaybee 753c5c6ffa0Smaybee if (db->db_blkptr && db->db_blkptr->blk_birth == txg) { 754c5c6ffa0Smaybee /* 755c5c6ffa0Smaybee * IO is already completed. 
756c5c6ffa0Smaybee */ 757c5c6ffa0Smaybee *bp = *db->db_blkptr; /* structure assignment */ 758c5c6ffa0Smaybee mutex_exit(&db->db_mtx); 759c5c6ffa0Smaybee txg_resume(dp); 760c5c6ffa0Smaybee return (0); 761c5c6ffa0Smaybee } 762c5c6ffa0Smaybee } 763c5c6ffa0Smaybee 764c717a561Smaybee dr = db->db_last_dirty; 765c717a561Smaybee while (dr && dr->dr_txg > txg) 766c717a561Smaybee dr = dr->dr_next; 767c717a561Smaybee if (dr == NULL || dr->dr_txg < txg) { 768c5c6ffa0Smaybee /* 769c5c6ffa0Smaybee * This dbuf isn't dirty, must have been free_range'd. 770fa9e4066Sahrens * There's no need to log writes to freed blocks, so we're done. 771fa9e4066Sahrens */ 772fa9e4066Sahrens mutex_exit(&db->db_mtx); 773c5c6ffa0Smaybee txg_resume(dp); 774fa9e4066Sahrens return (ENOENT); 775fa9e4066Sahrens } 776fa9e4066Sahrens 777c717a561Smaybee ASSERT(dr->dr_txg == txg); 778c717a561Smaybee if (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC) { 779c5c6ffa0Smaybee /* 780c717a561Smaybee * We have already issued a sync write for this buffer. 781c5c6ffa0Smaybee */ 782c717a561Smaybee mutex_exit(&db->db_mtx); 783c717a561Smaybee txg_resume(dp); 784c717a561Smaybee return (EALREADY); 785c717a561Smaybee } else if (dr->dt.dl.dr_override_state == DR_OVERRIDDEN) { 786c717a561Smaybee /* 787c717a561Smaybee * This buffer has already been synced. It could not 788c717a561Smaybee * have been dirtied since, or we would have cleared the state. 
789c717a561Smaybee */ 790c717a561Smaybee *bp = dr->dt.dl.dr_overridden_by; /* structure assignment */ 791c717a561Smaybee mutex_exit(&db->db_mtx); 792c717a561Smaybee txg_resume(dp); 793c717a561Smaybee return (0); 794c717a561Smaybee } 795c717a561Smaybee 796c717a561Smaybee dr->dt.dl.dr_override_state = DR_IN_DMU_SYNC; 797c717a561Smaybee in = kmem_alloc(sizeof (dmu_sync_arg_t), KM_SLEEP); 798c717a561Smaybee in->dr = dr; 799c5c6ffa0Smaybee in->done = done; 800c5c6ffa0Smaybee in->arg = arg; 801fa9e4066Sahrens mutex_exit(&db->db_mtx); 802c5c6ffa0Smaybee txg_resume(dp); 803fa9e4066Sahrens 804c5c6ffa0Smaybee zb.zb_objset = os->os_dsl_dataset->ds_object; 805ea8dc4b6Seschrock zb.zb_object = db->db.db_object; 806ea8dc4b6Seschrock zb.zb_level = db->db_level; 807ea8dc4b6Seschrock zb.zb_blkid = db->db_blkid; 8089bc11082Sek110237 zio_flags = ZIO_FLAG_MUSTSUCCEED; 8099bc11082Sek110237 if (dmu_ot[db->db_dnode->dn_type].ot_metadata || zb.zb_level != 0) 8109bc11082Sek110237 zio_flags |= ZIO_FLAG_METADATA; 811c717a561Smaybee zio = arc_write(pio, os->os_spa, 812c5c6ffa0Smaybee zio_checksum_select(db->db_dnode->dn_checksum, os->os_checksum), 813c5c6ffa0Smaybee zio_compress_select(db->db_dnode->dn_compress, os->os_compress), 814d0ad202dSahrens dmu_get_replication_level(os, &zb, db->db_dnode->dn_type), 815c717a561Smaybee txg, bp, dr->dt.dl.dr_data, NULL, dmu_sync_done, in, 8169bc11082Sek110237 ZIO_PRIORITY_SYNC_WRITE, zio_flags, &zb); 817fa9e4066Sahrens 818c717a561Smaybee if (pio) { 819c717a561Smaybee zio_nowait(zio); 820c717a561Smaybee err = EINPROGRESS; 821c717a561Smaybee } else { 822c717a561Smaybee err = zio_wait(zio); 823c717a561Smaybee ASSERT(err == 0); 824c717a561Smaybee } 825c717a561Smaybee return (err); 826fa9e4066Sahrens } 827fa9e4066Sahrens 828fa9e4066Sahrens int 829fa9e4066Sahrens dmu_object_set_blocksize(objset_t *os, uint64_t object, uint64_t size, int ibs, 830fa9e4066Sahrens dmu_tx_t *tx) 831fa9e4066Sahrens { 832ea8dc4b6Seschrock dnode_t *dn; 833ea8dc4b6Seschrock int err; 
834ea8dc4b6Seschrock 835ea8dc4b6Seschrock err = dnode_hold(os->os, object, FTAG, &dn); 836ea8dc4b6Seschrock if (err) 837ea8dc4b6Seschrock return (err); 838ea8dc4b6Seschrock err = dnode_set_blksz(dn, size, ibs, tx); 839fa9e4066Sahrens dnode_rele(dn, FTAG); 840fa9e4066Sahrens return (err); 841fa9e4066Sahrens } 842fa9e4066Sahrens 843fa9e4066Sahrens void 844fa9e4066Sahrens dmu_object_set_checksum(objset_t *os, uint64_t object, uint8_t checksum, 845fa9e4066Sahrens dmu_tx_t *tx) 846fa9e4066Sahrens { 847ea8dc4b6Seschrock dnode_t *dn; 848ea8dc4b6Seschrock 849ea8dc4b6Seschrock /* XXX assumes dnode_hold will not get an i/o error */ 850ea8dc4b6Seschrock (void) dnode_hold(os->os, object, FTAG, &dn); 851fa9e4066Sahrens ASSERT(checksum < ZIO_CHECKSUM_FUNCTIONS); 852fa9e4066Sahrens dn->dn_checksum = checksum; 853fa9e4066Sahrens dnode_setdirty(dn, tx); 854fa9e4066Sahrens dnode_rele(dn, FTAG); 855fa9e4066Sahrens } 856fa9e4066Sahrens 857fa9e4066Sahrens void 858fa9e4066Sahrens dmu_object_set_compress(objset_t *os, uint64_t object, uint8_t compress, 859fa9e4066Sahrens dmu_tx_t *tx) 860fa9e4066Sahrens { 861ea8dc4b6Seschrock dnode_t *dn; 862ea8dc4b6Seschrock 863ea8dc4b6Seschrock /* XXX assumes dnode_hold will not get an i/o error */ 864ea8dc4b6Seschrock (void) dnode_hold(os->os, object, FTAG, &dn); 865fa9e4066Sahrens ASSERT(compress < ZIO_COMPRESS_FUNCTIONS); 866fa9e4066Sahrens dn->dn_compress = compress; 867fa9e4066Sahrens dnode_setdirty(dn, tx); 868fa9e4066Sahrens dnode_rele(dn, FTAG); 869fa9e4066Sahrens } 870fa9e4066Sahrens 87144cd46caSbillm int 872d0ad202dSahrens dmu_get_replication_level(objset_impl_t *os, 873d0ad202dSahrens zbookmark_t *zb, dmu_object_type_t ot) 87444cd46caSbillm { 875d0ad202dSahrens int ncopies = os->os_copies; 87644cd46caSbillm 877d0ad202dSahrens /* If it's the mos, it should have max copies set. 
*/ 878d0ad202dSahrens ASSERT(zb->zb_objset != 0 || 879d0ad202dSahrens ncopies == spa_max_replication(os->os_spa)); 880d0ad202dSahrens 881d0ad202dSahrens if (dmu_ot[ot].ot_metadata || zb->zb_level != 0) 88244cd46caSbillm ncopies++; 883d0ad202dSahrens return (MIN(ncopies, spa_max_replication(os->os_spa))); 88444cd46caSbillm } 88544cd46caSbillm 886fa9e4066Sahrens int 887fa9e4066Sahrens dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole, uint64_t *off) 888fa9e4066Sahrens { 889fa9e4066Sahrens dnode_t *dn; 890fa9e4066Sahrens int i, err; 891fa9e4066Sahrens 892ea8dc4b6Seschrock err = dnode_hold(os->os, object, FTAG, &dn); 893ea8dc4b6Seschrock if (err) 894ea8dc4b6Seschrock return (err); 895fa9e4066Sahrens /* 896fa9e4066Sahrens * Sync any current changes before 897fa9e4066Sahrens * we go trundling through the block pointers. 898fa9e4066Sahrens */ 899fa9e4066Sahrens for (i = 0; i < TXG_SIZE; i++) { 900c543ec06Sahrens if (list_link_active(&dn->dn_dirty_link[i])) 901fa9e4066Sahrens break; 902fa9e4066Sahrens } 903fa9e4066Sahrens if (i != TXG_SIZE) { 904fa9e4066Sahrens dnode_rele(dn, FTAG); 905fa9e4066Sahrens txg_wait_synced(dmu_objset_pool(os), 0); 906ea8dc4b6Seschrock err = dnode_hold(os->os, object, FTAG, &dn); 907ea8dc4b6Seschrock if (err) 908ea8dc4b6Seschrock return (err); 909fa9e4066Sahrens } 910fa9e4066Sahrens 9116754306eSahrens err = dnode_next_offset(dn, hole, off, 1, 1, 0); 912fa9e4066Sahrens dnode_rele(dn, FTAG); 913fa9e4066Sahrens 914fa9e4066Sahrens return (err); 915fa9e4066Sahrens } 916fa9e4066Sahrens 917fa9e4066Sahrens void 918fa9e4066Sahrens dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi) 919fa9e4066Sahrens { 920fa9e4066Sahrens rw_enter(&dn->dn_struct_rwlock, RW_READER); 921fa9e4066Sahrens mutex_enter(&dn->dn_mtx); 922fa9e4066Sahrens 923fa9e4066Sahrens doi->doi_data_block_size = dn->dn_datablksz; 924fa9e4066Sahrens doi->doi_metadata_block_size = dn->dn_indblkshift ? 
925fa9e4066Sahrens 1ULL << dn->dn_indblkshift : 0; 926fa9e4066Sahrens doi->doi_indirection = dn->dn_nlevels; 927fa9e4066Sahrens doi->doi_checksum = dn->dn_checksum; 928fa9e4066Sahrens doi->doi_compress = dn->dn_compress; 92999653d4eSeschrock doi->doi_physical_blks = (DN_USED_BYTES(dn->dn_phys) + 93099653d4eSeschrock SPA_MINBLOCKSIZE/2) >> SPA_MINBLOCKSHIFT; 931fa9e4066Sahrens doi->doi_max_block_offset = dn->dn_phys->dn_maxblkid; 932fa9e4066Sahrens doi->doi_type = dn->dn_type; 933fa9e4066Sahrens doi->doi_bonus_size = dn->dn_bonuslen; 934fa9e4066Sahrens doi->doi_bonus_type = dn->dn_bonustype; 935fa9e4066Sahrens 936fa9e4066Sahrens mutex_exit(&dn->dn_mtx); 937fa9e4066Sahrens rw_exit(&dn->dn_struct_rwlock); 938fa9e4066Sahrens } 939fa9e4066Sahrens 940fa9e4066Sahrens /* 941fa9e4066Sahrens * Get information on a DMU object. 942fa9e4066Sahrens * If doi is NULL, just indicates whether the object exists. 943fa9e4066Sahrens */ 944fa9e4066Sahrens int 945fa9e4066Sahrens dmu_object_info(objset_t *os, uint64_t object, dmu_object_info_t *doi) 946fa9e4066Sahrens { 947ea8dc4b6Seschrock dnode_t *dn; 948ea8dc4b6Seschrock int err = dnode_hold(os->os, object, FTAG, &dn); 949fa9e4066Sahrens 950ea8dc4b6Seschrock if (err) 951ea8dc4b6Seschrock return (err); 952fa9e4066Sahrens 953fa9e4066Sahrens if (doi != NULL) 954fa9e4066Sahrens dmu_object_info_from_dnode(dn, doi); 955fa9e4066Sahrens 956fa9e4066Sahrens dnode_rele(dn, FTAG); 957fa9e4066Sahrens return (0); 958fa9e4066Sahrens } 959fa9e4066Sahrens 960fa9e4066Sahrens /* 961fa9e4066Sahrens * As above, but faster; can be used when you have a held dbuf in hand. 962fa9e4066Sahrens */ 963fa9e4066Sahrens void 964fa9e4066Sahrens dmu_object_info_from_db(dmu_buf_t *db, dmu_object_info_t *doi) 965fa9e4066Sahrens { 966fa9e4066Sahrens dmu_object_info_from_dnode(((dmu_buf_impl_t *)db)->db_dnode, doi); 967fa9e4066Sahrens } 968fa9e4066Sahrens 969fa9e4066Sahrens /* 970fa9e4066Sahrens * Faster still when you only care about the size. 
971fa9e4066Sahrens * This is specifically optimized for zfs_getattr(). 972fa9e4066Sahrens */ 973fa9e4066Sahrens void 974fa9e4066Sahrens dmu_object_size_from_db(dmu_buf_t *db, uint32_t *blksize, u_longlong_t *nblk512) 975fa9e4066Sahrens { 976fa9e4066Sahrens dnode_t *dn = ((dmu_buf_impl_t *)db)->db_dnode; 977fa9e4066Sahrens 978fa9e4066Sahrens *blksize = dn->dn_datablksz; 97999653d4eSeschrock /* add 1 for dnode space */ 98099653d4eSeschrock *nblk512 = ((DN_USED_BYTES(dn->dn_phys) + SPA_MINBLOCKSIZE/2) >> 98199653d4eSeschrock SPA_MINBLOCKSHIFT) + 1; 982fa9e4066Sahrens } 983fa9e4066Sahrens 984fa9e4066Sahrens void 985fa9e4066Sahrens byteswap_uint64_array(void *vbuf, size_t size) 986fa9e4066Sahrens { 987fa9e4066Sahrens uint64_t *buf = vbuf; 988fa9e4066Sahrens size_t count = size >> 3; 989fa9e4066Sahrens int i; 990fa9e4066Sahrens 991fa9e4066Sahrens ASSERT((size & 7) == 0); 992fa9e4066Sahrens 993fa9e4066Sahrens for (i = 0; i < count; i++) 994fa9e4066Sahrens buf[i] = BSWAP_64(buf[i]); 995fa9e4066Sahrens } 996fa9e4066Sahrens 997fa9e4066Sahrens void 998fa9e4066Sahrens byteswap_uint32_array(void *vbuf, size_t size) 999fa9e4066Sahrens { 1000fa9e4066Sahrens uint32_t *buf = vbuf; 1001fa9e4066Sahrens size_t count = size >> 2; 1002fa9e4066Sahrens int i; 1003fa9e4066Sahrens 1004fa9e4066Sahrens ASSERT((size & 3) == 0); 1005fa9e4066Sahrens 1006fa9e4066Sahrens for (i = 0; i < count; i++) 1007fa9e4066Sahrens buf[i] = BSWAP_32(buf[i]); 1008fa9e4066Sahrens } 1009fa9e4066Sahrens 1010fa9e4066Sahrens void 1011fa9e4066Sahrens byteswap_uint16_array(void *vbuf, size_t size) 1012fa9e4066Sahrens { 1013fa9e4066Sahrens uint16_t *buf = vbuf; 1014fa9e4066Sahrens size_t count = size >> 1; 1015fa9e4066Sahrens int i; 1016fa9e4066Sahrens 1017fa9e4066Sahrens ASSERT((size & 1) == 0); 1018fa9e4066Sahrens 1019fa9e4066Sahrens for (i = 0; i < count; i++) 1020fa9e4066Sahrens buf[i] = BSWAP_16(buf[i]); 1021fa9e4066Sahrens } 1022fa9e4066Sahrens 1023fa9e4066Sahrens /* ARGSUSED */ 1024fa9e4066Sahrens void 
byteswap_uint8_array(void *vbuf, size_t size)
{
	/* Intentionally empty: this routine leaves the buffer untouched. */
}

/*
 * Initialize the subsystems the DMU depends on: dbuf, dnode, then arc.
 */
void
dmu_init(void)
{
	dbuf_init();
	dnode_init();
	arc_init();
}

/*
 * Tear down what dmu_init() set up, in the reverse order:
 * arc, dnode, then dbuf.
 */
void
dmu_fini(void)
{
	arc_fini();
	dnode_fini();
	dbuf_fini();
}