/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

#include <sys/dmu.h>
#include <sys/dmu_impl.h>
#include <sys/dbuf.h>
#include <sys/dnode.h>
#include <sys/zfs_context.h>
#include <sys/zfs_racct.h>
#include <sys/dsl_dataset.h>
#include <sys/dmu_objset.h>

/*
 * Build an ABD covering this dbuf's entire block. The portion of the block
 * that overlaps the caller's request is a window into `data`; any leading or
 * trailing remainder is padded with freshly allocated I/O buffers, and the
 * pieces are stitched together as a gang ABD.
 */
static abd_t *
make_abd_for_dbuf(dmu_buf_impl_t *db, abd_t *data, uint64_t offset,
    uint64_t size)
{
	size_t buf_size = db->db.db_size;
	abd_t *pre_buf = NULL, *post_buf = NULL, *mbuf = NULL;
	size_t buf_off = 0;

	ASSERT(MUTEX_HELD(&db->db_mtx));

	if (offset > db->db.db_offset) {
		size_t pre_size = offset - db->db.db_offset;
		pre_buf = abd_alloc_for_io(pre_size, B_TRUE);
		buf_size -= pre_size;
		buf_off = 0;
	} else {
		buf_off = db->db.db_offset - offset;
		size -= buf_off;
	}

	if (size < buf_size) {
		size_t post_size = buf_size - size;
		post_buf = abd_alloc_for_io(post_size, B_TRUE);
		buf_size -= post_size;
	}

	ASSERT3U(buf_size, >, 0);
	abd_t *buf = abd_get_offset_size(data, buf_off, buf_size);

	if (pre_buf || post_buf) {
		mbuf = abd_alloc_gang();
		if (pre_buf)
			abd_gang_add(mbuf, pre_buf, B_TRUE);
		abd_gang_add(mbuf, buf, B_TRUE);
		if (post_buf)
			abd_gang_add(mbuf, post_buf, B_TRUE);
	} else {
		mbuf = buf;
	}

	return (mbuf);
}
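
/*
 * A worked example of the windowing above (illustrative numbers only): for
 * a 128K dbuf spanning [0, 128K) and a request with offset = 64K and
 * size = 128K, offset > db_offset, so a 64K pre_buf is allocated to cover
 * [0, 64K), the caller's ABD is windowed with
 * abd_get_offset_size(data, 0, 64K), and the result is the gang ABD
 *
 *	[ 64K pre_buf (scratch) | first 64K of the caller's data ]
 *
 * which covers the full 128K block, as the read ZIO in dmu_read_abd()
 * requires. The scratch segments are read into and then discarded along
 * with the rest of the gang ABD in dmu_read_abd_done().
 */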

/* Read completion callback: release the (possibly gang) ABD. */
static void
dmu_read_abd_done(zio_t *zio)
{
	abd_free(zio->io_abd);
}

static void
dmu_write_direct_ready(zio_t *zio)
{
	dmu_sync_ready(zio, NULL, zio->io_private);
}

/* Completion callback for a Direct I/O write issued by dmu_write_direct(). */
static void
dmu_write_direct_done(zio_t *zio)
{
	dmu_sync_arg_t *dsa = zio->io_private;
	dbuf_dirty_record_t *dr = dsa->dsa_dr;
	dmu_buf_impl_t *db = dr->dr_dbuf;

	abd_free(zio->io_abd);

	mutex_enter(&db->db_mtx);
	ASSERT3P(db->db_buf, ==, NULL);
	ASSERT3P(dr->dt.dl.dr_data, ==, NULL);
	ASSERT3P(db->db.db_data, ==, NULL);
	db->db_state = DB_UNCACHED;
	mutex_exit(&db->db_mtx);

	dmu_sync_done(zio, NULL, zio->io_private);

	if (zio->io_error != 0) {
		if (zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR)
			ASSERT3U(zio->io_error, ==, EIO);

		/*
		 * In the event of an I/O error this block has been freed in
		 * zio_done() through zio_dva_unallocate(). Calling
		 * dmu_sync_done() above set dr_override_state to
		 * DR_NOT_OVERRIDDEN. In this case when dbuf_undirty() calls
		 * dbuf_unoverride(), it will skip doing zio_free() to free
		 * this block as that was already taken care of.
		 *
		 * Since we are undirtying the record in open-context, we must
		 * have a hold on the db, so it should never be evicted after
		 * calling dbuf_undirty().
		 */
		mutex_enter(&db->db_mtx);
		VERIFY3B(dbuf_undirty(db, dsa->dsa_tx), ==, B_FALSE);
		mutex_exit(&db->db_mtx);
	}

	kmem_free(zio->io_bp, sizeof (blkptr_t));
	zio->io_bp = NULL;
}

/*
 * Issue a Direct I/O write of the single block described by db, bypassing
 * the ARC. The write is staged through the dmu_sync() override machinery
 * (DR_IN_DMU_SYNC), so the block is written in open context and the
 * resulting block pointer is applied when the dirty record syncs.
 */
int
dmu_write_direct(zio_t *pio, dmu_buf_impl_t *db, abd_t *data, dmu_tx_t *tx)
{
	objset_t *os = db->db_objset;
	dsl_dataset_t *ds = dmu_objset_ds(os);
	zbookmark_phys_t zb;
	dbuf_dirty_record_t *dr_head;

	SET_BOOKMARK(&zb, ds->ds_object,
	    db->db.db_object, db->db_level, db->db_blkid);

	DB_DNODE_ENTER(db);
	zio_prop_t zp;
	dmu_write_policy(os, DB_DNODE(db), db->db_level,
	    WP_DMU_SYNC | WP_DIRECT_WR, &zp);
	DB_DNODE_EXIT(db);

	/*
	 * Dirty this dbuf with DB_NOFILL since we will not have any data
	 * associated with the dbuf.
	 */
	dmu_buf_will_clone_or_dio(&db->db, tx);

	mutex_enter(&db->db_mtx);

	uint64_t txg = dmu_tx_get_txg(tx);
	ASSERT3U(txg, >, spa_last_synced_txg(os->os_spa));
	ASSERT3U(txg, >, spa_syncing_txg(os->os_spa));

	dr_head = list_head(&db->db_dirty_records);
	ASSERT3U(dr_head->dr_txg, ==, txg);
	dr_head->dt.dl.dr_diowrite = B_TRUE;
	dr_head->dr_accounted = db->db.db_size;

	blkptr_t *bp = kmem_alloc(sizeof (blkptr_t), KM_SLEEP);
	if (db->db_blkptr != NULL) {
		/*
		 * Fill in bp with the current block pointer so that
		 * the nopwrite code can check if we're writing the same
		 * data that's already on disk.
		 */
		*bp = *db->db_blkptr;
	} else {
		memset(bp, 0, sizeof (blkptr_t));
	}

	/*
	 * Disable nopwrite if the current block pointer could change
	 * before this TXG syncs.
	 */
	if (list_next(&db->db_dirty_records, dr_head) != NULL)
		zp.zp_nopwrite = B_FALSE;

	ASSERT0(dr_head->dt.dl.dr_has_raw_params);
	ASSERT3S(dr_head->dt.dl.dr_override_state, ==, DR_NOT_OVERRIDDEN);
	dr_head->dt.dl.dr_override_state = DR_IN_DMU_SYNC;

	mutex_exit(&db->db_mtx);

	dmu_objset_willuse_space(os, dr_head->dr_accounted, tx);

	dmu_sync_arg_t *dsa = kmem_zalloc(sizeof (dmu_sync_arg_t), KM_SLEEP);
	dsa->dsa_dr = dr_head;
	dsa->dsa_tx = tx;

	zio_t *zio = zio_write(pio, os->os_spa, txg, bp, data,
	    db->db.db_size, db->db.db_size, &zp,
	    dmu_write_direct_ready, NULL, dmu_write_direct_done, dsa,
	    ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, &zb);

	if (pio == NULL)
		return (zio_wait(zio));

	zio_nowait(zio);

	return (0);
}
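
/*
 * A minimal caller sketch (hypothetical; the in-tree caller is
 * dmu_write_abd() below). The dbuf must already be held, and `data` must
 * describe exactly db->db.db_size bytes; dmu_write_direct() dirties the
 * dbuf itself via dmu_buf_will_clone_or_dio():
 *
 *	zio_t *pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
 *	err = dmu_write_direct(pio, db, data, tx);
 *	...
 *	err = zio_wait(pio);
 *
 * Passing pio == NULL instead makes dmu_write_direct() zio_wait() on the
 * write itself before returning.
 */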

/*
 * Issue Direct I/O writes for each block covered by [offset, offset + size),
 * windowing a full-block slice of the caller's ABD into each held dbuf.
 */
int
dmu_write_abd(dnode_t *dn, uint64_t offset, uint64_t size,
    abd_t *data, uint32_t flags, dmu_tx_t *tx)
{
	dmu_buf_t **dbp;
	spa_t *spa = dn->dn_objset->os_spa;
	int numbufs, err;

	ASSERT(flags & DMU_DIRECTIO);

	err = dmu_buf_hold_array_by_dnode(dn, offset,
	    size, B_FALSE, FTAG, &numbufs, &dbp, flags);
	if (err)
		return (err);

	zio_t *pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);

	for (int i = 0; i < numbufs && err == 0; i++) {
		dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbp[i];

		abd_t *abd = abd_get_offset_size(data,
		    db->db.db_offset - offset, dn->dn_datablksz);

		zfs_racct_write(spa, db->db.db_size, 1, flags);
		err = dmu_write_direct(pio, db, abd, tx);
		ASSERT0(err);
	}

	err = zio_wait(pio);

	/*
	 * The dbufs must be held until the Direct I/O writes have completed
	 * in the event there were any errors and dbuf_undirty() was called.
	 */
	dmu_buf_rele_array(dbp, numbufs, FTAG);

	return (err);
}
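
/*
 * Note that the loop above carves a full dn_datablksz window out of the
 * caller's ABD for every dbuf. For example (illustrative numbers), with
 * dn_datablksz = 128K, offset = 256K, and size = 256K, two block-sized
 * writes are issued from byte offsets 0 and 128K of `data`. This layout
 * assumes the request is block-aligned by the time it reaches this
 * function, which the higher-level Direct I/O setup code is responsible
 * for ensuring.
 */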

/*
 * Issue Direct I/O reads for each block covered by [offset, offset + size).
 * Holes are zero-filled and cached blocks are copied out of the ARC; only
 * uncached, non-hole blocks generate read ZIOs.
 */
int
dmu_read_abd(dnode_t *dn, uint64_t offset, uint64_t size,
    abd_t *data, uint32_t flags)
{
	objset_t *os = dn->dn_objset;
	spa_t *spa = os->os_spa;
	dmu_buf_t **dbp;
	int numbufs, err;

	ASSERT(flags & DMU_DIRECTIO);

	err = dmu_buf_hold_array_by_dnode(dn, offset,
	    size, B_FALSE, FTAG, &numbufs, &dbp, flags);
	if (err)
		return (err);

	zio_t *rio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);

	for (int i = 0; i < numbufs; i++) {
		dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbp[i];
		abd_t *mbuf;
		zbookmark_phys_t zb;
		blkptr_t *bp;

		mutex_enter(&db->db_mtx);

		SET_BOOKMARK(&zb, dmu_objset_ds(os)->ds_object,
		    db->db.db_object, db->db_level, db->db_blkid);

		/*
		 * If there is another read for this dbuf, we will wait for
		 * that to complete first before checking the db_state below.
		 */
		while (db->db_state == DB_READ)
			cv_wait(&db->db_changed, &db->db_mtx);

		err = dmu_buf_get_bp_from_dbuf(db, &bp);
		if (err) {
			mutex_exit(&db->db_mtx);
			goto error;
		}

		/*
		 * There is no need to read if this is a hole or the data is
		 * cached. This will not be considered a direct read for I/O
		 * accounting, in the same way that an ARC hit is not counted.
		 */
		if (bp == NULL || BP_IS_HOLE(bp) ||
		    db->db_state == DB_CACHED) {
			size_t aoff = offset < db->db.db_offset ?
			    db->db.db_offset - offset : 0;
			size_t boff = offset > db->db.db_offset ?
			    offset - db->db.db_offset : 0;
			size_t len = MIN(size - aoff, db->db.db_size - boff);

			if (db->db_state == DB_CACHED) {
				/*
				 * We need to untransform the ARC buf data
				 * before we copy it over.
				 */
				err = dmu_buf_untransform_direct(db, spa);
				ASSERT0(err);
				abd_copy_from_buf_off(data,
				    (char *)db->db.db_data + boff, aoff, len);
			} else {
				abd_zero_off(data, aoff, len);
			}

			mutex_exit(&db->db_mtx);
			continue;
		}

		mbuf = make_abd_for_dbuf(db, data, offset, size);
		ASSERT3P(mbuf, !=, NULL);

		/*
		 * The dbuf mutex (db_mtx) must be held when creating the ZIO
		 * for the read. The BP returned from
		 * dmu_buf_get_bp_from_dbuf() could be from a pending block
		 * clone or a yet to be synced Direct I/O write that is in the
		 * dbuf's dirty record. When zio_read() is called, zio_create()
		 * will make a copy of the BP. However, if zio_read() is called
		 * without the mutex being held, then the dirty record from the
		 * dbuf could be freed in dbuf_write_done(), resulting in
		 * garbage being set for the zio BP.
		 */
		zio_t *cio = zio_read(rio, spa, bp, mbuf, db->db.db_size,
		    dmu_read_abd_done, NULL, ZIO_PRIORITY_SYNC_READ,
		    ZIO_FLAG_CANFAIL | ZIO_FLAG_DIO_READ, &zb);
		mutex_exit(&db->db_mtx);

		zfs_racct_read(spa, db->db.db_size, 1, flags);
		zio_nowait(cio);
	}

	dmu_buf_rele_array(dbp, numbufs, FTAG);

	return (zio_wait(rio));

error:
	dmu_buf_rele_array(dbp, numbufs, FTAG);
	(void) zio_wait(rio);
	return (err);
}

#ifdef _KERNEL
/* Direct I/O read into the pinned user pages backing uio. */
int
dmu_read_uio_direct(dnode_t *dn, zfs_uio_t *uio, uint64_t size)
{
	offset_t offset = zfs_uio_offset(uio);
	offset_t page_index = (offset - zfs_uio_soffset(uio)) >> PAGESHIFT;
	int err;

	ASSERT(uio->uio_extflg & UIO_DIRECT);
	ASSERT3U(page_index, <, uio->uio_dio.npages);

	abd_t *data = abd_alloc_from_pages(&uio->uio_dio.pages[page_index],
	    offset & (PAGESIZE - 1), size);
	err = dmu_read_abd(dn, offset, size, data, DMU_DIRECTIO);
	abd_free(data);

	if (err == 0)
		zfs_uioskip(uio, size);

	return (err);
}

/* Direct I/O write from the pinned user pages backing uio. */
int
dmu_write_uio_direct(dnode_t *dn, zfs_uio_t *uio, uint64_t size, dmu_tx_t *tx)
{
	offset_t offset = zfs_uio_offset(uio);
	offset_t page_index = (offset - zfs_uio_soffset(uio)) >> PAGESHIFT;
	int err;

	ASSERT(uio->uio_extflg & UIO_DIRECT);
	ASSERT3U(page_index, <, uio->uio_dio.npages);

	abd_t *data = abd_alloc_from_pages(&uio->uio_dio.pages[page_index],
	    offset & (PAGESIZE - 1), size);
	err = dmu_write_abd(dn, offset, size, data, DMU_DIRECTIO, tx);
	abd_free(data);

	if (err == 0)
		zfs_uioskip(uio, size);

	return (err);
}
#endif /* _KERNEL */

EXPORT_SYMBOL(dmu_read_uio_direct);
EXPORT_SYMBOL(dmu_write_uio_direct);
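
/*
 * A worked example of the page-index math in the two functions above
 * (illustrative numbers, assuming 4K pages): if the original request
 * started at soffset = 1M and this call resumes at offset = 1M + 24K,
 * then page_index = (24K >> PAGESHIFT) = 6, so the ABD is built starting
 * from the seventh pinned page with an intra-page offset of
 * offset & (PAGESIZE - 1) = 0. A resume at 1M + 25K still yields
 * page_index = 6, with an intra-page offset of 1K.
 */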