// SPDX-License-Identifier: CDDL-1.0
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

#include <sys/dmu.h>
#include <sys/dmu_impl.h>
#include <sys/dbuf.h>
#include <sys/dnode.h>
#include <sys/zfs_context.h>
#include <sys/zfs_racct.h>
#include <sys/dsl_dataset.h>
#include <sys/dmu_objset.h>

/*
 * Build an ABD that covers the entire dbuf. The caller's "data" ABD may
 * cover only part of the dbuf when the request is not block aligned, so
 * throw-away buffers are allocated for the leading and trailing portions
 * and linked together with the relevant slice of "data" into a gang ABD.
 */
static abd_t *
make_abd_for_dbuf(dmu_buf_impl_t *db, abd_t *data, uint64_t offset,
    uint64_t size)
{
	size_t buf_size = db->db.db_size;
	abd_t *pre_buf = NULL, *post_buf = NULL, *mbuf = NULL;
	size_t buf_off = 0;

	ASSERT(MUTEX_HELD(&db->db_mtx));

	if (offset > db->db.db_offset) {
		size_t pre_size = offset - db->db.db_offset;
		pre_buf = abd_alloc_for_io(pre_size, B_TRUE);
		buf_size -= pre_size;
		buf_off = 0;
	} else {
		buf_off = db->db.db_offset - offset;
		size -= buf_off;
	}

	if (size < buf_size) {
		size_t post_size = buf_size - size;
		post_buf = abd_alloc_for_io(post_size, B_TRUE);
		buf_size -= post_size;
	}

	ASSERT3U(buf_size, >, 0);
	abd_t *buf = abd_get_offset_size(data, buf_off, buf_size);

	if (pre_buf || post_buf) {
		mbuf = abd_alloc_gang();
		if (pre_buf)
			abd_gang_add(mbuf, pre_buf, B_TRUE);
		abd_gang_add(mbuf, buf, B_TRUE);
		if (post_buf)
			abd_gang_add(mbuf, post_buf, B_TRUE);
	} else {
		mbuf = buf;
	}

	return (mbuf);
}

static void
dmu_read_abd_done(zio_t *zio)
{
	abd_free(zio->io_abd);
}

static void
dmu_write_direct_ready(zio_t *zio)
{
	dmu_sync_ready(zio, NULL, zio->io_private);
}

static void
dmu_write_direct_done(zio_t *zio)
{
	dmu_sync_arg_t *dsa = zio->io_private;
	dbuf_dirty_record_t *dr = dsa->dsa_dr;
	dmu_buf_impl_t *db = dr->dr_dbuf;

	abd_free(zio->io_abd);

	mutex_enter(&db->db_mtx);
	ASSERT3P(db->db_buf, ==, NULL);
	ASSERT3P(dr->dt.dl.dr_data, ==, NULL);
	ASSERT3P(db->db.db_data, ==, NULL);
	db->db_state = DB_UNCACHED;
	mutex_exit(&db->db_mtx);

	dmu_sync_done(zio, NULL, zio->io_private);

	if (zio->io_error != 0) {
		if (zio->io_post & ZIO_POST_DIO_CHKSUM_ERR)
			ASSERT3U(zio->io_error, ==, EIO);

		/*
		 * In the event of an I/O error this block has been freed in
		 * zio_done() through zio_dva_unallocate(). Calling
		 * dmu_sync_done() above set dr_override_state to
		 * DR_NOT_OVERRIDDEN. In this case when dbuf_undirty() calls
		 * dbuf_unoverride(), it will skip doing zio_free() to free
		 * this block as that was already taken care of.
		 *
		 * Since we are undirtying the record in open-context, we must
		 * have a hold on the db, so it should never be evicted after
		 * calling dbuf_undirty().
		 */
		mutex_enter(&db->db_mtx);
		VERIFY3B(dbuf_undirty(db, dsa->dsa_tx), ==, B_FALSE);
		mutex_exit(&db->db_mtx);
	}

	kmem_free(zio->io_bp, sizeof (blkptr_t));
	zio->io_bp = NULL;
}
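/*
 * Perform a Direct I/O write of "data" for the block backing "db". The
 * block is written in open context through the dmu_sync() override
 * machinery, so the data does not pass through the ARC. If "pio" is NULL
 * the write is waited on synchronously; otherwise it is issued
 * asynchronously as a child of "pio".
 */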
int
dmu_write_direct(zio_t *pio, dmu_buf_impl_t *db, abd_t *data, dmu_tx_t *tx)
{
	objset_t *os = db->db_objset;
	dsl_dataset_t *ds = dmu_objset_ds(os);
	zbookmark_phys_t zb;
	dbuf_dirty_record_t *dr_head;

	SET_BOOKMARK(&zb, ds->ds_object,
	    db->db.db_object, db->db_level, db->db_blkid);

	DB_DNODE_ENTER(db);
	zio_prop_t zp;
	dmu_write_policy(os, DB_DNODE(db), db->db_level,
	    WP_DMU_SYNC | WP_DIRECT_WR, &zp);
	DB_DNODE_EXIT(db);

	/*
	 * Dirty this dbuf with DB_NOFILL since we will not have any data
	 * associated with the dbuf.
	 */
	dmu_buf_will_clone_or_dio(&db->db, tx);

	mutex_enter(&db->db_mtx);

	uint64_t txg = dmu_tx_get_txg(tx);
	ASSERT3U(txg, >, spa_last_synced_txg(os->os_spa));
	ASSERT3U(txg, >, spa_syncing_txg(os->os_spa));

	dr_head = list_head(&db->db_dirty_records);
	ASSERT3U(dr_head->dr_txg, ==, txg);
	dr_head->dt.dl.dr_diowrite = B_TRUE;
	dr_head->dr_accounted = db->db.db_size;

	blkptr_t *bp = kmem_alloc(sizeof (blkptr_t), KM_SLEEP);
	if (db->db_blkptr != NULL) {
		/*
		 * Fill in bp with the current block pointer so that
		 * the nopwrite code can check if we're writing the same
		 * data that's already on disk.
		 */
		*bp = *db->db_blkptr;
	} else {
		memset(bp, 0, sizeof (blkptr_t));
	}

	/*
	 * Disable nopwrite if the current block pointer could change
	 * before this TXG syncs.
	 */
	if (list_next(&db->db_dirty_records, dr_head) != NULL)
		zp.zp_nopwrite = B_FALSE;

	ASSERT0(dr_head->dt.dl.dr_has_raw_params);
	ASSERT3S(dr_head->dt.dl.dr_override_state, ==, DR_NOT_OVERRIDDEN);
	dr_head->dt.dl.dr_override_state = DR_IN_DMU_SYNC;

	mutex_exit(&db->db_mtx);

	dmu_objset_willuse_space(os, dr_head->dr_accounted, tx);

	dmu_sync_arg_t *dsa = kmem_zalloc(sizeof (dmu_sync_arg_t), KM_SLEEP);
	dsa->dsa_dr = dr_head;
	dsa->dsa_tx = tx;

	zio_t *zio = zio_write(pio, os->os_spa, txg, bp, data,
	    db->db.db_size, db->db.db_size, &zp,
	    dmu_write_direct_ready, NULL, dmu_write_direct_done, dsa,
	    ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, &zb);

	if (pio == NULL)
		return (zio_wait(zio));

	zio_nowait(zio);

	return (0);
}

int
dmu_write_abd(dnode_t *dn, uint64_t offset, uint64_t size,
    abd_t *data, dmu_flags_t flags, dmu_tx_t *tx)
{
	dmu_buf_t **dbp;
	spa_t *spa = dn->dn_objset->os_spa;
	int numbufs, err;

	ASSERT(flags & DMU_DIRECTIO);

	err = dmu_buf_hold_array_by_dnode(dn, offset,
	    size, B_FALSE, FTAG, &numbufs, &dbp, flags);
	if (err)
		return (err);

	zio_t *pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);

	for (int i = 0; i < numbufs && err == 0; i++) {
		dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbp[i];

		abd_t *abd = abd_get_offset_size(data,
		    db->db.db_offset - offset, dn->dn_datablksz);

		zfs_racct_write(spa, db->db.db_size, 1, flags);
		err = dmu_write_direct(pio, db, abd, tx);
		ASSERT0(err);
	}

	err = zio_wait(pio);

	/*
	 * The dbufs must be held until the Direct I/O writes have completed
	 * in the event there were any errors and dbuf_undirty() was called.
	 */
	dmu_buf_rele_array(dbp, numbufs, FTAG);

	return (err);
}
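/*
 * Perform Direct I/O reads of the region covered by "data". Holes and
 * blocks that are already cached in the ARC are serviced from memory;
 * everything else is read from disk directly into "data", bypassing
 * the ARC.
 */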
int
dmu_read_abd(dnode_t *dn, uint64_t offset, uint64_t size,
    abd_t *data, dmu_flags_t flags)
{
	objset_t *os = dn->dn_objset;
	spa_t *spa = os->os_spa;
	dmu_buf_t **dbp;
	int numbufs, err;

	ASSERT(flags & DMU_DIRECTIO);

	err = dmu_buf_hold_array_by_dnode(dn, offset,
	    size, B_FALSE, FTAG, &numbufs, &dbp, flags);
	if (err)
		return (err);

	zio_t *rio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);

	for (int i = 0; i < numbufs; i++) {
		dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbp[i];
		abd_t *mbuf;
		zbookmark_phys_t zb;
		blkptr_t *bp;

		mutex_enter(&db->db_mtx);

		SET_BOOKMARK(&zb, dmu_objset_ds(os)->ds_object,
		    db->db.db_object, db->db_level, db->db_blkid);

		/*
		 * If there is another read in flight for this dbuf, wait
		 * for it to complete before checking db_state below.
		 */
		while (db->db_state == DB_READ)
			cv_wait(&db->db_changed, &db->db_mtx);

		err = dmu_buf_get_bp_from_dbuf(db, &bp);
		if (err) {
			mutex_exit(&db->db_mtx);
			goto error;
		}

		/*
		 * There is no need to read if this is a hole or the data is
		 * cached. This will not be considered a direct read for IO
		 * accounting in the same way that an ARC hit is not counted.
		 */
		if (bp == NULL || BP_IS_HOLE(bp) || db->db_state == DB_CACHED) {
			size_t aoff = offset < db->db.db_offset ?
			    db->db.db_offset - offset : 0;
			size_t boff = offset > db->db.db_offset ?
			    offset - db->db.db_offset : 0;
			size_t len = MIN(size - aoff, db->db.db_size - boff);

			if (db->db_state == DB_CACHED) {
				/*
				 * We need to untransform the ARC buf data
				 * before we copy it over.
				 */
				err = dmu_buf_untransform_direct(db, spa);
				ASSERT0(err);
				abd_copy_from_buf_off(data,
				    (char *)db->db.db_data + boff, aoff, len);
			} else {
				abd_zero_off(data, aoff, len);
			}

			mutex_exit(&db->db_mtx);
			continue;
		}

		mbuf = make_abd_for_dbuf(db, data, offset, size);
		ASSERT3P(mbuf, !=, NULL);

		/*
		 * The dbuf mutex (db_mtx) must be held when creating the ZIO
		 * for the read. The BP returned from
		 * dmu_buf_get_bp_from_dbuf() could be from a pending block
		 * clone or a yet to be synced Direct I/O write that is in the
		 * dbuf's dirty record. When zio_read() is called, zio_create()
		 * will make a copy of the BP. However, if zio_read() is called
		 * without the mutex being held, then the dirty record from the
		 * dbuf could be freed in dbuf_write_done(), resulting in
		 * garbage being set for the zio BP.
		 */
		zio_t *cio = zio_read(rio, spa, bp, mbuf, db->db.db_size,
		    dmu_read_abd_done, NULL, ZIO_PRIORITY_SYNC_READ,
		    ZIO_FLAG_CANFAIL | ZIO_FLAG_DIO_READ, &zb);
		mutex_exit(&db->db_mtx);

		zfs_racct_read(spa, db->db.db_size, 1, flags);
		zio_nowait(cio);
	}

	dmu_buf_rele_array(dbp, numbufs, FTAG);

	return (zio_wait(rio));

error:
	dmu_buf_rele_array(dbp, numbufs, FTAG);
	(void) zio_wait(rio);
	return (err);
}
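/*
 * Kernel-only uio wrappers: map the user pages already pinned in the
 * zfs_uio_t into an ABD, then issue the Direct I/O through
 * dmu_read_abd() or dmu_write_abd().
 */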
#ifdef _KERNEL
int
dmu_read_uio_direct(dnode_t *dn, zfs_uio_t *uio, uint64_t size,
    dmu_flags_t flags)
{
	offset_t offset = zfs_uio_offset(uio);
	offset_t page_index = (offset - zfs_uio_soffset(uio)) >> PAGESHIFT;
	int err;

	ASSERT(uio->uio_extflg & UIO_DIRECT);
	ASSERT3U(page_index, <, uio->uio_dio.npages);

	abd_t *data = abd_alloc_from_pages(&uio->uio_dio.pages[page_index],
	    offset & (PAGESIZE - 1), size);
	err = dmu_read_abd(dn, offset, size, data, flags);
	abd_free(data);

	if (err == 0)
		zfs_uioskip(uio, size);

	return (err);
}

int
dmu_write_uio_direct(dnode_t *dn, zfs_uio_t *uio, uint64_t size,
    dmu_flags_t flags, dmu_tx_t *tx)
{
	offset_t offset = zfs_uio_offset(uio);
	offset_t page_index = (offset - zfs_uio_soffset(uio)) >> PAGESHIFT;
	int err;

	ASSERT(uio->uio_extflg & UIO_DIRECT);
	ASSERT3U(page_index, <, uio->uio_dio.npages);

	abd_t *data = abd_alloc_from_pages(&uio->uio_dio.pages[page_index],
	    offset & (PAGESIZE - 1), size);
	err = dmu_write_abd(dn, offset, size, data, flags, tx);
	abd_free(data);

	if (err == 0)
		zfs_uioskip(uio, size);

	return (err);
}
#endif /* _KERNEL */

EXPORT_SYMBOL(dmu_read_abd);
EXPORT_SYMBOL(dmu_write_abd);