1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. 23 * Copyright (c) 2014, 2018 by Delphix. All rights reserved. 24 */ 25 26 #include <sys/conf.h> 27 #include <sys/file.h> 28 #include <sys/ddi.h> 29 #include <sys/sunddi.h> 30 #include <sys/modctl.h> 31 #include <sys/scsi/scsi.h> 32 #include <sys/scsi/impl/scsi_reset_notify.h> 33 #include <sys/scsi/generic/mode.h> 34 #include <sys/disp.h> 35 #include <sys/byteorder.h> 36 #include <sys/atomic.h> 37 #include <sys/sdt.h> 38 #include <sys/dkio.h> 39 #include <sys/dmu.h> 40 #include <sys/arc.h> 41 #include <sys/zvol.h> 42 #include <sys/zfs_rlock.h> 43 #include <sys/zil.h> 44 45 #include <sys/stmf.h> 46 #include <sys/lpif.h> 47 #include <sys/portif.h> 48 #include <sys/stmf_ioctl.h> 49 #include <sys/stmf_sbd_ioctl.h> 50 51 #include "stmf_sbd.h" 52 #include "sbd_impl.h" 53 54 55 /* 56 * This file contains direct calls into the zfs module. 57 * These functions mimic zvol_read and zvol_write except pointers 58 * to the data buffers are passed instead of copying the data itself. 
 *
 * zfs internal interfaces referenced here:
 *
 * FUNCTIONS
 *	dmu_buf_hold_array_by_dnode()
 *	dmu_buf_rele_array()
 *
 *	arc_loan_buf()
 *	dmu_assign_arcbuf_by_dnode()
 *	dmu_return_arcbuf()
 *	arc_buf_size()
 *
 *	dmu_tx_create()
 *	dmu_tx_hold_write()
 *	dmu_tx_assign()
 *	dmu_tx_commit()
 *	dmu_tx_abort()
 *	zil_commit()
 *
 *	rangelock_enter()
 *	rangelock_exit()
 *
 *	zvol_log_write_minor()
 *
 *	dmu_read_uio_dnode()
 *	dmu_write_uio_dnode()
 * MINOR DATA
 *	zv_volsize
 *	zv_volblocksize
 *	zv_flags	- for WCE
 *	zv_objset	- dmu_tx_create
 *	zv_zilog	- zil_commit
 *	zv_znode	- rangelock_enter
 *	zv_dn		- dmu_buf_hold_array_by_dnode, dmu_request_arcbuf
 * GLOBAL DATA
 *	zvol_maxphys
 */

/*
 * Take direct control of the volume instead of using the driver
 * interfaces provided by zvol.c. Gather the parameters and handles
 * needed to make direct calls into zfs/dmu/zvol. The driver is
 * opened exclusively at this point, so these parameters cannot change.
 *
 * NOTE: the object size and WCE can change while the device
 * is open, so they must be fetched for every operation.
105 */ 106 int 107 sbd_zvol_get_volume_params(sbd_lu_t *sl) 108 { 109 int ret; 110 111 ret = zvol_get_volume_params(sl->sl_zvol_minor, 112 &sl->sl_blksize, /* volume block size */ 113 &sl->sl_max_xfer_len, /* max data chunk size */ 114 &sl->sl_zvol_minor_hdl, /* minor soft state */ 115 &sl->sl_zvol_objset_hdl, /* dmu_tx_create */ 116 &sl->sl_zvol_zil_hdl, /* zil_commit */ 117 &sl->sl_zvol_rl_hdl, /* locked_range_t */ 118 &sl->sl_zvol_dn_hdl); /* dmu_buf_hold_array_by_dnode, */ 119 /* dmu_request_arcbuf, */ 120 /* dmu_assign_arcbuf */ 121 122 if (ret == 0 && sl->sl_blksize < MMU_PAGESIZE) { 123 cmn_err(CE_NOTE, "COMSTAR reduced copy disabled due to " 124 "small zvol blocksize (%d)\n", (int)sl->sl_blksize); 125 ret = ENOTSUP; 126 } 127 128 return (ret); 129 } 130 131 /* 132 * Return the number of elements in a scatter/gather list required for 133 * the given span in the zvol. Elements are 1:1 with zvol blocks. 134 */ 135 uint32_t 136 sbd_zvol_numsegs(sbd_lu_t *sl, uint64_t off, uint32_t len) 137 { 138 uint64_t blksz = sl->sl_blksize; 139 uint64_t endoff = off + len; 140 uint64_t numsegs; 141 142 numsegs = (P2ROUNDUP(endoff, blksz) - P2ALIGN(off, blksz)) / blksz; 143 return ((uint32_t)numsegs); 144 } 145 146 /* 147 * Return an array of dmu_buf_t pointers for the requested range. 148 * The dmu buffers are either in cache or read in synchronously. 149 * Fill in the dbuf sglist from the dmu_buf_t array. 
150 */ 151 static void *RDTAG = "sbd_zvol_read"; 152 153 int 154 sbd_zvol_alloc_read_bufs(sbd_lu_t *sl, stmf_data_buf_t *dbuf) 155 { 156 sbd_zvol_io_t *zvio = dbuf->db_lu_private; 157 locked_range_t *lr; 158 int numbufs, error; 159 uint64_t len = dbuf->db_data_size; 160 uint64_t offset = zvio->zvio_offset; 161 dmu_buf_t **dbpp, *dbp; 162 163 /* Make sure request is reasonable */ 164 if (len > sl->sl_max_xfer_len) 165 return (E2BIG); 166 if (offset + len > zvol_get_volume_size(sl->sl_zvol_minor_hdl)) 167 return (EIO); 168 169 /* 170 * The range lock is only held until the dmu buffers read in and 171 * held; not during the callers use of the data. 172 */ 173 lr = rangelock_enter(sl->sl_zvol_rl_hdl, offset, len, RL_READER); 174 175 error = dmu_buf_hold_array_by_dnode(sl->sl_zvol_dn_hdl, 176 offset, len, TRUE, RDTAG, &numbufs, &dbpp, 177 DMU_READ_PREFETCH); 178 179 rangelock_exit(lr); 180 181 if (error == ECKSUM) 182 error = EIO; 183 184 if (error == 0) { 185 /* 186 * Fill in db_sglist from the dmu_buf_t array. 187 */ 188 int i; 189 stmf_sglist_ent_t *sgl; 190 uint64_t odiff, seglen; 191 192 zvio->zvio_dbp = dbpp; 193 /* make sure db_sglist is large enough */ 194 if (dbuf->db_sglist_length != numbufs) { 195 cmn_err(CE_PANIC, "wrong size sglist: dbuf %d != %d\n", 196 dbuf->db_sglist_length, numbufs); 197 } 198 199 sgl = &dbuf->db_sglist[0]; 200 for (i = 0; i < numbufs; i++) { 201 dbp = dbpp[i]; 202 odiff = offset - dbp->db_offset; 203 ASSERT(odiff == 0 || i == 0); 204 sgl->seg_addr = (uint8_t *)dbp->db_data + odiff; 205 seglen = MIN(len, dbp->db_size - odiff); 206 sgl->seg_length = (uint32_t)seglen; 207 offset += seglen; 208 len -= seglen; 209 sgl++; 210 } 211 ASSERT(len == 0); 212 213 } 214 return (error); 215 } 216 217 /* 218 * Release a dmu_buf_t array. 
219 */ 220 /*ARGSUSED*/ 221 void 222 sbd_zvol_rele_read_bufs(sbd_lu_t *sl, stmf_data_buf_t *dbuf) 223 { 224 sbd_zvol_io_t *zvio = dbuf->db_lu_private; 225 226 ASSERT(zvio->zvio_dbp); 227 ASSERT(dbuf->db_sglist_length); 228 229 dmu_buf_rele_array(zvio->zvio_dbp, (int)dbuf->db_sglist_length, RDTAG); 230 } 231 232 /* 233 * Allocate enough loaned arc buffers for the requested region. 234 * Mimic the handling of the dmu_buf_t array used for reads as closely 235 * as possible even though the arc_buf_t's are anonymous until released. 236 * The buffers will match the zvol object blocks sizes and alignments 237 * such that a data copy may be avoided when the buffers are assigned. 238 */ 239 int 240 sbd_zvol_alloc_write_bufs(sbd_lu_t *sl, stmf_data_buf_t *dbuf) 241 { 242 sbd_zvol_io_t *zvio = dbuf->db_lu_private; 243 int blkshift, numbufs, i; 244 uint64_t blksize; 245 arc_buf_t **abp; 246 stmf_sglist_ent_t *sgl; 247 uint64_t len = dbuf->db_data_size; 248 uint64_t offset = zvio->zvio_offset; 249 250 /* Make sure request is reasonable */ 251 if (len > sl->sl_max_xfer_len) 252 return (E2BIG); 253 if (offset + len > zvol_get_volume_size(sl->sl_zvol_minor_hdl)) 254 return (EIO); 255 256 /* 257 * Break up the request into chunks to match 258 * the volume block size. Only full, and aligned 259 * buffers will avoid the data copy in the dmu. 
260 */ 261 /* 262 * calculate how may dbufs are needed 263 */ 264 blksize = sl->sl_blksize; 265 ASSERT(ISP2(blksize)); 266 blkshift = highbit(blksize - 1); 267 /* 268 * taken from dmu_buf_hold_array_by_dnode() 269 */ 270 numbufs = (P2ROUNDUP(offset+len, 1ULL<<blkshift) - 271 P2ALIGN(offset, 1ULL<<blkshift)) >> blkshift; 272 if (dbuf->db_sglist_length != numbufs) { 273 cmn_err(CE_PANIC, "wrong size sglist: dbuf %d != %d\n", 274 dbuf->db_sglist_length, numbufs); 275 } 276 /* 277 * allocate a holder for the needed arc_buf pointers 278 */ 279 abp = kmem_alloc(sizeof (arc_buf_t *) * numbufs, KM_SLEEP); 280 /* 281 * The write operation uses loaned arc buffers so that 282 * the xfer_data is done outside of a dmu transaction. 283 * These buffers will exactly match the request unlike 284 * the dmu buffers obtained from the read operation. 285 */ 286 /* 287 * allocate the arc buffers and fill in the stmf sglist 288 */ 289 sgl = &dbuf->db_sglist[0]; 290 for (i = 0; i < numbufs; i++) { 291 uint64_t seglen; 292 293 /* first block may not be aligned */ 294 seglen = P2NPHASE(offset, blksize); 295 if (seglen == 0) 296 seglen = blksize; 297 seglen = MIN(seglen, len); 298 abp[i] = arc_loan_buf(dmu_objset_spa(sl->sl_zvol_objset_hdl), 299 B_FALSE, (int)seglen); 300 ASSERT(arc_buf_size(abp[i]) == (int)seglen); 301 sgl->seg_addr = abp[i]->b_data; 302 sgl->seg_length = (uint32_t)seglen; 303 sgl++; 304 offset += seglen; 305 len -= seglen; 306 } 307 ASSERT(len == 0); 308 309 zvio->zvio_abp = abp; 310 return (0); 311 } 312 313 /*ARGSUSED*/ 314 void 315 sbd_zvol_rele_write_bufs_abort(sbd_lu_t *sl, stmf_data_buf_t *dbuf) 316 { 317 sbd_zvol_io_t *zvio = dbuf->db_lu_private; 318 int i; 319 arc_buf_t **abp = zvio->zvio_abp; 320 321 /* free arcbufs */ 322 for (i = 0; i < dbuf->db_sglist_length; i++) 323 dmu_return_arcbuf(*abp++); 324 kmem_free(zvio->zvio_abp, 325 sizeof (arc_buf_t *) * dbuf->db_sglist_length); 326 zvio->zvio_abp = NULL; 327 } 328 329 /* 330 * Release the arc_buf_t array allocated 
above and handle these cases : 331 * 332 * flags == 0 - create transaction and assign all arc bufs to offsets 333 * flags == ZVIO_COMMIT - same as above and commit to zil on sync devices 334 */ 335 int 336 sbd_zvol_rele_write_bufs(sbd_lu_t *sl, stmf_data_buf_t *dbuf) 337 { 338 sbd_zvol_io_t *zvio = dbuf->db_lu_private; 339 dmu_tx_t *tx; 340 int sync, i, error; 341 locked_range_t *lr; 342 arc_buf_t **abp = zvio->zvio_abp; 343 int flags = zvio->zvio_flags; 344 uint64_t toffset, offset = zvio->zvio_offset; 345 uint64_t resid, len = dbuf->db_data_size; 346 347 ASSERT(flags == 0 || flags == ZVIO_COMMIT || flags == ZVIO_ABORT); 348 349 lr = rangelock_enter(sl->sl_zvol_rl_hdl, offset, len, RL_WRITER); 350 351 tx = dmu_tx_create(sl->sl_zvol_objset_hdl); 352 dmu_tx_hold_write(tx, ZVOL_OBJ, offset, (int)len); 353 error = dmu_tx_assign(tx, TXG_WAIT); 354 355 if (error) { 356 dmu_tx_abort(tx); 357 rangelock_exit(lr); 358 sbd_zvol_rele_write_bufs_abort(sl, dbuf); 359 return (error); 360 } 361 362 toffset = offset; 363 resid = len; 364 for (i = 0; i < dbuf->db_sglist_length; i++) { 365 arc_buf_t *abuf; 366 int size; 367 368 abuf = abp[i]; 369 size = arc_buf_size(abuf); 370 (void) dmu_assign_arcbuf_by_dnode(sl->sl_zvol_dn_hdl, 371 toffset, abuf, tx); 372 toffset += size; 373 resid -= size; 374 } 375 ASSERT(resid == 0); 376 377 sync = !zvol_get_volume_wce(sl->sl_zvol_minor_hdl); 378 zvol_log_write_minor(sl->sl_zvol_minor_hdl, tx, offset, 379 (ssize_t)len, sync); 380 dmu_tx_commit(tx); 381 rangelock_exit(lr); 382 kmem_free(zvio->zvio_abp, 383 sizeof (arc_buf_t *) * dbuf->db_sglist_length); 384 zvio->zvio_abp = NULL; 385 if (sync && (flags & ZVIO_COMMIT)) 386 zil_commit(sl->sl_zvol_zil_hdl, ZVOL_OBJ); 387 return (0); 388 } 389 390 /* 391 * Copy interface for callers using direct zvol access. 392 * Very similar to zvol_read but the uio may have multiple iovec entries. 
393 */ 394 int 395 sbd_zvol_copy_read(sbd_lu_t *sl, uio_t *uio) 396 { 397 uint64_t len = (uint64_t)uio->uio_resid; 398 uint64_t offset = (uint64_t)uio->uio_loffset; 399 400 /* Make sure request is reasonable */ 401 if (len > sl->sl_max_xfer_len) 402 return (E2BIG); 403 if (offset + len > zvol_get_volume_size(sl->sl_zvol_minor_hdl)) 404 return (EIO); 405 406 locked_range_t *lr = rangelock_enter(sl->sl_zvol_rl_hdl, offset, len, 407 RL_READER); 408 int error = dmu_read_uio_dnode(sl->sl_zvol_dn_hdl, uio, len); 409 rangelock_exit(lr); 410 411 if (error == ECKSUM) 412 error = EIO; 413 return (error); 414 } 415 416 /* 417 * Copy interface for callers using direct zvol access. 418 * Very similar to zvol_write but the uio may have multiple iovec entries. 419 */ 420 int 421 sbd_zvol_copy_write(sbd_lu_t *sl, uio_t *uio, int flags) 422 { 423 dmu_tx_t *tx; 424 int error, sync; 425 uint64_t len = (uint64_t)uio->uio_resid; 426 uint64_t offset = (uint64_t)uio->uio_loffset; 427 428 ASSERT(flags == 0 || flags == ZVIO_COMMIT); 429 430 /* Make sure request is reasonable */ 431 if (len > sl->sl_max_xfer_len) 432 return (E2BIG); 433 if (offset + len > zvol_get_volume_size(sl->sl_zvol_minor_hdl)) 434 return (EIO); 435 436 locked_range_t *lr = rangelock_enter(sl->sl_zvol_rl_hdl, offset, len, 437 RL_WRITER); 438 sync = !zvol_get_volume_wce(sl->sl_zvol_minor_hdl); 439 440 tx = dmu_tx_create(sl->sl_zvol_objset_hdl); 441 dmu_tx_hold_write(tx, ZVOL_OBJ, offset, (int)uio->uio_resid); 442 error = dmu_tx_assign(tx, TXG_WAIT); 443 if (error) { 444 dmu_tx_abort(tx); 445 } else { 446 error = dmu_write_uio_dnode(sl->sl_zvol_dn_hdl, uio, len, tx); 447 if (error == 0) { 448 zvol_log_write_minor(sl->sl_zvol_minor_hdl, tx, offset, 449 (ssize_t)len, sync); 450 } 451 dmu_tx_commit(tx); 452 } 453 rangelock_exit(lr); 454 455 if (sync && (flags & ZVIO_COMMIT)) 456 zil_commit(sl->sl_zvol_zil_hdl, ZVOL_OBJ); 457 if (error == ECKSUM) 458 error = EIO; 459 return (error); 460 } 461