1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. 23 * Copyright (c) 2014 by Delphix. All rights reserved. 24 */ 25 26 #include <sys/conf.h> 27 #include <sys/file.h> 28 #include <sys/ddi.h> 29 #include <sys/sunddi.h> 30 #include <sys/modctl.h> 31 #include <sys/scsi/scsi.h> 32 #include <sys/scsi/impl/scsi_reset_notify.h> 33 #include <sys/scsi/generic/mode.h> 34 #include <sys/disp.h> 35 #include <sys/byteorder.h> 36 #include <sys/atomic.h> 37 #include <sys/sdt.h> 38 #include <sys/dkio.h> 39 #include <sys/dmu.h> 40 #include <sys/arc.h> 41 #include <sys/zvol.h> 42 #include <sys/zfs_rlock.h> 43 44 #include <sys/stmf.h> 45 #include <sys/lpif.h> 46 #include <sys/portif.h> 47 #include <sys/stmf_ioctl.h> 48 #include <sys/stmf_sbd_ioctl.h> 49 50 #include "stmf_sbd.h" 51 #include "sbd_impl.h" 52 53 54 /* 55 * This file contains direct calls into the zfs module. 56 * These functions mimic zvol_read and zvol_write except pointers 57 * to the data buffers are passed instead of copying the data itself. 
 *
 * zfs internal interfaces referenced here:
 *
 * FUNCTIONS
 *    dmu_buf_hold_array_by_bonus()
 *    dmu_buf_rele_array()
 *
 *    dmu_request_arcbuf()
 *    dmu_assign_arcbuf()
 *    dmu_return_arcbuf()
 *    arc_buf_size()
 *
 *    dmu_tx_create()
 *    dmu_tx_hold_write()
 *    dmu_tx_assign()
 *    dmu_tx_commit()
 *    dmu_tx_abort()
 *    zil_commit()
 *
 *    zfs_range_lock()
 *    zfs_range_unlock()
 *
 *    zvol_log_write()
 *
 *    dmu_read_uio()
 *    dmu_write_uio()
 * MINOR DATA
 *    zv_volsize
 *    zv_volblocksize
 *    zv_flags - for WCE
 *    zv_objset - dmu_tx_create
 *    zv_zilog - zil_commit
 *    zv_znode - zfs_range_lock
 *    zv_dbuf - dmu_buf_hold_array_by_bonus, dmu_request_arcbuf
 * GLOBAL DATA
 *    zvol_maxphys
 */

/*
 * Take direct control of the volume instead of using the driver
 * interfaces provided by zvol.c.  Gather parameters and handles
 * needed to make direct calls into zfs/dmu/zvol.  The driver is
 * opened exclusively at this point, so these parameters cannot change.
 *
 * NOTE: the object size and WCE can change while the device
 * is open, so they must be fetched for every operation.
104 */ 105 int 106 sbd_zvol_get_volume_params(sbd_lu_t *sl) 107 { 108 int ret; 109 110 ret = zvol_get_volume_params(sl->sl_zvol_minor, 111 &sl->sl_blksize, /* volume block size */ 112 &sl->sl_max_xfer_len, /* max data chunk size */ 113 &sl->sl_zvol_minor_hdl, /* minor soft state */ 114 &sl->sl_zvol_objset_hdl, /* dmu_tx_create */ 115 &sl->sl_zvol_zil_hdl, /* zil_commit */ 116 &sl->sl_zvol_rl_hdl, /* zfs_range_lock */ 117 &sl->sl_zvol_bonus_hdl); /* dmu_buf_hold_array_by_bonus, */ 118 /* dmu_request_arcbuf, */ 119 /* dmu_assign_arcbuf */ 120 121 if (ret == 0 && sl->sl_blksize < MMU_PAGESIZE) { 122 cmn_err(CE_NOTE, "COMSTAR reduced copy disabled due to " 123 "small zvol blocksize (%d)\n", (int)sl->sl_blksize); 124 ret = ENOTSUP; 125 } 126 127 return (ret); 128 } 129 130 /* 131 * Return the number of elements in a scatter/gather list required for 132 * the given span in the zvol. Elements are 1:1 with zvol blocks. 133 */ 134 uint32_t 135 sbd_zvol_numsegs(sbd_lu_t *sl, uint64_t off, uint32_t len) 136 { 137 uint64_t blksz = sl->sl_blksize; 138 uint64_t endoff = off + len; 139 uint64_t numsegs; 140 141 numsegs = (P2ROUNDUP(endoff, blksz) - P2ALIGN(off, blksz)) / blksz; 142 return ((uint32_t)numsegs); 143 } 144 145 /* 146 * Return an array of dmu_buf_t pointers for the requested range. 147 * The dmu buffers are either in cache or read in synchronously. 148 * Fill in the dbuf sglist from the dmu_buf_t array. 
149 */ 150 static void *RDTAG = "sbd_zvol_read"; 151 152 int 153 sbd_zvol_alloc_read_bufs(sbd_lu_t *sl, stmf_data_buf_t *dbuf) 154 { 155 sbd_zvol_io_t *zvio = dbuf->db_lu_private; 156 rl_t *rl; 157 int numbufs, error; 158 uint64_t len = dbuf->db_data_size; 159 uint64_t offset = zvio->zvio_offset; 160 dmu_buf_t **dbpp, *dbp; 161 162 /* Make sure request is reasonable */ 163 if (len > sl->sl_max_xfer_len) 164 return (E2BIG); 165 if (offset + len > zvol_get_volume_size(sl->sl_zvol_minor_hdl)) 166 return (EIO); 167 168 /* 169 * The range lock is only held until the dmu buffers read in and 170 * held; not during the callers use of the data. 171 */ 172 rl = zfs_range_lock(sl->sl_zvol_rl_hdl, offset, len, RL_READER); 173 174 error = dmu_buf_hold_array_by_bonus(sl->sl_zvol_bonus_hdl, offset, 175 len, TRUE, RDTAG, &numbufs, &dbpp); 176 177 zfs_range_unlock(rl); 178 179 if (error == ECKSUM) 180 error = EIO; 181 182 if (error == 0) { 183 /* 184 * Fill in db_sglist from the dmu_buf_t array. 185 */ 186 int i; 187 stmf_sglist_ent_t *sgl; 188 uint64_t odiff, seglen; 189 190 zvio->zvio_dbp = dbpp; 191 /* make sure db_sglist is large enough */ 192 if (dbuf->db_sglist_length != numbufs) { 193 cmn_err(CE_PANIC, "wrong size sglist: dbuf %d != %d\n", 194 dbuf->db_sglist_length, numbufs); 195 } 196 197 sgl = &dbuf->db_sglist[0]; 198 for (i = 0; i < numbufs; i++) { 199 dbp = dbpp[i]; 200 odiff = offset - dbp->db_offset; 201 ASSERT(odiff == 0 || i == 0); 202 sgl->seg_addr = (uint8_t *)dbp->db_data + odiff; 203 seglen = MIN(len, dbp->db_size - odiff); 204 sgl->seg_length = (uint32_t)seglen; 205 offset += seglen; 206 len -= seglen; 207 sgl++; 208 } 209 ASSERT(len == 0); 210 211 } 212 return (error); 213 } 214 215 /* 216 * Release a dmu_buf_t array. 
217 */ 218 /*ARGSUSED*/ 219 void 220 sbd_zvol_rele_read_bufs(sbd_lu_t *sl, stmf_data_buf_t *dbuf) 221 { 222 sbd_zvol_io_t *zvio = dbuf->db_lu_private; 223 224 ASSERT(zvio->zvio_dbp); 225 ASSERT(dbuf->db_sglist_length); 226 227 dmu_buf_rele_array(zvio->zvio_dbp, (int)dbuf->db_sglist_length, RDTAG); 228 } 229 230 /* 231 * Allocate enough loaned arc buffers for the requested region. 232 * Mimic the handling of the dmu_buf_t array used for reads as closely 233 * as possible even though the arc_buf_t's are anonymous until released. 234 * The buffers will match the zvol object blocks sizes and alignments 235 * such that a data copy may be avoided when the buffers are assigned. 236 */ 237 int 238 sbd_zvol_alloc_write_bufs(sbd_lu_t *sl, stmf_data_buf_t *dbuf) 239 { 240 sbd_zvol_io_t *zvio = dbuf->db_lu_private; 241 int blkshift, numbufs, i; 242 uint64_t blksize; 243 arc_buf_t **abp; 244 stmf_sglist_ent_t *sgl; 245 uint64_t len = dbuf->db_data_size; 246 uint64_t offset = zvio->zvio_offset; 247 248 /* Make sure request is reasonable */ 249 if (len > sl->sl_max_xfer_len) 250 return (E2BIG); 251 if (offset + len > zvol_get_volume_size(sl->sl_zvol_minor_hdl)) 252 return (EIO); 253 254 /* 255 * Break up the request into chunks to match 256 * the volume block size. Only full, and aligned 257 * buffers will avoid the data copy in the dmu. 
258 */ 259 /* 260 * calculate how may dbufs are needed 261 */ 262 blksize = sl->sl_blksize; 263 ASSERT(ISP2(blksize)); 264 blkshift = highbit(blksize - 1); 265 /* 266 * taken from dmu_buf_hold_array_by_dnode() 267 */ 268 numbufs = (P2ROUNDUP(offset+len, 1ULL<<blkshift) - 269 P2ALIGN(offset, 1ULL<<blkshift)) >> blkshift; 270 if (dbuf->db_sglist_length != numbufs) { 271 cmn_err(CE_PANIC, "wrong size sglist: dbuf %d != %d\n", 272 dbuf->db_sglist_length, numbufs); 273 } 274 /* 275 * allocate a holder for the needed arc_buf pointers 276 */ 277 abp = kmem_alloc(sizeof (arc_buf_t *) * numbufs, KM_SLEEP); 278 /* 279 * The write operation uses loaned arc buffers so that 280 * the xfer_data is done outside of a dmu transaction. 281 * These buffers will exactly match the request unlike 282 * the dmu buffers obtained from the read operation. 283 */ 284 /* 285 * allocate the arc buffers and fill in the stmf sglist 286 */ 287 sgl = &dbuf->db_sglist[0]; 288 for (i = 0; i < numbufs; i++) { 289 uint64_t seglen; 290 291 /* first block may not be aligned */ 292 seglen = P2NPHASE(offset, blksize); 293 if (seglen == 0) 294 seglen = blksize; 295 seglen = MIN(seglen, len); 296 abp[i] = dmu_request_arcbuf(sl->sl_zvol_bonus_hdl, (int)seglen); 297 ASSERT(arc_buf_size(abp[i]) == (int)seglen); 298 sgl->seg_addr = abp[i]->b_data; 299 sgl->seg_length = (uint32_t)seglen; 300 sgl++; 301 offset += seglen; 302 len -= seglen; 303 } 304 ASSERT(len == 0); 305 306 zvio->zvio_abp = abp; 307 return (0); 308 } 309 310 /*ARGSUSED*/ 311 void 312 sbd_zvol_rele_write_bufs_abort(sbd_lu_t *sl, stmf_data_buf_t *dbuf) 313 { 314 sbd_zvol_io_t *zvio = dbuf->db_lu_private; 315 int i; 316 arc_buf_t **abp = zvio->zvio_abp; 317 318 /* free arcbufs */ 319 for (i = 0; i < dbuf->db_sglist_length; i++) 320 dmu_return_arcbuf(*abp++); 321 kmem_free(zvio->zvio_abp, 322 sizeof (arc_buf_t *) * dbuf->db_sglist_length); 323 zvio->zvio_abp = NULL; 324 } 325 326 /* 327 * Release the arc_buf_t array allocated above and handle these 
cases : 328 * 329 * flags == 0 - create transaction and assign all arc bufs to offsets 330 * flags == ZVIO_COMMIT - same as above and commit to zil on sync devices 331 */ 332 int 333 sbd_zvol_rele_write_bufs(sbd_lu_t *sl, stmf_data_buf_t *dbuf) 334 { 335 sbd_zvol_io_t *zvio = dbuf->db_lu_private; 336 dmu_tx_t *tx; 337 int sync, i, error; 338 rl_t *rl; 339 arc_buf_t **abp = zvio->zvio_abp; 340 int flags = zvio->zvio_flags; 341 uint64_t toffset, offset = zvio->zvio_offset; 342 uint64_t resid, len = dbuf->db_data_size; 343 344 ASSERT(flags == 0 || flags == ZVIO_COMMIT || flags == ZVIO_ABORT); 345 346 rl = zfs_range_lock(sl->sl_zvol_rl_hdl, offset, len, RL_WRITER); 347 348 tx = dmu_tx_create(sl->sl_zvol_objset_hdl); 349 dmu_tx_hold_write(tx, ZVOL_OBJ, offset, (int)len); 350 error = dmu_tx_assign(tx, TXG_WAIT); 351 352 if (error) { 353 dmu_tx_abort(tx); 354 zfs_range_unlock(rl); 355 sbd_zvol_rele_write_bufs_abort(sl, dbuf); 356 return (error); 357 } 358 359 toffset = offset; 360 resid = len; 361 for (i = 0; i < dbuf->db_sglist_length; i++) { 362 arc_buf_t *abuf; 363 int size; 364 365 abuf = abp[i]; 366 size = arc_buf_size(abuf); 367 dmu_assign_arcbuf(sl->sl_zvol_bonus_hdl, toffset, abuf, tx); 368 toffset += size; 369 resid -= size; 370 } 371 ASSERT(resid == 0); 372 373 sync = !zvol_get_volume_wce(sl->sl_zvol_minor_hdl); 374 zvol_log_write_minor(sl->sl_zvol_minor_hdl, tx, offset, 375 (ssize_t)len, sync); 376 dmu_tx_commit(tx); 377 zfs_range_unlock(rl); 378 kmem_free(zvio->zvio_abp, 379 sizeof (arc_buf_t *) * dbuf->db_sglist_length); 380 zvio->zvio_abp = NULL; 381 if (sync && (flags & ZVIO_COMMIT)) 382 zil_commit(sl->sl_zvol_zil_hdl, ZVOL_OBJ); 383 return (0); 384 } 385 386 /* 387 * Copy interface for callers using direct zvol access. 388 * Very similar to zvol_read but the uio may have multiple iovec entries. 
389 */ 390 int 391 sbd_zvol_copy_read(sbd_lu_t *sl, uio_t *uio) 392 { 393 int error; 394 rl_t *rl; 395 uint64_t len = (uint64_t)uio->uio_resid; 396 uint64_t offset = (uint64_t)uio->uio_loffset; 397 398 /* Make sure request is reasonable */ 399 if (len > sl->sl_max_xfer_len) 400 return (E2BIG); 401 if (offset + len > zvol_get_volume_size(sl->sl_zvol_minor_hdl)) 402 return (EIO); 403 404 rl = zfs_range_lock(sl->sl_zvol_rl_hdl, offset, len, RL_READER); 405 406 error = dmu_read_uio_dbuf(sl->sl_zvol_bonus_hdl, uio, len); 407 408 zfs_range_unlock(rl); 409 if (error == ECKSUM) 410 error = EIO; 411 return (error); 412 } 413 414 /* 415 * Copy interface for callers using direct zvol access. 416 * Very similar to zvol_write but the uio may have multiple iovec entries. 417 */ 418 int 419 sbd_zvol_copy_write(sbd_lu_t *sl, uio_t *uio, int flags) 420 { 421 rl_t *rl; 422 dmu_tx_t *tx; 423 int error, sync; 424 uint64_t len = (uint64_t)uio->uio_resid; 425 uint64_t offset = (uint64_t)uio->uio_loffset; 426 427 ASSERT(flags == 0 || flags == ZVIO_COMMIT); 428 429 /* Make sure request is reasonable */ 430 if (len > sl->sl_max_xfer_len) 431 return (E2BIG); 432 if (offset + len > zvol_get_volume_size(sl->sl_zvol_minor_hdl)) 433 return (EIO); 434 435 rl = zfs_range_lock(sl->sl_zvol_rl_hdl, offset, len, RL_WRITER); 436 437 sync = !zvol_get_volume_wce(sl->sl_zvol_minor_hdl); 438 439 tx = dmu_tx_create(sl->sl_zvol_objset_hdl); 440 dmu_tx_hold_write(tx, ZVOL_OBJ, offset, (int)uio->uio_resid); 441 error = dmu_tx_assign(tx, TXG_WAIT); 442 if (error) { 443 dmu_tx_abort(tx); 444 } else { 445 error = dmu_write_uio_dbuf(sl->sl_zvol_bonus_hdl, uio, len, tx); 446 if (error == 0) { 447 zvol_log_write_minor(sl->sl_zvol_minor_hdl, tx, offset, 448 (ssize_t)len, sync); 449 } 450 dmu_tx_commit(tx); 451 } 452 zfs_range_unlock(rl); 453 if (sync && (flags & ZVIO_COMMIT)) 454 zil_commit(sl->sl_zvol_zil_hdl, ZVOL_OBJ); 455 if (error == ECKSUM) 456 error = EIO; 457 return (error); 458 } 459