1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. 23 */ 24 25 #include <sys/conf.h> 26 #include <sys/file.h> 27 #include <sys/ddi.h> 28 #include <sys/sunddi.h> 29 #include <sys/modctl.h> 30 #include <sys/scsi/scsi.h> 31 #include <sys/scsi/impl/scsi_reset_notify.h> 32 #include <sys/scsi/generic/mode.h> 33 #include <sys/disp.h> 34 #include <sys/byteorder.h> 35 #include <sys/atomic.h> 36 #include <sys/sdt.h> 37 #include <sys/dkio.h> 38 #include <sys/dmu.h> 39 #include <sys/arc.h> 40 #include <sys/zvol.h> 41 #include <sys/zfs_rlock.h> 42 43 #include <sys/stmf.h> 44 #include <sys/lpif.h> 45 #include <sys/portif.h> 46 #include <sys/stmf_ioctl.h> 47 #include <sys/stmf_sbd_ioctl.h> 48 49 #include "stmf_sbd.h" 50 #include "sbd_impl.h" 51 52 53 /* 54 * This file contains direct calls into the zfs module. 55 * These functions mimic zvol_read and zvol_write except pointers 56 * to the data buffers are passed instead of copying the data itself. 
 *
 * zfs internal interfaces referenced here:
 *
 * FUNCTIONS
 *    dmu_buf_hold_array_by_bonus()
 *    dmu_buf_rele_array()
 *
 *    dmu_request_arcbuf()
 *    dmu_assign_arcbuf()
 *    dmu_return_arcbuf()
 *    arc_buf_size()
 *
 *    dmu_tx_create()
 *    dmu_tx_hold_write()
 *    dmu_tx_assign()
 *    dmu_tx_commit(tx)
 *    dmu_tx_abort(tx)
 *    zil_commit()
 *
 *    zfs_range_lock()
 *    zfs_range_unlock()
 *
 *    zvol_log_write_minor()
 *
 *    dmu_read_uio()
 *    dmu_write_uio()
 * MINOR DATA
 *    zv_volsize
 *    zv_volblocksize
 *    zv_flags - for WCE
 *    zv_objset - dmu_tx_create
 *    zv_zilog - zil_commit
 *    zv_znode - zfs_range_lock
 *    zv_dbuf - dmu_buf_hold_array_by_bonus, dmu_request_arcbuf
 * GLOBAL DATA
 *    zvol_maxphys
 */

/*
 * Take direct control of the volume instead of using the driver
 * interfaces provided by zvol.c. Gather parameters and handles
 * needed to make direct calls into zfs/dmu/zvol. The driver is
 * opened exclusively at this point, so these parameters cannot change.
 *
 * NOTE: the object size and WCE can change while the device
 * is open, so they must be fetched for every operation.
103 */ 104 int 105 sbd_zvol_get_volume_params(sbd_lu_t *sl) 106 { 107 int ret; 108 109 ret = zvol_get_volume_params(sl->sl_zvol_minor, 110 &sl->sl_blksize, /* volume block size */ 111 &sl->sl_max_xfer_len, /* max data chunk size */ 112 &sl->sl_zvol_minor_hdl, /* minor soft state */ 113 &sl->sl_zvol_objset_hdl, /* dmu_tx_create */ 114 &sl->sl_zvol_zil_hdl, /* zil_commit */ 115 &sl->sl_zvol_rl_hdl, /* zfs_range_lock */ 116 &sl->sl_zvol_bonus_hdl); /* dmu_buf_hold_array_by_bonus, */ 117 /* dmu_request_arcbuf, */ 118 /* dmu_assign_arcbuf */ 119 120 if (ret == 0 && sl->sl_blksize < MMU_PAGESIZE) { 121 cmn_err(CE_NOTE, "COMSTAR reduced copy disabled due to " 122 "small zvol blocksize (%d)\n", (int)sl->sl_blksize); 123 ret = ENOTSUP; 124 } 125 126 return (ret); 127 } 128 129 /* 130 * Return the number of elements in a scatter/gather list required for 131 * the given span in the zvol. Elements are 1:1 with zvol blocks. 132 */ 133 uint32_t 134 sbd_zvol_numsegs(sbd_lu_t *sl, uint64_t off, uint32_t len) 135 { 136 uint64_t blksz = sl->sl_blksize; 137 uint64_t endoff = off + len; 138 uint64_t numsegs; 139 140 numsegs = (P2ROUNDUP(endoff, blksz) - P2ALIGN(off, blksz)) / blksz; 141 return ((uint32_t)numsegs); 142 } 143 144 /* 145 * Return an array of dmu_buf_t pointers for the requested range. 146 * The dmu buffers are either in cache or read in synchronously. 147 * Fill in the dbuf sglist from the dmu_buf_t array. 
148 */ 149 static void *RDTAG = "sbd_zvol_read"; 150 151 int 152 sbd_zvol_alloc_read_bufs(sbd_lu_t *sl, stmf_data_buf_t *dbuf) 153 { 154 sbd_zvol_io_t *zvio = dbuf->db_lu_private; 155 rl_t *rl; 156 int numbufs, error; 157 uint64_t len = dbuf->db_data_size; 158 uint64_t offset = zvio->zvio_offset; 159 dmu_buf_t **dbpp, *dbp; 160 161 /* Make sure request is reasonable */ 162 if (len > sl->sl_max_xfer_len) 163 return (E2BIG); 164 if (offset + len > zvol_get_volume_size(sl->sl_zvol_minor_hdl)) 165 return (EIO); 166 167 /* 168 * The range lock is only held until the dmu buffers read in and 169 * held; not during the callers use of the data. 170 */ 171 rl = zfs_range_lock(sl->sl_zvol_rl_hdl, offset, len, RL_READER); 172 173 error = dmu_buf_hold_array_by_bonus(sl->sl_zvol_bonus_hdl, offset, 174 len, TRUE, RDTAG, &numbufs, &dbpp); 175 176 zfs_range_unlock(rl); 177 178 if (error == ECKSUM) 179 error = EIO; 180 181 if (error == 0) { 182 /* 183 * Fill in db_sglist from the dmu_buf_t array. 184 */ 185 int i; 186 stmf_sglist_ent_t *sgl; 187 uint64_t odiff, seglen; 188 189 zvio->zvio_dbp = dbpp; 190 /* make sure db_sglist is large enough */ 191 if (dbuf->db_sglist_length != numbufs) { 192 cmn_err(CE_PANIC, "wrong size sglist: dbuf %d != %d\n", 193 dbuf->db_sglist_length, numbufs); 194 } 195 196 sgl = &dbuf->db_sglist[0]; 197 for (i = 0; i < numbufs; i++) { 198 dbp = dbpp[i]; 199 odiff = offset - dbp->db_offset; 200 ASSERT(odiff == 0 || i == 0); 201 sgl->seg_addr = (uint8_t *)dbp->db_data + odiff; 202 seglen = MIN(len, dbp->db_size - odiff); 203 sgl->seg_length = (uint32_t)seglen; 204 offset += seglen; 205 len -= seglen; 206 sgl++; 207 } 208 ASSERT(len == 0); 209 210 } 211 return (error); 212 } 213 214 /* 215 * Release a dmu_buf_t array. 
 */
/*ARGSUSED*/
void
sbd_zvol_rele_read_bufs(sbd_lu_t *sl, stmf_data_buf_t *dbuf)
{
	sbd_zvol_io_t *zvio = dbuf->db_lu_private;

	/* A held dmu_buf_t array from sbd_zvol_alloc_read_bufs() required */
	ASSERT(zvio->zvio_dbp);
	ASSERT(dbuf->db_sglist_length);

	/* Drop the hold taken under RDTAG; one buffer per sglist entry. */
	dmu_buf_rele_array(zvio->zvio_dbp, (int)dbuf->db_sglist_length, RDTAG);
}

/*
 * Allocate enough loaned arc buffers for the requested region.
 * Mimic the handling of the dmu_buf_t array used for reads as closely
 * as possible even though the arc_buf_t's are anonymous until released.
 * The buffers will match the zvol object blocks sizes and alignments
 * such that a data copy may be avoided when the buffers are assigned.
 */
int
sbd_zvol_alloc_write_bufs(sbd_lu_t *sl, stmf_data_buf_t *dbuf)
{
	sbd_zvol_io_t *zvio = dbuf->db_lu_private;
	int blkshift, numbufs, i;
	uint64_t blksize;
	arc_buf_t **abp;
	stmf_sglist_ent_t *sgl;
	uint64_t len = dbuf->db_data_size;
	uint64_t offset = zvio->zvio_offset;

	/* Make sure request is reasonable */
	if (len > sl->sl_max_xfer_len)
		return (E2BIG);
	if (offset + len > zvol_get_volume_size(sl->sl_zvol_minor_hdl))
		return (EIO);

	/*
	 * Break up the request into chunks to match
	 * the volume block size. Only full, and aligned
	 * buffers will avoid the data copy in the dmu.
	 */
	/*
	 * calculate how many dbufs are needed
	 */
	blksize = sl->sl_blksize;
	ASSERT(ISP2(blksize));
	blkshift = highbit(blksize - 1);
	/*
	 * taken from dmu_buf_hold_array_by_dnode()
	 */
	numbufs = (P2ROUNDUP(offset+len, 1ULL<<blkshift) -
	    P2ALIGN(offset, 1ULL<<blkshift)) >> blkshift;
	/* The caller must have sized db_sglist to match the block count. */
	if (dbuf->db_sglist_length != numbufs) {
		cmn_err(CE_PANIC, "wrong size sglist: dbuf %d != %d\n",
		    dbuf->db_sglist_length, numbufs);
	}
	/*
	 * allocate a holder for the needed arc_buf pointers
	 */
	abp = kmem_alloc(sizeof (arc_buf_t *) * numbufs, KM_SLEEP);
	/*
	 * The write operation uses loaned arc buffers so that
	 * the xfer_data is done outside of a dmu transaction.
	 * These buffers will exactly match the request unlike
	 * the dmu buffers obtained from the read operation.
	 */
	/*
	 * allocate the arc buffers and fill in the stmf sglist
	 */
	sgl = &dbuf->db_sglist[0];
	for (i = 0; i < numbufs; i++) {
		uint64_t seglen;

		/* first block may not be aligned */
		seglen = P2NPHASE(offset, blksize);
		if (seglen == 0)
			seglen = blksize;
		seglen = MIN(seglen, len);
		abp[i] = dmu_request_arcbuf(sl->sl_zvol_bonus_hdl, (int)seglen);
		ASSERT(arc_buf_size(abp[i]) == (int)seglen);
		sgl->seg_addr = abp[i]->b_data;
		sgl->seg_length = (uint32_t)seglen;
		sgl++;
		offset += seglen;
		len -= seglen;
	}
	ASSERT(len == 0);

	zvio->zvio_abp = abp;
	return (0);
}

/*
 * Return (rather than assign) every loaned arc buffer and free the
 * pointer array; used when a write is abandoned before the dmu
 * transaction stage.
 */
/*ARGSUSED*/
void
sbd_zvol_rele_write_bufs_abort(sbd_lu_t *sl, stmf_data_buf_t *dbuf)
{
	sbd_zvol_io_t *zvio = dbuf->db_lu_private;
	int i;
	arc_buf_t **abp = zvio->zvio_abp;

	/* free arcbufs */
	for (i = 0; i < dbuf->db_sglist_length; i++)
		dmu_return_arcbuf(*abp++);
	kmem_free(zvio->zvio_abp,
	    sizeof (arc_buf_t *) * dbuf->db_sglist_length);
	zvio->zvio_abp = NULL;
}

/*
 * Release the arc_buf_t array allocated above and handle these
 * cases:
 *
 * flags == 0 - create transaction and assign all arc bufs to offsets
 * flags == ZVIO_COMMIT - same as above and commit to zil on sync devices
 */
int
sbd_zvol_rele_write_bufs(sbd_lu_t *sl, stmf_data_buf_t *dbuf)
{
	sbd_zvol_io_t *zvio = dbuf->db_lu_private;
	dmu_tx_t *tx;
	int sync, i, error;
	rl_t *rl;
	arc_buf_t **abp = zvio->zvio_abp;
	int flags = zvio->zvio_flags;
	uint64_t toffset, offset = zvio->zvio_offset;
	uint64_t resid, len = dbuf->db_data_size;

	ASSERT(flags == 0 || flags == ZVIO_COMMIT || flags == ZVIO_ABORT);

	/* Writer range lock is held across tx assign through commit. */
	rl = zfs_range_lock(sl->sl_zvol_rl_hdl, offset, len, RL_WRITER);

	tx = dmu_tx_create(sl->sl_zvol_objset_hdl);
	dmu_tx_hold_write(tx, ZVOL_OBJ, offset, (int)len);
	error = dmu_tx_assign(tx, TXG_WAIT);

	if (error) {
		/*
		 * Transaction could not be assigned: abort it, drop the
		 * lock, and hand the loaned buffers back to the arc.
		 */
		dmu_tx_abort(tx);
		zfs_range_unlock(rl);
		sbd_zvol_rele_write_bufs_abort(sl, dbuf);
		return (error);
	}

	/* Assign each loaned arc buffer to its offset within the volume. */
	toffset = offset;
	resid = len;
	for (i = 0; i < dbuf->db_sglist_length; i++) {
		arc_buf_t *abuf;
		int size;

		abuf = abp[i];
		size = arc_buf_size(abuf);
		dmu_assign_arcbuf(sl->sl_zvol_bonus_hdl, toffset, abuf, tx);
		toffset += size;
		resid -= size;
	}
	ASSERT(resid == 0);

	/* Log synchronously when write caching is disabled on the zvol. */
	sync = !zvol_get_volume_wce(sl->sl_zvol_minor_hdl);
	zvol_log_write_minor(sl->sl_zvol_minor_hdl, tx, offset,
	    (ssize_t)len, sync);
	dmu_tx_commit(tx);
	zfs_range_unlock(rl);
	/* The arc buffers now belong to the dmu; only free the array. */
	kmem_free(zvio->zvio_abp,
	    sizeof (arc_buf_t *) * dbuf->db_sglist_length);
	zvio->zvio_abp = NULL;
	if (sync && (flags & ZVIO_COMMIT))
		zil_commit(sl->sl_zvol_zil_hdl, UINT64_MAX, ZVOL_OBJ);
	return (0);
}

/*
 * Copy interface for callers using direct zvol access.
 * Very similar to zvol_read but the uio may have multiple iovec entries.
388 */ 389 int 390 sbd_zvol_copy_read(sbd_lu_t *sl, uio_t *uio) 391 { 392 int error; 393 rl_t *rl; 394 uint64_t len = (uint64_t)uio->uio_resid; 395 uint64_t offset = (uint64_t)uio->uio_loffset; 396 397 /* Make sure request is reasonable */ 398 if (len > sl->sl_max_xfer_len) 399 return (E2BIG); 400 if (offset + len > zvol_get_volume_size(sl->sl_zvol_minor_hdl)) 401 return (EIO); 402 403 rl = zfs_range_lock(sl->sl_zvol_rl_hdl, offset, len, RL_READER); 404 405 error = dmu_read_uio(sl->sl_zvol_objset_hdl, ZVOL_OBJ, uio, len); 406 407 zfs_range_unlock(rl); 408 if (error == ECKSUM) 409 error = EIO; 410 return (error); 411 } 412 413 /* 414 * Copy interface for callers using direct zvol access. 415 * Very similar to zvol_write but the uio may have multiple iovec entries. 416 */ 417 int 418 sbd_zvol_copy_write(sbd_lu_t *sl, uio_t *uio, int flags) 419 { 420 rl_t *rl; 421 dmu_tx_t *tx; 422 int error, sync; 423 uint64_t len = (uint64_t)uio->uio_resid; 424 uint64_t offset = (uint64_t)uio->uio_loffset; 425 426 ASSERT(flags == 0 || flags == ZVIO_COMMIT); 427 428 /* Make sure request is reasonable */ 429 if (len > sl->sl_max_xfer_len) 430 return (E2BIG); 431 if (offset + len > zvol_get_volume_size(sl->sl_zvol_minor_hdl)) 432 return (EIO); 433 434 rl = zfs_range_lock(sl->sl_zvol_rl_hdl, offset, len, RL_WRITER); 435 436 sync = !zvol_get_volume_wce(sl->sl_zvol_minor_hdl); 437 438 tx = dmu_tx_create(sl->sl_zvol_objset_hdl); 439 dmu_tx_hold_write(tx, ZVOL_OBJ, offset, (int)uio->uio_resid); 440 error = dmu_tx_assign(tx, TXG_WAIT); 441 if (error) { 442 dmu_tx_abort(tx); 443 } else { 444 /* 445 * XXX use the new bonus handle entry. 
446 */ 447 error = dmu_write_uio(sl->sl_zvol_objset_hdl, ZVOL_OBJ, 448 uio, len, tx); 449 if (error == 0) { 450 zvol_log_write_minor(sl->sl_zvol_minor_hdl, tx, offset, 451 (ssize_t)len, sync); 452 } 453 dmu_tx_commit(tx); 454 } 455 zfs_range_unlock(rl); 456 if (sync && (flags & ZVIO_COMMIT)) 457 zil_commit(sl->sl_zvol_zil_hdl, UINT64_MAX, ZVOL_OBJ); 458 if (error == ECKSUM) 459 error = EIO; 460 return (error); 461 } 462