1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright (c) 2014, 2018 by Delphix. All rights reserved.
24 */
25
26 #include <sys/conf.h>
27 #include <sys/file.h>
28 #include <sys/ddi.h>
29 #include <sys/sunddi.h>
30 #include <sys/modctl.h>
31 #include <sys/scsi/scsi.h>
32 #include <sys/scsi/impl/scsi_reset_notify.h>
33 #include <sys/scsi/generic/mode.h>
34 #include <sys/disp.h>
35 #include <sys/byteorder.h>
36 #include <sys/atomic.h>
37 #include <sys/sdt.h>
38 #include <sys/dkio.h>
39 #include <sys/dmu.h>
40 #include <sys/arc.h>
41 #include <sys/zvol.h>
42 #include <sys/zfs_rlock.h>
43 #include <sys/zil.h>
44
45 #include <sys/stmf.h>
46 #include <sys/lpif.h>
47 #include <sys/portif.h>
48 #include <sys/stmf_ioctl.h>
49 #include <sys/stmf_sbd_ioctl.h>
50
51 #include "stmf_sbd.h"
52 #include "sbd_impl.h"
53
54
55 /*
56 * This file contains direct calls into the zfs module.
57 * These functions mimic zvol_read and zvol_write except pointers
58 * to the data buffers are passed instead of copying the data itself.
59 *
60 * zfs internal interfaces referenced here:
61 *
62 * FUNCTIONS
63 * dmu_buf_hold_array_by_dnode()
64 * dmu_buf_rele_array()
65 *
66 * arc_loan_buf()
67 * dmu_assign_arcbuf_by_dnode()
68 * dmu_return_arcbuf()
69 * arc_buf_size()
70 *
71 * dmu_tx_create()
72 * dmu_tx_hold_write()
73 * dmu_tx_assign()
74 * dmu_tx_commit(tx)
75 * dmu_tx_abort(tx)
76 * zil_commit()
77 *
78 * rangelock_enter()
79 * rangelock_exit()
80 *
81 * zvol_log_write_minor()
82 *
83 * dmu_read_uio_dnode()
84 * dmu_write_uio_dnode()
85 * MINOR DATA
86 * zv_volsize
87 * zv_volblocksize
88 * zv_flags - for WCE
89 * zv_objset - dmu_tx_create
90 * zv_zilog - zil_commit
91 * zv_znode - rangelock_enter
92 * zv_dn - dmu_buf_hold_array_by_dnode, dmu_request_arcbuf
93 * GLOBAL DATA
94 * zvol_maxphys
95 */
96
97 /*
98 * Take direct control of the volume instead of using the driver
99 * interfaces provided by zvol.c. Gather parameters and handles
100 * needed to make direct calls into zfs/dmu/zvol. The driver is
101 * opened exclusively at this point, so these parameters cannot change.
102 *
103 * NOTE: the object size and WCE can change while the device
104 * is open, so they must be fetched for every operation.
105 */
106 int
sbd_zvol_get_volume_params(sbd_lu_t * sl)107 sbd_zvol_get_volume_params(sbd_lu_t *sl)
108 {
109 int ret;
110
111 ret = zvol_get_volume_params(sl->sl_zvol_minor,
112 &sl->sl_blksize, /* volume block size */
113 &sl->sl_max_xfer_len, /* max data chunk size */
114 &sl->sl_zvol_minor_hdl, /* minor soft state */
115 &sl->sl_zvol_objset_hdl, /* dmu_tx_create */
116 &sl->sl_zvol_zil_hdl, /* zil_commit */
117 &sl->sl_zvol_rl_hdl, /* locked_range_t */
118 &sl->sl_zvol_dn_hdl); /* dmu_buf_hold_array_by_dnode, */
119 /* dmu_request_arcbuf, */
120 /* dmu_assign_arcbuf */
121
122 if (ret == 0 && sl->sl_blksize < MMU_PAGESIZE) {
123 cmn_err(CE_NOTE, "COMSTAR reduced copy disabled due to "
124 "small zvol blocksize (%d)\n", (int)sl->sl_blksize);
125 ret = ENOTSUP;
126 }
127
128 return (ret);
129 }
130
131 /*
132 * Return the number of elements in a scatter/gather list required for
133 * the given span in the zvol. Elements are 1:1 with zvol blocks.
134 */
135 uint32_t
sbd_zvol_numsegs(sbd_lu_t * sl,uint64_t off,uint32_t len)136 sbd_zvol_numsegs(sbd_lu_t *sl, uint64_t off, uint32_t len)
137 {
138 uint64_t blksz = sl->sl_blksize;
139 uint64_t endoff = off + len;
140 uint64_t numsegs;
141
142 numsegs = (P2ROUNDUP(endoff, blksz) - P2ALIGN(off, blksz)) / blksz;
143 return ((uint32_t)numsegs);
144 }
145
146 /*
147 * Return an array of dmu_buf_t pointers for the requested range.
148 * The dmu buffers are either in cache or read in synchronously.
149 * Fill in the dbuf sglist from the dmu_buf_t array.
150 */
151 static void *RDTAG = "sbd_zvol_read";
152
153 int
sbd_zvol_alloc_read_bufs(sbd_lu_t * sl,stmf_data_buf_t * dbuf)154 sbd_zvol_alloc_read_bufs(sbd_lu_t *sl, stmf_data_buf_t *dbuf)
155 {
156 sbd_zvol_io_t *zvio = dbuf->db_lu_private;
157 locked_range_t *lr;
158 int numbufs, error;
159 uint64_t len = dbuf->db_data_size;
160 uint64_t offset = zvio->zvio_offset;
161 dmu_buf_t **dbpp, *dbp;
162
163 /* Make sure request is reasonable */
164 if (len > sl->sl_max_xfer_len)
165 return (E2BIG);
166 if (offset + len > zvol_get_volume_size(sl->sl_zvol_minor_hdl))
167 return (EIO);
168
169 /*
170 * The range lock is only held until the dmu buffers read in and
171 * held; not during the callers use of the data.
172 */
173 lr = rangelock_enter(sl->sl_zvol_rl_hdl, offset, len, RL_READER);
174
175 error = dmu_buf_hold_array_by_dnode(sl->sl_zvol_dn_hdl,
176 offset, len, TRUE, RDTAG, &numbufs, &dbpp,
177 DMU_READ_PREFETCH);
178
179 rangelock_exit(lr);
180
181 if (error == ECKSUM)
182 error = EIO;
183
184 if (error == 0) {
185 /*
186 * Fill in db_sglist from the dmu_buf_t array.
187 */
188 int i;
189 stmf_sglist_ent_t *sgl;
190 uint64_t odiff, seglen;
191
192 zvio->zvio_dbp = dbpp;
193 /* make sure db_sglist is large enough */
194 if (dbuf->db_sglist_length != numbufs) {
195 cmn_err(CE_PANIC, "wrong size sglist: dbuf %d != %d\n",
196 dbuf->db_sglist_length, numbufs);
197 }
198
199 sgl = &dbuf->db_sglist[0];
200 for (i = 0; i < numbufs; i++) {
201 dbp = dbpp[i];
202 odiff = offset - dbp->db_offset;
203 ASSERT(odiff == 0 || i == 0);
204 sgl->seg_addr = (uint8_t *)dbp->db_data + odiff;
205 seglen = MIN(len, dbp->db_size - odiff);
206 sgl->seg_length = (uint32_t)seglen;
207 offset += seglen;
208 len -= seglen;
209 sgl++;
210 }
211 ASSERT(len == 0);
212
213 }
214 return (error);
215 }
216
217 /*
218 * Release a dmu_buf_t array.
219 */
220 /*ARGSUSED*/
221 void
sbd_zvol_rele_read_bufs(sbd_lu_t * sl,stmf_data_buf_t * dbuf)222 sbd_zvol_rele_read_bufs(sbd_lu_t *sl, stmf_data_buf_t *dbuf)
223 {
224 sbd_zvol_io_t *zvio = dbuf->db_lu_private;
225
226 ASSERT(zvio->zvio_dbp);
227 ASSERT(dbuf->db_sglist_length);
228
229 dmu_buf_rele_array(zvio->zvio_dbp, (int)dbuf->db_sglist_length, RDTAG);
230 }
231
232 /*
233 * Allocate enough loaned arc buffers for the requested region.
234 * Mimic the handling of the dmu_buf_t array used for reads as closely
235 * as possible even though the arc_buf_t's are anonymous until released.
236 * The buffers will match the zvol object blocks sizes and alignments
237 * such that a data copy may be avoided when the buffers are assigned.
238 */
239 int
sbd_zvol_alloc_write_bufs(sbd_lu_t * sl,stmf_data_buf_t * dbuf)240 sbd_zvol_alloc_write_bufs(sbd_lu_t *sl, stmf_data_buf_t *dbuf)
241 {
242 sbd_zvol_io_t *zvio = dbuf->db_lu_private;
243 int blkshift, numbufs, i;
244 uint64_t blksize;
245 arc_buf_t **abp;
246 stmf_sglist_ent_t *sgl;
247 uint64_t len = dbuf->db_data_size;
248 uint64_t offset = zvio->zvio_offset;
249
250 /* Make sure request is reasonable */
251 if (len > sl->sl_max_xfer_len)
252 return (E2BIG);
253 if (offset + len > zvol_get_volume_size(sl->sl_zvol_minor_hdl))
254 return (EIO);
255
256 /*
257 * Break up the request into chunks to match
258 * the volume block size. Only full, and aligned
259 * buffers will avoid the data copy in the dmu.
260 */
261 /*
262 * calculate how may dbufs are needed
263 */
264 blksize = sl->sl_blksize;
265 ASSERT(ISP2(blksize));
266 blkshift = highbit(blksize - 1);
267 /*
268 * taken from dmu_buf_hold_array_by_dnode()
269 */
270 numbufs = (P2ROUNDUP(offset+len, 1ULL<<blkshift) -
271 P2ALIGN(offset, 1ULL<<blkshift)) >> blkshift;
272 if (dbuf->db_sglist_length != numbufs) {
273 cmn_err(CE_PANIC, "wrong size sglist: dbuf %d != %d\n",
274 dbuf->db_sglist_length, numbufs);
275 }
276 /*
277 * allocate a holder for the needed arc_buf pointers
278 */
279 abp = kmem_alloc(sizeof (arc_buf_t *) * numbufs, KM_SLEEP);
280 /*
281 * The write operation uses loaned arc buffers so that
282 * the xfer_data is done outside of a dmu transaction.
283 * These buffers will exactly match the request unlike
284 * the dmu buffers obtained from the read operation.
285 */
286 /*
287 * allocate the arc buffers and fill in the stmf sglist
288 */
289 sgl = &dbuf->db_sglist[0];
290 for (i = 0; i < numbufs; i++) {
291 uint64_t seglen;
292
293 /* first block may not be aligned */
294 seglen = P2NPHASE(offset, blksize);
295 if (seglen == 0)
296 seglen = blksize;
297 seglen = MIN(seglen, len);
298 abp[i] = arc_loan_buf(dmu_objset_spa(sl->sl_zvol_objset_hdl),
299 B_FALSE, (int)seglen);
300 ASSERT(arc_buf_size(abp[i]) == (int)seglen);
301 sgl->seg_addr = abp[i]->b_data;
302 sgl->seg_length = (uint32_t)seglen;
303 sgl++;
304 offset += seglen;
305 len -= seglen;
306 }
307 ASSERT(len == 0);
308
309 zvio->zvio_abp = abp;
310 return (0);
311 }
312
313 /*ARGSUSED*/
314 void
sbd_zvol_rele_write_bufs_abort(sbd_lu_t * sl,stmf_data_buf_t * dbuf)315 sbd_zvol_rele_write_bufs_abort(sbd_lu_t *sl, stmf_data_buf_t *dbuf)
316 {
317 sbd_zvol_io_t *zvio = dbuf->db_lu_private;
318 int i;
319 arc_buf_t **abp = zvio->zvio_abp;
320
321 /* free arcbufs */
322 for (i = 0; i < dbuf->db_sglist_length; i++)
323 dmu_return_arcbuf(*abp++);
324 kmem_free(zvio->zvio_abp,
325 sizeof (arc_buf_t *) * dbuf->db_sglist_length);
326 zvio->zvio_abp = NULL;
327 }
328
329 /*
330 * Release the arc_buf_t array allocated above and handle these cases :
331 *
332 * flags == 0 - create transaction and assign all arc bufs to offsets
333 * flags == ZVIO_COMMIT - same as above and commit to zil on sync devices
334 */
335 int
sbd_zvol_rele_write_bufs(sbd_lu_t * sl,stmf_data_buf_t * dbuf)336 sbd_zvol_rele_write_bufs(sbd_lu_t *sl, stmf_data_buf_t *dbuf)
337 {
338 sbd_zvol_io_t *zvio = dbuf->db_lu_private;
339 dmu_tx_t *tx;
340 int sync, i, error;
341 locked_range_t *lr;
342 arc_buf_t **abp = zvio->zvio_abp;
343 int flags = zvio->zvio_flags;
344 uint64_t toffset, offset = zvio->zvio_offset;
345 uint64_t resid, len = dbuf->db_data_size;
346
347 ASSERT(flags == 0 || flags == ZVIO_COMMIT || flags == ZVIO_ABORT);
348
349 lr = rangelock_enter(sl->sl_zvol_rl_hdl, offset, len, RL_WRITER);
350
351 tx = dmu_tx_create(sl->sl_zvol_objset_hdl);
352 dmu_tx_hold_write(tx, ZVOL_OBJ, offset, (int)len);
353 error = dmu_tx_assign(tx, TXG_WAIT);
354
355 if (error) {
356 dmu_tx_abort(tx);
357 rangelock_exit(lr);
358 sbd_zvol_rele_write_bufs_abort(sl, dbuf);
359 return (error);
360 }
361
362 toffset = offset;
363 resid = len;
364 for (i = 0; i < dbuf->db_sglist_length; i++) {
365 arc_buf_t *abuf;
366 int size;
367
368 abuf = abp[i];
369 size = arc_buf_size(abuf);
370 (void) dmu_assign_arcbuf_by_dnode(sl->sl_zvol_dn_hdl,
371 toffset, abuf, tx);
372 toffset += size;
373 resid -= size;
374 }
375 ASSERT(resid == 0);
376
377 sync = !zvol_get_volume_wce(sl->sl_zvol_minor_hdl);
378 zvol_log_write_minor(sl->sl_zvol_minor_hdl, tx, offset,
379 (ssize_t)len, sync);
380 dmu_tx_commit(tx);
381 rangelock_exit(lr);
382 kmem_free(zvio->zvio_abp,
383 sizeof (arc_buf_t *) * dbuf->db_sglist_length);
384 zvio->zvio_abp = NULL;
385 if (sync && (flags & ZVIO_COMMIT))
386 zil_commit(sl->sl_zvol_zil_hdl, ZVOL_OBJ);
387 return (0);
388 }
389
390 /*
391 * Copy interface for callers using direct zvol access.
392 * Very similar to zvol_read but the uio may have multiple iovec entries.
393 */
394 int
sbd_zvol_copy_read(sbd_lu_t * sl,uio_t * uio)395 sbd_zvol_copy_read(sbd_lu_t *sl, uio_t *uio)
396 {
397 uint64_t len = (uint64_t)uio->uio_resid;
398 uint64_t offset = (uint64_t)uio->uio_loffset;
399
400 /* Make sure request is reasonable */
401 if (len > sl->sl_max_xfer_len)
402 return (E2BIG);
403 if (offset + len > zvol_get_volume_size(sl->sl_zvol_minor_hdl))
404 return (EIO);
405
406 locked_range_t *lr = rangelock_enter(sl->sl_zvol_rl_hdl, offset, len,
407 RL_READER);
408 int error = dmu_read_uio_dnode(sl->sl_zvol_dn_hdl, uio, len);
409 rangelock_exit(lr);
410
411 if (error == ECKSUM)
412 error = EIO;
413 return (error);
414 }
415
416 /*
417 * Copy interface for callers using direct zvol access.
418 * Very similar to zvol_write but the uio may have multiple iovec entries.
419 */
420 int
sbd_zvol_copy_write(sbd_lu_t * sl,uio_t * uio,int flags)421 sbd_zvol_copy_write(sbd_lu_t *sl, uio_t *uio, int flags)
422 {
423 dmu_tx_t *tx;
424 int error, sync;
425 uint64_t len = (uint64_t)uio->uio_resid;
426 uint64_t offset = (uint64_t)uio->uio_loffset;
427
428 ASSERT(flags == 0 || flags == ZVIO_COMMIT);
429
430 /* Make sure request is reasonable */
431 if (len > sl->sl_max_xfer_len)
432 return (E2BIG);
433 if (offset + len > zvol_get_volume_size(sl->sl_zvol_minor_hdl))
434 return (EIO);
435
436 locked_range_t *lr = rangelock_enter(sl->sl_zvol_rl_hdl, offset, len,
437 RL_WRITER);
438 sync = !zvol_get_volume_wce(sl->sl_zvol_minor_hdl);
439
440 tx = dmu_tx_create(sl->sl_zvol_objset_hdl);
441 dmu_tx_hold_write(tx, ZVOL_OBJ, offset, (int)uio->uio_resid);
442 error = dmu_tx_assign(tx, TXG_WAIT);
443 if (error) {
444 dmu_tx_abort(tx);
445 } else {
446 error = dmu_write_uio_dnode(sl->sl_zvol_dn_hdl, uio, len, tx);
447 if (error == 0) {
448 zvol_log_write_minor(sl->sl_zvol_minor_hdl, tx, offset,
449 (ssize_t)len, sync);
450 }
451 dmu_tx_commit(tx);
452 }
453 rangelock_exit(lr);
454
455 if (sync && (flags & ZVIO_COMMIT))
456 zil_commit(sl->sl_zvol_zil_hdl, ZVOL_OBJ);
457 if (error == ECKSUM)
458 error = EIO;
459 return (error);
460 }
461