1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright (c) 2014 by Delphix. All rights reserved.
24 */
25
26 #include <sys/conf.h>
27 #include <sys/file.h>
28 #include <sys/ddi.h>
29 #include <sys/sunddi.h>
30 #include <sys/modctl.h>
31 #include <sys/scsi/scsi.h>
32 #include <sys/scsi/impl/scsi_reset_notify.h>
33 #include <sys/scsi/generic/mode.h>
34 #include <sys/disp.h>
35 #include <sys/byteorder.h>
36 #include <sys/atomic.h>
37 #include <sys/sdt.h>
38 #include <sys/dkio.h>
39 #include <sys/dmu.h>
40 #include <sys/arc.h>
41 #include <sys/zvol.h>
42 #include <sys/zfs_rlock.h>
43
44 #include <sys/stmf.h>
45 #include <sys/lpif.h>
46 #include <sys/portif.h>
47 #include <sys/stmf_ioctl.h>
48 #include <sys/stmf_sbd_ioctl.h>
49
50 #include "stmf_sbd.h"
51 #include "sbd_impl.h"
52
53
54 /*
55 * This file contains direct calls into the zfs module.
56 * These functions mimic zvol_read and zvol_write except pointers
57 * to the data buffers are passed instead of copying the data itself.
58 *
59 * zfs internal interfaces referenced here:
60 *
61 * FUNCTIONS
62 * dmu_buf_hold_array_by_bonus()
63 * dmu_buf_rele_array()
64 *
 *	dmu_request_arcbuf()
 *	dmu_assign_arcbuf()
 *	dmu_return_arcbuf()
68 * arc_buf_size()
69 *
70 * dmu_tx_create()
71 * dmu_tx_hold_write()
72 * dmu_tx_assign()
73 * dmu_tx_commit(tx)
74 * dmu_tx_abort(tx)
75 * zil_commit()
76 *
77 * zfs_range_lock()
78 * zfs_range_unlock()
79 *
80 * zvol_log_write()
81 *
 *	dmu_read_uio_dbuf()
 *	dmu_write_uio_dbuf()
84 * MINOR DATA
85 * zv_volsize
86 * zv_volblocksize
87 * zv_flags - for WCE
88 * zv_objset - dmu_tx_create
89 * zv_zilog - zil_commit
90 * zv_znode - zfs_range_lock
91 * zv_dbuf - dmu_buf_hold_array_by_bonus, dmu_request_arcbuf
92 * GLOBAL DATA
93 * zvol_maxphys
94 */
95
96 /*
97 * Take direct control of the volume instead of using the driver
98 * interfaces provided by zvol.c. Gather parameters and handles
99 * needed to make direct calls into zfs/dmu/zvol. The driver is
100 * opened exclusively at this point, so these parameters cannot change.
101 *
102 * NOTE: the object size and WCE can change while the device
103 * is open, so they must be fetched for every operation.
104 */
105 int
sbd_zvol_get_volume_params(sbd_lu_t * sl)106 sbd_zvol_get_volume_params(sbd_lu_t *sl)
107 {
108 int ret;
109
110 ret = zvol_get_volume_params(sl->sl_zvol_minor,
111 &sl->sl_blksize, /* volume block size */
112 &sl->sl_max_xfer_len, /* max data chunk size */
113 &sl->sl_zvol_minor_hdl, /* minor soft state */
114 &sl->sl_zvol_objset_hdl, /* dmu_tx_create */
115 &sl->sl_zvol_zil_hdl, /* zil_commit */
116 &sl->sl_zvol_rl_hdl, /* zfs_range_lock */
117 &sl->sl_zvol_bonus_hdl); /* dmu_buf_hold_array_by_bonus, */
118 /* dmu_request_arcbuf, */
119 /* dmu_assign_arcbuf */
120
121 if (ret == 0 && sl->sl_blksize < MMU_PAGESIZE) {
122 cmn_err(CE_NOTE, "COMSTAR reduced copy disabled due to "
123 "small zvol blocksize (%d)\n", (int)sl->sl_blksize);
124 ret = ENOTSUP;
125 }
126
127 return (ret);
128 }
129
130 /*
131 * Return the number of elements in a scatter/gather list required for
132 * the given span in the zvol. Elements are 1:1 with zvol blocks.
133 */
134 uint32_t
sbd_zvol_numsegs(sbd_lu_t * sl,uint64_t off,uint32_t len)135 sbd_zvol_numsegs(sbd_lu_t *sl, uint64_t off, uint32_t len)
136 {
137 uint64_t blksz = sl->sl_blksize;
138 uint64_t endoff = off + len;
139 uint64_t numsegs;
140
141 numsegs = (P2ROUNDUP(endoff, blksz) - P2ALIGN(off, blksz)) / blksz;
142 return ((uint32_t)numsegs);
143 }
144
/*
 * Return an array of dmu_buf_t pointers for the requested range.
 * The dmu buffers are either in cache or read in synchronously.
 * Fill in the dbuf sglist from the dmu_buf_t array.
 */
/* Tag identifying the holds taken on read dbufs; released with the same tag. */
static void *RDTAG = "sbd_zvol_read";

int
sbd_zvol_alloc_read_bufs(sbd_lu_t *sl, stmf_data_buf_t *dbuf)
{
	sbd_zvol_io_t *zvio = dbuf->db_lu_private;
	rl_t *rl;
	int numbufs, error;
	uint64_t len = dbuf->db_data_size;
	uint64_t offset = zvio->zvio_offset;
	dmu_buf_t **dbpp, *dbp;

	/* Make sure request is reasonable */
	if (len > sl->sl_max_xfer_len)
		return (E2BIG);
	if (offset + len > zvol_get_volume_size(sl->sl_zvol_minor_hdl))
		return (EIO);

	/*
	 * The range lock is only held until the dmu buffers read in and
	 * held; not during the callers use of the data.
	 */
	rl = zfs_range_lock(sl->sl_zvol_rl_hdl, offset, len, RL_READER);

	/* Read in (or find cached) and hold every block covering the range. */
	error = dmu_buf_hold_array_by_bonus(sl->sl_zvol_bonus_hdl, offset,
	    len, TRUE, RDTAG, &numbufs, &dbpp);

	zfs_range_unlock(rl);

	/* Callers expect EIO rather than a checksum error. */
	if (error == ECKSUM)
		error = EIO;

	if (error == 0) {
		/*
		 * Fill in db_sglist from the dmu_buf_t array.
		 */
		int i;
		stmf_sglist_ent_t *sgl;
		uint64_t odiff, seglen;

		/* Remember the held array so the rele path can release it. */
		zvio->zvio_dbp = dbpp;
		/* make sure db_sglist is large enough */
		if (dbuf->db_sglist_length != numbufs) {
			cmn_err(CE_PANIC, "wrong size sglist: dbuf %d != %d\n",
			    dbuf->db_sglist_length, numbufs);
		}

		sgl = &dbuf->db_sglist[0];
		for (i = 0; i < numbufs; i++) {
			dbp = dbpp[i];
			/* only the first block may start mid-buffer */
			odiff = offset - dbp->db_offset;
			ASSERT(odiff == 0 || i == 0);
			sgl->seg_addr = (uint8_t *)dbp->db_data + odiff;
			/* last block may end short of the buffer's size */
			seglen = MIN(len, dbp->db_size - odiff);
			sgl->seg_length = (uint32_t)seglen;
			offset += seglen;
			len -= seglen;
			sgl++;
		}
		ASSERT(len == 0);

	}
	return (error);
}
214
215 /*
216 * Release a dmu_buf_t array.
217 */
218 /*ARGSUSED*/
219 void
sbd_zvol_rele_read_bufs(sbd_lu_t * sl,stmf_data_buf_t * dbuf)220 sbd_zvol_rele_read_bufs(sbd_lu_t *sl, stmf_data_buf_t *dbuf)
221 {
222 sbd_zvol_io_t *zvio = dbuf->db_lu_private;
223
224 ASSERT(zvio->zvio_dbp);
225 ASSERT(dbuf->db_sglist_length);
226
227 dmu_buf_rele_array(zvio->zvio_dbp, (int)dbuf->db_sglist_length, RDTAG);
228 }
229
/*
 * Allocate enough loaned arc buffers for the requested region.
 * Mimic the handling of the dmu_buf_t array used for reads as closely
 * as possible even though the arc_buf_t's are anonymous until released.
 * The buffers will match the zvol object blocks sizes and alignments
 * such that a data copy may be avoided when the buffers are assigned.
 */
int
sbd_zvol_alloc_write_bufs(sbd_lu_t *sl, stmf_data_buf_t *dbuf)
{
	sbd_zvol_io_t *zvio = dbuf->db_lu_private;
	int blkshift, numbufs, i;
	uint64_t blksize;
	arc_buf_t **abp;
	stmf_sglist_ent_t *sgl;
	uint64_t len = dbuf->db_data_size;
	uint64_t offset = zvio->zvio_offset;

	/* Make sure request is reasonable */
	if (len > sl->sl_max_xfer_len)
		return (E2BIG);
	if (offset + len > zvol_get_volume_size(sl->sl_zvol_minor_hdl))
		return (EIO);

	/*
	 * Break up the request into chunks to match
	 * the volume block size. Only full, and aligned
	 * buffers will avoid the data copy in the dmu.
	 */
	/*
	 * calculate how may dbufs are needed
	 */
	blksize = sl->sl_blksize;
	ASSERT(ISP2(blksize));
	/* highbit(2^n - 1) == n, so this is log2 of the block size */
	blkshift = highbit(blksize - 1);
	/*
	 * taken from dmu_buf_hold_array_by_dnode()
	 */
	numbufs = (P2ROUNDUP(offset+len, 1ULL<<blkshift) -
	    P2ALIGN(offset, 1ULL<<blkshift)) >> blkshift;
	if (dbuf->db_sglist_length != numbufs) {
		cmn_err(CE_PANIC, "wrong size sglist: dbuf %d != %d\n",
		    dbuf->db_sglist_length, numbufs);
	}
	/*
	 * allocate a holder for the needed arc_buf pointers
	 */
	abp = kmem_alloc(sizeof (arc_buf_t *) * numbufs, KM_SLEEP);
	/*
	 * The write operation uses loaned arc buffers so that
	 * the xfer_data is done outside of a dmu transaction.
	 * These buffers will exactly match the request unlike
	 * the dmu buffers obtained from the read operation.
	 */
	/*
	 * allocate the arc buffers and fill in the stmf sglist
	 */
	sgl = &dbuf->db_sglist[0];
	for (i = 0; i < numbufs; i++) {
		uint64_t seglen;

		/* first block may not be aligned */
		seglen = P2NPHASE(offset, blksize);
		if (seglen == 0)
			seglen = blksize;
		/* last block may be short */
		seglen = MIN(seglen, len);
		abp[i] = dmu_request_arcbuf(sl->sl_zvol_bonus_hdl, (int)seglen);
		ASSERT(arc_buf_size(abp[i]) == (int)seglen);
		sgl->seg_addr = abp[i]->b_data;
		sgl->seg_length = (uint32_t)seglen;
		sgl++;
		offset += seglen;
		len -= seglen;
	}
	ASSERT(len == 0);

	/* stash the loaned bufs; freed by rele_write_bufs or the abort path */
	zvio->zvio_abp = abp;
	return (0);
}
309
310 /*ARGSUSED*/
311 void
sbd_zvol_rele_write_bufs_abort(sbd_lu_t * sl,stmf_data_buf_t * dbuf)312 sbd_zvol_rele_write_bufs_abort(sbd_lu_t *sl, stmf_data_buf_t *dbuf)
313 {
314 sbd_zvol_io_t *zvio = dbuf->db_lu_private;
315 int i;
316 arc_buf_t **abp = zvio->zvio_abp;
317
318 /* free arcbufs */
319 for (i = 0; i < dbuf->db_sglist_length; i++)
320 dmu_return_arcbuf(*abp++);
321 kmem_free(zvio->zvio_abp,
322 sizeof (arc_buf_t *) * dbuf->db_sglist_length);
323 zvio->zvio_abp = NULL;
324 }
325
/*
 * Release the arc_buf_t array allocated above and handle these cases :
 *
 * flags == 0 - create transaction and assign all arc bufs to offsets
 * flags == ZVIO_COMMIT - same as above and commit to zil on sync devices
 */
int
sbd_zvol_rele_write_bufs(sbd_lu_t *sl, stmf_data_buf_t *dbuf)
{
	sbd_zvol_io_t *zvio = dbuf->db_lu_private;
	dmu_tx_t *tx;
	int sync, i, error;
	rl_t *rl;
	arc_buf_t **abp = zvio->zvio_abp;
	int flags = zvio->zvio_flags;
	uint64_t toffset, offset = zvio->zvio_offset;
	uint64_t resid, len = dbuf->db_data_size;

	ASSERT(flags == 0 || flags == ZVIO_COMMIT || flags == ZVIO_ABORT);

	/* Exclude all other I/O to the range while the bufs are assigned. */
	rl = zfs_range_lock(sl->sl_zvol_rl_hdl, offset, len, RL_WRITER);

	tx = dmu_tx_create(sl->sl_zvol_objset_hdl);
	dmu_tx_hold_write(tx, ZVOL_OBJ, offset, (int)len);
	error = dmu_tx_assign(tx, TXG_WAIT);

	if (error) {
		/* tx assignment failed: return the loaned bufs and bail */
		dmu_tx_abort(tx);
		zfs_range_unlock(rl);
		sbd_zvol_rele_write_bufs_abort(sl, dbuf);
		return (error);
	}

	/*
	 * Assign each loaned buffer to its offset in the object. The
	 * buffers were sized to match block size and alignment, so whole,
	 * aligned blocks avoid a data copy inside the dmu.
	 */
	toffset = offset;
	resid = len;
	for (i = 0; i < dbuf->db_sglist_length; i++) {
		arc_buf_t *abuf;
		int size;

		abuf = abp[i];
		size = arc_buf_size(abuf);
		dmu_assign_arcbuf(sl->sl_zvol_bonus_hdl, toffset, abuf, tx);
		toffset += size;
		resid -= size;
	}
	ASSERT(resid == 0);

	/* a disabled write cache means the write must be logged as sync */
	sync = !zvol_get_volume_wce(sl->sl_zvol_minor_hdl);
	zvol_log_write_minor(sl->sl_zvol_minor_hdl, tx, offset,
	    (ssize_t)len, sync);
	dmu_tx_commit(tx);
	zfs_range_unlock(rl);
	/* the arc bufs now belong to the dmu; free only the holder array */
	kmem_free(zvio->zvio_abp,
	    sizeof (arc_buf_t *) * dbuf->db_sglist_length);
	zvio->zvio_abp = NULL;
	if (sync && (flags & ZVIO_COMMIT))
		zil_commit(sl->sl_zvol_zil_hdl, ZVOL_OBJ);
	return (0);
}
385
386 /*
387 * Copy interface for callers using direct zvol access.
388 * Very similar to zvol_read but the uio may have multiple iovec entries.
389 */
390 int
sbd_zvol_copy_read(sbd_lu_t * sl,uio_t * uio)391 sbd_zvol_copy_read(sbd_lu_t *sl, uio_t *uio)
392 {
393 int error;
394 rl_t *rl;
395 uint64_t len = (uint64_t)uio->uio_resid;
396 uint64_t offset = (uint64_t)uio->uio_loffset;
397
398 /* Make sure request is reasonable */
399 if (len > sl->sl_max_xfer_len)
400 return (E2BIG);
401 if (offset + len > zvol_get_volume_size(sl->sl_zvol_minor_hdl))
402 return (EIO);
403
404 rl = zfs_range_lock(sl->sl_zvol_rl_hdl, offset, len, RL_READER);
405
406 error = dmu_read_uio_dbuf(sl->sl_zvol_bonus_hdl, uio, len);
407
408 zfs_range_unlock(rl);
409 if (error == ECKSUM)
410 error = EIO;
411 return (error);
412 }
413
414 /*
415 * Copy interface for callers using direct zvol access.
416 * Very similar to zvol_write but the uio may have multiple iovec entries.
417 */
418 int
sbd_zvol_copy_write(sbd_lu_t * sl,uio_t * uio,int flags)419 sbd_zvol_copy_write(sbd_lu_t *sl, uio_t *uio, int flags)
420 {
421 rl_t *rl;
422 dmu_tx_t *tx;
423 int error, sync;
424 uint64_t len = (uint64_t)uio->uio_resid;
425 uint64_t offset = (uint64_t)uio->uio_loffset;
426
427 ASSERT(flags == 0 || flags == ZVIO_COMMIT);
428
429 /* Make sure request is reasonable */
430 if (len > sl->sl_max_xfer_len)
431 return (E2BIG);
432 if (offset + len > zvol_get_volume_size(sl->sl_zvol_minor_hdl))
433 return (EIO);
434
435 rl = zfs_range_lock(sl->sl_zvol_rl_hdl, offset, len, RL_WRITER);
436
437 sync = !zvol_get_volume_wce(sl->sl_zvol_minor_hdl);
438
439 tx = dmu_tx_create(sl->sl_zvol_objset_hdl);
440 dmu_tx_hold_write(tx, ZVOL_OBJ, offset, (int)uio->uio_resid);
441 error = dmu_tx_assign(tx, TXG_WAIT);
442 if (error) {
443 dmu_tx_abort(tx);
444 } else {
445 error = dmu_write_uio_dbuf(sl->sl_zvol_bonus_hdl, uio, len, tx);
446 if (error == 0) {
447 zvol_log_write_minor(sl->sl_zvol_minor_hdl, tx, offset,
448 (ssize_t)len, sync);
449 }
450 dmu_tx_commit(tx);
451 }
452 zfs_range_unlock(rl);
453 if (sync && (flags & ZVIO_COMMIT))
454 zil_commit(sl->sl_zvol_zil_hdl, ZVOL_OBJ);
455 if (error == ECKSUM)
456 error = EIO;
457 return (error);
458 }
459