xref: /illumos-gate/usr/src/uts/common/io/comstar/lu/stmf_sbd/sbd_zvol.c (revision bafd1f1462c49949e0251d74b4fbfa24d29bc79a)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
23  * Copyright (c) 2014 by Delphix. All rights reserved.
24  */
25 
26 #include <sys/conf.h>
27 #include <sys/file.h>
28 #include <sys/ddi.h>
29 #include <sys/sunddi.h>
30 #include <sys/modctl.h>
31 #include <sys/scsi/scsi.h>
32 #include <sys/scsi/impl/scsi_reset_notify.h>
33 #include <sys/scsi/generic/mode.h>
34 #include <sys/disp.h>
35 #include <sys/byteorder.h>
36 #include <sys/atomic.h>
37 #include <sys/sdt.h>
38 #include <sys/dkio.h>
39 #include <sys/dmu.h>
40 #include <sys/arc.h>
41 #include <sys/zvol.h>
42 #include <sys/zfs_rlock.h>
43 
44 #include <sys/stmf.h>
45 #include <sys/lpif.h>
46 #include <sys/portif.h>
47 #include <sys/stmf_ioctl.h>
48 #include <sys/stmf_sbd_ioctl.h>
49 
50 #include "stmf_sbd.h"
51 #include "sbd_impl.h"
52 
53 
54 /*
55  * This file contains direct calls into the zfs module.
56  * These functions mimic zvol_read and zvol_write except pointers
57  * to the data buffers are passed instead of copying the data itself.
58  *
59  * zfs internal interfaces referenced here:
60  *
61  * FUNCTIONS
62  *    dmu_buf_hold_array_by_bonus()
63  *    dmu_buf_rele_array()
64  *
65  *    dmu_request_arcbuf()
66  *    dmu_assign_arcbuf()
67  *    dmu_return_arcbuf()
68  *    arc_buf_size()
69  *
70  *    dmu_tx_create()
71  *    dmu_tx_hold_write()
72  *    dmu_tx_assign()
73  *    dmu_tx_commit(tx)
74  *    dmu_tx_abort(tx)
75  *    zil_commit()
76  *
77  *    zfs_range_lock()
78  *    zfs_range_unlock()
79  *
80  *    zvol_log_write_minor()
81  *
82  *    dmu_read_uio_dbuf()
83  *    dmu_write_uio_dbuf()
84  * MINOR DATA
85  *    zv_volsize
86  *    zv_volblocksize
87  *    zv_flags		- for WCE
88  *    zv_objset		- dmu_tx_create
89  *    zv_zilog		- zil_commit
90  *    zv_znode		- zfs_range_lock
91  *    zv_dbuf		- dmu_buf_hold_array_by_bonus, dmu_request_arcbuf
92  * GLOBAL DATA
93  *    zvol_maxphys
94  */
95 
96 /*
97  * Take direct control of the volume instead of using the driver
98  * interfaces provided by zvol.c. Gather parameters and handles
99  * needed to make direct calls into zfs/dmu/zvol. The driver is
100  * opened exclusively at this point, so these parameters cannot change.
101  *
102  * NOTE: the object size and WCE can change while the device
103  * is open, so they must be fetched for every operation.
104  */
105 int
106 sbd_zvol_get_volume_params(sbd_lu_t *sl)
107 {
108 	int ret;
109 
110 	ret = zvol_get_volume_params(sl->sl_zvol_minor,
111 	    &sl->sl_blksize,		/* volume block size */
112 	    &sl->sl_max_xfer_len,	/* max data chunk size */
113 	    &sl->sl_zvol_minor_hdl,	/* minor soft state */
114 	    &sl->sl_zvol_objset_hdl,	/* dmu_tx_create */
115 	    &sl->sl_zvol_zil_hdl,	/* zil_commit */
116 	    &sl->sl_zvol_rl_hdl,	/* zfs_range_lock */
117 	    &sl->sl_zvol_bonus_hdl);	/* dmu_buf_hold_array_by_bonus, */
118 					/* dmu_request_arcbuf, */
119 					/* dmu_assign_arcbuf */
120 
121 	if (ret == 0 && sl->sl_blksize < MMU_PAGESIZE) {
122 		cmn_err(CE_NOTE, "COMSTAR reduced copy disabled due to "
123 		    "small zvol blocksize (%d)\n", (int)sl->sl_blksize);
124 		ret = ENOTSUP;
125 	}
126 
127 	return (ret);
128 }
129 
130 /*
131  * Return the number of elements in a scatter/gather list required for
132  * the given span in the zvol. Elements are 1:1 with zvol blocks.
133  */
134 uint32_t
135 sbd_zvol_numsegs(sbd_lu_t *sl, uint64_t off, uint32_t len)
136 {
137 	uint64_t blksz = sl->sl_blksize;
138 	uint64_t endoff = off + len;
139 	uint64_t numsegs;
140 
141 	numsegs = (P2ROUNDUP(endoff, blksz) - P2ALIGN(off, blksz)) / blksz;
142 	return ((uint32_t)numsegs);
143 }
144 
145 /*
146  * Return an array of dmu_buf_t pointers for the requested range.
147  * The dmu buffers are either in cache or read in synchronously.
148  * Fill in the dbuf sglist from the dmu_buf_t array.
149  */
150 static void *RDTAG = "sbd_zvol_read";
151 
152 int
153 sbd_zvol_alloc_read_bufs(sbd_lu_t *sl, stmf_data_buf_t *dbuf)
154 {
155 	sbd_zvol_io_t	*zvio = dbuf->db_lu_private;
156 	rl_t 		*rl;
157 	int 		numbufs, error;
158 	uint64_t 	len = dbuf->db_data_size;
159 	uint64_t 	offset = zvio->zvio_offset;
160 	dmu_buf_t	**dbpp, *dbp;
161 
162 	/* Make sure request is reasonable */
163 	if (len > sl->sl_max_xfer_len)
164 		return (E2BIG);
165 	if (offset + len  > zvol_get_volume_size(sl->sl_zvol_minor_hdl))
166 		return (EIO);
167 
168 	/*
169 	 * The range lock is only held until the dmu buffers read in and
170 	 * held; not during the callers use of the data.
171 	 */
172 	rl = zfs_range_lock(sl->sl_zvol_rl_hdl, offset, len, RL_READER);
173 
174 	error = dmu_buf_hold_array_by_bonus(sl->sl_zvol_bonus_hdl, offset,
175 	    len, TRUE, RDTAG, &numbufs, &dbpp);
176 
177 	zfs_range_unlock(rl);
178 
179 	if (error == ECKSUM)
180 		error = EIO;
181 
182 	if (error == 0) {
183 		/*
184 		 * Fill in db_sglist from the dmu_buf_t array.
185 		 */
186 		int		i;
187 		stmf_sglist_ent_t *sgl;
188 		uint64_t	odiff, seglen;
189 
190 		zvio->zvio_dbp = dbpp;
191 		/* make sure db_sglist is large enough */
192 		if (dbuf->db_sglist_length != numbufs) {
193 			cmn_err(CE_PANIC, "wrong size sglist: dbuf %d != %d\n",
194 			    dbuf->db_sglist_length, numbufs);
195 		}
196 
197 		sgl = &dbuf->db_sglist[0];
198 		for (i = 0; i < numbufs; i++) {
199 			dbp = dbpp[i];
200 			odiff =  offset - dbp->db_offset;
201 			ASSERT(odiff == 0 || i == 0);
202 			sgl->seg_addr = (uint8_t *)dbp->db_data + odiff;
203 			seglen = MIN(len, dbp->db_size - odiff);
204 			sgl->seg_length = (uint32_t)seglen;
205 			offset += seglen;
206 			len -= seglen;
207 			sgl++;
208 		}
209 		ASSERT(len == 0);
210 
211 	}
212 	return (error);
213 }
214 
215 /*
216  * Release a dmu_buf_t array.
217  */
218 /*ARGSUSED*/
219 void
220 sbd_zvol_rele_read_bufs(sbd_lu_t *sl, stmf_data_buf_t *dbuf)
221 {
222 	sbd_zvol_io_t *zvio = dbuf->db_lu_private;
223 
224 	ASSERT(zvio->zvio_dbp);
225 	ASSERT(dbuf->db_sglist_length);
226 
227 	dmu_buf_rele_array(zvio->zvio_dbp, (int)dbuf->db_sglist_length, RDTAG);
228 }
229 
230 /*
231  * Allocate enough loaned arc buffers for the requested region.
232  * Mimic the handling of the dmu_buf_t array used for reads as closely
233  * as possible even though the arc_buf_t's are anonymous until released.
234  * The buffers will match the zvol object blocks sizes and alignments
235  * such that a data copy may be avoided when the buffers are assigned.
236  */
237 int
238 sbd_zvol_alloc_write_bufs(sbd_lu_t *sl, stmf_data_buf_t *dbuf)
239 {
240 	sbd_zvol_io_t	*zvio = dbuf->db_lu_private;
241 	int		blkshift, numbufs, i;
242 	uint64_t	blksize;
243 	arc_buf_t	**abp;
244 	stmf_sglist_ent_t *sgl;
245 	uint64_t 	len = dbuf->db_data_size;
246 	uint64_t 	offset = zvio->zvio_offset;
247 
248 	/* Make sure request is reasonable */
249 	if (len > sl->sl_max_xfer_len)
250 		return (E2BIG);
251 	if (offset + len  > zvol_get_volume_size(sl->sl_zvol_minor_hdl))
252 		return (EIO);
253 
254 	/*
255 	 * Break up the request into chunks to match
256 	 * the volume block size. Only full, and aligned
257 	 * buffers will avoid the data copy in the dmu.
258 	 */
259 	/*
260 	 * calculate how may dbufs are needed
261 	 */
262 	blksize = sl->sl_blksize;
263 	ASSERT(ISP2(blksize));
264 	blkshift = highbit(blksize - 1);
265 	/*
266 	 * taken from dmu_buf_hold_array_by_dnode()
267 	 */
268 	numbufs = (P2ROUNDUP(offset+len, 1ULL<<blkshift) -
269 	    P2ALIGN(offset, 1ULL<<blkshift)) >> blkshift;
270 	if (dbuf->db_sglist_length != numbufs) {
271 		cmn_err(CE_PANIC, "wrong size sglist: dbuf %d != %d\n",
272 		    dbuf->db_sglist_length, numbufs);
273 	}
274 	/*
275 	 * allocate a holder for the needed arc_buf pointers
276 	 */
277 	abp = kmem_alloc(sizeof (arc_buf_t *) * numbufs, KM_SLEEP);
278 	/*
279 	 * The write operation uses loaned arc buffers so that
280 	 * the xfer_data is done outside of a dmu transaction.
281 	 * These buffers will exactly match the request unlike
282 	 * the dmu buffers obtained from the read operation.
283 	 */
284 	/*
285 	 * allocate the arc buffers and fill in the stmf sglist
286 	 */
287 	sgl = &dbuf->db_sglist[0];
288 	for (i = 0; i < numbufs; i++) {
289 		uint64_t seglen;
290 
291 		/* first block may not be aligned */
292 		seglen = P2NPHASE(offset, blksize);
293 		if (seglen == 0)
294 			seglen = blksize;
295 		seglen = MIN(seglen, len);
296 		abp[i] = dmu_request_arcbuf(sl->sl_zvol_bonus_hdl, (int)seglen);
297 		ASSERT(arc_buf_size(abp[i]) == (int)seglen);
298 		sgl->seg_addr = abp[i]->b_data;
299 		sgl->seg_length = (uint32_t)seglen;
300 		sgl++;
301 		offset += seglen;
302 		len -= seglen;
303 	}
304 	ASSERT(len == 0);
305 
306 	zvio->zvio_abp = abp;
307 	return (0);
308 }
309 
310 /*ARGSUSED*/
311 void
312 sbd_zvol_rele_write_bufs_abort(sbd_lu_t *sl, stmf_data_buf_t *dbuf)
313 {
314 	sbd_zvol_io_t *zvio = dbuf->db_lu_private;
315 	int i;
316 	arc_buf_t **abp = zvio->zvio_abp;
317 
318 	/* free arcbufs */
319 	for (i = 0; i < dbuf->db_sglist_length; i++)
320 		dmu_return_arcbuf(*abp++);
321 	kmem_free(zvio->zvio_abp,
322 	    sizeof (arc_buf_t *) * dbuf->db_sglist_length);
323 	zvio->zvio_abp = NULL;
324 }
325 
326 /*
327  * Release the arc_buf_t array allocated above and handle these cases :
328  *
329  * flags == 0 - create transaction and assign all arc bufs to offsets
330  * flags == ZVIO_COMMIT - same as above and commit to zil on sync devices
331  */
332 int
333 sbd_zvol_rele_write_bufs(sbd_lu_t *sl, stmf_data_buf_t *dbuf)
334 {
335 	sbd_zvol_io_t	*zvio = dbuf->db_lu_private;
336 	dmu_tx_t	*tx;
337 	int		sync, i, error;
338 	rl_t 		*rl;
339 	arc_buf_t	**abp = zvio->zvio_abp;
340 	int		flags = zvio->zvio_flags;
341 	uint64_t	toffset, offset = zvio->zvio_offset;
342 	uint64_t	resid, len = dbuf->db_data_size;
343 
344 	ASSERT(flags == 0 || flags == ZVIO_COMMIT || flags == ZVIO_ABORT);
345 
346 	rl = zfs_range_lock(sl->sl_zvol_rl_hdl, offset, len, RL_WRITER);
347 
348 	tx = dmu_tx_create(sl->sl_zvol_objset_hdl);
349 	dmu_tx_hold_write(tx, ZVOL_OBJ, offset, (int)len);
350 	error = dmu_tx_assign(tx, TXG_WAIT);
351 
352 	if (error) {
353 		dmu_tx_abort(tx);
354 		zfs_range_unlock(rl);
355 		sbd_zvol_rele_write_bufs_abort(sl, dbuf);
356 		return (error);
357 	}
358 
359 	toffset = offset;
360 	resid = len;
361 	for (i = 0; i < dbuf->db_sglist_length; i++) {
362 		arc_buf_t *abuf;
363 		int size;
364 
365 		abuf = abp[i];
366 		size = arc_buf_size(abuf);
367 		dmu_assign_arcbuf(sl->sl_zvol_bonus_hdl, toffset, abuf, tx);
368 		toffset += size;
369 		resid -= size;
370 	}
371 	ASSERT(resid == 0);
372 
373 	sync = !zvol_get_volume_wce(sl->sl_zvol_minor_hdl);
374 	zvol_log_write_minor(sl->sl_zvol_minor_hdl, tx, offset,
375 	    (ssize_t)len, sync);
376 	dmu_tx_commit(tx);
377 	zfs_range_unlock(rl);
378 	kmem_free(zvio->zvio_abp,
379 	    sizeof (arc_buf_t *) * dbuf->db_sglist_length);
380 	zvio->zvio_abp = NULL;
381 	if (sync && (flags & ZVIO_COMMIT))
382 		zil_commit(sl->sl_zvol_zil_hdl, ZVOL_OBJ);
383 	return (0);
384 }
385 
386 /*
387  * Copy interface for callers using direct zvol access.
388  * Very similar to zvol_read but the uio may have multiple iovec entries.
389  */
390 int
391 sbd_zvol_copy_read(sbd_lu_t *sl, uio_t *uio)
392 {
393 	int		error;
394 	rl_t 		*rl;
395 	uint64_t	len = (uint64_t)uio->uio_resid;
396 	uint64_t	offset = (uint64_t)uio->uio_loffset;
397 
398 	/* Make sure request is reasonable */
399 	if (len > sl->sl_max_xfer_len)
400 		return (E2BIG);
401 	if (offset + len  > zvol_get_volume_size(sl->sl_zvol_minor_hdl))
402 		return (EIO);
403 
404 	rl = zfs_range_lock(sl->sl_zvol_rl_hdl, offset, len, RL_READER);
405 
406 	error = dmu_read_uio_dbuf(sl->sl_zvol_bonus_hdl, uio, len);
407 
408 	zfs_range_unlock(rl);
409 	if (error == ECKSUM)
410 		error = EIO;
411 	return (error);
412 }
413 
414 /*
415  * Copy interface for callers using direct zvol access.
416  * Very similar to zvol_write but the uio may have multiple iovec entries.
417  */
418 int
419 sbd_zvol_copy_write(sbd_lu_t *sl, uio_t *uio, int flags)
420 {
421 	rl_t 		*rl;
422 	dmu_tx_t 	*tx;
423 	int		error, sync;
424 	uint64_t	len = (uint64_t)uio->uio_resid;
425 	uint64_t	offset = (uint64_t)uio->uio_loffset;
426 
427 	ASSERT(flags == 0 || flags == ZVIO_COMMIT);
428 
429 	/* Make sure request is reasonable */
430 	if (len > sl->sl_max_xfer_len)
431 		return (E2BIG);
432 	if (offset + len  > zvol_get_volume_size(sl->sl_zvol_minor_hdl))
433 		return (EIO);
434 
435 	rl = zfs_range_lock(sl->sl_zvol_rl_hdl, offset, len, RL_WRITER);
436 
437 	sync = !zvol_get_volume_wce(sl->sl_zvol_minor_hdl);
438 
439 	tx = dmu_tx_create(sl->sl_zvol_objset_hdl);
440 	dmu_tx_hold_write(tx, ZVOL_OBJ, offset, (int)uio->uio_resid);
441 	error = dmu_tx_assign(tx, TXG_WAIT);
442 	if (error) {
443 		dmu_tx_abort(tx);
444 	} else {
445 		error = dmu_write_uio_dbuf(sl->sl_zvol_bonus_hdl, uio, len, tx);
446 		if (error == 0) {
447 			zvol_log_write_minor(sl->sl_zvol_minor_hdl, tx, offset,
448 			    (ssize_t)len, sync);
449 		}
450 		dmu_tx_commit(tx);
451 	}
452 	zfs_range_unlock(rl);
453 	if (sync && (flags & ZVIO_COMMIT))
454 		zil_commit(sl->sl_zvol_zil_hdl, ZVOL_OBJ);
455 	if (error == ECKSUM)
456 		error = EIO;
457 	return (error);
458 }
459