xref: /freebsd/sys/cam/ctl/ctl_backend_block.c (revision d316de24faa7453118a90fb0e9839e8026e36a4e)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause
3  *
4  * Copyright (c) 2003 Silicon Graphics International Corp.
5  * Copyright (c) 2009-2011 Spectra Logic Corporation
6  * Copyright (c) 2012,2021 The FreeBSD Foundation
7  * Copyright (c) 2014-2021 Alexander Motin <mav@FreeBSD.org>
8  * All rights reserved.
9  *
10  * Portions of this software were developed by Edward Tomasz Napierala
11  * under sponsorship from the FreeBSD Foundation.
12  *
13  * Portions of this software were developed by Ka Ho Ng <khng@FreeBSD.org>
14  * under sponsorship from the FreeBSD Foundation.
15  *
16  * Redistribution and use in source and binary forms, with or without
17  * modification, are permitted provided that the following conditions
18  * are met:
19  * 1. Redistributions of source code must retain the above copyright
20  *    notice, this list of conditions, and the following disclaimer,
21  *    without modification.
22  * 2. Redistributions in binary form must reproduce at minimum a disclaimer
23  *    substantially similar to the "NO WARRANTY" disclaimer below
24  *    ("Disclaimer") and any redistribution must be conditioned upon
25  *    including a substantially similar Disclaimer requirement for further
26  *    binary redistribution.
27  *
28  * NO WARRANTY
29  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
30  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
31  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR
32  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
33  * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
34  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
35  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
36  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
37  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
38  * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
39  * POSSIBILITY OF SUCH DAMAGES.
40  *
41  * $Id: //depot/users/kenm/FreeBSD-test2/sys/cam/ctl/ctl_backend_block.c#5 $
42  */
43 /*
44  * CAM Target Layer driver backend for block devices.
45  *
46  * Author: Ken Merry <ken@FreeBSD.org>
47  */
48 #include <sys/param.h>
49 #include <sys/systm.h>
50 #include <sys/kernel.h>
51 #include <sys/types.h>
52 #include <sys/kthread.h>
53 #include <sys/bio.h>
54 #include <sys/fcntl.h>
55 #include <sys/limits.h>
56 #include <sys/lock.h>
57 #include <sys/mutex.h>
58 #include <sys/condvar.h>
59 #include <sys/malloc.h>
60 #include <sys/conf.h>
61 #include <sys/ioccom.h>
62 #include <sys/queue.h>
63 #include <sys/sbuf.h>
64 #include <sys/endian.h>
65 #include <sys/uio.h>
66 #include <sys/buf.h>
67 #include <sys/taskqueue.h>
68 #include <sys/vnode.h>
69 #include <sys/namei.h>
70 #include <sys/mount.h>
71 #include <sys/disk.h>
72 #include <sys/fcntl.h>
73 #include <sys/filedesc.h>
74 #include <sys/filio.h>
75 #include <sys/proc.h>
76 #include <sys/pcpu.h>
77 #include <sys/module.h>
78 #include <sys/sdt.h>
79 #include <sys/devicestat.h>
80 #include <sys/sysctl.h>
81 #include <sys/nv.h>
82 #include <sys/dnv.h>
83 #include <sys/sx.h>
84 #include <sys/unistd.h>
85 
86 #include <geom/geom.h>
87 
88 #include <cam/cam.h>
89 #include <cam/scsi/scsi_all.h>
90 #include <cam/scsi/scsi_da.h>
91 #include <cam/ctl/ctl_io.h>
92 #include <cam/ctl/ctl.h>
93 #include <cam/ctl/ctl_backend.h>
94 #include <cam/ctl/ctl_ioctl.h>
95 #include <cam/ctl/ctl_ha.h>
96 #include <cam/ctl/ctl_scsi_all.h>
97 #include <cam/ctl/ctl_private.h>
98 #include <cam/ctl/ctl_error.h>
99 
/*
 * Scatter/gather sizing.
 *
 * The idea here is to allocate enough S/G space to handle at least 1MB I/Os.
 * On systems with a small maxphys that can be 8 segments of 128KB each.  On
 * large systems it can be up to 8 segments of 1MB each.  I/Os larger than
 * that we'll split.
 */
/* Number of S/G entries (and iovecs) embedded in each ctl_be_block_io. */
#define	CTLBLK_MAX_SEGS		8
/*
 * Index offset of the second S/G list inside sg_segs[] when a beio carries
 * two equal lists (see the two_sglists handling in this file).
 */
#define	CTLBLK_HALF_SEGS	(CTLBLK_MAX_SEGS / 2)
/* Smallest segment buffer size, in bytes. */
#define	CTLBLK_MIN_SEG		(128 * 1024)
/* Largest segment buffer size: maxphys clamped to [CTLBLK_MIN_SEG, 1MB]. */
#define	CTLBLK_MAX_SEG		MIN(1024 * 1024, MAX(CTLBLK_MIN_SEG, maxphys))
/* Bytes covered by one full S/G list of maximum-size segments. */
#define	CTLBLK_MAX_IO_SIZE	(CTLBLK_MAX_SEG * CTLBLK_MAX_SEGS)
110 
/*
 * Debug tracing.  When CTLBLK_DEBUG is defined, DPRINTF() printf()s the
 * message prefixed with "cbb(<function>:<line>): ".  Otherwise it expands
 * to an empty do/while statement and its arguments are not evaluated.
 */
#ifdef CTLBLK_DEBUG
#define DPRINTF(fmt, args...) \
    printf("cbb(%s:%d): " fmt, __FUNCTION__, __LINE__, ##args)
#else
#define DPRINTF(fmt, args...) do {} while(0)
#endif
117 
/*
 * Accessors for the per-I/O private storage in io_hdr.ctl_private[].
 *
 * PRIV(io)      - the CTL_PRIV_BACKEND slot viewed as a ctl_ptr_len_flags;
 *                 this file reads the owning ctl_be_block_io from ->ptr.
 * ARGS(io)      - the CTL_PRIV_LBA_LEN slot viewed as a ctl_lba_len_flags
 *                 (LBA and CTL_LLF_* flags of the request).
 * DSM_RANGE(io) - the integer member of the same CTL_PRIV_LBA_LEN slot.
 *                 NOTE(review): it aliases the storage ARGS() uses, so the
 *                 two are presumably never live for the same I/O - confirm
 *                 against the callers.
 */
#define PRIV(io)	\
    ((struct ctl_ptr_len_flags *)&(io)->io_hdr.ctl_private[CTL_PRIV_BACKEND])
#define ARGS(io)	\
    ((struct ctl_lba_len_flags *)&(io)->io_hdr.ctl_private[CTL_PRIV_LBA_LEN])
#define	DSM_RANGE(io)	((io)->io_hdr.ctl_private[CTL_PRIV_LBA_LEN].integer)
123 
/* SDT provider "cbb"; the file_start/file_done probes in this file use it. */
SDT_PROVIDER_DEFINE(cbb);
125 
/*
 * Per-LUN state bits stored in ctl_be_block_lun.flags.  The values are
 * distinct bits so they can be OR-ed together.
 * NOTE(review): the code that sets and tests these flags is not in this
 * part of the file; the meanings below are taken from the names only.
 */
typedef enum {
	CTL_BE_BLOCK_LUN_UNCONFIGURED	= 0x01,	/* presumably: LUN not (or no longer) configured */
	CTL_BE_BLOCK_LUN_WAITING	= 0x04,	/* presumably: a thread is sleeping on a LUN state change */
} ctl_be_block_lun_flags;
130 
/*
 * Kind of backing store behind a LUN (ctl_be_block_lun.dev_type).
 * NOTE(review): presumably NONE means no backing store is open, DEV a
 * device node and FILE a regular file - confirm against the open path.
 */
typedef enum {
	CTL_BE_BLOCK_NONE,
	CTL_BE_BLOCK_DEV,
	CTL_BE_BLOCK_FILE
} ctl_be_block_type;
136 
/*
 * State specific to file-backed LUNs.
 */
struct ctl_be_block_filedata {
	/* Credentials passed to VOP_READ/VOP_WRITE/vn_deallocate on the file. */
	struct ucred *cred;
};
140 
/*
 * Backing-store-type specific data embedded in each LUN.  Only the file
 * variant exists at present.
 */
union ctl_be_block_bedata {
	struct ctl_be_block_filedata file;
};
144 
struct ctl_be_block_io;
struct ctl_be_block_lun;

/*
 * Handler that carries out one backend operation (data I/O, flush, unmap,
 * get LBA status) described by beio against the LUN's backing store.
 */
typedef void (*cbb_dispatch_t)(struct ctl_be_block_lun *be_lun,
			       struct ctl_be_block_io *beio);
/*
 * Handler that returns the value of the named LUN attribute.  The file
 * implementation in this file returns UINT64_MAX when the value cannot be
 * determined.
 */
typedef uint64_t (*cbb_getattr_t)(struct ctl_be_block_lun *be_lun,
				  const char *attrname);
152 
/*
 * Backend LUN structure.  There is a 1:1 mapping between a block device
 * and a backend block LUN, and between a backend block LUN and a CTL LUN.
 *
 * The dispatch/lun_flush/unmap/get_lba_status/getattr pointers select the
 * implementation matching the backing store.
 * NOTE(review): presumably they are filled in when the backing store is
 * opened - that code is not in this part of the file.
 */
struct ctl_be_block_lun {
	struct ctl_be_lun cbe_lun;		/* Must be first element. */
	struct ctl_lun_create_params params;
	char *dev_path;
	ctl_be_block_type dev_type;
	struct vnode *vn;			/* backing vnode used by the VOP_* calls */
	union ctl_be_block_bedata backend;	/* type-specific data (file credentials) */
	cbb_dispatch_t dispatch;		/* data read/write handler */
	cbb_dispatch_t lun_flush;
	cbb_dispatch_t unmap;
	cbb_dispatch_t get_lba_status;
	cbb_getattr_t getattr;
	uint64_t size_blocks;
	uint64_t size_bytes;			/* used as the end-of-LUN byte offset */
	struct ctl_be_block_softc *softc;
	struct devstat *disk_stats;		/* devstat(9) accounting for this LUN */
	ctl_be_block_lun_flags flags;
	SLIST_ENTRY(ctl_be_block_lun) links;
	struct taskqueue *io_taskqueue;		/* runs io_task for queued I/O */
	struct task io_task;
	int num_threads;
	STAILQ_HEAD(, ctl_io_hdr) input_queue;
	STAILQ_HEAD(, ctl_io_hdr) config_read_queue;
	STAILQ_HEAD(, ctl_io_hdr) config_write_queue;
	STAILQ_HEAD(, ctl_io_hdr) datamove_queue; /* writes whose data-in phase finished */
	struct mtx_padalign io_lock;		/* held for bio accounting and devstat end */
	struct mtx_padalign queue_lock;		/* held while inserting into the queues */
};
185 
/*
 * Overall softc structure for the block backend module.
 */
struct ctl_be_block_softc {
	struct sx			 modify_lock;
	struct mtx			 lock;
	int				 num_luns;
	SLIST_HEAD(, ctl_be_block_lun)	 lun_list;
	uma_zone_t			 beio_zone;	/* struct ctl_be_block_io */
	uma_zone_t			 bufmin_zone;	/* segments <= CTLBLK_MIN_SEG */
	uma_zone_t			 bufmax_zone;	/* segments up to CTLBLK_MAX_SEG */
};

/* The single module-wide instance. */
static struct ctl_be_block_softc backend_block_softc;
200 
/*
 * Per-I/O information.
 *
 * One of these is allocated (zeroed, refcnt 1) for each backend operation
 * and released through ctl_free_beio() when the last reference is dropped.
 */
struct ctl_be_block_io {
	union ctl_io			*io;		/* CTL I/O being serviced */
	/*
	 * Data buffers.  When two_sglists is set, entries
	 * [CTLBLK_HALF_SEGS + i] form a second list parallel to entries [i].
	 */
	struct ctl_sg_entry		sg_segs[CTLBLK_MAX_SEGS];
	/* iovecs filled from sg_segs for the uio-based read/write paths. */
	struct iovec			xiovecs[CTLBLK_MAX_SEGS];
	int				refcnt;		/* freed when it reaches 0 */
	int				bio_cmd;	/* BIO_READ, BIO_WRITE, ... */
	int				two_sglists;	/* nonzero: second S/G list in use */
	int				num_segs;	/* valid entries per S/G list */
	int				num_bios_sent;	/* bios issued so far */
	int				num_bios_done;	/* bios completed so far */
	int				send_complete;	/* nonzero once all bios were issued */
	int				first_error;	/* errno of failed bio with lowest offset */
	uint64_t			first_error_offset; /* bio_offset of that bio */
	struct bintime			ds_t0;		/* devstat start time */
	devstat_tag_type		ds_tag_type;
	devstat_trans_flags		ds_trans_type;
	uint64_t			io_len;		/* length in bytes */
	uint64_t			io_offset;	/* byte offset into the backing store */
	int				io_arg;		/* command-specific; flush: nonzero = MNT_NOWAIT */
	struct ctl_be_block_softc	*softc;
	struct ctl_be_block_lun		*lun;
	void (*beio_cont)(struct ctl_be_block_io *beio); /* to continue processing */
};
227 
extern struct ctl_softc *control_softc;

/*
 * kern.cam.ctl.block.num_threads: read/write tunable, default 32.
 */
static int cbb_num_threads = 32;
SYSCTL_NODE(_kern_cam_ctl, OID_AUTO, block, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
	    "CAM Target Layer Block Backend");
SYSCTL_INT(_kern_cam_ctl_block, OID_AUTO, num_threads, CTLFLAG_RWTUN,
           &cbb_num_threads, 0, "Number of threads per backing file");
235 
/*
 * Forward declarations.
 */

/* beio allocation and completion. */
static struct ctl_be_block_io *ctl_alloc_beio(struct ctl_be_block_softc *softc);
static void ctl_free_beio(struct ctl_be_block_io *beio);
static void ctl_complete_beio(struct ctl_be_block_io *beio);
static int ctl_be_block_move_done(union ctl_io *io, bool samethr);
static void ctl_be_block_biodone(struct bio *bio);
/* Handlers for file-backed LUNs. */
static void ctl_be_block_flush_file(struct ctl_be_block_lun *be_lun,
				    struct ctl_be_block_io *beio);
static void ctl_be_block_dispatch_file(struct ctl_be_block_lun *be_lun,
				       struct ctl_be_block_io *beio);
static void ctl_be_block_gls_file(struct ctl_be_block_lun *be_lun,
				  struct ctl_be_block_io *beio);
static uint64_t ctl_be_block_getattr_file(struct ctl_be_block_lun *be_lun,
					 const char *attrname);
static void ctl_be_block_unmap_file(struct ctl_be_block_lun *be_lun,
				    struct ctl_be_block_io *beio);
/* Handlers for device-backed LUNs. */
static void ctl_be_block_flush_dev(struct ctl_be_block_lun *be_lun,
				   struct ctl_be_block_io *beio);
static void ctl_be_block_unmap_dev(struct ctl_be_block_lun *be_lun,
				   struct ctl_be_block_io *beio);
static void ctl_be_block_dispatch_dev(struct ctl_be_block_lun *be_lun,
				      struct ctl_be_block_io *beio);
static uint64_t ctl_be_block_getattr_dev(struct ctl_be_block_lun *be_lun,
					 const char *attrname);
/* Request dispatch and the worker task. */
static void ctl_be_block_cr_dispatch(struct ctl_be_block_lun *be_lun,
				    union ctl_io *io);
static void ctl_be_block_cw_dispatch(struct ctl_be_block_lun *be_lun,
				    union ctl_io *io);
static void ctl_be_block_dispatch(struct ctl_be_block_lun *be_lun,
				  union ctl_io *io);
static void ctl_be_block_worker(void *context, int pending);
static int ctl_be_block_submit(union ctl_io *io);
/* LUN management. */
static int ctl_be_block_ioctl(struct cdev *dev, u_long cmd, caddr_t addr,
				   int flag, struct thread *td);
static int ctl_be_block_open_file(struct ctl_be_block_lun *be_lun,
				  struct ctl_lun_req *req);
static int ctl_be_block_open_dev(struct ctl_be_block_lun *be_lun,
				 struct ctl_lun_req *req);
static int ctl_be_block_close(struct ctl_be_block_lun *be_lun);
static int ctl_be_block_open(struct ctl_be_block_lun *be_lun,
			     struct ctl_lun_req *req);
static int ctl_be_block_create(struct ctl_be_block_softc *softc,
			       struct ctl_lun_req *req);
static int ctl_be_block_rm(struct ctl_be_block_softc *softc,
			   struct ctl_lun_req *req);
static int ctl_be_block_modify(struct ctl_be_block_softc *softc,
			   struct ctl_lun_req *req);
static void ctl_be_block_lun_shutdown(struct ctl_be_lun *cbe_lun);
/* Backend driver entry points. */
static int ctl_be_block_config_write(union ctl_io *io);
static int ctl_be_block_config_read(union ctl_io *io);
static int ctl_be_block_lun_info(struct ctl_be_lun *cbe_lun, struct sbuf *sb);
static uint64_t ctl_be_block_lun_attr(struct ctl_be_lun *cbe_lun, const char *attrname);
static int ctl_be_block_init(void);
static int ctl_be_block_shutdown(void);
289 
/*
 * Backend driver descriptor: the name "block" and the entry points this
 * file exports to CTL.  It is registered by CTL_BACKEND_DECLARE().
 */
static struct ctl_backend_driver ctl_be_block_driver =
{
	.name = "block",
	.flags = CTL_BE_FLAG_HAS_CONFIG,
	.init = ctl_be_block_init,
	.shutdown = ctl_be_block_shutdown,
	.data_submit = ctl_be_block_submit,
	.config_read = ctl_be_block_config_read,
	.config_write = ctl_be_block_config_write,
	.ioctl = ctl_be_block_ioctl,
	.lun_info = ctl_be_block_lun_info,
	.lun_attr = ctl_be_block_lun_attr
};
303 
/* malloc(9) type for this backend's allocations. */
MALLOC_DEFINE(M_CTLBLK, "ctlblock", "Memory used for CTL block backend");
/* Register ctl_be_block_driver with CTL under the module name "cbb". */
CTL_BACKEND_DECLARE(cbb, ctl_be_block_driver);
306 
307 static void
308 ctl_alloc_seg(struct ctl_be_block_softc *softc, struct ctl_sg_entry *sg,
309     size_t len)
310 {
311 
312 	if (len <= CTLBLK_MIN_SEG) {
313 		sg->addr = uma_zalloc(softc->bufmin_zone, M_WAITOK);
314 	} else {
315 		KASSERT(len <= CTLBLK_MAX_SEG,
316 		    ("Too large alloc %zu > %lu", len, CTLBLK_MAX_SEG));
317 		sg->addr = uma_zalloc(softc->bufmax_zone, M_WAITOK);
318 	}
319 	sg->len = len;
320 }
321 
322 static void
323 ctl_free_seg(struct ctl_be_block_softc *softc, struct ctl_sg_entry *sg)
324 {
325 
326 	if (sg->len <= CTLBLK_MIN_SEG) {
327 		uma_zfree(softc->bufmin_zone, sg->addr);
328 	} else {
329 		KASSERT(sg->len <= CTLBLK_MAX_SEG,
330 		    ("Too large free %zu > %lu", sg->len, CTLBLK_MAX_SEG));
331 		uma_zfree(softc->bufmax_zone, sg->addr);
332 	}
333 }
334 
335 static struct ctl_be_block_io *
336 ctl_alloc_beio(struct ctl_be_block_softc *softc)
337 {
338 	struct ctl_be_block_io *beio;
339 
340 	beio = uma_zalloc(softc->beio_zone, M_WAITOK | M_ZERO);
341 	beio->softc = softc;
342 	beio->refcnt = 1;
343 	return (beio);
344 }
345 
346 static void
347 ctl_real_free_beio(struct ctl_be_block_io *beio)
348 {
349 	struct ctl_be_block_softc *softc = beio->softc;
350 	int i;
351 
352 	for (i = 0; i < beio->num_segs; i++) {
353 		ctl_free_seg(softc, &beio->sg_segs[i]);
354 
355 		/* For compare we had two equal S/G lists. */
356 		if (beio->two_sglists) {
357 			ctl_free_seg(softc,
358 			    &beio->sg_segs[i + CTLBLK_HALF_SEGS]);
359 		}
360 	}
361 
362 	uma_zfree(softc->beio_zone, beio);
363 }
364 
365 static void
366 ctl_refcnt_beio(void *arg, int diff)
367 {
368 	struct ctl_be_block_io *beio = arg;
369 
370 	if (atomic_fetchadd_int(&beio->refcnt, diff) + diff == 0)
371 		ctl_real_free_beio(beio);
372 }
373 
/*
 * Drop one reference on beio; the structure is freed when that was the
 * last one.
 */
static void
ctl_free_beio(struct ctl_be_block_io *beio)
{
	const int drop_one = -1;

	ctl_refcnt_beio(beio, drop_one);
}
380 
381 static void
382 ctl_complete_beio(struct ctl_be_block_io *beio)
383 {
384 	union ctl_io *io = beio->io;
385 
386 	if (beio->beio_cont != NULL) {
387 		beio->beio_cont(beio);
388 	} else {
389 		ctl_free_beio(beio);
390 		ctl_data_submit_done(io);
391 	}
392 }
393 
394 static void
395 ctl_be_block_io_error(union ctl_io *io, int bio_cmd, uint16_t retry_count)
396 {
397 	switch (io->io_hdr.io_type) {
398 	case CTL_IO_SCSI:
399 		if (bio_cmd == BIO_FLUSH) {
400 			/* XXX KDM is there is a better error here? */
401 			ctl_set_internal_failure(&io->scsiio,
402 						 /*sks_valid*/ 1,
403 						 retry_count);
404 		} else {
405 			ctl_set_medium_error(&io->scsiio, bio_cmd == BIO_READ);
406 		}
407 		break;
408 	case CTL_IO_NVME:
409 		switch (bio_cmd) {
410 		case BIO_FLUSH:
411 		case BIO_WRITE:
412 			ctl_nvme_set_write_fault(&io->nvmeio);
413 			break;
414 		case BIO_READ:
415 			ctl_nvme_set_unrecoverable_read_error(&io->nvmeio);
416 			break;
417 		default:
418 			ctl_nvme_set_internal_error(&io->nvmeio);
419 			break;
420 		}
421 		break;
422 	default:
423 		__assert_unreachable();
424 	}
425 }
426 
/*
 * Compare two byte buffers of the given size.
 *
 * Returns the index of the first differing byte, or size when the buffers
 * are identical over that length (including when size is 0).  Neither
 * buffer is modified, hence the const-qualified parameters.
 */
static size_t
cmp(const uint8_t *a, const uint8_t *b, size_t size)
{
	size_t i;

	for (i = 0; i < size; i++) {
		if (a[i] != b[i])
			break;
	}
	return (i);
}
438 
439 static void
440 ctl_be_block_compare(union ctl_io *io)
441 {
442 	struct ctl_be_block_io *beio;
443 	uint64_t off, res;
444 	int i;
445 
446 	beio = (struct ctl_be_block_io *)PRIV(io)->ptr;
447 	off = 0;
448 	for (i = 0; i < beio->num_segs; i++) {
449 		res = cmp(beio->sg_segs[i].addr,
450 		    beio->sg_segs[i + CTLBLK_HALF_SEGS].addr,
451 		    beio->sg_segs[i].len);
452 		off += res;
453 		if (res < beio->sg_segs[i].len)
454 			break;
455 	}
456 	if (i < beio->num_segs) {
457 		ctl_io_set_compare_failure(io, off);
458 	} else
459 		ctl_io_set_success(io);
460 }
461 
/*
 * Data-move completion callback for I/Os owned by this backend.
 *
 * io		- the I/O whose data transfer finished; its beio is taken
 *		  from PRIV(io)->ptr.
 * samethr	- true when called synchronously in the thread that started
 *		  the move, in which case the backend I/O may be dispatched
 *		  directly instead of through the task queue.
 *
 * Always returns 0.
 */
static int
ctl_be_block_move_done(union ctl_io *io, bool samethr)
{
	struct ctl_be_block_io *beio;
	struct ctl_be_block_lun *be_lun;
	struct ctl_lba_len_flags *lbalen;

	beio = (struct ctl_be_block_io *)PRIV(io)->ptr;

	DPRINTF("entered\n");
	/* Account for the data that was just moved. */
	ctl_add_kern_rel_offset(io, ctl_kern_data_len(io));

	/*
	 * We set status at this point for read and compare commands,
	 * provided the I/O was not aborted and has no status yet.
	 */
	if ((io->io_hdr.flags & CTL_FLAG_ABORT) == 0 &&
	    (io->io_hdr.status & CTL_STATUS_MASK) == CTL_STATUS_NONE) {
		lbalen = ARGS(io);
		if (lbalen->flags & CTL_LLF_READ) {
			ctl_io_set_success(io);
		} else if (lbalen->flags & CTL_LLF_COMPARE) {
			/* We have two data blocks ready for comparison. */
			ctl_be_block_compare(io);
		}
	}

	/*
	 * If this is a read, or a write with errors, it is done.  The
	 * status test deliberately re-reads the header: it may have been
	 * set just above.
	 */
	if ((beio->bio_cmd == BIO_READ)
	 || ((io->io_hdr.flags & CTL_FLAG_ABORT) != 0)
	 || ((io->io_hdr.status & CTL_STATUS_MASK) != CTL_STATUS_NONE)) {
		ctl_complete_beio(beio);
		return (0);
	}

	/*
	 * At this point, we have a write and the DMA completed successfully.
	 * If we were called synchronously in the original thread then just
	 * dispatch, otherwise we now have to queue it to the task queue to
	 * execute the backend I/O.  That is because we do blocking
	 * memory allocations, and in the file backing case, blocking I/O.
	 * This move done routine is generally called in the SIM's
	 * interrupt context, and therefore we cannot block.
	 */
	be_lun = (struct ctl_be_block_lun *)CTL_BACKEND_LUN(io);
	if (samethr) {
		be_lun->dispatch(be_lun, beio);
	} else {
		mtx_lock(&be_lun->queue_lock);
		STAILQ_INSERT_TAIL(&be_lun->datamove_queue, &io->io_hdr, links);
		mtx_unlock(&be_lun->queue_lock);
		taskqueue_enqueue(be_lun->io_taskqueue, &be_lun->io_task);
	}
	return (0);
}
518 
/*
 * Completion callback for a bio issued on behalf of a beio
 * (bio->bio_caller1).
 *
 * Records the error of the failed bio with the lowest offset, counts the
 * completion and destroys the bio.  Once all bios have been sent and have
 * completed, it ends the devstat transaction and then either fails the
 * I/O, completes it (write, flush, delete, verify) or starts the data
 * move to the initiator (read).
 */
static void
ctl_be_block_biodone(struct bio *bio)
{
	struct ctl_be_block_io *beio = bio->bio_caller1;
	struct ctl_be_block_lun *be_lun = beio->lun;
	struct ctl_be_lun *cbe_lun = &be_lun->cbe_lun;
	union ctl_io *io;
	int error;

	io = beio->io;

	DPRINTF("entered\n");

	error = bio->bio_error;
	mtx_lock(&be_lun->io_lock);
	/* Keep the error belonging to the lowest failing offset. */
	if (error != 0 &&
	    (beio->first_error == 0 ||
	     bio->bio_offset < beio->first_error_offset)) {
		beio->first_error = error;
		beio->first_error_offset = bio->bio_offset;
	}

	beio->num_bios_done++;

	/*
	 * XXX KDM will this cause WITNESS to complain?  Holding a lock
	 * during the free might cause it to complain.
	 */
	g_destroy_bio(bio);

	/*
	 * If the send complete bit isn't set, or we aren't the last I/O to
	 * complete, then we're done.
	 */
	if ((beio->send_complete == 0)
	 || (beio->num_bios_done < beio->num_bios_sent)) {
		mtx_unlock(&be_lun->io_lock);
		return;
	}

	/*
	 * At this point, we've verified that we are the last I/O to
	 * complete, so it's safe to drop the lock once the devstat
	 * transaction has been closed.
	 */
	devstat_end_transaction(beio->lun->disk_stats, beio->io_len,
	    beio->ds_tag_type, beio->ds_trans_type,
	    /*now*/ NULL, /*then*/&beio->ds_t0);
	mtx_unlock(&be_lun->io_lock);

	/*
	 * If there are any errors from the backing device, we fail the
	 * entire I/O.  A few errno values map to specific statuses; all
	 * others go through ctl_be_block_io_error().
	 */
	error = beio->first_error;
	if (error != 0) {
		if (error == EOPNOTSUPP) {
			ctl_io_set_invalid_opcode(io);
		} else if (error == ENOSPC || error == EDQUOT) {
			ctl_io_set_space_alloc_fail(io);
		} else if (error == EROFS || error == EACCES) {
			ctl_io_set_hw_write_protected(io);
		} else {
			ctl_be_block_io_error(io, beio->bio_cmd,
			    /*retry_count*/ 0xbad2);
		}
		ctl_complete_beio(beio);
		return;
	}

	/*
	 * If this is a write, a flush, a delete or verify, we're all done.
	 * If this is a read, we can now send the data to the user.
	 */
	if ((beio->bio_cmd == BIO_WRITE)
	 || (beio->bio_cmd == BIO_FLUSH)
	 || (beio->bio_cmd == BIO_DELETE)
	 || (ARGS(io)->flags & CTL_LLF_VERIFY)) {
		ctl_io_set_success(io);
		ctl_complete_beio(beio);
	} else {
		/*
		 * A plain read with no continuation gets its status set
		 * before the data move is started.
		 */
		if ((ARGS(io)->flags & CTL_LLF_READ) &&
		    beio->beio_cont == NULL) {
			ctl_io_set_success(io);
			if (cbe_lun->serseq >= CTL_LUN_SERSEQ_SOFT)
				ctl_serseq_done(io);
		}
		ctl_datamove(io);
	}
}
608 
609 static void
610 ctl_be_block_flush_file(struct ctl_be_block_lun *be_lun,
611 			struct ctl_be_block_io *beio)
612 {
613 	union ctl_io *io = beio->io;
614 	struct mount *mountpoint;
615 	int error;
616 
617 	DPRINTF("entered\n");
618 
619 	binuptime(&beio->ds_t0);
620 	devstat_start_transaction(beio->lun->disk_stats, &beio->ds_t0);
621 
622 	(void) vn_start_write(be_lun->vn, &mountpoint, V_WAIT);
623 
624 	vn_lock(be_lun->vn, vn_lktype_write(mountpoint, be_lun->vn) |
625 	    LK_RETRY);
626 	error = VOP_FSYNC(be_lun->vn, beio->io_arg ? MNT_NOWAIT : MNT_WAIT,
627 	    curthread);
628 	VOP_UNLOCK(be_lun->vn);
629 
630 	vn_finished_write(mountpoint);
631 
632 	mtx_lock(&be_lun->io_lock);
633 	devstat_end_transaction(beio->lun->disk_stats, beio->io_len,
634 	    beio->ds_tag_type, beio->ds_trans_type,
635 	    /*now*/ NULL, /*then*/&beio->ds_t0);
636 	mtx_unlock(&be_lun->io_lock);
637 
638 	if (error == 0)
639 		ctl_io_set_success(io);
640 	else {
641 		ctl_be_block_io_error(io, BIO_FLUSH,
642 		    /*retry_count*/ 0xbad1);
643 	}
644 
645 	ctl_complete_beio(beio);
646 }
647 
/*
 * DTrace probes fired around the uio-based read/write paths.
 * NOTE(review): the probes are defined with one "uint64_t" argument, but
 * the call sites visible in this file fire them with SDT_PROBE0() and pass
 * no argument - confirm whether the argument is intended.
 */
SDT_PROBE_DEFINE1(cbb, , read, file_start, "uint64_t");
SDT_PROBE_DEFINE1(cbb, , write, file_start, "uint64_t");
SDT_PROBE_DEFINE1(cbb, , read, file_done,"uint64_t");
SDT_PROBE_DEFINE1(cbb, , write, file_done, "uint64_t");
652 
/*
 * Perform a read or write on a file-backed LUN with VOP_READ/VOP_WRITE.
 *
 * The beio's S/G segments are described to the VFS through a kernel-space
 * uio covering io_len bytes at io_offset.  The call blocks until the I/O
 * is done.  Afterwards the I/O is failed, completed (write/verify) or the
 * data move to the initiator is started (read).
 */
static void
ctl_be_block_dispatch_file(struct ctl_be_block_lun *be_lun,
			   struct ctl_be_block_io *beio)
{
	struct ctl_be_lun *cbe_lun = &be_lun->cbe_lun;
	struct ctl_be_block_filedata *file_data;
	union ctl_io *io;
	struct uio xuio;
	struct iovec *xiovec;
	size_t s;
	int error, flags, i;

	DPRINTF("entered\n");

	file_data = &be_lun->backend.file;
	io = beio->io;
	/* Translate the request's DPO/FUA bits into VFS I/O flags. */
	flags = 0;
	if (ARGS(io)->flags & CTL_LLF_DPO)
		flags |= IO_DIRECT;
	if (beio->bio_cmd == BIO_WRITE && ARGS(io)->flags & CTL_LLF_FUA)
		flags |= IO_SYNC;

	bzero(&xuio, sizeof(xuio));
	if (beio->bio_cmd == BIO_READ) {
		SDT_PROBE0(cbb, , read, file_start);
		xuio.uio_rw = UIO_READ;
	} else {
		SDT_PROBE0(cbb, , write, file_start);
		xuio.uio_rw = UIO_WRITE;
	}
	xuio.uio_offset = beio->io_offset;
	xuio.uio_resid = beio->io_len;
	xuio.uio_segflg = UIO_SYSSPACE;
	xuio.uio_iov = beio->xiovecs;
	xuio.uio_iovcnt = beio->num_segs;
	xuio.uio_td = curthread;

	/* One iovec per S/G segment. */
	for (i = 0, xiovec = xuio.uio_iov; i < xuio.uio_iovcnt; i++, xiovec++) {
		xiovec->iov_base = beio->sg_segs[i].addr;
		xiovec->iov_len = beio->sg_segs[i].len;
	}

	binuptime(&beio->ds_t0);
	devstat_start_transaction(beio->lun->disk_stats, &beio->ds_t0);

	if (beio->bio_cmd == BIO_READ) {
		vn_lock(be_lun->vn, LK_SHARED | LK_RETRY);

		/*
		 * With soft sequential serialization the sequence is
		 * released as soon as the vnode lock is held.
		 */
		if (beio->beio_cont == NULL &&
		    cbe_lun->serseq == CTL_LUN_SERSEQ_SOFT)
			ctl_serseq_done(io);
		/*
		 * UFS pays attention to IO_DIRECT for reads.  If the
		 * DIRECTIO option is configured into the kernel, it calls
		 * ffs_rawread().  But that only works for single-segment
		 * uios with user space addresses.  In our case, with a
		 * kernel uio, it still reads into the buffer cache, but it
		 * will just try to release the buffer from the cache later
		 * on in ffs_read().
		 *
		 * ZFS does not pay attention to IO_DIRECT for reads.
		 *
		 * UFS does not pay attention to IO_SYNC for reads.
		 *
		 * ZFS pays attention to IO_SYNC (which translates into the
		 * Solaris define FRSYNC for zfs_read()) for reads.  It
		 * attempts to sync the file before reading.
		 */
		error = VOP_READ(be_lun->vn, &xuio, flags, file_data->cred);

		VOP_UNLOCK(be_lun->vn);
		SDT_PROBE0(cbb, , read, file_done);
		if (error == 0 && xuio.uio_resid > 0) {
			/*
			 * If we read less than requested (EOF), then
			 * we should zero the rest of the buffer.  's' is
			 * the number of bytes read that still have to be
			 * skipped before zeroing starts.
			 */
			s = beio->io_len - xuio.uio_resid;
			for (i = 0; i < beio->num_segs; i++) {
				if (s >= beio->sg_segs[i].len) {
					s -= beio->sg_segs[i].len;
					continue;
				}
				bzero((uint8_t *)beio->sg_segs[i].addr + s,
				    beio->sg_segs[i].len - s);
				s = 0;
			}
		}
	} else {
		struct mount *mountpoint;

		(void)vn_start_write(be_lun->vn, &mountpoint, V_WAIT);
		vn_lock(be_lun->vn, vn_lktype_write(mountpoint,
		    be_lun->vn) | LK_RETRY);

		/*
		 * UFS pays attention to IO_DIRECT for writes.  The write
		 * is done asynchronously.  (Normally the write would just
		 * get put into cache.)
		 *
		 * UFS pays attention to IO_SYNC for writes.  It will
		 * attempt to write the buffer out synchronously if that
		 * flag is set.
		 *
		 * ZFS does not pay attention to IO_DIRECT for writes.
		 *
		 * ZFS pays attention to IO_SYNC (a.k.a. FSYNC or FRSYNC)
		 * for writes.  It will flush the transaction from the
		 * cache before returning.
		 */
		error = VOP_WRITE(be_lun->vn, &xuio, flags, file_data->cred);
		VOP_UNLOCK(be_lun->vn);

		vn_finished_write(mountpoint);
		SDT_PROBE0(cbb, , write, file_done);
        }

	mtx_lock(&be_lun->io_lock);
	devstat_end_transaction(beio->lun->disk_stats, beio->io_len,
	    beio->ds_tag_type, beio->ds_trans_type,
	    /*now*/ NULL, /*then*/&beio->ds_t0);
	mtx_unlock(&be_lun->io_lock);

	/*
	 * If we got an error, set the matching failure status (space
	 * allocation failure, write protected, or the generic error from
	 * ctl_be_block_io_error()) and return the I/O to the user.
	 */
	if (error != 0) {
		if (error == ENOSPC || error == EDQUOT) {
			ctl_io_set_space_alloc_fail(io);
		} else if (error == EROFS || error == EACCES) {
			ctl_io_set_hw_write_protected(io);
		} else {
			ctl_be_block_io_error(io, beio->bio_cmd, 0);
		}
		ctl_complete_beio(beio);
		return;
	}

	/*
	 * If this is a write or a verify, we're all done.
	 * If this is a read, we can now send the data to the user.
	 */
	if ((beio->bio_cmd == BIO_WRITE) ||
	    (ARGS(io)->flags & CTL_LLF_VERIFY)) {
		ctl_io_set_success(io);
		ctl_complete_beio(beio);
	} else {
		if ((ARGS(io)->flags & CTL_LLF_READ) &&
		    beio->beio_cont == NULL) {
			ctl_io_set_success(io);
			/*
			 * The SOFT case was already handled before the
			 * read was issued, hence '>' rather than '>='.
			 */
			if (cbe_lun->serseq > CTL_LUN_SERSEQ_SOFT)
				ctl_serseq_done(io);
		}
		ctl_datamove(io);
	}
}
810 
811 static void
812 ctl_be_block_gls_file(struct ctl_be_block_lun *be_lun,
813 			struct ctl_be_block_io *beio)
814 {
815 	union ctl_io *io = beio->io;
816 	struct ctl_lba_len_flags *lbalen = ARGS(io);
817 	struct scsi_get_lba_status_data *data;
818 	off_t roff, off;
819 	int error, status;
820 
821 	DPRINTF("entered\n");
822 
823 	CTL_IO_ASSERT(io, SCSI);
824 
825 	off = roff = ((off_t)lbalen->lba) * be_lun->cbe_lun.blocksize;
826 	vn_lock(be_lun->vn, LK_SHARED | LK_RETRY);
827 	error = VOP_IOCTL(be_lun->vn, FIOSEEKHOLE, &off,
828 	    0, curthread->td_ucred, curthread);
829 	if (error == 0 && off > roff)
830 		status = 0;	/* mapped up to off */
831 	else {
832 		error = VOP_IOCTL(be_lun->vn, FIOSEEKDATA, &off,
833 		    0, curthread->td_ucred, curthread);
834 		if (error == 0 && off > roff)
835 			status = 1;	/* deallocated up to off */
836 		else {
837 			status = 0;	/* unknown up to the end */
838 			off = be_lun->size_bytes;
839 		}
840 	}
841 	VOP_UNLOCK(be_lun->vn);
842 
843 	data = (struct scsi_get_lba_status_data *)io->scsiio.kern_data_ptr;
844 	scsi_u64to8b(lbalen->lba, data->descr[0].addr);
845 	scsi_ulto4b(MIN(UINT32_MAX, off / be_lun->cbe_lun.blocksize -
846 	    lbalen->lba), data->descr[0].length);
847 	data->descr[0].status = status;
848 
849 	ctl_complete_beio(beio);
850 }
851 
852 static uint64_t
853 ctl_be_block_getattr_file(struct ctl_be_block_lun *be_lun, const char *attrname)
854 {
855 	struct vattr		vattr;
856 	struct statfs		statfs;
857 	uint64_t		val;
858 	int			error;
859 
860 	val = UINT64_MAX;
861 	if (be_lun->vn == NULL)
862 		return (val);
863 	vn_lock(be_lun->vn, LK_SHARED | LK_RETRY);
864 	if (strcmp(attrname, "blocksused") == 0) {
865 		error = VOP_GETATTR(be_lun->vn, &vattr, curthread->td_ucred);
866 		if (error == 0)
867 			val = vattr.va_bytes / be_lun->cbe_lun.blocksize;
868 	}
869 	if (strcmp(attrname, "blocksavail") == 0 &&
870 	    !VN_IS_DOOMED(be_lun->vn)) {
871 		error = VFS_STATFS(be_lun->vn->v_mount, &statfs);
872 		if (error == 0)
873 			val = statfs.f_bavail * statfs.f_bsize /
874 			    be_lun->cbe_lun.blocksize;
875 	}
876 	VOP_UNLOCK(be_lun->vn);
877 	return (val);
878 }
879 
880 static void
881 ctl_be_block_unmap_file(struct ctl_be_block_lun *be_lun,
882 		        struct ctl_be_block_io *beio)
883 {
884 	struct ctl_be_block_filedata *file_data;
885 	union ctl_io *io;
886 	struct ctl_ptr_len_flags *ptrlen;
887 	struct scsi_unmap_desc *buf, *end;
888 	struct mount *mp;
889 	off_t off, len;
890 	int error;
891 
892 	io = beio->io;
893 	file_data = &be_lun->backend.file;
894 	mp = NULL;
895 	error = 0;
896 
897 	binuptime(&beio->ds_t0);
898 	devstat_start_transaction(be_lun->disk_stats, &beio->ds_t0);
899 
900 	(void)vn_start_write(be_lun->vn, &mp, V_WAIT);
901 	vn_lock(be_lun->vn, vn_lktype_write(mp, be_lun->vn) | LK_RETRY);
902 	if (beio->io_offset == -1) {
903 		beio->io_len = 0;
904 		ptrlen = (struct ctl_ptr_len_flags *)
905 		    &io->io_hdr.ctl_private[CTL_PRIV_LBA_LEN];
906 		buf = (struct scsi_unmap_desc *)ptrlen->ptr;
907 		end = buf + ptrlen->len / sizeof(*buf);
908 		for (; buf < end; buf++) {
909 			off = (off_t)scsi_8btou64(buf->lba) *
910 			    be_lun->cbe_lun.blocksize;
911 			len = (off_t)scsi_4btoul(buf->length) *
912 			    be_lun->cbe_lun.blocksize;
913 			beio->io_len += len;
914 			error = vn_deallocate(be_lun->vn, &off, &len,
915 			    0, IO_NOMACCHECK | IO_NODELOCKED, file_data->cred,
916 			    NOCRED);
917 			if (error != 0)
918 				break;
919 		}
920 	} else {
921 		/* WRITE_SAME */
922 		off = beio->io_offset;
923 		len = beio->io_len;
924 		error = vn_deallocate(be_lun->vn, &off, &len, 0,
925 		    IO_NOMACCHECK | IO_NODELOCKED, file_data->cred, NOCRED);
926 	}
927 	VOP_UNLOCK(be_lun->vn);
928 	vn_finished_write(mp);
929 
930 	mtx_lock(&be_lun->io_lock);
931 	devstat_end_transaction(beio->lun->disk_stats, beio->io_len,
932 	    beio->ds_tag_type, beio->ds_trans_type,
933 	    /*now*/ NULL, /*then*/&beio->ds_t0);
934 	mtx_unlock(&be_lun->io_lock);
935 
936 	/*
937 	 * If we got an error, set the sense data to "MEDIUM ERROR" and
938 	 * return the I/O to the user.
939 	 */
940 	switch (error) {
941 	case 0:
942 		ctl_io_set_success(io);
943 		break;
944 	case ENOSPC:
945 	case EDQUOT:
946 		ctl_io_set_space_alloc_fail(io);
947 		break;
948 	case EROFS:
949 	case EACCES:
950 		ctl_io_set_hw_write_protected(io);
951 		break;
952 	default:
953 		ctl_be_block_io_error(io, BIO_DELETE, 0);
954 	}
955 	ctl_complete_beio(beio);
956 }
957 
958 static void
959 ctl_be_block_dispatch_zvol(struct ctl_be_block_lun *be_lun,
960 			   struct ctl_be_block_io *beio)
961 {
962 	struct ctl_be_lun *cbe_lun = &be_lun->cbe_lun;
963 	union ctl_io *io;
964 	struct cdevsw *csw;
965 	struct cdev *dev;
966 	struct uio xuio;
967 	struct iovec *xiovec;
968 	int error, flags, i, ref;
969 
970 	DPRINTF("entered\n");
971 
972 	io = beio->io;
973 	flags = 0;
974 	if (ARGS(io)->flags & CTL_LLF_DPO)
975 		flags |= IO_DIRECT;
976 	if (beio->bio_cmd == BIO_WRITE && ARGS(io)->flags & CTL_LLF_FUA)
977 		flags |= IO_SYNC;
978 
979 	bzero(&xuio, sizeof(xuio));
980 	if (beio->bio_cmd == BIO_READ) {
981 		SDT_PROBE0(cbb, , read, file_start);
982 		xuio.uio_rw = UIO_READ;
983 	} else {
984 		SDT_PROBE0(cbb, , write, file_start);
985 		xuio.uio_rw = UIO_WRITE;
986 	}
987 	xuio.uio_offset = beio->io_offset;
988 	xuio.uio_resid = beio->io_len;
989 	xuio.uio_segflg = UIO_SYSSPACE;
990 	xuio.uio_iov = beio->xiovecs;
991 	xuio.uio_iovcnt = beio->num_segs;
992 	xuio.uio_td = curthread;
993 
994 	for (i = 0, xiovec = xuio.uio_iov; i < xuio.uio_iovcnt; i++, xiovec++) {
995 		xiovec->iov_base = beio->sg_segs[i].addr;
996 		xiovec->iov_len = beio->sg_segs[i].len;
997 	}
998 
999 	binuptime(&beio->ds_t0);
1000 	devstat_start_transaction(beio->lun->disk_stats, &beio->ds_t0);
1001 
1002 	csw = devvn_refthread(be_lun->vn, &dev, &ref);
1003 	if (csw) {
1004 		if (beio->bio_cmd == BIO_READ) {
1005 			if (beio->beio_cont == NULL &&
1006 			    cbe_lun->serseq == CTL_LUN_SERSEQ_SOFT)
1007 				ctl_serseq_done(io);
1008 			error = csw->d_read(dev, &xuio, flags);
1009 		} else
1010 			error = csw->d_write(dev, &xuio, flags);
1011 		dev_relthread(dev, ref);
1012 	} else
1013 		error = ENXIO;
1014 
1015 	if (beio->bio_cmd == BIO_READ)
1016 		SDT_PROBE0(cbb, , read, file_done);
1017 	else
1018 		SDT_PROBE0(cbb, , write, file_done);
1019 
1020 	mtx_lock(&be_lun->io_lock);
1021 	devstat_end_transaction(beio->lun->disk_stats, beio->io_len,
1022 	    beio->ds_tag_type, beio->ds_trans_type,
1023 	    /*now*/ NULL, /*then*/&beio->ds_t0);
1024 	mtx_unlock(&be_lun->io_lock);
1025 
1026 	/*
1027 	 * If we got an error, set the sense data to "MEDIUM ERROR" and
1028 	 * return the I/O to the user.
1029 	 */
1030 	if (error != 0) {
1031 		if (error == ENOSPC || error == EDQUOT) {
1032 			ctl_io_set_space_alloc_fail(io);
1033 		} else if (error == EROFS || error == EACCES) {
1034 			ctl_io_set_hw_write_protected(io);
1035 		} else {
1036 			ctl_be_block_io_error(io, beio->bio_cmd, 0);
1037 		}
1038 		ctl_complete_beio(beio);
1039 		return;
1040 	}
1041 
1042 	/*
1043 	 * If this is a write or a verify, we're all done.
1044 	 * If this is a read, we can now send the data to the user.
1045 	 */
1046 	if ((beio->bio_cmd == BIO_WRITE) ||
1047 	    (ARGS(io)->flags & CTL_LLF_VERIFY)) {
1048 		ctl_io_set_success(io);
1049 		ctl_complete_beio(beio);
1050 	} else {
1051 		if ((ARGS(io)->flags & CTL_LLF_READ) &&
1052 		    beio->beio_cont == NULL) {
1053 			ctl_io_set_success(io);
1054 			if (cbe_lun->serseq > CTL_LUN_SERSEQ_SOFT)
1055 				ctl_serseq_done(io);
1056 		}
1057 		ctl_datamove(io);
1058 	}
1059 }
1060 
1061 static void
1062 ctl_be_block_gls_zvol(struct ctl_be_block_lun *be_lun,
1063 			struct ctl_be_block_io *beio)
1064 {
1065 	union ctl_io *io = beio->io;
1066 	struct cdevsw *csw;
1067 	struct cdev *dev;
1068 	struct ctl_lba_len_flags *lbalen = ARGS(io);
1069 	struct scsi_get_lba_status_data *data;
1070 	off_t roff, off;
1071 	int error, ref, status;
1072 
1073 	DPRINTF("entered\n");
1074 
1075 	CTL_IO_ASSERT(io, SCSI);
1076 
1077 	csw = devvn_refthread(be_lun->vn, &dev, &ref);
1078 	if (csw == NULL) {
1079 		status = 0;	/* unknown up to the end */
1080 		off = be_lun->size_bytes;
1081 		goto done;
1082 	}
1083 	off = roff = ((off_t)lbalen->lba) * be_lun->cbe_lun.blocksize;
1084 	error = csw->d_ioctl(dev, FIOSEEKHOLE, (caddr_t)&off, FREAD,
1085 	    curthread);
1086 	if (error == 0 && off > roff)
1087 		status = 0;	/* mapped up to off */
1088 	else {
1089 		error = csw->d_ioctl(dev, FIOSEEKDATA, (caddr_t)&off, FREAD,
1090 		    curthread);
1091 		if (error == 0 && off > roff)
1092 			status = 1;	/* deallocated up to off */
1093 		else {
1094 			status = 0;	/* unknown up to the end */
1095 			off = be_lun->size_bytes;
1096 		}
1097 	}
1098 	dev_relthread(dev, ref);
1099 
1100 done:
1101 	data = (struct scsi_get_lba_status_data *)io->scsiio.kern_data_ptr;
1102 	scsi_u64to8b(lbalen->lba, data->descr[0].addr);
1103 	scsi_ulto4b(MIN(UINT32_MAX, off / be_lun->cbe_lun.blocksize -
1104 	    lbalen->lba), data->descr[0].length);
1105 	data->descr[0].status = status;
1106 
1107 	ctl_complete_beio(beio);
1108 }
1109 
1110 static void
1111 ctl_be_block_flush_dev(struct ctl_be_block_lun *be_lun,
1112 		       struct ctl_be_block_io *beio)
1113 {
1114 	struct bio *bio;
1115 	struct cdevsw *csw;
1116 	struct cdev *dev;
1117 	int ref;
1118 
1119 	DPRINTF("entered\n");
1120 
1121 	/* This can't fail, it's a blocking allocation. */
1122 	bio = g_alloc_bio();
1123 
1124 	bio->bio_cmd	    = BIO_FLUSH;
1125 	bio->bio_offset	    = 0;
1126 	bio->bio_data	    = 0;
1127 	bio->bio_done	    = ctl_be_block_biodone;
1128 	bio->bio_caller1    = beio;
1129 	bio->bio_pblkno	    = 0;
1130 
1131 	/*
1132 	 * We don't need to acquire the LUN lock here, because we are only
1133 	 * sending one bio, and so there is no other context to synchronize
1134 	 * with.
1135 	 */
1136 	beio->num_bios_sent = 1;
1137 	beio->send_complete = 1;
1138 
1139 	binuptime(&beio->ds_t0);
1140 	devstat_start_transaction(be_lun->disk_stats, &beio->ds_t0);
1141 
1142 	csw = devvn_refthread(be_lun->vn, &dev, &ref);
1143 	if (csw) {
1144 		bio->bio_dev = dev;
1145 		csw->d_strategy(bio);
1146 		dev_relthread(dev, ref);
1147 	} else {
1148 		bio->bio_error = ENXIO;
1149 		ctl_be_block_biodone(bio);
1150 	}
1151 }
1152 
1153 static void
1154 ctl_be_block_unmap_dev_range(struct ctl_be_block_lun *be_lun,
1155 		       struct ctl_be_block_io *beio,
1156 		       uint64_t off, uint64_t len, int last)
1157 {
1158 	struct bio *bio;
1159 	uint64_t maxlen;
1160 	struct cdevsw *csw;
1161 	struct cdev *dev;
1162 	int ref;
1163 
1164 	csw = devvn_refthread(be_lun->vn, &dev, &ref);
1165 	maxlen = LONG_MAX - (LONG_MAX % be_lun->cbe_lun.blocksize);
1166 	while (len > 0) {
1167 		bio = g_alloc_bio();
1168 		bio->bio_cmd	    = BIO_DELETE;
1169 		bio->bio_dev	    = dev;
1170 		bio->bio_offset	    = off;
1171 		bio->bio_length	    = MIN(len, maxlen);
1172 		bio->bio_data	    = 0;
1173 		bio->bio_done	    = ctl_be_block_biodone;
1174 		bio->bio_caller1    = beio;
1175 		bio->bio_pblkno     = off / be_lun->cbe_lun.blocksize;
1176 
1177 		off += bio->bio_length;
1178 		len -= bio->bio_length;
1179 
1180 		mtx_lock(&be_lun->io_lock);
1181 		beio->num_bios_sent++;
1182 		if (last && len == 0)
1183 			beio->send_complete = 1;
1184 		mtx_unlock(&be_lun->io_lock);
1185 
1186 		if (csw) {
1187 			csw->d_strategy(bio);
1188 		} else {
1189 			bio->bio_error = ENXIO;
1190 			ctl_be_block_biodone(bio);
1191 		}
1192 	}
1193 	if (csw)
1194 		dev_relthread(dev, ref);
1195 }
1196 
1197 static void
1198 ctl_be_block_unmap_dev(struct ctl_be_block_lun *be_lun,
1199 		       struct ctl_be_block_io *beio)
1200 {
1201 	union ctl_io *io;
1202 	struct ctl_ptr_len_flags *ptrlen;
1203 	struct scsi_unmap_desc *buf, *end;
1204 	uint64_t len;
1205 
1206 	io = beio->io;
1207 
1208 	DPRINTF("entered\n");
1209 
1210 	binuptime(&beio->ds_t0);
1211 	devstat_start_transaction(be_lun->disk_stats, &beio->ds_t0);
1212 
1213 	if (beio->io_offset == -1) {
1214 		beio->io_len = 0;
1215 		ptrlen = (struct ctl_ptr_len_flags *)&io->io_hdr.ctl_private[CTL_PRIV_LBA_LEN];
1216 		buf = (struct scsi_unmap_desc *)ptrlen->ptr;
1217 		end = buf + ptrlen->len / sizeof(*buf);
1218 		for (; buf < end; buf++) {
1219 			len = (uint64_t)scsi_4btoul(buf->length) *
1220 			    be_lun->cbe_lun.blocksize;
1221 			beio->io_len += len;
1222 			ctl_be_block_unmap_dev_range(be_lun, beio,
1223 			    scsi_8btou64(buf->lba) * be_lun->cbe_lun.blocksize,
1224 			    len, (end - buf < 2) ? TRUE : FALSE);
1225 		}
1226 	} else
1227 		ctl_be_block_unmap_dev_range(be_lun, beio,
1228 		    beio->io_offset, beio->io_len, TRUE);
1229 }
1230 
1231 static void
1232 ctl_be_block_dispatch_dev(struct ctl_be_block_lun *be_lun,
1233 			  struct ctl_be_block_io *beio)
1234 {
1235 	TAILQ_HEAD(, bio) queue = TAILQ_HEAD_INITIALIZER(queue);
1236 	struct bio *bio;
1237 	struct cdevsw *csw;
1238 	struct cdev *dev;
1239 	off_t cur_offset;
1240 	int i, max_iosize, ref;
1241 
1242 	DPRINTF("entered\n");
1243 	csw = devvn_refthread(be_lun->vn, &dev, &ref);
1244 
1245 	/*
1246 	 * We have to limit our I/O size to the maximum supported by the
1247 	 * backend device.
1248 	 */
1249 	if (csw) {
1250 		max_iosize = dev->si_iosize_max;
1251 		if (max_iosize <= 0)
1252 			max_iosize = DFLTPHYS;
1253 	} else
1254 		max_iosize = maxphys;
1255 
1256 	cur_offset = beio->io_offset;
1257 	for (i = 0; i < beio->num_segs; i++) {
1258 		size_t cur_size;
1259 		uint8_t *cur_ptr;
1260 
1261 		cur_size = beio->sg_segs[i].len;
1262 		cur_ptr = beio->sg_segs[i].addr;
1263 
1264 		while (cur_size > 0) {
1265 			/* This can't fail, it's a blocking allocation. */
1266 			bio = g_alloc_bio();
1267 
1268 			KASSERT(bio != NULL, ("g_alloc_bio() failed!\n"));
1269 
1270 			bio->bio_cmd = beio->bio_cmd;
1271 			bio->bio_dev = dev;
1272 			bio->bio_caller1 = beio;
1273 			bio->bio_length = min(cur_size, max_iosize);
1274 			bio->bio_offset = cur_offset;
1275 			bio->bio_data = cur_ptr;
1276 			bio->bio_done = ctl_be_block_biodone;
1277 			bio->bio_pblkno = cur_offset / be_lun->cbe_lun.blocksize;
1278 
1279 			cur_offset += bio->bio_length;
1280 			cur_ptr += bio->bio_length;
1281 			cur_size -= bio->bio_length;
1282 
1283 			TAILQ_INSERT_TAIL(&queue, bio, bio_queue);
1284 			beio->num_bios_sent++;
1285 		}
1286 	}
1287 	beio->send_complete = 1;
1288 	binuptime(&beio->ds_t0);
1289 	devstat_start_transaction(be_lun->disk_stats, &beio->ds_t0);
1290 
1291 	/*
1292 	 * Fire off all allocated requests!
1293 	 */
1294 	while ((bio = TAILQ_FIRST(&queue)) != NULL) {
1295 		TAILQ_REMOVE(&queue, bio, bio_queue);
1296 		if (csw)
1297 			csw->d_strategy(bio);
1298 		else {
1299 			bio->bio_error = ENXIO;
1300 			ctl_be_block_biodone(bio);
1301 		}
1302 	}
1303 	if (csw)
1304 		dev_relthread(dev, ref);
1305 }
1306 
1307 static uint64_t
1308 ctl_be_block_getattr_dev(struct ctl_be_block_lun *be_lun, const char *attrname)
1309 {
1310 	struct diocgattr_arg	arg;
1311 	struct cdevsw *csw;
1312 	struct cdev *dev;
1313 	int error, ref;
1314 
1315 	csw = devvn_refthread(be_lun->vn, &dev, &ref);
1316 	if (csw == NULL)
1317 		return (UINT64_MAX);
1318 	strlcpy(arg.name, attrname, sizeof(arg.name));
1319 	arg.len = sizeof(arg.value.off);
1320 	if (csw->d_ioctl) {
1321 		error = csw->d_ioctl(dev, DIOCGATTR, (caddr_t)&arg, FREAD,
1322 		    curthread);
1323 	} else
1324 		error = ENODEV;
1325 	dev_relthread(dev, ref);
1326 	if (error != 0)
1327 		return (UINT64_MAX);
1328 	return (arg.value.off);
1329 }
1330 
1331 static void
1332 ctl_be_block_namespace_data(struct ctl_be_block_lun *be_lun,
1333 			    union ctl_io *io)
1334 {
1335 	struct ctl_be_lun *cbe_lun = &be_lun->cbe_lun;
1336 	struct nvme_namespace_data *nsdata;
1337 
1338 	nsdata = (struct nvme_namespace_data *)io->nvmeio.kern_data_ptr;
1339 	memset(nsdata, 0, sizeof(*nsdata));
1340 	nsdata->nsze = htole64(be_lun->size_blocks);
1341 	nsdata->ncap = nsdata->nsze;
1342 	nsdata->nuse = nsdata->nsze;
1343 	nsdata->nlbaf = 1 - 1;
1344 	nsdata->dlfeat = NVMEM(NVME_NS_DATA_DLFEAT_DWZ) |
1345 	    NVMEF(NVME_NS_DATA_DLFEAT_READ, NVME_NS_DATA_DLFEAT_READ_00);
1346 	nsdata->flbas = NVMEF(NVME_NS_DATA_FLBAS_FORMAT, 0);
1347 	nsdata->lbaf[0] = NVMEF(NVME_NS_DATA_LBAF_LBADS,
1348 	    ffs(cbe_lun->blocksize) - 1);
1349 
1350 	ctl_lun_nsdata_ids(cbe_lun, nsdata);
1351 	ctl_config_read_done(io);
1352 }
1353 
1354 static void
1355 ctl_be_block_nvme_ids(struct ctl_be_block_lun *be_lun,
1356 		      union ctl_io *io)
1357 {
1358 	struct ctl_be_lun *cbe_lun = &be_lun->cbe_lun;
1359 
1360 	ctl_lun_nvme_ids(cbe_lun, io->nvmeio.kern_data_ptr);
1361 	ctl_config_read_done(io);
1362 }
1363 
1364 static void
1365 ctl_be_block_cw_dispatch_sync(struct ctl_be_block_lun *be_lun,
1366 			    union ctl_io *io)
1367 {
1368 	struct ctl_be_lun *cbe_lun = &be_lun->cbe_lun;
1369 	struct ctl_be_block_io *beio;
1370 	struct ctl_lba_len_flags *lbalen;
1371 
1372 	DPRINTF("entered\n");
1373 	beio = (struct ctl_be_block_io *)PRIV(io)->ptr;
1374 	lbalen = (struct ctl_lba_len_flags *)&io->io_hdr.ctl_private[CTL_PRIV_LBA_LEN];
1375 
1376 	beio->io_len = lbalen->len * cbe_lun->blocksize;
1377 	beio->io_offset = lbalen->lba * cbe_lun->blocksize;
1378 	beio->io_arg = (lbalen->flags & SSC_IMMED) != 0;
1379 	beio->bio_cmd = BIO_FLUSH;
1380 	beio->ds_trans_type = DEVSTAT_NO_DATA;
1381 	DPRINTF("SYNC\n");
1382 	be_lun->lun_flush(be_lun, beio);
1383 }
1384 
1385 static void
1386 ctl_be_block_cw_done_ws(struct ctl_be_block_io *beio)
1387 {
1388 	union ctl_io *io;
1389 
1390 	io = beio->io;
1391 	ctl_free_beio(beio);
1392 	if ((io->io_hdr.flags & CTL_FLAG_ABORT) ||
1393 	    ((io->io_hdr.status & CTL_STATUS_MASK) != CTL_STATUS_NONE &&
1394 	     (io->io_hdr.status & CTL_STATUS_MASK) != CTL_SUCCESS)) {
1395 		ctl_config_write_done(io);
1396 		return;
1397 	}
1398 
1399 	ctl_be_block_config_write(io);
1400 }
1401 
1402 static void
1403 ctl_be_block_cw_dispatch_ws(struct ctl_be_block_lun *be_lun,
1404 			    union ctl_io *io)
1405 {
1406 	struct ctl_be_block_softc *softc = be_lun->softc;
1407 	struct ctl_be_lun *cbe_lun = &be_lun->cbe_lun;
1408 	struct ctl_be_block_io *beio;
1409 	struct ctl_lba_len_flags *lbalen;
1410 	uint64_t len_left, lba;
1411 	uint32_t pb, pbo, adj;
1412 	int i, seglen;
1413 	uint8_t *buf, *end;
1414 
1415 	DPRINTF("entered\n");
1416 
1417 	CTL_IO_ASSERT(io, SCSI);
1418 
1419 	beio = (struct ctl_be_block_io *)PRIV(io)->ptr;
1420 	lbalen = ARGS(io);
1421 
1422 	if (lbalen->flags & ~(SWS_LBDATA | SWS_UNMAP | SWS_ANCHOR | SWS_NDOB) ||
1423 	    (lbalen->flags & (SWS_UNMAP | SWS_ANCHOR) && be_lun->unmap == NULL)) {
1424 		ctl_free_beio(beio);
1425 		ctl_set_invalid_field(&io->scsiio,
1426 				      /*sks_valid*/ 1,
1427 				      /*command*/ 1,
1428 				      /*field*/ 1,
1429 				      /*bit_valid*/ 0,
1430 				      /*bit*/ 0);
1431 		ctl_config_write_done(io);
1432 		return;
1433 	}
1434 
1435 	if (lbalen->flags & (SWS_UNMAP | SWS_ANCHOR)) {
1436 		beio->io_offset = lbalen->lba * cbe_lun->blocksize;
1437 		beio->io_len = (uint64_t)lbalen->len * cbe_lun->blocksize;
1438 		beio->bio_cmd = BIO_DELETE;
1439 		beio->ds_trans_type = DEVSTAT_FREE;
1440 
1441 		be_lun->unmap(be_lun, beio);
1442 		return;
1443 	}
1444 
1445 	beio->bio_cmd = BIO_WRITE;
1446 	beio->ds_trans_type = DEVSTAT_WRITE;
1447 
1448 	DPRINTF("WRITE SAME at LBA %jx len %u\n",
1449 	       (uintmax_t)lbalen->lba, lbalen->len);
1450 
1451 	pb = cbe_lun->blocksize << be_lun->cbe_lun.pblockexp;
1452 	if (be_lun->cbe_lun.pblockoff > 0)
1453 		pbo = pb - cbe_lun->blocksize * be_lun->cbe_lun.pblockoff;
1454 	else
1455 		pbo = 0;
1456 	len_left = (uint64_t)lbalen->len * cbe_lun->blocksize;
1457 	for (i = 0, lba = 0; i < CTLBLK_MAX_SEGS && len_left > 0; i++) {
1458 		/*
1459 		 * Setup the S/G entry for this chunk.
1460 		 */
1461 		seglen = MIN(CTLBLK_MAX_SEG, len_left);
1462 		if (pb > cbe_lun->blocksize) {
1463 			adj = ((lbalen->lba + lba) * cbe_lun->blocksize +
1464 			    seglen - pbo) % pb;
1465 			if (seglen > adj)
1466 				seglen -= adj;
1467 			else
1468 				seglen -= seglen % cbe_lun->blocksize;
1469 		} else
1470 			seglen -= seglen % cbe_lun->blocksize;
1471 		ctl_alloc_seg(softc, &beio->sg_segs[i], seglen);
1472 
1473 		DPRINTF("segment %d addr %p len %zd\n", i,
1474 			beio->sg_segs[i].addr, beio->sg_segs[i].len);
1475 
1476 		beio->num_segs++;
1477 		len_left -= seglen;
1478 
1479 		buf = beio->sg_segs[i].addr;
1480 		end = buf + seglen;
1481 		for (; buf < end; buf += cbe_lun->blocksize) {
1482 			if (lbalen->flags & SWS_NDOB) {
1483 				memset(buf, 0, cbe_lun->blocksize);
1484 			} else {
1485 				memcpy(buf, io->scsiio.kern_data_ptr,
1486 				    cbe_lun->blocksize);
1487 			}
1488 			if (lbalen->flags & SWS_LBDATA)
1489 				scsi_ulto4b(lbalen->lba + lba, buf);
1490 			lba++;
1491 		}
1492 	}
1493 
1494 	beio->io_offset = lbalen->lba * cbe_lun->blocksize;
1495 	beio->io_len = lba * cbe_lun->blocksize;
1496 
1497 	/* We can not do all in one run. Correct and schedule rerun. */
1498 	if (len_left > 0) {
1499 		lbalen->lba += lba;
1500 		lbalen->len -= lba;
1501 		beio->beio_cont = ctl_be_block_cw_done_ws;
1502 	}
1503 
1504 	be_lun->dispatch(be_lun, beio);
1505 }
1506 
1507 static void
1508 ctl_be_block_cw_dispatch_unmap(struct ctl_be_block_lun *be_lun,
1509 			       union ctl_io *io)
1510 {
1511 	struct ctl_be_block_io *beio;
1512 	struct ctl_ptr_len_flags *ptrlen;
1513 
1514 	DPRINTF("entered\n");
1515 
1516 	CTL_IO_ASSERT(io, SCSI);
1517 
1518 	beio = (struct ctl_be_block_io *)PRIV(io)->ptr;
1519 	ptrlen = (struct ctl_ptr_len_flags *)&io->io_hdr.ctl_private[CTL_PRIV_LBA_LEN];
1520 
1521 	if ((ptrlen->flags & ~SU_ANCHOR) != 0 || be_lun->unmap == NULL) {
1522 		ctl_free_beio(beio);
1523 		ctl_set_invalid_field(&io->scsiio,
1524 				      /*sks_valid*/ 0,
1525 				      /*command*/ 1,
1526 				      /*field*/ 0,
1527 				      /*bit_valid*/ 0,
1528 				      /*bit*/ 0);
1529 		ctl_config_write_done(io);
1530 		return;
1531 	}
1532 
1533 	beio->io_len = 0;
1534 	beio->io_offset = -1;
1535 	beio->bio_cmd = BIO_DELETE;
1536 	beio->ds_trans_type = DEVSTAT_FREE;
1537 	DPRINTF("UNMAP\n");
1538 	be_lun->unmap(be_lun, beio);
1539 }
1540 
1541 static void
1542 ctl_be_block_cw_dispatch_flush(struct ctl_be_block_lun *be_lun,
1543 			       union ctl_io *io)
1544 {
1545 	struct ctl_be_block_io *beio;
1546 
1547 	DPRINTF("entered\n");
1548 	beio = (struct ctl_be_block_io *)PRIV(io)->ptr;
1549 
1550 	beio->io_len = be_lun->size_bytes;
1551 	beio->io_offset = 0;
1552 	beio->io_arg = 1;
1553 	beio->bio_cmd = BIO_FLUSH;
1554 	beio->ds_trans_type = DEVSTAT_NO_DATA;
1555 	DPRINTF("FLUSH\n");
1556 	be_lun->lun_flush(be_lun, beio);
1557 }
1558 
1559 static void
1560 ctl_be_block_cw_dispatch_wu(struct ctl_be_block_lun *be_lun,
1561 			    union ctl_io *io)
1562 {
1563 	struct ctl_be_lun *cbe_lun = &be_lun->cbe_lun;
1564 	struct ctl_be_block_io *beio;
1565 	struct ctl_lba_len_flags *lbalen;
1566 
1567 	CTL_IO_ASSERT(io, NVME);
1568 
1569 	beio = (struct ctl_be_block_io *)PRIV(io)->ptr;
1570 	lbalen = ARGS(io);
1571 
1572 	/*
1573 	 * XXX: Not quite right as reads will return zeroes rather
1574 	 * than failing.
1575 	 */
1576 	beio->io_offset = lbalen->lba * cbe_lun->blocksize;
1577 	beio->io_len = (uint64_t)lbalen->len * cbe_lun->blocksize;
1578 	beio->bio_cmd = BIO_DELETE;
1579 	beio->ds_trans_type = DEVSTAT_FREE;
1580 
1581 	be_lun->unmap(be_lun, beio);
1582 }
1583 
1584 static void
1585 ctl_be_block_cw_dispatch_wz(struct ctl_be_block_lun *be_lun,
1586 			    union ctl_io *io)
1587 {
1588 	struct ctl_be_block_softc *softc = be_lun->softc;
1589 	struct ctl_be_lun *cbe_lun = &be_lun->cbe_lun;
1590 	struct ctl_be_block_io *beio;
1591 	struct ctl_lba_len_flags *lbalen;
1592 	uint64_t len_left, lba;
1593 	uint32_t pb, pbo, adj;
1594 	int i, seglen;
1595 
1596 	DPRINTF("entered\n");
1597 
1598 	CTL_IO_ASSERT(io, NVME);
1599 
1600 	beio = (struct ctl_be_block_io *)PRIV(io)->ptr;
1601 	lbalen = ARGS(io);
1602 
1603 	if ((le32toh(io->nvmeio.cmd.cdw12) & (1U << 25)) != 0 &&
1604 	    be_lun->unmap != NULL) {
1605 		beio->io_offset = lbalen->lba * cbe_lun->blocksize;
1606 		beio->io_len = (uint64_t)lbalen->len * cbe_lun->blocksize;
1607 		beio->bio_cmd = BIO_DELETE;
1608 		beio->ds_trans_type = DEVSTAT_FREE;
1609 
1610 		be_lun->unmap(be_lun, beio);
1611 		return;
1612 	}
1613 
1614 	beio->bio_cmd = BIO_WRITE;
1615 	beio->ds_trans_type = DEVSTAT_WRITE;
1616 
1617 	DPRINTF("WRITE ZEROES at LBA %jx len %u\n",
1618 	       (uintmax_t)lbalen->lba, lbalen->len);
1619 
1620 	pb = cbe_lun->blocksize << be_lun->cbe_lun.pblockexp;
1621 	if (be_lun->cbe_lun.pblockoff > 0)
1622 		pbo = pb - cbe_lun->blocksize * be_lun->cbe_lun.pblockoff;
1623 	else
1624 		pbo = 0;
1625 	len_left = (uint64_t)lbalen->len * cbe_lun->blocksize;
1626 	for (i = 0, lba = 0; i < CTLBLK_MAX_SEGS && len_left > 0; i++) {
1627 		/*
1628 		 * Setup the S/G entry for this chunk.
1629 		 */
1630 		seglen = MIN(CTLBLK_MAX_SEG, len_left);
1631 		if (pb > cbe_lun->blocksize) {
1632 			adj = ((lbalen->lba + lba) * cbe_lun->blocksize +
1633 			    seglen - pbo) % pb;
1634 			if (seglen > adj)
1635 				seglen -= adj;
1636 			else
1637 				seglen -= seglen % cbe_lun->blocksize;
1638 		} else
1639 			seglen -= seglen % cbe_lun->blocksize;
1640 		ctl_alloc_seg(softc, &beio->sg_segs[i], seglen);
1641 
1642 		DPRINTF("segment %d addr %p len %zd\n", i,
1643 			beio->sg_segs[i].addr, beio->sg_segs[i].len);
1644 
1645 		beio->num_segs++;
1646 		len_left -= seglen;
1647 
1648 		memset(beio->sg_segs[i].addr, 0, seglen);
1649 		lba += seglen / cbe_lun->blocksize;
1650 	}
1651 
1652 	beio->io_offset = lbalen->lba * cbe_lun->blocksize;
1653 	beio->io_len = lba * cbe_lun->blocksize;
1654 
1655 	/* We can not do all in one run. Correct and schedule rerun. */
1656 	if (len_left > 0) {
1657 		lbalen->lba += lba;
1658 		lbalen->len -= lba;
1659 		beio->beio_cont = ctl_be_block_cw_done_ws;
1660 	}
1661 
1662 	be_lun->dispatch(be_lun, beio);
1663 }
1664 
1665 static void
1666 ctl_be_block_cw_dispatch_dsm(struct ctl_be_block_lun *be_lun,
1667 			     union ctl_io *io)
1668 {
1669 	struct ctl_be_lun *cbe_lun = &be_lun->cbe_lun;
1670 	struct ctl_be_block_io *beio;
1671 	struct nvme_dsm_range *r;
1672 	uint64_t lba;
1673 	uint32_t num_blocks;
1674 	u_int i, ranges;
1675 
1676 	CTL_IO_ASSERT(io, NVME);
1677 
1678 	beio = (struct ctl_be_block_io *)PRIV(io)->ptr;
1679 
1680 	if (be_lun->unmap == NULL) {
1681 		ctl_free_beio(beio);
1682 		ctl_nvme_set_success(&io->nvmeio);
1683 		ctl_config_write_done(io);
1684 		return;
1685 	}
1686 
1687 	ranges = le32toh(io->nvmeio.cmd.cdw10) & 0xff;
1688 	r = (struct nvme_dsm_range *)io->nvmeio.kern_data_ptr;
1689 
1690 	/* Find the next range to delete. */
1691 	for (i = DSM_RANGE(io); i < ranges; i++) {
1692 		if ((le32toh(r[i].attributes) & (1U << 2)) != 0)
1693 			break;
1694 	}
1695 
1696 	/* If no range to delete, complete the operation. */
1697 	if (i == ranges) {
1698 		ctl_free_beio(beio);
1699 		ctl_nvme_set_success(&io->nvmeio);
1700 		ctl_config_write_done(io);
1701 		return;
1702 	}
1703 
1704 	/* If this is not the last range, request a rerun after this range. */
1705 	if (i + 1 < ranges) {
1706 		DSM_RANGE(io) = i + 1;
1707 		beio->beio_cont = ctl_be_block_cw_done_ws;
1708 	}
1709 
1710 	lba = le64toh(r[i].starting_lba);
1711 	num_blocks = le32toh(r[i].length);
1712 
1713 	beio->io_offset = lba * cbe_lun->blocksize;
1714 	beio->io_len = (uint64_t)num_blocks * cbe_lun->blocksize;
1715 	beio->bio_cmd = BIO_DELETE;
1716 	beio->ds_trans_type = DEVSTAT_FREE;
1717 
1718 	be_lun->unmap(be_lun, beio);
1719 }
1720 
1721 static void
1722 ctl_be_block_scsi_cr_done(struct ctl_be_block_io *beio)
1723 {
1724 	union ctl_io *io;
1725 
1726 	io = beio->io;
1727 	ctl_free_beio(beio);
1728 	ctl_config_read_done(io);
1729 }
1730 
1731 static void
1732 ctl_be_block_scsi_cr_dispatch(struct ctl_be_block_lun *be_lun,
1733 			      union ctl_io *io)
1734 {
1735 	struct ctl_be_block_io *beio;
1736 	struct ctl_be_block_softc *softc;
1737 
1738 	DPRINTF("entered\n");
1739 
1740 	softc = be_lun->softc;
1741 	beio = ctl_alloc_beio(softc);
1742 	beio->io = io;
1743 	beio->lun = be_lun;
1744 	beio->beio_cont = ctl_be_block_scsi_cr_done;
1745 	PRIV(io)->ptr = (void *)beio;
1746 
1747 	switch (io->scsiio.cdb[0]) {
1748 	case SERVICE_ACTION_IN:		/* GET LBA STATUS */
1749 		beio->bio_cmd = -1;
1750 		beio->ds_trans_type = DEVSTAT_NO_DATA;
1751 		beio->ds_tag_type = DEVSTAT_TAG_ORDERED;
1752 		beio->io_len = 0;
1753 		if (be_lun->get_lba_status)
1754 			be_lun->get_lba_status(be_lun, beio);
1755 		else
1756 			ctl_be_block_scsi_cr_done(beio);
1757 		break;
1758 	default:
1759 		panic("Unhandled CDB type %#x", io->scsiio.cdb[0]);
1760 		break;
1761 	}
1762 }
1763 
1764 static void
1765 ctl_be_block_nvme_cr_dispatch(struct ctl_be_block_lun *be_lun,
1766 			      union ctl_io *io)
1767 {
1768 	uint8_t cns;
1769 
1770 	DPRINTF("entered\n");
1771 
1772 	MPASS(io->nvmeio.cmd.opc == NVME_OPC_IDENTIFY);
1773 
1774 	cns = le32toh(io->nvmeio.cmd.cdw10) & 0xff;
1775 	switch (cns) {
1776 	case 0:
1777 		ctl_be_block_namespace_data(be_lun, io);
1778 		break;
1779 	case 3:
1780 		ctl_be_block_nvme_ids(be_lun, io);
1781 		break;
1782 	default:
1783 		__assert_unreachable();
1784 	}
1785 }
1786 
1787 static void
1788 ctl_be_block_cr_dispatch(struct ctl_be_block_lun *be_lun,
1789 			 union ctl_io *io)
1790 {
1791 	switch (io->io_hdr.io_type) {
1792 	case CTL_IO_SCSI:
1793 		ctl_be_block_scsi_cr_dispatch(be_lun, io);
1794 		break;
1795 	case CTL_IO_NVME_ADMIN:
1796 		ctl_be_block_nvme_cr_dispatch(be_lun, io);
1797 		break;
1798 	default:
1799 		__assert_unreachable();
1800 	}
1801 }
1802 
1803 static void
1804 ctl_be_block_cw_done(struct ctl_be_block_io *beio)
1805 {
1806 	union ctl_io *io;
1807 
1808 	io = beio->io;
1809 	ctl_free_beio(beio);
1810 	ctl_config_write_done(io);
1811 }
1812 
1813 static void
1814 ctl_be_block_scsi_cw_dispatch(struct ctl_be_block_lun *be_lun,
1815 			      union ctl_io *io)
1816 {
1817 	struct ctl_be_block_io *beio;
1818 
1819 	DPRINTF("entered\n");
1820 
1821 	beio = (struct ctl_be_block_io *)PRIV(io)->ptr;
1822 
1823 	switch (io->scsiio.tag_type) {
1824 	case CTL_TAG_ORDERED:
1825 		beio->ds_tag_type = DEVSTAT_TAG_ORDERED;
1826 		break;
1827 	case CTL_TAG_HEAD_OF_QUEUE:
1828 		beio->ds_tag_type = DEVSTAT_TAG_HEAD;
1829 		break;
1830 	case CTL_TAG_UNTAGGED:
1831 	case CTL_TAG_SIMPLE:
1832 	case CTL_TAG_ACA:
1833 	default:
1834 		beio->ds_tag_type = DEVSTAT_TAG_SIMPLE;
1835 		break;
1836 	}
1837 
1838 	switch (io->scsiio.cdb[0]) {
1839 	case SYNCHRONIZE_CACHE:
1840 	case SYNCHRONIZE_CACHE_16:
1841 		ctl_be_block_cw_dispatch_sync(be_lun, io);
1842 		break;
1843 	case WRITE_SAME_10:
1844 	case WRITE_SAME_16:
1845 		ctl_be_block_cw_dispatch_ws(be_lun, io);
1846 		break;
1847 	case UNMAP:
1848 		ctl_be_block_cw_dispatch_unmap(be_lun, io);
1849 		break;
1850 	default:
1851 		panic("Unhandled CDB type %#x", io->scsiio.cdb[0]);
1852 		break;
1853 	}
1854 }
1855 
1856 static void
1857 ctl_be_block_nvme_cw_dispatch(struct ctl_be_block_lun *be_lun,
1858 			      union ctl_io *io)
1859 {
1860 	struct ctl_be_block_io *beio;
1861 
1862 	DPRINTF("entered\n");
1863 
1864 	beio = (struct ctl_be_block_io *)PRIV(io)->ptr;
1865 	beio->ds_tag_type = DEVSTAT_TAG_SIMPLE;
1866 
1867 	switch (io->nvmeio.cmd.opc) {
1868 	case NVME_OPC_FLUSH:
1869 		ctl_be_block_cw_dispatch_flush(be_lun, io);
1870 		break;
1871 	case NVME_OPC_WRITE_UNCORRECTABLE:
1872 		ctl_be_block_cw_dispatch_wu(be_lun, io);
1873 		break;
1874 	case NVME_OPC_WRITE_ZEROES:
1875 		ctl_be_block_cw_dispatch_wz(be_lun, io);
1876 		break;
1877 	case NVME_OPC_DATASET_MANAGEMENT:
1878 		ctl_be_block_cw_dispatch_dsm(be_lun, io);
1879 		break;
1880 	default:
1881 		__assert_unreachable();
1882 	}
1883 }
1884 
1885 static void
1886 ctl_be_block_cw_dispatch(struct ctl_be_block_lun *be_lun,
1887 			 union ctl_io *io)
1888 {
1889 	struct ctl_be_block_io *beio;
1890 	struct ctl_be_block_softc *softc;
1891 
1892 	softc = be_lun->softc;
1893 	beio = ctl_alloc_beio(softc);
1894 	beio->io = io;
1895 	beio->lun = be_lun;
1896 	beio->beio_cont = ctl_be_block_cw_done;
1897 	PRIV(io)->ptr = (void *)beio;
1898 
1899 	switch (io->io_hdr.io_type) {
1900 	case CTL_IO_SCSI:
1901 		ctl_be_block_scsi_cw_dispatch(be_lun, io);
1902 		break;
1903 	case CTL_IO_NVME:
1904 		ctl_be_block_nvme_cw_dispatch(be_lun, io);
1905 		break;
1906 	default:
1907 		__assert_unreachable();
1908 	}
1909 }
1910 
/*
 * SDT probes of the "cbb" provider used by ctl_be_block_dispatch(): the
 * "start" probes fire on entry for a read or a write, the "alloc_done"
 * probes fire once the S/G segments have been allocated.
 *
 * NOTE(review): each probe is defined here with one "uint64_t" argument,
 * yet ctl_be_block_dispatch() fires them through SDT_PROBE0 -- confirm
 * whether an argument was intended.
 */
1911 SDT_PROBE_DEFINE1(cbb, , read, start, "uint64_t");
1912 SDT_PROBE_DEFINE1(cbb, , write, start, "uint64_t");
1913 SDT_PROBE_DEFINE1(cbb, , read, alloc_done, "uint64_t");
1914 SDT_PROBE_DEFINE1(cbb, , write, alloc_done, "uint64_t");
1915 
1916 static void
1917 ctl_be_block_next(struct ctl_be_block_io *beio)
1918 {
1919 	struct ctl_be_block_lun *be_lun;
1920 	union ctl_io *io;
1921 
1922 	io = beio->io;
1923 	be_lun = beio->lun;
1924 	ctl_free_beio(beio);
1925 	if ((io->io_hdr.flags & CTL_FLAG_ABORT) ||
1926 	    ((io->io_hdr.status & CTL_STATUS_MASK) != CTL_STATUS_NONE &&
1927 	     (io->io_hdr.status & CTL_STATUS_MASK) != CTL_SUCCESS)) {
1928 		ctl_data_submit_done(io);
1929 		return;
1930 	}
1931 
1932 	io->io_hdr.status &= ~CTL_STATUS_MASK;
1933 	io->io_hdr.status |= CTL_STATUS_NONE;
1934 
1935 	mtx_lock(&be_lun->queue_lock);
1936 	STAILQ_INSERT_TAIL(&be_lun->input_queue, &io->io_hdr, links);
1937 	mtx_unlock(&be_lun->queue_lock);
1938 	taskqueue_enqueue(be_lun->io_taskqueue, &be_lun->io_task);
1939 }
1940 
1941 static void
1942 ctl_be_block_dispatch(struct ctl_be_block_lun *be_lun,
1943 			   union ctl_io *io)
1944 {
1945 	struct ctl_be_lun *cbe_lun = &be_lun->cbe_lun;
1946 	struct ctl_be_block_io *beio;
1947 	struct ctl_be_block_softc *softc;
1948 	struct ctl_lba_len_flags *lbalen;
1949 	struct ctl_ptr_len_flags *bptrlen;
1950 	uint64_t len_left, lbas;
1951 	int i;
1952 
1953 	softc = be_lun->softc;
1954 
1955 	DPRINTF("entered\n");
1956 
1957 	lbalen = ARGS(io);
1958 	if (lbalen->flags & CTL_LLF_WRITE) {
1959 		SDT_PROBE0(cbb, , write, start);
1960 	} else {
1961 		SDT_PROBE0(cbb, , read, start);
1962 	}
1963 
1964 	beio = ctl_alloc_beio(softc);
1965 	beio->io = io;
1966 	beio->lun = be_lun;
1967 	bptrlen = PRIV(io);
1968 	bptrlen->ptr = (void *)beio;
1969 
1970 	switch (io->io_hdr.io_type) {
1971 	case CTL_IO_SCSI:
1972 		switch (io->scsiio.tag_type) {
1973 		case CTL_TAG_ORDERED:
1974 			beio->ds_tag_type = DEVSTAT_TAG_ORDERED;
1975 			break;
1976 		case CTL_TAG_HEAD_OF_QUEUE:
1977 			beio->ds_tag_type = DEVSTAT_TAG_HEAD;
1978 			break;
1979 		case CTL_TAG_UNTAGGED:
1980 		case CTL_TAG_SIMPLE:
1981 		case CTL_TAG_ACA:
1982 		default:
1983 			beio->ds_tag_type = DEVSTAT_TAG_SIMPLE;
1984 			break;
1985 		}
1986 		break;
1987 	case CTL_IO_NVME:
1988 		beio->ds_tag_type = DEVSTAT_TAG_SIMPLE;
1989 		break;
1990 	default:
1991 		__assert_unreachable();
1992 	}
1993 
1994 	if (lbalen->flags & CTL_LLF_WRITE) {
1995 		beio->bio_cmd = BIO_WRITE;
1996 		beio->ds_trans_type = DEVSTAT_WRITE;
1997 	} else {
1998 		beio->bio_cmd = BIO_READ;
1999 		beio->ds_trans_type = DEVSTAT_READ;
2000 	}
2001 
2002 	DPRINTF("%s at LBA %jx len %u @%ju\n",
2003 	       (beio->bio_cmd == BIO_READ) ? "READ" : "WRITE",
2004 	       (uintmax_t)lbalen->lba, lbalen->len, bptrlen->len);
2005 	lbas = CTLBLK_MAX_IO_SIZE;
2006 	if (lbalen->flags & CTL_LLF_COMPARE) {
2007 		beio->two_sglists = 1;
2008 		lbas /= 2;
2009 	}
2010 	lbas = MIN(lbalen->len - bptrlen->len, lbas / cbe_lun->blocksize);
2011 	beio->io_offset = (lbalen->lba + bptrlen->len) * cbe_lun->blocksize;
2012 	beio->io_len = lbas * cbe_lun->blocksize;
2013 	bptrlen->len += lbas;
2014 
2015 	for (i = 0, len_left = beio->io_len; len_left > 0; i++) {
2016 		KASSERT(i < CTLBLK_MAX_SEGS, ("Too many segs (%d >= %d)",
2017 		    i, CTLBLK_MAX_SEGS));
2018 
2019 		/*
2020 		 * Setup the S/G entry for this chunk.
2021 		 */
2022 		ctl_alloc_seg(softc, &beio->sg_segs[i],
2023 		    MIN(CTLBLK_MAX_SEG, len_left));
2024 
2025 		DPRINTF("segment %d addr %p len %zd\n", i,
2026 			beio->sg_segs[i].addr, beio->sg_segs[i].len);
2027 
2028 		/* Set up second segment for compare operation. */
2029 		if (beio->two_sglists) {
2030 			ctl_alloc_seg(softc,
2031 			    &beio->sg_segs[i + CTLBLK_HALF_SEGS],
2032 			    beio->sg_segs[i].len);
2033 		}
2034 
2035 		beio->num_segs++;
2036 		len_left -= beio->sg_segs[i].len;
2037 	}
2038 	if (bptrlen->len < lbalen->len)
2039 		beio->beio_cont = ctl_be_block_next;
2040 	ctl_set_be_move_done(io, ctl_be_block_move_done);
2041 	/* For compare we have separate S/G lists for read and datamove. */
2042 	if (beio->two_sglists)
2043 		ctl_set_kern_data_ptr(io, &beio->sg_segs[CTLBLK_HALF_SEGS]);
2044 	else
2045 		ctl_set_kern_data_ptr(io, beio->sg_segs);
2046 	ctl_set_kern_data_len(io, beio->io_len);
2047 	ctl_set_kern_sg_entries(io, beio->num_segs);
2048 	ctl_set_kern_data_ref(io, ctl_refcnt_beio);
2049 	ctl_set_kern_data_arg(io, beio);
2050 	io->io_hdr.flags |= CTL_FLAG_ALLOCATED;
2051 
2052 	/*
2053 	 * For the read case, we need to read the data into our buffers and
2054 	 * then we can send it back to the user.  For the write case, we
2055 	 * need to get the data from the user first.
2056 	 */
2057 	if (beio->bio_cmd == BIO_READ) {
2058 		SDT_PROBE0(cbb, , read, alloc_done);
2059 		be_lun->dispatch(be_lun, beio);
2060 	} else {
2061 		SDT_PROBE0(cbb, , write, alloc_done);
2062 		ctl_datamove(io);
2063 	}
2064 }
2065 
2066 static void
2067 ctl_be_block_worker(void *context, int pending)
2068 {
2069 	struct ctl_be_block_lun *be_lun = (struct ctl_be_block_lun *)context;
2070 	struct ctl_be_lun *cbe_lun = &be_lun->cbe_lun;
2071 	union ctl_io *io;
2072 	struct ctl_be_block_io *beio;
2073 
2074 	DPRINTF("entered\n");
2075 	/*
2076 	 * Fetch and process I/Os from all queues.  If we detect LUN
2077 	 * CTL_LUN_FLAG_NO_MEDIA status here -- it is result of a race,
2078 	 * so make response maximally opaque to not confuse initiator.
2079 	 */
2080 	for (;;) {
2081 		mtx_lock(&be_lun->queue_lock);
2082 		io = (union ctl_io *)STAILQ_FIRST(&be_lun->datamove_queue);
2083 		if (io != NULL) {
2084 			DPRINTF("datamove queue\n");
2085 			STAILQ_REMOVE_HEAD(&be_lun->datamove_queue, links);
2086 			mtx_unlock(&be_lun->queue_lock);
2087 			beio = (struct ctl_be_block_io *)PRIV(io)->ptr;
2088 			if (cbe_lun->flags & CTL_LUN_FLAG_NO_MEDIA) {
2089 				ctl_io_set_busy(io);
2090 				ctl_complete_beio(beio);
2091 				continue;
2092 			}
2093 			be_lun->dispatch(be_lun, beio);
2094 			continue;
2095 		}
2096 		io = (union ctl_io *)STAILQ_FIRST(&be_lun->config_write_queue);
2097 		if (io != NULL) {
2098 			DPRINTF("config write queue\n");
2099 			STAILQ_REMOVE_HEAD(&be_lun->config_write_queue, links);
2100 			mtx_unlock(&be_lun->queue_lock);
2101 			if (cbe_lun->flags & CTL_LUN_FLAG_NO_MEDIA) {
2102 				ctl_io_set_busy(io);
2103 				ctl_config_write_done(io);
2104 				continue;
2105 			}
2106 			ctl_be_block_cw_dispatch(be_lun, io);
2107 			continue;
2108 		}
2109 		io = (union ctl_io *)STAILQ_FIRST(&be_lun->config_read_queue);
2110 		if (io != NULL) {
2111 			DPRINTF("config read queue\n");
2112 			STAILQ_REMOVE_HEAD(&be_lun->config_read_queue, links);
2113 			mtx_unlock(&be_lun->queue_lock);
2114 			if (cbe_lun->flags & CTL_LUN_FLAG_NO_MEDIA) {
2115 				ctl_io_set_busy(io);
2116 				ctl_config_read_done(io);
2117 				continue;
2118 			}
2119 			ctl_be_block_cr_dispatch(be_lun, io);
2120 			continue;
2121 		}
2122 		io = (union ctl_io *)STAILQ_FIRST(&be_lun->input_queue);
2123 		if (io != NULL) {
2124 			DPRINTF("input queue\n");
2125 			STAILQ_REMOVE_HEAD(&be_lun->input_queue, links);
2126 			mtx_unlock(&be_lun->queue_lock);
2127 			if (cbe_lun->flags & CTL_LUN_FLAG_NO_MEDIA) {
2128 				ctl_io_set_busy(io);
2129 				ctl_data_submit_done(io);
2130 				continue;
2131 			}
2132 			ctl_be_block_dispatch(be_lun, io);
2133 			continue;
2134 		}
2135 
2136 		/*
2137 		 * If we get here, there is no work left in the queues, so
2138 		 * just break out and let the task queue go to sleep.
2139 		 */
2140 		mtx_unlock(&be_lun->queue_lock);
2141 		break;
2142 	}
2143 }
2144 
2145 /*
2146  * Entry point from CTL to the backend for I/O.  We queue everything to a
2147  * work thread, so this just puts the I/O on a queue and wakes up the
2148  * thread.
2149  */
2150 static int
2151 ctl_be_block_submit(union ctl_io *io)
2152 {
2153 	struct ctl_be_block_lun *be_lun;
2154 
2155 	DPRINTF("entered\n");
2156 
2157 	be_lun = (struct ctl_be_block_lun *)CTL_BACKEND_LUN(io);
2158 
2159 	CTL_IO_ASSERT(io, SCSI, NVME);
2160 
2161 	PRIV(io)->len = 0;
2162 
2163 	mtx_lock(&be_lun->queue_lock);
2164 	STAILQ_INSERT_TAIL(&be_lun->input_queue, &io->io_hdr, links);
2165 	mtx_unlock(&be_lun->queue_lock);
2166 	taskqueue_enqueue(be_lun->io_taskqueue, &be_lun->io_task);
2167 
2168 	return (CTL_RETVAL_COMPLETE);
2169 }
2170 
2171 static int
2172 ctl_be_block_ioctl(struct cdev *dev, u_long cmd, caddr_t addr,
2173 			int flag, struct thread *td)
2174 {
2175 	struct ctl_be_block_softc *softc = &backend_block_softc;
2176 	int error;
2177 
2178 	error = 0;
2179 	switch (cmd) {
2180 	case CTL_LUN_REQ: {
2181 		struct ctl_lun_req *lun_req;
2182 
2183 		lun_req = (struct ctl_lun_req *)addr;
2184 
2185 		switch (lun_req->reqtype) {
2186 		case CTL_LUNREQ_CREATE:
2187 			error = ctl_be_block_create(softc, lun_req);
2188 			break;
2189 		case CTL_LUNREQ_RM:
2190 			error = ctl_be_block_rm(softc, lun_req);
2191 			break;
2192 		case CTL_LUNREQ_MODIFY:
2193 			error = ctl_be_block_modify(softc, lun_req);
2194 			break;
2195 		default:
2196 			lun_req->status = CTL_LUN_ERROR;
2197 			snprintf(lun_req->error_str, sizeof(lun_req->error_str),
2198 				 "invalid LUN request type %d",
2199 				 lun_req->reqtype);
2200 			break;
2201 		}
2202 		break;
2203 	}
2204 	default:
2205 		error = ENOTTY;
2206 		break;
2207 	}
2208 
2209 	return (error);
2210 }
2211 
2212 static int
2213 ctl_be_block_open_file(struct ctl_be_block_lun *be_lun, struct ctl_lun_req *req)
2214 {
2215 	struct ctl_be_lun *cbe_lun;
2216 	struct ctl_be_block_filedata *file_data;
2217 	struct ctl_lun_create_params *params;
2218 	const char		     *value;
2219 	struct vattr		      vattr;
2220 	off_t			      ps, pss, po, pos, us, uss, uo, uos;
2221 	int			      error;
2222 	long			      pconf;
2223 
2224 	cbe_lun = &be_lun->cbe_lun;
2225 	file_data = &be_lun->backend.file;
2226 	params = &be_lun->params;
2227 
2228 	be_lun->dev_type = CTL_BE_BLOCK_FILE;
2229 	be_lun->dispatch = ctl_be_block_dispatch_file;
2230 	be_lun->lun_flush = ctl_be_block_flush_file;
2231 	be_lun->get_lba_status = ctl_be_block_gls_file;
2232 	be_lun->getattr = ctl_be_block_getattr_file;
2233 	be_lun->unmap = ctl_be_block_unmap_file;
2234 	cbe_lun->flags &= ~CTL_LUN_FLAG_UNMAP;
2235 
2236 	error = VOP_GETATTR(be_lun->vn, &vattr, curthread->td_ucred);
2237 	if (error != 0) {
2238 		snprintf(req->error_str, sizeof(req->error_str),
2239 			 "error calling VOP_GETATTR() for file %s",
2240 			 be_lun->dev_path);
2241 		return (error);
2242 	}
2243 
2244 	error = VOP_PATHCONF(be_lun->vn, _PC_DEALLOC_PRESENT, &pconf);
2245 	if (error != 0) {
2246 		snprintf(req->error_str, sizeof(req->error_str),
2247 		    "error calling VOP_PATHCONF() for file %s",
2248 		    be_lun->dev_path);
2249 		return (error);
2250 	}
2251 	if (pconf == 1)
2252 		cbe_lun->flags |= CTL_LUN_FLAG_UNMAP;
2253 
2254 	file_data->cred = crhold(curthread->td_ucred);
2255 	if (params->lun_size_bytes != 0)
2256 		be_lun->size_bytes = params->lun_size_bytes;
2257 	else
2258 		be_lun->size_bytes = vattr.va_size;
2259 
2260 	/*
2261 	 * For files we can use any logical block size.  Prefer 512 bytes
2262 	 * for compatibility reasons.  If file's vattr.va_blocksize
2263 	 * (preferred I/O block size) is bigger and multiple to chosen
2264 	 * logical block size -- report it as physical block size.
2265 	 */
2266 	if (params->blocksize_bytes != 0)
2267 		cbe_lun->blocksize = params->blocksize_bytes;
2268 	else if (cbe_lun->lun_type == T_CDROM)
2269 		cbe_lun->blocksize = 2048;
2270 	else
2271 		cbe_lun->blocksize = 512;
2272 	be_lun->size_blocks = be_lun->size_bytes / cbe_lun->blocksize;
2273 	cbe_lun->maxlba = (be_lun->size_blocks == 0) ?
2274 	    0 : (be_lun->size_blocks - 1);
2275 
2276 	us = ps = vattr.va_blocksize;
2277 	uo = po = 0;
2278 
2279 	value = dnvlist_get_string(cbe_lun->options, "pblocksize", NULL);
2280 	if (value != NULL)
2281 		ctl_expand_number(value, &ps);
2282 	value = dnvlist_get_string(cbe_lun->options, "pblockoffset", NULL);
2283 	if (value != NULL)
2284 		ctl_expand_number(value, &po);
2285 	pss = ps / cbe_lun->blocksize;
2286 	pos = po / cbe_lun->blocksize;
2287 	if ((pss > 0) && (pss * cbe_lun->blocksize == ps) && (pss >= pos) &&
2288 	    ((pss & (pss - 1)) == 0) && (pos * cbe_lun->blocksize == po)) {
2289 		cbe_lun->pblockexp = fls(pss) - 1;
2290 		cbe_lun->pblockoff = (pss - pos) % pss;
2291 	}
2292 
2293 	value = dnvlist_get_string(cbe_lun->options, "ublocksize", NULL);
2294 	if (value != NULL)
2295 		ctl_expand_number(value, &us);
2296 	value = dnvlist_get_string(cbe_lun->options, "ublockoffset", NULL);
2297 	if (value != NULL)
2298 		ctl_expand_number(value, &uo);
2299 	uss = us / cbe_lun->blocksize;
2300 	uos = uo / cbe_lun->blocksize;
2301 	if ((uss > 0) && (uss * cbe_lun->blocksize == us) && (uss >= uos) &&
2302 	    ((uss & (uss - 1)) == 0) && (uos * cbe_lun->blocksize == uo)) {
2303 		cbe_lun->ublockexp = fls(uss) - 1;
2304 		cbe_lun->ublockoff = (uss - uos) % uss;
2305 	}
2306 
2307 	/*
2308 	 * Sanity check.  The media size has to be at least one
2309 	 * sector long.
2310 	 */
2311 	if (be_lun->size_bytes < cbe_lun->blocksize) {
2312 		error = EINVAL;
2313 		snprintf(req->error_str, sizeof(req->error_str),
2314 			 "file %s size %ju < block size %u", be_lun->dev_path,
2315 			 (uintmax_t)be_lun->size_bytes, cbe_lun->blocksize);
2316 	}
2317 
2318 	cbe_lun->opttxferlen = CTLBLK_MAX_IO_SIZE / cbe_lun->blocksize;
2319 	return (error);
2320 }
2321 
2322 static int
2323 ctl_be_block_open_dev(struct ctl_be_block_lun *be_lun, struct ctl_lun_req *req)
2324 {
2325 	struct ctl_be_lun *cbe_lun = &be_lun->cbe_lun;
2326 	struct ctl_lun_create_params *params;
2327 	struct cdevsw		     *csw;
2328 	struct cdev		     *dev;
2329 	const char		     *value;
2330 	int			      error, atomic, maxio, ref, unmap, tmp;
2331 	off_t			      ps, pss, po, pos, us, uss, uo, uos, otmp;
2332 
2333 	params = &be_lun->params;
2334 
2335 	be_lun->dev_type = CTL_BE_BLOCK_DEV;
2336 	csw = devvn_refthread(be_lun->vn, &dev, &ref);
2337 	if (csw == NULL)
2338 		return (ENXIO);
2339 	if (strcmp(csw->d_name, "zvol") == 0) {
2340 		be_lun->dispatch = ctl_be_block_dispatch_zvol;
2341 		be_lun->get_lba_status = ctl_be_block_gls_zvol;
2342 		atomic = maxio = CTLBLK_MAX_IO_SIZE;
2343 	} else {
2344 		be_lun->dispatch = ctl_be_block_dispatch_dev;
2345 		be_lun->get_lba_status = NULL;
2346 		atomic = 0;
2347 		maxio = dev->si_iosize_max;
2348 		if (maxio <= 0)
2349 			maxio = DFLTPHYS;
2350 		if (maxio > CTLBLK_MAX_SEG)
2351 			maxio = CTLBLK_MAX_SEG;
2352 	}
2353 	be_lun->lun_flush = ctl_be_block_flush_dev;
2354 	be_lun->getattr = ctl_be_block_getattr_dev;
2355 	be_lun->unmap = ctl_be_block_unmap_dev;
2356 
2357 	if (!csw->d_ioctl) {
2358 		dev_relthread(dev, ref);
2359 		snprintf(req->error_str, sizeof(req->error_str),
2360 			 "no d_ioctl for device %s!", be_lun->dev_path);
2361 		return (ENODEV);
2362 	}
2363 
2364 	error = csw->d_ioctl(dev, DIOCGSECTORSIZE, (caddr_t)&tmp, FREAD,
2365 			       curthread);
2366 	if (error) {
2367 		dev_relthread(dev, ref);
2368 		snprintf(req->error_str, sizeof(req->error_str),
2369 			 "error %d returned for DIOCGSECTORSIZE ioctl "
2370 			 "on %s!", error, be_lun->dev_path);
2371 		return (error);
2372 	}
2373 
2374 	/*
2375 	 * If the user has asked for a blocksize that is greater than the
2376 	 * backing device's blocksize, we can do it only if the blocksize
2377 	 * the user is asking for is an even multiple of the underlying
2378 	 * device's blocksize.
2379 	 */
2380 	if ((params->blocksize_bytes != 0) &&
2381 	    (params->blocksize_bytes >= tmp)) {
2382 		if (params->blocksize_bytes % tmp == 0) {
2383 			cbe_lun->blocksize = params->blocksize_bytes;
2384 		} else {
2385 			dev_relthread(dev, ref);
2386 			snprintf(req->error_str, sizeof(req->error_str),
2387 				 "requested blocksize %u is not an even "
2388 				 "multiple of backing device blocksize %u",
2389 				 params->blocksize_bytes, tmp);
2390 			return (EINVAL);
2391 		}
2392 	} else if (params->blocksize_bytes != 0) {
2393 		dev_relthread(dev, ref);
2394 		snprintf(req->error_str, sizeof(req->error_str),
2395 			 "requested blocksize %u < backing device "
2396 			 "blocksize %u", params->blocksize_bytes, tmp);
2397 		return (EINVAL);
2398 	} else if (cbe_lun->lun_type == T_CDROM)
2399 		cbe_lun->blocksize = MAX(tmp, 2048);
2400 	else
2401 		cbe_lun->blocksize = tmp;
2402 
2403 	error = csw->d_ioctl(dev, DIOCGMEDIASIZE, (caddr_t)&otmp, FREAD,
2404 			     curthread);
2405 	if (error) {
2406 		dev_relthread(dev, ref);
2407 		snprintf(req->error_str, sizeof(req->error_str),
2408 			 "error %d returned for DIOCGMEDIASIZE "
2409 			 " ioctl on %s!", error,
2410 			 be_lun->dev_path);
2411 		return (error);
2412 	}
2413 
2414 	if (params->lun_size_bytes != 0) {
2415 		if (params->lun_size_bytes > otmp) {
2416 			dev_relthread(dev, ref);
2417 			snprintf(req->error_str, sizeof(req->error_str),
2418 				 "requested LUN size %ju > backing device "
2419 				 "size %ju",
2420 				 (uintmax_t)params->lun_size_bytes,
2421 				 (uintmax_t)otmp);
2422 			return (EINVAL);
2423 		}
2424 
2425 		be_lun->size_bytes = params->lun_size_bytes;
2426 	} else
2427 		be_lun->size_bytes = otmp;
2428 	be_lun->size_blocks = be_lun->size_bytes / cbe_lun->blocksize;
2429 	cbe_lun->maxlba = (be_lun->size_blocks == 0) ?
2430 	    0 : (be_lun->size_blocks - 1);
2431 
2432 	error = csw->d_ioctl(dev, DIOCGSTRIPESIZE, (caddr_t)&ps, FREAD,
2433 	    curthread);
2434 	if (error)
2435 		ps = po = 0;
2436 	else {
2437 		error = csw->d_ioctl(dev, DIOCGSTRIPEOFFSET, (caddr_t)&po,
2438 		    FREAD, curthread);
2439 		if (error)
2440 			po = 0;
2441 	}
2442 	us = ps;
2443 	uo = po;
2444 
2445 	value = dnvlist_get_string(cbe_lun->options, "pblocksize", NULL);
2446 	if (value != NULL)
2447 		ctl_expand_number(value, &ps);
2448 	value = dnvlist_get_string(cbe_lun->options, "pblockoffset", NULL);
2449 	if (value != NULL)
2450 		ctl_expand_number(value, &po);
2451 	pss = ps / cbe_lun->blocksize;
2452 	pos = po / cbe_lun->blocksize;
2453 	if ((pss > 0) && (pss * cbe_lun->blocksize == ps) && (pss >= pos) &&
2454 	    ((pss & (pss - 1)) == 0) && (pos * cbe_lun->blocksize == po)) {
2455 		cbe_lun->pblockexp = fls(pss) - 1;
2456 		cbe_lun->pblockoff = (pss - pos) % pss;
2457 	}
2458 
2459 	value = dnvlist_get_string(cbe_lun->options, "ublocksize", NULL);
2460 	if (value != NULL)
2461 		ctl_expand_number(value, &us);
2462 	value = dnvlist_get_string(cbe_lun->options, "ublockoffset", NULL);
2463 	if (value != NULL)
2464 		ctl_expand_number(value, &uo);
2465 	uss = us / cbe_lun->blocksize;
2466 	uos = uo / cbe_lun->blocksize;
2467 	if ((uss > 0) && (uss * cbe_lun->blocksize == us) && (uss >= uos) &&
2468 	    ((uss & (uss - 1)) == 0) && (uos * cbe_lun->blocksize == uo)) {
2469 		cbe_lun->ublockexp = fls(uss) - 1;
2470 		cbe_lun->ublockoff = (uss - uos) % uss;
2471 	}
2472 
2473 	cbe_lun->atomicblock = atomic / cbe_lun->blocksize;
2474 	cbe_lun->opttxferlen = maxio / cbe_lun->blocksize;
2475 
2476 	if (be_lun->dispatch == ctl_be_block_dispatch_zvol) {
2477 		unmap = 1;
2478 	} else {
2479 		struct diocgattr_arg	arg;
2480 
2481 		strlcpy(arg.name, "GEOM::candelete", sizeof(arg.name));
2482 		arg.len = sizeof(arg.value.i);
2483 		error = csw->d_ioctl(dev, DIOCGATTR, (caddr_t)&arg, FREAD,
2484 		    curthread);
2485 		unmap = (error == 0) ? arg.value.i : 0;
2486 	}
2487 	value = dnvlist_get_string(cbe_lun->options, "unmap", NULL);
2488 	if (value != NULL)
2489 		unmap = (strcmp(value, "on") == 0);
2490 	if (unmap)
2491 		cbe_lun->flags |= CTL_LUN_FLAG_UNMAP;
2492 	else
2493 		cbe_lun->flags &= ~CTL_LUN_FLAG_UNMAP;
2494 
2495 	dev_relthread(dev, ref);
2496 	return (0);
2497 }
2498 
2499 static int
2500 ctl_be_block_close(struct ctl_be_block_lun *be_lun)
2501 {
2502 	struct ctl_be_lun *cbe_lun = &be_lun->cbe_lun;
2503 	int flags;
2504 
2505 	if (be_lun->vn) {
2506 		flags = FREAD;
2507 		if ((cbe_lun->flags & CTL_LUN_FLAG_READONLY) == 0)
2508 			flags |= FWRITE;
2509 		(void)vn_close(be_lun->vn, flags, NOCRED, curthread);
2510 		be_lun->vn = NULL;
2511 
2512 		switch (be_lun->dev_type) {
2513 		case CTL_BE_BLOCK_DEV:
2514 			break;
2515 		case CTL_BE_BLOCK_FILE:
2516 			if (be_lun->backend.file.cred != NULL) {
2517 				crfree(be_lun->backend.file.cred);
2518 				be_lun->backend.file.cred = NULL;
2519 			}
2520 			break;
2521 		case CTL_BE_BLOCK_NONE:
2522 			break;
2523 		default:
2524 			panic("Unexpected backend type %d", be_lun->dev_type);
2525 			break;
2526 		}
2527 		be_lun->dev_type = CTL_BE_BLOCK_NONE;
2528 	}
2529 	return (0);
2530 }
2531 
2532 static int
2533 ctl_be_block_open(struct ctl_be_block_lun *be_lun, struct ctl_lun_req *req)
2534 {
2535 	struct ctl_be_lun *cbe_lun = &be_lun->cbe_lun;
2536 	struct nameidata nd;
2537 	const char	*value;
2538 	int		 error, flags;
2539 
2540 	error = 0;
2541 	if (rootvnode == NULL) {
2542 		snprintf(req->error_str, sizeof(req->error_str),
2543 			 "Root filesystem is not mounted");
2544 		return (1);
2545 	}
2546 	pwd_ensure_dirs();
2547 
2548 	value = dnvlist_get_string(cbe_lun->options, "file", NULL);
2549 	if (value == NULL) {
2550 		snprintf(req->error_str, sizeof(req->error_str),
2551 			 "no file argument specified");
2552 		return (1);
2553 	}
2554 	free(be_lun->dev_path, M_CTLBLK);
2555 	be_lun->dev_path = strdup(value, M_CTLBLK);
2556 
2557 	flags = FREAD;
2558 	value = dnvlist_get_string(cbe_lun->options, "readonly", NULL);
2559 	if (value != NULL) {
2560 		if (strcmp(value, "on") != 0)
2561 			flags |= FWRITE;
2562 	} else if (cbe_lun->lun_type == T_DIRECT)
2563 		flags |= FWRITE;
2564 
2565 again:
2566 	NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, be_lun->dev_path);
2567 	error = vn_open(&nd, &flags, 0, NULL);
2568 	if ((error == EROFS || error == EACCES) && (flags & FWRITE)) {
2569 		flags &= ~FWRITE;
2570 		goto again;
2571 	}
2572 	if (error) {
2573 		/*
2574 		 * This is the only reasonable guess we can make as far as
2575 		 * path if the user doesn't give us a fully qualified path.
2576 		 * If they want to specify a file, they need to specify the
2577 		 * full path.
2578 		 */
2579 		if (be_lun->dev_path[0] != '/') {
2580 			char *dev_name;
2581 
2582 			asprintf(&dev_name, M_CTLBLK, "/dev/%s",
2583 				be_lun->dev_path);
2584 			free(be_lun->dev_path, M_CTLBLK);
2585 			be_lun->dev_path = dev_name;
2586 			goto again;
2587 		}
2588 		snprintf(req->error_str, sizeof(req->error_str),
2589 		    "error opening %s: %d", be_lun->dev_path, error);
2590 		return (error);
2591 	}
2592 	if (flags & FWRITE)
2593 		cbe_lun->flags &= ~CTL_LUN_FLAG_READONLY;
2594 	else
2595 		cbe_lun->flags |= CTL_LUN_FLAG_READONLY;
2596 
2597 	NDFREE_PNBUF(&nd);
2598 	be_lun->vn = nd.ni_vp;
2599 
2600 	/* We only support disks and files. */
2601 	if (vn_isdisk_error(be_lun->vn, &error)) {
2602 		error = ctl_be_block_open_dev(be_lun, req);
2603 	} else if (be_lun->vn->v_type == VREG) {
2604 		error = ctl_be_block_open_file(be_lun, req);
2605 	} else {
2606 		error = EINVAL;
2607 		snprintf(req->error_str, sizeof(req->error_str),
2608 			 "%s is not a disk or plain file", be_lun->dev_path);
2609 	}
2610 	VOP_UNLOCK(be_lun->vn);
2611 
2612 	if (error != 0)
2613 		ctl_be_block_close(be_lun);
2614 	cbe_lun->serseq = CTL_LUN_SERSEQ_OFF;
2615 	if (be_lun->dispatch != ctl_be_block_dispatch_dev)
2616 		cbe_lun->serseq = CTL_LUN_SERSEQ_SOFT;
2617 	value = dnvlist_get_string(cbe_lun->options, "serseq", NULL);
2618 	if (value != NULL && strcmp(value, "on") == 0)
2619 		cbe_lun->serseq = CTL_LUN_SERSEQ_ON;
2620 	else if (value != NULL && strcmp(value, "read") == 0)
2621 		cbe_lun->serseq = CTL_LUN_SERSEQ_READ;
2622 	else if (value != NULL && strcmp(value, "soft") == 0)
2623 		cbe_lun->serseq = CTL_LUN_SERSEQ_SOFT;
2624 	else if (value != NULL && strcmp(value, "off") == 0)
2625 		cbe_lun->serseq = CTL_LUN_SERSEQ_OFF;
2626 	return (0);
2627 }
2628 
2629 static int
2630 ctl_be_block_create(struct ctl_be_block_softc *softc, struct ctl_lun_req *req)
2631 {
2632 	struct ctl_be_lun *cbe_lun;
2633 	struct ctl_be_block_lun *be_lun;
2634 	struct ctl_lun_create_params *params;
2635 	char num_thread_str[16];
2636 	char tmpstr[32];
2637 	const char *value;
2638 	int retval, num_threads;
2639 	int tmp_num_threads;
2640 
2641 	params = &req->reqdata.create;
2642 	retval = 0;
2643 	req->status = CTL_LUN_OK;
2644 
2645 	be_lun = malloc(sizeof(*be_lun), M_CTLBLK, M_ZERO | M_WAITOK);
2646 	cbe_lun = &be_lun->cbe_lun;
2647 	be_lun->params = req->reqdata.create;
2648 	be_lun->softc = softc;
2649 	STAILQ_INIT(&be_lun->input_queue);
2650 	STAILQ_INIT(&be_lun->config_read_queue);
2651 	STAILQ_INIT(&be_lun->config_write_queue);
2652 	STAILQ_INIT(&be_lun->datamove_queue);
2653 	mtx_init(&be_lun->io_lock, "ctlblock io", NULL, MTX_DEF);
2654 	mtx_init(&be_lun->queue_lock, "ctlblock queue", NULL, MTX_DEF);
2655 	cbe_lun->options = nvlist_clone(req->args_nvl);
2656 
2657 	if (params->flags & CTL_LUN_FLAG_DEV_TYPE)
2658 		cbe_lun->lun_type = params->device_type;
2659 	else
2660 		cbe_lun->lun_type = T_DIRECT;
2661 	be_lun->flags = 0;
2662 	cbe_lun->flags = 0;
2663 	value = dnvlist_get_string(cbe_lun->options, "ha_role", NULL);
2664 	if (value != NULL) {
2665 		if (strcmp(value, "primary") == 0)
2666 			cbe_lun->flags |= CTL_LUN_FLAG_PRIMARY;
2667 	} else if (control_softc->flags & CTL_FLAG_ACTIVE_SHELF)
2668 		cbe_lun->flags |= CTL_LUN_FLAG_PRIMARY;
2669 
2670 	if (cbe_lun->lun_type == T_DIRECT ||
2671 	    cbe_lun->lun_type == T_CDROM) {
2672 		be_lun->size_bytes = params->lun_size_bytes;
2673 		if (params->blocksize_bytes != 0)
2674 			cbe_lun->blocksize = params->blocksize_bytes;
2675 		else if (cbe_lun->lun_type == T_CDROM)
2676 			cbe_lun->blocksize = 2048;
2677 		else
2678 			cbe_lun->blocksize = 512;
2679 		be_lun->size_blocks = be_lun->size_bytes / cbe_lun->blocksize;
2680 		cbe_lun->maxlba = (be_lun->size_blocks == 0) ?
2681 		    0 : (be_lun->size_blocks - 1);
2682 
2683 		if ((cbe_lun->flags & CTL_LUN_FLAG_PRIMARY) ||
2684 		    control_softc->ha_mode == CTL_HA_MODE_SER_ONLY) {
2685 			retval = ctl_be_block_open(be_lun, req);
2686 			if (retval != 0) {
2687 				retval = 0;
2688 				req->status = CTL_LUN_WARNING;
2689 			}
2690 		}
2691 		num_threads = cbb_num_threads;
2692 	} else {
2693 		num_threads = 1;
2694 	}
2695 
2696 	value = dnvlist_get_string(cbe_lun->options, "num_threads", NULL);
2697 	if (value != NULL) {
2698 		tmp_num_threads = strtol(value, NULL, 0);
2699 
2700 		/*
2701 		 * We don't let the user specify less than one
2702 		 * thread, but hope he's clueful enough not to
2703 		 * specify 1000 threads.
2704 		 */
2705 		if (tmp_num_threads < 1) {
2706 			snprintf(req->error_str, sizeof(req->error_str),
2707 				 "invalid number of threads %s",
2708 				 num_thread_str);
2709 			goto bailout_error;
2710 		}
2711 		num_threads = tmp_num_threads;
2712 	}
2713 
2714 	if (be_lun->vn == NULL)
2715 		cbe_lun->flags |= CTL_LUN_FLAG_NO_MEDIA;
2716 	/* Tell the user the blocksize we ended up using */
2717 	params->lun_size_bytes = be_lun->size_bytes;
2718 	params->blocksize_bytes = cbe_lun->blocksize;
2719 	if (params->flags & CTL_LUN_FLAG_ID_REQ) {
2720 		cbe_lun->req_lun_id = params->req_lun_id;
2721 		cbe_lun->flags |= CTL_LUN_FLAG_ID_REQ;
2722 	} else
2723 		cbe_lun->req_lun_id = 0;
2724 
2725 	cbe_lun->lun_shutdown = ctl_be_block_lun_shutdown;
2726 	cbe_lun->be = &ctl_be_block_driver;
2727 
2728 	if ((params->flags & CTL_LUN_FLAG_SERIAL_NUM) == 0) {
2729 		snprintf(tmpstr, sizeof(tmpstr), "MYSERIAL%04d",
2730 			 softc->num_luns);
2731 		strncpy((char *)cbe_lun->serial_num, tmpstr,
2732 			MIN(sizeof(cbe_lun->serial_num), sizeof(tmpstr)));
2733 
2734 		/* Tell the user what we used for a serial number */
2735 		strncpy((char *)params->serial_num, tmpstr,
2736 			MIN(sizeof(params->serial_num), sizeof(tmpstr)));
2737 	} else {
2738 		strncpy((char *)cbe_lun->serial_num, params->serial_num,
2739 			MIN(sizeof(cbe_lun->serial_num),
2740 			sizeof(params->serial_num)));
2741 	}
2742 	if ((params->flags & CTL_LUN_FLAG_DEVID) == 0) {
2743 		snprintf(tmpstr, sizeof(tmpstr), "MYDEVID%04d", softc->num_luns);
2744 		strncpy((char *)cbe_lun->device_id, tmpstr,
2745 			MIN(sizeof(cbe_lun->device_id), sizeof(tmpstr)));
2746 
2747 		/* Tell the user what we used for a device ID */
2748 		strncpy((char *)params->device_id, tmpstr,
2749 			MIN(sizeof(params->device_id), sizeof(tmpstr)));
2750 	} else {
2751 		strncpy((char *)cbe_lun->device_id, params->device_id,
2752 			MIN(sizeof(cbe_lun->device_id),
2753 			    sizeof(params->device_id)));
2754 	}
2755 
2756 	TASK_INIT(&be_lun->io_task, /*priority*/0, ctl_be_block_worker, be_lun);
2757 
2758 	be_lun->io_taskqueue = taskqueue_create("ctlblocktq", M_WAITOK,
2759 	    taskqueue_thread_enqueue, /*context*/&be_lun->io_taskqueue);
2760 
2761 	if (be_lun->io_taskqueue == NULL) {
2762 		snprintf(req->error_str, sizeof(req->error_str),
2763 			 "unable to create taskqueue");
2764 		goto bailout_error;
2765 	}
2766 
2767 	/*
2768 	 * Note that we start the same number of threads by default for
2769 	 * both the file case and the block device case.  For the file
2770 	 * case, we need multiple threads to allow concurrency, because the
2771 	 * vnode interface is designed to be a blocking interface.  For the
2772 	 * block device case, ZFS zvols at least will block the caller's
2773 	 * context in many instances, and so we need multiple threads to
2774 	 * overcome that problem.  Other block devices don't need as many
2775 	 * threads, but they shouldn't cause too many problems.
2776 	 *
2777 	 * If the user wants to just have a single thread for a block
2778 	 * device, he can specify that when the LUN is created, or change
2779 	 * the tunable/sysctl to alter the default number of threads.
2780 	 */
2781 	retval = taskqueue_start_threads_in_proc(&be_lun->io_taskqueue,
2782 					 /*num threads*/num_threads,
2783 					 /*priority*/PUSER,
2784 					 /*proc*/control_softc->ctl_proc,
2785 					 /*thread name*/"block");
2786 
2787 	if (retval != 0)
2788 		goto bailout_error;
2789 
2790 	be_lun->num_threads = num_threads;
2791 
2792 	retval = ctl_add_lun(&be_lun->cbe_lun);
2793 	if (retval != 0) {
2794 		snprintf(req->error_str, sizeof(req->error_str),
2795 			 "ctl_add_lun() returned error %d, see dmesg for "
2796 			 "details", retval);
2797 		retval = 0;
2798 		goto bailout_error;
2799 	}
2800 
2801 	be_lun->disk_stats = devstat_new_entry("cbb", cbe_lun->lun_id,
2802 					       cbe_lun->blocksize,
2803 					       DEVSTAT_ALL_SUPPORTED,
2804 					       cbe_lun->lun_type
2805 					       | DEVSTAT_TYPE_IF_OTHER,
2806 					       DEVSTAT_PRIORITY_OTHER);
2807 
2808 	mtx_lock(&softc->lock);
2809 	softc->num_luns++;
2810 	SLIST_INSERT_HEAD(&softc->lun_list, be_lun, links);
2811 	mtx_unlock(&softc->lock);
2812 
2813 	params->req_lun_id = cbe_lun->lun_id;
2814 
2815 	return (retval);
2816 
2817 bailout_error:
2818 	req->status = CTL_LUN_ERROR;
2819 
2820 	if (be_lun->io_taskqueue != NULL)
2821 		taskqueue_free(be_lun->io_taskqueue);
2822 	ctl_be_block_close(be_lun);
2823 	if (be_lun->dev_path != NULL)
2824 		free(be_lun->dev_path, M_CTLBLK);
2825 	nvlist_destroy(cbe_lun->options);
2826 	mtx_destroy(&be_lun->queue_lock);
2827 	mtx_destroy(&be_lun->io_lock);
2828 	free(be_lun, M_CTLBLK);
2829 
2830 	return (retval);
2831 }
2832 
2833 static int
2834 ctl_be_block_rm(struct ctl_be_block_softc *softc, struct ctl_lun_req *req)
2835 {
2836 	struct ctl_lun_rm_params *params;
2837 	struct ctl_be_block_lun *be_lun;
2838 	struct ctl_be_lun *cbe_lun;
2839 	int retval;
2840 
2841 	params = &req->reqdata.rm;
2842 
2843 	sx_xlock(&softc->modify_lock);
2844 	mtx_lock(&softc->lock);
2845 	SLIST_FOREACH(be_lun, &softc->lun_list, links) {
2846 		if (be_lun->cbe_lun.lun_id == params->lun_id) {
2847 			SLIST_REMOVE(&softc->lun_list, be_lun,
2848 			    ctl_be_block_lun, links);
2849 			softc->num_luns--;
2850 			break;
2851 		}
2852 	}
2853 	mtx_unlock(&softc->lock);
2854 	sx_xunlock(&softc->modify_lock);
2855 	if (be_lun == NULL) {
2856 		snprintf(req->error_str, sizeof(req->error_str),
2857 			 "LUN %u is not managed by the block backend",
2858 			 params->lun_id);
2859 		goto bailout_error;
2860 	}
2861 	cbe_lun = &be_lun->cbe_lun;
2862 
2863 	if (be_lun->vn != NULL) {
2864 		cbe_lun->flags |= CTL_LUN_FLAG_NO_MEDIA;
2865 		ctl_lun_no_media(cbe_lun);
2866 		taskqueue_drain_all(be_lun->io_taskqueue);
2867 		ctl_be_block_close(be_lun);
2868 	}
2869 
2870 	mtx_lock(&softc->lock);
2871 	be_lun->flags |= CTL_BE_BLOCK_LUN_WAITING;
2872 	mtx_unlock(&softc->lock);
2873 
2874 	retval = ctl_remove_lun(cbe_lun);
2875 	if (retval != 0) {
2876 		snprintf(req->error_str, sizeof(req->error_str),
2877 			 "error %d returned from ctl_remove_lun() for "
2878 			 "LUN %d", retval, params->lun_id);
2879 		mtx_lock(&softc->lock);
2880 		be_lun->flags &= ~CTL_BE_BLOCK_LUN_WAITING;
2881 		mtx_unlock(&softc->lock);
2882 		goto bailout_error;
2883 	}
2884 
2885 	mtx_lock(&softc->lock);
2886 	while ((be_lun->flags & CTL_BE_BLOCK_LUN_UNCONFIGURED) == 0) {
2887 		retval = msleep(be_lun, &softc->lock, PCATCH, "ctlblockrm", 0);
2888 		if (retval == EINTR)
2889 			break;
2890 	}
2891 	be_lun->flags &= ~CTL_BE_BLOCK_LUN_WAITING;
2892 	if (be_lun->flags & CTL_BE_BLOCK_LUN_UNCONFIGURED) {
2893 		mtx_unlock(&softc->lock);
2894 		free(be_lun, M_CTLBLK);
2895 	} else {
2896 		mtx_unlock(&softc->lock);
2897 		return (EINTR);
2898 	}
2899 
2900 	req->status = CTL_LUN_OK;
2901 	return (0);
2902 
2903 bailout_error:
2904 	req->status = CTL_LUN_ERROR;
2905 	return (0);
2906 }
2907 
2908 static int
2909 ctl_be_block_modify(struct ctl_be_block_softc *softc, struct ctl_lun_req *req)
2910 {
2911 	struct ctl_lun_modify_params *params;
2912 	struct ctl_be_block_lun *be_lun;
2913 	struct ctl_be_lun *cbe_lun;
2914 	const char *value;
2915 	uint64_t oldsize;
2916 	int error, wasprim;
2917 
2918 	params = &req->reqdata.modify;
2919 
2920 	sx_xlock(&softc->modify_lock);
2921 	mtx_lock(&softc->lock);
2922 	SLIST_FOREACH(be_lun, &softc->lun_list, links) {
2923 		if (be_lun->cbe_lun.lun_id == params->lun_id)
2924 			break;
2925 	}
2926 	mtx_unlock(&softc->lock);
2927 	if (be_lun == NULL) {
2928 		snprintf(req->error_str, sizeof(req->error_str),
2929 			 "LUN %u is not managed by the block backend",
2930 			 params->lun_id);
2931 		goto bailout_error;
2932 	}
2933 	cbe_lun = &be_lun->cbe_lun;
2934 
2935 	if (params->lun_size_bytes != 0)
2936 		be_lun->params.lun_size_bytes = params->lun_size_bytes;
2937 
2938 	if (req->args_nvl != NULL) {
2939 		nvlist_destroy(cbe_lun->options);
2940 		cbe_lun->options = nvlist_clone(req->args_nvl);
2941 	}
2942 
2943 	wasprim = (cbe_lun->flags & CTL_LUN_FLAG_PRIMARY);
2944 	value = dnvlist_get_string(cbe_lun->options, "ha_role", NULL);
2945 	if (value != NULL) {
2946 		if (strcmp(value, "primary") == 0)
2947 			cbe_lun->flags |= CTL_LUN_FLAG_PRIMARY;
2948 		else
2949 			cbe_lun->flags &= ~CTL_LUN_FLAG_PRIMARY;
2950 	} else if (control_softc->flags & CTL_FLAG_ACTIVE_SHELF)
2951 		cbe_lun->flags |= CTL_LUN_FLAG_PRIMARY;
2952 	else
2953 		cbe_lun->flags &= ~CTL_LUN_FLAG_PRIMARY;
2954 	if (wasprim != (cbe_lun->flags & CTL_LUN_FLAG_PRIMARY)) {
2955 		if (cbe_lun->flags & CTL_LUN_FLAG_PRIMARY)
2956 			ctl_lun_primary(cbe_lun);
2957 		else
2958 			ctl_lun_secondary(cbe_lun);
2959 	}
2960 
2961 	oldsize = be_lun->size_blocks;
2962 	if ((cbe_lun->flags & CTL_LUN_FLAG_PRIMARY) ||
2963 	    control_softc->ha_mode == CTL_HA_MODE_SER_ONLY) {
2964 		if (be_lun->vn == NULL)
2965 			error = ctl_be_block_open(be_lun, req);
2966 		else if (vn_isdisk_error(be_lun->vn, &error))
2967 			error = ctl_be_block_open_dev(be_lun, req);
2968 		else if (be_lun->vn->v_type == VREG) {
2969 			vn_lock(be_lun->vn, LK_SHARED | LK_RETRY);
2970 			error = ctl_be_block_open_file(be_lun, req);
2971 			VOP_UNLOCK(be_lun->vn);
2972 		} else
2973 			error = EINVAL;
2974 		if ((cbe_lun->flags & CTL_LUN_FLAG_NO_MEDIA) &&
2975 		    be_lun->vn != NULL) {
2976 			cbe_lun->flags &= ~CTL_LUN_FLAG_NO_MEDIA;
2977 			ctl_lun_has_media(cbe_lun);
2978 		} else if ((cbe_lun->flags & CTL_LUN_FLAG_NO_MEDIA) == 0 &&
2979 		    be_lun->vn == NULL) {
2980 			cbe_lun->flags |= CTL_LUN_FLAG_NO_MEDIA;
2981 			ctl_lun_no_media(cbe_lun);
2982 		}
2983 		cbe_lun->flags &= ~CTL_LUN_FLAG_EJECTED;
2984 	} else {
2985 		if (be_lun->vn != NULL) {
2986 			cbe_lun->flags |= CTL_LUN_FLAG_NO_MEDIA;
2987 			ctl_lun_no_media(cbe_lun);
2988 			taskqueue_drain_all(be_lun->io_taskqueue);
2989 			error = ctl_be_block_close(be_lun);
2990 		} else
2991 			error = 0;
2992 	}
2993 	if (be_lun->size_blocks != oldsize)
2994 		ctl_lun_capacity_changed(cbe_lun);
2995 
2996 	/* Tell the user the exact size we ended up using */
2997 	params->lun_size_bytes = be_lun->size_bytes;
2998 
2999 	sx_xunlock(&softc->modify_lock);
3000 	req->status = error ? CTL_LUN_WARNING : CTL_LUN_OK;
3001 	return (0);
3002 
3003 bailout_error:
3004 	sx_xunlock(&softc->modify_lock);
3005 	req->status = CTL_LUN_ERROR;
3006 	return (0);
3007 }
3008 
3009 static void
3010 ctl_be_block_lun_shutdown(struct ctl_be_lun *cbe_lun)
3011 {
3012 	struct ctl_be_block_lun *be_lun = (struct ctl_be_block_lun *)cbe_lun;
3013 	struct ctl_be_block_softc *softc = be_lun->softc;
3014 
3015 	taskqueue_drain_all(be_lun->io_taskqueue);
3016 	taskqueue_free(be_lun->io_taskqueue);
3017 	if (be_lun->disk_stats != NULL)
3018 		devstat_remove_entry(be_lun->disk_stats);
3019 	nvlist_destroy(be_lun->cbe_lun.options);
3020 	free(be_lun->dev_path, M_CTLBLK);
3021 	mtx_destroy(&be_lun->queue_lock);
3022 	mtx_destroy(&be_lun->io_lock);
3023 
3024 	mtx_lock(&softc->lock);
3025 	be_lun->flags |= CTL_BE_BLOCK_LUN_UNCONFIGURED;
3026 	if (be_lun->flags & CTL_BE_BLOCK_LUN_WAITING)
3027 		wakeup(be_lun);
3028 	else
3029 		free(be_lun, M_CTLBLK);
3030 	mtx_unlock(&softc->lock);
3031 }
3032 
3033 static int
3034 ctl_be_block_scsi_config_write(union ctl_io *io)
3035 {
3036 	struct ctl_be_block_lun *be_lun;
3037 	struct ctl_be_lun *cbe_lun;
3038 	int retval;
3039 
3040 	DPRINTF("entered\n");
3041 
3042 	cbe_lun = CTL_BACKEND_LUN(io);
3043 	be_lun = (struct ctl_be_block_lun *)cbe_lun;
3044 
3045 	retval = 0;
3046 	switch (io->scsiio.cdb[0]) {
3047 	case SYNCHRONIZE_CACHE:
3048 	case SYNCHRONIZE_CACHE_16:
3049 	case WRITE_SAME_10:
3050 	case WRITE_SAME_16:
3051 	case UNMAP:
3052 		/*
3053 		 * The upper level CTL code will filter out any CDBs with
3054 		 * the immediate bit set and return the proper error.
3055 		 *
3056 		 * We don't really need to worry about what LBA range the
3057 		 * user asked to be synced out.  When they issue a sync
3058 		 * cache command, we'll sync out the whole thing.
3059 		 */
3060 		mtx_lock(&be_lun->queue_lock);
3061 		STAILQ_INSERT_TAIL(&be_lun->config_write_queue, &io->io_hdr,
3062 				   links);
3063 		mtx_unlock(&be_lun->queue_lock);
3064 		taskqueue_enqueue(be_lun->io_taskqueue, &be_lun->io_task);
3065 		break;
3066 	case START_STOP_UNIT: {
3067 		struct scsi_start_stop_unit *cdb;
3068 		struct ctl_lun_req req;
3069 
3070 		cdb = (struct scsi_start_stop_unit *)io->scsiio.cdb;
3071 		if ((cdb->how & SSS_PC_MASK) != 0) {
3072 			ctl_set_success(&io->scsiio);
3073 			ctl_config_write_done(io);
3074 			break;
3075 		}
3076 		if (cdb->how & SSS_START) {
3077 			if ((cdb->how & SSS_LOEJ) && be_lun->vn == NULL) {
3078 				retval = ctl_be_block_open(be_lun, &req);
3079 				cbe_lun->flags &= ~CTL_LUN_FLAG_EJECTED;
3080 				if (retval == 0) {
3081 					cbe_lun->flags &= ~CTL_LUN_FLAG_NO_MEDIA;
3082 					ctl_lun_has_media(cbe_lun);
3083 				} else {
3084 					cbe_lun->flags |= CTL_LUN_FLAG_NO_MEDIA;
3085 					ctl_lun_no_media(cbe_lun);
3086 				}
3087 			}
3088 			ctl_start_lun(cbe_lun);
3089 		} else {
3090 			ctl_stop_lun(cbe_lun);
3091 			if (cdb->how & SSS_LOEJ) {
3092 				cbe_lun->flags |= CTL_LUN_FLAG_NO_MEDIA;
3093 				cbe_lun->flags |= CTL_LUN_FLAG_EJECTED;
3094 				ctl_lun_ejected(cbe_lun);
3095 				if (be_lun->vn != NULL)
3096 					ctl_be_block_close(be_lun);
3097 			}
3098 		}
3099 
3100 		ctl_set_success(&io->scsiio);
3101 		ctl_config_write_done(io);
3102 		break;
3103 	}
3104 	case PREVENT_ALLOW:
3105 		ctl_set_success(&io->scsiio);
3106 		ctl_config_write_done(io);
3107 		break;
3108 	default:
3109 		ctl_set_invalid_opcode(&io->scsiio);
3110 		ctl_config_write_done(io);
3111 		retval = CTL_RETVAL_COMPLETE;
3112 		break;
3113 	}
3114 
3115 	return (retval);
3116 }
3117 
3118 static int
3119 ctl_be_block_nvme_config_write(union ctl_io *io)
3120 {
3121 	struct ctl_be_block_lun *be_lun;
3122 
3123 	DPRINTF("entered\n");
3124 
3125 	be_lun = (struct ctl_be_block_lun *)CTL_BACKEND_LUN(io);
3126 
3127 	switch (io->nvmeio.cmd.opc) {
3128 	case NVME_OPC_DATASET_MANAGEMENT:
3129 		DSM_RANGE(io) = 0;
3130 		/* FALLTHROUGH */
3131 	case NVME_OPC_FLUSH:
3132 	case NVME_OPC_WRITE_UNCORRECTABLE:
3133 	case NVME_OPC_WRITE_ZEROES:
3134 		mtx_lock(&be_lun->queue_lock);
3135 		STAILQ_INSERT_TAIL(&be_lun->config_write_queue, &io->io_hdr,
3136 				   links);
3137 		mtx_unlock(&be_lun->queue_lock);
3138 		taskqueue_enqueue(be_lun->io_taskqueue, &be_lun->io_task);
3139 		break;
3140 	default:
3141 		ctl_nvme_set_invalid_opcode(&io->nvmeio);
3142 		ctl_config_write_done(io);
3143 		break;
3144 	}
3145 	return (CTL_RETVAL_COMPLETE);
3146 }
3147 
3148 static int
3149 ctl_be_block_config_write(union ctl_io *io)
3150 {
3151 	switch (io->io_hdr.io_type) {
3152 	case CTL_IO_SCSI:
3153 		return (ctl_be_block_scsi_config_write(io));
3154 	case CTL_IO_NVME:
3155 		return (ctl_be_block_nvme_config_write(io));
3156 	default:
3157 		__assert_unreachable();
3158 	}
3159 }
3160 
3161 static int
3162 ctl_be_block_scsi_config_read(union ctl_io *io)
3163 {
3164 	struct ctl_be_block_lun *be_lun;
3165 	int retval = 0;
3166 
3167 	DPRINTF("entered\n");
3168 
3169 	be_lun = (struct ctl_be_block_lun *)CTL_BACKEND_LUN(io);
3170 
3171 	switch (io->scsiio.cdb[0]) {
3172 	case SERVICE_ACTION_IN:
3173 		if (io->scsiio.cdb[1] == SGLS_SERVICE_ACTION) {
3174 			mtx_lock(&be_lun->queue_lock);
3175 			STAILQ_INSERT_TAIL(&be_lun->config_read_queue,
3176 			    &io->io_hdr, links);
3177 			mtx_unlock(&be_lun->queue_lock);
3178 			taskqueue_enqueue(be_lun->io_taskqueue,
3179 			    &be_lun->io_task);
3180 			retval = CTL_RETVAL_QUEUED;
3181 			break;
3182 		}
3183 		ctl_set_invalid_field(&io->scsiio,
3184 				      /*sks_valid*/ 1,
3185 				      /*command*/ 1,
3186 				      /*field*/ 1,
3187 				      /*bit_valid*/ 1,
3188 				      /*bit*/ 4);
3189 		ctl_config_read_done(io);
3190 		retval = CTL_RETVAL_COMPLETE;
3191 		break;
3192 	default:
3193 		ctl_set_invalid_opcode(&io->scsiio);
3194 		ctl_config_read_done(io);
3195 		retval = CTL_RETVAL_COMPLETE;
3196 		break;
3197 	}
3198 
3199 	return (retval);
3200 }
3201 
3202 static int
3203 ctl_be_block_nvme_config_read(union ctl_io *io)
3204 {
3205 	struct ctl_be_block_lun *be_lun;
3206 
3207 	DPRINTF("entered\n");
3208 
3209 	be_lun = (struct ctl_be_block_lun *)CTL_BACKEND_LUN(io);
3210 
3211 	switch (io->nvmeio.cmd.opc) {
3212 	case NVME_OPC_IDENTIFY:
3213 	{
3214 		uint8_t cns;
3215 
3216 		cns = le32toh(io->nvmeio.cmd.cdw10) & 0xff;
3217 		switch (cns) {
3218 		case 0:
3219 		case 3:
3220 			mtx_lock(&be_lun->queue_lock);
3221 			STAILQ_INSERT_TAIL(&be_lun->config_read_queue,
3222 			    &io->io_hdr, links);
3223 			mtx_unlock(&be_lun->queue_lock);
3224 			taskqueue_enqueue(be_lun->io_taskqueue,
3225 			    &be_lun->io_task);
3226 			return (CTL_RETVAL_QUEUED);
3227 		default:
3228 			ctl_nvme_set_invalid_field(&io->nvmeio);
3229 			ctl_config_read_done(io);
3230 			break;
3231 		}
3232 		break;
3233 	}
3234 	default:
3235 		ctl_nvme_set_invalid_opcode(&io->nvmeio);
3236 		ctl_config_read_done(io);
3237 		break;
3238 	}
3239 	return (CTL_RETVAL_COMPLETE);
3240 }
3241 
3242 static int
3243 ctl_be_block_config_read(union ctl_io *io)
3244 {
3245 	switch (io->io_hdr.io_type) {
3246 	case CTL_IO_SCSI:
3247 		return (ctl_be_block_scsi_config_read(io));
3248 	case CTL_IO_NVME_ADMIN:
3249 		return (ctl_be_block_nvme_config_read(io));
3250 	default:
3251 		__assert_unreachable();
3252 	}
3253 }
3254 
3255 static int
3256 ctl_be_block_lun_info(struct ctl_be_lun *cbe_lun, struct sbuf *sb)
3257 {
3258 	struct ctl_be_block_lun *lun = (struct ctl_be_block_lun *)cbe_lun;
3259 	int retval;
3260 
3261 	retval = sbuf_cat(sb, "\t<num_threads>");
3262 	if (retval != 0)
3263 		goto bailout;
3264 	retval = sbuf_printf(sb, "%d", lun->num_threads);
3265 	if (retval != 0)
3266 		goto bailout;
3267 	retval = sbuf_cat(sb, "</num_threads>\n");
3268 
3269 bailout:
3270 	return (retval);
3271 }
3272 
3273 static uint64_t
3274 ctl_be_block_lun_attr(struct ctl_be_lun *cbe_lun, const char *attrname)
3275 {
3276 	struct ctl_be_block_lun *lun = (struct ctl_be_block_lun *)cbe_lun;
3277 
3278 	if (lun->getattr == NULL)
3279 		return (UINT64_MAX);
3280 	return (lun->getattr(lun, attrname));
3281 }
3282 
3283 static int
3284 ctl_be_block_init(void)
3285 {
3286 	struct ctl_be_block_softc *softc = &backend_block_softc;
3287 
3288 	sx_init(&softc->modify_lock, "ctlblock modify");
3289 	mtx_init(&softc->lock, "ctlblock", NULL, MTX_DEF);
3290 	softc->beio_zone = uma_zcreate("beio", sizeof(struct ctl_be_block_io),
3291 	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
3292 	softc->bufmin_zone = uma_zcreate("ctlblockmin", CTLBLK_MIN_SEG,
3293 	    NULL, NULL, NULL, NULL, /*align*/ 0, /*flags*/0);
3294 	if (CTLBLK_MIN_SEG < CTLBLK_MAX_SEG)
3295 		softc->bufmax_zone = uma_zcreate("ctlblockmax", CTLBLK_MAX_SEG,
3296 		    NULL, NULL, NULL, NULL, /*align*/ 0, /*flags*/0);
3297 	SLIST_INIT(&softc->lun_list);
3298 	return (0);
3299 }
3300 
3301 static int
3302 ctl_be_block_shutdown(void)
3303 {
3304 	struct ctl_be_block_softc *softc = &backend_block_softc;
3305 	struct ctl_be_block_lun *lun;
3306 
3307 	mtx_lock(&softc->lock);
3308 	while ((lun = SLIST_FIRST(&softc->lun_list)) != NULL) {
3309 		SLIST_REMOVE_HEAD(&softc->lun_list, links);
3310 		softc->num_luns--;
3311 		/*
3312 		 * Drop our lock here.  Since ctl_remove_lun() can call
3313 		 * back into us, this could potentially lead to a recursive
3314 		 * lock of the same mutex, which would cause a hang.
3315 		 */
3316 		mtx_unlock(&softc->lock);
3317 		ctl_remove_lun(&lun->cbe_lun);
3318 		mtx_lock(&softc->lock);
3319 	}
3320 	mtx_unlock(&softc->lock);
3321 	uma_zdestroy(softc->bufmin_zone);
3322 	if (CTLBLK_MIN_SEG < CTLBLK_MAX_SEG)
3323 		uma_zdestroy(softc->bufmax_zone);
3324 	uma_zdestroy(softc->beio_zone);
3325 	mtx_destroy(&softc->lock);
3326 	sx_destroy(&softc->modify_lock);
3327 	return (0);
3328 }
3329