xref: /freebsd/sys/dev/virtio/block/virtio_blk.c (revision d0b2dbfa0ecf2bbc9709efc5e20baf8e4b44bbbf)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause
3  *
4  * Copyright (c) 2011, Bryan Venteicher <bryanv@FreeBSD.org>
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice unmodified, this list of conditions, and the following
12  *    disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
18  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
19  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
20  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
21  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
22  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
26  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27  */
28 
29 /* Driver for VirtIO block devices. */
30 
31 #include <sys/cdefs.h>
32 #include <sys/param.h>
33 #include <sys/systm.h>
34 #include <sys/kernel.h>
35 #include <sys/bio.h>
36 #include <sys/malloc.h>
37 #include <sys/module.h>
38 #include <sys/msan.h>
39 #include <sys/sglist.h>
40 #include <sys/sysctl.h>
41 #include <sys/lock.h>
42 #include <sys/mutex.h>
43 #include <sys/queue.h>
44 
45 #include <geom/geom.h>
46 #include <geom/geom_disk.h>
47 
48 #include <machine/bus.h>
49 #include <machine/resource.h>
50 #include <sys/bus.h>
51 #include <sys/rman.h>
52 
53 #include <dev/virtio/virtio.h>
54 #include <dev/virtio/virtqueue.h>
55 #include <dev/virtio/block/virtio_blk.h>
56 
57 #include "virtio_if.h"
58 
59 struct vtblk_request {
60 	struct vtblk_softc		*vbr_sc;
61 	bus_dmamap_t			 vbr_mapp;
62 
63 	/* Fields after this point are zeroed for each request. */
64 	struct virtio_blk_outhdr	 vbr_hdr;
65 	struct bio			*vbr_bp;
66 	uint8_t				 vbr_ack;
67 	uint8_t				 vbr_requeue_on_error;
68 	uint8_t				 vbr_busdma_wait;
69 	int				 vbr_error;
70 	TAILQ_ENTRY(vtblk_request)	 vbr_link;
71 };
72 
73 enum vtblk_cache_mode {
74 	VTBLK_CACHE_WRITETHROUGH,
75 	VTBLK_CACHE_WRITEBACK,
76 	VTBLK_CACHE_MAX
77 };
78 
79 struct vtblk_softc {
80 	device_t		 vtblk_dev;
81 	struct mtx		 vtblk_mtx;
82 	uint64_t		 vtblk_features;
83 	uint32_t		 vtblk_flags;
84 #define VTBLK_FLAG_INDIRECT	0x0001
85 #define VTBLK_FLAG_DETACH	0x0002
86 #define VTBLK_FLAG_SUSPEND	0x0004
87 #define VTBLK_FLAG_BARRIER	0x0008
88 #define VTBLK_FLAG_WCE_CONFIG	0x0010
89 #define VTBLK_FLAG_BUSDMA_WAIT	0x0020
90 #define VTBLK_FLAG_BUSDMA_ALIGN	0x0040
91 
92 	struct virtqueue	*vtblk_vq;
93 	struct sglist		*vtblk_sglist;
94 	bus_dma_tag_t		 vtblk_dmat;
95 	struct disk		*vtblk_disk;
96 
97 	struct bio_queue_head	 vtblk_bioq;
98 	TAILQ_HEAD(, vtblk_request)
99 				 vtblk_req_free;
100 	TAILQ_HEAD(, vtblk_request)
101 				 vtblk_req_ready;
102 	struct vtblk_request	*vtblk_req_ordered;
103 
104 	int			 vtblk_max_nsegs;
105 	int			 vtblk_request_count;
106 	enum vtblk_cache_mode	 vtblk_write_cache;
107 
108 	struct bio_queue	 vtblk_dump_queue;
109 	struct vtblk_request	 vtblk_dump_request;
110 };
111 
112 static struct virtio_feature_desc vtblk_feature_desc[] = {
113 	{ VIRTIO_BLK_F_BARRIER,		"HostBarrier"	},
114 	{ VIRTIO_BLK_F_SIZE_MAX,	"MaxSegSize"	},
115 	{ VIRTIO_BLK_F_SEG_MAX,		"MaxNumSegs"	},
116 	{ VIRTIO_BLK_F_GEOMETRY,	"DiskGeometry"	},
117 	{ VIRTIO_BLK_F_RO,		"ReadOnly"	},
118 	{ VIRTIO_BLK_F_BLK_SIZE,	"BlockSize"	},
119 	{ VIRTIO_BLK_F_SCSI,		"SCSICmds"	},
120 	{ VIRTIO_BLK_F_FLUSH,		"FlushCmd"	},
121 	{ VIRTIO_BLK_F_TOPOLOGY,	"Topology"	},
122 	{ VIRTIO_BLK_F_CONFIG_WCE,	"ConfigWCE"	},
123 	{ VIRTIO_BLK_F_MQ,		"Multiqueue"	},
124 	{ VIRTIO_BLK_F_DISCARD,		"Discard"	},
125 	{ VIRTIO_BLK_F_WRITE_ZEROES,	"WriteZeros"	},
126 
127 	{ 0, NULL }
128 };
129 
130 static int	vtblk_modevent(module_t, int, void *);
131 
132 static int	vtblk_probe(device_t);
133 static int	vtblk_attach(device_t);
134 static int	vtblk_detach(device_t);
135 static int	vtblk_suspend(device_t);
136 static int	vtblk_resume(device_t);
137 static int	vtblk_shutdown(device_t);
138 static int	vtblk_attach_completed(device_t);
139 static int	vtblk_config_change(device_t);
140 
141 static int	vtblk_open(struct disk *);
142 static int	vtblk_close(struct disk *);
143 static int	vtblk_ioctl(struct disk *, u_long, void *, int,
144 		    struct thread *);
145 static int	vtblk_dump(void *, void *, off_t, size_t);
146 static void	vtblk_strategy(struct bio *);
147 
148 static int	vtblk_negotiate_features(struct vtblk_softc *);
149 static int	vtblk_setup_features(struct vtblk_softc *);
150 static int	vtblk_maximum_segments(struct vtblk_softc *,
151 		    struct virtio_blk_config *);
152 static int	vtblk_alloc_virtqueue(struct vtblk_softc *);
153 static void	vtblk_resize_disk(struct vtblk_softc *, uint64_t);
154 static void	vtblk_alloc_disk(struct vtblk_softc *,
155 		    struct virtio_blk_config *);
156 static void	vtblk_create_disk(struct vtblk_softc *);
157 
158 static int	vtblk_request_prealloc(struct vtblk_softc *);
159 static void	vtblk_request_free(struct vtblk_softc *);
160 static struct vtblk_request *
161 		vtblk_request_dequeue(struct vtblk_softc *);
162 static void	vtblk_request_enqueue(struct vtblk_softc *,
163 		    struct vtblk_request *);
164 static struct vtblk_request *
165 		vtblk_request_next_ready(struct vtblk_softc *);
166 static void	vtblk_request_requeue_ready(struct vtblk_softc *,
167 		    struct vtblk_request *);
168 static struct vtblk_request *
169 		vtblk_request_next(struct vtblk_softc *);
170 static struct vtblk_request *
171 		vtblk_request_bio(struct vtblk_softc *);
172 static int	vtblk_request_execute(struct vtblk_request *, int);
173 static void	vtblk_request_execute_cb(void *,
174 		    bus_dma_segment_t *, int, int);
175 static int	vtblk_request_error(struct vtblk_request *);
176 
177 static void	vtblk_queue_completed(struct vtblk_softc *,
178 		    struct bio_queue *);
179 static void	vtblk_done_completed(struct vtblk_softc *,
180 		    struct bio_queue *);
181 static void	vtblk_drain_vq(struct vtblk_softc *);
182 static void	vtblk_drain(struct vtblk_softc *);
183 
184 static void	vtblk_startio(struct vtblk_softc *);
185 static void	vtblk_bio_done(struct vtblk_softc *, struct bio *, int);
186 
187 static void	vtblk_read_config(struct vtblk_softc *,
188 		    struct virtio_blk_config *);
189 static void	vtblk_ident(struct vtblk_softc *);
190 static int	vtblk_poll_request(struct vtblk_softc *,
191 		    struct vtblk_request *);
192 static int	vtblk_quiesce(struct vtblk_softc *);
193 static void	vtblk_vq_intr(void *);
194 static void	vtblk_stop(struct vtblk_softc *);
195 
196 static void	vtblk_dump_quiesce(struct vtblk_softc *);
197 static int	vtblk_dump_write(struct vtblk_softc *, void *, off_t, size_t);
198 static int	vtblk_dump_flush(struct vtblk_softc *);
199 static void	vtblk_dump_complete(struct vtblk_softc *);
200 
201 static void	vtblk_set_write_cache(struct vtblk_softc *, int);
202 static int	vtblk_write_cache_enabled(struct vtblk_softc *sc,
203 		    struct virtio_blk_config *);
204 static int	vtblk_write_cache_sysctl(SYSCTL_HANDLER_ARGS);
205 
206 static void	vtblk_setup_sysctl(struct vtblk_softc *);
207 static int	vtblk_tunable_int(struct vtblk_softc *, const char *, int);
208 
209 #define vtblk_modern(_sc) (((_sc)->vtblk_features & VIRTIO_F_VERSION_1) != 0)
210 #define vtblk_htog16(_sc, _val)	virtio_htog16(vtblk_modern(_sc), _val)
211 #define vtblk_htog32(_sc, _val)	virtio_htog32(vtblk_modern(_sc), _val)
212 #define vtblk_htog64(_sc, _val)	virtio_htog64(vtblk_modern(_sc), _val)
213 #define vtblk_gtoh16(_sc, _val)	virtio_gtoh16(vtblk_modern(_sc), _val)
214 #define vtblk_gtoh32(_sc, _val)	virtio_gtoh32(vtblk_modern(_sc), _val)
215 #define vtblk_gtoh64(_sc, _val)	virtio_gtoh64(vtblk_modern(_sc), _val)
216 
217 /* Tunables. */
218 static int vtblk_no_ident = 0;
219 TUNABLE_INT("hw.vtblk.no_ident", &vtblk_no_ident);
220 static int vtblk_writecache_mode = -1;
221 TUNABLE_INT("hw.vtblk.writecache_mode", &vtblk_writecache_mode);
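/*
 * Global defaults; vtblk_tunable_int() additionally honors per-device
 * overrides of the form hw.vtblk.<unit>.<knob>.
 */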
222 
223 #define VTBLK_COMMON_FEATURES \
224     (VIRTIO_BLK_F_SIZE_MAX		| \
225      VIRTIO_BLK_F_SEG_MAX		| \
226      VIRTIO_BLK_F_GEOMETRY		| \
227      VIRTIO_BLK_F_RO			| \
228      VIRTIO_BLK_F_BLK_SIZE		| \
229      VIRTIO_BLK_F_FLUSH			| \
230      VIRTIO_BLK_F_TOPOLOGY		| \
231      VIRTIO_BLK_F_CONFIG_WCE		| \
232      VIRTIO_BLK_F_DISCARD		| \
233      VIRTIO_RING_F_INDIRECT_DESC)
234 
235 #define VTBLK_MODERN_FEATURES	(VTBLK_COMMON_FEATURES)
236 #define VTBLK_LEGACY_FEATURES	(VIRTIO_BLK_F_BARRIER | VTBLK_COMMON_FEATURES)
237 
238 #define VTBLK_MTX(_sc)		&(_sc)->vtblk_mtx
239 #define VTBLK_LOCK_INIT(_sc, _name) \
240 				mtx_init(VTBLK_MTX((_sc)), (_name), \
241 				    "VirtIO Block Lock", MTX_DEF)
242 #define VTBLK_LOCK(_sc)		mtx_lock(VTBLK_MTX((_sc)))
243 #define VTBLK_UNLOCK(_sc)	mtx_unlock(VTBLK_MTX((_sc)))
244 #define VTBLK_LOCK_DESTROY(_sc)	mtx_destroy(VTBLK_MTX((_sc)))
245 #define VTBLK_LOCK_ASSERT(_sc)	mtx_assert(VTBLK_MTX((_sc)), MA_OWNED)
246 #define VTBLK_LOCK_ASSERT_NOTOWNED(_sc) \
247 				mtx_assert(VTBLK_MTX((_sc)), MA_NOTOWNED)
248 
249 #define VTBLK_DISK_NAME		"vtbd"
250 #define VTBLK_QUIESCE_TIMEOUT	(30 * hz)
251 #define VTBLK_BSIZE		512
252 
253 /*
254  * Each block request uses at least two segments - one for the header
255  * and one for the status.
256  */
257 #define VTBLK_MIN_SEGMENTS	2
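/*
 * A read or write request is laid out in the ring as
 *
 *	[virtio_blk_outhdr][data segment(s)...][status byte]
 *
 * where the status byte is always written by the host and, for reads,
 * the data segments are as well.
 */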
258 
259 static device_method_t vtblk_methods[] = {
260 	/* Device methods. */
261 	DEVMETHOD(device_probe,		vtblk_probe),
262 	DEVMETHOD(device_attach,	vtblk_attach),
263 	DEVMETHOD(device_detach,	vtblk_detach),
264 	DEVMETHOD(device_suspend,	vtblk_suspend),
265 	DEVMETHOD(device_resume,	vtblk_resume),
266 	DEVMETHOD(device_shutdown,	vtblk_shutdown),
267 
268 	/* VirtIO methods. */
269 	DEVMETHOD(virtio_attach_completed, vtblk_attach_completed),
270 	DEVMETHOD(virtio_config_change,	vtblk_config_change),
271 
272 	DEVMETHOD_END
273 };
274 
275 static driver_t vtblk_driver = {
276 	"vtblk",
277 	vtblk_methods,
278 	sizeof(struct vtblk_softc)
279 };
280 
281 VIRTIO_DRIVER_MODULE(virtio_blk, vtblk_driver, vtblk_modevent, NULL);
282 MODULE_VERSION(virtio_blk, 1);
283 MODULE_DEPEND(virtio_blk, virtio, 1, 1, 1);
284 
285 VIRTIO_SIMPLE_PNPINFO(virtio_blk, VIRTIO_ID_BLOCK, "VirtIO Block Adapter");
286 
287 static int
288 vtblk_modevent(module_t mod, int type, void *unused)
289 {
290 	int error;
291 
292 	error = 0;
293 
294 	switch (type) {
295 	case MOD_LOAD:
296 	case MOD_QUIESCE:
297 	case MOD_UNLOAD:
298 	case MOD_SHUTDOWN:
299 		break;
300 	default:
301 		error = EOPNOTSUPP;
302 		break;
303 	}
304 
305 	return (error);
306 }
307 
308 static int
309 vtblk_probe(device_t dev)
310 {
311 	return (VIRTIO_SIMPLE_PROBE(dev, virtio_blk));
312 }
313 
314 static int
315 vtblk_attach(device_t dev)
316 {
317 	struct vtblk_softc *sc;
318 	struct virtio_blk_config blkcfg;
319 	int error;
320 
321 	sc = device_get_softc(dev);
322 	sc->vtblk_dev = dev;
323 	virtio_set_feature_desc(dev, vtblk_feature_desc);
324 
325 	VTBLK_LOCK_INIT(sc, device_get_nameunit(dev));
326 	bioq_init(&sc->vtblk_bioq);
327 	TAILQ_INIT(&sc->vtblk_dump_queue);
328 	TAILQ_INIT(&sc->vtblk_req_free);
329 	TAILQ_INIT(&sc->vtblk_req_ready);
330 
331 	vtblk_setup_sysctl(sc);
332 
333 	error = vtblk_setup_features(sc);
334 	if (error) {
335 		device_printf(dev, "cannot setup features\n");
336 		goto fail;
337 	}
338 
339 	vtblk_read_config(sc, &blkcfg);
340 
341 	/*
342 	 * With the current sglist(9) implementation, it is not easy
343 	 * for us to honor a maximum segment size, as adjacent segments
344 	 * are coalesced. For now, just make sure the host's maximum
345 	 * segment size is at least the maximum supported transfer size.
346 	 */
347 	if (virtio_with_feature(dev, VIRTIO_BLK_F_SIZE_MAX)) {
348 		if (blkcfg.size_max < maxphys) {
349 			error = ENOTSUP;
350 			device_printf(dev, "host requires unsupported "
351 			    "maximum segment size feature\n");
352 			goto fail;
353 		}
354 	}
355 
356 	sc->vtblk_max_nsegs = vtblk_maximum_segments(sc, &blkcfg);
357 	if (sc->vtblk_max_nsegs <= VTBLK_MIN_SEGMENTS) {
358 		error = EINVAL;
359 		device_printf(dev, "fewer than minimum number of segments "
360 		    "allowed: %d\n", sc->vtblk_max_nsegs);
361 		goto fail;
362 	}
363 
364 	sc->vtblk_sglist = sglist_alloc(sc->vtblk_max_nsegs, M_NOWAIT);
365 	if (sc->vtblk_sglist == NULL) {
366 		error = ENOMEM;
367 		device_printf(dev, "cannot allocate sglist\n");
368 		goto fail;
369 	}
370 
371 	/*
372 	 * If vtblk_max_nsegs == VTBLK_MIN_SEGMENTS + 1, the device only
373 	 * supports a single data segment; in that case we need busdma to
374 	 * align to a page boundary so we can send a *contiguous*,
375 	 * page-sized request to the host.
376 	 */
377 	if (sc->vtblk_max_nsegs == VTBLK_MIN_SEGMENTS + 1)
378 		sc->vtblk_flags |= VTBLK_FLAG_BUSDMA_ALIGN;
379 	error = bus_dma_tag_create(
380 	    bus_get_dma_tag(dev),			/* parent */
381 	    (sc->vtblk_flags & VTBLK_FLAG_BUSDMA_ALIGN) ? PAGE_SIZE : 1,
382 	    0,						/* boundary */
383 	    BUS_SPACE_MAXADDR,				/* lowaddr */
384 	    BUS_SPACE_MAXADDR,				/* highaddr */
385 	    NULL, NULL,					/* filter, filterarg */
386 	    maxphys,					/* max request size */
387 	    sc->vtblk_max_nsegs - VTBLK_MIN_SEGMENTS,	/* max # segments */
388 	    maxphys,					/* maxsegsize */
389 	    0,						/* flags */
390 	    busdma_lock_mutex,				/* lockfunc */
391 	    &sc->vtblk_mtx,				/* lockarg */
392 	    &sc->vtblk_dmat);
393 	if (error) {
394 		device_printf(dev, "cannot create bus dma tag\n");
395 		goto fail;
396 	}
397 
398 #ifdef __powerpc__
399 	/*
400 	 * Virtio uses physical addresses rather than bus addresses, so we
401 	 * need to ask busdma to skip the iommu physical->bus mapping.  At
402 	 * present, this only matters on the powerpc architectures.
403 	 */
404 	bus_dma_tag_set_iommu(sc->vtblk_dmat, NULL, NULL);
405 #endif
406 
407 	error = vtblk_alloc_virtqueue(sc);
408 	if (error) {
409 		device_printf(dev, "cannot allocate virtqueue\n");
410 		goto fail;
411 	}
412 
413 	error = vtblk_request_prealloc(sc);
414 	if (error) {
415 		device_printf(dev, "cannot preallocate requests\n");
416 		goto fail;
417 	}
418 
419 	vtblk_alloc_disk(sc, &blkcfg);
420 
421 	error = virtio_setup_intr(dev, INTR_TYPE_BIO | INTR_ENTROPY);
422 	if (error) {
423 		device_printf(dev, "cannot setup virtqueue interrupt\n");
424 		goto fail;
425 	}
426 
427 	virtqueue_enable_intr(sc->vtblk_vq);
428 
429 fail:
430 	if (error)
431 		vtblk_detach(dev);
432 
433 	return (error);
434 }
435 
436 static int
437 vtblk_detach(device_t dev)
438 {
439 	struct vtblk_softc *sc;
440 
441 	sc = device_get_softc(dev);
442 
443 	VTBLK_LOCK(sc);
444 	sc->vtblk_flags |= VTBLK_FLAG_DETACH;
445 	if (device_is_attached(dev))
446 		vtblk_stop(sc);
447 	VTBLK_UNLOCK(sc);
448 
449 	vtblk_drain(sc);
450 
451 	if (sc->vtblk_disk != NULL) {
452 		disk_destroy(sc->vtblk_disk);
453 		sc->vtblk_disk = NULL;
454 	}
455 
456 	if (sc->vtblk_dmat != NULL) {
457 		bus_dma_tag_destroy(sc->vtblk_dmat);
458 		sc->vtblk_dmat = NULL;
459 	}
460 
461 	if (sc->vtblk_sglist != NULL) {
462 		sglist_free(sc->vtblk_sglist);
463 		sc->vtblk_sglist = NULL;
464 	}
465 
466 	VTBLK_LOCK_DESTROY(sc);
467 
468 	return (0);
469 }
470 
471 static int
472 vtblk_suspend(device_t dev)
473 {
474 	struct vtblk_softc *sc;
475 	int error;
476 
477 	sc = device_get_softc(dev);
478 
479 	VTBLK_LOCK(sc);
480 	sc->vtblk_flags |= VTBLK_FLAG_SUSPEND;
481 	/* XXX BMV: virtio_stop(), etc needed here? */
482 	error = vtblk_quiesce(sc);
483 	if (error)
484 		sc->vtblk_flags &= ~VTBLK_FLAG_SUSPEND;
485 	VTBLK_UNLOCK(sc);
486 
487 	return (error);
488 }
489 
490 static int
491 vtblk_resume(device_t dev)
492 {
493 	struct vtblk_softc *sc;
494 
495 	sc = device_get_softc(dev);
496 
497 	VTBLK_LOCK(sc);
498 	/* XXX BMV: virtio_reinit(), etc needed here? */
499 	sc->vtblk_flags &= ~VTBLK_FLAG_SUSPEND;
500 	vtblk_startio(sc);
501 	VTBLK_UNLOCK(sc);
502 
503 	return (0);
504 }
505 
506 static int
507 vtblk_shutdown(device_t dev)
508 {
509 
510 	return (0);
511 }
512 
513 static int
514 vtblk_attach_completed(device_t dev)
515 {
516 	struct vtblk_softc *sc;
517 
518 	sc = device_get_softc(dev);
519 
520 	/*
521 	 * Create the disk after attach has completed, as VIRTIO_BLK_T_GET_ID
522 	 * can only be processed after the device has acknowledged
523 	 * VIRTIO_CONFIG_STATUS_DRIVER_OK.
524 	 */
525 	vtblk_create_disk(sc);
526 	return (0);
527 }
528 
529 static int
530 vtblk_config_change(device_t dev)
531 {
532 	struct vtblk_softc *sc;
533 	struct virtio_blk_config blkcfg;
534 	uint64_t capacity;
535 
536 	sc = device_get_softc(dev);
537 
538 	vtblk_read_config(sc, &blkcfg);
539 
540 	/* Capacity is always in 512-byte units. */
541 	capacity = blkcfg.capacity * VTBLK_BSIZE;
542 
543 	if (sc->vtblk_disk->d_mediasize != capacity)
544 		vtblk_resize_disk(sc, capacity);
545 
546 	return (0);
547 }
548 
549 static int
550 vtblk_open(struct disk *dp)
551 {
552 	struct vtblk_softc *sc;
553 
554 	if ((sc = dp->d_drv1) == NULL)
555 		return (ENXIO);
556 
557 	return (sc->vtblk_flags & VTBLK_FLAG_DETACH ? ENXIO : 0);
558 }
559 
560 static int
561 vtblk_close(struct disk *dp)
562 {
563 	struct vtblk_softc *sc;
564 
565 	if ((sc = dp->d_drv1) == NULL)
566 		return (ENXIO);
567 
568 	return (0);
569 }
570 
571 static int
572 vtblk_ioctl(struct disk *dp, u_long cmd, void *addr, int flag,
573     struct thread *td)
574 {
575 	struct vtblk_softc *sc;
576 
577 	if ((sc = dp->d_drv1) == NULL)
578 		return (ENXIO);
579 
580 	return (ENOTTY);
581 }
582 
583 static int
584 vtblk_dump(void *arg, void *virtual, off_t offset, size_t length)
585 {
586 	struct disk *dp;
587 	struct vtblk_softc *sc;
588 	int error;
589 
590 	dp = arg;
591 	error = 0;
592 
593 	if ((sc = dp->d_drv1) == NULL)
594 		return (ENXIO);
595 
596 	VTBLK_LOCK(sc);
597 
598 	vtblk_dump_quiesce(sc);
599 
600 	if (length > 0)
601 		error = vtblk_dump_write(sc, virtual, offset, length);
602 	if (error || (virtual == NULL && offset == 0))
603 		vtblk_dump_complete(sc);
604 
605 	VTBLK_UNLOCK(sc);
606 
607 	return (error);
608 }
609 
610 static void
611 vtblk_strategy(struct bio *bp)
612 {
613 	struct vtblk_softc *sc;
614 
615 	if ((sc = bp->bio_disk->d_drv1) == NULL) {
616 		vtblk_bio_done(NULL, bp, EINVAL);
617 		return;
618 	}
619 
620 	if ((bp->bio_cmd != BIO_READ) && (bp->bio_cmd != BIO_WRITE) &&
621 	    (bp->bio_cmd != BIO_FLUSH) && (bp->bio_cmd != BIO_DELETE)) {
622 		vtblk_bio_done(sc, bp, EOPNOTSUPP);
623 		return;
624 	}
625 
626 	VTBLK_LOCK(sc);
627 
628 	if (sc->vtblk_flags & VTBLK_FLAG_DETACH) {
629 		VTBLK_UNLOCK(sc);
630 		vtblk_bio_done(sc, bp, ENXIO);
631 		return;
632 	}
633 
634 	bioq_insert_tail(&sc->vtblk_bioq, bp);
635 	vtblk_startio(sc);
636 
637 	VTBLK_UNLOCK(sc);
638 }
639 
640 static int
641 vtblk_negotiate_features(struct vtblk_softc *sc)
642 {
643 	device_t dev;
644 	uint64_t features;
645 
646 	dev = sc->vtblk_dev;
647 	features = virtio_bus_is_modern(dev) ? VTBLK_MODERN_FEATURES :
648 	    VTBLK_LEGACY_FEATURES;
649 
650 	sc->vtblk_features = virtio_negotiate_features(dev, features);
651 	return (virtio_finalize_features(dev));
652 }
653 
654 static int
655 vtblk_setup_features(struct vtblk_softc *sc)
656 {
657 	device_t dev;
658 	int error;
659 
660 	dev = sc->vtblk_dev;
661 
662 	error = vtblk_negotiate_features(sc);
663 	if (error)
664 		return (error);
665 
666 	if (virtio_with_feature(dev, VIRTIO_RING_F_INDIRECT_DESC))
667 		sc->vtblk_flags |= VTBLK_FLAG_INDIRECT;
668 	if (virtio_with_feature(dev, VIRTIO_BLK_F_CONFIG_WCE))
669 		sc->vtblk_flags |= VTBLK_FLAG_WCE_CONFIG;
670 
671 	/* Legacy. */
672 	if (virtio_with_feature(dev, VIRTIO_BLK_F_BARRIER))
673 		sc->vtblk_flags |= VTBLK_FLAG_BARRIER;
674 
675 	return (0);
676 }
677 
678 static int
679 vtblk_maximum_segments(struct vtblk_softc *sc,
680     struct virtio_blk_config *blkcfg)
681 {
682 	device_t dev;
683 	int nsegs;
684 
685 	dev = sc->vtblk_dev;
686 	nsegs = VTBLK_MIN_SEGMENTS;
687 
688 	if (virtio_with_feature(dev, VIRTIO_BLK_F_SEG_MAX)) {
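		/*
		 * A maxphys-sized transfer needs at most
		 * maxphys / PAGE_SIZE + 1 data segments, since an unaligned
		 * buffer may straddle one additional page.
		 */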
689 		nsegs += MIN(blkcfg->seg_max, maxphys / PAGE_SIZE + 1);
690 		if (sc->vtblk_flags & VTBLK_FLAG_INDIRECT)
691 			nsegs = MIN(nsegs, VIRTIO_MAX_INDIRECT);
692 	} else
693 		nsegs += 1;
694 
695 	return (nsegs);
696 }
697 
698 static int
699 vtblk_alloc_virtqueue(struct vtblk_softc *sc)
700 {
701 	device_t dev;
702 	struct vq_alloc_info vq_info;
703 
704 	dev = sc->vtblk_dev;
705 
706 	VQ_ALLOC_INFO_INIT(&vq_info, sc->vtblk_max_nsegs,
707 	    vtblk_vq_intr, sc, &sc->vtblk_vq,
708 	    "%s request", device_get_nameunit(dev));
709 
710 	return (virtio_alloc_virtqueues(dev, 0, 1, &vq_info));
711 }
712 
713 static void
714 vtblk_resize_disk(struct vtblk_softc *sc, uint64_t new_capacity)
715 {
716 	device_t dev;
717 	struct disk *dp;
718 	int error;
719 
720 	dev = sc->vtblk_dev;
721 	dp = sc->vtblk_disk;
722 
723 	dp->d_mediasize = new_capacity;
724 	if (bootverbose) {
725 		device_printf(dev, "resized to %juMB (%ju %u byte sectors)\n",
726 		    (uintmax_t) dp->d_mediasize >> 20,
727 		    (uintmax_t) dp->d_mediasize / dp->d_sectorsize,
728 		    dp->d_sectorsize);
729 	}
730 
731 	error = disk_resize(dp, M_NOWAIT);
732 	if (error) {
733 		device_printf(dev,
734 		    "disk_resize(9) failed, error: %d\n", error);
735 	}
736 }
737 
738 static void
739 vtblk_alloc_disk(struct vtblk_softc *sc, struct virtio_blk_config *blkcfg)
740 {
741 	device_t dev;
742 	struct disk *dp;
743 
744 	dev = sc->vtblk_dev;
745 
746 	sc->vtblk_disk = dp = disk_alloc();
747 	dp->d_open = vtblk_open;
748 	dp->d_close = vtblk_close;
749 	dp->d_ioctl = vtblk_ioctl;
750 	dp->d_strategy = vtblk_strategy;
751 	dp->d_name = VTBLK_DISK_NAME;
752 	dp->d_unit = device_get_unit(dev);
753 	dp->d_drv1 = sc;
754 	dp->d_flags = DISKFLAG_UNMAPPED_BIO | DISKFLAG_DIRECT_COMPLETION;
755 	dp->d_hba_vendor = virtio_get_vendor(dev);
756 	dp->d_hba_device = virtio_get_device(dev);
757 	dp->d_hba_subvendor = virtio_get_subvendor(dev);
758 	dp->d_hba_subdevice = virtio_get_subdevice(dev);
759 
760 	if (virtio_with_feature(dev, VIRTIO_BLK_F_RO))
761 		dp->d_flags |= DISKFLAG_WRITE_PROTECT;
762 	else {
763 		if (virtio_with_feature(dev, VIRTIO_BLK_F_FLUSH))
764 			dp->d_flags |= DISKFLAG_CANFLUSHCACHE;
765 		dp->d_dump = vtblk_dump;
766 	}
767 
768 	/* Capacity is always in 512-byte units. */
769 	dp->d_mediasize = blkcfg->capacity * VTBLK_BSIZE;
770 
771 	if (virtio_with_feature(dev, VIRTIO_BLK_F_BLK_SIZE))
772 		dp->d_sectorsize = blkcfg->blk_size;
773 	else
774 		dp->d_sectorsize = VTBLK_BSIZE;
775 
776 	/*
777 	 * The VirtIO maximum I/O size is given in terms of segments.
778 	 * However, FreeBSD limits I/O size by logical buffer size, not
779 	 * by physically contiguous pages. Therefore, we have to assume
780 	 * no pages are contiguous. This may impose an artificially low
781 	 * maximum I/O size. But in practice, since QEMU advertises 128
782 	 * segments, this gives us a maximum I/O size of 125 * PAGE_SIZE,
783 	 * which is typically greater than maxphys. Eventually we should
784 	 * just advertise maxphys and split buffers that are too big.
785 	 *
786 	 * If we're not asking busdma to align data to page boundaries, the
787 	 * maximum I/O size is reduced by PAGE_SIZE in order to accommodate
788 	 * unaligned I/Os.
789 	 */
790 	dp->d_maxsize = (sc->vtblk_max_nsegs - VTBLK_MIN_SEGMENTS) *
791 	    PAGE_SIZE;
792 	if ((sc->vtblk_flags & VTBLK_FLAG_BUSDMA_ALIGN) == 0)
793 		dp->d_maxsize -= PAGE_SIZE;
794 
795 	if (virtio_with_feature(dev, VIRTIO_BLK_F_GEOMETRY)) {
796 		dp->d_fwsectors = blkcfg->geometry.sectors;
797 		dp->d_fwheads = blkcfg->geometry.heads;
798 	}
799 
800 	if (virtio_with_feature(dev, VIRTIO_BLK_F_TOPOLOGY) &&
801 	    blkcfg->topology.physical_block_exp > 0) {
802 		dp->d_stripesize = dp->d_sectorsize *
803 		    (1 << blkcfg->topology.physical_block_exp);
804 		dp->d_stripeoffset = (dp->d_stripesize -
805 		    blkcfg->topology.alignment_offset * dp->d_sectorsize) %
806 		    dp->d_stripesize;
807 	}
808 
809 	if (virtio_with_feature(dev, VIRTIO_BLK_F_DISCARD)) {
810 		dp->d_flags |= DISKFLAG_CANDELETE;
811 		dp->d_delmaxsize = blkcfg->max_discard_sectors * VTBLK_BSIZE;
812 	}
813 
814 	if (vtblk_write_cache_enabled(sc, blkcfg) != 0)
815 		sc->vtblk_write_cache = VTBLK_CACHE_WRITEBACK;
816 	else
817 		sc->vtblk_write_cache = VTBLK_CACHE_WRITETHROUGH;
818 }
819 
820 static void
821 vtblk_create_disk(struct vtblk_softc *sc)
822 {
823 	struct disk *dp;
824 
825 	dp = sc->vtblk_disk;
826 
827 	vtblk_ident(sc);
828 
829 	device_printf(sc->vtblk_dev, "%juMB (%ju %u byte sectors)\n",
830 	    (uintmax_t) dp->d_mediasize >> 20,
831 	    (uintmax_t) dp->d_mediasize / dp->d_sectorsize,
832 	    dp->d_sectorsize);
833 
834 	disk_create(dp, DISK_VERSION);
835 }
836 
837 static int
838 vtblk_request_prealloc(struct vtblk_softc *sc)
839 {
840 	struct vtblk_request *req;
841 	int i, nreqs;
842 
843 	nreqs = virtqueue_size(sc->vtblk_vq);
844 
845 	/*
846 	 * Preallocate sufficient requests to keep the virtqueue full. Each
847 	 * request consumes VTBLK_MIN_SEGMENTS or more descriptors, so reduce
848 	 * the number allocated when indirect descriptors are not available.
849 	 */
850 	if ((sc->vtblk_flags & VTBLK_FLAG_INDIRECT) == 0)
851 		nreqs /= VTBLK_MIN_SEGMENTS;
852 
853 	for (i = 0; i < nreqs; i++) {
854 		req = malloc(sizeof(struct vtblk_request), M_DEVBUF, M_NOWAIT);
855 		if (req == NULL)
856 			return (ENOMEM);
857 
858 		req->vbr_sc = sc;
859 		if (bus_dmamap_create(sc->vtblk_dmat, 0, &req->vbr_mapp)) {
860 			free(req, M_DEVBUF);
861 			return (ENOMEM);
862 		}
863 
864 		MPASS(sglist_count(&req->vbr_hdr, sizeof(req->vbr_hdr)) == 1);
865 		MPASS(sglist_count(&req->vbr_ack, sizeof(req->vbr_ack)) == 1);
866 
867 		sc->vtblk_request_count++;
868 		vtblk_request_enqueue(sc, req);
869 	}
870 
871 	return (0);
872 }
873 
874 static void
875 vtblk_request_free(struct vtblk_softc *sc)
876 {
877 	struct vtblk_request *req;
878 
879 	MPASS(TAILQ_EMPTY(&sc->vtblk_req_ready));
880 
881 	while ((req = vtblk_request_dequeue(sc)) != NULL) {
882 		sc->vtblk_request_count--;
883 		bus_dmamap_destroy(sc->vtblk_dmat, req->vbr_mapp);
884 		free(req, M_DEVBUF);
885 	}
886 
887 	KASSERT(sc->vtblk_request_count == 0,
888 	    ("%s: leaked %d requests", __func__, sc->vtblk_request_count));
889 }
890 
891 static struct vtblk_request *
892 vtblk_request_dequeue(struct vtblk_softc *sc)
893 {
894 	struct vtblk_request *req;
895 
896 	req = TAILQ_FIRST(&sc->vtblk_req_free);
897 	if (req != NULL) {
898 		TAILQ_REMOVE(&sc->vtblk_req_free, req, vbr_link);
899 		bzero(&req->vbr_hdr, sizeof(struct vtblk_request) -
900 		    offsetof(struct vtblk_request, vbr_hdr));
901 	}
902 
903 	return (req);
904 }
905 
906 static void
907 vtblk_request_enqueue(struct vtblk_softc *sc, struct vtblk_request *req)
908 {
909 
910 	TAILQ_INSERT_HEAD(&sc->vtblk_req_free, req, vbr_link);
911 }
912 
913 static struct vtblk_request *
914 vtblk_request_next_ready(struct vtblk_softc *sc)
915 {
916 	struct vtblk_request *req;
917 
918 	req = TAILQ_FIRST(&sc->vtblk_req_ready);
919 	if (req != NULL)
920 		TAILQ_REMOVE(&sc->vtblk_req_ready, req, vbr_link);
921 
922 	return (req);
923 }
924 
925 static void
926 vtblk_request_requeue_ready(struct vtblk_softc *sc, struct vtblk_request *req)
927 {
928 
929 	/* NOTE: Currently, there will be at most one request in the queue. */
930 	TAILQ_INSERT_HEAD(&sc->vtblk_req_ready, req, vbr_link);
931 }
932 
933 static struct vtblk_request *
934 vtblk_request_next(struct vtblk_softc *sc)
935 {
936 	struct vtblk_request *req;
937 
938 	req = vtblk_request_next_ready(sc);
939 	if (req != NULL)
940 		return (req);
941 
942 	return (vtblk_request_bio(sc));
943 }
944 
945 static struct vtblk_request *
946 vtblk_request_bio(struct vtblk_softc *sc)
947 {
948 	struct bio_queue_head *bioq;
949 	struct vtblk_request *req;
950 	struct bio *bp;
951 
952 	bioq = &sc->vtblk_bioq;
953 
954 	if (bioq_first(bioq) == NULL)
955 		return (NULL);
956 
957 	req = vtblk_request_dequeue(sc);
958 	if (req == NULL)
959 		return (NULL);
960 
961 	bp = bioq_takefirst(bioq);
962 	req->vbr_bp = bp;
963 	req->vbr_ack = -1;
964 	req->vbr_hdr.ioprio = vtblk_gtoh32(sc, 1);
965 
966 	switch (bp->bio_cmd) {
967 	case BIO_FLUSH:
968 		req->vbr_hdr.type = vtblk_gtoh32(sc, VIRTIO_BLK_T_FLUSH);
969 		req->vbr_hdr.sector = 0;
970 		break;
971 	case BIO_READ:
972 		req->vbr_hdr.type = vtblk_gtoh32(sc, VIRTIO_BLK_T_IN);
973 		req->vbr_hdr.sector = vtblk_gtoh64(sc, bp->bio_offset / VTBLK_BSIZE);
974 		break;
975 	case BIO_WRITE:
976 		req->vbr_hdr.type = vtblk_gtoh32(sc, VIRTIO_BLK_T_OUT);
977 		req->vbr_hdr.sector = vtblk_gtoh64(sc, bp->bio_offset / VTBLK_BSIZE);
978 		break;
979 	case BIO_DELETE:
980 		req->vbr_hdr.type = vtblk_gtoh32(sc, VIRTIO_BLK_T_DISCARD);
981 		req->vbr_hdr.sector = vtblk_gtoh64(sc, bp->bio_offset / VTBLK_BSIZE);
982 		break;
983 	default:
984 		panic("%s: bio with unhandled cmd: %d", __func__, bp->bio_cmd);
985 	}
986 
987 	if (bp->bio_flags & BIO_ORDERED)
988 		req->vbr_hdr.type |= vtblk_gtoh32(sc, VIRTIO_BLK_T_BARRIER);
989 
990 	return (req);
991 }
992 
993 static int
994 vtblk_request_execute(struct vtblk_request *req, int flags)
995 {
996 	struct vtblk_softc *sc = req->vbr_sc;
997 	struct bio *bp = req->vbr_bp;
998 	int error = 0;
999 
1000 	/*
1001 	 * Call via bus_dmamap_load_bio or directly depending on whether we
1002 	 * have a buffer we need to map.  If we don't have a busdma map,
1003 	 * try to perform the I/O directly and hope that it works (this will
1004 	 * happen when dumping).
1005 	 */
1006 	if ((req->vbr_mapp != NULL) &&
1007 	    (bp->bio_cmd == BIO_READ || bp->bio_cmd == BIO_WRITE)) {
1008 		error = bus_dmamap_load_bio(sc->vtblk_dmat, req->vbr_mapp,
1009 		    req->vbr_bp, vtblk_request_execute_cb, req, flags);
1010 		if (error == EINPROGRESS) {
1011 			req->vbr_busdma_wait = 1;
1012 			sc->vtblk_flags |= VTBLK_FLAG_BUSDMA_WAIT;
1013 		}
1014 	} else {
1015 		vtblk_request_execute_cb(req, NULL, 0, 0);
1016 	}
1017 
1018 	return (error ? error : req->vbr_error);
1019 }
1020 
1021 static void
1022 vtblk_request_execute_cb(void * callback_arg, bus_dma_segment_t * segs,
1023     int nseg, int error)
1024 {
1025 	struct vtblk_request *req;
1026 	struct vtblk_softc *sc;
1027 	struct virtqueue *vq;
1028 	struct sglist *sg;
1029 	struct bio *bp;
1030 	int ordered, readable, writable, i;
1031 
1032 	req = (struct vtblk_request *)callback_arg;
1033 	sc = req->vbr_sc;
1034 	vq = sc->vtblk_vq;
1035 	sg = sc->vtblk_sglist;
1036 	bp = req->vbr_bp;
1037 	ordered = 0;
1038 	writable = 0;
1039 
1040 	/*
1041 	 * If we paused request queueing while we waited for busdma to call us
1042 	 * asynchronously, unpause it now; this request made it through so we
1043 	 * don't need to worry about others getting ahead of us.  (Note that we
1044 	 * hold the device mutex so nothing will happen until after we return
1045 	 * anyway.)
1046 	 */
1047 	if (req->vbr_busdma_wait)
1048 		sc->vtblk_flags &= ~VTBLK_FLAG_BUSDMA_WAIT;
1049 
1050 	/* Fail on errors from busdma. */
1051 	if (error)
1052 		goto out1;
1053 
1054 	/*
1055 	 * Some hosts (such as bhyve) do not implement the barrier feature,
1056 	 * so we emulate it in the driver by allowing the barrier request
1057 	 * to be the only one in flight.
1058 	 */
1059 	if ((sc->vtblk_flags & VTBLK_FLAG_BARRIER) == 0) {
1060 		if (sc->vtblk_req_ordered != NULL) {
1061 			error = EBUSY;
1062 			goto out;
1063 		}
1064 		if (bp->bio_flags & BIO_ORDERED) {
1065 			if (!virtqueue_empty(vq)) {
1066 				error = EBUSY;
1067 				goto out;
1068 			}
1069 			ordered = 1;
1070 			req->vbr_hdr.type &= vtblk_gtoh32(sc,
1071 				~VIRTIO_BLK_T_BARRIER);
1072 		}
1073 	}
1074 
1075 	sglist_reset(sg);
1076 	sglist_append(sg, &req->vbr_hdr, sizeof(struct virtio_blk_outhdr));
1077 
1078 	if (bp->bio_cmd == BIO_READ || bp->bio_cmd == BIO_WRITE) {
1079 		/*
1080 		 * We cast bus_addr_t to vm_paddr_t here; since we skip the
1081 		 * iommu mapping (see vtblk_attach) this should be safe.
1082 		 */
1083 		for (i = 0; i < nseg; i++) {
1084 			error = sglist_append_phys(sg,
1085 			    (vm_paddr_t)segs[i].ds_addr, segs[i].ds_len);
1086 			if (error || sg->sg_nseg == sg->sg_maxseg) {
1087 				panic("%s: bio %p data buffer too big %d",
1088 				    __func__, bp, error);
1089 			}
1090 		}
1091 
1092 		/* Special handling for dump, which bypasses busdma. */
1093 		if (req->vbr_mapp == NULL) {
1094 			error = sglist_append_bio(sg, bp);
1095 			if (error || sg->sg_nseg == sg->sg_maxseg) {
1096 				panic("%s: bio %p data buffer too big %d",
1097 				    __func__, bp, error);
1098 			}
1099 		}
1100 
1101 		/* BIO_READ means the host writes into our buffer. */
1102 		if (bp->bio_cmd == BIO_READ)
1103 			writable = sg->sg_nseg - 1;
1104 	} else if (bp->bio_cmd == BIO_DELETE) {
1105 		struct virtio_blk_discard_write_zeroes *discard;
1106 
1107 		discard = malloc(sizeof(*discard), M_DEVBUF, M_NOWAIT | M_ZERO);
1108 		if (discard == NULL) {
1109 			error = ENOMEM;
1110 			goto out;
1111 		}
1112 
1113 		bp->bio_driver1 = discard;
1114 		discard->sector = vtblk_gtoh64(sc, bp->bio_offset / VTBLK_BSIZE);
1115 		discard->num_sectors = vtblk_gtoh32(sc, bp->bio_bcount / VTBLK_BSIZE);
1116 		error = sglist_append(sg, discard, sizeof(*discard));
1117 		if (error || sg->sg_nseg == sg->sg_maxseg) {
1118 			panic("%s: bio %p data buffer too big %d",
1119 			    __func__, bp, error);
1120 		}
1121 	}
1122 
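	/*
	 * The status byte is always written by the host, so account for it
	 * as a writable segment before appending it.
	 */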
1123 	writable++;
1124 	sglist_append(sg, &req->vbr_ack, sizeof(uint8_t));
1125 	readable = sg->sg_nseg - writable;
1126 
1127 	if (req->vbr_mapp != NULL) {
1128 		switch (bp->bio_cmd) {
1129 		case BIO_READ:
1130 			bus_dmamap_sync(sc->vtblk_dmat, req->vbr_mapp,
1131 			    BUS_DMASYNC_PREREAD);
1132 			break;
1133 		case BIO_WRITE:
1134 			bus_dmamap_sync(sc->vtblk_dmat, req->vbr_mapp,
1135 			    BUS_DMASYNC_PREWRITE);
1136 			break;
1137 		}
1138 	}
1139 
1140 	error = virtqueue_enqueue(vq, req, sg, readable, writable);
1141 	if (error == 0 && ordered)
1142 		sc->vtblk_req_ordered = req;
1143 
1144 	/*
1145 	 * If we were called asynchronously, we need to notify the queue that
1146 	 * we've added a new request, since any notification from startio has
1147 	 * already happened and will not cover this request.
1148 	 */
1149 	if (error == 0 && req->vbr_busdma_wait)
1150 		virtqueue_notify(vq);
1151 
1152 out:
1153 	if (error && (req->vbr_mapp != NULL))
1154 		bus_dmamap_unload(sc->vtblk_dmat, req->vbr_mapp);
1155 out1:
1156 	if (error && req->vbr_requeue_on_error)
1157 		vtblk_request_requeue_ready(sc, req);
1158 	req->vbr_error = error;
1159 }
1160 
1161 static int
1162 vtblk_request_error(struct vtblk_request *req)
1163 {
1164 	int error;
1165 
1166 	switch (req->vbr_ack) {
1167 	case VIRTIO_BLK_S_OK:
1168 		error = 0;
1169 		break;
1170 	case VIRTIO_BLK_S_UNSUPP:
1171 		error = ENOTSUP;
1172 		break;
1173 	default:
1174 		error = EIO;
1175 		break;
1176 	}
1177 
1178 	return (error);
1179 }
1180 
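/*
 * Collect completed requests from the virtqueue onto the caller's bio queue
 * and return them to the free list.  The bios are biodone'd later, without
 * the lock held, by vtblk_done_completed() because of GEOM direct dispatch.
 */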
1181 static void
1182 vtblk_queue_completed(struct vtblk_softc *sc, struct bio_queue *queue)
1183 {
1184 	struct vtblk_request *req;
1185 	struct bio *bp;
1186 
1187 	while ((req = virtqueue_dequeue(sc->vtblk_vq, NULL)) != NULL) {
1188 		if (sc->vtblk_req_ordered != NULL) {
1189 			MPASS(sc->vtblk_req_ordered == req);
1190 			sc->vtblk_req_ordered = NULL;
1191 		}
1192 
1193 		bp = req->vbr_bp;
1194 		if (req->vbr_mapp != NULL) {
1195 			switch (bp->bio_cmd) {
1196 			case BIO_READ:
1197 				bus_dmamap_sync(sc->vtblk_dmat, req->vbr_mapp,
1198 				    BUS_DMASYNC_POSTREAD);
1199 				bus_dmamap_unload(sc->vtblk_dmat,
1200 				    req->vbr_mapp);
1201 				break;
1202 			case BIO_WRITE:
1203 				bus_dmamap_sync(sc->vtblk_dmat, req->vbr_mapp,
1204 				    BUS_DMASYNC_POSTWRITE);
1205 				bus_dmamap_unload(sc->vtblk_dmat,
1206 				    req->vbr_mapp);
1207 				break;
1208 			}
1209 		}
1210 		bp->bio_error = vtblk_request_error(req);
1211 		TAILQ_INSERT_TAIL(queue, bp, bio_queue);
1212 
1213 		vtblk_request_enqueue(sc, req);
1214 	}
1215 }
1216 
1217 static void
1218 vtblk_done_completed(struct vtblk_softc *sc, struct bio_queue *queue)
1219 {
1220 	struct bio *bp, *tmp;
1221 
1222 	TAILQ_FOREACH_SAFE(bp, queue, bio_queue, tmp) {
1223 		if (bp->bio_error != 0)
1224 			disk_err(bp, "hard error", -1, 1);
1225 		vtblk_bio_done(sc, bp, bp->bio_error);
1226 	}
1227 }
1228 
1229 static void
1230 vtblk_drain_vq(struct vtblk_softc *sc)
1231 {
1232 	struct virtqueue *vq;
1233 	struct vtblk_request *req;
1234 	int last;
1235 
1236 	vq = sc->vtblk_vq;
1237 	last = 0;
1238 
1239 	while ((req = virtqueue_drain(vq, &last)) != NULL) {
1240 		vtblk_bio_done(sc, req->vbr_bp, ENXIO);
1241 		vtblk_request_enqueue(sc, req);
1242 	}
1243 
1244 	sc->vtblk_req_ordered = NULL;
1245 	KASSERT(virtqueue_empty(vq), ("virtqueue not empty"));
1246 }
1247 
1248 static void
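/*
 * Fail all in-flight, ready, and queued bios with ENXIO and release the
 * preallocated requests.  Called while detaching the device.
 */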
1249 vtblk_drain(struct vtblk_softc *sc)
1250 {
1251 	struct bio_queue_head *bioq;
1252 	struct vtblk_request *req;
1253 	struct bio *bp;
1254 
1255 	bioq = &sc->vtblk_bioq;
1256 
1257 	if (sc->vtblk_vq != NULL) {
1258 		struct bio_queue queue;
1259 
1260 		TAILQ_INIT(&queue);
1261 		vtblk_queue_completed(sc, &queue);
1262 		vtblk_done_completed(sc, &queue);
1263 
1264 		vtblk_drain_vq(sc);
1265 	}
1266 
1267 	while ((req = vtblk_request_next_ready(sc)) != NULL) {
1268 		vtblk_bio_done(sc, req->vbr_bp, ENXIO);
1269 		vtblk_request_enqueue(sc, req);
1270 	}
1271 
1272 	while (bioq_first(bioq) != NULL) {
1273 		bp = bioq_takefirst(bioq);
1274 		vtblk_bio_done(sc, bp, ENXIO);
1275 	}
1276 
1277 	vtblk_request_free(sc);
1278 }
1279 
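/*
 * Submit queued bios to the virtqueue until it fills, the bio queue is
 * empty, or a request cannot be submitted immediately; notify the host
 * once if anything was enqueued.
 */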
1280 static void
1281 vtblk_startio(struct vtblk_softc *sc)
1282 {
1283 	struct virtqueue *vq;
1284 	struct vtblk_request *req;
1285 	int enq;
1286 
1287 	VTBLK_LOCK_ASSERT(sc);
1288 	vq = sc->vtblk_vq;
1289 	enq = 0;
1290 
1291 	if (sc->vtblk_flags & (VTBLK_FLAG_SUSPEND | VTBLK_FLAG_BUSDMA_WAIT))
1292 		return;
1293 
1294 	while (!virtqueue_full(vq)) {
1295 		req = vtblk_request_next(sc);
1296 		if (req == NULL)
1297 			break;
1298 
1299 		req->vbr_requeue_on_error = 1;
1300 		if (vtblk_request_execute(req, BUS_DMA_WAITOK))
1301 			break;
1302 
1303 		enq++;
1304 	}
1305 
1306 	if (enq > 0)
1307 		virtqueue_notify(vq);
1308 }
1309 
1310 static void
1311 vtblk_bio_done(struct vtblk_softc *sc, struct bio *bp, int error)
1312 {
1313 
1314 	/* Because of GEOM direct dispatch, we cannot hold any locks. */
1315 	if (sc != NULL)
1316 		VTBLK_LOCK_ASSERT_NOTOWNED(sc);
1317 
1318 	if (error) {
1319 		bp->bio_resid = bp->bio_bcount;
1320 		bp->bio_error = error;
1321 		bp->bio_flags |= BIO_ERROR;
1322 	} else {
1323 		kmsan_mark_bio(bp, KMSAN_STATE_INITED);
1324 	}
1325 
1326 	if (bp->bio_driver1 != NULL) {
1327 		free(bp->bio_driver1, M_DEVBUF);
1328 		bp->bio_driver1 = NULL;
1329 	}
1330 
1331 	biodone(bp);
1332 }
1333 
1334 #define VTBLK_GET_CONFIG(_dev, _feature, _field, _cfg)			\
1335 	if (virtio_with_feature(_dev, _feature)) {			\
1336 		virtio_read_device_config(_dev,				\
1337 		    offsetof(struct virtio_blk_config, _field),		\
1338 		    &(_cfg)->_field, sizeof((_cfg)->_field));		\
1339 	}
1340 
1341 static void
1342 vtblk_read_config(struct vtblk_softc *sc, struct virtio_blk_config *blkcfg)
1343 {
1344 	device_t dev;
1345 
1346 	dev = sc->vtblk_dev;
1347 
1348 	bzero(blkcfg, sizeof(struct virtio_blk_config));
1349 
1350 	/* The capacity is always available. */
1351 	virtio_read_device_config(dev, offsetof(struct virtio_blk_config,
1352 	    capacity), &blkcfg->capacity, sizeof(blkcfg->capacity));
1353 
1354 	/* Read the configuration if the feature was negotiated. */
1355 	VTBLK_GET_CONFIG(dev, VIRTIO_BLK_F_SIZE_MAX, size_max, blkcfg);
1356 	VTBLK_GET_CONFIG(dev, VIRTIO_BLK_F_SEG_MAX, seg_max, blkcfg);
1357 	VTBLK_GET_CONFIG(dev, VIRTIO_BLK_F_GEOMETRY,
1358 	    geometry.cylinders, blkcfg);
1359 	VTBLK_GET_CONFIG(dev, VIRTIO_BLK_F_GEOMETRY,
1360 	    geometry.heads, blkcfg);
1361 	VTBLK_GET_CONFIG(dev, VIRTIO_BLK_F_GEOMETRY,
1362 	    geometry.sectors, blkcfg);
1363 	VTBLK_GET_CONFIG(dev, VIRTIO_BLK_F_BLK_SIZE, blk_size, blkcfg);
1364 	VTBLK_GET_CONFIG(dev, VIRTIO_BLK_F_TOPOLOGY,
1365 	    topology.physical_block_exp, blkcfg);
1366 	VTBLK_GET_CONFIG(dev, VIRTIO_BLK_F_TOPOLOGY,
1367 	    topology.alignment_offset, blkcfg);
1368 	VTBLK_GET_CONFIG(dev, VIRTIO_BLK_F_TOPOLOGY,
1369 	    topology.min_io_size, blkcfg);
1370 	VTBLK_GET_CONFIG(dev, VIRTIO_BLK_F_TOPOLOGY,
1371 	    topology.opt_io_size, blkcfg);
1372 	VTBLK_GET_CONFIG(dev, VIRTIO_BLK_F_CONFIG_WCE, wce, blkcfg);
1373 	VTBLK_GET_CONFIG(dev, VIRTIO_BLK_F_DISCARD, max_discard_sectors,
1374 	    blkcfg);
1375 	VTBLK_GET_CONFIG(dev, VIRTIO_BLK_F_DISCARD, max_discard_seg, blkcfg);
1376 	VTBLK_GET_CONFIG(dev, VIRTIO_BLK_F_DISCARD, discard_sector_alignment,
1377 	    blkcfg);
1378 }
1379 
1380 #undef VTBLK_GET_CONFIG
1381 
1382 static void
1383 vtblk_ident(struct vtblk_softc *sc)
1384 {
1385 	struct bio buf;
1386 	struct disk *dp;
1387 	struct vtblk_request *req;
1388 	int len, error;
1389 
1390 	dp = sc->vtblk_disk;
1391 	len = MIN(VIRTIO_BLK_ID_BYTES, DISK_IDENT_SIZE);
1392 
1393 	if (vtblk_tunable_int(sc, "no_ident", vtblk_no_ident) != 0)
1394 		return;
1395 
1396 	req = vtblk_request_dequeue(sc);
1397 	if (req == NULL)
1398 		return;
1399 
1400 	req->vbr_ack = -1;
1401 	req->vbr_hdr.type = vtblk_gtoh32(sc, VIRTIO_BLK_T_GET_ID);
1402 	req->vbr_hdr.ioprio = vtblk_gtoh32(sc, 1);
1403 	req->vbr_hdr.sector = 0;
1404 
1405 	req->vbr_bp = &buf;
1406 	g_reset_bio(&buf);
1407 
1408 	buf.bio_cmd = BIO_READ;
1409 	buf.bio_data = dp->d_ident;
1410 	buf.bio_bcount = len;
1411 
1412 	VTBLK_LOCK(sc);
1413 	error = vtblk_poll_request(sc, req);
1414 	VTBLK_UNLOCK(sc);
1415 
1416 	vtblk_request_enqueue(sc, req);
1417 
1418 	if (error) {
1419 		device_printf(sc->vtblk_dev,
1420 		    "error getting device identifier: %d\n", error);
1421 	}
1422 }
1423 
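/*
 * Execute a single request and busy-wait for the host to complete it,
 * rather than waiting for an interrupt.  Used for the device identification
 * request and by the kernel dump path; the virtqueue must be empty.
 */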
1424 static int
1425 vtblk_poll_request(struct vtblk_softc *sc, struct vtblk_request *req)
1426 {
1427 	struct virtqueue *vq;
1428 	int error;
1429 
1430 	vq = sc->vtblk_vq;
1431 
1432 	if (!virtqueue_empty(vq))
1433 		return (EBUSY);
1434 
1435 	error = vtblk_request_execute(req, BUS_DMA_NOWAIT);
1436 	if (error)
1437 		return (error);
1438 
1439 	virtqueue_notify(vq);
1440 	virtqueue_poll(vq, NULL);
1441 
1442 	error = vtblk_request_error(req);
1443 	if (error && bootverbose) {
1444 		device_printf(sc->vtblk_dev,
1445 		    "%s: IO error: %d\n", __func__, error);
1446 	}
1447 
1448 	return (error);
1449 }
1450 
1451 static int
1452 vtblk_quiesce(struct vtblk_softc *sc)
1453 {
1454 	int error;
1455 
1456 	VTBLK_LOCK_ASSERT(sc);
1457 	error = 0;
1458 
1459 	while (!virtqueue_empty(sc->vtblk_vq)) {
1460 		if (mtx_sleep(&sc->vtblk_vq, VTBLK_MTX(sc), PRIBIO, "vtblkq",
1461 		    VTBLK_QUIESCE_TIMEOUT) == EWOULDBLOCK) {
1462 			error = EBUSY;
1463 			break;
1464 		}
1465 	}
1466 
1467 	return (error);
1468 }
1469 
1470 static void
1471 vtblk_vq_intr(void *xsc)
1472 {
1473 	struct vtblk_softc *sc;
1474 	struct virtqueue *vq;
1475 	struct bio_queue queue;
1476 
1477 	sc = xsc;
1478 	vq = sc->vtblk_vq;
1479 	TAILQ_INIT(&queue);
1480 
1481 	VTBLK_LOCK(sc);
1482 
1483 again:
1484 	if (sc->vtblk_flags & VTBLK_FLAG_DETACH)
1485 		goto out;
1486 
1487 	vtblk_queue_completed(sc, &queue);
1488 	vtblk_startio(sc);
1489 
1490 	if (virtqueue_enable_intr(vq) != 0) {
1491 		virtqueue_disable_intr(vq);
1492 		goto again;
1493 	}
1494 
1495 	if (sc->vtblk_flags & VTBLK_FLAG_SUSPEND)
1496 		wakeup(&sc->vtblk_vq);
1497 
1498 out:
1499 	VTBLK_UNLOCK(sc);
1500 	vtblk_done_completed(sc, &queue);
1501 }
1502 
1503 static void
1504 vtblk_stop(struct vtblk_softc *sc)
1505 {
1506 
1507 	virtqueue_disable_intr(sc->vtblk_vq);
1508 	virtio_stop(sc->vtblk_dev);
1509 }
1510 
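/*
 * Kernel dump support.  The dump path uses the dedicated vtblk_dump_request
 * and the polling path above; normal requests that complete while dumping
 * are parked on vtblk_dump_queue until vtblk_dump_complete() biodone's them.
 */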
1511 static void
1512 vtblk_dump_quiesce(struct vtblk_softc *sc)
1513 {
1514 
1515 	/*
1516 	 * Spin here until all the requests in-flight at the time of the
1517 	 * dump are completed and queued. The queued requests will be
1518 	 * biodone'd once the dump is finished.
1519 	 */
1520 	while (!virtqueue_empty(sc->vtblk_vq))
1521 		vtblk_queue_completed(sc, &sc->vtblk_dump_queue);
1522 }
1523 
1524 static int
1525 vtblk_dump_write(struct vtblk_softc *sc, void *virtual, off_t offset,
1526     size_t length)
1527 {
1528 	struct bio buf;
1529 	struct vtblk_request *req;
1530 
1531 	req = &sc->vtblk_dump_request;
1532 	req->vbr_sc = sc;
1533 	req->vbr_ack = -1;
1534 	req->vbr_hdr.type = vtblk_gtoh32(sc, VIRTIO_BLK_T_OUT);
1535 	req->vbr_hdr.ioprio = vtblk_gtoh32(sc, 1);
1536 	req->vbr_hdr.sector = vtblk_gtoh64(sc, offset / VTBLK_BSIZE);
1537 
1538 	req->vbr_bp = &buf;
1539 	g_reset_bio(&buf);
1540 
1541 	buf.bio_cmd = BIO_WRITE;
1542 	buf.bio_data = virtual;
1543 	buf.bio_bcount = length;
1544 
1545 	return (vtblk_poll_request(sc, req));
1546 }
1547 
1548 static int
1549 vtblk_dump_flush(struct vtblk_softc *sc)
1550 {
1551 	struct bio buf;
1552 	struct vtblk_request *req;
1553 
1554 	req = &sc->vtblk_dump_request;
1555 	req->vbr_sc = sc;
1556 	req->vbr_ack = -1;
1557 	req->vbr_hdr.type = vtblk_gtoh32(sc, VIRTIO_BLK_T_FLUSH);
1558 	req->vbr_hdr.ioprio = vtblk_gtoh32(sc, 1);
1559 	req->vbr_hdr.sector = 0;
1560 
1561 	req->vbr_bp = &buf;
1562 	g_reset_bio(&buf);
1563 
1564 	buf.bio_cmd = BIO_FLUSH;
1565 
1566 	return (vtblk_poll_request(sc, req));
1567 }
1568 
1569 static void
1570 vtblk_dump_complete(struct vtblk_softc *sc)
1571 {
1572 
1573 	vtblk_dump_flush(sc);
1574 
1575 	VTBLK_UNLOCK(sc);
1576 	vtblk_done_completed(sc, &sc->vtblk_dump_queue);
1577 	VTBLK_LOCK(sc);
1578 }
1579 
1580 static void
1581 vtblk_set_write_cache(struct vtblk_softc *sc, int wc)
1582 {
1583 
1584 	/* Set either writeback (1) or writethrough (0) mode. */
1585 	virtio_write_dev_config_1(sc->vtblk_dev,
1586 	    offsetof(struct virtio_blk_config, wce), wc);
1587 }
1588 
1589 static int
1590 vtblk_write_cache_enabled(struct vtblk_softc *sc,
1591     struct virtio_blk_config *blkcfg)
1592 {
1593 	int wc;
1594 
1595 	if (sc->vtblk_flags & VTBLK_FLAG_WCE_CONFIG) {
1596 		wc = vtblk_tunable_int(sc, "writecache_mode",
1597 		    vtblk_writecache_mode);
1598 		if (wc >= 0 && wc < VTBLK_CACHE_MAX)
1599 			vtblk_set_write_cache(sc, wc);
1600 		else
1601 			wc = blkcfg->wce;
1602 	} else
1603 		wc = virtio_with_feature(sc->vtblk_dev, VIRTIO_BLK_F_FLUSH);
1604 
1605 	return (wc);
1606 }
1607 
1608 static int
1609 vtblk_write_cache_sysctl(SYSCTL_HANDLER_ARGS)
1610 {
1611 	struct vtblk_softc *sc;
1612 	int wc, error;
1613 
1614 	sc = oidp->oid_arg1;
1615 	wc = sc->vtblk_write_cache;
1616 
1617 	error = sysctl_handle_int(oidp, &wc, 0, req);
1618 	if (error || req->newptr == NULL)
1619 		return (error);
1620 	if ((sc->vtblk_flags & VTBLK_FLAG_WCE_CONFIG) == 0)
1621 		return (EPERM);
1622 	if (wc < 0 || wc >= VTBLK_CACHE_MAX)
1623 		return (EINVAL);
1624 
1625 	VTBLK_LOCK(sc);
1626 	sc->vtblk_write_cache = wc;
1627 	vtblk_set_write_cache(sc, sc->vtblk_write_cache);
1628 	VTBLK_UNLOCK(sc);
1629 
1630 	return (0);
1631 }
1632 
1633 static void
1634 vtblk_setup_sysctl(struct vtblk_softc *sc)
1635 {
1636 	device_t dev;
1637 	struct sysctl_ctx_list *ctx;
1638 	struct sysctl_oid *tree;
1639 	struct sysctl_oid_list *child;
1640 
1641 	dev = sc->vtblk_dev;
1642 	ctx = device_get_sysctl_ctx(dev);
1643 	tree = device_get_sysctl_tree(dev);
1644 	child = SYSCTL_CHILDREN(tree);
1645 
1646 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "writecache_mode",
1647 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
1648 	    vtblk_write_cache_sysctl, "I",
1649 	    "Write cache mode (writethrough (0) or writeback (1))");
1650 }
1651 
1652 static int
1653 vtblk_tunable_int(struct vtblk_softc *sc, const char *knob, int def)
1654 {
1655 	char path[64];
1656 
1657 	snprintf(path, sizeof(path),
1658 	    "hw.vtblk.%d.%s", device_get_unit(sc->vtblk_dev), knob);
1659 	TUNABLE_INT_FETCH(path, &def);
1660 
1661 	return (def);
1662 }
1663