xref: /freebsd/sys/dev/virtio/block/virtio_blk.c (revision f3065e767def62d9b593dd7528c0eb121a7e1439)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause
3  *
4  * Copyright (c) 2011, Bryan Venteicher <bryanv@FreeBSD.org>
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice unmodified, this list of conditions, and the following
12  *    disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
18  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
19  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
20  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
21  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
22  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
26  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27  */
28 
29 /* Driver for VirtIO block devices. */
30 
31 #include <sys/cdefs.h>
32 __FBSDID("$FreeBSD$");
33 
34 #include <sys/param.h>
35 #include <sys/systm.h>
36 #include <sys/kernel.h>
37 #include <sys/bio.h>
38 #include <sys/malloc.h>
39 #include <sys/module.h>
40 #include <sys/msan.h>
41 #include <sys/sglist.h>
42 #include <sys/sysctl.h>
43 #include <sys/lock.h>
44 #include <sys/mutex.h>
45 #include <sys/queue.h>
46 
47 #include <geom/geom.h>
48 #include <geom/geom_disk.h>
49 
50 #include <machine/bus.h>
51 #include <machine/resource.h>
52 #include <sys/bus.h>
53 #include <sys/rman.h>
54 
55 #include <dev/virtio/virtio.h>
56 #include <dev/virtio/virtqueue.h>
57 #include <dev/virtio/block/virtio_blk.h>
58 
59 #include "virtio_if.h"
60 
61 struct vtblk_request {
62 	struct vtblk_softc		*vbr_sc;
63 	bus_dmamap_t			 vbr_mapp;
64 
65 	/* Fields after this point are zeroed for each request. */
66 	struct virtio_blk_outhdr	 vbr_hdr;
67 	struct bio			*vbr_bp;
68 	uint8_t				 vbr_ack;
69 	uint8_t				 vbr_requeue_on_error;
70 	uint8_t				 vbr_busdma_wait;
71 	int				 vbr_error;
72 	TAILQ_ENTRY(vtblk_request)	 vbr_link;
73 };
74 
/*
 * Write cache modes tracked in the softc.  VTBLK_CACHE_MAX is one past
 * the last real mode.
 */
enum vtblk_cache_mode {
	VTBLK_CACHE_WRITETHROUGH = 0,
	VTBLK_CACHE_WRITEBACK = 1,
	VTBLK_CACHE_MAX = 2
};
80 
81 struct vtblk_softc {
82 	device_t		 vtblk_dev;
83 	struct mtx		 vtblk_mtx;
84 	uint64_t		 vtblk_features;
85 	uint32_t		 vtblk_flags;
86 #define VTBLK_FLAG_INDIRECT	0x0001
87 #define VTBLK_FLAG_DETACH	0x0002
88 #define VTBLK_FLAG_SUSPEND	0x0004
89 #define VTBLK_FLAG_BARRIER	0x0008
90 #define VTBLK_FLAG_WCE_CONFIG	0x0010
91 #define VTBLK_FLAG_BUSDMA_WAIT	0x0020
92 #define VTBLK_FLAG_BUSDMA_ALIGN	0x0040
93 
94 	struct virtqueue	*vtblk_vq;
95 	struct sglist		*vtblk_sglist;
96 	bus_dma_tag_t		 vtblk_dmat;
97 	struct disk		*vtblk_disk;
98 
99 	struct bio_queue_head	 vtblk_bioq;
100 	TAILQ_HEAD(, vtblk_request)
101 				 vtblk_req_free;
102 	TAILQ_HEAD(, vtblk_request)
103 				 vtblk_req_ready;
104 	struct vtblk_request	*vtblk_req_ordered;
105 
106 	int			 vtblk_max_nsegs;
107 	int			 vtblk_request_count;
108 	enum vtblk_cache_mode	 vtblk_write_cache;
109 
110 	struct bio_queue	 vtblk_dump_queue;
111 	struct vtblk_request	 vtblk_dump_request;
112 };
113 
114 static struct virtio_feature_desc vtblk_feature_desc[] = {
115 	{ VIRTIO_BLK_F_BARRIER,		"HostBarrier"	},
116 	{ VIRTIO_BLK_F_SIZE_MAX,	"MaxSegSize"	},
117 	{ VIRTIO_BLK_F_SEG_MAX,		"MaxNumSegs"	},
118 	{ VIRTIO_BLK_F_GEOMETRY,	"DiskGeometry"	},
119 	{ VIRTIO_BLK_F_RO,		"ReadOnly"	},
120 	{ VIRTIO_BLK_F_BLK_SIZE,	"BlockSize"	},
121 	{ VIRTIO_BLK_F_SCSI,		"SCSICmds"	},
122 	{ VIRTIO_BLK_F_FLUSH,		"FlushCmd"	},
123 	{ VIRTIO_BLK_F_TOPOLOGY,	"Topology"	},
124 	{ VIRTIO_BLK_F_CONFIG_WCE,	"ConfigWCE"	},
125 	{ VIRTIO_BLK_F_MQ,		"Multiqueue"	},
126 	{ VIRTIO_BLK_F_DISCARD,		"Discard"	},
127 	{ VIRTIO_BLK_F_WRITE_ZEROES,	"WriteZeros"	},
128 
129 	{ 0, NULL }
130 };
131 
132 static int	vtblk_modevent(module_t, int, void *);
133 
134 static int	vtblk_probe(device_t);
135 static int	vtblk_attach(device_t);
136 static int	vtblk_detach(device_t);
137 static int	vtblk_suspend(device_t);
138 static int	vtblk_resume(device_t);
139 static int	vtblk_shutdown(device_t);
140 static int	vtblk_attach_completed(device_t);
141 static int	vtblk_config_change(device_t);
142 
143 static int	vtblk_open(struct disk *);
144 static int	vtblk_close(struct disk *);
145 static int	vtblk_ioctl(struct disk *, u_long, void *, int,
146 		    struct thread *);
147 static int	vtblk_dump(void *, void *, off_t, size_t);
148 static void	vtblk_strategy(struct bio *);
149 
150 static int	vtblk_negotiate_features(struct vtblk_softc *);
151 static int	vtblk_setup_features(struct vtblk_softc *);
152 static int	vtblk_maximum_segments(struct vtblk_softc *,
153 		    struct virtio_blk_config *);
154 static int	vtblk_alloc_virtqueue(struct vtblk_softc *);
155 static void	vtblk_resize_disk(struct vtblk_softc *, uint64_t);
156 static void	vtblk_alloc_disk(struct vtblk_softc *,
157 		    struct virtio_blk_config *);
158 static void	vtblk_create_disk(struct vtblk_softc *);
159 
160 static int	vtblk_request_prealloc(struct vtblk_softc *);
161 static void	vtblk_request_free(struct vtblk_softc *);
162 static struct vtblk_request *
163 		vtblk_request_dequeue(struct vtblk_softc *);
164 static void	vtblk_request_enqueue(struct vtblk_softc *,
165 		    struct vtblk_request *);
166 static struct vtblk_request *
167 		vtblk_request_next_ready(struct vtblk_softc *);
168 static void	vtblk_request_requeue_ready(struct vtblk_softc *,
169 		    struct vtblk_request *);
170 static struct vtblk_request *
171 		vtblk_request_next(struct vtblk_softc *);
172 static struct vtblk_request *
173 		vtblk_request_bio(struct vtblk_softc *);
174 static int	vtblk_request_execute(struct vtblk_request *, int);
175 static void	vtblk_request_execute_cb(void *,
176 		    bus_dma_segment_t *, int, int);
177 static int	vtblk_request_error(struct vtblk_request *);
178 
179 static void	vtblk_queue_completed(struct vtblk_softc *,
180 		    struct bio_queue *);
181 static void	vtblk_done_completed(struct vtblk_softc *,
182 		    struct bio_queue *);
183 static void	vtblk_drain_vq(struct vtblk_softc *);
184 static void	vtblk_drain(struct vtblk_softc *);
185 
186 static void	vtblk_startio(struct vtblk_softc *);
187 static void	vtblk_bio_done(struct vtblk_softc *, struct bio *, int);
188 
189 static void	vtblk_read_config(struct vtblk_softc *,
190 		    struct virtio_blk_config *);
191 static void	vtblk_ident(struct vtblk_softc *);
192 static int	vtblk_poll_request(struct vtblk_softc *,
193 		    struct vtblk_request *);
194 static int	vtblk_quiesce(struct vtblk_softc *);
195 static void	vtblk_vq_intr(void *);
196 static void	vtblk_stop(struct vtblk_softc *);
197 
198 static void	vtblk_dump_quiesce(struct vtblk_softc *);
199 static int	vtblk_dump_write(struct vtblk_softc *, void *, off_t, size_t);
200 static int	vtblk_dump_flush(struct vtblk_softc *);
201 static void	vtblk_dump_complete(struct vtblk_softc *);
202 
203 static void	vtblk_set_write_cache(struct vtblk_softc *, int);
204 static int	vtblk_write_cache_enabled(struct vtblk_softc *sc,
205 		    struct virtio_blk_config *);
206 static int	vtblk_write_cache_sysctl(SYSCTL_HANDLER_ARGS);
207 
208 static void	vtblk_setup_sysctl(struct vtblk_softc *);
209 static int	vtblk_tunable_int(struct vtblk_softc *, const char *, int);
210 
/*
 * Byte-order helpers.  vtblk_modern() is true when VIRTIO_F_VERSION_1 was
 * negotiated; the conversion wrappers pass that on to the generic virtio
 * helpers.
 */
#define vtblk_modern(_sc) \
	(((_sc)->vtblk_features & VIRTIO_F_VERSION_1) != 0)

#define vtblk_htog16(_sc, _val) \
	virtio_htog16(vtblk_modern(_sc), _val)
#define vtblk_htog32(_sc, _val) \
	virtio_htog32(vtblk_modern(_sc), _val)
#define vtblk_htog64(_sc, _val) \
	virtio_htog64(vtblk_modern(_sc), _val)

#define vtblk_gtoh16(_sc, _val) \
	virtio_gtoh16(vtblk_modern(_sc), _val)
#define vtblk_gtoh32(_sc, _val) \
	virtio_gtoh32(vtblk_modern(_sc), _val)
#define vtblk_gtoh64(_sc, _val) \
	virtio_gtoh64(vtblk_modern(_sc), _val)
218 
219 /* Tunables. */
220 static int vtblk_no_ident = 0;
221 TUNABLE_INT("hw.vtblk.no_ident", &vtblk_no_ident);
222 static int vtblk_writecache_mode = -1;
223 TUNABLE_INT("hw.vtblk.writecache_mode", &vtblk_writecache_mode);
224 
/* Features requested from both legacy and modern devices. */
#define VTBLK_COMMON_FEATURES						\
	(VIRTIO_BLK_F_SIZE_MAX | VIRTIO_BLK_F_SEG_MAX |			\
	 VIRTIO_BLK_F_GEOMETRY | VIRTIO_BLK_F_RO |			\
	 VIRTIO_BLK_F_BLK_SIZE | VIRTIO_BLK_F_FLUSH |			\
	 VIRTIO_BLK_F_TOPOLOGY | VIRTIO_BLK_F_CONFIG_WCE |		\
	 VIRTIO_BLK_F_DISCARD | VIRTIO_RING_F_INDIRECT_DESC)

/* Legacy devices are additionally offered the host barrier feature. */
#define VTBLK_MODERN_FEATURES	(VTBLK_COMMON_FEATURES)
#define VTBLK_LEGACY_FEATURES	(VIRTIO_BLK_F_BARRIER | VTBLK_COMMON_FEATURES)
239 
/*
 * Softc mutex helpers.  VTBLK_MTX() yields a pointer to the softc mutex;
 * its expansion is fully parenthesized so that it behaves as a single
 * expression wherever it is used (for example with a postfix operator).
 */
#define VTBLK_MTX(_sc)		(&(_sc)->vtblk_mtx)
#define VTBLK_LOCK_INIT(_sc, _name) \
				mtx_init(VTBLK_MTX((_sc)), (_name), \
				    "VirtIO Block Lock", MTX_DEF)
#define VTBLK_LOCK(_sc)		mtx_lock(VTBLK_MTX((_sc)))
#define VTBLK_UNLOCK(_sc)	mtx_unlock(VTBLK_MTX((_sc)))
#define VTBLK_LOCK_DESTROY(_sc)	mtx_destroy(VTBLK_MTX((_sc)))
#define VTBLK_LOCK_ASSERT(_sc)	mtx_assert(VTBLK_MTX((_sc)), MA_OWNED)
#define VTBLK_LOCK_ASSERT_NOTOWNED(_sc) \
				mtx_assert(VTBLK_MTX((_sc)), MA_NOTOWNED)
250 
#define VTBLK_DISK_NAME		"vtbd"		/* disk(9) device name */
#define VTBLK_QUIESCE_TIMEOUT	(30 * hz)	/* in ticks */
#define VTBLK_BSIZE		512		/* virtio sector unit, bytes */

/*
 * Every block request needs a minimum of two segments: the request
 * header and the status.
 */
#define VTBLK_MIN_SEGMENTS	2
260 
261 static device_method_t vtblk_methods[] = {
262 	/* Device methods. */
263 	DEVMETHOD(device_probe,		vtblk_probe),
264 	DEVMETHOD(device_attach,	vtblk_attach),
265 	DEVMETHOD(device_detach,	vtblk_detach),
266 	DEVMETHOD(device_suspend,	vtblk_suspend),
267 	DEVMETHOD(device_resume,	vtblk_resume),
268 	DEVMETHOD(device_shutdown,	vtblk_shutdown),
269 
270 	/* VirtIO methods. */
271 	DEVMETHOD(virtio_attach_completed, vtblk_attach_completed),
272 	DEVMETHOD(virtio_config_change,	vtblk_config_change),
273 
274 	DEVMETHOD_END
275 };
276 
277 static driver_t vtblk_driver = {
278 	"vtblk",
279 	vtblk_methods,
280 	sizeof(struct vtblk_softc)
281 };
282 
283 VIRTIO_DRIVER_MODULE(virtio_blk, vtblk_driver, vtblk_modevent, NULL);
284 MODULE_VERSION(virtio_blk, 1);
285 MODULE_DEPEND(virtio_blk, virtio, 1, 1, 1);
286 
287 VIRTIO_SIMPLE_PNPINFO(virtio_blk, VIRTIO_ID_BLOCK, "VirtIO Block Adapter");
288 
289 static int
290 vtblk_modevent(module_t mod, int type, void *unused)
291 {
292 	int error;
293 
294 	error = 0;
295 
296 	switch (type) {
297 	case MOD_LOAD:
298 	case MOD_QUIESCE:
299 	case MOD_UNLOAD:
300 	case MOD_SHUTDOWN:
301 		break;
302 	default:
303 		error = EOPNOTSUPP;
304 		break;
305 	}
306 
307 	return (error);
308 }
309 
310 static int
311 vtblk_probe(device_t dev)
312 {
313 	return (VIRTIO_SIMPLE_PROBE(dev, virtio_blk));
314 }
315 
316 static int
317 vtblk_attach(device_t dev)
318 {
319 	struct vtblk_softc *sc;
320 	struct virtio_blk_config blkcfg;
321 	int error;
322 
323 	sc = device_get_softc(dev);
324 	sc->vtblk_dev = dev;
325 	virtio_set_feature_desc(dev, vtblk_feature_desc);
326 
327 	VTBLK_LOCK_INIT(sc, device_get_nameunit(dev));
328 	bioq_init(&sc->vtblk_bioq);
329 	TAILQ_INIT(&sc->vtblk_dump_queue);
330 	TAILQ_INIT(&sc->vtblk_req_free);
331 	TAILQ_INIT(&sc->vtblk_req_ready);
332 
333 	vtblk_setup_sysctl(sc);
334 
335 	error = vtblk_setup_features(sc);
336 	if (error) {
337 		device_printf(dev, "cannot setup features\n");
338 		goto fail;
339 	}
340 
341 	vtblk_read_config(sc, &blkcfg);
342 
343 	/*
344 	 * With the current sglist(9) implementation, it is not easy
345 	 * for us to support a maximum segment size as adjacent
346 	 * segments are coalesced. For now, just make sure it's larger
347 	 * than the maximum supported transfer size.
348 	 */
349 	if (virtio_with_feature(dev, VIRTIO_BLK_F_SIZE_MAX)) {
350 		if (blkcfg.size_max < maxphys) {
351 			error = ENOTSUP;
352 			device_printf(dev, "host requires unsupported "
353 			    "maximum segment size feature\n");
354 			goto fail;
355 		}
356 	}
357 
358 	sc->vtblk_max_nsegs = vtblk_maximum_segments(sc, &blkcfg);
359 	if (sc->vtblk_max_nsegs <= VTBLK_MIN_SEGMENTS) {
360 		error = EINVAL;
361 		device_printf(dev, "fewer than minimum number of segments "
362 		    "allowed: %d\n", sc->vtblk_max_nsegs);
363 		goto fail;
364 	}
365 
366 	sc->vtblk_sglist = sglist_alloc(sc->vtblk_max_nsegs, M_NOWAIT);
367 	if (sc->vtblk_sglist == NULL) {
368 		error = ENOMEM;
369 		device_printf(dev, "cannot allocate sglist\n");
370 		goto fail;
371 	}
372 
373 	/*
374 	 * If vtblk_max_nsegs == VTBLK_MIN_SEGMENTS + 1, the device only
375 	 * supports a single data segment; in that case we need busdma to
376 	 * align to a page boundary so we can send a *contiguous* page size
377 	 * request to the host.
378 	 */
379 	if (sc->vtblk_max_nsegs == VTBLK_MIN_SEGMENTS + 1)
380 		sc->vtblk_flags |= VTBLK_FLAG_BUSDMA_ALIGN;
381 	error = bus_dma_tag_create(
382 	    bus_get_dma_tag(dev),			/* parent */
383 	    (sc->vtblk_flags & VTBLK_FLAG_BUSDMA_ALIGN) ? PAGE_SIZE : 1,
384 	    0,						/* boundary */
385 	    BUS_SPACE_MAXADDR,				/* lowaddr */
386 	    BUS_SPACE_MAXADDR,				/* highaddr */
387 	    NULL, NULL,					/* filter, filterarg */
388 	    maxphys,					/* max request size */
389 	    sc->vtblk_max_nsegs - VTBLK_MIN_SEGMENTS,	/* max # segments */
390 	    maxphys,					/* maxsegsize */
391 	    0,						/* flags */
392 	    busdma_lock_mutex,				/* lockfunc */
393 	    &sc->vtblk_mtx,				/* lockarg */
394 	    &sc->vtblk_dmat);
395 	if (error) {
396 		device_printf(dev, "cannot create bus dma tag\n");
397 		goto fail;
398 	}
399 
400 #ifdef __powerpc__
401 	/*
402 	 * Virtio uses physical addresses rather than bus addresses, so we
403 	 * need to ask busdma to skip the iommu physical->bus mapping.  At
404 	 * present, this is only a thing on the powerpc architectures.
405 	 */
406 	bus_dma_tag_set_iommu(sc->vtblk_dmat, NULL, NULL);
407 #endif
408 
409 	error = vtblk_alloc_virtqueue(sc);
410 	if (error) {
411 		device_printf(dev, "cannot allocate virtqueue\n");
412 		goto fail;
413 	}
414 
415 	error = vtblk_request_prealloc(sc);
416 	if (error) {
417 		device_printf(dev, "cannot preallocate requests\n");
418 		goto fail;
419 	}
420 
421 	vtblk_alloc_disk(sc, &blkcfg);
422 
423 	error = virtio_setup_intr(dev, INTR_TYPE_BIO | INTR_ENTROPY);
424 	if (error) {
425 		device_printf(dev, "cannot setup virtqueue interrupt\n");
426 		goto fail;
427 	}
428 
429 	virtqueue_enable_intr(sc->vtblk_vq);
430 
431 fail:
432 	if (error)
433 		vtblk_detach(dev);
434 
435 	return (error);
436 }
437 
438 static int
439 vtblk_detach(device_t dev)
440 {
441 	struct vtblk_softc *sc;
442 
443 	sc = device_get_softc(dev);
444 
445 	VTBLK_LOCK(sc);
446 	sc->vtblk_flags |= VTBLK_FLAG_DETACH;
447 	if (device_is_attached(dev))
448 		vtblk_stop(sc);
449 	VTBLK_UNLOCK(sc);
450 
451 	vtblk_drain(sc);
452 
453 	if (sc->vtblk_disk != NULL) {
454 		disk_destroy(sc->vtblk_disk);
455 		sc->vtblk_disk = NULL;
456 	}
457 
458 	if (sc->vtblk_dmat != NULL) {
459 		bus_dma_tag_destroy(sc->vtblk_dmat);
460 		sc->vtblk_dmat = NULL;
461 	}
462 
463 	if (sc->vtblk_sglist != NULL) {
464 		sglist_free(sc->vtblk_sglist);
465 		sc->vtblk_sglist = NULL;
466 	}
467 
468 	VTBLK_LOCK_DESTROY(sc);
469 
470 	return (0);
471 }
472 
473 static int
474 vtblk_suspend(device_t dev)
475 {
476 	struct vtblk_softc *sc;
477 	int error;
478 
479 	sc = device_get_softc(dev);
480 
481 	VTBLK_LOCK(sc);
482 	sc->vtblk_flags |= VTBLK_FLAG_SUSPEND;
483 	/* XXX BMV: virtio_stop(), etc needed here? */
484 	error = vtblk_quiesce(sc);
485 	if (error)
486 		sc->vtblk_flags &= ~VTBLK_FLAG_SUSPEND;
487 	VTBLK_UNLOCK(sc);
488 
489 	return (error);
490 }
491 
492 static int
493 vtblk_resume(device_t dev)
494 {
495 	struct vtblk_softc *sc;
496 
497 	sc = device_get_softc(dev);
498 
499 	VTBLK_LOCK(sc);
500 	/* XXX BMV: virtio_reinit(), etc needed here? */
501 	sc->vtblk_flags &= ~VTBLK_FLAG_SUSPEND;
502 	vtblk_startio(sc);
503 	VTBLK_UNLOCK(sc);
504 
505 	return (0);
506 }
507 
508 static int
509 vtblk_shutdown(device_t dev)
510 {
511 
512 	return (0);
513 }
514 
515 static int
516 vtblk_attach_completed(device_t dev)
517 {
518 	struct vtblk_softc *sc;
519 
520 	sc = device_get_softc(dev);
521 
522 	/*
523 	 * Create disk after attach as VIRTIO_BLK_T_GET_ID can only be
524 	 * processed after the device acknowledged
525 	 * VIRTIO_CONFIG_STATUS_DRIVER_OK.
526 	 */
527 	vtblk_create_disk(sc);
528 	return (0);
529 }
530 
531 static int
532 vtblk_config_change(device_t dev)
533 {
534 	struct vtblk_softc *sc;
535 	struct virtio_blk_config blkcfg;
536 	uint64_t capacity;
537 
538 	sc = device_get_softc(dev);
539 
540 	vtblk_read_config(sc, &blkcfg);
541 
542 	/* Capacity is always in 512-byte units. */
543 	capacity = blkcfg.capacity * VTBLK_BSIZE;
544 
545 	if (sc->vtblk_disk->d_mediasize != capacity)
546 		vtblk_resize_disk(sc, capacity);
547 
548 	return (0);
549 }
550 
551 static int
552 vtblk_open(struct disk *dp)
553 {
554 	struct vtblk_softc *sc;
555 
556 	if ((sc = dp->d_drv1) == NULL)
557 		return (ENXIO);
558 
559 	return (sc->vtblk_flags & VTBLK_FLAG_DETACH ? ENXIO : 0);
560 }
561 
562 static int
563 vtblk_close(struct disk *dp)
564 {
565 	struct vtblk_softc *sc;
566 
567 	if ((sc = dp->d_drv1) == NULL)
568 		return (ENXIO);
569 
570 	return (0);
571 }
572 
573 static int
574 vtblk_ioctl(struct disk *dp, u_long cmd, void *addr, int flag,
575     struct thread *td)
576 {
577 	struct vtblk_softc *sc;
578 
579 	if ((sc = dp->d_drv1) == NULL)
580 		return (ENXIO);
581 
582 	return (ENOTTY);
583 }
584 
585 static int
586 vtblk_dump(void *arg, void *virtual, off_t offset, size_t length)
587 {
588 	struct disk *dp;
589 	struct vtblk_softc *sc;
590 	int error;
591 
592 	dp = arg;
593 	error = 0;
594 
595 	if ((sc = dp->d_drv1) == NULL)
596 		return (ENXIO);
597 
598 	VTBLK_LOCK(sc);
599 
600 	vtblk_dump_quiesce(sc);
601 
602 	if (length > 0)
603 		error = vtblk_dump_write(sc, virtual, offset, length);
604 	if (error || (virtual == NULL && offset == 0))
605 		vtblk_dump_complete(sc);
606 
607 	VTBLK_UNLOCK(sc);
608 
609 	return (error);
610 }
611 
612 static void
613 vtblk_strategy(struct bio *bp)
614 {
615 	struct vtblk_softc *sc;
616 
617 	if ((sc = bp->bio_disk->d_drv1) == NULL) {
618 		vtblk_bio_done(NULL, bp, EINVAL);
619 		return;
620 	}
621 
622 	if ((bp->bio_cmd != BIO_READ) && (bp->bio_cmd != BIO_WRITE) &&
623 	    (bp->bio_cmd != BIO_FLUSH) && (bp->bio_cmd != BIO_DELETE)) {
624 		vtblk_bio_done(sc, bp, EOPNOTSUPP);
625 		return;
626 	}
627 
628 	VTBLK_LOCK(sc);
629 
630 	if (sc->vtblk_flags & VTBLK_FLAG_DETACH) {
631 		VTBLK_UNLOCK(sc);
632 		vtblk_bio_done(sc, bp, ENXIO);
633 		return;
634 	}
635 
636 	bioq_insert_tail(&sc->vtblk_bioq, bp);
637 	vtblk_startio(sc);
638 
639 	VTBLK_UNLOCK(sc);
640 }
641 
642 static int
643 vtblk_negotiate_features(struct vtblk_softc *sc)
644 {
645 	device_t dev;
646 	uint64_t features;
647 
648 	dev = sc->vtblk_dev;
649 	features = virtio_bus_is_modern(dev) ? VTBLK_MODERN_FEATURES :
650 	    VTBLK_LEGACY_FEATURES;
651 
652 	sc->vtblk_features = virtio_negotiate_features(dev, features);
653 	return (virtio_finalize_features(dev));
654 }
655 
656 static int
657 vtblk_setup_features(struct vtblk_softc *sc)
658 {
659 	device_t dev;
660 	int error;
661 
662 	dev = sc->vtblk_dev;
663 
664 	error = vtblk_negotiate_features(sc);
665 	if (error)
666 		return (error);
667 
668 	if (virtio_with_feature(dev, VIRTIO_RING_F_INDIRECT_DESC))
669 		sc->vtblk_flags |= VTBLK_FLAG_INDIRECT;
670 	if (virtio_with_feature(dev, VIRTIO_BLK_F_CONFIG_WCE))
671 		sc->vtblk_flags |= VTBLK_FLAG_WCE_CONFIG;
672 
673 	/* Legacy. */
674 	if (virtio_with_feature(dev, VIRTIO_BLK_F_BARRIER))
675 		sc->vtblk_flags |= VTBLK_FLAG_BARRIER;
676 
677 	return (0);
678 }
679 
680 static int
681 vtblk_maximum_segments(struct vtblk_softc *sc,
682     struct virtio_blk_config *blkcfg)
683 {
684 	device_t dev;
685 	int nsegs;
686 
687 	dev = sc->vtblk_dev;
688 	nsegs = VTBLK_MIN_SEGMENTS;
689 
690 	if (virtio_with_feature(dev, VIRTIO_BLK_F_SEG_MAX)) {
691 		nsegs += MIN(blkcfg->seg_max, maxphys / PAGE_SIZE + 1);
692 		if (sc->vtblk_flags & VTBLK_FLAG_INDIRECT)
693 			nsegs = MIN(nsegs, VIRTIO_MAX_INDIRECT);
694 	} else
695 		nsegs += 1;
696 
697 	return (nsegs);
698 }
699 
700 static int
701 vtblk_alloc_virtqueue(struct vtblk_softc *sc)
702 {
703 	device_t dev;
704 	struct vq_alloc_info vq_info;
705 
706 	dev = sc->vtblk_dev;
707 
708 	VQ_ALLOC_INFO_INIT(&vq_info, sc->vtblk_max_nsegs,
709 	    vtblk_vq_intr, sc, &sc->vtblk_vq,
710 	    "%s request", device_get_nameunit(dev));
711 
712 	return (virtio_alloc_virtqueues(dev, 0, 1, &vq_info));
713 }
714 
715 static void
716 vtblk_resize_disk(struct vtblk_softc *sc, uint64_t new_capacity)
717 {
718 	device_t dev;
719 	struct disk *dp;
720 	int error;
721 
722 	dev = sc->vtblk_dev;
723 	dp = sc->vtblk_disk;
724 
725 	dp->d_mediasize = new_capacity;
726 	if (bootverbose) {
727 		device_printf(dev, "resized to %juMB (%ju %u byte sectors)\n",
728 		    (uintmax_t) dp->d_mediasize >> 20,
729 		    (uintmax_t) dp->d_mediasize / dp->d_sectorsize,
730 		    dp->d_sectorsize);
731 	}
732 
733 	error = disk_resize(dp, M_NOWAIT);
734 	if (error) {
735 		device_printf(dev,
736 		    "disk_resize(9) failed, error: %d\n", error);
737 	}
738 }
739 
740 static void
741 vtblk_alloc_disk(struct vtblk_softc *sc, struct virtio_blk_config *blkcfg)
742 {
743 	device_t dev;
744 	struct disk *dp;
745 
746 	dev = sc->vtblk_dev;
747 
748 	sc->vtblk_disk = dp = disk_alloc();
749 	dp->d_open = vtblk_open;
750 	dp->d_close = vtblk_close;
751 	dp->d_ioctl = vtblk_ioctl;
752 	dp->d_strategy = vtblk_strategy;
753 	dp->d_name = VTBLK_DISK_NAME;
754 	dp->d_unit = device_get_unit(dev);
755 	dp->d_drv1 = sc;
756 	dp->d_flags = DISKFLAG_UNMAPPED_BIO | DISKFLAG_DIRECT_COMPLETION;
757 	dp->d_hba_vendor = virtio_get_vendor(dev);
758 	dp->d_hba_device = virtio_get_device(dev);
759 	dp->d_hba_subvendor = virtio_get_subvendor(dev);
760 	dp->d_hba_subdevice = virtio_get_subdevice(dev);
761 
762 	if (virtio_with_feature(dev, VIRTIO_BLK_F_RO))
763 		dp->d_flags |= DISKFLAG_WRITE_PROTECT;
764 	else {
765 		if (virtio_with_feature(dev, VIRTIO_BLK_F_FLUSH))
766 			dp->d_flags |= DISKFLAG_CANFLUSHCACHE;
767 		dp->d_dump = vtblk_dump;
768 	}
769 
770 	/* Capacity is always in 512-byte units. */
771 	dp->d_mediasize = blkcfg->capacity * VTBLK_BSIZE;
772 
773 	if (virtio_with_feature(dev, VIRTIO_BLK_F_BLK_SIZE))
774 		dp->d_sectorsize = blkcfg->blk_size;
775 	else
776 		dp->d_sectorsize = VTBLK_BSIZE;
777 
778 	/*
779 	 * The VirtIO maximum I/O size is given in terms of segments.
780 	 * However, FreeBSD limits I/O size by logical buffer size, not
781 	 * by physically contiguous pages. Therefore, we have to assume
782 	 * no pages are contiguous. This may impose an artificially low
783 	 * maximum I/O size. But in practice, since QEMU advertises 128
784 	 * segments, this gives us a maximum IO size of 125 * PAGE_SIZE,
785 	 * which is typically greater than maxphys. Eventually we should
786 	 * just advertise maxphys and split buffers that are too big.
787 	 *
788 	 * If we're not asking busdma to align data to page boundaries, the
789 	 * maximum I/O size is reduced by PAGE_SIZE in order to accommodate
790 	 * unaligned I/Os.
791 	 */
792 	dp->d_maxsize = (sc->vtblk_max_nsegs - VTBLK_MIN_SEGMENTS) *
793 	    PAGE_SIZE;
794 	if ((sc->vtblk_flags & VTBLK_FLAG_BUSDMA_ALIGN) == 0)
795 		dp->d_maxsize -= PAGE_SIZE;
796 
797 	if (virtio_with_feature(dev, VIRTIO_BLK_F_GEOMETRY)) {
798 		dp->d_fwsectors = blkcfg->geometry.sectors;
799 		dp->d_fwheads = blkcfg->geometry.heads;
800 	}
801 
802 	if (virtio_with_feature(dev, VIRTIO_BLK_F_TOPOLOGY) &&
803 	    blkcfg->topology.physical_block_exp > 0) {
804 		dp->d_stripesize = dp->d_sectorsize *
805 		    (1 << blkcfg->topology.physical_block_exp);
806 		dp->d_stripeoffset = (dp->d_stripesize -
807 		    blkcfg->topology.alignment_offset * dp->d_sectorsize) %
808 		    dp->d_stripesize;
809 	}
810 
811 	if (virtio_with_feature(dev, VIRTIO_BLK_F_DISCARD)) {
812 		dp->d_flags |= DISKFLAG_CANDELETE;
813 		dp->d_delmaxsize = blkcfg->max_discard_sectors * VTBLK_BSIZE;
814 	}
815 
816 	if (vtblk_write_cache_enabled(sc, blkcfg) != 0)
817 		sc->vtblk_write_cache = VTBLK_CACHE_WRITEBACK;
818 	else
819 		sc->vtblk_write_cache = VTBLK_CACHE_WRITETHROUGH;
820 }
821 
822 static void
823 vtblk_create_disk(struct vtblk_softc *sc)
824 {
825 	struct disk *dp;
826 
827 	dp = sc->vtblk_disk;
828 
829 	vtblk_ident(sc);
830 
831 	device_printf(sc->vtblk_dev, "%juMB (%ju %u byte sectors)\n",
832 	    (uintmax_t) dp->d_mediasize >> 20,
833 	    (uintmax_t) dp->d_mediasize / dp->d_sectorsize,
834 	    dp->d_sectorsize);
835 
836 	disk_create(dp, DISK_VERSION);
837 }
838 
839 static int
840 vtblk_request_prealloc(struct vtblk_softc *sc)
841 {
842 	struct vtblk_request *req;
843 	int i, nreqs;
844 
845 	nreqs = virtqueue_size(sc->vtblk_vq);
846 
847 	/*
848 	 * Preallocate sufficient requests to keep the virtqueue full. Each
849 	 * request consumes VTBLK_MIN_SEGMENTS or more descriptors so reduce
850 	 * the number allocated when indirect descriptors are not available.
851 	 */
852 	if ((sc->vtblk_flags & VTBLK_FLAG_INDIRECT) == 0)
853 		nreqs /= VTBLK_MIN_SEGMENTS;
854 
855 	for (i = 0; i < nreqs; i++) {
856 		req = malloc(sizeof(struct vtblk_request), M_DEVBUF, M_NOWAIT);
857 		if (req == NULL)
858 			return (ENOMEM);
859 
860 		req->vbr_sc = sc;
861 		if (bus_dmamap_create(sc->vtblk_dmat, 0, &req->vbr_mapp)) {
862 			free(req, M_DEVBUF);
863 			return (ENOMEM);
864 		}
865 
866 		MPASS(sglist_count(&req->vbr_hdr, sizeof(req->vbr_hdr)) == 1);
867 		MPASS(sglist_count(&req->vbr_ack, sizeof(req->vbr_ack)) == 1);
868 
869 		sc->vtblk_request_count++;
870 		vtblk_request_enqueue(sc, req);
871 	}
872 
873 	return (0);
874 }
875 
876 static void
877 vtblk_request_free(struct vtblk_softc *sc)
878 {
879 	struct vtblk_request *req;
880 
881 	MPASS(TAILQ_EMPTY(&sc->vtblk_req_ready));
882 
883 	while ((req = vtblk_request_dequeue(sc)) != NULL) {
884 		sc->vtblk_request_count--;
885 		bus_dmamap_destroy(sc->vtblk_dmat, req->vbr_mapp);
886 		free(req, M_DEVBUF);
887 	}
888 
889 	KASSERT(sc->vtblk_request_count == 0,
890 	    ("%s: leaked %d requests", __func__, sc->vtblk_request_count));
891 }
892 
893 static struct vtblk_request *
894 vtblk_request_dequeue(struct vtblk_softc *sc)
895 {
896 	struct vtblk_request *req;
897 
898 	req = TAILQ_FIRST(&sc->vtblk_req_free);
899 	if (req != NULL) {
900 		TAILQ_REMOVE(&sc->vtblk_req_free, req, vbr_link);
901 		bzero(&req->vbr_hdr, sizeof(struct vtblk_request) -
902 		    offsetof(struct vtblk_request, vbr_hdr));
903 	}
904 
905 	return (req);
906 }
907 
908 static void
909 vtblk_request_enqueue(struct vtblk_softc *sc, struct vtblk_request *req)
910 {
911 
912 	TAILQ_INSERT_HEAD(&sc->vtblk_req_free, req, vbr_link);
913 }
914 
915 static struct vtblk_request *
916 vtblk_request_next_ready(struct vtblk_softc *sc)
917 {
918 	struct vtblk_request *req;
919 
920 	req = TAILQ_FIRST(&sc->vtblk_req_ready);
921 	if (req != NULL)
922 		TAILQ_REMOVE(&sc->vtblk_req_ready, req, vbr_link);
923 
924 	return (req);
925 }
926 
927 static void
928 vtblk_request_requeue_ready(struct vtblk_softc *sc, struct vtblk_request *req)
929 {
930 
931 	/* NOTE: Currently, there will be at most one request in the queue. */
932 	TAILQ_INSERT_HEAD(&sc->vtblk_req_ready, req, vbr_link);
933 }
934 
935 static struct vtblk_request *
936 vtblk_request_next(struct vtblk_softc *sc)
937 {
938 	struct vtblk_request *req;
939 
940 	req = vtblk_request_next_ready(sc);
941 	if (req != NULL)
942 		return (req);
943 
944 	return (vtblk_request_bio(sc));
945 }
946 
947 static struct vtblk_request *
948 vtblk_request_bio(struct vtblk_softc *sc)
949 {
950 	struct bio_queue_head *bioq;
951 	struct vtblk_request *req;
952 	struct bio *bp;
953 
954 	bioq = &sc->vtblk_bioq;
955 
956 	if (bioq_first(bioq) == NULL)
957 		return (NULL);
958 
959 	req = vtblk_request_dequeue(sc);
960 	if (req == NULL)
961 		return (NULL);
962 
963 	bp = bioq_takefirst(bioq);
964 	req->vbr_bp = bp;
965 	req->vbr_ack = -1;
966 	req->vbr_hdr.ioprio = vtblk_gtoh32(sc, 1);
967 
968 	switch (bp->bio_cmd) {
969 	case BIO_FLUSH:
970 		req->vbr_hdr.type = vtblk_gtoh32(sc, VIRTIO_BLK_T_FLUSH);
971 		req->vbr_hdr.sector = 0;
972 		break;
973 	case BIO_READ:
974 		req->vbr_hdr.type = vtblk_gtoh32(sc, VIRTIO_BLK_T_IN);
975 		req->vbr_hdr.sector = vtblk_gtoh64(sc, bp->bio_offset / VTBLK_BSIZE);
976 		break;
977 	case BIO_WRITE:
978 		req->vbr_hdr.type = vtblk_gtoh32(sc, VIRTIO_BLK_T_OUT);
979 		req->vbr_hdr.sector = vtblk_gtoh64(sc, bp->bio_offset / VTBLK_BSIZE);
980 		break;
981 	case BIO_DELETE:
982 		req->vbr_hdr.type = vtblk_gtoh32(sc, VIRTIO_BLK_T_DISCARD);
983 		req->vbr_hdr.sector = vtblk_gtoh64(sc, bp->bio_offset / VTBLK_BSIZE);
984 		break;
985 	default:
986 		panic("%s: bio with unhandled cmd: %d", __func__, bp->bio_cmd);
987 	}
988 
989 	if (bp->bio_flags & BIO_ORDERED)
990 		req->vbr_hdr.type |= vtblk_gtoh32(sc, VIRTIO_BLK_T_BARRIER);
991 
992 	return (req);
993 }
994 
995 static int
996 vtblk_request_execute(struct vtblk_request *req, int flags)
997 {
998 	struct vtblk_softc *sc = req->vbr_sc;
999 	struct bio *bp = req->vbr_bp;
1000 	int error = 0;
1001 
1002 	/*
1003 	 * Call via bus_dmamap_load_bio or directly depending on whether we
1004 	 * have a buffer we need to map.  If we don't have a busdma map,
1005 	 * try to perform the I/O directly and hope that it works (this will
1006 	 * happen when dumping).
1007 	 */
1008 	if ((req->vbr_mapp != NULL) &&
1009 	    (bp->bio_cmd == BIO_READ || bp->bio_cmd == BIO_WRITE)) {
1010 		error = bus_dmamap_load_bio(sc->vtblk_dmat, req->vbr_mapp,
1011 		    req->vbr_bp, vtblk_request_execute_cb, req, flags);
1012 		if (error == EINPROGRESS) {
1013 			req->vbr_busdma_wait = 1;
1014 			sc->vtblk_flags |= VTBLK_FLAG_BUSDMA_WAIT;
1015 		}
1016 	} else {
1017 		vtblk_request_execute_cb(req, NULL, 0, 0);
1018 	}
1019 
1020 	return (error ? error : req->vbr_error);
1021 }
1022 
1023 static void
1024 vtblk_request_execute_cb(void * callback_arg, bus_dma_segment_t * segs,
1025     int nseg, int error)
1026 {
1027 	struct vtblk_request *req;
1028 	struct vtblk_softc *sc;
1029 	struct virtqueue *vq;
1030 	struct sglist *sg;
1031 	struct bio *bp;
1032 	int ordered, readable, writable, i;
1033 
1034 	req = (struct vtblk_request *)callback_arg;
1035 	sc = req->vbr_sc;
1036 	vq = sc->vtblk_vq;
1037 	sg = sc->vtblk_sglist;
1038 	bp = req->vbr_bp;
1039 	ordered = 0;
1040 	writable = 0;
1041 
1042 	/*
1043 	 * If we paused request queueing while we waited for busdma to call us
1044 	 * asynchronously, unpause it now; this request made it through so we
1045 	 * don't need to worry about others getting ahead of us.  (Note that we
1046 	 * hold the device mutex so nothing will happen until after we return
1047 	 * anyway.)
1048 	 */
1049 	if (req->vbr_busdma_wait)
1050 		sc->vtblk_flags &= ~VTBLK_FLAG_BUSDMA_WAIT;
1051 
1052 	/* Fail on errors from busdma. */
1053 	if (error)
1054 		goto out1;
1055 
1056 	/*
1057 	 * Some hosts (such as bhyve) do not implement the barrier feature,
1058 	 * so we emulate it in the driver by allowing the barrier request
1059 	 * to be the only one in flight.
1060 	 */
1061 	if ((sc->vtblk_flags & VTBLK_FLAG_BARRIER) == 0) {
1062 		if (sc->vtblk_req_ordered != NULL) {
1063 			error = EBUSY;
1064 			goto out;
1065 		}
1066 		if (bp->bio_flags & BIO_ORDERED) {
1067 			if (!virtqueue_empty(vq)) {
1068 				error = EBUSY;
1069 				goto out;
1070 			}
1071 			ordered = 1;
1072 			req->vbr_hdr.type &= vtblk_gtoh32(sc,
1073 				~VIRTIO_BLK_T_BARRIER);
1074 		}
1075 	}
1076 
1077 	sglist_reset(sg);
1078 	sglist_append(sg, &req->vbr_hdr, sizeof(struct virtio_blk_outhdr));
1079 
1080 	if (bp->bio_cmd == BIO_READ || bp->bio_cmd == BIO_WRITE) {
1081 		/*
1082 		 * We cast bus_addr_t to vm_paddr_t here; since we skip the
1083 		 * iommu mapping (see vtblk_attach) this should be safe.
1084 		 */
1085 		for (i = 0; i < nseg; i++) {
1086 			error = sglist_append_phys(sg,
1087 			    (vm_paddr_t)segs[i].ds_addr, segs[i].ds_len);
1088 			if (error || sg->sg_nseg == sg->sg_maxseg) {
1089 				panic("%s: bio %p data buffer too big %d",
1090 				    __func__, bp, error);
1091 			}
1092 		}
1093 
1094 		/* Special handling for dump, which bypasses busdma. */
1095 		if (req->vbr_mapp == NULL) {
1096 			error = sglist_append_bio(sg, bp);
1097 			if (error || sg->sg_nseg == sg->sg_maxseg) {
1098 				panic("%s: bio %p data buffer too big %d",
1099 				    __func__, bp, error);
1100 			}
1101 		}
1102 
1103 		/* BIO_READ means the host writes into our buffer. */
1104 		if (bp->bio_cmd == BIO_READ)
1105 			writable = sg->sg_nseg - 1;
1106 	} else if (bp->bio_cmd == BIO_DELETE) {
1107 		struct virtio_blk_discard_write_zeroes *discard;
1108 
1109 		discard = malloc(sizeof(*discard), M_DEVBUF, M_NOWAIT | M_ZERO);
1110 		if (discard == NULL) {
1111 			error = ENOMEM;
1112 			goto out;
1113 		}
1114 
1115 		bp->bio_driver1 = discard;
1116 		discard->sector = vtblk_gtoh64(sc, bp->bio_offset / VTBLK_BSIZE);
1117 		discard->num_sectors = vtblk_gtoh32(sc, bp->bio_bcount / VTBLK_BSIZE);
1118 		error = sglist_append(sg, discard, sizeof(*discard));
1119 		if (error || sg->sg_nseg == sg->sg_maxseg) {
1120 			panic("%s: bio %p data buffer too big %d",
1121 			    __func__, bp, error);
1122 		}
1123 	}
1124 
1125 	writable++;
1126 	sglist_append(sg, &req->vbr_ack, sizeof(uint8_t));
1127 	readable = sg->sg_nseg - writable;
1128 
1129 	if (req->vbr_mapp != NULL) {
1130 		switch (bp->bio_cmd) {
1131 		case BIO_READ:
1132 			bus_dmamap_sync(sc->vtblk_dmat, req->vbr_mapp,
1133 			    BUS_DMASYNC_PREREAD);
1134 			break;
1135 		case BIO_WRITE:
1136 			bus_dmamap_sync(sc->vtblk_dmat, req->vbr_mapp,
1137 			    BUS_DMASYNC_PREWRITE);
1138 			break;
1139 		}
1140 	}
1141 
1142 	error = virtqueue_enqueue(vq, req, sg, readable, writable);
1143 	if (error == 0 && ordered)
1144 		sc->vtblk_req_ordered = req;
1145 
1146 	/*
1147 	 * If we were called asynchronously, we need to notify the queue that
1148 	 * we've added a new request, since the notification from startio was
1149 	 * performed already.
1150 	 */
1151 	if (error == 0 && req->vbr_busdma_wait)
1152 		virtqueue_notify(vq);
1153 
1154 out:
1155 	if (error && (req->vbr_mapp != NULL))
1156 		bus_dmamap_unload(sc->vtblk_dmat, req->vbr_mapp);
1157 out1:
1158 	if (error && req->vbr_requeue_on_error)
1159 		vtblk_request_requeue_ready(sc, req);
1160 	req->vbr_error = error;
1161 }
1162 
1163 static int
1164 vtblk_request_error(struct vtblk_request *req)
1165 {
1166 	int error;
1167 
1168 	switch (req->vbr_ack) {
1169 	case VIRTIO_BLK_S_OK:
1170 		error = 0;
1171 		break;
1172 	case VIRTIO_BLK_S_UNSUPP:
1173 		error = ENOTSUP;
1174 		break;
1175 	default:
1176 		error = EIO;
1177 		break;
1178 	}
1179 
1180 	return (error);
1181 }
1182 
1183 static void
1184 vtblk_queue_completed(struct vtblk_softc *sc, struct bio_queue *queue)
1185 {
1186 	struct vtblk_request *req;
1187 	struct bio *bp;
1188 
1189 	while ((req = virtqueue_dequeue(sc->vtblk_vq, NULL)) != NULL) {
1190 		if (sc->vtblk_req_ordered != NULL) {
1191 			MPASS(sc->vtblk_req_ordered == req);
1192 			sc->vtblk_req_ordered = NULL;
1193 		}
1194 
1195 		bp = req->vbr_bp;
1196 		if (req->vbr_mapp != NULL) {
1197 			switch (bp->bio_cmd) {
1198 			case BIO_READ:
1199 				bus_dmamap_sync(sc->vtblk_dmat, req->vbr_mapp,
1200 				    BUS_DMASYNC_POSTREAD);
1201 				bus_dmamap_unload(sc->vtblk_dmat,
1202 				    req->vbr_mapp);
1203 				break;
1204 			case BIO_WRITE:
1205 				bus_dmamap_sync(sc->vtblk_dmat, req->vbr_mapp,
1206 				    BUS_DMASYNC_POSTWRITE);
1207 				bus_dmamap_unload(sc->vtblk_dmat,
1208 				    req->vbr_mapp);
1209 				break;
1210 			}
1211 		}
1212 		bp->bio_error = vtblk_request_error(req);
1213 		TAILQ_INSERT_TAIL(queue, bp, bio_queue);
1214 
1215 		vtblk_request_enqueue(sc, req);
1216 	}
1217 }
1218 
1219 static void
1220 vtblk_done_completed(struct vtblk_softc *sc, struct bio_queue *queue)
1221 {
1222 	struct bio *bp, *tmp;
1223 
1224 	TAILQ_FOREACH_SAFE(bp, queue, bio_queue, tmp) {
1225 		if (bp->bio_error != 0)
1226 			disk_err(bp, "hard error", -1, 1);
1227 		vtblk_bio_done(sc, bp, bp->bio_error);
1228 	}
1229 }
1230 
1231 static void
1232 vtblk_drain_vq(struct vtblk_softc *sc)
1233 {
1234 	struct virtqueue *vq;
1235 	struct vtblk_request *req;
1236 	int last;
1237 
1238 	vq = sc->vtblk_vq;
1239 	last = 0;
1240 
1241 	while ((req = virtqueue_drain(vq, &last)) != NULL) {
1242 		vtblk_bio_done(sc, req->vbr_bp, ENXIO);
1243 		vtblk_request_enqueue(sc, req);
1244 	}
1245 
1246 	sc->vtblk_req_ordered = NULL;
1247 	KASSERT(virtqueue_empty(vq), ("virtqueue not empty"));
1248 }
1249 
1250 static void
1251 vtblk_drain(struct vtblk_softc *sc)
1252 {
1253 	struct bio_queue_head *bioq;
1254 	struct vtblk_request *req;
1255 	struct bio *bp;
1256 
1257 	bioq = &sc->vtblk_bioq;
1258 
1259 	if (sc->vtblk_vq != NULL) {
1260 		struct bio_queue queue;
1261 
1262 		TAILQ_INIT(&queue);
1263 		vtblk_queue_completed(sc, &queue);
1264 		vtblk_done_completed(sc, &queue);
1265 
1266 		vtblk_drain_vq(sc);
1267 	}
1268 
1269 	while ((req = vtblk_request_next_ready(sc)) != NULL) {
1270 		vtblk_bio_done(sc, req->vbr_bp, ENXIO);
1271 		vtblk_request_enqueue(sc, req);
1272 	}
1273 
1274 	while (bioq_first(bioq) != NULL) {
1275 		bp = bioq_takefirst(bioq);
1276 		vtblk_bio_done(sc, bp, ENXIO);
1277 	}
1278 
1279 	vtblk_request_free(sc);
1280 }
1281 
1282 static void
1283 vtblk_startio(struct vtblk_softc *sc)
1284 {
1285 	struct virtqueue *vq;
1286 	struct vtblk_request *req;
1287 	int enq;
1288 
1289 	VTBLK_LOCK_ASSERT(sc);
1290 	vq = sc->vtblk_vq;
1291 	enq = 0;
1292 
1293 	if (sc->vtblk_flags & (VTBLK_FLAG_SUSPEND | VTBLK_FLAG_BUSDMA_WAIT))
1294 		return;
1295 
1296 	while (!virtqueue_full(vq)) {
1297 		req = vtblk_request_next(sc);
1298 		if (req == NULL)
1299 			break;
1300 
1301 		req->vbr_requeue_on_error = 1;
1302 		if (vtblk_request_execute(req, BUS_DMA_WAITOK))
1303 			break;
1304 
1305 		enq++;
1306 	}
1307 
1308 	if (enq > 0)
1309 		virtqueue_notify(vq);
1310 }
1311 
1312 static void
1313 vtblk_bio_done(struct vtblk_softc *sc, struct bio *bp, int error)
1314 {
1315 
1316 	/* Because of GEOM direct dispatch, we cannot hold any locks. */
1317 	if (sc != NULL)
1318 		VTBLK_LOCK_ASSERT_NOTOWNED(sc);
1319 
1320 	if (error) {
1321 		bp->bio_resid = bp->bio_bcount;
1322 		bp->bio_error = error;
1323 		bp->bio_flags |= BIO_ERROR;
1324 	} else {
1325 		kmsan_mark_bio(bp, KMSAN_STATE_INITED);
1326 	}
1327 
1328 	if (bp->bio_driver1 != NULL) {
1329 		free(bp->bio_driver1, M_DEVBUF);
1330 		bp->bio_driver1 = NULL;
1331 	}
1332 
1333 	biodone(bp);
1334 }
1335 
1336 #define VTBLK_GET_CONFIG(_dev, _feature, _field, _cfg)			\
1337 	if (virtio_with_feature(_dev, _feature)) {			\
1338 		virtio_read_device_config(_dev,				\
1339 		    offsetof(struct virtio_blk_config, _field),		\
1340 		    &(_cfg)->_field, sizeof((_cfg)->_field));		\
1341 	}
1342 
1343 static void
1344 vtblk_read_config(struct vtblk_softc *sc, struct virtio_blk_config *blkcfg)
1345 {
1346 	device_t dev;
1347 
1348 	dev = sc->vtblk_dev;
1349 
1350 	bzero(blkcfg, sizeof(struct virtio_blk_config));
1351 
1352 	/* The capacity is always available. */
1353 	virtio_read_device_config(dev, offsetof(struct virtio_blk_config,
1354 	    capacity), &blkcfg->capacity, sizeof(blkcfg->capacity));
1355 
1356 	/* Read the configuration if the feature was negotiated. */
1357 	VTBLK_GET_CONFIG(dev, VIRTIO_BLK_F_SIZE_MAX, size_max, blkcfg);
1358 	VTBLK_GET_CONFIG(dev, VIRTIO_BLK_F_SEG_MAX, seg_max, blkcfg);
1359 	VTBLK_GET_CONFIG(dev, VIRTIO_BLK_F_GEOMETRY,
1360 	    geometry.cylinders, blkcfg);
1361 	VTBLK_GET_CONFIG(dev, VIRTIO_BLK_F_GEOMETRY,
1362 	    geometry.heads, blkcfg);
1363 	VTBLK_GET_CONFIG(dev, VIRTIO_BLK_F_GEOMETRY,
1364 	    geometry.sectors, blkcfg);
1365 	VTBLK_GET_CONFIG(dev, VIRTIO_BLK_F_BLK_SIZE, blk_size, blkcfg);
1366 	VTBLK_GET_CONFIG(dev, VIRTIO_BLK_F_TOPOLOGY,
1367 	    topology.physical_block_exp, blkcfg);
1368 	VTBLK_GET_CONFIG(dev, VIRTIO_BLK_F_TOPOLOGY,
1369 	    topology.alignment_offset, blkcfg);
1370 	VTBLK_GET_CONFIG(dev, VIRTIO_BLK_F_TOPOLOGY,
1371 	    topology.min_io_size, blkcfg);
1372 	VTBLK_GET_CONFIG(dev, VIRTIO_BLK_F_TOPOLOGY,
1373 	    topology.opt_io_size, blkcfg);
1374 	VTBLK_GET_CONFIG(dev, VIRTIO_BLK_F_CONFIG_WCE, wce, blkcfg);
1375 	VTBLK_GET_CONFIG(dev, VIRTIO_BLK_F_DISCARD, max_discard_sectors,
1376 	    blkcfg);
1377 	VTBLK_GET_CONFIG(dev, VIRTIO_BLK_F_DISCARD, max_discard_seg, blkcfg);
1378 	VTBLK_GET_CONFIG(dev, VIRTIO_BLK_F_DISCARD, discard_sector_alignment,
1379 	    blkcfg);
1380 }
1381 
1382 #undef VTBLK_GET_CONFIG
1383 
1384 static void
1385 vtblk_ident(struct vtblk_softc *sc)
1386 {
1387 	struct bio buf;
1388 	struct disk *dp;
1389 	struct vtblk_request *req;
1390 	int len, error;
1391 
1392 	dp = sc->vtblk_disk;
1393 	len = MIN(VIRTIO_BLK_ID_BYTES, DISK_IDENT_SIZE);
1394 
1395 	if (vtblk_tunable_int(sc, "no_ident", vtblk_no_ident) != 0)
1396 		return;
1397 
1398 	req = vtblk_request_dequeue(sc);
1399 	if (req == NULL)
1400 		return;
1401 
1402 	req->vbr_ack = -1;
1403 	req->vbr_hdr.type = vtblk_gtoh32(sc, VIRTIO_BLK_T_GET_ID);
1404 	req->vbr_hdr.ioprio = vtblk_gtoh32(sc, 1);
1405 	req->vbr_hdr.sector = 0;
1406 
1407 	req->vbr_bp = &buf;
1408 	g_reset_bio(&buf);
1409 
1410 	buf.bio_cmd = BIO_READ;
1411 	buf.bio_data = dp->d_ident;
1412 	buf.bio_bcount = len;
1413 
1414 	VTBLK_LOCK(sc);
1415 	error = vtblk_poll_request(sc, req);
1416 	VTBLK_UNLOCK(sc);
1417 
1418 	vtblk_request_enqueue(sc, req);
1419 
1420 	if (error) {
1421 		device_printf(sc->vtblk_dev,
1422 		    "error getting device identifier: %d\n", error);
1423 	}
1424 }
1425 
1426 static int
1427 vtblk_poll_request(struct vtblk_softc *sc, struct vtblk_request *req)
1428 {
1429 	struct virtqueue *vq;
1430 	int error;
1431 
1432 	vq = sc->vtblk_vq;
1433 
1434 	if (!virtqueue_empty(vq))
1435 		return (EBUSY);
1436 
1437 	error = vtblk_request_execute(req, BUS_DMA_NOWAIT);
1438 	if (error)
1439 		return (error);
1440 
1441 	virtqueue_notify(vq);
1442 	virtqueue_poll(vq, NULL);
1443 
1444 	error = vtblk_request_error(req);
1445 	if (error && bootverbose) {
1446 		device_printf(sc->vtblk_dev,
1447 		    "%s: IO error: %d\n", __func__, error);
1448 	}
1449 
1450 	return (error);
1451 }
1452 
1453 static int
1454 vtblk_quiesce(struct vtblk_softc *sc)
1455 {
1456 	int error;
1457 
1458 	VTBLK_LOCK_ASSERT(sc);
1459 	error = 0;
1460 
1461 	while (!virtqueue_empty(sc->vtblk_vq)) {
1462 		if (mtx_sleep(&sc->vtblk_vq, VTBLK_MTX(sc), PRIBIO, "vtblkq",
1463 		    VTBLK_QUIESCE_TIMEOUT) == EWOULDBLOCK) {
1464 			error = EBUSY;
1465 			break;
1466 		}
1467 	}
1468 
1469 	return (error);
1470 }
1471 
1472 static void
1473 vtblk_vq_intr(void *xsc)
1474 {
1475 	struct vtblk_softc *sc;
1476 	struct virtqueue *vq;
1477 	struct bio_queue queue;
1478 
1479 	sc = xsc;
1480 	vq = sc->vtblk_vq;
1481 	TAILQ_INIT(&queue);
1482 
1483 	VTBLK_LOCK(sc);
1484 
1485 again:
1486 	if (sc->vtblk_flags & VTBLK_FLAG_DETACH)
1487 		goto out;
1488 
1489 	vtblk_queue_completed(sc, &queue);
1490 	vtblk_startio(sc);
1491 
1492 	if (virtqueue_enable_intr(vq) != 0) {
1493 		virtqueue_disable_intr(vq);
1494 		goto again;
1495 	}
1496 
1497 	if (sc->vtblk_flags & VTBLK_FLAG_SUSPEND)
1498 		wakeup(&sc->vtblk_vq);
1499 
1500 out:
1501 	VTBLK_UNLOCK(sc);
1502 	vtblk_done_completed(sc, &queue);
1503 }
1504 
1505 static void
1506 vtblk_stop(struct vtblk_softc *sc)
1507 {
1508 
1509 	virtqueue_disable_intr(sc->vtblk_vq);
1510 	virtio_stop(sc->vtblk_dev);
1511 }
1512 
1513 static void
1514 vtblk_dump_quiesce(struct vtblk_softc *sc)
1515 {
1516 
1517 	/*
1518 	 * Spin here until all the requests in-flight at the time of the
1519 	 * dump are completed and queued. The queued requests will be
1520 	 * biodone'd once the dump is finished.
1521 	 */
1522 	while (!virtqueue_empty(sc->vtblk_vq))
1523 		vtblk_queue_completed(sc, &sc->vtblk_dump_queue);
1524 }
1525 
1526 static int
1527 vtblk_dump_write(struct vtblk_softc *sc, void *virtual, off_t offset,
1528     size_t length)
1529 {
1530 	struct bio buf;
1531 	struct vtblk_request *req;
1532 
1533 	req = &sc->vtblk_dump_request;
1534 	req->vbr_sc = sc;
1535 	req->vbr_ack = -1;
1536 	req->vbr_hdr.type = vtblk_gtoh32(sc, VIRTIO_BLK_T_OUT);
1537 	req->vbr_hdr.ioprio = vtblk_gtoh32(sc, 1);
1538 	req->vbr_hdr.sector = vtblk_gtoh64(sc, offset / VTBLK_BSIZE);
1539 
1540 	req->vbr_bp = &buf;
1541 	g_reset_bio(&buf);
1542 
1543 	buf.bio_cmd = BIO_WRITE;
1544 	buf.bio_data = virtual;
1545 	buf.bio_bcount = length;
1546 
1547 	return (vtblk_poll_request(sc, req));
1548 }
1549 
1550 static int
1551 vtblk_dump_flush(struct vtblk_softc *sc)
1552 {
1553 	struct bio buf;
1554 	struct vtblk_request *req;
1555 
1556 	req = &sc->vtblk_dump_request;
1557 	req->vbr_sc = sc;
1558 	req->vbr_ack = -1;
1559 	req->vbr_hdr.type = vtblk_gtoh32(sc, VIRTIO_BLK_T_FLUSH);
1560 	req->vbr_hdr.ioprio = vtblk_gtoh32(sc, 1);
1561 	req->vbr_hdr.sector = 0;
1562 
1563 	req->vbr_bp = &buf;
1564 	g_reset_bio(&buf);
1565 
1566 	buf.bio_cmd = BIO_FLUSH;
1567 
1568 	return (vtblk_poll_request(sc, req));
1569 }
1570 
1571 static void
1572 vtblk_dump_complete(struct vtblk_softc *sc)
1573 {
1574 
1575 	vtblk_dump_flush(sc);
1576 
1577 	VTBLK_UNLOCK(sc);
1578 	vtblk_done_completed(sc, &sc->vtblk_dump_queue);
1579 	VTBLK_LOCK(sc);
1580 }
1581 
1582 static void
1583 vtblk_set_write_cache(struct vtblk_softc *sc, int wc)
1584 {
1585 
1586 	/* Set either writeback (1) or writethrough (0) mode. */
1587 	virtio_write_dev_config_1(sc->vtblk_dev,
1588 	    offsetof(struct virtio_blk_config, wce), wc);
1589 }
1590 
1591 static int
1592 vtblk_write_cache_enabled(struct vtblk_softc *sc,
1593     struct virtio_blk_config *blkcfg)
1594 {
1595 	int wc;
1596 
1597 	if (sc->vtblk_flags & VTBLK_FLAG_WCE_CONFIG) {
1598 		wc = vtblk_tunable_int(sc, "writecache_mode",
1599 		    vtblk_writecache_mode);
1600 		if (wc >= 0 && wc < VTBLK_CACHE_MAX)
1601 			vtblk_set_write_cache(sc, wc);
1602 		else
1603 			wc = blkcfg->wce;
1604 	} else
1605 		wc = virtio_with_feature(sc->vtblk_dev, VIRTIO_BLK_F_FLUSH);
1606 
1607 	return (wc);
1608 }
1609 
1610 static int
1611 vtblk_write_cache_sysctl(SYSCTL_HANDLER_ARGS)
1612 {
1613 	struct vtblk_softc *sc;
1614 	int wc, error;
1615 
1616 	sc = oidp->oid_arg1;
1617 	wc = sc->vtblk_write_cache;
1618 
1619 	error = sysctl_handle_int(oidp, &wc, 0, req);
1620 	if (error || req->newptr == NULL)
1621 		return (error);
1622 	if ((sc->vtblk_flags & VTBLK_FLAG_WCE_CONFIG) == 0)
1623 		return (EPERM);
1624 	if (wc < 0 || wc >= VTBLK_CACHE_MAX)
1625 		return (EINVAL);
1626 
1627 	VTBLK_LOCK(sc);
1628 	sc->vtblk_write_cache = wc;
1629 	vtblk_set_write_cache(sc, sc->vtblk_write_cache);
1630 	VTBLK_UNLOCK(sc);
1631 
1632 	return (0);
1633 }
1634 
1635 static void
1636 vtblk_setup_sysctl(struct vtblk_softc *sc)
1637 {
1638 	device_t dev;
1639 	struct sysctl_ctx_list *ctx;
1640 	struct sysctl_oid *tree;
1641 	struct sysctl_oid_list *child;
1642 
1643 	dev = sc->vtblk_dev;
1644 	ctx = device_get_sysctl_ctx(dev);
1645 	tree = device_get_sysctl_tree(dev);
1646 	child = SYSCTL_CHILDREN(tree);
1647 
1648 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "writecache_mode",
1649 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
1650 	    vtblk_write_cache_sysctl, "I",
1651 	    "Write cache mode (writethrough (0) or writeback (1))");
1652 }
1653 
1654 static int
1655 vtblk_tunable_int(struct vtblk_softc *sc, const char *knob, int def)
1656 {
1657 	char path[64];
1658 
1659 	snprintf(path, sizeof(path),
1660 	    "hw.vtblk.%d.%s", device_get_unit(sc->vtblk_dev), knob);
1661 	TUNABLE_INT_FETCH(path, &def);
1662 
1663 	return (def);
1664 }
1665