xref: /freebsd/sys/dev/virtio/block/virtio_blk.c (revision cbc2e34613c48c0b7955d5d970dcc08cd52da9b4)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause
3  *
4  * Copyright (c) 2011, Bryan Venteicher <bryanv@FreeBSD.org>
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice unmodified, this list of conditions, and the following
12  *    disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
18  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
19  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
20  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
21  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
22  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
26  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27  */
28 
29 /* Driver for VirtIO block devices. */
30 
31 #include <sys/param.h>
32 #include <sys/systm.h>
33 #include <sys/kernel.h>
34 #include <sys/bio.h>
35 #include <sys/malloc.h>
36 #include <sys/module.h>
37 #include <sys/msan.h>
38 #include <sys/sglist.h>
39 #include <sys/sysctl.h>
40 #include <sys/lock.h>
41 #include <sys/mutex.h>
42 #include <sys/queue.h>
43 
44 #include <geom/geom.h>
45 #include <geom/geom_disk.h>
46 
47 #include <machine/bus.h>
48 #include <machine/resource.h>
49 #include <sys/bus.h>
50 #include <sys/rman.h>
51 
52 #include <dev/virtio/virtio.h>
53 #include <dev/virtio/virtqueue.h>
54 #include <dev/virtio/block/virtio_blk.h>
55 
56 #include "virtio_if.h"
57 
58 struct vtblk_request {
59 	struct vtblk_softc		*vbr_sc;
60 	bus_dmamap_t			 vbr_mapp;
61 
62 	/* Fields after this point are zeroed for each request. */
63 	struct virtio_blk_outhdr	 vbr_hdr;
64 	struct bio			*vbr_bp;
65 	uint8_t				 vbr_ack;
66 	uint8_t				 vbr_requeue_on_error;
67 	uint8_t				 vbr_busdma_wait;
68 	int				 vbr_error;
69 	TAILQ_ENTRY(vtblk_request)	 vbr_link;
70 };
71 
72 enum vtblk_cache_mode {
73 	VTBLK_CACHE_WRITETHROUGH,
74 	VTBLK_CACHE_WRITEBACK,
75 	VTBLK_CACHE_MAX
76 };
77 
78 struct vtblk_softc {
79 	device_t		 vtblk_dev;
80 	struct mtx		 vtblk_mtx;
81 	uint64_t		 vtblk_features;
82 	uint32_t		 vtblk_flags;
83 #define VTBLK_FLAG_INDIRECT	0x0001
84 #define VTBLK_FLAG_DETACH	0x0002
85 #define VTBLK_FLAG_SUSPEND	0x0004
86 #define VTBLK_FLAG_BARRIER	0x0008
87 #define VTBLK_FLAG_WCE_CONFIG	0x0010
88 #define VTBLK_FLAG_BUSDMA_WAIT	0x0020
89 #define VTBLK_FLAG_BUSDMA_ALIGN	0x0040
90 
91 	struct virtqueue	*vtblk_vq;
92 	struct sglist		*vtblk_sglist;
93 	bus_dma_tag_t		 vtblk_dmat;
94 	struct disk		*vtblk_disk;
95 
96 	struct bio_queue_head	 vtblk_bioq;
97 	TAILQ_HEAD(, vtblk_request)
98 				 vtblk_req_free;
99 	TAILQ_HEAD(, vtblk_request)
100 				 vtblk_req_ready;
101 	struct vtblk_request	*vtblk_req_ordered;
102 
103 	int			 vtblk_max_nsegs;
104 	int			 vtblk_request_count;
105 	enum vtblk_cache_mode	 vtblk_write_cache;
106 
107 	struct bio_queue	 vtblk_dump_queue;
108 	struct vtblk_request	 vtblk_dump_request;
109 };
110 
111 static struct virtio_feature_desc vtblk_feature_desc[] = {
112 	{ VIRTIO_BLK_F_BARRIER,		"HostBarrier"	},
113 	{ VIRTIO_BLK_F_SIZE_MAX,	"MaxSegSize"	},
114 	{ VIRTIO_BLK_F_SEG_MAX,		"MaxNumSegs"	},
115 	{ VIRTIO_BLK_F_GEOMETRY,	"DiskGeometry"	},
116 	{ VIRTIO_BLK_F_RO,		"ReadOnly"	},
117 	{ VIRTIO_BLK_F_BLK_SIZE,	"BlockSize"	},
118 	{ VIRTIO_BLK_F_SCSI,		"SCSICmds"	},
119 	{ VIRTIO_BLK_F_FLUSH,		"FlushCmd"	},
120 	{ VIRTIO_BLK_F_TOPOLOGY,	"Topology"	},
121 	{ VIRTIO_BLK_F_CONFIG_WCE,	"ConfigWCE"	},
122 	{ VIRTIO_BLK_F_MQ,		"Multiqueue"	},
123 	{ VIRTIO_BLK_F_DISCARD,		"Discard"	},
124 	{ VIRTIO_BLK_F_WRITE_ZEROES,	"WriteZeros"	},
125 
126 	{ 0, NULL }
127 };
128 
129 static int	vtblk_modevent(module_t, int, void *);
130 
131 static int	vtblk_probe(device_t);
132 static int	vtblk_attach(device_t);
133 static int	vtblk_detach(device_t);
134 static int	vtblk_suspend(device_t);
135 static int	vtblk_resume(device_t);
136 static int	vtblk_shutdown(device_t);
137 static int	vtblk_attach_completed(device_t);
138 static int	vtblk_config_change(device_t);
139 
140 static int	vtblk_open(struct disk *);
141 static int	vtblk_close(struct disk *);
142 static int	vtblk_ioctl(struct disk *, u_long, void *, int,
143 		    struct thread *);
144 static int	vtblk_dump(void *, void *, off_t, size_t);
145 static void	vtblk_strategy(struct bio *);
146 
147 static int	vtblk_negotiate_features(struct vtblk_softc *);
148 static int	vtblk_setup_features(struct vtblk_softc *);
149 static int	vtblk_maximum_segments(struct vtblk_softc *,
150 		    struct virtio_blk_config *);
151 static int	vtblk_alloc_virtqueue(struct vtblk_softc *);
152 static void	vtblk_resize_disk(struct vtblk_softc *, uint64_t);
153 static void	vtblk_alloc_disk(struct vtblk_softc *,
154 		    struct virtio_blk_config *);
155 static void	vtblk_create_disk(struct vtblk_softc *);
156 
157 static int	vtblk_request_prealloc(struct vtblk_softc *);
158 static void	vtblk_request_free(struct vtblk_softc *);
159 static struct vtblk_request *
160 		vtblk_request_dequeue(struct vtblk_softc *);
161 static void	vtblk_request_enqueue(struct vtblk_softc *,
162 		    struct vtblk_request *);
163 static struct vtblk_request *
164 		vtblk_request_next_ready(struct vtblk_softc *);
165 static void	vtblk_request_requeue_ready(struct vtblk_softc *,
166 		    struct vtblk_request *);
167 static struct vtblk_request *
168 		vtblk_request_next(struct vtblk_softc *);
169 static struct vtblk_request *
170 		vtblk_request_bio(struct vtblk_softc *);
171 static int	vtblk_request_execute(struct vtblk_request *, int);
172 static void	vtblk_request_execute_cb(void *,
173 		    bus_dma_segment_t *, int, int);
174 static int	vtblk_request_error(struct vtblk_request *);
175 
176 static void	vtblk_queue_completed(struct vtblk_softc *,
177 		    struct bio_queue *);
178 static void	vtblk_done_completed(struct vtblk_softc *,
179 		    struct bio_queue *);
180 static void	vtblk_drain_vq(struct vtblk_softc *);
181 static void	vtblk_drain(struct vtblk_softc *);
182 
183 static void	vtblk_startio(struct vtblk_softc *);
184 static void	vtblk_bio_done(struct vtblk_softc *, struct bio *, int);
185 
186 static void	vtblk_read_config(struct vtblk_softc *,
187 		    struct virtio_blk_config *);
188 static void	vtblk_ident(struct vtblk_softc *);
189 static int	vtblk_poll_request(struct vtblk_softc *,
190 		    struct vtblk_request *);
191 static int	vtblk_quiesce(struct vtblk_softc *);
192 static void	vtblk_vq_intr(void *);
193 static void	vtblk_stop(struct vtblk_softc *);
194 
195 static void	vtblk_dump_quiesce(struct vtblk_softc *);
196 static int	vtblk_dump_write(struct vtblk_softc *, void *, off_t, size_t);
197 static int	vtblk_dump_flush(struct vtblk_softc *);
198 static void	vtblk_dump_complete(struct vtblk_softc *);
199 
200 static void	vtblk_set_write_cache(struct vtblk_softc *, int);
201 static int	vtblk_write_cache_enabled(struct vtblk_softc *sc,
202 		    struct virtio_blk_config *);
203 static int	vtblk_write_cache_sysctl(SYSCTL_HANDLER_ARGS);
204 
205 static void	vtblk_setup_sysctl(struct vtblk_softc *);
206 static int	vtblk_tunable_int(struct vtblk_softc *, const char *, int);
207 
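/*
 * Byte order conversion between the guest (driver) and the device: modern
 * (VIRTIO_F_VERSION_1) devices use little-endian fields, while legacy
 * devices use guest-native byte order, so each helper checks which mode
 * was negotiated.
 */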
208 #define vtblk_modern(_sc) (((_sc)->vtblk_features & VIRTIO_F_VERSION_1) != 0)
209 #define vtblk_htog16(_sc, _val)	virtio_htog16(vtblk_modern(_sc), _val)
210 #define vtblk_htog32(_sc, _val)	virtio_htog32(vtblk_modern(_sc), _val)
211 #define vtblk_htog64(_sc, _val)	virtio_htog64(vtblk_modern(_sc), _val)
212 #define vtblk_gtoh16(_sc, _val)	virtio_gtoh16(vtblk_modern(_sc), _val)
213 #define vtblk_gtoh32(_sc, _val)	virtio_gtoh32(vtblk_modern(_sc), _val)
214 #define vtblk_gtoh64(_sc, _val)	virtio_gtoh64(vtblk_modern(_sc), _val)
215 
216 /* Tunables. */
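/*
 * hw.vtblk.no_ident disables the VIRTIO_BLK_T_GET_ID identify request at
 * attach time. hw.vtblk.writecache_mode selects writethrough (0) or
 * writeback (1); -1 keeps the device's current setting. Both may be
 * overridden per device via hw.vtblk.<unit>.<knob>.
 */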
217 static int vtblk_no_ident = 0;
218 TUNABLE_INT("hw.vtblk.no_ident", &vtblk_no_ident);
219 static int vtblk_writecache_mode = -1;
220 TUNABLE_INT("hw.vtblk.writecache_mode", &vtblk_writecache_mode);
221 
222 #define VTBLK_COMMON_FEATURES \
223     (VIRTIO_BLK_F_SIZE_MAX		| \
224      VIRTIO_BLK_F_SEG_MAX		| \
225      VIRTIO_BLK_F_GEOMETRY		| \
226      VIRTIO_BLK_F_RO			| \
227      VIRTIO_BLK_F_BLK_SIZE		| \
228      VIRTIO_BLK_F_FLUSH			| \
229      VIRTIO_BLK_F_TOPOLOGY		| \
230      VIRTIO_BLK_F_CONFIG_WCE		| \
231      VIRTIO_BLK_F_DISCARD		| \
232      VIRTIO_RING_F_INDIRECT_DESC)
233 
234 #define VTBLK_MODERN_FEATURES	(VTBLK_COMMON_FEATURES)
235 #define VTBLK_LEGACY_FEATURES	(VIRTIO_BLK_F_BARRIER | VTBLK_COMMON_FEATURES)
236 
237 #define VTBLK_MTX(_sc)		&(_sc)->vtblk_mtx
238 #define VTBLK_LOCK_INIT(_sc, _name) \
239 				mtx_init(VTBLK_MTX((_sc)), (_name), \
240 				    "VirtIO Block Lock", MTX_DEF)
241 #define VTBLK_LOCK(_sc)		mtx_lock(VTBLK_MTX((_sc)))
242 #define VTBLK_UNLOCK(_sc)	mtx_unlock(VTBLK_MTX((_sc)))
243 #define VTBLK_LOCK_DESTROY(_sc)	mtx_destroy(VTBLK_MTX((_sc)))
244 #define VTBLK_LOCK_ASSERT(_sc)	mtx_assert(VTBLK_MTX((_sc)), MA_OWNED)
245 #define VTBLK_LOCK_ASSERT_NOTOWNED(_sc) \
246 				mtx_assert(VTBLK_MTX((_sc)), MA_NOTOWNED)
247 
248 #define VTBLK_DISK_NAME		"vtbd"
249 #define VTBLK_QUIESCE_TIMEOUT	(30 * hz)
250 #define VTBLK_BSIZE		512
251 
252 /*
253  * Each block request uses at least two segments - one for the header
254  * and one for the status.
255  */
256 #define VTBLK_MIN_SEGMENTS	2
257 
258 static device_method_t vtblk_methods[] = {
259 	/* Device methods. */
260 	DEVMETHOD(device_probe,		vtblk_probe),
261 	DEVMETHOD(device_attach,	vtblk_attach),
262 	DEVMETHOD(device_detach,	vtblk_detach),
263 	DEVMETHOD(device_suspend,	vtblk_suspend),
264 	DEVMETHOD(device_resume,	vtblk_resume),
265 	DEVMETHOD(device_shutdown,	vtblk_shutdown),
266 
267 	/* VirtIO methods. */
268 	DEVMETHOD(virtio_attach_completed, vtblk_attach_completed),
269 	DEVMETHOD(virtio_config_change,	vtblk_config_change),
270 
271 	DEVMETHOD_END
272 };
273 
274 static driver_t vtblk_driver = {
275 	"vtblk",
276 	vtblk_methods,
277 	sizeof(struct vtblk_softc)
278 };
279 
280 VIRTIO_DRIVER_MODULE(virtio_blk, vtblk_driver, vtblk_modevent, NULL);
281 MODULE_VERSION(virtio_blk, 1);
282 MODULE_DEPEND(virtio_blk, virtio, 1, 1, 1);
283 
284 VIRTIO_SIMPLE_PNPINFO(virtio_blk, VIRTIO_ID_BLOCK, "VirtIO Block Adapter");
285 
286 static int
287 vtblk_modevent(module_t mod, int type, void *unused)
288 {
289 	int error;
290 
291 	error = 0;
292 
293 	switch (type) {
294 	case MOD_LOAD:
295 	case MOD_QUIESCE:
296 	case MOD_UNLOAD:
297 	case MOD_SHUTDOWN:
298 		break;
299 	default:
300 		error = EOPNOTSUPP;
301 		break;
302 	}
303 
304 	return (error);
305 }
306 
307 static int
308 vtblk_probe(device_t dev)
309 {
310 	return (VIRTIO_SIMPLE_PROBE(dev, virtio_blk));
311 }
312 
313 static int
314 vtblk_attach(device_t dev)
315 {
316 	struct vtblk_softc *sc;
317 	struct virtio_blk_config blkcfg;
318 	int error;
319 
320 	sc = device_get_softc(dev);
321 	sc->vtblk_dev = dev;
322 	virtio_set_feature_desc(dev, vtblk_feature_desc);
323 
324 	VTBLK_LOCK_INIT(sc, device_get_nameunit(dev));
325 	bioq_init(&sc->vtblk_bioq);
326 	TAILQ_INIT(&sc->vtblk_dump_queue);
327 	TAILQ_INIT(&sc->vtblk_req_free);
328 	TAILQ_INIT(&sc->vtblk_req_ready);
329 
330 	vtblk_setup_sysctl(sc);
331 
332 	error = vtblk_setup_features(sc);
333 	if (error) {
334 		device_printf(dev, "cannot setup features\n");
335 		goto fail;
336 	}
337 
338 	vtblk_read_config(sc, &blkcfg);
339 
340 	/*
341 	 * With the current sglist(9) implementation, it is not easy
342 	 * for us to support a maximum segment size as adjacent
343 	 * segments are coalesced. For now, just make sure it's larger
344 	 * than the maximum supported transfer size.
345 	 */
346 	if (virtio_with_feature(dev, VIRTIO_BLK_F_SIZE_MAX)) {
347 		if (blkcfg.size_max < maxphys) {
348 			error = ENOTSUP;
349 			device_printf(dev, "host requires unsupported "
350 			    "maximum segment size feature\n");
351 			goto fail;
352 		}
353 	}
354 
355 	sc->vtblk_max_nsegs = vtblk_maximum_segments(sc, &blkcfg);
356 	if (sc->vtblk_max_nsegs <= VTBLK_MIN_SEGMENTS) {
357 		error = EINVAL;
358 		device_printf(dev, "fewer than minimum number of segments "
359 		    "allowed: %d\n", sc->vtblk_max_nsegs);
360 		goto fail;
361 	}
362 
363 	sc->vtblk_sglist = sglist_alloc(sc->vtblk_max_nsegs, M_NOWAIT);
364 	if (sc->vtblk_sglist == NULL) {
365 		error = ENOMEM;
366 		device_printf(dev, "cannot allocate sglist\n");
367 		goto fail;
368 	}
369 
370 	/*
371 	 * If vtblk_max_nsegs == VTBLK_MIN_SEGMENTS + 1, the device only
372 	 * supports a single data segment; in that case we need busdma to
373 	 * align to a page boundary so we can send a *contiguous* page size
374 	 * request to the host.
375 	 */
376 	if (sc->vtblk_max_nsegs == VTBLK_MIN_SEGMENTS + 1)
377 		sc->vtblk_flags |= VTBLK_FLAG_BUSDMA_ALIGN;
378 	error = bus_dma_tag_create(
379 	    bus_get_dma_tag(dev),			/* parent */
380 	    (sc->vtblk_flags & VTBLK_FLAG_BUSDMA_ALIGN) ? PAGE_SIZE : 1,
381 	    0,						/* boundary */
382 	    BUS_SPACE_MAXADDR,				/* lowaddr */
383 	    BUS_SPACE_MAXADDR,				/* highaddr */
384 	    NULL, NULL,					/* filter, filterarg */
385 	    maxphys,					/* max request size */
386 	    sc->vtblk_max_nsegs - VTBLK_MIN_SEGMENTS,	/* max # segments */
387 	    maxphys,					/* maxsegsize */
388 	    0,						/* flags */
389 	    busdma_lock_mutex,				/* lockfunc */
390 	    &sc->vtblk_mtx,				/* lockarg */
391 	    &sc->vtblk_dmat);
392 	if (error) {
393 		device_printf(dev, "cannot create bus dma tag\n");
394 		goto fail;
395 	}
396 
397 #ifdef __powerpc__
398 	/*
399 	 * Virtio uses physical addresses rather than bus addresses, so we
400 	 * need to ask busdma to skip the iommu physical->bus mapping.  At
401 	 * present, this is only a thing on the powerpc architectures.
402 	 */
403 	bus_dma_tag_set_iommu(sc->vtblk_dmat, NULL, NULL);
404 #endif
405 
406 	error = vtblk_alloc_virtqueue(sc);
407 	if (error) {
408 		device_printf(dev, "cannot allocate virtqueue\n");
409 		goto fail;
410 	}
411 
412 	error = vtblk_request_prealloc(sc);
413 	if (error) {
414 		device_printf(dev, "cannot preallocate requests\n");
415 		goto fail;
416 	}
417 
418 	vtblk_alloc_disk(sc, &blkcfg);
419 
420 	error = virtio_setup_intr(dev, INTR_TYPE_BIO | INTR_ENTROPY);
421 	if (error) {
422 		device_printf(dev, "cannot setup virtqueue interrupt\n");
423 		goto fail;
424 	}
425 
426 	virtqueue_enable_intr(sc->vtblk_vq);
427 
428 fail:
429 	if (error)
430 		vtblk_detach(dev);
431 
432 	return (error);
433 }
434 
435 static int
436 vtblk_detach(device_t dev)
437 {
438 	struct vtblk_softc *sc;
439 
440 	sc = device_get_softc(dev);
441 
442 	VTBLK_LOCK(sc);
443 	sc->vtblk_flags |= VTBLK_FLAG_DETACH;
444 	if (device_is_attached(dev))
445 		vtblk_stop(sc);
446 	VTBLK_UNLOCK(sc);
447 
448 	vtblk_drain(sc);
449 
450 	if (sc->vtblk_disk != NULL) {
451 		disk_destroy(sc->vtblk_disk);
452 		sc->vtblk_disk = NULL;
453 	}
454 
455 	if (sc->vtblk_dmat != NULL) {
456 		bus_dma_tag_destroy(sc->vtblk_dmat);
457 		sc->vtblk_dmat = NULL;
458 	}
459 
460 	if (sc->vtblk_sglist != NULL) {
461 		sglist_free(sc->vtblk_sglist);
462 		sc->vtblk_sglist = NULL;
463 	}
464 
465 	VTBLK_LOCK_DESTROY(sc);
466 
467 	return (0);
468 }
469 
470 static int
471 vtblk_suspend(device_t dev)
472 {
473 	struct vtblk_softc *sc;
474 	int error;
475 
476 	sc = device_get_softc(dev);
477 
478 	VTBLK_LOCK(sc);
479 	sc->vtblk_flags |= VTBLK_FLAG_SUSPEND;
480 	/* XXX BMV: virtio_stop(), etc needed here? */
481 	error = vtblk_quiesce(sc);
482 	if (error)
483 		sc->vtblk_flags &= ~VTBLK_FLAG_SUSPEND;
484 	VTBLK_UNLOCK(sc);
485 
486 	return (error);
487 }
488 
489 static int
490 vtblk_resume(device_t dev)
491 {
492 	struct vtblk_softc *sc;
493 
494 	sc = device_get_softc(dev);
495 
496 	VTBLK_LOCK(sc);
497 	/* XXX BMV: virtio_reinit(), etc needed here? */
498 	sc->vtblk_flags &= ~VTBLK_FLAG_SUSPEND;
499 	vtblk_startio(sc);
500 	VTBLK_UNLOCK(sc);
501 
502 	return (0);
503 }
504 
505 static int
506 vtblk_shutdown(device_t dev)
507 {
508 
509 	return (0);
510 }
511 
512 static int
513 vtblk_attach_completed(device_t dev)
514 {
515 	struct vtblk_softc *sc;
516 
517 	sc = device_get_softc(dev);
518 
519 	/*
520 	 * Create disk after attach as VIRTIO_BLK_T_GET_ID can only be
521 	 * processed after the device acknowledged
522 	 * VIRTIO_CONFIG_STATUS_DRIVER_OK.
523 	 */
524 	vtblk_create_disk(sc);
525 	return (0);
526 }
527 
528 static int
529 vtblk_config_change(device_t dev)
530 {
531 	struct vtblk_softc *sc;
532 	struct virtio_blk_config blkcfg;
533 	uint64_t capacity;
534 
535 	sc = device_get_softc(dev);
536 
537 	vtblk_read_config(sc, &blkcfg);
538 
539 	/* Capacity is always in 512-byte units. */
540 	capacity = blkcfg.capacity * VTBLK_BSIZE;
541 
542 	if (sc->vtblk_disk->d_mediasize != capacity)
543 		vtblk_resize_disk(sc, capacity);
544 
545 	return (0);
546 }
547 
548 static int
549 vtblk_open(struct disk *dp)
550 {
551 	struct vtblk_softc *sc;
552 
553 	if ((sc = dp->d_drv1) == NULL)
554 		return (ENXIO);
555 
556 	return (sc->vtblk_flags & VTBLK_FLAG_DETACH ? ENXIO : 0);
557 }
558 
559 static int
560 vtblk_close(struct disk *dp)
561 {
562 	struct vtblk_softc *sc;
563 
564 	if ((sc = dp->d_drv1) == NULL)
565 		return (ENXIO);
566 
567 	return (0);
568 }
569 
570 static int
571 vtblk_ioctl(struct disk *dp, u_long cmd, void *addr, int flag,
572     struct thread *td)
573 {
574 	struct vtblk_softc *sc;
575 
576 	if ((sc = dp->d_drv1) == NULL)
577 		return (ENXIO);
578 
579 	return (ENOTTY);
580 }
581 
582 static int
583 vtblk_dump(void *arg, void *virtual, off_t offset, size_t length)
584 {
585 	struct disk *dp;
586 	struct vtblk_softc *sc;
587 	int error;
588 
589 	dp = arg;
590 	error = 0;
591 
592 	if ((sc = dp->d_drv1) == NULL)
593 		return (ENXIO);
594 
595 	VTBLK_LOCK(sc);
596 
597 	vtblk_dump_quiesce(sc);
598 
599 	if (length > 0)
600 		error = vtblk_dump_write(sc, virtual, offset, length);
601 	if (error || (virtual == NULL && offset == 0))
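	/*
	 * A call with no data buffer at offset zero marks the end of the
	 * dump; flush the device and complete the bios held back while
	 * dumping.
	 */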
602 		vtblk_dump_complete(sc);
603 
604 	VTBLK_UNLOCK(sc);
605 
606 	return (error);
607 }
608 
609 static void
610 vtblk_strategy(struct bio *bp)
611 {
612 	struct vtblk_softc *sc;
613 
614 	if ((sc = bp->bio_disk->d_drv1) == NULL) {
615 		vtblk_bio_done(NULL, bp, EINVAL);
616 		return;
617 	}
618 
619 	if ((bp->bio_cmd != BIO_READ) && (bp->bio_cmd != BIO_WRITE) &&
620 	    (bp->bio_cmd != BIO_FLUSH) && (bp->bio_cmd != BIO_DELETE)) {
621 		vtblk_bio_done(sc, bp, EOPNOTSUPP);
622 		return;
623 	}
624 
625 	VTBLK_LOCK(sc);
626 
627 	if (sc->vtblk_flags & VTBLK_FLAG_DETACH) {
628 		VTBLK_UNLOCK(sc);
629 		vtblk_bio_done(sc, bp, ENXIO);
630 		return;
631 	}
632 
633 	bioq_insert_tail(&sc->vtblk_bioq, bp);
634 	vtblk_startio(sc);
635 
636 	VTBLK_UNLOCK(sc);
637 }
638 
639 static int
640 vtblk_negotiate_features(struct vtblk_softc *sc)
641 {
642 	device_t dev;
643 	uint64_t features;
644 
645 	dev = sc->vtblk_dev;
646 	features = virtio_bus_is_modern(dev) ? VTBLK_MODERN_FEATURES :
647 	    VTBLK_LEGACY_FEATURES;
648 
649 	sc->vtblk_features = virtio_negotiate_features(dev, features);
650 	return (virtio_finalize_features(dev));
651 }
652 
653 static int
654 vtblk_setup_features(struct vtblk_softc *sc)
655 {
656 	device_t dev;
657 	int error;
658 
659 	dev = sc->vtblk_dev;
660 
661 	error = vtblk_negotiate_features(sc);
662 	if (error)
663 		return (error);
664 
665 	if (virtio_with_feature(dev, VIRTIO_RING_F_INDIRECT_DESC))
666 		sc->vtblk_flags |= VTBLK_FLAG_INDIRECT;
667 	if (virtio_with_feature(dev, VIRTIO_BLK_F_CONFIG_WCE))
668 		sc->vtblk_flags |= VTBLK_FLAG_WCE_CONFIG;
669 
670 	/* Legacy. */
671 	if (virtio_with_feature(dev, VIRTIO_BLK_F_BARRIER))
672 		sc->vtblk_flags |= VTBLK_FLAG_BARRIER;
673 
674 	return (0);
675 }
676 
677 static int
678 vtblk_maximum_segments(struct vtblk_softc *sc,
679     struct virtio_blk_config *blkcfg)
680 {
681 	device_t dev;
682 	int nsegs;
683 
684 	dev = sc->vtblk_dev;
685 	nsegs = VTBLK_MIN_SEGMENTS;
686 
687 	if (virtio_with_feature(dev, VIRTIO_BLK_F_SEG_MAX)) {
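		/*
		 * The "+ 1" below allows a maxphys-sized transfer that does
		 * not begin on a page boundary to span one extra page.
		 */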
688 		nsegs += MIN(blkcfg->seg_max, maxphys / PAGE_SIZE + 1);
689 		if (sc->vtblk_flags & VTBLK_FLAG_INDIRECT)
690 			nsegs = MIN(nsegs, VIRTIO_MAX_INDIRECT);
691 	} else
692 		nsegs += 1;
693 
694 	return (nsegs);
695 }
696 
697 static int
698 vtblk_alloc_virtqueue(struct vtblk_softc *sc)
699 {
700 	device_t dev;
701 	struct vq_alloc_info vq_info;
702 	int indir_segs;
703 
704 	dev = sc->vtblk_dev;
705 
706 	indir_segs = 0;
707 	if (sc->vtblk_flags & VTBLK_FLAG_INDIRECT)
708 		indir_segs = sc->vtblk_max_nsegs;
709 	VQ_ALLOC_INFO_INIT(&vq_info, indir_segs,
710 	    vtblk_vq_intr, sc, &sc->vtblk_vq,
711 	    "%s request", device_get_nameunit(dev));
712 
713 	return (virtio_alloc_virtqueues(dev, 1, &vq_info));
714 }
715 
716 static void
717 vtblk_resize_disk(struct vtblk_softc *sc, uint64_t new_capacity)
718 {
719 	device_t dev;
720 	struct disk *dp;
721 	int error;
722 
723 	dev = sc->vtblk_dev;
724 	dp = sc->vtblk_disk;
725 
726 	dp->d_mediasize = new_capacity;
727 	if (bootverbose) {
728 		device_printf(dev, "resized to %juMB (%ju %u byte sectors)\n",
729 		    (uintmax_t) dp->d_mediasize >> 20,
730 		    (uintmax_t) dp->d_mediasize / dp->d_sectorsize,
731 		    dp->d_sectorsize);
732 	}
733 
734 	error = disk_resize(dp, M_NOWAIT);
735 	if (error) {
736 		device_printf(dev,
737 		    "disk_resize(9) failed, error: %d\n", error);
738 	}
739 }
740 
741 static void
742 vtblk_alloc_disk(struct vtblk_softc *sc, struct virtio_blk_config *blkcfg)
743 {
744 	device_t dev;
745 	struct disk *dp;
746 
747 	dev = sc->vtblk_dev;
748 
749 	sc->vtblk_disk = dp = disk_alloc();
750 	dp->d_open = vtblk_open;
751 	dp->d_close = vtblk_close;
752 	dp->d_ioctl = vtblk_ioctl;
753 	dp->d_strategy = vtblk_strategy;
754 	dp->d_name = VTBLK_DISK_NAME;
755 	dp->d_unit = device_get_unit(dev);
756 	dp->d_drv1 = sc;
757 	dp->d_flags = DISKFLAG_UNMAPPED_BIO | DISKFLAG_DIRECT_COMPLETION;
758 	dp->d_hba_vendor = virtio_get_vendor(dev);
759 	dp->d_hba_device = virtio_get_device(dev);
760 	dp->d_hba_subvendor = virtio_get_subvendor(dev);
761 	dp->d_hba_subdevice = virtio_get_subdevice(dev);
762 	strlcpy(dp->d_attachment, device_get_nameunit(dev),
763 	    sizeof(dp->d_attachment));
764 
765 	if (virtio_with_feature(dev, VIRTIO_BLK_F_RO))
766 		dp->d_flags |= DISKFLAG_WRITE_PROTECT;
767 	else {
768 		if (virtio_with_feature(dev, VIRTIO_BLK_F_FLUSH))
769 			dp->d_flags |= DISKFLAG_CANFLUSHCACHE;
770 		dp->d_dump = vtblk_dump;
771 	}
772 
773 	/* Capacity is always in 512-byte units. */
774 	dp->d_mediasize = blkcfg->capacity * VTBLK_BSIZE;
775 
776 	if (virtio_with_feature(dev, VIRTIO_BLK_F_BLK_SIZE))
777 		dp->d_sectorsize = blkcfg->blk_size;
778 	else
779 		dp->d_sectorsize = VTBLK_BSIZE;
780 
781 	/*
782 	 * The VirtIO maximum I/O size is given in terms of segments.
783 	 * However, FreeBSD limits I/O size by logical buffer size, not
784 	 * by physically contiguous pages. Therefore, we have to assume
785 	 * no pages are contiguous. This may impose an artificially low
786 	 * maximum I/O size. But in practice, since QEMU advertises 128
787 	 * segments, this gives us a maximum IO size of 125 * PAGE_SIZE,
788 	 * which is typically greater than maxphys. Eventually we should
789 	 * just advertise maxphys and split buffers that are too big.
790 	 *
791 	 * If we're not asking busdma to align data to page boundaries, the
792 	 * maximum I/O size is reduced by PAGE_SIZE in order to accommodate
793 	 * unaligned I/Os.
794 	 */
795 	dp->d_maxsize = (sc->vtblk_max_nsegs - VTBLK_MIN_SEGMENTS) *
796 	    PAGE_SIZE;
797 	if ((sc->vtblk_flags & VTBLK_FLAG_BUSDMA_ALIGN) == 0)
798 		dp->d_maxsize -= PAGE_SIZE;
799 
800 	if (virtio_with_feature(dev, VIRTIO_BLK_F_GEOMETRY)) {
801 		dp->d_fwsectors = blkcfg->geometry.sectors;
802 		dp->d_fwheads = blkcfg->geometry.heads;
803 	}
804 
805 	if (virtio_with_feature(dev, VIRTIO_BLK_F_TOPOLOGY) &&
806 	    blkcfg->topology.physical_block_exp > 0) {
807 		dp->d_stripesize = dp->d_sectorsize *
808 		    (1 << blkcfg->topology.physical_block_exp);
809 		dp->d_stripeoffset = (dp->d_stripesize -
810 		    blkcfg->topology.alignment_offset * dp->d_sectorsize) %
811 		    dp->d_stripesize;
812 	}
813 
814 	if (virtio_with_feature(dev, VIRTIO_BLK_F_DISCARD)) {
815 		dp->d_flags |= DISKFLAG_CANDELETE;
816 		dp->d_delmaxsize = blkcfg->max_discard_sectors * VTBLK_BSIZE;
817 	}
818 
819 	if (vtblk_write_cache_enabled(sc, blkcfg) != 0)
820 		sc->vtblk_write_cache = VTBLK_CACHE_WRITEBACK;
821 	else
822 		sc->vtblk_write_cache = VTBLK_CACHE_WRITETHROUGH;
823 }
824 
825 static void
826 vtblk_create_disk(struct vtblk_softc *sc)
827 {
828 	struct disk *dp;
829 
830 	dp = sc->vtblk_disk;
831 
832 	vtblk_ident(sc);
833 
834 	device_printf(sc->vtblk_dev, "%juMB (%ju %u byte sectors)\n",
835 	    (uintmax_t) dp->d_mediasize >> 20,
836 	    (uintmax_t) dp->d_mediasize / dp->d_sectorsize,
837 	    dp->d_sectorsize);
838 
839 	disk_create(dp, DISK_VERSION);
840 }
841 
842 static int
843 vtblk_request_prealloc(struct vtblk_softc *sc)
844 {
845 	struct vtblk_request *req;
846 	int i, nreqs;
847 
848 	nreqs = virtqueue_size(sc->vtblk_vq);
849 
850 	/*
851 	 * Preallocate sufficient requests to keep the virtqueue full. Each
852 	 * request consumes VTBLK_MIN_SEGMENTS or more descriptors so reduce
853 	 * the number allocated when indirect descriptors are not available.
854 	 */
855 	if ((sc->vtblk_flags & VTBLK_FLAG_INDIRECT) == 0)
856 		nreqs /= VTBLK_MIN_SEGMENTS;
857 
858 	for (i = 0; i < nreqs; i++) {
859 		req = malloc(sizeof(struct vtblk_request), M_DEVBUF, M_NOWAIT);
860 		if (req == NULL)
861 			return (ENOMEM);
862 
863 		req->vbr_sc = sc;
864 		if (bus_dmamap_create(sc->vtblk_dmat, 0, &req->vbr_mapp)) {
865 			free(req, M_DEVBUF);
866 			return (ENOMEM);
867 		}
868 
869 		MPASS(sglist_count(&req->vbr_hdr, sizeof(req->vbr_hdr)) == 1);
870 		MPASS(sglist_count(&req->vbr_ack, sizeof(req->vbr_ack)) == 1);
871 
872 		sc->vtblk_request_count++;
873 		vtblk_request_enqueue(sc, req);
874 	}
875 
876 	return (0);
877 }
878 
879 static void
880 vtblk_request_free(struct vtblk_softc *sc)
881 {
882 	struct vtblk_request *req;
883 
884 	MPASS(TAILQ_EMPTY(&sc->vtblk_req_ready));
885 
886 	while ((req = vtblk_request_dequeue(sc)) != NULL) {
887 		sc->vtblk_request_count--;
888 		bus_dmamap_destroy(sc->vtblk_dmat, req->vbr_mapp);
889 		free(req, M_DEVBUF);
890 	}
891 
892 	KASSERT(sc->vtblk_request_count == 0,
893 	    ("%s: leaked %d requests", __func__, sc->vtblk_request_count));
894 }
895 
896 static struct vtblk_request *
897 vtblk_request_dequeue(struct vtblk_softc *sc)
898 {
899 	struct vtblk_request *req;
900 
901 	req = TAILQ_FIRST(&sc->vtblk_req_free);
902 	if (req != NULL) {
903 		TAILQ_REMOVE(&sc->vtblk_req_free, req, vbr_link);
904 		bzero(&req->vbr_hdr, sizeof(struct vtblk_request) -
905 		    offsetof(struct vtblk_request, vbr_hdr));
906 	}
907 
908 	return (req);
909 }
910 
911 static void
912 vtblk_request_enqueue(struct vtblk_softc *sc, struct vtblk_request *req)
913 {
914 
915 	TAILQ_INSERT_HEAD(&sc->vtblk_req_free, req, vbr_link);
916 }
917 
918 static struct vtblk_request *
919 vtblk_request_next_ready(struct vtblk_softc *sc)
920 {
921 	struct vtblk_request *req;
922 
923 	req = TAILQ_FIRST(&sc->vtblk_req_ready);
924 	if (req != NULL)
925 		TAILQ_REMOVE(&sc->vtblk_req_ready, req, vbr_link);
926 
927 	return (req);
928 }
929 
930 static void
931 vtblk_request_requeue_ready(struct vtblk_softc *sc, struct vtblk_request *req)
932 {
933 
934 	/* NOTE: Currently, there will be at most one request in the queue. */
935 	TAILQ_INSERT_HEAD(&sc->vtblk_req_ready, req, vbr_link);
936 }
937 
938 static struct vtblk_request *
939 vtblk_request_next(struct vtblk_softc *sc)
940 {
941 	struct vtblk_request *req;
942 
943 	req = vtblk_request_next_ready(sc);
944 	if (req != NULL)
945 		return (req);
946 
947 	return (vtblk_request_bio(sc));
948 }
949 
950 static struct vtblk_request *
951 vtblk_request_bio(struct vtblk_softc *sc)
952 {
953 	struct bio_queue_head *bioq;
954 	struct vtblk_request *req;
955 	struct bio *bp;
956 
957 	bioq = &sc->vtblk_bioq;
958 
959 	if (bioq_first(bioq) == NULL)
960 		return (NULL);
961 
962 	req = vtblk_request_dequeue(sc);
963 	if (req == NULL)
964 		return (NULL);
965 
966 	bp = bioq_takefirst(bioq);
967 	req->vbr_bp = bp;
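	/* Start with a status byte that is not a valid VIRTIO_BLK_S_* value. */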
968 	req->vbr_ack = -1;
969 	req->vbr_hdr.ioprio = vtblk_gtoh32(sc, 1);
970 
971 	switch (bp->bio_cmd) {
972 	case BIO_FLUSH:
973 		req->vbr_hdr.type = vtblk_gtoh32(sc, VIRTIO_BLK_T_FLUSH);
974 		req->vbr_hdr.sector = 0;
975 		break;
976 	case BIO_READ:
977 		req->vbr_hdr.type = vtblk_gtoh32(sc, VIRTIO_BLK_T_IN);
978 		req->vbr_hdr.sector = vtblk_gtoh64(sc, bp->bio_offset / VTBLK_BSIZE);
979 		break;
980 	case BIO_WRITE:
981 		req->vbr_hdr.type = vtblk_gtoh32(sc, VIRTIO_BLK_T_OUT);
982 		req->vbr_hdr.sector = vtblk_gtoh64(sc, bp->bio_offset / VTBLK_BSIZE);
983 		break;
984 	case BIO_DELETE:
985 		req->vbr_hdr.type = vtblk_gtoh32(sc, VIRTIO_BLK_T_DISCARD);
986 		req->vbr_hdr.sector = vtblk_gtoh64(sc, bp->bio_offset / VTBLK_BSIZE);
987 		break;
988 	default:
989 		panic("%s: bio with unhandled cmd: %d", __func__, bp->bio_cmd);
990 	}
991 
992 	if (bp->bio_flags & BIO_ORDERED)
993 		req->vbr_hdr.type |= vtblk_gtoh32(sc, VIRTIO_BLK_T_BARRIER);
994 
995 	return (req);
996 }
997 
998 static int
999 vtblk_request_execute(struct vtblk_request *req, int flags)
1000 {
1001 	struct vtblk_softc *sc = req->vbr_sc;
1002 	struct bio *bp = req->vbr_bp;
1003 	int error = 0;
1004 
1005 	/*
1006 	 * Call via bus_dmamap_load_bio or directly depending on whether we
1007 	 * have a buffer we need to map.  If we don't have a busdma map,
1008 	 * try to perform the I/O directly and hope that it works (this will
1009 	 * happen when dumping).
1010 	 */
1011 	if ((req->vbr_mapp != NULL) &&
1012 	    (bp->bio_cmd == BIO_READ || bp->bio_cmd == BIO_WRITE)) {
1013 		error = bus_dmamap_load_bio(sc->vtblk_dmat, req->vbr_mapp,
1014 		    req->vbr_bp, vtblk_request_execute_cb, req, flags);
1015 		if (error == EINPROGRESS) {
1016 			req->vbr_busdma_wait = 1;
1017 			sc->vtblk_flags |= VTBLK_FLAG_BUSDMA_WAIT;
1018 		}
1019 	} else {
1020 		vtblk_request_execute_cb(req, NULL, 0, 0);
1021 	}
1022 
1023 	return (error ? error : req->vbr_error);
1024 }
1025 
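/*
 * busdma load callback: build the scatter/gather list for a request and
 * enqueue it on the virtqueue. Also called directly, with no segments,
 * for requests that carry no mappable data (e.g. flushes) or that bypass
 * busdma entirely (dump I/O).
 */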
1026 static void
1027 vtblk_request_execute_cb(void * callback_arg, bus_dma_segment_t * segs,
1028     int nseg, int error)
1029 {
1030 	struct vtblk_request *req;
1031 	struct vtblk_softc *sc;
1032 	struct virtqueue *vq;
1033 	struct sglist *sg;
1034 	struct bio *bp;
1035 	int ordered, readable, writable, i;
1036 
1037 	req = (struct vtblk_request *)callback_arg;
1038 	sc = req->vbr_sc;
1039 	vq = sc->vtblk_vq;
1040 	sg = sc->vtblk_sglist;
1041 	bp = req->vbr_bp;
1042 	ordered = 0;
1043 	writable = 0;
1044 
1045 	/*
1046 	 * If we paused request queueing while we waited for busdma to call us
1047 	 * asynchronously, unpause it now; this request made it through so we
1048 	 * don't need to worry about others getting ahead of us.  (Note that we
1049 	 * hold the device mutex so nothing will happen until after we return
1050 	 * anyway.)
1051 	 */
1052 	if (req->vbr_busdma_wait)
1053 		sc->vtblk_flags &= ~VTBLK_FLAG_BUSDMA_WAIT;
1054 
1055 	/* Fail on errors from busdma. */
1056 	if (error)
1057 		goto out1;
1058 
1059 	/*
1060 	 * Some hosts (such as bhyve) do not implement the barrier feature,
1061 	 * so we emulate it in the driver by allowing the barrier request
1062 	 * to be the only one in flight.
1063 	 */
1064 	if ((sc->vtblk_flags & VTBLK_FLAG_BARRIER) == 0) {
1065 		if (sc->vtblk_req_ordered != NULL) {
1066 			error = EBUSY;
1067 			goto out;
1068 		}
1069 		if (bp->bio_flags & BIO_ORDERED) {
1070 			if (!virtqueue_empty(vq)) {
1071 				error = EBUSY;
1072 				goto out;
1073 			}
1074 			ordered = 1;
1075 			req->vbr_hdr.type &= vtblk_gtoh32(sc,
1076 				~VIRTIO_BLK_T_BARRIER);
1077 		}
1078 	}
1079 
1080 	sglist_reset(sg);
1081 	sglist_append(sg, &req->vbr_hdr, sizeof(struct virtio_blk_outhdr));
1082 
1083 	if (bp->bio_cmd == BIO_READ || bp->bio_cmd == BIO_WRITE) {
1084 		/*
1085 		 * We cast bus_addr_t to vm_paddr_t here; since we skip the
1086 		 * iommu mapping (see vtblk_attach) this should be safe.
1087 		 */
1088 		for (i = 0; i < nseg; i++) {
1089 			error = sglist_append_phys(sg,
1090 			    (vm_paddr_t)segs[i].ds_addr, segs[i].ds_len);
1091 			if (error || sg->sg_nseg == sg->sg_maxseg) {
1092 				panic("%s: bio %p data buffer too big %d",
1093 				    __func__, bp, error);
1094 			}
1095 		}
1096 
1097 		/* Special handling for dump, which bypasses busdma. */
1098 		if (req->vbr_mapp == NULL) {
1099 			error = sglist_append_bio(sg, bp);
1100 			if (error || sg->sg_nseg == sg->sg_maxseg) {
1101 				panic("%s: bio %p data buffer too big %d",
1102 				    __func__, bp, error);
1103 			}
1104 		}
1105 
1106 		/* BIO_READ means the host writes into our buffer. */
1107 		if (bp->bio_cmd == BIO_READ)
1108 			writable = sg->sg_nseg - 1;
1109 	} else if (bp->bio_cmd == BIO_DELETE) {
1110 		struct virtio_blk_discard_write_zeroes *discard;
1111 
1112 		discard = malloc(sizeof(*discard), M_DEVBUF, M_NOWAIT | M_ZERO);
1113 		if (discard == NULL) {
1114 			error = ENOMEM;
1115 			goto out;
1116 		}
1117 
1118 		bp->bio_driver1 = discard;
1119 		discard->sector = vtblk_gtoh64(sc, bp->bio_offset / VTBLK_BSIZE);
1120 		discard->num_sectors = vtblk_gtoh32(sc, bp->bio_bcount / VTBLK_BSIZE);
1121 		error = sglist_append(sg, discard, sizeof(*discard));
1122 		if (error || sg->sg_nseg == sg->sg_maxseg) {
1123 			panic("%s: bio %p data buffer too big %d",
1124 			    __func__, bp, error);
1125 		}
1126 	}
1127 
1128 	writable++;
1129 	sglist_append(sg, &req->vbr_ack, sizeof(uint8_t));
1130 	readable = sg->sg_nseg - writable;
1131 
1132 	if (req->vbr_mapp != NULL) {
1133 		switch (bp->bio_cmd) {
1134 		case BIO_READ:
1135 			bus_dmamap_sync(sc->vtblk_dmat, req->vbr_mapp,
1136 			    BUS_DMASYNC_PREREAD);
1137 			break;
1138 		case BIO_WRITE:
1139 			bus_dmamap_sync(sc->vtblk_dmat, req->vbr_mapp,
1140 			    BUS_DMASYNC_PREWRITE);
1141 			break;
1142 		}
1143 	}
1144 
1145 	error = virtqueue_enqueue(vq, req, sg, readable, writable);
1146 	if (error == 0 && ordered)
1147 		sc->vtblk_req_ordered = req;
1148 
1149 	/*
1150 	 * If we were called asynchronously, we need to notify the queue that
1151 	 * we've added a new request, since the notification from startio was
1152 	 * performed already.
1153 	 */
1154 	if (error == 0 && req->vbr_busdma_wait)
1155 		virtqueue_notify(vq);
1156 
1157 out:
1158 	if (error && (req->vbr_mapp != NULL))
1159 		bus_dmamap_unload(sc->vtblk_dmat, req->vbr_mapp);
1160 out1:
1161 	if (error && req->vbr_requeue_on_error)
1162 		vtblk_request_requeue_ready(sc, req);
1163 	req->vbr_error = error;
1164 }
1165 
1166 static int
1167 vtblk_request_error(struct vtblk_request *req)
1168 {
1169 	int error;
1170 
1171 	switch (req->vbr_ack) {
1172 	case VIRTIO_BLK_S_OK:
1173 		error = 0;
1174 		break;
1175 	case VIRTIO_BLK_S_UNSUPP:
1176 		error = ENOTSUP;
1177 		break;
1178 	default:
1179 		error = EIO;
1180 		break;
1181 	}
1182 
1183 	return (error);
1184 }
1185 
1186 static struct bio *
1187 vtblk_queue_complete_one(struct vtblk_softc *sc, struct vtblk_request *req)
1188 {
1189 	struct bio *bp;
1190 
1191 	if (sc->vtblk_req_ordered != NULL) {
1192 		MPASS(sc->vtblk_req_ordered == req);
1193 		sc->vtblk_req_ordered = NULL;
1194 	}
1195 
1196 	bp = req->vbr_bp;
1197 	if (req->vbr_mapp != NULL) {
1198 		switch (bp->bio_cmd) {
1199 		case BIO_READ:
1200 			bus_dmamap_sync(sc->vtblk_dmat, req->vbr_mapp,
1201 			    BUS_DMASYNC_POSTREAD);
1202 			bus_dmamap_unload(sc->vtblk_dmat, req->vbr_mapp);
1203 			break;
1204 		case BIO_WRITE:
1205 			bus_dmamap_sync(sc->vtblk_dmat, req->vbr_mapp,
1206 			    BUS_DMASYNC_POSTWRITE);
1207 			bus_dmamap_unload(sc->vtblk_dmat, req->vbr_mapp);
1208 			break;
1209 		}
1210 	}
1211 	bp->bio_error = vtblk_request_error(req);
1212 	return (bp);
1213 }
1214 
1215 static void
1216 vtblk_queue_completed(struct vtblk_softc *sc, struct bio_queue *queue)
1217 {
1218 	struct vtblk_request *req;
1219 	struct bio *bp;
1220 
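	/*
	 * Pull completed requests off the virtqueue, record each bio's
	 * completion status, collect the bios on the caller's queue, and
	 * return the request structures to the free list.
	 */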
1221 	while ((req = virtqueue_dequeue(sc->vtblk_vq, NULL)) != NULL) {
1222 		bp = vtblk_queue_complete_one(sc, req);
1223 
1224 		TAILQ_INSERT_TAIL(queue, bp, bio_queue);
1225 		vtblk_request_enqueue(sc, req);
1226 	}
1227 }
1228 
1229 static void
1230 vtblk_done_completed(struct vtblk_softc *sc, struct bio_queue *queue)
1231 {
1232 	struct bio *bp, *tmp;
1233 
1234 	TAILQ_FOREACH_SAFE(bp, queue, bio_queue, tmp) {
1235 		if (bp->bio_error != 0)
1236 			disk_err(bp, "hard error", -1, 1);
1237 		vtblk_bio_done(sc, bp, bp->bio_error);
1238 	}
1239 }
1240 
1241 static void
1242 vtblk_drain_vq(struct vtblk_softc *sc)
1243 {
1244 	struct virtqueue *vq;
1245 	struct vtblk_request *req;
1246 	int last;
1247 
1248 	vq = sc->vtblk_vq;
1249 	last = 0;
1250 
1251 	while ((req = virtqueue_drain(vq, &last)) != NULL) {
1252 		vtblk_bio_done(sc, req->vbr_bp, ENXIO);
1253 		vtblk_request_enqueue(sc, req);
1254 	}
1255 
1256 	sc->vtblk_req_ordered = NULL;
1257 	KASSERT(virtqueue_empty(vq), ("virtqueue not empty"));
1258 }
1259 
1260 static void
1261 vtblk_drain(struct vtblk_softc *sc)
1262 {
1263 	struct bio_queue_head *bioq;
1264 	struct vtblk_request *req;
1265 	struct bio *bp;
1266 
1267 	bioq = &sc->vtblk_bioq;
1268 
1269 	if (sc->vtblk_vq != NULL) {
1270 		struct bio_queue queue;
1271 
1272 		TAILQ_INIT(&queue);
1273 		vtblk_queue_completed(sc, &queue);
1274 		vtblk_done_completed(sc, &queue);
1275 
1276 		vtblk_drain_vq(sc);
1277 	}
1278 
1279 	while ((req = vtblk_request_next_ready(sc)) != NULL) {
1280 		vtblk_bio_done(sc, req->vbr_bp, ENXIO);
1281 		vtblk_request_enqueue(sc, req);
1282 	}
1283 
1284 	while (bioq_first(bioq) != NULL) {
1285 		bp = bioq_takefirst(bioq);
1286 		vtblk_bio_done(sc, bp, ENXIO);
1287 	}
1288 
1289 	vtblk_request_free(sc);
1290 }
1291 
1292 static void
1293 vtblk_startio(struct vtblk_softc *sc)
1294 {
1295 	struct virtqueue *vq;
1296 	struct vtblk_request *req;
1297 	int enq;
1298 
1299 	VTBLK_LOCK_ASSERT(sc);
1300 	vq = sc->vtblk_vq;
1301 	enq = 0;
1302 
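	/*
	 * Hold off new work while suspending or while a deferred busdma
	 * load is still outstanding.
	 */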
1303 	if (sc->vtblk_flags & (VTBLK_FLAG_SUSPEND | VTBLK_FLAG_BUSDMA_WAIT))
1304 		return;
1305 
1306 	while (!virtqueue_full(vq)) {
1307 		req = vtblk_request_next(sc);
1308 		if (req == NULL)
1309 			break;
1310 
1311 		req->vbr_requeue_on_error = 1;
1312 		if (vtblk_request_execute(req, BUS_DMA_WAITOK))
1313 			break;
1314 
1315 		enq++;
1316 	}
1317 
1318 	if (enq > 0)
1319 		virtqueue_notify(vq);
1320 }
1321 
1322 static void
1323 vtblk_bio_done(struct vtblk_softc *sc, struct bio *bp, int error)
1324 {
1325 
1326 	/* Because of GEOM direct dispatch, we cannot hold any locks. */
1327 	if (sc != NULL)
1328 		VTBLK_LOCK_ASSERT_NOTOWNED(sc);
1329 
1330 	if (error) {
1331 		bp->bio_resid = bp->bio_bcount;
1332 		bp->bio_error = error;
1333 		bp->bio_flags |= BIO_ERROR;
1334 	} else {
1335 		kmsan_mark_bio(bp, KMSAN_STATE_INITED);
1336 	}
1337 
1338 	if (bp->bio_driver1 != NULL) {
1339 		free(bp->bio_driver1, M_DEVBUF);
1340 		bp->bio_driver1 = NULL;
1341 	}
1342 
1343 	biodone(bp);
1344 }
1345 
1346 #define VTBLK_GET_CONFIG(_dev, _feature, _field, _cfg)			\
1347 	if (virtio_with_feature(_dev, _feature)) {			\
1348 		virtio_read_device_config(_dev,				\
1349 		    offsetof(struct virtio_blk_config, _field),		\
1350 		    &(_cfg)->_field, sizeof((_cfg)->_field));		\
1351 	}
1352 
1353 static void
1354 vtblk_read_config(struct vtblk_softc *sc, struct virtio_blk_config *blkcfg)
1355 {
1356 	device_t dev;
1357 
1358 	dev = sc->vtblk_dev;
1359 
1360 	bzero(blkcfg, sizeof(struct virtio_blk_config));
1361 
1362 	/* The capacity is always available. */
1363 	virtio_read_device_config(dev, offsetof(struct virtio_blk_config,
1364 	    capacity), &blkcfg->capacity, sizeof(blkcfg->capacity));
1365 
1366 	/* Read the configuration if the feature was negotiated. */
1367 	VTBLK_GET_CONFIG(dev, VIRTIO_BLK_F_SIZE_MAX, size_max, blkcfg);
1368 	VTBLK_GET_CONFIG(dev, VIRTIO_BLK_F_SEG_MAX, seg_max, blkcfg);
1369 	VTBLK_GET_CONFIG(dev, VIRTIO_BLK_F_GEOMETRY,
1370 	    geometry.cylinders, blkcfg);
1371 	VTBLK_GET_CONFIG(dev, VIRTIO_BLK_F_GEOMETRY,
1372 	    geometry.heads, blkcfg);
1373 	VTBLK_GET_CONFIG(dev, VIRTIO_BLK_F_GEOMETRY,
1374 	    geometry.sectors, blkcfg);
1375 	VTBLK_GET_CONFIG(dev, VIRTIO_BLK_F_BLK_SIZE, blk_size, blkcfg);
1376 	VTBLK_GET_CONFIG(dev, VIRTIO_BLK_F_TOPOLOGY,
1377 	    topology.physical_block_exp, blkcfg);
1378 	VTBLK_GET_CONFIG(dev, VIRTIO_BLK_F_TOPOLOGY,
1379 	    topology.alignment_offset, blkcfg);
1380 	VTBLK_GET_CONFIG(dev, VIRTIO_BLK_F_TOPOLOGY,
1381 	    topology.min_io_size, blkcfg);
1382 	VTBLK_GET_CONFIG(dev, VIRTIO_BLK_F_TOPOLOGY,
1383 	    topology.opt_io_size, blkcfg);
1384 	VTBLK_GET_CONFIG(dev, VIRTIO_BLK_F_CONFIG_WCE, wce, blkcfg);
1385 	VTBLK_GET_CONFIG(dev, VIRTIO_BLK_F_DISCARD, max_discard_sectors,
1386 	    blkcfg);
1387 	VTBLK_GET_CONFIG(dev, VIRTIO_BLK_F_DISCARD, max_discard_seg, blkcfg);
1388 	VTBLK_GET_CONFIG(dev, VIRTIO_BLK_F_DISCARD, discard_sector_alignment,
1389 	    blkcfg);
1390 }
1391 
1392 #undef VTBLK_GET_CONFIG
1393 
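/*
 * Issue a VIRTIO_BLK_T_GET_ID request to fetch the device's identify
 * string into the disk ident, unless disabled by the no_ident tunable.
 */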
1394 static void
1395 vtblk_ident(struct vtblk_softc *sc)
1396 {
1397 	struct bio buf;
1398 	struct disk *dp;
1399 	struct vtblk_request *req;
1400 	int len, error;
1401 
1402 	dp = sc->vtblk_disk;
1403 	len = MIN(VIRTIO_BLK_ID_BYTES, DISK_IDENT_SIZE);
1404 
1405 	if (vtblk_tunable_int(sc, "no_ident", vtblk_no_ident) != 0)
1406 		return;
1407 
1408 	req = vtblk_request_dequeue(sc);
1409 	if (req == NULL)
1410 		return;
1411 
1412 	req->vbr_ack = -1;
1413 	req->vbr_hdr.type = vtblk_gtoh32(sc, VIRTIO_BLK_T_GET_ID);
1414 	req->vbr_hdr.ioprio = vtblk_gtoh32(sc, 1);
1415 	req->vbr_hdr.sector = 0;
1416 
1417 	req->vbr_bp = &buf;
1418 	g_reset_bio(&buf);
1419 
1420 	buf.bio_cmd = BIO_READ;
1421 	buf.bio_data = dp->d_ident;
1422 	buf.bio_bcount = len;
1423 
1424 	VTBLK_LOCK(sc);
1425 	error = vtblk_poll_request(sc, req);
1426 	VTBLK_UNLOCK(sc);
1427 
1428 	if (error) {
1429 		device_printf(sc->vtblk_dev,
1430 		    "error getting device identifier: %d\n", error);
1431 	}
1432 }
1433 
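/*
 * Execute a single request synchronously, spinning until the host completes
 * it. The virtqueue must otherwise be empty; this is only used for the
 * identify request and while dumping.
 */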
1434 static int
1435 vtblk_poll_request(struct vtblk_softc *sc, struct vtblk_request *req)
1436 {
1437 	struct vtblk_request *req1 __diagused;
1438 	struct virtqueue *vq;
1439 	struct bio *bp;
1440 	int error;
1441 
1442 	vq = sc->vtblk_vq;
1443 
1444 	if (!virtqueue_empty(vq))
1445 		return (EBUSY);
1446 
1447 	error = vtblk_request_execute(req, BUS_DMA_NOWAIT);
1448 	if (error)
1449 		return (error);
1450 
1451 	virtqueue_notify(vq);
1452 	req1 = virtqueue_poll(vq, NULL);
1453 	KASSERT(req == req1,
1454 	    ("%s: polling completed %p not %p", __func__, req1, req));
1455 
1456 	bp = vtblk_queue_complete_one(sc, req);
1457 	error = bp->bio_error;
1458 	if (error && bootverbose) {
1459 		device_printf(sc->vtblk_dev,
1460 		    "%s: IO error: %d\n", __func__, error);
1461 	}
1462 	if (req != &sc->vtblk_dump_request)
1463 		vtblk_request_enqueue(sc, req);
1464 
1465 	return (error);
1466 }
1467 
1468 static int
1469 vtblk_quiesce(struct vtblk_softc *sc)
1470 {
1471 	int error;
1472 
1473 	VTBLK_LOCK_ASSERT(sc);
1474 	error = 0;
1475 
1476 	while (!virtqueue_empty(sc->vtblk_vq)) {
1477 		if (mtx_sleep(&sc->vtblk_vq, VTBLK_MTX(sc), PRIBIO, "vtblkq",
1478 		    VTBLK_QUIESCE_TIMEOUT) == EWOULDBLOCK) {
1479 			error = EBUSY;
1480 			break;
1481 		}
1482 	}
1483 
1484 	return (error);
1485 }
1486 
1487 static void
1488 vtblk_vq_intr(void *xsc)
1489 {
1490 	struct vtblk_softc *sc;
1491 	struct virtqueue *vq;
1492 	struct bio_queue queue;
1493 
1494 	sc = xsc;
1495 	vq = sc->vtblk_vq;
1496 	TAILQ_INIT(&queue);
1497 
1498 	VTBLK_LOCK(sc);
1499 
1500 again:
1501 	if (sc->vtblk_flags & VTBLK_FLAG_DETACH)
1502 		goto out;
1503 
1504 	vtblk_queue_completed(sc, &queue);
1505 	vtblk_startio(sc);
1506 
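	/*
	 * Re-enable interrupts; if more requests completed while they were
	 * disabled, loop and process them so the final completions are not
	 * missed.
	 */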
1507 	if (virtqueue_enable_intr(vq) != 0) {
1508 		virtqueue_disable_intr(vq);
1509 		goto again;
1510 	}
1511 
1512 	if (sc->vtblk_flags & VTBLK_FLAG_SUSPEND)
1513 		wakeup(&sc->vtblk_vq);
1514 
1515 out:
1516 	VTBLK_UNLOCK(sc);
1517 	vtblk_done_completed(sc, &queue);
1518 }
1519 
1520 static void
1521 vtblk_stop(struct vtblk_softc *sc)
1522 {
1523 
1524 	virtqueue_disable_intr(sc->vtblk_vq);
1525 	virtio_stop(sc->vtblk_dev);
1526 }
1527 
1528 static void
1529 vtblk_dump_quiesce(struct vtblk_softc *sc)
1530 {
1531 
1532 	/*
1533 	 * Spin here until all the requests in-flight at the time of the
1534 	 * dump are completed and queued. The queued requests will be
1535 	 * biodone'd once the dump is finished.
1536 	 */
1537 	while (!virtqueue_empty(sc->vtblk_vq))
1538 		vtblk_queue_completed(sc, &sc->vtblk_dump_queue);
1539 }
1540 
1541 static int
1542 vtblk_dump_write(struct vtblk_softc *sc, void *virtual, off_t offset,
1543     size_t length)
1544 {
1545 	struct bio buf;
1546 	struct vtblk_request *req;
1547 
1548 	req = &sc->vtblk_dump_request;
1549 	req->vbr_sc = sc;
1550 	req->vbr_ack = -1;
1551 	req->vbr_hdr.type = vtblk_gtoh32(sc, VIRTIO_BLK_T_OUT);
1552 	req->vbr_hdr.ioprio = vtblk_gtoh32(sc, 1);
1553 	req->vbr_hdr.sector = vtblk_gtoh64(sc, offset / VTBLK_BSIZE);
1554 
1555 	req->vbr_bp = &buf;
1556 	g_reset_bio(&buf);
1557 
1558 	buf.bio_cmd = BIO_WRITE;
1559 	buf.bio_data = virtual;
1560 	buf.bio_bcount = length;
1561 
1562 	return (vtblk_poll_request(sc, req));
1563 }
1564 
1565 static int
1566 vtblk_dump_flush(struct vtblk_softc *sc)
1567 {
1568 	struct bio buf;
1569 	struct vtblk_request *req;
1570 
1571 	req = &sc->vtblk_dump_request;
1572 	req->vbr_sc = sc;
1573 	req->vbr_ack = -1;
1574 	req->vbr_hdr.type = vtblk_gtoh32(sc, VIRTIO_BLK_T_FLUSH);
1575 	req->vbr_hdr.ioprio = vtblk_gtoh32(sc, 1);
1576 	req->vbr_hdr.sector = 0;
1577 
1578 	req->vbr_bp = &buf;
1579 	g_reset_bio(&buf);
1580 
1581 	buf.bio_cmd = BIO_FLUSH;
1582 
1583 	return (vtblk_poll_request(sc, req));
1584 }
1585 
1586 static void
1587 vtblk_dump_complete(struct vtblk_softc *sc)
1588 {
1589 
1590 	vtblk_dump_flush(sc);
1591 
1592 	VTBLK_UNLOCK(sc);
1593 	vtblk_done_completed(sc, &sc->vtblk_dump_queue);
1594 	VTBLK_LOCK(sc);
1595 }
1596 
1597 static void
1598 vtblk_set_write_cache(struct vtblk_softc *sc, int wc)
1599 {
1600 
1601 	/* Set either writeback (1) or writethrough (0) mode. */
1602 	virtio_write_dev_config_1(sc->vtblk_dev,
1603 	    offsetof(struct virtio_blk_config, wce), wc);
1604 }
1605 
1606 static int
1607 vtblk_write_cache_enabled(struct vtblk_softc *sc,
1608     struct virtio_blk_config *blkcfg)
1609 {
1610 	int wc;
1611 
1612 	if (sc->vtblk_flags & VTBLK_FLAG_WCE_CONFIG) {
1613 		wc = vtblk_tunable_int(sc, "writecache_mode",
1614 		    vtblk_writecache_mode);
1615 		if (wc >= 0 && wc < VTBLK_CACHE_MAX)
1616 			vtblk_set_write_cache(sc, wc);
1617 		else
1618 			wc = blkcfg->wce;
1619 	} else
1620 		wc = virtio_with_feature(sc->vtblk_dev, VIRTIO_BLK_F_FLUSH);
1621 
1622 	return (wc);
1623 }
1624 
1625 static int
1626 vtblk_write_cache_sysctl(SYSCTL_HANDLER_ARGS)
1627 {
1628 	struct vtblk_softc *sc;
1629 	int wc, error;
1630 
1631 	sc = oidp->oid_arg1;
1632 	wc = sc->vtblk_write_cache;
1633 
1634 	error = sysctl_handle_int(oidp, &wc, 0, req);
1635 	if (error || req->newptr == NULL)
1636 		return (error);
1637 	if ((sc->vtblk_flags & VTBLK_FLAG_WCE_CONFIG) == 0)
1638 		return (EPERM);
1639 	if (wc < 0 || wc >= VTBLK_CACHE_MAX)
1640 		return (EINVAL);
1641 
1642 	VTBLK_LOCK(sc);
1643 	sc->vtblk_write_cache = wc;
1644 	vtblk_set_write_cache(sc, sc->vtblk_write_cache);
1645 	VTBLK_UNLOCK(sc);
1646 
1647 	return (0);
1648 }
1649 
1650 static void
1651 vtblk_setup_sysctl(struct vtblk_softc *sc)
1652 {
1653 	device_t dev;
1654 	struct sysctl_ctx_list *ctx;
1655 	struct sysctl_oid *tree;
1656 	struct sysctl_oid_list *child;
1657 
1658 	dev = sc->vtblk_dev;
1659 	ctx = device_get_sysctl_ctx(dev);
1660 	tree = device_get_sysctl_tree(dev);
1661 	child = SYSCTL_CHILDREN(tree);
1662 
1663 	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "writecache_mode",
1664 	    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
1665 	    vtblk_write_cache_sysctl, "I",
1666 	    "Write cache mode (writethrough (0) or writeback (1))");
1667 }
1668 
1669 static int
1670 vtblk_tunable_int(struct vtblk_softc *sc, const char *knob, int def)
1671 {
1672 	char path[64];
1673 
1674 	snprintf(path, sizeof(path),
1675 	    "hw.vtblk.%d.%s", device_get_unit(sc->vtblk_dev), knob);
1676 	TUNABLE_INT_FETCH(path, &def);
1677 
1678 	return (def);
1679 }
1680