/*-
 * Copyright (c) 2011, Bryan Venteicher <bryanv@daemoninthecloset.org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice unmodified, this list of conditions, and the following
 *    disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/* Driver for VirtIO block devices. */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/bio.h>
#include <sys/malloc.h>
#include <sys/module.h>
#include <sys/sglist.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/queue.h>
#include <sys/taskqueue.h>

#include <geom/geom_disk.h>

#include <machine/bus.h>
#include <machine/resource.h>
#include <sys/bus.h>
#include <sys/rman.h>

#include <dev/virtio/virtio.h>
#include <dev/virtio/virtqueue.h>
#include <dev/virtio/block/virtio_blk.h>

#include "virtio_if.h"

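/*
 * A preallocated vtblk_request tracks a single in-flight command: the
 * device-readable header, the bio being serviced, and the device-writable
 * status (ack) byte.
 */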
struct vtblk_request {
	struct virtio_blk_outhdr	 vbr_hdr;
	struct bio			*vbr_bp;
	uint8_t				 vbr_ack;

	TAILQ_ENTRY(vtblk_request)	 vbr_link;
};

struct vtblk_softc {
	device_t		 vtblk_dev;
	struct mtx		 vtblk_mtx;
	uint64_t		 vtblk_features;
	uint32_t		 vtblk_flags;
#define VTBLK_FLAG_INDIRECT	0x0001
#define VTBLK_FLAG_READONLY	0x0002
#define VTBLK_FLAG_DETACH	0x0004
#define VTBLK_FLAG_SUSPEND	0x0008
#define VTBLK_FLAG_DUMPING	0x0010

	struct virtqueue	*vtblk_vq;
	struct sglist		*vtblk_sglist;
	struct disk		*vtblk_disk;

	struct bio_queue_head	 vtblk_bioq;
	TAILQ_HEAD(, vtblk_request)
				 vtblk_req_free;
	TAILQ_HEAD(, vtblk_request)
				 vtblk_req_ready;

	struct taskqueue	*vtblk_tq;
	struct task		 vtblk_intr_task;

	int			 vtblk_max_nsegs;
	int			 vtblk_request_count;

	struct vtblk_request	 vtblk_dump_request;
};

static struct virtio_feature_desc vtblk_feature_desc[] = {
	{ VIRTIO_BLK_F_BARRIER,		"HostBarrier"	},
	{ VIRTIO_BLK_F_SIZE_MAX,	"MaxSegSize"	},
	{ VIRTIO_BLK_F_SEG_MAX,		"MaxNumSegs"	},
	{ VIRTIO_BLK_F_GEOMETRY,	"DiskGeometry"	},
	{ VIRTIO_BLK_F_RO,		"ReadOnly"	},
	{ VIRTIO_BLK_F_BLK_SIZE,	"BlockSize"	},
	{ VIRTIO_BLK_F_SCSI,		"SCSICmds"	},
	{ VIRTIO_BLK_F_FLUSH,		"FlushCmd"	},
	{ VIRTIO_BLK_F_TOPOLOGY,	"Topology"	},

	{ 0, NULL }
};

static int	vtblk_modevent(module_t, int, void *);

static int	vtblk_probe(device_t);
static int	vtblk_attach(device_t);
static int	vtblk_detach(device_t);
static int	vtblk_suspend(device_t);
static int	vtblk_resume(device_t);
static int	vtblk_shutdown(device_t);

static int	vtblk_open(struct disk *);
static int	vtblk_close(struct disk *);
static int	vtblk_ioctl(struct disk *, u_long, void *, int,
		    struct thread *);
static int	vtblk_dump(void *, void *, vm_offset_t, off_t, size_t);
static void	vtblk_strategy(struct bio *);

static void	vtblk_negotiate_features(struct vtblk_softc *);
static int	vtblk_maximum_segments(struct vtblk_softc *,
		    struct virtio_blk_config *);
static int	vtblk_alloc_virtqueue(struct vtblk_softc *);
static void	vtblk_alloc_disk(struct vtblk_softc *,
		    struct virtio_blk_config *);
static void	vtblk_create_disk(struct vtblk_softc *);

static int	vtblk_quiesce(struct vtblk_softc *);
static void	vtblk_startio(struct vtblk_softc *);
static struct vtblk_request * vtblk_bio_request(struct vtblk_softc *);
static int	vtblk_execute_request(struct vtblk_softc *,
		    struct vtblk_request *);

static int	vtblk_vq_intr(void *);
static void	vtblk_intr_task(void *, int);

static void	vtblk_stop(struct vtblk_softc *);

static void	vtblk_get_ident(struct vtblk_softc *);
static void	vtblk_prepare_dump(struct vtblk_softc *);
static int	vtblk_write_dump(struct vtblk_softc *, void *, off_t, size_t);
static int	vtblk_flush_dump(struct vtblk_softc *);
static int	vtblk_poll_request(struct vtblk_softc *,
		    struct vtblk_request *);

static void	vtblk_finish_completed(struct vtblk_softc *);
static void	vtblk_drain_vq(struct vtblk_softc *, int);
static void	vtblk_drain(struct vtblk_softc *);

static int	vtblk_alloc_requests(struct vtblk_softc *);
static void	vtblk_free_requests(struct vtblk_softc *);
static struct vtblk_request * vtblk_dequeue_request(struct vtblk_softc *);
static void	vtblk_enqueue_request(struct vtblk_softc *,
		    struct vtblk_request *);

static struct vtblk_request * vtblk_dequeue_ready(struct vtblk_softc *);
static void	vtblk_enqueue_ready(struct vtblk_softc *,
		    struct vtblk_request *);

static int	vtblk_request_error(struct vtblk_request *);
static void	vtblk_finish_bio(struct bio *, int);

/* Tunables. */
static int vtblk_no_ident = 0;
TUNABLE_INT("hw.vtblk.no_ident", &vtblk_no_ident);

/* Features desired/implemented by this driver. */
#define VTBLK_FEATURES \
    (VIRTIO_BLK_F_BARRIER		| \
     VIRTIO_BLK_F_SIZE_MAX		| \
     VIRTIO_BLK_F_SEG_MAX		| \
     VIRTIO_BLK_F_GEOMETRY		| \
     VIRTIO_BLK_F_RO			| \
     VIRTIO_BLK_F_BLK_SIZE		| \
     VIRTIO_BLK_F_FLUSH			| \
     VIRTIO_RING_F_INDIRECT_DESC)

#define VTBLK_MTX(_sc)		&(_sc)->vtblk_mtx
#define VTBLK_LOCK_INIT(_sc, _name) \
				mtx_init(VTBLK_MTX((_sc)), (_name), \
				    "VTBLK Lock", MTX_DEF)
#define VTBLK_LOCK(_sc)		mtx_lock(VTBLK_MTX((_sc)))
#define VTBLK_UNLOCK(_sc)	mtx_unlock(VTBLK_MTX((_sc)))
#define VTBLK_LOCK_DESTROY(_sc)	mtx_destroy(VTBLK_MTX((_sc)))
#define VTBLK_LOCK_ASSERT(_sc)	mtx_assert(VTBLK_MTX((_sc)), MA_OWNED)
#define VTBLK_LOCK_ASSERT_NOTOWNED(_sc) \
				mtx_assert(VTBLK_MTX((_sc)), MA_NOTOWNED)

#define VTBLK_DISK_NAME		"vtbd"
#define VTBLK_QUIESCE_TIMEOUT	(30 * hz)

/*
 * Each block request uses at least two segments - one for the header
 * and one for the status.
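 *
 * In virtqueue order these are: the device-readable header, then any
 * data segments (readable by the host for writes, writable for reads),
 * and last the device-writable status (ack) byte.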
 */
#define VTBLK_MIN_SEGMENTS	2

static device_method_t vtblk_methods[] = {
	/* Device methods. */
	DEVMETHOD(device_probe,		vtblk_probe),
	DEVMETHOD(device_attach,	vtblk_attach),
	DEVMETHOD(device_detach,	vtblk_detach),
	DEVMETHOD(device_suspend,	vtblk_suspend),
	DEVMETHOD(device_resume,	vtblk_resume),
	DEVMETHOD(device_shutdown,	vtblk_shutdown),

	DEVMETHOD_END
};

static driver_t vtblk_driver = {
	"vtblk",
	vtblk_methods,
	sizeof(struct vtblk_softc)
};
static devclass_t vtblk_devclass;

DRIVER_MODULE(virtio_blk, virtio_pci, vtblk_driver, vtblk_devclass,
    vtblk_modevent, 0);
MODULE_VERSION(virtio_blk, 1);
MODULE_DEPEND(virtio_blk, virtio, 1, 1, 1);

static int
vtblk_modevent(module_t mod, int type, void *unused)
{
	int error;

	error = 0;

	switch (type) {
	case MOD_LOAD:
	case MOD_QUIESCE:
	case MOD_UNLOAD:
	case MOD_SHUTDOWN:
		break;
	default:
		error = EOPNOTSUPP;
		break;
	}

	return (error);
}

static int
vtblk_probe(device_t dev)
{

	if (virtio_get_device_type(dev) != VIRTIO_ID_BLOCK)
		return (ENXIO);

	device_set_desc(dev, "VirtIO Block Adapter");

	return (BUS_PROBE_DEFAULT);
}

static int
vtblk_attach(device_t dev)
{
	struct vtblk_softc *sc;
	struct virtio_blk_config blkcfg;
	int error;

	sc = device_get_softc(dev);
	sc->vtblk_dev = dev;

	VTBLK_LOCK_INIT(sc, device_get_nameunit(dev));

	bioq_init(&sc->vtblk_bioq);
	TAILQ_INIT(&sc->vtblk_req_free);
	TAILQ_INIT(&sc->vtblk_req_ready);

	virtio_set_feature_desc(dev, vtblk_feature_desc);
	vtblk_negotiate_features(sc);

	if (virtio_with_feature(dev, VIRTIO_RING_F_INDIRECT_DESC))
		sc->vtblk_flags |= VTBLK_FLAG_INDIRECT;

	if (virtio_with_feature(dev, VIRTIO_BLK_F_RO))
		sc->vtblk_flags |= VTBLK_FLAG_READONLY;

	/* Get local copy of config. */
	virtio_read_device_config(dev, 0, &blkcfg,
	    sizeof(struct virtio_blk_config));

	/*
	 * With the current sglist(9) implementation, it is not easy
	 * for us to support a maximum segment size as adjacent
	 * segments are coalesced. For now, just make sure it's larger
	 * than the maximum supported transfer size.
	 */
	if (virtio_with_feature(dev, VIRTIO_BLK_F_SIZE_MAX)) {
		if (blkcfg.size_max < MAXPHYS) {
			error = ENOTSUP;
			device_printf(dev, "host requires unsupported "
			    "maximum segment size feature\n");
			goto fail;
		}
	}

	sc->vtblk_max_nsegs = vtblk_maximum_segments(sc, &blkcfg);
	if (sc->vtblk_max_nsegs <= VTBLK_MIN_SEGMENTS) {
		error = EINVAL;
		device_printf(dev, "fewer than minimum number of segments "
		    "allowed: %d\n", sc->vtblk_max_nsegs);
		goto fail;
	}

	sc->vtblk_sglist = sglist_alloc(sc->vtblk_max_nsegs, M_NOWAIT);
	if (sc->vtblk_sglist == NULL) {
		error = ENOMEM;
		device_printf(dev, "cannot allocate sglist\n");
		goto fail;
	}

	error = vtblk_alloc_virtqueue(sc);
	if (error) {
		device_printf(dev, "cannot allocate virtqueue\n");
		goto fail;
	}

	error = vtblk_alloc_requests(sc);
	if (error) {
		device_printf(dev, "cannot preallocate requests\n");
		goto fail;
	}

	vtblk_alloc_disk(sc, &blkcfg);

	TASK_INIT(&sc->vtblk_intr_task, 0, vtblk_intr_task, sc);
	sc->vtblk_tq = taskqueue_create_fast("vtblk_taskq", M_NOWAIT,
	    taskqueue_thread_enqueue, &sc->vtblk_tq);
	if (sc->vtblk_tq == NULL) {
		error = ENOMEM;
		device_printf(dev, "cannot allocate taskqueue\n");
		goto fail;
	}
	taskqueue_start_threads(&sc->vtblk_tq, 1, PI_DISK, "%s taskq",
	    device_get_nameunit(dev));

	error = virtio_setup_intr(dev, INTR_TYPE_BIO | INTR_ENTROPY);
	if (error) {
		device_printf(dev, "cannot setup virtqueue interrupt\n");
		goto fail;
	}

	vtblk_create_disk(sc);

	virtqueue_enable_intr(sc->vtblk_vq);

fail:
	if (error)
		vtblk_detach(dev);

	return (error);
}

static int
vtblk_detach(device_t dev)
{
	struct vtblk_softc *sc;

	sc = device_get_softc(dev);

	VTBLK_LOCK(sc);
	sc->vtblk_flags |= VTBLK_FLAG_DETACH;
	if (device_is_attached(dev))
		vtblk_stop(sc);
	VTBLK_UNLOCK(sc);

	if (sc->vtblk_tq != NULL) {
		taskqueue_drain(sc->vtblk_tq, &sc->vtblk_intr_task);
		taskqueue_free(sc->vtblk_tq);
		sc->vtblk_tq = NULL;
	}

	vtblk_drain(sc);

	if (sc->vtblk_disk != NULL) {
		disk_destroy(sc->vtblk_disk);
		sc->vtblk_disk = NULL;
	}

	if (sc->vtblk_sglist != NULL) {
		sglist_free(sc->vtblk_sglist);
		sc->vtblk_sglist = NULL;
	}

	VTBLK_LOCK_DESTROY(sc);

	return (0);
}

static int
vtblk_suspend(device_t dev)
{
	struct vtblk_softc *sc;
	int error;

	sc = device_get_softc(dev);

	VTBLK_LOCK(sc);
	sc->vtblk_flags |= VTBLK_FLAG_SUSPEND;
	/* XXX BMV: virtio_stop(), etc needed here? */
	error = vtblk_quiesce(sc);
	if (error)
		sc->vtblk_flags &= ~VTBLK_FLAG_SUSPEND;
	VTBLK_UNLOCK(sc);

	return (error);
}

static int
vtblk_resume(device_t dev)
{
	struct vtblk_softc *sc;

	sc = device_get_softc(dev);

	VTBLK_LOCK(sc);
	/* XXX BMV: virtio_reinit(), etc needed here? */
	sc->vtblk_flags &= ~VTBLK_FLAG_SUSPEND;
	vtblk_startio(sc);
	VTBLK_UNLOCK(sc);

	return (0);
}

static int
vtblk_shutdown(device_t dev)
{

	return (0);
}

static int
vtblk_open(struct disk *dp)
{
	struct vtblk_softc *sc;

	if ((sc = dp->d_drv1) == NULL)
		return (ENXIO);

	return (sc->vtblk_flags & VTBLK_FLAG_DETACH ? ENXIO : 0);
}

static int
vtblk_close(struct disk *dp)
{
	struct vtblk_softc *sc;

	if ((sc = dp->d_drv1) == NULL)
		return (ENXIO);

	return (0);
}

static int
vtblk_ioctl(struct disk *dp, u_long cmd, void *addr, int flag,
    struct thread *td)
{
	struct vtblk_softc *sc;

	if ((sc = dp->d_drv1) == NULL)
		return (ENXIO);

	return (ENOTTY);
}

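/*
 * The disk's d_dump entry point, called while the kernel is dumping.
 * The first call quiesces the device into polled operation; a
 * zero-length request with a NULL buffer signals the final flush.
 */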
static int
vtblk_dump(void *arg, void *virtual, vm_offset_t physical, off_t offset,
    size_t length)
{
	struct disk *dp;
	struct vtblk_softc *sc;
	int error;

	dp = arg;

	if ((sc = dp->d_drv1) == NULL)
		return (ENXIO);

	VTBLK_LOCK(sc);

	if ((sc->vtblk_flags & VTBLK_FLAG_DUMPING) == 0) {
		vtblk_prepare_dump(sc);
		sc->vtblk_flags |= VTBLK_FLAG_DUMPING;
	}

	if (length > 0)
		error = vtblk_write_dump(sc, virtual, offset, length);
	else if (virtual == NULL && offset == 0)
		error = vtblk_flush_dump(sc);
	else {
		error = EINVAL;
		sc->vtblk_flags &= ~VTBLK_FLAG_DUMPING;
	}

	VTBLK_UNLOCK(sc);

	return (error);
}

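/*
 * The disk's d_strategy entry point: validate and sort the bio into
 * the queue, then start as much I/O as the virtqueue can hold.
 */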
static void
vtblk_strategy(struct bio *bp)
{
	struct vtblk_softc *sc;

	if ((sc = bp->bio_disk->d_drv1) == NULL) {
		vtblk_finish_bio(bp, EINVAL);
		return;
	}

	/*
	 * Fail any write or flush if the device is read-only.
	 * Unfortunately, there does not seem to be a better way to
	 * report our read-only status to GEOM above.
	 */
	if (sc->vtblk_flags & VTBLK_FLAG_READONLY &&
	    (bp->bio_cmd == BIO_WRITE || bp->bio_cmd == BIO_FLUSH)) {
		vtblk_finish_bio(bp, EROFS);
		return;
	}

#ifdef INVARIANTS
	/*
	 * Prevent read/write buffers spanning too many segments from
	 * getting into the queue. This should only trip if d_maxsize
	 * was incorrectly set.
	 */
	if (bp->bio_cmd == BIO_READ || bp->bio_cmd == BIO_WRITE) {
		int nsegs, max_nsegs;

		nsegs = sglist_count(bp->bio_data, bp->bio_bcount);
		max_nsegs = sc->vtblk_max_nsegs - VTBLK_MIN_SEGMENTS;

		KASSERT(nsegs <= max_nsegs,
		    ("bio %p spanned too many segments: %d, max: %d",
		    bp, nsegs, max_nsegs));
	}
#endif

	VTBLK_LOCK(sc);
	if (sc->vtblk_flags & VTBLK_FLAG_DETACH)
		vtblk_finish_bio(bp, ENXIO);
	else {
		bioq_disksort(&sc->vtblk_bioq, bp);

		if ((sc->vtblk_flags & VTBLK_FLAG_SUSPEND) == 0)
			vtblk_startio(sc);
	}
	VTBLK_UNLOCK(sc);
}

static void
vtblk_negotiate_features(struct vtblk_softc *sc)
{
	device_t dev;
	uint64_t features;

	dev = sc->vtblk_dev;
	features = VTBLK_FEATURES;

	sc->vtblk_features = virtio_negotiate_features(dev, features);
}

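/*
 * Determine how many data segments each request may use: the host's
 * advertised seg_max (capped at the number of pages in a MAXPHYS
 * transfer), plus VTBLK_MIN_SEGMENTS for the header and status,
 * clamped to the indirect descriptor limit when indirect descriptors
 * are in use.
 */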
static int
vtblk_maximum_segments(struct vtblk_softc *sc,
    struct virtio_blk_config *blkcfg)
{
	device_t dev;
	int nsegs;

	dev = sc->vtblk_dev;
	nsegs = VTBLK_MIN_SEGMENTS;

	if (virtio_with_feature(dev, VIRTIO_BLK_F_SEG_MAX)) {
		nsegs += MIN(blkcfg->seg_max, MAXPHYS / PAGE_SIZE + 1);
		if (sc->vtblk_flags & VTBLK_FLAG_INDIRECT)
			nsegs = MIN(nsegs, VIRTIO_MAX_INDIRECT);
	} else
		nsegs += 1;

	return (nsegs);
}

static int
vtblk_alloc_virtqueue(struct vtblk_softc *sc)
{
	device_t dev;
	struct vq_alloc_info vq_info;

	dev = sc->vtblk_dev;

	VQ_ALLOC_INFO_INIT(&vq_info, sc->vtblk_max_nsegs,
	    vtblk_vq_intr, sc, &sc->vtblk_vq,
	    "%s request", device_get_nameunit(dev));

	return (virtio_alloc_virtqueues(dev, 0, 1, &vq_info));
}

static void
vtblk_alloc_disk(struct vtblk_softc *sc, struct virtio_blk_config *blkcfg)
{
	device_t dev;
	struct disk *dp;

	dev = sc->vtblk_dev;

	sc->vtblk_disk = dp = disk_alloc();
	dp->d_open = vtblk_open;
	dp->d_close = vtblk_close;
	dp->d_ioctl = vtblk_ioctl;
	dp->d_strategy = vtblk_strategy;
	dp->d_name = VTBLK_DISK_NAME;
	dp->d_unit = device_get_unit(dev);
	dp->d_drv1 = sc;

	if ((sc->vtblk_flags & VTBLK_FLAG_READONLY) == 0)
		dp->d_dump = vtblk_dump;

	/* Capacity is always in 512-byte units. */
	dp->d_mediasize = blkcfg->capacity * 512;

	if (virtio_with_feature(dev, VIRTIO_BLK_F_BLK_SIZE))
		dp->d_sectorsize = blkcfg->blk_size;
	else
		dp->d_sectorsize = 512;

	/*
	 * The VirtIO maximum I/O size is given in terms of segments.
	 * However, FreeBSD limits I/O size by logical buffer size, not
	 * by physically contiguous pages. Therefore, we have to assume
	 * no pages are contiguous. This may impose an artificially low
	 * maximum I/O size. But in practice, since QEMU advertises 128
	 * segments, this gives us a maximum I/O size of 125 * PAGE_SIZE,
	 * which is typically greater than MAXPHYS. Eventually we should
	 * just advertise MAXPHYS and split buffers that are too big.
	 *
	 * Note that we must subtract one additional segment in case of
	 * non-page-aligned buffers.
	 */
	dp->d_maxsize = (sc->vtblk_max_nsegs - VTBLK_MIN_SEGMENTS - 1) *
	    PAGE_SIZE;
	if (dp->d_maxsize < PAGE_SIZE)
		dp->d_maxsize = PAGE_SIZE; /* XXX */

	if (virtio_with_feature(dev, VIRTIO_BLK_F_GEOMETRY)) {
		dp->d_fwsectors = blkcfg->geometry.sectors;
		dp->d_fwheads = blkcfg->geometry.heads;
	}

	if (virtio_with_feature(dev, VIRTIO_BLK_F_FLUSH))
		dp->d_flags |= DISKFLAG_CANFLUSHCACHE;
}

static void
vtblk_create_disk(struct vtblk_softc *sc)
{
	struct disk *dp;

	dp = sc->vtblk_disk;

	/*
	 * Retrieving the identification string must be done after the
	 * virtqueue interrupt is set up; otherwise it will hang.
	 */
	vtblk_get_ident(sc);

	device_printf(sc->vtblk_dev, "%juMB (%ju %u byte sectors)\n",
	    (uintmax_t) dp->d_mediasize >> 20,
	    (uintmax_t) dp->d_mediasize / dp->d_sectorsize,
	    dp->d_sectorsize);

	disk_create(dp, DISK_VERSION);
}

static int
vtblk_quiesce(struct vtblk_softc *sc)
{
	int error;

	error = 0;

	VTBLK_LOCK_ASSERT(sc);

	while (!virtqueue_empty(sc->vtblk_vq)) {
		if (mtx_sleep(&sc->vtblk_vq, VTBLK_MTX(sc), PRIBIO, "vtblkq",
		    VTBLK_QUIESCE_TIMEOUT) == EWOULDBLOCK) {
			error = EBUSY;
			break;
		}
	}

	return (error);
}

static void
vtblk_startio(struct vtblk_softc *sc)
{
	struct virtqueue *vq;
	struct vtblk_request *req;
	int enq;

	vq = sc->vtblk_vq;
	enq = 0;

	VTBLK_LOCK_ASSERT(sc);

	while (!virtqueue_full(vq)) {
		if ((req = vtblk_dequeue_ready(sc)) == NULL)
			req = vtblk_bio_request(sc);
		if (req == NULL)
			break;

		if (vtblk_execute_request(sc, req) != 0) {
			vtblk_enqueue_ready(sc, req);
			break;
		}

		enq++;
	}

	if (enq > 0)
		virtqueue_notify(vq);
}

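/*
 * Dequeue the next bio and translate it into a block request,
 * converting the byte offset into the 512-byte sectors the VirtIO
 * block protocol expects.
 */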
static struct vtblk_request *
vtblk_bio_request(struct vtblk_softc *sc)
{
	struct bio_queue_head *bioq;
	struct vtblk_request *req;
	struct bio *bp;

	bioq = &sc->vtblk_bioq;

	if (bioq_first(bioq) == NULL)
		return (NULL);

	req = vtblk_dequeue_request(sc);
	if (req == NULL)
		return (NULL);

	bp = bioq_takefirst(bioq);
	req->vbr_bp = bp;
	req->vbr_ack = -1;
	req->vbr_hdr.ioprio = 1;

	switch (bp->bio_cmd) {
	case BIO_FLUSH:
		req->vbr_hdr.type = VIRTIO_BLK_T_FLUSH;
		break;
	case BIO_READ:
		req->vbr_hdr.type = VIRTIO_BLK_T_IN;
		req->vbr_hdr.sector = bp->bio_offset / 512;
		break;
	case BIO_WRITE:
		req->vbr_hdr.type = VIRTIO_BLK_T_OUT;
		req->vbr_hdr.sector = bp->bio_offset / 512;
		break;
	default:
		panic("%s: bio with unhandled cmd: %d", __FUNCTION__,
		    bp->bio_cmd);
	}

	if (bp->bio_flags & BIO_ORDERED)
		req->vbr_hdr.type |= VIRTIO_BLK_T_BARRIER;

	return (req);
}

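/*
 * Build a request's scatter/gather list and enqueue it on the
 * virtqueue. The header is always the first, host-readable segment
 * and the ack byte is always the last, host-writable segment; for
 * BIO_READ the data segments are host-writable as well.
 */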
static int
vtblk_execute_request(struct vtblk_softc *sc, struct vtblk_request *req)
{
	struct sglist *sg;
	struct bio *bp;
	int readable, writable, error;

	sg = sc->vtblk_sglist;
	bp = req->vbr_bp;
	writable = 0;

	VTBLK_LOCK_ASSERT(sc);

	sglist_reset(sg);

	sglist_append(sg, &req->vbr_hdr, sizeof(struct virtio_blk_outhdr));

	if (bp->bio_cmd == BIO_READ || bp->bio_cmd == BIO_WRITE) {
		error = sglist_append(sg, bp->bio_data, bp->bio_bcount);
		if (error || sg->sg_nseg == sg->sg_maxseg)
			panic("%s: data buffer too big bio:%p error:%d",
			    __FUNCTION__, bp, error);

		/* BIO_READ means the host writes into our buffer. */
		if (bp->bio_cmd == BIO_READ)
			writable = sg->sg_nseg - 1;
	}

	writable++;
	sglist_append(sg, &req->vbr_ack, sizeof(uint8_t));

	readable = sg->sg_nseg - writable;

	return (virtqueue_enqueue(sc->vtblk_vq, req, sg, readable, writable));
}

static int
vtblk_vq_intr(void *xsc)
{
	struct vtblk_softc *sc;

	sc = xsc;

	virtqueue_disable_intr(sc->vtblk_vq);
	taskqueue_enqueue_fast(sc->vtblk_tq, &sc->vtblk_intr_task);

	return (1);
}

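/*
 * Interrupt taskqueue handler: complete finished requests, restart
 * I/O (or wake up a quiescing thread when suspending), and re-enable
 * the virtqueue interrupt, rescheduling ourselves if completions
 * raced in while interrupts were disabled.
 */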
static void
vtblk_intr_task(void *arg, int pending)
{
	struct vtblk_softc *sc;
	struct virtqueue *vq;

	sc = arg;
	vq = sc->vtblk_vq;

	VTBLK_LOCK(sc);
	if (sc->vtblk_flags & VTBLK_FLAG_DETACH) {
		VTBLK_UNLOCK(sc);
		return;
	}

	vtblk_finish_completed(sc);

	if ((sc->vtblk_flags & VTBLK_FLAG_SUSPEND) == 0)
		vtblk_startio(sc);
	else
		wakeup(&sc->vtblk_vq);

	if (virtqueue_enable_intr(vq) != 0) {
		virtqueue_disable_intr(vq);
		VTBLK_UNLOCK(sc);
		taskqueue_enqueue_fast(sc->vtblk_tq,
		    &sc->vtblk_intr_task);
		return;
	}

	VTBLK_UNLOCK(sc);
}

static void
vtblk_stop(struct vtblk_softc *sc)
{

	virtqueue_disable_intr(sc->vtblk_vq);
	virtio_stop(sc->vtblk_dev);
}

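/*
 * Issue a VIRTIO_BLK_T_GET_ID request to read the device's
 * identification string into the disk's d_ident, polling for the
 * response. Disabled by the hw.vtblk.no_ident tunable.
 */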
static void
vtblk_get_ident(struct vtblk_softc *sc)
{
	struct bio buf;
	struct disk *dp;
	struct vtblk_request *req;
	int len, error;

	dp = sc->vtblk_disk;
	len = MIN(VIRTIO_BLK_ID_BYTES, DISK_IDENT_SIZE);

	if (vtblk_no_ident != 0)
		return;

	req = vtblk_dequeue_request(sc);
	if (req == NULL)
		return;

	req->vbr_ack = -1;
	req->vbr_hdr.type = VIRTIO_BLK_T_GET_ID;
	req->vbr_hdr.ioprio = 1;
	req->vbr_hdr.sector = 0;

	req->vbr_bp = &buf;
	bzero(&buf, sizeof(struct bio));

	buf.bio_cmd = BIO_READ;
	buf.bio_data = dp->d_ident;
	buf.bio_bcount = len;

	VTBLK_LOCK(sc);
	error = vtblk_poll_request(sc, req);
	VTBLK_UNLOCK(sc);

	vtblk_enqueue_request(sc, req);

	if (error) {
		device_printf(sc->vtblk_dev,
		    "error getting device identifier: %d\n", error);
	}
}

static void
vtblk_prepare_dump(struct vtblk_softc *sc)
{
	device_t dev;
	struct virtqueue *vq;

	dev = sc->vtblk_dev;
	vq = sc->vtblk_vq;

	vtblk_stop(sc);

	/*
	 * Drain all requests caught in-flight in the virtqueue,
	 * skipping biodone(). When dumping, only one request is
	 * outstanding at a time, and we just poll the virtqueue
	 * for the response.
	 */
	vtblk_drain_vq(sc, 1);

	if (virtio_reinit(dev, sc->vtblk_features) != 0)
		panic("cannot reinit VirtIO block device during dump");

	virtqueue_disable_intr(vq);
	virtio_reinit_complete(dev);
}

static int
vtblk_write_dump(struct vtblk_softc *sc, void *virtual, off_t offset,
    size_t length)
{
	struct bio buf;
	struct vtblk_request *req;

	req = &sc->vtblk_dump_request;
	req->vbr_ack = -1;
	req->vbr_hdr.type = VIRTIO_BLK_T_OUT;
	req->vbr_hdr.ioprio = 1;
	req->vbr_hdr.sector = offset / 512;

	req->vbr_bp = &buf;
	bzero(&buf, sizeof(struct bio));

	buf.bio_cmd = BIO_WRITE;
	buf.bio_data = virtual;
	buf.bio_bcount = length;

	return (vtblk_poll_request(sc, req));
}

static int
vtblk_flush_dump(struct vtblk_softc *sc)
{
	struct bio buf;
	struct vtblk_request *req;

	req = &sc->vtblk_dump_request;
	req->vbr_ack = -1;
	req->vbr_hdr.type = VIRTIO_BLK_T_FLUSH;
	req->vbr_hdr.ioprio = 1;
	req->vbr_hdr.sector = 0;

	req->vbr_bp = &buf;
	bzero(&buf, sizeof(struct bio));

	buf.bio_cmd = BIO_FLUSH;

	return (vtblk_poll_request(sc, req));
}

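/*
 * Execute a single request synchronously, spinning on the virtqueue
 * until the host posts the response. Used for the identification
 * request and for dump I/O, where we cannot sleep.
 */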
static int
vtblk_poll_request(struct vtblk_softc *sc, struct vtblk_request *req)
{
	struct virtqueue *vq;
	struct vtblk_request *r;
	int error;

	vq = sc->vtblk_vq;

	if (!virtqueue_empty(vq))
		return (EBUSY);

	error = vtblk_execute_request(sc, req);
	if (error)
		return (error);

	virtqueue_notify(vq);

	r = virtqueue_poll(vq, NULL);
	KASSERT(r == req, ("unexpected request response: %p/%p", r, req));

	error = vtblk_request_error(req);
	if (error && bootverbose) {
		device_printf(sc->vtblk_dev,
		    "%s: IO error: %d\n", __FUNCTION__, error);
	}

	return (error);
}

static void
vtblk_finish_completed(struct vtblk_softc *sc)
{
	struct vtblk_request *req;
	struct bio *bp;
	int error;

	while ((req = virtqueue_dequeue(sc->vtblk_vq, NULL)) != NULL) {
		bp = req->vbr_bp;

		error = vtblk_request_error(req);
		if (error)
			disk_err(bp, "hard error", -1, 1);

		vtblk_finish_bio(bp, error);
		vtblk_enqueue_request(sc, req);
	}
}

static void
vtblk_drain_vq(struct vtblk_softc *sc, int skip_done)
{
	struct virtqueue *vq;
	struct vtblk_request *req;
	int last;

	vq = sc->vtblk_vq;
	last = 0;

	while ((req = virtqueue_drain(vq, &last)) != NULL) {
		if (!skip_done)
			vtblk_finish_bio(req->vbr_bp, ENXIO);

		vtblk_enqueue_request(sc, req);
	}

	KASSERT(virtqueue_empty(vq), ("virtqueue not empty"));
}

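/*
 * Cancel all outstanding work during detach: complete anything left
 * in the virtqueue, fail the ready and queued bios with ENXIO, and
 * release the preallocated requests.
 */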
static void
vtblk_drain(struct vtblk_softc *sc)
{
	struct bio_queue_head *bioq;
	struct vtblk_request *req;
	struct bio *bp;

	bioq = &sc->vtblk_bioq;

	if (sc->vtblk_vq != NULL) {
		vtblk_finish_completed(sc);
		vtblk_drain_vq(sc, 0);
	}

	while ((req = vtblk_dequeue_ready(sc)) != NULL) {
		vtblk_finish_bio(req->vbr_bp, ENXIO);
		vtblk_enqueue_request(sc, req);
	}

	while (bioq_first(bioq) != NULL) {
		bp = bioq_takefirst(bioq);
		vtblk_finish_bio(bp, ENXIO);
	}

	vtblk_free_requests(sc);
}

#ifdef INVARIANTS
static void
vtblk_request_invariants(struct vtblk_request *req)
{
	int hdr_nsegs, ack_nsegs;

	hdr_nsegs = sglist_count(&req->vbr_hdr, sizeof(req->vbr_hdr));
	ack_nsegs = sglist_count(&req->vbr_ack, sizeof(req->vbr_ack));

	KASSERT(hdr_nsegs == 1, ("request header crossed page boundary"));
	KASSERT(ack_nsegs == 1, ("request ack crossed page boundary"));
}
#endif

static int
vtblk_alloc_requests(struct vtblk_softc *sc)
{
	struct vtblk_request *req;
	int i, nreqs;

	nreqs = virtqueue_size(sc->vtblk_vq);

	/*
	 * Preallocate sufficient requests to keep the virtqueue full. Each
	 * request consumes VTBLK_MIN_SEGMENTS or more descriptors so reduce
	 * the number allocated when indirect descriptors are not available.
	 */
	if ((sc->vtblk_flags & VTBLK_FLAG_INDIRECT) == 0)
		nreqs /= VTBLK_MIN_SEGMENTS;

	for (i = 0; i < nreqs; i++) {
		req = malloc(sizeof(struct vtblk_request), M_DEVBUF, M_NOWAIT);
		if (req == NULL)
			return (ENOMEM);

#ifdef INVARIANTS
		vtblk_request_invariants(req);
#endif

		sc->vtblk_request_count++;
		vtblk_enqueue_request(sc, req);
	}

	return (0);
}

static void
vtblk_free_requests(struct vtblk_softc *sc)
{
	struct vtblk_request *req;

	KASSERT(TAILQ_EMPTY(&sc->vtblk_req_ready),
	    ("ready requests left on queue"));

	while ((req = vtblk_dequeue_request(sc)) != NULL) {
		sc->vtblk_request_count--;
		free(req, M_DEVBUF);
	}

	KASSERT(sc->vtblk_request_count == 0,
	    ("leaked requests: %d", sc->vtblk_request_count));
}

static struct vtblk_request *
vtblk_dequeue_request(struct vtblk_softc *sc)
{
	struct vtblk_request *req;

	req = TAILQ_FIRST(&sc->vtblk_req_free);
	if (req != NULL)
		TAILQ_REMOVE(&sc->vtblk_req_free, req, vbr_link);

	return (req);
}

static void
vtblk_enqueue_request(struct vtblk_softc *sc, struct vtblk_request *req)
{

	bzero(req, sizeof(struct vtblk_request));
	TAILQ_INSERT_HEAD(&sc->vtblk_req_free, req, vbr_link);
}

static struct vtblk_request *
vtblk_dequeue_ready(struct vtblk_softc *sc)
{
	struct vtblk_request *req;

	req = TAILQ_FIRST(&sc->vtblk_req_ready);
	if (req != NULL)
		TAILQ_REMOVE(&sc->vtblk_req_ready, req, vbr_link);

	return (req);
}

static void
vtblk_enqueue_ready(struct vtblk_softc *sc, struct vtblk_request *req)
{

	TAILQ_INSERT_HEAD(&sc->vtblk_req_ready, req, vbr_link);
}

static int
vtblk_request_error(struct vtblk_request *req)
{
	int error;

	switch (req->vbr_ack) {
	case VIRTIO_BLK_S_OK:
		error = 0;
		break;
	case VIRTIO_BLK_S_UNSUPP:
		error = ENOTSUP;
		break;
	default:
		error = EIO;
		break;
	}

	return (error);
}

static void
vtblk_finish_bio(struct bio *bp, int error)
{

	if (error) {
		bp->bio_resid = bp->bio_bcount;
		bp->bio_error = error;
		bp->bio_flags |= BIO_ERROR;
	}

	biodone(bp);
}