xref: /freebsd/sys/dev/virtio/block/virtio_blk.c (revision 6486b015fc84e96725fef22b0e3363351399ae83)
/*-
 * Copyright (c) 2011, Bryan Venteicher <bryanv@daemoninthecloset.org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice unmodified, this list of conditions, and the following
 *    disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/* Driver for VirtIO block devices. */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/bio.h>
#include <sys/malloc.h>
#include <sys/module.h>
#include <sys/sglist.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/queue.h>
#include <sys/taskqueue.h>

#include <geom/geom_disk.h>
#include <vm/uma.h>

#include <machine/bus.h>
#include <machine/resource.h>
#include <sys/bus.h>
#include <sys/rman.h>

#include <dev/virtio/virtio.h>
#include <dev/virtio/virtqueue.h>
#include <dev/virtio/block/virtio_blk.h>

#include "virtio_if.h"

struct vtblk_request {
	struct virtio_blk_outhdr	 vbr_hdr;
	struct bio			*vbr_bp;
	uint8_t				 vbr_ack;

	TAILQ_ENTRY(vtblk_request)	 vbr_link;
};

struct vtblk_softc {
	device_t		 vtblk_dev;
	struct mtx		 vtblk_mtx;
	uint64_t		 vtblk_features;
	uint32_t		 vtblk_flags;
#define VTBLK_FLAG_INDIRECT	0x0001
#define VTBLK_FLAG_READONLY	0x0002
#define VTBLK_FLAG_DETACH	0x0004
#define VTBLK_FLAG_SUSPEND	0x0008
#define VTBLK_FLAG_DUMPING	0x0010

	struct virtqueue	*vtblk_vq;
	struct sglist		*vtblk_sglist;
	struct disk		*vtblk_disk;

	struct bio_queue_head	 vtblk_bioq;
	TAILQ_HEAD(, vtblk_request)
				 vtblk_req_free;
	TAILQ_HEAD(, vtblk_request)
				 vtblk_req_ready;

	struct taskqueue	*vtblk_tq;
	struct task		 vtblk_intr_task;

	int			 vtblk_max_nsegs;
	int			 vtblk_request_count;

	struct vtblk_request	 vtblk_dump_request;
};

static struct virtio_feature_desc vtblk_feature_desc[] = {
	{ VIRTIO_BLK_F_BARRIER,		"HostBarrier"	},
	{ VIRTIO_BLK_F_SIZE_MAX,	"MaxSegSize"	},
	{ VIRTIO_BLK_F_SEG_MAX,		"MaxNumSegs"	},
	{ VIRTIO_BLK_F_GEOMETRY,	"DiskGeometry"	},
	{ VIRTIO_BLK_F_RO,		"ReadOnly"	},
	{ VIRTIO_BLK_F_BLK_SIZE,	"BlockSize"	},
	{ VIRTIO_BLK_F_SCSI,		"SCSICmds"	},
	{ VIRTIO_BLK_F_FLUSH,		"FlushCmd"	},
	{ VIRTIO_BLK_F_TOPOLOGY,	"Topology"	},

	{ 0, NULL }
};

static int	vtblk_modevent(module_t, int, void *);

static int	vtblk_probe(device_t);
static int	vtblk_attach(device_t);
static int	vtblk_detach(device_t);
static int	vtblk_suspend(device_t);
static int	vtblk_resume(device_t);
static int	vtblk_shutdown(device_t);

static int	vtblk_open(struct disk *);
static int	vtblk_close(struct disk *);
static int	vtblk_ioctl(struct disk *, u_long, void *, int,
	            struct thread *);
static int	vtblk_dump(void *, void *, vm_offset_t, off_t, size_t);
static void	vtblk_strategy(struct bio *);

static void	vtblk_negotiate_features(struct vtblk_softc *);
static int	vtblk_maximum_segments(struct vtblk_softc *,
		    struct virtio_blk_config *);
static int	vtblk_alloc_virtqueue(struct vtblk_softc *);
static void	vtblk_alloc_disk(struct vtblk_softc *,
		    struct virtio_blk_config *);
static void	vtblk_create_disk(struct vtblk_softc *);

static int	vtblk_quiesce(struct vtblk_softc *);
static void	vtblk_startio(struct vtblk_softc *);
static struct vtblk_request * vtblk_bio_request(struct vtblk_softc *);
static int	vtblk_execute_request(struct vtblk_softc *,
		    struct vtblk_request *);

static int	vtblk_vq_intr(void *);
static void	vtblk_intr_task(void *, int);

static void	vtblk_stop(struct vtblk_softc *);

static void	vtblk_get_ident(struct vtblk_softc *);
static void	vtblk_prepare_dump(struct vtblk_softc *);
static int	vtblk_write_dump(struct vtblk_softc *, void *, off_t, size_t);
static int	vtblk_flush_dump(struct vtblk_softc *);
static int	vtblk_poll_request(struct vtblk_softc *,
		    struct vtblk_request *);

static void	vtblk_finish_completed(struct vtblk_softc *);
static void	vtblk_drain_vq(struct vtblk_softc *, int);
static void	vtblk_drain(struct vtblk_softc *);

static int	vtblk_alloc_requests(struct vtblk_softc *);
static void	vtblk_free_requests(struct vtblk_softc *);
static struct vtblk_request * vtblk_dequeue_request(struct vtblk_softc *);
static void	vtblk_enqueue_request(struct vtblk_softc *,
		    struct vtblk_request *);

static struct vtblk_request * vtblk_dequeue_ready(struct vtblk_softc *);
static void	vtblk_enqueue_ready(struct vtblk_softc *,
		    struct vtblk_request *);

static int	vtblk_request_error(struct vtblk_request *);
static void	vtblk_finish_bio(struct bio *, int);

/* Tunables. */
static int vtblk_no_ident = 0;
TUNABLE_INT("hw.vtblk.no_ident", &vtblk_no_ident);

/* Features desired/implemented by this driver. */
#define VTBLK_FEATURES \
    (VIRTIO_BLK_F_BARRIER		| \
     VIRTIO_BLK_F_SIZE_MAX		| \
     VIRTIO_BLK_F_SEG_MAX		| \
     VIRTIO_BLK_F_GEOMETRY		| \
     VIRTIO_BLK_F_RO			| \
     VIRTIO_BLK_F_BLK_SIZE		| \
     VIRTIO_BLK_F_FLUSH			| \
     VIRTIO_RING_F_INDIRECT_DESC)

#define VTBLK_MTX(_sc)		&(_sc)->vtblk_mtx
#define VTBLK_LOCK_INIT(_sc, _name) \
				mtx_init(VTBLK_MTX((_sc)), (_name), \
				    "VTBLK Lock", MTX_DEF)
#define VTBLK_LOCK(_sc)		mtx_lock(VTBLK_MTX((_sc)))
#define VTBLK_UNLOCK(_sc)	mtx_unlock(VTBLK_MTX((_sc)))
#define VTBLK_LOCK_DESTROY(_sc)	mtx_destroy(VTBLK_MTX((_sc)))
#define VTBLK_LOCK_ASSERT(_sc)	mtx_assert(VTBLK_MTX((_sc)), MA_OWNED)
#define VTBLK_LOCK_ASSERT_NOTOWNED(_sc) \
				mtx_assert(VTBLK_MTX((_sc)), MA_NOTOWNED)

#define VTBLK_DISK_NAME		"vtbd"
#define	VTBLK_QUIESCE_TIMEOUT	(30 * hz)

/*
 * Each block request uses at least two segments - one for the header
 * and one for the status.
 */
#define VTBLK_MIN_SEGMENTS	2

static uma_zone_t vtblk_req_zone;

static device_method_t vtblk_methods[] = {
	/* Device methods. */
	DEVMETHOD(device_probe,		vtblk_probe),
	DEVMETHOD(device_attach,	vtblk_attach),
	DEVMETHOD(device_detach,	vtblk_detach),
	DEVMETHOD(device_suspend,	vtblk_suspend),
	DEVMETHOD(device_resume,	vtblk_resume),
	DEVMETHOD(device_shutdown,	vtblk_shutdown),

	DEVMETHOD_END
};

static driver_t vtblk_driver = {
	"vtblk",
	vtblk_methods,
	sizeof(struct vtblk_softc)
};
static devclass_t vtblk_devclass;

DRIVER_MODULE(virtio_blk, virtio_pci, vtblk_driver, vtblk_devclass,
    vtblk_modevent, 0);
MODULE_VERSION(virtio_blk, 1);
MODULE_DEPEND(virtio_blk, virtio, 1, 1, 1);

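/*
 * Module event handler: the request zone is created at load time and
 * cannot be destroyed (or the module unloaded) while any requests
 * remain allocated.
 */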
static int
vtblk_modevent(module_t mod, int type, void *unused)
{
	int error;

	error = 0;

	switch (type) {
	case MOD_LOAD:
		vtblk_req_zone = uma_zcreate("vtblk_request",
		    sizeof(struct vtblk_request),
		    NULL, NULL, NULL, NULL, 0, 0);
		break;
	case MOD_QUIESCE:
	case MOD_UNLOAD:
		if (uma_zone_get_cur(vtblk_req_zone) > 0)
			error = EBUSY;
		else if (type == MOD_UNLOAD) {
			uma_zdestroy(vtblk_req_zone);
			vtblk_req_zone = NULL;
		}
		break;
	case MOD_SHUTDOWN:
		break;
	default:
		error = EOPNOTSUPP;
		break;
	}

	return (error);
}

static int
vtblk_probe(device_t dev)
{

	if (virtio_get_device_type(dev) != VIRTIO_ID_BLOCK)
		return (ENXIO);

	device_set_desc(dev, "VirtIO Block Adapter");

	return (BUS_PROBE_DEFAULT);
}

static int
vtblk_attach(device_t dev)
{
	struct vtblk_softc *sc;
	struct virtio_blk_config blkcfg;
	int error;

	sc = device_get_softc(dev);
	sc->vtblk_dev = dev;

	VTBLK_LOCK_INIT(sc, device_get_nameunit(dev));

	bioq_init(&sc->vtblk_bioq);
	TAILQ_INIT(&sc->vtblk_req_free);
	TAILQ_INIT(&sc->vtblk_req_ready);

	virtio_set_feature_desc(dev, vtblk_feature_desc);
	vtblk_negotiate_features(sc);

	if (virtio_with_feature(dev, VIRTIO_RING_F_INDIRECT_DESC))
		sc->vtblk_flags |= VTBLK_FLAG_INDIRECT;

	if (virtio_with_feature(dev, VIRTIO_BLK_F_RO))
		sc->vtblk_flags |= VTBLK_FLAG_READONLY;

	/* Get local copy of config. */
	virtio_read_device_config(dev, 0, &blkcfg,
	    sizeof(struct virtio_blk_config));

	/*
	 * With the current sglist(9) implementation, it is not easy
	 * for us to support a maximum segment size as adjacent
	 * segments are coalesced. For now, just make sure it's larger
	 * than the maximum supported transfer size.
	 */
	if (virtio_with_feature(dev, VIRTIO_BLK_F_SIZE_MAX)) {
		if (blkcfg.size_max < MAXPHYS) {
			error = ENOTSUP;
			device_printf(dev, "host requires unsupported "
			    "maximum segment size feature\n");
			goto fail;
		}
	}

	sc->vtblk_max_nsegs = vtblk_maximum_segments(sc, &blkcfg);
	if (sc->vtblk_max_nsegs <= VTBLK_MIN_SEGMENTS) {
		error = EINVAL;
		device_printf(dev, "fewer than minimum number of segments "
		    "allowed: %d\n", sc->vtblk_max_nsegs);
		goto fail;
	}

	sc->vtblk_sglist = sglist_alloc(sc->vtblk_max_nsegs, M_NOWAIT);
	if (sc->vtblk_sglist == NULL) {
		error = ENOMEM;
		device_printf(dev, "cannot allocate sglist\n");
		goto fail;
	}

	error = vtblk_alloc_virtqueue(sc);
	if (error) {
		device_printf(dev, "cannot allocate virtqueue\n");
		goto fail;
	}

	error = vtblk_alloc_requests(sc);
	if (error) {
		device_printf(dev, "cannot preallocate requests\n");
		goto fail;
	}

	vtblk_alloc_disk(sc, &blkcfg);

	TASK_INIT(&sc->vtblk_intr_task, 0, vtblk_intr_task, sc);
	sc->vtblk_tq = taskqueue_create_fast("vtblk_taskq", M_NOWAIT,
	    taskqueue_thread_enqueue, &sc->vtblk_tq);
	if (sc->vtblk_tq == NULL) {
		error = ENOMEM;
		device_printf(dev, "cannot allocate taskqueue\n");
		goto fail;
	}
	taskqueue_start_threads(&sc->vtblk_tq, 1, PI_DISK, "%s taskq",
	    device_get_nameunit(dev));

	error = virtio_setup_intr(dev, INTR_TYPE_BIO | INTR_ENTROPY);
	if (error) {
		device_printf(dev, "cannot setup virtqueue interrupt\n");
		goto fail;
	}

	vtblk_create_disk(sc);

	virtqueue_enable_intr(sc->vtblk_vq);

fail:
	if (error)
		vtblk_detach(dev);

	return (error);
}

static int
vtblk_detach(device_t dev)
{
	struct vtblk_softc *sc;

	sc = device_get_softc(dev);

	VTBLK_LOCK(sc);
	sc->vtblk_flags |= VTBLK_FLAG_DETACH;
	if (device_is_attached(dev))
		vtblk_stop(sc);
	VTBLK_UNLOCK(sc);

	if (sc->vtblk_tq != NULL) {
		taskqueue_drain(sc->vtblk_tq, &sc->vtblk_intr_task);
		taskqueue_free(sc->vtblk_tq);
		sc->vtblk_tq = NULL;
	}

	vtblk_drain(sc);

	if (sc->vtblk_disk != NULL) {
		disk_destroy(sc->vtblk_disk);
		sc->vtblk_disk = NULL;
	}

	if (sc->vtblk_sglist != NULL) {
		sglist_free(sc->vtblk_sglist);
		sc->vtblk_sglist = NULL;
	}

	VTBLK_LOCK_DESTROY(sc);

	return (0);
}

static int
vtblk_suspend(device_t dev)
{
	struct vtblk_softc *sc;
	int error;

	sc = device_get_softc(dev);

	VTBLK_LOCK(sc);
	sc->vtblk_flags |= VTBLK_FLAG_SUSPEND;
	/* XXX BMV: virtio_stop(), etc needed here? */
	error = vtblk_quiesce(sc);
	if (error)
		sc->vtblk_flags &= ~VTBLK_FLAG_SUSPEND;
	VTBLK_UNLOCK(sc);

	return (error);
}

static int
vtblk_resume(device_t dev)
{
	struct vtblk_softc *sc;

	sc = device_get_softc(dev);

	VTBLK_LOCK(sc);
	/* XXX BMV: virtio_reinit(), etc needed here? */
	sc->vtblk_flags &= ~VTBLK_FLAG_SUSPEND;
	vtblk_startio(sc);
	VTBLK_UNLOCK(sc);

	return (0);
}

static int
vtblk_shutdown(device_t dev)
{

	return (0);
}

static int
vtblk_open(struct disk *dp)
{
	struct vtblk_softc *sc;

	if ((sc = dp->d_drv1) == NULL)
		return (ENXIO);

	return (sc->vtblk_flags & VTBLK_FLAG_DETACH ? ENXIO : 0);
}

static int
vtblk_close(struct disk *dp)
{
	struct vtblk_softc *sc;

	if ((sc = dp->d_drv1) == NULL)
		return (ENXIO);

	return (0);
}

static int
vtblk_ioctl(struct disk *dp, u_long cmd, void *addr, int flag,
    struct thread *td)
{
	struct vtblk_softc *sc;

	if ((sc = dp->d_drv1) == NULL)
		return (ENXIO);

	return (ENOTTY);
}

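/*
 * Kernel dump routine. The first call quiesces the device and switches
 * it to polled operation; a zero-length call with a NULL buffer and
 * zero offset marks the end of the dump and flushes the cache.
 */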
static int
vtblk_dump(void *arg, void *virtual, vm_offset_t physical, off_t offset,
    size_t length)
{
	struct disk *dp;
	struct vtblk_softc *sc;
	int error;

	dp = arg;
	error = 0;

	if ((sc = dp->d_drv1) == NULL)
		return (ENXIO);

	VTBLK_LOCK(sc);

	if ((sc->vtblk_flags & VTBLK_FLAG_DUMPING) == 0) {
		vtblk_prepare_dump(sc);
		sc->vtblk_flags |= VTBLK_FLAG_DUMPING;
	}

	if (length > 0)
		error = vtblk_write_dump(sc, virtual, offset, length);
	else if (virtual == NULL && offset == 0)
		error = vtblk_flush_dump(sc);
	else {
		error = EINVAL;
		sc->vtblk_flags &= ~VTBLK_FLAG_DUMPING;
	}

	VTBLK_UNLOCK(sc);

	return (error);
}

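/*
 * GEOM strategy routine: queue the bio and start I/O, unless the
 * device is suspended or being detached.
 */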
static void
vtblk_strategy(struct bio *bp)
{
	struct vtblk_softc *sc;

	if ((sc = bp->bio_disk->d_drv1) == NULL) {
		vtblk_finish_bio(bp, EINVAL);
		return;
	}

	/*
	 * Fail any write if RO. Unfortunately, there does not seem to
	 * be a better way to report our read-only status to GEOM above.
	 */
	if (sc->vtblk_flags & VTBLK_FLAG_READONLY &&
	    (bp->bio_cmd == BIO_WRITE || bp->bio_cmd == BIO_FLUSH)) {
		vtblk_finish_bio(bp, EROFS);
		return;
	}

#ifdef	INVARIANTS
	/*
	 * Prevent read/write buffers spanning too many segments from
	 * getting into the queue. This should only trip if d_maxsize
	 * was incorrectly set.
	 */
	if (bp->bio_cmd == BIO_READ || bp->bio_cmd == BIO_WRITE) {
		int nsegs, max_nsegs;

		nsegs = sglist_count(bp->bio_data, bp->bio_bcount);
		max_nsegs = sc->vtblk_max_nsegs - VTBLK_MIN_SEGMENTS;

		KASSERT(nsegs <= max_nsegs,
		    ("bio spanned too many segments: %d, max: %d",
		    nsegs, max_nsegs));
	}
#endif

	VTBLK_LOCK(sc);
	if (sc->vtblk_flags & VTBLK_FLAG_DETACH)
		vtblk_finish_bio(bp, ENXIO);
	else {
		bioq_disksort(&sc->vtblk_bioq, bp);

		if ((sc->vtblk_flags & VTBLK_FLAG_SUSPEND) == 0)
			vtblk_startio(sc);
	}
	VTBLK_UNLOCK(sc);
}

static void
vtblk_negotiate_features(struct vtblk_softc *sc)
{
	device_t dev;
	uint64_t features;

	dev = sc->vtblk_dev;
	features = VTBLK_FEATURES;

	sc->vtblk_features = virtio_negotiate_features(dev, features);
}

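/*
 * Compute the maximum number of scatter/gather segments per request:
 * the data segments allowed by VIRTIO_BLK_F_SEG_MAX (capped at what a
 * MAXPHYS-sized transfer can require), plus the header and ack
 * segments, bounded by VIRTIO_MAX_INDIRECT when indirect descriptors
 * are in use.
 */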
static int
vtblk_maximum_segments(struct vtblk_softc *sc,
    struct virtio_blk_config *blkcfg)
{
	device_t dev;
	int nsegs;

	dev = sc->vtblk_dev;
	nsegs = VTBLK_MIN_SEGMENTS;

	if (virtio_with_feature(dev, VIRTIO_BLK_F_SEG_MAX)) {
		nsegs += MIN(blkcfg->seg_max, MAXPHYS / PAGE_SIZE + 1);
		if (sc->vtblk_flags & VTBLK_FLAG_INDIRECT)
			nsegs = MIN(nsegs, VIRTIO_MAX_INDIRECT);
	} else
		nsegs += 1;

	return (nsegs);
}

static int
vtblk_alloc_virtqueue(struct vtblk_softc *sc)
{
	device_t dev;
	struct vq_alloc_info vq_info;

	dev = sc->vtblk_dev;

	VQ_ALLOC_INFO_INIT(&vq_info, sc->vtblk_max_nsegs,
	    vtblk_vq_intr, sc, &sc->vtblk_vq,
	    "%s request", device_get_nameunit(dev));

	return (virtio_alloc_virtqueues(dev, 0, 1, &vq_info));
}

static void
vtblk_alloc_disk(struct vtblk_softc *sc, struct virtio_blk_config *blkcfg)
{
	device_t dev;
	struct disk *dp;

	dev = sc->vtblk_dev;

	sc->vtblk_disk = dp = disk_alloc();
	dp->d_open = vtblk_open;
	dp->d_close = vtblk_close;
	dp->d_ioctl = vtblk_ioctl;
	dp->d_strategy = vtblk_strategy;
	dp->d_name = VTBLK_DISK_NAME;
	dp->d_unit = device_get_unit(dev);
	dp->d_drv1 = sc;

	if ((sc->vtblk_flags & VTBLK_FLAG_READONLY) == 0)
		dp->d_dump = vtblk_dump;

	/* Capacity is always in 512-byte units. */
	dp->d_mediasize = blkcfg->capacity * 512;

	if (virtio_with_feature(dev, VIRTIO_BLK_F_BLK_SIZE))
		dp->d_sectorsize = blkcfg->blk_size;
	else
		dp->d_sectorsize = 512;

	/*
	 * The VirtIO maximum I/O size is given in terms of segments.
	 * However, FreeBSD limits I/O size by logical buffer size, not
	 * by physically contiguous pages. Therefore, we have to assume
	 * no pages are contiguous. This may impose an artificially low
	 * maximum I/O size. But in practice, since QEMU advertises 128
	 * segments, this gives us a maximum I/O size of 125 * PAGE_SIZE,
	 * which is typically greater than MAXPHYS. Eventually we should
	 * just advertise MAXPHYS and split buffers that are too big.
	 *
	 * Note we must subtract one additional segment in case of
	 * non-page-aligned buffers.
	 */
	dp->d_maxsize = (sc->vtblk_max_nsegs - VTBLK_MIN_SEGMENTS - 1) *
	    PAGE_SIZE;
	if (dp->d_maxsize < PAGE_SIZE)
		dp->d_maxsize = PAGE_SIZE; /* XXX */

	if (virtio_with_feature(dev, VIRTIO_BLK_F_GEOMETRY)) {
		dp->d_fwsectors = blkcfg->geometry.sectors;
		dp->d_fwheads = blkcfg->geometry.heads;
	}

	if (virtio_with_feature(dev, VIRTIO_BLK_F_FLUSH))
		dp->d_flags |= DISKFLAG_CANFLUSHCACHE;
}

static void
vtblk_create_disk(struct vtblk_softc *sc)
{
	struct disk *dp;

	dp = sc->vtblk_disk;

	/*
	 * Retrieving the identification string must be done after
	 * the virtqueue interrupt is set up, otherwise it will hang.
	 */
	vtblk_get_ident(sc);

	device_printf(sc->vtblk_dev, "%juMB (%ju %u byte sectors)\n",
	    (uintmax_t) dp->d_mediasize >> 20,
	    (uintmax_t) dp->d_mediasize / dp->d_sectorsize,
	    dp->d_sectorsize);

	disk_create(dp, DISK_VERSION);
}

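/*
 * Wait for all in-flight requests to drain from the virtqueue, giving
 * up after VTBLK_QUIESCE_TIMEOUT.
 */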
static int
vtblk_quiesce(struct vtblk_softc *sc)
{
	int error;

	error = 0;

	VTBLK_LOCK_ASSERT(sc);

	while (!virtqueue_empty(sc->vtblk_vq)) {
		if (mtx_sleep(&sc->vtblk_vq, VTBLK_MTX(sc), PRIBIO, "vtblkq",
		    VTBLK_QUIESCE_TIMEOUT) == EWOULDBLOCK) {
			error = EBUSY;
			break;
		}
	}

	return (error);
}

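/*
 * Submit as many requests as the virtqueue will accept, preferring any
 * previously deferred requests, and notify the host if anything was
 * enqueued.
 */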
static void
vtblk_startio(struct vtblk_softc *sc)
{
	struct virtqueue *vq;
	struct vtblk_request *req;
	int enq;

	vq = sc->vtblk_vq;
	enq = 0;

	VTBLK_LOCK_ASSERT(sc);

	while (!virtqueue_full(vq)) {
		if ((req = vtblk_dequeue_ready(sc)) == NULL)
			req = vtblk_bio_request(sc);
		if (req == NULL)
			break;

		if (vtblk_execute_request(sc, req) != 0) {
			vtblk_enqueue_ready(sc, req);
			break;
		}

		enq++;
	}

	if (enq > 0)
		virtqueue_notify(vq);
}

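/*
 * Pull the next bio off the queue and translate it into a block
 * request header the host understands.
 */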
static struct vtblk_request *
vtblk_bio_request(struct vtblk_softc *sc)
{
	struct bio_queue_head *bioq;
	struct vtblk_request *req;
	struct bio *bp;

	bioq = &sc->vtblk_bioq;

	if (bioq_first(bioq) == NULL)
		return (NULL);

	req = vtblk_dequeue_request(sc);
	if (req == NULL)
		return (NULL);

	bp = bioq_takefirst(bioq);
	req->vbr_bp = bp;
	req->vbr_ack = -1;
	req->vbr_hdr.ioprio = 1;

	switch (bp->bio_cmd) {
	case BIO_FLUSH:
		req->vbr_hdr.type = VIRTIO_BLK_T_FLUSH;
		break;
	case BIO_READ:
		req->vbr_hdr.type = VIRTIO_BLK_T_IN;
		req->vbr_hdr.sector = bp->bio_offset / 512;
		break;
	case BIO_WRITE:
		req->vbr_hdr.type = VIRTIO_BLK_T_OUT;
		req->vbr_hdr.sector = bp->bio_offset / 512;
		break;
	default:
		panic("%s: bio with unhandled cmd: %d", __FUNCTION__,
		    bp->bio_cmd);
	}

	if (bp->bio_flags & BIO_ORDERED)
		req->vbr_hdr.type |= VIRTIO_BLK_T_BARRIER;

	return (req);
}

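/*
 * Build the scatter/gather list for a request (header, optional data
 * buffer, ack byte) and enqueue it on the virtqueue. The header is
 * read-only to the host; the ack byte, and the data buffer of a read,
 * are host-writable.
 */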
static int
vtblk_execute_request(struct vtblk_softc *sc, struct vtblk_request *req)
{
	struct sglist *sg;
	struct bio *bp;
	int readable, writable, error;

	sg = sc->vtblk_sglist;
	bp = req->vbr_bp;
	writable = 0;

	VTBLK_LOCK_ASSERT(sc);

	sglist_reset(sg);
	error = sglist_append(sg, &req->vbr_hdr,
	    sizeof(struct virtio_blk_outhdr));
	KASSERT(error == 0, ("error adding header to sglist"));
	KASSERT(sg->sg_nseg == 1,
	    ("header spanned multiple segments: %d", sg->sg_nseg));

	if (bp->bio_cmd == BIO_READ || bp->bio_cmd == BIO_WRITE) {
		error = sglist_append(sg, bp->bio_data, bp->bio_bcount);
		KASSERT(error == 0, ("error adding buffer to sglist"));

		/* BIO_READ means the host writes into our buffer. */
		if (bp->bio_cmd == BIO_READ)
			writable += sg->sg_nseg - 1;
	}

	error = sglist_append(sg, &req->vbr_ack, sizeof(uint8_t));
	KASSERT(error == 0, ("error adding ack to sglist"));
	writable++;

	KASSERT(sg->sg_nseg >= VTBLK_MIN_SEGMENTS,
	    ("fewer than min segments: %d", sg->sg_nseg));

	readable = sg->sg_nseg - writable;

	return (virtqueue_enqueue(sc->vtblk_vq, req, sg, readable, writable));
}

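/*
 * Interrupt filter: disable further virtqueue interrupts and defer
 * completion processing to the taskqueue.
 */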
static int
vtblk_vq_intr(void *xsc)
{
	struct vtblk_softc *sc;

	sc = xsc;

	virtqueue_disable_intr(sc->vtblk_vq);
	taskqueue_enqueue_fast(sc->vtblk_tq, &sc->vtblk_intr_task);

	return (1);
}

static void
vtblk_intr_task(void *arg, int pending)
{
	struct vtblk_softc *sc;
	struct virtqueue *vq;

	sc = arg;
	vq = sc->vtblk_vq;

	VTBLK_LOCK(sc);
	if (sc->vtblk_flags & VTBLK_FLAG_DETACH) {
		VTBLK_UNLOCK(sc);
		return;
	}

	vtblk_finish_completed(sc);

	if ((sc->vtblk_flags & VTBLK_FLAG_SUSPEND) == 0)
		vtblk_startio(sc);
	else
		wakeup(&sc->vtblk_vq);

	if (virtqueue_enable_intr(vq) != 0) {
		virtqueue_disable_intr(vq);
		VTBLK_UNLOCK(sc);
		taskqueue_enqueue_fast(sc->vtblk_tq,
		    &sc->vtblk_intr_task);
		return;
	}

	VTBLK_UNLOCK(sc);
}

static void
vtblk_stop(struct vtblk_softc *sc)
{

	virtqueue_disable_intr(sc->vtblk_vq);
	virtio_stop(sc->vtblk_dev);
}

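/*
 * Read the device identification string into d_ident with a polled
 * VIRTIO_BLK_T_GET_ID request. Can be disabled with the
 * hw.vtblk.no_ident tunable.
 */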
static void
vtblk_get_ident(struct vtblk_softc *sc)
{
	struct bio buf;
	struct disk *dp;
	struct vtblk_request *req;
	int len, error;

	dp = sc->vtblk_disk;
	len = MIN(VIRTIO_BLK_ID_BYTES, DISK_IDENT_SIZE);

	if (vtblk_no_ident != 0)
		return;

	req = vtblk_dequeue_request(sc);
	if (req == NULL)
		return;

	req->vbr_ack = -1;
	req->vbr_hdr.type = VIRTIO_BLK_T_GET_ID;
	req->vbr_hdr.ioprio = 1;
	req->vbr_hdr.sector = 0;

	req->vbr_bp = &buf;
	bzero(&buf, sizeof(struct bio));

	buf.bio_cmd = BIO_READ;
	buf.bio_data = dp->d_ident;
	buf.bio_bcount = len;

	VTBLK_LOCK(sc);
	error = vtblk_poll_request(sc, req);
	VTBLK_UNLOCK(sc);

	vtblk_enqueue_request(sc, req);

	if (error) {
		device_printf(sc->vtblk_dev,
		    "error getting device identifier: %d\n", error);
	}
}

static void
vtblk_prepare_dump(struct vtblk_softc *sc)
{
	device_t dev;
	struct virtqueue *vq;

	dev = sc->vtblk_dev;
	vq = sc->vtblk_vq;

	vtblk_stop(sc);

	/*
	 * Drain all requests caught in-flight in the virtqueue,
	 * skipping biodone(). When dumping, only one request is
	 * outstanding at a time, and we just poll the virtqueue
	 * for the response.
	 */
	vtblk_drain_vq(sc, 1);

	if (virtio_reinit(dev, sc->vtblk_features) != 0)
		panic("cannot reinit VirtIO block device during dump");

	virtqueue_disable_intr(vq);
	virtio_reinit_complete(dev);
}

static int
vtblk_write_dump(struct vtblk_softc *sc, void *virtual, off_t offset,
    size_t length)
{
	struct bio buf;
	struct vtblk_request *req;

	req = &sc->vtblk_dump_request;
	req->vbr_ack = -1;
	req->vbr_hdr.type = VIRTIO_BLK_T_OUT;
	req->vbr_hdr.ioprio = 1;
	req->vbr_hdr.sector = offset / 512;

	req->vbr_bp = &buf;
	bzero(&buf, sizeof(struct bio));

	buf.bio_cmd = BIO_WRITE;
	buf.bio_data = virtual;
	buf.bio_bcount = length;

	return (vtblk_poll_request(sc, req));
}

static int
vtblk_flush_dump(struct vtblk_softc *sc)
{
	struct bio buf;
	struct vtblk_request *req;

	req = &sc->vtblk_dump_request;
	req->vbr_ack = -1;
	req->vbr_hdr.type = VIRTIO_BLK_T_FLUSH;
	req->vbr_hdr.ioprio = 1;
	req->vbr_hdr.sector = 0;

	req->vbr_bp = &buf;
	bzero(&buf, sizeof(struct bio));

	buf.bio_cmd = BIO_FLUSH;

	return (vtblk_poll_request(sc, req));
}

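/*
 * Execute a single request synchronously, polling the virtqueue until
 * the host posts the completion.
 */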
static int
vtblk_poll_request(struct vtblk_softc *sc, struct vtblk_request *req)
{
	device_t dev;
	struct virtqueue *vq;
	struct vtblk_request *r;
	int error;

	dev = sc->vtblk_dev;
	vq = sc->vtblk_vq;

	if (!virtqueue_empty(vq))
		return (EBUSY);

	error = vtblk_execute_request(sc, req);
	if (error)
		return (error);

	virtqueue_notify(vq);

	r = virtqueue_poll(vq, NULL);
	KASSERT(r == req, ("unexpected request response"));

	error = vtblk_request_error(req);
	if (error && bootverbose) {
		device_printf(dev, "vtblk_poll_request: IO error: %d\n",
		    error);
	}

	return (error);
}

static void
vtblk_finish_completed(struct vtblk_softc *sc)
{
	struct vtblk_request *req;
	struct bio *bp;
	int error;

	while ((req = virtqueue_dequeue(sc->vtblk_vq, NULL)) != NULL) {
		bp = req->vbr_bp;

		error = vtblk_request_error(req);
		if (error)
			disk_err(bp, "hard error", -1, 1);

		vtblk_finish_bio(bp, error);
		vtblk_enqueue_request(sc, req);
	}
}

static void
vtblk_drain_vq(struct vtblk_softc *sc, int skip_done)
{
	struct virtqueue *vq;
	struct vtblk_request *req;
	int last;

	vq = sc->vtblk_vq;
	last = 0;

	while ((req = virtqueue_drain(vq, &last)) != NULL) {
		if (!skip_done)
			vtblk_finish_bio(req->vbr_bp, ENXIO);

		vtblk_enqueue_request(sc, req);
	}

	KASSERT(virtqueue_empty(vq), ("virtqueue not empty"));
}

static void
vtblk_drain(struct vtblk_softc *sc)
{
	struct bio_queue_head *bioq;
	struct vtblk_request *req;
	struct bio *bp;

	bioq = &sc->vtblk_bioq;

	if (sc->vtblk_vq != NULL) {
		vtblk_finish_completed(sc);
		vtblk_drain_vq(sc, 0);
	}

	while ((req = vtblk_dequeue_ready(sc)) != NULL) {
		vtblk_finish_bio(req->vbr_bp, ENXIO);
		vtblk_enqueue_request(sc, req);
	}

	while (bioq_first(bioq) != NULL) {
		bp = bioq_takefirst(bioq);
		vtblk_finish_bio(bp, ENXIO);
	}

	vtblk_free_requests(sc);
}

static int
vtblk_alloc_requests(struct vtblk_softc *sc)
{
	struct vtblk_request *req;
	int i, nreqs;

	nreqs = virtqueue_size(sc->vtblk_vq);

	/*
	 * Preallocate sufficient requests to keep the virtqueue full. Each
	 * request consumes VTBLK_MIN_SEGMENTS or more descriptors so reduce
	 * the number allocated when indirect descriptors are not available.
	 */
	if ((sc->vtblk_flags & VTBLK_FLAG_INDIRECT) == 0)
		nreqs /= VTBLK_MIN_SEGMENTS;

	for (i = 0; i < nreqs; i++) {
		req = uma_zalloc(vtblk_req_zone, M_NOWAIT);
		if (req == NULL)
			return (ENOMEM);

		sc->vtblk_request_count++;
		vtblk_enqueue_request(sc, req);
	}

	return (0);
}

static void
vtblk_free_requests(struct vtblk_softc *sc)
{
	struct vtblk_request *req;

	KASSERT(TAILQ_EMPTY(&sc->vtblk_req_ready),
	    ("ready requests left on queue"));

	while ((req = vtblk_dequeue_request(sc)) != NULL) {
		sc->vtblk_request_count--;
		uma_zfree(vtblk_req_zone, req);
	}

	KASSERT(sc->vtblk_request_count == 0, ("leaked requests"));
}

static struct vtblk_request *
vtblk_dequeue_request(struct vtblk_softc *sc)
{
	struct vtblk_request *req;

	req = TAILQ_FIRST(&sc->vtblk_req_free);
	if (req != NULL)
		TAILQ_REMOVE(&sc->vtblk_req_free, req, vbr_link);

	return (req);
}

static void
vtblk_enqueue_request(struct vtblk_softc *sc, struct vtblk_request *req)
{

	bzero(req, sizeof(struct vtblk_request));
	TAILQ_INSERT_HEAD(&sc->vtblk_req_free, req, vbr_link);
}

static struct vtblk_request *
vtblk_dequeue_ready(struct vtblk_softc *sc)
{
	struct vtblk_request *req;

	req = TAILQ_FIRST(&sc->vtblk_req_ready);
	if (req != NULL)
		TAILQ_REMOVE(&sc->vtblk_req_ready, req, vbr_link);

	return (req);
}

static void
vtblk_enqueue_ready(struct vtblk_softc *sc, struct vtblk_request *req)
{

	TAILQ_INSERT_HEAD(&sc->vtblk_req_ready, req, vbr_link);
}

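/*
 * Map the ack status returned by the host to an errno value.
 */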
static int
vtblk_request_error(struct vtblk_request *req)
{
	int error;

	switch (req->vbr_ack) {
	case VIRTIO_BLK_S_OK:
		error = 0;
		break;
	case VIRTIO_BLK_S_UNSUPP:
		error = ENOTSUP;
		break;
	default:
		error = EIO;
		break;
	}

	return (error);
}

static void
vtblk_finish_bio(struct bio *bp, int error)
{

	if (error) {
		bp->bio_resid = bp->bio_bcount;
		bp->bio_error = error;
		bp->bio_flags |= BIO_ERROR;
	}

	biodone(bp);
}
1208