/*-
 * Copyright (c) 2011, Bryan Venteicher <bryanv@daemoninthecloset.org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice unmodified, this list of conditions, and the following
 *    disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/* Driver for VirtIO block devices. */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/bio.h>
#include <sys/malloc.h>
#include <sys/module.h>
#include <sys/sglist.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/queue.h>
#include <sys/taskqueue.h>

#include <geom/geom_disk.h>

#include <machine/bus.h>
#include <machine/resource.h>
#include <sys/bus.h>
#include <sys/rman.h>

#include <dev/virtio/virtio.h>
#include <dev/virtio/virtqueue.h>
#include <dev/virtio/block/virtio_blk.h>

#include "virtio_if.h"

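/*
 * State for a single block request: the VirtIO request header, the
 * bio being serviced, and the status byte written back by the host.
 * Requests are linked on either the free or the ready list.
 */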
struct vtblk_request {
	struct virtio_blk_outhdr	 vbr_hdr;
	struct bio			*vbr_bp;
	uint8_t				 vbr_ack;

	TAILQ_ENTRY(vtblk_request)	 vbr_link;
};

struct vtblk_softc {
	device_t		 vtblk_dev;
	struct mtx		 vtblk_mtx;
	uint64_t		 vtblk_features;
	uint32_t		 vtblk_flags;
#define VTBLK_FLAG_INDIRECT	0x0001
#define VTBLK_FLAG_READONLY	0x0002
#define VTBLK_FLAG_DETACH	0x0004
#define VTBLK_FLAG_SUSPEND	0x0008
#define VTBLK_FLAG_DUMPING	0x0010
#define VTBLK_FLAG_BARRIER	0x0020

	struct virtqueue	*vtblk_vq;
	struct sglist		*vtblk_sglist;
	struct disk		*vtblk_disk;

	struct bio_queue_head	 vtblk_bioq;
	TAILQ_HEAD(, vtblk_request)
				 vtblk_req_free;
	TAILQ_HEAD(, vtblk_request)
				 vtblk_req_ready;
	struct vtblk_request	*vtblk_req_ordered;

	struct taskqueue	*vtblk_tq;
	struct task		 vtblk_intr_task;

	int			 vtblk_max_nsegs;
	int			 vtblk_request_count;

	struct vtblk_request	 vtblk_dump_request;
};

static struct virtio_feature_desc vtblk_feature_desc[] = {
	{ VIRTIO_BLK_F_BARRIER,		"HostBarrier"	},
	{ VIRTIO_BLK_F_SIZE_MAX,	"MaxSegSize"	},
	{ VIRTIO_BLK_F_SEG_MAX,		"MaxNumSegs"	},
	{ VIRTIO_BLK_F_GEOMETRY,	"DiskGeometry"	},
	{ VIRTIO_BLK_F_RO,		"ReadOnly"	},
	{ VIRTIO_BLK_F_BLK_SIZE,	"BlockSize"	},
	{ VIRTIO_BLK_F_SCSI,		"SCSICmds"	},
	{ VIRTIO_BLK_F_FLUSH,		"FlushCmd"	},
	{ VIRTIO_BLK_F_TOPOLOGY,	"Topology"	},

	{ 0, NULL }
};

static int	vtblk_modevent(module_t, int, void *);

static int	vtblk_probe(device_t);
static int	vtblk_attach(device_t);
static int	vtblk_detach(device_t);
static int	vtblk_suspend(device_t);
static int	vtblk_resume(device_t);
static int	vtblk_shutdown(device_t);

static int	vtblk_open(struct disk *);
static int	vtblk_close(struct disk *);
static int	vtblk_ioctl(struct disk *, u_long, void *, int,
		    struct thread *);
static int	vtblk_dump(void *, void *, vm_offset_t, off_t, size_t);
static void	vtblk_strategy(struct bio *);

static void	vtblk_negotiate_features(struct vtblk_softc *);
static int	vtblk_maximum_segments(struct vtblk_softc *,
		    struct virtio_blk_config *);
static int	vtblk_alloc_virtqueue(struct vtblk_softc *);
static void	vtblk_alloc_disk(struct vtblk_softc *,
		    struct virtio_blk_config *);
static void	vtblk_create_disk(struct vtblk_softc *);

static int	vtblk_quiesce(struct vtblk_softc *);
static void	vtblk_startio(struct vtblk_softc *);
static struct vtblk_request * vtblk_bio_request(struct vtblk_softc *);
static int	vtblk_execute_request(struct vtblk_softc *,
		    struct vtblk_request *);

static int	vtblk_vq_intr(void *);
static void	vtblk_intr_task(void *, int);

static void	vtblk_stop(struct vtblk_softc *);

static void	vtblk_get_ident(struct vtblk_softc *);
static void	vtblk_prepare_dump(struct vtblk_softc *);
static int	vtblk_write_dump(struct vtblk_softc *, void *, off_t, size_t);
static int	vtblk_flush_dump(struct vtblk_softc *);
static int	vtblk_poll_request(struct vtblk_softc *,
		    struct vtblk_request *);

static void	vtblk_finish_completed(struct vtblk_softc *);
static void	vtblk_drain_vq(struct vtblk_softc *, int);
static void	vtblk_drain(struct vtblk_softc *);

static int	vtblk_alloc_requests(struct vtblk_softc *);
static void	vtblk_free_requests(struct vtblk_softc *);
static struct vtblk_request * vtblk_dequeue_request(struct vtblk_softc *);
static void	vtblk_enqueue_request(struct vtblk_softc *,
		    struct vtblk_request *);

static struct vtblk_request * vtblk_dequeue_ready(struct vtblk_softc *);
static void	vtblk_enqueue_ready(struct vtblk_softc *,
		    struct vtblk_request *);

static int	vtblk_request_error(struct vtblk_request *);
static void	vtblk_finish_bio(struct bio *, int);

/* Tunables. */
static int vtblk_no_ident = 0;
TUNABLE_INT("hw.vtblk.no_ident", &vtblk_no_ident);

/* Features desired/implemented by this driver. */
#define VTBLK_FEATURES \
    (VIRTIO_BLK_F_BARRIER		| \
     VIRTIO_BLK_F_SIZE_MAX		| \
     VIRTIO_BLK_F_SEG_MAX		| \
     VIRTIO_BLK_F_GEOMETRY		| \
     VIRTIO_BLK_F_RO			| \
     VIRTIO_BLK_F_BLK_SIZE		| \
     VIRTIO_BLK_F_FLUSH			| \
     VIRTIO_RING_F_INDIRECT_DESC)

#define VTBLK_MTX(_sc)		&(_sc)->vtblk_mtx
#define VTBLK_LOCK_INIT(_sc, _name) \
				mtx_init(VTBLK_MTX((_sc)), (_name), \
				    "VTBLK Lock", MTX_DEF)
#define VTBLK_LOCK(_sc)		mtx_lock(VTBLK_MTX((_sc)))
#define VTBLK_UNLOCK(_sc)	mtx_unlock(VTBLK_MTX((_sc)))
#define VTBLK_LOCK_DESTROY(_sc)	mtx_destroy(VTBLK_MTX((_sc)))
#define VTBLK_LOCK_ASSERT(_sc)	mtx_assert(VTBLK_MTX((_sc)), MA_OWNED)
#define VTBLK_LOCK_ASSERT_NOTOWNED(_sc) \
				mtx_assert(VTBLK_MTX((_sc)), MA_NOTOWNED)

#define VTBLK_DISK_NAME		"vtbd"
#define VTBLK_QUIESCE_TIMEOUT	(30 * hz)

/*
 * Each block request uses at least two segments - one for the header
 * and one for the status.
 */
#define VTBLK_MIN_SEGMENTS	2

static device_method_t vtblk_methods[] = {
	/* Device methods. */
	DEVMETHOD(device_probe,		vtblk_probe),
	DEVMETHOD(device_attach,	vtblk_attach),
	DEVMETHOD(device_detach,	vtblk_detach),
	DEVMETHOD(device_suspend,	vtblk_suspend),
	DEVMETHOD(device_resume,	vtblk_resume),
	DEVMETHOD(device_shutdown,	vtblk_shutdown),

	DEVMETHOD_END
};

static driver_t vtblk_driver = {
	"vtblk",
	vtblk_methods,
	sizeof(struct vtblk_softc)
};
static devclass_t vtblk_devclass;

DRIVER_MODULE(virtio_blk, virtio_pci, vtblk_driver, vtblk_devclass,
    vtblk_modevent, 0);
MODULE_VERSION(virtio_blk, 1);
MODULE_DEPEND(virtio_blk, virtio, 1, 1, 1);

static int
vtblk_modevent(module_t mod, int type, void *unused)
{
	int error;

	error = 0;

	switch (type) {
	case MOD_LOAD:
	case MOD_QUIESCE:
	case MOD_UNLOAD:
	case MOD_SHUTDOWN:
		break;
	default:
		error = EOPNOTSUPP;
		break;
	}

	return (error);
}

static int
vtblk_probe(device_t dev)
{

	if (virtio_get_device_type(dev) != VIRTIO_ID_BLOCK)
		return (ENXIO);

	device_set_desc(dev, "VirtIO Block Adapter");

	return (BUS_PROBE_DEFAULT);
}

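/*
 * Negotiate features, size the virtqueue and segment list, preallocate
 * requests, create the disk, and hook up the virtqueue interrupt. On
 * any failure, fall through to vtblk_detach() to tear everything down.
 */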
static int
vtblk_attach(device_t dev)
{
	struct vtblk_softc *sc;
	struct virtio_blk_config blkcfg;
	int error;

	sc = device_get_softc(dev);
	sc->vtblk_dev = dev;

	VTBLK_LOCK_INIT(sc, device_get_nameunit(dev));

	bioq_init(&sc->vtblk_bioq);
	TAILQ_INIT(&sc->vtblk_req_free);
	TAILQ_INIT(&sc->vtblk_req_ready);

	virtio_set_feature_desc(dev, vtblk_feature_desc);
	vtblk_negotiate_features(sc);

	if (virtio_with_feature(dev, VIRTIO_RING_F_INDIRECT_DESC))
		sc->vtblk_flags |= VTBLK_FLAG_INDIRECT;
	if (virtio_with_feature(dev, VIRTIO_BLK_F_RO))
		sc->vtblk_flags |= VTBLK_FLAG_READONLY;
	if (virtio_with_feature(dev, VIRTIO_BLK_F_BARRIER))
		sc->vtblk_flags |= VTBLK_FLAG_BARRIER;

	/* Get local copy of config. */
	virtio_read_device_config(dev, 0, &blkcfg,
	    sizeof(struct virtio_blk_config));

	/*
	 * With the current sglist(9) implementation, it is not easy
	 * for us to support a maximum segment size because adjacent
	 * segments are coalesced. For now, just make sure the host's
	 * maximum segment size is no smaller than the maximum
	 * supported transfer size.
	 */
	if (virtio_with_feature(dev, VIRTIO_BLK_F_SIZE_MAX)) {
		if (blkcfg.size_max < MAXPHYS) {
			error = ENOTSUP;
			device_printf(dev, "host requires unsupported "
			    "maximum segment size feature\n");
			goto fail;
		}
	}

	sc->vtblk_max_nsegs = vtblk_maximum_segments(sc, &blkcfg);
	if (sc->vtblk_max_nsegs <= VTBLK_MIN_SEGMENTS) {
		error = EINVAL;
		device_printf(dev, "fewer than minimum number of segments "
		    "allowed: %d\n", sc->vtblk_max_nsegs);
		goto fail;
	}

	sc->vtblk_sglist = sglist_alloc(sc->vtblk_max_nsegs, M_NOWAIT);
	if (sc->vtblk_sglist == NULL) {
		error = ENOMEM;
		device_printf(dev, "cannot allocate sglist\n");
		goto fail;
	}

	error = vtblk_alloc_virtqueue(sc);
	if (error) {
		device_printf(dev, "cannot allocate virtqueue\n");
		goto fail;
	}

	error = vtblk_alloc_requests(sc);
	if (error) {
		device_printf(dev, "cannot preallocate requests\n");
		goto fail;
	}

	vtblk_alloc_disk(sc, &blkcfg);

	TASK_INIT(&sc->vtblk_intr_task, 0, vtblk_intr_task, sc);
	sc->vtblk_tq = taskqueue_create_fast("vtblk_taskq", M_NOWAIT,
	    taskqueue_thread_enqueue, &sc->vtblk_tq);
	if (sc->vtblk_tq == NULL) {
		error = ENOMEM;
		device_printf(dev, "cannot allocate taskqueue\n");
		goto fail;
	}

	error = virtio_setup_intr(dev, INTR_TYPE_BIO | INTR_ENTROPY);
	if (error) {
		device_printf(dev, "cannot setup virtqueue interrupt\n");
		goto fail;
	}

	taskqueue_start_threads(&sc->vtblk_tq, 1, PI_DISK, "%s taskq",
	    device_get_nameunit(dev));

	vtblk_create_disk(sc);

	virtqueue_enable_intr(sc->vtblk_vq);

fail:
	if (error)
		vtblk_detach(dev);

	return (error);
}

static int
vtblk_detach(device_t dev)
{
	struct vtblk_softc *sc;

	sc = device_get_softc(dev);

	VTBLK_LOCK(sc);
	sc->vtblk_flags |= VTBLK_FLAG_DETACH;
	if (device_is_attached(dev))
		vtblk_stop(sc);
	VTBLK_UNLOCK(sc);

	if (sc->vtblk_tq != NULL) {
		taskqueue_drain(sc->vtblk_tq, &sc->vtblk_intr_task);
		taskqueue_free(sc->vtblk_tq);
		sc->vtblk_tq = NULL;
	}

	vtblk_drain(sc);

	if (sc->vtblk_disk != NULL) {
		disk_destroy(sc->vtblk_disk);
		sc->vtblk_disk = NULL;
	}

	if (sc->vtblk_sglist != NULL) {
		sglist_free(sc->vtblk_sglist);
		sc->vtblk_sglist = NULL;
	}

	VTBLK_LOCK_DESTROY(sc);

	return (0);
}

static int
vtblk_suspend(device_t dev)
{
	struct vtblk_softc *sc;
	int error;

	sc = device_get_softc(dev);

	VTBLK_LOCK(sc);
	sc->vtblk_flags |= VTBLK_FLAG_SUSPEND;
	/* XXX BMV: virtio_stop(), etc needed here? */
	error = vtblk_quiesce(sc);
	if (error)
		sc->vtblk_flags &= ~VTBLK_FLAG_SUSPEND;
	VTBLK_UNLOCK(sc);

	return (error);
}

static int
vtblk_resume(device_t dev)
{
	struct vtblk_softc *sc;

	sc = device_get_softc(dev);

	VTBLK_LOCK(sc);
	/* XXX BMV: virtio_reinit(), etc needed here? */
	sc->vtblk_flags &= ~VTBLK_FLAG_SUSPEND;
	vtblk_startio(sc);
	VTBLK_UNLOCK(sc);

	return (0);
}

static int
vtblk_shutdown(device_t dev)
{

	return (0);
}

static int
vtblk_open(struct disk *dp)
{
	struct vtblk_softc *sc;

	if ((sc = dp->d_drv1) == NULL)
		return (ENXIO);

	return (sc->vtblk_flags & VTBLK_FLAG_DETACH ? ENXIO : 0);
}

static int
vtblk_close(struct disk *dp)
{
	struct vtblk_softc *sc;

	if ((sc = dp->d_drv1) == NULL)
		return (ENXIO);

	return (0);
}

static int
vtblk_ioctl(struct disk *dp, u_long cmd, void *addr, int flag,
    struct thread *td)
{
	struct vtblk_softc *sc;

	if ((sc = dp->d_drv1) == NULL)
		return (ENXIO);

	return (ENOTTY);
}

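/*
 * Kernel dump entry point. The first call resets the device into a
 * polled state; subsequent calls write dump blocks, and a final call
 * with a zero length flushes the device.
 */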
static int
vtblk_dump(void *arg, void *virtual, vm_offset_t physical, off_t offset,
    size_t length)
{
	struct disk *dp;
	struct vtblk_softc *sc;
	int error;

	dp = arg;

	if ((sc = dp->d_drv1) == NULL)
		return (ENXIO);

	VTBLK_LOCK(sc);

	if ((sc->vtblk_flags & VTBLK_FLAG_DUMPING) == 0) {
		vtblk_prepare_dump(sc);
		sc->vtblk_flags |= VTBLK_FLAG_DUMPING;
	}

	if (length > 0)
		error = vtblk_write_dump(sc, virtual, offset, length);
	else if (virtual == NULL && offset == 0)
		error = vtblk_flush_dump(sc);
	else {
		error = EINVAL;
		sc->vtblk_flags &= ~VTBLK_FLAG_DUMPING;
	}

	VTBLK_UNLOCK(sc);

	return (error);
}

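/* GEOM strategy routine: queue the bio and start I/O when possible. */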
static void
vtblk_strategy(struct bio *bp)
{
	struct vtblk_softc *sc;

	if ((sc = bp->bio_disk->d_drv1) == NULL) {
		vtblk_finish_bio(bp, EINVAL);
		return;
	}

	/*
	 * Fail any write if RO. Unfortunately, there does not seem to
	 * be a better way to report our read-only status to GEOM above.
	 */
	if (sc->vtblk_flags & VTBLK_FLAG_READONLY &&
	    (bp->bio_cmd == BIO_WRITE || bp->bio_cmd == BIO_FLUSH)) {
		vtblk_finish_bio(bp, EROFS);
		return;
	}

#ifdef INVARIANTS
	/*
	 * Prevent read/write buffers spanning too many segments from
	 * getting into the queue. This should only trip if d_maxsize
	 * was incorrectly set.
	 */
	if (bp->bio_cmd == BIO_READ || bp->bio_cmd == BIO_WRITE) {
		int nsegs, max_nsegs;

		nsegs = sglist_count(bp->bio_data, bp->bio_bcount);
		max_nsegs = sc->vtblk_max_nsegs - VTBLK_MIN_SEGMENTS;

		KASSERT(nsegs <= max_nsegs,
		    ("bio %p spanned too many segments: %d, max: %d",
		    bp, nsegs, max_nsegs));
	}
#endif

	VTBLK_LOCK(sc);
	if (sc->vtblk_flags & VTBLK_FLAG_DETACH)
		vtblk_finish_bio(bp, ENXIO);
	else {
		bioq_disksort(&sc->vtblk_bioq, bp);

		if ((sc->vtblk_flags & VTBLK_FLAG_SUSPEND) == 0)
			vtblk_startio(sc);
	}
	VTBLK_UNLOCK(sc);
}

static void
vtblk_negotiate_features(struct vtblk_softc *sc)
{
	device_t dev;
	uint64_t features;

	dev = sc->vtblk_dev;
	features = VTBLK_FEATURES;

	sc->vtblk_features = virtio_negotiate_features(dev, features);
}

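/*
 * Compute the maximum number of scatter/gather segments per request:
 * the header and status segments, plus the data segments. The data
 * segment count comes from the host's seg_max, capped at what MAXPHYS
 * requires, and the total is further limited by VIRTIO_MAX_INDIRECT
 * when indirect descriptors are in use.
 */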
static int
vtblk_maximum_segments(struct vtblk_softc *sc,
    struct virtio_blk_config *blkcfg)
{
	device_t dev;
	int nsegs;

	dev = sc->vtblk_dev;
	nsegs = VTBLK_MIN_SEGMENTS;

	if (virtio_with_feature(dev, VIRTIO_BLK_F_SEG_MAX)) {
		nsegs += MIN(blkcfg->seg_max, MAXPHYS / PAGE_SIZE + 1);
		if (sc->vtblk_flags & VTBLK_FLAG_INDIRECT)
			nsegs = MIN(nsegs, VIRTIO_MAX_INDIRECT);
	} else
		nsegs += 1;

	return (nsegs);
}

static int
vtblk_alloc_virtqueue(struct vtblk_softc *sc)
{
	device_t dev;
	struct vq_alloc_info vq_info;

	dev = sc->vtblk_dev;

	VQ_ALLOC_INFO_INIT(&vq_info, sc->vtblk_max_nsegs,
	    vtblk_vq_intr, sc, &sc->vtblk_vq,
	    "%s request", device_get_nameunit(dev));

	return (virtio_alloc_virtqueues(dev, 0, 1, &vq_info));
}

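/*
 * Allocate and populate the disk(9) structure from the negotiated
 * features and the device configuration.
 */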
static void
vtblk_alloc_disk(struct vtblk_softc *sc, struct virtio_blk_config *blkcfg)
{
	device_t dev;
	struct disk *dp;

	dev = sc->vtblk_dev;

	sc->vtblk_disk = dp = disk_alloc();
	dp->d_open = vtblk_open;
	dp->d_close = vtblk_close;
	dp->d_ioctl = vtblk_ioctl;
	dp->d_strategy = vtblk_strategy;
	dp->d_name = VTBLK_DISK_NAME;
	dp->d_unit = device_get_unit(dev);
	dp->d_drv1 = sc;

	if ((sc->vtblk_flags & VTBLK_FLAG_READONLY) == 0)
		dp->d_dump = vtblk_dump;

	/* Capacity is always in 512-byte units. */
	dp->d_mediasize = blkcfg->capacity * 512;

	if (virtio_with_feature(dev, VIRTIO_BLK_F_BLK_SIZE))
		dp->d_sectorsize = blkcfg->blk_size;
	else
		dp->d_sectorsize = 512;

	/*
	 * The VirtIO maximum I/O size is given in terms of segments.
	 * However, FreeBSD limits I/O size by logical buffer size, not
	 * by physically contiguous pages. Therefore, we have to assume
	 * no pages are contiguous. This may impose an artificially low
	 * maximum I/O size. But in practice, since QEMU advertises 128
	 * segments, this gives us a maximum I/O size of 125 * PAGE_SIZE,
	 * which is typically greater than MAXPHYS. Eventually we should
	 * just advertise MAXPHYS and split buffers that are too big.
	 *
	 * Note we must subtract one additional segment in case of
	 * non-page-aligned buffers.
	 */
	dp->d_maxsize = (sc->vtblk_max_nsegs - VTBLK_MIN_SEGMENTS - 1) *
	    PAGE_SIZE;
	if (dp->d_maxsize < PAGE_SIZE)
		dp->d_maxsize = PAGE_SIZE; /* XXX */

	if (virtio_with_feature(dev, VIRTIO_BLK_F_GEOMETRY)) {
		dp->d_fwsectors = blkcfg->geometry.sectors;
		dp->d_fwheads = blkcfg->geometry.heads;
	}

	if (virtio_with_feature(dev, VIRTIO_BLK_F_FLUSH))
		dp->d_flags |= DISKFLAG_CANFLUSHCACHE;
}

static void
vtblk_create_disk(struct vtblk_softc *sc)
{
	struct disk *dp;

	dp = sc->vtblk_disk;

	/*
	 * Retrieving the identification string must be done after the
	 * virtqueue interrupt is set up, otherwise it will hang.
	 */
	vtblk_get_ident(sc);

	device_printf(sc->vtblk_dev, "%juMB (%ju %u byte sectors)\n",
	    (uintmax_t) dp->d_mediasize >> 20,
	    (uintmax_t) dp->d_mediasize / dp->d_sectorsize,
	    dp->d_sectorsize);

	disk_create(dp, DISK_VERSION);
}

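/*
 * Wait, up to VTBLK_QUIESCE_TIMEOUT, for all in-flight requests to
 * complete. Returns EBUSY if the virtqueue does not drain in time.
 */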
static int
vtblk_quiesce(struct vtblk_softc *sc)
{
	int error;

	error = 0;

	VTBLK_LOCK_ASSERT(sc);

	while (!virtqueue_empty(sc->vtblk_vq)) {
		if (mtx_sleep(&sc->vtblk_vq, VTBLK_MTX(sc), PRIBIO, "vtblkq",
		    VTBLK_QUIESCE_TIMEOUT) == EWOULDBLOCK) {
			error = EBUSY;
			break;
		}
	}

	return (error);
}

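/*
 * Fill the virtqueue from the ready list and then the bio queue, and
 * notify the host if anything was enqueued. A request that cannot be
 * enqueued is parked on the ready list to be retried later.
 */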
static void
vtblk_startio(struct vtblk_softc *sc)
{
	struct virtqueue *vq;
	struct vtblk_request *req;
	int enq;

	vq = sc->vtblk_vq;
	enq = 0;

	VTBLK_LOCK_ASSERT(sc);

	while (!virtqueue_full(vq)) {
		if ((req = vtblk_dequeue_ready(sc)) == NULL)
			req = vtblk_bio_request(sc);
		if (req == NULL)
			break;

		if (vtblk_execute_request(sc, req) != 0) {
			vtblk_enqueue_ready(sc, req);
			break;
		}

		enq++;
	}

	if (enq > 0)
		virtqueue_notify(vq);
}

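/* Bind the next queued bio to a free request structure. */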
static struct vtblk_request *
vtblk_bio_request(struct vtblk_softc *sc)
{
	struct bio_queue_head *bioq;
	struct vtblk_request *req;
	struct bio *bp;

	bioq = &sc->vtblk_bioq;

	if (bioq_first(bioq) == NULL)
		return (NULL);

	req = vtblk_dequeue_request(sc);
	if (req == NULL)
		return (NULL);

	bp = bioq_takefirst(bioq);
	req->vbr_bp = bp;
	req->vbr_ack = -1;
	req->vbr_hdr.ioprio = 1;

	switch (bp->bio_cmd) {
	case BIO_FLUSH:
		req->vbr_hdr.type = VIRTIO_BLK_T_FLUSH;
		break;
	case BIO_READ:
		req->vbr_hdr.type = VIRTIO_BLK_T_IN;
		req->vbr_hdr.sector = bp->bio_offset / 512;
		break;
	case BIO_WRITE:
		req->vbr_hdr.type = VIRTIO_BLK_T_OUT;
		req->vbr_hdr.sector = bp->bio_offset / 512;
		break;
	default:
		panic("%s: bio with unhandled cmd: %d", __FUNCTION__,
		    bp->bio_cmd);
	}

	return (req);
}

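/*
 * Enqueue a request on the virtqueue. The scatter/gather list is laid
 * out as the read-only header, the data buffer (device-writable for
 * reads), and the device-writable status byte.
 */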
static int
vtblk_execute_request(struct vtblk_softc *sc, struct vtblk_request *req)
{
	struct virtqueue *vq;
	struct sglist *sg;
	struct bio *bp;
	int ordered, readable, writable, error;

	vq = sc->vtblk_vq;
	sg = sc->vtblk_sglist;
	bp = req->vbr_bp;
	ordered = 0;
	writable = 0;

	VTBLK_LOCK_ASSERT(sc);

	/*
	 * Wait until the ordered request completes before
	 * executing subsequent requests.
	 */
	if (sc->vtblk_req_ordered != NULL)
		return (EBUSY);

	if (bp->bio_flags & BIO_ORDERED) {
		if ((sc->vtblk_flags & VTBLK_FLAG_BARRIER) == 0) {
			/*
			 * This request will be executed once all
			 * the in-flight requests are completed.
			 */
			if (!virtqueue_empty(vq))
				return (EBUSY);
			ordered = 1;
		} else
			req->vbr_hdr.type |= VIRTIO_BLK_T_BARRIER;
	}

	sglist_reset(sg);

	sglist_append(sg, &req->vbr_hdr, sizeof(struct virtio_blk_outhdr));

	if (bp->bio_cmd == BIO_READ || bp->bio_cmd == BIO_WRITE) {
		error = sglist_append(sg, bp->bio_data, bp->bio_bcount);
		if (error || sg->sg_nseg == sg->sg_maxseg)
			panic("%s: data buffer too big bio:%p error:%d",
			    __FUNCTION__, bp, error);

		/* BIO_READ means the host writes into our buffer. */
		if (bp->bio_cmd == BIO_READ)
			writable = sg->sg_nseg - 1;
	}

	writable++;
	sglist_append(sg, &req->vbr_ack, sizeof(uint8_t));
	readable = sg->sg_nseg - writable;

	error = virtqueue_enqueue(vq, req, sg, readable, writable);
	if (error == 0 && ordered)
		sc->vtblk_req_ordered = req;

	return (error);
}

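/*
 * Virtqueue interrupt handler: disable the virtqueue interrupt and
 * defer completion processing to the taskqueue.
 */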
static int
vtblk_vq_intr(void *xsc)
{
	struct vtblk_softc *sc;

	sc = xsc;

	virtqueue_disable_intr(sc->vtblk_vq);
	taskqueue_enqueue_fast(sc->vtblk_tq, &sc->vtblk_intr_task);

	return (1);
}

static void
vtblk_intr_task(void *arg, int pending)
{
	struct vtblk_softc *sc;
	struct virtqueue *vq;

	sc = arg;
	vq = sc->vtblk_vq;

	VTBLK_LOCK(sc);
	if (sc->vtblk_flags & VTBLK_FLAG_DETACH) {
		VTBLK_UNLOCK(sc);
		return;
	}

	vtblk_finish_completed(sc);

	if ((sc->vtblk_flags & VTBLK_FLAG_SUSPEND) == 0)
		vtblk_startio(sc);
	else
		wakeup(&sc->vtblk_vq);

	if (virtqueue_enable_intr(vq) != 0) {
		virtqueue_disable_intr(vq);
		VTBLK_UNLOCK(sc);
		taskqueue_enqueue_fast(sc->vtblk_tq,
		    &sc->vtblk_intr_task);
		return;
	}

	VTBLK_UNLOCK(sc);
}

static void
vtblk_stop(struct vtblk_softc *sc)
{

	virtqueue_disable_intr(sc->vtblk_vq);
	virtio_stop(sc->vtblk_dev);
}

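/*
 * Issue a VIRTIO_BLK_T_GET_ID request to fetch the device's
 * identification string into d_ident. Disabled when the
 * hw.vtblk.no_ident tunable is set.
 */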
static void
vtblk_get_ident(struct vtblk_softc *sc)
{
	struct bio buf;
	struct disk *dp;
	struct vtblk_request *req;
	int len, error;

	dp = sc->vtblk_disk;
	len = MIN(VIRTIO_BLK_ID_BYTES, DISK_IDENT_SIZE);

	if (vtblk_no_ident != 0)
		return;

	req = vtblk_dequeue_request(sc);
	if (req == NULL)
		return;

	req->vbr_ack = -1;
	req->vbr_hdr.type = VIRTIO_BLK_T_GET_ID;
	req->vbr_hdr.ioprio = 1;
	req->vbr_hdr.sector = 0;

	req->vbr_bp = &buf;
	bzero(&buf, sizeof(struct bio));

	buf.bio_cmd = BIO_READ;
	buf.bio_data = dp->d_ident;
	buf.bio_bcount = len;

	VTBLK_LOCK(sc);
	error = vtblk_poll_request(sc, req);
	VTBLK_UNLOCK(sc);

	vtblk_enqueue_request(sc, req);

	if (error) {
		device_printf(sc->vtblk_dev,
		    "error getting device identifier: %d\n", error);
	}
}

static void
vtblk_prepare_dump(struct vtblk_softc *sc)
{
	device_t dev;
	struct virtqueue *vq;

	dev = sc->vtblk_dev;
	vq = sc->vtblk_vq;

	vtblk_stop(sc);

	/*
	 * Drain all requests caught in-flight in the virtqueue,
	 * skipping biodone(). When dumping, only one request is
	 * outstanding at a time, and we just poll the virtqueue
	 * for the response.
	 */
	vtblk_drain_vq(sc, 1);

	if (virtio_reinit(dev, sc->vtblk_features) != 0)
		panic("cannot reinit VirtIO block device during dump");

	virtqueue_disable_intr(vq);
	virtio_reinit_complete(dev);
}

static int
vtblk_write_dump(struct vtblk_softc *sc, void *virtual, off_t offset,
    size_t length)
{
	struct bio buf;
	struct vtblk_request *req;

	req = &sc->vtblk_dump_request;
	req->vbr_ack = -1;
	req->vbr_hdr.type = VIRTIO_BLK_T_OUT;
	req->vbr_hdr.ioprio = 1;
	req->vbr_hdr.sector = offset / 512;

	req->vbr_bp = &buf;
	bzero(&buf, sizeof(struct bio));

	buf.bio_cmd = BIO_WRITE;
	buf.bio_data = virtual;
	buf.bio_bcount = length;

	return (vtblk_poll_request(sc, req));
}

static int
vtblk_flush_dump(struct vtblk_softc *sc)
{
	struct bio buf;
	struct vtblk_request *req;

	req = &sc->vtblk_dump_request;
	req->vbr_ack = -1;
	req->vbr_hdr.type = VIRTIO_BLK_T_FLUSH;
	req->vbr_hdr.ioprio = 1;
	req->vbr_hdr.sector = 0;

	req->vbr_bp = &buf;
	bzero(&buf, sizeof(struct bio));

	buf.bio_cmd = BIO_FLUSH;

	return (vtblk_poll_request(sc, req));
}

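/*
 * Execute a single request and spin until the host completes it.
 * Used by the dump path and vtblk_get_ident(), where interrupt-driven
 * completion is unavailable or undesirable.
 */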
static int
vtblk_poll_request(struct vtblk_softc *sc, struct vtblk_request *req)
{
	struct virtqueue *vq;
	struct vtblk_request *r;
	int error;

	vq = sc->vtblk_vq;

	if (!virtqueue_empty(vq))
		return (EBUSY);

	error = vtblk_execute_request(sc, req);
	if (error)
		return (error);

	virtqueue_notify(vq);

	r = virtqueue_poll(vq, NULL);
	KASSERT(r == req, ("unexpected request response: %p/%p", r, req));

	error = vtblk_request_error(req);
	if (error && bootverbose) {
		device_printf(sc->vtblk_dev,
		    "%s: IO error: %d\n", __FUNCTION__, error);
	}

	return (error);
}

static void
vtblk_finish_completed(struct vtblk_softc *sc)
{
	struct vtblk_request *req;
	struct bio *bp;
	int error;

	while ((req = virtqueue_dequeue(sc->vtblk_vq, NULL)) != NULL) {
		bp = req->vbr_bp;

		if (sc->vtblk_req_ordered != NULL) {
			/* This should be the only outstanding request. */
			MPASS(sc->vtblk_req_ordered == req);
			sc->vtblk_req_ordered = NULL;
		}

		error = vtblk_request_error(req);
		if (error)
			disk_err(bp, "hard error", -1, 1);

		vtblk_finish_bio(bp, error);
		vtblk_enqueue_request(sc, req);
	}
}

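/*
 * Drain every request still in the virtqueue, optionally skipping
 * biodone() (the dump case), and return the requests to the free list.
 */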
static void
vtblk_drain_vq(struct vtblk_softc *sc, int skip_done)
{
	struct virtqueue *vq;
	struct vtblk_request *req;
	int last;

	vq = sc->vtblk_vq;
	last = 0;

	while ((req = virtqueue_drain(vq, &last)) != NULL) {
		if (!skip_done)
			vtblk_finish_bio(req->vbr_bp, ENXIO);

		vtblk_enqueue_request(sc, req);
	}

	sc->vtblk_req_ordered = NULL;
	KASSERT(virtqueue_empty(vq), ("virtqueue not empty"));
}

static void
vtblk_drain(struct vtblk_softc *sc)
{
	struct bio_queue_head *bioq;
	struct vtblk_request *req;
	struct bio *bp;

	bioq = &sc->vtblk_bioq;

	if (sc->vtblk_vq != NULL) {
		vtblk_finish_completed(sc);
		vtblk_drain_vq(sc, 0);
	}

	while ((req = vtblk_dequeue_ready(sc)) != NULL) {
		vtblk_finish_bio(req->vbr_bp, ENXIO);
		vtblk_enqueue_request(sc, req);
	}

	while (bioq_first(bioq) != NULL) {
		bp = bioq_takefirst(bioq);
		vtblk_finish_bio(bp, ENXIO);
	}

	vtblk_free_requests(sc);
}

#ifdef INVARIANTS
static void
vtblk_request_invariants(struct vtblk_request *req)
{
	int hdr_nsegs, ack_nsegs;

	hdr_nsegs = sglist_count(&req->vbr_hdr, sizeof(req->vbr_hdr));
	ack_nsegs = sglist_count(&req->vbr_ack, sizeof(req->vbr_ack));

	KASSERT(hdr_nsegs == 1, ("request header crossed page boundary"));
	KASSERT(ack_nsegs == 1, ("request ack crossed page boundary"));
}
#endif

static int
vtblk_alloc_requests(struct vtblk_softc *sc)
{
	struct vtblk_request *req;
	int i, nreqs;

	nreqs = virtqueue_size(sc->vtblk_vq);

	/*
	 * Preallocate sufficient requests to keep the virtqueue full. Each
	 * request consumes VTBLK_MIN_SEGMENTS or more descriptors, so reduce
	 * the number allocated when indirect descriptors are not available.
	 */
	if ((sc->vtblk_flags & VTBLK_FLAG_INDIRECT) == 0)
		nreqs /= VTBLK_MIN_SEGMENTS;

	for (i = 0; i < nreqs; i++) {
		req = malloc(sizeof(struct vtblk_request), M_DEVBUF, M_NOWAIT);
		if (req == NULL)
			return (ENOMEM);

#ifdef INVARIANTS
		vtblk_request_invariants(req);
#endif

		sc->vtblk_request_count++;
		vtblk_enqueue_request(sc, req);
	}

	return (0);
}

static void
vtblk_free_requests(struct vtblk_softc *sc)
{
	struct vtblk_request *req;

	KASSERT(TAILQ_EMPTY(&sc->vtblk_req_ready),
	    ("ready requests left on queue"));

	while ((req = vtblk_dequeue_request(sc)) != NULL) {
		sc->vtblk_request_count--;
		free(req, M_DEVBUF);
	}

	KASSERT(sc->vtblk_request_count == 0,
	    ("leaked requests: %d", sc->vtblk_request_count));
}

static struct vtblk_request *
vtblk_dequeue_request(struct vtblk_softc *sc)
{
	struct vtblk_request *req;

	req = TAILQ_FIRST(&sc->vtblk_req_free);
	if (req != NULL)
		TAILQ_REMOVE(&sc->vtblk_req_free, req, vbr_link);

	return (req);
}

static void
vtblk_enqueue_request(struct vtblk_softc *sc, struct vtblk_request *req)
{

	bzero(req, sizeof(struct vtblk_request));
	TAILQ_INSERT_HEAD(&sc->vtblk_req_free, req, vbr_link);
}

static struct vtblk_request *
vtblk_dequeue_ready(struct vtblk_softc *sc)
{
	struct vtblk_request *req;

	req = TAILQ_FIRST(&sc->vtblk_req_ready);
	if (req != NULL)
		TAILQ_REMOVE(&sc->vtblk_req_ready, req, vbr_link);

	return (req);
}

static void
vtblk_enqueue_ready(struct vtblk_softc *sc, struct vtblk_request *req)
{

	TAILQ_INSERT_HEAD(&sc->vtblk_req_ready, req, vbr_link);
}

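/* Map the VirtIO status byte to an errno value. */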
static int
vtblk_request_error(struct vtblk_request *req)
{
	int error;

	switch (req->vbr_ack) {
	case VIRTIO_BLK_S_OK:
		error = 0;
		break;
	case VIRTIO_BLK_S_UNSUPP:
		error = ENOTSUP;
		break;
	default:
		error = EIO;
		break;
	}

	return (error);
}

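/* Complete a bio, recording any error before biodone(). */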
static void
vtblk_finish_bio(struct bio *bp, int error)
{

	if (error) {
		bp->bio_resid = bp->bio_bcount;
		bp->bio_error = error;
		bp->bio_flags |= BIO_ERROR;
	}

	biodone(bp);
}
1239