/*-
 * Copyright (c) 2011, Bryan Venteicher <bryanv@daemoninthecloset.org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice unmodified, this list of conditions, and the following
 *    disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/* Driver for VirtIO block devices. */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/bio.h>
#include <sys/malloc.h>
#include <sys/module.h>
#include <sys/sglist.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/queue.h>
#include <sys/taskqueue.h>

#include <geom/geom_disk.h>
#include <vm/uma.h>

#include <machine/bus.h>
#include <machine/resource.h>
#include <sys/bus.h>
#include <sys/rman.h>

#include <dev/virtio/virtio.h>
#include <dev/virtio/virtqueue.h>
#include <dev/virtio/block/virtio_blk.h>

#include "virtio_if.h"

struct vtblk_request {
	struct virtio_blk_outhdr	 vbr_hdr;
	struct bio			*vbr_bp;
	uint8_t				 vbr_ack;

	TAILQ_ENTRY(vtblk_request)	 vbr_link;
};

struct vtblk_softc {
	device_t		 vtblk_dev;
	struct mtx		 vtblk_mtx;
	uint64_t		 vtblk_features;
	uint32_t		 vtblk_flags;
#define VTBLK_FLAG_INDIRECT	0x0001
#define VTBLK_FLAG_READONLY	0x0002
#define VTBLK_FLAG_DETACHING	0x0004
#define VTBLK_FLAG_SUSPENDED	0x0008
#define VTBLK_FLAG_DUMPING	0x0010

	struct virtqueue	*vtblk_vq;
	struct sglist		*vtblk_sglist;
	struct disk		*vtblk_disk;

	struct bio_queue_head	 vtblk_bioq;
	TAILQ_HEAD(, vtblk_request)
				 vtblk_req_free;
	TAILQ_HEAD(, vtblk_request)
				 vtblk_req_ready;

	struct taskqueue	*vtblk_tq;
	struct task		 vtblk_intr_task;

	int			 vtblk_max_nsegs;
	int			 vtblk_request_count;

	struct vtblk_request	 vtblk_dump_request;
};

static struct virtio_feature_desc vtblk_feature_desc[] = {
	{ VIRTIO_BLK_F_BARRIER,		"HostBarrier"	},
	{ VIRTIO_BLK_F_SIZE_MAX,	"MaxSegSize"	},
	{ VIRTIO_BLK_F_SEG_MAX,		"MaxNumSegs"	},
	{ VIRTIO_BLK_F_GEOMETRY,	"DiskGeometry"	},
	{ VIRTIO_BLK_F_RO,		"ReadOnly"	},
	{ VIRTIO_BLK_F_BLK_SIZE,	"BlockSize"	},
	{ VIRTIO_BLK_F_SCSI,		"SCSICmds"	},
	{ VIRTIO_BLK_F_FLUSH,		"FlushCmd"	},
	{ VIRTIO_BLK_F_TOPOLOGY,	"Topology"	},

	{ 0, NULL }
};

static int	vtblk_modevent(module_t, int, void *);

static int	vtblk_probe(device_t);
static int	vtblk_attach(device_t);
static int	vtblk_detach(device_t);
static int	vtblk_suspend(device_t);
static int	vtblk_resume(device_t);
static int	vtblk_shutdown(device_t);

static void	vtblk_negotiate_features(struct vtblk_softc *);
static int	vtblk_maximum_segments(struct vtblk_softc *,
		    struct virtio_blk_config *);
static int	vtblk_alloc_virtqueue(struct vtblk_softc *);
static void	vtblk_alloc_disk(struct vtblk_softc *,
		    struct virtio_blk_config *);
static void	vtblk_create_disk(struct vtblk_softc *);

static int	vtblk_open(struct disk *);
static int	vtblk_close(struct disk *);
static int	vtblk_ioctl(struct disk *, u_long, void *, int,
		    struct thread *);
static int	vtblk_dump(void *, void *, vm_offset_t, off_t, size_t);
static void	vtblk_strategy(struct bio *);

static void	vtblk_startio(struct vtblk_softc *);
static struct vtblk_request * vtblk_bio_request(struct vtblk_softc *);
static int	vtblk_execute_request(struct vtblk_softc *,
		    struct vtblk_request *);

static int	vtblk_vq_intr(void *);
static void	vtblk_intr_task(void *, int);

static void	vtblk_stop(struct vtblk_softc *);

static void	vtblk_get_ident(struct vtblk_softc *);
static void	vtblk_prepare_dump(struct vtblk_softc *);
static int	vtblk_write_dump(struct vtblk_softc *, void *, off_t, size_t);
static int	vtblk_flush_dump(struct vtblk_softc *);
static int	vtblk_poll_request(struct vtblk_softc *,
		    struct vtblk_request *);

static void	vtblk_drain_vq(struct vtblk_softc *, int);
static void	vtblk_drain(struct vtblk_softc *);

static int	vtblk_alloc_requests(struct vtblk_softc *);
static void	vtblk_free_requests(struct vtblk_softc *);
static struct vtblk_request * vtblk_dequeue_request(struct vtblk_softc *);
static void	vtblk_enqueue_request(struct vtblk_softc *,
		    struct vtblk_request *);

static struct vtblk_request * vtblk_dequeue_ready(struct vtblk_softc *);
static void	vtblk_enqueue_ready(struct vtblk_softc *,
		    struct vtblk_request *);

static void	vtblk_bio_error(struct bio *, int);

/* Tunables. */
static int vtblk_no_ident = 0;
TUNABLE_INT("hw.vtblk.no_ident", &vtblk_no_ident);

/* Features desired/implemented by this driver. */
#define VTBLK_FEATURES \
    (VIRTIO_BLK_F_BARRIER		| \
     VIRTIO_BLK_F_SIZE_MAX		| \
     VIRTIO_BLK_F_SEG_MAX		| \
     VIRTIO_BLK_F_GEOMETRY		| \
     VIRTIO_BLK_F_RO			| \
     VIRTIO_BLK_F_BLK_SIZE		| \
     VIRTIO_BLK_F_FLUSH			| \
     VIRTIO_RING_F_INDIRECT_DESC)

#define VTBLK_MTX(_sc)		&(_sc)->vtblk_mtx
#define VTBLK_LOCK_INIT(_sc, _name) \
				mtx_init(VTBLK_MTX((_sc)), (_name), \
				    "VTBLK Lock", MTX_DEF)
#define VTBLK_LOCK(_sc)		mtx_lock(VTBLK_MTX((_sc)))
#define VTBLK_UNLOCK(_sc)	mtx_unlock(VTBLK_MTX((_sc)))
#define VTBLK_LOCK_DESTROY(_sc)	mtx_destroy(VTBLK_MTX((_sc)))
#define VTBLK_LOCK_ASSERT(_sc)	mtx_assert(VTBLK_MTX((_sc)), MA_OWNED)
#define VTBLK_LOCK_ASSERT_NOTOWNED(_sc) \
				mtx_assert(VTBLK_MTX((_sc)), MA_NOTOWNED)

#define VTBLK_BIO_SEGMENTS(_bp)	sglist_count((_bp)->bio_data, (_bp)->bio_bcount)

#define VTBLK_DISK_NAME		"vtbd"

/*
 * Each block request uses at least two segments - one for the header
 * and one for the status.
 */
#define VTBLK_MIN_SEGMENTS	2

static uma_zone_t vtblk_req_zone;

static device_method_t vtblk_methods[] = {
	/* Device methods. */
	DEVMETHOD(device_probe,		vtblk_probe),
	DEVMETHOD(device_attach,	vtblk_attach),
	DEVMETHOD(device_detach,	vtblk_detach),
	DEVMETHOD(device_suspend,	vtblk_suspend),
	DEVMETHOD(device_resume,	vtblk_resume),
	DEVMETHOD(device_shutdown,	vtblk_shutdown),

	{ 0, 0 }
};

static driver_t vtblk_driver = {
	"vtblk",
	vtblk_methods,
	sizeof(struct vtblk_softc)
};
static devclass_t vtblk_devclass;

DRIVER_MODULE(virtio_blk, virtio_pci, vtblk_driver, vtblk_devclass,
    vtblk_modevent, 0);
MODULE_VERSION(virtio_blk, 1);
MODULE_DEPEND(virtio_blk, virtio, 1, 1, 1);

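/*
 * The request zone is shared by all vtblk instances: it is created at
 * MOD_LOAD and destroyed only once no requests remain allocated.
 */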
static int
vtblk_modevent(module_t mod, int type, void *unused)
{
	int error;

	error = 0;

	switch (type) {
	case MOD_LOAD:
		vtblk_req_zone = uma_zcreate("vtblk_request",
		    sizeof(struct vtblk_request),
		    NULL, NULL, NULL, NULL, 0, 0);
		break;
	case MOD_QUIESCE:
	case MOD_UNLOAD:
		if (uma_zone_get_cur(vtblk_req_zone) > 0)
			error = EBUSY;
		else if (type == MOD_UNLOAD) {
			uma_zdestroy(vtblk_req_zone);
			vtblk_req_zone = NULL;
		}
		break;
	case MOD_SHUTDOWN:
		break;
	default:
		error = EOPNOTSUPP;
		break;
	}

	return (error);
}

static int
vtblk_probe(device_t dev)
{

	if (virtio_get_device_type(dev) != VIRTIO_ID_BLOCK)
		return (ENXIO);

	device_set_desc(dev, "VirtIO Block Adapter");

	return (BUS_PROBE_DEFAULT);
}

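/*
 * Attach: negotiate features, size the request sglist from the device
 * configuration, allocate the virtqueue and the request pool, then
 * create the disk(9) instance once interrupts are operational.
 */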
static int
vtblk_attach(device_t dev)
{
	struct vtblk_softc *sc;
	struct virtio_blk_config blkcfg;
	int error;

	sc = device_get_softc(dev);
	sc->vtblk_dev = dev;

	VTBLK_LOCK_INIT(sc, device_get_nameunit(dev));

	bioq_init(&sc->vtblk_bioq);
	TAILQ_INIT(&sc->vtblk_req_free);
	TAILQ_INIT(&sc->vtblk_req_ready);

	virtio_set_feature_desc(dev, vtblk_feature_desc);
	vtblk_negotiate_features(sc);

	if (virtio_with_feature(dev, VIRTIO_RING_F_INDIRECT_DESC))
		sc->vtblk_flags |= VTBLK_FLAG_INDIRECT;

	if (virtio_with_feature(dev, VIRTIO_BLK_F_RO))
		sc->vtblk_flags |= VTBLK_FLAG_READONLY;

	/* Get local copy of config. */
	virtio_read_device_config(dev, 0, &blkcfg,
	    sizeof(struct virtio_blk_config));

	/*
	 * With the current sglist(9) implementation, it is not easy
	 * for us to support a maximum segment size as adjacent
	 * segments are coalesced. For now, just make sure it's larger
	 * than the maximum supported transfer size.
	 */
	if (virtio_with_feature(dev, VIRTIO_BLK_F_SIZE_MAX)) {
		if (blkcfg.size_max < MAXPHYS) {
			error = ENOTSUP;
			device_printf(dev, "host requires unsupported "
			    "maximum segment size feature\n");
			goto fail;
		}
	}

	sc->vtblk_max_nsegs = vtblk_maximum_segments(sc, &blkcfg);

	/*
	 * Allocate working sglist. The number of segments may be too
	 * large to safely store on the stack.
	 */
	sc->vtblk_sglist = sglist_alloc(sc->vtblk_max_nsegs, M_NOWAIT);
	if (sc->vtblk_sglist == NULL) {
		error = ENOMEM;
		device_printf(dev, "cannot allocate sglist\n");
		goto fail;
	}

	error = vtblk_alloc_virtqueue(sc);
	if (error) {
		device_printf(dev, "cannot allocate virtqueue\n");
		goto fail;
	}

	error = vtblk_alloc_requests(sc);
	if (error) {
		device_printf(dev, "cannot preallocate requests\n");
		goto fail;
	}

	vtblk_alloc_disk(sc, &blkcfg);

	TASK_INIT(&sc->vtblk_intr_task, 0, vtblk_intr_task, sc);
	sc->vtblk_tq = taskqueue_create_fast("vtblk_taskq", M_NOWAIT,
	    taskqueue_thread_enqueue, &sc->vtblk_tq);
	if (sc->vtblk_tq == NULL) {
		error = ENOMEM;
		device_printf(dev, "cannot allocate taskqueue\n");
		goto fail;
	}
	taskqueue_start_threads(&sc->vtblk_tq, 1, PI_DISK, "%s taskq",
	    device_get_nameunit(dev));

	error = virtio_setup_intr(dev, INTR_TYPE_BIO | INTR_ENTROPY);
	if (error) {
		device_printf(dev, "cannot setup virtqueue interrupt\n");
		goto fail;
	}

	vtblk_create_disk(sc);

	virtqueue_enable_intr(sc->vtblk_vq);

fail:
	if (error)
		vtblk_detach(dev);

	return (error);
}

static int
vtblk_detach(device_t dev)
{
	struct vtblk_softc *sc;

	sc = device_get_softc(dev);

	VTBLK_LOCK(sc);
	sc->vtblk_flags |= VTBLK_FLAG_DETACHING;
	if (device_is_attached(dev))
		vtblk_stop(sc);
	VTBLK_UNLOCK(sc);

	if (sc->vtblk_tq != NULL) {
		taskqueue_drain(sc->vtblk_tq, &sc->vtblk_intr_task);
		taskqueue_free(sc->vtblk_tq);
		sc->vtblk_tq = NULL;
	}

	vtblk_drain(sc);

	if (sc->vtblk_disk != NULL) {
		disk_destroy(sc->vtblk_disk);
		sc->vtblk_disk = NULL;
	}

	if (sc->vtblk_sglist != NULL) {
		sglist_free(sc->vtblk_sglist);
		sc->vtblk_sglist = NULL;
	}

	VTBLK_LOCK_DESTROY(sc);

	return (0);
}

static int
vtblk_suspend(device_t dev)
{
	struct vtblk_softc *sc;

	sc = device_get_softc(dev);

	VTBLK_LOCK(sc);
	sc->vtblk_flags |= VTBLK_FLAG_SUSPENDED;
	/* TODO Wait for any inflight IO to complete? */
	VTBLK_UNLOCK(sc);

	return (0);
}

static int
vtblk_resume(device_t dev)
{
	struct vtblk_softc *sc;

	sc = device_get_softc(dev);

	VTBLK_LOCK(sc);
	sc->vtblk_flags &= ~VTBLK_FLAG_SUSPENDED;
	/* TODO Resume IO? */
	VTBLK_UNLOCK(sc);

	return (0);
}

static int
vtblk_shutdown(device_t dev)
{

	return (0);
}

static int
vtblk_open(struct disk *dp)
{
	struct vtblk_softc *sc;

	if ((sc = dp->d_drv1) == NULL)
		return (ENXIO);

	return (sc->vtblk_flags & VTBLK_FLAG_DETACHING ? ENXIO : 0);
}

static int
vtblk_close(struct disk *dp)
{
	struct vtblk_softc *sc;

	if ((sc = dp->d_drv1) == NULL)
		return (ENXIO);

	return (0);
}

static int
vtblk_ioctl(struct disk *dp, u_long cmd, void *addr, int flag,
    struct thread *td)
{
	struct vtblk_softc *sc;

	if ((sc = dp->d_drv1) == NULL)
		return (ENXIO);

	return (ENOTTY);
}

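/*
 * Kernel dump routine, installed as d_dump in vtblk_alloc_disk().
 * Called with length > 0 to write a chunk of memory, and finally with
 * virtual == NULL and offset == 0 to flush the dump. All I/O here is
 * performed by polling the virtqueue; see vtblk_prepare_dump().
 */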
static int
vtblk_dump(void *arg, void *virtual, vm_offset_t physical, off_t offset,
    size_t length)
{
	struct disk *dp;
	struct vtblk_softc *sc;
	int error;

	dp = arg;
	error = 0;

	if ((sc = dp->d_drv1) == NULL)
		return (ENXIO);

	VTBLK_LOCK(sc);

	if ((sc->vtblk_flags & VTBLK_FLAG_DUMPING) == 0) {
		vtblk_prepare_dump(sc);
		sc->vtblk_flags |= VTBLK_FLAG_DUMPING;
	}

	if (length > 0)
		error = vtblk_write_dump(sc, virtual, offset, length);
	else if (virtual == NULL && offset == 0)
		error = vtblk_flush_dump(sc);

	VTBLK_UNLOCK(sc);

	return (error);
}

static void
vtblk_strategy(struct bio *bp)
{
	struct vtblk_softc *sc;

	if ((sc = bp->bio_disk->d_drv1) == NULL) {
		vtblk_bio_error(bp, EINVAL);
		return;
	}

	/*
	 * Fail any write if RO. Unfortunately, there does not seem to
	 * be a better way to report our readonly'ness to GEOM above.
	 */
	if (sc->vtblk_flags & VTBLK_FLAG_READONLY &&
	    (bp->bio_cmd == BIO_WRITE || bp->bio_cmd == BIO_FLUSH)) {
		vtblk_bio_error(bp, EROFS);
		return;
	}

	/*
	 * Prevent read/write buffers spanning too many segments from
	 * getting into the queue. This should only trip if d_maxsize
	 * was incorrectly set.
	 */
	if (bp->bio_cmd == BIO_READ || bp->bio_cmd == BIO_WRITE) {
		KASSERT(VTBLK_BIO_SEGMENTS(bp) <= sc->vtblk_max_nsegs -
		    VTBLK_MIN_SEGMENTS,
		    ("bio spanned too many segments: %d, max: %d",
		    VTBLK_BIO_SEGMENTS(bp),
		    sc->vtblk_max_nsegs - VTBLK_MIN_SEGMENTS));
	}

	VTBLK_LOCK(sc);
	if ((sc->vtblk_flags & VTBLK_FLAG_DETACHING) == 0) {
		bioq_disksort(&sc->vtblk_bioq, bp);
		vtblk_startio(sc);
	} else
		vtblk_bio_error(bp, ENXIO);
	VTBLK_UNLOCK(sc);
}

static void
vtblk_negotiate_features(struct vtblk_softc *sc)
{
	device_t dev;
	uint64_t features;

	dev = sc->vtblk_dev;
	features = VTBLK_FEATURES;

	sc->vtblk_features = virtio_negotiate_features(dev, features);
}

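/*
 * Compute the maximum number of scatter/gather segments a request may
 * need: VTBLK_MIN_SEGMENTS for the header and status, plus the data
 * segments. A MAXPHYS-sized buffer spans at most MAXPHYS / PAGE_SIZE
 * pages, plus one more when it is not page aligned. Indirect
 * descriptors cap the total at VIRTIO_MAX_INDIRECT.
 */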
static int
vtblk_maximum_segments(struct vtblk_softc *sc,
    struct virtio_blk_config *blkcfg)
{
	device_t dev;
	int nsegs;

	dev = sc->vtblk_dev;
	nsegs = VTBLK_MIN_SEGMENTS;

	if (virtio_with_feature(dev, VIRTIO_BLK_F_SEG_MAX)) {
		nsegs += MIN(blkcfg->seg_max, MAXPHYS / PAGE_SIZE + 1);
		if (sc->vtblk_flags & VTBLK_FLAG_INDIRECT)
			nsegs = MIN(nsegs, VIRTIO_MAX_INDIRECT);
	} else
		nsegs += 1;

	return (nsegs);
}

static int
vtblk_alloc_virtqueue(struct vtblk_softc *sc)
{
	device_t dev;
	struct vq_alloc_info vq_info;

	dev = sc->vtblk_dev;

	VQ_ALLOC_INFO_INIT(&vq_info, sc->vtblk_max_nsegs,
	    vtblk_vq_intr, sc, &sc->vtblk_vq,
	    "%s request", device_get_nameunit(dev));

	return (virtio_alloc_virtqueues(dev, 0, 1, &vq_info));
}

static void
vtblk_alloc_disk(struct vtblk_softc *sc, struct virtio_blk_config *blkcfg)
{
	device_t dev;
	struct disk *dp;

	dev = sc->vtblk_dev;

	sc->vtblk_disk = dp = disk_alloc();
	dp->d_open = vtblk_open;
	dp->d_close = vtblk_close;
	dp->d_ioctl = vtblk_ioctl;
	dp->d_strategy = vtblk_strategy;
	dp->d_name = VTBLK_DISK_NAME;
	dp->d_unit = device_get_unit(dev);
	dp->d_drv1 = sc;

	if ((sc->vtblk_flags & VTBLK_FLAG_READONLY) == 0)
		dp->d_dump = vtblk_dump;

	/* Capacity is always in 512-byte units. */
	dp->d_mediasize = blkcfg->capacity * 512;

	if (virtio_with_feature(dev, VIRTIO_BLK_F_BLK_SIZE))
		dp->d_sectorsize = blkcfg->blk_size;
	else
		dp->d_sectorsize = 512;

	/*
	 * The VirtIO maximum I/O size is given in terms of segments.
	 * However, FreeBSD limits I/O size by logical buffer size, not
	 * by physically contiguous pages. Therefore, we have to assume
	 * no pages are contiguous. This may impose an artificially low
	 * maximum I/O size. But in practice, since QEMU advertises 128
	 * segments, this gives us a maximum I/O size of 125 * PAGE_SIZE,
	 * which is typically greater than MAXPHYS. Eventually we should
	 * just advertise MAXPHYS and split buffers that are too big.
	 *
	 * Note we must subtract one additional segment in case of
	 * non-page-aligned buffers.
	 */
	dp->d_maxsize = (sc->vtblk_max_nsegs - VTBLK_MIN_SEGMENTS - 1) *
	    PAGE_SIZE;
	if (dp->d_maxsize < PAGE_SIZE)
		dp->d_maxsize = PAGE_SIZE; /* XXX */

	if (virtio_with_feature(dev, VIRTIO_BLK_F_GEOMETRY)) {
		dp->d_fwsectors = blkcfg->geometry.sectors;
		dp->d_fwheads = blkcfg->geometry.heads;
	}

	if (virtio_with_feature(dev, VIRTIO_BLK_F_FLUSH))
		dp->d_flags |= DISKFLAG_CANFLUSHCACHE;
}

static void
vtblk_create_disk(struct vtblk_softc *sc)
{
	struct disk *dp;

	dp = sc->vtblk_disk;

	/*
	 * Retrieving the identification string must be done after
	 * the virtqueue interrupt is set up, otherwise it will hang.
	 */
	vtblk_get_ident(sc);

	device_printf(sc->vtblk_dev, "%juMB (%ju %u byte sectors)\n",
	    (uintmax_t) dp->d_mediasize >> 20,
	    (uintmax_t) dp->d_mediasize / dp->d_sectorsize,
	    dp->d_sectorsize);

	disk_create(dp, DISK_VERSION);
}

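/*
 * Submit queued bios to the virtqueue until it fills up, notifying
 * the host once if anything was enqueued. A request that does not fit
 * is parked on the ready list and retried after the next completion.
 */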
static void
vtblk_startio(struct vtblk_softc *sc)
{
	struct virtqueue *vq;
	struct vtblk_request *req;
	int enq;

	vq = sc->vtblk_vq;
	enq = 0;

	VTBLK_LOCK_ASSERT(sc);

	if (sc->vtblk_flags & VTBLK_FLAG_SUSPENDED)
		return;

	while (!virtqueue_full(vq)) {
		if ((req = vtblk_dequeue_ready(sc)) == NULL)
			req = vtblk_bio_request(sc);
		if (req == NULL)
			break;

		if (vtblk_execute_request(sc, req) != 0) {
			vtblk_enqueue_ready(sc, req);
			break;
		}

		enq++;
	}

	if (enq > 0)
		virtqueue_notify(vq);
}

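/*
 * Turn the bio at the head of the queue into a VirtIO block request.
 * The sector field is always in 512-byte units, independent of the
 * advertised block size.
 */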
static struct vtblk_request *
vtblk_bio_request(struct vtblk_softc *sc)
{
	struct bio_queue_head *bioq;
	struct vtblk_request *req;
	struct bio *bp;

	bioq = &sc->vtblk_bioq;

	if (bioq_first(bioq) == NULL)
		return (NULL);

	req = vtblk_dequeue_request(sc);
	if (req == NULL)
		return (NULL);

	bp = bioq_takefirst(bioq);
	req->vbr_bp = bp;
	req->vbr_ack = -1;
	req->vbr_hdr.ioprio = 1;

	switch (bp->bio_cmd) {
	case BIO_FLUSH:
		req->vbr_hdr.type = VIRTIO_BLK_T_FLUSH;
		break;
	case BIO_READ:
		req->vbr_hdr.type = VIRTIO_BLK_T_IN;
		req->vbr_hdr.sector = bp->bio_offset / 512;
		break;
	case BIO_WRITE:
		req->vbr_hdr.type = VIRTIO_BLK_T_OUT;
		req->vbr_hdr.sector = bp->bio_offset / 512;
		break;
	default:
		KASSERT(0, ("bio with unhandled cmd: %d", bp->bio_cmd));
		req->vbr_hdr.type = -1;
		break;
	}

	if (bp->bio_flags & BIO_ORDERED)
		req->vbr_hdr.type |= VIRTIO_BLK_T_BARRIER;

	return (req);
}

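/*
 * Build the scatter/gather list for a request and enqueue it. The
 * layout follows the VirtIO block protocol: the device-readable
 * header, the data buffer (device-writable for reads), and the
 * device-writable one-byte status (ack).
 */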
static int
vtblk_execute_request(struct vtblk_softc *sc, struct vtblk_request *req)
{
	struct sglist *sg;
	struct bio *bp;
	int writable, error;

	sg = sc->vtblk_sglist;
	bp = req->vbr_bp;
	writable = 0;

	VTBLK_LOCK_ASSERT(sc);

	sglist_reset(sg);
	error = sglist_append(sg, &req->vbr_hdr,
	    sizeof(struct virtio_blk_outhdr));
	KASSERT(error == 0, ("error adding header to sglist"));
	KASSERT(sg->sg_nseg == 1,
	    ("header spanned multiple segments: %d", sg->sg_nseg));

	if (bp->bio_cmd == BIO_READ || bp->bio_cmd == BIO_WRITE) {
		error = sglist_append(sg, bp->bio_data, bp->bio_bcount);
		KASSERT(error == 0, ("error adding buffer to sglist"));

		/* BIO_READ means the host writes into our buffer. */
		if (bp->bio_cmd == BIO_READ)
			writable += sg->sg_nseg - 1;
	}

	error = sglist_append(sg, &req->vbr_ack, sizeof(uint8_t));
	KASSERT(error == 0, ("error adding ack to sglist"));
	writable++;

	KASSERT(sg->sg_nseg >= VTBLK_MIN_SEGMENTS,
	    ("fewer than min segments: %d", sg->sg_nseg));

	error = virtqueue_enqueue(sc->vtblk_vq, req, sg,
	    sg->sg_nseg - writable, writable);

	return (error);
}

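/*
 * Virtqueue interrupt filter. Completion processing is deferred to
 * the taskqueue; the virtqueue interrupt stays disabled until the
 * task has drained it.
 */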
static int
vtblk_vq_intr(void *xsc)
{
	struct vtblk_softc *sc;

	sc = xsc;

	virtqueue_disable_intr(sc->vtblk_vq);
	taskqueue_enqueue_fast(sc->vtblk_tq, &sc->vtblk_intr_task);

	return (1);
}

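/*
 * Completion task: dequeue finished requests, map the VirtIO status
 * byte to a bio error, and restart I/O. If more completions raced in
 * while interrupts were being re-enabled, requeue the task instead of
 * looping with the lock held.
 */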
static void
vtblk_intr_task(void *arg, int pending)
{
	struct vtblk_softc *sc;
	struct vtblk_request *req;
	struct virtqueue *vq;
	struct bio *bp;

	sc = arg;
	vq = sc->vtblk_vq;

	VTBLK_LOCK(sc);
	if (sc->vtblk_flags & VTBLK_FLAG_DETACHING) {
		VTBLK_UNLOCK(sc);
		return;
	}

	while ((req = virtqueue_dequeue(vq, NULL)) != NULL) {
		bp = req->vbr_bp;

		if (req->vbr_ack == VIRTIO_BLK_S_OK)
			bp->bio_resid = 0;
		else {
			bp->bio_flags |= BIO_ERROR;
			if (req->vbr_ack == VIRTIO_BLK_S_UNSUPP)
				bp->bio_error = ENOTSUP;
			else
				bp->bio_error = EIO;
		}

		biodone(bp);
		vtblk_enqueue_request(sc, req);
	}

	vtblk_startio(sc);

	if (virtqueue_enable_intr(vq) != 0) {
		virtqueue_disable_intr(vq);
		VTBLK_UNLOCK(sc);
		taskqueue_enqueue_fast(sc->vtblk_tq,
		    &sc->vtblk_intr_task);
		return;
	}

	VTBLK_UNLOCK(sc);
}

static void
vtblk_stop(struct vtblk_softc *sc)
{

	virtqueue_disable_intr(sc->vtblk_vq);
	virtio_stop(sc->vtblk_dev);
}

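/*
 * Fetch the device identification string into d_ident with a
 * VIRTIO_BLK_T_GET_ID request, using a stack-allocated bio that is
 * polled for completion rather than passed to biodone().
 */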
static void
vtblk_get_ident(struct vtblk_softc *sc)
{
	struct bio buf;
	struct disk *dp;
	struct vtblk_request *req;
	int len, error;

	dp = sc->vtblk_disk;
	len = MIN(VIRTIO_BLK_ID_BYTES, DISK_IDENT_SIZE);

	if (vtblk_no_ident != 0)
		return;

	req = vtblk_dequeue_request(sc);
	if (req == NULL)
		return;

	req->vbr_ack = -1;
	req->vbr_hdr.type = VIRTIO_BLK_T_GET_ID;
	req->vbr_hdr.ioprio = 1;
	req->vbr_hdr.sector = 0;

	req->vbr_bp = &buf;
	bzero(&buf, sizeof(struct bio));

	buf.bio_cmd = BIO_READ;
	buf.bio_data = dp->d_ident;
	buf.bio_bcount = len;

	VTBLK_LOCK(sc);
	error = vtblk_poll_request(sc, req);
	VTBLK_UNLOCK(sc);

	vtblk_enqueue_request(sc, req);

	if (error) {
		device_printf(sc->vtblk_dev,
		    "error getting device identifier: %d\n", error);
	}
}

static void
vtblk_prepare_dump(struct vtblk_softc *sc)
{
	device_t dev;
	struct virtqueue *vq;

	dev = sc->vtblk_dev;
	vq = sc->vtblk_vq;

	vtblk_stop(sc);

	/*
	 * Drain all requests caught in-flight in the virtqueue,
	 * skipping biodone(). When dumping, only one request is
	 * outstanding at a time, and we just poll the virtqueue
	 * for the response.
	 */
	vtblk_drain_vq(sc, 1);

	if (virtio_reinit(dev, sc->vtblk_features) != 0)
		panic("cannot reinit VirtIO block device during dump");

	virtqueue_disable_intr(vq);
	virtio_reinit_complete(dev);
}

static int
vtblk_write_dump(struct vtblk_softc *sc, void *virtual, off_t offset,
    size_t length)
{
	struct bio buf;
	struct vtblk_request *req;

	req = &sc->vtblk_dump_request;
	req->vbr_ack = -1;
	req->vbr_hdr.type = VIRTIO_BLK_T_OUT;
	req->vbr_hdr.ioprio = 1;
	req->vbr_hdr.sector = offset / 512;

	req->vbr_bp = &buf;
	bzero(&buf, sizeof(struct bio));

	buf.bio_cmd = BIO_WRITE;
	buf.bio_data = virtual;
	buf.bio_bcount = length;

	return (vtblk_poll_request(sc, req));
}

static int
vtblk_flush_dump(struct vtblk_softc *sc)
{
	struct bio buf;
	struct vtblk_request *req;

	req = &sc->vtblk_dump_request;
	req->vbr_ack = -1;
	req->vbr_hdr.type = VIRTIO_BLK_T_FLUSH;
	req->vbr_hdr.ioprio = 1;
	req->vbr_hdr.sector = 0;

	req->vbr_bp = &buf;
	bzero(&buf, sizeof(struct bio));

	buf.bio_cmd = BIO_FLUSH;

	return (vtblk_poll_request(sc, req));
}

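/*
 * Execute one request synchronously: enqueue it, notify the host, and
 * spin until the response is posted. Used by the identification and
 * crash dump paths instead of interrupt-driven completion.
 */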
static int
vtblk_poll_request(struct vtblk_softc *sc, struct vtblk_request *req)
{
	device_t dev;
	struct virtqueue *vq;
	struct vtblk_request *r;
	int error;

	dev = sc->vtblk_dev;
	vq = sc->vtblk_vq;

	if (!virtqueue_empty(vq))
		return (EBUSY);

	error = vtblk_execute_request(sc, req);
	if (error)
		return (error);

	virtqueue_notify(vq);

	r = virtqueue_poll(vq, NULL);
	KASSERT(r == req, ("unexpected request response"));

	if (req->vbr_ack != VIRTIO_BLK_S_OK) {
		error = req->vbr_ack == VIRTIO_BLK_S_UNSUPP ? ENOTSUP : EIO;
		if (bootverbose)
			device_printf(dev,
			    "vtblk_poll_request: IO error: %d\n", error);
	}

	return (error);
}

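/*
 * Reclaim any requests still in the virtqueue. Unless skip_done is
 * set, each attached bio is failed with ENXIO; the dump path sets
 * skip_done because the I/O it catches in-flight cannot be safely
 * completed through biodone() once the system has panicked.
 */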
static void
vtblk_drain_vq(struct vtblk_softc *sc, int skip_done)
{
	struct virtqueue *vq;
	struct vtblk_request *req;
	int last;

	vq = sc->vtblk_vq;
	last = 0;

	while ((req = virtqueue_drain(vq, &last)) != NULL) {
		if (!skip_done)
			vtblk_bio_error(req->vbr_bp, ENXIO);

		vtblk_enqueue_request(sc, req);
	}

	KASSERT(virtqueue_empty(vq), ("virtqueue not empty"));
}

static void
vtblk_drain(struct vtblk_softc *sc)
{
	struct bio_queue_head *bioq;
	struct vtblk_request *req;
	struct bio *bp;

	bioq = &sc->vtblk_bioq;

	if (sc->vtblk_vq != NULL)
		vtblk_drain_vq(sc, 0);

	while ((req = vtblk_dequeue_ready(sc)) != NULL) {
		vtblk_bio_error(req->vbr_bp, ENXIO);
		vtblk_enqueue_request(sc, req);
	}

	while (bioq_first(bioq) != NULL) {
		bp = bioq_takefirst(bioq);
		vtblk_bio_error(bp, ENXIO);
	}

	vtblk_free_requests(sc);
}

static int
vtblk_alloc_requests(struct vtblk_softc *sc)
{
	struct vtblk_request *req;
	int i, size;

	size = virtqueue_size(sc->vtblk_vq);

	/*
	 * Preallocate sufficient requests to keep the virtqueue full. Each
	 * request consumes VTBLK_MIN_SEGMENTS or more descriptors, so reduce
	 * the number allocated when indirect descriptors are not available.
	 */
	if ((sc->vtblk_flags & VTBLK_FLAG_INDIRECT) == 0)
		size /= VTBLK_MIN_SEGMENTS;

	for (i = 0; i < size; i++) {
		req = uma_zalloc(vtblk_req_zone, M_NOWAIT);
		if (req == NULL)
			return (ENOMEM);

		sc->vtblk_request_count++;
		vtblk_enqueue_request(sc, req);
	}

	return (0);
}

static void
vtblk_free_requests(struct vtblk_softc *sc)
{
	struct vtblk_request *req;

	while ((req = vtblk_dequeue_request(sc)) != NULL) {
		sc->vtblk_request_count--;
		uma_zfree(vtblk_req_zone, req);
	}

	KASSERT(sc->vtblk_request_count == 0, ("leaked requests"));
}

static struct vtblk_request *
vtblk_dequeue_request(struct vtblk_softc *sc)
{
	struct vtblk_request *req;

	req = TAILQ_FIRST(&sc->vtblk_req_free);
	if (req != NULL)
		TAILQ_REMOVE(&sc->vtblk_req_free, req, vbr_link);

	return (req);
}

static void
vtblk_enqueue_request(struct vtblk_softc *sc, struct vtblk_request *req)
{

	bzero(req, sizeof(struct vtblk_request));
	TAILQ_INSERT_HEAD(&sc->vtblk_req_free, req, vbr_link);
}

static struct vtblk_request *
vtblk_dequeue_ready(struct vtblk_softc *sc)
{
	struct vtblk_request *req;

	req = TAILQ_FIRST(&sc->vtblk_req_ready);
	if (req != NULL)
		TAILQ_REMOVE(&sc->vtblk_req_ready, req, vbr_link);

	return (req);
}

static void
vtblk_enqueue_ready(struct vtblk_softc *sc, struct vtblk_request *req)
{

	TAILQ_INSERT_HEAD(&sc->vtblk_req_ready, req, vbr_link);
}

static void
vtblk_bio_error(struct bio *bp, int error)
{

	biofinish(bp, NULL, error);
}