/*-
 * Copyright (c) 2011, Bryan Venteicher <bryanv@daemoninthecloset.org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice unmodified, this list of conditions, and the following
 *    disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/* Driver for VirtIO block devices. */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/bio.h>
#include <sys/malloc.h>
#include <sys/module.h>
#include <sys/sglist.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/queue.h>
#include <sys/taskqueue.h>

#include <geom/geom_disk.h>

#include <machine/bus.h>
#include <machine/resource.h>
#include <sys/bus.h>
#include <sys/rman.h>

#include <dev/virtio/virtio.h>
#include <dev/virtio/virtqueue.h>
#include <dev/virtio/block/virtio_blk.h>

#include "virtio_if.h"

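/*
 * A preallocated request tracks a single transfer: the header the host
 * reads, the bio being serviced, and the status byte the host writes
 * back on completion. vbr_link threads the request onto the softc's
 * free or ready lists.
 */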
struct vtblk_request {
	struct virtio_blk_outhdr	 vbr_hdr;
	struct bio			*vbr_bp;
	uint8_t				 vbr_ack;

	TAILQ_ENTRY(vtblk_request)	 vbr_link;
};

struct vtblk_softc {
	device_t		 vtblk_dev;
	struct mtx		 vtblk_mtx;
	uint64_t		 vtblk_features;
	uint32_t		 vtblk_flags;
#define VTBLK_FLAG_INDIRECT	0x0001
#define VTBLK_FLAG_READONLY	0x0002
#define VTBLK_FLAG_DETACH	0x0004
#define VTBLK_FLAG_SUSPEND	0x0008
#define VTBLK_FLAG_DUMPING	0x0010

	struct virtqueue	*vtblk_vq;
	struct sglist		*vtblk_sglist;
	struct disk		*vtblk_disk;

	struct bio_queue_head	 vtblk_bioq;
	TAILQ_HEAD(, vtblk_request)
				 vtblk_req_free;
	TAILQ_HEAD(, vtblk_request)
				 vtblk_req_ready;

	struct taskqueue	*vtblk_tq;
	struct task		 vtblk_intr_task;

	int			 vtblk_max_nsegs;
	int			 vtblk_request_count;

	struct vtblk_request	 vtblk_dump_request;
};

static struct virtio_feature_desc vtblk_feature_desc[] = {
	{ VIRTIO_BLK_F_BARRIER,		"HostBarrier"	},
	{ VIRTIO_BLK_F_SIZE_MAX,	"MaxSegSize"	},
	{ VIRTIO_BLK_F_SEG_MAX,		"MaxNumSegs"	},
	{ VIRTIO_BLK_F_GEOMETRY,	"DiskGeometry"	},
	{ VIRTIO_BLK_F_RO,		"ReadOnly"	},
	{ VIRTIO_BLK_F_BLK_SIZE,	"BlockSize"	},
	{ VIRTIO_BLK_F_SCSI,		"SCSICmds"	},
	{ VIRTIO_BLK_F_FLUSH,		"FlushCmd"	},
	{ VIRTIO_BLK_F_TOPOLOGY,	"Topology"	},

	{ 0, NULL }
};

static int	vtblk_modevent(module_t, int, void *);

static int	vtblk_probe(device_t);
static int	vtblk_attach(device_t);
static int	vtblk_detach(device_t);
static int	vtblk_suspend(device_t);
static int	vtblk_resume(device_t);
static int	vtblk_shutdown(device_t);

static int	vtblk_open(struct disk *);
static int	vtblk_close(struct disk *);
static int	vtblk_ioctl(struct disk *, u_long, void *, int,
		    struct thread *);
static int	vtblk_dump(void *, void *, vm_offset_t, off_t, size_t);
static void	vtblk_strategy(struct bio *);

static void	vtblk_negotiate_features(struct vtblk_softc *);
static int	vtblk_maximum_segments(struct vtblk_softc *,
		    struct virtio_blk_config *);
static int	vtblk_alloc_virtqueue(struct vtblk_softc *);
static void	vtblk_alloc_disk(struct vtblk_softc *,
		    struct virtio_blk_config *);
static void	vtblk_create_disk(struct vtblk_softc *);

static int	vtblk_quiesce(struct vtblk_softc *);
static void	vtblk_startio(struct vtblk_softc *);
static struct vtblk_request * vtblk_bio_request(struct vtblk_softc *);
static int	vtblk_execute_request(struct vtblk_softc *,
		    struct vtblk_request *);

static int	vtblk_vq_intr(void *);
static void	vtblk_intr_task(void *, int);

static void	vtblk_stop(struct vtblk_softc *);

static void	vtblk_get_ident(struct vtblk_softc *);
static void	vtblk_prepare_dump(struct vtblk_softc *);
static int	vtblk_write_dump(struct vtblk_softc *, void *, off_t, size_t);
static int	vtblk_flush_dump(struct vtblk_softc *);
static int	vtblk_poll_request(struct vtblk_softc *,
		    struct vtblk_request *);

static void	vtblk_finish_completed(struct vtblk_softc *);
static void	vtblk_drain_vq(struct vtblk_softc *, int);
static void	vtblk_drain(struct vtblk_softc *);

static int	vtblk_alloc_requests(struct vtblk_softc *);
static void	vtblk_free_requests(struct vtblk_softc *);
static struct vtblk_request * vtblk_dequeue_request(struct vtblk_softc *);
static void	vtblk_enqueue_request(struct vtblk_softc *,
		    struct vtblk_request *);

static struct vtblk_request * vtblk_dequeue_ready(struct vtblk_softc *);
static void	vtblk_enqueue_ready(struct vtblk_softc *,
		    struct vtblk_request *);

static int	vtblk_request_error(struct vtblk_request *);
static void	vtblk_finish_bio(struct bio *, int);

/* Tunables. */
static int vtblk_no_ident = 0;
TUNABLE_INT("hw.vtblk.no_ident", &vtblk_no_ident);

/* Features desired/implemented by this driver. */
#define VTBLK_FEATURES \
    (VIRTIO_BLK_F_BARRIER		| \
     VIRTIO_BLK_F_SIZE_MAX		| \
     VIRTIO_BLK_F_SEG_MAX		| \
     VIRTIO_BLK_F_GEOMETRY		| \
     VIRTIO_BLK_F_RO			| \
     VIRTIO_BLK_F_BLK_SIZE		| \
     VIRTIO_BLK_F_FLUSH			| \
     VIRTIO_RING_F_INDIRECT_DESC)

#define VTBLK_MTX(_sc)		&(_sc)->vtblk_mtx
#define VTBLK_LOCK_INIT(_sc, _name) \
				mtx_init(VTBLK_MTX((_sc)), (_name), \
				    "VTBLK Lock", MTX_DEF)
#define VTBLK_LOCK(_sc)		mtx_lock(VTBLK_MTX((_sc)))
#define VTBLK_UNLOCK(_sc)	mtx_unlock(VTBLK_MTX((_sc)))
#define VTBLK_LOCK_DESTROY(_sc)	mtx_destroy(VTBLK_MTX((_sc)))
#define VTBLK_LOCK_ASSERT(_sc)	mtx_assert(VTBLK_MTX((_sc)), MA_OWNED)
#define VTBLK_LOCK_ASSERT_NOTOWNED(_sc) \
				mtx_assert(VTBLK_MTX((_sc)), MA_NOTOWNED)

#define VTBLK_DISK_NAME		"vtbd"
#define VTBLK_QUIESCE_TIMEOUT	(30 * hz)

/*
 * Each block request uses at least two segments - one for the header
 * and one for the status.
 */
#define VTBLK_MIN_SEGMENTS	2

static device_method_t vtblk_methods[] = {
	/* Device methods. */
	DEVMETHOD(device_probe,		vtblk_probe),
	DEVMETHOD(device_attach,	vtblk_attach),
	DEVMETHOD(device_detach,	vtblk_detach),
	DEVMETHOD(device_suspend,	vtblk_suspend),
	DEVMETHOD(device_resume,	vtblk_resume),
	DEVMETHOD(device_shutdown,	vtblk_shutdown),

	DEVMETHOD_END
};

static driver_t vtblk_driver = {
	"vtblk",
	vtblk_methods,
	sizeof(struct vtblk_softc)
};
static devclass_t vtblk_devclass;

DRIVER_MODULE(virtio_blk, virtio_pci, vtblk_driver, vtblk_devclass,
    vtblk_modevent, 0);
MODULE_VERSION(virtio_blk, 1);
MODULE_DEPEND(virtio_blk, virtio, 1, 1, 1);

static int
vtblk_modevent(module_t mod, int type, void *unused)
{
	int error;

	error = 0;

	switch (type) {
	case MOD_LOAD:
	case MOD_QUIESCE:
	case MOD_UNLOAD:
	case MOD_SHUTDOWN:
		break;
	default:
		error = EOPNOTSUPP;
		break;
	}

	return (error);
}

static int
vtblk_probe(device_t dev)
{

	if (virtio_get_device_type(dev) != VIRTIO_ID_BLOCK)
		return (ENXIO);

	device_set_desc(dev, "VirtIO Block Adapter");

	return (BUS_PROBE_DEFAULT);
}

static int
vtblk_attach(device_t dev)
{
	struct vtblk_softc *sc;
	struct virtio_blk_config blkcfg;
	int error;

	sc = device_get_softc(dev);
	sc->vtblk_dev = dev;

	VTBLK_LOCK_INIT(sc, device_get_nameunit(dev));

	bioq_init(&sc->vtblk_bioq);
	TAILQ_INIT(&sc->vtblk_req_free);
	TAILQ_INIT(&sc->vtblk_req_ready);

	virtio_set_feature_desc(dev, vtblk_feature_desc);
	vtblk_negotiate_features(sc);

	if (virtio_with_feature(dev, VIRTIO_RING_F_INDIRECT_DESC))
		sc->vtblk_flags |= VTBLK_FLAG_INDIRECT;

	if (virtio_with_feature(dev, VIRTIO_BLK_F_RO))
		sc->vtblk_flags |= VTBLK_FLAG_READONLY;

	/* Get local copy of config. */
	virtio_read_device_config(dev, 0, &blkcfg,
	    sizeof(struct virtio_blk_config));

	/*
	 * With the current sglist(9) implementation, it is not easy
	 * for us to support a maximum segment size as adjacent
	 * segments are coalesced. For now, just make sure it's larger
	 * than the maximum supported transfer size.
	 */
	if (virtio_with_feature(dev, VIRTIO_BLK_F_SIZE_MAX)) {
		if (blkcfg.size_max < MAXPHYS) {
			error = ENOTSUP;
			device_printf(dev, "host requires unsupported "
			    "maximum segment size feature\n");
			goto fail;
		}
	}

	sc->vtblk_max_nsegs = vtblk_maximum_segments(sc, &blkcfg);
	if (sc->vtblk_max_nsegs <= VTBLK_MIN_SEGMENTS) {
		error = EINVAL;
		device_printf(dev, "fewer than minimum number of segments "
		    "allowed: %d\n", sc->vtblk_max_nsegs);
		goto fail;
	}

	sc->vtblk_sglist = sglist_alloc(sc->vtblk_max_nsegs, M_NOWAIT);
	if (sc->vtblk_sglist == NULL) {
		error = ENOMEM;
		device_printf(dev, "cannot allocate sglist\n");
		goto fail;
	}

	error = vtblk_alloc_virtqueue(sc);
	if (error) {
		device_printf(dev, "cannot allocate virtqueue\n");
		goto fail;
	}

	error = vtblk_alloc_requests(sc);
	if (error) {
		device_printf(dev, "cannot preallocate requests\n");
		goto fail;
	}

	vtblk_alloc_disk(sc, &blkcfg);

	TASK_INIT(&sc->vtblk_intr_task, 0, vtblk_intr_task, sc);
	sc->vtblk_tq = taskqueue_create_fast("vtblk_taskq", M_NOWAIT,
	    taskqueue_thread_enqueue, &sc->vtblk_tq);
	if (sc->vtblk_tq == NULL) {
		error = ENOMEM;
		device_printf(dev, "cannot allocate taskqueue\n");
		goto fail;
	}

	error = virtio_setup_intr(dev, INTR_TYPE_BIO | INTR_ENTROPY);
	if (error) {
		device_printf(dev, "cannot setup virtqueue interrupt\n");
		goto fail;
	}

	taskqueue_start_threads(&sc->vtblk_tq, 1, PI_DISK, "%s taskq",
	    device_get_nameunit(dev));

	vtblk_create_disk(sc);

	virtqueue_enable_intr(sc->vtblk_vq);

fail:
	if (error)
		vtblk_detach(dev);

	return (error);
}

static int
vtblk_detach(device_t dev)
{
	struct vtblk_softc *sc;

	sc = device_get_softc(dev);

	VTBLK_LOCK(sc);
	sc->vtblk_flags |= VTBLK_FLAG_DETACH;
	if (device_is_attached(dev))
		vtblk_stop(sc);
	VTBLK_UNLOCK(sc);

	if (sc->vtblk_tq != NULL) {
		taskqueue_drain(sc->vtblk_tq, &sc->vtblk_intr_task);
		taskqueue_free(sc->vtblk_tq);
		sc->vtblk_tq = NULL;
	}

	vtblk_drain(sc);

	if (sc->vtblk_disk != NULL) {
		disk_destroy(sc->vtblk_disk);
		sc->vtblk_disk = NULL;
	}

	if (sc->vtblk_sglist != NULL) {
		sglist_free(sc->vtblk_sglist);
		sc->vtblk_sglist = NULL;
	}

	VTBLK_LOCK_DESTROY(sc);

	return (0);
}

static int
vtblk_suspend(device_t dev)
{
	struct vtblk_softc *sc;
	int error;

	sc = device_get_softc(dev);

	VTBLK_LOCK(sc);
	sc->vtblk_flags |= VTBLK_FLAG_SUSPEND;
	/* XXX BMV: virtio_stop(), etc needed here? */
	error = vtblk_quiesce(sc);
	if (error)
		sc->vtblk_flags &= ~VTBLK_FLAG_SUSPEND;
	VTBLK_UNLOCK(sc);

	return (error);
}

static int
vtblk_resume(device_t dev)
{
	struct vtblk_softc *sc;

	sc = device_get_softc(dev);

	VTBLK_LOCK(sc);
	/* XXX BMV: virtio_reinit(), etc needed here? */
	sc->vtblk_flags &= ~VTBLK_FLAG_SUSPEND;
	vtblk_startio(sc);
	VTBLK_UNLOCK(sc);

	return (0);
}

static int
vtblk_shutdown(device_t dev)
{

	return (0);
}

static int
vtblk_open(struct disk *dp)
{
	struct vtblk_softc *sc;

	if ((sc = dp->d_drv1) == NULL)
		return (ENXIO);

	return (sc->vtblk_flags & VTBLK_FLAG_DETACH ? ENXIO : 0);
}

static int
vtblk_close(struct disk *dp)
{
	struct vtblk_softc *sc;

	if ((sc = dp->d_drv1) == NULL)
		return (ENXIO);

	return (0);
}

static int
vtblk_ioctl(struct disk *dp, u_long cmd, void *addr, int flag,
    struct thread *td)
{
	struct vtblk_softc *sc;

	if ((sc = dp->d_drv1) == NULL)
		return (ENXIO);

	return (ENOTTY);
}

static int
vtblk_dump(void *arg, void *virtual, vm_offset_t physical, off_t offset,
    size_t length)
{
	struct disk *dp;
	struct vtblk_softc *sc;
	int error;

	dp = arg;

	if ((sc = dp->d_drv1) == NULL)
		return (ENXIO);

	VTBLK_LOCK(sc);

	if ((sc->vtblk_flags & VTBLK_FLAG_DUMPING) == 0) {
		vtblk_prepare_dump(sc);
		sc->vtblk_flags |= VTBLK_FLAG_DUMPING;
	}

	if (length > 0)
		error = vtblk_write_dump(sc, virtual, offset, length);
	else if (virtual == NULL && offset == 0)
		error = vtblk_flush_dump(sc);
	else {
		error = EINVAL;
		sc->vtblk_flags &= ~VTBLK_FLAG_DUMPING;
	}

	VTBLK_UNLOCK(sc);

	return (error);
}

static void
vtblk_strategy(struct bio *bp)
{
	struct vtblk_softc *sc;

	if ((sc = bp->bio_disk->d_drv1) == NULL) {
		vtblk_finish_bio(bp, EINVAL);
		return;
	}

	/*
	 * Fail any write if RO. Unfortunately, there does not seem to
	 * be a better way to report our read-only status to GEOM above.
	 */
	if (sc->vtblk_flags & VTBLK_FLAG_READONLY &&
	    (bp->bio_cmd == BIO_WRITE || bp->bio_cmd == BIO_FLUSH)) {
		vtblk_finish_bio(bp, EROFS);
		return;
	}

#ifdef INVARIANTS
	/*
	 * Prevent read/write buffers spanning too many segments from
	 * getting into the queue. This should only trip if d_maxsize
	 * was incorrectly set.
	 */
	if (bp->bio_cmd == BIO_READ || bp->bio_cmd == BIO_WRITE) {
		int nsegs, max_nsegs;

		nsegs = sglist_count(bp->bio_data, bp->bio_bcount);
		max_nsegs = sc->vtblk_max_nsegs - VTBLK_MIN_SEGMENTS;

		KASSERT(nsegs <= max_nsegs,
		    ("bio %p spanned too many segments: %d, max: %d",
		    bp, nsegs, max_nsegs));
	}
#endif

	VTBLK_LOCK(sc);
	if (sc->vtblk_flags & VTBLK_FLAG_DETACH)
		vtblk_finish_bio(bp, ENXIO);
	else {
		bioq_disksort(&sc->vtblk_bioq, bp);

		if ((sc->vtblk_flags & VTBLK_FLAG_SUSPEND) == 0)
			vtblk_startio(sc);
	}
	VTBLK_UNLOCK(sc);
}

static void
vtblk_negotiate_features(struct vtblk_softc *sc)
{
	device_t dev;
	uint64_t features;

	dev = sc->vtblk_dev;
	features = VTBLK_FEATURES;

	sc->vtblk_features = virtio_negotiate_features(dev, features);
}

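/*
 * Compute the per-request segment limit: the two mandatory header and
 * ack segments, plus either the host's advertised seg_max (clamped to
 * what MAXPHYS requires) or a single data segment, capped at the
 * indirect descriptor table size when indirect descriptors are in use.
 */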
static int
vtblk_maximum_segments(struct vtblk_softc *sc,
    struct virtio_blk_config *blkcfg)
{
	device_t dev;
	int nsegs;

	dev = sc->vtblk_dev;
	nsegs = VTBLK_MIN_SEGMENTS;

	if (virtio_with_feature(dev, VIRTIO_BLK_F_SEG_MAX)) {
		nsegs += MIN(blkcfg->seg_max, MAXPHYS / PAGE_SIZE + 1);
		if (sc->vtblk_flags & VTBLK_FLAG_INDIRECT)
			nsegs = MIN(nsegs, VIRTIO_MAX_INDIRECT);
	} else
		nsegs += 1;

	return (nsegs);
}

static int
vtblk_alloc_virtqueue(struct vtblk_softc *sc)
{
	device_t dev;
	struct vq_alloc_info vq_info;

	dev = sc->vtblk_dev;

	VQ_ALLOC_INFO_INIT(&vq_info, sc->vtblk_max_nsegs,
	    vtblk_vq_intr, sc, &sc->vtblk_vq,
	    "%s request", device_get_nameunit(dev));

	return (virtio_alloc_virtqueues(dev, 0, 1, &vq_info));
}

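/* Allocate the GEOM disk and initialize it from the device config. */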
606 vtblk_alloc_disk(struct vtblk_softc *sc, struct virtio_blk_config *blkcfg)
607 {
608 	device_t dev;
609 	struct disk *dp;
610 
611 	dev = sc->vtblk_dev;
612 
613 	sc->vtblk_disk = dp = disk_alloc();
614 	dp->d_open = vtblk_open;
615 	dp->d_close = vtblk_close;
616 	dp->d_ioctl = vtblk_ioctl;
617 	dp->d_strategy = vtblk_strategy;
618 	dp->d_name = VTBLK_DISK_NAME;
619 	dp->d_unit = device_get_unit(dev);
620 	dp->d_drv1 = sc;
621 
622 	if ((sc->vtblk_flags & VTBLK_FLAG_READONLY) == 0)
623 		dp->d_dump = vtblk_dump;
624 
625 	/* Capacity is always in 512-byte units. */
626 	dp->d_mediasize = blkcfg->capacity * 512;
627 
628 	if (virtio_with_feature(dev, VIRTIO_BLK_F_BLK_SIZE))
629 		dp->d_sectorsize = blkcfg->blk_size;
630 	else
631 		dp->d_sectorsize = 512;
632 
	/*
	 * The VirtIO maximum I/O size is given in terms of segments.
	 * However, FreeBSD limits I/O size by logical buffer size, not
	 * by physically contiguous pages. Therefore, we have to assume
	 * no pages are contiguous. This may impose an artificially low
	 * maximum I/O size. But in practice, since QEMU advertises 128
	 * segments, this gives us a maximum I/O size of 125 * PAGE_SIZE,
	 * which is typically greater than MAXPHYS. Eventually we should
	 * just advertise MAXPHYS and split buffers that are too big.
	 *
	 * Note we must subtract one additional segment in case of
	 * non-page-aligned buffers.
	 */
	dp->d_maxsize = (sc->vtblk_max_nsegs - VTBLK_MIN_SEGMENTS - 1) *
	    PAGE_SIZE;
	if (dp->d_maxsize < PAGE_SIZE)
		dp->d_maxsize = PAGE_SIZE; /* XXX */

	if (virtio_with_feature(dev, VIRTIO_BLK_F_GEOMETRY)) {
		dp->d_fwsectors = blkcfg->geometry.sectors;
		dp->d_fwheads = blkcfg->geometry.heads;
	}

	if (virtio_with_feature(dev, VIRTIO_BLK_F_FLUSH))
		dp->d_flags |= DISKFLAG_CANFLUSHCACHE;
}

static void
vtblk_create_disk(struct vtblk_softc *sc)
{
	struct disk *dp;

	dp = sc->vtblk_disk;

	/*
	 * Retrieving the identification string must be done after the
	 * virtqueue interrupt is set up; otherwise it will hang.
	 */
	vtblk_get_ident(sc);

	device_printf(sc->vtblk_dev, "%juMB (%ju %u byte sectors)\n",
	    (uintmax_t) dp->d_mediasize >> 20,
	    (uintmax_t) dp->d_mediasize / dp->d_sectorsize,
	    dp->d_sectorsize);

	disk_create(dp, DISK_VERSION);
}

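/*
 * Wait up to VTBLK_QUIESCE_TIMEOUT for the virtqueue to empty; the
 * interrupt task wakes us as requests complete while suspending.
 */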
682 vtblk_quiesce(struct vtblk_softc *sc)
683 {
684 	int error;
685 
686 	error = 0;
687 
688 	VTBLK_LOCK_ASSERT(sc);
689 
690 	while (!virtqueue_empty(sc->vtblk_vq)) {
691 		if (mtx_sleep(&sc->vtblk_vq, VTBLK_MTX(sc), PRIBIO, "vtblkq",
692 		    VTBLK_QUIESCE_TIMEOUT) == EWOULDBLOCK) {
693 			error = EBUSY;
694 			break;
695 		}
696 	}
697 
698 	return (error);
699 }
700 
701 static void
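/*
 * Submit as many requests as the virtqueue will hold, retrying any
 * previously stalled request first, and notify the host once if
 * anything was enqueued.
 */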
702 vtblk_startio(struct vtblk_softc *sc)
703 {
704 	struct virtqueue *vq;
705 	struct vtblk_request *req;
706 	int enq;
707 
708 	vq = sc->vtblk_vq;
709 	enq = 0;
710 
711 	VTBLK_LOCK_ASSERT(sc);
712 
713 	while (!virtqueue_full(vq)) {
714 		if ((req = vtblk_dequeue_ready(sc)) == NULL)
715 			req = vtblk_bio_request(sc);
716 		if (req == NULL)
717 			break;
718 
719 		if (vtblk_execute_request(sc, req) != 0) {
720 			vtblk_enqueue_ready(sc, req);
721 			break;
722 		}
723 
724 		enq++;
725 	}
726 
727 	if (enq > 0)
728 		virtqueue_notify(vq);
729 }
730 
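/*
 * Take the next bio from the queue and translate it into a block
 * request header.
 */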
static struct vtblk_request *
vtblk_bio_request(struct vtblk_softc *sc)
{
	struct bio_queue_head *bioq;
	struct vtblk_request *req;
	struct bio *bp;

	bioq = &sc->vtblk_bioq;

	if (bioq_first(bioq) == NULL)
		return (NULL);

	req = vtblk_dequeue_request(sc);
	if (req == NULL)
		return (NULL);

	bp = bioq_takefirst(bioq);
	req->vbr_bp = bp;
	req->vbr_ack = -1;
	req->vbr_hdr.ioprio = 1;

	switch (bp->bio_cmd) {
	case BIO_FLUSH:
		req->vbr_hdr.type = VIRTIO_BLK_T_FLUSH;
		break;
	case BIO_READ:
		req->vbr_hdr.type = VIRTIO_BLK_T_IN;
		req->vbr_hdr.sector = bp->bio_offset / 512;
		break;
	case BIO_WRITE:
		req->vbr_hdr.type = VIRTIO_BLK_T_OUT;
		req->vbr_hdr.sector = bp->bio_offset / 512;
		break;
	default:
		panic("%s: bio with unhandled cmd: %d", __FUNCTION__,
		    bp->bio_cmd);
	}

	if (bp->bio_flags & BIO_ORDERED)
		req->vbr_hdr.type |= VIRTIO_BLK_T_BARRIER;

	return (req);
}

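/*
 * Build the scatter/gather list for a request - the device-readable
 * header, the data buffer (device-writable for BIO_READ), and the
 * device-writable ack byte - and enqueue it on the virtqueue.
 */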
static int
vtblk_execute_request(struct vtblk_softc *sc, struct vtblk_request *req)
{
	struct sglist *sg;
	struct bio *bp;
	int readable, writable, error;

	sg = sc->vtblk_sglist;
	bp = req->vbr_bp;
	writable = 0;

	VTBLK_LOCK_ASSERT(sc);

	sglist_reset(sg);

	sglist_append(sg, &req->vbr_hdr, sizeof(struct virtio_blk_outhdr));

	if (bp->bio_cmd == BIO_READ || bp->bio_cmd == BIO_WRITE) {
		error = sglist_append(sg, bp->bio_data, bp->bio_bcount);
		if (error || sg->sg_nseg == sg->sg_maxseg)
			panic("%s: data buffer too big bio:%p error:%d",
			    __FUNCTION__, bp, error);

		/* BIO_READ means the host writes into our buffer. */
		if (bp->bio_cmd == BIO_READ)
			writable = sg->sg_nseg - 1;
	}

	writable++;
	sglist_append(sg, &req->vbr_ack, sizeof(uint8_t));

	readable = sg->sg_nseg - writable;

	return (virtqueue_enqueue(sc->vtblk_vq, req, sg, readable, writable));
}

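/*
 * Virtqueue interrupt filter: mask further interrupts and defer the
 * completion work to the taskqueue.
 */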
static int
vtblk_vq_intr(void *xsc)
{
	struct vtblk_softc *sc;

	sc = xsc;

	virtqueue_disable_intr(sc->vtblk_vq);
	taskqueue_enqueue_fast(sc->vtblk_tq, &sc->vtblk_intr_task);

	return (1);
}

static void
vtblk_intr_task(void *arg, int pending)
{
	struct vtblk_softc *sc;
	struct virtqueue *vq;

	sc = arg;
	vq = sc->vtblk_vq;

	VTBLK_LOCK(sc);
	if (sc->vtblk_flags & VTBLK_FLAG_DETACH) {
		VTBLK_UNLOCK(sc);
		return;
	}

	vtblk_finish_completed(sc);

	if ((sc->vtblk_flags & VTBLK_FLAG_SUSPEND) == 0)
		vtblk_startio(sc);
	else
		wakeup(&sc->vtblk_vq);

	if (virtqueue_enable_intr(vq) != 0) {
		virtqueue_disable_intr(vq);
		VTBLK_UNLOCK(sc);
		taskqueue_enqueue_fast(sc->vtblk_tq,
		    &sc->vtblk_intr_task);
		return;
	}

	VTBLK_UNLOCK(sc);
}

static void
vtblk_stop(struct vtblk_softc *sc)
{

	virtqueue_disable_intr(sc->vtblk_vq);
	virtio_stop(sc->vtblk_dev);
}

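/*
 * Issue a VIRTIO_BLK_T_GET_ID request to read the device's
 * identification string into d_ident, polling for the response
 * with a stack-allocated bio.
 */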
static void
vtblk_get_ident(struct vtblk_softc *sc)
{
	struct bio buf;
	struct disk *dp;
	struct vtblk_request *req;
	int len, error;

	dp = sc->vtblk_disk;
	len = MIN(VIRTIO_BLK_ID_BYTES, DISK_IDENT_SIZE);

	if (vtblk_no_ident != 0)
		return;

	req = vtblk_dequeue_request(sc);
	if (req == NULL)
		return;

	req->vbr_ack = -1;
	req->vbr_hdr.type = VIRTIO_BLK_T_GET_ID;
	req->vbr_hdr.ioprio = 1;
	req->vbr_hdr.sector = 0;

	req->vbr_bp = &buf;
	bzero(&buf, sizeof(struct bio));

	buf.bio_cmd = BIO_READ;
	buf.bio_data = dp->d_ident;
	buf.bio_bcount = len;

	VTBLK_LOCK(sc);
	error = vtblk_poll_request(sc, req);
	VTBLK_UNLOCK(sc);

	vtblk_enqueue_request(sc, req);

	if (error) {
		device_printf(sc->vtblk_dev,
		    "error getting device identifier: %d\n", error);
	}
}

static void
vtblk_prepare_dump(struct vtblk_softc *sc)
{
	device_t dev;
	struct virtqueue *vq;

	dev = sc->vtblk_dev;
	vq = sc->vtblk_vq;

	vtblk_stop(sc);

	/*
	 * Drain all requests caught in-flight in the virtqueue,
	 * skipping biodone(). When dumping, only one request is
	 * outstanding at a time, and we just poll the virtqueue
	 * for the response.
	 */
	vtblk_drain_vq(sc, 1);

	if (virtio_reinit(dev, sc->vtblk_features) != 0)
		panic("cannot reinit VirtIO block device during dump");

	virtqueue_disable_intr(vq);
	virtio_reinit_complete(dev);
}

static int
vtblk_write_dump(struct vtblk_softc *sc, void *virtual, off_t offset,
    size_t length)
{
	struct bio buf;
	struct vtblk_request *req;

	req = &sc->vtblk_dump_request;
	req->vbr_ack = -1;
	req->vbr_hdr.type = VIRTIO_BLK_T_OUT;
	req->vbr_hdr.ioprio = 1;
	req->vbr_hdr.sector = offset / 512;

	req->vbr_bp = &buf;
	bzero(&buf, sizeof(struct bio));

	buf.bio_cmd = BIO_WRITE;
	buf.bio_data = virtual;
	buf.bio_bcount = length;

	return (vtblk_poll_request(sc, req));
}

static int
vtblk_flush_dump(struct vtblk_softc *sc)
{
	struct bio buf;
	struct vtblk_request *req;

	req = &sc->vtblk_dump_request;
	req->vbr_ack = -1;
	req->vbr_hdr.type = VIRTIO_BLK_T_FLUSH;
	req->vbr_hdr.ioprio = 1;
	req->vbr_hdr.sector = 0;

	req->vbr_bp = &buf;
	bzero(&buf, sizeof(struct bio));

	buf.bio_cmd = BIO_FLUSH;

	return (vtblk_poll_request(sc, req));
}

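/*
 * Execute a single request synchronously, busy-waiting in
 * virtqueue_poll() for the host's response. Used for the
 * identification request and while dumping.
 */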
static int
vtblk_poll_request(struct vtblk_softc *sc, struct vtblk_request *req)
{
	struct virtqueue *vq;
	struct vtblk_request *r;
	int error;

	vq = sc->vtblk_vq;

	if (!virtqueue_empty(vq))
		return (EBUSY);

	error = vtblk_execute_request(sc, req);
	if (error)
		return (error);

	virtqueue_notify(vq);

	r = virtqueue_poll(vq, NULL);
	KASSERT(r == req, ("unexpected request response: %p/%p", r, req));

	error = vtblk_request_error(req);
	if (error && bootverbose) {
		device_printf(sc->vtblk_dev,
		    "%s: IO error: %d\n", __FUNCTION__, error);
	}

	return (error);
}

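/*
 * Complete every request the host has finished, returning each one to
 * the free list.
 */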
static void
vtblk_finish_completed(struct vtblk_softc *sc)
{
	struct vtblk_request *req;
	struct bio *bp;
	int error;

	while ((req = virtqueue_dequeue(sc->vtblk_vq, NULL)) != NULL) {
		bp = req->vbr_bp;

		error = vtblk_request_error(req);
		if (error)
			disk_err(bp, "hard error", -1, 1);

		vtblk_finish_bio(bp, error);
		vtblk_enqueue_request(sc, req);
	}
}

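/*
 * Remove any requests still sitting in the virtqueue, optionally
 * skipping bio completion (as when preparing to dump).
 */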
static void
vtblk_drain_vq(struct vtblk_softc *sc, int skip_done)
{
	struct virtqueue *vq;
	struct vtblk_request *req;
	int last;

	vq = sc->vtblk_vq;
	last = 0;

	while ((req = virtqueue_drain(vq, &last)) != NULL) {
		if (!skip_done)
			vtblk_finish_bio(req->vbr_bp, ENXIO);

		vtblk_enqueue_request(sc, req);
	}

	KASSERT(virtqueue_empty(vq), ("virtqueue not empty"));
}

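/*
 * Complete any finished requests, fail everything still pending or
 * queued with ENXIO, and release the preallocated requests. Called
 * on detach and after a failed attach.
 */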
static void
vtblk_drain(struct vtblk_softc *sc)
{
	struct bio_queue_head *bioq;
	struct vtblk_request *req;
	struct bio *bp;

	bioq = &sc->vtblk_bioq;

	if (sc->vtblk_vq != NULL) {
		vtblk_finish_completed(sc);
		vtblk_drain_vq(sc, 0);
	}

	while ((req = vtblk_dequeue_ready(sc)) != NULL) {
		vtblk_finish_bio(req->vbr_bp, ENXIO);
		vtblk_enqueue_request(sc, req);
	}

	while (bioq_first(bioq) != NULL) {
		bp = bioq_takefirst(bioq);
		vtblk_finish_bio(bp, ENXIO);
	}

	vtblk_free_requests(sc);
}

#ifdef INVARIANTS
static void
vtblk_request_invariants(struct vtblk_request *req)
{
	int hdr_nsegs, ack_nsegs;

	hdr_nsegs = sglist_count(&req->vbr_hdr, sizeof(req->vbr_hdr));
	ack_nsegs = sglist_count(&req->vbr_ack, sizeof(req->vbr_ack));

	KASSERT(hdr_nsegs == 1, ("request header crossed page boundary"));
	KASSERT(ack_nsegs == 1, ("request ack crossed page boundary"));
}
#endif

static int
vtblk_alloc_requests(struct vtblk_softc *sc)
{
	struct vtblk_request *req;
	int i, nreqs;

	nreqs = virtqueue_size(sc->vtblk_vq);

	/*
	 * Preallocate sufficient requests to keep the virtqueue full. Each
	 * request consumes VTBLK_MIN_SEGMENTS or more descriptors, so reduce
	 * the number allocated when indirect descriptors are not available.
	 */
	if ((sc->vtblk_flags & VTBLK_FLAG_INDIRECT) == 0)
		nreqs /= VTBLK_MIN_SEGMENTS;

	for (i = 0; i < nreqs; i++) {
		req = malloc(sizeof(struct vtblk_request), M_DEVBUF, M_NOWAIT);
		if (req == NULL)
			return (ENOMEM);

#ifdef INVARIANTS
		vtblk_request_invariants(req);
#endif

		sc->vtblk_request_count++;
		vtblk_enqueue_request(sc, req);
	}

	return (0);
}

static void
vtblk_free_requests(struct vtblk_softc *sc)
{
	struct vtblk_request *req;

	KASSERT(TAILQ_EMPTY(&sc->vtblk_req_ready),
	    ("ready requests left on queue"));

	while ((req = vtblk_dequeue_request(sc)) != NULL) {
		sc->vtblk_request_count--;
		free(req, M_DEVBUF);
	}

	KASSERT(sc->vtblk_request_count == 0,
	    ("leaked requests: %d", sc->vtblk_request_count));
}

static struct vtblk_request *
vtblk_dequeue_request(struct vtblk_softc *sc)
{
	struct vtblk_request *req;

	req = TAILQ_FIRST(&sc->vtblk_req_free);
	if (req != NULL)
		TAILQ_REMOVE(&sc->vtblk_req_free, req, vbr_link);

	return (req);
}

static void
vtblk_enqueue_request(struct vtblk_softc *sc, struct vtblk_request *req)
{

	bzero(req, sizeof(struct vtblk_request));
	TAILQ_INSERT_HEAD(&sc->vtblk_req_free, req, vbr_link);
}

static struct vtblk_request *
vtblk_dequeue_ready(struct vtblk_softc *sc)
{
	struct vtblk_request *req;

	req = TAILQ_FIRST(&sc->vtblk_req_ready);
	if (req != NULL)
		TAILQ_REMOVE(&sc->vtblk_req_ready, req, vbr_link);

	return (req);
}

static void
vtblk_enqueue_ready(struct vtblk_softc *sc, struct vtblk_request *req)
{

	TAILQ_INSERT_HEAD(&sc->vtblk_req_ready, req, vbr_link);
}

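/* Map the status byte written by the host to an errno value. */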
static int
vtblk_request_error(struct vtblk_request *req)
{
	int error;

	switch (req->vbr_ack) {
	case VIRTIO_BLK_S_OK:
		error = 0;
		break;
	case VIRTIO_BLK_S_UNSUPP:
		error = ENOTSUP;
		break;
	default:
		error = EIO;
		break;
	}

	return (error);
}

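/* Complete a bio, recording the error and residual on failure. */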
static void
vtblk_finish_bio(struct bio *bp, int error)
{

	if (error) {
		bp->bio_resid = bp->bio_bcount;
		bp->bio_error = error;
		bp->bio_flags |= BIO_ERROR;
	}

	biodone(bp);
}