/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2015, Nexenta Systems, Inc. All rights reserved.
 * Copyright (c) 2012, Alexey Zaytsev <alexey.zaytsev@gmail.com>
 * Copyright 2019 Joyent Inc.
 * Copyright 2019 Western Digital Corporation.
 */

/*
 * VIRTIO BLOCK DRIVER
 *
 * This driver provides support for Virtio Block devices.  Each driver instance
 * attaches to a single underlying block device.
 *
 * REQUEST CHAIN LAYOUT
 *
 * Every request chain sent to the I/O queue has the following structure.  Each
 * box in the diagram represents a descriptor entry (i.e., a DMA cookie) within
 * the chain:
 *
 *    +-0-----------------------------------------+
 *    | struct virtio_blk_hdr                     |-----------------------\
 *    |   (written by driver, read by device)     |                       |
 *    +-1-----------------------------------------+                       |
 *    | optional data payload                     |--\                    |
 *    |   (written by driver for write requests,  |  |                    |
 *    |    or by device for read requests)        |  |                    |
 *    +-2-----------------------------------------+  |                    |
 *    | ,~`           :                              |-cookies loaned     |
 *    |/              :                        ,~`|  | from blkdev        |
 *                    :                       /   |  |                    |
 *    +-(N - 1)-----------------------------------+  |                    |
 *    | ... end of data payload.                  |  |                    |
 *    |                                           |  |                    |
 *    |                                           |--/                    |
 *    +-N-----------------------------------------+                       |
 *    | status byte                               |                       |
 *    |   (written by device, read by driver)     |--------------------\  |
 *    +-------------------------------------------+                    |  |
 *                                                                     |  |
 * The memory for the header and status bytes (i.e., 0 and N above)    |  |
 * is allocated as a single chunk by vioblk_alloc_reqs():              |  |
 *                                                                     |  |
 *    +-------------------------------------------+                    |  |
 *    | struct virtio_blk_hdr                     |<----------------------/
 *    +-------------------------------------------+                    |
 *    | status byte                               |<-------------------/
 *    +-------------------------------------------+
 */

#include <sys/modctl.h>
#include <sys/blkdev.h>
#include <sys/types.h>
#include <sys/errno.h>
#include <sys/param.h>
#include <sys/stropts.h>
#include <sys/stream.h>
#include <sys/strsubr.h>
#include <sys/kmem.h>
#include <sys/conf.h>
#include <sys/devops.h>
#include <sys/ksynch.h>
#include <sys/stat.h>
#include <sys/debug.h>
#include <sys/pci.h>
#include <sys/containerof.h>
#include <sys/ctype.h>
#include <sys/sysmacros.h>

#include "virtio.h"
#include "vioblk.h"


static void vioblk_get_id(vioblk_t *);
uint_t vioblk_int_handler(caddr_t, caddr_t);
static uint_t vioblk_poll(vioblk_t *);
static int vioblk_quiesce(dev_info_t *);
static int vioblk_attach(dev_info_t *, ddi_attach_cmd_t);
static int vioblk_detach(dev_info_t *, ddi_detach_cmd_t);


static struct dev_ops vioblk_dev_ops = {
	.devo_rev =			DEVO_REV,
	.devo_refcnt =			0,

	.devo_attach =			vioblk_attach,
	.devo_detach =			vioblk_detach,
	.devo_quiesce =			vioblk_quiesce,

	.devo_getinfo =			ddi_no_info,
	.devo_identify =		nulldev,
	.devo_probe =			nulldev,
	.devo_reset =			nodev,
	.devo_cb_ops =			NULL,
	.devo_bus_ops =			NULL,
	.devo_power =			NULL,
};

static struct modldrv vioblk_modldrv = {
	.drv_modops =			&mod_driverops,
	.drv_linkinfo =			"VIRTIO block driver",
	.drv_dev_ops =			&vioblk_dev_ops
};

static struct modlinkage vioblk_modlinkage = {
	.ml_rev =			MODREV_1,
	.ml_linkage =			{ &vioblk_modldrv, NULL }
};

/*
 * DMA attribute template for header and status blocks.  We also make a
 * per-instance copy of this template with negotiated sizes from the device for
 * blkdev.
 */
static const ddi_dma_attr_t vioblk_dma_attr = {
	.dma_attr_version =		DMA_ATTR_V0,
	.dma_attr_addr_lo =		0x0000000000000000,
	.dma_attr_addr_hi =		0xFFFFFFFFFFFFFFFF,
	.dma_attr_count_max =		0x00000000FFFFFFFF,
	.dma_attr_align =		1,
	.dma_attr_burstsizes =		1,
	.dma_attr_minxfer =		1,
	.dma_attr_maxxfer =		0x00000000FFFFFFFF,
	.dma_attr_seg =			0x00000000FFFFFFFF,
	.dma_attr_sgllen =		1,
	.dma_attr_granular =		1,
	.dma_attr_flags =		0
};

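/*
 * Take a request from the pre-allocated pool, or return NULL if all requests
 * are presently in flight.  Requests return to the pool via
 * vioblk_req_free().
 */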
static vioblk_req_t *
vioblk_req_alloc(vioblk_t *vib)
{
	vioblk_req_t *vbr;

	VERIFY(MUTEX_HELD(&vib->vib_mutex));

	if ((vbr = list_remove_head(&vib->vib_reqs)) == NULL) {
		return (NULL);
	}
	vib->vib_nreqs_alloc++;

	VERIFY0(vbr->vbr_status);
	vbr->vbr_status |= VIOBLK_REQSTAT_ALLOCATED;

	VERIFY3P(vbr->vbr_xfer, ==, NULL);
	VERIFY3S(vbr->vbr_error, ==, 0);

	return (vbr);
}

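/*
 * Reset a request to its idle state and return it to the head of the free
 * list.
 */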
static void
vioblk_req_free(vioblk_t *vib, vioblk_req_t *vbr)
{
	VERIFY(MUTEX_HELD(&vib->vib_mutex));

	/*
	 * Check that this request was allocated, then zero the status field to
	 * clear all status bits.
	 */
	VERIFY(vbr->vbr_status & VIOBLK_REQSTAT_ALLOCATED);
	vbr->vbr_status = 0;

	vbr->vbr_xfer = NULL;
	vbr->vbr_error = 0;
	vbr->vbr_type = 0;

	list_insert_head(&vib->vib_reqs, vbr);

	VERIFY3U(vib->vib_nreqs_alloc, >, 0);
	vib->vib_nreqs_alloc--;
}

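/*
 * Mark a request as complete, updating flush statistics and, if it came from
 * the blkdev framework, notifying blkdev of the result.  The instance mutex
 * is dropped around the call to bd_xfer_done().
 */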
static void
vioblk_complete(vioblk_t *vib, vioblk_req_t *vbr)
{
	VERIFY(MUTEX_HELD(&vib->vib_mutex));

	VERIFY(!(vbr->vbr_status & VIOBLK_REQSTAT_COMPLETE));
	vbr->vbr_status |= VIOBLK_REQSTAT_COMPLETE;

	if (vbr->vbr_type == VIRTIO_BLK_T_FLUSH) {
		vib->vib_stats->vbs_rw_cacheflush.value.ui64++;
	}

	if (vbr->vbr_xfer != NULL) {
		/*
		 * This is a blkdev framework request.
		 */
		mutex_exit(&vib->vib_mutex);
		bd_xfer_done(vbr->vbr_xfer, vbr->vbr_error);
		mutex_enter(&vib->vib_mutex);
		vbr->vbr_xfer = NULL;
	}
}

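/*
 * Setup common to all request types: allocate a request and a descriptor
 * chain, then fill in the virtio block header and append it as the first
 * descriptor.  The sector number is converted from native blocks to the
 * 512-byte units the device protocol expects.  Returns NULL on allocation
 * failure.
 */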
static virtio_chain_t *
vioblk_common_start(vioblk_t *vib, int type, uint64_t sector,
    boolean_t polled)
{
	vioblk_req_t *vbr = NULL;
	virtio_chain_t *vic = NULL;

	if ((vbr = vioblk_req_alloc(vib)) == NULL) {
		vib->vib_stats->vbs_rw_outofmemory.value.ui64++;
		return (NULL);
	}
	vbr->vbr_type = type;

	if (polled) {
		/*
		 * Mark this command as polled so that we can wait on it
		 * ourselves.
		 */
		vbr->vbr_status |= VIOBLK_REQSTAT_POLLED;
	}

	if ((vic = virtio_chain_alloc(vib->vib_vq, KM_NOSLEEP)) == NULL) {
		vib->vib_stats->vbs_rw_outofmemory.value.ui64++;
		goto fail;
	}

	struct vioblk_req_hdr vbh;
	vbh.vbh_type = type;
	vbh.vbh_ioprio = 0;
	vbh.vbh_sector = (sector * vib->vib_blk_size) / DEV_BSIZE;
	bcopy(&vbh, virtio_dma_va(vbr->vbr_dma, 0), sizeof (vbh));

	virtio_chain_data_set(vic, vbr);

	/*
	 * Put the header in the first descriptor.  See the block comment at
	 * the top of the file for more details on the chain layout.
	 */
	if (virtio_chain_append(vic, virtio_dma_cookie_pa(vbr->vbr_dma, 0),
	    sizeof (struct vioblk_req_hdr), VIRTIO_DIR_DEVICE_READS) !=
	    DDI_SUCCESS) {
		goto fail;
	}

	return (vic);

fail:
	vbr->vbr_xfer = NULL;
	vioblk_req_free(vib, vbr);
	if (vic != NULL) {
		virtio_chain_free(vic);
	}
	return (NULL);
}

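/*
 * Append the final (status byte) descriptor and submit the chain to the
 * device.  For polled requests we then wait here until vioblk_poll() marks
 * the request complete; otherwise completion is handled later from the
 * interrupt path.
 */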
static int
vioblk_common_submit(vioblk_t *vib, virtio_chain_t *vic)
{
	int r;
	vioblk_req_t *vbr = virtio_chain_data(vic);

	VERIFY(MUTEX_HELD(&vib->vib_mutex));

	/*
	 * The device will write the status byte into this last descriptor.
	 * See the block comment at the top of the file for more details on the
	 * chain layout.
	 */
	if (virtio_chain_append(vic, virtio_dma_cookie_pa(vbr->vbr_dma, 0) +
	    sizeof (struct vioblk_req_hdr), sizeof (uint8_t),
	    VIRTIO_DIR_DEVICE_WRITES) != DDI_SUCCESS) {
		r = ENOMEM;
		goto out;
	}

	virtio_dma_sync(vbr->vbr_dma, DDI_DMA_SYNC_FORDEV);
	virtio_chain_submit(vic, B_TRUE);

	if (!(vbr->vbr_status & VIOBLK_REQSTAT_POLLED)) {
		/*
		 * This is not a polled request.  Our request will be freed and
		 * the caller notified later in vioblk_poll().
		 */
		return (0);
	}

	/*
	 * This is a polled request.  We need to block here and wait for the
	 * device to complete request processing.
	 */
	while (!(vbr->vbr_status & VIOBLK_REQSTAT_POLL_COMPLETE)) {
		if (ddi_in_panic()) {
			/*
			 * When panicking, interrupts are disabled.  We must
			 * poll the queue manually.
			 */
			drv_usecwait(10);
			(void) vioblk_poll(vib);
			continue;
		}

		/*
		 * When not panicking, the device will interrupt on command
		 * completion and vioblk_poll() will be called to wake us up.
		 */
		cv_wait(&vib->vib_cv, &vib->vib_mutex);
	}

	vioblk_complete(vib, vbr);
	r = vbr->vbr_error;

out:
	vioblk_req_free(vib, vbr);
	virtio_chain_free(vic);
	return (r);
}

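/*
 * Issue a driver-internal command (e.g., VIRTIO_BLK_T_GET_ID) and wait for
 * the device to complete it.  Any payload described by "dma" is appended
 * between the header and status descriptors.
 */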
static int
vioblk_internal(vioblk_t *vib, int type, virtio_dma_t *dma,
    uint64_t sector, virtio_direction_t dir)
{
	virtio_chain_t *vic;
	vioblk_req_t *vbr;
	int r;

	VERIFY(MUTEX_HELD(&vib->vib_mutex));

	/*
	 * Allocate a polled request.
	 */
	if ((vic = vioblk_common_start(vib, type, sector, B_TRUE)) == NULL) {
		return (ENOMEM);
	}
	vbr = virtio_chain_data(vic);

	/*
	 * If there is a request payload, it goes between the header and the
	 * status byte.  See the block comment at the top of the file for more
	 * detail on the chain layout.
	 */
	if (dma != NULL) {
		for (uint_t n = 0; n < virtio_dma_ncookies(dma); n++) {
			if (virtio_chain_append(vic,
			    virtio_dma_cookie_pa(dma, n),
			    virtio_dma_cookie_size(dma, n), dir) !=
			    DDI_SUCCESS) {
				r = ENOMEM;
				goto out;
			}
		}
	}

	return (vioblk_common_submit(vib, vic));

out:
	vioblk_req_free(vib, vbr);
	virtio_chain_free(vic);
	return (r);
}

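/*
 * Issue a request on behalf of the blkdev framework.  The payload, if any,
 * arrives as DMA cookies in the bd_xfer_t and is appended to the chain
 * between the header and status descriptors.
 */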
static int
vioblk_request(vioblk_t *vib, bd_xfer_t *xfer, int type)
{
	virtio_chain_t *vic = NULL;
	vioblk_req_t *vbr = NULL;
	uint_t total_cookies = 2;
	boolean_t polled = (xfer->x_flags & BD_XFER_POLL) != 0;
	int r;

	VERIFY(MUTEX_HELD(&vib->vib_mutex));

	/*
	 * Ensure that this request falls within the advertised size of the
	 * block device.  Be careful to avoid overflow.
	 */
	if (xfer->x_nblks > SIZE_MAX - xfer->x_blkno ||
	    (xfer->x_blkno + xfer->x_nblks) > vib->vib_nblks) {
		vib->vib_stats->vbs_rw_badoffset.value.ui64++;
		return (EINVAL);
	}

	if ((vic = vioblk_common_start(vib, type, xfer->x_blkno, polled)) ==
	    NULL) {
		return (ENOMEM);
	}
	vbr = virtio_chain_data(vic);
	vbr->vbr_xfer = xfer;

	/*
	 * If there is a request payload, it goes between the header and the
	 * status byte.  See the block comment at the top of the file for more
	 * detail on the chain layout.
	 */
	if ((type == VIRTIO_BLK_T_IN || type == VIRTIO_BLK_T_OUT) &&
	    xfer->x_nblks > 0) {
		virtio_direction_t dir = (type == VIRTIO_BLK_T_OUT) ?
		    VIRTIO_DIR_DEVICE_READS : VIRTIO_DIR_DEVICE_WRITES;

		for (uint_t n = 0; n < xfer->x_ndmac; n++) {
			ddi_dma_cookie_t dmac;

			if (n == 0) {
				/*
				 * The first cookie is in the blkdev request.
				 */
				dmac = xfer->x_dmac;
			} else {
				ddi_dma_nextcookie(xfer->x_dmah, &dmac);
			}

			if (virtio_chain_append(vic, dmac.dmac_laddress,
			    dmac.dmac_size, dir) != DDI_SUCCESS) {
				r = ENOMEM;
				goto fail;
			}
		}

		total_cookies += xfer->x_ndmac;

	} else if (xfer->x_nblks > 0) {
		dev_err(vib->vib_dip, CE_PANIC,
		    "request of type %d had payload length of %lu blocks",
		    type, xfer->x_nblks);
	}

	if (vib->vib_stats->vbs_rw_cookiesmax.value.ui32 < total_cookies) {
		vib->vib_stats->vbs_rw_cookiesmax.value.ui32 = total_cookies;
	}

	return (vioblk_common_submit(vib, vic));

fail:
	vbr->vbr_xfer = NULL;
	vioblk_req_free(vib, vbr);
	virtio_chain_free(vic);
	return (r);
}

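/*
 * The vioblk_bd_*() functions below are the entry points we install in the
 * blkdev ops vector at attach time.  The read, write, and flush entry points
 * enter the instance mutex around request submission.
 */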
static int
vioblk_bd_read(void *arg, bd_xfer_t *xfer)
{
	vioblk_t *vib = arg;
	int r;

	mutex_enter(&vib->vib_mutex);
	r = vioblk_request(vib, xfer, VIRTIO_BLK_T_IN);
	mutex_exit(&vib->vib_mutex);

	return (r);
}

static int
vioblk_bd_write(void *arg, bd_xfer_t *xfer)
{
	vioblk_t *vib = arg;
	int r;

	mutex_enter(&vib->vib_mutex);
	r = vioblk_request(vib, xfer, VIRTIO_BLK_T_OUT);
	mutex_exit(&vib->vib_mutex);

	return (r);
}

static int
vioblk_bd_flush(void *arg, bd_xfer_t *xfer)
{
	vioblk_t *vib = arg;
	int r;

	mutex_enter(&vib->vib_mutex);
	if (!virtio_feature_present(vib->vib_virtio, VIRTIO_BLK_F_FLUSH)) {
		/*
		 * We don't really expect to get here, because if we did not
		 * negotiate the flush feature we would not have installed this
		 * function in the blkdev ops vector.
		 */
		mutex_exit(&vib->vib_mutex);
		return (ENOTSUP);
	}

	r = vioblk_request(vib, xfer, VIRTIO_BLK_T_FLUSH);
	mutex_exit(&vib->vib_mutex);

	return (r);
}

static void
vioblk_bd_driveinfo(void *arg, bd_drive_t *drive)
{
	vioblk_t *vib = arg;

	drive->d_qsize = vib->vib_reqs_capacity;
	drive->d_removable = B_FALSE;
	drive->d_hotpluggable = B_TRUE;
	drive->d_target = 0;
	drive->d_lun = 0;

	drive->d_vendor = "Virtio";
	drive->d_vendor_len = strlen(drive->d_vendor);

	drive->d_product = "Block Device";
	drive->d_product_len = strlen(drive->d_product);

	drive->d_serial = vib->vib_devid;
	drive->d_serial_len = strlen(drive->d_serial);

	drive->d_revision = "0000";
	drive->d_revision_len = strlen(drive->d_revision);
}

static int
vioblk_bd_mediainfo(void *arg, bd_media_t *media)
{
	vioblk_t *vib = (void *)arg;

	/*
	 * The device protocol is specified in terms of 512 byte logical
	 * blocks, regardless of the recommended I/O size which might be
	 * larger.
	 */
	media->m_nblks = vib->vib_nblks;
	media->m_blksize = vib->vib_blk_size;

	media->m_readonly = vib->vib_readonly;
	media->m_pblksize = vib->vib_pblk_size;
	return (0);
}

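/*
 * Fetch the device ID string using a VIRTIO_BLK_T_GET_ID command.  The
 * sanitized result is reported to blkdev as the drive serial number.
 */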
static void
vioblk_get_id(vioblk_t *vib)
{
	virtio_dma_t *dma;
	int r;

	if ((dma = virtio_dma_alloc(vib->vib_virtio, VIRTIO_BLK_ID_BYTES,
	    &vioblk_dma_attr, DDI_DMA_CONSISTENT | DDI_DMA_READ,
	    KM_SLEEP)) == NULL) {
		return;
	}

	mutex_enter(&vib->vib_mutex);
	if ((r = vioblk_internal(vib, VIRTIO_BLK_T_GET_ID, dma, 0,
	    VIRTIO_DIR_DEVICE_WRITES)) == 0) {
		const char *b = virtio_dma_va(dma, 0);
		uint_t pos = 0;

		/*
		 * Save the entire response for debugging purposes.
		 */
		bcopy(virtio_dma_va(dma, 0), vib->vib_rawid,
		    VIRTIO_BLK_ID_BYTES);

		/*
		 * Process the returned ID.
		 */
		bzero(vib->vib_devid, sizeof (vib->vib_devid));
		for (uint_t n = 0; n < VIRTIO_BLK_ID_BYTES; n++) {
			if (isalnum(b[n]) || b[n] == '-' || b[n] == '_') {
				/*
				 * Accept a subset of printable ASCII
				 * characters.
				 */
				vib->vib_devid[pos++] = b[n];
			} else {
				/*
				 * Stop processing at the first sign of
				 * trouble.
				 */
				break;
			}
		}

		vib->vib_devid_fetched = B_TRUE;
	}
	mutex_exit(&vib->vib_mutex);

	virtio_dma_free(dma);
}

static int
vioblk_bd_devid(void *arg, dev_info_t *dip, ddi_devid_t *devid)
{
	vioblk_t *vib = arg;
	size_t len;

	if ((len = strlen(vib->vib_devid)) == 0) {
		/*
		 * The device has no ID.
		 */
		return (DDI_FAILURE);
	}

	return (ddi_devid_init(dip, DEVID_ATA_SERIAL, len, vib->vib_devid,
	    devid));
}

/*
 * As the device completes processing of a request, it returns the chain for
 * that request to our I/O queue.  This routine is called in two contexts:
 *   - from the interrupt handler, in response to notification from the device
 *   - synchronously in line with request processing when panicking
 */
static uint_t
vioblk_poll(vioblk_t *vib)
{
	virtio_chain_t *vic;
	uint_t count = 0;
	boolean_t wakeup = B_FALSE;

	VERIFY(MUTEX_HELD(&vib->vib_mutex));

	while ((vic = virtio_queue_poll(vib->vib_vq)) != NULL) {
		vioblk_req_t *vbr = virtio_chain_data(vic);
		uint8_t status;

		virtio_dma_sync(vbr->vbr_dma, DDI_DMA_SYNC_FORCPU);

		bcopy(virtio_dma_va(vbr->vbr_dma,
		    sizeof (struct vioblk_req_hdr)), &status, sizeof (status));

		switch (status) {
		case VIRTIO_BLK_S_OK:
			vbr->vbr_error = 0;
			break;
		case VIRTIO_BLK_S_IOERR:
			vbr->vbr_error = EIO;
			vib->vib_stats->vbs_io_errors.value.ui64++;
			break;
		case VIRTIO_BLK_S_UNSUPP:
			vbr->vbr_error = ENOTTY;
			vib->vib_stats->vbs_unsupp_errors.value.ui64++;
			break;
		default:
			vbr->vbr_error = ENXIO;
			vib->vib_stats->vbs_nxio_errors.value.ui64++;
			break;
		}

		count++;

		if (vbr->vbr_status & VIOBLK_REQSTAT_POLLED) {
			/*
			 * This request must not be freed as it is being held
			 * by a call to vioblk_common_submit().
			 */
			VERIFY(!(vbr->vbr_status &
			    VIOBLK_REQSTAT_POLL_COMPLETE));
			vbr->vbr_status |= VIOBLK_REQSTAT_POLL_COMPLETE;
			wakeup = B_TRUE;
			continue;
		}

		vioblk_complete(vib, vbr);

		vioblk_req_free(vib, vbr);
		virtio_chain_free(vic);
	}

	if (wakeup) {
		/*
		 * Signal anybody waiting for polled command completion.
		 */
		cv_broadcast(&vib->vib_cv);
	}

	return (count);
}

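/*
 * Interrupt handler for the I/O queue, registered with the Virtio framework
 * in vioblk_attach().  All completion processing happens in vioblk_poll().
 */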
uint_t
vioblk_int_handler(caddr_t arg0, caddr_t arg1)
{
	vioblk_t *vib = (vioblk_t *)arg0;
	uint_t count;

	mutex_enter(&vib->vib_mutex);
	if ((count = vioblk_poll(vib)) >
	    vib->vib_stats->vbs_intr_queuemax.value.ui32) {
		vib->vib_stats->vbs_intr_queuemax.value.ui32 = count;
	}

	vib->vib_stats->vbs_intr_total.value.ui64++;
	mutex_exit(&vib->vib_mutex);

	return (DDI_INTR_CLAIMED);
}

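/*
 * Free the request pool and the DMA memory backing the header and status
 * blocks.  All requests must already have been returned to the free list.
 */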
static void
vioblk_free_reqs(vioblk_t *vib)
{
	VERIFY3U(vib->vib_nreqs_alloc, ==, 0);

	for (uint_t i = 0; i < vib->vib_reqs_capacity; i++) {
		struct vioblk_req *vbr = &vib->vib_reqs_mem[i];

		VERIFY(list_link_active(&vbr->vbr_link));
		list_remove(&vib->vib_reqs, vbr);

		VERIFY0(vbr->vbr_status);

		if (vbr->vbr_dma != NULL) {
			virtio_dma_free(vbr->vbr_dma);
			vbr->vbr_dma = NULL;
		}
	}
	VERIFY(list_is_empty(&vib->vib_reqs));

	if (vib->vib_reqs_mem != NULL) {
		kmem_free(vib->vib_reqs_mem,
		    sizeof (struct vioblk_req) * vib->vib_reqs_capacity);
		vib->vib_reqs_mem = NULL;
		vib->vib_reqs_capacity = 0;
	}
}

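/*
 * Pre-allocate a fixed pool of requests, sized to the smaller of the I/O
 * queue size and VIRTIO_BLK_REQ_BUFS, and give each request the DMA memory
 * for its header and status blocks (see the layout diagram at the top of the
 * file).
 */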
static int
vioblk_alloc_reqs(vioblk_t *vib)
{
	vib->vib_reqs_capacity = MIN(virtio_queue_size(vib->vib_vq),
	    VIRTIO_BLK_REQ_BUFS);
	vib->vib_reqs_mem = kmem_zalloc(
	    sizeof (struct vioblk_req) * vib->vib_reqs_capacity, KM_SLEEP);
	vib->vib_nreqs_alloc = 0;

	for (uint_t i = 0; i < vib->vib_reqs_capacity; i++) {
		list_insert_tail(&vib->vib_reqs, &vib->vib_reqs_mem[i]);
	}

	for (vioblk_req_t *vbr = list_head(&vib->vib_reqs); vbr != NULL;
	    vbr = list_next(&vib->vib_reqs, vbr)) {
		if ((vbr->vbr_dma = virtio_dma_alloc(vib->vib_virtio,
		    sizeof (struct vioblk_req_hdr) + sizeof (uint8_t),
		    &vioblk_dma_attr, DDI_DMA_RDWR | DDI_DMA_CONSISTENT,
		    KM_SLEEP)) == NULL) {
			goto fail;
		}
	}

	return (0);

fail:
	vioblk_free_reqs(vib);
	return (ENOMEM);
}

static int
vioblk_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
{
	int instance = ddi_get_instance(dip);
	vioblk_t *vib;
	virtio_t *vio;
	boolean_t did_mutex = B_FALSE;

	if (cmd != DDI_ATTACH) {
		return (DDI_FAILURE);
	}

	if ((vio = virtio_init(dip, VIRTIO_BLK_WANTED_FEATURES, B_TRUE)) ==
	    NULL) {
		dev_err(dip, CE_WARN, "failed to start Virtio init");
		return (DDI_FAILURE);
	}

	vib = kmem_zalloc(sizeof (*vib), KM_SLEEP);
	vib->vib_dip = dip;
	vib->vib_virtio = vio;
	ddi_set_driver_private(dip, vib);
	list_create(&vib->vib_reqs, sizeof (vioblk_req_t),
	    offsetof(vioblk_req_t, vbr_link));

	/*
	 * Determine how many scatter-gather entries we can use in a single
	 * request.
	 */
	vib->vib_seg_max = VIRTIO_BLK_DEFAULT_MAX_SEG;
	if (virtio_feature_present(vio, VIRTIO_BLK_F_SEG_MAX)) {
		vib->vib_seg_max = virtio_dev_get32(vio,
		    VIRTIO_BLK_CONFIG_SEG_MAX);

		if (vib->vib_seg_max == 0 || vib->vib_seg_max == PCI_EINVAL32) {
			/*
			 * We need to be able to use at least one data segment,
			 * so we'll assume that this device is just poorly
			 * implemented and try for one.
			 */
			vib->vib_seg_max = 1;
		}
	}

	/*
	 * When allocating the request queue, we include two additional
	 * descriptors (beyond those required for request data) to account for
	 * the header and the status byte.
	 */
	if ((vib->vib_vq = virtio_queue_alloc(vio, VIRTIO_BLK_VIRTQ_IO, "io",
	    vioblk_int_handler, vib, B_FALSE, vib->vib_seg_max + 2)) == NULL) {
		goto fail;
	}

	if (virtio_init_complete(vio, 0) != DDI_SUCCESS) {
		dev_err(dip, CE_WARN, "failed to complete Virtio init");
		goto fail;
	}

	cv_init(&vib->vib_cv, NULL, CV_DRIVER, NULL);
	mutex_init(&vib->vib_mutex, NULL, MUTEX_DRIVER, virtio_intr_pri(vio));
	did_mutex = B_TRUE;

	if ((vib->vib_kstat = kstat_create("vioblk", instance,
	    "statistics", "controller", KSTAT_TYPE_NAMED,
	    sizeof (struct vioblk_stats) / sizeof (kstat_named_t),
	    KSTAT_FLAG_PERSISTENT)) == NULL) {
		dev_err(dip, CE_WARN, "kstat_create failed");
		goto fail;
	}
	vib->vib_stats = (vioblk_stats_t *)vib->vib_kstat->ks_data;
	kstat_named_init(&vib->vib_stats->vbs_rw_outofmemory,
	    "total_rw_outofmemory", KSTAT_DATA_UINT64);
	kstat_named_init(&vib->vib_stats->vbs_rw_badoffset,
	    "total_rw_badoffset", KSTAT_DATA_UINT64);
	kstat_named_init(&vib->vib_stats->vbs_intr_total,
	    "total_intr", KSTAT_DATA_UINT64);
	kstat_named_init(&vib->vib_stats->vbs_io_errors,
	    "total_io_errors", KSTAT_DATA_UINT64);
	kstat_named_init(&vib->vib_stats->vbs_unsupp_errors,
	    "total_unsupp_errors", KSTAT_DATA_UINT64);
	kstat_named_init(&vib->vib_stats->vbs_nxio_errors,
	    "total_nxio_errors", KSTAT_DATA_UINT64);
	kstat_named_init(&vib->vib_stats->vbs_rw_cacheflush,
	    "total_rw_cacheflush", KSTAT_DATA_UINT64);
	kstat_named_init(&vib->vib_stats->vbs_rw_cookiesmax,
	    "max_rw_cookies", KSTAT_DATA_UINT32);
	kstat_named_init(&vib->vib_stats->vbs_intr_queuemax,
	    "max_intr_queue", KSTAT_DATA_UINT32);
	kstat_install(vib->vib_kstat);

	vib->vib_readonly = virtio_feature_present(vio, VIRTIO_BLK_F_RO);
	if ((vib->vib_nblks = virtio_dev_get64(vio,
	    VIRTIO_BLK_CONFIG_CAPACITY)) == UINT64_MAX) {
		dev_err(dip, CE_WARN, "invalid capacity");
		goto fail;
	}

	/*
	 * Determine the optimal logical block size recommended by the device.
	 * This size is advisory; the protocol always deals in 512 byte blocks.
	 */
	vib->vib_blk_size = DEV_BSIZE;
	if (virtio_feature_present(vio, VIRTIO_BLK_F_BLK_SIZE)) {
		uint32_t v = virtio_dev_get32(vio, VIRTIO_BLK_CONFIG_BLK_SIZE);

		if (v != 0 && v != PCI_EINVAL32) {
			vib->vib_blk_size = v;
		}
	}

	/*
	 * Device capacity is always reported in 512-byte units; convert it to
	 * native blocks.
	 */
	vib->vib_nblks = (vib->vib_nblks * DEV_BSIZE) / vib->vib_blk_size;

	/*
	 * The device may also provide an advisory physical block size.
	 */
	vib->vib_pblk_size = vib->vib_blk_size;
	if (virtio_feature_present(vio, VIRTIO_BLK_F_TOPOLOGY)) {
		uint8_t v = virtio_dev_get8(vio, VIRTIO_BLK_CONFIG_TOPO_PBEXP);

		if (v != PCI_EINVAL8) {
			vib->vib_pblk_size <<= v;
		}
	}

	/*
	 * The maximum size for a cookie in a request.
	 */
	vib->vib_seg_size_max = VIRTIO_BLK_DEFAULT_MAX_SIZE;
	if (virtio_feature_present(vio, VIRTIO_BLK_F_SIZE_MAX)) {
		uint32_t v = virtio_dev_get32(vio, VIRTIO_BLK_CONFIG_SIZE_MAX);

		if (v != 0 && v != PCI_EINVAL32) {
			vib->vib_seg_size_max = v;
		}
	}

	/*
	 * Set up the DMA attributes for blkdev to use for request data.  The
	 * specification is not entirely clear about whether DMA-related
	 * parameters include or exclude the header and status descriptors.
	 * For now, we assume they cover only the request data and not the
	 * headers.
	 */
	vib->vib_bd_dma_attr = vioblk_dma_attr;
	vib->vib_bd_dma_attr.dma_attr_sgllen = vib->vib_seg_max;
	vib->vib_bd_dma_attr.dma_attr_count_max = vib->vib_seg_size_max;
	vib->vib_bd_dma_attr.dma_attr_maxxfer = vib->vib_seg_max *
	    vib->vib_seg_size_max;

	if (vioblk_alloc_reqs(vib) != 0) {
		goto fail;
	}

	/*
	 * The blkdev framework does not provide a way to specify that the
	 * device does not support write cache flushing, except by omitting the
	 * "o_sync_cache" member from the ops vector.  As "bd_alloc_handle()"
	 * makes a copy of the ops vector, we can safely assemble one on the
	 * stack based on negotiated features.
	 */
	bd_ops_t vioblk_bd_ops = {
		.o_version =		BD_OPS_CURRENT_VERSION,
		.o_drive_info =		vioblk_bd_driveinfo,
		.o_media_info =		vioblk_bd_mediainfo,
		.o_devid_init =		vioblk_bd_devid,
		.o_sync_cache =		vioblk_bd_flush,
		.o_read =		vioblk_bd_read,
		.o_write =		vioblk_bd_write,
	};
	if (!virtio_feature_present(vio, VIRTIO_BLK_F_FLUSH)) {
		vioblk_bd_ops.o_sync_cache = NULL;
	}

	vib->vib_bd_h = bd_alloc_handle(vib, &vioblk_bd_ops,
	    &vib->vib_bd_dma_attr, KM_SLEEP);

	/*
	 * Enable interrupts now so that we can request the device identity.
	 */
	if (virtio_interrupts_enable(vio) != DDI_SUCCESS) {
		goto fail;
	}

	vioblk_get_id(vib);

	if (bd_attach_handle(dip, vib->vib_bd_h) != DDI_SUCCESS) {
		dev_err(dip, CE_WARN, "Failed to attach blkdev");
		goto fail;
	}

	return (DDI_SUCCESS);

fail:
	if (vib->vib_bd_h != NULL) {
		(void) bd_detach_handle(vib->vib_bd_h);
		bd_free_handle(vib->vib_bd_h);
	}
	if (vio != NULL) {
		(void) virtio_fini(vio, B_TRUE);
	}
	if (did_mutex) {
		mutex_destroy(&vib->vib_mutex);
		cv_destroy(&vib->vib_cv);
	}
	if (vib->vib_kstat != NULL) {
		kstat_delete(vib->vib_kstat);
	}
	vioblk_free_reqs(vib);
	kmem_free(vib, sizeof (*vib));
	return (DDI_FAILURE);
}

static int
vioblk_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
{
	vioblk_t *vib = ddi_get_driver_private(dip);

	if (cmd != DDI_DETACH) {
		return (DDI_FAILURE);
	}

	mutex_enter(&vib->vib_mutex);
	if (vib->vib_nreqs_alloc > 0) {
		/*
		 * Cannot detach while there are still outstanding requests.
		 */
		mutex_exit(&vib->vib_mutex);
		return (DDI_FAILURE);
	}

	if (bd_detach_handle(vib->vib_bd_h) != DDI_SUCCESS) {
		mutex_exit(&vib->vib_mutex);
		return (DDI_FAILURE);
	}

	/*
	 * Tear down the Virtio framework before freeing the rest of the
	 * resources.  This will ensure the interrupt handlers are no longer
	 * running.
	 */
	virtio_fini(vib->vib_virtio, B_FALSE);

	vioblk_free_reqs(vib);
	kstat_delete(vib->vib_kstat);

	mutex_exit(&vib->vib_mutex);
	mutex_destroy(&vib->vib_mutex);

	kmem_free(vib, sizeof (*vib));

	return (DDI_SUCCESS);
}

static int
vioblk_quiesce(dev_info_t *dip)
{
	vioblk_t *vib;

	if ((vib = ddi_get_driver_private(dip)) == NULL) {
		return (DDI_FAILURE);
	}

	return (virtio_quiesce(vib->vib_virtio));
}

int
_init(void)
{
	int rv;

	bd_mod_init(&vioblk_dev_ops);

	if ((rv = mod_install(&vioblk_modlinkage)) != 0) {
		bd_mod_fini(&vioblk_dev_ops);
	}

	return (rv);
}

int
_fini(void)
{
	int rv;

	if ((rv = mod_remove(&vioblk_modlinkage)) == 0) {
		bd_mod_fini(&vioblk_dev_ops);
	}

	return (rv);
}

int
_info(struct modinfo *modinfop)
{
	return (mod_info(&vioblk_modlinkage, modinfop));
}