xref: /illumos-gate/usr/src/uts/common/io/vioblk/vioblk.c (revision dd72704bd9e794056c558153663c739e2012d721)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright (c) 2015, Nexenta Systems, Inc. All rights reserved.
24  * Copyright (c) 2012, Alexey Zaytsev <alexey.zaytsev@gmail.com>
25  * Copyright 2020 Joyent Inc.
26  * Copyright 2019 Western Digital Corporation.
27  * Copyright 2020 Oxide Computer Company
28  * Copyright 2022 OmniOS Community Edition (OmniOSce) Association.
29  */
30 
31 /*
32  * VIRTIO BLOCK DRIVER
33  *
34  * This driver provides support for Virtio Block devices.  Each driver instance
35  * attaches to a single underlying block device.
36  *
37  * REQUEST CHAIN LAYOUT
38  *
39  * Every request chain sent to the I/O queue has the following structure.  Each
40  * box in the diagram represents a descriptor entry (i.e., a DMA cookie) within
41  * the chain:
42  *
43  *    +-0-----------------------------------------+
44  *    | struct virtio_blk_hdr                     |-----------------------\
45  *    |   (written by driver, read by device)     |                       |
46  *    +-1-----------------------------------------+                       |
47  *    | optional data payload                     |--\                    |
48  *    |   (written by driver for write requests,  |  |                    |
49  *    |    or by device for read requests)        |  |                    |
50  *    +-2-----------------------------------------+  |                    |
51  *    | ,~`           :                              |-cookies loaned     |
52  *    |/              :                        ,~`|  | from blkdev        |
53  *                    :                       /   |  |                    |
54  *    +-(N - 1)-----------------------------------+  |                    |
55  *    | ... end of data payload.                  |  |                    |
56  *    |                                           |  |                    |
57  *    |                                           |--/                    |
58  *    +-N-----------------------------------------+                       |
59  *    | status byte                               |                       |
60  *    |   (written by device, read by driver)     |--------------------\  |
61  *    +-------------------------------------------+                    |  |
62  *                                                                     |  |
63  * The memory for the header and status bytes (i.e., 0 and N above)    |  |
64  * is allocated as a single chunk by vioblk_alloc_reqs():              |  |
65  *                                                                     |  |
66  *    +-------------------------------------------+                    |  |
67  *    | struct virtio_blk_hdr                     |<----------------------/
68  *    +-------------------------------------------+                    |
69  *    | status byte                               |<-------------------/
70  *    +-------------------------------------------+
71  */
72 
73 #include <sys/modctl.h>
74 #include <sys/blkdev.h>
75 #include <sys/types.h>
76 #include <sys/errno.h>
77 #include <sys/param.h>
78 #include <sys/stropts.h>
79 #include <sys/stream.h>
80 #include <sys/strsubr.h>
81 #include <sys/kmem.h>
82 #include <sys/conf.h>
83 #include <sys/devops.h>
84 #include <sys/ksynch.h>
85 #include <sys/stat.h>
86 #include <sys/modctl.h>
87 #include <sys/debug.h>
88 #include <sys/pci.h>
89 #include <sys/containerof.h>
90 #include <sys/ctype.h>
91 #include <sys/sysmacros.h>
92 #include <sys/dkioc_free_util.h>
93 
94 #include "virtio.h"
95 #include "vioblk.h"
96 
97 static void vioblk_get_id(vioblk_t *);
98 static uint_t vioblk_int_handler(caddr_t, caddr_t);
99 static uint_t vioblk_poll(vioblk_t *);
100 static int vioblk_quiesce(dev_info_t *);
101 static int vioblk_read_capacity(vioblk_t *);
102 static int vioblk_attach(dev_info_t *, ddi_attach_cmd_t);
103 static int vioblk_detach(dev_info_t *, ddi_detach_cmd_t);
104 
105 
106 static struct dev_ops vioblk_dev_ops = {
107 	.devo_rev =			DEVO_REV,
108 	.devo_refcnt =			0,
109 
110 	.devo_attach =			vioblk_attach,
111 	.devo_detach =			vioblk_detach,
112 	.devo_quiesce =			vioblk_quiesce,
113 
114 	.devo_getinfo =			ddi_no_info,
115 	.devo_identify =		nulldev,
116 	.devo_probe =			nulldev,
117 	.devo_reset =			nodev,
118 	.devo_cb_ops =			NULL,
119 	.devo_bus_ops =			NULL,
120 	.devo_power =			NULL,
121 };
122 
123 static struct modldrv vioblk_modldrv = {
124 	.drv_modops =			&mod_driverops,
125 	.drv_linkinfo =			"VIRTIO block driver",
126 	.drv_dev_ops =			&vioblk_dev_ops
127 };
128 
129 static struct modlinkage vioblk_modlinkage = {
130 	.ml_rev =			MODREV_1,
131 	.ml_linkage =			{ &vioblk_modldrv, NULL }
132 };
133 
134 /*
135  * DMA attribute template for header and status blocks.  We also make a
136  * per-instance copy of this template with negotiated sizes from the device for
137  * blkdev.
138  */
139 static const ddi_dma_attr_t vioblk_dma_attr = {
140 	.dma_attr_version =		DMA_ATTR_V0,
141 	.dma_attr_addr_lo =		0x0000000000000000,
142 	.dma_attr_addr_hi =		0xFFFFFFFFFFFFFFFF,
143 	.dma_attr_count_max =		0x00000000FFFFFFFF,
144 	.dma_attr_align =		1,
145 	.dma_attr_burstsizes =		1,
146 	.dma_attr_minxfer =		1,
147 	.dma_attr_maxxfer =		0x00000000FFFFFFFF,
148 	.dma_attr_seg =			0x00000000FFFFFFFF,
149 	.dma_attr_sgllen =		1,
150 	.dma_attr_granular =		1,
151 	.dma_attr_flags =		0
152 };
153 
154 static vioblk_req_t *
155 vioblk_req_alloc(vioblk_t *vib)
156 {
157 	vioblk_req_t *vbr;
158 
159 	VERIFY(MUTEX_HELD(&vib->vib_mutex));
160 
161 	if ((vbr = list_remove_head(&vib->vib_reqs)) == NULL) {
162 		return (NULL);
163 	}
164 	vib->vib_nreqs_alloc++;
165 
166 	VERIFY0(vbr->vbr_status);
167 	vbr->vbr_status |= VIOBLK_REQSTAT_ALLOCATED;
168 
169 	VERIFY3P(vbr->vbr_chain, !=, NULL);
170 	VERIFY3P(vbr->vbr_xfer, ==, NULL);
171 	VERIFY3S(vbr->vbr_error, ==, 0);
172 
173 	return (vbr);
174 }
175 
176 static void
177 vioblk_req_free(vioblk_t *vib, vioblk_req_t *vbr)
178 {
179 	VERIFY(MUTEX_HELD(&vib->vib_mutex));
180 
181 	/*
182 	 * Check that this request was allocated, then zero the status field to
183 	 * clear all status bits.
184 	 */
185 	VERIFY(vbr->vbr_status & VIOBLK_REQSTAT_ALLOCATED);
186 	vbr->vbr_status = 0;
187 
188 	vbr->vbr_xfer = NULL;
189 	vbr->vbr_error = 0;
190 	vbr->vbr_type = 0;
191 	virtio_chain_clear(vbr->vbr_chain);
192 
193 	list_insert_head(&vib->vib_reqs, vbr);
194 
195 	VERIFY3U(vib->vib_nreqs_alloc, >, 0);
196 	vib->vib_nreqs_alloc--;
197 }
198 
199 static void
200 vioblk_complete(vioblk_t *vib, vioblk_req_t *vbr)
201 {
202 	VERIFY(MUTEX_HELD(&vib->vib_mutex));
203 
204 	VERIFY(!(vbr->vbr_status & VIOBLK_REQSTAT_COMPLETE));
205 	vbr->vbr_status |= VIOBLK_REQSTAT_COMPLETE;
206 
207 	if (vbr->vbr_type == VIRTIO_BLK_T_FLUSH) {
208 		vib->vib_stats->vbs_rw_cacheflush.value.ui64++;
209 	}
210 
211 	if (vbr->vbr_xfer != NULL) {
212 		/*
213 		 * This is a blkdev framework request.
214 		 */
215 		mutex_exit(&vib->vib_mutex);
216 		bd_xfer_done(vbr->vbr_xfer, vbr->vbr_error);
217 		mutex_enter(&vib->vib_mutex);
218 		vbr->vbr_xfer = NULL;
219 	}
220 }
221 
222 static vioblk_req_t *
223 vioblk_common_start(vioblk_t *vib, int type, uint64_t sector,
224     boolean_t polled)
225 {
226 	vioblk_req_t *vbr = NULL;
227 
228 	if ((vbr = vioblk_req_alloc(vib)) == NULL) {
229 		vib->vib_stats->vbs_rw_outofmemory.value.ui64++;
230 		return (NULL);
231 	}
232 	vbr->vbr_type = type;
233 
234 	if (polled) {
235 		/*
236 		 * Mark this command as polled so that we can wait on it
237 		 * ourselves.
238 		 */
239 		vbr->vbr_status |= VIOBLK_REQSTAT_POLLED;
240 	}
241 
242 	struct vioblk_req_hdr vbh;
243 	vbh.vbh_type = type;
244 	vbh.vbh_ioprio = 0;
245 	vbh.vbh_sector = (sector * vib->vib_blk_size) / DEV_BSIZE;
246 	bcopy(&vbh, virtio_dma_va(vbr->vbr_dma, 0), sizeof (vbh));
247 
248 	/*
249 	 * Put the header in the first descriptor.  See the block comment at
250 	 * the top of the file for more details on the chain layout.
251 	 */
252 	if (virtio_chain_append(vbr->vbr_chain,
253 	    virtio_dma_cookie_pa(vbr->vbr_dma, 0),
254 	    sizeof (struct vioblk_req_hdr), VIRTIO_DIR_DEVICE_READS) !=
255 	    DDI_SUCCESS) {
256 		vioblk_req_free(vib, vbr);
257 		return (NULL);
258 	}
259 
260 	return (vbr);
261 }
262 
263 static int
264 vioblk_common_submit(vioblk_t *vib, vioblk_req_t *vbr)
265 {
266 	virtio_chain_t *vic = vbr->vbr_chain;
267 	int r;
268 
269 	VERIFY(MUTEX_HELD(&vib->vib_mutex));
270 
271 	/*
272 	 * The device will write the status byte into this last descriptor.
273 	 * See the block comment at the top of the file for more details on the
274 	 * chain layout.
275 	 */
276 	if (virtio_chain_append(vic, virtio_dma_cookie_pa(vbr->vbr_dma, 0) +
277 	    sizeof (struct vioblk_req_hdr), sizeof (uint8_t),
278 	    VIRTIO_DIR_DEVICE_WRITES) != DDI_SUCCESS) {
279 		vioblk_req_free(vib, vbr);
280 		return (ENOMEM);
281 	}
282 
283 	virtio_dma_sync(vbr->vbr_dma, DDI_DMA_SYNC_FORDEV);
284 	virtio_chain_submit(vic, B_TRUE);
285 
286 	if (!(vbr->vbr_status & VIOBLK_REQSTAT_POLLED)) {
287 		/*
288 		 * This is not a polled request.  Our request will be freed and
289 		 * the caller notified later in vioblk_poll().
290 		 */
291 		return (0);
292 	}
293 
294 	/*
295 	 * This is a polled request.  We need to block here and wait for the
296 	 * device to complete request processing.
297 	 */
298 	while (!(vbr->vbr_status & VIOBLK_REQSTAT_POLL_COMPLETE)) {
299 		if (ddi_in_panic()) {
300 			/*
301 			 * When panicking, interrupts are disabled.  We must
302 			 * poll the queue manually.
303 			 */
304 			drv_usecwait(10);
305 			(void) vioblk_poll(vib);
306 			continue;
307 		}
308 
309 		/*
310 		 * When not panicking, the device will interrupt on command
311 		 * completion and vioblk_poll() will be called to wake us up.
312 		 */
313 		cv_wait(&vib->vib_cv, &vib->vib_mutex);
314 	}
315 
316 	vioblk_complete(vib, vbr);
317 	r = vbr->vbr_error;
318 	vioblk_req_free(vib, vbr);
319 	return (r);
320 }
321 
322 static int
323 vioblk_internal(vioblk_t *vib, int type, virtio_dma_t *dma,
324     uint64_t sector, virtio_direction_t dir)
325 {
326 	vioblk_req_t *vbr;
327 
328 	VERIFY(MUTEX_HELD(&vib->vib_mutex));
329 
330 	/*
331 	 * Allocate a polled request.
332 	 */
333 	if ((vbr = vioblk_common_start(vib, type, sector, B_TRUE)) == NULL) {
334 		return (ENOMEM);
335 	}
336 
337 	/*
338 	 * If there is a request payload, it goes between the header and the
339 	 * status byte.  See the block comment at the top of the file for more
340 	 * detail on the chain layout.
341 	 */
342 	if (dma != NULL) {
343 		virtio_chain_t *vic = vbr->vbr_chain;
344 		for (uint_t n = 0; n < virtio_dma_ncookies(dma); n++) {
345 			if (virtio_chain_append(vic,
346 			    virtio_dma_cookie_pa(dma, n),
347 			    virtio_dma_cookie_size(dma, n), dir) !=
348 			    DDI_SUCCESS) {
349 				vioblk_req_free(vib, vbr);
350 				return (ENOMEM);
351 			}
352 		}
353 	}
354 
355 	return (vioblk_common_submit(vib, vbr));
356 }
357 
358 static int
359 vioblk_map_discard(vioblk_t *vib, virtio_chain_t *vic, const bd_xfer_t *xfer)
360 {
361 	const dkioc_free_list_t *dfl = xfer->x_dfl;
362 	const dkioc_free_list_ext_t *exts = dfl->dfl_exts;
363 	virtio_dma_t *dma = NULL;
364 	struct vioblk_discard_write_zeroes *wzp = NULL;
365 
366 	dma = virtio_dma_alloc(vib->vib_virtio,
367 	    dfl->dfl_num_exts * sizeof (*wzp), &vioblk_dma_attr,
368 	    DDI_DMA_CONSISTENT | DDI_DMA_WRITE, KM_SLEEP);
369 	if (dma == NULL)
370 		return (ENOMEM);
371 
372 	wzp = virtio_dma_va(dma, 0);
373 
374 	for (uint64_t i = 0; i < dfl->dfl_num_exts; i++, exts++, wzp++) {
375 		uint64_t start = dfl->dfl_offset + exts->dfle_start;
376 
377 		const struct vioblk_discard_write_zeroes vdwz = {
378 			.vdwz_sector = start >> DEV_BSHIFT,
379 			.vdwz_num_sectors = exts->dfle_length >> DEV_BSHIFT,
380 			.vdwz_flags = 0
381 		};
382 
383 		bcopy(&vdwz, wzp, sizeof (*wzp));
384 	}
385 
386 	if (virtio_chain_append(vic,
387 	    virtio_dma_cookie_pa(dma, 0),
388 	    virtio_dma_cookie_size(dma, 0),
389 	    VIRTIO_DIR_DEVICE_READS) != DDI_SUCCESS) {
390 		virtio_dma_free(dma);
391 		return (ENOMEM);
392 	}
393 
394 	return (0);
395 }
396 
397 static int
398 vioblk_request(vioblk_t *vib, bd_xfer_t *xfer, int type)
399 {
400 	vioblk_req_t *vbr = NULL;
401 	uint_t total_cookies = 2;
402 	boolean_t polled = (xfer->x_flags & BD_XFER_POLL) != 0;
403 
404 	VERIFY(MUTEX_HELD(&vib->vib_mutex));
405 
406 	/*
407 	 * Ensure that this request falls within the advertised size of the
408 	 * block device.  Be careful to avoid overflow.
409 	 */
410 	if (xfer->x_nblks > SIZE_MAX - xfer->x_blkno ||
411 	    (xfer->x_blkno + xfer->x_nblks) > vib->vib_nblks) {
412 		vib->vib_stats->vbs_rw_badoffset.value.ui64++;
413 		return (EINVAL);
414 	}
415 
416 	if ((vbr = vioblk_common_start(vib, type, xfer->x_blkno, polled)) ==
417 	    NULL) {
418 		return (ENOMEM);
419 	}
420 	vbr->vbr_xfer = xfer;
421 
422 	/*
423 	 * If there is a request payload, it goes between the header and the
424 	 * status byte.  See the block comment at the top of the file for more
425 	 * detail on the chain layout.
426 	 */
427 	if ((type == VIRTIO_BLK_T_IN || type == VIRTIO_BLK_T_OUT) &&
428 	    xfer->x_nblks > 0) {
429 		virtio_direction_t dir = (type == VIRTIO_BLK_T_OUT) ?
430 		    VIRTIO_DIR_DEVICE_READS : VIRTIO_DIR_DEVICE_WRITES;
431 		virtio_chain_t *vic = vbr->vbr_chain;
432 
433 		for (uint_t n = 0; n < xfer->x_ndmac; n++) {
434 			ddi_dma_cookie_t dmac;
435 
436 			if (n == 0) {
437 				/*
438 				 * The first cookie is in the blkdev request.
439 				 */
440 				dmac = xfer->x_dmac;
441 			} else {
442 				ddi_dma_nextcookie(xfer->x_dmah, &dmac);
443 			}
444 
445 			if (virtio_chain_append(vic, dmac.dmac_laddress,
446 			    dmac.dmac_size, dir) != DDI_SUCCESS) {
447 				vioblk_req_free(vib, vbr);
448 				return (ENOMEM);
449 			}
450 		}
451 
452 		total_cookies += xfer->x_ndmac;
453 
454 	} else if (xfer->x_nblks > 0) {
455 		dev_err(vib->vib_dip, CE_PANIC,
456 		    "request of type %d had payload length of %lu blocks", type,
457 		    xfer->x_nblks);
458 	} else if (type == VIRTIO_BLK_T_DISCARD) {
459 		int r = vioblk_map_discard(vib, vbr->vbr_chain, xfer);
460 		if (r != 0) {
461 			vioblk_req_free(vib, vbr);
462 			return (r);
463 		}
464 	}
465 
466 	if (vib->vib_stats->vbs_rw_cookiesmax.value.ui32 < total_cookies) {
467 		vib->vib_stats->vbs_rw_cookiesmax.value.ui32 = total_cookies;
468 	}
469 
470 	return (vioblk_common_submit(vib, vbr));
471 }
472 
473 static int
474 vioblk_bd_read(void *arg, bd_xfer_t *xfer)
475 {
476 	vioblk_t *vib = arg;
477 	int r;
478 
479 	mutex_enter(&vib->vib_mutex);
480 	r = vioblk_request(vib, xfer, VIRTIO_BLK_T_IN);
481 	mutex_exit(&vib->vib_mutex);
482 
483 	return (r);
484 }
485 
486 static int
487 vioblk_bd_write(void *arg, bd_xfer_t *xfer)
488 {
489 	vioblk_t *vib = arg;
490 	int r;
491 
492 	mutex_enter(&vib->vib_mutex);
493 	r = vioblk_request(vib, xfer, VIRTIO_BLK_T_OUT);
494 	mutex_exit(&vib->vib_mutex);
495 
496 	return (r);
497 }
498 
499 static int
500 vioblk_bd_flush(void *arg, bd_xfer_t *xfer)
501 {
502 	vioblk_t *vib = arg;
503 	int r;
504 
505 	mutex_enter(&vib->vib_mutex);
506 	if (!virtio_feature_present(vib->vib_virtio, VIRTIO_BLK_F_FLUSH)) {
507 		/*
508 		 * We don't really expect to get here, because if we did not
509 		 * negotiate the flush feature we would not have installed this
510 		 * function in the blkdev ops vector.
511 		 */
512 		mutex_exit(&vib->vib_mutex);
513 		return (ENOTSUP);
514 	}
515 
516 	r = vioblk_request(vib, xfer, VIRTIO_BLK_T_FLUSH);
517 	mutex_exit(&vib->vib_mutex);
518 
519 	return (r);
520 }
521 
522 static void
523 vioblk_bd_driveinfo(void *arg, bd_drive_t *drive)
524 {
525 	vioblk_t *vib = arg;
526 
527 	drive->d_qsize = vib->vib_reqs_capacity;
528 	drive->d_removable = B_FALSE;
529 	drive->d_hotpluggable = B_TRUE;
530 	drive->d_target = 0;
531 	drive->d_lun = 0;
532 
533 	drive->d_vendor = "Virtio";
534 	drive->d_vendor_len = strlen(drive->d_vendor);
535 
536 	drive->d_product = "Block Device";
537 	drive->d_product_len = strlen(drive->d_product);
538 
539 	drive->d_serial = vib->vib_devid;
540 	drive->d_serial_len = strlen(drive->d_serial);
541 
542 	drive->d_revision = "0000";
543 	drive->d_revision_len = strlen(drive->d_revision);
544 
545 	if (vib->vib_can_discard) {
546 		drive->d_free_align = vib->vib_discard_sector_align;
547 		drive->d_max_free_seg = vib->vib_max_discard_seg;
548 		drive->d_max_free_blks = vib->vib_max_discard_sectors;
549 		/*
550 		 * The virtio 1.1 spec doesn't specify a per segment sector
551 		 * limit for discards -- only a limit on the total sectors in
552 		 * a discard request. Therefore, we assume a vioblk device must
553 		 * be able to accept a single segment of vib_max_discard_sectors
554 		 * (when it supports discard requests) and use
555 		 * vib_max_discard_sectors both for the overall limit for
556 		 * a discard request, but also as the limit for a single
557 		 * segment. blkdev will ensure we are never called with
558 		 * a dkioc_free_list_t that violates either limit.
559 		 */
560 		drive->d_max_free_seg_blks = vib->vib_max_discard_sectors;
561 	}
562 }
563 
564 static int
565 vioblk_bd_mediainfo(void *arg, bd_media_t *media)
566 {
567 	vioblk_t *vib = (void *)arg;
568 
569 	/*
570 	 * The device protocol is specified in terms of 512 byte logical
571 	 * blocks, regardless of the recommended I/O size which might be
572 	 * larger.
573 	 */
574 	media->m_nblks = vib->vib_nblks;
575 	media->m_blksize = vib->vib_blk_size;
576 
577 	media->m_readonly = vib->vib_readonly;
578 	media->m_pblksize = vib->vib_pblk_size;
579 	return (0);
580 }
581 
582 static void
583 vioblk_get_id(vioblk_t *vib)
584 {
585 	virtio_dma_t *dma;
586 	int r;
587 
588 	if ((dma = virtio_dma_alloc(vib->vib_virtio, VIRTIO_BLK_ID_BYTES,
589 	    &vioblk_dma_attr, DDI_DMA_CONSISTENT | DDI_DMA_READ,
590 	    KM_SLEEP)) == NULL) {
591 		return;
592 	}
593 
594 	mutex_enter(&vib->vib_mutex);
595 	if ((r = vioblk_internal(vib, VIRTIO_BLK_T_GET_ID, dma, 0,
596 	    VIRTIO_DIR_DEVICE_WRITES)) == 0) {
597 		const char *b = virtio_dma_va(dma, 0);
598 		uint_t pos = 0;
599 
600 		/*
601 		 * Save the entire response for debugging purposes.
602 		 */
603 		bcopy(virtio_dma_va(dma, 0), vib->vib_rawid,
604 		    VIRTIO_BLK_ID_BYTES);
605 
606 		/*
607 		 * Process the returned ID.
608 		 */
609 		bzero(vib->vib_devid, sizeof (vib->vib_devid));
610 		for (uint_t n = 0; n < VIRTIO_BLK_ID_BYTES; n++) {
611 			if (isalnum(b[n]) || b[n] == '-' || b[n] == '_') {
612 				/*
613 				 * Accept a subset of printable ASCII
614 				 * characters.
615 				 */
616 				vib->vib_devid[pos++] = b[n];
617 			} else {
618 				/*
619 				 * Stop processing at the first sign of
620 				 * trouble.
621 				 */
622 				break;
623 			}
624 		}
625 
626 		vib->vib_devid_fetched = B_TRUE;
627 	}
628 	mutex_exit(&vib->vib_mutex);
629 
630 	virtio_dma_free(dma);
631 }
632 
633 static int
634 vioblk_bd_devid(void *arg, dev_info_t *dip, ddi_devid_t *devid)
635 {
636 	vioblk_t *vib = arg;
637 	size_t len;
638 
639 	if ((len = strlen(vib->vib_devid)) == 0) {
640 		/*
641 		 * The device has no ID.
642 		 */
643 		return (DDI_FAILURE);
644 	}
645 
646 	return (ddi_devid_init(dip, DEVID_ATA_SERIAL, len, vib->vib_devid,
647 	    devid));
648 }
649 
650 static int
651 vioblk_bd_free_space(void *arg, bd_xfer_t *xfer)
652 {
653 	vioblk_t *vib = arg;
654 	int r = 0;
655 
656 	/*
657 	 * Since vib_can_discard is write once (and set during attach),
658 	 * we can check if it's enabled without taking the mutex.
659 	 */
660 	if (!vib->vib_can_discard) {
661 		return (ENOTSUP);
662 	}
663 
664 	mutex_enter(&vib->vib_mutex);
665 	r = vioblk_request(vib, xfer, VIRTIO_BLK_T_DISCARD);
666 	mutex_exit(&vib->vib_mutex);
667 
668 	return (r);
669 }
670 
671 /*
672  * As the device completes processing of a request, it returns the chain for
673  * that request to our I/O queue.  This routine is called in two contexts:
674  *   - from the interrupt handler, in response to notification from the device
675  *   - synchronously in line with request processing when panicking
676  */
677 static uint_t
678 vioblk_poll(vioblk_t *vib)
679 {
680 	virtio_chain_t *vic;
681 	uint_t count = 0;
682 	boolean_t wakeup = B_FALSE;
683 
684 	VERIFY(MUTEX_HELD(&vib->vib_mutex));
685 
686 	while ((vic = virtio_queue_poll(vib->vib_vq)) != NULL) {
687 		vioblk_req_t *vbr = virtio_chain_data(vic);
688 		uint8_t status;
689 
690 		virtio_dma_sync(vbr->vbr_dma, DDI_DMA_SYNC_FORCPU);
691 
692 		bcopy(virtio_dma_va(vbr->vbr_dma,
693 		    sizeof (struct vioblk_req_hdr)), &status, sizeof (status));
694 
695 		switch (status) {
696 		case VIRTIO_BLK_S_OK:
697 			vbr->vbr_error = 0;
698 			break;
699 		case VIRTIO_BLK_S_IOERR:
700 			vbr->vbr_error = EIO;
701 			vib->vib_stats->vbs_io_errors.value.ui64++;
702 			break;
703 		case VIRTIO_BLK_S_UNSUPP:
704 			vbr->vbr_error = ENOTTY;
705 			vib->vib_stats->vbs_unsupp_errors.value.ui64++;
706 			break;
707 		default:
708 			vbr->vbr_error = ENXIO;
709 			vib->vib_stats->vbs_nxio_errors.value.ui64++;
710 			break;
711 		}
712 
713 		count++;
714 
715 		if (vbr->vbr_status & VIOBLK_REQSTAT_POLLED) {
716 			/*
717 			 * This request must not be freed as it is being held
718 			 * by a call to vioblk_common_submit().
719 			 */
720 			VERIFY(!(vbr->vbr_status &
721 			    VIOBLK_REQSTAT_POLL_COMPLETE));
722 			vbr->vbr_status |= VIOBLK_REQSTAT_POLL_COMPLETE;
723 			wakeup = B_TRUE;
724 			continue;
725 		}
726 
727 		vioblk_complete(vib, vbr);
728 
729 		vioblk_req_free(vib, vbr);
730 	}
731 
732 	if (wakeup) {
733 		/*
734 		 * Signal anybody waiting for polled command completion.
735 		 */
736 		cv_broadcast(&vib->vib_cv);
737 	}
738 
739 	return (count);
740 }
741 
742 static uint_t
743 vioblk_int_handler(caddr_t arg0, caddr_t arg1 __unused)
744 {
745 	vioblk_t *vib = (vioblk_t *)arg0;
746 	uint_t count;
747 
748 	mutex_enter(&vib->vib_mutex);
749 	if ((count = vioblk_poll(vib)) >
750 	    vib->vib_stats->vbs_intr_queuemax.value.ui32) {
751 		vib->vib_stats->vbs_intr_queuemax.value.ui32 = count;
752 	}
753 
754 	vib->vib_stats->vbs_intr_total.value.ui64++;
755 	mutex_exit(&vib->vib_mutex);
756 
757 	return (DDI_INTR_CLAIMED);
758 }
759 
760 static uint_t
761 vioblk_cfgchange(caddr_t arg0, caddr_t arg1 __unused)
762 {
763 	vioblk_t *vib = (vioblk_t *)arg0;
764 
765 	dev_err(vib->vib_dip, CE_NOTE, "!Configuration changed");
766 
767 	mutex_enter(&vib->vib_mutex);
768 
769 	/*
770 	 * The configuration space of the device has changed in some way.
771 	 * At present, we only re-read the device capacity and trigger
772 	 * blkdev to check the device state.
773 	 */
774 
775 	if (vioblk_read_capacity(vib) == DDI_FAILURE) {
776 		mutex_exit(&vib->vib_mutex);
777 		return (DDI_INTR_CLAIMED);
778 	}
779 
780 	mutex_exit(&vib->vib_mutex);
781 
782 	bd_state_change(vib->vib_bd_h);
783 
784 	return (DDI_INTR_CLAIMED);
785 }
786 
787 static void
788 vioblk_free_reqs(vioblk_t *vib)
789 {
790 	VERIFY3U(vib->vib_nreqs_alloc, ==, 0);
791 
792 	for (uint_t i = 0; i < vib->vib_reqs_capacity; i++) {
793 		struct vioblk_req *vbr = &vib->vib_reqs_mem[i];
794 
795 		VERIFY(list_link_active(&vbr->vbr_link));
796 		list_remove(&vib->vib_reqs, vbr);
797 
798 		VERIFY0(vbr->vbr_status);
799 
800 		if (vbr->vbr_chain != NULL) {
801 			virtio_chain_free(vbr->vbr_chain);
802 			vbr->vbr_chain = NULL;
803 		}
804 		if (vbr->vbr_dma != NULL) {
805 			virtio_dma_free(vbr->vbr_dma);
806 			vbr->vbr_dma = NULL;
807 		}
808 	}
809 	VERIFY(list_is_empty(&vib->vib_reqs));
810 
811 	if (vib->vib_reqs_mem != NULL) {
812 		kmem_free(vib->vib_reqs_mem,
813 		    sizeof (struct vioblk_req) * vib->vib_reqs_capacity);
814 		vib->vib_reqs_mem = NULL;
815 		vib->vib_reqs_capacity = 0;
816 	}
817 }
818 
819 static int
820 vioblk_alloc_reqs(vioblk_t *vib)
821 {
822 	vib->vib_reqs_capacity = MIN(virtio_queue_size(vib->vib_vq),
823 	    VIRTIO_BLK_REQ_BUFS);
824 	vib->vib_reqs_mem = kmem_zalloc(
825 	    sizeof (struct vioblk_req) * vib->vib_reqs_capacity, KM_SLEEP);
826 	vib->vib_nreqs_alloc = 0;
827 
828 	for (uint_t i = 0; i < vib->vib_reqs_capacity; i++) {
829 		list_insert_tail(&vib->vib_reqs, &vib->vib_reqs_mem[i]);
830 	}
831 
832 	for (vioblk_req_t *vbr = list_head(&vib->vib_reqs); vbr != NULL;
833 	    vbr = list_next(&vib->vib_reqs, vbr)) {
834 		if ((vbr->vbr_dma = virtio_dma_alloc(vib->vib_virtio,
835 		    sizeof (struct vioblk_req_hdr) + sizeof (uint8_t),
836 		    &vioblk_dma_attr, DDI_DMA_RDWR | DDI_DMA_CONSISTENT,
837 		    KM_SLEEP)) == NULL) {
838 			goto fail;
839 		}
840 		vbr->vbr_chain = virtio_chain_alloc(vib->vib_vq, KM_SLEEP);
841 		if (vbr->vbr_chain == NULL) {
842 			goto fail;
843 		}
844 		virtio_chain_data_set(vbr->vbr_chain, vbr);
845 	}
846 
847 	return (0);
848 
849 fail:
850 	vioblk_free_reqs(vib);
851 	return (ENOMEM);
852 }
853 
854 static int
855 vioblk_read_capacity(vioblk_t *vib)
856 {
857 	virtio_t *vio = vib->vib_virtio;
858 
859 	/* The capacity is always available */
860 	if ((vib->vib_nblks = virtio_dev_get64(vio,
861 	    VIRTIO_BLK_CONFIG_CAPACITY)) == UINT64_MAX) {
862 		dev_err(vib->vib_dip, CE_WARN, "invalid capacity");
863 		return (DDI_FAILURE);
864 	}
865 
866 	/*
867 	 * Determine the optimal logical block size recommended by the device.
868 	 * This size is advisory; the protocol always deals in 512 byte blocks.
869 	 */
870 	vib->vib_blk_size = DEV_BSIZE;
871 	if (virtio_feature_present(vio, VIRTIO_BLK_F_BLK_SIZE)) {
872 		uint32_t v = virtio_dev_get32(vio, VIRTIO_BLK_CONFIG_BLK_SIZE);
873 
874 		if (v != 0 && v != PCI_EINVAL32)
875 			vib->vib_blk_size = v;
876 	}
877 
878 	/*
879 	 * Device capacity is always in 512-byte units, convert to
880 	 * native blocks.
881 	 */
882 	vib->vib_nblks = (vib->vib_nblks * DEV_BSIZE) / vib->vib_blk_size;
883 
884 	/*
885 	 * The device may also provide an advisory physical block size.
886 	 */
887 	vib->vib_pblk_size = vib->vib_blk_size;
888 	if (virtio_feature_present(vio, VIRTIO_BLK_F_TOPOLOGY)) {
889 		uint8_t v = virtio_dev_get8(vio, VIRTIO_BLK_CONFIG_TOPO_PBEXP);
890 
891 		if (v != PCI_EINVAL8)
892 			vib->vib_pblk_size <<= v;
893 	}
894 
895 	return (DDI_SUCCESS);
896 }
897 
898 static int
899 vioblk_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
900 {
901 	int instance = ddi_get_instance(dip);
902 	vioblk_t *vib;
903 	virtio_t *vio;
904 	boolean_t did_mutex = B_FALSE;
905 
906 	if (cmd != DDI_ATTACH) {
907 		return (DDI_FAILURE);
908 	}
909 
910 	if ((vio = virtio_init(dip, VIRTIO_BLK_WANTED_FEATURES, B_TRUE)) ==
911 	    NULL) {
912 		dev_err(dip, CE_WARN, "failed to start Virtio init");
913 		return (DDI_FAILURE);
914 	}
915 
916 	vib = kmem_zalloc(sizeof (*vib), KM_SLEEP);
917 	vib->vib_dip = dip;
918 	vib->vib_virtio = vio;
919 	ddi_set_driver_private(dip, vib);
920 	list_create(&vib->vib_reqs, sizeof (vioblk_req_t),
921 	    offsetof(vioblk_req_t, vbr_link));
922 
923 	/*
924 	 * Determine how many scatter-gather entries we can use in a single
925 	 * request.
926 	 */
927 	vib->vib_seg_max = VIRTIO_BLK_DEFAULT_MAX_SEG;
928 	if (virtio_feature_present(vio, VIRTIO_BLK_F_SEG_MAX)) {
929 		vib->vib_seg_max = virtio_dev_get32(vio,
930 		    VIRTIO_BLK_CONFIG_SEG_MAX);
931 
932 		if (vib->vib_seg_max == 0 || vib->vib_seg_max == PCI_EINVAL32) {
933 			/*
934 			 * We need to be able to use at least one data segment,
935 			 * so we'll assume that this device is just poorly
936 			 * implemented and try for one.
937 			 */
938 			vib->vib_seg_max = 1;
939 		}
940 	}
941 
942 	if (virtio_feature_present(vio, VIRTIO_BLK_F_DISCARD)) {
943 		vib->vib_max_discard_sectors = virtio_dev_get32(vio,
944 		    VIRTIO_BLK_CONFIG_MAX_DISCARD_SECT);
945 		vib->vib_max_discard_seg = virtio_dev_get32(vio,
946 		    VIRTIO_BLK_CONFIG_MAX_DISCARD_SEG);
947 		vib->vib_discard_sector_align = virtio_dev_get32(vio,
948 		    VIRTIO_BLK_CONFIG_DISCARD_ALIGN);
949 
950 		if (vib->vib_max_discard_sectors == 0 ||
951 		    vib->vib_max_discard_seg == 0 ||
952 		    vib->vib_discard_sector_align == 0) {
953 			vib->vib_can_discard = B_FALSE;
954 
955 			/*
956 			 * The hypervisor shouldn't be giving us bad values.
957 			 * If it is, it's probably worth notifying the
958 			 * operator.
959 			 */
960 			dev_err(dip, CE_NOTE,
961 			    "Host is advertising DISCARD support but with bad"
962 			    "parameters: max_discard_sectors=%u, "
963 			    "max_discard_segments=%u, discard_sector_align=%u",
964 			    vib->vib_max_discard_sectors,
965 			    vib->vib_max_discard_seg,
966 			    vib->vib_discard_sector_align);
967 		} else {
968 			vib->vib_can_discard = B_TRUE;
969 		}
970 	}
971 
972 	/*
973 	 * When allocating the request queue, we include two additional
974 	 * descriptors (beyond those required for request data) to account for
975 	 * the header and the status byte.
976 	 */
977 	if ((vib->vib_vq = virtio_queue_alloc(vio, VIRTIO_BLK_VIRTQ_IO, "io",
978 	    vioblk_int_handler, vib, B_FALSE, vib->vib_seg_max + 2)) == NULL) {
979 		goto fail;
980 	}
981 
982 	virtio_register_cfgchange_handler(vio, vioblk_cfgchange, vib);
983 
984 	if (virtio_init_complete(vio, VIRTIO_ANY_INTR_TYPE) != DDI_SUCCESS) {
985 		dev_err(dip, CE_WARN, "failed to complete Virtio init");
986 		goto fail;
987 	}
988 
989 	cv_init(&vib->vib_cv, NULL, CV_DRIVER, NULL);
990 	mutex_init(&vib->vib_mutex, NULL, MUTEX_DRIVER, virtio_intr_pri(vio));
991 	did_mutex = B_TRUE;
992 
993 	if ((vib->vib_kstat = kstat_create("vioblk", instance,
994 	    "statistics", "controller", KSTAT_TYPE_NAMED,
995 	    sizeof (struct vioblk_stats) / sizeof (kstat_named_t),
996 	    KSTAT_FLAG_PERSISTENT)) == NULL) {
997 		dev_err(dip, CE_WARN, "kstat_create failed");
998 		goto fail;
999 	}
1000 	vib->vib_stats = (vioblk_stats_t *)vib->vib_kstat->ks_data;
1001 	kstat_named_init(&vib->vib_stats->vbs_rw_outofmemory,
1002 	    "total_rw_outofmemory", KSTAT_DATA_UINT64);
1003 	kstat_named_init(&vib->vib_stats->vbs_rw_badoffset,
1004 	    "total_rw_badoffset", KSTAT_DATA_UINT64);
1005 	kstat_named_init(&vib->vib_stats->vbs_intr_total,
1006 	    "total_intr", KSTAT_DATA_UINT64);
1007 	kstat_named_init(&vib->vib_stats->vbs_io_errors,
1008 	    "total_io_errors", KSTAT_DATA_UINT64);
1009 	kstat_named_init(&vib->vib_stats->vbs_unsupp_errors,
1010 	    "total_unsupp_errors", KSTAT_DATA_UINT64);
1011 	kstat_named_init(&vib->vib_stats->vbs_nxio_errors,
1012 	    "total_nxio_errors", KSTAT_DATA_UINT64);
1013 	kstat_named_init(&vib->vib_stats->vbs_rw_cacheflush,
1014 	    "total_rw_cacheflush", KSTAT_DATA_UINT64);
1015 	kstat_named_init(&vib->vib_stats->vbs_rw_cookiesmax,
1016 	    "max_rw_cookies", KSTAT_DATA_UINT32);
1017 	kstat_named_init(&vib->vib_stats->vbs_intr_queuemax,
1018 	    "max_intr_queue", KSTAT_DATA_UINT32);
1019 	kstat_install(vib->vib_kstat);
1020 
1021 	vib->vib_readonly = virtio_feature_present(vio, VIRTIO_BLK_F_RO);
1022 
1023 	if (vioblk_read_capacity(vib) == DDI_FAILURE)
1024 		goto fail;
1025 
1026 	/*
1027 	 * The maximum size for a cookie in a request.
1028 	 */
1029 	vib->vib_seg_size_max = VIRTIO_BLK_DEFAULT_MAX_SIZE;
1030 	if (virtio_feature_present(vio, VIRTIO_BLK_F_SIZE_MAX)) {
1031 		uint32_t v = virtio_dev_get32(vio, VIRTIO_BLK_CONFIG_SIZE_MAX);
1032 
1033 		if (v != 0 && v != PCI_EINVAL32) {
1034 			vib->vib_seg_size_max = v;
1035 		}
1036 	}
1037 
1038 	/*
1039 	 * Set up the DMA attributes for blkdev to use for request data.  The
1040 	 * specification is not extremely clear about whether DMA-related
1041 	 * parameters include or exclude the header and status descriptors.
1042 	 * For now, we assume they cover only the request data and not the
1043 	 * headers.
1044 	 */
1045 	vib->vib_bd_dma_attr = vioblk_dma_attr;
1046 	vib->vib_bd_dma_attr.dma_attr_sgllen = vib->vib_seg_max;
1047 	vib->vib_bd_dma_attr.dma_attr_count_max = vib->vib_seg_size_max;
1048 	vib->vib_bd_dma_attr.dma_attr_maxxfer = vib->vib_seg_max *
1049 	    vib->vib_seg_size_max;
1050 
1051 	if (vioblk_alloc_reqs(vib) != 0) {
1052 		goto fail;
1053 	}
1054 
1055 	/*
1056 	 * The blkdev framework does not provide a way to specify that the
1057 	 * device does not support write cache flushing, except by omitting the
1058 	 * "o_sync_cache" member from the ops vector.  As "bd_alloc_handle()"
1059 	 * makes a copy of the ops vector, we can safely assemble one on the
1060 	 * stack based on negotiated features.
1061 	 *
1062 	 * Similarly, the blkdev framework does not provide a way to indicate
1063 	 * if a device supports a TRIM/UNMAP/DISCARD type operation except
1064 	 * by omitting the "o_free_space" member from the ops vector.
1065 	 */
1066 	bd_ops_t vioblk_bd_ops = {
1067 		.o_version =		BD_OPS_CURRENT_VERSION,
1068 		.o_drive_info =		vioblk_bd_driveinfo,
1069 		.o_media_info =		vioblk_bd_mediainfo,
1070 		.o_devid_init =		vioblk_bd_devid,
1071 		.o_sync_cache =		vioblk_bd_flush,
1072 		.o_read =		vioblk_bd_read,
1073 		.o_write =		vioblk_bd_write,
1074 		.o_free_space =		vioblk_bd_free_space,
1075 	};
1076 	if (!virtio_feature_present(vio, VIRTIO_BLK_F_FLUSH)) {
1077 		vioblk_bd_ops.o_sync_cache = NULL;
1078 	}
1079 	if (!vib->vib_can_discard) {
1080 		vioblk_bd_ops.o_free_space = NULL;
1081 	}
1082 
1083 	vib->vib_bd_h = bd_alloc_handle(vib, &vioblk_bd_ops,
1084 	    &vib->vib_bd_dma_attr, KM_SLEEP);
1085 
1086 	/*
1087 	 * Enable interrupts now so that we can request the device identity.
1088 	 */
1089 	if (virtio_interrupts_enable(vio) != DDI_SUCCESS) {
1090 		goto fail;
1091 	}
1092 
1093 	vioblk_get_id(vib);
1094 
1095 	if (bd_attach_handle(dip, vib->vib_bd_h) != DDI_SUCCESS) {
1096 		dev_err(dip, CE_WARN, "Failed to attach blkdev");
1097 		goto fail;
1098 	}
1099 
1100 	return (DDI_SUCCESS);
1101 
1102 fail:
1103 	if (vib->vib_bd_h != NULL) {
1104 		(void) bd_detach_handle(vib->vib_bd_h);
1105 		bd_free_handle(vib->vib_bd_h);
1106 	}
1107 	if (vio != NULL) {
1108 		(void) virtio_fini(vio, B_TRUE);
1109 	}
1110 	if (did_mutex) {
1111 		mutex_destroy(&vib->vib_mutex);
1112 		cv_destroy(&vib->vib_cv);
1113 	}
1114 	if (vib->vib_kstat != NULL) {
1115 		kstat_delete(vib->vib_kstat);
1116 	}
1117 	vioblk_free_reqs(vib);
1118 	kmem_free(vib, sizeof (*vib));
1119 	return (DDI_FAILURE);
1120 }
1121 
1122 static int
1123 vioblk_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
1124 {
1125 	vioblk_t *vib = ddi_get_driver_private(dip);
1126 
1127 	if (cmd != DDI_DETACH) {
1128 		return (DDI_FAILURE);
1129 	}
1130 
1131 	mutex_enter(&vib->vib_mutex);
1132 	if (vib->vib_nreqs_alloc > 0) {
1133 		/*
1134 		 * Cannot detach while there are still outstanding requests.
1135 		 */
1136 		mutex_exit(&vib->vib_mutex);
1137 		return (DDI_FAILURE);
1138 	}
1139 
1140 	if (bd_detach_handle(vib->vib_bd_h) != DDI_SUCCESS) {
1141 		mutex_exit(&vib->vib_mutex);
1142 		return (DDI_FAILURE);
1143 	}
1144 
1145 	/*
1146 	 * Tear down the Virtio framework before freeing the rest of the
1147 	 * resources.  This will ensure the interrupt handlers are no longer
1148 	 * running.
1149 	 */
1150 	virtio_fini(vib->vib_virtio, B_FALSE);
1151 
1152 	vioblk_free_reqs(vib);
1153 	kstat_delete(vib->vib_kstat);
1154 
1155 	mutex_exit(&vib->vib_mutex);
1156 	mutex_destroy(&vib->vib_mutex);
1157 
1158 	kmem_free(vib, sizeof (*vib));
1159 
1160 	return (DDI_SUCCESS);
1161 }
1162 
1163 static int
1164 vioblk_quiesce(dev_info_t *dip)
1165 {
1166 	vioblk_t *vib;
1167 
1168 	if ((vib = ddi_get_driver_private(dip)) == NULL) {
1169 		return (DDI_FAILURE);
1170 	}
1171 
1172 	return (virtio_quiesce(vib->vib_virtio));
1173 }
1174 
1175 int
1176 _init(void)
1177 {
1178 	int rv;
1179 
1180 	bd_mod_init(&vioblk_dev_ops);
1181 
1182 	if ((rv = mod_install(&vioblk_modlinkage)) != 0) {
1183 		bd_mod_fini(&vioblk_dev_ops);
1184 	}
1185 
1186 	return (rv);
1187 }
1188 
1189 int
1190 _fini(void)
1191 {
1192 	int rv;
1193 
1194 	if ((rv = mod_remove(&vioblk_modlinkage)) == 0) {
1195 		bd_mod_fini(&vioblk_dev_ops);
1196 	}
1197 
1198 	return (rv);
1199 }
1200 
1201 int
1202 _info(struct modinfo *modinfop)
1203 {
1204 	return (mod_info(&vioblk_modlinkage, modinfop));
1205 }
1206