xref: /illumos-gate/usr/src/uts/common/io/virtio/virtio_main.c (revision 4c63bf63e4b7d55c99e024187bc355893f66c09c)
/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source.  A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 */

/*
 * Copyright 2019 Joyent, Inc.
 */

/*
 * VIRTIO FRAMEWORK
 *
 * For design and usage documentation, see the comments in "virtio.h".
 */
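
/*
 * As a quick orientation, the rough shape of a client driver's attach path
 * is sketched below.  This is an illustrative sketch only: "mydev", its
 * handler, and the feature and segment constants are hypothetical, and
 * error handling is elided.  See "virtio.h" for the authoritative usage
 * documentation.
 *
 *	virtio_t *vio = virtio_init(dip, MYDEV_WANTED_FEATURES, B_TRUE);
 *	virtio_queue_t *viq = virtio_queue_alloc(vio, 0, "requests",
 *	    mydev_intr, mydev, B_FALSE, MYDEV_MAX_SEGS);
 *	(void) virtio_init_complete(vio, 0);
 *	(void) virtio_interrupts_enable(vio);
 */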

#include <sys/conf.h>
#include <sys/kmem.h>
#include <sys/debug.h>
#include <sys/modctl.h>
#include <sys/autoconf.h>
#include <sys/ddi_impldefs.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/sunndi.h>
#include <sys/avintr.h>
#include <sys/spl.h>
#include <sys/promif.h>
#include <sys/list.h>
#include <sys/bootconf.h>
#include <sys/bootsvcs.h>
#include <sys/sysmacros.h>
#include <sys/pci.h>

#include "virtio.h"
#include "virtio_impl.h"


/*
 * Linkage structures
 */
static struct modlmisc virtio_modlmisc = {
	.misc_modops =			&mod_miscops,
	.misc_linkinfo =		"VIRTIO common routines",
};

static struct modlinkage virtio_modlinkage = {
	.ml_rev =			MODREV_1,
	.ml_linkage =			{ &virtio_modlmisc, NULL }
};

int
_init(void)
{
	return (mod_install(&virtio_modlinkage));
}

int
_fini(void)
{
	return (mod_remove(&virtio_modlinkage));
}

int
_info(struct modinfo *modinfop)
{
	return (mod_info(&virtio_modlinkage, modinfop));
}



static void virtio_set_status(virtio_t *, uint8_t);
static int virtio_chain_append_impl(virtio_chain_t *, uint64_t, size_t,
    uint16_t);
static int virtio_interrupts_setup(virtio_t *, int);
static void virtio_interrupts_teardown(virtio_t *);
static void virtio_interrupts_disable_locked(virtio_t *);
static void virtio_queue_free(virtio_queue_t *);

/*
 * We use the same device access attributes for BAR mapping and access to the
 * virtqueue memory.
 */
ddi_device_acc_attr_t virtio_acc_attr = {
	.devacc_attr_version =		DDI_DEVICE_ATTR_V1,
	.devacc_attr_endian_flags =	DDI_NEVERSWAP_ACC,
	.devacc_attr_dataorder =	DDI_STORECACHING_OK_ACC,
	.devacc_attr_access =		DDI_DEFAULT_ACC
};


/*
 * DMA attributes for the memory given to the device for queue management.
 */
ddi_dma_attr_t virtio_dma_attr_queue = {
	.dma_attr_version =		DMA_ATTR_V0,
	.dma_attr_addr_lo =		0x0000000000000000,
	/*
	 * Queue memory is aligned on VIRTIO_PAGE_SIZE with the address shifted
	 * down by VIRTIO_PAGE_SHIFT before being passed to the device in a
	 * 32-bit register.
	 */
	.dma_attr_addr_hi =		0x00000FFFFFFFF000,
	.dma_attr_count_max =		0x00000000FFFFFFFF,
	.dma_attr_align =		VIRTIO_PAGE_SIZE,
	.dma_attr_burstsizes =		1,
	.dma_attr_minxfer =		1,
	.dma_attr_maxxfer =		0x00000000FFFFFFFF,
	.dma_attr_seg =			0x00000000FFFFFFFF,
	.dma_attr_sgllen =		1,
	.dma_attr_granular =		1,
	.dma_attr_flags =		0
};
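
/*
 * As an illustrative example of the arithmetic above: queue memory placed
 * at physical address 0x123456000 satisfies the VIRTIO_PAGE_SIZE (4096
 * bytes for legacy devices) alignment constraint, and would be programmed
 * into the 32-bit queue address register as
 * 0x123456000 >> VIRTIO_PAGE_SHIFT == 0x123456.
 */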

/*
 * DMA attributes for the allocation of indirect descriptor lists.  The
 * indirect list is referenced by a regular descriptor entry: the physical
 * address field is 64 bits wide, but the length field is only 32 bits.  Each
 * descriptor is 16 bytes long.
 */
ddi_dma_attr_t virtio_dma_attr_indirect = {
	.dma_attr_version =		DMA_ATTR_V0,
	.dma_attr_addr_lo =		0x0000000000000000,
	.dma_attr_addr_hi =		0xFFFFFFFFFFFFFFFF,
	.dma_attr_count_max =		0x00000000FFFFFFFF,
	.dma_attr_align =		sizeof (struct virtio_vq_desc),
	.dma_attr_burstsizes =		1,
	.dma_attr_minxfer =		1,
	.dma_attr_maxxfer =		0x00000000FFFFFFFF,
	.dma_attr_seg =			0x00000000FFFFFFFF,
	.dma_attr_sgllen =		1,
	.dma_attr_granular =		1,
	.dma_attr_flags =		0
};


uint8_t
virtio_get8(virtio_t *vio, uintptr_t offset)
{
	return (ddi_get8(vio->vio_barh, (uint8_t *)(vio->vio_bar + offset)));
}

uint16_t
virtio_get16(virtio_t *vio, uintptr_t offset)
{
	return (ddi_get16(vio->vio_barh, (uint16_t *)(vio->vio_bar + offset)));
}

uint32_t
virtio_get32(virtio_t *vio, uintptr_t offset)
{
	return (ddi_get32(vio->vio_barh, (uint32_t *)(vio->vio_bar + offset)));
}

void
virtio_put8(virtio_t *vio, uintptr_t offset, uint8_t value)
{
	ddi_put8(vio->vio_barh, (uint8_t *)(vio->vio_bar + offset), value);
}

void
virtio_put16(virtio_t *vio, uintptr_t offset, uint16_t value)
{
	ddi_put16(vio->vio_barh, (uint16_t *)(vio->vio_bar + offset), value);
}

void
virtio_put32(virtio_t *vio, uintptr_t offset, uint32_t value)
{
	ddi_put32(vio->vio_barh, (uint32_t *)(vio->vio_bar + offset), value);
}

void
virtio_fini(virtio_t *vio, boolean_t failed)
{
	mutex_enter(&vio->vio_mutex);

	virtio_interrupts_teardown(vio);

	virtio_queue_t *viq;
	while ((viq = list_remove_head(&vio->vio_queues)) != NULL) {
		virtio_queue_free(viq);
	}
	list_destroy(&vio->vio_queues);

	if (failed) {
		/*
		 * Signal to the host that device setup failed.
		 */
		virtio_set_status(vio, VIRTIO_STATUS_FAILED);
	} else {
		virtio_device_reset(vio);
	}

	/*
	 * We don't need to do anything for the provider initlevel, as it
	 * merely records the fact that virtio_init_complete() was called.
	 */
	vio->vio_initlevel &= ~VIRTIO_INITLEVEL_PROVIDER;

	if (vio->vio_initlevel & VIRTIO_INITLEVEL_REGS) {
		/*
		 * Unmap PCI BAR0.
		 */
		ddi_regs_map_free(&vio->vio_barh);

		vio->vio_initlevel &= ~VIRTIO_INITLEVEL_REGS;
	}

	/*
	 * Ensure we have torn down everything we set up.
	 */
	VERIFY0(vio->vio_initlevel);

	mutex_exit(&vio->vio_mutex);
	mutex_destroy(&vio->vio_mutex);

	kmem_free(vio, sizeof (*vio));
}

/*
 * Early device initialisation for legacy (pre-1.0 specification) virtio
 * devices.
 */
virtio_t *
virtio_init(dev_info_t *dip, uint64_t driver_features, boolean_t allow_indirect)
{
	int r;

	/*
	 * First, confirm that this is a legacy device.
	 */
	ddi_acc_handle_t pci;
	if (pci_config_setup(dip, &pci) != DDI_SUCCESS) {
		dev_err(dip, CE_WARN, "pci_config_setup failed");
		return (NULL);
	}

	uint8_t revid;
	if ((revid = pci_config_get8(pci, PCI_CONF_REVID)) == PCI_EINVAL8) {
		dev_err(dip, CE_WARN, "could not read config space");
		pci_config_teardown(&pci);
		return (NULL);
	}

	pci_config_teardown(&pci);

	/*
	 * The legacy specification requires that the device advertise as PCI
	 * Revision 0.
	 */
	if (revid != 0) {
		dev_err(dip, CE_WARN, "PCI Revision %u incorrect for "
		    "legacy virtio device", (uint_t)revid);
		return (NULL);
	}

	virtio_t *vio = kmem_zalloc(sizeof (*vio), KM_SLEEP);
	vio->vio_dip = dip;

	/*
	 * Map PCI BAR0 for legacy device access.
	 */
	if ((r = ddi_regs_map_setup(dip, VIRTIO_LEGACY_PCI_BAR0,
	    (caddr_t *)&vio->vio_bar, 0, 0, &virtio_acc_attr,
	    &vio->vio_barh)) != DDI_SUCCESS) {
		dev_err(dip, CE_WARN, "ddi_regs_map_setup failure (%d)", r);
		kmem_free(vio, sizeof (*vio));
		return (NULL);
	}
	vio->vio_initlevel |= VIRTIO_INITLEVEL_REGS;

	/*
	 * We initialise the mutex without an interrupt priority to ease the
	 * implementation of some of the configuration space access routines.
	 * Drivers using the virtio framework MUST make a call to
	 * "virtio_init_complete()" prior to spawning other threads or enabling
	 * interrupt handlers, at which time we will destroy and reinitialise
	 * the mutex for use in our interrupt handlers.
	 */
	mutex_init(&vio->vio_mutex, NULL, MUTEX_DRIVER, NULL);

	list_create(&vio->vio_queues, sizeof (virtio_queue_t),
	    offsetof(virtio_queue_t, viq_link));

	/*
	 * Legacy virtio devices require a few common steps before we can
	 * negotiate device features.
	 */
	virtio_device_reset(vio);
	virtio_set_status(vio, VIRTIO_STATUS_ACKNOWLEDGE);
	virtio_set_status(vio, VIRTIO_STATUS_DRIVER);

	/*
	 * Negotiate features with the device.  Record the original supported
	 * feature set for debugging purposes.
	 */
	vio->vio_features_device = virtio_get32(vio,
	    VIRTIO_LEGACY_FEATURES_DEVICE);
	if (allow_indirect) {
		driver_features |= VIRTIO_F_RING_INDIRECT_DESC;
	}
	vio->vio_features = vio->vio_features_device & driver_features;
	virtio_put32(vio, VIRTIO_LEGACY_FEATURES_DRIVER, vio->vio_features);

	/*
	 * The device-specific configuration begins at an offset into the BAR
	 * that depends on whether we have enabled MSI-X interrupts or not.
	 * Start out with the offset for pre-MSI-X operation so that we can
	 * read device configuration space prior to configuring interrupts.
	 */
	vio->vio_config_offset = VIRTIO_LEGACY_CFG_OFFSET;

	return (vio);
}

/*
 * This function must be called by the driver once it has completed early setup
 * calls.
 */
int
virtio_init_complete(virtio_t *vio, int allowed_interrupt_types)
{
	VERIFY(!(vio->vio_initlevel & VIRTIO_INITLEVEL_PROVIDER));
	vio->vio_initlevel |= VIRTIO_INITLEVEL_PROVIDER;

	if (!list_is_empty(&vio->vio_queues)) {
		/*
		 * Set up interrupts for the queues that have been registered.
		 */
		if (virtio_interrupts_setup(vio, allowed_interrupt_types) !=
		    DDI_SUCCESS) {
			return (DDI_FAILURE);
		}
	}

	/*
	 * Now that the interrupt priority is known, we can reinitialise the
	 * mutexes with the correct priority.
	 */
	mutex_destroy(&vio->vio_mutex);
	mutex_init(&vio->vio_mutex, NULL, MUTEX_DRIVER, virtio_intr_pri(vio));
	for (virtio_queue_t *viq = list_head(&vio->vio_queues); viq != NULL;
	    viq = list_next(&vio->vio_queues, viq)) {
		mutex_destroy(&viq->viq_mutex);
		mutex_init(&viq->viq_mutex, NULL, MUTEX_DRIVER,
		    virtio_intr_pri(vio));
	}

	virtio_set_status(vio, VIRTIO_STATUS_DRIVER_OK);

	return (DDI_SUCCESS);
}

boolean_t
virtio_feature_present(virtio_t *vio, uint64_t feature_mask)
{
	return ((vio->vio_features & feature_mask) != 0);
}

void *
virtio_intr_pri(virtio_t *vio)
{
	VERIFY(vio->vio_initlevel & VIRTIO_INITLEVEL_INT_ADDED);

	return (DDI_INTR_PRI(vio->vio_interrupt_priority));
}

/*
 * Enable a bit in the device status register.  Each bit signals a level of
 * guest readiness to the host.  Use the VIRTIO_CONFIG_DEVICE_STATUS_*
 * constants for "status".  To zero the status field use virtio_device_reset().
 */
static void
virtio_set_status(virtio_t *vio, uint8_t status)
{
	VERIFY3U(status, !=, 0);

	mutex_enter(&vio->vio_mutex);

	uint8_t old = virtio_get8(vio, VIRTIO_LEGACY_DEVICE_STATUS);
	virtio_put8(vio, VIRTIO_LEGACY_DEVICE_STATUS, status | old);

	mutex_exit(&vio->vio_mutex);
}

static void
virtio_device_reset_locked(virtio_t *vio)
{
	virtio_put8(vio, VIRTIO_LEGACY_DEVICE_STATUS, VIRTIO_STATUS_RESET);
}

void
virtio_device_reset(virtio_t *vio)
{
	mutex_enter(&vio->vio_mutex);
	virtio_device_reset_locked(vio);
	mutex_exit(&vio->vio_mutex);
}

/*
 * Some queues are effectively long-polled; the driver submits a series of
 * buffers and the device only returns them when there is data available.
 * During detach, we need to coordinate the return of these buffers.  Calling
 * "virtio_shutdown()" will reset the device, then allow the removal of all
 * buffers that were in flight at the time of shutdown via
 * "virtio_queue_evacuate()".
 */
void
virtio_shutdown(virtio_t *vio)
{
	mutex_enter(&vio->vio_mutex);
	if (vio->vio_initlevel & VIRTIO_INITLEVEL_SHUTDOWN) {
		/*
		 * Shutdown has been performed already.
		 */
		mutex_exit(&vio->vio_mutex);
		return;
	}

	/*
	 * First, mark all of the queues as shutdown.  This will prevent any
	 * further activity.
	 */
	for (virtio_queue_t *viq = list_head(&vio->vio_queues); viq != NULL;
	    viq = list_next(&vio->vio_queues, viq)) {
		mutex_enter(&viq->viq_mutex);
		viq->viq_shutdown = B_TRUE;
		mutex_exit(&viq->viq_mutex);
	}

	/*
	 * Now, reset the device.  This removes any queue configuration on the
	 * device side.
	 */
	virtio_device_reset_locked(vio);
	vio->vio_initlevel |= VIRTIO_INITLEVEL_SHUTDOWN;
	mutex_exit(&vio->vio_mutex);
}
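
/*
 * As an illustration (hypothetical driver code; error handling and buffer
 * teardown elided), the detach path pairs "virtio_shutdown()" with a drain
 * of each queue before final teardown:
 *
 *	virtio_shutdown(vio);
 *	virtio_chain_t *vic;
 *	while ((vic = virtio_queue_evacuate(viq)) != NULL) {
 *		virtio_chain_free(vic);
 *	}
 *	virtio_fini(vio, B_FALSE);
 */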

/*
 * Common implementation of quiesce(9E) for simple Virtio-based devices.
 */
int
virtio_quiesce(virtio_t *vio)
{
	if (vio->vio_initlevel & VIRTIO_INITLEVEL_SHUTDOWN) {
		/*
		 * Device has already been reset.
		 */
		return (DDI_SUCCESS);
	}

	/*
	 * When we reset the device, it should immediately stop using any DMA
	 * memory we've previously passed to it.  All queue configuration is
	 * discarded.  This is good enough for quiesce(9E).
	 */
	virtio_device_reset_locked(vio);

	return (DDI_SUCCESS);
}

/*
 * DEVICE-SPECIFIC REGISTER ACCESS
 *
 * Note that these functions take the mutex to avoid racing with interrupt
 * enable/disable, when the device-specific offset can potentially change.
 */
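
/*
 * Offsets passed to the routines below are relative to the device-specific
 * configuration region, which begins at "vio_config_offset".  For example,
 * a hypothetical client driver might read a 16-bit device-specific field at
 * byte offset 4 of that region (the offset is illustrative; real offsets
 * come from the specification for the device in question):
 *
 *	uint16_t val = virtio_dev_get16(vio, 4);
 */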

uint8_t
virtio_dev_get8(virtio_t *vio, uintptr_t offset)
{
	mutex_enter(&vio->vio_mutex);
	uint8_t r = virtio_get8(vio, vio->vio_config_offset + offset);
	mutex_exit(&vio->vio_mutex);

	return (r);
}

uint16_t
virtio_dev_get16(virtio_t *vio, uintptr_t offset)
{
	mutex_enter(&vio->vio_mutex);
	uint16_t r = virtio_get16(vio, vio->vio_config_offset + offset);
	mutex_exit(&vio->vio_mutex);

	return (r);
}

uint32_t
virtio_dev_get32(virtio_t *vio, uintptr_t offset)
{
	mutex_enter(&vio->vio_mutex);
	uint32_t r = virtio_get32(vio, vio->vio_config_offset + offset);
	mutex_exit(&vio->vio_mutex);

	return (r);
}

uint64_t
virtio_dev_get64(virtio_t *vio, uintptr_t offset)
{
	mutex_enter(&vio->vio_mutex);
	/*
	 * On at least some systems, a 64-bit read or write to this BAR is not
	 * possible.  For legacy devices, there is no generation number to use
	 * to determine if configuration may have changed half-way through a
	 * read.  We need to continue to read both halves of the value until we
	 * read the same value at least twice.
	 */
	uintptr_t o_lo = vio->vio_config_offset + offset;
	uintptr_t o_hi = o_lo + 4;

	uint64_t val = virtio_get32(vio, o_lo) |
	    ((uint64_t)virtio_get32(vio, o_hi) << 32);

	for (;;) {
		uint64_t tval = virtio_get32(vio, o_lo) |
		    ((uint64_t)virtio_get32(vio, o_hi) << 32);

		if (tval == val) {
			break;
		}

		val = tval;
	}

	mutex_exit(&vio->vio_mutex);
	return (val);
}

void
virtio_dev_put8(virtio_t *vio, uintptr_t offset, uint8_t value)
{
	mutex_enter(&vio->vio_mutex);
	virtio_put8(vio, vio->vio_config_offset + offset, value);
	mutex_exit(&vio->vio_mutex);
}

void
virtio_dev_put16(virtio_t *vio, uintptr_t offset, uint16_t value)
{
	mutex_enter(&vio->vio_mutex);
	virtio_put16(vio, vio->vio_config_offset + offset, value);
	mutex_exit(&vio->vio_mutex);
}

void
virtio_dev_put32(virtio_t *vio, uintptr_t offset, uint32_t value)
{
	mutex_enter(&vio->vio_mutex);
	virtio_put32(vio, vio->vio_config_offset + offset, value);
	mutex_exit(&vio->vio_mutex);
}

/*
 * VIRTQUEUE MANAGEMENT
 */

static int
virtio_inflight_compar(const void *lp, const void *rp)
{
	const virtio_chain_t *l = lp;
	const virtio_chain_t *r = rp;

	if (l->vic_head < r->vic_head) {
		return (-1);
	} else if (l->vic_head > r->vic_head) {
		return (1);
	} else {
		return (0);
	}
}

virtio_queue_t *
virtio_queue_alloc(virtio_t *vio, uint16_t qidx, const char *name,
    ddi_intr_handler_t *func, void *funcarg, boolean_t force_direct,
    uint_t max_segs)
{
	uint16_t qsz;
	char space_name[256];

	if (max_segs < 1) {
		/*
		 * Every descriptor, direct or indirect, needs to refer to at
		 * least one buffer.
		 */
		dev_err(vio->vio_dip, CE_WARN, "queue \"%s\" (%u) "
		    "segment count must be at least 1", name, (uint_t)qidx);
		return (NULL);
	}

	mutex_enter(&vio->vio_mutex);

	if (vio->vio_initlevel & VIRTIO_INITLEVEL_PROVIDER) {
		/*
		 * Cannot configure any more queues once initial setup is
		 * complete and interrupts have been allocated.
		 */
		dev_err(vio->vio_dip, CE_WARN, "queue \"%s\" (%u) "
		    "alloc after init complete", name, (uint_t)qidx);
		mutex_exit(&vio->vio_mutex);
		return (NULL);
	}

	/*
	 * There is no way to negotiate a different queue size for legacy
	 * devices.  We must read and use the native queue size of the device.
	 */
	virtio_put16(vio, VIRTIO_LEGACY_QUEUE_SELECT, qidx);
	if ((qsz = virtio_get16(vio, VIRTIO_LEGACY_QUEUE_SIZE)) == 0) {
		/*
		 * A size of zero means the device does not have a queue with
		 * this index.
		 */
		dev_err(vio->vio_dip, CE_WARN, "queue \"%s\" (%u) "
		    "does not exist on device", name, (uint_t)qidx);
		mutex_exit(&vio->vio_mutex);
		return (NULL);
	}

	mutex_exit(&vio->vio_mutex);

	virtio_queue_t *viq = kmem_zalloc(sizeof (*viq), KM_SLEEP);
	viq->viq_virtio = vio;
	viq->viq_name = name;
	viq->viq_index = qidx;
	viq->viq_size = qsz;
	viq->viq_func = func;
	viq->viq_funcarg = funcarg;
	viq->viq_max_segs = max_segs;
	avl_create(&viq->viq_inflight, virtio_inflight_compar,
	    sizeof (virtio_chain_t), offsetof(virtio_chain_t, vic_node));

	/*
	 * Initialise the mutex without an interrupt priority for now, as we
	 * do with "vio_mutex".  We'll reinitialise it in
	 * "virtio_init_complete()".
	 */
	mutex_init(&viq->viq_mutex, NULL, MUTEX_DRIVER, NULL);

	if (virtio_feature_present(vio, VIRTIO_F_RING_INDIRECT_DESC) &&
	    !force_direct) {
		/*
		 * If we were able to negotiate the indirect descriptor
		 * feature, and the caller has not explicitly forced the use of
		 * direct descriptors, we'll allocate indirect descriptor lists
		 * for each chain.
		 */
		viq->viq_indirect = B_TRUE;
	}

	/*
	 * Track descriptor usage in an identifier space.
	 */
	(void) snprintf(space_name, sizeof (space_name), "%s%d_vq_%s",
	    ddi_get_name(vio->vio_dip), ddi_get_instance(vio->vio_dip), name);
	if ((viq->viq_descmap = id_space_create(space_name, 0, qsz)) == NULL) {
		dev_err(vio->vio_dip, CE_WARN, "could not allocate descriptor "
		    "ID space");
		virtio_queue_free(viq);
		return (NULL);
	}

	/*
	 * For legacy devices, memory for the queue has a strict layout
	 * determined by the queue size.
	 */
	size_t sz_descs = sizeof (virtio_vq_desc_t) * qsz;
	size_t sz_driver = P2ROUNDUP_TYPED(sz_descs +
	    sizeof (virtio_vq_driver_t) +
	    sizeof (uint16_t) * qsz,
	    VIRTIO_PAGE_SIZE, size_t);
	size_t sz_device = P2ROUNDUP_TYPED(sizeof (virtio_vq_device_t) +
	    sizeof (virtio_vq_elem_t) * qsz,
	    VIRTIO_PAGE_SIZE, size_t);

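	/*
	 * As a worked example (illustrative arithmetic only, assuming the
	 * canonical legacy layout of 16-byte descriptors, two-byte driver
	 * ring entries, and eight-byte device ring entries): with a queue
	 * size of 256, sz_descs is 4096 bytes, sz_driver rounds up to 8192
	 * bytes, and sz_device rounds up to 4096 bytes, so the single
	 * allocation below covers the whole queue in 12 KB.
	 */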
	if (virtio_dma_init(vio, &viq->viq_dma, sz_driver + sz_device,
	    &virtio_dma_attr_queue, DDI_DMA_RDWR | DDI_DMA_CONSISTENT,
	    KM_SLEEP) != DDI_SUCCESS) {
		dev_err(vio->vio_dip, CE_WARN, "could not allocate queue "
		    "DMA memory");
		virtio_queue_free(viq);
		return (NULL);
	}

	/*
	 * NOTE: The viq_dma_* members below are used by
	 * VIRTQ_DMA_SYNC_FORDEV() and VIRTQ_DMA_SYNC_FORKERNEL() to calculate
	 * offsets into the DMA allocation for partial synchronisation.  If the
	 * ordering of, or relationship between, these pointers changes, the
	 * macros must be kept in sync.
	 */
	viq->viq_dma_descs = virtio_dma_va(&viq->viq_dma, 0);
	viq->viq_dma_driver = virtio_dma_va(&viq->viq_dma, sz_descs);
	viq->viq_dma_device = virtio_dma_va(&viq->viq_dma, sz_driver);

	/*
	 * Install in the per-device list of queues.
	 */
	mutex_enter(&vio->vio_mutex);
	for (virtio_queue_t *chkvq = list_head(&vio->vio_queues); chkvq != NULL;
	    chkvq = list_next(&vio->vio_queues, chkvq)) {
		if (chkvq->viq_index == qidx) {
			dev_err(vio->vio_dip, CE_WARN, "attempt to register "
			    "queue \"%s\" with same index (%d) as queue \"%s\"",
			    name, qidx, chkvq->viq_name);
			mutex_exit(&vio->vio_mutex);
			virtio_queue_free(viq);
			return (NULL);
		}
	}
	list_insert_tail(&vio->vio_queues, viq);

	/*
	 * Ensure the zeroing of the queue memory is visible to the host before
	 * we inform the device of the queue address.
	 */
	membar_producer();
	VIRTQ_DMA_SYNC_FORDEV(viq);

	virtio_put16(vio, VIRTIO_LEGACY_QUEUE_SELECT, qidx);
	virtio_put32(vio, VIRTIO_LEGACY_QUEUE_ADDRESS,
	    virtio_dma_cookie_pa(&viq->viq_dma, 0) >> VIRTIO_PAGE_SHIFT);

	mutex_exit(&vio->vio_mutex);
	return (viq);
}

static void
virtio_queue_free(virtio_queue_t *viq)
{
	virtio_t *vio = viq->viq_virtio;

	/*
	 * We are going to destroy the queue mutex.  Make sure we've already
	 * removed the interrupt handlers.
	 */
	VERIFY(!(vio->vio_initlevel & VIRTIO_INITLEVEL_INT_ADDED));

	mutex_enter(&viq->viq_mutex);

	/*
	 * If the device has not already been reset as part of a shutdown,
	 * detach the queue from the device now.
	 */
	if (!viq->viq_shutdown) {
		virtio_put16(vio, VIRTIO_LEGACY_QUEUE_SELECT, viq->viq_index);
		virtio_put32(vio, VIRTIO_LEGACY_QUEUE_ADDRESS, 0);
	}

	virtio_dma_fini(&viq->viq_dma);

	VERIFY(avl_is_empty(&viq->viq_inflight));
	avl_destroy(&viq->viq_inflight);
	if (viq->viq_descmap != NULL) {
		id_space_destroy(viq->viq_descmap);
	}

	mutex_exit(&viq->viq_mutex);
	mutex_destroy(&viq->viq_mutex);

	kmem_free(viq, sizeof (*viq));
}

void
virtio_queue_no_interrupt(virtio_queue_t *viq, boolean_t stop_interrupts)
{
	mutex_enter(&viq->viq_mutex);

	if (stop_interrupts) {
		viq->viq_dma_driver->vqdr_flags |= VIRTQ_AVAIL_F_NO_INTERRUPT;
	} else {
		viq->viq_dma_driver->vqdr_flags &= ~VIRTQ_AVAIL_F_NO_INTERRUPT;
	}
	VIRTQ_DMA_SYNC_FORDEV(viq);

	mutex_exit(&viq->viq_mutex);
}

static virtio_chain_t *
virtio_queue_complete(virtio_queue_t *viq, uint_t index)
{
	VERIFY(MUTEX_HELD(&viq->viq_mutex));

	virtio_chain_t *vic;

	virtio_chain_t search;
	bzero(&search, sizeof (search));
	search.vic_head = index;

	if ((vic = avl_find(&viq->viq_inflight, &search, NULL)) == NULL) {
		return (NULL);
	}
	avl_remove(&viq->viq_inflight, vic);

	return (vic);
}

uint_t
virtio_queue_size(virtio_queue_t *viq)
{
	return (viq->viq_size);
}

uint_t
virtio_queue_nactive(virtio_queue_t *viq)
{
	mutex_enter(&viq->viq_mutex);
	uint_t r = avl_numnodes(&viq->viq_inflight);
	mutex_exit(&viq->viq_mutex);

	return (r);
}

virtio_chain_t *
virtio_queue_poll(virtio_queue_t *viq)
{
	mutex_enter(&viq->viq_mutex);
	if (viq->viq_shutdown) {
		/*
		 * The device has been reset by virtio_shutdown(), and queue
		 * processing has been halted.  Any previously submitted chains
		 * will be evacuated using virtio_queue_evacuate().
		 */
		mutex_exit(&viq->viq_mutex);
		return (NULL);
	}

	VIRTQ_DMA_SYNC_FORKERNEL(viq);
	if (viq->viq_device_index == viq->viq_dma_device->vqde_index) {
		/*
		 * If the device index has not changed since the last poll,
		 * there are no new chains to process.
		 */
		mutex_exit(&viq->viq_mutex);
		return (NULL);
	}

	/*
	 * We need to ensure that all reads from the ring (vqde_ring[]), and
	 * from any memory referenced by a descriptor, occur after we have
	 * read the device index value above (vqde_index).
	 */
	membar_consumer();

	uint16_t index = (viq->viq_device_index++) % viq->viq_size;
	uint16_t start = viq->viq_dma_device->vqde_ring[index].vqe_start;
	uint32_t len = viq->viq_dma_device->vqde_ring[index].vqe_len;

	virtio_chain_t *vic;
	if ((vic = virtio_queue_complete(viq, start)) == NULL) {
		/*
		 * We could not locate a chain for this descriptor index, which
		 * suggests that something has gone horribly wrong.
		 */
		dev_err(viq->viq_virtio->vio_dip, CE_PANIC,
		    "queue \"%s\" ring entry %u (descriptor %u) has no chain",
		    viq->viq_name, (uint16_t)index, (uint16_t)start);
	}

	vic->vic_received_length = len;

	mutex_exit(&viq->viq_mutex);

	return (vic);
}

/*
 * After a call to "virtio_shutdown()", the driver must retrieve any previously
 * submitted chains and free any associated resources.
 */
virtio_chain_t *
virtio_queue_evacuate(virtio_queue_t *viq)
{
	virtio_t *vio = viq->viq_virtio;

	mutex_enter(&vio->vio_mutex);
	if (!(vio->vio_initlevel & VIRTIO_INITLEVEL_SHUTDOWN)) {
		dev_err(vio->vio_dip, CE_PANIC,
		    "virtio_queue_evacuate() without virtio_shutdown()");
	}
	mutex_exit(&vio->vio_mutex);

	mutex_enter(&viq->viq_mutex);
	VERIFY(viq->viq_shutdown);

	virtio_chain_t *vic = avl_first(&viq->viq_inflight);
	if (vic != NULL) {
		avl_remove(&viq->viq_inflight, vic);
	}

	mutex_exit(&viq->viq_mutex);

	return (vic);
}

/*
 * VIRTQUEUE DESCRIPTOR CHAIN MANAGEMENT
 */

/*
 * When the device returns a descriptor chain to the driver, it may provide the
 * length in bytes of data written into the chain.  Client drivers should use
 * this value with care; the specification suggests some device implementations
 * have not always provided a useful or correct value.
 */
size_t
virtio_chain_received_length(virtio_chain_t *vic)
{
	return (vic->vic_received_length);
}

/*
 * Allocate a descriptor chain for use with this queue.  The "kmflags" value
 * may be KM_SLEEP or KM_NOSLEEP as per kmem_alloc(9F).
 */
virtio_chain_t *
virtio_chain_alloc(virtio_queue_t *viq, int kmflags)
{
	virtio_t *vio = viq->viq_virtio;
	virtio_chain_t *vic;
	uint_t cap;

	/*
	 * Direct descriptors are known by their index in the descriptor table
	 * for the queue.  We use the variable-length array member at the end
	 * of the chain tracking object to hold the list of direct descriptors
	 * assigned to this chain.
	 */
	if (viq->viq_indirect) {
		/*
		 * When using indirect descriptors we still need one direct
		 * descriptor entry to hold the physical address and length of
		 * the indirect descriptor table.
		 */
		cap = 1;
	} else {
		/*
		 * For direct descriptors we need to be able to track a
		 * descriptor for each possible segment in a single chain.
		 */
		cap = viq->viq_max_segs;
	}

	size_t vicsz = sizeof (*vic) + sizeof (uint16_t) * cap;
	if ((vic = kmem_zalloc(vicsz, kmflags)) == NULL) {
		return (NULL);
	}
	vic->vic_vq = viq;
	vic->vic_direct_capacity = cap;

	if (viq->viq_indirect) {
		/*
		 * Allocate an indirect descriptor list with the appropriate
		 * number of entries.
		 */
		if (virtio_dma_init(vio, &vic->vic_indirect_dma,
		    sizeof (virtio_vq_desc_t) * viq->viq_max_segs,
		    &virtio_dma_attr_indirect,
		    DDI_DMA_CONSISTENT | DDI_DMA_WRITE,
		    kmflags) != DDI_SUCCESS) {
			goto fail;
		}

		/*
		 * Allocate a single descriptor to hold the indirect list.
		 * Leave the length as zero for now; it will be set to include
		 * any occupied entries at push time.
		 */
		mutex_enter(&viq->viq_mutex);
		if (virtio_chain_append_impl(vic,
		    virtio_dma_cookie_pa(&vic->vic_indirect_dma, 0), 0,
		    VIRTQ_DESC_F_INDIRECT) != DDI_SUCCESS) {
			mutex_exit(&viq->viq_mutex);
			goto fail;
		}
		mutex_exit(&viq->viq_mutex);
		VERIFY3U(vic->vic_direct_used, ==, 1);

		/*
		 * Don't set the indirect capacity until after we've installed
		 * the direct descriptor which points at the indirect list, or
		 * virtio_chain_append_impl() will be confused.
		 */
		vic->vic_indirect_capacity = viq->viq_max_segs;
	}

	return (vic);

fail:
	virtio_dma_fini(&vic->vic_indirect_dma);
	kmem_free(vic, vicsz);
	return (NULL);
}
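
/*
 * A sketch of a chain's life cycle in a hypothetical client driver (error
 * handling and the DMA binding that yields "pa" and "len" elided):
 *
 *	virtio_chain_t *vic = virtio_chain_alloc(viq, KM_SLEEP);
 *	virtio_chain_data_set(vic, mybuf);
 *	(void) virtio_chain_append(vic, pa, len, VIRTIO_DIR_DEVICE_WRITES);
 *	virtio_chain_submit(vic, B_TRUE);
 *
 * Once the device returns the chain, e.g. via virtio_queue_poll(), the
 * driver recovers its data pointer with virtio_chain_data(), and may then
 * reset the chain with virtio_chain_clear() for reuse, or release it with
 * virtio_chain_free().
 */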

void *
virtio_chain_data(virtio_chain_t *vic)
{
	return (vic->vic_data);
}

void
virtio_chain_data_set(virtio_chain_t *vic, void *data)
{
	vic->vic_data = data;
}

void
virtio_chain_clear(virtio_chain_t *vic)
{
	if (vic->vic_indirect_capacity != 0) {
		/*
		 * There should only be one direct descriptor, which points at
		 * our indirect descriptor list.  We don't want to clear it
		 * here.
		 */
		VERIFY3U(vic->vic_direct_capacity, ==, 1);

		if (vic->vic_indirect_used > 0) {
			/*
			 * Clear out the indirect descriptor table.
			 */
			vic->vic_indirect_used = 0;
			bzero(virtio_dma_va(&vic->vic_indirect_dma, 0),
			    virtio_dma_size(&vic->vic_indirect_dma));
		}

	} else if (vic->vic_direct_capacity > 0) {
		/*
		 * Release any descriptors that were assigned to us previously.
		 */
		for (uint_t i = 0; i < vic->vic_direct_used; i++) {
			id_free(vic->vic_vq->viq_descmap, vic->vic_direct[i]);
			vic->vic_direct[i] = 0;
		}
		vic->vic_direct_used = 0;
	}
}

void
virtio_chain_free(virtio_chain_t *vic)
{
	/*
	 * First ensure that we have released any descriptors used by this
	 * chain.
	 */
	virtio_chain_clear(vic);

	if (vic->vic_indirect_capacity > 0) {
		/*
		 * Release the direct descriptor that points to our indirect
		 * descriptor list.
		 */
		VERIFY3U(vic->vic_direct_capacity, ==, 1);
		id_free(vic->vic_vq->viq_descmap, vic->vic_direct[0]);

		virtio_dma_fini(&vic->vic_indirect_dma);
	}

	size_t vicsz = sizeof (*vic) +
	    vic->vic_direct_capacity * sizeof (uint16_t);

	kmem_free(vic, vicsz);
}

static inline int
virtio_queue_descmap_alloc(virtio_queue_t *viq, uint_t *indexp)
{
	id_t index;

	if ((index = id_alloc_nosleep(viq->viq_descmap)) == -1) {
		return (ENOMEM);
	}

	VERIFY3S(index, >=, 0);
	VERIFY3S(index, <=, viq->viq_size);

	*indexp = (uint_t)index;
	return (0);
}

static int
virtio_chain_append_impl(virtio_chain_t *vic, uint64_t pa, size_t len,
    uint16_t flags)
{
	virtio_queue_t *viq = vic->vic_vq;
	virtio_vq_desc_t *vqd;
	uint_t index;

	/*
	 * We're modifying the queue-wide descriptor list so make sure we have
	 * the appropriate lock.
	 */
	VERIFY(MUTEX_HELD(&viq->viq_mutex));

	if (vic->vic_indirect_capacity != 0) {
		/*
		 * Use indirect descriptors.
		 */
		if (vic->vic_indirect_used >= vic->vic_indirect_capacity) {
			return (DDI_FAILURE);
		}

		vqd = virtio_dma_va(&vic->vic_indirect_dma, 0);

		if ((index = vic->vic_indirect_used++) > 0) {
			/*
			 * Chain the current last indirect descriptor to the
			 * new one.
			 */
			vqd[index - 1].vqd_flags |= VIRTQ_DESC_F_NEXT;
			vqd[index - 1].vqd_next = index;
		}

	} else {
		/*
		 * Use direct descriptors.
		 */
		if (vic->vic_direct_used >= vic->vic_direct_capacity) {
			return (DDI_FAILURE);
		}

		if (virtio_queue_descmap_alloc(viq, &index) != 0) {
			return (DDI_FAILURE);
		}

		vqd = virtio_dma_va(&viq->viq_dma, 0);

		if (vic->vic_direct_used > 0) {
			/*
			 * This is not the first entry.  Chain the previous
			 * descriptor to this new one.
			 */
			uint16_t p = vic->vic_direct[vic->vic_direct_used - 1];

			vqd[p].vqd_flags |= VIRTQ_DESC_F_NEXT;
			vqd[p].vqd_next = index;
		}
		vic->vic_direct[vic->vic_direct_used++] = index;
	}

	vqd[index].vqd_addr = pa;
	vqd[index].vqd_len = len;
	vqd[index].vqd_flags = flags;
	vqd[index].vqd_next = 0;

	return (DDI_SUCCESS);
}

int
virtio_chain_append(virtio_chain_t *vic, uint64_t pa, size_t len,
    virtio_direction_t dir)
{
	virtio_queue_t *viq = vic->vic_vq;
	uint16_t flags = 0;

	switch (dir) {
	case VIRTIO_DIR_DEVICE_WRITES:
		flags |= VIRTQ_DESC_F_WRITE;
		break;

	case VIRTIO_DIR_DEVICE_READS:
		break;

	default:
		panic("unknown direction value %u", dir);
	}

	mutex_enter(&viq->viq_mutex);
	int r = virtio_chain_append_impl(vic, pa, len, flags);
	mutex_exit(&viq->viq_mutex);

	return (r);
}

static void
virtio_queue_flush_locked(virtio_queue_t *viq)
{
	VERIFY(MUTEX_HELD(&viq->viq_mutex));

	/*
	 * Make sure any writes we have just made to the descriptors
	 * (vqdr_ring[]) are visible to the device before we update the ring
	 * pointer (vqdr_index).
	 */
	membar_producer();
	viq->viq_dma_driver->vqdr_index = viq->viq_driver_index;
	VIRTQ_DMA_SYNC_FORDEV(viq);

	/*
	 * Determine whether the device expects us to notify it of new
	 * descriptors.
	 */
	VIRTQ_DMA_SYNC_FORKERNEL(viq);
	if (!(viq->viq_dma_device->vqde_flags & VIRTQ_USED_F_NO_NOTIFY)) {
		virtio_put16(viq->viq_virtio, VIRTIO_LEGACY_QUEUE_NOTIFY,
		    viq->viq_index);
	}
}

void
virtio_queue_flush(virtio_queue_t *viq)
{
	mutex_enter(&viq->viq_mutex);
	virtio_queue_flush_locked(viq);
	mutex_exit(&viq->viq_mutex);
}

void
virtio_chain_submit(virtio_chain_t *vic, boolean_t flush)
{
	virtio_queue_t *viq = vic->vic_vq;

	mutex_enter(&viq->viq_mutex);

	if (vic->vic_indirect_capacity != 0) {
		virtio_vq_desc_t *vqd = virtio_dma_va(&viq->viq_dma, 0);

		VERIFY3U(vic->vic_direct_used, ==, 1);

		/*
		 * This is an indirect descriptor queue.  The length in bytes
		 * of the descriptor must extend to cover the populated
		 * indirect descriptor entries.
		 */
		vqd[vic->vic_direct[0]].vqd_len =
		    sizeof (virtio_vq_desc_t) * vic->vic_indirect_used;

		virtio_dma_sync(&vic->vic_indirect_dma, DDI_DMA_SYNC_FORDEV);
	}

	/*
	 * Populate the next available slot in the driver-owned ring for this
	 * chain.  The updated value of viq_driver_index will not become
	 * visible to the device until a subsequent queue flush.
	 */
	uint16_t index = (viq->viq_driver_index++) % viq->viq_size;
	viq->viq_dma_driver->vqdr_ring[index] = vic->vic_direct[0];

	vic->vic_head = vic->vic_direct[0];
	avl_add(&viq->viq_inflight, vic);

	if (flush) {
		virtio_queue_flush_locked(vic->vic_vq);
	}

	mutex_exit(&viq->viq_mutex);
}

/*
 * INTERRUPTS MANAGEMENT
 */
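
/*
 * The expected client driver flow (illustrative only) is to register a
 * per-queue handler via virtio_queue_alloc(), let virtio_init_complete()
 * allocate the interrupts, and then enable delivery:
 *
 *	(void) virtio_init_complete(vio, DDI_INTR_TYPE_MSIX);
 *	(void) virtio_interrupts_enable(vio);
 *
 * Passing 0 as the allowed type mask to virtio_init_complete() lets the
 * framework try MSI-X first and then fall back to a fixed interrupt.
 */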

static const char *
virtio_interrupt_type_name(int type)
{
	switch (type) {
	case DDI_INTR_TYPE_MSIX:
		return ("MSI-X");
	case DDI_INTR_TYPE_MSI:
		return ("MSI");
	case DDI_INTR_TYPE_FIXED:
		return ("fixed");
	default:
		return ("?");
	}
}

static int
virtio_interrupts_alloc(virtio_t *vio, int type, int nrequired)
{
	dev_info_t *dip = vio->vio_dip;
	int nintrs = 0;
	int navail = 0;

	VERIFY(MUTEX_HELD(&vio->vio_mutex));
	VERIFY(!(vio->vio_initlevel & VIRTIO_INITLEVEL_INT_ALLOC));

	if (ddi_intr_get_nintrs(dip, type, &nintrs) != DDI_SUCCESS) {
		dev_err(dip, CE_WARN, "could not count %s interrupts",
		    virtio_interrupt_type_name(type));
		return (DDI_FAILURE);
	}
	if (nintrs < 1) {
		dev_err(dip, CE_WARN, "no %s interrupts supported",
		    virtio_interrupt_type_name(type));
		return (DDI_FAILURE);
	}

	if (ddi_intr_get_navail(dip, type, &navail) != DDI_SUCCESS) {
		dev_err(dip, CE_WARN, "could not count available %s interrupts",
		    virtio_interrupt_type_name(type));
		return (DDI_FAILURE);
	}
	if (navail < nrequired) {
		dev_err(dip, CE_WARN, "need %d %s interrupts, but only %d "
		    "available", nrequired, virtio_interrupt_type_name(type),
		    navail);
		return (DDI_FAILURE);
	}

	VERIFY3P(vio->vio_interrupts, ==, NULL);
	vio->vio_interrupts = kmem_zalloc(
	    sizeof (ddi_intr_handle_t) * nrequired, KM_SLEEP);

	int r;
	if ((r = ddi_intr_alloc(dip, vio->vio_interrupts, type, 0, nrequired,
	    &vio->vio_ninterrupts, DDI_INTR_ALLOC_STRICT)) != DDI_SUCCESS) {
		dev_err(dip, CE_WARN, "%s interrupt allocation failure (%d)",
		    virtio_interrupt_type_name(type), r);
		kmem_free(vio->vio_interrupts,
		    sizeof (ddi_intr_handle_t) * nrequired);
		vio->vio_interrupts = NULL;
		return (DDI_FAILURE);
	}

	vio->vio_initlevel |= VIRTIO_INITLEVEL_INT_ALLOC;
	vio->vio_interrupt_type = type;
	return (DDI_SUCCESS);
}

static uint_t
virtio_shared_isr(caddr_t arg0, caddr_t arg1)
{
	virtio_t *vio = (virtio_t *)arg0;
	uint_t r = DDI_INTR_UNCLAIMED;
	uint8_t isr;

	mutex_enter(&vio->vio_mutex);

	/*
	 * Check the ISR status to see if the interrupt applies to us.  Reading
	 * this field resets it to zero.
	 */
	isr = virtio_get8(vio, VIRTIO_LEGACY_ISR_STATUS);
	if ((isr & VIRTIO_ISR_CHECK_QUEUES) == 0) {
		goto done;
	}

	for (virtio_queue_t *viq = list_head(&vio->vio_queues); viq != NULL;
	    viq = list_next(&vio->vio_queues, viq)) {
		if (viq->viq_func != NULL) {
			mutex_exit(&vio->vio_mutex);
			if (viq->viq_func(viq->viq_funcarg, arg0) ==
			    DDI_INTR_CLAIMED) {
				r = DDI_INTR_CLAIMED;
			}
			mutex_enter(&vio->vio_mutex);

			if (vio->vio_initlevel & VIRTIO_INITLEVEL_SHUTDOWN) {
				/*
				 * The device was shut down while in a queue
				 * handler routine.
				 */
				goto done;
			}
		}
	}

done:
	mutex_exit(&vio->vio_mutex);
	return (r);
}

static int
virtio_interrupts_setup(virtio_t *vio, int allow_types)
{
	dev_info_t *dip = vio->vio_dip;
	int types;
	int count = 0;

	mutex_enter(&vio->vio_mutex);

	/*
	 * Determine the number of interrupts we'd like based on the number of
	 * virtqueues.
	 */
	for (virtio_queue_t *viq = list_head(&vio->vio_queues); viq != NULL;
	    viq = list_next(&vio->vio_queues, viq)) {
		if (viq->viq_func != NULL) {
			count++;
		}
	}

	if (ddi_intr_get_supported_types(dip, &types) != DDI_SUCCESS) {
		dev_err(dip, CE_WARN, "could not get supported interrupts");
		mutex_exit(&vio->vio_mutex);
		return (DDI_FAILURE);
	}

	if (allow_types != 0) {
		/*
		 * Restrict the possible interrupt types at the request of the
		 * driver.
		 */
		types &= allow_types;
	}

	/*
	 * Try each potential interrupt type in descending order of preference.
	 * Note that the specification does not appear to allow for the use of
	 * classical MSI, so we are limited to either MSI-X or fixed
	 * interrupts.
	 */
	if (types & DDI_INTR_TYPE_MSIX) {
		if (virtio_interrupts_alloc(vio, DDI_INTR_TYPE_MSIX,
		    count) == DDI_SUCCESS) {
			goto add_handlers;
		}
	}
	if (types & DDI_INTR_TYPE_FIXED) {
		/*
		 * If fixed interrupts are all that are available, we'll just
		 * ask for one.
		 */
		if (virtio_interrupts_alloc(vio, DDI_INTR_TYPE_FIXED, 1) ==
		    DDI_SUCCESS) {
			goto add_handlers;
		}
	}

	dev_err(dip, CE_WARN, "interrupt allocation failed");
	mutex_exit(&vio->vio_mutex);
	return (DDI_FAILURE);

add_handlers:
	/*
	 * Ensure that we have not been given any high-level interrupts as our
	 * interrupt handlers do not support them.
	 */
	for (int i = 0; i < vio->vio_ninterrupts; i++) {
		uint_t ipri;

		if (ddi_intr_get_pri(vio->vio_interrupts[i], &ipri) !=
		    DDI_SUCCESS) {
			dev_err(dip, CE_WARN, "could not determine interrupt "
			    "priority");
			goto fail;
		}

		if (ipri >= ddi_intr_get_hilevel_pri()) {
			dev_err(dip, CE_WARN, "high level interrupts not "
			    "supported");
			goto fail;
		}

		/*
		 * Record the highest priority we've been allocated to use for
		 * mutex initialisation.
		 */
		if (i == 0 || ipri > vio->vio_interrupt_priority) {
			vio->vio_interrupt_priority = ipri;
		}
	}

	/*
	 * Get the interrupt capabilities from the first handle to determine
	 * whether we need to use ddi_intr_block_enable(9F).
	 */
	if (ddi_intr_get_cap(vio->vio_interrupts[0],
	    &vio->vio_interrupt_cap) != DDI_SUCCESS) {
		dev_err(dip, CE_WARN, "failed to get interrupt capabilities");
		goto fail;
	}

	if (vio->vio_interrupt_type == DDI_INTR_TYPE_FIXED) {
		VERIFY3S(vio->vio_ninterrupts, ==, 1);
		/*
		 * For fixed interrupts, we need to use our shared handler to
		 * multiplex the per-queue handlers provided by the driver.
		 */
		if (ddi_intr_add_handler(vio->vio_interrupts[0],
		    virtio_shared_isr, (caddr_t)vio, NULL) != DDI_SUCCESS) {
			dev_err(dip, CE_WARN, "adding shared %s interrupt "
			    "handler failed", virtio_interrupt_type_name(
			    vio->vio_interrupt_type));
			goto fail;
		}

		goto done;
	}

	VERIFY3S(vio->vio_ninterrupts, ==, count);

	uint_t n = 0;
	for (virtio_queue_t *viq = list_head(&vio->vio_queues); viq != NULL;
	    viq = list_next(&vio->vio_queues, viq)) {
		if (viq->viq_func == NULL) {
			continue;
		}

		if (ddi_intr_add_handler(vio->vio_interrupts[n],
		    viq->viq_func, (caddr_t)viq->viq_funcarg,
		    (caddr_t)vio) != DDI_SUCCESS) {
			dev_err(dip, CE_WARN, "adding interrupt %u (%s) failed",
			    n, viq->viq_name);
			goto fail;
		}

		viq->viq_handler_index = n;
		viq->viq_handler_added = B_TRUE;
		n++;
	}

done:
	vio->vio_initlevel |= VIRTIO_INITLEVEL_INT_ADDED;
	mutex_exit(&vio->vio_mutex);
	return (DDI_SUCCESS);

fail:
	virtio_interrupts_teardown(vio);
	mutex_exit(&vio->vio_mutex);
	return (DDI_FAILURE);
}

static void
virtio_interrupts_teardown(virtio_t *vio)
{
	VERIFY(MUTEX_HELD(&vio->vio_mutex));

	virtio_interrupts_disable_locked(vio);

	if (vio->vio_interrupt_type == DDI_INTR_TYPE_FIXED) {
		/*
		 * Remove the multiplexing interrupt handler.
		 */
		if (vio->vio_initlevel & VIRTIO_INITLEVEL_INT_ADDED) {
			int r;

			VERIFY3S(vio->vio_ninterrupts, ==, 1);

			if ((r = ddi_intr_remove_handler(
			    vio->vio_interrupts[0])) != DDI_SUCCESS) {
				dev_err(vio->vio_dip, CE_WARN, "removing "
				    "shared interrupt handler failed (%d)", r);
			}
		}
	} else {
		for (virtio_queue_t *viq = list_head(&vio->vio_queues);
		    viq != NULL; viq = list_next(&vio->vio_queues, viq)) {
			int r;

			if (!viq->viq_handler_added) {
				continue;
			}

			if ((r = ddi_intr_remove_handler(
			    vio->vio_interrupts[viq->viq_handler_index])) !=
			    DDI_SUCCESS) {
				dev_err(vio->vio_dip, CE_WARN, "removing "
				    "interrupt handler (%s) failed (%d)",
				    viq->viq_name, r);
			}

			viq->viq_handler_added = B_FALSE;
		}
	}
	vio->vio_initlevel &= ~VIRTIO_INITLEVEL_INT_ADDED;

	if (vio->vio_initlevel & VIRTIO_INITLEVEL_INT_ALLOC) {
		for (int i = 0; i < vio->vio_ninterrupts; i++) {
			int r;

			if ((r = ddi_intr_free(vio->vio_interrupts[i])) !=
			    DDI_SUCCESS) {
				dev_err(vio->vio_dip, CE_WARN, "freeing "
				    "interrupt %u failed (%d)", i, r);
			}
		}
		kmem_free(vio->vio_interrupts,
		    sizeof (ddi_intr_handle_t) * vio->vio_ninterrupts);
		vio->vio_interrupts = NULL;
		vio->vio_ninterrupts = 0;
		vio->vio_interrupt_type = 0;
		vio->vio_interrupt_cap = 0;
		vio->vio_interrupt_priority = 0;

		vio->vio_initlevel &= ~VIRTIO_INITLEVEL_INT_ALLOC;
	}
}

static void
virtio_interrupts_unwind(virtio_t *vio)
{
	VERIFY(MUTEX_HELD(&vio->vio_mutex));

	if (vio->vio_interrupt_type == DDI_INTR_TYPE_MSIX) {
		for (virtio_queue_t *viq = list_head(&vio->vio_queues);
		    viq != NULL; viq = list_next(&vio->vio_queues, viq)) {
			if (!viq->viq_handler_added) {
				continue;
			}

			virtio_put16(vio, VIRTIO_LEGACY_QUEUE_SELECT,
			    viq->viq_index);
			virtio_put16(vio, VIRTIO_LEGACY_MSIX_QUEUE,
			    VIRTIO_LEGACY_MSI_NO_VECTOR);
		}
	}

	if (vio->vio_interrupt_cap & DDI_INTR_FLAG_BLOCK) {
		(void) ddi_intr_block_disable(vio->vio_interrupts,
		    vio->vio_ninterrupts);
	} else {
		for (int i = 0; i < vio->vio_ninterrupts; i++) {
			(void) ddi_intr_disable(vio->vio_interrupts[i]);
		}
	}

	/*
	 * Disabling the interrupts makes the MSI-X fields disappear from the
	 * BAR once more.
	 */
	vio->vio_config_offset = VIRTIO_LEGACY_CFG_OFFSET;
}

int
virtio_interrupts_enable(virtio_t *vio)
{
	mutex_enter(&vio->vio_mutex);
	if (vio->vio_initlevel & VIRTIO_INITLEVEL_INT_ENABLED) {
		mutex_exit(&vio->vio_mutex);
		return (DDI_SUCCESS);
	}

	int r = DDI_SUCCESS;
	if (vio->vio_interrupt_cap & DDI_INTR_FLAG_BLOCK) {
		r = ddi_intr_block_enable(vio->vio_interrupts,
		    vio->vio_ninterrupts);
	} else {
		for (int i = 0; i < vio->vio_ninterrupts; i++) {
			if ((r = ddi_intr_enable(vio->vio_interrupts[i])) !=
			    DDI_SUCCESS) {
				/*
				 * Disable the interrupts we have enabled so
				 * far.
				 */
				for (i--; i >= 0; i--) {
					(void) ddi_intr_disable(
					    vio->vio_interrupts[i]);
				}
				break;
			}
		}
	}

	if (r != DDI_SUCCESS) {
		mutex_exit(&vio->vio_mutex);
		return (r);
	}

	if (vio->vio_interrupt_type == DDI_INTR_TYPE_MSIX) {
		/*
		 * When asked to enable the interrupts, the system enables
		 * MSI-X in the PCI configuration for the device.  While
		 * enabled, the extra MSI-X configuration table fields appear
		 * between the general and the device-specific regions of the
		 * BAR.
		 */
		vio->vio_config_offset = VIRTIO_LEGACY_CFG_OFFSET_MSIX;

		for (virtio_queue_t *viq = list_head(&vio->vio_queues);
		    viq != NULL; viq = list_next(&vio->vio_queues, viq)) {
			if (!viq->viq_handler_added) {
				continue;
			}

			uint16_t qi = viq->viq_index;
			uint16_t msi = viq->viq_handler_index;

			/*
			 * Route interrupts for this queue to the assigned
			 * MSI-X vector number.
			 */
			virtio_put16(vio, VIRTIO_LEGACY_QUEUE_SELECT, qi);
			virtio_put16(vio, VIRTIO_LEGACY_MSIX_QUEUE, msi);

			/*
			 * The device may not actually accept the vector number
			 * we're attempting to program.  We need to confirm
			 * that configuration was successful by re-reading the
			 * configuration we just wrote.
			 */
			if (virtio_get16(vio, VIRTIO_LEGACY_MSIX_QUEUE) !=
			    msi) {
				dev_err(vio->vio_dip, CE_WARN,
				    "failed to configure MSI-X vector %u for "
				    "queue \"%s\" (#%u)", (uint_t)msi,
				    viq->viq_name, (uint_t)qi);

				virtio_interrupts_unwind(vio);
				mutex_exit(&vio->vio_mutex);
				return (DDI_FAILURE);
			}
		}
	}

	vio->vio_initlevel |= VIRTIO_INITLEVEL_INT_ENABLED;

	mutex_exit(&vio->vio_mutex);
	return (DDI_SUCCESS);
}

static void
virtio_interrupts_disable_locked(virtio_t *vio)
{
	VERIFY(MUTEX_HELD(&vio->vio_mutex));

	if (!(vio->vio_initlevel & VIRTIO_INITLEVEL_INT_ENABLED)) {
		return;
	}

	virtio_interrupts_unwind(vio);

	vio->vio_initlevel &= ~VIRTIO_INITLEVEL_INT_ENABLED;
}

void
virtio_interrupts_disable(virtio_t *vio)
{
	mutex_enter(&vio->vio_mutex);
	virtio_interrupts_disable_locked(vio);
	mutex_exit(&vio->vio_mutex);
}