xref: /illumos-gate/usr/src/uts/common/io/virtio/virtio_main.c (revision 1bff1300cebf1ea8e11ce928b10e208097e67f24)
1 /*
2  * This file and its contents are supplied under the terms of the
3  * Common Development and Distribution License ("CDDL"), version 1.0.
4  * You may only use this file in accordance with the terms of version
5  * 1.0 of the CDDL.
6  *
7  * A full copy of the text of the CDDL should have accompanied this
8  * source.  A copy of the CDDL is also available via the Internet at
9  * http://www.illumos.org/license/CDDL.
10  */
11 
12 /*
13  * Copyright 2019 Joyent, Inc.
14  */
15 
16 /*
17  * VIRTIO FRAMEWORK
18  *
19  * For design and usage documentation, see the comments in "virtio.h".
20  */
21 
22 #include <sys/conf.h>
23 #include <sys/kmem.h>
24 #include <sys/debug.h>
25 #include <sys/modctl.h>
26 #include <sys/autoconf.h>
27 #include <sys/ddi_impldefs.h>
28 #include <sys/ddi.h>
29 #include <sys/sunddi.h>
30 #include <sys/sunndi.h>
31 #include <sys/avintr.h>
32 #include <sys/spl.h>
33 #include <sys/promif.h>
34 #include <sys/list.h>
35 #include <sys/bootconf.h>
36 #include <sys/bootsvcs.h>
37 #include <sys/sysmacros.h>
38 #include <sys/pci.h>
39 
40 #include "virtio.h"
41 #include "virtio_impl.h"
42 
43 
44 /*
45  * Linkage structures
46  */
47 static struct modlmisc virtio_modlmisc = {
48 	.misc_modops =			&mod_miscops,
49 	.misc_linkinfo =		"VIRTIO common routines",
50 };
51 
52 static struct modlinkage virtio_modlinkage = {
53 	.ml_rev =			MODREV_1,
54 	.ml_linkage =			{ &virtio_modlmisc, NULL }
55 };
56 
57 int
58 _init(void)
59 {
60 	return (mod_install(&virtio_modlinkage));
61 }
62 
63 int
64 _fini(void)
65 {
66 	return (mod_remove(&virtio_modlinkage));
67 }
68 
69 int
70 _info(struct modinfo *modinfop)
71 {
72 	return (mod_info(&virtio_modlinkage, modinfop));
73 }
74 
75 
76 
77 static void virtio_set_status(virtio_t *, uint8_t);
78 static int virtio_chain_append_impl(virtio_chain_t *, uint64_t, size_t,
79     uint16_t);
80 static int virtio_interrupts_setup(virtio_t *, int);
81 static void virtio_interrupts_teardown(virtio_t *);
82 static void virtio_interrupts_disable_locked(virtio_t *);
83 static void virtio_queue_free(virtio_queue_t *);
84 static void virtio_device_reset_locked(virtio_t *);
85 
86 /*
87  * We use the same device access attributes for BAR mapping and access to the
88  * virtqueue memory.
89  */
90 ddi_device_acc_attr_t virtio_acc_attr = {
91 	.devacc_attr_version =		DDI_DEVICE_ATTR_V1,
92 	.devacc_attr_endian_flags =	DDI_NEVERSWAP_ACC,
93 	.devacc_attr_dataorder =	DDI_STORECACHING_OK_ACC,
94 	.devacc_attr_access =		DDI_DEFAULT_ACC
95 };
96 
97 
98 /*
99  * DMA attributes for the memory given to the device for queue management.
100  */
101 ddi_dma_attr_t virtio_dma_attr_queue = {
102 	.dma_attr_version =		DMA_ATTR_V0,
103 	.dma_attr_addr_lo =		0x0000000000000000,
104 	/*
105 	 * Queue memory is aligned on VIRTIO_PAGE_SIZE with the address shifted
106 	 * down by VIRTIO_PAGE_SHIFT before being passed to the device in a
107 	 * 32-bit register.
108 	 */
109 	.dma_attr_addr_hi =		0x00000FFFFFFFF000,
110 	.dma_attr_count_max =		0x00000000FFFFFFFF,
111 	.dma_attr_align =		VIRTIO_PAGE_SIZE,
112 	.dma_attr_burstsizes =		1,
113 	.dma_attr_minxfer =		1,
114 	.dma_attr_maxxfer =		0x00000000FFFFFFFF,
115 	.dma_attr_seg =			0x00000000FFFFFFFF,
116 	.dma_attr_sgllen =		1,
117 	.dma_attr_granular =		1,
118 	.dma_attr_flags =		0
119 };
120 
121 /*
122  * DMA attributes for the the allocation of indirect descriptor lists.  The
123  * indirect list is referenced by a regular descriptor entry: the physical
124  * address field is 64 bits wide, but the length field is only 32 bits.  Each
125  * descriptor is 16 bytes long.
126  */
127 ddi_dma_attr_t virtio_dma_attr_indirect = {
128 	.dma_attr_version =		DMA_ATTR_V0,
129 	.dma_attr_addr_lo =		0x0000000000000000,
130 	.dma_attr_addr_hi =		0xFFFFFFFFFFFFFFFF,
131 	.dma_attr_count_max =		0x00000000FFFFFFFF,
132 	.dma_attr_align =		sizeof (struct virtio_vq_desc),
133 	.dma_attr_burstsizes =		1,
134 	.dma_attr_minxfer =		1,
135 	.dma_attr_maxxfer =		0x00000000FFFFFFFF,
136 	.dma_attr_seg =			0x00000000FFFFFFFF,
137 	.dma_attr_sgllen =		1,
138 	.dma_attr_granular =		1,
139 	.dma_attr_flags =		0
140 };
141 
142 
143 uint8_t
144 virtio_get8(virtio_t *vio, uintptr_t offset)
145 {
146 	return (ddi_get8(vio->vio_barh, (uint8_t *)(vio->vio_bar + offset)));
147 }
148 
149 uint16_t
150 virtio_get16(virtio_t *vio, uintptr_t offset)
151 {
152 	return (ddi_get16(vio->vio_barh, (uint16_t *)(vio->vio_bar + offset)));
153 }
154 
155 uint32_t
156 virtio_get32(virtio_t *vio, uintptr_t offset)
157 {
158 	return (ddi_get32(vio->vio_barh, (uint32_t *)(vio->vio_bar + offset)));
159 }
160 
161 void
162 virtio_put8(virtio_t *vio, uintptr_t offset, uint8_t value)
163 {
164 	ddi_put8(vio->vio_barh, (uint8_t *)(vio->vio_bar + offset), value);
165 }
166 
167 void
168 virtio_put16(virtio_t *vio, uintptr_t offset, uint16_t value)
169 {
170 	ddi_put16(vio->vio_barh, (uint16_t *)(vio->vio_bar + offset), value);
171 }
172 
173 void
174 virtio_put32(virtio_t *vio, uintptr_t offset, uint32_t value)
175 {
176 	ddi_put32(vio->vio_barh, (uint32_t *)(vio->vio_bar + offset), value);
177 }
178 
179 void
180 virtio_fini(virtio_t *vio, boolean_t failed)
181 {
182 	mutex_enter(&vio->vio_mutex);
183 
184 	virtio_interrupts_teardown(vio);
185 
186 	virtio_queue_t *viq;
187 	while ((viq = list_remove_head(&vio->vio_queues)) != NULL) {
188 		virtio_queue_free(viq);
189 	}
190 	list_destroy(&vio->vio_queues);
191 
192 	if (failed) {
193 		/*
194 		 * Signal to the host that device setup failed.
195 		 */
196 		virtio_set_status(vio, VIRTIO_STATUS_FAILED);
197 	} else {
198 		virtio_device_reset_locked(vio);
199 	}
200 
201 	/*
202 	 * We don't need to do anything for the provider initlevel, as it
203 	 * merely records the fact that virtio_init_complete() was called.
204 	 */
205 	vio->vio_initlevel &= ~VIRTIO_INITLEVEL_PROVIDER;
206 
207 	if (vio->vio_initlevel & VIRTIO_INITLEVEL_REGS) {
208 		/*
209 		 * Unmap PCI BAR0.
210 		 */
211 		ddi_regs_map_free(&vio->vio_barh);
212 
213 		vio->vio_initlevel &= ~VIRTIO_INITLEVEL_REGS;
214 	}
215 
216 	/*
217 	 * Ensure we have torn down everything we set up.
218 	 */
219 	vio->vio_initlevel &= ~VIRTIO_INITLEVEL_SHUTDOWN;
220 	VERIFY0(vio->vio_initlevel);
221 
222 	mutex_exit(&vio->vio_mutex);
223 	mutex_destroy(&vio->vio_mutex);
224 
225 	kmem_free(vio, sizeof (*vio));
226 }
227 
228 /*
229  * Early device initialisation for legacy (pre-1.0 specification) virtio
230  * devices.
231  */
232 virtio_t *
233 virtio_init(dev_info_t *dip, uint64_t driver_features, boolean_t allow_indirect)
234 {
235 	int r;
236 
237 	/*
238 	 * First, confirm that this is a legacy device.
239 	 */
240 	ddi_acc_handle_t pci;
241 	if (pci_config_setup(dip, &pci) != DDI_SUCCESS) {
242 		dev_err(dip, CE_WARN, "pci_config_setup failed");
243 		return (NULL);
244 	}
245 
246 	uint8_t revid;
247 	if ((revid = pci_config_get8(pci, PCI_CONF_REVID)) == PCI_EINVAL8) {
248 		dev_err(dip, CE_WARN, "could not read config space");
249 		pci_config_teardown(&pci);
250 		return (NULL);
251 	}
252 
253 	pci_config_teardown(&pci);
254 
255 	/*
256 	 * The legacy specification requires that the device advertise as PCI
257 	 * Revision 0.
258 	 */
259 	if (revid != 0) {
260 		dev_err(dip, CE_WARN, "PCI Revision %u incorrect for "
261 		    "legacy virtio device", (uint_t)revid);
262 		return (NULL);
263 	}
264 
265 	virtio_t *vio = kmem_zalloc(sizeof (*vio), KM_SLEEP);
266 	vio->vio_dip = dip;
267 
268 	/*
269 	 * Map PCI BAR0 for legacy device access.
270 	 */
271 	if ((r = ddi_regs_map_setup(dip, VIRTIO_LEGACY_PCI_BAR0,
272 	    (caddr_t *)&vio->vio_bar, 0, 0, &virtio_acc_attr,
273 	    &vio->vio_barh)) != DDI_SUCCESS) {
274 		dev_err(dip, CE_WARN, "ddi_regs_map_setup failure (%d)", r);
275 		kmem_free(vio, sizeof (*vio));
276 		return (NULL);
277 	}
278 	vio->vio_initlevel |= VIRTIO_INITLEVEL_REGS;
279 
280 	/*
281 	 * We initialise the mutex without an interrupt priority to ease the
282 	 * implementation of some of the configuration space access routines.
283 	 * Drivers using the virtio framework MUST make a call to
284 	 * "virtio_init_complete()" prior to spawning other threads or enabling
285 	 * interrupt handlers, at which time we will destroy and reinitialise
286 	 * the mutex for use in our interrupt handlers.
287 	 */
288 	mutex_init(&vio->vio_mutex, NULL, MUTEX_DRIVER, NULL);
289 
290 	list_create(&vio->vio_queues, sizeof (virtio_queue_t),
291 	    offsetof(virtio_queue_t, viq_link));
292 
293 	/*
294 	 * Legacy virtio devices require a few common steps before we can
295 	 * negotiate device features.
296 	 */
297 	virtio_device_reset(vio);
298 	virtio_set_status(vio, VIRTIO_STATUS_ACKNOWLEDGE);
299 	virtio_set_status(vio, VIRTIO_STATUS_DRIVER);
300 
301 	/*
302 	 * Negotiate features with the device.  Record the original supported
303 	 * feature set for debugging purposes.
304 	 */
305 	vio->vio_features_device = virtio_get32(vio,
306 	    VIRTIO_LEGACY_FEATURES_DEVICE);
307 	if (allow_indirect) {
308 		driver_features |= VIRTIO_F_RING_INDIRECT_DESC;
309 	}
310 	vio->vio_features = vio->vio_features_device & driver_features;
311 	virtio_put32(vio, VIRTIO_LEGACY_FEATURES_DRIVER, vio->vio_features);
312 
313 	/*
314 	 * The device-specific configuration begins at an offset into the BAR
315 	 * that depends on whether we have enabled MSI-X interrupts or not.
316 	 * Start out with the offset for pre-MSI-X operation so that we can
317 	 * read device configuration space prior to configuring interrupts.
318 	 */
319 	vio->vio_config_offset = VIRTIO_LEGACY_CFG_OFFSET;
320 
321 	return (vio);
322 }
323 
324 /*
325  * This function must be called by the driver once it has completed early setup
326  * calls.  The value of "allowed_interrupt_types" is a mask of interrupt types
327  * (DDI_INTR_TYPE_MSIX, etc) that we'll try to use when installing handlers, or
328  * the special value 0 to allow the system to use any available type.
329  */
330 int
331 virtio_init_complete(virtio_t *vio, int allowed_interrupt_types)
332 {
333 	VERIFY(!(vio->vio_initlevel & VIRTIO_INITLEVEL_PROVIDER));
334 	vio->vio_initlevel |= VIRTIO_INITLEVEL_PROVIDER;
335 
336 	if (!list_is_empty(&vio->vio_queues)) {
337 		/*
338 		 * Set up interrupts for the queues that have been registered.
339 		 */
340 		if (virtio_interrupts_setup(vio, allowed_interrupt_types) !=
341 		    DDI_SUCCESS) {
342 			return (DDI_FAILURE);
343 		}
344 	}
345 
346 	/*
347 	 * We can allocate the mutex once we know the priority.
348 	 */
349 	mutex_destroy(&vio->vio_mutex);
350 	mutex_init(&vio->vio_mutex, NULL, MUTEX_DRIVER, virtio_intr_pri(vio));
351 	for (virtio_queue_t *viq = list_head(&vio->vio_queues); viq != NULL;
352 	    viq = list_next(&vio->vio_queues, viq)) {
353 		mutex_destroy(&viq->viq_mutex);
354 		mutex_init(&viq->viq_mutex, NULL, MUTEX_DRIVER,
355 		    virtio_intr_pri(vio));
356 	}
357 
358 	virtio_set_status(vio, VIRTIO_STATUS_DRIVER_OK);
359 
360 	return (DDI_SUCCESS);
361 }
362 
363 boolean_t
364 virtio_feature_present(virtio_t *vio, uint64_t feature_mask)
365 {
366 	return ((vio->vio_features & feature_mask) != 0);
367 }
368 
369 void *
370 virtio_intr_pri(virtio_t *vio)
371 {
372 	VERIFY(vio->vio_initlevel & VIRTIO_INITLEVEL_INT_ADDED);
373 
374 	return (DDI_INTR_PRI(vio->vio_interrupt_priority));
375 }
376 
377 /*
378  * Enable a bit in the device status register.  Each bit signals a level of
379  * guest readiness to the host.  Use the VIRTIO_CONFIG_DEVICE_STATUS_*
380  * constants for "status".  To zero the status field use virtio_device_reset().
381  */
382 static void
383 virtio_set_status(virtio_t *vio, uint8_t status)
384 {
385 	VERIFY3U(status, !=, 0);
386 
387 	mutex_enter(&vio->vio_mutex);
388 
389 	uint8_t old = virtio_get8(vio, VIRTIO_LEGACY_DEVICE_STATUS);
390 	virtio_put8(vio, VIRTIO_LEGACY_DEVICE_STATUS, status | old);
391 
392 	mutex_exit(&vio->vio_mutex);
393 }
394 
395 static void
396 virtio_device_reset_locked(virtio_t *vio)
397 {
398 	virtio_put8(vio, VIRTIO_LEGACY_DEVICE_STATUS, VIRTIO_STATUS_RESET);
399 }
400 
401 void
402 virtio_device_reset(virtio_t *vio)
403 {
404 	mutex_enter(&vio->vio_mutex);
405 	virtio_device_reset_locked(vio);
406 	mutex_exit(&vio->vio_mutex);
407 }
408 
409 /*
410  * Some queues are effectively long-polled; the driver submits a series of
411  * buffers and the device only returns them when there is data available.
412  * During detach, we need to coordinate the return of these buffers.  Calling
413  * "virtio_shutdown()" will reset the device, then allow the removal of all
414  * buffers that were in flight at the time of shutdown via
415  * "virtio_queue_evacuate()".
416  */
417 void
418 virtio_shutdown(virtio_t *vio)
419 {
420 	mutex_enter(&vio->vio_mutex);
421 	if (vio->vio_initlevel & VIRTIO_INITLEVEL_SHUTDOWN) {
422 		/*
423 		 * Shutdown has been performed already.
424 		 */
425 		mutex_exit(&vio->vio_mutex);
426 		return;
427 	}
428 
429 	/*
430 	 * First, mark all of the queues as shutdown.  This will prevent any
431 	 * further activity.
432 	 */
433 	for (virtio_queue_t *viq = list_head(&vio->vio_queues); viq != NULL;
434 	    viq = list_next(&vio->vio_queues, viq)) {
435 		mutex_enter(&viq->viq_mutex);
436 		viq->viq_shutdown = B_TRUE;
437 		mutex_exit(&viq->viq_mutex);
438 	}
439 
440 	/*
441 	 * Now, reset the device.  This removes any queue configuration on the
442 	 * device side.
443 	 */
444 	virtio_device_reset_locked(vio);
445 	vio->vio_initlevel |= VIRTIO_INITLEVEL_SHUTDOWN;
446 	mutex_exit(&vio->vio_mutex);
447 }
448 
449 /*
450  * Common implementation of quiesce(9E) for simple Virtio-based devices.
451  */
452 int
453 virtio_quiesce(virtio_t *vio)
454 {
455 	if (vio->vio_initlevel & VIRTIO_INITLEVEL_SHUTDOWN) {
456 		/*
457 		 * Device has already been reset.
458 		 */
459 		return (DDI_SUCCESS);
460 	}
461 
462 	/*
463 	 * When we reset the device, it should immediately stop using any DMA
464 	 * memory we've previously passed to it.  All queue configuration is
465 	 * discarded.  This is good enough for quiesce(9E).
466 	 */
467 	virtio_device_reset_locked(vio);
468 
469 	return (DDI_SUCCESS);
470 }
471 
472 /*
473  * DEVICE-SPECIFIC REGISTER ACCESS
474  *
475  * Note that these functions take the mutex to avoid racing with interrupt
476  * enable/disable, when the device-specific offset can potentially change.
477  */
478 
479 uint8_t
480 virtio_dev_get8(virtio_t *vio, uintptr_t offset)
481 {
482 	mutex_enter(&vio->vio_mutex);
483 	uint8_t r = virtio_get8(vio, vio->vio_config_offset + offset);
484 	mutex_exit(&vio->vio_mutex);
485 
486 	return (r);
487 }
488 
489 uint16_t
490 virtio_dev_get16(virtio_t *vio, uintptr_t offset)
491 {
492 	mutex_enter(&vio->vio_mutex);
493 	uint16_t r = virtio_get16(vio, vio->vio_config_offset + offset);
494 	mutex_exit(&vio->vio_mutex);
495 
496 	return (r);
497 }
498 
499 uint32_t
500 virtio_dev_get32(virtio_t *vio, uintptr_t offset)
501 {
502 	mutex_enter(&vio->vio_mutex);
503 	uint32_t r = virtio_get32(vio, vio->vio_config_offset + offset);
504 	mutex_exit(&vio->vio_mutex);
505 
506 	return (r);
507 }
508 
509 uint64_t
510 virtio_dev_get64(virtio_t *vio, uintptr_t offset)
511 {
512 	mutex_enter(&vio->vio_mutex);
513 	/*
514 	 * On at least some systems, a 64-bit read or write to this BAR is not
515 	 * possible.  For legacy devices, there is no generation number to use
516 	 * to determine if configuration may have changed half-way through a
517 	 * read.  We need to continue to read both halves of the value until we
518 	 * read the same value at least twice.
519 	 */
520 	uintptr_t o_lo = vio->vio_config_offset + offset;
521 	uintptr_t o_hi = o_lo + 4;
522 
523 	uint64_t val = virtio_get32(vio, o_lo) |
524 	    ((uint64_t)virtio_get32(vio, o_hi) << 32);
525 
526 	for (;;) {
527 		uint64_t tval = virtio_get32(vio, o_lo) |
528 		    ((uint64_t)virtio_get32(vio, o_hi) << 32);
529 
530 		if (tval == val) {
531 			break;
532 		}
533 
534 		val = tval;
535 	}
536 
537 	mutex_exit(&vio->vio_mutex);
538 	return (val);
539 }
540 
541 void
542 virtio_dev_put8(virtio_t *vio, uintptr_t offset, uint8_t value)
543 {
544 	mutex_enter(&vio->vio_mutex);
545 	virtio_put8(vio, vio->vio_config_offset + offset, value);
546 	mutex_exit(&vio->vio_mutex);
547 }
548 
549 void
550 virtio_dev_put16(virtio_t *vio, uintptr_t offset, uint16_t value)
551 {
552 	mutex_enter(&vio->vio_mutex);
553 	virtio_put16(vio, vio->vio_config_offset + offset, value);
554 	mutex_exit(&vio->vio_mutex);
555 }
556 
557 void
558 virtio_dev_put32(virtio_t *vio, uintptr_t offset, uint32_t value)
559 {
560 	mutex_enter(&vio->vio_mutex);
561 	virtio_put32(vio, vio->vio_config_offset + offset, value);
562 	mutex_exit(&vio->vio_mutex);
563 }
564 
565 /*
566  * VIRTQUEUE MANAGEMENT
567  */
568 
569 static int
570 virtio_inflight_compar(const void *lp, const void *rp)
571 {
572 	const virtio_chain_t *l = lp;
573 	const virtio_chain_t *r = rp;
574 
575 	if (l->vic_head < r->vic_head) {
576 		return (-1);
577 	} else if (l->vic_head > r->vic_head) {
578 		return (1);
579 	} else {
580 		return (0);
581 	}
582 }
583 
584 virtio_queue_t *
585 virtio_queue_alloc(virtio_t *vio, uint16_t qidx, const char *name,
586     ddi_intr_handler_t *func, void *funcarg, boolean_t force_direct,
587     uint_t max_segs)
588 {
589 	uint16_t qsz;
590 	char space_name[256];
591 
592 	if (max_segs < 1) {
593 		/*
594 		 * Every descriptor, direct or indirect, needs to refer to at
595 		 * least one buffer.
596 		 */
597 		dev_err(vio->vio_dip, CE_WARN, "queue \"%s\" (%u) "
598 		    "segment count must be at least 1", name, (uint_t)qidx);
599 		return (NULL);
600 	}
601 
602 	mutex_enter(&vio->vio_mutex);
603 
604 	if (vio->vio_initlevel & VIRTIO_INITLEVEL_PROVIDER) {
605 		/*
606 		 * Cannot configure any more queues once initial setup is
607 		 * complete and interrupts have been allocated.
608 		 */
609 		dev_err(vio->vio_dip, CE_WARN, "queue \"%s\" (%u) "
610 		    "alloc after init complete", name, (uint_t)qidx);
611 		mutex_exit(&vio->vio_mutex);
612 		return (NULL);
613 	}
614 
615 	/*
616 	 * There is no way to negotiate a different queue size for legacy
617 	 * devices.  We must read and use the native queue size of the device.
618 	 */
619 	virtio_put16(vio, VIRTIO_LEGACY_QUEUE_SELECT, qidx);
620 	if ((qsz = virtio_get16(vio, VIRTIO_LEGACY_QUEUE_SIZE)) == 0) {
621 		/*
622 		 * A size of zero means the device does not have a queue with
623 		 * this index.
624 		 */
625 		dev_err(vio->vio_dip, CE_WARN, "queue \"%s\" (%u) "
626 		    "does not exist on device", name, (uint_t)qidx);
627 		mutex_exit(&vio->vio_mutex);
628 		return (NULL);
629 	}
630 
631 	mutex_exit(&vio->vio_mutex);
632 
633 	virtio_queue_t *viq = kmem_zalloc(sizeof (*viq), KM_SLEEP);
634 	viq->viq_virtio = vio;
635 	viq->viq_name = name;
636 	viq->viq_index = qidx;
637 	viq->viq_size = qsz;
638 	viq->viq_func = func;
639 	viq->viq_funcarg = funcarg;
640 	viq->viq_max_segs = max_segs;
641 	avl_create(&viq->viq_inflight, virtio_inflight_compar,
642 	    sizeof (virtio_chain_t), offsetof(virtio_chain_t, vic_node));
643 
644 	/*
645 	 * Allocate the mutex without an interrupt priority for now, as we do
646 	 * with "vio_mutex".  We'll reinitialise it in
647 	 * "virtio_init_complete()".
648 	 */
649 	mutex_init(&viq->viq_mutex, NULL, MUTEX_DRIVER, NULL);
650 
651 	if (virtio_feature_present(vio, VIRTIO_F_RING_INDIRECT_DESC) &&
652 	    !force_direct) {
653 		/*
654 		 * If we were able to negotiate the indirect descriptor
655 		 * feature, and the caller has not explicitly forced the use of
656 		 * direct descriptors, we'll allocate indirect descriptor lists
657 		 * for each chain.
658 		 */
659 		viq->viq_indirect = B_TRUE;
660 	}
661 
662 	/*
663 	 * Track descriptor usage in an identifier space.
664 	 */
665 	(void) snprintf(space_name, sizeof (space_name), "%s%d_vq_%s",
666 	    ddi_get_name(vio->vio_dip), ddi_get_instance(vio->vio_dip), name);
667 	if ((viq->viq_descmap = id_space_create(space_name, 0, qsz)) == NULL) {
668 		dev_err(vio->vio_dip, CE_WARN, "could not allocate descriptor "
669 		    "ID space");
670 		virtio_queue_free(viq);
671 		return (NULL);
672 	}
673 
674 	/*
675 	 * For legacy devices, memory for the queue has a strict layout
676 	 * determined by the queue size.
677 	 */
678 	size_t sz_descs = sizeof (virtio_vq_desc_t) * qsz;
679 	size_t sz_driver = P2ROUNDUP_TYPED(sz_descs +
680 	    sizeof (virtio_vq_driver_t) +
681 	    sizeof (uint16_t) * qsz,
682 	    VIRTIO_PAGE_SIZE, size_t);
683 	size_t sz_device = P2ROUNDUP_TYPED(sizeof (virtio_vq_device_t) +
684 	    sizeof (virtio_vq_elem_t) * qsz,
685 	    VIRTIO_PAGE_SIZE, size_t);
686 
687 	if (virtio_dma_init(vio, &viq->viq_dma, sz_driver + sz_device,
688 	    &virtio_dma_attr_queue, DDI_DMA_RDWR | DDI_DMA_CONSISTENT,
689 	    KM_SLEEP) != DDI_SUCCESS) {
690 		dev_err(vio->vio_dip, CE_WARN, "could not allocate queue "
691 		    "DMA memory");
692 		virtio_queue_free(viq);
693 		return (NULL);
694 	}
695 
696 	/*
697 	 * NOTE: The viq_dma_* members below are used by
698 	 * VIRTQ_DMA_SYNC_FORDEV() and VIRTQ_DMA_SYNC_FORKERNEL() to calculate
699 	 * offsets into the DMA allocation for partial synchronisation.  If the
700 	 * ordering of, or relationship between, these pointers changes, the
701 	 * macros must be kept in sync.
702 	 */
703 	viq->viq_dma_descs = virtio_dma_va(&viq->viq_dma, 0);
704 	viq->viq_dma_driver = virtio_dma_va(&viq->viq_dma, sz_descs);
705 	viq->viq_dma_device = virtio_dma_va(&viq->viq_dma, sz_driver);
706 
707 	/*
708 	 * Install in the per-device list of queues.
709 	 */
710 	mutex_enter(&vio->vio_mutex);
711 	for (virtio_queue_t *chkvq = list_head(&vio->vio_queues); chkvq != NULL;
712 	    chkvq = list_next(&vio->vio_queues, chkvq)) {
713 		if (chkvq->viq_index == qidx) {
714 			dev_err(vio->vio_dip, CE_WARN, "attempt to register "
715 			    "queue \"%s\" with same index (%d) as queue \"%s\"",
716 			    name, qidx, chkvq->viq_name);
717 			mutex_exit(&vio->vio_mutex);
718 			virtio_queue_free(viq);
719 			return (NULL);
720 		}
721 	}
722 	list_insert_tail(&vio->vio_queues, viq);
723 
724 	/*
725 	 * Ensure the zeroing of the queue memory is visible to the host before
726 	 * we inform the device of the queue address.
727 	 */
728 	membar_producer();
729 	VIRTQ_DMA_SYNC_FORDEV(viq);
730 
731 	virtio_put16(vio, VIRTIO_LEGACY_QUEUE_SELECT, qidx);
732 	virtio_put32(vio, VIRTIO_LEGACY_QUEUE_ADDRESS,
733 	    virtio_dma_cookie_pa(&viq->viq_dma, 0) >> VIRTIO_PAGE_SHIFT);
734 
735 	mutex_exit(&vio->vio_mutex);
736 	return (viq);
737 }
738 
739 static void
740 virtio_queue_free(virtio_queue_t *viq)
741 {
742 	virtio_t *vio = viq->viq_virtio;
743 
744 	/*
745 	 * We are going to destroy the queue mutex.  Make sure we've already
746 	 * removed the interrupt handlers.
747 	 */
748 	VERIFY(!(vio->vio_initlevel & VIRTIO_INITLEVEL_INT_ADDED));
749 
750 	mutex_enter(&viq->viq_mutex);
751 
752 	/*
753 	 * If the device has not already been reset as part of a shutdown,
754 	 * detach the queue from the device now.
755 	 */
756 	if (!viq->viq_shutdown) {
757 		virtio_put16(vio, VIRTIO_LEGACY_QUEUE_SELECT, viq->viq_index);
758 		virtio_put32(vio, VIRTIO_LEGACY_QUEUE_ADDRESS, 0);
759 	}
760 
761 	virtio_dma_fini(&viq->viq_dma);
762 
763 	VERIFY(avl_is_empty(&viq->viq_inflight));
764 	avl_destroy(&viq->viq_inflight);
765 	if (viq->viq_descmap != NULL) {
766 		id_space_destroy(viq->viq_descmap);
767 	}
768 
769 	mutex_exit(&viq->viq_mutex);
770 	mutex_destroy(&viq->viq_mutex);
771 
772 	kmem_free(viq, sizeof (*viq));
773 }
774 
775 void
776 virtio_queue_no_interrupt(virtio_queue_t *viq, boolean_t stop_interrupts)
777 {
778 	mutex_enter(&viq->viq_mutex);
779 
780 	if (stop_interrupts) {
781 		viq->viq_dma_driver->vqdr_flags |= VIRTQ_AVAIL_F_NO_INTERRUPT;
782 	} else {
783 		viq->viq_dma_driver->vqdr_flags &= ~VIRTQ_AVAIL_F_NO_INTERRUPT;
784 	}
785 	VIRTQ_DMA_SYNC_FORDEV(viq);
786 
787 	mutex_exit(&viq->viq_mutex);
788 }
789 
790 static virtio_chain_t *
791 virtio_queue_complete(virtio_queue_t *viq, uint_t index)
792 {
793 	VERIFY(MUTEX_HELD(&viq->viq_mutex));
794 
795 	virtio_chain_t *vic;
796 
797 	virtio_chain_t search;
798 	bzero(&search, sizeof (search));
799 	search.vic_head = index;
800 
801 	if ((vic = avl_find(&viq->viq_inflight, &search, NULL)) == NULL) {
802 		return (NULL);
803 	}
804 	avl_remove(&viq->viq_inflight, vic);
805 
806 	return (vic);
807 }
808 
809 uint_t
810 virtio_queue_size(virtio_queue_t *viq)
811 {
812 	return (viq->viq_size);
813 }
814 
815 uint_t
816 virtio_queue_nactive(virtio_queue_t *viq)
817 {
818 	mutex_enter(&viq->viq_mutex);
819 	uint_t r = avl_numnodes(&viq->viq_inflight);
820 	mutex_exit(&viq->viq_mutex);
821 
822 	return (r);
823 }
824 
825 virtio_chain_t *
826 virtio_queue_poll(virtio_queue_t *viq)
827 {
828 	mutex_enter(&viq->viq_mutex);
829 	if (viq->viq_shutdown) {
830 		/*
831 		 * The device has been reset by virtio_shutdown(), and queue
832 		 * processing has been halted.  Any previously submitted chains
833 		 * will be evacuated using virtio_queue_evacuate().
834 		 */
835 		mutex_exit(&viq->viq_mutex);
836 		return (NULL);
837 	}
838 
839 	VIRTQ_DMA_SYNC_FORKERNEL(viq);
840 	if (viq->viq_device_index == viq->viq_dma_device->vqde_index) {
841 		/*
842 		 * If the device index has not changed since the last poll,
843 		 * there are no new chains to process.
844 		 */
845 		mutex_exit(&viq->viq_mutex);
846 		return (NULL);
847 	}
848 
849 	/*
850 	 * We need to ensure that all reads from the descriptor (vqde_ring[])
851 	 * and any referenced memory by the descriptor occur after we have read
852 	 * the descriptor index value above (vqde_index).
853 	 */
854 	membar_consumer();
855 
856 	uint16_t index = (viq->viq_device_index++) % viq->viq_size;
857 	uint16_t start = viq->viq_dma_device->vqde_ring[index].vqe_start;
858 	uint32_t len = viq->viq_dma_device->vqde_ring[index].vqe_len;
859 
860 	virtio_chain_t *vic;
861 	if ((vic = virtio_queue_complete(viq, start)) == NULL) {
862 		/*
863 		 * We could not locate a chain for this descriptor index, which
864 		 * suggests that something has gone horribly wrong.
865 		 */
866 		dev_err(viq->viq_virtio->vio_dip, CE_PANIC,
867 		    "queue \"%s\" ring entry %u (descriptor %u) has no chain",
868 		    viq->viq_name, (uint16_t)index, (uint16_t)start);
869 	}
870 
871 	vic->vic_received_length = len;
872 
873 	mutex_exit(&viq->viq_mutex);
874 
875 	return (vic);
876 }
877 
878 /*
879  * After a call to "virtio_shutdown()", the driver must retrieve any previously
880  * submitted chains and free any associated resources.
881  */
882 virtio_chain_t *
883 virtio_queue_evacuate(virtio_queue_t *viq)
884 {
885 	virtio_t *vio = viq->viq_virtio;
886 
887 	mutex_enter(&vio->vio_mutex);
888 	if (!(vio->vio_initlevel & VIRTIO_INITLEVEL_SHUTDOWN)) {
889 		dev_err(vio->vio_dip, CE_PANIC,
890 		    "virtio_queue_evacuate() without virtio_shutdown()");
891 	}
892 	mutex_exit(&vio->vio_mutex);
893 
894 	mutex_enter(&viq->viq_mutex);
895 	VERIFY(viq->viq_shutdown);
896 
897 	virtio_chain_t *vic = avl_first(&viq->viq_inflight);
898 	if (vic != NULL) {
899 		avl_remove(&viq->viq_inflight, vic);
900 	}
901 
902 	mutex_exit(&viq->viq_mutex);
903 
904 	return (vic);
905 }
906 
907 /*
908  * VIRTQUEUE DESCRIPTOR CHAIN MANAGEMENT
909  */
910 
911 /*
912  * When the device returns a descriptor chain to the driver, it may provide the
913  * length in bytes of data written into the chain.  Client drivers should use
914  * this value with care; the specification suggests some device implementations
915  * have not always provided a useful or correct value.
916  */
917 size_t
918 virtio_chain_received_length(virtio_chain_t *vic)
919 {
920 	return (vic->vic_received_length);
921 }
922 
923 /*
924  * Allocate a descriptor chain for use with this queue.  The "kmflags" value
925  * may be KM_SLEEP or KM_NOSLEEP as per kmem_alloc(9F).
926  */
927 virtio_chain_t *
928 virtio_chain_alloc(virtio_queue_t *viq, int kmflags)
929 {
930 	virtio_t *vio = viq->viq_virtio;
931 	virtio_chain_t *vic;
932 	uint_t cap;
933 
934 	/*
935 	 * Direct descriptors are known by their index in the descriptor table
936 	 * for the queue.  We use the variable-length array member at the end
937 	 * of the chain tracking object to hold the list of direct descriptors
938 	 * assigned to this chain.
939 	 */
940 	if (viq->viq_indirect) {
941 		/*
942 		 * When using indirect descriptors we still need one direct
943 		 * descriptor entry to hold the physical address and length of
944 		 * the indirect descriptor table.
945 		 */
946 		cap = 1;
947 	} else {
948 		/*
949 		 * For direct descriptors we need to be able to track a
950 		 * descriptor for each possible segment in a single chain.
951 		 */
952 		cap = viq->viq_max_segs;
953 	}
954 
955 	size_t vicsz = sizeof (*vic) + sizeof (uint16_t) * cap;
956 	if ((vic = kmem_zalloc(vicsz, kmflags)) == NULL) {
957 		return (NULL);
958 	}
959 	vic->vic_vq = viq;
960 	vic->vic_direct_capacity = cap;
961 
962 	if (viq->viq_indirect) {
963 		/*
964 		 * Allocate an indirect descriptor list with the appropriate
965 		 * number of entries.
966 		 */
967 		if (virtio_dma_init(vio, &vic->vic_indirect_dma,
968 		    sizeof (virtio_vq_desc_t) * viq->viq_max_segs,
969 		    &virtio_dma_attr_indirect,
970 		    DDI_DMA_CONSISTENT | DDI_DMA_WRITE,
971 		    kmflags) != DDI_SUCCESS) {
972 			goto fail;
973 		}
974 
975 		/*
976 		 * Allocate a single descriptor to hold the indirect list.
977 		 * Leave the length as zero for now; it will be set to include
978 		 * any occupied entries at push time.
979 		 */
980 		mutex_enter(&viq->viq_mutex);
981 		if (virtio_chain_append_impl(vic,
982 		    virtio_dma_cookie_pa(&vic->vic_indirect_dma, 0), 0,
983 		    VIRTQ_DESC_F_INDIRECT) != DDI_SUCCESS) {
984 			mutex_exit(&viq->viq_mutex);
985 			goto fail;
986 		}
987 		mutex_exit(&viq->viq_mutex);
988 		VERIFY3U(vic->vic_direct_used, ==, 1);
989 
990 		/*
991 		 * Don't set the indirect capacity until after we've installed
992 		 * the direct descriptor which points at the indirect list, or
993 		 * virtio_chain_append_impl() will be confused.
994 		 */
995 		vic->vic_indirect_capacity = viq->viq_max_segs;
996 	}
997 
998 	return (vic);
999 
1000 fail:
1001 	virtio_dma_fini(&vic->vic_indirect_dma);
1002 	kmem_free(vic, vicsz);
1003 	return (NULL);
1004 }
1005 
1006 void *
1007 virtio_chain_data(virtio_chain_t *vic)
1008 {
1009 	return (vic->vic_data);
1010 }
1011 
1012 void
1013 virtio_chain_data_set(virtio_chain_t *vic, void *data)
1014 {
1015 	vic->vic_data = data;
1016 }
1017 
1018 void
1019 virtio_chain_clear(virtio_chain_t *vic)
1020 {
1021 	if (vic->vic_indirect_capacity != 0) {
1022 		/*
1023 		 * There should only be one direct descriptor, which points at
1024 		 * our indirect descriptor list.  We don't want to clear it
1025 		 * here.
1026 		 */
1027 		VERIFY3U(vic->vic_direct_capacity, ==, 1);
1028 
1029 		if (vic->vic_indirect_used > 0) {
1030 			/*
1031 			 * Clear out the indirect descriptor table.
1032 			 */
1033 			vic->vic_indirect_used = 0;
1034 			bzero(virtio_dma_va(&vic->vic_indirect_dma, 0),
1035 			    virtio_dma_size(&vic->vic_indirect_dma));
1036 		}
1037 
1038 	} else if (vic->vic_direct_capacity > 0) {
1039 		/*
1040 		 * Release any descriptors that were assigned to us previously.
1041 		 */
1042 		for (uint_t i = 0; i < vic->vic_direct_used; i++) {
1043 			id_free(vic->vic_vq->viq_descmap, vic->vic_direct[i]);
1044 			vic->vic_direct[i] = 0;
1045 		}
1046 		vic->vic_direct_used = 0;
1047 	}
1048 }
1049 
1050 void
1051 virtio_chain_free(virtio_chain_t *vic)
1052 {
1053 	/*
1054 	 * First ensure that we have released any descriptors used by this
1055 	 * chain.
1056 	 */
1057 	virtio_chain_clear(vic);
1058 
1059 	if (vic->vic_indirect_capacity > 0) {
1060 		/*
1061 		 * Release the direct descriptor that points to our indirect
1062 		 * descriptor list.
1063 		 */
1064 		VERIFY3U(vic->vic_direct_capacity, ==, 1);
1065 		id_free(vic->vic_vq->viq_descmap, vic->vic_direct[0]);
1066 
1067 		virtio_dma_fini(&vic->vic_indirect_dma);
1068 	}
1069 
1070 	size_t vicsz = sizeof (*vic) +
1071 	    vic->vic_direct_capacity * sizeof (uint16_t);
1072 
1073 	kmem_free(vic, vicsz);
1074 }
1075 
1076 static inline int
1077 virtio_queue_descmap_alloc(virtio_queue_t *viq, uint_t *indexp)
1078 {
1079 	id_t index;
1080 
1081 	if ((index = id_alloc_nosleep(viq->viq_descmap)) == -1) {
1082 		return (ENOMEM);
1083 	}
1084 
1085 	VERIFY3S(index, >=, 0);
1086 	VERIFY3S(index, <=, viq->viq_size);
1087 
1088 	*indexp = (uint_t)index;
1089 	return (0);
1090 }
1091 
1092 static int
1093 virtio_chain_append_impl(virtio_chain_t *vic, uint64_t pa, size_t len,
1094     uint16_t flags)
1095 {
1096 	virtio_queue_t *viq = vic->vic_vq;
1097 	virtio_vq_desc_t *vqd;
1098 	uint_t index;
1099 
1100 	/*
1101 	 * We're modifying the queue-wide descriptor list so make sure we have
1102 	 * the appropriate lock.
1103 	 */
1104 	VERIFY(MUTEX_HELD(&viq->viq_mutex));
1105 
1106 	if (vic->vic_indirect_capacity != 0) {
1107 		/*
1108 		 * Use indirect descriptors.
1109 		 */
1110 		if (vic->vic_indirect_used >= vic->vic_indirect_capacity) {
1111 			return (DDI_FAILURE);
1112 		}
1113 
1114 		vqd = virtio_dma_va(&vic->vic_indirect_dma, 0);
1115 
1116 		if ((index = vic->vic_indirect_used++) > 0) {
1117 			/*
1118 			 * Chain the current last indirect descriptor to the
1119 			 * new one.
1120 			 */
1121 			vqd[index - 1].vqd_flags |= VIRTQ_DESC_F_NEXT;
1122 			vqd[index - 1].vqd_next = index;
1123 		}
1124 
1125 	} else {
1126 		/*
1127 		 * Use direct descriptors.
1128 		 */
1129 		if (vic->vic_direct_used >= vic->vic_direct_capacity) {
1130 			return (DDI_FAILURE);
1131 		}
1132 
1133 		if (virtio_queue_descmap_alloc(viq, &index) != 0) {
1134 			return (DDI_FAILURE);
1135 		}
1136 
1137 		vqd = virtio_dma_va(&viq->viq_dma, 0);
1138 
1139 		if (vic->vic_direct_used > 0) {
1140 			/*
1141 			 * This is not the first entry.  Chain the current
1142 			 * descriptor to the next one.
1143 			 */
1144 			uint16_t p = vic->vic_direct[vic->vic_direct_used - 1];
1145 
1146 			vqd[p].vqd_flags |= VIRTQ_DESC_F_NEXT;
1147 			vqd[p].vqd_next = index;
1148 		}
1149 		vic->vic_direct[vic->vic_direct_used++] = index;
1150 	}
1151 
1152 	vqd[index].vqd_addr = pa;
1153 	vqd[index].vqd_len = len;
1154 	vqd[index].vqd_flags = flags;
1155 	vqd[index].vqd_next = 0;
1156 
1157 	return (DDI_SUCCESS);
1158 }
1159 
1160 int
1161 virtio_chain_append(virtio_chain_t *vic, uint64_t pa, size_t len,
1162     virtio_direction_t dir)
1163 {
1164 	virtio_queue_t *viq = vic->vic_vq;
1165 	uint16_t flags = 0;
1166 
1167 	switch (dir) {
1168 	case VIRTIO_DIR_DEVICE_WRITES:
1169 		flags |= VIRTQ_DESC_F_WRITE;
1170 		break;
1171 
1172 	case VIRTIO_DIR_DEVICE_READS:
1173 		break;
1174 
1175 	default:
1176 		panic("unknown direction value %u", dir);
1177 	}
1178 
1179 	mutex_enter(&viq->viq_mutex);
1180 	int r = virtio_chain_append_impl(vic, pa, len, flags);
1181 	mutex_exit(&viq->viq_mutex);
1182 
1183 	return (r);
1184 }
1185 
1186 static void
1187 virtio_queue_flush_locked(virtio_queue_t *viq)
1188 {
1189 	VERIFY(MUTEX_HELD(&viq->viq_mutex));
1190 
1191 	/*
1192 	 * Make sure any writes we have just made to the descriptors
1193 	 * (vqdr_ring[]) are visible to the device before we update the ring
1194 	 * pointer (vqdr_index).
1195 	 */
1196 	membar_producer();
1197 	viq->viq_dma_driver->vqdr_index = viq->viq_driver_index;
1198 	VIRTQ_DMA_SYNC_FORDEV(viq);
1199 
1200 	/*
1201 	 * Determine whether the device expects us to notify it of new
1202 	 * descriptors.
1203 	 */
1204 	VIRTQ_DMA_SYNC_FORKERNEL(viq);
1205 	if (!(viq->viq_dma_device->vqde_flags & VIRTQ_USED_F_NO_NOTIFY)) {
1206 		virtio_put16(viq->viq_virtio, VIRTIO_LEGACY_QUEUE_NOTIFY,
1207 		    viq->viq_index);
1208 	}
1209 }
1210 
1211 void
1212 virtio_queue_flush(virtio_queue_t *viq)
1213 {
1214 	mutex_enter(&viq->viq_mutex);
1215 	virtio_queue_flush_locked(viq);
1216 	mutex_exit(&viq->viq_mutex);
1217 }
1218 
1219 void
1220 virtio_chain_submit(virtio_chain_t *vic, boolean_t flush)
1221 {
1222 	virtio_queue_t *viq = vic->vic_vq;
1223 
1224 	mutex_enter(&viq->viq_mutex);
1225 
1226 	if (vic->vic_indirect_capacity != 0) {
1227 		virtio_vq_desc_t *vqd = virtio_dma_va(&viq->viq_dma, 0);
1228 
1229 		VERIFY3U(vic->vic_direct_used, ==, 1);
1230 
1231 		/*
1232 		 * This is an indirect descriptor queue.  The length in bytes
1233 		 * of the descriptor must extend to cover the populated
1234 		 * indirect descriptor entries.
1235 		 */
1236 		vqd[vic->vic_direct[0]].vqd_len =
1237 		    sizeof (virtio_vq_desc_t) * vic->vic_indirect_used;
1238 
1239 		virtio_dma_sync(&vic->vic_indirect_dma, DDI_DMA_SYNC_FORDEV);
1240 	}
1241 
1242 	/*
1243 	 * Populate the next available slot in the driver-owned ring for this
1244 	 * chain.  The updated value of viq_driver_index is not yet visible to
1245 	 * the device until a subsequent queue flush.
1246 	 */
1247 	uint16_t index = (viq->viq_driver_index++) % viq->viq_size;
1248 	viq->viq_dma_driver->vqdr_ring[index] = vic->vic_direct[0];
1249 
1250 	vic->vic_head = vic->vic_direct[0];
1251 	avl_add(&viq->viq_inflight, vic);
1252 
1253 	if (flush) {
1254 		virtio_queue_flush_locked(vic->vic_vq);
1255 	}
1256 
1257 	mutex_exit(&viq->viq_mutex);
1258 }
1259 
1260 /*
1261  * INTERRUPTS MANAGEMENT
1262  */
1263 
1264 static const char *
1265 virtio_interrupt_type_name(int type)
1266 {
1267 	switch (type) {
1268 	case DDI_INTR_TYPE_MSIX:
1269 		return ("MSI-X");
1270 	case DDI_INTR_TYPE_MSI:
1271 		return ("MSI");
1272 	case DDI_INTR_TYPE_FIXED:
1273 		return ("fixed");
1274 	default:
1275 		return ("?");
1276 	}
1277 }
1278 
1279 static int
1280 virtio_interrupts_alloc(virtio_t *vio, int type, int nrequired)
1281 {
1282 	dev_info_t *dip = vio->vio_dip;
1283 	int nintrs = 0;
1284 	int navail = 0;
1285 
1286 	VERIFY(MUTEX_HELD(&vio->vio_mutex));
1287 	VERIFY(!(vio->vio_initlevel & VIRTIO_INITLEVEL_INT_ALLOC));
1288 
1289 	if (ddi_intr_get_nintrs(dip, type, &nintrs) != DDI_SUCCESS) {
1290 		dev_err(dip, CE_WARN, "could not count %s interrupts",
1291 		    virtio_interrupt_type_name(type));
1292 		return (DDI_FAILURE);
1293 	}
1294 	if (nintrs < 1) {
1295 		dev_err(dip, CE_WARN, "no %s interrupts supported",
1296 		    virtio_interrupt_type_name(type));
1297 		return (DDI_FAILURE);
1298 	}
1299 
1300 	if (ddi_intr_get_navail(dip, type, &navail) != DDI_SUCCESS) {
1301 		dev_err(dip, CE_WARN, "could not count available %s interrupts",
1302 		    virtio_interrupt_type_name(type));
1303 		return (DDI_FAILURE);
1304 	}
1305 	if (navail < nrequired) {
1306 		dev_err(dip, CE_WARN, "need %d %s interrupts, but only %d "
1307 		    "available", nrequired, virtio_interrupt_type_name(type),
1308 		    navail);
1309 		return (DDI_FAILURE);
1310 	}
1311 
1312 	VERIFY3P(vio->vio_interrupts, ==, NULL);
1313 	vio->vio_interrupts = kmem_zalloc(
1314 	    sizeof (ddi_intr_handle_t) * nrequired, KM_SLEEP);
1315 
1316 	int r;
1317 	if ((r = ddi_intr_alloc(dip, vio->vio_interrupts, type, 0, nrequired,
1318 	    &vio->vio_ninterrupts, DDI_INTR_ALLOC_STRICT)) != DDI_SUCCESS) {
1319 		dev_err(dip, CE_WARN, "%s interrupt allocation failure (%d)",
1320 		    virtio_interrupt_type_name(type), r);
1321 		kmem_free(vio->vio_interrupts,
1322 		    sizeof (ddi_intr_handle_t) * nrequired);
1323 		vio->vio_interrupts = NULL;
1324 		return (DDI_FAILURE);
1325 	}
1326 
1327 	vio->vio_initlevel |= VIRTIO_INITLEVEL_INT_ALLOC;
1328 	vio->vio_interrupt_type = type;
1329 	return (DDI_SUCCESS);
1330 }
1331 
1332 static uint_t
1333 virtio_shared_isr(caddr_t arg0, caddr_t arg1)
1334 {
1335 	virtio_t *vio = (virtio_t *)arg0;
1336 	uint_t r = DDI_INTR_UNCLAIMED;
1337 	uint8_t isr;
1338 
1339 	mutex_enter(&vio->vio_mutex);
1340 
1341 	/*
1342 	 * Check the ISR status to see if the interrupt applies to us.  Reading
1343 	 * this field resets it to zero.
1344 	 */
1345 	isr = virtio_get8(vio, VIRTIO_LEGACY_ISR_STATUS);
1346 	if ((isr & VIRTIO_ISR_CHECK_QUEUES) == 0) {
1347 		goto done;
1348 	}
1349 
1350 	for (virtio_queue_t *viq = list_head(&vio->vio_queues); viq != NULL;
1351 	    viq = list_next(&vio->vio_queues, viq)) {
1352 		if (viq->viq_func != NULL) {
1353 			mutex_exit(&vio->vio_mutex);
1354 			if (viq->viq_func(viq->viq_funcarg, arg0) ==
1355 			    DDI_INTR_CLAIMED) {
1356 				r = DDI_INTR_CLAIMED;
1357 			}
1358 			mutex_enter(&vio->vio_mutex);
1359 
1360 			if (vio->vio_initlevel & VIRTIO_INITLEVEL_SHUTDOWN) {
1361 				/*
1362 				 * The device was shut down while in a queue
1363 				 * handler routine.
1364 				 */
1365 				goto done;
1366 			}
1367 		}
1368 	}
1369 
1370 done:
1371 	mutex_exit(&vio->vio_mutex);
1372 	return (r);
1373 }
1374 
1375 static int
1376 virtio_interrupts_setup(virtio_t *vio, int allow_types)
1377 {
1378 	dev_info_t *dip = vio->vio_dip;
1379 	int types;
1380 	int count = 0;
1381 
1382 	mutex_enter(&vio->vio_mutex);
1383 
1384 	/*
1385 	 * Determine the number of interrupts we'd like based on the number of
1386 	 * virtqueues.
1387 	 */
1388 	for (virtio_queue_t *viq = list_head(&vio->vio_queues); viq != NULL;
1389 	    viq = list_next(&vio->vio_queues, viq)) {
1390 		if (viq->viq_func != NULL) {
1391 			count++;
1392 		}
1393 	}
1394 
1395 	if (ddi_intr_get_supported_types(dip, &types) != DDI_SUCCESS) {
1396 		dev_err(dip, CE_WARN, "could not get supported interrupts");
1397 		mutex_exit(&vio->vio_mutex);
1398 		return (DDI_FAILURE);
1399 	}
1400 
1401 	if (allow_types != 0) {
1402 		/*
1403 		 * Restrict the possible interrupt types at the request of the
1404 		 * driver.
1405 		 */
1406 		types &= allow_types;
1407 	}
1408 
1409 	/*
1410 	 * Try each potential interrupt type in descending order of preference.
1411 	 * Note that the specification does not appear to allow for the use of
1412 	 * classical MSI, so we are limited to either MSI-X or fixed
1413 	 * interrupts.
1414 	 */
1415 	if (types & DDI_INTR_TYPE_MSIX) {
1416 		if (virtio_interrupts_alloc(vio, DDI_INTR_TYPE_MSIX,
1417 		    count) == DDI_SUCCESS) {
1418 			goto add_handlers;
1419 		}
1420 	}
1421 	if (types & DDI_INTR_TYPE_FIXED) {
1422 		/*
1423 		 * If fixed interrupts are all that are available, we'll just
1424 		 * ask for one.
1425 		 */
1426 		if (virtio_interrupts_alloc(vio, DDI_INTR_TYPE_FIXED, 1) ==
1427 		    DDI_SUCCESS) {
1428 			goto add_handlers;
1429 		}
1430 	}
1431 
1432 	dev_err(dip, CE_WARN, "interrupt allocation failed");
1433 	mutex_exit(&vio->vio_mutex);
1434 	return (DDI_FAILURE);
1435 
1436 add_handlers:
1437 	/*
1438 	 * Ensure that we have not been given any high-level interrupts as our
1439 	 * interrupt handlers do not support them.
1440 	 */
1441 	for (int i = 0; i < vio->vio_ninterrupts; i++) {
1442 		uint_t ipri;
1443 
1444 		if (ddi_intr_get_pri(vio->vio_interrupts[i], &ipri) !=
1445 		    DDI_SUCCESS) {
1446 			dev_err(dip, CE_WARN, "could not determine interrupt "
1447 			    "priority");
1448 			goto fail;
1449 		}
1450 
1451 		if (ipri >= ddi_intr_get_hilevel_pri()) {
1452 			dev_err(dip, CE_WARN, "high level interrupts not "
1453 			    "supported");
1454 			goto fail;
1455 		}
1456 
1457 		/*
1458 		 * Record the highest priority we've been allocated to use for
1459 		 * mutex initialisation.
1460 		 */
1461 		if (i == 0 || ipri > vio->vio_interrupt_priority) {
1462 			vio->vio_interrupt_priority = ipri;
1463 		}
1464 	}
1465 
1466 	/*
1467 	 * Get the interrupt capabilities from the first handle to determine
1468 	 * whether we need to use ddi_intr_block_enable(9F).
1469 	 */
1470 	if (ddi_intr_get_cap(vio->vio_interrupts[0],
1471 	    &vio->vio_interrupt_cap) != DDI_SUCCESS) {
1472 		dev_err(dip, CE_WARN, "failed to get interrupt capabilities");
1473 		goto fail;
1474 	}
1475 
1476 	if (vio->vio_interrupt_type == DDI_INTR_TYPE_FIXED) {
1477 		VERIFY3S(vio->vio_ninterrupts, ==, 1);
1478 		/*
1479 		 * For fixed interrupts, we need to use our shared handler to
1480 		 * multiplex the per-queue handlers provided by the driver.
1481 		 */
1482 		if (ddi_intr_add_handler(vio->vio_interrupts[0],
1483 		    virtio_shared_isr, (caddr_t)vio, NULL) != DDI_SUCCESS) {
1484 			dev_err(dip, CE_WARN, "adding shared %s interrupt "
1485 			    "handler failed", virtio_interrupt_type_name(
1486 			    vio->vio_interrupt_type));
1487 			goto fail;
1488 		}
1489 
1490 		goto done;
1491 	}
1492 
1493 	VERIFY3S(vio->vio_ninterrupts, ==, count);
1494 
1495 	uint_t n = 0;
1496 	for (virtio_queue_t *viq = list_head(&vio->vio_queues); viq != NULL;
1497 	    viq = list_next(&vio->vio_queues, viq)) {
1498 		if (viq->viq_func == NULL) {
1499 			continue;
1500 		}
1501 
1502 		if (ddi_intr_add_handler(vio->vio_interrupts[n],
1503 		    viq->viq_func, (caddr_t)viq->viq_funcarg,
1504 		    (caddr_t)vio) != DDI_SUCCESS) {
1505 			dev_err(dip, CE_WARN, "adding interrupt %u (%s) failed",
1506 			    n, viq->viq_name);
1507 			goto fail;
1508 		}
1509 
1510 		viq->viq_handler_index = n;
1511 		viq->viq_handler_added = B_TRUE;
1512 		n++;
1513 	}
1514 
1515 done:
1516 	vio->vio_initlevel |= VIRTIO_INITLEVEL_INT_ADDED;
1517 	mutex_exit(&vio->vio_mutex);
1518 	return (DDI_SUCCESS);
1519 
1520 fail:
1521 	virtio_interrupts_teardown(vio);
1522 	mutex_exit(&vio->vio_mutex);
1523 	return (DDI_FAILURE);
1524 }
1525 
1526 static void
1527 virtio_interrupts_teardown(virtio_t *vio)
1528 {
1529 	VERIFY(MUTEX_HELD(&vio->vio_mutex));
1530 
1531 	virtio_interrupts_disable_locked(vio);
1532 
1533 	if (vio->vio_interrupt_type == DDI_INTR_TYPE_FIXED) {
1534 		/*
1535 		 * Remove the multiplexing interrupt handler.
1536 		 */
1537 		if (vio->vio_initlevel & VIRTIO_INITLEVEL_INT_ADDED) {
1538 			int r;
1539 
1540 			VERIFY3S(vio->vio_ninterrupts, ==, 1);
1541 
1542 			if ((r = ddi_intr_remove_handler(
1543 			    vio->vio_interrupts[0])) != DDI_SUCCESS) {
1544 				dev_err(vio->vio_dip, CE_WARN, "removing "
1545 				    "shared interrupt handler failed (%d)", r);
1546 			}
1547 		}
1548 	} else {
1549 		for (virtio_queue_t *viq = list_head(&vio->vio_queues);
1550 		    viq != NULL; viq = list_next(&vio->vio_queues, viq)) {
1551 			int r;
1552 
1553 			if (!viq->viq_handler_added) {
1554 				continue;
1555 			}
1556 
1557 			if ((r = ddi_intr_remove_handler(
1558 			    vio->vio_interrupts[viq->viq_handler_index])) !=
1559 			    DDI_SUCCESS) {
1560 				dev_err(vio->vio_dip, CE_WARN, "removing "
1561 				    "interrupt handler (%s) failed (%d)",
1562 				    viq->viq_name, r);
1563 			}
1564 
1565 			viq->viq_handler_added = B_FALSE;
1566 		}
1567 	}
1568 	vio->vio_initlevel &= ~VIRTIO_INITLEVEL_INT_ADDED;
1569 
1570 	if (vio->vio_initlevel & VIRTIO_INITLEVEL_INT_ALLOC) {
1571 		for (int i = 0; i < vio->vio_ninterrupts; i++) {
1572 			int r;
1573 
1574 			if ((r = ddi_intr_free(vio->vio_interrupts[i])) !=
1575 			    DDI_SUCCESS) {
1576 				dev_err(vio->vio_dip, CE_WARN, "freeing "
1577 				    "interrupt %u failed (%d)", i, r);
1578 			}
1579 		}
1580 		kmem_free(vio->vio_interrupts,
1581 		    sizeof (ddi_intr_handle_t) * vio->vio_ninterrupts);
1582 		vio->vio_interrupts = NULL;
1583 		vio->vio_ninterrupts = 0;
1584 		vio->vio_interrupt_type = 0;
1585 		vio->vio_interrupt_cap = 0;
1586 		vio->vio_interrupt_priority = 0;
1587 
1588 		vio->vio_initlevel &= ~VIRTIO_INITLEVEL_INT_ALLOC;
1589 	}
1590 }
1591 
1592 static void
1593 virtio_interrupts_unwind(virtio_t *vio)
1594 {
1595 	VERIFY(MUTEX_HELD(&vio->vio_mutex));
1596 
1597 	if (vio->vio_interrupt_type == DDI_INTR_TYPE_MSIX) {
1598 		for (virtio_queue_t *viq = list_head(&vio->vio_queues);
1599 		    viq != NULL; viq = list_next(&vio->vio_queues, viq)) {
1600 			if (!viq->viq_handler_added) {
1601 				continue;
1602 			}
1603 
1604 			virtio_put16(vio, VIRTIO_LEGACY_QUEUE_SELECT,
1605 			    viq->viq_index);
1606 			virtio_put16(vio, VIRTIO_LEGACY_MSIX_QUEUE,
1607 			    VIRTIO_LEGACY_MSI_NO_VECTOR);
1608 		}
1609 	}
1610 
1611 	if (vio->vio_interrupt_cap & DDI_INTR_FLAG_BLOCK) {
1612 		(void) ddi_intr_block_disable(vio->vio_interrupts,
1613 		    vio->vio_ninterrupts);
1614 	} else {
1615 		for (int i = 0; i < vio->vio_ninterrupts; i++) {
1616 			(void) ddi_intr_disable(vio->vio_interrupts[i]);
1617 		}
1618 	}
1619 
1620 	/*
1621 	 * Disabling the interrupts makes the MSI-X fields disappear from the
1622 	 * BAR once more.
1623 	 */
1624 	vio->vio_config_offset = VIRTIO_LEGACY_CFG_OFFSET;
1625 }
1626 
1627 int
1628 virtio_interrupts_enable(virtio_t *vio)
1629 {
1630 	mutex_enter(&vio->vio_mutex);
1631 	if (vio->vio_initlevel & VIRTIO_INITLEVEL_INT_ENABLED) {
1632 		mutex_exit(&vio->vio_mutex);
1633 		return (DDI_SUCCESS);
1634 	}
1635 
1636 	int r = DDI_SUCCESS;
1637 	if (vio->vio_interrupt_cap & DDI_INTR_FLAG_BLOCK) {
1638 		r = ddi_intr_block_enable(vio->vio_interrupts,
1639 		    vio->vio_ninterrupts);
1640 	} else {
1641 		for (int i = 0; i < vio->vio_ninterrupts; i++) {
1642 			if ((r = ddi_intr_enable(vio->vio_interrupts[i])) !=
1643 			    DDI_SUCCESS) {
1644 				/*
1645 				 * Disable the interrupts we have enabled so
1646 				 * far.
1647 				 */
1648 				for (i--; i >= 0; i--) {
1649 					(void) ddi_intr_disable(
1650 					    vio->vio_interrupts[i]);
1651 				}
1652 				break;
1653 			}
1654 		}
1655 	}
1656 
1657 	if (r != DDI_SUCCESS) {
1658 		mutex_exit(&vio->vio_mutex);
1659 		return (r);
1660 	}
1661 
1662 	if (vio->vio_interrupt_type == DDI_INTR_TYPE_MSIX) {
1663 		/*
1664 		 * When asked to enable the interrupts, the system enables
1665 		 * MSI-X in the PCI configuration for the device.  While
1666 		 * enabled, the extra MSI-X configuration table fields appear
1667 		 * between the general and the device-specific regions of the
1668 		 * BAR.
1669 		 */
1670 		vio->vio_config_offset = VIRTIO_LEGACY_CFG_OFFSET_MSIX;
1671 
1672 		for (virtio_queue_t *viq = list_head(&vio->vio_queues);
1673 		    viq != NULL; viq = list_next(&vio->vio_queues, viq)) {
1674 			if (!viq->viq_handler_added) {
1675 				continue;
1676 			}
1677 
1678 			uint16_t qi = viq->viq_index;
1679 			uint16_t msi = viq->viq_handler_index;
1680 
1681 			/*
1682 			 * Route interrupts for this queue to the assigned
1683 			 * MSI-X vector number.
1684 			 */
1685 			virtio_put16(vio, VIRTIO_LEGACY_QUEUE_SELECT, qi);
1686 			virtio_put16(vio, VIRTIO_LEGACY_MSIX_QUEUE, msi);
1687 
1688 			/*
1689 			 * The device may not actually accept the vector number
1690 			 * we're attempting to program.  We need to confirm
1691 			 * that configuration was successful by re-reading the
1692 			 * configuration we just wrote.
1693 			 */
1694 			if (virtio_get16(vio, VIRTIO_LEGACY_MSIX_QUEUE) !=
1695 			    msi) {
1696 				dev_err(vio->vio_dip, CE_WARN,
1697 				    "failed to configure MSI-X vector %u for "
1698 				    "queue \"%s\" (#%u)", (uint_t)msi,
1699 				    viq->viq_name, (uint_t)qi);
1700 
1701 				virtio_interrupts_unwind(vio);
1702 				mutex_exit(&vio->vio_mutex);
1703 				return (DDI_FAILURE);
1704 			}
1705 		}
1706 	}
1707 
1708 	vio->vio_initlevel |= VIRTIO_INITLEVEL_INT_ENABLED;
1709 
1710 	mutex_exit(&vio->vio_mutex);
1711 	return (DDI_SUCCESS);
1712 }
1713 
1714 static void
1715 virtio_interrupts_disable_locked(virtio_t *vio)
1716 {
1717 	VERIFY(MUTEX_HELD(&vio->vio_mutex));
1718 
1719 	if (!(vio->vio_initlevel & VIRTIO_INITLEVEL_INT_ENABLED)) {
1720 		return;
1721 	}
1722 
1723 	virtio_interrupts_unwind(vio);
1724 
1725 	vio->vio_initlevel &= ~VIRTIO_INITLEVEL_INT_ENABLED;
1726 }
1727 
1728 void
1729 virtio_interrupts_disable(virtio_t *vio)
1730 {
1731 	mutex_enter(&vio->vio_mutex);
1732 	virtio_interrupts_disable_locked(vio);
1733 	mutex_exit(&vio->vio_mutex);
1734 }
1735