xref: /illumos-gate/usr/src/uts/common/io/virtio/virtio_main.c (revision 2833423dc59f4c35fe4713dbb942950c82df0437)
1 /*
2  * This file and its contents are supplied under the terms of the
3  * Common Development and Distribution License ("CDDL"), version 1.0.
4  * You may only use this file in accordance with the terms of version
5  * 1.0 of the CDDL.
6  *
7  * A full copy of the text of the CDDL should have accompanied this
8  * source.  A copy of the CDDL is also available via the Internet at
9  * http://www.illumos.org/license/CDDL.
10  */
11 
12 /*
13  * Copyright 2019 Joyent, Inc.
14  * Copyright 2022 OmniOS Community Edition (OmniOSce) Association.
15  */
16 
17 /*
18  * VIRTIO FRAMEWORK
19  *
20  * For design and usage documentation, see the comments in "virtio.h".
21  */
22 
23 #include <sys/conf.h>
24 #include <sys/kmem.h>
25 #include <sys/debug.h>
26 #include <sys/modctl.h>
27 #include <sys/autoconf.h>
28 #include <sys/ddi_impldefs.h>
29 #include <sys/ddi.h>
30 #include <sys/sunddi.h>
31 #include <sys/sunndi.h>
32 #include <sys/avintr.h>
33 #include <sys/spl.h>
34 #include <sys/promif.h>
35 #include <sys/list.h>
36 #include <sys/bootconf.h>
37 #include <sys/bootsvcs.h>
38 #include <sys/sysmacros.h>
39 #include <sys/pci.h>
40 
41 #include "virtio.h"
42 #include "virtio_impl.h"
43 
44 
45 /*
46  * Linkage structures
47  */
48 static struct modlmisc virtio_modlmisc = {
49 	.misc_modops =			&mod_miscops,
50 	.misc_linkinfo =		"VIRTIO common routines",
51 };
52 
53 static struct modlinkage virtio_modlinkage = {
54 	.ml_rev =			MODREV_1,
55 	.ml_linkage =			{ &virtio_modlmisc, NULL }
56 };
57 
58 int
59 _init(void)
60 {
61 	return (mod_install(&virtio_modlinkage));
62 }
63 
64 int
65 _fini(void)
66 {
67 	return (mod_remove(&virtio_modlinkage));
68 }
69 
70 int
71 _info(struct modinfo *modinfop)
72 {
73 	return (mod_info(&virtio_modlinkage, modinfop));
74 }
75 
76 
77 
78 static void virtio_set_status(virtio_t *, uint8_t);
79 static void virtio_set_status_locked(virtio_t *, uint8_t);
80 static int virtio_chain_append_impl(virtio_chain_t *, uint64_t, size_t,
81     uint16_t);
82 static int virtio_interrupts_setup(virtio_t *, int);
83 static void virtio_interrupts_teardown(virtio_t *);
84 static void virtio_interrupts_disable_locked(virtio_t *);
85 static void virtio_queue_free(virtio_queue_t *);
86 static void virtio_device_reset_locked(virtio_t *);
87 
88 /*
89  * We use the same device access attributes for BAR mapping and access to the
90  * virtqueue memory.
91  */
92 ddi_device_acc_attr_t virtio_acc_attr = {
93 	.devacc_attr_version =		DDI_DEVICE_ATTR_V1,
94 	.devacc_attr_endian_flags =	DDI_NEVERSWAP_ACC,
95 	.devacc_attr_dataorder =	DDI_STORECACHING_OK_ACC,
96 	.devacc_attr_access =		DDI_DEFAULT_ACC
97 };
98 
99 
100 /*
101  * DMA attributes for the memory given to the device for queue management.
102  */
103 ddi_dma_attr_t virtio_dma_attr_queue = {
104 	.dma_attr_version =		DMA_ATTR_V0,
105 	.dma_attr_addr_lo =		0x0000000000000000,
106 	/*
107 	 * Queue memory is aligned on VIRTIO_PAGE_SIZE with the address shifted
108 	 * down by VIRTIO_PAGE_SHIFT before being passed to the device in a
109 	 * 32-bit register.
110 	 */
111 	.dma_attr_addr_hi =		0x00000FFFFFFFF000,
112 	.dma_attr_count_max =		0x00000000FFFFFFFF,
113 	.dma_attr_align =		VIRTIO_PAGE_SIZE,
114 	.dma_attr_burstsizes =		1,
115 	.dma_attr_minxfer =		1,
116 	.dma_attr_maxxfer =		0x00000000FFFFFFFF,
117 	.dma_attr_seg =			0x00000000FFFFFFFF,
118 	.dma_attr_sgllen =		1,
119 	.dma_attr_granular =		1,
120 	.dma_attr_flags =		0
121 };
122 
123 /*
124  * DMA attributes for the the allocation of indirect descriptor lists.  The
125  * indirect list is referenced by a regular descriptor entry: the physical
126  * address field is 64 bits wide, but the length field is only 32 bits.  Each
127  * descriptor is 16 bytes long.
128  */
129 ddi_dma_attr_t virtio_dma_attr_indirect = {
130 	.dma_attr_version =		DMA_ATTR_V0,
131 	.dma_attr_addr_lo =		0x0000000000000000,
132 	.dma_attr_addr_hi =		0xFFFFFFFFFFFFFFFF,
133 	.dma_attr_count_max =		0x00000000FFFFFFFF,
134 	.dma_attr_align =		sizeof (struct virtio_vq_desc),
135 	.dma_attr_burstsizes =		1,
136 	.dma_attr_minxfer =		1,
137 	.dma_attr_maxxfer =		0x00000000FFFFFFFF,
138 	.dma_attr_seg =			0x00000000FFFFFFFF,
139 	.dma_attr_sgllen =		1,
140 	.dma_attr_granular =		1,
141 	.dma_attr_flags =		0
142 };
143 
144 
145 uint8_t
146 virtio_get8(virtio_t *vio, uintptr_t offset)
147 {
148 	return (ddi_get8(vio->vio_barh, (uint8_t *)(vio->vio_bar + offset)));
149 }
150 
151 uint16_t
152 virtio_get16(virtio_t *vio, uintptr_t offset)
153 {
154 	return (ddi_get16(vio->vio_barh, (uint16_t *)(vio->vio_bar + offset)));
155 }
156 
157 uint32_t
158 virtio_get32(virtio_t *vio, uintptr_t offset)
159 {
160 	return (ddi_get32(vio->vio_barh, (uint32_t *)(vio->vio_bar + offset)));
161 }
162 
163 void
164 virtio_put8(virtio_t *vio, uintptr_t offset, uint8_t value)
165 {
166 	ddi_put8(vio->vio_barh, (uint8_t *)(vio->vio_bar + offset), value);
167 }
168 
169 void
170 virtio_put16(virtio_t *vio, uintptr_t offset, uint16_t value)
171 {
172 	ddi_put16(vio->vio_barh, (uint16_t *)(vio->vio_bar + offset), value);
173 }
174 
175 void
176 virtio_put32(virtio_t *vio, uintptr_t offset, uint32_t value)
177 {
178 	ddi_put32(vio->vio_barh, (uint32_t *)(vio->vio_bar + offset), value);
179 }
180 
181 void
182 virtio_fini(virtio_t *vio, boolean_t failed)
183 {
184 	mutex_enter(&vio->vio_mutex);
185 
186 	virtio_interrupts_teardown(vio);
187 
188 	virtio_queue_t *viq;
189 	while ((viq = list_remove_head(&vio->vio_queues)) != NULL) {
190 		virtio_queue_free(viq);
191 	}
192 	list_destroy(&vio->vio_queues);
193 
194 	if (failed) {
195 		/*
196 		 * Signal to the host that device setup failed.
197 		 */
198 		virtio_set_status_locked(vio, VIRTIO_STATUS_FAILED);
199 	} else {
200 		virtio_device_reset_locked(vio);
201 	}
202 
203 	/*
204 	 * We don't need to do anything for the provider initlevel, as it
205 	 * merely records the fact that virtio_init_complete() was called.
206 	 */
207 	vio->vio_initlevel &= ~VIRTIO_INITLEVEL_PROVIDER;
208 
209 	if (vio->vio_initlevel & VIRTIO_INITLEVEL_REGS) {
210 		/*
211 		 * Unmap PCI BAR0.
212 		 */
213 		ddi_regs_map_free(&vio->vio_barh);
214 
215 		vio->vio_initlevel &= ~VIRTIO_INITLEVEL_REGS;
216 	}
217 
218 	/*
219 	 * Ensure we have torn down everything we set up.
220 	 */
221 	vio->vio_initlevel &= ~VIRTIO_INITLEVEL_SHUTDOWN;
222 	VERIFY0(vio->vio_initlevel);
223 
224 	mutex_exit(&vio->vio_mutex);
225 	mutex_destroy(&vio->vio_mutex);
226 
227 	kmem_free(vio, sizeof (*vio));
228 }
229 
230 /*
231  * Early device initialisation for legacy (pre-1.0 specification) virtio
232  * devices.
233  */
234 virtio_t *
235 virtio_init(dev_info_t *dip, uint64_t driver_features, boolean_t allow_indirect)
236 {
237 	int r;
238 
239 	/*
240 	 * First, confirm that this is a legacy device.
241 	 */
242 	ddi_acc_handle_t pci;
243 	if (pci_config_setup(dip, &pci) != DDI_SUCCESS) {
244 		dev_err(dip, CE_WARN, "pci_config_setup failed");
245 		return (NULL);
246 	}
247 
248 	uint8_t revid;
249 	if ((revid = pci_config_get8(pci, PCI_CONF_REVID)) == PCI_EINVAL8) {
250 		dev_err(dip, CE_WARN, "could not read config space");
251 		pci_config_teardown(&pci);
252 		return (NULL);
253 	}
254 
255 	pci_config_teardown(&pci);
256 
257 	/*
258 	 * The legacy specification requires that the device advertise as PCI
259 	 * Revision 0.
260 	 */
261 	if (revid != 0) {
262 		dev_err(dip, CE_WARN, "PCI Revision %u incorrect for "
263 		    "legacy virtio device", (uint_t)revid);
264 		return (NULL);
265 	}
266 
267 	virtio_t *vio = kmem_zalloc(sizeof (*vio), KM_SLEEP);
268 	vio->vio_dip = dip;
269 
270 	/*
271 	 * Map PCI BAR0 for legacy device access.
272 	 */
273 	if ((r = ddi_regs_map_setup(dip, VIRTIO_LEGACY_PCI_BAR0,
274 	    (caddr_t *)&vio->vio_bar, 0, 0, &virtio_acc_attr,
275 	    &vio->vio_barh)) != DDI_SUCCESS) {
276 		dev_err(dip, CE_WARN, "ddi_regs_map_setup failure (%d)", r);
277 		kmem_free(vio, sizeof (*vio));
278 		return (NULL);
279 	}
280 	vio->vio_initlevel |= VIRTIO_INITLEVEL_REGS;
281 
282 	/*
283 	 * We initialise the mutex without an interrupt priority to ease the
284 	 * implementation of some of the configuration space access routines.
285 	 * Drivers using the virtio framework MUST make a call to
286 	 * "virtio_init_complete()" prior to spawning other threads or enabling
287 	 * interrupt handlers, at which time we will destroy and reinitialise
288 	 * the mutex for use in our interrupt handlers.
289 	 */
290 	mutex_init(&vio->vio_mutex, NULL, MUTEX_DRIVER, NULL);
291 
292 	list_create(&vio->vio_queues, sizeof (virtio_queue_t),
293 	    offsetof(virtio_queue_t, viq_link));
294 
295 	/*
296 	 * Legacy virtio devices require a few common steps before we can
297 	 * negotiate device features.
298 	 */
299 	virtio_device_reset(vio);
300 	virtio_set_status(vio, VIRTIO_STATUS_ACKNOWLEDGE);
301 	virtio_set_status(vio, VIRTIO_STATUS_DRIVER);
302 
303 	/*
304 	 * Negotiate features with the device.  Record the original supported
305 	 * feature set for debugging purposes.
306 	 */
307 	vio->vio_features_device = virtio_get32(vio,
308 	    VIRTIO_LEGACY_FEATURES_DEVICE);
309 	if (allow_indirect) {
310 		driver_features |= VIRTIO_F_RING_INDIRECT_DESC;
311 	}
312 	vio->vio_features = vio->vio_features_device & driver_features;
313 	virtio_put32(vio, VIRTIO_LEGACY_FEATURES_DRIVER, vio->vio_features);
314 
315 	/*
316 	 * The device-specific configuration begins at an offset into the BAR
317 	 * that depends on whether we have enabled MSI-X interrupts or not.
318 	 * Start out with the offset for pre-MSI-X operation so that we can
319 	 * read device configuration space prior to configuring interrupts.
320 	 */
321 	vio->vio_config_offset = VIRTIO_LEGACY_CFG_OFFSET;
322 
323 	return (vio);
324 }
325 
326 /*
327  * Some virtio devices can change their device configuration state at any
328  * time. This function may be called by the driver during the initialisation
329  * phase - before calling virtio_init_complete() - in order to register a
330  * handler function which will be called when the device configuration space
331  * is updated.
332  */
333 void
334 virtio_register_cfgchange_handler(virtio_t *vio, ddi_intr_handler_t *func,
335     void *funcarg)
336 {
337 	VERIFY(!(vio->vio_initlevel & VIRTIO_INITLEVEL_INT_ADDED));
338 	VERIFY(!vio->vio_cfgchange_handler_added);
339 
340 	mutex_enter(&vio->vio_mutex);
341 	vio->vio_cfgchange_handler = func;
342 	vio->vio_cfgchange_handlerarg = funcarg;
343 	mutex_exit(&vio->vio_mutex);
344 }
345 
346 /*
347  * This function must be called by the driver once it has completed early setup
348  * calls.  The value of "allowed_interrupt_types" is a mask of interrupt types
349  * (DDI_INTR_TYPE_MSIX, etc) that we'll try to use when installing handlers, or
350  * the special value 0 to allow the system to use any available type.
351  */
352 int
353 virtio_init_complete(virtio_t *vio, int allowed_interrupt_types)
354 {
355 	VERIFY(!(vio->vio_initlevel & VIRTIO_INITLEVEL_PROVIDER));
356 	vio->vio_initlevel |= VIRTIO_INITLEVEL_PROVIDER;
357 
358 	if (!list_is_empty(&vio->vio_queues) ||
359 	    vio->vio_cfgchange_handler != NULL) {
360 		/*
361 		 * Set up interrupts for the queues that have been registered.
362 		 */
363 		if (virtio_interrupts_setup(vio, allowed_interrupt_types) !=
364 		    DDI_SUCCESS) {
365 			return (DDI_FAILURE);
366 		}
367 	}
368 
369 	/*
370 	 * We can allocate the mutex once we know the priority.
371 	 */
372 	mutex_destroy(&vio->vio_mutex);
373 	mutex_init(&vio->vio_mutex, NULL, MUTEX_DRIVER, virtio_intr_pri(vio));
374 	for (virtio_queue_t *viq = list_head(&vio->vio_queues); viq != NULL;
375 	    viq = list_next(&vio->vio_queues, viq)) {
376 		mutex_destroy(&viq->viq_mutex);
377 		mutex_init(&viq->viq_mutex, NULL, MUTEX_DRIVER,
378 		    virtio_intr_pri(vio));
379 	}
380 
381 	virtio_set_status(vio, VIRTIO_STATUS_DRIVER_OK);
382 
383 	return (DDI_SUCCESS);
384 }
385 
386 boolean_t
387 virtio_feature_present(virtio_t *vio, uint64_t feature_mask)
388 {
389 	return ((vio->vio_features & feature_mask) != 0);
390 }
391 
392 void *
393 virtio_intr_pri(virtio_t *vio)
394 {
395 	VERIFY(vio->vio_initlevel & VIRTIO_INITLEVEL_INT_ADDED);
396 
397 	return (DDI_INTR_PRI(vio->vio_interrupt_priority));
398 }
399 
400 /*
401  * Enable a bit in the device status register.  Each bit signals a level of
402  * guest readiness to the host.  Use the VIRTIO_CONFIG_DEVICE_STATUS_*
403  * constants for "status".  To zero the status field use virtio_device_reset().
404  */
405 static void
406 virtio_set_status_locked(virtio_t *vio, uint8_t status)
407 {
408 	VERIFY3U(status, !=, 0);
409 	VERIFY(MUTEX_HELD(&vio->vio_mutex));
410 
411 	uint8_t old = virtio_get8(vio, VIRTIO_LEGACY_DEVICE_STATUS);
412 	virtio_put8(vio, VIRTIO_LEGACY_DEVICE_STATUS, status | old);
413 }
414 
415 static void
416 virtio_set_status(virtio_t *vio, uint8_t status)
417 {
418 	mutex_enter(&vio->vio_mutex);
419 	virtio_set_status_locked(vio, status);
420 	mutex_exit(&vio->vio_mutex);
421 }
422 
423 static void
424 virtio_device_reset_locked(virtio_t *vio)
425 {
426 	VERIFY(MUTEX_HELD(&vio->vio_mutex));
427 	virtio_put8(vio, VIRTIO_LEGACY_DEVICE_STATUS, VIRTIO_STATUS_RESET);
428 }
429 
430 void
431 virtio_device_reset(virtio_t *vio)
432 {
433 	mutex_enter(&vio->vio_mutex);
434 	virtio_device_reset_locked(vio);
435 	mutex_exit(&vio->vio_mutex);
436 }
437 
438 /*
439  * Some queues are effectively long-polled; the driver submits a series of
440  * buffers and the device only returns them when there is data available.
441  * During detach, we need to coordinate the return of these buffers.  Calling
442  * "virtio_shutdown()" will reset the device, then allow the removal of all
443  * buffers that were in flight at the time of shutdown via
444  * "virtio_queue_evacuate()".
445  */
446 void
447 virtio_shutdown(virtio_t *vio)
448 {
449 	mutex_enter(&vio->vio_mutex);
450 	if (vio->vio_initlevel & VIRTIO_INITLEVEL_SHUTDOWN) {
451 		/*
452 		 * Shutdown has been performed already.
453 		 */
454 		mutex_exit(&vio->vio_mutex);
455 		return;
456 	}
457 
458 	/*
459 	 * First, mark all of the queues as shutdown.  This will prevent any
460 	 * further activity.
461 	 */
462 	for (virtio_queue_t *viq = list_head(&vio->vio_queues); viq != NULL;
463 	    viq = list_next(&vio->vio_queues, viq)) {
464 		mutex_enter(&viq->viq_mutex);
465 		viq->viq_shutdown = B_TRUE;
466 		mutex_exit(&viq->viq_mutex);
467 	}
468 
469 	/*
470 	 * Now, reset the device.  This removes any queue configuration on the
471 	 * device side.
472 	 */
473 	virtio_device_reset_locked(vio);
474 	vio->vio_initlevel |= VIRTIO_INITLEVEL_SHUTDOWN;
475 	mutex_exit(&vio->vio_mutex);
476 }
477 
478 /*
479  * Common implementation of quiesce(9E) for simple Virtio-based devices.
480  */
481 int
482 virtio_quiesce(virtio_t *vio)
483 {
484 	if (vio->vio_initlevel & VIRTIO_INITLEVEL_SHUTDOWN) {
485 		/*
486 		 * Device has already been reset.
487 		 */
488 		return (DDI_SUCCESS);
489 	}
490 
491 	/*
492 	 * When we reset the device, it should immediately stop using any DMA
493 	 * memory we've previously passed to it.  All queue configuration is
494 	 * discarded.  This is good enough for quiesce(9E).
495 	 */
496 	virtio_device_reset_locked(vio);
497 
498 	return (DDI_SUCCESS);
499 }
500 
501 /*
502  * DEVICE-SPECIFIC REGISTER ACCESS
503  *
504  * Note that these functions take the mutex to avoid racing with interrupt
505  * enable/disable, when the device-specific offset can potentially change.
506  */
507 
508 uint8_t
509 virtio_dev_get8(virtio_t *vio, uintptr_t offset)
510 {
511 	mutex_enter(&vio->vio_mutex);
512 	uint8_t r = virtio_get8(vio, vio->vio_config_offset + offset);
513 	mutex_exit(&vio->vio_mutex);
514 
515 	return (r);
516 }
517 
518 uint16_t
519 virtio_dev_get16(virtio_t *vio, uintptr_t offset)
520 {
521 	mutex_enter(&vio->vio_mutex);
522 	uint16_t r = virtio_get16(vio, vio->vio_config_offset + offset);
523 	mutex_exit(&vio->vio_mutex);
524 
525 	return (r);
526 }
527 
528 uint32_t
529 virtio_dev_get32(virtio_t *vio, uintptr_t offset)
530 {
531 	mutex_enter(&vio->vio_mutex);
532 	uint32_t r = virtio_get32(vio, vio->vio_config_offset + offset);
533 	mutex_exit(&vio->vio_mutex);
534 
535 	return (r);
536 }
537 
538 uint64_t
539 virtio_dev_get64(virtio_t *vio, uintptr_t offset)
540 {
541 	mutex_enter(&vio->vio_mutex);
542 	/*
543 	 * On at least some systems, a 64-bit read or write to this BAR is not
544 	 * possible.  For legacy devices, there is no generation number to use
545 	 * to determine if configuration may have changed half-way through a
546 	 * read.  We need to continue to read both halves of the value until we
547 	 * read the same value at least twice.
548 	 */
549 	uintptr_t o_lo = vio->vio_config_offset + offset;
550 	uintptr_t o_hi = o_lo + 4;
551 
552 	uint64_t val = virtio_get32(vio, o_lo) |
553 	    ((uint64_t)virtio_get32(vio, o_hi) << 32);
554 
555 	for (;;) {
556 		uint64_t tval = virtio_get32(vio, o_lo) |
557 		    ((uint64_t)virtio_get32(vio, o_hi) << 32);
558 
559 		if (tval == val) {
560 			break;
561 		}
562 
563 		val = tval;
564 	}
565 
566 	mutex_exit(&vio->vio_mutex);
567 	return (val);
568 }
569 
570 void
571 virtio_dev_put8(virtio_t *vio, uintptr_t offset, uint8_t value)
572 {
573 	mutex_enter(&vio->vio_mutex);
574 	virtio_put8(vio, vio->vio_config_offset + offset, value);
575 	mutex_exit(&vio->vio_mutex);
576 }
577 
578 void
579 virtio_dev_put16(virtio_t *vio, uintptr_t offset, uint16_t value)
580 {
581 	mutex_enter(&vio->vio_mutex);
582 	virtio_put16(vio, vio->vio_config_offset + offset, value);
583 	mutex_exit(&vio->vio_mutex);
584 }
585 
586 void
587 virtio_dev_put32(virtio_t *vio, uintptr_t offset, uint32_t value)
588 {
589 	mutex_enter(&vio->vio_mutex);
590 	virtio_put32(vio, vio->vio_config_offset + offset, value);
591 	mutex_exit(&vio->vio_mutex);
592 }
593 
594 /*
595  * VIRTQUEUE MANAGEMENT
596  */
597 
598 static int
599 virtio_inflight_compar(const void *lp, const void *rp)
600 {
601 	const virtio_chain_t *l = lp;
602 	const virtio_chain_t *r = rp;
603 
604 	if (l->vic_head < r->vic_head) {
605 		return (-1);
606 	} else if (l->vic_head > r->vic_head) {
607 		return (1);
608 	} else {
609 		return (0);
610 	}
611 }
612 
613 virtio_queue_t *
614 virtio_queue_alloc(virtio_t *vio, uint16_t qidx, const char *name,
615     ddi_intr_handler_t *func, void *funcarg, boolean_t force_direct,
616     uint_t max_segs)
617 {
618 	uint16_t qsz;
619 	char space_name[256];
620 
621 	if (max_segs < 1) {
622 		/*
623 		 * Every descriptor, direct or indirect, needs to refer to at
624 		 * least one buffer.
625 		 */
626 		dev_err(vio->vio_dip, CE_WARN, "queue \"%s\" (%u) "
627 		    "segment count must be at least 1", name, (uint_t)qidx);
628 		return (NULL);
629 	}
630 
631 	mutex_enter(&vio->vio_mutex);
632 
633 	if (vio->vio_initlevel & VIRTIO_INITLEVEL_PROVIDER) {
634 		/*
635 		 * Cannot configure any more queues once initial setup is
636 		 * complete and interrupts have been allocated.
637 		 */
638 		dev_err(vio->vio_dip, CE_WARN, "queue \"%s\" (%u) "
639 		    "alloc after init complete", name, (uint_t)qidx);
640 		mutex_exit(&vio->vio_mutex);
641 		return (NULL);
642 	}
643 
644 	/*
645 	 * There is no way to negotiate a different queue size for legacy
646 	 * devices.  We must read and use the native queue size of the device.
647 	 */
648 	virtio_put16(vio, VIRTIO_LEGACY_QUEUE_SELECT, qidx);
649 	if ((qsz = virtio_get16(vio, VIRTIO_LEGACY_QUEUE_SIZE)) == 0) {
650 		/*
651 		 * A size of zero means the device does not have a queue with
652 		 * this index.
653 		 */
654 		dev_err(vio->vio_dip, CE_WARN, "queue \"%s\" (%u) "
655 		    "does not exist on device", name, (uint_t)qidx);
656 		mutex_exit(&vio->vio_mutex);
657 		return (NULL);
658 	}
659 
660 	mutex_exit(&vio->vio_mutex);
661 
662 	virtio_queue_t *viq = kmem_zalloc(sizeof (*viq), KM_SLEEP);
663 	viq->viq_virtio = vio;
664 	viq->viq_name = name;
665 	viq->viq_index = qidx;
666 	viq->viq_size = qsz;
667 	viq->viq_func = func;
668 	viq->viq_funcarg = funcarg;
669 	viq->viq_max_segs = max_segs;
670 	avl_create(&viq->viq_inflight, virtio_inflight_compar,
671 	    sizeof (virtio_chain_t), offsetof(virtio_chain_t, vic_node));
672 
673 	/*
674 	 * Allocate the mutex without an interrupt priority for now, as we do
675 	 * with "vio_mutex".  We'll reinitialise it in
676 	 * "virtio_init_complete()".
677 	 */
678 	mutex_init(&viq->viq_mutex, NULL, MUTEX_DRIVER, NULL);
679 
680 	if (virtio_feature_present(vio, VIRTIO_F_RING_INDIRECT_DESC) &&
681 	    !force_direct) {
682 		/*
683 		 * If we were able to negotiate the indirect descriptor
684 		 * feature, and the caller has not explicitly forced the use of
685 		 * direct descriptors, we'll allocate indirect descriptor lists
686 		 * for each chain.
687 		 */
688 		viq->viq_indirect = B_TRUE;
689 	}
690 
691 	/*
692 	 * Track descriptor usage in an identifier space.
693 	 */
694 	(void) snprintf(space_name, sizeof (space_name), "%s%d_vq_%s",
695 	    ddi_get_name(vio->vio_dip), ddi_get_instance(vio->vio_dip), name);
696 	if ((viq->viq_descmap = id_space_create(space_name, 0, qsz)) == NULL) {
697 		dev_err(vio->vio_dip, CE_WARN, "could not allocate descriptor "
698 		    "ID space");
699 		virtio_queue_free(viq);
700 		return (NULL);
701 	}
702 
703 	/*
704 	 * For legacy devices, memory for the queue has a strict layout
705 	 * determined by the queue size.
706 	 */
707 	size_t sz_descs = sizeof (virtio_vq_desc_t) * qsz;
708 	size_t sz_driver = P2ROUNDUP_TYPED(sz_descs +
709 	    sizeof (virtio_vq_driver_t) +
710 	    sizeof (uint16_t) * qsz,
711 	    VIRTIO_PAGE_SIZE, size_t);
712 	size_t sz_device = P2ROUNDUP_TYPED(sizeof (virtio_vq_device_t) +
713 	    sizeof (virtio_vq_elem_t) * qsz,
714 	    VIRTIO_PAGE_SIZE, size_t);
715 
716 	if (virtio_dma_init(vio, &viq->viq_dma, sz_driver + sz_device,
717 	    &virtio_dma_attr_queue, DDI_DMA_RDWR | DDI_DMA_CONSISTENT,
718 	    KM_SLEEP) != DDI_SUCCESS) {
719 		dev_err(vio->vio_dip, CE_WARN, "could not allocate queue "
720 		    "DMA memory");
721 		virtio_queue_free(viq);
722 		return (NULL);
723 	}
724 
725 	/*
726 	 * NOTE: The viq_dma_* members below are used by
727 	 * VIRTQ_DMA_SYNC_FORDEV() and VIRTQ_DMA_SYNC_FORKERNEL() to calculate
728 	 * offsets into the DMA allocation for partial synchronisation.  If the
729 	 * ordering of, or relationship between, these pointers changes, the
730 	 * macros must be kept in sync.
731 	 */
732 	viq->viq_dma_descs = virtio_dma_va(&viq->viq_dma, 0);
733 	viq->viq_dma_driver = virtio_dma_va(&viq->viq_dma, sz_descs);
734 	viq->viq_dma_device = virtio_dma_va(&viq->viq_dma, sz_driver);
735 
736 	/*
737 	 * Install in the per-device list of queues.
738 	 */
739 	mutex_enter(&vio->vio_mutex);
740 	for (virtio_queue_t *chkvq = list_head(&vio->vio_queues); chkvq != NULL;
741 	    chkvq = list_next(&vio->vio_queues, chkvq)) {
742 		if (chkvq->viq_index == qidx) {
743 			dev_err(vio->vio_dip, CE_WARN, "attempt to register "
744 			    "queue \"%s\" with same index (%d) as queue \"%s\"",
745 			    name, qidx, chkvq->viq_name);
746 			mutex_exit(&vio->vio_mutex);
747 			virtio_queue_free(viq);
748 			return (NULL);
749 		}
750 	}
751 	list_insert_tail(&vio->vio_queues, viq);
752 
753 	/*
754 	 * Ensure the zeroing of the queue memory is visible to the host before
755 	 * we inform the device of the queue address.
756 	 */
757 	membar_producer();
758 	VIRTQ_DMA_SYNC_FORDEV(viq);
759 
760 	virtio_put16(vio, VIRTIO_LEGACY_QUEUE_SELECT, qidx);
761 	virtio_put32(vio, VIRTIO_LEGACY_QUEUE_ADDRESS,
762 	    virtio_dma_cookie_pa(&viq->viq_dma, 0) >> VIRTIO_PAGE_SHIFT);
763 
764 	mutex_exit(&vio->vio_mutex);
765 	return (viq);
766 }
767 
768 static void
769 virtio_queue_free(virtio_queue_t *viq)
770 {
771 	virtio_t *vio = viq->viq_virtio;
772 
773 	/*
774 	 * We are going to destroy the queue mutex.  Make sure we've already
775 	 * removed the interrupt handlers.
776 	 */
777 	VERIFY(!(vio->vio_initlevel & VIRTIO_INITLEVEL_INT_ADDED));
778 
779 	mutex_enter(&viq->viq_mutex);
780 
781 	/*
782 	 * If the device has not already been reset as part of a shutdown,
783 	 * detach the queue from the device now.
784 	 */
785 	if (!viq->viq_shutdown) {
786 		virtio_put16(vio, VIRTIO_LEGACY_QUEUE_SELECT, viq->viq_index);
787 		virtio_put32(vio, VIRTIO_LEGACY_QUEUE_ADDRESS, 0);
788 	}
789 
790 	virtio_dma_fini(&viq->viq_dma);
791 
792 	VERIFY(avl_is_empty(&viq->viq_inflight));
793 	avl_destroy(&viq->viq_inflight);
794 	if (viq->viq_descmap != NULL) {
795 		id_space_destroy(viq->viq_descmap);
796 	}
797 
798 	mutex_exit(&viq->viq_mutex);
799 	mutex_destroy(&viq->viq_mutex);
800 
801 	kmem_free(viq, sizeof (*viq));
802 }
803 
804 void
805 virtio_queue_no_interrupt(virtio_queue_t *viq, boolean_t stop_interrupts)
806 {
807 	mutex_enter(&viq->viq_mutex);
808 
809 	if (stop_interrupts) {
810 		viq->viq_dma_driver->vqdr_flags |= VIRTQ_AVAIL_F_NO_INTERRUPT;
811 	} else {
812 		viq->viq_dma_driver->vqdr_flags &= ~VIRTQ_AVAIL_F_NO_INTERRUPT;
813 	}
814 	VIRTQ_DMA_SYNC_FORDEV(viq);
815 
816 	mutex_exit(&viq->viq_mutex);
817 }
818 
819 static virtio_chain_t *
820 virtio_queue_complete(virtio_queue_t *viq, uint_t index)
821 {
822 	VERIFY(MUTEX_HELD(&viq->viq_mutex));
823 
824 	virtio_chain_t *vic;
825 
826 	virtio_chain_t search;
827 	bzero(&search, sizeof (search));
828 	search.vic_head = index;
829 
830 	if ((vic = avl_find(&viq->viq_inflight, &search, NULL)) == NULL) {
831 		return (NULL);
832 	}
833 	avl_remove(&viq->viq_inflight, vic);
834 
835 	return (vic);
836 }
837 
838 uint_t
839 virtio_queue_size(virtio_queue_t *viq)
840 {
841 	return (viq->viq_size);
842 }
843 
844 uint_t
845 virtio_queue_nactive(virtio_queue_t *viq)
846 {
847 	mutex_enter(&viq->viq_mutex);
848 	uint_t r = avl_numnodes(&viq->viq_inflight);
849 	mutex_exit(&viq->viq_mutex);
850 
851 	return (r);
852 }
853 
854 virtio_chain_t *
855 virtio_queue_poll(virtio_queue_t *viq)
856 {
857 	mutex_enter(&viq->viq_mutex);
858 	if (viq->viq_shutdown) {
859 		/*
860 		 * The device has been reset by virtio_shutdown(), and queue
861 		 * processing has been halted.  Any previously submitted chains
862 		 * will be evacuated using virtio_queue_evacuate().
863 		 */
864 		mutex_exit(&viq->viq_mutex);
865 		return (NULL);
866 	}
867 
868 	VIRTQ_DMA_SYNC_FORKERNEL(viq);
869 	if (viq->viq_device_index == viq->viq_dma_device->vqde_index) {
870 		/*
871 		 * If the device index has not changed since the last poll,
872 		 * there are no new chains to process.
873 		 */
874 		mutex_exit(&viq->viq_mutex);
875 		return (NULL);
876 	}
877 
878 	/*
879 	 * We need to ensure that all reads from the descriptor (vqde_ring[])
880 	 * and any referenced memory by the descriptor occur after we have read
881 	 * the descriptor index value above (vqde_index).
882 	 */
883 	membar_consumer();
884 
885 	uint16_t index = (viq->viq_device_index++) % viq->viq_size;
886 	uint16_t start = viq->viq_dma_device->vqde_ring[index].vqe_start;
887 	uint32_t len = viq->viq_dma_device->vqde_ring[index].vqe_len;
888 
889 	virtio_chain_t *vic;
890 	if ((vic = virtio_queue_complete(viq, start)) == NULL) {
891 		/*
892 		 * We could not locate a chain for this descriptor index, which
893 		 * suggests that something has gone horribly wrong.
894 		 */
895 		dev_err(viq->viq_virtio->vio_dip, CE_PANIC,
896 		    "queue \"%s\" ring entry %u (descriptor %u) has no chain",
897 		    viq->viq_name, (uint16_t)index, (uint16_t)start);
898 	}
899 
900 	vic->vic_received_length = len;
901 
902 	mutex_exit(&viq->viq_mutex);
903 
904 	return (vic);
905 }
906 
907 /*
908  * After a call to "virtio_shutdown()", the driver must retrieve any previously
909  * submitted chains and free any associated resources.
910  */
911 virtio_chain_t *
912 virtio_queue_evacuate(virtio_queue_t *viq)
913 {
914 	virtio_t *vio = viq->viq_virtio;
915 
916 	mutex_enter(&vio->vio_mutex);
917 	if (!(vio->vio_initlevel & VIRTIO_INITLEVEL_SHUTDOWN)) {
918 		dev_err(vio->vio_dip, CE_PANIC,
919 		    "virtio_queue_evacuate() without virtio_shutdown()");
920 	}
921 	mutex_exit(&vio->vio_mutex);
922 
923 	mutex_enter(&viq->viq_mutex);
924 	VERIFY(viq->viq_shutdown);
925 
926 	virtio_chain_t *vic = avl_first(&viq->viq_inflight);
927 	if (vic != NULL) {
928 		avl_remove(&viq->viq_inflight, vic);
929 	}
930 
931 	mutex_exit(&viq->viq_mutex);
932 
933 	return (vic);
934 }
935 
936 /*
937  * VIRTQUEUE DESCRIPTOR CHAIN MANAGEMENT
938  */
939 
940 /*
941  * When the device returns a descriptor chain to the driver, it may provide the
942  * length in bytes of data written into the chain.  Client drivers should use
943  * this value with care; the specification suggests some device implementations
944  * have not always provided a useful or correct value.
945  */
946 size_t
947 virtio_chain_received_length(virtio_chain_t *vic)
948 {
949 	return (vic->vic_received_length);
950 }
951 
952 /*
953  * Allocate a descriptor chain for use with this queue.  The "kmflags" value
954  * may be KM_SLEEP or KM_NOSLEEP as per kmem_alloc(9F).
955  */
956 virtio_chain_t *
957 virtio_chain_alloc(virtio_queue_t *viq, int kmflags)
958 {
959 	virtio_t *vio = viq->viq_virtio;
960 	virtio_chain_t *vic;
961 	uint_t cap;
962 
963 	/*
964 	 * Direct descriptors are known by their index in the descriptor table
965 	 * for the queue.  We use the variable-length array member at the end
966 	 * of the chain tracking object to hold the list of direct descriptors
967 	 * assigned to this chain.
968 	 */
969 	if (viq->viq_indirect) {
970 		/*
971 		 * When using indirect descriptors we still need one direct
972 		 * descriptor entry to hold the physical address and length of
973 		 * the indirect descriptor table.
974 		 */
975 		cap = 1;
976 	} else {
977 		/*
978 		 * For direct descriptors we need to be able to track a
979 		 * descriptor for each possible segment in a single chain.
980 		 */
981 		cap = viq->viq_max_segs;
982 	}
983 
984 	size_t vicsz = sizeof (*vic) + sizeof (uint16_t) * cap;
985 	if ((vic = kmem_zalloc(vicsz, kmflags)) == NULL) {
986 		return (NULL);
987 	}
988 	vic->vic_vq = viq;
989 	vic->vic_direct_capacity = cap;
990 
991 	if (viq->viq_indirect) {
992 		/*
993 		 * Allocate an indirect descriptor list with the appropriate
994 		 * number of entries.
995 		 */
996 		if (virtio_dma_init(vio, &vic->vic_indirect_dma,
997 		    sizeof (virtio_vq_desc_t) * viq->viq_max_segs,
998 		    &virtio_dma_attr_indirect,
999 		    DDI_DMA_CONSISTENT | DDI_DMA_WRITE,
1000 		    kmflags) != DDI_SUCCESS) {
1001 			goto fail;
1002 		}
1003 
1004 		/*
1005 		 * Allocate a single descriptor to hold the indirect list.
1006 		 * Leave the length as zero for now; it will be set to include
1007 		 * any occupied entries at push time.
1008 		 */
1009 		mutex_enter(&viq->viq_mutex);
1010 		if (virtio_chain_append_impl(vic,
1011 		    virtio_dma_cookie_pa(&vic->vic_indirect_dma, 0), 0,
1012 		    VIRTQ_DESC_F_INDIRECT) != DDI_SUCCESS) {
1013 			mutex_exit(&viq->viq_mutex);
1014 			goto fail;
1015 		}
1016 		mutex_exit(&viq->viq_mutex);
1017 		VERIFY3U(vic->vic_direct_used, ==, 1);
1018 
1019 		/*
1020 		 * Don't set the indirect capacity until after we've installed
1021 		 * the direct descriptor which points at the indirect list, or
1022 		 * virtio_chain_append_impl() will be confused.
1023 		 */
1024 		vic->vic_indirect_capacity = viq->viq_max_segs;
1025 	}
1026 
1027 	return (vic);
1028 
1029 fail:
1030 	virtio_dma_fini(&vic->vic_indirect_dma);
1031 	kmem_free(vic, vicsz);
1032 	return (NULL);
1033 }
1034 
1035 void *
1036 virtio_chain_data(virtio_chain_t *vic)
1037 {
1038 	return (vic->vic_data);
1039 }
1040 
1041 void
1042 virtio_chain_data_set(virtio_chain_t *vic, void *data)
1043 {
1044 	vic->vic_data = data;
1045 }
1046 
1047 void
1048 virtio_chain_clear(virtio_chain_t *vic)
1049 {
1050 	if (vic->vic_indirect_capacity != 0) {
1051 		/*
1052 		 * There should only be one direct descriptor, which points at
1053 		 * our indirect descriptor list.  We don't want to clear it
1054 		 * here.
1055 		 */
1056 		VERIFY3U(vic->vic_direct_capacity, ==, 1);
1057 
1058 		if (vic->vic_indirect_used > 0) {
1059 			/*
1060 			 * Clear out the indirect descriptor table.
1061 			 */
1062 			vic->vic_indirect_used = 0;
1063 			bzero(virtio_dma_va(&vic->vic_indirect_dma, 0),
1064 			    virtio_dma_size(&vic->vic_indirect_dma));
1065 		}
1066 
1067 	} else if (vic->vic_direct_capacity > 0) {
1068 		/*
1069 		 * Release any descriptors that were assigned to us previously.
1070 		 */
1071 		for (uint_t i = 0; i < vic->vic_direct_used; i++) {
1072 			id_free(vic->vic_vq->viq_descmap, vic->vic_direct[i]);
1073 			vic->vic_direct[i] = 0;
1074 		}
1075 		vic->vic_direct_used = 0;
1076 	}
1077 }
1078 
1079 void
1080 virtio_chain_free(virtio_chain_t *vic)
1081 {
1082 	/*
1083 	 * First ensure that we have released any descriptors used by this
1084 	 * chain.
1085 	 */
1086 	virtio_chain_clear(vic);
1087 
1088 	if (vic->vic_indirect_capacity > 0) {
1089 		/*
1090 		 * Release the direct descriptor that points to our indirect
1091 		 * descriptor list.
1092 		 */
1093 		VERIFY3U(vic->vic_direct_capacity, ==, 1);
1094 		id_free(vic->vic_vq->viq_descmap, vic->vic_direct[0]);
1095 
1096 		virtio_dma_fini(&vic->vic_indirect_dma);
1097 	}
1098 
1099 	size_t vicsz = sizeof (*vic) +
1100 	    vic->vic_direct_capacity * sizeof (uint16_t);
1101 
1102 	kmem_free(vic, vicsz);
1103 }
1104 
1105 static inline int
1106 virtio_queue_descmap_alloc(virtio_queue_t *viq, uint_t *indexp)
1107 {
1108 	id_t index;
1109 
1110 	if ((index = id_alloc_nosleep(viq->viq_descmap)) == -1) {
1111 		return (ENOMEM);
1112 	}
1113 
1114 	VERIFY3S(index, >=, 0);
1115 	VERIFY3S(index, <=, viq->viq_size);
1116 
1117 	*indexp = (uint_t)index;
1118 	return (0);
1119 }
1120 
1121 static int
1122 virtio_chain_append_impl(virtio_chain_t *vic, uint64_t pa, size_t len,
1123     uint16_t flags)
1124 {
1125 	virtio_queue_t *viq = vic->vic_vq;
1126 	virtio_vq_desc_t *vqd;
1127 	uint_t index;
1128 
1129 	/*
1130 	 * We're modifying the queue-wide descriptor list so make sure we have
1131 	 * the appropriate lock.
1132 	 */
1133 	VERIFY(MUTEX_HELD(&viq->viq_mutex));
1134 
1135 	if (vic->vic_indirect_capacity != 0) {
1136 		/*
1137 		 * Use indirect descriptors.
1138 		 */
1139 		if (vic->vic_indirect_used >= vic->vic_indirect_capacity) {
1140 			return (DDI_FAILURE);
1141 		}
1142 
1143 		vqd = virtio_dma_va(&vic->vic_indirect_dma, 0);
1144 
1145 		if ((index = vic->vic_indirect_used++) > 0) {
1146 			/*
1147 			 * Chain the current last indirect descriptor to the
1148 			 * new one.
1149 			 */
1150 			vqd[index - 1].vqd_flags |= VIRTQ_DESC_F_NEXT;
1151 			vqd[index - 1].vqd_next = index;
1152 		}
1153 
1154 	} else {
1155 		/*
1156 		 * Use direct descriptors.
1157 		 */
1158 		if (vic->vic_direct_used >= vic->vic_direct_capacity) {
1159 			return (DDI_FAILURE);
1160 		}
1161 
1162 		if (virtio_queue_descmap_alloc(viq, &index) != 0) {
1163 			return (DDI_FAILURE);
1164 		}
1165 
1166 		vqd = virtio_dma_va(&viq->viq_dma, 0);
1167 
1168 		if (vic->vic_direct_used > 0) {
1169 			/*
1170 			 * This is not the first entry.  Chain the current
1171 			 * descriptor to the next one.
1172 			 */
1173 			uint16_t p = vic->vic_direct[vic->vic_direct_used - 1];
1174 
1175 			vqd[p].vqd_flags |= VIRTQ_DESC_F_NEXT;
1176 			vqd[p].vqd_next = index;
1177 		}
1178 		vic->vic_direct[vic->vic_direct_used++] = index;
1179 	}
1180 
1181 	vqd[index].vqd_addr = pa;
1182 	vqd[index].vqd_len = len;
1183 	vqd[index].vqd_flags = flags;
1184 	vqd[index].vqd_next = 0;
1185 
1186 	return (DDI_SUCCESS);
1187 }
1188 
1189 int
1190 virtio_chain_append(virtio_chain_t *vic, uint64_t pa, size_t len,
1191     virtio_direction_t dir)
1192 {
1193 	virtio_queue_t *viq = vic->vic_vq;
1194 	uint16_t flags = 0;
1195 
1196 	switch (dir) {
1197 	case VIRTIO_DIR_DEVICE_WRITES:
1198 		flags |= VIRTQ_DESC_F_WRITE;
1199 		break;
1200 
1201 	case VIRTIO_DIR_DEVICE_READS:
1202 		break;
1203 
1204 	default:
1205 		panic("unknown direction value %u", dir);
1206 	}
1207 
1208 	mutex_enter(&viq->viq_mutex);
1209 	int r = virtio_chain_append_impl(vic, pa, len, flags);
1210 	mutex_exit(&viq->viq_mutex);
1211 
1212 	return (r);
1213 }
1214 
1215 static void
1216 virtio_queue_flush_locked(virtio_queue_t *viq)
1217 {
1218 	VERIFY(MUTEX_HELD(&viq->viq_mutex));
1219 
1220 	/*
1221 	 * Make sure any writes we have just made to the descriptors
1222 	 * (vqdr_ring[]) are visible to the device before we update the ring
1223 	 * pointer (vqdr_index).
1224 	 */
1225 	membar_producer();
1226 	viq->viq_dma_driver->vqdr_index = viq->viq_driver_index;
1227 	VIRTQ_DMA_SYNC_FORDEV(viq);
1228 
1229 	/*
1230 	 * Determine whether the device expects us to notify it of new
1231 	 * descriptors.
1232 	 */
1233 	VIRTQ_DMA_SYNC_FORKERNEL(viq);
1234 	if (!(viq->viq_dma_device->vqde_flags & VIRTQ_USED_F_NO_NOTIFY)) {
1235 		virtio_put16(viq->viq_virtio, VIRTIO_LEGACY_QUEUE_NOTIFY,
1236 		    viq->viq_index);
1237 	}
1238 }
1239 
1240 void
1241 virtio_queue_flush(virtio_queue_t *viq)
1242 {
1243 	mutex_enter(&viq->viq_mutex);
1244 	virtio_queue_flush_locked(viq);
1245 	mutex_exit(&viq->viq_mutex);
1246 }
1247 
1248 void
1249 virtio_chain_submit(virtio_chain_t *vic, boolean_t flush)
1250 {
1251 	virtio_queue_t *viq = vic->vic_vq;
1252 
1253 	mutex_enter(&viq->viq_mutex);
1254 
1255 	if (vic->vic_indirect_capacity != 0) {
1256 		virtio_vq_desc_t *vqd = virtio_dma_va(&viq->viq_dma, 0);
1257 
1258 		VERIFY3U(vic->vic_direct_used, ==, 1);
1259 
1260 		/*
1261 		 * This is an indirect descriptor queue.  The length in bytes
1262 		 * of the descriptor must extend to cover the populated
1263 		 * indirect descriptor entries.
1264 		 */
1265 		vqd[vic->vic_direct[0]].vqd_len =
1266 		    sizeof (virtio_vq_desc_t) * vic->vic_indirect_used;
1267 
1268 		virtio_dma_sync(&vic->vic_indirect_dma, DDI_DMA_SYNC_FORDEV);
1269 	}
1270 
1271 	/*
1272 	 * Populate the next available slot in the driver-owned ring for this
1273 	 * chain.  The updated value of viq_driver_index is not yet visible to
1274 	 * the device until a subsequent queue flush.
1275 	 */
1276 	uint16_t index = (viq->viq_driver_index++) % viq->viq_size;
1277 	viq->viq_dma_driver->vqdr_ring[index] = vic->vic_direct[0];
1278 
1279 	vic->vic_head = vic->vic_direct[0];
1280 	avl_add(&viq->viq_inflight, vic);
1281 
1282 	if (flush) {
1283 		virtio_queue_flush_locked(vic->vic_vq);
1284 	}
1285 
1286 	mutex_exit(&viq->viq_mutex);
1287 }
1288 
1289 /*
1290  * INTERRUPTS MANAGEMENT
1291  */
1292 
1293 static const char *
1294 virtio_interrupt_type_name(int type)
1295 {
1296 	switch (type) {
1297 	case DDI_INTR_TYPE_MSIX:
1298 		return ("MSI-X");
1299 	case DDI_INTR_TYPE_MSI:
1300 		return ("MSI");
1301 	case DDI_INTR_TYPE_FIXED:
1302 		return ("fixed");
1303 	default:
1304 		return ("?");
1305 	}
1306 }
1307 
1308 static int
1309 virtio_interrupts_alloc(virtio_t *vio, int type, int nrequired)
1310 {
1311 	dev_info_t *dip = vio->vio_dip;
1312 	int nintrs = 0;
1313 	int navail = 0;
1314 
1315 	VERIFY(MUTEX_HELD(&vio->vio_mutex));
1316 	VERIFY(!(vio->vio_initlevel & VIRTIO_INITLEVEL_INT_ALLOC));
1317 
1318 	if (ddi_intr_get_nintrs(dip, type, &nintrs) != DDI_SUCCESS) {
1319 		dev_err(dip, CE_WARN, "could not count %s interrupts",
1320 		    virtio_interrupt_type_name(type));
1321 		return (DDI_FAILURE);
1322 	}
1323 	if (nintrs < 1) {
1324 		dev_err(dip, CE_WARN, "no %s interrupts supported",
1325 		    virtio_interrupt_type_name(type));
1326 		return (DDI_FAILURE);
1327 	}
1328 
1329 	if (ddi_intr_get_navail(dip, type, &navail) != DDI_SUCCESS) {
1330 		dev_err(dip, CE_WARN, "could not count available %s interrupts",
1331 		    virtio_interrupt_type_name(type));
1332 		return (DDI_FAILURE);
1333 	}
1334 	if (navail < nrequired) {
1335 		dev_err(dip, CE_WARN, "need %d %s interrupts, but only %d "
1336 		    "available", nrequired, virtio_interrupt_type_name(type),
1337 		    navail);
1338 		return (DDI_FAILURE);
1339 	}
1340 
1341 	VERIFY3P(vio->vio_interrupts, ==, NULL);
1342 	vio->vio_interrupts = kmem_zalloc(
1343 	    sizeof (ddi_intr_handle_t) * nrequired, KM_SLEEP);
1344 
1345 	int r;
1346 	if ((r = ddi_intr_alloc(dip, vio->vio_interrupts, type, 0, nrequired,
1347 	    &vio->vio_ninterrupts, DDI_INTR_ALLOC_STRICT)) != DDI_SUCCESS) {
1348 		dev_err(dip, CE_WARN, "%s interrupt allocation failure (%d)",
1349 		    virtio_interrupt_type_name(type), r);
1350 		kmem_free(vio->vio_interrupts,
1351 		    sizeof (ddi_intr_handle_t) * nrequired);
1352 		vio->vio_interrupts = NULL;
1353 		return (DDI_FAILURE);
1354 	}
1355 
1356 	vio->vio_initlevel |= VIRTIO_INITLEVEL_INT_ALLOC;
1357 	vio->vio_interrupt_type = type;
1358 	return (DDI_SUCCESS);
1359 }
1360 
1361 static uint_t
1362 virtio_shared_isr(caddr_t arg0, caddr_t arg1)
1363 {
1364 	virtio_t *vio = (virtio_t *)arg0;
1365 	uint_t r = DDI_INTR_UNCLAIMED;
1366 	uint8_t isr;
1367 
1368 	mutex_enter(&vio->vio_mutex);
1369 
1370 	/*
1371 	 * Check the ISR status to see if the interrupt applies to us.  Reading
1372 	 * this field resets it to zero.
1373 	 */
1374 	isr = virtio_get8(vio, VIRTIO_LEGACY_ISR_STATUS);
1375 
1376 	if ((isr & VIRTIO_ISR_CHECK_QUEUES) != 0) {
1377 		r = DDI_INTR_CLAIMED;
1378 
1379 		for (virtio_queue_t *viq = list_head(&vio->vio_queues);
1380 		    viq != NULL; viq = list_next(&vio->vio_queues, viq)) {
1381 			if (viq->viq_func != NULL) {
1382 				mutex_exit(&vio->vio_mutex);
1383 				(void) viq->viq_func(viq->viq_funcarg, arg0);
1384 				mutex_enter(&vio->vio_mutex);
1385 
1386 				if (vio->vio_initlevel &
1387 				    VIRTIO_INITLEVEL_SHUTDOWN) {
1388 					/*
1389 					 * The device was shut down while in a
1390 					 * queue handler routine.
1391 					 */
1392 					break;
1393 				}
1394 			}
1395 		}
1396 	}
1397 
1398 	mutex_exit(&vio->vio_mutex);
1399 
1400 	/*
1401 	 * vio_cfgchange_{handler,handlerarg} cannot change while interrupts
1402 	 * are configured so it is safe to access them outside of the lock.
1403 	 */
1404 
1405 	if ((isr & VIRTIO_ISR_CHECK_CONFIG) != 0) {
1406 		r = DDI_INTR_CLAIMED;
1407 		if (vio->vio_cfgchange_handler != NULL) {
1408 			(void) vio->vio_cfgchange_handler(
1409 			    (caddr_t)vio->vio_cfgchange_handlerarg,
1410 			    (caddr_t)vio);
1411 		}
1412 	}
1413 
1414 	return (r);
1415 }
1416 
1417 static int
1418 virtio_interrupts_setup(virtio_t *vio, int allow_types)
1419 {
1420 	dev_info_t *dip = vio->vio_dip;
1421 	int types;
1422 	int count = 0;
1423 
1424 	mutex_enter(&vio->vio_mutex);
1425 
1426 	/*
1427 	 * Determine the number of interrupts we'd like based on the number of
1428 	 * virtqueues.
1429 	 */
1430 	for (virtio_queue_t *viq = list_head(&vio->vio_queues); viq != NULL;
1431 	    viq = list_next(&vio->vio_queues, viq)) {
1432 		if (viq->viq_func != NULL) {
1433 			count++;
1434 		}
1435 	}
1436 
1437 	/*
1438 	 * If there is a configuration change handler, one extra interrupt
1439 	 * is needed for that.
1440 	 */
1441 	if (vio->vio_cfgchange_handler != NULL)
1442 		count++;
1443 
1444 	if (ddi_intr_get_supported_types(dip, &types) != DDI_SUCCESS) {
1445 		dev_err(dip, CE_WARN, "could not get supported interrupts");
1446 		mutex_exit(&vio->vio_mutex);
1447 		return (DDI_FAILURE);
1448 	}
1449 
1450 	if (allow_types != VIRTIO_ANY_INTR_TYPE) {
1451 		/*
1452 		 * Restrict the possible interrupt types at the request of the
1453 		 * driver.
1454 		 */
1455 		types &= allow_types;
1456 	}
1457 
1458 	/*
1459 	 * Try each potential interrupt type in descending order of preference.
1460 	 * Note that the specification does not appear to allow for the use of
1461 	 * classical MSI, so we are limited to either MSI-X or fixed
1462 	 * interrupts.
1463 	 */
1464 	if (types & DDI_INTR_TYPE_MSIX) {
1465 		if (virtio_interrupts_alloc(vio, DDI_INTR_TYPE_MSIX,
1466 		    count) == DDI_SUCCESS) {
1467 			goto add_handlers;
1468 		}
1469 	}
1470 	if (types & DDI_INTR_TYPE_FIXED) {
1471 		/*
1472 		 * If fixed interrupts are all that are available, we'll just
1473 		 * ask for one.
1474 		 */
1475 		if (virtio_interrupts_alloc(vio, DDI_INTR_TYPE_FIXED, 1) ==
1476 		    DDI_SUCCESS) {
1477 			goto add_handlers;
1478 		}
1479 	}
1480 
1481 	dev_err(dip, CE_WARN, "interrupt allocation failed");
1482 	mutex_exit(&vio->vio_mutex);
1483 	return (DDI_FAILURE);
1484 
1485 add_handlers:
1486 	/*
1487 	 * Ensure that we have not been given any high-level interrupts as our
1488 	 * interrupt handlers do not support them.
1489 	 */
1490 	for (int i = 0; i < vio->vio_ninterrupts; i++) {
1491 		uint_t ipri;
1492 
1493 		if (ddi_intr_get_pri(vio->vio_interrupts[i], &ipri) !=
1494 		    DDI_SUCCESS) {
1495 			dev_err(dip, CE_WARN, "could not determine interrupt "
1496 			    "priority");
1497 			goto fail;
1498 		}
1499 
1500 		if (ipri >= ddi_intr_get_hilevel_pri()) {
1501 			dev_err(dip, CE_WARN, "high level interrupts not "
1502 			    "supported");
1503 			goto fail;
1504 		}
1505 
1506 		/*
1507 		 * Record the highest priority we've been allocated to use for
1508 		 * mutex initialisation.
1509 		 */
1510 		if (i == 0 || ipri > vio->vio_interrupt_priority) {
1511 			vio->vio_interrupt_priority = ipri;
1512 		}
1513 	}
1514 
1515 	/*
1516 	 * Get the interrupt capabilities from the first handle to determine
1517 	 * whether we need to use ddi_intr_block_enable(9F).
1518 	 */
1519 	if (ddi_intr_get_cap(vio->vio_interrupts[0],
1520 	    &vio->vio_interrupt_cap) != DDI_SUCCESS) {
1521 		dev_err(dip, CE_WARN, "failed to get interrupt capabilities");
1522 		goto fail;
1523 	}
1524 
1525 	if (vio->vio_interrupt_type == DDI_INTR_TYPE_FIXED) {
1526 		VERIFY3S(vio->vio_ninterrupts, ==, 1);
1527 		/*
1528 		 * For fixed interrupts, we need to use our shared handler to
1529 		 * multiplex the per-queue handlers provided by the driver.
1530 		 */
1531 		if (ddi_intr_add_handler(vio->vio_interrupts[0],
1532 		    virtio_shared_isr, (caddr_t)vio, NULL) != DDI_SUCCESS) {
1533 			dev_err(dip, CE_WARN, "adding shared %s interrupt "
1534 			    "handler failed", virtio_interrupt_type_name(
1535 			    vio->vio_interrupt_type));
1536 			goto fail;
1537 		}
1538 
1539 		goto done;
1540 	}
1541 
1542 	VERIFY3S(vio->vio_ninterrupts, ==, count);
1543 
1544 	uint_t n = 0;
1545 
1546 	/* Bind the configuration vector interrupt */
1547 	if (vio->vio_cfgchange_handler != NULL) {
1548 		if (ddi_intr_add_handler(vio->vio_interrupts[n],
1549 		    vio->vio_cfgchange_handler,
1550 		    (caddr_t)vio->vio_cfgchange_handlerarg,
1551 		    (caddr_t)vio) != DDI_SUCCESS) {
1552 			dev_err(dip, CE_WARN,
1553 			    "adding configuration change interrupt failed");
1554 			goto fail;
1555 		}
1556 		vio->vio_cfgchange_handler_added = B_TRUE;
1557 		vio->vio_cfgchange_handler_index = n;
1558 		n++;
1559 	}
1560 
1561 	for (virtio_queue_t *viq = list_head(&vio->vio_queues); viq != NULL;
1562 	    viq = list_next(&vio->vio_queues, viq)) {
1563 		if (viq->viq_func == NULL) {
1564 			continue;
1565 		}
1566 
1567 		if (ddi_intr_add_handler(vio->vio_interrupts[n],
1568 		    viq->viq_func, (caddr_t)viq->viq_funcarg,
1569 		    (caddr_t)vio) != DDI_SUCCESS) {
1570 			dev_err(dip, CE_WARN, "adding interrupt %u (%s) failed",
1571 			    n, viq->viq_name);
1572 			goto fail;
1573 		}
1574 
1575 		viq->viq_handler_index = n;
1576 		viq->viq_handler_added = B_TRUE;
1577 		n++;
1578 	}
1579 
1580 done:
1581 	vio->vio_initlevel |= VIRTIO_INITLEVEL_INT_ADDED;
1582 	mutex_exit(&vio->vio_mutex);
1583 	return (DDI_SUCCESS);
1584 
1585 fail:
1586 	virtio_interrupts_teardown(vio);
1587 	mutex_exit(&vio->vio_mutex);
1588 	return (DDI_FAILURE);
1589 }
1590 
1591 static void
1592 virtio_interrupts_teardown(virtio_t *vio)
1593 {
1594 	VERIFY(MUTEX_HELD(&vio->vio_mutex));
1595 
1596 	virtio_interrupts_disable_locked(vio);
1597 
1598 	if (vio->vio_interrupt_type == DDI_INTR_TYPE_FIXED) {
1599 		/*
1600 		 * Remove the multiplexing interrupt handler.
1601 		 */
1602 		if (vio->vio_initlevel & VIRTIO_INITLEVEL_INT_ADDED) {
1603 			int r;
1604 
1605 			VERIFY3S(vio->vio_ninterrupts, ==, 1);
1606 
1607 			if ((r = ddi_intr_remove_handler(
1608 			    vio->vio_interrupts[0])) != DDI_SUCCESS) {
1609 				dev_err(vio->vio_dip, CE_WARN, "removing "
1610 				    "shared interrupt handler failed (%d)", r);
1611 			}
1612 		}
1613 	} else {
1614 		/*
1615 		 * Remove the configuration vector interrupt handler.
1616 		 */
1617 		if (vio->vio_cfgchange_handler_added) {
1618 			int r;
1619 
1620 			if ((r = ddi_intr_remove_handler(
1621 			    vio->vio_interrupts[0])) != DDI_SUCCESS) {
1622 				dev_err(vio->vio_dip, CE_WARN,
1623 				    "removing configuration change interrupt "
1624 				    "handler failed (%d)", r);
1625 			}
1626 			vio->vio_cfgchange_handler_added = B_FALSE;
1627 		}
1628 
1629 		for (virtio_queue_t *viq = list_head(&vio->vio_queues);
1630 		    viq != NULL; viq = list_next(&vio->vio_queues, viq)) {
1631 			int r;
1632 
1633 			if (!viq->viq_handler_added) {
1634 				continue;
1635 			}
1636 
1637 			if ((r = ddi_intr_remove_handler(
1638 			    vio->vio_interrupts[viq->viq_handler_index])) !=
1639 			    DDI_SUCCESS) {
1640 				dev_err(vio->vio_dip, CE_WARN, "removing "
1641 				    "interrupt handler (%s) failed (%d)",
1642 				    viq->viq_name, r);
1643 			}
1644 
1645 			viq->viq_handler_added = B_FALSE;
1646 		}
1647 	}
1648 	vio->vio_initlevel &= ~VIRTIO_INITLEVEL_INT_ADDED;
1649 
1650 	if (vio->vio_initlevel & VIRTIO_INITLEVEL_INT_ALLOC) {
1651 		for (int i = 0; i < vio->vio_ninterrupts; i++) {
1652 			int r;
1653 
1654 			if ((r = ddi_intr_free(vio->vio_interrupts[i])) !=
1655 			    DDI_SUCCESS) {
1656 				dev_err(vio->vio_dip, CE_WARN, "freeing "
1657 				    "interrupt %u failed (%d)", i, r);
1658 			}
1659 		}
1660 		kmem_free(vio->vio_interrupts,
1661 		    sizeof (ddi_intr_handle_t) * vio->vio_ninterrupts);
1662 		vio->vio_interrupts = NULL;
1663 		vio->vio_ninterrupts = 0;
1664 		vio->vio_interrupt_type = 0;
1665 		vio->vio_interrupt_cap = 0;
1666 		vio->vio_interrupt_priority = 0;
1667 
1668 		vio->vio_initlevel &= ~VIRTIO_INITLEVEL_INT_ALLOC;
1669 	}
1670 }
1671 
1672 static void
1673 virtio_interrupts_unwind(virtio_t *vio)
1674 {
1675 	VERIFY(MUTEX_HELD(&vio->vio_mutex));
1676 
1677 	if (vio->vio_interrupt_type == DDI_INTR_TYPE_MSIX) {
1678 		for (virtio_queue_t *viq = list_head(&vio->vio_queues);
1679 		    viq != NULL; viq = list_next(&vio->vio_queues, viq)) {
1680 			if (!viq->viq_handler_added) {
1681 				continue;
1682 			}
1683 
1684 			virtio_put16(vio, VIRTIO_LEGACY_QUEUE_SELECT,
1685 			    viq->viq_index);
1686 			virtio_put16(vio, VIRTIO_LEGACY_MSIX_QUEUE,
1687 			    VIRTIO_LEGACY_MSI_NO_VECTOR);
1688 		}
1689 
1690 		if (vio->vio_cfgchange_handler_added) {
1691 			virtio_put16(vio, VIRTIO_LEGACY_MSIX_CONFIG,
1692 			    VIRTIO_LEGACY_MSI_NO_VECTOR);
1693 		}
1694 	}
1695 
1696 	if (vio->vio_interrupt_cap & DDI_INTR_FLAG_BLOCK) {
1697 		(void) ddi_intr_block_disable(vio->vio_interrupts,
1698 		    vio->vio_ninterrupts);
1699 	} else {
1700 		for (int i = 0; i < vio->vio_ninterrupts; i++) {
1701 			(void) ddi_intr_disable(vio->vio_interrupts[i]);
1702 		}
1703 	}
1704 
1705 	/*
1706 	 * Disabling the interrupts makes the MSI-X fields disappear from the
1707 	 * BAR once more.
1708 	 */
1709 	vio->vio_config_offset = VIRTIO_LEGACY_CFG_OFFSET;
1710 }
1711 
1712 int
1713 virtio_interrupts_enable(virtio_t *vio)
1714 {
1715 	mutex_enter(&vio->vio_mutex);
1716 	if (vio->vio_initlevel & VIRTIO_INITLEVEL_INT_ENABLED) {
1717 		mutex_exit(&vio->vio_mutex);
1718 		return (DDI_SUCCESS);
1719 	}
1720 
1721 	int r = DDI_SUCCESS;
1722 	if (vio->vio_interrupt_cap & DDI_INTR_FLAG_BLOCK) {
1723 		r = ddi_intr_block_enable(vio->vio_interrupts,
1724 		    vio->vio_ninterrupts);
1725 	} else {
1726 		for (int i = 0; i < vio->vio_ninterrupts; i++) {
1727 			if ((r = ddi_intr_enable(vio->vio_interrupts[i])) !=
1728 			    DDI_SUCCESS) {
1729 				/*
1730 				 * Disable the interrupts we have enabled so
1731 				 * far.
1732 				 */
1733 				for (i--; i >= 0; i--) {
1734 					(void) ddi_intr_disable(
1735 					    vio->vio_interrupts[i]);
1736 				}
1737 				break;
1738 			}
1739 		}
1740 	}
1741 
1742 	if (r != DDI_SUCCESS) {
1743 		mutex_exit(&vio->vio_mutex);
1744 		return (r);
1745 	}
1746 
1747 	if (vio->vio_interrupt_type == DDI_INTR_TYPE_MSIX) {
1748 		/*
1749 		 * When asked to enable the interrupts, the system enables
1750 		 * MSI-X in the PCI configuration for the device.  While
1751 		 * enabled, the extra MSI-X configuration table fields appear
1752 		 * between the general and the device-specific regions of the
1753 		 * BAR.
1754 		 */
1755 		vio->vio_config_offset = VIRTIO_LEGACY_CFG_OFFSET_MSIX;
1756 
1757 		for (virtio_queue_t *viq = list_head(&vio->vio_queues);
1758 		    viq != NULL; viq = list_next(&vio->vio_queues, viq)) {
1759 			if (!viq->viq_handler_added) {
1760 				continue;
1761 			}
1762 
1763 			uint16_t qi = viq->viq_index;
1764 			uint16_t msi = viq->viq_handler_index;
1765 
1766 			/*
1767 			 * Route interrupts for this queue to the assigned
1768 			 * MSI-X vector number.
1769 			 */
1770 			virtio_put16(vio, VIRTIO_LEGACY_QUEUE_SELECT, qi);
1771 			virtio_put16(vio, VIRTIO_LEGACY_MSIX_QUEUE, msi);
1772 
1773 			/*
1774 			 * The device may not actually accept the vector number
1775 			 * we're attempting to program.  We need to confirm
1776 			 * that configuration was successful by re-reading the
1777 			 * configuration we just wrote.
1778 			 */
1779 			if (virtio_get16(vio, VIRTIO_LEGACY_MSIX_QUEUE) !=
1780 			    msi) {
1781 				dev_err(vio->vio_dip, CE_WARN,
1782 				    "failed to configure MSI-X vector %u for "
1783 				    "queue \"%s\" (#%u)", (uint_t)msi,
1784 				    viq->viq_name, (uint_t)qi);
1785 
1786 				virtio_interrupts_unwind(vio);
1787 				mutex_exit(&vio->vio_mutex);
1788 				return (DDI_FAILURE);
1789 			}
1790 		}
1791 
1792 		if (vio->vio_cfgchange_handler_added) {
1793 			virtio_put16(vio, VIRTIO_LEGACY_MSIX_CONFIG,
1794 			    vio->vio_cfgchange_handler_index);
1795 
1796 			/* Verify the value was accepted. */
1797 			if (virtio_get16(vio, VIRTIO_LEGACY_MSIX_CONFIG) !=
1798 			    vio->vio_cfgchange_handler_index) {
1799 				dev_err(vio->vio_dip, CE_WARN,
1800 				    "failed to configure MSI-X vector for "
1801 				    "configuration");
1802 
1803 				virtio_interrupts_unwind(vio);
1804 				mutex_exit(&vio->vio_mutex);
1805 				return (DDI_FAILURE);
1806 			}
1807 		}
1808 	}
1809 
1810 	vio->vio_initlevel |= VIRTIO_INITLEVEL_INT_ENABLED;
1811 
1812 	mutex_exit(&vio->vio_mutex);
1813 	return (DDI_SUCCESS);
1814 }
1815 
1816 static void
1817 virtio_interrupts_disable_locked(virtio_t *vio)
1818 {
1819 	VERIFY(MUTEX_HELD(&vio->vio_mutex));
1820 
1821 	if (!(vio->vio_initlevel & VIRTIO_INITLEVEL_INT_ENABLED)) {
1822 		return;
1823 	}
1824 
1825 	virtio_interrupts_unwind(vio);
1826 
1827 	vio->vio_initlevel &= ~VIRTIO_INITLEVEL_INT_ENABLED;
1828 }
1829 
1830 void
1831 virtio_interrupts_disable(virtio_t *vio)
1832 {
1833 	mutex_enter(&vio->vio_mutex);
1834 	virtio_interrupts_disable_locked(vio);
1835 	mutex_exit(&vio->vio_mutex);
1836 }
1837