xref: /illumos-gate/usr/src/uts/common/io/virtio/virtio_main.c (revision b8f43eb65c2ac2ff69cf1a69aabc90c27cdb859e)
1 /*
2  * This file and its contents are supplied under the terms of the
3  * Common Development and Distribution License ("CDDL"), version 1.0.
4  * You may only use this file in accordance with the terms of version
5  * 1.0 of the CDDL.
6  *
7  * A full copy of the text of the CDDL should have accompanied this
8  * source.  A copy of the CDDL is also available via the Internet at
9  * http://www.illumos.org/license/CDDL.
10  */
11 
12 /*
13  * Copyright 2019 Joyent, Inc.
14  * Copyright 2022 OmniOS Community Edition (OmniOSce) Association.
15  */
16 
17 /*
18  * VIRTIO FRAMEWORK
19  *
20  * For design and usage documentation, see the comments in "virtio.h".
21  */
22 
23 #include <sys/conf.h>
24 #include <sys/kmem.h>
25 #include <sys/debug.h>
26 #include <sys/modctl.h>
27 #include <sys/autoconf.h>
28 #include <sys/ddi_impldefs.h>
29 #include <sys/ddi.h>
30 #include <sys/sunddi.h>
31 #include <sys/sunndi.h>
32 #include <sys/avintr.h>
33 #include <sys/spl.h>
34 #include <sys/promif.h>
35 #include <sys/list.h>
36 #include <sys/bootconf.h>
37 #include <sys/bootsvcs.h>
38 #include <sys/sysmacros.h>
39 #include <sys/pci.h>
40 
41 #include "virtio.h"
42 #include "virtio_impl.h"
43 
44 
45 /*
46  * Linkage structures
47  */
48 static struct modlmisc virtio_modlmisc = {
49 	.misc_modops =			&mod_miscops,
50 	.misc_linkinfo =		"VIRTIO common routines",
51 };
52 
53 static struct modlinkage virtio_modlinkage = {
54 	.ml_rev =			MODREV_1,
55 	.ml_linkage =			{ &virtio_modlmisc, NULL }
56 };
57 
58 int
59 _init(void)
60 {
61 	return (mod_install(&virtio_modlinkage));
62 }
63 
64 int
65 _fini(void)
66 {
67 	return (mod_remove(&virtio_modlinkage));
68 }
69 
70 int
71 _info(struct modinfo *modinfop)
72 {
73 	return (mod_info(&virtio_modlinkage, modinfop));
74 }
75 
76 
77 
78 static void virtio_set_status(virtio_t *, uint8_t);
79 static int virtio_chain_append_impl(virtio_chain_t *, uint64_t, size_t,
80     uint16_t);
81 static int virtio_interrupts_setup(virtio_t *, int);
82 static void virtio_interrupts_teardown(virtio_t *);
83 static void virtio_interrupts_disable_locked(virtio_t *);
84 static void virtio_queue_free(virtio_queue_t *);
85 static void virtio_device_reset_locked(virtio_t *);
86 
87 /*
88  * We use the same device access attributes for BAR mapping and access to the
89  * virtqueue memory.
90  */
91 ddi_device_acc_attr_t virtio_acc_attr = {
92 	.devacc_attr_version =		DDI_DEVICE_ATTR_V1,
93 	.devacc_attr_endian_flags =	DDI_NEVERSWAP_ACC,
94 	.devacc_attr_dataorder =	DDI_STORECACHING_OK_ACC,
95 	.devacc_attr_access =		DDI_DEFAULT_ACC
96 };
97 
98 
99 /*
100  * DMA attributes for the memory given to the device for queue management.
101  */
102 ddi_dma_attr_t virtio_dma_attr_queue = {
103 	.dma_attr_version =		DMA_ATTR_V0,
104 	.dma_attr_addr_lo =		0x0000000000000000,
105 	/*
106 	 * Queue memory is aligned on VIRTIO_PAGE_SIZE with the address shifted
107 	 * down by VIRTIO_PAGE_SHIFT before being passed to the device in a
108 	 * 32-bit register.
109 	 */
110 	.dma_attr_addr_hi =		0x00000FFFFFFFF000,
111 	.dma_attr_count_max =		0x00000000FFFFFFFF,
112 	.dma_attr_align =		VIRTIO_PAGE_SIZE,
113 	.dma_attr_burstsizes =		1,
114 	.dma_attr_minxfer =		1,
115 	.dma_attr_maxxfer =		0x00000000FFFFFFFF,
116 	.dma_attr_seg =			0x00000000FFFFFFFF,
117 	.dma_attr_sgllen =		1,
118 	.dma_attr_granular =		1,
119 	.dma_attr_flags =		0
120 };
121 
122 /*
123  * DMA attributes for the the allocation of indirect descriptor lists.  The
124  * indirect list is referenced by a regular descriptor entry: the physical
125  * address field is 64 bits wide, but the length field is only 32 bits.  Each
126  * descriptor is 16 bytes long.
127  */
128 ddi_dma_attr_t virtio_dma_attr_indirect = {
129 	.dma_attr_version =		DMA_ATTR_V0,
130 	.dma_attr_addr_lo =		0x0000000000000000,
131 	.dma_attr_addr_hi =		0xFFFFFFFFFFFFFFFF,
132 	.dma_attr_count_max =		0x00000000FFFFFFFF,
133 	.dma_attr_align =		sizeof (struct virtio_vq_desc),
134 	.dma_attr_burstsizes =		1,
135 	.dma_attr_minxfer =		1,
136 	.dma_attr_maxxfer =		0x00000000FFFFFFFF,
137 	.dma_attr_seg =			0x00000000FFFFFFFF,
138 	.dma_attr_sgllen =		1,
139 	.dma_attr_granular =		1,
140 	.dma_attr_flags =		0
141 };
142 
143 
144 uint8_t
145 virtio_get8(virtio_t *vio, uintptr_t offset)
146 {
147 	return (ddi_get8(vio->vio_barh, (uint8_t *)(vio->vio_bar + offset)));
148 }
149 
150 uint16_t
151 virtio_get16(virtio_t *vio, uintptr_t offset)
152 {
153 	return (ddi_get16(vio->vio_barh, (uint16_t *)(vio->vio_bar + offset)));
154 }
155 
156 uint32_t
157 virtio_get32(virtio_t *vio, uintptr_t offset)
158 {
159 	return (ddi_get32(vio->vio_barh, (uint32_t *)(vio->vio_bar + offset)));
160 }
161 
162 void
163 virtio_put8(virtio_t *vio, uintptr_t offset, uint8_t value)
164 {
165 	ddi_put8(vio->vio_barh, (uint8_t *)(vio->vio_bar + offset), value);
166 }
167 
168 void
169 virtio_put16(virtio_t *vio, uintptr_t offset, uint16_t value)
170 {
171 	ddi_put16(vio->vio_barh, (uint16_t *)(vio->vio_bar + offset), value);
172 }
173 
174 void
175 virtio_put32(virtio_t *vio, uintptr_t offset, uint32_t value)
176 {
177 	ddi_put32(vio->vio_barh, (uint32_t *)(vio->vio_bar + offset), value);
178 }
179 
180 void
181 virtio_fini(virtio_t *vio, boolean_t failed)
182 {
183 	mutex_enter(&vio->vio_mutex);
184 
185 	virtio_interrupts_teardown(vio);
186 
187 	virtio_queue_t *viq;
188 	while ((viq = list_remove_head(&vio->vio_queues)) != NULL) {
189 		virtio_queue_free(viq);
190 	}
191 	list_destroy(&vio->vio_queues);
192 
193 	if (failed) {
194 		/*
195 		 * Signal to the host that device setup failed.
196 		 */
197 		virtio_set_status(vio, VIRTIO_STATUS_FAILED);
198 	} else {
199 		virtio_device_reset_locked(vio);
200 	}
201 
202 	/*
203 	 * We don't need to do anything for the provider initlevel, as it
204 	 * merely records the fact that virtio_init_complete() was called.
205 	 */
206 	vio->vio_initlevel &= ~VIRTIO_INITLEVEL_PROVIDER;
207 
208 	if (vio->vio_initlevel & VIRTIO_INITLEVEL_REGS) {
209 		/*
210 		 * Unmap PCI BAR0.
211 		 */
212 		ddi_regs_map_free(&vio->vio_barh);
213 
214 		vio->vio_initlevel &= ~VIRTIO_INITLEVEL_REGS;
215 	}
216 
217 	/*
218 	 * Ensure we have torn down everything we set up.
219 	 */
220 	vio->vio_initlevel &= ~VIRTIO_INITLEVEL_SHUTDOWN;
221 	VERIFY0(vio->vio_initlevel);
222 
223 	mutex_exit(&vio->vio_mutex);
224 	mutex_destroy(&vio->vio_mutex);
225 
226 	kmem_free(vio, sizeof (*vio));
227 }
228 
229 /*
230  * Early device initialisation for legacy (pre-1.0 specification) virtio
231  * devices.
232  */
233 virtio_t *
234 virtio_init(dev_info_t *dip, uint64_t driver_features, boolean_t allow_indirect)
235 {
236 	int r;
237 
238 	/*
239 	 * First, confirm that this is a legacy device.
240 	 */
241 	ddi_acc_handle_t pci;
242 	if (pci_config_setup(dip, &pci) != DDI_SUCCESS) {
243 		dev_err(dip, CE_WARN, "pci_config_setup failed");
244 		return (NULL);
245 	}
246 
247 	uint8_t revid;
248 	if ((revid = pci_config_get8(pci, PCI_CONF_REVID)) == PCI_EINVAL8) {
249 		dev_err(dip, CE_WARN, "could not read config space");
250 		pci_config_teardown(&pci);
251 		return (NULL);
252 	}
253 
254 	pci_config_teardown(&pci);
255 
256 	/*
257 	 * The legacy specification requires that the device advertise as PCI
258 	 * Revision 0.
259 	 */
260 	if (revid != 0) {
261 		dev_err(dip, CE_WARN, "PCI Revision %u incorrect for "
262 		    "legacy virtio device", (uint_t)revid);
263 		return (NULL);
264 	}
265 
266 	virtio_t *vio = kmem_zalloc(sizeof (*vio), KM_SLEEP);
267 	vio->vio_dip = dip;
268 
269 	/*
270 	 * Map PCI BAR0 for legacy device access.
271 	 */
272 	if ((r = ddi_regs_map_setup(dip, VIRTIO_LEGACY_PCI_BAR0,
273 	    (caddr_t *)&vio->vio_bar, 0, 0, &virtio_acc_attr,
274 	    &vio->vio_barh)) != DDI_SUCCESS) {
275 		dev_err(dip, CE_WARN, "ddi_regs_map_setup failure (%d)", r);
276 		kmem_free(vio, sizeof (*vio));
277 		return (NULL);
278 	}
279 	vio->vio_initlevel |= VIRTIO_INITLEVEL_REGS;
280 
281 	/*
282 	 * We initialise the mutex without an interrupt priority to ease the
283 	 * implementation of some of the configuration space access routines.
284 	 * Drivers using the virtio framework MUST make a call to
285 	 * "virtio_init_complete()" prior to spawning other threads or enabling
286 	 * interrupt handlers, at which time we will destroy and reinitialise
287 	 * the mutex for use in our interrupt handlers.
288 	 */
289 	mutex_init(&vio->vio_mutex, NULL, MUTEX_DRIVER, NULL);
290 
291 	list_create(&vio->vio_queues, sizeof (virtio_queue_t),
292 	    offsetof(virtio_queue_t, viq_link));
293 
294 	/*
295 	 * Legacy virtio devices require a few common steps before we can
296 	 * negotiate device features.
297 	 */
298 	virtio_device_reset(vio);
299 	virtio_set_status(vio, VIRTIO_STATUS_ACKNOWLEDGE);
300 	virtio_set_status(vio, VIRTIO_STATUS_DRIVER);
301 
302 	/*
303 	 * Negotiate features with the device.  Record the original supported
304 	 * feature set for debugging purposes.
305 	 */
306 	vio->vio_features_device = virtio_get32(vio,
307 	    VIRTIO_LEGACY_FEATURES_DEVICE);
308 	if (allow_indirect) {
309 		driver_features |= VIRTIO_F_RING_INDIRECT_DESC;
310 	}
311 	vio->vio_features = vio->vio_features_device & driver_features;
312 	virtio_put32(vio, VIRTIO_LEGACY_FEATURES_DRIVER, vio->vio_features);
313 
314 	/*
315 	 * The device-specific configuration begins at an offset into the BAR
316 	 * that depends on whether we have enabled MSI-X interrupts or not.
317 	 * Start out with the offset for pre-MSI-X operation so that we can
318 	 * read device configuration space prior to configuring interrupts.
319 	 */
320 	vio->vio_config_offset = VIRTIO_LEGACY_CFG_OFFSET;
321 
322 	return (vio);
323 }
324 
325 /*
326  * Some virtio devices can change their device configuration state at any
327  * time. This function may be called by the driver during the initialisation
328  * phase - before calling virtio_init_complete() - in order to register a
329  * handler function which will be called when the device configuration space
330  * is updated.
331  */
332 void
333 virtio_register_cfgchange_handler(virtio_t *vio, ddi_intr_handler_t *func,
334     void *funcarg)
335 {
336 	VERIFY(!(vio->vio_initlevel & VIRTIO_INITLEVEL_INT_ADDED));
337 	VERIFY(!vio->vio_cfgchange_handler_added);
338 
339 	mutex_enter(&vio->vio_mutex);
340 	vio->vio_cfgchange_handler = func;
341 	vio->vio_cfgchange_handlerarg = funcarg;
342 	mutex_exit(&vio->vio_mutex);
343 }
344 
345 /*
346  * This function must be called by the driver once it has completed early setup
347  * calls.  The value of "allowed_interrupt_types" is a mask of interrupt types
348  * (DDI_INTR_TYPE_MSIX, etc) that we'll try to use when installing handlers, or
349  * the special value 0 to allow the system to use any available type.
350  */
351 int
352 virtio_init_complete(virtio_t *vio, int allowed_interrupt_types)
353 {
354 	VERIFY(!(vio->vio_initlevel & VIRTIO_INITLEVEL_PROVIDER));
355 	vio->vio_initlevel |= VIRTIO_INITLEVEL_PROVIDER;
356 
357 	if (!list_is_empty(&vio->vio_queues) ||
358 	    vio->vio_cfgchange_handler != NULL) {
359 		/*
360 		 * Set up interrupts for the queues that have been registered.
361 		 */
362 		if (virtio_interrupts_setup(vio, allowed_interrupt_types) !=
363 		    DDI_SUCCESS) {
364 			return (DDI_FAILURE);
365 		}
366 	}
367 
368 	/*
369 	 * We can allocate the mutex once we know the priority.
370 	 */
371 	mutex_destroy(&vio->vio_mutex);
372 	mutex_init(&vio->vio_mutex, NULL, MUTEX_DRIVER, virtio_intr_pri(vio));
373 	for (virtio_queue_t *viq = list_head(&vio->vio_queues); viq != NULL;
374 	    viq = list_next(&vio->vio_queues, viq)) {
375 		mutex_destroy(&viq->viq_mutex);
376 		mutex_init(&viq->viq_mutex, NULL, MUTEX_DRIVER,
377 		    virtio_intr_pri(vio));
378 	}
379 
380 	virtio_set_status(vio, VIRTIO_STATUS_DRIVER_OK);
381 
382 	return (DDI_SUCCESS);
383 }
384 
385 boolean_t
386 virtio_feature_present(virtio_t *vio, uint64_t feature_mask)
387 {
388 	return ((vio->vio_features & feature_mask) != 0);
389 }
390 
391 void *
392 virtio_intr_pri(virtio_t *vio)
393 {
394 	VERIFY(vio->vio_initlevel & VIRTIO_INITLEVEL_INT_ADDED);
395 
396 	return (DDI_INTR_PRI(vio->vio_interrupt_priority));
397 }
398 
399 /*
400  * Enable a bit in the device status register.  Each bit signals a level of
401  * guest readiness to the host.  Use the VIRTIO_CONFIG_DEVICE_STATUS_*
402  * constants for "status".  To zero the status field use virtio_device_reset().
403  */
404 static void
405 virtio_set_status(virtio_t *vio, uint8_t status)
406 {
407 	VERIFY3U(status, !=, 0);
408 
409 	mutex_enter(&vio->vio_mutex);
410 
411 	uint8_t old = virtio_get8(vio, VIRTIO_LEGACY_DEVICE_STATUS);
412 	virtio_put8(vio, VIRTIO_LEGACY_DEVICE_STATUS, status | old);
413 
414 	mutex_exit(&vio->vio_mutex);
415 }
416 
417 static void
418 virtio_device_reset_locked(virtio_t *vio)
419 {
420 	virtio_put8(vio, VIRTIO_LEGACY_DEVICE_STATUS, VIRTIO_STATUS_RESET);
421 }
422 
423 void
424 virtio_device_reset(virtio_t *vio)
425 {
426 	mutex_enter(&vio->vio_mutex);
427 	virtio_device_reset_locked(vio);
428 	mutex_exit(&vio->vio_mutex);
429 }
430 
431 /*
432  * Some queues are effectively long-polled; the driver submits a series of
433  * buffers and the device only returns them when there is data available.
434  * During detach, we need to coordinate the return of these buffers.  Calling
435  * "virtio_shutdown()" will reset the device, then allow the removal of all
436  * buffers that were in flight at the time of shutdown via
437  * "virtio_queue_evacuate()".
438  */
439 void
440 virtio_shutdown(virtio_t *vio)
441 {
442 	mutex_enter(&vio->vio_mutex);
443 	if (vio->vio_initlevel & VIRTIO_INITLEVEL_SHUTDOWN) {
444 		/*
445 		 * Shutdown has been performed already.
446 		 */
447 		mutex_exit(&vio->vio_mutex);
448 		return;
449 	}
450 
451 	/*
452 	 * First, mark all of the queues as shutdown.  This will prevent any
453 	 * further activity.
454 	 */
455 	for (virtio_queue_t *viq = list_head(&vio->vio_queues); viq != NULL;
456 	    viq = list_next(&vio->vio_queues, viq)) {
457 		mutex_enter(&viq->viq_mutex);
458 		viq->viq_shutdown = B_TRUE;
459 		mutex_exit(&viq->viq_mutex);
460 	}
461 
462 	/*
463 	 * Now, reset the device.  This removes any queue configuration on the
464 	 * device side.
465 	 */
466 	virtio_device_reset_locked(vio);
467 	vio->vio_initlevel |= VIRTIO_INITLEVEL_SHUTDOWN;
468 	mutex_exit(&vio->vio_mutex);
469 }
470 
471 /*
472  * Common implementation of quiesce(9E) for simple Virtio-based devices.
473  */
474 int
475 virtio_quiesce(virtio_t *vio)
476 {
477 	if (vio->vio_initlevel & VIRTIO_INITLEVEL_SHUTDOWN) {
478 		/*
479 		 * Device has already been reset.
480 		 */
481 		return (DDI_SUCCESS);
482 	}
483 
484 	/*
485 	 * When we reset the device, it should immediately stop using any DMA
486 	 * memory we've previously passed to it.  All queue configuration is
487 	 * discarded.  This is good enough for quiesce(9E).
488 	 */
489 	virtio_device_reset_locked(vio);
490 
491 	return (DDI_SUCCESS);
492 }
493 
494 /*
495  * DEVICE-SPECIFIC REGISTER ACCESS
496  *
497  * Note that these functions take the mutex to avoid racing with interrupt
498  * enable/disable, when the device-specific offset can potentially change.
499  */
500 
501 uint8_t
502 virtio_dev_get8(virtio_t *vio, uintptr_t offset)
503 {
504 	mutex_enter(&vio->vio_mutex);
505 	uint8_t r = virtio_get8(vio, vio->vio_config_offset + offset);
506 	mutex_exit(&vio->vio_mutex);
507 
508 	return (r);
509 }
510 
511 uint16_t
512 virtio_dev_get16(virtio_t *vio, uintptr_t offset)
513 {
514 	mutex_enter(&vio->vio_mutex);
515 	uint16_t r = virtio_get16(vio, vio->vio_config_offset + offset);
516 	mutex_exit(&vio->vio_mutex);
517 
518 	return (r);
519 }
520 
521 uint32_t
522 virtio_dev_get32(virtio_t *vio, uintptr_t offset)
523 {
524 	mutex_enter(&vio->vio_mutex);
525 	uint32_t r = virtio_get32(vio, vio->vio_config_offset + offset);
526 	mutex_exit(&vio->vio_mutex);
527 
528 	return (r);
529 }
530 
531 uint64_t
532 virtio_dev_get64(virtio_t *vio, uintptr_t offset)
533 {
534 	mutex_enter(&vio->vio_mutex);
535 	/*
536 	 * On at least some systems, a 64-bit read or write to this BAR is not
537 	 * possible.  For legacy devices, there is no generation number to use
538 	 * to determine if configuration may have changed half-way through a
539 	 * read.  We need to continue to read both halves of the value until we
540 	 * read the same value at least twice.
541 	 */
542 	uintptr_t o_lo = vio->vio_config_offset + offset;
543 	uintptr_t o_hi = o_lo + 4;
544 
545 	uint64_t val = virtio_get32(vio, o_lo) |
546 	    ((uint64_t)virtio_get32(vio, o_hi) << 32);
547 
548 	for (;;) {
549 		uint64_t tval = virtio_get32(vio, o_lo) |
550 		    ((uint64_t)virtio_get32(vio, o_hi) << 32);
551 
552 		if (tval == val) {
553 			break;
554 		}
555 
556 		val = tval;
557 	}
558 
559 	mutex_exit(&vio->vio_mutex);
560 	return (val);
561 }
562 
563 void
564 virtio_dev_put8(virtio_t *vio, uintptr_t offset, uint8_t value)
565 {
566 	mutex_enter(&vio->vio_mutex);
567 	virtio_put8(vio, vio->vio_config_offset + offset, value);
568 	mutex_exit(&vio->vio_mutex);
569 }
570 
571 void
572 virtio_dev_put16(virtio_t *vio, uintptr_t offset, uint16_t value)
573 {
574 	mutex_enter(&vio->vio_mutex);
575 	virtio_put16(vio, vio->vio_config_offset + offset, value);
576 	mutex_exit(&vio->vio_mutex);
577 }
578 
579 void
580 virtio_dev_put32(virtio_t *vio, uintptr_t offset, uint32_t value)
581 {
582 	mutex_enter(&vio->vio_mutex);
583 	virtio_put32(vio, vio->vio_config_offset + offset, value);
584 	mutex_exit(&vio->vio_mutex);
585 }
586 
587 /*
588  * VIRTQUEUE MANAGEMENT
589  */
590 
591 static int
592 virtio_inflight_compar(const void *lp, const void *rp)
593 {
594 	const virtio_chain_t *l = lp;
595 	const virtio_chain_t *r = rp;
596 
597 	if (l->vic_head < r->vic_head) {
598 		return (-1);
599 	} else if (l->vic_head > r->vic_head) {
600 		return (1);
601 	} else {
602 		return (0);
603 	}
604 }
605 
606 virtio_queue_t *
607 virtio_queue_alloc(virtio_t *vio, uint16_t qidx, const char *name,
608     ddi_intr_handler_t *func, void *funcarg, boolean_t force_direct,
609     uint_t max_segs)
610 {
611 	uint16_t qsz;
612 	char space_name[256];
613 
614 	if (max_segs < 1) {
615 		/*
616 		 * Every descriptor, direct or indirect, needs to refer to at
617 		 * least one buffer.
618 		 */
619 		dev_err(vio->vio_dip, CE_WARN, "queue \"%s\" (%u) "
620 		    "segment count must be at least 1", name, (uint_t)qidx);
621 		return (NULL);
622 	}
623 
624 	mutex_enter(&vio->vio_mutex);
625 
626 	if (vio->vio_initlevel & VIRTIO_INITLEVEL_PROVIDER) {
627 		/*
628 		 * Cannot configure any more queues once initial setup is
629 		 * complete and interrupts have been allocated.
630 		 */
631 		dev_err(vio->vio_dip, CE_WARN, "queue \"%s\" (%u) "
632 		    "alloc after init complete", name, (uint_t)qidx);
633 		mutex_exit(&vio->vio_mutex);
634 		return (NULL);
635 	}
636 
637 	/*
638 	 * There is no way to negotiate a different queue size for legacy
639 	 * devices.  We must read and use the native queue size of the device.
640 	 */
641 	virtio_put16(vio, VIRTIO_LEGACY_QUEUE_SELECT, qidx);
642 	if ((qsz = virtio_get16(vio, VIRTIO_LEGACY_QUEUE_SIZE)) == 0) {
643 		/*
644 		 * A size of zero means the device does not have a queue with
645 		 * this index.
646 		 */
647 		dev_err(vio->vio_dip, CE_WARN, "queue \"%s\" (%u) "
648 		    "does not exist on device", name, (uint_t)qidx);
649 		mutex_exit(&vio->vio_mutex);
650 		return (NULL);
651 	}
652 
653 	mutex_exit(&vio->vio_mutex);
654 
655 	virtio_queue_t *viq = kmem_zalloc(sizeof (*viq), KM_SLEEP);
656 	viq->viq_virtio = vio;
657 	viq->viq_name = name;
658 	viq->viq_index = qidx;
659 	viq->viq_size = qsz;
660 	viq->viq_func = func;
661 	viq->viq_funcarg = funcarg;
662 	viq->viq_max_segs = max_segs;
663 	avl_create(&viq->viq_inflight, virtio_inflight_compar,
664 	    sizeof (virtio_chain_t), offsetof(virtio_chain_t, vic_node));
665 
666 	/*
667 	 * Allocate the mutex without an interrupt priority for now, as we do
668 	 * with "vio_mutex".  We'll reinitialise it in
669 	 * "virtio_init_complete()".
670 	 */
671 	mutex_init(&viq->viq_mutex, NULL, MUTEX_DRIVER, NULL);
672 
673 	if (virtio_feature_present(vio, VIRTIO_F_RING_INDIRECT_DESC) &&
674 	    !force_direct) {
675 		/*
676 		 * If we were able to negotiate the indirect descriptor
677 		 * feature, and the caller has not explicitly forced the use of
678 		 * direct descriptors, we'll allocate indirect descriptor lists
679 		 * for each chain.
680 		 */
681 		viq->viq_indirect = B_TRUE;
682 	}
683 
684 	/*
685 	 * Track descriptor usage in an identifier space.
686 	 */
687 	(void) snprintf(space_name, sizeof (space_name), "%s%d_vq_%s",
688 	    ddi_get_name(vio->vio_dip), ddi_get_instance(vio->vio_dip), name);
689 	if ((viq->viq_descmap = id_space_create(space_name, 0, qsz)) == NULL) {
690 		dev_err(vio->vio_dip, CE_WARN, "could not allocate descriptor "
691 		    "ID space");
692 		virtio_queue_free(viq);
693 		return (NULL);
694 	}
695 
696 	/*
697 	 * For legacy devices, memory for the queue has a strict layout
698 	 * determined by the queue size.
699 	 */
700 	size_t sz_descs = sizeof (virtio_vq_desc_t) * qsz;
701 	size_t sz_driver = P2ROUNDUP_TYPED(sz_descs +
702 	    sizeof (virtio_vq_driver_t) +
703 	    sizeof (uint16_t) * qsz,
704 	    VIRTIO_PAGE_SIZE, size_t);
705 	size_t sz_device = P2ROUNDUP_TYPED(sizeof (virtio_vq_device_t) +
706 	    sizeof (virtio_vq_elem_t) * qsz,
707 	    VIRTIO_PAGE_SIZE, size_t);
708 
709 	if (virtio_dma_init(vio, &viq->viq_dma, sz_driver + sz_device,
710 	    &virtio_dma_attr_queue, DDI_DMA_RDWR | DDI_DMA_CONSISTENT,
711 	    KM_SLEEP) != DDI_SUCCESS) {
712 		dev_err(vio->vio_dip, CE_WARN, "could not allocate queue "
713 		    "DMA memory");
714 		virtio_queue_free(viq);
715 		return (NULL);
716 	}
717 
718 	/*
719 	 * NOTE: The viq_dma_* members below are used by
720 	 * VIRTQ_DMA_SYNC_FORDEV() and VIRTQ_DMA_SYNC_FORKERNEL() to calculate
721 	 * offsets into the DMA allocation for partial synchronisation.  If the
722 	 * ordering of, or relationship between, these pointers changes, the
723 	 * macros must be kept in sync.
724 	 */
725 	viq->viq_dma_descs = virtio_dma_va(&viq->viq_dma, 0);
726 	viq->viq_dma_driver = virtio_dma_va(&viq->viq_dma, sz_descs);
727 	viq->viq_dma_device = virtio_dma_va(&viq->viq_dma, sz_driver);
728 
729 	/*
730 	 * Install in the per-device list of queues.
731 	 */
732 	mutex_enter(&vio->vio_mutex);
733 	for (virtio_queue_t *chkvq = list_head(&vio->vio_queues); chkvq != NULL;
734 	    chkvq = list_next(&vio->vio_queues, chkvq)) {
735 		if (chkvq->viq_index == qidx) {
736 			dev_err(vio->vio_dip, CE_WARN, "attempt to register "
737 			    "queue \"%s\" with same index (%d) as queue \"%s\"",
738 			    name, qidx, chkvq->viq_name);
739 			mutex_exit(&vio->vio_mutex);
740 			virtio_queue_free(viq);
741 			return (NULL);
742 		}
743 	}
744 	list_insert_tail(&vio->vio_queues, viq);
745 
746 	/*
747 	 * Ensure the zeroing of the queue memory is visible to the host before
748 	 * we inform the device of the queue address.
749 	 */
750 	membar_producer();
751 	VIRTQ_DMA_SYNC_FORDEV(viq);
752 
753 	virtio_put16(vio, VIRTIO_LEGACY_QUEUE_SELECT, qidx);
754 	virtio_put32(vio, VIRTIO_LEGACY_QUEUE_ADDRESS,
755 	    virtio_dma_cookie_pa(&viq->viq_dma, 0) >> VIRTIO_PAGE_SHIFT);
756 
757 	mutex_exit(&vio->vio_mutex);
758 	return (viq);
759 }
760 
761 static void
762 virtio_queue_free(virtio_queue_t *viq)
763 {
764 	virtio_t *vio = viq->viq_virtio;
765 
766 	/*
767 	 * We are going to destroy the queue mutex.  Make sure we've already
768 	 * removed the interrupt handlers.
769 	 */
770 	VERIFY(!(vio->vio_initlevel & VIRTIO_INITLEVEL_INT_ADDED));
771 
772 	mutex_enter(&viq->viq_mutex);
773 
774 	/*
775 	 * If the device has not already been reset as part of a shutdown,
776 	 * detach the queue from the device now.
777 	 */
778 	if (!viq->viq_shutdown) {
779 		virtio_put16(vio, VIRTIO_LEGACY_QUEUE_SELECT, viq->viq_index);
780 		virtio_put32(vio, VIRTIO_LEGACY_QUEUE_ADDRESS, 0);
781 	}
782 
783 	virtio_dma_fini(&viq->viq_dma);
784 
785 	VERIFY(avl_is_empty(&viq->viq_inflight));
786 	avl_destroy(&viq->viq_inflight);
787 	if (viq->viq_descmap != NULL) {
788 		id_space_destroy(viq->viq_descmap);
789 	}
790 
791 	mutex_exit(&viq->viq_mutex);
792 	mutex_destroy(&viq->viq_mutex);
793 
794 	kmem_free(viq, sizeof (*viq));
795 }
796 
797 void
798 virtio_queue_no_interrupt(virtio_queue_t *viq, boolean_t stop_interrupts)
799 {
800 	mutex_enter(&viq->viq_mutex);
801 
802 	if (stop_interrupts) {
803 		viq->viq_dma_driver->vqdr_flags |= VIRTQ_AVAIL_F_NO_INTERRUPT;
804 	} else {
805 		viq->viq_dma_driver->vqdr_flags &= ~VIRTQ_AVAIL_F_NO_INTERRUPT;
806 	}
807 	VIRTQ_DMA_SYNC_FORDEV(viq);
808 
809 	mutex_exit(&viq->viq_mutex);
810 }
811 
812 static virtio_chain_t *
813 virtio_queue_complete(virtio_queue_t *viq, uint_t index)
814 {
815 	VERIFY(MUTEX_HELD(&viq->viq_mutex));
816 
817 	virtio_chain_t *vic;
818 
819 	virtio_chain_t search;
820 	bzero(&search, sizeof (search));
821 	search.vic_head = index;
822 
823 	if ((vic = avl_find(&viq->viq_inflight, &search, NULL)) == NULL) {
824 		return (NULL);
825 	}
826 	avl_remove(&viq->viq_inflight, vic);
827 
828 	return (vic);
829 }
830 
831 uint_t
832 virtio_queue_size(virtio_queue_t *viq)
833 {
834 	return (viq->viq_size);
835 }
836 
837 uint_t
838 virtio_queue_nactive(virtio_queue_t *viq)
839 {
840 	mutex_enter(&viq->viq_mutex);
841 	uint_t r = avl_numnodes(&viq->viq_inflight);
842 	mutex_exit(&viq->viq_mutex);
843 
844 	return (r);
845 }
846 
847 virtio_chain_t *
848 virtio_queue_poll(virtio_queue_t *viq)
849 {
850 	mutex_enter(&viq->viq_mutex);
851 	if (viq->viq_shutdown) {
852 		/*
853 		 * The device has been reset by virtio_shutdown(), and queue
854 		 * processing has been halted.  Any previously submitted chains
855 		 * will be evacuated using virtio_queue_evacuate().
856 		 */
857 		mutex_exit(&viq->viq_mutex);
858 		return (NULL);
859 	}
860 
861 	VIRTQ_DMA_SYNC_FORKERNEL(viq);
862 	if (viq->viq_device_index == viq->viq_dma_device->vqde_index) {
863 		/*
864 		 * If the device index has not changed since the last poll,
865 		 * there are no new chains to process.
866 		 */
867 		mutex_exit(&viq->viq_mutex);
868 		return (NULL);
869 	}
870 
871 	/*
872 	 * We need to ensure that all reads from the descriptor (vqde_ring[])
873 	 * and any referenced memory by the descriptor occur after we have read
874 	 * the descriptor index value above (vqde_index).
875 	 */
876 	membar_consumer();
877 
878 	uint16_t index = (viq->viq_device_index++) % viq->viq_size;
879 	uint16_t start = viq->viq_dma_device->vqde_ring[index].vqe_start;
880 	uint32_t len = viq->viq_dma_device->vqde_ring[index].vqe_len;
881 
882 	virtio_chain_t *vic;
883 	if ((vic = virtio_queue_complete(viq, start)) == NULL) {
884 		/*
885 		 * We could not locate a chain for this descriptor index, which
886 		 * suggests that something has gone horribly wrong.
887 		 */
888 		dev_err(viq->viq_virtio->vio_dip, CE_PANIC,
889 		    "queue \"%s\" ring entry %u (descriptor %u) has no chain",
890 		    viq->viq_name, (uint16_t)index, (uint16_t)start);
891 	}
892 
893 	vic->vic_received_length = len;
894 
895 	mutex_exit(&viq->viq_mutex);
896 
897 	return (vic);
898 }
899 
900 /*
901  * After a call to "virtio_shutdown()", the driver must retrieve any previously
902  * submitted chains and free any associated resources.
903  */
904 virtio_chain_t *
905 virtio_queue_evacuate(virtio_queue_t *viq)
906 {
907 	virtio_t *vio = viq->viq_virtio;
908 
909 	mutex_enter(&vio->vio_mutex);
910 	if (!(vio->vio_initlevel & VIRTIO_INITLEVEL_SHUTDOWN)) {
911 		dev_err(vio->vio_dip, CE_PANIC,
912 		    "virtio_queue_evacuate() without virtio_shutdown()");
913 	}
914 	mutex_exit(&vio->vio_mutex);
915 
916 	mutex_enter(&viq->viq_mutex);
917 	VERIFY(viq->viq_shutdown);
918 
919 	virtio_chain_t *vic = avl_first(&viq->viq_inflight);
920 	if (vic != NULL) {
921 		avl_remove(&viq->viq_inflight, vic);
922 	}
923 
924 	mutex_exit(&viq->viq_mutex);
925 
926 	return (vic);
927 }
928 
929 /*
930  * VIRTQUEUE DESCRIPTOR CHAIN MANAGEMENT
931  */
932 
933 /*
934  * When the device returns a descriptor chain to the driver, it may provide the
935  * length in bytes of data written into the chain.  Client drivers should use
936  * this value with care; the specification suggests some device implementations
937  * have not always provided a useful or correct value.
938  */
939 size_t
940 virtio_chain_received_length(virtio_chain_t *vic)
941 {
942 	return (vic->vic_received_length);
943 }
944 
945 /*
946  * Allocate a descriptor chain for use with this queue.  The "kmflags" value
947  * may be KM_SLEEP or KM_NOSLEEP as per kmem_alloc(9F).
948  */
949 virtio_chain_t *
950 virtio_chain_alloc(virtio_queue_t *viq, int kmflags)
951 {
952 	virtio_t *vio = viq->viq_virtio;
953 	virtio_chain_t *vic;
954 	uint_t cap;
955 
956 	/*
957 	 * Direct descriptors are known by their index in the descriptor table
958 	 * for the queue.  We use the variable-length array member at the end
959 	 * of the chain tracking object to hold the list of direct descriptors
960 	 * assigned to this chain.
961 	 */
962 	if (viq->viq_indirect) {
963 		/*
964 		 * When using indirect descriptors we still need one direct
965 		 * descriptor entry to hold the physical address and length of
966 		 * the indirect descriptor table.
967 		 */
968 		cap = 1;
969 	} else {
970 		/*
971 		 * For direct descriptors we need to be able to track a
972 		 * descriptor for each possible segment in a single chain.
973 		 */
974 		cap = viq->viq_max_segs;
975 	}
976 
977 	size_t vicsz = sizeof (*vic) + sizeof (uint16_t) * cap;
978 	if ((vic = kmem_zalloc(vicsz, kmflags)) == NULL) {
979 		return (NULL);
980 	}
981 	vic->vic_vq = viq;
982 	vic->vic_direct_capacity = cap;
983 
984 	if (viq->viq_indirect) {
985 		/*
986 		 * Allocate an indirect descriptor list with the appropriate
987 		 * number of entries.
988 		 */
989 		if (virtio_dma_init(vio, &vic->vic_indirect_dma,
990 		    sizeof (virtio_vq_desc_t) * viq->viq_max_segs,
991 		    &virtio_dma_attr_indirect,
992 		    DDI_DMA_CONSISTENT | DDI_DMA_WRITE,
993 		    kmflags) != DDI_SUCCESS) {
994 			goto fail;
995 		}
996 
997 		/*
998 		 * Allocate a single descriptor to hold the indirect list.
999 		 * Leave the length as zero for now; it will be set to include
1000 		 * any occupied entries at push time.
1001 		 */
1002 		mutex_enter(&viq->viq_mutex);
1003 		if (virtio_chain_append_impl(vic,
1004 		    virtio_dma_cookie_pa(&vic->vic_indirect_dma, 0), 0,
1005 		    VIRTQ_DESC_F_INDIRECT) != DDI_SUCCESS) {
1006 			mutex_exit(&viq->viq_mutex);
1007 			goto fail;
1008 		}
1009 		mutex_exit(&viq->viq_mutex);
1010 		VERIFY3U(vic->vic_direct_used, ==, 1);
1011 
1012 		/*
1013 		 * Don't set the indirect capacity until after we've installed
1014 		 * the direct descriptor which points at the indirect list, or
1015 		 * virtio_chain_append_impl() will be confused.
1016 		 */
1017 		vic->vic_indirect_capacity = viq->viq_max_segs;
1018 	}
1019 
1020 	return (vic);
1021 
1022 fail:
1023 	virtio_dma_fini(&vic->vic_indirect_dma);
1024 	kmem_free(vic, vicsz);
1025 	return (NULL);
1026 }
1027 
1028 void *
1029 virtio_chain_data(virtio_chain_t *vic)
1030 {
1031 	return (vic->vic_data);
1032 }
1033 
1034 void
1035 virtio_chain_data_set(virtio_chain_t *vic, void *data)
1036 {
1037 	vic->vic_data = data;
1038 }
1039 
1040 void
1041 virtio_chain_clear(virtio_chain_t *vic)
1042 {
1043 	if (vic->vic_indirect_capacity != 0) {
1044 		/*
1045 		 * There should only be one direct descriptor, which points at
1046 		 * our indirect descriptor list.  We don't want to clear it
1047 		 * here.
1048 		 */
1049 		VERIFY3U(vic->vic_direct_capacity, ==, 1);
1050 
1051 		if (vic->vic_indirect_used > 0) {
1052 			/*
1053 			 * Clear out the indirect descriptor table.
1054 			 */
1055 			vic->vic_indirect_used = 0;
1056 			bzero(virtio_dma_va(&vic->vic_indirect_dma, 0),
1057 			    virtio_dma_size(&vic->vic_indirect_dma));
1058 		}
1059 
1060 	} else if (vic->vic_direct_capacity > 0) {
1061 		/*
1062 		 * Release any descriptors that were assigned to us previously.
1063 		 */
1064 		for (uint_t i = 0; i < vic->vic_direct_used; i++) {
1065 			id_free(vic->vic_vq->viq_descmap, vic->vic_direct[i]);
1066 			vic->vic_direct[i] = 0;
1067 		}
1068 		vic->vic_direct_used = 0;
1069 	}
1070 }
1071 
1072 void
1073 virtio_chain_free(virtio_chain_t *vic)
1074 {
1075 	/*
1076 	 * First ensure that we have released any descriptors used by this
1077 	 * chain.
1078 	 */
1079 	virtio_chain_clear(vic);
1080 
1081 	if (vic->vic_indirect_capacity > 0) {
1082 		/*
1083 		 * Release the direct descriptor that points to our indirect
1084 		 * descriptor list.
1085 		 */
1086 		VERIFY3U(vic->vic_direct_capacity, ==, 1);
1087 		id_free(vic->vic_vq->viq_descmap, vic->vic_direct[0]);
1088 
1089 		virtio_dma_fini(&vic->vic_indirect_dma);
1090 	}
1091 
1092 	size_t vicsz = sizeof (*vic) +
1093 	    vic->vic_direct_capacity * sizeof (uint16_t);
1094 
1095 	kmem_free(vic, vicsz);
1096 }
1097 
1098 static inline int
1099 virtio_queue_descmap_alloc(virtio_queue_t *viq, uint_t *indexp)
1100 {
1101 	id_t index;
1102 
1103 	if ((index = id_alloc_nosleep(viq->viq_descmap)) == -1) {
1104 		return (ENOMEM);
1105 	}
1106 
1107 	VERIFY3S(index, >=, 0);
1108 	VERIFY3S(index, <=, viq->viq_size);
1109 
1110 	*indexp = (uint_t)index;
1111 	return (0);
1112 }
1113 
1114 static int
1115 virtio_chain_append_impl(virtio_chain_t *vic, uint64_t pa, size_t len,
1116     uint16_t flags)
1117 {
1118 	virtio_queue_t *viq = vic->vic_vq;
1119 	virtio_vq_desc_t *vqd;
1120 	uint_t index;
1121 
1122 	/*
1123 	 * We're modifying the queue-wide descriptor list so make sure we have
1124 	 * the appropriate lock.
1125 	 */
1126 	VERIFY(MUTEX_HELD(&viq->viq_mutex));
1127 
1128 	if (vic->vic_indirect_capacity != 0) {
1129 		/*
1130 		 * Use indirect descriptors.
1131 		 */
1132 		if (vic->vic_indirect_used >= vic->vic_indirect_capacity) {
1133 			return (DDI_FAILURE);
1134 		}
1135 
1136 		vqd = virtio_dma_va(&vic->vic_indirect_dma, 0);
1137 
1138 		if ((index = vic->vic_indirect_used++) > 0) {
1139 			/*
1140 			 * Chain the current last indirect descriptor to the
1141 			 * new one.
1142 			 */
1143 			vqd[index - 1].vqd_flags |= VIRTQ_DESC_F_NEXT;
1144 			vqd[index - 1].vqd_next = index;
1145 		}
1146 
1147 	} else {
1148 		/*
1149 		 * Use direct descriptors.
1150 		 */
1151 		if (vic->vic_direct_used >= vic->vic_direct_capacity) {
1152 			return (DDI_FAILURE);
1153 		}
1154 
1155 		if (virtio_queue_descmap_alloc(viq, &index) != 0) {
1156 			return (DDI_FAILURE);
1157 		}
1158 
1159 		vqd = virtio_dma_va(&viq->viq_dma, 0);
1160 
1161 		if (vic->vic_direct_used > 0) {
1162 			/*
1163 			 * This is not the first entry.  Chain the current
1164 			 * descriptor to the next one.
1165 			 */
1166 			uint16_t p = vic->vic_direct[vic->vic_direct_used - 1];
1167 
1168 			vqd[p].vqd_flags |= VIRTQ_DESC_F_NEXT;
1169 			vqd[p].vqd_next = index;
1170 		}
1171 		vic->vic_direct[vic->vic_direct_used++] = index;
1172 	}
1173 
1174 	vqd[index].vqd_addr = pa;
1175 	vqd[index].vqd_len = len;
1176 	vqd[index].vqd_flags = flags;
1177 	vqd[index].vqd_next = 0;
1178 
1179 	return (DDI_SUCCESS);
1180 }
1181 
1182 int
1183 virtio_chain_append(virtio_chain_t *vic, uint64_t pa, size_t len,
1184     virtio_direction_t dir)
1185 {
1186 	virtio_queue_t *viq = vic->vic_vq;
1187 	uint16_t flags = 0;
1188 
1189 	switch (dir) {
1190 	case VIRTIO_DIR_DEVICE_WRITES:
1191 		flags |= VIRTQ_DESC_F_WRITE;
1192 		break;
1193 
1194 	case VIRTIO_DIR_DEVICE_READS:
1195 		break;
1196 
1197 	default:
1198 		panic("unknown direction value %u", dir);
1199 	}
1200 
1201 	mutex_enter(&viq->viq_mutex);
1202 	int r = virtio_chain_append_impl(vic, pa, len, flags);
1203 	mutex_exit(&viq->viq_mutex);
1204 
1205 	return (r);
1206 }
1207 
1208 static void
1209 virtio_queue_flush_locked(virtio_queue_t *viq)
1210 {
1211 	VERIFY(MUTEX_HELD(&viq->viq_mutex));
1212 
1213 	/*
1214 	 * Make sure any writes we have just made to the descriptors
1215 	 * (vqdr_ring[]) are visible to the device before we update the ring
1216 	 * pointer (vqdr_index).
1217 	 */
1218 	membar_producer();
1219 	viq->viq_dma_driver->vqdr_index = viq->viq_driver_index;
1220 	VIRTQ_DMA_SYNC_FORDEV(viq);
1221 
1222 	/*
1223 	 * Determine whether the device expects us to notify it of new
1224 	 * descriptors.
1225 	 */
1226 	VIRTQ_DMA_SYNC_FORKERNEL(viq);
1227 	if (!(viq->viq_dma_device->vqde_flags & VIRTQ_USED_F_NO_NOTIFY)) {
1228 		virtio_put16(viq->viq_virtio, VIRTIO_LEGACY_QUEUE_NOTIFY,
1229 		    viq->viq_index);
1230 	}
1231 }
1232 
1233 void
1234 virtio_queue_flush(virtio_queue_t *viq)
1235 {
1236 	mutex_enter(&viq->viq_mutex);
1237 	virtio_queue_flush_locked(viq);
1238 	mutex_exit(&viq->viq_mutex);
1239 }
1240 
1241 void
1242 virtio_chain_submit(virtio_chain_t *vic, boolean_t flush)
1243 {
1244 	virtio_queue_t *viq = vic->vic_vq;
1245 
1246 	mutex_enter(&viq->viq_mutex);
1247 
1248 	if (vic->vic_indirect_capacity != 0) {
1249 		virtio_vq_desc_t *vqd = virtio_dma_va(&viq->viq_dma, 0);
1250 
1251 		VERIFY3U(vic->vic_direct_used, ==, 1);
1252 
1253 		/*
1254 		 * This is an indirect descriptor queue.  The length in bytes
1255 		 * of the descriptor must extend to cover the populated
1256 		 * indirect descriptor entries.
1257 		 */
1258 		vqd[vic->vic_direct[0]].vqd_len =
1259 		    sizeof (virtio_vq_desc_t) * vic->vic_indirect_used;
1260 
1261 		virtio_dma_sync(&vic->vic_indirect_dma, DDI_DMA_SYNC_FORDEV);
1262 	}
1263 
1264 	/*
1265 	 * Populate the next available slot in the driver-owned ring for this
1266 	 * chain.  The updated value of viq_driver_index is not yet visible to
1267 	 * the device until a subsequent queue flush.
1268 	 */
1269 	uint16_t index = (viq->viq_driver_index++) % viq->viq_size;
1270 	viq->viq_dma_driver->vqdr_ring[index] = vic->vic_direct[0];
1271 
1272 	vic->vic_head = vic->vic_direct[0];
1273 	avl_add(&viq->viq_inflight, vic);
1274 
1275 	if (flush) {
1276 		virtio_queue_flush_locked(vic->vic_vq);
1277 	}
1278 
1279 	mutex_exit(&viq->viq_mutex);
1280 }
1281 
1282 /*
1283  * INTERRUPTS MANAGEMENT
1284  */
1285 
1286 static const char *
1287 virtio_interrupt_type_name(int type)
1288 {
1289 	switch (type) {
1290 	case DDI_INTR_TYPE_MSIX:
1291 		return ("MSI-X");
1292 	case DDI_INTR_TYPE_MSI:
1293 		return ("MSI");
1294 	case DDI_INTR_TYPE_FIXED:
1295 		return ("fixed");
1296 	default:
1297 		return ("?");
1298 	}
1299 }
1300 
1301 static int
1302 virtio_interrupts_alloc(virtio_t *vio, int type, int nrequired)
1303 {
1304 	dev_info_t *dip = vio->vio_dip;
1305 	int nintrs = 0;
1306 	int navail = 0;
1307 
1308 	VERIFY(MUTEX_HELD(&vio->vio_mutex));
1309 	VERIFY(!(vio->vio_initlevel & VIRTIO_INITLEVEL_INT_ALLOC));
1310 
1311 	if (ddi_intr_get_nintrs(dip, type, &nintrs) != DDI_SUCCESS) {
1312 		dev_err(dip, CE_WARN, "could not count %s interrupts",
1313 		    virtio_interrupt_type_name(type));
1314 		return (DDI_FAILURE);
1315 	}
1316 	if (nintrs < 1) {
1317 		dev_err(dip, CE_WARN, "no %s interrupts supported",
1318 		    virtio_interrupt_type_name(type));
1319 		return (DDI_FAILURE);
1320 	}
1321 
1322 	if (ddi_intr_get_navail(dip, type, &navail) != DDI_SUCCESS) {
1323 		dev_err(dip, CE_WARN, "could not count available %s interrupts",
1324 		    virtio_interrupt_type_name(type));
1325 		return (DDI_FAILURE);
1326 	}
1327 	if (navail < nrequired) {
1328 		dev_err(dip, CE_WARN, "need %d %s interrupts, but only %d "
1329 		    "available", nrequired, virtio_interrupt_type_name(type),
1330 		    navail);
1331 		return (DDI_FAILURE);
1332 	}
1333 
1334 	VERIFY3P(vio->vio_interrupts, ==, NULL);
1335 	vio->vio_interrupts = kmem_zalloc(
1336 	    sizeof (ddi_intr_handle_t) * nrequired, KM_SLEEP);
1337 
1338 	int r;
1339 	if ((r = ddi_intr_alloc(dip, vio->vio_interrupts, type, 0, nrequired,
1340 	    &vio->vio_ninterrupts, DDI_INTR_ALLOC_STRICT)) != DDI_SUCCESS) {
1341 		dev_err(dip, CE_WARN, "%s interrupt allocation failure (%d)",
1342 		    virtio_interrupt_type_name(type), r);
1343 		kmem_free(vio->vio_interrupts,
1344 		    sizeof (ddi_intr_handle_t) * nrequired);
1345 		vio->vio_interrupts = NULL;
1346 		return (DDI_FAILURE);
1347 	}
1348 
1349 	vio->vio_initlevel |= VIRTIO_INITLEVEL_INT_ALLOC;
1350 	vio->vio_interrupt_type = type;
1351 	return (DDI_SUCCESS);
1352 }
1353 
1354 static uint_t
1355 virtio_shared_isr(caddr_t arg0, caddr_t arg1)
1356 {
1357 	virtio_t *vio = (virtio_t *)arg0;
1358 	uint_t r = DDI_INTR_UNCLAIMED;
1359 	uint8_t isr;
1360 
1361 	mutex_enter(&vio->vio_mutex);
1362 
1363 	/*
1364 	 * Check the ISR status to see if the interrupt applies to us.  Reading
1365 	 * this field resets it to zero.
1366 	 */
1367 	isr = virtio_get8(vio, VIRTIO_LEGACY_ISR_STATUS);
1368 
1369 	if ((isr & VIRTIO_ISR_CHECK_QUEUES) != 0) {
1370 		r = DDI_INTR_CLAIMED;
1371 
1372 		for (virtio_queue_t *viq = list_head(&vio->vio_queues);
1373 		    viq != NULL; viq = list_next(&vio->vio_queues, viq)) {
1374 			if (viq->viq_func != NULL) {
1375 				mutex_exit(&vio->vio_mutex);
1376 				(void) viq->viq_func(viq->viq_funcarg, arg0);
1377 				mutex_enter(&vio->vio_mutex);
1378 
1379 				if (vio->vio_initlevel &
1380 				    VIRTIO_INITLEVEL_SHUTDOWN) {
1381 					/*
1382 					 * The device was shut down while in a
1383 					 * queue handler routine.
1384 					 */
1385 					break;
1386 				}
1387 			}
1388 		}
1389 	}
1390 
1391 	mutex_exit(&vio->vio_mutex);
1392 
1393 	/*
1394 	 * vio_cfgchange_{handler,handlerarg} cannot change while interrupts
1395 	 * are configured so it is safe to access them outside of the lock.
1396 	 */
1397 
1398 	if ((isr & VIRTIO_ISR_CHECK_CONFIG) != 0) {
1399 		r = DDI_INTR_CLAIMED;
1400 		if (vio->vio_cfgchange_handler != NULL) {
1401 			(void) vio->vio_cfgchange_handler(
1402 			    (caddr_t)vio->vio_cfgchange_handlerarg,
1403 			    (caddr_t)vio);
1404 		}
1405 	}
1406 
1407 	return (r);
1408 }
1409 
1410 static int
1411 virtio_interrupts_setup(virtio_t *vio, int allow_types)
1412 {
1413 	dev_info_t *dip = vio->vio_dip;
1414 	int types;
1415 	int count = 0;
1416 
1417 	mutex_enter(&vio->vio_mutex);
1418 
1419 	/*
1420 	 * Determine the number of interrupts we'd like based on the number of
1421 	 * virtqueues.
1422 	 */
1423 	for (virtio_queue_t *viq = list_head(&vio->vio_queues); viq != NULL;
1424 	    viq = list_next(&vio->vio_queues, viq)) {
1425 		if (viq->viq_func != NULL) {
1426 			count++;
1427 		}
1428 	}
1429 
1430 	/*
1431 	 * If there is a configuration change handler, one extra interrupt
1432 	 * is needed for that.
1433 	 */
1434 	if (vio->vio_cfgchange_handler != NULL)
1435 		count++;
1436 
1437 	if (ddi_intr_get_supported_types(dip, &types) != DDI_SUCCESS) {
1438 		dev_err(dip, CE_WARN, "could not get supported interrupts");
1439 		mutex_exit(&vio->vio_mutex);
1440 		return (DDI_FAILURE);
1441 	}
1442 
1443 	if (allow_types != VIRTIO_ANY_INTR_TYPE) {
1444 		/*
1445 		 * Restrict the possible interrupt types at the request of the
1446 		 * driver.
1447 		 */
1448 		types &= allow_types;
1449 	}
1450 
1451 	/*
1452 	 * Try each potential interrupt type in descending order of preference.
1453 	 * Note that the specification does not appear to allow for the use of
1454 	 * classical MSI, so we are limited to either MSI-X or fixed
1455 	 * interrupts.
1456 	 */
1457 	if (types & DDI_INTR_TYPE_MSIX) {
1458 		if (virtio_interrupts_alloc(vio, DDI_INTR_TYPE_MSIX,
1459 		    count) == DDI_SUCCESS) {
1460 			goto add_handlers;
1461 		}
1462 	}
1463 	if (types & DDI_INTR_TYPE_FIXED) {
1464 		/*
1465 		 * If fixed interrupts are all that are available, we'll just
1466 		 * ask for one.
1467 		 */
1468 		if (virtio_interrupts_alloc(vio, DDI_INTR_TYPE_FIXED, 1) ==
1469 		    DDI_SUCCESS) {
1470 			goto add_handlers;
1471 		}
1472 	}
1473 
1474 	dev_err(dip, CE_WARN, "interrupt allocation failed");
1475 	mutex_exit(&vio->vio_mutex);
1476 	return (DDI_FAILURE);
1477 
1478 add_handlers:
1479 	/*
1480 	 * Ensure that we have not been given any high-level interrupts as our
1481 	 * interrupt handlers do not support them.
1482 	 */
1483 	for (int i = 0; i < vio->vio_ninterrupts; i++) {
1484 		uint_t ipri;
1485 
1486 		if (ddi_intr_get_pri(vio->vio_interrupts[i], &ipri) !=
1487 		    DDI_SUCCESS) {
1488 			dev_err(dip, CE_WARN, "could not determine interrupt "
1489 			    "priority");
1490 			goto fail;
1491 		}
1492 
1493 		if (ipri >= ddi_intr_get_hilevel_pri()) {
1494 			dev_err(dip, CE_WARN, "high level interrupts not "
1495 			    "supported");
1496 			goto fail;
1497 		}
1498 
1499 		/*
1500 		 * Record the highest priority we've been allocated to use for
1501 		 * mutex initialisation.
1502 		 */
1503 		if (i == 0 || ipri > vio->vio_interrupt_priority) {
1504 			vio->vio_interrupt_priority = ipri;
1505 		}
1506 	}
1507 
1508 	/*
1509 	 * Get the interrupt capabilities from the first handle to determine
1510 	 * whether we need to use ddi_intr_block_enable(9F).
1511 	 */
1512 	if (ddi_intr_get_cap(vio->vio_interrupts[0],
1513 	    &vio->vio_interrupt_cap) != DDI_SUCCESS) {
1514 		dev_err(dip, CE_WARN, "failed to get interrupt capabilities");
1515 		goto fail;
1516 	}
1517 
1518 	if (vio->vio_interrupt_type == DDI_INTR_TYPE_FIXED) {
1519 		VERIFY3S(vio->vio_ninterrupts, ==, 1);
1520 		/*
1521 		 * For fixed interrupts, we need to use our shared handler to
1522 		 * multiplex the per-queue handlers provided by the driver.
1523 		 */
1524 		if (ddi_intr_add_handler(vio->vio_interrupts[0],
1525 		    virtio_shared_isr, (caddr_t)vio, NULL) != DDI_SUCCESS) {
1526 			dev_err(dip, CE_WARN, "adding shared %s interrupt "
1527 			    "handler failed", virtio_interrupt_type_name(
1528 			    vio->vio_interrupt_type));
1529 			goto fail;
1530 		}
1531 
1532 		goto done;
1533 	}
1534 
1535 	VERIFY3S(vio->vio_ninterrupts, ==, count);
1536 
1537 	uint_t n = 0;
1538 
1539 	/* Bind the configuration vector interrupt */
1540 	if (vio->vio_cfgchange_handler != NULL) {
1541 		if (ddi_intr_add_handler(vio->vio_interrupts[n],
1542 		    vio->vio_cfgchange_handler,
1543 		    (caddr_t)vio->vio_cfgchange_handlerarg,
1544 		    (caddr_t)vio) != DDI_SUCCESS) {
1545 			dev_err(dip, CE_WARN,
1546 			    "adding configuration change interrupt failed");
1547 			goto fail;
1548 		}
1549 		vio->vio_cfgchange_handler_added = B_TRUE;
1550 		vio->vio_cfgchange_handler_index = n;
1551 		n++;
1552 	}
1553 
1554 	for (virtio_queue_t *viq = list_head(&vio->vio_queues); viq != NULL;
1555 	    viq = list_next(&vio->vio_queues, viq)) {
1556 		if (viq->viq_func == NULL) {
1557 			continue;
1558 		}
1559 
1560 		if (ddi_intr_add_handler(vio->vio_interrupts[n],
1561 		    viq->viq_func, (caddr_t)viq->viq_funcarg,
1562 		    (caddr_t)vio) != DDI_SUCCESS) {
1563 			dev_err(dip, CE_WARN, "adding interrupt %u (%s) failed",
1564 			    n, viq->viq_name);
1565 			goto fail;
1566 		}
1567 
1568 		viq->viq_handler_index = n;
1569 		viq->viq_handler_added = B_TRUE;
1570 		n++;
1571 	}
1572 
1573 done:
1574 	vio->vio_initlevel |= VIRTIO_INITLEVEL_INT_ADDED;
1575 	mutex_exit(&vio->vio_mutex);
1576 	return (DDI_SUCCESS);
1577 
1578 fail:
1579 	virtio_interrupts_teardown(vio);
1580 	mutex_exit(&vio->vio_mutex);
1581 	return (DDI_FAILURE);
1582 }
1583 
1584 static void
1585 virtio_interrupts_teardown(virtio_t *vio)
1586 {
1587 	VERIFY(MUTEX_HELD(&vio->vio_mutex));
1588 
1589 	virtio_interrupts_disable_locked(vio);
1590 
1591 	if (vio->vio_interrupt_type == DDI_INTR_TYPE_FIXED) {
1592 		/*
1593 		 * Remove the multiplexing interrupt handler.
1594 		 */
1595 		if (vio->vio_initlevel & VIRTIO_INITLEVEL_INT_ADDED) {
1596 			int r;
1597 
1598 			VERIFY3S(vio->vio_ninterrupts, ==, 1);
1599 
1600 			if ((r = ddi_intr_remove_handler(
1601 			    vio->vio_interrupts[0])) != DDI_SUCCESS) {
1602 				dev_err(vio->vio_dip, CE_WARN, "removing "
1603 				    "shared interrupt handler failed (%d)", r);
1604 			}
1605 		}
1606 	} else {
1607 		/*
1608 		 * Remove the configuration vector interrupt handler.
1609 		 */
1610 		if (vio->vio_cfgchange_handler_added) {
1611 			int r;
1612 
1613 			if ((r = ddi_intr_remove_handler(
1614 			    vio->vio_interrupts[0])) != DDI_SUCCESS) {
1615 				dev_err(vio->vio_dip, CE_WARN,
1616 				    "removing configuration change interrupt "
1617 				    "handler failed (%d)", r);
1618 			}
1619 			vio->vio_cfgchange_handler_added = B_FALSE;
1620 		}
1621 
1622 		for (virtio_queue_t *viq = list_head(&vio->vio_queues);
1623 		    viq != NULL; viq = list_next(&vio->vio_queues, viq)) {
1624 			int r;
1625 
1626 			if (!viq->viq_handler_added) {
1627 				continue;
1628 			}
1629 
1630 			if ((r = ddi_intr_remove_handler(
1631 			    vio->vio_interrupts[viq->viq_handler_index])) !=
1632 			    DDI_SUCCESS) {
1633 				dev_err(vio->vio_dip, CE_WARN, "removing "
1634 				    "interrupt handler (%s) failed (%d)",
1635 				    viq->viq_name, r);
1636 			}
1637 
1638 			viq->viq_handler_added = B_FALSE;
1639 		}
1640 	}
1641 	vio->vio_initlevel &= ~VIRTIO_INITLEVEL_INT_ADDED;
1642 
1643 	if (vio->vio_initlevel & VIRTIO_INITLEVEL_INT_ALLOC) {
1644 		for (int i = 0; i < vio->vio_ninterrupts; i++) {
1645 			int r;
1646 
1647 			if ((r = ddi_intr_free(vio->vio_interrupts[i])) !=
1648 			    DDI_SUCCESS) {
1649 				dev_err(vio->vio_dip, CE_WARN, "freeing "
1650 				    "interrupt %u failed (%d)", i, r);
1651 			}
1652 		}
1653 		kmem_free(vio->vio_interrupts,
1654 		    sizeof (ddi_intr_handle_t) * vio->vio_ninterrupts);
1655 		vio->vio_interrupts = NULL;
1656 		vio->vio_ninterrupts = 0;
1657 		vio->vio_interrupt_type = 0;
1658 		vio->vio_interrupt_cap = 0;
1659 		vio->vio_interrupt_priority = 0;
1660 
1661 		vio->vio_initlevel &= ~VIRTIO_INITLEVEL_INT_ALLOC;
1662 	}
1663 }
1664 
1665 static void
1666 virtio_interrupts_unwind(virtio_t *vio)
1667 {
1668 	VERIFY(MUTEX_HELD(&vio->vio_mutex));
1669 
1670 	if (vio->vio_interrupt_type == DDI_INTR_TYPE_MSIX) {
1671 		for (virtio_queue_t *viq = list_head(&vio->vio_queues);
1672 		    viq != NULL; viq = list_next(&vio->vio_queues, viq)) {
1673 			if (!viq->viq_handler_added) {
1674 				continue;
1675 			}
1676 
1677 			virtio_put16(vio, VIRTIO_LEGACY_QUEUE_SELECT,
1678 			    viq->viq_index);
1679 			virtio_put16(vio, VIRTIO_LEGACY_MSIX_QUEUE,
1680 			    VIRTIO_LEGACY_MSI_NO_VECTOR);
1681 		}
1682 
1683 		if (vio->vio_cfgchange_handler_added) {
1684 			virtio_put16(vio, VIRTIO_LEGACY_MSIX_CONFIG,
1685 			    VIRTIO_LEGACY_MSI_NO_VECTOR);
1686 		}
1687 	}
1688 
1689 	if (vio->vio_interrupt_cap & DDI_INTR_FLAG_BLOCK) {
1690 		(void) ddi_intr_block_disable(vio->vio_interrupts,
1691 		    vio->vio_ninterrupts);
1692 	} else {
1693 		for (int i = 0; i < vio->vio_ninterrupts; i++) {
1694 			(void) ddi_intr_disable(vio->vio_interrupts[i]);
1695 		}
1696 	}
1697 
1698 	/*
1699 	 * Disabling the interrupts makes the MSI-X fields disappear from the
1700 	 * BAR once more.
1701 	 */
1702 	vio->vio_config_offset = VIRTIO_LEGACY_CFG_OFFSET;
1703 }
1704 
1705 int
1706 virtio_interrupts_enable(virtio_t *vio)
1707 {
1708 	mutex_enter(&vio->vio_mutex);
1709 	if (vio->vio_initlevel & VIRTIO_INITLEVEL_INT_ENABLED) {
1710 		mutex_exit(&vio->vio_mutex);
1711 		return (DDI_SUCCESS);
1712 	}
1713 
1714 	int r = DDI_SUCCESS;
1715 	if (vio->vio_interrupt_cap & DDI_INTR_FLAG_BLOCK) {
1716 		r = ddi_intr_block_enable(vio->vio_interrupts,
1717 		    vio->vio_ninterrupts);
1718 	} else {
1719 		for (int i = 0; i < vio->vio_ninterrupts; i++) {
1720 			if ((r = ddi_intr_enable(vio->vio_interrupts[i])) !=
1721 			    DDI_SUCCESS) {
1722 				/*
1723 				 * Disable the interrupts we have enabled so
1724 				 * far.
1725 				 */
1726 				for (i--; i >= 0; i--) {
1727 					(void) ddi_intr_disable(
1728 					    vio->vio_interrupts[i]);
1729 				}
1730 				break;
1731 			}
1732 		}
1733 	}
1734 
1735 	if (r != DDI_SUCCESS) {
1736 		mutex_exit(&vio->vio_mutex);
1737 		return (r);
1738 	}
1739 
1740 	if (vio->vio_interrupt_type == DDI_INTR_TYPE_MSIX) {
1741 		/*
1742 		 * When asked to enable the interrupts, the system enables
1743 		 * MSI-X in the PCI configuration for the device.  While
1744 		 * enabled, the extra MSI-X configuration table fields appear
1745 		 * between the general and the device-specific regions of the
1746 		 * BAR.
1747 		 */
1748 		vio->vio_config_offset = VIRTIO_LEGACY_CFG_OFFSET_MSIX;
1749 
1750 		for (virtio_queue_t *viq = list_head(&vio->vio_queues);
1751 		    viq != NULL; viq = list_next(&vio->vio_queues, viq)) {
1752 			if (!viq->viq_handler_added) {
1753 				continue;
1754 			}
1755 
1756 			uint16_t qi = viq->viq_index;
1757 			uint16_t msi = viq->viq_handler_index;
1758 
1759 			/*
1760 			 * Route interrupts for this queue to the assigned
1761 			 * MSI-X vector number.
1762 			 */
1763 			virtio_put16(vio, VIRTIO_LEGACY_QUEUE_SELECT, qi);
1764 			virtio_put16(vio, VIRTIO_LEGACY_MSIX_QUEUE, msi);
1765 
1766 			/*
1767 			 * The device may not actually accept the vector number
1768 			 * we're attempting to program.  We need to confirm
1769 			 * that configuration was successful by re-reading the
1770 			 * configuration we just wrote.
1771 			 */
1772 			if (virtio_get16(vio, VIRTIO_LEGACY_MSIX_QUEUE) !=
1773 			    msi) {
1774 				dev_err(vio->vio_dip, CE_WARN,
1775 				    "failed to configure MSI-X vector %u for "
1776 				    "queue \"%s\" (#%u)", (uint_t)msi,
1777 				    viq->viq_name, (uint_t)qi);
1778 
1779 				virtio_interrupts_unwind(vio);
1780 				mutex_exit(&vio->vio_mutex);
1781 				return (DDI_FAILURE);
1782 			}
1783 		}
1784 
1785 		if (vio->vio_cfgchange_handler_added) {
1786 			virtio_put16(vio, VIRTIO_LEGACY_MSIX_CONFIG,
1787 			    vio->vio_cfgchange_handler_index);
1788 
1789 			/* Verify the value was accepted. */
1790 			if (virtio_get16(vio, VIRTIO_LEGACY_MSIX_CONFIG) !=
1791 			    vio->vio_cfgchange_handler_index) {
1792 				dev_err(vio->vio_dip, CE_WARN,
1793 				    "failed to configure MSI-X vector for "
1794 				    "configuration");
1795 
1796 				virtio_interrupts_unwind(vio);
1797 				mutex_exit(&vio->vio_mutex);
1798 				return (DDI_FAILURE);
1799 			}
1800 		}
1801 	}
1802 
1803 	vio->vio_initlevel |= VIRTIO_INITLEVEL_INT_ENABLED;
1804 
1805 	mutex_exit(&vio->vio_mutex);
1806 	return (DDI_SUCCESS);
1807 }
1808 
1809 static void
1810 virtio_interrupts_disable_locked(virtio_t *vio)
1811 {
1812 	VERIFY(MUTEX_HELD(&vio->vio_mutex));
1813 
1814 	if (!(vio->vio_initlevel & VIRTIO_INITLEVEL_INT_ENABLED)) {
1815 		return;
1816 	}
1817 
1818 	virtio_interrupts_unwind(vio);
1819 
1820 	vio->vio_initlevel &= ~VIRTIO_INITLEVEL_INT_ENABLED;
1821 }
1822 
1823 void
1824 virtio_interrupts_disable(virtio_t *vio)
1825 {
1826 	mutex_enter(&vio->vio_mutex);
1827 	virtio_interrupts_disable_locked(vio);
1828 	mutex_exit(&vio->vio_mutex);
1829 }
1830