xref: /illumos-gate/usr/src/uts/intel/io/viona/viona_ring.c (revision d48be21240dfd051b689384ce2b23479d757f2d8)
1 /*
2  * Copyright (c) 2013  Chris Torek <torek @ torek net>
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  *
14  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24  * SUCH DAMAGE.
25  */
26 /*
27  * This file and its contents are supplied under the terms of the
28  * Common Development and Distribution License ("CDDL"), version 1.0.
29  * You may only use this file in accordance with the terms of version
30  * 1.0 of the CDDL.
31  *
32  * A full copy of the text of the CDDL should have accompanied this
33  * source.  A copy of the CDDL is also available via the Internet at
34  * http://www.illumos.org/license/CDDL.
35  *
36  * Copyright 2015 Pluribus Networks Inc.
37  * Copyright 2019 Joyent, Inc.
38  * Copyright 2022 Oxide Computer Company
39  */
40 
41 
42 #include <sys/disp.h>
43 
44 #include "viona_impl.h"
45 
/* Largest queue size (in entries) accepted when initializing a ring */
#define	VRING_MAX_LEN		32768

/*
 * Layout and sizing as defined in the spec for a legacy-style virtqueue: the
 * descriptor table is followed directly by the available ring, and the used
 * ring begins at the next LEGACY_VQ_ALIGN boundary after that.
 */

#define	LEGACY_VQ_ALIGN		PAGESIZE

/* Descriptor table holds qsz descriptors */
#define	LEGACY_DESC_SZ(qsz)	((qsz) * sizeof (struct virtio_desc))

/*
 * Available ring consists of flags (uint16_t), avail_idx (uint16_t), qsz
 * avail descriptors (uint16_t each), and (optional) used_event (uint16_t).
 */
#define	LEGACY_AVAIL_SZ(qsz)	(((qsz) + 3) * sizeof (uint16_t))

/*
 * Used ring consists of flags (uint16_t), used_idx (uint16_t), qsz used
 * descriptors (two uint32_t each), and (optional) avail_event (uint16_t).
 */
#define	LEGACY_USED_SZ(qsz)	\
	(3 * sizeof (uint16_t) + (qsz) * sizeof (struct virtio_used))

/* Byte offsets (from the start of the virtqueue) into the available ring */
#define	LEGACY_AVAIL_FLAGS_OFF(qsz)	LEGACY_DESC_SZ(qsz)
#define	LEGACY_AVAIL_IDX_OFF(qsz)	\
	(LEGACY_AVAIL_FLAGS_OFF(qsz) + sizeof (uint16_t))
#define	LEGACY_AVAIL_ENT_OFF(qsz, idx)	\
	(LEGACY_AVAIL_FLAGS_OFF(qsz) + (2 + (idx)) * sizeof (uint16_t))

/* Byte offsets (from the start of the virtqueue) into the used ring */
#define	LEGACY_USED_FLAGS_OFF(qsz)	\
	P2ROUNDUP(LEGACY_DESC_SZ(qsz) + LEGACY_AVAIL_SZ(qsz), LEGACY_VQ_ALIGN)
#define	LEGACY_USED_IDX_OFF(qsz)	\
	(LEGACY_USED_FLAGS_OFF(qsz) + sizeof (uint16_t))
#define	LEGACY_USED_ENT_OFF(qsz, idx)	\
	(LEGACY_USED_FLAGS_OFF(qsz) + 2 * sizeof (uint16_t) + \
	(idx) * sizeof (struct virtio_used))

/* Total footprint of the virtqueue, in bytes and in pages */
#define	LEGACY_VQ_SIZE(qsz)	\
	(LEGACY_USED_FLAGS_OFF(qsz) + \
	P2ROUNDUP(LEGACY_USED_SZ(qsz), LEGACY_VQ_ALIGN))
#define	LEGACY_VQ_PAGES(qsz)	(LEGACY_VQ_SIZE(qsz) / PAGESIZE)
83 
84 struct vq_held_region {
85 	struct iovec	*vhr_iov;
86 	vmm_page_t	*vhr_head;
87 	vmm_page_t	*vhr_tail;
88 	/* Length of iovec array supplied in `vhr_iov` */
89 	uint_t		vhr_niov;
90 	/*
91 	 * Index into vhr_iov, indicating the next "free" entry (following the
92 	 * last entry which has valid contents).
93 	 */
94 	uint_t		vhr_idx;
95 };
96 typedef struct vq_held_region vq_held_region_t;
97 
98 static bool viona_ring_map(viona_vring_t *, bool);
99 static void viona_ring_unmap(viona_vring_t *);
100 static kthread_t *viona_create_worker(viona_vring_t *);
101 
102 static vmm_page_t *
103 vq_page_hold(viona_vring_t *ring, uint64_t gpa, bool writable)
104 {
105 	ASSERT3P(ring->vr_lease, !=, NULL);
106 
107 	int prot = PROT_READ;
108 	if (writable) {
109 		prot |= PROT_WRITE;
110 	}
111 
112 	return (vmm_drv_page_hold(ring->vr_lease, gpa, prot));
113 }
114 
115 /*
116  * Establish a hold on the page(s) which back the region of guest memory covered
117  * by [gpa, gpa + len).  The host-kernel-virtual pointers to those pages are
118  * stored in the iovec array supplied in `region`, along with the chain of
119  * vmm_page_t entries representing the held pages.  Since guest memory
120  * carries no guarantees of being physically contiguous (on the host), it is
121  * assumed that an iovec entry will be required for each PAGESIZE section
122  * covered by the specified `gpa` and `len` range.  For each iovec entry
123  * successfully populated by holding a page, `vhr_idx` will be incremented so it
124  * references the next available iovec entry (or `vhr_niov`, if the iovec array
125  * is full).  The responsibility for releasing the `vmm_page_t` chain (stored in
126  * `vhr_head` and `vhr_tail`) resides with the caller, regardless of the result.
127  */
128 static int
129 vq_region_hold(viona_vring_t *ring, uint64_t gpa, uint32_t len,
130     bool writable, vq_held_region_t *region)
131 {
132 	const uint32_t front_offset = gpa & PAGEOFFSET;
133 	const uint32_t front_len = MIN(len, PAGESIZE - front_offset);
134 	uint_t pages = 1;
135 	vmm_page_t *vmp;
136 	caddr_t buf;
137 
138 	ASSERT3U(region->vhr_idx, <, region->vhr_niov);
139 
140 	if (front_len < len) {
141 		pages += P2ROUNDUP((uint64_t)(len - front_len),
142 		    PAGESIZE) / PAGESIZE;
143 	}
144 	if (pages > (region->vhr_niov - region->vhr_idx)) {
145 		return (E2BIG);
146 	}
147 
148 	vmp = vq_page_hold(ring, gpa & PAGEMASK, writable);
149 	if (vmp == NULL) {
150 		return (EFAULT);
151 	}
152 	buf = (caddr_t)vmm_drv_page_readable(vmp);
153 
154 	region->vhr_iov[region->vhr_idx].iov_base = buf + front_offset;
155 	region->vhr_iov[region->vhr_idx].iov_len = front_len;
156 	region->vhr_idx++;
157 	gpa += front_len;
158 	len -= front_len;
159 	if (region->vhr_head == NULL) {
160 		region->vhr_head = vmp;
161 		region->vhr_tail = vmp;
162 	} else {
163 		vmm_drv_page_chain(region->vhr_tail, vmp);
164 		region->vhr_tail = vmp;
165 	}
166 
167 	for (uint_t i = 1; i < pages; i++) {
168 		ASSERT3U(gpa & PAGEOFFSET, ==, 0);
169 
170 		vmp = vq_page_hold(ring, gpa, writable);
171 		if (vmp == NULL) {
172 			return (EFAULT);
173 		}
174 		buf = (caddr_t)vmm_drv_page_readable(vmp);
175 
176 		const uint32_t chunk_len = MIN(len, PAGESIZE);
177 		region->vhr_iov[region->vhr_idx].iov_base = buf;
178 		region->vhr_iov[region->vhr_idx].iov_len = chunk_len;
179 		region->vhr_idx++;
180 		gpa += chunk_len;
181 		len -= chunk_len;
182 		vmm_drv_page_chain(region->vhr_tail, vmp);
183 		region->vhr_tail = vmp;
184 	}
185 
186 	return (0);
187 }
188 
189 static boolean_t
190 viona_ring_lease_expire_cb(void *arg)
191 {
192 	viona_vring_t *ring = arg;
193 
194 	mutex_enter(&ring->vr_lock);
195 	cv_broadcast(&ring->vr_cv);
196 	mutex_exit(&ring->vr_lock);
197 
198 	/* The lease will be broken asynchronously. */
199 	return (B_FALSE);
200 }
201 
202 static void
203 viona_ring_lease_drop(viona_vring_t *ring)
204 {
205 	ASSERT(MUTEX_HELD(&ring->vr_lock));
206 
207 	if (ring->vr_lease != NULL) {
208 		vmm_hold_t *hold = ring->vr_link->l_vm_hold;
209 
210 		ASSERT(hold != NULL);
211 
212 		/*
213 		 * Without an active lease, the ring mappings cannot be
214 		 * considered valid.
215 		 */
216 		viona_ring_unmap(ring);
217 
218 		vmm_drv_lease_break(hold, ring->vr_lease);
219 		ring->vr_lease = NULL;
220 	}
221 }
222 
223 boolean_t
224 viona_ring_lease_renew(viona_vring_t *ring)
225 {
226 	vmm_hold_t *hold = ring->vr_link->l_vm_hold;
227 
228 	ASSERT(hold != NULL);
229 	ASSERT(MUTEX_HELD(&ring->vr_lock));
230 
231 	viona_ring_lease_drop(ring);
232 
233 	/*
234 	 * Lease renewal will fail if the VM has requested that all holds be
235 	 * cleaned up.
236 	 */
237 	ring->vr_lease = vmm_drv_lease_sign(hold, viona_ring_lease_expire_cb,
238 	    ring);
239 	if (ring->vr_lease != NULL) {
240 		/* A ring undergoing renewal will need valid guest mappings */
241 		if (ring->vr_pa != 0 && ring->vr_size != 0) {
242 			/*
243 			 * If new mappings cannot be established, consider the
244 			 * lease renewal a failure.
245 			 */
246 			if (!viona_ring_map(ring, ring->vr_state == VRS_INIT)) {
247 				viona_ring_lease_drop(ring);
248 				return (B_FALSE);
249 			}
250 		}
251 	}
252 	return (ring->vr_lease != NULL);
253 }
254 
255 void
256 viona_ring_alloc(viona_link_t *link, viona_vring_t *ring)
257 {
258 	ring->vr_link = link;
259 	mutex_init(&ring->vr_lock, NULL, MUTEX_DRIVER, NULL);
260 	cv_init(&ring->vr_cv, NULL, CV_DRIVER, NULL);
261 	mutex_init(&ring->vr_a_mutex, NULL, MUTEX_DRIVER, NULL);
262 	mutex_init(&ring->vr_u_mutex, NULL, MUTEX_DRIVER, NULL);
263 }
264 
265 static void
266 viona_ring_misc_free(viona_vring_t *ring)
267 {
268 	const uint_t qsz = ring->vr_size;
269 
270 	viona_tx_ring_free(ring, qsz);
271 }
272 
273 void
274 viona_ring_free(viona_vring_t *ring)
275 {
276 	mutex_destroy(&ring->vr_lock);
277 	cv_destroy(&ring->vr_cv);
278 	mutex_destroy(&ring->vr_a_mutex);
279 	mutex_destroy(&ring->vr_u_mutex);
280 	ring->vr_link = NULL;
281 }
282 
283 int
284 viona_ring_init(viona_link_t *link, uint16_t idx,
285     const struct viona_ring_params *params)
286 {
287 	viona_vring_t *ring;
288 	kthread_t *t;
289 	int err = 0;
290 	const uint16_t qsz = params->vrp_size;
291 	const uint64_t pa = params->vrp_pa;
292 
293 	if (idx >= VIONA_VQ_MAX) {
294 		return (EINVAL);
295 	}
296 
297 	if (qsz == 0 || qsz > VRING_MAX_LEN || (1 << (ffs(qsz) - 1)) != qsz) {
298 		return (EINVAL);
299 	}
300 	if ((pa & (LEGACY_VQ_ALIGN - 1)) != 0) {
301 		return (EINVAL);
302 	}
303 
304 	ring = &link->l_vrings[idx];
305 	mutex_enter(&ring->vr_lock);
306 	if (ring->vr_state != VRS_RESET) {
307 		mutex_exit(&ring->vr_lock);
308 		return (EBUSY);
309 	}
310 	VERIFY(ring->vr_state_flags == 0);
311 
312 	ring->vr_lease = NULL;
313 	if (!viona_ring_lease_renew(ring)) {
314 		err = EBUSY;
315 		goto fail;
316 	}
317 
318 	ring->vr_size = qsz;
319 	ring->vr_mask = (ring->vr_size - 1);
320 	ring->vr_pa = pa;
321 	if (!viona_ring_map(ring, true)) {
322 		err = EINVAL;
323 		goto fail;
324 	}
325 
326 	/* Initialize queue indexes */
327 	ring->vr_cur_aidx = params->vrp_avail_idx;
328 	ring->vr_cur_uidx = params->vrp_used_idx;
329 
330 	if (idx == VIONA_VQ_TX) {
331 		viona_tx_ring_alloc(ring, qsz);
332 	}
333 
334 	/* Zero out MSI-X configuration */
335 	ring->vr_msi_addr = 0;
336 	ring->vr_msi_msg = 0;
337 
338 	/* Clear the stats */
339 	bzero(&ring->vr_stats, sizeof (ring->vr_stats));
340 
341 	t = viona_create_worker(ring);
342 	if (t == NULL) {
343 		err = ENOMEM;
344 		goto fail;
345 	}
346 	ring->vr_worker_thread = t;
347 	ring->vr_state = VRS_SETUP;
348 	cv_broadcast(&ring->vr_cv);
349 	mutex_exit(&ring->vr_lock);
350 	return (0);
351 
352 fail:
353 	viona_ring_lease_drop(ring);
354 	viona_ring_misc_free(ring);
355 	ring->vr_size = 0;
356 	ring->vr_mask = 0;
357 	ring->vr_pa = 0;
358 	ring->vr_cur_aidx = 0;
359 	ring->vr_cur_uidx = 0;
360 	mutex_exit(&ring->vr_lock);
361 	return (err);
362 }
363 
364 int
365 viona_ring_get_state(viona_link_t *link, uint16_t idx,
366     struct viona_ring_params *params)
367 {
368 	viona_vring_t *ring;
369 
370 	if (idx >= VIONA_VQ_MAX) {
371 		return (EINVAL);
372 	}
373 
374 	ring = &link->l_vrings[idx];
375 	mutex_enter(&ring->vr_lock);
376 
377 	params->vrp_size = ring->vr_size;
378 	params->vrp_pa = ring->vr_pa;
379 
380 	if (ring->vr_state == VRS_RUN) {
381 		/* On a running ring, we must heed the avail/used locks */
382 		mutex_enter(&ring->vr_a_mutex);
383 		params->vrp_avail_idx = ring->vr_cur_aidx;
384 		mutex_exit(&ring->vr_a_mutex);
385 		mutex_enter(&ring->vr_u_mutex);
386 		params->vrp_used_idx = ring->vr_cur_uidx;
387 		mutex_exit(&ring->vr_u_mutex);
388 	} else {
389 		/* Otherwise vr_lock is adequate protection */
390 		params->vrp_avail_idx = ring->vr_cur_aidx;
391 		params->vrp_used_idx = ring->vr_cur_uidx;
392 	}
393 
394 	mutex_exit(&ring->vr_lock);
395 
396 	return (0);
397 }
398 
399 int
400 viona_ring_reset(viona_vring_t *ring, boolean_t heed_signals)
401 {
402 	mutex_enter(&ring->vr_lock);
403 	if (ring->vr_state == VRS_RESET) {
404 		mutex_exit(&ring->vr_lock);
405 		return (0);
406 	}
407 
408 	if ((ring->vr_state_flags & VRSF_REQ_STOP) == 0) {
409 		ring->vr_state_flags |= VRSF_REQ_STOP;
410 		cv_broadcast(&ring->vr_cv);
411 	}
412 	while (ring->vr_state != VRS_RESET) {
413 		if (!heed_signals) {
414 			cv_wait(&ring->vr_cv, &ring->vr_lock);
415 		} else {
416 			int rs;
417 
418 			rs = cv_wait_sig(&ring->vr_cv, &ring->vr_lock);
419 			if (rs <= 0 && ring->vr_state != VRS_RESET) {
420 				mutex_exit(&ring->vr_lock);
421 				return (EINTR);
422 			}
423 		}
424 	}
425 	mutex_exit(&ring->vr_lock);
426 	return (0);
427 }
428 
429 static bool
430 viona_ring_map(viona_vring_t *ring, bool defer_dirty)
431 {
432 	const uint16_t qsz = ring->vr_size;
433 	uintptr_t pa = ring->vr_pa;
434 
435 	ASSERT3U(qsz, !=, 0);
436 	ASSERT3U(qsz, <=, VRING_MAX_LEN);
437 	ASSERT3U(pa, !=, 0);
438 	ASSERT3U(pa & (LEGACY_VQ_ALIGN - 1), ==, 0);
439 	ASSERT3U(LEGACY_VQ_ALIGN, ==, PAGESIZE);
440 	ASSERT(MUTEX_HELD(&ring->vr_lock));
441 	ASSERT3P(ring->vr_map_pages, ==, NULL);
442 
443 	const uint_t npages = LEGACY_VQ_PAGES(qsz);
444 	ring->vr_map_pages = kmem_zalloc(npages * sizeof (void *), KM_SLEEP);
445 
446 	int page_flags = 0;
447 	if (defer_dirty) {
448 		/*
449 		 * During initialization, and when entering the paused state,
450 		 * the page holds for a virtqueue are established with the
451 		 * DEFER_DIRTY flag set.
452 		 *
453 		 * This prevents those page holds from immediately marking the
454 		 * underlying pages as dirty, since the viona emulation is not
455 		 * yet performing any accesses.  Once the ring transitions to
456 		 * the VRS_RUN state, the held pages will be marked as dirty.
457 		 *
458 		 * Any ring mappings performed outside those state conditions,
459 		 * such as those part of vmm_lease renewal during steady-state
460 		 * operation, will map the ring pages normally (as considered
461 		 * immediately dirty).
462 		 */
463 		page_flags |= VMPF_DEFER_DIRTY;
464 	}
465 
466 	vmm_page_t *prev = NULL;
467 	for (uint_t i = 0; i < npages; i++, pa += PAGESIZE) {
468 		vmm_page_t *vmp;
469 
470 		vmp = vmm_drv_page_hold_ext(ring->vr_lease, pa,
471 		    PROT_READ | PROT_WRITE, page_flags);
472 		if (vmp == NULL) {
473 			viona_ring_unmap(ring);
474 			return (false);
475 		}
476 
477 		/*
478 		 * Keep the first page has the head of the chain, appending all
479 		 * subsequent pages to the tail.
480 		 */
481 		if (prev == NULL) {
482 			ring->vr_map_hold = vmp;
483 		} else {
484 			vmm_drv_page_chain(prev, vmp);
485 		}
486 		prev = vmp;
487 		ring->vr_map_pages[i] = vmm_drv_page_writable(vmp);
488 	}
489 
490 	return (true);
491 }
492 
493 static void
494 viona_ring_mark_dirty(viona_vring_t *ring)
495 {
496 	ASSERT(MUTEX_HELD(&ring->vr_lock));
497 	ASSERT(ring->vr_map_hold != NULL);
498 
499 	for (vmm_page_t *vp = ring->vr_map_hold; vp != NULL;
500 	    vp = vmm_drv_page_next(vp)) {
501 		vmm_drv_page_mark_dirty(vp);
502 	}
503 }
504 
505 static void
506 viona_ring_unmap(viona_vring_t *ring)
507 {
508 	ASSERT(MUTEX_HELD(&ring->vr_lock));
509 
510 	void **map = ring->vr_map_pages;
511 	if (map != NULL) {
512 		const uint_t npages = LEGACY_VQ_PAGES(ring->vr_size);
513 		kmem_free(map, npages * sizeof (void *));
514 		ring->vr_map_pages = NULL;
515 
516 		vmm_drv_page_release_chain(ring->vr_map_hold);
517 		ring->vr_map_hold = NULL;
518 	} else {
519 		ASSERT3P(ring->vr_map_hold, ==, NULL);
520 	}
521 }
522 
523 static inline void *
524 viona_ring_addr(viona_vring_t *ring, uint_t off)
525 {
526 	ASSERT3P(ring->vr_map_pages, !=, NULL);
527 	ASSERT3U(LEGACY_VQ_SIZE(ring->vr_size), >, off);
528 
529 	const uint_t page_num = off / PAGESIZE;
530 	const uint_t page_off = off % PAGESIZE;
531 	return ((caddr_t)ring->vr_map_pages[page_num] + page_off);
532 }
533 
534 void
535 viona_intr_ring(viona_vring_t *ring, boolean_t skip_flags_check)
536 {
537 	if (!skip_flags_check) {
538 		volatile uint16_t *avail_flags = viona_ring_addr(ring,
539 		    LEGACY_AVAIL_FLAGS_OFF(ring->vr_size));
540 
541 		if ((*avail_flags & VRING_AVAIL_F_NO_INTERRUPT) != 0) {
542 			return;
543 		}
544 	}
545 
546 	mutex_enter(&ring->vr_lock);
547 	uint64_t addr = ring->vr_msi_addr;
548 	uint64_t msg = ring->vr_msi_msg;
549 	mutex_exit(&ring->vr_lock);
550 	if (addr != 0) {
551 		/* Deliver the interrupt directly, if so configured... */
552 		(void) vmm_drv_msi(ring->vr_lease, addr, msg);
553 	} else {
554 		/* ... otherwise, leave it to userspace */
555 		if (atomic_cas_uint(&ring->vr_intr_enabled, 0, 1) == 0) {
556 			pollwakeup(&ring->vr_link->l_pollhead, POLLRDBAND);
557 		}
558 	}
559 }
560 
561 static inline bool
562 vring_stop_req(const viona_vring_t *ring)
563 {
564 	return ((ring->vr_state_flags & VRSF_REQ_STOP) != 0);
565 }
566 
567 static inline bool
568 vring_pause_req(const viona_vring_t *ring)
569 {
570 	return ((ring->vr_state_flags & VRSF_REQ_PAUSE) != 0);
571 }
572 
573 static inline bool
574 vring_start_req(const viona_vring_t *ring)
575 {
576 	return ((ring->vr_state_flags & VRSF_REQ_START) != 0);
577 }
578 
579 /*
580  * Check if vring worker thread should bail out.  This will heed indications
581  * that the containing process is exiting, as well as requests to stop or pause
582  * the ring.  The `stop_only` parameter controls if pause requests are ignored
583  * (true) or checked (false).
584  *
585  * Caller should hold vr_lock.
586  */
587 static bool
588 vring_need_bail_ext(const viona_vring_t *ring, bool stop_only)
589 {
590 	ASSERT(MUTEX_HELD(&ring->vr_lock));
591 
592 	if (vring_stop_req(ring) ||
593 	    (!stop_only && vring_pause_req(ring))) {
594 		return (true);
595 	}
596 
597 	kthread_t *t = ring->vr_worker_thread;
598 	if (t != NULL) {
599 		proc_t *p = ttoproc(t);
600 
601 		ASSERT(p != NULL);
602 		if ((p->p_flag & SEXITING) != 0) {
603 			return (true);
604 		}
605 	}
606 	return (false);
607 }
608 
609 bool
610 vring_need_bail(const viona_vring_t *ring)
611 {
612 	return (vring_need_bail_ext(ring, false));
613 }
614 
615 int
616 viona_ring_pause(viona_vring_t *ring)
617 {
618 	mutex_enter(&ring->vr_lock);
619 	switch (ring->vr_state) {
620 	case VRS_RESET:
621 	case VRS_SETUP:
622 	case VRS_INIT:
623 		/*
624 		 * For rings which have not yet started (even those in the
625 		 * VRS_SETUP and VRS_INIT phases, where there a running worker
626 		 * thread (waiting to be released to do its intended task), it
627 		 * is adequate to simply clear any start request, to keep them
628 		 * from proceeding into the actual work processing function.
629 		 */
630 		ring->vr_state_flags &= ~VRSF_REQ_START;
631 		mutex_exit(&ring->vr_lock);
632 		return (0);
633 
634 	case VRS_STOP:
635 		if ((ring->vr_state_flags & VRSF_REQ_STOP) != 0) {
636 			/* A ring on its way to RESET cannot be paused. */
637 			mutex_exit(&ring->vr_lock);
638 			return (EBUSY);
639 		}
640 		/* FALLTHROUGH */
641 	case VRS_RUN:
642 		ring->vr_state_flags |= VRSF_REQ_PAUSE;
643 		cv_broadcast(&ring->vr_cv);
644 		break;
645 
646 	default:
647 		panic("invalid ring state %d", ring->vr_state);
648 		break;
649 	}
650 
651 	for (;;) {
652 		int res = cv_wait_sig(&ring->vr_cv, &ring->vr_lock);
653 
654 		if (ring->vr_state == VRS_INIT ||
655 		    (ring->vr_state_flags & VRSF_REQ_PAUSE) == 0) {
656 			/* Ring made it to (or through) paused state */
657 			mutex_exit(&ring->vr_lock);
658 			return (0);
659 		}
660 		if (res == 0) {
661 			/* interrupted by signal */
662 			mutex_exit(&ring->vr_lock);
663 			return (EINTR);
664 		}
665 	}
666 	/* NOTREACHED */
667 }
668 
669 static void
670 viona_worker(void *arg)
671 {
672 	viona_vring_t *ring = (viona_vring_t *)arg;
673 	viona_link_t *link = ring->vr_link;
674 
675 	mutex_enter(&ring->vr_lock);
676 	VERIFY3U(ring->vr_state, ==, VRS_SETUP);
677 
678 	/* Bail immediately if ring shutdown or process exit was requested */
679 	if (vring_need_bail_ext(ring, true)) {
680 		goto ring_reset;
681 	}
682 
683 	/* Report worker thread as alive and notify creator */
684 ring_init:
685 	ring->vr_state = VRS_INIT;
686 	cv_broadcast(&ring->vr_cv);
687 
688 	while (!vring_start_req(ring)) {
689 		/*
690 		 * Keeping lease renewals timely while waiting for the ring to
691 		 * be started is important for avoiding deadlocks.
692 		 */
693 		if (vmm_drv_lease_expired(ring->vr_lease)) {
694 			if (!viona_ring_lease_renew(ring)) {
695 				goto ring_reset;
696 			}
697 		}
698 
699 		(void) cv_wait_sig(&ring->vr_cv, &ring->vr_lock);
700 
701 		if (vring_pause_req(ring)) {
702 			/* We are already paused in the INIT state. */
703 			ring->vr_state_flags &= ~VRSF_REQ_PAUSE;
704 		}
705 		if (vring_need_bail_ext(ring, true)) {
706 			goto ring_reset;
707 		}
708 	}
709 
710 	ASSERT((ring->vr_state_flags & VRSF_REQ_START) != 0);
711 	ring->vr_state = VRS_RUN;
712 	ring->vr_state_flags &= ~VRSF_REQ_START;
713 	viona_ring_mark_dirty(ring);
714 
715 	/* Ensure ring lease is valid first */
716 	if (vmm_drv_lease_expired(ring->vr_lease)) {
717 		if (!viona_ring_lease_renew(ring)) {
718 			goto ring_reset;
719 		}
720 	}
721 
722 	/* Process actual work */
723 	if (ring == &link->l_vrings[VIONA_VQ_RX]) {
724 		viona_worker_rx(ring, link);
725 	} else if (ring == &link->l_vrings[VIONA_VQ_TX]) {
726 		viona_worker_tx(ring, link);
727 	} else {
728 		panic("unexpected ring: %p", (void *)ring);
729 	}
730 
731 	VERIFY3U(ring->vr_state, ==, VRS_STOP);
732 	VERIFY3U(ring->vr_xfer_outstanding, ==, 0);
733 
734 	/* Respond to a pause request if the ring is not required to stop */
735 	if (vring_pause_req(ring)) {
736 		ring->vr_state_flags &= ~VRSF_REQ_PAUSE;
737 
738 		if (vring_need_bail_ext(ring, true)) {
739 			goto ring_reset;
740 		}
741 
742 		/*
743 		 * To complete pausing of the ring, unmap and re-map the pages
744 		 * underpinning the virtqueue.  This is to synchronize their
745 		 * dirty state in the backing page tables and restore the
746 		 * defer-dirty state on the held pages.
747 		 */
748 		viona_ring_unmap(ring);
749 		if (viona_ring_map(ring, true)) {
750 			goto ring_init;
751 		}
752 
753 		/*
754 		 * If the ring pages failed to be mapped, fallthrough to
755 		 * ring-reset like any other failure.
756 		 */
757 	}
758 
759 ring_reset:
760 	viona_ring_misc_free(ring);
761 
762 	viona_ring_lease_drop(ring);
763 	ring->vr_cur_aidx = 0;
764 	ring->vr_size = 0;
765 	ring->vr_mask = 0;
766 	ring->vr_pa = 0;
767 	ring->vr_state = VRS_RESET;
768 	ring->vr_state_flags = 0;
769 	ring->vr_worker_thread = NULL;
770 	cv_broadcast(&ring->vr_cv);
771 	mutex_exit(&ring->vr_lock);
772 
773 	mutex_enter(&ttoproc(curthread)->p_lock);
774 	lwp_exit();
775 }
776 
777 static kthread_t *
778 viona_create_worker(viona_vring_t *ring)
779 {
780 	k_sigset_t hold_set;
781 	proc_t *p = curproc;
782 	kthread_t *t;
783 	klwp_t *lwp;
784 
785 	ASSERT(MUTEX_HELD(&ring->vr_lock));
786 	ASSERT(ring->vr_state == VRS_RESET);
787 
788 	sigfillset(&hold_set);
789 	lwp = lwp_create(viona_worker, (void *)ring, 0, p, TS_STOPPED,
790 	    minclsyspri - 1, &hold_set, curthread->t_cid, 0);
791 	if (lwp == NULL) {
792 		return (NULL);
793 	}
794 
795 	t = lwptot(lwp);
796 	mutex_enter(&p->p_lock);
797 	t->t_proc_flag = (t->t_proc_flag & ~TP_HOLDLWP) | TP_KTHREAD;
798 	lwp_create_done(t);
799 	mutex_exit(&p->p_lock);
800 
801 	return (t);
802 }
803 
804 void
805 vq_read_desc(viona_vring_t *ring, uint16_t idx, struct virtio_desc *descp)
806 {
807 	const uint_t entry_off = idx * sizeof (struct virtio_desc);
808 
809 	ASSERT3U(idx, <, ring->vr_size);
810 
811 	bcopy(viona_ring_addr(ring, entry_off), descp, sizeof (*descp));
812 }
813 
814 static uint16_t
815 vq_read_avail(viona_vring_t *ring, uint16_t idx)
816 {
817 	ASSERT3U(idx, <, ring->vr_size);
818 
819 	volatile uint16_t *avail_ent =
820 	    viona_ring_addr(ring, LEGACY_AVAIL_ENT_OFF(ring->vr_size, idx));
821 	return (*avail_ent);
822 }
823 
824 /*
825  * Given a buffer descriptor `desc`, attempt to map the pages backing that
826  * region of guest physical memory, taking into account that there are no
827  * guarantees about guest-contiguous pages being host-contiguous.
828  */
829 static int
830 vq_map_desc_bufs(viona_vring_t *ring, const struct virtio_desc *desc,
831     vq_held_region_t *region)
832 {
833 	int err;
834 
835 	if (desc->vd_len == 0) {
836 		VIONA_PROBE2(desc_bad_len, viona_vring_t *, ring,
837 		    uint32_t, desc->vd_len);
838 		VIONA_RING_STAT_INCR(ring, desc_bad_len);
839 		return (EINVAL);
840 	}
841 
842 	err = vq_region_hold(ring, desc->vd_addr, desc->vd_len,
843 	    (desc->vd_flags & VRING_DESC_F_WRITE) != 0, region);
844 	switch (err) {
845 	case E2BIG:
846 		VIONA_PROBE1(too_many_desc, viona_vring_t *, ring);
847 		VIONA_RING_STAT_INCR(ring, too_many_desc);
848 		break;
849 	case EFAULT:
850 		VIONA_PROBE_BAD_RING_ADDR(ring, desc->vd_addr);
851 		VIONA_RING_STAT_INCR(ring, bad_ring_addr);
852 		break;
853 	default:
854 		break;
855 	}
856 
857 	return (err);
858 }
859 
860 /*
861  * Walk an indirect buffer descriptor `desc`, attempting to map the pages
862  * backing the regions of guest memory covered by its constituent descriptors.
863  */
864 static int
865 vq_map_indir_desc_bufs(viona_vring_t *ring, const struct virtio_desc *desc,
866     vq_held_region_t *region)
867 {
868 	const uint16_t indir_count = desc->vd_len / sizeof (struct virtio_desc);
869 
870 	if ((desc->vd_len & 0xf) != 0 || indir_count == 0 ||
871 	    indir_count > ring->vr_size ||
872 	    desc->vd_addr > (desc->vd_addr + desc->vd_len)) {
873 		VIONA_PROBE2(indir_bad_len, viona_vring_t *, ring,
874 		    uint32_t, desc->vd_len);
875 		VIONA_RING_STAT_INCR(ring, indir_bad_len);
876 		return (EINVAL);
877 	}
878 
879 	uint16_t indir_next = 0;
880 	const uint8_t *buf = NULL;
881 	uint64_t buf_gpa = UINT64_MAX;
882 	vmm_page_t *vmp = NULL;
883 	int err = 0;
884 
885 	for (;;) {
886 		uint64_t indir_gpa =
887 		    desc->vd_addr + (indir_next * sizeof (struct virtio_desc));
888 		uint64_t indir_page = indir_gpa & PAGEMASK;
889 		struct virtio_desc vp;
890 
891 		/*
892 		 * Get a mapping for the page that the next indirect descriptor
893 		 * resides in, if has not already been done.
894 		 */
895 		if (indir_page != buf_gpa) {
896 			if (vmp != NULL) {
897 				vmm_drv_page_release(vmp);
898 			}
899 			vmp = vq_page_hold(ring, indir_page, false);
900 			if (vmp == NULL) {
901 				VIONA_PROBE_BAD_RING_ADDR(ring, indir_page);
902 				VIONA_RING_STAT_INCR(ring, bad_ring_addr);
903 				err = EFAULT;
904 				break;
905 			}
906 			buf_gpa = indir_page;
907 			buf = vmm_drv_page_readable(vmp);
908 		}
909 
910 		/*
911 		 * A copy of the indirect descriptor is made here, rather than
912 		 * simply using a reference pointer.  This prevents malicious or
913 		 * erroneous guest writes to the descriptor from fooling the
914 		 * flags/bounds verification through a race.
915 		 */
916 		bcopy(buf + (indir_gpa - indir_page), &vp, sizeof (vp));
917 
918 		if (vp.vd_flags & VRING_DESC_F_INDIRECT) {
919 			VIONA_PROBE1(indir_bad_nest, viona_vring_t *, ring);
920 			VIONA_RING_STAT_INCR(ring, indir_bad_nest);
921 			err = EINVAL;
922 			break;
923 		} else if (vp.vd_len == 0) {
924 			VIONA_PROBE2(desc_bad_len, viona_vring_t *, ring,
925 			    uint32_t, vp.vd_len);
926 			VIONA_RING_STAT_INCR(ring, desc_bad_len);
927 			err = EINVAL;
928 			break;
929 		}
930 
931 		err = vq_map_desc_bufs(ring, &vp, region);
932 		if (err != 0) {
933 			break;
934 		}
935 
936 		/* Successfully reach the end of the indir chain */
937 		if ((vp.vd_flags & VRING_DESC_F_NEXT) == 0) {
938 			break;
939 		}
940 		if (region->vhr_idx >= region->vhr_niov) {
941 			VIONA_PROBE1(too_many_desc, viona_vring_t *, ring);
942 			VIONA_RING_STAT_INCR(ring, too_many_desc);
943 			err = E2BIG;
944 			break;
945 		}
946 
947 		indir_next = vp.vd_next;
948 		if (indir_next >= indir_count) {
949 			VIONA_PROBE3(indir_bad_next, viona_vring_t *, ring,
950 			    uint16_t, indir_next, uint16_t, indir_count);
951 			VIONA_RING_STAT_INCR(ring, indir_bad_next);
952 			err = EINVAL;
953 			break;
954 		}
955 	}
956 
957 	if (vmp != NULL) {
958 		vmm_drv_page_release(vmp);
959 	}
960 	return (err);
961 }
962 
963 int
964 vq_popchain(viona_vring_t *ring, struct iovec *iov, uint_t niov,
965     uint16_t *cookie, vmm_page_t **chain)
966 {
967 	uint16_t ndesc, idx, head, next;
968 	struct virtio_desc vdir;
969 	vq_held_region_t region = {
970 		.vhr_niov = niov,
971 		.vhr_iov = iov,
972 	};
973 
974 	ASSERT(iov != NULL);
975 	ASSERT(niov > 0 && niov < INT_MAX);
976 	ASSERT(*chain == NULL);
977 
978 	mutex_enter(&ring->vr_a_mutex);
979 	idx = ring->vr_cur_aidx;
980 	ndesc = viona_ring_num_avail(ring);
981 
982 	if (ndesc == 0) {
983 		mutex_exit(&ring->vr_a_mutex);
984 		return (0);
985 	}
986 	if (ndesc > ring->vr_size) {
987 		/*
988 		 * Despite the fact that the guest has provided an 'avail_idx'
989 		 * which indicates that an impossible number of descriptors are
990 		 * available, continue on and attempt to process the next one.
991 		 *
992 		 * The transgression will not escape the probe or stats though.
993 		 */
994 		VIONA_PROBE2(ndesc_too_high, viona_vring_t *, ring,
995 		    uint16_t, ndesc);
996 		VIONA_RING_STAT_INCR(ring, ndesc_too_high);
997 	}
998 
999 	head = vq_read_avail(ring, idx & ring->vr_mask);
1000 	next = head;
1001 
1002 	for (region.vhr_idx = 0; region.vhr_idx < niov; next = vdir.vd_next) {
1003 		if (next >= ring->vr_size) {
1004 			VIONA_PROBE2(bad_idx, viona_vring_t *, ring,
1005 			    uint16_t, next);
1006 			VIONA_RING_STAT_INCR(ring, bad_idx);
1007 			break;
1008 		}
1009 
1010 		vq_read_desc(ring, next, &vdir);
1011 		if ((vdir.vd_flags & VRING_DESC_F_INDIRECT) == 0) {
1012 			if (vq_map_desc_bufs(ring, &vdir, &region) != 0) {
1013 				break;
1014 			}
1015 		} else {
1016 			/*
1017 			 * Per the specification (Virtio 1.1 S2.6.5.3.1):
1018 			 *   A driver MUST NOT set both VIRTQ_DESC_F_INDIRECT
1019 			 *   and VIRTQ_DESC_F_NEXT in `flags`.
1020 			 */
1021 			if ((vdir.vd_flags & VRING_DESC_F_NEXT) != 0) {
1022 				VIONA_PROBE3(indir_bad_next,
1023 				    viona_vring_t *, ring,
1024 				    uint16_t, next, uint16_t, 0);
1025 				VIONA_RING_STAT_INCR(ring, indir_bad_next);
1026 				break;
1027 			}
1028 
1029 			if (vq_map_indir_desc_bufs(ring, &vdir, &region) != 0) {
1030 				break;
1031 			}
1032 		}
1033 
1034 		if ((vdir.vd_flags & VRING_DESC_F_NEXT) == 0) {
1035 			ring->vr_cur_aidx++;
1036 			mutex_exit(&ring->vr_a_mutex);
1037 
1038 			*cookie = head;
1039 			*chain = region.vhr_head;
1040 			return (region.vhr_idx);
1041 		}
1042 	}
1043 
1044 	mutex_exit(&ring->vr_a_mutex);
1045 	if (region.vhr_head != NULL) {
1046 		/*
1047 		 * If any pages were held prior to encountering an error, we
1048 		 * must release them now.
1049 		 */
1050 		vmm_drv_page_release_chain(region.vhr_head);
1051 	}
1052 	return (-1);
1053 }
1054 
1055 
1056 static void
1057 vq_write_used_ent(viona_vring_t *ring, uint16_t idx, uint16_t cookie,
1058     uint32_t len)
1059 {
1060 	/*
1061 	 * In a larger ring, entry could be split across pages, so be sure to
1062 	 * account for that when configuring the transfer by looking up the ID
1063 	 * and length addresses separately, rather than an address for a
1064 	 * combined `struct virtio_used`.
1065 	 */
1066 	const uint_t used_id_off = LEGACY_USED_ENT_OFF(ring->vr_size, idx);
1067 	const uint_t used_len_off = used_id_off + sizeof (uint32_t);
1068 	volatile uint32_t *idp = viona_ring_addr(ring, used_id_off);
1069 	volatile uint32_t *lenp = viona_ring_addr(ring, used_len_off);
1070 
1071 	ASSERT(MUTEX_HELD(&ring->vr_u_mutex));
1072 
1073 	*idp = cookie;
1074 	*lenp = len;
1075 }
1076 
1077 static void
1078 vq_write_used_idx(viona_vring_t *ring, uint16_t idx)
1079 {
1080 	ASSERT(MUTEX_HELD(&ring->vr_u_mutex));
1081 
1082 	volatile uint16_t *used_idx =
1083 	    viona_ring_addr(ring, LEGACY_USED_IDX_OFF(ring->vr_size));
1084 	*used_idx = idx;
1085 }
1086 
1087 void
1088 vq_pushchain(viona_vring_t *ring, uint32_t len, uint16_t cookie)
1089 {
1090 	uint16_t uidx;
1091 
1092 	mutex_enter(&ring->vr_u_mutex);
1093 
1094 	uidx = ring->vr_cur_uidx;
1095 	vq_write_used_ent(ring, uidx & ring->vr_mask, cookie, len);
1096 	uidx++;
1097 	membar_producer();
1098 
1099 	vq_write_used_idx(ring, uidx);
1100 	ring->vr_cur_uidx = uidx;
1101 
1102 	mutex_exit(&ring->vr_u_mutex);
1103 }
1104 
1105 void
1106 vq_pushchain_many(viona_vring_t *ring, uint_t num_bufs, used_elem_t *elem)
1107 {
1108 	uint16_t uidx;
1109 
1110 	mutex_enter(&ring->vr_u_mutex);
1111 
1112 	uidx = ring->vr_cur_uidx;
1113 
1114 	for (uint_t i = 0; i < num_bufs; i++, uidx++) {
1115 		vq_write_used_ent(ring, uidx & ring->vr_mask, elem[i].id,
1116 		    elem[i].len);
1117 	}
1118 
1119 	membar_producer();
1120 	vq_write_used_idx(ring, uidx);
1121 	ring->vr_cur_uidx = uidx;
1122 
1123 	mutex_exit(&ring->vr_u_mutex);
1124 }
1125 
1126 /*
1127  * Set USED_NO_NOTIFY on VQ so guest elides doorbell calls for new entries.
1128  */
1129 void
1130 viona_ring_disable_notify(viona_vring_t *ring)
1131 {
1132 	volatile uint16_t *used_flags =
1133 	    viona_ring_addr(ring, LEGACY_USED_FLAGS_OFF(ring->vr_size));
1134 
1135 	*used_flags |= VRING_USED_F_NO_NOTIFY;
1136 }
1137 
1138 /*
1139  * Clear USED_NO_NOTIFY on VQ so guest resumes doorbell calls for new entries.
1140  */
1141 void
1142 viona_ring_enable_notify(viona_vring_t *ring)
1143 {
1144 	volatile uint16_t *used_flags =
1145 	    viona_ring_addr(ring, LEGACY_USED_FLAGS_OFF(ring->vr_size));
1146 
1147 	*used_flags &= ~VRING_USED_F_NO_NOTIFY;
1148 }
1149 
1150 /*
1151  * Return the number of available descriptors in the vring taking care of the
1152  * 16-bit index wraparound.
1153  *
1154  * Note: If the number of apparently available descriptors is larger than the
1155  * ring size (due to guest misbehavior), this check will still report the
1156  * positive count of descriptors.
1157  */
1158 uint16_t
1159 viona_ring_num_avail(viona_vring_t *ring)
1160 {
1161 	volatile uint16_t *avail_idx =
1162 	    viona_ring_addr(ring, LEGACY_AVAIL_IDX_OFF(ring->vr_size));
1163 
1164 	return (*avail_idx - ring->vr_cur_aidx);
1165 }
1166