xref: /illumos-gate/usr/src/uts/intel/io/viona/viona_ring.c (revision 8119dad84d6416f13557b0ba8e2aaf9064cbcfd3)
1 /*
2  * Copyright (c) 2013  Chris Torek <torek @ torek net>
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  *
14  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24  * SUCH DAMAGE.
25  */
26 /*
27  * This file and its contents are supplied under the terms of the
28  * Common Development and Distribution License ("CDDL"), version 1.0.
29  * You may only use this file in accordance with the terms of version
30  * 1.0 of the CDDL.
31  *
32  * A full copy of the text of the CDDL should have accompanied this
33  * source.  A copy of the CDDL is also available via the Internet at
34  * http://www.illumos.org/license/CDDL.
35  *
36  * Copyright 2015 Pluribus Networks Inc.
37  * Copyright 2019 Joyent, Inc.
38  * Copyright 2024 Oxide Computer Company
39  */
40 
41 
42 #include <sys/disp.h>
43 
44 #include "viona_impl.h"
45 
46 #define	VRING_MAX_LEN		32768
47 
48 /* Layout and sizing as defined in the spec for a legacy-style virtqueue */
49 
50 #define	LEGACY_VQ_ALIGN		PAGESIZE
51 
52 #define	LEGACY_DESC_SZ(qsz)	((qsz) * sizeof (struct virtio_desc))
53 /*
54  * Available ring consists of flags (uint16_t), avail_idx (uint16_t), qsz
55  * avail entries (uint16_t each), and (optional) used_event (uint16_t).
56  */
57 #define	LEGACY_AVAIL_SZ(qsz)	(((qsz) + 3) * sizeof (uint16_t))
58 /*
59  * Used ring consists of flags (uint16_t), used_idx (uint16_t), qsz used
60  * entries (two uint32_t each), and (optional) avail_event (uint16_t).
61  */
62 #define	LEGACY_USED_SZ(qsz)	\
63 	((qsz) * sizeof (struct virtio_used) + 3 * sizeof (uint16_t))
64 
65 #define	LEGACY_AVAIL_FLAGS_OFF(qsz)	LEGACY_DESC_SZ(qsz)
66 #define	LEGACY_AVAIL_IDX_OFF(qsz)	\
67 	(LEGACY_DESC_SZ(qsz) + sizeof (uint16_t))
68 #define	LEGACY_AVAIL_ENT_OFF(qsz, idx)	\
69 	(LEGACY_DESC_SZ(qsz) + (2 + (idx)) * sizeof (uint16_t))
70 
71 #define	LEGACY_USED_FLAGS_OFF(qsz)	\
72 	P2ROUNDUP(LEGACY_DESC_SZ(qsz) + LEGACY_AVAIL_SZ(qsz), LEGACY_VQ_ALIGN)
73 #define	LEGACY_USED_IDX_OFF(qsz)	\
74 	(LEGACY_USED_FLAGS_OFF(qsz) + sizeof (uint16_t))
75 #define	LEGACY_USED_ENT_OFF(qsz, idx)	\
76 	(LEGACY_USED_FLAGS_OFF(qsz) + 2 * sizeof (uint16_t) + \
77 	(idx) * sizeof (struct virtio_used))
78 
79 #define	LEGACY_VQ_SIZE(qsz)	\
80 	(LEGACY_USED_FLAGS_OFF(qsz) + \
81 	P2ROUNDUP(LEGACY_USED_SZ(qsz), LEGACY_VQ_ALIGN))
82 #define	LEGACY_VQ_PAGES(qsz)	(LEGACY_VQ_SIZE(qsz) / PAGESIZE)
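
/*
 * For illustration only (hypothetical sizing): with a queue size of 256, a
 * PAGESIZE of 4096, a 16-byte virtio_desc, and an 8-byte virtio_used, the
 * macros above work out to:
 *
 *	LEGACY_DESC_SZ(256)		= 256 * 16 = 4096
 *	LEGACY_AVAIL_SZ(256)		= (256 + 3) * 2 = 518
 *	LEGACY_USED_SZ(256)		= 256 * 8 + 3 * 2 = 2054
 *	LEGACY_USED_FLAGS_OFF(256)	= P2ROUNDUP(4096 + 518, 4096) = 8192
 *	LEGACY_VQ_SIZE(256)		= 8192 + P2ROUNDUP(2054, 4096) = 12288
 *	LEGACY_VQ_PAGES(256)		= 3
 *
 * That is: the descriptor table fills the first page, the available ring
 * starts the second, and the used ring begins on the (page-aligned) third.
 */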
83 
84 struct vq_held_region {
85 	struct iovec	*vhr_iov;
86 	vmm_page_t	*vhr_head;
87 	vmm_page_t	*vhr_tail;
88 	/* Length of iovec array supplied in `vhr_iov` */
89 	uint_t		vhr_niov;
90 	/*
91 	 * Index into vhr_iov, indicating the next "free" entry (following the
92 	 * last entry which has valid contents).
93 	 */
94 	uint_t		vhr_idx;
95 };
96 typedef struct vq_held_region vq_held_region_t;
97 
98 static bool viona_ring_map(viona_vring_t *, bool);
99 static void viona_ring_unmap(viona_vring_t *);
100 static kthread_t *viona_create_worker(viona_vring_t *);
101 static void viona_ring_consolidate_stats(viona_vring_t *);
102 
103 static vmm_page_t *
104 vq_page_hold(viona_vring_t *ring, uint64_t gpa, bool writable)
105 {
106 	ASSERT3P(ring->vr_lease, !=, NULL);
107 
108 	int prot = PROT_READ;
109 	if (writable) {
110 		prot |= PROT_WRITE;
111 	}
112 
113 	return (vmm_drv_page_hold(ring->vr_lease, gpa, prot));
114 }
115 
116 /*
117  * Establish a hold on the page(s) which back the region of guest memory covered
118  * by [gpa, gpa + len).  The host-kernel-virtual pointers to those pages are
119  * stored in the iovec array supplied in `region`, along with the chain of
120  * vmm_page_t entries representing the held pages.  Since guest memory
121  * carries no guarantees of being physically contiguous (on the host), it is
122  * assumed that an iovec entry will be required for each PAGESIZE section
123  * covered by the specified `gpa` and `len` range.  For each iovec entry
124  * successfully populated by holding a page, `vhr_idx` will be incremented so it
125  * references the next available iovec entry (or `vhr_niov`, if the iovec array
126  * is full).  The responsibility for releasing the `vmm_page_t` chain (stored in
127  * `vhr_head` and `vhr_tail`) resides with the caller, regardless of the result.
128  */
129 static int
130 vq_region_hold(viona_vring_t *ring, uint64_t gpa, uint32_t len,
131     bool writable, vq_held_region_t *region)
132 {
133 	const uint32_t front_offset = gpa & PAGEOFFSET;
134 	const uint32_t front_len = MIN(len, PAGESIZE - front_offset);
135 	uint_t pages = 1;
136 	vmm_page_t *vmp;
137 	caddr_t buf;
138 
139 	ASSERT3U(region->vhr_idx, <, region->vhr_niov);
140 
141 	if (front_len < len) {
142 		pages += P2ROUNDUP((uint64_t)(len - front_len),
143 		    PAGESIZE) / PAGESIZE;
144 	}
145 	if (pages > (region->vhr_niov - region->vhr_idx)) {
146 		return (E2BIG);
147 	}
148 
149 	vmp = vq_page_hold(ring, gpa & PAGEMASK, writable);
150 	if (vmp == NULL) {
151 		return (EFAULT);
152 	}
153 	buf = (caddr_t)vmm_drv_page_readable(vmp);
154 
155 	region->vhr_iov[region->vhr_idx].iov_base = buf + front_offset;
156 	region->vhr_iov[region->vhr_idx].iov_len = front_len;
157 	region->vhr_idx++;
158 	gpa += front_len;
159 	len -= front_len;
160 	if (region->vhr_head == NULL) {
161 		region->vhr_head = vmp;
162 		region->vhr_tail = vmp;
163 	} else {
164 		vmm_drv_page_chain(region->vhr_tail, vmp);
165 		region->vhr_tail = vmp;
166 	}
167 
168 	for (uint_t i = 1; i < pages; i++) {
169 		ASSERT3U(gpa & PAGEOFFSET, ==, 0);
170 
171 		vmp = vq_page_hold(ring, gpa, writable);
172 		if (vmp == NULL) {
173 			return (EFAULT);
174 		}
175 		buf = (caddr_t)vmm_drv_page_readable(vmp);
176 
177 		const uint32_t chunk_len = MIN(len, PAGESIZE);
178 		region->vhr_iov[region->vhr_idx].iov_base = buf;
179 		region->vhr_iov[region->vhr_idx].iov_len = chunk_len;
180 		region->vhr_idx++;
181 		gpa += chunk_len;
182 		len -= chunk_len;
183 		vmm_drv_page_chain(region->vhr_tail, vmp);
184 		region->vhr_tail = vmp;
185 	}
186 
187 	return (0);
188 }
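
/*
 * A minimal sketch (not used by the driver) of the calling convention for
 * vq_region_hold(), with hypothetical values for the iovec count and guest
 * region; compare with the real usage in vq_popchain() below:
 *
 *	struct iovec iov[8];
 *	vq_held_region_t region = {
 *		.vhr_niov = 8,
 *		.vhr_iov = iov,
 *	};
 *	int err = vq_region_hold(ring, gpa, len, false, &region);
 *	...
 *	if (region.vhr_head != NULL)
 *		vmm_drv_page_release_chain(region.vhr_head);
 *
 * The held-page chain must be released by the caller whether or not the hold
 * succeeded.
 */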
189 
190 static boolean_t
191 viona_ring_lease_expire_cb(void *arg)
192 {
193 	viona_vring_t *ring = arg;
194 
195 	mutex_enter(&ring->vr_lock);
196 	cv_broadcast(&ring->vr_cv);
197 	mutex_exit(&ring->vr_lock);
198 
199 	/* The lease will be broken asynchronously. */
200 	return (B_FALSE);
201 }
202 
203 static void
204 viona_ring_lease_drop(viona_vring_t *ring)
205 {
206 	ASSERT(MUTEX_HELD(&ring->vr_lock));
207 
208 	if (ring->vr_lease != NULL) {
209 		vmm_hold_t *hold = ring->vr_link->l_vm_hold;
210 
211 		ASSERT(hold != NULL);
212 
213 		/*
214 		 * Without an active lease, the ring mappings cannot be
215 		 * considered valid.
216 		 */
217 		viona_ring_unmap(ring);
218 
219 		vmm_drv_lease_break(hold, ring->vr_lease);
220 		ring->vr_lease = NULL;
221 	}
222 }
223 
224 boolean_t
225 viona_ring_lease_renew(viona_vring_t *ring)
226 {
227 	vmm_hold_t *hold = ring->vr_link->l_vm_hold;
228 
229 	ASSERT(hold != NULL);
230 	ASSERT(MUTEX_HELD(&ring->vr_lock));
231 
232 	viona_ring_lease_drop(ring);
233 
234 	/*
235 	 * Lease renewal will fail if the VM has requested that all holds be
236 	 * cleaned up.
237 	 */
238 	ring->vr_lease = vmm_drv_lease_sign(hold, viona_ring_lease_expire_cb,
239 	    ring);
240 	if (ring->vr_lease != NULL) {
241 		/* A ring undergoing renewal will need valid guest mappings */
242 		if (ring->vr_pa != 0 && ring->vr_size != 0) {
243 			/*
244 			 * If new mappings cannot be established, consider the
245 			 * lease renewal a failure.
246 			 */
247 			if (!viona_ring_map(ring, ring->vr_state == VRS_INIT)) {
248 				viona_ring_lease_drop(ring);
249 				return (B_FALSE);
250 			}
251 		}
252 	}
253 	return (ring->vr_lease != NULL);
254 }
255 
256 void
257 viona_ring_alloc(viona_link_t *link, viona_vring_t *ring)
258 {
259 	ring->vr_link = link;
260 	mutex_init(&ring->vr_lock, NULL, MUTEX_DRIVER, NULL);
261 	cv_init(&ring->vr_cv, NULL, CV_DRIVER, NULL);
262 	mutex_init(&ring->vr_a_mutex, NULL, MUTEX_DRIVER, NULL);
263 	mutex_init(&ring->vr_u_mutex, NULL, MUTEX_DRIVER, NULL);
264 }
265 
266 static void
267 viona_ring_misc_free(viona_vring_t *ring)
268 {
269 	const uint_t qsz = ring->vr_size;
270 
271 	viona_tx_ring_free(ring, qsz);
272 }
273 
274 void
275 viona_ring_free(viona_vring_t *ring)
276 {
277 	mutex_destroy(&ring->vr_lock);
278 	cv_destroy(&ring->vr_cv);
279 	mutex_destroy(&ring->vr_a_mutex);
280 	mutex_destroy(&ring->vr_u_mutex);
281 	ring->vr_link = NULL;
282 }
283 
284 int
285 viona_ring_init(viona_link_t *link, uint16_t idx,
286     const struct viona_ring_params *params)
287 {
288 	viona_vring_t *ring;
289 	kthread_t *t;
290 	int err = 0;
291 	const uint16_t qsz = params->vrp_size;
292 	const uint64_t pa = params->vrp_pa;
293 
294 	if (idx >= VIONA_VQ_MAX) {
295 		return (EINVAL);
296 	}
297 
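	/* Ring size must be a non-zero power of two, at most VRING_MAX_LEN */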
298 	if (qsz == 0 || qsz > VRING_MAX_LEN || (1 << (ffs(qsz) - 1)) != qsz) {
299 		return (EINVAL);
300 	}
301 	if ((pa & (LEGACY_VQ_ALIGN - 1)) != 0) {
302 		return (EINVAL);
303 	}
304 
305 	ring = &link->l_vrings[idx];
306 	mutex_enter(&ring->vr_lock);
307 	if (ring->vr_state != VRS_RESET) {
308 		mutex_exit(&ring->vr_lock);
309 		return (EBUSY);
310 	}
311 	VERIFY(ring->vr_state_flags == 0);
312 
313 	ring->vr_lease = NULL;
314 	if (!viona_ring_lease_renew(ring)) {
315 		err = EBUSY;
316 		goto fail;
317 	}
318 
319 	ring->vr_size = qsz;
320 	ring->vr_mask = (ring->vr_size - 1);
321 	ring->vr_pa = pa;
322 	if (!viona_ring_map(ring, true)) {
323 		err = EINVAL;
324 		goto fail;
325 	}
326 
327 	/* Initialize queue indexes */
328 	ring->vr_cur_aidx = params->vrp_avail_idx;
329 	ring->vr_cur_uidx = params->vrp_used_idx;
330 
331 	if (idx == VIONA_VQ_TX) {
332 		viona_tx_ring_alloc(ring, qsz);
333 	}
334 
335 	/* Zero out MSI-X configuration */
336 	ring->vr_msi_addr = 0;
337 	ring->vr_msi_msg = 0;
338 
339 	/* Clear the stats */
340 	bzero(&ring->vr_stats, sizeof (ring->vr_stats));
341 	bzero(&ring->vr_err_stats, sizeof (ring->vr_err_stats));
342 
343 	t = viona_create_worker(ring);
344 	if (t == NULL) {
345 		err = ENOMEM;
346 		goto fail;
347 	}
348 	ring->vr_worker_thread = t;
349 	ring->vr_state = VRS_SETUP;
350 	cv_broadcast(&ring->vr_cv);
351 	mutex_exit(&ring->vr_lock);
352 	return (0);
353 
354 fail:
355 	viona_ring_lease_drop(ring);
356 	viona_ring_misc_free(ring);
357 	ring->vr_size = 0;
358 	ring->vr_mask = 0;
359 	ring->vr_pa = 0;
360 	ring->vr_cur_aidx = 0;
361 	ring->vr_cur_uidx = 0;
362 	mutex_exit(&ring->vr_lock);
363 	return (err);
364 }
365 
366 int
367 viona_ring_get_state(viona_link_t *link, uint16_t idx,
368     struct viona_ring_params *params)
369 {
370 	viona_vring_t *ring;
371 
372 	if (idx >= VIONA_VQ_MAX) {
373 		return (EINVAL);
374 	}
375 
376 	ring = &link->l_vrings[idx];
377 	mutex_enter(&ring->vr_lock);
378 
379 	params->vrp_size = ring->vr_size;
380 	params->vrp_pa = ring->vr_pa;
381 
382 	if (ring->vr_state == VRS_RUN) {
383 		/* On a running ring, we must heed the avail/used locks */
384 		mutex_enter(&ring->vr_a_mutex);
385 		params->vrp_avail_idx = ring->vr_cur_aidx;
386 		mutex_exit(&ring->vr_a_mutex);
387 		mutex_enter(&ring->vr_u_mutex);
388 		params->vrp_used_idx = ring->vr_cur_uidx;
389 		mutex_exit(&ring->vr_u_mutex);
390 	} else {
391 		/* Otherwise vr_lock is adequate protection */
392 		params->vrp_avail_idx = ring->vr_cur_aidx;
393 		params->vrp_used_idx = ring->vr_cur_uidx;
394 	}
395 
396 	mutex_exit(&ring->vr_lock);
397 
398 	return (0);
399 }
400 
401 int
402 viona_ring_reset(viona_vring_t *ring, boolean_t heed_signals)
403 {
404 	mutex_enter(&ring->vr_lock);
405 	if (ring->vr_state == VRS_RESET) {
406 		mutex_exit(&ring->vr_lock);
407 		return (0);
408 	}
409 
410 	if ((ring->vr_state_flags & VRSF_REQ_STOP) == 0) {
411 		ring->vr_state_flags |= VRSF_REQ_STOP;
412 		cv_broadcast(&ring->vr_cv);
413 	}
414 	while (ring->vr_state != VRS_RESET) {
415 		if (!heed_signals) {
416 			cv_wait(&ring->vr_cv, &ring->vr_lock);
417 		} else {
418 			int rs;
419 
420 			rs = cv_wait_sig(&ring->vr_cv, &ring->vr_lock);
421 			if (rs <= 0 && ring->vr_state != VRS_RESET) {
422 				mutex_exit(&ring->vr_lock);
423 				return (EINTR);
424 			}
425 		}
426 	}
427 	mutex_exit(&ring->vr_lock);
428 	return (0);
429 }
430 
431 static bool
432 viona_ring_map(viona_vring_t *ring, bool defer_dirty)
433 {
434 	const uint16_t qsz = ring->vr_size;
435 	uintptr_t pa = ring->vr_pa;
436 
437 	ASSERT3U(qsz, !=, 0);
438 	ASSERT3U(qsz, <=, VRING_MAX_LEN);
439 	ASSERT3U(pa, !=, 0);
440 	ASSERT3U(pa & (LEGACY_VQ_ALIGN - 1), ==, 0);
441 	ASSERT3U(LEGACY_VQ_ALIGN, ==, PAGESIZE);
442 	ASSERT(MUTEX_HELD(&ring->vr_lock));
443 	ASSERT3P(ring->vr_map_pages, ==, NULL);
444 
445 	const uint_t npages = LEGACY_VQ_PAGES(qsz);
446 	ring->vr_map_pages = kmem_zalloc(npages * sizeof (void *), KM_SLEEP);
447 
448 	int page_flags = 0;
449 	if (defer_dirty) {
450 		/*
451 		 * During initialization, and when entering the paused state,
452 		 * the page holds for a virtqueue are established with the
453 		 * DEFER_DIRTY flag set.
454 		 *
455 		 * This prevents those page holds from immediately marking the
456 		 * underlying pages as dirty, since the viona emulation is not
457 		 * yet performing any accesses.  Once the ring transitions to
458 		 * the VRS_RUN state, the held pages will be marked as dirty.
459 		 *
460 		 * Any ring mappings performed outside those state conditions,
461 		 * such as those performed as part of vmm_lease renewal during
462 		 * steady-state operation, will map the ring pages normally
463 		 * (considered immediately dirty).
464 		 */
465 		page_flags |= VMPF_DEFER_DIRTY;
466 	}
467 
468 	vmm_page_t *prev = NULL;
469 	for (uint_t i = 0; i < npages; i++, pa += PAGESIZE) {
470 		vmm_page_t *vmp;
471 
472 		vmp = vmm_drv_page_hold_ext(ring->vr_lease, pa,
473 		    PROT_READ | PROT_WRITE, page_flags);
474 		if (vmp == NULL) {
475 			viona_ring_unmap(ring);
476 			return (false);
477 		}
478 
479 		/*
480 		 * Keep the first page as the head of the chain, appending all
481 		 * subsequent pages to the tail.
482 		 */
483 		if (prev == NULL) {
484 			ring->vr_map_hold = vmp;
485 		} else {
486 			vmm_drv_page_chain(prev, vmp);
487 		}
488 		prev = vmp;
489 		ring->vr_map_pages[i] = vmm_drv_page_writable(vmp);
490 	}
491 
492 	return (true);
493 }
494 
495 static void
496 viona_ring_mark_dirty(viona_vring_t *ring)
497 {
498 	ASSERT(MUTEX_HELD(&ring->vr_lock));
499 	ASSERT(ring->vr_map_hold != NULL);
500 
501 	for (vmm_page_t *vp = ring->vr_map_hold; vp != NULL;
502 	    vp = vmm_drv_page_next(vp)) {
503 		vmm_drv_page_mark_dirty(vp);
504 	}
505 }
506 
507 static void
508 viona_ring_unmap(viona_vring_t *ring)
509 {
510 	ASSERT(MUTEX_HELD(&ring->vr_lock));
511 
512 	void **map = ring->vr_map_pages;
513 	if (map != NULL) {
514 		const uint_t npages = LEGACY_VQ_PAGES(ring->vr_size);
515 		kmem_free(map, npages * sizeof (void *));
516 		ring->vr_map_pages = NULL;
517 
518 		vmm_drv_page_release_chain(ring->vr_map_hold);
519 		ring->vr_map_hold = NULL;
520 	} else {
521 		ASSERT3P(ring->vr_map_hold, ==, NULL);
522 	}
523 }
524 
525 static inline void *
526 viona_ring_addr(viona_vring_t *ring, uint_t off)
527 {
528 	ASSERT3P(ring->vr_map_pages, !=, NULL);
529 	ASSERT3U(LEGACY_VQ_SIZE(ring->vr_size), >, off);
530 
531 	const uint_t page_num = off / PAGESIZE;
532 	const uint_t page_off = off % PAGESIZE;
533 	return ((caddr_t)ring->vr_map_pages[page_num] + page_off);
534 }
535 
536 void
537 viona_intr_ring(viona_vring_t *ring, boolean_t skip_flags_check)
538 {
539 	if (!skip_flags_check) {
540 		volatile uint16_t *avail_flags = viona_ring_addr(ring,
541 		    LEGACY_AVAIL_FLAGS_OFF(ring->vr_size));
542 
543 		if ((*avail_flags & VRING_AVAIL_F_NO_INTERRUPT) != 0) {
544 			return;
545 		}
546 	}
547 
548 	mutex_enter(&ring->vr_lock);
549 	uint64_t addr = ring->vr_msi_addr;
550 	uint64_t msg = ring->vr_msi_msg;
551 	mutex_exit(&ring->vr_lock);
552 	if (addr != 0) {
553 		/* Deliver the interrupt directly, if so configured... */
554 		(void) vmm_drv_msi(ring->vr_lease, addr, msg);
555 	} else {
556 		/* ... otherwise, leave it to userspace */
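		/*
		 * The CAS limits pollwakeup() to the 0 -> 1 transition of
		 * vr_intr_enabled, so repeated ring interrupts do not issue
		 * redundant wakeups while the flag remains set.
		 */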
557 		if (atomic_cas_uint(&ring->vr_intr_enabled, 0, 1) == 0) {
558 			pollwakeup(&ring->vr_link->l_pollhead, POLLRDBAND);
559 		}
560 	}
561 }
562 
563 static inline bool
564 vring_stop_req(const viona_vring_t *ring)
565 {
566 	return ((ring->vr_state_flags & VRSF_REQ_STOP) != 0);
567 }
568 
569 static inline bool
570 vring_pause_req(const viona_vring_t *ring)
571 {
572 	return ((ring->vr_state_flags & VRSF_REQ_PAUSE) != 0);
573 }
574 
575 static inline bool
576 vring_start_req(const viona_vring_t *ring)
577 {
578 	return ((ring->vr_state_flags & VRSF_REQ_START) != 0);
579 }
580 
581 /*
582  * Check whether the vring worker thread should bail out.  This heeds
583  * indications that the containing process is exiting, as well as requests to
584  * stop or pause the ring.  The `stop_only` parameter controls whether pause
585  * requests are ignored (true) or checked (false).
586  *
587  * Caller should hold vr_lock.
588  */
589 static bool
590 vring_need_bail_ext(const viona_vring_t *ring, bool stop_only)
591 {
592 	ASSERT(MUTEX_HELD(&ring->vr_lock));
593 
594 	if (vring_stop_req(ring) ||
595 	    (!stop_only && vring_pause_req(ring))) {
596 		return (true);
597 	}
598 
599 	kthread_t *t = ring->vr_worker_thread;
600 	if (t != NULL) {
601 		proc_t *p = ttoproc(t);
602 
603 		ASSERT(p != NULL);
604 		if ((p->p_flag & SEXITING) != 0) {
605 			return (true);
606 		}
607 	}
608 	return (false);
609 }
610 
611 bool
612 vring_need_bail(const viona_vring_t *ring)
613 {
614 	return (vring_need_bail_ext(ring, false));
615 }
616 
617 int
618 viona_ring_pause(viona_vring_t *ring)
619 {
620 	mutex_enter(&ring->vr_lock);
621 	switch (ring->vr_state) {
622 	case VRS_RESET:
623 	case VRS_SETUP:
624 	case VRS_INIT:
625 		/*
626 		 * For rings which have not yet started (even those in the
627 		 * VRS_SETUP and VRS_INIT phases, where a worker thread is
628 		 * already running and waiting to be released), it is
629 		 * adequate to simply clear any start request, keeping them
630 		 * from proceeding into the actual work processing function.
631 		 */
632 		ring->vr_state_flags &= ~VRSF_REQ_START;
633 		mutex_exit(&ring->vr_lock);
634 		return (0);
635 
636 	case VRS_STOP:
637 		if ((ring->vr_state_flags & VRSF_REQ_STOP) != 0) {
638 			/* A ring on its way to RESET cannot be paused. */
639 			mutex_exit(&ring->vr_lock);
640 			return (EBUSY);
641 		}
642 		/* FALLTHROUGH */
643 	case VRS_RUN:
644 		ring->vr_state_flags |= VRSF_REQ_PAUSE;
645 		cv_broadcast(&ring->vr_cv);
646 		break;
647 
648 	default:
649 		panic("invalid ring state %d", ring->vr_state);
650 		break;
651 	}
652 
653 	for (;;) {
654 		int res = cv_wait_sig(&ring->vr_cv, &ring->vr_lock);
655 
656 		if (ring->vr_state == VRS_INIT ||
657 		    (ring->vr_state_flags & VRSF_REQ_PAUSE) == 0) {
658 			/* Ring made it to (or through) paused state */
659 			mutex_exit(&ring->vr_lock);
660 			return (0);
661 		}
662 		if (res == 0) {
663 			/* interrupted by signal */
664 			mutex_exit(&ring->vr_lock);
665 			return (EINTR);
666 		}
667 	}
668 	/* NOTREACHED */
669 }
670 
671 static void
672 viona_worker(void *arg)
673 {
674 	viona_vring_t *ring = (viona_vring_t *)arg;
675 	viona_link_t *link = ring->vr_link;
676 
677 	mutex_enter(&ring->vr_lock);
678 	VERIFY3U(ring->vr_state, ==, VRS_SETUP);
679 
680 	/* Bail immediately if ring shutdown or process exit was requested */
681 	if (vring_need_bail_ext(ring, true)) {
682 		goto ring_reset;
683 	}
684 
685 	/* Report worker thread as alive and notify creator */
686 ring_init:
687 	ring->vr_state = VRS_INIT;
688 	cv_broadcast(&ring->vr_cv);
689 
690 	while (!vring_start_req(ring)) {
691 		/*
692 		 * Keeping lease renewals timely while waiting for the ring to
693 		 * be started is important for avoiding deadlocks.
694 		 */
695 		if (vmm_drv_lease_expired(ring->vr_lease)) {
696 			if (!viona_ring_lease_renew(ring)) {
697 				goto ring_reset;
698 			}
699 		}
700 
701 		(void) cv_wait_sig(&ring->vr_cv, &ring->vr_lock);
702 
703 		if (vring_pause_req(ring)) {
704 			/* We are already paused in the INIT state. */
705 			ring->vr_state_flags &= ~VRSF_REQ_PAUSE;
706 		}
707 		if (vring_need_bail_ext(ring, true)) {
708 			goto ring_reset;
709 		}
710 	}
711 
712 	ASSERT((ring->vr_state_flags & VRSF_REQ_START) != 0);
713 	ring->vr_state = VRS_RUN;
714 	ring->vr_state_flags &= ~VRSF_REQ_START;
715 	viona_ring_mark_dirty(ring);
716 
717 	/* Ensure ring lease is valid first */
718 	if (vmm_drv_lease_expired(ring->vr_lease)) {
719 		if (!viona_ring_lease_renew(ring)) {
720 			goto ring_reset;
721 		}
722 	}
723 
724 	/* Process actual work */
725 	if (ring == &link->l_vrings[VIONA_VQ_RX]) {
726 		viona_worker_rx(ring, link);
727 	} else if (ring == &link->l_vrings[VIONA_VQ_TX]) {
728 		viona_worker_tx(ring, link);
729 	} else {
730 		panic("unexpected ring: %p", (void *)ring);
731 	}
732 
733 	VERIFY3U(ring->vr_state, ==, VRS_STOP);
734 	VERIFY3U(ring->vr_xfer_outstanding, ==, 0);
735 
736 	/*
737 	 * Consolidate stats data so that it is not lost if/when this ring is
738 	 * being stopped.
739 	 */
740 	viona_ring_consolidate_stats(ring);
741 
742 	/* Respond to a pause request if the ring is not required to stop */
743 	if (vring_pause_req(ring)) {
744 		ring->vr_state_flags &= ~VRSF_REQ_PAUSE;
745 
746 		if (vring_need_bail_ext(ring, true)) {
747 			goto ring_reset;
748 		}
749 
750 		/*
751 		 * To complete pausing of the ring, unmap and re-map the pages
752 		 * underpinning the virtqueue.  This is to synchronize their
753 		 * dirty state in the backing page tables and restore the
754 		 * defer-dirty state on the held pages.
755 		 */
756 		viona_ring_unmap(ring);
757 		if (viona_ring_map(ring, true)) {
758 			goto ring_init;
759 		}
760 
761 		/*
762 		 * If the ring pages failed to be mapped, fall through to
763 		 * ring-reset like any other failure.
764 		 */
765 	}
766 
767 ring_reset:
768 	viona_ring_misc_free(ring);
769 
770 	viona_ring_lease_drop(ring);
771 	ring->vr_cur_aidx = 0;
772 	ring->vr_size = 0;
773 	ring->vr_mask = 0;
774 	ring->vr_pa = 0;
775 	ring->vr_state = VRS_RESET;
776 	ring->vr_state_flags = 0;
777 	ring->vr_worker_thread = NULL;
778 	cv_broadcast(&ring->vr_cv);
779 	mutex_exit(&ring->vr_lock);
780 
781 	mutex_enter(&ttoproc(curthread)->p_lock);
782 	lwp_exit();
783 }
784 
785 static kthread_t *
786 viona_create_worker(viona_vring_t *ring)
787 {
788 	k_sigset_t hold_set;
789 	proc_t *p = curproc;
790 	kthread_t *t;
791 	klwp_t *lwp;
792 
793 	ASSERT(MUTEX_HELD(&ring->vr_lock));
794 	ASSERT(ring->vr_state == VRS_RESET);
795 
796 	sigfillset(&hold_set);
797 	lwp = lwp_create(viona_worker, (void *)ring, 0, p, TS_STOPPED,
798 	    minclsyspri - 1, &hold_set, curthread->t_cid, 0);
799 	if (lwp == NULL) {
800 		return (NULL);
801 	}
802 
803 	t = lwptot(lwp);
804 	mutex_enter(&p->p_lock);
805 	t->t_proc_flag = (t->t_proc_flag & ~TP_HOLDLWP) | TP_KTHREAD;
806 	lwp_create_done(t);
807 	mutex_exit(&p->p_lock);
808 
809 	return (t);
810 }
811 
812 void
813 vq_read_desc(viona_vring_t *ring, uint16_t idx, struct virtio_desc *descp)
814 {
815 	const uint_t entry_off = idx * sizeof (struct virtio_desc);
816 
817 	ASSERT3U(idx, <, ring->vr_size);
818 
819 	bcopy(viona_ring_addr(ring, entry_off), descp, sizeof (*descp));
820 }
821 
822 static uint16_t
823 vq_read_avail(viona_vring_t *ring, uint16_t idx)
824 {
825 	ASSERT3U(idx, <, ring->vr_size);
826 
827 	volatile uint16_t *avail_ent =
828 	    viona_ring_addr(ring, LEGACY_AVAIL_ENT_OFF(ring->vr_size, idx));
829 	return (*avail_ent);
830 }
831 
832 /*
833  * Given a buffer descriptor `desc`, attempt to map the pages backing that
834  * region of guest physical memory, taking into account that there are no
835  * guarantees about guest-contiguous pages being host-contiguous.
836  */
837 static int
838 vq_map_desc_bufs(viona_vring_t *ring, const struct virtio_desc *desc,
839     vq_held_region_t *region)
840 {
841 	int err;
842 
843 	if (desc->vd_len == 0) {
844 		VIONA_PROBE2(desc_bad_len, viona_vring_t *, ring,
845 		    uint32_t, desc->vd_len);
846 		VIONA_RING_STAT_INCR(ring, desc_bad_len);
847 		return (EINVAL);
848 	}
849 
850 	err = vq_region_hold(ring, desc->vd_addr, desc->vd_len,
851 	    (desc->vd_flags & VRING_DESC_F_WRITE) != 0, region);
852 	switch (err) {
853 	case E2BIG:
854 		VIONA_PROBE1(too_many_desc, viona_vring_t *, ring);
855 		VIONA_RING_STAT_INCR(ring, too_many_desc);
856 		break;
857 	case EFAULT:
858 		VIONA_PROBE_BAD_RING_ADDR(ring, desc->vd_addr);
859 		VIONA_RING_STAT_INCR(ring, bad_ring_addr);
860 		break;
861 	default:
862 		break;
863 	}
864 
865 	return (err);
866 }
867 
868 /*
869  * Walk an indirect buffer descriptor `desc`, attempting to map the pages
870  * backing the regions of guest memory covered by its constituent descriptors.
871  */
872 static int
873 vq_map_indir_desc_bufs(viona_vring_t *ring, const struct virtio_desc *desc,
874     vq_held_region_t *region)
875 {
876 	const uint16_t indir_count = desc->vd_len / sizeof (struct virtio_desc);
877 
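	/*
	 * Reject indirect tables whose length is not a multiple of the
	 * descriptor size, which are empty or hold more descriptors than the
	 * ring itself, or whose [addr, addr + len) range wraps around the
	 * 64-bit guest-physical address space.
	 */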
878 	if ((desc->vd_len & 0xf) != 0 || indir_count == 0 ||
879 	    indir_count > ring->vr_size ||
880 	    desc->vd_addr > (desc->vd_addr + desc->vd_len)) {
881 		VIONA_PROBE2(indir_bad_len, viona_vring_t *, ring,
882 		    uint32_t, desc->vd_len);
883 		VIONA_RING_STAT_INCR(ring, indir_bad_len);
884 		return (EINVAL);
885 	}
886 
887 	uint16_t indir_next = 0;
888 	const uint8_t *buf = NULL;
889 	uint64_t buf_gpa = UINT64_MAX;
890 	vmm_page_t *vmp = NULL;
891 	int err = 0;
892 
893 	for (;;) {
894 		uint64_t indir_gpa =
895 		    desc->vd_addr + (indir_next * sizeof (struct virtio_desc));
896 		uint64_t indir_page = indir_gpa & PAGEMASK;
897 		struct virtio_desc vp;
898 
899 		/*
900 		 * Get a mapping for the page that the next indirect descriptor
901 		 * resides in, if it has not already been done.
902 		 */
903 		if (indir_page != buf_gpa) {
904 			if (vmp != NULL) {
905 				vmm_drv_page_release(vmp);
906 			}
907 			vmp = vq_page_hold(ring, indir_page, false);
908 			if (vmp == NULL) {
909 				VIONA_PROBE_BAD_RING_ADDR(ring, indir_page);
910 				VIONA_RING_STAT_INCR(ring, bad_ring_addr);
911 				err = EFAULT;
912 				break;
913 			}
914 			buf_gpa = indir_page;
915 			buf = vmm_drv_page_readable(vmp);
916 		}
917 
918 		/*
919 		 * A copy of the indirect descriptor is made here, rather than
920 		 * simply using a reference pointer.  This prevents malicious or
921 		 * erroneous guest writes to the descriptor from fooling the
922 		 * flags/bounds verification through a race.
923 		 */
924 		bcopy(buf + (indir_gpa - indir_page), &vp, sizeof (vp));
925 
926 		if (vp.vd_flags & VRING_DESC_F_INDIRECT) {
927 			VIONA_PROBE1(indir_bad_nest, viona_vring_t *, ring);
928 			VIONA_RING_STAT_INCR(ring, indir_bad_nest);
929 			err = EINVAL;
930 			break;
931 		} else if (vp.vd_len == 0) {
932 			VIONA_PROBE2(desc_bad_len, viona_vring_t *, ring,
933 			    uint32_t, vp.vd_len);
934 			VIONA_RING_STAT_INCR(ring, desc_bad_len);
935 			err = EINVAL;
936 			break;
937 		}
938 
939 		err = vq_map_desc_bufs(ring, &vp, region);
940 		if (err != 0) {
941 			break;
942 		}
943 
944 		/* Successfully reach the end of the indir chain */
945 		/* Successfully reached the end of the indir chain */
946 			break;
947 		}
948 		if (region->vhr_idx >= region->vhr_niov) {
949 			VIONA_PROBE1(too_many_desc, viona_vring_t *, ring);
950 			VIONA_RING_STAT_INCR(ring, too_many_desc);
951 			err = E2BIG;
952 			break;
953 		}
954 
955 		indir_next = vp.vd_next;
956 		if (indir_next >= indir_count) {
957 			VIONA_PROBE3(indir_bad_next, viona_vring_t *, ring,
958 			    uint16_t, indir_next, uint16_t, indir_count);
959 			VIONA_RING_STAT_INCR(ring, indir_bad_next);
960 			err = EINVAL;
961 			break;
962 		}
963 	}
964 
965 	if (vmp != NULL) {
966 		vmm_drv_page_release(vmp);
967 	}
968 	return (err);
969 }
970 
971 int
972 vq_popchain(viona_vring_t *ring, struct iovec *iov, uint_t niov,
973     uint16_t *cookie, vmm_page_t **chain)
974 {
975 	uint16_t ndesc, idx, head, next;
976 	struct virtio_desc vdir;
977 	vq_held_region_t region = {
978 		.vhr_niov = niov,
979 		.vhr_iov = iov,
980 	};
981 
982 	ASSERT(iov != NULL);
983 	ASSERT(niov > 0 && niov < INT_MAX);
984 	ASSERT(*chain == NULL);
985 
986 	mutex_enter(&ring->vr_a_mutex);
987 	idx = ring->vr_cur_aidx;
988 	ndesc = viona_ring_num_avail(ring);
989 
990 	if (ndesc == 0) {
991 		mutex_exit(&ring->vr_a_mutex);
992 		return (0);
993 	}
994 	if (ndesc > ring->vr_size) {
995 		/*
996 		 * Even though the guest has provided an 'avail_idx' which
997 		 * indicates that an impossible number of descriptors are
998 		 * available, continue on and attempt to process the next one.
999 		 *
1000 		 * The transgression will not escape the probe or stats though.
1001 		 */
1002 		VIONA_PROBE2(ndesc_too_high, viona_vring_t *, ring,
1003 		    uint16_t, ndesc);
1004 		VIONA_RING_STAT_INCR(ring, ndesc_too_high);
1005 	}
1006 
1007 	head = vq_read_avail(ring, idx & ring->vr_mask);
1008 	next = head;
1009 
1010 	for (region.vhr_idx = 0; region.vhr_idx < niov; next = vdir.vd_next) {
1011 		if (next >= ring->vr_size) {
1012 			VIONA_PROBE2(bad_idx, viona_vring_t *, ring,
1013 			    uint16_t, next);
1014 			VIONA_RING_STAT_INCR(ring, bad_idx);
1015 			break;
1016 		}
1017 
1018 		vq_read_desc(ring, next, &vdir);
1019 		if ((vdir.vd_flags & VRING_DESC_F_INDIRECT) == 0) {
1020 			if (vq_map_desc_bufs(ring, &vdir, &region) != 0) {
1021 				break;
1022 			}
1023 		} else {
1024 			/*
1025 			 * Per the specification (Virtio 1.1 S2.6.5.3.1):
1026 			 *   A driver MUST NOT set both VIRTQ_DESC_F_INDIRECT
1027 			 *   and VIRTQ_DESC_F_NEXT in `flags`.
1028 			 */
1029 			if ((vdir.vd_flags & VRING_DESC_F_NEXT) != 0) {
1030 				VIONA_PROBE3(indir_bad_next,
1031 				    viona_vring_t *, ring,
1032 				    uint16_t, next, uint16_t, 0);
1033 				VIONA_RING_STAT_INCR(ring, indir_bad_next);
1034 				break;
1035 			}
1036 
1037 			if (vq_map_indir_desc_bufs(ring, &vdir, &region) != 0) {
1038 				break;
1039 			}
1040 		}
1041 
1042 		if ((vdir.vd_flags & VRING_DESC_F_NEXT) == 0) {
1043 			ring->vr_cur_aidx++;
1044 			mutex_exit(&ring->vr_a_mutex);
1045 
1046 			*cookie = head;
1047 			*chain = region.vhr_head;
1048 			return (region.vhr_idx);
1049 		}
1050 	}
1051 
1052 	mutex_exit(&ring->vr_a_mutex);
1053 	if (region.vhr_head != NULL) {
1054 		/*
1055 		 * If any pages were held prior to encountering an error, we
1056 		 * must release them now.
1057 		 */
1058 		vmm_drv_page_release_chain(region.vhr_head);
1059 	}
1060 	return (-1);
1061 }
1062 
1063 
1064 static void
1065 vq_write_used_ent(viona_vring_t *ring, uint16_t idx, uint16_t cookie,
1066     uint32_t len)
1067 {
1068 	/*
1069 	 * In a larger ring, an entry can be split across pages, so be sure to
1070 	 * account for that when configuring the transfer by looking up the ID
1071 	 * and length addresses separately, rather than an address for a
1072 	 * combined `struct virtio_used`.
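	 *
	 * For example (hypothetical sizing): in a 1024-entry ring, the used
	 * element at index 511 occupies bytes 4092-4099 relative to the
	 * page-aligned start of the used ring, so its ID and length fall on
	 * different pages.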
1073 	 */
1074 	const uint_t used_id_off = LEGACY_USED_ENT_OFF(ring->vr_size, idx);
1075 	const uint_t used_len_off = used_id_off + sizeof (uint32_t);
1076 	volatile uint32_t *idp = viona_ring_addr(ring, used_id_off);
1077 	volatile uint32_t *lenp = viona_ring_addr(ring, used_len_off);
1078 
1079 	ASSERT(MUTEX_HELD(&ring->vr_u_mutex));
1080 
1081 	*idp = cookie;
1082 	*lenp = len;
1083 }
1084 
1085 static void
1086 vq_write_used_idx(viona_vring_t *ring, uint16_t idx)
1087 {
1088 	ASSERT(MUTEX_HELD(&ring->vr_u_mutex));
1089 
1090 	volatile uint16_t *used_idx =
1091 	    viona_ring_addr(ring, LEGACY_USED_IDX_OFF(ring->vr_size));
1092 	*used_idx = idx;
1093 }
1094 
1095 void
1096 vq_pushchain(viona_vring_t *ring, uint32_t len, uint16_t cookie)
1097 {
1098 	uint16_t uidx;
1099 
1100 	mutex_enter(&ring->vr_u_mutex);
1101 
1102 	uidx = ring->vr_cur_uidx;
1103 	vq_write_used_ent(ring, uidx & ring->vr_mask, cookie, len);
1104 	uidx++;
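	/*
	 * Make the used entry globally visible before the guest can observe
	 * the updated used index.
	 */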
1105 	membar_producer();
1106 
1107 	vq_write_used_idx(ring, uidx);
1108 	ring->vr_cur_uidx = uidx;
1109 
1110 	mutex_exit(&ring->vr_u_mutex);
1111 }
1112 
1113 void
1114 vq_pushchain_many(viona_vring_t *ring, uint_t num_bufs, used_elem_t *elem)
1115 {
1116 	uint16_t uidx;
1117 
1118 	mutex_enter(&ring->vr_u_mutex);
1119 
1120 	uidx = ring->vr_cur_uidx;
1121 
1122 	for (uint_t i = 0; i < num_bufs; i++, uidx++) {
1123 		vq_write_used_ent(ring, uidx & ring->vr_mask, elem[i].id,
1124 		    elem[i].len);
1125 	}
1126 
1127 	membar_producer();
1128 	vq_write_used_idx(ring, uidx);
1129 	ring->vr_cur_uidx = uidx;
1130 
1131 	mutex_exit(&ring->vr_u_mutex);
1132 }
1133 
1134 /*
1135  * Set USED_NO_NOTIFY on VQ so guest elides doorbell calls for new entries.
1136  */
1137 void
1138 viona_ring_disable_notify(viona_vring_t *ring)
1139 {
1140 	volatile uint16_t *used_flags =
1141 	    viona_ring_addr(ring, LEGACY_USED_FLAGS_OFF(ring->vr_size));
1142 
1143 	*used_flags |= VRING_USED_F_NO_NOTIFY;
1144 }
1145 
1146 /*
1147  * Clear USED_NO_NOTIFY on VQ so guest resumes doorbell calls for new entries.
1148  */
1149 void
1150 viona_ring_enable_notify(viona_vring_t *ring)
1151 {
1152 	volatile uint16_t *used_flags =
1153 	    viona_ring_addr(ring, LEGACY_USED_FLAGS_OFF(ring->vr_size));
1154 
1155 	*used_flags &= ~VRING_USED_F_NO_NOTIFY;
1156 }
1157 
1158 /*
1159  * Return the number of available descriptors in the vring, taking care of the
1160  * 16-bit index wraparound.
1161  *
1162  * Note: If the number of apparently available descriptors is larger than the
1163  * ring size (due to guest misbehavior), this check will still report that
1164  * (impossibly large) count of descriptors.
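 *
 * For example (hypothetical values): an avail_idx of 3 read from the ring,
 * combined with a vr_cur_aidx of 65534, yields (uint16_t)(3 - 65534) == 5,
 * i.e. five descriptors made available across the 16-bit wrap.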
1165  */
1166 uint16_t
1167 viona_ring_num_avail(viona_vring_t *ring)
1168 {
1169 	volatile uint16_t *avail_idx =
1170 	    viona_ring_addr(ring, LEGACY_AVAIL_IDX_OFF(ring->vr_size));
1171 
1172 	return (*avail_idx - ring->vr_cur_aidx);
1173 }
1174 
1175 /* Record a successfully transferred packet for the ring stats */
1176 void
1177 viona_ring_stat_accept(viona_vring_t *ring, uint32_t len)
1178 {
1179 	atomic_inc_64(&ring->vr_stats.vts_packets);
1180 	atomic_add_64(&ring->vr_stats.vts_bytes, len);
1181 }
1182 
1183 /*
1184  * Record a dropped packet in the ring stats
1185  */
1186 void
1187 viona_ring_stat_drop(viona_vring_t *ring)
1188 {
1189 	atomic_inc_64(&ring->vr_stats.vts_drops);
1190 }
1191 
1192 /*
1193  * Record a packet transfer error in the ring stats
1194  */
1195 void
1196 viona_ring_stat_error(viona_vring_t *ring)
1197 {
1198 	atomic_inc_64(&ring->vr_stats.vts_errors);
1199 }
1200 
1201 /*
1202  * Consolidate statistic data for this ring into the totals for the link
1203  */
1204 static void
1205 viona_ring_consolidate_stats(viona_vring_t *ring)
1206 {
1207 	viona_link_t *link = ring->vr_link;
1208 	struct viona_transfer_stats *lstat =
1209 	    (ring == &link->l_vrings[VIONA_VQ_RX]) ?
1210 	    &link->l_stats.vls_rx : &link->l_stats.vls_tx;
1211 
1212 	mutex_enter(&link->l_stats_lock);
1213 	lstat->vts_packets += ring->vr_stats.vts_packets;
1214 	lstat->vts_bytes += ring->vr_stats.vts_bytes;
1215 	lstat->vts_drops += ring->vr_stats.vts_drops;
1216 	lstat->vts_errors += ring->vr_stats.vts_errors;
1217 	bzero(&ring->vr_stats, sizeof (ring->vr_stats));
1218 	mutex_exit(&link->l_stats_lock);
1219 }
1220