xref: /illumos-gate/usr/src/uts/intel/io/viona/viona_ring.c (revision ee653ea2dda5dfb303c7021b63e6ca4d2f4d642f)
1 /*
2  * Copyright (c) 2013  Chris Torek <torek @ torek net>
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  *
14  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24  * SUCH DAMAGE.
25  */
26 /*
27  * This file and its contents are supplied under the terms of the
28  * Common Development and Distribution License ("CDDL"), version 1.0.
29  * You may only use this file in accordance with the terms of version
30  * 1.0 of the CDDL.
31  *
32  * A full copy of the text of the CDDL should have accompanied this
33  * source.  A copy of the CDDL is also available via the Internet at
34  * http://www.illumos.org/license/CDDL.
35  *
36  * Copyright 2015 Pluribus Networks Inc.
37  * Copyright 2019 Joyent, Inc.
38  * Copyright 2025 Oxide Computer Company
39  */
40 
41 
42 #include <sys/disp.h>
43 
44 #include "viona_impl.h"
45 
46 #define	VRING_MAX_LEN		32768
47 
48 /* Layout and sizing as defined in the spec for a legacy-style virtqueue */
49 
50 /*
51  * Because viona is not built with MACHDEP defined, PAGESIZE and friends are not
52  * constants but rather variable references.  While viona remains x86-only, we
53  * are free to hard-code this to 4k.
54  */
55 #define	VQ_PGSZ			4096UL
56 #define	VQ_PGOFF		(VQ_PGSZ - 1)
57 #define	VQ_PGMASK		~VQ_PGOFF
58 
59 #define	LEGACY_VQ_ALIGN		VQ_PGSZ
60 
61 #define	LEGACY_DESC_SZ(qsz)	((qsz) * sizeof (struct virtio_desc))
62 /*
63  * Available ring consists of flags (uint16_t), avail_idx (uint16_t), qsz avail
64  * descriptors (uint16_t each), and (optional) used_event (uint16_t).
65  */
66 #define	LEGACY_AVAIL_SZ(qsz)	(((qsz) + 3) * sizeof (uint16_t))
67 /*
68  * Used ring consists of flags (uint16_t), used_idx (uint16_t), qsz used
69  * descriptors (two uint32_t each), and (optional) avail_event (uint16_t).
70  */
71 #define	LEGACY_USED_SZ(qsz)	\
72 	((qsz) * sizeof (struct virtio_used) + 3 * sizeof (uint16_t))
73 
74 #define	LEGACY_AVAIL_FLAGS_OFF(qsz)	LEGACY_DESC_SZ(qsz)
75 #define	LEGACY_AVAIL_IDX_OFF(qsz)	\
76 	(LEGACY_DESC_SZ(qsz) + sizeof (uint16_t))
77 #define	LEGACY_AVAIL_ENT_OFF(qsz, idx)	\
78 	(LEGACY_DESC_SZ(qsz) + (2 + (idx)) * sizeof (uint16_t))
79 
80 #define	LEGACY_USED_FLAGS_OFF(qsz)	\
81 	P2ROUNDUP(LEGACY_DESC_SZ(qsz) + LEGACY_AVAIL_SZ(qsz), LEGACY_VQ_ALIGN)
82 #define	LEGACY_USED_IDX_OFF(qsz)	\
83 	(LEGACY_USED_FLAGS_OFF(qsz) + sizeof (uint16_t))
84 #define	LEGACY_USED_ENT_OFF(qsz, idx)	\
85 	(LEGACY_USED_FLAGS_OFF(qsz) + 2 * sizeof (uint16_t) + \
86 	(idx) * sizeof (struct virtio_used))
87 
88 #define	LEGACY_VQ_SIZE(qsz)	\
89 	(LEGACY_USED_FLAGS_OFF(qsz) + \
90 	P2ROUNDUP(LEGACY_USED_SZ(qsz), LEGACY_VQ_ALIGN))
91 #define	LEGACY_VQ_PAGES(qsz)	(LEGACY_VQ_SIZE(qsz) / VQ_PGSZ)
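
/*
 * As a worked example of the layout arithmetic above (assuming the legacy
 * 16-byte struct virtio_desc and 8-byte struct virtio_used), a ring with
 * qsz = 256 lays out as:
 *
 *	LEGACY_DESC_SZ(256)		= 256 * 16		= 4096
 *	LEGACY_AVAIL_SZ(256)		= (256 + 3) * 2		= 518
 *	LEGACY_USED_FLAGS_OFF(256)	= P2ROUNDUP(4614, 4096)	= 8192
 *	LEGACY_USED_SZ(256)		= 256 * 8 + 6		= 2054
 *	LEGACY_VQ_SIZE(256)		= 8192 + 4096		= 12288
 *	LEGACY_VQ_PAGES(256)		= 3
 */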
92 
93 struct vq_held_region {
94 	struct iovec	*vhr_iov;
95 	vmm_page_t	*vhr_head;
96 	vmm_page_t	*vhr_tail;
97 	/* Length of iovec array supplied in `vhr_iov` */
98 	uint_t		vhr_niov;
99 	/*
100 	 * Index into vhr_iov, indicating the next "free" entry (following the
101 	 * last entry which has valid contents).
102 	 */
103 	uint_t		vhr_idx;
104 
105 	/* Total length of populated entries in `vhr_iov` */
106 	uint32_t	vhr_len;
107 };
108 typedef struct vq_held_region vq_held_region_t;
109 
110 static bool viona_ring_map(viona_vring_t *, bool);
111 static void viona_ring_unmap(viona_vring_t *);
112 static kthread_t *viona_create_worker(viona_vring_t *);
113 static void viona_ring_consolidate_stats(viona_vring_t *);
114 
115 static vmm_page_t *
116 vq_page_hold(viona_vring_t *ring, uint64_t gpa, bool writable)
117 {
118 	ASSERT3P(ring->vr_lease, !=, NULL);
119 
120 	int prot = PROT_READ;
121 	if (writable) {
122 		prot |= PROT_WRITE;
123 	}
124 
125 	return (vmm_drv_page_hold(ring->vr_lease, gpa, prot));
126 }
127 
128 /*
129  * Establish a hold on the page(s) which back the region of guest memory covered
130  * by [gpa, gpa + len).  The host-kernel-virtual pointers to those pages are
131  * stored in the iovec array supplied in `region`, along with the chain of
132  * vmm_page_t entries representing the held pages.  Since guest memory
133  * carries no guarantees of being physically contiguous (on the host), it is
134  * assumed that an iovec entry will be required for each page sized section
135  * covered by the specified `gpa` and `len` range.  For each iovec entry
136  * successfully populated by holding a page, `vhr_idx` will be incremented so it
137  * references the next available iovec entry (or `vhr_niov`, if the iovec array
138  * is full).  The responsibility for releasing the `vmm_page_t` chain (stored in
139  * `vhr_head` and `vhr_tail`) resides with the caller, regardless of the result.
140  */
141 static int
142 vq_region_hold(viona_vring_t *ring, uint64_t gpa, uint32_t len,
143     bool writable, vq_held_region_t *region)
144 {
145 	const uint32_t front_offset = gpa & VQ_PGOFF;
146 	const uint32_t front_len = MIN(len, VQ_PGSZ - front_offset);
147 	uint_t pages = 1;
148 	vmm_page_t *vmp;
149 	caddr_t buf;
150 
151 	ASSERT3U(region->vhr_idx, <, region->vhr_niov);
152 
153 	if (front_len < len) {
154 		pages += P2ROUNDUP((uint64_t)(len - front_len),
155 		    VQ_PGSZ) / VQ_PGSZ;
156 	}
157 	if (pages > (region->vhr_niov - region->vhr_idx)) {
158 		return (E2BIG);
159 	}
160 
161 	vmp = vq_page_hold(ring, gpa & VQ_PGMASK, writable);
162 	if (vmp == NULL) {
163 		return (EFAULT);
164 	}
165 	buf = (caddr_t)vmm_drv_page_readable(vmp);
166 
167 	region->vhr_iov[region->vhr_idx].iov_base = buf + front_offset;
168 	region->vhr_iov[region->vhr_idx].iov_len = front_len;
169 	region->vhr_idx++;
170 	gpa += front_len;
171 	len -= front_len;
172 	if (region->vhr_head == NULL) {
173 		region->vhr_head = vmp;
174 		region->vhr_tail = vmp;
175 	} else {
176 		vmm_drv_page_chain(region->vhr_tail, vmp);
177 		region->vhr_tail = vmp;
178 	}
179 
180 	for (uint_t i = 1; i < pages; i++) {
181 		ASSERT3U(gpa & VQ_PGOFF, ==, 0);
182 
183 		vmp = vq_page_hold(ring, gpa, writable);
184 		if (vmp == NULL) {
185 			return (EFAULT);
186 		}
187 		buf = (caddr_t)vmm_drv_page_readable(vmp);
188 
189 		const uint32_t chunk_len = MIN(len, VQ_PGSZ);
190 		region->vhr_iov[region->vhr_idx].iov_base = buf;
191 		region->vhr_iov[region->vhr_idx].iov_len = chunk_len;
192 		region->vhr_idx++;
193 		gpa += chunk_len;
194 		len -= chunk_len;
195 		vmm_drv_page_chain(region->vhr_tail, vmp);
196 		region->vhr_tail = vmp;
197 	}
198 
199 	return (0);
200 }
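
/*
 * A minimal usage sketch for vq_region_hold(), illustrating the contract
 * described above.  The caller below is hypothetical (the iovec count of 8 is
 * arbitrary); the real callers are the descriptor-walking routines later in
 * this file.
 *
 *	struct iovec iov[8];
 *	vq_held_region_t region = {
 *		.vhr_niov = 8,
 *		.vhr_iov = iov,
 *	};
 *	int err = vq_region_hold(ring, gpa, len, false, &region);
 *	// on success, iov[0 .. region.vhr_idx - 1] covers [gpa, gpa + len)
 *	if (region.vhr_head != NULL)
 *		vmm_drv_page_release_chain(region.vhr_head);
 */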
201 
202 static boolean_t
203 viona_ring_lease_expire_cb(void *arg)
204 {
205 	viona_vring_t *ring = arg;
206 
207 	mutex_enter(&ring->vr_lock);
208 	cv_broadcast(&ring->vr_cv);
209 	mutex_exit(&ring->vr_lock);
210 
211 	/* The lease will be broken asynchronously. */
212 	return (B_FALSE);
213 }
214 
215 static void
216 viona_ring_lease_drop(viona_vring_t *ring)
217 {
218 	ASSERT(MUTEX_HELD(&ring->vr_lock));
219 
220 	if (ring->vr_lease != NULL) {
221 		vmm_hold_t *hold = ring->vr_link->l_vm_hold;
222 
223 		ASSERT(hold != NULL);
224 
225 		/*
226 		 * Without an active lease, the ring mappings cannot be
227 		 * considered valid.
228 		 */
229 		viona_ring_unmap(ring);
230 
231 		vmm_drv_lease_break(hold, ring->vr_lease);
232 		ring->vr_lease = NULL;
233 	}
234 }
235 
236 boolean_t
237 viona_ring_lease_renew(viona_vring_t *ring)
238 {
239 	vmm_hold_t *hold = ring->vr_link->l_vm_hold;
240 
241 	ASSERT(hold != NULL);
242 	ASSERT(MUTEX_HELD(&ring->vr_lock));
243 
244 	viona_ring_lease_drop(ring);
245 
246 	/*
247 	 * Lease renewal will fail if the VM has requested that all holds be
248 	 * cleaned up.
249 	 */
250 	ring->vr_lease = vmm_drv_lease_sign(hold, viona_ring_lease_expire_cb,
251 	    ring);
252 	if (ring->vr_lease != NULL) {
253 		/* A ring undergoing renewal will need valid guest mappings */
254 		if (ring->vr_pa != 0 && ring->vr_size != 0) {
255 			/*
256 			 * If new mappings cannot be established, consider the
257 			 * lease renewal a failure.
258 			 */
259 			if (!viona_ring_map(ring, ring->vr_state == VRS_INIT)) {
260 				viona_ring_lease_drop(ring);
261 				return (B_FALSE);
262 			}
263 		}
264 	}
265 	return (ring->vr_lease != NULL);
266 }
267 
268 void
269 viona_ring_alloc(viona_link_t *link, viona_vring_t *ring)
270 {
271 	ring->vr_link = link;
272 	mutex_init(&ring->vr_lock, NULL, MUTEX_DRIVER, NULL);
273 	cv_init(&ring->vr_cv, NULL, CV_DRIVER, NULL);
274 	mutex_init(&ring->vr_a_mutex, NULL, MUTEX_DRIVER, NULL);
275 	mutex_init(&ring->vr_u_mutex, NULL, MUTEX_DRIVER, NULL);
276 }
277 
278 static void
279 viona_ring_misc_free(viona_vring_t *ring)
280 {
281 	const uint_t qsz = ring->vr_size;
282 
283 	viona_tx_ring_free(ring, qsz);
284 }
285 
286 void
287 viona_ring_free(viona_vring_t *ring)
288 {
289 	mutex_destroy(&ring->vr_lock);
290 	cv_destroy(&ring->vr_cv);
291 	mutex_destroy(&ring->vr_a_mutex);
292 	mutex_destroy(&ring->vr_u_mutex);
293 	ring->vr_link = NULL;
294 }
295 
296 int
297 viona_ring_init(viona_link_t *link, uint16_t idx,
298     const struct viona_ring_params *params)
299 {
300 	viona_vring_t *ring;
301 	kthread_t *t;
302 	int err = 0;
303 	const uint16_t qsz = params->vrp_size;
304 	const uint64_t pa = params->vrp_pa;
305 
306 	if (idx >= VIONA_VQ_MAX) {
307 		return (EINVAL);
308 	}
309 
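	/*
	 * The queue size must be a non-zero power of two no larger than
	 * VRING_MAX_LEN, and the ring must carry legacy virtqueue alignment;
	 * the ffs() expression below rejects any size with more than one bit
	 * set.
	 */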
310 	if (qsz == 0 || qsz > VRING_MAX_LEN || (1 << (ffs(qsz) - 1)) != qsz) {
311 		return (EINVAL);
312 	}
313 	if ((pa & (LEGACY_VQ_ALIGN - 1)) != 0) {
314 		return (EINVAL);
315 	}
316 
317 	ring = &link->l_vrings[idx];
318 	mutex_enter(&ring->vr_lock);
319 	if (ring->vr_state != VRS_RESET) {
320 		mutex_exit(&ring->vr_lock);
321 		return (EBUSY);
322 	}
323 	VERIFY(ring->vr_state_flags == 0);
324 
325 	ring->vr_lease = NULL;
326 	if (!viona_ring_lease_renew(ring)) {
327 		err = EBUSY;
328 		goto fail;
329 	}
330 
331 	ring->vr_size = qsz;
332 	ring->vr_mask = (ring->vr_size - 1);
333 	ring->vr_pa = pa;
334 	if (!viona_ring_map(ring, true)) {
335 		err = EINVAL;
336 		goto fail;
337 	}
338 
339 	/* Initialize queue indexes */
340 	ring->vr_cur_aidx = params->vrp_avail_idx;
341 	ring->vr_cur_uidx = params->vrp_used_idx;
342 
343 	if (idx == VIONA_VQ_TX) {
344 		viona_tx_ring_alloc(ring, qsz);
345 	}
346 
347 	/* Zero out MSI-X configuration */
348 	ring->vr_msi_addr = 0;
349 	ring->vr_msi_msg = 0;
350 
351 	/* Clear the stats */
352 	bzero(&ring->vr_stats, sizeof (ring->vr_stats));
353 	bzero(&ring->vr_err_stats, sizeof (ring->vr_err_stats));
354 
355 	t = viona_create_worker(ring);
356 	if (t == NULL) {
357 		err = ENOMEM;
358 		goto fail;
359 	}
360 	ring->vr_worker_thread = t;
361 	ring->vr_state = VRS_SETUP;
362 	cv_broadcast(&ring->vr_cv);
363 	mutex_exit(&ring->vr_lock);
364 	return (0);
365 
366 fail:
367 	viona_ring_lease_drop(ring);
368 	viona_ring_misc_free(ring);
369 	ring->vr_size = 0;
370 	ring->vr_mask = 0;
371 	ring->vr_pa = 0;
372 	ring->vr_cur_aidx = 0;
373 	ring->vr_cur_uidx = 0;
374 	mutex_exit(&ring->vr_lock);
375 	return (err);
376 }
377 
378 int
379 viona_ring_get_state(viona_link_t *link, uint16_t idx,
380     struct viona_ring_params *params)
381 {
382 	viona_vring_t *ring;
383 
384 	if (idx >= VIONA_VQ_MAX) {
385 		return (EINVAL);
386 	}
387 
388 	ring = &link->l_vrings[idx];
389 	mutex_enter(&ring->vr_lock);
390 
391 	params->vrp_size = ring->vr_size;
392 	params->vrp_pa = ring->vr_pa;
393 
394 	if (ring->vr_state == VRS_RUN) {
395 		/* On a running ring, we must heed the avail/used locks */
396 		mutex_enter(&ring->vr_a_mutex);
397 		params->vrp_avail_idx = ring->vr_cur_aidx;
398 		mutex_exit(&ring->vr_a_mutex);
399 		mutex_enter(&ring->vr_u_mutex);
400 		params->vrp_used_idx = ring->vr_cur_uidx;
401 		mutex_exit(&ring->vr_u_mutex);
402 	} else {
403 		/* Otherwise vr_lock is adequate protection */
404 		params->vrp_avail_idx = ring->vr_cur_aidx;
405 		params->vrp_used_idx = ring->vr_cur_uidx;
406 	}
407 
408 	mutex_exit(&ring->vr_lock);
409 
410 	return (0);
411 }
412 
413 int
414 viona_ring_reset(viona_vring_t *ring, boolean_t heed_signals)
415 {
416 	mutex_enter(&ring->vr_lock);
417 	if (ring->vr_state == VRS_RESET) {
418 		mutex_exit(&ring->vr_lock);
419 		return (0);
420 	}
421 
422 	if ((ring->vr_state_flags & VRSF_REQ_STOP) == 0) {
423 		ring->vr_state_flags |= VRSF_REQ_STOP;
424 		cv_broadcast(&ring->vr_cv);
425 	}
426 	while (ring->vr_state != VRS_RESET) {
427 		if (!heed_signals) {
428 			cv_wait(&ring->vr_cv, &ring->vr_lock);
429 		} else {
430 			int rs;
431 
432 			rs = cv_wait_sig(&ring->vr_cv, &ring->vr_lock);
433 			if (rs <= 0 && ring->vr_state != VRS_RESET) {
434 				mutex_exit(&ring->vr_lock);
435 				return (EINTR);
436 			}
437 		}
438 	}
439 	mutex_exit(&ring->vr_lock);
440 	return (0);
441 }
442 
443 static bool
444 viona_ring_map(viona_vring_t *ring, bool defer_dirty)
445 {
446 	const uint16_t qsz = ring->vr_size;
447 	uintptr_t pa = ring->vr_pa;
448 
449 	ASSERT3U(qsz, !=, 0);
450 	ASSERT3U(qsz, <=, VRING_MAX_LEN);
451 	ASSERT3U(pa, !=, 0);
452 	ASSERT3U(pa & (LEGACY_VQ_ALIGN - 1), ==, 0);
453 	ASSERT(MUTEX_HELD(&ring->vr_lock));
454 	ASSERT3P(ring->vr_map_pages, ==, NULL);
455 
456 	const uint_t npages = LEGACY_VQ_PAGES(qsz);
457 	ring->vr_map_pages = kmem_zalloc(npages * sizeof (void *), KM_SLEEP);
458 
459 	int page_flags = 0;
460 	if (defer_dirty) {
461 		/*
462 		 * During initialization, and when entering the paused state,
463 		 * the page holds for a virtqueue are established with the
464 		 * DEFER_DIRTY flag set.
465 		 *
466 		 * This prevents those page holds from immediately marking the
467 		 * underlying pages as dirty, since the viona emulation is not
468 		 * yet performing any accesses.  Once the ring transitions to
469 		 * the VRS_RUN state, the held pages will be marked as dirty.
470 		 *
471 		 * Any ring mappings performed outside those state conditions,
472 		 * such as those from vmm_lease renewal during steady-state
473 		 * operation, will map the ring pages normally (as considered
474 		 * immediately dirty).
475 		 */
476 		page_flags |= VMPF_DEFER_DIRTY;
477 	}
478 
479 	vmm_page_t *prev = NULL;
480 	for (uint_t i = 0; i < npages; i++, pa += VQ_PGSZ) {
481 		vmm_page_t *vmp;
482 
483 		vmp = vmm_drv_page_hold_ext(ring->vr_lease, pa,
484 		    PROT_READ | PROT_WRITE, page_flags);
485 		if (vmp == NULL) {
486 			viona_ring_unmap(ring);
487 			return (false);
488 		}
489 
490 		/*
491 		 * Keep the first page as the head of the chain, appending all
492 		 * subsequent pages to the tail.
493 		 */
494 		if (prev == NULL) {
495 			ring->vr_map_hold = vmp;
496 		} else {
497 			vmm_drv_page_chain(prev, vmp);
498 		}
499 		prev = vmp;
500 		ring->vr_map_pages[i] = vmm_drv_page_writable(vmp);
501 	}
502 
503 	return (true);
504 }
505 
506 static void
507 viona_ring_mark_dirty(viona_vring_t *ring)
508 {
509 	ASSERT(MUTEX_HELD(&ring->vr_lock));
510 	ASSERT(ring->vr_map_hold != NULL);
511 
512 	for (vmm_page_t *vp = ring->vr_map_hold; vp != NULL;
513 	    vp = vmm_drv_page_next(vp)) {
514 		vmm_drv_page_mark_dirty(vp);
515 	}
516 }
517 
518 static void
519 viona_ring_unmap(viona_vring_t *ring)
520 {
521 	ASSERT(MUTEX_HELD(&ring->vr_lock));
522 
523 	void **map = ring->vr_map_pages;
524 	if (map != NULL) {
525 		const uint_t npages = LEGACY_VQ_PAGES(ring->vr_size);
526 		kmem_free(map, npages * sizeof (void *));
527 		ring->vr_map_pages = NULL;
528 
529 		vmm_drv_page_release_chain(ring->vr_map_hold);
530 		ring->vr_map_hold = NULL;
531 	} else {
532 		ASSERT3P(ring->vr_map_hold, ==, NULL);
533 	}
534 }
535 
536 static inline void *
537 viona_ring_addr(viona_vring_t *ring, uint_t off)
538 {
539 	ASSERT3P(ring->vr_map_pages, !=, NULL);
540 	ASSERT3U(LEGACY_VQ_SIZE(ring->vr_size), >, off);
541 
542 	const uint_t page_num = off / VQ_PGSZ;
543 	const uint_t page_off = off % VQ_PGSZ;
544 	return ((caddr_t)ring->vr_map_pages[page_num] + page_off);
545 }
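
/*
 * For example, with 4k pages an offset of 5000 into the ring resolves to
 * byte 904 of vr_map_pages[1].  Callers are expected to keep each access
 * within a single page (see vq_read_desc() and vq_write_used_ent()).
 */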
546 
547 void
548 viona_intr_ring(viona_vring_t *ring, boolean_t skip_flags_check)
549 {
550 	if (!skip_flags_check) {
551 		volatile uint16_t *avail_flags = viona_ring_addr(ring,
552 		    LEGACY_AVAIL_FLAGS_OFF(ring->vr_size));
553 
554 		if ((*avail_flags & VRING_AVAIL_F_NO_INTERRUPT) != 0) {
555 			return;
556 		}
557 	}
558 
559 	mutex_enter(&ring->vr_lock);
560 	uint64_t addr = ring->vr_msi_addr;
561 	uint64_t msg = ring->vr_msi_msg;
562 	mutex_exit(&ring->vr_lock);
563 	if (addr != 0) {
564 		/* Deliver the interrupt directly, if so configured... */
565 		(void) vmm_drv_msi(ring->vr_lease, addr, msg);
566 	} else {
567 		/* ... otherwise, leave it to userspace */
568 		if (atomic_cas_uint(&ring->vr_intr_enabled, 0, 1) == 0) {
569 			pollwakeup(&ring->vr_link->l_pollhead, POLLRDBAND);
570 		}
571 	}
572 }
573 
574 static inline bool
575 vring_stop_req(const viona_vring_t *ring)
576 {
577 	return ((ring->vr_state_flags & VRSF_REQ_STOP) != 0);
578 }
579 
580 static inline bool
581 vring_pause_req(const viona_vring_t *ring)
582 {
583 	return ((ring->vr_state_flags & VRSF_REQ_PAUSE) != 0);
584 }
585 
586 static inline bool
587 vring_start_req(const viona_vring_t *ring)
588 {
589 	return ((ring->vr_state_flags & VRSF_REQ_START) != 0);
590 }
591 
592 /*
593  * Check if vring worker thread should bail out.  This will heed indications
594  * that the containing process is exiting, as well as requests to stop or pause
595  * the ring.  The `stop_only` parameter controls if pause requests are ignored
596  * (true) or checked (false).
597  *
598  * Caller should hold vr_lock.
599  */
600 static bool
601 vring_need_bail_ext(const viona_vring_t *ring, bool stop_only)
602 {
603 	ASSERT(MUTEX_HELD(&ring->vr_lock));
604 
605 	if (vring_stop_req(ring) ||
606 	    (!stop_only && vring_pause_req(ring))) {
607 		return (true);
608 	}
609 
610 	kthread_t *t = ring->vr_worker_thread;
611 	if (t != NULL) {
612 		proc_t *p = ttoproc(t);
613 
614 		ASSERT(p != NULL);
615 		if ((p->p_flag & SEXITING) != 0) {
616 			return (true);
617 		}
618 	}
619 	return (false);
620 }
621 
622 bool
623 vring_need_bail(const viona_vring_t *ring)
624 {
625 	return (vring_need_bail_ext(ring, false));
626 }
627 
628 int
629 viona_ring_pause(viona_vring_t *ring)
630 {
631 	mutex_enter(&ring->vr_lock);
632 	switch (ring->vr_state) {
633 	case VRS_RESET:
634 	case VRS_SETUP:
635 	case VRS_INIT:
636 		/*
637 		 * For rings which have not yet started (even those in VRS_SETUP
638 		 * and VRS_INIT, where there is a running worker thread waiting
639 		 * to be released to do its intended task), it is adequate to
640 		 * simply clear any start request, keeping them from proceeding
641 		 * into the actual work processing function.
642 		 */
643 		ring->vr_state_flags &= ~VRSF_REQ_START;
644 		mutex_exit(&ring->vr_lock);
645 		return (0);
646 
647 	case VRS_STOP:
648 		if ((ring->vr_state_flags & VRSF_REQ_STOP) != 0) {
649 			/* A ring on its way to RESET cannot be paused. */
650 			mutex_exit(&ring->vr_lock);
651 			return (EBUSY);
652 		}
653 		/* FALLTHROUGH */
654 	case VRS_RUN:
655 		ring->vr_state_flags |= VRSF_REQ_PAUSE;
656 		cv_broadcast(&ring->vr_cv);
657 		break;
658 
659 	default:
660 		panic("invalid ring state %d", ring->vr_state);
661 		break;
662 	}
663 
664 	for (;;) {
665 		int res = cv_wait_sig(&ring->vr_cv, &ring->vr_lock);
666 
667 		if (ring->vr_state == VRS_INIT ||
668 		    (ring->vr_state_flags & VRSF_REQ_PAUSE) == 0) {
669 			/* Ring made it to (or through) paused state */
670 			mutex_exit(&ring->vr_lock);
671 			return (0);
672 		}
673 		if (res == 0) {
674 			/* interrupted by signal */
675 			mutex_exit(&ring->vr_lock);
676 			return (EINTR);
677 		}
678 	}
679 	/* NOTREACHED */
680 }
681 
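/*
 * Worker lifecycle, as implemented below:
 *
 *	VRS_SETUP -> VRS_INIT -> VRS_RUN -> VRS_STOP -> VRS_RESET
 *	                ^                      |
 *	                +---- pause request ---+
 *
 * A pause request causes the worker to unmap and re-map the ring pages and
 * return to VRS_INIT; stop requests and process exit fall through to the
 * reset path at the bottom of the function.
 */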
682 static void
683 viona_worker(void *arg)
684 {
685 	viona_vring_t *ring = (viona_vring_t *)arg;
686 	viona_link_t *link = ring->vr_link;
687 
688 	mutex_enter(&ring->vr_lock);
689 	VERIFY3U(ring->vr_state, ==, VRS_SETUP);
690 
691 	/* Bail immediately if ring shutdown or process exit was requested */
692 	if (vring_need_bail_ext(ring, true)) {
693 		goto ring_reset;
694 	}
695 
696 	/* Report worker thread as alive and notify creator */
697 ring_init:
698 	ring->vr_state = VRS_INIT;
699 	cv_broadcast(&ring->vr_cv);
700 
701 	while (!vring_start_req(ring)) {
702 		/*
703 		 * Keeping lease renewals timely while waiting for the ring to
704 		 * be started is important for avoiding deadlocks.
705 		 */
706 		if (vmm_drv_lease_expired(ring->vr_lease)) {
707 			if (!viona_ring_lease_renew(ring)) {
708 				goto ring_reset;
709 			}
710 		}
711 
712 		(void) cv_wait_sig(&ring->vr_cv, &ring->vr_lock);
713 
714 		if (vring_pause_req(ring)) {
715 			/* We are already paused in the INIT state. */
716 			ring->vr_state_flags &= ~VRSF_REQ_PAUSE;
717 		}
718 		if (vring_need_bail_ext(ring, true)) {
719 			goto ring_reset;
720 		}
721 	}
722 
723 	ASSERT((ring->vr_state_flags & VRSF_REQ_START) != 0);
724 	ring->vr_state = VRS_RUN;
725 	ring->vr_state_flags &= ~VRSF_REQ_START;
726 	viona_ring_mark_dirty(ring);
727 
728 	/* Ensure ring lease is valid first */
729 	if (vmm_drv_lease_expired(ring->vr_lease)) {
730 		if (!viona_ring_lease_renew(ring)) {
731 			goto ring_reset;
732 		}
733 	}
734 
735 	/* Process actual work */
736 	if (ring == &link->l_vrings[VIONA_VQ_RX]) {
737 		viona_worker_rx(ring, link);
738 	} else if (ring == &link->l_vrings[VIONA_VQ_TX]) {
739 		viona_worker_tx(ring, link);
740 	} else {
741 		panic("unexpected ring: %p", (void *)ring);
742 	}
743 
744 	VERIFY3U(ring->vr_state, ==, VRS_STOP);
745 	VERIFY3U(ring->vr_xfer_outstanding, ==, 0);
746 
747 	/*
748 	 * Consolidate stats data so that it is not lost if/when this ring is
749 	 * being stopped.
750 	 */
751 	viona_ring_consolidate_stats(ring);
752 
753 	/* Respond to a pause request if the ring is not required to stop */
754 	if (vring_pause_req(ring)) {
755 		ring->vr_state_flags &= ~VRSF_REQ_PAUSE;
756 
757 		if (vring_need_bail_ext(ring, true)) {
758 			goto ring_reset;
759 		}
760 
761 		/*
762 		 * To complete pausing of the ring, unmap and re-map the pages
763 		 * underpinning the virtqueue.  This is to synchronize their
764 		 * dirty state in the backing page tables and restore the
765 		 * defer-dirty state on the held pages.
766 		 */
767 		viona_ring_unmap(ring);
768 		if (viona_ring_map(ring, true)) {
769 			goto ring_init;
770 		}
771 
772 		/*
773 		 * If the ring pages failed to be mapped, fall through to
774 		 * ring-reset like any other failure.
775 		 */
776 	}
777 
778 ring_reset:
779 	viona_ring_misc_free(ring);
780 
781 	viona_ring_lease_drop(ring);
782 	ring->vr_cur_aidx = 0;
783 	ring->vr_size = 0;
784 	ring->vr_mask = 0;
785 	ring->vr_pa = 0;
786 	ring->vr_state = VRS_RESET;
787 	ring->vr_state_flags = 0;
788 	ring->vr_worker_thread = NULL;
789 	cv_broadcast(&ring->vr_cv);
790 	mutex_exit(&ring->vr_lock);
791 
792 	mutex_enter(&ttoproc(curthread)->p_lock);
793 	lwp_exit();
794 }
795 
796 static kthread_t *
797 viona_create_worker(viona_vring_t *ring)
798 {
799 	k_sigset_t hold_set;
800 	proc_t *p = curproc;
801 	kthread_t *t;
802 	klwp_t *lwp;
803 
804 	ASSERT(MUTEX_HELD(&ring->vr_lock));
805 	ASSERT(ring->vr_state == VRS_RESET);
806 
807 	sigfillset(&hold_set);
808 	lwp = lwp_create(viona_worker, (void *)ring, 0, p, TS_STOPPED,
809 	    minclsyspri - 1, &hold_set, curthread->t_cid, 0);
810 	if (lwp == NULL) {
811 		return (NULL);
812 	}
813 
814 	t = lwptot(lwp);
815 	mutex_enter(&p->p_lock);
816 	t->t_proc_flag = (t->t_proc_flag & ~TP_HOLDLWP) | TP_KTHREAD;
817 	lwp_create_done(t);
818 	mutex_exit(&p->p_lock);
819 
820 	return (t);
821 }
822 
823 static inline void
824 vq_read_desc(viona_vring_t *ring, uint16_t idx, struct virtio_desc *descp)
825 {
826 	const uint_t entry_off = idx * sizeof (struct virtio_desc);
827 
828 	ASSERT3U(idx, <, ring->vr_size);
829 
830 	/*
831 	 * On both legacy and 1.x VirtIO, the virtqueue descriptors are required
832 	 * to be aligned to at least 16 bytes (4k for legacy).
833 	 */
834 	*descp = *(const struct virtio_desc *)viona_ring_addr(ring, entry_off);
835 }
836 
837 static uint16_t
838 vq_read_avail(viona_vring_t *ring, uint16_t idx)
839 {
840 	ASSERT3U(idx, <, ring->vr_size);
841 
842 	volatile uint16_t *avail_ent =
843 	    viona_ring_addr(ring, LEGACY_AVAIL_ENT_OFF(ring->vr_size, idx));
844 	return (*avail_ent);
845 }
846 
847 /*
848  * Given a buffer descriptor `desc`, attempt to map the pages backing that
849  * region of guest physical memory, taking into account that there are no
850  * guarantees about guest-contiguous pages being host-contiguous.
851  */
852 static int
853 vq_map_desc_bufs(viona_vring_t *ring, const struct virtio_desc *desc,
854     vq_held_region_t *region)
855 {
856 	if (desc->vd_len == 0) {
857 		VIONA_PROBE2(desc_bad_len, viona_vring_t *, ring,
858 		    uint32_t, desc->vd_len);
859 		VIONA_RING_STAT_INCR(ring, desc_bad_len);
860 		return (EINVAL);
861 	} else if ((region->vhr_len + desc->vd_len) < region->vhr_len) {
862 		VIONA_PROBE1(len_overflow, viona_vring_t *, ring);
863 		VIONA_RING_STAT_INCR(ring, len_overflow);
864 		return (EOVERFLOW);
865 	}
866 
867 	int err = vq_region_hold(ring, desc->vd_addr, desc->vd_len,
868 	    (desc->vd_flags & VRING_DESC_F_WRITE) != 0, region);
869 	if (err == 0) {
870 		region->vhr_len += desc->vd_len;
871 	} else if (err == E2BIG) {
872 		VIONA_PROBE1(too_many_desc, viona_vring_t *, ring);
873 		VIONA_RING_STAT_INCR(ring, too_many_desc);
874 	} else if (err == EFAULT) {
875 		VIONA_PROBE_BAD_RING_ADDR(ring, desc->vd_addr);
876 		VIONA_RING_STAT_INCR(ring, bad_ring_addr);
877 	}
878 
879 	return (err);
880 }
881 
882 /*
883  * Walk an indirect buffer descriptor `desc`, attempting to map the pages
884  * backing the regions of guest memory covered by its constituent descriptors.
885  */
886 static int
887 vq_map_indir_desc_bufs(viona_vring_t *ring, const struct virtio_desc *desc,
888     vq_held_region_t *region)
889 {
890 	const uint16_t indir_count = desc->vd_len / sizeof (struct virtio_desc);
891 
892 	if ((desc->vd_len & 0xf) != 0 || indir_count == 0 ||
893 	    indir_count > ring->vr_size ||
894 	    desc->vd_addr > (desc->vd_addr + desc->vd_len)) {
895 		VIONA_PROBE2(indir_bad_len, viona_vring_t *, ring,
896 		    uint32_t, desc->vd_len);
897 		VIONA_RING_STAT_INCR(ring, indir_bad_len);
898 		return (EINVAL);
899 	}
900 
901 	uint16_t indir_next = 0;
902 	const uint8_t *buf = NULL;
903 	uint64_t buf_gpa = UINT64_MAX;
904 	vmm_page_t *vmp = NULL;
905 	int err = 0;
906 
907 	for (;;) {
908 		const uint64_t indir_gpa =
909 		    desc->vd_addr + (indir_next * sizeof (struct virtio_desc));
910 		const uint64_t indir_page = indir_gpa & VQ_PGMASK;
911 
912 		/*
913 		 * Get a mapping for the page that the next indirect descriptor
914 		 * resides in, if that has not already been done.
915 		 */
916 		if (indir_page != buf_gpa) {
917 			if (vmp != NULL) {
918 				vmm_drv_page_release(vmp);
919 			}
920 			vmp = vq_page_hold(ring, indir_page, false);
921 			if (vmp == NULL) {
922 				VIONA_PROBE_BAD_RING_ADDR(ring, indir_page);
923 				VIONA_RING_STAT_INCR(ring, bad_ring_addr);
924 				err = EFAULT;
925 				break;
926 			}
927 			buf_gpa = indir_page;
928 			buf = vmm_drv_page_readable(vmp);
929 		}
930 
931 		/*
932 		 * A copy of the indirect descriptor is made here, rather than
933 		 * simply using a reference pointer.  This prevents malicious or
934 		 * erroneous guest writes to the descriptor from fooling the
935 		 * flags/bounds verification through a race.
936 		 *
937 		 * While indirect descriptors do not have the same alignment
938 		 * requirements as those residing in the virtqueue itself, we
939 		 * are not concerned about unaligned access while viona remains
940 		 * x86-only.
941 		 */
942 		struct virtio_desc vp = *(const struct virtio_desc *)
943 		    (buf + (indir_gpa - indir_page));
944 
945 		if (vp.vd_flags & VRING_DESC_F_INDIRECT) {
946 			VIONA_PROBE1(indir_bad_nest, viona_vring_t *, ring);
947 			VIONA_RING_STAT_INCR(ring, indir_bad_nest);
948 			err = EINVAL;
949 			break;
950 		} else if (vp.vd_len == 0) {
951 			VIONA_PROBE2(desc_bad_len, viona_vring_t *, ring,
952 			    uint32_t, vp.vd_len);
953 			VIONA_RING_STAT_INCR(ring, desc_bad_len);
954 			err = EINVAL;
955 			break;
956 		}
957 
958 		err = vq_map_desc_bufs(ring, &vp, region);
959 		if (err != 0) {
960 			break;
961 		}
962 
963 		/* Successfully reached the end of the indirect chain */
964 		if ((vp.vd_flags & VRING_DESC_F_NEXT) == 0) {
965 			break;
966 		}
967 		if (region->vhr_idx >= region->vhr_niov) {
968 			VIONA_PROBE1(too_many_desc, viona_vring_t *, ring);
969 			VIONA_RING_STAT_INCR(ring, too_many_desc);
970 			err = E2BIG;
971 			break;
972 		}
973 
974 		indir_next = vp.vd_next;
975 		if (indir_next >= indir_count) {
976 			VIONA_PROBE3(indir_bad_next, viona_vring_t *, ring,
977 			    uint16_t, indir_next, uint16_t, indir_count);
978 			VIONA_RING_STAT_INCR(ring, indir_bad_next);
979 			err = EINVAL;
980 			break;
981 		}
982 	}
983 
984 	if (vmp != NULL) {
985 		vmm_drv_page_release(vmp);
986 	}
987 	return (err);
988 }
989 
990 int
991 vq_popchain(viona_vring_t *ring, struct iovec *iov, uint_t niov,
992     uint16_t *cookie, vmm_page_t **chain, uint32_t *len)
993 {
994 	uint16_t ndesc, idx, head, next;
995 	struct virtio_desc vdir;
996 	vq_held_region_t region = {
997 		.vhr_niov = niov,
998 		.vhr_iov = iov,
999 	};
1000 
1001 	ASSERT(iov != NULL);
1002 	ASSERT(niov > 0 && niov < INT_MAX);
1003 	ASSERT(*chain == NULL);
1004 
1005 	mutex_enter(&ring->vr_a_mutex);
1006 	idx = ring->vr_cur_aidx;
1007 	ndesc = viona_ring_num_avail(ring);
1008 
1009 	if (ndesc == 0) {
1010 		mutex_exit(&ring->vr_a_mutex);
1011 		return (0);
1012 	}
1013 	if (ndesc > ring->vr_size) {
1014 		/*
1015 		 * Despite the fact that the guest has provided an 'avail_idx'
1016 		 * which indicates that an impossible number of descriptors are
1017 		 * available, continue on and attempt to process the next one.
1018 		 *
1019 		 * The transgression will not escape the probe or stats though.
1020 		 */
1021 		VIONA_PROBE2(ndesc_too_high, viona_vring_t *, ring,
1022 		    uint16_t, ndesc);
1023 		VIONA_RING_STAT_INCR(ring, ndesc_too_high);
1024 	}
1025 
1026 	head = vq_read_avail(ring, idx & ring->vr_mask);
1027 	next = head;
1028 
1029 	for (region.vhr_idx = 0; region.vhr_idx < niov; next = vdir.vd_next) {
1030 		if (next >= ring->vr_size) {
1031 			VIONA_PROBE2(bad_idx, viona_vring_t *, ring,
1032 			    uint16_t, next);
1033 			VIONA_RING_STAT_INCR(ring, bad_idx);
1034 			break;
1035 		}
1036 
1037 		vq_read_desc(ring, next, &vdir);
1038 		if ((vdir.vd_flags & VRING_DESC_F_INDIRECT) == 0) {
1039 			if (vq_map_desc_bufs(ring, &vdir, &region) != 0) {
1040 				break;
1041 			}
1042 		} else {
1043 			/*
1044 			 * Per the specification (Virtio 1.1 S2.6.5.3.1):
1045 			 *   A driver MUST NOT set both VIRTQ_DESC_F_INDIRECT
1046 			 *   and VIRTQ_DESC_F_NEXT in `flags`.
1047 			 */
1048 			if ((vdir.vd_flags & VRING_DESC_F_NEXT) != 0) {
1049 				VIONA_PROBE3(indir_bad_next,
1050 				    viona_vring_t *, ring,
1051 				    uint16_t, next, uint16_t, 0);
1052 				VIONA_RING_STAT_INCR(ring, indir_bad_next);
1053 				break;
1054 			}
1055 
1056 			if (vq_map_indir_desc_bufs(ring, &vdir, &region) != 0) {
1057 				break;
1058 			}
1059 		}
1060 
1061 		if ((vdir.vd_flags & VRING_DESC_F_NEXT) == 0) {
1062 			ring->vr_cur_aidx++;
1063 			mutex_exit(&ring->vr_a_mutex);
1064 
1065 			*cookie = head;
1066 			*chain = region.vhr_head;
1067 			if (len != NULL) {
1068 				*len = region.vhr_len;
1069 			}
1070 			return (region.vhr_idx);
1071 		}
1072 	}
1073 
1074 	mutex_exit(&ring->vr_a_mutex);
1075 	if (region.vhr_head != NULL) {
1076 		/*
1077 		 * If any pages were held prior to encountering an error, we
1078 		 * must release them now.
1079 		 */
1080 		vmm_drv_page_release_chain(region.vhr_head);
1081 	}
1082 	return (-1);
1083 }
1084 
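/*
 * Sketch of how vq_popchain() pairs with vq_pushchain(); the fragment below
 * is illustrative only (the iovec count and transfer step are placeholders),
 * and the real consumers are the RX/TX worker paths.
 *
 *	struct iovec iov[8];
 *	uint16_t cookie;
 *	vmm_page_t *chain = NULL;
 *	uint32_t len;
 *
 *	int n = vq_popchain(ring, iov, 8, &cookie, &chain, &len);
 *	if (n > 0) {
 *		// ... transfer data through iov[0 .. n - 1] ...
 *		vmm_drv_page_release_chain(chain);
 *		vq_pushchain(ring, bytes_used, cookie);
 *	}
 */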
1085 
1086 static void
1087 vq_write_used_ent(viona_vring_t *ring, uint16_t idx, uint16_t cookie,
1088     uint32_t len)
1089 {
1090 	/*
1091 	 * In a larger ring, an entry could be split across pages, so be sure
1092 	 * to account for that when configuring the transfer by looking up the
1093 	 * ID and length addresses separately, rather than an address for a
1094 	 * combined `struct virtio_used`.
1095 	 */
1096 	const uint_t used_id_off = LEGACY_USED_ENT_OFF(ring->vr_size, idx);
1097 	const uint_t used_len_off = used_id_off + sizeof (uint32_t);
1098 	volatile uint32_t *idp = viona_ring_addr(ring, used_id_off);
1099 	volatile uint32_t *lenp = viona_ring_addr(ring, used_len_off);
1100 
1101 	ASSERT(MUTEX_HELD(&ring->vr_u_mutex));
1102 
1103 	*idp = cookie;
1104 	*lenp = len;
1105 }
1106 
1107 static void
1108 vq_write_used_idx(viona_vring_t *ring, uint16_t idx)
1109 {
1110 	ASSERT(MUTEX_HELD(&ring->vr_u_mutex));
1111 
1112 	volatile uint16_t *used_idx =
1113 	    viona_ring_addr(ring, LEGACY_USED_IDX_OFF(ring->vr_size));
1114 	*used_idx = idx;
1115 }
1116 
1117 void
1118 vq_pushchain(viona_vring_t *ring, uint32_t len, uint16_t cookie)
1119 {
1120 	uint16_t uidx;
1121 
1122 	mutex_enter(&ring->vr_u_mutex);
1123 
1124 	uidx = ring->vr_cur_uidx;
1125 	vq_write_used_ent(ring, uidx & ring->vr_mask, cookie, len);
1126 	uidx++;
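	/*
	 * Make the used-ring entry written above globally visible before the
	 * guest can observe the updated used index.
	 */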
1127 	membar_producer();
1128 
1129 	vq_write_used_idx(ring, uidx);
1130 	ring->vr_cur_uidx = uidx;
1131 
1132 	mutex_exit(&ring->vr_u_mutex);
1133 }
1134 
1135 void
1136 vq_pushchain_many(viona_vring_t *ring, uint_t num_bufs, used_elem_t *elem)
1137 {
1138 	uint16_t uidx;
1139 
1140 	mutex_enter(&ring->vr_u_mutex);
1141 
1142 	uidx = ring->vr_cur_uidx;
1143 
1144 	for (uint_t i = 0; i < num_bufs; i++, uidx++) {
1145 		vq_write_used_ent(ring, uidx & ring->vr_mask, elem[i].id,
1146 		    elem[i].len);
1147 	}
1148 
1149 	membar_producer();
1150 	vq_write_used_idx(ring, uidx);
1151 	ring->vr_cur_uidx = uidx;
1152 
1153 	mutex_exit(&ring->vr_u_mutex);
1154 }
1155 
1156 /*
1157  * Set USED_NO_NOTIFY on VQ so guest elides doorbell calls for new entries.
1158  */
1159 void
1160 viona_ring_disable_notify(viona_vring_t *ring)
1161 {
1162 	volatile uint16_t *used_flags =
1163 	    viona_ring_addr(ring, LEGACY_USED_FLAGS_OFF(ring->vr_size));
1164 
1165 	*used_flags |= VRING_USED_F_NO_NOTIFY;
1166 }
1167 
1168 /*
1169  * Clear USED_NO_NOTIFY on VQ so guest resumes doorbell calls for new entries.
1170  */
1171 void
1172 viona_ring_enable_notify(viona_vring_t *ring)
1173 {
1174 	volatile uint16_t *used_flags =
1175 	    viona_ring_addr(ring, LEGACY_USED_FLAGS_OFF(ring->vr_size));
1176 
1177 	*used_flags &= ~VRING_USED_F_NO_NOTIFY;
1178 }
1179 
1180 /*
1181  * Return the number of available descriptors in the vring, accounting for the
1182  * 16-bit index wraparound.
1183  *
1184  * Note: If the number of apparently available descriptors is larger than the
1185  * ring size (due to guest misbehavior), this check will still report the
1186  * positive count of descriptors.
1187  */
1188 uint16_t
1189 viona_ring_num_avail(viona_vring_t *ring)
1190 {
1191 	volatile uint16_t *avail_idx =
1192 	    viona_ring_addr(ring, LEGACY_AVAIL_IDX_OFF(ring->vr_size));
1193 
1194 	return (*avail_idx - ring->vr_cur_aidx);
1195 }
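
/*
 * For example, with *avail_idx == 3 and vr_cur_aidx == 65533, the unsigned
 * 16-bit subtraction above yields 6: the count of descriptors posted across
 * the index wrap.
 */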
1196 
1197 /* Record successfully transferred packet(s) for the ring stats */
1198 void
1199 viona_ring_stat_accept(viona_vring_t *ring, size_t count, size_t len)
1200 {
1201 	atomic_add_64(&ring->vr_stats.vts_packets, count);
1202 	atomic_add_64(&ring->vr_stats.vts_bytes, len);
1203 }
1204 
1205 /*
1206  * Record dropped packet(s) in the ring stats
1207  */
1208 void
1209 viona_ring_stat_drop(viona_vring_t *ring, size_t count)
1210 {
1211 	atomic_add_64(&ring->vr_stats.vts_drops, count);
1212 }
1213 
1214 /*
1215  * Record a packet transfer error in the ring stats
1216  */
1217 void
1218 viona_ring_stat_error(viona_vring_t *ring)
1219 {
1220 	atomic_inc_64(&ring->vr_stats.vts_errors);
1221 }
1222 
1223 /*
1224  * Consolidate statistic data for this ring into the totals for the link
1225  */
1226 static void
1227 viona_ring_consolidate_stats(viona_vring_t *ring)
1228 {
1229 	viona_link_t *link = ring->vr_link;
1230 	struct viona_transfer_stats *lstat =
1231 	    (ring == &link->l_vrings[VIONA_VQ_RX]) ?
1232 	    &link->l_stats.vls_rx : &link->l_stats.vls_tx;
1233 
1234 	mutex_enter(&link->l_stats_lock);
1235 	lstat->vts_packets += ring->vr_stats.vts_packets;
1236 	lstat->vts_bytes += ring->vr_stats.vts_bytes;
1237 	lstat->vts_drops += ring->vr_stats.vts_drops;
1238 	lstat->vts_errors += ring->vr_stats.vts_errors;
1239 	bzero(&ring->vr_stats, sizeof (ring->vr_stats));
1240 	mutex_exit(&link->l_stats_lock);
1241 }
1242 
1243 /*
1244  * Copy `sz` bytes from iovecs contained in `iob` to `dst`.
1245  *
1246  * Returns `true` if copy was successful (implying adequate data was remaining
1247  * in the iov_bunch_t).
1248  */
1249 bool
1250 iov_bunch_copy(iov_bunch_t *iob, void *dst, uint32_t sz)
1251 {
1252 	if (sz > iob->ib_remain) {
1253 		return (false);
1254 	}
1255 	if (sz == 0) {
1256 		return (true);
1257 	}
1258 
1259 	caddr_t dest = dst;
1260 	do {
1261 		struct iovec *iov = iob->ib_iov;
1262 
1263 		ASSERT3U(iov->iov_len, <, UINT32_MAX);
1264 		ASSERT3U(iov->iov_len, !=, 0);
1265 
1266 		const uint32_t iov_avail = (iov->iov_len - iob->ib_offset);
1267 		const uint32_t to_copy = MIN(sz, iov_avail);
1268 
1269 		if (to_copy != 0) {
1270 			bcopy((caddr_t)iov->iov_base + iob->ib_offset, dest,
1271 			    to_copy);
1272 		}
1273 
1274 		sz -= to_copy;
1275 		iob->ib_remain -= to_copy;
1276 		dest += to_copy;
1277 		iob->ib_offset += to_copy;
1278 
1279 		ASSERT3U(iob->ib_offset, <=, iov->iov_len);
1280 
1281 		if (iob->ib_offset == iov->iov_len) {
1282 			iob->ib_iov++;
1283 			iob->ib_offset = 0;
1284 		}
1285 	} while (sz > 0);
1286 
1287 	return (true);
1288 }
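
/*
 * Illustrative use of iov_bunch_copy(): gathering a fixed-size header from
 * the front of a chain of iovecs (the header size and variable names here
 * are placeholders, not part of the driver).
 *
 *	iov_bunch_t iob = {
 *		.ib_iov = iov,
 *		.ib_remain = chain_len,
 *	};
 *	uint8_t hdr[12];
 *
 *	if (!iov_bunch_copy(&iob, hdr, sizeof (hdr))) {
 *		// chain too short to contain the header
 *	}
 */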
1289 
1290 /*
1291  * Get the data pointer and length of the current head iovec, less any
1292  * offsetting from prior copy operations.  This will advance the iov_bunch_t as
1293  * if the caller had performed a copy of that chunk length.
1294  *
1295  * Returns `true` if the iov_bunch_t had at least one iovec (unconsumed bytes)
1296  * remaining, setting `chunk` and `chunk_sz` to the chunk pointer and size,
1297  * respectively.
1298  */
1299 bool
1300 iov_bunch_next_chunk(iov_bunch_t *iob, caddr_t *chunk, uint32_t *chunk_sz)
1301 {
1302 	if (iob->ib_remain == 0) {
1303 		*chunk = NULL;
1304 		*chunk_sz = 0;
1305 		return (false);
1306 	}
1307 
1308 	*chunk_sz = iob->ib_iov->iov_len - iob->ib_offset;
1309 	*chunk = (caddr_t)iob->ib_iov->iov_base + iob->ib_offset;
1310 	iob->ib_remain -= *chunk_sz;
1311 	iob->ib_iov++;
1312 	iob->ib_offset = 0;
1313 	return (true);
1314 }
1315