1 /*
2 * Copyright (c) 2013 Chris Torek <torek @ torek net>
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 *
14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24 * SUCH DAMAGE.
25 */
26 /*
27 * This file and its contents are supplied under the terms of the
28 * Common Development and Distribution License ("CDDL"), version 1.0.
29 * You may only use this file in accordance with the terms of version
30 * 1.0 of the CDDL.
31 *
32 * A full copy of the text of the CDDL should have accompanied this
33 * source. A copy of the CDDL is also available via the Internet at
34 * http://www.illumos.org/license/CDDL.
35 *
36 * Copyright 2015 Pluribus Networks Inc.
37 * Copyright 2019 Joyent, Inc.
38 * Copyright 2025 Oxide Computer Company
39 */
40
41
42 #include <sys/disp.h>
43
44 #include "viona_impl.h"
45
46 #define VRING_MAX_LEN 32768
47
48 /* Layout and sizing as defined in the spec for a legacy-style virtqueue */
49
50 /*
51 * Because viona is not built with MACHDEP defined, PAGESIZE and friends are not
52 * constants but rather variable references. While viona remains x86-only, we
53 * are free to hard-code this to 4k.
54 */
55 #define VQ_PGSZ 4096UL
56 #define VQ_PGOFF (VQ_PGSZ - 1)
57 #define VQ_PGMASK ~VQ_PGOFF
58
59 #define LEGACY_VQ_ALIGN VQ_PGSZ
60
61 #define LEGACY_DESC_SZ(qsz) ((qsz) * sizeof (struct virtio_desc))
62 /*
63 * Available ring consists of avail_idx (uint16_t), flags (uint16_t), qsz avail
64 * descriptors (uint16_t each), and (optional) used_event (uint16_t).
65 */
66 #define LEGACY_AVAIL_SZ(qsz) (((qsz) + 3) * sizeof (uint16_t))
67 /*
68 * Used ring consists of used_idx (uint16_t), flags (uint16_t), qsz used
69 * descriptors (two uint32_t each), and (optional) avail_event (uint16_t).
70 */
71 #define LEGACY_USED_SZ(qsz) \
72 ((qsz) * sizeof (struct virtio_used) + 3 * sizeof (uint16_t))
73
74 #define LEGACY_AVAIL_FLAGS_OFF(qsz) LEGACY_DESC_SZ(qsz)
75 #define LEGACY_AVAIL_IDX_OFF(qsz) \
76 (LEGACY_DESC_SZ(qsz) + sizeof (uint16_t))
77 #define LEGACY_AVAIL_ENT_OFF(qsz, idx) \
78 (LEGACY_DESC_SZ(qsz) + (2 + (idx)) * sizeof (uint16_t))
79
80 #define LEGACY_USED_FLAGS_OFF(qsz) \
81 P2ROUNDUP(LEGACY_DESC_SZ(qsz) + LEGACY_AVAIL_SZ(qsz), LEGACY_VQ_ALIGN)
82 #define LEGACY_USED_IDX_OFF(qsz) \
83 (LEGACY_USED_FLAGS_OFF(qsz) + sizeof (uint16_t))
84 #define LEGACY_USED_ENT_OFF(qsz, idx) \
85 (LEGACY_USED_FLAGS_OFF(qsz) + 2 * sizeof (uint16_t) + \
86 (idx) * sizeof (struct virtio_used))
87
88 #define LEGACY_VQ_SIZE(qsz) \
89 (LEGACY_USED_FLAGS_OFF(qsz) + \
90 P2ROUNDUP(LEGACY_USED_SZ(qsz), LEGACY_VQ_ALIGN))
91 #define LEGACY_VQ_PAGES(qsz) (LEGACY_VQ_SIZE(qsz) / VQ_PGSZ)
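/*
 * Worked example of the layout math above (illustrative only, not used by the
 * code): for a ring of qsz = 256, the legacy virtqueue occupies three pages:
 *
 *	LEGACY_DESC_SZ(256)        = 256 * 16                     = 4096
 *	LEGACY_AVAIL_SZ(256)       = (256 + 3) * 2                = 518
 *	LEGACY_USED_FLAGS_OFF(256) = P2ROUNDUP(4096 + 518, 4096)  = 8192
 *	LEGACY_USED_SZ(256)        = 256 * 8 + 3 * 2              = 2054
 *	LEGACY_VQ_SIZE(256)        = 8192 + P2ROUNDUP(2054, 4096) = 12288
 *	LEGACY_VQ_PAGES(256)       = 12288 / 4096                 = 3
 */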
92
93 struct vq_held_region {
94 struct iovec *vhr_iov;
95 vmm_page_t *vhr_head;
96 vmm_page_t *vhr_tail;
97 /* Length of iovec array supplied in `vhr_iov` */
98 uint_t vhr_niov;
99 /*
100 * Index into vhr_iov, indicating the next "free" entry (following the
101 * last entry which has valid contents).
102 */
103 uint_t vhr_idx;
104
105 /* Total length of populated entries in `vhr_iov` */
106 uint32_t vhr_len;
107 };
108 typedef struct vq_held_region vq_held_region_t;
109
110 static bool viona_ring_map(viona_vring_t *, bool);
111 static void viona_ring_unmap(viona_vring_t *);
112 static kthread_t *viona_create_worker(viona_vring_t *);
113 static void viona_ring_consolidate_stats(viona_vring_t *);
114
115 static vmm_page_t *
116 vq_page_hold(viona_vring_t *ring, uint64_t gpa, bool writable)
117 {
118 ASSERT3P(ring->vr_lease, !=, NULL);
119
120 int prot = PROT_READ;
121 if (writable) {
122 prot |= PROT_WRITE;
123 }
124
125 return (vmm_drv_page_hold(ring->vr_lease, gpa, prot));
126 }
127
128 /*
129 * Establish a hold on the page(s) which back the region of guest memory covered
130 * by [gpa, gpa + len). The host-kernel-virtual pointers to those pages are
131 * stored in the iovec array supplied in `region`, along with the chain of
132 * vmm_page_t entries representing the held pages. Since guest memory
133 * carries no guarantees of being physically contiguous (on the host), it is
134 * assumed that an iovec entry will be required for each page sized section
135 * covered by the specified `gpa` and `len` range. For each iovec entry
136 * successfully populated by holding a page, `vhr_idx` will be incremented so it
137 * references the next available iovec entry (or `vhr_niov`, if the iovec array
138 * is full). The responsibility for releasing the `vmm_page_t` chain (stored in
139 * `vhr_head` and `vhr_tail`) resides with the caller, regardless of the result.
140 */
141 static int
142 vq_region_hold(viona_vring_t *ring, uint64_t gpa, uint32_t len,
143 bool writable, vq_held_region_t *region)
144 {
145 const uint32_t front_offset = gpa & VQ_PGOFF;
146 const uint32_t front_len = MIN(len, VQ_PGSZ - front_offset);
147 uint_t pages = 1;
148 vmm_page_t *vmp;
149 caddr_t buf;
150
151 ASSERT3U(region->vhr_idx, <, region->vhr_niov);
152
153 if (front_len < len) {
154 pages += P2ROUNDUP((uint64_t)(len - front_len),
155 VQ_PGSZ) / VQ_PGSZ;
156 }
157 if (pages > (region->vhr_niov - region->vhr_idx)) {
158 return (E2BIG);
159 }
160
161 vmp = vq_page_hold(ring, gpa & VQ_PGMASK, writable);
162 if (vmp == NULL) {
163 return (EFAULT);
164 }
165 buf = (caddr_t)vmm_drv_page_readable(vmp);
166
167 region->vhr_iov[region->vhr_idx].iov_base = buf + front_offset;
168 region->vhr_iov[region->vhr_idx].iov_len = front_len;
169 region->vhr_idx++;
170 gpa += front_len;
171 len -= front_len;
172 if (region->vhr_head == NULL) {
173 region->vhr_head = vmp;
174 region->vhr_tail = vmp;
175 } else {
176 vmm_drv_page_chain(region->vhr_tail, vmp);
177 region->vhr_tail = vmp;
178 }
179
180 for (uint_t i = 1; i < pages; i++) {
181 ASSERT3U(gpa & VQ_PGOFF, ==, 0);
182
183 vmp = vq_page_hold(ring, gpa, writable);
184 if (vmp == NULL) {
185 return (EFAULT);
186 }
187 buf = (caddr_t)vmm_drv_page_readable(vmp);
188
189 const uint32_t chunk_len = MIN(len, VQ_PGSZ);
190 region->vhr_iov[region->vhr_idx].iov_base = buf;
191 region->vhr_iov[region->vhr_idx].iov_len = chunk_len;
192 region->vhr_idx++;
193 gpa += chunk_len;
194 len -= chunk_len;
195 vmm_drv_page_chain(region->vhr_tail, vmp);
196 region->vhr_tail = vmp;
197 }
198
199 return (0);
200 }
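/*
 * For instance (illustrative numbers only): a 16-byte region beginning 4
 * bytes shy of a page boundary (front_offset = 4092, front_len = 4) requires
 * two page holds and two iovec entries of 4 and 12 bytes, even though the
 * guest-physical range itself is contiguous.
 */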
201
202 static boolean_t
203 viona_ring_lease_expire_cb(void *arg)
204 {
205 viona_vring_t *ring = arg;
206
207 mutex_enter(&ring->vr_lock);
208 cv_broadcast(&ring->vr_cv);
209 mutex_exit(&ring->vr_lock);
210
211 /* The lease will be broken asynchronously. */
212 return (B_FALSE);
213 }
214
215 static void
216 viona_ring_lease_drop(viona_vring_t *ring)
217 {
218 ASSERT(MUTEX_HELD(&ring->vr_lock));
219
220 if (ring->vr_lease != NULL) {
221 vmm_hold_t *hold = ring->vr_link->l_vm_hold;
222
223 ASSERT(hold != NULL);
224
225 /*
226 * Without an active lease, the ring mappings cannot be
227 * considered valid.
228 */
229 viona_ring_unmap(ring);
230
231 vmm_drv_lease_break(hold, ring->vr_lease);
232 ring->vr_lease = NULL;
233 }
234 }
235
236 boolean_t
237 viona_ring_lease_renew(viona_vring_t *ring)
238 {
239 vmm_hold_t *hold = ring->vr_link->l_vm_hold;
240
241 ASSERT(hold != NULL);
242 ASSERT(MUTEX_HELD(&ring->vr_lock));
243
244 viona_ring_lease_drop(ring);
245
246 /*
247 * Lease renewal will fail if the VM has requested that all holds be
248 * cleaned up.
249 */
250 ring->vr_lease = vmm_drv_lease_sign(hold, viona_ring_lease_expire_cb,
251 ring);
252 if (ring->vr_lease != NULL) {
253 /* A ring undergoing renewal will need valid guest mappings */
254 if (ring->vr_pa != 0 && ring->vr_size != 0) {
255 /*
256 * If new mappings cannot be established, consider the
257 * lease renewal a failure.
258 */
259 if (!viona_ring_map(ring, ring->vr_state == VRS_INIT)) {
260 viona_ring_lease_drop(ring);
261 return (B_FALSE);
262 }
263 }
264 }
265 return (ring->vr_lease != NULL);
266 }
267
268 void
269 viona_ring_alloc(viona_link_t *link, viona_vring_t *ring)
270 {
271 ring->vr_link = link;
272 mutex_init(&ring->vr_lock, NULL, MUTEX_DRIVER, NULL);
273 cv_init(&ring->vr_cv, NULL, CV_DRIVER, NULL);
274 mutex_init(&ring->vr_a_mutex, NULL, MUTEX_DRIVER, NULL);
275 mutex_init(&ring->vr_u_mutex, NULL, MUTEX_DRIVER, NULL);
276 }
277
278 static void
279 viona_ring_misc_free(viona_vring_t *ring)
280 {
281 const uint_t qsz = ring->vr_size;
282
283 viona_tx_ring_free(ring, qsz);
284 }
285
286 void
287 viona_ring_free(viona_vring_t *ring)
288 {
289 mutex_destroy(&ring->vr_lock);
290 cv_destroy(&ring->vr_cv);
291 mutex_destroy(&ring->vr_a_mutex);
292 mutex_destroy(&ring->vr_u_mutex);
293 ring->vr_link = NULL;
294 }
295
296 int
297 viona_ring_init(viona_link_t *link, uint16_t idx,
298 const struct viona_ring_params *params)
299 {
300 viona_vring_t *ring;
301 kthread_t *t;
302 int err = 0;
303 const uint16_t qsz = params->vrp_size;
304 const uint64_t pa = params->vrp_pa;
305
306 if (idx >= VIONA_VQ_MAX) {
307 return (EINVAL);
308 }
309
310 if (qsz == 0 || qsz > VRING_MAX_LEN || (1 << (ffs(qsz) - 1)) != qsz) {
311 return (EINVAL);
312 }
313 if ((pa & (LEGACY_VQ_ALIGN - 1)) != 0) {
314 return (EINVAL);
315 }
316
317 ring = &link->l_vrings[idx];
318 mutex_enter(&ring->vr_lock);
319 if (ring->vr_state != VRS_RESET) {
320 mutex_exit(&ring->vr_lock);
321 return (EBUSY);
322 }
323 VERIFY(ring->vr_state_flags == 0);
324
325 ring->vr_lease = NULL;
326 if (!viona_ring_lease_renew(ring)) {
327 err = EBUSY;
328 goto fail;
329 }
330
331 ring->vr_size = qsz;
332 ring->vr_mask = (ring->vr_size - 1);
333 ring->vr_pa = pa;
334 if (!viona_ring_map(ring, true)) {
335 err = EINVAL;
336 goto fail;
337 }
338
339 /* Initialize queue indexes */
340 ring->vr_cur_aidx = params->vrp_avail_idx;
341 ring->vr_cur_uidx = params->vrp_used_idx;
342
343 if (idx == VIONA_VQ_TX) {
344 viona_tx_ring_alloc(ring, qsz);
345 }
346
347 /* Zero out MSI-X configuration */
348 ring->vr_msi_addr = 0;
349 ring->vr_msi_msg = 0;
350
351 /* Clear the stats */
352 bzero(&ring->vr_stats, sizeof (ring->vr_stats));
353 bzero(&ring->vr_err_stats, sizeof (ring->vr_err_stats));
354
355 t = viona_create_worker(ring);
356 if (t == NULL) {
357 err = ENOMEM;
358 goto fail;
359 }
360 ring->vr_worker_thread = t;
361 ring->vr_state = VRS_SETUP;
362 cv_broadcast(&ring->vr_cv);
363 mutex_exit(&ring->vr_lock);
364 return (0);
365
366 fail:
367 viona_ring_lease_drop(ring);
368 viona_ring_misc_free(ring);
369 ring->vr_size = 0;
370 ring->vr_mask = 0;
371 ring->vr_pa = 0;
372 ring->vr_cur_aidx = 0;
373 ring->vr_cur_uidx = 0;
374 mutex_exit(&ring->vr_lock);
375 return (err);
376 }
377
378 int
379 viona_ring_get_state(viona_link_t *link, uint16_t idx,
380 struct viona_ring_params *params)
381 {
382 viona_vring_t *ring;
383
384 if (idx >= VIONA_VQ_MAX) {
385 return (EINVAL);
386 }
387
388 ring = &link->l_vrings[idx];
389 mutex_enter(&ring->vr_lock);
390
391 params->vrp_size = ring->vr_size;
392 params->vrp_pa = ring->vr_pa;
393
394 if (ring->vr_state == VRS_RUN) {
395 /* On a running ring, we must heed the avail/used locks */
396 mutex_enter(&ring->vr_a_mutex);
397 params->vrp_avail_idx = ring->vr_cur_aidx;
398 mutex_exit(&ring->vr_a_mutex);
399 mutex_enter(&ring->vr_u_mutex);
400 params->vrp_used_idx = ring->vr_cur_uidx;
401 mutex_exit(&ring->vr_u_mutex);
402 } else {
403 /* Otherwise vr_lock is adequate protection */
404 params->vrp_avail_idx = ring->vr_cur_aidx;
405 params->vrp_used_idx = ring->vr_cur_uidx;
406 }
407
408 mutex_exit(&ring->vr_lock);
409
410 return (0);
411 }
412
413 int
414 viona_ring_reset(viona_vring_t *ring, boolean_t heed_signals)
415 {
416 mutex_enter(&ring->vr_lock);
417 if (ring->vr_state == VRS_RESET) {
418 mutex_exit(&ring->vr_lock);
419 return (0);
420 }
421
422 if ((ring->vr_state_flags & VRSF_REQ_STOP) == 0) {
423 ring->vr_state_flags |= VRSF_REQ_STOP;
424 cv_broadcast(&ring->vr_cv);
425 }
426 while (ring->vr_state != VRS_RESET) {
427 if (!heed_signals) {
428 cv_wait(&ring->vr_cv, &ring->vr_lock);
429 } else {
430 int rs;
431
432 rs = cv_wait_sig(&ring->vr_cv, &ring->vr_lock);
433 if (rs <= 0 && ring->vr_state != VRS_RESET) {
434 mutex_exit(&ring->vr_lock);
435 return (EINTR);
436 }
437 }
438 }
439 mutex_exit(&ring->vr_lock);
440 return (0);
441 }
442
443 static bool
444 viona_ring_map(viona_vring_t *ring, bool defer_dirty)
445 {
446 const uint16_t qsz = ring->vr_size;
447 uintptr_t pa = ring->vr_pa;
448
449 ASSERT3U(qsz, !=, 0);
450 ASSERT3U(qsz, <=, VRING_MAX_LEN);
451 ASSERT3U(pa, !=, 0);
452 ASSERT3U(pa & (LEGACY_VQ_ALIGN - 1), ==, 0);
453 ASSERT(MUTEX_HELD(&ring->vr_lock));
454 ASSERT3P(ring->vr_map_pages, ==, NULL);
455
456 const uint_t npages = LEGACY_VQ_PAGES(qsz);
457 ring->vr_map_pages = kmem_zalloc(npages * sizeof (void *), KM_SLEEP);
458
459 int page_flags = 0;
460 if (defer_dirty) {
461 /*
462 * During initialization, and when entering the paused state,
463 * the page holds for a virtqueue are established with the
464 * DEFER_DIRTY flag set.
465 *
466 * This prevents those page holds from immediately marking the
467 * underlying pages as dirty, since the viona emulation is not
468 * yet performing any accesses. Once the ring transitions to
469 * the VRS_RUN state, the held pages will be marked as dirty.
470 *
471 * Any ring mappings performed outside those state conditions,
472 * such as those part of vmm_lease renewal during steady-state
473 * operation, will map the ring pages normally (as considered
474 * immediately dirty).
475 */
476 page_flags |= VMPF_DEFER_DIRTY;
477 }
478
479 vmm_page_t *prev = NULL;
480 for (uint_t i = 0; i < npages; i++, pa += VQ_PGSZ) {
481 vmm_page_t *vmp;
482
483 vmp = vmm_drv_page_hold_ext(ring->vr_lease, pa,
484 PROT_READ | PROT_WRITE, page_flags);
485 if (vmp == NULL) {
486 viona_ring_unmap(ring);
487 return (false);
488 }
489
490 /*
491 * Keep the first page as the head of the chain, appending all
492 * subsequent pages to the tail.
493 */
494 if (prev == NULL) {
495 ring->vr_map_hold = vmp;
496 } else {
497 vmm_drv_page_chain(prev, vmp);
498 }
499 prev = vmp;
500 ring->vr_map_pages[i] = vmm_drv_page_writable(vmp);
501 }
502
503 return (true);
504 }
505
506 static void
507 viona_ring_mark_dirty(viona_vring_t *ring)
508 {
509 ASSERT(MUTEX_HELD(&ring->vr_lock));
510 ASSERT(ring->vr_map_hold != NULL);
511
512 for (vmm_page_t *vp = ring->vr_map_hold; vp != NULL;
513 vp = vmm_drv_page_next(vp)) {
514 vmm_drv_page_mark_dirty(vp);
515 }
516 }
517
518 static void
519 viona_ring_unmap(viona_vring_t *ring)
520 {
521 ASSERT(MUTEX_HELD(&ring->vr_lock));
522
523 void **map = ring->vr_map_pages;
524 if (map != NULL) {
525 const uint_t npages = LEGACY_VQ_PAGES(ring->vr_size);
526 kmem_free(map, npages * sizeof (void *));
527 ring->vr_map_pages = NULL;
528
529 vmm_drv_page_release_chain(ring->vr_map_hold);
530 ring->vr_map_hold = NULL;
531 } else {
532 ASSERT3P(ring->vr_map_hold, ==, NULL);
533 }
534 }
535
536 static inline void *
537 viona_ring_addr(viona_vring_t *ring, uint_t off)
538 {
539 ASSERT3P(ring->vr_map_pages, !=, NULL);
540 ASSERT3U(LEGACY_VQ_SIZE(ring->vr_size), >, off);
541
542 const uint_t page_num = off / VQ_PGSZ;
543 const uint_t page_off = off % VQ_PGSZ;
544 return ((caddr_t)ring->vr_map_pages[page_num] + page_off);
545 }
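/*
 * Illustrative only: with VQ_PGSZ at 4096, an offset of 8196 into the mapped
 * virtqueue resolves to byte 4 of vr_map_pages[2], regardless of whether the
 * backing guest pages happen to be host-contiguous.
 */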
546
547 void
548 viona_intr_ring(viona_vring_t *ring, boolean_t skip_flags_check)
549 {
550 if (!skip_flags_check) {
551 volatile uint16_t *avail_flags = viona_ring_addr(ring,
552 LEGACY_AVAIL_FLAGS_OFF(ring->vr_size));
553
554 if ((*avail_flags & VRING_AVAIL_F_NO_INTERRUPT) != 0) {
555 return;
556 }
557 }
558
559 mutex_enter(&ring->vr_lock);
560 uint64_t addr = ring->vr_msi_addr;
561 uint64_t msg = ring->vr_msi_msg;
562 mutex_exit(&ring->vr_lock);
563 if (addr != 0) {
564 /* Deliver the interrupt directly, if so configured... */
565 (void) vmm_drv_msi(ring->vr_lease, addr, msg);
566 } else {
567 /* ... otherwise, leave it to userspace */
568 if (atomic_cas_uint(&ring->vr_intr_enabled, 0, 1) == 0) {
569 pollwakeup(&ring->vr_link->l_pollhead, POLLRDBAND);
570 }
571 }
572 }
573
574 static inline bool
575 vring_stop_req(const viona_vring_t *ring)
576 {
577 return ((ring->vr_state_flags & VRSF_REQ_STOP) != 0);
578 }
579
580 static inline bool
581 vring_pause_req(const viona_vring_t *ring)
582 {
583 return ((ring->vr_state_flags & VRSF_REQ_PAUSE) != 0);
584 }
585
586 static inline bool
587 vring_start_req(const viona_vring_t *ring)
588 {
589 return ((ring->vr_state_flags & VRSF_REQ_START) != 0);
590 }
591
592 /*
593 * Check if vring worker thread should bail out. This will heed indications
594 * that the containing process is exiting, as well as requests to stop or pause
595 * the ring. The `stop_only` parameter controls if pause requests are ignored
596 * (true) or checked (false).
597 *
598 * Caller should hold vr_lock.
599 */
600 static bool
601 vring_need_bail_ext(const viona_vring_t *ring, bool stop_only)
602 {
603 ASSERT(MUTEX_HELD(&ring->vr_lock));
604
605 if (vring_stop_req(ring) ||
606 (!stop_only && vring_pause_req(ring))) {
607 return (true);
608 }
609
610 kthread_t *t = ring->vr_worker_thread;
611 if (t != NULL) {
612 proc_t *p = ttoproc(t);
613
614 ASSERT(p != NULL);
615 if ((p->p_flag & SEXITING) != 0) {
616 return (true);
617 }
618 }
619 return (false);
620 }
621
622 bool
623 vring_need_bail(const viona_vring_t *ring)
624 {
625 return (vring_need_bail_ext(ring, false));
626 }
627
628 int
629 viona_ring_pause(viona_vring_t *ring)
630 {
631 mutex_enter(&ring->vr_lock);
632 switch (ring->vr_state) {
633 case VRS_RESET:
634 case VRS_SETUP:
635 case VRS_INIT:
636 /*
637 * For rings which have not yet started (even those in the
638 * VRS_SETUP and VRS_INIT phases, where there is a running worker
639 * thread waiting to be released to do its intended task), it is
640 * adequate to simply clear any start request, keeping them from
641 * proceeding into the actual work processing function.
642 */
643 ring->vr_state_flags &= ~VRSF_REQ_START;
644 mutex_exit(&ring->vr_lock);
645 return (0);
646
647 case VRS_STOP:
648 if ((ring->vr_state_flags & VRSF_REQ_STOP) != 0) {
649 /* A ring on its way to RESET cannot be paused. */
650 mutex_exit(&ring->vr_lock);
651 return (EBUSY);
652 }
653 /* FALLTHROUGH */
654 case VRS_RUN:
655 ring->vr_state_flags |= VRSF_REQ_PAUSE;
656 cv_broadcast(&ring->vr_cv);
657 break;
658
659 default:
660 panic("invalid ring state %d", ring->vr_state);
661 break;
662 }
663
664 for (;;) {
665 int res = cv_wait_sig(&ring->vr_cv, &ring->vr_lock);
666
667 if (ring->vr_state == VRS_INIT ||
668 (ring->vr_state_flags & VRSF_REQ_PAUSE) == 0) {
669 /* Ring made it to (or through) paused state */
670 mutex_exit(&ring->vr_lock);
671 return (0);
672 }
673 if (res == 0) {
674 /* interrupted by signal */
675 mutex_exit(&ring->vr_lock);
676 return (EINTR);
677 }
678 }
679 /* NOTREACHED */
680 }
681
682 static void
683 viona_worker(void *arg)
684 {
685 viona_vring_t *ring = (viona_vring_t *)arg;
686 viona_link_t *link = ring->vr_link;
687
688 mutex_enter(&ring->vr_lock);
689 VERIFY3U(ring->vr_state, ==, VRS_SETUP);
690
691 /* Bail immediately if ring shutdown or process exit was requested */
692 if (vring_need_bail_ext(ring, true)) {
693 goto ring_reset;
694 }
695
696 /* Report worker thread as alive and notify creator */
697 ring_init:
698 ring->vr_state = VRS_INIT;
699 cv_broadcast(&ring->vr_cv);
700
701 while (!vring_start_req(ring)) {
702 /*
703 * Keeping lease renewals timely while waiting for the ring to
704 * be started is important for avoiding deadlocks.
705 */
706 if (vmm_drv_lease_expired(ring->vr_lease)) {
707 if (!viona_ring_lease_renew(ring)) {
708 goto ring_reset;
709 }
710 }
711
712 (void) cv_wait_sig(&ring->vr_cv, &ring->vr_lock);
713
714 if (vring_pause_req(ring)) {
715 /* We are already paused in the INIT state. */
716 ring->vr_state_flags &= ~VRSF_REQ_PAUSE;
717 }
718 if (vring_need_bail_ext(ring, true)) {
719 goto ring_reset;
720 }
721 }
722
723 ASSERT((ring->vr_state_flags & VRSF_REQ_START) != 0);
724 ring->vr_state = VRS_RUN;
725 ring->vr_state_flags &= ~VRSF_REQ_START;
726 viona_ring_mark_dirty(ring);
727
728 /* Ensure ring lease is valid first */
729 if (vmm_drv_lease_expired(ring->vr_lease)) {
730 if (!viona_ring_lease_renew(ring)) {
731 goto ring_reset;
732 }
733 }
734
735 /* Process actual work */
736 if (ring == &link->l_vrings[VIONA_VQ_RX]) {
737 viona_worker_rx(ring, link);
738 } else if (ring == &link->l_vrings[VIONA_VQ_TX]) {
739 viona_worker_tx(ring, link);
740 } else {
741 panic("unexpected ring: %p", (void *)ring);
742 }
743
744 VERIFY3U(ring->vr_state, ==, VRS_STOP);
745 VERIFY3U(ring->vr_xfer_outstanding, ==, 0);
746
747 /*
748 * Consolidate stats data so that it is not lost if/when this ring is
749 * being stopped.
750 */
751 viona_ring_consolidate_stats(ring);
752
753 /* Respond to a pause request if the ring is not required to stop */
754 if (vring_pause_req(ring)) {
755 ring->vr_state_flags &= ~VRSF_REQ_PAUSE;
756
757 if (vring_need_bail_ext(ring, true)) {
758 goto ring_reset;
759 }
760
761 /*
762 * To complete pausing of the ring, unmap and re-map the pages
763 * underpinning the virtqueue. This is to synchronize their
764 * dirty state in the backing page tables and restore the
765 * defer-dirty state on the held pages.
766 */
767 viona_ring_unmap(ring);
768 if (viona_ring_map(ring, true)) {
769 goto ring_init;
770 }
771
772 /*
773 * If the ring pages failed to be mapped, fallthrough to
774 * ring-reset like any other failure.
775 */
776 }
777
778 ring_reset:
779 viona_ring_misc_free(ring);
780
781 viona_ring_lease_drop(ring);
782 ring->vr_cur_aidx = 0;
783 ring->vr_size = 0;
784 ring->vr_mask = 0;
785 ring->vr_pa = 0;
786 ring->vr_state = VRS_RESET;
787 ring->vr_state_flags = 0;
788 ring->vr_worker_thread = NULL;
789 cv_broadcast(&ring->vr_cv);
790 mutex_exit(&ring->vr_lock);
791
792 mutex_enter(&ttoproc(curthread)->p_lock);
793 lwp_exit();
794 }
795
796 static kthread_t *
797 viona_create_worker(viona_vring_t *ring)
798 {
799 k_sigset_t hold_set;
800 proc_t *p = curproc;
801 kthread_t *t;
802 klwp_t *lwp;
803
804 ASSERT(MUTEX_HELD(&ring->vr_lock));
805 ASSERT(ring->vr_state == VRS_RESET);
806
807 sigfillset(&hold_set);
808 lwp = lwp_create(viona_worker, (void *)ring, 0, p, TS_STOPPED,
809 minclsyspri - 1, &hold_set, curthread->t_cid, 0);
810 if (lwp == NULL) {
811 return (NULL);
812 }
813
814 t = lwptot(lwp);
815 mutex_enter(&p->p_lock);
816 t->t_proc_flag = (t->t_proc_flag & ~TP_HOLDLWP) | TP_KTHREAD;
817 lwp_create_done(t);
818 mutex_exit(&p->p_lock);
819
820 return (t);
821 }
822
823 static inline void
824 vq_read_desc(viona_vring_t *ring, uint16_t idx, struct virtio_desc *descp)
825 {
826 const uint_t entry_off = idx * sizeof (struct virtio_desc);
827
828 ASSERT3U(idx, <, ring->vr_size);
829
830 /*
831 * On both legacy and 1.x VirtIO, the virtqueue descriptors are required
832 * to be aligned to at least 16 bytes (4k for legacy).
833 */
834 *descp = *(const struct virtio_desc *)viona_ring_addr(ring, entry_off);
835 }
836
837 static uint16_t
838 vq_read_avail(viona_vring_t *ring, uint16_t idx)
839 {
840 ASSERT3U(idx, <, ring->vr_size);
841
842 volatile uint16_t *avail_ent =
843 viona_ring_addr(ring, LEGACY_AVAIL_ENT_OFF(ring->vr_size, idx));
844 return (*avail_ent);
845 }
846
847 /*
848 * Given a buffer descriptor `desc`, attempt to map the pages backing that
849 * region of guest physical memory, taking into account that there are no
850 * guarantees about guest-contiguous pages being host-contiguous.
851 */
852 static int
853 vq_map_desc_bufs(viona_vring_t *ring, const struct virtio_desc *desc,
854 vq_held_region_t *region)
855 {
856 if (desc->vd_len == 0) {
857 VIONA_PROBE2(desc_bad_len, viona_vring_t *, ring,
858 uint32_t, desc->vd_len);
859 VIONA_RING_STAT_INCR(ring, desc_bad_len);
860 return (EINVAL);
861 } else if ((region->vhr_len + desc->vd_len) < region->vhr_len) {
862 VIONA_PROBE1(len_overflow, viona_vring_t *, ring);
863 VIONA_RING_STAT_INCR(ring, len_overflow);
864 return (EOVERFLOW);
865 }
866
867 int err = vq_region_hold(ring, desc->vd_addr, desc->vd_len,
868 (desc->vd_flags & VRING_DESC_F_WRITE) != 0, region);
869 if (err == 0) {
870 region->vhr_len += desc->vd_len;
871 } else if (err == E2BIG) {
872 VIONA_PROBE1(too_many_desc, viona_vring_t *, ring);
873 VIONA_RING_STAT_INCR(ring, too_many_desc);
874 } else if (err == EFAULT) {
875 VIONA_PROBE_BAD_RING_ADDR(ring, desc->vd_addr);
876 VIONA_RING_STAT_INCR(ring, bad_ring_addr);
877 }
878
879 return (err);
880 }
881
882 /*
883 * Walk an indirect buffer descriptor `desc`, attempting to map the pages
884 * backing the regions of guest memory covered by its constituent descriptors.
885 */
886 static int
887 vq_map_indir_desc_bufs(viona_vring_t *ring, const struct virtio_desc *desc,
888 vq_held_region_t *region)
889 {
890 const uint16_t indir_count = desc->vd_len / sizeof (struct virtio_desc);
891
892 if ((desc->vd_len & 0xf) != 0 || indir_count == 0 ||
893 indir_count > ring->vr_size ||
894 desc->vd_addr > (desc->vd_addr + desc->vd_len)) {
895 VIONA_PROBE2(indir_bad_len, viona_vring_t *, ring,
896 uint32_t, desc->vd_len);
897 VIONA_RING_STAT_INCR(ring, indir_bad_len);
898 return (EINVAL);
899 }
900
901 uint16_t indir_next = 0;
902 const uint8_t *buf = NULL;
903 uint64_t buf_gpa = UINT64_MAX;
904 vmm_page_t *vmp = NULL;
905 int err = 0;
906
907 for (;;) {
908 const uint64_t indir_gpa =
909 desc->vd_addr + (indir_next * sizeof (struct virtio_desc));
910 const uint64_t indir_page = indir_gpa & VQ_PGMASK;
911
912 /*
913 * Get a mapping for the page that the next indirect descriptor
914 * resides in, if it has not already been done.
915 */
916 if (indir_page != buf_gpa) {
917 if (vmp != NULL) {
918 vmm_drv_page_release(vmp);
919 }
920 vmp = vq_page_hold(ring, indir_page, false);
921 if (vmp == NULL) {
922 VIONA_PROBE_BAD_RING_ADDR(ring, indir_page);
923 VIONA_RING_STAT_INCR(ring, bad_ring_addr);
924 err = EFAULT;
925 break;
926 }
927 buf_gpa = indir_page;
928 buf = vmm_drv_page_readable(vmp);
929 }
930
931 /*
932 * A copy of the indirect descriptor is made here, rather than
933 * simply using a reference pointer. This prevents malicious or
934 * erroneous guest writes to the descriptor from fooling the
935 * flags/bounds verification through a race.
936 *
937 * While indirect descriptors do not have the same alignment
938 * requirements as those residing in the virtqueue itself, we
939 * are not concerned about unaligned access while viona remains
940 * x86-only.
941 */
942 struct virtio_desc vp = *(const struct virtio_desc *)
943 (buf + (indir_gpa - indir_page));
944
945 if (vp.vd_flags & VRING_DESC_F_INDIRECT) {
946 VIONA_PROBE1(indir_bad_nest, viona_vring_t *, ring);
947 VIONA_RING_STAT_INCR(ring, indir_bad_nest);
948 err = EINVAL;
949 break;
950 } else if (vp.vd_len == 0) {
951 VIONA_PROBE2(desc_bad_len, viona_vring_t *, ring,
952 uint32_t, vp.vd_len);
953 VIONA_RING_STAT_INCR(ring, desc_bad_len);
954 err = EINVAL;
955 break;
956 }
957
958 err = vq_map_desc_bufs(ring, &vp, region);
959 if (err != 0) {
960 break;
961 }
962
963 /* Successfully reached the end of the indirect chain */
964 if ((vp.vd_flags & VRING_DESC_F_NEXT) == 0) {
965 break;
966 }
967 if (region->vhr_idx >= region->vhr_niov) {
968 VIONA_PROBE1(too_many_desc, viona_vring_t *, ring);
969 VIONA_RING_STAT_INCR(ring, too_many_desc);
970 err = E2BIG;
971 break;
972 }
973
974 indir_next = vp.vd_next;
975 if (indir_next >= indir_count) {
976 VIONA_PROBE3(indir_bad_next, viona_vring_t *, ring,
977 uint16_t, indir_next, uint16_t, indir_count);
978 VIONA_RING_STAT_INCR(ring, indir_bad_next);
979 err = EINVAL;
980 break;
981 }
982 }
983
984 if (vmp != NULL) {
985 vmm_drv_page_release(vmp);
986 }
987 return (err);
988 }
989
990 int
991 vq_popchain(viona_vring_t *ring, struct iovec *iov, uint_t niov,
992 uint16_t *cookie, vmm_page_t **chain, uint32_t *len)
993 {
994 uint16_t ndesc, idx, head, next;
995 struct virtio_desc vdir;
996 vq_held_region_t region = {
997 .vhr_niov = niov,
998 .vhr_iov = iov,
999 };
1000
1001 ASSERT(iov != NULL);
1002 ASSERT(niov > 0 && niov < INT_MAX);
1003 ASSERT(*chain == NULL);
1004
1005 mutex_enter(&ring->vr_a_mutex);
1006 idx = ring->vr_cur_aidx;
1007 ndesc = viona_ring_num_avail(ring);
1008
1009 if (ndesc == 0) {
1010 mutex_exit(&ring->vr_a_mutex);
1011 return (0);
1012 }
1013 if (ndesc > ring->vr_size) {
1014 /*
1015 * Despite the fact that the guest has provided an 'avail_idx'
1016 * which indicates that an impossible number of descriptors are
1017 * available, continue on and attempt to process the next one.
1018 *
1019 * The transgression will not escape the probe or stats though.
1020 */
1021 VIONA_PROBE2(ndesc_too_high, viona_vring_t *, ring,
1022 uint16_t, ndesc);
1023 VIONA_RING_STAT_INCR(ring, ndesc_too_high);
1024 }
1025
1026 head = vq_read_avail(ring, idx & ring->vr_mask);
1027 next = head;
1028
1029 for (region.vhr_idx = 0; region.vhr_idx < niov; next = vdir.vd_next) {
1030 if (next >= ring->vr_size) {
1031 VIONA_PROBE2(bad_idx, viona_vring_t *, ring,
1032 uint16_t, next);
1033 VIONA_RING_STAT_INCR(ring, bad_idx);
1034 break;
1035 }
1036
1037 vq_read_desc(ring, next, &vdir);
1038 if ((vdir.vd_flags & VRING_DESC_F_INDIRECT) == 0) {
1039 if (vq_map_desc_bufs(ring, &vdir, &region) != 0) {
1040 break;
1041 }
1042 } else {
1043 /*
1044 * Per the specification (Virtio 1.1 S2.6.5.3.1):
1045 * A driver MUST NOT set both VIRTQ_DESC_F_INDIRECT
1046 * and VIRTQ_DESC_F_NEXT in `flags`.
1047 */
1048 if ((vdir.vd_flags & VRING_DESC_F_NEXT) != 0) {
1049 VIONA_PROBE3(indir_bad_next,
1050 viona_vring_t *, ring,
1051 uint16_t, next, uint16_t, 0);
1052 VIONA_RING_STAT_INCR(ring, indir_bad_next);
1053 break;
1054 }
1055
1056 if (vq_map_indir_desc_bufs(ring, &vdir, &region) != 0) {
1057 break;
1058 }
1059 }
1060
1061 if ((vdir.vd_flags & VRING_DESC_F_NEXT) == 0) {
1062 ring->vr_cur_aidx++;
1063 mutex_exit(&ring->vr_a_mutex);
1064
1065 *cookie = head;
1066 *chain = region.vhr_head;
1067 if (len != NULL) {
1068 *len = region.vhr_len;
1069 }
1070 return (region.vhr_idx);
1071 }
1072 }
1073
1074 mutex_exit(&ring->vr_a_mutex);
1075 if (region.vhr_head != NULL) {
1076 /*
1077 * If any pages were held prior to encountering an error, we
1078 * must release them now.
1079 */
1080 vmm_drv_page_release_chain(region.vhr_head);
1081 }
1082 return (-1);
1083 }
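/*
 * Hypothetical caller sketch (buffer sizing and error handling are
 * illustrative, not lifted from the RX/TX workers): pop a descriptor chain,
 * consume the mapped buffers, then post the result and release the held
 * pages.
 *
 *	struct iovec iov[8];
 *	vmm_page_t *chain = NULL;
 *	uint16_t cookie;
 *	uint32_t len;
 *	int n = vq_popchain(ring, iov, 8, &cookie, &chain, &len);
 *	if (n > 0) {
 *		(transfer data to/from iov[0 .. n-1])
 *		vq_pushchain(ring, bytes_used, cookie);
 *		vmm_drv_page_release_chain(chain);
 *	}
 */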
1084
1085
1086 static void
1087 vq_write_used_ent(viona_vring_t *ring, uint16_t idx, uint16_t cookie,
1088 uint32_t len)
1089 {
1090 /*
1091 * In a larger ring, an entry could be split across pages, so be
1092 * sure to account for that when configuring the transfer by looking
1093 * up the ID and length addresses separately, rather than using an
1094 * address for a combined `struct virtio_used`.
1095 */
1096 const uint_t used_id_off = LEGACY_USED_ENT_OFF(ring->vr_size, idx);
1097 const uint_t used_len_off = used_id_off + sizeof (uint32_t);
1098 volatile uint32_t *idp = viona_ring_addr(ring, used_id_off);
1099 volatile uint32_t *lenp = viona_ring_addr(ring, used_len_off);
1100
1101 ASSERT(MUTEX_HELD(&ring->vr_u_mutex));
1102
1103 *idp = cookie;
1104 *lenp = len;
1105 }
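/*
 * Example of the split noted above (illustrative only): the used ring starts
 * page-aligned, with flags/idx occupying the first 4 bytes and 8-byte entries
 * following, so in a large enough ring the entry at index 511 begins at byte
 * 4092 of its page and spills 4 bytes onto the next page.
 */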
1106
1107 static void
1108 vq_write_used_idx(viona_vring_t *ring, uint16_t idx)
1109 {
1110 ASSERT(MUTEX_HELD(&ring->vr_u_mutex));
1111
1112 volatile uint16_t *used_idx =
1113 viona_ring_addr(ring, LEGACY_USED_IDX_OFF(ring->vr_size));
1114 *used_idx = idx;
1115 }
1116
1117 void
1118 vq_pushchain(viona_vring_t *ring, uint32_t len, uint16_t cookie)
1119 {
1120 uint16_t uidx;
1121
1122 mutex_enter(&ring->vr_u_mutex);
1123
1124 uidx = ring->vr_cur_uidx;
1125 vq_write_used_ent(ring, uidx & ring->vr_mask, cookie, len);
1126 uidx++;
1127 membar_producer();
1128
1129 vq_write_used_idx(ring, uidx);
1130 ring->vr_cur_uidx = uidx;
1131
1132 mutex_exit(&ring->vr_u_mutex);
1133 }
1134
1135 void
1136 vq_pushchain_many(viona_vring_t *ring, uint_t num_bufs, used_elem_t *elem)
1137 {
1138 uint16_t uidx;
1139
1140 mutex_enter(&ring->vr_u_mutex);
1141
1142 uidx = ring->vr_cur_uidx;
1143
1144 for (uint_t i = 0; i < num_bufs; i++, uidx++) {
1145 vq_write_used_ent(ring, uidx & ring->vr_mask, elem[i].id,
1146 elem[i].len);
1147 }
1148
1149 membar_producer();
1150 vq_write_used_idx(ring, uidx);
1151 ring->vr_cur_uidx = uidx;
1152
1153 mutex_exit(&ring->vr_u_mutex);
1154 }
1155
1156 /*
1157 * Set USED_NO_NOTIFY on VQ so guest elides doorbell calls for new entries.
1158 */
1159 void
1160 viona_ring_disable_notify(viona_vring_t *ring)
1161 {
1162 volatile uint16_t *used_flags =
1163 viona_ring_addr(ring, LEGACY_USED_FLAGS_OFF(ring->vr_size));
1164
1165 *used_flags |= VRING_USED_F_NO_NOTIFY;
1166 }
1167
1168 /*
1169 * Clear USED_NO_NOTIFY on VQ so guest resumes doorbell calls for new entries.
1170 */
1171 void
1172 viona_ring_enable_notify(viona_vring_t *ring)
1173 {
1174 volatile uint16_t *used_flags =
1175 viona_ring_addr(ring, LEGACY_USED_FLAGS_OFF(ring->vr_size));
1176
1177 *used_flags &= ~VRING_USED_F_NO_NOTIFY;
1178 }
1179
1180 /*
1181 * Return the number of available descriptors in the vring taking care of the
1182 * 16-bit index wraparound.
1183 *
1184 * Note: If the number of apparently available descriptors is larger than the
1185 * ring size (due to guest misbehavior), this check will still report the
1186 * positive count of descriptors.
1187 */
1188 uint16_t
1189 viona_ring_num_avail(viona_vring_t *ring)
1190 {
1191 volatile uint16_t *avail_idx =
1192 viona_ring_addr(ring, LEGACY_AVAIL_IDX_OFF(ring->vr_size));
1193
1194 return (*avail_idx - ring->vr_cur_aidx);
1195 }
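/*
 * Illustrative wraparound case: with vr_cur_aidx at 65534 and the guest
 * having advanced avail_idx to 2, the unsigned 16-bit subtraction above
 * yields 4 available descriptors despite the numeric wrap.
 */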
1196
1197 /* Record successfully transferred packet(s) for the ring stats */
1198 void
1199 viona_ring_stat_accept(viona_vring_t *ring, size_t count, size_t len)
1200 {
1201 atomic_add_64(&ring->vr_stats.vts_packets, count);
1202 atomic_add_64(&ring->vr_stats.vts_bytes, len);
1203 }
1204
1205 /*
1206 * Record dropped packet(s) in the ring stats
1207 */
1208 void
1209 viona_ring_stat_drop(viona_vring_t *ring, size_t count)
1210 {
1211 atomic_add_64(&ring->vr_stats.vts_drops, count);
1212 }
1213
1214 /*
1215 * Record a packet transfer error in the ring stats
1216 */
1217 void
1218 viona_ring_stat_error(viona_vring_t *ring)
1219 {
1220 atomic_inc_64(&ring->vr_stats.vts_errors);
1221 }
1222
1223 /*
1224 * Consolidate statistic data for this ring into the totals for the link
1225 */
1226 static void
1227 viona_ring_consolidate_stats(viona_vring_t *ring)
1228 {
1229 viona_link_t *link = ring->vr_link;
1230 struct viona_transfer_stats *lstat =
1231 (ring == &link->l_vrings[VIONA_VQ_RX]) ?
1232 &link->l_stats.vls_rx : &link->l_stats.vls_tx;
1233
1234 mutex_enter(&link->l_stats_lock);
1235 lstat->vts_packets += ring->vr_stats.vts_packets;
1236 lstat->vts_bytes += ring->vr_stats.vts_bytes;
1237 lstat->vts_drops += ring->vr_stats.vts_drops;
1238 lstat->vts_errors += ring->vr_stats.vts_errors;
1239 bzero(&ring->vr_stats, sizeof (ring->vr_stats));
1240 mutex_exit(&link->l_stats_lock);
1241 }
1242
1243 /*
1244 * Copy `sz` bytes from iovecs contained in `iob` to `dst`.
1245 *
1246 * Returns `true` if copy was successful (implying adequate data was remaining
1247 * in the iov_bunch_t).
1248 */
1249 bool
1250 iov_bunch_copy(iov_bunch_t *iob, void *dst, uint32_t sz)
1251 {
1252 if (sz > iob->ib_remain) {
1253 return (false);
1254 }
1255 if (sz == 0) {
1256 return (true);
1257 }
1258
1259 caddr_t dest = dst;
1260 do {
1261 struct iovec *iov = iob->ib_iov;
1262
1263 ASSERT3U(iov->iov_len, <, UINT32_MAX);
1264 ASSERT3U(iov->iov_len, !=, 0);
1265
1266 const uint32_t iov_avail = (iov->iov_len - iob->ib_offset);
1267 const uint32_t to_copy = MIN(sz, iov_avail);
1268
1269 if (to_copy != 0) {
1270 bcopy((caddr_t)iov->iov_base + iob->ib_offset, dest,
1271 to_copy);
1272 }
1273
1274 sz -= to_copy;
1275 iob->ib_remain -= to_copy;
1276 dest += to_copy;
1277 iob->ib_offset += to_copy;
1278
1279 ASSERT3U(iob->ib_offset, <=, iov->iov_len);
1280
1281 if (iob->ib_offset == iov->iov_len) {
1282 iob->ib_iov++;
1283 iob->ib_offset = 0;
1284 }
1285 } while (sz > 0);
1286
1287 return (true);
1288 }
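/*
 * Hypothetical usage sketch (struct and variable names are illustrative, not
 * taken from the packet-processing code): copy a fixed-size header out of the
 * front of a chain of mapped buffers.
 *
 *	struct virtio_net_hdr hdr;
 *	iov_bunch_t iob = {
 *		.ib_iov = iov,
 *		.ib_remain = total_len,
 *	};
 *	if (!iov_bunch_copy(&iob, &hdr, sizeof (hdr))) {
 *		return (EINVAL);
 *	}
 */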
1289
1290 /*
1291 * Get the data pointer and length of the current head iovec, less any
1292 * offsetting from prior copy operations. This will advance the iov_bunch_t as
1293 * if the caller had performed a copy of that chunk length.
1294 *
1295 * Returns `true` if the iov_bunch_t had at least one iovec (unconsumed bytes)
1296 * remaining, setting `chunk` and `chunk_sz` to the chunk pointer and size,
1297 * respectively.
1298 */
1299 bool
1300 iov_bunch_next_chunk(iov_bunch_t *iob, caddr_t *chunk, uint32_t *chunk_sz)
1301 {
1302 if (iob->ib_remain == 0) {
1303 *chunk = NULL;
1304 *chunk_sz = 0;
1305 return (false);
1306 }
1307
1308 *chunk_sz = iob->ib_iov->iov_len - iob->ib_offset;
1309 *chunk = (caddr_t)iob->ib_iov->iov_base + iob->ib_offset;
1310 iob->ib_remain -= *chunk_sz;
1311 iob->ib_iov++;
1312 iob->ib_offset = 0;
1313 return (true);
1314 }
1315