1 /*
2 * Copyright (c) 2013 Chris Torek <torek @ torek net>
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 *
14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24 * SUCH DAMAGE.
25 */
26 /*
27 * This file and its contents are supplied under the terms of the
28 * Common Development and Distribution License ("CDDL"), version 1.0.
29 * You may only use this file in accordance with the terms of version
30 * 1.0 of the CDDL.
31 *
32 * A full copy of the text of the CDDL should have accompanied this
33 * source. A copy of the CDDL is also available via the Internet at
34 * http://www.illumos.org/license/CDDL.
35 *
36 * Copyright 2015 Pluribus Networks Inc.
37 * Copyright 2019 Joyent, Inc.
38 * Copyright 2024 Oxide Computer Company
39 */
40
41
42 #include <sys/disp.h>
43
44 #include "viona_impl.h"
45
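/*
 * Maximum size (in descriptors) accepted for a virtqueue.  Ring sizes are
 * validated in viona_ring_init() to be non-zero powers of two no larger than
 * this limit.
 */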
46 #define VRING_MAX_LEN 32768
47
48 /* Layout and sizing as defined in the spec for a legacy-style virtqueue */
49
50 #define LEGACY_VQ_ALIGN PAGESIZE
51
52 #define LEGACY_DESC_SZ(qsz) ((qsz) * sizeof (struct virtio_desc))
53 /*
54 * Available ring consists of avail_idx (uint16_t), flags (uint16_t), qsz avail
55 * descriptors (uint16_t each), and (optional) used_event (uint16_t).
56 */
57 #define LEGACY_AVAIL_SZ(qsz) (((qsz) + 3) * sizeof (uint16_t))
58 /*
59 * Used ring consists of used_idx (uint16_t), flags (uint16_t), qsz used
60 * descriptors (two uint32_t each), and (optional) avail_event (uint16_t).
61 */
62 #define LEGACY_USED_SZ(qsz) \
63 ((qsz) * sizeof (struct virtio_used) + 3 * sizeof (uint16_t))
64
65 #define LEGACY_AVAIL_FLAGS_OFF(qsz) LEGACY_DESC_SZ(qsz)
66 #define LEGACY_AVAIL_IDX_OFF(qsz) \
67 (LEGACY_DESC_SZ(qsz) + sizeof (uint16_t))
68 #define LEGACY_AVAIL_ENT_OFF(qsz, idx) \
69 (LEGACY_DESC_SZ(qsz) + (2 + (idx)) * sizeof (uint16_t))
70
71 #define LEGACY_USED_FLAGS_OFF(qsz) \
72 P2ROUNDUP(LEGACY_DESC_SZ(qsz) + LEGACY_AVAIL_SZ(qsz), LEGACY_VQ_ALIGN)
73 #define LEGACY_USED_IDX_OFF(qsz) \
74 (LEGACY_USED_FLAGS_OFF(qsz) + sizeof (uint16_t))
75 #define LEGACY_USED_ENT_OFF(qsz, idx) \
76 (LEGACY_USED_FLAGS_OFF(qsz) + 2 * sizeof (uint16_t) + \
77 (idx) * sizeof (struct virtio_used))
78
79 #define LEGACY_VQ_SIZE(qsz) \
80 (LEGACY_USED_FLAGS_OFF(qsz) + \
81 P2ROUNDUP(LEGACY_USED_SZ(qsz), LEGACY_VQ_ALIGN))
82 #define LEGACY_VQ_PAGES(qsz) (LEGACY_VQ_SIZE(qsz) / PAGESIZE)
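/*
 * Putting the macros above together, the legacy virtqueue is laid out as: the
 * descriptor table, then the available ring (flags, idx, ring entries, and
 * optional used_event), padded out to LEGACY_VQ_ALIGN, followed by the used
 * ring (flags, idx, ring entries, and optional avail_event), again padded to
 * LEGACY_VQ_ALIGN.
 */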
83
84 struct vq_held_region {
85 struct iovec *vhr_iov;
86 vmm_page_t *vhr_head;
87 vmm_page_t *vhr_tail;
88 /* Length of iovec array supplied in `vhr_iov` */
89 uint_t vhr_niov;
90 /*
91 * Index into vhr_iov, indicating the next "free" entry (following the
92 * last entry which has valid contents).
93 */
94 uint_t vhr_idx;
95
96 /* Total length of populated entries in `vhr_iov` */
97 uint32_t vhr_len;
98 };
99 typedef struct vq_held_region vq_held_region_t;
100
101 static bool viona_ring_map(viona_vring_t *, bool);
102 static void viona_ring_unmap(viona_vring_t *);
103 static kthread_t *viona_create_worker(viona_vring_t *);
104 static void viona_ring_consolidate_stats(viona_vring_t *);
105
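/*
 * Hold a single page of guest memory at `gpa` through the ring's current
 * lease, with write access included only when requested.
 */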
106 static vmm_page_t *
107 vq_page_hold(viona_vring_t *ring, uint64_t gpa, bool writable)
108 {
109 ASSERT3P(ring->vr_lease, !=, NULL);
110
111 int prot = PROT_READ;
112 if (writable) {
113 prot |= PROT_WRITE;
114 }
115
116 return (vmm_drv_page_hold(ring->vr_lease, gpa, prot));
117 }
118
119 /*
120 * Establish a hold on the page(s) which back the region of guest memory covered
121 * by [gpa, gpa + len). The host-kernel-virtual pointers to those pages are
122 * stored in the iovec array supplied in `region`, along with the chain of
123 * vmm_page_t entries representing the held pages. Since guest memory
124 * carries no guarantees of being physically contiguous (on the host), it is
125 * assumed that an iovec entry will be required for each PAGESIZE section
126 * covered by the specified `gpa` and `len` range. For each iovec entry
127 * successfully populated by holding a page, `vhr_idx` will be incremented so it
128 * references the next available iovec entry (or `vhr_niov`, if the iovec array
129 * is full). The responsibility for releasing the `vmm_page_t` chain (stored in
130 * `vhr_head` and `vhr_tail`) resides with the caller, regardless of the result.
131 */
132 static int
133 vq_region_hold(viona_vring_t *ring, uint64_t gpa, uint32_t len,
134 bool writable, vq_held_region_t *region)
135 {
136 const uint32_t front_offset = gpa & PAGEOFFSET;
137 const uint32_t front_len = MIN(len, PAGESIZE - front_offset);
138 uint_t pages = 1;
139 vmm_page_t *vmp;
140 caddr_t buf;
141
142 ASSERT3U(region->vhr_idx, <, region->vhr_niov);
143
144 if (front_len < len) {
145 pages += P2ROUNDUP((uint64_t)(len - front_len),
146 PAGESIZE) / PAGESIZE;
147 }
148 if (pages > (region->vhr_niov - region->vhr_idx)) {
149 return (E2BIG);
150 }
151
152 vmp = vq_page_hold(ring, gpa & PAGEMASK, writable);
153 if (vmp == NULL) {
154 return (EFAULT);
155 }
156 buf = (caddr_t)vmm_drv_page_readable(vmp);
157
158 region->vhr_iov[region->vhr_idx].iov_base = buf + front_offset;
159 region->vhr_iov[region->vhr_idx].iov_len = front_len;
160 region->vhr_idx++;
161 gpa += front_len;
162 len -= front_len;
163 if (region->vhr_head == NULL) {
164 region->vhr_head = vmp;
165 region->vhr_tail = vmp;
166 } else {
167 vmm_drv_page_chain(region->vhr_tail, vmp);
168 region->vhr_tail = vmp;
169 }
170
171 for (uint_t i = 1; i < pages; i++) {
172 ASSERT3U(gpa & PAGEOFFSET, ==, 0);
173
174 vmp = vq_page_hold(ring, gpa, writable);
175 if (vmp == NULL) {
176 return (EFAULT);
177 }
178 buf = (caddr_t)vmm_drv_page_readable(vmp);
179
180 const uint32_t chunk_len = MIN(len, PAGESIZE);
181 region->vhr_iov[region->vhr_idx].iov_base = buf;
182 region->vhr_iov[region->vhr_idx].iov_len = chunk_len;
183 region->vhr_idx++;
184 gpa += chunk_len;
185 len -= chunk_len;
186 vmm_drv_page_chain(region->vhr_tail, vmp);
187 region->vhr_tail = vmp;
188 }
189
190 return (0);
191 }
192
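/*
 * Callback invoked when the ring's vmm lease is expiring: wake anyone waiting
 * on the ring CV so the worker can notice the expiration and renew (or bail).
 */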
193 static boolean_t
194 viona_ring_lease_expire_cb(void *arg)
195 {
196 viona_vring_t *ring = arg;
197
198 mutex_enter(&ring->vr_lock);
199 cv_broadcast(&ring->vr_cv);
200 mutex_exit(&ring->vr_lock);
201
202 /* The lease will be broken asynchronously. */
203 return (B_FALSE);
204 }
205
206 static void
207 viona_ring_lease_drop(viona_vring_t *ring)
208 {
209 ASSERT(MUTEX_HELD(&ring->vr_lock));
210
211 if (ring->vr_lease != NULL) {
212 vmm_hold_t *hold = ring->vr_link->l_vm_hold;
213
214 ASSERT(hold != NULL);
215
216 /*
217 * Without an active lease, the ring mappings cannot be
218 * considered valid.
219 */
220 viona_ring_unmap(ring);
221
222 vmm_drv_lease_break(hold, ring->vr_lease);
223 ring->vr_lease = NULL;
224 }
225 }
226
227 boolean_t
228 viona_ring_lease_renew(viona_vring_t *ring)
229 {
230 vmm_hold_t *hold = ring->vr_link->l_vm_hold;
231
232 ASSERT(hold != NULL);
233 ASSERT(MUTEX_HELD(&ring->vr_lock));
234
235 viona_ring_lease_drop(ring);
236
237 /*
238 * Lease renewal will fail if the VM has requested that all holds be
239 * cleaned up.
240 */
241 ring->vr_lease = vmm_drv_lease_sign(hold, viona_ring_lease_expire_cb,
242 ring);
243 if (ring->vr_lease != NULL) {
244 /* A ring undergoing renewal will need valid guest mappings */
245 if (ring->vr_pa != 0 && ring->vr_size != 0) {
246 /*
247 * If new mappings cannot be established, consider the
248 * lease renewal a failure.
249 */
250 if (!viona_ring_map(ring, ring->vr_state == VRS_INIT)) {
251 viona_ring_lease_drop(ring);
252 return (B_FALSE);
253 }
254 }
255 }
256 return (ring->vr_lease != NULL);
257 }
258
259 void
260 viona_ring_alloc(viona_link_t *link, viona_vring_t *ring)
261 {
262 ring->vr_link = link;
263 mutex_init(&ring->vr_lock, NULL, MUTEX_DRIVER, NULL);
264 cv_init(&ring->vr_cv, NULL, CV_DRIVER, NULL);
265 mutex_init(&ring->vr_a_mutex, NULL, MUTEX_DRIVER, NULL);
266 mutex_init(&ring->vr_u_mutex, NULL, MUTEX_DRIVER, NULL);
267 }
268
269 static void
270 viona_ring_misc_free(viona_vring_t *ring)
271 {
272 const uint_t qsz = ring->vr_size;
273
274 viona_tx_ring_free(ring, qsz);
275 }
276
277 void
278 viona_ring_free(viona_vring_t *ring)
279 {
280 mutex_destroy(&ring->vr_lock);
281 cv_destroy(&ring->vr_cv);
282 mutex_destroy(&ring->vr_a_mutex);
283 mutex_destroy(&ring->vr_u_mutex);
284 ring->vr_link = NULL;
285 }
286
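/*
 * Initialize a ring with the provided parameters: validate the size and
 * address, acquire a lease on the VM, map the virtqueue pages, seed the
 * avail/used indexes, and launch the worker thread for the ring.
 */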
287 int
288 viona_ring_init(viona_link_t *link, uint16_t idx,
289 const struct viona_ring_params *params)
290 {
291 viona_vring_t *ring;
292 kthread_t *t;
293 int err = 0;
294 const uint16_t qsz = params->vrp_size;
295 const uint64_t pa = params->vrp_pa;
296
297 if (idx >= VIONA_VQ_MAX) {
298 return (EINVAL);
299 }
300
301 if (qsz == 0 || qsz > VRING_MAX_LEN || (1 << (ffs(qsz) - 1)) != qsz) {
302 return (EINVAL);
303 }
304 if ((pa & (LEGACY_VQ_ALIGN - 1)) != 0) {
305 return (EINVAL);
306 }
307
308 ring = &link->l_vrings[idx];
309 mutex_enter(&ring->vr_lock);
310 if (ring->vr_state != VRS_RESET) {
311 mutex_exit(&ring->vr_lock);
312 return (EBUSY);
313 }
314 VERIFY(ring->vr_state_flags == 0);
315
316 ring->vr_lease = NULL;
317 if (!viona_ring_lease_renew(ring)) {
318 err = EBUSY;
319 goto fail;
320 }
321
322 ring->vr_size = qsz;
323 ring->vr_mask = (ring->vr_size - 1);
324 ring->vr_pa = pa;
325 if (!viona_ring_map(ring, true)) {
326 err = EINVAL;
327 goto fail;
328 }
329
330 /* Initialize queue indexes */
331 ring->vr_cur_aidx = params->vrp_avail_idx;
332 ring->vr_cur_uidx = params->vrp_used_idx;
333
334 if (idx == VIONA_VQ_TX) {
335 viona_tx_ring_alloc(ring, qsz);
336 }
337
338 /* Zero out MSI-X configuration */
339 ring->vr_msi_addr = 0;
340 ring->vr_msi_msg = 0;
341
342 /* Clear the stats */
343 bzero(&ring->vr_stats, sizeof (ring->vr_stats));
344 bzero(&ring->vr_err_stats, sizeof (ring->vr_err_stats));
345
346 t = viona_create_worker(ring);
347 if (t == NULL) {
348 err = ENOMEM;
349 goto fail;
350 }
351 ring->vr_worker_thread = t;
352 ring->vr_state = VRS_SETUP;
353 cv_broadcast(&ring->vr_cv);
354 mutex_exit(&ring->vr_lock);
355 return (0);
356
357 fail:
358 viona_ring_lease_drop(ring);
359 viona_ring_misc_free(ring);
360 ring->vr_size = 0;
361 ring->vr_mask = 0;
362 ring->vr_pa = 0;
363 ring->vr_cur_aidx = 0;
364 ring->vr_cur_uidx = 0;
365 mutex_exit(&ring->vr_lock);
366 return (err);
367 }
368
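/* Report the current ring parameters (size, PA, avail/used indexes). */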
369 int
370 viona_ring_get_state(viona_link_t *link, uint16_t idx,
371 struct viona_ring_params *params)
372 {
373 viona_vring_t *ring;
374
375 if (idx >= VIONA_VQ_MAX) {
376 return (EINVAL);
377 }
378
379 ring = &link->l_vrings[idx];
380 mutex_enter(&ring->vr_lock);
381
382 params->vrp_size = ring->vr_size;
383 params->vrp_pa = ring->vr_pa;
384
385 if (ring->vr_state == VRS_RUN) {
386 /* On a running ring, we must heed the avail/used locks */
387 mutex_enter(&ring->vr_a_mutex);
388 params->vrp_avail_idx = ring->vr_cur_aidx;
389 mutex_exit(&ring->vr_a_mutex);
390 mutex_enter(&ring->vr_u_mutex);
391 params->vrp_used_idx = ring->vr_cur_uidx;
392 mutex_exit(&ring->vr_u_mutex);
393 } else {
394 /* Otherwise vr_lock is adequate protection */
395 params->vrp_avail_idx = ring->vr_cur_aidx;
396 params->vrp_used_idx = ring->vr_cur_uidx;
397 }
398
399 mutex_exit(&ring->vr_lock);
400
401 return (0);
402 }
403
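/*
 * Request that a ring stop, waiting until it has returned to the VRS_RESET
 * state.  If `heed_signals` is set, a pending signal may interrupt the wait,
 * in which case EINTR is returned.
 */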
404 int
405 viona_ring_reset(viona_vring_t *ring, boolean_t heed_signals)
406 {
407 mutex_enter(&ring->vr_lock);
408 if (ring->vr_state == VRS_RESET) {
409 mutex_exit(&ring->vr_lock);
410 return (0);
411 }
412
413 if ((ring->vr_state_flags & VRSF_REQ_STOP) == 0) {
414 ring->vr_state_flags |= VRSF_REQ_STOP;
415 cv_broadcast(&ring->vr_cv);
416 }
417 while (ring->vr_state != VRS_RESET) {
418 if (!heed_signals) {
419 cv_wait(&ring->vr_cv, &ring->vr_lock);
420 } else {
421 int rs;
422
423 rs = cv_wait_sig(&ring->vr_cv, &ring->vr_lock);
424 if (rs <= 0 && ring->vr_state != VRS_RESET) {
425 mutex_exit(&ring->vr_lock);
426 return (EINTR);
427 }
428 }
429 }
430 mutex_exit(&ring->vr_lock);
431 return (0);
432 }
433
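/*
 * Hold and map the guest pages backing the virtqueue so its contents can be
 * accessed from the kernel.  See the comment below regarding `defer_dirty`
 * and how dirty-tracking of those held pages is handled.
 */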
434 static bool
435 viona_ring_map(viona_vring_t *ring, bool defer_dirty)
436 {
437 const uint16_t qsz = ring->vr_size;
438 uintptr_t pa = ring->vr_pa;
439
440 ASSERT3U(qsz, !=, 0);
441 ASSERT3U(qsz, <=, VRING_MAX_LEN);
442 ASSERT3U(pa, !=, 0);
443 ASSERT3U(pa & (LEGACY_VQ_ALIGN - 1), ==, 0);
444 ASSERT3U(LEGACY_VQ_ALIGN, ==, PAGESIZE);
445 ASSERT(MUTEX_HELD(&ring->vr_lock));
446 ASSERT3P(ring->vr_map_pages, ==, NULL);
447
448 const uint_t npages = LEGACY_VQ_PAGES(qsz);
449 ring->vr_map_pages = kmem_zalloc(npages * sizeof (void *), KM_SLEEP);
450
451 int page_flags = 0;
452 if (defer_dirty) {
453 /*
454 * During initialization, and when entering the paused state,
455 * the page holds for a virtqueue are established with the
456 * DEFER_DIRTY flag set.
457 *
458 * This prevents those page holds from immediately marking the
459 * underlying pages as dirty, since the viona emulation is not
460 * yet performing any accesses. Once the ring transitions to
461 * the VRS_RUN state, the held pages will be marked as dirty.
462 *
463 * Any ring mappings performed outside those state conditions,
464 * such as those done as part of vmm_lease renewal during steady-state
465 * operation, will map the ring pages normally (as considered
466 * immediately dirty).
467 */
468 page_flags |= VMPF_DEFER_DIRTY;
469 }
470
471 vmm_page_t *prev = NULL;
472 for (uint_t i = 0; i < npages; i++, pa += PAGESIZE) {
473 vmm_page_t *vmp;
474
475 vmp = vmm_drv_page_hold_ext(ring->vr_lease, pa,
476 PROT_READ | PROT_WRITE, page_flags);
477 if (vmp == NULL) {
478 viona_ring_unmap(ring);
479 return (false);
480 }
481
482 /*
483 * Keep the first page as the head of the chain, appending all
484 * subsequent pages to the tail.
485 */
486 if (prev == NULL) {
487 ring->vr_map_hold = vmp;
488 } else {
489 vmm_drv_page_chain(prev, vmp);
490 }
491 prev = vmp;
492 ring->vr_map_pages[i] = vmm_drv_page_writable(vmp);
493 }
494
495 return (true);
496 }
497
498 static void
499 viona_ring_mark_dirty(viona_vring_t *ring)
500 {
501 ASSERT(MUTEX_HELD(&ring->vr_lock));
502 ASSERT(ring->vr_map_hold != NULL);
503
504 for (vmm_page_t *vp = ring->vr_map_hold; vp != NULL;
505 vp = vmm_drv_page_next(vp)) {
506 vmm_drv_page_mark_dirty(vp);
507 }
508 }
509
510 static void
511 viona_ring_unmap(viona_vring_t *ring)
512 {
513 ASSERT(MUTEX_HELD(&ring->vr_lock));
514
515 void **map = ring->vr_map_pages;
516 if (map != NULL) {
517 const uint_t npages = LEGACY_VQ_PAGES(ring->vr_size);
518 kmem_free(map, npages * sizeof (void *));
519 ring->vr_map_pages = NULL;
520
521 vmm_drv_page_release_chain(ring->vr_map_hold);
522 ring->vr_map_hold = NULL;
523 } else {
524 ASSERT3P(ring->vr_map_hold, ==, NULL);
525 }
526 }
527
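/*
 * Translate an offset into the virtqueue to a kernel-virtual address, using
 * the table of per-page mappings established by viona_ring_map().
 */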
528 static inline void *
529 viona_ring_addr(viona_vring_t *ring, uint_t off)
530 {
531 ASSERT3P(ring->vr_map_pages, !=, NULL);
532 ASSERT3U(LEGACY_VQ_SIZE(ring->vr_size), >, off);
533
534 const uint_t page_num = off / PAGESIZE;
535 const uint_t page_off = off % PAGESIZE;
536 return ((caddr_t)ring->vr_map_pages[page_num] + page_off);
537 }
538
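/*
 * Deliver an interrupt for this ring to the guest: directly via MSI when so
 * configured, otherwise by waking userspace through the poll interface.
 * Unless `skip_flags_check` is set, honor VRING_AVAIL_F_NO_INTERRUPT.
 */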
539 void
540 viona_intr_ring(viona_vring_t *ring, boolean_t skip_flags_check)
541 {
542 if (!skip_flags_check) {
543 volatile uint16_t *avail_flags = viona_ring_addr(ring,
544 LEGACY_AVAIL_FLAGS_OFF(ring->vr_size));
545
546 if ((*avail_flags & VRING_AVAIL_F_NO_INTERRUPT) != 0) {
547 return;
548 }
549 }
550
551 mutex_enter(&ring->vr_lock);
552 uint64_t addr = ring->vr_msi_addr;
553 uint64_t msg = ring->vr_msi_msg;
554 mutex_exit(&ring->vr_lock);
555 if (addr != 0) {
556 /* Deliver the interrupt directly, if so configured... */
557 (void) vmm_drv_msi(ring->vr_lease, addr, msg);
558 } else {
559 /* ... otherwise, leave it to userspace */
560 if (atomic_cas_uint(&ring->vr_intr_enabled, 0, 1) == 0) {
561 pollwakeup(&ring->vr_link->l_pollhead, POLLRDBAND);
562 }
563 }
564 }
565
566 static inline bool
567 vring_stop_req(const viona_vring_t *ring)
568 {
569 return ((ring->vr_state_flags & VRSF_REQ_STOP) != 0);
570 }
571
572 static inline bool
573 vring_pause_req(const viona_vring_t *ring)
574 {
575 return ((ring->vr_state_flags & VRSF_REQ_PAUSE) != 0);
576 }
577
578 static inline bool
579 vring_start_req(const viona_vring_t *ring)
580 {
581 return ((ring->vr_state_flags & VRSF_REQ_START) != 0);
582 }
583
584 /*
585 * Check if vring worker thread should bail out. This will heed indications
586 * that the containing process is exiting, as well as requests to stop or pause
587 * the ring. The `stop_only` parameter controls if pause requests are ignored
588 * (true) or checked (false).
589 *
590 * Caller should hold vr_lock.
591 */
592 static bool
593 vring_need_bail_ext(const viona_vring_t *ring, bool stop_only)
594 {
595 ASSERT(MUTEX_HELD(&ring->vr_lock));
596
597 if (vring_stop_req(ring) ||
598 (!stop_only && vring_pause_req(ring))) {
599 return (true);
600 }
601
602 kthread_t *t = ring->vr_worker_thread;
603 if (t != NULL) {
604 proc_t *p = ttoproc(t);
605
606 ASSERT(p != NULL);
607 if ((p->p_flag & SEXITING) != 0) {
608 return (true);
609 }
610 }
611 return (false);
612 }
613
614 bool
615 vring_need_bail(const viona_vring_t *ring)
616 {
617 return (vring_need_bail_ext(ring, false));
618 }
619
620 int
621 viona_ring_pause(viona_vring_t *ring)
622 {
623 mutex_enter(&ring->vr_lock);
624 switch (ring->vr_state) {
625 case VRS_RESET:
626 case VRS_SETUP:
627 case VRS_INIT:
628 /*
629 * For rings which have not yet started (even those in the
630 * VRS_SETUP and VRS_INIT phases, where there is a running worker
631 * thread waiting to be released to do its intended task), it
632 * is adequate to simply clear any start request, to keep them
633 * from proceeding into the actual work processing function.
634 */
635 ring->vr_state_flags &= ~VRSF_REQ_START;
636 mutex_exit(&ring->vr_lock);
637 return (0);
638
639 case VRS_STOP:
640 if ((ring->vr_state_flags & VRSF_REQ_STOP) != 0) {
641 /* A ring on its way to RESET cannot be paused. */
642 mutex_exit(&ring->vr_lock);
643 return (EBUSY);
644 }
645 /* FALLTHROUGH */
646 case VRS_RUN:
647 ring->vr_state_flags |= VRSF_REQ_PAUSE;
648 cv_broadcast(&ring->vr_cv);
649 break;
650
651 default:
652 panic("invalid ring state %d", ring->vr_state);
653 break;
654 }
655
656 for (;;) {
657 int res = cv_wait_sig(&ring->vr_cv, &ring->vr_lock);
658
659 if (ring->vr_state == VRS_INIT ||
660 (ring->vr_state_flags & VRSF_REQ_PAUSE) == 0) {
661 /* Ring made it to (or through) paused state */
662 mutex_exit(&ring->vr_lock);
663 return (0);
664 }
665 if (res == 0) {
666 /* interrupted by signal */
667 mutex_exit(&ring->vr_lock);
668 return (EINTR);
669 }
670 }
671 /* NOTREACHED */
672 }
673
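/*
 * Body of the per-ring worker thread.  The worker reports itself alive
 * (VRS_INIT), waits to be started, runs the RX or TX processing loop
 * (VRS_RUN) until asked to stop or pause, and then either returns to
 * VRS_INIT (pause) or tears the ring down to VRS_RESET and exits.
 */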
674 static void
675 viona_worker(void *arg)
676 {
677 viona_vring_t *ring = (viona_vring_t *)arg;
678 viona_link_t *link = ring->vr_link;
679
680 mutex_enter(&ring->vr_lock);
681 VERIFY3U(ring->vr_state, ==, VRS_SETUP);
682
683 /* Bail immediately if ring shutdown or process exit was requested */
684 if (vring_need_bail_ext(ring, true)) {
685 goto ring_reset;
686 }
687
688 /* Report worker thread as alive and notify creator */
689 ring_init:
690 ring->vr_state = VRS_INIT;
691 cv_broadcast(&ring->vr_cv);
692
693 while (!vring_start_req(ring)) {
694 /*
695 * Keeping lease renewals timely while waiting for the ring to
696 * be started is important for avoiding deadlocks.
697 */
698 if (vmm_drv_lease_expired(ring->vr_lease)) {
699 if (!viona_ring_lease_renew(ring)) {
700 goto ring_reset;
701 }
702 }
703
704 (void) cv_wait_sig(&ring->vr_cv, &ring->vr_lock);
705
706 if (vring_pause_req(ring)) {
707 /* We are already paused in the INIT state. */
708 ring->vr_state_flags &= ~VRSF_REQ_PAUSE;
709 }
710 if (vring_need_bail_ext(ring, true)) {
711 goto ring_reset;
712 }
713 }
714
715 ASSERT((ring->vr_state_flags & VRSF_REQ_START) != 0);
716 ring->vr_state = VRS_RUN;
717 ring->vr_state_flags &= ~VRSF_REQ_START;
718 viona_ring_mark_dirty(ring);
719
720 /* Ensure ring lease is valid first */
721 if (vmm_drv_lease_expired(ring->vr_lease)) {
722 if (!viona_ring_lease_renew(ring)) {
723 goto ring_reset;
724 }
725 }
726
727 /* Process actual work */
728 if (ring == &link->l_vrings[VIONA_VQ_RX]) {
729 viona_worker_rx(ring, link);
730 } else if (ring == &link->l_vrings[VIONA_VQ_TX]) {
731 viona_worker_tx(ring, link);
732 } else {
733 panic("unexpected ring: %p", (void *)ring);
734 }
735
736 VERIFY3U(ring->vr_state, ==, VRS_STOP);
737 VERIFY3U(ring->vr_xfer_outstanding, ==, 0);
738
739 /*
740 * Consolidate stats data so that it is not lost if/when this ring is
741 * being stopped.
742 */
743 viona_ring_consolidate_stats(ring);
744
745 /* Respond to a pause request if the ring is not required to stop */
746 if (vring_pause_req(ring)) {
747 ring->vr_state_flags &= ~VRSF_REQ_PAUSE;
748
749 if (vring_need_bail_ext(ring, true)) {
750 goto ring_reset;
751 }
752
753 /*
754 * To complete pausing of the ring, unmap and re-map the pages
755 * underpinning the virtqueue. This is to synchronize their
756 * dirty state in the backing page tables and restore the
757 * defer-dirty state on the held pages.
758 */
759 viona_ring_unmap(ring);
760 if (viona_ring_map(ring, true)) {
761 goto ring_init;
762 }
763
764 /*
765 * If the ring pages failed to be mapped, fallthrough to
766 * ring-reset like any other failure.
767 */
768 }
769
770 ring_reset:
771 viona_ring_misc_free(ring);
772
773 viona_ring_lease_drop(ring);
774 ring->vr_cur_aidx = 0;
775 ring->vr_size = 0;
776 ring->vr_mask = 0;
777 ring->vr_pa = 0;
778 ring->vr_state = VRS_RESET;
779 ring->vr_state_flags = 0;
780 ring->vr_worker_thread = NULL;
781 cv_broadcast(&ring->vr_cv);
782 mutex_exit(&ring->vr_lock);
783
784 mutex_enter(&ttoproc(curthread)->p_lock);
785 lwp_exit();
786 }
787
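/*
 * Create the worker thread for a ring as an LWP in the current process, with
 * all signals held and the thread flagged with TP_KTHREAD.
 */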
788 static kthread_t *
789 viona_create_worker(viona_vring_t *ring)
790 {
791 k_sigset_t hold_set;
792 proc_t *p = curproc;
793 kthread_t *t;
794 klwp_t *lwp;
795
796 ASSERT(MUTEX_HELD(&ring->vr_lock));
797 ASSERT(ring->vr_state == VRS_RESET);
798
799 sigfillset(&hold_set);
800 lwp = lwp_create(viona_worker, (void *)ring, 0, p, TS_STOPPED,
801 minclsyspri - 1, &hold_set, curthread->t_cid, 0);
802 if (lwp == NULL) {
803 return (NULL);
804 }
805
806 t = lwptot(lwp);
807 mutex_enter(&p->p_lock);
808 t->t_proc_flag = (t->t_proc_flag & ~TP_HOLDLWP) | TP_KTHREAD;
809 lwp_create_done(t);
810 mutex_exit(&p->p_lock);
811
812 return (t);
813 }
814
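/* Copy the descriptor at `idx` out of the ring's descriptor table. */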
815 void
816 vq_read_desc(viona_vring_t *ring, uint16_t idx, struct virtio_desc *descp)
817 {
818 const uint_t entry_off = idx * sizeof (struct virtio_desc);
819
820 ASSERT3U(idx, <, ring->vr_size);
821
822 bcopy(viona_ring_addr(ring, entry_off), descp, sizeof (*descp));
823 }
824
825 static uint16_t
826 vq_read_avail(viona_vring_t *ring, uint16_t idx)
827 {
828 ASSERT3U(idx, <, ring->vr_size);
829
830 volatile uint16_t *avail_ent =
831 viona_ring_addr(ring, LEGACY_AVAIL_ENT_OFF(ring->vr_size, idx));
832 return (*avail_ent);
833 }
834
835 /*
836 * Given a buffer descriptor `desc`, attempt to map the pages backing that
837 * region of guest physical memory, taking into account that there are no
838 * guarantees about guest-contiguous pages being host-contiguous.
839 */
840 static int
841 vq_map_desc_bufs(viona_vring_t *ring, const struct virtio_desc *desc,
842 vq_held_region_t *region)
843 {
844 if (desc->vd_len == 0) {
845 VIONA_PROBE2(desc_bad_len, viona_vring_t *, ring,
846 uint32_t, desc->vd_len);
847 VIONA_RING_STAT_INCR(ring, desc_bad_len);
848 return (EINVAL);
849 } else if ((region->vhr_len + desc->vd_len) < region->vhr_len) {
850 VIONA_PROBE1(len_overflow, viona_vring_t *, ring);
851 VIONA_RING_STAT_INCR(ring, len_overflow);
852 return (EOVERFLOW);
853 }
854
855 int err = vq_region_hold(ring, desc->vd_addr, desc->vd_len,
856 (desc->vd_flags & VRING_DESC_F_WRITE) != 0, region);
857 if (err == 0) {
858 region->vhr_len += desc->vd_len;
859 } else if (err == E2BIG) {
860 VIONA_PROBE1(too_many_desc, viona_vring_t *, ring);
861 VIONA_RING_STAT_INCR(ring, too_many_desc);
862 } else if (err == EFAULT) {
863 VIONA_PROBE_BAD_RING_ADDR(ring, desc->vd_addr);
864 VIONA_RING_STAT_INCR(ring, bad_ring_addr);
865 }
866
867 return (err);
868 }
869
870 /*
871 * Walk an indirect buffer descriptor `desc`, attempting to map the pages
872 * backing the regions of guest memory covered by its constituent descriptors.
873 */
874 static int
875 vq_map_indir_desc_bufs(viona_vring_t *ring, const struct virtio_desc *desc,
876 vq_held_region_t *region)
877 {
878 const uint16_t indir_count = desc->vd_len / sizeof (struct virtio_desc);
879
880 if ((desc->vd_len & 0xf) != 0 || indir_count == 0 ||
881 indir_count > ring->vr_size ||
882 desc->vd_addr > (desc->vd_addr + desc->vd_len)) {
883 VIONA_PROBE2(indir_bad_len, viona_vring_t *, ring,
884 uint32_t, desc->vd_len);
885 VIONA_RING_STAT_INCR(ring, indir_bad_len);
886 return (EINVAL);
887 }
888
889 uint16_t indir_next = 0;
890 const uint8_t *buf = NULL;
891 uint64_t buf_gpa = UINT64_MAX;
892 vmm_page_t *vmp = NULL;
893 int err = 0;
894
895 for (;;) {
896 uint64_t indir_gpa =
897 desc->vd_addr + (indir_next * sizeof (struct virtio_desc));
898 uint64_t indir_page = indir_gpa & PAGEMASK;
899 struct virtio_desc vp;
900
901 /*
902 * Get a mapping for the page that the next indirect descriptor
903 * resides in, if that has not already been done.
904 */
905 if (indir_page != buf_gpa) {
906 if (vmp != NULL) {
907 vmm_drv_page_release(vmp);
908 }
909 vmp = vq_page_hold(ring, indir_page, false);
910 if (vmp == NULL) {
911 VIONA_PROBE_BAD_RING_ADDR(ring, indir_page);
912 VIONA_RING_STAT_INCR(ring, bad_ring_addr);
913 err = EFAULT;
914 break;
915 }
916 buf_gpa = indir_page;
917 buf = vmm_drv_page_readable(vmp);
918 }
919
920 /*
921 * A copy of the indirect descriptor is made here, rather than
922 * simply using a reference pointer. This prevents malicious or
923 * erroneous guest writes to the descriptor from fooling the
924 * flags/bounds verification through a race.
925 */
926 bcopy(buf + (indir_gpa - indir_page), &vp, sizeof (vp));
927
928 if (vp.vd_flags & VRING_DESC_F_INDIRECT) {
929 VIONA_PROBE1(indir_bad_nest, viona_vring_t *, ring);
930 VIONA_RING_STAT_INCR(ring, indir_bad_nest);
931 err = EINVAL;
932 break;
933 } else if (vp.vd_len == 0) {
934 VIONA_PROBE2(desc_bad_len, viona_vring_t *, ring,
935 uint32_t, vp.vd_len);
936 VIONA_RING_STAT_INCR(ring, desc_bad_len);
937 err = EINVAL;
938 break;
939 }
940
941 err = vq_map_desc_bufs(ring, &vp, region);
942 if (err != 0) {
943 break;
944 }
945
946 /* Successfully reached the end of the indir chain */
947 if ((vp.vd_flags & VRING_DESC_F_NEXT) == 0) {
948 break;
949 }
950 if (region->vhr_idx >= region->vhr_niov) {
951 VIONA_PROBE1(too_many_desc, viona_vring_t *, ring);
952 VIONA_RING_STAT_INCR(ring, too_many_desc);
953 err = E2BIG;
954 break;
955 }
956
957 indir_next = vp.vd_next;
958 if (indir_next >= indir_count) {
959 VIONA_PROBE3(indir_bad_next, viona_vring_t *, ring,
960 uint16_t, indir_next, uint16_t, indir_count);
961 VIONA_RING_STAT_INCR(ring, indir_bad_next);
962 err = EINVAL;
963 break;
964 }
965 }
966
967 if (vmp != NULL) {
968 vmm_drv_page_release(vmp);
969 }
970 return (err);
971 }
972
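/*
 * Pop the next available descriptor chain off the ring, holding the guest
 * pages it refers to and filling `iov` with their mappings.  Returns the
 * number of iovec entries populated (0 if nothing is available), or -1 on
 * error.  On success, `cookie` receives the index of the chain head (for the
 * eventual vq_pushchain()), `chain` the list of page holds, and `len` (if not
 * NULL) the total buffer length described by the chain.
 */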
973 int
974 vq_popchain(viona_vring_t *ring, struct iovec *iov, uint_t niov,
975 uint16_t *cookie, vmm_page_t **chain, uint32_t *len)
976 {
977 uint16_t ndesc, idx, head, next;
978 struct virtio_desc vdir;
979 vq_held_region_t region = {
980 .vhr_niov = niov,
981 .vhr_iov = iov,
982 };
983
984 ASSERT(iov != NULL);
985 ASSERT(niov > 0 && niov < INT_MAX);
986 ASSERT(*chain == NULL);
987
988 mutex_enter(&ring->vr_a_mutex);
989 idx = ring->vr_cur_aidx;
990 ndesc = viona_ring_num_avail(ring);
991
992 if (ndesc == 0) {
993 mutex_exit(&ring->vr_a_mutex);
994 return (0);
995 }
996 if (ndesc > ring->vr_size) {
997 /*
998 * Despite the fact that the guest has provided an 'avail_idx'
999 * which indicates that an impossible number of descriptors are
1000 * available, continue on and attempt to process the next one.
1001 *
1002 * The transgression will not escape the probe or stats though.
1003 */
1004 VIONA_PROBE2(ndesc_too_high, viona_vring_t *, ring,
1005 uint16_t, ndesc);
1006 VIONA_RING_STAT_INCR(ring, ndesc_too_high);
1007 }
1008
1009 head = vq_read_avail(ring, idx & ring->vr_mask);
1010 next = head;
1011
1012 for (region.vhr_idx = 0; region.vhr_idx < niov; next = vdir.vd_next) {
1013 if (next >= ring->vr_size) {
1014 VIONA_PROBE2(bad_idx, viona_vring_t *, ring,
1015 uint16_t, next);
1016 VIONA_RING_STAT_INCR(ring, bad_idx);
1017 break;
1018 }
1019
1020 vq_read_desc(ring, next, &vdir);
1021 if ((vdir.vd_flags & VRING_DESC_F_INDIRECT) == 0) {
1022 if (vq_map_desc_bufs(ring, &vdir, &region) != 0) {
1023 break;
1024 }
1025 } else {
1026 /*
1027 * Per the specification (Virtio 1.1 S2.6.5.3.1):
1028 * A driver MUST NOT set both VIRTQ_DESC_F_INDIRECT
1029 * and VIRTQ_DESC_F_NEXT in `flags`.
1030 */
1031 if ((vdir.vd_flags & VRING_DESC_F_NEXT) != 0) {
1032 VIONA_PROBE3(indir_bad_next,
1033 viona_vring_t *, ring,
1034 uint16_t, next, uint16_t, 0);
1035 VIONA_RING_STAT_INCR(ring, indir_bad_next);
1036 break;
1037 }
1038
1039 if (vq_map_indir_desc_bufs(ring, &vdir, &region) != 0) {
1040 break;
1041 }
1042 }
1043
1044 if ((vdir.vd_flags & VRING_DESC_F_NEXT) == 0) {
1045 ring->vr_cur_aidx++;
1046 mutex_exit(&ring->vr_a_mutex);
1047
1048 *cookie = head;
1049 *chain = region.vhr_head;
1050 if (len != NULL) {
1051 *len = region.vhr_len;
1052 }
1053 return (region.vhr_idx);
1054 }
1055 }
1056
1057 mutex_exit(&ring->vr_a_mutex);
1058 if (region.vhr_head != NULL) {
1059 /*
1060 * If any pages were held prior to encountering an error, we
1061 * must release them now.
1062 */
1063 vmm_drv_page_release_chain(region.vhr_head);
1064 }
1065 return (-1);
1066 }
1067
1068
1069 static void
1070 vq_write_used_ent(viona_vring_t *ring, uint16_t idx, uint16_t cookie,
1071 uint32_t len)
1072 {
1073 /*
1074 * In a larger ring, an entry could be split across pages, so be sure to
1075 * account for that when configuring the transfer by looking up the ID
1076 * and length addresses separately, rather than an address for a
1077 * combined `struct virtio_used`.
1078 */
1079 const uint_t used_id_off = LEGACY_USED_ENT_OFF(ring->vr_size, idx);
1080 const uint_t used_len_off = used_id_off + sizeof (uint32_t);
1081 volatile uint32_t *idp = viona_ring_addr(ring, used_id_off);
1082 volatile uint32_t *lenp = viona_ring_addr(ring, used_len_off);
1083
1084 ASSERT(MUTEX_HELD(&ring->vr_u_mutex));
1085
1086 *idp = cookie;
1087 *lenp = len;
1088 }
1089
1090 static void
1091 vq_write_used_idx(viona_vring_t *ring, uint16_t idx)
1092 {
1093 ASSERT(MUTEX_HELD(&ring->vr_u_mutex));
1094
1095 volatile uint16_t *used_idx =
1096 viona_ring_addr(ring, LEGACY_USED_IDX_OFF(ring->vr_size));
1097 *used_idx = idx;
1098 }
1099
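/*
 * Return a single descriptor chain to the guest by writing its cookie and
 * length into the used ring and publishing the updated used index.
 */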
1100 void
1101 vq_pushchain(viona_vring_t *ring, uint32_t len, uint16_t cookie)
1102 {
1103 uint16_t uidx;
1104
1105 mutex_enter(&ring->vr_u_mutex);
1106
1107 uidx = ring->vr_cur_uidx;
1108 vq_write_used_ent(ring, uidx & ring->vr_mask, cookie, len);
1109 uidx++;
1110 membar_producer();
1111
1112 vq_write_used_idx(ring, uidx);
1113 ring->vr_cur_uidx = uidx;
1114
1115 mutex_exit(&ring->vr_u_mutex);
1116 }
1117
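/*
 * Return multiple descriptor chains to the guest at once, publishing the
 * used index only after all of the entries have been written.
 */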
1118 void
1119 vq_pushchain_many(viona_vring_t *ring, uint_t num_bufs, used_elem_t *elem)
1120 {
1121 uint16_t uidx;
1122
1123 mutex_enter(&ring->vr_u_mutex);
1124
1125 uidx = ring->vr_cur_uidx;
1126
1127 for (uint_t i = 0; i < num_bufs; i++, uidx++) {
1128 vq_write_used_ent(ring, uidx & ring->vr_mask, elem[i].id,
1129 elem[i].len);
1130 }
1131
1132 membar_producer();
1133 vq_write_used_idx(ring, uidx);
1134 ring->vr_cur_uidx = uidx;
1135
1136 mutex_exit(&ring->vr_u_mutex);
1137 }
1138
1139 /*
1140 * Set USED_NO_NOTIFY on VQ so guest elides doorbell calls for new entries.
1141 */
1142 void
1143 viona_ring_disable_notify(viona_vring_t *ring)
1144 {
1145 volatile uint16_t *used_flags =
1146 viona_ring_addr(ring, LEGACY_USED_FLAGS_OFF(ring->vr_size));
1147
1148 *used_flags |= VRING_USED_F_NO_NOTIFY;
1149 }
1150
1151 /*
1152 * Clear USED_NO_NOTIFY on VQ so guest resumes doorbell calls for new entries.
1153 */
1154 void
1155 viona_ring_enable_notify(viona_vring_t *ring)
1156 {
1157 volatile uint16_t *used_flags =
1158 viona_ring_addr(ring, LEGACY_USED_FLAGS_OFF(ring->vr_size));
1159
1160 *used_flags &= ~VRING_USED_F_NO_NOTIFY;
1161 }
1162
1163 /*
1164 * Return the number of available descriptors in the vring taking care of the
1165 * 16-bit index wraparound.
1166 *
1167 * Note: If the number of apparently available descriptors is larger than the
1168 * ring size (due to guest misbehavior), this check will still report the
1169 * positive count of descriptors.
1170 */
1171 uint16_t
1172 viona_ring_num_avail(viona_vring_t *ring)
1173 {
1174 volatile uint16_t *avail_idx =
1175 viona_ring_addr(ring, LEGACY_AVAIL_IDX_OFF(ring->vr_size));
1176
1177 return (*avail_idx - ring->vr_cur_aidx);
1178 }
1179
1180 /* Record a successfully transferred packet for the ring stats */
1181 void
1182 viona_ring_stat_accept(viona_vring_t *ring, uint32_t len)
1183 {
1184 atomic_inc_64(&ring->vr_stats.vts_packets);
1185 atomic_add_64(&ring->vr_stats.vts_bytes, len);
1186 }
1187
1188 /*
1189 * Record a dropped packet in the ring stats
1190 */
1191 void
1192 viona_ring_stat_drop(viona_vring_t *ring)
1193 {
1194 atomic_inc_64(&ring->vr_stats.vts_drops);
1195 }
1196
1197 /*
1198 * Record a packet transfer error in the ring stats
1199 */
1200 void
1201 viona_ring_stat_error(viona_vring_t *ring)
1202 {
1203 atomic_inc_64(&ring->vr_stats.vts_errors);
1204 }
1205
1206 /*
1207 * Consolidate statistic data for this ring into the totals for the link
1208 */
1209 static void
1210 viona_ring_consolidate_stats(viona_vring_t *ring)
1211 {
1212 viona_link_t *link = ring->vr_link;
1213 struct viona_transfer_stats *lstat =
1214 (ring == &link->l_vrings[VIONA_VQ_RX]) ?
1215 &link->l_stats.vls_rx : &link->l_stats.vls_tx;
1216
1217 mutex_enter(&link->l_stats_lock);
1218 lstat->vts_packets += ring->vr_stats.vts_packets;
1219 lstat->vts_bytes += ring->vr_stats.vts_bytes;
1220 lstat->vts_drops += ring->vr_stats.vts_drops;
1221 lstat->vts_errors += ring->vr_stats.vts_errors;
1222 bzero(&ring->vr_stats, sizeof (ring->vr_stats));
1223 mutex_exit(&link->l_stats_lock);
1224 }
1225
1226 /*
1227 * Copy `sz` bytes from the iovecs contained in `iob` to `dst`.
1228 *
1229 * Returns `true` if copy was successful (implying adequate data was remaining
1230 * in the iov_bunch_t).
1231 */
1232 bool
1233 iov_bunch_copy(iov_bunch_t *iob, void *dst, uint32_t sz)
1234 {
1235 if (sz > iob->ib_remain) {
1236 return (false);
1237 }
1238 if (sz == 0) {
1239 return (true);
1240 }
1241
1242 caddr_t dest = dst;
1243 do {
1244 struct iovec *iov = iob->ib_iov;
1245
1246 ASSERT3U(iov->iov_len, <, UINT32_MAX);
1247 ASSERT3U(iov->iov_len, !=, 0);
1248
1249 const uint32_t iov_avail = (iov->iov_len - iob->ib_offset);
1250 const uint32_t to_copy = MIN(sz, iov_avail);
1251
1252 if (to_copy != 0) {
1253 bcopy((caddr_t)iov->iov_base + iob->ib_offset, dest,
1254 to_copy);
1255 }
1256
1257 sz -= to_copy;
1258 iob->ib_remain -= to_copy;
1259 dest += to_copy;
1260 iob->ib_offset += to_copy;
1261
1262 ASSERT3U(iob->ib_offset, <=, iov->iov_len);
1263
1264 if (iob->ib_offset == iov->iov_len) {
1265 iob->ib_iov++;
1266 iob->ib_offset = 0;
1267 }
1268 } while (sz > 0);
1269
1270 return (true);
1271 }
1272
1273 /*
1274 * Get the data pointer and length of the current head iovec, less any
1275 * offsetting from prior copy operations. This will advance the iov_bunch_t as
1276 * if the caller had performed a copy of that chunk length.
1277 *
1278 * Returns `true` if the iov_bunch_t had at least one iovec (unconsumed bytes)
1279 * remaining, setting `chunk` and `chunk_sz` to the chunk pointer and size,
1280 * respectively.
1281 */
1282 bool
1283 iov_bunch_next_chunk(iov_bunch_t *iob, caddr_t *chunk, uint32_t *chunk_sz)
1284 {
1285 if (iob->ib_remain == 0) {
1286 *chunk = NULL;
1287 *chunk_sz = 0;
1288 return (false);
1289 }
1290
1291 *chunk_sz = iob->ib_iov->iov_len - iob->ib_offset;
1292 *chunk = (caddr_t)iob->ib_iov->iov_base + iob->ib_offset;
1293 iob->ib_remain -= *chunk_sz;
1294 iob->ib_iov++;
1295 iob->ib_offset = 0;
1296 return (true);
1297 }
1298