/*
 * Copyright (c) 2013 Chris Torek <torek @ torek net>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source.  A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 *
 * Copyright 2015 Pluribus Networks Inc.
 * Copyright 2019 Joyent, Inc.
 * Copyright 2024 Oxide Computer Company
 */


#include <sys/disp.h>

#include "viona_impl.h"

#define	VRING_MAX_LEN		32768

/* Layout and sizing as defined in the spec for a legacy-style virtqueue */

#define	LEGACY_VQ_ALIGN		PAGESIZE

#define	LEGACY_DESC_SZ(qsz)	((qsz) * sizeof (struct virtio_desc))
/*
 * Available ring consists of avail_idx (uint16_t), flags (uint16_t), qsz avail
 * descriptors (uint16_t each), and (optional) used_event (uint16_t).
 */
#define	LEGACY_AVAIL_SZ(qsz)	(((qsz) + 3) * sizeof (uint16_t))
/*
 * Used ring consists of used_idx (uint16_t), flags (uint16_t), qsz used
 * descriptors (two uint32_t each), and (optional) avail_event (uint16_t).
 */
#define	LEGACY_USED_SZ(qsz) \
	((qsz) * sizeof (struct virtio_used) + 3 * sizeof (uint16_t))

#define	LEGACY_AVAIL_FLAGS_OFF(qsz)	LEGACY_DESC_SZ(qsz)
#define	LEGACY_AVAIL_IDX_OFF(qsz)	\
	(LEGACY_DESC_SZ(qsz) + sizeof (uint16_t))
#define	LEGACY_AVAIL_ENT_OFF(qsz, idx)	\
	(LEGACY_DESC_SZ(qsz) + (2 + (idx)) * sizeof (uint16_t))

#define	LEGACY_USED_FLAGS_OFF(qsz)	\
	P2ROUNDUP(LEGACY_DESC_SZ(qsz) + LEGACY_AVAIL_SZ(qsz), LEGACY_VQ_ALIGN)
#define	LEGACY_USED_IDX_OFF(qsz)	\
	(LEGACY_USED_FLAGS_OFF(qsz) + sizeof (uint16_t))
#define	LEGACY_USED_ENT_OFF(qsz, idx)	\
	(LEGACY_USED_FLAGS_OFF(qsz) + 2 * sizeof (uint16_t) + \
	(idx) * sizeof (struct virtio_used))

#define	LEGACY_VQ_SIZE(qsz)	\
	(LEGACY_USED_FLAGS_OFF(qsz) + \
	P2ROUNDUP(LEGACY_USED_SZ(qsz), LEGACY_VQ_ALIGN))
#define	LEGACY_VQ_PAGES(qsz)	(LEGACY_VQ_SIZE(qsz) / PAGESIZE)
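
/*
 * As a worked example (assuming a PAGESIZE of 4096, a 16-byte struct
 * virtio_desc, and an 8-byte struct virtio_used), a queue of size 256 lays
 * out as follows:
 *
 *	LEGACY_DESC_SZ(256)        = 256 * 16            = 4096
 *	LEGACY_AVAIL_SZ(256)       = (256 + 3) * 2       = 518
 *	LEGACY_USED_FLAGS_OFF(256) = P2ROUNDUP(4614, 4096) = 8192
 *	LEGACY_USED_SZ(256)        = 256 * 8 + 6         = 2054
 *	LEGACY_VQ_SIZE(256)        = 8192 + 4096         = 12288
 *
 * ... for a total of LEGACY_VQ_PAGES(256) = 3 pages of guest memory.
 */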

struct vq_held_region {
	struct iovec	*vhr_iov;
	vmm_page_t	*vhr_head;
	vmm_page_t	*vhr_tail;
	/* Length of iovec array supplied in `vhr_iov` */
	uint_t		vhr_niov;
	/*
	 * Index into vhr_iov, indicating the next "free" entry (following the
	 * last entry which has valid contents).
	 */
	uint_t		vhr_idx;
};
typedef struct vq_held_region vq_held_region_t;

static bool viona_ring_map(viona_vring_t *, bool);
static void viona_ring_unmap(viona_vring_t *);
static kthread_t *viona_create_worker(viona_vring_t *);
static void viona_ring_consolidate_stats(viona_vring_t *);

static vmm_page_t *
vq_page_hold(viona_vring_t *ring, uint64_t gpa, bool writable)
{
	ASSERT3P(ring->vr_lease, !=, NULL);

	int prot = PROT_READ;
	if (writable) {
		prot |= PROT_WRITE;
	}

	return (vmm_drv_page_hold(ring->vr_lease, gpa, prot));
}

/*
 * Establish a hold on the page(s) which back the region of guest memory
 * covered by [gpa, gpa + len).  The host-kernel-virtual pointers to those
 * pages are stored in the iovec array supplied in `region`, along with the
 * chain of vmm_page_t entries representing the held pages.  Since guest
 * memory carries no guarantees of being physically contiguous (on the host),
 * it is assumed that an iovec entry will be required for each PAGESIZE
 * section covered by the specified `gpa` and `len` range.  For each iovec
 * entry successfully populated by holding a page, `vhr_idx` will be
 * incremented so it references the next available iovec entry (or
 * `vhr_niov`, if the iovec array is full).  The responsibility for releasing
 * the `vmm_page_t` chain (stored in `vhr_head` and `vhr_tail`) resides with
 * the caller, regardless of the result.
 */
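/*
 * For example (assuming a PAGESIZE of 4096): a region with gpa 0x10ffc and
 * len 0x10 straddles a page boundary, so it consumes two iovec entries: one
 * of length 4 covering the tail of the first page, and one of length 12
 * covering the start of the next.
 */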
static int
vq_region_hold(viona_vring_t *ring, uint64_t gpa, uint32_t len,
    bool writable, vq_held_region_t *region)
{
	const uint32_t front_offset = gpa & PAGEOFFSET;
	const uint32_t front_len = MIN(len, PAGESIZE - front_offset);
	uint_t pages = 1;
	vmm_page_t *vmp;
	caddr_t buf;

	ASSERT3U(region->vhr_idx, <, region->vhr_niov);

	if (front_len < len) {
		pages += P2ROUNDUP((uint64_t)(len - front_len),
		    PAGESIZE) / PAGESIZE;
	}
	if (pages > (region->vhr_niov - region->vhr_idx)) {
		return (E2BIG);
	}

	vmp = vq_page_hold(ring, gpa & PAGEMASK, writable);
	if (vmp == NULL) {
		return (EFAULT);
	}
	buf = (caddr_t)vmm_drv_page_readable(vmp);

	region->vhr_iov[region->vhr_idx].iov_base = buf + front_offset;
	region->vhr_iov[region->vhr_idx].iov_len = front_len;
	region->vhr_idx++;
	gpa += front_len;
	len -= front_len;
	if (region->vhr_head == NULL) {
		region->vhr_head = vmp;
		region->vhr_tail = vmp;
	} else {
		vmm_drv_page_chain(region->vhr_tail, vmp);
		region->vhr_tail = vmp;
	}

	for (uint_t i = 1; i < pages; i++) {
		ASSERT3U(gpa & PAGEOFFSET, ==, 0);

		vmp = vq_page_hold(ring, gpa, writable);
		if (vmp == NULL) {
			return (EFAULT);
		}
		buf = (caddr_t)vmm_drv_page_readable(vmp);

		const uint32_t chunk_len = MIN(len, PAGESIZE);
		region->vhr_iov[region->vhr_idx].iov_base = buf;
		region->vhr_iov[region->vhr_idx].iov_len = chunk_len;
		region->vhr_idx++;
		gpa += chunk_len;
		len -= chunk_len;
		vmm_drv_page_chain(region->vhr_tail, vmp);
		region->vhr_tail = vmp;
	}

	return (0);
}

static boolean_t
viona_ring_lease_expire_cb(void *arg)
{
	viona_vring_t *ring = arg;

	mutex_enter(&ring->vr_lock);
	cv_broadcast(&ring->vr_cv);
	mutex_exit(&ring->vr_lock);

	/* The lease will be broken asynchronously. */
	return (B_FALSE);
}

static void
viona_ring_lease_drop(viona_vring_t *ring)
{
	ASSERT(MUTEX_HELD(&ring->vr_lock));

	if (ring->vr_lease != NULL) {
		vmm_hold_t *hold = ring->vr_link->l_vm_hold;

		ASSERT(hold != NULL);

		/*
		 * Without an active lease, the ring mappings cannot be
		 * considered valid.
		 */
		viona_ring_unmap(ring);

		vmm_drv_lease_break(hold, ring->vr_lease);
		ring->vr_lease = NULL;
	}
}

boolean_t
viona_ring_lease_renew(viona_vring_t *ring)
{
	vmm_hold_t *hold = ring->vr_link->l_vm_hold;

	ASSERT(hold != NULL);
	ASSERT(MUTEX_HELD(&ring->vr_lock));

	viona_ring_lease_drop(ring);

	/*
	 * Lease renewal will fail if the VM has requested that all holds be
	 * cleaned up.
	 */
	ring->vr_lease = vmm_drv_lease_sign(hold, viona_ring_lease_expire_cb,
	    ring);
	if (ring->vr_lease != NULL) {
		/* A ring undergoing renewal will need valid guest mappings */
		if (ring->vr_pa != 0 && ring->vr_size != 0) {
			/*
			 * If new mappings cannot be established, consider the
			 * lease renewal a failure.
			 */
			if (!viona_ring_map(ring, ring->vr_state == VRS_INIT)) {
				viona_ring_lease_drop(ring);
				return (B_FALSE);
			}
		}
	}
	return (ring->vr_lease != NULL);
}

void
viona_ring_alloc(viona_link_t *link, viona_vring_t *ring)
{
	ring->vr_link = link;
	mutex_init(&ring->vr_lock, NULL, MUTEX_DRIVER, NULL);
	cv_init(&ring->vr_cv, NULL, CV_DRIVER, NULL);
	mutex_init(&ring->vr_a_mutex, NULL, MUTEX_DRIVER, NULL);
	mutex_init(&ring->vr_u_mutex, NULL, MUTEX_DRIVER, NULL);
}

static void
viona_ring_misc_free(viona_vring_t *ring)
{
	const uint_t qsz = ring->vr_size;

	viona_tx_ring_free(ring, qsz);
}

void
viona_ring_free(viona_vring_t *ring)
{
	mutex_destroy(&ring->vr_lock);
	cv_destroy(&ring->vr_cv);
	mutex_destroy(&ring->vr_a_mutex);
	mutex_destroy(&ring->vr_u_mutex);
	ring->vr_link = NULL;
}

int
viona_ring_init(viona_link_t *link, uint16_t idx,
    const struct viona_ring_params *params)
{
	viona_vring_t *ring;
	kthread_t *t;
	int err = 0;
	const uint16_t qsz = params->vrp_size;
	const uint64_t pa = params->vrp_pa;

	if (idx >= VIONA_VQ_MAX) {
		return (EINVAL);
	}

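	/*
	 * The ring size must be a non-zero power of two: the ffs() expression
	 * below rejects any value with more than one bit set.
	 */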
	if (qsz == 0 || qsz > VRING_MAX_LEN || (1 << (ffs(qsz) - 1)) != qsz) {
		return (EINVAL);
	}
	if ((pa & (LEGACY_VQ_ALIGN - 1)) != 0) {
		return (EINVAL);
	}

	ring = &link->l_vrings[idx];
	mutex_enter(&ring->vr_lock);
	if (ring->vr_state != VRS_RESET) {
		mutex_exit(&ring->vr_lock);
		return (EBUSY);
	}
	VERIFY(ring->vr_state_flags == 0);

	ring->vr_lease = NULL;
	if (!viona_ring_lease_renew(ring)) {
		err = EBUSY;
		goto fail;
	}

	ring->vr_size = qsz;
	ring->vr_mask = (ring->vr_size - 1);
	ring->vr_pa = pa;
	if (!viona_ring_map(ring, true)) {
		err = EINVAL;
		goto fail;
	}

	/* Initialize queue indexes */
	ring->vr_cur_aidx = params->vrp_avail_idx;
	ring->vr_cur_uidx = params->vrp_used_idx;

	if (idx == VIONA_VQ_TX) {
		viona_tx_ring_alloc(ring, qsz);
	}

	/* Zero out MSI-X configuration */
	ring->vr_msi_addr = 0;
	ring->vr_msi_msg = 0;

	/* Clear the stats */
	bzero(&ring->vr_stats, sizeof (ring->vr_stats));
	bzero(&ring->vr_err_stats, sizeof (ring->vr_err_stats));

	t = viona_create_worker(ring);
	if (t == NULL) {
		err = ENOMEM;
		goto fail;
	}
	ring->vr_worker_thread = t;
	ring->vr_state = VRS_SETUP;
	cv_broadcast(&ring->vr_cv);
	mutex_exit(&ring->vr_lock);
	return (0);

fail:
	viona_ring_lease_drop(ring);
	viona_ring_misc_free(ring);
	ring->vr_size = 0;
	ring->vr_mask = 0;
	ring->vr_pa = 0;
	ring->vr_cur_aidx = 0;
	ring->vr_cur_uidx = 0;
	mutex_exit(&ring->vr_lock);
	return (err);
}

int
viona_ring_get_state(viona_link_t *link, uint16_t idx,
    struct viona_ring_params *params)
{
	viona_vring_t *ring;

	if (idx >= VIONA_VQ_MAX) {
		return (EINVAL);
	}

	ring = &link->l_vrings[idx];
	mutex_enter(&ring->vr_lock);

	params->vrp_size = ring->vr_size;
	params->vrp_pa = ring->vr_pa;

	if (ring->vr_state == VRS_RUN) {
		/* On a running ring, we must heed the avail/used locks */
		mutex_enter(&ring->vr_a_mutex);
		params->vrp_avail_idx = ring->vr_cur_aidx;
		mutex_exit(&ring->vr_a_mutex);
		mutex_enter(&ring->vr_u_mutex);
		params->vrp_used_idx = ring->vr_cur_uidx;
		mutex_exit(&ring->vr_u_mutex);
	} else {
		/* Otherwise vr_lock is adequate protection */
		params->vrp_avail_idx = ring->vr_cur_aidx;
		params->vrp_used_idx = ring->vr_cur_uidx;
	}

	mutex_exit(&ring->vr_lock);

	return (0);
}

int
viona_ring_reset(viona_vring_t *ring, boolean_t heed_signals)
{
	mutex_enter(&ring->vr_lock);
	if (ring->vr_state == VRS_RESET) {
		mutex_exit(&ring->vr_lock);
		return (0);
	}

	if ((ring->vr_state_flags & VRSF_REQ_STOP) == 0) {
		ring->vr_state_flags |= VRSF_REQ_STOP;
		cv_broadcast(&ring->vr_cv);
	}
	while (ring->vr_state != VRS_RESET) {
		if (!heed_signals) {
			cv_wait(&ring->vr_cv, &ring->vr_lock);
		} else {
			int rs;

			rs = cv_wait_sig(&ring->vr_cv, &ring->vr_lock);
			if (rs <= 0 && ring->vr_state != VRS_RESET) {
				mutex_exit(&ring->vr_lock);
				return (EINTR);
			}
		}
	}
	mutex_exit(&ring->vr_lock);
	return (0);
}

static bool
viona_ring_map(viona_vring_t *ring, bool defer_dirty)
{
	const uint16_t qsz = ring->vr_size;
	uintptr_t pa = ring->vr_pa;

	ASSERT3U(qsz, !=, 0);
	ASSERT3U(qsz, <=, VRING_MAX_LEN);
	ASSERT3U(pa, !=, 0);
	ASSERT3U(pa & (LEGACY_VQ_ALIGN - 1), ==, 0);
	ASSERT3U(LEGACY_VQ_ALIGN, ==, PAGESIZE);
	ASSERT(MUTEX_HELD(&ring->vr_lock));
	ASSERT3P(ring->vr_map_pages, ==, NULL);

	const uint_t npages = LEGACY_VQ_PAGES(qsz);
	ring->vr_map_pages = kmem_zalloc(npages * sizeof (void *), KM_SLEEP);

	int page_flags = 0;
	if (defer_dirty) {
		/*
		 * During initialization, and when entering the paused state,
		 * the page holds for a virtqueue are established with the
		 * DEFER_DIRTY flag set.
		 *
		 * This prevents those page holds from immediately marking the
		 * underlying pages as dirty, since the viona emulation is not
		 * yet performing any accesses.  Once the ring transitions to
		 * the VRS_RUN state, the held pages will be marked as dirty.
		 *
		 * Any ring mappings performed outside those state conditions,
		 * such as those done as part of vmm_lease renewal during
		 * steady-state operation, will map the ring pages normally
		 * (as considered immediately dirty).
		 */
		page_flags |= VMPF_DEFER_DIRTY;
	}

	vmm_page_t *prev = NULL;
	for (uint_t i = 0; i < npages; i++, pa += PAGESIZE) {
		vmm_page_t *vmp;

		vmp = vmm_drv_page_hold_ext(ring->vr_lease, pa,
		    PROT_READ | PROT_WRITE, page_flags);
		if (vmp == NULL) {
			viona_ring_unmap(ring);
			return (false);
		}

		/*
		 * Keep the first page as the head of the chain, appending all
		 * subsequent pages to the tail.
		 */
		if (prev == NULL) {
			ring->vr_map_hold = vmp;
		} else {
			vmm_drv_page_chain(prev, vmp);
		}
		prev = vmp;
		ring->vr_map_pages[i] = vmm_drv_page_writable(vmp);
	}

	return (true);
}

static void
viona_ring_mark_dirty(viona_vring_t *ring)
{
	ASSERT(MUTEX_HELD(&ring->vr_lock));
	ASSERT(ring->vr_map_hold != NULL);

	for (vmm_page_t *vp = ring->vr_map_hold; vp != NULL;
	    vp = vmm_drv_page_next(vp)) {
		vmm_drv_page_mark_dirty(vp);
	}
}

static void
viona_ring_unmap(viona_vring_t *ring)
{
	ASSERT(MUTEX_HELD(&ring->vr_lock));

	void **map = ring->vr_map_pages;
	if (map != NULL) {
		const uint_t npages = LEGACY_VQ_PAGES(ring->vr_size);
		kmem_free(map, npages * sizeof (void *));
		ring->vr_map_pages = NULL;

		vmm_drv_page_release_chain(ring->vr_map_hold);
		ring->vr_map_hold = NULL;
	} else {
		ASSERT3P(ring->vr_map_hold, ==, NULL);
	}
}

static inline void *
viona_ring_addr(viona_vring_t *ring, uint_t off)
{
	ASSERT3P(ring->vr_map_pages, !=, NULL);
	ASSERT3U(LEGACY_VQ_SIZE(ring->vr_size), >, off);

	const uint_t page_num = off / PAGESIZE;
	const uint_t page_off = off % PAGESIZE;
	return ((caddr_t)ring->vr_map_pages[page_num] + page_off);
}

void
viona_intr_ring(viona_vring_t *ring, boolean_t skip_flags_check)
{
	if (!skip_flags_check) {
		volatile uint16_t *avail_flags = viona_ring_addr(ring,
		    LEGACY_AVAIL_FLAGS_OFF(ring->vr_size));

		if ((*avail_flags & VRING_AVAIL_F_NO_INTERRUPT) != 0) {
			return;
		}
	}

	mutex_enter(&ring->vr_lock);
	uint64_t addr = ring->vr_msi_addr;
	uint64_t msg = ring->vr_msi_msg;
	mutex_exit(&ring->vr_lock);
	if (addr != 0) {
		/* Deliver the interrupt directly, if so configured... */
		(void) vmm_drv_msi(ring->vr_lease, addr, msg);
	} else {
		/* ... otherwise, leave it to userspace */
		if (atomic_cas_uint(&ring->vr_intr_enabled, 0, 1) == 0) {
			pollwakeup(&ring->vr_link->l_pollhead, POLLRDBAND);
		}
	}
}

static inline bool
vring_stop_req(const viona_vring_t *ring)
{
	return ((ring->vr_state_flags & VRSF_REQ_STOP) != 0);
}

static inline bool
vring_pause_req(const viona_vring_t *ring)
{
	return ((ring->vr_state_flags & VRSF_REQ_PAUSE) != 0);
}

static inline bool
vring_start_req(const viona_vring_t *ring)
{
	return ((ring->vr_state_flags & VRSF_REQ_START) != 0);
}

/*
 * Check if the vring worker thread should bail out.  This will heed
 * indications that the containing process is exiting, as well as requests to
 * stop or pause the ring.  The `stop_only` parameter controls whether pause
 * requests are ignored (true) or checked (false).
 *
 * Caller should hold vr_lock.
 */
static bool
vring_need_bail_ext(const viona_vring_t *ring, bool stop_only)
{
	ASSERT(MUTEX_HELD(&ring->vr_lock));

	if (vring_stop_req(ring) ||
	    (!stop_only && vring_pause_req(ring))) {
		return (true);
	}

	kthread_t *t = ring->vr_worker_thread;
	if (t != NULL) {
		proc_t *p = ttoproc(t);

		ASSERT(p != NULL);
		if ((p->p_flag & SEXITING) != 0) {
			return (true);
		}
	}
	return (false);
}

bool
vring_need_bail(const viona_vring_t *ring)
{
	return (vring_need_bail_ext(ring, false));
}

int
viona_ring_pause(viona_vring_t *ring)
{
	mutex_enter(&ring->vr_lock);
	switch (ring->vr_state) {
	case VRS_RESET:
	case VRS_SETUP:
	case VRS_INIT:
		/*
		 * For rings which have not yet started (even those in the
		 * VRS_SETUP and VRS_INIT phases, where there is a running
		 * worker thread waiting to be released to do its intended
		 * task), it is adequate to simply clear any start request to
		 * keep them from proceeding into the actual work processing
		 * function.
		 */
		ring->vr_state_flags &= ~VRSF_REQ_START;
		mutex_exit(&ring->vr_lock);
		return (0);

	case VRS_STOP:
		if ((ring->vr_state_flags & VRSF_REQ_STOP) != 0) {
			/* A ring on its way to RESET cannot be paused. */
			mutex_exit(&ring->vr_lock);
			return (EBUSY);
		}
		/* FALLTHROUGH */
	case VRS_RUN:
		ring->vr_state_flags |= VRSF_REQ_PAUSE;
		cv_broadcast(&ring->vr_cv);
		break;

	default:
		panic("invalid ring state %d", ring->vr_state);
		break;
	}

	for (;;) {
		int res = cv_wait_sig(&ring->vr_cv, &ring->vr_lock);

		if (ring->vr_state == VRS_INIT ||
		    (ring->vr_state_flags & VRSF_REQ_PAUSE) == 0) {
			/* Ring made it to (or through) paused state */
			mutex_exit(&ring->vr_lock);
			return (0);
		}
		if (res == 0) {
			/* Interrupted by a signal */
			mutex_exit(&ring->vr_lock);
			return (EINTR);
		}
	}
	/* NOTREACHED */
}

static void
viona_worker(void *arg)
{
	viona_vring_t *ring = (viona_vring_t *)arg;
	viona_link_t *link = ring->vr_link;

	mutex_enter(&ring->vr_lock);
	VERIFY3U(ring->vr_state, ==, VRS_SETUP);

	/* Bail immediately if ring shutdown or process exit was requested */
	if (vring_need_bail_ext(ring, true)) {
		goto ring_reset;
	}

	/* Report worker thread as alive and notify creator */
ring_init:
	ring->vr_state = VRS_INIT;
	cv_broadcast(&ring->vr_cv);

	while (!vring_start_req(ring)) {
		/*
		 * Keeping lease renewals timely while waiting for the ring to
		 * be started is important for avoiding deadlocks.
		 */
		if (vmm_drv_lease_expired(ring->vr_lease)) {
			if (!viona_ring_lease_renew(ring)) {
				goto ring_reset;
			}
		}

		(void) cv_wait_sig(&ring->vr_cv, &ring->vr_lock);

		if (vring_pause_req(ring)) {
			/* We are already paused in the INIT state. */
			ring->vr_state_flags &= ~VRSF_REQ_PAUSE;
		}
		if (vring_need_bail_ext(ring, true)) {
			goto ring_reset;
		}
	}

	ASSERT((ring->vr_state_flags & VRSF_REQ_START) != 0);
	ring->vr_state = VRS_RUN;
	ring->vr_state_flags &= ~VRSF_REQ_START;
	viona_ring_mark_dirty(ring);

	/* Ensure ring lease is valid first */
	if (vmm_drv_lease_expired(ring->vr_lease)) {
		if (!viona_ring_lease_renew(ring)) {
			goto ring_reset;
		}
	}

	/* Process actual work */
	if (ring == &link->l_vrings[VIONA_VQ_RX]) {
		viona_worker_rx(ring, link);
	} else if (ring == &link->l_vrings[VIONA_VQ_TX]) {
		viona_worker_tx(ring, link);
	} else {
		panic("unexpected ring: %p", (void *)ring);
	}

	VERIFY3U(ring->vr_state, ==, VRS_STOP);
	VERIFY3U(ring->vr_xfer_outstanding, ==, 0);

	/*
	 * Consolidate stats data so that it is not lost if/when this ring is
	 * being stopped.
	 */
	viona_ring_consolidate_stats(ring);

	/* Respond to a pause request if the ring is not required to stop */
	if (vring_pause_req(ring)) {
		ring->vr_state_flags &= ~VRSF_REQ_PAUSE;

		if (vring_need_bail_ext(ring, true)) {
			goto ring_reset;
		}

		/*
		 * To complete pausing of the ring, unmap and re-map the pages
		 * underpinning the virtqueue.  This is to synchronize their
		 * dirty state in the backing page tables and restore the
		 * defer-dirty state on the held pages.
		 */
		viona_ring_unmap(ring);
		if (viona_ring_map(ring, true)) {
			goto ring_init;
		}

		/*
		 * If the ring pages failed to be mapped, fallthrough to
		 * ring-reset like any other failure.
		 */
	}

ring_reset:
	viona_ring_misc_free(ring);

	viona_ring_lease_drop(ring);
	ring->vr_cur_aidx = 0;
	ring->vr_size = 0;
	ring->vr_mask = 0;
	ring->vr_pa = 0;
	ring->vr_state = VRS_RESET;
	ring->vr_state_flags = 0;
	ring->vr_worker_thread = NULL;
	cv_broadcast(&ring->vr_cv);
	mutex_exit(&ring->vr_lock);

	mutex_enter(&ttoproc(curthread)->p_lock);
	lwp_exit();
}

static kthread_t *
viona_create_worker(viona_vring_t *ring)
{
	k_sigset_t hold_set;
	proc_t *p = curproc;
	kthread_t *t;
	klwp_t *lwp;

	ASSERT(MUTEX_HELD(&ring->vr_lock));
	ASSERT(ring->vr_state == VRS_RESET);

	sigfillset(&hold_set);
	lwp = lwp_create(viona_worker, (void *)ring, 0, p, TS_STOPPED,
	    minclsyspri - 1, &hold_set, curthread->t_cid, 0);
	if (lwp == NULL) {
		return (NULL);
	}

	t = lwptot(lwp);
	mutex_enter(&p->p_lock);
	t->t_proc_flag = (t->t_proc_flag & ~TP_HOLDLWP) | TP_KTHREAD;
	lwp_create_done(t);
	mutex_exit(&p->p_lock);

	return (t);
}

void
vq_read_desc(viona_vring_t *ring, uint16_t idx, struct virtio_desc *descp)
{
	const uint_t entry_off = idx * sizeof (struct virtio_desc);

	ASSERT3U(idx, <, ring->vr_size);

	bcopy(viona_ring_addr(ring, entry_off), descp, sizeof (*descp));
}

static uint16_t
vq_read_avail(viona_vring_t *ring, uint16_t idx)
{
	ASSERT3U(idx, <, ring->vr_size);

	volatile uint16_t *avail_ent =
	    viona_ring_addr(ring, LEGACY_AVAIL_ENT_OFF(ring->vr_size, idx));
	return (*avail_ent);
}

/*
 * Given a buffer descriptor `desc`, attempt to map the pages backing that
 * region of guest physical memory, taking into account that there are no
 * guarantees about guest-contiguous pages being host-contiguous.
 */
static int
vq_map_desc_bufs(viona_vring_t *ring, const struct virtio_desc *desc,
    vq_held_region_t *region)
{
	int err;

	if (desc->vd_len == 0) {
		VIONA_PROBE2(desc_bad_len, viona_vring_t *, ring,
		    uint32_t, desc->vd_len);
		VIONA_RING_STAT_INCR(ring, desc_bad_len);
		return (EINVAL);
	}

	err = vq_region_hold(ring, desc->vd_addr, desc->vd_len,
	    (desc->vd_flags & VRING_DESC_F_WRITE) != 0, region);
	switch (err) {
	case E2BIG:
		VIONA_PROBE1(too_many_desc, viona_vring_t *, ring);
		VIONA_RING_STAT_INCR(ring, too_many_desc);
		break;
	case EFAULT:
		VIONA_PROBE_BAD_RING_ADDR(ring, desc->vd_addr);
		VIONA_RING_STAT_INCR(ring, bad_ring_addr);
		break;
	default:
		break;
	}

	return (err);
}

/*
 * Walk an indirect buffer descriptor `desc`, attempting to map the pages
 * backing the regions of guest memory covered by its constituent descriptors.
 */
static int
vq_map_indir_desc_bufs(viona_vring_t *ring, const struct virtio_desc *desc,
    vq_held_region_t *region)
{
	const uint16_t indir_count = desc->vd_len / sizeof (struct virtio_desc);

	if ((desc->vd_len & 0xf) != 0 || indir_count == 0 ||
	    indir_count > ring->vr_size ||
	    desc->vd_addr > (desc->vd_addr + desc->vd_len)) {
		VIONA_PROBE2(indir_bad_len, viona_vring_t *, ring,
		    uint32_t, desc->vd_len);
		VIONA_RING_STAT_INCR(ring, indir_bad_len);
		return (EINVAL);
	}

	uint16_t indir_next = 0;
	const uint8_t *buf = NULL;
	uint64_t buf_gpa = UINT64_MAX;
	vmm_page_t *vmp = NULL;
	int err = 0;

	for (;;) {
		uint64_t indir_gpa =
		    desc->vd_addr + (indir_next * sizeof (struct virtio_desc));
		uint64_t indir_page = indir_gpa & PAGEMASK;
		struct virtio_desc vp;

		/*
		 * Get a mapping for the page that the next indirect descriptor
		 * resides in, if it has not already been done.
		 */
		if (indir_page != buf_gpa) {
			if (vmp != NULL) {
				vmm_drv_page_release(vmp);
			}
			vmp = vq_page_hold(ring, indir_page, false);
			if (vmp == NULL) {
				VIONA_PROBE_BAD_RING_ADDR(ring, indir_page);
				VIONA_RING_STAT_INCR(ring, bad_ring_addr);
				err = EFAULT;
				break;
			}
			buf_gpa = indir_page;
			buf = vmm_drv_page_readable(vmp);
		}

		/*
		 * A copy of the indirect descriptor is made here, rather than
		 * simply using a reference pointer.  This prevents malicious
		 * or erroneous guest writes to the descriptor from fooling
		 * the flags/bounds verification through a race.
		 */
		bcopy(buf + (indir_gpa - indir_page), &vp, sizeof (vp));

		if (vp.vd_flags & VRING_DESC_F_INDIRECT) {
			VIONA_PROBE1(indir_bad_nest, viona_vring_t *, ring);
			VIONA_RING_STAT_INCR(ring, indir_bad_nest);
			err = EINVAL;
			break;
		} else if (vp.vd_len == 0) {
			VIONA_PROBE2(desc_bad_len, viona_vring_t *, ring,
			    uint32_t, vp.vd_len);
			VIONA_RING_STAT_INCR(ring, desc_bad_len);
			err = EINVAL;
			break;
		}

		err = vq_map_desc_bufs(ring, &vp, region);
		if (err != 0) {
			break;
		}

		/* Successfully reached the end of the indir chain */
		if ((vp.vd_flags & VRING_DESC_F_NEXT) == 0) {
			break;
		}
		if (region->vhr_idx >= region->vhr_niov) {
			VIONA_PROBE1(too_many_desc, viona_vring_t *, ring);
			VIONA_RING_STAT_INCR(ring, too_many_desc);
			err = E2BIG;
			break;
		}

		indir_next = vp.vd_next;
		if (indir_next >= indir_count) {
			VIONA_PROBE3(indir_bad_next, viona_vring_t *, ring,
			    uint16_t, indir_next, uint16_t, indir_count);
			VIONA_RING_STAT_INCR(ring, indir_bad_next);
			err = EINVAL;
			break;
		}
	}

	if (vmp != NULL) {
		vmm_drv_page_release(vmp);
	}
	return (err);
}

int
vq_popchain(viona_vring_t *ring, struct iovec *iov, uint_t niov,
    uint16_t *cookie, vmm_page_t **chain)
{
	uint16_t ndesc, idx, head, next;
	struct virtio_desc vdir;
	vq_held_region_t region = {
		.vhr_niov = niov,
		.vhr_iov = iov,
	};

	ASSERT(iov != NULL);
	ASSERT(niov > 0 && niov < INT_MAX);
	ASSERT(*chain == NULL);

	mutex_enter(&ring->vr_a_mutex);
	idx = ring->vr_cur_aidx;
	ndesc = viona_ring_num_avail(ring);

	if (ndesc == 0) {
		mutex_exit(&ring->vr_a_mutex);
		return (0);
	}
	if (ndesc > ring->vr_size) {
		/*
		 * Despite the fact that the guest has provided an 'avail_idx'
		 * which indicates that an impossible number of descriptors are
		 * available, continue on and attempt to process the next one.
		 *
		 * The transgression will not escape the probe or stats though.
		 */
		VIONA_PROBE2(ndesc_too_high, viona_vring_t *, ring,
		    uint16_t, ndesc);
		VIONA_RING_STAT_INCR(ring, ndesc_too_high);
	}

	head = vq_read_avail(ring, idx & ring->vr_mask);
	next = head;

	for (region.vhr_idx = 0; region.vhr_idx < niov; next = vdir.vd_next) {
		if (next >= ring->vr_size) {
			VIONA_PROBE2(bad_idx, viona_vring_t *, ring,
			    uint16_t, next);
			VIONA_RING_STAT_INCR(ring, bad_idx);
			break;
		}

		vq_read_desc(ring, next, &vdir);
		if ((vdir.vd_flags & VRING_DESC_F_INDIRECT) == 0) {
			if (vq_map_desc_bufs(ring, &vdir, &region) != 0) {
				break;
			}
		} else {
			/*
			 * Per the specification (Virtio 1.1 S2.6.5.3.1):
			 *   A driver MUST NOT set both VIRTQ_DESC_F_INDIRECT
			 *   and VIRTQ_DESC_F_NEXT in `flags`.
			 */
			if ((vdir.vd_flags & VRING_DESC_F_NEXT) != 0) {
				VIONA_PROBE3(indir_bad_next,
				    viona_vring_t *, ring,
				    uint16_t, next, uint16_t, 0);
				VIONA_RING_STAT_INCR(ring, indir_bad_next);
				break;
			}

			if (vq_map_indir_desc_bufs(ring, &vdir, &region) != 0) {
				break;
			}
		}

		if ((vdir.vd_flags & VRING_DESC_F_NEXT) == 0) {
			ring->vr_cur_aidx++;
			mutex_exit(&ring->vr_a_mutex);

			*cookie = head;
			*chain = region.vhr_head;
			return (region.vhr_idx);
		}
	}

	mutex_exit(&ring->vr_a_mutex);
	if (region.vhr_head != NULL) {
		/*
		 * If any pages were held prior to encountering an error, we
		 * must release them now.
		 */
		vmm_drv_page_release_chain(region.vhr_head);
	}
	return (-1);
}


static void
vq_write_used_ent(viona_vring_t *ring, uint16_t idx, uint16_t cookie,
    uint32_t len)
{
	/*
	 * In a larger ring, an entry could be split across pages, so be sure
	 * to account for that when configuring the transfer by looking up the
	 * ID and length addresses separately, rather than an address for a
	 * combined `struct virtio_used`.
	 */
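	/*
	 * For example (assuming a PAGESIZE of 4096): in a 4096-entry ring,
	 * used entry 511 has its ID in the last 4 bytes of one ring page and
	 * its length in the first 4 bytes of the next.
	 */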
	const uint_t used_id_off = LEGACY_USED_ENT_OFF(ring->vr_size, idx);
	const uint_t used_len_off = used_id_off + sizeof (uint32_t);
	volatile uint32_t *idp = viona_ring_addr(ring, used_id_off);
	volatile uint32_t *lenp = viona_ring_addr(ring, used_len_off);

	ASSERT(MUTEX_HELD(&ring->vr_u_mutex));

	*idp = cookie;
	*lenp = len;
}

static void
vq_write_used_idx(viona_vring_t *ring, uint16_t idx)
{
	ASSERT(MUTEX_HELD(&ring->vr_u_mutex));

	volatile uint16_t *used_idx =
	    viona_ring_addr(ring, LEGACY_USED_IDX_OFF(ring->vr_size));
	*used_idx = idx;
}

void
vq_pushchain(viona_vring_t *ring, uint32_t len, uint16_t cookie)
{
	uint16_t uidx;

	mutex_enter(&ring->vr_u_mutex);

	uidx = ring->vr_cur_uidx;
	vq_write_used_ent(ring, uidx & ring->vr_mask, cookie, len);
	uidx++;
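	/*
	 * Ensure the store of the used entry is visible before the guest can
	 * observe the updated used index.
	 */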
	membar_producer();

	vq_write_used_idx(ring, uidx);
	ring->vr_cur_uidx = uidx;

	mutex_exit(&ring->vr_u_mutex);
}

void
vq_pushchain_many(viona_vring_t *ring, uint_t num_bufs, used_elem_t *elem)
{
	uint16_t uidx;

	mutex_enter(&ring->vr_u_mutex);

	uidx = ring->vr_cur_uidx;

	for (uint_t i = 0; i < num_bufs; i++, uidx++) {
		vq_write_used_ent(ring, uidx & ring->vr_mask, elem[i].id,
		    elem[i].len);
	}

	membar_producer();
	vq_write_used_idx(ring, uidx);
	ring->vr_cur_uidx = uidx;

	mutex_exit(&ring->vr_u_mutex);
}

/*
 * Set USED_NO_NOTIFY on VQ so guest elides doorbell calls for new entries.
 */
void
viona_ring_disable_notify(viona_vring_t *ring)
{
	volatile uint16_t *used_flags =
	    viona_ring_addr(ring, LEGACY_USED_FLAGS_OFF(ring->vr_size));

	*used_flags |= VRING_USED_F_NO_NOTIFY;
}

/*
 * Clear USED_NO_NOTIFY on VQ so guest resumes doorbell calls for new entries.
 */
void
viona_ring_enable_notify(viona_vring_t *ring)
{
	volatile uint16_t *used_flags =
	    viona_ring_addr(ring, LEGACY_USED_FLAGS_OFF(ring->vr_size));

	*used_flags &= ~VRING_USED_F_NO_NOTIFY;
}

/*
 * Return the number of available descriptors in the vring taking care of the
 * 16-bit index wraparound.
 *
 * Note: If the number of apparently available descriptors is larger than the
 * ring size (due to guest misbehavior), this check will still report the
 * positive count of descriptors.
 */
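/*
 * For example, an avail_idx of 3 with a vr_cur_aidx of 65534 yields
 * (uint16_t)(3 - 65534) == 5 descriptors, since both indexes wrap modulo
 * 65536.
 */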
uint16_t
viona_ring_num_avail(viona_vring_t *ring)
{
	volatile uint16_t *avail_idx =
	    viona_ring_addr(ring, LEGACY_AVAIL_IDX_OFF(ring->vr_size));

	return (*avail_idx - ring->vr_cur_aidx);
}

/* Record a successfully transferred packet for the ring stats */
void
viona_ring_stat_accept(viona_vring_t *ring, uint32_t len)
{
	atomic_inc_64(&ring->vr_stats.vts_packets);
	atomic_add_64(&ring->vr_stats.vts_bytes, len);
}

/*
 * Record a dropped packet in the ring stats
 */
void
viona_ring_stat_drop(viona_vring_t *ring)
{
	atomic_inc_64(&ring->vr_stats.vts_drops);
}

/*
 * Record a packet transfer error in the ring stats
 */
void
viona_ring_stat_error(viona_vring_t *ring)
{
	atomic_inc_64(&ring->vr_stats.vts_errors);
}

/*
 * Consolidate statistic data for this ring into the totals for the link
 */
static void
viona_ring_consolidate_stats(viona_vring_t *ring)
{
	viona_link_t *link = ring->vr_link;
	struct viona_transfer_stats *lstat =
	    (ring == &link->l_vrings[VIONA_VQ_RX]) ?
	    &link->l_stats.vls_rx : &link->l_stats.vls_tx;

	mutex_enter(&link->l_stats_lock);
	lstat->vts_packets += ring->vr_stats.vts_packets;
	lstat->vts_bytes += ring->vr_stats.vts_bytes;
	lstat->vts_drops += ring->vr_stats.vts_drops;
	lstat->vts_errors += ring->vr_stats.vts_errors;
	bzero(&ring->vr_stats, sizeof (ring->vr_stats));
	mutex_exit(&link->l_stats_lock);
}