/*
 * Copyright (c) 2013 Chris Torek <torek @ torek net>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source. A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 *
 * Copyright 2015 Pluribus Networks Inc.
 * Copyright 2019 Joyent, Inc.
 * Copyright 2022 Oxide Computer Company
 */


#include <sys/disp.h>

#include "viona_impl.h"

#define VRING_MAX_LEN 32768

/* Layout and sizing as defined in the spec for a legacy-style virtqueue */

#define LEGACY_VQ_ALIGN PAGESIZE

#define LEGACY_DESC_SZ(qsz) ((qsz) * sizeof (struct virtio_desc))
/*
 * Available ring consists of flags (uint16_t), avail_idx (uint16_t), qsz avail
 * descriptors (uint16_t each), and (optional) used_event (uint16_t).
 */
#define LEGACY_AVAIL_SZ(qsz) (((qsz) + 3) * sizeof (uint16_t))
/*
 * Used ring consists of flags (uint16_t), used_idx (uint16_t), qsz used
 * descriptors (two uint32_t each), and (optional) avail_event (uint16_t).
 */
#define LEGACY_USED_SZ(qsz) \
	((qsz) * sizeof (struct virtio_used) + 3 * sizeof (uint16_t))

#define LEGACY_AVAIL_FLAGS_OFF(qsz)	LEGACY_DESC_SZ(qsz)
#define LEGACY_AVAIL_IDX_OFF(qsz) \
	(LEGACY_DESC_SZ(qsz) + sizeof (uint16_t))
#define LEGACY_AVAIL_ENT_OFF(qsz, idx) \
	(LEGACY_DESC_SZ(qsz) + (2 + (idx)) * sizeof (uint16_t))

#define LEGACY_USED_FLAGS_OFF(qsz) \
	P2ROUNDUP(LEGACY_DESC_SZ(qsz) + LEGACY_AVAIL_SZ(qsz), LEGACY_VQ_ALIGN)
#define LEGACY_USED_IDX_OFF(qsz) \
	(LEGACY_USED_FLAGS_OFF(qsz) + sizeof (uint16_t))
#define LEGACY_USED_ENT_OFF(qsz, idx) \
	(LEGACY_USED_FLAGS_OFF(qsz) + 2 * sizeof (uint16_t) + \
	(idx) * sizeof (struct virtio_used))

#define LEGACY_VQ_SIZE(qsz) \
	(LEGACY_USED_FLAGS_OFF(qsz) + \
	P2ROUNDUP(LEGACY_USED_SZ(qsz), LEGACY_VQ_ALIGN))
#define LEGACY_VQ_PAGES(qsz) (LEGACY_VQ_SIZE(qsz) / PAGESIZE)
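
/*
 * For illustration (assuming PAGESIZE is 4096, 16-byte legacy descriptors,
 * and 8-byte used-ring entries), a queue of 256 entries lays out as:
 *   descriptor table:  256 * 16 = 4096 bytes
 *   available ring:    (256 + 3) * 2 = 518 bytes
 *   used ring offset:  P2ROUNDUP(4096 + 518, 4096) = 8192
 *   used ring:         256 * 8 + 3 * 2 = 2054 bytes
 *   total:             8192 + P2ROUNDUP(2054, 4096) = 12288 bytes (3 pages)
 */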

struct vq_held_region {
	struct iovec *vhr_iov;
	vmm_page_t *vhr_head;
	vmm_page_t *vhr_tail;
	/* Length of iovec array supplied in `vhr_iov` */
	uint_t vhr_niov;
	/*
	 * Index into vhr_iov, indicating the next "free" entry (following the
	 * last entry which has valid contents).
	 */
	uint_t vhr_idx;
};
typedef struct vq_held_region vq_held_region_t;

static bool viona_ring_map(viona_vring_t *, bool);
static void viona_ring_unmap(viona_vring_t *);
static kthread_t *viona_create_worker(viona_vring_t *);

static vmm_page_t *
vq_page_hold(viona_vring_t *ring, uint64_t gpa, bool writable)
{
	ASSERT3P(ring->vr_lease, !=, NULL);

	int prot = PROT_READ;
	if (writable) {
		prot |= PROT_WRITE;
	}

	return (vmm_drv_page_hold(ring->vr_lease, gpa, prot));
}

/*
 * Establish a hold on the page(s) which back the region of guest memory
 * covered by [gpa, gpa + len).  The host-kernel-virtual pointers to those
 * pages are stored in the iovec array supplied in `region`, along with the
 * chain of vmm_page_t entries representing the held pages.  Since guest
 * memory carries no guarantees of being physically contiguous (on the host),
 * it is assumed that an iovec entry will be required for each PAGESIZE
 * section covered by the specified `gpa` and `len` range.  For each iovec
 * entry successfully populated by holding a page, `vhr_idx` will be
 * incremented so it references the next available iovec entry (or
 * `vhr_niov`, if the iovec array is full).  The responsibility for releasing
 * the `vmm_page_t` chain (stored in `vhr_head` and `vhr_tail`) resides with
 * the caller, regardless of the result.
 */
static int
vq_region_hold(viona_vring_t *ring, uint64_t gpa, uint32_t len,
    bool writable, vq_held_region_t *region)
{
	const uint32_t front_offset = gpa & PAGEOFFSET;
	const uint32_t front_len = MIN(len, PAGESIZE - front_offset);
	uint_t pages = 1;
	vmm_page_t *vmp;
	caddr_t buf;

	ASSERT3U(region->vhr_idx, <, region->vhr_niov);

	if (front_len < len) {
		pages += P2ROUNDUP((uint64_t)(len - front_len),
		    PAGESIZE) / PAGESIZE;
	}
	if (pages > (region->vhr_niov - region->vhr_idx)) {
		return (E2BIG);
	}

	vmp = vq_page_hold(ring, gpa & PAGEMASK, writable);
	if (vmp == NULL) {
		return (EFAULT);
	}
	buf = (caddr_t)vmm_drv_page_readable(vmp);

	region->vhr_iov[region->vhr_idx].iov_base = buf + front_offset;
	region->vhr_iov[region->vhr_idx].iov_len = front_len;
	region->vhr_idx++;
	gpa += front_len;
	len -= front_len;
	if (region->vhr_head == NULL) {
		region->vhr_head = vmp;
		region->vhr_tail = vmp;
	} else {
		vmm_drv_page_chain(region->vhr_tail, vmp);
		region->vhr_tail = vmp;
	}

	for (uint_t i = 1; i < pages; i++) {
		ASSERT3U(gpa & PAGEOFFSET, ==, 0);

		vmp = vq_page_hold(ring, gpa, writable);
		if (vmp == NULL) {
			return (EFAULT);
		}
		buf = (caddr_t)vmm_drv_page_readable(vmp);

		const uint32_t chunk_len = MIN(len, PAGESIZE);
		region->vhr_iov[region->vhr_idx].iov_base = buf;
		region->vhr_iov[region->vhr_idx].iov_len = chunk_len;
		region->vhr_idx++;
		gpa += chunk_len;
		len -= chunk_len;
		vmm_drv_page_chain(region->vhr_tail, vmp);
		region->vhr_tail = vmp;
	}

	return (0);
}

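/*
 * Callback invoked when the vmm lease backing this ring is due to expire:
 * wake any thread waiting on the ring so it can notice the lapsed lease.
 */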
static boolean_t
viona_ring_lease_expire_cb(void *arg)
{
	viona_vring_t *ring = arg;

	mutex_enter(&ring->vr_lock);
	cv_broadcast(&ring->vr_cv);
	mutex_exit(&ring->vr_lock);

	/* The lease will be broken asynchronously. */
	return (B_FALSE);
}

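/*
 * Drop the current lease (if any) held against the VM, tearing down the ring
 * mappings which depended upon it.
 */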
static void
viona_ring_lease_drop(viona_vring_t *ring)
{
	ASSERT(MUTEX_HELD(&ring->vr_lock));

	if (ring->vr_lease != NULL) {
		vmm_hold_t *hold = ring->vr_link->l_vm_hold;

		ASSERT(hold != NULL);

		/*
		 * Without an active lease, the ring mappings cannot be
		 * considered valid.
		 */
		viona_ring_unmap(ring);

		vmm_drv_lease_break(hold, ring->vr_lease);
		ring->vr_lease = NULL;
	}
}

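/*
 * Attempt to acquire a fresh lease against the VM, re-establishing the ring
 * mappings if the ring is configured.  Returns B_TRUE on success.
 */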
boolean_t
viona_ring_lease_renew(viona_vring_t *ring)
{
	vmm_hold_t *hold = ring->vr_link->l_vm_hold;

	ASSERT(hold != NULL);
	ASSERT(MUTEX_HELD(&ring->vr_lock));

	viona_ring_lease_drop(ring);

	/*
	 * Lease renewal will fail if the VM has requested that all holds be
	 * cleaned up.
	 */
	ring->vr_lease = vmm_drv_lease_sign(hold, viona_ring_lease_expire_cb,
	    ring);
	if (ring->vr_lease != NULL) {
		/* A ring undergoing renewal will need valid guest mappings */
		if (ring->vr_pa != 0 && ring->vr_size != 0) {
			/*
			 * If new mappings cannot be established, consider the
			 * lease renewal a failure.
			 */
			if (!viona_ring_map(ring, ring->vr_state == VRS_INIT)) {
				viona_ring_lease_drop(ring);
				return (B_FALSE);
			}
		}
	}
	return (ring->vr_lease != NULL);
}

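/* Associate a ring with its parent link and initialize its locks and CV. */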
void
viona_ring_alloc(viona_link_t *link, viona_vring_t *ring)
{
	ring->vr_link = link;
	mutex_init(&ring->vr_lock, NULL, MUTEX_DRIVER, NULL);
	cv_init(&ring->vr_cv, NULL, CV_DRIVER, NULL);
	mutex_init(&ring->vr_a_mutex, NULL, MUTEX_DRIVER, NULL);
	mutex_init(&ring->vr_u_mutex, NULL, MUTEX_DRIVER, NULL);
}

static void
viona_ring_misc_free(viona_vring_t *ring)
{
	const uint_t qsz = ring->vr_size;

	viona_tx_ring_free(ring, qsz);
}

void
viona_ring_free(viona_vring_t *ring)
{
	mutex_destroy(&ring->vr_lock);
	cv_destroy(&ring->vr_cv);
	mutex_destroy(&ring->vr_a_mutex);
	mutex_destroy(&ring->vr_u_mutex);
	ring->vr_link = NULL;
}

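/*
 * Initialize a ring with the provided size/address/index parameters: validate
 * them, acquire a lease against the VM, map the virtqueue pages, and create
 * the worker thread which will carry out the ring processing.
 */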
int
viona_ring_init(viona_link_t *link, uint16_t idx,
    const struct viona_ring_params *params)
{
	viona_vring_t *ring;
	kthread_t *t;
	int err = 0;
	const uint16_t qsz = params->vrp_size;
	const uint64_t pa = params->vrp_pa;

	if (idx >= VIONA_VQ_MAX) {
		return (EINVAL);
	}

	if (qsz == 0 || qsz > VRING_MAX_LEN || (1 << (ffs(qsz) - 1)) != qsz) {
		return (EINVAL);
	}
	if ((pa & (LEGACY_VQ_ALIGN - 1)) != 0) {
		return (EINVAL);
	}

	ring = &link->l_vrings[idx];
	mutex_enter(&ring->vr_lock);
	if (ring->vr_state != VRS_RESET) {
		mutex_exit(&ring->vr_lock);
		return (EBUSY);
	}
	VERIFY(ring->vr_state_flags == 0);

	ring->vr_lease = NULL;
	if (!viona_ring_lease_renew(ring)) {
		err = EBUSY;
		goto fail;
	}

	ring->vr_size = qsz;
	ring->vr_mask = (ring->vr_size - 1);
	ring->vr_pa = pa;
	if (!viona_ring_map(ring, true)) {
		err = EINVAL;
		goto fail;
	}

	/* Initialize queue indexes */
	ring->vr_cur_aidx = params->vrp_avail_idx;
	ring->vr_cur_uidx = params->vrp_used_idx;

	if (idx == VIONA_VQ_TX) {
		viona_tx_ring_alloc(ring, qsz);
	}

	/* Zero out MSI-X configuration */
	ring->vr_msi_addr = 0;
	ring->vr_msi_msg = 0;

	/* Clear the stats */
	bzero(&ring->vr_stats, sizeof (ring->vr_stats));

	t = viona_create_worker(ring);
	if (t == NULL) {
		err = ENOMEM;
		goto fail;
	}
	ring->vr_worker_thread = t;
	ring->vr_state = VRS_SETUP;
	cv_broadcast(&ring->vr_cv);
	mutex_exit(&ring->vr_lock);
	return (0);

fail:
	viona_ring_lease_drop(ring);
	viona_ring_misc_free(ring);
	ring->vr_size = 0;
	ring->vr_mask = 0;
	ring->vr_pa = 0;
	ring->vr_cur_aidx = 0;
	ring->vr_cur_uidx = 0;
	mutex_exit(&ring->vr_lock);
	return (err);
}

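/* Snapshot the current size, guest PA, and avail/used indexes of a ring. */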
int
viona_ring_get_state(viona_link_t *link, uint16_t idx,
    struct viona_ring_params *params)
{
	viona_vring_t *ring;

	if (idx >= VIONA_VQ_MAX) {
		return (EINVAL);
	}

	ring = &link->l_vrings[idx];
	mutex_enter(&ring->vr_lock);

	params->vrp_size = ring->vr_size;
	params->vrp_pa = ring->vr_pa;

	if (ring->vr_state == VRS_RUN) {
		/* On a running ring, we must heed the avail/used locks */
		mutex_enter(&ring->vr_a_mutex);
		params->vrp_avail_idx = ring->vr_cur_aidx;
		mutex_exit(&ring->vr_a_mutex);
		mutex_enter(&ring->vr_u_mutex);
		params->vrp_used_idx = ring->vr_cur_uidx;
		mutex_exit(&ring->vr_u_mutex);
	} else {
		/* Otherwise vr_lock is adequate protection */
		params->vrp_avail_idx = ring->vr_cur_aidx;
		params->vrp_used_idx = ring->vr_cur_uidx;
	}

	mutex_exit(&ring->vr_lock);

	return (0);
}

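/*
 * Request that a ring be stopped and wait for it to return to the VRS_RESET
 * state.  If `heed_signals` is set, a pending signal will abort the wait with
 * EINTR rather than blocking until the reset completes.
 */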
int
viona_ring_reset(viona_vring_t *ring, boolean_t heed_signals)
{
	mutex_enter(&ring->vr_lock);
	if (ring->vr_state == VRS_RESET) {
		mutex_exit(&ring->vr_lock);
		return (0);
	}

	if ((ring->vr_state_flags & VRSF_REQ_STOP) == 0) {
		ring->vr_state_flags |= VRSF_REQ_STOP;
		cv_broadcast(&ring->vr_cv);
	}
	while (ring->vr_state != VRS_RESET) {
		if (!heed_signals) {
			cv_wait(&ring->vr_cv, &ring->vr_lock);
		} else {
			int rs;

			rs = cv_wait_sig(&ring->vr_cv, &ring->vr_lock);
			if (rs <= 0 && ring->vr_state != VRS_RESET) {
				mutex_exit(&ring->vr_lock);
				return (EINTR);
			}
		}
	}
	mutex_exit(&ring->vr_lock);
	return (0);
}

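/*
 * Establish holds and kernel mappings for the pages making up the virtqueue.
 * With `defer_dirty` set, the holds are taken with VMPF_DEFER_DIRTY so the
 * pages are not immediately considered dirty (see below).
 */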
static bool
viona_ring_map(viona_vring_t *ring, bool defer_dirty)
{
	const uint16_t qsz = ring->vr_size;
	uintptr_t pa = ring->vr_pa;

	ASSERT3U(qsz, !=, 0);
	ASSERT3U(qsz, <=, VRING_MAX_LEN);
	ASSERT3U(pa, !=, 0);
	ASSERT3U(pa & (LEGACY_VQ_ALIGN - 1), ==, 0);
	ASSERT3U(LEGACY_VQ_ALIGN, ==, PAGESIZE);
	ASSERT(MUTEX_HELD(&ring->vr_lock));
	ASSERT3P(ring->vr_map_pages, ==, NULL);

	const uint_t npages = LEGACY_VQ_PAGES(qsz);
	ring->vr_map_pages = kmem_zalloc(npages * sizeof (void *), KM_SLEEP);

	int page_flags = 0;
	if (defer_dirty) {
		/*
		 * During initialization, and when entering the paused state,
		 * the page holds for a virtqueue are established with the
		 * DEFER_DIRTY flag set.
		 *
		 * This prevents those page holds from immediately marking the
		 * underlying pages as dirty, since the viona emulation is not
		 * yet performing any accesses.  Once the ring transitions to
		 * the VRS_RUN state, the held pages will be marked as dirty.
		 *
		 * Any ring mappings performed outside those state conditions,
		 * such as those performed as part of vmm_lease renewal during
		 * steady-state operation, will map the ring pages normally
		 * (as considered immediately dirty).
		 */
		page_flags |= VMPF_DEFER_DIRTY;
	}

	vmm_page_t *prev = NULL;
	for (uint_t i = 0; i < npages; i++, pa += PAGESIZE) {
		vmm_page_t *vmp;

		vmp = vmm_drv_page_hold_ext(ring->vr_lease, pa,
		    PROT_READ | PROT_WRITE, page_flags);
		if (vmp == NULL) {
			viona_ring_unmap(ring);
			return (false);
		}

		/*
		 * Keep the first page as the head of the chain, appending all
		 * subsequent pages to the tail.
		 */
		if (prev == NULL) {
			ring->vr_map_hold = vmp;
		} else {
			vmm_drv_page_chain(prev, vmp);
		}
		prev = vmp;
		ring->vr_map_pages[i] = vmm_drv_page_writable(vmp);
	}

	return (true);
}

static void
viona_ring_mark_dirty(viona_vring_t *ring)
{
	ASSERT(MUTEX_HELD(&ring->vr_lock));
	ASSERT(ring->vr_map_hold != NULL);

	for (vmm_page_t *vp = ring->vr_map_hold; vp != NULL;
	    vp = vmm_drv_page_next(vp)) {
		vmm_drv_page_mark_dirty(vp);
	}
}

static void
viona_ring_unmap(viona_vring_t *ring)
{
	ASSERT(MUTEX_HELD(&ring->vr_lock));

	void **map = ring->vr_map_pages;
	if (map != NULL) {
		const uint_t npages = LEGACY_VQ_PAGES(ring->vr_size);
		kmem_free(map, npages * sizeof (void *));
		ring->vr_map_pages = NULL;

		vmm_drv_page_release_chain(ring->vr_map_hold);
		ring->vr_map_hold = NULL;
	} else {
		ASSERT3P(ring->vr_map_hold, ==, NULL);
	}
}

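/* Translate an offset within the virtqueue into a kernel-virtual address. */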
static inline void *
viona_ring_addr(viona_vring_t *ring, uint_t off)
{
	ASSERT3P(ring->vr_map_pages, !=, NULL);
	ASSERT3U(LEGACY_VQ_SIZE(ring->vr_size), >, off);

	const uint_t page_num = off / PAGESIZE;
	const uint_t page_off = off % PAGESIZE;
	return ((caddr_t)ring->vr_map_pages[page_num] + page_off);
}

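/*
 * Signal that used entries are available on a ring, either by delivering a
 * configured MSI directly or by waking pollers so the notification can be
 * handled in userspace.  Unless `skip_flags_check` is set, the notification
 * is suppressed when the guest has set VRING_AVAIL_F_NO_INTERRUPT.
 */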
void
viona_intr_ring(viona_vring_t *ring, boolean_t skip_flags_check)
{
	if (!skip_flags_check) {
		volatile uint16_t *avail_flags = viona_ring_addr(ring,
		    LEGACY_AVAIL_FLAGS_OFF(ring->vr_size));

		if ((*avail_flags & VRING_AVAIL_F_NO_INTERRUPT) != 0) {
			return;
		}
	}

	mutex_enter(&ring->vr_lock);
	uint64_t addr = ring->vr_msi_addr;
	uint64_t msg = ring->vr_msi_msg;
	mutex_exit(&ring->vr_lock);
	if (addr != 0) {
		/* Deliver the interrupt directly, if so configured... */
		(void) vmm_drv_msi(ring->vr_lease, addr, msg);
	} else {
		/* ... otherwise, leave it to userspace */
		if (atomic_cas_uint(&ring->vr_intr_enabled, 0, 1) == 0) {
			pollwakeup(&ring->vr_link->l_pollhead, POLLRDBAND);
		}
	}
}

static inline bool
vring_stop_req(const viona_vring_t *ring)
{
	return ((ring->vr_state_flags & VRSF_REQ_STOP) != 0);
}

static inline bool
vring_pause_req(const viona_vring_t *ring)
{
	return ((ring->vr_state_flags & VRSF_REQ_PAUSE) != 0);
}

static inline bool
vring_start_req(const viona_vring_t *ring)
{
	return ((ring->vr_state_flags & VRSF_REQ_START) != 0);
}

/*
 * Check if vring worker thread should bail out.  This will heed indications
 * that the containing process is exiting, as well as requests to stop or
 * pause the ring.  The `stop_only` parameter controls if pause requests are
 * ignored (true) or checked (false).
 *
 * Caller should hold vr_lock.
 */
static bool
vring_need_bail_ext(const viona_vring_t *ring, bool stop_only)
{
	ASSERT(MUTEX_HELD(&ring->vr_lock));

	if (vring_stop_req(ring) ||
	    (!stop_only && vring_pause_req(ring))) {
		return (true);
	}

	kthread_t *t = ring->vr_worker_thread;
	if (t != NULL) {
		proc_t *p = ttoproc(t);

		ASSERT(p != NULL);
		if ((p->p_flag & SEXITING) != 0) {
			return (true);
		}
	}
	return (false);
}

bool
vring_need_bail(const viona_vring_t *ring)
{
	return (vring_need_bail_ext(ring, false));
}

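/*
 * Request that a ring pause, waiting until its worker has come to rest in the
 * VRS_INIT state (or the pause request has otherwise been consumed).
 */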
int
viona_ring_pause(viona_vring_t *ring)
{
	mutex_enter(&ring->vr_lock);
	switch (ring->vr_state) {
	case VRS_RESET:
	case VRS_SETUP:
	case VRS_INIT:
		/*
		 * For rings which have not yet started (even those in the
		 * VRS_SETUP and VRS_INIT phases, where there is a running
		 * worker thread waiting to be released to do its intended
		 * task), it is adequate to simply clear any start request to
		 * keep them from proceeding into the actual work-processing
		 * function.
		 */
		ring->vr_state_flags &= ~VRSF_REQ_START;
		mutex_exit(&ring->vr_lock);
		return (0);

	case VRS_STOP:
		if ((ring->vr_state_flags & VRSF_REQ_STOP) != 0) {
			/* A ring on its way to RESET cannot be paused. */
			mutex_exit(&ring->vr_lock);
			return (EBUSY);
		}
		/* FALLTHROUGH */
	case VRS_RUN:
		ring->vr_state_flags |= VRSF_REQ_PAUSE;
		cv_broadcast(&ring->vr_cv);
		break;

	default:
		panic("invalid ring state %d", ring->vr_state);
		break;
	}

	for (;;) {
		int res = cv_wait_sig(&ring->vr_cv, &ring->vr_lock);

		if (ring->vr_state == VRS_INIT ||
		    (ring->vr_state_flags & VRSF_REQ_PAUSE) == 0) {
			/* Ring made it to (or through) paused state */
			mutex_exit(&ring->vr_lock);
			return (0);
		}
		if (res == 0) {
			/* interrupted by signal */
			mutex_exit(&ring->vr_lock);
			return (EINTR);
		}
	}
	/* NOTREACHED */
}

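/*
 * Body of the per-ring worker thread: wait for a start request, mark the ring
 * pages dirty, and hand off to the RX or TX processing loop.  A pause request
 * returns the worker to the VRS_INIT wait; a stop request, pause failure, or
 * process exit tears the ring down to VRS_RESET and exits the thread.
 */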
static void
viona_worker(void *arg)
{
	viona_vring_t *ring = (viona_vring_t *)arg;
	viona_link_t *link = ring->vr_link;

	mutex_enter(&ring->vr_lock);
	VERIFY3U(ring->vr_state, ==, VRS_SETUP);

	/* Bail immediately if ring shutdown or process exit was requested */
	if (vring_need_bail_ext(ring, true)) {
		goto ring_reset;
	}

	/* Report worker thread as alive and notify creator */
ring_init:
	ring->vr_state = VRS_INIT;
	cv_broadcast(&ring->vr_cv);

	while (!vring_start_req(ring)) {
		/*
		 * Keeping lease renewals timely while waiting for the ring to
		 * be started is important for avoiding deadlocks.
		 */
		if (vmm_drv_lease_expired(ring->vr_lease)) {
			if (!viona_ring_lease_renew(ring)) {
				goto ring_reset;
			}
		}

		(void) cv_wait_sig(&ring->vr_cv, &ring->vr_lock);

		if (vring_pause_req(ring)) {
			/* We are already paused in the INIT state. */
			ring->vr_state_flags &= ~VRSF_REQ_PAUSE;
		}
		if (vring_need_bail_ext(ring, true)) {
			goto ring_reset;
		}
	}

	ASSERT((ring->vr_state_flags & VRSF_REQ_START) != 0);
	ring->vr_state = VRS_RUN;
	ring->vr_state_flags &= ~VRSF_REQ_START;
	viona_ring_mark_dirty(ring);

	/* Ensure ring lease is valid first */
	if (vmm_drv_lease_expired(ring->vr_lease)) {
		if (!viona_ring_lease_renew(ring)) {
			goto ring_reset;
		}
	}

	/* Process actual work */
	if (ring == &link->l_vrings[VIONA_VQ_RX]) {
		viona_worker_rx(ring, link);
	} else if (ring == &link->l_vrings[VIONA_VQ_TX]) {
		viona_worker_tx(ring, link);
	} else {
		panic("unexpected ring: %p", (void *)ring);
	}

	VERIFY3U(ring->vr_state, ==, VRS_STOP);
	VERIFY3U(ring->vr_xfer_outstanding, ==, 0);

	/* Respond to a pause request if the ring is not required to stop */
	if (vring_pause_req(ring)) {
		ring->vr_state_flags &= ~VRSF_REQ_PAUSE;

		if (vring_need_bail_ext(ring, true)) {
			goto ring_reset;
		}

		/*
		 * To complete pausing of the ring, unmap and re-map the pages
		 * underpinning the virtqueue.  This is to synchronize their
		 * dirty state in the backing page tables and restore the
		 * defer-dirty state on the held pages.
		 */
		viona_ring_unmap(ring);
		if (viona_ring_map(ring, true)) {
			goto ring_init;
		}

		/*
		 * If the ring pages failed to be mapped, fall through to
		 * ring-reset like any other failure.
		 */
	}

ring_reset:
	viona_ring_misc_free(ring);

	viona_ring_lease_drop(ring);
	ring->vr_cur_aidx = 0;
	ring->vr_size = 0;
	ring->vr_mask = 0;
	ring->vr_pa = 0;
	ring->vr_state = VRS_RESET;
	ring->vr_state_flags = 0;
	ring->vr_worker_thread = NULL;
	cv_broadcast(&ring->vr_cv);
	mutex_exit(&ring->vr_lock);

	mutex_enter(&ttoproc(curthread)->p_lock);
	lwp_exit();
}

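/*
 * Create the worker LWP for a ring within the owning process.  It is created
 * stopped with all signals held, flagged as a kernel-managed thread, and then
 * completed via lwp_create_done().
 */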
static kthread_t *
viona_create_worker(viona_vring_t *ring)
{
	k_sigset_t hold_set;
	proc_t *p = curproc;
	kthread_t *t;
	klwp_t *lwp;

	ASSERT(MUTEX_HELD(&ring->vr_lock));
	ASSERT(ring->vr_state == VRS_RESET);

	sigfillset(&hold_set);
	lwp = lwp_create(viona_worker, (void *)ring, 0, p, TS_STOPPED,
	    minclsyspri - 1, &hold_set, curthread->t_cid, 0);
	if (lwp == NULL) {
		return (NULL);
	}

	t = lwptot(lwp);
	mutex_enter(&p->p_lock);
	t->t_proc_flag = (t->t_proc_flag & ~TP_HOLDLWP) | TP_KTHREAD;
	lwp_create_done(t);
	mutex_exit(&p->p_lock);

	return (t);
}

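/* Copy the descriptor at `idx` out of the ring's descriptor table. */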
void
vq_read_desc(viona_vring_t *ring, uint16_t idx, struct virtio_desc *descp)
{
	const uint_t entry_off = idx * sizeof (struct virtio_desc);

	ASSERT3U(idx, <, ring->vr_size);

	bcopy(viona_ring_addr(ring, entry_off), descp, sizeof (*descp));
}

static uint16_t
vq_read_avail(viona_vring_t *ring, uint16_t idx)
{
	ASSERT3U(idx, <, ring->vr_size);

	volatile uint16_t *avail_ent =
	    viona_ring_addr(ring, LEGACY_AVAIL_ENT_OFF(ring->vr_size, idx));
	return (*avail_ent);
}

/*
 * Given a buffer descriptor `desc`, attempt to map the pages backing that
 * region of guest physical memory, taking into account that there are no
 * guarantees about guest-contiguous pages being host-contiguous.
 */
static int
vq_map_desc_bufs(viona_vring_t *ring, const struct virtio_desc *desc,
    vq_held_region_t *region)
{
	int err;

	if (desc->vd_len == 0) {
		VIONA_PROBE2(desc_bad_len, viona_vring_t *, ring,
		    uint32_t, desc->vd_len);
		VIONA_RING_STAT_INCR(ring, desc_bad_len);
		return (EINVAL);
	}

	err = vq_region_hold(ring, desc->vd_addr, desc->vd_len,
	    (desc->vd_flags & VRING_DESC_F_WRITE) != 0, region);
	switch (err) {
	case E2BIG:
		VIONA_PROBE1(too_many_desc, viona_vring_t *, ring);
		VIONA_RING_STAT_INCR(ring, too_many_desc);
		break;
	case EFAULT:
		VIONA_PROBE_BAD_RING_ADDR(ring, desc->vd_addr);
		VIONA_RING_STAT_INCR(ring, bad_ring_addr);
		break;
	default:
		break;
	}

	return (err);
}

/*
 * Walk an indirect buffer descriptor `desc`, attempting to map the pages
 * backing the regions of guest memory covered by its constituent descriptors.
 */
static int
vq_map_indir_desc_bufs(viona_vring_t *ring, const struct virtio_desc *desc,
    vq_held_region_t *region)
{
	const uint16_t indir_count = desc->vd_len / sizeof (struct virtio_desc);

	if ((desc->vd_len & 0xf) != 0 || indir_count == 0 ||
	    indir_count > ring->vr_size ||
	    desc->vd_addr > (desc->vd_addr + desc->vd_len)) {
		VIONA_PROBE2(indir_bad_len, viona_vring_t *, ring,
		    uint32_t, desc->vd_len);
		VIONA_RING_STAT_INCR(ring, indir_bad_len);
		return (EINVAL);
	}

	uint16_t indir_next = 0;
	const uint8_t *buf = NULL;
	uint64_t buf_gpa = UINT64_MAX;
	vmm_page_t *vmp = NULL;
	int err = 0;

	for (;;) {
		uint64_t indir_gpa =
		    desc->vd_addr + (indir_next * sizeof (struct virtio_desc));
		uint64_t indir_page = indir_gpa & PAGEMASK;
		struct virtio_desc vp;

		/*
		 * Get a mapping for the page that the next indirect descriptor
		 * resides in, if one has not already been established.
		 */
		if (indir_page != buf_gpa) {
			if (vmp != NULL) {
				vmm_drv_page_release(vmp);
			}
			vmp = vq_page_hold(ring, indir_page, false);
			if (vmp == NULL) {
				VIONA_PROBE_BAD_RING_ADDR(ring, indir_page);
				VIONA_RING_STAT_INCR(ring, bad_ring_addr);
				err = EFAULT;
				break;
			}
			buf_gpa = indir_page;
			buf = vmm_drv_page_readable(vmp);
		}

		/*
		 * A copy of the indirect descriptor is made here, rather than
		 * simply using a reference pointer.  This prevents malicious or
		 * erroneous guest writes to the descriptor from fooling the
		 * flags/bounds verification through a race.
		 */
		bcopy(buf + (indir_gpa - indir_page), &vp, sizeof (vp));

		if (vp.vd_flags & VRING_DESC_F_INDIRECT) {
			VIONA_PROBE1(indir_bad_nest, viona_vring_t *, ring);
			VIONA_RING_STAT_INCR(ring, indir_bad_nest);
			err = EINVAL;
			break;
		} else if (vp.vd_len == 0) {
			VIONA_PROBE2(desc_bad_len, viona_vring_t *, ring,
			    uint32_t, vp.vd_len);
			VIONA_RING_STAT_INCR(ring, desc_bad_len);
			err = EINVAL;
			break;
		}

		err = vq_map_desc_bufs(ring, &vp, region);
		if (err != 0) {
			break;
		}

		/* Successfully reached the end of the indirect chain */
		if ((vp.vd_flags & VRING_DESC_F_NEXT) == 0) {
			break;
		}
		if (region->vhr_idx >= region->vhr_niov) {
			VIONA_PROBE1(too_many_desc, viona_vring_t *, ring);
			VIONA_RING_STAT_INCR(ring, too_many_desc);
			err = E2BIG;
			break;
		}

		indir_next = vp.vd_next;
		if (indir_next >= indir_count) {
			VIONA_PROBE3(indir_bad_next, viona_vring_t *, ring,
			    uint16_t, indir_next, uint16_t, indir_count);
			VIONA_RING_STAT_INCR(ring, indir_bad_next);
			err = EINVAL;
			break;
		}
	}

	if (vmp != NULL) {
		vmm_drv_page_release(vmp);
	}
	return (err);
}

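/*
 * Pop the next available descriptor chain off the ring, mapping its buffers
 * into the supplied iovec array.  On success, the number of populated iovec
 * entries is returned, `cookie` is set to the index of the chain head for a
 * later vq_pushchain(), and `chain` receives the list of page holds which the
 * caller must release.  Returns 0 if no descriptors are available and -1 on
 * error.
 */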
int
vq_popchain(viona_vring_t *ring, struct iovec *iov, uint_t niov,
    uint16_t *cookie, vmm_page_t **chain)
{
	uint16_t ndesc, idx, head, next;
	struct virtio_desc vdir;
	vq_held_region_t region = {
		.vhr_niov = niov,
		.vhr_iov = iov,
	};

	ASSERT(iov != NULL);
	ASSERT(niov > 0 && niov < INT_MAX);
	ASSERT(*chain == NULL);

	mutex_enter(&ring->vr_a_mutex);
	idx = ring->vr_cur_aidx;
	ndesc = viona_ring_num_avail(ring);

	if (ndesc == 0) {
		mutex_exit(&ring->vr_a_mutex);
		return (0);
	}
	if (ndesc > ring->vr_size) {
		/*
		 * Despite the fact that the guest has provided an 'avail_idx'
		 * which indicates that an impossible number of descriptors are
		 * available, continue on and attempt to process the next one.
		 *
		 * The transgression will not escape the probe or stats though.
		 */
		VIONA_PROBE2(ndesc_too_high, viona_vring_t *, ring,
		    uint16_t, ndesc);
		VIONA_RING_STAT_INCR(ring, ndesc_too_high);
	}

	head = vq_read_avail(ring, idx & ring->vr_mask);
	next = head;

	for (region.vhr_idx = 0; region.vhr_idx < niov; next = vdir.vd_next) {
		if (next >= ring->vr_size) {
			VIONA_PROBE2(bad_idx, viona_vring_t *, ring,
			    uint16_t, next);
			VIONA_RING_STAT_INCR(ring, bad_idx);
			break;
		}

		vq_read_desc(ring, next, &vdir);
		if ((vdir.vd_flags & VRING_DESC_F_INDIRECT) == 0) {
			if (vq_map_desc_bufs(ring, &vdir, &region) != 0) {
				break;
			}
		} else {
			/*
			 * Per the specification (Virtio 1.1 S2.6.5.3.1):
			 *   A driver MUST NOT set both VIRTQ_DESC_F_INDIRECT
			 *   and VIRTQ_DESC_F_NEXT in `flags`.
			 */
			if ((vdir.vd_flags & VRING_DESC_F_NEXT) != 0) {
				VIONA_PROBE3(indir_bad_next,
				    viona_vring_t *, ring,
				    uint16_t, next, uint16_t, 0);
				VIONA_RING_STAT_INCR(ring, indir_bad_next);
				break;
			}

			if (vq_map_indir_desc_bufs(ring, &vdir, &region) != 0) {
				break;
			}
		}

		if ((vdir.vd_flags & VRING_DESC_F_NEXT) == 0) {
			ring->vr_cur_aidx++;
			mutex_exit(&ring->vr_a_mutex);

			*cookie = head;
			*chain = region.vhr_head;
			return (region.vhr_idx);
		}
	}

	mutex_exit(&ring->vr_a_mutex);
	if (region.vhr_head != NULL) {
		/*
		 * If any pages were held prior to encountering an error, we
		 * must release them now.
		 */
		vmm_drv_page_release_chain(region.vhr_head);
	}
	return (-1);
}


static void
vq_write_used_ent(viona_vring_t *ring, uint16_t idx, uint16_t cookie,
    uint32_t len)
{
	/*
	 * In a larger ring, an entry could be split across pages, so be sure
	 * to account for that when configuring the transfer by looking up the
	 * ID and length addresses separately, rather than an address for a
	 * combined `struct virtio_used`.
	 */
	const uint_t used_id_off = LEGACY_USED_ENT_OFF(ring->vr_size, idx);
	const uint_t used_len_off = used_id_off + sizeof (uint32_t);
	volatile uint32_t *idp = viona_ring_addr(ring, used_id_off);
	volatile uint32_t *lenp = viona_ring_addr(ring, used_len_off);

	ASSERT(MUTEX_HELD(&ring->vr_u_mutex));

	*idp = cookie;
	*lenp = len;
}

static void
vq_write_used_idx(viona_vring_t *ring, uint16_t idx)
{
	ASSERT(MUTEX_HELD(&ring->vr_u_mutex));

	volatile uint16_t *used_idx =
	    viona_ring_addr(ring, LEGACY_USED_IDX_OFF(ring->vr_size));
	*used_idx = idx;
}

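/*
 * Return a single descriptor chain to the guest, publishing `len` bytes
 * written against the chain identified by `cookie` and advancing the used
 * index.
 */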
void
vq_pushchain(viona_vring_t *ring, uint32_t len, uint16_t cookie)
{
	uint16_t uidx;

	mutex_enter(&ring->vr_u_mutex);

	uidx = ring->vr_cur_uidx;
	vq_write_used_ent(ring, uidx & ring->vr_mask, cookie, len);
	uidx++;
	membar_producer();

	vq_write_used_idx(ring, uidx);
	ring->vr_cur_uidx = uidx;

	mutex_exit(&ring->vr_u_mutex);
}

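/*
 * Return multiple descriptor chains to the guest in one pass, publishing all
 * of the used entries before the used index is advanced once.
 */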
void
vq_pushchain_many(viona_vring_t *ring, uint_t num_bufs, used_elem_t *elem)
{
	uint16_t uidx;

	mutex_enter(&ring->vr_u_mutex);

	uidx = ring->vr_cur_uidx;

	for (uint_t i = 0; i < num_bufs; i++, uidx++) {
		vq_write_used_ent(ring, uidx & ring->vr_mask, elem[i].id,
		    elem[i].len);
	}

	membar_producer();
	vq_write_used_idx(ring, uidx);
	ring->vr_cur_uidx = uidx;

	mutex_exit(&ring->vr_u_mutex);
}

/*
 * Set USED_NO_NOTIFY on VQ so guest elides doorbell calls for new entries.
 */
void
viona_ring_disable_notify(viona_vring_t *ring)
{
	volatile uint16_t *used_flags =
	    viona_ring_addr(ring, LEGACY_USED_FLAGS_OFF(ring->vr_size));

	*used_flags |= VRING_USED_F_NO_NOTIFY;
}

/*
 * Clear USED_NO_NOTIFY on VQ so guest resumes doorbell calls for new entries.
 */
void
viona_ring_enable_notify(viona_vring_t *ring)
{
	volatile uint16_t *used_flags =
	    viona_ring_addr(ring, LEGACY_USED_FLAGS_OFF(ring->vr_size));

	*used_flags &= ~VRING_USED_F_NO_NOTIFY;
}

/*
 * Return the number of available descriptors in the vring, taking care of the
 * 16-bit index wraparound.
 *
 * Note: If the number of apparently available descriptors is larger than the
 * ring size (due to guest misbehavior), this check will still report the
 * positive count of descriptors.
 */
uint16_t
viona_ring_num_avail(viona_vring_t *ring)
{
	volatile uint16_t *avail_idx =
	    viona_ring_addr(ring, LEGACY_AVAIL_IDX_OFF(ring->vr_size));

	return (*avail_idx - ring->vr_cur_aidx);
}