1 /* 2 * Copyright (c) 2013 Chris Torek <torek @ torek net> 3 * All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 17 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 24 * SUCH DAMAGE. 25 */ 26 /* 27 * This file and its contents are supplied under the terms of the 28 * Common Development and Distribution License ("CDDL"), version 1.0. 29 * You may only use this file in accordance with the terms of version 30 * 1.0 of the CDDL. 31 * 32 * A full copy of the text of the CDDL should have accompanied this 33 * source. A copy of the CDDL is also available via the Internet at 34 * http://www.illumos.org/license/CDDL. 35 * 36 * Copyright 2015 Pluribus Networks Inc. 37 * Copyright 2019 Joyent, Inc. 38 * Copyright 2022 Oxide Computer Company 39 */ 40 41 42 #include <sys/disp.h> 43 44 #include "viona_impl.h" 45 46 #define VRING_MAX_LEN 32768 47 48 /* Layout and sizing as defined in the spec for a legacy-style virtqueue */ 49 50 #define LEGACY_VQ_ALIGN PAGESIZE 51 52 #define LEGACY_DESC_SZ(qsz) ((qsz) * sizeof (struct virtio_desc)) 53 /* 54 * Available ring consists of avail_idx (uint16_t), flags (uint16_t), qsz avail 55 * descriptors (uint16_t each), and (optional) used_event (uint16_t). 56 */ 57 #define LEGACY_AVAIL_SZ(qsz) (((qsz) + 3) * sizeof (uint16_t)) 58 /* 59 * Used ring consists of used_idx (uint16_t), flags (uint16_t), qsz used 60 * descriptors (two uint32_t each), and (optional) avail_event (uint16_t). 61 */ 62 #define LEGACY_USED_SZ(qsz) \ 63 ((qsz) * sizeof (struct virtio_used) + 3 * sizeof (uint16_t)) 64 65 #define LEGACY_AVAIL_FLAGS_OFF(qsz) LEGACY_DESC_SZ(qsz) 66 #define LEGACY_AVAIL_IDX_OFF(qsz) \ 67 (LEGACY_DESC_SZ(qsz) + sizeof (uint16_t)) 68 #define LEGACY_AVAIL_ENT_OFF(qsz, idx) \ 69 (LEGACY_DESC_SZ(qsz) + (2 + (idx)) * sizeof (uint16_t)) 70 71 #define LEGACY_USED_FLAGS_OFF(qsz) \ 72 P2ROUNDUP(LEGACY_DESC_SZ(qsz) + LEGACY_AVAIL_SZ(qsz), LEGACY_VQ_ALIGN) 73 #define LEGACY_USED_IDX_OFF(qsz) \ 74 (LEGACY_USED_FLAGS_OFF(qsz) + sizeof (uint16_t)) 75 #define LEGACY_USED_ENT_OFF(qsz, idx) \ 76 (LEGACY_USED_FLAGS_OFF(qsz) + 2 * sizeof (uint16_t) + \ 77 (idx) * sizeof (struct virtio_used)) 78 79 #define LEGACY_VQ_SIZE(qsz) \ 80 (LEGACY_USED_FLAGS_OFF(qsz) + \ 81 P2ROUNDUP(LEGACY_USED_SZ(qsz), LEGACY_VQ_ALIGN)) 82 #define LEGACY_VQ_PAGES(qsz) (LEGACY_VQ_SIZE(qsz) / PAGESIZE) 83 84 struct vq_held_region { 85 struct iovec *vhr_iov; 86 vmm_page_t *vhr_head; 87 vmm_page_t *vhr_tail; 88 /* Length of iovec array supplied in `vhr_iov` */ 89 uint_t vhr_niov; 90 /* 91 * Index into vhr_iov, indicating the next "free" entry (following the 92 * last entry which has valid contents). 93 */ 94 uint_t vhr_idx; 95 }; 96 typedef struct vq_held_region vq_held_region_t; 97 98 static bool viona_ring_map(viona_vring_t *, bool); 99 static void viona_ring_unmap(viona_vring_t *); 100 static kthread_t *viona_create_worker(viona_vring_t *); 101 102 static vmm_page_t * 103 vq_page_hold(viona_vring_t *ring, uint64_t gpa, bool writable) 104 { 105 ASSERT3P(ring->vr_lease, !=, NULL); 106 107 int prot = PROT_READ; 108 if (writable) { 109 prot |= PROT_WRITE; 110 } 111 112 return (vmm_drv_page_hold(ring->vr_lease, gpa, prot)); 113 } 114 115 /* 116 * Establish a hold on the page(s) which back the region of guest memory covered 117 * by [gpa, gpa + len). The host-kernel-virtual pointers to those pages are 118 * stored in the iovec array supplied in `region`, along with the chain of 119 * vmm_page_t entries representing the held pages. Since guest memory 120 * carries no guarantees of being physically contiguous (on the host), it is 121 * assumed that an iovec entry will be required for each PAGESIZE section 122 * covered by the specified `gpa` and `len` range. For each iovec entry 123 * successfully populated by holding a page, `vhr_idx` will be incremented so it 124 * references the next available iovec entry (or `vhr_niov`, if the iovec array 125 * is full). The responsibility for releasing the `vmm_page_t` chain (stored in 126 * `vhr_head` and `vhr_tail`) resides with the caller, regardless of the result. 127 */ 128 static int 129 vq_region_hold(viona_vring_t *ring, uint64_t gpa, uint32_t len, 130 bool writable, vq_held_region_t *region) 131 { 132 const uint32_t front_offset = gpa & PAGEOFFSET; 133 const uint32_t front_len = MIN(len, PAGESIZE - front_offset); 134 uint_t pages = 1; 135 vmm_page_t *vmp; 136 caddr_t buf; 137 138 ASSERT3U(region->vhr_idx, <, region->vhr_niov); 139 140 if (front_len < len) { 141 pages += P2ROUNDUP((uint64_t)(len - front_len), 142 PAGESIZE) / PAGESIZE; 143 } 144 if (pages > (region->vhr_niov - region->vhr_idx)) { 145 return (E2BIG); 146 } 147 148 vmp = vq_page_hold(ring, gpa & PAGEMASK, writable); 149 if (vmp == NULL) { 150 return (EFAULT); 151 } 152 buf = (caddr_t)vmm_drv_page_readable(vmp); 153 154 region->vhr_iov[region->vhr_idx].iov_base = buf + front_offset; 155 region->vhr_iov[region->vhr_idx].iov_len = front_len; 156 region->vhr_idx++; 157 gpa += front_len; 158 len -= front_len; 159 if (region->vhr_head == NULL) { 160 region->vhr_head = vmp; 161 region->vhr_tail = vmp; 162 } else { 163 vmm_drv_page_chain(region->vhr_tail, vmp); 164 region->vhr_tail = vmp; 165 } 166 167 for (uint_t i = 1; i < pages; i++) { 168 ASSERT3U(gpa & PAGEOFFSET, ==, 0); 169 170 vmp = vq_page_hold(ring, gpa, writable); 171 if (vmp == NULL) { 172 return (EFAULT); 173 } 174 buf = (caddr_t)vmm_drv_page_readable(vmp); 175 176 const uint32_t chunk_len = MIN(len, PAGESIZE); 177 region->vhr_iov[region->vhr_idx].iov_base = buf; 178 region->vhr_iov[region->vhr_idx].iov_len = chunk_len; 179 region->vhr_idx++; 180 gpa += chunk_len; 181 len -= chunk_len; 182 vmm_drv_page_chain(region->vhr_tail, vmp); 183 region->vhr_tail = vmp; 184 } 185 186 return (0); 187 } 188 189 static boolean_t 190 viona_ring_lease_expire_cb(void *arg) 191 { 192 viona_vring_t *ring = arg; 193 194 mutex_enter(&ring->vr_lock); 195 cv_broadcast(&ring->vr_cv); 196 mutex_exit(&ring->vr_lock); 197 198 /* The lease will be broken asynchronously. */ 199 return (B_FALSE); 200 } 201 202 static void 203 viona_ring_lease_drop(viona_vring_t *ring) 204 { 205 ASSERT(MUTEX_HELD(&ring->vr_lock)); 206 207 if (ring->vr_lease != NULL) { 208 vmm_hold_t *hold = ring->vr_link->l_vm_hold; 209 210 ASSERT(hold != NULL); 211 212 /* 213 * Without an active lease, the ring mappings cannot be 214 * considered valid. 215 */ 216 viona_ring_unmap(ring); 217 218 vmm_drv_lease_break(hold, ring->vr_lease); 219 ring->vr_lease = NULL; 220 } 221 } 222 223 boolean_t 224 viona_ring_lease_renew(viona_vring_t *ring) 225 { 226 vmm_hold_t *hold = ring->vr_link->l_vm_hold; 227 228 ASSERT(hold != NULL); 229 ASSERT(MUTEX_HELD(&ring->vr_lock)); 230 231 viona_ring_lease_drop(ring); 232 233 /* 234 * Lease renewal will fail if the VM has requested that all holds be 235 * cleaned up. 236 */ 237 ring->vr_lease = vmm_drv_lease_sign(hold, viona_ring_lease_expire_cb, 238 ring); 239 if (ring->vr_lease != NULL) { 240 /* A ring undergoing renewal will need valid guest mappings */ 241 if (ring->vr_pa != 0 && ring->vr_size != 0) { 242 /* 243 * If new mappings cannot be established, consider the 244 * lease renewal a failure. 245 */ 246 if (!viona_ring_map(ring, ring->vr_state == VRS_INIT)) { 247 viona_ring_lease_drop(ring); 248 return (B_FALSE); 249 } 250 } 251 } 252 return (ring->vr_lease != NULL); 253 } 254 255 void 256 viona_ring_alloc(viona_link_t *link, viona_vring_t *ring) 257 { 258 ring->vr_link = link; 259 mutex_init(&ring->vr_lock, NULL, MUTEX_DRIVER, NULL); 260 cv_init(&ring->vr_cv, NULL, CV_DRIVER, NULL); 261 mutex_init(&ring->vr_a_mutex, NULL, MUTEX_DRIVER, NULL); 262 mutex_init(&ring->vr_u_mutex, NULL, MUTEX_DRIVER, NULL); 263 } 264 265 static void 266 viona_ring_misc_free(viona_vring_t *ring) 267 { 268 const uint_t qsz = ring->vr_size; 269 270 viona_tx_ring_free(ring, qsz); 271 } 272 273 void 274 viona_ring_free(viona_vring_t *ring) 275 { 276 mutex_destroy(&ring->vr_lock); 277 cv_destroy(&ring->vr_cv); 278 mutex_destroy(&ring->vr_a_mutex); 279 mutex_destroy(&ring->vr_u_mutex); 280 ring->vr_link = NULL; 281 } 282 283 int 284 viona_ring_init(viona_link_t *link, uint16_t idx, 285 const struct viona_ring_params *params) 286 { 287 viona_vring_t *ring; 288 kthread_t *t; 289 int err = 0; 290 const uint16_t qsz = params->vrp_size; 291 const uint64_t pa = params->vrp_pa; 292 293 if (idx >= VIONA_VQ_MAX) { 294 return (EINVAL); 295 } 296 297 if (qsz == 0 || qsz > VRING_MAX_LEN || (1 << (ffs(qsz) - 1)) != qsz) { 298 return (EINVAL); 299 } 300 if ((pa & (LEGACY_VQ_ALIGN - 1)) != 0) { 301 return (EINVAL); 302 } 303 304 ring = &link->l_vrings[idx]; 305 mutex_enter(&ring->vr_lock); 306 if (ring->vr_state != VRS_RESET) { 307 mutex_exit(&ring->vr_lock); 308 return (EBUSY); 309 } 310 VERIFY(ring->vr_state_flags == 0); 311 312 ring->vr_lease = NULL; 313 if (!viona_ring_lease_renew(ring)) { 314 err = EBUSY; 315 goto fail; 316 } 317 318 ring->vr_size = qsz; 319 ring->vr_mask = (ring->vr_size - 1); 320 ring->vr_pa = pa; 321 if (!viona_ring_map(ring, true)) { 322 err = EINVAL; 323 goto fail; 324 } 325 326 /* Initialize queue indexes */ 327 ring->vr_cur_aidx = params->vrp_avail_idx; 328 ring->vr_cur_uidx = params->vrp_used_idx; 329 330 if (idx == VIONA_VQ_TX) { 331 viona_tx_ring_alloc(ring, qsz); 332 } 333 334 /* Zero out MSI-X configuration */ 335 ring->vr_msi_addr = 0; 336 ring->vr_msi_msg = 0; 337 338 /* Clear the stats */ 339 bzero(&ring->vr_stats, sizeof (ring->vr_stats)); 340 341 t = viona_create_worker(ring); 342 if (t == NULL) { 343 err = ENOMEM; 344 goto fail; 345 } 346 ring->vr_worker_thread = t; 347 ring->vr_state = VRS_SETUP; 348 cv_broadcast(&ring->vr_cv); 349 mutex_exit(&ring->vr_lock); 350 return (0); 351 352 fail: 353 viona_ring_lease_drop(ring); 354 viona_ring_misc_free(ring); 355 ring->vr_size = 0; 356 ring->vr_mask = 0; 357 ring->vr_pa = 0; 358 ring->vr_cur_aidx = 0; 359 ring->vr_cur_uidx = 0; 360 mutex_exit(&ring->vr_lock); 361 return (err); 362 } 363 364 int 365 viona_ring_get_state(viona_link_t *link, uint16_t idx, 366 struct viona_ring_params *params) 367 { 368 viona_vring_t *ring; 369 370 if (idx >= VIONA_VQ_MAX) { 371 return (EINVAL); 372 } 373 374 ring = &link->l_vrings[idx]; 375 mutex_enter(&ring->vr_lock); 376 377 params->vrp_size = ring->vr_size; 378 params->vrp_pa = ring->vr_pa; 379 380 if (ring->vr_state == VRS_RUN) { 381 /* On a running ring, we must heed the avail/used locks */ 382 mutex_enter(&ring->vr_a_mutex); 383 params->vrp_avail_idx = ring->vr_cur_aidx; 384 mutex_exit(&ring->vr_a_mutex); 385 mutex_enter(&ring->vr_u_mutex); 386 params->vrp_used_idx = ring->vr_cur_uidx; 387 mutex_exit(&ring->vr_u_mutex); 388 } else { 389 /* Otherwise vr_lock is adequate protection */ 390 params->vrp_avail_idx = ring->vr_cur_aidx; 391 params->vrp_used_idx = ring->vr_cur_uidx; 392 } 393 394 mutex_exit(&ring->vr_lock); 395 396 return (0); 397 } 398 399 int 400 viona_ring_reset(viona_vring_t *ring, boolean_t heed_signals) 401 { 402 mutex_enter(&ring->vr_lock); 403 if (ring->vr_state == VRS_RESET) { 404 mutex_exit(&ring->vr_lock); 405 return (0); 406 } 407 408 if ((ring->vr_state_flags & VRSF_REQ_STOP) == 0) { 409 ring->vr_state_flags |= VRSF_REQ_STOP; 410 cv_broadcast(&ring->vr_cv); 411 } 412 while (ring->vr_state != VRS_RESET) { 413 if (!heed_signals) { 414 cv_wait(&ring->vr_cv, &ring->vr_lock); 415 } else { 416 int rs; 417 418 rs = cv_wait_sig(&ring->vr_cv, &ring->vr_lock); 419 if (rs <= 0 && ring->vr_state != VRS_RESET) { 420 mutex_exit(&ring->vr_lock); 421 return (EINTR); 422 } 423 } 424 } 425 mutex_exit(&ring->vr_lock); 426 return (0); 427 } 428 429 static bool 430 viona_ring_map(viona_vring_t *ring, bool defer_dirty) 431 { 432 const uint16_t qsz = ring->vr_size; 433 uintptr_t pa = ring->vr_pa; 434 435 ASSERT3U(qsz, !=, 0); 436 ASSERT3U(qsz, <=, VRING_MAX_LEN); 437 ASSERT3U(pa, !=, 0); 438 ASSERT3U(pa & (LEGACY_VQ_ALIGN - 1), ==, 0); 439 ASSERT3U(LEGACY_VQ_ALIGN, ==, PAGESIZE); 440 ASSERT(MUTEX_HELD(&ring->vr_lock)); 441 ASSERT3P(ring->vr_map_pages, ==, NULL); 442 443 const uint_t npages = LEGACY_VQ_PAGES(qsz); 444 ring->vr_map_pages = kmem_zalloc(npages * sizeof (void *), KM_SLEEP); 445 446 int page_flags = 0; 447 if (defer_dirty) { 448 /* 449 * During initialization, and when entering the paused state, 450 * the page holds for a virtqueue are established with the 451 * DEFER_DIRTY flag set. 452 * 453 * This prevents those page holds from immediately marking the 454 * underlying pages as dirty, since the viona emulation is not 455 * yet performing any accesses. Once the ring transitions to 456 * the VRS_RUN state, the held pages will be marked as dirty. 457 * 458 * Any ring mappings performed outside those state conditions, 459 * such as those part of vmm_lease renewal during steady-state 460 * operation, will map the ring pages normally (as considered 461 * immediately dirty). 462 */ 463 page_flags |= VMPF_DEFER_DIRTY; 464 } 465 466 vmm_page_t *prev = NULL; 467 for (uint_t i = 0; i < npages; i++, pa += PAGESIZE) { 468 vmm_page_t *vmp; 469 470 vmp = vmm_drv_page_hold_ext(ring->vr_lease, pa, 471 PROT_READ | PROT_WRITE, page_flags); 472 if (vmp == NULL) { 473 viona_ring_unmap(ring); 474 return (false); 475 } 476 477 /* 478 * Keep the first page has the head of the chain, appending all 479 * subsequent pages to the tail. 480 */ 481 if (prev == NULL) { 482 ring->vr_map_hold = vmp; 483 } else { 484 vmm_drv_page_chain(prev, vmp); 485 } 486 prev = vmp; 487 ring->vr_map_pages[i] = vmm_drv_page_writable(vmp); 488 } 489 490 return (true); 491 } 492 493 static void 494 viona_ring_mark_dirty(viona_vring_t *ring) 495 { 496 ASSERT(MUTEX_HELD(&ring->vr_lock)); 497 ASSERT(ring->vr_map_hold != NULL); 498 499 for (vmm_page_t *vp = ring->vr_map_hold; vp != NULL; 500 vp = vmm_drv_page_next(vp)) { 501 vmm_drv_page_mark_dirty(vp); 502 } 503 } 504 505 static void 506 viona_ring_unmap(viona_vring_t *ring) 507 { 508 ASSERT(MUTEX_HELD(&ring->vr_lock)); 509 510 void **map = ring->vr_map_pages; 511 if (map != NULL) { 512 const uint_t npages = LEGACY_VQ_PAGES(ring->vr_size); 513 kmem_free(map, npages * sizeof (void *)); 514 ring->vr_map_pages = NULL; 515 516 vmm_drv_page_release_chain(ring->vr_map_hold); 517 ring->vr_map_hold = NULL; 518 } else { 519 ASSERT3P(ring->vr_map_hold, ==, NULL); 520 } 521 } 522 523 static inline void * 524 viona_ring_addr(viona_vring_t *ring, uint_t off) 525 { 526 ASSERT3P(ring->vr_map_pages, !=, NULL); 527 ASSERT3U(LEGACY_VQ_SIZE(ring->vr_size), >, off); 528 529 const uint_t page_num = off / PAGESIZE; 530 const uint_t page_off = off % PAGESIZE; 531 return ((caddr_t)ring->vr_map_pages[page_num] + page_off); 532 } 533 534 void 535 viona_intr_ring(viona_vring_t *ring, boolean_t skip_flags_check) 536 { 537 if (!skip_flags_check) { 538 volatile uint16_t *avail_flags = viona_ring_addr(ring, 539 LEGACY_AVAIL_FLAGS_OFF(ring->vr_size)); 540 541 if ((*avail_flags & VRING_AVAIL_F_NO_INTERRUPT) != 0) { 542 return; 543 } 544 } 545 546 mutex_enter(&ring->vr_lock); 547 uint64_t addr = ring->vr_msi_addr; 548 uint64_t msg = ring->vr_msi_msg; 549 mutex_exit(&ring->vr_lock); 550 if (addr != 0) { 551 /* Deliver the interrupt directly, if so configured... */ 552 (void) vmm_drv_msi(ring->vr_lease, addr, msg); 553 } else { 554 /* ... otherwise, leave it to userspace */ 555 if (atomic_cas_uint(&ring->vr_intr_enabled, 0, 1) == 0) { 556 pollwakeup(&ring->vr_link->l_pollhead, POLLRDBAND); 557 } 558 } 559 } 560 561 static inline bool 562 vring_stop_req(const viona_vring_t *ring) 563 { 564 return ((ring->vr_state_flags & VRSF_REQ_STOP) != 0); 565 } 566 567 static inline bool 568 vring_pause_req(const viona_vring_t *ring) 569 { 570 return ((ring->vr_state_flags & VRSF_REQ_PAUSE) != 0); 571 } 572 573 static inline bool 574 vring_start_req(const viona_vring_t *ring) 575 { 576 return ((ring->vr_state_flags & VRSF_REQ_START) != 0); 577 } 578 579 /* 580 * Check if vring worker thread should bail out. This will heed indications 581 * that the containing process is exiting, as well as requests to stop or pause 582 * the ring. The `stop_only` parameter controls if pause requests are ignored 583 * (true) or checked (false). 584 * 585 * Caller should hold vr_lock. 586 */ 587 static bool 588 vring_need_bail_ext(const viona_vring_t *ring, bool stop_only) 589 { 590 ASSERT(MUTEX_HELD(&ring->vr_lock)); 591 592 if (vring_stop_req(ring) || 593 (!stop_only && vring_pause_req(ring))) { 594 return (true); 595 } 596 597 kthread_t *t = ring->vr_worker_thread; 598 if (t != NULL) { 599 proc_t *p = ttoproc(t); 600 601 ASSERT(p != NULL); 602 if ((p->p_flag & SEXITING) != 0) { 603 return (true); 604 } 605 } 606 return (false); 607 } 608 609 bool 610 vring_need_bail(const viona_vring_t *ring) 611 { 612 return (vring_need_bail_ext(ring, false)); 613 } 614 615 int 616 viona_ring_pause(viona_vring_t *ring) 617 { 618 mutex_enter(&ring->vr_lock); 619 switch (ring->vr_state) { 620 case VRS_RESET: 621 case VRS_SETUP: 622 case VRS_INIT: 623 /* 624 * For rings which have not yet started (even those in the 625 * VRS_SETUP and VRS_INIT phases, where there a running worker 626 * thread (waiting to be released to do its intended task), it 627 * is adequate to simply clear any start request, to keep them 628 * from proceeding into the actual work processing function. 629 */ 630 ring->vr_state_flags &= ~VRSF_REQ_START; 631 mutex_exit(&ring->vr_lock); 632 return (0); 633 634 case VRS_STOP: 635 if ((ring->vr_state_flags & VRSF_REQ_STOP) != 0) { 636 /* A ring on its way to RESET cannot be paused. */ 637 mutex_exit(&ring->vr_lock); 638 return (EBUSY); 639 } 640 /* FALLTHROUGH */ 641 case VRS_RUN: 642 ring->vr_state_flags |= VRSF_REQ_PAUSE; 643 cv_broadcast(&ring->vr_cv); 644 break; 645 646 default: 647 panic("invalid ring state %d", ring->vr_state); 648 break; 649 } 650 651 for (;;) { 652 int res = cv_wait_sig(&ring->vr_cv, &ring->vr_lock); 653 654 if (ring->vr_state == VRS_INIT || 655 (ring->vr_state_flags & VRSF_REQ_PAUSE) == 0) { 656 /* Ring made it to (or through) paused state */ 657 mutex_exit(&ring->vr_lock); 658 return (0); 659 } 660 if (res == 0) { 661 /* interrupted by signal */ 662 mutex_exit(&ring->vr_lock); 663 return (EINTR); 664 } 665 } 666 /* NOTREACHED */ 667 } 668 669 static void 670 viona_worker(void *arg) 671 { 672 viona_vring_t *ring = (viona_vring_t *)arg; 673 viona_link_t *link = ring->vr_link; 674 675 mutex_enter(&ring->vr_lock); 676 VERIFY3U(ring->vr_state, ==, VRS_SETUP); 677 678 /* Bail immediately if ring shutdown or process exit was requested */ 679 if (vring_need_bail_ext(ring, true)) { 680 goto ring_reset; 681 } 682 683 /* Report worker thread as alive and notify creator */ 684 ring_init: 685 ring->vr_state = VRS_INIT; 686 cv_broadcast(&ring->vr_cv); 687 688 while (!vring_start_req(ring)) { 689 /* 690 * Keeping lease renewals timely while waiting for the ring to 691 * be started is important for avoiding deadlocks. 692 */ 693 if (vmm_drv_lease_expired(ring->vr_lease)) { 694 if (!viona_ring_lease_renew(ring)) { 695 goto ring_reset; 696 } 697 } 698 699 (void) cv_wait_sig(&ring->vr_cv, &ring->vr_lock); 700 701 if (vring_pause_req(ring)) { 702 /* We are already paused in the INIT state. */ 703 ring->vr_state_flags &= ~VRSF_REQ_PAUSE; 704 } 705 if (vring_need_bail_ext(ring, true)) { 706 goto ring_reset; 707 } 708 } 709 710 ASSERT((ring->vr_state_flags & VRSF_REQ_START) != 0); 711 ring->vr_state = VRS_RUN; 712 ring->vr_state_flags &= ~VRSF_REQ_START; 713 viona_ring_mark_dirty(ring); 714 715 /* Ensure ring lease is valid first */ 716 if (vmm_drv_lease_expired(ring->vr_lease)) { 717 if (!viona_ring_lease_renew(ring)) { 718 goto ring_reset; 719 } 720 } 721 722 /* Process actual work */ 723 if (ring == &link->l_vrings[VIONA_VQ_RX]) { 724 viona_worker_rx(ring, link); 725 } else if (ring == &link->l_vrings[VIONA_VQ_TX]) { 726 viona_worker_tx(ring, link); 727 } else { 728 panic("unexpected ring: %p", (void *)ring); 729 } 730 731 VERIFY3U(ring->vr_state, ==, VRS_STOP); 732 VERIFY3U(ring->vr_xfer_outstanding, ==, 0); 733 734 /* Respond to a pause request if the ring is not required to stop */ 735 if (vring_pause_req(ring)) { 736 ring->vr_state_flags &= ~VRSF_REQ_PAUSE; 737 738 if (vring_need_bail_ext(ring, true)) { 739 goto ring_reset; 740 } 741 742 /* 743 * To complete pausing of the ring, unmap and re-map the pages 744 * underpinning the virtqueue. This is to synchronize their 745 * dirty state in the backing page tables and restore the 746 * defer-dirty state on the held pages. 747 */ 748 viona_ring_unmap(ring); 749 if (viona_ring_map(ring, true)) { 750 goto ring_init; 751 } 752 753 /* 754 * If the ring pages failed to be mapped, fallthrough to 755 * ring-reset like any other failure. 756 */ 757 } 758 759 ring_reset: 760 viona_ring_misc_free(ring); 761 762 viona_ring_lease_drop(ring); 763 ring->vr_cur_aidx = 0; 764 ring->vr_size = 0; 765 ring->vr_mask = 0; 766 ring->vr_pa = 0; 767 ring->vr_state = VRS_RESET; 768 ring->vr_state_flags = 0; 769 ring->vr_worker_thread = NULL; 770 cv_broadcast(&ring->vr_cv); 771 mutex_exit(&ring->vr_lock); 772 773 mutex_enter(&ttoproc(curthread)->p_lock); 774 lwp_exit(); 775 } 776 777 static kthread_t * 778 viona_create_worker(viona_vring_t *ring) 779 { 780 k_sigset_t hold_set; 781 proc_t *p = curproc; 782 kthread_t *t; 783 klwp_t *lwp; 784 785 ASSERT(MUTEX_HELD(&ring->vr_lock)); 786 ASSERT(ring->vr_state == VRS_RESET); 787 788 sigfillset(&hold_set); 789 lwp = lwp_create(viona_worker, (void *)ring, 0, p, TS_STOPPED, 790 minclsyspri - 1, &hold_set, curthread->t_cid, 0); 791 if (lwp == NULL) { 792 return (NULL); 793 } 794 795 t = lwptot(lwp); 796 mutex_enter(&p->p_lock); 797 t->t_proc_flag = (t->t_proc_flag & ~TP_HOLDLWP) | TP_KTHREAD; 798 lwp_create_done(t); 799 mutex_exit(&p->p_lock); 800 801 return (t); 802 } 803 804 void 805 vq_read_desc(viona_vring_t *ring, uint16_t idx, struct virtio_desc *descp) 806 { 807 const uint_t entry_off = idx * sizeof (struct virtio_desc); 808 809 ASSERT3U(idx, <, ring->vr_size); 810 811 bcopy(viona_ring_addr(ring, entry_off), descp, sizeof (*descp)); 812 } 813 814 static uint16_t 815 vq_read_avail(viona_vring_t *ring, uint16_t idx) 816 { 817 ASSERT3U(idx, <, ring->vr_size); 818 819 volatile uint16_t *avail_ent = 820 viona_ring_addr(ring, LEGACY_AVAIL_ENT_OFF(ring->vr_size, idx)); 821 return (*avail_ent); 822 } 823 824 /* 825 * Given a buffer descriptor `desc`, attempt to map the pages backing that 826 * region of guest physical memory, taking into account that there are no 827 * guarantees about guest-contiguous pages being host-contiguous. 828 */ 829 static int 830 vq_map_desc_bufs(viona_vring_t *ring, const struct virtio_desc *desc, 831 vq_held_region_t *region) 832 { 833 int err; 834 835 if (desc->vd_len == 0) { 836 VIONA_PROBE2(desc_bad_len, viona_vring_t *, ring, 837 uint32_t, desc->vd_len); 838 VIONA_RING_STAT_INCR(ring, desc_bad_len); 839 return (EINVAL); 840 } 841 842 err = vq_region_hold(ring, desc->vd_addr, desc->vd_len, 843 (desc->vd_flags & VRING_DESC_F_WRITE) != 0, region); 844 switch (err) { 845 case E2BIG: 846 VIONA_PROBE1(too_many_desc, viona_vring_t *, ring); 847 VIONA_RING_STAT_INCR(ring, too_many_desc); 848 break; 849 case EFAULT: 850 VIONA_PROBE_BAD_RING_ADDR(ring, desc->vd_addr); 851 VIONA_RING_STAT_INCR(ring, bad_ring_addr); 852 break; 853 default: 854 break; 855 } 856 857 return (err); 858 } 859 860 /* 861 * Walk an indirect buffer descriptor `desc`, attempting to map the pages 862 * backing the regions of guest memory covered by its constituent descriptors. 863 */ 864 static int 865 vq_map_indir_desc_bufs(viona_vring_t *ring, const struct virtio_desc *desc, 866 vq_held_region_t *region) 867 { 868 const uint16_t indir_count = desc->vd_len / sizeof (struct virtio_desc); 869 870 if ((desc->vd_len & 0xf) != 0 || indir_count == 0 || 871 indir_count > ring->vr_size || 872 desc->vd_addr > (desc->vd_addr + desc->vd_len)) { 873 VIONA_PROBE2(indir_bad_len, viona_vring_t *, ring, 874 uint32_t, desc->vd_len); 875 VIONA_RING_STAT_INCR(ring, indir_bad_len); 876 return (EINVAL); 877 } 878 879 uint16_t indir_next = 0; 880 const uint8_t *buf = NULL; 881 uint64_t buf_gpa = UINT64_MAX; 882 vmm_page_t *vmp = NULL; 883 int err = 0; 884 885 for (;;) { 886 uint64_t indir_gpa = 887 desc->vd_addr + (indir_next * sizeof (struct virtio_desc)); 888 uint64_t indir_page = indir_gpa & PAGEMASK; 889 struct virtio_desc vp; 890 891 /* 892 * Get a mapping for the page that the next indirect descriptor 893 * resides in, if has not already been done. 894 */ 895 if (indir_page != buf_gpa) { 896 if (vmp != NULL) { 897 vmm_drv_page_release(vmp); 898 } 899 vmp = vq_page_hold(ring, indir_page, false); 900 if (vmp == NULL) { 901 VIONA_PROBE_BAD_RING_ADDR(ring, indir_page); 902 VIONA_RING_STAT_INCR(ring, bad_ring_addr); 903 err = EFAULT; 904 break; 905 } 906 buf_gpa = indir_page; 907 buf = vmm_drv_page_readable(vmp); 908 } 909 910 /* 911 * A copy of the indirect descriptor is made here, rather than 912 * simply using a reference pointer. This prevents malicious or 913 * erroneous guest writes to the descriptor from fooling the 914 * flags/bounds verification through a race. 915 */ 916 bcopy(buf + (indir_gpa - indir_page), &vp, sizeof (vp)); 917 918 if (vp.vd_flags & VRING_DESC_F_INDIRECT) { 919 VIONA_PROBE1(indir_bad_nest, viona_vring_t *, ring); 920 VIONA_RING_STAT_INCR(ring, indir_bad_nest); 921 err = EINVAL; 922 break; 923 } else if (vp.vd_len == 0) { 924 VIONA_PROBE2(desc_bad_len, viona_vring_t *, ring, 925 uint32_t, vp.vd_len); 926 VIONA_RING_STAT_INCR(ring, desc_bad_len); 927 err = EINVAL; 928 break; 929 } 930 931 err = vq_map_desc_bufs(ring, &vp, region); 932 if (err != 0) { 933 break; 934 } 935 936 /* Successfully reach the end of the indir chain */ 937 if ((vp.vd_flags & VRING_DESC_F_NEXT) == 0) { 938 break; 939 } 940 if (region->vhr_idx >= region->vhr_niov) { 941 VIONA_PROBE1(too_many_desc, viona_vring_t *, ring); 942 VIONA_RING_STAT_INCR(ring, too_many_desc); 943 err = E2BIG; 944 break; 945 } 946 947 indir_next = vp.vd_next; 948 if (indir_next >= indir_count) { 949 VIONA_PROBE3(indir_bad_next, viona_vring_t *, ring, 950 uint16_t, indir_next, uint16_t, indir_count); 951 VIONA_RING_STAT_INCR(ring, indir_bad_next); 952 err = EINVAL; 953 break; 954 } 955 } 956 957 if (vmp != NULL) { 958 vmm_drv_page_release(vmp); 959 } 960 return (err); 961 } 962 963 int 964 vq_popchain(viona_vring_t *ring, struct iovec *iov, uint_t niov, 965 uint16_t *cookie, vmm_page_t **chain) 966 { 967 uint16_t ndesc, idx, head, next; 968 struct virtio_desc vdir; 969 vq_held_region_t region = { 970 .vhr_niov = niov, 971 .vhr_iov = iov, 972 }; 973 974 ASSERT(iov != NULL); 975 ASSERT(niov > 0 && niov < INT_MAX); 976 ASSERT(*chain == NULL); 977 978 mutex_enter(&ring->vr_a_mutex); 979 idx = ring->vr_cur_aidx; 980 ndesc = viona_ring_num_avail(ring); 981 982 if (ndesc == 0) { 983 mutex_exit(&ring->vr_a_mutex); 984 return (0); 985 } 986 if (ndesc > ring->vr_size) { 987 /* 988 * Despite the fact that the guest has provided an 'avail_idx' 989 * which indicates that an impossible number of descriptors are 990 * available, continue on and attempt to process the next one. 991 * 992 * The transgression will not escape the probe or stats though. 993 */ 994 VIONA_PROBE2(ndesc_too_high, viona_vring_t *, ring, 995 uint16_t, ndesc); 996 VIONA_RING_STAT_INCR(ring, ndesc_too_high); 997 } 998 999 head = vq_read_avail(ring, idx & ring->vr_mask); 1000 next = head; 1001 1002 for (region.vhr_idx = 0; region.vhr_idx < niov; next = vdir.vd_next) { 1003 if (next >= ring->vr_size) { 1004 VIONA_PROBE2(bad_idx, viona_vring_t *, ring, 1005 uint16_t, next); 1006 VIONA_RING_STAT_INCR(ring, bad_idx); 1007 break; 1008 } 1009 1010 vq_read_desc(ring, next, &vdir); 1011 if ((vdir.vd_flags & VRING_DESC_F_INDIRECT) == 0) { 1012 if (vq_map_desc_bufs(ring, &vdir, ®ion) != 0) { 1013 break; 1014 } 1015 } else { 1016 /* 1017 * Per the specification (Virtio 1.1 S2.6.5.3.1): 1018 * A driver MUST NOT set both VIRTQ_DESC_F_INDIRECT 1019 * and VIRTQ_DESC_F_NEXT in `flags`. 1020 */ 1021 if ((vdir.vd_flags & VRING_DESC_F_NEXT) != 0) { 1022 VIONA_PROBE3(indir_bad_next, 1023 viona_vring_t *, ring, 1024 uint16_t, next, uint16_t, 0); 1025 VIONA_RING_STAT_INCR(ring, indir_bad_next); 1026 break; 1027 } 1028 1029 if (vq_map_indir_desc_bufs(ring, &vdir, ®ion) != 0) { 1030 break; 1031 } 1032 } 1033 1034 if ((vdir.vd_flags & VRING_DESC_F_NEXT) == 0) { 1035 ring->vr_cur_aidx++; 1036 mutex_exit(&ring->vr_a_mutex); 1037 1038 *cookie = head; 1039 *chain = region.vhr_head; 1040 return (region.vhr_idx); 1041 } 1042 } 1043 1044 mutex_exit(&ring->vr_a_mutex); 1045 if (region.vhr_head != NULL) { 1046 /* 1047 * If any pages were held prior to encountering an error, we 1048 * must release them now. 1049 */ 1050 vmm_drv_page_release_chain(region.vhr_head); 1051 } 1052 return (-1); 1053 } 1054 1055 1056 static void 1057 vq_write_used_ent(viona_vring_t *ring, uint16_t idx, uint16_t cookie, 1058 uint32_t len) 1059 { 1060 /* 1061 * In a larger ring, entry could be split across pages, so be sure to 1062 * account for that when configuring the transfer by looking up the ID 1063 * and length addresses separately, rather than an address for a 1064 * combined `struct virtio_used`. 1065 */ 1066 const uint_t used_id_off = LEGACY_USED_ENT_OFF(ring->vr_size, idx); 1067 const uint_t used_len_off = used_id_off + sizeof (uint32_t); 1068 volatile uint32_t *idp = viona_ring_addr(ring, used_id_off); 1069 volatile uint32_t *lenp = viona_ring_addr(ring, used_len_off); 1070 1071 ASSERT(MUTEX_HELD(&ring->vr_u_mutex)); 1072 1073 *idp = cookie; 1074 *lenp = len; 1075 } 1076 1077 static void 1078 vq_write_used_idx(viona_vring_t *ring, uint16_t idx) 1079 { 1080 ASSERT(MUTEX_HELD(&ring->vr_u_mutex)); 1081 1082 volatile uint16_t *used_idx = 1083 viona_ring_addr(ring, LEGACY_USED_IDX_OFF(ring->vr_size)); 1084 *used_idx = idx; 1085 } 1086 1087 void 1088 vq_pushchain(viona_vring_t *ring, uint32_t len, uint16_t cookie) 1089 { 1090 uint16_t uidx; 1091 1092 mutex_enter(&ring->vr_u_mutex); 1093 1094 uidx = ring->vr_cur_uidx; 1095 vq_write_used_ent(ring, uidx & ring->vr_mask, cookie, len); 1096 uidx++; 1097 membar_producer(); 1098 1099 vq_write_used_idx(ring, uidx); 1100 ring->vr_cur_uidx = uidx; 1101 1102 mutex_exit(&ring->vr_u_mutex); 1103 } 1104 1105 void 1106 vq_pushchain_many(viona_vring_t *ring, uint_t num_bufs, used_elem_t *elem) 1107 { 1108 uint16_t uidx; 1109 1110 mutex_enter(&ring->vr_u_mutex); 1111 1112 uidx = ring->vr_cur_uidx; 1113 1114 for (uint_t i = 0; i < num_bufs; i++, uidx++) { 1115 vq_write_used_ent(ring, uidx & ring->vr_mask, elem[i].id, 1116 elem[i].len); 1117 } 1118 1119 membar_producer(); 1120 vq_write_used_idx(ring, uidx); 1121 ring->vr_cur_uidx = uidx; 1122 1123 mutex_exit(&ring->vr_u_mutex); 1124 } 1125 1126 /* 1127 * Set USED_NO_NOTIFY on VQ so guest elides doorbell calls for new entries. 1128 */ 1129 void 1130 viona_ring_disable_notify(viona_vring_t *ring) 1131 { 1132 volatile uint16_t *used_flags = 1133 viona_ring_addr(ring, LEGACY_USED_FLAGS_OFF(ring->vr_size)); 1134 1135 *used_flags |= VRING_USED_F_NO_NOTIFY; 1136 } 1137 1138 /* 1139 * Clear USED_NO_NOTIFY on VQ so guest resumes doorbell calls for new entries. 1140 */ 1141 void 1142 viona_ring_enable_notify(viona_vring_t *ring) 1143 { 1144 volatile uint16_t *used_flags = 1145 viona_ring_addr(ring, LEGACY_USED_FLAGS_OFF(ring->vr_size)); 1146 1147 *used_flags &= ~VRING_USED_F_NO_NOTIFY; 1148 } 1149 1150 /* 1151 * Return the number of available descriptors in the vring taking care of the 1152 * 16-bit index wraparound. 1153 * 1154 * Note: If the number of apparently available descriptors is larger than the 1155 * ring size (due to guest misbehavior), this check will still report the 1156 * positive count of descriptors. 1157 */ 1158 uint16_t 1159 viona_ring_num_avail(viona_vring_t *ring) 1160 { 1161 volatile uint16_t *avail_idx = 1162 viona_ring_addr(ring, LEGACY_AVAIL_IDX_OFF(ring->vr_size)); 1163 1164 return (*avail_idx - ring->vr_cur_aidx); 1165 } 1166