/*
 * Copyright (c) 2013  Chris Torek <torek @ torek net>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source.  A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 *
 * Copyright 2015 Pluribus Networks Inc.
 * Copyright 2019 Joyent, Inc.
 * Copyright 2022 Oxide Computer Company
 */


#include <sys/disp.h>

#include "viona_impl.h"

#define VRING_MAX_LEN   32768

/* Layout and sizing as defined in the spec for a legacy-style virtqueue */

#define LEGACY_VQ_ALIGN         PAGESIZE

#define LEGACY_DESC_SZ(qsz)     ((qsz) * sizeof (struct virtio_desc))
/*
 * Available ring consists of avail_idx (uint16_t), flags (uint16_t), qsz avail
 * descriptors (uint16_t each), and (optional) used_event (uint16_t).
 */
#define LEGACY_AVAIL_SZ(qsz)    (((qsz) + 3) * sizeof (uint16_t))
/*
 * Used ring consists of used_idx (uint16_t), flags (uint16_t), qsz used
 * descriptors (two uint32_t each), and (optional) avail_event (uint16_t).
 */
#define LEGACY_USED_SZ(qsz)     \
        ((qsz) * sizeof (struct virtio_used) + 3 * sizeof (uint16_t))

#define LEGACY_AVAIL_FLAGS_OFF(qsz)     LEGACY_DESC_SZ(qsz)
#define LEGACY_AVAIL_IDX_OFF(qsz)       \
        (LEGACY_DESC_SZ(qsz) + sizeof (uint16_t))
#define LEGACY_AVAIL_ENT_OFF(qsz, idx)  \
        (LEGACY_DESC_SZ(qsz) + (2 + (idx)) * sizeof (uint16_t))

#define LEGACY_USED_FLAGS_OFF(qsz)      \
        P2ROUNDUP(LEGACY_DESC_SZ(qsz) + LEGACY_AVAIL_SZ(qsz), LEGACY_VQ_ALIGN)
#define LEGACY_USED_IDX_OFF(qsz)        \
        (LEGACY_USED_FLAGS_OFF(qsz) + sizeof (uint16_t))
#define LEGACY_USED_ENT_OFF(qsz, idx)   \
        (LEGACY_USED_FLAGS_OFF(qsz) + 2 * sizeof (uint16_t) + \
        (idx) * sizeof (struct virtio_used))

#define LEGACY_VQ_SIZE(qsz)     \
        (LEGACY_USED_FLAGS_OFF(qsz) + \
        P2ROUNDUP(LEGACY_USED_SZ(qsz), LEGACY_VQ_ALIGN))
#define LEGACY_VQ_PAGES(qsz)    (LEGACY_VQ_SIZE(qsz) / PAGESIZE)
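
/*
 * As a worked example of the layout macros above (assuming a 4096-byte
 * PAGESIZE, a 16-byte struct virtio_desc, and an 8-byte struct virtio_used):
 * for a 256-entry ring, LEGACY_DESC_SZ is 256 * 16 = 4096 bytes and
 * LEGACY_AVAIL_SZ is (256 + 3) * 2 = 518 bytes, so the used ring begins at
 * P2ROUNDUP(4096 + 518, 4096) = 8192.  LEGACY_USED_SZ is 256 * 8 + 6 = 2054
 * bytes, which rounds up to a full page, giving LEGACY_VQ_SIZE = 12288 and
 * LEGACY_VQ_PAGES = 3.
 */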

struct vq_held_region {
        struct iovec    *vhr_iov;
        vmm_page_t      *vhr_head;
        vmm_page_t      *vhr_tail;
        /* Length of iovec array supplied in `vhr_iov` */
        uint_t          vhr_niov;
        /*
         * Index into vhr_iov, indicating the next "free" entry (following the
         * last entry which has valid contents).
         */
        uint_t          vhr_idx;
};
typedef struct vq_held_region vq_held_region_t;

static boolean_t viona_ring_map(viona_vring_t *);
static void viona_ring_unmap(viona_vring_t *);
static kthread_t *viona_create_worker(viona_vring_t *);

static vmm_page_t *
vq_page_hold(viona_vring_t *ring, uint64_t gpa, bool writable)
{
        ASSERT3P(ring->vr_lease, !=, NULL);

        int prot = PROT_READ;
        if (writable) {
                prot |= PROT_WRITE;
        }

        return (vmm_drv_page_hold(ring->vr_lease, gpa, prot));
}

/*
 * Establish a hold on the page(s) which back the region of guest memory
 * covered by [gpa, gpa + len).  The host-kernel-virtual pointers to those
 * pages are stored in the iovec array supplied in `region`, along with the
 * chain of vmm_page_t entries representing the held pages.  Since guest
 * memory carries no guarantees of being physically contiguous (on the host),
 * it is assumed that an iovec entry will be required for each PAGESIZE
 * section covered by the specified `gpa` and `len` range.  For each iovec
 * entry successfully populated by holding a page, `vhr_idx` will be
 * incremented so it references the next available iovec entry (or `vhr_niov`,
 * if the iovec array is full).  The responsibility for releasing the
 * `vmm_page_t` chain (stored in `vhr_head` and `vhr_tail`) resides with the
 * caller, regardless of the result.
 */
static int
vq_region_hold(viona_vring_t *ring, uint64_t gpa, uint32_t len,
    bool writable, vq_held_region_t *region)
{
        const uint32_t front_offset = gpa & PAGEOFFSET;
        const uint32_t front_len = MIN(len, PAGESIZE - front_offset);
        uint_t pages = 1;
        vmm_page_t *vmp;
        caddr_t buf;

        ASSERT3U(region->vhr_idx, <, region->vhr_niov);

        if (front_len < len) {
                pages += P2ROUNDUP((uint64_t)(len - front_len),
                    PAGESIZE) / PAGESIZE;
        }
        if (pages > (region->vhr_niov - region->vhr_idx)) {
                return (E2BIG);
        }

        vmp = vq_page_hold(ring, gpa & PAGEMASK, writable);
        if (vmp == NULL) {
                return (EFAULT);
        }
        buf = (caddr_t)vmm_drv_page_readable(vmp);

        region->vhr_iov[region->vhr_idx].iov_base = buf + front_offset;
        region->vhr_iov[region->vhr_idx].iov_len = front_len;
        region->vhr_idx++;
        gpa += front_len;
        len -= front_len;
        if (region->vhr_head == NULL) {
                region->vhr_head = vmp;
                region->vhr_tail = vmp;
        } else {
                vmm_drv_page_chain(region->vhr_tail, vmp);
                region->vhr_tail = vmp;
        }

        for (uint_t i = 1; i < pages; i++) {
                ASSERT3U(gpa & PAGEOFFSET, ==, 0);

                vmp = vq_page_hold(ring, gpa, writable);
                if (vmp == NULL) {
                        return (EFAULT);
                }
                buf = (caddr_t)vmm_drv_page_readable(vmp);

                const uint32_t chunk_len = MIN(len, PAGESIZE);
                region->vhr_iov[region->vhr_idx].iov_base = buf;
                region->vhr_iov[region->vhr_idx].iov_len = chunk_len;
                region->vhr_idx++;
                gpa += chunk_len;
                len -= chunk_len;
                vmm_drv_page_chain(region->vhr_tail, vmp);
                region->vhr_tail = vmp;
        }

        return (0);
}
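
/*
 * Illustrative example of the splitting performed above (assuming a 4096-byte
 * PAGESIZE): holding a region with gpa = 0x10ff0 and len = 0x1020 yields
 * front_offset = 0xff0 and front_len = 0x10, leaving 0x1010 bytes which round
 * up to two more pages.  Three iovec entries are populated (lengths 0x10,
 * 0x1000, and 0x10), and three held pages are chained onto
 * vhr_head/vhr_tail for the caller to release.
 */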

static boolean_t
viona_ring_lease_expire_cb(void *arg)
{
        viona_vring_t *ring = arg;

        mutex_enter(&ring->vr_lock);
        cv_broadcast(&ring->vr_cv);
        mutex_exit(&ring->vr_lock);

        /* The lease will be broken asynchronously. */
        return (B_FALSE);
}

static void
viona_ring_lease_drop(viona_vring_t *ring)
{
        ASSERT(MUTEX_HELD(&ring->vr_lock));

        if (ring->vr_lease != NULL) {
                vmm_hold_t *hold = ring->vr_link->l_vm_hold;

                ASSERT(hold != NULL);

                /*
                 * Without an active lease, the ring mappings cannot be
                 * considered valid.
                 */
                viona_ring_unmap(ring);

                vmm_drv_lease_break(hold, ring->vr_lease);
                ring->vr_lease = NULL;
        }
}

boolean_t
viona_ring_lease_renew(viona_vring_t *ring)
{
        vmm_hold_t *hold = ring->vr_link->l_vm_hold;

        ASSERT(hold != NULL);
        ASSERT(MUTEX_HELD(&ring->vr_lock));

        viona_ring_lease_drop(ring);

        /*
         * Lease renewal will fail if the VM has requested that all holds be
         * cleaned up.
         */
        ring->vr_lease = vmm_drv_lease_sign(hold, viona_ring_lease_expire_cb,
            ring);
        if (ring->vr_lease != NULL) {
                /* A ring undergoing renewal will need valid guest mappings */
                if (ring->vr_pa != 0 && ring->vr_size != 0) {
                        /*
                         * If new mappings cannot be established, consider the
                         * lease renewal a failure.
                         */
                        if (!viona_ring_map(ring)) {
                                viona_ring_lease_drop(ring);
                                return (B_FALSE);
                        }
                }
        }
        return (ring->vr_lease != NULL);
}

void
viona_ring_alloc(viona_link_t *link, viona_vring_t *ring)
{
        ring->vr_link = link;
        mutex_init(&ring->vr_lock, NULL, MUTEX_DRIVER, NULL);
        cv_init(&ring->vr_cv, NULL, CV_DRIVER, NULL);
        mutex_init(&ring->vr_a_mutex, NULL, MUTEX_DRIVER, NULL);
        mutex_init(&ring->vr_u_mutex, NULL, MUTEX_DRIVER, NULL);
}

static void
viona_ring_misc_free(viona_vring_t *ring)
{
        const uint_t qsz = ring->vr_size;

        viona_tx_ring_free(ring, qsz);
}

void
viona_ring_free(viona_vring_t *ring)
{
        mutex_destroy(&ring->vr_lock);
        cv_destroy(&ring->vr_cv);
        mutex_destroy(&ring->vr_a_mutex);
        mutex_destroy(&ring->vr_u_mutex);
        ring->vr_link = NULL;
}

int
viona_ring_init(viona_link_t *link, uint16_t idx,
    const struct viona_ring_params *params)
{
        viona_vring_t *ring;
        kthread_t *t;
        int err = 0;
        const uint16_t qsz = params->vrp_size;
        const uint64_t pa = params->vrp_pa;

        if (idx >= VIONA_VQ_MAX) {
                return (EINVAL);
        }

        if (qsz == 0 || qsz > VRING_MAX_LEN || (1 << (ffs(qsz) - 1)) != qsz) {
                return (EINVAL);
        }
        if ((pa & (LEGACY_VQ_ALIGN - 1)) != 0) {
                return (EINVAL);
        }

        ring = &link->l_vrings[idx];
        mutex_enter(&ring->vr_lock);
        if (ring->vr_state != VRS_RESET) {
                mutex_exit(&ring->vr_lock);
                return (EBUSY);
        }
        VERIFY(ring->vr_state_flags == 0);

        ring->vr_lease = NULL;
        if (!viona_ring_lease_renew(ring)) {
                err = EBUSY;
                goto fail;
        }

        ring->vr_size = qsz;
        ring->vr_mask = (ring->vr_size - 1);
        ring->vr_pa = pa;
        if (!viona_ring_map(ring)) {
                err = EINVAL;
                goto fail;
        }

        /* Initialize queue indexes */
        ring->vr_cur_aidx = params->vrp_avail_idx;
        ring->vr_cur_uidx = params->vrp_used_idx;

        if (idx == VIONA_VQ_TX) {
                viona_tx_ring_alloc(ring, qsz);
        }

        /* Zero out MSI-X configuration */
        ring->vr_msi_addr = 0;
        ring->vr_msi_msg = 0;

        /* Clear the stats */
        bzero(&ring->vr_stats, sizeof (ring->vr_stats));

        t = viona_create_worker(ring);
        if (t == NULL) {
                err = ENOMEM;
                goto fail;
        }
        ring->vr_worker_thread = t;
        ring->vr_state = VRS_SETUP;
        cv_broadcast(&ring->vr_cv);
        mutex_exit(&ring->vr_lock);
        return (0);

fail:
        viona_ring_lease_drop(ring);
        viona_ring_misc_free(ring);
        ring->vr_size = 0;
        ring->vr_mask = 0;
        ring->vr_pa = 0;
        ring->vr_cur_aidx = 0;
        ring->vr_cur_uidx = 0;
        mutex_exit(&ring->vr_lock);
        return (err);
}
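
/*
 * The size check above accepts only power-of-two ring sizes: for example,
 * qsz = 256 has ffs(256) = 9 and (1 << 8) == 256, so it passes, while
 * qsz = 384 has ffs(384) = 8 and (1 << 7) == 128 != 384, so it is rejected.
 * Likewise a physical address such as 0x10010 fails the LEGACY_VQ_ALIGN
 * (page alignment, assuming 4096-byte pages) check.  Illustrative values only.
 */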

int
viona_ring_get_state(viona_link_t *link, uint16_t idx,
    struct viona_ring_params *params)
{
        viona_vring_t *ring;

        if (idx >= VIONA_VQ_MAX) {
                return (EINVAL);
        }

        ring = &link->l_vrings[idx];
        mutex_enter(&ring->vr_lock);

        params->vrp_size = ring->vr_size;
        params->vrp_pa = ring->vr_pa;

        if (ring->vr_state == VRS_RUN) {
                /* On a running ring, we must heed the avail/used locks */
                mutex_enter(&ring->vr_a_mutex);
                params->vrp_avail_idx = ring->vr_cur_aidx;
                mutex_exit(&ring->vr_a_mutex);
                mutex_enter(&ring->vr_u_mutex);
                params->vrp_used_idx = ring->vr_cur_uidx;
                mutex_exit(&ring->vr_u_mutex);
        } else {
                /* Otherwise vr_lock is adequate protection */
                params->vrp_avail_idx = ring->vr_cur_aidx;
                params->vrp_used_idx = ring->vr_cur_uidx;
        }

        mutex_exit(&ring->vr_lock);

        return (0);
}

int
viona_ring_reset(viona_vring_t *ring, boolean_t heed_signals)
{
        mutex_enter(&ring->vr_lock);
        if (ring->vr_state == VRS_RESET) {
                mutex_exit(&ring->vr_lock);
                return (0);
        }

        if ((ring->vr_state_flags & VRSF_REQ_STOP) == 0) {
                ring->vr_state_flags |= VRSF_REQ_STOP;
                cv_broadcast(&ring->vr_cv);
        }
        while (ring->vr_state != VRS_RESET) {
                if (!heed_signals) {
                        cv_wait(&ring->vr_cv, &ring->vr_lock);
                } else {
                        int rs;

                        rs = cv_wait_sig(&ring->vr_cv, &ring->vr_lock);
                        if (rs <= 0 && ring->vr_state != VRS_RESET) {
                                mutex_exit(&ring->vr_lock);
                                return (EINTR);
                        }
                }
        }
        mutex_exit(&ring->vr_lock);
        return (0);
}

static boolean_t
viona_ring_map(viona_vring_t *ring)
{
        const uint16_t qsz = ring->vr_size;
        uintptr_t pa = ring->vr_pa;

        ASSERT3U(qsz, !=, 0);
        ASSERT3U(qsz, <=, VRING_MAX_LEN);
        ASSERT3U(pa, !=, 0);
        ASSERT3U(pa & (LEGACY_VQ_ALIGN - 1), ==, 0);
        ASSERT3U(LEGACY_VQ_ALIGN, ==, PAGESIZE);
        ASSERT(MUTEX_HELD(&ring->vr_lock));
        ASSERT3P(ring->vr_map_pages, ==, NULL);

        const uint_t npages = LEGACY_VQ_PAGES(qsz);
        ring->vr_map_pages = kmem_zalloc(npages * sizeof (void *), KM_SLEEP);

        vmm_page_t *prev = NULL;

        for (uint_t i = 0; i < npages; i++, pa += PAGESIZE) {
                vmm_page_t *vmp;

                vmp = vq_page_hold(ring, pa, true);
                if (vmp == NULL) {
                        viona_ring_unmap(ring);
                        return (B_FALSE);
                }

                /*
                 * Keep the first page as the head of the chain, appending all
                 * subsequent pages to the tail.
                 */
                if (prev == NULL) {
                        ring->vr_map_hold = vmp;
                } else {
                        vmm_drv_page_chain(prev, vmp);
                }
                prev = vmp;
                ring->vr_map_pages[i] = vmm_drv_page_writable(vmp);
        }

        return (B_TRUE);
}

static void
viona_ring_unmap(viona_vring_t *ring)
{
        ASSERT(MUTEX_HELD(&ring->vr_lock));

        void **map = ring->vr_map_pages;
        if (map != NULL) {
                const uint_t npages = LEGACY_VQ_PAGES(ring->vr_size);
                kmem_free(map, npages * sizeof (void *));
                ring->vr_map_pages = NULL;

                vmm_drv_page_release_chain(ring->vr_map_hold);
                ring->vr_map_hold = NULL;
        } else {
                ASSERT3P(ring->vr_map_hold, ==, NULL);
        }
}

static inline void *
viona_ring_addr(viona_vring_t *ring, uint_t off)
{
        ASSERT3P(ring->vr_map_pages, !=, NULL);
        ASSERT3U(LEGACY_VQ_SIZE(ring->vr_size), >, off);

        const uint_t page_num = off / PAGESIZE;
        const uint_t page_off = off % PAGESIZE;
        return ((caddr_t)ring->vr_map_pages[page_num] + page_off);
}
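
/*
 * For example (assuming a 4096-byte PAGESIZE), an offset of 0x1404 into the
 * virtqueue resolves to page_num = 1 and page_off = 0x404, i.e. 0x404 bytes
 * into the second held page.  For the 256-entry ring sized in the example
 * near the top of this file, the used-ring flags at offset 8192 would be
 * found at the start of vr_map_pages[2].
 */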

void
viona_intr_ring(viona_vring_t *ring, boolean_t skip_flags_check)
{
        if (!skip_flags_check) {
                volatile uint16_t *avail_flags = viona_ring_addr(ring,
                    LEGACY_AVAIL_FLAGS_OFF(ring->vr_size));

                if ((*avail_flags & VRING_AVAIL_F_NO_INTERRUPT) != 0) {
                        return;
                }
        }

        mutex_enter(&ring->vr_lock);
        uint64_t addr = ring->vr_msi_addr;
        uint64_t msg = ring->vr_msi_msg;
        mutex_exit(&ring->vr_lock);
        if (addr != 0) {
                /* Deliver the interrupt directly, if so configured... */
                (void) vmm_drv_msi(ring->vr_lease, addr, msg);
        } else {
                /* ... otherwise, leave it to userspace */
                if (atomic_cas_uint(&ring->vr_intr_enabled, 0, 1) == 0) {
                        pollwakeup(&ring->vr_link->l_pollhead, POLLRDBAND);
                }
        }
}

static inline bool
vring_stop_req(const viona_vring_t *ring)
{
        return ((ring->vr_state_flags & VRSF_REQ_STOP) != 0);
}

static inline bool
vring_pause_req(const viona_vring_t *ring)
{
        return ((ring->vr_state_flags & VRSF_REQ_PAUSE) != 0);
}

static inline bool
vring_start_req(const viona_vring_t *ring)
{
        return ((ring->vr_state_flags & VRSF_REQ_START) != 0);
}

/*
 * Check if vring worker thread should bail out.  This will heed indications
 * that the containing process is exiting, as well as requests to stop or
 * pause the ring.  The `stop_only` parameter controls if pause requests are
 * ignored (true) or checked (false).
 *
 * Caller should hold vr_lock.
 */
static bool
vring_need_bail_ext(const viona_vring_t *ring, bool stop_only)
{
        ASSERT(MUTEX_HELD(&ring->vr_lock));

        if (vring_stop_req(ring) ||
            (!stop_only && vring_pause_req(ring))) {
                return (true);
        }

        kthread_t *t = ring->vr_worker_thread;
        if (t != NULL) {
                proc_t *p = ttoproc(t);

                ASSERT(p != NULL);
                if ((p->p_flag & SEXITING) != 0) {
                        return (true);
                }
        }
        return (false);
}

bool
vring_need_bail(const viona_vring_t *ring)
{
        return (vring_need_bail_ext(ring, false));
}

int
viona_ring_pause(viona_vring_t *ring)
{
        mutex_enter(&ring->vr_lock);
        switch (ring->vr_state) {
        case VRS_RESET:
        case VRS_SETUP:
        case VRS_INIT:
                /*
                 * For rings which have not yet started (even those in the
                 * VRS_SETUP and VRS_INIT phases, where there is a running
                 * worker thread waiting to be released to do its intended
                 * task), it is adequate to simply clear any start request to
                 * keep them from proceeding into the actual work-processing
                 * function.
                 */
                ring->vr_state_flags &= ~VRSF_REQ_START;
                mutex_exit(&ring->vr_lock);
                return (0);

        case VRS_STOP:
                if ((ring->vr_state_flags & VRSF_REQ_STOP) != 0) {
                        /* A ring on its way to RESET cannot be paused. */
                        mutex_exit(&ring->vr_lock);
                        return (EBUSY);
                }
                /* FALLTHROUGH */
        case VRS_RUN:
                ring->vr_state_flags |= VRSF_REQ_PAUSE;
                cv_broadcast(&ring->vr_cv);
                break;

        default:
                panic("invalid ring state %d", ring->vr_state);
                break;
        }

        for (;;) {
                int res = cv_wait_sig(&ring->vr_cv, &ring->vr_lock);

                if (ring->vr_state == VRS_INIT ||
                    (ring->vr_state_flags & VRSF_REQ_PAUSE) == 0) {
                        /* Ring made it to (or through) paused state */
                        mutex_exit(&ring->vr_lock);
                        return (0);
                }
                if (res == 0) {
                        /* interrupted by signal */
                        mutex_exit(&ring->vr_lock);
                        return (EINTR);
                }
        }
        /* NOTREACHED */
}

static void
viona_worker(void *arg)
{
        viona_vring_t *ring = (viona_vring_t *)arg;
        viona_link_t *link = ring->vr_link;

        mutex_enter(&ring->vr_lock);
        VERIFY3U(ring->vr_state, ==, VRS_SETUP);

        /* Bail immediately if ring shutdown or process exit was requested */
        if (vring_need_bail_ext(ring, true)) {
                goto ring_reset;
        }

        /* Report worker thread as alive and notify creator */
ring_init:
        ring->vr_state = VRS_INIT;
        cv_broadcast(&ring->vr_cv);

        while (!vring_start_req(ring)) {
                /*
                 * Keeping lease renewals timely while waiting for the ring to
                 * be started is important for avoiding deadlocks.
                 */
                if (vmm_drv_lease_expired(ring->vr_lease)) {
                        if (!viona_ring_lease_renew(ring)) {
                                goto ring_reset;
                        }
                }

                (void) cv_wait_sig(&ring->vr_cv, &ring->vr_lock);

                if (vring_pause_req(ring)) {
                        /* We are already paused in the INIT state. */
                        ring->vr_state_flags &= ~VRSF_REQ_PAUSE;
                }
                if (vring_need_bail_ext(ring, true)) {
                        goto ring_reset;
                }
        }

        ASSERT((ring->vr_state_flags & VRSF_REQ_START) != 0);
        ring->vr_state = VRS_RUN;
        ring->vr_state_flags &= ~VRSF_REQ_START;

        /* Ensure ring lease is valid first */
        if (vmm_drv_lease_expired(ring->vr_lease)) {
                if (!viona_ring_lease_renew(ring)) {
                        goto ring_reset;
                }
        }

        /* Process actual work */
        if (ring == &link->l_vrings[VIONA_VQ_RX]) {
                viona_worker_rx(ring, link);
        } else if (ring == &link->l_vrings[VIONA_VQ_TX]) {
                viona_worker_tx(ring, link);
        } else {
                panic("unexpected ring: %p", (void *)ring);
        }

        VERIFY3U(ring->vr_state, ==, VRS_STOP);
        VERIFY3U(ring->vr_xfer_outstanding, ==, 0);

        /* Respond to a pause request if the ring is not required to stop */
        if (vring_pause_req(ring)) {
                ring->vr_state_flags &= ~VRSF_REQ_PAUSE;

                if (!vring_need_bail_ext(ring, true)) {
                        goto ring_init;
                }
        }

ring_reset:
        viona_ring_misc_free(ring);

        viona_ring_lease_drop(ring);
        ring->vr_cur_aidx = 0;
        ring->vr_size = 0;
        ring->vr_mask = 0;
        ring->vr_pa = 0;
        ring->vr_state = VRS_RESET;
        ring->vr_state_flags = 0;
        ring->vr_worker_thread = NULL;
        cv_broadcast(&ring->vr_cv);
        mutex_exit(&ring->vr_lock);

        mutex_enter(&ttoproc(curthread)->p_lock);
        lwp_exit();
}
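
/*
 * Summarizing the worker lifecycle implemented above: viona_ring_init()
 * creates the worker and moves the ring from VRS_RESET to VRS_SETUP; the
 * worker then reports VRS_INIT and waits for a start request before entering
 * VRS_RUN, where viona_worker_rx()/viona_worker_tx() perform the actual work.
 * Those functions return with the ring in VRS_STOP, after which the worker
 * either loops back to VRS_INIT (in response to a pause request) or tears
 * everything down and returns the ring to VRS_RESET.
 */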

static kthread_t *
viona_create_worker(viona_vring_t *ring)
{
        k_sigset_t hold_set;
        proc_t *p = curproc;
        kthread_t *t;
        klwp_t *lwp;

        ASSERT(MUTEX_HELD(&ring->vr_lock));
        ASSERT(ring->vr_state == VRS_RESET);

        sigfillset(&hold_set);
        lwp = lwp_create(viona_worker, (void *)ring, 0, p, TS_STOPPED,
            minclsyspri - 1, &hold_set, curthread->t_cid, 0);
        if (lwp == NULL) {
                return (NULL);
        }

        t = lwptot(lwp);
        mutex_enter(&p->p_lock);
        t->t_proc_flag = (t->t_proc_flag & ~TP_HOLDLWP) | TP_KTHREAD;
        lwp_create_done(t);
        mutex_exit(&p->p_lock);

        return (t);
}

void
vq_read_desc(viona_vring_t *ring, uint16_t idx, struct virtio_desc *descp)
{
        const uint_t entry_off = idx * sizeof (struct virtio_desc);

        ASSERT3U(idx, <, ring->vr_size);

        bcopy(viona_ring_addr(ring, entry_off), descp, sizeof (*descp));
}

static uint16_t
vq_read_avail(viona_vring_t *ring, uint16_t idx)
{
        ASSERT3U(idx, <, ring->vr_size);

        volatile uint16_t *avail_ent =
            viona_ring_addr(ring, LEGACY_AVAIL_ENT_OFF(ring->vr_size, idx));
        return (*avail_ent);
}

/*
 * Given a buffer descriptor `desc`, attempt to map the pages backing that
 * region of guest physical memory, taking into account that there are no
 * guarantees about guest-contiguous pages being host-contiguous.
 */
static int
vq_map_desc_bufs(viona_vring_t *ring, const struct virtio_desc *desc,
    vq_held_region_t *region)
{
        int err;

        if (desc->vd_len == 0) {
                VIONA_PROBE2(desc_bad_len, viona_vring_t *, ring,
                    uint32_t, desc->vd_len);
                VIONA_RING_STAT_INCR(ring, desc_bad_len);
                return (EINVAL);
        }

        err = vq_region_hold(ring, desc->vd_addr, desc->vd_len,
            (desc->vd_flags & VRING_DESC_F_WRITE) != 0, region);
        switch (err) {
        case E2BIG:
                VIONA_PROBE1(too_many_desc, viona_vring_t *, ring);
                VIONA_RING_STAT_INCR(ring, too_many_desc);
                break;
        case EFAULT:
                VIONA_PROBE_BAD_RING_ADDR(ring, desc->vd_addr);
                VIONA_RING_STAT_INCR(ring, bad_ring_addr);
                break;
        default:
                break;
        }

        return (err);
}
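
/*
 * An indirect descriptor references a guest-memory table that is itself an
 * array of struct virtio_desc entries, so its vd_len must be a non-zero
 * multiple of that structure's size.  For example (assuming the usual 16-byte
 * descriptor), vd_len = 64 describes a 4-entry indirect table, while
 * vd_len = 72 trips the (vd_len & 0xf) check below and is rejected.
 */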

/*
 * Walk an indirect buffer descriptor `desc`, attempting to map the pages
 * backing the regions of guest memory covered by its constituent descriptors.
 */
static int
vq_map_indir_desc_bufs(viona_vring_t *ring, const struct virtio_desc *desc,
    vq_held_region_t *region)
{
        const uint16_t indir_count = desc->vd_len / sizeof (struct virtio_desc);

        if ((desc->vd_len & 0xf) != 0 || indir_count == 0 ||
            indir_count > ring->vr_size ||
            desc->vd_addr > (desc->vd_addr + desc->vd_len)) {
                VIONA_PROBE2(indir_bad_len, viona_vring_t *, ring,
                    uint32_t, desc->vd_len);
                VIONA_RING_STAT_INCR(ring, indir_bad_len);
                return (EINVAL);
        }

        uint16_t indir_next = 0;
        const uint8_t *buf = NULL;
        uint64_t buf_gpa = UINT64_MAX;
        vmm_page_t *vmp = NULL;
        int err = 0;

        for (;;) {
                uint64_t indir_gpa =
                    desc->vd_addr + (indir_next * sizeof (struct virtio_desc));
                uint64_t indir_page = indir_gpa & PAGEMASK;
                struct virtio_desc vp;

                /*
                 * Get a mapping for the page that the next indirect descriptor
                 * resides in, if it has not already been done.
                 */
                if (indir_page != buf_gpa) {
                        if (vmp != NULL) {
                                vmm_drv_page_release(vmp);
                        }
                        vmp = vq_page_hold(ring, indir_page, false);
                        if (vmp == NULL) {
                                VIONA_PROBE_BAD_RING_ADDR(ring, indir_page);
                                VIONA_RING_STAT_INCR(ring, bad_ring_addr);
                                err = EFAULT;
                                break;
                        }
                        buf_gpa = indir_page;
                        buf = vmm_drv_page_readable(vmp);
                }

                /*
                 * A copy of the indirect descriptor is made here, rather than
                 * simply using a reference pointer.  This prevents malicious
                 * or erroneous guest writes to the descriptor from fooling the
                 * flags/bounds verification through a race.
                 */
                bcopy(buf + (indir_gpa - indir_page), &vp, sizeof (vp));

                if (vp.vd_flags & VRING_DESC_F_INDIRECT) {
                        VIONA_PROBE1(indir_bad_nest, viona_vring_t *, ring);
                        VIONA_RING_STAT_INCR(ring, indir_bad_nest);
                        err = EINVAL;
                        break;
                } else if (vp.vd_len == 0) {
                        VIONA_PROBE2(desc_bad_len, viona_vring_t *, ring,
                            uint32_t, vp.vd_len);
                        VIONA_RING_STAT_INCR(ring, desc_bad_len);
                        err = EINVAL;
                        break;
                }

                err = vq_map_desc_bufs(ring, &vp, region);
                if (err != 0) {
                        break;
                }

                /* Successfully reached the end of the indirect chain */
                if ((vp.vd_flags & VRING_DESC_F_NEXT) == 0) {
                        break;
                }
                if (region->vhr_idx >= region->vhr_niov) {
                        VIONA_PROBE1(too_many_desc, viona_vring_t *, ring);
                        VIONA_RING_STAT_INCR(ring, too_many_desc);
                        err = E2BIG;
                        break;
                }

                indir_next = vp.vd_next;
                if (indir_next >= indir_count) {
                        VIONA_PROBE3(indir_bad_next, viona_vring_t *, ring,
                            uint16_t, indir_next, uint16_t, indir_count);
                        VIONA_RING_STAT_INCR(ring, indir_bad_next);
                        err = EINVAL;
                        break;
                }
        }

        if (vmp != NULL) {
                vmm_drv_page_release(vmp);
        }
        return (err);
}
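
/*
 * A rough sketch of how the pop/push pair below might be consumed (the actual
 * rx/tx workers live elsewhere in the driver and also handle errors,
 * notification suppression, and statistics; array size and total_len here are
 * hypothetical):
 *
 *      struct iovec iov[8];            // hypothetical iovec count
 *      vmm_page_t *chain = NULL;
 *      uint16_t cookie;
 *      int n = vq_popchain(ring, iov, 8, &cookie, &chain);
 *      if (n > 0) {
 *              // ... transfer packet data through iov[0..n-1] ...
 *              vmm_drv_page_release_chain(chain);
 *              vq_pushchain(ring, total_len, cookie);
 *      }
 */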

int
vq_popchain(viona_vring_t *ring, struct iovec *iov, uint_t niov,
    uint16_t *cookie, vmm_page_t **chain)
{
        uint16_t ndesc, idx, head, next;
        struct virtio_desc vdir;
        vq_held_region_t region = {
                .vhr_niov = niov,
                .vhr_iov = iov,
        };

        ASSERT(iov != NULL);
        ASSERT(niov > 0 && niov < INT_MAX);
        ASSERT(*chain == NULL);

        mutex_enter(&ring->vr_a_mutex);
        idx = ring->vr_cur_aidx;
        ndesc = viona_ring_num_avail(ring);

        if (ndesc == 0) {
                mutex_exit(&ring->vr_a_mutex);
                return (0);
        }
        if (ndesc > ring->vr_size) {
                /*
                 * Despite the fact that the guest has provided an 'avail_idx'
                 * which indicates that an impossible number of descriptors are
                 * available, continue on and attempt to process the next one.
                 *
                 * The transgression will not escape the probe or stats though.
                 */
                VIONA_PROBE2(ndesc_too_high, viona_vring_t *, ring,
                    uint16_t, ndesc);
                VIONA_RING_STAT_INCR(ring, ndesc_too_high);
        }

        head = vq_read_avail(ring, idx & ring->vr_mask);
        next = head;

        for (region.vhr_idx = 0; region.vhr_idx < niov; next = vdir.vd_next) {
                if (next >= ring->vr_size) {
                        VIONA_PROBE2(bad_idx, viona_vring_t *, ring,
                            uint16_t, next);
                        VIONA_RING_STAT_INCR(ring, bad_idx);
                        break;
                }

                vq_read_desc(ring, next, &vdir);
                if ((vdir.vd_flags & VRING_DESC_F_INDIRECT) == 0) {
                        if (vq_map_desc_bufs(ring, &vdir, &region) != 0) {
                                break;
                        }
                } else {
                        /*
                         * Per the specification (Virtio 1.1 S2.6.5.3.1):
                         *   A driver MUST NOT set both VIRTQ_DESC_F_INDIRECT
                         *   and VIRTQ_DESC_F_NEXT in `flags`.
                         */
                        if ((vdir.vd_flags & VRING_DESC_F_NEXT) != 0) {
                                VIONA_PROBE3(indir_bad_next,
                                    viona_vring_t *, ring,
                                    uint16_t, next, uint16_t, 0);
                                VIONA_RING_STAT_INCR(ring, indir_bad_next);
                                break;
                        }

                        if (vq_map_indir_desc_bufs(ring, &vdir, &region) != 0) {
                                break;
                        }
                }

                if ((vdir.vd_flags & VRING_DESC_F_NEXT) == 0) {
                        ring->vr_cur_aidx++;
                        mutex_exit(&ring->vr_a_mutex);

                        *cookie = head;
                        *chain = region.vhr_head;
                        return (region.vhr_idx);
                }
        }

        mutex_exit(&ring->vr_a_mutex);
        if (region.vhr_head != NULL) {
                /*
                 * If any pages were held prior to encountering an error, we
                 * must release them now.
                 */
                vmm_drv_page_release_chain(region.vhr_head);
        }
        return (-1);
}


static void
vq_write_used_ent(viona_vring_t *ring, uint16_t idx, uint16_t cookie,
    uint32_t len)
{
        /*
         * In a larger ring, an entry could be split across pages, so be sure
         * to account for that when configuring the transfer by looking up the
         * ID and length addresses separately, rather than an address for a
         * combined `struct virtio_used`.
         */
        const uint_t used_id_off = LEGACY_USED_ENT_OFF(ring->vr_size, idx);
        const uint_t used_len_off = used_id_off + sizeof (uint32_t);
        volatile uint32_t *idp = viona_ring_addr(ring, used_id_off);
        volatile uint32_t *lenp = viona_ring_addr(ring, used_len_off);

        ASSERT(MUTEX_HELD(&ring->vr_u_mutex));

        *idp = cookie;
        *lenp = len;
}

static void
vq_write_used_idx(viona_vring_t *ring, uint16_t idx)
{
        ASSERT(MUTEX_HELD(&ring->vr_u_mutex));

        volatile uint16_t *used_idx =
            viona_ring_addr(ring, LEGACY_USED_IDX_OFF(ring->vr_size));
        *used_idx = idx;
}

void
vq_pushchain(viona_vring_t *ring, uint32_t len, uint16_t cookie)
{
        uint16_t uidx;

        mutex_enter(&ring->vr_u_mutex);

        uidx = ring->vr_cur_uidx;
        vq_write_used_ent(ring, uidx & ring->vr_mask, cookie, len);
        uidx++;
        membar_producer();

        vq_write_used_idx(ring, uidx);
        ring->vr_cur_uidx = uidx;

        mutex_exit(&ring->vr_u_mutex);
}

void
vq_pushchain_many(viona_vring_t *ring, uint_t num_bufs, used_elem_t *elem)
{
        uint16_t uidx;

        mutex_enter(&ring->vr_u_mutex);

        uidx = ring->vr_cur_uidx;

        for (uint_t i = 0; i < num_bufs; i++, uidx++) {
                vq_write_used_ent(ring, uidx & ring->vr_mask, elem[i].id,
                    elem[i].len);
        }

        membar_producer();
        vq_write_used_idx(ring, uidx);
        ring->vr_cur_uidx = uidx;

        mutex_exit(&ring->vr_u_mutex);
}

/*
 * Set USED_NO_NOTIFY on VQ so guest elides doorbell calls for new entries.
 */
void
viona_ring_disable_notify(viona_vring_t *ring)
{
        volatile uint16_t *used_flags =
            viona_ring_addr(ring, LEGACY_USED_FLAGS_OFF(ring->vr_size));

        *used_flags |= VRING_USED_F_NO_NOTIFY;
}

/*
 * Clear USED_NO_NOTIFY on VQ so guest resumes doorbell calls for new entries.
 */
void
viona_ring_enable_notify(viona_vring_t *ring)
{
        volatile uint16_t *used_flags =
            viona_ring_addr(ring, LEGACY_USED_FLAGS_OFF(ring->vr_size));

        *used_flags &= ~VRING_USED_F_NO_NOTIFY;
}

/*
 * Return the number of available descriptors in the vring, taking care of the
 * 16-bit index wraparound.
 *
 * Note: If the number of apparently available descriptors is larger than the
 * ring size (due to guest misbehavior), this check will still report the
 * positive count of descriptors.
 */
uint16_t
viona_ring_num_avail(viona_vring_t *ring)
{
        volatile uint16_t *avail_idx =
            viona_ring_addr(ring, LEGACY_AVAIL_IDX_OFF(ring->vr_size));

        return (*avail_idx - ring->vr_cur_aidx);
}
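
/*
 * The uint16_t subtraction above handles index wraparound naturally: for
 * example, with a guest avail_idx of 3 and a vr_cur_aidx of 0xfffe, the
 * result truncates to (uint16_t)(3 - 0xfffe) = 5, i.e. five descriptors are
 * available across the wrap.
 */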