/*
 * Copyright (c) 2013 Chris Torek <torek @ torek net>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source. A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 *
 * Copyright 2015 Pluribus Networks Inc.
 * Copyright 2019 Joyent, Inc.
 * Copyright 2024 Oxide Computer Company
 */


#include <sys/disp.h>

#include "viona_impl.h"

#define	VRING_MAX_LEN		32768

/* Layout and sizing as defined in the spec for a legacy-style virtqueue */

#define	LEGACY_VQ_ALIGN		PAGESIZE

#define	LEGACY_DESC_SZ(qsz)	((qsz) * sizeof (struct virtio_desc))
/*
 * Available ring consists of avail_idx (uint16_t), flags (uint16_t), qsz avail
 * descriptors (uint16_t each), and (optional) used_event (uint16_t).
 */
#define	LEGACY_AVAIL_SZ(qsz)	(((qsz) + 3) * sizeof (uint16_t))
/*
 * Used ring consists of used_idx (uint16_t), flags (uint16_t), qsz used
 * descriptors (two uint32_t each), and (optional) avail_event (uint16_t).
 */
#define	LEGACY_USED_SZ(qsz)	\
	((qsz) * sizeof (struct virtio_used) + 3 * sizeof (uint16_t))

#define	LEGACY_AVAIL_FLAGS_OFF(qsz)	LEGACY_DESC_SZ(qsz)
#define	LEGACY_AVAIL_IDX_OFF(qsz)	\
	(LEGACY_DESC_SZ(qsz) + sizeof (uint16_t))
#define	LEGACY_AVAIL_ENT_OFF(qsz, idx)	\
	(LEGACY_DESC_SZ(qsz) + (2 + (idx)) * sizeof (uint16_t))

#define	LEGACY_USED_FLAGS_OFF(qsz)	\
	P2ROUNDUP(LEGACY_DESC_SZ(qsz) + LEGACY_AVAIL_SZ(qsz), LEGACY_VQ_ALIGN)
#define	LEGACY_USED_IDX_OFF(qsz)	\
	(LEGACY_USED_FLAGS_OFF(qsz) + sizeof (uint16_t))
#define	LEGACY_USED_ENT_OFF(qsz, idx)	\
	(LEGACY_USED_FLAGS_OFF(qsz) + 2 * sizeof (uint16_t) + \
	(idx) * sizeof (struct virtio_used))

#define	LEGACY_VQ_SIZE(qsz)	\
	(LEGACY_USED_FLAGS_OFF(qsz) + \
	P2ROUNDUP(LEGACY_USED_SZ(qsz), LEGACY_VQ_ALIGN))
#define	LEGACY_VQ_PAGES(qsz)	(LEGACY_VQ_SIZE(qsz) / PAGESIZE)
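
/*
 * For illustration, a worked example of the layout math above, assuming a
 * 4096-byte PAGESIZE and a ring of qsz = 256 entries (a 16-byte
 * struct virtio_desc and an 8-byte struct virtio_used, per the legacy format):
 *
 *	LEGACY_DESC_SZ(256)		= 256 * 16		= 4096
 *	LEGACY_AVAIL_SZ(256)		= (256 + 3) * 2		= 518
 *	LEGACY_USED_FLAGS_OFF(256)	= P2ROUNDUP(4614, 4096)	= 8192
 *	LEGACY_USED_SZ(256)		= 256 * 8 + 6		= 2054
 *	LEGACY_VQ_SIZE(256)		= 8192 + P2ROUNDUP(2054, 4096) = 12288
 *	LEGACY_VQ_PAGES(256)		= 3
 *
 * The used ring always begins on the next LEGACY_VQ_ALIGN boundary after the
 * descriptor table and available ring, as required for a legacy virtqueue.
 */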

struct vq_held_region {
	struct iovec	*vhr_iov;
	vmm_page_t	*vhr_head;
	vmm_page_t	*vhr_tail;
	/* Length of iovec array supplied in `vhr_iov` */
	uint_t		vhr_niov;
	/*
	 * Index into vhr_iov, indicating the next "free" entry (following the
	 * last entry which has valid contents).
	 */
	uint_t		vhr_idx;
};
typedef struct vq_held_region vq_held_region_t;

static bool viona_ring_map(viona_vring_t *, bool);
static void viona_ring_unmap(viona_vring_t *);
static kthread_t *viona_create_worker(viona_vring_t *);
static void viona_ring_consolidate_stats(viona_vring_t *);

static vmm_page_t *
vq_page_hold(viona_vring_t *ring, uint64_t gpa, bool writable)
{
	ASSERT3P(ring->vr_lease, !=, NULL);

	int prot = PROT_READ;
	if (writable) {
		prot |= PROT_WRITE;
	}

	return (vmm_drv_page_hold(ring->vr_lease, gpa, prot));
}

/*
 * Establish a hold on the page(s) which back the region of guest memory
 * covered by [gpa, gpa + len).  The host-kernel-virtual pointers to those
 * pages are stored in the iovec array supplied in `region`, along with the
 * chain of vmm_page_t entries representing the held pages.  Since guest memory
 * carries no guarantees of being physically contiguous (on the host), it is
 * assumed that an iovec entry will be required for each PAGESIZE section
 * covered by the specified `gpa` and `len` range.  For each iovec entry
 * successfully populated by holding a page, `vhr_idx` will be incremented so
 * it references the next available iovec entry (or `vhr_niov`, if the iovec
 * array is full).  The responsibility for releasing the `vmm_page_t` chain
 * (stored in `vhr_head` and `vhr_tail`) resides with the caller, regardless
 * of the result.
 */
static int
vq_region_hold(viona_vring_t *ring, uint64_t gpa, uint32_t len,
    bool writable, vq_held_region_t *region)
{
	const uint32_t front_offset = gpa & PAGEOFFSET;
	const uint32_t front_len = MIN(len, PAGESIZE - front_offset);
	uint_t pages = 1;
	vmm_page_t *vmp;
	caddr_t buf;

	ASSERT3U(region->vhr_idx, <, region->vhr_niov);

	if (front_len < len) {
		pages += P2ROUNDUP((uint64_t)(len - front_len),
		    PAGESIZE) / PAGESIZE;
	}
	if (pages > (region->vhr_niov - region->vhr_idx)) {
		return (E2BIG);
	}

	vmp = vq_page_hold(ring, gpa & PAGEMASK, writable);
	if (vmp == NULL) {
		return (EFAULT);
	}
	buf = (caddr_t)vmm_drv_page_readable(vmp);

	region->vhr_iov[region->vhr_idx].iov_base = buf + front_offset;
	region->vhr_iov[region->vhr_idx].iov_len = front_len;
	region->vhr_idx++;
	gpa += front_len;
	len -= front_len;
	if (region->vhr_head == NULL) {
		region->vhr_head = vmp;
		region->vhr_tail = vmp;
	} else {
		vmm_drv_page_chain(region->vhr_tail, vmp);
		region->vhr_tail = vmp;
	}

	for (uint_t i = 1; i < pages; i++) {
		ASSERT3U(gpa & PAGEOFFSET, ==, 0);

		vmp = vq_page_hold(ring, gpa, writable);
		if (vmp == NULL) {
			return (EFAULT);
		}
		buf = (caddr_t)vmm_drv_page_readable(vmp);

		const uint32_t chunk_len = MIN(len, PAGESIZE);
		region->vhr_iov[region->vhr_idx].iov_base = buf;
		region->vhr_iov[region->vhr_idx].iov_len = chunk_len;
		region->vhr_idx++;
		gpa += chunk_len;
		len -= chunk_len;
		vmm_drv_page_chain(region->vhr_tail, vmp);
		region->vhr_tail = vmp;
	}

	return (0);
}
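
/*
 * A minimal worked example of the splitting performed in vq_region_hold()
 * above (assuming a 4096-byte PAGESIZE): for gpa = 0x10fe0 and len = 0x100,
 * front_offset is 0xfe0 and front_len is 0x20, so a second page is required.
 * The resulting iovec entries cover 0x20 bytes (tail of the first page) and
 * 0xe0 bytes (head of the second page), and two vmm_page_t holds are chained
 * onto the region for the caller to release.
 */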

static boolean_t
viona_ring_lease_expire_cb(void *arg)
{
	viona_vring_t *ring = arg;

	mutex_enter(&ring->vr_lock);
	cv_broadcast(&ring->vr_cv);
	mutex_exit(&ring->vr_lock);

	/* The lease will be broken asynchronously. */
	return (B_FALSE);
}

static void
viona_ring_lease_drop(viona_vring_t *ring)
{
	ASSERT(MUTEX_HELD(&ring->vr_lock));

	if (ring->vr_lease != NULL) {
		vmm_hold_t *hold = ring->vr_link->l_vm_hold;

		ASSERT(hold != NULL);

		/*
		 * Without an active lease, the ring mappings cannot be
		 * considered valid.
		 */
		viona_ring_unmap(ring);

		vmm_drv_lease_break(hold, ring->vr_lease);
		ring->vr_lease = NULL;
	}
}

boolean_t
viona_ring_lease_renew(viona_vring_t *ring)
{
	vmm_hold_t *hold = ring->vr_link->l_vm_hold;

	ASSERT(hold != NULL);
	ASSERT(MUTEX_HELD(&ring->vr_lock));

	viona_ring_lease_drop(ring);

	/*
	 * Lease renewal will fail if the VM has requested that all holds be
	 * cleaned up.
	 */
	ring->vr_lease = vmm_drv_lease_sign(hold, viona_ring_lease_expire_cb,
	    ring);
	if (ring->vr_lease != NULL) {
		/* A ring undergoing renewal will need valid guest mappings */
		if (ring->vr_pa != 0 && ring->vr_size != 0) {
			/*
			 * If new mappings cannot be established, consider the
			 * lease renewal a failure.
			 */
			if (!viona_ring_map(ring, ring->vr_state == VRS_INIT)) {
				viona_ring_lease_drop(ring);
				return (B_FALSE);
			}
		}
	}
	return (ring->vr_lease != NULL);
}

void
viona_ring_alloc(viona_link_t *link, viona_vring_t *ring)
{
	ring->vr_link = link;
	mutex_init(&ring->vr_lock, NULL, MUTEX_DRIVER, NULL);
	cv_init(&ring->vr_cv, NULL, CV_DRIVER, NULL);
	mutex_init(&ring->vr_a_mutex, NULL, MUTEX_DRIVER, NULL);
	mutex_init(&ring->vr_u_mutex, NULL, MUTEX_DRIVER, NULL);
}

static void
viona_ring_misc_free(viona_vring_t *ring)
{
	const uint_t qsz = ring->vr_size;

	viona_tx_ring_free(ring, qsz);
}

void
viona_ring_free(viona_vring_t *ring)
{
	mutex_destroy(&ring->vr_lock);
	cv_destroy(&ring->vr_cv);
	mutex_destroy(&ring->vr_a_mutex);
	mutex_destroy(&ring->vr_u_mutex);
	ring->vr_link = NULL;
}
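
/*
 * Note on the size validation in viona_ring_init() below: the expression
 * (1 << (ffs(qsz) - 1)) != qsz rejects any queue size which is not a power of
 * two.  For example, qsz = 256 passes (ffs(256) == 9, 1 << 8 == 256), while
 * qsz = 24 is rejected (ffs(24) == 4, 1 << 3 == 8).  The ring physical address
 * must likewise be aligned to LEGACY_VQ_ALIGN, as required for a legacy
 * virtqueue.
 */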

int
viona_ring_init(viona_link_t *link, uint16_t idx,
    const struct viona_ring_params *params)
{
	viona_vring_t *ring;
	kthread_t *t;
	int err = 0;
	const uint16_t qsz = params->vrp_size;
	const uint64_t pa = params->vrp_pa;

	if (idx >= VIONA_VQ_MAX) {
		return (EINVAL);
	}

	if (qsz == 0 || qsz > VRING_MAX_LEN || (1 << (ffs(qsz) - 1)) != qsz) {
		return (EINVAL);
	}
	if ((pa & (LEGACY_VQ_ALIGN - 1)) != 0) {
		return (EINVAL);
	}

	ring = &link->l_vrings[idx];
	mutex_enter(&ring->vr_lock);
	if (ring->vr_state != VRS_RESET) {
		mutex_exit(&ring->vr_lock);
		return (EBUSY);
	}
	VERIFY(ring->vr_state_flags == 0);

	ring->vr_lease = NULL;
	if (!viona_ring_lease_renew(ring)) {
		err = EBUSY;
		goto fail;
	}

	ring->vr_size = qsz;
	ring->vr_mask = (ring->vr_size - 1);
	ring->vr_pa = pa;
	if (!viona_ring_map(ring, true)) {
		err = EINVAL;
		goto fail;
	}

	/* Initialize queue indexes */
	ring->vr_cur_aidx = params->vrp_avail_idx;
	ring->vr_cur_uidx = params->vrp_used_idx;

	if (idx == VIONA_VQ_TX) {
		viona_tx_ring_alloc(ring, qsz);
	}

	/* Zero out MSI-X configuration */
	ring->vr_msi_addr = 0;
	ring->vr_msi_msg = 0;

	/* Clear the stats */
	bzero(&ring->vr_stats, sizeof (ring->vr_stats));
	bzero(&ring->vr_err_stats, sizeof (ring->vr_err_stats));

	t = viona_create_worker(ring);
	if (t == NULL) {
		err = ENOMEM;
		goto fail;
	}
	ring->vr_worker_thread = t;
	ring->vr_state = VRS_SETUP;
	cv_broadcast(&ring->vr_cv);
	mutex_exit(&ring->vr_lock);
	return (0);

fail:
	viona_ring_lease_drop(ring);
	viona_ring_misc_free(ring);
	ring->vr_size = 0;
	ring->vr_mask = 0;
	ring->vr_pa = 0;
	ring->vr_cur_aidx = 0;
	ring->vr_cur_uidx = 0;
	mutex_exit(&ring->vr_lock);
	return (err);
}

int
viona_ring_get_state(viona_link_t *link, uint16_t idx,
    struct viona_ring_params *params)
{
	viona_vring_t *ring;

	if (idx >= VIONA_VQ_MAX) {
		return (EINVAL);
	}

	ring = &link->l_vrings[idx];
	mutex_enter(&ring->vr_lock);

	params->vrp_size = ring->vr_size;
	params->vrp_pa = ring->vr_pa;

	if (ring->vr_state == VRS_RUN) {
		/* On a running ring, we must heed the avail/used locks */
		mutex_enter(&ring->vr_a_mutex);
		params->vrp_avail_idx = ring->vr_cur_aidx;
		mutex_exit(&ring->vr_a_mutex);
		mutex_enter(&ring->vr_u_mutex);
		params->vrp_used_idx = ring->vr_cur_uidx;
		mutex_exit(&ring->vr_u_mutex);
	} else {
		/* Otherwise vr_lock is adequate protection */
		params->vrp_avail_idx = ring->vr_cur_aidx;
		params->vrp_used_idx = ring->vr_cur_uidx;
	}

	mutex_exit(&ring->vr_lock);

	return (0);
}

int
viona_ring_reset(viona_vring_t *ring, boolean_t heed_signals)
{
	mutex_enter(&ring->vr_lock);
	if (ring->vr_state == VRS_RESET) {
		mutex_exit(&ring->vr_lock);
		return (0);
	}

	if ((ring->vr_state_flags & VRSF_REQ_STOP) == 0) {
		ring->vr_state_flags |= VRSF_REQ_STOP;
		cv_broadcast(&ring->vr_cv);
	}
	while (ring->vr_state != VRS_RESET) {
		if (!heed_signals) {
			cv_wait(&ring->vr_cv, &ring->vr_lock);
		} else {
			int rs;

			rs = cv_wait_sig(&ring->vr_cv, &ring->vr_lock);
			if (rs <= 0 && ring->vr_state != VRS_RESET) {
				mutex_exit(&ring->vr_lock);
				return (EINTR);
			}
		}
	}
	mutex_exit(&ring->vr_lock);
	return (0);
}

static bool
viona_ring_map(viona_vring_t *ring, bool defer_dirty)
{
	const uint16_t qsz = ring->vr_size;
	uintptr_t pa = ring->vr_pa;

	ASSERT3U(qsz, !=, 0);
	ASSERT3U(qsz, <=, VRING_MAX_LEN);
	ASSERT3U(pa, !=, 0);
	ASSERT3U(pa & (LEGACY_VQ_ALIGN - 1), ==, 0);
	ASSERT3U(LEGACY_VQ_ALIGN, ==, PAGESIZE);
	ASSERT(MUTEX_HELD(&ring->vr_lock));
	ASSERT3P(ring->vr_map_pages, ==, NULL);

	const uint_t npages = LEGACY_VQ_PAGES(qsz);
	ring->vr_map_pages = kmem_zalloc(npages * sizeof (void *), KM_SLEEP);

	int page_flags = 0;
	if (defer_dirty) {
		/*
		 * During initialization, and when entering the paused state,
		 * the page holds for a virtqueue are established with the
		 * DEFER_DIRTY flag set.
		 *
		 * This prevents those page holds from immediately marking the
		 * underlying pages as dirty, since the viona emulation is not
		 * yet performing any accesses.  Once the ring transitions to
		 * the VRS_RUN state, the held pages will be marked as dirty.
		 *
		 * Any ring mappings performed outside those state conditions,
		 * such as those done as part of vmm_lease renewal during
		 * steady-state operation, will map the ring pages normally
		 * (as considered immediately dirty).
		 */
		page_flags |= VMPF_DEFER_DIRTY;
	}

	vmm_page_t *prev = NULL;
	for (uint_t i = 0; i < npages; i++, pa += PAGESIZE) {
		vmm_page_t *vmp;

		vmp = vmm_drv_page_hold_ext(ring->vr_lease, pa,
		    PROT_READ | PROT_WRITE, page_flags);
		if (vmp == NULL) {
			viona_ring_unmap(ring);
			return (false);
		}

		/*
		 * Keep the first page as the head of the chain, appending all
		 * subsequent pages to the tail.
		 */
		if (prev == NULL) {
			ring->vr_map_hold = vmp;
		} else {
			vmm_drv_page_chain(prev, vmp);
		}
		prev = vmp;
		ring->vr_map_pages[i] = vmm_drv_page_writable(vmp);
	}

	return (true);
}

static void
viona_ring_mark_dirty(viona_vring_t *ring)
{
	ASSERT(MUTEX_HELD(&ring->vr_lock));
	ASSERT(ring->vr_map_hold != NULL);

	for (vmm_page_t *vp = ring->vr_map_hold; vp != NULL;
	    vp = vmm_drv_page_next(vp)) {
		vmm_drv_page_mark_dirty(vp);
	}
}

static void
viona_ring_unmap(viona_vring_t *ring)
{
	ASSERT(MUTEX_HELD(&ring->vr_lock));

	void **map = ring->vr_map_pages;
	if (map != NULL) {
		const uint_t npages = LEGACY_VQ_PAGES(ring->vr_size);
		kmem_free(map, npages * sizeof (void *));
		ring->vr_map_pages = NULL;

		vmm_drv_page_release_chain(ring->vr_map_hold);
		ring->vr_map_hold = NULL;
	} else {
		ASSERT3P(ring->vr_map_hold, ==, NULL);
	}
}

static inline void *
viona_ring_addr(viona_vring_t *ring, uint_t off)
{
	ASSERT3P(ring->vr_map_pages, !=, NULL);
	ASSERT3U(LEGACY_VQ_SIZE(ring->vr_size), >, off);

	const uint_t page_num = off / PAGESIZE;
	const uint_t page_off = off % PAGESIZE;
	return ((caddr_t)ring->vr_map_pages[page_num] + page_off);
}

void
viona_intr_ring(viona_vring_t *ring, boolean_t skip_flags_check)
{
	if (!skip_flags_check) {
		volatile uint16_t *avail_flags = viona_ring_addr(ring,
		    LEGACY_AVAIL_FLAGS_OFF(ring->vr_size));

		if ((*avail_flags & VRING_AVAIL_F_NO_INTERRUPT) != 0) {
			return;
		}
	}

	mutex_enter(&ring->vr_lock);
	uint64_t addr = ring->vr_msi_addr;
	uint64_t msg = ring->vr_msi_msg;
	mutex_exit(&ring->vr_lock);
	if (addr != 0) {
		/* Deliver the interrupt directly, if so configured... */
		(void) vmm_drv_msi(ring->vr_lease, addr, msg);
	} else {
		/* ... otherwise, leave it to userspace */
		if (atomic_cas_uint(&ring->vr_intr_enabled, 0, 1) == 0) {
			pollwakeup(&ring->vr_link->l_pollhead, POLLRDBAND);
		}
	}
}

static inline bool
vring_stop_req(const viona_vring_t *ring)
{
	return ((ring->vr_state_flags & VRSF_REQ_STOP) != 0);
}

static inline bool
vring_pause_req(const viona_vring_t *ring)
{
	return ((ring->vr_state_flags & VRSF_REQ_PAUSE) != 0);
}

static inline bool
vring_start_req(const viona_vring_t *ring)
{
	return ((ring->vr_state_flags & VRSF_REQ_START) != 0);
}

/*
 * Check if vring worker thread should bail out.  This will heed indications
 * that the containing process is exiting, as well as requests to stop or pause
 * the ring.  The `stop_only` parameter controls if pause requests are ignored
 * (true) or checked (false).
 *
 * Caller should hold vr_lock.
 */
static bool
vring_need_bail_ext(const viona_vring_t *ring, bool stop_only)
{
	ASSERT(MUTEX_HELD(&ring->vr_lock));

	if (vring_stop_req(ring) ||
	    (!stop_only && vring_pause_req(ring))) {
		return (true);
	}

	kthread_t *t = ring->vr_worker_thread;
	if (t != NULL) {
		proc_t *p = ttoproc(t);

		ASSERT(p != NULL);
		if ((p->p_flag & SEXITING) != 0) {
			return (true);
		}
	}
	return (false);
}

bool
vring_need_bail(const viona_vring_t *ring)
{
	return (vring_need_bail_ext(ring, false));
}

int
viona_ring_pause(viona_vring_t *ring)
{
	mutex_enter(&ring->vr_lock);
	switch (ring->vr_state) {
	case VRS_RESET:
	case VRS_SETUP:
	case VRS_INIT:
		/*
		 * For rings which have not yet started (even those in the
		 * VRS_SETUP and VRS_INIT phases, where there is a running
		 * worker thread waiting to be released to do its intended
		 * task), it is adequate to simply clear any start request, to
		 * keep them from proceeding into the actual work processing
		 * function.
		 */
		ring->vr_state_flags &= ~VRSF_REQ_START;
		mutex_exit(&ring->vr_lock);
		return (0);

	case VRS_STOP:
		if ((ring->vr_state_flags & VRSF_REQ_STOP) != 0) {
			/* A ring on its way to RESET cannot be paused. */
			mutex_exit(&ring->vr_lock);
			return (EBUSY);
		}
		/* FALLTHROUGH */
	case VRS_RUN:
		ring->vr_state_flags |= VRSF_REQ_PAUSE;
		cv_broadcast(&ring->vr_cv);
		break;

	default:
		panic("invalid ring state %d", ring->vr_state);
		break;
	}

	for (;;) {
		int res = cv_wait_sig(&ring->vr_cv, &ring->vr_lock);

		if (ring->vr_state == VRS_INIT ||
		    (ring->vr_state_flags & VRSF_REQ_PAUSE) == 0) {
			/* Ring made it to (or through) paused state */
			mutex_exit(&ring->vr_lock);
			return (0);
		}
		if (res == 0) {
			/* interrupted by signal */
			mutex_exit(&ring->vr_lock);
			return (EINTR);
		}
	}
	/* NOTREACHED */
}
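
/*
 * A brief sketch of the ring lifecycle as driven by viona_worker() below,
 * drawn from the state handling in this file:
 *
 *	VRS_RESET -> VRS_SETUP	viona_ring_init() creates the worker thread
 *	VRS_SETUP -> VRS_INIT	worker reports itself alive and waits for a
 *				start request (VRSF_REQ_START)
 *	VRS_INIT  -> VRS_RUN	worker marks the ring pages dirty and enters
 *				viona_worker_rx()/viona_worker_tx()
 *	VRS_RUN   -> VRS_STOP	rx/tx processing exits on a stop/pause request
 *				or process exit
 *	VRS_STOP  -> VRS_INIT	on pause: stats are consolidated and the ring
 *				pages are re-mapped with dirtying deferred
 *	VRS_STOP  -> VRS_RESET	on stop or failure: ring state is torn down
 *				and the worker thread exits
 */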

static void
viona_worker(void *arg)
{
	viona_vring_t *ring = (viona_vring_t *)arg;
	viona_link_t *link = ring->vr_link;

	mutex_enter(&ring->vr_lock);
	VERIFY3U(ring->vr_state, ==, VRS_SETUP);

	/* Bail immediately if ring shutdown or process exit was requested */
	if (vring_need_bail_ext(ring, true)) {
		goto ring_reset;
	}

	/* Report worker thread as alive and notify creator */
ring_init:
	ring->vr_state = VRS_INIT;
	cv_broadcast(&ring->vr_cv);

	while (!vring_start_req(ring)) {
		/*
		 * Keeping lease renewals timely while waiting for the ring to
		 * be started is important for avoiding deadlocks.
		 */
		if (vmm_drv_lease_expired(ring->vr_lease)) {
			if (!viona_ring_lease_renew(ring)) {
				goto ring_reset;
			}
		}

		(void) cv_wait_sig(&ring->vr_cv, &ring->vr_lock);

		if (vring_pause_req(ring)) {
			/* We are already paused in the INIT state. */
			ring->vr_state_flags &= ~VRSF_REQ_PAUSE;
		}
		if (vring_need_bail_ext(ring, true)) {
			goto ring_reset;
		}
	}

	ASSERT((ring->vr_state_flags & VRSF_REQ_START) != 0);
	ring->vr_state = VRS_RUN;
	ring->vr_state_flags &= ~VRSF_REQ_START;
	viona_ring_mark_dirty(ring);

	/* Ensure ring lease is valid first */
	if (vmm_drv_lease_expired(ring->vr_lease)) {
		if (!viona_ring_lease_renew(ring)) {
			goto ring_reset;
		}
	}

	/* Process actual work */
	if (ring == &link->l_vrings[VIONA_VQ_RX]) {
		viona_worker_rx(ring, link);
	} else if (ring == &link->l_vrings[VIONA_VQ_TX]) {
		viona_worker_tx(ring, link);
	} else {
		panic("unexpected ring: %p", (void *)ring);
	}

	VERIFY3U(ring->vr_state, ==, VRS_STOP);
	VERIFY3U(ring->vr_xfer_outstanding, ==, 0);

	/*
	 * Consolidate stats data so that it is not lost if/when this ring is
	 * being stopped.
	 */
	viona_ring_consolidate_stats(ring);

	/* Respond to a pause request if the ring is not required to stop */
	if (vring_pause_req(ring)) {
		ring->vr_state_flags &= ~VRSF_REQ_PAUSE;

		if (vring_need_bail_ext(ring, true)) {
			goto ring_reset;
		}

		/*
		 * To complete pausing of the ring, unmap and re-map the pages
		 * underpinning the virtqueue.  This is to synchronize their
		 * dirty state in the backing page tables and restore the
		 * defer-dirty state on the held pages.
		 */
		viona_ring_unmap(ring);
		if (viona_ring_map(ring, true)) {
			goto ring_init;
		}

		/*
		 * If the ring pages failed to be mapped, fallthrough to
		 * ring-reset like any other failure.
		 */
	}

ring_reset:
	viona_ring_misc_free(ring);

	viona_ring_lease_drop(ring);
	ring->vr_cur_aidx = 0;
	ring->vr_size = 0;
	ring->vr_mask = 0;
	ring->vr_pa = 0;
	ring->vr_state = VRS_RESET;
	ring->vr_state_flags = 0;
	ring->vr_worker_thread = NULL;
	cv_broadcast(&ring->vr_cv);
	mutex_exit(&ring->vr_lock);

	mutex_enter(&ttoproc(curthread)->p_lock);
	lwp_exit();
}

static kthread_t *
viona_create_worker(viona_vring_t *ring)
{
	k_sigset_t hold_set;
	proc_t *p = curproc;
	kthread_t *t;
	klwp_t *lwp;

	ASSERT(MUTEX_HELD(&ring->vr_lock));
	ASSERT(ring->vr_state == VRS_RESET);

	sigfillset(&hold_set);
	lwp = lwp_create(viona_worker, (void *)ring, 0, p, TS_STOPPED,
	    minclsyspri - 1, &hold_set, curthread->t_cid, 0);
	if (lwp == NULL) {
		return (NULL);
	}

	t = lwptot(lwp);
	mutex_enter(&p->p_lock);
	t->t_proc_flag = (t->t_proc_flag & ~TP_HOLDLWP) | TP_KTHREAD;
	lwp_create_done(t);
	mutex_exit(&p->p_lock);

	return (t);
}

void
vq_read_desc(viona_vring_t *ring, uint16_t idx, struct virtio_desc *descp)
{
	const uint_t entry_off = idx * sizeof (struct virtio_desc);

	ASSERT3U(idx, <, ring->vr_size);

	bcopy(viona_ring_addr(ring, entry_off), descp, sizeof (*descp));
}

static uint16_t
vq_read_avail(viona_vring_t *ring, uint16_t idx)
{
	ASSERT3U(idx, <, ring->vr_size);

	volatile uint16_t *avail_ent =
	    viona_ring_addr(ring, LEGACY_AVAIL_ENT_OFF(ring->vr_size, idx));
	return (*avail_ent);
}

/*
 * Given a buffer descriptor `desc`, attempt to map the pages backing that
 * region of guest physical memory, taking into account that there are no
 * guarantees about guest-contiguous pages being host-contiguous.
 */
static int
vq_map_desc_bufs(viona_vring_t *ring, const struct virtio_desc *desc,
    vq_held_region_t *region)
{
	int err;

	if (desc->vd_len == 0) {
		VIONA_PROBE2(desc_bad_len, viona_vring_t *, ring,
		    uint32_t, desc->vd_len);
		VIONA_RING_STAT_INCR(ring, desc_bad_len);
		return (EINVAL);
	}

	err = vq_region_hold(ring, desc->vd_addr, desc->vd_len,
	    (desc->vd_flags & VRING_DESC_F_WRITE) != 0, region);
	switch (err) {
	case E2BIG:
		VIONA_PROBE1(too_many_desc, viona_vring_t *, ring);
		VIONA_RING_STAT_INCR(ring, too_many_desc);
		break;
	case EFAULT:
		VIONA_PROBE_BAD_RING_ADDR(ring, desc->vd_addr);
		VIONA_RING_STAT_INCR(ring, bad_ring_addr);
		break;
	default:
		break;
	}

	return (err);
}

/*
 * Walk an indirect buffer descriptor `desc`, attempting to map the pages
 * backing the regions of guest memory covered by its constituent descriptors.
 */
static int
vq_map_indir_desc_bufs(viona_vring_t *ring, const struct virtio_desc *desc,
    vq_held_region_t *region)
{
	const uint16_t indir_count = desc->vd_len / sizeof (struct virtio_desc);

	if ((desc->vd_len & 0xf) != 0 || indir_count == 0 ||
	    indir_count > ring->vr_size ||
	    desc->vd_addr > (desc->vd_addr + desc->vd_len)) {
		VIONA_PROBE2(indir_bad_len, viona_vring_t *, ring,
		    uint32_t, desc->vd_len);
		VIONA_RING_STAT_INCR(ring, indir_bad_len);
		return (EINVAL);
	}

	uint16_t indir_next = 0;
	const uint8_t *buf = NULL;
	uint64_t buf_gpa = UINT64_MAX;
	vmm_page_t *vmp = NULL;
	int err = 0;

	for (;;) {
		uint64_t indir_gpa =
		    desc->vd_addr + (indir_next * sizeof (struct virtio_desc));
		uint64_t indir_page = indir_gpa & PAGEMASK;
		struct virtio_desc vp;

		/*
		 * Get a mapping for the page that the next indirect descriptor
		 * resides in, if it has not already been done.
		 */
		if (indir_page != buf_gpa) {
			if (vmp != NULL) {
				vmm_drv_page_release(vmp);
			}
			vmp = vq_page_hold(ring, indir_page, false);
			if (vmp == NULL) {
				VIONA_PROBE_BAD_RING_ADDR(ring, indir_page);
				VIONA_RING_STAT_INCR(ring, bad_ring_addr);
				err = EFAULT;
				break;
			}
			buf_gpa = indir_page;
			buf = vmm_drv_page_readable(vmp);
		}

		/*
		 * A copy of the indirect descriptor is made here, rather than
		 * simply using a reference pointer.  This prevents malicious
		 * or erroneous guest writes to the descriptor from fooling the
		 * flags/bounds verification through a race.
		 */
		bcopy(buf + (indir_gpa - indir_page), &vp, sizeof (vp));

		if (vp.vd_flags & VRING_DESC_F_INDIRECT) {
			VIONA_PROBE1(indir_bad_nest, viona_vring_t *, ring);
			VIONA_RING_STAT_INCR(ring, indir_bad_nest);
			err = EINVAL;
			break;
		} else if (vp.vd_len == 0) {
			VIONA_PROBE2(desc_bad_len, viona_vring_t *, ring,
			    uint32_t, vp.vd_len);
			VIONA_RING_STAT_INCR(ring, desc_bad_len);
			err = EINVAL;
			break;
		}

		err = vq_map_desc_bufs(ring, &vp, region);
		if (err != 0) {
			break;
		}

		/* Successfully reached the end of the indir chain */
		if ((vp.vd_flags & VRING_DESC_F_NEXT) == 0) {
			break;
		}
		if (region->vhr_idx >= region->vhr_niov) {
			VIONA_PROBE1(too_many_desc, viona_vring_t *, ring);
			VIONA_RING_STAT_INCR(ring, too_many_desc);
			err = E2BIG;
			break;
		}

		indir_next = vp.vd_next;
		if (indir_next >= indir_count) {
			VIONA_PROBE3(indir_bad_next, viona_vring_t *, ring,
			    uint16_t, indir_next, uint16_t, indir_count);
			VIONA_RING_STAT_INCR(ring, indir_bad_next);
			err = EINVAL;
			break;
		}
	}

	if (vmp != NULL) {
		vmm_drv_page_release(vmp);
	}
	return (err);
}
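
/*
 * For illustration of the length validation in vq_map_indir_desc_bufs()
 * above: an indirect table is an array of 16-byte struct virtio_desc entries,
 * so `vd_len` must be a non-zero multiple of 16 (the `vd_len & 0xf` check) and
 * may describe at most `vr_size` entries.  A descriptor with vd_len = 64, for
 * example, yields an indir_count of 4.
 */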

int
vq_popchain(viona_vring_t *ring, struct iovec *iov, uint_t niov,
    uint16_t *cookie, vmm_page_t **chain)
{
	uint16_t ndesc, idx, head, next;
	struct virtio_desc vdir;
	vq_held_region_t region = {
		.vhr_niov = niov,
		.vhr_iov = iov,
	};

	ASSERT(iov != NULL);
	ASSERT(niov > 0 && niov < INT_MAX);
	ASSERT(*chain == NULL);

	mutex_enter(&ring->vr_a_mutex);
	idx = ring->vr_cur_aidx;
	ndesc = viona_ring_num_avail(ring);

	if (ndesc == 0) {
		mutex_exit(&ring->vr_a_mutex);
		return (0);
	}
	if (ndesc > ring->vr_size) {
		/*
		 * Despite the fact that the guest has provided an 'avail_idx'
		 * which indicates that an impossible number of descriptors are
		 * available, continue on and attempt to process the next one.
		 *
		 * The transgression will not escape the probe or stats though.
		 */
		VIONA_PROBE2(ndesc_too_high, viona_vring_t *, ring,
		    uint16_t, ndesc);
		VIONA_RING_STAT_INCR(ring, ndesc_too_high);
	}

	head = vq_read_avail(ring, idx & ring->vr_mask);
	next = head;

	for (region.vhr_idx = 0; region.vhr_idx < niov; next = vdir.vd_next) {
		if (next >= ring->vr_size) {
			VIONA_PROBE2(bad_idx, viona_vring_t *, ring,
			    uint16_t, next);
			VIONA_RING_STAT_INCR(ring, bad_idx);
			break;
		}

		vq_read_desc(ring, next, &vdir);
		if ((vdir.vd_flags & VRING_DESC_F_INDIRECT) == 0) {
			if (vq_map_desc_bufs(ring, &vdir, &region) != 0) {
				break;
			}
		} else {
			/*
			 * Per the specification (Virtio 1.1 S2.6.5.3.1):
			 *   A driver MUST NOT set both VIRTQ_DESC_F_INDIRECT
			 *   and VIRTQ_DESC_F_NEXT in `flags`.
			 */
			if ((vdir.vd_flags & VRING_DESC_F_NEXT) != 0) {
				VIONA_PROBE3(indir_bad_next,
				    viona_vring_t *, ring,
				    uint16_t, next, uint16_t, 0);
				VIONA_RING_STAT_INCR(ring, indir_bad_next);
				break;
			}

			if (vq_map_indir_desc_bufs(ring, &vdir, &region) != 0) {
				break;
			}
		}

		if ((vdir.vd_flags & VRING_DESC_F_NEXT) == 0) {
			ring->vr_cur_aidx++;
			mutex_exit(&ring->vr_a_mutex);

			*cookie = head;
			*chain = region.vhr_head;
			return (region.vhr_idx);
		}
	}

	mutex_exit(&ring->vr_a_mutex);
	if (region.vhr_head != NULL) {
		/*
		 * If any pages were held prior to encountering an error, we
		 * must release them now.
		 */
		vmm_drv_page_release_chain(region.vhr_head);
	}
	return (-1);
}
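
/*
 * Because the used ring begins at a page-aligned offset and each used element
 * is 8 bytes placed after the 4-byte flags/idx header, a single element can
 * straddle a page boundary in larger rings.  For example (assuming a
 * 4096-byte PAGESIZE), with qsz >= 512 the element at index 511 begins 4
 * bytes before the end of a page, placing its id and len on different pages.
 * This is why the writers below resolve the id and len addresses separately
 * via viona_ring_addr(), rather than casting to a combined struct virtio_used.
 */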

static void
vq_write_used_ent(viona_vring_t *ring, uint16_t idx, uint16_t cookie,
    uint32_t len)
{
	/*
	 * In a larger ring, an entry could be split across pages, so be sure
	 * to account for that when configuring the transfer by looking up the
	 * ID and length addresses separately, rather than an address for a
	 * combined `struct virtio_used`.
	 */
	const uint_t used_id_off = LEGACY_USED_ENT_OFF(ring->vr_size, idx);
	const uint_t used_len_off = used_id_off + sizeof (uint32_t);
	volatile uint32_t *idp = viona_ring_addr(ring, used_id_off);
	volatile uint32_t *lenp = viona_ring_addr(ring, used_len_off);

	ASSERT(MUTEX_HELD(&ring->vr_u_mutex));

	*idp = cookie;
	*lenp = len;
}

static void
vq_write_used_idx(viona_vring_t *ring, uint16_t idx)
{
	ASSERT(MUTEX_HELD(&ring->vr_u_mutex));

	volatile uint16_t *used_idx =
	    viona_ring_addr(ring, LEGACY_USED_IDX_OFF(ring->vr_size));
	*used_idx = idx;
}

void
vq_pushchain(viona_vring_t *ring, uint32_t len, uint16_t cookie)
{
	uint16_t uidx;

	mutex_enter(&ring->vr_u_mutex);

	uidx = ring->vr_cur_uidx;
	vq_write_used_ent(ring, uidx & ring->vr_mask, cookie, len);
	uidx++;
	membar_producer();

	vq_write_used_idx(ring, uidx);
	ring->vr_cur_uidx = uidx;

	mutex_exit(&ring->vr_u_mutex);
}

void
vq_pushchain_many(viona_vring_t *ring, uint_t num_bufs, used_elem_t *elem)
{
	uint16_t uidx;

	mutex_enter(&ring->vr_u_mutex);

	uidx = ring->vr_cur_uidx;

	for (uint_t i = 0; i < num_bufs; i++, uidx++) {
		vq_write_used_ent(ring, uidx & ring->vr_mask, elem[i].id,
		    elem[i].len);
	}

	membar_producer();
	vq_write_used_idx(ring, uidx);
	ring->vr_cur_uidx = uidx;

	mutex_exit(&ring->vr_u_mutex);
}
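
/*
 * Example usage of the pop/push helpers above (a minimal sketch for
 * illustration only; names such as `iovs`, `total_len`, and the iovec count
 * are hypothetical, not part of this driver):
 *
 *	struct iovec iovs[8];
 *	vmm_page_t *chain = NULL;
 *	uint16_t cookie;
 *	int n = vq_popchain(ring, iovs, 8, &cookie, &chain);
 *
 *	if (n > 0) {
 *		... transfer data to/from iovs[0 .. n-1] ...
 *		vmm_drv_page_release_chain(chain);
 *		vq_pushchain(ring, total_len, cookie);
 *		viona_intr_ring(ring, B_FALSE);
 *	}
 *
 * A return of 0 means no descriptors were available, while -1 indicates a
 * malformed chain (any pages held along the way have already been released).
 */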

/*
 * Set USED_NO_NOTIFY on VQ so guest elides doorbell calls for new entries.
 */
void
viona_ring_disable_notify(viona_vring_t *ring)
{
	volatile uint16_t *used_flags =
	    viona_ring_addr(ring, LEGACY_USED_FLAGS_OFF(ring->vr_size));

	*used_flags |= VRING_USED_F_NO_NOTIFY;
}

/*
 * Clear USED_NO_NOTIFY on VQ so guest resumes doorbell calls for new entries.
 */
void
viona_ring_enable_notify(viona_vring_t *ring)
{
	volatile uint16_t *used_flags =
	    viona_ring_addr(ring, LEGACY_USED_FLAGS_OFF(ring->vr_size));

	*used_flags &= ~VRING_USED_F_NO_NOTIFY;
}

/*
 * Return the number of available descriptors in the vring, taking care of the
 * 16-bit index wraparound.
 *
 * Note: If the number of apparently available descriptors is larger than the
 * ring size (due to guest misbehavior), this check will still report the
 * positive count of descriptors.
 */
uint16_t
viona_ring_num_avail(viona_vring_t *ring)
{
	volatile uint16_t *avail_idx =
	    viona_ring_addr(ring, LEGACY_AVAIL_IDX_OFF(ring->vr_size));

	return (*avail_idx - ring->vr_cur_aidx);
}

/* Record a successfully transferred packet for the ring stats */
void
viona_ring_stat_accept(viona_vring_t *ring, uint32_t len)
{
	atomic_inc_64(&ring->vr_stats.vts_packets);
	atomic_add_64(&ring->vr_stats.vts_bytes, len);
}

/*
 * Record a dropped packet in the ring stats
 */
void
viona_ring_stat_drop(viona_vring_t *ring)
{
	atomic_inc_64(&ring->vr_stats.vts_drops);
}

/*
 * Record a packet transfer error in the ring stats
 */
void
viona_ring_stat_error(viona_vring_t *ring)
{
	atomic_inc_64(&ring->vr_stats.vts_errors);
}

/*
 * Consolidate statistic data for this ring into the totals for the link
 */
static void
viona_ring_consolidate_stats(viona_vring_t *ring)
{
	viona_link_t *link = ring->vr_link;
	struct viona_transfer_stats *lstat =
	    (ring == &link->l_vrings[VIONA_VQ_RX]) ?
	    &link->l_stats.vls_rx : &link->l_stats.vls_tx;

	mutex_enter(&link->l_stats_lock);
	lstat->vts_packets += ring->vr_stats.vts_packets;
	lstat->vts_bytes += ring->vr_stats.vts_bytes;
	lstat->vts_drops += ring->vr_stats.vts_drops;
	lstat->vts_errors += ring->vr_stats.vts_errors;
	bzero(&ring->vr_stats, sizeof (ring->vr_stats));
	mutex_exit(&link->l_stats_lock);
}