/*
 * Copyright (c) 2013 Chris Torek <torek @ torek net>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source. A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 *
 * Copyright 2015 Pluribus Networks Inc.
 * Copyright 2019 Joyent, Inc.
 * Copyright 2021 Oxide Computer Company
 */


#include <sys/disp.h>

#include "viona_impl.h"

#define VRING_MAX_LEN		32768

/* Layout and sizing as defined in the spec for a legacy-style virtqueue */

#define LEGACY_VQ_ALIGN		PAGESIZE

#define LEGACY_DESC_SZ(qsz)	((qsz) * sizeof (struct virtio_desc))
/*
 * Available ring consists of avail_idx (uint16_t), flags (uint16_t), qsz avail
 * descriptors (uint16_t each), and (optional) used_event (uint16_t).
 */
#define LEGACY_AVAIL_SZ(qsz)	(((qsz) + 3) * sizeof (uint16_t))
/*
 * Used ring consists of used_idx (uint16_t), flags (uint16_t), qsz used
 * descriptors (two uint32_t each), and (optional) avail_event (uint16_t).
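 *
 * As a purely illustrative worked example of the sizing macros here
 * (assuming a PAGESIZE of 4096 and the usual 16-byte virtio_desc and
 * 8-byte virtio_used legacy layouts), a queue size of 256 yields:
 *   descriptor table:  256 * 16        = 4096 bytes at offset 0
 *   avail ring:        (256 + 3) * 2   = 518 bytes at offset 4096
 *   used ring:         256 * 8 + 3 * 2 = 2054 bytes at offset 8192
 *                      (after rounding up to LEGACY_VQ_ALIGN)
 *   total:             LEGACY_VQ_SIZE(256) = 12288, LEGACY_VQ_PAGES(256) = 3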
 */
#define LEGACY_USED_SZ(qsz) \
	((qsz) * sizeof (struct virtio_used) + 3 * sizeof (uint16_t))

#define LEGACY_AVAIL_FLAGS_OFF(qsz)	LEGACY_DESC_SZ(qsz)
#define LEGACY_AVAIL_IDX_OFF(qsz) \
	(LEGACY_DESC_SZ(qsz) + sizeof (uint16_t))
#define LEGACY_AVAIL_ENT_OFF(qsz, idx) \
	(LEGACY_DESC_SZ(qsz) + (2 + (idx)) * sizeof (uint16_t))

#define LEGACY_USED_FLAGS_OFF(qsz) \
	P2ROUNDUP(LEGACY_DESC_SZ(qsz) + LEGACY_AVAIL_SZ(qsz), LEGACY_VQ_ALIGN)
#define LEGACY_USED_IDX_OFF(qsz) \
	(LEGACY_USED_FLAGS_OFF(qsz) + sizeof (uint16_t))
#define LEGACY_USED_ENT_OFF(qsz, idx) \
	(LEGACY_USED_FLAGS_OFF(qsz) + 2 * sizeof (uint16_t) + \
	(idx) * sizeof (struct virtio_used))

#define LEGACY_VQ_SIZE(qsz) \
	(LEGACY_USED_FLAGS_OFF(qsz) + \
	P2ROUNDUP(LEGACY_USED_SZ(qsz), LEGACY_VQ_ALIGN))
#define LEGACY_VQ_PAGES(qsz)	(LEGACY_VQ_SIZE(qsz) / PAGESIZE)

struct vq_held_region {
	struct iovec	*vhr_iov;
	vmm_page_t	*vhr_head;
	vmm_page_t	*vhr_tail;
	/* Length of iovec array supplied in `vhr_iov` */
	uint_t		vhr_niov;
	/*
	 * Index into vhr_iov, indicating the next "free" entry (following the
	 * last entry which has valid contents).
	 */
	uint_t		vhr_idx;
};
typedef struct vq_held_region vq_held_region_t;

static boolean_t viona_ring_map(viona_vring_t *);
static void viona_ring_unmap(viona_vring_t *);
static kthread_t *viona_create_worker(viona_vring_t *);

static vmm_page_t *
vq_page_hold(viona_vring_t *ring, uint64_t gpa, bool writable)
{
	ASSERT3P(ring->vr_lease, !=, NULL);

	int prot = PROT_READ;
	if (writable) {
		prot |= PROT_WRITE;
	}

	return (vmm_drv_page_hold(ring->vr_lease, gpa, prot));
}

/*
 * Establish a hold on the page(s) which back the region of guest memory
 * covered by [gpa, gpa + len). The host-kernel-virtual pointers to those
 * pages are stored in the iovec array supplied in `region`, along with the
 * chain of vmm_page_t entries representing the held pages. Since guest
 * memory carries no guarantees of being physically contiguous (on the host),
 * it is assumed that an iovec entry will be required for each PAGESIZE
 * section covered by the specified `gpa` and `len` range. For each iovec
 * entry successfully populated by holding a page, `vhr_idx` will be
 * incremented so it references the next available iovec entry (or
 * `vhr_niov`, if the iovec array is full). The responsibility for releasing
 * the `vmm_page_t` chain (stored in `vhr_head` and `vhr_tail`) resides with
 * the caller, regardless of the result.
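 *
 * Purely for illustration (assuming 4096-byte pages): a descriptor with
 * gpa = 0x10f80 and len = 0x200 straddles a page boundary, so it consumes
 * two iovec entries -- 0x80 bytes covering the tail of the first page and
 * 0x180 bytes at the start of the next -- and appends two held pages to the
 * vhr_head/vhr_tail chain.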
 */
static int
vq_region_hold(viona_vring_t *ring, uint64_t gpa, uint32_t len,
    bool writable, vq_held_region_t *region)
{
	const uint32_t front_offset = gpa & PAGEOFFSET;
	const uint32_t front_len = MIN(len, PAGESIZE - front_offset);
	uint_t pages = 1;
	vmm_page_t *vmp;
	caddr_t buf;

	ASSERT3U(region->vhr_idx, <, region->vhr_niov);

	if (front_len < len) {
		pages += P2ROUNDUP((uint64_t)(len - front_len),
		    PAGESIZE) / PAGESIZE;
	}
	if (pages > (region->vhr_niov - region->vhr_idx)) {
		return (E2BIG);
	}

	vmp = vq_page_hold(ring, gpa & PAGEMASK, writable);
	if (vmp == NULL) {
		return (EFAULT);
	}
	buf = (caddr_t)vmm_drv_page_readable(vmp);

	region->vhr_iov[region->vhr_idx].iov_base = buf + front_offset;
	region->vhr_iov[region->vhr_idx].iov_len = front_len;
	region->vhr_idx++;
	gpa += front_len;
	len -= front_len;
	if (region->vhr_head == NULL) {
		region->vhr_head = vmp;
		region->vhr_tail = vmp;
	} else {
		vmm_drv_page_chain(region->vhr_tail, vmp);
		region->vhr_tail = vmp;
	}

	for (uint_t i = 1; i < pages; i++) {
		ASSERT3U(gpa & PAGEOFFSET, ==, 0);

		vmp = vq_page_hold(ring, gpa, writable);
		if (vmp == NULL) {
			return (EFAULT);
		}
		buf = (caddr_t)vmm_drv_page_readable(vmp);

		const uint32_t chunk_len = MIN(len, PAGESIZE);
		region->vhr_iov[region->vhr_idx].iov_base = buf;
		region->vhr_iov[region->vhr_idx].iov_len = chunk_len;
		region->vhr_idx++;
		gpa += chunk_len;
		len -= chunk_len;
		vmm_drv_page_chain(region->vhr_tail, vmp);
		region->vhr_tail = vmp;
	}

	return (0);
}

static boolean_t
viona_ring_lease_expire_cb(void *arg)
{
	viona_vring_t *ring = arg;

	mutex_enter(&ring->vr_lock);
	cv_broadcast(&ring->vr_cv);
	mutex_exit(&ring->vr_lock);

	/* The lease will be broken asynchronously. */
	return (B_FALSE);
}

static void
viona_ring_lease_drop(viona_vring_t *ring)
{
	ASSERT(MUTEX_HELD(&ring->vr_lock));

	if (ring->vr_lease != NULL) {
		vmm_hold_t *hold = ring->vr_link->l_vm_hold;

		ASSERT(hold != NULL);

		/*
		 * Without an active lease, the ring mappings cannot be
		 * considered valid.
		 */
		viona_ring_unmap(ring);

		vmm_drv_lease_break(hold, ring->vr_lease);
		ring->vr_lease = NULL;
	}
}

boolean_t
viona_ring_lease_renew(viona_vring_t *ring)
{
	vmm_hold_t *hold = ring->vr_link->l_vm_hold;

	ASSERT(hold != NULL);
	ASSERT(MUTEX_HELD(&ring->vr_lock));

	viona_ring_lease_drop(ring);

	/*
	 * Lease renewal will fail if the VM has requested that all holds be
	 * cleaned up.
	 */
	ring->vr_lease = vmm_drv_lease_sign(hold, viona_ring_lease_expire_cb,
	    ring);
	if (ring->vr_lease != NULL) {
		/* A ring undergoing renewal will need valid guest mappings */
		if (ring->vr_pa != 0 && ring->vr_size != 0) {
			/*
			 * If new mappings cannot be established, consider the
			 * lease renewal a failure.
			 */
			if (!viona_ring_map(ring)) {
				viona_ring_lease_drop(ring);
				return (B_FALSE);
			}
		}
	}
	return (ring->vr_lease != NULL);
}

void
viona_ring_alloc(viona_link_t *link, viona_vring_t *ring)
{
	ring->vr_link = link;
	mutex_init(&ring->vr_lock, NULL, MUTEX_DRIVER, NULL);
	cv_init(&ring->vr_cv, NULL, CV_DRIVER, NULL);
	mutex_init(&ring->vr_a_mutex, NULL, MUTEX_DRIVER, NULL);
	mutex_init(&ring->vr_u_mutex, NULL, MUTEX_DRIVER, NULL);
}

static void
viona_ring_misc_free(viona_vring_t *ring)
{
	const uint_t qsz = ring->vr_size;

	viona_tx_ring_free(ring, qsz);
}

void
viona_ring_free(viona_vring_t *ring)
{
	mutex_destroy(&ring->vr_lock);
	cv_destroy(&ring->vr_cv);
	mutex_destroy(&ring->vr_a_mutex);
	mutex_destroy(&ring->vr_u_mutex);
	ring->vr_link = NULL;
}

int
viona_ring_init(viona_link_t *link, uint16_t idx, uint16_t qsz, uint64_t pa)
{
	viona_vring_t *ring;
	kthread_t *t;
	int err = 0;

	if (idx >= VIONA_VQ_MAX) {
		return (EINVAL);
	}
	if (qsz == 0 || qsz > VRING_MAX_LEN || (1 << (ffs(qsz) - 1)) != qsz) {
		return (EINVAL);
	}
	if ((pa & (LEGACY_VQ_ALIGN - 1)) != 0) {
		return (EINVAL);
	}

	ring = &link->l_vrings[idx];
	mutex_enter(&ring->vr_lock);
	if (ring->vr_state != VRS_RESET) {
		mutex_exit(&ring->vr_lock);
		return (EBUSY);
	}
	VERIFY(ring->vr_state_flags == 0);

	ring->vr_lease = NULL;
	if (!viona_ring_lease_renew(ring)) {
		err = EBUSY;
		goto fail;
	}

	ring->vr_size = qsz;
	ring->vr_mask = (ring->vr_size - 1);
	ring->vr_pa = pa;
	if (!viona_ring_map(ring)) {
		err = EINVAL;
		goto fail;
	}

	/* Initialize queue indexes */
	ring->vr_cur_aidx = 0;
	ring->vr_cur_uidx = 0;

	if (idx == VIONA_VQ_TX) {
		viona_tx_ring_alloc(ring, qsz);
	}

	/* Zero out MSI-X configuration */
	ring->vr_msi_addr = 0;
	ring->vr_msi_msg = 0;

	/* Clear the stats */
	bzero(&ring->vr_stats, sizeof (ring->vr_stats));

	t = viona_create_worker(ring);
	if (t == NULL) {
		err = ENOMEM;
		goto fail;
	}
	ring->vr_worker_thread = t;
	ring->vr_state = VRS_SETUP;
	cv_broadcast(&ring->vr_cv);
	mutex_exit(&ring->vr_lock);
	return (0);

fail:
	viona_ring_lease_drop(ring);
	viona_ring_misc_free(ring);
	ring->vr_size = 0;
	ring->vr_mask = 0;
	ring->vr_pa = 0;
	mutex_exit(&ring->vr_lock);
	return (err);
}

int
viona_ring_reset(viona_vring_t *ring, boolean_t heed_signals)
{
	mutex_enter(&ring->vr_lock);
	if (ring->vr_state == VRS_RESET) {
		mutex_exit(&ring->vr_lock);
		return (0);
	}

	if ((ring->vr_state_flags & VRSF_REQ_STOP) == 0) {
		ring->vr_state_flags |= VRSF_REQ_STOP;
		cv_broadcast(&ring->vr_cv);
	}
	while (ring->vr_state != VRS_RESET) {
		if (!heed_signals) {
			cv_wait(&ring->vr_cv, &ring->vr_lock);
		} else {
			int rs;

			rs = cv_wait_sig(&ring->vr_cv, &ring->vr_lock);
			if (rs <= 0 && ring->vr_state != VRS_RESET) {
				mutex_exit(&ring->vr_lock);
				return (EINTR);
			}
		}
	}
	mutex_exit(&ring->vr_lock);
	return (0);
}

static boolean_t
viona_ring_map(viona_vring_t *ring)
{
	const uint16_t qsz = ring->vr_size;
	uintptr_t pa = ring->vr_pa;

	ASSERT3U(qsz, !=, 0);
	ASSERT3U(qsz, <=, VRING_MAX_LEN);
	ASSERT3U(pa, !=, 0);
	ASSERT3U(pa & (LEGACY_VQ_ALIGN - 1), ==, 0);
	ASSERT3U(LEGACY_VQ_ALIGN, ==, PAGESIZE);
	ASSERT(MUTEX_HELD(&ring->vr_lock));
	ASSERT3P(ring->vr_map_pages, ==, NULL);

	const uint_t npages = LEGACY_VQ_PAGES(qsz);
	ring->vr_map_pages = kmem_zalloc(npages * sizeof (void *), KM_SLEEP);

	vmm_page_t *prev = NULL;

	for (uint_t i = 0; i < npages; i++, pa += PAGESIZE) {
		vmm_page_t *vmp;

		vmp = vq_page_hold(ring, pa, true);
		if (vmp == NULL) {
			viona_ring_unmap(ring);
			return (B_FALSE);
		}

		/*
		 * Keep the first page as the head of the chain, appending all
		 * subsequent pages to the tail.
		 */
		if (prev == NULL) {
			ring->vr_map_hold = vmp;
		} else {
			vmm_drv_page_chain(prev, vmp);
		}
		prev = vmp;
		ring->vr_map_pages[i] = vmm_drv_page_writable(vmp);
	}

	return (B_TRUE);
}

static void
viona_ring_unmap(viona_vring_t *ring)
{
	ASSERT(MUTEX_HELD(&ring->vr_lock));

	void **map = ring->vr_map_pages;
	if (map != NULL) {
		const uint_t npages = LEGACY_VQ_PAGES(ring->vr_size);
		kmem_free(map, npages * sizeof (void *));
		ring->vr_map_pages = NULL;

		vmm_drv_page_release_chain(ring->vr_map_hold);
		ring->vr_map_hold = NULL;
	} else {
		ASSERT3P(ring->vr_map_hold, ==, NULL);
	}
}

static inline void *
viona_ring_addr(viona_vring_t *ring, uint_t off)
{
	ASSERT3P(ring->vr_map_pages, !=, NULL);
	ASSERT3U(LEGACY_VQ_SIZE(ring->vr_size), >, off);

	const uint_t page_num = off / PAGESIZE;
	const uint_t page_off = off % PAGESIZE;
	return ((caddr_t)ring->vr_map_pages[page_num] + page_off);
}

void
viona_intr_ring(viona_vring_t *ring, boolean_t skip_flags_check)
{
	if (!skip_flags_check) {
		volatile uint16_t *avail_flags = viona_ring_addr(ring,
		    LEGACY_AVAIL_FLAGS_OFF(ring->vr_size));

		if ((*avail_flags & VRING_AVAIL_F_NO_INTERRUPT) != 0) {
			return;
		}
	}

	mutex_enter(&ring->vr_lock);
	uint64_t addr = ring->vr_msi_addr;
	uint64_t msg = ring->vr_msi_msg;
	mutex_exit(&ring->vr_lock);
	if (addr != 0) {
		/* Deliver the interrupt directly, if so configured... */
		(void) vmm_drv_msi(ring->vr_lease, addr, msg);
	} else {
		/* ... otherwise, leave it to userspace */
		if (atomic_cas_uint(&ring->vr_intr_enabled, 0, 1) == 0) {
			pollwakeup(&ring->vr_link->l_pollhead, POLLRDBAND);
		}
	}
}

static void
viona_worker(void *arg)
{
	viona_vring_t *ring = (viona_vring_t *)arg;
	viona_link_t *link = ring->vr_link;
	proc_t *p = ttoproc(curthread);

	mutex_enter(&ring->vr_lock);
	VERIFY3U(ring->vr_state, ==, VRS_SETUP);

	/* Bail immediately if ring shutdown or process exit was requested */
	if (VRING_NEED_BAIL(ring, p)) {
		goto cleanup;
	}

	/* Report worker thread as alive and notify creator */
	ring->vr_state = VRS_INIT;
	cv_broadcast(&ring->vr_cv);

	while (ring->vr_state_flags == 0) {
		/*
		 * Keeping lease renewals timely while waiting for the ring to
		 * be started is important for avoiding deadlocks.
		 */
		if (vmm_drv_lease_expired(ring->vr_lease)) {
			if (!viona_ring_lease_renew(ring)) {
				goto cleanup;
			}
		}

		(void) cv_wait_sig(&ring->vr_cv, &ring->vr_lock);

		if (VRING_NEED_BAIL(ring, p)) {
			goto cleanup;
		}
	}

	ASSERT((ring->vr_state_flags & VRSF_REQ_START) != 0);
	ring->vr_state = VRS_RUN;
	ring->vr_state_flags &= ~VRSF_REQ_START;

	/* Ensure ring lease is valid first */
	if (vmm_drv_lease_expired(ring->vr_lease)) {
		if (!viona_ring_lease_renew(ring)) {
			goto cleanup;
		}
	}

	/* Process actual work */
	if (ring == &link->l_vrings[VIONA_VQ_RX]) {
		viona_worker_rx(ring, link);
	} else if (ring == &link->l_vrings[VIONA_VQ_TX]) {
		viona_worker_tx(ring, link);
	} else {
		panic("unexpected ring: %p", (void *)ring);
	}

	VERIFY3U(ring->vr_state, ==, VRS_STOP);

cleanup:
	if (ring->vr_txdesb != NULL) {
		/*
		 * Transmit activity must be entirely concluded before the
		 * associated descriptors can be cleaned up.
		 */
		VERIFY(ring->vr_xfer_outstanding == 0);
	}
	viona_ring_misc_free(ring);

	viona_ring_lease_drop(ring);
	ring->vr_cur_aidx = 0;
	ring->vr_size = 0;
	ring->vr_mask = 0;
	ring->vr_pa = 0;
	ring->vr_state = VRS_RESET;
	ring->vr_state_flags = 0;
	ring->vr_worker_thread = NULL;
	cv_broadcast(&ring->vr_cv);
	mutex_exit(&ring->vr_lock);

	mutex_enter(&ttoproc(curthread)->p_lock);
	lwp_exit();
}

static kthread_t *
viona_create_worker(viona_vring_t *ring)
{
	k_sigset_t hold_set;
	proc_t *p = curproc;
	kthread_t *t;
	klwp_t *lwp;

	ASSERT(MUTEX_HELD(&ring->vr_lock));
	ASSERT(ring->vr_state == VRS_RESET);

	sigfillset(&hold_set);
	lwp = lwp_create(viona_worker, (void *)ring, 0, p, TS_STOPPED,
	    minclsyspri - 1, &hold_set, curthread->t_cid, 0);
	if (lwp == NULL) {
		return (NULL);
	}

	t = lwptot(lwp);
	mutex_enter(&p->p_lock);
	t->t_proc_flag = (t->t_proc_flag & ~TP_HOLDLWP) | TP_KTHREAD;
	lwp_create_done(t);
	mutex_exit(&p->p_lock);

	return (t);
}

void
vq_read_desc(viona_vring_t *ring, uint16_t idx, struct virtio_desc *descp)
{
	const uint_t entry_off = idx * sizeof (struct virtio_desc);

	ASSERT3U(idx, <, ring->vr_size);

	bcopy(viona_ring_addr(ring, entry_off), descp, sizeof (*descp));
}

static uint16_t
vq_read_avail(viona_vring_t *ring, uint16_t idx)
{
	ASSERT3U(idx, <, ring->vr_size);

	volatile uint16_t *avail_ent =
	    viona_ring_addr(ring, LEGACY_AVAIL_ENT_OFF(ring->vr_size, idx));
	return (*avail_ent);
}

/*
 * Given a buffer descriptor `desc`, attempt to map the pages backing that
 * region of guest physical memory, taking into account that there are no
 * guarantees about guest-contiguous pages being host-contiguous.
 */
static int
vq_map_desc_bufs(viona_vring_t *ring, const struct virtio_desc *desc,
    vq_held_region_t *region)
{
	int err;

	if (desc->vd_len == 0) {
		VIONA_PROBE2(desc_bad_len, viona_vring_t *, ring,
		    uint32_t, desc->vd_len);
		VIONA_RING_STAT_INCR(ring, desc_bad_len);
		return (EINVAL);
	}

	err = vq_region_hold(ring, desc->vd_addr, desc->vd_len,
	    (desc->vd_flags & VRING_DESC_F_WRITE) != 0, region);
	switch (err) {
	case E2BIG:
		VIONA_PROBE1(too_many_desc, viona_vring_t *, ring);
		VIONA_RING_STAT_INCR(ring, too_many_desc);
		break;
	case EFAULT:
		VIONA_PROBE_BAD_RING_ADDR(ring, desc->vd_addr);
		VIONA_RING_STAT_INCR(ring, bad_ring_addr);
		break;
	default:
		break;
	}

	return (err);
}

/*
 * Walk an indirect buffer descriptor `desc`, attempting to map the pages
 * backing the regions of guest memory covered by its constituent descriptors.
 */
static int
vq_map_indir_desc_bufs(viona_vring_t *ring, const struct virtio_desc *desc,
    vq_held_region_t *region)
{
	const uint16_t indir_count = desc->vd_len / sizeof (struct virtio_desc);

	if ((desc->vd_len & 0xf) != 0 || indir_count == 0 ||
	    indir_count > ring->vr_size ||
	    desc->vd_addr > (desc->vd_addr + desc->vd_len)) {
		VIONA_PROBE2(indir_bad_len, viona_vring_t *, ring,
		    uint32_t, desc->vd_len);
		VIONA_RING_STAT_INCR(ring, indir_bad_len);
		return (EINVAL);
	}

	uint16_t indir_next = 0;
	const uint8_t *buf = NULL;
	uint64_t buf_gpa = UINT64_MAX;
	vmm_page_t *vmp = NULL;
	int err = 0;

	for (;;) {
		uint64_t indir_gpa =
		    desc->vd_addr + (indir_next * sizeof (struct virtio_desc));
		uint64_t indir_page = indir_gpa & PAGEMASK;
		struct virtio_desc vp;

		/*
		 * Get a mapping for the page that the next indirect
		 * descriptor resides in, if it has not already been done.
		 */
		if (indir_page != buf_gpa) {
			if (vmp != NULL) {
				vmm_drv_page_release(vmp);
			}
			vmp = vq_page_hold(ring, indir_page, false);
			if (vmp == NULL) {
				VIONA_PROBE_BAD_RING_ADDR(ring, indir_page);
				VIONA_RING_STAT_INCR(ring, bad_ring_addr);
				err = EFAULT;
				break;
			}
			buf_gpa = indir_page;
			buf = vmm_drv_page_readable(vmp);
		}

		/*
		 * A copy of the indirect descriptor is made here, rather than
		 * simply using a reference pointer. This prevents malicious
		 * or erroneous guest writes to the descriptor from fooling
		 * the flags/bounds verification through a race.
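		 * For example, were the checks performed against guest
		 * memory directly, the guest could rewrite vd_len or vd_next
		 * after validation but before use, steering the walk outside
		 * the indirect table.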
		 */
		bcopy(buf + (indir_gpa - indir_page), &vp, sizeof (vp));

		if (vp.vd_flags & VRING_DESC_F_INDIRECT) {
			VIONA_PROBE1(indir_bad_nest, viona_vring_t *, ring);
			VIONA_RING_STAT_INCR(ring, indir_bad_nest);
			err = EINVAL;
			break;
		} else if (vp.vd_len == 0) {
			VIONA_PROBE2(desc_bad_len, viona_vring_t *, ring,
			    uint32_t, vp.vd_len);
			VIONA_RING_STAT_INCR(ring, desc_bad_len);
			err = EINVAL;
			break;
		}

		err = vq_map_desc_bufs(ring, &vp, region);
		if (err != 0) {
			break;
		}

		/* Successfully reached the end of the indir chain */
		if ((vp.vd_flags & VRING_DESC_F_NEXT) == 0) {
			break;
		}
		if (region->vhr_idx >= region->vhr_niov) {
			VIONA_PROBE1(too_many_desc, viona_vring_t *, ring);
			VIONA_RING_STAT_INCR(ring, too_many_desc);
			err = E2BIG;
			break;
		}

		indir_next = vp.vd_next;
		if (indir_next >= indir_count) {
			VIONA_PROBE3(indir_bad_next, viona_vring_t *, ring,
			    uint16_t, indir_next, uint16_t, indir_count);
			VIONA_RING_STAT_INCR(ring, indir_bad_next);
			err = EINVAL;
			break;
		}
	}

	if (vmp != NULL) {
		vmm_drv_page_release(vmp);
	}
	return (err);
}

int
vq_popchain(viona_vring_t *ring, struct iovec *iov, uint_t niov,
    uint16_t *cookie, vmm_page_t **chain)
{
	uint16_t ndesc, idx, head, next;
	struct virtio_desc vdir;
	vq_held_region_t region = {
		.vhr_niov = niov,
		.vhr_iov = iov,
	};

	ASSERT(iov != NULL);
	ASSERT(niov > 0 && niov < INT_MAX);
	ASSERT(*chain == NULL);

	mutex_enter(&ring->vr_a_mutex);
	idx = ring->vr_cur_aidx;
	ndesc = viona_ring_num_avail(ring);

	if (ndesc == 0) {
		mutex_exit(&ring->vr_a_mutex);
		return (0);
	}
	if (ndesc > ring->vr_size) {
		/*
		 * Despite the fact that the guest has provided an 'avail_idx'
		 * which indicates that an impossible number of descriptors are
		 * available, continue on and attempt to process the next one.
		 *
		 * The transgression will not escape the probe or stats, though.
		 */
		VIONA_PROBE2(ndesc_too_high, viona_vring_t *, ring,
		    uint16_t, ndesc);
		VIONA_RING_STAT_INCR(ring, ndesc_too_high);
	}

	head = vq_read_avail(ring, idx & ring->vr_mask);
	next = head;

	for (region.vhr_idx = 0; region.vhr_idx < niov; next = vdir.vd_next) {
		if (next >= ring->vr_size) {
			VIONA_PROBE2(bad_idx, viona_vring_t *, ring,
			    uint16_t, next);
			VIONA_RING_STAT_INCR(ring, bad_idx);
			break;
		}

		vq_read_desc(ring, next, &vdir);
		if ((vdir.vd_flags & VRING_DESC_F_INDIRECT) == 0) {
			if (vq_map_desc_bufs(ring, &vdir, &region) != 0) {
				break;
			}
		} else {
			/*
			 * Per the specification (Virtio 1.1 S2.6.5.3.1):
			 *   A driver MUST NOT set both VIRTQ_DESC_F_INDIRECT
			 *   and VIRTQ_DESC_F_NEXT in `flags`.
			 */
			if ((vdir.vd_flags & VRING_DESC_F_NEXT) != 0) {
				VIONA_PROBE3(indir_bad_next,
				    viona_vring_t *, ring,
				    uint16_t, next, uint16_t, 0);
				VIONA_RING_STAT_INCR(ring, indir_bad_next);
				break;
			}

			if (vq_map_indir_desc_bufs(ring, &vdir, &region) != 0) {
				break;
			}
		}

		if ((vdir.vd_flags & VRING_DESC_F_NEXT) == 0) {
			ring->vr_cur_aidx++;
			mutex_exit(&ring->vr_a_mutex);

			*cookie = head;
			*chain = region.vhr_head;
			return (region.vhr_idx);
		}
	}

	mutex_exit(&ring->vr_a_mutex);
	if (region.vhr_head != NULL) {
		/*
		 * If any pages were held prior to encountering an error, we
		 * must release them now.
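		 * The caller sees the -1 result and is handed no page chain
		 * via `chain` in this case, so the release cannot be left to
		 * it.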
		 */
		vmm_drv_page_release_chain(region.vhr_head);
	}
	return (-1);
}


static void
vq_write_used_ent(viona_vring_t *ring, uint16_t idx, uint16_t cookie,
    uint32_t len)
{
	/*
	 * In a larger ring, an entry could be split across pages, so be sure
	 * to account for that when configuring the transfer by looking up the
	 * ID and length addresses separately, rather than an address for a
	 * combined `struct virtio_used`.
	 */
	const uint_t used_id_off = LEGACY_USED_ENT_OFF(ring->vr_size, idx);
	const uint_t used_len_off = used_id_off + sizeof (uint32_t);
	volatile uint32_t *idp = viona_ring_addr(ring, used_id_off);
	volatile uint32_t *lenp = viona_ring_addr(ring, used_len_off);

	ASSERT(MUTEX_HELD(&ring->vr_u_mutex));

	*idp = cookie;
	*lenp = len;
}

static void
vq_write_used_idx(viona_vring_t *ring, uint16_t idx)
{
	ASSERT(MUTEX_HELD(&ring->vr_u_mutex));

	volatile uint16_t *used_idx =
	    viona_ring_addr(ring, LEGACY_USED_IDX_OFF(ring->vr_size));
	*used_idx = idx;
}

void
vq_pushchain(viona_vring_t *ring, uint32_t len, uint16_t cookie)
{
	uint16_t uidx;

	mutex_enter(&ring->vr_u_mutex);

	uidx = ring->vr_cur_uidx;
	vq_write_used_ent(ring, uidx & ring->vr_mask, cookie, len);
	uidx++;
	membar_producer();

	vq_write_used_idx(ring, uidx);
	ring->vr_cur_uidx = uidx;

	mutex_exit(&ring->vr_u_mutex);
}

void
vq_pushchain_many(viona_vring_t *ring, uint_t num_bufs, used_elem_t *elem)
{
	uint16_t uidx;

	mutex_enter(&ring->vr_u_mutex);

	uidx = ring->vr_cur_uidx;

	for (uint_t i = 0; i < num_bufs; i++, uidx++) {
		vq_write_used_ent(ring, uidx & ring->vr_mask, elem[i].id,
		    elem[i].len);
	}

	membar_producer();
	vq_write_used_idx(ring, uidx);
	ring->vr_cur_uidx = uidx;

	mutex_exit(&ring->vr_u_mutex);
}

/*
 * Set USED_NO_NOTIFY on VQ so guest elides doorbell calls for new entries.
 */
void
viona_ring_disable_notify(viona_vring_t *ring)
{
	volatile uint16_t *used_flags =
	    viona_ring_addr(ring, LEGACY_USED_FLAGS_OFF(ring->vr_size));

	*used_flags |= VRING_USED_F_NO_NOTIFY;
}

/*
 * Clear USED_NO_NOTIFY on VQ so guest resumes doorbell calls for new entries.
 */
void
viona_ring_enable_notify(viona_vring_t *ring)
{
	volatile uint16_t *used_flags =
	    viona_ring_addr(ring, LEGACY_USED_FLAGS_OFF(ring->vr_size));

	*used_flags &= ~VRING_USED_F_NO_NOTIFY;
}

/*
 * Return the number of available descriptors in the vring, taking care of
 * the 16-bit index wraparound.
 *
 * Note: If the number of apparently available descriptors is larger than the
 * ring size (due to guest misbehavior), this check will still report the
 * positive count of descriptors.
 */
uint16_t
viona_ring_num_avail(viona_vring_t *ring)
{
	volatile uint16_t *avail_idx =
	    viona_ring_addr(ring, LEGACY_AVAIL_IDX_OFF(ring->vr_size));

	return (*avail_idx - ring->vr_cur_aidx);
}