/*
 * Copyright (c) 2013 Chris Torek <torek @ torek net>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source. A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 *
 * Copyright 2015 Pluribus Networks Inc.
 * Copyright 2019 Joyent, Inc.
 * Copyright 2025 Oxide Computer Company
 */


#include <sys/disp.h>

#include "viona_impl.h"

#define	VRING_MAX_LEN		32768

/* Layout and sizing as defined in the spec for a legacy-style virtqueue */

/*
 * Because viona is not built with MACHDEP defined, PAGESIZE and friends are
 * not constants but rather variable references. While viona remains x86-only,
 * we are free to hard-code this to 4k.
 */
#define	VQ_PGSZ			4096UL
#define	VQ_PGOFF		(VQ_PGSZ - 1)
#define	VQ_PGMASK		~VQ_PGOFF

#define	LEGACY_VQ_ALIGN		VQ_PGSZ

#define	LEGACY_DESC_SZ(qsz)	((qsz) * sizeof (struct virtio_desc))
/*
 * Available ring consists of avail_idx (uint16_t), flags (uint16_t), qsz avail
 * descriptors (uint16_t each), and (optional) used_event (uint16_t).
 */
#define	LEGACY_AVAIL_SZ(qsz)	(((qsz) + 3) * sizeof (uint16_t))
/*
 * Used ring consists of used_idx (uint16_t), flags (uint16_t), qsz used
 * descriptors (two uint32_t each), and (optional) avail_event (uint16_t).
 */
#define	LEGACY_USED_SZ(qsz) \
	((qsz) * sizeof (struct virtio_used) + 3 * sizeof (uint16_t))

#define	LEGACY_AVAIL_FLAGS_OFF(qsz)	LEGACY_DESC_SZ(qsz)
#define	LEGACY_AVAIL_IDX_OFF(qsz)	\
	(LEGACY_DESC_SZ(qsz) + sizeof (uint16_t))
#define	LEGACY_AVAIL_ENT_OFF(qsz, idx)	\
	(LEGACY_DESC_SZ(qsz) + (2 + (idx)) * sizeof (uint16_t))

#define	LEGACY_USED_FLAGS_OFF(qsz)	\
	P2ROUNDUP(LEGACY_DESC_SZ(qsz) + LEGACY_AVAIL_SZ(qsz), LEGACY_VQ_ALIGN)
#define	LEGACY_USED_IDX_OFF(qsz)	\
	(LEGACY_USED_FLAGS_OFF(qsz) + sizeof (uint16_t))
#define	LEGACY_USED_ENT_OFF(qsz, idx)	\
	(LEGACY_USED_FLAGS_OFF(qsz) + 2 * sizeof (uint16_t) + \
	(idx) * sizeof (struct virtio_used))

#define	LEGACY_VQ_SIZE(qsz)	\
	(LEGACY_USED_FLAGS_OFF(qsz) + \
	P2ROUNDUP(LEGACY_USED_SZ(qsz), LEGACY_VQ_ALIGN))
#define	LEGACY_VQ_PAGES(qsz)	(LEGACY_VQ_SIZE(qsz) / VQ_PGSZ)

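/*
 * Illustrative sketch (editor's note, not used by the driver): for a
 * hypothetical queue size of 256, the layout macros above work out to the
 * following byte values within the guest region backing the ring:
 *
 *	LEGACY_DESC_SZ(256)		= 256 * 16		= 4096
 *	LEGACY_AVAIL_SZ(256)		= (256 + 3) * 2		= 518
 *	LEGACY_USED_FLAGS_OFF(256)	= P2ROUNDUP(4096 + 518, 4096) = 8192
 *	LEGACY_USED_SZ(256)		= 256 * 8 + 6		= 2054
 *	LEGACY_VQ_SIZE(256)		= 8192 + P2ROUNDUP(2054, 4096) = 12288
 *	LEGACY_VQ_PAGES(256)		= 3
 */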
struct vq_held_region {
	struct iovec	*vhr_iov;
	vmm_page_t	*vhr_head;
	vmm_page_t	*vhr_tail;
	/* Length of iovec array supplied in `vhr_iov` */
	uint_t		vhr_niov;
	/*
	 * Index into vhr_iov, indicating the next "free" entry (following the
	 * last entry which has valid contents).
	 */
	uint_t		vhr_idx;

	/* Total length of populated entries in `vhr_iov` */
	uint32_t	vhr_len;
};
typedef struct vq_held_region vq_held_region_t;

static bool viona_ring_map(viona_vring_t *, bool);
static void viona_ring_unmap(viona_vring_t *);
static kthread_t *viona_create_worker(viona_vring_t *);
static void viona_ring_consolidate_stats(viona_vring_t *);

static vmm_page_t *
vq_page_hold(viona_vring_t *ring, uint64_t gpa, bool writable)
{
	ASSERT3P(ring->vr_lease, !=, NULL);

	int prot = PROT_READ;
	if (writable) {
		prot |= PROT_WRITE;
	}

	return (vmm_drv_page_hold(ring->vr_lease, gpa, prot));
}

/*
 * Establish a hold on the page(s) which back the region of guest memory
 * covered by [gpa, gpa + len). The host-kernel-virtual pointers to those
 * pages are stored in the iovec array supplied in `region`, along with the
 * chain of vmm_page_t entries representing the held pages. Since guest memory
 * carries no guarantees of being physically contiguous (on the host), it is
 * assumed that an iovec entry will be required for each page-sized section
 * covered by the specified `gpa` and `len` range. For each iovec entry
 * successfully populated by holding a page, `vhr_idx` will be incremented so
 * it references the next available iovec entry (or `vhr_niov`, if the iovec
 * array is full). The responsibility for releasing the `vmm_page_t` chain
 * (stored in `vhr_head` and `vhr_tail`) resides with the caller, regardless
 * of the result.
 */
static int
vq_region_hold(viona_vring_t *ring, uint64_t gpa, uint32_t len,
    bool writable, vq_held_region_t *region)
{
	const uint32_t front_offset = gpa & VQ_PGOFF;
	const uint32_t front_len = MIN(len, VQ_PGSZ - front_offset);
	uint_t pages = 1;
	vmm_page_t *vmp;
	caddr_t buf;

	ASSERT3U(region->vhr_idx, <, region->vhr_niov);

	if (front_len < len) {
		pages += P2ROUNDUP((uint64_t)(len - front_len),
		    VQ_PGSZ) / VQ_PGSZ;
	}
	if (pages > (region->vhr_niov - region->vhr_idx)) {
		return (E2BIG);
	}

	vmp = vq_page_hold(ring, gpa & VQ_PGMASK, writable);
	if (vmp == NULL) {
		return (EFAULT);
	}
	buf = (caddr_t)vmm_drv_page_readable(vmp);

	region->vhr_iov[region->vhr_idx].iov_base = buf + front_offset;
	region->vhr_iov[region->vhr_idx].iov_len = front_len;
	region->vhr_idx++;
	gpa += front_len;
	len -= front_len;
	if (region->vhr_head == NULL) {
		region->vhr_head = vmp;
		region->vhr_tail = vmp;
	} else {
		vmm_drv_page_chain(region->vhr_tail, vmp);
		region->vhr_tail = vmp;
	}

	for (uint_t i = 1; i < pages; i++) {
		ASSERT3U(gpa & VQ_PGOFF, ==, 0);

		vmp = vq_page_hold(ring, gpa, writable);
		if (vmp == NULL) {
			return (EFAULT);
		}
		buf = (caddr_t)vmm_drv_page_readable(vmp);

		const uint32_t chunk_len = MIN(len, VQ_PGSZ);
		region->vhr_iov[region->vhr_idx].iov_base = buf;
		region->vhr_iov[region->vhr_idx].iov_len = chunk_len;
		region->vhr_idx++;
		gpa += chunk_len;
		len -= chunk_len;
		vmm_drv_page_chain(region->vhr_tail, vmp);
		region->vhr_tail = vmp;
	}

	return (0);
}

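/*
 * Illustrative sketch (editor's note): a hypothetical descriptor covering
 * 6000 bytes at guest-physical address 0x10f00 spans three pages, so
 * vq_region_hold() would populate three iovec entries:
 *
 *	iov[0]: gpa 0x10f00 .. 0x10fff	(front_len = 256)
 *	iov[1]: gpa 0x11000 .. 0x11fff	(4096)
 *	iov[2]: gpa 0x12000 .. 0x1266f	(1648)
 *
 * Each entry's iov_base is the host-kernel-virtual mapping of the held page
 * (plus the front offset for the first entry), so the host-virtual addresses
 * need not be contiguous even though the guest-physical range is.
 */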
static boolean_t
viona_ring_lease_expire_cb(void *arg)
{
	viona_vring_t *ring = arg;

	mutex_enter(&ring->vr_lock);
	cv_broadcast(&ring->vr_cv);
	mutex_exit(&ring->vr_lock);

	/* The lease will be broken asynchronously. */
	return (B_FALSE);
}

static void
viona_ring_lease_drop(viona_vring_t *ring)
{
	ASSERT(MUTEX_HELD(&ring->vr_lock));

	if (ring->vr_lease != NULL) {
		vmm_hold_t *hold = ring->vr_link->l_vm_hold;

		ASSERT(hold != NULL);

		/*
		 * Without an active lease, the ring mappings cannot be
		 * considered valid.
		 */
		viona_ring_unmap(ring);

		vmm_drv_lease_break(hold, ring->vr_lease);
		ring->vr_lease = NULL;
	}
}

boolean_t
viona_ring_lease_renew(viona_vring_t *ring)
{
	vmm_hold_t *hold = ring->vr_link->l_vm_hold;

	ASSERT(hold != NULL);
	ASSERT(MUTEX_HELD(&ring->vr_lock));

	viona_ring_lease_drop(ring);

	/*
	 * Lease renewal will fail if the VM has requested that all holds be
	 * cleaned up.
	 */
	ring->vr_lease = vmm_drv_lease_sign(hold, viona_ring_lease_expire_cb,
	    ring);
	if (ring->vr_lease != NULL) {
		/* A ring undergoing renewal will need valid guest mappings */
		if (ring->vr_pa != 0 && ring->vr_size != 0) {
			/*
			 * If new mappings cannot be established, consider the
			 * lease renewal a failure.
			 */
			if (!viona_ring_map(ring, ring->vr_state == VRS_INIT)) {
				viona_ring_lease_drop(ring);
				return (B_FALSE);
			}
		}
	}
	return (ring->vr_lease != NULL);
}

void
viona_ring_alloc(viona_link_t *link, viona_vring_t *ring)
{
	ring->vr_link = link;
	mutex_init(&ring->vr_lock, NULL, MUTEX_DRIVER, NULL);
	cv_init(&ring->vr_cv, NULL, CV_DRIVER, NULL);
	mutex_init(&ring->vr_a_mutex, NULL, MUTEX_DRIVER, NULL);
	mutex_init(&ring->vr_u_mutex, NULL, MUTEX_DRIVER, NULL);
}

static void
viona_ring_misc_free(viona_vring_t *ring)
{
	const uint_t qsz = ring->vr_size;

	viona_tx_ring_free(ring, qsz);
}

void
viona_ring_free(viona_vring_t *ring)
{
	mutex_destroy(&ring->vr_lock);
	cv_destroy(&ring->vr_cv);
	mutex_destroy(&ring->vr_a_mutex);
	mutex_destroy(&ring->vr_u_mutex);
	ring->vr_link = NULL;
}

int
viona_ring_init(viona_link_t *link, uint16_t idx,
    const struct viona_ring_params *params)
{
	viona_vring_t *ring;
	kthread_t *t;
	int err = 0;
	const uint16_t qsz = params->vrp_size;
	const uint64_t pa = params->vrp_pa;

	if (idx >= VIONA_VQ_MAX) {
		return (EINVAL);
	}

	if (qsz == 0 || qsz > VRING_MAX_LEN || (1 << (ffs(qsz) - 1)) != qsz) {
		return (EINVAL);
	}
	if ((pa & (LEGACY_VQ_ALIGN - 1)) != 0) {
		return (EINVAL);
	}

	ring = &link->l_vrings[idx];
	mutex_enter(&ring->vr_lock);
	if (ring->vr_state != VRS_RESET) {
		mutex_exit(&ring->vr_lock);
		return (EBUSY);
	}
	VERIFY(ring->vr_state_flags == 0);

	ring->vr_lease = NULL;
	if (!viona_ring_lease_renew(ring)) {
		err = EBUSY;
		goto fail;
	}

	ring->vr_size = qsz;
	ring->vr_mask = (ring->vr_size - 1);
	ring->vr_pa = pa;
	if (!viona_ring_map(ring, true)) {
		err = EINVAL;
		goto fail;
	}

	/* Initialize queue indexes */
	ring->vr_cur_aidx = params->vrp_avail_idx;
	ring->vr_cur_uidx = params->vrp_used_idx;

	if (idx == VIONA_VQ_TX) {
		viona_tx_ring_alloc(ring, qsz);
	}

	/* Zero out MSI-X configuration */
	ring->vr_msi_addr = 0;
	ring->vr_msi_msg = 0;

	/* Clear the stats */
	bzero(&ring->vr_stats, sizeof (ring->vr_stats));
	bzero(&ring->vr_err_stats, sizeof (ring->vr_err_stats));

	t = viona_create_worker(ring);
	if (t == NULL) {
		err = ENOMEM;
		goto fail;
	}
	ring->vr_worker_thread = t;
	ring->vr_state = VRS_SETUP;
	cv_broadcast(&ring->vr_cv);
	mutex_exit(&ring->vr_lock);
	return (0);

fail:
	viona_ring_lease_drop(ring);
	viona_ring_misc_free(ring);
	ring->vr_size = 0;
	ring->vr_mask = 0;
	ring->vr_pa = 0;
	ring->vr_cur_aidx = 0;
	ring->vr_cur_uidx = 0;
	mutex_exit(&ring->vr_lock);
	return (err);
}

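/*
 * Illustrative note (editor's addition): the size check in viona_ring_init()
 * accepts only powers of two up to VRING_MAX_LEN, since
 * `(1 << (ffs(qsz) - 1)) == qsz` holds only when a single bit is set. As
 * hypothetical examples, qsz = 256 or 1024 passes, while qsz = 0, 384, or
 * 40000 is rejected, as is any `vrp_pa` that is not 4KiB-aligned.
 */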
int
viona_ring_get_state(viona_link_t *link, uint16_t idx,
    struct viona_ring_params *params)
{
	viona_vring_t *ring;

	if (idx >= VIONA_VQ_MAX) {
		return (EINVAL);
	}

	ring = &link->l_vrings[idx];
	mutex_enter(&ring->vr_lock);

	params->vrp_size = ring->vr_size;
	params->vrp_pa = ring->vr_pa;

	if (ring->vr_state == VRS_RUN) {
		/* On a running ring, we must heed the avail/used locks */
		mutex_enter(&ring->vr_a_mutex);
		params->vrp_avail_idx = ring->vr_cur_aidx;
		mutex_exit(&ring->vr_a_mutex);
		mutex_enter(&ring->vr_u_mutex);
		params->vrp_used_idx = ring->vr_cur_uidx;
		mutex_exit(&ring->vr_u_mutex);
	} else {
		/* Otherwise vr_lock is adequate protection */
		params->vrp_avail_idx = ring->vr_cur_aidx;
		params->vrp_used_idx = ring->vr_cur_uidx;
	}

	mutex_exit(&ring->vr_lock);

	return (0);
}

int
viona_ring_reset(viona_vring_t *ring, boolean_t heed_signals)
{
	mutex_enter(&ring->vr_lock);
	if (ring->vr_state == VRS_RESET) {
		mutex_exit(&ring->vr_lock);
		return (0);
	}

	if ((ring->vr_state_flags & VRSF_REQ_STOP) == 0) {
		ring->vr_state_flags |= VRSF_REQ_STOP;
		cv_broadcast(&ring->vr_cv);
	}
	while (ring->vr_state != VRS_RESET) {
		if (!heed_signals) {
			cv_wait(&ring->vr_cv, &ring->vr_lock);
		} else {
			int rs;

			rs = cv_wait_sig(&ring->vr_cv, &ring->vr_lock);
			if (rs <= 0 && ring->vr_state != VRS_RESET) {
				mutex_exit(&ring->vr_lock);
				return (EINTR);
			}
		}
	}
	mutex_exit(&ring->vr_lock);
	return (0);
}

static bool
viona_ring_map(viona_vring_t *ring, bool defer_dirty)
{
	const uint16_t qsz = ring->vr_size;
	uintptr_t pa = ring->vr_pa;

	ASSERT3U(qsz, !=, 0);
	ASSERT3U(qsz, <=, VRING_MAX_LEN);
	ASSERT3U(pa, !=, 0);
	ASSERT3U(pa & (LEGACY_VQ_ALIGN - 1), ==, 0);
	ASSERT(MUTEX_HELD(&ring->vr_lock));
	ASSERT3P(ring->vr_map_pages, ==, NULL);

	const uint_t npages = LEGACY_VQ_PAGES(qsz);
	ring->vr_map_pages = kmem_zalloc(npages * sizeof (void *), KM_SLEEP);

	int page_flags = 0;
	if (defer_dirty) {
		/*
		 * During initialization, and when entering the paused state,
		 * the page holds for a virtqueue are established with the
		 * DEFER_DIRTY flag set.
		 *
		 * This prevents those page holds from immediately marking the
		 * underlying pages as dirty, since the viona emulation is not
		 * yet performing any accesses. Once the ring transitions to
		 * the VRS_RUN state, the held pages will be marked as dirty.
		 *
		 * Any ring mappings performed outside those state conditions,
		 * such as those performed as part of vmm_lease renewal during
		 * steady-state operation, will map the ring pages normally
		 * (as considered immediately dirty).
		 */
		page_flags |= VMPF_DEFER_DIRTY;
	}

	vmm_page_t *prev = NULL;
	for (uint_t i = 0; i < npages; i++, pa += VQ_PGSZ) {
		vmm_page_t *vmp;

		vmp = vmm_drv_page_hold_ext(ring->vr_lease, pa,
		    PROT_READ | PROT_WRITE, page_flags);
		if (vmp == NULL) {
			viona_ring_unmap(ring);
			return (false);
		}

		/*
		 * Keep the first page as the head of the chain, appending all
		 * subsequent pages to the tail.
		 */
		if (prev == NULL) {
			ring->vr_map_hold = vmp;
		} else {
			vmm_drv_page_chain(prev, vmp);
		}
		prev = vmp;
		ring->vr_map_pages[i] = vmm_drv_page_writable(vmp);
	}

	return (true);
}

static void
viona_ring_mark_dirty(viona_vring_t *ring)
{
	ASSERT(MUTEX_HELD(&ring->vr_lock));
	ASSERT(ring->vr_map_hold != NULL);

	for (vmm_page_t *vp = ring->vr_map_hold; vp != NULL;
	    vp = vmm_drv_page_next(vp)) {
		vmm_drv_page_mark_dirty(vp);
	}
}

static void
viona_ring_unmap(viona_vring_t *ring)
{
	ASSERT(MUTEX_HELD(&ring->vr_lock));

	void **map = ring->vr_map_pages;
	if (map != NULL) {
		const uint_t npages = LEGACY_VQ_PAGES(ring->vr_size);
		kmem_free(map, npages * sizeof (void *));
		ring->vr_map_pages = NULL;

		vmm_drv_page_release_chain(ring->vr_map_hold);
		ring->vr_map_hold = NULL;
	} else {
		ASSERT3P(ring->vr_map_hold, ==, NULL);
	}
}

static inline void *
viona_ring_addr(viona_vring_t *ring, uint_t off)
{
	ASSERT3P(ring->vr_map_pages, !=, NULL);
	ASSERT3U(LEGACY_VQ_SIZE(ring->vr_size), >, off);

	const uint_t page_num = off / VQ_PGSZ;
	const uint_t page_off = off % VQ_PGSZ;
	return ((caddr_t)ring->vr_map_pages[page_num] + page_off);
}

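/*
 * Illustrative sketch (editor's note): because the pages backing the ring are
 * tracked individually, a ring offset is resolved with simple page arithmetic.
 * For a hypothetical offset of 8194, page_num = 8194 / 4096 = 2 and
 * page_off = 8194 % 4096 = 2, so the returned pointer is two bytes into the
 * third mapped page (for a 256-entry queue this is the used-ring index, since
 * LEGACY_USED_IDX_OFF(256) = 8192 + 2 = 8194).
 */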
void
viona_intr_ring(viona_vring_t *ring, boolean_t skip_flags_check)
{
	if (!skip_flags_check) {
		volatile uint16_t *avail_flags = viona_ring_addr(ring,
		    LEGACY_AVAIL_FLAGS_OFF(ring->vr_size));

		if ((*avail_flags & VRING_AVAIL_F_NO_INTERRUPT) != 0) {
			return;
		}
	}

	mutex_enter(&ring->vr_lock);
	uint64_t addr = ring->vr_msi_addr;
	uint64_t msg = ring->vr_msi_msg;
	mutex_exit(&ring->vr_lock);
	if (addr != 0) {
		/* Deliver the interrupt directly, if so configured... */
		(void) vmm_drv_msi(ring->vr_lease, addr, msg);
	} else {
		/* ... otherwise, leave it to userspace */
		if (atomic_cas_uint(&ring->vr_intr_enabled, 0, 1) == 0) {
			pollwakeup(&ring->vr_link->l_pollhead, POLLRDBAND);
		}
	}
}

static inline bool
vring_stop_req(const viona_vring_t *ring)
{
	return ((ring->vr_state_flags & VRSF_REQ_STOP) != 0);
}

static inline bool
vring_pause_req(const viona_vring_t *ring)
{
	return ((ring->vr_state_flags & VRSF_REQ_PAUSE) != 0);
}

static inline bool
vring_start_req(const viona_vring_t *ring)
{
	return ((ring->vr_state_flags & VRSF_REQ_START) != 0);
}

/*
 * Check if vring worker thread should bail out. This will heed indications
 * that the containing process is exiting, as well as requests to stop or
 * pause the ring. The `stop_only` parameter controls if pause requests are
 * ignored (true) or checked (false).
 *
 * Caller should hold vr_lock.
 */
static bool
vring_need_bail_ext(const viona_vring_t *ring, bool stop_only)
{
	ASSERT(MUTEX_HELD(&ring->vr_lock));

	if (vring_stop_req(ring) ||
	    (!stop_only && vring_pause_req(ring))) {
		return (true);
	}

	kthread_t *t = ring->vr_worker_thread;
	if (t != NULL) {
		proc_t *p = ttoproc(t);

		ASSERT(p != NULL);
		if ((p->p_flag & SEXITING) != 0) {
			return (true);
		}
	}
	return (false);
}

bool
vring_need_bail(const viona_vring_t *ring)
{
	return (vring_need_bail_ext(ring, false));
}

int
viona_ring_pause(viona_vring_t *ring)
{
	mutex_enter(&ring->vr_lock);
	switch (ring->vr_state) {
	case VRS_RESET:
	case VRS_SETUP:
	case VRS_INIT:
		/*
		 * For rings which have not yet started (even those in the
		 * VRS_SETUP and VRS_INIT phases, where there is a running
		 * worker thread waiting to be released to do its intended
		 * task), it is adequate to simply clear any start request to
		 * keep them from proceeding into the actual work processing
		 * function.
		 */
		ring->vr_state_flags &= ~VRSF_REQ_START;
		mutex_exit(&ring->vr_lock);
		return (0);

	case VRS_STOP:
		if ((ring->vr_state_flags & VRSF_REQ_STOP) != 0) {
			/* A ring on its way to RESET cannot be paused. */
			mutex_exit(&ring->vr_lock);
			return (EBUSY);
		}
		/* FALLTHROUGH */
	case VRS_RUN:
		ring->vr_state_flags |= VRSF_REQ_PAUSE;
		cv_broadcast(&ring->vr_cv);
		break;

	default:
		panic("invalid ring state %d", ring->vr_state);
		break;
	}

	for (;;) {
		int res = cv_wait_sig(&ring->vr_cv, &ring->vr_lock);

		if (ring->vr_state == VRS_INIT ||
		    (ring->vr_state_flags & VRSF_REQ_PAUSE) == 0) {
			/* Ring made it to (or through) paused state */
			mutex_exit(&ring->vr_lock);
			return (0);
		}
		if (res == 0) {
			/* interrupted by signal */
			mutex_exit(&ring->vr_lock);
			return (EINTR);
		}
	}
	/* NOTREACHED */
}

static void
viona_worker(void *arg)
{
	viona_vring_t *ring = (viona_vring_t *)arg;
	viona_link_t *link = ring->vr_link;

	mutex_enter(&ring->vr_lock);
	VERIFY3U(ring->vr_state, ==, VRS_SETUP);

	/* Bail immediately if ring shutdown or process exit was requested */
	if (vring_need_bail_ext(ring, true)) {
		goto ring_reset;
	}

	/* Report worker thread as alive and notify creator */
ring_init:
	ring->vr_state = VRS_INIT;
	cv_broadcast(&ring->vr_cv);

	while (!vring_start_req(ring)) {
		/*
		 * Keeping lease renewals timely while waiting for the ring to
		 * be started is important for avoiding deadlocks.
		 */
		if (vmm_drv_lease_expired(ring->vr_lease)) {
			if (!viona_ring_lease_renew(ring)) {
				goto ring_reset;
			}
		}

		(void) cv_wait_sig(&ring->vr_cv, &ring->vr_lock);

		if (vring_pause_req(ring)) {
			/* We are already paused in the INIT state. */
			ring->vr_state_flags &= ~VRSF_REQ_PAUSE;
		}
		if (vring_need_bail_ext(ring, true)) {
			goto ring_reset;
		}
	}

	ASSERT((ring->vr_state_flags & VRSF_REQ_START) != 0);
	ring->vr_state = VRS_RUN;
	ring->vr_state_flags &= ~VRSF_REQ_START;
	viona_ring_mark_dirty(ring);

	/* Ensure ring lease is valid first */
	if (vmm_drv_lease_expired(ring->vr_lease)) {
		if (!viona_ring_lease_renew(ring)) {
			goto ring_reset;
		}
	}

	/* Process actual work */
	if (ring == &link->l_vrings[VIONA_VQ_RX]) {
		viona_worker_rx(ring, link);
	} else if (ring == &link->l_vrings[VIONA_VQ_TX]) {
		viona_worker_tx(ring, link);
	} else {
		panic("unexpected ring: %p", (void *)ring);
	}

	VERIFY3U(ring->vr_state, ==, VRS_STOP);
	VERIFY3U(ring->vr_xfer_outstanding, ==, 0);

	/*
	 * Consolidate stats data so that it is not lost if/when this ring is
	 * being stopped.
	 */
	viona_ring_consolidate_stats(ring);

	/* Respond to a pause request if the ring is not required to stop */
	if (vring_pause_req(ring)) {
		ring->vr_state_flags &= ~VRSF_REQ_PAUSE;

		if (vring_need_bail_ext(ring, true)) {
			goto ring_reset;
		}

		/*
		 * To complete pausing of the ring, unmap and re-map the pages
		 * underpinning the virtqueue. This is to synchronize their
		 * dirty state in the backing page tables and restore the
		 * defer-dirty state on the held pages.
		 */
		viona_ring_unmap(ring);
		if (viona_ring_map(ring, true)) {
			goto ring_init;
		}

		/*
		 * If the ring pages failed to be mapped, fall through to
		 * ring-reset like any other failure.
		 */
	}

ring_reset:
	viona_ring_misc_free(ring);

	viona_ring_lease_drop(ring);
	ring->vr_cur_aidx = 0;
	ring->vr_size = 0;
	ring->vr_mask = 0;
	ring->vr_pa = 0;
	ring->vr_state = VRS_RESET;
	ring->vr_state_flags = 0;
	ring->vr_worker_thread = NULL;
	cv_broadcast(&ring->vr_cv);
	mutex_exit(&ring->vr_lock);

	mutex_enter(&ttoproc(curthread)->p_lock);
	lwp_exit();
}

static kthread_t *
viona_create_worker(viona_vring_t *ring)
{
	k_sigset_t hold_set;
	proc_t *p = curproc;
	kthread_t *t;
	klwp_t *lwp;

	ASSERT(MUTEX_HELD(&ring->vr_lock));
	ASSERT(ring->vr_state == VRS_RESET);

	sigfillset(&hold_set);
	lwp = lwp_create(viona_worker, (void *)ring, 0, p, TS_STOPPED,
	    minclsyspri - 1, &hold_set, curthread->t_cid, 0);
	if (lwp == NULL) {
		return (NULL);
	}

	t = lwptot(lwp);
	mutex_enter(&p->p_lock);
	t->t_proc_flag = (t->t_proc_flag & ~TP_HOLDLWP) | TP_KTHREAD;
	lwp_create_done(t);
	mutex_exit(&p->p_lock);

	return (t);
}

static inline void
vq_read_desc(viona_vring_t *ring, uint16_t idx, struct virtio_desc *descp)
{
	const uint_t entry_off = idx * sizeof (struct virtio_desc);

	ASSERT3U(idx, <, ring->vr_size);

	/*
	 * On both legacy and 1.x VirtIO, the virtqueue descriptors are
	 * required to be aligned to at least 16 bytes (4k for legacy).
	 */
	*descp = *(const struct virtio_desc *)viona_ring_addr(ring, entry_off);
}

static uint16_t
vq_read_avail(viona_vring_t *ring, uint16_t idx)
{
	ASSERT3U(idx, <, ring->vr_size);

	volatile uint16_t *avail_ent =
	    viona_ring_addr(ring, LEGACY_AVAIL_ENT_OFF(ring->vr_size, idx));
	return (*avail_ent);
}

/*
 * Given a buffer descriptor `desc`, attempt to map the pages backing that
 * region of guest physical memory, taking into account that there are no
 * guarantees about guest-contiguous pages being host-contiguous.
 */
static int
vq_map_desc_bufs(viona_vring_t *ring, const struct virtio_desc *desc,
    vq_held_region_t *region)
{
	if (desc->vd_len == 0) {
		VIONA_PROBE2(desc_bad_len, viona_vring_t *, ring,
		    uint32_t, desc->vd_len);
		VIONA_RING_STAT_INCR(ring, desc_bad_len);
		return (EINVAL);
	} else if ((region->vhr_len + desc->vd_len) < region->vhr_len) {
		VIONA_PROBE1(len_overflow, viona_vring_t *, ring);
		VIONA_RING_STAT_INCR(ring, len_overflow);
		return (EOVERFLOW);
	}

	int err = vq_region_hold(ring, desc->vd_addr, desc->vd_len,
	    (desc->vd_flags & VRING_DESC_F_WRITE) != 0, region);
	if (err == 0) {
		region->vhr_len += desc->vd_len;
	} else if (err == E2BIG) {
		VIONA_PROBE1(too_many_desc, viona_vring_t *, ring);
		VIONA_RING_STAT_INCR(ring, too_many_desc);
	} else if (err == EFAULT) {
		VIONA_PROBE_BAD_RING_ADDR(ring, desc->vd_addr);
		VIONA_RING_STAT_INCR(ring, bad_ring_addr);
	}

	return (err);
}

/*
 * Walk an indirect buffer descriptor `desc`, attempting to map the pages
 * backing the regions of guest memory covered by its constituent descriptors.
 */
static int
vq_map_indir_desc_bufs(viona_vring_t *ring, const struct virtio_desc *desc,
    vq_held_region_t *region)
{
	const uint16_t indir_count = desc->vd_len / sizeof (struct virtio_desc);

	if ((desc->vd_len & 0xf) != 0 || indir_count == 0 ||
	    indir_count > ring->vr_size ||
	    desc->vd_addr > (desc->vd_addr + desc->vd_len)) {
		VIONA_PROBE2(indir_bad_len, viona_vring_t *, ring,
		    uint32_t, desc->vd_len);
		VIONA_RING_STAT_INCR(ring, indir_bad_len);
		return (EINVAL);
	}

	uint16_t indir_next = 0;
	const uint8_t *buf = NULL;
	uint64_t buf_gpa = UINT64_MAX;
	vmm_page_t *vmp = NULL;
	int err = 0;

	for (;;) {
		const uint64_t indir_gpa =
		    desc->vd_addr + (indir_next * sizeof (struct virtio_desc));
		const uint64_t indir_page = indir_gpa & VQ_PGMASK;

		/*
		 * Get a mapping for the page that the next indirect descriptor
		 * resides in, if it has not already been done.
		 */
		if (indir_page != buf_gpa) {
			if (vmp != NULL) {
				vmm_drv_page_release(vmp);
			}
			vmp = vq_page_hold(ring, indir_page, false);
			if (vmp == NULL) {
				VIONA_PROBE_BAD_RING_ADDR(ring, indir_page);
				VIONA_RING_STAT_INCR(ring, bad_ring_addr);
				err = EFAULT;
				break;
			}
			buf_gpa = indir_page;
			buf = vmm_drv_page_readable(vmp);
		}

		/*
		 * A copy of the indirect descriptor is made here, rather than
		 * simply using a reference pointer. This prevents malicious
		 * or erroneous guest writes to the descriptor from fooling the
		 * flags/bounds verification through a race.
		 *
		 * While indirect descriptors do not have the same alignment
		 * requirements as those residing in the virtqueue itself, we
		 * are not concerned about unaligned access while viona remains
		 * x86-only.
		 */
		struct virtio_desc vp = *(const struct virtio_desc *)
		    (buf + (indir_gpa - indir_page));

		if (vp.vd_flags & VRING_DESC_F_INDIRECT) {
			VIONA_PROBE1(indir_bad_nest, viona_vring_t *, ring);
			VIONA_RING_STAT_INCR(ring, indir_bad_nest);
			err = EINVAL;
			break;
		} else if (vp.vd_len == 0) {
			VIONA_PROBE2(desc_bad_len, viona_vring_t *, ring,
			    uint32_t, vp.vd_len);
			VIONA_RING_STAT_INCR(ring, desc_bad_len);
			err = EINVAL;
			break;
		}

		err = vq_map_desc_bufs(ring, &vp, region);
		if (err != 0) {
			break;
		}

		/* Successfully reached the end of the indir chain */
		if ((vp.vd_flags & VRING_DESC_F_NEXT) == 0) {
			break;
		}
		if (region->vhr_idx >= region->vhr_niov) {
			VIONA_PROBE1(too_many_desc, viona_vring_t *, ring);
			VIONA_RING_STAT_INCR(ring, too_many_desc);
			err = E2BIG;
			break;
		}

		indir_next = vp.vd_next;
		if (indir_next >= indir_count) {
			VIONA_PROBE3(indir_bad_next, viona_vring_t *, ring,
			    uint16_t, indir_next, uint16_t, indir_count);
			VIONA_RING_STAT_INCR(ring, indir_bad_next);
			err = EINVAL;
			break;
		}
	}

	if (vmp != NULL) {
		vmm_drv_page_release(vmp);
	}
	return (err);
}

int
vq_popchain(viona_vring_t *ring, struct iovec *iov, uint_t niov,
    uint16_t *cookie, vmm_page_t **chain, uint32_t *len)
{
	uint16_t ndesc, idx, head, next;
	struct virtio_desc vdir;
	vq_held_region_t region = {
		.vhr_niov = niov,
		.vhr_iov = iov,
	};

	ASSERT(iov != NULL);
	ASSERT(niov > 0 && niov < INT_MAX);
	ASSERT(*chain == NULL);

	mutex_enter(&ring->vr_a_mutex);
	idx = ring->vr_cur_aidx;
	ndesc = viona_ring_num_avail(ring);

	if (ndesc == 0) {
		mutex_exit(&ring->vr_a_mutex);
		return (0);
	}
	if (ndesc > ring->vr_size) {
		/*
		 * Despite the fact that the guest has provided an 'avail_idx'
		 * which indicates that an impossible number of descriptors are
		 * available, continue on and attempt to process the next one.
		 *
		 * The transgression will not escape the probe or stats though.
		 */
		VIONA_PROBE2(ndesc_too_high, viona_vring_t *, ring,
		    uint16_t, ndesc);
		VIONA_RING_STAT_INCR(ring, ndesc_too_high);
	}

	head = vq_read_avail(ring, idx & ring->vr_mask);
	next = head;

	for (region.vhr_idx = 0; region.vhr_idx < niov; next = vdir.vd_next) {
		if (next >= ring->vr_size) {
			VIONA_PROBE2(bad_idx, viona_vring_t *, ring,
			    uint16_t, next);
			VIONA_RING_STAT_INCR(ring, bad_idx);
			break;
		}

		vq_read_desc(ring, next, &vdir);
		if ((vdir.vd_flags & VRING_DESC_F_INDIRECT) == 0) {
			if (vq_map_desc_bufs(ring, &vdir, &region) != 0) {
				break;
			}
		} else {
			/*
			 * Per the specification (Virtio 1.1 S2.6.5.3.1):
			 *   A driver MUST NOT set both VIRTQ_DESC_F_INDIRECT
			 *   and VIRTQ_DESC_F_NEXT in `flags`.
			 */
			if ((vdir.vd_flags & VRING_DESC_F_NEXT) != 0) {
				VIONA_PROBE3(indir_bad_next,
				    viona_vring_t *, ring,
				    uint16_t, next, uint16_t, 0);
				VIONA_RING_STAT_INCR(ring, indir_bad_next);
				break;
			}

			if (vq_map_indir_desc_bufs(ring, &vdir, &region) != 0) {
				break;
			}
		}

		if ((vdir.vd_flags & VRING_DESC_F_NEXT) == 0) {
			ring->vr_cur_aidx++;
			mutex_exit(&ring->vr_a_mutex);

			*cookie = head;
			*chain = region.vhr_head;
			if (len != NULL) {
				*len = region.vhr_len;
			}
			return (region.vhr_idx);
		}
	}

	mutex_exit(&ring->vr_a_mutex);
	if (region.vhr_head != NULL) {
		/*
		 * If any pages were held prior to encountering an error, we
		 * must release them now.
		 */
		vmm_drv_page_release_chain(region.vhr_head);
	}
	return (-1);
}


static void
vq_write_used_ent(viona_vring_t *ring, uint16_t idx, uint16_t cookie,
    uint32_t len)
{
	/*
	 * In a larger ring, an entry could be split across pages, so be sure
	 * to account for that when configuring the transfer by looking up the
	 * ID and length addresses separately, rather than an address for a
	 * combined `struct virtio_used`.
	 */
	const uint_t used_id_off = LEGACY_USED_ENT_OFF(ring->vr_size, idx);
	const uint_t used_len_off = used_id_off + sizeof (uint32_t);
	volatile uint32_t *idp = viona_ring_addr(ring, used_id_off);
	volatile uint32_t *lenp = viona_ring_addr(ring, used_len_off);

	ASSERT(MUTEX_HELD(&ring->vr_u_mutex));

	*idp = cookie;
	*lenp = len;
}

static void
vq_write_used_idx(viona_vring_t *ring, uint16_t idx)
{
	ASSERT(MUTEX_HELD(&ring->vr_u_mutex));

	volatile uint16_t *used_idx =
	    viona_ring_addr(ring, LEGACY_USED_IDX_OFF(ring->vr_size));
	*used_idx = idx;
}

void
vq_pushchain(viona_vring_t *ring, uint32_t len, uint16_t cookie)
{
	uint16_t uidx;

	mutex_enter(&ring->vr_u_mutex);

	uidx = ring->vr_cur_uidx;
	vq_write_used_ent(ring, uidx & ring->vr_mask, cookie, len);
	uidx++;
	membar_producer();

	vq_write_used_idx(ring, uidx);
	ring->vr_cur_uidx = uidx;

	mutex_exit(&ring->vr_u_mutex);
}

void
vq_pushchain_many(viona_vring_t *ring, uint_t num_bufs, used_elem_t *elem)
{
	uint16_t uidx;

	mutex_enter(&ring->vr_u_mutex);

	uidx = ring->vr_cur_uidx;

	for (uint_t i = 0; i < num_bufs; i++, uidx++) {
		vq_write_used_ent(ring, uidx & ring->vr_mask, elem[i].id,
		    elem[i].len);
	}

	membar_producer();
	vq_write_used_idx(ring, uidx);
	ring->vr_cur_uidx = uidx;

	mutex_exit(&ring->vr_u_mutex);
}

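/*
 * Illustrative sketch (editor's note) of the expected calling pattern for
 * vq_popchain()/vq_pushchain() above, loosely modeled on what a worker loop
 * might do; the buffer sizes and names here are hypothetical:
 *
 *	struct iovec iov[8];
 *	vmm_page_t *pages = NULL;
 *	uint16_t cookie;
 *	uint32_t len;
 *	int n = vq_popchain(ring, iov, 8, &cookie, &pages, &len);
 *
 *	if (n > 0) {
 *		// ... transfer data to/from the n populated iovec entries ...
 *		vmm_drv_page_release_chain(pages);
 *		vq_pushchain(ring, bytes_written, cookie);
 *	}
 *
 * The cookie returned by vq_popchain() identifies the head descriptor of the
 * chain and must be handed back via vq_pushchain() (or as the `id` of a
 * used_elem_t passed to vq_pushchain_many()) once the buffers are consumed.
 */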
/*
 * Set USED_NO_NOTIFY on VQ so guest elides doorbell calls for new entries.
 */
void
viona_ring_disable_notify(viona_vring_t *ring)
{
	volatile uint16_t *used_flags =
	    viona_ring_addr(ring, LEGACY_USED_FLAGS_OFF(ring->vr_size));

	*used_flags |= VRING_USED_F_NO_NOTIFY;
}

/*
 * Clear USED_NO_NOTIFY on VQ so guest resumes doorbell calls for new entries.
 */
void
viona_ring_enable_notify(viona_vring_t *ring)
{
	volatile uint16_t *used_flags =
	    viona_ring_addr(ring, LEGACY_USED_FLAGS_OFF(ring->vr_size));

	*used_flags &= ~VRING_USED_F_NO_NOTIFY;
}

/*
 * Return the number of available descriptors in the vring, taking care of the
 * 16-bit index wraparound.
 *
 * Note: If the number of apparently available descriptors is larger than the
 * ring size (due to guest misbehavior), this check will still report the
 * positive count of descriptors.
 */
uint16_t
viona_ring_num_avail(viona_vring_t *ring)
{
	volatile uint16_t *avail_idx =
	    viona_ring_addr(ring, LEGACY_AVAIL_IDX_OFF(ring->vr_size));

	return (*avail_idx - ring->vr_cur_aidx);
}

/* Record successfully transferred packet(s) for the ring stats */
void
viona_ring_stat_accept(viona_vring_t *ring, size_t count, size_t len)
{
	atomic_add_64(&ring->vr_stats.vts_packets, count);
	atomic_add_64(&ring->vr_stats.vts_bytes, len);
}

/*
 * Record dropped packet(s) in the ring stats
 */
void
viona_ring_stat_drop(viona_vring_t *ring, size_t count)
{
	atomic_add_64(&ring->vr_stats.vts_drops, count);
}

/*
 * Record a packet transfer error in the ring stats
 */
void
viona_ring_stat_error(viona_vring_t *ring)
{
	atomic_inc_64(&ring->vr_stats.vts_errors);
}

/*
 * Consolidate statistic data for this ring into the totals for the link
 */
static void
viona_ring_consolidate_stats(viona_vring_t *ring)
{
	viona_link_t *link = ring->vr_link;
	struct viona_transfer_stats *lstat =
	    (ring == &link->l_vrings[VIONA_VQ_RX]) ?
	    &link->l_stats.vls_rx : &link->l_stats.vls_tx;

	mutex_enter(&link->l_stats_lock);
	lstat->vts_packets += ring->vr_stats.vts_packets;
	lstat->vts_bytes += ring->vr_stats.vts_bytes;
	lstat->vts_drops += ring->vr_stats.vts_drops;
	lstat->vts_errors += ring->vr_stats.vts_errors;
	bzero(&ring->vr_stats, sizeof (ring->vr_stats));
	mutex_exit(&link->l_stats_lock);
}

/*
 * Copy `sz` bytes from iovecs contained in `iob` to `dst`.
 *
 * Returns `true` if the copy was successful (implying adequate data was
 * remaining in the iov_bunch_t).
 */
bool
iov_bunch_copy(iov_bunch_t *iob, void *dst, uint32_t sz)
{
	if (sz > iob->ib_remain) {
		return (false);
	}
	if (sz == 0) {
		return (true);
	}

	caddr_t dest = dst;
	do {
		struct iovec *iov = iob->ib_iov;

		ASSERT3U(iov->iov_len, <, UINT32_MAX);
		ASSERT3U(iov->iov_len, !=, 0);

		const uint32_t iov_avail = (iov->iov_len - iob->ib_offset);
		const uint32_t to_copy = MIN(sz, iov_avail);

		if (to_copy != 0) {
			bcopy((caddr_t)iov->iov_base + iob->ib_offset, dest,
			    to_copy);
		}

		sz -= to_copy;
		iob->ib_remain -= to_copy;
		dest += to_copy;
		iob->ib_offset += to_copy;

		ASSERT3U(iob->ib_offset, <=, iov->iov_len);

		if (iob->ib_offset == iov->iov_len) {
			iob->ib_iov++;
			iob->ib_offset = 0;
		}
	} while (sz > 0);

	return (true);
}

/*
 * Get the data pointer and length of the current head iovec, less any
 * offsetting from prior copy operations. This will advance the iov_bunch_t as
 * if the caller had performed a copy of that chunk length.
 *
 * Returns `true` if the iov_bunch_t had at least one iovec (unconsumed bytes)
 * remaining, setting `chunk` and `chunk_sz` to the chunk pointer and size,
 * respectively.
 */
bool
iov_bunch_next_chunk(iov_bunch_t *iob, caddr_t *chunk, uint32_t *chunk_sz)
{
	if (iob->ib_remain == 0) {
		*chunk = NULL;
		*chunk_sz = 0;
		return (false);
	}

	*chunk_sz = iob->ib_iov->iov_len - iob->ib_offset;
	*chunk = (caddr_t)iob->ib_iov->iov_base + iob->ib_offset;
	iob->ib_remain -= *chunk_sz;
	iob->ib_iov++;
	iob->ib_offset = 0;
	return (true);
}
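/*
 * Illustrative sketch (editor's note) of how the iov_bunch_t helpers above are
 * meant to be combined; the initialization and names shown are hypothetical:
 *
 *	iov_bunch_t iob = {
 *		.ib_iov = iov,		// iovecs from vq_popchain()
 *		.ib_remain = len,	// total bytes across those iovecs
 *		.ib_offset = 0,
 *	};
 *	struct virtio_net_hdr hdr;
 *
 *	// Pull a fixed-size header off the front of the chain ...
 *	if (!iov_bunch_copy(&iob, &hdr, sizeof (hdr))) {
 *		return (EINVAL);
 *	}
 *
 *	// ... then walk the remaining payload chunk by chunk.
 *	caddr_t chunk;
 *	uint32_t chunk_sz;
 *	while (iov_bunch_next_chunk(&iob, &chunk, &chunk_sz)) {
 *		consume(chunk, chunk_sz);
 *	}
 */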