/*
 * Copyright (c) 2013 Chris Torek <torek @ torek net>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source.  A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 *
 * Copyright 2015 Pluribus Networks Inc.
 * Copyright 2019 Joyent, Inc.
 * Copyright 2025 Oxide Computer Company
 */


#include <sys/types.h>
#include <sys/smt.h>
#include <sys/strsubr.h>

#include <sys/pattr.h>
#include <sys/dlpi.h>
#include <inet/ip.h>
#include <inet/ip_impl.h>

#include "viona_impl.h"

#define	BNXE_NIC_DRIVER		"bnxe"

/*
 * Tunable controls tx copy by default on or off
 */
boolean_t viona_default_tx_copy = B_TRUE;

/*
 * Tunable for maximum configured TX header padding.
 */
uint_t viona_max_header_pad = 256;

/*
 * copy tx mbufs from virtio ring to avoid necessitating a wait for packet
 * transmission to free resources.
 */
kmutex_t viona_force_copy_lock;
static enum viona_force_copy {
	VFC_UNINITALIZED	= 0,
	VFC_COPY_UNEEDED	= 1,
	VFC_COPY_REQUIRED	= 2,
} viona_force_copy_state = VFC_UNINITALIZED;

struct viona_desb {
	frtn_t			d_frtn;
	viona_vring_t		*d_ring;
	uint_t			d_ref;
	uint32_t		d_len;
	uint16_t		d_cookie;
	uchar_t			*d_headers;
	vmm_page_t		*d_pages;
};

static size_t viona_tx(viona_link_t *, viona_vring_t *);
static void viona_desb_release(viona_desb_t *);


static void
viona_tx_wait_outstanding(viona_vring_t *ring)
{
	ASSERT(MUTEX_HELD(&ring->vr_lock));

	while (ring->vr_xfer_outstanding != 0) {
		/*
		 * Paying heed to signals is counterproductive here.  This is a
		 * very tight loop if pending transfers take an extended amount
		 * of time to be reclaimed while the host process is exiting.
		 */
		cv_wait(&ring->vr_cv, &ring->vr_lock);
	}
}

/*
 * Check if full TX packet copying is needed.
 * This should not be called from viona attach()/detach() context.
 */
boolean_t
viona_tx_copy_needed(void)
{
	boolean_t result;

	if (viona_default_tx_copy) {
		return (B_TRUE);
	}

	mutex_enter(&viona_force_copy_lock);
	if (viona_force_copy_state == VFC_UNINITALIZED) {
		major_t bnxe_major;

		/*
		 * The original code for viona featured an explicit check for
		 * the bnxe driver which, when found present, necessitated that
		 * all transmissions be copied into their own mblks instead of
		 * passing guest memory to the underlying device.
		 *
		 * The motivations for this are unclear, but until it can be
		 * proven unnecessary, the check lives on.
		 */
		viona_force_copy_state = VFC_COPY_UNEEDED;
		if ((bnxe_major = ddi_name_to_major(BNXE_NIC_DRIVER))
		    != DDI_MAJOR_T_NONE) {
			if (ddi_hold_installed_driver(bnxe_major) != NULL) {
				viona_force_copy_state = VFC_COPY_REQUIRED;
				ddi_rele_driver(bnxe_major);
			}
		}
	}
	result = (viona_force_copy_state == VFC_COPY_REQUIRED);
	mutex_exit(&viona_force_copy_lock);

	return (result);
}

void
viona_tx_ring_alloc(viona_vring_t *ring, const uint16_t qsz)
{
	const viona_link_params_t *vlp = &ring->vr_link->l_params;

	ring->vr_tx.vrt_header_pad = vlp->vlp_tx_header_pad;
	/* Allocate desb handles for TX ring if packet copying not forced */
	if (!ring->vr_link->l_params.vlp_tx_copy_data) {
		viona_desb_t *dp =
		    kmem_zalloc(sizeof (viona_desb_t) * qsz, KM_SLEEP);
		ring->vr_tx.vrt_desb = dp;

		const size_t header_sz =
		    VIONA_MAX_HDRS_LEN + ring->vr_tx.vrt_header_pad;
		for (uint_t i = 0; i < qsz; i++, dp++) {
			dp->d_frtn.free_func = viona_desb_release;
			dp->d_frtn.free_arg = (void *)dp;
			dp->d_ring = ring;
			dp->d_headers = kmem_zalloc(header_sz, KM_SLEEP);
		}
	}

	/* Allocate ring-sized iovec buffers for TX */
	ring->vr_tx.vrt_iov = kmem_alloc(sizeof (struct iovec) * qsz, KM_SLEEP);
	ring->vr_tx.vrt_iov_cnt = qsz;
}

void
viona_tx_ring_free(viona_vring_t *ring, const uint16_t qsz)
{
	if (ring->vr_tx.vrt_desb != NULL) {
		viona_desb_t *dp = ring->vr_tx.vrt_desb;

		const size_t header_sz =
		    VIONA_MAX_HDRS_LEN + ring->vr_tx.vrt_header_pad;
		for (uint_t i = 0; i < qsz; i++, dp++) {
			kmem_free(dp->d_headers, header_sz);
		}
		kmem_free(ring->vr_tx.vrt_desb, sizeof (viona_desb_t) * qsz);
		ring->vr_tx.vrt_desb = NULL;
	}

	if (ring->vr_tx.vrt_iov != NULL) {
		ASSERT3U(ring->vr_tx.vrt_iov_cnt, !=, 0);

		kmem_free(ring->vr_tx.vrt_iov,
		    sizeof (struct iovec) * ring->vr_tx.vrt_iov_cnt);
		ring->vr_tx.vrt_iov = NULL;
		ring->vr_tx.vrt_iov_cnt = 0;
	}
}

static void
viona_tx_done(viona_vring_t *ring, uint32_t len, uint16_t cookie)
{
	vq_pushchain(ring, len, cookie);

	membar_enter();
	viona_intr_ring(ring, B_FALSE);
}

#define	TX_BURST_THRESH		32

void
viona_worker_tx(viona_vring_t *ring, viona_link_t *link)
{
	(void) thread_vsetname(curthread, "viona_tx_%p", ring);

	ASSERT(MUTEX_HELD(&ring->vr_lock));
	ASSERT3U(ring->vr_state, ==, VRS_RUN);

	mutex_exit(&ring->vr_lock);

	for (;;) {
		size_t cnt_tx = 0, size_tx = 0;
		uint_t burst = 0;

		viona_ring_disable_notify(ring);
		while (viona_ring_num_avail(ring) != 0) {
			const size_t size_sent = viona_tx(link, ring);
			if (size_sent != 0) {
				/* Account for successful transmissions */
				size_tx += size_sent;
				cnt_tx++;
			}
			burst++;

			/*
			 * It is advantageous for throughput to keep this
			 * transmission loop tight, but periodic breaks to
			 * check for other events are of value too.
			 */
			if (burst >= TX_BURST_THRESH) {
				mutex_enter(&ring->vr_lock);
				const bool need_bail = vring_need_bail(ring);
				mutex_exit(&ring->vr_lock);

				if (need_bail) {
					break;
				}
				burst = 0;
			}
		}

		VIONA_PROBE2(tx, viona_link_t *, link, size_t, cnt_tx);
		if (cnt_tx != 0) {
			viona_ring_stat_accept(ring, cnt_tx, size_tx);
		}

		/*
		 * Check for available descriptors on the ring once more in
		 * case a late addition raced with the NO_NOTIFY flag toggle.
		 *
		 * The barrier ensures that visibility of the no-notify
		 * store does not cross the viona_ring_num_avail() check below.
		 */
		viona_ring_enable_notify(ring);
		membar_enter();

		if (viona_ring_num_avail(ring) == 0 &&
		    (link->l_features & VIRTIO_F_RING_NOTIFY_ON_EMPTY) != 0) {
			/*
			 * The NOTIFY_ON_EMPTY interrupt should not pay heed to
			 * the presence of AVAIL_NO_INTERRUPT.
			 */
			viona_intr_ring(ring, B_TRUE);
		}

		mutex_enter(&ring->vr_lock);
		for (;;) {
			if (vring_need_bail(ring)) {
				ring->vr_state = VRS_STOP;
				viona_tx_wait_outstanding(ring);
				return;
			}

			if (vmm_drv_lease_expired(ring->vr_lease)) {
				ring->vr_state_flags |= VRSF_RENEW;
				/*
				 * When renewing the lease for the ring, no TX
				 * frames may be outstanding, as they contain
				 * references to guest memory.
				 */
				viona_tx_wait_outstanding(ring);

				const boolean_t renewed =
				    viona_ring_lease_renew(ring);
				ring->vr_state_flags &= ~VRSF_RENEW;

				if (!renewed) {
					/* stop ring on failed renewal */
					ring->vr_state = VRS_STOP;
					return;
				}
			}

			if (viona_ring_num_avail(ring) != 0) {
				break;
			}

			/* Wait for further activity on the ring */
			(void) cv_wait_sig(&ring->vr_cv, &ring->vr_lock);
		}
		mutex_exit(&ring->vr_lock);
	}
	/* UNREACHABLE */
}

static void
viona_desb_release(viona_desb_t *dp)
{
	viona_vring_t *ring = dp->d_ring;
	uint_t ref;
	uint32_t len;
	uint16_t cookie;

	ref = atomic_dec_uint_nv(&dp->d_ref);
	if (ref > 1) {
		return;
	}

	/*
	 * The desb corresponding to this index must be ready for reuse before
	 * the descriptor is returned to the guest via the 'used' ring.
	 */
	len = dp->d_len;
	cookie = dp->d_cookie;
	dp->d_len = 0;
	dp->d_cookie = 0;
	vmm_drv_page_release_chain(dp->d_pages);
	dp->d_pages = NULL;

	/*
	 * Ensure all other changes to the desb are visible prior to zeroing
	 * its refcount, signifying its readiness for reuse.
	 */
	membar_exit();
	dp->d_ref = 0;

	viona_tx_done(ring, len, cookie);

	mutex_enter(&ring->vr_lock);
	if ((--ring->vr_xfer_outstanding) == 0) {
		cv_broadcast(&ring->vr_cv);
	}
	mutex_exit(&ring->vr_lock);
}

/*
 * Confirm that the requested checksum operation acts within the bounds of the
 * provided packet, and that the checksum itself will be stored in the "copied
 * headers" portion of said packet.
 */
static boolean_t
viona_tx_csum_req_valid(const struct virtio_net_mrgrxhdr *hdr,
    const mac_ether_offload_info_t *meoi, uint_t copied_len)
{
	const uint_t csum_off = hdr->vrh_csum_offset + hdr->vrh_csum_start;

	if (hdr->vrh_csum_start >= meoi->meoi_len ||
	    hdr->vrh_csum_start < meoi->meoi_l2hlen ||
	    csum_off >= meoi->meoi_len ||
	    (csum_off + sizeof (uint16_t)) > copied_len) {
		return (B_FALSE);
	}

	return (B_TRUE);
}

/*
 * Configure mblk to request full checksum offloading, given the virtio and
 * meoi details provided.
 */
static void
viona_tx_hcksum_full(mblk_t *mp, const struct virtio_net_mrgrxhdr *hdr,
    const mac_ether_offload_info_t *meoi, uint32_t added_flags)
{
	/*
	 * Out of caution, zero the checksum field in case any driver and/or
	 * device would erroneously use it in the sum calculation.
	 */
	uint16_t *csump = (uint16_t *)
	    (mp->b_rptr + hdr->vrh_csum_start + hdr->vrh_csum_offset);
	*csump = 0;

	mac_hcksum_set(mp, 0, 0, 0, 0, HCK_FULLCKSUM | added_flags);
}

/*
 * Configure mblk to request partial checksum offloading, given the virtio and
 * meoi details provided.
 */
static void
viona_tx_hcksum_partial(mblk_t *mp, const struct virtio_net_mrgrxhdr *hdr,
    const mac_ether_offload_info_t *meoi, uint32_t added_flags)
{
	/*
	 * MAC expects these offsets to be relative to the start of the L3
	 * header rather than the L2 frame.
	 */
	mac_hcksum_set(mp,
	    hdr->vrh_csum_start - meoi->meoi_l2hlen,
	    hdr->vrh_csum_start + hdr->vrh_csum_offset - meoi->meoi_l2hlen,
	    meoi->meoi_len - meoi->meoi_l2hlen,
	    0, HCK_PARTIALCKSUM | added_flags);
}

static boolean_t
viona_tx_offloads(viona_vring_t *ring, const struct virtio_net_mrgrxhdr *hdr,
    const mac_ether_offload_info_t *meoi, mblk_t *mp, uint32_t len)
{
	viona_link_t *link = ring->vr_link;
	const uint32_t cap_csum = link->l_cap_csum;

	/*
	 * Since viona is a "legacy device", the data stored by the driver
	 * will be in the guest's native endian format (see sections 2.4.3 and
	 * 5.1.6.1 of the VIRTIO 1.0 spec for more info).  At this time the
	 * only guests using viona are x86 and we can assume little-endian.
	 */
	const uint16_t gso_size = LE_16(hdr->vrh_gso_size);

	if (!viona_tx_csum_req_valid(hdr, meoi, MBLKL(mp))) {
		VIONA_PROBE2(fail_hcksum, viona_link_t *, link, mblk_t *, mp);
		VIONA_RING_STAT_INCR(ring, fail_hcksum);
		return (B_FALSE);
	}

	const uint16_t ftype = meoi->meoi_l3proto;
	const uint8_t ipproto = meoi->meoi_l4proto;
	if (ftype != ETHERTYPE_IP && ftype != ETHERTYPE_IPV6) {
		/* Ignore checksum offload requests for non-IP protocols. */
		VIONA_PROBE2(fail_hcksum_proto, viona_link_t *, link,
		    mblk_t *, mp);
		VIONA_RING_STAT_INCR(ring, fail_hcksum_proto);
		return (B_FALSE);
	}

	/* Configure TCPv4 LSO when requested */
	if ((hdr->vrh_gso_type & VIRTIO_NET_HDR_GSO_TCPV4) != 0 &&
	    ftype == ETHERTYPE_IP) {
		if ((link->l_features & VIRTIO_NET_F_HOST_TSO4) == 0) {
			VIONA_PROBE2(tx_gso_fail, viona_link_t *, link,
			    mblk_t *, mp);
			VIONA_RING_STAT_INCR(ring, tx_gso_fail);
			return (B_FALSE);
		}

		lso_info_set(mp, gso_size, HW_LSO);

		/*
		 * We should have already verified that an adequate form of
		 * hardware checksum offload is present for TSOv4.
		 */
		ASSERT3U(cap_csum &
		    (HCKSUM_INET_PARTIAL | HCKSUM_INET_FULL_V4), !=, 0);

		if ((cap_csum & HCKSUM_INET_FULL_V4) != 0) {
			viona_tx_hcksum_full(mp, hdr, meoi, HW_LSO);
		} else if ((cap_csum & HCKSUM_INET_PARTIAL) != 0) {
			/*
			 * Our native IP stack doesn't set the L4 length field
			 * of the pseudo header when LSO is in play.  Other IP
			 * stacks, e.g. Linux, do include the length field.
			 * This is a problem because the hardware expects that
			 * the length field is not set.  When it is set, it
			 * will cause an incorrect TCP checksum to be
			 * generated.  Linux avoids this issue by correcting
			 * the pseudo-header checksum in the driver code.
			 *
			 * In order to get the correct HW checksum we need to
			 * assume the guest's IP stack gave us a bogus TCP
			 * partial checksum and calculate it ourselves.
			 */
			ipha_t *ipha =
			    (ipha_t *)(mp->b_rptr + meoi->meoi_l2hlen);
			uint16_t *cksump =
			    IPH_TCPH_CHECKSUMP(ipha, IPH_HDR_LENGTH(ipha));

			uint32_t cksum = IP_TCP_CSUM_COMP;
			const ipaddr_t src = ipha->ipha_src;
			const ipaddr_t dst = ipha->ipha_dst;
			cksum += (dst >> 16) + (dst & 0xffff) +
			    (src >> 16) + (src & 0xffff);
			cksum = (cksum & 0xffff) + (cksum >> 16);
			*cksump = (cksum & 0xffff) + (cksum >> 16);

			/*
			 * NICs such as ixgbe require that ipv4 checksum
			 * offload also be enabled when performing LSO.
			 */
			uint32_t v4csum = 0;
			if ((cap_csum & HCKSUM_IPHDRCKSUM) != 0) {
				v4csum = HCK_IPV4_HDRCKSUM;
				ipha->ipha_hdr_checksum = 0;
			}

			viona_tx_hcksum_partial(mp, hdr, meoi, HW_LSO | v4csum);
		} else {
			/*
			 * This should be unreachable: We do not permit LSO
			 * without adequate checksum offload capability.
			 */
			VIONA_PROBE2(tx_gso_fail, viona_link_t *, link,
			    mblk_t *, mp);
			VIONA_RING_STAT_INCR(ring, tx_gso_fail);
			return (B_FALSE);
		}

		return (B_TRUE);
	}

	/*
	 * Partial checksum support from the NIC is ideal, since it most
	 * closely maps to the interface defined by virtio.
	 */
	if ((cap_csum & HCKSUM_INET_PARTIAL) != 0 &&
	    (ipproto == IPPROTO_TCP || ipproto == IPPROTO_UDP)) {
		viona_tx_hcksum_partial(mp, hdr, meoi, 0);
		return (B_TRUE);
	}

	/*
	 * Without partial checksum support, look to the L3/L4 protocol
	 * information to see if the NIC can handle it.  If not, the checksum
	 * will need to be calculated inline.
	 */
	if (ftype == ETHERTYPE_IP) {
		if ((cap_csum & HCKSUM_INET_FULL_V4) != 0 &&
		    (ipproto == IPPROTO_TCP || ipproto == IPPROTO_UDP)) {
			viona_tx_hcksum_full(mp, hdr, meoi, 0);
			return (B_TRUE);
		}

		/* XXX: Implement manual fallback checksumming? */
		VIONA_PROBE2(fail_hcksum, viona_link_t *, link, mblk_t *, mp);
		VIONA_RING_STAT_INCR(ring, fail_hcksum);
		return (B_FALSE);
	} else if (ftype == ETHERTYPE_IPV6) {
		if ((cap_csum & HCKSUM_INET_FULL_V6) != 0 &&
		    (ipproto == IPPROTO_TCP || ipproto == IPPROTO_UDP)) {
			viona_tx_hcksum_full(mp, hdr, meoi, 0);
			return (B_TRUE);
		}

		/* XXX: Implement manual fallback checksumming? */
		VIONA_PROBE2(fail_hcksum6, viona_link_t *, link, mblk_t *, mp);
		VIONA_RING_STAT_INCR(ring, fail_hcksum6);
		return (B_FALSE);
	}

	/*
	 * Note the failure for unrecognized protocols, but soldier on to make
	 * our best effort at getting the frame out the door.
	 */
	VIONA_PROBE2(fail_hcksum_proto, viona_link_t *, link, mblk_t *, mp);
	VIONA_RING_STAT_INCR(ring, fail_hcksum_proto);
	return (B_FALSE);
}

static mblk_t *
viona_tx_alloc_headers(viona_vring_t *ring, uint16_t cookie,
    viona_desb_t **dpp, uint32_t len)
{
	ASSERT3P(*dpp, ==, NULL);

	mblk_t *mp = NULL;
	const size_t header_pad = ring->vr_tx.vrt_header_pad;

	if (ring->vr_tx.vrt_desb != NULL) {
		viona_desb_t *dp = &ring->vr_tx.vrt_desb[cookie];
		const size_t header_sz = VIONA_MAX_HDRS_LEN + header_pad;

		/*
		 * If the guest driver is operating properly, each desb slot
		 * should be available for use when processing a TX descriptor
		 * from the 'avail' ring.  In the case of drivers that reuse a
		 * descriptor before it has been posted to the 'used' ring, the
		 * data is simply dropped.
		 */
		if (atomic_cas_uint(&dp->d_ref, 0, 1) != 0) {
			return (NULL);
		}

		dp->d_cookie = cookie;
		mp = desballoc(dp->d_headers, header_sz, 0, &dp->d_frtn);

		if (mp != NULL) {
			/*
			 * Account for the successful desballoc, and
			 * communicate out the desb handle for subsequent use.
			 */
			dp->d_ref++;
			*dpp = dp;
		} else {
			/* Reset the desb back to its "available" state */
			dp->d_ref = 0;
		}
	} else {
		/*
		 * If we are going to be copying the entire packet, we might as
		 * well allocate for it all in one go.
		 */
		mp = allocb(len + header_pad, 0);
	}

	/* Push pointers forward to account for requested header padding */
	if (mp != NULL && header_pad != 0) {
		mp->b_rptr = mp->b_wptr = (DB_BASE(mp) + header_pad);
	}

	return (mp);
}

static boolean_t
viona_tx_copy_headers(viona_vring_t *ring, iov_bunch_t *iob, mblk_t *mp,
    mac_ether_offload_info_t *meoi)
{
	ASSERT(mp->b_cont == NULL);

	if (ring->vr_tx.vrt_desb == NULL) {
		/*
		 * If not using guest data loaning through the desb, then we
		 * expect viona_tx_alloc_headers() to have allocated space for
		 * the entire packet, which we should copy now.
		 */
		const uint32_t pkt_size = iob->ib_remain;

		VERIFY(MBLKTAIL(mp) >= pkt_size);
		VERIFY(iov_bunch_copy(iob, mp->b_wptr, pkt_size));
		mp->b_wptr += pkt_size;
		mac_ether_offload_info(mp, meoi);
		return (B_TRUE);
	}

	/*
	 * We want to maximize the amount of guest data we loan when performing
	 * packet transmission, with the caveat that we must copy the packet
	 * headers to prevent TOCTOU issues.
	 */
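	/*
	 * The copied header bytes land in the desb-backed buffer allocated by
	 * viona_tx_alloc_headers(), so subsequent parsing via
	 * mac_ether_offload_info() and any checksum/LSO decisions operate on
	 * a host-owned snapshot rather than on loaned guest memory.
	 */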
	const uint32_t copy_sz = MIN(iob->ib_remain, MBLKTAIL(mp));

	VERIFY(iov_bunch_copy(iob, mp->b_wptr, copy_sz));
	mp->b_wptr += copy_sz;

	if (iob->ib_remain == 0) {
		mac_ether_offload_info(mp, meoi);
		return (B_TRUE);
	}

	mac_ether_offload_info(mp, meoi);
	if ((meoi->meoi_flags & MEOI_L2INFO_SET) == 0) {
		/* If the L2 header cannot be parsed, give up now */
		return (B_FALSE);
	}
	if ((meoi->meoi_flags & MEOI_L4INFO_SET) != 0) {
		const uint32_t full_hdr_sz =
		    meoi->meoi_l2hlen + meoi->meoi_l3hlen + meoi->meoi_l4hlen;
		if (copy_sz >= full_hdr_sz) {
			/* All headers are already copied */
			return (B_TRUE);
		}
	}

	/*
	 * The full headers do not appear to be along for the ride yet, or the
	 * packet bears a protocol we do not handle.  Just allocate a buffer
	 * and copy the remainder of the packet.
	 */
	const uint32_t remain_sz = iob->ib_remain;
	mblk_t *remain_mp = allocb(remain_sz, 0);
	if (remain_mp == NULL) {
		return (B_FALSE);
	}
	VERIFY(iov_bunch_copy(iob, remain_mp->b_wptr, remain_sz));
	remain_mp->b_wptr += remain_sz;
	mp->b_cont = remain_mp;
	/* Refresh header info now that we have copied the rest */
	mac_ether_offload_info(mp, meoi);

	return (B_TRUE);
}

static size_t
viona_tx(viona_link_t *link, viona_vring_t *ring)
{
	struct iovec *iov = ring->vr_tx.vrt_iov;
	const uint_t max_segs = ring->vr_tx.vrt_iov_cnt;
	uint16_t cookie;
	vmm_page_t *pages = NULL;
	uint32_t total_len;
	mblk_t *mp_head = NULL;
	viona_desb_t *dp = NULL;
	const boolean_t merge_enabled =
	    ((link->l_features & VIRTIO_NET_F_MRG_RXBUF) != 0);

	ASSERT(iov != NULL);

	const int n = vq_popchain(ring, iov, max_segs, &cookie, &pages,
	    &total_len);
	if (n == 0) {
		VIONA_PROBE1(tx_absent, viona_vring_t *, ring);
		VIONA_RING_STAT_INCR(ring, tx_absent);
		return (0);
	} else if (n < 0) {
		/*
		 * Any error encountered in vq_popchain has already resulted
		 * in specific probe and statistic handling.  Further action
		 * here is unnecessary.
		 */
		return (0);
	}

	/*
	 * Get set up to copy the VirtIO header from in front of the packet.
	 *
	 * With an eye toward supporting VirtIO 1.0 behavior in the future, we
	 * determine the size of the header based on the device state.  This
	 * goes a bit beyond the expectations of legacy VirtIO, where the first
	 * buffer must cover the header and nothing else.
	 */
	iov_bunch_t iob = {
		.ib_iov = iov,
		.ib_remain = total_len,
	};
	struct virtio_net_mrgrxhdr hdr;
	uint32_t vio_hdr_len = 0;
	if (merge_enabled) {
		/*
		 * Presence of the "num_bufs" member is determined by the
		 * merge-rxbuf feature on the device, despite the fact that
		 * we are in transmission context here.
		 */
		vio_hdr_len = sizeof (struct virtio_net_mrgrxhdr);
	} else {
		vio_hdr_len = sizeof (struct virtio_net_hdr);
		/*
		 * We ignore "num_bufs" from the guest anyway, but zero it
		 * out just in case.
		 */
		hdr.vrh_bufs = 0;
	}
	const uint32_t pkt_len = total_len - vio_hdr_len;
	if (!iov_bunch_copy(&iob, &hdr, vio_hdr_len)) {
		goto drop_fail;
	}

	if (pkt_len > VIONA_MAX_PACKET_SIZE ||
	    pkt_len < sizeof (struct ether_header)) {
		goto drop_fail;
	}

	mp_head = viona_tx_alloc_headers(ring, cookie, &dp, pkt_len);
	if (mp_head == NULL) {
		goto drop_fail;
	}

	/*
	 * Copy the packet headers (L2 through L4, if present) to prevent
	 * TOCTOU attacks in any subsequent consumers of that data.
	 */
	mac_ether_offload_info_t meoi = { 0 };
	if (!viona_tx_copy_headers(ring, &iob, mp_head, &meoi)) {
		goto drop_fail;
	}

	if (dp != NULL && iob.ib_remain != 0) {
		/*
		 * If this device is loaning guest memory, rather than copying
		 * the entire body of the packet, we may need to establish
		 * mblks for the remaining data-to-be-loaned after the header
		 * copy.
		 */
		uint32_t chunk_sz;
		caddr_t chunk;
		mblk_t *mp_tail = mp_head;

		/*
		 * Ensure that our view of the tail is accurate in the rare
		 * case that the header allocation/copying logic has already
		 * resulted in a chained mblk.
		 */
		while (mp_tail->b_cont != NULL) {
			mp_tail = mp_tail->b_cont;
		}

		while (iov_bunch_next_chunk(&iob, &chunk, &chunk_sz)) {
			mblk_t *mp = desballoc((uchar_t *)chunk, chunk_sz, 0,
			    &dp->d_frtn);
			if (mp == NULL) {
				goto drop_fail;
			}

			mp->b_wptr += chunk_sz;
			dp->d_ref++;
			mp_tail->b_cont = mp;
			mp_tail = mp;
		}
	} else {
		/* The copy-everything strategy should be done by now */
		VERIFY0(iob.ib_remain);
	}

	if (VNETHOOK_INTERESTED_OUT(link->l_neti)) {
		/*
		 * The hook consumer may elect to free the mblk_t and set
		 * our mblk_t ** to NULL.  When using a viona_desb_t
		 * (dp != NULL), we do not want the corresponding cleanup to
		 * occur during the viona_hook() call.  We instead want to
		 * reset and recycle dp for future use.  To prevent cleanup
		 * during the viona_hook() call, we take a ref on dp (if being
		 * used), and release it on success.  On failure, the
		 * freemsgchain() call will release all the refs taken earlier
		 * in viona_tx() (aside from the initial ref and the one we
		 * take), and drop_hook will reset dp for reuse.
		 */
		if (dp != NULL)
			dp->d_ref++;

		/*
		 * Pass &mp instead of &mp_head so we don't lose track of
		 * mp_head if the hook consumer (i.e. ipf) elects to free mp
		 * and set mp to NULL.
		 */
		mblk_t *mp = mp_head;
		if (viona_hook(link, ring, &mp, B_TRUE) != 0) {
			if (mp != NULL)
				freemsgchain(mp);
			goto drop_hook;
		}

		if (dp != NULL) {
			dp->d_ref--;

			/*
			 * It is possible that the hook(s) accepted the packet,
			 * but as part of its processing, it issued a pull-up
			 * which released all references to the desb.  In that
			 * case, go back to acting like the packet is entirely
			 * copied (which it is).
			 */
			if (dp->d_ref == 1) {
				dp->d_cookie = 0;
				dp->d_ref = 0;
				dp = NULL;
			}
		}
	}

	/*
	 * Translate request for offloaded checksumming.  If the guest sent an
	 * LSO packet then it must have also negotiated and requested partial
	 * checksum; therefore the LSO logic is contained within
	 * viona_tx_offloads().
	 */
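	/*
	 * The virtio header was copied into the local 'hdr' above, so the
	 * flags and checksum offsets consulted here cannot be altered by the
	 * guest while the offload translation is underway.
	 */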
	if ((link->l_features & VIRTIO_NET_F_CSUM) != 0 &&
	    (hdr.vrh_flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) != 0) {
		if (!viona_tx_offloads(ring, &hdr, &meoi, mp_head, pkt_len)) {
			/*
			 * If processing of any checksum offload request
			 * fails, we can still pass the packet on for
			 * transmission.  Even with this best-effort behavior,
			 * which may in fact succeed in the end, we record it
			 * as an error.
			 */
			viona_ring_stat_error(ring);
		}
	}

	if (dp != NULL) {
		/*
		 * Record the info required to post this descriptor to the
		 * used ring once its transmission has completed.
		 */
		dp->d_len = total_len;
		dp->d_pages = pages;
		mutex_enter(&ring->vr_lock);
		ring->vr_xfer_outstanding++;
		mutex_exit(&ring->vr_lock);
	} else {
		/*
		 * If the data was cloned out of the ring, the descriptors can
		 * be marked as 'used' now, rather than deferring that action
		 * until after successful packet transmission.
		 */
		vmm_drv_page_release_chain(pages);
		viona_tx_done(ring, total_len, cookie);
	}

	/*
	 * From viona's point of view, this is a successful transmit, even if
	 * something downstream decides to drop the packet.
	 */
	VIONA_PROBE3(pkt__tx, viona_vring_t *, ring, mblk_t *, mp_head,
	    size_t, pkt_len);

	/*
	 * We're potentially going deep into the networking layer; make sure
	 * the guest can't run concurrently.
	 */
	smt_begin_unsafe();
	/*
	 * Ignore, for now, any signal from MAC about whether the outgoing
	 * packet was dropped or not.
	 */
	(void) mac_tx(link->l_mch, mp_head, 0, MAC_DROP_ON_NO_DESC, NULL);
	smt_end_unsafe();
	return (pkt_len);

drop_fail:
	/*
	 * On the off chance that memory is not available via the desballoc or
	 * allocb calls, there are few options left besides failing and
	 * dropping the frame on the floor.
	 *
	 * First account for it in the error stats.
	 */
	viona_ring_stat_error(ring);

	if (dp != NULL) {
		/*
		 * Take an additional reference on the desb handle (if present)
		 * so any desballoc-sourced mblks can release their hold on it
		 * without the handle reaching its final state and executing
		 * its clean-up logic.
		 */
		dp->d_ref++;
	}

	/*
	 * Free any already-allocated blocks and sum up the total length of the
	 * dropped data to be released to the used ring.
	 */
	freemsgchain(mp_head);

drop_hook:
	if (dp != NULL) {
		VERIFY(dp->d_ref == 2);

		/* Clean up the desb handle, releasing the extra hold. */
		dp->d_len = 0;
		dp->d_cookie = 0;
		dp->d_ref = 0;
	}

	/* Count in the stats as a drop, rather than an error */
	viona_ring_stat_drop(ring, 1);

	VIONA_PROBE3(tx_drop, viona_vring_t *, ring, uint32_t, pkt_len,
	    uint16_t, cookie);
	vmm_drv_page_release_chain(pages);
	viona_tx_done(ring, total_len, cookie);
	return (0);
}