/*
 * Copyright (c) 2013  Chris Torek <torek @ torek net>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source.  A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 *
 * Copyright 2015 Pluribus Networks Inc.
 * Copyright 2019 Joyent, Inc.
 * Copyright 2024 Oxide Computer Company
 */


#include <sys/types.h>
#include <sys/smt.h>
#include <sys/strsubr.h>

#include <sys/pattr.h>
#include <sys/dlpi.h>
#include <inet/ip.h>
#include <inet/ip_impl.h>

#include "viona_impl.h"

#define	BNXE_NIC_DRIVER		"bnxe"

/*
 * Tunable controlling whether TX copying is enabled by default.
 */
boolean_t viona_default_tx_copy = B_TRUE;

/*
 * Tunable for the maximum configured TX header padding.
 */
uint_t viona_max_header_pad = 256;

/*
 * Copy TX mbufs from the virtio ring so that packet transmission does not need
 * to complete before the associated ring resources can be freed.
 */
kmutex_t viona_force_copy_lock;
static enum viona_force_copy {
	VFC_UNINITALIZED	= 0,
	VFC_COPY_UNEEDED	= 1,
	VFC_COPY_REQUIRED	= 2,
} viona_force_copy_state = VFC_UNINITALIZED;

struct viona_desb {
	frtn_t		d_frtn;
	viona_vring_t	*d_ring;
	uint_t		d_ref;
	uint32_t	d_len;
	uint16_t	d_cookie;
	uchar_t		*d_headers;
	vmm_page_t	*d_pages;
};

static void viona_tx(viona_link_t *, viona_vring_t *);
static void viona_desb_release(viona_desb_t *);

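/*
 * Block until all outstanding TX transfers on this ring (loaned guest buffers
 * still held by the networking stack) have been released.
 */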
static void
viona_tx_wait_outstanding(viona_vring_t *ring)
{
	ASSERT(MUTEX_HELD(&ring->vr_lock));

	while (ring->vr_xfer_outstanding != 0) {
		/*
		 * Paying heed to signals is counterproductive here.  This is a
		 * very tight loop if pending transfers take an extended amount
		 * of time to be reclaimed while the host process is exiting.
		 */
		cv_wait(&ring->vr_cv, &ring->vr_lock);
	}
}

/*
 * Check if full TX packet copying is needed.  This should not be called from
 * viona attach()/detach() context.
 */
boolean_t
viona_tx_copy_needed(void)
{
	boolean_t result;

	if (viona_default_tx_copy) {
		return (B_TRUE);
	}

	mutex_enter(&viona_force_copy_lock);
	if (viona_force_copy_state == VFC_UNINITALIZED) {
		major_t bnxe_major;

		/*
		 * The original code for viona featured an explicit check for
		 * the bnxe driver which, when found present, necessitated that
		 * all transmissions be copied into their own mblks instead of
		 * passing guest memory to the underlying device.
		 *
		 * The motivations for this are unclear, but until it can be
		 * proven unnecessary, the check lives on.
		 */
		viona_force_copy_state = VFC_COPY_UNEEDED;
		if ((bnxe_major = ddi_name_to_major(BNXE_NIC_DRIVER))
		    != DDI_MAJOR_T_NONE) {
			if (ddi_hold_installed_driver(bnxe_major) != NULL) {
				viona_force_copy_state = VFC_COPY_REQUIRED;
				ddi_rele_driver(bnxe_major);
			}
		}
	}
	result = (viona_force_copy_state == VFC_COPY_REQUIRED);
	mutex_exit(&viona_force_copy_lock);

	return (result);
}

void
viona_tx_ring_alloc(viona_vring_t *ring, const uint16_t qsz)
{
	const viona_link_params_t *vlp = &ring->vr_link->l_params;

	ring->vr_tx.vrt_header_pad = vlp->vlp_tx_header_pad;
	/* Allocate desb handles for TX ring if packet copying not forced */
	if (!ring->vr_link->l_params.vlp_tx_copy_data) {
		viona_desb_t *dp =
		    kmem_zalloc(sizeof (viona_desb_t) * qsz, KM_SLEEP);
		ring->vr_tx.vrt_desb = dp;

		const size_t header_sz =
		    VIONA_MAX_HDRS_LEN + ring->vr_tx.vrt_header_pad;
		for (uint_t i = 0; i < qsz; i++, dp++) {
			dp->d_frtn.free_func = viona_desb_release;
			dp->d_frtn.free_arg = (void *)dp;
			dp->d_ring = ring;
			dp->d_headers = kmem_zalloc(header_sz, KM_SLEEP);
		}
	}

	/* Allocate ring-sized iovec buffers for TX */
	ring->vr_tx.vrt_iov = kmem_alloc(sizeof (struct iovec) * qsz, KM_SLEEP);
	ring->vr_tx.vrt_iov_cnt = qsz;
}

void
viona_tx_ring_free(viona_vring_t *ring, const uint16_t qsz)
{
	if (ring->vr_tx.vrt_desb != NULL) {
		viona_desb_t *dp = ring->vr_tx.vrt_desb;

		const size_t header_sz =
		    VIONA_MAX_HDRS_LEN + ring->vr_tx.vrt_header_pad;
		for (uint_t i = 0; i < qsz; i++, dp++) {
			kmem_free(dp->d_headers, header_sz);
		}
		kmem_free(ring->vr_tx.vrt_desb, sizeof (viona_desb_t) * qsz);
		ring->vr_tx.vrt_desb = NULL;
	}

	if (ring->vr_tx.vrt_iov != NULL) {
		ASSERT3U(ring->vr_tx.vrt_iov_cnt, !=, 0);

		kmem_free(ring->vr_tx.vrt_iov,
		    sizeof (struct iovec) * ring->vr_tx.vrt_iov_cnt);
		ring->vr_tx.vrt_iov = NULL;
		ring->vr_tx.vrt_iov_cnt = 0;
	}
}

static void
viona_tx_done(viona_vring_t *ring, uint32_t len, uint16_t cookie)
{
	vq_pushchain(ring, len, cookie);

	membar_enter();
	viona_intr_ring(ring, B_FALSE);
}

#define	TX_BURST_THRESH	32

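/*
 * Worker loop for a TX ring: drain available descriptors in bursts via
 * viona_tx(), re-enable guest notifications once the ring runs dry, and then
 * wait until the guest posts more work, the ring lease must be renewed, or the
 * ring is being stopped.
 */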
void
viona_worker_tx(viona_vring_t *ring, viona_link_t *link)
{
	(void) thread_vsetname(curthread, "viona_tx_%p", ring);

	ASSERT(MUTEX_HELD(&ring->vr_lock));
	ASSERT3U(ring->vr_state, ==, VRS_RUN);

	mutex_exit(&ring->vr_lock);

	for (;;) {
		uint_t ntx = 0, burst = 0;

		viona_ring_disable_notify(ring);
		while (viona_ring_num_avail(ring) != 0) {
			viona_tx(link, ring);
			ntx++;
			burst++;

			/*
			 * It is advantageous for throughput to keep this
			 * transmission loop tight, but periodic breaks to
			 * check for other events are of value too.
			 */
			if (burst >= TX_BURST_THRESH) {
				mutex_enter(&ring->vr_lock);
				const bool need_bail = vring_need_bail(ring);
				mutex_exit(&ring->vr_lock);

				if (need_bail) {
					break;
				}
				burst = 0;
			}
		}

		VIONA_PROBE2(tx, viona_link_t *, link, uint_t, ntx);

		/*
		 * Check for available descriptors on the ring once more in
		 * case a late addition raced with the NO_NOTIFY flag toggle.
		 *
		 * The barrier ensures that visibility of the no-notify
		 * store does not cross the viona_ring_num_avail() check below.
		 */
		viona_ring_enable_notify(ring);
		membar_enter();

		if (viona_ring_num_avail(ring) == 0 &&
		    (link->l_features & VIRTIO_F_RING_NOTIFY_ON_EMPTY) != 0) {
			/*
			 * The NOTIFY_ON_EMPTY interrupt should not pay heed to
			 * the presence of AVAIL_NO_INTERRUPT.
			 */
			viona_intr_ring(ring, B_TRUE);
		}

		mutex_enter(&ring->vr_lock);
		for (;;) {
			if (vring_need_bail(ring)) {
				ring->vr_state = VRS_STOP;
				viona_tx_wait_outstanding(ring);
				return;
			}

			if (vmm_drv_lease_expired(ring->vr_lease)) {
				ring->vr_state_flags |= VRSF_RENEW;
				/*
				 * When renewing the lease for the ring, no TX
				 * frames may be outstanding, as they contain
				 * references to guest memory.
				 */
				viona_tx_wait_outstanding(ring);

				const boolean_t renewed =
				    viona_ring_lease_renew(ring);
				ring->vr_state_flags &= ~VRSF_RENEW;

				if (!renewed) {
					/* stop ring on failed renewal */
					ring->vr_state = VRS_STOP;
					return;
				}
			}

			if (viona_ring_num_avail(ring) != 0) {
				break;
			}

			/* Wait for further activity on the ring */
			(void) cv_wait_sig(&ring->vr_cv, &ring->vr_lock);
		}
		mutex_exit(&ring->vr_lock);
	}
	/* UNREACHABLE */
}

static void
viona_desb_release(viona_desb_t *dp)
{
	viona_vring_t *ring = dp->d_ring;
	uint_t ref;
	uint32_t len;
	uint16_t cookie;

	ref = atomic_dec_uint_nv(&dp->d_ref);
	if (ref > 1) {
		return;
	}

	/*
	 * The desb corresponding to this index must be ready for reuse before
	 * the descriptor is returned to the guest via the 'used' ring.
	 */
	len = dp->d_len;
	cookie = dp->d_cookie;
	dp->d_len = 0;
	dp->d_cookie = 0;
	vmm_drv_page_release_chain(dp->d_pages);
	dp->d_pages = NULL;

	/*
	 * Ensure all other changes to the desb are visible prior to zeroing its
	 * refcount, signifying its readiness for reuse.
	 */
	membar_exit();
	dp->d_ref = 0;

	viona_tx_done(ring, len, cookie);

	mutex_enter(&ring->vr_lock);
	if ((--ring->vr_xfer_outstanding) == 0) {
		cv_broadcast(&ring->vr_cv);
	}
	mutex_exit(&ring->vr_lock);
}

/*
 * Confirm that the requested checksum operation acts within the bounds of the
 * provided packet, and that the checksum itself will be stored in the "copied
 * headers" portion of said packet.
 */
static boolean_t
viona_tx_csum_req_valid(const struct virtio_net_mrgrxhdr *hdr,
    const mac_ether_offload_info_t *meoi, uint_t copied_len)
{
	const uint_t csum_off = hdr->vrh_csum_offset + hdr->vrh_csum_start;

	if (hdr->vrh_csum_start >= meoi->meoi_len ||
	    hdr->vrh_csum_start < meoi->meoi_l2hlen ||
	    csum_off >= meoi->meoi_len ||
	    (csum_off + sizeof (uint16_t)) > copied_len) {
		return (B_FALSE);
	}

	return (B_TRUE);
}

/*
 * Configure mblk to request full checksum offloading, given the virtio and
 * meoi details provided.
 */
static void
viona_tx_hcksum_full(mblk_t *mp, const struct virtio_net_mrgrxhdr *hdr,
    const mac_ether_offload_info_t *meoi, uint32_t added_flags)
{
	/*
	 * Out of caution, zero the checksum field in case any driver and/or
	 * device would erroneously use it in the sum calculation.
	 */
	uint16_t *csump = (uint16_t *)
	    (mp->b_rptr + hdr->vrh_csum_start + hdr->vrh_csum_offset);
	*csump = 0;

	mac_hcksum_set(mp, 0, 0, 0, 0, HCK_FULLCKSUM | added_flags);
}

/*
 * Configure mblk to request partial checksum offloading, given the virtio and
 * meoi details provided.
 */
static void
viona_tx_hcksum_partial(mblk_t *mp, const struct virtio_net_mrgrxhdr *hdr,
    const mac_ether_offload_info_t *meoi, uint32_t added_flags)
{
	/*
	 * MAC expects these offsets to be relative to the start of the L3
	 * header rather than the L2 frame.
	 */
	mac_hcksum_set(mp,
	    hdr->vrh_csum_start - meoi->meoi_l2hlen,
	    hdr->vrh_csum_start + hdr->vrh_csum_offset - meoi->meoi_l2hlen,
	    meoi->meoi_len - meoi->meoi_l2hlen,
	    0, HCK_PARTIALCKSUM | added_flags);
}

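/*
 * Translate the guest's virtio checksum/LSO request into MAC offload metadata
 * on the mblk, choosing between full and partial hardware checksumming based
 * on the capabilities advertised by the underlying link.  Returns B_FALSE if
 * the request could not be accommodated.
 */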
static boolean_t
viona_tx_offloads(viona_vring_t *ring, const struct virtio_net_mrgrxhdr *hdr,
    const mac_ether_offload_info_t *meoi, mblk_t *mp, uint32_t len)
{
	viona_link_t *link = ring->vr_link;
	const uint32_t cap_csum = link->l_cap_csum;

	/*
	 * Since viona is a "legacy device", the data stored by the driver will
	 * be in the guest's native endian format (see sections 2.4.3 and
	 * 5.1.6.1 of the VIRTIO 1.0 spec for more info).  At this time the only
	 * guests using viona are x86 and we can assume little-endian.
	 */
	const uint16_t gso_size = LE_16(hdr->vrh_gso_size);

	if (!viona_tx_csum_req_valid(hdr, meoi, MBLKL(mp))) {
		VIONA_PROBE2(fail_hcksum, viona_link_t *, link, mblk_t *, mp);
		VIONA_RING_STAT_INCR(ring, fail_hcksum);
		return (B_FALSE);
	}

	const uint16_t ftype = meoi->meoi_l3proto;
	const uint8_t ipproto = meoi->meoi_l4proto;
	if (ftype != ETHERTYPE_IP && ftype != ETHERTYPE_IPV6) {
		/* Ignore checksum offload requests for non-IP protocols. */
		VIONA_PROBE2(fail_hcksum_proto, viona_link_t *, link,
		    mblk_t *, mp);
		VIONA_RING_STAT_INCR(ring, fail_hcksum_proto);
		return (B_FALSE);
	}

	/* Configure TCPv4 LSO when requested */
	if ((hdr->vrh_gso_type & VIRTIO_NET_HDR_GSO_TCPV4) != 0 &&
	    ftype == ETHERTYPE_IP) {
		if ((link->l_features & VIRTIO_NET_F_HOST_TSO4) == 0) {
			VIONA_PROBE2(tx_gso_fail, viona_link_t *, link,
			    mblk_t *, mp);
			VIONA_RING_STAT_INCR(ring, tx_gso_fail);
			return (B_FALSE);
		}

		lso_info_set(mp, gso_size, HW_LSO);

		/*
		 * We should have already verified that an adequate form of
		 * hardware checksum offload is present for TSOv4
		 */
		ASSERT3U(cap_csum &
		    (HCKSUM_INET_PARTIAL | HCKSUM_INET_FULL_V4), !=, 0);

		if ((cap_csum & HCKSUM_INET_FULL_V4) != 0) {
			viona_tx_hcksum_full(mp, hdr, meoi, HW_LSO);
		} else if ((cap_csum & HCKSUM_INET_PARTIAL) != 0) {
			/*
			 * Our native IP stack doesn't set the L4 length field
			 * of the pseudo header when LSO is in play.  Other IP
			 * stacks, e.g. Linux, do include the length field.
			 * This is a problem because the hardware expects that
			 * the length field is not set.  When it is set, it will
			 * cause an incorrect TCP checksum to be generated.
			 * Linux avoids this issue by correcting the
			 * pseudo-header checksum in the driver code.
			 *
			 * In order to get the correct HW checksum we need to
			 * assume the guest's IP stack gave us a bogus TCP
			 * partial checksum and calculate it ourselves.
			 */
			ipha_t *ipha =
			    (ipha_t *)(mp->b_rptr + meoi->meoi_l2hlen);
			uint16_t *cksump =
			    IPH_TCPH_CHECKSUMP(ipha, IPH_HDR_LENGTH(ipha));

			uint32_t cksum = IP_TCP_CSUM_COMP;
			const ipaddr_t src = ipha->ipha_src;
			const ipaddr_t dst = ipha->ipha_dst;
			cksum += (dst >> 16) + (dst & 0xffff) +
			    (src >> 16) + (src & 0xffff);
			cksum = (cksum & 0xffff) + (cksum >> 16);
			*cksump = (cksum & 0xffff) + (cksum >> 16);

			/*
			 * NICs such as ixgbe require that ipv4 checksum offload
			 * also be enabled when performing LSO.
			 */
			uint32_t v4csum = 0;
			if ((cap_csum & HCKSUM_IPHDRCKSUM) != 0) {
				v4csum = HCK_IPV4_HDRCKSUM;
				ipha->ipha_hdr_checksum = 0;
			}

			viona_tx_hcksum_partial(mp, hdr, meoi, HW_LSO | v4csum);
		} else {
			/*
			 * This should be unreachable: We do not permit LSO
			 * without adequate checksum offload capability.
			 */
			VIONA_PROBE2(tx_gso_fail, viona_link_t *, link,
			    mblk_t *, mp);
			VIONA_RING_STAT_INCR(ring, tx_gso_fail);
			return (B_FALSE);
		}

		return (B_TRUE);
	}

	/*
	 * Partial checksum support from the NIC is ideal, since it most closely
	 * maps to the interface defined by virtio.
	 */
	if ((cap_csum & HCKSUM_INET_PARTIAL) != 0 &&
	    (ipproto == IPPROTO_TCP || ipproto == IPPROTO_UDP)) {
		viona_tx_hcksum_partial(mp, hdr, meoi, 0);
		return (B_TRUE);
	}

	/*
	 * Without partial checksum support, look to the L3/L4 protocol
	 * information to see if the NIC can handle it.  If not, the checksum
	 * will need to be calculated inline.
	 */
	if (ftype == ETHERTYPE_IP) {
		if ((cap_csum & HCKSUM_INET_FULL_V4) != 0 &&
		    (ipproto == IPPROTO_TCP || ipproto == IPPROTO_UDP)) {
			viona_tx_hcksum_full(mp, hdr, meoi, 0);
			return (B_TRUE);
		}

		/* XXX: Implement manual fallback checksumming? */
		VIONA_PROBE2(fail_hcksum, viona_link_t *, link, mblk_t *, mp);
		VIONA_RING_STAT_INCR(ring, fail_hcksum);
		return (B_FALSE);
	} else if (ftype == ETHERTYPE_IPV6) {
		if ((cap_csum & HCKSUM_INET_FULL_V6) != 0 &&
		    (ipproto == IPPROTO_TCP || ipproto == IPPROTO_UDP)) {
			viona_tx_hcksum_full(mp, hdr, meoi, 0);
			return (B_TRUE);
		}

		/* XXX: Implement manual fallback checksumming? */
		VIONA_PROBE2(fail_hcksum6, viona_link_t *, link, mblk_t *, mp);
		VIONA_RING_STAT_INCR(ring, fail_hcksum6);
		return (B_FALSE);
	}

	/*
	 * Note the failure for unrecognized protocols, but soldier on to make
	 * our best effort at getting the frame out the door.
	 */
	VIONA_PROBE2(fail_hcksum_proto, viona_link_t *, link, mblk_t *, mp);
	VIONA_RING_STAT_INCR(ring, fail_hcksum_proto);
	return (B_FALSE);
}

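/*
 * Allocate the leading mblk for an outbound packet.  When guest buffers are
 * being loaned, this is a desballoc()'d block backed by the ring's
 * preallocated per-descriptor header storage; otherwise it is a plain allocb()
 * sized to hold the entire copied packet (plus any configured header padding).
 */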
static mblk_t *
viona_tx_alloc_headers(viona_vring_t *ring, uint16_t cookie, viona_desb_t **dpp,
    uint32_t len)
{
	ASSERT3P(*dpp, ==, NULL);

	mblk_t *mp = NULL;
	const size_t header_pad = ring->vr_tx.vrt_header_pad;

	if (ring->vr_tx.vrt_desb != NULL) {
		viona_desb_t *dp = &ring->vr_tx.vrt_desb[cookie];
		const size_t header_sz = VIONA_MAX_HDRS_LEN + header_pad;

		/*
		 * If the guest driver is operating properly, each desb slot
		 * should be available for use when processing a TX descriptor
		 * from the 'avail' ring.  In the case of drivers that reuse a
		 * descriptor before it has been posted to the 'used' ring, the
		 * data is simply dropped.
		 */
		if (atomic_cas_uint(&dp->d_ref, 0, 1) != 0) {
			return (NULL);
		}

		dp->d_cookie = cookie;
		mp = desballoc(dp->d_headers, header_sz, 0, &dp->d_frtn);

		if (mp != NULL) {
			/*
			 * Account for the successful desballoc, and communicate
			 * out the desb handle for subsequent use
			 */
			dp->d_ref++;
			*dpp = dp;
		} else {
			/* Reset the desb back to its "available" state */
			dp->d_ref = 0;
		}
	} else {
		/*
		 * If we are going to be copying the entire packet, we might as
		 * well allocate for it all in one go.
		 */
		mp = allocb(len + header_pad, 0);
	}

	/* Push pointers forward to account for requested header padding */
	if (mp != NULL && header_pad != 0) {
		mp->b_rptr = mp->b_wptr = (DB_BASE(mp) + header_pad);
	}

	return (mp);
}

static boolean_t
viona_tx_copy_headers(viona_vring_t *ring, iov_bunch_t *iob, mblk_t *mp,
    mac_ether_offload_info_t *meoi)
{
	ASSERT(mp->b_cont == NULL);

	if (ring->vr_tx.vrt_desb == NULL) {
		/*
		 * If not using guest data loaning through the desb, then we
		 * expect viona_tx_alloc_headers() to have allocated space for
		 * the entire packet, which we should copy now.
		 */
		const uint32_t pkt_size = iob->ib_remain;

		VERIFY(MBLKTAIL(mp) >= pkt_size);
		VERIFY(iov_bunch_copy(iob, mp->b_wptr, pkt_size));
		mp->b_wptr += pkt_size;
		(void) mac_ether_offload_info(mp, meoi);
		return (B_TRUE);
	}

	/*
	 * We want to maximize the amount of guest data we loan when performing
	 * packet transmission, with the caveat that we must copy the packet
	 * headers to prevent TOCTOU issues.
	 */
	const uint32_t copy_sz = MIN(iob->ib_remain, MBLKTAIL(mp));

	VERIFY(iov_bunch_copy(iob, mp->b_wptr, copy_sz));
	mp->b_wptr += copy_sz;

	if (iob->ib_remain == 0) {
		(void) mac_ether_offload_info(mp, meoi);
		return (B_TRUE);
	}

	/*
	 * Attempt to confirm that our buffer contains at least the entire
	 * (L2-L4) packet headers.
	 */
	if (mac_ether_offload_info(mp, meoi) == 0) {
		const uint32_t full_hdr_sz =
		    meoi->meoi_l2hlen + meoi->meoi_l3hlen + meoi->meoi_l4hlen;

		if (copy_sz >= full_hdr_sz) {
			return (B_TRUE);
		}
	}

	/*
	 * Despite our best efforts, the full headers do not appear to be along
	 * for the ride yet.  Just allocate a buffer and copy the remainder of
	 * the packet.
	 */
	const uint32_t remain_sz = iob->ib_remain;
	mblk_t *remain_mp = allocb(remain_sz, 0);
	if (remain_mp == NULL) {
		return (B_FALSE);
	}
	VERIFY(iov_bunch_copy(iob, remain_mp->b_wptr, remain_sz));
	remain_mp->b_wptr += remain_sz;
	mp->b_cont = remain_mp;
	/* Refresh header info now that we have copied the rest */
	(void) mac_ether_offload_info(mp, meoi);

	return (B_TRUE);
}

static void
viona_tx(viona_link_t *link, viona_vring_t *ring)
{
	struct iovec *iov = ring->vr_tx.vrt_iov;
	const uint_t max_segs = ring->vr_tx.vrt_iov_cnt;
	uint16_t cookie;
	vmm_page_t *pages = NULL;
	uint32_t total_len;
	mblk_t *mp_head = NULL;
	viona_desb_t *dp = NULL;
	const boolean_t merge_enabled =
	    ((link->l_features & VIRTIO_NET_F_MRG_RXBUF) != 0);

	ASSERT(iov != NULL);

	const int n = vq_popchain(ring, iov, max_segs, &cookie, &pages,
	    &total_len);
	if (n == 0) {
		VIONA_PROBE1(tx_absent, viona_vring_t *, ring);
		VIONA_RING_STAT_INCR(ring, tx_absent);
		return;
	} else if (n < 0) {
		/*
		 * Any error encountered in vq_popchain has already resulted in
		 * specific probe and statistic handling.  Further action here
		 * is unnecessary.
		 */
		return;
	}

	/*
	 * Get set up to copy the VirtIO header from in front of the packet.
	 *
	 * With an eye toward supporting VirtIO 1.0 behavior in the future, we
	 * determine the size of the header based on the device state.  This
	 * goes a bit beyond the expectations of legacy VirtIO, where the first
	 * buffer must cover the header and nothing else.
	 */
	iov_bunch_t iob = {
		.ib_iov = iov,
		.ib_remain = total_len,
	};
	struct virtio_net_mrgrxhdr hdr;
	uint32_t vio_hdr_len = 0;
	if (merge_enabled) {
		/*
		 * Presence of the "num_bufs" member is determined by the
		 * merge-rxbuf feature on the device, despite the fact that we
		 * are in transmission context here.
		 */
		vio_hdr_len = sizeof (struct virtio_net_mrgrxhdr);
	} else {
		vio_hdr_len = sizeof (struct virtio_net_hdr);
		/*
		 * We ignore "num_bufs" from the guest anyways, but zero it out
		 * just in case.
		 */
		hdr.vrh_bufs = 0;
	}
	uint32_t pkt_len = 0;
	if (!iov_bunch_copy(&iob, &hdr, vio_hdr_len)) {
		goto drop_fail;
	}

	pkt_len = total_len - vio_hdr_len;
	if (pkt_len > VIONA_MAX_PACKET_SIZE ||
	    pkt_len < sizeof (struct ether_header)) {
		goto drop_fail;
	}

	mp_head = viona_tx_alloc_headers(ring, cookie, &dp, pkt_len);
	if (mp_head == NULL) {
		goto drop_fail;
	}

	/*
	 * Copy the packet headers (L2 through L4, if present) to prevent
	 * TOCTOU attacks in any subsequent consumers of that data.
	 */
	mac_ether_offload_info_t meoi = { 0 };
	if (!viona_tx_copy_headers(ring, &iob, mp_head, &meoi)) {
		goto drop_fail;
	}

	if (dp != NULL && iob.ib_remain != 0) {
		/*
		 * If this device is loaning guest memory, rather than copying
		 * the entire body of the packet, we may need to establish mblks
		 * for the remaining data-to-be-loaned after the header copy.
		 */
		uint32_t chunk_sz;
		caddr_t chunk;
		mblk_t *mp_tail = mp_head;

		/*
		 * Ensure that our view of the tail is accurate in the rare case
		 * that the header allocation/copying logic has already resulted
		 * in a chained mblk.
		 */
		while (mp_tail->b_cont != NULL) {
			mp_tail = mp_tail->b_cont;
		}

		while (iov_bunch_next_chunk(&iob, &chunk, &chunk_sz)) {
			mblk_t *mp = desballoc((uchar_t *)chunk, chunk_sz, 0,
			    &dp->d_frtn);
			if (mp == NULL) {
				goto drop_fail;
			}

			mp->b_wptr += chunk_sz;
			dp->d_ref++;
			mp_tail->b_cont = mp;
			mp_tail = mp;
		}
	} else {
		/* The copy-everything strategy should be done by now */
		VERIFY0(iob.ib_remain);
	}

	if (VNETHOOK_INTERESTED_OUT(link->l_neti)) {
		/*
		 * The hook consumer may elect to free the mblk_t and set
		 * our mblk_t ** to NULL.  When using a viona_desb_t
		 * (dp != NULL), we do not want the corresponding cleanup to
		 * occur during the viona_hook() call.  We instead want to
		 * reset and recycle dp for future use.  To prevent cleanup
		 * during the viona_hook() call, we take a ref on dp (if being
		 * used), and release it on success.  On failure, the
		 * freemsgchain() call will release all the refs taken earlier
		 * in viona_tx() (aside from the initial ref and the one we
		 * take), and drop_hook will reset dp for reuse.
		 */
		if (dp != NULL)
			dp->d_ref++;

		/*
		 * Pass &mp instead of &mp_head so we don't lose track of
		 * mp_head if the hook consumer (i.e. ipf) elects to free mp
		 * and set mp to NULL.
		 */
		mblk_t *mp = mp_head;
		if (viona_hook(link, ring, &mp, B_TRUE) != 0) {
			if (mp != NULL)
				freemsgchain(mp);
			goto drop_hook;
		}

		if (dp != NULL) {
			dp->d_ref--;

			/*
			 * It is possible that the hook(s) accepted the packet,
			 * but as part of its processing, it issued a pull-up
			 * which released all references to the desb.  In that
			 * case, go back to acting like the packet is entirely
			 * copied (which it is).
			 */
			if (dp->d_ref == 1) {
				dp->d_cookie = 0;
				dp->d_ref = 0;
				dp = NULL;
			}
		}
	}

	/*
	 * Translate request for offloaded checksumming.  If the guest sent an
	 * LSO packet then it must have also negotiated and requested partial
	 * checksum; therefore the LSO logic is contained within
	 * viona_tx_offloads().
	 */
	if ((link->l_features & VIRTIO_NET_F_CSUM) != 0 &&
	    (hdr.vrh_flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) != 0) {
		if (!viona_tx_offloads(ring, &hdr, &meoi, mp_head, pkt_len)) {
			/*
			 * If processing of any checksum offload request fails,
			 * we can still pass the packet on for transmission.
			 * Even with this best-effort behavior, which may in
			 * fact succeed in the end, we record it as an error.
			 */
			viona_ring_stat_error(ring);
		}
	}

	if (dp != NULL) {
		/*
		 * Record the info required to post this descriptor to the
		 * 'used' ring once its transmission has completed.
		 */
		dp->d_len = total_len;
		dp->d_pages = pages;
		mutex_enter(&ring->vr_lock);
		ring->vr_xfer_outstanding++;
		mutex_exit(&ring->vr_lock);
	} else {
		/*
		 * If the data was cloned out of the ring, the descriptors can
		 * be marked as 'used' now, rather than deferring that action
		 * until after successful packet transmission.
		 */
		vmm_drv_page_release_chain(pages);
		viona_tx_done(ring, total_len, cookie);
	}

	/*
	 * From viona's point of view, this is a successful transmit, even if
	 * something downstream decides to drop the packet.
	 */
	viona_ring_stat_accept(ring, pkt_len);

	/*
	 * We're potentially going deep into the networking layer; make sure the
	 * guest can't run concurrently.
	 */
	smt_begin_unsafe();
	/*
	 * Ignore, for now, any signal from MAC about whether the outgoing
	 * packet was dropped or not.
	 */
	(void) mac_tx(link->l_mch, mp_head, 0, MAC_DROP_ON_NO_DESC, NULL);
	smt_end_unsafe();
	return;

drop_fail:
	/*
	 * On the off chance that memory is not available via the desballoc or
	 * allocb calls, there are few options left besides failing and dropping
	 * the frame on the floor.
	 *
	 * First account for it in the error stats.
	 */
	viona_ring_stat_error(ring);

	if (dp != NULL) {
		/*
		 * Take an additional reference on the desb handle (if present)
		 * so any desballoc-sourced mblks can release their hold on it
		 * without the handle reaching its final state and executing
		 * its clean-up logic.
		 */
		dp->d_ref++;
	}

	/*
	 * Free any already-allocated blocks and sum up the total length of the
	 * dropped data to be released to the used ring.
	 */
	freemsgchain(mp_head);

drop_hook:
	if (dp != NULL) {
		VERIFY(dp->d_ref == 2);

		/* Clean up the desb handle, releasing the extra hold. */
		dp->d_len = 0;
		dp->d_cookie = 0;
		dp->d_ref = 0;
	}

	/* Count in the stats as a drop, rather than an error */
	viona_ring_stat_drop(ring);

	VIONA_PROBE3(tx_drop, viona_vring_t *, ring, uint32_t, pkt_len,
	    uint16_t, cookie);
	vmm_drv_page_release_chain(pages);
	viona_tx_done(ring, total_len, cookie);
}