1 /* 2 * Copyright (c) 2013 Chris Torek <torek @ torek net> 3 * All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 17 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 24 * SUCH DAMAGE. 25 */ 26 /* 27 * This file and its contents are supplied under the terms of the 28 * Common Development and Distribution License ("CDDL"), version 1.0. 29 * You may only use this file in accordance with the terms of version 30 * 1.0 of the CDDL. 31 * 32 * A full copy of the text of the CDDL should have accompanied this 33 * source. A copy of the CDDL is also available via the Internet at 34 * http://www.illumos.org/license/CDDL. 35 * 36 * Copyright 2015 Pluribus Networks Inc. 37 * Copyright 2019 Joyent, Inc. 38 * Copyright 2024 Oxide Computer Company 39 */ 40 41 42 #include <sys/types.h> 43 #include <sys/smt.h> 44 #include <sys/strsubr.h> 45 46 #include <sys/pattr.h> 47 #include <sys/dlpi.h> 48 #include <inet/ip.h> 49 #include <inet/ip_impl.h> 50 51 #include "viona_impl.h" 52 53 #define BNXE_NIC_DRIVER "bnxe" 54 55 /* 56 * Tunable controls tx copy by default on or off 57 */ 58 boolean_t viona_default_tx_copy = B_TRUE; 59 60 /* 61 * copy tx mbufs from virtio ring to avoid necessitating a wait for packet 62 * transmission to free resources. 63 */ 64 kmutex_t viona_force_copy_lock; 65 static enum viona_force_copy { 66 VFC_UNINITALIZED = 0, 67 VFC_COPY_UNEEDED = 1, 68 VFC_COPY_REQUIRED = 2, 69 } viona_force_copy_state = VFC_UNINITALIZED; 70 71 struct viona_desb { 72 frtn_t d_frtn; 73 viona_vring_t *d_ring; 74 uint_t d_ref; 75 uint32_t d_len; 76 uint16_t d_cookie; 77 uchar_t *d_headers; 78 vmm_page_t *d_pages; 79 }; 80 81 static void viona_tx(viona_link_t *, viona_vring_t *); 82 static void viona_desb_release(viona_desb_t *); 83 84 85 static void 86 viona_tx_wait_outstanding(viona_vring_t *ring) 87 { 88 ASSERT(MUTEX_HELD(&ring->vr_lock)); 89 90 while (ring->vr_xfer_outstanding != 0) { 91 /* 92 * Paying heed to signals is counterproductive here. This is a 93 * very tight loop if pending transfers take an extended amount 94 * of time to be reclaimed while the host process is exiting. 95 */ 96 cv_wait(&ring->vr_cv, &ring->vr_lock); 97 } 98 } 99 100 /* 101 * Check if full TX packet copying is needed. This should not be called from 102 * viona attach()/detach() context. 103 */ 104 static boolean_t 105 viona_tx_copy_needed(void) 106 { 107 boolean_t result; 108 109 if (viona_default_tx_copy) { 110 return (B_TRUE); 111 } 112 113 mutex_enter(&viona_force_copy_lock); 114 if (viona_force_copy_state == VFC_UNINITALIZED) { 115 major_t bnxe_major; 116 117 /* 118 * The original code for viona featured an explicit check for 119 * the bnxe driver which, when found present, necessitated that 120 * all transmissions be copied into their own mblks instead of 121 * passing guest memory to the underlying device. 122 * 123 * The motivations for this are unclear, but until it can be 124 * proven unnecessary, the check lives on. 125 */ 126 viona_force_copy_state = VFC_COPY_UNEEDED; 127 if ((bnxe_major = ddi_name_to_major(BNXE_NIC_DRIVER)) 128 != DDI_MAJOR_T_NONE) { 129 if (ddi_hold_installed_driver(bnxe_major) != NULL) { 130 viona_force_copy_state = VFC_COPY_REQUIRED; 131 ddi_rele_driver(bnxe_major); 132 } 133 } 134 } 135 result = (viona_force_copy_state == VFC_COPY_REQUIRED); 136 mutex_exit(&viona_force_copy_lock); 137 138 return (result); 139 } 140 141 void 142 viona_tx_ring_alloc(viona_vring_t *ring, const uint16_t qsz) 143 { 144 /* Allocate desb handles for TX ring if packet copying is disabled */ 145 if (!viona_tx_copy_needed()) { 146 viona_desb_t *dp; 147 148 dp = kmem_zalloc(sizeof (viona_desb_t) * qsz, KM_SLEEP); 149 ring->vr_txdesb = dp; 150 for (uint_t i = 0; i < qsz; i++, dp++) { 151 dp->d_frtn.free_func = viona_desb_release; 152 dp->d_frtn.free_arg = (void *)dp; 153 dp->d_ring = ring; 154 dp->d_headers = kmem_zalloc(VIONA_MAX_HDRS_LEN, 155 KM_SLEEP); 156 } 157 } 158 159 /* Allocate ring-sized iovec buffers for TX */ 160 ring->vr_txiov = kmem_alloc(sizeof (struct iovec) * qsz, KM_SLEEP); 161 } 162 163 void 164 viona_tx_ring_free(viona_vring_t *ring, const uint16_t qsz) 165 { 166 if (ring->vr_txdesb != NULL) { 167 viona_desb_t *dp = ring->vr_txdesb; 168 169 for (uint_t i = 0; i < qsz; i++, dp++) { 170 kmem_free(dp->d_headers, VIONA_MAX_HDRS_LEN); 171 } 172 kmem_free(ring->vr_txdesb, sizeof (viona_desb_t) * qsz); 173 ring->vr_txdesb = NULL; 174 } 175 176 if (ring->vr_txiov != NULL) { 177 kmem_free(ring->vr_txiov, sizeof (struct iovec) * qsz); 178 ring->vr_txiov = NULL; 179 } 180 } 181 182 static void 183 viona_tx_done(viona_vring_t *ring, uint32_t len, uint16_t cookie) 184 { 185 vq_pushchain(ring, len, cookie); 186 187 membar_enter(); 188 viona_intr_ring(ring, B_FALSE); 189 } 190 191 #define TX_BURST_THRESH 32 192 193 void 194 viona_worker_tx(viona_vring_t *ring, viona_link_t *link) 195 { 196 (void) thread_vsetname(curthread, "viona_tx_%p", ring); 197 198 ASSERT(MUTEX_HELD(&ring->vr_lock)); 199 ASSERT3U(ring->vr_state, ==, VRS_RUN); 200 201 mutex_exit(&ring->vr_lock); 202 203 for (;;) { 204 uint_t ntx = 0, burst = 0; 205 206 viona_ring_disable_notify(ring); 207 while (viona_ring_num_avail(ring) != 0) { 208 viona_tx(link, ring); 209 ntx++; 210 burst++; 211 212 /* 213 * It is advantageous for throughput to keep this 214 * transmission loop tight, but periodic breaks to 215 * check for other events are of value too. 216 */ 217 if (burst >= TX_BURST_THRESH) { 218 mutex_enter(&ring->vr_lock); 219 const bool need_bail = vring_need_bail(ring); 220 mutex_exit(&ring->vr_lock); 221 222 if (need_bail) { 223 break; 224 } 225 burst = 0; 226 } 227 } 228 229 VIONA_PROBE2(tx, viona_link_t *, link, uint_t, ntx); 230 231 /* 232 * Check for available descriptors on the ring once more in 233 * case a late addition raced with the NO_NOTIFY flag toggle. 234 * 235 * The barrier ensures that visibility of the no-notify 236 * store does not cross the viona_ring_num_avail() check below. 237 */ 238 viona_ring_enable_notify(ring); 239 membar_enter(); 240 241 if (viona_ring_num_avail(ring) == 0 && 242 (link->l_features & VIRTIO_F_RING_NOTIFY_ON_EMPTY) != 0) { 243 /* 244 * The NOTIFY_ON_EMPTY interrupt should not pay heed to 245 * the presence of AVAIL_NO_INTERRUPT. 246 */ 247 viona_intr_ring(ring, B_TRUE); 248 } 249 250 mutex_enter(&ring->vr_lock); 251 for (;;) { 252 if (vring_need_bail(ring)) { 253 ring->vr_state = VRS_STOP; 254 viona_tx_wait_outstanding(ring); 255 return; 256 } 257 258 if (vmm_drv_lease_expired(ring->vr_lease)) { 259 ring->vr_state_flags |= VRSF_RENEW; 260 /* 261 * When renewing the lease for the ring, no TX 262 * frames may be outstanding, as they contain 263 * references to guest memory. 264 */ 265 viona_tx_wait_outstanding(ring); 266 267 const boolean_t renewed = 268 viona_ring_lease_renew(ring); 269 ring->vr_state_flags &= ~VRSF_RENEW; 270 271 if (!renewed) { 272 /* stop ring on failed renewal */ 273 ring->vr_state = VRS_STOP; 274 return; 275 } 276 } 277 278 if (viona_ring_num_avail(ring) != 0) { 279 break; 280 } 281 282 /* Wait for further activity on the ring */ 283 (void) cv_wait_sig(&ring->vr_cv, &ring->vr_lock); 284 } 285 mutex_exit(&ring->vr_lock); 286 } 287 /* UNREACHABLE */ 288 } 289 290 static void 291 viona_desb_release(viona_desb_t *dp) 292 { 293 viona_vring_t *ring = dp->d_ring; 294 uint_t ref; 295 uint32_t len; 296 uint16_t cookie; 297 298 ref = atomic_dec_uint_nv(&dp->d_ref); 299 if (ref > 1) { 300 return; 301 } 302 303 /* 304 * The desb corresponding to this index must be ready for reuse before 305 * the descriptor is returned to the guest via the 'used' ring. 306 */ 307 len = dp->d_len; 308 cookie = dp->d_cookie; 309 dp->d_len = 0; 310 dp->d_cookie = 0; 311 vmm_drv_page_release_chain(dp->d_pages); 312 dp->d_pages = NULL; 313 314 /* 315 * Ensure all other changes to the desb are visible prior to zeroing its 316 * refcount, signifying its readiness for reuse. 317 */ 318 membar_exit(); 319 dp->d_ref = 0; 320 321 viona_tx_done(ring, len, cookie); 322 323 mutex_enter(&ring->vr_lock); 324 if ((--ring->vr_xfer_outstanding) == 0) { 325 cv_broadcast(&ring->vr_cv); 326 } 327 mutex_exit(&ring->vr_lock); 328 } 329 330 static boolean_t 331 viona_tx_csum(viona_vring_t *ring, const struct virtio_net_hdr *hdr, 332 mblk_t *mp, uint32_t len) 333 { 334 viona_link_t *link = ring->vr_link; 335 const struct ether_header *eth; 336 uint_t eth_len = sizeof (struct ether_header); 337 ushort_t ftype; 338 ipha_t *ipha = NULL; 339 uint8_t ipproto = IPPROTO_NONE; /* NONE is not exactly right, but ok */ 340 uint16_t flags = 0; 341 const uint_t csum_start = hdr->vrh_csum_start; 342 const uint_t csum_stuff = hdr->vrh_csum_offset + csum_start; 343 344 /* 345 * Validate that the checksum offsets provided by the guest are within 346 * the bounds of the packet. Additionally, ensure that the checksum 347 * contents field is within the headers mblk copied by viona_tx(). 348 */ 349 if (csum_start >= len || csum_start < eth_len || csum_stuff >= len || 350 (csum_stuff + sizeof (uint16_t)) > MBLKL(mp)) { 351 VIONA_PROBE2(fail_hcksum, viona_link_t *, link, mblk_t *, mp); 352 VIONA_RING_STAT_INCR(ring, fail_hcksum); 353 return (B_FALSE); 354 } 355 356 /* 357 * This is guaranteed to be safe thanks to the header copying 358 * done in viona_tx(). 359 */ 360 eth = (const struct ether_header *)mp->b_rptr; 361 ftype = ntohs(eth->ether_type); 362 363 if (ftype == ETHERTYPE_VLAN) { 364 const struct ether_vlan_header *veth; 365 366 /* punt on QinQ for now */ 367 eth_len = sizeof (struct ether_vlan_header); 368 veth = (const struct ether_vlan_header *)eth; 369 ftype = ntohs(veth->ether_type); 370 } 371 372 if (ftype == ETHERTYPE_IP) { 373 ipha = (ipha_t *)(mp->b_rptr + eth_len); 374 375 ipproto = ipha->ipha_protocol; 376 } else if (ftype == ETHERTYPE_IPV6) { 377 ip6_t *ip6h = (ip6_t *)(mp->b_rptr + eth_len); 378 379 ipproto = ip6h->ip6_nxt; 380 } 381 382 /* 383 * We ignore hdr_len because the spec says it can't be 384 * trusted. Besides, our own stack will determine the header 385 * boundary. 386 */ 387 if ((link->l_cap_csum & HCKSUM_INET_PARTIAL) != 0 && 388 (hdr->vrh_gso_type & VIRTIO_NET_HDR_GSO_TCPV4) != 0 && 389 ftype == ETHERTYPE_IP) { 390 uint16_t *cksump; 391 uint32_t cksum; 392 ipaddr_t src = ipha->ipha_src; 393 ipaddr_t dst = ipha->ipha_dst; 394 395 /* 396 * Our native IP stack doesn't set the L4 length field 397 * of the pseudo header when LSO is in play. Other IP 398 * stacks, e.g. Linux, do include the length field. 399 * This is a problem because the hardware expects that 400 * the length field is not set. When it is set it will 401 * cause an incorrect TCP checksum to be generated. 402 * The reason this works in Linux is because Linux 403 * corrects the pseudo-header checksum in the driver 404 * code. In order to get the correct HW checksum we 405 * need to assume the guest's IP stack gave us a bogus 406 * TCP partial checksum and calculate it ourselves. 407 */ 408 cksump = IPH_TCPH_CHECKSUMP(ipha, IPH_HDR_LENGTH(ipha)); 409 cksum = IP_TCP_CSUM_COMP; 410 cksum += (dst >> 16) + (dst & 0xFFFF) + 411 (src >> 16) + (src & 0xFFFF); 412 cksum = (cksum & 0xFFFF) + (cksum >> 16); 413 *(cksump) = (cksum & 0xFFFF) + (cksum >> 16); 414 415 /* 416 * Since viona is a "legacy device", the data stored 417 * by the driver will be in the guest's native endian 418 * format (see sections 2.4.3 and 5.1.6.1 of the 419 * VIRTIO 1.0 spec for more info). At this time the 420 * only guests using viona are x86 and we can assume 421 * little-endian. 422 */ 423 lso_info_set(mp, LE_16(hdr->vrh_gso_size), HW_LSO); 424 425 /* 426 * Hardware, like ixgbe, expects the client to request 427 * IP header checksum offload if it's sending LSO (see 428 * ixgbe_get_context()). Unfortunately, virtio makes 429 * no allowances for negotiating IP header checksum 430 * and HW offload, only TCP checksum. We add the flag 431 * and zero-out the checksum field. This mirrors the 432 * behavior of our native IP stack (which does this in 433 * the interest of HW that expects the field to be 434 * zero). 435 */ 436 flags |= HCK_IPV4_HDRCKSUM; 437 ipha->ipha_hdr_checksum = 0; 438 } 439 440 /* 441 * Use DB_CKSUMFLAGS instead of mac_hcksum_get() to make sure 442 * HW_LSO, if present, is not lost. 443 */ 444 flags |= DB_CKSUMFLAGS(mp); 445 446 /* 447 * Partial checksum support from the NIC is ideal, since it most 448 * closely maps to the interface defined by virtio. 449 */ 450 if ((link->l_cap_csum & HCKSUM_INET_PARTIAL) != 0 && 451 (ipproto == IPPROTO_TCP || ipproto == IPPROTO_UDP)) { 452 /* 453 * MAC expects these offsets to be relative to the 454 * start of the L3 header rather than the L2 frame. 455 */ 456 flags |= HCK_PARTIALCKSUM; 457 mac_hcksum_set(mp, csum_start - eth_len, csum_stuff - eth_len, 458 len - eth_len, 0, flags); 459 return (B_TRUE); 460 } 461 462 /* 463 * Without partial checksum support, look to the L3/L4 protocol 464 * information to see if the NIC can handle it. If not, the 465 * checksum will need to calculated inline. 466 */ 467 if (ftype == ETHERTYPE_IP) { 468 if ((link->l_cap_csum & HCKSUM_INET_FULL_V4) != 0 && 469 (ipproto == IPPROTO_TCP || ipproto == IPPROTO_UDP)) { 470 uint16_t *csump = (uint16_t *)(mp->b_rptr + csum_stuff); 471 *csump = 0; 472 flags |= HCK_FULLCKSUM; 473 mac_hcksum_set(mp, 0, 0, 0, 0, flags); 474 return (B_TRUE); 475 } 476 477 /* XXX: Implement manual fallback checksumming? */ 478 VIONA_PROBE2(fail_hcksum, viona_link_t *, link, mblk_t *, mp); 479 VIONA_RING_STAT_INCR(ring, fail_hcksum); 480 return (B_FALSE); 481 } else if (ftype == ETHERTYPE_IPV6) { 482 if ((link->l_cap_csum & HCKSUM_INET_FULL_V6) != 0 && 483 (ipproto == IPPROTO_TCP || ipproto == IPPROTO_UDP)) { 484 uint16_t *csump = (uint16_t *)(mp->b_rptr + csum_stuff); 485 *csump = 0; 486 flags |= HCK_FULLCKSUM; 487 mac_hcksum_set(mp, 0, 0, 0, 0, flags); 488 return (B_TRUE); 489 } 490 491 /* XXX: Implement manual fallback checksumming? */ 492 VIONA_PROBE2(fail_hcksum6, viona_link_t *, link, mblk_t *, mp); 493 VIONA_RING_STAT_INCR(ring, fail_hcksum6); 494 return (B_FALSE); 495 } 496 497 /* Cannot even emulate hcksum for unrecognized protocols */ 498 VIONA_PROBE2(fail_hcksum_proto, viona_link_t *, link, mblk_t *, mp); 499 VIONA_RING_STAT_INCR(ring, fail_hcksum_proto); 500 return (B_FALSE); 501 } 502 503 static void 504 viona_tx(viona_link_t *link, viona_vring_t *ring) 505 { 506 struct iovec *iov = ring->vr_txiov; 507 const uint_t max_segs = ring->vr_size; 508 uint16_t cookie; 509 int i, n; 510 uint32_t len, base_off = 0; 511 uint32_t min_copy = VIONA_MAX_HDRS_LEN; 512 mblk_t *mp_head, *mp_tail, *mp; 513 viona_desb_t *dp = NULL; 514 mac_client_handle_t link_mch = link->l_mch; 515 const struct virtio_net_hdr *hdr; 516 vmm_page_t *pages = NULL; 517 518 mp_head = mp_tail = NULL; 519 520 ASSERT(iov != NULL); 521 522 n = vq_popchain(ring, iov, max_segs, &cookie, &pages); 523 if (n == 0) { 524 VIONA_PROBE1(tx_absent, viona_vring_t *, ring); 525 VIONA_RING_STAT_INCR(ring, tx_absent); 526 return; 527 } else if (n < 0) { 528 /* 529 * Any error encountered in vq_popchain has already resulted in 530 * specific probe and statistic handling. Further action here 531 * is unnecessary. 532 */ 533 return; 534 } 535 536 /* Grab the header and ensure it is of adequate length */ 537 hdr = (const struct virtio_net_hdr *)iov[0].iov_base; 538 len = iov[0].iov_len; 539 if (len < sizeof (struct virtio_net_hdr)) { 540 goto drop_fail; 541 } 542 543 /* Make sure the packet headers are always in the first mblk. */ 544 if (ring->vr_txdesb != NULL) { 545 dp = &ring->vr_txdesb[cookie]; 546 547 /* 548 * If the guest driver is operating properly, each desb slot 549 * should be available for use when processing a TX descriptor 550 * from the 'avail' ring. In the case of drivers that reuse a 551 * descriptor before it has been posted to the 'used' ring, the 552 * data is simply dropped. 553 */ 554 if (atomic_cas_uint(&dp->d_ref, 0, 1) != 0) { 555 dp = NULL; 556 goto drop_fail; 557 } 558 559 dp->d_cookie = cookie; 560 mp_head = desballoc(dp->d_headers, VIONA_MAX_HDRS_LEN, 0, 561 &dp->d_frtn); 562 563 /* Account for the successful desballoc. */ 564 if (mp_head != NULL) 565 dp->d_ref++; 566 } else { 567 mp_head = allocb(VIONA_MAX_HDRS_LEN, 0); 568 } 569 570 if (mp_head == NULL) 571 goto drop_fail; 572 573 mp_tail = mp_head; 574 575 /* 576 * We always copy enough of the guest data to cover the 577 * headers. This protects us from TOCTOU attacks and allows 578 * message block length assumptions to be made in subsequent 579 * code. In many cases, this means copying more data than 580 * strictly necessary. That's okay, as it is the larger packets 581 * (such as LSO) that really benefit from desballoc(). 582 */ 583 for (i = 1; i < n; i++) { 584 const uint32_t to_copy = MIN(min_copy, iov[i].iov_len); 585 586 bcopy(iov[i].iov_base, mp_head->b_wptr, to_copy); 587 mp_head->b_wptr += to_copy; 588 len += to_copy; 589 min_copy -= to_copy; 590 591 /* 592 * We've met the minimum copy requirement. The rest of 593 * the guest data can be referenced. 594 */ 595 if (min_copy == 0) { 596 /* 597 * If we copied all contents of this 598 * descriptor then move onto the next one. 599 * Otherwise, record how far we are into the 600 * current descriptor. 601 */ 602 if (iov[i].iov_len == to_copy) 603 i++; 604 else 605 base_off = to_copy; 606 607 break; 608 } 609 } 610 611 ASSERT3P(mp_head, !=, NULL); 612 ASSERT3P(mp_tail, !=, NULL); 613 614 for (; i < n; i++) { 615 uintptr_t base = (uintptr_t)iov[i].iov_base + base_off; 616 uint32_t chunk = iov[i].iov_len - base_off; 617 618 ASSERT3U(base_off, <, iov[i].iov_len); 619 ASSERT3U(chunk, >, 0); 620 621 if (dp != NULL) { 622 mp = desballoc((uchar_t *)base, chunk, 0, &dp->d_frtn); 623 if (mp == NULL) { 624 goto drop_fail; 625 } 626 dp->d_ref++; 627 } else { 628 mp = allocb(chunk, BPRI_MED); 629 if (mp == NULL) { 630 goto drop_fail; 631 } 632 bcopy((uchar_t *)base, mp->b_wptr, chunk); 633 } 634 635 base_off = 0; 636 len += chunk; 637 mp->b_wptr += chunk; 638 mp_tail->b_cont = mp; 639 mp_tail = mp; 640 } 641 642 if (VNETHOOK_INTERESTED_OUT(link->l_neti)) { 643 /* 644 * The hook consumer may elect to free the mblk_t and set 645 * our mblk_t ** to NULL. When using a viona_desb_t 646 * (dp != NULL), we do not want the corresponding cleanup to 647 * occur during the viona_hook() call. We instead want to 648 * reset and recycle dp for future use. To prevent cleanup 649 * during the viona_hook() call, we take a ref on dp (if being 650 * used), and release it on success. On failure, the 651 * freemsgchain() call will release all the refs taken earlier 652 * in viona_tx() (aside from the initial ref and the one we 653 * take), and drop_hook will reset dp for reuse. 654 */ 655 if (dp != NULL) 656 dp->d_ref++; 657 658 /* 659 * Pass &mp instead of &mp_head so we don't lose track of 660 * mp_head if the hook consumer (i.e. ipf) elects to free mp 661 * and set mp to NULL. 662 */ 663 mp = mp_head; 664 if (viona_hook(link, ring, &mp, B_TRUE) != 0) { 665 if (mp != NULL) 666 freemsgchain(mp); 667 goto drop_hook; 668 } 669 670 if (dp != NULL) { 671 dp->d_ref--; 672 673 /* 674 * It is possible that the hook(s) accepted the packet, 675 * but as part of its processing, it issued a pull-up 676 * which released all references to the desb. In that 677 * case, go back to acting like the packet is entirely 678 * copied (which it is). 679 */ 680 if (dp->d_ref == 1) { 681 dp->d_cookie = 0; 682 dp->d_ref = 0; 683 dp = NULL; 684 } 685 } 686 } 687 688 /* 689 * Request hardware checksumming, if necessary. If the guest 690 * sent an LSO packet then it must have also negotiated and 691 * requested partial checksum; therefore the LSO logic is 692 * contained within viona_tx_csum(). 693 */ 694 if ((link->l_features & VIRTIO_NET_F_CSUM) != 0 && 695 (hdr->vrh_flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) != 0) { 696 if (!viona_tx_csum(ring, hdr, mp_head, len - iov[0].iov_len)) { 697 goto drop_fail; 698 } 699 } 700 701 if (dp != NULL) { 702 dp->d_len = len; 703 dp->d_pages = pages; 704 mutex_enter(&ring->vr_lock); 705 ring->vr_xfer_outstanding++; 706 mutex_exit(&ring->vr_lock); 707 } else { 708 /* 709 * If the data was cloned out of the ring, the descriptors can 710 * be marked as 'used' now, rather than deferring that action 711 * until after successful packet transmission. 712 */ 713 vmm_drv_page_release_chain(pages); 714 viona_tx_done(ring, len, cookie); 715 } 716 717 /* 718 * From viona's point of view, this is a successful transmit, even if 719 * something downstream decides to drop the packet. 720 */ 721 viona_ring_stat_accept(ring, len); 722 723 /* 724 * We're potentially going deep into the networking layer; make sure the 725 * guest can't run concurrently. 726 */ 727 smt_begin_unsafe(); 728 /* 729 * Ignore, for now, any signal from MAC about whether the outgoing 730 * packet was dropped or not. 731 */ 732 (void) mac_tx(link_mch, mp_head, 0, MAC_DROP_ON_NO_DESC, NULL); 733 smt_end_unsafe(); 734 return; 735 736 drop_fail: 737 /* 738 * On the off chance that memory is not available via the desballoc or 739 * allocb calls, there are few options left besides to fail and drop 740 * the frame on the floor. 741 * 742 * First account for it in the error stats. 743 */ 744 viona_ring_stat_error(ring); 745 746 if (dp != NULL) { 747 /* 748 * Take an additional reference on the desb handle (if present) 749 * so any desballoc-sourced mblks can release their hold on it 750 * without the handle reaching its final state and executing 751 * its clean-up logic. 752 */ 753 dp->d_ref++; 754 } 755 756 /* 757 * Free any already-allocated blocks and sum up the total length of the 758 * dropped data to be released to the used ring. 759 */ 760 freemsgchain(mp_head); 761 762 drop_hook: 763 len = 0; 764 for (uint_t i = 0; i < n; i++) { 765 len += iov[i].iov_len; 766 } 767 768 if (dp != NULL) { 769 VERIFY(dp->d_ref == 2); 770 771 /* Clean up the desb handle, releasing the extra hold. */ 772 dp->d_len = 0; 773 dp->d_cookie = 0; 774 dp->d_ref = 0; 775 } 776 777 /* Count in the stats as a drop, rather than an error */ 778 viona_ring_stat_drop(ring); 779 780 VIONA_PROBE3(tx_drop, viona_vring_t *, ring, uint32_t, len, 781 uint16_t, cookie); 782 vmm_drv_page_release_chain(pages); 783 viona_tx_done(ring, len, cookie); 784 } 785