1 /* 2 * Copyright (c) 2013 Chris Torek <torek @ torek net> 3 * All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 17 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 24 * SUCH DAMAGE. 25 */ 26 /* 27 * This file and its contents are supplied under the terms of the 28 * Common Development and Distribution License ("CDDL"), version 1.0. 29 * You may only use this file in accordance with the terms of version 30 * 1.0 of the CDDL. 31 * 32 * A full copy of the text of the CDDL should have accompanied this 33 * source. A copy of the CDDL is also available via the Internet at 34 * http://www.illumos.org/license/CDDL. 35 * 36 * Copyright 2015 Pluribus Networks Inc. 37 * Copyright 2019 Joyent, Inc. 38 * Copyright 2022 Oxide Computer Company 39 */ 40 41 42 #include <sys/types.h> 43 #include <sys/smt.h> 44 #include <sys/strsubr.h> 45 46 #include <sys/pattr.h> 47 #include <sys/dlpi.h> 48 #include <inet/ip.h> 49 #include <inet/ip_impl.h> 50 51 #include "viona_impl.h" 52 53 #define BNXE_NIC_DRIVER "bnxe" 54 55 /* 56 * copy tx mbufs from virtio ring to avoid necessitating a wait for packet 57 * transmission to free resources. 58 */ 59 kmutex_t viona_force_copy_lock; 60 static enum viona_force_copy { 61 VFC_UNINITALIZED = 0, 62 VFC_COPY_UNEEDED = 1, 63 VFC_COPY_REQUIRED = 2, 64 } viona_force_copy_state = VFC_UNINITALIZED; 65 66 struct viona_desb { 67 frtn_t d_frtn; 68 viona_vring_t *d_ring; 69 uint_t d_ref; 70 uint32_t d_len; 71 uint16_t d_cookie; 72 uchar_t *d_headers; 73 vmm_page_t *d_pages; 74 }; 75 76 static void viona_tx(viona_link_t *, viona_vring_t *); 77 static void viona_desb_release(viona_desb_t *); 78 79 80 static void 81 viona_tx_wait_outstanding(viona_vring_t *ring) 82 { 83 ASSERT(MUTEX_HELD(&ring->vr_lock)); 84 85 while (ring->vr_xfer_outstanding != 0) { 86 /* 87 * Paying heed to signals is counterproductive here. This is a 88 * very tight loop if pending transfers take an extended amount 89 * of time to be reclaimed while the host process is exiting. 90 */ 91 cv_wait(&ring->vr_cv, &ring->vr_lock); 92 } 93 } 94 95 /* 96 * Check if full TX packet copying is needed. This should not be called from 97 * viona attach()/detach() context. 98 */ 99 static boolean_t 100 viona_tx_copy_needed(void) 101 { 102 boolean_t result; 103 104 mutex_enter(&viona_force_copy_lock); 105 if (viona_force_copy_state == VFC_UNINITALIZED) { 106 major_t bnxe_major; 107 108 /* 109 * The original code for viona featured an explicit check for 110 * the bnxe driver which, when found present, necessitated that 111 * all transmissions be copied into their own mblks instead of 112 * passing guest memory to the underlying device. 113 * 114 * The motivations for this are unclear, but until it can be 115 * proven unnecessary, the check lives on. 116 */ 117 viona_force_copy_state = VFC_COPY_UNEEDED; 118 if ((bnxe_major = ddi_name_to_major(BNXE_NIC_DRIVER)) 119 != DDI_MAJOR_T_NONE) { 120 if (ddi_hold_installed_driver(bnxe_major) != NULL) { 121 viona_force_copy_state = VFC_COPY_REQUIRED; 122 ddi_rele_driver(bnxe_major); 123 } 124 } 125 } 126 result = (viona_force_copy_state == VFC_COPY_REQUIRED); 127 mutex_exit(&viona_force_copy_lock); 128 129 return (result); 130 } 131 132 void 133 viona_tx_ring_alloc(viona_vring_t *ring, const uint16_t qsz) 134 { 135 /* Allocate desb handles for TX ring if packet copying is disabled */ 136 if (!viona_tx_copy_needed()) { 137 viona_desb_t *dp; 138 139 dp = kmem_zalloc(sizeof (viona_desb_t) * qsz, KM_SLEEP); 140 ring->vr_txdesb = dp; 141 for (uint_t i = 0; i < qsz; i++, dp++) { 142 dp->d_frtn.free_func = viona_desb_release; 143 dp->d_frtn.free_arg = (void *)dp; 144 dp->d_ring = ring; 145 dp->d_headers = kmem_zalloc(VIONA_MAX_HDRS_LEN, 146 KM_SLEEP); 147 } 148 } 149 150 /* Allocate ring-sized iovec buffers for TX */ 151 ring->vr_txiov = kmem_alloc(sizeof (struct iovec) * qsz, KM_SLEEP); 152 } 153 154 void 155 viona_tx_ring_free(viona_vring_t *ring, const uint16_t qsz) 156 { 157 if (ring->vr_txdesb != NULL) { 158 viona_desb_t *dp = ring->vr_txdesb; 159 160 for (uint_t i = 0; i < qsz; i++, dp++) { 161 kmem_free(dp->d_headers, VIONA_MAX_HDRS_LEN); 162 } 163 kmem_free(ring->vr_txdesb, sizeof (viona_desb_t) * qsz); 164 ring->vr_txdesb = NULL; 165 } 166 167 if (ring->vr_txiov != NULL) { 168 kmem_free(ring->vr_txiov, sizeof (struct iovec) * qsz); 169 ring->vr_txiov = NULL; 170 } 171 } 172 173 static void 174 viona_tx_done(viona_vring_t *ring, uint32_t len, uint16_t cookie) 175 { 176 vq_pushchain(ring, len, cookie); 177 178 membar_enter(); 179 viona_intr_ring(ring, B_FALSE); 180 } 181 182 #define TX_BURST_THRESH 32 183 184 void 185 viona_worker_tx(viona_vring_t *ring, viona_link_t *link) 186 { 187 (void) thread_vsetname(curthread, "viona_tx_%p", ring); 188 189 ASSERT(MUTEX_HELD(&ring->vr_lock)); 190 ASSERT3U(ring->vr_state, ==, VRS_RUN); 191 192 mutex_exit(&ring->vr_lock); 193 194 for (;;) { 195 uint_t ntx = 0, burst = 0; 196 197 viona_ring_disable_notify(ring); 198 while (viona_ring_num_avail(ring) != 0) { 199 viona_tx(link, ring); 200 ntx++; 201 burst++; 202 203 /* 204 * It is advantageous for throughput to keep this 205 * transmission loop tight, but periodic breaks to 206 * check for other events are of value too. 207 */ 208 if (burst >= TX_BURST_THRESH) { 209 mutex_enter(&ring->vr_lock); 210 const bool need_bail = vring_need_bail(ring); 211 mutex_exit(&ring->vr_lock); 212 213 if (need_bail) { 214 break; 215 } 216 burst = 0; 217 } 218 } 219 220 VIONA_PROBE2(tx, viona_link_t *, link, uint_t, ntx); 221 222 /* 223 * Check for available descriptors on the ring once more in 224 * case a late addition raced with the NO_NOTIFY flag toggle. 225 * 226 * The barrier ensures that visibility of the no-notify 227 * store does not cross the viona_ring_num_avail() check below. 228 */ 229 viona_ring_enable_notify(ring); 230 membar_enter(); 231 232 if (viona_ring_num_avail(ring) == 0 && 233 (link->l_features & VIRTIO_F_RING_NOTIFY_ON_EMPTY) != 0) { 234 /* 235 * The NOTIFY_ON_EMPTY interrupt should not pay heed to 236 * the presence of AVAIL_NO_INTERRUPT. 237 */ 238 viona_intr_ring(ring, B_TRUE); 239 } 240 241 mutex_enter(&ring->vr_lock); 242 for (;;) { 243 if (vring_need_bail(ring)) { 244 ring->vr_state = VRS_STOP; 245 viona_tx_wait_outstanding(ring); 246 return; 247 } 248 249 if (vmm_drv_lease_expired(ring->vr_lease)) { 250 ring->vr_state_flags |= VRSF_RENEW; 251 /* 252 * When renewing the lease for the ring, no TX 253 * frames may be outstanding, as they contain 254 * references to guest memory. 255 */ 256 viona_tx_wait_outstanding(ring); 257 258 const boolean_t renewed = 259 viona_ring_lease_renew(ring); 260 ring->vr_state_flags &= ~VRSF_RENEW; 261 262 if (!renewed) { 263 /* stop ring on failed renewal */ 264 ring->vr_state = VRS_STOP; 265 return; 266 } 267 } 268 269 if (viona_ring_num_avail(ring) != 0) { 270 break; 271 } 272 273 /* Wait for further activity on the ring */ 274 (void) cv_wait_sig(&ring->vr_cv, &ring->vr_lock); 275 } 276 mutex_exit(&ring->vr_lock); 277 } 278 /* UNREACHABLE */ 279 } 280 281 static void 282 viona_desb_release(viona_desb_t *dp) 283 { 284 viona_vring_t *ring = dp->d_ring; 285 uint_t ref; 286 uint32_t len; 287 uint16_t cookie; 288 289 ref = atomic_dec_uint_nv(&dp->d_ref); 290 if (ref > 1) { 291 return; 292 } 293 294 /* 295 * The desb corresponding to this index must be ready for reuse before 296 * the descriptor is returned to the guest via the 'used' ring. 297 */ 298 len = dp->d_len; 299 cookie = dp->d_cookie; 300 dp->d_len = 0; 301 dp->d_cookie = 0; 302 vmm_drv_page_release_chain(dp->d_pages); 303 dp->d_pages = NULL; 304 305 /* 306 * Ensure all other changes to the desb are visible prior to zeroing its 307 * refcount, signifying its readiness for reuse. 308 */ 309 membar_exit(); 310 dp->d_ref = 0; 311 312 viona_tx_done(ring, len, cookie); 313 314 mutex_enter(&ring->vr_lock); 315 if ((--ring->vr_xfer_outstanding) == 0) { 316 cv_broadcast(&ring->vr_cv); 317 } 318 mutex_exit(&ring->vr_lock); 319 } 320 321 static boolean_t 322 viona_tx_csum(viona_vring_t *ring, const struct virtio_net_hdr *hdr, 323 mblk_t *mp, uint32_t len) 324 { 325 viona_link_t *link = ring->vr_link; 326 const struct ether_header *eth; 327 uint_t eth_len = sizeof (struct ether_header); 328 ushort_t ftype; 329 ipha_t *ipha = NULL; 330 uint8_t ipproto = IPPROTO_NONE; /* NONE is not exactly right, but ok */ 331 uint16_t flags = 0; 332 const uint_t csum_start = hdr->vrh_csum_start; 333 const uint_t csum_stuff = hdr->vrh_csum_offset + csum_start; 334 335 /* 336 * Validate that the checksum offsets provided by the guest are within 337 * the bounds of the packet. Additionally, ensure that the checksum 338 * contents field is within the headers mblk copied by viona_tx(). 339 */ 340 if (csum_start >= len || csum_start < eth_len || csum_stuff >= len || 341 (csum_stuff + sizeof (uint16_t)) > MBLKL(mp)) { 342 VIONA_PROBE2(fail_hcksum, viona_link_t *, link, mblk_t *, mp); 343 VIONA_RING_STAT_INCR(ring, fail_hcksum); 344 return (B_FALSE); 345 } 346 347 /* 348 * This is guaranteed to be safe thanks to the header copying 349 * done in viona_tx(). 350 */ 351 eth = (const struct ether_header *)mp->b_rptr; 352 ftype = ntohs(eth->ether_type); 353 354 if (ftype == ETHERTYPE_VLAN) { 355 const struct ether_vlan_header *veth; 356 357 /* punt on QinQ for now */ 358 eth_len = sizeof (struct ether_vlan_header); 359 veth = (const struct ether_vlan_header *)eth; 360 ftype = ntohs(veth->ether_type); 361 } 362 363 if (ftype == ETHERTYPE_IP) { 364 ipha = (ipha_t *)(mp->b_rptr + eth_len); 365 366 ipproto = ipha->ipha_protocol; 367 } else if (ftype == ETHERTYPE_IPV6) { 368 ip6_t *ip6h = (ip6_t *)(mp->b_rptr + eth_len); 369 370 ipproto = ip6h->ip6_nxt; 371 } 372 373 /* 374 * We ignore hdr_len because the spec says it can't be 375 * trusted. Besides, our own stack will determine the header 376 * boundary. 377 */ 378 if ((link->l_cap_csum & HCKSUM_INET_PARTIAL) != 0 && 379 (hdr->vrh_gso_type & VIRTIO_NET_HDR_GSO_TCPV4) != 0 && 380 ftype == ETHERTYPE_IP) { 381 uint16_t *cksump; 382 uint32_t cksum; 383 ipaddr_t src = ipha->ipha_src; 384 ipaddr_t dst = ipha->ipha_dst; 385 386 /* 387 * Our native IP stack doesn't set the L4 length field 388 * of the pseudo header when LSO is in play. Other IP 389 * stacks, e.g. Linux, do include the length field. 390 * This is a problem because the hardware expects that 391 * the length field is not set. When it is set it will 392 * cause an incorrect TCP checksum to be generated. 393 * The reason this works in Linux is because Linux 394 * corrects the pseudo-header checksum in the driver 395 * code. In order to get the correct HW checksum we 396 * need to assume the guest's IP stack gave us a bogus 397 * TCP partial checksum and calculate it ourselves. 398 */ 399 cksump = IPH_TCPH_CHECKSUMP(ipha, IPH_HDR_LENGTH(ipha)); 400 cksum = IP_TCP_CSUM_COMP; 401 cksum += (dst >> 16) + (dst & 0xFFFF) + 402 (src >> 16) + (src & 0xFFFF); 403 cksum = (cksum & 0xFFFF) + (cksum >> 16); 404 *(cksump) = (cksum & 0xFFFF) + (cksum >> 16); 405 406 /* 407 * Since viona is a "legacy device", the data stored 408 * by the driver will be in the guest's native endian 409 * format (see sections 2.4.3 and 5.1.6.1 of the 410 * VIRTIO 1.0 spec for more info). At this time the 411 * only guests using viona are x86 and we can assume 412 * little-endian. 413 */ 414 lso_info_set(mp, LE_16(hdr->vrh_gso_size), HW_LSO); 415 416 /* 417 * Hardware, like ixgbe, expects the client to request 418 * IP header checksum offload if it's sending LSO (see 419 * ixgbe_get_context()). Unfortunately, virtio makes 420 * no allowances for negotiating IP header checksum 421 * and HW offload, only TCP checksum. We add the flag 422 * and zero-out the checksum field. This mirrors the 423 * behavior of our native IP stack (which does this in 424 * the interest of HW that expects the field to be 425 * zero). 426 */ 427 flags |= HCK_IPV4_HDRCKSUM; 428 ipha->ipha_hdr_checksum = 0; 429 } 430 431 /* 432 * Use DB_CKSUMFLAGS instead of mac_hcksum_get() to make sure 433 * HW_LSO, if present, is not lost. 434 */ 435 flags |= DB_CKSUMFLAGS(mp); 436 437 /* 438 * Partial checksum support from the NIC is ideal, since it most 439 * closely maps to the interface defined by virtio. 440 */ 441 if ((link->l_cap_csum & HCKSUM_INET_PARTIAL) != 0 && 442 (ipproto == IPPROTO_TCP || ipproto == IPPROTO_UDP)) { 443 /* 444 * MAC expects these offsets to be relative to the 445 * start of the L3 header rather than the L2 frame. 446 */ 447 flags |= HCK_PARTIALCKSUM; 448 mac_hcksum_set(mp, csum_start - eth_len, csum_stuff - eth_len, 449 len - eth_len, 0, flags); 450 return (B_TRUE); 451 } 452 453 /* 454 * Without partial checksum support, look to the L3/L4 protocol 455 * information to see if the NIC can handle it. If not, the 456 * checksum will need to calculated inline. 457 */ 458 if (ftype == ETHERTYPE_IP) { 459 if ((link->l_cap_csum & HCKSUM_INET_FULL_V4) != 0 && 460 (ipproto == IPPROTO_TCP || ipproto == IPPROTO_UDP)) { 461 uint16_t *csump = (uint16_t *)(mp->b_rptr + csum_stuff); 462 *csump = 0; 463 flags |= HCK_FULLCKSUM; 464 mac_hcksum_set(mp, 0, 0, 0, 0, flags); 465 return (B_TRUE); 466 } 467 468 /* XXX: Implement manual fallback checksumming? */ 469 VIONA_PROBE2(fail_hcksum, viona_link_t *, link, mblk_t *, mp); 470 VIONA_RING_STAT_INCR(ring, fail_hcksum); 471 return (B_FALSE); 472 } else if (ftype == ETHERTYPE_IPV6) { 473 if ((link->l_cap_csum & HCKSUM_INET_FULL_V6) != 0 && 474 (ipproto == IPPROTO_TCP || ipproto == IPPROTO_UDP)) { 475 uint16_t *csump = (uint16_t *)(mp->b_rptr + csum_stuff); 476 *csump = 0; 477 flags |= HCK_FULLCKSUM; 478 mac_hcksum_set(mp, 0, 0, 0, 0, flags); 479 return (B_TRUE); 480 } 481 482 /* XXX: Implement manual fallback checksumming? */ 483 VIONA_PROBE2(fail_hcksum6, viona_link_t *, link, mblk_t *, mp); 484 VIONA_RING_STAT_INCR(ring, fail_hcksum6); 485 return (B_FALSE); 486 } 487 488 /* Cannot even emulate hcksum for unrecognized protocols */ 489 VIONA_PROBE2(fail_hcksum_proto, viona_link_t *, link, mblk_t *, mp); 490 VIONA_RING_STAT_INCR(ring, fail_hcksum_proto); 491 return (B_FALSE); 492 } 493 494 static void 495 viona_tx(viona_link_t *link, viona_vring_t *ring) 496 { 497 struct iovec *iov = ring->vr_txiov; 498 const uint_t max_segs = ring->vr_size; 499 uint16_t cookie; 500 int i, n; 501 uint32_t len, base_off = 0; 502 uint32_t min_copy = VIONA_MAX_HDRS_LEN; 503 mblk_t *mp_head, *mp_tail, *mp; 504 viona_desb_t *dp = NULL; 505 mac_client_handle_t link_mch = link->l_mch; 506 const struct virtio_net_hdr *hdr; 507 vmm_page_t *pages = NULL; 508 509 mp_head = mp_tail = NULL; 510 511 ASSERT(iov != NULL); 512 513 n = vq_popchain(ring, iov, max_segs, &cookie, &pages); 514 if (n == 0) { 515 VIONA_PROBE1(tx_absent, viona_vring_t *, ring); 516 VIONA_RING_STAT_INCR(ring, tx_absent); 517 return; 518 } else if (n < 0) { 519 /* 520 * Any error encountered in vq_popchain has already resulted in 521 * specific probe and statistic handling. Further action here 522 * is unnecessary. 523 */ 524 return; 525 } 526 527 /* Grab the header and ensure it is of adequate length */ 528 hdr = (const struct virtio_net_hdr *)iov[0].iov_base; 529 len = iov[0].iov_len; 530 if (len < sizeof (struct virtio_net_hdr)) { 531 goto drop_fail; 532 } 533 534 /* Make sure the packet headers are always in the first mblk. */ 535 if (ring->vr_txdesb != NULL) { 536 dp = &ring->vr_txdesb[cookie]; 537 538 /* 539 * If the guest driver is operating properly, each desb slot 540 * should be available for use when processing a TX descriptor 541 * from the 'avail' ring. In the case of drivers that reuse a 542 * descriptor before it has been posted to the 'used' ring, the 543 * data is simply dropped. 544 */ 545 if (atomic_cas_uint(&dp->d_ref, 0, 1) != 0) { 546 dp = NULL; 547 goto drop_fail; 548 } 549 550 dp->d_cookie = cookie; 551 mp_head = desballoc(dp->d_headers, VIONA_MAX_HDRS_LEN, 0, 552 &dp->d_frtn); 553 554 /* Account for the successful desballoc. */ 555 if (mp_head != NULL) 556 dp->d_ref++; 557 } else { 558 mp_head = allocb(VIONA_MAX_HDRS_LEN, 0); 559 } 560 561 if (mp_head == NULL) 562 goto drop_fail; 563 564 mp_tail = mp_head; 565 566 /* 567 * We always copy enough of the guest data to cover the 568 * headers. This protects us from TOCTOU attacks and allows 569 * message block length assumptions to be made in subsequent 570 * code. In many cases, this means copying more data than 571 * strictly necessary. That's okay, as it is the larger packets 572 * (such as LSO) that really benefit from desballoc(). 573 */ 574 for (i = 1; i < n; i++) { 575 const uint32_t to_copy = MIN(min_copy, iov[i].iov_len); 576 577 bcopy(iov[i].iov_base, mp_head->b_wptr, to_copy); 578 mp_head->b_wptr += to_copy; 579 len += to_copy; 580 min_copy -= to_copy; 581 582 /* 583 * We've met the minimum copy requirement. The rest of 584 * the guest data can be referenced. 585 */ 586 if (min_copy == 0) { 587 /* 588 * If we copied all contents of this 589 * descriptor then move onto the next one. 590 * Otherwise, record how far we are into the 591 * current descriptor. 592 */ 593 if (iov[i].iov_len == to_copy) 594 i++; 595 else 596 base_off = to_copy; 597 598 break; 599 } 600 } 601 602 ASSERT3P(mp_head, !=, NULL); 603 ASSERT3P(mp_tail, !=, NULL); 604 605 for (; i < n; i++) { 606 uintptr_t base = (uintptr_t)iov[i].iov_base + base_off; 607 uint32_t chunk = iov[i].iov_len - base_off; 608 609 ASSERT3U(base_off, <, iov[i].iov_len); 610 ASSERT3U(chunk, >, 0); 611 612 if (dp != NULL) { 613 mp = desballoc((uchar_t *)base, chunk, 0, &dp->d_frtn); 614 if (mp == NULL) { 615 goto drop_fail; 616 } 617 dp->d_ref++; 618 } else { 619 mp = allocb(chunk, BPRI_MED); 620 if (mp == NULL) { 621 goto drop_fail; 622 } 623 bcopy((uchar_t *)base, mp->b_wptr, chunk); 624 } 625 626 base_off = 0; 627 len += chunk; 628 mp->b_wptr += chunk; 629 mp_tail->b_cont = mp; 630 mp_tail = mp; 631 } 632 633 if (VNETHOOK_INTERESTED_OUT(link->l_neti)) { 634 /* 635 * The hook consumer may elect to free the mblk_t and set 636 * our mblk_t ** to NULL. When using a viona_desb_t 637 * (dp != NULL), we do not want the corresponding cleanup to 638 * occur during the viona_hook() call. We instead want to 639 * reset and recycle dp for future use. To prevent cleanup 640 * during the viona_hook() call, we take a ref on dp (if being 641 * used), and release it on success. On failure, the 642 * freemsgchain() call will release all the refs taken earlier 643 * in viona_tx() (aside from the initial ref and the one we 644 * take), and drop_hook will reset dp for reuse. 645 */ 646 if (dp != NULL) 647 dp->d_ref++; 648 649 /* 650 * Pass &mp instead of &mp_head so we don't lose track of 651 * mp_head if the hook consumer (i.e. ipf) elects to free mp 652 * and set mp to NULL. 653 */ 654 mp = mp_head; 655 if (viona_hook(link, ring, &mp, B_TRUE) != 0) { 656 if (mp != NULL) 657 freemsgchain(mp); 658 goto drop_hook; 659 } 660 661 if (dp != NULL) { 662 dp->d_ref--; 663 664 /* 665 * It is possible that the hook(s) accepted the packet, 666 * but as part of its processing, it issued a pull-up 667 * which released all references to the desb. In that 668 * case, go back to acting like the packet is entirely 669 * copied (which it is). 670 */ 671 if (dp->d_ref == 1) { 672 dp->d_cookie = 0; 673 dp->d_ref = 0; 674 dp = NULL; 675 } 676 } 677 } 678 679 /* 680 * Request hardware checksumming, if necessary. If the guest 681 * sent an LSO packet then it must have also negotiated and 682 * requested partial checksum; therefore the LSO logic is 683 * contained within viona_tx_csum(). 684 */ 685 if ((link->l_features & VIRTIO_NET_F_CSUM) != 0 && 686 (hdr->vrh_flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) != 0) { 687 if (!viona_tx_csum(ring, hdr, mp_head, len - iov[0].iov_len)) { 688 goto drop_fail; 689 } 690 } 691 692 if (dp != NULL) { 693 dp->d_len = len; 694 dp->d_pages = pages; 695 mutex_enter(&ring->vr_lock); 696 ring->vr_xfer_outstanding++; 697 mutex_exit(&ring->vr_lock); 698 } else { 699 /* 700 * If the data was cloned out of the ring, the descriptors can 701 * be marked as 'used' now, rather than deferring that action 702 * until after successful packet transmission. 703 */ 704 vmm_drv_page_release_chain(pages); 705 viona_tx_done(ring, len, cookie); 706 } 707 708 /* 709 * We're potentially going deep into the networking layer; make sure the 710 * guest can't run concurrently. 711 */ 712 smt_begin_unsafe(); 713 /* 714 * Ignore, for now, any signal from MAC about whether the outgoing 715 * packet was dropped or not. 716 */ 717 (void) mac_tx(link_mch, mp_head, 0, MAC_DROP_ON_NO_DESC, NULL); 718 smt_end_unsafe(); 719 return; 720 721 drop_fail: 722 /* 723 * On the off chance that memory is not available via the desballoc or 724 * allocb calls, there are few options left besides to fail and drop 725 * the frame on the floor. 726 */ 727 728 if (dp != NULL) { 729 /* 730 * Take an additional reference on the desb handle (if present) 731 * so any desballoc-sourced mblks can release their hold on it 732 * without the handle reaching its final state and executing 733 * its clean-up logic. 734 */ 735 dp->d_ref++; 736 } 737 738 /* 739 * Free any already-allocated blocks and sum up the total length of the 740 * dropped data to be released to the used ring. 741 */ 742 freemsgchain(mp_head); 743 744 drop_hook: 745 len = 0; 746 for (uint_t i = 0; i < n; i++) { 747 len += iov[i].iov_len; 748 } 749 750 if (dp != NULL) { 751 VERIFY(dp->d_ref == 2); 752 753 /* Clean up the desb handle, releasing the extra hold. */ 754 dp->d_len = 0; 755 dp->d_cookie = 0; 756 dp->d_ref = 0; 757 } 758 759 VIONA_PROBE3(tx_drop, viona_vring_t *, ring, uint32_t, len, 760 uint16_t, cookie); 761 vmm_drv_page_release_chain(pages); 762 viona_tx_done(ring, len, cookie); 763 } 764