/*
 * Copyright (c) 2013  Chris Torek <torek @ torek net>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source.  A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 *
 * Copyright 2015 Pluribus Networks Inc.
 * Copyright 2019 Joyent, Inc.
 * Copyright 2021 Oxide Computer Company
 */

#include <sys/types.h>
#include <sys/smt.h>
#include <sys/strsubr.h>

#include <sys/pattr.h>
#include <sys/dlpi.h>
#include <inet/ip.h>
#include <inet/ip_impl.h>

#include "viona_impl.h"

#define	BNXE_NIC_DRIVER		"bnxe"

/*
 * copy tx mbufs from virtio ring to avoid necessitating a wait for packet
 * transmission to free resources.
 */
kmutex_t viona_force_copy_lock;
static enum viona_force_copy {
	VFC_UNINITALIZED	= 0,
	VFC_COPY_UNEEDED	= 1,
	VFC_COPY_REQUIRED	= 2,
} viona_force_copy_state = VFC_UNINITALIZED;

struct viona_desb {
	frtn_t			d_frtn;
	viona_vring_t		*d_ring;
	uint_t			d_ref;
	uint32_t		d_len;
	uint16_t		d_cookie;
	uchar_t			*d_headers;
	vmm_page_t		*d_pages;
};

static void viona_tx(viona_link_t *, viona_vring_t *);
static void viona_desb_release(viona_desb_t *);


static void
viona_tx_wait_outstanding(viona_vring_t *ring)
{
	ASSERT(MUTEX_HELD(&ring->vr_lock));

	while (ring->vr_xfer_outstanding != 0) {
		/*
		 * Paying heed to signals is counterproductive here.  This is a
		 * very tight loop if pending transfers take an extended amount
		 * of time to be reclaimed while the host process is exiting.
		 */
		cv_wait(&ring->vr_cv, &ring->vr_lock);
	}
}

/*
 * Check if full TX packet copying is needed.  This should not be called from
 * viona attach()/detach() context.
 */
static boolean_t
viona_tx_copy_needed(void)
{
	boolean_t result;

	mutex_enter(&viona_force_copy_lock);
	if (viona_force_copy_state == VFC_UNINITALIZED) {
		major_t bnxe_major;

		/*
		 * The original code for viona featured an explicit check for
		 * the bnxe driver which, when found present, necessitated that
		 * all transmissions be copied into their own mblks instead of
		 * passing guest memory to the underlying device.
		 *
		 * The motivations for this are unclear, but until it can be
		 * proven unnecessary, the check lives on.
		 */
		viona_force_copy_state = VFC_COPY_UNEEDED;
		if ((bnxe_major = ddi_name_to_major(BNXE_NIC_DRIVER))
		    != DDI_MAJOR_T_NONE) {
			if (ddi_hold_installed_driver(bnxe_major) != NULL) {
				viona_force_copy_state = VFC_COPY_REQUIRED;
				ddi_rele_driver(bnxe_major);
			}
		}
	}
	result = (viona_force_copy_state == VFC_COPY_REQUIRED);
	mutex_exit(&viona_force_copy_lock);

	return (result);
}

void
viona_tx_ring_alloc(viona_vring_t *ring, const uint16_t qsz)
{
	/* Allocate desb handles for TX ring if full packet copying is not required */
	if (!viona_tx_copy_needed()) {
		viona_desb_t *dp;

		dp = kmem_zalloc(sizeof (viona_desb_t) * qsz, KM_SLEEP);
		ring->vr_txdesb = dp;
		for (uint_t i = 0; i < qsz; i++, dp++) {
			dp->d_frtn.free_func = viona_desb_release;
			dp->d_frtn.free_arg = (void *)dp;
			dp->d_ring = ring;
			dp->d_headers = kmem_zalloc(VIONA_MAX_HDRS_LEN,
			    KM_SLEEP);
		}
	}

	/* Allocate ring-sized iovec buffers for TX */
	ring->vr_txiov = kmem_alloc(sizeof (struct iovec) * qsz, KM_SLEEP);
}

void
viona_tx_ring_free(viona_vring_t *ring, const uint16_t qsz)
{
	if (ring->vr_txdesb != NULL) {
		viona_desb_t *dp = ring->vr_txdesb;

		for (uint_t i = 0; i < qsz; i++, dp++) {
			kmem_free(dp->d_headers, VIONA_MAX_HDRS_LEN);
		}
		kmem_free(ring->vr_txdesb, sizeof (viona_desb_t) * qsz);
		ring->vr_txdesb = NULL;
	}

	if (ring->vr_txiov != NULL) {
		kmem_free(ring->vr_txiov, sizeof (struct iovec) * qsz);
		ring->vr_txiov = NULL;
	}
}

static void
viona_tx_done(viona_vring_t *ring, uint32_t len, uint16_t cookie)
{
	vq_pushchain(ring, len, cookie);

	membar_enter();
	viona_intr_ring(ring, B_FALSE);
}
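
/*
 * The TX worker below applies the usual virtio notification-suppression
 * pattern: guest notifications are disabled while the ring is drained, then
 * re-enabled, after which availability is checked one final time to close
 * the race with a guest that enqueued descriptors just as the flag was
 * toggled.  Only when that re-check comes up empty does the worker sleep on
 * vr_cv, waking to bail out or renew the vmm lease as required.
 */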

void
viona_worker_tx(viona_vring_t *ring, viona_link_t *link)
{
	proc_t *p = ttoproc(curthread);

	(void) thread_vsetname(curthread, "viona_tx_%p", ring);

	ASSERT(MUTEX_HELD(&ring->vr_lock));
	ASSERT3U(ring->vr_state, ==, VRS_RUN);

	mutex_exit(&ring->vr_lock);

	for (;;) {
		boolean_t bail = B_FALSE;
		boolean_t renew = B_FALSE;
		uint_t ntx = 0;

		viona_ring_disable_notify(ring);
		while (viona_ring_num_avail(ring)) {
			viona_tx(link, ring);

			/*
			 * It is advantageous for throughput to keep this
			 * transmission loop tight, but periodic breaks to
			 * check for other events are of value too.
			 */
			if (ntx++ >= ring->vr_size)
				break;
		}
		viona_ring_enable_notify(ring);

		VIONA_PROBE2(tx, viona_link_t *, link, uint_t, ntx);

		/*
		 * Check for available descriptors on the ring once more in
		 * case a late addition raced with the NO_NOTIFY flag toggle.
		 *
		 * The barrier ensures that visibility of the no-notify
		 * store does not cross the viona_ring_num_avail() check below.
		 */
		membar_enter();
		bail = VRING_NEED_BAIL(ring, p);
		renew = vmm_drv_lease_expired(ring->vr_lease);
		if (!bail && !renew && viona_ring_num_avail(ring)) {
			continue;
		}

		if ((link->l_features & VIRTIO_F_RING_NOTIFY_ON_EMPTY) != 0) {
			/*
			 * The NOTIFY_ON_EMPTY interrupt should not pay heed to
			 * the presence of AVAIL_NO_INTERRUPT.
			 */
			viona_intr_ring(ring, B_TRUE);
		}

		mutex_enter(&ring->vr_lock);

		while (!bail && !renew && !viona_ring_num_avail(ring)) {
			(void) cv_wait_sig(&ring->vr_cv, &ring->vr_lock);
			bail = VRING_NEED_BAIL(ring, p);
			renew = vmm_drv_lease_expired(ring->vr_lease);
		}

		if (bail) {
			break;
		} else if (renew) {
			ring->vr_state_flags |= VRSF_RENEW;
			/*
			 * When renewing the lease for the ring, no TX
			 * frames may be outstanding, as they contain
			 * references to guest memory.
			 */
			viona_tx_wait_outstanding(ring);

			if (!viona_ring_lease_renew(ring)) {
				break;
			}
			ring->vr_state_flags &= ~VRSF_RENEW;
		}
		mutex_exit(&ring->vr_lock);
	}

	ASSERT(MUTEX_HELD(&ring->vr_lock));

	ring->vr_state = VRS_STOP;
	viona_tx_wait_outstanding(ring);
}
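
/*
 * A desb handle cycles through its reference count as follows: a d_ref of 0
 * marks the slot free for use by viona_tx(), which claims it by moving d_ref
 * from 0 to 1.  Each mblk loaned out via desballoc() adds a reference.  As
 * those mblks are freed, viona_desb_release() drops references; once only the
 * base hold remains, the guest pages are released, d_ref returns to 0, and
 * the descriptor chain is handed back to the guest via viona_tx_done().
 */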

static void
viona_desb_release(viona_desb_t *dp)
{
	viona_vring_t *ring = dp->d_ring;
	uint_t ref;
	uint32_t len;
	uint16_t cookie;

	ref = atomic_dec_uint_nv(&dp->d_ref);
	if (ref > 1) {
		return;
	}

	/*
	 * The desb corresponding to this index must be ready for reuse before
	 * the descriptor is returned to the guest via the 'used' ring.
	 */
	len = dp->d_len;
	cookie = dp->d_cookie;
	dp->d_len = 0;
	dp->d_cookie = 0;
	vmm_drv_page_release_chain(dp->d_pages);
	dp->d_pages = NULL;

	/*
	 * Ensure all other changes to the desb are visible prior to zeroing
	 * its refcount, signifying its readiness for reuse.
	 */
	membar_exit();
	dp->d_ref = 0;

	viona_tx_done(ring, len, cookie);

	mutex_enter(&ring->vr_lock);
	if ((--ring->vr_xfer_outstanding) == 0) {
		cv_broadcast(&ring->vr_cv);
	}
	mutex_exit(&ring->vr_lock);
}

static boolean_t
viona_tx_csum(viona_vring_t *ring, const struct virtio_net_hdr *hdr,
    mblk_t *mp, uint32_t len)
{
	viona_link_t *link = ring->vr_link;
	const struct ether_header *eth;
	uint_t eth_len = sizeof (struct ether_header);
	ushort_t ftype;
	ipha_t *ipha = NULL;
	uint8_t ipproto = IPPROTO_NONE; /* NONE is not exactly right, but ok */
	uint16_t flags = 0;
	const uint_t csum_start = hdr->vrh_csum_start;
	const uint_t csum_stuff = hdr->vrh_csum_offset + csum_start;

	/*
	 * Validate that the checksum offsets provided by the guest are within
	 * the bounds of the packet.  Additionally, ensure that the checksum
	 * contents field is within the headers mblk copied by viona_tx().
	 */
	if (csum_start >= len || csum_start < eth_len || csum_stuff >= len ||
	    (csum_stuff + sizeof (uint16_t)) > MBLKL(mp)) {
		VIONA_PROBE2(fail_hcksum, viona_link_t *, link, mblk_t *, mp);
		VIONA_RING_STAT_INCR(ring, fail_hcksum);
		return (B_FALSE);
	}

	/*
	 * This is guaranteed to be safe thanks to the header copying
	 * done in viona_tx().
	 */
	eth = (const struct ether_header *)mp->b_rptr;
	ftype = ntohs(eth->ether_type);

	if (ftype == ETHERTYPE_VLAN) {
		const struct ether_vlan_header *veth;

		/* punt on QinQ for now */
		eth_len = sizeof (struct ether_vlan_header);
		veth = (const struct ether_vlan_header *)eth;
		ftype = ntohs(veth->ether_type);
	}

	if (ftype == ETHERTYPE_IP) {
		ipha = (ipha_t *)(mp->b_rptr + eth_len);

		ipproto = ipha->ipha_protocol;
	} else if (ftype == ETHERTYPE_IPV6) {
		ip6_t *ip6h = (ip6_t *)(mp->b_rptr + eth_len);

		ipproto = ip6h->ip6_nxt;
	}

	/*
	 * We ignore hdr_len because the spec says it can't be
	 * trusted.  Besides, our own stack will determine the header
	 * boundary.
	 */
	if ((link->l_cap_csum & HCKSUM_INET_PARTIAL) != 0 &&
	    (hdr->vrh_gso_type & VIRTIO_NET_HDR_GSO_TCPV4) != 0 &&
	    ftype == ETHERTYPE_IP) {
		uint16_t *cksump;
		uint32_t cksum;
		ipaddr_t src = ipha->ipha_src;
		ipaddr_t dst = ipha->ipha_dst;

		/*
		 * Our native IP stack doesn't set the L4 length field
		 * of the pseudo header when LSO is in play.  Other IP
		 * stacks, e.g. Linux, do include the length field.
		 * This is a problem because the hardware expects that
		 * the length field is not set.  When it is set it will
		 * cause an incorrect TCP checksum to be generated.
		 * The reason this works in Linux is because Linux
		 * corrects the pseudo-header checksum in the driver
		 * code.  In order to get the correct HW checksum we
		 * need to assume the guest's IP stack gave us a bogus
		 * TCP partial checksum and calculate it ourselves.
		 */
		cksump = IPH_TCPH_CHECKSUMP(ipha, IPH_HDR_LENGTH(ipha));
		cksum = IP_TCP_CSUM_COMP;
		cksum += (dst >> 16) + (dst & 0xFFFF) +
		    (src >> 16) + (src & 0xFFFF);
		cksum = (cksum & 0xFFFF) + (cksum >> 16);
		*(cksump) = (cksum & 0xFFFF) + (cksum >> 16);
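
		/*
		 * The two folds above are standard ones-complement
		 * reduction: the 32-bit sum over the pseudo-header
		 * words is folded twice so that any carry produced by
		 * the first fold is itself absorbed, leaving a value
		 * that fits the 16-bit checksum field.  Note that the
		 * L4 length is deliberately absent from the sum, per
		 * the hardware expectation described above.
		 */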

		/*
		 * Since viona is a "legacy device", the data stored
		 * by the driver will be in the guest's native endian
		 * format (see sections 2.4.3 and 5.1.6.1 of the
		 * VIRTIO 1.0 spec for more info).  At this time the
		 * only guests using viona are x86 and we can assume
		 * little-endian.
		 */
		lso_info_set(mp, LE_16(hdr->vrh_gso_size), HW_LSO);

		/*
		 * Hardware, like ixgbe, expects the client to request
		 * IP header checksum offload if it's sending LSO (see
		 * ixgbe_get_context()).  Unfortunately, virtio makes
		 * no allowances for negotiating IP header checksum
		 * and HW offload, only TCP checksum.  We add the flag
		 * and zero-out the checksum field.  This mirrors the
		 * behavior of our native IP stack (which does this in
		 * the interest of HW that expects the field to be
		 * zero).
		 */
		flags |= HCK_IPV4_HDRCKSUM;
		ipha->ipha_hdr_checksum = 0;
	}

	/*
	 * Use DB_CKSUMFLAGS instead of mac_hcksum_get() to make sure
	 * HW_LSO, if present, is not lost.
	 */
	flags |= DB_CKSUMFLAGS(mp);

	/*
	 * Partial checksum support from the NIC is ideal, since it most
	 * closely maps to the interface defined by virtio.
	 */
	if ((link->l_cap_csum & HCKSUM_INET_PARTIAL) != 0 &&
	    (ipproto == IPPROTO_TCP || ipproto == IPPROTO_UDP)) {
		/*
		 * MAC expects these offsets to be relative to the
		 * start of the L3 header rather than the L2 frame.
		 */
		flags |= HCK_PARTIALCKSUM;
		mac_hcksum_set(mp, csum_start - eth_len, csum_stuff - eth_len,
		    len - eth_len, 0, flags);
		return (B_TRUE);
	}
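
	/*
	 * For a plain (untagged) IPv4/TCP frame, for example, the guest
	 * supplies csum_start = 34 (14 bytes of Ethernet plus 20 of IP) and
	 * csum_stuff = 50; once eth_len is subtracted out, MAC is handed a
	 * start of 20 and a stuff offset of 36.
	 */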

	/*
	 * Without partial checksum support, look to the L3/L4 protocol
	 * information to see if the NIC can handle it.  If not, the
	 * checksum will need to be calculated inline.
	 */
	if (ftype == ETHERTYPE_IP) {
		if ((link->l_cap_csum & HCKSUM_INET_FULL_V4) != 0 &&
		    (ipproto == IPPROTO_TCP || ipproto == IPPROTO_UDP)) {
			uint16_t *csump = (uint16_t *)(mp->b_rptr + csum_stuff);
			*csump = 0;
			flags |= HCK_FULLCKSUM;
			mac_hcksum_set(mp, 0, 0, 0, 0, flags);
			return (B_TRUE);
		}

		/* XXX: Implement manual fallback checksumming? */
		VIONA_PROBE2(fail_hcksum, viona_link_t *, link, mblk_t *, mp);
		VIONA_RING_STAT_INCR(ring, fail_hcksum);
		return (B_FALSE);
	} else if (ftype == ETHERTYPE_IPV6) {
		if ((link->l_cap_csum & HCKSUM_INET_FULL_V6) != 0 &&
		    (ipproto == IPPROTO_TCP || ipproto == IPPROTO_UDP)) {
			uint16_t *csump = (uint16_t *)(mp->b_rptr + csum_stuff);
			*csump = 0;
			flags |= HCK_FULLCKSUM;
			mac_hcksum_set(mp, 0, 0, 0, 0, flags);
			return (B_TRUE);
		}

		/* XXX: Implement manual fallback checksumming? */
		VIONA_PROBE2(fail_hcksum6, viona_link_t *, link, mblk_t *, mp);
		VIONA_RING_STAT_INCR(ring, fail_hcksum6);
		return (B_FALSE);
	}

	/* Cannot even emulate hcksum for unrecognized protocols */
	VIONA_PROBE2(fail_hcksum_proto, viona_link_t *, link, mblk_t *, mp);
	VIONA_RING_STAT_INCR(ring, fail_hcksum_proto);
	return (B_FALSE);
}
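
/*
 * Transmit a single frame from the ring.  The packet headers are always
 * copied into a host-owned mblk, while the remaining guest data is either
 * loaned out via desballoc() (tracked through a viona_desb_t so its release
 * can be detected) or, when full copying is required, duplicated so that the
 * descriptor chain can be retired immediately.
 */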

static void
viona_tx(viona_link_t *link, viona_vring_t *ring)
{
	struct iovec *iov = ring->vr_txiov;
	const uint_t max_segs = ring->vr_size;
	uint16_t cookie;
	int i, n;
	uint32_t len, base_off = 0;
	uint32_t min_copy = VIONA_MAX_HDRS_LEN;
	mblk_t *mp_head, *mp_tail, *mp;
	viona_desb_t *dp = NULL;
	mac_client_handle_t link_mch = link->l_mch;
	const struct virtio_net_hdr *hdr;
	vmm_page_t *pages = NULL;

	mp_head = mp_tail = NULL;

	ASSERT(iov != NULL);

	n = vq_popchain(ring, iov, max_segs, &cookie, &pages);
	if (n == 0) {
		VIONA_PROBE1(tx_absent, viona_vring_t *, ring);
		VIONA_RING_STAT_INCR(ring, tx_absent);
		return;
	} else if (n < 0) {
		/*
		 * Any error encountered in vq_popchain has already resulted in
		 * specific probe and statistic handling.  Further action here
		 * is unnecessary.
		 */
		return;
	}

	/* Grab the header and ensure it is of adequate length */
	hdr = (const struct virtio_net_hdr *)iov[0].iov_base;
	len = iov[0].iov_len;
	if (len < sizeof (struct virtio_net_hdr)) {
		goto drop_fail;
	}

	/* Make sure the packet headers are always in the first mblk. */
	if (ring->vr_txdesb != NULL) {
		dp = &ring->vr_txdesb[cookie];

		/*
		 * If the guest driver is operating properly, each desb slot
		 * should be available for use when processing a TX descriptor
		 * from the 'avail' ring.  In the case of drivers that reuse a
		 * descriptor before it has been posted to the 'used' ring, the
		 * data is simply dropped.
		 */
		if (atomic_cas_uint(&dp->d_ref, 0, 1) != 0) {
			dp = NULL;
			goto drop_fail;
		}

		dp->d_cookie = cookie;
		mp_head = desballoc(dp->d_headers, VIONA_MAX_HDRS_LEN, 0,
		    &dp->d_frtn);

		/* Account for the successful desballoc. */
		if (mp_head != NULL)
			dp->d_ref++;
	} else {
		mp_head = allocb(VIONA_MAX_HDRS_LEN, 0);
	}

	if (mp_head == NULL)
		goto drop_fail;

	mp_tail = mp_head;

	/*
	 * We always copy enough of the guest data to cover the
	 * headers.  This protects us from TOCTOU attacks and allows
	 * message block length assumptions to be made in subsequent
	 * code.  In many cases, this means copying more data than
	 * strictly necessary.  That's okay, as it is the larger packets
	 * (such as LSO) that really benefit from desballoc().
	 */
	for (i = 1; i < n; i++) {
		const uint32_t to_copy = MIN(min_copy, iov[i].iov_len);

		bcopy(iov[i].iov_base, mp_head->b_wptr, to_copy);
		mp_head->b_wptr += to_copy;
		len += to_copy;
		min_copy -= to_copy;

		/*
		 * We've met the minimum copy requirement.  The rest of
		 * the guest data can be referenced.
		 */
		if (min_copy == 0) {
			/*
			 * If we copied all contents of this
			 * descriptor then move onto the next one.
			 * Otherwise, record how far we are into the
			 * current descriptor.
			 */
			if (iov[i].iov_len == to_copy)
				i++;
			else
				base_off = to_copy;

			break;
		}
	}

	ASSERT3P(mp_head, !=, NULL);
	ASSERT3P(mp_tail, !=, NULL);

	for (; i < n; i++) {
		uintptr_t base = (uintptr_t)iov[i].iov_base + base_off;
		uint32_t chunk = iov[i].iov_len - base_off;

		ASSERT3U(base_off, <, iov[i].iov_len);
		ASSERT3U(chunk, >, 0);

		if (dp != NULL) {
			mp = desballoc((uchar_t *)base, chunk, 0, &dp->d_frtn);
			if (mp == NULL) {
				goto drop_fail;
			}
			dp->d_ref++;
		} else {
			mp = allocb(chunk, BPRI_MED);
			if (mp == NULL) {
				goto drop_fail;
			}
			bcopy((uchar_t *)base, mp->b_wptr, chunk);
		}

		base_off = 0;
		len += chunk;
		mp->b_wptr += chunk;
		mp_tail->b_cont = mp;
		mp_tail = mp;
	}

	if (VNETHOOK_INTERESTED_OUT(link->l_neti)) {
		/*
		 * The hook consumer may elect to free the mblk_t and set
		 * our mblk_t ** to NULL.  When using a viona_desb_t
		 * (dp != NULL), we do not want the corresponding cleanup to
		 * occur during the viona_hook() call.  We instead want to
		 * reset and recycle dp for future use.  To prevent cleanup
		 * during the viona_hook() call, we take a ref on dp (if being
		 * used), and release it on success.  On failure, the
		 * freemsgchain() call will release all the refs taken earlier
		 * in viona_tx() (aside from the initial ref and the one we
		 * take), and drop_hook will reset dp for reuse.
		 */
		if (dp != NULL)
			dp->d_ref++;

		/*
		 * Pass &mp instead of &mp_head so we don't lose track of
		 * mp_head if the hook consumer (i.e. ipf) elects to free mp
		 * and set mp to NULL.
		 */
		mp = mp_head;
		if (viona_hook(link, ring, &mp, B_TRUE) != 0) {
			if (mp != NULL)
				freemsgchain(mp);
			goto drop_hook;
		}

		if (dp != NULL) {
			dp->d_ref--;

			/*
			 * It is possible that the hook(s) accepted the packet,
			 * but as part of its processing, it issued a pull-up
			 * which released all references to the desb.  In that
			 * case, go back to acting like the packet is entirely
			 * copied (which it is).
			 */
			if (dp->d_ref == 1) {
				dp->d_cookie = 0;
				dp->d_ref = 0;
				dp = NULL;
			}
		}
	}

	/*
	 * Request hardware checksumming, if necessary.  If the guest
	 * sent an LSO packet then it must have also negotiated and
	 * requested partial checksum; therefore the LSO logic is
	 * contained within viona_tx_csum().
	 */
	if ((link->l_features & VIRTIO_NET_F_CSUM) != 0 &&
	    (hdr->vrh_flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) != 0) {
		if (!viona_tx_csum(ring, hdr, mp_head, len - iov[0].iov_len)) {
			goto drop_fail;
		}
	}

	if (dp != NULL) {
		dp->d_len = len;
		dp->d_pages = pages;
		mutex_enter(&ring->vr_lock);
		ring->vr_xfer_outstanding++;
		mutex_exit(&ring->vr_lock);
	} else {
		/*
		 * If the data was cloned out of the ring, the descriptors can
		 * be marked as 'used' now, rather than deferring that action
		 * until after successful packet transmission.
		 */
		vmm_drv_page_release_chain(pages);
		viona_tx_done(ring, len, cookie);
	}

	/*
	 * We're potentially going deep into the networking layer; make sure
	 * the guest can't run concurrently.
	 */
	smt_begin_unsafe();
	/*
	 * Ignore, for now, any signal from MAC about whether the outgoing
	 * packet was dropped or not.
	 */
	(void) mac_tx(link_mch, mp_head, 0, MAC_DROP_ON_NO_DESC, NULL);
	smt_end_unsafe();
	return;

drop_fail:
	/*
	 * On the off chance that memory is not available via the desballoc or
	 * allocb calls, there are few options left besides to fail and drop
	 * the frame on the floor.
	 */

	if (dp != NULL) {
		/*
		 * Take an additional reference on the desb handle (if present)
		 * so any desballoc-sourced mblks can release their hold on it
		 * without the handle reaching its final state and executing
		 * its clean-up logic.
		 */
		dp->d_ref++;
	}

	/*
	 * Free any already-allocated blocks and sum up the total length of the
	 * dropped data to be released to the used ring.
	 */
	freemsgchain(mp_head);

drop_hook:
	len = 0;
	for (uint_t i = 0; i < n; i++) {
		len += iov[i].iov_len;
	}

	if (dp != NULL) {
		VERIFY(dp->d_ref == 2);

		/* Clean up the desb handle, releasing the extra hold. */
		dp->d_len = 0;
		dp->d_cookie = 0;
		dp->d_ref = 0;
	}

	VIONA_PROBE3(tx_drop, viona_vring_t *, ring, uint32_t, len,
	    uint16_t, cookie);
	vmm_drv_page_release_chain(pages);
	viona_tx_done(ring, len, cookie);
}