/*
 * Copyright (c) 2013 Chris Torek <torek @ torek net>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source.  A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 *
 * Copyright 2015 Pluribus Networks Inc.
 * Copyright 2019 Joyent, Inc.
 * Copyright 2022 Oxide Computer Company
 */


#include <sys/types.h>
#include <sys/smt.h>
#include <sys/strsubr.h>

#include <sys/pattr.h>
#include <sys/dlpi.h>
#include <inet/ip.h>
#include <inet/ip_impl.h>

#include "viona_impl.h"

#define	BNXE_NIC_DRIVER		"bnxe"

/*
 * Tunable which controls whether TX copying is enabled by default.
 */
boolean_t viona_default_tx_copy = B_TRUE;

/*
 * Copy TX frames out of the virtio ring (rather than loaning guest pages) so
 * that freeing ring resources does not have to wait on packet transmission.
 */
kmutex_t viona_force_copy_lock;
static enum viona_force_copy {
	VFC_UNINITIALIZED	= 0,
	VFC_COPY_UNNEEDED	= 1,
	VFC_COPY_REQUIRED	= 2,
} viona_force_copy_state = VFC_UNINITIALIZED;

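/*
 * A viona_desb_t tracks the state of one zero-copy TX transfer: the
 * free-routine handed to desballoc(), a count of outstanding mblk references,
 * the copied packet headers, and the guest pages held for the transfer.
 */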
struct viona_desb {
	frtn_t			d_frtn;
	viona_vring_t		*d_ring;
	uint_t			d_ref;
	uint32_t		d_len;
	uint16_t		d_cookie;
	uchar_t			*d_headers;
	vmm_page_t		*d_pages;
};

static void viona_tx(viona_link_t *, viona_vring_t *);
static void viona_desb_release(viona_desb_t *);


static void
viona_tx_wait_outstanding(viona_vring_t *ring)
{
	ASSERT(MUTEX_HELD(&ring->vr_lock));

	while (ring->vr_xfer_outstanding != 0) {
		/*
		 * Paying heed to signals is counterproductive here.  This is a
		 * very tight loop if pending transfers take an extended amount
		 * of time to be reclaimed while the host process is exiting.
		 */
		cv_wait(&ring->vr_cv, &ring->vr_lock);
	}
}

/*
 * Check if full TX packet copying is needed.  This should not be called from
 * viona attach()/detach() context.
 */
static boolean_t
viona_tx_copy_needed(void)
{
	boolean_t result;

	if (viona_default_tx_copy) {
		return (B_TRUE);
	}

	mutex_enter(&viona_force_copy_lock);
	if (viona_force_copy_state == VFC_UNINITIALIZED) {
		major_t bnxe_major;

		/*
		 * The original code for viona featured an explicit check for
		 * the bnxe driver which, when found present, necessitated that
		 * all transmissions be copied into their own mblks instead of
		 * passing guest memory to the underlying device.
		 *
		 * The motivations for this are unclear, but until it can be
		 * proven unnecessary, the check lives on.
		 */
		viona_force_copy_state = VFC_COPY_UNNEEDED;
		if ((bnxe_major = ddi_name_to_major(BNXE_NIC_DRIVER))
		    != DDI_MAJOR_T_NONE) {
			if (ddi_hold_installed_driver(bnxe_major) != NULL) {
				viona_force_copy_state = VFC_COPY_REQUIRED;
				ddi_rele_driver(bnxe_major);
			}
		}
	}
	result = (viona_force_copy_state == VFC_COPY_REQUIRED);
	mutex_exit(&viona_force_copy_lock);

	return (result);
}

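/*
 * Allocate the per-ring TX resources: an array of desb handles (when
 * zero-copy transmission is permitted) and an iovec array sized to the ring.
 */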
void
viona_tx_ring_alloc(viona_vring_t *ring, const uint16_t qsz)
{
	/* Allocate desb handles for TX ring if packet copying is disabled */
	if (!viona_tx_copy_needed()) {
		viona_desb_t *dp;

		dp = kmem_zalloc(sizeof (viona_desb_t) * qsz, KM_SLEEP);
		ring->vr_txdesb = dp;
		for (uint_t i = 0; i < qsz; i++, dp++) {
			dp->d_frtn.free_func = viona_desb_release;
			dp->d_frtn.free_arg = (void *)dp;
			dp->d_ring = ring;
			dp->d_headers = kmem_zalloc(VIONA_MAX_HDRS_LEN,
			    KM_SLEEP);
		}
	}

	/* Allocate ring-sized iovec buffers for TX */
	ring->vr_txiov = kmem_alloc(sizeof (struct iovec) * qsz, KM_SLEEP);
}

void
viona_tx_ring_free(viona_vring_t *ring, const uint16_t qsz)
{
	if (ring->vr_txdesb != NULL) {
		viona_desb_t *dp = ring->vr_txdesb;

		for (uint_t i = 0; i < qsz; i++, dp++) {
			kmem_free(dp->d_headers, VIONA_MAX_HDRS_LEN);
		}
		kmem_free(ring->vr_txdesb, sizeof (viona_desb_t) * qsz);
		ring->vr_txdesb = NULL;
	}

	if (ring->vr_txiov != NULL) {
		kmem_free(ring->vr_txiov, sizeof (struct iovec) * qsz);
		ring->vr_txiov = NULL;
	}
}

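/*
 * Return a completed descriptor chain of the given length to the guest via
 * the 'used' ring and deliver a ring interrupt if one is warranted.
 */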
static void
viona_tx_done(viona_vring_t *ring, uint32_t len, uint16_t cookie)
{
	vq_pushchain(ring, len, cookie);

	membar_enter();
	viona_intr_ring(ring, B_FALSE);
}

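/*
 * Number of frames transmitted in one burst before the worker pauses to check
 * whether the ring has been asked to bail out.
 */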
#define	TX_BURST_THRESH		32

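/*
 * Worker loop for a TX ring: drain the available descriptors, transmitting
 * each via viona_tx(), then re-enable notifications and sleep until the guest
 * kicks the ring again or the ring must stop or renew its lease.
 */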
void
viona_worker_tx(viona_vring_t *ring, viona_link_t *link)
{
	(void) thread_vsetname(curthread, "viona_tx_%p", ring);

	ASSERT(MUTEX_HELD(&ring->vr_lock));
	ASSERT3U(ring->vr_state, ==, VRS_RUN);

	mutex_exit(&ring->vr_lock);

	for (;;) {
		uint_t ntx = 0, burst = 0;

		viona_ring_disable_notify(ring);
		while (viona_ring_num_avail(ring) != 0) {
			viona_tx(link, ring);
			ntx++;
			burst++;

			/*
			 * It is advantageous for throughput to keep this
			 * transmission loop tight, but periodic breaks to
			 * check for other events are of value too.
			 */
			if (burst >= TX_BURST_THRESH) {
				mutex_enter(&ring->vr_lock);
				const bool need_bail = vring_need_bail(ring);
				mutex_exit(&ring->vr_lock);

				if (need_bail) {
					break;
				}
				burst = 0;
			}
		}

		VIONA_PROBE2(tx, viona_link_t *, link, uint_t, ntx);

		/*
		 * Check for available descriptors on the ring once more in
		 * case a late addition raced with the NO_NOTIFY flag toggle.
		 *
		 * The barrier ensures that visibility of the no-notify
		 * store does not cross the viona_ring_num_avail() check below.
		 */
		viona_ring_enable_notify(ring);
		membar_enter();

		if (viona_ring_num_avail(ring) == 0 &&
		    (link->l_features & VIRTIO_F_RING_NOTIFY_ON_EMPTY) != 0) {
			/*
			 * The NOTIFY_ON_EMPTY interrupt should not pay heed to
			 * the presence of AVAIL_NO_INTERRUPT.
			 */
			viona_intr_ring(ring, B_TRUE);
		}

		mutex_enter(&ring->vr_lock);
		for (;;) {
			if (vring_need_bail(ring)) {
				ring->vr_state = VRS_STOP;
				viona_tx_wait_outstanding(ring);
				return;
			}

			if (vmm_drv_lease_expired(ring->vr_lease)) {
				ring->vr_state_flags |= VRSF_RENEW;
				/*
				 * When renewing the lease for the ring, no TX
				 * frames may be outstanding, as they contain
				 * references to guest memory.
				 */
				viona_tx_wait_outstanding(ring);

				const boolean_t renewed =
				    viona_ring_lease_renew(ring);
				ring->vr_state_flags &= ~VRSF_RENEW;

				if (!renewed) {
					/* stop ring on failed renewal */
					ring->vr_state = VRS_STOP;
					return;
				}
			}

			if (viona_ring_num_avail(ring) != 0) {
				break;
			}

			/* Wait for further activity on the ring */
			(void) cv_wait_sig(&ring->vr_cv, &ring->vr_lock);
		}
		mutex_exit(&ring->vr_lock);
	}
	/* UNREACHABLE */
}

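/*
 * Free-routine invoked as the mblks backed by a desb handle are released.
 * When the final data reference is dropped, the held guest pages are
 * released, the descriptor is returned to the guest, and any waiter on
 * outstanding transfers is signalled.
 */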
static void
viona_desb_release(viona_desb_t *dp)
{
	viona_vring_t *ring = dp->d_ring;
	uint_t ref;
	uint32_t len;
	uint16_t cookie;

	ref = atomic_dec_uint_nv(&dp->d_ref);
	if (ref > 1) {
		return;
	}

	/*
	 * The desb corresponding to this index must be ready for reuse before
	 * the descriptor is returned to the guest via the 'used' ring.
	 */
	len = dp->d_len;
	cookie = dp->d_cookie;
	dp->d_len = 0;
	dp->d_cookie = 0;
	vmm_drv_page_release_chain(dp->d_pages);
	dp->d_pages = NULL;

	/*
	 * Ensure all other changes to the desb are visible prior to zeroing
	 * its refcount, signifying its readiness for reuse.
	 */
	membar_exit();
	dp->d_ref = 0;

	viona_tx_done(ring, len, cookie);

	mutex_enter(&ring->vr_lock);
	if ((--ring->vr_xfer_outstanding) == 0) {
		cv_broadcast(&ring->vr_cv);
	}
	mutex_exit(&ring->vr_lock);
}

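/*
 * Translate the guest's virtio checksum (and LSO) request into the offload
 * flags understood by MAC, validating the supplied offsets against the packet
 * first.  Returns B_FALSE if the request cannot be honored, in which case the
 * caller drops the frame.
 */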
static boolean_t
viona_tx_csum(viona_vring_t *ring, const struct virtio_net_hdr *hdr,
    mblk_t *mp, uint32_t len)
{
	viona_link_t *link = ring->vr_link;
	const struct ether_header *eth;
	uint_t eth_len = sizeof (struct ether_header);
	ushort_t ftype;
	ipha_t *ipha = NULL;
	uint8_t ipproto = IPPROTO_NONE; /* NONE is not exactly right, but ok */
	uint16_t flags = 0;
	const uint_t csum_start = hdr->vrh_csum_start;
	const uint_t csum_stuff = hdr->vrh_csum_offset + csum_start;

	/*
	 * Validate that the checksum offsets provided by the guest are within
	 * the bounds of the packet.  Additionally, ensure that the checksum
	 * contents field is within the headers mblk copied by viona_tx().
	 */
	if (csum_start >= len || csum_start < eth_len || csum_stuff >= len ||
	    (csum_stuff + sizeof (uint16_t)) > MBLKL(mp)) {
		VIONA_PROBE2(fail_hcksum, viona_link_t *, link, mblk_t *, mp);
		VIONA_RING_STAT_INCR(ring, fail_hcksum);
		return (B_FALSE);
	}

	/*
	 * This is guaranteed to be safe thanks to the header copying
	 * done in viona_tx().
	 */
	eth = (const struct ether_header *)mp->b_rptr;
	ftype = ntohs(eth->ether_type);

	if (ftype == ETHERTYPE_VLAN) {
		const struct ether_vlan_header *veth;

		/* punt on QinQ for now */
		eth_len = sizeof (struct ether_vlan_header);
		veth = (const struct ether_vlan_header *)eth;
		ftype = ntohs(veth->ether_type);
	}

	if (ftype == ETHERTYPE_IP) {
		ipha = (ipha_t *)(mp->b_rptr + eth_len);

		ipproto = ipha->ipha_protocol;
	} else if (ftype == ETHERTYPE_IPV6) {
		ip6_t *ip6h = (ip6_t *)(mp->b_rptr + eth_len);

		ipproto = ip6h->ip6_nxt;
	}

	/*
	 * We ignore hdr_len because the spec says it can't be
	 * trusted.  Besides, our own stack will determine the header
	 * boundary.
	 */
	if ((link->l_cap_csum & HCKSUM_INET_PARTIAL) != 0 &&
	    (hdr->vrh_gso_type & VIRTIO_NET_HDR_GSO_TCPV4) != 0 &&
	    ftype == ETHERTYPE_IP) {
		uint16_t *cksump;
		uint32_t cksum;
		ipaddr_t src = ipha->ipha_src;
		ipaddr_t dst = ipha->ipha_dst;

		/*
		 * Our native IP stack doesn't set the L4 length field
		 * of the pseudo header when LSO is in play.  Other IP
		 * stacks, e.g. Linux, do include the length field.
		 * This is a problem because the hardware expects that
		 * the length field is not set.  When it is set it will
		 * cause an incorrect TCP checksum to be generated.
		 * The reason this works in Linux is because Linux
		 * corrects the pseudo-header checksum in the driver
		 * code.  In order to get the correct HW checksum we
		 * need to assume the guest's IP stack gave us a bogus
		 * TCP partial checksum and calculate it ourselves.
		 */
		cksump = IPH_TCPH_CHECKSUMP(ipha, IPH_HDR_LENGTH(ipha));
		cksum = IP_TCP_CSUM_COMP;
		cksum += (dst >> 16) + (dst & 0xFFFF) +
		    (src >> 16) + (src & 0xFFFF);
		cksum = (cksum & 0xFFFF) + (cksum >> 16);
		*(cksump) = (cksum & 0xFFFF) + (cksum >> 16);

		/*
		 * Since viona is a "legacy device", the data stored
		 * by the driver will be in the guest's native endian
		 * format (see sections 2.4.3 and 5.1.6.1 of the
		 * VIRTIO 1.0 spec for more info).  At this time the
		 * only guests using viona are x86 and we can assume
		 * little-endian.
		 */
		lso_info_set(mp, LE_16(hdr->vrh_gso_size), HW_LSO);

		/*
		 * Hardware, like ixgbe, expects the client to request
		 * IP header checksum offload if it's sending LSO (see
		 * ixgbe_get_context()).  Unfortunately, virtio makes
		 * no allowances for negotiating IP header checksum
		 * and HW offload, only TCP checksum.  We add the flag
		 * and zero-out the checksum field.  This mirrors the
		 * behavior of our native IP stack (which does this in
		 * the interest of HW that expects the field to be
		 * zero).
		 */
		flags |= HCK_IPV4_HDRCKSUM;
		ipha->ipha_hdr_checksum = 0;
	}

	/*
	 * Use DB_CKSUMFLAGS instead of mac_hcksum_get() to make sure
	 * HW_LSO, if present, is not lost.
	 */
	flags |= DB_CKSUMFLAGS(mp);

	/*
	 * Partial checksum support from the NIC is ideal, since it most
	 * closely maps to the interface defined by virtio.
	 */
	if ((link->l_cap_csum & HCKSUM_INET_PARTIAL) != 0 &&
	    (ipproto == IPPROTO_TCP || ipproto == IPPROTO_UDP)) {
		/*
		 * MAC expects these offsets to be relative to the
		 * start of the L3 header rather than the L2 frame.
		 */
		flags |= HCK_PARTIALCKSUM;
		mac_hcksum_set(mp, csum_start - eth_len, csum_stuff - eth_len,
		    len - eth_len, 0, flags);
		return (B_TRUE);
	}

	/*
	 * Without partial checksum support, look to the L3/L4 protocol
	 * information to see if the NIC can handle it.  If not, the
	 * checksum will need to be calculated inline.
	 */
	if (ftype == ETHERTYPE_IP) {
		if ((link->l_cap_csum & HCKSUM_INET_FULL_V4) != 0 &&
		    (ipproto == IPPROTO_TCP || ipproto == IPPROTO_UDP)) {
			uint16_t *csump = (uint16_t *)(mp->b_rptr + csum_stuff);
			*csump = 0;
			flags |= HCK_FULLCKSUM;
			mac_hcksum_set(mp, 0, 0, 0, 0, flags);
			return (B_TRUE);
		}

		/* XXX: Implement manual fallback checksumming? */
		VIONA_PROBE2(fail_hcksum, viona_link_t *, link, mblk_t *, mp);
		VIONA_RING_STAT_INCR(ring, fail_hcksum);
		return (B_FALSE);
	} else if (ftype == ETHERTYPE_IPV6) {
		if ((link->l_cap_csum & HCKSUM_INET_FULL_V6) != 0 &&
		    (ipproto == IPPROTO_TCP || ipproto == IPPROTO_UDP)) {
			uint16_t *csump = (uint16_t *)(mp->b_rptr + csum_stuff);
			*csump = 0;
			flags |= HCK_FULLCKSUM;
			mac_hcksum_set(mp, 0, 0, 0, 0, flags);
			return (B_TRUE);
		}

		/* XXX: Implement manual fallback checksumming? */
		VIONA_PROBE2(fail_hcksum6, viona_link_t *, link, mblk_t *, mp);
		VIONA_RING_STAT_INCR(ring, fail_hcksum6);
		return (B_FALSE);
	}

	/* Cannot even emulate hcksum for unrecognized protocols */
	VIONA_PROBE2(fail_hcksum_proto, viona_link_t *, link, mblk_t *, mp);
	VIONA_RING_STAT_INCR(ring, fail_hcksum_proto);
	return (B_FALSE);
}

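/*
 * Transmit a single frame from the ring: pop a descriptor chain, copy the
 * packet headers (and, when copying is forced, the payload) into fresh mblks,
 * loan out any remaining guest buffers via desballoc(), apply checksum/LSO
 * offload requests, and hand the result to MAC.
 */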
static void
viona_tx(viona_link_t *link, viona_vring_t *ring)
{
	struct iovec *iov = ring->vr_txiov;
	const uint_t max_segs = ring->vr_size;
	uint16_t cookie;
	int i, n;
	uint32_t len, base_off = 0;
	uint32_t min_copy = VIONA_MAX_HDRS_LEN;
	mblk_t *mp_head, *mp_tail, *mp;
	viona_desb_t *dp = NULL;
	mac_client_handle_t link_mch = link->l_mch;
	const struct virtio_net_hdr *hdr;
	vmm_page_t *pages = NULL;

	mp_head = mp_tail = NULL;

	ASSERT(iov != NULL);

	n = vq_popchain(ring, iov, max_segs, &cookie, &pages);
	if (n == 0) {
		VIONA_PROBE1(tx_absent, viona_vring_t *, ring);
		VIONA_RING_STAT_INCR(ring, tx_absent);
		return;
	} else if (n < 0) {
		/*
		 * Any error encountered in vq_popchain has already resulted in
		 * specific probe and statistic handling.  Further action here
		 * is unnecessary.
		 */
		return;
	}

	/* Grab the header and ensure it is of adequate length */
	hdr = (const struct virtio_net_hdr *)iov[0].iov_base;
	len = iov[0].iov_len;
	if (len < sizeof (struct virtio_net_hdr)) {
		goto drop_fail;
	}

	/* Make sure the packet headers are always in the first mblk. */
	if (ring->vr_txdesb != NULL) {
		dp = &ring->vr_txdesb[cookie];

		/*
		 * If the guest driver is operating properly, each desb slot
		 * should be available for use when processing a TX descriptor
		 * from the 'avail' ring.  In the case of drivers that reuse a
		 * descriptor before it has been posted to the 'used' ring, the
		 * data is simply dropped.
		 */
		if (atomic_cas_uint(&dp->d_ref, 0, 1) != 0) {
			dp = NULL;
			goto drop_fail;
		}

		dp->d_cookie = cookie;
		mp_head = desballoc(dp->d_headers, VIONA_MAX_HDRS_LEN, 0,
		    &dp->d_frtn);

		/* Account for the successful desballoc. */
		if (mp_head != NULL)
			dp->d_ref++;
	} else {
		mp_head = allocb(VIONA_MAX_HDRS_LEN, 0);
	}

	if (mp_head == NULL)
		goto drop_fail;

	mp_tail = mp_head;

	/*
	 * We always copy enough of the guest data to cover the
	 * headers.  This protects us from TOCTOU attacks and allows
	 * message block length assumptions to be made in subsequent
	 * code.  In many cases, this means copying more data than
	 * strictly necessary.  That's okay, as it is the larger packets
	 * (such as LSO) that really benefit from desballoc().
	 */
	for (i = 1; i < n; i++) {
		const uint32_t to_copy = MIN(min_copy, iov[i].iov_len);

		bcopy(iov[i].iov_base, mp_head->b_wptr, to_copy);
		mp_head->b_wptr += to_copy;
		len += to_copy;
		min_copy -= to_copy;

		/*
		 * We've met the minimum copy requirement.  The rest of
		 * the guest data can be referenced.
		 */
		if (min_copy == 0) {
			/*
			 * If we copied all contents of this
			 * descriptor then move onto the next one.
			 * Otherwise, record how far we are into the
			 * current descriptor.
			 */
			if (iov[i].iov_len == to_copy)
				i++;
			else
				base_off = to_copy;

			break;
		}
	}

	ASSERT3P(mp_head, !=, NULL);
	ASSERT3P(mp_tail, !=, NULL);

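	/*
	 * Attach the remaining guest buffers: loan them out via desballoc()
	 * when a desb handle is in use, otherwise copy them into new mblks.
	 */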
	for (; i < n; i++) {
		uintptr_t base = (uintptr_t)iov[i].iov_base + base_off;
		uint32_t chunk = iov[i].iov_len - base_off;

		ASSERT3U(base_off, <, iov[i].iov_len);
		ASSERT3U(chunk, >, 0);

		if (dp != NULL) {
			mp = desballoc((uchar_t *)base, chunk, 0, &dp->d_frtn);
			if (mp == NULL) {
				goto drop_fail;
			}
			dp->d_ref++;
		} else {
			mp = allocb(chunk, BPRI_MED);
			if (mp == NULL) {
				goto drop_fail;
			}
			bcopy((uchar_t *)base, mp->b_wptr, chunk);
		}

		base_off = 0;
		len += chunk;
		mp->b_wptr += chunk;
		mp_tail->b_cont = mp;
		mp_tail = mp;
	}

	if (VNETHOOK_INTERESTED_OUT(link->l_neti)) {
		/*
		 * The hook consumer may elect to free the mblk_t and set
		 * our mblk_t ** to NULL.  When using a viona_desb_t
		 * (dp != NULL), we do not want the corresponding cleanup to
		 * occur during the viona_hook() call.  We instead want to
		 * reset and recycle dp for future use.  To prevent cleanup
		 * during the viona_hook() call, we take a ref on dp (if being
		 * used), and release it on success.  On failure, the
		 * freemsgchain() call will release all the refs taken earlier
		 * in viona_tx() (aside from the initial ref and the one we
		 * take), and drop_hook will reset dp for reuse.
		 */
		if (dp != NULL)
			dp->d_ref++;

		/*
		 * Pass &mp instead of &mp_head so we don't lose track of
		 * mp_head if the hook consumer (i.e. ipf) elects to free mp
		 * and set mp to NULL.
		 */
		mp = mp_head;
		if (viona_hook(link, ring, &mp, B_TRUE) != 0) {
			if (mp != NULL)
				freemsgchain(mp);
			goto drop_hook;
		}

		if (dp != NULL) {
			dp->d_ref--;

			/*
			 * It is possible that the hook(s) accepted the packet,
			 * but as part of its processing, it issued a pull-up
			 * which released all references to the desb.  In that
			 * case, go back to acting like the packet is entirely
			 * copied (which it is).
			 */
			if (dp->d_ref == 1) {
				dp->d_cookie = 0;
				dp->d_ref = 0;
				dp = NULL;
			}
		}
	}

	/*
	 * Request hardware checksumming, if necessary.  If the guest
	 * sent an LSO packet then it must have also negotiated and
	 * requested partial checksum; therefore the LSO logic is
	 * contained within viona_tx_csum().
	 */
	if ((link->l_features & VIRTIO_NET_F_CSUM) != 0 &&
	    (hdr->vrh_flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) != 0) {
		if (!viona_tx_csum(ring, hdr, mp_head, len - iov[0].iov_len)) {
			goto drop_fail;
		}
	}

	if (dp != NULL) {
		dp->d_len = len;
		dp->d_pages = pages;
		mutex_enter(&ring->vr_lock);
		ring->vr_xfer_outstanding++;
		mutex_exit(&ring->vr_lock);
	} else {
		/*
		 * If the data was cloned out of the ring, the descriptors can
		 * be marked as 'used' now, rather than deferring that action
		 * until after successful packet transmission.
		 */
		vmm_drv_page_release_chain(pages);
		viona_tx_done(ring, len, cookie);
	}

	/*
	 * We're potentially going deep into the networking layer; make sure
	 * the guest can't run concurrently.
	 */
	smt_begin_unsafe();
	/*
	 * Ignore, for now, any signal from MAC about whether the outgoing
	 * packet was dropped or not.
	 */
	(void) mac_tx(link_mch, mp_head, 0, MAC_DROP_ON_NO_DESC, NULL);
	smt_end_unsafe();
	return;

drop_fail:
	/*
	 * On the off chance that memory is not available via the desballoc or
	 * allocb calls, there are few options left besides failing and
	 * dropping the frame on the floor.
	 */

	if (dp != NULL) {
		/*
		 * Take an additional reference on the desb handle (if present)
		 * so any desballoc-sourced mblks can release their hold on it
		 * without the handle reaching its final state and executing
		 * its clean-up logic.
		 */
		dp->d_ref++;
	}

	/*
	 * Free any already-allocated blocks and sum up the total length of the
	 * dropped data to be released to the used ring.
	 */
	freemsgchain(mp_head);

drop_hook:
	len = 0;
	for (uint_t i = 0; i < n; i++) {
		len += iov[i].iov_len;
	}

	if (dp != NULL) {
		VERIFY(dp->d_ref == 2);

		/* Clean up the desb handle, releasing the extra hold. */
		dp->d_len = 0;
		dp->d_cookie = 0;
		dp->d_ref = 0;
	}

	VIONA_PROBE3(tx_drop, viona_vring_t *, ring, uint32_t, len,
	    uint16_t, cookie);
	vmm_drv_page_release_chain(pages);
	viona_tx_done(ring, len, cookie);
}