/*
 * Copyright (c) 2013 Chris Torek <torek @ torek net>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source.  A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 *
 * Copyright 2015 Pluribus Networks Inc.
 * Copyright 2019 Joyent, Inc.
 * Copyright 2025 Oxide Computer Company
 */


#include <sys/types.h>
#include <sys/smt.h>
#include <sys/strsubr.h>

#include <sys/pattr.h>
#include <sys/dlpi.h>
#include <inet/ip.h>
#include <inet/ip_impl.h>

#include "viona_impl.h"

#define	BNXE_NIC_DRIVER		"bnxe"

/*
 * Tunable controlling whether TX copying is enabled by default.
 */
boolean_t viona_default_tx_copy = B_TRUE;

/*
 * Tunable for maximum configured TX header padding.
 */
uint_t viona_max_header_pad = 256;

/*
 * Copy TX mbufs from the virtio ring to avoid having to wait for packet
 * transmission to complete before the associated resources can be freed.
 */
kmutex_t viona_force_copy_lock;
static enum viona_force_copy {
	VFC_UNINITALIZED = 0,
	VFC_COPY_UNEEDED = 1,
	VFC_COPY_REQUIRED = 2,
} viona_force_copy_state = VFC_UNINITALIZED;

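/*
 * Per-descriptor TX state used when loaning guest memory rather than copying.
 * The freemsg callback (d_frtn) and reference count (d_ref) track the mblks
 * built atop the guest pages so that the descriptor is only returned to the
 * 'used' ring once all of them have been freed.
 */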
struct viona_desb {
	frtn_t			d_frtn;
	viona_vring_t		*d_ring;
	uint_t			d_ref;
	uint32_t		d_len;
	uint16_t		d_cookie;
	uchar_t			*d_headers;
	vmm_page_t		*d_pages;
};

static size_t viona_tx(viona_link_t *, viona_vring_t *);
static void viona_desb_release(viona_desb_t *);


static void
viona_tx_wait_outstanding(viona_vring_t *ring)
{
	ASSERT(MUTEX_HELD(&ring->vr_lock));

	while (ring->vr_xfer_outstanding != 0) {
		/*
		 * Paying heed to signals is counterproductive here.  This is a
		 * very tight loop if pending transfers take an extended amount
		 * of time to be reclaimed while the host process is exiting.
		 */
		cv_wait(&ring->vr_cv, &ring->vr_lock);
	}
}

/*
 * Check if full TX packet copying is needed.  This should not be called from
 * viona attach()/detach() context.
 */
boolean_t
viona_tx_copy_needed(void)
{
	boolean_t result;

	if (viona_default_tx_copy) {
		return (B_TRUE);
	}

	mutex_enter(&viona_force_copy_lock);
	if (viona_force_copy_state == VFC_UNINITALIZED) {
		major_t bnxe_major;

		/*
		 * The original code for viona featured an explicit check for
		 * the bnxe driver which, when found present, necessitated that
		 * all transmissions be copied into their own mblks instead of
		 * passing guest memory to the underlying device.
		 *
		 * The motivations for this are unclear, but until it can be
		 * proven unnecessary, the check lives on.
		 */
		viona_force_copy_state = VFC_COPY_UNEEDED;
		if ((bnxe_major = ddi_name_to_major(BNXE_NIC_DRIVER))
		    != DDI_MAJOR_T_NONE) {
			if (ddi_hold_installed_driver(bnxe_major) != NULL) {
				viona_force_copy_state = VFC_COPY_REQUIRED;
				ddi_rele_driver(bnxe_major);
			}
		}
	}
	result = (viona_force_copy_state == VFC_COPY_REQUIRED);
	mutex_exit(&viona_force_copy_lock);

	return (result);
}

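/*
 * Allocate TX-side resources for a ring: the per-descriptor desb handles used
 * when loaning guest memory (unless copying is forced) and the iovec array
 * handed to vq_popchain() during transmission.
 */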
void
viona_tx_ring_alloc(viona_vring_t *ring, const uint16_t qsz)
{
	const viona_link_params_t *vlp = &ring->vr_link->l_params;

	ring->vr_tx.vrt_header_pad = vlp->vlp_tx_header_pad;
	/* Allocate desb handles for TX ring if packet copying not forced */
	if (!ring->vr_link->l_params.vlp_tx_copy_data) {
		viona_desb_t *dp =
		    kmem_zalloc(sizeof (viona_desb_t) * qsz, KM_SLEEP);
		ring->vr_tx.vrt_desb = dp;

		const size_t header_sz =
		    VIONA_MAX_HDRS_LEN + ring->vr_tx.vrt_header_pad;
		for (uint_t i = 0; i < qsz; i++, dp++) {
			dp->d_frtn.free_func = viona_desb_release;
			dp->d_frtn.free_arg = (void *)dp;
			dp->d_ring = ring;
			dp->d_headers = kmem_zalloc(header_sz, KM_SLEEP);
		}
	}

	/* Allocate ring-sized iovec buffers for TX */
	ring->vr_tx.vrt_iov = kmem_alloc(sizeof (struct iovec) * qsz, KM_SLEEP);
	ring->vr_tx.vrt_iov_cnt = qsz;
}

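/*
 * Free the TX-side resources allocated by viona_tx_ring_alloc().
 */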
void
viona_tx_ring_free(viona_vring_t *ring, const uint16_t qsz)
{
	if (ring->vr_tx.vrt_desb != NULL) {
		viona_desb_t *dp = ring->vr_tx.vrt_desb;

		const size_t header_sz =
		    VIONA_MAX_HDRS_LEN + ring->vr_tx.vrt_header_pad;
		for (uint_t i = 0; i < qsz; i++, dp++) {
			kmem_free(dp->d_headers, header_sz);
		}
		kmem_free(ring->vr_tx.vrt_desb, sizeof (viona_desb_t) * qsz);
		ring->vr_tx.vrt_desb = NULL;
	}

	if (ring->vr_tx.vrt_iov != NULL) {
		ASSERT3U(ring->vr_tx.vrt_iov_cnt, !=, 0);

		kmem_free(ring->vr_tx.vrt_iov,
		    sizeof (struct iovec) * ring->vr_tx.vrt_iov_cnt);
		ring->vr_tx.vrt_iov = NULL;
		ring->vr_tx.vrt_iov_cnt = 0;
	}
}

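/*
 * Return a completed descriptor chain to the 'used' ring and, subject to the
 * guest's interrupt suppression, notify it of the completion.
 */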
static void
viona_tx_done(viona_vring_t *ring, uint32_t len, uint16_t cookie)
{
	vq_pushchain(ring, len, cookie);

	membar_enter();
	viona_intr_ring(ring, B_FALSE);
}

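/*
 * Number of descriptors to process in the TX worker loop before pausing to
 * check whether the ring has been asked to stop.
 */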
#define	TX_BURST_THRESH		32

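/*
 * Worker loop for the TX side of a ring: drain available descriptors through
 * viona_tx(), re-enable guest notifications when the ring runs dry, and sleep
 * until more work arrives or the ring is stopped.
 */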
void
viona_worker_tx(viona_vring_t *ring, viona_link_t *link)
{
	(void) thread_vsetname(curthread, "viona_tx_%p", ring);

	ASSERT(MUTEX_HELD(&ring->vr_lock));
	ASSERT3U(ring->vr_state, ==, VRS_RUN);

	mutex_exit(&ring->vr_lock);

	for (;;) {
		size_t cnt_tx = 0, size_tx = 0;
		uint_t burst = 0;

		viona_ring_disable_notify(ring);
		while (viona_ring_num_avail(ring) != 0) {
			const size_t size_sent = viona_tx(link, ring);
			if (size_sent != 0) {
				/* Account for successful transmissions */
				size_tx += size_sent;
				cnt_tx++;
			}
			burst++;

			/*
			 * It is advantageous for throughput to keep this
			 * transmission loop tight, but periodic breaks to
			 * check for other events are of value too.
			 */
			if (burst >= TX_BURST_THRESH) {
				mutex_enter(&ring->vr_lock);
				const bool need_bail = vring_need_bail(ring);
				mutex_exit(&ring->vr_lock);

				if (need_bail) {
					break;
				}
				burst = 0;
			}
		}

		VIONA_PROBE2(tx, viona_link_t *, link, size_t, cnt_tx);
		if (cnt_tx != 0) {
			viona_ring_stat_accept(ring, cnt_tx, size_tx);
		}

		/*
		 * Check for available descriptors on the ring once more in
		 * case a late addition raced with the NO_NOTIFY flag toggle.
		 *
		 * The barrier ensures that visibility of the no-notify
		 * store does not cross the viona_ring_num_avail() check below.
		 */
		viona_ring_enable_notify(ring);
		membar_enter();

		if (viona_ring_num_avail(ring) == 0 &&
		    (link->l_features & VIRTIO_F_RING_NOTIFY_ON_EMPTY) != 0) {
			/*
			 * The NOTIFY_ON_EMPTY interrupt should not pay heed to
			 * the presence of AVAIL_NO_INTERRUPT.
			 */
			viona_intr_ring(ring, B_TRUE);
		}

		mutex_enter(&ring->vr_lock);
		for (;;) {
			if (vring_need_bail(ring)) {
				ring->vr_state = VRS_STOP;
				viona_tx_wait_outstanding(ring);
				return;
			}

			if (vmm_drv_lease_expired(ring->vr_lease)) {
				ring->vr_state_flags |= VRSF_RENEW;
				/*
				 * When renewing the lease for the ring, no TX
				 * frames may be outstanding, as they contain
				 * references to guest memory.
				 */
				viona_tx_wait_outstanding(ring);

				const boolean_t renewed =
				    viona_ring_lease_renew(ring);
				ring->vr_state_flags &= ~VRSF_RENEW;

				if (!renewed) {
					/* stop ring on failed renewal */
					ring->vr_state = VRS_STOP;
					return;
				}
			}

			if (viona_ring_num_avail(ring) != 0) {
				break;
			}

			/* Wait for further activity on the ring */
			(void) cv_wait_sig(&ring->vr_cv, &ring->vr_lock);
		}
		mutex_exit(&ring->vr_lock);
	}
	/* UNREACHABLE */
}

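/*
 * freemsg() callback for mblks built atop loaned guest memory.  Once the last
 * reference on the desb handle is dropped, the guest pages are released and
 * the descriptor chain is returned to the 'used' ring.
 */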
static void
viona_desb_release(viona_desb_t *dp)
{
	viona_vring_t *ring = dp->d_ring;
	uint_t ref;
	uint32_t len;
	uint16_t cookie;

	ref = atomic_dec_uint_nv(&dp->d_ref);
	if (ref > 1) {
		return;
	}

	/*
	 * The desb corresponding to this index must be ready for reuse before
	 * the descriptor is returned to the guest via the 'used' ring.
	 */
	len = dp->d_len;
	cookie = dp->d_cookie;
	dp->d_len = 0;
	dp->d_cookie = 0;
	vmm_drv_page_release_chain(dp->d_pages);
	dp->d_pages = NULL;

	/*
	 * Ensure all other changes to the desb are visible prior to zeroing
	 * its refcount, signifying its readiness for reuse.
	 */
	membar_exit();
	dp->d_ref = 0;

	viona_tx_done(ring, len, cookie);

	mutex_enter(&ring->vr_lock);
	if ((--ring->vr_xfer_outstanding) == 0) {
		cv_broadcast(&ring->vr_cv);
	}
	mutex_exit(&ring->vr_lock);
}

/*
 * Confirm that the requested checksum operation acts within the bounds of the
 * provided packet, and that the checksum itself will be stored in the "copied
 * headers" portion of said packet.
 */
static boolean_t
viona_tx_csum_req_valid(const struct virtio_net_mrgrxhdr *hdr,
    const mac_ether_offload_info_t *meoi, uint_t copied_len)
{
	const uint_t csum_off = hdr->vrh_csum_offset + hdr->vrh_csum_start;

	if (hdr->vrh_csum_start >= meoi->meoi_len ||
	    hdr->vrh_csum_start < meoi->meoi_l2hlen ||
	    csum_off >= meoi->meoi_len ||
	    (csum_off + sizeof (uint16_t)) > copied_len) {
		return (B_FALSE);
	}

	return (B_TRUE);
}

/*
 * Configure mblk to request full checksum offloading, given the virtio and
 * meoi details provided.
 */
static void
viona_tx_hcksum_full(mblk_t *mp, const struct virtio_net_mrgrxhdr *hdr,
    const mac_ether_offload_info_t *meoi, uint32_t added_flags)
{
	/*
	 * Out of caution, zero the checksum field in case any driver and/or
	 * device would erroneously use it in the sum calculation.
	 */
	uint16_t *csump = (uint16_t *)
	    (mp->b_rptr + hdr->vrh_csum_start + hdr->vrh_csum_offset);
	*csump = 0;

	mac_hcksum_set(mp, 0, 0, 0, 0, HCK_FULLCKSUM | added_flags);
}

/*
 * Configure mblk to request partial checksum offloading, given the virtio and
 * meoi details provided.
 */
static void
viona_tx_hcksum_partial(mblk_t *mp, const struct virtio_net_mrgrxhdr *hdr,
    const mac_ether_offload_info_t *meoi, uint32_t added_flags)
{
	/*
	 * MAC expects these offsets to be relative to the start of the L3
	 * header rather than the L2 frame.
	 */
	mac_hcksum_set(mp,
	    hdr->vrh_csum_start - meoi->meoi_l2hlen,
	    hdr->vrh_csum_start + hdr->vrh_csum_offset - meoi->meoi_l2hlen,
	    meoi->meoi_len - meoi->meoi_l2hlen,
	    0, HCK_PARTIALCKSUM | added_flags);
}

static boolean_t
viona_tx_offloads(viona_vring_t *ring, const struct virtio_net_mrgrxhdr *hdr,
    const mac_ether_offload_info_t *meoi, mblk_t *mp, uint32_t len)
{
	viona_link_t *link = ring->vr_link;
	const uint32_t cap_csum = link->l_cap_csum;

	/*
	 * Since viona is a "legacy device", the data stored by the driver will
	 * be in the guest's native endian format (see sections 2.4.3 and
	 * 5.1.6.1 of the VIRTIO 1.0 spec for more info).  At this time the
	 * only guests using viona are x86, so we can assume little-endian.
	 */
	const uint16_t gso_size = LE_16(hdr->vrh_gso_size);

	if (!viona_tx_csum_req_valid(hdr, meoi, MBLKL(mp))) {
		VIONA_PROBE2(fail_hcksum, viona_link_t *, link, mblk_t *, mp);
		VIONA_RING_STAT_INCR(ring, fail_hcksum);
		return (B_FALSE);
	}

	const uint16_t ftype = meoi->meoi_l3proto;
	const uint8_t ipproto = meoi->meoi_l4proto;
	if (ftype != ETHERTYPE_IP && ftype != ETHERTYPE_IPV6) {
		/* Ignore checksum offload requests for non-IP protocols. */
		VIONA_PROBE2(fail_hcksum_proto, viona_link_t *, link,
		    mblk_t *, mp);
		VIONA_RING_STAT_INCR(ring, fail_hcksum_proto);
		return (B_FALSE);
	}

	/* Configure TCPv4 LSO when requested */
	if ((hdr->vrh_gso_type & VIRTIO_NET_HDR_GSO_TCPV4) != 0 &&
	    ftype == ETHERTYPE_IP) {
		if ((link->l_features & VIRTIO_NET_F_HOST_TSO4) == 0) {
			VIONA_PROBE2(tx_gso_fail, viona_link_t *, link,
			    mblk_t *, mp);
			VIONA_RING_STAT_INCR(ring, tx_gso_fail);
			return (B_FALSE);
		}

		lso_info_set(mp, gso_size, HW_LSO);

		/*
		 * We should have already verified that an adequate form of
		 * hardware checksum offload is present for TSOv4.
		 */
		ASSERT3U(cap_csum &
		    (HCKSUM_INET_PARTIAL | HCKSUM_INET_FULL_V4), !=, 0);

		if ((cap_csum & HCKSUM_INET_FULL_V4) != 0) {
			viona_tx_hcksum_full(mp, hdr, meoi, HW_LSO);
		} else if ((cap_csum & HCKSUM_INET_PARTIAL) != 0) {
			/*
			 * Our native IP stack doesn't set the L4 length field
			 * of the pseudo header when LSO is in play.  Other IP
			 * stacks, e.g. Linux, do include the length field.
			 * This is a problem because the hardware expects that
			 * the length field is not set.  When it is set, it
			 * will cause an incorrect TCP checksum to be
			 * generated.  Linux avoids this issue by correcting
			 * the pseudo-header checksum in the driver code.
			 *
			 * In order to get the correct HW checksum we need to
			 * assume the guest's IP stack gave us a bogus TCP
			 * partial checksum and calculate it ourselves.
			 */
			ipha_t *ipha =
			    (ipha_t *)(mp->b_rptr + meoi->meoi_l2hlen);
			uint16_t *cksump =
			    IPH_TCPH_CHECKSUMP(ipha, IPH_HDR_LENGTH(ipha));

			uint32_t cksum = IP_TCP_CSUM_COMP;
			const ipaddr_t src = ipha->ipha_src;
			const ipaddr_t dst = ipha->ipha_dst;
			cksum += (dst >> 16) + (dst & 0xffff) +
			    (src >> 16) + (src & 0xffff);
			cksum = (cksum & 0xffff) + (cksum >> 16);
			*cksump = (cksum & 0xffff) + (cksum >> 16);

			/*
			 * NICs such as ixgbe require that ipv4 checksum
			 * offload also be enabled when performing LSO.
			 */
			uint32_t v4csum = 0;
			if ((cap_csum & HCKSUM_IPHDRCKSUM) != 0) {
				v4csum = HCK_IPV4_HDRCKSUM;
				ipha->ipha_hdr_checksum = 0;
			}

			viona_tx_hcksum_partial(mp, hdr, meoi, HW_LSO | v4csum);
		} else {
			/*
			 * This should be unreachable: we do not permit LSO
			 * without adequate checksum offload capability.
			 */
			VIONA_PROBE2(tx_gso_fail, viona_link_t *, link,
			    mblk_t *, mp);
			VIONA_RING_STAT_INCR(ring, tx_gso_fail);
			return (B_FALSE);
		}

		return (B_TRUE);
	}

	/*
	 * Partial checksum support from the NIC is ideal, since it most
	 * closely maps to the interface defined by virtio.
	 */
	if ((cap_csum & HCKSUM_INET_PARTIAL) != 0 &&
	    (ipproto == IPPROTO_TCP || ipproto == IPPROTO_UDP)) {
		viona_tx_hcksum_partial(mp, hdr, meoi, 0);
		return (B_TRUE);
	}

	/*
	 * Without partial checksum support, look to the L3/L4 protocol
	 * information to see if the NIC can handle it.  If not, the checksum
	 * will need to be calculated inline.
	 */
	if (ftype == ETHERTYPE_IP) {
		if ((cap_csum & HCKSUM_INET_FULL_V4) != 0 &&
		    (ipproto == IPPROTO_TCP || ipproto == IPPROTO_UDP)) {
			viona_tx_hcksum_full(mp, hdr, meoi, 0);
			return (B_TRUE);
		}

		/* XXX: Implement manual fallback checksumming? */
		VIONA_PROBE2(fail_hcksum, viona_link_t *, link, mblk_t *, mp);
		VIONA_RING_STAT_INCR(ring, fail_hcksum);
		return (B_FALSE);
	} else if (ftype == ETHERTYPE_IPV6) {
		if ((cap_csum & HCKSUM_INET_FULL_V6) != 0 &&
		    (ipproto == IPPROTO_TCP || ipproto == IPPROTO_UDP)) {
			viona_tx_hcksum_full(mp, hdr, meoi, 0);
			return (B_TRUE);
		}

		/* XXX: Implement manual fallback checksumming? */
		VIONA_PROBE2(fail_hcksum6, viona_link_t *, link, mblk_t *, mp);
		VIONA_RING_STAT_INCR(ring, fail_hcksum6);
		return (B_FALSE);
	}

	/*
	 * Note the failure for unrecognized protocols, but soldier on to make
	 * our best effort at getting the frame out the door.
	 */
	VIONA_PROBE2(fail_hcksum_proto, viona_link_t *, link, mblk_t *, mp);
	VIONA_RING_STAT_INCR(ring, fail_hcksum_proto);
	return (B_FALSE);
}

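/*
 * Allocate the leading mblk for a TX packet.  When guest memory loaning is in
 * use, this is a desballoc()-backed block sized for the copied headers (plus
 * any configured padding); otherwise a buffer large enough for the entire
 * packet is allocated.
 */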
static mblk_t *
viona_tx_alloc_headers(viona_vring_t *ring, uint16_t cookie, viona_desb_t **dpp,
    uint32_t len)
{
	ASSERT3P(*dpp, ==, NULL);

	mblk_t *mp = NULL;
	const size_t header_pad = ring->vr_tx.vrt_header_pad;

	if (ring->vr_tx.vrt_desb != NULL) {
		viona_desb_t *dp = &ring->vr_tx.vrt_desb[cookie];
		const size_t header_sz = VIONA_MAX_HDRS_LEN + header_pad;

		/*
		 * If the guest driver is operating properly, each desb slot
		 * should be available for use when processing a TX descriptor
		 * from the 'avail' ring.  In the case of drivers that reuse a
		 * descriptor before it has been posted to the 'used' ring, the
		 * data is simply dropped.
		 */
		if (atomic_cas_uint(&dp->d_ref, 0, 1) != 0) {
			return (NULL);
		}

		dp->d_cookie = cookie;
		mp = desballoc(dp->d_headers, header_sz, 0, &dp->d_frtn);

		if (mp != NULL) {
			/*
			 * Account for the successful desballoc, and
			 * communicate out the desb handle for subsequent use.
			 */
			dp->d_ref++;
			*dpp = dp;
		} else {
			/* Reset the desb back to its "available" state */
			dp->d_ref = 0;
		}
	} else {
		/*
		 * If we are going to be copying the entire packet, we might as
		 * well allocate for it all in one go.
		 */
		mp = allocb(len + header_pad, 0);
	}

	/* Push pointers forward to account for requested header padding */
	if (mp != NULL && header_pad != 0) {
		mp->b_rptr = mp->b_wptr = (DB_BASE(mp) + header_pad);
	}

	return (mp);
}

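/*
 * Copy the packet headers (or, when not loaning guest memory, the entire
 * packet) out of the descriptor chain into mp, and gather the header offsets
 * needed for checksum/LSO handling into meoi.
 */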
static boolean_t
viona_tx_copy_headers(viona_vring_t *ring, iov_bunch_t *iob, mblk_t *mp,
    mac_ether_offload_info_t *meoi)
{
	ASSERT(mp->b_cont == NULL);

	if (ring->vr_tx.vrt_desb == NULL) {
		/*
		 * If not using guest data loaning through the desb, then we
		 * expect viona_tx_alloc_headers() to have allocated space for
		 * the entire packet, which we should copy now.
		 */
		const uint32_t pkt_size = iob->ib_remain;

		VERIFY(MBLKTAIL(mp) >= pkt_size);
		VERIFY(iov_bunch_copy(iob, mp->b_wptr, pkt_size));
		mp->b_wptr += pkt_size;
		mac_ether_offload_info(mp, meoi);
		return (B_TRUE);
	}

	/*
	 * We want to maximize the amount of guest data we loan when performing
	 * packet transmission, with the caveat that we must copy the packet
	 * headers to prevent TOCTOU issues.
	 */
	const uint32_t copy_sz = MIN(iob->ib_remain, MBLKTAIL(mp));

	VERIFY(iov_bunch_copy(iob, mp->b_wptr, copy_sz));
	mp->b_wptr += copy_sz;

	if (iob->ib_remain == 0) {
		mac_ether_offload_info(mp, meoi);
		return (B_TRUE);
	}

	mac_ether_offload_info(mp, meoi);
	if ((meoi->meoi_flags & MEOI_L2INFO_SET) == 0) {
		/* If the L2 header cannot be parsed, give up now */
		return (B_FALSE);
	}
	if ((meoi->meoi_flags & MEOI_L4INFO_SET) != 0) {
		const uint32_t full_hdr_sz =
		    meoi->meoi_l2hlen + meoi->meoi_l3hlen + meoi->meoi_l4hlen;
		if (copy_sz >= full_hdr_sz) {
			/* All headers are already copied */
			return (B_TRUE);
		}
	}

	/*
	 * The full headers do not appear to be along for the ride yet, or the
	 * packet bears a protocol we do not handle.  Just allocate a buffer
	 * and copy the remainder of the packet.
	 */
	const uint32_t remain_sz = iob->ib_remain;
	mblk_t *remain_mp = allocb(remain_sz, 0);
	if (remain_mp == NULL) {
		return (B_FALSE);
	}
	VERIFY(iov_bunch_copy(iob, remain_mp->b_wptr, remain_sz));
	remain_mp->b_wptr += remain_sz;
	mp->b_cont = remain_mp;
	/* Refresh header info now that we have copied the rest */
	mac_ether_offload_info(mp, meoi);

	return (B_TRUE);
}

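/*
 * Transmit a single packet from the 'avail' ring.  Returns the packet length
 * on success, or 0 if nothing was sent (empty ring, malformed descriptor, or
 * allocation failure).
 */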
static size_t
viona_tx(viona_link_t *link, viona_vring_t *ring)
{
	struct iovec *iov = ring->vr_tx.vrt_iov;
	const uint_t max_segs = ring->vr_tx.vrt_iov_cnt;
	uint16_t cookie;
	vmm_page_t *pages = NULL;
	uint32_t total_len;
	mblk_t *mp_head = NULL;
	viona_desb_t *dp = NULL;
	const boolean_t merge_enabled =
	    ((link->l_features & VIRTIO_NET_F_MRG_RXBUF) != 0);

	ASSERT(iov != NULL);

	const int n = vq_popchain(ring, iov, max_segs, &cookie, &pages,
	    &total_len);
	if (n == 0) {
		VIONA_PROBE1(tx_absent, viona_vring_t *, ring);
		VIONA_RING_STAT_INCR(ring, tx_absent);
		return (0);
	} else if (n < 0) {
		/*
		 * Any error encountered in vq_popchain has already resulted in
		 * specific probe and statistic handling.  Further action here
		 * is unnecessary.
		 */
		return (0);
	}

	/*
	 * Get set up to copy the VirtIO header from in front of the packet.
	 *
	 * With an eye toward supporting VirtIO 1.0 behavior in the future, we
	 * determine the size of the header based on the device state.  This
	 * goes a bit beyond the expectations of legacy VirtIO, where the first
	 * buffer must cover the header and nothing else.
	 */
	iov_bunch_t iob = {
		.ib_iov = iov,
		.ib_remain = total_len,
	};
	struct virtio_net_mrgrxhdr hdr;
	uint32_t vio_hdr_len = 0;
	if (merge_enabled) {
		/*
		 * Presence of the "num_bufs" member is determined by the
		 * merge-rxbuf feature on the device, despite the fact that we
		 * are in transmission context here.
		 */
		vio_hdr_len = sizeof (struct virtio_net_mrgrxhdr);
	} else {
		vio_hdr_len = sizeof (struct virtio_net_hdr);
		/*
		 * We ignore "num_bufs" from the guest anyway, but zero it out
		 * just in case.
		 */
		hdr.vrh_bufs = 0;
	}
	const uint32_t pkt_len = total_len - vio_hdr_len;
	if (!iov_bunch_copy(&iob, &hdr, vio_hdr_len)) {
		goto drop_fail;
	}

	if (pkt_len > VIONA_MAX_PACKET_SIZE ||
	    pkt_len < sizeof (struct ether_header)) {
		goto drop_fail;
	}

	mp_head = viona_tx_alloc_headers(ring, cookie, &dp, pkt_len);
	if (mp_head == NULL) {
		goto drop_fail;
	}

	/*
	 * Copy the packet headers (L2 through L4, if present) to prevent
	 * TOCTOU attacks in any subsequent consumers of that data.
	 */
	mac_ether_offload_info_t meoi = { 0 };
	if (!viona_tx_copy_headers(ring, &iob, mp_head, &meoi)) {
		goto drop_fail;
	}

	if (dp != NULL && iob.ib_remain != 0) {
		/*
		 * If this device is loaning guest memory, rather than copying
		 * the entire body of the packet, we may need to establish
		 * mblks for the remaining data-to-be-loaned after the header
		 * copy.
		 */
		uint32_t chunk_sz;
		caddr_t chunk;
		mblk_t *mp_tail = mp_head;

		/*
		 * Ensure that our view of the tail is accurate in the rare
		 * case that the header allocation/copying logic has already
		 * resulted in a chained mblk.
		 */
		while (mp_tail->b_cont != NULL) {
			mp_tail = mp_tail->b_cont;
		}

		while (iov_bunch_next_chunk(&iob, &chunk, &chunk_sz)) {
			mblk_t *mp = desballoc((uchar_t *)chunk, chunk_sz, 0,
			    &dp->d_frtn);
			if (mp == NULL) {
				goto drop_fail;
			}

			mp->b_wptr += chunk_sz;
			dp->d_ref++;
			mp_tail->b_cont = mp;
			mp_tail = mp;
		}
	} else {
		/* The copy-everything strategy should be done by now */
		VERIFY0(iob.ib_remain);
	}

	if (VNETHOOK_INTERESTED_OUT(link->l_neti)) {
		/*
		 * The hook consumer may elect to free the mblk_t and set
		 * our mblk_t ** to NULL.  When using a viona_desb_t
		 * (dp != NULL), we do not want the corresponding cleanup to
		 * occur during the viona_hook() call.  We instead want to
		 * reset and recycle dp for future use.  To prevent cleanup
		 * during the viona_hook() call, we take a ref on dp (if being
		 * used), and release it on success.  On failure, the
		 * freemsgchain() call will release all the refs taken earlier
		 * in viona_tx() (aside from the initial ref and the one we
		 * take), and drop_hook will reset dp for reuse.
		 */
		if (dp != NULL)
			dp->d_ref++;

		/*
		 * Pass &mp instead of &mp_head so we don't lose track of
		 * mp_head if the hook consumer (i.e. ipf) elects to free mp
		 * and set mp to NULL.
		 */
		mblk_t *mp = mp_head;
		if (viona_hook(link, ring, &mp, B_TRUE) != 0) {
			if (mp != NULL)
				freemsgchain(mp);
			goto drop_hook;
		}

		if (dp != NULL) {
			dp->d_ref--;

			/*
			 * It is possible that the hook(s) accepted the packet,
			 * but as part of its processing, it issued a pull-up
			 * which released all references to the desb.  In that
			 * case, go back to acting like the packet is entirely
			 * copied (which it is).
			 */
			if (dp->d_ref == 1) {
				dp->d_cookie = 0;
				dp->d_ref = 0;
				dp = NULL;
			}
		}
	}

	/*
	 * Translate request for offloaded checksumming.  If the guest sent an
	 * LSO packet then it must have also negotiated and requested partial
	 * checksum; therefore the LSO logic is contained within
	 * viona_tx_offloads().
	 */
	if ((link->l_features & VIRTIO_NET_F_CSUM) != 0 &&
	    (hdr.vrh_flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) != 0) {
		if (!viona_tx_offloads(ring, &hdr, &meoi, mp_head, pkt_len)) {
			/*
			 * If processing of any checksum offload request fails,
			 * we can still pass the packet on for transmission.
			 * Even with this best-effort behavior, which may in
			 * fact succeed in the end, we record it as an error.
			 */
			viona_ring_stat_error(ring);
		}
	}

	if (dp != NULL) {
		/*
		 * Record the info required to post this descriptor to the
		 * used ring once its transmission has completed.
		 */
		dp->d_len = total_len;
		dp->d_pages = pages;
		mutex_enter(&ring->vr_lock);
		ring->vr_xfer_outstanding++;
		mutex_exit(&ring->vr_lock);
	} else {
		/*
		 * If the data was cloned out of the ring, the descriptors can
		 * be marked as 'used' now, rather than deferring that action
		 * until after successful packet transmission.
		 */
		vmm_drv_page_release_chain(pages);
		viona_tx_done(ring, total_len, cookie);
	}

	/*
	 * From viona's point of view, this is a successful transmit, even if
	 * something downstream decides to drop the packet.
	 */
	VIONA_PROBE3(pkt__tx, viona_vring_t *, ring, mblk_t *, mp_head,
	    size_t, pkt_len);

	/*
	 * We're potentially going deep into the networking layer; make sure
	 * the guest can't run concurrently.
	 */
	smt_begin_unsafe();
	/*
	 * Ignore, for now, any signal from MAC about whether the outgoing
	 * packet was dropped or not.
	 */
	(void) mac_tx(link->l_mch, mp_head, 0, MAC_DROP_ON_NO_DESC, NULL);
	smt_end_unsafe();
	return (pkt_len);

drop_fail:
	/*
	 * On the off chance that memory is not available via the desballoc or
	 * allocb calls, there are few options left besides to fail and drop
	 * the frame on the floor.
	 *
	 * First account for it in the error stats.
	 */
	viona_ring_stat_error(ring);

	if (dp != NULL) {
		/*
		 * Take an additional reference on the desb handle (if present)
		 * so any desballoc-sourced mblks can release their hold on it
		 * without the handle reaching its final state and executing
		 * its clean-up logic.
		 */
		dp->d_ref++;
	}

	/*
	 * Free any already-allocated blocks and sum up the total length of the
	 * dropped data to be released to the used ring.
	 */
	freemsgchain(mp_head);

drop_hook:
	if (dp != NULL) {
		VERIFY(dp->d_ref == 2);

		/* Clean up the desb handle, releasing the extra hold. */
		dp->d_len = 0;
		dp->d_cookie = 0;
		dp->d_ref = 0;
	}

	/* Count in the stats as a drop, rather than an error */
	viona_ring_stat_drop(ring, 1);

	VIONA_PROBE3(tx_drop, viona_vring_t *, ring, uint32_t, pkt_len,
	    uint16_t, cookie);
	vmm_drv_page_release_chain(pages);
	viona_tx_done(ring, total_len, cookie);
	return (0);
}