/*
 * Copyright (c) 2013  Chris Torek <torek @ torek net>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source.  A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 *
 * Copyright 2015 Pluribus Networks Inc.
 * Copyright 2019 Joyent, Inc.
 * Copyright 2024 Oxide Computer Company
 */


#include <sys/types.h>
#include <sys/smt.h>
#include <sys/strsubr.h>

#include <sys/pattr.h>
#include <sys/dlpi.h>
#include <inet/ip.h>
#include <inet/ip_impl.h>

#include "viona_impl.h"

#define	BNXE_NIC_DRIVER		"bnxe"

/*
 * Tunable which controls whether TX data copying is enabled by default.
 */
boolean_t viona_default_tx_copy = B_TRUE;

/*
 * Tunable for maximum configured TX header padding.
 */
uint_t viona_max_header_pad = 256;

/*
 * Copy TX mbufs from the virtio ring to avoid having to wait for packet
 * transmission to complete before the associated resources can be freed.
 */
kmutex_t viona_force_copy_lock;
static enum viona_force_copy {
	VFC_UNINITALIZED	= 0,
	VFC_COPY_UNEEDED	= 1,
	VFC_COPY_REQUIRED	= 2,
} viona_force_copy_state = VFC_UNINITALIZED;

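/*
 * Per-descriptor TX state used when loaning guest memory via desballoc():
 * the free routine, owning ring, reference count, completion length/cookie
 * for the 'used' ring, the copied-headers buffer, and the chain of held
 * guest pages.
 */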
struct viona_desb {
	frtn_t			d_frtn;
	viona_vring_t		*d_ring;
	uint_t			d_ref;
	uint32_t		d_len;
	uint16_t		d_cookie;
	uchar_t			*d_headers;
	vmm_page_t		*d_pages;
};

static void viona_tx(viona_link_t *, viona_vring_t *);
static void viona_desb_release(viona_desb_t *);


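/*
 * Wait for all outstanding transfers on the ring (TX frames still referencing
 * guest memory) to be released before proceeding.
 */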
static void
viona_tx_wait_outstanding(viona_vring_t *ring)
{
	ASSERT(MUTEX_HELD(&ring->vr_lock));

	while (ring->vr_xfer_outstanding != 0) {
		/*
		 * Paying heed to signals is counterproductive here.  This is a
		 * very tight loop if pending transfers take an extended amount
		 * of time to be reclaimed while the host process is exiting.
		 */
		cv_wait(&ring->vr_cv, &ring->vr_lock);
	}
}

/*
 * Check if full TX packet copying is needed.  This should not be called from
 * viona attach()/detach() context.
 */
boolean_t
viona_tx_copy_needed(void)
{
	boolean_t result;

	if (viona_default_tx_copy) {
		return (B_TRUE);
	}

	mutex_enter(&viona_force_copy_lock);
	if (viona_force_copy_state == VFC_UNINITALIZED) {
		major_t bnxe_major;

		/*
		 * The original code for viona featured an explicit check for
		 * the bnxe driver which, when found present, necessitated that
		 * all transmissions be copied into their own mblks instead of
		 * passing guest memory to the underlying device.
		 *
		 * The motivations for this are unclear, but until it can be
		 * proven unnecessary, the check lives on.
		 */
		viona_force_copy_state = VFC_COPY_UNEEDED;
		if ((bnxe_major = ddi_name_to_major(BNXE_NIC_DRIVER))
		    != DDI_MAJOR_T_NONE) {
			if (ddi_hold_installed_driver(bnxe_major) != NULL) {
				viona_force_copy_state = VFC_COPY_REQUIRED;
				ddi_rele_driver(bnxe_major);
			}
		}
	}
	result = (viona_force_copy_state == VFC_COPY_REQUIRED);
	mutex_exit(&viona_force_copy_lock);

	return (result);
}

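/*
 * Allocate per-ring TX resources: desb handles (when guest data loaning is
 * permitted) along with their header-copy buffers, and an iovec array sized
 * to the queue.
 */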
void
viona_tx_ring_alloc(viona_vring_t *ring, const uint16_t qsz)
{
	const viona_link_params_t *vlp = &ring->vr_link->l_params;

	ring->vr_tx.vrt_header_pad = vlp->vlp_tx_header_pad;
	/* Allocate desb handles for TX ring if packet copying not forced */
	if (!ring->vr_link->l_params.vlp_tx_copy_data) {
		viona_desb_t *dp =
		    kmem_zalloc(sizeof (viona_desb_t) * qsz, KM_SLEEP);
		ring->vr_tx.vrt_desb = dp;

		const size_t header_sz =
		    VIONA_MAX_HDRS_LEN + ring->vr_tx.vrt_header_pad;
		for (uint_t i = 0; i < qsz; i++, dp++) {
			dp->d_frtn.free_func = viona_desb_release;
			dp->d_frtn.free_arg = (void *)dp;
			dp->d_ring = ring;
			dp->d_headers = kmem_zalloc(header_sz, KM_SLEEP);
		}
	}

	/* Allocate ring-sized iovec buffers for TX */
	ring->vr_tx.vrt_iov = kmem_alloc(sizeof (struct iovec) * qsz, KM_SLEEP);
	ring->vr_tx.vrt_iov_cnt = qsz;
}

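/*
 * Free the TX resources allocated in viona_tx_ring_alloc().
 */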
void
viona_tx_ring_free(viona_vring_t *ring, const uint16_t qsz)
{
	if (ring->vr_tx.vrt_desb != NULL) {
		viona_desb_t *dp = ring->vr_tx.vrt_desb;

		const size_t header_sz =
		    VIONA_MAX_HDRS_LEN + ring->vr_tx.vrt_header_pad;
		for (uint_t i = 0; i < qsz; i++, dp++) {
			kmem_free(dp->d_headers, header_sz);
		}
		kmem_free(ring->vr_tx.vrt_desb, sizeof (viona_desb_t) * qsz);
		ring->vr_tx.vrt_desb = NULL;
	}

	if (ring->vr_tx.vrt_iov != NULL) {
		ASSERT3U(ring->vr_tx.vrt_iov_cnt, !=, 0);

		kmem_free(ring->vr_tx.vrt_iov,
		    sizeof (struct iovec) * ring->vr_tx.vrt_iov_cnt);
		ring->vr_tx.vrt_iov = NULL;
		ring->vr_tx.vrt_iov_cnt = 0;
	}
}

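/*
 * Complete a TX descriptor chain: post its length and cookie to the 'used'
 * ring and deliver a ring interrupt to the guest if one is warranted.
 */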
static void
viona_tx_done(viona_vring_t *ring, uint32_t len, uint16_t cookie)
{
	vq_pushchain(ring, len, cookie);

	membar_enter();
	viona_intr_ring(ring, B_FALSE);
}

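/* Number of packets to transmit before pausing to check for ring shutdown */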
#define	TX_BURST_THRESH	32

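/*
 * Worker loop for TX ring processing: drains available descriptors in bursts,
 * manages notification suppression, and handles lease renewal and ring-stop
 * requests.
 */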
void
viona_worker_tx(viona_vring_t *ring, viona_link_t *link)
{
	(void) thread_vsetname(curthread, "viona_tx_%p", ring);

	ASSERT(MUTEX_HELD(&ring->vr_lock));
	ASSERT3U(ring->vr_state, ==, VRS_RUN);

	mutex_exit(&ring->vr_lock);

	for (;;) {
		uint_t ntx = 0, burst = 0;

		viona_ring_disable_notify(ring);
		while (viona_ring_num_avail(ring) != 0) {
			viona_tx(link, ring);
			ntx++;
			burst++;

			/*
			 * It is advantageous for throughput to keep this
			 * transmission loop tight, but periodic breaks to
			 * check for other events are of value too.
			 */
			if (burst >= TX_BURST_THRESH) {
				mutex_enter(&ring->vr_lock);
				const bool need_bail = vring_need_bail(ring);
				mutex_exit(&ring->vr_lock);

				if (need_bail) {
					break;
				}
				burst = 0;
			}
		}

		VIONA_PROBE2(tx, viona_link_t *, link, uint_t, ntx);

		/*
		 * Check for available descriptors on the ring once more in
		 * case a late addition raced with the NO_NOTIFY flag toggle.
		 *
		 * The barrier ensures that visibility of the no-notify
		 * store does not cross the viona_ring_num_avail() check below.
		 */
		viona_ring_enable_notify(ring);
		membar_enter();

		if (viona_ring_num_avail(ring) == 0 &&
		    (link->l_features & VIRTIO_F_RING_NOTIFY_ON_EMPTY) != 0) {
			/*
			 * The NOTIFY_ON_EMPTY interrupt should not pay heed to
			 * the presence of AVAIL_NO_INTERRUPT.
			 */
			viona_intr_ring(ring, B_TRUE);
		}

		mutex_enter(&ring->vr_lock);
		for (;;) {
			if (vring_need_bail(ring)) {
				ring->vr_state = VRS_STOP;
				viona_tx_wait_outstanding(ring);
				return;
			}

			if (vmm_drv_lease_expired(ring->vr_lease)) {
				ring->vr_state_flags |= VRSF_RENEW;
				/*
				 * When renewing the lease for the ring, no TX
				 * frames may be outstanding, as they contain
				 * references to guest memory.
				 */
				viona_tx_wait_outstanding(ring);

				const boolean_t renewed =
				    viona_ring_lease_renew(ring);
				ring->vr_state_flags &= ~VRSF_RENEW;

				if (!renewed) {
					/* stop ring on failed renewal */
					ring->vr_state = VRS_STOP;
					return;
				}
			}

			if (viona_ring_num_avail(ring) != 0) {
				break;
			}

			/* Wait for further activity on the ring */
			(void) cv_wait_sig(&ring->vr_cv, &ring->vr_lock);
		}
		mutex_exit(&ring->vr_lock);
	}
	/* UNREACHABLE */
}

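/*
 * Free routine for desballoc'd TX mblks.  The final reference release returns
 * the held guest pages, posts the descriptor to the 'used' ring, and wakes any
 * thread waiting for outstanding transfers to drain.
 */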
static void
viona_desb_release(viona_desb_t *dp)
{
	viona_vring_t *ring = dp->d_ring;
	uint_t ref;
	uint32_t len;
	uint16_t cookie;

	ref = atomic_dec_uint_nv(&dp->d_ref);
	if (ref > 1) {
		return;
	}

	/*
	 * The desb corresponding to this index must be ready for reuse before
	 * the descriptor is returned to the guest via the 'used' ring.
	 */
	len = dp->d_len;
	cookie = dp->d_cookie;
	dp->d_len = 0;
	dp->d_cookie = 0;
	vmm_drv_page_release_chain(dp->d_pages);
	dp->d_pages = NULL;

	/*
	 * Ensure all other changes to the desb are visible prior to zeroing its
	 * refcount, signifying its readiness for reuse.
	 */
	membar_exit();
	dp->d_ref = 0;

	viona_tx_done(ring, len, cookie);

	mutex_enter(&ring->vr_lock);
	if ((--ring->vr_xfer_outstanding) == 0) {
		cv_broadcast(&ring->vr_cv);
	}
	mutex_exit(&ring->vr_lock);
}

/*
 * Confirm that the requested checksum operation acts within the bounds of the
 * provided packet, and that the checksum itself will be stored in the "copied
 * headers" portion of said packet.
 */
static boolean_t
viona_tx_csum_req_valid(const struct virtio_net_mrgrxhdr *hdr,
    const mac_ether_offload_info_t *meoi, uint_t copied_len)
{
	const uint_t csum_off = hdr->vrh_csum_offset + hdr->vrh_csum_start;

	if (hdr->vrh_csum_start >= meoi->meoi_len ||
	    hdr->vrh_csum_start < meoi->meoi_l2hlen ||
	    csum_off >= meoi->meoi_len ||
	    (csum_off + sizeof (uint16_t)) > copied_len) {
		return (B_FALSE);
	}

	return (B_TRUE);
}

/*
 * Configure mblk to request full checksum offloading, given the virtio and meoi
 * details provided.
 */
static void
viona_tx_hcksum_full(mblk_t *mp, const struct virtio_net_mrgrxhdr *hdr,
    const mac_ether_offload_info_t *meoi, uint32_t added_flags)
{
	/*
	 * Out of caution, zero the checksum field in case any driver and/or
	 * device would erroneously use it in the sum calculation.
	 */
	uint16_t *csump = (uint16_t *)
	    (mp->b_rptr + hdr->vrh_csum_start + hdr->vrh_csum_offset);
	*csump = 0;

	mac_hcksum_set(mp, 0, 0, 0, 0, HCK_FULLCKSUM | added_flags);
}

/*
 * Configure mblk to request partial checksum offloading, given the virtio and
 * meoi details provided.
 */
static void
viona_tx_hcksum_partial(mblk_t *mp, const struct virtio_net_mrgrxhdr *hdr,
    const mac_ether_offload_info_t *meoi, uint32_t added_flags)
{
	/*
	 * MAC expects these offsets to be relative to the start of the L3
	 * header rather than the L2 frame.
	 */
	mac_hcksum_set(mp,
	    hdr->vrh_csum_start - meoi->meoi_l2hlen,
	    hdr->vrh_csum_start + hdr->vrh_csum_offset - meoi->meoi_l2hlen,
	    meoi->meoi_len - meoi->meoi_l2hlen,
	    0, HCK_PARTIALCKSUM | added_flags);
}

static boolean_t
viona_tx_offloads(viona_vring_t *ring, const struct virtio_net_mrgrxhdr *hdr,
    const mac_ether_offload_info_t *meoi, mblk_t *mp, uint32_t len)
{
	viona_link_t *link = ring->vr_link;
	const uint32_t cap_csum = link->l_cap_csum;

	/*
	 * Since viona is a "legacy device", the data stored by the driver will
	 * be in the guest's native endian format (see sections 2.4.3 and
	 * 5.1.6.1 of the VIRTIO 1.0 spec for more info). At this time the only
	 * guests using viona are x86 and we can assume little-endian.
	 */
	const uint16_t gso_size = LE_16(hdr->vrh_gso_size);

	if (!viona_tx_csum_req_valid(hdr, meoi, MBLKL(mp))) {
		VIONA_PROBE2(fail_hcksum, viona_link_t *, link, mblk_t *, mp);
		VIONA_RING_STAT_INCR(ring, fail_hcksum);
		return (B_FALSE);
	}

	const uint16_t ftype = meoi->meoi_l3proto;
	const uint8_t ipproto = meoi->meoi_l4proto;
	if (ftype != ETHERTYPE_IP && ftype != ETHERTYPE_IPV6) {
		/* Ignore checksum offload requests for non-IP protocols. */
		VIONA_PROBE2(fail_hcksum_proto, viona_link_t *, link,
		    mblk_t *, mp);
		VIONA_RING_STAT_INCR(ring, fail_hcksum_proto);
		return (B_FALSE);
	}

	/* Configure TCPv4 LSO when requested */
	if ((hdr->vrh_gso_type & VIRTIO_NET_HDR_GSO_TCPV4) != 0 &&
	    ftype == ETHERTYPE_IP) {
		if ((link->l_features & VIRTIO_NET_F_HOST_TSO4) == 0) {
			VIONA_PROBE2(tx_gso_fail, viona_link_t *, link,
			    mblk_t *, mp);
			VIONA_RING_STAT_INCR(ring, tx_gso_fail);
			return (B_FALSE);
		}

		lso_info_set(mp, gso_size, HW_LSO);

		/*
		 * We should have already verified that an adequate form of
		 * hardware checksum offload is present for TSOv4
		 */
		ASSERT3U(cap_csum &
		    (HCKSUM_INET_PARTIAL | HCKSUM_INET_FULL_V4), !=, 0);

		if ((cap_csum & HCKSUM_INET_FULL_V4) != 0) {
			viona_tx_hcksum_full(mp, hdr, meoi, HW_LSO);
		} else if ((cap_csum & HCKSUM_INET_PARTIAL) != 0) {
			/*
			 * Our native IP stack doesn't set the L4 length field
			 * of the pseudo header when LSO is in play.  Other IP
			 * stacks, e.g.  Linux, do include the length field.
			 * This is a problem because the hardware expects that
			 * the length field is not set. When it is set, it will
			 * cause an incorrect TCP checksum to be generated.
			 * Linux avoids this issue by correcting the
			 * pseudo-header checksum in the driver code.
			 *
			 * In order to get the correct HW checksum we need to
			 * assume the guest's IP stack gave us a bogus TCP
			 * partial checksum and calculate it ourselves.
			 */
			ipha_t *ipha =
			    (ipha_t *)(mp->b_rptr + meoi->meoi_l2hlen);
			uint16_t *cksump =
			    IPH_TCPH_CHECKSUMP(ipha, IPH_HDR_LENGTH(ipha));

			uint32_t cksum = IP_TCP_CSUM_COMP;
			const ipaddr_t src = ipha->ipha_src;
			const ipaddr_t dst = ipha->ipha_dst;
			cksum += (dst >> 16) + (dst & 0xffff) +
			    (src >> 16) + (src & 0xffff);
			cksum = (cksum & 0xffff) + (cksum >> 16);
			*cksump = (cksum & 0xffff) + (cksum >> 16);

			/*
			 * NICs such as ixgbe require that ipv4 checksum offload
			 * also be enabled when performing LSO.
			 */
			uint32_t v4csum = 0;
			if ((cap_csum & HCKSUM_IPHDRCKSUM) != 0) {
				v4csum = HCK_IPV4_HDRCKSUM;
				ipha->ipha_hdr_checksum = 0;
			}

			viona_tx_hcksum_partial(mp, hdr, meoi, HW_LSO | v4csum);
		} else {
			/*
			 * This should be unreachable: We do not permit LSO
			 * without adequate checksum offload capability.
			 */
			VIONA_PROBE2(tx_gso_fail, viona_link_t *, link,
			    mblk_t *, mp);
			VIONA_RING_STAT_INCR(ring, tx_gso_fail);
			return (B_FALSE);
		}

		return (B_TRUE);
	}

	/*
	 * Partial checksum support from the NIC is ideal, since it most closely
	 * maps to the interface defined by virtio.
	 */
	if ((cap_csum & HCKSUM_INET_PARTIAL) != 0 &&
	    (ipproto == IPPROTO_TCP || ipproto == IPPROTO_UDP)) {
		viona_tx_hcksum_partial(mp, hdr, meoi, 0);
		return (B_TRUE);
	}

	/*
	 * Without partial checksum support, look to the L3/L4 protocol
	 * information to see if the NIC can handle it.  If not, the checksum
	 * will need to be calculated inline.
	 */
	if (ftype == ETHERTYPE_IP) {
		if ((cap_csum & HCKSUM_INET_FULL_V4) != 0 &&
		    (ipproto == IPPROTO_TCP || ipproto == IPPROTO_UDP)) {
			viona_tx_hcksum_full(mp, hdr, meoi, 0);
			return (B_TRUE);
		}

		/* XXX: Implement manual fallback checksumming? */
		VIONA_PROBE2(fail_hcksum, viona_link_t *, link, mblk_t *, mp);
		VIONA_RING_STAT_INCR(ring, fail_hcksum);
		return (B_FALSE);
	} else if (ftype == ETHERTYPE_IPV6) {
		if ((cap_csum & HCKSUM_INET_FULL_V6) != 0 &&
		    (ipproto == IPPROTO_TCP || ipproto == IPPROTO_UDP)) {
			viona_tx_hcksum_full(mp, hdr, meoi, 0);
			return (B_TRUE);
		}

		/* XXX: Implement manual fallback checksumming? */
		VIONA_PROBE2(fail_hcksum6, viona_link_t *, link, mblk_t *, mp);
		VIONA_RING_STAT_INCR(ring, fail_hcksum6);
		return (B_FALSE);
	}

	/*
	 * Note the failure for unrecognized protocols, but soldier on to make
	 * our best effort at getting the frame out the door.
	 */
	VIONA_PROBE2(fail_hcksum_proto, viona_link_t *, link, mblk_t *, mp);
	VIONA_RING_STAT_INCR(ring, fail_hcksum_proto);
	return (B_FALSE);
}

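/*
 * Allocate the mblk which will hold the copied packet headers (plus any
 * configured header padding).  When guest data loaning is in use, this is
 * backed by the ring's pre-allocated desb handle; otherwise a buffer large
 * enough for the entire packet is allocated so it can be copied wholesale.
 */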
static mblk_t *
viona_tx_alloc_headers(viona_vring_t *ring, uint16_t cookie, viona_desb_t **dpp,
    uint32_t len)
{
	ASSERT3P(*dpp, ==, NULL);

	mblk_t *mp = NULL;
	const size_t header_pad = ring->vr_tx.vrt_header_pad;

	if (ring->vr_tx.vrt_desb != NULL) {
		viona_desb_t *dp = &ring->vr_tx.vrt_desb[cookie];
		const size_t header_sz = VIONA_MAX_HDRS_LEN + header_pad;

		/*
		 * If the guest driver is operating properly, each desb slot
		 * should be available for use when processing a TX descriptor
		 * from the 'avail' ring.  In the case of drivers that reuse a
		 * descriptor before it has been posted to the 'used' ring, the
		 * data is simply dropped.
		 */
		if (atomic_cas_uint(&dp->d_ref, 0, 1) != 0) {
			return (NULL);
		}

		dp->d_cookie = cookie;
		mp = desballoc(dp->d_headers, header_sz, 0, &dp->d_frtn);

		if (mp != NULL) {
			/*
			 * Account for the successful desballoc, and communicate
			 * out the desb handle for subsequent use
			 */
			dp->d_ref++;
			*dpp = dp;
		} else {
			/* Reset the desb back to its "available" state */
			dp->d_ref = 0;
		}
	} else {
		/*
		 * If we are going to be copying the entire packet, we might as
		 * well allocate for it all in one go.
		 */
		mp = allocb(len + header_pad, 0);
	}

	/* Push pointers forward to account for requested header padding */
	if (mp != NULL && header_pad != 0) {
		mp->b_rptr = mp->b_wptr = (DB_BASE(mp) + header_pad);
	}

	return (mp);
}

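/*
 * Copy the packet headers (or, when copying is forced, the entire packet) from
 * the guest buffers into mp, and parse the result into meoi.  Returns B_FALSE
 * if the L2 header cannot be parsed or allocation for the packet remainder
 * fails.
 */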
static boolean_t
viona_tx_copy_headers(viona_vring_t *ring, iov_bunch_t *iob, mblk_t *mp,
    mac_ether_offload_info_t *meoi)
{
	ASSERT(mp->b_cont == NULL);

	if (ring->vr_tx.vrt_desb == NULL) {
		/*
		 * If not using guest data loaning through the desb, then we
		 * expect viona_tx_alloc_headers() to have allocated space for
		 * the entire packet, which we should copy now.
		 */
		const uint32_t pkt_size = iob->ib_remain;

		VERIFY(MBLKTAIL(mp) >= pkt_size);
		VERIFY(iov_bunch_copy(iob, mp->b_wptr, pkt_size));
		mp->b_wptr += pkt_size;
		mac_ether_offload_info(mp, meoi);
		return (B_TRUE);
	}

	/*
	 * We want to maximize the amount of guest data we loan when performing
	 * packet transmission, with the caveat that we must copy the packet
	 * headers to prevent TOCTOU issues.
	 */
	const uint32_t copy_sz = MIN(iob->ib_remain, MBLKTAIL(mp));

	VERIFY(iov_bunch_copy(iob, mp->b_wptr, copy_sz));
	mp->b_wptr += copy_sz;

	if (iob->ib_remain == 0) {
		mac_ether_offload_info(mp, meoi);
		return (B_TRUE);
	}

	mac_ether_offload_info(mp, meoi);
	if ((meoi->meoi_flags & MEOI_L2INFO_SET) == 0) {
		/* If the L2 header cannot be parsed, give up now */
		return (B_FALSE);
	}
	if ((meoi->meoi_flags & MEOI_L4INFO_SET) != 0) {
		const uint32_t full_hdr_sz =
		    meoi->meoi_l2hlen + meoi->meoi_l3hlen + meoi->meoi_l4hlen;
		if (copy_sz >= full_hdr_sz) {
			/* All headers are already copied */
			return (B_TRUE);
		}
	}

	/*
	 * The full headers do not appear to be along for the ride yet, or the
	 * packet bears a protocol we do not handle.  Just allocate a
	 * buffer and copy the remainder of the packet.
	 */
	const uint32_t remain_sz = iob->ib_remain;
	mblk_t *remain_mp = allocb(remain_sz, 0);
	if (remain_mp == NULL) {
		return (B_FALSE);
	}
	VERIFY(iov_bunch_copy(iob, remain_mp->b_wptr, remain_sz));
	remain_mp->b_wptr += remain_sz;
	mp->b_cont = remain_mp;
	/* Refresh header info now that we have copied the rest */
	mac_ether_offload_info(mp, meoi);

	return (B_TRUE);
}

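/*
 * Transmit a single packet: pop a descriptor chain from the 'avail' ring,
 * stage its headers (and, if copying, its data) into mblks, apply any
 * requested checksum/LSO offloads, and hand the result to MAC.
 */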
static void
viona_tx(viona_link_t *link, viona_vring_t *ring)
{
	struct iovec		*iov = ring->vr_tx.vrt_iov;
	const uint_t		max_segs = ring->vr_tx.vrt_iov_cnt;
	uint16_t		cookie;
	vmm_page_t		*pages = NULL;
	uint32_t		total_len;
	mblk_t			*mp_head = NULL;
	viona_desb_t		*dp = NULL;
	const boolean_t merge_enabled =
	    ((link->l_features & VIRTIO_NET_F_MRG_RXBUF) != 0);

	ASSERT(iov != NULL);

	const int n = vq_popchain(ring, iov, max_segs, &cookie, &pages,
	    &total_len);
	if (n == 0) {
		VIONA_PROBE1(tx_absent, viona_vring_t *, ring);
		VIONA_RING_STAT_INCR(ring, tx_absent);
		return;
	} else if (n < 0) {
		/*
		 * Any error encountered in vq_popchain has already resulted in
		 * specific probe and statistic handling.  Further action here
		 * is unnecessary.
		 */
		return;
	}

	/*
	 * Get setup to copy the VirtIO header from in front of the packet.
	 *
	 * With an eye toward supporting VirtIO 1.0 behavior in the future, we
	 * determine the size of the header based on the device state.  This
	 * goes a bit beyond the expectations of legacy VirtIO, where the first
	 * buffer must cover the header and nothing else.
	 */
	iov_bunch_t iob = {
		.ib_iov = iov,
		.ib_remain = total_len,
	};
	struct virtio_net_mrgrxhdr hdr;
	uint32_t vio_hdr_len = 0;
	if (merge_enabled) {
		/*
		 * Presence of the "num_bufs" member is determined by the
		 * merge-rxbuf feature on the device, despite the fact that we
		 * are in transmission context here.
		 */
		vio_hdr_len = sizeof (struct virtio_net_mrgrxhdr);
	} else {
		vio_hdr_len = sizeof (struct virtio_net_hdr);
		/*
		 * We ignore "num_bufs" from the guest anyways, but zero it out
		 * just in case.
		 */
		hdr.vrh_bufs = 0;
	}
	uint32_t pkt_len = 0;
	if (!iov_bunch_copy(&iob, &hdr, vio_hdr_len)) {
		goto drop_fail;
	}

	pkt_len = total_len - vio_hdr_len;
	if (pkt_len > VIONA_MAX_PACKET_SIZE ||
	    pkt_len < sizeof (struct ether_header)) {
		goto drop_fail;
	}

	mp_head = viona_tx_alloc_headers(ring, cookie, &dp, pkt_len);
	if (mp_head == NULL) {
		goto drop_fail;
	}

	/*
	 * Copy the packet headers (L2 through L4, if present) to prevent
	 * TOCTOU attacks in any subsequent consumers of that data.
	 */
	mac_ether_offload_info_t meoi = { 0 };
	if (!viona_tx_copy_headers(ring, &iob, mp_head, &meoi)) {
		goto drop_fail;
	}

	if (dp != NULL && iob.ib_remain != 0) {
		/*
		 * If this device is loaning guest memory, rather than copying
		 * the entire body of the packet, we may need to establish mblks
		 * for the remaining data-to-be-loaned after the header copy.
		 */
		uint32_t chunk_sz;
		caddr_t chunk;
		mblk_t *mp_tail = mp_head;

		/*
		 * Ensure that our view of the tail is accurate in the rare case
		 * that the header allocation/copying logic has already resulted
		 * in a chained mblk.
		 */
		while (mp_tail->b_cont != NULL) {
			mp_tail = mp_tail->b_cont;
		}

		while (iov_bunch_next_chunk(&iob, &chunk, &chunk_sz)) {
			mblk_t *mp = desballoc((uchar_t *)chunk, chunk_sz, 0,
			    &dp->d_frtn);
			if (mp == NULL) {
				goto drop_fail;
			}

			mp->b_wptr += chunk_sz;
			dp->d_ref++;
			mp_tail->b_cont = mp;
			mp_tail = mp;
		}
	} else {
		/* The copy-everything strategy should be done by now */
		VERIFY0(iob.ib_remain);
	}

	if (VNETHOOK_INTERESTED_OUT(link->l_neti)) {
		/*
		 * The hook consumer may elect to free the mblk_t and set
		 * our mblk_t ** to NULL.  When using a viona_desb_t
		 * (dp != NULL), we do not want the corresponding cleanup to
		 * occur during the viona_hook() call. We instead want to
		 * reset and recycle dp for future use.  To prevent cleanup
		 * during the viona_hook() call, we take a ref on dp (if being
		 * used), and release it on success.  On failure, the
		 * freemsgchain() call will release all the refs taken earlier
		 * in viona_tx() (aside from the initial ref and the one we
		 * take), and drop_hook will reset dp for reuse.
		 */
		if (dp != NULL)
			dp->d_ref++;

		/*
		 * Pass &mp instead of &mp_head so we don't lose track of
		 * mp_head if the hook consumer (i.e. ipf) elects to free mp
		 * and set mp to NULL.
		 */
		mblk_t *mp = mp_head;
		if (viona_hook(link, ring, &mp, B_TRUE) != 0) {
			if (mp != NULL)
				freemsgchain(mp);
			goto drop_hook;
		}

		if (dp != NULL) {
			dp->d_ref--;

			/*
			 * It is possible that the hook(s) accepted the packet,
			 * but as part of its processing, it issued a pull-up
			 * which released all references to the desb.  In that
			 * case, go back to acting like the packet is entirely
			 * copied (which it is).
			 */
			if (dp->d_ref == 1) {
				dp->d_cookie = 0;
				dp->d_ref = 0;
				dp = NULL;
			}
		}
	}

	/*
	 * Translate request for offloaded checksumming. If the guest sent an
	 * LSO packet then it must have also negotiated and requested partial
	 * checksum; therefore the LSO logic is contained within
	 * viona_tx_offloads().
	 */
	if ((link->l_features & VIRTIO_NET_F_CSUM) != 0 &&
	    (hdr.vrh_flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) != 0) {
		if (!viona_tx_offloads(ring, &hdr, &meoi, mp_head, pkt_len)) {
			/*
			 * If processing of any checksum offload request fails,
			 * we can still pass the packet on for transmission.
			 * Even with this best-effort behavior, which may in
			 * fact succeed in the end, we record it as an error.
			 */
			viona_ring_stat_error(ring);
		}
	}

	if (dp != NULL) {
		/*
		 * Record the info required to record this descriptor in the
		 * used ring once its transmission has completed.
		 */
		dp->d_len = total_len;
		dp->d_pages = pages;
		mutex_enter(&ring->vr_lock);
		ring->vr_xfer_outstanding++;
		mutex_exit(&ring->vr_lock);
	} else {
		/*
		 * If the data was cloned out of the ring, the descriptors can
		 * be marked as 'used' now, rather than deferring that action
		 * until after successful packet transmission.
		 */
		vmm_drv_page_release_chain(pages);
		viona_tx_done(ring, total_len, cookie);
	}

	/*
	 * From viona's point of view, this is a successful transmit, even if
	 * something downstream decides to drop the packet.
	 */
	viona_ring_stat_accept(ring, pkt_len);

	/*
	 * We're potentially going deep into the networking layer; make sure the
	 * guest can't run concurrently.
	 */
	smt_begin_unsafe();
	/*
	 * Ignore, for now, any signal from MAC about whether the outgoing
	 * packet was dropped or not.
	 */
	(void) mac_tx(link->l_mch, mp_head, 0, MAC_DROP_ON_NO_DESC, NULL);
	smt_end_unsafe();
	return;

drop_fail:
	/*
	 * On the off chance that memory is not available via the desballoc or
	 * allocb calls, there are few options left besides to fail and drop
	 * the frame on the floor.
	 *
	 * First account for it in the error stats.
	 */
	viona_ring_stat_error(ring);

	if (dp != NULL) {
		/*
		 * Take an additional reference on the desb handle (if present)
		 * so any desballoc-sourced mblks can release their hold on it
		 * without the handle reaching its final state and executing
		 * its clean-up logic.
		 */
		dp->d_ref++;
	}

	/*
	 * Free any already-allocated blocks and sum up the total length of the
	 * dropped data to be released to the used ring.
	 */
	freemsgchain(mp_head);

drop_hook:
	if (dp != NULL) {
		VERIFY(dp->d_ref == 2);

		/* Clean up the desb handle, releasing the extra hold. */
		dp->d_len = 0;
		dp->d_cookie = 0;
		dp->d_ref = 0;
	}

	/* Count in the stats as a drop, rather than an error */
	viona_ring_stat_drop(ring);

	VIONA_PROBE3(tx_drop, viona_vring_t *, ring, uint32_t, pkt_len,
	    uint16_t, cookie);
	vmm_drv_page_release_chain(pages);
	viona_tx_done(ring, total_len, cookie);
}
949