xref: /illumos-gate/usr/src/uts/intel/io/viona/viona_tx.c (revision badf94ff3599fab15963f6c532929e9bc411757a)
1 /*
2  * Copyright (c) 2013  Chris Torek <torek @ torek net>
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  *
14  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24  * SUCH DAMAGE.
25  */
26 /*
27  * This file and its contents are supplied under the terms of the
28  * Common Development and Distribution License ("CDDL"), version 1.0.
29  * You may only use this file in accordance with the terms of version
30  * 1.0 of the CDDL.
31  *
32  * A full copy of the text of the CDDL should have accompanied this
33  * source.  A copy of the CDDL is also available via the Internet at
34  * http://www.illumos.org/license/CDDL.
35  *
36  * Copyright 2015 Pluribus Networks Inc.
37  * Copyright 2019 Joyent, Inc.
38  * Copyright 2021 Oxide Computer Company
39  */
40 
41 
42 #include <sys/types.h>
43 #include <sys/smt.h>
44 #include <sys/strsubr.h>
45 
46 #include <sys/pattr.h>
47 #include <sys/dlpi.h>
48 #include <inet/ip.h>
49 #include <inet/ip_impl.h>
50 
51 #include "viona_impl.h"
52 
/* Driver name looked up by viona_tx_copy_needed() to decide on TX copying. */
53 #define	BNXE_NIC_DRIVER		"bnxe"
54 
55 /*
56  * Cached decision on whether TX data must be fully copied out of the virtio
57  * ring rather than loaned to the underlying device, which would require
 * waiting for packet transmission to complete before resources are freed.
 *
 * The state starts out uninitialized and is resolved on first use by
 * viona_tx_copy_needed() while holding viona_force_copy_lock.
58  */
59 kmutex_t viona_force_copy_lock;
60 static enum viona_force_copy {
61 	VFC_UNINITALIZED	= 0,	/* decision not yet made */
62 	VFC_COPY_UNEEDED	= 1,	/* guest memory may be loaned */
63 	VFC_COPY_REQUIRED	= 2,	/* every TX packet must be copied */
64 } viona_force_copy_state = VFC_UNINITALIZED;
65 
/*
 * Per-descriptor bookkeeping used when TX data is loaned to the network stack
 * through desballoc() instead of being copied.  One entry is allocated per
 * ring slot by viona_tx_ring_alloc() and indexed by descriptor cookie in
 * viona_tx().
 */
66 struct viona_desb {
67 	frtn_t			d_frtn;		/* free routine for desballoc */
68 	viona_vring_t		*d_ring;	/* ring this entry belongs to */
69 	uint_t			d_ref;		/* 0 when free for reuse */
70 	uint32_t		d_len;		/* length pushed to used ring */
71 	uint16_t		d_cookie;	/* descriptor chain cookie */
72 	uchar_t			*d_headers;	/* VIONA_MAX_HDRS_LEN buffer */
73 	vmm_page_t		*d_pages;	/* page chain held for the xfer */
74 };
75 
/*
 * Forward declarations: viona_tx() is called from viona_worker_tx() and
 * viona_desb_release() is installed as a free routine in
 * viona_tx_ring_alloc(), both ahead of the respective definitions.
 */
76 static void viona_tx(viona_link_t *, viona_vring_t *);
77 static void viona_desb_release(viona_desb_t *);
78 
79 
80 static void
81 viona_tx_wait_outstanding(viona_vring_t *ring)
82 {
83 	ASSERT(MUTEX_HELD(&ring->vr_lock));
84 
85 	while (ring->vr_xfer_outstanding != 0) {
86 		/*
87 		 * Paying heed to signals is counterproductive here.  This is a
88 		 * very tight loop if pending transfers take an extended amount
89 		 * of time to be reclaimed while the host process is exiting.
90 		 */
91 		cv_wait(&ring->vr_cv, &ring->vr_lock);
92 	}
93 }
94 
95 /*
96  * Check if full TX packet copying is needed.  This should not be called from
97  * viona attach()/detach() context.
98  */
99 static boolean_t
100 viona_tx_copy_needed(void)
101 {
102 	boolean_t result;
103 
104 	mutex_enter(&viona_force_copy_lock);
105 	if (viona_force_copy_state == VFC_UNINITALIZED) {
106 		major_t bnxe_major;
107 
108 		/*
109 		 * The original code for viona featured an explicit check for
110 		 * the bnxe driver which, when found present, necessitated that
111 		 * all transmissions be copied into their own mblks instead of
112 		 * passing guest memory to the underlying device.
113 		 *
114 		 * The motivations for this are unclear, but until it can be
115 		 * proven unnecessary, the check lives on.
116 		 */
117 		viona_force_copy_state = VFC_COPY_UNEEDED;
118 		if ((bnxe_major = ddi_name_to_major(BNXE_NIC_DRIVER))
119 		    != DDI_MAJOR_T_NONE) {
120 			if (ddi_hold_installed_driver(bnxe_major) != NULL) {
121 				viona_force_copy_state = VFC_COPY_REQUIRED;
122 				ddi_rele_driver(bnxe_major);
123 			}
124 		}
125 	}
126 	result = (viona_force_copy_state == VFC_COPY_REQUIRED);
127 	mutex_exit(&viona_force_copy_lock);
128 
129 	return (result);
130 }
131 
132 void
133 viona_tx_ring_alloc(viona_vring_t *ring, const uint16_t qsz)
134 {
135 	/* Allocate desb handles for TX ring if packet copying not disabled */
136 	if (!viona_tx_copy_needed()) {
137 		viona_desb_t *dp;
138 
139 		dp = kmem_zalloc(sizeof (viona_desb_t) * qsz, KM_SLEEP);
140 		ring->vr_txdesb = dp;
141 		for (uint_t i = 0; i < qsz; i++, dp++) {
142 			dp->d_frtn.free_func = viona_desb_release;
143 			dp->d_frtn.free_arg = (void *)dp;
144 			dp->d_ring = ring;
145 			dp->d_headers = kmem_zalloc(VIONA_MAX_HDRS_LEN,
146 			    KM_SLEEP);
147 		}
148 	}
149 
150 	/* Allocate ring-sized iovec buffers for TX */
151 	ring->vr_txiov = kmem_alloc(sizeof (struct iovec) * qsz, KM_SLEEP);
152 }
153 
154 void
155 viona_tx_ring_free(viona_vring_t *ring, const uint16_t qsz)
156 {
157 	if (ring->vr_txdesb != NULL) {
158 		viona_desb_t *dp = ring->vr_txdesb;
159 
160 		for (uint_t i = 0; i < qsz; i++, dp++) {
161 			kmem_free(dp->d_headers, VIONA_MAX_HDRS_LEN);
162 		}
163 		kmem_free(ring->vr_txdesb, sizeof (viona_desb_t) * qsz);
164 		ring->vr_txdesb = NULL;
165 	}
166 
167 	if (ring->vr_txiov != NULL) {
168 		kmem_free(ring->vr_txiov, sizeof (struct iovec) * qsz);
169 		ring->vr_txiov = NULL;
170 	}
171 }
172 
173 static void
174 viona_tx_done(viona_vring_t *ring, uint32_t len, uint16_t cookie)
175 {
176 	vq_pushchain(ring, len, cookie);
177 
178 	membar_enter();
179 	viona_intr_ring(ring, B_FALSE);
180 }
181 
182 void
183 viona_worker_tx(viona_vring_t *ring, viona_link_t *link)
184 {
185 	proc_t *p = ttoproc(curthread);
186 
187 	(void) thread_vsetname(curthread, "viona_tx_%p", ring);
188 
189 	ASSERT(MUTEX_HELD(&ring->vr_lock));
190 	ASSERT3U(ring->vr_state, ==, VRS_RUN);
191 
192 	mutex_exit(&ring->vr_lock);
193 
194 	for (;;) {
195 		boolean_t bail = B_FALSE;
196 		boolean_t renew = B_FALSE;
197 		uint_t ntx = 0;
198 
199 		viona_ring_disable_notify(ring);
200 		while (viona_ring_num_avail(ring)) {
201 			viona_tx(link, ring);
202 
203 			/*
204 			 * It is advantageous for throughput to keep this
205 			 * transmission loop tight, but periodic breaks to
206 			 * check for other events are of value too.
207 			 */
208 			if (ntx++ >= ring->vr_size)
209 				break;
210 		}
211 		viona_ring_enable_notify(ring);
212 
213 		VIONA_PROBE2(tx, viona_link_t *, link, uint_t, ntx);
214 
215 		/*
216 		 * Check for available descriptors on the ring once more in
217 		 * case a late addition raced with the NO_NOTIFY flag toggle.
218 		 *
219 		 * The barrier ensures that visibility of the no-notify
220 		 * store does not cross the viona_ring_num_avail() check below.
221 		 */
222 		membar_enter();
223 		bail = VRING_NEED_BAIL(ring, p);
224 		renew = vmm_drv_lease_expired(ring->vr_lease);
225 		if (!bail && !renew && viona_ring_num_avail(ring)) {
226 			continue;
227 		}
228 
229 		if ((link->l_features & VIRTIO_F_RING_NOTIFY_ON_EMPTY) != 0) {
230 			/*
231 			 * The NOTIFY_ON_EMPTY interrupt should not pay heed to
232 			 * the presence of AVAIL_NO_INTERRUPT.
233 			 */
234 			viona_intr_ring(ring, B_TRUE);
235 		}
236 
237 		mutex_enter(&ring->vr_lock);
238 
239 		while (!bail && !renew && !viona_ring_num_avail(ring)) {
240 			(void) cv_wait_sig(&ring->vr_cv, &ring->vr_lock);
241 			bail = VRING_NEED_BAIL(ring, p);
242 			renew = vmm_drv_lease_expired(ring->vr_lease);
243 		}
244 
245 		if (bail) {
246 			break;
247 		} else if (renew) {
248 			ring->vr_state_flags |= VRSF_RENEW;
249 			/*
250 			 * When renewing the lease for the ring, no TX
251 			 * frames may be outstanding, as they contain
252 			 * references to guest memory.
253 			 */
254 			viona_tx_wait_outstanding(ring);
255 
256 			if (!viona_ring_lease_renew(ring)) {
257 				break;
258 			}
259 			ring->vr_state_flags &= ~VRSF_RENEW;
260 		}
261 		mutex_exit(&ring->vr_lock);
262 	}
263 
264 	ASSERT(MUTEX_HELD(&ring->vr_lock));
265 
266 	ring->vr_state = VRS_STOP;
267 	viona_tx_wait_outstanding(ring);
268 }
269 
270 static void
271 viona_desb_release(viona_desb_t *dp)
272 {
273 	viona_vring_t *ring = dp->d_ring;
274 	uint_t ref;
275 	uint32_t len;
276 	uint16_t cookie;
277 
278 	ref = atomic_dec_uint_nv(&dp->d_ref);
279 	if (ref > 1) {
280 		return;
281 	}
282 
283 	/*
284 	 * The desb corresponding to this index must be ready for reuse before
285 	 * the descriptor is returned to the guest via the 'used' ring.
286 	 */
287 	len = dp->d_len;
288 	cookie = dp->d_cookie;
289 	dp->d_len = 0;
290 	dp->d_cookie = 0;
291 	vmm_drv_page_release_chain(dp->d_pages);
292 	dp->d_pages = NULL;
293 
294 	/*
295 	 * Ensure all other changes to the desb are visible prior to zeroing its
296 	 * refcount, signifying its readiness for reuse.
297 	 */
298 	membar_exit();
299 	dp->d_ref = 0;
300 
301 	viona_tx_done(ring, len, cookie);
302 
303 	mutex_enter(&ring->vr_lock);
304 	if ((--ring->vr_xfer_outstanding) == 0) {
305 		cv_broadcast(&ring->vr_cv);
306 	}
307 	mutex_exit(&ring->vr_lock);
308 }
309 
310 static boolean_t
311 viona_tx_csum(viona_vring_t *ring, const struct virtio_net_hdr *hdr,
312     mblk_t *mp, uint32_t len)
313 {
314 	viona_link_t *link = ring->vr_link;
315 	const struct ether_header *eth;
316 	uint_t eth_len = sizeof (struct ether_header);
317 	ushort_t ftype;
318 	ipha_t *ipha = NULL;
319 	uint8_t ipproto = IPPROTO_NONE; /* NONE is not exactly right, but ok */
320 	uint16_t flags = 0;
321 	const uint_t csum_start = hdr->vrh_csum_start;
322 	const uint_t csum_stuff = hdr->vrh_csum_offset + csum_start;
323 
324 	/*
325 	 * Validate that the checksum offsets provided by the guest are within
326 	 * the bounds of the packet.  Additionally, ensure that the checksum
327 	 * contents field is within the headers mblk copied by viona_tx().
328 	 */
329 	if (csum_start >= len || csum_start < eth_len || csum_stuff >= len ||
330 	    (csum_stuff + sizeof (uint16_t)) > MBLKL(mp)) {
331 		VIONA_PROBE2(fail_hcksum, viona_link_t *, link, mblk_t *, mp);
332 		VIONA_RING_STAT_INCR(ring, fail_hcksum);
333 		return (B_FALSE);
334 	}
335 
336 	/*
337 	 * This is guaranteed to be safe thanks to the header copying
338 	 * done in viona_tx().
339 	 */
340 	eth = (const struct ether_header *)mp->b_rptr;
341 	ftype = ntohs(eth->ether_type);
342 
343 	if (ftype == ETHERTYPE_VLAN) {
344 		const struct ether_vlan_header *veth;
345 
346 		/* punt on QinQ for now */
347 		eth_len = sizeof (struct ether_vlan_header);
348 		veth = (const struct ether_vlan_header *)eth;
349 		ftype = ntohs(veth->ether_type);
350 	}
351 
352 	if (ftype == ETHERTYPE_IP) {
353 		ipha = (ipha_t *)(mp->b_rptr + eth_len);
354 
355 		ipproto = ipha->ipha_protocol;
356 	} else if (ftype == ETHERTYPE_IPV6) {
357 		ip6_t *ip6h = (ip6_t *)(mp->b_rptr + eth_len);
358 
359 		ipproto = ip6h->ip6_nxt;
360 	}
361 
362 	/*
363 	 * We ignore hdr_len because the spec says it can't be
364 	 * trusted. Besides, our own stack will determine the header
365 	 * boundary.
366 	 */
367 	if ((link->l_cap_csum & HCKSUM_INET_PARTIAL) != 0 &&
368 	    (hdr->vrh_gso_type & VIRTIO_NET_HDR_GSO_TCPV4) != 0 &&
369 	    ftype == ETHERTYPE_IP) {
370 		uint16_t	*cksump;
371 		uint32_t	cksum;
372 		ipaddr_t	src = ipha->ipha_src;
373 		ipaddr_t	dst = ipha->ipha_dst;
374 
375 		/*
376 		 * Our native IP stack doesn't set the L4 length field
377 		 * of the pseudo header when LSO is in play. Other IP
378 		 * stacks, e.g. Linux, do include the length field.
379 		 * This is a problem because the hardware expects that
380 		 * the length field is not set. When it is set it will
381 		 * cause an incorrect TCP checksum to be generated.
382 		 * The reason this works in Linux is because Linux
383 		 * corrects the pseudo-header checksum in the driver
384 		 * code. In order to get the correct HW checksum we
385 		 * need to assume the guest's IP stack gave us a bogus
386 		 * TCP partial checksum and calculate it ourselves.
387 		 */
388 		cksump = IPH_TCPH_CHECKSUMP(ipha, IPH_HDR_LENGTH(ipha));
389 		cksum = IP_TCP_CSUM_COMP;
390 		cksum += (dst >> 16) + (dst & 0xFFFF) +
391 		    (src >> 16) + (src & 0xFFFF);
392 		cksum = (cksum & 0xFFFF) + (cksum >> 16);
393 		*(cksump) = (cksum & 0xFFFF) + (cksum >> 16);
394 
395 		/*
396 		 * Since viona is a "legacy device", the data stored
397 		 * by the driver will be in the guest's native endian
398 		 * format (see sections 2.4.3 and 5.1.6.1 of the
399 		 * VIRTIO 1.0 spec for more info). At this time the
400 		 * only guests using viona are x86 and we can assume
401 		 * little-endian.
402 		 */
403 		lso_info_set(mp, LE_16(hdr->vrh_gso_size), HW_LSO);
404 
405 		/*
406 		 * Hardware, like ixgbe, expects the client to request
407 		 * IP header checksum offload if it's sending LSO (see
408 		 * ixgbe_get_context()). Unfortunately, virtio makes
409 		 * no allowances for negotiating IP header checksum
410 		 * and HW offload, only TCP checksum. We add the flag
411 		 * and zero-out the checksum field. This mirrors the
412 		 * behavior of our native IP stack (which does this in
413 		 * the interest of HW that expects the field to be
414 		 * zero).
415 		 */
416 		flags |= HCK_IPV4_HDRCKSUM;
417 		ipha->ipha_hdr_checksum = 0;
418 	}
419 
420 	/*
421 	 * Use DB_CKSUMFLAGS instead of mac_hcksum_get() to make sure
422 	 * HW_LSO, if present, is not lost.
423 	 */
424 	flags |= DB_CKSUMFLAGS(mp);
425 
426 	/*
427 	 * Partial checksum support from the NIC is ideal, since it most
428 	 * closely maps to the interface defined by virtio.
429 	 */
430 	if ((link->l_cap_csum & HCKSUM_INET_PARTIAL) != 0 &&
431 	    (ipproto == IPPROTO_TCP || ipproto == IPPROTO_UDP)) {
432 		/*
433 		 * MAC expects these offsets to be relative to the
434 		 * start of the L3 header rather than the L2 frame.
435 		 */
436 		flags |= HCK_PARTIALCKSUM;
437 		mac_hcksum_set(mp, csum_start - eth_len, csum_stuff - eth_len,
438 		    len - eth_len, 0, flags);
439 		return (B_TRUE);
440 	}
441 
442 	/*
443 	 * Without partial checksum support, look to the L3/L4 protocol
444 	 * information to see if the NIC can handle it.  If not, the
445 	 * checksum will need to calculated inline.
446 	 */
447 	if (ftype == ETHERTYPE_IP) {
448 		if ((link->l_cap_csum & HCKSUM_INET_FULL_V4) != 0 &&
449 		    (ipproto == IPPROTO_TCP || ipproto == IPPROTO_UDP)) {
450 			uint16_t *csump = (uint16_t *)(mp->b_rptr + csum_stuff);
451 			*csump = 0;
452 			flags |= HCK_FULLCKSUM;
453 			mac_hcksum_set(mp, 0, 0, 0, 0, flags);
454 			return (B_TRUE);
455 		}
456 
457 		/* XXX: Implement manual fallback checksumming? */
458 		VIONA_PROBE2(fail_hcksum, viona_link_t *, link, mblk_t *, mp);
459 		VIONA_RING_STAT_INCR(ring, fail_hcksum);
460 		return (B_FALSE);
461 	} else if (ftype == ETHERTYPE_IPV6) {
462 		if ((link->l_cap_csum & HCKSUM_INET_FULL_V6) != 0 &&
463 		    (ipproto == IPPROTO_TCP || ipproto == IPPROTO_UDP)) {
464 			uint16_t *csump = (uint16_t *)(mp->b_rptr + csum_stuff);
465 			*csump = 0;
466 			flags |= HCK_FULLCKSUM;
467 			mac_hcksum_set(mp, 0, 0, 0, 0, flags);
468 			return (B_TRUE);
469 		}
470 
471 		/* XXX: Implement manual fallback checksumming? */
472 		VIONA_PROBE2(fail_hcksum6, viona_link_t *, link, mblk_t *, mp);
473 		VIONA_RING_STAT_INCR(ring, fail_hcksum6);
474 		return (B_FALSE);
475 	}
476 
477 	/* Cannot even emulate hcksum for unrecognized protocols */
478 	VIONA_PROBE2(fail_hcksum_proto, viona_link_t *, link, mblk_t *, mp);
479 	VIONA_RING_STAT_INCR(ring, fail_hcksum_proto);
480 	return (B_FALSE);
481 }
482 
/*
 * Process a single descriptor chain from the TX ring.
 *
 * The chain is popped with vq_popchain(); its first iovec must hold the
 * virtio net header.  Up to VIONA_MAX_HDRS_LEN bytes of the remaining data
 * are always copied into the head mblk.  What is left is either loaned to the
 * stack through desballoc() (when the ring has desb handles) or copied into
 * allocb() buffers.  The packet is then offered to any outbound hooks, has
 * checksum offload set up if the guest asked for it, and is passed to
 * mac_tx().
 *
 * On failure the packet is dropped and the chain is returned to the guest
 * immediately via viona_tx_done().
 */
483 static void
484 viona_tx(viona_link_t *link, viona_vring_t *ring)
485 {
486 	struct iovec		*iov = ring->vr_txiov;
487 	const uint_t		max_segs = ring->vr_size;
488 	uint16_t		cookie;
489 	int			i, n;
490 	uint32_t		len, base_off = 0;
491 	uint32_t		min_copy = VIONA_MAX_HDRS_LEN;
492 	mblk_t			*mp_head, *mp_tail, *mp;
493 	viona_desb_t		*dp = NULL;
494 	mac_client_handle_t	link_mch = link->l_mch;
495 	const struct virtio_net_hdr *hdr;
496 	vmm_page_t *pages = NULL;
497 
498 	mp_head = mp_tail = NULL;
499 
500 	ASSERT(iov != NULL);
501 
502 	n = vq_popchain(ring, iov, max_segs, &cookie, &pages);
503 	if (n == 0) {
504 		VIONA_PROBE1(tx_absent, viona_vring_t *, ring);
505 		VIONA_RING_STAT_INCR(ring, tx_absent);
506 		return;
507 	} else if (n < 0) {
508 		/*
509 		 * Any error encountered in vq_popchain has already resulted in
510 		 * specific probe and statistic handling.  Further action here
511 		 * is unnecessary.
512 		 */
513 		return;
514 	}
515 
516 	/* Grab the header and ensure it is of adequate length */
517 	hdr = (const struct virtio_net_hdr *)iov[0].iov_base;
	/*
	 * 'len' starts out as the size of the first (header) iovec and has
	 * the packet data length added to it as mblks are built below.
	 */
518 	len = iov[0].iov_len;
519 	if (len < sizeof (struct virtio_net_hdr)) {
520 		goto drop_fail;
521 	}
522 
523 	/* Make sure the packet headers are always in the first mblk. */
524 	if (ring->vr_txdesb != NULL) {
525 		dp = &ring->vr_txdesb[cookie];
526 
527 		/*
528 		 * If the guest driver is operating properly, each desb slot
529 		 * should be available for use when processing a TX descriptor
530 		 * from the 'avail' ring.  In the case of drivers that reuse a
531 		 * descriptor before it has been posted to the 'used' ring, the
532 		 * data is simply dropped.
533 		 */
534 		if (atomic_cas_uint(&dp->d_ref, 0, 1) != 0) {
535 			dp = NULL;
536 			goto drop_fail;
537 		}
538 
539 		dp->d_cookie = cookie;
540 		mp_head = desballoc(dp->d_headers, VIONA_MAX_HDRS_LEN, 0,
541 		    &dp->d_frtn);
542 
543 		/* Account for the successful desballoc. */
544 		if (mp_head != NULL)
545 			dp->d_ref++;
546 	} else {
547 		mp_head = allocb(VIONA_MAX_HDRS_LEN, 0);
548 	}
549 
550 	if (mp_head == NULL)
551 		goto drop_fail;
552 
553 	mp_tail = mp_head;
554 
555 	/*
556 	 * We always copy enough of the guest data to cover the
557 	 * headers. This protects us from TOCTOU attacks and allows
558 	 * message block length assumptions to be made in subsequent
559 	 * code. In many cases, this means copying more data than
560 	 * strictly necessary. That's okay, as it is the larger packets
561 	 * (such as LSO) that really benefit from desballoc().
562 	 */
563 	for (i = 1; i < n; i++) {
564 		const uint32_t to_copy = MIN(min_copy, iov[i].iov_len);
565 
566 		bcopy(iov[i].iov_base, mp_head->b_wptr, to_copy);
567 		mp_head->b_wptr += to_copy;
568 		len += to_copy;
569 		min_copy -= to_copy;
570 
571 		/*
572 		 * We've met the minimum copy requirement. The rest of
573 		 * the guest data can be referenced.
574 		 */
575 		if (min_copy == 0) {
576 			/*
577 			 * If we copied all contents of this
578 			 * descriptor then move onto the next one.
579 			 * Otherwise, record how far we are into the
580 			 * current descriptor.
581 			 */
582 			if (iov[i].iov_len == to_copy)
583 				i++;
584 			else
585 				base_off = to_copy;
586 
587 			break;
588 		}
589 	}
590 
591 	ASSERT3P(mp_head, !=, NULL);
592 	ASSERT3P(mp_tail, !=, NULL);
593 
	/*
	 * Chain up whatever was not copied into the head mblk, continuing
	 * from the iovec index (and offset within it) where the copy stopped.
	 */
594 	for (; i < n; i++) {
595 		uintptr_t base = (uintptr_t)iov[i].iov_base + base_off;
596 		uint32_t chunk = iov[i].iov_len - base_off;
597 
598 		ASSERT3U(base_off, <, iov[i].iov_len);
599 		ASSERT3U(chunk, >, 0);
600 
601 		if (dp != NULL) {
602 			mp = desballoc((uchar_t *)base, chunk, 0, &dp->d_frtn);
603 			if (mp == NULL) {
604 				goto drop_fail;
605 			}
606 			dp->d_ref++;
607 		} else {
608 			mp = allocb(chunk, BPRI_MED);
609 			if (mp == NULL) {
610 				goto drop_fail;
611 			}
612 			bcopy((uchar_t *)base, mp->b_wptr, chunk);
613 		}
614 
615 		base_off = 0;
616 		len += chunk;
617 		mp->b_wptr += chunk;
618 		mp_tail->b_cont = mp;
619 		mp_tail = mp;
620 	}
621 
622 	if (VNETHOOK_INTERESTED_OUT(link->l_neti)) {
623 		/*
624 		 * The hook consumer may elect to free the mblk_t and set
625 		 * our mblk_t ** to NULL.  When using a viona_desb_t
626 		 * (dp != NULL), we do not want the corresponding cleanup to
627 		 * occur during the viona_hook() call. We instead want to
628 		 * reset and recycle dp for future use.  To prevent cleanup
629 		 * during the viona_hook() call, we take a ref on dp (if being
630 		 * used), and release it on success.  On failure, the
631 		 * freemsgchain() call will release all the refs taken earlier
632 		 * in viona_tx() (aside from the initial ref and the one we
633 		 * take), and drop_hook will reset dp for reuse.
634 		 */
635 		if (dp != NULL)
636 			dp->d_ref++;
637 
638 		/*
639 		 * Pass &mp instead of &mp_head so we don't lose track of
640 		 * mp_head if the hook consumer (i.e. ipf) elects to free mp
641 		 * and set mp to NULL.
642 		 */
643 		mp = mp_head;
644 		if (viona_hook(link, ring, &mp, B_TRUE) != 0) {
645 			if (mp != NULL)
646 				freemsgchain(mp);
647 			goto drop_hook;
648 		}
649 
650 		if (dp != NULL) {
651 			dp->d_ref--;
652 
653 			/*
654 			 * It is possible that the hook(s) accepted the packet,
655 			 * but as part of its processing, it issued a pull-up
656 			 * which released all references to the desb.  In that
657 			 * case, go back to acting like the packet is entirely
658 			 * copied (which it is).
659 			 */
660 			if (dp->d_ref == 1) {
661 				dp->d_cookie = 0;
662 				dp->d_ref = 0;
663 				dp = NULL;
664 			}
665 		}
666 	}
667 
668 	/*
669 	 * Request hardware checksumming, if necessary. If the guest
670 	 * sent an LSO packet then it must have also negotiated and
671 	 * requested partial checksum; therefore the LSO logic is
672 	 * contained within viona_tx_csum().
673 	 */
674 	if ((link->l_features & VIRTIO_NET_F_CSUM) != 0 &&
675 	    (hdr->vrh_flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) != 0) {
		/* The length handed over excludes the virtio net header. */
676 		if (!viona_tx_csum(ring, hdr, mp_head, len - iov[0].iov_len)) {
677 			goto drop_fail;
678 		}
679 	}
680 
681 	if (dp != NULL) {
		/*
		 * Guest memory is on loan: record what viona_desb_release()
		 * will need and count the transfer as outstanding.
		 */
682 		dp->d_len = len;
683 		dp->d_pages = pages;
684 		mutex_enter(&ring->vr_lock);
685 		ring->vr_xfer_outstanding++;
686 		mutex_exit(&ring->vr_lock);
687 	} else {
688 		/*
689 		 * If the data was cloned out of the ring, the descriptors can
690 		 * be marked as 'used' now, rather than deferring that action
691 		 * until after successful packet transmission.
692 		 */
693 		vmm_drv_page_release_chain(pages);
694 		viona_tx_done(ring, len, cookie);
695 	}
696 
697 	/*
698 	 * We're potentially going deep into the networking layer; make sure the
699 	 * guest can't run concurrently.
700 	 */
701 	smt_begin_unsafe();
702 	/*
703 	 * Ignore, for now, any signal from MAC about whether the outgoing
704 	 * packet was dropped or not.
705 	 */
706 	(void) mac_tx(link_mch, mp_head, 0, MAC_DROP_ON_NO_DESC, NULL);
707 	smt_end_unsafe();
708 	return;
709 
710 drop_fail:
711 	/*
712 	 * On the off chance that memory is not available via the desballoc or
713 	 * allocb calls, there are few options left besides to fail and drop
714 	 * the frame on the floor.
715 	 */
716 
717 	if (dp != NULL) {
718 		/*
719 		 * Take an additional reference on the desb handle (if present)
720 		 * so any desballoc-sourced mblks can release their hold on it
721 		 * without the handle reaching its final state and executing
722 		 * its clean-up logic.
723 		 */
724 		dp->d_ref++;
725 	}
726 
727 	/*
728 	 * Free any already-allocated blocks and sum up the total length of the
729 	 * dropped data to be released to the used ring.
730 	 */
731 	freemsgchain(mp_head);
732 
733 drop_hook:
	/*
	 * Recompute 'len' as the sum of every iovec in the chain.
	 * NOTE(review): this loop-scoped 'i' shadows the function-level
	 * 'int i' and is compared against the signed 'n'.
	 */
734 	len = 0;
735 	for (uint_t i = 0; i < n; i++) {
736 		len += iov[i].iov_len;
737 	}
738 
739 	if (dp != NULL) {
		/*
		 * Exactly two references are expected here: the base one from
		 * the atomic_cas_uint() above and the extra hold taken either
		 * at drop_fail or ahead of the viona_hook() call.
		 */
740 		VERIFY(dp->d_ref == 2);
741 
742 		/* Clean up the desb handle, releasing the extra hold. */
743 		dp->d_len = 0;
744 		dp->d_cookie = 0;
745 		dp->d_ref = 0;
746 	}
747 
748 	VIONA_PROBE3(tx_drop, viona_vring_t *, ring, uint32_t, len,
749 	    uint16_t, cookie);
750 	vmm_drv_page_release_chain(pages);
751 	viona_tx_done(ring, len, cookie);
752 }
753