/*
 * Copyright (c) 2013  Chris Torek <torek @ torek net>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source.  A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 *
 * Copyright 2015 Pluribus Networks Inc.
 * Copyright 2019 Joyent, Inc.
 * Copyright 2025 Oxide Computer Company
 * Copyright 2022 Michael Zeller
 * Copyright 2022 OmniOS Community Edition (OmniOSce) Association.
 */

#include <sys/types.h>
#include <sys/strsubr.h>

#include <sys/dlpi.h>
#include <sys/pattr.h>
#include <sys/vlan.h>

#include "viona_impl.h"



#define	VTNET_MAXSEGS		32

/* Min. octets in an ethernet frame minus FCS */
#define	MIN_BUF_SIZE		60
#define	NEED_VLAN_PAD_SIZE	(MIN_BUF_SIZE - VLAN_TAGSZ)
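
/*
 * A frame which arrives exactly VLAN_TAGSZ short of the minimum is assumed to
 * have had its VLAN tag stripped while traversing MAC; NEED_VLAN_PAD_SIZE
 * identifies that case so the preallocated pad mblk below can be appended to
 * bring the frame back up to minimum length.
 */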

static mblk_t *viona_vlan_pad_mp;

void
viona_rx_init(void)
{
	mblk_t *mp;

	ASSERT(viona_vlan_pad_mp == NULL);

	/* Create mblk for padding when VLAN tags are stripped */
	mp = allocb_wait(VLAN_TAGSZ, BPRI_HI, STR_NOSIG, NULL);
	bzero(mp->b_rptr, VLAN_TAGSZ);
	mp->b_wptr += VLAN_TAGSZ;
	viona_vlan_pad_mp = mp;
}

void
viona_rx_fini(void)
{
	mblk_t *mp;

	/* Clean up the VLAN padding mblk */
	mp = viona_vlan_pad_mp;
	viona_vlan_pad_mp = NULL;
	VERIFY(mp != NULL && mp->b_cont == NULL);
	freemsg(mp);
}

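/*
 * Worker thread servicing the RX side of a viona ring.  Inbound frames are
 * delivered asynchronously by MAC via the RX callbacks below, so this thread
 * mostly sleeps, waking only to renew an expired vmm lease or to stop the
 * ring when asked to bail.
 */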
void
viona_worker_rx(viona_vring_t *ring, viona_link_t *link)
{
	(void) thread_vsetname(curthread, "viona_rx_%p", ring);

	ASSERT(MUTEX_HELD(&ring->vr_lock));
	ASSERT3U(ring->vr_state, ==, VRS_RUN);

	viona_ring_disable_notify(ring);

	do {
		if (vmm_drv_lease_expired(ring->vr_lease)) {
			/*
			 * Set the renewal flag, causing incoming traffic to be
			 * dropped, and issue an RX barrier to ensure any
			 * threads in the RX callbacks will have finished.
			 * The vr_lock cannot be held across the barrier as it
			 * poses a deadlock risk.
			 */
			ring->vr_state_flags |= VRSF_RENEW;
			mutex_exit(&ring->vr_lock);
			mac_rx_barrier(link->l_mch);
			mutex_enter(&ring->vr_lock);

			if (!viona_ring_lease_renew(ring)) {
				break;
			}
			ring->vr_state_flags &= ~VRSF_RENEW;
		}

		/*
		 * For now, there is little to do in the RX worker as inbound
		 * data is delivered by MAC via the RX callbacks.  If tap-like
		 * functionality is added later, this would be a convenient
		 * place to inject frames into the guest.
		 */
		(void) cv_wait_sig(&ring->vr_cv, &ring->vr_lock);
	} while (!vring_need_bail(ring));

	ring->vr_state = VRS_STOP;

	/*
	 * The RX ring is stopping; before we begin tearing it down, it is
	 * imperative that we perform an RX barrier so that incoming packets
	 * are dropped at viona_rx_classified().
	 */
	mutex_exit(&ring->vr_lock);
	mac_rx_barrier(link->l_mch);
	mutex_enter(&ring->vr_lock);

	/*
	 * If we bailed while renewing the ring lease, we cannot reset
	 * USED_NO_NOTIFY, since we lack a valid mapping to do so.
	 */
	if (ring->vr_lease != NULL) {
		viona_ring_enable_notify(ring);
	}
}

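/*
 * Copy data from an mblk_t chain into a guest buffer.
 *
 * The first `seek` bytes of the chain are skipped (data already copied into
 * earlier buffers), and up to `len` bytes are copied into `buf`.  The number
 * of bytes actually copied is returned, and `*end` is set once the end of the
 * chain has been reached.
 *
 * The RX paths below drive it roughly like this (simplified sketch, omitting
 * the virtio header handling in the first buffer):
 *
 *	size_t copied = 0;
 *	boolean_t end = B_FALSE;
 *	for (int i = 0; i < n && !end; i++) {
 *		copied += viona_copy_mblk(mp, copied, iov[i].iov_base,
 *		    iov[i].iov_len, &end);
 *	}
 */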
static size_t
viona_copy_mblk(const mblk_t *mp, size_t seek, caddr_t buf, size_t len,
    boolean_t *end)
{
	size_t copied = 0;
	size_t off = 0;

	/* Seek past already-consumed data */
	while (seek > 0 && mp != NULL) {
		const size_t chunk = MBLKL(mp);

		if (chunk > seek) {
			off = seek;
			break;
		}
		mp = mp->b_cont;
		seek -= chunk;
	}

	while (mp != NULL) {
		const size_t chunk = MBLKL(mp) - off;
		const size_t to_copy = MIN(chunk, len);

		bcopy(mp->b_rptr + off, buf, to_copy);
		copied += to_copy;
		buf += to_copy;
		len -= to_copy;

		/*
		 * If all the remaining data in the mblk_t was copied, move on
		 * to the next one in the chain.  Any seek offset applied to
		 * the first mblk copy is zeroed out for subsequent operations.
		 */
		if (chunk == to_copy) {
			mp = mp->b_cont;
			off = 0;
		}
#ifdef DEBUG
		else {
			/*
			 * The only valid reason for the copy to consume less
			 * than the entire contents of the mblk_t is because
			 * the output buffer has been filled.
			 */
			ASSERT0(len);
		}
#endif

		/* Go no further if the buffer has been filled */
		if (len == 0) {
			break;
		}

	}
	*end = (mp == NULL);
	return (copied);
}

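/*
 * Receive a frame into the guest when merged RX buffers have not been
 * negotiated: the entire frame, prefixed by a zeroed virtio_net_hdr, must fit
 * within a single descriptor chain popped from the ring.
 */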
static int
viona_recv_plain(viona_vring_t *ring, const mblk_t *mp, size_t msz)
{
	struct iovec iov[VTNET_MAXSEGS];
	uint16_t cookie;
	int n;
	const size_t hdr_sz = sizeof (struct virtio_net_hdr);
	struct virtio_net_hdr *hdr;
	size_t len, copied = 0;
	caddr_t buf = NULL;
	boolean_t end = B_FALSE;
	const uint32_t features = ring->vr_link->l_features;
	vmm_page_t *pages = NULL;

	ASSERT(msz >= MIN_BUF_SIZE);

	n = vq_popchain(ring, iov, VTNET_MAXSEGS, &cookie, &pages, NULL);
	if (n <= 0) {
		/* Without available buffers, the frame must be dropped. */
		return (ENOSPC);
	}
	if (iov[0].iov_len < hdr_sz) {
		/*
		 * There is little to do if there is not even space available
		 * for the sole header.  Zero the buffer and bail out as a last
		 * act of desperation.
		 */
		bzero(iov[0].iov_base, iov[0].iov_len);
		goto bad_frame;
	}

	/* Grab the address of the header before anything else */
	hdr = (struct virtio_net_hdr *)iov[0].iov_base;

	/*
	 * If there is any space remaining in the first buffer after writing
	 * the header, fill it with frame data.
	 */
	if (iov[0].iov_len > hdr_sz) {
		buf = (caddr_t)iov[0].iov_base + hdr_sz;
		len = iov[0].iov_len - hdr_sz;

		copied += viona_copy_mblk(mp, copied, buf, len, &end);
	}

	/* Copy any remaining data into subsequent buffers, if present */
	for (int i = 1; i < n && !end; i++) {
		buf = (caddr_t)iov[i].iov_base;
		len = iov[i].iov_len;

		copied += viona_copy_mblk(mp, copied, buf, len, &end);
	}

	/* Was the expected amount of data copied? */
	if (copied != msz) {
		VIONA_PROBE5(too_short, viona_vring_t *, ring,
		    uint16_t, cookie, mblk_t *, mp, size_t, copied,
		    size_t, msz);
		VIONA_RING_STAT_INCR(ring, too_short);
		goto bad_frame;
	}

	/* Populate (read: zero) the header and account for it in the size */
	bzero(hdr, hdr_sz);
	copied += hdr_sz;

	/* Add chksum bits, if needed */
	if ((features & VIRTIO_NET_F_GUEST_CSUM) != 0) {
		uint32_t cksum_flags;

		if (((features & VIRTIO_NET_F_GUEST_TSO4) != 0) &&
		    ((DB_CKSUMFLAGS(mp) & HW_LSO) != 0)) {
			hdr->vrh_gso_type |= VIRTIO_NET_HDR_GSO_TCPV4;
			hdr->vrh_gso_size = DB_LSOMSS(mp);
		}

		mac_hcksum_get((mblk_t *)mp, NULL, NULL, NULL, NULL,
		    &cksum_flags);
		if ((cksum_flags & HCK_FULLCKSUM_OK) != 0) {
			hdr->vrh_flags |= VIRTIO_NET_HDR_F_DATA_VALID;
		}
	}

	/* Release this chain */
	vmm_drv_page_release_chain(pages);
	vq_pushchain(ring, copied, cookie);
	return (0);

bad_frame:
	VIONA_PROBE3(bad_rx_frame, viona_vring_t *, ring, uint16_t, cookie,
	    mblk_t *, mp);
	VIONA_RING_STAT_INCR(ring, bad_rx_frame);

	vmm_drv_page_release_chain(pages);
	vq_pushchain(ring, MAX(copied, MIN_BUF_SIZE + hdr_sz), cookie);
	return (EINVAL);
}

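/*
 * Receive a frame into the guest using merged RX buffers
 * (VIRTIO_NET_F_MRG_RXBUF): the frame may be spread across several descriptor
 * chains popped from the ring, with the number of chains recorded in the
 * vrh_bufs field of the virtio_net_mrgrxhdr at the start of the first buffer.
 * One used-ring entry (uelem) is accumulated per chain, and the header size
 * is charged to the first entry before they are all pushed back to the guest.
 */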
static int
viona_recv_merged(viona_vring_t *ring, const mblk_t *mp, size_t msz)
{
	struct iovec iov[VTNET_MAXSEGS];
	used_elem_t uelem[VTNET_MAXSEGS];
	vmm_page_t *pages = NULL, *hdr_pages = NULL;
	int n, i = 0, buf_idx = 0, err = 0;
	uint16_t cookie;
	caddr_t buf;
	size_t len, copied = 0, chunk = 0;
	struct virtio_net_mrgrxhdr *hdr = NULL;
	const size_t hdr_sz = sizeof (struct virtio_net_mrgrxhdr);
	boolean_t end = B_FALSE;
	const uint32_t features = ring->vr_link->l_features;

	ASSERT(msz >= MIN_BUF_SIZE);

	n = vq_popchain(ring, iov, VTNET_MAXSEGS, &cookie, &hdr_pages, NULL);
	if (n <= 0) {
		/* Without available buffers, the frame must be dropped. */
		VIONA_PROBE2(no_space, viona_vring_t *, ring, mblk_t *, mp);
		VIONA_RING_STAT_INCR(ring, no_space);
		return (ENOSPC);
	}
	if (iov[0].iov_len < hdr_sz) {
		/*
		 * There is little to do if there is not even space available
		 * for the sole header.  Zero the buffer and bail out as a last
		 * act of desperation.
		 */
		bzero(iov[0].iov_base, iov[0].iov_len);
		uelem[0].id = cookie;
		uelem[0].len = iov[0].iov_len;
		err = EINVAL;
		goto done;
	}

	/* Grab the address of the header and do initial population */
	hdr = (struct virtio_net_mrgrxhdr *)iov[0].iov_base;
	bzero(hdr, hdr_sz);
	hdr->vrh_bufs = 1;

	/*
	 * If there is any space remaining in the first buffer after writing
	 * the header, fill it with frame data.  The size of the header itself
	 * is accounted for later.
	 */
	if (iov[0].iov_len > hdr_sz) {
		buf = iov[0].iov_base + hdr_sz;
		len = iov[0].iov_len - hdr_sz;

		size_t copy_len;
		copy_len = viona_copy_mblk(mp, copied, buf, len, &end);
		chunk += copy_len;
		copied += copy_len;
	}
	i = 1;

	do {
		while (i < n && !end) {
			buf = iov[i].iov_base;
			len = iov[i].iov_len;

			size_t copy_len;
			copy_len = viona_copy_mblk(mp, copied, buf, len, &end);
			chunk += copy_len;
			copied += copy_len;
			i++;
		}

		uelem[buf_idx].id = cookie;
		uelem[buf_idx].len = chunk;

		/*
		 * Try to grab another buffer from the ring if the mblk has not
		 * yet been entirely copied out.
		 */
		if (!end) {
			if (buf_idx == (VTNET_MAXSEGS - 1)) {
				/*
				 * Our arbitrary limit on the number of buffers
				 * to offer for merge has already been reached.
				 */
				err = EOVERFLOW;
				break;
			}
			if (pages != NULL) {
				vmm_drv_page_release_chain(pages);
				pages = NULL;
			}
			n = vq_popchain(ring, iov, VTNET_MAXSEGS, &cookie,
			    &pages, NULL);
			if (n <= 0) {
				/*
				 * Without more immediate space to perform the
				 * copying, there is little choice left but to
				 * drop the packet.
				 */
				err = EMSGSIZE;
				break;
			}
			chunk = 0;
			i = 0;
			buf_idx++;
			/*
			 * Keep the header up-to-date with the number of
			 * buffers, but never reference its value since the
			 * guest could meddle with it.
			 */
			hdr->vrh_bufs++;
		}
	} while (!end && copied < msz);

	/* Account for the header size in the first buffer */
	uelem[0].len += hdr_sz;

	/*
	 * If no other errors were encountered during the copy, was the
	 * expected amount of data transferred?
	 */
	if (err == 0 && copied != msz) {
		VIONA_PROBE5(too_short, viona_vring_t *, ring,
		    uint16_t, cookie, mblk_t *, mp, size_t, copied,
		    size_t, msz);
		VIONA_RING_STAT_INCR(ring, too_short);
		err = EINVAL;
	}

	/* Add chksum bits, if needed */
	if ((features & VIRTIO_NET_F_GUEST_CSUM) != 0) {
		uint32_t cksum_flags;

		if (((features & VIRTIO_NET_F_GUEST_TSO4) != 0) &&
		    ((DB_CKSUMFLAGS(mp) & HW_LSO) != 0)) {
			hdr->vrh_gso_type |= VIRTIO_NET_HDR_GSO_TCPV4;
			hdr->vrh_gso_size = DB_LSOMSS(mp);
		}

		mac_hcksum_get((mblk_t *)mp, NULL, NULL, NULL, NULL,
		    &cksum_flags);
		if ((cksum_flags & HCK_FULLCKSUM_OK) != 0) {
			hdr->vrh_flags |= VIRTIO_NET_HDR_F_DATA_VALID;
		}
	}

done:
	switch (err) {
	case 0:
		/* Success can fall right through to ring delivery */
		break;

	case EMSGSIZE:
		VIONA_PROBE3(rx_merge_underrun, viona_vring_t *, ring,
		    uint16_t, cookie, mblk_t *, mp);
		VIONA_RING_STAT_INCR(ring, rx_merge_underrun);
		break;

	case EOVERFLOW:
		VIONA_PROBE3(rx_merge_overrun, viona_vring_t *, ring,
		    uint16_t, cookie, mblk_t *, mp);
		VIONA_RING_STAT_INCR(ring, rx_merge_overrun);
		break;

	default:
		VIONA_PROBE3(bad_rx_frame, viona_vring_t *, ring,
		    uint16_t, cookie, mblk_t *, mp);
		VIONA_RING_STAT_INCR(ring, bad_rx_frame);
	}

	if (hdr_pages != NULL) {
		vmm_drv_page_release_chain(hdr_pages);
	}
	if (pages != NULL) {
		vmm_drv_page_release_chain(pages);
	}
	vq_pushchain_many(ring, buf_idx + 1, uelem);
	return (err);
}

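/*
 * Deliver a chain of received mblks to the guest.  Each frame is run through
 * the networking hooks (if any), checked against MTU/LSO constraints (falling
 * back to software segmentation where necessary), padded up to the minimum
 * Ethernet frame length, and then copied into the ring via the merged or
 * plain receive path.  Accepted and dropped frames are chained separately and
 * freed after the guest has been notified with a single ring interrupt.
 */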
static void
viona_rx_common(viona_vring_t *ring, mblk_t *mp, boolean_t is_loopback)
{
	viona_link_t *link = ring->vr_link;
	mblk_t *mprx = NULL, **mprx_prevp = &mprx;
	mblk_t *mpdrop = NULL, **mpdrop_prevp = &mpdrop;
	const boolean_t do_merge =
	    (link->l_features & VIRTIO_NET_F_MRG_RXBUF) != 0;
	const boolean_t allow_gro =
	    (link->l_features & VIRTIO_NET_F_GUEST_TSO4) != 0;

	size_t nrx = 0, ndrop = 0;

	while (mp != NULL) {
		mblk_t *next = mp->b_next;
		mblk_t *pad = NULL;
		size_t size = msgsize(mp);
		int err = 0;

		mp->b_next = NULL;

		/*
		 * We treat both a 'drop' response and errors the same here
		 * and put the packet on the drop chain.  As packets may be
		 * subject to different actions in ipf (which do not all
		 * return the same set of error values), an error processing
		 * one packet doesn't mean the next packet will also generate
		 * an error.
		 */
		if (VNETHOOK_INTERESTED_IN(link->l_neti) &&
		    viona_hook(link, ring, &mp, B_FALSE) != 0) {
			if (mp != NULL) {
				*mpdrop_prevp = mp;
				mpdrop_prevp = &mp->b_next;
			} else {
				/*
				 * If the hook consumer (e.g. ipf) already
				 * freed the mblk_t, update the drop count now.
				 */
				ndrop++;
			}
			mp = next;
			continue;
		}

		/*
		 * Virtio devices are prohibited from passing on packets larger
		 * than the MTU plus Ethernet header if the guest has not
		 * negotiated GRO flags (e.g., GUEST_TSO*).  This applies
		 * irrespective of `do_merge`.
		 */
		if (size > sizeof (struct ether_header) + link->l_mtu) {
			const boolean_t can_emu_lso = DB_LSOMSS(mp) != 0;
			const boolean_t attempt_emu =
			    !allow_gro || size > VIONA_GRO_MAX_PACKET_SIZE;

			if ((DB_CKSUMFLAGS(mp) & HW_LSO) == 0 ||
			    (attempt_emu && !can_emu_lso)) {
				VIONA_PROBE3(rx_drop_over_mtu, viona_vring_t *,
				    ring, mblk_t *, mp, size_t, size);
				VIONA_RING_STAT_INCR(ring, rx_drop_over_mtu);
				err = E2BIG;
				goto pad_drop;
			}

			/*
			 * If the packet has come from another device or viona
			 * which expected to make use of LSO, we can split the
			 * packet on its behalf.
			 */
			if (attempt_emu) {
				mblk_t *tail = NULL;
				uint_t n_pkts = 0;

				DB_CKSUMFLAGS(mp) |= HCK_IPV4_HDRCKSUM |
				    HCK_FULLCKSUM;
				mac_hw_emul(&mp, &tail, &n_pkts, MAC_ALL_EMULS);
				if (mp == NULL) {
					VIONA_RING_STAT_INCR(ring,
					    rx_gro_fallback_fail);
					viona_ring_stat_error(ring);
					mp = next;
					continue;
				}
				VIONA_PROBE4(rx_gro_fallback, viona_vring_t *,
				    ring, mblk_t *, mp, size_t, size,
				    uint_t, n_pkts);
				VIONA_RING_STAT_INCR(ring, rx_gro_fallback);
				ASSERT3P(tail, !=, NULL);
				if (tail != mp) {
					tail->b_next = next;
					next = mp->b_next;
					mp->b_next = NULL;
				}
				size = msgsize(mp);
			}
		}

		/*
		 * Ethernet frames are expected to be padded out in order to
		 * meet the minimum size.
		 *
		 * A special case is made for frames which are short by
		 * VLAN_TAGSZ, having been stripped of their VLAN tag while
		 * traversing MAC.  A preallocated (and recycled) mblk is used
		 * for that specific condition.
		 *
		 * All other frames that fall short on length will have custom
		 * zero-padding allocated and appended to them.
		 */
		if (size == NEED_VLAN_PAD_SIZE) {
			ASSERT(MBLKL(viona_vlan_pad_mp) == VLAN_TAGSZ);
			ASSERT(viona_vlan_pad_mp->b_cont == NULL);

			for (pad = mp; pad->b_cont != NULL; pad = pad->b_cont)
				;

			pad->b_cont = viona_vlan_pad_mp;
			size += VLAN_TAGSZ;
		} else if (size < MIN_BUF_SIZE) {
			const size_t pad_size = MIN_BUF_SIZE - size;
			mblk_t *zero_mp;

			zero_mp = allocb(pad_size, BPRI_MED);
			if (zero_mp == NULL) {
				err = ENOMEM;
				goto pad_drop;
			}

			VIONA_PROBE3(rx_pad_short, viona_vring_t *, ring,
			    mblk_t *, mp, size_t, pad_size);
			VIONA_RING_STAT_INCR(ring, rx_pad_short);
			zero_mp->b_wptr += pad_size;
			bzero(zero_mp->b_rptr, pad_size);
			linkb(mp, zero_mp);
			size += pad_size;
		}

		if (do_merge) {
			err = viona_recv_merged(ring, mp, size);
		} else {
			err = viona_recv_plain(ring, mp, size);
		}

		/*
		 * The VLAN padding mblk is meant for continual reuse, so
		 * remove it from the chain to prevent it from being freed.
		 *
		 * Custom allocated padding does not require this treatment and
		 * is freed normally.
		 */
		if (pad != NULL) {
			pad->b_cont = NULL;
		}

pad_drop:
		/*
		 * While rx processing (viona_recv_{merged,plain}) does not
		 * free mp on error, hook processing might or might not free
		 * mp.  Handle either scenario: if mp has not been freed, it
		 * is queued up and freed after the guest has been notified;
		 * if mp is already NULL, just proceed on.
		 */
		if (err != 0) {
			*mpdrop_prevp = mp;
			mpdrop_prevp = &mp->b_next;

			/*
			 * If the available ring is empty, do not bother
			 * attempting to deliver any more frames.  Count the
			 * rest as dropped too.
			 */
			if (err == ENOSPC) {
				mp->b_next = next;
				break;
			} else {
				/*
				 * Cases other than the ring being empty of
				 * available descriptors count as errors for the
				 * ring/link stats.
				 */
				viona_ring_stat_error(ring);
			}
		} else {
			/* Chain successful mblks to be freed later */
			*mprx_prevp = mp;
			mprx_prevp = &mp->b_next;
			nrx++;
			viona_ring_stat_accept(ring, size);
		}
		mp = next;
	}

	membar_enter();
	viona_intr_ring(ring, B_FALSE);

	/* Free successfully received frames */
	if (mprx != NULL) {
		freemsgchain(mprx);
	}

	/* Free dropped frames, also tallying them */
	mp = mpdrop;
	while (mp != NULL) {
		mblk_t *next = mp->b_next;

		mp->b_next = NULL;
		freemsg(mp);
		mp = next;
		ndrop++;
		viona_ring_stat_drop(ring);
	}
	VIONA_PROBE3(rx, viona_link_t *, link, size_t, nrx, size_t, ndrop);
}

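/*
 * MAC callback for fully-classified traffic destined for this client.  It is
 * also installed as the promiscuous callback when VIONA_PROMISC_ALL is in
 * effect (see viona_rx_set() below).
 */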
static void
viona_rx_classified(void *arg, mac_resource_handle_t mrh, mblk_t *mp,
    boolean_t is_loopback)
{
	viona_vring_t *ring = (viona_vring_t *)arg;

	/* Drop traffic if ring is inactive or renewing its lease */
	if (ring->vr_state != VRS_RUN ||
	    (ring->vr_state_flags & VRSF_RENEW) != 0) {
		freemsgchain(mp);
		return;
	}

	viona_rx_common(ring, mp, is_loopback);
}

static void
viona_rx_mcast(void *arg, mac_resource_handle_t mrh, mblk_t *mp,
    boolean_t is_loopback)
{
	viona_vring_t *ring = (viona_vring_t *)arg;
	mac_handle_t mh = ring->vr_link->l_mh;
	mblk_t *mp_mcast_only = NULL;
	mblk_t **mpp = &mp_mcast_only;

	/* Drop traffic if ring is inactive or renewing its lease */
	if (ring->vr_state != VRS_RUN ||
	    (ring->vr_state_flags & VRSF_RENEW) != 0) {
		freemsgchain(mp);
		return;
	}

	/*
	 * In addition to multicast traffic, broadcast packets will also arrive
	 * via the MAC_CLIENT_PROMISC_MULTI handler. The mac_rx_set() callback
	 * for fully-classified traffic has already delivered that broadcast
	 * traffic, so it should be suppressed here, rather than duplicating it
	 * to the guest.
	 */
	while (mp != NULL) {
		mblk_t *mp_next;
		mac_header_info_t mhi;
		int err;

		mp_next = mp->b_next;
		mp->b_next = NULL;

		/* Determine the packet type */
		err = mac_vlan_header_info(mh, mp, &mhi);
		if (err != 0) {
			mblk_t *pull;

			/*
			 * It is possible that gathering of the header
			 * information was impeded by a leading mblk_t which
			 * was of inadequate length to reference the needed
			 * fields.  Try again, in case that could be solved
			 * with a pull-up.
			 */
			pull = msgpullup(mp, sizeof (struct ether_vlan_header));
			if (pull == NULL) {
				err = ENOMEM;
			} else {
				err = mac_vlan_header_info(mh, pull, &mhi);
				freemsg(pull);
			}

			if (err != 0) {
				VIONA_RING_STAT_INCR(ring, rx_mcast_check);
			}
		}

		/* Chain up matching packets while discarding others */
		if (err == 0 && mhi.mhi_dsttype == MAC_ADDRTYPE_MULTICAST) {
			*mpp = mp;
			mpp = &mp->b_next;
		} else {
			freemsg(mp);
		}

		mp = mp_next;
	}

	if (mp_mcast_only != NULL) {
		viona_rx_common(ring, mp_mcast_only, is_loopback);
	}
}

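/*
 * Configure how inbound traffic reaches the guest based on the requested
 * promiscuity mode: classified-only delivery (the default), classified
 * delivery plus a multicast promiscuous callback, or a fully promiscuous
 * callback.  Any previously installed promiscuous handler is removed first.
 */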
int
viona_rx_set(viona_link_t *link, viona_promisc_t mode)
{
	viona_vring_t *ring = &link->l_vrings[VIONA_VQ_RX];
	int err = 0;

	if (link->l_mph != NULL) {
		mac_promisc_remove(link->l_mph);
		link->l_mph = NULL;
	}

	switch (mode) {
	case VIONA_PROMISC_MULTI:
		mac_rx_set(link->l_mch, viona_rx_classified, ring);
		err = mac_promisc_add(link->l_mch, MAC_CLIENT_PROMISC_MULTI,
		    viona_rx_mcast, ring, &link->l_mph,
		    MAC_PROMISC_FLAGS_NO_TX_LOOP |
		    MAC_PROMISC_FLAGS_VLAN_TAG_STRIP);
		break;
	case VIONA_PROMISC_ALL:
		mac_rx_clear(link->l_mch);
		err = mac_promisc_add(link->l_mch, MAC_CLIENT_PROMISC_ALL,
		    viona_rx_classified, ring, &link->l_mph,
		    MAC_PROMISC_FLAGS_NO_TX_LOOP |
		    MAC_PROMISC_FLAGS_VLAN_TAG_STRIP);
		/*
		 * In case adding the promisc handler failed, restore the
		 * generic classified callback so that packets continue to
		 * flow to the guest.
		 */
		if (err != 0) {
			mac_rx_set(link->l_mch, viona_rx_classified, ring);
		}
		break;
	case VIONA_PROMISC_NONE:
	default:
		mac_rx_set(link->l_mch, viona_rx_classified, ring);
		break;
	}

	return (err);
}

void
viona_rx_clear(viona_link_t *link)
{
	if (link->l_mph != NULL) {
		mac_promisc_remove(link->l_mph);
		link->l_mph = NULL;
	}
	mac_rx_clear(link->l_mch);
}
837