xref: /freebsd/sys/dev/virtio/network/if_vtnet.c (revision 608da65de9552d5678c1000776ed69da04a45983)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause
3  *
4  * Copyright (c) 2011, Bryan Venteicher <bryanv@FreeBSD.org>
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice unmodified, this list of conditions, and the following
12  *    disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
18  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
19  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
20  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
21  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
22  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
26  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27  */
28 
29 /* Driver for VirtIO network devices. */
30 
31 #include <sys/cdefs.h>
32 #include <sys/param.h>
33 #include <sys/eventhandler.h>
34 #include <sys/systm.h>
35 #include <sys/kernel.h>
36 #include <sys/sockio.h>
37 #include <sys/malloc.h>
38 #include <sys/mbuf.h>
39 #include <sys/module.h>
40 #include <sys/msan.h>
41 #include <sys/socket.h>
42 #include <sys/sysctl.h>
43 #include <sys/random.h>
44 #include <sys/sglist.h>
45 #include <sys/lock.h>
46 #include <sys/mutex.h>
47 #include <sys/taskqueue.h>
48 #include <sys/smp.h>
49 #include <machine/smp.h>
50 
51 #include <vm/uma.h>
52 
53 #include <net/debugnet.h>
54 #include <net/ethernet.h>
55 #include <net/pfil.h>
56 #include <net/if.h>
57 #include <net/if_var.h>
58 #include <net/if_arp.h>
59 #include <net/if_dl.h>
60 #include <net/if_types.h>
61 #include <net/if_media.h>
62 #include <net/if_vlan_var.h>
63 
64 #include <net/bpf.h>
65 
66 #include <netinet/in_systm.h>
67 #include <netinet/in.h>
68 #include <netinet/ip.h>
69 #include <netinet/ip6.h>
70 #include <netinet6/ip6_var.h>
71 #include <netinet/udp.h>
72 #include <netinet/tcp.h>
73 #include <netinet/tcp_lro.h>
74 
75 #include <machine/bus.h>
76 #include <machine/resource.h>
77 #include <sys/bus.h>
78 #include <sys/rman.h>
79 
80 #include <dev/virtio/virtio.h>
81 #include <dev/virtio/virtqueue.h>
82 #include <dev/virtio/network/virtio_net.h>
83 #include <dev/virtio/network/if_vtnetvar.h>
84 #include "virtio_if.h"
85 
86 #include "opt_inet.h"
87 #include "opt_inet6.h"
88 
89 #if defined(INET) || defined(INET6)
90 #include <machine/in_cksum.h>
91 #endif
92 
93 static int	vtnet_modevent(module_t, int, void *);
94 
95 static int	vtnet_probe(device_t);
96 static int	vtnet_attach(device_t);
97 static int	vtnet_detach(device_t);
98 static int	vtnet_suspend(device_t);
99 static int	vtnet_resume(device_t);
100 static int	vtnet_shutdown(device_t);
101 static int	vtnet_attach_completed(device_t);
102 static int	vtnet_config_change(device_t);
103 
104 static int	vtnet_negotiate_features(struct vtnet_softc *);
105 static int	vtnet_setup_features(struct vtnet_softc *);
106 static int	vtnet_init_rxq(struct vtnet_softc *, int);
107 static int	vtnet_init_txq(struct vtnet_softc *, int);
108 static int	vtnet_alloc_rxtx_queues(struct vtnet_softc *);
109 static void	vtnet_free_rxtx_queues(struct vtnet_softc *);
110 static int	vtnet_alloc_rx_filters(struct vtnet_softc *);
111 static void	vtnet_free_rx_filters(struct vtnet_softc *);
112 static int	vtnet_alloc_virtqueues(struct vtnet_softc *);
113 static int	vtnet_alloc_interface(struct vtnet_softc *);
114 static int	vtnet_setup_interface(struct vtnet_softc *);
115 static int	vtnet_ioctl_mtu(struct vtnet_softc *, u_int);
116 static int	vtnet_ioctl_ifflags(struct vtnet_softc *);
117 static int	vtnet_ioctl_multi(struct vtnet_softc *);
118 static int	vtnet_ioctl_ifcap(struct vtnet_softc *, struct ifreq *);
119 static int	vtnet_ioctl(if_t, u_long, caddr_t);
120 static uint64_t	vtnet_get_counter(if_t, ift_counter);
121 
122 static int	vtnet_rxq_populate(struct vtnet_rxq *);
123 static void	vtnet_rxq_free_mbufs(struct vtnet_rxq *);
124 static struct mbuf *
125 		vtnet_rx_alloc_buf(struct vtnet_softc *, int , struct mbuf **);
126 static int	vtnet_rxq_replace_lro_nomrg_buf(struct vtnet_rxq *,
127 		    struct mbuf *, int);
128 static int	vtnet_rxq_replace_buf(struct vtnet_rxq *, struct mbuf *, int);
129 static int	vtnet_rxq_enqueue_buf(struct vtnet_rxq *, struct mbuf *);
130 static int	vtnet_rxq_new_buf(struct vtnet_rxq *);
131 static int	vtnet_rxq_csum_needs_csum(struct vtnet_rxq *, struct mbuf *,
132 		     uint16_t, int, struct virtio_net_hdr *);
133 static int	vtnet_rxq_csum_data_valid(struct vtnet_rxq *, struct mbuf *,
134 		     uint16_t, int, struct virtio_net_hdr *);
135 static int	vtnet_rxq_csum(struct vtnet_rxq *, struct mbuf *,
136 		     struct virtio_net_hdr *);
137 static void	vtnet_rxq_discard_merged_bufs(struct vtnet_rxq *, int);
138 static void	vtnet_rxq_discard_buf(struct vtnet_rxq *, struct mbuf *);
139 static int	vtnet_rxq_merged_eof(struct vtnet_rxq *, struct mbuf *, int);
140 static void	vtnet_rxq_input(struct vtnet_rxq *, struct mbuf *,
141 		    struct virtio_net_hdr *);
142 static int	vtnet_rxq_eof(struct vtnet_rxq *);
143 static void	vtnet_rx_vq_process(struct vtnet_rxq *rxq, int tries);
144 static void	vtnet_rx_vq_intr(void *);
145 static void	vtnet_rxq_tq_intr(void *, int);
146 
147 static int	vtnet_txq_intr_threshold(struct vtnet_txq *);
148 static int	vtnet_txq_below_threshold(struct vtnet_txq *);
149 static int	vtnet_txq_notify(struct vtnet_txq *);
150 static void	vtnet_txq_free_mbufs(struct vtnet_txq *);
151 static int	vtnet_txq_offload_ctx(struct vtnet_txq *, struct mbuf *,
152 		    int *, int *, int *);
153 static int	vtnet_txq_offload_tso(struct vtnet_txq *, struct mbuf *, int,
154 		    int, struct virtio_net_hdr *);
155 static struct mbuf *
156 		vtnet_txq_offload(struct vtnet_txq *, struct mbuf *,
157 		    struct virtio_net_hdr *);
158 static int	vtnet_txq_enqueue_buf(struct vtnet_txq *, struct mbuf **,
159 		    struct vtnet_tx_header *);
160 static int	vtnet_txq_encap(struct vtnet_txq *, struct mbuf **, int);
161 #ifdef VTNET_LEGACY_TX
162 static void	vtnet_start_locked(struct vtnet_txq *, if_t);
163 static void	vtnet_start(if_t);
164 #else
165 static int	vtnet_txq_mq_start_locked(struct vtnet_txq *, struct mbuf *);
166 static int	vtnet_txq_mq_start(if_t, struct mbuf *);
167 static void	vtnet_txq_tq_deferred(void *, int);
168 #endif
169 static void	vtnet_txq_start(struct vtnet_txq *);
170 static void	vtnet_txq_tq_intr(void *, int);
171 static int	vtnet_txq_eof(struct vtnet_txq *);
172 static void	vtnet_tx_vq_intr(void *);
173 static void	vtnet_tx_start_all(struct vtnet_softc *);
174 
175 #ifndef VTNET_LEGACY_TX
176 static void	vtnet_qflush(if_t);
177 #endif
178 
179 static int	vtnet_watchdog(struct vtnet_txq *);
180 static void	vtnet_accum_stats(struct vtnet_softc *,
181 		    struct vtnet_rxq_stats *, struct vtnet_txq_stats *);
182 static void	vtnet_tick(void *);
183 
184 static void	vtnet_start_taskqueues(struct vtnet_softc *);
185 static void	vtnet_free_taskqueues(struct vtnet_softc *);
186 static void	vtnet_drain_taskqueues(struct vtnet_softc *);
187 
188 static void	vtnet_drain_rxtx_queues(struct vtnet_softc *);
189 static void	vtnet_stop_rendezvous(struct vtnet_softc *);
190 static void	vtnet_stop(struct vtnet_softc *);
191 static int	vtnet_virtio_reinit(struct vtnet_softc *);
192 static void	vtnet_init_rx_filters(struct vtnet_softc *);
193 static int	vtnet_init_rx_queues(struct vtnet_softc *);
194 static int	vtnet_init_tx_queues(struct vtnet_softc *);
195 static int	vtnet_init_rxtx_queues(struct vtnet_softc *);
196 static void	vtnet_set_active_vq_pairs(struct vtnet_softc *);
197 static void	vtnet_update_rx_offloads(struct vtnet_softc *);
198 static int	vtnet_reinit(struct vtnet_softc *);
199 static void	vtnet_init_locked(struct vtnet_softc *, int);
200 static void	vtnet_init(void *);
201 
202 static void	vtnet_free_ctrl_vq(struct vtnet_softc *);
203 static void	vtnet_exec_ctrl_cmd(struct vtnet_softc *, void *,
204 		    struct sglist *, int, int);
205 static int	vtnet_ctrl_mac_cmd(struct vtnet_softc *, uint8_t *);
206 static int	vtnet_ctrl_guest_offloads(struct vtnet_softc *, uint64_t);
207 static int	vtnet_ctrl_mq_cmd(struct vtnet_softc *, uint16_t);
208 static int	vtnet_ctrl_rx_cmd(struct vtnet_softc *, uint8_t, bool);
209 static int	vtnet_set_promisc(struct vtnet_softc *, bool);
210 static int	vtnet_set_allmulti(struct vtnet_softc *, bool);
211 static void	vtnet_rx_filter(struct vtnet_softc *);
212 static void	vtnet_rx_filter_mac(struct vtnet_softc *);
213 static int	vtnet_exec_vlan_filter(struct vtnet_softc *, int, uint16_t);
214 static void	vtnet_rx_filter_vlan(struct vtnet_softc *);
215 static void	vtnet_update_vlan_filter(struct vtnet_softc *, int, uint16_t);
216 static void	vtnet_register_vlan(void *, if_t, uint16_t);
217 static void	vtnet_unregister_vlan(void *, if_t, uint16_t);
218 
219 static void	vtnet_update_speed_duplex(struct vtnet_softc *);
220 static int	vtnet_is_link_up(struct vtnet_softc *);
221 static void	vtnet_update_link_status(struct vtnet_softc *);
222 static int	vtnet_ifmedia_upd(if_t);
223 static void	vtnet_ifmedia_sts(if_t, struct ifmediareq *);
224 static void	vtnet_get_macaddr(struct vtnet_softc *);
225 static void	vtnet_set_macaddr(struct vtnet_softc *);
226 static void	vtnet_attached_set_macaddr(struct vtnet_softc *);
227 static void	vtnet_vlan_tag_remove(struct mbuf *);
228 static void	vtnet_set_rx_process_limit(struct vtnet_softc *);
229 
230 static void	vtnet_setup_rxq_sysctl(struct sysctl_ctx_list *,
231 		    struct sysctl_oid_list *, struct vtnet_rxq *);
232 static void	vtnet_setup_txq_sysctl(struct sysctl_ctx_list *,
233 		    struct sysctl_oid_list *, struct vtnet_txq *);
234 static void	vtnet_setup_queue_sysctl(struct vtnet_softc *);
235 static void	vtnet_load_tunables(struct vtnet_softc *);
236 static void	vtnet_setup_sysctl(struct vtnet_softc *);
237 
238 static int	vtnet_rxq_enable_intr(struct vtnet_rxq *);
239 static void	vtnet_rxq_disable_intr(struct vtnet_rxq *);
240 static int	vtnet_txq_enable_intr(struct vtnet_txq *);
241 static void	vtnet_txq_disable_intr(struct vtnet_txq *);
242 static void	vtnet_enable_rx_interrupts(struct vtnet_softc *);
243 static void	vtnet_enable_tx_interrupts(struct vtnet_softc *);
244 static void	vtnet_enable_interrupts(struct vtnet_softc *);
245 static void	vtnet_disable_rx_interrupts(struct vtnet_softc *);
246 static void	vtnet_disable_tx_interrupts(struct vtnet_softc *);
247 static void	vtnet_disable_interrupts(struct vtnet_softc *);
248 
249 static int	vtnet_tunable_int(struct vtnet_softc *, const char *, int);
250 
251 DEBUGNET_DEFINE(vtnet);
252 
/*
 * Per-softc wrappers around the virtio_htog{16,32,64}() and
 * virtio_gtoh{16,32,64}() conversion routines.  Each one supplies
 * vtnet_modern(_sc) as the first argument so callers only pass the softc
 * and the value to convert.
 */
#define vtnet_htog16(_sc, _val)	virtio_htog16(vtnet_modern(_sc), (_val))
#define vtnet_htog32(_sc, _val)	virtio_htog32(vtnet_modern(_sc), (_val))
#define vtnet_htog64(_sc, _val)	virtio_htog64(vtnet_modern(_sc), (_val))
#define vtnet_gtoh16(_sc, _val)	virtio_gtoh16(vtnet_modern(_sc), (_val))
#define vtnet_gtoh32(_sc, _val)	virtio_gtoh32(vtnet_modern(_sc), (_val))
#define vtnet_gtoh64(_sc, _val)	virtio_gtoh64(vtnet_modern(_sc), (_val))
259 
260 /* Tunables. */
261 static SYSCTL_NODE(_hw, OID_AUTO, vtnet, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
262     "VirtIO Net driver parameters");
263 
264 static int vtnet_csum_disable = 0;
265 SYSCTL_INT(_hw_vtnet, OID_AUTO, csum_disable, CTLFLAG_RDTUN,
266     &vtnet_csum_disable, 0, "Disables receive and send checksum offload");
267 
268 static int vtnet_fixup_needs_csum = 0;
269 SYSCTL_INT(_hw_vtnet, OID_AUTO, fixup_needs_csum, CTLFLAG_RDTUN,
270     &vtnet_fixup_needs_csum, 0,
271     "Calculate valid checksum for NEEDS_CSUM packets");
272 
273 static int vtnet_tso_disable = 0;
274 SYSCTL_INT(_hw_vtnet, OID_AUTO, tso_disable, CTLFLAG_RDTUN,
275     &vtnet_tso_disable, 0, "Disables TSO");
276 
277 static int vtnet_lro_disable = 0;
278 SYSCTL_INT(_hw_vtnet, OID_AUTO, lro_disable, CTLFLAG_RDTUN,
279     &vtnet_lro_disable, 0, "Disables hardware LRO");
280 
281 static int vtnet_mq_disable = 0;
282 SYSCTL_INT(_hw_vtnet, OID_AUTO, mq_disable, CTLFLAG_RDTUN,
283     &vtnet_mq_disable, 0, "Disables multiqueue support");
284 
285 static int vtnet_mq_max_pairs = VTNET_MAX_QUEUE_PAIRS;
286 SYSCTL_INT(_hw_vtnet, OID_AUTO, mq_max_pairs, CTLFLAG_RDTUN,
287     &vtnet_mq_max_pairs, 0, "Maximum number of multiqueue pairs");
288 
289 static int vtnet_tso_maxlen = IP_MAXPACKET;
290 SYSCTL_INT(_hw_vtnet, OID_AUTO, tso_maxlen, CTLFLAG_RDTUN,
291     &vtnet_tso_maxlen, 0, "TSO burst limit");
292 
293 static int vtnet_rx_process_limit = 1024;
294 SYSCTL_INT(_hw_vtnet, OID_AUTO, rx_process_limit, CTLFLAG_RDTUN,
295     &vtnet_rx_process_limit, 0,
296     "Number of RX segments processed in one pass");
297 
298 static int vtnet_lro_entry_count = 128;
299 SYSCTL_INT(_hw_vtnet, OID_AUTO, lro_entry_count, CTLFLAG_RDTUN,
300     &vtnet_lro_entry_count, 0, "Software LRO entry count");
301 
302 /* Enable sorted LRO, and the depth of the mbuf queue. */
303 static int vtnet_lro_mbufq_depth = 0;
304 SYSCTL_UINT(_hw_vtnet, OID_AUTO, lro_mbufq_depth, CTLFLAG_RDTUN,
305     &vtnet_lro_mbufq_depth, 0, "Depth of software LRO mbuf queue");
306 
307 static uma_zone_t vtnet_tx_header_zone;
308 
309 static struct virtio_feature_desc vtnet_feature_desc[] = {
310 	{ VIRTIO_NET_F_CSUM,			"TxChecksum"		},
311 	{ VIRTIO_NET_F_GUEST_CSUM,		"RxChecksum"		},
312 	{ VIRTIO_NET_F_CTRL_GUEST_OFFLOADS,	"CtrlRxOffloads"	},
313 	{ VIRTIO_NET_F_MAC,			"MAC"			},
314 	{ VIRTIO_NET_F_GSO,			"TxGSO"			},
315 	{ VIRTIO_NET_F_GUEST_TSO4,		"RxLROv4"		},
316 	{ VIRTIO_NET_F_GUEST_TSO6,		"RxLROv6"		},
317 	{ VIRTIO_NET_F_GUEST_ECN,		"RxLROECN"		},
318 	{ VIRTIO_NET_F_GUEST_UFO,		"RxUFO"			},
319 	{ VIRTIO_NET_F_HOST_TSO4,		"TxTSOv4"		},
320 	{ VIRTIO_NET_F_HOST_TSO6,		"TxTSOv6"		},
321 	{ VIRTIO_NET_F_HOST_ECN,		"TxTSOECN"		},
322 	{ VIRTIO_NET_F_HOST_UFO,		"TxUFO"			},
323 	{ VIRTIO_NET_F_MRG_RXBUF,		"MrgRxBuf"		},
324 	{ VIRTIO_NET_F_STATUS,			"Status"		},
325 	{ VIRTIO_NET_F_CTRL_VQ,			"CtrlVq"		},
326 	{ VIRTIO_NET_F_CTRL_RX,			"CtrlRxMode"		},
327 	{ VIRTIO_NET_F_CTRL_VLAN,		"CtrlVLANFilter"	},
328 	{ VIRTIO_NET_F_CTRL_RX_EXTRA,		"CtrlRxModeExtra"	},
329 	{ VIRTIO_NET_F_GUEST_ANNOUNCE,		"GuestAnnounce"		},
330 	{ VIRTIO_NET_F_MQ,			"Multiqueue"		},
331 	{ VIRTIO_NET_F_CTRL_MAC_ADDR,		"CtrlMacAddr"		},
332 	{ VIRTIO_NET_F_SPEED_DUPLEX,		"SpeedDuplex"		},
333 
334 	{ 0, NULL }
335 };
336 
337 static device_method_t vtnet_methods[] = {
338 	/* Device methods. */
339 	DEVMETHOD(device_probe,			vtnet_probe),
340 	DEVMETHOD(device_attach,		vtnet_attach),
341 	DEVMETHOD(device_detach,		vtnet_detach),
342 	DEVMETHOD(device_suspend,		vtnet_suspend),
343 	DEVMETHOD(device_resume,		vtnet_resume),
344 	DEVMETHOD(device_shutdown,		vtnet_shutdown),
345 
346 	/* VirtIO methods. */
347 	DEVMETHOD(virtio_attach_completed,	vtnet_attach_completed),
348 	DEVMETHOD(virtio_config_change,		vtnet_config_change),
349 
350 	DEVMETHOD_END
351 };
352 
353 #ifdef DEV_NETMAP
354 #include <dev/netmap/if_vtnet_netmap.h>
355 #endif
356 
357 static driver_t vtnet_driver = {
358     .name = "vtnet",
359     .methods = vtnet_methods,
360     .size = sizeof(struct vtnet_softc)
361 };
362 VIRTIO_DRIVER_MODULE(vtnet, vtnet_driver, vtnet_modevent, NULL);
363 MODULE_VERSION(vtnet, 1);
364 MODULE_DEPEND(vtnet, virtio, 1, 1, 1);
365 #ifdef DEV_NETMAP
366 MODULE_DEPEND(vtnet, netmap, 1, 1, 1);
367 #endif
368 
369 VIRTIO_SIMPLE_PNPINFO(vtnet, VIRTIO_ID_NETWORK, "VirtIO Networking Adapter");
370 
371 static int
372 vtnet_modevent(module_t mod __unused, int type, void *unused __unused)
373 {
374 	int error = 0;
375 	static int loaded = 0;
376 
377 	switch (type) {
378 	case MOD_LOAD:
379 		if (loaded++ == 0) {
380 			vtnet_tx_header_zone = uma_zcreate("vtnet_tx_hdr",
381 				sizeof(struct vtnet_tx_header),
382 				NULL, NULL, NULL, NULL, 0, 0);
383 #ifdef DEBUGNET
384 			/*
385 			 * We need to allocate from this zone in the transmit path, so ensure
386 			 * that we have at least one item per header available.
387 			 * XXX add a separate zone like we do for mbufs? otherwise we may alloc
388 			 * buckets
389 			 */
390 			uma_zone_reserve(vtnet_tx_header_zone, DEBUGNET_MAX_IN_FLIGHT * 2);
391 			uma_prealloc(vtnet_tx_header_zone, DEBUGNET_MAX_IN_FLIGHT * 2);
392 #endif
393 		}
394 		break;
395 	case MOD_QUIESCE:
396 		if (uma_zone_get_cur(vtnet_tx_header_zone) > 0)
397 			error = EBUSY;
398 		break;
399 	case MOD_UNLOAD:
400 		if (--loaded == 0) {
401 			uma_zdestroy(vtnet_tx_header_zone);
402 			vtnet_tx_header_zone = NULL;
403 		}
404 		break;
405 	case MOD_SHUTDOWN:
406 		break;
407 	default:
408 		error = EOPNOTSUPP;
409 		break;
410 	}
411 
412 	return (error);
413 }
414 
415 static int
416 vtnet_probe(device_t dev)
417 {
418 	return (VIRTIO_SIMPLE_PROBE(dev, vtnet));
419 }
420 
421 static int
422 vtnet_attach(device_t dev)
423 {
424 	struct vtnet_softc *sc;
425 	int error;
426 
427 	sc = device_get_softc(dev);
428 	sc->vtnet_dev = dev;
429 	virtio_set_feature_desc(dev, vtnet_feature_desc);
430 
431 	VTNET_CORE_LOCK_INIT(sc);
432 	callout_init_mtx(&sc->vtnet_tick_ch, VTNET_CORE_MTX(sc), 0);
433 	vtnet_load_tunables(sc);
434 
435 	error = vtnet_alloc_interface(sc);
436 	if (error) {
437 		device_printf(dev, "cannot allocate interface\n");
438 		goto fail;
439 	}
440 
441 	vtnet_setup_sysctl(sc);
442 
443 	error = vtnet_setup_features(sc);
444 	if (error) {
445 		device_printf(dev, "cannot setup features\n");
446 		goto fail;
447 	}
448 
449 	error = vtnet_alloc_rx_filters(sc);
450 	if (error) {
451 		device_printf(dev, "cannot allocate Rx filters\n");
452 		goto fail;
453 	}
454 
455 	error = vtnet_alloc_rxtx_queues(sc);
456 	if (error) {
457 		device_printf(dev, "cannot allocate queues\n");
458 		goto fail;
459 	}
460 
461 	error = vtnet_alloc_virtqueues(sc);
462 	if (error) {
463 		device_printf(dev, "cannot allocate virtqueues\n");
464 		goto fail;
465 	}
466 
467 	error = vtnet_setup_interface(sc);
468 	if (error) {
469 		device_printf(dev, "cannot setup interface\n");
470 		goto fail;
471 	}
472 
473 	error = virtio_setup_intr(dev, INTR_TYPE_NET);
474 	if (error) {
475 		device_printf(dev, "cannot setup interrupts\n");
476 		ether_ifdetach(sc->vtnet_ifp);
477 		goto fail;
478 	}
479 
480 #ifdef DEV_NETMAP
481 	vtnet_netmap_attach(sc);
482 #endif
483 	vtnet_start_taskqueues(sc);
484 
485 fail:
486 	if (error)
487 		vtnet_detach(dev);
488 
489 	return (error);
490 }
491 
492 static int
493 vtnet_detach(device_t dev)
494 {
495 	struct vtnet_softc *sc;
496 	if_t ifp;
497 
498 	sc = device_get_softc(dev);
499 	ifp = sc->vtnet_ifp;
500 
501 	if (device_is_attached(dev)) {
502 		VTNET_CORE_LOCK(sc);
503 		vtnet_stop(sc);
504 		VTNET_CORE_UNLOCK(sc);
505 
506 		callout_drain(&sc->vtnet_tick_ch);
507 		vtnet_drain_taskqueues(sc);
508 
509 		ether_ifdetach(ifp);
510 	}
511 
512 #ifdef DEV_NETMAP
513 	netmap_detach(ifp);
514 #endif
515 
516 	if (sc->vtnet_pfil != NULL) {
517 		pfil_head_unregister(sc->vtnet_pfil);
518 		sc->vtnet_pfil = NULL;
519 	}
520 
521 	vtnet_free_taskqueues(sc);
522 
523 	if (sc->vtnet_vlan_attach != NULL) {
524 		EVENTHANDLER_DEREGISTER(vlan_config, sc->vtnet_vlan_attach);
525 		sc->vtnet_vlan_attach = NULL;
526 	}
527 	if (sc->vtnet_vlan_detach != NULL) {
528 		EVENTHANDLER_DEREGISTER(vlan_unconfig, sc->vtnet_vlan_detach);
529 		sc->vtnet_vlan_detach = NULL;
530 	}
531 
532 	ifmedia_removeall(&sc->vtnet_media);
533 
534 	if (ifp != NULL) {
535 		if_free(ifp);
536 		sc->vtnet_ifp = NULL;
537 	}
538 
539 	vtnet_free_rxtx_queues(sc);
540 	vtnet_free_rx_filters(sc);
541 
542 	if (sc->vtnet_ctrl_vq != NULL)
543 		vtnet_free_ctrl_vq(sc);
544 
545 	VTNET_CORE_LOCK_DESTROY(sc);
546 
547 	return (0);
548 }
549 
550 static int
551 vtnet_suspend(device_t dev)
552 {
553 	struct vtnet_softc *sc;
554 
555 	sc = device_get_softc(dev);
556 
557 	VTNET_CORE_LOCK(sc);
558 	vtnet_stop(sc);
559 	sc->vtnet_flags |= VTNET_FLAG_SUSPENDED;
560 	VTNET_CORE_UNLOCK(sc);
561 
562 	return (0);
563 }
564 
565 static int
566 vtnet_resume(device_t dev)
567 {
568 	struct vtnet_softc *sc;
569 	if_t ifp;
570 
571 	sc = device_get_softc(dev);
572 	ifp = sc->vtnet_ifp;
573 
574 	VTNET_CORE_LOCK(sc);
575 	if (if_getflags(ifp) & IFF_UP)
576 		vtnet_init_locked(sc, 0);
577 	sc->vtnet_flags &= ~VTNET_FLAG_SUSPENDED;
578 	VTNET_CORE_UNLOCK(sc);
579 
580 	return (0);
581 }
582 
583 static int
584 vtnet_shutdown(device_t dev)
585 {
586 	/*
587 	 * Suspend already does all of what we need to
588 	 * do here; we just never expect to be resumed.
589 	 */
590 	return (vtnet_suspend(dev));
591 }
592 
593 static int
594 vtnet_attach_completed(device_t dev)
595 {
596 	struct vtnet_softc *sc;
597 
598 	sc = device_get_softc(dev);
599 
600 	VTNET_CORE_LOCK(sc);
601 	vtnet_attached_set_macaddr(sc);
602 	VTNET_CORE_UNLOCK(sc);
603 
604 	return (0);
605 }
606 
607 static int
608 vtnet_config_change(device_t dev)
609 {
610 	struct vtnet_softc *sc;
611 
612 	sc = device_get_softc(dev);
613 
614 	VTNET_CORE_LOCK(sc);
615 	vtnet_update_link_status(sc);
616 	if (sc->vtnet_link_active != 0)
617 		vtnet_tx_start_all(sc);
618 	VTNET_CORE_UNLOCK(sc);
619 
620 	return (0);
621 }
622 
623 static int
624 vtnet_negotiate_features(struct vtnet_softc *sc)
625 {
626 	device_t dev;
627 	uint64_t features, negotiated_features;
628 	int no_csum;
629 
630 	dev = sc->vtnet_dev;
631 	features = virtio_bus_is_modern(dev) ? VTNET_MODERN_FEATURES :
632 	    VTNET_LEGACY_FEATURES;
633 
634 	/*
635 	 * TSO and LRO are only available when their corresponding checksum
636 	 * offload feature is also negotiated.
637 	 */
638 	no_csum = vtnet_tunable_int(sc, "csum_disable", vtnet_csum_disable);
639 	if (no_csum)
640 		features &= ~(VIRTIO_NET_F_CSUM | VIRTIO_NET_F_GUEST_CSUM);
641 	if (no_csum || vtnet_tunable_int(sc, "tso_disable", vtnet_tso_disable))
642 		features &= ~VTNET_TSO_FEATURES;
643 	if (no_csum || vtnet_tunable_int(sc, "lro_disable", vtnet_lro_disable))
644 		features &= ~VTNET_LRO_FEATURES;
645 
646 #ifndef VTNET_LEGACY_TX
647 	if (vtnet_tunable_int(sc, "mq_disable", vtnet_mq_disable))
648 		features &= ~VIRTIO_NET_F_MQ;
649 #else
650 	features &= ~VIRTIO_NET_F_MQ;
651 #endif
652 
653 	negotiated_features = virtio_negotiate_features(dev, features);
654 
655 	if (virtio_with_feature(dev, VIRTIO_NET_F_MTU)) {
656 		uint16_t mtu;
657 
658 		mtu = virtio_read_dev_config_2(dev,
659 		    offsetof(struct virtio_net_config, mtu));
660 		if (mtu < VTNET_MIN_MTU /* || mtu > VTNET_MAX_MTU */) {
661 			device_printf(dev, "Invalid MTU value: %d. "
662 			    "MTU feature disabled.\n", mtu);
663 			features &= ~VIRTIO_NET_F_MTU;
664 			negotiated_features =
665 			    virtio_negotiate_features(dev, features);
666 		}
667 	}
668 
669 	if (virtio_with_feature(dev, VIRTIO_NET_F_MQ)) {
670 		uint16_t npairs;
671 
672 		npairs = virtio_read_dev_config_2(dev,
673 		    offsetof(struct virtio_net_config, max_virtqueue_pairs));
674 		if (npairs < VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MIN ||
675 		    npairs > VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MAX) {
676 			device_printf(dev, "Invalid max_virtqueue_pairs value: "
677 			    "%d. Multiqueue feature disabled.\n", npairs);
678 			features &= ~VIRTIO_NET_F_MQ;
679 			negotiated_features =
680 			    virtio_negotiate_features(dev, features);
681 		}
682 	}
683 
684 	if (virtio_with_feature(dev, VTNET_LRO_FEATURES) &&
685 	    virtio_with_feature(dev, VIRTIO_NET_F_MRG_RXBUF) == 0) {
686 		/*
687 		 * LRO without mergeable buffers requires special care. This
688 		 * is not ideal because every receive buffer must be large
689 		 * enough to hold the maximum TCP packet, the Ethernet header,
690 		 * and the header. This requires up to 34 descriptors with
691 		 * MCLBYTES clusters. If we do not have indirect descriptors,
692 		 * LRO is disabled since the virtqueue will not contain very
693 		 * many receive buffers.
694 		 */
695 		if (!virtio_with_feature(dev, VIRTIO_RING_F_INDIRECT_DESC)) {
696 			device_printf(dev,
697 			    "Host LRO disabled since both mergeable buffers "
698 			    "and indirect descriptors were not negotiated\n");
699 			features &= ~VTNET_LRO_FEATURES;
700 			negotiated_features =
701 			    virtio_negotiate_features(dev, features);
702 		} else
703 			sc->vtnet_flags |= VTNET_FLAG_LRO_NOMRG;
704 	}
705 
706 	sc->vtnet_features = negotiated_features;
707 	sc->vtnet_negotiated_features = negotiated_features;
708 
709 	return (virtio_finalize_features(dev));
710 }
711 
712 static int
713 vtnet_setup_features(struct vtnet_softc *sc)
714 {
715 	device_t dev;
716 	int error;
717 
718 	dev = sc->vtnet_dev;
719 
720 	error = vtnet_negotiate_features(sc);
721 	if (error)
722 		return (error);
723 
724 	if (virtio_with_feature(dev, VIRTIO_F_VERSION_1))
725 		sc->vtnet_flags |= VTNET_FLAG_MODERN;
726 	if (virtio_with_feature(dev, VIRTIO_RING_F_INDIRECT_DESC))
727 		sc->vtnet_flags |= VTNET_FLAG_INDIRECT;
728 	if (virtio_with_feature(dev, VIRTIO_RING_F_EVENT_IDX))
729 		sc->vtnet_flags |= VTNET_FLAG_EVENT_IDX;
730 
731 	if (virtio_with_feature(dev, VIRTIO_NET_F_MAC)) {
732 		/* This feature should always be negotiated. */
733 		sc->vtnet_flags |= VTNET_FLAG_MAC;
734 	}
735 
736 	if (virtio_with_feature(dev, VIRTIO_NET_F_MTU)) {
737 		sc->vtnet_max_mtu = virtio_read_dev_config_2(dev,
738 		    offsetof(struct virtio_net_config, mtu));
739 	} else
740 		sc->vtnet_max_mtu = VTNET_MAX_MTU;
741 
742 	if (virtio_with_feature(dev, VIRTIO_NET_F_MRG_RXBUF)) {
743 		sc->vtnet_flags |= VTNET_FLAG_MRG_RXBUFS;
744 		sc->vtnet_hdr_size = sizeof(struct virtio_net_hdr_mrg_rxbuf);
745 	} else if (vtnet_modern(sc)) {
746 		/* This is identical to the mergeable header. */
747 		sc->vtnet_hdr_size = sizeof(struct virtio_net_hdr_v1);
748 	} else
749 		sc->vtnet_hdr_size = sizeof(struct virtio_net_hdr);
750 
751 	if (vtnet_modern(sc) || sc->vtnet_flags & VTNET_FLAG_MRG_RXBUFS)
752 		sc->vtnet_rx_nsegs = VTNET_RX_SEGS_HDR_INLINE;
753 	else if (sc->vtnet_flags & VTNET_FLAG_LRO_NOMRG)
754 		sc->vtnet_rx_nsegs = VTNET_RX_SEGS_LRO_NOMRG;
755 	else
756 		sc->vtnet_rx_nsegs = VTNET_RX_SEGS_HDR_SEPARATE;
757 
758 	/*
759 	 * Favor "hardware" LRO if negotiated, but support software LRO as
760 	 * a fallback; there is usually little benefit (or worse) with both.
761 	 */
762 	if (virtio_with_feature(dev, VIRTIO_NET_F_GUEST_TSO4) == 0 &&
763 	    virtio_with_feature(dev, VIRTIO_NET_F_GUEST_TSO6) == 0)
764 		sc->vtnet_flags |= VTNET_FLAG_SW_LRO;
765 
766 	if (virtio_with_feature(dev, VIRTIO_NET_F_GSO) ||
767 	    virtio_with_feature(dev, VIRTIO_NET_F_HOST_TSO4) ||
768 	    virtio_with_feature(dev, VIRTIO_NET_F_HOST_TSO6))
769 		sc->vtnet_tx_nsegs = VTNET_TX_SEGS_MAX;
770 	else
771 		sc->vtnet_tx_nsegs = VTNET_TX_SEGS_MIN;
772 
773 	sc->vtnet_req_vq_pairs = 1;
774 	sc->vtnet_max_vq_pairs = 1;
775 
776 	if (virtio_with_feature(dev, VIRTIO_NET_F_CTRL_VQ)) {
777 		sc->vtnet_flags |= VTNET_FLAG_CTRL_VQ;
778 
779 		if (virtio_with_feature(dev, VIRTIO_NET_F_CTRL_RX))
780 			sc->vtnet_flags |= VTNET_FLAG_CTRL_RX;
781 		if (virtio_with_feature(dev, VIRTIO_NET_F_CTRL_VLAN))
782 			sc->vtnet_flags |= VTNET_FLAG_VLAN_FILTER;
783 		if (virtio_with_feature(dev, VIRTIO_NET_F_CTRL_MAC_ADDR))
784 			sc->vtnet_flags |= VTNET_FLAG_CTRL_MAC;
785 
786 		if (virtio_with_feature(dev, VIRTIO_NET_F_MQ)) {
787 			sc->vtnet_max_vq_pairs = virtio_read_dev_config_2(dev,
788 			    offsetof(struct virtio_net_config,
789 			    max_virtqueue_pairs));
790 		}
791 	}
792 
793 	if (sc->vtnet_max_vq_pairs > 1) {
794 		int req;
795 
796 		/*
797 		 * Limit the maximum number of requested queue pairs to the
798 		 * number of CPUs and the configured maximum.
799 		 */
800 		req = vtnet_tunable_int(sc, "mq_max_pairs", vtnet_mq_max_pairs);
801 		if (req < 0)
802 			req = 1;
803 		if (req == 0)
804 			req = mp_ncpus;
805 		if (req > sc->vtnet_max_vq_pairs)
806 			req = sc->vtnet_max_vq_pairs;
807 		if (req > mp_ncpus)
808 			req = mp_ncpus;
809 		if (req > 1) {
810 			sc->vtnet_req_vq_pairs = req;
811 			sc->vtnet_flags |= VTNET_FLAG_MQ;
812 		}
813 	}
814 
815 	return (0);
816 }
817 
818 static int
819 vtnet_init_rxq(struct vtnet_softc *sc, int id)
820 {
821 	struct vtnet_rxq *rxq;
822 
823 	rxq = &sc->vtnet_rxqs[id];
824 
825 	snprintf(rxq->vtnrx_name, sizeof(rxq->vtnrx_name), "%s-rx%d",
826 	    device_get_nameunit(sc->vtnet_dev), id);
827 	mtx_init(&rxq->vtnrx_mtx, rxq->vtnrx_name, NULL, MTX_DEF);
828 
829 	rxq->vtnrx_sc = sc;
830 	rxq->vtnrx_id = id;
831 
832 	rxq->vtnrx_sg = sglist_alloc(sc->vtnet_rx_nsegs, M_NOWAIT);
833 	if (rxq->vtnrx_sg == NULL)
834 		return (ENOMEM);
835 
836 #if defined(INET) || defined(INET6)
837 	if (vtnet_software_lro(sc)) {
838 		if (tcp_lro_init_args(&rxq->vtnrx_lro, sc->vtnet_ifp,
839 		    sc->vtnet_lro_entry_count, sc->vtnet_lro_mbufq_depth) != 0)
840 			return (ENOMEM);
841 	}
842 #endif
843 
844 	NET_TASK_INIT(&rxq->vtnrx_intrtask, 0, vtnet_rxq_tq_intr, rxq);
845 	rxq->vtnrx_tq = taskqueue_create(rxq->vtnrx_name, M_NOWAIT,
846 	    taskqueue_thread_enqueue, &rxq->vtnrx_tq);
847 
848 	return (rxq->vtnrx_tq == NULL ? ENOMEM : 0);
849 }
850 
851 static int
852 vtnet_init_txq(struct vtnet_softc *sc, int id)
853 {
854 	struct vtnet_txq *txq;
855 
856 	txq = &sc->vtnet_txqs[id];
857 
858 	snprintf(txq->vtntx_name, sizeof(txq->vtntx_name), "%s-tx%d",
859 	    device_get_nameunit(sc->vtnet_dev), id);
860 	mtx_init(&txq->vtntx_mtx, txq->vtntx_name, NULL, MTX_DEF);
861 
862 	txq->vtntx_sc = sc;
863 	txq->vtntx_id = id;
864 
865 	txq->vtntx_sg = sglist_alloc(sc->vtnet_tx_nsegs, M_NOWAIT);
866 	if (txq->vtntx_sg == NULL)
867 		return (ENOMEM);
868 
869 #ifndef VTNET_LEGACY_TX
870 	txq->vtntx_br = buf_ring_alloc(VTNET_DEFAULT_BUFRING_SIZE, M_DEVBUF,
871 	    M_NOWAIT, &txq->vtntx_mtx);
872 	if (txq->vtntx_br == NULL)
873 		return (ENOMEM);
874 
875 	TASK_INIT(&txq->vtntx_defrtask, 0, vtnet_txq_tq_deferred, txq);
876 #endif
877 	TASK_INIT(&txq->vtntx_intrtask, 0, vtnet_txq_tq_intr, txq);
878 	txq->vtntx_tq = taskqueue_create(txq->vtntx_name, M_NOWAIT,
879 	    taskqueue_thread_enqueue, &txq->vtntx_tq);
880 	if (txq->vtntx_tq == NULL)
881 		return (ENOMEM);
882 
883 	return (0);
884 }
885 
886 static int
887 vtnet_alloc_rxtx_queues(struct vtnet_softc *sc)
888 {
889 	int i, npairs, error;
890 
891 	npairs = sc->vtnet_max_vq_pairs;
892 
893 	sc->vtnet_rxqs = malloc(sizeof(struct vtnet_rxq) * npairs, M_DEVBUF,
894 	    M_NOWAIT | M_ZERO);
895 	sc->vtnet_txqs = malloc(sizeof(struct vtnet_txq) * npairs, M_DEVBUF,
896 	    M_NOWAIT | M_ZERO);
897 	if (sc->vtnet_rxqs == NULL || sc->vtnet_txqs == NULL)
898 		return (ENOMEM);
899 
900 	for (i = 0; i < npairs; i++) {
901 		error = vtnet_init_rxq(sc, i);
902 		if (error)
903 			return (error);
904 		error = vtnet_init_txq(sc, i);
905 		if (error)
906 			return (error);
907 	}
908 
909 	vtnet_set_rx_process_limit(sc);
910 	vtnet_setup_queue_sysctl(sc);
911 
912 	return (0);
913 }
914 
915 static void
916 vtnet_destroy_rxq(struct vtnet_rxq *rxq)
917 {
918 
919 	rxq->vtnrx_sc = NULL;
920 	rxq->vtnrx_id = -1;
921 
922 #if defined(INET) || defined(INET6)
923 	tcp_lro_free(&rxq->vtnrx_lro);
924 #endif
925 
926 	if (rxq->vtnrx_sg != NULL) {
927 		sglist_free(rxq->vtnrx_sg);
928 		rxq->vtnrx_sg = NULL;
929 	}
930 
931 	if (mtx_initialized(&rxq->vtnrx_mtx) != 0)
932 		mtx_destroy(&rxq->vtnrx_mtx);
933 }
934 
935 static void
936 vtnet_destroy_txq(struct vtnet_txq *txq)
937 {
938 
939 	txq->vtntx_sc = NULL;
940 	txq->vtntx_id = -1;
941 
942 	if (txq->vtntx_sg != NULL) {
943 		sglist_free(txq->vtntx_sg);
944 		txq->vtntx_sg = NULL;
945 	}
946 
947 #ifndef VTNET_LEGACY_TX
948 	if (txq->vtntx_br != NULL) {
949 		buf_ring_free(txq->vtntx_br, M_DEVBUF);
950 		txq->vtntx_br = NULL;
951 	}
952 #endif
953 
954 	if (mtx_initialized(&txq->vtntx_mtx) != 0)
955 		mtx_destroy(&txq->vtntx_mtx);
956 }
957 
958 static void
959 vtnet_free_rxtx_queues(struct vtnet_softc *sc)
960 {
961 	int i;
962 
963 	if (sc->vtnet_rxqs != NULL) {
964 		for (i = 0; i < sc->vtnet_max_vq_pairs; i++)
965 			vtnet_destroy_rxq(&sc->vtnet_rxqs[i]);
966 		free(sc->vtnet_rxqs, M_DEVBUF);
967 		sc->vtnet_rxqs = NULL;
968 	}
969 
970 	if (sc->vtnet_txqs != NULL) {
971 		for (i = 0; i < sc->vtnet_max_vq_pairs; i++)
972 			vtnet_destroy_txq(&sc->vtnet_txqs[i]);
973 		free(sc->vtnet_txqs, M_DEVBUF);
974 		sc->vtnet_txqs = NULL;
975 	}
976 }
977 
978 static int
979 vtnet_alloc_rx_filters(struct vtnet_softc *sc)
980 {
981 
982 	if (sc->vtnet_flags & VTNET_FLAG_CTRL_RX) {
983 		sc->vtnet_mac_filter = malloc(sizeof(struct vtnet_mac_filter),
984 		    M_DEVBUF, M_NOWAIT | M_ZERO);
985 		if (sc->vtnet_mac_filter == NULL)
986 			return (ENOMEM);
987 	}
988 
989 	if (sc->vtnet_flags & VTNET_FLAG_VLAN_FILTER) {
990 		sc->vtnet_vlan_filter = malloc(sizeof(uint32_t) *
991 		    VTNET_VLAN_FILTER_NWORDS, M_DEVBUF, M_NOWAIT | M_ZERO);
992 		if (sc->vtnet_vlan_filter == NULL)
993 			return (ENOMEM);
994 	}
995 
996 	return (0);
997 }
998 
999 static void
1000 vtnet_free_rx_filters(struct vtnet_softc *sc)
1001 {
1002 
1003 	if (sc->vtnet_mac_filter != NULL) {
1004 		free(sc->vtnet_mac_filter, M_DEVBUF);
1005 		sc->vtnet_mac_filter = NULL;
1006 	}
1007 
1008 	if (sc->vtnet_vlan_filter != NULL) {
1009 		free(sc->vtnet_vlan_filter, M_DEVBUF);
1010 		sc->vtnet_vlan_filter = NULL;
1011 	}
1012 }
1013 
1014 static int
1015 vtnet_alloc_virtqueues(struct vtnet_softc *sc)
1016 {
1017 	device_t dev;
1018 	struct vq_alloc_info *info;
1019 	struct vtnet_rxq *rxq;
1020 	struct vtnet_txq *txq;
1021 	int i, idx, nvqs, error;
1022 
1023 	dev = sc->vtnet_dev;
1024 
1025 	nvqs = sc->vtnet_max_vq_pairs * 2;
1026 	if (sc->vtnet_flags & VTNET_FLAG_CTRL_VQ)
1027 		nvqs++;
1028 
1029 	info = malloc(sizeof(struct vq_alloc_info) * nvqs, M_TEMP, M_NOWAIT);
1030 	if (info == NULL)
1031 		return (ENOMEM);
1032 
1033 	for (i = 0, idx = 0; i < sc->vtnet_req_vq_pairs; i++, idx += 2) {
1034 		rxq = &sc->vtnet_rxqs[i];
1035 		VQ_ALLOC_INFO_INIT(&info[idx], sc->vtnet_rx_nsegs,
1036 		    vtnet_rx_vq_intr, rxq, &rxq->vtnrx_vq,
1037 		    "%s-rx%d", device_get_nameunit(dev), rxq->vtnrx_id);
1038 
1039 		txq = &sc->vtnet_txqs[i];
1040 		VQ_ALLOC_INFO_INIT(&info[idx+1], sc->vtnet_tx_nsegs,
1041 		    vtnet_tx_vq_intr, txq, &txq->vtntx_vq,
1042 		    "%s-tx%d", device_get_nameunit(dev), txq->vtntx_id);
1043 	}
1044 
1045 	/* These queues will not be used so allocate the minimum resources. */
1046 	for (/**/; i < sc->vtnet_max_vq_pairs; i++, idx += 2) {
1047 		rxq = &sc->vtnet_rxqs[i];
1048 		VQ_ALLOC_INFO_INIT(&info[idx], 0, NULL, rxq, &rxq->vtnrx_vq,
1049 		    "%s-rx%d", device_get_nameunit(dev), rxq->vtnrx_id);
1050 
1051 		txq = &sc->vtnet_txqs[i];
1052 		VQ_ALLOC_INFO_INIT(&info[idx+1], 0, NULL, txq, &txq->vtntx_vq,
1053 		    "%s-tx%d", device_get_nameunit(dev), txq->vtntx_id);
1054 	}
1055 
1056 	if (sc->vtnet_flags & VTNET_FLAG_CTRL_VQ) {
1057 		VQ_ALLOC_INFO_INIT(&info[idx], 0, NULL, NULL,
1058 		    &sc->vtnet_ctrl_vq, "%s ctrl", device_get_nameunit(dev));
1059 	}
1060 
1061 	error = virtio_alloc_virtqueues(dev, nvqs, info);
1062 	free(info, M_TEMP);
1063 
1064 	return (error);
1065 }
1066 
1067 static int
1068 vtnet_alloc_interface(struct vtnet_softc *sc)
1069 {
1070 	device_t dev;
1071 	if_t ifp;
1072 
1073 	dev = sc->vtnet_dev;
1074 
1075 	ifp = if_alloc(IFT_ETHER);
1076 	if (ifp == NULL)
1077 		return (ENOMEM);
1078 
1079 	sc->vtnet_ifp = ifp;
1080 	if_setsoftc(ifp, sc);
1081 	if_initname(ifp, device_get_name(dev), device_get_unit(dev));
1082 
1083 	return (0);
1084 }
1085 
/*
 * Configure the previously allocated ifnet: install the driver entry points,
 * read the MAC address, set up ifmedia, translate the negotiated VirtIO
 * features into interface capabilities, attach to the Ethernet layer, and
 * register a pfil head for inbound Ethernet frames.
 *
 * Always returns 0.
 */
static int
vtnet_setup_interface(struct vtnet_softc *sc)
{
	device_t dev;
	struct pfil_head_args pa;
	if_t ifp;

	dev = sc->vtnet_dev;
	ifp = sc->vtnet_ifp;

	if_setflags(ifp, IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST);
	if_setbaudrate(ifp, IF_Gbps(10));
	if_setinitfn(ifp, vtnet_init);
	if_setioctlfn(ifp, vtnet_ioctl);
	if_setgetcounterfn(ifp, vtnet_get_counter);
#ifndef VTNET_LEGACY_TX
	/* Multiqueue transmit path. */
	if_settransmitfn(ifp, vtnet_txq_mq_start);
	if_setqflushfn(ifp, vtnet_qflush);
#else
	/* Legacy if_start path; the send queue is sized from Tx queue 0. */
	struct virtqueue *vq = sc->vtnet_txqs[0].vtntx_vq;
	if_setstartfn(ifp, vtnet_start);
	if_setsendqlen(ifp, virtqueue_size(vq) - 1);
	if_setsendqready(ifp);
#endif

	vtnet_get_macaddr(sc);

	if (virtio_with_feature(dev, VIRTIO_NET_F_STATUS))
		if_setcapabilitiesbit(ifp, IFCAP_LINKSTATE, 0);

	/* A single autoselect Ethernet media entry is exposed. */
	ifmedia_init(&sc->vtnet_media, 0, vtnet_ifmedia_upd, vtnet_ifmedia_sts);
	ifmedia_add(&sc->vtnet_media, IFM_ETHER | IFM_AUTO, 0, NULL);
	ifmedia_set(&sc->vtnet_media, IFM_ETHER | IFM_AUTO);

	/* Tx checksum offload; TSO is only advertised together with it. */
	if (virtio_with_feature(dev, VIRTIO_NET_F_CSUM)) {
		int gso;

		if_setcapabilitiesbit(ifp, IFCAP_TXCSUM | IFCAP_TXCSUM_IPV6, 0);

		/* VIRTIO_NET_F_GSO stands in for each of the HOST_* features. */
		gso = virtio_with_feature(dev, VIRTIO_NET_F_GSO);
		if (gso || virtio_with_feature(dev, VIRTIO_NET_F_HOST_TSO4))
			if_setcapabilitiesbit(ifp, IFCAP_TSO4, 0);
		if (gso || virtio_with_feature(dev, VIRTIO_NET_F_HOST_TSO6))
			if_setcapabilitiesbit(ifp, IFCAP_TSO6, 0);
		if (gso || virtio_with_feature(dev, VIRTIO_NET_F_HOST_ECN))
			sc->vtnet_flags |= VTNET_FLAG_TSO_ECN;

		if (if_getcapabilities(ifp) & (IFCAP_TSO4 | IFCAP_TSO6)) {
			int tso_maxlen;

			if_setcapabilitiesbit(ifp, IFCAP_VLAN_HWTSO, 0);

			/*
			 * The TSO limit comes from the "tso_maxlen" tunable
			 * (default vtnet_tso_maxlen) less the Ethernet and
			 * VLAN encapsulation headers.
			 */
			tso_maxlen = vtnet_tunable_int(sc, "tso_maxlen",
			    vtnet_tso_maxlen);
			if_sethwtsomax(ifp, tso_maxlen -
			    (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN));
			if_sethwtsomaxsegcount(ifp, sc->vtnet_tx_nsegs - 1);
			if_sethwtsomaxsegsize(ifp, PAGE_SIZE);
		}
	}

	/* Rx checksum offload, which also makes LRO available. */
	if (virtio_with_feature(dev, VIRTIO_NET_F_GUEST_CSUM)) {
		if_setcapabilitiesbit(ifp, IFCAP_RXCSUM, 0);
#ifdef notyet
		/* BMV: Rx checksums not distinguished between IPv4 and IPv6. */
		if_setcapabilitiesbit(ifp, IFCAP_RXCSUM_IPV6, 0);
#endif

		if (vtnet_tunable_int(sc, "fixup_needs_csum",
		    vtnet_fixup_needs_csum) != 0)
			sc->vtnet_flags |= VTNET_FLAG_FIXUP_NEEDS_CSUM;

		/* Support either "hardware" or software LRO. */
		if_setcapabilitiesbit(ifp, IFCAP_LRO, 0);
	}

	if (if_getcapabilities(ifp) & (IFCAP_HWCSUM | IFCAP_HWCSUM_IPV6)) {
		/*
		 * VirtIO does not support VLAN tagging, but we can fake
		 * it by inserting and removing the 802.1Q header during
		 * transmit and receive. We are then able to do checksum
		 * offloading of VLAN frames.
		 */
		if_setcapabilitiesbit(ifp, IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_HWCSUM, 0);
	}

	if (sc->vtnet_max_mtu >= ETHERMTU_JUMBO)
		if_setcapabilitiesbit(ifp, IFCAP_JUMBO_MTU, 0);
	if_setcapabilitiesbit(ifp, IFCAP_VLAN_MTU, 0);

	/*
	 * Capabilities after here are not enabled by default.
	 */
	if_setcapenable(ifp, if_getcapabilities(ifp));

	if (sc->vtnet_flags & VTNET_FLAG_VLAN_FILTER) {
		if_setcapabilitiesbit(ifp, IFCAP_VLAN_HWFILTER, 0);

		/* Track VLAN (un)configuration so the filter can follow it. */
		sc->vtnet_vlan_attach = EVENTHANDLER_REGISTER(vlan_config,
		    vtnet_register_vlan, sc, EVENTHANDLER_PRI_FIRST);
		sc->vtnet_vlan_detach = EVENTHANDLER_REGISTER(vlan_unconfig,
		    vtnet_unregister_vlan, sc, EVENTHANDLER_PRI_FIRST);
	}

	ether_ifattach(ifp, sc->vtnet_hwaddr);

	/* Tell the upper layer(s) we support long frames. */
	if_setifheaderlen(ifp, sizeof(struct ether_vlan_header));

	DEBUGNET_SET(ifp, vtnet);

	/* Register an inbound Ethernet pfil head named after the interface. */
	pa.pa_version = PFIL_VERSION;
	pa.pa_flags = PFIL_IN;
	pa.pa_type = PFIL_TYPE_ETHERNET;
	pa.pa_headname = if_name(ifp);
	sc->vtnet_pfil = pfil_head_register(&pa);

	return (0);
}
1205 
1206 static int
1207 vtnet_rx_cluster_size(struct vtnet_softc *sc, int mtu)
1208 {
1209 	int framesz;
1210 
1211 	if (sc->vtnet_flags & VTNET_FLAG_MRG_RXBUFS)
1212 		return (MJUMPAGESIZE);
1213 	else if (sc->vtnet_flags & VTNET_FLAG_LRO_NOMRG)
1214 		return (MCLBYTES);
1215 
1216 	/*
1217 	 * Try to scale the receive mbuf cluster size from the MTU. We
1218 	 * could also use the VQ size to influence the selected size,
1219 	 * but that would only matter for very small queues.
1220 	 */
1221 	if (vtnet_modern(sc)) {
1222 		MPASS(sc->vtnet_hdr_size == sizeof(struct virtio_net_hdr_v1));
1223 		framesz = sizeof(struct virtio_net_hdr_v1);
1224 	} else
1225 		framesz = sizeof(struct vtnet_rx_header);
1226 	framesz += sizeof(struct ether_vlan_header) + mtu;
1227 
1228 	if (framesz <= MCLBYTES)
1229 		return (MCLBYTES);
1230 	else if (framesz <= MJUMPAGESIZE)
1231 		return (MJUMPAGESIZE);
1232 	else if (framesz <= MJUM9BYTES)
1233 		return (MJUM9BYTES);
1234 
1235 	/* Sane default; avoid 16KB clusters. */
1236 	return (MCLBYTES);
1237 }
1238 
1239 static int
1240 vtnet_ioctl_mtu(struct vtnet_softc *sc, u_int mtu)
1241 {
1242 	if_t ifp;
1243 	int clustersz;
1244 
1245 	ifp = sc->vtnet_ifp;
1246 	VTNET_CORE_LOCK_ASSERT(sc);
1247 
1248 	if (if_getmtu(ifp) == mtu)
1249 		return (0);
1250 	else if (mtu < ETHERMIN || mtu > sc->vtnet_max_mtu)
1251 		return (EINVAL);
1252 
1253 	if_setmtu(ifp, mtu);
1254 	clustersz = vtnet_rx_cluster_size(sc, mtu);
1255 
1256 	if (clustersz != sc->vtnet_rx_clustersz &&
1257 	    if_getdrvflags(ifp) & IFF_DRV_RUNNING) {
1258 		if_setdrvflagbits(ifp, 0, IFF_DRV_RUNNING);
1259 		vtnet_init_locked(sc, 0);
1260 	}
1261 
1262 	return (0);
1263 }
1264 
1265 static int
1266 vtnet_ioctl_ifflags(struct vtnet_softc *sc)
1267 {
1268 	if_t ifp;
1269 	int drv_running;
1270 
1271 	ifp = sc->vtnet_ifp;
1272 	drv_running = (if_getdrvflags(ifp) & IFF_DRV_RUNNING) != 0;
1273 
1274 	VTNET_CORE_LOCK_ASSERT(sc);
1275 
1276 	if ((if_getflags(ifp) & IFF_UP) == 0) {
1277 		if (drv_running)
1278 			vtnet_stop(sc);
1279 		goto out;
1280 	}
1281 
1282 	if (!drv_running) {
1283 		vtnet_init_locked(sc, 0);
1284 		goto out;
1285 	}
1286 
1287 	if ((if_getflags(ifp) ^ sc->vtnet_if_flags) &
1288 	    (IFF_PROMISC | IFF_ALLMULTI)) {
1289 		if (sc->vtnet_flags & VTNET_FLAG_CTRL_RX)
1290 			vtnet_rx_filter(sc);
1291 		else {
1292 			/*
1293 			 * We don't support filtering out multicast, so
1294 			 * ALLMULTI is always set.
1295 			 */
1296 			if_setflagbits(ifp, IFF_ALLMULTI, 0);
1297 			if_setflagbits(ifp, IFF_PROMISC, 0);
1298 		}
1299 	}
1300 
1301 out:
1302 	sc->vtnet_if_flags = if_getflags(ifp);
1303 	return (0);
1304 }
1305 
1306 static int
1307 vtnet_ioctl_multi(struct vtnet_softc *sc)
1308 {
1309 	if_t ifp;
1310 
1311 	ifp = sc->vtnet_ifp;
1312 
1313 	VTNET_CORE_LOCK_ASSERT(sc);
1314 
1315 	if (sc->vtnet_flags & VTNET_FLAG_CTRL_RX &&
1316 	    if_getdrvflags(ifp) & IFF_DRV_RUNNING)
1317 		vtnet_rx_filter_mac(sc);
1318 
1319 	return (0);
1320 }
1321 
1322 static int
1323 vtnet_ioctl_ifcap(struct vtnet_softc *sc, struct ifreq *ifr)
1324 {
1325 	if_t ifp;
1326 	int mask, reinit, update;
1327 
1328 	ifp = sc->vtnet_ifp;
1329 	mask = (ifr->ifr_reqcap & if_getcapabilities(ifp)) ^ if_getcapenable(ifp);
1330 	reinit = update = 0;
1331 
1332 	VTNET_CORE_LOCK_ASSERT(sc);
1333 
1334 	if (mask & IFCAP_TXCSUM)
1335 		if_togglecapenable(ifp, IFCAP_TXCSUM);
1336 	if (mask & IFCAP_TXCSUM_IPV6)
1337 		if_togglecapenable(ifp, IFCAP_TXCSUM_IPV6);
1338 	if (mask & IFCAP_TSO4)
1339 		if_togglecapenable(ifp, IFCAP_TSO4);
1340 	if (mask & IFCAP_TSO6)
1341 		if_togglecapenable(ifp, IFCAP_TSO6);
1342 
1343 	if (mask & (IFCAP_RXCSUM | IFCAP_RXCSUM_IPV6 | IFCAP_LRO)) {
1344 		/*
1345 		 * These Rx features require the negotiated features to
1346 		 * be updated. Avoid a full reinit if possible.
1347 		 */
1348 		if (sc->vtnet_features & VIRTIO_NET_F_CTRL_GUEST_OFFLOADS)
1349 			update = 1;
1350 		else
1351 			reinit = 1;
1352 
1353 		/* BMV: Avoid needless renegotiation for just software LRO. */
1354 		if ((mask & (IFCAP_RXCSUM | IFCAP_RXCSUM_IPV6 | IFCAP_LRO)) ==
1355 		    IFCAP_LRO && vtnet_software_lro(sc))
1356 			reinit = update = 0;
1357 
1358 		if (mask & IFCAP_RXCSUM)
1359 			if_togglecapenable(ifp, IFCAP_RXCSUM);
1360 		if (mask & IFCAP_RXCSUM_IPV6)
1361 			if_togglecapenable(ifp, IFCAP_RXCSUM_IPV6);
1362 		if (mask & IFCAP_LRO)
1363 			if_togglecapenable(ifp, IFCAP_LRO);
1364 
1365 		/*
1366 		 * VirtIO does not distinguish between IPv4 and IPv6 checksums
1367 		 * so treat them as a pair. Guest TSO (LRO) requires receive
1368 		 * checksums.
1369 		 */
1370 		if (if_getcapenable(ifp) & (IFCAP_RXCSUM | IFCAP_RXCSUM_IPV6)) {
1371 			if_setcapenablebit(ifp, IFCAP_RXCSUM, 0);
1372 #ifdef notyet
1373 			if_setcapenablebit(ifp, IFCAP_RXCSUM_IPV6, 0);
1374 #endif
1375 		} else
1376 			if_setcapenablebit(ifp, 0,
1377 			    (IFCAP_RXCSUM | IFCAP_RXCSUM_IPV6 | IFCAP_LRO));
1378 	}
1379 
1380 	if (mask & IFCAP_VLAN_HWFILTER) {
1381 		/* These Rx features require renegotiation. */
1382 		reinit = 1;
1383 
1384 		if (mask & IFCAP_VLAN_HWFILTER)
1385 			if_togglecapenable(ifp, IFCAP_VLAN_HWFILTER);
1386 	}
1387 
1388 	if (mask & IFCAP_VLAN_HWTSO)
1389 		if_togglecapenable(ifp, IFCAP_VLAN_HWTSO);
1390 	if (mask & IFCAP_VLAN_HWTAGGING)
1391 		if_togglecapenable(ifp, IFCAP_VLAN_HWTAGGING);
1392 
1393 	if (if_getdrvflags(ifp) & IFF_DRV_RUNNING) {
1394 		if (reinit) {
1395 			if_setdrvflagbits(ifp, 0, IFF_DRV_RUNNING);
1396 			vtnet_init_locked(sc, 0);
1397 		} else if (update)
1398 			vtnet_update_rx_offloads(sc);
1399 	}
1400 
1401 	return (0);
1402 }
1403 
1404 static int
1405 vtnet_ioctl(if_t ifp, u_long cmd, caddr_t data)
1406 {
1407 	struct vtnet_softc *sc;
1408 	struct ifreq *ifr;
1409 	int error;
1410 
1411 	sc = if_getsoftc(ifp);
1412 	ifr = (struct ifreq *) data;
1413 	error = 0;
1414 
1415 	switch (cmd) {
1416 	case SIOCSIFMTU:
1417 		VTNET_CORE_LOCK(sc);
1418 		error = vtnet_ioctl_mtu(sc, ifr->ifr_mtu);
1419 		VTNET_CORE_UNLOCK(sc);
1420 		break;
1421 
1422 	case SIOCSIFFLAGS:
1423 		VTNET_CORE_LOCK(sc);
1424 		error = vtnet_ioctl_ifflags(sc);
1425 		VTNET_CORE_UNLOCK(sc);
1426 		break;
1427 
1428 	case SIOCADDMULTI:
1429 	case SIOCDELMULTI:
1430 		VTNET_CORE_LOCK(sc);
1431 		error = vtnet_ioctl_multi(sc);
1432 		VTNET_CORE_UNLOCK(sc);
1433 		break;
1434 
1435 	case SIOCSIFMEDIA:
1436 	case SIOCGIFMEDIA:
1437 		error = ifmedia_ioctl(ifp, ifr, &sc->vtnet_media, cmd);
1438 		break;
1439 
1440 	case SIOCSIFCAP:
1441 		VTNET_CORE_LOCK(sc);
1442 		error = vtnet_ioctl_ifcap(sc, ifr);
1443 		VTNET_CORE_UNLOCK(sc);
1444 		VLAN_CAPABILITIES(ifp);
1445 		break;
1446 
1447 	default:
1448 		error = ether_ioctl(ifp, cmd, data);
1449 		break;
1450 	}
1451 
1452 	VTNET_CORE_LOCK_ASSERT_NOTOWNED(sc);
1453 
1454 	return (error);
1455 }
1456 
1457 static int
1458 vtnet_rxq_populate(struct vtnet_rxq *rxq)
1459 {
1460 	struct virtqueue *vq;
1461 	int nbufs, error;
1462 
1463 #ifdef DEV_NETMAP
1464 	error = vtnet_netmap_rxq_populate(rxq);
1465 	if (error >= 0)
1466 		return (error);
1467 #endif  /* DEV_NETMAP */
1468 
1469 	vq = rxq->vtnrx_vq;
1470 	error = ENOSPC;
1471 
1472 	for (nbufs = 0; !virtqueue_full(vq); nbufs++) {
1473 		error = vtnet_rxq_new_buf(rxq);
1474 		if (error)
1475 			break;
1476 	}
1477 
1478 	if (nbufs > 0) {
1479 		virtqueue_notify(vq);
1480 		/*
1481 		 * EMSGSIZE signifies the virtqueue did not have enough
1482 		 * entries available to hold the last mbuf. This is not
1483 		 * an error.
1484 		 */
1485 		if (error == EMSGSIZE)
1486 			error = 0;
1487 	}
1488 
1489 	return (error);
1490 }
1491 
1492 static void
1493 vtnet_rxq_free_mbufs(struct vtnet_rxq *rxq)
1494 {
1495 	struct virtqueue *vq;
1496 	struct mbuf *m;
1497 	int last;
1498 #ifdef DEV_NETMAP
1499 	struct netmap_kring *kring = netmap_kring_on(NA(rxq->vtnrx_sc->vtnet_ifp),
1500 							rxq->vtnrx_id, NR_RX);
1501 #else  /* !DEV_NETMAP */
1502 	void *kring = NULL;
1503 #endif /* !DEV_NETMAP */
1504 
1505 	vq = rxq->vtnrx_vq;
1506 	last = 0;
1507 
1508 	while ((m = virtqueue_drain(vq, &last)) != NULL) {
1509 		if (kring == NULL)
1510 			m_freem(m);
1511 	}
1512 
1513 	KASSERT(virtqueue_empty(vq),
1514 	    ("%s: mbufs remaining in rx queue %p", __func__, rxq));
1515 }
1516 
1517 static struct mbuf *
1518 vtnet_rx_alloc_buf(struct vtnet_softc *sc, int nbufs, struct mbuf **m_tailp)
1519 {
1520 	struct mbuf *m_head, *m_tail, *m;
1521 	int i, size;
1522 
1523 	m_head = NULL;
1524 	size = sc->vtnet_rx_clustersz;
1525 
1526 	KASSERT(nbufs == 1 || sc->vtnet_flags & VTNET_FLAG_LRO_NOMRG,
1527 	    ("%s: mbuf %d chain requested without LRO_NOMRG", __func__, nbufs));
1528 
1529 	for (i = 0; i < nbufs; i++) {
1530 		m = m_getjcl(M_NOWAIT, MT_DATA, i == 0 ? M_PKTHDR : 0, size);
1531 		if (m == NULL) {
1532 			sc->vtnet_stats.mbuf_alloc_failed++;
1533 			m_freem(m_head);
1534 			return (NULL);
1535 		}
1536 
1537 		m->m_len = size;
1538 		if (m_head != NULL) {
1539 			m_tail->m_next = m;
1540 			m_tail = m;
1541 		} else
1542 			m_head = m_tail = m;
1543 	}
1544 
1545 	if (m_tailp != NULL)
1546 		*m_tailp = m_tail;
1547 
1548 	return (m_head);
1549 }
1550 
/*
 * Slow path for when LRO without mergeable buffers is negotiated.
 *
 * m0 is the mbuf chain just dequeued and len0 the number of bytes received
 * into it.  The mbufs that hold received data are trimmed to their used
 * length and an equally long replacement chain is enqueued in their place;
 * unused trailing mbufs of m0 are handed over to the replacement chain.
 *
 * Returns 0 on success, EMSGSIZE if len0 exceeds the chain, ENOBUFS if the
 * replacement cannot be allocated, or the error from enqueueing it.
 */
static int
vtnet_rxq_replace_lro_nomrg_buf(struct vtnet_rxq *rxq, struct mbuf *m0,
    int len0)
{
	struct vtnet_softc *sc;
	struct mbuf *m, *m_prev, *m_new, *m_tail;
	int len, clustersz, nreplace, error;

	sc = rxq->vtnrx_sc;
	clustersz = sc->vtnet_rx_clustersz;

	m_prev = NULL;
	m_tail = NULL;
	nreplace = 0;

	m = m0;
	len = len0;

	/*
	 * Since these mbuf chains are so large, avoid allocating a complete
	 * replacement when the received frame did not consume the entire
	 * chain. Unused mbufs are moved to the tail of the replacement mbuf.
	 */
	while (len > 0) {
		if (m == NULL) {
			sc->vtnet_stats.rx_frame_too_large++;
			return (EMSGSIZE);
		}

		/*
		 * Every mbuf should have the expected cluster size since that
		 * is also used to allocate the replacements.
		 */
		KASSERT(m->m_len == clustersz,
		    ("%s: mbuf size %d not expected cluster size %d", __func__,
		    m->m_len, clustersz));

		/* Only the last mbuf holding data can end up shortened. */
		m->m_len = MIN(m->m_len, len);
		len -= m->m_len;

		m_prev = m;
		m = m->m_next;
		nreplace++;
	}

	KASSERT(nreplace > 0 && nreplace <= sc->vtnet_rx_nmbufs,
	    ("%s: invalid replacement mbuf count %d max %d", __func__,
	    nreplace, sc->vtnet_rx_nmbufs));

	m_new = vtnet_rx_alloc_buf(sc, nreplace, &m_tail);
	if (m_new == NULL) {
		/* Undo the trim of the last used mbuf before giving up. */
		m_prev->m_len = clustersz;
		return (ENOBUFS);
	}

	/*
	 * Move any unused mbufs from the received mbuf chain onto the
	 * end of the replacement chain.
	 */
	if (m_prev->m_next != NULL) {
		m_tail->m_next = m_prev->m_next;
		m_prev->m_next = NULL;
	}

	error = vtnet_rxq_enqueue_buf(rxq, m_new);
	if (error) {
		/*
		 * The replacement is supposed to be a copy of the one
		 * dequeued so this is a very unexpected error.
		 *
		 * Restore the m0 chain to the original state if it was
		 * modified so we can then discard it.
		 */
		if (m_tail->m_next != NULL) {
			m_prev->m_next = m_tail->m_next;
			m_tail->m_next = NULL;
		}
		m_prev->m_len = clustersz;
		sc->vtnet_stats.rx_enq_replacement_failed++;
		m_freem(m_new);
	}

	return (error);
}
1638 
1639 static int
1640 vtnet_rxq_replace_buf(struct vtnet_rxq *rxq, struct mbuf *m, int len)
1641 {
1642 	struct vtnet_softc *sc;
1643 	struct mbuf *m_new;
1644 	int error;
1645 
1646 	sc = rxq->vtnrx_sc;
1647 
1648 	if (sc->vtnet_flags & VTNET_FLAG_LRO_NOMRG)
1649 		return (vtnet_rxq_replace_lro_nomrg_buf(rxq, m, len));
1650 
1651 	MPASS(m->m_next == NULL);
1652 	if (m->m_len < len)
1653 		return (EMSGSIZE);
1654 
1655 	m_new = vtnet_rx_alloc_buf(sc, 1, NULL);
1656 	if (m_new == NULL)
1657 		return (ENOBUFS);
1658 
1659 	error = vtnet_rxq_enqueue_buf(rxq, m_new);
1660 	if (error) {
1661 		sc->vtnet_stats.rx_enq_replacement_failed++;
1662 		m_freem(m_new);
1663 	} else
1664 		m->m_len = len;
1665 
1666 	return (error);
1667 }
1668 
1669 static int
1670 vtnet_rxq_enqueue_buf(struct vtnet_rxq *rxq, struct mbuf *m)
1671 {
1672 	struct vtnet_softc *sc;
1673 	struct sglist *sg;
1674 	int header_inlined, error;
1675 
1676 	sc = rxq->vtnrx_sc;
1677 	sg = rxq->vtnrx_sg;
1678 
1679 	KASSERT(m->m_next == NULL || sc->vtnet_flags & VTNET_FLAG_LRO_NOMRG,
1680 	    ("%s: mbuf chain without LRO_NOMRG", __func__));
1681 	VTNET_RXQ_LOCK_ASSERT(rxq);
1682 
1683 	sglist_reset(sg);
1684 	header_inlined = vtnet_modern(sc) ||
1685 	    (sc->vtnet_flags & VTNET_FLAG_MRG_RXBUFS) != 0; /* TODO: ANY_LAYOUT */
1686 
1687 	if (header_inlined)
1688 		error = sglist_append_mbuf(sg, m);
1689 	else {
1690 		struct vtnet_rx_header *rxhdr =
1691 		    mtod(m, struct vtnet_rx_header *);
1692 		MPASS(sc->vtnet_hdr_size == sizeof(struct virtio_net_hdr));
1693 
1694 		/* Append the header and remaining mbuf data. */
1695 		error = sglist_append(sg, &rxhdr->vrh_hdr, sc->vtnet_hdr_size);
1696 		if (error)
1697 			return (error);
1698 		error = sglist_append(sg, &rxhdr[1],
1699 		    m->m_len - sizeof(struct vtnet_rx_header));
1700 		if (error)
1701 			return (error);
1702 
1703 		if (m->m_next != NULL)
1704 			error = sglist_append_mbuf(sg, m->m_next);
1705 	}
1706 
1707 	if (error)
1708 		return (error);
1709 
1710 	return (virtqueue_enqueue(rxq->vtnrx_vq, m, sg, 0, sg->sg_nseg));
1711 }
1712 
1713 static int
1714 vtnet_rxq_new_buf(struct vtnet_rxq *rxq)
1715 {
1716 	struct vtnet_softc *sc;
1717 	struct mbuf *m;
1718 	int error;
1719 
1720 	sc = rxq->vtnrx_sc;
1721 
1722 	m = vtnet_rx_alloc_buf(sc, sc->vtnet_rx_nmbufs, NULL);
1723 	if (m == NULL)
1724 		return (ENOBUFS);
1725 
1726 	error = vtnet_rxq_enqueue_buf(rxq, m);
1727 	if (error)
1728 		m_freem(m);
1729 
1730 	return (error);
1731 }
1732 
1733 static int
1734 vtnet_rxq_csum_needs_csum(struct vtnet_rxq *rxq, struct mbuf *m, uint16_t etype,
1735     int hoff, struct virtio_net_hdr *hdr)
1736 {
1737 	struct vtnet_softc *sc;
1738 	int error;
1739 
1740 	sc = rxq->vtnrx_sc;
1741 
1742 	/*
1743 	 * NEEDS_CSUM corresponds to Linux's CHECKSUM_PARTIAL, but FreeBSD does
1744 	 * not have an analogous CSUM flag. The checksum has been validated,
1745 	 * but is incomplete (TCP/UDP pseudo header).
1746 	 *
1747 	 * The packet is likely from another VM on the same host that itself
1748 	 * performed checksum offloading so Tx/Rx is basically a memcpy and
1749 	 * the checksum has little value.
1750 	 *
1751 	 * Default to receiving the packet as-is for performance reasons, but
1752 	 * this can cause issues if the packet is to be forwarded because it
1753 	 * does not contain a valid checksum. This patch may be helpful:
1754 	 * https://reviews.freebsd.org/D6611. In the meantime, have the driver
1755 	 * compute the checksum if requested.
1756 	 *
1757 	 * BMV: Need to add an CSUM_PARTIAL flag?
1758 	 */
1759 	if ((sc->vtnet_flags & VTNET_FLAG_FIXUP_NEEDS_CSUM) == 0) {
1760 		error = vtnet_rxq_csum_data_valid(rxq, m, etype, hoff, hdr);
1761 		return (error);
1762 	}
1763 
1764 	/*
1765 	 * Compute the checksum in the driver so the packet will contain a
1766 	 * valid checksum. The checksum is at csum_offset from csum_start.
1767 	 */
1768 	switch (etype) {
1769 #if defined(INET) || defined(INET6)
1770 	case ETHERTYPE_IP:
1771 	case ETHERTYPE_IPV6: {
1772 		int csum_off, csum_end;
1773 		uint16_t csum;
1774 
1775 		csum_off = hdr->csum_start + hdr->csum_offset;
1776 		csum_end = csum_off + sizeof(uint16_t);
1777 
1778 		/* Assume checksum will be in the first mbuf. */
1779 		if (m->m_len < csum_end || m->m_pkthdr.len < csum_end)
1780 			return (1);
1781 
1782 		/*
1783 		 * Like in_delayed_cksum()/in6_delayed_cksum(), compute the
1784 		 * checksum and write it at the specified offset. We could
1785 		 * try to verify the packet: csum_start should probably
1786 		 * correspond to the start of the TCP/UDP header.
1787 		 *
1788 		 * BMV: Need to properly handle UDP with zero checksum. Is
1789 		 * the IPv4 header checksum implicitly validated?
1790 		 */
1791 		csum = in_cksum_skip(m, m->m_pkthdr.len, hdr->csum_start);
1792 		*(uint16_t *)(mtodo(m, csum_off)) = csum;
1793 		m->m_pkthdr.csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
1794 		m->m_pkthdr.csum_data = 0xFFFF;
1795 		break;
1796 	}
1797 #endif
1798 	default:
1799 		sc->vtnet_stats.rx_csum_bad_ethtype++;
1800 		return (1);
1801 	}
1802 
1803 	return (0);
1804 }
1805 
1806 static int
1807 vtnet_rxq_csum_data_valid(struct vtnet_rxq *rxq, struct mbuf *m,
1808     uint16_t etype, int hoff, struct virtio_net_hdr *hdr __unused)
1809 {
1810 #if 0
1811 	struct vtnet_softc *sc;
1812 #endif
1813 	int protocol;
1814 
1815 #if 0
1816 	sc = rxq->vtnrx_sc;
1817 #endif
1818 
1819 	switch (etype) {
1820 #if defined(INET)
1821 	case ETHERTYPE_IP:
1822 		if (__predict_false(m->m_len < hoff + sizeof(struct ip)))
1823 			protocol = IPPROTO_DONE;
1824 		else {
1825 			struct ip *ip = (struct ip *)(m->m_data + hoff);
1826 			protocol = ip->ip_p;
1827 		}
1828 		break;
1829 #endif
1830 #if defined(INET6)
1831 	case ETHERTYPE_IPV6:
1832 		if (__predict_false(m->m_len < hoff + sizeof(struct ip6_hdr))
1833 		    || ip6_lasthdr(m, hoff, IPPROTO_IPV6, &protocol) < 0)
1834 			protocol = IPPROTO_DONE;
1835 		break;
1836 #endif
1837 	default:
1838 		protocol = IPPROTO_DONE;
1839 		break;
1840 	}
1841 
1842 	switch (protocol) {
1843 	case IPPROTO_TCP:
1844 	case IPPROTO_UDP:
1845 		m->m_pkthdr.csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
1846 		m->m_pkthdr.csum_data = 0xFFFF;
1847 		break;
1848 	default:
1849 		/*
1850 		 * FreeBSD does not support checksum offloading of this
1851 		 * protocol. Let the stack re-verify the checksum later
1852 		 * if the protocol is supported.
1853 		 */
1854 #if 0
1855 		if_printf(sc->vtnet_ifp,
1856 		    "%s: checksum offload of unsupported protocol "
1857 		    "etype=%#x protocol=%d csum_start=%d csum_offset=%d\n",
1858 		    __func__, etype, protocol, hdr->csum_start,
1859 		    hdr->csum_offset);
1860 #endif
1861 		break;
1862 	}
1863 
1864 	return (0);
1865 }
1866 
1867 static int
1868 vtnet_rxq_csum(struct vtnet_rxq *rxq, struct mbuf *m,
1869     struct virtio_net_hdr *hdr)
1870 {
1871 	const struct ether_header *eh;
1872 	int hoff;
1873 	uint16_t etype;
1874 
1875 	eh = mtod(m, const struct ether_header *);
1876 	etype = ntohs(eh->ether_type);
1877 	if (etype == ETHERTYPE_VLAN) {
1878 		/* TODO BMV: Handle QinQ. */
1879 		const struct ether_vlan_header *evh =
1880 		    mtod(m, const struct ether_vlan_header *);
1881 		etype = ntohs(evh->evl_proto);
1882 		hoff = sizeof(struct ether_vlan_header);
1883 	} else
1884 		hoff = sizeof(struct ether_header);
1885 
1886 	if (hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM)
1887 		return (vtnet_rxq_csum_needs_csum(rxq, m, etype, hoff, hdr));
1888 	else /* VIRTIO_NET_HDR_F_DATA_VALID */
1889 		return (vtnet_rxq_csum_data_valid(rxq, m, etype, hoff, hdr));
1890 }
1891 
1892 static void
1893 vtnet_rxq_discard_merged_bufs(struct vtnet_rxq *rxq, int nbufs)
1894 {
1895 	struct mbuf *m;
1896 
1897 	while (--nbufs > 0) {
1898 		m = virtqueue_dequeue(rxq->vtnrx_vq, NULL);
1899 		if (m == NULL)
1900 			break;
1901 		vtnet_rxq_discard_buf(rxq, m);
1902 	}
1903 }
1904 
1905 static void
1906 vtnet_rxq_discard_buf(struct vtnet_rxq *rxq, struct mbuf *m)
1907 {
1908 	int error __diagused;
1909 
1910 	/*
1911 	 * Requeue the discarded mbuf. This should always be successful
1912 	 * since it was just dequeued.
1913 	 */
1914 	error = vtnet_rxq_enqueue_buf(rxq, m);
1915 	KASSERT(error == 0,
1916 	    ("%s: cannot requeue discarded mbuf %d", __func__, error));
1917 }
1918 
1919 static int
1920 vtnet_rxq_merged_eof(struct vtnet_rxq *rxq, struct mbuf *m_head, int nbufs)
1921 {
1922 	struct vtnet_softc *sc;
1923 	struct virtqueue *vq;
1924 	struct mbuf *m_tail;
1925 
1926 	sc = rxq->vtnrx_sc;
1927 	vq = rxq->vtnrx_vq;
1928 	m_tail = m_head;
1929 
1930 	while (--nbufs > 0) {
1931 		struct mbuf *m;
1932 		uint32_t len;
1933 
1934 		m = virtqueue_dequeue(vq, &len);
1935 		if (m == NULL) {
1936 			rxq->vtnrx_stats.vrxs_ierrors++;
1937 			goto fail;
1938 		}
1939 
1940 		if (vtnet_rxq_new_buf(rxq) != 0) {
1941 			rxq->vtnrx_stats.vrxs_iqdrops++;
1942 			vtnet_rxq_discard_buf(rxq, m);
1943 			if (nbufs > 1)
1944 				vtnet_rxq_discard_merged_bufs(rxq, nbufs);
1945 			goto fail;
1946 		}
1947 
1948 		if (m->m_len < len)
1949 			len = m->m_len;
1950 
1951 		m->m_len = len;
1952 		m->m_flags &= ~M_PKTHDR;
1953 
1954 		m_head->m_pkthdr.len += len;
1955 		m_tail->m_next = m;
1956 		m_tail = m;
1957 	}
1958 
1959 	return (0);
1960 
1961 fail:
1962 	sc->vtnet_stats.rx_mergeable_failed++;
1963 	m_freem(m_head);
1964 
1965 	return (1);
1966 }
1967 
#if defined(INET) || defined(INET6)
/*
 * Hand a received mbuf to software LRO.  When the LRO context has an mbuf
 * queue (lro_mbuf_max != 0) the mbuf is queued and 0 is returned; otherwise
 * the result of tcp_lro_rx() is returned.
 */
static int
vtnet_lro_rx(struct vtnet_rxq *rxq, struct mbuf *m)
{
	struct lro_ctrl *lc;

	lc = &rxq->vtnrx_lro;

	if (lc->lro_mbuf_max == 0)
		return (tcp_lro_rx(lc, m, 0));

	tcp_lro_queue_mbuf(lc, m);
	return (0);
}
#endif
1984 
1985 static void
1986 vtnet_rxq_input(struct vtnet_rxq *rxq, struct mbuf *m,
1987     struct virtio_net_hdr *hdr)
1988 {
1989 	struct vtnet_softc *sc;
1990 	if_t ifp;
1991 
1992 	sc = rxq->vtnrx_sc;
1993 	ifp = sc->vtnet_ifp;
1994 
1995 	if (if_getcapenable(ifp) & IFCAP_VLAN_HWTAGGING) {
1996 		struct ether_header *eh = mtod(m, struct ether_header *);
1997 		if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
1998 			vtnet_vlan_tag_remove(m);
1999 			/*
2000 			 * With the 802.1Q header removed, update the
2001 			 * checksum starting location accordingly.
2002 			 */
2003 			if (hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM)
2004 				hdr->csum_start -= ETHER_VLAN_ENCAP_LEN;
2005 		}
2006 	}
2007 
2008 	m->m_pkthdr.flowid = rxq->vtnrx_id;
2009 	M_HASHTYPE_SET(m, M_HASHTYPE_OPAQUE);
2010 
2011 	if (hdr->flags &
2012 	    (VIRTIO_NET_HDR_F_NEEDS_CSUM | VIRTIO_NET_HDR_F_DATA_VALID)) {
2013 		if (vtnet_rxq_csum(rxq, m, hdr) == 0)
2014 			rxq->vtnrx_stats.vrxs_csum++;
2015 		else
2016 			rxq->vtnrx_stats.vrxs_csum_failed++;
2017 	}
2018 
2019 	if (hdr->gso_size != 0) {
2020 		switch (hdr->gso_type & ~VIRTIO_NET_HDR_GSO_ECN) {
2021 		case VIRTIO_NET_HDR_GSO_TCPV4:
2022 		case VIRTIO_NET_HDR_GSO_TCPV6:
2023 			m->m_pkthdr.lro_nsegs =
2024 			    howmany(m->m_pkthdr.len, hdr->gso_size);
2025 			rxq->vtnrx_stats.vrxs_host_lro++;
2026 			break;
2027 		}
2028 	}
2029 
2030 	rxq->vtnrx_stats.vrxs_ipackets++;
2031 	rxq->vtnrx_stats.vrxs_ibytes += m->m_pkthdr.len;
2032 
2033 #if defined(INET) || defined(INET6)
2034 	if (vtnet_software_lro(sc) && if_getcapenable(ifp) & IFCAP_LRO) {
2035 		if (vtnet_lro_rx(rxq, m) == 0)
2036 			return;
2037 	}
2038 #endif
2039 
2040 	if_input(ifp, m);
2041 }
2042 
2043 static int
2044 vtnet_rxq_eof(struct vtnet_rxq *rxq)
2045 {
2046 	struct virtio_net_hdr lhdr, *hdr;
2047 	struct vtnet_softc *sc;
2048 	if_t ifp;
2049 	struct virtqueue *vq;
2050 	int deq, count;
2051 
2052 	sc = rxq->vtnrx_sc;
2053 	vq = rxq->vtnrx_vq;
2054 	ifp = sc->vtnet_ifp;
2055 	deq = 0;
2056 	count = sc->vtnet_rx_process_limit;
2057 
2058 	VTNET_RXQ_LOCK_ASSERT(rxq);
2059 
2060 	while (count-- > 0) {
2061 		struct mbuf *m;
2062 		uint32_t len, nbufs, adjsz;
2063 
2064 		m = virtqueue_dequeue(vq, &len);
2065 		if (m == NULL)
2066 			break;
2067 		deq++;
2068 
2069 		if (len < sc->vtnet_hdr_size + ETHER_HDR_LEN) {
2070 			rxq->vtnrx_stats.vrxs_ierrors++;
2071 			vtnet_rxq_discard_buf(rxq, m);
2072 			continue;
2073 		}
2074 
2075 		if (sc->vtnet_flags & VTNET_FLAG_MRG_RXBUFS) {
2076 			struct virtio_net_hdr_mrg_rxbuf *mhdr =
2077 			    mtod(m, struct virtio_net_hdr_mrg_rxbuf *);
2078 			kmsan_mark(mhdr, sizeof(*mhdr), KMSAN_STATE_INITED);
2079 			nbufs = vtnet_htog16(sc, mhdr->num_buffers);
2080 			adjsz = sizeof(struct virtio_net_hdr_mrg_rxbuf);
2081 		} else if (vtnet_modern(sc)) {
2082 			nbufs = 1; /* num_buffers is always 1 */
2083 			adjsz = sizeof(struct virtio_net_hdr_v1);
2084 		} else {
2085 			nbufs = 1;
2086 			adjsz = sizeof(struct vtnet_rx_header);
2087 			/*
2088 			 * Account for our gap between the header and start of
2089 			 * data to keep the segments separated.
2090 			 */
2091 			len += VTNET_RX_HEADER_PAD;
2092 		}
2093 
2094 		if (vtnet_rxq_replace_buf(rxq, m, len) != 0) {
2095 			rxq->vtnrx_stats.vrxs_iqdrops++;
2096 			vtnet_rxq_discard_buf(rxq, m);
2097 			if (nbufs > 1)
2098 				vtnet_rxq_discard_merged_bufs(rxq, nbufs);
2099 			continue;
2100 		}
2101 
2102 		m->m_pkthdr.len = len;
2103 		m->m_pkthdr.rcvif = ifp;
2104 		m->m_pkthdr.csum_flags = 0;
2105 
2106 		if (nbufs > 1) {
2107 			/* Dequeue the rest of chain. */
2108 			if (vtnet_rxq_merged_eof(rxq, m, nbufs) != 0)
2109 				continue;
2110 		}
2111 
2112 		kmsan_mark_mbuf(m, KMSAN_STATE_INITED);
2113 
2114 		/*
2115 		 * Save an endian swapped version of the header prior to it
2116 		 * being stripped. The header is always at the start of the
2117 		 * mbuf data. num_buffers was already saved (and not needed)
2118 		 * so use the standard header.
2119 		 */
2120 		hdr = mtod(m, struct virtio_net_hdr *);
2121 		lhdr.flags = hdr->flags;
2122 		lhdr.gso_type = hdr->gso_type;
2123 		lhdr.hdr_len = vtnet_htog16(sc, hdr->hdr_len);
2124 		lhdr.gso_size = vtnet_htog16(sc, hdr->gso_size);
2125 		lhdr.csum_start = vtnet_htog16(sc, hdr->csum_start);
2126 		lhdr.csum_offset = vtnet_htog16(sc, hdr->csum_offset);
2127 		m_adj(m, adjsz);
2128 
2129 		if (PFIL_HOOKED_IN(sc->vtnet_pfil)) {
2130 			pfil_return_t pfil;
2131 
2132 			pfil = pfil_mbuf_in(sc->vtnet_pfil, &m, ifp, NULL);
2133 			switch (pfil) {
2134 			case PFIL_DROPPED:
2135 			case PFIL_CONSUMED:
2136 				continue;
2137 			default:
2138 				KASSERT(pfil == PFIL_PASS,
2139 				    ("Filter returned %d!", pfil));
2140 			}
2141 		}
2142 
2143 		vtnet_rxq_input(rxq, m, &lhdr);
2144 	}
2145 
2146 	if (deq > 0) {
2147 #if defined(INET) || defined(INET6)
2148 		if (vtnet_software_lro(sc))
2149 			tcp_lro_flush_all(&rxq->vtnrx_lro);
2150 #endif
2151 		virtqueue_notify(vq);
2152 	}
2153 
2154 	return (count > 0 ? 0 : EAGAIN);
2155 }
2156 
2157 static void
2158 vtnet_rx_vq_process(struct vtnet_rxq *rxq, int tries)
2159 {
2160 	struct vtnet_softc *sc;
2161 	if_t ifp;
2162 	u_int more;
2163 #ifdef DEV_NETMAP
2164 	int nmirq;
2165 #endif /* DEV_NETMAP */
2166 
2167 	sc = rxq->vtnrx_sc;
2168 	ifp = sc->vtnet_ifp;
2169 
2170 	if (__predict_false(rxq->vtnrx_id >= sc->vtnet_act_vq_pairs)) {
2171 		/*
2172 		 * Ignore this interrupt. Either this is a spurious interrupt
2173 		 * or multiqueue without per-VQ MSIX so every queue needs to
2174 		 * be polled (a brain dead configuration we could try harder
2175 		 * to avoid).
2176 		 */
2177 		vtnet_rxq_disable_intr(rxq);
2178 		return;
2179 	}
2180 
2181 	VTNET_RXQ_LOCK(rxq);
2182 
2183 #ifdef DEV_NETMAP
2184 	/*
2185 	 * We call netmap_rx_irq() under lock to prevent concurrent calls.
2186 	 * This is not necessary to serialize the access to the RX vq, but
2187 	 * rather to avoid races that may happen if this interface is
2188 	 * attached to a VALE switch, which would cause received packets
2189 	 * to stall in the RX queue (nm_kr_tryget() could find the kring
2190 	 * busy when called from netmap_bwrap_intr_notify()).
2191 	 */
2192 	nmirq = netmap_rx_irq(ifp, rxq->vtnrx_id, &more);
2193 	if (nmirq != NM_IRQ_PASS) {
2194 		VTNET_RXQ_UNLOCK(rxq);
2195 		if (nmirq == NM_IRQ_RESCHED) {
2196 			taskqueue_enqueue(rxq->vtnrx_tq, &rxq->vtnrx_intrtask);
2197 		}
2198 		return;
2199 	}
2200 #endif /* DEV_NETMAP */
2201 
2202 again:
2203 	if ((if_getdrvflags(ifp) & IFF_DRV_RUNNING) == 0) {
2204 		VTNET_RXQ_UNLOCK(rxq);
2205 		return;
2206 	}
2207 
2208 	more = vtnet_rxq_eof(rxq);
2209 	if (more || vtnet_rxq_enable_intr(rxq) != 0) {
2210 		if (!more)
2211 			vtnet_rxq_disable_intr(rxq);
2212 		/*
2213 		 * This is an occasional condition or race (when !more),
2214 		 * so retry a few times before scheduling the taskqueue.
2215 		 */
2216 		if (tries-- > 0)
2217 			goto again;
2218 
2219 		rxq->vtnrx_stats.vrxs_rescheduled++;
2220 		VTNET_RXQ_UNLOCK(rxq);
2221 		taskqueue_enqueue(rxq->vtnrx_tq, &rxq->vtnrx_intrtask);
2222 	} else
2223 		VTNET_RXQ_UNLOCK(rxq);
2224 }
2225 
2226 static void
2227 vtnet_rx_vq_intr(void *xrxq)
2228 {
2229 	struct vtnet_rxq *rxq;
2230 
2231 	rxq = xrxq;
2232 	vtnet_rx_vq_process(rxq, VTNET_INTR_DISABLE_RETRIES);
2233 }
2234 
2235 static void
2236 vtnet_rxq_tq_intr(void *xrxq, int pending __unused)
2237 {
2238 	struct vtnet_rxq *rxq;
2239 
2240 	rxq = xrxq;
2241 	vtnet_rx_vq_process(rxq, 0);
2242 }
2243 
2244 static int
2245 vtnet_txq_intr_threshold(struct vtnet_txq *txq)
2246 {
2247 	struct vtnet_softc *sc;
2248 	int threshold;
2249 
2250 	sc = txq->vtntx_sc;
2251 
2252 	/*
2253 	 * The Tx interrupt is disabled until the queue free count falls
2254 	 * below our threshold. Completed frames are drained from the Tx
2255 	 * virtqueue before transmitting new frames and in the watchdog
2256 	 * callout, so the frequency of Tx interrupts is greatly reduced,
2257 	 * at the cost of not freeing mbufs as quickly as they otherwise
2258 	 * would be.
2259 	 */
2260 	threshold = virtqueue_size(txq->vtntx_vq) / 4;
2261 
2262 	/*
2263 	 * Without indirect descriptors, leave enough room for the most
2264 	 * segments we handle.
2265 	 */
2266 	if ((sc->vtnet_flags & VTNET_FLAG_INDIRECT) == 0 &&
2267 	    threshold < sc->vtnet_tx_nsegs)
2268 		threshold = sc->vtnet_tx_nsegs;
2269 
2270 	return (threshold);
2271 }
2272 
2273 static int
2274 vtnet_txq_below_threshold(struct vtnet_txq *txq)
2275 {
2276 	struct virtqueue *vq;
2277 
2278 	vq = txq->vtntx_vq;
2279 
2280 	return (virtqueue_nfree(vq) <= txq->vtntx_intr_threshold);
2281 }
2282 
2283 static int
2284 vtnet_txq_notify(struct vtnet_txq *txq)
2285 {
2286 	struct virtqueue *vq;
2287 
2288 	vq = txq->vtntx_vq;
2289 
2290 	txq->vtntx_watchdog = VTNET_TX_TIMEOUT;
2291 	virtqueue_notify(vq);
2292 
2293 	if (vtnet_txq_enable_intr(txq) == 0)
2294 		return (0);
2295 
2296 	/*
2297 	 * Drain frames that were completed since last checked. If this
2298 	 * causes the queue to go above the threshold, the caller should
2299 	 * continue transmitting.
2300 	 */
2301 	if (vtnet_txq_eof(txq) != 0 && vtnet_txq_below_threshold(txq) == 0) {
2302 		virtqueue_disable_intr(vq);
2303 		return (1);
2304 	}
2305 
2306 	return (0);
2307 }
2308 
2309 static void
2310 vtnet_txq_free_mbufs(struct vtnet_txq *txq)
2311 {
2312 	struct virtqueue *vq;
2313 	struct vtnet_tx_header *txhdr;
2314 	int last;
2315 #ifdef DEV_NETMAP
2316 	struct netmap_kring *kring = netmap_kring_on(NA(txq->vtntx_sc->vtnet_ifp),
2317 							txq->vtntx_id, NR_TX);
2318 #else  /* !DEV_NETMAP */
2319 	void *kring = NULL;
2320 #endif /* !DEV_NETMAP */
2321 
2322 	vq = txq->vtntx_vq;
2323 	last = 0;
2324 
2325 	while ((txhdr = virtqueue_drain(vq, &last)) != NULL) {
2326 		if (kring == NULL) {
2327 			m_freem(txhdr->vth_mbuf);
2328 			uma_zfree(vtnet_tx_header_zone, txhdr);
2329 		}
2330 	}
2331 
2332 	KASSERT(virtqueue_empty(vq),
2333 	    ("%s: mbufs remaining in tx queue %p", __func__, txq));
2334 }
2335 
2336 /*
2337  * BMV: This can go away once we finally have offsets in the mbuf header.
2338  */
2339 static int
2340 vtnet_txq_offload_ctx(struct vtnet_txq *txq, struct mbuf *m, int *etype,
2341     int *proto, int *start)
2342 {
2343 	struct vtnet_softc *sc;
2344 	struct ether_vlan_header *evh;
2345 #if defined(INET) || defined(INET6)
2346 	int offset;
2347 #endif
2348 
2349 	sc = txq->vtntx_sc;
2350 
2351 	evh = mtod(m, struct ether_vlan_header *);
2352 	if (evh->evl_encap_proto == htons(ETHERTYPE_VLAN)) {
2353 		/* BMV: We should handle nested VLAN tags too. */
2354 		*etype = ntohs(evh->evl_proto);
2355 #if defined(INET) || defined(INET6)
2356 		offset = sizeof(struct ether_vlan_header);
2357 #endif
2358 	} else {
2359 		*etype = ntohs(evh->evl_encap_proto);
2360 #if defined(INET) || defined(INET6)
2361 		offset = sizeof(struct ether_header);
2362 #endif
2363 	}
2364 
2365 	switch (*etype) {
2366 #if defined(INET)
2367 	case ETHERTYPE_IP: {
2368 		struct ip *ip, iphdr;
2369 		if (__predict_false(m->m_len < offset + sizeof(struct ip))) {
2370 			m_copydata(m, offset, sizeof(struct ip),
2371 			    (caddr_t) &iphdr);
2372 			ip = &iphdr;
2373 		} else
2374 			ip = (struct ip *)(m->m_data + offset);
2375 		*proto = ip->ip_p;
2376 		*start = offset + (ip->ip_hl << 2);
2377 		break;
2378 	}
2379 #endif
2380 #if defined(INET6)
2381 	case ETHERTYPE_IPV6:
2382 		*proto = -1;
2383 		*start = ip6_lasthdr(m, offset, IPPROTO_IPV6, proto);
2384 		/* Assert the network stack sent us a valid packet. */
2385 		KASSERT(*start > offset,
2386 		    ("%s: mbuf %p start %d offset %d proto %d", __func__, m,
2387 		    *start, offset, *proto));
2388 		break;
2389 #endif
2390 	default:
2391 		sc->vtnet_stats.tx_csum_unknown_ethtype++;
2392 		return (EINVAL);
2393 	}
2394 
2395 	return (0);
2396 }
2397 
2398 static int
2399 vtnet_txq_offload_tso(struct vtnet_txq *txq, struct mbuf *m, int eth_type,
2400     int offset, struct virtio_net_hdr *hdr)
2401 {
2402 	static struct timeval lastecn;
2403 	static int curecn;
2404 	struct vtnet_softc *sc;
2405 	struct tcphdr *tcp, tcphdr;
2406 
2407 	sc = txq->vtntx_sc;
2408 
2409 	if (__predict_false(m->m_len < offset + sizeof(struct tcphdr))) {
2410 		m_copydata(m, offset, sizeof(struct tcphdr), (caddr_t) &tcphdr);
2411 		tcp = &tcphdr;
2412 	} else
2413 		tcp = (struct tcphdr *)(m->m_data + offset);
2414 
2415 	hdr->hdr_len = vtnet_gtoh16(sc, offset + (tcp->th_off << 2));
2416 	hdr->gso_size = vtnet_gtoh16(sc, m->m_pkthdr.tso_segsz);
2417 	hdr->gso_type = eth_type == ETHERTYPE_IP ? VIRTIO_NET_HDR_GSO_TCPV4 :
2418 	    VIRTIO_NET_HDR_GSO_TCPV6;
2419 
2420 	if (__predict_false(tcp->th_flags & TH_CWR)) {
2421 		/*
2422 		 * Drop if VIRTIO_NET_F_HOST_ECN was not negotiated. In
2423 		 * FreeBSD, ECN support is not on a per-interface basis,
2424 		 * but globally via the net.inet.tcp.ecn.enable sysctl
2425 		 * knob. The default is off.
2426 		 */
2427 		if ((sc->vtnet_flags & VTNET_FLAG_TSO_ECN) == 0) {
2428 			if (ppsratecheck(&lastecn, &curecn, 1))
2429 				if_printf(sc->vtnet_ifp,
2430 				    "TSO with ECN not negotiated with host\n");
2431 			return (ENOTSUP);
2432 		}
2433 		hdr->gso_type |= VIRTIO_NET_HDR_GSO_ECN;
2434 	}
2435 
2436 	txq->vtntx_stats.vtxs_tso++;
2437 
2438 	return (0);
2439 }
2440 
2441 static struct mbuf *
2442 vtnet_txq_offload(struct vtnet_txq *txq, struct mbuf *m,
2443     struct virtio_net_hdr *hdr)
2444 {
2445 	struct vtnet_softc *sc;
2446 	int flags, etype, csum_start, proto, error;
2447 
2448 	sc = txq->vtntx_sc;
2449 	flags = m->m_pkthdr.csum_flags;
2450 
2451 	error = vtnet_txq_offload_ctx(txq, m, &etype, &proto, &csum_start);
2452 	if (error)
2453 		goto drop;
2454 
2455 	if (flags & (VTNET_CSUM_OFFLOAD | VTNET_CSUM_OFFLOAD_IPV6)) {
2456 		/* Sanity check the parsed mbuf matches the offload flags. */
2457 		if (__predict_false((flags & VTNET_CSUM_OFFLOAD &&
2458 		    etype != ETHERTYPE_IP) || (flags & VTNET_CSUM_OFFLOAD_IPV6
2459 		    && etype != ETHERTYPE_IPV6))) {
2460 			sc->vtnet_stats.tx_csum_proto_mismatch++;
2461 			goto drop;
2462 		}
2463 
2464 		hdr->flags |= VIRTIO_NET_HDR_F_NEEDS_CSUM;
2465 		hdr->csum_start = vtnet_gtoh16(sc, csum_start);
2466 		hdr->csum_offset = vtnet_gtoh16(sc, m->m_pkthdr.csum_data);
2467 		txq->vtntx_stats.vtxs_csum++;
2468 	}
2469 
2470 	if (flags & (CSUM_IP_TSO | CSUM_IP6_TSO)) {
2471 		/*
2472 		 * Sanity check the parsed mbuf IP protocol is TCP, and
2473 		 * VirtIO TSO reqires the checksum offloading above.
2474 		 */
2475 		if (__predict_false(proto != IPPROTO_TCP)) {
2476 			sc->vtnet_stats.tx_tso_not_tcp++;
2477 			goto drop;
2478 		} else if (__predict_false((hdr->flags &
2479 		    VIRTIO_NET_HDR_F_NEEDS_CSUM) == 0)) {
2480 			sc->vtnet_stats.tx_tso_without_csum++;
2481 			goto drop;
2482 		}
2483 
2484 		error = vtnet_txq_offload_tso(txq, m, etype, csum_start, hdr);
2485 		if (error)
2486 			goto drop;
2487 	}
2488 
2489 	return (m);
2490 
2491 drop:
2492 	m_freem(m);
2493 	return (NULL);
2494 }
2495 
2496 static int
2497 vtnet_txq_enqueue_buf(struct vtnet_txq *txq, struct mbuf **m_head,
2498     struct vtnet_tx_header *txhdr)
2499 {
2500 	struct vtnet_softc *sc;
2501 	struct virtqueue *vq;
2502 	struct sglist *sg;
2503 	struct mbuf *m;
2504 	int error;
2505 
2506 	sc = txq->vtntx_sc;
2507 	vq = txq->vtntx_vq;
2508 	sg = txq->vtntx_sg;
2509 	m = *m_head;
2510 
2511 	sglist_reset(sg);
2512 	error = sglist_append(sg, &txhdr->vth_uhdr, sc->vtnet_hdr_size);
2513 	if (error != 0 || sg->sg_nseg != 1) {
2514 		KASSERT(0, ("%s: cannot add header to sglist error %d nseg %d",
2515 		    __func__, error, sg->sg_nseg));
2516 		goto fail;
2517 	}
2518 
2519 	error = sglist_append_mbuf(sg, m);
2520 	if (error) {
2521 		m = m_defrag(m, M_NOWAIT);
2522 		if (m == NULL)
2523 			goto fail;
2524 
2525 		*m_head = m;
2526 		sc->vtnet_stats.tx_defragged++;
2527 
2528 		error = sglist_append_mbuf(sg, m);
2529 		if (error)
2530 			goto fail;
2531 	}
2532 
2533 	txhdr->vth_mbuf = m;
2534 	error = virtqueue_enqueue(vq, txhdr, sg, sg->sg_nseg, 0);
2535 
2536 	return (error);
2537 
2538 fail:
2539 	sc->vtnet_stats.tx_defrag_failed++;
2540 	m_freem(*m_head);
2541 	*m_head = NULL;
2542 
2543 	return (ENOBUFS);
2544 }
2545 
2546 static int
2547 vtnet_txq_encap(struct vtnet_txq *txq, struct mbuf **m_head, int flags)
2548 {
2549 	struct vtnet_tx_header *txhdr;
2550 	struct virtio_net_hdr *hdr;
2551 	struct mbuf *m;
2552 	int error;
2553 
2554 	m = *m_head;
2555 	M_ASSERTPKTHDR(m);
2556 
2557 	txhdr = uma_zalloc(vtnet_tx_header_zone, flags | M_ZERO);
2558 	if (txhdr == NULL) {
2559 		m_freem(m);
2560 		*m_head = NULL;
2561 		return (ENOMEM);
2562 	}
2563 
2564 	/*
2565 	 * Always use the non-mergeable header, regardless if mergable headers
2566 	 * were negotiated, because for transmit num_buffers is always zero.
2567 	 * The vtnet_hdr_size is used to enqueue the right header size segment.
2568 	 */
2569 	hdr = &txhdr->vth_uhdr.hdr;
2570 
2571 	if (m->m_flags & M_VLANTAG) {
2572 		m = ether_vlanencap(m, m->m_pkthdr.ether_vtag);
2573 		if ((*m_head = m) == NULL) {
2574 			error = ENOBUFS;
2575 			goto fail;
2576 		}
2577 		m->m_flags &= ~M_VLANTAG;
2578 	}
2579 
2580 	if (m->m_pkthdr.csum_flags & VTNET_CSUM_ALL_OFFLOAD) {
2581 		m = vtnet_txq_offload(txq, m, hdr);
2582 		if ((*m_head = m) == NULL) {
2583 			error = ENOBUFS;
2584 			goto fail;
2585 		}
2586 	}
2587 
2588 	error = vtnet_txq_enqueue_buf(txq, m_head, txhdr);
2589 fail:
2590 	if (error)
2591 		uma_zfree(vtnet_tx_header_zone, txhdr);
2592 
2593 	return (error);
2594 }
2595 
2596 #ifdef VTNET_LEGACY_TX
2597 
2598 static void
2599 vtnet_start_locked(struct vtnet_txq *txq, if_t ifp)
2600 {
2601 	struct vtnet_softc *sc;
2602 	struct virtqueue *vq;
2603 	struct mbuf *m0;
2604 	int tries, enq;
2605 
2606 	sc = txq->vtntx_sc;
2607 	vq = txq->vtntx_vq;
2608 	tries = 0;
2609 
2610 	VTNET_TXQ_LOCK_ASSERT(txq);
2611 
2612 	if ((if_getdrvflags(ifp) & IFF_DRV_RUNNING) == 0 ||
2613 	    sc->vtnet_link_active == 0)
2614 		return;
2615 
2616 	vtnet_txq_eof(txq);
2617 
2618 again:
2619 	enq = 0;
2620 
2621 	while (!if_sendq_empty(ifp)) {
2622 		if (virtqueue_full(vq))
2623 			break;
2624 
2625 		m0 = if_dequeue(ifp);
2626 		if (m0 == NULL)
2627 			break;
2628 
2629 		if (vtnet_txq_encap(txq, &m0, M_NOWAIT) != 0) {
2630 			if (m0 != NULL)
2631 				if_sendq_prepend(ifp, m0);
2632 			break;
2633 		}
2634 
2635 		enq++;
2636 		ETHER_BPF_MTAP(ifp, m0);
2637 	}
2638 
2639 	if (enq > 0 && vtnet_txq_notify(txq) != 0) {
2640 		if (tries++ < VTNET_NOTIFY_RETRIES)
2641 			goto again;
2642 
2643 		txq->vtntx_stats.vtxs_rescheduled++;
2644 		taskqueue_enqueue(txq->vtntx_tq, &txq->vtntx_intrtask);
2645 	}
2646 }
2647 
2648 static void
2649 vtnet_start(if_t ifp)
2650 {
2651 	struct vtnet_softc *sc;
2652 	struct vtnet_txq *txq;
2653 
2654 	sc = if_getsoftc(ifp);
2655 	txq = &sc->vtnet_txqs[0];
2656 
2657 	VTNET_TXQ_LOCK(txq);
2658 	vtnet_start_locked(txq, ifp);
2659 	VTNET_TXQ_UNLOCK(txq);
2660 }
2661 
2662 #else /* !VTNET_LEGACY_TX */
2663 
2664 static int
2665 vtnet_txq_mq_start_locked(struct vtnet_txq *txq, struct mbuf *m)
2666 {
2667 	struct vtnet_softc *sc;
2668 	struct virtqueue *vq;
2669 	struct buf_ring *br;
2670 	if_t ifp;
2671 	int enq, tries, error;
2672 
2673 	sc = txq->vtntx_sc;
2674 	vq = txq->vtntx_vq;
2675 	br = txq->vtntx_br;
2676 	ifp = sc->vtnet_ifp;
2677 	tries = 0;
2678 	error = 0;
2679 
2680 	VTNET_TXQ_LOCK_ASSERT(txq);
2681 
2682 	if ((if_getdrvflags(ifp) & IFF_DRV_RUNNING) == 0 ||
2683 	    sc->vtnet_link_active == 0) {
2684 		if (m != NULL)
2685 			error = drbr_enqueue(ifp, br, m);
2686 		return (error);
2687 	}
2688 
2689 	if (m != NULL) {
2690 		error = drbr_enqueue(ifp, br, m);
2691 		if (error)
2692 			return (error);
2693 	}
2694 
2695 	vtnet_txq_eof(txq);
2696 
2697 again:
2698 	enq = 0;
2699 
2700 	while ((m = drbr_peek(ifp, br)) != NULL) {
2701 		if (virtqueue_full(vq)) {
2702 			drbr_putback(ifp, br, m);
2703 			break;
2704 		}
2705 
2706 		if (vtnet_txq_encap(txq, &m, M_NOWAIT) != 0) {
2707 			if (m != NULL)
2708 				drbr_putback(ifp, br, m);
2709 			else
2710 				drbr_advance(ifp, br);
2711 			break;
2712 		}
2713 		drbr_advance(ifp, br);
2714 
2715 		enq++;
2716 		ETHER_BPF_MTAP(ifp, m);
2717 	}
2718 
2719 	if (enq > 0 && vtnet_txq_notify(txq) != 0) {
2720 		if (tries++ < VTNET_NOTIFY_RETRIES)
2721 			goto again;
2722 
2723 		txq->vtntx_stats.vtxs_rescheduled++;
2724 		taskqueue_enqueue(txq->vtntx_tq, &txq->vtntx_intrtask);
2725 	}
2726 
2727 	return (0);
2728 }
2729 
2730 static int
2731 vtnet_txq_mq_start(if_t ifp, struct mbuf *m)
2732 {
2733 	struct vtnet_softc *sc;
2734 	struct vtnet_txq *txq;
2735 	int i, npairs, error;
2736 
2737 	sc = if_getsoftc(ifp);
2738 	npairs = sc->vtnet_act_vq_pairs;
2739 
2740 	if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE)
2741 		i = m->m_pkthdr.flowid % npairs;
2742 	else
2743 		i = curcpu % npairs;
2744 
2745 	txq = &sc->vtnet_txqs[i];
2746 
2747 	if (VTNET_TXQ_TRYLOCK(txq) != 0) {
2748 		error = vtnet_txq_mq_start_locked(txq, m);
2749 		VTNET_TXQ_UNLOCK(txq);
2750 	} else {
2751 		error = drbr_enqueue(ifp, txq->vtntx_br, m);
2752 		taskqueue_enqueue(txq->vtntx_tq, &txq->vtntx_defrtask);
2753 	}
2754 
2755 	return (error);
2756 }
2757 
2758 static void
2759 vtnet_txq_tq_deferred(void *xtxq, int pending __unused)
2760 {
2761 	struct vtnet_softc *sc;
2762 	struct vtnet_txq *txq;
2763 
2764 	txq = xtxq;
2765 	sc = txq->vtntx_sc;
2766 
2767 	VTNET_TXQ_LOCK(txq);
2768 	if (!drbr_empty(sc->vtnet_ifp, txq->vtntx_br))
2769 		vtnet_txq_mq_start_locked(txq, NULL);
2770 	VTNET_TXQ_UNLOCK(txq);
2771 }
2772 
2773 #endif /* VTNET_LEGACY_TX */
2774 
2775 static void
2776 vtnet_txq_start(struct vtnet_txq *txq)
2777 {
2778 	struct vtnet_softc *sc;
2779 	if_t ifp;
2780 
2781 	sc = txq->vtntx_sc;
2782 	ifp = sc->vtnet_ifp;
2783 
2784 #ifdef VTNET_LEGACY_TX
2785 	if (!if_sendq_empty(ifp))
2786 		vtnet_start_locked(txq, ifp);
2787 #else
2788 	if (!drbr_empty(ifp, txq->vtntx_br))
2789 		vtnet_txq_mq_start_locked(txq, NULL);
2790 #endif
2791 }
2792 
2793 static void
2794 vtnet_txq_tq_intr(void *xtxq, int pending __unused)
2795 {
2796 	struct vtnet_softc *sc;
2797 	struct vtnet_txq *txq;
2798 	if_t ifp;
2799 
2800 	txq = xtxq;
2801 	sc = txq->vtntx_sc;
2802 	ifp = sc->vtnet_ifp;
2803 
2804 	VTNET_TXQ_LOCK(txq);
2805 
2806 	if ((if_getdrvflags(ifp) & IFF_DRV_RUNNING) == 0) {
2807 		VTNET_TXQ_UNLOCK(txq);
2808 		return;
2809 	}
2810 
2811 	vtnet_txq_eof(txq);
2812 	vtnet_txq_start(txq);
2813 
2814 	VTNET_TXQ_UNLOCK(txq);
2815 }
2816 
2817 static int
2818 vtnet_txq_eof(struct vtnet_txq *txq)
2819 {
2820 	struct virtqueue *vq;
2821 	struct vtnet_tx_header *txhdr;
2822 	struct mbuf *m;
2823 	int deq;
2824 
2825 	vq = txq->vtntx_vq;
2826 	deq = 0;
2827 	VTNET_TXQ_LOCK_ASSERT(txq);
2828 
2829 	while ((txhdr = virtqueue_dequeue(vq, NULL)) != NULL) {
2830 		m = txhdr->vth_mbuf;
2831 		deq++;
2832 
2833 		txq->vtntx_stats.vtxs_opackets++;
2834 		txq->vtntx_stats.vtxs_obytes += m->m_pkthdr.len;
2835 		if (m->m_flags & M_MCAST)
2836 			txq->vtntx_stats.vtxs_omcasts++;
2837 
2838 		m_freem(m);
2839 		uma_zfree(vtnet_tx_header_zone, txhdr);
2840 	}
2841 
2842 	if (virtqueue_empty(vq))
2843 		txq->vtntx_watchdog = 0;
2844 
2845 	return (deq);
2846 }
2847 
2848 static void
2849 vtnet_tx_vq_intr(void *xtxq)
2850 {
2851 	struct vtnet_softc *sc;
2852 	struct vtnet_txq *txq;
2853 	if_t ifp;
2854 
2855 	txq = xtxq;
2856 	sc = txq->vtntx_sc;
2857 	ifp = sc->vtnet_ifp;
2858 
2859 	if (__predict_false(txq->vtntx_id >= sc->vtnet_act_vq_pairs)) {
2860 		/*
2861 		 * Ignore this interrupt. Either this is a spurious interrupt
2862 		 * or multiqueue without per-VQ MSIX so every queue needs to
2863 		 * be polled (a brain dead configuration we could try harder
2864 		 * to avoid).
2865 		 */
2866 		vtnet_txq_disable_intr(txq);
2867 		return;
2868 	}
2869 
2870 #ifdef DEV_NETMAP
2871 	if (netmap_tx_irq(ifp, txq->vtntx_id) != NM_IRQ_PASS)
2872 		return;
2873 #endif /* DEV_NETMAP */
2874 
2875 	VTNET_TXQ_LOCK(txq);
2876 
2877 	if ((if_getdrvflags(ifp) & IFF_DRV_RUNNING) == 0) {
2878 		VTNET_TXQ_UNLOCK(txq);
2879 		return;
2880 	}
2881 
2882 	vtnet_txq_eof(txq);
2883 	vtnet_txq_start(txq);
2884 
2885 	VTNET_TXQ_UNLOCK(txq);
2886 }
2887 
2888 static void
2889 vtnet_tx_start_all(struct vtnet_softc *sc)
2890 {
2891 	struct vtnet_txq *txq;
2892 	int i;
2893 
2894 	VTNET_CORE_LOCK_ASSERT(sc);
2895 
2896 	for (i = 0; i < sc->vtnet_act_vq_pairs; i++) {
2897 		txq = &sc->vtnet_txqs[i];
2898 
2899 		VTNET_TXQ_LOCK(txq);
2900 		vtnet_txq_start(txq);
2901 		VTNET_TXQ_UNLOCK(txq);
2902 	}
2903 }
2904 
2905 #ifndef VTNET_LEGACY_TX
2906 static void
2907 vtnet_qflush(if_t ifp)
2908 {
2909 	struct vtnet_softc *sc;
2910 	struct vtnet_txq *txq;
2911 	struct mbuf *m;
2912 	int i;
2913 
2914 	sc = if_getsoftc(ifp);
2915 
2916 	for (i = 0; i < sc->vtnet_act_vq_pairs; i++) {
2917 		txq = &sc->vtnet_txqs[i];
2918 
2919 		VTNET_TXQ_LOCK(txq);
2920 		while ((m = buf_ring_dequeue_sc(txq->vtntx_br)) != NULL)
2921 			m_freem(m);
2922 		VTNET_TXQ_UNLOCK(txq);
2923 	}
2924 
2925 	if_qflush(ifp);
2926 }
2927 #endif
2928 
2929 static int
2930 vtnet_watchdog(struct vtnet_txq *txq)
2931 {
2932 	if_t ifp;
2933 
2934 	ifp = txq->vtntx_sc->vtnet_ifp;
2935 
2936 	VTNET_TXQ_LOCK(txq);
2937 	if (txq->vtntx_watchdog == 1) {
2938 		/*
2939 		 * Only drain completed frames if the watchdog is about to
2940 		 * expire. If any frames were drained, there may be enough
2941 		 * free descriptors now available to transmit queued frames.
2942 		 * In that case, the timer will immediately be decremented
2943 		 * below, but the timeout is generous enough that should not
2944 		 * be a problem.
2945 		 */
2946 		if (vtnet_txq_eof(txq) != 0)
2947 			vtnet_txq_start(txq);
2948 	}
2949 
2950 	if (txq->vtntx_watchdog == 0 || --txq->vtntx_watchdog) {
2951 		VTNET_TXQ_UNLOCK(txq);
2952 		return (0);
2953 	}
2954 	VTNET_TXQ_UNLOCK(txq);
2955 
2956 	if_printf(ifp, "watchdog timeout on queue %d\n", txq->vtntx_id);
2957 	return (1);
2958 }
2959 
2960 static void
2961 vtnet_accum_stats(struct vtnet_softc *sc, struct vtnet_rxq_stats *rxacc,
2962     struct vtnet_txq_stats *txacc)
2963 {
2964 
2965 	bzero(rxacc, sizeof(struct vtnet_rxq_stats));
2966 	bzero(txacc, sizeof(struct vtnet_txq_stats));
2967 
2968 	for (int i = 0; i < sc->vtnet_max_vq_pairs; i++) {
2969 		struct vtnet_rxq_stats *rxst;
2970 		struct vtnet_txq_stats *txst;
2971 
2972 		rxst = &sc->vtnet_rxqs[i].vtnrx_stats;
2973 		rxacc->vrxs_ipackets += rxst->vrxs_ipackets;
2974 		rxacc->vrxs_ibytes += rxst->vrxs_ibytes;
2975 		rxacc->vrxs_iqdrops += rxst->vrxs_iqdrops;
2976 		rxacc->vrxs_csum += rxst->vrxs_csum;
2977 		rxacc->vrxs_csum_failed += rxst->vrxs_csum_failed;
2978 		rxacc->vrxs_rescheduled += rxst->vrxs_rescheduled;
2979 
2980 		txst = &sc->vtnet_txqs[i].vtntx_stats;
2981 		txacc->vtxs_opackets += txst->vtxs_opackets;
2982 		txacc->vtxs_obytes += txst->vtxs_obytes;
2983 		txacc->vtxs_csum += txst->vtxs_csum;
2984 		txacc->vtxs_tso += txst->vtxs_tso;
2985 		txacc->vtxs_rescheduled += txst->vtxs_rescheduled;
2986 	}
2987 }
2988 
2989 static uint64_t
2990 vtnet_get_counter(if_t ifp, ift_counter cnt)
2991 {
2992 	struct vtnet_softc *sc;
2993 	struct vtnet_rxq_stats rxaccum;
2994 	struct vtnet_txq_stats txaccum;
2995 
2996 	sc = if_getsoftc(ifp);
2997 	vtnet_accum_stats(sc, &rxaccum, &txaccum);
2998 
2999 	switch (cnt) {
3000 	case IFCOUNTER_IPACKETS:
3001 		return (rxaccum.vrxs_ipackets);
3002 	case IFCOUNTER_IQDROPS:
3003 		return (rxaccum.vrxs_iqdrops);
3004 	case IFCOUNTER_IERRORS:
3005 		return (rxaccum.vrxs_ierrors);
3006 	case IFCOUNTER_OPACKETS:
3007 		return (txaccum.vtxs_opackets);
3008 #ifndef VTNET_LEGACY_TX
3009 	case IFCOUNTER_OBYTES:
3010 		return (txaccum.vtxs_obytes);
3011 	case IFCOUNTER_OMCASTS:
3012 		return (txaccum.vtxs_omcasts);
3013 #endif
3014 	default:
3015 		return (if_get_counter_default(ifp, cnt));
3016 	}
3017 }
3018 
3019 static void
3020 vtnet_tick(void *xsc)
3021 {
3022 	struct vtnet_softc *sc;
3023 	if_t ifp;
3024 	int i, timedout;
3025 
3026 	sc = xsc;
3027 	ifp = sc->vtnet_ifp;
3028 	timedout = 0;
3029 
3030 	VTNET_CORE_LOCK_ASSERT(sc);
3031 
3032 	for (i = 0; i < sc->vtnet_act_vq_pairs; i++)
3033 		timedout |= vtnet_watchdog(&sc->vtnet_txqs[i]);
3034 
3035 	if (timedout != 0) {
3036 		if_setdrvflagbits(ifp, 0, IFF_DRV_RUNNING);
3037 		vtnet_init_locked(sc, 0);
3038 	} else
3039 		callout_schedule(&sc->vtnet_tick_ch, hz);
3040 }
3041 
3042 static void
3043 vtnet_start_taskqueues(struct vtnet_softc *sc)
3044 {
3045 	device_t dev;
3046 	struct vtnet_rxq *rxq;
3047 	struct vtnet_txq *txq;
3048 	int i, error;
3049 
3050 	dev = sc->vtnet_dev;
3051 
3052 	/*
3053 	 * Errors here are very difficult to recover from - we cannot
3054 	 * easily fail because, if this is during boot, we will hang
3055 	 * when freeing any successfully started taskqueues because
3056 	 * the scheduler isn't up yet.
3057 	 *
3058 	 * Most drivers just ignore the return value - it only fails
3059 	 * with ENOMEM so an error is not likely.
3060 	 */
3061 	for (i = 0; i < sc->vtnet_req_vq_pairs; i++) {
3062 		rxq = &sc->vtnet_rxqs[i];
3063 		error = taskqueue_start_threads(&rxq->vtnrx_tq, 1, PI_NET,
3064 		    "%s rxq %d", device_get_nameunit(dev), rxq->vtnrx_id);
3065 		if (error) {
3066 			device_printf(dev, "failed to start rx taskq %d\n",
3067 			    rxq->vtnrx_id);
3068 		}
3069 
3070 		txq = &sc->vtnet_txqs[i];
3071 		error = taskqueue_start_threads(&txq->vtntx_tq, 1, PI_NET,
3072 		    "%s txq %d", device_get_nameunit(dev), txq->vtntx_id);
3073 		if (error) {
3074 			device_printf(dev, "failed to start tx taskq %d\n",
3075 			    txq->vtntx_id);
3076 		}
3077 	}
3078 }
3079 
3080 static void
3081 vtnet_free_taskqueues(struct vtnet_softc *sc)
3082 {
3083 	struct vtnet_rxq *rxq;
3084 	struct vtnet_txq *txq;
3085 	int i;
3086 
3087 	for (i = 0; i < sc->vtnet_max_vq_pairs; i++) {
3088 		rxq = &sc->vtnet_rxqs[i];
3089 		if (rxq->vtnrx_tq != NULL) {
3090 			taskqueue_free(rxq->vtnrx_tq);
3091 			rxq->vtnrx_tq = NULL;
3092 		}
3093 
3094 		txq = &sc->vtnet_txqs[i];
3095 		if (txq->vtntx_tq != NULL) {
3096 			taskqueue_free(txq->vtntx_tq);
3097 			txq->vtntx_tq = NULL;
3098 		}
3099 	}
3100 }
3101 
3102 static void
3103 vtnet_drain_taskqueues(struct vtnet_softc *sc)
3104 {
3105 	struct vtnet_rxq *rxq;
3106 	struct vtnet_txq *txq;
3107 	int i;
3108 
3109 	for (i = 0; i < sc->vtnet_max_vq_pairs; i++) {
3110 		rxq = &sc->vtnet_rxqs[i];
3111 		if (rxq->vtnrx_tq != NULL)
3112 			taskqueue_drain(rxq->vtnrx_tq, &rxq->vtnrx_intrtask);
3113 
3114 		txq = &sc->vtnet_txqs[i];
3115 		if (txq->vtntx_tq != NULL) {
3116 			taskqueue_drain(txq->vtntx_tq, &txq->vtntx_intrtask);
3117 #ifndef VTNET_LEGACY_TX
3118 			taskqueue_drain(txq->vtntx_tq, &txq->vtntx_defrtask);
3119 #endif
3120 		}
3121 	}
3122 }
3123 
3124 static void
3125 vtnet_drain_rxtx_queues(struct vtnet_softc *sc)
3126 {
3127 	struct vtnet_rxq *rxq;
3128 	struct vtnet_txq *txq;
3129 	int i;
3130 
3131 	for (i = 0; i < sc->vtnet_max_vq_pairs; i++) {
3132 		rxq = &sc->vtnet_rxqs[i];
3133 		vtnet_rxq_free_mbufs(rxq);
3134 
3135 		txq = &sc->vtnet_txqs[i];
3136 		vtnet_txq_free_mbufs(txq);
3137 	}
3138 }
3139 
3140 static void
3141 vtnet_stop_rendezvous(struct vtnet_softc *sc)
3142 {
3143 	struct vtnet_rxq *rxq;
3144 	struct vtnet_txq *txq;
3145 	int i;
3146 
3147 	VTNET_CORE_LOCK_ASSERT(sc);
3148 
3149 	/*
3150 	 * Lock and unlock the per-queue mutex so we known the stop
3151 	 * state is visible. Doing only the active queues should be
3152 	 * sufficient, but it does not cost much extra to do all the
3153 	 * queues.
3154 	 */
3155 	for (i = 0; i < sc->vtnet_max_vq_pairs; i++) {
3156 		rxq = &sc->vtnet_rxqs[i];
3157 		VTNET_RXQ_LOCK(rxq);
3158 		VTNET_RXQ_UNLOCK(rxq);
3159 
3160 		txq = &sc->vtnet_txqs[i];
3161 		VTNET_TXQ_LOCK(txq);
3162 		VTNET_TXQ_UNLOCK(txq);
3163 	}
3164 }
3165 
3166 static void
3167 vtnet_stop(struct vtnet_softc *sc)
3168 {
3169 	device_t dev;
3170 	if_t ifp;
3171 
3172 	dev = sc->vtnet_dev;
3173 	ifp = sc->vtnet_ifp;
3174 
3175 	VTNET_CORE_LOCK_ASSERT(sc);
3176 
3177 	if_setdrvflagbits(ifp, 0, IFF_DRV_RUNNING);
3178 	sc->vtnet_link_active = 0;
3179 	callout_stop(&sc->vtnet_tick_ch);
3180 
3181 	/* Only advisory. */
3182 	vtnet_disable_interrupts(sc);
3183 
3184 #ifdef DEV_NETMAP
3185 	/* Stop any pending txsync/rxsync and disable them. */
3186 	netmap_disable_all_rings(ifp);
3187 #endif /* DEV_NETMAP */
3188 
3189 	/*
3190 	 * Stop the host adapter. This resets it to the pre-initialized
3191 	 * state. It will not generate any interrupts until after it is
3192 	 * reinitialized.
3193 	 */
3194 	virtio_stop(dev);
3195 	vtnet_stop_rendezvous(sc);
3196 
3197 	vtnet_drain_rxtx_queues(sc);
3198 	sc->vtnet_act_vq_pairs = 1;
3199 }
3200 
3201 static int
3202 vtnet_virtio_reinit(struct vtnet_softc *sc)
3203 {
3204 	device_t dev;
3205 	if_t ifp;
3206 	uint64_t features;
3207 	int error;
3208 
3209 	dev = sc->vtnet_dev;
3210 	ifp = sc->vtnet_ifp;
3211 	features = sc->vtnet_negotiated_features;
3212 
3213 	/*
3214 	 * Re-negotiate with the host, removing any disabled receive
3215 	 * features. Transmit features are disabled only on our side
3216 	 * via if_capenable and if_hwassist.
3217 	 */
3218 
3219 	if ((if_getcapenable(ifp) & (IFCAP_RXCSUM | IFCAP_RXCSUM_IPV6)) == 0)
3220 		features &= ~(VIRTIO_NET_F_GUEST_CSUM | VTNET_LRO_FEATURES);
3221 
3222 	if ((if_getcapenable(ifp) & IFCAP_LRO) == 0)
3223 		features &= ~VTNET_LRO_FEATURES;
3224 
3225 	if ((if_getcapenable(ifp) & IFCAP_VLAN_HWFILTER) == 0)
3226 		features &= ~VIRTIO_NET_F_CTRL_VLAN;
3227 
3228 	error = virtio_reinit(dev, features);
3229 	if (error) {
3230 		device_printf(dev, "virtio reinit error %d\n", error);
3231 		return (error);
3232 	}
3233 
3234 	sc->vtnet_features = features;
3235 	virtio_reinit_complete(dev);
3236 
3237 	return (0);
3238 }
3239 
3240 static void
3241 vtnet_init_rx_filters(struct vtnet_softc *sc)
3242 {
3243 	if_t ifp;
3244 
3245 	ifp = sc->vtnet_ifp;
3246 
3247 	if (sc->vtnet_flags & VTNET_FLAG_CTRL_RX) {
3248 		vtnet_rx_filter(sc);
3249 		vtnet_rx_filter_mac(sc);
3250 	}
3251 
3252 	if (if_getcapenable(ifp) & IFCAP_VLAN_HWFILTER)
3253 		vtnet_rx_filter_vlan(sc);
3254 }
3255 
3256 static int
3257 vtnet_init_rx_queues(struct vtnet_softc *sc)
3258 {
3259 	device_t dev;
3260 	if_t ifp;
3261 	struct vtnet_rxq *rxq;
3262 	int i, clustersz, error;
3263 
3264 	dev = sc->vtnet_dev;
3265 	ifp = sc->vtnet_ifp;
3266 
3267 	clustersz = vtnet_rx_cluster_size(sc, if_getmtu(ifp));
3268 	sc->vtnet_rx_clustersz = clustersz;
3269 
3270 	if (sc->vtnet_flags & VTNET_FLAG_LRO_NOMRG) {
3271 		sc->vtnet_rx_nmbufs = howmany(sizeof(struct vtnet_rx_header) +
3272 		    VTNET_MAX_RX_SIZE, clustersz);
3273 		KASSERT(sc->vtnet_rx_nmbufs < sc->vtnet_rx_nsegs,
3274 		    ("%s: too many rx mbufs %d for %d segments", __func__,
3275 		    sc->vtnet_rx_nmbufs, sc->vtnet_rx_nsegs));
3276 	} else
3277 		sc->vtnet_rx_nmbufs = 1;
3278 
3279 	for (i = 0; i < sc->vtnet_act_vq_pairs; i++) {
3280 		rxq = &sc->vtnet_rxqs[i];
3281 
3282 		/* Hold the lock to satisfy asserts. */
3283 		VTNET_RXQ_LOCK(rxq);
3284 		error = vtnet_rxq_populate(rxq);
3285 		VTNET_RXQ_UNLOCK(rxq);
3286 
3287 		if (error) {
3288 			device_printf(dev, "cannot populate Rx queue %d\n", i);
3289 			return (error);
3290 		}
3291 	}
3292 
3293 	return (0);
3294 }
3295 
3296 static int
3297 vtnet_init_tx_queues(struct vtnet_softc *sc)
3298 {
3299 	struct vtnet_txq *txq;
3300 	int i;
3301 
3302 	for (i = 0; i < sc->vtnet_act_vq_pairs; i++) {
3303 		txq = &sc->vtnet_txqs[i];
3304 		txq->vtntx_watchdog = 0;
3305 		txq->vtntx_intr_threshold = vtnet_txq_intr_threshold(txq);
3306 #ifdef DEV_NETMAP
3307 		netmap_reset(NA(sc->vtnet_ifp), NR_TX, i, 0);
3308 #endif /* DEV_NETMAP */
3309 	}
3310 
3311 	return (0);
3312 }
3313 
/*
 * Initialize the receive queues and then the transmit queues.
 *
 * Returns 0 on success, or the first error encountered; the transmit queues
 * are not touched when the receive side fails.
 */
static int
vtnet_init_rxtx_queues(struct vtnet_softc *sc)
{
	int error;

	error = vtnet_init_rx_queues(sc);
	if (error == 0)
		error = vtnet_init_tx_queues(sc);

	return (error);
}
3329 
3330 static void
3331 vtnet_set_active_vq_pairs(struct vtnet_softc *sc)
3332 {
3333 	device_t dev;
3334 	int npairs;
3335 
3336 	dev = sc->vtnet_dev;
3337 
3338 	if ((sc->vtnet_flags & VTNET_FLAG_MQ) == 0) {
3339 		sc->vtnet_act_vq_pairs = 1;
3340 		return;
3341 	}
3342 
3343 	npairs = sc->vtnet_req_vq_pairs;
3344 
3345 	if (vtnet_ctrl_mq_cmd(sc, npairs) != 0) {
3346 		device_printf(dev, "cannot set active queue pairs to %d, "
3347 		    "falling back to 1 queue pair\n", npairs);
3348 		npairs = 1;
3349 	}
3350 
3351 	sc->vtnet_act_vq_pairs = npairs;
3352 }
3353 
3354 static void
3355 vtnet_update_rx_offloads(struct vtnet_softc *sc)
3356 {
3357 	if_t ifp;
3358 	uint64_t features;
3359 	int error;
3360 
3361 	ifp = sc->vtnet_ifp;
3362 	features = sc->vtnet_features;
3363 
3364 	VTNET_CORE_LOCK_ASSERT(sc);
3365 
3366 	if (if_getcapabilities(ifp) & (IFCAP_RXCSUM | IFCAP_RXCSUM_IPV6)) {
3367 		if (if_getcapenable(ifp) & (IFCAP_RXCSUM | IFCAP_RXCSUM_IPV6))
3368 			features |= VIRTIO_NET_F_GUEST_CSUM;
3369 		else
3370 			features &= ~VIRTIO_NET_F_GUEST_CSUM;
3371 	}
3372 
3373 	if (if_getcapabilities(ifp) & IFCAP_LRO && !vtnet_software_lro(sc)) {
3374 		if (if_getcapenable(ifp) & IFCAP_LRO)
3375 			features |= VTNET_LRO_FEATURES;
3376 		else
3377 			features &= ~VTNET_LRO_FEATURES;
3378 	}
3379 
3380 	error = vtnet_ctrl_guest_offloads(sc,
3381 	    features & (VIRTIO_NET_F_GUEST_CSUM | VIRTIO_NET_F_GUEST_TSO4 |
3382 		        VIRTIO_NET_F_GUEST_TSO6 | VIRTIO_NET_F_GUEST_ECN  |
3383 			VIRTIO_NET_F_GUEST_UFO));
3384 	if (error) {
3385 		device_printf(sc->vtnet_dev,
3386 		    "%s: cannot update Rx features\n", __func__);
3387 		if (if_getdrvflags(ifp) & IFF_DRV_RUNNING) {
3388 			if_setdrvflagbits(ifp, 0, IFF_DRV_RUNNING);
3389 			vtnet_init_locked(sc, 0);
3390 		}
3391 	} else
3392 		sc->vtnet_features = features;
3393 }
3394 
3395 static int
3396 vtnet_reinit(struct vtnet_softc *sc)
3397 {
3398 	if_t ifp;
3399 	int error;
3400 
3401 	ifp = sc->vtnet_ifp;
3402 
3403 	bcopy(if_getlladdr(ifp), sc->vtnet_hwaddr, ETHER_ADDR_LEN);
3404 
3405 	error = vtnet_virtio_reinit(sc);
3406 	if (error)
3407 		return (error);
3408 
3409 	vtnet_set_macaddr(sc);
3410 	vtnet_set_active_vq_pairs(sc);
3411 
3412 	if (sc->vtnet_flags & VTNET_FLAG_CTRL_VQ)
3413 		vtnet_init_rx_filters(sc);
3414 
3415 	if_sethwassist(ifp, 0);
3416 	if (if_getcapenable(ifp) & IFCAP_TXCSUM)
3417 		if_sethwassistbits(ifp, VTNET_CSUM_OFFLOAD, 0);
3418 	if (if_getcapenable(ifp) & IFCAP_TXCSUM_IPV6)
3419 		if_sethwassistbits(ifp, VTNET_CSUM_OFFLOAD_IPV6, 0);
3420 	if (if_getcapenable(ifp) & IFCAP_TSO4)
3421 		if_sethwassistbits(ifp, CSUM_IP_TSO, 0);
3422 	if (if_getcapenable(ifp) & IFCAP_TSO6)
3423 		if_sethwassistbits(ifp, CSUM_IP6_TSO, 0);
3424 
3425 	error = vtnet_init_rxtx_queues(sc);
3426 	if (error)
3427 		return (error);
3428 
3429 	return (0);
3430 }
3431 
3432 static void
3433 vtnet_init_locked(struct vtnet_softc *sc, int init_mode)
3434 {
3435 	if_t ifp;
3436 
3437 	ifp = sc->vtnet_ifp;
3438 
3439 	VTNET_CORE_LOCK_ASSERT(sc);
3440 
3441 	if (if_getdrvflags(ifp) & IFF_DRV_RUNNING)
3442 		return;
3443 
3444 	vtnet_stop(sc);
3445 
3446 #ifdef DEV_NETMAP
3447 	/* Once stopped we can update the netmap flags, if necessary. */
3448 	switch (init_mode) {
3449 	case VTNET_INIT_NETMAP_ENTER:
3450 		nm_set_native_flags(NA(ifp));
3451 		break;
3452 	case VTNET_INIT_NETMAP_EXIT:
3453 		nm_clear_native_flags(NA(ifp));
3454 		break;
3455 	}
3456 #endif /* DEV_NETMAP */
3457 
3458 	if (vtnet_reinit(sc) != 0) {
3459 		vtnet_stop(sc);
3460 		return;
3461 	}
3462 
3463 	if_setdrvflagbits(ifp, IFF_DRV_RUNNING, 0);
3464 	vtnet_update_link_status(sc);
3465 	vtnet_enable_interrupts(sc);
3466 	callout_reset(&sc->vtnet_tick_ch, hz, vtnet_tick, sc);
3467 
3468 #ifdef DEV_NETMAP
3469 	/* Re-enable txsync/rxsync. */
3470 	netmap_enable_all_rings(ifp);
3471 #endif /* DEV_NETMAP */
3472 }
3473 
/*
 * Initialization entry point taking the softc as an opaque pointer: runs
 * vtnet_init_locked() in the default mode with the core lock held.
 */
static void
vtnet_init(void *xsc)
{
	struct vtnet_softc *sc = xsc;

	VTNET_CORE_LOCK(sc);
	vtnet_init_locked(sc, 0);
	VTNET_CORE_UNLOCK(sc);
}
3485 
3486 static void
3487 vtnet_free_ctrl_vq(struct vtnet_softc *sc)
3488 {
3489 
3490 	/*
3491 	 * The control virtqueue is only polled and therefore it should
3492 	 * already be empty.
3493 	 */
3494 	KASSERT(virtqueue_empty(sc->vtnet_ctrl_vq),
3495 	    ("%s: ctrl vq %p not empty", __func__, sc->vtnet_ctrl_vq));
3496 }
3497 
3498 static void
3499 vtnet_exec_ctrl_cmd(struct vtnet_softc *sc, void *cookie,
3500     struct sglist *sg, int readable, int writable)
3501 {
3502 	struct virtqueue *vq;
3503 
3504 	vq = sc->vtnet_ctrl_vq;
3505 
3506 	MPASS(sc->vtnet_flags & VTNET_FLAG_CTRL_VQ);
3507 	VTNET_CORE_LOCK_ASSERT(sc);
3508 
3509 	if (!virtqueue_empty(vq))
3510 		return;
3511 
3512 	/*
3513 	 * Poll for the response, but the command is likely completed before
3514 	 * returning from the notify.
3515 	 */
3516 	if (virtqueue_enqueue(vq, cookie, sg, readable, writable) == 0)  {
3517 		virtqueue_notify(vq);
3518 		virtqueue_poll(vq, NULL);
3519 	}
3520 }
3521 
3522 static int
3523 vtnet_ctrl_mac_cmd(struct vtnet_softc *sc, uint8_t *hwaddr)
3524 {
3525 	struct sglist_seg segs[3];
3526 	struct sglist sg;
3527 	struct {
3528 		struct virtio_net_ctrl_hdr hdr __aligned(2);
3529 		uint8_t pad1;
3530 		uint8_t addr[ETHER_ADDR_LEN] __aligned(8);
3531 		uint8_t pad2;
3532 		uint8_t ack;
3533 	} s;
3534 	int error;
3535 
3536 	error = 0;
3537 	MPASS(sc->vtnet_flags & VTNET_FLAG_CTRL_MAC);
3538 
3539 	s.hdr.class = VIRTIO_NET_CTRL_MAC;
3540 	s.hdr.cmd = VIRTIO_NET_CTRL_MAC_ADDR_SET;
3541 	bcopy(hwaddr, &s.addr[0], ETHER_ADDR_LEN);
3542 	s.ack = VIRTIO_NET_ERR;
3543 
3544 	sglist_init(&sg, nitems(segs), segs);
3545 	error |= sglist_append(&sg, &s.hdr, sizeof(struct virtio_net_ctrl_hdr));
3546 	error |= sglist_append(&sg, &s.addr[0], ETHER_ADDR_LEN);
3547 	error |= sglist_append(&sg, &s.ack, sizeof(uint8_t));
3548 	MPASS(error == 0 && sg.sg_nseg == nitems(segs));
3549 
3550 	if (error == 0)
3551 		vtnet_exec_ctrl_cmd(sc, &s.ack, &sg, sg.sg_nseg - 1, 1);
3552 
3553 	return (s.ack == VIRTIO_NET_OK ? 0 : EIO);
3554 }
3555 
3556 static int
3557 vtnet_ctrl_guest_offloads(struct vtnet_softc *sc, uint64_t offloads)
3558 {
3559 	struct sglist_seg segs[3];
3560 	struct sglist sg;
3561 	struct {
3562 		struct virtio_net_ctrl_hdr hdr __aligned(2);
3563 		uint8_t pad1;
3564 		uint64_t offloads __aligned(8);
3565 		uint8_t pad2;
3566 		uint8_t ack;
3567 	} s;
3568 	int error;
3569 
3570 	error = 0;
3571 	MPASS(sc->vtnet_features & VIRTIO_NET_F_CTRL_GUEST_OFFLOADS);
3572 
3573 	s.hdr.class = VIRTIO_NET_CTRL_GUEST_OFFLOADS;
3574 	s.hdr.cmd = VIRTIO_NET_CTRL_GUEST_OFFLOADS_SET;
3575 	s.offloads = vtnet_gtoh64(sc, offloads);
3576 	s.ack = VIRTIO_NET_ERR;
3577 
3578 	sglist_init(&sg, nitems(segs), segs);
3579 	error |= sglist_append(&sg, &s.hdr, sizeof(struct virtio_net_ctrl_hdr));
3580 	error |= sglist_append(&sg, &s.offloads, sizeof(uint64_t));
3581 	error |= sglist_append(&sg, &s.ack, sizeof(uint8_t));
3582 	MPASS(error == 0 && sg.sg_nseg == nitems(segs));
3583 
3584 	if (error == 0)
3585 		vtnet_exec_ctrl_cmd(sc, &s.ack, &sg, sg.sg_nseg - 1, 1);
3586 
3587 	return (s.ack == VIRTIO_NET_OK ? 0 : EIO);
3588 }
3589 
3590 static int
3591 vtnet_ctrl_mq_cmd(struct vtnet_softc *sc, uint16_t npairs)
3592 {
3593 	struct sglist_seg segs[3];
3594 	struct sglist sg;
3595 	struct {
3596 		struct virtio_net_ctrl_hdr hdr __aligned(2);
3597 		uint8_t pad1;
3598 		struct virtio_net_ctrl_mq mq __aligned(2);
3599 		uint8_t pad2;
3600 		uint8_t ack;
3601 	} s;
3602 	int error;
3603 
3604 	error = 0;
3605 	MPASS(sc->vtnet_flags & VTNET_FLAG_MQ);
3606 
3607 	s.hdr.class = VIRTIO_NET_CTRL_MQ;
3608 	s.hdr.cmd = VIRTIO_NET_CTRL_MQ_VQ_PAIRS_SET;
3609 	s.mq.virtqueue_pairs = vtnet_gtoh16(sc, npairs);
3610 	s.ack = VIRTIO_NET_ERR;
3611 
3612 	sglist_init(&sg, nitems(segs), segs);
3613 	error |= sglist_append(&sg, &s.hdr, sizeof(struct virtio_net_ctrl_hdr));
3614 	error |= sglist_append(&sg, &s.mq, sizeof(struct virtio_net_ctrl_mq));
3615 	error |= sglist_append(&sg, &s.ack, sizeof(uint8_t));
3616 	MPASS(error == 0 && sg.sg_nseg == nitems(segs));
3617 
3618 	if (error == 0)
3619 		vtnet_exec_ctrl_cmd(sc, &s.ack, &sg, sg.sg_nseg - 1, 1);
3620 
3621 	return (s.ack == VIRTIO_NET_OK ? 0 : EIO);
3622 }
3623 
3624 static int
3625 vtnet_ctrl_rx_cmd(struct vtnet_softc *sc, uint8_t cmd, bool on)
3626 {
3627 	struct sglist_seg segs[3];
3628 	struct sglist sg;
3629 	struct {
3630 		struct virtio_net_ctrl_hdr hdr __aligned(2);
3631 		uint8_t pad1;
3632 		uint8_t onoff;
3633 		uint8_t pad2;
3634 		uint8_t ack;
3635 	} s;
3636 	int error;
3637 
3638 	error = 0;
3639 	MPASS(sc->vtnet_flags & VTNET_FLAG_CTRL_RX);
3640 
3641 	s.hdr.class = VIRTIO_NET_CTRL_RX;
3642 	s.hdr.cmd = cmd;
3643 	s.onoff = on;
3644 	s.ack = VIRTIO_NET_ERR;
3645 
3646 	sglist_init(&sg, nitems(segs), segs);
3647 	error |= sglist_append(&sg, &s.hdr, sizeof(struct virtio_net_ctrl_hdr));
3648 	error |= sglist_append(&sg, &s.onoff, sizeof(uint8_t));
3649 	error |= sglist_append(&sg, &s.ack, sizeof(uint8_t));
3650 	MPASS(error == 0 && sg.sg_nseg == nitems(segs));
3651 
3652 	if (error == 0)
3653 		vtnet_exec_ctrl_cmd(sc, &s.ack, &sg, sg.sg_nseg - 1, 1);
3654 
3655 	return (s.ack == VIRTIO_NET_OK ? 0 : EIO);
3656 }
3657 
3658 static int
3659 vtnet_set_promisc(struct vtnet_softc *sc, bool on)
3660 {
3661 	return (vtnet_ctrl_rx_cmd(sc, VIRTIO_NET_CTRL_RX_PROMISC, on));
3662 }
3663 
3664 static int
3665 vtnet_set_allmulti(struct vtnet_softc *sc, bool on)
3666 {
3667 	return (vtnet_ctrl_rx_cmd(sc, VIRTIO_NET_CTRL_RX_ALLMULTI, on));
3668 }
3669 
3670 static void
3671 vtnet_rx_filter(struct vtnet_softc *sc)
3672 {
3673 	device_t dev;
3674 	if_t ifp;
3675 
3676 	dev = sc->vtnet_dev;
3677 	ifp = sc->vtnet_ifp;
3678 
3679 	VTNET_CORE_LOCK_ASSERT(sc);
3680 
3681 	if (vtnet_set_promisc(sc, if_getflags(ifp) & IFF_PROMISC) != 0) {
3682 		device_printf(dev, "cannot %s promiscuous mode\n",
3683 		    if_getflags(ifp) & IFF_PROMISC ? "enable" : "disable");
3684 	}
3685 
3686 	if (vtnet_set_allmulti(sc, if_getflags(ifp) & IFF_ALLMULTI) != 0) {
3687 		device_printf(dev, "cannot %s all-multicast mode\n",
3688 		    if_getflags(ifp) & IFF_ALLMULTI ? "enable" : "disable");
3689 	}
3690 }
3691 
3692 static u_int
3693 vtnet_copy_ifaddr(void *arg, struct sockaddr_dl *sdl, u_int ucnt)
3694 {
3695 	struct vtnet_softc *sc = arg;
3696 
3697 	if (memcmp(LLADDR(sdl), sc->vtnet_hwaddr, ETHER_ADDR_LEN) == 0)
3698 		return (0);
3699 
3700 	if (ucnt < VTNET_MAX_MAC_ENTRIES)
3701 		bcopy(LLADDR(sdl),
3702 		    &sc->vtnet_mac_filter->vmf_unicast.macs[ucnt],
3703 		    ETHER_ADDR_LEN);
3704 
3705 	return (1);
3706 }
3707 
3708 static u_int
3709 vtnet_copy_maddr(void *arg, struct sockaddr_dl *sdl, u_int mcnt)
3710 {
3711 	struct vtnet_mac_filter *filter = arg;
3712 
3713 	if (mcnt < VTNET_MAX_MAC_ENTRIES)
3714 		bcopy(LLADDR(sdl), &filter->vmf_multicast.macs[mcnt],
3715 		    ETHER_ADDR_LEN);
3716 
3717 	return (1);
3718 }
3719 
3720 static void
3721 vtnet_rx_filter_mac(struct vtnet_softc *sc)
3722 {
3723 	struct virtio_net_ctrl_hdr hdr __aligned(2);
3724 	struct vtnet_mac_filter *filter;
3725 	struct sglist_seg segs[4];
3726 	struct sglist sg;
3727 	if_t ifp;
3728 	bool promisc, allmulti;
3729 	u_int ucnt, mcnt;
3730 	int error;
3731 	uint8_t ack;
3732 
3733 	ifp = sc->vtnet_ifp;
3734 	filter = sc->vtnet_mac_filter;
3735 	error = 0;
3736 
3737 	MPASS(sc->vtnet_flags & VTNET_FLAG_CTRL_RX);
3738 	VTNET_CORE_LOCK_ASSERT(sc);
3739 
3740 	/* Unicast MAC addresses: */
3741 	ucnt = if_foreach_lladdr(ifp, vtnet_copy_ifaddr, sc);
3742 	promisc = (ucnt > VTNET_MAX_MAC_ENTRIES);
3743 
3744 	if (promisc) {
3745 		ucnt = 0;
3746 		if_printf(ifp, "more than %d MAC addresses assigned, "
3747 		    "falling back to promiscuous mode\n",
3748 		    VTNET_MAX_MAC_ENTRIES);
3749 	}
3750 
3751 	/* Multicast MAC addresses: */
3752 	mcnt = if_foreach_llmaddr(ifp, vtnet_copy_maddr, filter);
3753 	allmulti = (mcnt > VTNET_MAX_MAC_ENTRIES);
3754 
3755 	if (allmulti) {
3756 		mcnt = 0;
3757 		if_printf(ifp, "more than %d multicast MAC addresses "
3758 		    "assigned, falling back to all-multicast mode\n",
3759 		    VTNET_MAX_MAC_ENTRIES);
3760 	}
3761 
3762 	if (promisc && allmulti)
3763 		goto out;
3764 
3765 	filter->vmf_unicast.nentries = vtnet_gtoh32(sc, ucnt);
3766 	filter->vmf_multicast.nentries = vtnet_gtoh32(sc, mcnt);
3767 
3768 	hdr.class = VIRTIO_NET_CTRL_MAC;
3769 	hdr.cmd = VIRTIO_NET_CTRL_MAC_TABLE_SET;
3770 	ack = VIRTIO_NET_ERR;
3771 
3772 	sglist_init(&sg, nitems(segs), segs);
3773 	error |= sglist_append(&sg, &hdr, sizeof(struct virtio_net_ctrl_hdr));
3774 	error |= sglist_append(&sg, &filter->vmf_unicast,
3775 	    sizeof(uint32_t) + ucnt * ETHER_ADDR_LEN);
3776 	error |= sglist_append(&sg, &filter->vmf_multicast,
3777 	    sizeof(uint32_t) + mcnt * ETHER_ADDR_LEN);
3778 	error |= sglist_append(&sg, &ack, sizeof(uint8_t));
3779 	MPASS(error == 0 && sg.sg_nseg == nitems(segs));
3780 
3781 	if (error == 0)
3782 		vtnet_exec_ctrl_cmd(sc, &ack, &sg, sg.sg_nseg - 1, 1);
3783 	if (ack != VIRTIO_NET_OK)
3784 		if_printf(ifp, "error setting host MAC filter table\n");
3785 
3786 out:
3787 	if (promisc != 0 && vtnet_set_promisc(sc, true) != 0)
3788 		if_printf(ifp, "cannot enable promiscuous mode\n");
3789 	if (allmulti != 0 && vtnet_set_allmulti(sc, true) != 0)
3790 		if_printf(ifp, "cannot enable all-multicast mode\n");
3791 }
3792 
3793 static int
3794 vtnet_exec_vlan_filter(struct vtnet_softc *sc, int add, uint16_t tag)
3795 {
3796 	struct sglist_seg segs[3];
3797 	struct sglist sg;
3798 	struct {
3799 		struct virtio_net_ctrl_hdr hdr __aligned(2);
3800 		uint8_t pad1;
3801 		uint16_t tag __aligned(2);
3802 		uint8_t pad2;
3803 		uint8_t ack;
3804 	} s;
3805 	int error;
3806 
3807 	error = 0;
3808 	MPASS(sc->vtnet_flags & VTNET_FLAG_VLAN_FILTER);
3809 
3810 	s.hdr.class = VIRTIO_NET_CTRL_VLAN;
3811 	s.hdr.cmd = add ? VIRTIO_NET_CTRL_VLAN_ADD : VIRTIO_NET_CTRL_VLAN_DEL;
3812 	s.tag = vtnet_gtoh16(sc, tag);
3813 	s.ack = VIRTIO_NET_ERR;
3814 
3815 	sglist_init(&sg, nitems(segs), segs);
3816 	error |= sglist_append(&sg, &s.hdr, sizeof(struct virtio_net_ctrl_hdr));
3817 	error |= sglist_append(&sg, &s.tag, sizeof(uint16_t));
3818 	error |= sglist_append(&sg, &s.ack, sizeof(uint8_t));
3819 	MPASS(error == 0 && sg.sg_nseg == nitems(segs));
3820 
3821 	if (error == 0)
3822 		vtnet_exec_ctrl_cmd(sc, &s.ack, &sg, sg.sg_nseg - 1, 1);
3823 
3824 	return (s.ack == VIRTIO_NET_OK ? 0 : EIO);
3825 }
3826 
3827 static void
3828 vtnet_rx_filter_vlan(struct vtnet_softc *sc)
3829 {
3830 	int i, bit;
3831 	uint32_t w;
3832 	uint16_t tag;
3833 
3834 	MPASS(sc->vtnet_flags & VTNET_FLAG_VLAN_FILTER);
3835 	VTNET_CORE_LOCK_ASSERT(sc);
3836 
3837 	/* Enable the filter for each configured VLAN. */
3838 	for (i = 0; i < VTNET_VLAN_FILTER_NWORDS; i++) {
3839 		w = sc->vtnet_vlan_filter[i];
3840 
3841 		while ((bit = ffs(w) - 1) != -1) {
3842 			w &= ~(1 << bit);
3843 			tag = sizeof(w) * CHAR_BIT * i + bit;
3844 
3845 			if (vtnet_exec_vlan_filter(sc, 1, tag) != 0) {
3846 				device_printf(sc->vtnet_dev,
3847 				    "cannot enable VLAN %d filter\n", tag);
3848 			}
3849 		}
3850 	}
3851 }
3852 
3853 static void
3854 vtnet_update_vlan_filter(struct vtnet_softc *sc, int add, uint16_t tag)
3855 {
3856 	if_t ifp;
3857 	int idx, bit;
3858 
3859 	ifp = sc->vtnet_ifp;
3860 	idx = (tag >> 5) & 0x7F;
3861 	bit = tag & 0x1F;
3862 
3863 	if (tag == 0 || tag > 4095)
3864 		return;
3865 
3866 	VTNET_CORE_LOCK(sc);
3867 
3868 	if (add)
3869 		sc->vtnet_vlan_filter[idx] |= (1 << bit);
3870 	else
3871 		sc->vtnet_vlan_filter[idx] &= ~(1 << bit);
3872 
3873 	if (if_getcapenable(ifp) & IFCAP_VLAN_HWFILTER &&
3874 	    if_getdrvflags(ifp) & IFF_DRV_RUNNING &&
3875 	    vtnet_exec_vlan_filter(sc, add, tag) != 0) {
3876 		device_printf(sc->vtnet_dev,
3877 		    "cannot %s VLAN %d %s the host filter table\n",
3878 		    add ? "add" : "remove", tag, add ? "to" : "from");
3879 	}
3880 
3881 	VTNET_CORE_UNLOCK(sc);
3882 }
3883 
3884 static void
3885 vtnet_register_vlan(void *arg, if_t ifp, uint16_t tag)
3886 {
3887 
3888 	if (if_getsoftc(ifp) != arg)
3889 		return;
3890 
3891 	vtnet_update_vlan_filter(arg, 1, tag);
3892 }
3893 
3894 static void
3895 vtnet_unregister_vlan(void *arg, if_t ifp, uint16_t tag)
3896 {
3897 
3898 	if (if_getsoftc(ifp) != arg)
3899 		return;
3900 
3901 	vtnet_update_vlan_filter(arg, 0, tag);
3902 }
3903 
3904 static void
3905 vtnet_update_speed_duplex(struct vtnet_softc *sc)
3906 {
3907 	if_t ifp;
3908 	uint32_t speed;
3909 
3910 	ifp = sc->vtnet_ifp;
3911 
3912 	if ((sc->vtnet_features & VIRTIO_NET_F_SPEED_DUPLEX) == 0)
3913 		return;
3914 
3915 	/* BMV: Ignore duplex. */
3916 	speed = virtio_read_dev_config_4(sc->vtnet_dev,
3917 	    offsetof(struct virtio_net_config, speed));
3918 	if (speed != UINT32_MAX)
3919 		if_setbaudrate(ifp, IF_Mbps(speed));
3920 }
3921 
3922 static int
3923 vtnet_is_link_up(struct vtnet_softc *sc)
3924 {
3925 	uint16_t status;
3926 
3927 	if ((sc->vtnet_features & VIRTIO_NET_F_STATUS) == 0)
3928 		return (1);
3929 
3930 	status = virtio_read_dev_config_2(sc->vtnet_dev,
3931 	    offsetof(struct virtio_net_config, status));
3932 
3933 	return ((status & VIRTIO_NET_S_LINK_UP) != 0);
3934 }
3935 
3936 static void
3937 vtnet_update_link_status(struct vtnet_softc *sc)
3938 {
3939 	if_t ifp;
3940 	int link;
3941 
3942 	ifp = sc->vtnet_ifp;
3943 	VTNET_CORE_LOCK_ASSERT(sc);
3944 	link = vtnet_is_link_up(sc);
3945 
3946 	/* Notify if the link status has changed. */
3947 	if (link != 0 && sc->vtnet_link_active == 0) {
3948 		vtnet_update_speed_duplex(sc);
3949 		sc->vtnet_link_active = 1;
3950 		if_link_state_change(ifp, LINK_STATE_UP);
3951 	} else if (link == 0 && sc->vtnet_link_active != 0) {
3952 		sc->vtnet_link_active = 0;
3953 		if_link_state_change(ifp, LINK_STATE_DOWN);
3954 	}
3955 }
3956 
3957 static int
3958 vtnet_ifmedia_upd(if_t ifp __unused)
3959 {
3960 	return (EOPNOTSUPP);
3961 }
3962 
3963 static void
3964 vtnet_ifmedia_sts(if_t ifp, struct ifmediareq *ifmr)
3965 {
3966 	struct vtnet_softc *sc;
3967 
3968 	sc = if_getsoftc(ifp);
3969 
3970 	ifmr->ifm_status = IFM_AVALID;
3971 	ifmr->ifm_active = IFM_ETHER;
3972 
3973 	VTNET_CORE_LOCK(sc);
3974 	if (vtnet_is_link_up(sc) != 0) {
3975 		ifmr->ifm_status |= IFM_ACTIVE;
3976 		ifmr->ifm_active |= IFM_10G_T | IFM_FDX;
3977 	} else
3978 		ifmr->ifm_active |= IFM_NONE;
3979 	VTNET_CORE_UNLOCK(sc);
3980 }
3981 
3982 static void
3983 vtnet_get_macaddr(struct vtnet_softc *sc)
3984 {
3985 
3986 	if (sc->vtnet_flags & VTNET_FLAG_MAC) {
3987 		virtio_read_device_config_array(sc->vtnet_dev,
3988 		    offsetof(struct virtio_net_config, mac),
3989 		    &sc->vtnet_hwaddr[0], sizeof(uint8_t), ETHER_ADDR_LEN);
3990 	} else {
3991 		/* Generate a random locally administered unicast address. */
3992 		sc->vtnet_hwaddr[0] = 0xB2;
3993 		arc4rand(&sc->vtnet_hwaddr[1], ETHER_ADDR_LEN - 1, 0);
3994 	}
3995 }
3996 
3997 static void
3998 vtnet_set_macaddr(struct vtnet_softc *sc)
3999 {
4000 	device_t dev;
4001 	int error;
4002 
4003 	dev = sc->vtnet_dev;
4004 
4005 	if (sc->vtnet_flags & VTNET_FLAG_CTRL_MAC) {
4006 		error = vtnet_ctrl_mac_cmd(sc, sc->vtnet_hwaddr);
4007 		if (error)
4008 			device_printf(dev, "unable to set MAC address\n");
4009 		return;
4010 	}
4011 
4012 	/* MAC in config is read-only in modern VirtIO. */
4013 	if (!vtnet_modern(sc) && sc->vtnet_flags & VTNET_FLAG_MAC) {
4014 		for (int i = 0; i < ETHER_ADDR_LEN; i++) {
4015 			virtio_write_dev_config_1(dev,
4016 			    offsetof(struct virtio_net_config, mac) + i,
4017 			    sc->vtnet_hwaddr[i]);
4018 		}
4019 	}
4020 }
4021 
4022 static void
4023 vtnet_attached_set_macaddr(struct vtnet_softc *sc)
4024 {
4025 
4026 	/* Assign MAC address if it was generated. */
4027 	if ((sc->vtnet_flags & VTNET_FLAG_MAC) == 0)
4028 		vtnet_set_macaddr(sc);
4029 }
4030 
4031 static void
4032 vtnet_vlan_tag_remove(struct mbuf *m)
4033 {
4034 	struct ether_vlan_header *evh;
4035 
4036 	evh = mtod(m, struct ether_vlan_header *);
4037 	m->m_pkthdr.ether_vtag = ntohs(evh->evl_tag);
4038 	m->m_flags |= M_VLANTAG;
4039 
4040 	/* Strip the 802.1Q header. */
4041 	bcopy((char *) evh, (char *) evh + ETHER_VLAN_ENCAP_LEN,
4042 	    ETHER_HDR_LEN - ETHER_TYPE_LEN);
4043 	m_adj(m, ETHER_VLAN_ENCAP_LEN);
4044 }
4045 
4046 static void
4047 vtnet_set_rx_process_limit(struct vtnet_softc *sc)
4048 {
4049 	int limit;
4050 
4051 	limit = vtnet_tunable_int(sc, "rx_process_limit",
4052 	    vtnet_rx_process_limit);
4053 	if (limit < 0)
4054 		limit = INT_MAX;
4055 	sc->vtnet_rx_process_limit = limit;
4056 }
4057 
4058 static void
4059 vtnet_setup_rxq_sysctl(struct sysctl_ctx_list *ctx,
4060     struct sysctl_oid_list *child, struct vtnet_rxq *rxq)
4061 {
4062 	struct sysctl_oid *node;
4063 	struct sysctl_oid_list *list;
4064 	struct vtnet_rxq_stats *stats;
4065 	char namebuf[16];
4066 
4067 	snprintf(namebuf, sizeof(namebuf), "rxq%d", rxq->vtnrx_id);
4068 	node = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, namebuf,
4069 	    CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "Receive Queue");
4070 	list = SYSCTL_CHILDREN(node);
4071 
4072 	stats = &rxq->vtnrx_stats;
4073 
4074 	SYSCTL_ADD_UQUAD(ctx, list, OID_AUTO, "ipackets", CTLFLAG_RD,
4075 	    &stats->vrxs_ipackets, "Receive packets");
4076 	SYSCTL_ADD_UQUAD(ctx, list, OID_AUTO, "ibytes", CTLFLAG_RD,
4077 	    &stats->vrxs_ibytes, "Receive bytes");
4078 	SYSCTL_ADD_UQUAD(ctx, list, OID_AUTO, "iqdrops", CTLFLAG_RD,
4079 	    &stats->vrxs_iqdrops, "Receive drops");
4080 	SYSCTL_ADD_UQUAD(ctx, list, OID_AUTO, "ierrors", CTLFLAG_RD,
4081 	    &stats->vrxs_ierrors, "Receive errors");
4082 	SYSCTL_ADD_UQUAD(ctx, list, OID_AUTO, "csum", CTLFLAG_RD,
4083 	    &stats->vrxs_csum, "Receive checksum offloaded");
4084 	SYSCTL_ADD_UQUAD(ctx, list, OID_AUTO, "csum_failed", CTLFLAG_RD,
4085 	    &stats->vrxs_csum_failed, "Receive checksum offload failed");
4086 	SYSCTL_ADD_UQUAD(ctx, list, OID_AUTO, "host_lro", CTLFLAG_RD,
4087 	    &stats->vrxs_host_lro, "Receive host segmentation offloaded");
4088 	SYSCTL_ADD_UQUAD(ctx, list, OID_AUTO, "rescheduled", CTLFLAG_RD,
4089 	    &stats->vrxs_rescheduled,
4090 	    "Receive interrupt handler rescheduled");
4091 }
4092 
4093 static void
4094 vtnet_setup_txq_sysctl(struct sysctl_ctx_list *ctx,
4095     struct sysctl_oid_list *child, struct vtnet_txq *txq)
4096 {
4097 	struct sysctl_oid *node;
4098 	struct sysctl_oid_list *list;
4099 	struct vtnet_txq_stats *stats;
4100 	char namebuf[16];
4101 
4102 	snprintf(namebuf, sizeof(namebuf), "txq%d", txq->vtntx_id);
4103 	node = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, namebuf,
4104 	    CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "Transmit Queue");
4105 	list = SYSCTL_CHILDREN(node);
4106 
4107 	stats = &txq->vtntx_stats;
4108 
4109 	SYSCTL_ADD_UQUAD(ctx, list, OID_AUTO, "opackets", CTLFLAG_RD,
4110 	    &stats->vtxs_opackets, "Transmit packets");
4111 	SYSCTL_ADD_UQUAD(ctx, list, OID_AUTO, "obytes", CTLFLAG_RD,
4112 	    &stats->vtxs_obytes, "Transmit bytes");
4113 	SYSCTL_ADD_UQUAD(ctx, list, OID_AUTO, "omcasts", CTLFLAG_RD,
4114 	    &stats->vtxs_omcasts, "Transmit multicasts");
4115 	SYSCTL_ADD_UQUAD(ctx, list, OID_AUTO, "csum", CTLFLAG_RD,
4116 	    &stats->vtxs_csum, "Transmit checksum offloaded");
4117 	SYSCTL_ADD_UQUAD(ctx, list, OID_AUTO, "tso", CTLFLAG_RD,
4118 	    &stats->vtxs_tso, "Transmit TCP segmentation offloaded");
4119 	SYSCTL_ADD_UQUAD(ctx, list, OID_AUTO, "rescheduled", CTLFLAG_RD,
4120 	    &stats->vtxs_rescheduled,
4121 	    "Transmit interrupt handler rescheduled");
4122 }
4123 
4124 static void
4125 vtnet_setup_queue_sysctl(struct vtnet_softc *sc)
4126 {
4127 	device_t dev;
4128 	struct sysctl_ctx_list *ctx;
4129 	struct sysctl_oid *tree;
4130 	struct sysctl_oid_list *child;
4131 	int i;
4132 
4133 	dev = sc->vtnet_dev;
4134 	ctx = device_get_sysctl_ctx(dev);
4135 	tree = device_get_sysctl_tree(dev);
4136 	child = SYSCTL_CHILDREN(tree);
4137 
4138 	for (i = 0; i < sc->vtnet_req_vq_pairs; i++) {
4139 		vtnet_setup_rxq_sysctl(ctx, child, &sc->vtnet_rxqs[i]);
4140 		vtnet_setup_txq_sysctl(ctx, child, &sc->vtnet_txqs[i]);
4141 	}
4142 }
4143 
4144 static void
4145 vtnet_setup_stat_sysctl(struct sysctl_ctx_list *ctx,
4146     struct sysctl_oid_list *child, struct vtnet_softc *sc)
4147 {
4148 	struct vtnet_statistics *stats;
4149 	struct vtnet_rxq_stats rxaccum;
4150 	struct vtnet_txq_stats txaccum;
4151 
4152 	vtnet_accum_stats(sc, &rxaccum, &txaccum);
4153 
4154 	stats = &sc->vtnet_stats;
4155 	stats->rx_csum_offloaded = rxaccum.vrxs_csum;
4156 	stats->rx_csum_failed = rxaccum.vrxs_csum_failed;
4157 	stats->rx_task_rescheduled = rxaccum.vrxs_rescheduled;
4158 	stats->tx_csum_offloaded = txaccum.vtxs_csum;
4159 	stats->tx_tso_offloaded = txaccum.vtxs_tso;
4160 	stats->tx_task_rescheduled = txaccum.vtxs_rescheduled;
4161 
4162 	SYSCTL_ADD_UQUAD(ctx, child, OID_AUTO, "mbuf_alloc_failed",
4163 	    CTLFLAG_RD, &stats->mbuf_alloc_failed,
4164 	    "Mbuf cluster allocation failures");
4165 
4166 	SYSCTL_ADD_UQUAD(ctx, child, OID_AUTO, "rx_frame_too_large",
4167 	    CTLFLAG_RD, &stats->rx_frame_too_large,
4168 	    "Received frame larger than the mbuf chain");
4169 	SYSCTL_ADD_UQUAD(ctx, child, OID_AUTO, "rx_enq_replacement_failed",
4170 	    CTLFLAG_RD, &stats->rx_enq_replacement_failed,
4171 	    "Enqueuing the replacement receive mbuf failed");
4172 	SYSCTL_ADD_UQUAD(ctx, child, OID_AUTO, "rx_mergeable_failed",
4173 	    CTLFLAG_RD, &stats->rx_mergeable_failed,
4174 	    "Mergeable buffers receive failures");
4175 	SYSCTL_ADD_UQUAD(ctx, child, OID_AUTO, "rx_csum_bad_ethtype",
4176 	    CTLFLAG_RD, &stats->rx_csum_bad_ethtype,
4177 	    "Received checksum offloaded buffer with unsupported "
4178 	    "Ethernet type");
4179 	SYSCTL_ADD_UQUAD(ctx, child, OID_AUTO, "rx_csum_bad_ipproto",
4180 	    CTLFLAG_RD, &stats->rx_csum_bad_ipproto,
4181 	    "Received checksum offloaded buffer with incorrect IP protocol");
4182 	SYSCTL_ADD_UQUAD(ctx, child, OID_AUTO, "rx_csum_bad_offset",
4183 	    CTLFLAG_RD, &stats->rx_csum_bad_offset,
4184 	    "Received checksum offloaded buffer with incorrect offset");
4185 	SYSCTL_ADD_UQUAD(ctx, child, OID_AUTO, "rx_csum_bad_proto",
4186 	    CTLFLAG_RD, &stats->rx_csum_bad_proto,
4187 	    "Received checksum offloaded buffer with incorrect protocol");
4188 	SYSCTL_ADD_UQUAD(ctx, child, OID_AUTO, "rx_csum_failed",
4189 	    CTLFLAG_RD, &stats->rx_csum_failed,
4190 	    "Received buffer checksum offload failed");
4191 	SYSCTL_ADD_UQUAD(ctx, child, OID_AUTO, "rx_csum_offloaded",
4192 	    CTLFLAG_RD, &stats->rx_csum_offloaded,
4193 	    "Received buffer checksum offload succeeded");
4194 	SYSCTL_ADD_UQUAD(ctx, child, OID_AUTO, "rx_task_rescheduled",
4195 	    CTLFLAG_RD, &stats->rx_task_rescheduled,
4196 	    "Times the receive interrupt task rescheduled itself");
4197 
4198 	SYSCTL_ADD_UQUAD(ctx, child, OID_AUTO, "tx_csum_unknown_ethtype",
4199 	    CTLFLAG_RD, &stats->tx_csum_unknown_ethtype,
4200 	    "Aborted transmit of checksum offloaded buffer with unknown "
4201 	    "Ethernet type");
4202 	SYSCTL_ADD_UQUAD(ctx, child, OID_AUTO, "tx_csum_proto_mismatch",
4203 	    CTLFLAG_RD, &stats->tx_csum_proto_mismatch,
4204 	    "Aborted transmit of checksum offloaded buffer because mismatched "
4205 	    "protocols");
4206 	SYSCTL_ADD_UQUAD(ctx, child, OID_AUTO, "tx_tso_not_tcp",
4207 	    CTLFLAG_RD, &stats->tx_tso_not_tcp,
4208 	    "Aborted transmit of TSO buffer with non TCP protocol");
4209 	SYSCTL_ADD_UQUAD(ctx, child, OID_AUTO, "tx_tso_without_csum",
4210 	    CTLFLAG_RD, &stats->tx_tso_without_csum,
4211 	    "Aborted transmit of TSO buffer without TCP checksum offload");
4212 	SYSCTL_ADD_UQUAD(ctx, child, OID_AUTO, "tx_defragged",
4213 	    CTLFLAG_RD, &stats->tx_defragged,
4214 	    "Transmit mbufs defragged");
4215 	SYSCTL_ADD_UQUAD(ctx, child, OID_AUTO, "tx_defrag_failed",
4216 	    CTLFLAG_RD, &stats->tx_defrag_failed,
4217 	    "Aborted transmit of buffer because defrag failed");
4218 	SYSCTL_ADD_UQUAD(ctx, child, OID_AUTO, "tx_csum_offloaded",
4219 	    CTLFLAG_RD, &stats->tx_csum_offloaded,
4220 	    "Offloaded checksum of transmitted buffer");
4221 	SYSCTL_ADD_UQUAD(ctx, child, OID_AUTO, "tx_tso_offloaded",
4222 	    CTLFLAG_RD, &stats->tx_tso_offloaded,
4223 	    "Segmentation offload of transmitted buffer");
4224 	SYSCTL_ADD_UQUAD(ctx, child, OID_AUTO, "tx_task_rescheduled",
4225 	    CTLFLAG_RD, &stats->tx_task_rescheduled,
4226 	    "Times the transmit interrupt task rescheduled itself");
4227 }
4228 
4229 static void
4230 vtnet_setup_sysctl(struct vtnet_softc *sc)
4231 {
4232 	device_t dev;
4233 	struct sysctl_ctx_list *ctx;
4234 	struct sysctl_oid *tree;
4235 	struct sysctl_oid_list *child;
4236 
4237 	dev = sc->vtnet_dev;
4238 	ctx = device_get_sysctl_ctx(dev);
4239 	tree = device_get_sysctl_tree(dev);
4240 	child = SYSCTL_CHILDREN(tree);
4241 
4242 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "max_vq_pairs",
4243 	    CTLFLAG_RD, &sc->vtnet_max_vq_pairs, 0,
4244 	    "Number of maximum supported virtqueue pairs");
4245 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "req_vq_pairs",
4246 	    CTLFLAG_RD, &sc->vtnet_req_vq_pairs, 0,
4247 	    "Number of requested virtqueue pairs");
4248 	SYSCTL_ADD_INT(ctx, child, OID_AUTO, "act_vq_pairs",
4249 	    CTLFLAG_RD, &sc->vtnet_act_vq_pairs, 0,
4250 	    "Number of active virtqueue pairs");
4251 
4252 	vtnet_setup_stat_sysctl(ctx, child, sc);
4253 }
4254 
4255 static void
4256 vtnet_load_tunables(struct vtnet_softc *sc)
4257 {
4258 
4259 	sc->vtnet_lro_entry_count = vtnet_tunable_int(sc,
4260 	    "lro_entry_count", vtnet_lro_entry_count);
4261 	if (sc->vtnet_lro_entry_count < TCP_LRO_ENTRIES)
4262 		sc->vtnet_lro_entry_count = TCP_LRO_ENTRIES;
4263 
4264 	sc->vtnet_lro_mbufq_depth = vtnet_tunable_int(sc,
4265 	    "lro_mbufq_depth", vtnet_lro_mbufq_depth);
4266 }
4267 
4268 static int
4269 vtnet_rxq_enable_intr(struct vtnet_rxq *rxq)
4270 {
4271 
4272 	return (virtqueue_enable_intr(rxq->vtnrx_vq));
4273 }
4274 
4275 static void
4276 vtnet_rxq_disable_intr(struct vtnet_rxq *rxq)
4277 {
4278 
4279 	virtqueue_disable_intr(rxq->vtnrx_vq);
4280 }
4281 
4282 static int
4283 vtnet_txq_enable_intr(struct vtnet_txq *txq)
4284 {
4285 	struct virtqueue *vq;
4286 
4287 	vq = txq->vtntx_vq;
4288 
4289 	if (vtnet_txq_below_threshold(txq) != 0)
4290 		return (virtqueue_postpone_intr(vq, VQ_POSTPONE_LONG));
4291 
4292 	/*
4293 	 * The free count is above our threshold. Keep the Tx interrupt
4294 	 * disabled until the queue is fuller.
4295 	 */
4296 	return (0);
4297 }
4298 
4299 static void
4300 vtnet_txq_disable_intr(struct vtnet_txq *txq)
4301 {
4302 
4303 	virtqueue_disable_intr(txq->vtntx_vq);
4304 }
4305 
4306 static void
4307 vtnet_enable_rx_interrupts(struct vtnet_softc *sc)
4308 {
4309 	struct vtnet_rxq *rxq;
4310 	int i;
4311 
4312 	for (i = 0; i < sc->vtnet_act_vq_pairs; i++) {
4313 		rxq = &sc->vtnet_rxqs[i];
4314 		if (vtnet_rxq_enable_intr(rxq) != 0)
4315 			taskqueue_enqueue(rxq->vtnrx_tq, &rxq->vtnrx_intrtask);
4316 	}
4317 }
4318 
4319 static void
4320 vtnet_enable_tx_interrupts(struct vtnet_softc *sc)
4321 {
4322 	int i;
4323 
4324 	for (i = 0; i < sc->vtnet_act_vq_pairs; i++)
4325 		vtnet_txq_enable_intr(&sc->vtnet_txqs[i]);
4326 }
4327 
/*
 * Enable interrupts for the active virtqueue pairs: receive queues
 * first, then transmit queues.
 */
static void
vtnet_enable_interrupts(struct vtnet_softc *sc)
{
	vtnet_enable_rx_interrupts(sc);
	vtnet_enable_tx_interrupts(sc);
}
4335 
4336 static void
4337 vtnet_disable_rx_interrupts(struct vtnet_softc *sc)
4338 {
4339 	int i;
4340 
4341 	for (i = 0; i < sc->vtnet_max_vq_pairs; i++)
4342 		vtnet_rxq_disable_intr(&sc->vtnet_rxqs[i]);
4343 }
4344 
4345 static void
4346 vtnet_disable_tx_interrupts(struct vtnet_softc *sc)
4347 {
4348 	int i;
4349 
4350 	for (i = 0; i < sc->vtnet_max_vq_pairs; i++)
4351 		vtnet_txq_disable_intr(&sc->vtnet_txqs[i]);
4352 }
4353 
/*
 * Disable interrupts on all receive queues, then on all transmit
 * queues.
 */
static void
vtnet_disable_interrupts(struct vtnet_softc *sc)
{
	vtnet_disable_rx_interrupts(sc);
	vtnet_disable_tx_interrupts(sc);
}
4361 
4362 static int
4363 vtnet_tunable_int(struct vtnet_softc *sc, const char *knob, int def)
4364 {
4365 	char path[64];
4366 
4367 	snprintf(path, sizeof(path),
4368 	    "hw.vtnet.%d.%s", device_get_unit(sc->vtnet_dev), knob);
4369 	TUNABLE_INT_FETCH(path, &def);
4370 
4371 	return (def);
4372 }
4373 
4374 #ifdef DEBUGNET
4375 static void
4376 vtnet_debugnet_init(if_t ifp, int *nrxr, int *ncl, int *clsize)
4377 {
4378 	struct vtnet_softc *sc;
4379 
4380 	sc = if_getsoftc(ifp);
4381 
4382 	VTNET_CORE_LOCK(sc);
4383 	*nrxr = sc->vtnet_req_vq_pairs;
4384 	*ncl = DEBUGNET_MAX_IN_FLIGHT;
4385 	*clsize = sc->vtnet_rx_clustersz;
4386 	VTNET_CORE_UNLOCK(sc);
4387 }
4388 
4389 static void
4390 vtnet_debugnet_event(if_t ifp __unused, enum debugnet_ev event)
4391 {
4392 	struct vtnet_softc *sc;
4393 	static bool sw_lro_enabled = false;
4394 
4395 	/*
4396 	 * Disable software LRO, since it would require entering the network
4397 	 * epoch when calling vtnet_txq_eof() in vtnet_debugnet_poll().
4398 	 */
4399 	sc = if_getsoftc(ifp);
4400 	switch (event) {
4401 	case DEBUGNET_START:
4402 		sw_lro_enabled = (sc->vtnet_flags & VTNET_FLAG_SW_LRO) != 0;
4403 		if (sw_lro_enabled)
4404 			sc->vtnet_flags &= ~VTNET_FLAG_SW_LRO;
4405 		break;
4406 	case DEBUGNET_END:
4407 		if (sw_lro_enabled)
4408 			sc->vtnet_flags |= VTNET_FLAG_SW_LRO;
4409 		break;
4410 	}
4411 }
4412 
4413 static int
4414 vtnet_debugnet_transmit(if_t ifp, struct mbuf *m)
4415 {
4416 	struct vtnet_softc *sc;
4417 	struct vtnet_txq *txq;
4418 	int error;
4419 
4420 	sc = if_getsoftc(ifp);
4421 	if ((if_getdrvflags(ifp) & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) !=
4422 	    IFF_DRV_RUNNING)
4423 		return (EBUSY);
4424 
4425 	txq = &sc->vtnet_txqs[0];
4426 	error = vtnet_txq_encap(txq, &m, M_NOWAIT | M_USE_RESERVE);
4427 	if (error == 0)
4428 		(void)vtnet_txq_notify(txq);
4429 	return (error);
4430 }
4431 
4432 static int
4433 vtnet_debugnet_poll(if_t ifp, int count)
4434 {
4435 	struct vtnet_softc *sc;
4436 	int i;
4437 
4438 	sc = if_getsoftc(ifp);
4439 	if ((if_getdrvflags(ifp) & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) !=
4440 	    IFF_DRV_RUNNING)
4441 		return (EBUSY);
4442 
4443 	(void)vtnet_txq_eof(&sc->vtnet_txqs[0]);
4444 	for (i = 0; i < sc->vtnet_act_vq_pairs; i++)
4445 		(void)vtnet_rxq_eof(&sc->vtnet_rxqs[i]);
4446 	return (0);
4447 }
4448 #endif /* DEBUGNET */
4449