xref: /freebsd/sys/dev/vmware/vmxnet3/if_vmx.c (revision 5ca8c28cd8c725b81781201cfdb5f9969396f934)
1 /*-
2  * Copyright (c) 2013 Tsubai Masanari
3  * Copyright (c) 2013 Bryan Venteicher <bryanv@FreeBSD.org>
4  * Copyright (c) 2018 Patrick Kelsey
5  *
6  * Permission to use, copy, modify, and distribute this software for any
7  * purpose with or without fee is hereby granted, provided that the above
8  * copyright notice and this permission notice appear in all copies.
9  *
10  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
11  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
13  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17  *
18  * $OpenBSD: src/sys/dev/pci/if_vmx.c,v 1.11 2013/06/22 00:28:10 uebayasi Exp $
19  */
20 
21 /* Driver for VMware vmxnet3 virtual ethernet devices. */
22 
23 #include <sys/cdefs.h>
24 #include "opt_rss.h"
25 
26 #include <sys/param.h>
27 #include <sys/systm.h>
28 #include <sys/kernel.h>
29 #include <sys/endian.h>
30 #include <sys/sockio.h>
31 #include <sys/mbuf.h>
32 #include <sys/malloc.h>
33 #include <sys/module.h>
34 #include <sys/socket.h>
35 #include <sys/sysctl.h>
36 #include <sys/smp.h>
37 #include <vm/vm.h>
38 #include <vm/pmap.h>
39 
40 #include <net/ethernet.h>
41 #include <net/if.h>
42 #include <net/if_var.h>
43 #include <net/if_arp.h>
44 #include <net/if_dl.h>
45 #include <net/if_types.h>
46 #include <net/if_media.h>
47 #include <net/if_vlan_var.h>
48 #include <net/iflib.h>
49 #ifdef RSS
50 #include <net/rss_config.h>
51 #endif
52 
53 #include <netinet/in_systm.h>
54 #include <netinet/in.h>
55 #include <netinet/ip.h>
56 #include <netinet/ip6.h>
57 #include <netinet6/ip6_var.h>
58 #include <netinet/udp.h>
59 #include <netinet/tcp.h>
60 
61 #include <machine/bus.h>
62 #include <machine/resource.h>
63 #include <sys/bus.h>
64 #include <sys/rman.h>
65 
66 #include <dev/pci/pcireg.h>
67 #include <dev/pci/pcivar.h>
68 
69 #include "ifdi_if.h"
70 
71 #include "if_vmxreg.h"
72 #include "if_vmxvar.h"
73 
74 #include "opt_inet.h"
75 #include "opt_inet6.h"
76 
77 #define VMXNET3_VMWARE_VENDOR_ID	0x15AD
78 #define VMXNET3_VMWARE_DEVICE_ID	0x07B0
79 
80 static const pci_vendor_info_t vmxnet3_vendor_info_array[] =
81 {
82 	PVID(VMXNET3_VMWARE_VENDOR_ID, VMXNET3_VMWARE_DEVICE_ID, "VMware VMXNET3 Ethernet Adapter"),
83 	/* required last entry */
84 	PVID_END
85 };
86 
87 static void	*vmxnet3_register(device_t);
88 static int	vmxnet3_attach_pre(if_ctx_t);
89 static int	vmxnet3_msix_intr_assign(if_ctx_t, int);
90 static void	vmxnet3_free_irqs(struct vmxnet3_softc *);
91 static int	vmxnet3_attach_post(if_ctx_t);
92 static int	vmxnet3_detach(if_ctx_t);
93 static int	vmxnet3_shutdown(if_ctx_t);
94 static int	vmxnet3_suspend(if_ctx_t);
95 static int	vmxnet3_resume(if_ctx_t);
96 
97 static int	vmxnet3_alloc_resources(struct vmxnet3_softc *);
98 static void	vmxnet3_free_resources(struct vmxnet3_softc *);
99 static int	vmxnet3_check_version(struct vmxnet3_softc *);
100 static void	vmxnet3_set_interrupt_idx(struct vmxnet3_softc *);
101 
102 static int	vmxnet3_queues_shared_alloc(struct vmxnet3_softc *);
103 static void	vmxnet3_init_txq(struct vmxnet3_softc *, int);
104 static int	vmxnet3_tx_queues_alloc(if_ctx_t, caddr_t *, uint64_t *, int, int);
105 static void	vmxnet3_init_rxq(struct vmxnet3_softc *, int, int);
106 static int	vmxnet3_rx_queues_alloc(if_ctx_t, caddr_t *, uint64_t *, int, int);
107 static void	vmxnet3_queues_free(if_ctx_t);
108 
109 static int	vmxnet3_alloc_shared_data(struct vmxnet3_softc *);
110 static void	vmxnet3_free_shared_data(struct vmxnet3_softc *);
111 static int	vmxnet3_alloc_mcast_table(struct vmxnet3_softc *);
112 static void	vmxnet3_free_mcast_table(struct vmxnet3_softc *);
113 static void	vmxnet3_init_shared_data(struct vmxnet3_softc *);
114 static void	vmxnet3_reinit_rss_shared_data(struct vmxnet3_softc *);
115 static void	vmxnet3_reinit_shared_data(struct vmxnet3_softc *);
116 static int	vmxnet3_alloc_data(struct vmxnet3_softc *);
117 static void	vmxnet3_free_data(struct vmxnet3_softc *);
118 
119 static void	vmxnet3_evintr(struct vmxnet3_softc *);
120 static int	vmxnet3_isc_txd_encap(void *, if_pkt_info_t);
121 static void	vmxnet3_isc_txd_flush(void *, uint16_t, qidx_t);
122 static int	vmxnet3_isc_txd_credits_update(void *, uint16_t, bool);
123 static int	vmxnet3_isc_rxd_available(void *, uint16_t, qidx_t, qidx_t);
124 static int	vmxnet3_isc_rxd_pkt_get(void *, if_rxd_info_t);
125 static void	vmxnet3_isc_rxd_refill(void *, if_rxd_update_t);
126 static void	vmxnet3_isc_rxd_flush(void *, uint16_t, uint8_t, qidx_t);
127 static int	vmxnet3_legacy_intr(void *);
128 static int	vmxnet3_rxq_intr(void *);
129 static int	vmxnet3_event_intr(void *);
130 
131 static void	vmxnet3_stop(if_ctx_t);
132 
133 static void	vmxnet3_txinit(struct vmxnet3_softc *, struct vmxnet3_txqueue *);
134 static void	vmxnet3_rxinit(struct vmxnet3_softc *, struct vmxnet3_rxqueue *);
135 static void	vmxnet3_reinit_queues(struct vmxnet3_softc *);
136 static int	vmxnet3_enable_device(struct vmxnet3_softc *);
137 static void	vmxnet3_reinit_rxfilters(struct vmxnet3_softc *);
138 static void	vmxnet3_init(if_ctx_t);
139 static void	vmxnet3_multi_set(if_ctx_t);
140 static int	vmxnet3_mtu_set(if_ctx_t, uint32_t);
141 static void	vmxnet3_media_status(if_ctx_t, struct ifmediareq *);
142 static int	vmxnet3_media_change(if_ctx_t);
143 static int	vmxnet3_promisc_set(if_ctx_t, int);
144 static uint64_t	vmxnet3_get_counter(if_ctx_t, ift_counter);
145 static void	vmxnet3_update_admin_status(if_ctx_t);
146 static void	vmxnet3_txq_timer(if_ctx_t, uint16_t);
147 
148 static void	vmxnet3_update_vlan_filter(struct vmxnet3_softc *, int,
149 		    uint16_t);
150 static void	vmxnet3_vlan_register(if_ctx_t, uint16_t);
151 static void	vmxnet3_vlan_unregister(if_ctx_t, uint16_t);
152 static void	vmxnet3_set_rxfilter(struct vmxnet3_softc *, int);
153 
154 static void	vmxnet3_refresh_host_stats(struct vmxnet3_softc *);
155 static int	vmxnet3_link_is_up(struct vmxnet3_softc *);
156 static void	vmxnet3_link_status(struct vmxnet3_softc *);
157 static void	vmxnet3_set_lladdr(struct vmxnet3_softc *);
158 static void	vmxnet3_get_lladdr(struct vmxnet3_softc *);
159 
160 static void	vmxnet3_setup_txq_sysctl(struct vmxnet3_txqueue *,
161 		    struct sysctl_ctx_list *, struct sysctl_oid_list *);
162 static void	vmxnet3_setup_rxq_sysctl(struct vmxnet3_rxqueue *,
163 		    struct sysctl_ctx_list *, struct sysctl_oid_list *);
164 static void	vmxnet3_setup_queue_sysctl(struct vmxnet3_softc *,
165 		    struct sysctl_ctx_list *, struct sysctl_oid_list *);
166 static void	vmxnet3_setup_sysctl(struct vmxnet3_softc *);
167 
168 static void	vmxnet3_write_bar0(struct vmxnet3_softc *, bus_size_t,
169 		    uint32_t);
170 static uint32_t	vmxnet3_read_bar1(struct vmxnet3_softc *, bus_size_t);
171 static void	vmxnet3_write_bar1(struct vmxnet3_softc *, bus_size_t,
172 		    uint32_t);
173 static void	vmxnet3_write_cmd(struct vmxnet3_softc *, uint32_t);
174 static uint32_t	vmxnet3_read_cmd(struct vmxnet3_softc *, uint32_t);
175 
176 static int	vmxnet3_tx_queue_intr_enable(if_ctx_t, uint16_t);
177 static int	vmxnet3_rx_queue_intr_enable(if_ctx_t, uint16_t);
178 static void	vmxnet3_link_intr_enable(if_ctx_t);
179 static void	vmxnet3_enable_intr(struct vmxnet3_softc *, int);
180 static void	vmxnet3_disable_intr(struct vmxnet3_softc *, int);
181 static void	vmxnet3_intr_enable_all(if_ctx_t);
182 static void	vmxnet3_intr_disable_all(if_ctx_t);
183 static bool	vmxnet3_if_needs_restart(if_ctx_t, enum iflib_restart_event);
184 
/* Barrier kinds accepted by vmxnet3_barrier(). */
typedef enum {
	VMXNET3_BARRIER_RD,
	VMXNET3_BARRIER_WR,
	VMXNET3_BARRIER_RDWR,
} vmxnet3_barrier_t;

static void	vmxnet3_barrier(struct vmxnet3_softc *, vmxnet3_barrier_t);
192 
193 static device_method_t vmxnet3_methods[] = {
194 	/* Device interface */
195 	DEVMETHOD(device_register, vmxnet3_register),
196 	DEVMETHOD(device_probe, iflib_device_probe),
197 	DEVMETHOD(device_attach, iflib_device_attach),
198 	DEVMETHOD(device_detach, iflib_device_detach),
199 	DEVMETHOD(device_shutdown, iflib_device_shutdown),
200 	DEVMETHOD(device_suspend, iflib_device_suspend),
201 	DEVMETHOD(device_resume, iflib_device_resume),
202 	DEVMETHOD_END
203 };
204 
205 static driver_t vmxnet3_driver = {
206 	"vmx", vmxnet3_methods, sizeof(struct vmxnet3_softc)
207 };
208 
209 DRIVER_MODULE(vmx, pci, vmxnet3_driver, 0, 0);
210 IFLIB_PNP_INFO(pci, vmx, vmxnet3_vendor_info_array);
211 MODULE_VERSION(vmx, 2);
212 
213 MODULE_DEPEND(vmx, pci, 1, 1, 1);
214 MODULE_DEPEND(vmx, ether, 1, 1, 1);
215 MODULE_DEPEND(vmx, iflib, 1, 1, 1);
216 
217 static device_method_t vmxnet3_iflib_methods[] = {
218 	DEVMETHOD(ifdi_tx_queues_alloc, vmxnet3_tx_queues_alloc),
219 	DEVMETHOD(ifdi_rx_queues_alloc, vmxnet3_rx_queues_alloc),
220 	DEVMETHOD(ifdi_queues_free, vmxnet3_queues_free),
221 
222 	DEVMETHOD(ifdi_attach_pre, vmxnet3_attach_pre),
223 	DEVMETHOD(ifdi_attach_post, vmxnet3_attach_post),
224 	DEVMETHOD(ifdi_detach, vmxnet3_detach),
225 
226 	DEVMETHOD(ifdi_init, vmxnet3_init),
227 	DEVMETHOD(ifdi_stop, vmxnet3_stop),
228 	DEVMETHOD(ifdi_multi_set, vmxnet3_multi_set),
229 	DEVMETHOD(ifdi_mtu_set, vmxnet3_mtu_set),
230 	DEVMETHOD(ifdi_media_status, vmxnet3_media_status),
231 	DEVMETHOD(ifdi_media_change, vmxnet3_media_change),
232 	DEVMETHOD(ifdi_promisc_set, vmxnet3_promisc_set),
233 	DEVMETHOD(ifdi_get_counter, vmxnet3_get_counter),
234 	DEVMETHOD(ifdi_update_admin_status, vmxnet3_update_admin_status),
235 	DEVMETHOD(ifdi_timer, vmxnet3_txq_timer),
236 
237 	DEVMETHOD(ifdi_tx_queue_intr_enable, vmxnet3_tx_queue_intr_enable),
238 	DEVMETHOD(ifdi_rx_queue_intr_enable, vmxnet3_rx_queue_intr_enable),
239 	DEVMETHOD(ifdi_link_intr_enable, vmxnet3_link_intr_enable),
240 	DEVMETHOD(ifdi_intr_enable, vmxnet3_intr_enable_all),
241 	DEVMETHOD(ifdi_intr_disable, vmxnet3_intr_disable_all),
242 	DEVMETHOD(ifdi_msix_intr_assign, vmxnet3_msix_intr_assign),
243 
244 	DEVMETHOD(ifdi_vlan_register, vmxnet3_vlan_register),
245 	DEVMETHOD(ifdi_vlan_unregister, vmxnet3_vlan_unregister),
246 
247 	DEVMETHOD(ifdi_shutdown, vmxnet3_shutdown),
248 	DEVMETHOD(ifdi_suspend, vmxnet3_suspend),
249 	DEVMETHOD(ifdi_resume, vmxnet3_resume),
250 
251 	DEVMETHOD(ifdi_needs_restart, vmxnet3_if_needs_restart),
252 
253 	DEVMETHOD_END
254 };
255 
256 static driver_t vmxnet3_iflib_driver = {
257 	"vmx", vmxnet3_iflib_methods, sizeof(struct vmxnet3_softc)
258 };
259 
260 struct if_txrx vmxnet3_txrx = {
261 	.ift_txd_encap = vmxnet3_isc_txd_encap,
262 	.ift_txd_flush = vmxnet3_isc_txd_flush,
263 	.ift_txd_credits_update = vmxnet3_isc_txd_credits_update,
264 	.ift_rxd_available = vmxnet3_isc_rxd_available,
265 	.ift_rxd_pkt_get = vmxnet3_isc_rxd_pkt_get,
266 	.ift_rxd_refill = vmxnet3_isc_rxd_refill,
267 	.ift_rxd_flush = vmxnet3_isc_rxd_flush,
268 	.ift_legacy_intr = vmxnet3_legacy_intr
269 };
270 
271 static struct if_shared_ctx vmxnet3_sctx_init = {
272 	.isc_magic = IFLIB_MAGIC,
273 	.isc_q_align = 512,
274 
275 	.isc_tx_maxsize = VMXNET3_TX_MAXSIZE,
276 	.isc_tx_maxsegsize = VMXNET3_TX_MAXSEGSIZE,
277 	.isc_tso_maxsize = VMXNET3_TSO_MAXSIZE + sizeof(struct ether_vlan_header),
278 	.isc_tso_maxsegsize = VMXNET3_TX_MAXSEGSIZE,
279 
280 	/*
281 	 * These values are used to configure the busdma tag used for
282 	 * receive descriptors.  Each receive descriptor only points to one
283 	 * buffer.
284 	 */
285 	.isc_rx_maxsize = VMXNET3_RX_MAXSEGSIZE, /* One buf per descriptor */
286 	.isc_rx_nsegments = 1,  /* One mapping per descriptor */
287 	.isc_rx_maxsegsize = VMXNET3_RX_MAXSEGSIZE,
288 
289 	.isc_admin_intrcnt = 1,
290 	.isc_vendor_info = vmxnet3_vendor_info_array,
291 	.isc_driver_version = "2",
292 	.isc_driver = &vmxnet3_iflib_driver,
293 	.isc_flags = IFLIB_HAS_RXCQ | IFLIB_HAS_TXCQ | IFLIB_SINGLE_IRQ_RX_ONLY,
294 
295 	/*
296 	 * Number of receive queues per receive queue set, with associated
297 	 * descriptor settings for each.
298 	 */
299 	.isc_nrxqs = 3,
300 	.isc_nfl = 2, /* one free list for each receive command queue */
301 	.isc_nrxd_min = {VMXNET3_MIN_RX_NDESC, VMXNET3_MIN_RX_NDESC, VMXNET3_MIN_RX_NDESC},
302 	.isc_nrxd_max = {VMXNET3_MAX_RX_NDESC, VMXNET3_MAX_RX_NDESC, VMXNET3_MAX_RX_NDESC},
303 	.isc_nrxd_default = {VMXNET3_DEF_RX_NDESC, VMXNET3_DEF_RX_NDESC, VMXNET3_DEF_RX_NDESC},
304 
305 	/*
306 	 * Number of transmit queues per transmit queue set, with associated
307 	 * descriptor settings for each.
308 	 */
309 	.isc_ntxqs = 2,
310 	.isc_ntxd_min = {VMXNET3_MIN_TX_NDESC, VMXNET3_MIN_TX_NDESC},
311 	.isc_ntxd_max = {VMXNET3_MAX_TX_NDESC, VMXNET3_MAX_TX_NDESC},
312 	.isc_ntxd_default = {VMXNET3_DEF_TX_NDESC, VMXNET3_DEF_TX_NDESC},
313 };
314 
315 static void *
316 vmxnet3_register(device_t dev)
317 {
318 	return (&vmxnet3_sctx_init);
319 }
320 
/*
 * Round val down to a power of two.
 *
 * Returns the largest power of two that is less than or equal to val.
 * When val is zero or negative no such power of two exists and 0 is
 * returned; handling that case up front avoids ever shifting by a
 * negative count.
 */
static int
trunc_powerof2(int val)
{
	int pow2;

	if (val <= 0)
		return (0);

	/*
	 * Double pow2 while the doubled value still fits in val.  Comparing
	 * against val / 2 keeps pow2 from overflowing for large val.
	 */
	pow2 = 1;
	while (pow2 <= val / 2)
		pow2 <<= 1;

	return (pow2);
}
327 
328 static int
329 vmxnet3_attach_pre(if_ctx_t ctx)
330 {
331 	device_t dev;
332 	if_softc_ctx_t scctx;
333 	struct vmxnet3_softc *sc;
334 	uint32_t intr_config;
335 	int error;
336 
337 	dev = iflib_get_dev(ctx);
338 	sc = iflib_get_softc(ctx);
339 	sc->vmx_dev = dev;
340 	sc->vmx_ctx = ctx;
341 	sc->vmx_sctx = iflib_get_sctx(ctx);
342 	sc->vmx_scctx = iflib_get_softc_ctx(ctx);
343 	sc->vmx_ifp = iflib_get_ifp(ctx);
344 	sc->vmx_media = iflib_get_media(ctx);
345 	scctx = sc->vmx_scctx;
346 
347 	scctx->isc_tx_nsegments = VMXNET3_TX_MAXSEGS;
348 	scctx->isc_tx_tso_segments_max = VMXNET3_TX_MAXSEGS;
349 	/* isc_tx_tso_size_max doesn't include possible vlan header */
350 	scctx->isc_tx_tso_size_max = VMXNET3_TSO_MAXSIZE;
351 	scctx->isc_tx_tso_segsize_max = VMXNET3_TX_MAXSEGSIZE;
352 	scctx->isc_txrx = &vmxnet3_txrx;
353 
354 	/* If 0, the iflib tunable was not set, so set to the default */
355 	if (scctx->isc_nrxqsets == 0)
356 		scctx->isc_nrxqsets = VMXNET3_DEF_RX_QUEUES;
357 	scctx->isc_nrxqsets = trunc_powerof2(scctx->isc_nrxqsets);
358 	scctx->isc_nrxqsets_max = min(VMXNET3_MAX_RX_QUEUES, mp_ncpus);
359 	scctx->isc_nrxqsets_max = trunc_powerof2(scctx->isc_nrxqsets_max);
360 
361 	/* If 0, the iflib tunable was not set, so set to the default */
362 	if (scctx->isc_ntxqsets == 0)
363 		scctx->isc_ntxqsets = VMXNET3_DEF_TX_QUEUES;
364 	scctx->isc_ntxqsets = trunc_powerof2(scctx->isc_ntxqsets);
365 	scctx->isc_ntxqsets_max = min(VMXNET3_MAX_TX_QUEUES, mp_ncpus);
366 	scctx->isc_ntxqsets_max = trunc_powerof2(scctx->isc_ntxqsets_max);
367 
368 	/*
369 	 * Enforce that the transmit completion queue descriptor count is
370 	 * the same as the transmit command queue descriptor count.
371 	 */
372 	scctx->isc_ntxd[0] = scctx->isc_ntxd[1];
373 	scctx->isc_txqsizes[0] =
374 	    sizeof(struct vmxnet3_txcompdesc) * scctx->isc_ntxd[0];
375 	scctx->isc_txqsizes[1] =
376 	    sizeof(struct vmxnet3_txdesc) * scctx->isc_ntxd[1];
377 
378 	/*
379 	 * Enforce that the receive completion queue descriptor count is the
380 	 * sum of the receive command queue descriptor counts, and that the
381 	 * second receive command queue descriptor count is the same as the
382 	 * first one.
383 	 */
384 	scctx->isc_nrxd[2] = scctx->isc_nrxd[1];
385 	scctx->isc_nrxd[0] = scctx->isc_nrxd[1] + scctx->isc_nrxd[2];
386 	scctx->isc_rxqsizes[0] =
387 	    sizeof(struct vmxnet3_rxcompdesc) * scctx->isc_nrxd[0];
388 	scctx->isc_rxqsizes[1] =
389 	    sizeof(struct vmxnet3_rxdesc) * scctx->isc_nrxd[1];
390 	scctx->isc_rxqsizes[2] =
391 	    sizeof(struct vmxnet3_rxdesc) * scctx->isc_nrxd[2];
392 
393 	/*
394 	 * Initialize the max frame size and descriptor queue buffer
395 	 * sizes.
396 	 */
397 	vmxnet3_mtu_set(ctx, if_getmtu(sc->vmx_ifp));
398 
399 	scctx->isc_rss_table_size = UPT1_RSS_MAX_IND_TABLE_SIZE;
400 
401 	/* Map PCI BARs */
402 	error = vmxnet3_alloc_resources(sc);
403 	if (error)
404 		goto fail;
405 
406 	/* Check device versions */
407 	error = vmxnet3_check_version(sc);
408 	if (error)
409 		goto fail;
410 
411 	/*
412 	 * The interrupt mode can be set in the hypervisor configuration via
413 	 * the parameter ethernet<N>.intrMode.
414 	 */
415 	intr_config = vmxnet3_read_cmd(sc, VMXNET3_CMD_GET_INTRCFG);
416 	sc->vmx_intr_mask_mode = (intr_config >> 2) & 0x03;
417 
418 	/*
419 	 * Configure the softc context to attempt to configure the interrupt
420 	 * mode now indicated by intr_config.  iflib will follow the usual
421 	 * fallback path MSI-X -> MSI -> LEGACY, starting at the configured
422 	 * starting mode.
423 	 */
424 	switch (intr_config & 0x03) {
425 	case VMXNET3_IT_AUTO:
426 	case VMXNET3_IT_MSIX:
427 		scctx->isc_msix_bar = pci_msix_table_bar(dev);
428 		break;
429 	case VMXNET3_IT_MSI:
430 		scctx->isc_msix_bar = -1;
431 		scctx->isc_disable_msix = 1;
432 		break;
433 	case VMXNET3_IT_LEGACY:
434 		scctx->isc_msix_bar = 0;
435 		break;
436 	}
437 
438 	scctx->isc_tx_csum_flags = VMXNET3_CSUM_ALL_OFFLOAD;
439 	scctx->isc_capabilities = scctx->isc_capenable =
440 	    IFCAP_TXCSUM | IFCAP_TXCSUM_IPV6 |
441 	    IFCAP_TSO4 | IFCAP_TSO6 |
442 	    IFCAP_RXCSUM | IFCAP_RXCSUM_IPV6 |
443 	    IFCAP_VLAN_MTU | IFCAP_VLAN_HWTAGGING |
444 	    IFCAP_VLAN_HWCSUM | IFCAP_VLAN_HWTSO |
445 	    IFCAP_JUMBO_MTU;
446 
447 	/* These capabilities are not enabled by default. */
448 	scctx->isc_capabilities |= IFCAP_LRO | IFCAP_VLAN_HWFILTER;
449 
450 	vmxnet3_get_lladdr(sc);
451 	iflib_set_mac(ctx, sc->vmx_lladdr);
452 
453 	return (0);
454 fail:
455 	/*
456 	 * We must completely clean up anything allocated above as iflib
457 	 * will not invoke any other driver entry points as a result of this
458 	 * failure.
459 	 */
460 	vmxnet3_free_resources(sc);
461 
462 	return (error);
463 }
464 
465 static int
466 vmxnet3_msix_intr_assign(if_ctx_t ctx, int msix)
467 {
468 	struct vmxnet3_softc *sc;
469 	if_softc_ctx_t scctx;
470 	struct vmxnet3_rxqueue *rxq;
471 	int error;
472 	int i;
473 	char irq_name[16];
474 
475 	sc = iflib_get_softc(ctx);
476 	scctx = sc->vmx_scctx;
477 
478 	for (i = 0; i < scctx->isc_nrxqsets; i++) {
479 		snprintf(irq_name, sizeof(irq_name), "rxq%d", i);
480 
481 		rxq = &sc->vmx_rxq[i];
482 		error = iflib_irq_alloc_generic(ctx, &rxq->vxrxq_irq, i + 1,
483 		    IFLIB_INTR_RXTX, vmxnet3_rxq_intr, rxq, i, irq_name);
484 		if (error) {
485 			device_printf(iflib_get_dev(ctx),
486 			    "Failed to register rxq %d interrupt handler\n", i);
487 			return (error);
488 		}
489 	}
490 
491 	for (i = 0; i < scctx->isc_ntxqsets; i++) {
492 		snprintf(irq_name, sizeof(irq_name), "txq%d", i);
493 
494 		/*
495 		 * Don't provide the corresponding rxq irq for reference -
496 		 * we want the transmit task to be attached to a task queue
497 		 * that is different from the one used by the corresponding
498 		 * rxq irq.  That is because the TX doorbell writes are very
499 		 * expensive as virtualized MMIO operations, so we want to
500 		 * be able to defer them to another core when possible so
501 		 * that they don't steal receive processing cycles during
502 		 * stack turnarounds like TCP ACK generation.  The other
503 		 * piece to this approach is enabling the iflib abdicate
504 		 * option (currently via an interface-specific
505 		 * tunable/sysctl).
506 		 */
507 		iflib_softirq_alloc_generic(ctx, NULL, IFLIB_INTR_TX, NULL, i,
508 		    irq_name);
509 	}
510 
511 	error = iflib_irq_alloc_generic(ctx, &sc->vmx_event_intr_irq,
512 	    scctx->isc_nrxqsets + 1, IFLIB_INTR_ADMIN, vmxnet3_event_intr, sc, 0,
513 	    "event");
514 	if (error) {
515 		device_printf(iflib_get_dev(ctx),
516 		    "Failed to register event interrupt handler\n");
517 		return (error);
518 	}
519 
520 	return (0);
521 }
522 
523 static void
524 vmxnet3_free_irqs(struct vmxnet3_softc *sc)
525 {
526 	if_softc_ctx_t scctx;
527 	struct vmxnet3_rxqueue *rxq;
528 	int i;
529 
530 	scctx = sc->vmx_scctx;
531 
532 	for (i = 0; i < scctx->isc_nrxqsets; i++) {
533 		rxq = &sc->vmx_rxq[i];
534 		iflib_irq_free(sc->vmx_ctx, &rxq->vxrxq_irq);
535 	}
536 
537 	iflib_irq_free(sc->vmx_ctx, &sc->vmx_event_intr_irq);
538 }
539 
540 static int
541 vmxnet3_attach_post(if_ctx_t ctx)
542 {
543 	if_softc_ctx_t scctx;
544 	struct vmxnet3_softc *sc;
545 	int error;
546 
547 	scctx = iflib_get_softc_ctx(ctx);
548 	sc = iflib_get_softc(ctx);
549 
550 	if (scctx->isc_nrxqsets > 1)
551 		sc->vmx_flags |= VMXNET3_FLAG_RSS;
552 
553 	error = vmxnet3_alloc_data(sc);
554 	if (error)
555 		goto fail;
556 
557 	vmxnet3_set_interrupt_idx(sc);
558 	vmxnet3_setup_sysctl(sc);
559 
560 	ifmedia_add(sc->vmx_media, IFM_ETHER | IFM_AUTO, 0, NULL);
561 	ifmedia_set(sc->vmx_media, IFM_ETHER | IFM_AUTO);
562 
563 fail:
564 	return (error);
565 }
566 
567 static int
568 vmxnet3_detach(if_ctx_t ctx)
569 {
570 	struct vmxnet3_softc *sc;
571 
572 	sc = iflib_get_softc(ctx);
573 
574 	vmxnet3_free_irqs(sc);
575 	vmxnet3_free_data(sc);
576 	vmxnet3_free_resources(sc);
577 
578 	return (0);
579 }
580 
581 static int
582 vmxnet3_shutdown(if_ctx_t ctx)
583 {
584 
585 	return (0);
586 }
587 
588 static int
589 vmxnet3_suspend(if_ctx_t ctx)
590 {
591 
592 	return (0);
593 }
594 
595 static int
596 vmxnet3_resume(if_ctx_t ctx)
597 {
598 
599 	return (0);
600 }
601 
602 static int
603 vmxnet3_alloc_resources(struct vmxnet3_softc *sc)
604 {
605 	device_t dev;
606 	int rid;
607 
608 	dev = sc->vmx_dev;
609 
610 	rid = PCIR_BAR(0);
611 	sc->vmx_res0 = bus_alloc_resource_any(dev, SYS_RES_MEMORY, &rid,
612 	    RF_ACTIVE);
613 	if (sc->vmx_res0 == NULL) {
614 		device_printf(dev,
615 		    "could not map BAR0 memory\n");
616 		return (ENXIO);
617 	}
618 
619 	sc->vmx_iot0 = rman_get_bustag(sc->vmx_res0);
620 	sc->vmx_ioh0 = rman_get_bushandle(sc->vmx_res0);
621 
622 	rid = PCIR_BAR(1);
623 	sc->vmx_res1 = bus_alloc_resource_any(dev, SYS_RES_MEMORY, &rid,
624 	    RF_ACTIVE);
625 	if (sc->vmx_res1 == NULL) {
626 		device_printf(dev,
627 		    "could not map BAR1 memory\n");
628 		return (ENXIO);
629 	}
630 
631 	sc->vmx_iot1 = rman_get_bustag(sc->vmx_res1);
632 	sc->vmx_ioh1 = rman_get_bushandle(sc->vmx_res1);
633 
634 	return (0);
635 }
636 
637 static void
638 vmxnet3_free_resources(struct vmxnet3_softc *sc)
639 {
640 	device_t dev;
641 
642 	dev = sc->vmx_dev;
643 
644 	if (sc->vmx_res0 != NULL) {
645 		bus_release_resource(dev, SYS_RES_MEMORY,
646 		    rman_get_rid(sc->vmx_res0), sc->vmx_res0);
647 		sc->vmx_res0 = NULL;
648 	}
649 
650 	if (sc->vmx_res1 != NULL) {
651 		bus_release_resource(dev, SYS_RES_MEMORY,
652 		    rman_get_rid(sc->vmx_res1), sc->vmx_res1);
653 		sc->vmx_res1 = NULL;
654 	}
655 }
656 
657 static int
658 vmxnet3_check_version(struct vmxnet3_softc *sc)
659 {
660 	device_t dev;
661 	uint32_t version;
662 
663 	dev = sc->vmx_dev;
664 
665 	version = vmxnet3_read_bar1(sc, VMXNET3_BAR1_VRRS);
666 	if ((version & 0x01) == 0) {
667 		device_printf(dev, "unsupported hardware version %#x\n",
668 		    version);
669 		return (ENOTSUP);
670 	}
671 	vmxnet3_write_bar1(sc, VMXNET3_BAR1_VRRS, 1);
672 
673 	version = vmxnet3_read_bar1(sc, VMXNET3_BAR1_UVRS);
674 	if ((version & 0x01) == 0) {
675 		device_printf(dev, "unsupported UPT version %#x\n", version);
676 		return (ENOTSUP);
677 	}
678 	vmxnet3_write_bar1(sc, VMXNET3_BAR1_UVRS, 1);
679 
680 	return (0);
681 }
682 
683 static void
684 vmxnet3_set_interrupt_idx(struct vmxnet3_softc *sc)
685 {
686 	if_softc_ctx_t scctx;
687 	struct vmxnet3_txqueue *txq;
688 	struct vmxnet3_txq_shared *txs;
689 	struct vmxnet3_rxqueue *rxq;
690 	struct vmxnet3_rxq_shared *rxs;
691 	int intr_idx;
692 	int i;
693 
694 	scctx = sc->vmx_scctx;
695 
696 	/*
697 	 * There is always one interrupt per receive queue, assigned
698 	 * starting with the first interrupt.  When there is only one
699 	 * interrupt available, the event interrupt shares the receive queue
700 	 * interrupt, otherwise it uses the interrupt following the last
701 	 * receive queue interrupt.  Transmit queues are not assigned
702 	 * interrupts, so they are given indexes beyond the indexes that
703 	 * correspond to the real interrupts.
704 	 */
705 
706 	/* The event interrupt is always the last vector. */
707 	sc->vmx_event_intr_idx = scctx->isc_vectors - 1;
708 
709 	intr_idx = 0;
710 	for (i = 0; i < scctx->isc_nrxqsets; i++, intr_idx++) {
711 		rxq = &sc->vmx_rxq[i];
712 		rxs = rxq->vxrxq_rs;
713 		rxq->vxrxq_intr_idx = intr_idx;
714 		rxs->intr_idx = rxq->vxrxq_intr_idx;
715 	}
716 
717 	/*
718 	 * Assign the tx queues interrupt indexes above what we are actually
719 	 * using.  These interrupts will never be enabled.
720 	 */
721 	intr_idx = scctx->isc_vectors;
722 	for (i = 0; i < scctx->isc_ntxqsets; i++, intr_idx++) {
723 		txq = &sc->vmx_txq[i];
724 		txs = txq->vxtxq_ts;
725 		txq->vxtxq_intr_idx = intr_idx;
726 		txs->intr_idx = txq->vxtxq_intr_idx;
727 	}
728 }
729 
730 static int
731 vmxnet3_queues_shared_alloc(struct vmxnet3_softc *sc)
732 {
733 	if_softc_ctx_t scctx;
734 	int size;
735 	int error;
736 
737 	scctx = sc->vmx_scctx;
738 
739 	/*
740 	 * The txq and rxq shared data areas must be allocated contiguously
741 	 * as vmxnet3_driver_shared contains only a single address member
742 	 * for the shared queue data area.
743 	 */
744 	size = scctx->isc_ntxqsets * sizeof(struct vmxnet3_txq_shared) +
745 	    scctx->isc_nrxqsets * sizeof(struct vmxnet3_rxq_shared);
746 	error = iflib_dma_alloc_align(sc->vmx_ctx, size, 128, &sc->vmx_qs_dma, 0);
747 	if (error) {
748 		device_printf(sc->vmx_dev, "cannot alloc queue shared memory\n");
749 		return (error);
750 	}
751 
752 	return (0);
753 }
754 
755 static void
756 vmxnet3_init_txq(struct vmxnet3_softc *sc, int q)
757 {
758 	struct vmxnet3_txqueue *txq;
759 	struct vmxnet3_comp_ring *txc;
760 	struct vmxnet3_txring *txr;
761 	if_softc_ctx_t scctx;
762 
763 	txq = &sc->vmx_txq[q];
764 	txc = &txq->vxtxq_comp_ring;
765 	txr = &txq->vxtxq_cmd_ring;
766 	scctx = sc->vmx_scctx;
767 
768 	snprintf(txq->vxtxq_name, sizeof(txq->vxtxq_name), "%s-tx%d",
769 	    device_get_nameunit(sc->vmx_dev), q);
770 
771 	txq->vxtxq_sc = sc;
772 	txq->vxtxq_id = q;
773 	txc->vxcr_ndesc = scctx->isc_ntxd[0];
774 	txr->vxtxr_ndesc = scctx->isc_ntxd[1];
775 }
776 
777 static int
778 vmxnet3_tx_queues_alloc(if_ctx_t ctx, caddr_t *vaddrs, uint64_t *paddrs,
779     int ntxqs, int ntxqsets)
780 {
781 	struct vmxnet3_softc *sc;
782 	int q;
783 	int error;
784 	caddr_t kva;
785 
786 	sc = iflib_get_softc(ctx);
787 
788 	/* Allocate the array of transmit queues */
789 	sc->vmx_txq = malloc(sizeof(struct vmxnet3_txqueue) *
790 	    ntxqsets, M_DEVBUF, M_NOWAIT | M_ZERO);
791 	if (sc->vmx_txq == NULL)
792 		return (ENOMEM);
793 
794 	/* Initialize driver state for each transmit queue */
795 	for (q = 0; q < ntxqsets; q++)
796 		vmxnet3_init_txq(sc, q);
797 
798 	/*
799 	 * Allocate queue state that is shared with the device.  This check
800 	 * and call is performed in both vmxnet3_tx_queues_alloc() and
801 	 * vmxnet3_rx_queues_alloc() so that we don't have to care which
802 	 * order iflib invokes those routines in.
803 	 */
804 	if (sc->vmx_qs_dma.idi_size == 0) {
805 		error = vmxnet3_queues_shared_alloc(sc);
806 		if (error)
807 			return (error);
808 	}
809 
810 	kva = sc->vmx_qs_dma.idi_vaddr;
811 	for (q = 0; q < ntxqsets; q++) {
812 		sc->vmx_txq[q].vxtxq_ts = (struct vmxnet3_txq_shared *) kva;
813 		kva += sizeof(struct vmxnet3_txq_shared);
814 	}
815 
816 	/* Record descriptor ring vaddrs and paddrs */
817 	for (q = 0; q < ntxqsets; q++) {
818 		struct vmxnet3_txqueue *txq;
819 		struct vmxnet3_txring *txr;
820 		struct vmxnet3_comp_ring *txc;
821 
822 		txq = &sc->vmx_txq[q];
823 		txc = &txq->vxtxq_comp_ring;
824 		txr = &txq->vxtxq_cmd_ring;
825 
826 		/* Completion ring */
827 		txc->vxcr_u.txcd =
828 		    (struct vmxnet3_txcompdesc *) vaddrs[q * ntxqs + 0];
829 		txc->vxcr_paddr = paddrs[q * ntxqs + 0];
830 
831 		/* Command ring */
832 		txr->vxtxr_txd =
833 		    (struct vmxnet3_txdesc *) vaddrs[q * ntxqs + 1];
834 		txr->vxtxr_paddr = paddrs[q * ntxqs + 1];
835 	}
836 
837 	return (0);
838 }
839 
840 static void
841 vmxnet3_init_rxq(struct vmxnet3_softc *sc, int q, int nrxqs)
842 {
843 	struct vmxnet3_rxqueue *rxq;
844 	struct vmxnet3_comp_ring *rxc;
845 	struct vmxnet3_rxring *rxr;
846 	if_softc_ctx_t scctx;
847 	int i;
848 
849 	rxq = &sc->vmx_rxq[q];
850 	rxc = &rxq->vxrxq_comp_ring;
851 	scctx = sc->vmx_scctx;
852 
853 	snprintf(rxq->vxrxq_name, sizeof(rxq->vxrxq_name), "%s-rx%d",
854 	    device_get_nameunit(sc->vmx_dev), q);
855 
856 	rxq->vxrxq_sc = sc;
857 	rxq->vxrxq_id = q;
858 
859 	/*
860 	 * First rxq is the completion queue, so there are nrxqs - 1 command
861 	 * rings starting at iflib queue id 1.
862 	 */
863 	rxc->vxcr_ndesc = scctx->isc_nrxd[0];
864 	for (i = 0; i < nrxqs - 1; i++) {
865 		rxr = &rxq->vxrxq_cmd_ring[i];
866 		rxr->vxrxr_ndesc = scctx->isc_nrxd[i + 1];
867 	}
868 }
869 
870 static int
871 vmxnet3_rx_queues_alloc(if_ctx_t ctx, caddr_t *vaddrs, uint64_t *paddrs,
872     int nrxqs, int nrxqsets)
873 {
874 	struct vmxnet3_softc *sc;
875 	if_softc_ctx_t scctx;
876 	int q;
877 	int i;
878 	int error;
879 	caddr_t kva;
880 
881 	sc = iflib_get_softc(ctx);
882 	scctx = sc->vmx_scctx;
883 
884 	/* Allocate the array of receive queues */
885 	sc->vmx_rxq = malloc(sizeof(struct vmxnet3_rxqueue) *
886 	    nrxqsets, M_DEVBUF, M_NOWAIT | M_ZERO);
887 	if (sc->vmx_rxq == NULL)
888 		return (ENOMEM);
889 
890 	/* Initialize driver state for each receive queue */
891 	for (q = 0; q < nrxqsets; q++)
892 		vmxnet3_init_rxq(sc, q, nrxqs);
893 
894 	/*
895 	 * Allocate queue state that is shared with the device.  This check
896 	 * and call is performed in both vmxnet3_tx_queues_alloc() and
897 	 * vmxnet3_rx_queues_alloc() so that we don't have to care which
898 	 * order iflib invokes those routines in.
899 	 */
900 	if (sc->vmx_qs_dma.idi_size == 0) {
901 		error = vmxnet3_queues_shared_alloc(sc);
902 		if (error)
903 			return (error);
904 	}
905 
906 	kva = sc->vmx_qs_dma.idi_vaddr +
907 	    scctx->isc_ntxqsets * sizeof(struct vmxnet3_txq_shared);
908 	for (q = 0; q < nrxqsets; q++) {
909 		sc->vmx_rxq[q].vxrxq_rs = (struct vmxnet3_rxq_shared *) kva;
910 		kva += sizeof(struct vmxnet3_rxq_shared);
911 	}
912 
913 	/* Record descriptor ring vaddrs and paddrs */
914 	for (q = 0; q < nrxqsets; q++) {
915 		struct vmxnet3_rxqueue *rxq;
916 		struct vmxnet3_rxring *rxr;
917 		struct vmxnet3_comp_ring *rxc;
918 
919 		rxq = &sc->vmx_rxq[q];
920 		rxc = &rxq->vxrxq_comp_ring;
921 
922 		/* Completion ring */
923 		rxc->vxcr_u.rxcd =
924 		    (struct vmxnet3_rxcompdesc *) vaddrs[q * nrxqs + 0];
925 		rxc->vxcr_paddr = paddrs[q * nrxqs + 0];
926 
927 		/* Command ring(s) */
928 		for (i = 0; i < nrxqs - 1; i++) {
929 			rxr = &rxq->vxrxq_cmd_ring[i];
930 
931 			rxr->vxrxr_rxd =
932 			    (struct vmxnet3_rxdesc *) vaddrs[q * nrxqs + 1 + i];
933 			rxr->vxrxr_paddr = paddrs[q * nrxqs + 1 + i];
934 		}
935 	}
936 
937 	return (0);
938 }
939 
940 static void
941 vmxnet3_queues_free(if_ctx_t ctx)
942 {
943 	struct vmxnet3_softc *sc;
944 
945 	sc = iflib_get_softc(ctx);
946 
947 	/* Free queue state area that is shared with the device */
948 	if (sc->vmx_qs_dma.idi_size != 0) {
949 		iflib_dma_free(&sc->vmx_qs_dma);
950 		sc->vmx_qs_dma.idi_size = 0;
951 	}
952 
953 	/* Free array of receive queues */
954 	if (sc->vmx_rxq != NULL) {
955 		free(sc->vmx_rxq, M_DEVBUF);
956 		sc->vmx_rxq = NULL;
957 	}
958 
959 	/* Free array of transmit queues */
960 	if (sc->vmx_txq != NULL) {
961 		free(sc->vmx_txq, M_DEVBUF);
962 		sc->vmx_txq = NULL;
963 	}
964 }
965 
966 static int
967 vmxnet3_alloc_shared_data(struct vmxnet3_softc *sc)
968 {
969 	device_t dev;
970 	size_t size;
971 	int error;
972 
973 	dev = sc->vmx_dev;
974 
975 	/* Top level state structure shared with the device */
976 	size = sizeof(struct vmxnet3_driver_shared);
977 	error = iflib_dma_alloc_align(sc->vmx_ctx, size, 1, &sc->vmx_ds_dma, 0);
978 	if (error) {
979 		device_printf(dev, "cannot alloc shared memory\n");
980 		return (error);
981 	}
982 	sc->vmx_ds = (struct vmxnet3_driver_shared *) sc->vmx_ds_dma.idi_vaddr;
983 
984 	/* RSS table state shared with the device */
985 	if (sc->vmx_flags & VMXNET3_FLAG_RSS) {
986 		size = sizeof(struct vmxnet3_rss_shared);
987 		error = iflib_dma_alloc_align(sc->vmx_ctx, size, 128,
988 		    &sc->vmx_rss_dma, 0);
989 		if (error) {
990 			device_printf(dev, "cannot alloc rss shared memory\n");
991 			return (error);
992 		}
993 		sc->vmx_rss =
994 		    (struct vmxnet3_rss_shared *) sc->vmx_rss_dma.idi_vaddr;
995 	}
996 
997 	return (0);
998 }
999 
1000 static void
1001 vmxnet3_free_shared_data(struct vmxnet3_softc *sc)
1002 {
1003 
1004 	/* Free RSS table state shared with the device */
1005 	if (sc->vmx_rss != NULL) {
1006 		iflib_dma_free(&sc->vmx_rss_dma);
1007 		sc->vmx_rss = NULL;
1008 	}
1009 
1010 	/* Free top level state structure shared with the device */
1011 	if (sc->vmx_ds != NULL) {
1012 		iflib_dma_free(&sc->vmx_ds_dma);
1013 		sc->vmx_ds = NULL;
1014 	}
1015 }
1016 
1017 static int
1018 vmxnet3_alloc_mcast_table(struct vmxnet3_softc *sc)
1019 {
1020 	int error;
1021 
1022 	/* Multicast table state shared with the device */
1023 	error = iflib_dma_alloc_align(sc->vmx_ctx,
1024 	    VMXNET3_MULTICAST_MAX * ETHER_ADDR_LEN, 32, &sc->vmx_mcast_dma, 0);
1025 	if (error)
1026 		device_printf(sc->vmx_dev, "unable to alloc multicast table\n");
1027 	else
1028 		sc->vmx_mcast = sc->vmx_mcast_dma.idi_vaddr;
1029 
1030 	return (error);
1031 }
1032 
1033 static void
1034 vmxnet3_free_mcast_table(struct vmxnet3_softc *sc)
1035 {
1036 
1037 	/* Free multicast table state shared with the device */
1038 	if (sc->vmx_mcast != NULL) {
1039 		iflib_dma_free(&sc->vmx_mcast_dma);
1040 		sc->vmx_mcast = NULL;
1041 	}
1042 }
1043 
1044 static void
1045 vmxnet3_init_shared_data(struct vmxnet3_softc *sc)
1046 {
1047 	struct vmxnet3_driver_shared *ds;
1048 	if_softc_ctx_t scctx;
1049 	struct vmxnet3_txqueue *txq;
1050 	struct vmxnet3_txq_shared *txs;
1051 	struct vmxnet3_rxqueue *rxq;
1052 	struct vmxnet3_rxq_shared *rxs;
1053 	int i;
1054 
1055 	ds = sc->vmx_ds;
1056 	scctx = sc->vmx_scctx;
1057 
1058 	/*
1059 	 * Initialize fields of the shared data that remains the same across
1060 	 * reinits. Note the shared data is zero'd when allocated.
1061 	 */
1062 
1063 	ds->magic = VMXNET3_REV1_MAGIC;
1064 
1065 	/* DriverInfo */
1066 	ds->version = VMXNET3_DRIVER_VERSION;
1067 	ds->guest = VMXNET3_GOS_FREEBSD |
1068 #ifdef __LP64__
1069 	    VMXNET3_GOS_64BIT;
1070 #else
1071 	    VMXNET3_GOS_32BIT;
1072 #endif
1073 	ds->vmxnet3_revision = 1;
1074 	ds->upt_version = 1;
1075 
1076 	/* Misc. conf */
1077 	ds->driver_data = vtophys(sc);
1078 	ds->driver_data_len = sizeof(struct vmxnet3_softc);
1079 	ds->queue_shared = sc->vmx_qs_dma.idi_paddr;
1080 	ds->queue_shared_len = sc->vmx_qs_dma.idi_size;
1081 	ds->nrxsg_max = IFLIB_MAX_RX_SEGS;
1082 
1083 	/* RSS conf */
1084 	if (sc->vmx_flags & VMXNET3_FLAG_RSS) {
1085 		ds->rss.version = 1;
1086 		ds->rss.paddr = sc->vmx_rss_dma.idi_paddr;
1087 		ds->rss.len = sc->vmx_rss_dma.idi_size;
1088 	}
1089 
1090 	/* Interrupt control. */
1091 	ds->automask = sc->vmx_intr_mask_mode == VMXNET3_IMM_AUTO;
1092 	/*
1093 	 * Total number of interrupt indexes we are using in the shared
1094 	 * config data, even though we don't actually allocate interrupt
1095 	 * resources for the tx queues.  Some versions of the device will
1096 	 * fail to initialize successfully if interrupt indexes are used in
1097 	 * the shared config that exceed the number of interrupts configured
1098 	 * here.
1099 	 */
1100 	ds->nintr = (scctx->isc_vectors == 1) ?
1101 	    2 : (scctx->isc_nrxqsets + scctx->isc_ntxqsets + 1);
1102 	ds->evintr = sc->vmx_event_intr_idx;
1103 	ds->ictrl = VMXNET3_ICTRL_DISABLE_ALL;
1104 
1105 	for (i = 0; i < ds->nintr; i++)
1106 		ds->modlevel[i] = UPT1_IMOD_ADAPTIVE;
1107 
1108 	/* Receive filter. */
1109 	ds->mcast_table = sc->vmx_mcast_dma.idi_paddr;
1110 	ds->mcast_tablelen = sc->vmx_mcast_dma.idi_size;
1111 
1112 	/* Tx queues */
1113 	for (i = 0; i < scctx->isc_ntxqsets; i++) {
1114 		txq = &sc->vmx_txq[i];
1115 		txs = txq->vxtxq_ts;
1116 
1117 		txs->cmd_ring = txq->vxtxq_cmd_ring.vxtxr_paddr;
1118 		txs->cmd_ring_len = txq->vxtxq_cmd_ring.vxtxr_ndesc;
1119 		txs->comp_ring = txq->vxtxq_comp_ring.vxcr_paddr;
1120 		txs->comp_ring_len = txq->vxtxq_comp_ring.vxcr_ndesc;
1121 		txs->driver_data = vtophys(txq);
1122 		txs->driver_data_len = sizeof(struct vmxnet3_txqueue);
1123 	}
1124 
1125 	/* Rx queues */
1126 	for (i = 0; i < scctx->isc_nrxqsets; i++) {
1127 		rxq = &sc->vmx_rxq[i];
1128 		rxs = rxq->vxrxq_rs;
1129 
1130 		rxs->cmd_ring[0] = rxq->vxrxq_cmd_ring[0].vxrxr_paddr;
1131 		rxs->cmd_ring_len[0] = rxq->vxrxq_cmd_ring[0].vxrxr_ndesc;
1132 		rxs->cmd_ring[1] = rxq->vxrxq_cmd_ring[1].vxrxr_paddr;
1133 		rxs->cmd_ring_len[1] = rxq->vxrxq_cmd_ring[1].vxrxr_ndesc;
1134 		rxs->comp_ring = rxq->vxrxq_comp_ring.vxcr_paddr;
1135 		rxs->comp_ring_len = rxq->vxrxq_comp_ring.vxcr_ndesc;
1136 		rxs->driver_data = vtophys(rxq);
1137 		rxs->driver_data_len = sizeof(struct vmxnet3_rxqueue);
1138 	}
1139 }
1140 
1141 static void
1142 vmxnet3_reinit_rss_shared_data(struct vmxnet3_softc *sc)
1143 {
1144 	/*
1145 	 * Use the same key as the Linux driver until FreeBSD can do
1146 	 * RSS (presumably Toeplitz) in software.
1147 	 */
1148 	static const uint8_t rss_key[UPT1_RSS_MAX_KEY_SIZE] = {
1149 	    0x3b, 0x56, 0xd1, 0x56, 0x13, 0x4a, 0xe7, 0xac,
1150 	    0xe8, 0x79, 0x09, 0x75, 0xe8, 0x65, 0x79, 0x28,
1151 	    0x35, 0x12, 0xb9, 0x56, 0x7c, 0x76, 0x4b, 0x70,
1152 	    0xd8, 0x56, 0xa3, 0x18, 0x9b, 0x0a, 0xee, 0xf3,
1153 	    0x96, 0xa6, 0x9f, 0x8f, 0x9e, 0x8c, 0x90, 0xc9,
1154 	};
1155 
1156 	if_softc_ctx_t scctx;
1157 	struct vmxnet3_rss_shared *rss;
1158 #ifdef RSS
1159 	uint8_t rss_algo;
1160 #endif
1161 	int i;
1162 
1163 	scctx = sc->vmx_scctx;
1164 	rss = sc->vmx_rss;
1165 
1166 	rss->hash_type =
1167 	    UPT1_RSS_HASH_TYPE_IPV4 | UPT1_RSS_HASH_TYPE_TCP_IPV4 |
1168 	    UPT1_RSS_HASH_TYPE_IPV6 | UPT1_RSS_HASH_TYPE_TCP_IPV6;
1169 	rss->hash_func = UPT1_RSS_HASH_FUNC_TOEPLITZ;
1170 	rss->hash_key_size = UPT1_RSS_MAX_KEY_SIZE;
1171 	rss->ind_table_size = UPT1_RSS_MAX_IND_TABLE_SIZE;
1172 #ifdef RSS
1173 	/*
1174 	 * If the software RSS is configured to anything else other than
1175 	 * Toeplitz, then just do Toeplitz in "hardware" for the sake of
1176 	 * the packet distribution, but report the hash as opaque to
1177 	 * disengage from the software RSS.
1178 	 */
1179 	rss_algo = rss_gethashalgo();
1180 	if (rss_algo == RSS_HASH_TOEPLITZ) {
1181 		rss_getkey(rss->hash_key);
1182 		for (i = 0; i < UPT1_RSS_MAX_IND_TABLE_SIZE; i++) {
1183 			rss->ind_table[i] = rss_get_indirection_to_bucket(i) %
1184 			    scctx->isc_nrxqsets;
1185 		}
1186 		sc->vmx_flags |= VMXNET3_FLAG_SOFT_RSS;
1187 	} else
1188 #endif
1189 	{
1190 		memcpy(rss->hash_key, rss_key, UPT1_RSS_MAX_KEY_SIZE);
1191 		for (i = 0; i < UPT1_RSS_MAX_IND_TABLE_SIZE; i++)
1192 			rss->ind_table[i] = i % scctx->isc_nrxqsets;
1193 		sc->vmx_flags &= ~VMXNET3_FLAG_SOFT_RSS;
1194 	}
1195 }
1196 
1197 static void
1198 vmxnet3_reinit_shared_data(struct vmxnet3_softc *sc)
1199 {
1200 	if_t ifp;
1201 	struct vmxnet3_driver_shared *ds;
1202 	if_softc_ctx_t scctx;
1203 
1204 	ifp = sc->vmx_ifp;
1205 	ds = sc->vmx_ds;
1206 	scctx = sc->vmx_scctx;
1207 
1208 	ds->mtu = if_getmtu(ifp);
1209 	ds->ntxqueue = scctx->isc_ntxqsets;
1210 	ds->nrxqueue = scctx->isc_nrxqsets;
1211 
1212 	ds->upt_features = 0;
1213 	if (if_getcapenable(ifp) & (IFCAP_RXCSUM | IFCAP_RXCSUM_IPV6))
1214 		ds->upt_features |= UPT1_F_CSUM;
1215 	if (if_getcapenable(ifp) & IFCAP_VLAN_HWTAGGING)
1216 		ds->upt_features |= UPT1_F_VLAN;
1217 	if (if_getcapenable(ifp) & IFCAP_LRO)
1218 		ds->upt_features |= UPT1_F_LRO;
1219 
1220 	if (sc->vmx_flags & VMXNET3_FLAG_RSS) {
1221 		ds->upt_features |= UPT1_F_RSS;
1222 		vmxnet3_reinit_rss_shared_data(sc);
1223 	}
1224 
1225 	vmxnet3_write_bar1(sc, VMXNET3_BAR1_DSL, sc->vmx_ds_dma.idi_paddr);
1226 	vmxnet3_write_bar1(sc, VMXNET3_BAR1_DSH,
1227 	    (uint64_t) sc->vmx_ds_dma.idi_paddr >> 32);
1228 }
1229 
/*
 * Allocate the shared data areas and the multicast table, then initialize
 * the shared data.  Returns 0 on success or the first allocation error;
 * nothing is released here on failure.
 */
static int
vmxnet3_alloc_data(struct vmxnet3_softc *sc)
{
	int error;

	if ((error = vmxnet3_alloc_shared_data(sc)) != 0)
		return (error);
	if ((error = vmxnet3_alloc_mcast_table(sc)) != 0)
		return (error);

	vmxnet3_init_shared_data(sc);
	return (0);
}
1247 
/*
 * Release the multicast table and then the shared data areas.
 */
static void
vmxnet3_free_data(struct vmxnet3_softc *sc)
{

	/* Multicast table first, shared structures second. */
	vmxnet3_free_mcast_table(sc);
	vmxnet3_free_shared_data(sc);
}
1255 
1256 static void
1257 vmxnet3_evintr(struct vmxnet3_softc *sc)
1258 {
1259 	device_t dev;
1260 	struct vmxnet3_txq_shared *ts;
1261 	struct vmxnet3_rxq_shared *rs;
1262 	uint32_t event;
1263 
1264 	dev = sc->vmx_dev;
1265 
1266 	/* Clear events. */
1267 	event = sc->vmx_ds->event;
1268 	vmxnet3_write_bar1(sc, VMXNET3_BAR1_EVENT, event);
1269 
1270 	if (event & VMXNET3_EVENT_LINK)
1271 		vmxnet3_link_status(sc);
1272 
1273 	if (event & (VMXNET3_EVENT_TQERROR | VMXNET3_EVENT_RQERROR)) {
1274 		vmxnet3_read_cmd(sc, VMXNET3_CMD_GET_STATUS);
1275 		ts = sc->vmx_txq[0].vxtxq_ts;
1276 		if (ts->stopped != 0)
1277 			device_printf(dev, "Tx queue error %#x\n", ts->error);
1278 		rs = sc->vmx_rxq[0].vxrxq_rs;
1279 		if (rs->stopped != 0)
1280 			device_printf(dev, "Rx queue error %#x\n", rs->error);
1281 
1282 		/* XXX - rely on liflib watchdog to reset us? */
1283 		device_printf(dev, "Rx/Tx queue error event ... "
1284 		    "waiting for iflib watchdog reset\n");
1285 	}
1286 
1287 	if (event & VMXNET3_EVENT_DIC)
1288 		device_printf(dev, "device implementation change event\n");
1289 	if (event & VMXNET3_EVENT_DEBUG)
1290 		device_printf(dev, "debug event\n");
1291 }
1292 
1293 static int
1294 vmxnet3_isc_txd_encap(void *vsc, if_pkt_info_t pi)
1295 {
1296 	struct vmxnet3_softc *sc;
1297 	struct vmxnet3_txqueue *txq;
1298 	struct vmxnet3_txring *txr;
1299 	struct vmxnet3_txdesc *txd, *sop;
1300 	bus_dma_segment_t *segs;
1301 	int nsegs;
1302 	int pidx;
1303 	int hdrlen;
1304 	int i;
1305 	int gen;
1306 
1307 	sc = vsc;
1308 	txq = &sc->vmx_txq[pi->ipi_qsidx];
1309 	txr = &txq->vxtxq_cmd_ring;
1310 	segs = pi->ipi_segs;
1311 	nsegs = pi->ipi_nsegs;
1312 	pidx = pi->ipi_pidx;
1313 
1314 	KASSERT(nsegs <= VMXNET3_TX_MAXSEGS,
1315 	    ("%s: packet with too many segments %d", __func__, nsegs));
1316 
1317 	sop = &txr->vxtxr_txd[pidx];
1318 	gen = txr->vxtxr_gen ^ 1;	/* Owned by cpu (yet) */
1319 
1320 	for (i = 0; i < nsegs; i++) {
1321 		txd = &txr->vxtxr_txd[pidx];
1322 
1323 		txd->addr = segs[i].ds_addr;
1324 		txd->len = segs[i].ds_len;
1325 		txd->gen = gen;
1326 		txd->dtype = 0;
1327 		txd->offload_mode = VMXNET3_OM_NONE;
1328 		txd->offload_pos = 0;
1329 		txd->hlen = 0;
1330 		txd->eop = 0;
1331 		txd->compreq = 0;
1332 		txd->vtag_mode = 0;
1333 		txd->vtag = 0;
1334 
1335 		if (++pidx == txr->vxtxr_ndesc) {
1336 			pidx = 0;
1337 			txr->vxtxr_gen ^= 1;
1338 		}
1339 		gen = txr->vxtxr_gen;
1340 	}
1341 	txd->eop = 1;
1342 	txd->compreq = !!(pi->ipi_flags & IPI_TX_INTR);
1343 	pi->ipi_new_pidx = pidx;
1344 
1345 	/*
1346 	 * VLAN
1347 	 */
1348 	if (pi->ipi_mflags & M_VLANTAG) {
1349 		sop->vtag_mode = 1;
1350 		sop->vtag = pi->ipi_vtag;
1351 	}
1352 
1353 	/*
1354 	 * TSO and checksum offloads
1355 	 */
1356 	hdrlen = pi->ipi_ehdrlen + pi->ipi_ip_hlen;
1357 	if (pi->ipi_csum_flags & CSUM_TSO) {
1358 		sop->offload_mode = VMXNET3_OM_TSO;
1359 		sop->hlen = hdrlen + pi->ipi_tcp_hlen;
1360 		sop->offload_pos = pi->ipi_tso_segsz;
1361 	} else if (pi->ipi_csum_flags & (VMXNET3_CSUM_OFFLOAD |
1362 	    VMXNET3_CSUM_OFFLOAD_IPV6)) {
1363 		sop->offload_mode = VMXNET3_OM_CSUM;
1364 		sop->hlen = hdrlen;
1365 		sop->offload_pos = hdrlen +
1366 		    ((pi->ipi_ipproto == IPPROTO_TCP) ?
1367 			offsetof(struct tcphdr, th_sum) :
1368 			offsetof(struct udphdr, uh_sum));
1369 	}
1370 
1371 	/* Finally, change the ownership. */
1372 	vmxnet3_barrier(sc, VMXNET3_BARRIER_WR);
1373 	sop->gen ^= 1;
1374 
1375 	return (0);
1376 }
1377 
1378 static void
1379 vmxnet3_isc_txd_flush(void *vsc, uint16_t txqid, qidx_t pidx)
1380 {
1381 	struct vmxnet3_softc *sc;
1382 	struct vmxnet3_txqueue *txq;
1383 
1384 	sc = vsc;
1385 	txq = &sc->vmx_txq[txqid];
1386 
1387 	/*
1388 	 * pidx is what we last set ipi_new_pidx to in
1389 	 * vmxnet3_isc_txd_encap()
1390 	 */
1391 
1392 	/*
1393 	 * Avoid expensive register updates if the flush request is
1394 	 * redundant.
1395 	 */
1396 	if (txq->vxtxq_last_flush == pidx)
1397 		return;
1398 	txq->vxtxq_last_flush = pidx;
1399 	vmxnet3_write_bar0(sc, VMXNET3_BAR0_TXH(txq->vxtxq_id), pidx);
1400 }
1401 
1402 static int
1403 vmxnet3_isc_txd_credits_update(void *vsc, uint16_t txqid, bool clear)
1404 {
1405 	struct vmxnet3_softc *sc;
1406 	struct vmxnet3_txqueue *txq;
1407 	struct vmxnet3_comp_ring *txc;
1408 	struct vmxnet3_txcompdesc *txcd;
1409 	struct vmxnet3_txring *txr;
1410 	int processed;
1411 
1412 	sc = vsc;
1413 	txq = &sc->vmx_txq[txqid];
1414 	txc = &txq->vxtxq_comp_ring;
1415 	txr = &txq->vxtxq_cmd_ring;
1416 
1417 	/*
1418 	 * If clear is true, we need to report the number of TX command ring
1419 	 * descriptors that have been processed by the device.  If clear is
1420 	 * false, we just need to report whether or not at least one TX
1421 	 * command ring descriptor has been processed by the device.
1422 	 */
1423 	processed = 0;
1424 	for (;;) {
1425 		txcd = &txc->vxcr_u.txcd[txc->vxcr_next];
1426 		if (txcd->gen != txc->vxcr_gen)
1427 			break;
1428 		else if (!clear)
1429 			return (1);
1430 		vmxnet3_barrier(sc, VMXNET3_BARRIER_RD);
1431 
1432 		MPASS(txc->vxcr_next < txc->vxcr_ndesc);
1433 		if (++txc->vxcr_next >= txc->vxcr_ndesc) {
1434 			txc->vxcr_next = 0;
1435 			txc->vxcr_gen ^= 1;
1436 		}
1437 
1438 		if (txcd->eop_idx < txr->vxtxr_next)
1439 			processed += txr->vxtxr_ndesc -
1440 			    (txr->vxtxr_next - txcd->eop_idx) + 1;
1441 		else
1442 			processed += txcd->eop_idx - txr->vxtxr_next + 1;
1443 		txr->vxtxr_next = (txcd->eop_idx + 1) % txr->vxtxr_ndesc;
1444 	}
1445 
1446 	return (processed);
1447 }
1448 
1449 static int
1450 vmxnet3_isc_rxd_available(void *vsc, uint16_t rxqid, qidx_t idx, qidx_t budget)
1451 {
1452 	struct vmxnet3_softc *sc;
1453 	struct vmxnet3_rxqueue *rxq;
1454 	struct vmxnet3_comp_ring *rxc;
1455 	struct vmxnet3_rxcompdesc *rxcd;
1456 	int avail;
1457 	int completed_gen;
1458 #ifdef INVARIANTS
1459 	int expect_sop = 1;
1460 #endif
1461 	sc = vsc;
1462 	rxq = &sc->vmx_rxq[rxqid];
1463 	rxc = &rxq->vxrxq_comp_ring;
1464 
1465 	avail = 0;
1466 	completed_gen = rxc->vxcr_gen;
1467 	for (;;) {
1468 		rxcd = &rxc->vxcr_u.rxcd[idx];
1469 		if (rxcd->gen != completed_gen)
1470 			break;
1471 		vmxnet3_barrier(sc, VMXNET3_BARRIER_RD);
1472 
1473 #ifdef INVARIANTS
1474 		if (expect_sop)
1475 			KASSERT(rxcd->sop, ("%s: expected sop", __func__));
1476 		else
1477 			KASSERT(!rxcd->sop, ("%s: unexpected sop", __func__));
1478 		expect_sop = rxcd->eop;
1479 #endif
1480 		if (rxcd->eop && (rxcd->len != 0))
1481 			avail++;
1482 		if (avail > budget)
1483 			break;
1484 		if (++idx == rxc->vxcr_ndesc) {
1485 			idx = 0;
1486 			completed_gen ^= 1;
1487 		}
1488 	}
1489 
1490 	return (avail);
1491 }
1492 
1493 static int
1494 vmxnet3_isc_rxd_pkt_get(void *vsc, if_rxd_info_t ri)
1495 {
1496 	struct vmxnet3_softc *sc;
1497 	if_softc_ctx_t scctx;
1498 	struct vmxnet3_rxqueue *rxq;
1499 	struct vmxnet3_comp_ring *rxc;
1500 	struct vmxnet3_rxcompdesc *rxcd;
1501 	if_rxd_frag_t frag;
1502 	int cqidx;
1503 	uint16_t total_len;
1504 	uint8_t nfrags;
1505 	uint8_t i;
1506 	uint8_t flid;
1507 
1508 	sc = vsc;
1509 	scctx = sc->vmx_scctx;
1510 	rxq = &sc->vmx_rxq[ri->iri_qsidx];
1511 	rxc = &rxq->vxrxq_comp_ring;
1512 
1513 	/*
1514 	 * Get a single packet starting at the given index in the completion
1515 	 * queue.  That we have been called indicates that
1516 	 * vmxnet3_isc_rxd_available() has already verified that either
1517 	 * there is a complete packet available starting at the given index,
1518 	 * or there are one or more zero length packets starting at the
1519 	 * given index followed by a complete packet, so no verification of
1520 	 * ownership of the descriptors (and no associated read barrier) is
1521 	 * required here.
1522 	 */
1523 	cqidx = ri->iri_cidx;
1524 	rxcd = &rxc->vxcr_u.rxcd[cqidx];
1525 	while (rxcd->len == 0) {
1526 		KASSERT(rxcd->sop && rxcd->eop,
1527 		    ("%s: zero-length packet without both sop and eop set",
1528 			__func__));
1529 		rxc->vxcr_zero_length++;
1530 		if (++cqidx == rxc->vxcr_ndesc) {
1531 			cqidx = 0;
1532 			rxc->vxcr_gen ^= 1;
1533 		}
1534 		rxcd = &rxc->vxcr_u.rxcd[cqidx];
1535 	}
1536 	KASSERT(rxcd->sop, ("%s: expected sop", __func__));
1537 
1538 	/*
1539 	 * RSS and flow ID.
1540 	 * Types other than M_HASHTYPE_NONE and M_HASHTYPE_OPAQUE_HASH should
1541 	 * be used only if the software RSS is enabled and it uses the same
1542 	 * algorithm and the hash key as the "hardware".  If the software RSS
1543 	 * is not enabled, then it's simply pointless to use those types.
1544 	 * If it's enabled but with different parameters, then hash values will
1545 	 * not match.
1546 	 */
1547 	ri->iri_flowid = rxcd->rss_hash;
1548 #ifdef RSS
1549 	if ((sc->vmx_flags & VMXNET3_FLAG_SOFT_RSS) != 0) {
1550 		switch (rxcd->rss_type) {
1551 		case VMXNET3_RCD_RSS_TYPE_NONE:
1552 			ri->iri_flowid = ri->iri_qsidx;
1553 			ri->iri_rsstype = M_HASHTYPE_NONE;
1554 			break;
1555 		case VMXNET3_RCD_RSS_TYPE_IPV4:
1556 			ri->iri_rsstype = M_HASHTYPE_RSS_IPV4;
1557 			break;
1558 		case VMXNET3_RCD_RSS_TYPE_TCPIPV4:
1559 			ri->iri_rsstype = M_HASHTYPE_RSS_TCP_IPV4;
1560 			break;
1561 		case VMXNET3_RCD_RSS_TYPE_IPV6:
1562 			ri->iri_rsstype = M_HASHTYPE_RSS_IPV6;
1563 			break;
1564 		case VMXNET3_RCD_RSS_TYPE_TCPIPV6:
1565 			ri->iri_rsstype = M_HASHTYPE_RSS_TCP_IPV6;
1566 			break;
1567 		default:
1568 			ri->iri_rsstype = M_HASHTYPE_OPAQUE_HASH;
1569 			break;
1570 		}
1571 	} else
1572 #endif
1573 	{
1574 		switch (rxcd->rss_type) {
1575 		case VMXNET3_RCD_RSS_TYPE_NONE:
1576 			ri->iri_flowid = ri->iri_qsidx;
1577 			ri->iri_rsstype = M_HASHTYPE_NONE;
1578 			break;
1579 		default:
1580 			ri->iri_rsstype = M_HASHTYPE_OPAQUE_HASH;
1581 			break;
1582 		}
1583 	}
1584 
1585 	/*
1586 	 * The queue numbering scheme used for rxcd->qid is as follows:
1587 	 *  - All of the command ring 0s are numbered [0, nrxqsets - 1]
1588 	 *  - All of the command ring 1s are numbered [nrxqsets, 2*nrxqsets - 1]
1589 	 *
1590 	 * Thus, rxcd->qid less than nrxqsets indicates command ring (and
1591 	 * flid) 0, and rxcd->qid greater than or equal to nrxqsets
1592 	 * indicates command ring (and flid) 1.
1593 	 */
1594 	nfrags = 0;
1595 	total_len = 0;
1596 	do {
1597 		rxcd = &rxc->vxcr_u.rxcd[cqidx];
1598 		KASSERT(rxcd->gen == rxc->vxcr_gen,
1599 		    ("%s: generation mismatch", __func__));
1600 		KASSERT(nfrags < IFLIB_MAX_RX_SEGS,
1601 		    ("%s: too many fragments", __func__));
1602 		if (__predict_true(rxcd->len != 0)) {
1603 			frag = &ri->iri_frags[nfrags];
1604 			flid = (rxcd->qid >= scctx->isc_nrxqsets) ? 1 : 0;
1605 			frag->irf_flid = flid;
1606 			frag->irf_idx = rxcd->rxd_idx;
1607 			frag->irf_len = rxcd->len;
1608 			total_len += rxcd->len;
1609 			nfrags++;
1610 		} else {
1611 			rxc->vcxr_zero_length_frag++;
1612 		}
1613 		if (++cqidx == rxc->vxcr_ndesc) {
1614 			cqidx = 0;
1615 			rxc->vxcr_gen ^= 1;
1616 		}
1617 	} while (!rxcd->eop);
1618 
1619 	ri->iri_cidx = cqidx;
1620 	ri->iri_nfrags = nfrags;
1621 	ri->iri_len = total_len;
1622 
1623 	/*
1624 	 * If there's an error, the last descriptor in the packet will
1625 	 * have the error indicator set.  In this case, set all
1626 	 * fragment lengths to zero.  This will cause iflib to discard
1627 	 * the packet, but process all associated descriptors through
1628 	 * the refill mechanism.
1629 	 */
1630 	if (__predict_false(rxcd->error)) {
1631 		rxc->vxcr_pkt_errors++;
1632 		for (i = 0; i < nfrags; i++) {
1633 			frag = &ri->iri_frags[i];
1634 			frag->irf_len = 0;
1635 		}
1636 	} else {
1637 		/* Checksum offload information is in the last descriptor. */
1638 		if (!rxcd->no_csum) {
1639 			uint32_t csum_flags = 0;
1640 
1641 			if (rxcd->ipv4) {
1642 				csum_flags |= CSUM_IP_CHECKED;
1643 				if (rxcd->ipcsum_ok)
1644 					csum_flags |= CSUM_IP_VALID;
1645 			}
1646 			if (!rxcd->fragment && (rxcd->tcp || rxcd->udp)) {
1647 				csum_flags |= CSUM_L4_CALC;
1648 				if (rxcd->csum_ok) {
1649 					csum_flags |= CSUM_L4_VALID;
1650 					ri->iri_csum_data = 0xffff;
1651 				}
1652 			}
1653 			ri->iri_csum_flags = csum_flags;
1654 		}
1655 
1656 		/* VLAN information is in the last descriptor. */
1657 		if (rxcd->vlan) {
1658 			ri->iri_flags |= M_VLANTAG;
1659 			ri->iri_vtag = rxcd->vtag;
1660 		}
1661 	}
1662 
1663 	return (0);
1664 }
1665 
1666 static void
1667 vmxnet3_isc_rxd_refill(void *vsc, if_rxd_update_t iru)
1668 {
1669 	struct vmxnet3_softc *sc;
1670 	struct vmxnet3_rxqueue *rxq;
1671 	struct vmxnet3_rxring *rxr;
1672 	struct vmxnet3_rxdesc *rxd;
1673 	uint64_t *paddrs;
1674 	int count;
1675 	int len;
1676 	int idx;
1677 	int i;
1678 	uint8_t flid;
1679 	uint8_t btype;
1680 
1681 	count = iru->iru_count;
1682 	len = iru->iru_buf_size;
1683 	flid = iru->iru_flidx;
1684 	paddrs = iru->iru_paddrs;
1685 
1686 	sc = vsc;
1687 	rxq = &sc->vmx_rxq[iru->iru_qsidx];
1688 	rxr = &rxq->vxrxq_cmd_ring[flid];
1689 	rxd = rxr->vxrxr_rxd;
1690 
1691 	/*
1692 	 * Command ring 0 is filled with BTYPE_HEAD descriptors, and
1693 	 * command ring 1 is filled with BTYPE_BODY descriptors.
1694 	 */
1695 	btype = (flid == 0) ? VMXNET3_BTYPE_HEAD : VMXNET3_BTYPE_BODY;
1696 	/*
1697 	 * The refill entries from iflib will advance monotonically,
1698 	 * but the refilled descriptors may not be contiguous due to
1699 	 * earlier skipping of descriptors by the device.  The refill
1700 	 * entries from iflib need an entire state update, while the
1701 	 * descriptors previously skipped by the device only need to
1702 	 * have their generation numbers updated.
1703 	 */
1704 	idx = rxr->vxrxr_refill_start;
1705 	i = 0;
1706 	do {
1707 		if (idx == iru->iru_idxs[i]) {
1708 			rxd[idx].addr = paddrs[i];
1709 			rxd[idx].len = len;
1710 			rxd[idx].btype = btype;
1711 			i++;
1712 		} else
1713 			rxr->vxrxr_desc_skips++;
1714 		rxd[idx].gen = rxr->vxrxr_gen;
1715 
1716 		if (++idx == rxr->vxrxr_ndesc) {
1717 			idx = 0;
1718 			rxr->vxrxr_gen ^= 1;
1719 		}
1720 	} while (i != count);
1721 	rxr->vxrxr_refill_start = idx;
1722 }
1723 
1724 static void
1725 vmxnet3_isc_rxd_flush(void *vsc, uint16_t rxqid, uint8_t flid, qidx_t pidx)
1726 {
1727 	struct vmxnet3_softc *sc;
1728 	bus_size_t r;
1729 
1730 	sc = vsc;
1731 
1732 	if (flid == 0)
1733 		r = VMXNET3_BAR0_RXH1(rxqid);
1734 	else
1735 		r = VMXNET3_BAR0_RXH2(rxqid);
1736 
1737 	vmxnet3_write_bar0(sc, r, pidx);
1738 }
1739 
1740 static int
1741 vmxnet3_legacy_intr(void *xsc)
1742 {
1743 	struct vmxnet3_softc *sc;
1744 	if_softc_ctx_t scctx;
1745 	if_ctx_t ctx;
1746 
1747 	sc = xsc;
1748 	scctx = sc->vmx_scctx;
1749 	ctx = sc->vmx_ctx;
1750 
1751 	/*
1752 	 * When there is only a single interrupt configured, this routine
1753 	 * runs in fast interrupt context, following which the rxq 0 task
1754 	 * will be enqueued.
1755 	 */
1756 	if (scctx->isc_intr == IFLIB_INTR_LEGACY) {
1757 		if (vmxnet3_read_bar1(sc, VMXNET3_BAR1_INTR) == 0)
1758 			return (FILTER_HANDLED);
1759 	}
1760 	if (sc->vmx_intr_mask_mode == VMXNET3_IMM_ACTIVE)
1761 		vmxnet3_intr_disable_all(ctx);
1762 
1763 	if (sc->vmx_ds->event != 0)
1764 		iflib_admin_intr_deferred(ctx);
1765 
1766 	/*
1767 	 * XXX - When there is both rxq and event activity, do we care
1768 	 * whether the rxq 0 task or the admin task re-enables the interrupt
1769 	 * first?
1770 	 */
1771 	return (FILTER_SCHEDULE_THREAD);
1772 }
1773 
1774 static int
1775 vmxnet3_rxq_intr(void *vrxq)
1776 {
1777 	struct vmxnet3_softc *sc;
1778 	struct vmxnet3_rxqueue *rxq;
1779 
1780 	rxq = vrxq;
1781 	sc = rxq->vxrxq_sc;
1782 
1783 	if (sc->vmx_intr_mask_mode == VMXNET3_IMM_ACTIVE)
1784 		vmxnet3_disable_intr(sc, rxq->vxrxq_intr_idx);
1785 
1786 	return (FILTER_SCHEDULE_THREAD);
1787 }
1788 
1789 static int
1790 vmxnet3_event_intr(void *vsc)
1791 {
1792 	struct vmxnet3_softc *sc;
1793 
1794 	sc = vsc;
1795 
1796 	if (sc->vmx_intr_mask_mode == VMXNET3_IMM_ACTIVE)
1797 		vmxnet3_disable_intr(sc, sc->vmx_event_intr_idx);
1798 
1799 	/*
1800 	 * The work will be done via vmxnet3_update_admin_status(), and the
1801 	 * interrupt will be re-enabled in vmxnet3_link_intr_enable().
1802 	 *
1803 	 * The interrupt will be re-enabled by vmxnet3_link_intr_enable().
1804 	 */
1805 	return (FILTER_SCHEDULE_THREAD);
1806 }
1807 
1808 static void
1809 vmxnet3_stop(if_ctx_t ctx)
1810 {
1811 	struct vmxnet3_softc *sc;
1812 
1813 	sc = iflib_get_softc(ctx);
1814 
1815 	sc->vmx_link_active = 0;
1816 	vmxnet3_write_cmd(sc, VMXNET3_CMD_DISABLE);
1817 	vmxnet3_write_cmd(sc, VMXNET3_CMD_RESET);
1818 }
1819 
1820 static void
1821 vmxnet3_txinit(struct vmxnet3_softc *sc, struct vmxnet3_txqueue *txq)
1822 {
1823 	struct vmxnet3_txring *txr;
1824 	struct vmxnet3_comp_ring *txc;
1825 
1826 	txq->vxtxq_last_flush = -1;
1827 
1828 	txr = &txq->vxtxq_cmd_ring;
1829 	txr->vxtxr_next = 0;
1830 	txr->vxtxr_gen = VMXNET3_INIT_GEN;
1831 	/*
1832 	 * iflib has zeroed out the descriptor array during the prior attach
1833 	 * or stop
1834 	 */
1835 
1836 	txc = &txq->vxtxq_comp_ring;
1837 	txc->vxcr_next = 0;
1838 	txc->vxcr_gen = VMXNET3_INIT_GEN;
1839 	/*
1840 	 * iflib has zeroed out the descriptor array during the prior attach
1841 	 * or stop
1842 	 */
1843 }
1844 
1845 static void
1846 vmxnet3_rxinit(struct vmxnet3_softc *sc, struct vmxnet3_rxqueue *rxq)
1847 {
1848 	struct vmxnet3_rxring *rxr;
1849 	struct vmxnet3_comp_ring *rxc;
1850 	int i;
1851 
1852 	/*
1853 	 * The descriptors will be populated with buffers during a
1854 	 * subsequent invocation of vmxnet3_isc_rxd_refill()
1855 	 */
1856 	for (i = 0; i < sc->vmx_sctx->isc_nrxqs - 1; i++) {
1857 		rxr = &rxq->vxrxq_cmd_ring[i];
1858 		rxr->vxrxr_gen = VMXNET3_INIT_GEN;
1859 		rxr->vxrxr_desc_skips = 0;
1860 		rxr->vxrxr_refill_start = 0;
1861 		/*
1862 		 * iflib has zeroed out the descriptor array during the
1863 		 * prior attach or stop
1864 		 */
1865 	}
1866 
1867 	for (/**/; i < VMXNET3_RXRINGS_PERQ; i++) {
1868 		rxr = &rxq->vxrxq_cmd_ring[i];
1869 		rxr->vxrxr_gen = 0;
1870 		rxr->vxrxr_desc_skips = 0;
1871 		rxr->vxrxr_refill_start = 0;
1872 		bzero(rxr->vxrxr_rxd,
1873 		    rxr->vxrxr_ndesc * sizeof(struct vmxnet3_rxdesc));
1874 	}
1875 
1876 	rxc = &rxq->vxrxq_comp_ring;
1877 	rxc->vxcr_next = 0;
1878 	rxc->vxcr_gen = VMXNET3_INIT_GEN;
1879 	rxc->vxcr_zero_length = 0;
1880 	rxc->vcxr_zero_length_frag = 0;
1881 	rxc->vxcr_pkt_errors = 0;
1882 	/*
1883 	 * iflib has zeroed out the descriptor array during the prior attach
1884 	 * or stop
1885 	 */
1886 }
1887 
1888 static void
1889 vmxnet3_reinit_queues(struct vmxnet3_softc *sc)
1890 {
1891 	if_softc_ctx_t scctx;
1892 	int q;
1893 
1894 	scctx = sc->vmx_scctx;
1895 
1896 	for (q = 0; q < scctx->isc_ntxqsets; q++)
1897 		vmxnet3_txinit(sc, &sc->vmx_txq[q]);
1898 
1899 	for (q = 0; q < scctx->isc_nrxqsets; q++)
1900 		vmxnet3_rxinit(sc, &sc->vmx_rxq[q]);
1901 }
1902 
1903 static int
1904 vmxnet3_enable_device(struct vmxnet3_softc *sc)
1905 {
1906 	if_softc_ctx_t scctx;
1907 	int q;
1908 
1909 	scctx = sc->vmx_scctx;
1910 
1911 	if (vmxnet3_read_cmd(sc, VMXNET3_CMD_ENABLE) != 0) {
1912 		device_printf(sc->vmx_dev, "device enable command failed!\n");
1913 		return (1);
1914 	}
1915 
1916 	/* Reset the Rx queue heads. */
1917 	for (q = 0; q < scctx->isc_nrxqsets; q++) {
1918 		vmxnet3_write_bar0(sc, VMXNET3_BAR0_RXH1(q), 0);
1919 		vmxnet3_write_bar0(sc, VMXNET3_BAR0_RXH2(q), 0);
1920 	}
1921 
1922 	return (0);
1923 }
1924 
1925 static void
1926 vmxnet3_reinit_rxfilters(struct vmxnet3_softc *sc)
1927 {
1928 	if_t ifp;
1929 
1930 	ifp = sc->vmx_ifp;
1931 
1932 	vmxnet3_set_rxfilter(sc, if_getflags(ifp));
1933 
1934 	if (if_getcapenable(ifp) & IFCAP_VLAN_HWFILTER)
1935 		bcopy(sc->vmx_vlan_filter, sc->vmx_ds->vlan_filter,
1936 		    sizeof(sc->vmx_ds->vlan_filter));
1937 	else
1938 		bzero(sc->vmx_ds->vlan_filter,
1939 		    sizeof(sc->vmx_ds->vlan_filter));
1940 	vmxnet3_write_cmd(sc, VMXNET3_CMD_VLAN_FILTER);
1941 }
1942 
1943 static void
1944 vmxnet3_init(if_ctx_t ctx)
1945 {
1946 	struct vmxnet3_softc *sc;
1947 
1948 	sc = iflib_get_softc(ctx);
1949 
1950 	/* Use the current MAC address. */
1951 	bcopy(if_getlladdr(sc->vmx_ifp), sc->vmx_lladdr, ETHER_ADDR_LEN);
1952 	vmxnet3_set_lladdr(sc);
1953 
1954 	vmxnet3_reinit_shared_data(sc);
1955 	vmxnet3_reinit_queues(sc);
1956 
1957 	vmxnet3_enable_device(sc);
1958 
1959 	vmxnet3_reinit_rxfilters(sc);
1960 	vmxnet3_link_status(sc);
1961 }
1962 
1963 static void
1964 vmxnet3_multi_set(if_ctx_t ctx)
1965 {
1966 
1967 	vmxnet3_set_rxfilter(iflib_get_softc(ctx),
1968 	    if_getflags(iflib_get_ifp(ctx)));
1969 }
1970 
1971 static int
1972 vmxnet3_mtu_set(if_ctx_t ctx, uint32_t mtu)
1973 {
1974 	struct vmxnet3_softc *sc;
1975 	if_softc_ctx_t scctx;
1976 
1977 	sc = iflib_get_softc(ctx);
1978 	scctx = sc->vmx_scctx;
1979 
1980 	if (mtu > VMXNET3_TX_MAXSIZE - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN +
1981 		ETHER_CRC_LEN))
1982 		return (EINVAL);
1983 
1984 	/*
1985 	 * Update the max frame size so that the rx mbuf size is
1986 	 * chosen based on the new mtu during the interface init that
1987 	 * will occur after this routine returns.
1988 	 */
1989 	scctx->isc_max_frame_size = mtu +
1990 		ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN + ETHER_CRC_LEN;
1991 	/* RX completion queue - n/a */
1992 	scctx->isc_rxd_buf_size[0] = 0;
1993 	/*
1994 	 * For header-type descriptors (used for first segment of
1995 	 * packet), let iflib determine the buffer size based on the
1996 	 * max frame size.
1997 	 */
1998 	scctx->isc_rxd_buf_size[1] = 0;
1999 	/*
2000 	 * For body-type descriptors (used for jumbo frames and LRO),
2001 	 * always use page-sized buffers.
2002 	 */
2003 	scctx->isc_rxd_buf_size[2] = MJUMPAGESIZE;
2004 
2005 	return (0);
2006 }
2007 
2008 static void
2009 vmxnet3_media_status(if_ctx_t ctx, struct ifmediareq * ifmr)
2010 {
2011 	struct vmxnet3_softc *sc;
2012 
2013 	sc = iflib_get_softc(ctx);
2014 
2015 	ifmr->ifm_status = IFM_AVALID;
2016 	ifmr->ifm_active = IFM_ETHER;
2017 
2018 	if (vmxnet3_link_is_up(sc) != 0) {
2019 		ifmr->ifm_status |= IFM_ACTIVE;
2020 		ifmr->ifm_active |= IFM_AUTO;
2021 	} else
2022 		ifmr->ifm_active |= IFM_NONE;
2023 }
2024 
2025 static int
2026 vmxnet3_media_change(if_ctx_t ctx)
2027 {
2028 
2029 	/* Ignore. */
2030 	return (0);
2031 }
2032 
2033 static int
2034 vmxnet3_promisc_set(if_ctx_t ctx, int flags)
2035 {
2036 
2037 	vmxnet3_set_rxfilter(iflib_get_softc(ctx), flags);
2038 
2039 	return (0);
2040 }
2041 
2042 static uint64_t
2043 vmxnet3_get_counter(if_ctx_t ctx, ift_counter cnt)
2044 {
2045 	if_t ifp = iflib_get_ifp(ctx);
2046 
2047 	if (cnt < IFCOUNTERS)
2048 		return if_get_counter_default(ifp, cnt);
2049 
2050 	return (0);
2051 }
2052 
2053 static void
2054 vmxnet3_update_admin_status(if_ctx_t ctx)
2055 {
2056 	struct vmxnet3_softc *sc;
2057 
2058 	sc = iflib_get_softc(ctx);
2059 	if (sc->vmx_ds->event != 0)
2060 		vmxnet3_evintr(sc);
2061 
2062 	vmxnet3_refresh_host_stats(sc);
2063 }
2064 
2065 static void
2066 vmxnet3_txq_timer(if_ctx_t ctx, uint16_t qid)
2067 {
2068 	/* Host stats refresh is global, so just trigger it on txq 0 */
2069 	if (qid == 0)
2070 		vmxnet3_refresh_host_stats(iflib_get_softc(ctx));
2071 }
2072 
2073 static void
2074 vmxnet3_update_vlan_filter(struct vmxnet3_softc *sc, int add, uint16_t tag)
2075 {
2076 	int idx, bit;
2077 
2078 	if (tag == 0 || tag > 4095)
2079 		return;
2080 
2081 	idx = (tag >> 5) & 0x7F;
2082 	bit = tag & 0x1F;
2083 
2084 	/* Update our private VLAN bitvector. */
2085 	if (add)
2086 		sc->vmx_vlan_filter[idx] |= (1 << bit);
2087 	else
2088 		sc->vmx_vlan_filter[idx] &= ~(1 << bit);
2089 }
2090 
2091 static void
2092 vmxnet3_vlan_register(if_ctx_t ctx, uint16_t tag)
2093 {
2094 
2095 	vmxnet3_update_vlan_filter(iflib_get_softc(ctx), 1, tag);
2096 }
2097 
2098 static void
2099 vmxnet3_vlan_unregister(if_ctx_t ctx, uint16_t tag)
2100 {
2101 
2102 	vmxnet3_update_vlan_filter(iflib_get_softc(ctx), 0, tag);
2103 }
2104 
2105 static u_int
2106 vmxnet3_hash_maddr(void *arg, struct sockaddr_dl *sdl, u_int count)
2107 {
2108 	struct vmxnet3_softc *sc = arg;
2109 
2110 	if (count < VMXNET3_MULTICAST_MAX)
2111 		bcopy(LLADDR(sdl), &sc->vmx_mcast[count * ETHER_ADDR_LEN],
2112 		    ETHER_ADDR_LEN);
2113 
2114 	return (1);
2115 }
2116 
2117 static void
2118 vmxnet3_set_rxfilter(struct vmxnet3_softc *sc, int flags)
2119 {
2120 	if_t ifp;
2121 	struct vmxnet3_driver_shared *ds;
2122 	u_int mode;
2123 
2124 	ifp = sc->vmx_ifp;
2125 	ds = sc->vmx_ds;
2126 
2127 	mode = VMXNET3_RXMODE_UCAST | VMXNET3_RXMODE_BCAST;
2128 	if (flags & IFF_PROMISC)
2129 		mode |= VMXNET3_RXMODE_PROMISC;
2130 	if (flags & IFF_ALLMULTI)
2131 		mode |= VMXNET3_RXMODE_ALLMULTI;
2132 	else {
2133 		int cnt;
2134 
2135 		cnt = if_foreach_llmaddr(ifp, vmxnet3_hash_maddr, sc);
2136 		if (cnt >= VMXNET3_MULTICAST_MAX) {
2137 			cnt = 0;
2138 			mode |= VMXNET3_RXMODE_ALLMULTI;
2139 		} else if (cnt > 0)
2140 			mode |= VMXNET3_RXMODE_MCAST;
2141 		ds->mcast_tablelen = cnt * ETHER_ADDR_LEN;
2142 	}
2143 
2144 	ds->rxmode = mode;
2145 
2146 	vmxnet3_write_cmd(sc, VMXNET3_CMD_SET_FILTER);
2147 	vmxnet3_write_cmd(sc, VMXNET3_CMD_SET_RXMODE);
2148 }
2149 
2150 static void
2151 vmxnet3_refresh_host_stats(struct vmxnet3_softc *sc)
2152 {
2153 
2154 	vmxnet3_write_cmd(sc, VMXNET3_CMD_GET_STATS);
2155 }
2156 
2157 static int
2158 vmxnet3_link_is_up(struct vmxnet3_softc *sc)
2159 {
2160 	uint32_t status;
2161 
2162 	status = vmxnet3_read_cmd(sc, VMXNET3_CMD_GET_LINK);
2163 	return !!(status & 0x1);
2164 }
2165 
2166 static void
2167 vmxnet3_link_status(struct vmxnet3_softc *sc)
2168 {
2169 	if_ctx_t ctx;
2170 	uint64_t speed;
2171 	int link;
2172 
2173 	ctx = sc->vmx_ctx;
2174 	link = vmxnet3_link_is_up(sc);
2175 	speed = IF_Gbps(10);
2176 
2177 	if (link != 0 && sc->vmx_link_active == 0) {
2178 		sc->vmx_link_active = 1;
2179 		iflib_link_state_change(ctx, LINK_STATE_UP, speed);
2180 	} else if (link == 0 && sc->vmx_link_active != 0) {
2181 		sc->vmx_link_active = 0;
2182 		iflib_link_state_change(ctx, LINK_STATE_DOWN, speed);
2183 	}
2184 }
2185 
2186 static void
2187 vmxnet3_set_lladdr(struct vmxnet3_softc *sc)
2188 {
2189 	uint32_t ml, mh;
2190 
2191 	ml  = sc->vmx_lladdr[0];
2192 	ml |= sc->vmx_lladdr[1] << 8;
2193 	ml |= sc->vmx_lladdr[2] << 16;
2194 	ml |= sc->vmx_lladdr[3] << 24;
2195 	vmxnet3_write_bar1(sc, VMXNET3_BAR1_MACL, ml);
2196 
2197 	mh  = sc->vmx_lladdr[4];
2198 	mh |= sc->vmx_lladdr[5] << 8;
2199 	vmxnet3_write_bar1(sc, VMXNET3_BAR1_MACH, mh);
2200 }
2201 
2202 static void
2203 vmxnet3_get_lladdr(struct vmxnet3_softc *sc)
2204 {
2205 	uint32_t ml, mh;
2206 
2207 	ml = vmxnet3_read_cmd(sc, VMXNET3_CMD_GET_MACL);
2208 	mh = vmxnet3_read_cmd(sc, VMXNET3_CMD_GET_MACH);
2209 
2210 	sc->vmx_lladdr[0] = ml;
2211 	sc->vmx_lladdr[1] = ml >> 8;
2212 	sc->vmx_lladdr[2] = ml >> 16;
2213 	sc->vmx_lladdr[3] = ml >> 24;
2214 	sc->vmx_lladdr[4] = mh;
2215 	sc->vmx_lladdr[5] = mh >> 8;
2216 }
2217 
2218 static void
2219 vmxnet3_setup_txq_sysctl(struct vmxnet3_txqueue *txq,
2220     struct sysctl_ctx_list *ctx, struct sysctl_oid_list *child)
2221 {
2222 	struct sysctl_oid *node, *txsnode;
2223 	struct sysctl_oid_list *list, *txslist;
2224 	struct UPT1_TxStats *txstats;
2225 	char namebuf[16];
2226 
2227 	txstats = &txq->vxtxq_ts->stats;
2228 
2229 	snprintf(namebuf, sizeof(namebuf), "txq%d", txq->vxtxq_id);
2230 	node = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, namebuf,
2231 	    CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "Transmit Queue");
2232 	txq->vxtxq_sysctl = list = SYSCTL_CHILDREN(node);
2233 
2234 	/*
2235 	 * Add statistics reported by the host. These are updated by the
2236 	 * iflib txq timer on txq 0.
2237 	 */
2238 	txsnode = SYSCTL_ADD_NODE(ctx, list, OID_AUTO, "hstats",
2239 	    CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "Host Statistics");
2240 	txslist = SYSCTL_CHILDREN(txsnode);
2241 	SYSCTL_ADD_UQUAD(ctx, txslist, OID_AUTO, "tso_packets", CTLFLAG_RD,
2242 	    &txstats->TSO_packets, "TSO packets");
2243 	SYSCTL_ADD_UQUAD(ctx, txslist, OID_AUTO, "tso_bytes", CTLFLAG_RD,
2244 	    &txstats->TSO_bytes, "TSO bytes");
2245 	SYSCTL_ADD_UQUAD(ctx, txslist, OID_AUTO, "ucast_packets", CTLFLAG_RD,
2246 	    &txstats->ucast_packets, "Unicast packets");
2247 	SYSCTL_ADD_UQUAD(ctx, txslist, OID_AUTO, "unicast_bytes", CTLFLAG_RD,
2248 	    &txstats->ucast_bytes, "Unicast bytes");
2249 	SYSCTL_ADD_UQUAD(ctx, txslist, OID_AUTO, "mcast_packets", CTLFLAG_RD,
2250 	    &txstats->mcast_packets, "Multicast packets");
2251 	SYSCTL_ADD_UQUAD(ctx, txslist, OID_AUTO, "mcast_bytes", CTLFLAG_RD,
2252 	    &txstats->mcast_bytes, "Multicast bytes");
2253 	SYSCTL_ADD_UQUAD(ctx, txslist, OID_AUTO, "error", CTLFLAG_RD,
2254 	    &txstats->error, "Errors");
2255 	SYSCTL_ADD_UQUAD(ctx, txslist, OID_AUTO, "discard", CTLFLAG_RD,
2256 	    &txstats->discard, "Discards");
2257 }
2258 
2259 static void
2260 vmxnet3_setup_rxq_sysctl(struct vmxnet3_rxqueue *rxq,
2261     struct sysctl_ctx_list *ctx, struct sysctl_oid_list *child)
2262 {
2263 	struct sysctl_oid *node, *rxsnode;
2264 	struct sysctl_oid_list *list, *rxslist;
2265 	struct UPT1_RxStats *rxstats;
2266 	char namebuf[16];
2267 
2268 	rxstats = &rxq->vxrxq_rs->stats;
2269 
2270 	snprintf(namebuf, sizeof(namebuf), "rxq%d", rxq->vxrxq_id);
2271 	node = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, namebuf,
2272 	    CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "Receive Queue");
2273 	rxq->vxrxq_sysctl = list = SYSCTL_CHILDREN(node);
2274 
2275 	/*
2276 	 * Add statistics reported by the host. These are updated by the
2277 	 * iflib txq timer on txq 0.
2278 	 */
2279 	rxsnode = SYSCTL_ADD_NODE(ctx, list, OID_AUTO, "hstats",
2280 	    CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "Host Statistics");
2281 	rxslist = SYSCTL_CHILDREN(rxsnode);
2282 	SYSCTL_ADD_UQUAD(ctx, rxslist, OID_AUTO, "lro_packets", CTLFLAG_RD,
2283 	    &rxstats->LRO_packets, "LRO packets");
2284 	SYSCTL_ADD_UQUAD(ctx, rxslist, OID_AUTO, "lro_bytes", CTLFLAG_RD,
2285 	    &rxstats->LRO_bytes, "LRO bytes");
2286 	SYSCTL_ADD_UQUAD(ctx, rxslist, OID_AUTO, "ucast_packets", CTLFLAG_RD,
2287 	    &rxstats->ucast_packets, "Unicast packets");
2288 	SYSCTL_ADD_UQUAD(ctx, rxslist, OID_AUTO, "unicast_bytes", CTLFLAG_RD,
2289 	    &rxstats->ucast_bytes, "Unicast bytes");
2290 	SYSCTL_ADD_UQUAD(ctx, rxslist, OID_AUTO, "mcast_packets", CTLFLAG_RD,
2291 	    &rxstats->mcast_packets, "Multicast packets");
2292 	SYSCTL_ADD_UQUAD(ctx, rxslist, OID_AUTO, "mcast_bytes", CTLFLAG_RD,
2293 	    &rxstats->mcast_bytes, "Multicast bytes");
2294 	SYSCTL_ADD_UQUAD(ctx, rxslist, OID_AUTO, "bcast_packets", CTLFLAG_RD,
2295 	    &rxstats->bcast_packets, "Broadcast packets");
2296 	SYSCTL_ADD_UQUAD(ctx, rxslist, OID_AUTO, "bcast_bytes", CTLFLAG_RD,
2297 	    &rxstats->bcast_bytes, "Broadcast bytes");
2298 	SYSCTL_ADD_UQUAD(ctx, rxslist, OID_AUTO, "nobuffer", CTLFLAG_RD,
2299 	    &rxstats->nobuffer, "No buffer");
2300 	SYSCTL_ADD_UQUAD(ctx, rxslist, OID_AUTO, "error", CTLFLAG_RD,
2301 	    &rxstats->error, "Errors");
2302 }
2303 
2304 static void
2305 vmxnet3_setup_debug_sysctl(struct vmxnet3_softc *sc,
2306     struct sysctl_ctx_list *ctx, struct sysctl_oid_list *child)
2307 {
2308 	if_softc_ctx_t scctx;
2309 	struct sysctl_oid *node;
2310 	struct sysctl_oid_list *list;
2311 	int i;
2312 
2313 	scctx = sc->vmx_scctx;
2314 
2315 	for (i = 0; i < scctx->isc_ntxqsets; i++) {
2316 		struct vmxnet3_txqueue *txq = &sc->vmx_txq[i];
2317 
2318 		node = SYSCTL_ADD_NODE(ctx, txq->vxtxq_sysctl, OID_AUTO,
2319 		    "debug", CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "");
2320 		list = SYSCTL_CHILDREN(node);
2321 
2322 		SYSCTL_ADD_UINT(ctx, list, OID_AUTO, "cmd_next", CTLFLAG_RD,
2323 		    &txq->vxtxq_cmd_ring.vxtxr_next, 0, "");
2324 		SYSCTL_ADD_UINT(ctx, list, OID_AUTO, "cmd_ndesc", CTLFLAG_RD,
2325 		    &txq->vxtxq_cmd_ring.vxtxr_ndesc, 0, "");
2326 		SYSCTL_ADD_INT(ctx, list, OID_AUTO, "cmd_gen", CTLFLAG_RD,
2327 		    &txq->vxtxq_cmd_ring.vxtxr_gen, 0, "");
2328 		SYSCTL_ADD_UINT(ctx, list, OID_AUTO, "comp_next", CTLFLAG_RD,
2329 		    &txq->vxtxq_comp_ring.vxcr_next, 0, "");
2330 		SYSCTL_ADD_UINT(ctx, list, OID_AUTO, "comp_ndesc", CTLFLAG_RD,
2331 		    &txq->vxtxq_comp_ring.vxcr_ndesc, 0,"");
2332 		SYSCTL_ADD_INT(ctx, list, OID_AUTO, "comp_gen", CTLFLAG_RD,
2333 		    &txq->vxtxq_comp_ring.vxcr_gen, 0, "");
2334 	}
2335 
2336 	for (i = 0; i < scctx->isc_nrxqsets; i++) {
2337 		struct vmxnet3_rxqueue *rxq = &sc->vmx_rxq[i];
2338 
2339 		node = SYSCTL_ADD_NODE(ctx, rxq->vxrxq_sysctl, OID_AUTO,
2340 		    "debug", CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "");
2341 		list = SYSCTL_CHILDREN(node);
2342 
2343 		SYSCTL_ADD_UINT(ctx, list, OID_AUTO, "cmd0_ndesc", CTLFLAG_RD,
2344 		    &rxq->vxrxq_cmd_ring[0].vxrxr_ndesc, 0, "");
2345 		SYSCTL_ADD_INT(ctx, list, OID_AUTO, "cmd0_gen", CTLFLAG_RD,
2346 		    &rxq->vxrxq_cmd_ring[0].vxrxr_gen, 0, "");
2347 		SYSCTL_ADD_U64(ctx, list, OID_AUTO, "cmd0_desc_skips", CTLFLAG_RD,
2348 		    &rxq->vxrxq_cmd_ring[0].vxrxr_desc_skips, 0, "");
2349 		SYSCTL_ADD_UINT(ctx, list, OID_AUTO, "cmd1_ndesc", CTLFLAG_RD,
2350 		    &rxq->vxrxq_cmd_ring[1].vxrxr_ndesc, 0, "");
2351 		SYSCTL_ADD_INT(ctx, list, OID_AUTO, "cmd1_gen", CTLFLAG_RD,
2352 		    &rxq->vxrxq_cmd_ring[1].vxrxr_gen, 0, "");
2353 		SYSCTL_ADD_U64(ctx, list, OID_AUTO, "cmd1_desc_skips", CTLFLAG_RD,
2354 		    &rxq->vxrxq_cmd_ring[1].vxrxr_desc_skips, 0, "");
2355 		SYSCTL_ADD_UINT(ctx, list, OID_AUTO, "comp_ndesc", CTLFLAG_RD,
2356 		    &rxq->vxrxq_comp_ring.vxcr_ndesc, 0,"");
2357 		SYSCTL_ADD_INT(ctx, list, OID_AUTO, "comp_gen", CTLFLAG_RD,
2358 		    &rxq->vxrxq_comp_ring.vxcr_gen, 0, "");
2359 		SYSCTL_ADD_U64(ctx, list, OID_AUTO, "comp_zero_length", CTLFLAG_RD,
2360 		    &rxq->vxrxq_comp_ring.vxcr_zero_length, 0, "");
2361 		SYSCTL_ADD_U64(ctx, list, OID_AUTO, "comp_zero_length_frag",
2362 		    CTLFLAG_RD, &rxq->vxrxq_comp_ring.vcxr_zero_length_frag,
2363 		    0, "");
2364 		SYSCTL_ADD_U64(ctx, list, OID_AUTO, "comp_pkt_errors", CTLFLAG_RD,
2365 		    &rxq->vxrxq_comp_ring.vxcr_pkt_errors, 0, "");
2366 	}
2367 }
2368 
2369 static void
2370 vmxnet3_setup_queue_sysctl(struct vmxnet3_softc *sc,
2371     struct sysctl_ctx_list *ctx, struct sysctl_oid_list *child)
2372 {
2373 	if_softc_ctx_t scctx;
2374 	int i;
2375 
2376 	scctx = sc->vmx_scctx;
2377 
2378 	for (i = 0; i < scctx->isc_ntxqsets; i++)
2379 		vmxnet3_setup_txq_sysctl(&sc->vmx_txq[i], ctx, child);
2380 	for (i = 0; i < scctx->isc_nrxqsets; i++)
2381 		vmxnet3_setup_rxq_sysctl(&sc->vmx_rxq[i], ctx, child);
2382 
2383 	vmxnet3_setup_debug_sysctl(sc, ctx, child);
2384 }
2385 
2386 static void
2387 vmxnet3_setup_sysctl(struct vmxnet3_softc *sc)
2388 {
2389 	device_t dev;
2390 	struct sysctl_ctx_list *ctx;
2391 	struct sysctl_oid *tree;
2392 	struct sysctl_oid_list *child;
2393 
2394 	dev = sc->vmx_dev;
2395 	ctx = device_get_sysctl_ctx(dev);
2396 	tree = device_get_sysctl_tree(dev);
2397 	child = SYSCTL_CHILDREN(tree);
2398 
2399 	vmxnet3_setup_queue_sysctl(sc, ctx, child);
2400 }
2401 
2402 static void
2403 vmxnet3_write_bar0(struct vmxnet3_softc *sc, bus_size_t r, uint32_t v)
2404 {
2405 
2406 	bus_space_write_4(sc->vmx_iot0, sc->vmx_ioh0, r, v);
2407 }
2408 
2409 static uint32_t
2410 vmxnet3_read_bar1(struct vmxnet3_softc *sc, bus_size_t r)
2411 {
2412 
2413 	return (bus_space_read_4(sc->vmx_iot1, sc->vmx_ioh1, r));
2414 }
2415 
2416 static void
2417 vmxnet3_write_bar1(struct vmxnet3_softc *sc, bus_size_t r, uint32_t v)
2418 {
2419 
2420 	bus_space_write_4(sc->vmx_iot1, sc->vmx_ioh1, r, v);
2421 }
2422 
2423 static void
2424 vmxnet3_write_cmd(struct vmxnet3_softc *sc, uint32_t cmd)
2425 {
2426 
2427 	vmxnet3_write_bar1(sc, VMXNET3_BAR1_CMD, cmd);
2428 }
2429 
2430 static uint32_t
2431 vmxnet3_read_cmd(struct vmxnet3_softc *sc, uint32_t cmd)
2432 {
2433 
2434 	vmxnet3_write_cmd(sc, cmd);
2435 	bus_space_barrier(sc->vmx_iot1, sc->vmx_ioh1, 0, 0,
2436 	    BUS_SPACE_BARRIER_READ | BUS_SPACE_BARRIER_WRITE);
2437 	return (vmxnet3_read_bar1(sc, VMXNET3_BAR1_CMD));
2438 }
2439 
2440 static void
2441 vmxnet3_enable_intr(struct vmxnet3_softc *sc, int irq)
2442 {
2443 
2444 	vmxnet3_write_bar0(sc, VMXNET3_BAR0_IMASK(irq), 0);
2445 }
2446 
2447 static void
2448 vmxnet3_disable_intr(struct vmxnet3_softc *sc, int irq)
2449 {
2450 
2451 	vmxnet3_write_bar0(sc, VMXNET3_BAR0_IMASK(irq), 1);
2452 }
2453 
2454 static int
2455 vmxnet3_tx_queue_intr_enable(if_ctx_t ctx, uint16_t qid)
2456 {
2457 	/* Not using interrupts for TX */
2458 	return (0);
2459 }
2460 
2461 static int
2462 vmxnet3_rx_queue_intr_enable(if_ctx_t ctx, uint16_t qid)
2463 {
2464 	struct vmxnet3_softc *sc;
2465 
2466 	sc = iflib_get_softc(ctx);
2467 	vmxnet3_enable_intr(sc, sc->vmx_rxq[qid].vxrxq_intr_idx);
2468 	return (0);
2469 }
2470 
2471 static void
2472 vmxnet3_link_intr_enable(if_ctx_t ctx)
2473 {
2474 	struct vmxnet3_softc *sc;
2475 
2476 	sc = iflib_get_softc(ctx);
2477 	vmxnet3_enable_intr(sc, sc->vmx_event_intr_idx);
2478 }
2479 
2480 static void
2481 vmxnet3_intr_enable_all(if_ctx_t ctx)
2482 {
2483 	struct vmxnet3_softc *sc;
2484 	if_softc_ctx_t scctx;
2485 	int i;
2486 
2487 	sc = iflib_get_softc(ctx);
2488 	scctx = sc->vmx_scctx;
2489 	sc->vmx_ds->ictrl &= ~VMXNET3_ICTRL_DISABLE_ALL;
2490 	for (i = 0; i < scctx->isc_vectors; i++)
2491 		vmxnet3_enable_intr(sc, i);
2492 }
2493 
2494 static void
2495 vmxnet3_intr_disable_all(if_ctx_t ctx)
2496 {
2497 	struct vmxnet3_softc *sc;
2498 	int i;
2499 
2500 	sc = iflib_get_softc(ctx);
2501 	/*
2502 	 * iflib may invoke this routine before vmxnet3_attach_post() has
2503 	 * run, which is before the top level shared data area is
2504 	 * initialized and the device made aware of it.
2505 	 */
2506 	if (sc->vmx_ds != NULL)
2507 		sc->vmx_ds->ictrl |= VMXNET3_ICTRL_DISABLE_ALL;
2508 	for (i = 0; i < VMXNET3_MAX_INTRS; i++)
2509 		vmxnet3_disable_intr(sc, i);
2510 }
2511 
2512 static bool
2513 vmxnet3_if_needs_restart(if_ctx_t ctx __unused, enum iflib_restart_event event)
2514 {
2515 	switch (event) {
2516 	case IFLIB_RESTART_VLAN_CONFIG:
2517 		return (true);
2518 	default:
2519 		return (false);
2520 	}
2521 }
2522 
2523 /*
2524  * Since this is a purely paravirtualized device, we do not have
2525  * to worry about DMA coherency. But at times, we must make sure
2526  * both the compiler and CPU do not reorder memory operations.
2527  */
2528 static inline void
2529 vmxnet3_barrier(struct vmxnet3_softc *sc, vmxnet3_barrier_t type)
2530 {
2531 
2532 	switch (type) {
2533 	case VMXNET3_BARRIER_RD:
2534 		rmb();
2535 		break;
2536 	case VMXNET3_BARRIER_WR:
2537 		wmb();
2538 		break;
2539 	case VMXNET3_BARRIER_RDWR:
2540 		mb();
2541 		break;
2542 	default:
2543 		panic("%s: bad barrier type %d", __func__, type);
2544 	}
2545 }
2546