xref: /freebsd/sys/net/iflib.c (revision 88b187401d5186e7274a9f064efa8a16d5fa76ea)
1  /*-
2   * Copyright (c) 2014-2018, Matthew Macy <mmacy@mattmacy.io>
3   * All rights reserved.
4   *
5   * Redistribution and use in source and binary forms, with or without
6   * modification, are permitted provided that the following conditions are met:
7   *
8   *  1. Redistributions of source code must retain the above copyright notice,
9   *     this list of conditions and the following disclaimer.
10   *
11   *  2. Neither the name of Matthew Macy nor the names of its
12   *     contributors may be used to endorse or promote products derived from
13   *     this software without specific prior written permission.
14   *
15   * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
16   * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17   * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18   * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
19   * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
20   * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
21   * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
22   * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
23   * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
24   * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
25   * POSSIBILITY OF SUCH DAMAGE.
26   */
27  
28  #include <sys/cdefs.h>
29  #include "opt_inet.h"
30  #include "opt_inet6.h"
31  #include "opt_acpi.h"
32  #include "opt_sched.h"
33  
34  #include <sys/param.h>
35  #include <sys/types.h>
36  #include <sys/bus.h>
37  #include <sys/eventhandler.h>
38  #include <sys/kernel.h>
39  #include <sys/lock.h>
40  #include <sys/mutex.h>
41  #include <sys/module.h>
42  #include <sys/kobj.h>
43  #include <sys/rman.h>
44  #include <sys/sbuf.h>
45  #include <sys/smp.h>
46  #include <sys/socket.h>
47  #include <sys/sockio.h>
48  #include <sys/sysctl.h>
49  #include <sys/syslog.h>
50  #include <sys/taskqueue.h>
51  #include <sys/limits.h>
52  
53  #include <net/if.h>
54  #include <net/if_var.h>
55  #include <net/if_private.h>
56  #include <net/if_types.h>
57  #include <net/if_media.h>
58  #include <net/bpf.h>
59  #include <net/ethernet.h>
60  #include <net/mp_ring.h>
61  #include <net/debugnet.h>
62  #include <net/pfil.h>
63  #include <net/vnet.h>
64  
65  #include <netinet/in.h>
66  #include <netinet/in_pcb.h>
67  #include <netinet/tcp_lro.h>
68  #include <netinet/in_systm.h>
69  #include <netinet/if_ether.h>
70  #include <netinet/ip.h>
71  #include <netinet/ip6.h>
72  #include <netinet/tcp.h>
73  #include <netinet/ip_var.h>
74  #include <netinet6/ip6_var.h>
75  
76  #include <machine/bus.h>
77  #include <machine/in_cksum.h>
78  
79  #include <vm/vm.h>
80  #include <vm/pmap.h>
81  
82  #include <dev/led/led.h>
83  #include <dev/pci/pcireg.h>
84  #include <dev/pci/pcivar.h>
85  #include <dev/pci/pci_private.h>
86  
87  #include <net/iflib.h>
88  
89  #include "ifdi_if.h"
90  
91  #ifdef PCI_IOV
92  #include <dev/pci/pci_iov.h>
93  #endif
94  
95  #include <sys/bitstring.h>
96  /*
97   * enable accounting of every mbuf as it comes in to and goes out of
98   * iflib's software descriptor references
99   */
100  #define MEMORY_LOGGING 0
101  /*
102   * Enable mbuf vectors for compressing long mbuf chains
103   */
104  
105  /*
106   * NB:
107   * - Prefetching in tx cleaning should perhaps be a tunable. The distance ahead
108   *   we prefetch needs to be determined by the time spent in m_free vis a vis
109   *   the cost of a prefetch. This will of course vary based on the workload:
110   *      - NFLX's m_free path is dominated by vm-based M_EXT manipulation which
111   *        is quite expensive, thus suggesting very little prefetch.
112   *      - small packet forwarding which is just returning a single mbuf to
113   *        UMA will typically be very fast vis a vis the cost of a memory
114   *        access.
115   */
116  
117  /*
118   * File organization:
119   *  - private structures
120   *  - iflib private utility functions
121   *  - ifnet functions
122   *  - vlan registry and other exported functions
123   *  - iflib public core functions
124   *
125   *
126   */
127  static MALLOC_DEFINE(M_IFLIB, "iflib", "ifnet library");
128  
129  #define	IFLIB_RXEOF_MORE	(1U << 0)
130  #define	IFLIB_RXEOF_EMPTY	(2U << 0)
131  
132  struct iflib_txq;
133  typedef struct iflib_txq *iflib_txq_t;
134  struct iflib_rxq;
135  typedef struct iflib_rxq *iflib_rxq_t;
136  struct iflib_fl;
137  typedef struct iflib_fl *iflib_fl_t;
138  
139  struct iflib_ctx;
140  
141  static void iru_init(if_rxd_update_t iru, iflib_rxq_t rxq, uint8_t flid);
142  static void iflib_timer(void *arg);
143  static void iflib_tqg_detach(if_ctx_t ctx);
144  
145  typedef struct iflib_filter_info {
146  	driver_filter_t *ifi_filter;
147  	void *ifi_filter_arg;
148  	struct grouptask *ifi_task;
149  	void *ifi_ctx;
150  } *iflib_filter_info_t;
151  
152  struct iflib_ctx {
153  	KOBJ_FIELDS;
154  	/*
155  	 * Pointer to hardware driver's softc
156  	 */
157  	void *ifc_softc;
158  	device_t ifc_dev;
159  	if_t ifc_ifp;
160  
161  	cpuset_t ifc_cpus;
162  	if_shared_ctx_t ifc_sctx;
163  	struct if_softc_ctx ifc_softc_ctx;
164  
165  	struct sx ifc_ctx_sx;
166  	struct mtx ifc_state_mtx;
167  
168  	iflib_txq_t ifc_txqs;
169  	iflib_rxq_t ifc_rxqs;
170  	uint32_t ifc_if_flags;
171  	uint32_t ifc_flags;
172  	uint32_t ifc_max_fl_buf_size;
173  	uint32_t ifc_rx_mbuf_sz;
174  
175  	int ifc_link_state;
176  	int ifc_watchdog_events;
177  	struct cdev *ifc_led_dev;
178  	struct resource *ifc_msix_mem;
179  
180  	struct if_irq ifc_legacy_irq;
181  	struct task ifc_admin_task;
182  	struct task ifc_vflr_task;
183  	struct taskqueue *ifc_tq;
184  	struct iflib_filter_info ifc_filter_info;
185  	struct ifmedia	ifc_media;
186  	struct ifmedia	*ifc_mediap;
187  
188  	struct sysctl_oid *ifc_sysctl_node;
189  	uint16_t ifc_sysctl_ntxqs;
190  	uint16_t ifc_sysctl_nrxqs;
191  	uint16_t ifc_sysctl_qs_eq_override;
192  	uint16_t ifc_sysctl_rx_budget;
193  	uint16_t ifc_sysctl_tx_abdicate;
194  	uint16_t ifc_sysctl_core_offset;
195  #define	CORE_OFFSET_UNSPECIFIED	0xffff
196  	uint8_t  ifc_sysctl_separate_txrx;
197  	uint8_t  ifc_sysctl_use_logical_cores;
198  	uint16_t ifc_sysctl_extra_msix_vectors;
199  	bool	 ifc_cpus_are_physical_cores;
200  
201  	qidx_t ifc_sysctl_ntxds[8];
202  	qidx_t ifc_sysctl_nrxds[8];
203  	struct if_txrx ifc_txrx;
204  #define isc_txd_encap		ifc_txrx.ift_txd_encap
205  #define isc_txd_flush		ifc_txrx.ift_txd_flush
206  #define isc_txd_credits_update	ifc_txrx.ift_txd_credits_update
207  #define isc_rxd_available	ifc_txrx.ift_rxd_available
208  #define isc_rxd_pkt_get		ifc_txrx.ift_rxd_pkt_get
209  #define isc_rxd_refill		ifc_txrx.ift_rxd_refill
210  #define isc_rxd_flush		ifc_txrx.ift_rxd_flush
211  #define isc_legacy_intr		ifc_txrx.ift_legacy_intr
212  #define isc_txq_select		ifc_txrx.ift_txq_select
213  #define isc_txq_select_v2	ifc_txrx.ift_txq_select_v2
214  
215  	eventhandler_tag ifc_vlan_attach_event;
216  	eventhandler_tag ifc_vlan_detach_event;
217  	struct ether_addr ifc_mac;
218  };
219  
220  void *
iflib_get_softc(if_ctx_t ctx)221  iflib_get_softc(if_ctx_t ctx)
222  {
223  
224  	return (ctx->ifc_softc);
225  }
226  
227  device_t
iflib_get_dev(if_ctx_t ctx)228  iflib_get_dev(if_ctx_t ctx)
229  {
230  
231  	return (ctx->ifc_dev);
232  }
233  
234  if_t
iflib_get_ifp(if_ctx_t ctx)235  iflib_get_ifp(if_ctx_t ctx)
236  {
237  
238  	return (ctx->ifc_ifp);
239  }
240  
241  struct ifmedia *
iflib_get_media(if_ctx_t ctx)242  iflib_get_media(if_ctx_t ctx)
243  {
244  
245  	return (ctx->ifc_mediap);
246  }
247  
248  void
iflib_set_mac(if_ctx_t ctx,uint8_t mac[ETHER_ADDR_LEN])249  iflib_set_mac(if_ctx_t ctx, uint8_t mac[ETHER_ADDR_LEN])
250  {
251  
252  	bcopy(mac, ctx->ifc_mac.octet, ETHER_ADDR_LEN);
253  }
254  
255  if_softc_ctx_t
iflib_get_softc_ctx(if_ctx_t ctx)256  iflib_get_softc_ctx(if_ctx_t ctx)
257  {
258  
259  	return (&ctx->ifc_softc_ctx);
260  }
261  
262  if_shared_ctx_t
iflib_get_sctx(if_ctx_t ctx)263  iflib_get_sctx(if_ctx_t ctx)
264  {
265  
266  	return (ctx->ifc_sctx);
267  }
268  
269  uint16_t
iflib_get_extra_msix_vectors_sysctl(if_ctx_t ctx)270  iflib_get_extra_msix_vectors_sysctl(if_ctx_t ctx)
271  {
272  
273  	return (ctx->ifc_sysctl_extra_msix_vectors);
274  }
275  
276  #define IP_ALIGNED(m)		((((uintptr_t)(m)->m_data) & 0x3) == 0x2)
277  #define CACHE_PTR_INCREMENT	(CACHE_LINE_SIZE / sizeof(void *))
278  #define CACHE_PTR_NEXT(ptr)	((void *)(roundup2(ptr, CACHE_LINE_SIZE)))
279  
280  #define LINK_ACTIVE(ctx)	((ctx)->ifc_link_state == LINK_STATE_UP)
281  #define CTX_IS_VF(ctx)		((ctx)->ifc_sctx->isc_flags & IFLIB_IS_VF)
282  
283  typedef struct iflib_sw_rx_desc_array {
284  	bus_dmamap_t	*ifsd_map;         /* bus_dma maps for packet */
285  	struct mbuf	**ifsd_m;           /* pkthdr mbufs */
286  	caddr_t		*ifsd_cl;          /* direct cluster pointer for rx */
287  	bus_addr_t	*ifsd_ba;          /* bus addr of cluster for rx */
288  } iflib_rxsd_array_t;
289  
290  typedef struct iflib_sw_tx_desc_array {
291  	bus_dmamap_t    *ifsd_map;         /* bus_dma maps for packet */
292  	bus_dmamap_t	*ifsd_tso_map;     /* bus_dma maps for TSO packet */
293  	struct mbuf    **ifsd_m;           /* pkthdr mbufs */
294  } if_txsd_vec_t;
295  
296  /* magic number that should be high enough for any hardware */
297  #define IFLIB_MAX_TX_SEGS		128
298  #define IFLIB_RX_COPY_THRESH		128
299  #define IFLIB_MAX_RX_REFRESH		32
300  /* The minimum descriptors per second before we start coalescing */
301  #define IFLIB_MIN_DESC_SEC		16384
302  #define IFLIB_DEFAULT_TX_UPDATE_FREQ	16
303  #define IFLIB_QUEUE_IDLE		0
304  #define IFLIB_QUEUE_HUNG		1
305  #define IFLIB_QUEUE_WORKING		2
306  /* maximum number of txqs that can share an rx interrupt */
307  #define IFLIB_MAX_TX_SHARED_INTR	4
308  
309  /* this should really scale with ring size - this is a fairly arbitrary value */
310  #define TX_BATCH_SIZE			32
311  
312  #define IFLIB_RESTART_BUDGET		8
313  
314  #define	IFC_LEGACY		0x001
315  #define	IFC_QFLUSH		0x002
316  #define	IFC_MULTISEG		0x004
317  #define	IFC_SPARE1		0x008
318  #define	IFC_SC_ALLOCATED	0x010
319  #define	IFC_INIT_DONE		0x020
320  #define	IFC_PREFETCH		0x040
321  #define	IFC_DO_RESET		0x080
322  #define	IFC_DO_WATCHDOG		0x100
323  #define	IFC_SPARE0		0x200
324  #define	IFC_SPARE2		0x400
325  #define	IFC_IN_DETACH		0x800
326  
327  #define	IFC_NETMAP_TX_IRQ	0x80000000
328  
329  #define CSUM_OFFLOAD		(CSUM_IP_TSO | CSUM_IP6_TSO | CSUM_IP | \
330  				 CSUM_IP_UDP | CSUM_IP_TCP | CSUM_IP_SCTP | \
331  				 CSUM_IP6_UDP | CSUM_IP6_TCP | CSUM_IP6_SCTP)
332  
333  struct iflib_txq {
334  	qidx_t		ift_in_use;
335  	qidx_t		ift_cidx;
336  	qidx_t		ift_cidx_processed;
337  	qidx_t		ift_pidx;
338  	uint8_t		ift_gen;
339  	uint8_t		ift_br_offset;
340  	uint16_t	ift_npending;
341  	uint16_t	ift_db_pending;
342  	uint16_t	ift_rs_pending;
343  	/* implicit pad */
344  	uint8_t		ift_txd_size[8];
345  	uint64_t	ift_processed;
346  	uint64_t	ift_cleaned;
347  	uint64_t	ift_cleaned_prev;
348  #if MEMORY_LOGGING
349  	uint64_t	ift_enqueued;
350  	uint64_t	ift_dequeued;
351  #endif
352  	uint64_t	ift_no_tx_dma_setup;
353  	uint64_t	ift_no_desc_avail;
354  	uint64_t	ift_mbuf_defrag_failed;
355  	uint64_t	ift_mbuf_defrag;
356  	uint64_t	ift_map_failed;
357  	uint64_t	ift_txd_encap_efbig;
358  	uint64_t	ift_pullups;
359  	uint64_t	ift_last_timer_tick;
360  
361  	struct mtx	ift_mtx;
362  	struct mtx	ift_db_mtx;
363  
364  	/* constant values */
365  	if_ctx_t	ift_ctx;
366  	struct ifmp_ring        *ift_br;
367  	struct grouptask	ift_task;
368  	qidx_t		ift_size;
369  	uint16_t	ift_id;
370  	struct callout	ift_timer;
371  #ifdef DEV_NETMAP
372  	struct callout	ift_netmap_timer;
373  #endif /* DEV_NETMAP */
374  
375  	if_txsd_vec_t	ift_sds;
376  	uint8_t		ift_qstatus;
377  	uint8_t		ift_closed;
378  	uint8_t		ift_update_freq;
379  	struct iflib_filter_info ift_filter_info;
380  	bus_dma_tag_t	ift_buf_tag;
381  	bus_dma_tag_t	ift_tso_buf_tag;
382  	iflib_dma_info_t	ift_ifdi;
383  #define	MTX_NAME_LEN	32
384  	char                    ift_mtx_name[MTX_NAME_LEN];
385  	bus_dma_segment_t	ift_segs[IFLIB_MAX_TX_SEGS]  __aligned(CACHE_LINE_SIZE);
386  #ifdef IFLIB_DIAGNOSTICS
387  	uint64_t ift_cpu_exec_count[256];
388  #endif
389  } __aligned(CACHE_LINE_SIZE);
390  
391  struct iflib_fl {
392  	qidx_t		ifl_cidx;
393  	qidx_t		ifl_pidx;
394  	qidx_t		ifl_credits;
395  	uint8_t		ifl_gen;
396  	uint8_t		ifl_rxd_size;
397  #if MEMORY_LOGGING
398  	uint64_t	ifl_m_enqueued;
399  	uint64_t	ifl_m_dequeued;
400  	uint64_t	ifl_cl_enqueued;
401  	uint64_t	ifl_cl_dequeued;
402  #endif
403  	/* implicit pad */
404  	bitstr_t 	*ifl_rx_bitmap;
405  	qidx_t		ifl_fragidx;
406  	/* constant */
407  	qidx_t		ifl_size;
408  	uint16_t	ifl_buf_size;
409  	uint16_t	ifl_cltype;
410  	uma_zone_t	ifl_zone;
411  	iflib_rxsd_array_t	ifl_sds;
412  	iflib_rxq_t	ifl_rxq;
413  	uint8_t		ifl_id;
414  	bus_dma_tag_t	ifl_buf_tag;
415  	iflib_dma_info_t	ifl_ifdi;
416  	uint64_t	ifl_bus_addrs[IFLIB_MAX_RX_REFRESH] __aligned(CACHE_LINE_SIZE);
417  	qidx_t		ifl_rxd_idxs[IFLIB_MAX_RX_REFRESH];
418  }  __aligned(CACHE_LINE_SIZE);
419  
420  static inline qidx_t
get_inuse(int size,qidx_t cidx,qidx_t pidx,uint8_t gen)421  get_inuse(int size, qidx_t cidx, qidx_t pidx, uint8_t gen)
422  {
423  	qidx_t used;
424  
425  	if (pidx > cidx)
426  		used = pidx - cidx;
427  	else if (pidx < cidx)
428  		used = size - cidx + pidx;
429  	else if (gen == 0 && pidx == cidx)
430  		used = 0;
431  	else if (gen == 1 && pidx == cidx)
432  		used = size;
433  	else
434  		panic("bad state");
435  
436  	return (used);
437  }
438  
439  #define TXQ_AVAIL(txq) (txq->ift_size - get_inuse(txq->ift_size, txq->ift_cidx, txq->ift_pidx, txq->ift_gen))
440  
441  #define IDXDIFF(head, tail, wrap) \
442  	((head) >= (tail) ? (head) - (tail) : (wrap) - (tail) + (head))
443  
444  struct iflib_rxq {
445  	if_ctx_t	ifr_ctx;
446  	iflib_fl_t	ifr_fl;
447  	uint64_t	ifr_rx_irq;
448  	struct pfil_head	*pfil;
449  	/*
450  	 * If there is a separate completion queue (IFLIB_HAS_RXCQ), this is
451  	 * the completion queue consumer index.  Otherwise it's unused.
452  	 */
453  	qidx_t		ifr_cq_cidx;
454  	uint16_t	ifr_id;
455  	uint8_t		ifr_nfl;
456  	uint8_t		ifr_ntxqirq;
457  	uint8_t		ifr_txqid[IFLIB_MAX_TX_SHARED_INTR];
458  	uint8_t		ifr_fl_offset;
459  	struct lro_ctrl		ifr_lc;
460  	struct grouptask        ifr_task;
461  	struct callout		ifr_watchdog;
462  	struct iflib_filter_info ifr_filter_info;
463  	iflib_dma_info_t	ifr_ifdi;
464  
465  	/* dynamically allocate if any drivers need a value substantially larger than this */
466  	struct if_rxd_frag	ifr_frags[IFLIB_MAX_RX_SEGS] __aligned(CACHE_LINE_SIZE);
467  #ifdef IFLIB_DIAGNOSTICS
468  	uint64_t ifr_cpu_exec_count[256];
469  #endif
470  }  __aligned(CACHE_LINE_SIZE);
471  
472  typedef struct if_rxsd {
473  	caddr_t *ifsd_cl;
474  	iflib_fl_t ifsd_fl;
475  } *if_rxsd_t;
476  
477  /* multiple of word size */
478  #ifdef __LP64__
479  #define PKT_INFO_SIZE	6
480  #define RXD_INFO_SIZE	5
481  #define PKT_TYPE uint64_t
482  #else
483  #define PKT_INFO_SIZE	11
484  #define RXD_INFO_SIZE	8
485  #define PKT_TYPE uint32_t
486  #endif
487  #define PKT_LOOP_BOUND	((PKT_INFO_SIZE / 3) * 3)
488  #define RXD_LOOP_BOUND	((RXD_INFO_SIZE / 4) * 4)
489  
490  typedef struct if_pkt_info_pad {
491  	PKT_TYPE pkt_val[PKT_INFO_SIZE];
492  } *if_pkt_info_pad_t;
493  typedef struct if_rxd_info_pad {
494  	PKT_TYPE rxd_val[RXD_INFO_SIZE];
495  } *if_rxd_info_pad_t;
496  
497  CTASSERT(sizeof(struct if_pkt_info_pad) == sizeof(struct if_pkt_info));
498  CTASSERT(sizeof(struct if_rxd_info_pad) == sizeof(struct if_rxd_info));
499  
500  static inline void
pkt_info_zero(if_pkt_info_t pi)501  pkt_info_zero(if_pkt_info_t pi)
502  {
503  	if_pkt_info_pad_t pi_pad;
504  
505  	pi_pad = (if_pkt_info_pad_t)pi;
506  	pi_pad->pkt_val[0] = 0; pi_pad->pkt_val[1] = 0; pi_pad->pkt_val[2] = 0;
507  	pi_pad->pkt_val[3] = 0; pi_pad->pkt_val[4] = 0; pi_pad->pkt_val[5] = 0;
508  #ifndef __LP64__
509  	pi_pad->pkt_val[6] = 0; pi_pad->pkt_val[7] = 0; pi_pad->pkt_val[8] = 0;
510  	pi_pad->pkt_val[9] = 0; pi_pad->pkt_val[10] = 0;
511  #endif
512  }
513  
514  static inline void
rxd_info_zero(if_rxd_info_t ri)515  rxd_info_zero(if_rxd_info_t ri)
516  {
517  	if_rxd_info_pad_t ri_pad;
518  	int i;
519  
520  	ri_pad = (if_rxd_info_pad_t)ri;
521  	for (i = 0; i < RXD_LOOP_BOUND; i += 4) {
522  		ri_pad->rxd_val[i] = 0;
523  		ri_pad->rxd_val[i + 1] = 0;
524  		ri_pad->rxd_val[i + 2] = 0;
525  		ri_pad->rxd_val[i + 3] = 0;
526  	}
527  #ifdef __LP64__
528  	ri_pad->rxd_val[RXD_INFO_SIZE - 1] = 0;
529  #endif
530  }
531  
532  /*
533   * Only allow a single packet to take up most 1/nth of the tx ring
534   */
535  #define MAX_SINGLE_PACKET_FRACTION 12
536  #define IF_BAD_DMA	((bus_addr_t)-1)
537  
538  #define CTX_ACTIVE(ctx)	((if_getdrvflags((ctx)->ifc_ifp) & IFF_DRV_RUNNING))
539  
540  #define CTX_LOCK_INIT(_sc)	sx_init(&(_sc)->ifc_ctx_sx, "iflib ctx lock")
541  #define CTX_LOCK(ctx)		sx_xlock(&(ctx)->ifc_ctx_sx)
542  #define CTX_UNLOCK(ctx)		sx_xunlock(&(ctx)->ifc_ctx_sx)
543  #define CTX_LOCK_DESTROY(ctx)	sx_destroy(&(ctx)->ifc_ctx_sx)
544  
545  #define STATE_LOCK_INIT(_sc, _name)	mtx_init(&(_sc)->ifc_state_mtx, _name, "iflib state lock", MTX_DEF)
546  #define STATE_LOCK(ctx)		mtx_lock(&(ctx)->ifc_state_mtx)
547  #define STATE_UNLOCK(ctx)	mtx_unlock(&(ctx)->ifc_state_mtx)
548  #define STATE_LOCK_DESTROY(ctx)	mtx_destroy(&(ctx)->ifc_state_mtx)
549  
550  #define CALLOUT_LOCK(txq)	mtx_lock(&txq->ift_mtx)
551  #define CALLOUT_UNLOCK(txq) 	mtx_unlock(&txq->ift_mtx)
552  
553  /* Our boot-time initialization hook */
554  static int	iflib_module_event_handler(module_t, int, void *);
555  
556  static moduledata_t iflib_moduledata = {
557  	"iflib",
558  	iflib_module_event_handler,
559  	NULL
560  };
561  
562  DECLARE_MODULE(iflib, iflib_moduledata, SI_SUB_INIT_IF, SI_ORDER_ANY);
563  MODULE_VERSION(iflib, 1);
564  
565  MODULE_DEPEND(iflib, pci, 1, 1, 1);
566  MODULE_DEPEND(iflib, ether, 1, 1, 1);
567  
568  TASKQGROUP_DEFINE(if_io_tqg, mp_ncpus, 1);
569  TASKQGROUP_DEFINE(if_config_tqg, 1, 1);
570  
571  #ifndef IFLIB_DEBUG_COUNTERS
572  #ifdef INVARIANTS
573  #define IFLIB_DEBUG_COUNTERS 1
574  #else
575  #define IFLIB_DEBUG_COUNTERS 0
576  #endif /* !INVARIANTS */
577  #endif
578  
579  static SYSCTL_NODE(_net, OID_AUTO, iflib, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
580      "iflib driver parameters");
581  
582  /*
583   * XXX need to ensure that this can't accidentally cause the head to be moved backwards
584   */
585  static int iflib_min_tx_latency = 0;
586  SYSCTL_INT(_net_iflib, OID_AUTO, min_tx_latency, CTLFLAG_RW,
587      &iflib_min_tx_latency, 0,
588      "minimize transmit latency at the possible expense of throughput");
589  static int iflib_no_tx_batch = 0;
590  SYSCTL_INT(_net_iflib, OID_AUTO, no_tx_batch, CTLFLAG_RW,
591      &iflib_no_tx_batch, 0,
592      "minimize transmit latency at the possible expense of throughput");
593  static int iflib_timer_default = 1000;
594  SYSCTL_INT(_net_iflib, OID_AUTO, timer_default, CTLFLAG_RW,
595      &iflib_timer_default, 0, "number of ticks between iflib_timer calls");
596  
597  
598  #if IFLIB_DEBUG_COUNTERS
599  
600  static int iflib_tx_seen;
601  static int iflib_tx_sent;
602  static int iflib_tx_encap;
603  static int iflib_rx_allocs;
604  static int iflib_fl_refills;
605  static int iflib_fl_refills_large;
606  static int iflib_tx_frees;
607  
608  SYSCTL_INT(_net_iflib, OID_AUTO, tx_seen, CTLFLAG_RD, &iflib_tx_seen, 0,
609      "# TX mbufs seen");
610  SYSCTL_INT(_net_iflib, OID_AUTO, tx_sent, CTLFLAG_RD, &iflib_tx_sent, 0,
611      "# TX mbufs sent");
612  SYSCTL_INT(_net_iflib, OID_AUTO, tx_encap, CTLFLAG_RD, &iflib_tx_encap, 0,
613      "# TX mbufs encapped");
614  SYSCTL_INT(_net_iflib, OID_AUTO, tx_frees, CTLFLAG_RD, &iflib_tx_frees, 0,
615      "# TX frees");
616  SYSCTL_INT(_net_iflib, OID_AUTO, rx_allocs, CTLFLAG_RD, &iflib_rx_allocs, 0,
617      "# RX allocations");
618  SYSCTL_INT(_net_iflib, OID_AUTO, fl_refills, CTLFLAG_RD, &iflib_fl_refills, 0,
619      "# refills");
620  SYSCTL_INT(_net_iflib, OID_AUTO, fl_refills_large, CTLFLAG_RD,
621      &iflib_fl_refills_large, 0, "# large refills");
622  
623  static int iflib_txq_drain_flushing;
624  static int iflib_txq_drain_oactive;
625  static int iflib_txq_drain_notready;
626  
627  SYSCTL_INT(_net_iflib, OID_AUTO, txq_drain_flushing, CTLFLAG_RD,
628      &iflib_txq_drain_flushing, 0, "# drain flushes");
629  SYSCTL_INT(_net_iflib, OID_AUTO, txq_drain_oactive, CTLFLAG_RD,
630      &iflib_txq_drain_oactive, 0, "# drain oactives");
631  SYSCTL_INT(_net_iflib, OID_AUTO, txq_drain_notready, CTLFLAG_RD,
632      &iflib_txq_drain_notready, 0, "# drain notready");
633  
634  static int iflib_encap_load_mbuf_fail;
635  static int iflib_encap_pad_mbuf_fail;
636  static int iflib_encap_txq_avail_fail;
637  static int iflib_encap_txd_encap_fail;
638  
639  SYSCTL_INT(_net_iflib, OID_AUTO, encap_load_mbuf_fail, CTLFLAG_RD,
640      &iflib_encap_load_mbuf_fail, 0, "# busdma load failures");
641  SYSCTL_INT(_net_iflib, OID_AUTO, encap_pad_mbuf_fail, CTLFLAG_RD,
642      &iflib_encap_pad_mbuf_fail, 0, "# runt frame pad failures");
643  SYSCTL_INT(_net_iflib, OID_AUTO, encap_txq_avail_fail, CTLFLAG_RD,
644      &iflib_encap_txq_avail_fail, 0, "# txq avail failures");
645  SYSCTL_INT(_net_iflib, OID_AUTO, encap_txd_encap_fail, CTLFLAG_RD,
646      &iflib_encap_txd_encap_fail, 0, "# driver encap failures");
647  
648  static int iflib_task_fn_rxs;
649  static int iflib_rx_intr_enables;
650  static int iflib_fast_intrs;
651  static int iflib_rx_unavail;
652  static int iflib_rx_ctx_inactive;
653  static int iflib_rx_if_input;
654  static int iflib_rxd_flush;
655  
656  static int iflib_verbose_debug;
657  
658  SYSCTL_INT(_net_iflib, OID_AUTO, task_fn_rx, CTLFLAG_RD, &iflib_task_fn_rxs, 0,
659      "# task_fn_rx calls");
660  SYSCTL_INT(_net_iflib, OID_AUTO, rx_intr_enables, CTLFLAG_RD,
661      &iflib_rx_intr_enables, 0, "# RX intr enables");
662  SYSCTL_INT(_net_iflib, OID_AUTO, fast_intrs, CTLFLAG_RD, &iflib_fast_intrs, 0,
663      "# fast_intr calls");
664  SYSCTL_INT(_net_iflib, OID_AUTO, rx_unavail, CTLFLAG_RD, &iflib_rx_unavail, 0,
665      "# times rxeof called with no available data");
666  SYSCTL_INT(_net_iflib, OID_AUTO, rx_ctx_inactive, CTLFLAG_RD,
667      &iflib_rx_ctx_inactive, 0, "# times rxeof called with inactive context");
668  SYSCTL_INT(_net_iflib, OID_AUTO, rx_if_input, CTLFLAG_RD, &iflib_rx_if_input,
669      0, "# times rxeof called if_input");
670  SYSCTL_INT(_net_iflib, OID_AUTO, rxd_flush, CTLFLAG_RD, &iflib_rxd_flush, 0,
671      "# times rxd_flush called");
672  SYSCTL_INT(_net_iflib, OID_AUTO, verbose_debug, CTLFLAG_RW,
673      &iflib_verbose_debug, 0, "enable verbose debugging");
674  
675  #define DBG_COUNTER_INC(name) atomic_add_int(&(iflib_ ## name), 1)
676  static void
iflib_debug_reset(void)677  iflib_debug_reset(void)
678  {
679  	iflib_tx_seen = iflib_tx_sent = iflib_tx_encap = iflib_rx_allocs =
680  		iflib_fl_refills = iflib_fl_refills_large = iflib_tx_frees =
681  		iflib_txq_drain_flushing = iflib_txq_drain_oactive =
682  		iflib_txq_drain_notready =
683  		iflib_encap_load_mbuf_fail = iflib_encap_pad_mbuf_fail =
684  		iflib_encap_txq_avail_fail = iflib_encap_txd_encap_fail =
685  		iflib_task_fn_rxs = iflib_rx_intr_enables = iflib_fast_intrs =
686  		iflib_rx_unavail =
687  		iflib_rx_ctx_inactive = iflib_rx_if_input =
688  		iflib_rxd_flush = 0;
689  }
690  
691  #else
692  #define DBG_COUNTER_INC(name)
iflib_debug_reset(void)693  static void iflib_debug_reset(void) {}
694  #endif
695  
696  #define IFLIB_DEBUG 0
697  
698  static void iflib_tx_structures_free(if_ctx_t ctx);
699  static void iflib_rx_structures_free(if_ctx_t ctx);
700  static int iflib_queues_alloc(if_ctx_t ctx);
701  static int iflib_tx_credits_update(if_ctx_t ctx, iflib_txq_t txq);
702  static int iflib_rxd_avail(if_ctx_t ctx, iflib_rxq_t rxq, qidx_t cidx, qidx_t budget);
703  static int iflib_qset_structures_setup(if_ctx_t ctx);
704  static int iflib_msix_init(if_ctx_t ctx);
705  static int iflib_legacy_setup(if_ctx_t ctx, driver_filter_t filter, void *filterarg, int *rid, const char *str);
706  static void iflib_txq_check_drain(iflib_txq_t txq, int budget);
707  static uint32_t iflib_txq_can_drain(struct ifmp_ring *);
708  #ifdef ALTQ
709  static void iflib_altq_if_start(if_t ifp);
710  static int iflib_altq_if_transmit(if_t ifp, struct mbuf *m);
711  #endif
712  static int iflib_register(if_ctx_t);
713  static void iflib_deregister(if_ctx_t);
714  static void iflib_unregister_vlan_handlers(if_ctx_t ctx);
715  static uint16_t iflib_get_mbuf_size_for(unsigned int size);
716  static void iflib_init_locked(if_ctx_t ctx);
717  static void iflib_add_device_sysctl_pre(if_ctx_t ctx);
718  static void iflib_add_device_sysctl_post(if_ctx_t ctx);
719  static void iflib_ifmp_purge(iflib_txq_t txq);
720  static void _iflib_pre_assert(if_softc_ctx_t scctx);
721  static void iflib_stop(if_ctx_t ctx);
722  static void iflib_if_init_locked(if_ctx_t ctx);
723  static void iflib_free_intr_mem(if_ctx_t ctx);
724  #ifndef __NO_STRICT_ALIGNMENT
725  static struct mbuf *iflib_fixup_rx(struct mbuf *m);
726  #endif
727  
728  static SLIST_HEAD(cpu_offset_list, cpu_offset) cpu_offsets =
729      SLIST_HEAD_INITIALIZER(cpu_offsets);
730  struct cpu_offset {
731  	SLIST_ENTRY(cpu_offset) entries;
732  	cpuset_t	set;
733  	unsigned int	refcount;
734  	uint16_t	next_cpuid;
735  };
736  static struct mtx cpu_offset_mtx;
737  MTX_SYSINIT(iflib_cpu_offset, &cpu_offset_mtx, "iflib_cpu_offset lock",
738      MTX_DEF);
739  
740  DEBUGNET_DEFINE(iflib);
741  
742  static int
iflib_num_rx_descs(if_ctx_t ctx)743  iflib_num_rx_descs(if_ctx_t ctx)
744  {
745  	if_softc_ctx_t scctx = &ctx->ifc_softc_ctx;
746  	if_shared_ctx_t sctx = ctx->ifc_sctx;
747  	uint16_t first_rxq = (sctx->isc_flags & IFLIB_HAS_RXCQ) ? 1 : 0;
748  
749  	return (scctx->isc_nrxd[first_rxq]);
750  }
751  
752  static int
iflib_num_tx_descs(if_ctx_t ctx)753  iflib_num_tx_descs(if_ctx_t ctx)
754  {
755  	if_softc_ctx_t scctx = &ctx->ifc_softc_ctx;
756  	if_shared_ctx_t sctx = ctx->ifc_sctx;
757  	uint16_t first_txq = (sctx->isc_flags & IFLIB_HAS_TXCQ) ? 1 : 0;
758  
759  	return (scctx->isc_ntxd[first_txq]);
760  }
761  
762  #ifdef DEV_NETMAP
763  #include <sys/selinfo.h>
764  #include <net/netmap.h>
765  #include <dev/netmap/netmap_kern.h>
766  
767  MODULE_DEPEND(iflib, netmap, 1, 1, 1);
768  
769  static int netmap_fl_refill(iflib_rxq_t rxq, struct netmap_kring *kring, bool init);
770  static void iflib_netmap_timer(void *arg);
771  
772  /*
773   * device-specific sysctl variables:
774   *
775   * iflib_crcstrip: 0: keep CRC in rx frames (default), 1: strip it.
776   *	During regular operations the CRC is stripped, but on some
777   *	hardware reception of frames not multiple of 64 is slower,
778   *	so using crcstrip=0 helps in benchmarks.
779   *
780   * iflib_rx_miss, iflib_rx_miss_bufs:
781   *	count packets that might be missed due to lost interrupts.
782   */
783  SYSCTL_DECL(_dev_netmap);
784  /*
785   * The xl driver by default strips CRCs and we do not override it.
786   */
787  
788  int iflib_crcstrip = 1;
789  SYSCTL_INT(_dev_netmap, OID_AUTO, iflib_crcstrip,
790      CTLFLAG_RW, &iflib_crcstrip, 1, "strip CRC on RX frames");
791  
792  int iflib_rx_miss, iflib_rx_miss_bufs;
793  SYSCTL_INT(_dev_netmap, OID_AUTO, iflib_rx_miss,
794      CTLFLAG_RW, &iflib_rx_miss, 0, "potentially missed RX intr");
795  SYSCTL_INT(_dev_netmap, OID_AUTO, iflib_rx_miss_bufs,
796      CTLFLAG_RW, &iflib_rx_miss_bufs, 0, "potentially missed RX intr bufs");
797  
798  /*
799   * Register/unregister. We are already under netmap lock.
800   * Only called on the first register or the last unregister.
801   */
802  static int
iflib_netmap_register(struct netmap_adapter * na,int onoff)803  iflib_netmap_register(struct netmap_adapter *na, int onoff)
804  {
805  	if_t ifp = na->ifp;
806  	if_ctx_t ctx = if_getsoftc(ifp);
807  	int status;
808  
809  	CTX_LOCK(ctx);
810  	if (!CTX_IS_VF(ctx))
811  		IFDI_CRCSTRIP_SET(ctx, onoff, iflib_crcstrip);
812  
813  	iflib_stop(ctx);
814  
815  	/*
816  	 * Enable (or disable) netmap flags, and intercept (or restore)
817  	 * ifp->if_transmit. This is done once the device has been stopped
818  	 * to prevent race conditions. Also, this must be done after
819  	 * calling netmap_disable_all_rings() and before calling
820  	 * netmap_enable_all_rings(), so that these two functions see the
821  	 * updated state of the NAF_NETMAP_ON bit.
822  	 */
823  	if (onoff) {
824  		nm_set_native_flags(na);
825  	} else {
826  		nm_clear_native_flags(na);
827  	}
828  
829  	iflib_init_locked(ctx);
830  	IFDI_CRCSTRIP_SET(ctx, onoff, iflib_crcstrip); // XXX why twice ?
831  	status = if_getdrvflags(ifp) & IFF_DRV_RUNNING ? 0 : 1;
832  	if (status)
833  		nm_clear_native_flags(na);
834  	CTX_UNLOCK(ctx);
835  	return (status);
836  }
837  
838  static int
iflib_netmap_config(struct netmap_adapter * na,struct nm_config_info * info)839  iflib_netmap_config(struct netmap_adapter *na, struct nm_config_info *info)
840  {
841  	if_t ifp = na->ifp;
842  	if_ctx_t ctx = if_getsoftc(ifp);
843  	iflib_rxq_t rxq = &ctx->ifc_rxqs[0];
844  	iflib_fl_t fl = &rxq->ifr_fl[0];
845  
846  	info->num_tx_rings = ctx->ifc_softc_ctx.isc_ntxqsets;
847  	info->num_rx_rings = ctx->ifc_softc_ctx.isc_nrxqsets;
848  	info->num_tx_descs = iflib_num_tx_descs(ctx);
849  	info->num_rx_descs = iflib_num_rx_descs(ctx);
850  	info->rx_buf_maxsize = fl->ifl_buf_size;
851  	nm_prinf("txr %u rxr %u txd %u rxd %u rbufsz %u",
852  		info->num_tx_rings, info->num_rx_rings, info->num_tx_descs,
853  		info->num_rx_descs, info->rx_buf_maxsize);
854  
855  	return (0);
856  }
857  
858  static int
netmap_fl_refill(iflib_rxq_t rxq,struct netmap_kring * kring,bool init)859  netmap_fl_refill(iflib_rxq_t rxq, struct netmap_kring *kring, bool init)
860  {
861  	struct netmap_adapter *na = kring->na;
862  	u_int const lim = kring->nkr_num_slots - 1;
863  	struct netmap_ring *ring = kring->ring;
864  	bus_dmamap_t *map;
865  	struct if_rxd_update iru;
866  	if_ctx_t ctx = rxq->ifr_ctx;
867  	iflib_fl_t fl = &rxq->ifr_fl[0];
868  	u_int nic_i_first, nic_i;
869  	u_int nm_i;
870  	int i, n;
871  #if IFLIB_DEBUG_COUNTERS
872  	int rf_count = 0;
873  #endif
874  
875  	/*
876  	 * This function is used both at initialization and in rxsync.
877  	 * At initialization we need to prepare (with isc_rxd_refill())
878  	 * all the netmap buffers currently owned by the kernel, in
879  	 * such a way to keep fl->ifl_pidx and kring->nr_hwcur in sync
880  	 * (except for kring->nkr_hwofs). These may be less than
881  	 * kring->nkr_num_slots if netmap_reset() was called while
882  	 * an application using the kring that still owned some
883  	 * buffers.
884  	 * At rxsync time, both indexes point to the next buffer to be
885  	 * refilled.
886  	 * In any case we publish (with isc_rxd_flush()) up to
887  	 * (fl->ifl_pidx - 1) % N (included), to avoid the NIC tail/prod
888  	 * pointer to overrun the head/cons pointer, although this is
889  	 * not necessary for some NICs (e.g. vmx).
890  	 */
891  	if (__predict_false(init)) {
892  		n = kring->nkr_num_slots - nm_kr_rxspace(kring);
893  	} else {
894  		n = kring->rhead - kring->nr_hwcur;
895  		if (n == 0)
896  			return (0); /* Nothing to do. */
897  		if (n < 0)
898  			n += kring->nkr_num_slots;
899  	}
900  
901  	iru_init(&iru, rxq, 0 /* flid */);
902  	map = fl->ifl_sds.ifsd_map;
903  	nic_i = fl->ifl_pidx;
904  	nm_i = netmap_idx_n2k(kring, nic_i);
905  	if (__predict_false(init)) {
906  		/*
907  		 * On init/reset, nic_i must be 0, and we must
908  		 * start to refill from hwtail (see netmap_reset()).
909  		 */
910  		MPASS(nic_i == 0);
911  		MPASS(nm_i == kring->nr_hwtail);
912  	} else
913  		MPASS(nm_i == kring->nr_hwcur);
914  	DBG_COUNTER_INC(fl_refills);
915  	while (n > 0) {
916  #if IFLIB_DEBUG_COUNTERS
917  		if (++rf_count == 9)
918  			DBG_COUNTER_INC(fl_refills_large);
919  #endif
920  		nic_i_first = nic_i;
921  		for (i = 0; n > 0 && i < IFLIB_MAX_RX_REFRESH; n--, i++) {
922  			struct netmap_slot *slot = &ring->slot[nm_i];
923  			uint64_t paddr;
924  			void *addr = PNMB(na, slot, &paddr);
925  
926  			MPASS(i < IFLIB_MAX_RX_REFRESH);
927  
928  			if (addr == NETMAP_BUF_BASE(na)) /* bad buf */
929  			        return (netmap_ring_reinit(kring));
930  
931  			fl->ifl_bus_addrs[i] = paddr +
932  			    nm_get_offset(kring, slot);
933  			fl->ifl_rxd_idxs[i] = nic_i;
934  
935  			if (__predict_false(init)) {
936  				netmap_load_map(na, fl->ifl_buf_tag,
937  				    map[nic_i], addr);
938  			} else if (slot->flags & NS_BUF_CHANGED) {
939  				/* buffer has changed, reload map */
940  				netmap_reload_map(na, fl->ifl_buf_tag,
941  				    map[nic_i], addr);
942  			}
943  			bus_dmamap_sync(fl->ifl_buf_tag, map[nic_i],
944  			    BUS_DMASYNC_PREREAD);
945  			slot->flags &= ~NS_BUF_CHANGED;
946  
947  			nm_i = nm_next(nm_i, lim);
948  			nic_i = nm_next(nic_i, lim);
949  		}
950  
951  		iru.iru_pidx = nic_i_first;
952  		iru.iru_count = i;
953  		ctx->isc_rxd_refill(ctx->ifc_softc, &iru);
954  	}
955  	fl->ifl_pidx = nic_i;
956  	/*
957  	 * At the end of the loop we must have refilled everything
958  	 * we could possibly refill.
959  	 */
960  	MPASS(nm_i == kring->rhead);
961  	kring->nr_hwcur = nm_i;
962  
963  	bus_dmamap_sync(fl->ifl_ifdi->idi_tag, fl->ifl_ifdi->idi_map,
964  	    BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
965  	ctx->isc_rxd_flush(ctx->ifc_softc, rxq->ifr_id, fl->ifl_id,
966  	    nm_prev(nic_i, lim));
967  	DBG_COUNTER_INC(rxd_flush);
968  
969  	return (0);
970  }
971  
972  #define NETMAP_TX_TIMER_US	90
973  
974  /*
975   * Reconcile kernel and user view of the transmit ring.
976   *
977   * All information is in the kring.
978   * Userspace wants to send packets up to the one before kring->rhead,
979   * kernel knows kring->nr_hwcur is the first unsent packet.
980   *
981   * Here we push packets out (as many as possible), and possibly
982   * reclaim buffers from previously completed transmission.
983   *
984   * The caller (netmap) guarantees that there is only one instance
985   * running at any time. Any interference with other driver
986   * methods should be handled by the individual drivers.
987   */
988  static int
iflib_netmap_txsync(struct netmap_kring * kring,int flags)989  iflib_netmap_txsync(struct netmap_kring *kring, int flags)
990  {
991  	struct netmap_adapter *na = kring->na;
992  	if_t ifp = na->ifp;
993  	struct netmap_ring *ring = kring->ring;
994  	u_int nm_i;	/* index into the netmap kring */
995  	u_int nic_i;	/* index into the NIC ring */
996  	u_int const lim = kring->nkr_num_slots - 1;
997  	u_int const head = kring->rhead;
998  	struct if_pkt_info pi;
999  	int tx_pkts = 0, tx_bytes = 0;
1000  
1001  	/*
1002  	 * interrupts on every tx packet are expensive so request
1003  	 * them every half ring, or where NS_REPORT is set
1004  	 */
1005  	u_int report_frequency = kring->nkr_num_slots >> 1;
1006  	/* device-specific */
1007  	if_ctx_t ctx = if_getsoftc(ifp);
1008  	iflib_txq_t txq = &ctx->ifc_txqs[kring->ring_id];
1009  
1010  	bus_dmamap_sync(txq->ift_ifdi->idi_tag, txq->ift_ifdi->idi_map,
1011  	    BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE);
1012  
1013  	/*
1014  	 * First part: process new packets to send.
1015  	 * nm_i is the current index in the netmap kring,
1016  	 * nic_i is the corresponding index in the NIC ring.
1017  	 *
1018  	 * If we have packets to send (nm_i != head)
1019  	 * iterate over the netmap ring, fetch length and update
1020  	 * the corresponding slot in the NIC ring. Some drivers also
1021  	 * need to update the buffer's physical address in the NIC slot
1022  	 * even NS_BUF_CHANGED is not set (PNMB computes the addresses).
1023  	 *
1024  	 * The netmap_reload_map() calls is especially expensive,
1025  	 * even when (as in this case) the tag is 0, so do only
1026  	 * when the buffer has actually changed.
1027  	 *
1028  	 * If possible do not set the report/intr bit on all slots,
1029  	 * but only a few times per ring or when NS_REPORT is set.
1030  	 *
1031  	 * Finally, on 10G and faster drivers, it might be useful
1032  	 * to prefetch the next slot and txr entry.
1033  	 */
1034  
1035  	nm_i = kring->nr_hwcur;
1036  	if (nm_i != head) {	/* we have new packets to send */
1037  		uint32_t pkt_len = 0, seg_idx = 0;
1038  		int nic_i_start = -1, flags = 0;
1039  		pkt_info_zero(&pi);
1040  		pi.ipi_segs = txq->ift_segs;
1041  		pi.ipi_qsidx = kring->ring_id;
1042  		nic_i = netmap_idx_k2n(kring, nm_i);
1043  
1044  		__builtin_prefetch(&ring->slot[nm_i]);
1045  		__builtin_prefetch(&txq->ift_sds.ifsd_m[nic_i]);
1046  		__builtin_prefetch(&txq->ift_sds.ifsd_map[nic_i]);
1047  
1048  		while (nm_i != head) {
1049  			struct netmap_slot *slot = &ring->slot[nm_i];
1050  			uint64_t offset = nm_get_offset(kring, slot);
1051  			u_int len = slot->len;
1052  			uint64_t paddr;
1053  			void *addr = PNMB(na, slot, &paddr);
1054  
1055  			flags |= (slot->flags & NS_REPORT ||
1056  				nic_i == 0 || nic_i == report_frequency) ?
1057  				IPI_TX_INTR : 0;
1058  
1059  			/*
1060  			 * If this is the first packet fragment, save the
1061  			 * index of the first NIC slot for later.
1062  			 */
1063  			if (nic_i_start < 0)
1064  				nic_i_start = nic_i;
1065  
1066  			pi.ipi_segs[seg_idx].ds_addr = paddr + offset;
1067  			pi.ipi_segs[seg_idx].ds_len = len;
1068  			if (len) {
1069  				pkt_len += len;
1070  				seg_idx++;
1071  			}
1072  
1073  			if (!(slot->flags & NS_MOREFRAG)) {
1074  				pi.ipi_len = pkt_len;
1075  				pi.ipi_nsegs = seg_idx;
1076  				pi.ipi_pidx = nic_i_start;
1077  				pi.ipi_ndescs = 0;
1078  				pi.ipi_flags = flags;
1079  
1080  				/* Prepare the NIC TX ring. */
1081  				ctx->isc_txd_encap(ctx->ifc_softc, &pi);
1082  				DBG_COUNTER_INC(tx_encap);
1083  
1084  				/* Update transmit counters */
1085  				tx_bytes += pi.ipi_len;
1086  				tx_pkts++;
1087  
1088  				/* Reinit per-packet info for the next one. */
1089  				flags = seg_idx = pkt_len = 0;
1090  				nic_i_start = -1;
1091  			}
1092  
1093  			/* prefetch for next round */
1094  			__builtin_prefetch(&ring->slot[nm_i + 1]);
1095  			__builtin_prefetch(&txq->ift_sds.ifsd_m[nic_i + 1]);
1096  			__builtin_prefetch(&txq->ift_sds.ifsd_map[nic_i + 1]);
1097  
1098  			NM_CHECK_ADDR_LEN_OFF(na, len, offset);
1099  
1100  			if (slot->flags & NS_BUF_CHANGED) {
1101  				/* buffer has changed, reload map */
1102  				netmap_reload_map(na, txq->ift_buf_tag,
1103  				    txq->ift_sds.ifsd_map[nic_i], addr);
1104  			}
1105  			/* make sure changes to the buffer are synced */
1106  			bus_dmamap_sync(txq->ift_buf_tag,
1107  			    txq->ift_sds.ifsd_map[nic_i],
1108  			    BUS_DMASYNC_PREWRITE);
1109  
1110  			slot->flags &= ~(NS_REPORT | NS_BUF_CHANGED | NS_MOREFRAG);
1111  			nm_i = nm_next(nm_i, lim);
1112  			nic_i = nm_next(nic_i, lim);
1113  		}
1114  		kring->nr_hwcur = nm_i;
1115  
1116  		/* synchronize the NIC ring */
1117  		bus_dmamap_sync(txq->ift_ifdi->idi_tag, txq->ift_ifdi->idi_map,
1118  		    BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
1119  
1120  		/* (re)start the tx unit up to slot nic_i (excluded) */
1121  		ctx->isc_txd_flush(ctx->ifc_softc, txq->ift_id, nic_i);
1122  	}
1123  
1124  	/*
1125  	 * Second part: reclaim buffers for completed transmissions.
1126  	 *
1127  	 * If there are unclaimed buffers, attempt to reclaim them.
1128  	 * If we don't manage to reclaim them all, and TX IRQs are not in use,
1129  	 * trigger a per-tx-queue timer to try again later.
1130  	 */
1131  	if (kring->nr_hwtail != nm_prev(kring->nr_hwcur, lim)) {
1132  		if (iflib_tx_credits_update(ctx, txq)) {
1133  			/* some tx completed, increment avail */
1134  			nic_i = txq->ift_cidx_processed;
1135  			kring->nr_hwtail = nm_prev(netmap_idx_n2k(kring, nic_i), lim);
1136  		}
1137  	}
1138  
1139  	if (!(ctx->ifc_flags & IFC_NETMAP_TX_IRQ))
1140  		if (kring->nr_hwtail != nm_prev(kring->nr_hwcur, lim)) {
1141  			callout_reset_sbt_on(&txq->ift_netmap_timer,
1142  			    NETMAP_TX_TIMER_US * SBT_1US, SBT_1US,
1143  			    iflib_netmap_timer, txq,
1144  			    txq->ift_netmap_timer.c_cpu, 0);
1145  		}
1146  
1147  	if_inc_counter(ifp, IFCOUNTER_OBYTES, tx_bytes);
1148  	if_inc_counter(ifp, IFCOUNTER_OPACKETS, tx_pkts);
1149  
1150  	return (0);
1151  }
1152  
1153  /*
1154   * Reconcile kernel and user view of the receive ring.
1155   * Same as for the txsync, this routine must be efficient.
1156   * The caller guarantees a single invocations, but races against
1157   * the rest of the driver should be handled here.
1158   *
1159   * On call, kring->rhead is the first packet that userspace wants
1160   * to keep, and kring->rcur is the wakeup point.
1161   * The kernel has previously reported packets up to kring->rtail.
1162   *
1163   * If (flags & NAF_FORCE_READ) also check for incoming packets irrespective
1164   * of whether or not we received an interrupt.
1165   */
1166  static int
iflib_netmap_rxsync(struct netmap_kring * kring,int flags)1167  iflib_netmap_rxsync(struct netmap_kring *kring, int flags)
1168  {
1169  	struct netmap_adapter *na = kring->na;
1170  	struct netmap_ring *ring = kring->ring;
1171  	if_t ifp = na->ifp;
1172  	uint32_t nm_i;	/* index into the netmap ring */
1173  	uint32_t nic_i;	/* index into the NIC ring */
1174  	u_int n;
1175  	u_int const lim = kring->nkr_num_slots - 1;
1176  	int force_update = (flags & NAF_FORCE_READ) || kring->nr_kflags & NKR_PENDINTR;
1177  	int i = 0, rx_bytes = 0, rx_pkts = 0;
1178  
1179  	if_ctx_t ctx = if_getsoftc(ifp);
1180  	if_shared_ctx_t sctx = ctx->ifc_sctx;
1181  	if_softc_ctx_t scctx = &ctx->ifc_softc_ctx;
1182  	iflib_rxq_t rxq = &ctx->ifc_rxqs[kring->ring_id];
1183  	iflib_fl_t fl = &rxq->ifr_fl[0];
1184  	struct if_rxd_info ri;
1185  	qidx_t *cidxp;
1186  
1187  	/*
1188  	 * netmap only uses free list 0, to avoid out of order consumption
1189  	 * of receive buffers
1190  	 */
1191  
1192  	bus_dmamap_sync(fl->ifl_ifdi->idi_tag, fl->ifl_ifdi->idi_map,
1193  	    BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE);
1194  
1195  	/*
1196  	 * First part: import newly received packets.
1197  	 *
1198  	 * nm_i is the index of the next free slot in the netmap ring,
1199  	 * nic_i is the index of the next received packet in the NIC ring
1200  	 * (or in the free list 0 if IFLIB_HAS_RXCQ is set), and they may
1201  	 * differ in case if_init() has been called while
1202  	 * in netmap mode. For the receive ring we have
1203  	 *
1204  	 *	nic_i = fl->ifl_cidx;
1205  	 *	nm_i = kring->nr_hwtail (previous)
1206  	 * and
1207  	 *	nm_i == (nic_i + kring->nkr_hwofs) % ring_size
1208  	 *
1209  	 * fl->ifl_cidx is set to 0 on a ring reinit
1210  	 */
1211  	if (netmap_no_pendintr || force_update) {
1212  		uint32_t hwtail_lim = nm_prev(kring->nr_hwcur, lim);
1213  		bool have_rxcq = sctx->isc_flags & IFLIB_HAS_RXCQ;
1214  		int crclen = iflib_crcstrip ? 0 : 4;
1215  		int error, avail;
1216  
1217  		/*
1218  		 * For the free list consumer index, we use the same
1219  		 * logic as in iflib_rxeof().
1220  		 */
1221  		if (have_rxcq)
1222  			cidxp = &rxq->ifr_cq_cidx;
1223  		else
1224  			cidxp = &fl->ifl_cidx;
1225  		avail = ctx->isc_rxd_available(ctx->ifc_softc,
1226  		    rxq->ifr_id, *cidxp, USHRT_MAX);
1227  
1228  		nic_i = fl->ifl_cidx;
1229  		nm_i = netmap_idx_n2k(kring, nic_i);
1230  		MPASS(nm_i == kring->nr_hwtail);
1231  		for (n = 0; avail > 0 && nm_i != hwtail_lim; n++, avail--) {
1232  			rxd_info_zero(&ri);
1233  			ri.iri_frags = rxq->ifr_frags;
1234  			ri.iri_qsidx = kring->ring_id;
1235  			ri.iri_ifp = ctx->ifc_ifp;
1236  			ri.iri_cidx = *cidxp;
1237  
1238  			error = ctx->isc_rxd_pkt_get(ctx->ifc_softc, &ri);
1239  			for (i = 0; i < ri.iri_nfrags; i++) {
1240  				if (error) {
1241  					ring->slot[nm_i].len = 0;
1242  					ring->slot[nm_i].flags = 0;
1243  				} else {
1244  					ring->slot[nm_i].len = ri.iri_frags[i].irf_len;
1245  					if (i == (ri.iri_nfrags - 1)) {
1246  						ring->slot[nm_i].len -= crclen;
1247  						ring->slot[nm_i].flags = 0;
1248  
1249  						/* Update receive counters */
1250  						rx_bytes += ri.iri_len;
1251  						rx_pkts++;
1252  					} else
1253  						ring->slot[nm_i].flags = NS_MOREFRAG;
1254  				}
1255  
1256  				bus_dmamap_sync(fl->ifl_buf_tag,
1257  				    fl->ifl_sds.ifsd_map[nic_i], BUS_DMASYNC_POSTREAD);
1258  				nm_i = nm_next(nm_i, lim);
1259  				fl->ifl_cidx = nic_i = nm_next(nic_i, lim);
1260  			}
1261  
1262  			if (have_rxcq) {
1263  				*cidxp = ri.iri_cidx;
1264  				while (*cidxp >= scctx->isc_nrxd[0])
1265  					*cidxp -= scctx->isc_nrxd[0];
1266  			}
1267  
1268  		}
1269  		if (n) { /* update the state variables */
1270  			if (netmap_no_pendintr && !force_update) {
1271  				/* diagnostics */
1272  				iflib_rx_miss++;
1273  				iflib_rx_miss_bufs += n;
1274  			}
1275  			kring->nr_hwtail = nm_i;
1276  		}
1277  		kring->nr_kflags &= ~NKR_PENDINTR;
1278  	}
1279  	/*
1280  	 * Second part: skip past packets that userspace has released.
1281  	 * (kring->nr_hwcur to head excluded),
1282  	 * and make the buffers available for reception.
1283  	 * As usual nm_i is the index in the netmap ring,
1284  	 * nic_i is the index in the NIC ring, and
1285  	 * nm_i == (nic_i + kring->nkr_hwofs) % ring_size
1286  	 */
1287  	netmap_fl_refill(rxq, kring, false);
1288  
1289  	if_inc_counter(ifp, IFCOUNTER_IBYTES, rx_bytes);
1290  	if_inc_counter(ifp, IFCOUNTER_IPACKETS, rx_pkts);
1291  
1292  	return (0);
1293  }
1294  
1295  static void
iflib_netmap_intr(struct netmap_adapter * na,int onoff)1296  iflib_netmap_intr(struct netmap_adapter *na, int onoff)
1297  {
1298  	if_ctx_t ctx = if_getsoftc(na->ifp);
1299  
1300  	CTX_LOCK(ctx);
1301  	if (onoff) {
1302  		IFDI_INTR_ENABLE(ctx);
1303  	} else {
1304  		IFDI_INTR_DISABLE(ctx);
1305  	}
1306  	CTX_UNLOCK(ctx);
1307  }
1308  
1309  static int
iflib_netmap_attach(if_ctx_t ctx)1310  iflib_netmap_attach(if_ctx_t ctx)
1311  {
1312  	struct netmap_adapter na;
1313  
1314  	bzero(&na, sizeof(na));
1315  
1316  	na.ifp = ctx->ifc_ifp;
1317  	na.na_flags = NAF_BDG_MAYSLEEP | NAF_MOREFRAG | NAF_OFFSETS;
1318  	MPASS(ctx->ifc_softc_ctx.isc_ntxqsets);
1319  	MPASS(ctx->ifc_softc_ctx.isc_nrxqsets);
1320  
1321  	na.num_tx_desc = iflib_num_tx_descs(ctx);
1322  	na.num_rx_desc = iflib_num_rx_descs(ctx);
1323  	na.nm_txsync = iflib_netmap_txsync;
1324  	na.nm_rxsync = iflib_netmap_rxsync;
1325  	na.nm_register = iflib_netmap_register;
1326  	na.nm_intr = iflib_netmap_intr;
1327  	na.nm_config = iflib_netmap_config;
1328  	na.num_tx_rings = ctx->ifc_softc_ctx.isc_ntxqsets;
1329  	na.num_rx_rings = ctx->ifc_softc_ctx.isc_nrxqsets;
1330  	return (netmap_attach(&na));
1331  }
1332  
1333  static int
iflib_netmap_txq_init(if_ctx_t ctx,iflib_txq_t txq)1334  iflib_netmap_txq_init(if_ctx_t ctx, iflib_txq_t txq)
1335  {
1336  	struct netmap_adapter *na = NA(ctx->ifc_ifp);
1337  	struct netmap_slot *slot;
1338  
1339  	slot = netmap_reset(na, NR_TX, txq->ift_id, 0);
1340  	if (slot == NULL)
1341  		return (0);
1342  	for (int i = 0; i < ctx->ifc_softc_ctx.isc_ntxd[0]; i++) {
1343  		/*
1344  		 * In netmap mode, set the map for the packet buffer.
1345  		 * NOTE: Some drivers (not this one) also need to set
1346  		 * the physical buffer address in the NIC ring.
1347  		 * netmap_idx_n2k() maps a nic index, i, into the corresponding
1348  		 * netmap slot index, si
1349  		 */
1350  		int si = netmap_idx_n2k(na->tx_rings[txq->ift_id], i);
1351  		netmap_load_map(na, txq->ift_buf_tag, txq->ift_sds.ifsd_map[i],
1352  		    NMB(na, slot + si));
1353  	}
1354  	return (1);
1355  }
1356  
1357  static int
iflib_netmap_rxq_init(if_ctx_t ctx,iflib_rxq_t rxq)1358  iflib_netmap_rxq_init(if_ctx_t ctx, iflib_rxq_t rxq)
1359  {
1360  	struct netmap_adapter *na = NA(ctx->ifc_ifp);
1361  	struct netmap_kring *kring;
1362  	struct netmap_slot *slot;
1363  
1364  	slot = netmap_reset(na, NR_RX, rxq->ifr_id, 0);
1365  	if (slot == NULL)
1366  		return (0);
1367  	kring = na->rx_rings[rxq->ifr_id];
1368  	netmap_fl_refill(rxq, kring, true);
1369  	return (1);
1370  }
1371  
1372  static void
iflib_netmap_timer(void * arg)1373  iflib_netmap_timer(void *arg)
1374  {
1375  	iflib_txq_t txq = arg;
1376  	if_ctx_t ctx = txq->ift_ctx;
1377  
1378  	/*
1379  	 * Wake up the netmap application, to give it a chance to
1380  	 * call txsync and reclaim more completed TX buffers.
1381  	 */
1382  	netmap_tx_irq(ctx->ifc_ifp, txq->ift_id);
1383  }
1384  
1385  #define iflib_netmap_detach(ifp) netmap_detach(ifp)
1386  
1387  #else
1388  #define iflib_netmap_txq_init(ctx, txq) (0)
1389  #define iflib_netmap_rxq_init(ctx, rxq) (0)
1390  #define iflib_netmap_detach(ifp)
1391  #define netmap_enable_all_rings(ifp)
1392  #define netmap_disable_all_rings(ifp)
1393  
1394  #define iflib_netmap_attach(ctx) (0)
1395  #define netmap_rx_irq(ifp, qid, budget) (0)
1396  #endif
1397  
1398  #if defined(__i386__) || defined(__amd64__)
1399  static __inline void
prefetch(void * x)1400  prefetch(void *x)
1401  {
1402  	__asm volatile("prefetcht0 %0" :: "m" (*(unsigned long *)x));
1403  }
1404  
1405  static __inline void
prefetch2cachelines(void * x)1406  prefetch2cachelines(void *x)
1407  {
1408  	__asm volatile("prefetcht0 %0" :: "m" (*(unsigned long *)x));
1409  #if (CACHE_LINE_SIZE < 128)
1410  	__asm volatile("prefetcht0 %0" :: "m" (*(((unsigned long *)x) + CACHE_LINE_SIZE / (sizeof(unsigned long)))));
1411  #endif
1412  }
1413  #else
1414  static __inline void
prefetch(void * x)1415  prefetch(void *x)
1416  {
1417  }
1418  
1419  static __inline void
prefetch2cachelines(void * x)1420  prefetch2cachelines(void *x)
1421  {
1422  }
1423  #endif
1424  
1425  static void
iru_init(if_rxd_update_t iru,iflib_rxq_t rxq,uint8_t flid)1426  iru_init(if_rxd_update_t iru, iflib_rxq_t rxq, uint8_t flid)
1427  {
1428  	iflib_fl_t fl;
1429  
1430  	fl = &rxq->ifr_fl[flid];
1431  	iru->iru_paddrs = fl->ifl_bus_addrs;
1432  	iru->iru_idxs = fl->ifl_rxd_idxs;
1433  	iru->iru_qsidx = rxq->ifr_id;
1434  	iru->iru_buf_size = fl->ifl_buf_size;
1435  	iru->iru_flidx = fl->ifl_id;
1436  }
1437  
1438  static void
_iflib_dmamap_cb(void * arg,bus_dma_segment_t * segs,int nseg,int err)1439  _iflib_dmamap_cb(void *arg, bus_dma_segment_t *segs, int nseg, int err)
1440  {
1441  	if (err)
1442  		return;
1443  	*(bus_addr_t *) arg = segs[0].ds_addr;
1444  }
1445  
1446  #define	DMA_WIDTH_TO_BUS_LOWADDR(width)				\
1447  	(((width) == 0) || (width) == flsll(BUS_SPACE_MAXADDR) ?	\
1448  	    BUS_SPACE_MAXADDR : (1ULL << (width)) - 1ULL)
1449  
1450  int
iflib_dma_alloc_align(if_ctx_t ctx,int size,int align,iflib_dma_info_t dma,int mapflags)1451  iflib_dma_alloc_align(if_ctx_t ctx, int size, int align, iflib_dma_info_t dma, int mapflags)
1452  {
1453  	int err;
1454  	device_t dev = ctx->ifc_dev;
1455  	bus_addr_t lowaddr;
1456  
1457  	lowaddr = DMA_WIDTH_TO_BUS_LOWADDR(ctx->ifc_softc_ctx.isc_dma_width);
1458  
1459  	err = bus_dma_tag_create(bus_get_dma_tag(dev),	/* parent */
1460  				align, 0,		/* alignment, bounds */
1461  				lowaddr,		/* lowaddr */
1462  				BUS_SPACE_MAXADDR,	/* highaddr */
1463  				NULL, NULL,		/* filter, filterarg */
1464  				size,			/* maxsize */
1465  				1,			/* nsegments */
1466  				size,			/* maxsegsize */
1467  				BUS_DMA_ALLOCNOW,	/* flags */
1468  				NULL,			/* lockfunc */
1469  				NULL,			/* lockarg */
1470  				&dma->idi_tag);
1471  	if (err) {
1472  		device_printf(dev,
1473  		    "%s: bus_dma_tag_create failed: %d (size=%d, align=%d)\n",
1474  		    __func__, err, size, align);
1475  		goto fail_0;
1476  	}
1477  
1478  	err = bus_dmamem_alloc(dma->idi_tag, (void **)&dma->idi_vaddr,
1479  	    BUS_DMA_NOWAIT | BUS_DMA_COHERENT | BUS_DMA_ZERO, &dma->idi_map);
1480  	if (err) {
1481  		device_printf(dev,
1482  		    "%s: bus_dmamem_alloc(%ju) failed: %d\n",
1483  		    __func__, (uintmax_t)size, err);
1484  		goto fail_1;
1485  	}
1486  
1487  	dma->idi_paddr = IF_BAD_DMA;
1488  	err = bus_dmamap_load(dma->idi_tag, dma->idi_map, dma->idi_vaddr,
1489  	    size, _iflib_dmamap_cb, &dma->idi_paddr, mapflags | BUS_DMA_NOWAIT);
1490  	if (err || dma->idi_paddr == IF_BAD_DMA) {
1491  		device_printf(dev,
1492  		    "%s: bus_dmamap_load failed: %d\n",
1493  		    __func__, err);
1494  		goto fail_2;
1495  	}
1496  
1497  	dma->idi_size = size;
1498  	return (0);
1499  
1500  fail_2:
1501  	bus_dmamem_free(dma->idi_tag, dma->idi_vaddr, dma->idi_map);
1502  fail_1:
1503  	bus_dma_tag_destroy(dma->idi_tag);
1504  fail_0:
1505  	dma->idi_tag = NULL;
1506  
1507  	return (err);
1508  }
1509  
1510  int
iflib_dma_alloc(if_ctx_t ctx,int size,iflib_dma_info_t dma,int mapflags)1511  iflib_dma_alloc(if_ctx_t ctx, int size, iflib_dma_info_t dma, int mapflags)
1512  {
1513  	if_shared_ctx_t sctx = ctx->ifc_sctx;
1514  
1515  	KASSERT(sctx->isc_q_align != 0, ("alignment value not initialized"));
1516  
1517  	return (iflib_dma_alloc_align(ctx, size, sctx->isc_q_align, dma, mapflags));
1518  }
1519  
1520  int
iflib_dma_alloc_multi(if_ctx_t ctx,int * sizes,iflib_dma_info_t * dmalist,int mapflags,int count)1521  iflib_dma_alloc_multi(if_ctx_t ctx, int *sizes, iflib_dma_info_t *dmalist, int mapflags, int count)
1522  {
1523  	int i, err;
1524  	iflib_dma_info_t *dmaiter;
1525  
1526  	dmaiter = dmalist;
1527  	for (i = 0; i < count; i++, dmaiter++) {
1528  		if ((err = iflib_dma_alloc(ctx, sizes[i], *dmaiter, mapflags)) != 0)
1529  			break;
1530  	}
1531  	if (err)
1532  		iflib_dma_free_multi(dmalist, i);
1533  	return (err);
1534  }
1535  
1536  void
iflib_dma_free(iflib_dma_info_t dma)1537  iflib_dma_free(iflib_dma_info_t dma)
1538  {
1539  	if (dma->idi_tag == NULL)
1540  		return;
1541  	if (dma->idi_paddr != IF_BAD_DMA) {
1542  		bus_dmamap_sync(dma->idi_tag, dma->idi_map,
1543  		    BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE);
1544  		bus_dmamap_unload(dma->idi_tag, dma->idi_map);
1545  		dma->idi_paddr = IF_BAD_DMA;
1546  	}
1547  	if (dma->idi_vaddr != NULL) {
1548  		bus_dmamem_free(dma->idi_tag, dma->idi_vaddr, dma->idi_map);
1549  		dma->idi_vaddr = NULL;
1550  	}
1551  	bus_dma_tag_destroy(dma->idi_tag);
1552  	dma->idi_tag = NULL;
1553  }
1554  
1555  void
iflib_dma_free_multi(iflib_dma_info_t * dmalist,int count)1556  iflib_dma_free_multi(iflib_dma_info_t *dmalist, int count)
1557  {
1558  	int i;
1559  	iflib_dma_info_t *dmaiter = dmalist;
1560  
1561  	for (i = 0; i < count; i++, dmaiter++)
1562  		iflib_dma_free(*dmaiter);
1563  }
1564  
1565  static int
iflib_fast_intr(void * arg)1566  iflib_fast_intr(void *arg)
1567  {
1568  	iflib_filter_info_t info = arg;
1569  	struct grouptask *gtask = info->ifi_task;
1570  	int result;
1571  
1572  	DBG_COUNTER_INC(fast_intrs);
1573  	if (info->ifi_filter != NULL) {
1574  		result = info->ifi_filter(info->ifi_filter_arg);
1575  		if ((result & FILTER_SCHEDULE_THREAD) == 0)
1576  			return (result);
1577  	}
1578  
1579  	GROUPTASK_ENQUEUE(gtask);
1580  	return (FILTER_HANDLED);
1581  }
1582  
1583  static int
iflib_fast_intr_rxtx(void * arg)1584  iflib_fast_intr_rxtx(void *arg)
1585  {
1586  	iflib_filter_info_t info = arg;
1587  	struct grouptask *gtask = info->ifi_task;
1588  	if_ctx_t ctx;
1589  	iflib_rxq_t rxq = (iflib_rxq_t)info->ifi_ctx;
1590  	iflib_txq_t txq;
1591  	void *sc;
1592  	int i, cidx, result;
1593  	qidx_t txqid;
1594  	bool intr_enable, intr_legacy;
1595  
1596  	DBG_COUNTER_INC(fast_intrs);
1597  	if (info->ifi_filter != NULL) {
1598  		result = info->ifi_filter(info->ifi_filter_arg);
1599  		if ((result & FILTER_SCHEDULE_THREAD) == 0)
1600  			return (result);
1601  	}
1602  
1603  	ctx = rxq->ifr_ctx;
1604  	sc = ctx->ifc_softc;
1605  	intr_enable = false;
1606  	intr_legacy = !!(ctx->ifc_flags & IFC_LEGACY);
1607  	MPASS(rxq->ifr_ntxqirq);
1608  	for (i = 0; i < rxq->ifr_ntxqirq; i++) {
1609  		txqid = rxq->ifr_txqid[i];
1610  		txq = &ctx->ifc_txqs[txqid];
1611  		bus_dmamap_sync(txq->ift_ifdi->idi_tag, txq->ift_ifdi->idi_map,
1612  		    BUS_DMASYNC_POSTREAD);
1613  		if (!ctx->isc_txd_credits_update(sc, txqid, false)) {
1614  			if (intr_legacy)
1615  				intr_enable = true;
1616  			else
1617  				IFDI_TX_QUEUE_INTR_ENABLE(ctx, txqid);
1618  			continue;
1619  		}
1620  		GROUPTASK_ENQUEUE(&txq->ift_task);
1621  	}
1622  	if (ctx->ifc_sctx->isc_flags & IFLIB_HAS_RXCQ)
1623  		cidx = rxq->ifr_cq_cidx;
1624  	else
1625  		cidx = rxq->ifr_fl[0].ifl_cidx;
1626  	if (iflib_rxd_avail(ctx, rxq, cidx, 1))
1627  		GROUPTASK_ENQUEUE(gtask);
1628  	else {
1629  		if (intr_legacy)
1630  			intr_enable = true;
1631  		else
1632  			IFDI_RX_QUEUE_INTR_ENABLE(ctx, rxq->ifr_id);
1633  		DBG_COUNTER_INC(rx_intr_enables);
1634  	}
1635  	if (intr_enable)
1636  		IFDI_INTR_ENABLE(ctx);
1637  	return (FILTER_HANDLED);
1638  }
1639  
1640  static int
iflib_fast_intr_ctx(void * arg)1641  iflib_fast_intr_ctx(void *arg)
1642  {
1643  	iflib_filter_info_t info = arg;
1644  	if_ctx_t ctx = info->ifi_ctx;
1645  	int result;
1646  
1647  	DBG_COUNTER_INC(fast_intrs);
1648  	if (info->ifi_filter != NULL) {
1649  		result = info->ifi_filter(info->ifi_filter_arg);
1650  		if ((result & FILTER_SCHEDULE_THREAD) == 0)
1651  			return (result);
1652  	}
1653  
1654  	taskqueue_enqueue(ctx->ifc_tq, &ctx->ifc_admin_task);
1655  	return (FILTER_HANDLED);
1656  }
1657  
1658  static int
_iflib_irq_alloc(if_ctx_t ctx,if_irq_t irq,int rid,driver_filter_t filter,driver_intr_t handler,void * arg,const char * name)1659  _iflib_irq_alloc(if_ctx_t ctx, if_irq_t irq, int rid,
1660  		 driver_filter_t filter, driver_intr_t handler, void *arg,
1661  		 const char *name)
1662  {
1663  	struct resource *res;
1664  	void *tag = NULL;
1665  	device_t dev = ctx->ifc_dev;
1666  	int flags, i, rc;
1667  
1668  	flags = RF_ACTIVE;
1669  	if (ctx->ifc_flags & IFC_LEGACY)
1670  		flags |= RF_SHAREABLE;
1671  	MPASS(rid < 512);
1672  	i = rid;
1673  	res = bus_alloc_resource_any(dev, SYS_RES_IRQ, &i, flags);
1674  	if (res == NULL) {
1675  		device_printf(dev,
1676  		    "failed to allocate IRQ for rid %d, name %s.\n", rid, name);
1677  		return (ENOMEM);
1678  	}
1679  	irq->ii_res = res;
1680  	KASSERT(filter == NULL || handler == NULL, ("filter and handler can't both be non-NULL"));
1681  	rc = bus_setup_intr(dev, res, INTR_MPSAFE | INTR_TYPE_NET,
1682  						filter, handler, arg, &tag);
1683  	if (rc != 0) {
1684  		device_printf(dev,
1685  		    "failed to setup interrupt for rid %d, name %s: %d\n",
1686  					  rid, name ? name : "unknown", rc);
1687  		return (rc);
1688  	} else if (name)
1689  		bus_describe_intr(dev, res, tag, "%s", name);
1690  
1691  	irq->ii_tag = tag;
1692  	return (0);
1693  }
1694  
1695  /*********************************************************************
1696   *
1697   *  Allocate DMA resources for TX buffers as well as memory for the TX
1698   *  mbuf map.  TX DMA maps (non-TSO/TSO) and TX mbuf map are kept in a
1699   *  iflib_sw_tx_desc_array structure, storing all the information that
1700   *  is needed to transmit a packet on the wire.  This is called only
1701   *  once at attach, setup is done every reset.
1702   *
1703   **********************************************************************/
1704  static int
iflib_txsd_alloc(iflib_txq_t txq)1705  iflib_txsd_alloc(iflib_txq_t txq)
1706  {
1707  	if_ctx_t ctx = txq->ift_ctx;
1708  	if_shared_ctx_t sctx = ctx->ifc_sctx;
1709  	if_softc_ctx_t scctx = &ctx->ifc_softc_ctx;
1710  	device_t dev = ctx->ifc_dev;
1711  	bus_size_t tsomaxsize;
1712  	bus_addr_t lowaddr;
1713  	int err, nsegments, ntsosegments;
1714  	bool tso;
1715  
1716  	nsegments = scctx->isc_tx_nsegments;
1717  	ntsosegments = scctx->isc_tx_tso_segments_max;
1718  	tsomaxsize = scctx->isc_tx_tso_size_max;
1719  	if (if_getcapabilities(ctx->ifc_ifp) & IFCAP_VLAN_MTU)
1720  		tsomaxsize += sizeof(struct ether_vlan_header);
1721  	MPASS(scctx->isc_ntxd[0] > 0);
1722  	MPASS(scctx->isc_ntxd[txq->ift_br_offset] > 0);
1723  	MPASS(nsegments > 0);
1724  	if (if_getcapabilities(ctx->ifc_ifp) & IFCAP_TSO) {
1725  		MPASS(ntsosegments > 0);
1726  		MPASS(sctx->isc_tso_maxsize >= tsomaxsize);
1727  	}
1728  
1729  	lowaddr = DMA_WIDTH_TO_BUS_LOWADDR(scctx->isc_dma_width);
1730  
1731  	/*
1732  	 * Set up DMA tags for TX buffers.
1733  	 */
1734  	if ((err = bus_dma_tag_create(bus_get_dma_tag(dev),
1735  			       1, 0,			/* alignment, bounds */
1736  			       lowaddr,			/* lowaddr */
1737  			       BUS_SPACE_MAXADDR,	/* highaddr */
1738  			       NULL, NULL,		/* filter, filterarg */
1739  			       sctx->isc_tx_maxsize,		/* maxsize */
1740  			       nsegments,	/* nsegments */
1741  			       sctx->isc_tx_maxsegsize,	/* maxsegsize */
1742  			       0,			/* flags */
1743  			       NULL,			/* lockfunc */
1744  			       NULL,			/* lockfuncarg */
1745  			       &txq->ift_buf_tag))) {
1746  		device_printf(dev, "Unable to allocate TX DMA tag: %d\n", err);
1747  		device_printf(dev, "maxsize: %ju nsegments: %d maxsegsize: %ju\n",
1748  		    (uintmax_t)sctx->isc_tx_maxsize, nsegments, (uintmax_t)sctx->isc_tx_maxsegsize);
1749  		goto fail;
1750  	}
1751  	tso = (if_getcapabilities(ctx->ifc_ifp) & IFCAP_TSO) != 0;
1752  	if (tso && (err = bus_dma_tag_create(bus_get_dma_tag(dev),
1753  			       1, 0,			/* alignment, bounds */
1754  			       lowaddr,			/* lowaddr */
1755  			       BUS_SPACE_MAXADDR,	/* highaddr */
1756  			       NULL, NULL,		/* filter, filterarg */
1757  			       tsomaxsize,		/* maxsize */
1758  			       ntsosegments,	/* nsegments */
1759  			       sctx->isc_tso_maxsegsize,/* maxsegsize */
1760  			       0,			/* flags */
1761  			       NULL,			/* lockfunc */
1762  			       NULL,			/* lockfuncarg */
1763  			       &txq->ift_tso_buf_tag))) {
1764  		device_printf(dev, "Unable to allocate TSO TX DMA tag: %d\n",
1765  		    err);
1766  		goto fail;
1767  	}
1768  
1769  	/* Allocate memory for the TX mbuf map. */
1770  	if (!(txq->ift_sds.ifsd_m =
1771  	    (struct mbuf **) malloc(sizeof(struct mbuf *) *
1772  	    scctx->isc_ntxd[txq->ift_br_offset], M_IFLIB, M_NOWAIT | M_ZERO))) {
1773  		device_printf(dev, "Unable to allocate TX mbuf map memory\n");
1774  		err = ENOMEM;
1775  		goto fail;
1776  	}
1777  
1778  	/*
1779  	 * Create the DMA maps for TX buffers.
1780  	 */
1781  	if ((txq->ift_sds.ifsd_map = (bus_dmamap_t *)malloc(
1782  	    sizeof(bus_dmamap_t) * scctx->isc_ntxd[txq->ift_br_offset],
1783  	    M_IFLIB, M_NOWAIT | M_ZERO)) == NULL) {
1784  		device_printf(dev,
1785  		    "Unable to allocate TX buffer DMA map memory\n");
1786  		err = ENOMEM;
1787  		goto fail;
1788  	}
1789  	if (tso && (txq->ift_sds.ifsd_tso_map = (bus_dmamap_t *)malloc(
1790  	    sizeof(bus_dmamap_t) * scctx->isc_ntxd[txq->ift_br_offset],
1791  	    M_IFLIB, M_NOWAIT | M_ZERO)) == NULL) {
1792  		device_printf(dev,
1793  		    "Unable to allocate TSO TX buffer map memory\n");
1794  		err = ENOMEM;
1795  		goto fail;
1796  	}
1797  	for (int i = 0; i < scctx->isc_ntxd[txq->ift_br_offset]; i++) {
1798  		err = bus_dmamap_create(txq->ift_buf_tag, 0,
1799  		    &txq->ift_sds.ifsd_map[i]);
1800  		if (err != 0) {
1801  			device_printf(dev, "Unable to create TX DMA map\n");
1802  			goto fail;
1803  		}
1804  		if (!tso)
1805  			continue;
1806  		err = bus_dmamap_create(txq->ift_tso_buf_tag, 0,
1807  		    &txq->ift_sds.ifsd_tso_map[i]);
1808  		if (err != 0) {
1809  			device_printf(dev, "Unable to create TSO TX DMA map\n");
1810  			goto fail;
1811  		}
1812  	}
1813  	return (0);
1814  fail:
1815  	/* We free all, it handles case where we are in the middle */
1816  	iflib_tx_structures_free(ctx);
1817  	return (err);
1818  }
1819  
1820  static void
iflib_txsd_destroy(if_ctx_t ctx,iflib_txq_t txq,int i)1821  iflib_txsd_destroy(if_ctx_t ctx, iflib_txq_t txq, int i)
1822  {
1823  	bus_dmamap_t map;
1824  
1825  	if (txq->ift_sds.ifsd_map != NULL) {
1826  		map = txq->ift_sds.ifsd_map[i];
1827  		bus_dmamap_sync(txq->ift_buf_tag, map, BUS_DMASYNC_POSTWRITE);
1828  		bus_dmamap_unload(txq->ift_buf_tag, map);
1829  		bus_dmamap_destroy(txq->ift_buf_tag, map);
1830  		txq->ift_sds.ifsd_map[i] = NULL;
1831  	}
1832  
1833  	if (txq->ift_sds.ifsd_tso_map != NULL) {
1834  		map = txq->ift_sds.ifsd_tso_map[i];
1835  		bus_dmamap_sync(txq->ift_tso_buf_tag, map,
1836  		    BUS_DMASYNC_POSTWRITE);
1837  		bus_dmamap_unload(txq->ift_tso_buf_tag, map);
1838  		bus_dmamap_destroy(txq->ift_tso_buf_tag, map);
1839  		txq->ift_sds.ifsd_tso_map[i] = NULL;
1840  	}
1841  }
1842  
1843  static void
iflib_txq_destroy(iflib_txq_t txq)1844  iflib_txq_destroy(iflib_txq_t txq)
1845  {
1846  	if_ctx_t ctx = txq->ift_ctx;
1847  
1848  	for (int i = 0; i < txq->ift_size; i++)
1849  		iflib_txsd_destroy(ctx, txq, i);
1850  
1851  	if (txq->ift_br != NULL) {
1852  		ifmp_ring_free(txq->ift_br);
1853  		txq->ift_br = NULL;
1854  	}
1855  
1856  	mtx_destroy(&txq->ift_mtx);
1857  
1858  	if (txq->ift_sds.ifsd_map != NULL) {
1859  		free(txq->ift_sds.ifsd_map, M_IFLIB);
1860  		txq->ift_sds.ifsd_map = NULL;
1861  	}
1862  	if (txq->ift_sds.ifsd_tso_map != NULL) {
1863  		free(txq->ift_sds.ifsd_tso_map, M_IFLIB);
1864  		txq->ift_sds.ifsd_tso_map = NULL;
1865  	}
1866  	if (txq->ift_sds.ifsd_m != NULL) {
1867  		free(txq->ift_sds.ifsd_m, M_IFLIB);
1868  		txq->ift_sds.ifsd_m = NULL;
1869  	}
1870  	if (txq->ift_buf_tag != NULL) {
1871  		bus_dma_tag_destroy(txq->ift_buf_tag);
1872  		txq->ift_buf_tag = NULL;
1873  	}
1874  	if (txq->ift_tso_buf_tag != NULL) {
1875  		bus_dma_tag_destroy(txq->ift_tso_buf_tag);
1876  		txq->ift_tso_buf_tag = NULL;
1877  	}
1878  	if (txq->ift_ifdi != NULL) {
1879  		free(txq->ift_ifdi, M_IFLIB);
1880  	}
1881  }
1882  
1883  static void
iflib_txsd_free(if_ctx_t ctx,iflib_txq_t txq,int i)1884  iflib_txsd_free(if_ctx_t ctx, iflib_txq_t txq, int i)
1885  {
1886  	struct mbuf **mp;
1887  
1888  	mp = &txq->ift_sds.ifsd_m[i];
1889  	if (*mp == NULL)
1890  		return;
1891  
1892  	if (txq->ift_sds.ifsd_map != NULL) {
1893  		bus_dmamap_sync(txq->ift_buf_tag,
1894  		    txq->ift_sds.ifsd_map[i], BUS_DMASYNC_POSTWRITE);
1895  		bus_dmamap_unload(txq->ift_buf_tag, txq->ift_sds.ifsd_map[i]);
1896  	}
1897  	if (txq->ift_sds.ifsd_tso_map != NULL) {
1898  		bus_dmamap_sync(txq->ift_tso_buf_tag,
1899  		    txq->ift_sds.ifsd_tso_map[i], BUS_DMASYNC_POSTWRITE);
1900  		bus_dmamap_unload(txq->ift_tso_buf_tag,
1901  		    txq->ift_sds.ifsd_tso_map[i]);
1902  	}
1903  	m_freem(*mp);
1904  	DBG_COUNTER_INC(tx_frees);
1905  	*mp = NULL;
1906  }
1907  
1908  static int
iflib_txq_setup(iflib_txq_t txq)1909  iflib_txq_setup(iflib_txq_t txq)
1910  {
1911  	if_ctx_t ctx = txq->ift_ctx;
1912  	if_softc_ctx_t scctx = &ctx->ifc_softc_ctx;
1913  	if_shared_ctx_t sctx = ctx->ifc_sctx;
1914  	iflib_dma_info_t di;
1915  	int i;
1916  
1917  	/* Set number of descriptors available */
1918  	txq->ift_qstatus = IFLIB_QUEUE_IDLE;
1919  	/* XXX make configurable */
1920  	txq->ift_update_freq = IFLIB_DEFAULT_TX_UPDATE_FREQ;
1921  
1922  	/* Reset indices */
1923  	txq->ift_cidx_processed = 0;
1924  	txq->ift_pidx = txq->ift_cidx = txq->ift_npending = 0;
1925  	txq->ift_size = scctx->isc_ntxd[txq->ift_br_offset];
1926  
1927  	for (i = 0, di = txq->ift_ifdi; i < sctx->isc_ntxqs; i++, di++)
1928  		bzero((void *)di->idi_vaddr, di->idi_size);
1929  
1930  	IFDI_TXQ_SETUP(ctx, txq->ift_id);
1931  	for (i = 0, di = txq->ift_ifdi; i < sctx->isc_ntxqs; i++, di++)
1932  		bus_dmamap_sync(di->idi_tag, di->idi_map,
1933  		    BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
1934  	return (0);
1935  }
1936  
1937  /*********************************************************************
1938   *
1939   *  Allocate DMA resources for RX buffers as well as memory for the RX
1940   *  mbuf map, direct RX cluster pointer map and RX cluster bus address
1941   *  map.  RX DMA map, RX mbuf map, direct RX cluster pointer map and
1942   *  RX cluster map are kept in a iflib_sw_rx_desc_array structure.
1943   *  Since we use use one entry in iflib_sw_rx_desc_array per received
1944   *  packet, the maximum number of entries we'll need is equal to the
1945   *  number of hardware receive descriptors that we've allocated.
1946   *
1947   **********************************************************************/
1948  static int
iflib_rxsd_alloc(iflib_rxq_t rxq)1949  iflib_rxsd_alloc(iflib_rxq_t rxq)
1950  {
1951  	if_ctx_t ctx = rxq->ifr_ctx;
1952  	if_shared_ctx_t sctx = ctx->ifc_sctx;
1953  	if_softc_ctx_t scctx = &ctx->ifc_softc_ctx;
1954  	device_t dev = ctx->ifc_dev;
1955  	iflib_fl_t fl;
1956  	bus_addr_t lowaddr;
1957  	int err;
1958  
1959  	MPASS(scctx->isc_nrxd[0] > 0);
1960  	MPASS(scctx->isc_nrxd[rxq->ifr_fl_offset] > 0);
1961  
1962  	lowaddr = DMA_WIDTH_TO_BUS_LOWADDR(scctx->isc_dma_width);
1963  
1964  	fl = rxq->ifr_fl;
1965  	for (int i = 0; i < rxq->ifr_nfl; i++, fl++) {
1966  		fl->ifl_size = scctx->isc_nrxd[rxq->ifr_fl_offset]; /* this isn't necessarily the same */
1967  		/* Set up DMA tag for RX buffers. */
1968  		err = bus_dma_tag_create(bus_get_dma_tag(dev), /* parent */
1969  					 1, 0,			/* alignment, bounds */
1970  					 lowaddr,		/* lowaddr */
1971  					 BUS_SPACE_MAXADDR,	/* highaddr */
1972  					 NULL, NULL,		/* filter, filterarg */
1973  					 sctx->isc_rx_maxsize,	/* maxsize */
1974  					 sctx->isc_rx_nsegments,	/* nsegments */
1975  					 sctx->isc_rx_maxsegsize,	/* maxsegsize */
1976  					 0,			/* flags */
1977  					 NULL,			/* lockfunc */
1978  					 NULL,			/* lockarg */
1979  					 &fl->ifl_buf_tag);
1980  		if (err) {
1981  			device_printf(dev,
1982  			    "Unable to allocate RX DMA tag: %d\n", err);
1983  			goto fail;
1984  		}
1985  
1986  		/* Allocate memory for the RX mbuf map. */
1987  		if (!(fl->ifl_sds.ifsd_m =
1988  		      (struct mbuf **) malloc(sizeof(struct mbuf *) *
1989  					      scctx->isc_nrxd[rxq->ifr_fl_offset], M_IFLIB, M_NOWAIT | M_ZERO))) {
1990  			device_printf(dev,
1991  			    "Unable to allocate RX mbuf map memory\n");
1992  			err = ENOMEM;
1993  			goto fail;
1994  		}
1995  
1996  		/* Allocate memory for the direct RX cluster pointer map. */
1997  		if (!(fl->ifl_sds.ifsd_cl =
1998  		      (caddr_t *) malloc(sizeof(caddr_t) *
1999  					      scctx->isc_nrxd[rxq->ifr_fl_offset], M_IFLIB, M_NOWAIT | M_ZERO))) {
2000  			device_printf(dev,
2001  			    "Unable to allocate RX cluster map memory\n");
2002  			err = ENOMEM;
2003  			goto fail;
2004  		}
2005  
2006  		/* Allocate memory for the RX cluster bus address map. */
2007  		if (!(fl->ifl_sds.ifsd_ba =
2008  		      (bus_addr_t *) malloc(sizeof(bus_addr_t) *
2009  					      scctx->isc_nrxd[rxq->ifr_fl_offset], M_IFLIB, M_NOWAIT | M_ZERO))) {
2010  			device_printf(dev,
2011  			    "Unable to allocate RX bus address map memory\n");
2012  			err = ENOMEM;
2013  			goto fail;
2014  		}
2015  
2016  		/*
2017  		 * Create the DMA maps for RX buffers.
2018  		 */
2019  		if (!(fl->ifl_sds.ifsd_map =
2020  		      (bus_dmamap_t *) malloc(sizeof(bus_dmamap_t) * scctx->isc_nrxd[rxq->ifr_fl_offset], M_IFLIB, M_NOWAIT | M_ZERO))) {
2021  			device_printf(dev,
2022  			    "Unable to allocate RX buffer DMA map memory\n");
2023  			err = ENOMEM;
2024  			goto fail;
2025  		}
2026  		for (int i = 0; i < scctx->isc_nrxd[rxq->ifr_fl_offset]; i++) {
2027  			err = bus_dmamap_create(fl->ifl_buf_tag, 0,
2028  			    &fl->ifl_sds.ifsd_map[i]);
2029  			if (err != 0) {
2030  				device_printf(dev, "Unable to create RX buffer DMA map\n");
2031  				goto fail;
2032  			}
2033  		}
2034  	}
2035  	return (0);
2036  
2037  fail:
2038  	iflib_rx_structures_free(ctx);
2039  	return (err);
2040  }
2041  
2042  /*
2043   * Internal service routines
2044   */
2045  
2046  struct rxq_refill_cb_arg {
2047  	int               error;
2048  	bus_dma_segment_t seg;
2049  	int               nseg;
2050  };
2051  
2052  static void
_rxq_refill_cb(void * arg,bus_dma_segment_t * segs,int nseg,int error)2053  _rxq_refill_cb(void *arg, bus_dma_segment_t *segs, int nseg, int error)
2054  {
2055  	struct rxq_refill_cb_arg *cb_arg = arg;
2056  
2057  	cb_arg->error = error;
2058  	cb_arg->seg = segs[0];
2059  	cb_arg->nseg = nseg;
2060  }
2061  
2062  /**
2063   * iflib_fl_refill - refill an rxq free-buffer list
2064   * @ctx: the iflib context
2065   * @fl: the free list to refill
2066   * @count: the number of new buffers to allocate
2067   *
2068   * (Re)populate an rxq free-buffer list with up to @count new packet buffers.
2069   * The caller must assure that @count does not exceed the queue's capacity
2070   * minus one (since we always leave a descriptor unavailable).
2071   */
2072  static uint8_t
iflib_fl_refill(if_ctx_t ctx,iflib_fl_t fl,int count)2073  iflib_fl_refill(if_ctx_t ctx, iflib_fl_t fl, int count)
2074  {
2075  	struct if_rxd_update iru;
2076  	struct rxq_refill_cb_arg cb_arg;
2077  	struct mbuf *m;
2078  	caddr_t cl, *sd_cl;
2079  	struct mbuf **sd_m;
2080  	bus_dmamap_t *sd_map;
2081  	bus_addr_t bus_addr, *sd_ba;
2082  	int err, frag_idx, i, idx, n, pidx;
2083  	qidx_t credits;
2084  
2085  	MPASS(count <= fl->ifl_size - fl->ifl_credits - 1);
2086  
2087  	sd_m = fl->ifl_sds.ifsd_m;
2088  	sd_map = fl->ifl_sds.ifsd_map;
2089  	sd_cl = fl->ifl_sds.ifsd_cl;
2090  	sd_ba = fl->ifl_sds.ifsd_ba;
2091  	pidx = fl->ifl_pidx;
2092  	idx = pidx;
2093  	frag_idx = fl->ifl_fragidx;
2094  	credits = fl->ifl_credits;
2095  
2096  	i = 0;
2097  	n = count;
2098  	MPASS(n > 0);
2099  	MPASS(credits + n <= fl->ifl_size);
2100  
2101  	if (pidx < fl->ifl_cidx)
2102  		MPASS(pidx + n <= fl->ifl_cidx);
2103  	if (pidx == fl->ifl_cidx && (credits < fl->ifl_size))
2104  		MPASS(fl->ifl_gen == 0);
2105  	if (pidx > fl->ifl_cidx)
2106  		MPASS(n <= fl->ifl_size - pidx + fl->ifl_cidx);
2107  
2108  	DBG_COUNTER_INC(fl_refills);
2109  	if (n > 8)
2110  		DBG_COUNTER_INC(fl_refills_large);
2111  	iru_init(&iru, fl->ifl_rxq, fl->ifl_id);
2112  	while (n-- > 0) {
2113  		/*
2114  		 * We allocate an uninitialized mbuf + cluster, mbuf is
2115  		 * initialized after rx.
2116  		 *
2117  		 * If the cluster is still set then we know a minimum sized
2118  		 * packet was received
2119  		 */
2120  		bit_ffc_at(fl->ifl_rx_bitmap, frag_idx, fl->ifl_size,
2121  		    &frag_idx);
2122  		if (frag_idx < 0)
2123  			bit_ffc(fl->ifl_rx_bitmap, fl->ifl_size, &frag_idx);
2124  		MPASS(frag_idx >= 0);
2125  		if ((cl = sd_cl[frag_idx]) == NULL) {
2126  			cl = uma_zalloc(fl->ifl_zone, M_NOWAIT);
2127  			if (__predict_false(cl == NULL))
2128  				break;
2129  
2130  			cb_arg.error = 0;
2131  			MPASS(sd_map != NULL);
2132  			err = bus_dmamap_load(fl->ifl_buf_tag, sd_map[frag_idx],
2133  			    cl, fl->ifl_buf_size, _rxq_refill_cb, &cb_arg,
2134  			    BUS_DMA_NOWAIT);
2135  			if (__predict_false(err != 0 || cb_arg.error)) {
2136  				uma_zfree(fl->ifl_zone, cl);
2137  				break;
2138  			}
2139  
2140  			sd_ba[frag_idx] = bus_addr = cb_arg.seg.ds_addr;
2141  			sd_cl[frag_idx] = cl;
2142  #if MEMORY_LOGGING
2143  			fl->ifl_cl_enqueued++;
2144  #endif
2145  		} else {
2146  			bus_addr = sd_ba[frag_idx];
2147  		}
2148  		bus_dmamap_sync(fl->ifl_buf_tag, sd_map[frag_idx],
2149  		    BUS_DMASYNC_PREREAD);
2150  
2151  		if (sd_m[frag_idx] == NULL) {
2152  			m = m_gethdr_raw(M_NOWAIT, 0);
2153  			if (__predict_false(m == NULL))
2154  				break;
2155  			sd_m[frag_idx] = m;
2156  		}
2157  		bit_set(fl->ifl_rx_bitmap, frag_idx);
2158  #if MEMORY_LOGGING
2159  		fl->ifl_m_enqueued++;
2160  #endif
2161  
2162  		DBG_COUNTER_INC(rx_allocs);
2163  		fl->ifl_rxd_idxs[i] = frag_idx;
2164  		fl->ifl_bus_addrs[i] = bus_addr;
2165  		credits++;
2166  		i++;
2167  		MPASS(credits <= fl->ifl_size);
2168  		if (++idx == fl->ifl_size) {
2169  #ifdef INVARIANTS
2170  			fl->ifl_gen = 1;
2171  #endif
2172  			idx = 0;
2173  		}
2174  		if (n == 0 || i == IFLIB_MAX_RX_REFRESH) {
2175  			iru.iru_pidx = pidx;
2176  			iru.iru_count = i;
2177  			ctx->isc_rxd_refill(ctx->ifc_softc, &iru);
2178  			fl->ifl_pidx = idx;
2179  			fl->ifl_credits = credits;
2180  			pidx = idx;
2181  			i = 0;
2182  		}
2183  	}
2184  
2185  	if (n < count - 1) {
2186  		if (i != 0) {
2187  			iru.iru_pidx = pidx;
2188  			iru.iru_count = i;
2189  			ctx->isc_rxd_refill(ctx->ifc_softc, &iru);
2190  			fl->ifl_pidx = idx;
2191  			fl->ifl_credits = credits;
2192  		}
2193  		DBG_COUNTER_INC(rxd_flush);
2194  		bus_dmamap_sync(fl->ifl_ifdi->idi_tag, fl->ifl_ifdi->idi_map,
2195  		    BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
2196  		ctx->isc_rxd_flush(ctx->ifc_softc, fl->ifl_rxq->ifr_id,
2197  		    fl->ifl_id, fl->ifl_pidx);
2198  		if (__predict_true(bit_test(fl->ifl_rx_bitmap, frag_idx))) {
2199  			fl->ifl_fragidx = frag_idx + 1;
2200  			if (fl->ifl_fragidx == fl->ifl_size)
2201  				fl->ifl_fragidx = 0;
2202  		} else {
2203  			fl->ifl_fragidx = frag_idx;
2204  		}
2205  	}
2206  
2207  	return (n == -1 ? 0 : IFLIB_RXEOF_EMPTY);
2208  }
2209  
2210  static inline uint8_t
iflib_fl_refill_all(if_ctx_t ctx,iflib_fl_t fl)2211  iflib_fl_refill_all(if_ctx_t ctx, iflib_fl_t fl)
2212  {
2213  	/*
2214  	 * We leave an unused descriptor to avoid pidx to catch up with cidx.
2215  	 * This is important as it confuses most NICs. For instance,
2216  	 * Intel NICs have (per receive ring) RDH and RDT registers, where
2217  	 * RDH points to the next receive descriptor to be used by the NIC,
2218  	 * and RDT for the next receive descriptor to be published by the
2219  	 * driver to the NIC (RDT - 1 is thus the last valid one).
2220  	 * The condition RDH == RDT means no descriptors are available to
2221  	 * the NIC, and thus it would be ambiguous if it also meant that
2222  	 * all the descriptors are available to the NIC.
2223  	 */
2224  	int32_t reclaimable = fl->ifl_size - fl->ifl_credits - 1;
2225  #ifdef INVARIANTS
2226  	int32_t delta = fl->ifl_size - get_inuse(fl->ifl_size, fl->ifl_cidx, fl->ifl_pidx, fl->ifl_gen) - 1;
2227  #endif
2228  
2229  	MPASS(fl->ifl_credits <= fl->ifl_size);
2230  	MPASS(reclaimable == delta);
2231  
2232  	if (reclaimable > 0)
2233  		return (iflib_fl_refill(ctx, fl, reclaimable));
2234  	return (0);
2235  }
2236  
2237  uint8_t
iflib_in_detach(if_ctx_t ctx)2238  iflib_in_detach(if_ctx_t ctx)
2239  {
2240  	bool in_detach;
2241  
2242  	STATE_LOCK(ctx);
2243  	in_detach = !!(ctx->ifc_flags & IFC_IN_DETACH);
2244  	STATE_UNLOCK(ctx);
2245  	return (in_detach);
2246  }
2247  
2248  static void
iflib_fl_bufs_free(iflib_fl_t fl)2249  iflib_fl_bufs_free(iflib_fl_t fl)
2250  {
2251  	iflib_dma_info_t idi = fl->ifl_ifdi;
2252  	bus_dmamap_t sd_map;
2253  	uint32_t i;
2254  
2255  	for (i = 0; i < fl->ifl_size; i++) {
2256  		struct mbuf **sd_m = &fl->ifl_sds.ifsd_m[i];
2257  		caddr_t *sd_cl = &fl->ifl_sds.ifsd_cl[i];
2258  
2259  		if (*sd_cl != NULL) {
2260  			sd_map = fl->ifl_sds.ifsd_map[i];
2261  			bus_dmamap_sync(fl->ifl_buf_tag, sd_map,
2262  			    BUS_DMASYNC_POSTREAD);
2263  			bus_dmamap_unload(fl->ifl_buf_tag, sd_map);
2264  			uma_zfree(fl->ifl_zone, *sd_cl);
2265  			*sd_cl = NULL;
2266  			if (*sd_m != NULL) {
2267  				m_init(*sd_m, M_NOWAIT, MT_DATA, 0);
2268  				m_free_raw(*sd_m);
2269  				*sd_m = NULL;
2270  			}
2271  		} else {
2272  			MPASS(*sd_m == NULL);
2273  		}
2274  #if MEMORY_LOGGING
2275  		fl->ifl_m_dequeued++;
2276  		fl->ifl_cl_dequeued++;
2277  #endif
2278  	}
2279  #ifdef INVARIANTS
2280  	for (i = 0; i < fl->ifl_size; i++) {
2281  		MPASS(fl->ifl_sds.ifsd_cl[i] == NULL);
2282  		MPASS(fl->ifl_sds.ifsd_m[i] == NULL);
2283  	}
2284  #endif
2285  	/*
2286  	 * Reset free list values
2287  	 */
2288  	fl->ifl_credits = fl->ifl_cidx = fl->ifl_pidx = fl->ifl_gen = fl->ifl_fragidx = 0;
2289  	bzero(idi->idi_vaddr, idi->idi_size);
2290  }
2291  
2292  /*********************************************************************
2293   *
2294   *  Initialize a free list and its buffers.
2295   *
2296   **********************************************************************/
2297  static int
iflib_fl_setup(iflib_fl_t fl)2298  iflib_fl_setup(iflib_fl_t fl)
2299  {
2300  	iflib_rxq_t rxq = fl->ifl_rxq;
2301  	if_ctx_t ctx = rxq->ifr_ctx;
2302  	if_softc_ctx_t scctx = &ctx->ifc_softc_ctx;
2303  	int qidx;
2304  
2305  	bit_nclear(fl->ifl_rx_bitmap, 0, fl->ifl_size - 1);
2306  	/*
2307  	 * Free current RX buffer structs and their mbufs
2308  	 */
2309  	iflib_fl_bufs_free(fl);
2310  	/* Now replenish the mbufs */
2311  	MPASS(fl->ifl_credits == 0);
2312  	qidx = rxq->ifr_fl_offset + fl->ifl_id;
2313  	if (scctx->isc_rxd_buf_size[qidx] != 0)
2314  		fl->ifl_buf_size = scctx->isc_rxd_buf_size[qidx];
2315  	else
2316  		fl->ifl_buf_size = ctx->ifc_rx_mbuf_sz;
2317  	/*
2318  	 * ifl_buf_size may be a driver-supplied value, so pull it up
2319  	 * to the selected mbuf size.
2320  	 */
2321  	fl->ifl_buf_size = iflib_get_mbuf_size_for(fl->ifl_buf_size);
2322  	if (fl->ifl_buf_size > ctx->ifc_max_fl_buf_size)
2323  		ctx->ifc_max_fl_buf_size = fl->ifl_buf_size;
2324  	fl->ifl_cltype = m_gettype(fl->ifl_buf_size);
2325  	fl->ifl_zone = m_getzone(fl->ifl_buf_size);
2326  
2327  	/*
2328  	 * Avoid pre-allocating zillions of clusters to an idle card
2329  	 * potentially speeding up attach. In any case make sure
2330  	 * to leave a descriptor unavailable. See the comment in
2331  	 * iflib_fl_refill_all().
2332  	 */
2333  	MPASS(fl->ifl_size > 0);
2334  	(void)iflib_fl_refill(ctx, fl, min(128, fl->ifl_size - 1));
2335  	if (min(128, fl->ifl_size - 1) != fl->ifl_credits)
2336  		return (ENOBUFS);
2337  	/*
2338  	 * handle failure
2339  	 */
2340  	MPASS(rxq != NULL);
2341  	MPASS(fl->ifl_ifdi != NULL);
2342  	bus_dmamap_sync(fl->ifl_ifdi->idi_tag, fl->ifl_ifdi->idi_map,
2343  	    BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
2344  	return (0);
2345  }
2346  
2347  /*********************************************************************
2348   *
2349   *  Free receive ring data structures
2350   *
2351   **********************************************************************/
2352  static void
iflib_rx_sds_free(iflib_rxq_t rxq)2353  iflib_rx_sds_free(iflib_rxq_t rxq)
2354  {
2355  	iflib_fl_t fl;
2356  	int i, j;
2357  
2358  	if (rxq->ifr_fl != NULL) {
2359  		for (i = 0; i < rxq->ifr_nfl; i++) {
2360  			fl = &rxq->ifr_fl[i];
2361  			if (fl->ifl_buf_tag != NULL) {
2362  				if (fl->ifl_sds.ifsd_map != NULL) {
2363  					for (j = 0; j < fl->ifl_size; j++) {
2364  						bus_dmamap_sync(
2365  						    fl->ifl_buf_tag,
2366  						    fl->ifl_sds.ifsd_map[j],
2367  						    BUS_DMASYNC_POSTREAD);
2368  						bus_dmamap_unload(
2369  						    fl->ifl_buf_tag,
2370  						    fl->ifl_sds.ifsd_map[j]);
2371  						bus_dmamap_destroy(
2372  						    fl->ifl_buf_tag,
2373  						    fl->ifl_sds.ifsd_map[j]);
2374  					}
2375  				}
2376  				bus_dma_tag_destroy(fl->ifl_buf_tag);
2377  				fl->ifl_buf_tag = NULL;
2378  			}
2379  			free(fl->ifl_sds.ifsd_m, M_IFLIB);
2380  			free(fl->ifl_sds.ifsd_cl, M_IFLIB);
2381  			free(fl->ifl_sds.ifsd_ba, M_IFLIB);
2382  			free(fl->ifl_sds.ifsd_map, M_IFLIB);
2383  			free(fl->ifl_rx_bitmap, M_IFLIB);
2384  			fl->ifl_sds.ifsd_m = NULL;
2385  			fl->ifl_sds.ifsd_cl = NULL;
2386  			fl->ifl_sds.ifsd_ba = NULL;
2387  			fl->ifl_sds.ifsd_map = NULL;
2388  			fl->ifl_rx_bitmap = NULL;
2389  		}
2390  		free(rxq->ifr_fl, M_IFLIB);
2391  		rxq->ifr_fl = NULL;
2392  		free(rxq->ifr_ifdi, M_IFLIB);
2393  		rxq->ifr_ifdi = NULL;
2394  		rxq->ifr_cq_cidx = 0;
2395  	}
2396  }
2397  
2398  /*
2399   * Timer routine
2400   */
2401  static void
iflib_timer(void * arg)2402  iflib_timer(void *arg)
2403  {
2404  	iflib_txq_t txq = arg;
2405  	if_ctx_t ctx = txq->ift_ctx;
2406  	if_softc_ctx_t sctx = &ctx->ifc_softc_ctx;
2407  	uint64_t this_tick = ticks;
2408  
2409  	if (!(if_getdrvflags(ctx->ifc_ifp) & IFF_DRV_RUNNING))
2410  		return;
2411  
2412  	/*
2413  	** Check on the state of the TX queue(s), this
2414  	** can be done without the lock because its RO
2415  	** and the HUNG state will be static if set.
2416  	*/
2417  	if (this_tick - txq->ift_last_timer_tick >= iflib_timer_default) {
2418  		txq->ift_last_timer_tick = this_tick;
2419  		IFDI_TIMER(ctx, txq->ift_id);
2420  		if ((txq->ift_qstatus == IFLIB_QUEUE_HUNG) &&
2421  		    ((txq->ift_cleaned_prev == txq->ift_cleaned) ||
2422  		     (sctx->isc_pause_frames == 0)))
2423  			goto hung;
2424  
2425  		if (txq->ift_qstatus != IFLIB_QUEUE_IDLE &&
2426  		    ifmp_ring_is_stalled(txq->ift_br)) {
2427  			KASSERT(ctx->ifc_link_state == LINK_STATE_UP,
2428  			    ("queue can't be marked as hung if interface is down"));
2429  			txq->ift_qstatus = IFLIB_QUEUE_HUNG;
2430  		}
2431  		txq->ift_cleaned_prev = txq->ift_cleaned;
2432  	}
2433  	/* handle any laggards */
2434  	if (txq->ift_db_pending)
2435  		GROUPTASK_ENQUEUE(&txq->ift_task);
2436  
2437  	sctx->isc_pause_frames = 0;
2438  	if (if_getdrvflags(ctx->ifc_ifp) & IFF_DRV_RUNNING)
2439  		callout_reset_on(&txq->ift_timer, iflib_timer_default, iflib_timer,
2440  		    txq, txq->ift_timer.c_cpu);
2441  	return;
2442  
2443   hung:
2444  	device_printf(ctx->ifc_dev,
2445  	    "Watchdog timeout (TX: %d desc avail: %d pidx: %d) -- resetting\n",
2446  	    txq->ift_id, TXQ_AVAIL(txq), txq->ift_pidx);
2447  	STATE_LOCK(ctx);
2448  	if_setdrvflagbits(ctx->ifc_ifp, IFF_DRV_OACTIVE, IFF_DRV_RUNNING);
2449  	ctx->ifc_flags |= (IFC_DO_WATCHDOG | IFC_DO_RESET);
2450  	iflib_admin_intr_deferred(ctx);
2451  	STATE_UNLOCK(ctx);
2452  }
2453  
2454  static uint16_t
iflib_get_mbuf_size_for(unsigned int size)2455  iflib_get_mbuf_size_for(unsigned int size)
2456  {
2457  
2458  	if (size <= MCLBYTES)
2459  		return (MCLBYTES);
2460  	else
2461  		return (MJUMPAGESIZE);
2462  }
2463  
2464  static void
iflib_calc_rx_mbuf_sz(if_ctx_t ctx)2465  iflib_calc_rx_mbuf_sz(if_ctx_t ctx)
2466  {
2467  	if_softc_ctx_t sctx = &ctx->ifc_softc_ctx;
2468  
2469  	/*
2470  	 * XXX don't set the max_frame_size to larger
2471  	 * than the hardware can handle
2472  	 */
2473  	ctx->ifc_rx_mbuf_sz =
2474  	    iflib_get_mbuf_size_for(sctx->isc_max_frame_size);
2475  }
2476  
2477  uint32_t
iflib_get_rx_mbuf_sz(if_ctx_t ctx)2478  iflib_get_rx_mbuf_sz(if_ctx_t ctx)
2479  {
2480  
2481  	return (ctx->ifc_rx_mbuf_sz);
2482  }
2483  
2484  static void
iflib_init_locked(if_ctx_t ctx)2485  iflib_init_locked(if_ctx_t ctx)
2486  {
2487  	if_softc_ctx_t scctx = &ctx->ifc_softc_ctx;
2488  	if_t ifp = ctx->ifc_ifp;
2489  	iflib_fl_t fl;
2490  	iflib_txq_t txq;
2491  	iflib_rxq_t rxq;
2492  	int i, j, tx_ip_csum_flags, tx_ip6_csum_flags;
2493  
2494  	if_setdrvflagbits(ifp, IFF_DRV_OACTIVE, IFF_DRV_RUNNING);
2495  	IFDI_INTR_DISABLE(ctx);
2496  
2497  	/*
2498  	 * See iflib_stop(). Useful in case iflib_init_locked() is
2499  	 * called without first calling iflib_stop().
2500  	 */
2501  	netmap_disable_all_rings(ifp);
2502  
2503  	tx_ip_csum_flags = scctx->isc_tx_csum_flags & (CSUM_IP | CSUM_TCP | CSUM_UDP | CSUM_SCTP);
2504  	tx_ip6_csum_flags = scctx->isc_tx_csum_flags & (CSUM_IP6_TCP | CSUM_IP6_UDP | CSUM_IP6_SCTP);
2505  	/* Set hardware offload abilities */
2506  	if_clearhwassist(ifp);
2507  	if (if_getcapenable(ifp) & IFCAP_TXCSUM)
2508  		if_sethwassistbits(ifp, tx_ip_csum_flags, 0);
2509  	if (if_getcapenable(ifp) & IFCAP_TXCSUM_IPV6)
2510  		if_sethwassistbits(ifp,  tx_ip6_csum_flags, 0);
2511  	if (if_getcapenable(ifp) & IFCAP_TSO4)
2512  		if_sethwassistbits(ifp, CSUM_IP_TSO, 0);
2513  	if (if_getcapenable(ifp) & IFCAP_TSO6)
2514  		if_sethwassistbits(ifp, CSUM_IP6_TSO, 0);
2515  
2516  	for (i = 0, txq = ctx->ifc_txqs; i < scctx->isc_ntxqsets; i++, txq++) {
2517  		CALLOUT_LOCK(txq);
2518  		callout_stop(&txq->ift_timer);
2519  #ifdef DEV_NETMAP
2520  		callout_stop(&txq->ift_netmap_timer);
2521  #endif /* DEV_NETMAP */
2522  		CALLOUT_UNLOCK(txq);
2523  		(void)iflib_netmap_txq_init(ctx, txq);
2524  	}
2525  
2526  	/*
2527  	 * Calculate a suitable Rx mbuf size prior to calling IFDI_INIT, so
2528  	 * that drivers can use the value when setting up the hardware receive
2529  	 * buffers.
2530  	 */
2531  	iflib_calc_rx_mbuf_sz(ctx);
2532  
2533  #ifdef INVARIANTS
2534  	i = if_getdrvflags(ifp);
2535  #endif
2536  	IFDI_INIT(ctx);
2537  	MPASS(if_getdrvflags(ifp) == i);
2538  	for (i = 0, rxq = ctx->ifc_rxqs; i < scctx->isc_nrxqsets; i++, rxq++) {
2539  		if (iflib_netmap_rxq_init(ctx, rxq) > 0) {
2540  			/* This rxq is in netmap mode. Skip normal init. */
2541  			continue;
2542  		}
2543  		for (j = 0, fl = rxq->ifr_fl; j < rxq->ifr_nfl; j++, fl++) {
2544  			if (iflib_fl_setup(fl)) {
2545  				device_printf(ctx->ifc_dev,
2546  				    "setting up free list %d failed - "
2547  				    "check cluster settings\n", j);
2548  				goto done;
2549  			}
2550  		}
2551  	}
2552  done:
2553  	if_setdrvflagbits(ctx->ifc_ifp, IFF_DRV_RUNNING, IFF_DRV_OACTIVE);
2554  	IFDI_INTR_ENABLE(ctx);
2555  	txq = ctx->ifc_txqs;
2556  	for (i = 0; i < scctx->isc_ntxqsets; i++, txq++)
2557  		callout_reset_on(&txq->ift_timer, iflib_timer_default, iflib_timer, txq,
2558  			txq->ift_timer.c_cpu);
2559  
2560          /* Re-enable txsync/rxsync. */
2561  	netmap_enable_all_rings(ifp);
2562  }
2563  
2564  static int
iflib_media_change(if_t ifp)2565  iflib_media_change(if_t ifp)
2566  {
2567  	if_ctx_t ctx = if_getsoftc(ifp);
2568  	int err;
2569  
2570  	CTX_LOCK(ctx);
2571  	if ((err = IFDI_MEDIA_CHANGE(ctx)) == 0)
2572  		iflib_if_init_locked(ctx);
2573  	CTX_UNLOCK(ctx);
2574  	return (err);
2575  }
2576  
2577  static void
iflib_media_status(if_t ifp,struct ifmediareq * ifmr)2578  iflib_media_status(if_t ifp, struct ifmediareq *ifmr)
2579  {
2580  	if_ctx_t ctx = if_getsoftc(ifp);
2581  
2582  	CTX_LOCK(ctx);
2583  	IFDI_UPDATE_ADMIN_STATUS(ctx);
2584  	IFDI_MEDIA_STATUS(ctx, ifmr);
2585  	CTX_UNLOCK(ctx);
2586  }
2587  
2588  static void
iflib_stop(if_ctx_t ctx)2589  iflib_stop(if_ctx_t ctx)
2590  {
2591  	iflib_txq_t txq = ctx->ifc_txqs;
2592  	iflib_rxq_t rxq = ctx->ifc_rxqs;
2593  	if_softc_ctx_t scctx = &ctx->ifc_softc_ctx;
2594  	if_shared_ctx_t sctx = ctx->ifc_sctx;
2595  	iflib_dma_info_t di;
2596  	iflib_fl_t fl;
2597  	int i, j;
2598  
2599  	/* Tell the stack that the interface is no longer active */
2600  	if_setdrvflagbits(ctx->ifc_ifp, IFF_DRV_OACTIVE, IFF_DRV_RUNNING);
2601  
2602  	IFDI_INTR_DISABLE(ctx);
2603  	DELAY(1000);
2604  	IFDI_STOP(ctx);
2605  	DELAY(1000);
2606  
2607  	/*
2608  	 * Stop any pending txsync/rxsync and prevent new ones
2609  	 * form starting. Processes blocked in poll() will get
2610  	 * POLLERR.
2611  	 */
2612  	netmap_disable_all_rings(ctx->ifc_ifp);
2613  
2614  	iflib_debug_reset();
2615  	/* Wait for current tx queue users to exit to disarm watchdog timer. */
2616  	for (i = 0; i < scctx->isc_ntxqsets; i++, txq++) {
2617  		/* make sure all transmitters have completed before proceeding XXX */
2618  
2619  		CALLOUT_LOCK(txq);
2620  		callout_stop(&txq->ift_timer);
2621  #ifdef DEV_NETMAP
2622  		callout_stop(&txq->ift_netmap_timer);
2623  #endif /* DEV_NETMAP */
2624  		CALLOUT_UNLOCK(txq);
2625  
2626  		/* clean any enqueued buffers */
2627  		iflib_ifmp_purge(txq);
2628  		/* Free any existing tx buffers. */
2629  		for (j = 0; j < txq->ift_size; j++) {
2630  			iflib_txsd_free(ctx, txq, j);
2631  		}
2632  		txq->ift_processed = txq->ift_cleaned = txq->ift_cidx_processed = 0;
2633  		txq->ift_in_use = txq->ift_gen = txq->ift_no_desc_avail = 0;
2634  		if (sctx->isc_flags & IFLIB_PRESERVE_TX_INDICES)
2635  			txq->ift_cidx = txq->ift_pidx;
2636  		else
2637  			txq->ift_cidx = txq->ift_pidx = 0;
2638  
2639  		txq->ift_closed = txq->ift_mbuf_defrag = txq->ift_mbuf_defrag_failed = 0;
2640  		txq->ift_no_tx_dma_setup = txq->ift_txd_encap_efbig = txq->ift_map_failed = 0;
2641  		txq->ift_pullups = 0;
2642  		ifmp_ring_reset_stats(txq->ift_br);
2643  		for (j = 0, di = txq->ift_ifdi; j < sctx->isc_ntxqs; j++, di++)
2644  			bzero((void *)di->idi_vaddr, di->idi_size);
2645  	}
2646  	for (i = 0; i < scctx->isc_nrxqsets; i++, rxq++) {
2647  		if (rxq->ifr_task.gt_taskqueue != NULL)
2648  			gtaskqueue_drain(rxq->ifr_task.gt_taskqueue,
2649  				 &rxq->ifr_task.gt_task);
2650  
2651  		rxq->ifr_cq_cidx = 0;
2652  		for (j = 0, di = rxq->ifr_ifdi; j < sctx->isc_nrxqs; j++, di++)
2653  			bzero((void *)di->idi_vaddr, di->idi_size);
2654  		/* also resets the free lists pidx/cidx */
2655  		for (j = 0, fl = rxq->ifr_fl; j < rxq->ifr_nfl; j++, fl++)
2656  			iflib_fl_bufs_free(fl);
2657  	}
2658  }
2659  
2660  static inline caddr_t
calc_next_rxd(iflib_fl_t fl,int cidx)2661  calc_next_rxd(iflib_fl_t fl, int cidx)
2662  {
2663  	qidx_t size;
2664  	int nrxd;
2665  	caddr_t start, end, cur, next;
2666  
2667  	nrxd = fl->ifl_size;
2668  	size = fl->ifl_rxd_size;
2669  	start = fl->ifl_ifdi->idi_vaddr;
2670  
2671  	if (__predict_false(size == 0))
2672  		return (start);
2673  	cur = start + size * cidx;
2674  	end = start + size * nrxd;
2675  	next = CACHE_PTR_NEXT(cur);
2676  	return (next < end ? next : start);
2677  }
2678  
2679  static inline void
prefetch_pkts(iflib_fl_t fl,int cidx)2680  prefetch_pkts(iflib_fl_t fl, int cidx)
2681  {
2682  	int nextptr;
2683  	int nrxd = fl->ifl_size;
2684  	caddr_t next_rxd;
2685  
2686  	nextptr = (cidx + CACHE_PTR_INCREMENT) & (nrxd - 1);
2687  	prefetch(&fl->ifl_sds.ifsd_m[nextptr]);
2688  	prefetch(&fl->ifl_sds.ifsd_cl[nextptr]);
2689  	next_rxd = calc_next_rxd(fl, cidx);
2690  	prefetch(next_rxd);
2691  	prefetch(fl->ifl_sds.ifsd_m[(cidx + 1) & (nrxd - 1)]);
2692  	prefetch(fl->ifl_sds.ifsd_m[(cidx + 2) & (nrxd - 1)]);
2693  	prefetch(fl->ifl_sds.ifsd_m[(cidx + 3) & (nrxd - 1)]);
2694  	prefetch(fl->ifl_sds.ifsd_m[(cidx + 4) & (nrxd - 1)]);
2695  	prefetch(fl->ifl_sds.ifsd_cl[(cidx + 1) & (nrxd - 1)]);
2696  	prefetch(fl->ifl_sds.ifsd_cl[(cidx + 2) & (nrxd - 1)]);
2697  	prefetch(fl->ifl_sds.ifsd_cl[(cidx + 3) & (nrxd - 1)]);
2698  	prefetch(fl->ifl_sds.ifsd_cl[(cidx + 4) & (nrxd - 1)]);
2699  }
2700  
2701  static struct mbuf *
rxd_frag_to_sd(iflib_rxq_t rxq,if_rxd_frag_t irf,bool unload,if_rxsd_t sd,int * pf_rv,if_rxd_info_t ri)2702  rxd_frag_to_sd(iflib_rxq_t rxq, if_rxd_frag_t irf, bool unload, if_rxsd_t sd,
2703      int *pf_rv, if_rxd_info_t ri)
2704  {
2705  	bus_dmamap_t map;
2706  	iflib_fl_t fl;
2707  	caddr_t payload;
2708  	struct mbuf *m;
2709  	int flid, cidx, len, next;
2710  
2711  	map = NULL;
2712  	flid = irf->irf_flid;
2713  	cidx = irf->irf_idx;
2714  	fl = &rxq->ifr_fl[flid];
2715  	sd->ifsd_fl = fl;
2716  	sd->ifsd_cl = &fl->ifl_sds.ifsd_cl[cidx];
2717  	fl->ifl_credits--;
2718  #if MEMORY_LOGGING
2719  	fl->ifl_m_dequeued++;
2720  #endif
2721  	if (rxq->ifr_ctx->ifc_flags & IFC_PREFETCH)
2722  		prefetch_pkts(fl, cidx);
2723  	next = (cidx + CACHE_PTR_INCREMENT) & (fl->ifl_size - 1);
2724  	prefetch(&fl->ifl_sds.ifsd_map[next]);
2725  	map = fl->ifl_sds.ifsd_map[cidx];
2726  
2727  	bus_dmamap_sync(fl->ifl_buf_tag, map, BUS_DMASYNC_POSTREAD);
2728  
2729  	if (rxq->pfil != NULL && PFIL_HOOKED_IN(rxq->pfil) && pf_rv != NULL &&
2730  	    irf->irf_len != 0) {
2731  		payload  = *sd->ifsd_cl;
2732  		payload +=  ri->iri_pad;
2733  		len = ri->iri_len - ri->iri_pad;
2734  		*pf_rv = pfil_mem_in(rxq->pfil, payload, len, ri->iri_ifp, &m);
2735  		switch (*pf_rv) {
2736  		case PFIL_DROPPED:
2737  		case PFIL_CONSUMED:
2738  			/*
2739  			 * The filter ate it.  Everything is recycled.
2740  			 */
2741  			m = NULL;
2742  			unload = 0;
2743  			break;
2744  		case PFIL_REALLOCED:
2745  			/*
2746  			 * The filter copied it.  Everything is recycled.
2747  			 * 'm' points at new mbuf.
2748  			 */
2749  			unload = 0;
2750  			break;
2751  		case PFIL_PASS:
2752  			/*
2753  			 * Filter said it was OK, so receive like
2754  			 * normal
2755  			 */
2756  			m = fl->ifl_sds.ifsd_m[cidx];
2757  			fl->ifl_sds.ifsd_m[cidx] = NULL;
2758  			break;
2759  		default:
2760  			MPASS(0);
2761  		}
2762  	} else {
2763  		m = fl->ifl_sds.ifsd_m[cidx];
2764  		fl->ifl_sds.ifsd_m[cidx] = NULL;
2765  		if (pf_rv != NULL)
2766  			*pf_rv = PFIL_PASS;
2767  	}
2768  
2769  	if (unload && irf->irf_len != 0)
2770  		bus_dmamap_unload(fl->ifl_buf_tag, map);
2771  	fl->ifl_cidx = (fl->ifl_cidx + 1) & (fl->ifl_size - 1);
2772  	if (__predict_false(fl->ifl_cidx == 0))
2773  		fl->ifl_gen = 0;
2774  	bit_clear(fl->ifl_rx_bitmap, cidx);
2775  	return (m);
2776  }
2777  
2778  static struct mbuf *
assemble_segments(iflib_rxq_t rxq,if_rxd_info_t ri,if_rxsd_t sd,int * pf_rv)2779  assemble_segments(iflib_rxq_t rxq, if_rxd_info_t ri, if_rxsd_t sd, int *pf_rv)
2780  {
2781  	struct mbuf *m, *mh, *mt;
2782  	caddr_t cl;
2783  	int  *pf_rv_ptr, flags, i, padlen;
2784  	bool consumed;
2785  
2786  	i = 0;
2787  	mh = NULL;
2788  	consumed = false;
2789  	*pf_rv = PFIL_PASS;
2790  	pf_rv_ptr = pf_rv;
2791  	do {
2792  		m = rxd_frag_to_sd(rxq, &ri->iri_frags[i], !consumed, sd,
2793  		    pf_rv_ptr, ri);
2794  
2795  		MPASS(*sd->ifsd_cl != NULL);
2796  
2797  		/*
2798  		 * Exclude zero-length frags & frags from
2799  		 * packets the filter has consumed or dropped
2800  		 */
2801  		if (ri->iri_frags[i].irf_len == 0 || consumed ||
2802  		    *pf_rv == PFIL_CONSUMED || *pf_rv == PFIL_DROPPED) {
2803  			if (mh == NULL) {
2804  				/* everything saved here */
2805  				consumed = true;
2806  				pf_rv_ptr = NULL;
2807  				continue;
2808  			}
2809  			/* XXX we can save the cluster here, but not the mbuf */
2810  			m_init(m, M_NOWAIT, MT_DATA, 0);
2811  			m_free(m);
2812  			continue;
2813  		}
2814  		if (mh == NULL) {
2815  			flags = M_PKTHDR | M_EXT;
2816  			mh = mt = m;
2817  			padlen = ri->iri_pad;
2818  		} else {
2819  			flags = M_EXT;
2820  			mt->m_next = m;
2821  			mt = m;
2822  			/* assuming padding is only on the first fragment */
2823  			padlen = 0;
2824  		}
2825  		cl = *sd->ifsd_cl;
2826  		*sd->ifsd_cl = NULL;
2827  
2828  		/* Can these two be made one ? */
2829  		m_init(m, M_NOWAIT, MT_DATA, flags);
2830  		m_cljset(m, cl, sd->ifsd_fl->ifl_cltype);
2831  		/*
2832  		 * These must follow m_init and m_cljset
2833  		 */
2834  		m->m_data += padlen;
2835  		ri->iri_len -= padlen;
2836  		m->m_len = ri->iri_frags[i].irf_len;
2837  	} while (++i < ri->iri_nfrags);
2838  
2839  	return (mh);
2840  }
2841  
2842  /*
2843   * Process one software descriptor
2844   */
2845  static struct mbuf *
iflib_rxd_pkt_get(iflib_rxq_t rxq,if_rxd_info_t ri)2846  iflib_rxd_pkt_get(iflib_rxq_t rxq, if_rxd_info_t ri)
2847  {
2848  	struct if_rxsd sd;
2849  	struct mbuf *m;
2850  	int pf_rv;
2851  
2852  	/* should I merge this back in now that the two paths are basically duplicated? */
2853  	if (ri->iri_nfrags == 1 &&
2854  	    ri->iri_frags[0].irf_len != 0 &&
2855  	    ri->iri_frags[0].irf_len <= MIN(IFLIB_RX_COPY_THRESH, MHLEN)) {
2856  		m = rxd_frag_to_sd(rxq, &ri->iri_frags[0], false, &sd,
2857  		    &pf_rv, ri);
2858  		if (pf_rv != PFIL_PASS && pf_rv != PFIL_REALLOCED)
2859  			return (m);
2860  		if (pf_rv == PFIL_PASS) {
2861  			m_init(m, M_NOWAIT, MT_DATA, M_PKTHDR);
2862  #ifndef __NO_STRICT_ALIGNMENT
2863  			if (!IP_ALIGNED(m) && ri->iri_pad == 0)
2864  				m->m_data += 2;
2865  #endif
2866  			memcpy(m->m_data, *sd.ifsd_cl, ri->iri_len);
2867  			m->m_len = ri->iri_frags[0].irf_len;
2868  			m->m_data += ri->iri_pad;
2869  			ri->iri_len -= ri->iri_pad;
2870  		}
2871  	} else {
2872  		m = assemble_segments(rxq, ri, &sd, &pf_rv);
2873  		if (m == NULL)
2874  			return (NULL);
2875  		if (pf_rv != PFIL_PASS && pf_rv != PFIL_REALLOCED)
2876  			return (m);
2877  	}
2878  	m->m_pkthdr.len = ri->iri_len;
2879  	m->m_pkthdr.rcvif = ri->iri_ifp;
2880  	m->m_flags |= ri->iri_flags;
2881  	m->m_pkthdr.ether_vtag = ri->iri_vtag;
2882  	m->m_pkthdr.flowid = ri->iri_flowid;
2883  #ifdef NUMA
2884  	m->m_pkthdr.numa_domain = if_getnumadomain(ri->iri_ifp);
2885  #endif
2886  	M_HASHTYPE_SET(m, ri->iri_rsstype);
2887  	m->m_pkthdr.csum_flags = ri->iri_csum_flags;
2888  	m->m_pkthdr.csum_data = ri->iri_csum_data;
2889  	return (m);
2890  }
2891  
2892  #if defined(INET6) || defined(INET)
2893  static void
iflib_get_ip_forwarding(struct lro_ctrl * lc,bool * v4,bool * v6)2894  iflib_get_ip_forwarding(struct lro_ctrl *lc, bool *v4, bool *v6)
2895  {
2896  	CURVNET_SET(if_getvnet(lc->ifp));
2897  #if defined(INET6)
2898  	*v6 = V_ip6_forwarding;
2899  #endif
2900  #if defined(INET)
2901  	*v4 = V_ipforwarding;
2902  #endif
2903  	CURVNET_RESTORE();
2904  }
2905  
2906  /*
2907   * Returns true if it's possible this packet could be LROed.
2908   * if it returns false, it is guaranteed that tcp_lro_rx()
2909   * would not return zero.
2910   */
2911  static bool
iflib_check_lro_possible(struct mbuf * m,bool v4_forwarding,bool v6_forwarding)2912  iflib_check_lro_possible(struct mbuf *m, bool v4_forwarding, bool v6_forwarding)
2913  {
2914  	struct ether_header *eh;
2915  
2916  	eh = mtod(m, struct ether_header *);
2917  	switch (eh->ether_type) {
2918  #if defined(INET6)
2919  	case htons(ETHERTYPE_IPV6):
2920  		return (!v6_forwarding);
2921  #endif
2922  #if defined(INET)
2923  	case htons(ETHERTYPE_IP):
2924  		return (!v4_forwarding);
2925  #endif
2926  	}
2927  
2928  	return (false);
2929  }
2930  #else
2931  static void
iflib_get_ip_forwarding(struct lro_ctrl * lc __unused,bool * v4 __unused,bool * v6 __unused)2932  iflib_get_ip_forwarding(struct lro_ctrl *lc __unused, bool *v4 __unused, bool *v6 __unused)
2933  {
2934  }
2935  #endif
2936  
2937  static void
_task_fn_rx_watchdog(void * context)2938  _task_fn_rx_watchdog(void *context)
2939  {
2940  	iflib_rxq_t rxq = context;
2941  
2942  	GROUPTASK_ENQUEUE(&rxq->ifr_task);
2943  }
2944  
2945  static uint8_t
iflib_rxeof(iflib_rxq_t rxq,qidx_t budget)2946  iflib_rxeof(iflib_rxq_t rxq, qidx_t budget)
2947  {
2948  	if_t ifp;
2949  	if_ctx_t ctx = rxq->ifr_ctx;
2950  	if_shared_ctx_t sctx = ctx->ifc_sctx;
2951  	if_softc_ctx_t scctx = &ctx->ifc_softc_ctx;
2952  	int avail, i;
2953  	qidx_t *cidxp;
2954  	struct if_rxd_info ri;
2955  	int err, budget_left, rx_bytes, rx_pkts;
2956  	iflib_fl_t fl;
2957  	int lro_enabled;
2958  	bool v4_forwarding, v6_forwarding, lro_possible;
2959  	uint8_t retval = 0;
2960  
2961  	/*
2962  	 * XXX early demux data packets so that if_input processing only handles
2963  	 * acks in interrupt context
2964  	 */
2965  	struct mbuf *m, *mh, *mt, *mf;
2966  
2967  	NET_EPOCH_ASSERT();
2968  
2969  	lro_possible = v4_forwarding = v6_forwarding = false;
2970  	ifp = ctx->ifc_ifp;
2971  	mh = mt = NULL;
2972  	MPASS(budget > 0);
2973  	rx_pkts	= rx_bytes = 0;
2974  	if (sctx->isc_flags & IFLIB_HAS_RXCQ)
2975  		cidxp = &rxq->ifr_cq_cidx;
2976  	else
2977  		cidxp = &rxq->ifr_fl[0].ifl_cidx;
2978  	if ((avail = iflib_rxd_avail(ctx, rxq, *cidxp, budget)) == 0) {
2979  		for (i = 0, fl = &rxq->ifr_fl[0]; i < sctx->isc_nfl; i++, fl++)
2980  			retval |= iflib_fl_refill_all(ctx, fl);
2981  		DBG_COUNTER_INC(rx_unavail);
2982  		return (retval);
2983  	}
2984  
2985  	/* pfil needs the vnet to be set */
2986  	CURVNET_SET_QUIET(if_getvnet(ifp));
2987  	for (budget_left = budget; budget_left > 0 && avail > 0;) {
2988  		if (__predict_false(!CTX_ACTIVE(ctx))) {
2989  			DBG_COUNTER_INC(rx_ctx_inactive);
2990  			break;
2991  		}
2992  		/*
2993  		 * Reset client set fields to their default values
2994  		 */
2995  		rxd_info_zero(&ri);
2996  		ri.iri_qsidx = rxq->ifr_id;
2997  		ri.iri_cidx = *cidxp;
2998  		ri.iri_ifp = ifp;
2999  		ri.iri_frags = rxq->ifr_frags;
3000  		err = ctx->isc_rxd_pkt_get(ctx->ifc_softc, &ri);
3001  
3002  		if (err)
3003  			goto err;
3004  		rx_pkts += 1;
3005  		rx_bytes += ri.iri_len;
3006  		if (sctx->isc_flags & IFLIB_HAS_RXCQ) {
3007  			*cidxp = ri.iri_cidx;
3008  			/* Update our consumer index */
3009  			/* XXX NB: shurd - check if this is still safe */
3010  			while (rxq->ifr_cq_cidx >= scctx->isc_nrxd[0])
3011  				rxq->ifr_cq_cidx -= scctx->isc_nrxd[0];
3012  			/* was this only a completion queue message? */
3013  			if (__predict_false(ri.iri_nfrags == 0))
3014  				continue;
3015  		}
3016  		MPASS(ri.iri_nfrags != 0);
3017  		MPASS(ri.iri_len != 0);
3018  
3019  		/* will advance the cidx on the corresponding free lists */
3020  		m = iflib_rxd_pkt_get(rxq, &ri);
3021  		avail--;
3022  		budget_left--;
3023  		if (avail == 0 && budget_left)
3024  			avail = iflib_rxd_avail(ctx, rxq, *cidxp, budget_left);
3025  
3026  		if (__predict_false(m == NULL))
3027  			continue;
3028  
3029  		/* imm_pkt: -- cxgb */
3030  		if (mh == NULL)
3031  			mh = mt = m;
3032  		else {
3033  			mt->m_nextpkt = m;
3034  			mt = m;
3035  		}
3036  	}
3037  	CURVNET_RESTORE();
3038  	/* make sure that we can refill faster than drain */
3039  	for (i = 0, fl = &rxq->ifr_fl[0]; i < sctx->isc_nfl; i++, fl++)
3040  		retval |= iflib_fl_refill_all(ctx, fl);
3041  
3042  	lro_enabled = (if_getcapenable(ifp) & IFCAP_LRO);
3043  	if (lro_enabled)
3044  		iflib_get_ip_forwarding(&rxq->ifr_lc, &v4_forwarding, &v6_forwarding);
3045  	mt = mf = NULL;
3046  	while (mh != NULL) {
3047  		m = mh;
3048  		mh = mh->m_nextpkt;
3049  		m->m_nextpkt = NULL;
3050  #ifndef __NO_STRICT_ALIGNMENT
3051  		if (!IP_ALIGNED(m) && (m = iflib_fixup_rx(m)) == NULL)
3052  			continue;
3053  #endif
3054  #if defined(INET6) || defined(INET)
3055  		if (lro_enabled) {
3056  			if (!lro_possible) {
3057  				lro_possible = iflib_check_lro_possible(m, v4_forwarding, v6_forwarding);
3058  				if (lro_possible && mf != NULL) {
3059  					if_input(ifp, mf);
3060  					DBG_COUNTER_INC(rx_if_input);
3061  					mt = mf = NULL;
3062  				}
3063  			}
3064  			if ((m->m_pkthdr.csum_flags & (CSUM_L4_CALC | CSUM_L4_VALID)) ==
3065  			    (CSUM_L4_CALC | CSUM_L4_VALID)) {
3066  				if (lro_possible && tcp_lro_rx(&rxq->ifr_lc, m, 0) == 0)
3067  					continue;
3068  			}
3069  		}
3070  #endif
3071  		if (lro_possible) {
3072  			if_input(ifp, m);
3073  			DBG_COUNTER_INC(rx_if_input);
3074  			continue;
3075  		}
3076  
3077  		if (mf == NULL)
3078  			mf = m;
3079  		if (mt != NULL)
3080  			mt->m_nextpkt = m;
3081  		mt = m;
3082  	}
3083  	if (mf != NULL) {
3084  		if_input(ifp, mf);
3085  		DBG_COUNTER_INC(rx_if_input);
3086  	}
3087  
3088  	if_inc_counter(ifp, IFCOUNTER_IBYTES, rx_bytes);
3089  	if_inc_counter(ifp, IFCOUNTER_IPACKETS, rx_pkts);
3090  
3091  	/*
3092  	 * Flush any outstanding LRO work
3093  	 */
3094  #if defined(INET6) || defined(INET)
3095  	tcp_lro_flush_all(&rxq->ifr_lc);
3096  #endif
3097  	if (avail != 0 || iflib_rxd_avail(ctx, rxq, *cidxp, 1) != 0)
3098  		retval |= IFLIB_RXEOF_MORE;
3099  	return (retval);
3100  err:
3101  	STATE_LOCK(ctx);
3102  	ctx->ifc_flags |= IFC_DO_RESET;
3103  	iflib_admin_intr_deferred(ctx);
3104  	STATE_UNLOCK(ctx);
3105  	return (0);
3106  }
3107  
3108  #define TXD_NOTIFY_COUNT(txq) (((txq)->ift_size / (txq)->ift_update_freq) - 1)
3109  static inline qidx_t
txq_max_db_deferred(iflib_txq_t txq,qidx_t in_use)3110  txq_max_db_deferred(iflib_txq_t txq, qidx_t in_use)
3111  {
3112  	qidx_t notify_count = TXD_NOTIFY_COUNT(txq);
3113  	qidx_t minthresh = txq->ift_size / 8;
3114  	if (in_use > 4 * minthresh)
3115  		return (notify_count);
3116  	if (in_use > 2 * minthresh)
3117  		return (notify_count >> 1);
3118  	if (in_use > minthresh)
3119  		return (notify_count >> 3);
3120  	return (0);
3121  }
3122  
3123  static inline qidx_t
txq_max_rs_deferred(iflib_txq_t txq)3124  txq_max_rs_deferred(iflib_txq_t txq)
3125  {
3126  	qidx_t notify_count = TXD_NOTIFY_COUNT(txq);
3127  	qidx_t minthresh = txq->ift_size / 8;
3128  	if (txq->ift_in_use > 4 * minthresh)
3129  		return (notify_count);
3130  	if (txq->ift_in_use > 2 * minthresh)
3131  		return (notify_count >> 1);
3132  	if (txq->ift_in_use > minthresh)
3133  		return (notify_count >> 2);
3134  	return (2);
3135  }
3136  
3137  #define M_CSUM_FLAGS(m)		((m)->m_pkthdr.csum_flags)
3138  #define M_HAS_VLANTAG(m)	(m->m_flags & M_VLANTAG)
3139  
3140  #define TXQ_MAX_DB_DEFERRED(txq, in_use)	txq_max_db_deferred((txq), (in_use))
3141  #define TXQ_MAX_RS_DEFERRED(txq)	txq_max_rs_deferred(txq)
3142  #define TXQ_MAX_DB_CONSUMED(size)	(size >> 4)
3143  
3144  /* forward compatibility for cxgb */
3145  #define FIRST_QSET(ctx) 0
3146  #define NTXQSETS(ctx) ((ctx)->ifc_softc_ctx.isc_ntxqsets)
3147  #define NRXQSETS(ctx) ((ctx)->ifc_softc_ctx.isc_nrxqsets)
3148  #define QIDX(ctx, m) ((((m)->m_pkthdr.flowid & ctx->ifc_softc_ctx.isc_rss_table_mask) % NTXQSETS(ctx)) + FIRST_QSET(ctx))
3149  #define DESC_RECLAIMABLE(q) ((int)((q)->ift_processed - (q)->ift_cleaned - (q)->ift_ctx->ifc_softc_ctx.isc_tx_nsegments))
3150  
3151  /* XXX we should be setting this to something other than zero */
3152  #define RECLAIM_THRESH(ctx) ((ctx)->ifc_sctx->isc_tx_reclaim_thresh)
3153  #define	MAX_TX_DESC(ctx) MAX((ctx)->ifc_softc_ctx.isc_tx_tso_segments_max, \
3154      (ctx)->ifc_softc_ctx.isc_tx_nsegments)
3155  
3156  static inline bool
iflib_txd_db_check(iflib_txq_t txq,int ring)3157  iflib_txd_db_check(iflib_txq_t txq, int ring)
3158  {
3159  	if_ctx_t ctx = txq->ift_ctx;
3160  	qidx_t dbval, max;
3161  
3162  	max = TXQ_MAX_DB_DEFERRED(txq, txq->ift_in_use);
3163  
3164  	/* force || threshold exceeded || at the edge of the ring */
3165  	if (ring || (txq->ift_db_pending >= max) || (TXQ_AVAIL(txq) <= MAX_TX_DESC(ctx) + 2)) {
3166  
3167  		/*
3168  		 * 'npending' is used if the card's doorbell is in terms of the number of descriptors
3169  		 * pending flush (BRCM). 'pidx' is used in cases where the card's doorbeel uses the
3170  		 * producer index explicitly (INTC).
3171  		 */
3172  		dbval = txq->ift_npending ? txq->ift_npending : txq->ift_pidx;
3173  		bus_dmamap_sync(txq->ift_ifdi->idi_tag, txq->ift_ifdi->idi_map,
3174  		    BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
3175  		ctx->isc_txd_flush(ctx->ifc_softc, txq->ift_id, dbval);
3176  
3177  		/*
3178  		 * Absent bugs there are zero packets pending so reset pending counts to zero.
3179  		 */
3180  		txq->ift_db_pending = txq->ift_npending = 0;
3181  		return (true);
3182  	}
3183  	return (false);
3184  }
3185  
3186  #ifdef PKT_DEBUG
3187  static void
print_pkt(if_pkt_info_t pi)3188  print_pkt(if_pkt_info_t pi)
3189  {
3190  	printf("pi len:  %d qsidx: %d nsegs: %d ndescs: %d flags: %x pidx: %d\n",
3191  	       pi->ipi_len, pi->ipi_qsidx, pi->ipi_nsegs, pi->ipi_ndescs, pi->ipi_flags, pi->ipi_pidx);
3192  	printf("pi new_pidx: %d csum_flags: %lx tso_segsz: %d mflags: %x vtag: %d\n",
3193  	       pi->ipi_new_pidx, pi->ipi_csum_flags, pi->ipi_tso_segsz, pi->ipi_mflags, pi->ipi_vtag);
3194  	printf("pi etype: %d ehdrlen: %d ip_hlen: %d ipproto: %d\n",
3195  	       pi->ipi_etype, pi->ipi_ehdrlen, pi->ipi_ip_hlen, pi->ipi_ipproto);
3196  }
3197  #endif
3198  
3199  #define IS_TSO4(pi) ((pi)->ipi_csum_flags & CSUM_IP_TSO)
3200  #define IS_TX_OFFLOAD4(pi) ((pi)->ipi_csum_flags & (CSUM_IP_TCP | CSUM_IP_TSO))
3201  #define IS_TSO6(pi) ((pi)->ipi_csum_flags & CSUM_IP6_TSO)
3202  #define IS_TX_OFFLOAD6(pi) ((pi)->ipi_csum_flags & (CSUM_IP6_TCP | CSUM_IP6_TSO))
3203  
3204  /**
3205   * Parses out ethernet header information in the given mbuf.
3206   * Returns in pi: ipi_etype (EtherType) and ipi_ehdrlen (Ethernet header length)
3207   *
3208   * This will account for the VLAN header if present.
3209   *
3210   * XXX: This doesn't handle QinQ, which could prevent TX offloads for those
3211   * types of packets.
3212   */
3213  static int
iflib_parse_ether_header(if_pkt_info_t pi,struct mbuf ** mp,uint64_t * pullups)3214  iflib_parse_ether_header(if_pkt_info_t pi, struct mbuf **mp, uint64_t *pullups)
3215  {
3216  	struct ether_vlan_header *eh;
3217  	struct mbuf *m;
3218  
3219  	m = *mp;
3220  	if (__predict_false(m->m_len < sizeof(*eh))) {
3221  		(*pullups)++;
3222  		if (__predict_false((m = m_pullup(m, sizeof(*eh))) == NULL))
3223  			return (ENOMEM);
3224  	}
3225  	eh = mtod(m, struct ether_vlan_header *);
3226  	if (eh->evl_encap_proto == htons(ETHERTYPE_VLAN)) {
3227  		pi->ipi_etype = ntohs(eh->evl_proto);
3228  		pi->ipi_ehdrlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
3229  	} else {
3230  		pi->ipi_etype = ntohs(eh->evl_encap_proto);
3231  		pi->ipi_ehdrlen = ETHER_HDR_LEN;
3232  	}
3233  	*mp = m;
3234  
3235  	return (0);
3236  }
3237  
3238  /**
3239   * Parse up to the L3 header and extract IPv4/IPv6 header information into pi.
3240   * Currently this information includes: IP ToS value, IP header version/presence
3241   *
3242   * This is missing some checks and doesn't edit the packet content as it goes,
3243   * unlike iflib_parse_header(), in order to keep the amount of code here minimal.
3244   */
3245  static int
iflib_parse_header_partial(if_pkt_info_t pi,struct mbuf ** mp,uint64_t * pullups)3246  iflib_parse_header_partial(if_pkt_info_t pi, struct mbuf **mp, uint64_t *pullups)
3247  {
3248  	struct mbuf *m;
3249  	int err;
3250  
3251  	*pullups = 0;
3252  	m = *mp;
3253  	if (!M_WRITABLE(m)) {
3254  		if ((m = m_dup(m, M_NOWAIT)) == NULL) {
3255  			return (ENOMEM);
3256  		} else {
3257  			m_freem(*mp);
3258  			DBG_COUNTER_INC(tx_frees);
3259  			*mp = m;
3260  		}
3261  	}
3262  
3263  	/* Fills out pi->ipi_etype */
3264  	err = iflib_parse_ether_header(pi, mp, pullups);
3265  	if (err)
3266  		return (err);
3267  	m = *mp;
3268  
3269  	switch (pi->ipi_etype) {
3270  #ifdef INET
3271  	case ETHERTYPE_IP:
3272  	{
3273  		struct mbuf *n;
3274  		struct ip *ip = NULL;
3275  		int miniplen;
3276  
3277  		miniplen = min(m->m_pkthdr.len, pi->ipi_ehdrlen + sizeof(*ip));
3278  		if (__predict_false(m->m_len < miniplen)) {
3279  			/*
3280  			 * Check for common case where the first mbuf only contains
3281  			 * the Ethernet header
3282  			 */
3283  			if (m->m_len == pi->ipi_ehdrlen) {
3284  				n = m->m_next;
3285  				MPASS(n);
3286  				/* If next mbuf contains at least the minimal IP header, then stop */
3287  				if (n->m_len >= sizeof(*ip)) {
3288  					ip = (struct ip *)n->m_data;
3289  				} else {
3290  					(*pullups)++;
3291  					if (__predict_false((m = m_pullup(m, miniplen)) == NULL))
3292  						return (ENOMEM);
3293  					ip = (struct ip *)(m->m_data + pi->ipi_ehdrlen);
3294  				}
3295  			} else {
3296  				(*pullups)++;
3297  				if (__predict_false((m = m_pullup(m, miniplen)) == NULL))
3298  					return (ENOMEM);
3299  				ip = (struct ip *)(m->m_data + pi->ipi_ehdrlen);
3300  			}
3301  		} else {
3302  			ip = (struct ip *)(m->m_data + pi->ipi_ehdrlen);
3303  		}
3304  
3305  		/* Have the IPv4 header w/ no options here */
3306  		pi->ipi_ip_hlen = ip->ip_hl << 2;
3307  		pi->ipi_ipproto = ip->ip_p;
3308  		pi->ipi_ip_tos = ip->ip_tos;
3309  		pi->ipi_flags |= IPI_TX_IPV4;
3310  
3311  		break;
3312  	}
3313  #endif
3314  #ifdef INET6
3315  	case ETHERTYPE_IPV6:
3316  	{
3317  		struct ip6_hdr *ip6;
3318  
3319  		if (__predict_false(m->m_len < pi->ipi_ehdrlen + sizeof(struct ip6_hdr))) {
3320  			(*pullups)++;
3321  			if (__predict_false((m = m_pullup(m, pi->ipi_ehdrlen + sizeof(struct ip6_hdr))) == NULL))
3322  				return (ENOMEM);
3323  		}
3324  		ip6 = (struct ip6_hdr *)(m->m_data + pi->ipi_ehdrlen);
3325  
3326  		/* Have the IPv6 fixed header here */
3327  		pi->ipi_ip_hlen = sizeof(struct ip6_hdr);
3328  		pi->ipi_ipproto = ip6->ip6_nxt;
3329  		pi->ipi_ip_tos = IPV6_TRAFFIC_CLASS(ip6);
3330  		pi->ipi_flags |= IPI_TX_IPV6;
3331  
3332  		break;
3333  	}
3334  #endif
3335  	default:
3336  		pi->ipi_csum_flags &= ~CSUM_OFFLOAD;
3337  		pi->ipi_ip_hlen = 0;
3338  		break;
3339  	}
3340  	*mp = m;
3341  
3342  	return (0);
3343  
3344  }
3345  
3346  static int
iflib_parse_header(iflib_txq_t txq,if_pkt_info_t pi,struct mbuf ** mp)3347  iflib_parse_header(iflib_txq_t txq, if_pkt_info_t pi, struct mbuf **mp)
3348  {
3349  	if_shared_ctx_t sctx = txq->ift_ctx->ifc_sctx;
3350  	struct mbuf *m;
3351  	int err;
3352  
3353  	m = *mp;
3354  	if ((sctx->isc_flags & IFLIB_NEED_SCRATCH) &&
3355  	    M_WRITABLE(m) == 0) {
3356  		if ((m = m_dup(m, M_NOWAIT)) == NULL) {
3357  			return (ENOMEM);
3358  		} else {
3359  			m_freem(*mp);
3360  			DBG_COUNTER_INC(tx_frees);
3361  			*mp = m;
3362  		}
3363  	}
3364  
3365  	/* Fills out pi->ipi_etype */
3366  	err = iflib_parse_ether_header(pi, mp, &txq->ift_pullups);
3367  	if (__predict_false(err))
3368  		return (err);
3369  	m = *mp;
3370  
3371  	switch (pi->ipi_etype) {
3372  #ifdef INET
3373  	case ETHERTYPE_IP:
3374  	{
3375  		struct mbuf *n;
3376  		struct ip *ip = NULL;
3377  		struct tcphdr *th = NULL;
3378  		int minthlen;
3379  
3380  		minthlen = min(m->m_pkthdr.len, pi->ipi_ehdrlen + sizeof(*ip) + sizeof(*th));
3381  		if (__predict_false(m->m_len < minthlen)) {
3382  			/*
3383  			 * if this code bloat is causing too much of a hit
3384  			 * move it to a separate function and mark it noinline
3385  			 */
3386  			if (m->m_len == pi->ipi_ehdrlen) {
3387  				n = m->m_next;
3388  				MPASS(n);
3389  				if (n->m_len >= sizeof(*ip))  {
3390  					ip = (struct ip *)n->m_data;
3391  					if (n->m_len >= (ip->ip_hl << 2) + sizeof(*th))
3392  						th = (struct tcphdr *)((caddr_t)ip + (ip->ip_hl << 2));
3393  				} else {
3394  					txq->ift_pullups++;
3395  					if (__predict_false((m = m_pullup(m, minthlen)) == NULL))
3396  						return (ENOMEM);
3397  					ip = (struct ip *)(m->m_data + pi->ipi_ehdrlen);
3398  				}
3399  			} else {
3400  				txq->ift_pullups++;
3401  				if (__predict_false((m = m_pullup(m, minthlen)) == NULL))
3402  					return (ENOMEM);
3403  				ip = (struct ip *)(m->m_data + pi->ipi_ehdrlen);
3404  				if (m->m_len >= (ip->ip_hl << 2) + sizeof(*th))
3405  					th = (struct tcphdr *)((caddr_t)ip + (ip->ip_hl << 2));
3406  			}
3407  		} else {
3408  			ip = (struct ip *)(m->m_data + pi->ipi_ehdrlen);
3409  			if (m->m_len >= (ip->ip_hl << 2) + sizeof(*th))
3410  				th = (struct tcphdr *)((caddr_t)ip + (ip->ip_hl << 2));
3411  		}
3412  		pi->ipi_ip_hlen = ip->ip_hl << 2;
3413  		pi->ipi_ipproto = ip->ip_p;
3414  		pi->ipi_ip_tos = ip->ip_tos;
3415  		pi->ipi_flags |= IPI_TX_IPV4;
3416  
3417  		/* TCP checksum offload may require TCP header length */
3418  		if (IS_TX_OFFLOAD4(pi)) {
3419  			if (__predict_true(pi->ipi_ipproto == IPPROTO_TCP)) {
3420  				if (__predict_false(th == NULL)) {
3421  					txq->ift_pullups++;
3422  					if (__predict_false((m = m_pullup(m, (ip->ip_hl << 2) + sizeof(*th))) == NULL))
3423  						return (ENOMEM);
3424  					th = (struct tcphdr *)((caddr_t)ip + pi->ipi_ip_hlen);
3425  				}
3426  				pi->ipi_tcp_hflags = tcp_get_flags(th);
3427  				pi->ipi_tcp_hlen = th->th_off << 2;
3428  				pi->ipi_tcp_seq = th->th_seq;
3429  			}
3430  			if (IS_TSO4(pi)) {
3431  				if (__predict_false(ip->ip_p != IPPROTO_TCP))
3432  					return (ENXIO);
3433  				/*
3434  				 * TSO always requires hardware checksum offload.
3435  				 */
3436  				pi->ipi_csum_flags |= (CSUM_IP_TCP | CSUM_IP);
3437  				th->th_sum = in_pseudo(ip->ip_src.s_addr,
3438  						       ip->ip_dst.s_addr, htons(IPPROTO_TCP));
3439  				pi->ipi_tso_segsz = m->m_pkthdr.tso_segsz;
3440  				if (sctx->isc_flags & IFLIB_TSO_INIT_IP) {
3441  					ip->ip_sum = 0;
3442  					ip->ip_len = htons(pi->ipi_ip_hlen + pi->ipi_tcp_hlen + pi->ipi_tso_segsz);
3443  				}
3444  			}
3445  		}
3446  		if ((sctx->isc_flags & IFLIB_NEED_ZERO_CSUM) && (pi->ipi_csum_flags & CSUM_IP))
3447  			ip->ip_sum = 0;
3448  
3449  		break;
3450  	}
3451  #endif
3452  #ifdef INET6
3453  	case ETHERTYPE_IPV6:
3454  	{
3455  		struct ip6_hdr *ip6 = (struct ip6_hdr *)(m->m_data + pi->ipi_ehdrlen);
3456  		struct tcphdr *th;
3457  		pi->ipi_ip_hlen = sizeof(struct ip6_hdr);
3458  
3459  		if (__predict_false(m->m_len < pi->ipi_ehdrlen + sizeof(struct ip6_hdr))) {
3460  			txq->ift_pullups++;
3461  			if (__predict_false((m = m_pullup(m, pi->ipi_ehdrlen + sizeof(struct ip6_hdr))) == NULL))
3462  				return (ENOMEM);
3463  		}
3464  		th = (struct tcphdr *)((caddr_t)ip6 + pi->ipi_ip_hlen);
3465  
3466  		/* XXX-BZ this will go badly in case of ext hdrs. */
3467  		pi->ipi_ipproto = ip6->ip6_nxt;
3468  		pi->ipi_ip_tos = IPV6_TRAFFIC_CLASS(ip6);
3469  		pi->ipi_flags |= IPI_TX_IPV6;
3470  
3471  		/* TCP checksum offload may require TCP header length */
3472  		if (IS_TX_OFFLOAD6(pi)) {
3473  			if (pi->ipi_ipproto == IPPROTO_TCP) {
3474  				if (__predict_false(m->m_len < pi->ipi_ehdrlen + sizeof(struct ip6_hdr) + sizeof(struct tcphdr))) {
3475  					txq->ift_pullups++;
3476  					if (__predict_false((m = m_pullup(m, pi->ipi_ehdrlen + sizeof(struct ip6_hdr) + sizeof(struct tcphdr))) == NULL))
3477  						return (ENOMEM);
3478  				}
3479  				pi->ipi_tcp_hflags = tcp_get_flags(th);
3480  				pi->ipi_tcp_hlen = th->th_off << 2;
3481  				pi->ipi_tcp_seq = th->th_seq;
3482  			}
3483  			if (IS_TSO6(pi)) {
3484  				if (__predict_false(ip6->ip6_nxt != IPPROTO_TCP))
3485  					return (ENXIO);
3486  				/*
3487  				 * TSO always requires hardware checksum offload.
3488  				 */
3489  				pi->ipi_csum_flags |= CSUM_IP6_TCP;
3490  				th->th_sum = in6_cksum_pseudo(ip6, 0, IPPROTO_TCP, 0);
3491  				pi->ipi_tso_segsz = m->m_pkthdr.tso_segsz;
3492  			}
3493  		}
3494  		break;
3495  	}
3496  #endif
3497  	default:
3498  		pi->ipi_csum_flags &= ~CSUM_OFFLOAD;
3499  		pi->ipi_ip_hlen = 0;
3500  		break;
3501  	}
3502  	*mp = m;
3503  
3504  	return (0);
3505  }
3506  
3507  /*
3508   * If dodgy hardware rejects the scatter gather chain we've handed it
3509   * we'll need to remove the mbuf chain from ifsg_m[] before we can add the
3510   * m_defrag'd mbufs
3511   */
3512  static __noinline struct mbuf *
iflib_remove_mbuf(iflib_txq_t txq)3513  iflib_remove_mbuf(iflib_txq_t txq)
3514  {
3515  	int ntxd, pidx;
3516  	struct mbuf *m, **ifsd_m;
3517  
3518  	ifsd_m = txq->ift_sds.ifsd_m;
3519  	ntxd = txq->ift_size;
3520  	pidx = txq->ift_pidx & (ntxd - 1);
3521  	ifsd_m = txq->ift_sds.ifsd_m;
3522  	m = ifsd_m[pidx];
3523  	ifsd_m[pidx] = NULL;
3524  	bus_dmamap_unload(txq->ift_buf_tag, txq->ift_sds.ifsd_map[pidx]);
3525  	if (txq->ift_sds.ifsd_tso_map != NULL)
3526  		bus_dmamap_unload(txq->ift_tso_buf_tag,
3527  		    txq->ift_sds.ifsd_tso_map[pidx]);
3528  #if MEMORY_LOGGING
3529  	txq->ift_dequeued++;
3530  #endif
3531  	return (m);
3532  }
3533  
3534  static inline caddr_t
calc_next_txd(iflib_txq_t txq,int cidx,uint8_t qid)3535  calc_next_txd(iflib_txq_t txq, int cidx, uint8_t qid)
3536  {
3537  	qidx_t size;
3538  	int ntxd;
3539  	caddr_t start, end, cur, next;
3540  
3541  	ntxd = txq->ift_size;
3542  	size = txq->ift_txd_size[qid];
3543  	start = txq->ift_ifdi[qid].idi_vaddr;
3544  
3545  	if (__predict_false(size == 0))
3546  		return (start);
3547  	cur = start + size * cidx;
3548  	end = start + size * ntxd;
3549  	next = CACHE_PTR_NEXT(cur);
3550  	return (next < end ? next : start);
3551  }
3552  
3553  /*
3554   * Pad an mbuf to ensure a minimum ethernet frame size.
3555   * min_frame_size is the frame size (less CRC) to pad the mbuf to
3556   */
3557  static __noinline int
iflib_ether_pad(device_t dev,struct mbuf ** m_head,uint16_t min_frame_size)3558  iflib_ether_pad(device_t dev, struct mbuf **m_head, uint16_t min_frame_size)
3559  {
3560  	/*
3561  	 * 18 is enough bytes to pad an ARP packet to 46 bytes, and
3562  	 * and ARP message is the smallest common payload I can think of
3563  	 */
3564  	static char pad[18];	/* just zeros */
3565  	int n;
3566  	struct mbuf *new_head;
3567  
3568  	if (!M_WRITABLE(*m_head)) {
3569  		new_head = m_dup(*m_head, M_NOWAIT);
3570  		if (new_head == NULL) {
3571  			m_freem(*m_head);
3572  			device_printf(dev, "cannot pad short frame, m_dup() failed");
3573  			DBG_COUNTER_INC(encap_pad_mbuf_fail);
3574  			DBG_COUNTER_INC(tx_frees);
3575  			return (ENOMEM);
3576  		}
3577  		m_freem(*m_head);
3578  		*m_head = new_head;
3579  	}
3580  
3581  	for (n = min_frame_size - (*m_head)->m_pkthdr.len;
3582  	     n > 0; n -= sizeof(pad))
3583  		if (!m_append(*m_head, min(n, sizeof(pad)), pad))
3584  			break;
3585  
3586  	if (n > 0) {
3587  		m_freem(*m_head);
3588  		device_printf(dev, "cannot pad short frame\n");
3589  		DBG_COUNTER_INC(encap_pad_mbuf_fail);
3590  		DBG_COUNTER_INC(tx_frees);
3591  		return (ENOBUFS);
3592  	}
3593  
3594  	return (0);
3595  }
3596  
3597  static int
iflib_encap(iflib_txq_t txq,struct mbuf ** m_headp)3598  iflib_encap(iflib_txq_t txq, struct mbuf **m_headp)
3599  {
3600  	if_ctx_t		ctx;
3601  	if_shared_ctx_t		sctx;
3602  	if_softc_ctx_t		scctx;
3603  	bus_dma_tag_t		buf_tag;
3604  	bus_dma_segment_t	*segs;
3605  	struct mbuf		*m_head, **ifsd_m;
3606  	void			*next_txd;
3607  	bus_dmamap_t		map;
3608  	struct if_pkt_info	pi;
3609  	int remap = 0;
3610  	int err, nsegs, ndesc, max_segs, pidx, cidx, next, ntxd;
3611  
3612  	ctx = txq->ift_ctx;
3613  	sctx = ctx->ifc_sctx;
3614  	scctx = &ctx->ifc_softc_ctx;
3615  	segs = txq->ift_segs;
3616  	ntxd = txq->ift_size;
3617  	m_head = *m_headp;
3618  	map = NULL;
3619  
3620  	/*
3621  	 * If we're doing TSO the next descriptor to clean may be quite far ahead
3622  	 */
3623  	cidx = txq->ift_cidx;
3624  	pidx = txq->ift_pidx;
3625  	if (ctx->ifc_flags & IFC_PREFETCH) {
3626  		next = (cidx + CACHE_PTR_INCREMENT) & (ntxd - 1);
3627  		if (!(ctx->ifc_flags & IFLIB_HAS_TXCQ)) {
3628  			next_txd = calc_next_txd(txq, cidx, 0);
3629  			prefetch(next_txd);
3630  		}
3631  
3632  		/* prefetch the next cache line of mbuf pointers and flags */
3633  		prefetch(&txq->ift_sds.ifsd_m[next]);
3634  		prefetch(&txq->ift_sds.ifsd_map[next]);
3635  		next = (cidx + CACHE_LINE_SIZE) & (ntxd - 1);
3636  	}
3637  	map = txq->ift_sds.ifsd_map[pidx];
3638  	ifsd_m = txq->ift_sds.ifsd_m;
3639  
3640  	if (m_head->m_pkthdr.csum_flags & CSUM_TSO) {
3641  		buf_tag = txq->ift_tso_buf_tag;
3642  		max_segs = scctx->isc_tx_tso_segments_max;
3643  		map = txq->ift_sds.ifsd_tso_map[pidx];
3644  		MPASS(buf_tag != NULL);
3645  		MPASS(max_segs > 0);
3646  	} else {
3647  		buf_tag = txq->ift_buf_tag;
3648  		max_segs = scctx->isc_tx_nsegments;
3649  		map = txq->ift_sds.ifsd_map[pidx];
3650  	}
3651  	if ((sctx->isc_flags & IFLIB_NEED_ETHER_PAD) &&
3652  	    __predict_false(m_head->m_pkthdr.len < scctx->isc_min_frame_size)) {
3653  		err = iflib_ether_pad(ctx->ifc_dev, m_headp, scctx->isc_min_frame_size);
3654  		if (err) {
3655  			DBG_COUNTER_INC(encap_txd_encap_fail);
3656  			return (err);
3657  		}
3658  	}
3659  	m_head = *m_headp;
3660  
3661  	pkt_info_zero(&pi);
3662  	pi.ipi_mflags = (m_head->m_flags & (M_VLANTAG | M_BCAST | M_MCAST));
3663  	pi.ipi_pidx = pidx;
3664  	pi.ipi_qsidx = txq->ift_id;
3665  	pi.ipi_len = m_head->m_pkthdr.len;
3666  	pi.ipi_csum_flags = m_head->m_pkthdr.csum_flags;
3667  	pi.ipi_vtag = M_HAS_VLANTAG(m_head) ? m_head->m_pkthdr.ether_vtag : 0;
3668  
3669  	/* deliberate bitwise OR to make one condition */
3670  	if (__predict_true((pi.ipi_csum_flags | pi.ipi_vtag))) {
3671  		if (__predict_false((err = iflib_parse_header(txq, &pi, m_headp)) != 0)) {
3672  			DBG_COUNTER_INC(encap_txd_encap_fail);
3673  			return (err);
3674  		}
3675  		m_head = *m_headp;
3676  	}
3677  
3678  retry:
3679  	err = bus_dmamap_load_mbuf_sg(buf_tag, map, m_head, segs, &nsegs,
3680  	    BUS_DMA_NOWAIT);
3681  defrag:
3682  	if (__predict_false(err)) {
3683  		switch (err) {
3684  		case EFBIG:
3685  			/* try collapse once and defrag once */
3686  			if (remap == 0) {
3687  				m_head = m_collapse(*m_headp, M_NOWAIT, max_segs);
3688  				/* try defrag if collapsing fails */
3689  				if (m_head == NULL)
3690  					remap++;
3691  			}
3692  			if (remap == 1) {
3693  				txq->ift_mbuf_defrag++;
3694  				m_head = m_defrag(*m_headp, M_NOWAIT);
3695  			}
3696  			/*
3697  			 * remap should never be >1 unless bus_dmamap_load_mbuf_sg
3698  			 * failed to map an mbuf that was run through m_defrag
3699  			 */
3700  			MPASS(remap <= 1);
3701  			if (__predict_false(m_head == NULL || remap > 1))
3702  				goto defrag_failed;
3703  			remap++;
3704  			*m_headp = m_head;
3705  			goto retry;
3706  			break;
3707  		case ENOMEM:
3708  			txq->ift_no_tx_dma_setup++;
3709  			break;
3710  		default:
3711  			txq->ift_no_tx_dma_setup++;
3712  			m_freem(*m_headp);
3713  			DBG_COUNTER_INC(tx_frees);
3714  			*m_headp = NULL;
3715  			break;
3716  		}
3717  		txq->ift_map_failed++;
3718  		DBG_COUNTER_INC(encap_load_mbuf_fail);
3719  		DBG_COUNTER_INC(encap_txd_encap_fail);
3720  		return (err);
3721  	}
3722  	ifsd_m[pidx] = m_head;
3723  	/*
3724  	 * XXX assumes a 1 to 1 relationship between segments and
3725  	 *        descriptors - this does not hold true on all drivers, e.g.
3726  	 *        cxgb
3727  	 */
3728  	if (__predict_false(nsegs + 2 > TXQ_AVAIL(txq))) {
3729  		txq->ift_no_desc_avail++;
3730  		bus_dmamap_unload(buf_tag, map);
3731  		DBG_COUNTER_INC(encap_txq_avail_fail);
3732  		DBG_COUNTER_INC(encap_txd_encap_fail);
3733  		if ((txq->ift_task.gt_task.ta_flags & TASK_ENQUEUED) == 0)
3734  			GROUPTASK_ENQUEUE(&txq->ift_task);
3735  		return (ENOBUFS);
3736  	}
3737  	/*
3738  	 * On Intel cards we can greatly reduce the number of TX interrupts
3739  	 * we see by only setting report status on every Nth descriptor.
3740  	 * However, this also means that the driver will need to keep track
3741  	 * of the descriptors that RS was set on to check them for the DD bit.
3742  	 */
3743  	txq->ift_rs_pending += nsegs + 1;
3744  	if (txq->ift_rs_pending > TXQ_MAX_RS_DEFERRED(txq) ||
3745  	     iflib_no_tx_batch || (TXQ_AVAIL(txq) - nsegs) <= MAX_TX_DESC(ctx) + 2) {
3746  		pi.ipi_flags |= IPI_TX_INTR;
3747  		txq->ift_rs_pending = 0;
3748  	}
3749  
3750  	pi.ipi_segs = segs;
3751  	pi.ipi_nsegs = nsegs;
3752  
3753  	MPASS(pidx >= 0 && pidx < txq->ift_size);
3754  #ifdef PKT_DEBUG
3755  	print_pkt(&pi);
3756  #endif
3757  	if ((err = ctx->isc_txd_encap(ctx->ifc_softc, &pi)) == 0) {
3758  		bus_dmamap_sync(buf_tag, map, BUS_DMASYNC_PREWRITE);
3759  		DBG_COUNTER_INC(tx_encap);
3760  		MPASS(pi.ipi_new_pidx < txq->ift_size);
3761  
3762  		ndesc = pi.ipi_new_pidx - pi.ipi_pidx;
3763  		if (pi.ipi_new_pidx < pi.ipi_pidx) {
3764  			ndesc += txq->ift_size;
3765  			txq->ift_gen = 1;
3766  		}
3767  		/*
3768  		 * drivers can need as many as
3769  		 * two sentinels
3770  		 */
3771  		MPASS(ndesc <= pi.ipi_nsegs + 2);
3772  		MPASS(pi.ipi_new_pidx != pidx);
3773  		MPASS(ndesc > 0);
3774  		txq->ift_in_use += ndesc;
3775  		txq->ift_db_pending += ndesc;
3776  
3777  		/*
3778  		 * We update the last software descriptor again here because there may
3779  		 * be a sentinel and/or there may be more mbufs than segments
3780  		 */
3781  		txq->ift_pidx = pi.ipi_new_pidx;
3782  		txq->ift_npending += pi.ipi_ndescs;
3783  	} else {
3784  		*m_headp = m_head = iflib_remove_mbuf(txq);
3785  		if (err == EFBIG) {
3786  			txq->ift_txd_encap_efbig++;
3787  			if (remap < 2) {
3788  				remap = 1;
3789  				goto defrag;
3790  			}
3791  		}
3792  		goto defrag_failed;
3793  	}
3794  	/*
3795  	 * err can't possibly be non-zero here, so we don't neet to test it
3796  	 * to see if we need to DBG_COUNTER_INC(encap_txd_encap_fail).
3797  	 */
3798  	return (err);
3799  
3800  defrag_failed:
3801  	txq->ift_mbuf_defrag_failed++;
3802  	txq->ift_map_failed++;
3803  	m_freem(*m_headp);
3804  	DBG_COUNTER_INC(tx_frees);
3805  	*m_headp = NULL;
3806  	DBG_COUNTER_INC(encap_txd_encap_fail);
3807  	return (ENOMEM);
3808  }
3809  
3810  static void
iflib_tx_desc_free(iflib_txq_t txq,int n)3811  iflib_tx_desc_free(iflib_txq_t txq, int n)
3812  {
3813  	uint32_t qsize, cidx, mask, gen;
3814  	struct mbuf *m, **ifsd_m;
3815  	bool do_prefetch;
3816  
3817  	cidx = txq->ift_cidx;
3818  	gen = txq->ift_gen;
3819  	qsize = txq->ift_size;
3820  	mask = qsize - 1;
3821  	ifsd_m = txq->ift_sds.ifsd_m;
3822  	do_prefetch = (txq->ift_ctx->ifc_flags & IFC_PREFETCH);
3823  
3824  	while (n-- > 0) {
3825  		if (do_prefetch) {
3826  			prefetch(ifsd_m[(cidx + 3) & mask]);
3827  			prefetch(ifsd_m[(cidx + 4) & mask]);
3828  		}
3829  		if ((m = ifsd_m[cidx]) != NULL) {
3830  			prefetch(&ifsd_m[(cidx + CACHE_PTR_INCREMENT) & mask]);
3831  			if (m->m_pkthdr.csum_flags & CSUM_TSO) {
3832  				bus_dmamap_sync(txq->ift_tso_buf_tag,
3833  				    txq->ift_sds.ifsd_tso_map[cidx],
3834  				    BUS_DMASYNC_POSTWRITE);
3835  				bus_dmamap_unload(txq->ift_tso_buf_tag,
3836  				    txq->ift_sds.ifsd_tso_map[cidx]);
3837  			} else {
3838  				bus_dmamap_sync(txq->ift_buf_tag,
3839  				    txq->ift_sds.ifsd_map[cidx],
3840  				    BUS_DMASYNC_POSTWRITE);
3841  				bus_dmamap_unload(txq->ift_buf_tag,
3842  				    txq->ift_sds.ifsd_map[cidx]);
3843  			}
3844  			/* XXX we don't support any drivers that batch packets yet */
3845  			MPASS(m->m_nextpkt == NULL);
3846  			m_freem(m);
3847  			ifsd_m[cidx] = NULL;
3848  #if MEMORY_LOGGING
3849  			txq->ift_dequeued++;
3850  #endif
3851  			DBG_COUNTER_INC(tx_frees);
3852  		}
3853  		if (__predict_false(++cidx == qsize)) {
3854  			cidx = 0;
3855  			gen = 0;
3856  		}
3857  	}
3858  	txq->ift_cidx = cidx;
3859  	txq->ift_gen = gen;
3860  }
3861  
3862  static __inline int
iflib_completed_tx_reclaim(iflib_txq_t txq,int thresh)3863  iflib_completed_tx_reclaim(iflib_txq_t txq, int thresh)
3864  {
3865  	int reclaim;
3866  	if_ctx_t ctx = txq->ift_ctx;
3867  
3868  	KASSERT(thresh >= 0, ("invalid threshold to reclaim"));
3869  	MPASS(thresh /*+ MAX_TX_DESC(txq->ift_ctx) */ < txq->ift_size);
3870  
3871  	/*
3872  	 * Need a rate-limiting check so that this isn't called every time
3873  	 */
3874  	iflib_tx_credits_update(ctx, txq);
3875  	reclaim = DESC_RECLAIMABLE(txq);
3876  
3877  	if (reclaim <= thresh /* + MAX_TX_DESC(txq->ift_ctx) */) {
3878  #ifdef INVARIANTS
3879  		if (iflib_verbose_debug) {
3880  			printf("%s processed=%ju cleaned=%ju tx_nsegments=%d reclaim=%d thresh=%d\n", __func__,
3881  			       txq->ift_processed, txq->ift_cleaned, txq->ift_ctx->ifc_softc_ctx.isc_tx_nsegments,
3882  			       reclaim, thresh);
3883  		}
3884  #endif
3885  		return (0);
3886  	}
3887  	iflib_tx_desc_free(txq, reclaim);
3888  	txq->ift_cleaned += reclaim;
3889  	txq->ift_in_use -= reclaim;
3890  
3891  	return (reclaim);
3892  }
3893  
3894  static struct mbuf **
_ring_peek_one(struct ifmp_ring * r,int cidx,int offset,int remaining)3895  _ring_peek_one(struct ifmp_ring *r, int cidx, int offset, int remaining)
3896  {
3897  	int next, size;
3898  	struct mbuf **items;
3899  
3900  	size = r->size;
3901  	next = (cidx + CACHE_PTR_INCREMENT) & (size - 1);
3902  	items = __DEVOLATILE(struct mbuf **, &r->items[0]);
3903  
3904  	prefetch(items[(cidx + offset) & (size - 1)]);
3905  	if (remaining > 1) {
3906  		prefetch2cachelines(&items[next]);
3907  		prefetch2cachelines(items[(cidx + offset + 1) & (size - 1)]);
3908  		prefetch2cachelines(items[(cidx + offset + 2) & (size - 1)]);
3909  		prefetch2cachelines(items[(cidx + offset + 3) & (size - 1)]);
3910  	}
3911  	return (__DEVOLATILE(struct mbuf **, &r->items[(cidx + offset) & (size - 1)]));
3912  }
3913  
3914  static void
iflib_txq_check_drain(iflib_txq_t txq,int budget)3915  iflib_txq_check_drain(iflib_txq_t txq, int budget)
3916  {
3917  
3918  	ifmp_ring_check_drainage(txq->ift_br, budget);
3919  }
3920  
3921  static uint32_t
iflib_txq_can_drain(struct ifmp_ring * r)3922  iflib_txq_can_drain(struct ifmp_ring *r)
3923  {
3924  	iflib_txq_t txq = r->cookie;
3925  	if_ctx_t ctx = txq->ift_ctx;
3926  
3927  	if (TXQ_AVAIL(txq) > MAX_TX_DESC(ctx) + 2)
3928  		return (1);
3929  	bus_dmamap_sync(txq->ift_ifdi->idi_tag, txq->ift_ifdi->idi_map,
3930  	    BUS_DMASYNC_POSTREAD);
3931  	return (ctx->isc_txd_credits_update(ctx->ifc_softc, txq->ift_id,
3932  	    false));
3933  }
3934  
3935  static uint32_t
iflib_txq_drain(struct ifmp_ring * r,uint32_t cidx,uint32_t pidx)3936  iflib_txq_drain(struct ifmp_ring *r, uint32_t cidx, uint32_t pidx)
3937  {
3938  	iflib_txq_t txq = r->cookie;
3939  	if_ctx_t ctx = txq->ift_ctx;
3940  	if_t ifp = ctx->ifc_ifp;
3941  	struct mbuf *m, **mp;
3942  	int avail, bytes_sent, skipped, count, err, i;
3943  	int mcast_sent, pkt_sent, reclaimed;
3944  	bool do_prefetch, rang, ring;
3945  
3946  	if (__predict_false(!(if_getdrvflags(ifp) & IFF_DRV_RUNNING) ||
3947  			    !LINK_ACTIVE(ctx))) {
3948  		DBG_COUNTER_INC(txq_drain_notready);
3949  		return (0);
3950  	}
3951  	reclaimed = iflib_completed_tx_reclaim(txq, RECLAIM_THRESH(ctx));
3952  	rang = iflib_txd_db_check(txq, reclaimed && txq->ift_db_pending);
3953  	avail = IDXDIFF(pidx, cidx, r->size);
3954  
3955  	if (__predict_false(ctx->ifc_flags & IFC_QFLUSH)) {
3956  		/*
3957  		 * The driver is unloading so we need to free all pending packets.
3958  		 */
3959  		DBG_COUNTER_INC(txq_drain_flushing);
3960  		for (i = 0; i < avail; i++) {
3961  			if (__predict_true(r->items[(cidx + i) & (r->size - 1)] != (void *)txq))
3962  				m_freem(r->items[(cidx + i) & (r->size - 1)]);
3963  			r->items[(cidx + i) & (r->size - 1)] = NULL;
3964  		}
3965  		return (avail);
3966  	}
3967  
3968  	if (__predict_false(if_getdrvflags(ctx->ifc_ifp) & IFF_DRV_OACTIVE)) {
3969  		txq->ift_qstatus = IFLIB_QUEUE_IDLE;
3970  		CALLOUT_LOCK(txq);
3971  		callout_stop(&txq->ift_timer);
3972  		CALLOUT_UNLOCK(txq);
3973  		DBG_COUNTER_INC(txq_drain_oactive);
3974  		return (0);
3975  	}
3976  
3977  	/*
3978  	 * If we've reclaimed any packets this queue cannot be hung.
3979  	 */
3980  	if (reclaimed)
3981  		txq->ift_qstatus = IFLIB_QUEUE_IDLE;
3982  	skipped = mcast_sent = bytes_sent = pkt_sent = 0;
3983  	count = MIN(avail, TX_BATCH_SIZE);
3984  #ifdef INVARIANTS
3985  	if (iflib_verbose_debug)
3986  		printf("%s avail=%d ifc_flags=%x txq_avail=%d ", __func__,
3987  		       avail, ctx->ifc_flags, TXQ_AVAIL(txq));
3988  #endif
3989  	do_prefetch = (ctx->ifc_flags & IFC_PREFETCH);
3990  	err = 0;
3991  	for (i = 0; i < count && TXQ_AVAIL(txq) >= MAX_TX_DESC(ctx) + 2; i++) {
3992  		int rem = do_prefetch ? count - i : 0;
3993  
3994  		mp = _ring_peek_one(r, cidx, i, rem);
3995  		MPASS(mp != NULL && *mp != NULL);
3996  
3997  		/*
3998  		 * Completion interrupts will use the address of the txq
3999  		 * as a sentinel to enqueue _something_ in order to acquire
4000  		 * the lock on the mp_ring (there's no direct lock call).
4001  		 * We obviously whave to check for these sentinel cases
4002  		 * and skip them.
4003  		 */
4004  		if (__predict_false(*mp == (struct mbuf *)txq)) {
4005  			skipped++;
4006  			continue;
4007  		}
4008  		err = iflib_encap(txq, mp);
4009  		if (__predict_false(err)) {
4010  			/* no room - bail out */
4011  			if (err == ENOBUFS)
4012  				break;
4013  			skipped++;
4014  			/* we can't send this packet - skip it */
4015  			continue;
4016  		}
4017  		pkt_sent++;
4018  		m = *mp;
4019  		DBG_COUNTER_INC(tx_sent);
4020  		bytes_sent += m->m_pkthdr.len;
4021  		mcast_sent += !!(m->m_flags & M_MCAST);
4022  
4023  		if (__predict_false(!(if_getdrvflags(ifp) & IFF_DRV_RUNNING)))
4024  			break;
4025  		ETHER_BPF_MTAP(ifp, m);
4026  		rang = iflib_txd_db_check(txq, false);
4027  	}
4028  
4029  	/* deliberate use of bitwise or to avoid gratuitous short-circuit */
4030  	ring = rang ? false  : (iflib_min_tx_latency | err);
4031  	iflib_txd_db_check(txq, ring);
4032  	if_inc_counter(ifp, IFCOUNTER_OBYTES, bytes_sent);
4033  	if_inc_counter(ifp, IFCOUNTER_OPACKETS, pkt_sent);
4034  	if (mcast_sent)
4035  		if_inc_counter(ifp, IFCOUNTER_OMCASTS, mcast_sent);
4036  #ifdef INVARIANTS
4037  	if (iflib_verbose_debug)
4038  		printf("consumed=%d\n", skipped + pkt_sent);
4039  #endif
4040  	return (skipped + pkt_sent);
4041  }
4042  
4043  static uint32_t
iflib_txq_drain_always(struct ifmp_ring * r)4044  iflib_txq_drain_always(struct ifmp_ring *r)
4045  {
4046  	return (1);
4047  }
4048  
4049  static uint32_t
iflib_txq_drain_free(struct ifmp_ring * r,uint32_t cidx,uint32_t pidx)4050  iflib_txq_drain_free(struct ifmp_ring *r, uint32_t cidx, uint32_t pidx)
4051  {
4052  	int i, avail;
4053  	struct mbuf **mp;
4054  	iflib_txq_t txq;
4055  
4056  	txq = r->cookie;
4057  
4058  	txq->ift_qstatus = IFLIB_QUEUE_IDLE;
4059  	CALLOUT_LOCK(txq);
4060  	callout_stop(&txq->ift_timer);
4061  	CALLOUT_UNLOCK(txq);
4062  
4063  	avail = IDXDIFF(pidx, cidx, r->size);
4064  	for (i = 0; i < avail; i++) {
4065  		mp = _ring_peek_one(r, cidx, i, avail - i);
4066  		if (__predict_false(*mp == (struct mbuf *)txq))
4067  			continue;
4068  		m_freem(*mp);
4069  		DBG_COUNTER_INC(tx_frees);
4070  	}
4071  	MPASS(ifmp_ring_is_stalled(r) == 0);
4072  	return (avail);
4073  }
4074  
4075  static void
iflib_ifmp_purge(iflib_txq_t txq)4076  iflib_ifmp_purge(iflib_txq_t txq)
4077  {
4078  	struct ifmp_ring *r;
4079  
4080  	r = txq->ift_br;
4081  	r->drain = iflib_txq_drain_free;
4082  	r->can_drain = iflib_txq_drain_always;
4083  
4084  	ifmp_ring_check_drainage(r, r->size);
4085  
4086  	r->drain = iflib_txq_drain;
4087  	r->can_drain = iflib_txq_can_drain;
4088  }
4089  
4090  static void
_task_fn_tx(void * context)4091  _task_fn_tx(void *context)
4092  {
4093  	iflib_txq_t txq = context;
4094  	if_ctx_t ctx = txq->ift_ctx;
4095  	if_t ifp = ctx->ifc_ifp;
4096  	int abdicate = ctx->ifc_sysctl_tx_abdicate;
4097  
4098  #ifdef IFLIB_DIAGNOSTICS
4099  	txq->ift_cpu_exec_count[curcpu]++;
4100  #endif
4101  	if (!(if_getdrvflags(ifp) & IFF_DRV_RUNNING))
4102  		return;
4103  #ifdef DEV_NETMAP
4104  	if ((if_getcapenable(ifp) & IFCAP_NETMAP) &&
4105  	    netmap_tx_irq(ifp, txq->ift_id))
4106  		goto skip_ifmp;
4107  #endif
4108  #ifdef ALTQ
4109  	if (if_altq_is_enabled(ifp))
4110  		iflib_altq_if_start(ifp);
4111  #endif
4112  	if (txq->ift_db_pending)
4113  		ifmp_ring_enqueue(txq->ift_br, (void **)&txq, 1, TX_BATCH_SIZE, abdicate);
4114  	else if (!abdicate)
4115  		ifmp_ring_check_drainage(txq->ift_br, TX_BATCH_SIZE);
4116  	/*
4117  	 * When abdicating, we always need to check drainage, not just when we don't enqueue
4118  	 */
4119  	if (abdicate)
4120  		ifmp_ring_check_drainage(txq->ift_br, TX_BATCH_SIZE);
4121  #ifdef DEV_NETMAP
4122  skip_ifmp:
4123  #endif
4124  	if (ctx->ifc_flags & IFC_LEGACY)
4125  		IFDI_INTR_ENABLE(ctx);
4126  	else
4127  		IFDI_TX_QUEUE_INTR_ENABLE(ctx, txq->ift_id);
4128  }
4129  
4130  static void
_task_fn_rx(void * context)4131  _task_fn_rx(void *context)
4132  {
4133  	iflib_rxq_t rxq = context;
4134  	if_ctx_t ctx = rxq->ifr_ctx;
4135  	uint8_t more;
4136  	uint16_t budget;
4137  #ifdef DEV_NETMAP
4138  	u_int work = 0;
4139  	int nmirq;
4140  #endif
4141  
4142  #ifdef IFLIB_DIAGNOSTICS
4143  	rxq->ifr_cpu_exec_count[curcpu]++;
4144  #endif
4145  	DBG_COUNTER_INC(task_fn_rxs);
4146  	if (__predict_false(!(if_getdrvflags(ctx->ifc_ifp) & IFF_DRV_RUNNING)))
4147  		return;
4148  #ifdef DEV_NETMAP
4149  	nmirq = netmap_rx_irq(ctx->ifc_ifp, rxq->ifr_id, &work);
4150  	if (nmirq != NM_IRQ_PASS) {
4151  		more = (nmirq == NM_IRQ_RESCHED) ? IFLIB_RXEOF_MORE : 0;
4152  		goto skip_rxeof;
4153  	}
4154  #endif
4155  	budget = ctx->ifc_sysctl_rx_budget;
4156  	if (budget == 0)
4157  		budget = 16;	/* XXX */
4158  	more = iflib_rxeof(rxq, budget);
4159  #ifdef DEV_NETMAP
4160  skip_rxeof:
4161  #endif
4162  	if ((more & IFLIB_RXEOF_MORE) == 0) {
4163  		if (ctx->ifc_flags & IFC_LEGACY)
4164  			IFDI_INTR_ENABLE(ctx);
4165  		else
4166  			IFDI_RX_QUEUE_INTR_ENABLE(ctx, rxq->ifr_id);
4167  		DBG_COUNTER_INC(rx_intr_enables);
4168  	}
4169  	if (__predict_false(!(if_getdrvflags(ctx->ifc_ifp) & IFF_DRV_RUNNING)))
4170  		return;
4171  
4172  	if (more & IFLIB_RXEOF_MORE)
4173  		GROUPTASK_ENQUEUE(&rxq->ifr_task);
4174  	else if (more & IFLIB_RXEOF_EMPTY)
4175  		callout_reset_curcpu(&rxq->ifr_watchdog, 1, &_task_fn_rx_watchdog, rxq);
4176  }
4177  
4178  static void
_task_fn_admin(void * context,int pending)4179  _task_fn_admin(void *context, int pending)
4180  {
4181  	if_ctx_t ctx = context;
4182  	if_softc_ctx_t sctx = &ctx->ifc_softc_ctx;
4183  	iflib_txq_t txq;
4184  	int i;
4185  	bool oactive, running, do_reset, do_watchdog, in_detach;
4186  
4187  	STATE_LOCK(ctx);
4188  	running = (if_getdrvflags(ctx->ifc_ifp) & IFF_DRV_RUNNING);
4189  	oactive = (if_getdrvflags(ctx->ifc_ifp) & IFF_DRV_OACTIVE);
4190  	do_reset = (ctx->ifc_flags & IFC_DO_RESET);
4191  	do_watchdog = (ctx->ifc_flags & IFC_DO_WATCHDOG);
4192  	in_detach = (ctx->ifc_flags & IFC_IN_DETACH);
4193  	ctx->ifc_flags &= ~(IFC_DO_RESET | IFC_DO_WATCHDOG);
4194  	STATE_UNLOCK(ctx);
4195  
4196  	if ((!running && !oactive) && !(ctx->ifc_sctx->isc_flags & IFLIB_ADMIN_ALWAYS_RUN))
4197  		return;
4198  	if (in_detach)
4199  		return;
4200  
4201  	CTX_LOCK(ctx);
4202  	for (txq = ctx->ifc_txqs, i = 0; i < sctx->isc_ntxqsets; i++, txq++) {
4203  		CALLOUT_LOCK(txq);
4204  		callout_stop(&txq->ift_timer);
4205  		CALLOUT_UNLOCK(txq);
4206  	}
4207  	if (ctx->ifc_sctx->isc_flags & IFLIB_HAS_ADMINCQ)
4208  		IFDI_ADMIN_COMPLETION_HANDLE(ctx);
4209  	if (do_watchdog) {
4210  		ctx->ifc_watchdog_events++;
4211  		IFDI_WATCHDOG_RESET(ctx);
4212  	}
4213  	IFDI_UPDATE_ADMIN_STATUS(ctx);
4214  	for (txq = ctx->ifc_txqs, i = 0; i < sctx->isc_ntxqsets; i++, txq++) {
4215  		callout_reset_on(&txq->ift_timer, iflib_timer_default, iflib_timer, txq,
4216  		    txq->ift_timer.c_cpu);
4217  	}
4218  	IFDI_LINK_INTR_ENABLE(ctx);
4219  	if (do_reset)
4220  		iflib_if_init_locked(ctx);
4221  	CTX_UNLOCK(ctx);
4222  
4223  	if (LINK_ACTIVE(ctx) == 0)
4224  		return;
4225  	for (txq = ctx->ifc_txqs, i = 0; i < sctx->isc_ntxqsets; i++, txq++)
4226  		iflib_txq_check_drain(txq, IFLIB_RESTART_BUDGET);
4227  }
4228  
4229  static void
_task_fn_iov(void * context,int pending)4230  _task_fn_iov(void *context, int pending)
4231  {
4232  	if_ctx_t ctx = context;
4233  
4234  	if (!(if_getdrvflags(ctx->ifc_ifp) & IFF_DRV_RUNNING) &&
4235  	    !(ctx->ifc_sctx->isc_flags & IFLIB_ADMIN_ALWAYS_RUN))
4236  		return;
4237  
4238  	CTX_LOCK(ctx);
4239  	IFDI_VFLR_HANDLE(ctx);
4240  	CTX_UNLOCK(ctx);
4241  }
4242  
4243  static int
iflib_sysctl_int_delay(SYSCTL_HANDLER_ARGS)4244  iflib_sysctl_int_delay(SYSCTL_HANDLER_ARGS)
4245  {
4246  	int err;
4247  	if_int_delay_info_t info;
4248  	if_ctx_t ctx;
4249  
4250  	info = (if_int_delay_info_t)arg1;
4251  	ctx = info->iidi_ctx;
4252  	info->iidi_req = req;
4253  	info->iidi_oidp = oidp;
4254  	CTX_LOCK(ctx);
4255  	err = IFDI_SYSCTL_INT_DELAY(ctx, info);
4256  	CTX_UNLOCK(ctx);
4257  	return (err);
4258  }
4259  
4260  /*********************************************************************
4261   *
4262   *  IFNET FUNCTIONS
4263   *
4264   **********************************************************************/
4265  
4266  static void
iflib_if_init_locked(if_ctx_t ctx)4267  iflib_if_init_locked(if_ctx_t ctx)
4268  {
4269  	iflib_stop(ctx);
4270  	iflib_init_locked(ctx);
4271  }
4272  
4273  static void
iflib_if_init(void * arg)4274  iflib_if_init(void *arg)
4275  {
4276  	if_ctx_t ctx = arg;
4277  
4278  	CTX_LOCK(ctx);
4279  	iflib_if_init_locked(ctx);
4280  	CTX_UNLOCK(ctx);
4281  }
4282  
4283  static int
iflib_if_transmit(if_t ifp,struct mbuf * m)4284  iflib_if_transmit(if_t ifp, struct mbuf *m)
4285  {
4286  	if_ctx_t ctx = if_getsoftc(ifp);
4287  	iflib_txq_t txq;
4288  	int err, qidx;
4289  	int abdicate;
4290  
4291  	if (__predict_false((if_getdrvflags(ifp) & IFF_DRV_RUNNING) == 0 || !LINK_ACTIVE(ctx))) {
4292  		DBG_COUNTER_INC(tx_frees);
4293  		m_freem(m);
4294  		return (ENETDOWN);
4295  	}
4296  
4297  	MPASS(m->m_nextpkt == NULL);
4298  	/* ALTQ-enabled interfaces always use queue 0. */
4299  	qidx = 0;
4300  	/* Use driver-supplied queue selection method if it exists */
4301  	if (ctx->isc_txq_select_v2) {
4302  		struct if_pkt_info pi;
4303  		uint64_t early_pullups = 0;
4304  		pkt_info_zero(&pi);
4305  
4306  		err = iflib_parse_header_partial(&pi, &m, &early_pullups);
4307  		if (__predict_false(err != 0)) {
4308  			/* Assign pullups for bad pkts to default queue */
4309  			ctx->ifc_txqs[0].ift_pullups += early_pullups;
4310  			DBG_COUNTER_INC(encap_txd_encap_fail);
4311  			return (err);
4312  		}
4313  		/* Let driver make queueing decision */
4314  		qidx = ctx->isc_txq_select_v2(ctx->ifc_softc, m, &pi);
4315  		ctx->ifc_txqs[qidx].ift_pullups += early_pullups;
4316  	}
4317  	/* Backwards compatibility w/ simpler queue select */
4318  	else if (ctx->isc_txq_select)
4319  		qidx = ctx->isc_txq_select(ctx->ifc_softc, m);
4320  	/* If not, use iflib's standard method */
4321  	else if ((NTXQSETS(ctx) > 1) && M_HASHTYPE_GET(m) && !if_altq_is_enabled(ifp))
4322  		qidx = QIDX(ctx, m);
4323  
4324  	/* Set TX queue */
4325  	txq = &ctx->ifc_txqs[qidx];
4326  
4327  #ifdef DRIVER_BACKPRESSURE
4328  	if (txq->ift_closed) {
4329  		while (m != NULL) {
4330  			next = m->m_nextpkt;
4331  			m->m_nextpkt = NULL;
4332  			m_freem(m);
4333  			DBG_COUNTER_INC(tx_frees);
4334  			m = next;
4335  		}
4336  		return (ENOBUFS);
4337  	}
4338  #endif
4339  #ifdef notyet
4340  	qidx = count = 0;
4341  	mp = marr;
4342  	next = m;
4343  	do {
4344  		count++;
4345  		next = next->m_nextpkt;
4346  	} while (next != NULL);
4347  
4348  	if (count > nitems(marr))
4349  		if ((mp = malloc(count * sizeof(struct mbuf *), M_IFLIB, M_NOWAIT)) == NULL) {
4350  			/* XXX check nextpkt */
4351  			m_freem(m);
4352  			/* XXX simplify for now */
4353  			DBG_COUNTER_INC(tx_frees);
4354  			return (ENOBUFS);
4355  		}
4356  	for (next = m, i = 0; next != NULL; i++) {
4357  		mp[i] = next;
4358  		next = next->m_nextpkt;
4359  		mp[i]->m_nextpkt = NULL;
4360  	}
4361  #endif
4362  	DBG_COUNTER_INC(tx_seen);
4363  	abdicate = ctx->ifc_sysctl_tx_abdicate;
4364  
4365  	err = ifmp_ring_enqueue(txq->ift_br, (void **)&m, 1, TX_BATCH_SIZE, abdicate);
4366  
4367  	if (abdicate)
4368  		GROUPTASK_ENQUEUE(&txq->ift_task);
4369   	if (err) {
4370  		if (!abdicate)
4371  			GROUPTASK_ENQUEUE(&txq->ift_task);
4372  		/* support forthcoming later */
4373  #ifdef DRIVER_BACKPRESSURE
4374  		txq->ift_closed = TRUE;
4375  #endif
4376  		ifmp_ring_check_drainage(txq->ift_br, TX_BATCH_SIZE);
4377  		m_freem(m);
4378  		DBG_COUNTER_INC(tx_frees);
4379  	}
4380  
4381  	return (err);
4382  }
4383  
4384  #ifdef ALTQ
4385  /*
4386   * The overall approach to integrating iflib with ALTQ is to continue to use
4387   * the iflib mp_ring machinery between the ALTQ queue(s) and the hardware
4388   * ring.  Technically, when using ALTQ, queueing to an intermediate mp_ring
4389   * is redundant/unnecessary, but doing so minimizes the amount of
4390   * ALTQ-specific code required in iflib.  It is assumed that the overhead of
4391   * redundantly queueing to an intermediate mp_ring is swamped by the
4392   * performance limitations inherent in using ALTQ.
4393   *
4394   * When ALTQ support is compiled in, all iflib drivers will use a transmit
4395   * routine, iflib_altq_if_transmit(), that checks if ALTQ is enabled for the
4396   * given interface.  If ALTQ is enabled for an interface, then all
4397   * transmitted packets for that interface will be submitted to the ALTQ
4398   * subsystem via IFQ_ENQUEUE().  We don't use the legacy if_transmit()
4399   * implementation because it uses IFQ_HANDOFF(), which will duplicatively
4400   * update stats that the iflib machinery handles, and which is sensitve to
4401   * the disused IFF_DRV_OACTIVE flag.  Additionally, iflib_altq_if_start()
4402   * will be installed as the start routine for use by ALTQ facilities that
4403   * need to trigger queue drains on a scheduled basis.
4404   *
4405   */
4406  static void
iflib_altq_if_start(if_t ifp)4407  iflib_altq_if_start(if_t ifp)
4408  {
4409  	struct ifaltq *ifq = &ifp->if_snd; /* XXX - DRVAPI */
4410  	struct mbuf *m;
4411  
4412  	IFQ_LOCK(ifq);
4413  	IFQ_DEQUEUE_NOLOCK(ifq, m);
4414  	while (m != NULL) {
4415  		iflib_if_transmit(ifp, m);
4416  		IFQ_DEQUEUE_NOLOCK(ifq, m);
4417  	}
4418  	IFQ_UNLOCK(ifq);
4419  }
4420  
4421  static int
iflib_altq_if_transmit(if_t ifp,struct mbuf * m)4422  iflib_altq_if_transmit(if_t ifp, struct mbuf *m)
4423  {
4424  	int err;
4425  
4426  	if (if_altq_is_enabled(ifp)) {
4427  		IFQ_ENQUEUE(&ifp->if_snd, m, err); /* XXX - DRVAPI */
4428  		if (err == 0)
4429  			iflib_altq_if_start(ifp);
4430  	} else
4431  		err = iflib_if_transmit(ifp, m);
4432  
4433  	return (err);
4434  }
4435  #endif /* ALTQ */
4436  
4437  static void
iflib_if_qflush(if_t ifp)4438  iflib_if_qflush(if_t ifp)
4439  {
4440  	if_ctx_t ctx = if_getsoftc(ifp);
4441  	iflib_txq_t txq = ctx->ifc_txqs;
4442  	int i;
4443  
4444  	STATE_LOCK(ctx);
4445  	ctx->ifc_flags |= IFC_QFLUSH;
4446  	STATE_UNLOCK(ctx);
4447  	for (i = 0; i < NTXQSETS(ctx); i++, txq++)
4448  		while (!(ifmp_ring_is_idle(txq->ift_br) || ifmp_ring_is_stalled(txq->ift_br)))
4449  			iflib_txq_check_drain(txq, 0);
4450  	STATE_LOCK(ctx);
4451  	ctx->ifc_flags &= ~IFC_QFLUSH;
4452  	STATE_UNLOCK(ctx);
4453  
4454  	/*
4455  	 * When ALTQ is enabled, this will also take care of purging the
4456  	 * ALTQ queue(s).
4457  	 */
4458  	if_qflush(ifp);
4459  }
4460  
4461  #define IFCAP_FLAGS (IFCAP_HWCSUM_IPV6 | IFCAP_HWCSUM | IFCAP_LRO | \
4462  		     IFCAP_TSO | IFCAP_VLAN_HWTAGGING | IFCAP_HWSTATS | \
4463  		     IFCAP_VLAN_MTU | IFCAP_VLAN_HWFILTER | \
4464  		     IFCAP_VLAN_HWTSO | IFCAP_VLAN_HWCSUM | IFCAP_MEXTPG)
4465  
4466  static int
iflib_if_ioctl(if_t ifp,u_long command,caddr_t data)4467  iflib_if_ioctl(if_t ifp, u_long command, caddr_t data)
4468  {
4469  	if_ctx_t ctx = if_getsoftc(ifp);
4470  	struct ifreq	*ifr = (struct ifreq *)data;
4471  #if defined(INET) || defined(INET6)
4472  	struct ifaddr	*ifa = (struct ifaddr *)data;
4473  #endif
4474  	bool		avoid_reset = false;
4475  	int		err = 0, reinit = 0, bits;
4476  
4477  	switch (command) {
4478  	case SIOCSIFADDR:
4479  #ifdef INET
4480  		if (ifa->ifa_addr->sa_family == AF_INET)
4481  			avoid_reset = true;
4482  #endif
4483  #ifdef INET6
4484  		if (ifa->ifa_addr->sa_family == AF_INET6)
4485  			avoid_reset = true;
4486  #endif
4487  		/*
4488  		 * Calling init results in link renegotiation,
4489  		 * so we avoid doing it when possible.
4490  		 */
4491  		if (avoid_reset) {
4492  			if_setflagbits(ifp, IFF_UP, 0);
4493  			if (!(if_getdrvflags(ifp) & IFF_DRV_RUNNING))
4494  				reinit = 1;
4495  #ifdef INET
4496  			if (!(if_getflags(ifp) & IFF_NOARP))
4497  				arp_ifinit(ifp, ifa);
4498  #endif
4499  		} else
4500  			err = ether_ioctl(ifp, command, data);
4501  		break;
4502  	case SIOCSIFMTU:
4503  		CTX_LOCK(ctx);
4504  		if (ifr->ifr_mtu == if_getmtu(ifp)) {
4505  			CTX_UNLOCK(ctx);
4506  			break;
4507  		}
4508  		bits = if_getdrvflags(ifp);
4509  		/* stop the driver and free any clusters before proceeding */
4510  		iflib_stop(ctx);
4511  
4512  		if ((err = IFDI_MTU_SET(ctx, ifr->ifr_mtu)) == 0) {
4513  			STATE_LOCK(ctx);
4514  			if (ifr->ifr_mtu > ctx->ifc_max_fl_buf_size)
4515  				ctx->ifc_flags |= IFC_MULTISEG;
4516  			else
4517  				ctx->ifc_flags &= ~IFC_MULTISEG;
4518  			STATE_UNLOCK(ctx);
4519  			err = if_setmtu(ifp, ifr->ifr_mtu);
4520  		}
4521  		iflib_init_locked(ctx);
4522  		STATE_LOCK(ctx);
4523  		if_setdrvflags(ifp, bits);
4524  		STATE_UNLOCK(ctx);
4525  		CTX_UNLOCK(ctx);
4526  		break;
4527  	case SIOCSIFFLAGS:
4528  		CTX_LOCK(ctx);
4529  		if (if_getflags(ifp) & IFF_UP) {
4530  			if (if_getdrvflags(ifp) & IFF_DRV_RUNNING) {
4531  				if ((if_getflags(ifp) ^ ctx->ifc_if_flags) &
4532  				    (IFF_PROMISC | IFF_ALLMULTI)) {
4533  					CTX_UNLOCK(ctx);
4534  					err = IFDI_PROMISC_SET(ctx, if_getflags(ifp));
4535  					CTX_LOCK(ctx);
4536  				}
4537  			} else
4538  				reinit = 1;
4539  		} else if (if_getdrvflags(ifp) & IFF_DRV_RUNNING) {
4540  			iflib_stop(ctx);
4541  		}
4542  		ctx->ifc_if_flags = if_getflags(ifp);
4543  		CTX_UNLOCK(ctx);
4544  		break;
4545  	case SIOCADDMULTI:
4546  	case SIOCDELMULTI:
4547  		if (if_getdrvflags(ifp) & IFF_DRV_RUNNING) {
4548  			CTX_LOCK(ctx);
4549  			IFDI_INTR_DISABLE(ctx);
4550  			IFDI_MULTI_SET(ctx);
4551  			IFDI_INTR_ENABLE(ctx);
4552  			CTX_UNLOCK(ctx);
4553  		}
4554  		break;
4555  	case SIOCSIFMEDIA:
4556  		CTX_LOCK(ctx);
4557  		IFDI_MEDIA_SET(ctx);
4558  		CTX_UNLOCK(ctx);
4559  		/* FALLTHROUGH */
4560  	case SIOCGIFMEDIA:
4561  	case SIOCGIFXMEDIA:
4562  		err = ifmedia_ioctl(ifp, ifr, ctx->ifc_mediap, command);
4563  		break;
4564  	case SIOCGI2C:
4565  	{
4566  		struct ifi2creq i2c;
4567  
4568  		err = copyin(ifr_data_get_ptr(ifr), &i2c, sizeof(i2c));
4569  		if (err != 0)
4570  			break;
4571  		if (i2c.dev_addr != 0xA0 && i2c.dev_addr != 0xA2) {
4572  			err = EINVAL;
4573  			break;
4574  		}
4575  		if (i2c.len > sizeof(i2c.data)) {
4576  			err = EINVAL;
4577  			break;
4578  		}
4579  
4580  		if ((err = IFDI_I2C_REQ(ctx, &i2c)) == 0)
4581  			err = copyout(&i2c, ifr_data_get_ptr(ifr),
4582  			    sizeof(i2c));
4583  		break;
4584  	}
4585  	case SIOCSIFCAP:
4586  	{
4587  		int mask, setmask, oldmask;
4588  
4589  		oldmask = if_getcapenable(ifp);
4590  		mask = ifr->ifr_reqcap ^ oldmask;
4591  		mask &= ctx->ifc_softc_ctx.isc_capabilities | IFCAP_MEXTPG;
4592  		setmask = 0;
4593  #ifdef TCP_OFFLOAD
4594  		setmask |= mask & (IFCAP_TOE4 | IFCAP_TOE6);
4595  #endif
4596  		setmask |= (mask & IFCAP_FLAGS);
4597  		setmask |= (mask & IFCAP_WOL);
4598  
4599  		/*
4600  		 * If any RX csum has changed, change all the ones that
4601  		 * are supported by the driver.
4602  		 */
4603  		if (setmask & (IFCAP_RXCSUM | IFCAP_RXCSUM_IPV6)) {
4604  			setmask |= ctx->ifc_softc_ctx.isc_capabilities &
4605  			    (IFCAP_RXCSUM | IFCAP_RXCSUM_IPV6);
4606  		}
4607  
4608  		/*
4609  		 * want to ensure that traffic has stopped before we change any of the flags
4610  		 */
4611  		if (setmask) {
4612  			CTX_LOCK(ctx);
4613  			bits = if_getdrvflags(ifp);
4614  			if (bits & IFF_DRV_RUNNING && setmask & ~IFCAP_WOL)
4615  				iflib_stop(ctx);
4616  			STATE_LOCK(ctx);
4617  			if_togglecapenable(ifp, setmask);
4618  			ctx->ifc_softc_ctx.isc_capenable ^= setmask;
4619  			STATE_UNLOCK(ctx);
4620  			if (bits & IFF_DRV_RUNNING && setmask & ~IFCAP_WOL)
4621  				iflib_init_locked(ctx);
4622  			STATE_LOCK(ctx);
4623  			if_setdrvflags(ifp, bits);
4624  			STATE_UNLOCK(ctx);
4625  			CTX_UNLOCK(ctx);
4626  		}
4627  		if_vlancap(ifp);
4628  		break;
4629  	}
4630  	case SIOCGPRIVATE_0:
4631  	case SIOCSDRVSPEC:
4632  	case SIOCGDRVSPEC:
4633  		CTX_LOCK(ctx);
4634  		err = IFDI_PRIV_IOCTL(ctx, command, data);
4635  		CTX_UNLOCK(ctx);
4636  		break;
4637  	default:
4638  		err = ether_ioctl(ifp, command, data);
4639  		break;
4640  	}
4641  	if (reinit)
4642  		iflib_if_init(ctx);
4643  	return (err);
4644  }
4645  
4646  static uint64_t
iflib_if_get_counter(if_t ifp,ift_counter cnt)4647  iflib_if_get_counter(if_t ifp, ift_counter cnt)
4648  {
4649  	if_ctx_t ctx = if_getsoftc(ifp);
4650  
4651  	return (IFDI_GET_COUNTER(ctx, cnt));
4652  }
4653  
4654  /*********************************************************************
4655   *
4656   *  OTHER FUNCTIONS EXPORTED TO THE STACK
4657   *
4658   **********************************************************************/
4659  
4660  static void
iflib_vlan_register(void * arg,if_t ifp,uint16_t vtag)4661  iflib_vlan_register(void *arg, if_t ifp, uint16_t vtag)
4662  {
4663  	if_ctx_t ctx = if_getsoftc(ifp);
4664  
4665  	if ((void *)ctx != arg)
4666  		return;
4667  
4668  	if ((vtag == 0) || (vtag > 4095))
4669  		return;
4670  
4671  	if (iflib_in_detach(ctx))
4672  		return;
4673  
4674  	CTX_LOCK(ctx);
4675  	/* Driver may need all untagged packets to be flushed */
4676  	if (IFDI_NEEDS_RESTART(ctx, IFLIB_RESTART_VLAN_CONFIG))
4677  		iflib_stop(ctx);
4678  	IFDI_VLAN_REGISTER(ctx, vtag);
4679  	/* Re-init to load the changes, if required */
4680  	if (IFDI_NEEDS_RESTART(ctx, IFLIB_RESTART_VLAN_CONFIG))
4681  		iflib_init_locked(ctx);
4682  	CTX_UNLOCK(ctx);
4683  }
4684  
4685  static void
iflib_vlan_unregister(void * arg,if_t ifp,uint16_t vtag)4686  iflib_vlan_unregister(void *arg, if_t ifp, uint16_t vtag)
4687  {
4688  	if_ctx_t ctx = if_getsoftc(ifp);
4689  
4690  	if ((void *)ctx != arg)
4691  		return;
4692  
4693  	if ((vtag == 0) || (vtag > 4095))
4694  		return;
4695  
4696  	CTX_LOCK(ctx);
4697  	/* Driver may need all tagged packets to be flushed */
4698  	if (IFDI_NEEDS_RESTART(ctx, IFLIB_RESTART_VLAN_CONFIG))
4699  		iflib_stop(ctx);
4700  	IFDI_VLAN_UNREGISTER(ctx, vtag);
4701  	/* Re-init to load the changes, if required */
4702  	if (IFDI_NEEDS_RESTART(ctx, IFLIB_RESTART_VLAN_CONFIG))
4703  		iflib_init_locked(ctx);
4704  	CTX_UNLOCK(ctx);
4705  }
4706  
4707  static void
iflib_led_func(void * arg,int onoff)4708  iflib_led_func(void *arg, int onoff)
4709  {
4710  	if_ctx_t ctx = arg;
4711  
4712  	CTX_LOCK(ctx);
4713  	IFDI_LED_FUNC(ctx, onoff);
4714  	CTX_UNLOCK(ctx);
4715  }
4716  
4717  /*********************************************************************
4718   *
4719   *  BUS FUNCTION DEFINITIONS
4720   *
4721   **********************************************************************/
4722  
4723  int
iflib_device_probe(device_t dev)4724  iflib_device_probe(device_t dev)
4725  {
4726  	const pci_vendor_info_t *ent;
4727  	if_shared_ctx_t sctx;
4728  	uint16_t pci_device_id, pci_rev_id, pci_subdevice_id, pci_subvendor_id;
4729  	uint16_t pci_vendor_id;
4730  
4731  	if ((sctx = DEVICE_REGISTER(dev)) == NULL || sctx->isc_magic != IFLIB_MAGIC)
4732  		return (ENOTSUP);
4733  
4734  	pci_vendor_id = pci_get_vendor(dev);
4735  	pci_device_id = pci_get_device(dev);
4736  	pci_subvendor_id = pci_get_subvendor(dev);
4737  	pci_subdevice_id = pci_get_subdevice(dev);
4738  	pci_rev_id = pci_get_revid(dev);
4739  	if (sctx->isc_parse_devinfo != NULL)
4740  		sctx->isc_parse_devinfo(&pci_device_id, &pci_subvendor_id, &pci_subdevice_id, &pci_rev_id);
4741  
4742  	ent = sctx->isc_vendor_info;
4743  	while (ent->pvi_vendor_id != 0) {
4744  		if (pci_vendor_id != ent->pvi_vendor_id) {
4745  			ent++;
4746  			continue;
4747  		}
4748  		if ((pci_device_id == ent->pvi_device_id) &&
4749  		    ((pci_subvendor_id == ent->pvi_subvendor_id) ||
4750  		     (ent->pvi_subvendor_id == 0)) &&
4751  		    ((pci_subdevice_id == ent->pvi_subdevice_id) ||
4752  		     (ent->pvi_subdevice_id == 0)) &&
4753  		    ((pci_rev_id == ent->pvi_rev_id) ||
4754  		     (ent->pvi_rev_id == 0))) {
4755  			device_set_desc_copy(dev, ent->pvi_name);
4756  			/* this needs to be changed to zero if the bus probing code
4757  			 * ever stops re-probing on best match because the sctx
4758  			 * may have its values over written by register calls
4759  			 * in subsequent probes
4760  			 */
4761  			return (BUS_PROBE_DEFAULT);
4762  		}
4763  		ent++;
4764  	}
4765  	return (ENXIO);
4766  }
4767  
4768  int
iflib_device_probe_vendor(device_t dev)4769  iflib_device_probe_vendor(device_t dev)
4770  {
4771  	int probe;
4772  
4773  	probe = iflib_device_probe(dev);
4774  	if (probe == BUS_PROBE_DEFAULT)
4775  		return (BUS_PROBE_VENDOR);
4776  	else
4777  		return (probe);
4778  }
4779  
4780  static void
iflib_reset_qvalues(if_ctx_t ctx)4781  iflib_reset_qvalues(if_ctx_t ctx)
4782  {
4783  	if_softc_ctx_t scctx = &ctx->ifc_softc_ctx;
4784  	if_shared_ctx_t sctx = ctx->ifc_sctx;
4785  	device_t dev = ctx->ifc_dev;
4786  	int i;
4787  
4788  	if (ctx->ifc_sysctl_ntxqs != 0)
4789  		scctx->isc_ntxqsets = ctx->ifc_sysctl_ntxqs;
4790  	if (ctx->ifc_sysctl_nrxqs != 0)
4791  		scctx->isc_nrxqsets = ctx->ifc_sysctl_nrxqs;
4792  
4793  	for (i = 0; i < sctx->isc_ntxqs; i++) {
4794  		if (ctx->ifc_sysctl_ntxds[i] != 0)
4795  			scctx->isc_ntxd[i] = ctx->ifc_sysctl_ntxds[i];
4796  		else
4797  			scctx->isc_ntxd[i] = sctx->isc_ntxd_default[i];
4798  	}
4799  
4800  	for (i = 0; i < sctx->isc_nrxqs; i++) {
4801  		if (ctx->ifc_sysctl_nrxds[i] != 0)
4802  			scctx->isc_nrxd[i] = ctx->ifc_sysctl_nrxds[i];
4803  		else
4804  			scctx->isc_nrxd[i] = sctx->isc_nrxd_default[i];
4805  	}
4806  
4807  	for (i = 0; i < sctx->isc_nrxqs; i++) {
4808  		if (scctx->isc_nrxd[i] < sctx->isc_nrxd_min[i]) {
4809  			device_printf(dev, "nrxd%d: %d less than nrxd_min %d - resetting to min\n",
4810  				      i, scctx->isc_nrxd[i], sctx->isc_nrxd_min[i]);
4811  			scctx->isc_nrxd[i] = sctx->isc_nrxd_min[i];
4812  		}
4813  		if (scctx->isc_nrxd[i] > sctx->isc_nrxd_max[i]) {
4814  			device_printf(dev, "nrxd%d: %d greater than nrxd_max %d - resetting to max\n",
4815  				      i, scctx->isc_nrxd[i], sctx->isc_nrxd_max[i]);
4816  			scctx->isc_nrxd[i] = sctx->isc_nrxd_max[i];
4817  		}
4818  		if (!powerof2(scctx->isc_nrxd[i])) {
4819  			device_printf(dev, "nrxd%d: %d is not a power of 2 - using default value of %d\n",
4820  				      i, scctx->isc_nrxd[i], sctx->isc_nrxd_default[i]);
4821  			scctx->isc_nrxd[i] = sctx->isc_nrxd_default[i];
4822  		}
4823  	}
4824  
4825  	for (i = 0; i < sctx->isc_ntxqs; i++) {
4826  		if (scctx->isc_ntxd[i] < sctx->isc_ntxd_min[i]) {
4827  			device_printf(dev, "ntxd%d: %d less than ntxd_min %d - resetting to min\n",
4828  				      i, scctx->isc_ntxd[i], sctx->isc_ntxd_min[i]);
4829  			scctx->isc_ntxd[i] = sctx->isc_ntxd_min[i];
4830  		}
4831  		if (scctx->isc_ntxd[i] > sctx->isc_ntxd_max[i]) {
4832  			device_printf(dev, "ntxd%d: %d greater than ntxd_max %d - resetting to max\n",
4833  				      i, scctx->isc_ntxd[i], sctx->isc_ntxd_max[i]);
4834  			scctx->isc_ntxd[i] = sctx->isc_ntxd_max[i];
4835  		}
4836  		if (!powerof2(scctx->isc_ntxd[i])) {
4837  			device_printf(dev, "ntxd%d: %d is not a power of 2 - using default value of %d\n",
4838  				      i, scctx->isc_ntxd[i], sctx->isc_ntxd_default[i]);
4839  			scctx->isc_ntxd[i] = sctx->isc_ntxd_default[i];
4840  		}
4841  	}
4842  }
4843  
4844  static void
iflib_add_pfil(if_ctx_t ctx)4845  iflib_add_pfil(if_ctx_t ctx)
4846  {
4847  	struct pfil_head *pfil;
4848  	struct pfil_head_args pa;
4849  	iflib_rxq_t rxq;
4850  	int i;
4851  
4852  	pa.pa_version = PFIL_VERSION;
4853  	pa.pa_flags = PFIL_IN;
4854  	pa.pa_type = PFIL_TYPE_ETHERNET;
4855  	pa.pa_headname = if_name(ctx->ifc_ifp);
4856  	pfil = pfil_head_register(&pa);
4857  
4858  	for (i = 0, rxq = ctx->ifc_rxqs; i < NRXQSETS(ctx); i++, rxq++) {
4859  		rxq->pfil = pfil;
4860  	}
4861  }
4862  
4863  static void
iflib_rem_pfil(if_ctx_t ctx)4864  iflib_rem_pfil(if_ctx_t ctx)
4865  {
4866  	struct pfil_head *pfil;
4867  	iflib_rxq_t rxq;
4868  	int i;
4869  
4870  	rxq = ctx->ifc_rxqs;
4871  	pfil = rxq->pfil;
4872  	for (i = 0; i < NRXQSETS(ctx); i++, rxq++) {
4873  		rxq->pfil = NULL;
4874  	}
4875  	pfil_head_unregister(pfil);
4876  }
4877  
4878  
4879  /*
4880   * Advance forward by n members of the cpuset ctx->ifc_cpus starting from
4881   * cpuid and wrapping as necessary.
4882   */
4883  static unsigned int
cpuid_advance(if_ctx_t ctx,unsigned int cpuid,unsigned int n)4884  cpuid_advance(if_ctx_t ctx, unsigned int cpuid, unsigned int n)
4885  {
4886  	unsigned int first_valid;
4887  	unsigned int last_valid;
4888  
4889  	/* cpuid should always be in the valid set */
4890  	MPASS(CPU_ISSET(cpuid, &ctx->ifc_cpus));
4891  
4892  	/* valid set should never be empty */
4893  	MPASS(!CPU_EMPTY(&ctx->ifc_cpus));
4894  
4895  	first_valid = CPU_FFS(&ctx->ifc_cpus) - 1;
4896  	last_valid = CPU_FLS(&ctx->ifc_cpus) - 1;
4897  	n = n % CPU_COUNT(&ctx->ifc_cpus);
4898  	while (n > 0) {
4899  		do {
4900  			cpuid++;
4901  			if (cpuid > last_valid)
4902  				cpuid = first_valid;
4903  		} while (!CPU_ISSET(cpuid, &ctx->ifc_cpus));
4904  		n--;
4905  	}
4906  
4907  	return (cpuid);
4908  }
4909  
4910  #if defined(SMP) && defined(SCHED_ULE)
4911  extern struct cpu_group *cpu_top;              /* CPU topology */
4912  
4913  static int
find_child_with_core(int cpu,struct cpu_group * grp)4914  find_child_with_core(int cpu, struct cpu_group *grp)
4915  {
4916  	int i;
4917  
4918  	if (grp->cg_children == 0)
4919  		return (-1);
4920  
4921  	MPASS(grp->cg_child);
4922  	for (i = 0; i < grp->cg_children; i++) {
4923  		if (CPU_ISSET(cpu, &grp->cg_child[i].cg_mask))
4924  			return (i);
4925  	}
4926  
4927  	return (-1);
4928  }
4929  
4930  
4931  /*
4932   * Find an L2 neighbor of the given CPU or return -1 if none found.  This
4933   * does not distinguish among multiple L2 neighbors if the given CPU has
4934   * more than one (it will always return the same result in that case).
4935   */
4936  static int
find_l2_neighbor(int cpu)4937  find_l2_neighbor(int cpu)
4938  {
4939  	struct cpu_group *grp;
4940  	int i;
4941  
4942  	grp = cpu_top;
4943  	if (grp == NULL)
4944  		return (-1);
4945  
4946  	/*
4947  	 * Find the smallest CPU group that contains the given core.
4948  	 */
4949  	i = 0;
4950  	while ((i = find_child_with_core(cpu, grp)) != -1) {
4951  		/*
4952  		 * If the smallest group containing the given CPU has less
4953  		 * than two members, we conclude the given CPU has no
4954  		 * L2 neighbor.
4955  		 */
4956  		if (grp->cg_child[i].cg_count <= 1)
4957  			return (-1);
4958  		grp = &grp->cg_child[i];
4959  	}
4960  
4961  	/* Must share L2. */
4962  	if (grp->cg_level > CG_SHARE_L2 || grp->cg_level == CG_SHARE_NONE)
4963  		return (-1);
4964  
4965  	/*
4966  	 * Select the first member of the set that isn't the reference
4967  	 * CPU, which at this point is guaranteed to exist.
4968  	 */
4969  	for (i = 0; i < CPU_SETSIZE; i++) {
4970  		if (CPU_ISSET(i, &grp->cg_mask) && i != cpu)
4971  			return (i);
4972  	}
4973  
4974  	/* Should never be reached */
4975  	return (-1);
4976  }
4977  
4978  #else
4979  static int
find_l2_neighbor(int cpu)4980  find_l2_neighbor(int cpu)
4981  {
4982  
4983  	return (-1);
4984  }
4985  #endif
4986  
4987  /*
4988   * CPU mapping behaviors
4989   * ---------------------
4990   * 'separate txrx' refers to the separate_txrx sysctl
4991   * 'use logical' refers to the use_logical_cores sysctl
4992   * 'INTR CPUS' indicates whether bus_get_cpus(INTR_CPUS) succeeded
4993   *
4994   *  separate     use     INTR
4995   *    txrx     logical   CPUS   result
4996   * ---------- --------- ------ ------------------------------------------------
4997   *     -          -       X     RX and TX queues mapped to consecutive physical
4998   *                              cores with RX/TX pairs on same core and excess
4999   *                              of either following
5000   *     -          X       X     RX and TX queues mapped to consecutive cores
5001   *                              of any type with RX/TX pairs on same core and
5002   *                              excess of either following
5003   *     X          -       X     RX and TX queues mapped to consecutive physical
5004   *                              cores; all RX then all TX
5005   *     X          X       X     RX queues mapped to consecutive physical cores
5006   *                              first, then TX queues mapped to L2 neighbor of
5007   *                              the corresponding RX queue if one exists,
5008   *                              otherwise to consecutive physical cores
5009   *     -         n/a      -     RX and TX queues mapped to consecutive cores of
5010   *                              any type with RX/TX pairs on same core and excess
5011   *                              of either following
5012   *     X         n/a      -     RX and TX queues mapped to consecutive cores of
5013   *                              any type; all RX then all TX
5014   */
5015  static unsigned int
get_cpuid_for_queue(if_ctx_t ctx,unsigned int base_cpuid,unsigned int qid,bool is_tx)5016  get_cpuid_for_queue(if_ctx_t ctx, unsigned int base_cpuid, unsigned int qid,
5017      bool is_tx)
5018  {
5019  	if_softc_ctx_t scctx = &ctx->ifc_softc_ctx;
5020  	unsigned int core_index;
5021  
5022  	if (ctx->ifc_sysctl_separate_txrx) {
5023  		/*
5024  		 * When using separate CPUs for TX and RX, the assignment
5025  		 * will always be of a consecutive CPU out of the set of
5026  		 * context CPUs, except for the specific case where the
5027  		 * context CPUs are phsyical cores, the use of logical cores
5028  		 * has been enabled, the assignment is for TX, the TX qid
5029  		 * corresponds to an RX qid, and the CPU assigned to the
5030  		 * corresponding RX queue has an L2 neighbor.
5031  		 */
5032  		if (ctx->ifc_sysctl_use_logical_cores &&
5033  		    ctx->ifc_cpus_are_physical_cores &&
5034  		    is_tx && qid < scctx->isc_nrxqsets) {
5035  			int l2_neighbor;
5036  			unsigned int rx_cpuid;
5037  
5038  			rx_cpuid = cpuid_advance(ctx, base_cpuid, qid);
5039  			l2_neighbor = find_l2_neighbor(rx_cpuid);
5040  			if (l2_neighbor != -1) {
5041  				return (l2_neighbor);
5042  			}
5043  			/*
5044  			 * ... else fall through to the normal
5045  			 * consecutive-after-RX assignment scheme.
5046  			 *
5047  			 * Note that we are assuming that all RX queue CPUs
5048  			 * have an L2 neighbor, or all do not.  If a mixed
5049  			 * scenario is possible, we will have to keep track
5050  			 * separately of how many queues prior to this one
5051  			 * were not able to be assigned to an L2 neighbor.
5052  			 */
5053  		}
5054  		if (is_tx)
5055  			core_index = scctx->isc_nrxqsets + qid;
5056  		else
5057  			core_index = qid;
5058  	} else {
5059  		core_index = qid;
5060  	}
5061  
5062  	return (cpuid_advance(ctx, base_cpuid, core_index));
5063  }
5064  
5065  static uint16_t
get_ctx_core_offset(if_ctx_t ctx)5066  get_ctx_core_offset(if_ctx_t ctx)
5067  {
5068  	if_softc_ctx_t scctx = &ctx->ifc_softc_ctx;
5069  	struct cpu_offset *op;
5070  	cpuset_t assigned_cpus;
5071  	unsigned int cores_consumed;
5072  	unsigned int base_cpuid = ctx->ifc_sysctl_core_offset;
5073  	unsigned int first_valid;
5074  	unsigned int last_valid;
5075  	unsigned int i;
5076  
5077  	first_valid = CPU_FFS(&ctx->ifc_cpus) - 1;
5078  	last_valid = CPU_FLS(&ctx->ifc_cpus) - 1;
5079  
5080  	if (base_cpuid != CORE_OFFSET_UNSPECIFIED) {
5081  		/*
5082  		 * Align the user-chosen base CPU ID to the next valid CPU
5083  		 * for this device.  If the chosen base CPU ID is smaller
5084  		 * than the first valid CPU or larger than the last valid
5085  		 * CPU, we assume the user does not know what the valid
5086  		 * range is for this device and is thinking in terms of a
5087  		 * zero-based reference frame, and so we shift the given
5088  		 * value into the valid range (and wrap accordingly) so the
5089  		 * intent is translated to the proper frame of reference.
5090  		 * If the base CPU ID is within the valid first/last, but
5091  		 * does not correspond to a valid CPU, it is advanced to the
5092  		 * next valid CPU (wrapping if necessary).
5093  		 */
5094  		if (base_cpuid < first_valid || base_cpuid > last_valid) {
5095  			/* shift from zero-based to first_valid-based */
5096  			base_cpuid += first_valid;
5097  			/* wrap to range [first_valid, last_valid] */
5098  			base_cpuid = (base_cpuid - first_valid) %
5099  			    (last_valid - first_valid + 1);
5100  		}
5101  		if (!CPU_ISSET(base_cpuid, &ctx->ifc_cpus)) {
5102  			/*
5103  			 * base_cpuid is in [first_valid, last_valid], but
5104  			 * not a member of the valid set.  In this case,
5105  			 * there will always be a member of the valid set
5106  			 * with a CPU ID that is greater than base_cpuid,
5107  			 * and we simply advance to it.
5108  			 */
5109  			while (!CPU_ISSET(base_cpuid, &ctx->ifc_cpus))
5110  				base_cpuid++;
5111  		}
5112  		return (base_cpuid);
5113  	}
5114  
5115  	/*
5116  	 * Determine how many cores will be consumed by performing the CPU
5117  	 * assignments and counting how many of the assigned CPUs correspond
5118  	 * to CPUs in the set of context CPUs.  This is done using the CPU
5119  	 * ID first_valid as the base CPU ID, as the base CPU must be within
5120  	 * the set of context CPUs.
5121  	 *
5122  	 * Note not all assigned CPUs will be in the set of context CPUs
5123  	 * when separate CPUs are being allocated to TX and RX queues,
5124  	 * assignment to logical cores has been enabled, the set of context
5125  	 * CPUs contains only physical CPUs, and TX queues are mapped to L2
5126  	 * neighbors of CPUs that RX queues have been mapped to - in this
5127  	 * case we do only want to count how many CPUs in the set of context
5128  	 * CPUs have been consumed, as that determines the next CPU in that
5129  	 * set to start allocating at for the next device for which
5130  	 * core_offset is not set.
5131  	 */
5132  	CPU_ZERO(&assigned_cpus);
5133  	for (i = 0; i < scctx->isc_ntxqsets; i++)
5134  		CPU_SET(get_cpuid_for_queue(ctx, first_valid, i, true),
5135  		    &assigned_cpus);
5136  	for (i = 0; i < scctx->isc_nrxqsets; i++)
5137  		CPU_SET(get_cpuid_for_queue(ctx, first_valid, i, false),
5138  		    &assigned_cpus);
5139  	CPU_AND(&assigned_cpus, &assigned_cpus, &ctx->ifc_cpus);
5140  	cores_consumed = CPU_COUNT(&assigned_cpus);
5141  
5142  	mtx_lock(&cpu_offset_mtx);
5143  	SLIST_FOREACH(op, &cpu_offsets, entries) {
5144  		if (CPU_CMP(&ctx->ifc_cpus, &op->set) == 0) {
5145  			base_cpuid = op->next_cpuid;
5146  			op->next_cpuid = cpuid_advance(ctx, op->next_cpuid,
5147  			    cores_consumed);
5148  			MPASS(op->refcount < UINT_MAX);
5149  			op->refcount++;
5150  			break;
5151  		}
5152  	}
5153  	if (base_cpuid == CORE_OFFSET_UNSPECIFIED) {
5154  		base_cpuid = first_valid;
5155  		op = malloc(sizeof(struct cpu_offset), M_IFLIB,
5156  		    M_NOWAIT | M_ZERO);
5157  		if (op == NULL) {
5158  			device_printf(ctx->ifc_dev,
5159  			    "allocation for cpu offset failed.\n");
5160  		} else {
5161  			op->next_cpuid = cpuid_advance(ctx, base_cpuid,
5162  			    cores_consumed);
5163  			op->refcount = 1;
5164  			CPU_COPY(&ctx->ifc_cpus, &op->set);
5165  			SLIST_INSERT_HEAD(&cpu_offsets, op, entries);
5166  		}
5167  	}
5168  	mtx_unlock(&cpu_offset_mtx);
5169  
5170  	return (base_cpuid);
5171  }
5172  
5173  static void
unref_ctx_core_offset(if_ctx_t ctx)5174  unref_ctx_core_offset(if_ctx_t ctx)
5175  {
5176  	struct cpu_offset *op, *top;
5177  
5178  	mtx_lock(&cpu_offset_mtx);
5179  	SLIST_FOREACH_SAFE(op, &cpu_offsets, entries, top) {
5180  		if (CPU_CMP(&ctx->ifc_cpus, &op->set) == 0) {
5181  			MPASS(op->refcount > 0);
5182  			op->refcount--;
5183  			if (op->refcount == 0) {
5184  				SLIST_REMOVE(&cpu_offsets, op, cpu_offset, entries);
5185  				free(op, M_IFLIB);
5186  			}
5187  			break;
5188  		}
5189  	}
5190  	mtx_unlock(&cpu_offset_mtx);
5191  }
5192  
5193  int
iflib_device_register(device_t dev,void * sc,if_shared_ctx_t sctx,if_ctx_t * ctxp)5194  iflib_device_register(device_t dev, void *sc, if_shared_ctx_t sctx, if_ctx_t *ctxp)
5195  {
5196  	if_ctx_t ctx;
5197  	if_t ifp;
5198  	if_softc_ctx_t scctx;
5199  	kobjop_desc_t kobj_desc;
5200  	kobj_method_t *kobj_method;
5201  	int err, msix, rid;
5202  	int num_txd, num_rxd;
5203  	char namebuf[TASKQUEUE_NAMELEN];
5204  
5205  	ctx = malloc(sizeof(*ctx), M_IFLIB, M_WAITOK | M_ZERO);
5206  
5207  	if (sc == NULL) {
5208  		sc = malloc(sctx->isc_driver->size, M_IFLIB, M_WAITOK | M_ZERO);
5209  		device_set_softc(dev, ctx);
5210  		ctx->ifc_flags |= IFC_SC_ALLOCATED;
5211  	}
5212  
5213  	ctx->ifc_sctx = sctx;
5214  	ctx->ifc_dev = dev;
5215  	ctx->ifc_softc = sc;
5216  
5217  	if ((err = iflib_register(ctx)) != 0) {
5218  		device_printf(dev, "iflib_register failed %d\n", err);
5219  		goto fail_ctx_free;
5220  	}
5221  	iflib_add_device_sysctl_pre(ctx);
5222  
5223  	scctx = &ctx->ifc_softc_ctx;
5224  	ifp = ctx->ifc_ifp;
5225  
5226  	iflib_reset_qvalues(ctx);
5227  	IFNET_WLOCK();
5228  	CTX_LOCK(ctx);
5229  	if ((err = IFDI_ATTACH_PRE(ctx)) != 0) {
5230  		device_printf(dev, "IFDI_ATTACH_PRE failed %d\n", err);
5231  		goto fail_unlock;
5232  	}
5233  	_iflib_pre_assert(scctx);
5234  	ctx->ifc_txrx = *scctx->isc_txrx;
5235  
5236  	MPASS(scctx->isc_dma_width <= flsll(BUS_SPACE_MAXADDR));
5237  
5238  	if (sctx->isc_flags & IFLIB_DRIVER_MEDIA)
5239  		ctx->ifc_mediap = scctx->isc_media;
5240  
5241  #ifdef INVARIANTS
5242  	if (scctx->isc_capabilities & IFCAP_TXCSUM)
5243  		MPASS(scctx->isc_tx_csum_flags);
5244  #endif
5245  
5246  	if_setcapabilities(ifp,
5247  	    scctx->isc_capabilities | IFCAP_HWSTATS | IFCAP_MEXTPG);
5248  	if_setcapenable(ifp,
5249  	    scctx->isc_capenable | IFCAP_HWSTATS | IFCAP_MEXTPG);
5250  
5251  	if (scctx->isc_ntxqsets == 0 || (scctx->isc_ntxqsets_max && scctx->isc_ntxqsets_max < scctx->isc_ntxqsets))
5252  		scctx->isc_ntxqsets = scctx->isc_ntxqsets_max;
5253  	if (scctx->isc_nrxqsets == 0 || (scctx->isc_nrxqsets_max && scctx->isc_nrxqsets_max < scctx->isc_nrxqsets))
5254  		scctx->isc_nrxqsets = scctx->isc_nrxqsets_max;
5255  
5256  	num_txd = iflib_num_tx_descs(ctx);
5257  	num_rxd = iflib_num_rx_descs(ctx);
5258  
5259  	/* XXX change for per-queue sizes */
5260  	device_printf(dev, "Using %d TX descriptors and %d RX descriptors\n",
5261  	    num_txd, num_rxd);
5262  
5263  	if (scctx->isc_tx_nsegments > num_txd / MAX_SINGLE_PACKET_FRACTION)
5264  		scctx->isc_tx_nsegments = max(1, num_txd /
5265  		    MAX_SINGLE_PACKET_FRACTION);
5266  	if (scctx->isc_tx_tso_segments_max > num_txd /
5267  	    MAX_SINGLE_PACKET_FRACTION)
5268  		scctx->isc_tx_tso_segments_max = max(1,
5269  		    num_txd / MAX_SINGLE_PACKET_FRACTION);
5270  
5271  	/* TSO parameters - dig these out of the data sheet - simply correspond to tag setup */
5272  	if (if_getcapabilities(ifp) & IFCAP_TSO) {
5273  		/*
5274  		 * The stack can't handle a TSO size larger than IP_MAXPACKET,
5275  		 * but some MACs do.
5276  		 */
5277  		if_sethwtsomax(ifp, min(scctx->isc_tx_tso_size_max,
5278  		    IP_MAXPACKET));
5279  		/*
5280  		 * Take maximum number of m_pullup(9)'s in iflib_parse_header()
5281  		 * into account.  In the worst case, each of these calls will
5282  		 * add another mbuf and, thus, the requirement for another DMA
5283  		 * segment.  So for best performance, it doesn't make sense to
5284  		 * advertize a maximum of TSO segments that typically will
5285  		 * require defragmentation in iflib_encap().
5286  		 */
5287  		if_sethwtsomaxsegcount(ifp, scctx->isc_tx_tso_segments_max - 3);
5288  		if_sethwtsomaxsegsize(ifp, scctx->isc_tx_tso_segsize_max);
5289  	}
5290  	if (scctx->isc_rss_table_size == 0)
5291  		scctx->isc_rss_table_size = 64;
5292  	scctx->isc_rss_table_mask = scctx->isc_rss_table_size - 1;
5293  
5294  	/* Create and start admin taskqueue */
5295  	snprintf(namebuf, TASKQUEUE_NAMELEN, "if_%s_tq", device_get_nameunit(dev));
5296  	ctx->ifc_tq = taskqueue_create_fast(namebuf, M_NOWAIT,
5297  	    taskqueue_thread_enqueue, &ctx->ifc_tq);
5298  	if (ctx->ifc_tq == NULL) {
5299  		device_printf(dev, "Unable to create admin taskqueue\n");
5300  		return (ENOMEM);
5301  	}
5302  
5303  	err = taskqueue_start_threads(&ctx->ifc_tq, 1, PI_NET, "%s", namebuf);
5304  	if (err) {
5305  		device_printf(dev,
5306  		    "Unable to start admin taskqueue threads error: %d\n",
5307  		    err);
5308  		taskqueue_free(ctx->ifc_tq);
5309  		return (err);
5310  	}
5311  
5312  	TASK_INIT(&ctx->ifc_admin_task, 0, _task_fn_admin, ctx);
5313  
5314  	/* Set up cpu set.  If it fails, use the set of all CPUs. */
5315  	if (bus_get_cpus(dev, INTR_CPUS, sizeof(ctx->ifc_cpus), &ctx->ifc_cpus) != 0) {
5316  		device_printf(dev, "Unable to fetch CPU list\n");
5317  		CPU_COPY(&all_cpus, &ctx->ifc_cpus);
5318  		ctx->ifc_cpus_are_physical_cores = false;
5319  	} else
5320  		ctx->ifc_cpus_are_physical_cores = true;
5321  	MPASS(CPU_COUNT(&ctx->ifc_cpus) > 0);
5322  
5323  	/*
5324  	 * Now set up MSI or MSI-X, should return us the number of supported
5325  	 * vectors (will be 1 for a legacy interrupt and MSI).
5326  	 */
5327  	if (sctx->isc_flags & IFLIB_SKIP_MSIX) {
5328  		msix = scctx->isc_vectors;
5329  	} else if (scctx->isc_msix_bar != 0)
5330  		/*
5331  		 * The simple fact that isc_msix_bar is not 0 does not mean we
5332  		 * we have a good value there that is known to work.
5333  		 */
5334  		msix = iflib_msix_init(ctx);
5335  	else {
5336  		scctx->isc_vectors = 1;
5337  		scctx->isc_ntxqsets = 1;
5338  		scctx->isc_nrxqsets = 1;
5339  		scctx->isc_intr = IFLIB_INTR_LEGACY;
5340  		msix = 0;
5341  	}
5342  	/* Get memory for the station queues */
5343  	if ((err = iflib_queues_alloc(ctx))) {
5344  		device_printf(dev, "Unable to allocate queue memory\n");
5345  		goto fail_intr_free;
5346  	}
5347  
5348  	if ((err = iflib_qset_structures_setup(ctx)))
5349  		goto fail_queues;
5350  
5351  	/*
5352  	 * Now that we know how many queues there are, get the core offset.
5353  	 */
5354  	ctx->ifc_sysctl_core_offset = get_ctx_core_offset(ctx);
5355  
5356  	if (msix > 1) {
5357  		/*
5358  		 * When using MSI-X, ensure that ifdi_{r,t}x_queue_intr_enable
5359  		 * aren't the default NULL implementation.
5360  		 */
5361  		kobj_desc = &ifdi_rx_queue_intr_enable_desc;
5362  		kobj_method = kobj_lookup_method(((kobj_t)ctx)->ops->cls, NULL,
5363  		    kobj_desc);
5364  		if (kobj_method == &kobj_desc->deflt) {
5365  			device_printf(dev,
5366  			    "MSI-X requires ifdi_rx_queue_intr_enable method");
5367  			err = EOPNOTSUPP;
5368  			goto fail_queues;
5369  		}
5370  		kobj_desc = &ifdi_tx_queue_intr_enable_desc;
5371  		kobj_method = kobj_lookup_method(((kobj_t)ctx)->ops->cls, NULL,
5372  		    kobj_desc);
5373  		if (kobj_method == &kobj_desc->deflt) {
5374  			device_printf(dev,
5375  			    "MSI-X requires ifdi_tx_queue_intr_enable method");
5376  			err = EOPNOTSUPP;
5377  			goto fail_queues;
5378  		}
5379  
5380  		/*
5381  		 * Assign the MSI-X vectors.
5382  		 * Note that the default NULL ifdi_msix_intr_assign method will
5383  		 * fail here, too.
5384  		 */
5385  		err = IFDI_MSIX_INTR_ASSIGN(ctx, msix);
5386  		if (err != 0) {
5387  			device_printf(dev, "IFDI_MSIX_INTR_ASSIGN failed %d\n",
5388  			    err);
5389  			goto fail_queues;
5390  		}
5391  	} else if (scctx->isc_intr != IFLIB_INTR_MSIX) {
5392  		rid = 0;
5393  		if (scctx->isc_intr == IFLIB_INTR_MSI) {
5394  			MPASS(msix == 1);
5395  			rid = 1;
5396  		}
5397  		if ((err = iflib_legacy_setup(ctx, ctx->isc_legacy_intr, ctx->ifc_softc, &rid, "irq0")) != 0) {
5398  			device_printf(dev, "iflib_legacy_setup failed %d\n", err);
5399  			goto fail_queues;
5400  		}
5401  	} else {
5402  		device_printf(dev,
5403  		    "Cannot use iflib with only 1 MSI-X interrupt!\n");
5404  		err = ENODEV;
5405  		goto fail_queues;
5406  	}
5407  
5408  	/*
5409  	 * It prevents a double-locking panic with iflib_media_status when
5410  	 * the driver loads.
5411  	 */
5412  	CTX_UNLOCK(ctx);
5413  	ether_ifattach(ctx->ifc_ifp, ctx->ifc_mac.octet);
5414  	CTX_LOCK(ctx);
5415  
5416  	if ((err = IFDI_ATTACH_POST(ctx)) != 0) {
5417  		device_printf(dev, "IFDI_ATTACH_POST failed %d\n", err);
5418  		goto fail_detach;
5419  	}
5420  
5421  	/*
5422  	 * Tell the upper layer(s) if IFCAP_VLAN_MTU is supported.
5423  	 * This must appear after the call to ether_ifattach() because
5424  	 * ether_ifattach() sets if_hdrlen to the default value.
5425  	 */
5426  	if (if_getcapabilities(ifp) & IFCAP_VLAN_MTU)
5427  		if_setifheaderlen(ifp, sizeof(struct ether_vlan_header));
5428  
5429  	if ((err = iflib_netmap_attach(ctx))) {
5430  		device_printf(ctx->ifc_dev, "netmap attach failed: %d\n", err);
5431  		goto fail_detach;
5432  	}
5433  	*ctxp = ctx;
5434  
5435  	DEBUGNET_SET(ctx->ifc_ifp, iflib);
5436  
5437  	if_setgetcounterfn(ctx->ifc_ifp, iflib_if_get_counter);
5438  	iflib_add_device_sysctl_post(ctx);
5439  	iflib_add_pfil(ctx);
5440  	ctx->ifc_flags |= IFC_INIT_DONE;
5441  	CTX_UNLOCK(ctx);
5442  	IFNET_WUNLOCK();
5443  
5444  	return (0);
5445  
5446  fail_detach:
5447  	ether_ifdetach(ctx->ifc_ifp);
5448  fail_queues:
5449  	taskqueue_free(ctx->ifc_tq);
5450  	iflib_tqg_detach(ctx);
5451  	iflib_tx_structures_free(ctx);
5452  	iflib_rx_structures_free(ctx);
5453  	IFDI_DETACH(ctx);
5454  	IFDI_QUEUES_FREE(ctx);
5455  fail_intr_free:
5456  	iflib_free_intr_mem(ctx);
5457  fail_unlock:
5458  	CTX_UNLOCK(ctx);
5459  	IFNET_WUNLOCK();
5460  	iflib_deregister(ctx);
5461  fail_ctx_free:
5462  	device_set_softc(ctx->ifc_dev, NULL);
5463          if (ctx->ifc_flags & IFC_SC_ALLOCATED)
5464                  free(ctx->ifc_softc, M_IFLIB);
5465          free(ctx, M_IFLIB);
5466  	return (err);
5467  }
5468  
5469  int
iflib_device_attach(device_t dev)5470  iflib_device_attach(device_t dev)
5471  {
5472  	if_ctx_t ctx;
5473  	if_shared_ctx_t sctx;
5474  
5475  	if ((sctx = DEVICE_REGISTER(dev)) == NULL || sctx->isc_magic != IFLIB_MAGIC)
5476  		return (ENOTSUP);
5477  
5478  	pci_enable_busmaster(dev);
5479  
5480  	return (iflib_device_register(dev, NULL, sctx, &ctx));
5481  }
5482  
5483  int
iflib_device_deregister(if_ctx_t ctx)5484  iflib_device_deregister(if_ctx_t ctx)
5485  {
5486  	if_t ifp = ctx->ifc_ifp;
5487  	device_t dev = ctx->ifc_dev;
5488  
5489  	/* Make sure VLANS are not using driver */
5490  	if (if_vlantrunkinuse(ifp)) {
5491  		device_printf(dev, "Vlan in use, detach first\n");
5492  		return (EBUSY);
5493  	}
5494  #ifdef PCI_IOV
5495  	if (!CTX_IS_VF(ctx) && pci_iov_detach(dev) != 0) {
5496  		device_printf(dev, "SR-IOV in use; detach first.\n");
5497  		return (EBUSY);
5498  	}
5499  #endif
5500  
5501  	STATE_LOCK(ctx);
5502  	ctx->ifc_flags |= IFC_IN_DETACH;
5503  	STATE_UNLOCK(ctx);
5504  
5505  	/* Unregister VLAN handlers before calling iflib_stop() */
5506  	iflib_unregister_vlan_handlers(ctx);
5507  
5508  	iflib_netmap_detach(ifp);
5509  	ether_ifdetach(ifp);
5510  
5511  	CTX_LOCK(ctx);
5512  	iflib_stop(ctx);
5513  	CTX_UNLOCK(ctx);
5514  
5515  	iflib_rem_pfil(ctx);
5516  	if (ctx->ifc_led_dev != NULL)
5517  		led_destroy(ctx->ifc_led_dev);
5518  
5519  	iflib_tqg_detach(ctx);
5520  	iflib_tx_structures_free(ctx);
5521  	iflib_rx_structures_free(ctx);
5522  
5523  	CTX_LOCK(ctx);
5524  	IFDI_DETACH(ctx);
5525  	IFDI_QUEUES_FREE(ctx);
5526  	CTX_UNLOCK(ctx);
5527  
5528  	taskqueue_free(ctx->ifc_tq);
5529  	ctx->ifc_tq = NULL;
5530  
5531  	/* ether_ifdetach calls if_qflush - lock must be destroy afterwards*/
5532  	iflib_free_intr_mem(ctx);
5533  
5534  	bus_generic_detach(dev);
5535  
5536  	iflib_deregister(ctx);
5537  
5538  	device_set_softc(ctx->ifc_dev, NULL);
5539  	if (ctx->ifc_flags & IFC_SC_ALLOCATED)
5540  		free(ctx->ifc_softc, M_IFLIB);
5541  	unref_ctx_core_offset(ctx);
5542  	free(ctx, M_IFLIB);
5543  	return (0);
5544  }
5545  
5546  static void
iflib_tqg_detach(if_ctx_t ctx)5547  iflib_tqg_detach(if_ctx_t ctx)
5548  {
5549  	iflib_txq_t txq;
5550  	iflib_rxq_t rxq;
5551  	int i;
5552  	struct taskqgroup *tqg;
5553  
5554  	/* XXX drain any dependent tasks */
5555  	tqg = qgroup_if_io_tqg;
5556  	for (txq = ctx->ifc_txqs, i = 0; i < NTXQSETS(ctx); i++, txq++) {
5557  		callout_drain(&txq->ift_timer);
5558  #ifdef DEV_NETMAP
5559  		callout_drain(&txq->ift_netmap_timer);
5560  #endif /* DEV_NETMAP */
5561  		if (txq->ift_task.gt_uniq != NULL)
5562  			taskqgroup_detach(tqg, &txq->ift_task);
5563  	}
5564  	for (i = 0, rxq = ctx->ifc_rxqs; i < NRXQSETS(ctx); i++, rxq++) {
5565  		if (rxq->ifr_task.gt_uniq != NULL)
5566  			taskqgroup_detach(tqg, &rxq->ifr_task);
5567  	}
5568  }
5569  
5570  static void
iflib_free_intr_mem(if_ctx_t ctx)5571  iflib_free_intr_mem(if_ctx_t ctx)
5572  {
5573  
5574  	if (ctx->ifc_softc_ctx.isc_intr != IFLIB_INTR_MSIX) {
5575  		iflib_irq_free(ctx, &ctx->ifc_legacy_irq);
5576  	}
5577  	if (ctx->ifc_softc_ctx.isc_intr != IFLIB_INTR_LEGACY) {
5578  		pci_release_msi(ctx->ifc_dev);
5579  	}
5580  	if (ctx->ifc_msix_mem != NULL) {
5581  		bus_release_resource(ctx->ifc_dev, SYS_RES_MEMORY,
5582  		    rman_get_rid(ctx->ifc_msix_mem), ctx->ifc_msix_mem);
5583  		ctx->ifc_msix_mem = NULL;
5584  	}
5585  }
5586  
5587  int
iflib_device_detach(device_t dev)5588  iflib_device_detach(device_t dev)
5589  {
5590  	if_ctx_t ctx = device_get_softc(dev);
5591  
5592  	return (iflib_device_deregister(ctx));
5593  }
5594  
5595  int
iflib_device_suspend(device_t dev)5596  iflib_device_suspend(device_t dev)
5597  {
5598  	if_ctx_t ctx = device_get_softc(dev);
5599  
5600  	CTX_LOCK(ctx);
5601  	IFDI_SUSPEND(ctx);
5602  	CTX_UNLOCK(ctx);
5603  
5604  	return (bus_generic_suspend(dev));
5605  }
5606  int
iflib_device_shutdown(device_t dev)5607  iflib_device_shutdown(device_t dev)
5608  {
5609  	if_ctx_t ctx = device_get_softc(dev);
5610  
5611  	CTX_LOCK(ctx);
5612  	IFDI_SHUTDOWN(ctx);
5613  	CTX_UNLOCK(ctx);
5614  
5615  	return (bus_generic_suspend(dev));
5616  }
5617  
5618  int
iflib_device_resume(device_t dev)5619  iflib_device_resume(device_t dev)
5620  {
5621  	if_ctx_t ctx = device_get_softc(dev);
5622  	iflib_txq_t txq = ctx->ifc_txqs;
5623  
5624  	CTX_LOCK(ctx);
5625  	IFDI_RESUME(ctx);
5626  	iflib_if_init_locked(ctx);
5627  	CTX_UNLOCK(ctx);
5628  	for (int i = 0; i < NTXQSETS(ctx); i++, txq++)
5629  		iflib_txq_check_drain(txq, IFLIB_RESTART_BUDGET);
5630  
5631  	return (bus_generic_resume(dev));
5632  }
5633  
5634  int
iflib_device_iov_init(device_t dev,uint16_t num_vfs,const nvlist_t * params)5635  iflib_device_iov_init(device_t dev, uint16_t num_vfs, const nvlist_t *params)
5636  {
5637  	int error;
5638  	if_ctx_t ctx = device_get_softc(dev);
5639  
5640  	CTX_LOCK(ctx);
5641  	error = IFDI_IOV_INIT(ctx, num_vfs, params);
5642  	CTX_UNLOCK(ctx);
5643  
5644  	return (error);
5645  }
5646  
5647  void
iflib_device_iov_uninit(device_t dev)5648  iflib_device_iov_uninit(device_t dev)
5649  {
5650  	if_ctx_t ctx = device_get_softc(dev);
5651  
5652  	CTX_LOCK(ctx);
5653  	IFDI_IOV_UNINIT(ctx);
5654  	CTX_UNLOCK(ctx);
5655  }
5656  
5657  int
iflib_device_iov_add_vf(device_t dev,uint16_t vfnum,const nvlist_t * params)5658  iflib_device_iov_add_vf(device_t dev, uint16_t vfnum, const nvlist_t *params)
5659  {
5660  	int error;
5661  	if_ctx_t ctx = device_get_softc(dev);
5662  
5663  	CTX_LOCK(ctx);
5664  	error = IFDI_IOV_VF_ADD(ctx, vfnum, params);
5665  	CTX_UNLOCK(ctx);
5666  
5667  	return (error);
5668  }
5669  
5670  /*********************************************************************
5671   *
5672   *  MODULE FUNCTION DEFINITIONS
5673   *
5674   **********************************************************************/
5675  
5676  /*
5677   * - Start a fast taskqueue thread for each core
5678   * - Start a taskqueue for control operations
5679   */
5680  static int
iflib_module_init(void)5681  iflib_module_init(void)
5682  {
5683  	iflib_timer_default = hz / 2;
5684  	return (0);
5685  }
5686  
5687  static int
iflib_module_event_handler(module_t mod,int what,void * arg)5688  iflib_module_event_handler(module_t mod, int what, void *arg)
5689  {
5690  	int err;
5691  
5692  	switch (what) {
5693  	case MOD_LOAD:
5694  		if ((err = iflib_module_init()) != 0)
5695  			return (err);
5696  		break;
5697  	case MOD_UNLOAD:
5698  		return (EBUSY);
5699  	default:
5700  		return (EOPNOTSUPP);
5701  	}
5702  
5703  	return (0);
5704  }
5705  
5706  /*********************************************************************
5707   *
5708   *  PUBLIC FUNCTION DEFINITIONS
5709   *     ordered as in iflib.h
5710   *
5711   **********************************************************************/
5712  
5713  static void
_iflib_assert(if_shared_ctx_t sctx)5714  _iflib_assert(if_shared_ctx_t sctx)
5715  {
5716  	int i;
5717  
5718  	MPASS(sctx->isc_tx_maxsize);
5719  	MPASS(sctx->isc_tx_maxsegsize);
5720  
5721  	MPASS(sctx->isc_rx_maxsize);
5722  	MPASS(sctx->isc_rx_nsegments);
5723  	MPASS(sctx->isc_rx_maxsegsize);
5724  
5725  	MPASS(sctx->isc_nrxqs >= 1 && sctx->isc_nrxqs <= 8);
5726  	for (i = 0; i < sctx->isc_nrxqs; i++) {
5727  		MPASS(sctx->isc_nrxd_min[i]);
5728  		MPASS(powerof2(sctx->isc_nrxd_min[i]));
5729  		MPASS(sctx->isc_nrxd_max[i]);
5730  		MPASS(powerof2(sctx->isc_nrxd_max[i]));
5731  		MPASS(sctx->isc_nrxd_default[i]);
5732  		MPASS(powerof2(sctx->isc_nrxd_default[i]));
5733  	}
5734  
5735  	MPASS(sctx->isc_ntxqs >= 1 && sctx->isc_ntxqs <= 8);
5736  	for (i = 0; i < sctx->isc_ntxqs; i++) {
5737  		MPASS(sctx->isc_ntxd_min[i]);
5738  		MPASS(powerof2(sctx->isc_ntxd_min[i]));
5739  		MPASS(sctx->isc_ntxd_max[i]);
5740  		MPASS(powerof2(sctx->isc_ntxd_max[i]));
5741  		MPASS(sctx->isc_ntxd_default[i]);
5742  		MPASS(powerof2(sctx->isc_ntxd_default[i]));
5743  	}
5744  }
5745  
5746  static void
_iflib_pre_assert(if_softc_ctx_t scctx)5747  _iflib_pre_assert(if_softc_ctx_t scctx)
5748  {
5749  
5750  	MPASS(scctx->isc_txrx->ift_txd_encap);
5751  	MPASS(scctx->isc_txrx->ift_txd_flush);
5752  	MPASS(scctx->isc_txrx->ift_txd_credits_update);
5753  	MPASS(scctx->isc_txrx->ift_rxd_available);
5754  	MPASS(scctx->isc_txrx->ift_rxd_pkt_get);
5755  	MPASS(scctx->isc_txrx->ift_rxd_refill);
5756  	MPASS(scctx->isc_txrx->ift_rxd_flush);
5757  }
5758  
5759  static int
iflib_register(if_ctx_t ctx)5760  iflib_register(if_ctx_t ctx)
5761  {
5762  	if_shared_ctx_t sctx = ctx->ifc_sctx;
5763  	driver_t *driver = sctx->isc_driver;
5764  	device_t dev = ctx->ifc_dev;
5765  	if_t ifp;
5766  
5767  	_iflib_assert(sctx);
5768  
5769  	CTX_LOCK_INIT(ctx);
5770  	STATE_LOCK_INIT(ctx, device_get_nameunit(ctx->ifc_dev));
5771  	ifp = ctx->ifc_ifp = if_alloc_dev(IFT_ETHER, dev);
5772  
5773  	/*
5774  	 * Initialize our context's device specific methods
5775  	 */
5776  	kobj_init((kobj_t) ctx, (kobj_class_t) driver);
5777  	kobj_class_compile((kobj_class_t) driver);
5778  
5779  	if_initname(ifp, device_get_name(dev), device_get_unit(dev));
5780  	if_setsoftc(ifp, ctx);
5781  	if_setdev(ifp, dev);
5782  	if_setinitfn(ifp, iflib_if_init);
5783  	if_setioctlfn(ifp, iflib_if_ioctl);
5784  #ifdef ALTQ
5785  	if_setstartfn(ifp, iflib_altq_if_start);
5786  	if_settransmitfn(ifp, iflib_altq_if_transmit);
5787  	if_setsendqready(ifp);
5788  #else
5789  	if_settransmitfn(ifp, iflib_if_transmit);
5790  #endif
5791  	if_setqflushfn(ifp, iflib_if_qflush);
5792  	if_setflags(ifp, IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST);
5793  	ctx->ifc_vlan_attach_event =
5794  		EVENTHANDLER_REGISTER(vlan_config, iflib_vlan_register, ctx,
5795  							  EVENTHANDLER_PRI_FIRST);
5796  	ctx->ifc_vlan_detach_event =
5797  		EVENTHANDLER_REGISTER(vlan_unconfig, iflib_vlan_unregister, ctx,
5798  							  EVENTHANDLER_PRI_FIRST);
5799  
5800  	if ((sctx->isc_flags & IFLIB_DRIVER_MEDIA) == 0) {
5801  		ctx->ifc_mediap = &ctx->ifc_media;
5802  		ifmedia_init(ctx->ifc_mediap, IFM_IMASK,
5803  		    iflib_media_change, iflib_media_status);
5804  	}
5805  	return (0);
5806  }
5807  
5808  static void
iflib_unregister_vlan_handlers(if_ctx_t ctx)5809  iflib_unregister_vlan_handlers(if_ctx_t ctx)
5810  {
5811  	/* Unregister VLAN events */
5812  	if (ctx->ifc_vlan_attach_event != NULL) {
5813  		EVENTHANDLER_DEREGISTER(vlan_config, ctx->ifc_vlan_attach_event);
5814  		ctx->ifc_vlan_attach_event = NULL;
5815  	}
5816  	if (ctx->ifc_vlan_detach_event != NULL) {
5817  		EVENTHANDLER_DEREGISTER(vlan_unconfig, ctx->ifc_vlan_detach_event);
5818  		ctx->ifc_vlan_detach_event = NULL;
5819  	}
5820  
5821  }
5822  
5823  static void
iflib_deregister(if_ctx_t ctx)5824  iflib_deregister(if_ctx_t ctx)
5825  {
5826  	if_t ifp = ctx->ifc_ifp;
5827  
5828  	/* Remove all media */
5829  	ifmedia_removeall(&ctx->ifc_media);
5830  
5831  	/* Ensure that VLAN event handlers are unregistered */
5832  	iflib_unregister_vlan_handlers(ctx);
5833  
5834  	/* Release kobject reference */
5835  	kobj_delete((kobj_t) ctx, NULL);
5836  
5837  	/* Free the ifnet structure */
5838  	if_free(ifp);
5839  
5840  	STATE_LOCK_DESTROY(ctx);
5841  
5842  	/* ether_ifdetach calls if_qflush - lock must be destroy afterwards*/
5843  	CTX_LOCK_DESTROY(ctx);
5844  }
5845  
5846  static int
iflib_queues_alloc(if_ctx_t ctx)5847  iflib_queues_alloc(if_ctx_t ctx)
5848  {
5849  	if_shared_ctx_t sctx = ctx->ifc_sctx;
5850  	if_softc_ctx_t scctx = &ctx->ifc_softc_ctx;
5851  	device_t dev = ctx->ifc_dev;
5852  	int nrxqsets = scctx->isc_nrxqsets;
5853  	int ntxqsets = scctx->isc_ntxqsets;
5854  	iflib_txq_t txq;
5855  	iflib_rxq_t rxq;
5856  	iflib_fl_t fl = NULL;
5857  	int i, j, cpu, err, txconf, rxconf;
5858  	iflib_dma_info_t ifdip;
5859  	uint32_t *rxqsizes = scctx->isc_rxqsizes;
5860  	uint32_t *txqsizes = scctx->isc_txqsizes;
5861  	uint8_t nrxqs = sctx->isc_nrxqs;
5862  	uint8_t ntxqs = sctx->isc_ntxqs;
5863  	int nfree_lists = sctx->isc_nfl ? sctx->isc_nfl : 1;
5864  	int fl_offset = (sctx->isc_flags & IFLIB_HAS_RXCQ ? 1 : 0);
5865  	caddr_t *vaddrs;
5866  	uint64_t *paddrs;
5867  
5868  	KASSERT(ntxqs > 0, ("number of queues per qset must be at least 1"));
5869  	KASSERT(nrxqs > 0, ("number of queues per qset must be at least 1"));
5870  	KASSERT(nrxqs >= fl_offset + nfree_lists,
5871             ("there must be at least a rxq for each free list"));
5872  
5873  	/* Allocate the TX ring struct memory */
5874  	if (!(ctx->ifc_txqs =
5875  	    (iflib_txq_t) malloc(sizeof(struct iflib_txq) *
5876  	    ntxqsets, M_IFLIB, M_NOWAIT | M_ZERO))) {
5877  		device_printf(dev, "Unable to allocate TX ring memory\n");
5878  		err = ENOMEM;
5879  		goto fail;
5880  	}
5881  
5882  	/* Now allocate the RX */
5883  	if (!(ctx->ifc_rxqs =
5884  	    (iflib_rxq_t) malloc(sizeof(struct iflib_rxq) *
5885  	    nrxqsets, M_IFLIB, M_NOWAIT | M_ZERO))) {
5886  		device_printf(dev, "Unable to allocate RX ring memory\n");
5887  		err = ENOMEM;
5888  		goto rx_fail;
5889  	}
5890  
5891  	txq = ctx->ifc_txqs;
5892  	rxq = ctx->ifc_rxqs;
5893  
5894  	/*
5895  	 * XXX handle allocation failure
5896  	 */
5897  	for (txconf = i = 0, cpu = CPU_FIRST(); i < ntxqsets; i++, txconf++, txq++, cpu = CPU_NEXT(cpu)) {
5898  		/* Set up some basics */
5899  
5900  		if ((ifdip = malloc(sizeof(struct iflib_dma_info) * ntxqs,
5901  		    M_IFLIB, M_NOWAIT | M_ZERO)) == NULL) {
5902  			device_printf(dev,
5903  			    "Unable to allocate TX DMA info memory\n");
5904  			err = ENOMEM;
5905  			goto err_tx_desc;
5906  		}
5907  		txq->ift_ifdi = ifdip;
5908  		for (j = 0; j < ntxqs; j++, ifdip++) {
5909  			if (iflib_dma_alloc(ctx, txqsizes[j], ifdip, 0)) {
5910  				device_printf(dev,
5911  				    "Unable to allocate TX descriptors\n");
5912  				err = ENOMEM;
5913  				goto err_tx_desc;
5914  			}
5915  			txq->ift_txd_size[j] = scctx->isc_txd_size[j];
5916  			bzero((void *)ifdip->idi_vaddr, txqsizes[j]);
5917  		}
5918  		txq->ift_ctx = ctx;
5919  		txq->ift_id = i;
5920  		if (sctx->isc_flags & IFLIB_HAS_TXCQ) {
5921  			txq->ift_br_offset = 1;
5922  		} else {
5923  			txq->ift_br_offset = 0;
5924  		}
5925  
5926  		if (iflib_txsd_alloc(txq)) {
5927  			device_printf(dev, "Critical Failure setting up TX buffers\n");
5928  			err = ENOMEM;
5929  			goto err_tx_desc;
5930  		}
5931  
5932  		/* Initialize the TX lock */
5933  		snprintf(txq->ift_mtx_name, MTX_NAME_LEN, "%s:TX(%d):callout",
5934  		    device_get_nameunit(dev), txq->ift_id);
5935  		mtx_init(&txq->ift_mtx, txq->ift_mtx_name, NULL, MTX_DEF);
5936  		callout_init_mtx(&txq->ift_timer, &txq->ift_mtx, 0);
5937  		txq->ift_timer.c_cpu = cpu;
5938  #ifdef DEV_NETMAP
5939  		callout_init_mtx(&txq->ift_netmap_timer, &txq->ift_mtx, 0);
5940  		txq->ift_netmap_timer.c_cpu = cpu;
5941  #endif /* DEV_NETMAP */
5942  
5943  		err = ifmp_ring_alloc(&txq->ift_br, 2048, txq, iflib_txq_drain,
5944  				      iflib_txq_can_drain, M_IFLIB, M_WAITOK);
5945  		if (err) {
5946  			/* XXX free any allocated rings */
5947  			device_printf(dev, "Unable to allocate buf_ring\n");
5948  			goto err_tx_desc;
5949  		}
5950  	}
5951  
5952  	for (rxconf = i = 0; i < nrxqsets; i++, rxconf++, rxq++) {
5953  		/* Set up some basics */
5954  		callout_init(&rxq->ifr_watchdog, 1);
5955  
5956  		if ((ifdip = malloc(sizeof(struct iflib_dma_info) * nrxqs,
5957  		   M_IFLIB, M_NOWAIT | M_ZERO)) == NULL) {
5958  			device_printf(dev,
5959  			    "Unable to allocate RX DMA info memory\n");
5960  			err = ENOMEM;
5961  			goto err_tx_desc;
5962  		}
5963  
5964  		rxq->ifr_ifdi = ifdip;
5965  		/* XXX this needs to be changed if #rx queues != #tx queues */
5966  		rxq->ifr_ntxqirq = 1;
5967  		rxq->ifr_txqid[0] = i;
5968  		for (j = 0; j < nrxqs; j++, ifdip++) {
5969  			if (iflib_dma_alloc(ctx, rxqsizes[j], ifdip, 0)) {
5970  				device_printf(dev,
5971  				    "Unable to allocate RX descriptors\n");
5972  				err = ENOMEM;
5973  				goto err_tx_desc;
5974  			}
5975  			bzero((void *)ifdip->idi_vaddr, rxqsizes[j]);
5976  		}
5977  		rxq->ifr_ctx = ctx;
5978  		rxq->ifr_id = i;
5979  		rxq->ifr_fl_offset = fl_offset;
5980  		rxq->ifr_nfl = nfree_lists;
5981  		if (!(fl =
5982  			  (iflib_fl_t) malloc(sizeof(struct iflib_fl) * nfree_lists, M_IFLIB, M_NOWAIT | M_ZERO))) {
5983  			device_printf(dev, "Unable to allocate free list memory\n");
5984  			err = ENOMEM;
5985  			goto err_tx_desc;
5986  		}
5987  		rxq->ifr_fl = fl;
5988  		for (j = 0; j < nfree_lists; j++) {
5989  			fl[j].ifl_rxq = rxq;
5990  			fl[j].ifl_id = j;
5991  			fl[j].ifl_ifdi = &rxq->ifr_ifdi[j + rxq->ifr_fl_offset];
5992  			fl[j].ifl_rxd_size = scctx->isc_rxd_size[j];
5993  		}
5994  		/* Allocate receive buffers for the ring */
5995  		if (iflib_rxsd_alloc(rxq)) {
5996  			device_printf(dev,
5997  			    "Critical Failure setting up receive buffers\n");
5998  			err = ENOMEM;
5999  			goto err_rx_desc;
6000  		}
6001  
6002  		for (j = 0, fl = rxq->ifr_fl; j < rxq->ifr_nfl; j++, fl++)
6003  			fl->ifl_rx_bitmap = bit_alloc(fl->ifl_size, M_IFLIB,
6004  			    M_WAITOK);
6005  	}
6006  
6007  	/* TXQs */
6008  	vaddrs = malloc(sizeof(caddr_t)  * ntxqsets * ntxqs, M_IFLIB, M_WAITOK);
6009  	paddrs = malloc(sizeof(uint64_t) * ntxqsets * ntxqs, M_IFLIB, M_WAITOK);
6010  	for (i = 0; i < ntxqsets; i++) {
6011  		iflib_dma_info_t di = ctx->ifc_txqs[i].ift_ifdi;
6012  
6013  		for (j = 0; j < ntxqs; j++, di++) {
6014  			vaddrs[i * ntxqs + j] = di->idi_vaddr;
6015  			paddrs[i * ntxqs + j] = di->idi_paddr;
6016  		}
6017  	}
6018  	if ((err = IFDI_TX_QUEUES_ALLOC(ctx, vaddrs, paddrs, ntxqs, ntxqsets)) != 0) {
6019  		device_printf(ctx->ifc_dev,
6020  		    "Unable to allocate device TX queue\n");
6021  		iflib_tx_structures_free(ctx);
6022  		free(vaddrs, M_IFLIB);
6023  		free(paddrs, M_IFLIB);
6024  		goto err_rx_desc;
6025  	}
6026  	free(vaddrs, M_IFLIB);
6027  	free(paddrs, M_IFLIB);
6028  
6029  	/* RXQs */
6030  	vaddrs = malloc(sizeof(caddr_t)  * nrxqsets * nrxqs, M_IFLIB, M_WAITOK);
6031  	paddrs = malloc(sizeof(uint64_t) * nrxqsets * nrxqs, M_IFLIB, M_WAITOK);
6032  	for (i = 0; i < nrxqsets; i++) {
6033  		iflib_dma_info_t di = ctx->ifc_rxqs[i].ifr_ifdi;
6034  
6035  		for (j = 0; j < nrxqs; j++, di++) {
6036  			vaddrs[i * nrxqs + j] = di->idi_vaddr;
6037  			paddrs[i * nrxqs + j] = di->idi_paddr;
6038  		}
6039  	}
6040  	if ((err = IFDI_RX_QUEUES_ALLOC(ctx, vaddrs, paddrs, nrxqs, nrxqsets)) != 0) {
6041  		device_printf(ctx->ifc_dev,
6042  		    "Unable to allocate device RX queue\n");
6043  		iflib_tx_structures_free(ctx);
6044  		free(vaddrs, M_IFLIB);
6045  		free(paddrs, M_IFLIB);
6046  		goto err_rx_desc;
6047  	}
6048  	free(vaddrs, M_IFLIB);
6049  	free(paddrs, M_IFLIB);
6050  
6051  	return (0);
6052  
6053  /* XXX handle allocation failure changes */
6054  err_rx_desc:
6055  err_tx_desc:
6056  rx_fail:
6057  	if (ctx->ifc_rxqs != NULL)
6058  		free(ctx->ifc_rxqs, M_IFLIB);
6059  	ctx->ifc_rxqs = NULL;
6060  	if (ctx->ifc_txqs != NULL)
6061  		free(ctx->ifc_txqs, M_IFLIB);
6062  	ctx->ifc_txqs = NULL;
6063  fail:
6064  	return (err);
6065  }
6066  
6067  static int
iflib_tx_structures_setup(if_ctx_t ctx)6068  iflib_tx_structures_setup(if_ctx_t ctx)
6069  {
6070  	iflib_txq_t txq = ctx->ifc_txqs;
6071  	int i;
6072  
6073  	for (i = 0; i < NTXQSETS(ctx); i++, txq++)
6074  		iflib_txq_setup(txq);
6075  
6076  	return (0);
6077  }
6078  
6079  static void
iflib_tx_structures_free(if_ctx_t ctx)6080  iflib_tx_structures_free(if_ctx_t ctx)
6081  {
6082  	iflib_txq_t txq = ctx->ifc_txqs;
6083  	if_shared_ctx_t sctx = ctx->ifc_sctx;
6084  	int i, j;
6085  
6086  	for (i = 0; i < NTXQSETS(ctx); i++, txq++) {
6087  		for (j = 0; j < sctx->isc_ntxqs; j++)
6088  			iflib_dma_free(&txq->ift_ifdi[j]);
6089  		iflib_txq_destroy(txq);
6090  	}
6091  	free(ctx->ifc_txqs, M_IFLIB);
6092  	ctx->ifc_txqs = NULL;
6093  }
6094  
6095  /*********************************************************************
6096   *
6097   *  Initialize all receive rings.
6098   *
6099   **********************************************************************/
6100  static int
iflib_rx_structures_setup(if_ctx_t ctx)6101  iflib_rx_structures_setup(if_ctx_t ctx)
6102  {
6103  	iflib_rxq_t rxq = ctx->ifc_rxqs;
6104  	int q;
6105  #if defined(INET6) || defined(INET)
6106  	int err, i;
6107  #endif
6108  
6109  	for (q = 0; q < ctx->ifc_softc_ctx.isc_nrxqsets; q++, rxq++) {
6110  #if defined(INET6) || defined(INET)
6111  		err = tcp_lro_init_args(&rxq->ifr_lc, ctx->ifc_ifp,
6112  		    TCP_LRO_ENTRIES, min(1024,
6113  		    ctx->ifc_softc_ctx.isc_nrxd[rxq->ifr_fl_offset]));
6114  		if (err != 0) {
6115  			device_printf(ctx->ifc_dev,
6116  			    "LRO Initialization failed!\n");
6117  			goto fail;
6118  		}
6119  #endif
6120  		IFDI_RXQ_SETUP(ctx, rxq->ifr_id);
6121  	}
6122  	return (0);
6123  #if defined(INET6) || defined(INET)
6124  fail:
6125  	/*
6126  	 * Free LRO resources allocated so far, we will only handle
6127  	 * the rings that completed, the failing case will have
6128  	 * cleaned up for itself.  'q' failed, so its the terminus.
6129  	 */
6130  	rxq = ctx->ifc_rxqs;
6131  	for (i = 0; i < q; ++i, rxq++) {
6132  		tcp_lro_free(&rxq->ifr_lc);
6133  	}
6134  	return (err);
6135  #endif
6136  }
6137  
6138  /*********************************************************************
6139   *
6140   *  Free all receive rings.
6141   *
6142   **********************************************************************/
6143  static void
iflib_rx_structures_free(if_ctx_t ctx)6144  iflib_rx_structures_free(if_ctx_t ctx)
6145  {
6146  	iflib_rxq_t rxq = ctx->ifc_rxqs;
6147  	if_shared_ctx_t sctx = ctx->ifc_sctx;
6148  	int i, j;
6149  
6150  	for (i = 0; i < ctx->ifc_softc_ctx.isc_nrxqsets; i++, rxq++) {
6151  		for (j = 0; j < sctx->isc_nrxqs; j++)
6152  			iflib_dma_free(&rxq->ifr_ifdi[j]);
6153  		iflib_rx_sds_free(rxq);
6154  #if defined(INET6) || defined(INET)
6155  		tcp_lro_free(&rxq->ifr_lc);
6156  #endif
6157  	}
6158  	free(ctx->ifc_rxqs, M_IFLIB);
6159  	ctx->ifc_rxqs = NULL;
6160  }
6161  
6162  static int
iflib_qset_structures_setup(if_ctx_t ctx)6163  iflib_qset_structures_setup(if_ctx_t ctx)
6164  {
6165  	int err;
6166  
6167  	/*
6168  	 * It is expected that the caller takes care of freeing queues if this
6169  	 * fails.
6170  	 */
6171  	if ((err = iflib_tx_structures_setup(ctx)) != 0) {
6172  		device_printf(ctx->ifc_dev, "iflib_tx_structures_setup failed: %d\n", err);
6173  		return (err);
6174  	}
6175  
6176  	if ((err = iflib_rx_structures_setup(ctx)) != 0)
6177  		device_printf(ctx->ifc_dev, "iflib_rx_structures_setup failed: %d\n", err);
6178  
6179  	return (err);
6180  }
6181  
6182  int
iflib_irq_alloc(if_ctx_t ctx,if_irq_t irq,int rid,driver_filter_t filter,void * filter_arg,driver_intr_t handler,void * arg,const char * name)6183  iflib_irq_alloc(if_ctx_t ctx, if_irq_t irq, int rid,
6184  		driver_filter_t filter, void *filter_arg, driver_intr_t handler, void *arg, const char *name)
6185  {
6186  
6187  	return (_iflib_irq_alloc(ctx, irq, rid, filter, handler, arg, name));
6188  }
6189  
6190  /* Just to avoid copy/paste */
6191  static inline int
iflib_irq_set_affinity(if_ctx_t ctx,if_irq_t irq,iflib_intr_type_t type,int qid,struct grouptask * gtask,struct taskqgroup * tqg,void * uniq,const char * name)6192  iflib_irq_set_affinity(if_ctx_t ctx, if_irq_t irq, iflib_intr_type_t type,
6193      int qid, struct grouptask *gtask, struct taskqgroup *tqg, void *uniq,
6194      const char *name)
6195  {
6196  	device_t dev;
6197  	unsigned int base_cpuid, cpuid;
6198  	int err;
6199  
6200  	dev = ctx->ifc_dev;
6201  	base_cpuid = ctx->ifc_sysctl_core_offset;
6202  	cpuid = get_cpuid_for_queue(ctx, base_cpuid, qid, type == IFLIB_INTR_TX);
6203  	err = taskqgroup_attach_cpu(tqg, gtask, uniq, cpuid, dev,
6204  	    irq ? irq->ii_res : NULL, name);
6205  	if (err) {
6206  		device_printf(dev, "taskqgroup_attach_cpu failed %d\n", err);
6207  		return (err);
6208  	}
6209  #ifdef notyet
6210  	if (cpuid > ctx->ifc_cpuid_highest)
6211  		ctx->ifc_cpuid_highest = cpuid;
6212  #endif
6213  	return (0);
6214  }
6215  
6216  /*
6217   * Allocate a hardware interrupt for subctx using the parent (ctx)'s hardware
6218   * resources.
6219   *
6220   * Similar to iflib_irq_alloc_generic(), but for interrupt type IFLIB_INTR_RXTX
6221   * only.
6222   *
6223   * XXX: Could be removed if subctx's dev has its intr resource allocation
6224   * methods replaced with custom ones?
6225   */
6226  int
iflib_irq_alloc_generic_subctx(if_ctx_t ctx,if_ctx_t subctx,if_irq_t irq,int rid,iflib_intr_type_t type,driver_filter_t * filter,void * filter_arg,int qid,const char * name)6227  iflib_irq_alloc_generic_subctx(if_ctx_t ctx, if_ctx_t subctx, if_irq_t irq,
6228  			       int rid, iflib_intr_type_t type,
6229  			       driver_filter_t *filter, void *filter_arg,
6230  			       int qid, const char *name)
6231  {
6232  	device_t dev, subdev;
6233  	struct grouptask *gtask;
6234  	struct taskqgroup *tqg;
6235  	iflib_filter_info_t info;
6236  	gtask_fn_t *fn;
6237  	int tqrid, err;
6238  	driver_filter_t *intr_fast;
6239  	void *q;
6240  
6241  	MPASS(ctx != NULL);
6242  	MPASS(subctx != NULL);
6243  
6244  	tqrid = rid;
6245  	dev = ctx->ifc_dev;
6246  	subdev = subctx->ifc_dev;
6247  
6248  	switch (type) {
6249  	case IFLIB_INTR_RXTX:
6250  		q = &subctx->ifc_rxqs[qid];
6251  		info = &subctx->ifc_rxqs[qid].ifr_filter_info;
6252  		gtask = &subctx->ifc_rxqs[qid].ifr_task;
6253  		tqg = qgroup_if_io_tqg;
6254  		fn = _task_fn_rx;
6255  		intr_fast = iflib_fast_intr_rxtx;
6256  		NET_GROUPTASK_INIT(gtask, 0, fn, q);
6257  		break;
6258  	default:
6259  		device_printf(dev, "%s: unknown net intr type for subctx %s (%d)\n",
6260  		    __func__, device_get_nameunit(subdev), type);
6261  		return (EINVAL);
6262  	}
6263  
6264  	info->ifi_filter = filter;
6265  	info->ifi_filter_arg = filter_arg;
6266  	info->ifi_task = gtask;
6267  	info->ifi_ctx = q;
6268  
6269  	NET_GROUPTASK_INIT(gtask, 0, fn, q);
6270  
6271  	/* Allocate interrupts from hardware using parent context */
6272  	err = _iflib_irq_alloc(ctx, irq, rid, intr_fast, NULL, info, name);
6273  	if (err != 0) {
6274  		device_printf(dev, "_iflib_irq_alloc failed for subctx %s: %d\n",
6275  		    device_get_nameunit(subdev), err);
6276  		return (err);
6277  	}
6278  
6279  	if (tqrid != -1) {
6280  		err = iflib_irq_set_affinity(ctx, irq, type, qid, gtask, tqg, q,
6281  		    name);
6282  		if (err)
6283  			return (err);
6284  	} else {
6285  		taskqgroup_attach(tqg, gtask, q, dev, irq->ii_res, name);
6286  	}
6287  
6288  	return (0);
6289  }
6290  
6291  int
iflib_irq_alloc_generic(if_ctx_t ctx,if_irq_t irq,int rid,iflib_intr_type_t type,driver_filter_t * filter,void * filter_arg,int qid,const char * name)6292  iflib_irq_alloc_generic(if_ctx_t ctx, if_irq_t irq, int rid,
6293  			iflib_intr_type_t type, driver_filter_t *filter,
6294  			void *filter_arg, int qid, const char *name)
6295  {
6296  	device_t dev;
6297  	struct grouptask *gtask;
6298  	struct taskqgroup *tqg;
6299  	iflib_filter_info_t info;
6300  	gtask_fn_t *fn;
6301  	int tqrid, err;
6302  	driver_filter_t *intr_fast;
6303  	void *q;
6304  
6305  	info = &ctx->ifc_filter_info;
6306  	tqrid = rid;
6307  
6308  	switch (type) {
6309  	/* XXX merge tx/rx for netmap? */
6310  	case IFLIB_INTR_TX:
6311  		q = &ctx->ifc_txqs[qid];
6312  		info = &ctx->ifc_txqs[qid].ift_filter_info;
6313  		gtask = &ctx->ifc_txqs[qid].ift_task;
6314  		tqg = qgroup_if_io_tqg;
6315  		fn = _task_fn_tx;
6316  		intr_fast = iflib_fast_intr;
6317  		GROUPTASK_INIT(gtask, 0, fn, q);
6318  		ctx->ifc_flags |= IFC_NETMAP_TX_IRQ;
6319  		break;
6320  	case IFLIB_INTR_RX:
6321  		q = &ctx->ifc_rxqs[qid];
6322  		info = &ctx->ifc_rxqs[qid].ifr_filter_info;
6323  		gtask = &ctx->ifc_rxqs[qid].ifr_task;
6324  		tqg = qgroup_if_io_tqg;
6325  		fn = _task_fn_rx;
6326  		intr_fast = iflib_fast_intr;
6327  		NET_GROUPTASK_INIT(gtask, 0, fn, q);
6328  		break;
6329  	case IFLIB_INTR_RXTX:
6330  		q = &ctx->ifc_rxqs[qid];
6331  		info = &ctx->ifc_rxqs[qid].ifr_filter_info;
6332  		gtask = &ctx->ifc_rxqs[qid].ifr_task;
6333  		tqg = qgroup_if_io_tqg;
6334  		fn = _task_fn_rx;
6335  		intr_fast = iflib_fast_intr_rxtx;
6336  		NET_GROUPTASK_INIT(gtask, 0, fn, q);
6337  		break;
6338  	case IFLIB_INTR_ADMIN:
6339  		q = ctx;
6340  		tqrid = -1;
6341  		info = &ctx->ifc_filter_info;
6342  		gtask = NULL;
6343  		intr_fast = iflib_fast_intr_ctx;
6344  		break;
6345  	default:
6346  		device_printf(ctx->ifc_dev, "%s: unknown net intr type\n",
6347  		    __func__);
6348  		return (EINVAL);
6349  	}
6350  
6351  	info->ifi_filter = filter;
6352  	info->ifi_filter_arg = filter_arg;
6353  	info->ifi_task = gtask;
6354  	info->ifi_ctx = q;
6355  
6356  	dev = ctx->ifc_dev;
6357  	err = _iflib_irq_alloc(ctx, irq, rid, intr_fast, NULL, info,  name);
6358  	if (err != 0) {
6359  		device_printf(dev, "_iflib_irq_alloc failed %d\n", err);
6360  		return (err);
6361  	}
6362  	if (type == IFLIB_INTR_ADMIN)
6363  		return (0);
6364  
6365  	if (tqrid != -1) {
6366  		err = iflib_irq_set_affinity(ctx, irq, type, qid, gtask, tqg, q,
6367  		    name);
6368  		if (err)
6369  			return (err);
6370  	} else {
6371  		taskqgroup_attach(tqg, gtask, q, dev, irq->ii_res, name);
6372  	}
6373  
6374  	return (0);
6375  }
6376  
6377  void
iflib_softirq_alloc_generic(if_ctx_t ctx,if_irq_t irq,iflib_intr_type_t type,void * arg,int qid,const char * name)6378  iflib_softirq_alloc_generic(if_ctx_t ctx, if_irq_t irq, iflib_intr_type_t type,
6379  			    void *arg, int qid, const char *name)
6380  {
6381  	device_t dev;
6382  	struct grouptask *gtask;
6383  	struct taskqgroup *tqg;
6384  	gtask_fn_t *fn;
6385  	void *q;
6386  	int err;
6387  
6388  	switch (type) {
6389  	case IFLIB_INTR_TX:
6390  		q = &ctx->ifc_txqs[qid];
6391  		gtask = &ctx->ifc_txqs[qid].ift_task;
6392  		tqg = qgroup_if_io_tqg;
6393  		fn = _task_fn_tx;
6394  		GROUPTASK_INIT(gtask, 0, fn, q);
6395  		break;
6396  	case IFLIB_INTR_RX:
6397  		q = &ctx->ifc_rxqs[qid];
6398  		gtask = &ctx->ifc_rxqs[qid].ifr_task;
6399  		tqg = qgroup_if_io_tqg;
6400  		fn = _task_fn_rx;
6401  		NET_GROUPTASK_INIT(gtask, 0, fn, q);
6402  		break;
6403  	case IFLIB_INTR_IOV:
6404  		TASK_INIT(&ctx->ifc_vflr_task, 0, _task_fn_iov, ctx);
6405  		return;
6406  	default:
6407  		panic("unknown net intr type");
6408  	}
6409  	err = iflib_irq_set_affinity(ctx, irq, type, qid, gtask, tqg, q, name);
6410  	if (err) {
6411  		dev = ctx->ifc_dev;
6412  		taskqgroup_attach(tqg, gtask, q, dev, irq ? irq->ii_res : NULL,
6413  		    name);
6414  	}
6415  }
6416  
6417  void
iflib_irq_free(if_ctx_t ctx,if_irq_t irq)6418  iflib_irq_free(if_ctx_t ctx, if_irq_t irq)
6419  {
6420  
6421  	if (irq->ii_tag)
6422  		bus_teardown_intr(ctx->ifc_dev, irq->ii_res, irq->ii_tag);
6423  
6424  	if (irq->ii_res)
6425  		bus_release_resource(ctx->ifc_dev, SYS_RES_IRQ,
6426  		    rman_get_rid(irq->ii_res), irq->ii_res);
6427  }
6428  
6429  static int
iflib_legacy_setup(if_ctx_t ctx,driver_filter_t filter,void * filter_arg,int * rid,const char * name)6430  iflib_legacy_setup(if_ctx_t ctx, driver_filter_t filter, void *filter_arg, int *rid, const char *name)
6431  {
6432  	iflib_txq_t txq = ctx->ifc_txqs;
6433  	iflib_rxq_t rxq = ctx->ifc_rxqs;
6434  	if_irq_t irq = &ctx->ifc_legacy_irq;
6435  	iflib_filter_info_t info;
6436  	device_t dev;
6437  	struct grouptask *gtask;
6438  	struct resource *res;
6439  	int err, tqrid;
6440  	bool rx_only;
6441  
6442  	info = &rxq->ifr_filter_info;
6443  	gtask = &rxq->ifr_task;
6444  	tqrid = *rid;
6445  	rx_only = (ctx->ifc_sctx->isc_flags & IFLIB_SINGLE_IRQ_RX_ONLY) != 0;
6446  
6447  	ctx->ifc_flags |= IFC_LEGACY;
6448  	info->ifi_filter = filter;
6449  	info->ifi_filter_arg = filter_arg;
6450  	info->ifi_task = gtask;
6451  	info->ifi_ctx = rxq;
6452  
6453  	dev = ctx->ifc_dev;
6454  	/* We allocate a single interrupt resource */
6455  	err = _iflib_irq_alloc(ctx, irq, tqrid, rx_only ? iflib_fast_intr :
6456  	    iflib_fast_intr_rxtx, NULL, info, name);
6457  	if (err != 0)
6458  		return (err);
6459  	NET_GROUPTASK_INIT(gtask, 0, _task_fn_rx, rxq);
6460  	res = irq->ii_res;
6461  	taskqgroup_attach(qgroup_if_io_tqg, gtask, rxq, dev, res, name);
6462  
6463  	GROUPTASK_INIT(&txq->ift_task, 0, _task_fn_tx, txq);
6464  	taskqgroup_attach(qgroup_if_io_tqg, &txq->ift_task, txq, dev, res,
6465  	    "tx");
6466  	return (0);
6467  }
6468  
6469  void
iflib_led_create(if_ctx_t ctx)6470  iflib_led_create(if_ctx_t ctx)
6471  {
6472  
6473  	ctx->ifc_led_dev = led_create(iflib_led_func, ctx,
6474  	    device_get_nameunit(ctx->ifc_dev));
6475  }
6476  
6477  void
iflib_tx_intr_deferred(if_ctx_t ctx,int txqid)6478  iflib_tx_intr_deferred(if_ctx_t ctx, int txqid)
6479  {
6480  
6481  	GROUPTASK_ENQUEUE(&ctx->ifc_txqs[txqid].ift_task);
6482  }
6483  
6484  void
iflib_rx_intr_deferred(if_ctx_t ctx,int rxqid)6485  iflib_rx_intr_deferred(if_ctx_t ctx, int rxqid)
6486  {
6487  
6488  	GROUPTASK_ENQUEUE(&ctx->ifc_rxqs[rxqid].ifr_task);
6489  }
6490  
6491  void
iflib_admin_intr_deferred(if_ctx_t ctx)6492  iflib_admin_intr_deferred(if_ctx_t ctx)
6493  {
6494  
6495  	taskqueue_enqueue(ctx->ifc_tq, &ctx->ifc_admin_task);
6496  }
6497  
6498  void
iflib_iov_intr_deferred(if_ctx_t ctx)6499  iflib_iov_intr_deferred(if_ctx_t ctx)
6500  {
6501  
6502  	taskqueue_enqueue(ctx->ifc_tq, &ctx->ifc_vflr_task);
6503  }
6504  
6505  void
iflib_io_tqg_attach(struct grouptask * gt,void * uniq,int cpu,const char * name)6506  iflib_io_tqg_attach(struct grouptask *gt, void *uniq, int cpu, const char *name)
6507  {
6508  
6509  	taskqgroup_attach_cpu(qgroup_if_io_tqg, gt, uniq, cpu, NULL, NULL,
6510  	    name);
6511  }
6512  
6513  void
iflib_config_task_init(if_ctx_t ctx,struct task * config_task,task_fn_t * fn)6514  iflib_config_task_init(if_ctx_t ctx, struct task *config_task, task_fn_t *fn)
6515  {
6516  	TASK_INIT(config_task, 0, fn, ctx);
6517  }
6518  
6519  void
iflib_config_task_enqueue(if_ctx_t ctx,struct task * config_task)6520  iflib_config_task_enqueue(if_ctx_t ctx, struct task *config_task)
6521  {
6522  	taskqueue_enqueue(ctx->ifc_tq, config_task);
6523  }
6524  
6525  void
iflib_link_state_change(if_ctx_t ctx,int link_state,uint64_t baudrate)6526  iflib_link_state_change(if_ctx_t ctx, int link_state, uint64_t baudrate)
6527  {
6528  	if_t ifp = ctx->ifc_ifp;
6529  	iflib_txq_t txq = ctx->ifc_txqs;
6530  
6531  	if_setbaudrate(ifp, baudrate);
6532  	if (baudrate >= IF_Gbps(10)) {
6533  		STATE_LOCK(ctx);
6534  		ctx->ifc_flags |= IFC_PREFETCH;
6535  		STATE_UNLOCK(ctx);
6536  	}
6537  	/* If link down, disable watchdog */
6538  	if ((ctx->ifc_link_state == LINK_STATE_UP) && (link_state == LINK_STATE_DOWN)) {
6539  		for (int i = 0; i < ctx->ifc_softc_ctx.isc_ntxqsets; i++, txq++)
6540  			txq->ift_qstatus = IFLIB_QUEUE_IDLE;
6541  	}
6542  	ctx->ifc_link_state = link_state;
6543  	if_link_state_change(ifp, link_state);
6544  }
6545  
6546  static int
iflib_tx_credits_update(if_ctx_t ctx,iflib_txq_t txq)6547  iflib_tx_credits_update(if_ctx_t ctx, iflib_txq_t txq)
6548  {
6549  	int credits;
6550  #ifdef INVARIANTS
6551  	int credits_pre = txq->ift_cidx_processed;
6552  #endif
6553  
6554  	bus_dmamap_sync(txq->ift_ifdi->idi_tag, txq->ift_ifdi->idi_map,
6555  	    BUS_DMASYNC_POSTREAD);
6556  	if ((credits = ctx->isc_txd_credits_update(ctx->ifc_softc, txq->ift_id, true)) == 0)
6557  		return (0);
6558  
6559  	txq->ift_processed += credits;
6560  	txq->ift_cidx_processed += credits;
6561  
6562  	MPASS(credits_pre + credits == txq->ift_cidx_processed);
6563  	if (txq->ift_cidx_processed >= txq->ift_size)
6564  		txq->ift_cidx_processed -= txq->ift_size;
6565  	return (credits);
6566  }
6567  
6568  static int
iflib_rxd_avail(if_ctx_t ctx,iflib_rxq_t rxq,qidx_t cidx,qidx_t budget)6569  iflib_rxd_avail(if_ctx_t ctx, iflib_rxq_t rxq, qidx_t cidx, qidx_t budget)
6570  {
6571  	iflib_fl_t fl;
6572  	u_int i;
6573  
6574  	for (i = 0, fl = &rxq->ifr_fl[0]; i < rxq->ifr_nfl; i++, fl++)
6575  		bus_dmamap_sync(fl->ifl_ifdi->idi_tag, fl->ifl_ifdi->idi_map,
6576  		    BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE);
6577  	return (ctx->isc_rxd_available(ctx->ifc_softc, rxq->ifr_id, cidx,
6578  	    budget));
6579  }
6580  
6581  void
iflib_add_int_delay_sysctl(if_ctx_t ctx,const char * name,const char * description,if_int_delay_info_t info,int offset,int value)6582  iflib_add_int_delay_sysctl(if_ctx_t ctx, const char *name,
6583  	const char *description, if_int_delay_info_t info,
6584  	int offset, int value)
6585  {
6586  	info->iidi_ctx = ctx;
6587  	info->iidi_offset = offset;
6588  	info->iidi_value = value;
6589  	SYSCTL_ADD_PROC(device_get_sysctl_ctx(ctx->ifc_dev),
6590  	    SYSCTL_CHILDREN(device_get_sysctl_tree(ctx->ifc_dev)),
6591  	    OID_AUTO, name, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
6592  	    info, 0, iflib_sysctl_int_delay, "I", description);
6593  }
6594  
6595  struct sx *
iflib_ctx_lock_get(if_ctx_t ctx)6596  iflib_ctx_lock_get(if_ctx_t ctx)
6597  {
6598  
6599  	return (&ctx->ifc_ctx_sx);
6600  }
6601  
6602  static int
iflib_msix_init(if_ctx_t ctx)6603  iflib_msix_init(if_ctx_t ctx)
6604  {
6605  	device_t dev = ctx->ifc_dev;
6606  	if_shared_ctx_t sctx = ctx->ifc_sctx;
6607  	if_softc_ctx_t scctx = &ctx->ifc_softc_ctx;
6608  	int admincnt, bar, err, iflib_num_rx_queues, iflib_num_tx_queues;
6609  	int msgs, queuemsgs, queues, rx_queues, tx_queues, vectors;
6610  
6611  	iflib_num_tx_queues = ctx->ifc_sysctl_ntxqs;
6612  	iflib_num_rx_queues = ctx->ifc_sysctl_nrxqs;
6613  
6614  	if (bootverbose)
6615  		device_printf(dev, "msix_init qsets capped at %d\n",
6616  		    imax(scctx->isc_ntxqsets, scctx->isc_nrxqsets));
6617  
6618  	/* Override by tuneable */
6619  	if (scctx->isc_disable_msix)
6620  		goto msi;
6621  
6622  	/* First try MSI-X */
6623  	if ((msgs = pci_msix_count(dev)) == 0) {
6624  		if (bootverbose)
6625  			device_printf(dev, "MSI-X not supported or disabled\n");
6626  		goto msi;
6627  	}
6628  
6629  	bar = ctx->ifc_softc_ctx.isc_msix_bar;
6630  	/*
6631  	 * bar == -1 => "trust me I know what I'm doing"
6632  	 * Some drivers are for hardware that is so shoddily
6633  	 * documented that no one knows which bars are which
6634  	 * so the developer has to map all bars. This hack
6635  	 * allows shoddy garbage to use MSI-X in this framework.
6636  	 */
6637  	if (bar != -1) {
6638  		ctx->ifc_msix_mem = bus_alloc_resource_any(dev,
6639  	            SYS_RES_MEMORY, &bar, RF_ACTIVE);
6640  		if (ctx->ifc_msix_mem == NULL) {
6641  			device_printf(dev, "Unable to map MSI-X table\n");
6642  			goto msi;
6643  		}
6644  	}
6645  
6646  	admincnt = sctx->isc_admin_intrcnt;
6647  #if IFLIB_DEBUG
6648  	/* use only 1 qset in debug mode */
6649  	queuemsgs = min(msgs - admincnt, 1);
6650  #else
6651  	queuemsgs = msgs - admincnt;
6652  #endif
6653  #ifdef RSS
6654  	queues = imin(queuemsgs, rss_getnumbuckets());
6655  #else
6656  	queues = queuemsgs;
6657  #endif
6658  	queues = imin(CPU_COUNT(&ctx->ifc_cpus), queues);
6659  	if (bootverbose)
6660  		device_printf(dev,
6661  		    "intr CPUs: %d queue msgs: %d admincnt: %d\n",
6662  		    CPU_COUNT(&ctx->ifc_cpus), queuemsgs, admincnt);
6663  #ifdef  RSS
6664  	/* If we're doing RSS, clamp at the number of RSS buckets */
6665  	if (queues > rss_getnumbuckets())
6666  		queues = rss_getnumbuckets();
6667  #endif
6668  	if (iflib_num_rx_queues > 0 && iflib_num_rx_queues < queuemsgs - admincnt)
6669  		rx_queues = iflib_num_rx_queues;
6670  	else
6671  		rx_queues = queues;
6672  
6673  	if (rx_queues > scctx->isc_nrxqsets)
6674  		rx_queues = scctx->isc_nrxqsets;
6675  
6676  	/*
6677  	 * We want this to be all logical CPUs by default
6678  	 */
6679  	if (iflib_num_tx_queues > 0 && iflib_num_tx_queues < queues)
6680  		tx_queues = iflib_num_tx_queues;
6681  	else
6682  		tx_queues = mp_ncpus;
6683  
6684  	if (tx_queues > scctx->isc_ntxqsets)
6685  		tx_queues = scctx->isc_ntxqsets;
6686  
6687  	if (ctx->ifc_sysctl_qs_eq_override == 0) {
6688  #ifdef INVARIANTS
6689  		if (tx_queues != rx_queues)
6690  			device_printf(dev,
6691  			    "queue equality override not set, capping rx_queues at %d and tx_queues at %d\n",
6692  			    min(rx_queues, tx_queues), min(rx_queues, tx_queues));
6693  #endif
6694  		tx_queues = min(rx_queues, tx_queues);
6695  		rx_queues = min(rx_queues, tx_queues);
6696  	}
6697  
6698  	vectors = rx_queues + admincnt;
6699  	if (msgs < vectors) {
6700  		device_printf(dev,
6701  		    "insufficient number of MSI-X vectors "
6702  		    "(supported %d, need %d)\n", msgs, vectors);
6703  		goto msi;
6704  	}
6705  
6706  	device_printf(dev, "Using %d RX queues %d TX queues\n", rx_queues,
6707  	    tx_queues);
6708  	msgs = vectors;
6709  	if ((err = pci_alloc_msix(dev, &vectors)) == 0) {
6710  		if (vectors != msgs) {
6711  			device_printf(dev,
6712  			    "Unable to allocate sufficient MSI-X vectors "
6713  			    "(got %d, need %d)\n", vectors, msgs);
6714  			pci_release_msi(dev);
6715  			if (bar != -1) {
6716  				bus_release_resource(dev, SYS_RES_MEMORY, bar,
6717  				    ctx->ifc_msix_mem);
6718  				ctx->ifc_msix_mem = NULL;
6719  			}
6720  			goto msi;
6721  		}
6722  		device_printf(dev, "Using MSI-X interrupts with %d vectors\n",
6723  		    vectors);
6724  		scctx->isc_vectors = vectors;
6725  		scctx->isc_nrxqsets = rx_queues;
6726  		scctx->isc_ntxqsets = tx_queues;
6727  		scctx->isc_intr = IFLIB_INTR_MSIX;
6728  
6729  		return (vectors);
6730  	} else {
6731  		device_printf(dev,
6732  		    "failed to allocate %d MSI-X vectors, err: %d\n", vectors,
6733  		    err);
6734  		if (bar != -1) {
6735  			bus_release_resource(dev, SYS_RES_MEMORY, bar,
6736  			    ctx->ifc_msix_mem);
6737  			ctx->ifc_msix_mem = NULL;
6738  		}
6739  	}
6740  
6741  msi:
6742  	vectors = pci_msi_count(dev);
6743  	scctx->isc_nrxqsets = 1;
6744  	scctx->isc_ntxqsets = 1;
6745  	scctx->isc_vectors = vectors;
6746  	if (vectors == 1 && pci_alloc_msi(dev, &vectors) == 0) {
6747  		device_printf(dev, "Using an MSI interrupt\n");
6748  		scctx->isc_intr = IFLIB_INTR_MSI;
6749  	} else {
6750  		scctx->isc_vectors = 1;
6751  		device_printf(dev, "Using a Legacy interrupt\n");
6752  		scctx->isc_intr = IFLIB_INTR_LEGACY;
6753  	}
6754  
6755  	return (vectors);
6756  }
6757  
6758  static const char *ring_states[] = { "IDLE", "BUSY", "STALLED", "ABDICATED" };
6759  
6760  static int
mp_ring_state_handler(SYSCTL_HANDLER_ARGS)6761  mp_ring_state_handler(SYSCTL_HANDLER_ARGS)
6762  {
6763  	int rc;
6764  	uint16_t *state = ((uint16_t *)oidp->oid_arg1);
6765  	struct sbuf *sb;
6766  	const char *ring_state = "UNKNOWN";
6767  
6768  	/* XXX needed ? */
6769  	rc = sysctl_wire_old_buffer(req, 0);
6770  	MPASS(rc == 0);
6771  	if (rc != 0)
6772  		return (rc);
6773  	sb = sbuf_new_for_sysctl(NULL, NULL, 80, req);
6774  	MPASS(sb != NULL);
6775  	if (sb == NULL)
6776  		return (ENOMEM);
6777  	if (state[3] <= 3)
6778  		ring_state = ring_states[state[3]];
6779  
6780  	sbuf_printf(sb, "pidx_head: %04hd pidx_tail: %04hd cidx: %04hd state: %s",
6781  		    state[0], state[1], state[2], ring_state);
6782  	rc = sbuf_finish(sb);
6783  	sbuf_delete(sb);
6784          return (rc);
6785  }
6786  
6787  enum iflib_ndesc_handler {
6788  	IFLIB_NTXD_HANDLER,
6789  	IFLIB_NRXD_HANDLER,
6790  };
6791  
6792  static int
mp_ndesc_handler(SYSCTL_HANDLER_ARGS)6793  mp_ndesc_handler(SYSCTL_HANDLER_ARGS)
6794  {
6795  	if_ctx_t ctx = (void *)arg1;
6796  	enum iflib_ndesc_handler type = arg2;
6797  	char buf[256] = {0};
6798  	qidx_t *ndesc;
6799  	char *p, *next;
6800  	int nqs, rc, i;
6801  
6802  	nqs = 8;
6803  	switch (type) {
6804  	case IFLIB_NTXD_HANDLER:
6805  		ndesc = ctx->ifc_sysctl_ntxds;
6806  		if (ctx->ifc_sctx)
6807  			nqs = ctx->ifc_sctx->isc_ntxqs;
6808  		break;
6809  	case IFLIB_NRXD_HANDLER:
6810  		ndesc = ctx->ifc_sysctl_nrxds;
6811  		if (ctx->ifc_sctx)
6812  			nqs = ctx->ifc_sctx->isc_nrxqs;
6813  		break;
6814  	default:
6815  		printf("%s: unhandled type\n", __func__);
6816  		return (EINVAL);
6817  	}
6818  	if (nqs == 0)
6819  		nqs = 8;
6820  
6821  	for (i = 0; i < 8; i++) {
6822  		if (i >= nqs)
6823  			break;
6824  		if (i)
6825  			strcat(buf, ",");
6826  		sprintf(strchr(buf, 0), "%d", ndesc[i]);
6827  	}
6828  
6829  	rc = sysctl_handle_string(oidp, buf, sizeof(buf), req);
6830  	if (rc || req->newptr == NULL)
6831  		return (rc);
6832  
6833  	for (i = 0, next = buf, p = strsep(&next, " ,"); i < 8 && p;
6834  	    i++, p = strsep(&next, " ,")) {
6835  		ndesc[i] = strtoul(p, NULL, 10);
6836  	}
6837  
6838  	return (rc);
6839  }
6840  
6841  #define NAME_BUFLEN 32
6842  static void
iflib_add_device_sysctl_pre(if_ctx_t ctx)6843  iflib_add_device_sysctl_pre(if_ctx_t ctx)
6844  {
6845          device_t dev = iflib_get_dev(ctx);
6846  	struct sysctl_oid_list *child, *oid_list;
6847  	struct sysctl_ctx_list *ctx_list;
6848  	struct sysctl_oid *node;
6849  
6850  	ctx_list = device_get_sysctl_ctx(dev);
6851  	child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));
6852  	ctx->ifc_sysctl_node = node = SYSCTL_ADD_NODE(ctx_list, child,
6853  	    OID_AUTO, "iflib", CTLFLAG_RD | CTLFLAG_MPSAFE, NULL,
6854  	    "IFLIB fields");
6855  	oid_list = SYSCTL_CHILDREN(node);
6856  
6857  	SYSCTL_ADD_CONST_STRING(ctx_list, oid_list, OID_AUTO, "driver_version",
6858  	    CTLFLAG_RD, ctx->ifc_sctx->isc_driver_version, "driver version");
6859  
6860  	SYSCTL_ADD_U16(ctx_list, oid_list, OID_AUTO, "override_ntxqs",
6861  	    CTLFLAG_RWTUN, &ctx->ifc_sysctl_ntxqs, 0,
6862  	    "# of txqs to use, 0 => use default #");
6863  	SYSCTL_ADD_U16(ctx_list, oid_list, OID_AUTO, "override_nrxqs",
6864  	    CTLFLAG_RWTUN, &ctx->ifc_sysctl_nrxqs, 0,
6865  	    "# of rxqs to use, 0 => use default #");
6866  	SYSCTL_ADD_U16(ctx_list, oid_list, OID_AUTO, "override_qs_enable",
6867  	    CTLFLAG_RWTUN, &ctx->ifc_sysctl_qs_eq_override, 0,
6868  	    "permit #txq != #rxq");
6869  	SYSCTL_ADD_INT(ctx_list, oid_list, OID_AUTO, "disable_msix",
6870  	    CTLFLAG_RWTUN, &ctx->ifc_softc_ctx.isc_disable_msix, 0,
6871  	    "disable MSI-X (default 0)");
6872  	SYSCTL_ADD_U16(ctx_list, oid_list, OID_AUTO, "rx_budget",
6873  	    CTLFLAG_RWTUN, &ctx->ifc_sysctl_rx_budget, 0, "set the RX budget");
6874  	SYSCTL_ADD_U16(ctx_list, oid_list, OID_AUTO, "tx_abdicate",
6875  	    CTLFLAG_RWTUN, &ctx->ifc_sysctl_tx_abdicate, 0,
6876  	    "cause TX to abdicate instead of running to completion");
6877  	ctx->ifc_sysctl_core_offset = CORE_OFFSET_UNSPECIFIED;
6878  	SYSCTL_ADD_U16(ctx_list, oid_list, OID_AUTO, "core_offset",
6879  	    CTLFLAG_RDTUN, &ctx->ifc_sysctl_core_offset, 0,
6880  	    "offset to start using cores at");
6881  	SYSCTL_ADD_U8(ctx_list, oid_list, OID_AUTO, "separate_txrx",
6882  	    CTLFLAG_RDTUN, &ctx->ifc_sysctl_separate_txrx, 0,
6883  	    "use separate cores for TX and RX");
6884  	SYSCTL_ADD_U8(ctx_list, oid_list, OID_AUTO, "use_logical_cores",
6885  	    CTLFLAG_RDTUN, &ctx->ifc_sysctl_use_logical_cores, 0,
6886  	    "try to make use of logical cores for TX and RX");
6887  	SYSCTL_ADD_U16(ctx_list, oid_list, OID_AUTO, "use_extra_msix_vectors",
6888  	    CTLFLAG_RDTUN, &ctx->ifc_sysctl_extra_msix_vectors, 0,
6889  	    "attempt to reserve the given number of extra MSI-X vectors during driver load for the creation of additional interfaces later");
6890  	SYSCTL_ADD_INT(ctx_list, oid_list, OID_AUTO, "allocated_msix_vectors",
6891         	    CTLFLAG_RDTUN, &ctx->ifc_softc_ctx.isc_vectors, 0,
6892  	    "total # of MSI-X vectors allocated by driver");
6893  
6894  	/* XXX change for per-queue sizes */
6895  	SYSCTL_ADD_PROC(ctx_list, oid_list, OID_AUTO, "override_ntxds",
6896  	    CTLTYPE_STRING | CTLFLAG_RWTUN | CTLFLAG_NEEDGIANT, ctx,
6897  	    IFLIB_NTXD_HANDLER, mp_ndesc_handler, "A",
6898  	    "list of # of TX descriptors to use, 0 = use default #");
6899  	SYSCTL_ADD_PROC(ctx_list, oid_list, OID_AUTO, "override_nrxds",
6900  	    CTLTYPE_STRING | CTLFLAG_RWTUN | CTLFLAG_NEEDGIANT, ctx,
6901  	    IFLIB_NRXD_HANDLER, mp_ndesc_handler, "A",
6902  	    "list of # of RX descriptors to use, 0 = use default #");
6903  }
6904  
6905  static void
iflib_add_device_sysctl_post(if_ctx_t ctx)6906  iflib_add_device_sysctl_post(if_ctx_t ctx)
6907  {
6908  	if_shared_ctx_t sctx = ctx->ifc_sctx;
6909  	if_softc_ctx_t scctx = &ctx->ifc_softc_ctx;
6910          device_t dev = iflib_get_dev(ctx);
6911  	struct sysctl_oid_list *child;
6912  	struct sysctl_ctx_list *ctx_list;
6913  	iflib_fl_t fl;
6914  	iflib_txq_t txq;
6915  	iflib_rxq_t rxq;
6916  	int i, j;
6917  	char namebuf[NAME_BUFLEN];
6918  	char *qfmt;
6919  	struct sysctl_oid *queue_node, *fl_node, *node;
6920  	struct sysctl_oid_list *queue_list, *fl_list;
6921  	ctx_list = device_get_sysctl_ctx(dev);
6922  
6923  	node = ctx->ifc_sysctl_node;
6924  	child = SYSCTL_CHILDREN(node);
6925  
6926  	if (scctx->isc_ntxqsets > 100)
6927  		qfmt = "txq%03d";
6928  	else if (scctx->isc_ntxqsets > 10)
6929  		qfmt = "txq%02d";
6930  	else
6931  		qfmt = "txq%d";
6932  	for (i = 0, txq = ctx->ifc_txqs; i < scctx->isc_ntxqsets; i++, txq++) {
6933  		snprintf(namebuf, NAME_BUFLEN, qfmt, i);
6934  		queue_node = SYSCTL_ADD_NODE(ctx_list, child, OID_AUTO, namebuf,
6935  		    CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "Queue Name");
6936  		queue_list = SYSCTL_CHILDREN(queue_node);
6937  		SYSCTL_ADD_INT(ctx_list, queue_list, OID_AUTO, "cpu",
6938  		    CTLFLAG_RD, &txq->ift_task.gt_cpu, 0,
6939  		    "cpu this queue is bound to");
6940  #if MEMORY_LOGGING
6941  		SYSCTL_ADD_UQUAD(ctx_list, queue_list, OID_AUTO, "txq_dequeued",
6942  		    CTLFLAG_RD, &txq->ift_dequeued, "total mbufs freed");
6943  		SYSCTL_ADD_UQUAD(ctx_list, queue_list, OID_AUTO, "txq_enqueued",
6944  		    CTLFLAG_RD, &txq->ift_enqueued, "total mbufs enqueued");
6945  #endif
6946  		SYSCTL_ADD_UQUAD(ctx_list, queue_list, OID_AUTO, "mbuf_defrag",
6947  		    CTLFLAG_RD, &txq->ift_mbuf_defrag,
6948  		    "# of times m_defrag was called");
6949  		SYSCTL_ADD_UQUAD(ctx_list, queue_list, OID_AUTO, "m_pullups",
6950  		    CTLFLAG_RD, &txq->ift_pullups,
6951  		    "# of times m_pullup was called");
6952  		SYSCTL_ADD_UQUAD(ctx_list, queue_list, OID_AUTO,
6953  		    "mbuf_defrag_failed", CTLFLAG_RD,
6954  		    &txq->ift_mbuf_defrag_failed, "# of times m_defrag failed");
6955  		SYSCTL_ADD_UQUAD(ctx_list, queue_list, OID_AUTO,
6956  		    "no_desc_avail", CTLFLAG_RD, &txq->ift_no_desc_avail,
6957  		    "# of times no descriptors were available");
6958  		SYSCTL_ADD_UQUAD(ctx_list, queue_list, OID_AUTO,
6959  		    "tx_map_failed", CTLFLAG_RD, &txq->ift_map_failed,
6960  		    "# of times DMA map failed");
6961  		SYSCTL_ADD_UQUAD(ctx_list, queue_list, OID_AUTO,
6962  		    "txd_encap_efbig", CTLFLAG_RD, &txq->ift_txd_encap_efbig,
6963  		    "# of times txd_encap returned EFBIG");
6964  		SYSCTL_ADD_UQUAD(ctx_list, queue_list, OID_AUTO,
6965  		    "no_tx_dma_setup", CTLFLAG_RD, &txq->ift_no_tx_dma_setup,
6966  		    "# of times map failed for other than EFBIG");
6967  		SYSCTL_ADD_U16(ctx_list, queue_list, OID_AUTO, "txq_pidx",
6968  		    CTLFLAG_RD, &txq->ift_pidx, 1, "Producer Index");
6969  		SYSCTL_ADD_U16(ctx_list, queue_list, OID_AUTO, "txq_cidx",
6970  		    CTLFLAG_RD, &txq->ift_cidx, 1, "Consumer Index");
6971  		SYSCTL_ADD_U16(ctx_list, queue_list, OID_AUTO,
6972  		    "txq_cidx_processed", CTLFLAG_RD, &txq->ift_cidx_processed,
6973  		    1, "Consumer Index seen by credit update");
6974  		SYSCTL_ADD_U16(ctx_list, queue_list, OID_AUTO, "txq_in_use",
6975  		    CTLFLAG_RD, &txq->ift_in_use, 1, "descriptors in use");
6976  		SYSCTL_ADD_UQUAD(ctx_list, queue_list, OID_AUTO,
6977  		    "txq_processed", CTLFLAG_RD, &txq->ift_processed,
6978  		    "descriptors procesed for clean");
6979  		SYSCTL_ADD_UQUAD(ctx_list, queue_list, OID_AUTO, "txq_cleaned",
6980  		    CTLFLAG_RD, &txq->ift_cleaned, "total cleaned");
6981  		SYSCTL_ADD_PROC(ctx_list, queue_list, OID_AUTO, "ring_state",
6982  		    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_NEEDGIANT,
6983  		    __DEVOLATILE(uint64_t *, &txq->ift_br->state), 0,
6984  		    mp_ring_state_handler, "A", "soft ring state");
6985  		SYSCTL_ADD_COUNTER_U64(ctx_list, queue_list, OID_AUTO,
6986  		    "r_enqueues", CTLFLAG_RD, &txq->ift_br->enqueues,
6987  		    "# of enqueues to the mp_ring for this queue");
6988  		SYSCTL_ADD_COUNTER_U64(ctx_list, queue_list, OID_AUTO,
6989  		    "r_drops", CTLFLAG_RD, &txq->ift_br->drops,
6990  		    "# of drops in the mp_ring for this queue");
6991  		SYSCTL_ADD_COUNTER_U64(ctx_list, queue_list, OID_AUTO,
6992  		    "r_starts", CTLFLAG_RD, &txq->ift_br->starts,
6993  		    "# of normal consumer starts in mp_ring for this queue");
6994  		SYSCTL_ADD_COUNTER_U64(ctx_list, queue_list, OID_AUTO,
6995  		    "r_stalls", CTLFLAG_RD, &txq->ift_br->stalls,
6996  		    "# of consumer stalls in the mp_ring for this queue");
6997  		SYSCTL_ADD_COUNTER_U64(ctx_list, queue_list, OID_AUTO,
6998  		    "r_restarts", CTLFLAG_RD, &txq->ift_br->restarts,
6999  		    "# of consumer restarts in the mp_ring for this queue");
7000  		SYSCTL_ADD_COUNTER_U64(ctx_list, queue_list, OID_AUTO,
7001  		    "r_abdications", CTLFLAG_RD, &txq->ift_br->abdications,
7002  		    "# of consumer abdications in the mp_ring for this queue");
7003  	}
7004  
7005  	if (scctx->isc_nrxqsets > 100)
7006  		qfmt = "rxq%03d";
7007  	else if (scctx->isc_nrxqsets > 10)
7008  		qfmt = "rxq%02d";
7009  	else
7010  		qfmt = "rxq%d";
7011  	for (i = 0, rxq = ctx->ifc_rxqs; i < scctx->isc_nrxqsets; i++, rxq++) {
7012  		snprintf(namebuf, NAME_BUFLEN, qfmt, i);
7013  		queue_node = SYSCTL_ADD_NODE(ctx_list, child, OID_AUTO, namebuf,
7014  		    CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "Queue Name");
7015  		queue_list = SYSCTL_CHILDREN(queue_node);
7016  		SYSCTL_ADD_INT(ctx_list, queue_list, OID_AUTO, "cpu",
7017  		    CTLFLAG_RD, &rxq->ifr_task.gt_cpu, 0,
7018  		    "cpu this queue is bound to");
7019  		if (sctx->isc_flags & IFLIB_HAS_RXCQ) {
7020  			SYSCTL_ADD_U16(ctx_list, queue_list, OID_AUTO,
7021  			    "rxq_cq_cidx", CTLFLAG_RD, &rxq->ifr_cq_cidx, 1,
7022  			    "Consumer Index");
7023  		}
7024  
7025  		for (j = 0, fl = rxq->ifr_fl; j < rxq->ifr_nfl; j++, fl++) {
7026  			snprintf(namebuf, NAME_BUFLEN, "rxq_fl%d", j);
7027  			fl_node = SYSCTL_ADD_NODE(ctx_list, queue_list,
7028  			    OID_AUTO, namebuf, CTLFLAG_RD | CTLFLAG_MPSAFE,
7029  			    NULL, "freelist Name");
7030  			fl_list = SYSCTL_CHILDREN(fl_node);
7031  			SYSCTL_ADD_U16(ctx_list, fl_list, OID_AUTO, "pidx",
7032  			    CTLFLAG_RD, &fl->ifl_pidx, 1, "Producer Index");
7033  			SYSCTL_ADD_U16(ctx_list, fl_list, OID_AUTO, "cidx",
7034  			    CTLFLAG_RD, &fl->ifl_cidx, 1, "Consumer Index");
7035  			SYSCTL_ADD_U16(ctx_list, fl_list, OID_AUTO, "credits",
7036  			    CTLFLAG_RD, &fl->ifl_credits, 1,
7037  			    "credits available");
7038  			SYSCTL_ADD_U16(ctx_list, fl_list, OID_AUTO, "buf_size",
7039  			    CTLFLAG_RD, &fl->ifl_buf_size, 1, "buffer size");
7040  #if MEMORY_LOGGING
7041  			SYSCTL_ADD_UQUAD(ctx_list, fl_list, OID_AUTO,
7042  			    "fl_m_enqueued", CTLFLAG_RD, &fl->ifl_m_enqueued,
7043  			    "mbufs allocated");
7044  			SYSCTL_ADD_UQUAD(ctx_list, fl_list, OID_AUTO,
7045  			    "fl_m_dequeued", CTLFLAG_RD, &fl->ifl_m_dequeued,
7046  			    "mbufs freed");
7047  			SYSCTL_ADD_UQUAD(ctx_list, fl_list, OID_AUTO,
7048  			    "fl_cl_enqueued", CTLFLAG_RD, &fl->ifl_cl_enqueued,
7049  			    "clusters allocated");
7050  			SYSCTL_ADD_UQUAD(ctx_list, fl_list, OID_AUTO,
7051  			    "fl_cl_dequeued", CTLFLAG_RD, &fl->ifl_cl_dequeued,
7052  			    "clusters freed");
7053  #endif
7054  		}
7055  	}
7056  
7057  }
7058  
7059  void
iflib_request_reset(if_ctx_t ctx)7060  iflib_request_reset(if_ctx_t ctx)
7061  {
7062  
7063  	STATE_LOCK(ctx);
7064  	ctx->ifc_flags |= IFC_DO_RESET;
7065  	STATE_UNLOCK(ctx);
7066  }
7067  
7068  #ifndef __NO_STRICT_ALIGNMENT
7069  static struct mbuf *
iflib_fixup_rx(struct mbuf * m)7070  iflib_fixup_rx(struct mbuf *m)
7071  {
7072  	struct mbuf *n;
7073  
7074  	if (m->m_len <= (MCLBYTES - ETHER_HDR_LEN)) {
7075  		bcopy(m->m_data, m->m_data + ETHER_HDR_LEN, m->m_len);
7076  		m->m_data += ETHER_HDR_LEN;
7077  		n = m;
7078  	} else {
7079  		MGETHDR(n, M_NOWAIT, MT_DATA);
7080  		if (n == NULL) {
7081  			m_freem(m);
7082  			return (NULL);
7083  		}
7084  		bcopy(m->m_data, n->m_data, ETHER_HDR_LEN);
7085  		m->m_data += ETHER_HDR_LEN;
7086  		m->m_len -= ETHER_HDR_LEN;
7087  		n->m_len = ETHER_HDR_LEN;
7088  		M_MOVE_PKTHDR(n, m);
7089  		n->m_next = m;
7090  	}
7091  	return (n);
7092  }
7093  #endif
7094  
7095  #ifdef DEBUGNET
7096  static void
iflib_debugnet_init(if_t ifp,int * nrxr,int * ncl,int * clsize)7097  iflib_debugnet_init(if_t ifp, int *nrxr, int *ncl, int *clsize)
7098  {
7099  	if_ctx_t ctx;
7100  
7101  	ctx = if_getsoftc(ifp);
7102  	CTX_LOCK(ctx);
7103  	*nrxr = NRXQSETS(ctx);
7104  	*ncl = ctx->ifc_rxqs[0].ifr_fl->ifl_size;
7105  	*clsize = ctx->ifc_rxqs[0].ifr_fl->ifl_buf_size;
7106  	CTX_UNLOCK(ctx);
7107  }
7108  
7109  static void
iflib_debugnet_event(if_t ifp,enum debugnet_ev event)7110  iflib_debugnet_event(if_t ifp, enum debugnet_ev event)
7111  {
7112  	if_ctx_t ctx;
7113  	if_softc_ctx_t scctx;
7114  	iflib_fl_t fl;
7115  	iflib_rxq_t rxq;
7116  	int i, j;
7117  
7118  	ctx = if_getsoftc(ifp);
7119  	scctx = &ctx->ifc_softc_ctx;
7120  
7121  	switch (event) {
7122  	case DEBUGNET_START:
7123  		for (i = 0; i < scctx->isc_nrxqsets; i++) {
7124  			rxq = &ctx->ifc_rxqs[i];
7125  			for (j = 0; j < rxq->ifr_nfl; j++) {
7126  				fl = rxq->ifr_fl;
7127  				fl->ifl_zone = m_getzone(fl->ifl_buf_size);
7128  			}
7129  		}
7130  		iflib_no_tx_batch = 1;
7131  		break;
7132  	default:
7133  		break;
7134  	}
7135  }
7136  
7137  static int
iflib_debugnet_transmit(if_t ifp,struct mbuf * m)7138  iflib_debugnet_transmit(if_t ifp, struct mbuf *m)
7139  {
7140  	if_ctx_t ctx;
7141  	iflib_txq_t txq;
7142  	int error;
7143  
7144  	ctx = if_getsoftc(ifp);
7145  	if ((if_getdrvflags(ifp) & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) !=
7146  	    IFF_DRV_RUNNING)
7147  		return (EBUSY);
7148  
7149  	txq = &ctx->ifc_txqs[0];
7150  	error = iflib_encap(txq, &m);
7151  	if (error == 0)
7152  		(void)iflib_txd_db_check(txq, true);
7153  	return (error);
7154  }
7155  
7156  static int
iflib_debugnet_poll(if_t ifp,int count)7157  iflib_debugnet_poll(if_t ifp, int count)
7158  {
7159  	struct epoch_tracker et;
7160  	if_ctx_t ctx;
7161  	if_softc_ctx_t scctx;
7162  	iflib_txq_t txq;
7163  	int i;
7164  
7165  	ctx = if_getsoftc(ifp);
7166  	scctx = &ctx->ifc_softc_ctx;
7167  
7168  	if ((if_getdrvflags(ifp) & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) !=
7169  	    IFF_DRV_RUNNING)
7170  		return (EBUSY);
7171  
7172  	txq = &ctx->ifc_txqs[0];
7173  	(void)iflib_completed_tx_reclaim(txq, RECLAIM_THRESH(ctx));
7174  
7175  	NET_EPOCH_ENTER(et);
7176  	for (i = 0; i < scctx->isc_nrxqsets; i++)
7177  		(void)iflib_rxeof(&ctx->ifc_rxqs[i], 16 /* XXX */);
7178  	NET_EPOCH_EXIT(et);
7179  	return (0);
7180  }
7181  #endif /* DEBUGNET */
7182