xref: /freebsd/sys/net/if_lagg.c (revision f5f40dd63bc7acbb5312b26ac1ea1103c12352a6)
1  /*	$OpenBSD: if_trunk.c,v 1.30 2007/01/31 06:20:19 reyk Exp $	*/
2  
3  /*
4   * Copyright (c) 2005, 2006 Reyk Floeter <reyk@openbsd.org>
5   * Copyright (c) 2007 Andrew Thompson <thompsa@FreeBSD.org>
6   * Copyright (c) 2014, 2016 Marcelo Araujo <araujo@FreeBSD.org>
7   *
8   * Permission to use, copy, modify, and distribute this software for any
9   * purpose with or without fee is hereby granted, provided that the above
10   * copyright notice and this permission notice appear in all copies.
11   *
12   * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
13   * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
14   * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
15   * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
16   * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
17   * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
18   * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
19   */
20  
21  #include <sys/cdefs.h>
22  #include "opt_inet.h"
23  #include "opt_inet6.h"
24  #include "opt_kern_tls.h"
25  #include "opt_ratelimit.h"
26  
27  #include <sys/param.h>
28  #include <sys/kernel.h>
29  #include <sys/malloc.h>
30  #include <sys/mbuf.h>
31  #include <sys/queue.h>
32  #include <sys/socket.h>
33  #include <sys/sockio.h>
34  #include <sys/sysctl.h>
35  #include <sys/module.h>
36  #include <sys/priv.h>
37  #include <sys/systm.h>
38  #include <sys/proc.h>
39  #include <sys/lock.h>
40  #include <sys/rmlock.h>
41  #include <sys/sx.h>
42  #include <sys/taskqueue.h>
43  #include <sys/eventhandler.h>
44  
45  #include <net/ethernet.h>
46  #include <net/if.h>
47  #include <net/if_clone.h>
48  #include <net/if_arp.h>
49  #include <net/if_dl.h>
50  #include <net/if_media.h>
51  #include <net/if_types.h>
52  #include <net/if_var.h>
53  #include <net/if_private.h>
54  #include <net/bpf.h>
55  #include <net/route.h>
56  #include <net/vnet.h>
57  #include <net/infiniband.h>
58  
59  #if defined(INET) || defined(INET6)
60  #include <netinet/in.h>
61  #include <netinet/ip.h>
62  #endif
63  #ifdef INET
64  #include <netinet/in_systm.h>
65  #include <netinet/if_ether.h>
66  #endif
67  
68  #ifdef INET6
69  #include <netinet/ip6.h>
70  #include <netinet6/in6_var.h>
71  #include <netinet6/in6_ifattach.h>
72  #endif
73  
74  #include <net/if_vlan_var.h>
75  #include <net/if_lagg.h>
76  #include <net/ieee8023ad_lacp.h>
77  
78  #ifdef DEV_NETMAP
79  MODULE_DEPEND(if_lagg, netmap, 1, 1, 1);
80  #endif
81  
82  #define	LAGG_SX_INIT(_sc)	sx_init(&(_sc)->sc_sx, "if_lagg sx")
83  #define	LAGG_SX_DESTROY(_sc)	sx_destroy(&(_sc)->sc_sx)
84  #define	LAGG_XLOCK(_sc)		sx_xlock(&(_sc)->sc_sx)
85  #define	LAGG_XUNLOCK(_sc)	sx_xunlock(&(_sc)->sc_sx)
86  #define	LAGG_XLOCK_ASSERT(_sc)	sx_assert(&(_sc)->sc_sx, SA_XLOCKED)
87  #define	LAGG_SLOCK(_sc)		sx_slock(&(_sc)->sc_sx)
88  #define	LAGG_SUNLOCK(_sc)	sx_sunlock(&(_sc)->sc_sx)
89  #define	LAGG_SXLOCK_ASSERT(_sc)	sx_assert(&(_sc)->sc_sx, SA_LOCKED)
90  
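/*
 * Locking overview (informal): the per-softc sx lock serializes
 * configuration changes.  Paths that modify the port list or protocol
 * state take LAGG_XLOCK(), while read-mostly paths such as exporting
 * port state with lagg_port2req() may use LAGG_SLOCK().  The packet
 * and counter paths do not take this lock; they rely on the network
 * epoch and the CK_SLIST port list instead.
 */
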
91  /* Special flags we should propagate to the lagg ports. */
92  static struct {
93  	int flag;
94  	int (*func)(struct ifnet *, int);
95  } lagg_pflags[] = {
96  	{IFF_PROMISC, ifpromisc},
97  	{IFF_ALLMULTI, if_allmulti},
98  	{0, NULL}
99  };
100  
101  struct lagg_snd_tag {
102  	struct m_snd_tag com;
103  	struct m_snd_tag *tag;
104  };
105  
106  VNET_DEFINE_STATIC(SLIST_HEAD(__trhead, lagg_softc), lagg_list); /* list of laggs */
107  #define	V_lagg_list	VNET(lagg_list)
108  VNET_DEFINE_STATIC(struct mtx, lagg_list_mtx);
109  #define	V_lagg_list_mtx	VNET(lagg_list_mtx)
110  #define	LAGG_LIST_LOCK_INIT(x)		mtx_init(&V_lagg_list_mtx, \
111  					"if_lagg list", NULL, MTX_DEF)
112  #define	LAGG_LIST_LOCK_DESTROY(x)	mtx_destroy(&V_lagg_list_mtx)
113  #define	LAGG_LIST_LOCK(x)		mtx_lock(&V_lagg_list_mtx)
114  #define	LAGG_LIST_UNLOCK(x)		mtx_unlock(&V_lagg_list_mtx)
115  static eventhandler_tag	lagg_detach_cookie = NULL;
116  
117  static int	lagg_clone_create(struct if_clone *, char *, size_t,
118  		    struct ifc_data *, struct ifnet **);
119  static int	lagg_clone_destroy(struct if_clone *, struct ifnet *, uint32_t);
120  VNET_DEFINE_STATIC(struct if_clone *, lagg_cloner);
121  #define	V_lagg_cloner	VNET(lagg_cloner)
122  static const char laggname[] = "lagg";
123  static MALLOC_DEFINE(M_LAGG, laggname, "802.3AD Link Aggregation Interface");
124  
125  static void	lagg_capabilities(struct lagg_softc *);
126  static int	lagg_port_create(struct lagg_softc *, struct ifnet *);
127  static int	lagg_port_destroy(struct lagg_port *, int);
128  static struct mbuf *lagg_input_ethernet(struct ifnet *, struct mbuf *);
129  static struct mbuf *lagg_input_infiniband(struct ifnet *, struct mbuf *);
130  static void	lagg_linkstate(struct lagg_softc *);
131  static void	lagg_port_state(struct ifnet *, int);
132  static int	lagg_port_ioctl(struct ifnet *, u_long, caddr_t);
133  static int	lagg_port_output(struct ifnet *, struct mbuf *,
134  		    const struct sockaddr *, struct route *);
135  static void	lagg_port_ifdetach(void *arg __unused, struct ifnet *);
136  #ifdef LAGG_PORT_STACKING
137  static int	lagg_port_checkstacking(struct lagg_softc *);
138  #endif
139  static void	lagg_port2req(struct lagg_port *, struct lagg_reqport *);
140  static void	lagg_if_updown(struct lagg_softc *, bool);
141  static void	lagg_init(void *);
142  static void	lagg_stop(struct lagg_softc *);
143  static int	lagg_ioctl(struct ifnet *, u_long, caddr_t);
144  #if defined(KERN_TLS) || defined(RATELIMIT)
145  static int	lagg_snd_tag_alloc(struct ifnet *,
146  		    union if_snd_tag_alloc_params *,
147  		    struct m_snd_tag **);
148  static int	lagg_snd_tag_modify(struct m_snd_tag *,
149  		    union if_snd_tag_modify_params *);
150  static int	lagg_snd_tag_query(struct m_snd_tag *,
151  		    union if_snd_tag_query_params *);
152  static void	lagg_snd_tag_free(struct m_snd_tag *);
153  static struct m_snd_tag *lagg_next_snd_tag(struct m_snd_tag *);
154  static void	lagg_ratelimit_query(struct ifnet *,
155  		    struct if_ratelimit_query_results *);
156  #endif
157  static int	lagg_setmulti(struct lagg_port *);
158  static int	lagg_clrmulti(struct lagg_port *);
159  static void	lagg_setcaps(struct lagg_port *, int cap, int cap2);
160  static int	lagg_setflag(struct lagg_port *, int, int,
161  		    int (*func)(struct ifnet *, int));
162  static int	lagg_setflags(struct lagg_port *, int status);
163  static uint64_t lagg_get_counter(struct ifnet *ifp, ift_counter cnt);
164  static int	lagg_transmit_ethernet(struct ifnet *, struct mbuf *);
165  static int	lagg_transmit_infiniband(struct ifnet *, struct mbuf *);
166  static void	lagg_qflush(struct ifnet *);
167  static int	lagg_media_change(struct ifnet *);
168  static void	lagg_media_status(struct ifnet *, struct ifmediareq *);
169  static struct lagg_port *lagg_link_active(struct lagg_softc *,
170  		    struct lagg_port *);
171  
172  /* Simple round robin */
173  static void	lagg_rr_attach(struct lagg_softc *);
174  static int	lagg_rr_start(struct lagg_softc *, struct mbuf *);
175  
176  /* Active failover */
177  static int	lagg_fail_start(struct lagg_softc *, struct mbuf *);
178  static struct mbuf *lagg_fail_input(struct lagg_softc *, struct lagg_port *,
179  		    struct mbuf *);
180  
181  /* Loadbalancing */
182  static void	lagg_lb_attach(struct lagg_softc *);
183  static void	lagg_lb_detach(struct lagg_softc *);
184  static int	lagg_lb_port_create(struct lagg_port *);
185  static void	lagg_lb_port_destroy(struct lagg_port *);
186  static int	lagg_lb_start(struct lagg_softc *, struct mbuf *);
187  static int	lagg_lb_porttable(struct lagg_softc *, struct lagg_port *);
188  
189  /* Broadcast */
190  static int	lagg_bcast_start(struct lagg_softc *, struct mbuf *);
191  
192  /* 802.3ad LACP */
193  static void	lagg_lacp_attach(struct lagg_softc *);
194  static void	lagg_lacp_detach(struct lagg_softc *);
195  static int	lagg_lacp_start(struct lagg_softc *, struct mbuf *);
196  static struct mbuf *lagg_lacp_input(struct lagg_softc *, struct lagg_port *,
197  		    struct mbuf *);
198  static void	lagg_lacp_lladdr(struct lagg_softc *);
199  
200  /* Default input */
201  static struct mbuf *lagg_default_input(struct lagg_softc *, struct lagg_port *,
202  		    struct mbuf *);
203  
204  /* lagg protocol table */
205  static const struct lagg_proto {
206  	lagg_proto	pr_num;
207  	void		(*pr_attach)(struct lagg_softc *);
208  	void		(*pr_detach)(struct lagg_softc *);
209  	int		(*pr_start)(struct lagg_softc *, struct mbuf *);
210  	struct mbuf *	(*pr_input)(struct lagg_softc *, struct lagg_port *,
211  			    struct mbuf *);
212  	int		(*pr_addport)(struct lagg_port *);
213  	void		(*pr_delport)(struct lagg_port *);
214  	void		(*pr_linkstate)(struct lagg_port *);
215  	void 		(*pr_init)(struct lagg_softc *);
216  	void 		(*pr_stop)(struct lagg_softc *);
217  	void 		(*pr_lladdr)(struct lagg_softc *);
218  	void		(*pr_request)(struct lagg_softc *, void *);
219  	void		(*pr_portreq)(struct lagg_port *, void *);
220  } lagg_protos[] = {
221      {
222  	.pr_num = LAGG_PROTO_NONE
223      },
224      {
225  	.pr_num = LAGG_PROTO_ROUNDROBIN,
226  	.pr_attach = lagg_rr_attach,
227  	.pr_start = lagg_rr_start,
228  	.pr_input = lagg_default_input,
229      },
230      {
231  	.pr_num = LAGG_PROTO_FAILOVER,
232  	.pr_start = lagg_fail_start,
233  	.pr_input = lagg_fail_input,
234      },
235      {
236  	.pr_num = LAGG_PROTO_LOADBALANCE,
237  	.pr_attach = lagg_lb_attach,
238  	.pr_detach = lagg_lb_detach,
239  	.pr_start = lagg_lb_start,
240  	.pr_input = lagg_default_input,
241  	.pr_addport = lagg_lb_port_create,
242  	.pr_delport = lagg_lb_port_destroy,
243      },
244      {
245  	.pr_num = LAGG_PROTO_LACP,
246  	.pr_attach = lagg_lacp_attach,
247  	.pr_detach = lagg_lacp_detach,
248  	.pr_start = lagg_lacp_start,
249  	.pr_input = lagg_lacp_input,
250  	.pr_addport = lacp_port_create,
251  	.pr_delport = lacp_port_destroy,
252  	.pr_linkstate = lacp_linkstate,
253  	.pr_init = lacp_init,
254  	.pr_stop = lacp_stop,
255  	.pr_lladdr = lagg_lacp_lladdr,
256  	.pr_request = lacp_req,
257  	.pr_portreq = lacp_portreq,
258      },
259      {
260  	.pr_num = LAGG_PROTO_BROADCAST,
261  	.pr_start = lagg_bcast_start,
262  	.pr_input = lagg_default_input,
263      },
264  };
265  
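/*
 * A protocol is selected at run time (SIOCSLAGG, e.g. via
 * `ifconfig laggN laggproto lacp`); the lagg_proto_*() wrappers below
 * dispatch through this table using sc->sc_proto and treat NULL
 * entries as no-ops.
 */
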
266  SYSCTL_DECL(_net_link);
267  SYSCTL_NODE(_net_link, OID_AUTO, lagg, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
268      "Link Aggregation");
269  
270  /* Allow input on any failover links */
271  VNET_DEFINE_STATIC(int, lagg_failover_rx_all);
272  #define	V_lagg_failover_rx_all	VNET(lagg_failover_rx_all)
273  SYSCTL_INT(_net_link_lagg, OID_AUTO, failover_rx_all, CTLFLAG_RW | CTLFLAG_VNET,
274      &VNET_NAME(lagg_failover_rx_all), 0,
275      "Accept input from any interface in a failover lagg");
276  
277  /* Default value for using flowid */
278  VNET_DEFINE_STATIC(int, def_use_flowid) = 0;
279  #define	V_def_use_flowid	VNET(def_use_flowid)
280  SYSCTL_INT(_net_link_lagg, OID_AUTO, default_use_flowid,
281      CTLFLAG_RWTUN | CTLFLAG_VNET, &VNET_NAME(def_use_flowid), 0,
282      "Default setting for using flow id for load sharing");
283  
284  /* Default value for using numa */
285  VNET_DEFINE_STATIC(int, def_use_numa) = 1;
286  #define	V_def_use_numa	VNET(def_use_numa)
287  SYSCTL_INT(_net_link_lagg, OID_AUTO, default_use_numa,
288      CTLFLAG_RWTUN | CTLFLAG_VNET, &VNET_NAME(def_use_numa), 0,
289      "Use numa to steer flows");
290  
291  /* Default value for flowid shift */
292  VNET_DEFINE_STATIC(int, def_flowid_shift) = 16;
293  #define	V_def_flowid_shift	VNET(def_flowid_shift)
294  SYSCTL_INT(_net_link_lagg, OID_AUTO, default_flowid_shift,
295      CTLFLAG_RWTUN | CTLFLAG_VNET, &VNET_NAME(def_flowid_shift), 0,
296      "Default setting for flowid shift for load sharing");
297  
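/*
 * Example (illustrative): the defaults above are plain sysctl/tunable
 * knobs, e.g.
 *
 *	sysctl net.link.lagg.default_use_flowid=1
 *	sysctl net.link.lagg.default_flowid_shift=16
 *
 * New values only affect lagg interfaces created afterwards; existing
 * interfaces keep the options copied at creation time (changeable via
 * SIOCSLAGGOPTS).
 */
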
298  static void
299  vnet_lagg_init(const void *unused __unused)
300  {
301  
302  	LAGG_LIST_LOCK_INIT();
303  	SLIST_INIT(&V_lagg_list);
304  	struct if_clone_addreq req = {
305  		.create_f = lagg_clone_create,
306  		.destroy_f = lagg_clone_destroy,
307  		.flags = IFC_F_AUTOUNIT,
308  	};
309  	V_lagg_cloner = ifc_attach_cloner(laggname, &req);
310  }
311  VNET_SYSINIT(vnet_lagg_init, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY,
312      vnet_lagg_init, NULL);
313  
314  static void
315  vnet_lagg_uninit(const void *unused __unused)
316  {
317  
318  	ifc_detach_cloner(V_lagg_cloner);
319  	LAGG_LIST_LOCK_DESTROY();
320  }
321  VNET_SYSUNINIT(vnet_lagg_uninit, SI_SUB_INIT_IF, SI_ORDER_ANY,
322      vnet_lagg_uninit, NULL);
323  
324  static int
325  lagg_modevent(module_t mod, int type, void *data)
326  {
327  
328  	switch (type) {
329  	case MOD_LOAD:
330  		lagg_input_ethernet_p = lagg_input_ethernet;
331  		lagg_input_infiniband_p = lagg_input_infiniband;
332  		lagg_linkstate_p = lagg_port_state;
333  		lagg_detach_cookie = EVENTHANDLER_REGISTER(
334  		    ifnet_departure_event, lagg_port_ifdetach, NULL,
335  		    EVENTHANDLER_PRI_ANY);
336  		break;
337  	case MOD_UNLOAD:
338  		EVENTHANDLER_DEREGISTER(ifnet_departure_event,
339  		    lagg_detach_cookie);
340  		lagg_input_ethernet_p = NULL;
341  		lagg_input_infiniband_p = NULL;
342  		lagg_linkstate_p = NULL;
343  		break;
344  	default:
345  		return (EOPNOTSUPP);
346  	}
347  	return (0);
348  }
349  
350  static moduledata_t lagg_mod = {
351  	"if_lagg",
352  	lagg_modevent,
353  	0
354  };
355  
356  DECLARE_MODULE(if_lagg, lagg_mod, SI_SUB_PSEUDO, SI_ORDER_ANY);
357  MODULE_VERSION(if_lagg, 1);
358  MODULE_DEPEND(if_lagg, if_infiniband, 1, 1, 1);
359  
360  static void
361  lagg_proto_attach(struct lagg_softc *sc, lagg_proto pr)
362  {
363  
364  	LAGG_XLOCK_ASSERT(sc);
365  	KASSERT(sc->sc_proto == LAGG_PROTO_NONE, ("%s: sc %p has proto",
366  	    __func__, sc));
367  
368  	if (sc->sc_ifflags & IFF_DEBUG)
369  		if_printf(sc->sc_ifp, "using proto %u\n", pr);
370  
371  	if (lagg_protos[pr].pr_attach != NULL)
372  		lagg_protos[pr].pr_attach(sc);
373  	sc->sc_proto = pr;
374  }
375  
376  static void
377  lagg_proto_detach(struct lagg_softc *sc)
378  {
379  	lagg_proto pr;
380  
381  	LAGG_XLOCK_ASSERT(sc);
382  	pr = sc->sc_proto;
383  	sc->sc_proto = LAGG_PROTO_NONE;
384  
385  	if (lagg_protos[pr].pr_detach != NULL)
386  		lagg_protos[pr].pr_detach(sc);
387  }
388  
389  static inline int
390  lagg_proto_start(struct lagg_softc *sc, struct mbuf *m)
391  {
392  
393  	return (lagg_protos[sc->sc_proto].pr_start(sc, m));
394  }
395  
396  static inline struct mbuf *
397  lagg_proto_input(struct lagg_softc *sc, struct lagg_port *lp, struct mbuf *m)
398  {
399  
400  	return (lagg_protos[sc->sc_proto].pr_input(sc, lp, m));
401  }
402  
403  static int
404  lagg_proto_addport(struct lagg_softc *sc, struct lagg_port *lp)
405  {
406  
407  	if (lagg_protos[sc->sc_proto].pr_addport == NULL)
408  		return (0);
409  	else
410  		return (lagg_protos[sc->sc_proto].pr_addport(lp));
411  }
412  
413  static void
414  lagg_proto_delport(struct lagg_softc *sc, struct lagg_port *lp)
415  {
416  
417  	if (lagg_protos[sc->sc_proto].pr_delport != NULL)
418  		lagg_protos[sc->sc_proto].pr_delport(lp);
419  }
420  
421  static void
422  lagg_proto_linkstate(struct lagg_softc *sc, struct lagg_port *lp)
423  {
424  
425  	if (lagg_protos[sc->sc_proto].pr_linkstate != NULL)
426  		lagg_protos[sc->sc_proto].pr_linkstate(lp);
427  }
428  
429  static void
430  lagg_proto_init(struct lagg_softc *sc)
431  {
432  
433  	if (lagg_protos[sc->sc_proto].pr_init != NULL)
434  		lagg_protos[sc->sc_proto].pr_init(sc);
435  }
436  
437  static void
438  lagg_proto_stop(struct lagg_softc *sc)
439  {
440  
441  	if (lagg_protos[sc->sc_proto].pr_stop != NULL)
442  		lagg_protos[sc->sc_proto].pr_stop(sc);
443  }
444  
445  static void
446  lagg_proto_lladdr(struct lagg_softc *sc)
447  {
448  
449  	if (lagg_protos[sc->sc_proto].pr_lladdr != NULL)
450  		lagg_protos[sc->sc_proto].pr_lladdr(sc);
451  }
452  
453  static void
454  lagg_proto_request(struct lagg_softc *sc, void *v)
455  {
456  
457  	if (lagg_protos[sc->sc_proto].pr_request != NULL)
458  		lagg_protos[sc->sc_proto].pr_request(sc, v);
459  }
460  
461  static void
462  lagg_proto_portreq(struct lagg_softc *sc, struct lagg_port *lp, void *v)
463  {
464  
465  	if (lagg_protos[sc->sc_proto].pr_portreq != NULL)
466  		lagg_protos[sc->sc_proto].pr_portreq(lp, v);
467  }
468  
469  /*
470   * This routine is run via a vlan
471   * config EVENT
472   */
473  static void
474  lagg_register_vlan(void *arg, struct ifnet *ifp, u_int16_t vtag)
475  {
476  	struct lagg_softc *sc = ifp->if_softc;
477  	struct lagg_port *lp;
478  
479  	if (ifp->if_softc != arg) /* Not our event */
480  		return;
481  
482  	LAGG_XLOCK(sc);
483  	CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries)
484  		EVENTHANDLER_INVOKE(vlan_config, lp->lp_ifp, vtag);
485  	LAGG_XUNLOCK(sc);
486  }
487  
488  /*
489   * This routine is run via a vlan
490   * unconfig EVENT
491   */
492  static void
493  lagg_unregister_vlan(void *arg, struct ifnet *ifp, u_int16_t vtag)
494  {
495  	struct lagg_softc *sc = ifp->if_softc;
496  	struct lagg_port *lp;
497  
498  	if (ifp->if_softc != arg) /* Not our event */
499  		return;
500  
501  	LAGG_XLOCK(sc);
502  	CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries)
503  		EVENTHANDLER_INVOKE(vlan_unconfig, lp->lp_ifp, vtag);
504  	LAGG_XUNLOCK(sc);
505  }
506  
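/*
 * Create a new lagg interface through the cloner registered in
 * vnet_lagg_init(), e.g. in response to `ifconfig laggN create`.
 * An optional iflaggparam supplied by the caller selects the interface
 * type (Ethernet or InfiniBand); without it an Ethernet lagg is created.
 */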
507  static int
508  lagg_clone_create(struct if_clone *ifc, char *name, size_t len,
509      struct ifc_data *ifd, struct ifnet **ifpp)
510  {
511  	struct iflaggparam iflp;
512  	struct lagg_softc *sc;
513  	struct ifnet *ifp;
514  	int if_type;
515  	int error;
516  	static const uint8_t eaddr[LAGG_ADDR_LEN];
517  
518  	if (ifd->params != NULL) {
519  		error = ifc_copyin(ifd, &iflp, sizeof(iflp));
520  		if (error)
521  			return (error);
522  
523  		switch (iflp.lagg_type) {
524  		case LAGG_TYPE_ETHERNET:
525  			if_type = IFT_ETHER;
526  			break;
527  		case LAGG_TYPE_INFINIBAND:
528  			if_type = IFT_INFINIBAND;
529  			break;
530  		default:
531  			return (EINVAL);
532  		}
533  	} else {
534  		if_type = IFT_ETHER;
535  	}
536  
537  	sc = malloc(sizeof(*sc), M_LAGG, M_WAITOK | M_ZERO);
538  	ifp = sc->sc_ifp = if_alloc(if_type);
539  	if (ifp == NULL) {
540  		free(sc, M_LAGG);
541  		return (ENOSPC);
542  	}
543  	LAGG_SX_INIT(sc);
544  
545  	mtx_init(&sc->sc_mtx, "lagg-mtx", NULL, MTX_DEF);
546  	callout_init_mtx(&sc->sc_watchdog, &sc->sc_mtx, 0);
547  
548  	LAGG_XLOCK(sc);
549  	if (V_def_use_flowid)
550  		sc->sc_opts |= LAGG_OPT_USE_FLOWID;
551  	if (V_def_use_numa)
552  		sc->sc_opts |= LAGG_OPT_USE_NUMA;
553  	sc->flowid_shift = V_def_flowid_shift;
554  
555  	/* Hash all layers by default */
556  	sc->sc_flags = MBUF_HASHFLAG_L2 | MBUF_HASHFLAG_L3 | MBUF_HASHFLAG_L4;
557  
558  	lagg_proto_attach(sc, LAGG_PROTO_DEFAULT);
559  
560  	CK_SLIST_INIT(&sc->sc_ports);
561  
562  	switch (if_type) {
563  	case IFT_ETHER:
564  		/* Initialise pseudo media types */
565  		ifmedia_init(&sc->sc_media, 0, lagg_media_change,
566  		    lagg_media_status);
567  		ifmedia_add(&sc->sc_media, IFM_ETHER | IFM_AUTO, 0, NULL);
568  		ifmedia_set(&sc->sc_media, IFM_ETHER | IFM_AUTO);
569  
570  		if_initname(ifp, laggname, ifd->unit);
571  		ifp->if_transmit = lagg_transmit_ethernet;
572  		break;
573  	case IFT_INFINIBAND:
574  		if_initname(ifp, laggname, ifd->unit);
575  		ifp->if_transmit = lagg_transmit_infiniband;
576  		break;
577  	default:
578  		break;
579  	}
580  	ifp->if_softc = sc;
581  	ifp->if_qflush = lagg_qflush;
582  	ifp->if_init = lagg_init;
583  	ifp->if_ioctl = lagg_ioctl;
584  	ifp->if_get_counter = lagg_get_counter;
585  	ifp->if_flags = IFF_SIMPLEX | IFF_BROADCAST | IFF_MULTICAST;
586  #if defined(KERN_TLS) || defined(RATELIMIT)
587  	ifp->if_snd_tag_alloc = lagg_snd_tag_alloc;
588  	ifp->if_ratelimit_query = lagg_ratelimit_query;
589  #endif
590  	ifp->if_capenable = ifp->if_capabilities = IFCAP_HWSTATS;
591  
592  	/*
593  	 * Attach as an ordinary ethernet device; children will be attached
594  	 * as special devices of type IFT_IEEE8023ADLAG or IFT_INFINIBANDLAG.
595  	 */
596  	switch (if_type) {
597  	case IFT_ETHER:
598  		ether_ifattach(ifp, eaddr);
599  		break;
600  	case IFT_INFINIBAND:
601  		infiniband_ifattach(ifp, eaddr, sc->sc_bcast_addr);
602  		break;
603  	default:
604  		break;
605  	}
606  
607  	sc->vlan_attach = EVENTHANDLER_REGISTER(vlan_config,
608  		lagg_register_vlan, sc, EVENTHANDLER_PRI_FIRST);
609  	sc->vlan_detach = EVENTHANDLER_REGISTER(vlan_unconfig,
610  		lagg_unregister_vlan, sc, EVENTHANDLER_PRI_FIRST);
611  
612  	/* Insert into the global list of laggs */
613  	LAGG_LIST_LOCK();
614  	SLIST_INSERT_HEAD(&V_lagg_list, sc, sc_entries);
615  	LAGG_LIST_UNLOCK();
616  	LAGG_XUNLOCK(sc);
617  	*ifpp = ifp;
618  
619  	return (0);
620  }
621  
622  static int
623  lagg_clone_destroy(struct if_clone *ifc, struct ifnet *ifp, uint32_t flags)
624  {
625  	struct lagg_softc *sc = (struct lagg_softc *)ifp->if_softc;
626  	struct lagg_port *lp;
627  
628  	LAGG_XLOCK(sc);
629  	sc->sc_destroying = 1;
630  	lagg_stop(sc);
631  	ifp->if_flags &= ~IFF_UP;
632  
633  	EVENTHANDLER_DEREGISTER(vlan_config, sc->vlan_attach);
634  	EVENTHANDLER_DEREGISTER(vlan_unconfig, sc->vlan_detach);
635  
636  	/* Shutdown and remove lagg ports */
637  	while ((lp = CK_SLIST_FIRST(&sc->sc_ports)) != NULL)
638  		lagg_port_destroy(lp, 1);
639  
640  	/* Unhook the aggregation protocol */
641  	lagg_proto_detach(sc);
642  	LAGG_XUNLOCK(sc);
643  
644  	switch (ifp->if_type) {
645  	case IFT_ETHER:
646  		ifmedia_removeall(&sc->sc_media);
647  		ether_ifdetach(ifp);
648  		break;
649  	case IFT_INFINIBAND:
650  		infiniband_ifdetach(ifp);
651  		break;
652  	default:
653  		break;
654  	}
655  	if_free(ifp);
656  
657  	LAGG_LIST_LOCK();
658  	SLIST_REMOVE(&V_lagg_list, sc, lagg_softc, sc_entries);
659  	LAGG_LIST_UNLOCK();
660  
661  	mtx_destroy(&sc->sc_mtx);
662  	LAGG_SX_DESTROY(sc);
663  	free(sc, M_LAGG);
664  
665  	return (0);
666  }
667  
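/*
 * Recompute the lagg interface's capabilities as the intersection of
 * its ports' capabilities.  The enabled-capability loop below iterates
 * to a fixed point because disabling one capability on a port may
 * implicitly disable another.  For example (illustrative flags only):
 * if one port has TXCSUM|TSO4 enabled and another only TXCSUM, the
 * common enabled set converges to TXCSUM and is applied back to every
 * port via lagg_setcaps().
 */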
668  static void
669  lagg_capabilities(struct lagg_softc *sc)
670  {
671  	struct lagg_port *lp;
672  	int cap, cap2, ena, ena2, pena, pena2;
673  	uint64_t hwa;
674  	struct ifnet_hw_tsomax hw_tsomax;
675  
676  	LAGG_XLOCK_ASSERT(sc);
677  
678  	/* Get common enabled capabilities for the lagg ports */
679  	ena = ena2 = ~0;
680  	CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) {
681  		ena &= lp->lp_ifp->if_capenable;
682  		ena2 &= lp->lp_ifp->if_capenable2;
683  	}
684  	if (CK_SLIST_FIRST(&sc->sc_ports) == NULL)
685  		ena = ena2 = 0;
686  
687  	/*
688  	 * Apply common enabled capabilities back to the lagg ports.
689  	 * May require several iterations if they are dependent.
690  	 */
691  	do {
692  		pena = ena;
693  		pena2 = ena2;
694  		CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) {
695  			lagg_setcaps(lp, ena, ena2);
696  			ena &= lp->lp_ifp->if_capenable;
697  			ena2 &= lp->lp_ifp->if_capenable2;
698  		}
699  	} while (pena != ena || pena2 != ena2);
700  
701  	/* Get other capabilities from the lagg ports */
702  	cap = cap2 = ~0;
703  	hwa = ~(uint64_t)0;
704  	memset(&hw_tsomax, 0, sizeof(hw_tsomax));
705  	CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) {
706  		cap &= lp->lp_ifp->if_capabilities;
707  		cap2 &= lp->lp_ifp->if_capabilities2;
708  		hwa &= lp->lp_ifp->if_hwassist;
709  		if_hw_tsomax_common(lp->lp_ifp, &hw_tsomax);
710  	}
711  	if (CK_SLIST_FIRST(&sc->sc_ports) == NULL)
712  		cap = cap2 = hwa = 0;
713  
714  	if (sc->sc_ifp->if_capabilities != cap ||
715  	    sc->sc_ifp->if_capenable != ena ||
716  	    sc->sc_ifp->if_capenable2 != ena2 ||
717  	    sc->sc_ifp->if_hwassist != hwa ||
718  	    if_hw_tsomax_update(sc->sc_ifp, &hw_tsomax) != 0) {
719  		sc->sc_ifp->if_capabilities = cap;
720  		sc->sc_ifp->if_capabilities2 = cap2;
721  		sc->sc_ifp->if_capenable = ena;
722  		sc->sc_ifp->if_capenable2 = ena2;
723  		sc->sc_ifp->if_hwassist = hwa;
724  		getmicrotime(&sc->sc_ifp->if_lastchange);
725  
726  		if (sc->sc_ifflags & IFF_DEBUG)
727  			if_printf(sc->sc_ifp,
728  			    "capabilities 0x%08x enabled 0x%08x\n", cap, ena);
729  	}
730  }
731  
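/*
 * Attach one port to the lagg.  Outline (informal): the first member
 * defines the lagg MTU and link-level address, later members are
 * coerced to the existing MTU via SIOCSIFMTU; the port's if_type,
 * if_ioctl and if_output are overridden so ioctls and direct output
 * are routed through the lagg; finally the protocol's pr_addport hook
 * runs and the lagg capabilities and link state are recomputed.
 */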
732  static int
733  lagg_port_create(struct lagg_softc *sc, struct ifnet *ifp)
734  {
735  	struct lagg_softc *sc_ptr;
736  	struct lagg_port *lp, *tlp;
737  	struct ifreq ifr;
738  	int error, i, oldmtu;
739  	int if_type;
740  	uint64_t *pval;
741  
742  	LAGG_XLOCK_ASSERT(sc);
743  
744  	if (sc->sc_ifp == ifp) {
745  		if_printf(sc->sc_ifp,
746  		    "cannot add a lagg to itself as a port\n");
747  		return (EINVAL);
748  	}
749  
750  	if (sc->sc_destroying == 1)
751  		return (ENXIO);
752  
753  	/* Limit the maximum number of lagg ports */
754  	if (sc->sc_count >= LAGG_MAX_PORTS)
755  		return (ENOSPC);
756  
757  	/* Check if port has already been associated with a lagg */
758  	if (ifp->if_lagg != NULL) {
759  		/* Port is already in the current lagg? */
760  		lp = (struct lagg_port *)ifp->if_lagg;
761  		if (lp->lp_softc == sc)
762  			return (EEXIST);
763  		return (EBUSY);
764  	}
765  
766  	switch (sc->sc_ifp->if_type) {
767  	case IFT_ETHER:
768  		/* XXX Disallow non-ethernet interfaces (this should be any of 802) */
769  		if (ifp->if_type != IFT_ETHER && ifp->if_type != IFT_L2VLAN)
770  			return (EPROTONOSUPPORT);
771  		if_type = IFT_IEEE8023ADLAG;
772  		break;
773  	case IFT_INFINIBAND:
774  		/* XXX Disallow non-infiniband interfaces */
775  		if (ifp->if_type != IFT_INFINIBAND)
776  			return (EPROTONOSUPPORT);
777  		if_type = IFT_INFINIBANDLAG;
778  		break;
779  	default:
780  		break;
781  	}
782  
783  	/* Allow the first Ethernet member to define the MTU */
784  	oldmtu = -1;
785  	if (CK_SLIST_EMPTY(&sc->sc_ports)) {
786  		sc->sc_ifp->if_mtu = ifp->if_mtu;
787  	} else if (sc->sc_ifp->if_mtu != ifp->if_mtu) {
788  		if (ifp->if_ioctl == NULL) {
789  			if_printf(sc->sc_ifp, "cannot change MTU for %s\n",
790  			    ifp->if_xname);
791  			return (EINVAL);
792  		}
793  		oldmtu = ifp->if_mtu;
794  		strlcpy(ifr.ifr_name, ifp->if_xname, sizeof(ifr.ifr_name));
795  		ifr.ifr_mtu = sc->sc_ifp->if_mtu;
796  		error = (*ifp->if_ioctl)(ifp, SIOCSIFMTU, (caddr_t)&ifr);
797  		if (error != 0) {
798  			if_printf(sc->sc_ifp, "invalid MTU for %s\n",
799  			    ifp->if_xname);
800  			return (error);
801  		}
802  		ifr.ifr_mtu = oldmtu;
803  	}
804  
805  	lp = malloc(sizeof(struct lagg_port), M_LAGG, M_WAITOK | M_ZERO);
806  	lp->lp_softc = sc;
807  
808  	/* Check if port is a stacked lagg */
809  	LAGG_LIST_LOCK();
810  	SLIST_FOREACH(sc_ptr, &V_lagg_list, sc_entries) {
811  		if (ifp == sc_ptr->sc_ifp) {
812  			LAGG_LIST_UNLOCK();
813  			free(lp, M_LAGG);
814  			if (oldmtu != -1)
815  				(*ifp->if_ioctl)(ifp, SIOCSIFMTU,
816  				    (caddr_t)&ifr);
817  			return (EINVAL);
818  			/* XXX disable stacking for the moment, it's untested */
819  #ifdef LAGG_PORT_STACKING
820  			lp->lp_flags |= LAGG_PORT_STACK;
821  			if (lagg_port_checkstacking(sc_ptr) >=
822  			    LAGG_MAX_STACKING) {
823  				LAGG_LIST_UNLOCK();
824  				free(lp, M_LAGG);
825  				if (oldmtu != -1)
826  					(*ifp->if_ioctl)(ifp, SIOCSIFMTU,
827  					    (caddr_t)&ifr);
828  				return (E2BIG);
829  			}
830  #endif
831  		}
832  	}
833  	LAGG_LIST_UNLOCK();
834  
835  	if_ref(ifp);
836  	lp->lp_ifp = ifp;
837  
838  	bcopy(IF_LLADDR(ifp), lp->lp_lladdr, ifp->if_addrlen);
839  	lp->lp_ifcapenable = ifp->if_capenable;
840  	if (CK_SLIST_EMPTY(&sc->sc_ports)) {
841  		bcopy(IF_LLADDR(ifp), IF_LLADDR(sc->sc_ifp), ifp->if_addrlen);
842  		lagg_proto_lladdr(sc);
843  		EVENTHANDLER_INVOKE(iflladdr_event, sc->sc_ifp);
844  	} else {
845  		if_setlladdr(ifp, IF_LLADDR(sc->sc_ifp), ifp->if_addrlen);
846  	}
847  	lagg_setflags(lp, 1);
848  
849  	if (CK_SLIST_EMPTY(&sc->sc_ports))
850  		sc->sc_primary = lp;
851  
852  	/* Change the interface type */
853  	lp->lp_iftype = ifp->if_type;
854  	ifp->if_type = if_type;
855  	ifp->if_lagg = lp;
856  	lp->lp_ioctl = ifp->if_ioctl;
857  	ifp->if_ioctl = lagg_port_ioctl;
858  	lp->lp_output = ifp->if_output;
859  	ifp->if_output = lagg_port_output;
860  
861  	/* Read port counters */
862  	pval = lp->port_counters.val;
863  	for (i = 0; i < IFCOUNTERS; i++, pval++)
864  		*pval = ifp->if_get_counter(ifp, i);
865  
866  	/*
867  	 * Insert into the list of ports.
868  	 * Keep ports sorted by if_index.  This keeps the configuration
869  	 * predictable, so that the same `ifconfig laggN create ...`
870  	 * command leads to the same result each time.
871  	 */
872  	CK_SLIST_FOREACH(tlp, &sc->sc_ports, lp_entries) {
873  		if (tlp->lp_ifp->if_index < ifp->if_index && (
874  		    CK_SLIST_NEXT(tlp, lp_entries) == NULL ||
875  		    ((struct lagg_port*)CK_SLIST_NEXT(tlp, lp_entries))->lp_ifp->if_index >
876  		    ifp->if_index))
877  			break;
878  	}
879  	if (tlp != NULL)
880  		CK_SLIST_INSERT_AFTER(tlp, lp, lp_entries);
881  	else
882  		CK_SLIST_INSERT_HEAD(&sc->sc_ports, lp, lp_entries);
883  	sc->sc_count++;
884  
885  	lagg_setmulti(lp);
886  
887  	if ((error = lagg_proto_addport(sc, lp)) != 0) {
888  		/* Remove the port, without calling pr_delport. */
889  		lagg_port_destroy(lp, 0);
890  		if (oldmtu != -1)
891  			(*ifp->if_ioctl)(ifp, SIOCSIFMTU, (caddr_t)&ifr);
892  		return (error);
893  	}
894  
895  	/* Update lagg capabilities */
896  	lagg_capabilities(sc);
897  	lagg_linkstate(sc);
898  
899  	return (0);
900  }
901  
902  #ifdef LAGG_PORT_STACKING
903  static int
904  lagg_port_checkstacking(struct lagg_softc *sc)
905  {
906  	struct lagg_softc *sc_ptr;
907  	struct lagg_port *lp;
908  	int m = 0;
909  
910  	LAGG_SXLOCK_ASSERT(sc);
911  	CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) {
912  		if (lp->lp_flags & LAGG_PORT_STACK) {
913  			sc_ptr = (struct lagg_softc *)lp->lp_ifp->if_softc;
914  			m = MAX(m, lagg_port_checkstacking(sc_ptr));
915  		}
916  	}
917  
918  	return (m + 1);
919  }
920  #endif
921  
922  static void
923  lagg_port_destroy_cb(epoch_context_t ec)
924  {
925  	struct lagg_port *lp;
926  	struct ifnet *ifp;
927  
928  	lp = __containerof(ec, struct lagg_port, lp_epoch_ctx);
929  	ifp = lp->lp_ifp;
930  
931  	if_rele(ifp);
932  	free(lp, M_LAGG);
933  }
934  
935  static int
936  lagg_port_destroy(struct lagg_port *lp, int rundelport)
937  {
938  	struct lagg_softc *sc = lp->lp_softc;
939  	struct lagg_port *lp_ptr, *lp0;
940  	struct ifnet *ifp = lp->lp_ifp;
941  	uint64_t *pval, vdiff;
942  	int i;
943  
944  	LAGG_XLOCK_ASSERT(sc);
945  
946  	if (rundelport)
947  		lagg_proto_delport(sc, lp);
948  
949  	if (lp->lp_detaching == 0)
950  		lagg_clrmulti(lp);
951  
952  	/* Restore interface */
953  	ifp->if_type = lp->lp_iftype;
954  	ifp->if_ioctl = lp->lp_ioctl;
955  	ifp->if_output = lp->lp_output;
956  	ifp->if_lagg = NULL;
957  
958  	/* Update detached port counters */
959  	pval = lp->port_counters.val;
960  	for (i = 0; i < IFCOUNTERS; i++, pval++) {
961  		vdiff = ifp->if_get_counter(ifp, i) - *pval;
962  		sc->detached_counters.val[i] += vdiff;
963  	}
964  
965  	/* Finally, remove the port from the lagg */
966  	CK_SLIST_REMOVE(&sc->sc_ports, lp, lagg_port, lp_entries);
967  	sc->sc_count--;
968  
969  	/* Update the primary interface */
970  	if (lp == sc->sc_primary) {
971  		uint8_t lladdr[LAGG_ADDR_LEN];
972  
973  		if ((lp0 = CK_SLIST_FIRST(&sc->sc_ports)) == NULL)
974  			bzero(&lladdr, LAGG_ADDR_LEN);
975  		else
976  			bcopy(lp0->lp_lladdr, lladdr, LAGG_ADDR_LEN);
977  		sc->sc_primary = lp0;
978  		if (sc->sc_destroying == 0) {
979  			bcopy(lladdr, IF_LLADDR(sc->sc_ifp), sc->sc_ifp->if_addrlen);
980  			lagg_proto_lladdr(sc);
981  			EVENTHANDLER_INVOKE(iflladdr_event, sc->sc_ifp);
982  
983  			/*
984  			 * Update lladdr for each port (new primary needs update
985  			 * as well, to switch from old lladdr to its 'real' one).
986  			 * We can skip this if the lagg is being destroyed.
987  			 */
988  			CK_SLIST_FOREACH(lp_ptr, &sc->sc_ports, lp_entries)
989  				if_setlladdr(lp_ptr->lp_ifp, lladdr,
990  				    lp_ptr->lp_ifp->if_addrlen);
991  		}
992  	}
993  
994  	if (lp->lp_ifflags)
995  		if_printf(ifp, "%s: lp_ifflags unclean\n", __func__);
996  
997  	if (lp->lp_detaching == 0) {
998  		lagg_setflags(lp, 0);
999  		lagg_setcaps(lp, lp->lp_ifcapenable, lp->lp_ifcapenable2);
1000  		if_setlladdr(ifp, lp->lp_lladdr, ifp->if_addrlen);
1001  	}
1002  
1003  	/*
1004  	 * Free the port and release its ifnet reference after a grace period has
1005  	 * elapsed.
1006  	 */
1007  	NET_EPOCH_CALL(lagg_port_destroy_cb, &lp->lp_epoch_ctx);
1008  	/* Update lagg capabilities */
1009  	lagg_capabilities(sc);
1010  	lagg_linkstate(sc);
1011  
1012  	return (0);
1013  }
1014  
1015  static int
1016  lagg_port_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
1017  {
1018  	struct lagg_reqport *rp = (struct lagg_reqport *)data;
1019  	struct lagg_softc *sc;
1020  	struct lagg_port *lp = NULL;
1021  	int error = 0;
1022  
1023  	/* Should be checked by the caller */
1024  	switch (ifp->if_type) {
1025  	case IFT_IEEE8023ADLAG:
1026  	case IFT_INFINIBANDLAG:
1027  		if ((lp = ifp->if_lagg) == NULL || (sc = lp->lp_softc) == NULL)
1028  			goto fallback;
1029  		break;
1030  	default:
1031  		goto fallback;
1032  	}
1033  
1034  	switch (cmd) {
1035  	case SIOCGLAGGPORT:
1036  		if (rp->rp_portname[0] == '\0' ||
1037  		    ifunit(rp->rp_portname) != ifp) {
1038  			error = EINVAL;
1039  			break;
1040  		}
1041  
1042  		LAGG_SLOCK(sc);
1043  		if (__predict_true((lp = ifp->if_lagg) != NULL &&
1044  		    lp->lp_softc == sc))
1045  			lagg_port2req(lp, rp);
1046  		else
1047  			error = ENOENT;	/* XXXGL: can happen? */
1048  		LAGG_SUNLOCK(sc);
1049  		break;
1050  
1051  	case SIOCSIFCAP:
1052  	case SIOCSIFCAPNV:
1053  		if (lp->lp_ioctl == NULL) {
1054  			error = EINVAL;
1055  			break;
1056  		}
1057  		error = (*lp->lp_ioctl)(ifp, cmd, data);
1058  		if (error)
1059  			break;
1060  
1061  		/* Update lagg interface capabilities */
1062  		LAGG_XLOCK(sc);
1063  		lagg_capabilities(sc);
1064  		LAGG_XUNLOCK(sc);
1065  		VLAN_CAPABILITIES(sc->sc_ifp);
1066  		break;
1067  
1068  	case SIOCSIFMTU:
1069  		/* Do not allow the MTU to be changed once joined */
1070  		error = EINVAL;
1071  		break;
1072  
1073  	default:
1074  		goto fallback;
1075  	}
1076  
1077  	return (error);
1078  
1079  fallback:
1080  	if (lp != NULL && lp->lp_ioctl != NULL)
1081  		return ((*lp->lp_ioctl)(ifp, cmd, data));
1082  
1083  	return (EINVAL);
1084  }
1085  
1086  /*
1087   * Requests counter @cnt data.
1088   *
1089   * The counter value is calculated in the following way:
1090   * 1) for each port, sum the difference between its current and "initial" measurements.
1091   * 2) add the lagg logical interface counters.
1092   * 3) add data from the detached_counters array.
1093   *
1094   * We also do the following on port attach/detach:
1095   * 1) On port attach we store all of the port's counters in the port_counters array.
1096   * 2) On port detach we add the difference between the "initial" and
1097   *    current counter data to the detached_counters array.
1098   */
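/*
 * Illustrative example (hypothetical numbers): a port that reported
 * 1000 input packets when it was attached and reports 1500 now
 * contributes 500 to the sum; if that port is later detached, the same
 * 500 is folded into detached_counters, so the total reported by the
 * lagg does not jump backwards.
 */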
1099  static uint64_t
1100  lagg_get_counter(struct ifnet *ifp, ift_counter cnt)
1101  {
1102  	struct epoch_tracker et;
1103  	struct lagg_softc *sc;
1104  	struct lagg_port *lp;
1105  	struct ifnet *lpifp;
1106  	uint64_t newval, oldval, vsum;
1107  
1108  	/* Revise this when we've got non-generic counters. */
1109  	KASSERT(cnt < IFCOUNTERS, ("%s: invalid cnt %d", __func__, cnt));
1110  
1111  	sc = (struct lagg_softc *)ifp->if_softc;
1112  
1113  	vsum = 0;
1114  	NET_EPOCH_ENTER(et);
1115  	CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) {
1116  		/* Value saved at attach time */
1117  		oldval = lp->port_counters.val[cnt];
1118  		/* current value */
1119  		lpifp = lp->lp_ifp;
1120  		newval = lpifp->if_get_counter(lpifp, cnt);
1121  		/* Accumulate the difference */
1122  		vsum += newval - oldval;
1123  	}
1124  	NET_EPOCH_EXIT(et);
1125  
1126  	/*
1127  	 * Add counter data which might be added by upper
1128  	 * layer protocols operating on the logical interface.
1129  	 */
1130  	vsum += if_get_counter_default(ifp, cnt);
1131  
1132  	/*
1133  	 * Add counter data from the detached ports' counters
1134  	 */
1135  	vsum += sc->detached_counters.val[cnt];
1136  
1137  	return (vsum);
1138  }
1139  
1140  /*
1141   * For direct output to child ports.
1142   */
1143  static int
1144  lagg_port_output(struct ifnet *ifp, struct mbuf *m,
1145  	const struct sockaddr *dst, struct route *ro)
1146  {
1147  	struct lagg_port *lp = ifp->if_lagg;
1148  
1149  	switch (dst->sa_family) {
1150  		case pseudo_AF_HDRCMPLT:
1151  		case AF_UNSPEC:
1152  			if (lp != NULL)
1153  				return ((*lp->lp_output)(ifp, m, dst, ro));
1154  	}
1155  
1156  	/* drop any other frames */
1157  	m_freem(m);
1158  	return (ENETDOWN);
1159  }
1160  
1161  static void
1162  lagg_port_ifdetach(void *arg __unused, struct ifnet *ifp)
1163  {
1164  	struct lagg_port *lp;
1165  	struct lagg_softc *sc;
1166  
1167  	if ((lp = ifp->if_lagg) == NULL)
1168  		return;
1169  	/* If the ifnet is just being renamed, don't do anything. */
1170  	if (ifp->if_flags & IFF_RENAMING)
1171  		return;
1172  
1173  	sc = lp->lp_softc;
1174  
1175  	LAGG_XLOCK(sc);
1176  	lp->lp_detaching = 1;
1177  	lagg_port_destroy(lp, 1);
1178  	LAGG_XUNLOCK(sc);
1179  	VLAN_CAPABILITIES(sc->sc_ifp);
1180  }
1181  
1182  static void
1183  lagg_port2req(struct lagg_port *lp, struct lagg_reqport *rp)
1184  {
1185  	struct lagg_softc *sc = lp->lp_softc;
1186  
1187  	strlcpy(rp->rp_ifname, sc->sc_ifname, sizeof(rp->rp_ifname));
1188  	strlcpy(rp->rp_portname, lp->lp_ifp->if_xname, sizeof(rp->rp_portname));
1189  	rp->rp_prio = lp->lp_prio;
1190  	rp->rp_flags = lp->lp_flags;
1191  	lagg_proto_portreq(sc, lp, &rp->rp_psc);
1192  
1193  	/* Add protocol specific flags */
1194  	switch (sc->sc_proto) {
1195  		case LAGG_PROTO_FAILOVER:
1196  			if (lp == sc->sc_primary)
1197  				rp->rp_flags |= LAGG_PORT_MASTER;
1198  			if (lp == lagg_link_active(sc, sc->sc_primary))
1199  				rp->rp_flags |= LAGG_PORT_ACTIVE;
1200  			break;
1201  
1202  		case LAGG_PROTO_ROUNDROBIN:
1203  		case LAGG_PROTO_LOADBALANCE:
1204  		case LAGG_PROTO_BROADCAST:
1205  			if (LAGG_PORTACTIVE(lp))
1206  				rp->rp_flags |= LAGG_PORT_ACTIVE;
1207  			break;
1208  
1209  		case LAGG_PROTO_LACP:
1210  			/* LACP has a different definition of active */
1211  			if (lacp_isactive(lp))
1212  				rp->rp_flags |= LAGG_PORT_ACTIVE;
1213  			if (lacp_iscollecting(lp))
1214  				rp->rp_flags |= LAGG_PORT_COLLECTING;
1215  			if (lacp_isdistributing(lp))
1216  				rp->rp_flags |= LAGG_PORT_DISTRIBUTING;
1217  			break;
1218  	}
1219  
1220  }
1221  
1222  static void
1223  lagg_watchdog_infiniband(void *arg)
1224  {
1225  	struct epoch_tracker et;
1226  	struct lagg_softc *sc;
1227  	struct lagg_port *lp;
1228  	struct ifnet *ifp;
1229  	struct ifnet *lp_ifp;
1230  
1231  	sc = arg;
1232  
1233  	/*
1234  	 * Because infiniband nodes have a fixed MAC address, which is
1235  	 * generated by the so-called GID, we need to regularly update
1236  	 * the link level address of the parent lagg<N> device when
1237  	 * the active port changes. Possibly we could piggy-back on
1238  	 * link up/down events as well, but using a timer also provides
1239  	 * a guarantee against too frequent events. This operation
1240  	 * does not have to be atomic.
1241  	 */
1242  	NET_EPOCH_ENTER(et);
1243  	lp = lagg_link_active(sc, sc->sc_primary);
1244  	if (lp != NULL) {
1245  		ifp = sc->sc_ifp;
1246  		lp_ifp = lp->lp_ifp;
1247  
1248  		if (ifp != NULL && lp_ifp != NULL &&
1249  		    (memcmp(IF_LLADDR(ifp), IF_LLADDR(lp_ifp), ifp->if_addrlen) != 0 ||
1250  		     memcmp(sc->sc_bcast_addr, lp_ifp->if_broadcastaddr, ifp->if_addrlen) != 0)) {
1251  			memcpy(IF_LLADDR(ifp), IF_LLADDR(lp_ifp), ifp->if_addrlen);
1252  			memcpy(sc->sc_bcast_addr, lp_ifp->if_broadcastaddr, ifp->if_addrlen);
1253  
1254  			CURVNET_SET(ifp->if_vnet);
1255  			EVENTHANDLER_INVOKE(iflladdr_event, ifp);
1256  			CURVNET_RESTORE();
1257  		}
1258  	}
1259  	NET_EPOCH_EXIT(et);
1260  
1261  	callout_reset(&sc->sc_watchdog, hz, &lagg_watchdog_infiniband, arg);
1262  }
1263  
1264  static void
1265  lagg_if_updown(struct lagg_softc *sc, bool up)
1266  {
1267  	struct ifreq ifr = {};
1268  	struct lagg_port *lp;
1269  
1270  	LAGG_XLOCK_ASSERT(sc);
1271  
1272  	CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) {
1273  		if (up)
1274  			if_up(lp->lp_ifp);
1275  		else
1276  			if_down(lp->lp_ifp);
1277  
1278  		if (lp->lp_ioctl != NULL)
1279  			lp->lp_ioctl(lp->lp_ifp, SIOCSIFFLAGS, (caddr_t)&ifr);
1280  	}
1281  }
1282  
1283  static void
1284  lagg_init(void *xsc)
1285  {
1286  	struct lagg_softc *sc = (struct lagg_softc *)xsc;
1287  	struct ifnet *ifp = sc->sc_ifp;
1288  	struct lagg_port *lp;
1289  
1290  	LAGG_XLOCK(sc);
1291  	if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
1292  		LAGG_XUNLOCK(sc);
1293  		return;
1294  	}
1295  
1296  	ifp->if_drv_flags |= IFF_DRV_RUNNING;
1297  
1298  	/*
1299  	 * Update the port lladdrs if needed.
1300  	 * We may be called in response to an if_setlladdr()
1301  	 * notification that the lladdr has been changed.
1302  	 */
1303  	CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) {
1304  		if (memcmp(IF_LLADDR(ifp), IF_LLADDR(lp->lp_ifp),
1305  		    ifp->if_addrlen) != 0)
1306  			if_setlladdr(lp->lp_ifp, IF_LLADDR(ifp), ifp->if_addrlen);
1307  	}
1308  
1309  	lagg_if_updown(sc, true);
1310  
1311  	lagg_proto_init(sc);
1312  
1313  	if (ifp->if_type == IFT_INFINIBAND) {
1314  		mtx_lock(&sc->sc_mtx);
1315  		lagg_watchdog_infiniband(sc);
1316  		mtx_unlock(&sc->sc_mtx);
1317  	}
1318  
1319  	LAGG_XUNLOCK(sc);
1320  }
1321  
1322  static void
1323  lagg_stop(struct lagg_softc *sc)
1324  {
1325  	struct ifnet *ifp = sc->sc_ifp;
1326  
1327  	LAGG_XLOCK_ASSERT(sc);
1328  
1329  	if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0)
1330  		return;
1331  
1332  	ifp->if_drv_flags &= ~IFF_DRV_RUNNING;
1333  
1334  	lagg_proto_stop(sc);
1335  
1336  	mtx_lock(&sc->sc_mtx);
1337  	callout_stop(&sc->sc_watchdog);
1338  	mtx_unlock(&sc->sc_mtx);
1339  
1340  	lagg_if_updown(sc, false);
1341  
1342  	callout_drain(&sc->sc_watchdog);
1343  }
1344  
1345  static int
1346  lagg_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
1347  {
1348  	struct lagg_softc *sc = (struct lagg_softc *)ifp->if_softc;
1349  	struct lagg_reqall *ra = (struct lagg_reqall *)data;
1350  	struct lagg_reqopts *ro = (struct lagg_reqopts *)data;
1351  	struct lagg_reqport *rp = (struct lagg_reqport *)data, rpbuf;
1352  	struct lagg_reqflags *rf = (struct lagg_reqflags *)data;
1353  	struct ifreq *ifr = (struct ifreq *)data;
1354  	struct lagg_port *lp;
1355  	struct ifnet *tpif;
1356  	struct thread *td = curthread;
1357  	char *buf, *outbuf;
1358  	int count, buflen, len, error = 0, oldmtu;
1359  
1360  	bzero(&rpbuf, sizeof(rpbuf));
1361  
1362  	/* XXX: This can race with lagg_clone_destroy. */
1363  
1364  	switch (cmd) {
1365  	case SIOCGLAGG:
1366  		LAGG_XLOCK(sc);
1367  		buflen = sc->sc_count * sizeof(struct lagg_reqport);
1368  		outbuf = malloc(buflen, M_TEMP, M_WAITOK | M_ZERO);
1369  		ra->ra_proto = sc->sc_proto;
1370  		lagg_proto_request(sc, &ra->ra_psc);
1371  		count = 0;
1372  		buf = outbuf;
1373  		len = min(ra->ra_size, buflen);
1374  		CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) {
1375  			if (len < sizeof(rpbuf))
1376  				break;
1377  
1378  			lagg_port2req(lp, &rpbuf);
1379  			memcpy(buf, &rpbuf, sizeof(rpbuf));
1380  			count++;
1381  			buf += sizeof(rpbuf);
1382  			len -= sizeof(rpbuf);
1383  		}
1384  		LAGG_XUNLOCK(sc);
1385  		ra->ra_ports = count;
1386  		ra->ra_size = count * sizeof(rpbuf);
1387  		error = copyout(outbuf, ra->ra_port, ra->ra_size);
1388  		free(outbuf, M_TEMP);
1389  		break;
1390  	case SIOCSLAGG:
1391  		error = priv_check(td, PRIV_NET_LAGG);
1392  		if (error)
1393  			break;
1394  		if (ra->ra_proto >= LAGG_PROTO_MAX) {
1395  			error = EPROTONOSUPPORT;
1396  			break;
1397  		}
1398  		/* Infiniband only supports the failover protocol. */
1399  		if (ra->ra_proto != LAGG_PROTO_FAILOVER &&
1400  		    ifp->if_type == IFT_INFINIBAND) {
1401  			error = EPROTONOSUPPORT;
1402  			break;
1403  		}
1404  		LAGG_XLOCK(sc);
1405  		lagg_proto_detach(sc);
1406  		lagg_proto_attach(sc, ra->ra_proto);
1407  		LAGG_XUNLOCK(sc);
1408  		break;
1409  	case SIOCGLAGGOPTS:
1410  		LAGG_XLOCK(sc);
1411  		ro->ro_opts = sc->sc_opts;
1412  		if (sc->sc_proto == LAGG_PROTO_LACP) {
1413  			struct lacp_softc *lsc;
1414  
1415  			lsc = (struct lacp_softc *)sc->sc_psc;
1416  			if (lsc->lsc_debug.lsc_tx_test != 0)
1417  				ro->ro_opts |= LAGG_OPT_LACP_TXTEST;
1418  			if (lsc->lsc_debug.lsc_rx_test != 0)
1419  				ro->ro_opts |= LAGG_OPT_LACP_RXTEST;
1420  			if (lsc->lsc_strict_mode != 0)
1421  				ro->ro_opts |= LAGG_OPT_LACP_STRICT;
1422  			if (lsc->lsc_fast_timeout != 0)
1423  				ro->ro_opts |= LAGG_OPT_LACP_FAST_TIMO;
1424  
1425  			ro->ro_active = sc->sc_active;
1426  		} else {
1427  			ro->ro_active = 0;
1428  			CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries)
1429  				ro->ro_active += LAGG_PORTACTIVE(lp);
1430  		}
1431  		ro->ro_bkt = sc->sc_stride;
1432  		ro->ro_flapping = sc->sc_flapping;
1433  		ro->ro_flowid_shift = sc->flowid_shift;
1434  		LAGG_XUNLOCK(sc);
1435  		break;
1436  	case SIOCSLAGGOPTS:
1437  		error = priv_check(td, PRIV_NET_LAGG);
1438  		if (error)
1439  			break;
1440  
1441  		/*
1442  		 * The stride option was added without defining a corresponding
1443  		 * LAGG_OPT flag, so handle a non-zero value before checking
1444  		 * anything else to preserve compatibility.
1445  		 */
1446  		LAGG_XLOCK(sc);
1447  		if (ro->ro_opts == 0 && ro->ro_bkt != 0) {
1448  			if (sc->sc_proto != LAGG_PROTO_ROUNDROBIN) {
1449  				LAGG_XUNLOCK(sc);
1450  				error = EINVAL;
1451  				break;
1452  			}
1453  			sc->sc_stride = ro->ro_bkt;
1454  		}
1455  		if (ro->ro_opts == 0) {
1456  			LAGG_XUNLOCK(sc);
1457  			break;
1458  		}
1459  
1460  		/*
1461  		 * Set options.  LACP options are stored in sc->sc_psc,
1462  		 * not in sc_opts.
1463  		 */
1464  		int valid, lacp;
1465  
1466  		switch (ro->ro_opts) {
1467  		case LAGG_OPT_USE_FLOWID:
1468  		case -LAGG_OPT_USE_FLOWID:
1469  		case LAGG_OPT_USE_NUMA:
1470  		case -LAGG_OPT_USE_NUMA:
1471  		case LAGG_OPT_FLOWIDSHIFT:
1472  		case LAGG_OPT_RR_LIMIT:
1473  			valid = 1;
1474  			lacp = 0;
1475  			break;
1476  		case LAGG_OPT_LACP_TXTEST:
1477  		case -LAGG_OPT_LACP_TXTEST:
1478  		case LAGG_OPT_LACP_RXTEST:
1479  		case -LAGG_OPT_LACP_RXTEST:
1480  		case LAGG_OPT_LACP_STRICT:
1481  		case -LAGG_OPT_LACP_STRICT:
1482  		case LAGG_OPT_LACP_FAST_TIMO:
1483  		case -LAGG_OPT_LACP_FAST_TIMO:
1484  			valid = lacp = 1;
1485  			break;
1486  		default:
1487  			valid = lacp = 0;
1488  			break;
1489  		}
1490  
1491  		if (valid == 0 ||
1492  		    (lacp == 1 && sc->sc_proto != LAGG_PROTO_LACP)) {
1493  			/* Invalid combination of options specified. */
1494  			error = EINVAL;
1495  			LAGG_XUNLOCK(sc);
1496  			break;	/* Return from SIOCSLAGGOPTS. */
1497  		}
1498  
1499  		/*
1500  		 * Store new options into sc->sc_opts except for
1501  		 * FLOWIDSHIFT, RR and LACP options.
1502  		 */
1503  		if (lacp == 0) {
1504  			if (ro->ro_opts == LAGG_OPT_FLOWIDSHIFT)
1505  				sc->flowid_shift = ro->ro_flowid_shift;
1506  			else if (ro->ro_opts == LAGG_OPT_RR_LIMIT) {
1507  				if (sc->sc_proto != LAGG_PROTO_ROUNDROBIN ||
1508  				    ro->ro_bkt == 0) {
1509  					error = EINVAL;
1510  					LAGG_XUNLOCK(sc);
1511  					break;
1512  				}
1513  				sc->sc_stride = ro->ro_bkt;
1514  			} else if (ro->ro_opts > 0)
1515  				sc->sc_opts |= ro->ro_opts;
1516  			else
1517  				sc->sc_opts &= ~ro->ro_opts;
1518  		} else {
1519  			struct lacp_softc *lsc;
1520  			struct lacp_port *lp;
1521  
1522  			lsc = (struct lacp_softc *)sc->sc_psc;
1523  
1524  			switch (ro->ro_opts) {
1525  			case LAGG_OPT_LACP_TXTEST:
1526  				lsc->lsc_debug.lsc_tx_test = 1;
1527  				break;
1528  			case -LAGG_OPT_LACP_TXTEST:
1529  				lsc->lsc_debug.lsc_tx_test = 0;
1530  				break;
1531  			case LAGG_OPT_LACP_RXTEST:
1532  				lsc->lsc_debug.lsc_rx_test = 1;
1533  				break;
1534  			case -LAGG_OPT_LACP_RXTEST:
1535  				lsc->lsc_debug.lsc_rx_test = 0;
1536  				break;
1537  			case LAGG_OPT_LACP_STRICT:
1538  				lsc->lsc_strict_mode = 1;
1539  				break;
1540  			case -LAGG_OPT_LACP_STRICT:
1541  				lsc->lsc_strict_mode = 0;
1542  				break;
1543  			case LAGG_OPT_LACP_FAST_TIMO:
1544  				LACP_LOCK(lsc);
1545  				LIST_FOREACH(lp, &lsc->lsc_ports, lp_next)
1546  					lp->lp_state |= LACP_STATE_TIMEOUT;
1547  				LACP_UNLOCK(lsc);
1548  				lsc->lsc_fast_timeout = 1;
1549  				break;
1550  			case -LAGG_OPT_LACP_FAST_TIMO:
1551  				LACP_LOCK(lsc);
1552  				LIST_FOREACH(lp, &lsc->lsc_ports, lp_next)
1553  					lp->lp_state &= ~LACP_STATE_TIMEOUT;
1554  				LACP_UNLOCK(lsc);
1555  				lsc->lsc_fast_timeout = 0;
1556  				break;
1557  			}
1558  		}
1559  		LAGG_XUNLOCK(sc);
1560  		break;
1561  	case SIOCGLAGGFLAGS:
1562  		rf->rf_flags = 0;
1563  		LAGG_XLOCK(sc);
1564  		if (sc->sc_flags & MBUF_HASHFLAG_L2)
1565  			rf->rf_flags |= LAGG_F_HASHL2;
1566  		if (sc->sc_flags & MBUF_HASHFLAG_L3)
1567  			rf->rf_flags |= LAGG_F_HASHL3;
1568  		if (sc->sc_flags & MBUF_HASHFLAG_L4)
1569  			rf->rf_flags |= LAGG_F_HASHL4;
1570  		LAGG_XUNLOCK(sc);
1571  		break;
1572  	case SIOCSLAGGHASH:
1573  		error = priv_check(td, PRIV_NET_LAGG);
1574  		if (error)
1575  			break;
1576  		if ((rf->rf_flags & LAGG_F_HASHMASK) == 0) {
1577  			error = EINVAL;
1578  			break;
1579  		}
1580  		LAGG_XLOCK(sc);
1581  		sc->sc_flags = 0;
1582  		if (rf->rf_flags & LAGG_F_HASHL2)
1583  			sc->sc_flags |= MBUF_HASHFLAG_L2;
1584  		if (rf->rf_flags & LAGG_F_HASHL3)
1585  			sc->sc_flags |= MBUF_HASHFLAG_L3;
1586  		if (rf->rf_flags & LAGG_F_HASHL4)
1587  			sc->sc_flags |= MBUF_HASHFLAG_L4;
1588  		LAGG_XUNLOCK(sc);
1589  		break;
1590  	case SIOCGLAGGPORT:
1591  		if (rp->rp_portname[0] == '\0' ||
1592  		    (tpif = ifunit_ref(rp->rp_portname)) == NULL) {
1593  			error = EINVAL;
1594  			break;
1595  		}
1596  
1597  		LAGG_SLOCK(sc);
1598  		if (__predict_true((lp = tpif->if_lagg) != NULL &&
1599  		    lp->lp_softc == sc))
1600  			lagg_port2req(lp, rp);
1601  		else
1602  			error = ENOENT;	/* XXXGL: can happen? */
1603  		LAGG_SUNLOCK(sc);
1604  		if_rele(tpif);
1605  		break;
1606  
1607  	case SIOCSLAGGPORT:
1608  		error = priv_check(td, PRIV_NET_LAGG);
1609  		if (error)
1610  			break;
1611  		if (rp->rp_portname[0] == '\0' ||
1612  		    (tpif = ifunit_ref(rp->rp_portname)) == NULL) {
1613  			error = EINVAL;
1614  			break;
1615  		}
1616  #ifdef INET6
1617  		/*
1618  		 * A laggport interface should not have an inet6 address
1619  		 * because two interfaces with a valid link-local
1620  		 * scope zone must not be merged in any form.  This
1621  		 * restriction is needed to prevent a violation of the
1622  		 * link-local scope zone.  Attempting to add a laggport
1623  		 * interface which has inet6 addresses therefore triggers
1624  		 * removal of all inet6 addresses on the member
1625  		 * interface.
1626  		 */
1627  		if (in6ifa_llaonifp(tpif)) {
1628  			in6_ifdetach(tpif);
1629  			if_printf(sc->sc_ifp,
1630  			    "IPv6 addresses on %s have been removed "
1631  			    "before adding it as a member to prevent "
1632  			    "IPv6 address scope violation.\n",
1633  			    tpif->if_xname);
1634  		}
1635  #endif
1636  		oldmtu = ifp->if_mtu;
1637  		LAGG_XLOCK(sc);
1638  		error = lagg_port_create(sc, tpif);
1639  		LAGG_XUNLOCK(sc);
1640  		if_rele(tpif);
1641  
1642  		/*
1643  		 * LAGG MTU may change during addition of the first port.
1644  		 * If it did, run the network-layer-specific procedures.
1645  		 */
1646  		if (ifp->if_mtu != oldmtu)
1647  			if_notifymtu(ifp);
1648  
1649  		VLAN_CAPABILITIES(ifp);
1650  		break;
1651  	case SIOCSLAGGDELPORT:
1652  		error = priv_check(td, PRIV_NET_LAGG);
1653  		if (error)
1654  			break;
1655  		if (rp->rp_portname[0] == '\0' ||
1656  		    (tpif = ifunit_ref(rp->rp_portname)) == NULL) {
1657  			error = EINVAL;
1658  			break;
1659  		}
1660  
1661  		LAGG_XLOCK(sc);
1662  		if ((lp = (struct lagg_port *)tpif->if_lagg) == NULL ||
1663  		    lp->lp_softc != sc) {
1664  			error = ENOENT;
1665  			LAGG_XUNLOCK(sc);
1666  			if_rele(tpif);
1667  			break;
1668  		}
1669  
1670  		error = lagg_port_destroy(lp, 1);
1671  		LAGG_XUNLOCK(sc);
1672  		if_rele(tpif);
1673  		VLAN_CAPABILITIES(ifp);
1674  		break;
1675  	case SIOCSIFFLAGS:
1676  		/* Set flags on ports too */
1677  		LAGG_XLOCK(sc);
1678  		CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) {
1679  			lagg_setflags(lp, 1);
1680  		}
1681  
1682  		if (!(ifp->if_flags & IFF_UP) &&
1683  		    (ifp->if_drv_flags & IFF_DRV_RUNNING)) {
1684  			/*
1685  			 * If interface is marked down and it is running,
1686  			 * then stop and disable it.
1687  			 */
1688  			lagg_stop(sc);
1689  			LAGG_XUNLOCK(sc);
1690  		} else if ((ifp->if_flags & IFF_UP) &&
1691  		    !(ifp->if_drv_flags & IFF_DRV_RUNNING)) {
1692  			/*
1693  			 * If interface is marked up and it is stopped, then
1694  			 * start it.
1695  			 */
1696  			LAGG_XUNLOCK(sc);
1697  			(*ifp->if_init)(sc);
1698  		} else
1699  			LAGG_XUNLOCK(sc);
1700  		break;
1701  	case SIOCADDMULTI:
1702  	case SIOCDELMULTI:
1703  		LAGG_XLOCK(sc);
1704  		CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) {
1705  			lagg_clrmulti(lp);
1706  			lagg_setmulti(lp);
1707  		}
1708  		LAGG_XUNLOCK(sc);
1709  		error = 0;
1710  		break;
1711  	case SIOCSIFMEDIA:
1712  	case SIOCGIFMEDIA:
1713  		if (ifp->if_type == IFT_INFINIBAND)
1714  			error = EINVAL;
1715  		else
1716  			error = ifmedia_ioctl(ifp, ifr, &sc->sc_media, cmd);
1717  		break;
1718  
1719  	case SIOCSIFCAP:
1720  	case SIOCSIFCAPNV:
1721  		LAGG_XLOCK(sc);
1722  		CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) {
1723  			if (lp->lp_ioctl != NULL)
1724  				(*lp->lp_ioctl)(lp->lp_ifp, cmd, data);
1725  		}
1726  		lagg_capabilities(sc);
1727  		LAGG_XUNLOCK(sc);
1728  		VLAN_CAPABILITIES(ifp);
1729  		error = 0;
1730  		break;
1731  
1732  	case SIOCGIFCAPNV:
1733  		error = 0;
1734  		break;
1735  
1736  	case SIOCSIFMTU:
1737  		LAGG_XLOCK(sc);
1738  		CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) {
1739  			if (lp->lp_ioctl != NULL)
1740  				error = (*lp->lp_ioctl)(lp->lp_ifp, cmd, data);
1741  			else
1742  				error = EINVAL;
1743  			if (error != 0) {
1744  				if_printf(ifp,
1745  				    "failed to change MTU to %d on port %s, "
1746  				    "reverting all ports to original MTU (%d)\n",
1747  				    ifr->ifr_mtu, lp->lp_ifp->if_xname, ifp->if_mtu);
1748  				break;
1749  			}
1750  		}
1751  		if (error == 0) {
1752  			ifp->if_mtu = ifr->ifr_mtu;
1753  		} else {
1754  			/* set every port back to the original MTU */
1755  			ifr->ifr_mtu = ifp->if_mtu;
1756  			CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) {
1757  				if (lp->lp_ioctl != NULL)
1758  					(*lp->lp_ioctl)(lp->lp_ifp, cmd, data);
1759  			}
1760  		}
1761  		lagg_capabilities(sc);
1762  		LAGG_XUNLOCK(sc);
1763  		VLAN_CAPABILITIES(ifp);
1764  		break;
1765  
1766  	default:
1767  		error = ether_ioctl(ifp, cmd, data);
1768  		break;
1769  	}
1770  	return (error);
1771  }
1772  
1773  #if defined(KERN_TLS) || defined(RATELIMIT)
1774  #ifdef RATELIMIT
1775  static const struct if_snd_tag_sw lagg_snd_tag_ul_sw = {
1776  	.snd_tag_modify = lagg_snd_tag_modify,
1777  	.snd_tag_query = lagg_snd_tag_query,
1778  	.snd_tag_free = lagg_snd_tag_free,
1779  	.next_snd_tag = lagg_next_snd_tag,
1780  	.type = IF_SND_TAG_TYPE_UNLIMITED
1781  };
1782  
1783  static const struct if_snd_tag_sw lagg_snd_tag_rl_sw = {
1784  	.snd_tag_modify = lagg_snd_tag_modify,
1785  	.snd_tag_query = lagg_snd_tag_query,
1786  	.snd_tag_free = lagg_snd_tag_free,
1787  	.next_snd_tag = lagg_next_snd_tag,
1788  	.type = IF_SND_TAG_TYPE_RATE_LIMIT
1789  };
1790  #endif
1791  
1792  #ifdef KERN_TLS
1793  static const struct if_snd_tag_sw lagg_snd_tag_tls_sw = {
1794  	.snd_tag_modify = lagg_snd_tag_modify,
1795  	.snd_tag_query = lagg_snd_tag_query,
1796  	.snd_tag_free = lagg_snd_tag_free,
1797  	.next_snd_tag = lagg_next_snd_tag,
1798  	.type = IF_SND_TAG_TYPE_TLS
1799  };
1800  
1801  #ifdef RATELIMIT
1802  static const struct if_snd_tag_sw lagg_snd_tag_tls_rl_sw = {
1803  	.snd_tag_modify = lagg_snd_tag_modify,
1804  	.snd_tag_query = lagg_snd_tag_query,
1805  	.snd_tag_free = lagg_snd_tag_free,
1806  	.next_snd_tag = lagg_next_snd_tag,
1807  	.type = IF_SND_TAG_TYPE_TLS_RATE_LIMIT
1808  };
1809  #endif
1810  #endif
1811  
1812  static inline struct lagg_snd_tag *
1813  mst_to_lst(struct m_snd_tag *mst)
1814  {
1815  
1816  	return (__containerof(mst, struct lagg_snd_tag, com));
1817  }
1818  
1819  /*
1820   * Look up the port used by a specific flow.  This only works for lagg
1821   * protocols with deterministic port mappings (e.g. not roundrobin).
1822   * In addition, protocols that use a hash to map flows to ports must
1823   * be configured to use the mbuf flowid rather than hashing packet
1824   * contents.
1825   */
1826  static struct lagg_port *
1827  lookup_snd_tag_port(struct ifnet *ifp, uint32_t flowid, uint32_t flowtype,
1828      uint8_t numa_domain)
1829  {
1830  	struct lagg_softc *sc;
1831  	struct lagg_port *lp;
1832  	struct lagg_lb *lb;
1833  	uint32_t hash, p;
1834  	int err;
1835  
1836  	sc = ifp->if_softc;
1837  
1838  	switch (sc->sc_proto) {
1839  	case LAGG_PROTO_FAILOVER:
1840  		return (lagg_link_active(sc, sc->sc_primary));
1841  	case LAGG_PROTO_LOADBALANCE:
1842  		if ((sc->sc_opts & LAGG_OPT_USE_FLOWID) == 0 ||
1843  		    flowtype == M_HASHTYPE_NONE)
1844  			return (NULL);
1845  		p = flowid >> sc->flowid_shift;
1846  		p %= sc->sc_count;
1847  		lb = (struct lagg_lb *)sc->sc_psc;
1848  		lp = lb->lb_ports[p];
1849  		return (lagg_link_active(sc, lp));
1850  	case LAGG_PROTO_LACP:
1851  		if ((sc->sc_opts & LAGG_OPT_USE_FLOWID) == 0 ||
1852  		    flowtype == M_HASHTYPE_NONE)
1853  			return (NULL);
1854  		hash = flowid >> sc->flowid_shift;
1855  		return (lacp_select_tx_port_by_hash(sc, hash, numa_domain, &err));
1856  	default:
1857  		return (NULL);
1858  	}
1859  }
1860  
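/*
 * Allocate a send tag on the port that the flow maps to.  Except for
 * TLS receive tags, the port tag is wrapped in a lagg_snd_tag so that
 * later modify/query/free calls can be forwarded to the port.
 */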
1861  static int
1862  lagg_snd_tag_alloc(struct ifnet *ifp,
1863      union if_snd_tag_alloc_params *params,
1864      struct m_snd_tag **ppmt)
1865  {
1866  	struct epoch_tracker et;
1867  	const struct if_snd_tag_sw *sw;
1868  	struct lagg_snd_tag *lst;
1869  	struct lagg_port *lp;
1870  	struct ifnet *lp_ifp;
1871  	struct m_snd_tag *mst;
1872  	int error;
1873  
1874  	switch (params->hdr.type) {
1875  #ifdef RATELIMIT
1876  	case IF_SND_TAG_TYPE_UNLIMITED:
1877  		sw = &lagg_snd_tag_ul_sw;
1878  		break;
1879  	case IF_SND_TAG_TYPE_RATE_LIMIT:
1880  		sw = &lagg_snd_tag_rl_sw;
1881  		break;
1882  #endif
1883  #ifdef KERN_TLS
1884  	case IF_SND_TAG_TYPE_TLS:
1885  		sw = &lagg_snd_tag_tls_sw;
1886  		break;
1887  	case IF_SND_TAG_TYPE_TLS_RX:
1888  		/* Return tag from port interface directly. */
1889  		sw = NULL;
1890  		break;
1891  #ifdef RATELIMIT
1892  	case IF_SND_TAG_TYPE_TLS_RATE_LIMIT:
1893  		sw = &lagg_snd_tag_tls_rl_sw;
1894  		break;
1895  #endif
1896  #endif
1897  	default:
1898  		return (EOPNOTSUPP);
1899  	}
1900  
1901  	NET_EPOCH_ENTER(et);
1902  	lp = lookup_snd_tag_port(ifp, params->hdr.flowid,
1903  	    params->hdr.flowtype, params->hdr.numa_domain);
1904  	if (lp == NULL) {
1905  		NET_EPOCH_EXIT(et);
1906  		return (EOPNOTSUPP);
1907  	}
1908  	if (lp->lp_ifp == NULL) {
1909  		NET_EPOCH_EXIT(et);
1910  		return (EOPNOTSUPP);
1911  	}
1912  	lp_ifp = lp->lp_ifp;
1913  	if_ref(lp_ifp);
1914  	NET_EPOCH_EXIT(et);
1915  
1916  	if (sw != NULL) {
1917  		lst = malloc(sizeof(*lst), M_LAGG, M_NOWAIT);
1918  		if (lst == NULL) {
1919  			if_rele(lp_ifp);
1920  			return (ENOMEM);
1921  		}
1922  	} else
1923  		lst = NULL;
1924  
1925  	error = m_snd_tag_alloc(lp_ifp, params, &mst);
1926  	if_rele(lp_ifp);
1927  	if (error) {
1928  		free(lst, M_LAGG);
1929  		return (error);
1930  	}
1931  
1932  	if (sw != NULL) {
1933  		m_snd_tag_init(&lst->com, ifp, sw);
1934  		lst->tag = mst;
1935  
1936  		*ppmt = &lst->com;
1937  	} else
1938  		*ppmt = mst;
1939  
1940  	return (0);
1941  }
1942  
1943  static struct m_snd_tag *
1944  lagg_next_snd_tag(struct m_snd_tag *mst)
1945  {
1946  	struct lagg_snd_tag *lst;
1947  
1948  	lst = mst_to_lst(mst);
1949  	return (lst->tag);
1950  }
1951  
1952  static int
1953  lagg_snd_tag_modify(struct m_snd_tag *mst,
1954      union if_snd_tag_modify_params *params)
1955  {
1956  	struct lagg_snd_tag *lst;
1957  
1958  	lst = mst_to_lst(mst);
1959  	return (lst->tag->sw->snd_tag_modify(lst->tag, params));
1960  }
1961  
1962  static int
1963  lagg_snd_tag_query(struct m_snd_tag *mst,
1964      union if_snd_tag_query_params *params)
1965  {
1966  	struct lagg_snd_tag *lst;
1967  
1968  	lst = mst_to_lst(mst);
1969  	return (lst->tag->sw->snd_tag_query(lst->tag, params));
1970  }
1971  
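/* Release the wrapped port tag and free the lagg wrapper itself. */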
1972  static void
1973  lagg_snd_tag_free(struct m_snd_tag *mst)
1974  {
1975  	struct lagg_snd_tag *lst;
1976  
1977  	lst = mst_to_lst(mst);
1978  	m_snd_tag_rele(lst->tag);
1979  	free(lst, M_LAGG);
1980  }
1981  
1982  static void
1983  lagg_ratelimit_query(struct ifnet *ifp __unused, struct if_ratelimit_query_results *q)
1984  {
1985  	/*
1986  	 * For lagg, we have an indirect interface.  The caller needs to
1987  	 * get a ratelimit tag on the actual interface the flow will go
1988  	 * on.
1989  	 */
1991  	q->rate_table = NULL;
1992  	q->flags = RT_IS_INDIRECT;
1993  	q->max_flows = 0;
1994  	q->number_of_rates = 0;
1995  }
1996  #endif
1997  
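/*
 * Program a lagg port with the multicast filter of the lagg interface:
 * copy each link-layer multicast address into the port's lp_mc_head
 * list and join it on the port.
 */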
1998  static int
1999  lagg_setmulti(struct lagg_port *lp)
2000  {
2001  	struct lagg_softc *sc = lp->lp_softc;
2002  	struct ifnet *ifp = lp->lp_ifp;
2003  	struct ifnet *scifp = sc->sc_ifp;
2004  	struct lagg_mc *mc;
2005  	struct ifmultiaddr *ifma;
2006  	int error;
2007  
2008  	IF_ADDR_WLOCK(scifp);
2009  	CK_STAILQ_FOREACH(ifma, &scifp->if_multiaddrs, ifma_link) {
2010  		if (ifma->ifma_addr->sa_family != AF_LINK)
2011  			continue;
2012  		mc = malloc(sizeof(struct lagg_mc), M_LAGG, M_NOWAIT);
2013  		if (mc == NULL) {
2014  			IF_ADDR_WUNLOCK(scifp);
2015  			return (ENOMEM);
2016  		}
2017  		bcopy(ifma->ifma_addr, &mc->mc_addr, ifma->ifma_addr->sa_len);
2018  		mc->mc_addr.sdl_index = ifp->if_index;
2019  		mc->mc_ifma = NULL;
2020  		SLIST_INSERT_HEAD(&lp->lp_mc_head, mc, mc_entries);
2021  	}
2022  	IF_ADDR_WUNLOCK(scifp);
2023  	SLIST_FOREACH (mc, &lp->lp_mc_head, mc_entries) {
2024  		error = if_addmulti(ifp,
2025  		    (struct sockaddr *)&mc->mc_addr, &mc->mc_ifma);
2026  		if (error)
2027  			return (error);
2028  	}
2029  	return (0);
2030  }
2031  
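/* Drop all multicast memberships previously added by lagg_setmulti(). */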
2032  static int
2033  lagg_clrmulti(struct lagg_port *lp)
2034  {
2035  	struct lagg_mc *mc;
2036  
2037  	LAGG_XLOCK_ASSERT(lp->lp_softc);
2038  	while ((mc = SLIST_FIRST(&lp->lp_mc_head)) != NULL) {
2039  		SLIST_REMOVE(&lp->lp_mc_head, mc, lagg_mc, mc_entries);
2040  		if (mc->mc_ifma && lp->lp_detaching == 0)
2041  			if_delmulti_ifma(mc->mc_ifma);
2042  		free(mc, M_LAGG);
2043  	}
2044  	return (0);
2045  }
2046  
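/*
 * Apply the requested capability bits to a lagg port, using the
 * nvlist-based SIOCSIFCAPNV ioctl when the port driver supports it and
 * plain SIOCSIFCAP otherwise.
 */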
2047  static void
2048  lagg_setcaps(struct lagg_port *lp, int cap, int cap2)
2049  {
2050  	struct ifreq ifr;
2051  	struct siocsifcapnv_driver_data drv_ioctl_data;
2052  
2053  	if (lp->lp_ifp->if_capenable == cap &&
2054  	    lp->lp_ifp->if_capenable2 == cap2)
2055  		return;
2056  	if (lp->lp_ioctl == NULL)
2057  		return;
2058  	/* XXX */
2059  	if ((lp->lp_ifp->if_capabilities & IFCAP_NV) != 0) {
2060  		drv_ioctl_data.reqcap = cap;
2061  		drv_ioctl_data.reqcap2 = cap2;
2062  		drv_ioctl_data.nvcap = NULL;
2063  		(*lp->lp_ioctl)(lp->lp_ifp, SIOCSIFCAPNV,
2064  		    (caddr_t)&drv_ioctl_data);
2065  	} else {
2066  		ifr.ifr_reqcap = cap;
2067  		(*lp->lp_ioctl)(lp->lp_ifp, SIOCSIFCAP, (caddr_t)&ifr);
2068  	}
2069  }
2070  
2071  /* Handle a ref-counted flag that should be set on the lagg port as well */
2072  static int
2073  lagg_setflag(struct lagg_port *lp, int flag, int status,
2074      int (*func)(struct ifnet *, int))
2075  {
2076  	struct lagg_softc *sc = lp->lp_softc;
2077  	struct ifnet *scifp = sc->sc_ifp;
2078  	struct ifnet *ifp = lp->lp_ifp;
2079  	int error;
2080  
2081  	LAGG_XLOCK_ASSERT(sc);
2082  
2083  	status = status ? (scifp->if_flags & flag) : 0;
2084  	/* Now "status" contains the flag value or 0 */
2085  
2086  	/*
2087  	 * See if the recorded port status differs from what we want
2088  	 * it to be.  If it does, flip it.  We record the port status
2089  	 * in lp_ifflags so that we won't clear a port flag we haven't
2090  	 * set.  In fact, we don't set or clear port flags directly;
2091  	 * we acquire or release references to them instead.  That is
2092  	 * why we can be sure the recorded flags stay in accord with
2093  	 * the actual port flags.
2094  	 */
2095  	if (status != (lp->lp_ifflags & flag)) {
2096  		error = (*func)(ifp, status);
2097  		if (error)
2098  			return (error);
2099  		lp->lp_ifflags &= ~flag;
2100  		lp->lp_ifflags |= status;
2101  	}
2102  	return (0);
2103  }
2104  
2105  /*
2106   * Handle IFF_* flags that require certain changes on the lagg port:
2107   * if "status" is true, update the port flags to match the lagg;
2108   * if "status" is false, forcibly clear the flags set on the port.
2109   */
2110  static int
2111  lagg_setflags(struct lagg_port *lp, int status)
2112  {
2113  	int error, i;
2114  
2115  	for (i = 0; lagg_pflags[i].flag; i++) {
2116  		error = lagg_setflag(lp, lagg_pflags[i].flag,
2117  		    status, lagg_pflags[i].func);
2118  		if (error)
2119  			return (error);
2120  	}
2121  	return (0);
2122  }
2123  
2124  static int
2125  lagg_transmit_ethernet(struct ifnet *ifp, struct mbuf *m)
2126  {
2127  	struct lagg_softc *sc = (struct lagg_softc *)ifp->if_softc;
2128  
2129  	NET_EPOCH_ASSERT();
2130  #if defined(KERN_TLS) || defined(RATELIMIT)
2131  	if (m->m_pkthdr.csum_flags & CSUM_SND_TAG)
2132  		MPASS(m->m_pkthdr.snd_tag->ifp == ifp);
2133  #endif
2134  	/* We need a Tx algorithm and at least one port */
2135  	if (sc->sc_proto == LAGG_PROTO_NONE || sc->sc_count == 0) {
2136  		m_freem(m);
2137  		if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
2138  		return (ENXIO);
2139  	}
2140  
2141  	ETHER_BPF_MTAP(ifp, m);
2142  
2143  	return (lagg_proto_start(sc, m));
2144  }
2145  
2146  static int
2147  lagg_transmit_infiniband(struct ifnet *ifp, struct mbuf *m)
2148  {
2149  	struct lagg_softc *sc = (struct lagg_softc *)ifp->if_softc;
2150  
2151  	NET_EPOCH_ASSERT();
2152  #if defined(KERN_TLS) || defined(RATELIMIT)
2153  	if (m->m_pkthdr.csum_flags & CSUM_SND_TAG)
2154  		MPASS(m->m_pkthdr.snd_tag->ifp == ifp);
2155  #endif
2156  	/* We need a Tx algorithm and at least one port */
2157  	if (sc->sc_proto == LAGG_PROTO_NONE || sc->sc_count == 0) {
2158  		m_freem(m);
2159  		if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
2160  		return (ENXIO);
2161  	}
2162  
2163  	infiniband_bpf_mtap(ifp, m);
2164  
2165  	return (lagg_proto_start(sc, m));
2166  }
2167  
2168  /*
2169   * The ifp->if_qflush entry point for lagg(4) is a no-op.
2170   */
2171  static void
2172  lagg_qflush(struct ifnet *ifp __unused)
2173  {
2174  }
2175  
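/*
 * Receive path for Ethernet lagg ports: run the mbuf through the
 * protocol input routine and hand the result back for further
 * processing, dropping it if the lagg is in monitor mode.
 */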
2176  static struct mbuf *
2177  lagg_input_ethernet(struct ifnet *ifp, struct mbuf *m)
2178  {
2179  	struct lagg_port *lp = ifp->if_lagg;
2180  	struct lagg_softc *sc = lp->lp_softc;
2181  	struct ifnet *scifp = sc->sc_ifp;
2182  
2183  	NET_EPOCH_ASSERT();
2184  	if ((scifp->if_drv_flags & IFF_DRV_RUNNING) == 0 ||
2185  	    lp->lp_detaching != 0 ||
2186  	    sc->sc_proto == LAGG_PROTO_NONE) {
2187  		m_freem(m);
2188  		return (NULL);
2189  	}
2190  
2191  	m = lagg_proto_input(sc, lp, m);
2192  	if (m != NULL) {
2193  		ETHER_BPF_MTAP(scifp, m);
2194  
2195  		if ((scifp->if_flags & IFF_MONITOR) != 0) {
2196  			m_freem(m);
2197  			m = NULL;
2198  		}
2199  	}
2200  
2201  #ifdef DEV_NETMAP
2202  	if (m != NULL && scifp->if_capenable & IFCAP_NETMAP) {
2203  		scifp->if_input(scifp, m);
2204  		m = NULL;
2205  	}
2206  #endif	/* DEV_NETMAP */
2207  
2208  	return (m);
2209  }
2210  
2211  static struct mbuf *
2212  lagg_input_infiniband(struct ifnet *ifp, struct mbuf *m)
2213  {
2214  	struct lagg_port *lp = ifp->if_lagg;
2215  	struct lagg_softc *sc = lp->lp_softc;
2216  	struct ifnet *scifp = sc->sc_ifp;
2217  
2218  	NET_EPOCH_ASSERT();
2219  	if ((scifp->if_drv_flags & IFF_DRV_RUNNING) == 0 ||
2220  	    lp->lp_detaching != 0 ||
2221  	    sc->sc_proto == LAGG_PROTO_NONE) {
2222  		m_freem(m);
2223  		return (NULL);
2224  	}
2225  
2226  	m = lagg_proto_input(sc, lp, m);
2227  	if (m != NULL) {
2228  		infiniband_bpf_mtap(scifp, m);
2229  
2230  		if ((scifp->if_flags & IFF_MONITOR) != 0) {
2231  			m_freem(m);
2232  			m = NULL;
2233  		}
2234  	}
2235  
2236  	return (m);
2237  }
2238  
2239  static int
2240  lagg_media_change(struct ifnet *ifp)
2241  {
2242  	struct lagg_softc *sc = (struct lagg_softc *)ifp->if_softc;
2243  
2244  	if (sc->sc_ifflags & IFF_DEBUG)
2245  		printf("%s\n", __func__);
2246  
2247  	/* Ignore */
2248  	return (0);
2249  }
2250  
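/* Report the lagg media as active when at least one port is active. */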
2251  static void
2252  lagg_media_status(struct ifnet *ifp, struct ifmediareq *imr)
2253  {
2254  	struct epoch_tracker et;
2255  	struct lagg_softc *sc = (struct lagg_softc *)ifp->if_softc;
2256  	struct lagg_port *lp;
2257  
2258  	imr->ifm_status = IFM_AVALID;
2259  	imr->ifm_active = IFM_ETHER | IFM_AUTO;
2260  
2261  	NET_EPOCH_ENTER(et);
2262  	CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) {
2263  		if (LAGG_PORTACTIVE(lp))
2264  			imr->ifm_status |= IFM_ACTIVE;
2265  	}
2266  	NET_EPOCH_EXIT(et);
2267  }
2268  
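/*
 * Recompute the lagg link state and if_baudrate from the member ports.
 * LACP maintains both itself, so it is left alone.
 */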
2269  static void
2270  lagg_linkstate(struct lagg_softc *sc)
2271  {
2272  	struct epoch_tracker et;
2273  	struct lagg_port *lp;
2274  	int new_link = LINK_STATE_DOWN;
2275  	uint64_t speed;
2276  
2277  	LAGG_XLOCK_ASSERT(sc);
2278  
2279  	/* LACP handles link state itself */
2280  	if (sc->sc_proto == LAGG_PROTO_LACP)
2281  		return;
2282  
2283  	/* Our link is considered up if at least one of our ports is active */
2284  	NET_EPOCH_ENTER(et);
2285  	CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) {
2286  		if (lp->lp_ifp->if_link_state == LINK_STATE_UP) {
2287  			new_link = LINK_STATE_UP;
2288  			break;
2289  		}
2290  	}
2291  	NET_EPOCH_EXIT(et);
2292  	if_link_state_change(sc->sc_ifp, new_link);
2293  
2294  	/* Update if_baudrate to reflect the max possible speed */
2295  	switch (sc->sc_proto) {
2296  		case LAGG_PROTO_FAILOVER:
2297  			sc->sc_ifp->if_baudrate = sc->sc_primary != NULL ?
2298  			    sc->sc_primary->lp_ifp->if_baudrate : 0;
2299  			break;
2300  		case LAGG_PROTO_ROUNDROBIN:
2301  		case LAGG_PROTO_LOADBALANCE:
2302  		case LAGG_PROTO_BROADCAST:
2303  			speed = 0;
2304  			NET_EPOCH_ENTER(et);
2305  			CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries)
2306  				speed += lp->lp_ifp->if_baudrate;
2307  			NET_EPOCH_EXIT(et);
2308  			sc->sc_ifp->if_baudrate = speed;
2309  			break;
2310  		case LAGG_PROTO_LACP:
2311  			/* LACP updates if_baudrate itself */
2312  			break;
2313  	}
2314  }
2315  
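/* Link-state change notification from a member port. */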
2316  static void
2317  lagg_port_state(struct ifnet *ifp, int state)
2318  {
2319  	struct lagg_port *lp = (struct lagg_port *)ifp->if_lagg;
2320  	struct lagg_softc *sc = NULL;
2321  
2322  	if (lp != NULL)
2323  		sc = lp->lp_softc;
2324  	if (sc == NULL)
2325  		return;
2326  
2327  	LAGG_XLOCK(sc);
2328  	lagg_linkstate(sc);
2329  	lagg_proto_linkstate(sc, lp);
2330  	LAGG_XUNLOCK(sc);
2331  }
2332  
2333  struct lagg_port *
2334  lagg_link_active(struct lagg_softc *sc, struct lagg_port *lp)
2335  {
2336  	struct lagg_port *lp_next, *rval = NULL;
2337  
2338  	/*
2339  	 * Search for a port which reports an active link state.
2340  	 */
2341  
2342  #ifdef INVARIANTS
2343  	/*
2344  	 * This is called either in the network epoch or with
2345  	 * LAGG_XLOCK(sc) held.
2346  	 */
2347  	if (!in_epoch(net_epoch_preempt))
2348  		LAGG_XLOCK_ASSERT(sc);
2349  #endif
2350  
2351  	if (lp == NULL)
2352  		goto search;
2353  	if (LAGG_PORTACTIVE(lp)) {
2354  		rval = lp;
2355  		goto found;
2356  	}
2357  	if ((lp_next = CK_SLIST_NEXT(lp, lp_entries)) != NULL &&
2358  	    LAGG_PORTACTIVE(lp_next)) {
2359  		rval = lp_next;
2360  		goto found;
2361  	}
2362  
2363  search:
2364  	CK_SLIST_FOREACH(lp_next, &sc->sc_ports, lp_entries) {
2365  		if (LAGG_PORTACTIVE(lp_next)) {
2366  			return (lp_next);
2367  		}
2368  	}
2369  found:
2370  	return (rval);
2371  }
2372  
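/*
 * Transmit an mbuf on the selected port, first swapping a lagg send tag
 * on the packet for a reference to the underlying port tag.
 */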
2373  int
2374  lagg_enqueue(struct ifnet *ifp, struct mbuf *m)
2375  {
2376  
2377  #if defined(KERN_TLS) || defined(RATELIMIT)
2378  	if (m->m_pkthdr.csum_flags & CSUM_SND_TAG) {
2379  		struct lagg_snd_tag *lst;
2380  		struct m_snd_tag *mst;
2381  
2382  		mst = m->m_pkthdr.snd_tag;
2383  		lst = mst_to_lst(mst);
2384  		if (lst->tag->ifp != ifp) {
2385  			m_freem(m);
2386  			return (EAGAIN);
2387  		}
2388  		m->m_pkthdr.snd_tag = m_snd_tag_ref(lst->tag);
2389  		m_snd_tag_rele(mst);
2390  	}
2391  #endif
2392  	return (ifp->if_transmit)(ifp, m);
2393  }
2394  
2395  /*
2396   * Simple round robin aggregation
2397   */
2398  static void
2399  lagg_rr_attach(struct lagg_softc *sc)
2400  {
2401  	sc->sc_seq = 0;
2402  	sc->sc_stride = 1;
2403  }
2404  
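/*
 * Distribute outgoing packets over the ports in sequence, advancing to
 * the next port every sc_stride packets.
 */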
2405  static int
2406  lagg_rr_start(struct lagg_softc *sc, struct mbuf *m)
2407  {
2408  	struct lagg_port *lp;
2409  	uint32_t p;
2410  
2411  	p = atomic_fetchadd_32(&sc->sc_seq, 1);
2412  	p /= sc->sc_stride;
2413  	p %= sc->sc_count;
2414  	lp = CK_SLIST_FIRST(&sc->sc_ports);
2415  
2416  	while (p--)
2417  		lp = CK_SLIST_NEXT(lp, lp_entries);
2418  
2419  	/*
2420  	 * Check the port's link state. This will return the next active
2421  	 * port if the link is down or the port is NULL.
2422  	 */
2423  	if ((lp = lagg_link_active(sc, lp)) == NULL) {
2424  		if_inc_counter(sc->sc_ifp, IFCOUNTER_OERRORS, 1);
2425  		m_freem(m);
2426  		return (ENETDOWN);
2427  	}
2428  
2429  	/* Send mbuf */
2430  	return (lagg_enqueue(lp->lp_ifp, m));
2431  }
2432  
2433  /*
2434   * Broadcast mode
2435   */
2436  static int
2437  lagg_bcast_start(struct lagg_softc *sc, struct mbuf *m)
2438  {
2439  	int errors = 0;
2440  	int ret;
2441  	struct lagg_port *lp, *last = NULL;
2442  	struct mbuf *m0;
2443  
2444  	NET_EPOCH_ASSERT();
2445  	CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) {
2446  		if (!LAGG_PORTACTIVE(lp))
2447  			continue;
2448  
2449  		if (last != NULL) {
2450  			m0 = m_copym(m, 0, M_COPYALL, M_NOWAIT);
2451  			if (m0 == NULL) {
2452  				ret = ENOBUFS;
2453  				errors++;
2454  				break;
2455  			}
2456  			lagg_enqueue(last->lp_ifp, m0);
2457  		}
2458  		last = lp;
2459  	}
2460  
2461  	if (last == NULL) {
2462  		if_inc_counter(sc->sc_ifp, IFCOUNTER_OERRORS, 1);
2463  		m_freem(m);
2464  		return (ENOENT);
2465  	}
2466  	if ((last = lagg_link_active(sc, last)) == NULL) {
2467  		errors++;
2468  		if_inc_counter(sc->sc_ifp, IFCOUNTER_OERRORS, errors);
2469  		m_freem(m);
2470  		return (ENETDOWN);
2471  	}
2472  
2473  	ret = lagg_enqueue(last->lp_ifp, m);
2474  	if (errors != 0)
2475  		if_inc_counter(sc->sc_ifp, IFCOUNTER_OERRORS, errors);
2476  
2477  	return (ret);
2478  }
2479  
2480  /*
2481   * Active failover
2482   */
2483  static int
2484  lagg_fail_start(struct lagg_softc *sc, struct mbuf *m)
2485  {
2486  	struct lagg_port *lp;
2487  
2488  	/* Use the master port if active or the next available port */
2489  	if ((lp = lagg_link_active(sc, sc->sc_primary)) == NULL) {
2490  		if_inc_counter(sc->sc_ifp, IFCOUNTER_OERRORS, 1);
2491  		m_freem(m);
2492  		return (ENETDOWN);
2493  	}
2494  
2495  	/* Send mbuf */
2496  	return (lagg_enqueue(lp->lp_ifp, m));
2497  }
2498  
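/*
 * Accept traffic received on the primary port (or on any port if the
 * failover_rx_all sysctl is set).  When the primary link is down,
 * traffic on the currently active port is accepted as well; everything
 * else is dropped.
 */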
2499  static struct mbuf *
2500  lagg_fail_input(struct lagg_softc *sc, struct lagg_port *lp, struct mbuf *m)
2501  {
2502  	struct ifnet *ifp = sc->sc_ifp;
2503  	struct lagg_port *tmp_tp;
2504  
2505  	if (lp == sc->sc_primary || V_lagg_failover_rx_all) {
2506  		m->m_pkthdr.rcvif = ifp;
2507  		return (m);
2508  	}
2509  
2510  	if (!LAGG_PORTACTIVE(sc->sc_primary)) {
2511  		tmp_tp = lagg_link_active(sc, sc->sc_primary);
2512  		/*
2513  		 * If tmp_tp is null, we've received a packet when all
2514  		 * our links are down. Weird, but process it anyway.
2515  		 */
2516  		if (tmp_tp == NULL || tmp_tp == lp) {
2517  			m->m_pkthdr.rcvif = ifp;
2518  			return (m);
2519  		}
2520  	}
2521  
2522  	m_freem(m);
2523  	return (NULL);
2524  }
2525  
2526  /*
2527   * Loadbalancing
2528   */
2529  static void
2530  lagg_lb_attach(struct lagg_softc *sc)
2531  {
2532  	struct lagg_port *lp;
2533  	struct lagg_lb *lb;
2534  
2535  	LAGG_XLOCK_ASSERT(sc);
2536  	lb = malloc(sizeof(struct lagg_lb), M_LAGG, M_WAITOK | M_ZERO);
2537  	lb->lb_key = m_ether_tcpip_hash_init();
2538  	sc->sc_psc = lb;
2539  
2540  	CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries)
2541  		lagg_lb_port_create(lp);
2542  }
2543  
2544  static void
2545  lagg_lb_detach(struct lagg_softc *sc)
2546  {
2547  	struct lagg_lb *lb;
2548  
2549  	lb = (struct lagg_lb *)sc->sc_psc;
2550  	if (lb != NULL)
2551  		free(lb, M_LAGG);
2552  }
2553  
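/*
 * Rebuild the hash-bucket-to-port table, skipping the port that is
 * being removed (if any).
 */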
2554  static int
2555  lagg_lb_porttable(struct lagg_softc *sc, struct lagg_port *lp)
2556  {
2557  	struct lagg_lb *lb = (struct lagg_lb *)sc->sc_psc;
2558  	struct lagg_port *lp_next;
2559  	int i = 0, rv;
2560  
2561  	rv = 0;
2562  	bzero(&lb->lb_ports, sizeof(lb->lb_ports));
2563  	LAGG_XLOCK_ASSERT(sc);
2564  	CK_SLIST_FOREACH(lp_next, &sc->sc_ports, lp_entries) {
2565  		if (lp_next == lp)
2566  			continue;
2567  		if (i >= LAGG_MAX_PORTS) {
2568  			rv = EINVAL;
2569  			break;
2570  		}
2571  		if (sc->sc_ifflags & IFF_DEBUG)
2572  			printf("%s: port %s at index %d\n",
2573  			    sc->sc_ifname, lp_next->lp_ifp->if_xname, i);
2574  		lb->lb_ports[i++] = lp_next;
2575  	}
2576  
2577  	return (rv);
2578  }
2579  
2580  static int
2581  lagg_lb_port_create(struct lagg_port *lp)
2582  {
2583  	struct lagg_softc *sc = lp->lp_softc;
2584  	return (lagg_lb_porttable(sc, NULL));
2585  }
2586  
2587  static void
2588  lagg_lb_port_destroy(struct lagg_port *lp)
2589  {
2590  	struct lagg_softc *sc = lp->lp_softc;
2591  	lagg_lb_porttable(sc, lp);
2592  }
2593  
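/*
 * Select an output port from the flow hash (or from the mbuf flowid
 * when LAGG_OPT_USE_FLOWID is set) and send.
 */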
2594  static int
2595  lagg_lb_start(struct lagg_softc *sc, struct mbuf *m)
2596  {
2597  	struct lagg_lb *lb = (struct lagg_lb *)sc->sc_psc;
2598  	struct lagg_port *lp = NULL;
2599  	uint32_t p = 0;
2600  
2601  	if ((sc->sc_opts & LAGG_OPT_USE_FLOWID) &&
2602  	    M_HASHTYPE_GET(m) != M_HASHTYPE_NONE)
2603  		p = m->m_pkthdr.flowid >> sc->flowid_shift;
2604  	else
2605  		p = m_ether_tcpip_hash(sc->sc_flags, m, lb->lb_key);
2606  	p %= sc->sc_count;
2607  	lp = lb->lb_ports[p];
2608  
2609  	/*
2610  	 * Check the port's link state. This will return the next active
2611  	 * port if the link is down or the port is NULL.
2612  	 */
2613  	if ((lp = lagg_link_active(sc, lp)) == NULL) {
2614  		if_inc_counter(sc->sc_ifp, IFCOUNTER_OERRORS, 1);
2615  		m_freem(m);
2616  		return (ENETDOWN);
2617  	}
2618  
2619  	/* Send mbuf */
2620  	return (lagg_enqueue(lp->lp_ifp, m));
2621  }
2622  
2623  /*
2624   * 802.3ad LACP
2625   */
2626  static void
2627  lagg_lacp_attach(struct lagg_softc *sc)
2628  {
2629  	struct lagg_port *lp;
2630  
2631  	lacp_attach(sc);
2632  	LAGG_XLOCK_ASSERT(sc);
2633  	CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries)
2634  		lacp_port_create(lp);
2635  }
2636  
2637  static void
2638  lagg_lacp_detach(struct lagg_softc *sc)
2639  {
2640  	struct lagg_port *lp;
2641  	void *psc;
2642  
2643  	LAGG_XLOCK_ASSERT(sc);
2644  	CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries)
2645  		lacp_port_destroy(lp);
2646  
2647  	psc = sc->sc_psc;
2648  	sc->sc_psc = NULL;
2649  	lacp_detach(psc);
2650  }
2651  
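/*
 * Destroy and recreate the LACP state of every port, typically after
 * the lagg link-layer address has changed.
 */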
2652  static void
2653  lagg_lacp_lladdr(struct lagg_softc *sc)
2654  {
2655  	struct lagg_port *lp;
2656  
2657  	LAGG_SXLOCK_ASSERT(sc);
2658  
2659  	/* purge all the lacp ports */
2660  	CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries)
2661  		lacp_port_destroy(lp);
2662  
2663  	/* add them back in */
2664  	CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries)
2665  		lacp_port_create(lp);
2666  }
2667  
2668  static int
2669  lagg_lacp_start(struct lagg_softc *sc, struct mbuf *m)
2670  {
2671  	struct lagg_port *lp;
2672  	int err;
2673  
2674  	lp = lacp_select_tx_port(sc, m, &err);
2675  	if (lp == NULL) {
2676  		if_inc_counter(sc->sc_ifp, IFCOUNTER_OERRORS, 1);
2677  		m_freem(m);
2678  		return (err);
2679  	}
2680  
2681  	/* Send mbuf */
2682  	return (lagg_enqueue(lp->lp_ifp, m));
2683  }
2684  
2685  static struct mbuf *
2686  lagg_lacp_input(struct lagg_softc *sc, struct lagg_port *lp, struct mbuf *m)
2687  {
2688  	struct ifnet *ifp = sc->sc_ifp;
2689  	struct ether_header *eh;
2690  	u_short etype;
2691  
2692  	eh = mtod(m, struct ether_header *);
2693  	etype = ntohs(eh->ether_type);
2694  
2695  	/* Tap off LACP control messages */
2696  	if ((m->m_flags & M_VLANTAG) == 0 && etype == ETHERTYPE_SLOW) {
2697  		m = lacp_input(lp, m);
2698  		if (m == NULL)
2699  			return (NULL);
2700  	}
2701  
2702  	/*
2703  	 * If the port is not collecting or not in the active aggregator,
2704  	 * free the mbuf and return.
2705  	 */
2706  	if (!lacp_iscollecting(lp) || !lacp_isactive(lp)) {
2707  		m_freem(m);
2708  		return (NULL);
2709  	}
2710  
2711  	m->m_pkthdr.rcvif = ifp;
2712  	return (m);
2713  }
2714  
2715  /* Default input */
2716  static struct mbuf *
2717  lagg_default_input(struct lagg_softc *sc, struct lagg_port *lp, struct mbuf *m)
2718  {
2719  	struct ifnet *ifp = sc->sc_ifp;
2720  
2721  	/* Just pass in the packet to our lagg device */
2722  	m->m_pkthdr.rcvif = ifp;
2723  
2724  	return (m);
2725  }
2726