xref: /freebsd/sys/net/if_epair.c (revision 964219664dcec4198441910904fb9064569d174d)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2008 The FreeBSD Foundation
5  * Copyright (c) 2009-2010 Bjoern A. Zeeb <bz@FreeBSD.org>
6  * All rights reserved.
7  *
8  * This software was developed by CK Software GmbH under sponsorship
9  * from the FreeBSD Foundation.
10  *
11  * Redistribution and use in source and binary forms, with or without
12  * modification, are permitted provided that the following conditions
13  * are met:
14  * 1. Redistributions of source code must retain the above copyright
15  * notice, this list of conditions and the following disclaimer.
16  * 2. Redistributions in binary form must reproduce the above copyright
17  * notice, this list of conditions and the following disclaimer in the
18  * documentation and/or other materials provided with the distribution.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
21  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
24  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30  * SUCH DAMAGE.
31  */
32 
33 /*
34  * A pair of virtual back-to-back connected ethernet like interfaces
35  * (``two interfaces with a virtual cross-over cable'').
36  *
37  * This is mostly intended to be used to provide connectivity between
38  * different virtual network stack instances.
39  */
40 /*
41  * Things to re-think once we have more experience:
42  * - ifp->if_reassign function once we can test with vimage. Depending on
43  *   how if_vmove() is going to be improved.
44  * - Real random etheraddrs that are checked to be uniquish; we would need
45  *   to re-do them in case we move the interface between network stacks
46  *   in a private if_reassign function.
47  *   In case we bridge to a real interface/network or between independent
48  *   epairs on multiple stacks/machines, we may need this.
49  *   For now let the user handle that case.
50  */
51 
52 #include <sys/cdefs.h>
53 __FBSDID("$FreeBSD$");
54 
55 #include <sys/param.h>
56 #include <sys/hash.h>
57 #include <sys/jail.h>
58 #include <sys/kernel.h>
59 #include <sys/libkern.h>
60 #include <sys/malloc.h>
61 #include <sys/mbuf.h>
62 #include <sys/module.h>
63 #include <sys/proc.h>
64 #include <sys/refcount.h>
65 #include <sys/queue.h>
66 #include <sys/smp.h>
67 #include <sys/socket.h>
68 #include <sys/sockio.h>
69 #include <sys/sysctl.h>
70 #include <sys/types.h>
71 
72 #include <net/bpf.h>
73 #include <net/ethernet.h>
74 #include <net/if.h>
75 #include <net/if_var.h>
76 #include <net/if_clone.h>
77 #include <net/if_media.h>
78 #include <net/if_var.h>
79 #include <net/if_types.h>
80 #include <net/netisr.h>
81 #include <net/vnet.h>
82 
83 SYSCTL_DECL(_net_link);
84 static SYSCTL_NODE(_net_link, OID_AUTO, epair, CTLFLAG_RW, 0, "epair sysctl");
85 
#ifdef EPAIR_DEBUG
/* Run-time switch for the debug output, exported as a sysctl. */
static int epair_debug = 0;
SYSCTL_INT(_net_link_epair, OID_AUTO, epair_debug, CTLFLAG_RW,
    &epair_debug, 0, "if_epair(4) debugging.");
/*
 * Print a debug message prefixed with the calling function and line,
 * but only while epair_debug is non-zero.
 *
 * The body is wrapped in do { } while (0) so that the macro expands to
 * exactly one statement; a bare "if" would capture a following "else"
 * when the macro is used in an unbraced if/else.
 */
#define	DPRINTF(fmt, arg...)						\
	do {								\
		if (epair_debug)					\
			printf("[%s:%d] " fmt, __func__, __LINE__,	\
			    ##arg);					\
	} while (0)
#else
/* Debugging compiled out: the arguments are not evaluated. */
#define	DPRINTF(fmt, arg...)	do { } while (0)
#endif
96 
97 static void epair_nh_sintr(struct mbuf *);
98 static struct mbuf *epair_nh_m2cpuid(struct mbuf *, uintptr_t, u_int *);
99 static void epair_nh_drainedcpu(u_int);
100 
101 static void epair_start_locked(struct ifnet *);
102 static int epair_media_change(struct ifnet *);
103 static void epair_media_status(struct ifnet *, struct ifmediareq *);
104 
105 static int epair_clone_match(struct if_clone *, const char *);
106 static int epair_clone_create(struct if_clone *, char *, size_t, caddr_t);
107 static int epair_clone_destroy(struct if_clone *, struct ifnet *);
108 
109 static const char epairname[] = "epair";
110 
111 /* Netisr related definitions and sysctl. */
/*
 * Netisr handler description for the epair "hw" queue.  It is registered
 * in epair_modevent() and unregistered in epair_uninit().
 */
static struct netisr_handler epair_nh = {
	.nh_name	= epairname,
	.nh_proto	= NETISR_EPAIR,
	.nh_policy	= NETISR_POLICY_CPU,
	.nh_handler	= epair_nh_sintr,	/* Hands the mbuf to if_input. */
	.nh_m2cpuid	= epair_nh_m2cpuid,	/* CPU comes from the rcvif softc. */
	.nh_drainedcpu	= epair_nh_drainedcpu,	/* Restarts the queued ifps. */
};
120 
121 static int
122 sysctl_epair_netisr_maxqlen(SYSCTL_HANDLER_ARGS)
123 {
124 	int error, qlimit;
125 
126 	netisr_getqlimit(&epair_nh, &qlimit);
127 	error = sysctl_handle_int(oidp, &qlimit, 0, req);
128 	if (error || !req->newptr)
129 		return (error);
130 	if (qlimit < 1)
131 		return (EINVAL);
132 	return (netisr_setqlimit(&epair_nh, qlimit));
133 }
134 SYSCTL_PROC(_net_link_epair, OID_AUTO, netisr_maxqlen, CTLTYPE_INT|CTLFLAG_RW,
135     0, 0, sysctl_epair_netisr_maxqlen, "I",
136     "Maximum if_epair(4) netisr \"hw\" queue length");
137 
/*
 * Per-interface state.  One softc is allocated for each half of a pair in
 * epair_clone_create() and freed in epair_clone_destroy().
 */
struct epair_softc {
	struct ifnet	*ifp;		/* This ifp. */
	struct ifnet	*oifp;		/* other ifp of pair. */
	struct ifmedia	media;		/* Media config (fake). */
	u_int		refcount;	/* # of mbufs in flight, plus one
					 * for the creation and one while
					 * the ifp sits on a drain list.
					 * Only maintained when INVARIANTS
					 * is defined. */
	u_int		cpuid;		/* CPU ID assigned upon creation. */
	void		(*if_qflush)(struct ifnet *);
					/* Original if_qflush routine;
					 * called by epair_qflush(). */
};
147 
148 /*
149  * Per-CPU list of ifps with data in the ifq that needs to be flushed
150  * to the netisr ``hw'' queue before we allow any further direct queuing
151  * to the ``hw'' queue.
152  */
struct epair_ifp_drain {
	STAILQ_ENTRY(epair_ifp_drain)	ifp_next;	/* Drain list linkage. */
	struct ifnet			*ifp;		/* Interface with queued
							 * packets. */
};
/* Head type for the per-CPU drain list. */
STAILQ_HEAD(eid_list, epair_ifp_drain);
158 
/*
 * Wrappers around the per-CPU mutex; the argument is a pointer to a
 * struct epair_dpcpu.
 */
#define	EPAIR_LOCK_INIT(dpcpu)		mtx_init(&(dpcpu)->if_epair_mtx, \
					    "if_epair", NULL, MTX_DEF)
#define	EPAIR_LOCK_DESTROY(dpcpu)	mtx_destroy(&(dpcpu)->if_epair_mtx)
#define	EPAIR_LOCK_ASSERT(dpcpu)	mtx_assert(&(dpcpu)->if_epair_mtx, \
					    MA_OWNED)
#define	EPAIR_LOCK(dpcpu)		mtx_lock(&(dpcpu)->if_epair_mtx)
#define	EPAIR_UNLOCK(dpcpu)		mtx_unlock(&(dpcpu)->if_epair_mtx)

/*
 * Reference counting on the softc is a debugging aid only: without
 * INVARIANTS all of these macros expand to nothing, so their arguments
 * are not evaluated.
 */
#ifdef INVARIANTS
#define	EPAIR_REFCOUNT_INIT(r, v)	refcount_init((r), (v))
#define	EPAIR_REFCOUNT_AQUIRE(r)	refcount_acquire((r))
#define	EPAIR_REFCOUNT_RELEASE(r)	refcount_release((r))
#define	EPAIR_REFCOUNT_ASSERT(a, p)	KASSERT(a, p)
#else
#define	EPAIR_REFCOUNT_INIT(r, v)
#define	EPAIR_REFCOUNT_AQUIRE(r)
#define	EPAIR_REFCOUNT_RELEASE(r)
#define	EPAIR_REFCOUNT_ASSERT(a, p)
#endif
178 
/* Malloc type used for the softcs and the drain list entries. */
static MALLOC_DEFINE(M_EPAIR, epairname,
    "Pair of virtual cross-over connected Ethernet-like interfaces");

/* Per-vnet interface cloner, set up in vnet_epair_init(). */
static VNET_DEFINE(struct if_clone *, epair_cloner);
#define	V_epair_cloner	VNET(epair_cloner)
184 
185 /*
186  * DPCPU area and functions.
187  */
/* Per-CPU driver state; initialized by epair_dpcpu_init(). */
struct epair_dpcpu {
	struct mtx	if_epair_mtx;		/* Per-CPU locking. */
	int		epair_drv_flags;	/* Per-CPU ``hw'' drv flags;
						 * IFF_DRV_OACTIVE is set
						 * after netisr_queue()
						 * failed. */
	struct eid_list	epair_ifp_drain_list;	/* Per-CPU list of ifps with
						 * data in the ifq. */
};
DPCPU_DEFINE(struct epair_dpcpu, epair_dpcpu);
195 
196 static void
197 epair_dpcpu_init(void)
198 {
199 	struct epair_dpcpu *epair_dpcpu;
200 	struct eid_list *s;
201 	u_int cpuid;
202 
203 	CPU_FOREACH(cpuid) {
204 		epair_dpcpu = DPCPU_ID_PTR(cpuid, epair_dpcpu);
205 
206 		/* Initialize per-cpu lock. */
207 		EPAIR_LOCK_INIT(epair_dpcpu);
208 
209 		/* Driver flags are per-cpu as are our netisr "hw" queues. */
210 		epair_dpcpu->epair_drv_flags = 0;
211 
212 		/*
213 		 * Initialize per-cpu drain list.
214 		 * Manually do what STAILQ_HEAD_INITIALIZER would do.
215 		 */
216 		s = &epair_dpcpu->epair_ifp_drain_list;
217 		s->stqh_first = NULL;
218 		s->stqh_last = &s->stqh_first;
219 	}
220 }
221 
222 static void
223 epair_dpcpu_detach(void)
224 {
225 	struct epair_dpcpu *epair_dpcpu;
226 	u_int cpuid;
227 
228 	CPU_FOREACH(cpuid) {
229 		epair_dpcpu = DPCPU_ID_PTR(cpuid, epair_dpcpu);
230 
231 		/* Destroy per-cpu lock. */
232 		EPAIR_LOCK_DESTROY(epair_dpcpu);
233 	}
234 }
235 
236 /*
237  * Helper functions.
238  */
239 static u_int
240 cpuid_from_ifp(struct ifnet *ifp)
241 {
242 	struct epair_softc *sc;
243 
244 	if (ifp == NULL)
245 		return (0);
246 	sc = ifp->if_softc;
247 
248 	return (sc->cpuid);
249 }
250 
251 /*
252  * Netisr handler functions.
253  */
254 static void
255 epair_nh_sintr(struct mbuf *m)
256 {
257 	struct ifnet *ifp;
258 	struct epair_softc *sc __unused;
259 
260 	ifp = m->m_pkthdr.rcvif;
261 	(*ifp->if_input)(ifp, m);
262 	sc = ifp->if_softc;
263 	EPAIR_REFCOUNT_RELEASE(&sc->refcount);
264 	EPAIR_REFCOUNT_ASSERT((int)sc->refcount >= 1,
265 	    ("%s: ifp=%p sc->refcount not >= 1: %d",
266 	    __func__, ifp, sc->refcount));
267 	DPRINTF("ifp=%p refcount=%u\n", ifp, sc->refcount);
268 }
269 
270 static struct mbuf *
271 epair_nh_m2cpuid(struct mbuf *m, uintptr_t source, u_int *cpuid)
272 {
273 
274 	*cpuid = cpuid_from_ifp(m->m_pkthdr.rcvif);
275 
276 	return (m);
277 }
278 
279 static void
280 epair_nh_drainedcpu(u_int cpuid)
281 {
282 	struct epair_dpcpu *epair_dpcpu;
283 	struct epair_ifp_drain *elm, *tvar;
284 	struct ifnet *ifp;
285 
286 	epair_dpcpu = DPCPU_ID_PTR(cpuid, epair_dpcpu);
287 	EPAIR_LOCK(epair_dpcpu);
288 	/*
289 	 * Assume our "hw" queue and possibly ifq will be emptied
290 	 * again. In case we will overflow the "hw" queue while
291 	 * draining, epair_start_locked will set IFF_DRV_OACTIVE
292 	 * again and we will stop and return.
293 	 */
294 	STAILQ_FOREACH_SAFE(elm, &epair_dpcpu->epair_ifp_drain_list,
295 	    ifp_next, tvar) {
296 		ifp = elm->ifp;
297 		epair_dpcpu->epair_drv_flags &= ~IFF_DRV_OACTIVE;
298 		ifp->if_drv_flags &= ~IFF_DRV_OACTIVE;
299 		epair_start_locked(ifp);
300 
301 		IFQ_LOCK(&ifp->if_snd);
302 		if (IFQ_IS_EMPTY(&ifp->if_snd)) {
303 			struct epair_softc *sc __unused;
304 
305 			STAILQ_REMOVE(&epair_dpcpu->epair_ifp_drain_list,
306 			    elm, epair_ifp_drain, ifp_next);
307 			/* The cached ifp goes off the list. */
308 			sc = ifp->if_softc;
309 			EPAIR_REFCOUNT_RELEASE(&sc->refcount);
310 			EPAIR_REFCOUNT_ASSERT((int)sc->refcount >= 1,
311 			    ("%s: ifp=%p sc->refcount not >= 1: %d",
312 			    __func__, ifp, sc->refcount));
313 			free(elm, M_EPAIR);
314 		}
315 		IFQ_UNLOCK(&ifp->if_snd);
316 
317 		if ((ifp->if_drv_flags & IFF_DRV_OACTIVE) != 0) {
318 			/* Our "hw"q overflew again. */
319 			epair_dpcpu->epair_drv_flags |= IFF_DRV_OACTIVE;
320 			DPRINTF("hw queue length overflow at %u\n",
321 			    epair_nh.nh_qlimit);
322 			break;
323 		}
324 	}
325 	EPAIR_UNLOCK(epair_dpcpu);
326 }
327 
328 /*
329  * Network interface (`if') related functions.
330  */
331 static void
332 epair_remove_ifp_from_draining(struct ifnet *ifp)
333 {
334 	struct epair_dpcpu *epair_dpcpu;
335 	struct epair_ifp_drain *elm, *tvar;
336 	u_int cpuid;
337 
338 	CPU_FOREACH(cpuid) {
339 		epair_dpcpu = DPCPU_ID_PTR(cpuid, epair_dpcpu);
340 		EPAIR_LOCK(epair_dpcpu);
341 		STAILQ_FOREACH_SAFE(elm, &epair_dpcpu->epair_ifp_drain_list,
342 		    ifp_next, tvar) {
343 			if (ifp == elm->ifp) {
344 				struct epair_softc *sc __unused;
345 
346 				STAILQ_REMOVE(
347 				    &epair_dpcpu->epair_ifp_drain_list, elm,
348 				    epair_ifp_drain, ifp_next);
349 				/* The cached ifp goes off the list. */
350 				sc = ifp->if_softc;
351 				EPAIR_REFCOUNT_RELEASE(&sc->refcount);
352 				EPAIR_REFCOUNT_ASSERT((int)sc->refcount >= 1,
353 				    ("%s: ifp=%p sc->refcount not >= 1: %d",
354 				    __func__, ifp, sc->refcount));
355 				free(elm, M_EPAIR);
356 			}
357 		}
358 		EPAIR_UNLOCK(epair_dpcpu);
359 	}
360 }
361 
362 static int
363 epair_add_ifp_for_draining(struct ifnet *ifp)
364 {
365 	struct epair_dpcpu *epair_dpcpu;
366 	struct epair_softc *sc;
367 	struct epair_ifp_drain *elm = NULL;
368 
369 	sc = ifp->if_softc;
370 	epair_dpcpu = DPCPU_ID_PTR(sc->cpuid, epair_dpcpu);
371 	EPAIR_LOCK_ASSERT(epair_dpcpu);
372 	STAILQ_FOREACH(elm, &epair_dpcpu->epair_ifp_drain_list, ifp_next)
373 		if (elm->ifp == ifp)
374 			break;
375 	/* If the ifp is there already, return success. */
376 	if (elm != NULL)
377 		return (0);
378 
379 	elm = malloc(sizeof(struct epair_ifp_drain), M_EPAIR, M_NOWAIT|M_ZERO);
380 	if (elm == NULL)
381 		return (ENOMEM);
382 
383 	elm->ifp = ifp;
384 	/* Add a reference for the ifp pointer on the list. */
385 	EPAIR_REFCOUNT_AQUIRE(&sc->refcount);
386 	STAILQ_INSERT_TAIL(&epair_dpcpu->epair_ifp_drain_list, elm, ifp_next);
387 
388 	return (0);
389 }
390 
391 static void
392 epair_start_locked(struct ifnet *ifp)
393 {
394 	struct epair_dpcpu *epair_dpcpu;
395 	struct mbuf *m;
396 	struct epair_softc *sc;
397 	struct ifnet *oifp;
398 	int error;
399 
400 	DPRINTF("ifp=%p\n", ifp);
401 	sc = ifp->if_softc;
402 	epair_dpcpu = DPCPU_ID_PTR(sc->cpuid, epair_dpcpu);
403 	EPAIR_LOCK_ASSERT(epair_dpcpu);
404 
405 	if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0)
406 		return;
407 	if ((ifp->if_flags & IFF_UP) == 0)
408 		return;
409 
410 	/*
411 	 * We get packets here from ether_output via if_handoff()
412 	 * and need to put them into the input queue of the oifp
413 	 * and call oifp->if_input() via netisr/epair_sintr().
414 	 */
415 	oifp = sc->oifp;
416 	sc = oifp->if_softc;
417 	for (;;) {
418 		IFQ_DEQUEUE(&ifp->if_snd, m);
419 		if (m == NULL)
420 			break;
421 		BPF_MTAP(ifp, m);
422 
423 		/*
424 		 * In case the outgoing interface is not usable,
425 		 * drop the packet.
426 		 */
427 		if ((oifp->if_drv_flags & IFF_DRV_RUNNING) == 0 ||
428 		    (oifp->if_flags & IFF_UP) ==0) {
429 			if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
430 			m_freem(m);
431 			continue;
432 		}
433 		DPRINTF("packet %s -> %s\n", ifp->if_xname, oifp->if_xname);
434 
435 		/*
436 		 * Add a reference so the interface cannot go while the
437 		 * packet is in transit as we rely on rcvif to stay valid.
438 		 */
439 		EPAIR_REFCOUNT_AQUIRE(&sc->refcount);
440 		m->m_pkthdr.rcvif = oifp;
441 		CURVNET_SET_QUIET(oifp->if_vnet);
442 		error = netisr_queue(NETISR_EPAIR, m);
443 		CURVNET_RESTORE();
444 		if (!error) {
445 			if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1);
446 			/* Someone else received the packet. */
447 			if_inc_counter(oifp, IFCOUNTER_IPACKETS, 1);
448 		} else {
449 			/* The packet was freed already. */
450 			epair_dpcpu->epair_drv_flags |= IFF_DRV_OACTIVE;
451 			ifp->if_drv_flags |= IFF_DRV_OACTIVE;
452 			(void) epair_add_ifp_for_draining(ifp);
453 			if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
454 			EPAIR_REFCOUNT_RELEASE(&sc->refcount);
455 			EPAIR_REFCOUNT_ASSERT((int)sc->refcount >= 1,
456 			    ("%s: ifp=%p sc->refcount not >= 1: %d",
457 			    __func__, oifp, sc->refcount));
458 		}
459 	}
460 }
461 
462 static void
463 epair_start(struct ifnet *ifp)
464 {
465 	struct epair_dpcpu *epair_dpcpu;
466 
467 	epair_dpcpu = DPCPU_ID_PTR(cpuid_from_ifp(ifp), epair_dpcpu);
468 	EPAIR_LOCK(epair_dpcpu);
469 	epair_start_locked(ifp);
470 	EPAIR_UNLOCK(epair_dpcpu);
471 }
472 
473 static int
474 epair_transmit_locked(struct ifnet *ifp, struct mbuf *m)
475 {
476 	struct epair_dpcpu *epair_dpcpu;
477 	struct epair_softc *sc;
478 	struct ifnet *oifp;
479 	int error, len;
480 	short mflags;
481 
482 	DPRINTF("ifp=%p m=%p\n", ifp, m);
483 	sc = ifp->if_softc;
484 	epair_dpcpu = DPCPU_ID_PTR(sc->cpuid, epair_dpcpu);
485 	EPAIR_LOCK_ASSERT(epair_dpcpu);
486 
487 	if (m == NULL)
488 		return (0);
489 
490 	/*
491 	 * We are not going to use the interface en/dequeue mechanism
492 	 * on the TX side. We are called from ether_output_frame()
493 	 * and will put the packet into the incoming queue of the
494 	 * other interface of our pair via the netsir.
495 	 */
496 	if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) {
497 		m_freem(m);
498 		return (ENXIO);
499 	}
500 	if ((ifp->if_flags & IFF_UP) == 0) {
501 		m_freem(m);
502 		return (ENETDOWN);
503 	}
504 
505 	BPF_MTAP(ifp, m);
506 
507 	/*
508 	 * In case the outgoing interface is not usable,
509 	 * drop the packet.
510 	 */
511 	oifp = sc->oifp;
512 	if ((oifp->if_drv_flags & IFF_DRV_RUNNING) == 0 ||
513 	    (oifp->if_flags & IFF_UP) ==0) {
514 		if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
515 		m_freem(m);
516 		return (0);
517 	}
518 	len = m->m_pkthdr.len;
519 	mflags = m->m_flags;
520 	DPRINTF("packet %s -> %s\n", ifp->if_xname, oifp->if_xname);
521 
522 #ifdef ALTQ
523 	/* Support ALTQ via the classic if_start() path. */
524 	IF_LOCK(&ifp->if_snd);
525 	if (ALTQ_IS_ENABLED(&ifp->if_snd)) {
526 		ALTQ_ENQUEUE(&ifp->if_snd, m, NULL, error);
527 		if (error)
528 			if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1);
529 		IF_UNLOCK(&ifp->if_snd);
530 		if (!error) {
531 			if_inc_counter(ifp, IFCOUNTER_OBYTES, len);
532 			if (mflags & (M_BCAST|M_MCAST))
533 				if_inc_counter(ifp, IFCOUNTER_OMCASTS, 1);
534 
535 			if ((ifp->if_drv_flags & IFF_DRV_OACTIVE) == 0)
536 				epair_start_locked(ifp);
537 			else
538 				(void)epair_add_ifp_for_draining(ifp);
539 		}
540 		return (error);
541 	}
542 	IF_UNLOCK(&ifp->if_snd);
543 #endif
544 
545 	if ((epair_dpcpu->epair_drv_flags & IFF_DRV_OACTIVE) != 0) {
546 		/*
547 		 * Our hardware queue is full, try to fall back
548 		 * queuing to the ifq but do not call ifp->if_start.
549 		 * Either we are lucky or the packet is gone.
550 		 */
551 		IFQ_ENQUEUE(&ifp->if_snd, m, error);
552 		if (!error)
553 			(void)epair_add_ifp_for_draining(ifp);
554 		return (error);
555 	}
556 	sc = oifp->if_softc;
557 	/*
558 	 * Add a reference so the interface cannot go while the
559 	 * packet is in transit as we rely on rcvif to stay valid.
560 	 */
561 	EPAIR_REFCOUNT_AQUIRE(&sc->refcount);
562 	m->m_pkthdr.rcvif = oifp;
563 	CURVNET_SET_QUIET(oifp->if_vnet);
564 	error = netisr_queue(NETISR_EPAIR, m);
565 	CURVNET_RESTORE();
566 	if (!error) {
567 		if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1);
568 		/*
569 		 * IFQ_HANDOFF_ADJ/ip_handoff() update statistics,
570 		 * but as we bypass all this we have to duplicate
571 		 * the logic another time.
572 		 */
573 		if_inc_counter(ifp, IFCOUNTER_OBYTES, len);
574 		if (mflags & (M_BCAST|M_MCAST))
575 			if_inc_counter(ifp, IFCOUNTER_OMCASTS, 1);
576 		/* Someone else received the packet. */
577 		if_inc_counter(oifp, IFCOUNTER_IPACKETS, 1);
578 	} else {
579 		/* The packet was freed already. */
580 		epair_dpcpu->epair_drv_flags |= IFF_DRV_OACTIVE;
581 		ifp->if_drv_flags |= IFF_DRV_OACTIVE;
582 		if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
583 		EPAIR_REFCOUNT_RELEASE(&sc->refcount);
584 		EPAIR_REFCOUNT_ASSERT((int)sc->refcount >= 1,
585 		    ("%s: ifp=%p sc->refcount not >= 1: %d",
586 		    __func__, oifp, sc->refcount));
587 	}
588 
589 	return (error);
590 }
591 
592 static int
593 epair_transmit(struct ifnet *ifp, struct mbuf *m)
594 {
595 	struct epair_dpcpu *epair_dpcpu;
596 	int error;
597 
598 	epair_dpcpu = DPCPU_ID_PTR(cpuid_from_ifp(ifp), epair_dpcpu);
599 	EPAIR_LOCK(epair_dpcpu);
600 	error = epair_transmit_locked(ifp, m);
601 	EPAIR_UNLOCK(epair_dpcpu);
602 	return (error);
603 }
604 
605 static void
606 epair_qflush(struct ifnet *ifp)
607 {
608 	struct epair_softc *sc;
609 
610 	sc = ifp->if_softc;
611 	KASSERT(sc != NULL, ("%s: ifp=%p, epair_softc gone? sc=%p\n",
612 	    __func__, ifp, sc));
613 	/*
614 	 * Remove this ifp from all backpointer lists. The interface will not
615 	 * usable for flushing anyway nor should it have anything to flush
616 	 * after if_qflush().
617 	 */
618 	epair_remove_ifp_from_draining(ifp);
619 
620 	if (sc->if_qflush)
621 		sc->if_qflush(ifp);
622 }
623 
624 static int
625 epair_media_change(struct ifnet *ifp __unused)
626 {
627 
628 	/* Do nothing. */
629 	return (0);
630 }
631 
632 static void
633 epair_media_status(struct ifnet *ifp __unused, struct ifmediareq *imr)
634 {
635 
636 	imr->ifm_status = IFM_AVALID | IFM_ACTIVE;
637 	imr->ifm_active = IFM_ETHER | IFM_10G_T | IFM_FDX;
638 }
639 
640 static int
641 epair_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
642 {
643 	struct epair_softc *sc;
644 	struct ifreq *ifr;
645 	int error;
646 
647 	ifr = (struct ifreq *)data;
648 	switch (cmd) {
649 	case SIOCSIFFLAGS:
650 	case SIOCADDMULTI:
651 	case SIOCDELMULTI:
652 		error = 0;
653 		break;
654 
655 	case SIOCSIFMEDIA:
656 	case SIOCGIFMEDIA:
657 		sc = ifp->if_softc;
658 		error = ifmedia_ioctl(ifp, ifr, &sc->media, cmd);
659 		break;
660 
661 	case SIOCSIFMTU:
662 		/* We basically allow all kinds of MTUs. */
663 		ifp->if_mtu = ifr->ifr_mtu;
664 		error = 0;
665 		break;
666 
667 	default:
668 		/* Let the common ethernet handler process this. */
669 		error = ether_ioctl(ifp, cmd, data);
670 		break;
671 	}
672 
673 	return (error);
674 }
675 
676 static void
677 epair_init(void *dummy __unused)
678 {
679 }
680 
681 
682 /*
683  * Interface cloning functions.
684  * We use our private ones so that we can create/destroy our secondary
685  * device along with the primary one.
686  */
687 static int
688 epair_clone_match(struct if_clone *ifc, const char *name)
689 {
690 	const char *cp;
691 
692 	DPRINTF("name='%s'\n", name);
693 
694 	/*
695 	 * Our base name is epair.
696 	 * Our interfaces will be named epair<n>[ab].
697 	 * So accept anything of the following list:
698 	 * - epair
699 	 * - epair<n>
700 	 * but not the epair<n>[ab] versions.
701 	 */
702 	if (strncmp(epairname, name, sizeof(epairname)-1) != 0)
703 		return (0);
704 
705 	for (cp = name + sizeof(epairname) - 1; *cp != '\0'; cp++) {
706 		if (*cp < '0' || *cp > '9')
707 			return (0);
708 	}
709 
710 	return (1);
711 }
712 
713 static int
714 epair_clone_create(struct if_clone *ifc, char *name, size_t len, caddr_t params)
715 {
716 	struct epair_softc *sca, *scb;
717 	struct ifnet *ifp;
718 	char *dp;
719 	int error, unit, wildcard;
720 	uint64_t hostid;
721 	uint32_t key[3];
722 	uint32_t hash;
723 	uint8_t eaddr[ETHER_ADDR_LEN];	/* 00:00:00:00:00:00 */
724 
725 	/*
726 	 * We are abusing params to create our second interface.
727 	 * Actually we already created it and called if_clone_create()
728 	 * for it to do the official insertion procedure the moment we knew
729 	 * it cannot fail anymore. So just do attach it here.
730 	 */
731 	if (params) {
732 		scb = (struct epair_softc *)params;
733 		ifp = scb->ifp;
734 		/* Copy epairNa etheraddr and change the last byte. */
735 		memcpy(eaddr, scb->oifp->if_hw_addr, ETHER_ADDR_LEN);
736 		eaddr[5] = 0x0b;
737 		ether_ifattach(ifp, eaddr);
738 		/* Correctly set the name for the cloner list. */
739 		strlcpy(name, ifp->if_xname, len);
740 		return (0);
741 	}
742 
743 	/* Try to see if a special unit was requested. */
744 	error = ifc_name2unit(name, &unit);
745 	if (error != 0)
746 		return (error);
747 	wildcard = (unit < 0);
748 
749 	error = ifc_alloc_unit(ifc, &unit);
750 	if (error != 0)
751 		return (error);
752 
753 	/*
754 	 * If no unit had been given, we need to adjust the ifName.
755 	 * Also make sure there is space for our extra [ab] suffix.
756 	 */
757 	for (dp = name; *dp != '\0'; dp++);
758 	if (wildcard) {
759 		error = snprintf(dp, len - (dp - name), "%d", unit);
760 		if (error > len - (dp - name) - 1) {
761 			/* ifName too long. */
762 			ifc_free_unit(ifc, unit);
763 			return (ENOSPC);
764 		}
765 		dp += error;
766 	}
767 	if (len - (dp - name) - 1 < 1) {
768 		/* No space left for our [ab] suffix. */
769 		ifc_free_unit(ifc, unit);
770 		return (ENOSPC);
771 	}
772 	*dp = 'b';
773 	/* Must not change dp so we can replace 'a' by 'b' later. */
774 	*(dp+1) = '\0';
775 
776 	/* Check if 'a' and 'b' interfaces already exist. */
777 	if (ifunit(name) != NULL)
778 		return (EEXIST);
779 	*dp = 'a';
780 	if (ifunit(name) != NULL)
781 		return (EEXIST);
782 
783 	/* Allocate memory for both [ab] interfaces */
784 	sca = malloc(sizeof(struct epair_softc), M_EPAIR, M_WAITOK | M_ZERO);
785 	EPAIR_REFCOUNT_INIT(&sca->refcount, 1);
786 	sca->ifp = if_alloc(IFT_ETHER);
787 	if (sca->ifp == NULL) {
788 		free(sca, M_EPAIR);
789 		ifc_free_unit(ifc, unit);
790 		return (ENOSPC);
791 	}
792 
793 	scb = malloc(sizeof(struct epair_softc), M_EPAIR, M_WAITOK | M_ZERO);
794 	EPAIR_REFCOUNT_INIT(&scb->refcount, 1);
795 	scb->ifp = if_alloc(IFT_ETHER);
796 	if (scb->ifp == NULL) {
797 		free(scb, M_EPAIR);
798 		if_free(sca->ifp);
799 		free(sca, M_EPAIR);
800 		ifc_free_unit(ifc, unit);
801 		return (ENOSPC);
802 	}
803 
804 	/*
805 	 * Cross-reference the interfaces so we will be able to free both.
806 	 */
807 	sca->oifp = scb->ifp;
808 	scb->oifp = sca->ifp;
809 
810 	/*
811 	 * Calculate the cpuid for netisr queueing based on the
812 	 * ifIndex of the interfaces. As long as we cannot configure
813 	 * this or use cpuset information easily we cannot guarantee
814 	 * cache locality but we can at least allow parallelism.
815 	 */
816 	sca->cpuid =
817 	    netisr_get_cpuid(sca->ifp->if_index);
818 	scb->cpuid =
819 	    netisr_get_cpuid(scb->ifp->if_index);
820 
821 	/* Initialise pseudo media types. */
822 	ifmedia_init(&sca->media, 0, epair_media_change, epair_media_status);
823 	ifmedia_add(&sca->media, IFM_ETHER | IFM_10G_T, 0, NULL);
824 	ifmedia_set(&sca->media, IFM_ETHER | IFM_10G_T);
825 	ifmedia_init(&scb->media, 0, epair_media_change, epair_media_status);
826 	ifmedia_add(&scb->media, IFM_ETHER | IFM_10G_T, 0, NULL);
827 	ifmedia_set(&scb->media, IFM_ETHER | IFM_10G_T);
828 
829 	/* Finish initialization of interface <n>a. */
830 	ifp = sca->ifp;
831 	ifp->if_softc = sca;
832 	strlcpy(ifp->if_xname, name, IFNAMSIZ);
833 	ifp->if_dname = epairname;
834 	ifp->if_dunit = unit;
835 	ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
836 	ifp->if_capabilities = IFCAP_VLAN_MTU;
837 	ifp->if_capenable = IFCAP_VLAN_MTU;
838 	ifp->if_start = epair_start;
839 	ifp->if_ioctl = epair_ioctl;
840 	ifp->if_init  = epair_init;
841 	if_setsendqlen(ifp, ifqmaxlen);
842 	if_setsendqready(ifp);
843 
844 	/*
845 	 * Calculate the etheraddr hashing the hostid and the
846 	 * interface index. The result would be hopefully unique
847 	 */
848 	getcredhostid(curthread->td_ucred, (unsigned long *)&hostid);
849 	if (hostid == 0)
850 		arc4rand(&hostid, sizeof(hostid), 0);
851 	key[0] = (uint32_t)ifp->if_index;
852 	key[1] = (uint32_t)(hostid & 0xffffffff);
853 	key[2] = (uint32_t)((hostid >> 32) & 0xfffffffff);
854 	hash = jenkins_hash32(key, 3, 0);
855 
856 	eaddr[0] = 0x02;
857 	memcpy(&eaddr[1], &hash, 4);
858 	eaddr[5] = 0x0a;
859 	ether_ifattach(ifp, eaddr);
860 	sca->if_qflush = ifp->if_qflush;
861 	ifp->if_qflush = epair_qflush;
862 	ifp->if_transmit = epair_transmit;
863 	ifp->if_baudrate = IF_Gbps(10);	/* arbitrary maximum */
864 
865 	/* Swap the name and finish initialization of interface <n>b. */
866 	*dp = 'b';
867 
868 	ifp = scb->ifp;
869 	ifp->if_softc = scb;
870 	strlcpy(ifp->if_xname, name, IFNAMSIZ);
871 	ifp->if_dname = epairname;
872 	ifp->if_dunit = unit;
873 	ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
874 	ifp->if_capabilities = IFCAP_VLAN_MTU;
875 	ifp->if_capenable = IFCAP_VLAN_MTU;
876 	ifp->if_start = epair_start;
877 	ifp->if_ioctl = epair_ioctl;
878 	ifp->if_init  = epair_init;
879 	if_setsendqlen(ifp, ifqmaxlen);
880 	if_setsendqready(ifp);
881 	/* We need to play some tricks here for the second interface. */
882 	strlcpy(name, epairname, len);
883 	error = if_clone_create(name, len, (caddr_t)scb);
884 	if (error)
885 		panic("%s: if_clone_create() for our 2nd iface failed: %d",
886 		    __func__, error);
887 	scb->if_qflush = ifp->if_qflush;
888 	ifp->if_qflush = epair_qflush;
889 	ifp->if_transmit = epair_transmit;
890 	ifp->if_baudrate = IF_Gbps(10);	/* arbitrary maximum */
891 
892 	/*
893 	 * Restore name to <n>a as the ifp for this will go into the
894 	 * cloner list for the initial call.
895 	 */
896 	strlcpy(name, sca->ifp->if_xname, len);
897 	DPRINTF("name='%s/%db' created sca=%p scb=%p\n", name, unit, sca, scb);
898 
899 	/* Tell the world, that we are ready to rock. */
900 	sca->ifp->if_drv_flags |= IFF_DRV_RUNNING;
901 	scb->ifp->if_drv_flags |= IFF_DRV_RUNNING;
902 	if_link_state_change(sca->ifp, LINK_STATE_UP);
903 	if_link_state_change(scb->ifp, LINK_STATE_UP);
904 
905 	return (0);
906 }
907 
908 static int
909 epair_clone_destroy(struct if_clone *ifc, struct ifnet *ifp)
910 {
911 	struct ifnet *oifp;
912 	struct epair_softc *sca, *scb;
913 	int unit, error;
914 
915 	DPRINTF("ifp=%p\n", ifp);
916 
917 	/*
918 	 * In case we called into if_clone_destroyif() ourselves
919 	 * again to remove the second interface, the softc will be
920 	 * NULL. In that case so not do anything but return success.
921 	 */
922 	if (ifp->if_softc == NULL)
923 		return (0);
924 
925 	unit = ifp->if_dunit;
926 	sca = ifp->if_softc;
927 	oifp = sca->oifp;
928 	scb = oifp->if_softc;
929 
930 	DPRINTF("ifp=%p oifp=%p\n", ifp, oifp);
931 	if_link_state_change(ifp, LINK_STATE_DOWN);
932 	if_link_state_change(oifp, LINK_STATE_DOWN);
933 	ifp->if_drv_flags &= ~IFF_DRV_RUNNING;
934 	oifp->if_drv_flags &= ~IFF_DRV_RUNNING;
935 
936 	/*
937 	 * Get rid of our second half. As the other of the two
938 	 * interfaces may reside in a different vnet, we need to
939 	 * switch before freeing them.
940 	 */
941 	CURVNET_SET_QUIET(oifp->if_vnet);
942 	ether_ifdetach(oifp);
943 	/*
944 	 * Wait for all packets to be dispatched to if_input.
945 	 * The numbers can only go down as the interface is
946 	 * detached so there is no need to use atomics.
947 	 */
948 	DPRINTF("scb refcnt=%u\n", scb->refcount);
949 	EPAIR_REFCOUNT_ASSERT(scb->refcount == 1,
950 	    ("%s: ifp=%p scb->refcount!=1: %d", __func__, oifp, scb->refcount));
951 	oifp->if_softc = NULL;
952 	error = if_clone_destroyif(ifc, oifp);
953 	if (error)
954 		panic("%s: if_clone_destroyif() for our 2nd iface failed: %d",
955 		    __func__, error);
956 	if_free(oifp);
957 	ifmedia_removeall(&scb->media);
958 	free(scb, M_EPAIR);
959 	CURVNET_RESTORE();
960 
961 	ether_ifdetach(ifp);
962 	/*
963 	 * Wait for all packets to be dispatched to if_input.
964 	 */
965 	DPRINTF("sca refcnt=%u\n", sca->refcount);
966 	EPAIR_REFCOUNT_ASSERT(sca->refcount == 1,
967 	    ("%s: ifp=%p sca->refcount!=1: %d", __func__, ifp, sca->refcount));
968 	if_free(ifp);
969 	ifmedia_removeall(&sca->media);
970 	free(sca, M_EPAIR);
971 	ifc_free_unit(ifc, unit);
972 
973 	return (0);
974 }
975 
976 static void
977 vnet_epair_init(const void *unused __unused)
978 {
979 
980 	V_epair_cloner = if_clone_advanced(epairname, 0,
981 	    epair_clone_match, epair_clone_create, epair_clone_destroy);
982 #ifdef VIMAGE
983 	netisr_register_vnet(&epair_nh);
984 #endif
985 }
986 VNET_SYSINIT(vnet_epair_init, SI_SUB_PSEUDO, SI_ORDER_ANY,
987     vnet_epair_init, NULL);
988 
989 static void
990 vnet_epair_uninit(const void *unused __unused)
991 {
992 
993 #ifdef VIMAGE
994 	netisr_unregister_vnet(&epair_nh);
995 #endif
996 	if_clone_detach(V_epair_cloner);
997 }
998 VNET_SYSUNINIT(vnet_epair_uninit, SI_SUB_INIT_IF, SI_ORDER_ANY,
999     vnet_epair_uninit, NULL);
1000 
1001 static void
1002 epair_uninit(const void *unused __unused)
1003 {
1004 	netisr_unregister(&epair_nh);
1005 	epair_dpcpu_detach();
1006 	if (bootverbose)
1007 		printf("%s unloaded.\n", epairname);
1008 }
1009 SYSUNINIT(epair_uninit, SI_SUB_INIT_IF, SI_ORDER_MIDDLE,
1010     epair_uninit, NULL);
1011 
1012 static int
1013 epair_modevent(module_t mod, int type, void *data)
1014 {
1015 	int qlimit;
1016 
1017 	switch (type) {
1018 	case MOD_LOAD:
1019 		/* For now limit us to one global mutex and one inq. */
1020 		epair_dpcpu_init();
1021 		epair_nh.nh_qlimit = 42 * ifqmaxlen; /* 42 shall be the number. */
1022 		if (TUNABLE_INT_FETCH("net.link.epair.netisr_maxqlen", &qlimit))
1023 		    epair_nh.nh_qlimit = qlimit;
1024 		netisr_register(&epair_nh);
1025 		if (bootverbose)
1026 			printf("%s initialized.\n", epairname);
1027 		break;
1028 	case MOD_UNLOAD:
1029 		/* Handled in epair_uninit() */
1030 		break;
1031 	default:
1032 		return (EOPNOTSUPP);
1033 	}
1034 	return (0);
1035 }
1036 
1037 static moduledata_t epair_mod = {
1038 	"if_epair",
1039 	epair_modevent,
1040 	0
1041 };
1042 
1043 DECLARE_MODULE(if_epair, epair_mod, SI_SUB_PSEUDO, SI_ORDER_MIDDLE);
1044 MODULE_VERSION(if_epair, 1);
1045