/*
 * Copyright (C) 2013-2014 Universita` di Pisa. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *   1. Redistributions of source code must retain the above copyright
 *      notice, this list of conditions and the following disclaimer.
 *   2. Redistributions in binary form must reproduce the above copyright
 *      notice, this list of conditions and the following disclaimer in the
 *      documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */


/*
 * This module implements the VALE switch for netmap

--- VALE SWITCH ---

NMG_LOCK() serializes all modifications to switches and ports.
A switch cannot be deleted until all ports are gone.

For each switch, an SX lock (RWlock on linux) protects
deletion of ports. When a port is configured or deleted, the
lock is acquired in exclusive mode (after holding NMG_LOCK).
When forwarding, the lock is acquired in shared mode (without NMG_LOCK).
The lock is held throughout the entire forwarding cycle,
during which the thread may incur a page fault.
Hence it is important that sleepable shared locks are used.

On the rx ring, the per-port lock is grabbed initially to reserve
a number of slots in the ring, then the lock is released,
packets are copied from source to destination, and then
the lock is acquired again and the receive ring is updated.
(A similar thing is done on the tx ring for NIC and host stack
ports attached to the switch)

 */
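
/*
 * Illustrative sketch of that forwarding cycle (not compiled; it uses
 * the nm_kr_space()/nm_kr_lease() helpers defined later in this file,
 * plus hypothetical copy_packets() and final_slot() helpers):
 */
#if 0
static void
example_rx_forward_cycle(struct netmap_kring *rxkring, u_int wanted)
{
	u_int start, howmany;
	uint32_t lease_idx;

	mtx_lock(&rxkring->q_lock);
	howmany = nm_kr_space(rxkring, 1);	/* free slots on the rx ring */
	if (wanted < howmany)
		howmany = wanted;
	start = rxkring->nkr_hwlease;
	lease_idx = nm_kr_lease(rxkring, howmany, 1);	/* reserve them */
	mtx_unlock(&rxkring->q_lock);

	copy_packets(rxkring, start, howmany);	/* no lock held here */

	mtx_lock(&rxkring->q_lock);
	/* report completion; the lease mechanism advances nr_hwtail
	 * only once all earlier leases have completed as well.
	 */
	rxkring->nkr_leases[lease_idx] = final_slot(rxkring, start, howmany);
	mtx_unlock(&rxkring->q_lock);
}
#endif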

/*
 * OS-specific code that is used only within this file.
 * Other OS-specific code that must be accessed by drivers
 * is present in netmap_kern.h
 */

#if defined(__FreeBSD__)
#include <sys/cdefs.h> /* prerequisite */
__FBSDID("$FreeBSD$");

#include <sys/types.h>
#include <sys/errno.h>
#include <sys/param.h>	/* defines used in kernel.h */
#include <sys/kernel.h>	/* types used in module initialization */
#include <sys/conf.h>	/* cdevsw struct, UID, GID */
#include <sys/sockio.h>
#include <sys/socketvar.h>	/* struct socket */
#include <sys/malloc.h>
#include <sys/poll.h>
#include <sys/rwlock.h>
#include <sys/socket.h> /* sockaddrs */
#include <sys/selinfo.h>
#include <sys/sysctl.h>
#include <net/if.h>
#include <net/if_var.h>
#include <net/bpf.h>		/* BIOCIMMEDIATE */
#include <machine/bus.h>	/* bus_dmamap_* */
#include <sys/endian.h>
#include <sys/refcount.h>


#define BDG_RWLOCK_T		struct rwlock

#define	BDG_RWINIT(b)		\
	rw_init_flags(&(b)->bdg_lock, "bdg lock", RW_NOWITNESS)
#define BDG_WLOCK(b)		rw_wlock(&(b)->bdg_lock)
#define BDG_WUNLOCK(b)		rw_wunlock(&(b)->bdg_lock)
#define BDG_RLOCK(b)		rw_rlock(&(b)->bdg_lock)
#define BDG_RTRYLOCK(b)		rw_try_rlock(&(b)->bdg_lock)
#define BDG_RUNLOCK(b)		rw_runlock(&(b)->bdg_lock)
#define BDG_RWDESTROY(b)	rw_destroy(&(b)->bdg_lock)


#elif defined(linux)

#include "bsd_glue.h"

#elif defined(__APPLE__)

#warning OSX support is only partial
#include "osx_glue.h"

#else

#error	Unsupported platform

#endif /* unsupported */

/*
 * common headers
 */

#include <net/netmap.h>
#include <dev/netmap/netmap_kern.h>
#include <dev/netmap/netmap_mem2.h>

#ifdef WITH_VALE

/*
 * system parameters (most of them in netmap_kern.h)
 * NM_NAME	prefix for switch port names, default "vale"
 * NM_BDG_MAXPORTS	number of ports
 * NM_BRIDGES	max number of switches in the system.
 *	XXX should become a sysctl or tunable
 *
 * Switch ports are named valeX:Y where X is the switch name and Y
 * is the port. If Y matches the name of a physical interface, the
 * port is connected to that physical device.
 *
 * Unlike physical interfaces, switch ports use their own memory region
 * for rings and buffers.
 * The virtual interfaces use per-queue locks instead of the core lock.
 * In the tx loop, we aggregate traffic in batches to make all operations
 * faster. The batch size is bridge_batch.
 */
#define NM_BDG_MAXRINGS		16	/* XXX unclear how many. */
#define NM_BDG_MAXSLOTS		4096	/* XXX same as above */
#define NM_BRIDGE_RINGSIZE	1024	/* in the device */
#define NM_BDG_HASH		1024	/* forwarding table entries */
#define NM_BDG_BATCH		1024	/* entries in the forwarding buffer */
#define NM_MULTISEG		64	/* max size of a chain of bufs */
/* actual size of the tables */
#define NM_BDG_BATCH_MAX	(NM_BDG_BATCH + NM_MULTISEG)
/* NM_FT_NULL terminates a list of slots in the ft */
#define NM_FT_NULL		NM_BDG_BATCH_MAX
#define	NM_BRIDGES		8	/* number of bridges */
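
/*
 * Illustrative userspace usage of the naming scheme above (a sketch,
 * not part of this module; error handling omitted). The first request
 * attaches the physical interface "em0" to switch "vale0", the second
 * creates and opens the purely virtual port "vale0:p0".
 */
#if 0
	struct nmreq req;

	bzero(&req, sizeof(req));
	req.nr_version = NETMAP_API;
	strncpy(req.nr_name, "vale0:em0", sizeof(req.nr_name));
	req.nr_cmd = NETMAP_BDG_ATTACH;
	ioctl(fd, NIOCREGIF, &req);

	bzero(&req, sizeof(req));
	req.nr_version = NETMAP_API;
	strncpy(req.nr_name, "vale0:p0", sizeof(req.nr_name));
	ioctl(fd, NIOCREGIF, &req);
#endif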


/*
 * bridge_batch is set via sysctl to the max batch size to be
 * used in the bridge. The actual batch may be larger, since the
 * fragments of the last packet in a batch are never split across
 * the boundary.
 */
int bridge_batch = NM_BDG_BATCH; /* bridge batch size */
SYSCTL_DECL(_dev_netmap);
SYSCTL_INT(_dev_netmap, OID_AUTO, bridge_batch, CTLFLAG_RW, &bridge_batch, 0, "");
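
/*
 * The knob can be tuned at runtime, e.g. from userspace (a sketch,
 * error handling omitted):
 *
 *	int batch = 512;
 *	sysctlbyname("dev.netmap.bridge_batch", NULL, NULL,
 *	    &batch, sizeof(batch));
 */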


static int bdg_netmap_attach(struct nmreq *nmr, struct ifnet *ifp);
static int bdg_netmap_reg(struct netmap_adapter *na, int onoff);
static int netmap_bwrap_attach(struct ifnet *, struct ifnet *);
static int netmap_bwrap_register(struct netmap_adapter *, int onoff);
int kern_netmap_regif(struct nmreq *nmr);

/*
 * Each transmit queue accumulates a batch of packets into
 * a structure before forwarding. Packets to the same
 * destination are put in a list using ft_next as a link field.
 * ft_frags and ft_next are valid only on the first fragment.
 */
struct nm_bdg_fwd {	/* forwarding entry for a bridge */
	void *ft_buf;		/* netmap or indirect buffer */
	uint8_t ft_frags;	/* how many fragments (only on 1st frag) */
	uint8_t _ft_port;	/* dst port (unused) */
	uint16_t ft_flags;	/* flags, e.g. indirect */
	uint16_t ft_len;	/* src fragment len */
	uint16_t ft_next;	/* next packet to same destination */
};
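
/*
 * A batch destined to the same output queue is thus a singly linked
 * list threaded through the ft array. A consumer walks it roughly as
 * follows (sketch only; q is a struct nm_bdg_q, defined below):
 *
 *	for (i = q->bq_head; i != NM_FT_NULL; i = ft[i].ft_next) {
 *		// ft[i] is the first fragment of a packet, followed
 *		// by ft[i].ft_frags - 1 contiguous fragments
 *	}
 */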

/*
 * For each output interface, nm_bdg_q is used to construct a list.
 * bq_len is the number of output buffers (we can have coalescing
 * during the copy).
 */
struct nm_bdg_q {
	uint16_t bq_head;
	uint16_t bq_tail;
	uint32_t bq_len;	/* number of buffers */
};

/* XXX revise this */
struct nm_hash_ent {
	uint64_t	mac;	/* the top 2 bytes are the epoch */
	uint64_t	ports;
};

/*
 * nm_bridge is a descriptor for a VALE switch.
 * Interfaces for a bridge are all in bdg_ports[].
 * The array has fixed size, an empty entry does not terminate
 * the search, but lookups only occur on attach/detach so we
 * don't mind if they are slow.
 *
 * The bridge is non-blocking on the transmit ports: excess
 * packets are dropped if there is no room on the output port.
 *
 * bdg_lock protects accesses to the bdg_ports array.
 * This is a rw lock (or equivalent).
 */
struct nm_bridge {
	/* XXX what is the proper alignment/layout ? */
	BDG_RWLOCK_T	bdg_lock;	/* protects bdg_ports */
	int		bdg_namelen;
	uint32_t	bdg_active_ports; /* 0 means free */
	char		bdg_basename[IFNAMSIZ];

	/* Indexes of the active ports (the first bdg_active_ports
	 * entries) followed by all the remaining, inactive ones.
	 */
	uint8_t		bdg_port_index[NM_BDG_MAXPORTS];

	struct netmap_vp_adapter *bdg_ports[NM_BDG_MAXPORTS];


	/*
	 * The function to decide the destination port.
	 * It returns either the index of the destination port,
	 * NM_BDG_BROADCAST to broadcast this packet, or NM_BDG_NOPORT
	 * to drop it.  ring_nr is the source ring index, and the
	 * function may overwrite this value to forward this packet to a
	 * different ring index.
	 * This function must be set by netmap_bdgctl().
	 */
	bdg_lookup_fn_t nm_bdg_lookup;

	/* the forwarding table, MAC+ports.
	 * XXX should be changed to an argument to be passed to
	 * the lookup function, and allocated on attach
	 */
	struct nm_hash_ent ht[NM_BDG_HASH];
};


/*
 * XXX in principle nm_bridges could be created dynamically
 * Right now we have a static array and deletions are protected
 * by an exclusive lock.
 */
struct nm_bridge nm_bridges[NM_BRIDGES];


/*
 * this is a slightly optimized copy routine which rounds
 * to a multiple of 64 bytes and is often faster than dealing
 * with other odd sizes. We assume there is enough room
 * in the source and destination buffers.
 *
 * XXX only for multiples of 64 bytes, non-overlapping buffers.
 */
static inline void
pkt_copy(void *_src, void *_dst, int l)
{
        uint64_t *src = _src;
        uint64_t *dst = _dst;
        if (unlikely(l >= 1024)) {
                memcpy(dst, src, l);
                return;
        }
        for (; likely(l > 0); l-=64) {
                *dst++ = *src++;
                *dst++ = *src++;
                *dst++ = *src++;
                *dst++ = *src++;
                *dst++ = *src++;
                *dst++ = *src++;
                *dst++ = *src++;
                *dst++ = *src++;
        }
}
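
/*
 * Callers round the length up themselves, so both buffers must have
 * room for the rounded size. E.g., as done in nm_bdg_flush() below:
 *
 *	copy_len = (copy_len + 63) & ~63;
 *	pkt_copy(src, dst, (int)copy_len);
 */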


/*
 * locate a bridge among the existing ones.
 * MUST BE CALLED WITH NMG_LOCK()
 *
 * A ':' in the name terminates the bridge name; otherwise the bridge
 * name is just the NM_NAME prefix.
 * We assume that this is called with a name of at least NM_NAME chars.
 */
static struct nm_bridge *
nm_find_bridge(const char *name, int create)
{
	int i, l, namelen;
	struct nm_bridge *b = NULL;

	NMG_LOCK_ASSERT();

	namelen = strlen(NM_NAME);	/* base length */
	l = name ? strlen(name) : 0;	/* actual length */
	if (l < namelen) {
		D("invalid bridge name %s", name ? name : "(null)");
		return NULL;
	}
	for (i = namelen + 1; i < l; i++) {
		if (name[i] == ':') {
			namelen = i;
			break;
		}
	}
	if (namelen >= IFNAMSIZ)
		namelen = IFNAMSIZ;
	ND("--- prefix is '%.*s' ---", namelen, name);

	/* lookup the name, remember empty slot if there is one */
	for (i = 0; i < NM_BRIDGES; i++) {
		struct nm_bridge *x = nm_bridges + i;

		if (x->bdg_active_ports == 0) {
			if (create && b == NULL)
				b = x;	/* record empty slot */
		} else if (x->bdg_namelen != namelen) {
			continue;
		} else if (strncmp(name, x->bdg_basename, namelen) == 0) {
			ND("found '%.*s' at %d", namelen, name, i);
			b = x;
			break;
		}
	}
	if (i == NM_BRIDGES && b) { /* name not found, can create entry */
		/* initialize the bridge */
		strncpy(b->bdg_basename, name, namelen);
		ND("create new bridge %s with ports %d", b->bdg_basename,
			b->bdg_active_ports);
		b->bdg_namelen = namelen;
		b->bdg_active_ports = 0;
		for (i = 0; i < NM_BDG_MAXPORTS; i++)
			b->bdg_port_index[i] = i;
		/* set the default function */
		b->nm_bdg_lookup = netmap_bdg_learning;
		/* reset the MAC address table */
		bzero(b->ht, sizeof(struct nm_hash_ent) * NM_BDG_HASH);
	}
	return b;
}
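
/*
 * Example (with NM_NAME == "vale"): in "vale0:em0" the bridge name is
 * "vale0" (everything up to the ':'), while individual ports are later
 * matched on the full "vale0:em0" string.
 */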


/*
 * Free the forwarding tables for rings attached to switch ports.
 */
static void
nm_free_bdgfwd(struct netmap_adapter *na)
{
	int nrings, i;
	struct netmap_kring *kring;

	NMG_LOCK_ASSERT();
	nrings = na->num_tx_rings;
	kring = na->tx_rings;
	for (i = 0; i < nrings; i++) {
		if (kring[i].nkr_ft) {
			free(kring[i].nkr_ft, M_DEVBUF);
			kring[i].nkr_ft = NULL; /* protect from freeing twice */
		}
	}
}


/*
 * Allocate the forwarding tables for the rings attached to the bridge ports.
 */
static int
nm_alloc_bdgfwd(struct netmap_adapter *na)
{
	int nrings, l, i, num_dstq;
	struct netmap_kring *kring;

	NMG_LOCK_ASSERT();
	/* all port:rings + broadcast */
	num_dstq = NM_BDG_MAXPORTS * NM_BDG_MAXRINGS + 1;
	l = sizeof(struct nm_bdg_fwd) * NM_BDG_BATCH_MAX;
	l += sizeof(struct nm_bdg_q) * num_dstq;
	l += sizeof(uint16_t) * NM_BDG_BATCH_MAX;

	nrings = na->num_tx_rings + 1;
	kring = na->tx_rings;
	for (i = 0; i < nrings; i++) {
		struct nm_bdg_fwd *ft;
		struct nm_bdg_q *dstq;
		int j;

		ft = malloc(l, M_DEVBUF, M_NOWAIT | M_ZERO);
		if (!ft) {
			nm_free_bdgfwd(na);
			return ENOMEM;
		}
		dstq = (struct nm_bdg_q *)(ft + NM_BDG_BATCH_MAX);
		for (j = 0; j < num_dstq; j++) {
			dstq[j].bq_head = dstq[j].bq_tail = NM_FT_NULL;
			dstq[j].bq_len = 0;
		}
		kring[i].nkr_ft = ft;
	}
	return 0;
}
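
/*
 * Layout of the single allocation above, recovered in nm_bdg_flush()
 * with plain pointer arithmetic (a sketch of the memory map):
 *
 *	struct nm_bdg_fwd ft[NM_BDG_BATCH_MAX];           work area
 *	struct nm_bdg_q   dstq[NM_BDG_MAXPORTS * NM_BDG_MAXRINGS + 1];
 *	uint16_t          dsts[NM_BDG_BATCH_MAX];         destination indexes
 */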


static void
netmap_bdg_detach_common(struct nm_bridge *b, int hw, int sw)
{
	int s_hw = hw, s_sw = sw;
	int i, lim = b->bdg_active_ports;
	uint8_t tmp[NM_BDG_MAXPORTS];

	/*
	 * New algorithm:
	 * make a copy of bdg_port_index;
	 * lookup NA(ifp)->bdg_port and SWNA(ifp)->bdg_port
	 * in the array of bdg_port_index, replacing them with
	 * entries from the bottom of the array;
	 * decrement bdg_active_ports;
	 * acquire BDG_WLOCK() and copy back the array.
	 */

	D("detach %d and %d (lim %d)", hw, sw, lim);
	/* make a copy of the list of active ports, update it,
	 * and then copy back within BDG_WLOCK().
	 */
	memcpy(tmp, b->bdg_port_index, sizeof(tmp));
	for (i = 0; (hw >= 0 || sw >= 0) && i < lim; ) {
		if (hw >= 0 && tmp[i] == hw) {
			ND("detach hw %d at %d", hw, i);
			lim--; /* point to last active port */
			tmp[i] = tmp[lim]; /* swap with i */
			tmp[lim] = hw;	/* now this is inactive */
			hw = -1;
		} else if (sw >= 0 && tmp[i] == sw) {
			ND("detach sw %d at %d", sw, i);
			lim--;
			tmp[i] = tmp[lim];
			tmp[lim] = sw;
			sw = -1;
		} else {
			i++;
		}
	}
	if (hw >= 0 || sw >= 0) {
		D("XXX delete failed hw %d sw %d, should panic...", hw, sw);
	}

	BDG_WLOCK(b);
	b->bdg_ports[s_hw] = NULL;
	if (s_sw >= 0) {
		b->bdg_ports[s_sw] = NULL;
	}
	memcpy(b->bdg_port_index, tmp, sizeof(tmp));
	b->bdg_active_ports = lim;
	BDG_WUNLOCK(b);

	ND("now %d active ports", lim);
	if (lim == 0) {
		ND("marking bridge %s as free", b->bdg_basename);
		b->nm_bdg_lookup = NULL;
	}
}


static void
netmap_adapter_vp_dtor(struct netmap_adapter *na)
{
	struct netmap_vp_adapter *vpna = (struct netmap_vp_adapter*)na;
	struct nm_bridge *b = vpna->na_bdg;
	struct ifnet *ifp = na->ifp;

	ND("%s has %d references", NM_IFPNAME(ifp), na->na_refcount);

	if (b) {
		netmap_bdg_detach_common(b, vpna->bdg_port, -1);
	}

	bzero(ifp, sizeof(*ifp));
	free(ifp, M_DEVBUF);
	na->ifp = NULL;
}


/* Try to get a reference to a netmap adapter attached to a VALE switch.
 * If the adapter is found (or is created), this function returns 0,
 * a non-NULL pointer is stored in *na, and the caller holds a
 * reference to the adapter.
 * If an adapter is not found, then no reference is grabbed and the
 * function returns an error code, or 0 if the name simply lacks a
 * VALE prefix. Therefore the caller holds a reference when
 * (*na != NULL && return == 0).
 */
int
netmap_get_bdg_na(struct nmreq *nmr, struct netmap_adapter **na, int create)
{
	const char *name = nmr->nr_name;
	struct ifnet *ifp;
	int error = 0;
	struct netmap_adapter *ret;
	struct netmap_vp_adapter *vpna;
	struct nm_bridge *b;
	int i, j, cand = -1, cand2 = -1;
	int needed;

	*na = NULL;     /* default return value */

	/* first try to see if this is a bridge port. */
	NMG_LOCK_ASSERT();
	if (strncmp(name, NM_NAME, sizeof(NM_NAME) - 1)) {
		return 0;  /* no error, but no VALE prefix */
	}

	b = nm_find_bridge(name, create);
	if (b == NULL) {
		D("no bridges available for '%s'", name);
		return (create ? ENOMEM : ENXIO);
	}

	/* Now we are sure that name starts with the bridge's name,
	 * lookup the port in the bridge. We need to scan the entire
	 * list. It is not important to hold a WLOCK on the bridge
	 * during the search because NMG_LOCK already guarantees
	 * that there are no other possible writers.
	 */

	/* lookup in the local list of ports */
	for (j = 0; j < b->bdg_active_ports; j++) {
		i = b->bdg_port_index[j];
		vpna = b->bdg_ports[i];
		// KASSERT(na != NULL);
		ifp = vpna->up.ifp;
		/* XXX make sure the name only contains one : */
		if (!strcmp(NM_IFPNAME(ifp), name)) {
			netmap_adapter_get(&vpna->up);
			ND("found existing if %s refs %d", name,
				vpna->na_bdg_refcount);
			*na = (struct netmap_adapter *)vpna;
			return 0;
		}
	}
	/* not found, should we create it? */
	if (!create)
		return ENXIO;
	/* yes we should, see if we have space to attach entries */
	needed = 2; /* in some cases we only need 1 */
	if (b->bdg_active_ports + needed >= NM_BDG_MAXPORTS) {
		D("bridge full %d, cannot create new port", b->bdg_active_ports);
		return ENOMEM;
	}
	/* record the next two ports available, but do not allocate yet */
	cand = b->bdg_port_index[b->bdg_active_ports];
	cand2 = b->bdg_port_index[b->bdg_active_ports + 1];
	ND("+++ bridge %s port %s used %d avail %d %d",
		b->bdg_basename, name, b->bdg_active_ports, cand, cand2);

	/*
	 * see if there is a matching NIC with this name
	 * (after the bridge's name)
	 */
	ifp = ifunit_ref(name + b->bdg_namelen + 1);
	if (!ifp) { /* this is a virtual port */
		if (nmr->nr_cmd) {
			/* nr_cmd must be 0 for a virtual port */
			return EINVAL;
		}

		/* create a struct ifnet for the new port.
		 * need M_NOWAIT as we are under nma_lock
		 */
		ifp = malloc(sizeof(*ifp), M_DEVBUF, M_NOWAIT | M_ZERO);
		if (!ifp)
			return ENOMEM;

		strcpy(ifp->if_xname, name);
		/* bdg_netmap_attach creates a struct netmap_adapter */
		error = bdg_netmap_attach(nmr, ifp);
		if (error) {
			D("error %d", error);
			free(ifp, M_DEVBUF);
			return error;
		}
		ret = NA(ifp);
		cand2 = -1;	/* only need one port */
	} else {  /* this is a NIC */
		struct ifnet *fake_ifp;

		error = netmap_get_hw_na(ifp, &ret);
		if (error || ret == NULL)
			goto out;

		/* make sure the NIC is not already in use */
		if (NETMAP_OWNED_BY_ANY(ret)) {
			D("NIC %s busy, cannot attach to bridge",
				NM_IFPNAME(ifp));
			error = EBUSY;
			goto out;
		}
		/* create a fake interface */
		fake_ifp = malloc(sizeof(*ifp), M_DEVBUF, M_NOWAIT | M_ZERO);
		if (!fake_ifp) {
			error = ENOMEM;
			goto out;
		}
		strcpy(fake_ifp->if_xname, name);
		error = netmap_bwrap_attach(fake_ifp, ifp);
		if (error) {
			free(fake_ifp, M_DEVBUF);
			goto out;
		}
		ret = NA(fake_ifp);
		if (nmr->nr_arg1 != NETMAP_BDG_HOST)
			cand2 = -1; /* only need one port */
		if_rele(ifp);
	}
	vpna = (struct netmap_vp_adapter *)ret;

	BDG_WLOCK(b);
	vpna->bdg_port = cand;
	ND("NIC  %p to bridge port %d", vpna, cand);
	/* bind the port to the bridge (virtual ports are not active) */
	b->bdg_ports[cand] = vpna;
	vpna->na_bdg = b;
	b->bdg_active_ports++;
	if (cand2 >= 0) {
		struct netmap_vp_adapter *hostna = vpna + 1;
		/* also bind the host stack to the bridge */
		b->bdg_ports[cand2] = hostna;
		hostna->bdg_port = cand2;
		hostna->na_bdg = b;
		b->bdg_active_ports++;
		ND("host %p to bridge port %d", hostna, cand2);
	}
	ND("if %s refs %d", name, vpna->up.na_refcount);
	BDG_WUNLOCK(b);
	*na = ret;
	netmap_adapter_get(ret);
	return 0;

out:
	if_rele(ifp);

	return error;
}
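
/*
 * Sketch of the calling convention described above (hypothetical
 * caller, NMG_LOCK held; try_other_types() is a made-up fallback):
 */
#if 0
	struct netmap_adapter *na;
	int error;

	error = netmap_get_bdg_na(nmr, &na, 1 /* create */);
	if (error)
		return error;		/* a real error */
	if (na == NULL)
		return try_other_types(nmr);	/* name had no VALE prefix */
	/* here we hold a reference to na ... */
	netmap_adapter_put(na);		/* ... and must drop it when done */
#endif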


/* Process NETMAP_BDG_ATTACH and NETMAP_BDG_DETACH */
static int
nm_bdg_attach(struct nmreq *nmr)
{
	struct netmap_adapter *na;
	struct netmap_if *nifp;
	struct netmap_priv_d *npriv;
	struct netmap_bwrap_adapter *bna;
	int error;

	npriv = malloc(sizeof(*npriv), M_DEVBUF, M_NOWAIT|M_ZERO);
	if (npriv == NULL)
		return ENOMEM;

	NMG_LOCK();

	error = netmap_get_bdg_na(nmr, &na, 1 /* create if not exists */);
	if (error) /* no device, or another bridge or user owns the device */
		goto unlock_exit;

	if (na == NULL) { /* VALE prefix missing */
		error = EINVAL;
		goto unlock_exit;
	}

	if (na->active_fds > 0) { /* already registered */
		error = EBUSY;
		goto unref_exit;
	}

	nifp = netmap_do_regif(npriv, na, nmr->nr_ringid, &error);
	if (!nifp) {
		goto unref_exit;
	}

	bna = (struct netmap_bwrap_adapter*)na;
	bna->na_kpriv = npriv;
	NMG_UNLOCK();
	ND("registered %s to netmap-mode", NM_IFPNAME(na->ifp));
	return 0;

unref_exit:
	netmap_adapter_put(na);
unlock_exit:
	NMG_UNLOCK();
	bzero(npriv, sizeof(*npriv));
	free(npriv, M_DEVBUF);
	return error;
}


static int
nm_bdg_detach(struct nmreq *nmr)
{
	struct netmap_adapter *na;
	int error;
	struct netmap_bwrap_adapter *bna;
	int last_instance;

	NMG_LOCK();
	error = netmap_get_bdg_na(nmr, &na, 0 /* don't create */);
	if (error) { /* no device, or another bridge or user owns the device */
		goto unlock_exit;
	}

	if (na == NULL) { /* VALE prefix missing */
		error = EINVAL;
		goto unlock_exit;
	}

	bna = (struct netmap_bwrap_adapter *)na;

	if (na->active_fds == 0) { /* not registered */
		error = EINVAL;
		goto unref_exit;
	}

	last_instance = netmap_dtor_locked(bna->na_kpriv); /* unregister */
	if (!last_instance) {
		D("--- error, trying to detach an entry with active mmaps");
		error = EINVAL;
	} else {
		struct netmap_priv_d *npriv = bna->na_kpriv;

		bna->na_kpriv = NULL;
		D("deleting priv");

		bzero(npriv, sizeof(*npriv));
		free(npriv, M_DEVBUF);
	}

unref_exit:
	netmap_adapter_put(na);
unlock_exit:
	NMG_UNLOCK();
	return error;
}


/* exported to kernel callers, e.g. OVS ?
 * Entry point.
 * Called without NMG_LOCK.
 */
int
netmap_bdg_ctl(struct nmreq *nmr, bdg_lookup_fn_t func)
{
	struct nm_bridge *b;
	struct netmap_adapter *na;
	struct netmap_vp_adapter *vpna;
	struct ifnet *iter;
	char *name = nmr->nr_name;
	int cmd = nmr->nr_cmd, namelen = strlen(name);
	int error = 0, i, j;

	switch (cmd) {
	case NETMAP_BDG_ATTACH:
		error = nm_bdg_attach(nmr);
		break;

	case NETMAP_BDG_DETACH:
		error = nm_bdg_detach(nmr);
		break;

	case NETMAP_BDG_LIST:
		/* this is used to enumerate bridges and ports */
		if (namelen) { /* look up indexes of bridge and port */
			if (strncmp(name, NM_NAME, strlen(NM_NAME))) {
				error = EINVAL;
				break;
			}
			NMG_LOCK();
			b = nm_find_bridge(name, 0 /* don't create */);
			if (!b) {
				error = ENOENT;
				NMG_UNLOCK();
				break;
			}

			error = ENOENT;
			for (j = 0; j < b->bdg_active_ports; j++) {
				i = b->bdg_port_index[j];
				vpna = b->bdg_ports[i];
				if (vpna == NULL) {
					D("---AAAAAAAAARGH-------");
					continue;
				}
				iter = vpna->up.ifp;
				/* the name identifies either a
				 * virtual port or a NIC
				 */
				if (!strcmp(iter->if_xname, name)) {
					/* bridge index */
					nmr->nr_arg1 = b - nm_bridges;
					nmr->nr_arg2 = i; /* port index */
					error = 0;
					break;
				}
			}
			NMG_UNLOCK();
		} else {
			/* return the first non-empty entry starting from
			 * bridge nr_arg1 and port nr_arg2.
			 *
			 * Users can detect the end of the same bridge by
			 * comparing the new and old values of nr_arg1, and
			 * the end of all the bridges by error != 0
			 */
			i = nmr->nr_arg1;
			j = nmr->nr_arg2;

			NMG_LOCK();
			for (error = ENOENT; i < NM_BRIDGES; i++) {
				b = nm_bridges + i;
				if (j >= b->bdg_active_ports) {
					j = 0; /* following bridges scan from 0 */
					continue;
				}
				nmr->nr_arg1 = i;
				nmr->nr_arg2 = j;
				j = b->bdg_port_index[j];
				vpna = b->bdg_ports[j];
				iter = vpna->up.ifp;
				strncpy(name, iter->if_xname, (size_t)IFNAMSIZ);
				error = 0;
				break;
			}
			NMG_UNLOCK();
		}
		break;

	case NETMAP_BDG_LOOKUP_REG:
		/* register a lookup function for the given bridge.
		 * nmr->nr_name may be just the bridge's name (including
		 * the ':' if it is not just NM_NAME).
		 */
		if (!func) {
			error = EINVAL;
			break;
		}
		NMG_LOCK();
		b = nm_find_bridge(name, 0 /* don't create */);
		if (!b) {
			error = EINVAL;
		} else {
			b->nm_bdg_lookup = func;
		}
		NMG_UNLOCK();
		break;

	case NETMAP_BDG_OFFSET:
		NMG_LOCK();
		error = netmap_get_bdg_na(nmr, &na, 0);
		if (na && !error) {
			vpna = (struct netmap_vp_adapter *)na;
			if (nmr->nr_arg1 > NETMAP_BDG_MAX_OFFSET)
				nmr->nr_arg1 = NETMAP_BDG_MAX_OFFSET;
			vpna->offset = nmr->nr_arg1;
			D("Using offset %d for %p", vpna->offset, vpna);
			netmap_adapter_put(na);
		}
		NMG_UNLOCK();
		break;

	default:
		D("invalid cmd (nmr->nr_cmd) (0x%x)", cmd);
		error = EINVAL;
		break;
	}
	return error;
}
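
/*
 * For example, a kernel client (such as the OVS integration mentioned
 * above) could replace the learning logic of switch "vale0" with its
 * own classifier (a sketch; my_lookup must honour the bdg_lookup_fn_t
 * contract described in struct nm_bridge):
 */
#if 0
	struct nmreq req;
	int error;

	bzero(&req, sizeof(req));
	strncpy(req.nr_name, "vale0:", sizeof(req.nr_name));
	req.nr_cmd = NETMAP_BDG_LOOKUP_REG;
	error = netmap_bdg_ctl(&req, my_lookup);
#endif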


static int
netmap_vp_krings_create(struct netmap_adapter *na)
{
	u_int ntx, nrx, tailroom;
	int error, i;
	uint32_t *leases;

	/* XXX vps do not need host rings,
	 * but we crash if we don't have one
	 */
	ntx = na->num_tx_rings + 1;
	nrx = na->num_rx_rings + 1;

	/*
	 * Leases are attached to RX rings on vale ports
	 */
	tailroom = sizeof(uint32_t) * na->num_rx_desc * nrx;

	error = netmap_krings_create(na, ntx, nrx, tailroom);
	if (error)
		return error;

	leases = na->tailroom;

	for (i = 0; i < nrx; i++) { /* Receive rings */
		na->rx_rings[i].nkr_leases = leases;
		leases += na->num_rx_desc;
	}

	error = nm_alloc_bdgfwd(na);
	if (error) {
		netmap_krings_delete(na);
		return error;
	}

	return 0;
}


static void
netmap_vp_krings_delete(struct netmap_adapter *na)
{
	nm_free_bdgfwd(na);
	netmap_krings_delete(na);
}


static int
nm_bdg_flush(struct nm_bdg_fwd *ft, u_int n,
	struct netmap_vp_adapter *na, u_int ring_nr);


/*
 * Grab packets from a kring, move them into the ft structure
 * associated with the tx (input) port. Max one instance per port,
 * filtered on input (ioctl, poll or XXX).
 * Returns the next position in the ring.
 */
static int
nm_bdg_preflush(struct netmap_vp_adapter *na, u_int ring_nr,
	struct netmap_kring *kring, u_int end)
{
	struct netmap_ring *ring = kring->ring;
	struct nm_bdg_fwd *ft;
	u_int j = kring->nr_hwcur, lim = kring->nkr_num_slots - 1;
	u_int ft_i = 0;	/* start from 0 */
	u_int frags = 1; /* how many frags ? */
	struct nm_bridge *b = na->na_bdg;

	/* To protect against modifications to the bridge we acquire a
	 * shared lock, waiting if we can sleep (if the source port is
	 * attached to a user process) or with a trylock otherwise (NICs).
	 */
	ND("wait rlock for %d packets", ((j > end ? lim+1 : 0) + end) - j);
	if (na->up.na_flags & NAF_BDG_MAYSLEEP)
		BDG_RLOCK(b);
	else if (!BDG_RTRYLOCK(b))
		return 0;
	ND(5, "rlock acquired for %d packets", ((j > end ? lim+1 : 0) + end) - j);
	ft = kring->nkr_ft;

	for (; likely(j != end); j = nm_next(j, lim)) {
		struct netmap_slot *slot = &ring->slot[j];
		char *buf;

		ft[ft_i].ft_len = slot->len;
		ft[ft_i].ft_flags = slot->flags;

		ND("flags is 0x%x", slot->flags);
		/* this slot goes into a list so initialize the link field */
		ft[ft_i].ft_next = NM_FT_NULL;
		buf = ft[ft_i].ft_buf = (slot->flags & NS_INDIRECT) ?
			(void *)(uintptr_t)slot->ptr : BDG_NMB(&na->up, slot);
		__builtin_prefetch(buf);
		++ft_i;
		if (slot->flags & NS_MOREFRAG) {
			frags++;
			continue;
		}
		if (unlikely(netmap_verbose && frags > 1))
			RD(5, "%d frags at %d", frags, ft_i - frags);
		ft[ft_i - frags].ft_frags = frags;
		frags = 1;
		if (unlikely((int)ft_i >= bridge_batch))
			ft_i = nm_bdg_flush(ft, ft_i, na, ring_nr);
	}
	if (frags > 1) {
		D("truncate incomplete fragment at %d (%d frags)", ft_i, frags);
		// ft_i > 0, ft[ft_i-1].ft_flags has NS_MOREFRAG
		ft[ft_i - 1].ft_flags &= ~NS_MOREFRAG;
		ft[ft_i - frags].ft_frags = frags - 1;
	}
	if (ft_i)
		ft_i = nm_bdg_flush(ft, ft_i, na, ring_nr);
	BDG_RUNLOCK(b);
	return j;
}


/* ----- FreeBSD if_bridge hash function ------- */

/*
 * The following hash function is adapted from "Hash Functions" by Bob Jenkins
 * ("Algorithm Alley", Dr. Dobb's Journal, September 1997).
 *
 * http://www.burtleburtle.net/bob/hash/spooky.html
 */
#define mix(a, b, c)                                                    \
do {                                                                    \
        a -= b; a -= c; a ^= (c >> 13);                                 \
        b -= c; b -= a; b ^= (a << 8);                                  \
        c -= a; c -= b; c ^= (b >> 13);                                 \
        a -= b; a -= c; a ^= (c >> 12);                                 \
        b -= c; b -= a; b ^= (a << 16);                                 \
        c -= a; c -= b; c ^= (b >> 5);                                  \
        a -= b; a -= c; a ^= (c >> 3);                                  \
        b -= c; b -= a; b ^= (a << 10);                                 \
        c -= a; c -= b; c ^= (b >> 15);                                 \
} while (/*CONSTCOND*/0)


static __inline uint32_t
nm_bridge_rthash(const uint8_t *addr)
{
        uint32_t a = 0x9e3779b9, b = 0x9e3779b9, c = 0; // hash key

        b += addr[5] << 8;
        b += addr[4];
        a += addr[3] << 24;
        a += addr[2] << 16;
        a += addr[1] << 8;
        a += addr[0];

        mix(a, b, c);
#define BRIDGE_RTHASH_MASK	(NM_BDG_HASH-1)
        return (c & BRIDGE_RTHASH_MASK);
}

#undef mix


static int
bdg_netmap_reg(struct netmap_adapter *na, int onoff)
{
	struct netmap_vp_adapter *vpna =
		(struct netmap_vp_adapter*)na;
	struct ifnet *ifp = na->ifp;

	/* the interface is already attached to the bridge,
	 * so we only need to toggle IFCAP_NETMAP.
	 */
	BDG_WLOCK(vpna->na_bdg);
	if (onoff) {
		ifp->if_capenable |= IFCAP_NETMAP;
	} else {
		ifp->if_capenable &= ~IFCAP_NETMAP;
	}
	BDG_WUNLOCK(vpna->na_bdg);
	return 0;
}


/*
 * Lookup function for a learning bridge.
 * Update the hash table with the source address,
 * and then return the destination port index, and the
 * ring in *dst_ring (at the moment, always use ring 0)
 */
u_int
netmap_bdg_learning(char *buf, u_int buf_len, uint8_t *dst_ring,
		struct netmap_vp_adapter *na)
{
	struct nm_hash_ent *ht = na->na_bdg->ht;
	uint32_t sh, dh;
	u_int dst, mysrc = na->bdg_port;
	uint64_t smac, dmac;

	if (buf_len < 14) {
		D("invalid buf length %d", buf_len);
		return NM_BDG_NOPORT;
	}
	dmac = le64toh(*(uint64_t *)(buf)) & 0xffffffffffff;
	smac = le64toh(*(uint64_t *)(buf + 4));
	smac >>= 16;

	/*
	 * The hash is somewhat expensive, there might be some
	 * worthwhile optimizations here.
	 */
	if ((buf[6] & 1) == 0) { /* valid src */
		uint8_t *s = buf+6;
		sh = nm_bridge_rthash(s); // XXX hash of source
		/* update source port forwarding entry */
		ht[sh].mac = smac;	/* XXX expire ? */
		ht[sh].ports = mysrc;
		if (netmap_verbose)
		    D("src %02x:%02x:%02x:%02x:%02x:%02x on port %d",
			s[0], s[1], s[2], s[3], s[4], s[5], mysrc);
	}
	dst = NM_BDG_BROADCAST;
	if ((buf[0] & 1) == 0) { /* unicast */
		dh = nm_bridge_rthash(buf); // XXX hash of dst
		if (ht[dh].mac == dmac) {	/* found dst */
			dst = ht[dh].ports;
		}
		/* XXX otherwise return NM_BDG_UNKNOWN ? */
	}
	*dst_ring = 0;
	return dst;
}
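
/*
 * A replacement lookup function only needs to follow the same contract
 * (a sketch: send everything to port 2, ring 0; drop runts):
 */
#if 0
static u_int
my_lookup(char *buf, u_int buf_len, uint8_t *dst_ring,
		struct netmap_vp_adapter *na)
{
	if (buf_len < 14)
		return NM_BDG_NOPORT;	/* drop */
	*dst_ring = 0;
	return 2;	/* fixed destination port */
}
#endif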


/*
 * Available space in the ring. Only used in VALE code
 * and only with is_rx = 1
 */
static inline uint32_t
nm_kr_space(struct netmap_kring *k, int is_rx)
{
	int space;

	if (is_rx) {
		int busy = k->nkr_hwlease - k->nr_hwcur;
		if (busy < 0)
			busy += k->nkr_num_slots;
		space = k->nkr_num_slots - 1 - busy;
	} else {
		/* XXX never used in this branch */
		space = k->nr_hwtail - k->nkr_hwlease;
		if (space < 0)
			space += k->nkr_num_slots;
	}
#if 0
	// sanity check
	if (k->nkr_hwlease >= k->nkr_num_slots ||
		k->nr_hwcur >= k->nkr_num_slots ||
		k->nr_hwtail >= k->nkr_num_slots ||
		busy < 0 ||
		busy >= k->nkr_num_slots) {
		D("invalid kring, cur %d tail %d lease %d lease_idx %d lim %d",
			k->nr_hwcur, k->nr_hwtail, k->nkr_hwlease,
			k->nkr_lease_idx, k->nkr_num_slots);
	}
#endif
	return space;
}


/* make a lease on the kring for N positions. return the
 * lease index
 * XXX only used in VALE code and with is_rx = 1
 */
static inline uint32_t
nm_kr_lease(struct netmap_kring *k, u_int n, int is_rx)
{
	uint32_t lim = k->nkr_num_slots - 1;
	uint32_t lease_idx = k->nkr_lease_idx;

	k->nkr_leases[lease_idx] = NR_NOSLOT;
	k->nkr_lease_idx = nm_next(lease_idx, lim);

	if (n > nm_kr_space(k, is_rx)) {
		D("invalid request for %d slots", n);
		panic("x");
	}
	/* XXX verify that there are n slots */
	k->nkr_hwlease += n;
	if (k->nkr_hwlease > lim)
		k->nkr_hwlease -= lim + 1;

	if (k->nkr_hwlease >= k->nkr_num_slots ||
		k->nr_hwcur >= k->nkr_num_slots ||
		k->nr_hwtail >= k->nkr_num_slots ||
		k->nkr_lease_idx >= k->nkr_num_slots) {
		D("invalid kring %s, cur %d tail %d lease %d lease_idx %d lim %d",
			k->na->ifp->if_xname,
			k->nr_hwcur, k->nr_hwtail, k->nkr_hwlease,
			k->nkr_lease_idx, k->nkr_num_slots);
	}
	return lease_idx;
}
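
/*
 * The lease is paired with a completion report: after the copy, each
 * thread stores its final position in nkr_leases[lease_idx], and the
 * thread whose lease starts at the current hw tail folds all completed
 * leases forward (simplified sketch of the logic in nm_bdg_flush()):
 *
 *	p[lease_idx] = j;			// report I am done
 *	if (my_start == kring->nr_hwtail) {
 *		while (lease_idx != kring->nkr_lease_idx &&
 *			p[lease_idx] != NR_NOSLOT) {
 *			j = p[lease_idx];
 *			p[lease_idx] = NR_NOSLOT;
 *			lease_idx = nm_next(lease_idx, lim);
 *		}
 *		if (j != my_start)
 *			kring->nr_hwtail = j;	// publish the new slots
 *	}
 */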

/*
 * This flush routine supports only unicast and broadcast but a large
 * number of ports, and lets us replace the learn and dispatch functions.
 */
int
nm_bdg_flush(struct nm_bdg_fwd *ft, u_int n, struct netmap_vp_adapter *na,
		u_int ring_nr)
{
	struct nm_bdg_q *dst_ents, *brddst;
	uint16_t num_dsts = 0, *dsts;
	struct nm_bridge *b = na->na_bdg;
	u_int i, j, me = na->bdg_port;

	/*
	 * The work area (pointed by ft) is followed by an array of
	 * pointers to queues, dst_ents; there are NM_BDG_MAXRINGS
	 * queues per port plus one for the broadcast traffic.
	 * Then we have an array of destination indexes.
	 */
	dst_ents = (struct nm_bdg_q *)(ft + NM_BDG_BATCH_MAX);
	dsts = (uint16_t *)(dst_ents + NM_BDG_MAXPORTS * NM_BDG_MAXRINGS + 1);

	/* first pass: find a destination for each packet in the batch */
	for (i = 0; likely(i < n); i += ft[i].ft_frags) {
		uint8_t dst_ring = ring_nr; /* default, same ring as origin */
		uint16_t dst_port, d_i;
		struct nm_bdg_q *d;
		uint8_t *buf = ft[i].ft_buf;
		u_int len = ft[i].ft_len;

		ND("slot %d frags %d", i, ft[i].ft_frags);
		/* Drop the packet if the offset is not into the first
		   fragment nor at the very beginning of the second. */
		if (unlikely(na->offset > len))
			continue;
		if (len == na->offset) {
			buf = ft[i+1].ft_buf;
			len = ft[i+1].ft_len;
		} else {
			buf += na->offset;
			len -= na->offset;
		}
		dst_port = b->nm_bdg_lookup(buf, len, &dst_ring, na);
		if (netmap_verbose > 255)
			RD(5, "slot %d port %d -> %d", i, me, dst_port);
		if (dst_port == NM_BDG_NOPORT)
			continue; /* the lookup function dropped this packet */
		else if (unlikely(dst_port > NM_BDG_MAXPORTS))
			continue;
		else if (dst_port == NM_BDG_BROADCAST)
			dst_ring = 0; /* broadcasts always go to ring 0 */
		else if (unlikely(dst_port == me ||
		    !b->bdg_ports[dst_port]))
			continue;

		/* get a position in the scratch pad */
		d_i = dst_port * NM_BDG_MAXRINGS + dst_ring;
		d = dst_ents + d_i;

		/* append the first fragment to the list */
		if (d->bq_head == NM_FT_NULL) { /* new destination */
			d->bq_head = d->bq_tail = i;
			/* remember this position to be scanned later */
			if (dst_port != NM_BDG_BROADCAST)
				dsts[num_dsts++] = d_i;
		} else {
			ft[d->bq_tail].ft_next = i;
			d->bq_tail = i;
		}
		d->bq_len += ft[i].ft_frags;
	}

	/*
	 * Broadcast traffic goes to ring 0 on all destinations.
	 * So we need to add these rings to the list of ports to scan.
	 * XXX at the moment we scan all NM_BDG_MAXPORTS ports, which is
	 * expensive. We should keep a compact list of active destinations
	 * so we could shorten this loop.
	 */
	brddst = dst_ents + NM_BDG_BROADCAST * NM_BDG_MAXRINGS;
	if (brddst->bq_head != NM_FT_NULL) {
		for (j = 0; likely(j < b->bdg_active_ports); j++) {
			uint16_t d_i;
			i = b->bdg_port_index[j];
			if (unlikely(i == me))
				continue;
			d_i = i * NM_BDG_MAXRINGS;
			if (dst_ents[d_i].bq_head == NM_FT_NULL)
				dsts[num_dsts++] = d_i;
		}
	}

	ND(5, "pass 1 done %d pkts %d dsts", n, num_dsts);
	/* second pass: scan destinations (XXX will be modular somehow) */
	for (i = 0; i < num_dsts; i++) {
		struct ifnet *dst_ifp;
		struct netmap_vp_adapter *dst_na;
		struct netmap_kring *kring;
		struct netmap_ring *ring;
		u_int dst_nr, lim, j, sent = 0, d_i, next, brd_next;
		u_int needed, howmany;
		int retry = netmap_txsync_retry;
		struct nm_bdg_q *d;
		uint32_t my_start = 0, lease_idx = 0;
		int nrings;
		int offset_mismatch;

		d_i = dsts[i];
		ND("second pass %d port %d", i, d_i);
		d = dst_ents + d_i;
		// XXX fix the division
		dst_na = b->bdg_ports[d_i/NM_BDG_MAXRINGS];
		/* protect from the lookup function returning an inactive
		 * destination port
		 */
		if (unlikely(dst_na == NULL))
			goto cleanup;
		if (dst_na->up.na_flags & NAF_SW_ONLY)
			goto cleanup;
		dst_ifp = dst_na->up.ifp;
		/*
		 * The interface may be in !netmap mode in two cases:
		 * - when na is attached but not activated yet;
		 * - when na is being deactivated but is still attached.
		 */
		if (unlikely(!(dst_ifp->if_capenable & IFCAP_NETMAP))) {
			ND("not in netmap mode!");
			goto cleanup;
		}

		offset_mismatch = (dst_na->offset != na->offset);

		/* there is at least one either unicast or broadcast packet */
		brd_next = brddst->bq_head;
		next = d->bq_head;
		/* we need to reserve this many slots. If fewer are
		 * available, some packets will be dropped.
		 * Packets may have multiple fragments, so there is a
		 * chance that we may not use all of the slots we have
		 * claimed, and we will need to handle the leftover ones
		 * when we regain the lock.
		 */
		needed = d->bq_len + brddst->bq_len;

		ND(5, "pass 2 dst %d is %x", i, d_i);
		dst_nr = d_i & (NM_BDG_MAXRINGS-1);
		nrings = dst_na->up.num_rx_rings;
		if (dst_nr >= nrings)
			dst_nr = dst_nr % nrings;
		kring = &dst_na->up.rx_rings[dst_nr];
		ring = kring->ring;
		lim = kring->nkr_num_slots - 1;

retry:

		/* reserve the buffers in the queue and an entry
		 * to report completion, and drop lock.
		 * XXX this might become a helper function.
		 */
		mtx_lock(&kring->q_lock);
		if (kring->nkr_stopped) {
			mtx_unlock(&kring->q_lock);
			goto cleanup;
		}
		if (dst_na->retry) {
			dst_na->up.nm_notify(&dst_na->up, dst_nr, NR_RX, 0);
		}
		my_start = j = kring->nkr_hwlease;
		howmany = nm_kr_space(kring, 1);
		if (needed < howmany)
			howmany = needed;
		lease_idx = nm_kr_lease(kring, howmany, 1);
		mtx_unlock(&kring->q_lock);

		/* only retry if we need more than available slots */
		if (retry && needed <= howmany)
			retry = 0;

		/* copy to the destination queue */
		while (howmany > 0) {
			struct netmap_slot *slot;
			struct nm_bdg_fwd *ft_p, *ft_end;
			u_int cnt;
			int fix_mismatch = offset_mismatch;

			/* find the queue from which we pick next packet.
			 * NM_FT_NULL is always higher than valid indexes
			 * so we never dereference it if the other list
			 * has packets (and if both are empty we never
			 * get here).
			 */
			if (next < brd_next) {
				ft_p = ft + next;
				next = ft_p->ft_next;
			} else { /* insert broadcast */
				ft_p = ft + brd_next;
				brd_next = ft_p->ft_next;
			}
			cnt = ft_p->ft_frags; // cnt > 0
			if (unlikely(cnt > howmany))
			    break; /* no more space */
			howmany -= cnt;
			if (netmap_verbose && cnt > 1)
				RD(5, "rx %d frags to %d", cnt, j);
			ft_end = ft_p + cnt;
			do {
			    char *dst, *src = ft_p->ft_buf;
			    size_t copy_len = ft_p->ft_len, dst_len = copy_len;

			    slot = &ring->slot[j];
			    dst = BDG_NMB(&dst_na->up, slot);

			    if (unlikely(fix_mismatch)) {
				    /* We are processing the first fragment
				     * and there is a mismatch between source
				     * and destination offsets. Create a zeroed
				     * header for the destination, independently
				     * of the source header length and content.
				     */
				    src += na->offset;
				    copy_len -= na->offset;
				    bzero(dst, dst_na->offset);
				    dst += dst_na->offset;
				    dst_len = dst_na->offset + copy_len;
				    /* fix the first fragment only */
				    fix_mismatch = 0;
				    /* Here it could be copy_len == dst_len == 0,
				     * and so a zero length fragment is passed.
				     */
			    }

			    ND("send [%d] %d(%d) bytes at %s:%d",
				i, (int)copy_len, (int)dst_len,
				NM_IFPNAME(dst_ifp), j);
			    /* round to a multiple of 64 */
			    copy_len = (copy_len + 63) & ~63;

			    if (ft_p->ft_flags & NS_INDIRECT) {
				if (copyin(src, dst, copy_len)) {
					// invalid user pointer, pretend len is 0
					dst_len = 0;
				}
			    } else {
				//memcpy(dst, src, copy_len);
				pkt_copy(src, dst, (int)copy_len);
			    }
			    slot->len = dst_len;
			    slot->flags = (cnt << 8) | NS_MOREFRAG;
			    j = nm_next(j, lim);
			    ft_p++;
			    sent++;
			} while (ft_p != ft_end);
			slot->flags = (cnt << 8); /* clear flag on last entry */
			/* are we done ? */
			if (next == NM_FT_NULL && brd_next == NM_FT_NULL)
				break;
		}
		{
		    /* current position */
		    uint32_t *p = kring->nkr_leases; /* shorthand */
		    uint32_t update_pos;
		    int still_locked = 1;

		    mtx_lock(&kring->q_lock);
		    if (unlikely(howmany > 0)) {
			/* we did not use all the bufs. If I am the last one
			 * I can recover the slots, otherwise I must
			 * fill them with 0 to mark empty packets.
			 */
			ND("leftover %d bufs", howmany);
			if (nm_next(lease_idx, lim) == kring->nkr_lease_idx) {
			    /* yes, I am the last one */
			    ND("roll back nkr_hwlease to %d", j);
			    kring->nkr_hwlease = j;
			} else {
			    while (howmany-- > 0) {
				ring->slot[j].len = 0;
				ring->slot[j].flags = 0;
				j = nm_next(j, lim);
			    }
			}
		    }
		    p[lease_idx] = j; /* report I am done */

		    update_pos = kring->nr_hwtail;

		    if (my_start == update_pos) {
			/* all slots before my_start have been reported,
			 * so scan subsequent leases to see if other ranges
			 * have been completed, and do a selwakeup or txsync.
			 */
			while (lease_idx != kring->nkr_lease_idx &&
				p[lease_idx] != NR_NOSLOT) {
			    j = p[lease_idx];
			    p[lease_idx] = NR_NOSLOT;
			    lease_idx = nm_next(lease_idx, lim);
			}
			/* j is the new 'write' position. j != my_start
			 * means there are new buffers to report
			 */
			if (likely(j != my_start)) {
				kring->nr_hwtail = j;
				dst_na->up.nm_notify(&dst_na->up, dst_nr, NR_RX, 0);
				still_locked = 0;
				mtx_unlock(&kring->q_lock);
				if (dst_na->retry && retry--)
					goto retry;
			}
		    }
		    if (still_locked)
			mtx_unlock(&kring->q_lock);
		}
cleanup:
		d->bq_head = d->bq_tail = NM_FT_NULL; /* cleanup */
		d->bq_len = 0;
	}
	brddst->bq_head = brddst->bq_tail = NM_FT_NULL; /* cleanup */
	brddst->bq_len = 0;
	return 0;
}


static int
netmap_vp_txsync(struct netmap_vp_adapter *na, u_int ring_nr, int flags)
{
	struct netmap_kring *kring = &na->up.tx_rings[ring_nr];
	u_int done;
	u_int const lim = kring->nkr_num_slots - 1;
	u_int const cur = kring->rcur;

	if (bridge_batch <= 0) { /* testing only */
		done = cur; // used all
		goto done;
	}
	if (bridge_batch > NM_BDG_BATCH)
		bridge_batch = NM_BDG_BATCH;

	done = nm_bdg_preflush(na, ring_nr, kring, cur);
done:
	if (done != cur)
		D("early break at %d/ %d, tail %d", done, cur, kring->nr_hwtail);
	/*
	 * packets between 'done' and 'cur' are left unsent.
	 */
	kring->nr_hwcur = done;
	kring->nr_hwtail = nm_prev(done, lim);
	nm_txsync_finalize(kring);
	if (netmap_verbose)
		D("%s ring %d flags %d", NM_IFPNAME(na->up.ifp), ring_nr, flags);
	return 0;
}


/*
 * main dispatch routine for the bridge.
 * We already know that only one thread is running this.
 * We must run nm_bdg_preflush without lock.
 */
static int
bdg_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags)
{
	struct netmap_vp_adapter *vpna = (struct netmap_vp_adapter*)na;
	return netmap_vp_txsync(vpna, ring_nr, flags);
}

static int
netmap_vp_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags)
{
	struct netmap_kring *kring = &na->rx_rings[ring_nr];
	struct netmap_ring *ring = kring->ring;
	u_int nm_i, lim = kring->nkr_num_slots - 1;
	u_int head = nm_rxsync_prologue(kring);
	int n;

	if (head > lim) {
		D("ouch dangerous reset!!!");
		n = netmap_ring_reinit(kring);
		goto done;
	}

	/* First part, import newly received packets. */
	/* actually nothing to do here, they are already in the kring */

	/* Second part, skip past packets that userspace has released. */
	nm_i = kring->nr_hwcur;
	if (nm_i != head) {
		/* consistency check, but nothing really important here */
		for (n = 0; likely(nm_i != head); n++) {
			struct netmap_slot *slot = &ring->slot[nm_i];
			void *addr = BDG_NMB(na, slot);

			if (addr == netmap_buffer_base) { /* bad buf */
				D("bad buffer index %d, ignore ?",
					slot->buf_idx);
			}
			slot->flags &= ~NS_BUF_CHANGED;
			nm_i = nm_next(nm_i, lim);
		}
		kring->nr_hwcur = head;
	}

	/* tell userspace that there are new packets */
	nm_rxsync_finalize(kring);
	n = 0;
done:
	return n;
}

/*
 * user process reading from a VALE switch.
 * Already protected against concurrent calls from userspace,
 * but we must acquire the queue's lock to protect against
 * writers on the same queue.
 */
static int
bdg_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags)
{
	struct netmap_kring *kring = &na->rx_rings[ring_nr];
	int n;

	mtx_lock(&kring->q_lock);
	n = netmap_vp_rxsync(na, ring_nr, flags);
	mtx_unlock(&kring->q_lock);
	return n;
}


static int
bdg_netmap_attach(struct nmreq *nmr, struct ifnet *ifp)
{
	struct netmap_vp_adapter *vpna;
	struct netmap_adapter *na;
	int error;

	vpna = malloc(sizeof(*vpna), M_DEVBUF, M_NOWAIT | M_ZERO);
	if (vpna == NULL)
		return ENOMEM;

	na = &vpna->up;

	na->ifp = ifp;

	/* bound checking */
	na->num_tx_rings = nmr->nr_tx_rings;
	nm_bound_var(&na->num_tx_rings, 1, 1, NM_BDG_MAXRINGS, NULL);
	nmr->nr_tx_rings = na->num_tx_rings; // write back
	na->num_rx_rings = nmr->nr_rx_rings;
	nm_bound_var(&na->num_rx_rings, 1, 1, NM_BDG_MAXRINGS, NULL);
	nmr->nr_rx_rings = na->num_rx_rings; // write back
	nm_bound_var(&nmr->nr_tx_slots, NM_BRIDGE_RINGSIZE,
			1, NM_BDG_MAXSLOTS, NULL);
	na->num_tx_desc = nmr->nr_tx_slots;
	nm_bound_var(&nmr->nr_rx_slots, NM_BRIDGE_RINGSIZE,
			1, NM_BDG_MAXSLOTS, NULL);
	na->num_rx_desc = nmr->nr_rx_slots;
	vpna->offset = 0;

	na->na_flags |= NAF_BDG_MAYSLEEP | NAF_MEM_OWNER;
	na->nm_txsync = bdg_netmap_txsync;
	na->nm_rxsync = bdg_netmap_rxsync;
	na->nm_register = bdg_netmap_reg;
	na->nm_dtor = netmap_adapter_vp_dtor;
	na->nm_krings_create = netmap_vp_krings_create;
	na->nm_krings_delete = netmap_vp_krings_delete;
	na->nm_mem = netmap_mem_private_new(NM_IFPNAME(na->ifp),
			na->num_tx_rings, na->num_tx_desc,
			na->num_rx_rings, na->num_rx_desc);
	/* other nmd fields are set in the common routine */
	error = netmap_attach_common(na);
	if (error) {
		free(vpna, M_DEVBUF);
		return error;
	}
	return 0;
}


static void
netmap_bwrap_dtor(struct netmap_adapter *na)
{
	struct netmap_bwrap_adapter *bna = (struct netmap_bwrap_adapter*)na;
	struct netmap_adapter *hwna = bna->hwna;
	struct nm_bridge *b = bna->up.na_bdg,
		*bh = bna->host.na_bdg;
	struct ifnet *ifp = na->ifp;

	ND("na %p", na);

	if (b) {
		netmap_bdg_detach_common(b, bna->up.bdg_port,
			(bh ? bna->host.bdg_port : -1));
	}

	hwna->na_private = NULL;
	netmap_adapter_put(hwna);

	bzero(ifp, sizeof(*ifp));
	free(ifp, M_DEVBUF);
	na->ifp = NULL;
}
1686 
1687 
1688 /*
1689  * Intr callback for NICs connected to a bridge.
1690  * Simply ignore tx interrupts (maybe we could try to recover space ?)
1691  * and pass received packets from nic to the bridge.
1692  *
1693  * XXX TODO check locking: this is called from the interrupt
1694  * handler so we should make sure that the interface is not
1695  * disconnected while passing down an interrupt.
1696  *
1697  * Note, no user process can access this NIC or the host stack.
1698  * The only part of the ring that is significant are the slots,
1699  * and head/cur/tail are set from the kring as needed
1700  * (part as a receive ring, part as a transmit ring).
1701  *
1702  * callback that overwrites the hwna notify callback.
1703  * Packets come from the outside or from the host stack and are put on an hwna rx ring.
1704  * The bridge wrapper then sends the packets through the bridge.
1705  */
1706 static int
1707 netmap_bwrap_intr_notify(struct netmap_adapter *na, u_int ring_nr, enum txrx tx, int flags)
1708 {
1709 	struct ifnet *ifp = na->ifp;
1710 	struct netmap_bwrap_adapter *bna = na->na_private;
1711 	struct netmap_vp_adapter *hostna = &bna->host;
1712 	struct netmap_kring *kring, *bkring;
1713 	struct netmap_ring *ring;
1714 	int is_host_ring = ring_nr == na->num_rx_rings;
1715 	struct netmap_vp_adapter *vpna = &bna->up;
1716 	int error = 0;
1717 
1718 	if (netmap_verbose)
1719 	    D("%s %s%d 0x%x", NM_IFPNAME(ifp),
1720 		(tx == NR_TX ? "TX" : "RX"), ring_nr, flags);
1721 
1722 	if (flags & NAF_DISABLE_NOTIFY) {
1723 		kring = tx == NR_TX ? na->tx_rings : na->rx_rings;
1724 		bkring = tx == NR_TX ? vpna->up.rx_rings : vpna->up.tx_rings;
1725 		if (kring[ring_nr].nkr_stopped)
1726 			netmap_disable_ring(&bkring[ring_nr]);
1727 		else
1728 			bkring[ring_nr].nkr_stopped = 0;
1729 		return 0;
1730 	}
1731 
1732 	if (ifp == NULL || !(ifp->if_capenable & IFCAP_NETMAP))
1733 		return 0;
1734 
1735 	/* we only care about receive interrupts */
1736 	if (tx == NR_TX)
1737 		return 0;
1738 
1739 	kring = &na->rx_rings[ring_nr];
1740 	ring = kring->ring;
1741 
1742 	/* make sure the ring is not disabled */
1743 	if (nm_kr_tryget(kring))
1744 		return 0;
1745 
1746 	if (is_host_ring && hostna->na_bdg == NULL) {
1747 		error = bna->save_notify(na, ring_nr, tx, flags);
1748 		goto put_out;
1749 	}
1750 
1751 	/* Here we expect ring->head = ring->cur = ring->tail
1752 	 * because everything has been released from the previous round.
1753 	 * However the ring is shared and we might have info from
1754 	 * the wrong side (the tx ring). Hence we overwrite with
1755 	 * the info from the rx kring.
1756 	 */
1757 	if (netmap_verbose)
1758 	    D("%s head %d cur %d tail %d (kring %d %d %d)",  NM_IFPNAME(ifp),
1759 		ring->head, ring->cur, ring->tail,
1760 		kring->rhead, kring->rcur, kring->rtail);
1761 
1762 	ring->head = kring->rhead;
1763 	ring->cur = kring->rcur;
1764 	ring->tail = kring->rtail;
1765 
1766 	/* simulate a user wakeup on the rx ring */
1767 	if (is_host_ring) {
1768 		netmap_rxsync_from_host(na, NULL, NULL);
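		/* host-stack packets enter the bridge through the
		 * host port, which has a single ring pair (ring 0).
		 */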
1769 		vpna = hostna;
1770 		ring_nr = 0;
1771 	} else {
1772 		/* fetch packets that have arrived.
1773 		 * XXX maybe do this in a loop ?
1774 		 */
1775 		error = na->nm_rxsync(na, ring_nr, 0);
1776 		if (error)
1777 			goto put_out;
1778 	}
1779 	if (kring->nr_hwcur == kring->nr_hwtail) {
1780 		if (netmap_verbose)
1781 			D("how strange, interrupt with no packets on %s", NM_IFPNAME(ifp));
1782 		goto put_out;
1783 	}
1784 
1785 	/* new packets are ring->cur to ring->tail, and the bkring
1786 	 * had hwcur == ring->cur. So advance ring->cur to ring->tail
1787 	 * to push all packets out.
1788 	 */
1789 	ring->head = ring->cur = ring->tail;
1790 
1791 	/* also set tail to what the bwrap expects */
1792 	bkring = &vpna->up.tx_rings[ring_nr];
1793 	ring->tail = bkring->nr_hwtail; // rtail too ?
1794 
1795 	/* pass packets to the switch */
1796 	nm_txsync_prologue(bkring); // XXX error checking ?
1797 	netmap_vp_txsync(vpna, ring_nr, flags);
1798 
1799 	/* mark all buffers as released on this ring */
1800 	ring->head = ring->cur = kring->nr_hwtail;
1801 	ring->tail = kring->rtail;
1802 	/* another call to actually release the buffers */
1803 	if (!is_host_ring) {
1804 		error = na->nm_rxsync(na, ring_nr, 0);
1805 	} else {
1806 		/* mark all packets as released, as in the
1807 		 * second part of netmap_rxsync_from_host()
1808 		 */
1809 		kring->nr_hwcur = kring->nr_hwtail;
1810 		nm_rxsync_finalize(kring);
1811 	}
1812 
1813 put_out:
1814 	nm_kr_put(kring);
1815 	return error;
1816 }
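/*
 * Worked example for the rx pass above (hypothetical numbers, assuming
 * the previous round left head = cur = tail = 10): nm_rxsync() finds
 * 40 new packets and moves tail to 50, so slots 10..49 are ready to
 * be forwarded.  Setting head = cur = 50 lets the bridge port (whose
 * tx kring still has nr_hwcur == 10) forward all 40 slots in
 * netmap_vp_txsync(); the final nm_rxsync(), issued with
 * head = cur = nr_hwtail, then returns those slots to the NIC.
 */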
1817 
1818 
1819 static int
1820 netmap_bwrap_register(struct netmap_adapter *na, int onoff)
1821 {
1822 	struct netmap_bwrap_adapter *bna =
1823 		(struct netmap_bwrap_adapter *)na;
1824 	struct netmap_adapter *hwna = bna->hwna;
1825 	struct netmap_vp_adapter *hostna = &bna->host;
1826 	int error;
1827 
1828 	ND("%s %s", NM_IFPNAME(na->ifp), onoff ? "on" : "off");
1829 
1830 	if (onoff) {
1831 		int i;
1832 
1833 		hwna->na_lut = na->na_lut;
1834 		hwna->na_lut_objtotal = na->na_lut_objtotal;
1835 
1836 		if (hostna->na_bdg) {
1837 			hostna->up.na_lut = na->na_lut;
1838 			hostna->up.na_lut_objtotal = na->na_lut_objtotal;
1839 		}
1840 
1841 		/* cross-link the netmap rings
1842 	 * The original number of rings comes from hwna:
1843 	 * the rx rings on one side match the tx rings on the other.
1844 		 */
1845 		for (i = 0; i <= na->num_rx_rings; i++) {
1846 			hwna->tx_rings[i].nkr_num_slots = na->rx_rings[i].nkr_num_slots;
1847 			hwna->tx_rings[i].ring = na->rx_rings[i].ring;
1848 		}
1849 		for (i = 0; i <= na->num_tx_rings; i++) {
1850 			hwna->rx_rings[i].nkr_num_slots = na->tx_rings[i].nkr_num_slots;
1851 			hwna->rx_rings[i].ring = na->tx_rings[i].ring;
1852 		}
1853 	}
1854 
1855 	if (hwna->ifp) {
1856 		error = hwna->nm_register(hwna, onoff);
1857 		if (error)
1858 			return error;
1859 	}
1860 
1861 	bdg_netmap_reg(na, onoff);
1862 
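	/* while the port is open, intercept the hwna notify callback:
	 * rx interrupts on the NIC are then delivered to
	 * netmap_bwrap_intr_notify(), which forwards the packets into
	 * the bridge.
	 */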
1863 	if (onoff) {
1864 		bna->save_notify = hwna->nm_notify;
1865 		hwna->nm_notify = netmap_bwrap_intr_notify;
1866 	} else {
1867 		hwna->nm_notify = bna->save_notify;
1868 		hwna->na_lut = NULL;
1869 		hwna->na_lut_objtotal = 0;
1870 	}
1871 
1872 	return 0;
1873 }
1874 
1875 
1876 static int
1877 netmap_bwrap_config(struct netmap_adapter *na, u_int *txr, u_int *txd,
1878 				    u_int *rxr, u_int *rxd)
1879 {
1880 	struct netmap_bwrap_adapter *bna =
1881 		(struct netmap_bwrap_adapter *)na;
1882 	struct netmap_adapter *hwna = bna->hwna;
1883 
1884 	/* forward the request */
1885 	netmap_update_config(hwna);
1886 	/* swap the results */
1887 	*txr = hwna->num_rx_rings;
1888 	*txd = hwna->num_rx_desc;
1889 	*rxr = hwna->num_tx_rings;
1890 	*rxd = hwna->num_tx_desc;
1891 
1892 	return 0;
1893 }
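/*
 * Example for netmap_bwrap_config() above (hypothetical NIC): a hwna
 * with 4 tx and 2 rx rings is reported as a bwrap port with 2 tx and
 * 4 rx rings, since frames received by the NIC are transmitted into
 * the switch and vice versa.
 */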
1894 
1895 
1896 static int
1897 netmap_bwrap_krings_create(struct netmap_adapter *na)
1898 {
1899 	struct netmap_bwrap_adapter *bna =
1900 		(struct netmap_bwrap_adapter *)na;
1901 	struct netmap_adapter *hwna = bna->hwna;
1902 	struct netmap_adapter *hostna = &bna->host.up;
1903 	int error;
1904 
1905 	ND("%s", NM_IFPNAME(na->ifp));
1906 
1907 	error = netmap_vp_krings_create(na);
1908 	if (error)
1909 		return error;
1910 
1911 	error = hwna->nm_krings_create(hwna);
1912 	if (error) {
1913 		netmap_vp_krings_delete(na);
1914 		return error;
1915 	}
1916 
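	/* the krings allocated above include an extra pair for the
	 * host stack; let the host adapter point directly at them.
	 */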
1917 	hostna->tx_rings = na->tx_rings + na->num_tx_rings;
1918 	hostna->rx_rings = na->rx_rings + na->num_rx_rings;
1919 
1920 	return 0;
1921 }
1922 
1923 
1924 static void
1925 netmap_bwrap_krings_delete(struct netmap_adapter *na)
1926 {
1927 	struct netmap_bwrap_adapter *bna =
1928 		(struct netmap_bwrap_adapter *)na;
1929 	struct netmap_adapter *hwna = bna->hwna;
1930 
1931 	ND("%s", NM_IFPNAME(na->ifp));
1932 
1933 	hwna->nm_krings_delete(hwna);
1934 	netmap_vp_krings_delete(na);
1935 }
1936 
1937 
1938 /* notify method for the bridge-->hwna direction */
1939 static int
1940 netmap_bwrap_notify(struct netmap_adapter *na, u_int ring_n, enum txrx tx, int flags)
1941 {
1942 	struct netmap_bwrap_adapter *bna =
1943 		(struct netmap_bwrap_adapter *)na;
1944 	struct netmap_adapter *hwna = bna->hwna;
1945 	struct netmap_kring *kring, *hw_kring;
1946 	struct netmap_ring *ring;
1947 	u_int lim;
1948 	int error = 0;
1949 
1950 	if (tx == NR_TX)
1951 		return EINVAL;
1952 
1953 	kring = &na->rx_rings[ring_n];
1954 	hw_kring = &hwna->tx_rings[ring_n];
1955 	ring = kring->ring;
1956 	lim = kring->nkr_num_slots - 1;
1957 
1958 	if (hwna->ifp == NULL || !(hwna->ifp->if_capenable & IFCAP_NETMAP))
1959 		return 0;
1960 	/* first step: simulate a user wakeup on the rx ring */
1961 	netmap_vp_rxsync(na, ring_n, flags);
1962 	ND("%s[%d] PRE rx(c%3d t%3d l%3d) ring(h%3d c%3d t%3d) tx(c%3d ht%3d t%3d)",
1963 		NM_IFPNAME(na->ifp), ring_n,
1964 		kring->nr_hwcur, kring->nr_hwtail, kring->nkr_hwlease,
1965 		ring->head, ring->cur, ring->tail,
1966 		hw_kring->nr_hwcur, hw_kring->nr_hwtail, hw_kring->rtail);
1967 	/* second step: the simulated user consumes all new packets */
1968 	ring->head = ring->cur = ring->tail;
1969 
1970 	/* third step: the new packets are sent on the tx ring
1971 	 * (which is actually the same ring)
1972 	 */
1973 	/* set tail to what the hw expects */
1974 	ring->tail = hw_kring->rtail;
1975 	if (ring_n == na->num_rx_rings) {
1976 		netmap_txsync_to_host(hwna);
1977 	} else {
1978 		nm_txsync_prologue(&hwna->tx_rings[ring_n]); // XXX error checking ?
1979 		error = hwna->nm_txsync(hwna, ring_n, flags);
1980 	}
1981 
1982 	/* fourth step: now we are back on the rx ring */
1983 	/* claim ownership on all hw owned bufs */
1984 	ring->head = nm_next(ring->tail, lim); /* skip past reserved slot */
1985 	ring->tail = kring->rtail; /* restore saved value of tail, for safety */
1986 
1987 	/* fifth step: the user goes to sleep again, causing another rxsync */
1988 	netmap_vp_rxsync(na, ring_n, flags);
1989 	ND("%s[%d] PST rx(c%3d t%3d l%3d) ring(h%3d c%3d t%3d) tx(c%3d ht%3d t%3d)",
1990 		NM_IFPNAME(na->ifp), ring_n,
1991 		kring->nr_hwcur, kring->nr_hwtail, kring->nkr_hwlease,
1992 		ring->head, ring->cur, ring->tail,
1993 		hw_kring->nr_hwcur, hw_kring->nr_hwtail, hw_kring->rtail);
1994 
1995 	return error;
1996 }
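/*
 * The five steps above are an instance of the "simulated user" pattern
 * used throughout this file.  A minimal sketch of the pattern, with
 * hypothetical names (rxsync/txsync/rx_side/tx_side/rx_kring/tx_kring
 * stand for the callbacks and krings of the two coupled sides):
 */
#if 0
	rxsync(rx_side);			/* 1. collect new packets */
	ring->head = ring->cur = ring->tail;	/* 2. user consumes them all */
	ring->tail = tx_kring->rtail;		/* 3. expose them to the tx side */
	txsync(tx_side);			/*    and flush them */
	ring->head = nm_next(ring->tail, lim);	/* 4. reclaim the buffers */
	ring->tail = rx_kring->rtail;
	rxsync(rx_side);			/* 5. complete the release */
#endif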
1997 
1998 
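/* notify method for the host rings of the bwrap: forward the event to
 * netmap_bwrap_notify() above, using the index of the port's host rx
 * ring (num_rx_rings).
 */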
1999 static int
2000 netmap_bwrap_host_notify(struct netmap_adapter *na, u_int ring_n, enum txrx tx, int flags)
2001 {
2002 	struct netmap_bwrap_adapter *bna = na->na_private;
2003 	struct netmap_adapter *port_na = &bna->up.up;
2004 	if (tx == NR_TX || ring_n != 0)
2005 		return EINVAL;
2006 	return netmap_bwrap_notify(port_na, port_na->num_rx_rings, NR_RX, flags);
2007 }
2008 
2009 
2010 /* attach a bridge wrapper to the 'real' device */
2011 static int
2012 netmap_bwrap_attach(struct ifnet *fake, struct ifnet *real)
2013 {
2014 	struct netmap_bwrap_adapter *bna;
2015 	struct netmap_adapter *na;
2016 	struct netmap_adapter *hwna = NA(real);
2017 	struct netmap_adapter *hostna;
2018 	int error;
2019 
2020 
2021 	bna = malloc(sizeof(*bna), M_DEVBUF, M_NOWAIT | M_ZERO);
2022 	if (bna == NULL)
2023 		return ENOMEM;
2024 
2025 	na = &bna->up.up;
2026 	na->ifp = fake;
2027 	/* fill the ring data for the bwrap adapter with rx/tx meanings
2028 	 * swapped. The real cross-linking will be done during register,
2029 	 * when all the krings have been created.
2030 	 */
2031 	na->num_rx_rings = hwna->num_tx_rings;
2032 	na->num_tx_rings = hwna->num_rx_rings;
2033 	na->num_tx_desc = hwna->num_rx_desc;
2034 	na->num_rx_desc = hwna->num_tx_desc;
2035 	na->nm_dtor = netmap_bwrap_dtor;
2036 	na->nm_register = netmap_bwrap_register;
2037 	// na->nm_txsync = netmap_bwrap_txsync;
2038 	// na->nm_rxsync = netmap_bwrap_rxsync;
2039 	na->nm_config = netmap_bwrap_config;
2040 	na->nm_krings_create = netmap_bwrap_krings_create;
2041 	na->nm_krings_delete = netmap_bwrap_krings_delete;
2042 	na->nm_notify = netmap_bwrap_notify;
2043 	na->nm_mem = hwna->nm_mem;
2044 	na->na_private = na; /* prevent NIOCREGIF */
2045 	bna->up.retry = 1; /* XXX maybe this should depend on the hwna */
2046 
2047 	bna->hwna = hwna;
2048 	netmap_adapter_get(hwna);
2049 	hwna->na_private = bna; /* weak reference */
2050 
2051 	hostna = &bna->host.up;
2052 	hostna->ifp = hwna->ifp;
2053 	hostna->num_tx_rings = 1;
2054 	hostna->num_tx_desc = hwna->num_rx_desc;
2055 	hostna->num_rx_rings = 1;
2056 	hostna->num_rx_desc = hwna->num_tx_desc;
2057 	// hostna->nm_txsync = netmap_bwrap_host_txsync;
2058 	// hostna->nm_rxsync = netmap_bwrap_host_rxsync;
2059 	hostna->nm_notify = netmap_bwrap_host_notify;
2060 	hostna->nm_mem = na->nm_mem;
2061 	hostna->na_private = bna;
2062 
2063 	ND("%s<->%s txr %d txd %d rxr %d rxd %d",
2064 		fake->if_xname, real->if_xname,
2065 		na->num_tx_rings, na->num_tx_desc,
2066 		na->num_rx_rings, na->num_rx_desc);
2067 
2068 	error = netmap_attach_common(na);
2069 	if (error) {
2070 		netmap_adapter_put(hwna);
2071 		free(bna, M_DEVBUF);
2072 		return error;
2073 	}
2074 	return 0;
2075 }
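/*
 * Usage sketch for netmap_bwrap_attach() (hypothetical caller, not
 * part of this file): the bridge code pairs the NIC with a zeroed
 * fake ifnet representing the bridge port; the fake ifnet is later
 * released by netmap_bwrap_dtor() above.
 */
#if 0
	struct ifnet *fake;
	int error;

	fake = malloc(sizeof(*fake), M_DEVBUF, M_NOWAIT | M_ZERO);
	if (fake == NULL)
		return ENOMEM;
	error = netmap_bwrap_attach(fake, real);	/* real is the NIC's ifnet */
	if (error)
		free(fake, M_DEVBUF);
#endif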
2076 
2077 
2078 void
2079 netmap_init_bridges(void)
2080 {
2081 	int i;
2082 	bzero(nm_bridges, sizeof(struct nm_bridge) * NM_BRIDGES); /* safety */
2083 	for (i = 0; i < NM_BRIDGES; i++)
2084 		BDG_RWINIT(&nm_bridges[i]);
2085 }
2086 #endif /* WITH_VALE */
2087