xref: /freebsd/sys/dev/netmap/netmap_vale.c (revision b17b15d8ea55f8cb4c5a3bdfae755c545d10aea2)
1 /*
2  * Copyright (C) 2013-2014 Universita` di Pisa. All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions
6  * are met:
7  *   1. Redistributions of source code must retain the above copyright
8  *      notice, this list of conditions and the following disclaimer.
9  *   2. Redistributions in binary form must reproduce the above copyright
10  *      notice, this list of conditions and the following disclaimer in the
11  *      documentation and/or other materials provided with the distribution.
12  *
13  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
14  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
16  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
17  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
18  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
19  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
20  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
21  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
22  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
23  * SUCH DAMAGE.
24  */
25 
26 
27 /*
28  * This module implements the VALE switch for netmap
29 
30 --- VALE SWITCH ---
31 
32 NMG_LOCK() serializes all modifications to switches and ports.
33 A switch cannot be deleted until all ports are gone.
34 
35 For each switch, an SX lock (RWlock on linux) protects
36 deletion of ports. When a port is added or deleted, the
37 lock is acquired in exclusive mode (after holding NMG_LOCK).
38 When forwarding, the lock is acquired in shared mode (without NMG_LOCK).
39 The lock is held throughout the entire forwarding cycle,
40 during which the thread may incur a page fault.
41 Hence it is important that sleepable shared locks are used.
42 
43 On the rx ring, the per-port lock is grabbed initially to reserve
44 a number of slots in the ring, then the lock is released,
45 packets are copied from source to destination, and then
46 the lock is acquired again and the receive ring is updated.
47 (A similar thing is done on the tx ring for NIC and host stack
48 ports attached to the switch)
49 
50  */
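
/*
 * A minimal sketch of the forwarding-side discipline described above,
 * modeled on what nm_bdg_flush() below actually does (error handling,
 * retries and the leftover-slot logic are omitted):
 *
 *	BDG_RLOCK(b);				// shared, sleepable lock
 *	mtx_lock(&kring->q_lock);
 *	j = kring->nkr_hwlease;			// start of our reservation
 *	howmany = nm_kr_space(kring, 1);	// slots we may claim
 *	lease_idx = nm_kr_lease(kring, howmany, 1);
 *	mtx_unlock(&kring->q_lock);
 *	// ... copy packets into the reserved slots, may page fault ...
 *	mtx_lock(&kring->q_lock);
 *	kring->nkr_leases[lease_idx] = j;	// j is now past our last slot
 *	// ... advance nr_hwtail over completed leases and notify ...
 *	mtx_unlock(&kring->q_lock);
 *	BDG_RUNLOCK(b);
 */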
51 
52 /*
53  * OS-specific code that is used only within this file.
54  * Other OS-specific code that must be accessed by drivers
55  * is present in netmap_kern.h
56  */
57 
58 #if defined(__FreeBSD__)
59 #include <sys/cdefs.h> /* prerequisite */
60 __FBSDID("$FreeBSD$");
61 
62 #include <sys/types.h>
63 #include <sys/errno.h>
64 #include <sys/param.h>	/* defines used in kernel.h */
65 #include <sys/kernel.h>	/* types used in module initialization */
66 #include <sys/conf.h>	/* cdevsw struct, UID, GID */
67 #include <sys/sockio.h>
68 #include <sys/socketvar.h>	/* struct socket */
69 #include <sys/malloc.h>
70 #include <sys/poll.h>
71 #include <sys/rwlock.h>
72 #include <sys/socket.h> /* sockaddrs */
73 #include <sys/selinfo.h>
74 #include <sys/sysctl.h>
75 #include <net/if.h>
76 #include <net/if_var.h>
77 #include <net/bpf.h>		/* BIOCIMMEDIATE */
78 #include <machine/bus.h>	/* bus_dmamap_* */
79 #include <sys/endian.h>
80 #include <sys/refcount.h>
81 
82 
83 #define BDG_RWLOCK_T		struct rwlock
84 
85 #define	BDG_RWINIT(b)		\
86 	rw_init_flags(&(b)->bdg_lock, "bdg lock", RW_NOWITNESS)
87 #define BDG_WLOCK(b)		rw_wlock(&(b)->bdg_lock)
88 #define BDG_WUNLOCK(b)		rw_wunlock(&(b)->bdg_lock)
89 #define BDG_RLOCK(b)		rw_rlock(&(b)->bdg_lock)
90 #define BDG_RTRYLOCK(b)		rw_try_rlock(&(b)->bdg_lock)
91 #define BDG_RUNLOCK(b)		rw_runlock(&(b)->bdg_lock)
92 #define BDG_RWDESTROY(b)	rw_destroy(&(b)->bdg_lock)
93 
94 
95 #elif defined(linux)
96 
97 #include "bsd_glue.h"
98 
99 #elif defined(__APPLE__)
100 
101 #warning OSX support is only partial
102 #include "osx_glue.h"
103 
104 #else
105 
106 #error	Unsupported platform
107 
108 #endif /* unsupported */
109 
110 /*
111  * common headers
112  */
113 
114 #include <net/netmap.h>
115 #include <dev/netmap/netmap_kern.h>
116 #include <dev/netmap/netmap_mem2.h>
117 
118 #ifdef WITH_VALE
119 
120 /*
121  * system parameters (most of them in netmap_kern.h)
122  * NM_NAME	prefix for switch port names, default "vale"
123  * NM_BDG_MAXPORTS	number of ports
124  * NM_BRIDGES	max number of switches in the system.
125  *	XXX should become a sysctl or tunable
126  *
127  * Switch ports are named valeX:Y where X is the switch name and Y
128  * is the port. If Y matches a physical interface name, the port is
129  * connected to a physical device.
130  *
131  * Unlike physical interfaces, switch ports use their own memory region
132  * for rings and buffers.
133  * The virtual interfaces use per-queue locks instead of the core lock.
134  * In the tx loop, we aggregate traffic in batches to make all operations
135  * faster. The batch size is bridge_batch.
136  */
137 #define NM_BDG_MAXRINGS		16	/* XXX unclear how many. */
138 #define NM_BDG_MAXSLOTS		4096	/* XXX same as above */
139 #define NM_BRIDGE_RINGSIZE	1024	/* in the device */
140 #define NM_BDG_HASH		1024	/* forwarding table entries */
141 #define NM_BDG_BATCH		1024	/* entries in the forwarding buffer */
142 #define NM_MULTISEG		64	/* max size of a chain of bufs */
143 /* actual size of the tables */
144 #define NM_BDG_BATCH_MAX	(NM_BDG_BATCH + NM_MULTISEG)
145 /* NM_FT_NULL terminates a list of slots in the ft */
146 #define NM_FT_NULL		NM_BDG_BATCH_MAX
147 #define	NM_BRIDGES		8	/* number of bridges */
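
/*
 * With the values above, NM_BDG_BATCH_MAX = 1024 + 64 = 1088 entries,
 * and NM_FT_NULL = 1088 is one past the highest valid index, so it can
 * never be mistaken for a real slot in the forwarding table.
 */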
148 
149 
150 /*
151  * bridge_batch is set via sysctl to the max batch size to be
152  * used in the bridge. The actual value may be larger as the
153  * last packet in the block may overflow the size.
154  */
155 int bridge_batch = NM_BDG_BATCH; /* bridge batch size */
156 SYSCTL_DECL(_dev_netmap);
157 SYSCTL_INT(_dev_netmap, OID_AUTO, bridge_batch, CTLFLAG_RW, &bridge_batch, 0, "");
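
/*
 * The knob appears under the dev.netmap sysctl tree, so on FreeBSD it
 * can be tuned at runtime with, e.g. (illustrative value):
 *
 *	sysctl dev.netmap.bridge_batch=512
 */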
158 
159 
160 static int bdg_netmap_attach(struct nmreq *nmr, struct ifnet *ifp);
161 static int bdg_netmap_reg(struct netmap_adapter *na, int onoff);
162 static int netmap_bwrap_attach(struct ifnet *, struct ifnet *);
163 static int netmap_bwrap_register(struct netmap_adapter *, int onoff);
164 int kern_netmap_regif(struct nmreq *nmr);
165 
166 /*
167  * Each transmit queue accumulates a batch of packets into
168  * a structure before forwarding. Packets to the same
169  * destination are put in a list using ft_next as a link field.
170  * ft_frags and ft_next are valid only on the first fragment.
171  */
172 struct nm_bdg_fwd {	/* forwarding entry for a bridge */
173 	void *ft_buf;		/* netmap or indirect buffer */
174 	uint8_t ft_frags;	/* how many fragments (only on 1st frag) */
175 	uint8_t _ft_port;	/* dst port (unused) */
176 	uint16_t ft_flags;	/* flags, e.g. indirect */
177 	uint16_t ft_len;	/* src fragment len */
178 	uint16_t ft_next;	/* next packet to same destination */
179 };
180 
181 /*
182  * For each output interface, nm_bdg_q is used to construct a list.
183  * bq_len is the number of output buffers (we can have coalescing
184  * during the copy).
185  */
186 struct nm_bdg_q {
187 	uint16_t bq_head;
188 	uint16_t bq_tail;
189 	uint32_t bq_len;	/* number of buffers */
190 };
191 
192 /* XXX revise this */
193 struct nm_hash_ent {
194 	uint64_t	mac;	/* the top 2 bytes are the epoch */
195 	uint64_t	ports;
196 };
197 
198 /*
199  * nm_bridge is a descriptor for a VALE switch.
200  * Interfaces for a bridge are all in bdg_ports[].
201  * The array has fixed size, an empty entry does not terminate
202  * the search, but lookups only occur on attach/detach so we
203  * don't mind if they are slow.
204  *
205  * The bridge is non blocking on the transmit ports: excess
206  * packets are dropped if there is no room on the output port.
207  *
208  * bdg_lock protects accesses to the bdg_ports array.
209  * This is a rw lock (or equivalent).
210  */
211 struct nm_bridge {
212 	/* XXX what is the proper alignment/layout ? */
213 	BDG_RWLOCK_T	bdg_lock;	/* protects bdg_ports */
214 	int		bdg_namelen;
215 	uint32_t	bdg_active_ports; /* 0 means free */
216 	char		bdg_basename[IFNAMSIZ];
217 
218 	/* Indexes of active ports (up to active_ports)
219 	 * and all other remaining ports.
220 	 */
221 	uint8_t		bdg_port_index[NM_BDG_MAXPORTS];
222 
223 	struct netmap_vp_adapter *bdg_ports[NM_BDG_MAXPORTS];
224 
225 
226 	/*
227 	 * The function to decide the destination port.
228 	 * It returns the index of the destination port,
229 	 * NM_BDG_BROADCAST to broadcast this packet, or NM_BDG_NOPORT
230 	 * to drop it.  ring_nr is the source ring index, and the
231 	 * function may overwrite this value to forward this packet to a
232 	 * different ring index.
233 	 * This function must be set by netmap_bdgctl().
234 	 */
235 	bdg_lookup_fn_t nm_bdg_lookup;
236 
237 	/* the forwarding table, MAC+ports.
238 	 * XXX should be changed to an argument to be passed to
239 	 * the lookup function, and allocated on attach
240 	 */
241 	struct nm_hash_ent ht[NM_BDG_HASH];
242 };
243 
244 
245 /*
246  * XXX in principle nm_bridges could be created dynamically
247  * Right now we have a static array and deletions are protected
248  * by an exclusive lock.
249  */
250 struct nm_bridge nm_bridges[NM_BRIDGES];
251 
252 
253 /*
254  * this is a slightly optimized copy routine which rounds
255  * to multiple of 64 bytes and is often faster than dealing
256  * with other odd sizes. We assume there is enough room
257  * in the source and destination buffers.
258  *
259  * XXX only for multiples of 64 bytes, non overlapped.
260  */
261 static inline void
262 pkt_copy(void *_src, void *_dst, int l)
263 {
264         uint64_t *src = _src;
265         uint64_t *dst = _dst;
266         if (unlikely(l >= 1024)) {
267                 memcpy(dst, src, l);
268                 return;
269         }
270         for (; likely(l > 0); l-=64) {
271                 *dst++ = *src++;
272                 *dst++ = *src++;
273                 *dst++ = *src++;
274                 *dst++ = *src++;
275                 *dst++ = *src++;
276                 *dst++ = *src++;
277                 *dst++ = *src++;
278                 *dst++ = *src++;
279         }
280 }
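
/*
 * Callers round the length up to a multiple of 64 themselves, as
 * nm_bdg_flush() below does before copying into a destination slot:
 *
 *	copy_len = (copy_len + 63) & ~63;	// e.g. a 60-byte frame -> 64
 *	pkt_copy(src, dst, (int)copy_len);
 *
 * hence both buffers must have room for the rounded-up size.
 */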
281 
282 
283 /*
284  * locate a bridge among the existing ones.
285  * MUST BE CALLED WITH NMG_LOCK()
286  *
287  * a ':' terminates the bridge name (e.g. "vale0" in "vale0:eth0"); otherwise, just NM_NAME.
288  * We assume that this is called with a name of at least NM_NAME chars.
289  */
290 static struct nm_bridge *
291 nm_find_bridge(const char *name, int create)
292 {
293 	int i, l, namelen;
294 	struct nm_bridge *b = NULL;
295 
296 	NMG_LOCK_ASSERT();
297 
298 	namelen = strlen(NM_NAME);	/* base length */
299 	l = name ? strlen(name) : 0;		/* actual length */
300 	if (l < namelen) {
301 		D("invalid bridge name %s", name ? name : "(null)");
302 		return NULL;
303 	}
304 	for (i = namelen + 1; i < l; i++) {
305 		if (name[i] == ':') {
306 			namelen = i;
307 			break;
308 		}
309 	}
310 	if (namelen >= IFNAMSIZ)
311 		namelen = IFNAMSIZ;
312 	ND("--- prefix is '%.*s' ---", namelen, name);
313 
314 	/* lookup the name, remember empty slot if there is one */
315 	for (i = 0; i < NM_BRIDGES; i++) {
316 		struct nm_bridge *x = nm_bridges + i;
317 
318 		if (x->bdg_active_ports == 0) {
319 			if (create && b == NULL)
320 				b = x;	/* record empty slot */
321 		} else if (x->bdg_namelen != namelen) {
322 			continue;
323 		} else if (strncmp(name, x->bdg_basename, namelen) == 0) {
324 			ND("found '%.*s' at %d", namelen, name, i);
325 			b = x;
326 			break;
327 		}
328 	}
329 	if (i == NM_BRIDGES && b) { /* name not found, can create entry */
330 		/* initialize the bridge */
331 		strncpy(b->bdg_basename, name, namelen);
332 		ND("create new bridge %s with ports %d", b->bdg_basename,
333 			b->bdg_active_ports);
334 		b->bdg_namelen = namelen;
335 		b->bdg_active_ports = 0;
336 		for (i = 0; i < NM_BDG_MAXPORTS; i++)
337 			b->bdg_port_index[i] = i;
338 		/* set the default function */
339 		b->nm_bdg_lookup = netmap_bdg_learning;
340 		/* reset the MAC address table */
341 		bzero(b->ht, sizeof(struct nm_hash_ent) * NM_BDG_HASH);
342 	}
343 	return b;
344 }
345 
346 
347 /*
348  * Free the forwarding tables for rings attached to switch ports.
349  */
350 static void
351 nm_free_bdgfwd(struct netmap_adapter *na)
352 {
353 	int nrings, i;
354 	struct netmap_kring *kring;
355 
356 	NMG_LOCK_ASSERT();
357 	nrings = na->num_tx_rings;
358 	kring = na->tx_rings;
359 	for (i = 0; i < nrings; i++) {
360 		if (kring[i].nkr_ft) {
361 			free(kring[i].nkr_ft, M_DEVBUF);
362 			kring[i].nkr_ft = NULL; /* protect from freeing twice */
363 		}
364 	}
365 }
366 
367 
368 /*
369  * Allocate the forwarding tables for the rings attached to the bridge ports.
370  */
371 static int
372 nm_alloc_bdgfwd(struct netmap_adapter *na)
373 {
374 	int nrings, l, i, num_dstq;
375 	struct netmap_kring *kring;
376 
377 	NMG_LOCK_ASSERT();
378 	/* all port:rings + broadcast */
379 	num_dstq = NM_BDG_MAXPORTS * NM_BDG_MAXRINGS + 1;
380 	l = sizeof(struct nm_bdg_fwd) * NM_BDG_BATCH_MAX;
381 	l += sizeof(struct nm_bdg_q) * num_dstq;
382 	l += sizeof(uint16_t) * NM_BDG_BATCH_MAX;
383 
384 	nrings = na->num_tx_rings + 1;
385 	kring = na->tx_rings;
386 	for (i = 0; i < nrings; i++) {
387 		struct nm_bdg_fwd *ft;
388 		struct nm_bdg_q *dstq;
389 		int j;
390 
391 		ft = malloc(l, M_DEVBUF, M_NOWAIT | M_ZERO);
392 		if (!ft) {
393 			nm_free_bdgfwd(na);
394 			return ENOMEM;
395 		}
396 		dstq = (struct nm_bdg_q *)(ft + NM_BDG_BATCH_MAX);
397 		for (j = 0; j < num_dstq; j++) {
398 			dstq[j].bq_head = dstq[j].bq_tail = NM_FT_NULL;
399 			dstq[j].bq_len = 0;
400 		}
401 		kring[i].nkr_ft = ft;
402 	}
403 	return 0;
404 }
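
/*
 * Layout of the per-ring scratch area allocated above; nm_bdg_flush()
 * recovers the dstq and dsts pointers from ft with the same pointer
 * arithmetic used here:
 *
 *	struct nm_bdg_fwd ft[NM_BDG_BATCH_MAX];    // work area
 *	struct nm_bdg_q   dstq[num_dstq];          // one queue per port:ring, +1 broadcast
 *	uint16_t          dsts[NM_BDG_BATCH_MAX];  // indexes of active destinations
 */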
405 
406 
407 static void
408 netmap_bdg_detach_common(struct nm_bridge *b, int hw, int sw)
409 {
410 	int s_hw = hw, s_sw = sw;
411 	int i, lim = b->bdg_active_ports;
412 	uint8_t tmp[NM_BDG_MAXPORTS];
413 
414 	/*
415 	 * New algorithm:
416 	 * make a copy of bdg_port_index;
417 	 * lookup NA(ifp)->bdg_port and SWNA(ifp)->bdg_port
418 	 * in the array of bdg_port_index, replacing them with
419 	 * entries from the bottom of the array;
420 	 * decrement bdg_active_ports;
421 	 * acquire BDG_WLOCK() and copy back the array.
422 	 */
423 
424 	D("detach %d and %d (lim %d)", hw, sw, lim);
425 	/* make a copy of the list of active ports, update it,
426 	 * and then copy back within BDG_WLOCK().
427 	 */
428 	memcpy(tmp, b->bdg_port_index, sizeof(tmp));
429 	for (i = 0; (hw >= 0 || sw >= 0) && i < lim; ) {
430 		if (hw >= 0 && tmp[i] == hw) {
431 			ND("detach hw %d at %d", hw, i);
432 			lim--; /* point to last active port */
433 			tmp[i] = tmp[lim]; /* swap with i */
434 			tmp[lim] = hw;	/* now this is inactive */
435 			hw = -1;
436 		} else if (sw >= 0 && tmp[i] == sw) {
437 			ND("detach sw %d at %d", sw, i);
438 			lim--;
439 			tmp[i] = tmp[lim];
440 			tmp[lim] = sw;
441 			sw = -1;
442 		} else {
443 			i++;
444 		}
445 	}
446 	if (hw >= 0 || sw >= 0) {
447 		D("XXX delete failed hw %d sw %d, should panic...", hw, sw);
448 	}
449 
450 	BDG_WLOCK(b);
451 	b->bdg_ports[s_hw] = NULL;
452 	if (s_sw >= 0) {
453 		b->bdg_ports[s_sw] = NULL;
454 	}
455 	memcpy(b->bdg_port_index, tmp, sizeof(tmp));
456 	b->bdg_active_ports = lim;
457 	BDG_WUNLOCK(b);
458 
459 	ND("now %d active ports", lim);
460 	if (lim == 0) {
461 		ND("marking bridge %s as free", b->bdg_basename);
462 		b->nm_bdg_lookup = NULL;
463 	}
464 }
465 
466 
467 static void
468 netmap_adapter_vp_dtor(struct netmap_adapter *na)
469 {
470 	struct netmap_vp_adapter *vpna = (struct netmap_vp_adapter*)na;
471 	struct nm_bridge *b = vpna->na_bdg;
472 	struct ifnet *ifp = na->ifp;
473 
474 	ND("%s has %d references", NM_IFPNAME(ifp), na->na_refcount);
475 
476 	if (b) {
477 		netmap_bdg_detach_common(b, vpna->bdg_port, -1);
478 	}
479 
480 	bzero(ifp, sizeof(*ifp));
481 	free(ifp, M_DEVBUF);
482 	na->ifp = NULL;
483 }
484 
485 
486 /* Try to get a reference to a netmap adapter attached to a VALE switch.
487  * If the adapter is found (or is created), this function returns 0, a
488  * non NULL pointer is returned into *na, and the caller holds a
489  * reference to the adapter.
490  * If an adapter is not found, then no reference is grabbed and the
491  * function returns an error code, or 0 if there is just a VALE prefix
492  * mismatch. Therefore the caller holds a reference when
493  * (*na != NULL && return == 0).
494  */
495 int
496 netmap_get_bdg_na(struct nmreq *nmr, struct netmap_adapter **na, int create)
497 {
498 	const char *name = nmr->nr_name;
499 	struct ifnet *ifp;
500 	int error = 0;
501 	struct netmap_adapter *ret;
502 	struct netmap_vp_adapter *vpna;
503 	struct nm_bridge *b;
504 	int i, j, cand = -1, cand2 = -1;
505 	int needed;
506 
507 	*na = NULL;     /* default return value */
508 
509 	/* first try to see if this is a bridge port. */
510 	NMG_LOCK_ASSERT();
511 	if (strncmp(name, NM_NAME, sizeof(NM_NAME) - 1)) {
512 		return 0;  /* no error, but no VALE prefix */
513 	}
514 
515 	b = nm_find_bridge(name, create);
516 	if (b == NULL) {
517 		D("no bridges available for '%s'", name);
518 		return (ENXIO);
519 	}
520 
521 	/* Now we are sure that the name starts with the bridge's name;
522 	 * lookup the port in the bridge. We need to scan the entire
523 	 * list. It is not important to hold a WLOCK on the bridge
524 	 * during the search because NMG_LOCK already guarantees
525 	 * that there are no other possible writers.
526 	 */
527 
528 	/* lookup in the local list of ports */
529 	for (j = 0; j < b->bdg_active_ports; j++) {
530 		i = b->bdg_port_index[j];
531 		vpna = b->bdg_ports[i];
532 		// KASSERT(na != NULL);
533 		ifp = vpna->up.ifp;
534 		/* XXX make sure the name only contains one : */
535 		if (!strcmp(NM_IFPNAME(ifp), name)) {
536 			netmap_adapter_get(&vpna->up);
537 			ND("found existing if %s refs %d", name,
538 				vpna->na_bdg_refcount);
539 			*na = (struct netmap_adapter *)vpna;
540 			return 0;
541 		}
542 	}
543 	/* not found, should we create it? */
544 	if (!create)
545 		return ENXIO;
546 	/* yes we should, see if we have space to attach entries */
547 	needed = 2; /* in some cases we only need 1 */
548 	if (b->bdg_active_ports + needed >= NM_BDG_MAXPORTS) {
549 		D("bridge full %d, cannot create new port", b->bdg_active_ports);
550 		return EINVAL;
551 	}
552 	/* record the next two ports available, but do not allocate yet */
553 	cand = b->bdg_port_index[b->bdg_active_ports];
554 	cand2 = b->bdg_port_index[b->bdg_active_ports + 1];
555 	ND("+++ bridge %s port %s used %d avail %d %d",
556 		b->bdg_basename, name, b->bdg_active_ports, cand, cand2);
557 
558 	/*
559 	 * try to see if there is a matching NIC with this name
560 	 * (after the bridge's name)
561 	 */
562 	ifp = ifunit_ref(name + b->bdg_namelen + 1);
563 	if (!ifp) { /* this is a virtual port */
564 		if (nmr->nr_cmd) {
565 			/* nr_cmd must be 0 for a virtual port */
566 			return EINVAL;
567 		}
568 
569 	 	/* create a struct ifnet for the new port.
570 		 * need M_NOWAIT as we are under nma_lock
571 		 */
572 		ifp = malloc(sizeof(*ifp), M_DEVBUF, M_NOWAIT | M_ZERO);
573 		if (!ifp)
574 			return ENOMEM;
575 
576 		strcpy(ifp->if_xname, name);
577 		/* bdg_netmap_attach creates a struct netmap_adapter */
578 		error = bdg_netmap_attach(nmr, ifp);
579 		if (error) {
580 			D("error %d", error);
581 			free(ifp, M_DEVBUF);
582 			return error;
583 		}
584 		ret = NA(ifp);
585 		cand2 = -1;	/* only need one port */
586 	} else {  /* this is a NIC */
587 		struct ifnet *fake_ifp;
588 
589 		error = netmap_get_hw_na(ifp, &ret);
590 		if (error || ret == NULL)
591 			goto out;
592 
593 		/* make sure the NIC is not already in use */
594 		if (NETMAP_OWNED_BY_ANY(ret)) {
595 			D("NIC %s busy, cannot attach to bridge",
596 				NM_IFPNAME(ifp));
597 			error = EINVAL;
598 			goto out;
599 		}
600 		/* create a fake interface */
601 		fake_ifp = malloc(sizeof(*ifp), M_DEVBUF, M_NOWAIT | M_ZERO);
602 		if (!fake_ifp) {
603 			error = ENOMEM;
604 			goto out;
605 		}
606 		strcpy(fake_ifp->if_xname, name);
607 		error = netmap_bwrap_attach(fake_ifp, ifp);
608 		if (error) {
609 			free(fake_ifp, M_DEVBUF);
610 			goto out;
611 		}
612 		ret = NA(fake_ifp);
613 		if (nmr->nr_arg1 != NETMAP_BDG_HOST)
614 			cand2 = -1; /* only need one port */
615 		if_rele(ifp);
616 	}
617 	vpna = (struct netmap_vp_adapter *)ret;
618 
619 	BDG_WLOCK(b);
620 	vpna->bdg_port = cand;
621 	ND("NIC  %p to bridge port %d", vpna, cand);
622 	/* bind the port to the bridge (virtual ports are not active) */
623 	b->bdg_ports[cand] = vpna;
624 	vpna->na_bdg = b;
625 	b->bdg_active_ports++;
626 	if (cand2 >= 0) {
627 		struct netmap_vp_adapter *hostna = vpna + 1;
628 		/* also bind the host stack to the bridge */
629 		b->bdg_ports[cand2] = hostna;
630 		hostna->bdg_port = cand2;
631 		hostna->na_bdg = b;
632 		b->bdg_active_ports++;
633 		ND("host %p to bridge port %d", hostna, cand2);
634 	}
635 	ND("if %s refs %d", name, vpna->up.na_refcount);
636 	BDG_WUNLOCK(b);
637 	*na = ret;
638 	netmap_adapter_get(ret);
639 	return 0;
640 
641 out:
642 	if_rele(ifp);
643 
644 	return error;
645 }
646 
647 
648 /* Process NETMAP_BDG_ATTACH and NETMAP_BDG_DETACH */
649 static int
650 nm_bdg_attach(struct nmreq *nmr)
651 {
652 	struct netmap_adapter *na;
653 	struct netmap_if *nifp;
654 	struct netmap_priv_d *npriv;
655 	struct netmap_bwrap_adapter *bna;
656 	int error;
657 
658 	npriv = malloc(sizeof(*npriv), M_DEVBUF, M_NOWAIT|M_ZERO);
659 	if (npriv == NULL)
660 		return ENOMEM;
661 	NMG_LOCK();
662 	/* XXX probably netmap_get_bdg_na() */
663 	error = netmap_get_bdg_na(nmr, &na, 1 /* create if not exists */);
664 	if (error) /* no device, or another bridge or user owns the device */
665 		goto unlock_exit;
666 	if (na == NULL) { /* VALE prefix missing */
667 		error = EINVAL;
668 		goto unlock_exit;
669 	}
670 
671 	if (na->active_fds > 0) { /* already registered */
672 		error = EBUSY;
673 		goto unref_exit;
674 	}
675 
676 	nifp = netmap_do_regif(npriv, na, nmr->nr_ringid, &error);
677 	if (!nifp) {
678 		goto unref_exit;
679 	}
680 
681 	bna = (struct netmap_bwrap_adapter*)na;
682 	bna->na_kpriv = npriv;
683 	NMG_UNLOCK();
684 	ND("registered %s to netmap-mode", NM_IFPNAME(na->ifp));
685 	return 0;
686 
687 unref_exit:
688 	netmap_adapter_put(na);
689 unlock_exit:
690 	NMG_UNLOCK();
691 	bzero(npriv, sizeof(*npriv));
692 	free(npriv, M_DEVBUF);
693 	return error;
694 }
695 
696 
697 static int
698 nm_bdg_detach(struct nmreq *nmr)
699 {
700 	struct netmap_adapter *na;
701 	int error;
702 	struct netmap_bwrap_adapter *bna;
703 	int last_instance;
704 
705 	NMG_LOCK();
706 	error = netmap_get_bdg_na(nmr, &na, 0 /* don't create */);
707 	if (error) { /* no device, or another bridge or user owns the device */
708 		goto unlock_exit;
709 	}
710 	if (na == NULL) { /* VALE prefix missing */
711 		error = EINVAL;
712 		goto unlock_exit;
713 	}
714 
715 	bna = (struct netmap_bwrap_adapter *)na;
716 
717 	if (na->active_fds == 0) { /* not registered */
718 		error = EINVAL;
719 		goto unref_exit;
720 	}
721 
722 	last_instance = netmap_dtor_locked(bna->na_kpriv); /* unregister */
723 	if (!last_instance) {
724 		D("--- error, trying to detach an entry with active mmaps");
725 		error = EINVAL;
726 	} else {
727 		struct netmap_priv_d *npriv = bna->na_kpriv;
728 
729 		bna->na_kpriv = NULL;
730 		D("deleting priv");
731 
732 		bzero(npriv, sizeof(*npriv));
733 		free(npriv, M_DEVBUF);
734 	}
735 
736 unref_exit:
737 	netmap_adapter_put(na);
738 unlock_exit:
739 	NMG_UNLOCK();
740 	return error;
741 
742 }
743 
744 
745 /* exported to kernel callers, e.g. OVS ?
746  * Entry point.
747  * Called without NMG_LOCK.
748  */
749 int
750 netmap_bdg_ctl(struct nmreq *nmr, bdg_lookup_fn_t func)
751 {
752 	struct nm_bridge *b;
753 	struct netmap_adapter *na;
754 	struct netmap_vp_adapter *vpna;
755 	struct ifnet *iter;
756 	char *name = nmr->nr_name;
757 	int cmd = nmr->nr_cmd, namelen = strlen(name);
758 	int error = 0, i, j;
759 
760 	switch (cmd) {
761 	case NETMAP_BDG_ATTACH:
762 		error = nm_bdg_attach(nmr);
763 		break;
764 
765 	case NETMAP_BDG_DETACH:
766 		error = nm_bdg_detach(nmr);
767 		break;
768 
769 	case NETMAP_BDG_LIST:
770 		/* this is used to enumerate bridges and ports */
771 		if (namelen) { /* look up indexes of bridge and port */
772 			if (strncmp(name, NM_NAME, strlen(NM_NAME))) {
773 				error = EINVAL;
774 				break;
775 			}
776 			NMG_LOCK();
777 			b = nm_find_bridge(name, 0 /* don't create */);
778 			if (!b) {
779 				error = ENOENT;
780 				NMG_UNLOCK();
781 				break;
782 			}
783 
784 			error = ENOENT;
785 			for (j = 0; j < b->bdg_active_ports; j++) {
786 				i = b->bdg_port_index[j];
787 				vpna = b->bdg_ports[i];
788 				if (vpna == NULL) {
789 					D("---AAAAAAAAARGH-------");
790 					continue;
791 				}
792 				iter = vpna->up.ifp;
793 				/* the former and the latter identify a
794 				 * virtual port and a NIC, respectively
795 				 */
796 				if (!strcmp(iter->if_xname, name)) {
797 					/* bridge index */
798 					nmr->nr_arg1 = b - nm_bridges;
799 					nmr->nr_arg2 = i; /* port index */
800 					error = 0;
801 					break;
802 				}
803 			}
804 			NMG_UNLOCK();
805 		} else {
806 			/* return the first non-empty entry starting from
807 			 * bridge nr_arg1 and port nr_arg2.
808 			 *
809 			 * Users can detect the end of the same bridge by
810 			 * comparing the new and old values of nr_arg1, and can
811 			 * detect the end of all the bridges by error != 0
812 			 */
813 			i = nmr->nr_arg1;
814 			j = nmr->nr_arg2;
815 
816 			NMG_LOCK();
817 			for (error = ENOENT; i < NM_BRIDGES; i++) {
818 				b = nm_bridges + i;
819 				if (j >= b->bdg_active_ports) {
820 					j = 0; /* following bridges scan from 0 */
821 					continue;
822 				}
823 				nmr->nr_arg1 = i;
824 				nmr->nr_arg2 = j;
825 				j = b->bdg_port_index[j];
826 				vpna = b->bdg_ports[j];
827 				iter = vpna->up.ifp;
828 				strncpy(name, iter->if_xname, (size_t)IFNAMSIZ);
829 				error = 0;
830 				break;
831 			}
832 			NMG_UNLOCK();
833 		}
834 		break;
835 
836 	case NETMAP_BDG_LOOKUP_REG:
837 		/* register a lookup function to the given bridge.
838 		 * nmr->nr_name may be just bridge's name (including ':'
839 		 * if it is not just NM_NAME).
840 		 */
841 		if (!func) {
842 			error = EINVAL;
843 			break;
844 		}
845 		NMG_LOCK();
846 		b = nm_find_bridge(name, 0 /* don't create */);
847 		if (!b) {
848 			error = EINVAL;
849 		} else {
850 			b->nm_bdg_lookup = func;
851 		}
852 		NMG_UNLOCK();
853 		break;
854 
855 	case NETMAP_BDG_OFFSET:
856 		NMG_LOCK();
857 		error = netmap_get_bdg_na(nmr, &na, 0);
858 		if (na && !error) {
859 			vpna = (struct netmap_vp_adapter *)na;
860 			if (nmr->nr_arg1 > NETMAP_BDG_MAX_OFFSET)
861 				nmr->nr_arg1 = NETMAP_BDG_MAX_OFFSET;
862 			vpna->offset = nmr->nr_arg1;
863 			D("Using offset %d for %p", vpna->offset, vpna);
864 			netmap_adapter_put(na);
865 		}
866 		NMG_UNLOCK();
867 		break;
868 
869 	default:
870 		D("invalid cmd (nmr->nr_cmd) (0x%x)", cmd);
871 		error = EINVAL;
872 		break;
873 	}
874 	return error;
875 }
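
/*
 * A sketch of how a kernel caller might use the entry point above to
 * install a custom lookup function on an existing switch; "vale0:" and
 * my_lookup are hypothetical names, my_lookup being a bdg_lookup_fn_t:
 *
 *	struct nmreq nmr;
 *	int error;
 *
 *	bzero(&nmr, sizeof(nmr));
 *	strncpy(nmr.nr_name, "vale0:", sizeof(nmr.nr_name) - 1);
 *	nmr.nr_cmd = NETMAP_BDG_LOOKUP_REG;
 *	error = netmap_bdg_ctl(&nmr, my_lookup);
 */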
876 
877 
878 static int
879 netmap_vp_krings_create(struct netmap_adapter *na)
880 {
881 	u_int ntx, nrx, tailroom;
882 	int error, i;
883 	uint32_t *leases;
884 
885 	/* XXX vps do not need host rings,
886 	 * but we crash if we don't have one
887 	 */
888 	ntx = na->num_tx_rings + 1;
889 	nrx = na->num_rx_rings + 1;
890 
891 	/*
892 	 * Leases are attached to RX rings on vale ports
893 	 */
894 	tailroom = sizeof(uint32_t) * na->num_rx_desc * nrx;
895 
896 	error = netmap_krings_create(na, ntx, nrx, tailroom);
897 	if (error)
898 		return error;
899 
900 	leases = na->tailroom;
901 
902 	for (i = 0; i < nrx; i++) { /* Receive rings */
903 		na->rx_rings[i].nkr_leases = leases;
904 		leases += na->num_rx_desc;
905 	}
906 
907 	error = nm_alloc_bdgfwd(na);
908 	if (error) {
909 		netmap_krings_delete(na);
910 		return error;
911 	}
912 
913 	return 0;
914 }
915 
916 
917 static void
918 netmap_vp_krings_delete(struct netmap_adapter *na)
919 {
920 	nm_free_bdgfwd(na);
921 	netmap_krings_delete(na);
922 }
923 
924 
925 static int
926 nm_bdg_flush(struct nm_bdg_fwd *ft, u_int n,
927 	struct netmap_vp_adapter *na, u_int ring_nr);
928 
929 
930 /*
931  * Grab packets from a kring, move them into the ft structure
932  * associated with the tx (input) port. Max one instance per port,
933  * filtered on input (ioctl, poll or XXX).
934  * Returns the next position in the ring.
935  */
936 static int
937 nm_bdg_preflush(struct netmap_vp_adapter *na, u_int ring_nr,
938 	struct netmap_kring *kring, u_int end)
939 {
940 	struct netmap_ring *ring = kring->ring;
941 	struct nm_bdg_fwd *ft;
942 	u_int j = kring->nr_hwcur, lim = kring->nkr_num_slots - 1;
943 	u_int ft_i = 0;	/* start from 0 */
944 	u_int frags = 1; /* how many frags ? */
945 	struct nm_bridge *b = na->na_bdg;
946 
947 	/* To protect against modifications to the bridge we acquire a
948 	 * shared lock, waiting if we can sleep (if the source port is
949 	 * attached to a user process) or with a trylock otherwise (NICs).
950 	 */
951 	ND("wait rlock for %d packets", ((j > end ? lim+1 : 0) + end) - j);
952 	if (na->up.na_flags & NAF_BDG_MAYSLEEP)
953 		BDG_RLOCK(b);
954 	else if (!BDG_RTRYLOCK(b))
955 		return 0;
956 	ND(5, "rlock acquired for %d packets", ((j > end ? lim+1 : 0) + end) - j);
957 	ft = kring->nkr_ft;
958 
959 	for (; likely(j != end); j = nm_next(j, lim)) {
960 		struct netmap_slot *slot = &ring->slot[j];
961 		char *buf;
962 
963 		ft[ft_i].ft_len = slot->len;
964 		ft[ft_i].ft_flags = slot->flags;
965 
966 		ND("flags is 0x%x", slot->flags);
967 		/* this slot goes into a list so initialize the link field */
968 		ft[ft_i].ft_next = NM_FT_NULL;
969 		buf = ft[ft_i].ft_buf = (slot->flags & NS_INDIRECT) ?
970 			(void *)(uintptr_t)slot->ptr : BDG_NMB(&na->up, slot);
971 		__builtin_prefetch(buf);
972 		++ft_i;
973 		if (slot->flags & NS_MOREFRAG) {
974 			frags++;
975 			continue;
976 		}
977 		if (unlikely(netmap_verbose && frags > 1))
978 			RD(5, "%d frags at %d", frags, ft_i - frags);
979 		ft[ft_i - frags].ft_frags = frags;
980 		frags = 1;
981 		if (unlikely((int)ft_i >= bridge_batch))
982 			ft_i = nm_bdg_flush(ft, ft_i, na, ring_nr);
983 	}
984 	if (frags > 1) {
985 		D("truncate incomplete fragment at %d (%d frags)", ft_i, frags);
986 		// ft_i > 0, ft[ft_i-1].flags has NS_MOREFRAG
987 		ft[ft_i - 1].ft_flags &= ~NS_MOREFRAG;
988 		ft[ft_i - frags].ft_frags = frags - 1;
989 	}
990 	if (ft_i)
991 		ft_i = nm_bdg_flush(ft, ft_i, na, ring_nr);
992 	BDG_RUNLOCK(b);
993 	return j;
994 }
995 
996 
997 /* ----- FreeBSD if_bridge hash function ------- */
998 
999 /*
1000  * The following hash function is adapted from "Hash Functions" by Bob Jenkins
1001  * ("Algorithm Alley", Dr. Dobb's Journal, September 1997).
1002  *
1003  * http://www.burtleburtle.net/bob/hash/spooky.html
1004  */
1005 #define mix(a, b, c)                                                    \
1006 do {                                                                    \
1007         a -= b; a -= c; a ^= (c >> 13);                                 \
1008         b -= c; b -= a; b ^= (a << 8);                                  \
1009         c -= a; c -= b; c ^= (b >> 13);                                 \
1010         a -= b; a -= c; a ^= (c >> 12);                                 \
1011         b -= c; b -= a; b ^= (a << 16);                                 \
1012         c -= a; c -= b; c ^= (b >> 5);                                  \
1013         a -= b; a -= c; a ^= (c >> 3);                                  \
1014         b -= c; b -= a; b ^= (a << 10);                                 \
1015         c -= a; c -= b; c ^= (b >> 15);                                 \
1016 } while (/*CONSTCOND*/0)
1017 
1018 
1019 static __inline uint32_t
1020 nm_bridge_rthash(const uint8_t *addr)
1021 {
1022         uint32_t a = 0x9e3779b9, b = 0x9e3779b9, c = 0; // hash key
1023 
1024         b += addr[5] << 8;
1025         b += addr[4];
1026         a += addr[3] << 24;
1027         a += addr[2] << 16;
1028         a += addr[1] << 8;
1029         a += addr[0];
1030 
1031         mix(a, b, c);
1032 #define BRIDGE_RTHASH_MASK	(NM_BDG_HASH-1)
1033         return (c & BRIDGE_RTHASH_MASK);
1034 }
1035 
1036 #undef mix
1037 
1038 
1039 static int
1040 bdg_netmap_reg(struct netmap_adapter *na, int onoff)
1041 {
1042 	struct netmap_vp_adapter *vpna =
1043 		(struct netmap_vp_adapter*)na;
1044 	struct ifnet *ifp = na->ifp;
1045 
1046 	/* the interface is already attached to the bridge,
1047 	 * so we only need to toggle IFCAP_NETMAP.
1048 	 */
1049 	BDG_WLOCK(vpna->na_bdg);
1050 	if (onoff) {
1051 		ifp->if_capenable |= IFCAP_NETMAP;
1052 	} else {
1053 		ifp->if_capenable &= ~IFCAP_NETMAP;
1054 	}
1055 	BDG_WUNLOCK(vpna->na_bdg);
1056 	return 0;
1057 }
1058 
1059 
1060 /*
1061  * Lookup function for a learning bridge.
1062  * Update the hash table with the source address,
1063  * and then return the destination port index, and the
1064  * ring in *dst_ring (at the moment, always use ring 0)
1065  */
1066 u_int
1067 netmap_bdg_learning(char *buf, u_int buf_len, uint8_t *dst_ring,
1068 		struct netmap_vp_adapter *na)
1069 {
1070 	struct nm_hash_ent *ht = na->na_bdg->ht;
1071 	uint32_t sh, dh;
1072 	u_int dst, mysrc = na->bdg_port;
1073 	uint64_t smac, dmac;
1074 
1075 	if (buf_len < 14) {
1076 		D("invalid buf length %d", buf_len);
1077 		return NM_BDG_NOPORT;
1078 	}
1079 	dmac = le64toh(*(uint64_t *)(buf)) & 0xffffffffffff;
1080 	smac = le64toh(*(uint64_t *)(buf + 4));
1081 	smac >>= 16;
1082 
1083 	/*
1084 	 * The hash is somewhat expensive, there might be some
1085 	 * worthwhile optimizations here.
1086 	 */
1087 	if ((buf[6] & 1) == 0) { /* valid src */
1088 		uint8_t *s = buf+6;
1089 		sh = nm_bridge_rthash(s); // XXX hash of source
1090 		/* update source port forwarding entry */
1091 		ht[sh].mac = smac;	/* XXX expire ? */
1092 		ht[sh].ports = mysrc;
1093 		if (netmap_verbose)
1094 		    D("src %02x:%02x:%02x:%02x:%02x:%02x on port %d",
1095 			s[0], s[1], s[2], s[3], s[4], s[5], mysrc);
1096 	}
1097 	dst = NM_BDG_BROADCAST;
1098 	if ((buf[0] & 1) == 0) { /* unicast */
1099 		dh = nm_bridge_rthash(buf); // XXX hash of dst
1100 		if (ht[dh].mac == dmac) {	/* found dst */
1101 			dst = ht[dh].ports;
1102 		}
1103 		/* XXX otherwise return NM_BDG_UNKNOWN ? */
1104 	}
1105 	*dst_ring = 0;
1106 	return dst;
1107 }
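
/*
 * Example: a unicast frame whose destination MAC hashes to an entry
 * previously filled by that station's own traffic is switched to the
 * recorded port only; frames with the group bit set in buf[0], or with
 * no matching hash entry, fall through to NM_BDG_BROADCAST.
 */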
1108 
1109 
1110 /*
1111  * Available space in the ring. Only used in VALE code
1112  * and only with is_rx = 1
1113  */
1114 static inline uint32_t
1115 nm_kr_space(struct netmap_kring *k, int is_rx)
1116 {
1117 	int space;
1118 
1119 	if (is_rx) {
1120 		int busy = k->nkr_hwlease - k->nr_hwcur;
1121 		if (busy < 0)
1122 			busy += k->nkr_num_slots;
1123 		space = k->nkr_num_slots - 1 - busy;
1124 	} else {
1125 		/* XXX never used in this branch */
1126 		space = k->nr_hwtail - k->nkr_hwlease;
1127 		if (space < 0)
1128 			space += k->nkr_num_slots;
1129 	}
1130 #if 0
1131 	// sanity check
1132 	if (k->nkr_hwlease >= k->nkr_num_slots ||
1133 		k->nr_hwcur >= k->nkr_num_slots ||
1134 		k->nr_tail >= k->nkr_num_slots ||
1135 		busy < 0 ||
1136 		busy >= k->nkr_num_slots) {
1137 		D("invalid kring, cur %d tail %d lease %d lease_idx %d lim %d",
1138 			k->nr_hwcur, k->nr_hwtail, k->nkr_hwlease, k->nkr_lease_idx, k->nkr_num_slots);
1139 	}
1140 #endif
1141 	return space;
1142 }
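
/*
 * Worked example (rx case): with nkr_num_slots = 1024, nr_hwcur = 10
 * and nkr_hwlease = 510, busy = 500, so space = 1024 - 1 - 500 = 523
 * slots can still be leased.
 */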
1143 
1144 
1145 
1146 
1147 /* make a lease on the kring for N positions. return the
1148  * lease index
1149  * XXX only used in VALE code and with is_rx = 1
1150  */
1151 static inline uint32_t
1152 nm_kr_lease(struct netmap_kring *k, u_int n, int is_rx)
1153 {
1154 	uint32_t lim = k->nkr_num_slots - 1;
1155 	uint32_t lease_idx = k->nkr_lease_idx;
1156 
1157 	k->nkr_leases[lease_idx] = NR_NOSLOT;
1158 	k->nkr_lease_idx = nm_next(lease_idx, lim);
1159 
1160 	if (n > nm_kr_space(k, is_rx)) {
1161 		D("invalid request for %d slots", n);
1162 		panic("x");
1163 	}
1164 	/* XXX verify that there are n slots */
1165 	k->nkr_hwlease += n;
1166 	if (k->nkr_hwlease > lim)
1167 		k->nkr_hwlease -= lim + 1;
1168 
1169 	if (k->nkr_hwlease >= k->nkr_num_slots ||
1170 		k->nr_hwcur >= k->nkr_num_slots ||
1171 		k->nr_hwtail >= k->nkr_num_slots ||
1172 		k->nkr_lease_idx >= k->nkr_num_slots) {
1173 		D("invalid kring %s, cur %d tail %d lease %d lease_idx %d lim %d",
1174 			k->na->ifp->if_xname,
1175 			k->nr_hwcur, k->nr_hwtail, k->nkr_hwlease,
1176 			k->nkr_lease_idx, k->nkr_num_slots);
1177 	}
1178 	return lease_idx;
1179 }
1180 
1181 /*
1182  * This flush routine supports only unicast and broadcast but a large
1183  * number of ports, and lets us replace the learn and dispatch functions.
1184  */
1185 int
1186 nm_bdg_flush(struct nm_bdg_fwd *ft, u_int n, struct netmap_vp_adapter *na,
1187 		u_int ring_nr)
1188 {
1189 	struct nm_bdg_q *dst_ents, *brddst;
1190 	uint16_t num_dsts = 0, *dsts;
1191 	struct nm_bridge *b = na->na_bdg;
1192 	u_int i, j, me = na->bdg_port;
1193 
1194 	/*
1195 	 * The work area (pointed by ft) is followed by an array of
1196 	 * pointers to queues, dst_ents; there are NM_BDG_MAXRINGS
1197 	 * queues per port plus one for the broadcast traffic.
1198 	 * Then we have an array of destination indexes.
1199 	 */
1200 	dst_ents = (struct nm_bdg_q *)(ft + NM_BDG_BATCH_MAX);
1201 	dsts = (uint16_t *)(dst_ents + NM_BDG_MAXPORTS * NM_BDG_MAXRINGS + 1);
1202 
1203 	/* first pass: find a destination for each packet in the batch */
1204 	for (i = 0; likely(i < n); i += ft[i].ft_frags) {
1205 		uint8_t dst_ring = ring_nr; /* default, same ring as origin */
1206 		uint16_t dst_port, d_i;
1207 		struct nm_bdg_q *d;
1208 		uint8_t *buf = ft[i].ft_buf;
1209 		u_int len = ft[i].ft_len;
1210 
1211 		ND("slot %d frags %d", i, ft[i].ft_frags);
1212 		/* Drop the packet if the offset falls neither within the first
1213 		   fragment nor at the very beginning of the second. */
1214 		if (unlikely(na->offset > len))
1215 			continue;
1216 		if (len == na->offset) {
1217 			buf = ft[i+1].ft_buf;
1218 			len = ft[i+1].ft_len;
1219 		} else {
1220 			buf += na->offset;
1221 			len -= na->offset;
1222 		}
1223 		dst_port = b->nm_bdg_lookup(buf, len, &dst_ring, na);
1224 		if (netmap_verbose > 255)
1225 			RD(5, "slot %d port %d -> %d", i, me, dst_port);
1226 		if (dst_port == NM_BDG_NOPORT)
1227 			continue; /* this packet is identified to be dropped */
1228 		else if (unlikely(dst_port > NM_BDG_MAXPORTS))
1229 			continue;
1230 		else if (dst_port == NM_BDG_BROADCAST)
1231 			dst_ring = 0; /* broadcasts always go to ring 0 */
1232 		else if (unlikely(dst_port == me ||
1233 		    !b->bdg_ports[dst_port]))
1234 			continue;
1235 
1236 		/* get a position in the scratch pad */
1237 		d_i = dst_port * NM_BDG_MAXRINGS + dst_ring;
1238 		d = dst_ents + d_i;
1239 
1240 		/* append the first fragment to the list */
1241 		if (d->bq_head == NM_FT_NULL) { /* new destination */
1242 			d->bq_head = d->bq_tail = i;
1243 			/* remember this position to be scanned later */
1244 			if (dst_port != NM_BDG_BROADCAST)
1245 				dsts[num_dsts++] = d_i;
1246 		} else {
1247 			ft[d->bq_tail].ft_next = i;
1248 			d->bq_tail = i;
1249 		}
1250 		d->bq_len += ft[i].ft_frags;
1251 	}
1252 
1253 	/*
1254 	 * Broadcast traffic goes to ring 0 on all destinations.
1255 	 * So we need to add these rings to the list of ports to scan.
1256 	 * XXX at the moment we scan all NM_BDG_MAXPORTS ports, which is
1257 	 * expensive. We should keep a compact list of active destinations
1258 	 * so we could shorten this loop.
1259 	 */
1260 	brddst = dst_ents + NM_BDG_BROADCAST * NM_BDG_MAXRINGS;
1261 	if (brddst->bq_head != NM_FT_NULL) {
1262 		for (j = 0; likely(j < b->bdg_active_ports); j++) {
1263 			uint16_t d_i;
1264 			i = b->bdg_port_index[j];
1265 			if (unlikely(i == me))
1266 				continue;
1267 			d_i = i * NM_BDG_MAXRINGS;
1268 			if (dst_ents[d_i].bq_head == NM_FT_NULL)
1269 				dsts[num_dsts++] = d_i;
1270 		}
1271 	}
1272 
1273 	ND(5, "pass 1 done %d pkts %d dsts", n, num_dsts);
1274 	/* second pass: scan destinations (XXX will be modular somehow) */
1275 	for (i = 0; i < num_dsts; i++) {
1276 		struct ifnet *dst_ifp;
1277 		struct netmap_vp_adapter *dst_na;
1278 		struct netmap_kring *kring;
1279 		struct netmap_ring *ring;
1280 		u_int dst_nr, lim, j, sent = 0, d_i, next, brd_next;
1281 		u_int needed, howmany;
1282 		int retry = netmap_txsync_retry;
1283 		struct nm_bdg_q *d;
1284 		uint32_t my_start = 0, lease_idx = 0;
1285 		int nrings;
1286 		int offset_mismatch;
1287 
1288 		d_i = dsts[i];
1289 		ND("second pass %d port %d", i, d_i);
1290 		d = dst_ents + d_i;
1291 		// XXX fix the division
1292 		dst_na = b->bdg_ports[d_i/NM_BDG_MAXRINGS];
1293 		/* protect from the lookup function returning an inactive
1294 		 * destination port
1295 		 */
1296 		if (unlikely(dst_na == NULL))
1297 			goto cleanup;
1298 		if (dst_na->up.na_flags & NAF_SW_ONLY)
1299 			goto cleanup;
1300 		dst_ifp = dst_na->up.ifp;
1301 		/*
1302 		 * The interface may be in !netmap mode in two cases:
1303 		 * - when na is attached but not activated yet;
1304 		 * - when na is being deactivated but is still attached.
1305 		 */
1306 		if (unlikely(!(dst_ifp->if_capenable & IFCAP_NETMAP))) {
1307 			ND("not in netmap mode!");
1308 			goto cleanup;
1309 		}
1310 
1311 		offset_mismatch = (dst_na->offset != na->offset);
1312 
1313 		/* there is at least one either unicast or broadcast packet */
1314 		brd_next = brddst->bq_head;
1315 		next = d->bq_head;
1316 		/* we need to reserve this many slots. If fewer are
1317 		 * available, some packets will be dropped.
1318 		 * Packets may have multiple fragments, so there is
1319 		 * a chance that we may not use all of the slots
1320 		 * we have claimed, so we will need to handle the leftover
1321 		 * ones when we regain the lock.
1322 		 */
1323 		needed = d->bq_len + brddst->bq_len;
1324 
1325 		ND(5, "pass 2 dst %d is %x %s",
1326 			i, d_i, is_vp ? "virtual" : "nic/host");
1327 		dst_nr = d_i & (NM_BDG_MAXRINGS-1);
1328 		nrings = dst_na->up.num_rx_rings;
1329 		if (dst_nr >= nrings)
1330 			dst_nr = dst_nr % nrings;
1331 		kring = &dst_na->up.rx_rings[dst_nr];
1332 		ring = kring->ring;
1333 		lim = kring->nkr_num_slots - 1;
1334 
1335 retry:
1336 
1337 		/* reserve the buffers in the queue and an entry
1338 		 * to report completion, and drop lock.
1339 		 * XXX this might become a helper function.
1340 		 */
1341 		mtx_lock(&kring->q_lock);
1342 		if (kring->nkr_stopped) {
1343 			mtx_unlock(&kring->q_lock);
1344 			goto cleanup;
1345 		}
1346 		if (dst_na->retry) {
1347 			dst_na->up.nm_notify(&dst_na->up, dst_nr, NR_RX, 0);
1348 		}
1349 		my_start = j = kring->nkr_hwlease;
1350 		howmany = nm_kr_space(kring, 1);
1351 		if (needed < howmany)
1352 			howmany = needed;
1353 		lease_idx = nm_kr_lease(kring, howmany, 1);
1354 		mtx_unlock(&kring->q_lock);
1355 
1356 		/* only retry if we need more than available slots */
1357 		if (retry && needed <= howmany)
1358 			retry = 0;
1359 
1360 		/* copy to the destination queue */
1361 		while (howmany > 0) {
1362 			struct netmap_slot *slot;
1363 			struct nm_bdg_fwd *ft_p, *ft_end;
1364 			u_int cnt;
1365 			int fix_mismatch = offset_mismatch;
1366 
1367 			/* find the queue from which we pick next packet.
1368 			 * NM_FT_NULL is always higher than valid indexes
1369 			 * so we never dereference it if the other list
1370 			 * has packets (and if both are empty we never
1371 			 * get here).
1372 			 */
1373 			if (next < brd_next) {
1374 				ft_p = ft + next;
1375 				next = ft_p->ft_next;
1376 			} else { /* insert broadcast */
1377 				ft_p = ft + brd_next;
1378 				brd_next = ft_p->ft_next;
1379 			}
1380 			cnt = ft_p->ft_frags; // cnt > 0
1381 			if (unlikely(cnt > howmany))
1382 			    break; /* no more space */
1383 			howmany -= cnt;
1384 			if (netmap_verbose && cnt > 1)
1385 				RD(5, "rx %d frags to %d", cnt, j);
1386 			ft_end = ft_p + cnt;
1387 			do {
1388 			    char *dst, *src = ft_p->ft_buf;
1389 			    size_t copy_len = ft_p->ft_len, dst_len = copy_len;
1390 
1391 			    slot = &ring->slot[j];
1392 			    dst = BDG_NMB(&dst_na->up, slot);
1393 
1394 			    if (unlikely(fix_mismatch)) {
1395 				    /* We are processing the first fragment
1396 				     * and there is a mismatch between source
1397 				     * and destination offsets. Create a zeroed
1398 				     * header for the destination, independently
1399 				     * of the source header length and content.
1400 				     */
1401 				    src += na->offset;
1402 				    copy_len -= na->offset;
1403 				    bzero(dst, dst_na->offset);
1404 				    dst += dst_na->offset;
1405 				    dst_len = dst_na->offset + copy_len;
1406 				    /* fix the first fragment only */
1407 				    fix_mismatch = 0;
1408 				    /* Here it could be copy_len == dst_len == 0,
1409 				     * and so a zero length fragment is passed.
1410 				     */
1411 			    }
1412 
1413 			    ND("send [%d] %d(%d) bytes at %s:%d",
1414 				i, (int)copy_len, (int)dst_len,
1415 				NM_IFPNAME(dst_ifp), j);
1416 			    /* round to a multiple of 64 */
1417 			    copy_len = (copy_len + 63) & ~63;
1418 
1419 			    if (ft_p->ft_flags & NS_INDIRECT) {
1420 				if (copyin(src, dst, copy_len)) {
1421 					// invalid user pointer, pretend len is 0
1422 					dst_len = 0;
1423 				}
1424 			    } else {
1425 				//memcpy(dst, src, copy_len);
1426 				pkt_copy(src, dst, (int)copy_len);
1427 			    }
1428 			    slot->len = dst_len;
1429 			    slot->flags = (cnt << 8) | NS_MOREFRAG;
1430 			    j = nm_next(j, lim);
1431 			    ft_p++;
1432 			    sent++;
1433 			} while (ft_p != ft_end);
1434 			slot->flags = (cnt << 8); /* clear flag on last entry */
1435 			/* are we done ? */
1436 			if (next == NM_FT_NULL && brd_next == NM_FT_NULL)
1437 				break;
1438 		}
1439 		{
1440 		    /* current position */
1441 		    uint32_t *p = kring->nkr_leases; /* shorthand */
1442 		    uint32_t update_pos;
1443 		    int still_locked = 1;
1444 
1445 		    mtx_lock(&kring->q_lock);
1446 		    if (unlikely(howmany > 0)) {
1447 			/* we have not used all the buffers. If I am the last
1448 			 * one I can recover the slots, otherwise I must
1449 			 * fill them with 0 to mark empty packets.
1450 			 */
1451 			ND("leftover %d bufs", howmany);
1452 			if (nm_next(lease_idx, lim) == kring->nkr_lease_idx) {
1453 			    /* yes i am the last one */
1454 			    ND("roll back nkr_hwlease to %d", j);
1455 			    kring->nkr_hwlease = j;
1456 			} else {
1457 			    while (howmany-- > 0) {
1458 				ring->slot[j].len = 0;
1459 				ring->slot[j].flags = 0;
1460 				j = nm_next(j, lim);
1461 			    }
1462 			}
1463 		    }
1464 		    p[lease_idx] = j; /* report I am done */
1465 
1466 		    update_pos = kring->nr_hwtail;
1467 
1468 		    if (my_start == update_pos) {
1469 			/* all slots before my_start have been reported,
1470 			 * so scan subsequent leases to see if other ranges
1471 			 * have been completed, and do a selwakeup or txsync.
1472 			 */
1473 			while (lease_idx != kring->nkr_lease_idx &&
1474 				p[lease_idx] != NR_NOSLOT) {
1475 			    j = p[lease_idx];
1476 			    p[lease_idx] = NR_NOSLOT;
1477 			    lease_idx = nm_next(lease_idx, lim);
1478 			}
1479 			/* j is the new 'write' position. j != my_start
1480 			 * means there are new buffers to report
1481 			 */
1482 			if (likely(j != my_start)) {
1483 				kring->nr_hwtail = j;
1484 				dst_na->up.nm_notify(&dst_na->up, dst_nr, NR_RX, 0);
1485 				still_locked = 0;
1486 				mtx_unlock(&kring->q_lock);
1487 				if (dst_na->retry && retry--)
1488 					goto retry;
1489 			}
1490 		    }
1491 		    if (still_locked)
1492 			mtx_unlock(&kring->q_lock);
1493 		}
1494 cleanup:
1495 		d->bq_head = d->bq_tail = NM_FT_NULL; /* cleanup */
1496 		d->bq_len = 0;
1497 	}
1498 	brddst->bq_head = brddst->bq_tail = NM_FT_NULL; /* cleanup */
1499 	brddst->bq_len = 0;
1500 	return 0;
1501 }
1502 
1503 
1504 static int
1505 netmap_vp_txsync(struct netmap_vp_adapter *na, u_int ring_nr, int flags)
1506 {
1507 	struct netmap_kring *kring = &na->up.tx_rings[ring_nr];
1508 	u_int done;
1509 	u_int const lim = kring->nkr_num_slots - 1;
1510 	u_int const cur = kring->rcur;
1511 
1512 	if (bridge_batch <= 0) { /* testing only */
1513 		done = cur; // used all
1514 		goto done;
1515 	}
1516 	if (bridge_batch > NM_BDG_BATCH)
1517 		bridge_batch = NM_BDG_BATCH;
1518 
1519 	done = nm_bdg_preflush(na, ring_nr, kring, cur);
1520 done:
1521 	if (done != cur)
1522 		D("early break at %d/ %d, tail %d", done, cur, kring->nr_hwtail);
1523 	/*
1524 	 * packets between 'done' and 'cur' are left unsent.
1525 	 */
1526 	kring->nr_hwcur = done;
1527 	kring->nr_hwtail = nm_prev(done, lim);
1528 	nm_txsync_finalize(kring);
1529 	if (netmap_verbose)
1530 		D("%s ring %d flags %d", NM_IFPNAME(na->up.ifp), ring_nr, flags);
1531 	return 0;
1532 }
1533 
1534 
1535 /*
1536  * main dispatch routine for the bridge.
1537  * We already know that only one thread is running this.
1538  * we must run nm_bdg_preflush without lock.
1539  */
1540 static int
1541 bdg_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags)
1542 {
1543 	struct netmap_vp_adapter *vpna = (struct netmap_vp_adapter*)na;
1544 	return netmap_vp_txsync(vpna, ring_nr, flags);
1545 }
1546 
1547 static int
1548 netmap_vp_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags)
1549 {
1550 	struct netmap_kring *kring = &na->rx_rings[ring_nr];
1551 	struct netmap_ring *ring = kring->ring;
1552 	u_int nm_i, lim = kring->nkr_num_slots - 1;
1553 	u_int head = nm_rxsync_prologue(kring);
1554 	int n;
1555 
1556 	if (head > lim) {
1557 		D("ouch dangerous reset!!!");
1558 		n = netmap_ring_reinit(kring);
1559 		goto done;
1560 	}
1561 
1562 	/* First part, import newly received packets. */
1563 	/* actually nothing to do here, they are already in the kring */
1564 
1565 	/* Second part, skip past packets that userspace has released. */
1566 	nm_i = kring->nr_hwcur;
1567 	if (nm_i != head) {
1568 		/* consistency check, but nothing really important here */
1569 		for (n = 0; likely(nm_i != head); n++) {
1570 			struct netmap_slot *slot = &ring->slot[nm_i];
1571 			void *addr = BDG_NMB(na, slot);
1572 
1573 			if (addr == netmap_buffer_base) { /* bad buf */
1574 				D("bad buffer index %d, ignore ?",
1575 					slot->buf_idx);
1576 			}
1577 			slot->flags &= ~NS_BUF_CHANGED;
1578 			nm_i = nm_next(nm_i, lim);
1579 		}
1580 		kring->nr_hwcur = head;
1581 	}
1582 
1583 	/* tell userspace that there are new packets */
1584 	nm_rxsync_finalize(kring);
1585 	n = 0;
1586 done:
1587 	return n;
1588 }
1589 
1590 /*
1591  * user process reading from a VALE switch.
1592  * Already protected against concurrent calls from userspace,
1593  * but we must acquire the queue's lock to protect against
1594  * writers on the same queue.
1595  */
1596 static int
1597 bdg_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags)
1598 {
1599 	struct netmap_kring *kring = &na->rx_rings[ring_nr];
1600 	int n;
1601 
1602 	mtx_lock(&kring->q_lock);
1603 	n = netmap_vp_rxsync(na, ring_nr, flags);
1604 	mtx_unlock(&kring->q_lock);
1605 	return n;
1606 }
1607 
1608 
1609 static int
1610 bdg_netmap_attach(struct nmreq *nmr, struct ifnet *ifp)
1611 {
1612 	struct netmap_vp_adapter *vpna;
1613 	struct netmap_adapter *na;
1614 	int error;
1615 
1616 	vpna = malloc(sizeof(*vpna), M_DEVBUF, M_NOWAIT | M_ZERO);
1617 	if (vpna == NULL)
1618 		return ENOMEM;
1619 
1620  	na = &vpna->up;
1621 
1622 	na->ifp = ifp;
1623 
1624 	/* bound checking */
1625 	na->num_tx_rings = nmr->nr_tx_rings;
1626 	nm_bound_var(&na->num_tx_rings, 1, 1, NM_BDG_MAXRINGS, NULL);
1627 	nmr->nr_tx_rings = na->num_tx_rings; // write back
1628 	na->num_rx_rings = nmr->nr_rx_rings;
1629 	nm_bound_var(&na->num_rx_rings, 1, 1, NM_BDG_MAXRINGS, NULL);
1630 	nmr->nr_rx_rings = na->num_rx_rings; // write back
1631 	nm_bound_var(&nmr->nr_tx_slots, NM_BRIDGE_RINGSIZE,
1632 			1, NM_BDG_MAXSLOTS, NULL);
1633 	na->num_tx_desc = nmr->nr_tx_slots;
1634 	nm_bound_var(&nmr->nr_rx_slots, NM_BRIDGE_RINGSIZE,
1635 			1, NM_BDG_MAXSLOTS, NULL);
1636 	na->num_rx_desc = nmr->nr_rx_slots;
1637 	vpna->offset = 0;
1638 
1639 	na->na_flags |= NAF_BDG_MAYSLEEP | NAF_MEM_OWNER;
1640 	na->nm_txsync = bdg_netmap_txsync;
1641 	na->nm_rxsync = bdg_netmap_rxsync;
1642 	na->nm_register = bdg_netmap_reg;
1643 	na->nm_dtor = netmap_adapter_vp_dtor;
1644 	na->nm_krings_create = netmap_vp_krings_create;
1645 	na->nm_krings_delete = netmap_vp_krings_delete;
1646 	na->nm_mem = netmap_mem_private_new(NM_IFPNAME(na->ifp),
1647 			na->num_tx_rings, na->num_tx_desc,
1648 			na->num_rx_rings, na->num_rx_desc);
1649 	/* other nmd fields are set in the common routine */
1650 	error = netmap_attach_common(na);
1651 	if (error) {
1652 		free(vpna, M_DEVBUF);
1653 		return error;
1654 	}
1655 	return 0;
1656 }
1657 
1658 
1659 static void
1660 netmap_bwrap_dtor(struct netmap_adapter *na)
1661 {
1662 	struct netmap_bwrap_adapter *bna = (struct netmap_bwrap_adapter*)na;
1663 	struct netmap_adapter *hwna = bna->hwna;
1664 	struct nm_bridge *b = bna->up.na_bdg,
1665 		*bh = bna->host.na_bdg;
1666 	struct ifnet *ifp = na->ifp;
1667 
1668 	ND("na %p", na);
1669 
1670 	if (b) {
1671 		netmap_bdg_detach_common(b, bna->up.bdg_port,
1672 			(bh ? bna->host.bdg_port : -1));
1673 	}
1674 
1675 	hwna->na_private = NULL;
1676 	netmap_adapter_put(hwna);
1677 
1678 	bzero(ifp, sizeof(*ifp));
1679 	free(ifp, M_DEVBUF);
1680 	na->ifp = NULL;
1681 
1682 }
1683 
1684 
1685 /*
1686  * Intr callback for NICs connected to a bridge.
1687  * Simply ignore tx interrupts (maybe we could try to recover space ?)
1688  * and pass received packets from nic to the bridge.
1689  *
1690  * XXX TODO check locking: this is called from the interrupt
1691  * handler so we should make sure that the interface is not
1692  * disconnected while passing down an interrupt.
1693  *
1694  * Note, no user process can access this NIC or the host stack.
1695  * The only significant part of the ring is the slots,
1696  * and head/cur/tail are set from the kring as needed
1697  * (part as a receive ring, part as a transmit ring).
1698  *
1699  * callback that overwrites the hwna notify callback.
1700  * Packets come from the outside or from the host stack and are put on an hwna rx ring.
1701  * The bridge wrapper then sends the packets through the bridge.
1702  */
1703 static int
1704 netmap_bwrap_intr_notify(struct netmap_adapter *na, u_int ring_nr, enum txrx tx, int flags)
1705 {
1706 	struct ifnet *ifp = na->ifp;
1707 	struct netmap_bwrap_adapter *bna = na->na_private;
1708 	struct netmap_vp_adapter *hostna = &bna->host;
1709 	struct netmap_kring *kring, *bkring;
1710 	struct netmap_ring *ring;
1711 	int is_host_ring = ring_nr == na->num_rx_rings;
1712 	struct netmap_vp_adapter *vpna = &bna->up;
1713 	int error = 0;
1714 
1715 	if (netmap_verbose)
1716 	    D("%s %s%d 0x%x", NM_IFPNAME(ifp),
1717 		(tx == NR_TX ? "TX" : "RX"), ring_nr, flags);
1718 
1719 	if (flags & NAF_DISABLE_NOTIFY) {
1720 		kring = tx == NR_TX ? na->tx_rings : na->rx_rings;
1721 		bkring = tx == NR_TX ? vpna->up.rx_rings : vpna->up.tx_rings;
1722 		if (kring[ring_nr].nkr_stopped)
1723 			netmap_disable_ring(&bkring[ring_nr]);
1724 		else
1725 			bkring[ring_nr].nkr_stopped = 0;
1726 		return 0;
1727 	}
1728 
1729 	if (ifp == NULL || !(ifp->if_capenable & IFCAP_NETMAP))
1730 		return 0;
1731 
1732 	/* we only care about receive interrupts */
1733 	if (tx == NR_TX)
1734 		return 0;
1735 
1736 	kring = &na->rx_rings[ring_nr];
1737 	ring = kring->ring;
1738 
1739 	/* make sure the ring is not disabled */
1740 	if (nm_kr_tryget(kring))
1741 		return 0;
1742 
1743 	if (is_host_ring && hostna->na_bdg == NULL) {
1744 		error = bna->save_notify(na, ring_nr, tx, flags);
1745 		goto put_out;
1746 	}
1747 
1748 	/* Here we expect ring->head = ring->cur = ring->tail
1749 	 * because everything has been released from the previous round.
1750 	 * However the ring is shared and we might have info from
1751 	 * the wrong side (the tx ring). Hence we overwrite with
1752 	 * the info from the rx kring.
1753 	 */
1754 	if (netmap_verbose)
1755 	    D("%s head %d cur %d tail %d (kring %d %d %d)",  NM_IFPNAME(ifp),
1756 		ring->head, ring->cur, ring->tail,
1757 		kring->rhead, kring->rcur, kring->rtail);
1758 
1759 	ring->head = kring->rhead;
1760 	ring->cur = kring->rcur;
1761 	ring->tail = kring->rtail;
1762 
1763 	/* simulate a user wakeup on the rx ring */
1764 	if (is_host_ring) {
1765 		netmap_rxsync_from_host(na, NULL, NULL);
1766 		vpna = hostna;
1767 		ring_nr = 0;
1768 	} else {
1769 		/* fetch packets that have arrived.
1770 		 * XXX maybe do this in a loop ?
1771 		 */
1772 		error = na->nm_rxsync(na, ring_nr, 0);
1773 		if (error)
1774 			goto put_out;
1775 	}
1776 	if (kring->nr_hwcur == kring->nr_hwtail && netmap_verbose) {
1777 		D("how strange, interrupt with no packets on %s",
1778 			NM_IFPNAME(ifp));
1779 		goto put_out;
1780 	}
1781 
1782 	/* new packets are ring->cur to ring->tail, and the bkring
1783 	 * had hwcur == ring->cur. So advance ring->cur to ring->tail
1784 	 * to push all packets out.
1785 	 */
1786 	ring->head = ring->cur = ring->tail;
1787 
1788 	/* also set tail to what the bwrap expects */
1789 	bkring = &vpna->up.tx_rings[ring_nr];
1790 	ring->tail = bkring->nr_hwtail; // rtail too ?
1791 
1792 	/* pass packets to the switch */
1793 	nm_txsync_prologue(bkring); // XXX error checking ?
1794 	netmap_vp_txsync(vpna, ring_nr, flags);
1795 
1796 	/* mark all buffers as released on this ring */
1797 	ring->head = ring->cur = kring->nr_hwtail;
1798 	ring->tail = kring->rtail;
1799 	/* another call to actually release the buffers */
1800 	if (!is_host_ring) {
1801 		error = na->nm_rxsync(na, ring_nr, 0);
1802 	} else {
1803 		/* mark all packets as released, as in the
1804 		 * second part of netmap_rxsync_from_host()
1805 		 */
1806 		kring->nr_hwcur = kring->nr_hwtail;
1807 		nm_rxsync_finalize(kring);
1808 	}
1809 
1810 put_out:
1811 	nm_kr_put(kring);
1812 	return error;
1813 }
1814 
1815 
1816 static int
1817 netmap_bwrap_register(struct netmap_adapter *na, int onoff)
1818 {
1819 	struct netmap_bwrap_adapter *bna =
1820 		(struct netmap_bwrap_adapter *)na;
1821 	struct netmap_adapter *hwna = bna->hwna;
1822 	struct netmap_vp_adapter *hostna = &bna->host;
1823 	int error;
1824 
1825 	ND("%s %s", NM_IFPNAME(na->ifp), onoff ? "on" : "off");
1826 
1827 	if (onoff) {
1828 		int i;
1829 
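		/* make the hw adapter (and the host port, if attached)
		 * use the same buffer lookup table as the bwrap, so all
		 * of them resolve slot buf_idx values consistently.
		 */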
1830 		hwna->na_lut = na->na_lut;
1831 		hwna->na_lut_objtotal = na->na_lut_objtotal;
1832 
1833 		if (hostna->na_bdg) {
1834 			hostna->up.na_lut = na->na_lut;
1835 			hostna->up.na_lut_objtotal = na->na_lut_objtotal;
1836 		}
1837 
1838 		/* cross-link the netmap rings: the original number of
1839 		 * rings comes from hwna, and rx rings on one side map to
1840 		 * tx rings on the other (the <= bound also covers the host ring).
1841 		 */
1842 		for (i = 0; i <= na->num_rx_rings; i++) {
1843 			hwna->tx_rings[i].nkr_num_slots = na->rx_rings[i].nkr_num_slots;
1844 			hwna->tx_rings[i].ring = na->rx_rings[i].ring;
1845 		}
1846 		for (i = 0; i <= na->num_tx_rings; i++) {
1847 			hwna->rx_rings[i].nkr_num_slots = na->tx_rings[i].nkr_num_slots;
1848 			hwna->rx_rings[i].ring = na->tx_rings[i].ring;
1849 		}
1850 	}
1851 
1852 	if (hwna->ifp) {
1853 		error = hwna->nm_register(hwna, onoff);
1854 		if (error)
1855 			return error;
1856 	}
1857 
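	/* pass the (de)registration down to the VALE port side as well */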
1858 	bdg_netmap_reg(na, onoff);
1859 
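	/* intercept the hwna notify callback: rx interrupts on the NIC
	 * must now be forwarded to the bridge (and the original callback
	 * restored on unregister).
	 */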
1860 	if (onoff) {
1861 		bna->save_notify = hwna->nm_notify;
1862 		hwna->nm_notify = netmap_bwrap_intr_notify;
1863 	} else {
1864 		hwna->nm_notify = bna->save_notify;
1865 		hwna->na_lut = NULL;
1866 		hwna->na_lut_objtotal = 0;
1867 	}
1868 
1869 	return 0;
1870 }
1871 
1872 
1873 static int
1874 netmap_bwrap_config(struct netmap_adapter *na, u_int *txr, u_int *txd,
1875 				    u_int *rxr, u_int *rxd)
1876 {
1877 	struct netmap_bwrap_adapter *bna =
1878 		(struct netmap_bwrap_adapter *)na;
1879 	struct netmap_adapter *hwna = bna->hwna;
1880 
1881 	/* forward the request */
1882 	netmap_update_config(hwna);
1883 	/* swap the results */
1884 	*txr = hwna->num_rx_rings;
1885 	*txd = hwna->num_rx_desc;
1886 	*rxr = hwna->num_tx_rings;
1887 	*rxd = hwna->num_tx_desc;
1888 
1889 	return 0;
1890 }
1891 
1892 
1893 static int
1894 netmap_bwrap_krings_create(struct netmap_adapter *na)
1895 {
1896 	struct netmap_bwrap_adapter *bna =
1897 		(struct netmap_bwrap_adapter *)na;
1898 	struct netmap_adapter *hwna = bna->hwna;
1899 	struct netmap_adapter *hostna = &bna->host.up;
1900 	int error;
1901 
1902 	ND("%s", NM_IFPNAME(na->ifp));
1903 
1904 	error = netmap_vp_krings_create(na);
1905 	if (error)
1906 		return error;
1907 
1908 	error = hwna->nm_krings_create(hwna);
1909 	if (error) {
1910 		netmap_vp_krings_delete(na);
1911 		return error;
1912 	}
1913 
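	/* the kring arrays created above have one extra entry past the
	 * hw rings; point the host port's krings at those host rings.
	 */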
1914 	hostna->tx_rings = na->tx_rings + na->num_tx_rings;
1915 	hostna->rx_rings = na->rx_rings + na->num_rx_rings;
1916 
1917 	return 0;
1918 }
1919 
1920 
1921 static void
1922 netmap_bwrap_krings_delete(struct netmap_adapter *na)
1923 {
1924 	struct netmap_bwrap_adapter *bna =
1925 		(struct netmap_bwrap_adapter *)na;
1926 	struct netmap_adapter *hwna = bna->hwna;
1927 
1928 	ND("%s", NM_IFPNAME(na->ifp));
1929 
1930 	hwna->nm_krings_delete(hwna);
1931 	netmap_vp_krings_delete(na);
1932 }
1933 
1934 
1935 /* notify method for the bridge-->hwna direction */
1936 static int
1937 netmap_bwrap_notify(struct netmap_adapter *na, u_int ring_n, enum txrx tx, int flags)
1938 {
1939 	struct netmap_bwrap_adapter *bna =
1940 		(struct netmap_bwrap_adapter *)na;
1941 	struct netmap_adapter *hwna = bna->hwna;
1942 	struct netmap_kring *kring, *hw_kring;
1943 	struct netmap_ring *ring;
1944 	u_int lim;
1945 	int error = 0;
1946 
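	/* the bridge only delivers packets into our rx rings,
	 * so a tx notification here makes no sense.
	 */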
1947 	if (tx == NR_TX)
1948 		return ENXIO;
1949 
1950 	kring = &na->rx_rings[ring_n];
1951 	hw_kring = &hwna->tx_rings[ring_n];
1952 	ring = kring->ring;
1953 	lim = kring->nkr_num_slots - 1;
1954 
1955 	if (hwna->ifp == NULL || !(hwna->ifp->if_capenable & IFCAP_NETMAP))
1956 		return 0;
1957 	/* first step: simulate a user wakeup on the rx ring */
1958 	netmap_vp_rxsync(na, ring_n, flags);
1959 	ND("%s[%d] PRE rx(c%3d t%3d l%3d) ring(h%3d c%3d t%3d) tx(c%3d ht%3d t%3d)",
1960 		NM_IFPNAME(na->ifp), ring_n,
1961 		kring->nr_hwcur, kring->nr_hwtail, kring->nkr_hwlease,
1962 		ring->head, ring->cur, ring->tail,
1963 		hw_kring->nr_hwcur, hw_kring->nr_hwtail, hw_kring->rtail);
1964 	/* second step: the simulated user consumes all new packets */
1965 	ring->head = ring->cur = ring->tail;
1966 
1967 	/* third step: the new packets are sent on the tx ring
1968 	 * (which is actually the same ring)
1969 	 */
1970 	/* set tail to what the hw expects */
1971 	ring->tail = hw_kring->rtail;
1972 	if (ring_n == na->num_rx_rings) {
1973 		netmap_txsync_to_host(hwna);
1974 	} else {
1975 		nm_txsync_prologue(&hwna->tx_rings[ring_n]); // XXX error checking ?
1976 		error = hwna->nm_txsync(hwna, ring_n, flags);
1977 	}
1978 
1979 	/* fourth step: now we are back on the rx ring */
1980 	/* claim ownership on all hw owned bufs */
1981 	ring->head = nm_next(ring->tail, lim); /* skip past reserved slot */
1982 	ring->tail = kring->rtail; /* restore saved value of tail, for safety */
1983 
1984 	/* fifth step: the user goes to sleep again, causing another rxsync */
1985 	netmap_vp_rxsync(na, ring_n, flags);
1986 	ND("%s[%d] PST rx(c%3d t%3d l%3d) ring(h%3d c%3d t%3d) tx(c%3d ht%3d t%3d)",
1987 		NM_IFPNAME(na->ifp), ring_n,
1988 		kring->nr_hwcur, kring->nr_hwtail, kring->nkr_hwlease,
1989 		ring->head, ring->cur, ring->tail,
1990 		hw_kring->nr_hwcur, hw_kring->nr_hwtail, hw_kring->rtail);
1991 
1992 	return error;
1993 }
1994 
1995 
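/* notify method for the host rings of the wrapped adapter:
 * redirect to the bwrap notify above, using the host rx ring
 * (index num_rx_rings) of the bwrap port.
 */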
1996 static int
1997 netmap_bwrap_host_notify(struct netmap_adapter *na, u_int ring_n, enum txrx tx, int flags)
1998 {
1999 	struct netmap_bwrap_adapter *bna = na->na_private;
2000 	struct netmap_adapter *port_na = &bna->up.up;
2001 	if (tx == NR_TX || ring_n != 0)
2002 		return ENXIO;
2003 	return netmap_bwrap_notify(port_na, port_na->num_rx_rings, NR_RX, flags);
2004 }
2005 
2006 
2007 /* attach a bridge wrapper to the 'real' device */
2008 static int
2009 netmap_bwrap_attach(struct ifnet *fake, struct ifnet *real)
2010 {
2011 	struct netmap_bwrap_adapter *bna;
2012 	struct netmap_adapter *na;
2013 	struct netmap_adapter *hwna = NA(real);
2014 	struct netmap_adapter *hostna;
2015 	int error;
2016 
2017 
2018 	bna = malloc(sizeof(*bna), M_DEVBUF, M_NOWAIT | M_ZERO);
2019 	if (bna == NULL)
2020 		return ENOMEM;
2021 
2022 	na = &bna->up.up;
2023 	na->ifp = fake;
2024 	/* fill in the ring data for the bwrap adapter, with rx/tx meanings
2025 	 * swapped. The real cross-linking will be done at register time,
2026 	 * when all the krings have been created.
2027 	 */
2028 	na->num_rx_rings = hwna->num_tx_rings;
2029 	na->num_tx_rings = hwna->num_rx_rings;
2030 	na->num_tx_desc = hwna->num_rx_desc;
2031 	na->num_rx_desc = hwna->num_tx_desc;
2032 	na->nm_dtor = netmap_bwrap_dtor;
2033 	na->nm_register = netmap_bwrap_register;
2034 	// na->nm_txsync = netmap_bwrap_txsync;
2035 	// na->nm_rxsync = netmap_bwrap_rxsync;
2036 	na->nm_config = netmap_bwrap_config;
2037 	na->nm_krings_create = netmap_bwrap_krings_create;
2038 	na->nm_krings_delete = netmap_bwrap_krings_delete;
2039 	na->nm_notify = netmap_bwrap_notify;
2040 	na->nm_mem = hwna->nm_mem;
2041 	na->na_private = na; /* prevent NIOCREGIF */
2042 	bna->up.retry = 1; /* XXX maybe this should depend on the hwna */
2043 
2044 	bna->hwna = hwna;
2045 	netmap_adapter_get(hwna);
2046 	hwna->na_private = bna; /* weak reference */
2047 
2048 	hostna = &bna->host.up;
2049 	hostna->ifp = hwna->ifp;
2050 	hostna->num_tx_rings = 1;
2051 	hostna->num_tx_desc = hwna->num_rx_desc;
2052 	hostna->num_rx_rings = 1;
2053 	hostna->num_rx_desc = hwna->num_tx_desc;
2054 	// hostna->nm_txsync = netmap_bwrap_host_txsync;
2055 	// hostna->nm_rxsync = netmap_bwrap_host_rxsync;
2056 	hostna->nm_notify = netmap_bwrap_host_notify;
2057 	hostna->nm_mem = na->nm_mem;
2058 	hostna->na_private = bna;
2059 
2060 	ND("%s<->%s txr %d txd %d rxr %d rxd %d",
2061 		fake->if_xname, real->if_xname,
2062 		na->num_tx_rings, na->num_tx_desc,
2063 		na->num_rx_rings, na->num_rx_desc);
2064 
2065 	error = netmap_attach_common(na);
2066 	if (error) {
2067 		netmap_adapter_put(hwna);
2068 		free(bna, M_DEVBUF);
2069 		return error;
2070 	}
2071 	return 0;
2072 }
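
/*
 * Illustrative sketch (not part of this file's build): a bwrap is
 * typically created when userspace attaches a NIC to a VALE switch
 * with the NETMAP_BDG_ATTACH command. The switch and interface names
 * below ("vale0", "em0") are examples, not requirements.
 *
 *	#include <sys/ioctl.h>
 *	#include <fcntl.h>
 *	#include <string.h>
 *	#include <net/netmap.h>
 *	#include <net/netmap_user.h>
 *
 *	struct nmreq req;
 *	int fd = open("/dev/netmap", O_RDWR);
 *
 *	memset(&req, 0, sizeof(req));
 *	req.nr_version = NETMAP_API;
 *	strncpy(req.nr_name, "vale0:em0", sizeof(req.nr_name));
 *	req.nr_cmd = NETMAP_BDG_ATTACH;
 *	if (ioctl(fd, NIOCREGIF, &req) < 0)
 *		perror("NETMAP_BDG_ATTACH");
 */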
2073 
2074 
2075 void
2076 netmap_init_bridges(void)
2077 {
2078 	int i;
2079 	bzero(nm_bridges, sizeof(struct nm_bridge) * NM_BRIDGES); /* safety */
2080 	for (i = 0; i < NM_BRIDGES; i++)
2081 		BDG_RWINIT(&nm_bridges[i]);
2082 }
2083 #endif /* WITH_VALE */
2084