xref: /freebsd/sys/dev/netmap/netmap_vale.c (revision 95d45410b5100e07f6f98450bcd841a8945d4726)
1 /*
2  * Copyright (C) 2013-2014 Universita` di Pisa. All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions
6  * are met:
7  *   1. Redistributions of source code must retain the above copyright
8  *      notice, this list of conditions and the following disclaimer.
9  *   2. Redistributions in binary form must reproduce the above copyright
10  *      notice, this list of conditions and the following disclaimer in the
11  *      documentation and/or other materials provided with the distribution.
12  *
13  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
14  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
16  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
17  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
18  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
19  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
20  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
21  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
22  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
23  * SUCH DAMAGE.
24  */
25 
26 
27 /*
28  * This module implements the VALE switch for netmap
29 
30 --- VALE SWITCH ---
31 
32 NMG_LOCK() serializes all modifications to switches and ports.
33 A switch cannot be deleted until all ports are gone.
34 
35 For each switch, an SX lock (RWlock on linux) protects
36 deletion of ports. When configuring a new port or deleting an
37 existing one, the lock is acquired in exclusive mode (after holding NMG_LOCK).
38 When forwarding, the lock is acquired in shared mode (without NMG_LOCK).
39 The lock is held throughout the entire forwarding cycle,
40 during which the thread may incur a page fault.
41 Hence it is important that sleepable shared locks are used.
42 
43 On the rx ring, the per-port lock is grabbed initially to reserve
44 a number of slots in the ring, then the lock is released,
45 packets are copied from source to destination, and then
46 the lock is acquired again and the receive ring is updated;
47 a sketch of this cycle is shown below. (A similar thing is done
48 on the tx ring for NIC and host stack ports attached to the switch.)
49 
50  */
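
/*
 * A minimal sketch (under #if 0, not compiled) of the reserve / copy /
 * update cycle described above. The function and variable names here
 * are invented for illustration; the real logic, including lease
 * completion and error handling, lives in nm_bdg_flush() below, built
 * on the nm_kr_space()/nm_kr_lease() primitives defined in this file.
 */
#if 0
static void
example_rx_lease_cycle(struct netmap_kring *kring, u_int wanted)
{
	u_int howmany, lease_idx, j;

	mtx_lock(&kring->q_lock);
	j = kring->nkr_hwlease;			/* first slot we may write */
	howmany = nm_kr_space(kring, 1);	/* free rx slots */
	if (wanted < howmany)
		howmany = wanted;
	lease_idx = nm_kr_lease(kring, howmany, 1);	/* reserve them */
	mtx_unlock(&kring->q_lock);

	/* copy packets into slots j .. j+howmany-1 with the lock NOT
	 * held; this may take page faults on user-supplied buffers.
	 */

	mtx_lock(&kring->q_lock);
	kring->nkr_leases[lease_idx] = j;	/* j advanced during the copy */
	/* nr_hwtail is advanced only when all preceding leases have
	 * completed as well.
	 */
	mtx_unlock(&kring->q_lock);
}
#endif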
51 
52 /*
53  * OS-specific code that is used only within this file.
54  * Other OS-specific code that must be accessed by drivers
55  * is present in netmap_kern.h
56  */
57 
58 #if defined(__FreeBSD__)
59 #include <sys/cdefs.h> /* prerequisite */
60 __FBSDID("$FreeBSD$");
61 
62 #include <sys/types.h>
63 #include <sys/errno.h>
64 #include <sys/param.h>	/* defines used in kernel.h */
65 #include <sys/kernel.h>	/* types used in module initialization */
66 #include <sys/conf.h>	/* cdevsw struct, UID, GID */
67 #include <sys/sockio.h>
68 #include <sys/socketvar.h>	/* struct socket */
69 #include <sys/malloc.h>
70 #include <sys/poll.h>
71 #include <sys/rwlock.h>
72 #include <sys/socket.h> /* sockaddrs */
73 #include <sys/selinfo.h>
74 #include <sys/sysctl.h>
75 #include <net/if.h>
76 #include <net/if_var.h>
77 #include <net/bpf.h>		/* BIOCIMMEDIATE */
78 #include <machine/bus.h>	/* bus_dmamap_* */
79 #include <sys/endian.h>
80 #include <sys/refcount.h>
81 
82 
83 #define BDG_RWLOCK_T		struct rwlock
84 
85 #define	BDG_RWINIT(b)		\
86 	rw_init_flags(&(b)->bdg_lock, "bdg lock", RW_NOWITNESS)
87 #define BDG_WLOCK(b)		rw_wlock(&(b)->bdg_lock)
88 #define BDG_WUNLOCK(b)		rw_wunlock(&(b)->bdg_lock)
89 #define BDG_RLOCK(b)		rw_rlock(&(b)->bdg_lock)
90 #define BDG_RTRYLOCK(b)		rw_try_rlock(&(b)->bdg_lock)
91 #define BDG_RUNLOCK(b)		rw_runlock(&(b)->bdg_lock)
92 #define BDG_RWDESTROY(b)	rw_destroy(&(b)->bdg_lock)
93 
94 
95 #elif defined(linux)
96 
97 #include "bsd_glue.h"
98 
99 #elif defined(__APPLE__)
100 
101 #warning OSX support is only partial
102 #include "osx_glue.h"
103 
104 #else
105 
106 #error	Unsupported platform
107 
108 #endif /* unsupported */
109 
110 /*
111  * common headers
112  */
113 
114 #include <net/netmap.h>
115 #include <dev/netmap/netmap_kern.h>
116 #include <dev/netmap/netmap_mem2.h>
117 
118 #ifdef WITH_VALE
119 
120 /*
121  * system parameters (most of them in netmap_kern.h)
122  * NM_NAME	prefix for switch port names, default "vale"
123  * NM_BDG_MAXPORTS	number of ports
124  * NM_BRIDGES	max number of switches in the system.
125  *	XXX should become a sysctl or tunable
126  *
127  * Switch ports are named valeX:Y where X is the switch name and Y
128  * is the port. If Y matches a physical interface name, the port is
129  * connected to a physical device.
130  *
131  * Unlike physical interfaces, switch ports use their own memory region
132  * for rings and buffers.
133  * The virtual interfaces use per-queue locks instead of the core lock.
134  * In the tx loop, we aggregate traffic in batches to make all operations
135  * faster. The batch size is bridge_batch.
136  */
137 #define NM_BDG_MAXRINGS		16	/* XXX unclear how many. */
138 #define NM_BDG_MAXSLOTS		4096	/* XXX same as above */
139 #define NM_BRIDGE_RINGSIZE	1024	/* in the device */
140 #define NM_BDG_HASH		1024	/* forwarding table entries */
141 #define NM_BDG_BATCH		1024	/* entries in the forwarding buffer */
142 #define NM_MULTISEG		64	/* max size of a chain of bufs */
143 /* actual size of the tables */
144 #define NM_BDG_BATCH_MAX	(NM_BDG_BATCH + NM_MULTISEG)
145 /* NM_FT_NULL terminates a list of slots in the ft */
146 #define NM_FT_NULL		NM_BDG_BATCH_MAX
147 #define	NM_BRIDGES		8	/* number of bridges */
148 
149 
150 /*
151  * bridge_batch is set via sysctl to the max batch size to be
152  * used in the bridge. The actual value may be larger as the
153  * last packet in the block may overflow the size.
154  */
155 int bridge_batch = NM_BDG_BATCH; /* bridge batch size */
156 SYSCTL_DECL(_dev_netmap);
157 SYSCTL_INT(_dev_netmap, OID_AUTO, bridge_batch, CTLFLAG_RW, &bridge_batch, 0, "");
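/* e.g. tunable at runtime with: sysctl dev.netmap.bridge_batch=256 */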
158 
159 
160 static int bdg_netmap_attach(struct nmreq *nmr, struct ifnet *ifp);
161 static int bdg_netmap_reg(struct netmap_adapter *na, int onoff);
162 static int netmap_bwrap_attach(struct ifnet *, struct ifnet *);
163 static int netmap_bwrap_register(struct netmap_adapter *, int onoff);
164 int kern_netmap_regif(struct nmreq *nmr);
165 
166 /*
167  * For each output interface, nm_bdg_q is used to construct a list.
168  * bq_len is the number of output buffers (we can have coalescing
169  * during the copy).
170  */
171 struct nm_bdg_q {
172 	uint16_t bq_head;
173 	uint16_t bq_tail;
174 	uint32_t bq_len;	/* number of buffers */
175 };
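
/*
 * Illustration (under #if 0, not compiled): the output queues are
 * linked lists threaded through the ft[] array via the ft_next field,
 * so appending packet i to a destination queue d is O(1). This mirrors
 * the code in nm_bdg_flush() below, where 'ft', 'd' and 'i' are locals.
 */
#if 0
	if (d->bq_head == NM_FT_NULL) {		/* first packet for d */
		d->bq_head = d->bq_tail = i;
	} else {
		ft[d->bq_tail].ft_next = i;	/* link after the tail */
		d->bq_tail = i;
	}
	d->bq_len += ft[i].ft_frags;		/* account all fragments */
#endif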
176 
177 /* XXX revise this */
178 struct nm_hash_ent {
179 	uint64_t	mac;	/* the top 2 bytes are the epoch */
180 	uint64_t	ports;
181 };
182 
183 /*
184  * nm_bridge is a descriptor for a VALE switch.
185  * Interfaces for a bridge are all in bdg_ports[].
186  * The array has a fixed size; an empty entry does not terminate
187  * the search, but lookups only occur on attach/detach, so we
188  * don't mind if they are slow.
189  *
190  * The bridge is non-blocking on the transmit ports: excess
191  * packets are dropped if there is no room on the output port.
192  *
193  * bdg_lock protects accesses to the bdg_ports array.
194  * This is a rw lock (or equivalent).
195  */
196 struct nm_bridge {
197 	/* XXX what is the proper alignment/layout ? */
198 	BDG_RWLOCK_T	bdg_lock;	/* protects bdg_ports */
199 	int		bdg_namelen;
200 	uint32_t	bdg_active_ports; /* 0 means free */
201 	char		bdg_basename[IFNAMSIZ];
202 
203 	/* Indexes of active ports (up to active_ports)
204 	 * and all other remaining ports.
205 	 */
206 	uint8_t		bdg_port_index[NM_BDG_MAXPORTS];
207 
208 	struct netmap_vp_adapter *bdg_ports[NM_BDG_MAXPORTS];
209 
210 
211 	/*
212 	 * The function to decide the destination port.
213 	 * It returns either the index of the destination port,
214 	 * NM_BDG_BROADCAST to broadcast this packet, or NM_BDG_NOPORT to
215 	 * drop this packet.  ring_nr is the source ring index, and the
216 	 * function may overwrite this value to forward this packet to a
217 	 * different ring index.
218 	 * This function must be set by netmap_bdg_ctl().
219 	 */
220 	bdg_lookup_fn_t nm_bdg_lookup;
221 
222 	/* the forwarding table, MAC+ports.
223 	 * XXX should be changed to an argument to be passed to
224 	 * the lookup function, and allocated on attach
225 	 */
226 	struct nm_hash_ent ht[NM_BDG_HASH];
227 };
228 
229 
230 /*
231  * XXX in principle nm_bridges could be created dynamically
232  * Right now we have a static array and deletions are protected
233  * by an exclusive lock.
234  */
235 struct nm_bridge nm_bridges[NM_BRIDGES];
236 
237 
238 /*
239  * this is a slightly optimized copy routine which rounds
240  * to multiple of 64 bytes and is often faster than dealing
241  * with other odd sizes. We assume there is enough room
242  * in the source and destination buffers.
243  *
244  * XXX only for multiples of 64 bytes, non overlapped.
245  */
246 static inline void
247 pkt_copy(void *_src, void *_dst, int l)
248 {
249         uint64_t *src = _src;
250         uint64_t *dst = _dst;
251         if (unlikely(l >= 1024)) {
252                 memcpy(dst, src, l);
253                 return;
254         }
255         for (; likely(l > 0); l-=64) {
256                 *dst++ = *src++;
257                 *dst++ = *src++;
258                 *dst++ = *src++;
259                 *dst++ = *src++;
260                 *dst++ = *src++;
261                 *dst++ = *src++;
262                 *dst++ = *src++;
263                 *dst++ = *src++;
264         }
265 }
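
/*
 * Example (under #if 0, not compiled): callers of pkt_copy() round the
 * length up to a multiple of 64 and make sure both buffers can hold
 * the rounded size, as nm_bdg_flush() does:
 */
#if 0
	copy_len = (copy_len + 63) & ~63;	/* round to multiple of 64 */
	pkt_copy(src, dst, (int)copy_len);
#endif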
266 
267 
268 /*
269  * locate a bridge among the existing ones.
270  * MUST BE CALLED WITH NMG_LOCK()
271  *
272  * a ':' in the name terminates the bridge name. Otherwise, just NM_NAME.
273  * We assume that this is called with a name of at least NM_NAME chars.
274  */
275 static struct nm_bridge *
276 nm_find_bridge(const char *name, int create)
277 {
278 	int i, l, namelen;
279 	struct nm_bridge *b = NULL;
280 
281 	NMG_LOCK_ASSERT();
282 
283 	namelen = strlen(NM_NAME);	/* base length */
284 	l = name ? strlen(name) : 0;		/* actual length */
285 	if (l < namelen) {
286 		D("invalid bridge name %s", name ? name : "(null)");
287 		return NULL;
288 	}
289 	for (i = namelen + 1; i < l; i++) {
290 		if (name[i] == ':') {
291 			namelen = i;
292 			break;
293 		}
294 	}
295 	if (namelen >= IFNAMSIZ)
296 		namelen = IFNAMSIZ;
297 	ND("--- prefix is '%.*s' ---", namelen, name);
298 
299 	/* lookup the name, remember empty slot if there is one */
300 	for (i = 0; i < NM_BRIDGES; i++) {
301 		struct nm_bridge *x = nm_bridges + i;
302 
303 		if (x->bdg_active_ports == 0) {
304 			if (create && b == NULL)
305 				b = x;	/* record empty slot */
306 		} else if (x->bdg_namelen != namelen) {
307 			continue;
308 		} else if (strncmp(name, x->bdg_basename, namelen) == 0) {
309 			ND("found '%.*s' at %d", namelen, name, i);
310 			b = x;
311 			break;
312 		}
313 	}
314 	if (i == NM_BRIDGES && b) { /* name not found, can create entry */
315 		/* initialize the bridge */
316 		strncpy(b->bdg_basename, name, namelen);
317 		ND("create new bridge %s with ports %d", b->bdg_basename,
318 			b->bdg_active_ports);
319 		b->bdg_namelen = namelen;
320 		b->bdg_active_ports = 0;
321 		for (i = 0; i < NM_BDG_MAXPORTS; i++)
322 			b->bdg_port_index[i] = i;
323 		/* set the default function */
324 		b->nm_bdg_lookup = netmap_bdg_learning;
325 		/* reset the MAC address table */
326 		bzero(b->ht, sizeof(struct nm_hash_ent) * NM_BDG_HASH);
327 	}
328 	return b;
329 }
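
/*
 * Example (under #if 0, not compiled): with NM_NAME "vale", the name
 * "vale0:eth0" selects (or, when create != 0, creates) the bridge with
 * basename "vale0"; the "eth0" suffix is later matched against the
 * ports by netmap_get_bdg_na(). The names are only illustrative.
 */
#if 0
	struct nm_bridge *b = nm_find_bridge("vale0:eth0", 1 /* create */);
#endif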
330 
331 
332 /*
333  * Free the forwarding tables for rings attached to switch ports.
334  */
335 static void
336 nm_free_bdgfwd(struct netmap_adapter *na)
337 {
338 	int nrings, i;
339 	struct netmap_kring *kring;
340 
341 	NMG_LOCK_ASSERT();
342 	nrings = na->num_tx_rings;
343 	kring = na->tx_rings;
344 	for (i = 0; i < nrings; i++) {
345 		if (kring[i].nkr_ft) {
346 			free(kring[i].nkr_ft, M_DEVBUF);
347 			kring[i].nkr_ft = NULL; /* protect from freeing twice */
348 		}
349 	}
350 }
351 
352 
353 /*
354  * Allocate the forwarding tables for the rings attached to the bridge ports.
355  */
356 static int
357 nm_alloc_bdgfwd(struct netmap_adapter *na)
358 {
359 	int nrings, l, i, num_dstq;
360 	struct netmap_kring *kring;
361 
362 	NMG_LOCK_ASSERT();
363 	/* all port:rings + broadcast */
364 	num_dstq = NM_BDG_MAXPORTS * NM_BDG_MAXRINGS + 1;
365 	l = sizeof(struct nm_bdg_fwd) * NM_BDG_BATCH_MAX;
366 	l += sizeof(struct nm_bdg_q) * num_dstq;
367 	l += sizeof(uint16_t) * NM_BDG_BATCH_MAX;
368 
369 	nrings = netmap_real_tx_rings(na);
370 	kring = na->tx_rings;
371 	for (i = 0; i < nrings; i++) {
372 		struct nm_bdg_fwd *ft;
373 		struct nm_bdg_q *dstq;
374 		int j;
375 
376 		ft = malloc(l, M_DEVBUF, M_NOWAIT | M_ZERO);
377 		if (!ft) {
378 			nm_free_bdgfwd(na);
379 			return ENOMEM;
380 		}
381 		dstq = (struct nm_bdg_q *)(ft + NM_BDG_BATCH_MAX);
382 		for (j = 0; j < num_dstq; j++) {
383 			dstq[j].bq_head = dstq[j].bq_tail = NM_FT_NULL;
384 			dstq[j].bq_len = 0;
385 		}
386 		kring[i].nkr_ft = ft;
387 	}
388 	return 0;
389 }
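
/*
 * Illustration (under #if 0, not compiled) of the work area allocated
 * above for each tx ring: NM_BDG_BATCH_MAX ft entries, then one
 * nm_bdg_q per (port, ring) pair plus one for broadcast, then the
 * dsts[] index array. nm_bdg_flush() recovers the pointers with the
 * same arithmetic:
 */
#if 0
	dst_ents = (struct nm_bdg_q *)(ft + NM_BDG_BATCH_MAX);
	dsts = (uint16_t *)(dst_ents + NM_BDG_MAXPORTS * NM_BDG_MAXRINGS + 1);
	/* the queue for (dst_port, dst_ring) is: */
	d = dst_ents + dst_port * NM_BDG_MAXRINGS + dst_ring;
#endif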
390 
391 
392 static void
393 netmap_bdg_detach_common(struct nm_bridge *b, int hw, int sw)
394 {
395 	int s_hw = hw, s_sw = sw;
396 	int i, lim = b->bdg_active_ports;
397 	uint8_t tmp[NM_BDG_MAXPORTS];
398 
399 	/*
400 	New algorithm:
401 	make a copy of bdg_port_index;
402 	lookup NA(ifp)->bdg_port and SWNA(ifp)->bdg_port
403 	in the array of bdg_port_index, replacing them with
404 	entries from the bottom of the array;
405 	decrement bdg_active_ports;
406 	acquire BDG_WLOCK() and copy back the array.
407 	 */
408 
409 	if (netmap_verbose)
410 		D("detach %d and %d (lim %d)", hw, sw, lim);
411 	/* make a copy of the list of active ports, update it,
412 	 * and then copy back within BDG_WLOCK().
413 	 */
414 	memcpy(tmp, b->bdg_port_index, sizeof(tmp));
415 	for (i = 0; (hw >= 0 || sw >= 0) && i < lim; ) {
416 		if (hw >= 0 && tmp[i] == hw) {
417 			ND("detach hw %d at %d", hw, i);
418 			lim--; /* point to last active port */
419 			tmp[i] = tmp[lim]; /* swap with i */
420 			tmp[lim] = hw;	/* now this is inactive */
421 			hw = -1;
422 		} else if (sw >= 0 && tmp[i] == sw) {
423 			ND("detach sw %d at %d", sw, i);
424 			lim--;
425 			tmp[i] = tmp[lim];
426 			tmp[lim] = sw;
427 			sw = -1;
428 		} else {
429 			i++;
430 		}
431 	}
432 	if (hw >= 0 || sw >= 0) {
433 		D("XXX delete failed hw %d sw %d, should panic...", hw, sw);
434 	}
435 
436 	BDG_WLOCK(b);
437 	b->bdg_ports[s_hw] = NULL;
438 	if (s_sw >= 0) {
439 		b->bdg_ports[s_sw] = NULL;
440 	}
441 	memcpy(b->bdg_port_index, tmp, sizeof(tmp));
442 	b->bdg_active_ports = lim;
443 	BDG_WUNLOCK(b);
444 
445 	ND("now %d active ports", lim);
446 	if (lim == 0) {
447 		ND("marking bridge %s as free", b->bdg_basename);
448 		b->nm_bdg_lookup = NULL;
449 	}
450 }
451 
452 
453 static void
454 netmap_adapter_vp_dtor(struct netmap_adapter *na)
455 {
456 	struct netmap_vp_adapter *vpna = (struct netmap_vp_adapter*)na;
457 	struct nm_bridge *b = vpna->na_bdg;
458 	struct ifnet *ifp = na->ifp;
459 
460 	ND("%s has %d references", NM_IFPNAME(ifp), na->na_refcount);
461 
462 	if (b) {
463 		netmap_bdg_detach_common(b, vpna->bdg_port, -1);
464 	}
465 
466 	bzero(ifp, sizeof(*ifp));
467 	free(ifp, M_DEVBUF);
468 	na->ifp = NULL;
469 }
470 
471 
472 /* Try to get a reference to a netmap adapter attached to a VALE switch.
473  * If the adapter is found (or is created), this function returns 0, a
474  * non NULL pointer is returned into *na, and the caller holds a
475  * reference to the adapter.
476  * If an adapter is not found, then no reference is grabbed and the
477  * function returns an error code, or 0 if there is just a VALE prefix
478  * mismatch. Therefore the caller holds a reference when
479  * (*na != NULL && return == 0).
480  */
481 int
482 netmap_get_bdg_na(struct nmreq *nmr, struct netmap_adapter **na, int create)
483 {
484 	const char *name = nmr->nr_name;
485 	struct ifnet *ifp;
486 	int error = 0;
487 	struct netmap_adapter *ret;
488 	struct netmap_vp_adapter *vpna;
489 	struct nm_bridge *b;
490 	int i, j, cand = -1, cand2 = -1;
491 	int needed;
492 
493 	*na = NULL;     /* default return value */
494 
495 	/* first try to see if this is a bridge port. */
496 	NMG_LOCK_ASSERT();
497 	if (strncmp(name, NM_NAME, sizeof(NM_NAME) - 1)) {
498 		return 0;  /* no error, but no VALE prefix */
499 	}
500 
501 	b = nm_find_bridge(name, create);
502 	if (b == NULL) {
503 		D("no bridges available for '%s'", name);
504 		return (create ? ENOMEM : ENXIO);
505 	}
506 
507 	/* Now we are sure that name starts with the bridge's name,
508 	 * lookup the port in the bridge. We need to scan the entire
509 	 * list. It is not important to hold a WLOCK on the bridge
510 	 * during the search because NMG_LOCK already guarantees
511 	 * that there are no other possible writers.
512 	 */
513 
514 	/* lookup in the local list of ports */
515 	for (j = 0; j < b->bdg_active_ports; j++) {
516 		i = b->bdg_port_index[j];
517 		vpna = b->bdg_ports[i];
518 		// KASSERT(na != NULL);
519 		ifp = vpna->up.ifp;
520 		/* XXX make sure the name only contains one : */
521 		if (!strcmp(NM_IFPNAME(ifp), name)) {
522 			netmap_adapter_get(&vpna->up);
523 			ND("found existing if %s refs %d", name,
524 				vpna->na_bdg_refcount);
525 			*na = (struct netmap_adapter *)vpna;
526 			return 0;
527 		}
528 	}
529 	/* not found, should we create it? */
530 	if (!create)
531 		return ENXIO;
532 	/* yes we should, see if we have space to attach entries */
533 	needed = 2; /* in some cases we only need 1 */
534 	if (b->bdg_active_ports + needed >= NM_BDG_MAXPORTS) {
535 		D("bridge full %d, cannot create new port", b->bdg_active_ports);
536 		return ENOMEM;
537 	}
538 	/* record the next two ports available, but do not allocate yet */
539 	cand = b->bdg_port_index[b->bdg_active_ports];
540 	cand2 = b->bdg_port_index[b->bdg_active_ports + 1];
541 	ND("+++ bridge %s port %s used %d avail %d %d",
542 		b->bdg_basename, name, b->bdg_active_ports, cand, cand2);
543 
544 	/*
545 	 * try to see if there is a matching NIC with this name
546 	 * (after the bridge's name)
547 	 */
548 	ifp = ifunit_ref(name + b->bdg_namelen + 1);
549 	if (!ifp) { /* this is a virtual port */
550 		if (nmr->nr_cmd) {
551 			/* nr_cmd must be 0 for a virtual port */
552 			return EINVAL;
553 		}
554 
555 		/* create a struct ifnet for the new port.
556 		 * need M_NOWAIT as we are under NMG_LOCK
557 		 */
558 		ifp = malloc(sizeof(*ifp), M_DEVBUF, M_NOWAIT | M_ZERO);
559 		if (!ifp)
560 			return ENOMEM;
561 
562 		strcpy(ifp->if_xname, name);
563 		/* bdg_netmap_attach creates a struct netmap_adapter */
564 		error = bdg_netmap_attach(nmr, ifp);
565 		if (error) {
566 			D("error %d", error);
567 			free(ifp, M_DEVBUF);
568 			return error;
569 		}
570 		ret = NA(ifp);
571 		cand2 = -1;	/* only need one port */
572 	} else {  /* this is a NIC */
573 		struct ifnet *fake_ifp;
574 
575 		error = netmap_get_hw_na(ifp, &ret);
576 		if (error || ret == NULL)
577 			goto out;
578 
579 		/* make sure the NIC is not already in use */
580 		if (NETMAP_OWNED_BY_ANY(ret)) {
581 			D("NIC %s busy, cannot attach to bridge",
582 				NM_IFPNAME(ifp));
583 			error = EBUSY;
584 			goto out;
585 		}
586 		/* create a fake interface */
587 		fake_ifp = malloc(sizeof(*ifp), M_DEVBUF, M_NOWAIT | M_ZERO);
588 		if (!fake_ifp) {
589 			error = ENOMEM;
590 			goto out;
591 		}
592 		strcpy(fake_ifp->if_xname, name);
593 		error = netmap_bwrap_attach(fake_ifp, ifp);
594 		if (error) {
595 			free(fake_ifp, M_DEVBUF);
596 			goto out;
597 		}
598 		ret = NA(fake_ifp);
599 		if (nmr->nr_arg1 != NETMAP_BDG_HOST)
600 			cand2 = -1; /* only need one port */
601 		if_rele(ifp);
602 	}
603 	vpna = (struct netmap_vp_adapter *)ret;
604 
605 	BDG_WLOCK(b);
606 	vpna->bdg_port = cand;
607 	ND("NIC  %p to bridge port %d", vpna, cand);
608 	/* bind the port to the bridge (virtual ports are not active) */
609 	b->bdg_ports[cand] = vpna;
610 	vpna->na_bdg = b;
611 	b->bdg_active_ports++;
612 	if (cand2 >= 0) {
613 		struct netmap_vp_adapter *hostna = vpna + 1;
614 		/* also bind the host stack to the bridge */
615 		b->bdg_ports[cand2] = hostna;
616 		hostna->bdg_port = cand2;
617 		hostna->na_bdg = b;
618 		b->bdg_active_ports++;
619 		ND("host %p to bridge port %d", hostna, cand2);
620 	}
621 	ND("if %s refs %d", name, vpna->up.na_refcount);
622 	BDG_WUNLOCK(b);
623 	*na = ret;
624 	netmap_adapter_get(ret);
625 	return 0;
626 
627 out:
628 	if_rele(ifp);
629 
630 	return error;
631 }
632 
633 
634 /* Process NETMAP_BDG_ATTACH and NETMAP_BDG_DETACH */
635 static int
636 nm_bdg_attach(struct nmreq *nmr)
637 {
638 	struct netmap_adapter *na;
639 	struct netmap_if *nifp;
640 	struct netmap_priv_d *npriv;
641 	struct netmap_bwrap_adapter *bna;
642 	int error;
643 
644 	npriv = malloc(sizeof(*npriv), M_DEVBUF, M_NOWAIT|M_ZERO);
645 	if (npriv == NULL)
646 		return ENOMEM;
647 
648 	NMG_LOCK();
649 
650 	error = netmap_get_bdg_na(nmr, &na, 1 /* create if not exists */);
651 	if (error) /* no device, or another bridge or user owns the device */
652 		goto unlock_exit;
653 
654 	if (na == NULL) { /* VALE prefix missing */
655 		error = EINVAL;
656 		goto unlock_exit;
657 	}
658 
659 	if (na->active_fds > 0) { /* already registered */
660 		error = EBUSY;
661 		goto unref_exit;
662 	}
663 
664 	nifp = netmap_do_regif(npriv, na, nmr->nr_ringid, nmr->nr_flags, &error);
665 	if (!nifp) {
666 		goto unref_exit;
667 	}
668 
669 	bna = (struct netmap_bwrap_adapter*)na;
670 	bna->na_kpriv = npriv;
671 	NMG_UNLOCK();
672 	ND("registered %s to netmap-mode", NM_IFPNAME(na->ifp));
673 	return 0;
674 
675 unref_exit:
676 	netmap_adapter_put(na);
677 unlock_exit:
678 	NMG_UNLOCK();
679 	bzero(npriv, sizeof(*npriv));
680 	free(npriv, M_DEVBUF);
681 	return error;
682 }
683 
684 
685 static int
686 nm_bdg_detach(struct nmreq *nmr)
687 {
688 	struct netmap_adapter *na;
689 	int error;
690 	struct netmap_bwrap_adapter *bna;
691 	int last_instance;
692 
693 	NMG_LOCK();
694 	error = netmap_get_bdg_na(nmr, &na, 0 /* don't create */);
695 	if (error) { /* no device, or another bridge or user owns the device */
696 		goto unlock_exit;
697 	}
698 
699 	if (na == NULL) { /* VALE prefix missing */
700 		error = EINVAL;
701 		goto unlock_exit;
702 	}
703 
704 	bna = (struct netmap_bwrap_adapter *)na;
705 
706 	if (na->active_fds == 0) { /* not registered */
707 		error = EINVAL;
708 		goto unref_exit;
709 	}
710 
711 	last_instance = netmap_dtor_locked(bna->na_kpriv); /* unregister */
712 	if (!last_instance) {
713 		D("--- error, trying to detach an entry with active mmaps");
714 		error = EINVAL;
715 	} else {
716 		struct netmap_priv_d *npriv = bna->na_kpriv;
717 
718 		bna->na_kpriv = NULL;
719 		D("deleting priv");
720 
721 		bzero(npriv, sizeof(*npriv));
722 		free(npriv, M_DEVBUF);
723 	}
724 
725 unref_exit:
726 	netmap_adapter_put(na);
727 unlock_exit:
728 	NMG_UNLOCK();
729 	return error;
730 
731 }
732 
733 
734 /* exported to kernel callers, e.g. OVS ?
735  * Entry point.
736  * Called without NMG_LOCK.
737  */
738 int
739 netmap_bdg_ctl(struct nmreq *nmr, bdg_lookup_fn_t func)
740 {
741 	struct nm_bridge *b;
742 	struct netmap_adapter *na;
743 	struct netmap_vp_adapter *vpna;
744 	struct ifnet *iter;
745 	char *name = nmr->nr_name;
746 	int cmd = nmr->nr_cmd, namelen = strlen(name);
747 	int error = 0, i, j;
748 
749 	switch (cmd) {
750 	case NETMAP_BDG_ATTACH:
751 		error = nm_bdg_attach(nmr);
752 		break;
753 
754 	case NETMAP_BDG_DETACH:
755 		error = nm_bdg_detach(nmr);
756 		break;
757 
758 	case NETMAP_BDG_LIST:
759 		/* this is used to enumerate bridges and ports */
760 		if (namelen) { /* look up indexes of bridge and port */
761 			if (strncmp(name, NM_NAME, strlen(NM_NAME))) {
762 				error = EINVAL;
763 				break;
764 			}
765 			NMG_LOCK();
766 			b = nm_find_bridge(name, 0 /* don't create */);
767 			if (!b) {
768 				error = ENOENT;
769 				NMG_UNLOCK();
770 				break;
771 			}
772 
773 			error = ENOENT;
774 			for (j = 0; j < b->bdg_active_ports; j++) {
775 				i = b->bdg_port_index[j];
776 				vpna = b->bdg_ports[i];
777 				if (vpna == NULL) {
778 					D("---AAAAAAAAARGH-------");
779 					continue;
780 				}
781 				iter = vpna->up.ifp;
782 				/* the former and the latter identify a
783 				 * virtual port and a NIC, respectively
784 				 */
785 				if (!strcmp(iter->if_xname, name)) {
786 					/* bridge index */
787 					nmr->nr_arg1 = b - nm_bridges;
788 					nmr->nr_arg2 = i; /* port index */
789 					error = 0;
790 					break;
791 				}
792 			}
793 			NMG_UNLOCK();
794 		} else {
795 			/* return the first non-empty entry starting from
796 			 * bridge nr_arg1 and port nr_arg2.
797 			 *
798 			 * Users can detect the end of the same bridge by
799 			 * comparing the new and old values of nr_arg1, and can
800 			 * detect the end of all the bridges by error != 0
801 			 */
802 			i = nmr->nr_arg1;
803 			j = nmr->nr_arg2;
804 
805 			NMG_LOCK();
806 			for (error = ENOENT; i < NM_BRIDGES; i++) {
807 				b = nm_bridges + i;
808 				if (j >= b->bdg_active_ports) {
809 					j = 0; /* following bridges scan from 0 */
810 					continue;
811 				}
812 				nmr->nr_arg1 = i;
813 				nmr->nr_arg2 = j;
814 				j = b->bdg_port_index[j];
815 				vpna = b->bdg_ports[j];
816 				iter = vpna->up.ifp;
817 				strncpy(name, iter->if_xname, (size_t)IFNAMSIZ);
818 				error = 0;
819 				break;
820 			}
821 			NMG_UNLOCK();
822 		}
823 		break;
824 
825 	case NETMAP_BDG_LOOKUP_REG:
826 		/* register a lookup function to the given bridge.
827 		/* register a lookup function with the given bridge.
828 		 * nmr->nr_name may be just the bridge's name (including ':'
829 		 */
830 		if (!func) {
831 			error = EINVAL;
832 			break;
833 		}
834 		NMG_LOCK();
835 		b = nm_find_bridge(name, 0 /* don't create */);
836 		if (!b) {
837 			error = EINVAL;
838 		} else {
839 			b->nm_bdg_lookup = func;
840 		}
841 		NMG_UNLOCK();
842 		break;
843 
844 	case NETMAP_BDG_VNET_HDR:
845 		/* Valid lengths for the virtio-net header are 0 (no header),
846 		   10 and 12. */
847 		if (nmr->nr_arg1 != 0 &&
848 			nmr->nr_arg1 != sizeof(struct nm_vnet_hdr) &&
849 				nmr->nr_arg1 != 12) {
850 			error = EINVAL;
851 			break;
852 		}
853 		NMG_LOCK();
854 		error = netmap_get_bdg_na(nmr, &na, 0);
855 		if (na && !error) {
856 			vpna = (struct netmap_vp_adapter *)na;
857 			vpna->virt_hdr_len = nmr->nr_arg1;
858 			if (vpna->virt_hdr_len)
859 				vpna->mfs = NETMAP_BDG_BUF_SIZE(na->nm_mem);
860 			D("Using vnet_hdr_len %d for %p", vpna->virt_hdr_len, vpna);
861 			netmap_adapter_put(na);
862 		}
863 		NMG_UNLOCK();
864 		break;
865 
866 	default:
867 		D("invalid cmd (nmr->nr_cmd) (0x%x)", cmd);
868 		error = EINVAL;
869 		break;
870 	}
871 	return error;
872 }
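
/*
 * Example (under #if 0, not compiled): how these commands are normally
 * reached from userspace. The vale-ctl tool fills an nmreq and issues
 * an ioctl on an open /dev/netmap descriptor ('fd' is assumed here):
 */
#if 0
	struct nmreq nmr;

	bzero(&nmr, sizeof(nmr));
	nmr.nr_version = NETMAP_API;
	strncpy(nmr.nr_name, "vale0:em0", sizeof(nmr.nr_name) - 1);
	nmr.nr_cmd = NETMAP_BDG_ATTACH;	/* or NETMAP_BDG_DETACH, ... */
	ioctl(fd, NIOCREGIF, &nmr);
#endif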
873 
874 static int
875 netmap_vp_krings_create(struct netmap_adapter *na)
876 {
877 	u_int tailroom;
878 	int error, i;
879 	uint32_t *leases;
880 	u_int nrx = netmap_real_rx_rings(na);
881 
882 	/*
883 	 * Leases are attached to RX rings on vale ports
884 	 */
885 	tailroom = sizeof(uint32_t) * na->num_rx_desc * nrx;
886 
887 	error = netmap_krings_create(na, tailroom);
888 	if (error)
889 		return error;
890 
891 	leases = na->tailroom;
892 
893 	for (i = 0; i < nrx; i++) { /* Receive rings */
894 		na->rx_rings[i].nkr_leases = leases;
895 		leases += na->num_rx_desc;
896 	}
897 
898 	error = nm_alloc_bdgfwd(na);
899 	if (error) {
900 		netmap_krings_delete(na);
901 		return error;
902 	}
903 
904 	return 0;
905 }
906 
907 
908 static void
909 netmap_vp_krings_delete(struct netmap_adapter *na)
910 {
911 	nm_free_bdgfwd(na);
912 	netmap_krings_delete(na);
913 }
914 
915 
916 static int
917 nm_bdg_flush(struct nm_bdg_fwd *ft, u_int n,
918 	struct netmap_vp_adapter *na, u_int ring_nr);
919 
920 
921 /*
922  * Grab packets from a kring, move them into the ft structure
923  * associated with the tx (input) port. Max one instance per port,
924  * filtered on input (ioctl, poll or XXX).
925  * Returns the next position in the ring.
926  */
927 static int
928 nm_bdg_preflush(struct netmap_vp_adapter *na, u_int ring_nr,
929 	struct netmap_kring *kring, u_int end)
930 {
931 	struct netmap_ring *ring = kring->ring;
932 	struct nm_bdg_fwd *ft;
933 	u_int j = kring->nr_hwcur, lim = kring->nkr_num_slots - 1;
934 	u_int ft_i = 0;	/* start from 0 */
935 	u_int frags = 1; /* how many frags ? */
936 	struct nm_bridge *b = na->na_bdg;
937 
938 	/* To protect against modifications to the bridge we acquire a
939 	 * shared lock, waiting if we can sleep (if the source port is
940 	 * attached to a user process) or with a trylock otherwise (NICs).
941 	 */
942 	ND("wait rlock for %d packets", ((j > end ? lim+1 : 0) + end) - j);
943 	if (na->up.na_flags & NAF_BDG_MAYSLEEP)
944 		BDG_RLOCK(b);
945 	else if (!BDG_RTRYLOCK(b))
946 		return 0;
947 	ND(5, "rlock acquired for %d packets", ((j > end ? lim+1 : 0) + end) - j);
948 	ft = kring->nkr_ft;
949 
950 	for (; likely(j != end); j = nm_next(j, lim)) {
951 		struct netmap_slot *slot = &ring->slot[j];
952 		char *buf;
953 
954 		ft[ft_i].ft_len = slot->len;
955 		ft[ft_i].ft_flags = slot->flags;
956 
957 		ND("flags is 0x%x", slot->flags);
958 		/* this slot goes into a list so initialize the link field */
959 		ft[ft_i].ft_next = NM_FT_NULL;
960 		buf = ft[ft_i].ft_buf = (slot->flags & NS_INDIRECT) ?
961 			(void *)(uintptr_t)slot->ptr : BDG_NMB(&na->up, slot);
962 		if (unlikely(buf == NULL)) {
963 			RD(5, "NULL %s buffer pointer from %s slot %d len %d",
964 				(slot->flags & NS_INDIRECT) ? "INDIRECT" : "DIRECT",
965 				kring->name, j, ft[ft_i].ft_len);
966 			buf = ft[ft_i].ft_buf = NMB_VA(0); /* the 'null' buffer */
967 			ft[ft_i].ft_len = 0;
968 			ft[ft_i].ft_flags = 0;
969 		}
970 		__builtin_prefetch(buf);
971 		++ft_i;
972 		if (slot->flags & NS_MOREFRAG) {
973 			frags++;
974 			continue;
975 		}
976 		if (unlikely(netmap_verbose && frags > 1))
977 			RD(5, "%d frags at %d", frags, ft_i - frags);
978 		ft[ft_i - frags].ft_frags = frags;
979 		frags = 1;
980 		if (unlikely((int)ft_i >= bridge_batch))
981 			ft_i = nm_bdg_flush(ft, ft_i, na, ring_nr);
982 	}
983 	if (frags > 1) {
984 		D("truncate incomplete fragment at %d (%d frags)", ft_i, frags);
985 		// ft_i > 0, ft[ft_i-1].ft_flags has NS_MOREFRAG
986 		ft[ft_i - 1].ft_flags &= ~NS_MOREFRAG;
987 		ft[ft_i - frags].ft_frags = frags - 1;
988 	}
989 	if (ft_i)
990 		ft_i = nm_bdg_flush(ft, ft_i, na, ring_nr);
991 	BDG_RUNLOCK(b);
992 	return j;
993 }
994 
995 
996 /* ----- FreeBSD if_bridge hash function ------- */
997 
998 /*
999  * The following hash function is adapted from "Hash Functions" by Bob Jenkins
1000  * ("Algorithm Alley", Dr. Dobb's Journal, September 1997).
1001  *
1002  * http://www.burtleburtle.net/bob/hash/spooky.html
1003  */
1004 #define mix(a, b, c)                                                    \
1005 do {                                                                    \
1006         a -= b; a -= c; a ^= (c >> 13);                                 \
1007         b -= c; b -= a; b ^= (a << 8);                                  \
1008         c -= a; c -= b; c ^= (b >> 13);                                 \
1009         a -= b; a -= c; a ^= (c >> 12);                                 \
1010         b -= c; b -= a; b ^= (a << 16);                                 \
1011         c -= a; c -= b; c ^= (b >> 5);                                  \
1012         a -= b; a -= c; a ^= (c >> 3);                                  \
1013         b -= c; b -= a; b ^= (a << 10);                                 \
1014         c -= a; c -= b; c ^= (b >> 15);                                 \
1015 } while (/*CONSTCOND*/0)
1016 
1017 
1018 static __inline uint32_t
1019 nm_bridge_rthash(const uint8_t *addr)
1020 {
1021         uint32_t a = 0x9e3779b9, b = 0x9e3779b9, c = 0; // hash key
1022 
1023         b += addr[5] << 8;
1024         b += addr[4];
1025         a += addr[3] << 24;
1026         a += addr[2] << 16;
1027         a += addr[1] << 8;
1028         a += addr[0];
1029 
1030         mix(a, b, c);
1031 #define BRIDGE_RTHASH_MASK	(NM_BDG_HASH-1)
1032         return (c & BRIDGE_RTHASH_MASK);
1033 }
1034 
1035 #undef mix
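
/*
 * Example (under #if 0, not compiled): the learning bridge indexes its
 * forwarding table with this hash of a 6-byte MAC address; the sample
 * address below is arbitrary.
 */
#if 0
	uint8_t mac[6] = { 0x00, 0x1b, 0x21, 0x00, 0x00, 0x01 };
	uint32_t h = nm_bridge_rthash(mac);	/* in 0 .. NM_BDG_HASH-1 */
#endif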
1036 
1037 
1038 static int
1039 bdg_netmap_reg(struct netmap_adapter *na, int onoff)
1040 {
1041 	struct netmap_vp_adapter *vpna =
1042 		(struct netmap_vp_adapter*)na;
1043 	struct ifnet *ifp = na->ifp;
1044 
1045 	/* the interface is already attached to the bridge,
1046 	 * so we only need to toggle IFCAP_NETMAP.
1047 	 */
1048 	BDG_WLOCK(vpna->na_bdg);
1049 	if (onoff) {
1050 		ifp->if_capenable |= IFCAP_NETMAP;
1051 	} else {
1052 		ifp->if_capenable &= ~IFCAP_NETMAP;
1053 	}
1054 	BDG_WUNLOCK(vpna->na_bdg);
1055 	return 0;
1056 }
1057 
1058 
1059 /*
1060  * Lookup function for a learning bridge.
1061  * Update the hash table with the source address,
1062  * and then return the destination port index, and the
1063  * ring in *dst_ring (at the moment, always ring 0).
1064  */
1065 u_int
1066 netmap_bdg_learning(char *buf, u_int buf_len, uint8_t *dst_ring,
1067 		struct netmap_vp_adapter *na)
1068 {
1069 	struct nm_hash_ent *ht = na->na_bdg->ht;
1070 	uint32_t sh, dh;
1071 	u_int dst, mysrc = na->bdg_port;
1072 	uint64_t smac, dmac;
1073 
1074 	if (buf_len < 14) {
1075 		RD(5, "invalid buf length %d", buf_len);
1076 		return NM_BDG_NOPORT;
1077 	}
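	/* Build 64-bit hash keys from the destination (bytes 0..5) and
	 * source (bytes 6..11) MAC addresses. The keys are only ever
	 * compared for equality, so the byte order produced by the
	 * shifts and masks below is irrelevant.
	 */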
1078 	dmac = le64toh(*(uint64_t *)(buf)) & 0xffffffffffff;
1079 	smac = le64toh(*(uint64_t *)(buf + 4));
1080 	smac >>= 16;
1081 
1082 	/*
1083 	 * The hash is somewhat expensive, there might be some
1084 	 * worthwhile optimizations here.
1085 	 */
1086 	if ((buf[6] & 1) == 0) { /* valid src */
1087 		uint8_t *s = buf+6;
1088 		sh = nm_bridge_rthash(s); // XXX hash of source
1089 		/* update source port forwarding entry */
1090 		ht[sh].mac = smac;	/* XXX expire ? */
1091 		ht[sh].ports = mysrc;
1092 		if (netmap_verbose)
1093 		    D("src %02x:%02x:%02x:%02x:%02x:%02x on port %d",
1094 			s[0], s[1], s[2], s[3], s[4], s[5], mysrc);
1095 	}
1096 	dst = NM_BDG_BROADCAST;
1097 	if ((buf[0] & 1) == 0) { /* unicast */
1098 		dh = nm_bridge_rthash(buf); // XXX hash of dst
1099 		if (ht[dh].mac == dmac) {	/* found dst */
1100 			dst = ht[dh].ports;
1101 		}
1102 		/* XXX otherwise return NM_BDG_UNKNOWN ? */
1103 	}
1104 	*dst_ring = 0;
1105 	return dst;
1106 }
1107 
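/*
 * Sketch (under #if 0, not compiled) of an alternative lookup function
 * with the bdg_lookup_fn_t signature, as could be installed with
 * netmap_bdg_ctl() and NETMAP_BDG_LOOKUP_REG. The flood-everything
 * policy is purely illustrative.
 */
#if 0
static u_int
example_bdg_flood_lookup(char *buf, u_int buf_len, uint8_t *dst_ring,
		struct netmap_vp_adapter *na)
{
	(void)buf;
	(void)buf_len;
	(void)na;
	*dst_ring = 0;			/* broadcasts always use ring 0 */
	return NM_BDG_BROADCAST;	/* flood every packet */
}
#endif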
1108 
1109 /*
1110  * Available space in the ring. Only used in VALE code
1111  * and only with is_rx = 1
1112  */
1113 static inline uint32_t
1114 nm_kr_space(struct netmap_kring *k, int is_rx)
1115 {
1116 	int space;
1117 
1118 	if (is_rx) {
1119 		int busy = k->nkr_hwlease - k->nr_hwcur;
1120 		if (busy < 0)
1121 			busy += k->nkr_num_slots;
1122 		space = k->nkr_num_slots - 1 - busy;
1123 	} else {
1124 		/* XXX never used in this branch */
1125 		space = k->nr_hwtail - k->nkr_hwlease;
1126 		if (space < 0)
1127 			space += k->nkr_num_slots;
1128 	}
1129 #if 0
1130 	// sanity check
1131 	if (k->nkr_hwlease >= k->nkr_num_slots ||
1132 		k->nr_hwcur >= k->nkr_num_slots ||
1133 		k->nr_tail >= k->nkr_num_slots ||
1134 		busy < 0 ||
1135 		busy >= k->nkr_num_slots) {
1136 		D("invalid kring, cur %d tail %d lease %d lease_idx %d lim %d",
1137 			k->nr_hwcur, k->nr_hwtail, k->nkr_hwlease, k->nkr_lease_idx, k->nkr_num_slots);
1138 	}
1139 #endif
1140 	return space;
1141 }
1142 
1143 
1144 
1145 
1146 /* make a lease on the kring for N positions. return the
1147  * lease index
1148  * XXX only used in VALE code and with is_rx = 1
1149  */
1150 static inline uint32_t
1151 nm_kr_lease(struct netmap_kring *k, u_int n, int is_rx)
1152 {
1153 	uint32_t lim = k->nkr_num_slots - 1;
1154 	uint32_t lease_idx = k->nkr_lease_idx;
1155 
1156 	k->nkr_leases[lease_idx] = NR_NOSLOT;
1157 	k->nkr_lease_idx = nm_next(lease_idx, lim);
1158 
1159 	if (n > nm_kr_space(k, is_rx)) {
1160 		D("invalid request for %d slots", n);
1161 		panic("x");
1162 	}
1163 	/* XXX verify that there are n slots */
1164 	k->nkr_hwlease += n;
1165 	if (k->nkr_hwlease > lim)
1166 		k->nkr_hwlease -= lim + 1;
1167 
1168 	if (k->nkr_hwlease >= k->nkr_num_slots ||
1169 		k->nr_hwcur >= k->nkr_num_slots ||
1170 		k->nr_hwtail >= k->nkr_num_slots ||
1171 		k->nkr_lease_idx >= k->nkr_num_slots) {
1172 		D("invalid kring %s, cur %d tail %d lease %d lease_idx %d lim %d",
1173 			k->na->ifp->if_xname,
1174 			k->nr_hwcur, k->nr_hwtail, k->nkr_hwlease,
1175 			k->nkr_lease_idx, k->nkr_num_slots);
1176 	}
1177 	return lease_idx;
1178 }
1179 
1180 /*
1181  * This flush routine supports only unicast and broadcast but a large
1182  * number of ports, and lets us replace the learn and dispatch functions.
1183  */
1184 int
1185 nm_bdg_flush(struct nm_bdg_fwd *ft, u_int n, struct netmap_vp_adapter *na,
1186 		u_int ring_nr)
1187 {
1188 	struct nm_bdg_q *dst_ents, *brddst;
1189 	uint16_t num_dsts = 0, *dsts;
1190 	struct nm_bridge *b = na->na_bdg;
1191 	u_int i, j, me = na->bdg_port;
1192 
1193 	/*
1194 	 * The work area (pointed by ft) is followed by an array of
1195 	 * pointers to queues, dst_ents; there are NM_BDG_MAXRINGS
1196 	 * queues per port plus one for the broadcast traffic.
1197 	 * Then we have an array of destination indexes.
1198 	 */
1199 	dst_ents = (struct nm_bdg_q *)(ft + NM_BDG_BATCH_MAX);
1200 	dsts = (uint16_t *)(dst_ents + NM_BDG_MAXPORTS * NM_BDG_MAXRINGS + 1);
1201 
1202 	/* first pass: find a destination for each packet in the batch */
1203 	for (i = 0; likely(i < n); i += ft[i].ft_frags) {
1204 		uint8_t dst_ring = ring_nr; /* default, same ring as origin */
1205 		uint16_t dst_port, d_i;
1206 		struct nm_bdg_q *d;
1207 		uint8_t *buf = ft[i].ft_buf;
1208 		u_int len = ft[i].ft_len;
1209 
1210 		ND("slot %d frags %d", i, ft[i].ft_frags);
1211 		/* Drop the packet if the first fragment is too short to
1212 		   contain the whole virtio-net header. */
1213 		if (unlikely(na->virt_hdr_len > len))
1214 			continue;
1215 		if (len == na->virt_hdr_len) {
1216 			buf = ft[i+1].ft_buf;
1217 			len = ft[i+1].ft_len;
1218 		} else {
1219 			buf += na->virt_hdr_len;
1220 			len -= na->virt_hdr_len;
1221 		}
1222 		dst_port = b->nm_bdg_lookup(buf, len, &dst_ring, na);
1223 		if (netmap_verbose > 255)
1224 			RD(5, "slot %d port %d -> %d", i, me, dst_port);
1225 		if (dst_port == NM_BDG_NOPORT)
1226 			continue; /* this packet is identified to be dropped */
1227 		else if (unlikely(dst_port > NM_BDG_MAXPORTS))
1228 			continue;
1229 		else if (dst_port == NM_BDG_BROADCAST)
1230 			dst_ring = 0; /* broadcasts always go to ring 0 */
1231 		else if (unlikely(dst_port == me ||
1232 		    !b->bdg_ports[dst_port]))
1233 			continue;
1234 
1235 		/* get a position in the scratch pad */
1236 		d_i = dst_port * NM_BDG_MAXRINGS + dst_ring;
1237 		d = dst_ents + d_i;
1238 
1239 		/* append the first fragment to the list */
1240 		if (d->bq_head == NM_FT_NULL) { /* new destination */
1241 			d->bq_head = d->bq_tail = i;
1242 			/* remember this position to be scanned later */
1243 			if (dst_port != NM_BDG_BROADCAST)
1244 				dsts[num_dsts++] = d_i;
1245 		} else {
1246 			ft[d->bq_tail].ft_next = i;
1247 			d->bq_tail = i;
1248 		}
1249 		d->bq_len += ft[i].ft_frags;
1250 	}
1251 
1252 	/*
1253 	 * Broadcast traffic goes to ring 0 on all destinations.
1254 	 * So we need to add these rings to the list of ports to scan.
1255 	 * XXX at the moment we scan all NM_BDG_MAXPORTS ports, which is
1256 	 * expensive. We should keep a compact list of active destinations
1257 	 * so we could shorten this loop.
1258 	 */
1259 	brddst = dst_ents + NM_BDG_BROADCAST * NM_BDG_MAXRINGS;
1260 	if (brddst->bq_head != NM_FT_NULL) {
1261 		for (j = 0; likely(j < b->bdg_active_ports); j++) {
1262 			uint16_t d_i;
1263 			i = b->bdg_port_index[j];
1264 			if (unlikely(i == me))
1265 				continue;
1266 			d_i = i * NM_BDG_MAXRINGS;
1267 			if (dst_ents[d_i].bq_head == NM_FT_NULL)
1268 				dsts[num_dsts++] = d_i;
1269 		}
1270 	}
1271 
1272 	ND(5, "pass 1 done %d pkts %d dsts", n, num_dsts);
1273 	/* second pass: scan destinations (XXX will be modular somehow) */
1274 	for (i = 0; i < num_dsts; i++) {
1275 		struct ifnet *dst_ifp;
1276 		struct netmap_vp_adapter *dst_na;
1277 		struct netmap_kring *kring;
1278 		struct netmap_ring *ring;
1279 		u_int dst_nr, lim, j, d_i, next, brd_next;
1280 		u_int needed, howmany;
1281 		int retry = netmap_txsync_retry;
1282 		struct nm_bdg_q *d;
1283 		uint32_t my_start = 0, lease_idx = 0;
1284 		int nrings;
1285 		int virt_hdr_mismatch = 0;
1286 
1287 		d_i = dsts[i];
1288 		ND("second pass %d port %d", i, d_i);
1289 		d = dst_ents + d_i;
1290 		// XXX fix the division
1291 		dst_na = b->bdg_ports[d_i/NM_BDG_MAXRINGS];
1292 		/* protect from the lookup function returning an inactive
1293 		 * destination port
1294 		 */
1295 		if (unlikely(dst_na == NULL))
1296 			goto cleanup;
1297 		if (dst_na->up.na_flags & NAF_SW_ONLY)
1298 			goto cleanup;
1299 		dst_ifp = dst_na->up.ifp;
1300 		/*
1301 		 * The interface may be in !netmap mode in two cases:
1302 		 * - when na is attached but not activated yet;
1303 		 * - when na is being deactivated but is still attached.
1304 		 */
1305 		if (unlikely(!(dst_ifp->if_capenable & IFCAP_NETMAP))) {
1306 			ND("not in netmap mode!");
1307 			goto cleanup;
1308 		}
1309 
1310 		/* there is at least one either unicast or broadcast packet */
1311 		brd_next = brddst->bq_head;
1312 		next = d->bq_head;
1313 		/* we need to reserve this many slots. If fewer are
1314 		 * available, some packets will be dropped.
1315 		 * Packets may have multiple fragments, so there is a
1316 		 * chance that we may not use all of the slots we have
1317 		 * claimed, and we will need to handle the leftover
1318 		 * ones when we regain the lock.
1319 		 */
1320 		needed = d->bq_len + brddst->bq_len;
1321 
1322 		if (unlikely(dst_na->virt_hdr_len != na->virt_hdr_len)) {
1323 			RD(3, "virt_hdr_mismatch, src %d len %d", na->virt_hdr_len, dst_na->virt_hdr_len);
1324 			/* There is a virtio-net header/offloadings mismatch between
1325 			 * source and destination. The slower mismatch datapath will
1326 			 * be used to cope with all the mismatches.
1327 			 */
1328 			virt_hdr_mismatch = 1;
1329 			if (dst_na->mfs < na->mfs) {
1330 				/* We may need to do segmentation offloadings, and so
1331 				 * we may need a number of destination slots greater
1332 				 * than the number of input slots ('needed').
1333 				 * We look for the smallest integer 'x' which satisfies:
1334 				 *	needed * na->mfs + x * H <= x * dst_na->mfs
1335 				 * where 'H' is the length of the longest header that may
1336 				 * be replicated in the segmentation process (e.g. for
1337 				 * TCPv4 we must account for ethernet header, IP header
1338 				 * and TCPv4 header).
1339 				 */
1340 				needed = (needed * na->mfs) /
1341 						(dst_na->mfs - WORST_CASE_GSO_HEADER) + 1;
1342 				ND(3, "srcmtu=%u, dstmtu=%u, x=%u", na->mfs, dst_na->mfs, needed);
1343 			}
1344 		}
1345 
1346 		ND(5, "pass 2 dst %d is %x %s",
1347 			i, d_i, is_vp ? "virtual" : "nic/host");
1348 		dst_nr = d_i & (NM_BDG_MAXRINGS-1);
1349 		nrings = dst_na->up.num_rx_rings;
1350 		if (dst_nr >= nrings)
1351 			dst_nr = dst_nr % nrings;
1352 		kring = &dst_na->up.rx_rings[dst_nr];
1353 		ring = kring->ring;
1354 		lim = kring->nkr_num_slots - 1;
1355 
1356 retry:
1357 
1358 		if (dst_na->retry && retry) {
1359 			/* try to get some free slot from the previous run */
1360 			dst_na->up.nm_notify(&dst_na->up, dst_nr, NR_RX, 0);
1361 		}
1362 		/* reserve the buffers in the queue and an entry
1363 		 * to report completion, and drop lock.
1364 		 * XXX this might become a helper function.
1365 		 */
1366 		mtx_lock(&kring->q_lock);
1367 		if (kring->nkr_stopped) {
1368 			mtx_unlock(&kring->q_lock);
1369 			goto cleanup;
1370 		}
1371 		my_start = j = kring->nkr_hwlease;
1372 		howmany = nm_kr_space(kring, 1);
1373 		if (needed < howmany)
1374 			howmany = needed;
1375 		lease_idx = nm_kr_lease(kring, howmany, 1);
1376 		mtx_unlock(&kring->q_lock);
1377 
1378 		/* only retry if we need more than available slots */
1379 		if (retry && needed <= howmany)
1380 			retry = 0;
1381 
1382 		/* copy to the destination queue */
1383 		while (howmany > 0) {
1384 			struct netmap_slot *slot;
1385 			struct nm_bdg_fwd *ft_p, *ft_end;
1386 			u_int cnt;
1387 
1388 			/* find the queue from which we pick next packet.
1389 			 * NM_FT_NULL is always higher than valid indexes
1390 			 * so we never dereference it if the other list
1391 			 * has packets (and if both are empty we never
1392 			 * get here).
1393 			 */
1394 			if (next < brd_next) {
1395 				ft_p = ft + next;
1396 				next = ft_p->ft_next;
1397 			} else { /* insert broadcast */
1398 				ft_p = ft + brd_next;
1399 				brd_next = ft_p->ft_next;
1400 			}
1401 			cnt = ft_p->ft_frags; // cnt > 0
1402 			if (unlikely(cnt > howmany))
1403 			    break; /* no more space */
1404 			if (netmap_verbose && cnt > 1)
1405 				RD(5, "rx %d frags to %d", cnt, j);
1406 			ft_end = ft_p + cnt;
1407 			if (unlikely(virt_hdr_mismatch)) {
1408 				bdg_mismatch_datapath(na, dst_na, ft_p, ring, &j, lim, &howmany);
1409 			} else {
1410 				howmany -= cnt;
1411 				do {
1412 					char *dst, *src = ft_p->ft_buf;
1413 					size_t copy_len = ft_p->ft_len, dst_len = copy_len;
1414 
1415 					slot = &ring->slot[j];
1416 					dst = BDG_NMB(&dst_na->up, slot);
1417 
1418 					ND("send [%d] %d(%d) bytes at %s:%d",
1419 							i, (int)copy_len, (int)dst_len,
1420 							NM_IFPNAME(dst_ifp), j);
1421 					/* round to a multiple of 64 */
1422 					copy_len = (copy_len + 63) & ~63;
1423 
1424 					if (unlikely(copy_len > NETMAP_BUF_SIZE ||
1425 							dst_len > NETMAP_BUF_SIZE)) {
1426 						RD(5, "invalid len %d, down to 64", (int)copy_len);
1427 						copy_len = dst_len = 64; // XXX
1428 					}
1429 					if (ft_p->ft_flags & NS_INDIRECT) {
1430 						if (copyin(src, dst, copy_len)) {
1431 							// invalid user pointer, pretend len is 0
1432 							dst_len = 0;
1433 						}
1434 					} else {
1435 						//memcpy(dst, src, copy_len);
1436 						pkt_copy(src, dst, (int)copy_len);
1437 					}
1438 					slot->len = dst_len;
1439 					slot->flags = (cnt << 8) | NS_MOREFRAG;
1440 					j = nm_next(j, lim);
1441 					needed--;
1442 					ft_p++;
1443 				} while (ft_p != ft_end);
1444 				slot->flags = (cnt << 8); /* clear flag on last entry */
1445 			}
1446 			/* are we done ? */
1447 			if (next == NM_FT_NULL && brd_next == NM_FT_NULL)
1448 				break;
1449 		}
1450 		{
1451 		    /* current position */
1452 		    uint32_t *p = kring->nkr_leases; /* shorthand */
1453 		    uint32_t update_pos;
1454 		    int still_locked = 1;
1455 
1456 		    mtx_lock(&kring->q_lock);
1457 		    if (unlikely(howmany > 0)) {
1458 			/* we have not used all the buffers. If we are the last
1459 			 * lease holder we can recover the slots, otherwise we
1460 			 * must fill them with len 0 to mark empty packets.
1461 			 */
1462 			ND("leftover %d bufs", howmany);
1463 			if (nm_next(lease_idx, lim) == kring->nkr_lease_idx) {
1464 			    /* yes i am the last one */
1465 			    ND("roll back nkr_hwlease to %d", j);
1466 			    kring->nkr_hwlease = j;
1467 			} else {
1468 			    while (howmany-- > 0) {
1469 				ring->slot[j].len = 0;
1470 				ring->slot[j].flags = 0;
1471 				j = nm_next(j, lim);
1472 			    }
1473 			}
1474 		    }
1475 		    p[lease_idx] = j; /* report I am done */
1476 
1477 		    update_pos = kring->nr_hwtail;
1478 
1479 		    if (my_start == update_pos) {
1480 			/* all slots before my_start have been reported,
1481 			 * so scan subsequent leases to see if other ranges
1482 			 * have been completed, and do a selwakeup or txsync.
1483 			 */
1484 			while (lease_idx != kring->nkr_lease_idx &&
1485 				p[lease_idx] != NR_NOSLOT) {
1486 			    j = p[lease_idx];
1487 			    p[lease_idx] = NR_NOSLOT;
1488 			    lease_idx = nm_next(lease_idx, lim);
1489 			}
1490 			/* j is the new 'write' position. j != my_start
1491 			 * means there are new buffers to report
1492 			 */
1493 			if (likely(j != my_start)) {
1494 				kring->nr_hwtail = j;
1495 				still_locked = 0;
1496 				mtx_unlock(&kring->q_lock);
1497 				dst_na->up.nm_notify(&dst_na->up, dst_nr, NR_RX, 0);
1498 				if (dst_na->retry && retry--)
1499 					goto retry;
1500 			}
1501 		    }
1502 		    if (still_locked)
1503 			mtx_unlock(&kring->q_lock);
1504 		}
1505 cleanup:
1506 		d->bq_head = d->bq_tail = NM_FT_NULL; /* cleanup */
1507 		d->bq_len = 0;
1508 	}
1509 	brddst->bq_head = brddst->bq_tail = NM_FT_NULL; /* cleanup */
1510 	brddst->bq_len = 0;
1511 	return 0;
1512 }
1513 
1514 
1515 static int
1516 netmap_vp_txsync(struct netmap_vp_adapter *na, u_int ring_nr, int flags)
1517 {
1518 	struct netmap_kring *kring = &na->up.tx_rings[ring_nr];
1519 	u_int done;
1520 	u_int const lim = kring->nkr_num_slots - 1;
1521 	u_int const cur = kring->rcur;
1522 
1523 	if (bridge_batch <= 0) { /* testing only */
1524 		done = cur; // used all
1525 		goto done;
1526 	}
1527 	if (bridge_batch > NM_BDG_BATCH)
1528 		bridge_batch = NM_BDG_BATCH;
1529 
1530 	done = nm_bdg_preflush(na, ring_nr, kring, cur);
1531 done:
1532 	if (done != cur)
1533 		D("early break at %d/ %d, tail %d", done, cur, kring->nr_hwtail);
1534 	/*
1535 	 * packets between 'done' and 'cur' are left unsent.
1536 	 */
1537 	kring->nr_hwcur = done;
1538 	kring->nr_hwtail = nm_prev(done, lim);
1539 	nm_txsync_finalize(kring);
1540 	if (netmap_verbose)
1541 		D("%s ring %d flags %d", NM_IFPNAME(na->up.ifp), ring_nr, flags);
1542 	return 0;
1543 }
1544 
1545 
1546 /*
1547  * main dispatch routine for the bridge.
1548  * We already know that only one thread is running this.
1549  * we must run nm_bdg_preflush without lock.
1550  */
1551 static int
1552 bdg_netmap_txsync(struct netmap_adapter *na, u_int ring_nr, int flags)
1553 {
1554 	struct netmap_vp_adapter *vpna = (struct netmap_vp_adapter*)na;
1555 	return netmap_vp_txsync(vpna, ring_nr, flags);
1556 }
1557 
1558 static int
1559 netmap_vp_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags)
1560 {
1561 	struct netmap_kring *kring = &na->rx_rings[ring_nr];
1562 	struct netmap_ring *ring = kring->ring;
1563 	u_int nm_i, lim = kring->nkr_num_slots - 1;
1564 	u_int head = nm_rxsync_prologue(kring);
1565 	int n;
1566 
1567 	if (head > lim) {
1568 		D("ouch dangerous reset!!!");
1569 		n = netmap_ring_reinit(kring);
1570 		goto done;
1571 	}
1572 
1573 	/* First part, import newly received packets. */
1574 	/* actually nothing to do here, they are already in the kring */
1575 
1576 	/* Second part, skip past packets that userspace has released. */
1577 	nm_i = kring->nr_hwcur;
1578 	if (nm_i != head) {
1579 		/* consistency check, but nothing really important here */
1580 		for (n = 0; likely(nm_i != head); n++) {
1581 			struct netmap_slot *slot = &ring->slot[nm_i];
1582 			void *addr = BDG_NMB(na, slot);
1583 
1584 			if (addr == netmap_buffer_base) { /* bad buf */
1585 				D("bad buffer index %d, ignore ?",
1586 					slot->buf_idx);
1587 			}
1588 			slot->flags &= ~NS_BUF_CHANGED;
1589 			nm_i = nm_next(nm_i, lim);
1590 		}
1591 		kring->nr_hwcur = head;
1592 	}
1593 
1594 	/* tell userspace that there are new packets */
1595 	nm_rxsync_finalize(kring);
1596 	n = 0;
1597 done:
1598 	return n;
1599 }
1600 
1601 /*
1602  * user process reading from a VALE switch.
1603  * Already protected against concurrent calls from userspace,
1604  * but we must acquire the queue's lock to protect against
1605  * writers on the same queue.
1606  */
1607 static int
1608 bdg_netmap_rxsync(struct netmap_adapter *na, u_int ring_nr, int flags)
1609 {
1610 	struct netmap_kring *kring = &na->rx_rings[ring_nr];
1611 	int n;
1612 
1613 	mtx_lock(&kring->q_lock);
1614 	n = netmap_vp_rxsync(na, ring_nr, flags);
1615 	mtx_unlock(&kring->q_lock);
1616 	return n;
1617 }
1618 
1619 
1620 static int
1621 bdg_netmap_attach(struct nmreq *nmr, struct ifnet *ifp)
1622 {
1623 	struct netmap_vp_adapter *vpna;
1624 	struct netmap_adapter *na;
1625 	int error;
1626 	u_int npipes = 0;
1627 
1628 	vpna = malloc(sizeof(*vpna), M_DEVBUF, M_NOWAIT | M_ZERO);
1629 	if (vpna == NULL)
1630 		return ENOMEM;
1631 
1632  	na = &vpna->up;
1633 
1634 	na->ifp = ifp;
1635 
1636 	/* bounds checking */
1637 	na->num_tx_rings = nmr->nr_tx_rings;
1638 	nm_bound_var(&na->num_tx_rings, 1, 1, NM_BDG_MAXRINGS, NULL);
1639 	nmr->nr_tx_rings = na->num_tx_rings; // write back
1640 	na->num_rx_rings = nmr->nr_rx_rings;
1641 	nm_bound_var(&na->num_rx_rings, 1, 1, NM_BDG_MAXRINGS, NULL);
1642 	nmr->nr_rx_rings = na->num_rx_rings; // write back
1643 	nm_bound_var(&nmr->nr_tx_slots, NM_BRIDGE_RINGSIZE,
1644 			1, NM_BDG_MAXSLOTS, NULL);
1645 	na->num_tx_desc = nmr->nr_tx_slots;
1646 	nm_bound_var(&nmr->nr_rx_slots, NM_BRIDGE_RINGSIZE,
1647 			1, NM_BDG_MAXSLOTS, NULL);
1648 	/* validate number of pipes. We want at least 1,
1649 	 * but probably can do with some more.
1650 	 * So let's use 2 as default (when 0 is supplied)
1651 	 */
1652 	npipes = nmr->nr_arg1;
1653 	nm_bound_var(&npipes, 2, 1, NM_MAXPIPES, NULL);
1654 	nmr->nr_arg1 = npipes;	/* write back */
1655 	/* validate extra bufs */
1656 	nm_bound_var(&nmr->nr_arg3, 0, 0,
1657 			128*NM_BDG_MAXSLOTS, NULL);
1658 	na->num_rx_desc = nmr->nr_rx_slots;
1659 	vpna->virt_hdr_len = 0;
1660 	vpna->mfs = 1514;
1661 	/*if (vpna->mfs > netmap_buf_size)  TODO netmap_buf_size is zero??
1662 		vpna->mfs = netmap_buf_size; */
1663         if (netmap_verbose)
1664 		D("max frame size %u", vpna->mfs);
1665 
1666 	na->na_flags |= NAF_BDG_MAYSLEEP | NAF_MEM_OWNER;
1667 	na->nm_txsync = bdg_netmap_txsync;
1668 	na->nm_rxsync = bdg_netmap_rxsync;
1669 	na->nm_register = bdg_netmap_reg;
1670 	na->nm_dtor = netmap_adapter_vp_dtor;
1671 	na->nm_krings_create = netmap_vp_krings_create;
1672 	na->nm_krings_delete = netmap_vp_krings_delete;
1673 	na->nm_mem = netmap_mem_private_new(NM_IFPNAME(na->ifp),
1674 			na->num_tx_rings, na->num_tx_desc,
1675 			na->num_rx_rings, na->num_rx_desc,
1676 			nmr->nr_arg3, npipes, &error);
1677 	if (na->nm_mem == NULL)
1678 		goto err;
1679 	/* other nmd fields are set in the common routine */
1680 	error = netmap_attach_common(na);
1681 	if (error)
1682 		goto err;
1683 	return 0;
1684 
1685 err:
1686 	if (na->nm_mem != NULL)
1687 		netmap_mem_private_delete(na->nm_mem);
1688 	free(vpna, M_DEVBUF);
1689 	return error;
1690 }
1691 
1692 
1693 static void
1694 netmap_bwrap_dtor(struct netmap_adapter *na)
1695 {
1696 	struct netmap_bwrap_adapter *bna = (struct netmap_bwrap_adapter*)na;
1697 	struct netmap_adapter *hwna = bna->hwna;
1698 	struct nm_bridge *b = bna->up.na_bdg,
1699 		*bh = bna->host.na_bdg;
1700 	struct ifnet *ifp = na->ifp;
1701 
1702 	ND("na %p", na);
1703 
1704 	if (b) {
1705 		netmap_bdg_detach_common(b, bna->up.bdg_port,
1706 			(bh ? bna->host.bdg_port : -1));
1707 	}
1708 
1709 	hwna->na_private = NULL;
1710 	netmap_adapter_put(hwna);
1711 
1712 	bzero(ifp, sizeof(*ifp));
1713 	free(ifp, M_DEVBUF);
1714 	na->ifp = NULL;
1715 
1716 }
1717 
1718 
1719 /*
1720  * Intr callback for NICs connected to a bridge.
1721  * Simply ignore tx interrupts (maybe we could try to recover space ?)
1722  * and pass received packets from nic to the bridge.
1723  *
1724  * XXX TODO check locking: this is called from the interrupt
1725  * handler so we should make sure that the interface is not
1726  * disconnected while passing down an interrupt.
1727  *
1728  * Note, no user process can access this NIC or the host stack.
1729  * The only significant part of the ring is the slots,
1730  * and head/cur/tail are set from the kring as needed
1731  * (part as a receive ring, part as a transmit ring).
1732  *
1733  * callback that overwrites the hwna notify callback.
1734  * Packets come from the outside or from the host stack and are put on an hwna rx ring.
1735  * The bridge wrapper then sends the packets through the bridge.
1736  */
1737 static int
1738 netmap_bwrap_intr_notify(struct netmap_adapter *na, u_int ring_nr, enum txrx tx, int flags)
1739 {
1740 	struct ifnet *ifp = na->ifp;
1741 	struct netmap_bwrap_adapter *bna = na->na_private;
1742 	struct netmap_vp_adapter *hostna = &bna->host;
1743 	struct netmap_kring *kring, *bkring;
1744 	struct netmap_ring *ring;
1745 	int is_host_ring = ring_nr == na->num_rx_rings;
1746 	struct netmap_vp_adapter *vpna = &bna->up;
1747 	int error = 0;
1748 
1749 	if (netmap_verbose)
1750 	    D("%s %s%d 0x%x", NM_IFPNAME(ifp),
1751 		(tx == NR_TX ? "TX" : "RX"), ring_nr, flags);
1752 
1753 	if (flags & NAF_DISABLE_NOTIFY) {
1754 		kring = tx == NR_TX ? na->tx_rings : na->rx_rings;
1755 		bkring = tx == NR_TX ? vpna->up.rx_rings : vpna->up.tx_rings;
1756 		if (kring[ring_nr].nkr_stopped)
1757 			netmap_disable_ring(&bkring[ring_nr]);
1758 		else
1759 			bkring[ring_nr].nkr_stopped = 0;
1760 		return 0;
1761 	}
1762 
1763 	if (ifp == NULL || !(ifp->if_capenable & IFCAP_NETMAP))
1764 		return 0;
1765 
1766 	/* we only care about receive interrupts */
1767 	if (tx == NR_TX)
1768 		return 0;
1769 
1770 	kring = &na->rx_rings[ring_nr];
1771 	ring = kring->ring;
1772 
1773 	/* make sure the ring is not disabled */
1774 	if (nm_kr_tryget(kring))
1775 		return 0;
1776 
1777 	if (is_host_ring && hostna->na_bdg == NULL) {
1778 		error = bna->save_notify(na, ring_nr, tx, flags);
1779 		goto put_out;
1780 	}
1781 
1782 	/* Here we expect ring->head = ring->cur = ring->tail
1783 	 * because everything has been released from the previous round.
1784 	 * However the ring is shared and we might have info from
1785 	 * the wrong side (the tx ring). Hence we overwrite with
1786 	 * the info from the rx kring.
1787 	 */
1788 	if (netmap_verbose)
1789 	    D("%s head %d cur %d tail %d (kring %d %d %d)",  NM_IFPNAME(ifp),
1790 		ring->head, ring->cur, ring->tail,
1791 		kring->rhead, kring->rcur, kring->rtail);
1792 
1793 	ring->head = kring->rhead;
1794 	ring->cur = kring->rcur;
1795 	ring->tail = kring->rtail;
1796 
1797 	if (is_host_ring) {
1798 		vpna = hostna;
1799 		ring_nr = 0;
1800 	}
1801 	/* simulate a user wakeup on the rx ring:
1802 	 * fetch packets that have arrived.
1803 	 * XXX maybe do this in a loop ?
1804 	 */
1805 	error = kring->nm_sync(kring, 0);
1806 	if (error)
1807 		goto put_out;
1808 	if (kring->nr_hwcur == kring->nr_hwtail && netmap_verbose) {
1809 		D("how strange, interrupt with no packets on %s",
1810 			NM_IFPNAME(ifp));
1811 		goto put_out;
1812 	}
1813 
1814 	/* new packets are ring->cur to ring->tail, and the bkring
1815 	 * had hwcur == ring->cur. So advance ring->cur to ring->tail
1816 	 * to push all packets out.
1817 	 */
1818 	ring->head = ring->cur = ring->tail;
1819 
1820 	/* also set tail to what the bwrap expects */
1821 	bkring = &vpna->up.tx_rings[ring_nr];
1822 	ring->tail = bkring->nr_hwtail; // rtail too ?
1823 
1824 	/* pass packets to the switch */
1825 	nm_txsync_prologue(bkring); // XXX error checking ?
1826 	netmap_vp_txsync(vpna, ring_nr, flags);
1827 
1828 	/* mark all buffers as released on this ring */
1829 	ring->head = ring->cur = kring->nr_hwtail;
1830 	ring->tail = kring->rtail;
1831 	/* another call to actually release the buffers */
1832 	if (!is_host_ring) {
1833 		error = kring->nm_sync(kring, 0);
1834 	} else {
1835 		/* mark all packets as released, as in the
1836 		 * second part of netmap_rxsync_from_host()
1837 		 */
1838 		kring->nr_hwcur = kring->nr_hwtail;
1839 		nm_rxsync_finalize(kring);
1840 	}
1841 
1842 put_out:
1843 	nm_kr_put(kring);
1844 	return error;
1845 }
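/*
 * In short, netmap_bwrap_intr_notify() acts as a user process on the
 * NIC rx ring: nm_sync() reveals the newly received slots, head/cur
 * are advanced to tail so the whole batch is handed to the bridge via
 * netmap_vp_txsync(), and a final pass marks the forwarded buffers as
 * released so the NIC can reuse them.
 */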
1846 
1847 
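/* nm_register callback of the bwrap adapter:
 * on attach, propagate the buffer lookup table to the hardware
 * adapter (and to the host adapter, if present), cross-link the
 * netmap rings so that each side's tx rings share slots with the
 * other side's rx rings, switch the hwna itself into netmap mode,
 * and intercept its notify callback with netmap_bwrap_intr_notify().
 * On detach, restore the saved callback and clear the lookup table.
 */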
1848 static int
1849 netmap_bwrap_register(struct netmap_adapter *na, int onoff)
1850 {
1851 	struct netmap_bwrap_adapter *bna =
1852 		(struct netmap_bwrap_adapter *)na;
1853 	struct netmap_adapter *hwna = bna->hwna;
1854 	struct netmap_vp_adapter *hostna = &bna->host;
1855 	int error;
1856 
1857 	ND("%s %s", NM_IFPNAME(na->ifp), onoff ? "on" : "off");
1858 
1859 	if (onoff) {
1860 		int i;
1861 
1862 		hwna->na_lut = na->na_lut;
1863 		hwna->na_lut_objtotal = na->na_lut_objtotal;
1864 
1865 		if (hostna->na_bdg) {
1866 			hostna->up.na_lut = na->na_lut;
1867 			hostna->up.na_lut_objtotal = na->na_lut_objtotal;
1868 		}
1869 
1870 		/* cross-link the netmap rings:
1871 		 * the original number of rings comes from hwna;
1872 		 * rx rings on one side equal tx rings on the other.
1873 		 */
1874 		for (i = 0; i < na->num_rx_rings + 1; i++) {
1875 			hwna->tx_rings[i].nkr_num_slots = na->rx_rings[i].nkr_num_slots;
1876 			hwna->tx_rings[i].ring = na->rx_rings[i].ring;
1877 		}
1878 		for (i = 0; i < na->num_tx_rings + 1; i++) {
1879 			hwna->rx_rings[i].nkr_num_slots = na->tx_rings[i].nkr_num_slots;
1880 			hwna->rx_rings[i].ring = na->tx_rings[i].ring;
1881 		}
1882 	}
1883 
1884 	if (hwna->ifp) {
1885 		error = hwna->nm_register(hwna, onoff);
1886 		if (error)
1887 			return error;
1888 	}
1889 
1890 	bdg_netmap_reg(na, onoff);
1891 
1892 	if (onoff) {
1893 		bna->save_notify = hwna->nm_notify;
1894 		hwna->nm_notify = netmap_bwrap_intr_notify;
1895 	} else {
1896 		hwna->nm_notify = bna->save_notify;
1897 		hwna->na_lut = NULL;
1898 		hwna->na_lut_objtotal = 0;
1899 	}
1900 
1901 	return 0;
1902 }
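/*
 * After the cross-linking above the two adapters alias each other's
 * netmap rings:
 *
 *	hwna->tx_rings[i].ring == na->rx_rings[i].ring
 *	hwna->rx_rings[i].ring == na->tx_rings[i].ring
 *
 * (for all i, including the host rings), so the same slots act as a
 * receive ring on one side and a transmit ring on the other, and no
 * extra copy is needed between the two adapters.
 */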
1903 
1904 
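/* nm_config callback of the bwrap adapter: report the hwna geometry
 * with tx and rx swapped, since the two adapters face each other.
 */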
1905 static int
1906 netmap_bwrap_config(struct netmap_adapter *na, u_int *txr, u_int *txd,
1907 				    u_int *rxr, u_int *rxd)
1908 {
1909 	struct netmap_bwrap_adapter *bna =
1910 		(struct netmap_bwrap_adapter *)na;
1911 	struct netmap_adapter *hwna = bna->hwna;
1912 
1913 	/* forward the request */
1914 	netmap_update_config(hwna);
1915 	/* swap the results */
1916 	*txr = hwna->num_rx_rings;
1917 	*txd = hwna->num_rx_desc;
1918 	*rxr = hwna->num_tx_rings;
1919 	*rxd = hwna->num_tx_desc;
1920 
1921 	return 0;
1922 }
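/*
 * Example with hypothetical geometry: a NIC with 4 tx rings of 1024
 * slots and 2 rx rings of 512 slots is reported through the bwrap as
 * 2 tx rings of 512 slots and 4 rx rings of 1024 slots.
 */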
1923 
1924 
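/* nm_krings_create callback of the bwrap adapter:
 * create the krings of the vale port side first, then those of the
 * hwna. If host rings are present, point the host adapter's kring
 * arrays at the extra krings allocated right after the hardware ones.
 */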
1925 static int
1926 netmap_bwrap_krings_create(struct netmap_adapter *na)
1927 {
1928 	struct netmap_bwrap_adapter *bna =
1929 		(struct netmap_bwrap_adapter *)na;
1930 	struct netmap_adapter *hwna = bna->hwna;
1931 	struct netmap_adapter *hostna = &bna->host.up;
1932 	int error;
1933 
1934 	ND("%s", NM_IFPNAME(na->ifp));
1935 
1936 	error = netmap_vp_krings_create(na);
1937 	if (error)
1938 		return error;
1939 
1940 	error = hwna->nm_krings_create(hwna);
1941 	if (error) {
1942 		netmap_vp_krings_delete(na);
1943 		return error;
1944 	}
1945 
1946 	if (na->na_flags & NAF_HOST_RINGS) {
1947 		hostna->tx_rings = na->tx_rings + na->num_tx_rings;
1948 		hostna->rx_rings = na->rx_rings + na->num_rx_rings;
1949 	}
1950 
1951 	return 0;
1952 }
1953 
1954 
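/* nm_krings_delete callback of the bwrap adapter:
 * undo netmap_bwrap_krings_create() in reverse order.
 */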
1955 static void
1956 netmap_bwrap_krings_delete(struct netmap_adapter *na)
1957 {
1958 	struct netmap_bwrap_adapter *bna =
1959 		(struct netmap_bwrap_adapter *)na;
1960 	struct netmap_adapter *hwna = bna->hwna;
1961 
1962 	ND("%s", NM_IFPNAME(na->ifp));
1963 
1964 	hwna->nm_krings_delete(hwna);
1965 	netmap_vp_krings_delete(na);
1966 }
1967 
1968 
1969 /* notify method for the bridge-->hwna direction */
1970 static int
1971 netmap_bwrap_notify(struct netmap_adapter *na, u_int ring_n, enum txrx tx, int flags)
1972 {
1973 	struct netmap_bwrap_adapter *bna =
1974 		(struct netmap_bwrap_adapter *)na;
1975 	struct netmap_adapter *hwna = bna->hwna;
1976 	struct netmap_kring *kring, *hw_kring;
1977 	struct netmap_ring *ring;
1978 	u_int lim;
1979 	int error = 0;
1980 
1981 	if (tx == NR_TX)
1982 		return EINVAL;
1983 
1984 	kring = &na->rx_rings[ring_n];
1985 	hw_kring = &hwna->tx_rings[ring_n];
1986 	ring = kring->ring;
1987 	lim = kring->nkr_num_slots - 1;
1988 
1989 	if (hwna->ifp == NULL || !(hwna->ifp->if_capenable & IFCAP_NETMAP))
1990 		return 0;
1991 	mtx_lock(&kring->q_lock);
1992 	/* first step: simulate a user wakeup on the rx ring */
1993 	netmap_vp_rxsync(na, ring_n, flags);
1994 	ND("%s[%d] PRE rx(c%3d t%3d l%3d) ring(h%3d c%3d t%3d) tx(c%3d ht%3d t%3d)",
1995 		NM_IFPNAME(na->ifp), ring_n,
1996 		kring->nr_hwcur, kring->nr_hwtail, kring->nkr_hwlease,
1997 		ring->head, ring->cur, ring->tail,
1998 		hw_kring->nr_hwcur, hw_kring->nr_hwtail, hw_kring->rtail);
1999 	/* second step: the simulated user consumes all new packets */
2000 	ring->head = ring->cur = ring->tail;
2001 
2002 	/* third step: the new packets are sent on the tx ring
2003 	 * (which is actually the same ring)
2004 	 */
2005 	/* set tail to what the hw expects */
2006 	ring->tail = hw_kring->rtail;
2007 	nm_txsync_prologue(&hwna->tx_rings[ring_n]); // XXX error checking ?
2008 	error = hw_kring->nm_sync(hw_kring, flags);
2009 
2010 	/* fourth step: now we are back on the rx ring */
2011 	/* claim ownership on all hw owned bufs */
2012 	ring->head = nm_next(ring->tail, lim); /* skip past reserved slot */
2013 	ring->tail = kring->rtail; /* restore saved value of tail, for safety */
2014 
2015 	/* fifth step: the user goes to sleep again, causing another rxsync */
2016 	netmap_vp_rxsync(na, ring_n, flags);
2017 	ND("%s[%d] PST rx(c%3d t%3d l%3d) ring(h%3d c%3d t%3d) tx(c%3d ht%3d t%3d)",
2018 		NM_IFPNAME(na->ifp), ring_n,
2019 		kring->nr_hwcur, kring->nr_hwtail, kring->nkr_hwlease,
2020 		ring->head, ring->cur, ring->tail,
2021 		hw_kring->nr_hwcur, hw_kring->nr_hwtail, hw_kring->rtail);
2022 	mtx_unlock(&kring->q_lock);
2023 	return error;
2024 }
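/*
 * In short, netmap_bwrap_notify() plays the role of a user process
 * bound to the shared ring: it first rxsyncs to collect the packets
 * the bridge has queued, consumes them by advancing head/cur to tail,
 * txsyncs them out through the hwna kring (which aliases the same
 * ring), and finally rxsyncs once more so the consumed slots are
 * returned to the bridge.
 */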
2025 
2026 
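/* notify method for the bridge-->host-stack direction:
 * the host port has a single ring pair, stored by the bwrap right
 * after the hardware rings, so redirect ring 0 of the host adapter
 * to ring num_rx_rings of the bwrap adapter.
 */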
2027 static int
2028 netmap_bwrap_host_notify(struct netmap_adapter *na, u_int ring_n, enum txrx tx, int flags)
2029 {
2030 	struct netmap_bwrap_adapter *bna = na->na_private;
2031 	struct netmap_adapter *port_na = &bna->up.up;
2032 	if (tx == NR_TX || ring_n != 0)
2033 		return EINVAL;
2034 	return netmap_bwrap_notify(port_na, port_na->num_rx_rings, NR_RX, flags);
2035 }
2036 
2037 
2038 /* attach a bridge wrapper to the 'real' device */
2039 static int
2040 netmap_bwrap_attach(struct ifnet *fake, struct ifnet *real)
2041 {
2042 	struct netmap_bwrap_adapter *bna;
2043 	struct netmap_adapter *na;
2044 	struct netmap_adapter *hwna = NA(real);
2045 	struct netmap_adapter *hostna;
2046 	int error;
2047 
2048 
2049 	bna = malloc(sizeof(*bna), M_DEVBUF, M_NOWAIT | M_ZERO);
2050 	if (bna == NULL)
2051 		return ENOMEM;
2052 
2053 	na = &bna->up.up;
2054 	na->ifp = fake;
2055 	/* fill the ring data for the bwrap adapter with rx/tx meanings
2056 	 * swapped. The real cross-linking is done during register,
2057 	 * once all the krings have been created.
2058 	 */
2059 	na->num_rx_rings = hwna->num_tx_rings;
2060 	na->num_tx_rings = hwna->num_rx_rings;
2061 	na->num_tx_desc = hwna->num_rx_desc;
2062 	na->num_rx_desc = hwna->num_tx_desc;
2063 	na->nm_dtor = netmap_bwrap_dtor;
2064 	na->nm_register = netmap_bwrap_register;
2065 	// na->nm_txsync = netmap_bwrap_txsync;
2066 	// na->nm_rxsync = netmap_bwrap_rxsync;
2067 	na->nm_config = netmap_bwrap_config;
2068 	na->nm_krings_create = netmap_bwrap_krings_create;
2069 	na->nm_krings_delete = netmap_bwrap_krings_delete;
2070 	na->nm_notify = netmap_bwrap_notify;
2071 	na->nm_mem = hwna->nm_mem;
2072 	na->na_private = na; /* prevent NIOCREGIF */
2073 	bna->up.retry = 1; /* XXX maybe this should depend on the hwna */
2074 
2075 	bna->hwna = hwna;
2076 	netmap_adapter_get(hwna);
2077 	hwna->na_private = bna; /* weak reference */
2078 
2079 	if (hwna->na_flags & NAF_HOST_RINGS) {
2080 		na->na_flags |= NAF_HOST_RINGS;
2081 		hostna = &bna->host.up;
2082 		hostna->ifp = hwna->ifp;
2083 		hostna->num_tx_rings = 1;
2084 		hostna->num_tx_desc = hwna->num_rx_desc;
2085 		hostna->num_rx_rings = 1;
2086 		hostna->num_rx_desc = hwna->num_tx_desc;
2087 		// hostna->nm_txsync = netmap_bwrap_host_txsync;
2088 		// hostna->nm_rxsync = netmap_bwrap_host_rxsync;
2089 		hostna->nm_notify = netmap_bwrap_host_notify;
2090 		hostna->nm_mem = na->nm_mem;
2091 		hostna->na_private = bna;
2092 	}
2093 
2094 	ND("%s<->%s txr %d txd %d rxr %d rxd %d",
2095 		fake->if_xname, real->if_xname,
2096 		na->num_tx_rings, na->num_tx_desc,
2097 		na->num_rx_rings, na->num_rx_desc);
2098 
2099 	error = netmap_attach_common(na);
2100 	if (error) {
2101 		netmap_adapter_put(hwna);
2102 		free(bna, M_DEVBUF);
2103 		return error;
2104 	}
2105 	return 0;
2106 }
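/*
 * Illustrative sketch (not compiled here; interface and switch names
 * are hypothetical): connecting a NIC to a VALE switch, which
 * eventually reaches netmap_bwrap_attach(), uses the same nmreq with
 * the NETMAP_BDG_ATTACH subcommand:
 *
 *	struct nmreq nmr;
 *	int fd = open("/dev/netmap", O_RDWR);
 *
 *	bzero(&nmr, sizeof(nmr));
 *	nmr.nr_version = NETMAP_API;
 *	nmr.nr_cmd = NETMAP_BDG_ATTACH;
 *	strncpy(nmr.nr_name, "vale0:em0", sizeof(nmr.nr_name));
 *	ioctl(fd, NIOCREGIF, &nmr);
 *
 * NETMAP_BDG_DETACH undoes the operation; the destructor
 * netmap_bwrap_dtor() runs once the last reference is dropped.
 */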
2107 
2108 
2109 void
2110 netmap_init_bridges(void)
2111 {
2112 	int i;
2113 	bzero(nm_bridges, sizeof(struct nm_bridge) * NM_BRIDGES); /* safety */
2114 	for (i = 0; i < NM_BRIDGES; i++)
2115 		BDG_RWINIT(&nm_bridges[i]);
2116 }
2117 #endif /* WITH_VALE */
2118