xref: /freebsd/sys/dev/netmap/netmap_vale.c (revision 8657387683946d0c03e09fe77029edfe309eeb20)
1 /*
2  * Copyright (C) 2013-2016 Universita` di Pisa
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  *   1. Redistributions of source code must retain the above copyright
9  *      notice, this list of conditions and the following disclaimer.
10  *   2. Redistributions in binary form must reproduce the above copyright
11  *      notice, this list of conditions and the following disclaimer in the
12  *      documentation and/or other materials provided with the distribution.
13  *
14  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24  * SUCH DAMAGE.
25  */
26 
27 
28 /*
29  * This module implements the VALE switch for netmap
30 
31 --- VALE SWITCH ---
32 
33 NMG_LOCK() serializes all modifications to switches and ports.
34 A switch cannot be deleted until all ports are gone.
35 
36 For each switch, an SX lock (RWlock on linux) protects
37 deletion of ports. When adding, configuring or deleting a port, the
38 lock is acquired in exclusive mode (after holding NMG_LOCK).
39 When forwarding, the lock is acquired in shared mode (without NMG_LOCK).
40 The lock is held throughout the entire forwarding cycle,
41 during which the thread may incur a page fault.
42 Hence it is important that sleepable shared locks are used.
43 
44 On the rx ring, the per-port lock is grabbed initially to reserve
45 a number of slots in the ring; then the lock is released,
46 packets are copied from source to destination, and then
47 the lock is acquired again and the receive ring is updated.
48 (A similar thing is done on the tx ring for NIC and host stack
49 ports attached to the switch)
50 
51  */
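/*
 * A sketch of the resulting locking discipline (illustrative only):
 *
 *	// control path: add or remove a port
 *	NMG_LOCK();
 *	BDG_WLOCK(b);
 *	... update b->bdg_ports[] and b->bdg_port_index[] ...
 *	BDG_WUNLOCK(b);
 *	NMG_UNLOCK();
 *
 *	// data path: forward a batch of packets
 *	BDG_RLOCK(b);	// sleepable shared lock, no NMG_LOCK held
 *	... reserve slots on the destination ring (per-ring lock),
 *	    drop the per-ring lock, copy the packets (this may
 *	    page fault), then re-acquire it to publish the slots ...
 *	BDG_RUNLOCK(b);
 */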
52 
53 /*
54  * OS-specific code that is used only within this file.
55  * Other OS-specific code that must be accessed by drivers
56  * is present in netmap_kern.h
57  */
58 
59 #if defined(__FreeBSD__)
60 #include <sys/cdefs.h> /* prerequisite */
61 __FBSDID("$FreeBSD$");
62 
63 #include <sys/types.h>
64 #include <sys/errno.h>
65 #include <sys/param.h>	/* defines used in kernel.h */
66 #include <sys/kernel.h>	/* types used in module initialization */
67 #include <sys/conf.h>	/* cdevsw struct, UID, GID */
68 #include <sys/sockio.h>
69 #include <sys/socketvar.h>	/* struct socket */
70 #include <sys/malloc.h>
71 #include <sys/poll.h>
72 #include <sys/rwlock.h>
73 #include <sys/socket.h> /* sockaddrs */
74 #include <sys/selinfo.h>
75 #include <sys/sysctl.h>
76 #include <net/if.h>
77 #include <net/if_var.h>
78 #include <net/bpf.h>		/* BIOCIMMEDIATE */
79 #include <machine/bus.h>	/* bus_dmamap_* */
80 #include <sys/endian.h>
81 #include <sys/refcount.h>
82 
83 
84 #define BDG_RWLOCK_T		struct rwlock
85 
86 #define	BDG_RWINIT(b)		\
87 	rw_init_flags(&(b)->bdg_lock, "bdg lock", RW_NOWITNESS)
88 #define BDG_WLOCK(b)		rw_wlock(&(b)->bdg_lock)
89 #define BDG_WUNLOCK(b)		rw_wunlock(&(b)->bdg_lock)
90 #define BDG_RLOCK(b)		rw_rlock(&(b)->bdg_lock)
91 #define BDG_RTRYLOCK(b)		rw_try_rlock(&(b)->bdg_lock)
92 #define BDG_RUNLOCK(b)		rw_runlock(&(b)->bdg_lock)
93 #define BDG_RWDESTROY(b)	rw_destroy(&(b)->bdg_lock)
94 
95 
96 #elif defined(linux)
97 
98 #include "bsd_glue.h"
99 
100 #elif defined(__APPLE__)
101 
102 #warning OSX support is only partial
103 #include "osx_glue.h"
104 
105 #elif defined(_WIN32)
106 #include "win_glue.h"
107 
108 #else
109 
110 #error	Unsupported platform
111 
112 #endif /* unsupported */
113 
114 /*
115  * common headers
116  */
117 
118 #include <net/netmap.h>
119 #include <dev/netmap/netmap_kern.h>
120 #include <dev/netmap/netmap_mem2.h>
121 
122 #ifdef WITH_VALE
123 
124 /*
125  * system parameters (most of them in netmap_kern.h)
126  * NM_BDG_NAME	prefix for switch port names, default "vale"
127  * NM_BDG_MAXPORTS	number of ports
128  * NM_BRIDGES	max number of switches in the system.
129  *	XXX should become a sysctl or tunable
130  *
131  * Switch ports are named valeX:Y where X is the switch name and Y
132  * is the port. If Y matches a physical interface name, the port is
133  * connected to a physical device.
134  *
135  * Unlike physical interfaces, switch ports use their own memory region
136  * for rings and buffers.
137  * The virtual interfaces use per-queue locks instead of the core lock.
138  * In the tx loop, we aggregate traffic in batches to make all operations
139  * faster. The batch size is bridge_batch.
140  */
141 #define NM_BDG_MAXRINGS		16	/* XXX unclear how many. */
142 #define NM_BDG_MAXSLOTS		4096	/* XXX same as above */
143 #define NM_BRIDGE_RINGSIZE	1024	/* in the device */
144 #define NM_BDG_HASH		1024	/* forwarding table entries */
145 #define NM_BDG_BATCH		1024	/* entries in the forwarding buffer */
146 #define NM_MULTISEG		64	/* max size of a chain of bufs */
147 /* actual size of the tables */
148 #define NM_BDG_BATCH_MAX	(NM_BDG_BATCH + NM_MULTISEG)
149 /* NM_FT_NULL terminates a list of slots in the ft */
150 #define NM_FT_NULL		NM_BDG_BATCH_MAX
151 
152 
153 /*
154  * bridge_batch is set via sysctl to the max batch size to be
155  * used in the bridge. The actual value may be larger as the
156  * last packet in the block may overflow the size.
157  */
158 static int bridge_batch = NM_BDG_BATCH; /* bridge batch size */
159 SYSBEGIN(vars_vale);
160 SYSCTL_DECL(_dev_netmap);
161 SYSCTL_INT(_dev_netmap, OID_AUTO, bridge_batch, CTLFLAG_RW, &bridge_batch, 0,
162 		"Max batch size to be used in the bridge");
162 SYSEND;
163 
164 static int netmap_vp_create(struct nmreq *, struct ifnet *,
165 		struct netmap_mem_d *nmd, struct netmap_vp_adapter **);
166 static int netmap_vp_reg(struct netmap_adapter *na, int onoff);
167 static int netmap_bwrap_reg(struct netmap_adapter *, int onoff);
168 
169 /*
170  * For each output interface, nm_bdg_q is used to construct a list.
171  * bq_len is the number of output buffers (we can have coalescing
172  * during the copy).
173  */
174 struct nm_bdg_q {
175 	uint16_t bq_head;
176 	uint16_t bq_tail;
177 	uint32_t bq_len;	/* number of buffers */
178 };
179 
180 /* XXX revise this */
181 struct nm_hash_ent {
182 	uint64_t	mac;	/* the top 2 bytes are the epoch */
183 	uint64_t	ports;
184 };
185 
186 /*
187  * nm_bridge is a descriptor for a VALE switch.
188  * Interfaces for a bridge are all in bdg_ports[].
189  * The array has a fixed size; an empty entry does not terminate
190  * the search, but lookups only occur on attach/detach so we
191  * don't mind if they are slow.
192  *
193  * The bridge is non-blocking on the transmit ports: excess
194  * packets are dropped if there is no room on the output port.
195  *
196  * bdg_lock protects accesses to the bdg_ports array.
197  * This is a rw lock (or equivalent).
198  */
199 struct nm_bridge {
200 	/* XXX what is the proper alignment/layout ? */
201 	BDG_RWLOCK_T	bdg_lock;	/* protects bdg_ports */
202 	int		bdg_namelen;
203 	uint32_t	bdg_active_ports; /* 0 means free */
204 	char		bdg_basename[IFNAMSIZ];
205 
206 	/* Indexes of active ports (up to active_ports)
207 	 * and all other remaining ports.
208 	 */
209 	uint8_t		bdg_port_index[NM_BDG_MAXPORTS];
210 
211 	struct netmap_vp_adapter *bdg_ports[NM_BDG_MAXPORTS];
212 
213 
214 	/*
215 	 * The function to decide the destination port.
216 	 * It returns either the index of the destination port,
217 	 * NM_BDG_BROADCAST to broadcast this packet, or NM_BDG_NOPORT not to
218 	 * forward this packet.  ring_nr is the source ring index, and the
219 	 * function may overwrite this value to forward this packet to a
220 	 * different ring index.
221 	 * This function must be set by netmap_bdg_ctl().
222 	 */
223 	struct netmap_bdg_ops bdg_ops;
224 
225 	/* the forwarding table, MAC+ports.
226 	 * XXX should be changed to an argument to be passed to
227 	 * the lookup function, and allocated on attach
228 	 */
229 	struct nm_hash_ent ht[NM_BDG_HASH];
230 
231 #ifdef CONFIG_NET_NS
232 	struct net *ns;
233 #endif /* CONFIG_NET_NS */
234 };
235 
236 const char*
237 netmap_bdg_name(struct netmap_vp_adapter *vp)
238 {
239 	struct nm_bridge *b = vp->na_bdg;
240 	if (b == NULL)
241 		return NULL;
242 	return b->bdg_basename;
243 }
244 
245 
246 #ifndef CONFIG_NET_NS
247 /*
248  * XXX in principle nm_bridges could be created dynamically.
249  * Right now we have a static array and deletions are protected
250  * by an exclusive lock.
251  */
252 static struct nm_bridge *nm_bridges;
253 #endif /* !CONFIG_NET_NS */
254 
255 
256 /*
257  * this is a slightly optimized copy routine which rounds
258  * to multiples of 64 bytes and is often faster than dealing
259  * with other odd sizes. We assume there is enough room
260  * in the source and destination buffers.
261  *
262  * XXX only for multiples of 64 bytes, non-overlapping buffers.
263  */
264 static inline void
265 pkt_copy(void *_src, void *_dst, int l)
266 {
267         uint64_t *src = _src;
268         uint64_t *dst = _dst;
269         if (unlikely(l >= 1024)) {
270                 memcpy(dst, src, l);
271                 return;
272         }
273         for (; likely(l > 0); l-=64) {
274                 *dst++ = *src++;
275                 *dst++ = *src++;
276                 *dst++ = *src++;
277                 *dst++ = *src++;
278                 *dst++ = *src++;
279                 *dst++ = *src++;
280                 *dst++ = *src++;
281                 *dst++ = *src++;
282         }
283 }
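/*
 * A usage sketch: callers pass the packet length and rely on both
 * netmap buffers having room up to the next multiple of 64 bytes,
 * so the tail overrun of the unrolled loop is harmless:
 *
 *	pkt_copy(NMB(na, src_slot), NMB(na, dst_slot), src_slot->len);
 */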
284 
285 
286 static int
287 nm_is_id_char(const char c)
288 {
289 	return (c >= 'a' && c <= 'z') ||
290 	       (c >= 'A' && c <= 'Z') ||
291 	       (c >= '0' && c <= '9') ||
292 	       (c == '_');
293 }
294 
295 /* Validate the name of a VALE bridge port and return the
296  * position of the ":" character. */
297 static int
298 nm_vale_name_validate(const char *name)
299 {
300 	int colon_pos = -1;
301 	int i;
302 
303 	if (!name || strlen(name) < strlen(NM_BDG_NAME)) {
304 		return -1;
305 	}
306 
307 	for (i = 0; name[i]; i++) {
308 		if (name[i] == ':') {
309 			if (colon_pos != -1) {
310 				return -1;
311 			}
312 			colon_pos = i;
313 		} else if (!nm_is_id_char(name[i])) {
314 			return -1;
315 		}
316 	}
317 
318 	if (i >= IFNAMSIZ) {
319 		return -1;
320 	}
321 
322 	return colon_pos;
323 }
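/*
 * Examples (illustrative): "vale0:1" is accepted and the function
 * returns 5, the position of the ':'; a name with no ':', a second
 * ':', a character outside [A-Za-z0-9_], or an overall length of
 * IFNAMSIZ or more is rejected with -1.
 */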
324 
325 /*
326  * locate a bridge among the existing ones.
327  * MUST BE CALLED WITH NMG_LOCK()
328  *
329  * a ':' in the name terminates the bridge name. Otherwise, just NM_NAME.
330  * We assume that this is called with a name of at least NM_NAME chars.
331  */
332 static struct nm_bridge *
333 nm_find_bridge(const char *name, int create)
334 {
335 	int i, namelen;
336 	struct nm_bridge *b = NULL, *bridges;
337 	u_int num_bridges;
338 
339 	NMG_LOCK_ASSERT();
340 
341 	netmap_bns_getbridges(&bridges, &num_bridges);
342 
343 	namelen = nm_vale_name_validate(name);
344 	if (namelen < 0) {
345 		D("invalid bridge name %s", name ? name : "(null)");
346 		return NULL;
347 	}
348 
349 	/* lookup the name, remember empty slot if there is one */
350 	for (i = 0; i < num_bridges; i++) {
351 		struct nm_bridge *x = bridges + i;
352 
353 		if (x->bdg_active_ports == 0) {
354 			if (create && b == NULL)
355 				b = x;	/* record empty slot */
356 		} else if (x->bdg_namelen != namelen) {
357 			continue;
358 		} else if (strncmp(name, x->bdg_basename, namelen) == 0) {
359 			ND("found '%.*s' at %d", namelen, name, i);
360 			b = x;
361 			break;
362 		}
363 	}
364 	if (i == num_bridges && b) { /* name not found, can create entry */
365 		/* initialize the bridge */
366 		strncpy(b->bdg_basename, name, namelen);
367 		ND("create new bridge %s with ports %d", b->bdg_basename,
368 			b->bdg_active_ports);
369 		b->bdg_namelen = namelen;
370 		b->bdg_active_ports = 0;
371 		for (i = 0; i < NM_BDG_MAXPORTS; i++)
372 			b->bdg_port_index[i] = i;
373 		/* set the default function */
374 		b->bdg_ops.lookup = netmap_bdg_learning;
375 		/* reset the MAC address table */
376 		bzero(b->ht, sizeof(struct nm_hash_ent) * NM_BDG_HASH);
377 		NM_BNS_GET(b);
378 	}
379 	return b;
380 }
381 
382 
383 /*
384  * Free the forwarding tables for rings attached to switch ports.
385  */
386 static void
387 nm_free_bdgfwd(struct netmap_adapter *na)
388 {
389 	int nrings, i;
390 	struct netmap_kring *kring;
391 
392 	NMG_LOCK_ASSERT();
393 	nrings = netmap_real_rings(na, NR_TX); /* match nm_alloc_bdgfwd() */
394 	kring = na->tx_rings;
395 	for (i = 0; i < nrings; i++) {
396 		if (kring[i].nkr_ft) {
397 			nm_os_free(kring[i].nkr_ft);
398 			kring[i].nkr_ft = NULL; /* protect from freeing twice */
399 		}
400 	}
401 }
402 
403 
404 /*
405  * Allocate the forwarding tables for the rings attached to the bridge ports.
406  */
407 static int
408 nm_alloc_bdgfwd(struct netmap_adapter *na)
409 {
410 	int nrings, l, i, num_dstq;
411 	struct netmap_kring *kring;
412 
413 	NMG_LOCK_ASSERT();
414 	/* all port:rings + broadcast */
415 	num_dstq = NM_BDG_MAXPORTS * NM_BDG_MAXRINGS + 1;
416 	l = sizeof(struct nm_bdg_fwd) * NM_BDG_BATCH_MAX;
417 	l += sizeof(struct nm_bdg_q) * num_dstq;
418 	l += sizeof(uint16_t) * NM_BDG_BATCH_MAX;
419 
420 	nrings = netmap_real_rings(na, NR_TX);
421 	kring = na->tx_rings;
422 	for (i = 0; i < nrings; i++) {
423 		struct nm_bdg_fwd *ft;
424 		struct nm_bdg_q *dstq;
425 		int j;
426 
427 		ft = nm_os_malloc(l);
428 		if (!ft) {
429 			nm_free_bdgfwd(na);
430 			return ENOMEM;
431 		}
432 		dstq = (struct nm_bdg_q *)(ft + NM_BDG_BATCH_MAX);
433 		for (j = 0; j < num_dstq; j++) {
434 			dstq[j].bq_head = dstq[j].bq_tail = NM_FT_NULL;
435 			dstq[j].bq_len = 0;
436 		}
437 		kring[i].nkr_ft = ft;
438 	}
439 	return 0;
440 }
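/*
 * Layout of the scratch area allocated above, one contiguous block
 * per tx ring (a sketch of the size computation in nm_alloc_bdgfwd()):
 *
 *	struct nm_bdg_fwd ft[NM_BDG_BATCH_MAX];    // packet descriptors
 *	struct nm_bdg_q dstq[NM_BDG_MAXPORTS * NM_BDG_MAXRINGS + 1];
 *	                                           // one queue per dest
 *	                                           // ring, plus broadcast
 *	uint16_t dsts[NM_BDG_BATCH_MAX];           // active queue indexes
 *
 * nm_bdg_flush() recomputes the dstq and dsts pointers from ft with
 * the same arithmetic.
 */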
441 
442 
443 /* remove from bridge b the ports in slots hw and sw
444  * (sw can be -1 if not needed)
445  */
446 static void
447 netmap_bdg_detach_common(struct nm_bridge *b, int hw, int sw)
448 {
449 	int s_hw = hw, s_sw = sw;
450 	int i, lim = b->bdg_active_ports;
451 	uint8_t tmp[NM_BDG_MAXPORTS];
452 
453 	/*
454 	New algorithm:
455 	make a copy of bdg_port_index;
456 	lookup NA(ifp)->bdg_port and SWNA(ifp)->bdg_port
457 	in the array of bdg_port_index, replacing them with
458 	entries from the bottom of the array;
459 	decrement bdg_active_ports;
460 	acquire BDG_WLOCK() and copy back the array.
461 	 */
462 
463 	if (netmap_verbose)
464 		D("detach %d and %d (lim %d)", hw, sw, lim);
465 	/* make a copy of the list of active ports, update it,
466 	 * and then copy back within BDG_WLOCK().
467 	 */
468 	memcpy(tmp, b->bdg_port_index, sizeof(tmp));
469 	for (i = 0; (hw >= 0 || sw >= 0) && i < lim; ) {
470 		if (hw >= 0 && tmp[i] == hw) {
471 			ND("detach hw %d at %d", hw, i);
472 			lim--; /* point to last active port */
473 			tmp[i] = tmp[lim]; /* swap with i */
474 			tmp[lim] = hw;	/* now this is inactive */
475 			hw = -1;
476 		} else if (sw >= 0 && tmp[i] == sw) {
477 			ND("detach sw %d at %d", sw, i);
478 			lim--;
479 			tmp[i] = tmp[lim];
480 			tmp[lim] = sw;
481 			sw = -1;
482 		} else {
483 			i++;
484 		}
485 	}
486 	if (hw >= 0 || sw >= 0) {
487 		D("XXX delete failed hw %d sw %d, should panic...", hw, sw);
488 	}
489 
490 	BDG_WLOCK(b);
491 	if (b->bdg_ops.dtor)
492 		b->bdg_ops.dtor(b->bdg_ports[s_hw]);
493 	b->bdg_ports[s_hw] = NULL;
494 	if (s_sw >= 0) {
495 		b->bdg_ports[s_sw] = NULL;
496 	}
497 	memcpy(b->bdg_port_index, tmp, sizeof(tmp));
498 	b->bdg_active_ports = lim;
499 	BDG_WUNLOCK(b);
500 
501 	ND("now %d active ports", lim);
502 	if (lim == 0) {
503 		ND("marking bridge %s as free", b->bdg_basename);
504 		bzero(&b->bdg_ops, sizeof(b->bdg_ops));
505 		NM_BNS_PUT(b);
506 	}
507 }
508 
509 /* nm_bdg_ctl callback for VALE ports */
510 static int
511 netmap_vp_bdg_ctl(struct netmap_adapter *na, struct nmreq *nmr, int attach)
512 {
513 	struct netmap_vp_adapter *vpna = (struct netmap_vp_adapter *)na;
514 	struct nm_bridge *b = vpna->na_bdg;
515 
516 	(void)nmr;	// XXX merge ?
517 	if (attach)
518 		return 0; /* nothing to do */
519 	if (b) {
520 		netmap_set_all_rings(na, 0 /* disable */);
521 		netmap_bdg_detach_common(b, vpna->bdg_port, -1);
522 		vpna->na_bdg = NULL;
523 		netmap_set_all_rings(na, 1 /* enable */);
524 	}
525 	/* we took the reference just for the attach */
526 	netmap_adapter_put(na);
527 	return 0;
528 }
529 
530 /* nm_dtor callback for ephemeral VALE ports */
531 static void
532 netmap_vp_dtor(struct netmap_adapter *na)
533 {
534 	struct netmap_vp_adapter *vpna = (struct netmap_vp_adapter*)na;
535 	struct nm_bridge *b = vpna->na_bdg;
536 
537 	ND("%s has %d references", na->name, na->na_refcount);
538 
539 	if (b) {
540 		netmap_bdg_detach_common(b, vpna->bdg_port, -1);
541 	}
542 
543 	if (vpna->autodelete && na->ifp != NULL) {
544 		ND("releasing %s", na->ifp->if_xname);
545 		NMG_UNLOCK();
546 		nm_os_vi_detach(na->ifp);
547 		NMG_LOCK();
548 	}
549 }
550 
551 /* remove a persistent VALE port from the system */
552 static int
553 nm_vi_destroy(const char *name)
554 {
555 	struct ifnet *ifp;
556 	struct netmap_vp_adapter *vpna;
557 	int error;
558 
559 	ifp = ifunit_ref(name);
560 	if (!ifp)
561 		return ENXIO;
562 	NMG_LOCK();
563 	/* make sure this is actually a VALE port */
564 	if (!NM_NA_VALID(ifp) || NA(ifp)->nm_register != netmap_vp_reg) {
565 		error = EINVAL;
566 		goto err;
567 	}
568 
569 	vpna = (struct netmap_vp_adapter *)NA(ifp);
570 
571 	/* we can only destroy ports that were created via NETMAP_BDG_NEWIF */
572 	if (vpna->autodelete) {
573 		error = EINVAL;
574 		goto err;
575 	}
576 
577 	/* also make sure that nobody is using the interface */
578 	if (NETMAP_OWNED_BY_ANY(&vpna->up) ||
579 	    vpna->up.na_refcount > 1 /* any ref besides the one in nm_vi_create()? */) {
580 		error = EBUSY;
581 		goto err;
582 	}
583 
584 	NMG_UNLOCK();
585 
586 	D("destroying a persistent vale interface %s", ifp->if_xname);
587 	/* Linux requires that all references be released
588 	 * before unregistering
589 	 */
590 	netmap_detach(ifp);
591 	if_rele(ifp);
592 	nm_os_vi_detach(ifp);
593 	return 0;
594 
595 err:
596 	NMG_UNLOCK();
597 	if_rele(ifp);
598 	return error;
599 }
600 
601 static int
602 nm_update_info(struct nmreq *nmr, struct netmap_adapter *na)
603 {
604 	nmr->nr_rx_rings = na->num_rx_rings;
605 	nmr->nr_tx_rings = na->num_tx_rings;
606 	nmr->nr_rx_slots = na->num_rx_desc;
607 	nmr->nr_tx_slots = na->num_tx_desc;
608 	return netmap_mem_get_info(na->nm_mem, &nmr->nr_memsize, NULL, &nmr->nr_arg2);
609 }
610 
611 /*
612  * Create a virtual interface registered to the system.
613  * The interface will be attached to a bridge later.
614  */
615 int
616 netmap_vi_create(struct nmreq *nmr, int autodelete)
617 {
618 	struct ifnet *ifp;
619 	struct netmap_vp_adapter *vpna;
620 	struct netmap_mem_d *nmd = NULL;
621 	int error;
622 
623 	/* the name must not include the VALE prefix */
624 	if (!strncmp(nmr->nr_name, NM_BDG_NAME, strlen(NM_BDG_NAME)))
625 		return EINVAL;
626 	ifp = ifunit_ref(nmr->nr_name);
627 	if (ifp) { /* already exists, cannot create a new one */
628 		error = EEXIST;
629 		NMG_LOCK();
630 		if (NM_NA_VALID(ifp)) {
631 			int update_err = nm_update_info(nmr, NA(ifp));
632 			if (update_err)
633 				error = update_err;
634 		}
635 		NMG_UNLOCK();
636 		if_rele(ifp);
637 		return error;
638 	}
639 	error = nm_os_vi_persist(nmr->nr_name, &ifp);
640 	if (error)
641 		return error;
642 
643 	NMG_LOCK();
644 	if (nmr->nr_arg2) {
645 		nmd = netmap_mem_find(nmr->nr_arg2);
646 		if (nmd == NULL) {
647 			error = EINVAL;
648 			goto err_1;
649 		}
650 	}
651 	/* netmap_vp_create creates a struct netmap_vp_adapter */
652 	error = netmap_vp_create(nmr, ifp, nmd, &vpna);
653 	if (error) {
654 		D("error %d", error);
655 		goto err_1;
656 	}
657 	/* persist-specific routines */
658 	vpna->up.nm_bdg_ctl = netmap_vp_bdg_ctl;
659 	if (!autodelete) {
660 		netmap_adapter_get(&vpna->up);
661 	} else {
662 		vpna->autodelete = 1;
663 	}
664 	NM_ATTACH_NA(ifp, &vpna->up);
665 	/* return the updated info */
666 	error = nm_update_info(nmr, &vpna->up);
667 	if (error) {
668 		goto err_2;
669 	}
670 	D("returning nr_arg2 %d", nmr->nr_arg2);
671 	if (nmd)
672 		netmap_mem_put(nmd);
673 	NMG_UNLOCK();
674 	D("created %s", ifp->if_xname);
675 	return 0;
676 
677 err_2:
678 	netmap_detach(ifp);
679 err_1:
680 	if (nmd)
681 		netmap_mem_put(nmd);
682 	NMG_UNLOCK();
683 	nm_os_vi_detach(ifp);
684 
685 	return error;
686 }
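/*
 * A minimal userspace sketch (assuming the legacy nmreq ioctl API and
 * an fd open on /dev/netmap): create a persistent VALE port "foo0",
 * then destroy it.
 *
 *	struct nmreq nmr;
 *
 *	memset(&nmr, 0, sizeof(nmr));
 *	nmr.nr_version = NETMAP_API;
 *	strncpy(nmr.nr_name, "foo0", sizeof(nmr.nr_name) - 1);
 *	nmr.nr_cmd = NETMAP_BDG_NEWIF;
 *	ioctl(fd, NIOCREGIF, &nmr);	// ends up in netmap_vi_create()
 *	...
 *	nmr.nr_cmd = NETMAP_BDG_DELIF;
 *	ioctl(fd, NIOCREGIF, &nmr);	// ends up in nm_vi_destroy()
 */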
687 
688 /* Try to get a reference to a netmap adapter attached to a VALE switch.
689  * If the adapter is found (or is created), this function returns 0, a
690  * non NULL pointer is returned into *na, and the caller holds a
691  * reference to the adapter.
692  * If an adapter is not found, then no reference is grabbed and the
693  * function returns an error code, or 0 if there is just a VALE prefix
694  * mismatch. Therefore the caller holds a reference when
695  * (*na != NULL && return == 0).
696  */
697 int
698 netmap_get_bdg_na(struct nmreq *nmr, struct netmap_adapter **na,
699 		struct netmap_mem_d *nmd, int create)
700 {
701 	char *nr_name = nmr->nr_name;
702 	const char *ifname;
703 	struct ifnet *ifp = NULL;
704 	int error = 0;
705 	struct netmap_vp_adapter *vpna, *hostna = NULL;
706 	struct nm_bridge *b;
707 	int i, j, cand = -1, cand2 = -1;
708 	int needed;
709 
710 	*na = NULL;     /* default return value */
711 
712 	/* first try to see if this is a bridge port. */
713 	NMG_LOCK_ASSERT();
714 	if (strncmp(nr_name, NM_BDG_NAME, sizeof(NM_BDG_NAME) - 1)) {
715 		return 0;  /* no error, but no VALE prefix */
716 	}
717 
718 	b = nm_find_bridge(nr_name, create);
719 	if (b == NULL) {
720 		D("no bridges available for '%s'", nr_name);
721 		return (create ? ENOMEM : ENXIO);
722 	}
723 	if (strlen(nr_name) < b->bdg_namelen) /* impossible */
724 		panic("bridge name shorter than its own prefix");
725 
726 	/* Now we are sure that name starts with the bridge's name,
727 	 * lookup the port in the bridge. We need to scan the entire
728 	 * list. It is not important to hold a WLOCK on the bridge
729 	 * during the search because NMG_LOCK already guarantees
730 	 * that there are no other possible writers.
731 	 */
732 
733 	/* lookup in the local list of ports */
734 	for (j = 0; j < b->bdg_active_ports; j++) {
735 		i = b->bdg_port_index[j];
736 		vpna = b->bdg_ports[i];
737 		// KASSERT(na != NULL);
738 		ND("checking %s", vpna->up.name);
739 		if (!strcmp(vpna->up.name, nr_name)) {
740 			netmap_adapter_get(&vpna->up);
741 			ND("found existing if %s refs %d", nr_name,
742 				vpna->up.na_refcount);
742 			*na = &vpna->up;
743 			return 0;
744 		}
745 	}
746 	/* not found, should we create it? */
747 	if (!create)
748 		return ENXIO;
749 	/* yes we should, see if we have space to attach entries */
750 	needed = 2; /* in some cases we only need 1 */
751 	if (b->bdg_active_ports + needed >= NM_BDG_MAXPORTS) {
752 		D("bridge full %d, cannot create new port", b->bdg_active_ports);
753 		return ENOMEM;
754 	}
755 	/* record the next two ports available, but do not allocate yet */
756 	cand = b->bdg_port_index[b->bdg_active_ports];
757 	cand2 = b->bdg_port_index[b->bdg_active_ports + 1];
758 	ND("+++ bridge %s port %s used %d avail %d %d",
759 		b->bdg_basename, ifname, b->bdg_active_ports, cand, cand2);
760 
761 	/*
762 	 * try to see if there is a matching NIC with this name
763 	 * (after the bridge's name)
764 	 */
765 	ifname = nr_name + b->bdg_namelen + 1;
766 	ifp = ifunit_ref(ifname);
767 	if (!ifp) {
768 		/* Create an ephemeral virtual port
769 		 * This block contains all the ephemeral-specific logic
770 		 */
771 		if (nmr->nr_cmd) {
772 			/* nr_cmd must be 0 for a virtual port */
773 			error = EINVAL;
774 			goto out;
775 		}
776 
777 		/* netmap_vp_create creates a struct netmap_vp_adapter */
778 		error = netmap_vp_create(nmr, NULL, nmd, &vpna);
779 		if (error) {
780 			D("error %d", error);
781 			goto out;
782 		}
783 		/* shortcut - we can skip get_hw_na(),
784 		 * ownership check and nm_bdg_attach()
785 		 */
786 	} else {
787 		struct netmap_adapter *hw;
788 
789 		error = netmap_get_hw_na(ifp, nmd, &hw);
790 		if (error || hw == NULL)
791 			goto out;
792 
793 		/* host adapter might not be created */
794 		error = hw->nm_bdg_attach(nr_name, hw);
795 		if (error)
796 			goto out;
797 		vpna = hw->na_vp;
798 		hostna = hw->na_hostvp;
799 		if (nmr->nr_arg1 != NETMAP_BDG_HOST)
800 			hostna = NULL;
801 	}
802 
803 	BDG_WLOCK(b);
804 	vpna->bdg_port = cand;
805 	ND("NIC  %p to bridge port %d", vpna, cand);
806 	/* bind the port to the bridge (virtual ports are not active) */
807 	b->bdg_ports[cand] = vpna;
808 	vpna->na_bdg = b;
809 	b->bdg_active_ports++;
810 	if (hostna != NULL) {
811 		/* also bind the host stack to the bridge */
812 		b->bdg_ports[cand2] = hostna;
813 		hostna->bdg_port = cand2;
814 		hostna->na_bdg = b;
815 		b->bdg_active_ports++;
816 		ND("host %p to bridge port %d", hostna, cand2);
817 	}
818 	ND("if %s refs %d", ifname, vpna->up.na_refcount);
819 	BDG_WUNLOCK(b);
820 	*na = &vpna->up;
821 	netmap_adapter_get(*na);
822 
823 out:
824 	if (ifp)
825 		if_rele(ifp);
826 
827 	return error;
828 }
829 
830 
831 /* Process NETMAP_BDG_ATTACH */
832 static int
833 nm_bdg_ctl_attach(struct nmreq *nmr)
834 {
835 	struct netmap_adapter *na;
836 	struct netmap_mem_d *nmd = NULL;
837 	int error;
838 
839 	NMG_LOCK();
840 
841 	if (nmr->nr_arg2) {
842 		nmd = netmap_mem_find(nmr->nr_arg2);
843 		if (nmd == NULL) {
844 			error = EINVAL;
845 			goto unlock_exit;
846 		}
847 	}
848 
849 	error = netmap_get_bdg_na(nmr, &na, nmd, 1 /* create if not exists */);
850 	if (error) /* no device */
851 		goto unlock_exit;
852 
853 	if (na == NULL) { /* VALE prefix missing */
854 		error = EINVAL;
855 		goto unlock_exit;
856 	}
857 
858 	if (NETMAP_OWNED_BY_ANY(na)) {
859 		error = EBUSY;
860 		goto unref_exit;
861 	}
862 
863 	if (na->nm_bdg_ctl) {
864 		/* nop for VALE ports. The bwrap needs to put the hwna
865 		 * in netmap mode (see netmap_bwrap_bdg_ctl)
866 		 */
867 		error = na->nm_bdg_ctl(na, nmr, 1);
868 		if (error)
869 			goto unref_exit;
870 		ND("registered %s to netmap-mode", na->name);
871 	}
872 	NMG_UNLOCK();
873 	return 0;
874 
875 unref_exit:
876 	netmap_adapter_put(na);
877 unlock_exit:
878 	NMG_UNLOCK();
879 	return error;
880 }
881 
882 static inline int
883 nm_is_bwrap(struct netmap_adapter *na)
884 {
885 	return na->nm_register == netmap_bwrap_reg;
886 }
887 
888 /* process NETMAP_BDG_DETACH */
889 static int
890 nm_bdg_ctl_detach(struct nmreq *nmr)
891 {
892 	struct netmap_adapter *na;
893 	int error;
894 
895 	NMG_LOCK();
896 	error = netmap_get_bdg_na(nmr, &na, NULL, 0 /* don't create */);
897 	if (error) { /* no device, or another bridge or user owns the device */
898 		goto unlock_exit;
899 	}
900 
901 	if (na == NULL) { /* VALE prefix missing */
902 		error = EINVAL;
903 		goto unlock_exit;
904 	} else if (nm_is_bwrap(na) &&
905 		   ((struct netmap_bwrap_adapter *)na)->na_polling_state) {
906 		/* Don't detach a NIC with polling */
907 		error = EBUSY;
908 		netmap_adapter_put(na);
909 		goto unlock_exit;
910 	}
911 	if (na->nm_bdg_ctl) {
912 		/* remove the port from bridge. The bwrap
913 		 * also needs to put the hwna in normal mode
914 		 */
915 		error = na->nm_bdg_ctl(na, nmr, 0);
916 	}
917 
918 	netmap_adapter_put(na);
919 unlock_exit:
920 	NMG_UNLOCK();
921 	return error;
923 }
924 
925 struct nm_bdg_polling_state;
926 struct nm_bdg_kthread {
928 	struct nm_kctx *nmk;
929 	u_int qfirst;
930 	u_int qlast;
931 	struct nm_bdg_polling_state *bps;
932 };
933 
934 struct nm_bdg_polling_state {
935 	bool configured;
936 	bool stopped;
937 	struct netmap_bwrap_adapter *bna;
938 	u_int reg;
939 	u_int qfirst;
940 	u_int qlast;
941 	u_int cpu_from;
942 	u_int ncpus;
943 	struct nm_bdg_kthread *kthreads;
944 };
945 
946 static void
947 netmap_bwrap_polling(void *data, int is_kthread)
948 {
949 	struct nm_bdg_kthread *nbk = data;
950 	struct netmap_bwrap_adapter *bna;
951 	u_int qfirst, qlast, i;
952 	struct netmap_kring *kring0, *kring;
953 
954 	if (!nbk)
955 		return;
956 	qfirst = nbk->qfirst;
957 	qlast = nbk->qlast;
958 	bna = nbk->bps->bna;
959 	kring0 = NMR(bna->hwna, NR_RX);
960 
961 	for (i = qfirst; i < qlast; i++) {
962 		kring = kring0 + i;
963 		kring->nm_notify(kring, 0);
964 	}
965 }
966 
967 static int
968 nm_bdg_create_kthreads(struct nm_bdg_polling_state *bps)
969 {
970 	struct nm_kctx_cfg kcfg;
971 	int i, j;
972 
973 	bps->kthreads = nm_os_malloc(sizeof(struct nm_bdg_kthread) * bps->ncpus);
974 	if (bps->kthreads == NULL)
975 		return ENOMEM;
976 
977 	bzero(&kcfg, sizeof(kcfg));
978 	kcfg.worker_fn = netmap_bwrap_polling;
979 	kcfg.use_kthread = 1;
980 	for (i = 0; i < bps->ncpus; i++) {
981 		struct nm_bdg_kthread *t = bps->kthreads + i;
982 		int all = (bps->ncpus == 1 && bps->reg == NR_REG_ALL_NIC);
983 		int affinity = bps->cpu_from + i;
984 
985 		t->bps = bps;
986 		t->qfirst = all ? bps->qfirst /* must be 0 */: affinity;
987 		t->qlast = all ? bps->qlast : t->qfirst + 1;
988 		D("kthread %d a:%u qf:%u ql:%u", i, affinity, t->qfirst,
989 			t->qlast);
990 
991 		kcfg.type = i;
992 		kcfg.worker_private = t;
993 		t->nmk = nm_os_kctx_create(&kcfg, 0, NULL);
994 		if (t->nmk == NULL) {
995 			goto cleanup;
996 		}
997 		nm_os_kctx_worker_setaff(t->nmk, affinity);
998 	}
999 	return 0;
1000 
1001 cleanup:
1002 	for (j = 0; j < i; j++) {
1003 		struct nm_bdg_kthread *t = bps->kthreads + j;
1004 		nm_os_kctx_destroy(t->nmk);
1005 	}
1006 	nm_os_free(bps->kthreads);
1007 	return EFAULT;
1008 }
1009 
1010 /* A variant of ptnetmap_start_kthreads() */
1011 static int
1012 nm_bdg_polling_start_kthreads(struct nm_bdg_polling_state *bps)
1013 {
1014 	int error, i, j;
1015 
1016 	if (!bps) {
1017 		D("polling is not configured");
1018 		return EFAULT;
1019 	}
1020 	bps->stopped = false;
1021 
1022 	for (i = 0; i < bps->ncpus; i++) {
1023 		struct nm_bdg_kthread *t = bps->kthreads + i;
1024 		error = nm_os_kctx_worker_start(t->nmk);
1025 		if (error) {
1026 			D("error in nm_os_kctx_worker_start()");
1027 			goto cleanup;
1028 		}
1029 	}
1030 	return 0;
1031 
1032 cleanup:
1033 	for (j = 0; j < i; j++) {
1034 		struct nm_bdg_kthread *t = bps->kthreads + j;
1035 		nm_os_kctx_worker_stop(t->nmk);
1036 	}
1037 	bps->stopped = true;
1038 	return error;
1039 }
1040 
1041 static void
1042 nm_bdg_polling_stop_delete_kthreads(struct nm_bdg_polling_state *bps)
1043 {
1044 	int i;
1045 
1046 	if (!bps)
1047 		return;
1048 
1049 	for (i = 0; i < bps->ncpus; i++) {
1050 		struct nm_bdg_kthread *t = bps->kthreads + i;
1051 		nm_os_kctx_worker_stop(t->nmk);
1052 		nm_os_kctx_destroy(t->nmk);
1053 	}
1054 	bps->stopped = true;
1055 }
1056 
1057 static int
1058 get_polling_cfg(struct nmreq *nmr, struct netmap_adapter *na,
1059 			struct nm_bdg_polling_state *bps)
1060 {
1061 	int req_cpus, avail_cpus, core_from;
1062 	u_int reg, i, qfirst, qlast;
1063 
1064 	avail_cpus = nm_os_ncpus();
1065 	req_cpus = nmr->nr_arg1;
1066 
1067 	if (req_cpus == 0) {
1068 		D("req_cpus must be > 0");
1069 		return EINVAL;
1070 	} else if (req_cpus >= avail_cpus) {
1071 		D("for safety, we need at least one core left in the system");
1072 		return EINVAL;
1073 	}
1074 	reg = nmr->nr_flags & NR_REG_MASK;
1075 	i = nmr->nr_ringid & NETMAP_RING_MASK;
1076 	/*
1077 	 * ONE_NIC: dedicate one core to one ring. If multiple cores
1078 	 *          are specified, consecutive rings are also polled.
1079 	 *          For example, if ringid=2 and 2 cores are given,
1080 	 *          ring 2 and 3 are polled by core 2 and 3, respectively.
1081 	 * ALL_NIC: poll all the rings using a core specified by ringid.
1082 	 *          the number of cores must be 1.
1083 	 */
1084 	if (reg == NR_REG_ONE_NIC) {
1085 		if (i + req_cpus > nma_get_nrings(na, NR_RX)) {
1086 			D("only %d rings exist (ring %u-%u is given)",
1087 				nma_get_nrings(na, NR_RX), i, i+req_cpus);
1088 			return EINVAL;
1089 		}
1090 		qfirst = i;
1091 		qlast = qfirst + req_cpus;
1092 		core_from = qfirst;
1093 	} else if (reg == NR_REG_ALL_NIC) {
1094 		if (req_cpus != 1) {
1095 			D("ncpus must be 1 not %d for REG_ALL_NIC", req_cpus);
1096 			return EINVAL;
1097 		}
1098 		qfirst = 0;
1099 		qlast = nma_get_nrings(na, NR_RX);
1100 		core_from = i;
1101 	} else {
1102 		D("reg must be ALL_NIC or ONE_NIC");
1103 		return EINVAL;
1104 	}
1105 
1106 	bps->reg = reg;
1107 	bps->qfirst = qfirst;
1108 	bps->qlast = qlast;
1109 	bps->cpu_from = core_from;
1110 	bps->ncpus = req_cpus;
1111 	D("%s qfirst %u qlast %u cpu_from %u ncpus %u",
1112 		reg == NR_REG_ALL_NIC ? "REG_ALL_NIC" : "REG_ONE_NIC",
1113 		qfirst, qlast, core_from, req_cpus);
1114 	return 0;
1115 }
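/*
 * A request sketch matching the ONE_NIC example above (illustrative
 * values, on a zeroed struct nmreq with nr_version = NETMAP_API):
 * poll rings 2 and 3 of the wrapped NIC with cores 2 and 3.
 *
 *	// nmr.nr_name is the attached port, e.g. "vale0:em0"
 *	nmr.nr_cmd = NETMAP_BDG_POLLING_ON;
 *	nmr.nr_flags = NR_REG_ONE_NIC;
 *	nmr.nr_ringid = 2;	// first ring, also the first core
 *	nmr.nr_arg1 = 2;	// number of cores (= rings polled)
 */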
1116 
1117 static int
1118 nm_bdg_ctl_polling_start(struct nmreq *nmr, struct netmap_adapter *na)
1119 {
1120 	struct nm_bdg_polling_state *bps;
1121 	struct netmap_bwrap_adapter *bna;
1122 	int error;
1123 
1124 	bna = (struct netmap_bwrap_adapter *)na;
1125 	if (bna->na_polling_state) {
1126 		D("ERROR adapter already in polling mode");
1127 		return EFAULT;
1128 	}
1129 
1130 	bps = nm_os_malloc(sizeof(*bps));
1131 	if (!bps)
1132 		return ENOMEM;
1133 	bps->configured = false;
1134 	bps->stopped = true;
1135 
1136 	if (get_polling_cfg(nmr, na, bps)) {
1137 		nm_os_free(bps);
1138 		return EINVAL;
1139 	}
1140 
1141 	if (nm_bdg_create_kthreads(bps)) {
1142 		nm_os_free(bps);
1143 		return EFAULT;
1144 	}
1145 
1146 	bps->configured = true;
1147 	bna->na_polling_state = bps;
1148 	bps->bna = bna;
1149 
1150 	/* disable interrupt if possible */
1151 	if (bna->hwna->nm_intr)
1152 		bna->hwna->nm_intr(bna->hwna, 0);
1153 	/* start kthread now */
1154 	error = nm_bdg_polling_start_kthreads(bps);
1155 	if (error) {
1156 		D("ERROR nm_bdg_polling_start_kthreads()");
1157 		nm_os_free(bps->kthreads);
1158 		nm_os_free(bps);
1159 		bna->na_polling_state = NULL;
1160 		if (bna->hwna->nm_intr)
1161 			bna->hwna->nm_intr(bna->hwna, 1);
1162 	}
1163 	return error;
1164 }
1165 
1166 static int
1167 nm_bdg_ctl_polling_stop(struct nmreq *nmr, struct netmap_adapter *na)
1168 {
1169 	struct netmap_bwrap_adapter *bna = (struct netmap_bwrap_adapter *)na;
1170 	struct nm_bdg_polling_state *bps;
1171 
1172 	if (!bna->na_polling_state) {
1173 		D("ERROR adapter is not in polling mode");
1174 		return EFAULT;
1175 	}
1176 	bps = bna->na_polling_state;
1177 	nm_bdg_polling_stop_delete_kthreads(bna->na_polling_state);
1178 	bps->configured = false;
1179 	nm_os_free(bps->kthreads);	/* allocated in nm_bdg_create_kthreads() */
1180 	nm_os_free(bps);
1180 	bna->na_polling_state = NULL;
1181 	/* reenable interrupt */
1182 	if (bna->hwna->nm_intr)
1183 		bna->hwna->nm_intr(bna->hwna, 1);
1184 	return 0;
1185 }
1186 
1187 /* Called by either user's context (netmap_ioctl())
1188  * or external kernel modules (e.g., Open vSwitch).
1189  * Operation is indicated in nmr->nr_cmd.
1190  * NETMAP_BDG_REGOPS, which sets the configure/lookup/dtor functions of a
1191  * bridge, requires the bdg_ops argument; the other commands ignore it.
1192  *
1193  * Called without NMG_LOCK.
1194  */
1195 int
1196 netmap_bdg_ctl(struct nmreq *nmr, struct netmap_bdg_ops *bdg_ops)
1197 {
1198 	struct nm_bridge *b, *bridges;
1199 	struct netmap_adapter *na;
1200 	struct netmap_vp_adapter *vpna;
1201 	char *name = nmr->nr_name;
1202 	int cmd = nmr->nr_cmd, namelen = strlen(name);
1203 	int error = 0, i, j;
1204 	u_int num_bridges;
1205 
1206 	netmap_bns_getbridges(&bridges, &num_bridges);
1207 
1208 	switch (cmd) {
1209 	case NETMAP_BDG_NEWIF:
1210 		error = netmap_vi_create(nmr, 0 /* no autodelete */);
1211 		break;
1212 
1213 	case NETMAP_BDG_DELIF:
1214 		error = nm_vi_destroy(nmr->nr_name);
1215 		break;
1216 
1217 	case NETMAP_BDG_ATTACH:
1218 		error = nm_bdg_ctl_attach(nmr);
1219 		break;
1220 
1221 	case NETMAP_BDG_DETACH:
1222 		error = nm_bdg_ctl_detach(nmr);
1223 		break;
1224 
1225 	case NETMAP_BDG_LIST:
1226 		/* this is used to enumerate bridges and ports */
1227 		if (namelen) { /* look up indexes of bridge and port */
1228 			if (strncmp(name, NM_BDG_NAME, strlen(NM_BDG_NAME))) {
1229 				error = EINVAL;
1230 				break;
1231 			}
1232 			NMG_LOCK();
1233 			b = nm_find_bridge(name, 0 /* don't create */);
1234 			if (!b) {
1235 				error = ENOENT;
1236 				NMG_UNLOCK();
1237 				break;
1238 			}
1239 
1240 			error = 0;
1241 			nmr->nr_arg1 = b - bridges; /* bridge index */
1242 			nmr->nr_arg2 = NM_BDG_NOPORT;
1243 			for (j = 0; j < b->bdg_active_ports; j++) {
1244 				i = b->bdg_port_index[j];
1245 				vpna = b->bdg_ports[i];
1246 				if (vpna == NULL) {
1247 					D("inconsistent bridge state: port %d is NULL", i);
1248 					continue;
1249 				}
1250 				/* a match identifies either a virtual
1251 				 * port or a NIC attached to the bridge
1252 				 */
1253 				if (!strcmp(vpna->up.name, name)) {
1254 					nmr->nr_arg2 = i; /* port index */
1255 					break;
1256 				}
1257 			}
1258 			NMG_UNLOCK();
1259 		} else {
1260 			/* return the first non-empty entry starting from
1261 			 * bridge nr_arg1 and port nr_arg2.
1262 			 *
1263 			 * Users can detect the end of the same bridge by
1264 			 * seeing the new and old value of nr_arg1, and can
1265 			 * detect the end of all the bridges by error != 0
1266 			 */
1267 			i = nmr->nr_arg1;
1268 			j = nmr->nr_arg2;
1269 
1270 			NMG_LOCK();
1271 			for (error = ENOENT; i < NM_BRIDGES; i++) {
1272 				b = bridges + i;
1273 				for ( ; j < NM_BDG_MAXPORTS; j++) {
1274 					if (b->bdg_ports[j] == NULL)
1275 						continue;
1276 					vpna = b->bdg_ports[j];
1277 					strncpy(name, vpna->up.name, (size_t)IFNAMSIZ);
1278 					error = 0;
1279 					goto out;
1280 				}
1281 				j = 0; /* following bridges scan from 0 */
1282 			}
1283 		out:
1284 			nmr->nr_arg1 = i;
1285 			nmr->nr_arg2 = j;
1286 			NMG_UNLOCK();
1287 		}
1288 		break;
1289 
1290 	case NETMAP_BDG_REGOPS: /* XXX this should not be available from userspace */
1291 		/* register callbacks to the given bridge.
1292 		 * nmr->nr_name may be just bridge's name (including ':'
1293 		 * if it is not just NM_NAME).
1294 		 */
1295 		if (!bdg_ops) {
1296 			error = EINVAL;
1297 			break;
1298 		}
1299 		NMG_LOCK();
1300 		b = nm_find_bridge(name, 0 /* don't create */);
1301 		if (!b) {
1302 			error = EINVAL;
1303 		} else {
1304 			b->bdg_ops = *bdg_ops;
1305 		}
1306 		NMG_UNLOCK();
1307 		break;
1308 
1309 	case NETMAP_BDG_VNET_HDR:
1310 		/* Valid lengths for the virtio-net header are 0 (no header),
1311 		   10 and 12. */
1312 		if (nmr->nr_arg1 != 0 &&
1313 			nmr->nr_arg1 != sizeof(struct nm_vnet_hdr) &&
1314 				nmr->nr_arg1 != 12) {
1315 			error = EINVAL;
1316 			break;
1317 		}
1318 		NMG_LOCK();
1319 		error = netmap_get_bdg_na(nmr, &na, NULL, 0);
1320 		if (na && !error) {
1321 			vpna = (struct netmap_vp_adapter *)na;
1322 			na->virt_hdr_len = nmr->nr_arg1;
1323 			if (na->virt_hdr_len) {
1324 				vpna->mfs = NETMAP_BUF_SIZE(na);
1325 			}
1326 			D("Using vnet_hdr_len %d for %p", na->virt_hdr_len, na);
1327 			netmap_adapter_put(na);
1328 		} else if (!na) {
1329 			error = ENXIO;
1330 		}
1331 		NMG_UNLOCK();
1332 		break;
1333 
1334 	case NETMAP_BDG_POLLING_ON:
1335 	case NETMAP_BDG_POLLING_OFF:
1336 		NMG_LOCK();
1337 		error = netmap_get_bdg_na(nmr, &na, NULL, 0);
1338 		if (na && !error) {
1339 			if (!nm_is_bwrap(na)) {
1340 				error = EOPNOTSUPP;
1341 			} else if (cmd == NETMAP_BDG_POLLING_ON) {
1342 				error = nm_bdg_ctl_polling_start(nmr, na);
1343 				if (!error)
1344 					netmap_adapter_get(na);
1345 			} else {
1346 				error = nm_bdg_ctl_polling_stop(nmr, na);
1347 				if (!error)
1348 					netmap_adapter_put(na);
1349 			}
1350 			netmap_adapter_put(na);
1351 		}
1352 		NMG_UNLOCK();
1353 		break;
1354 
1355 	default:
1356 		D("invalid cmd (nmr->nr_cmd) (0x%x)", cmd);
1357 		error = EINVAL;
1358 		break;
1359 	}
1360 	return error;
1361 }
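/*
 * A userspace sketch of the NETMAP_BDG_LIST enumeration protocol
 * described above (assuming the legacy nmreq API): walk every bridge
 * and port until the ioctl fails.
 *
 *	memset(&nmr, 0, sizeof(nmr));
 *	nmr.nr_version = NETMAP_API;
 *	nmr.nr_cmd = NETMAP_BDG_LIST;	// nr_name empty: enumerate
 *	while (ioctl(fd, NIOCGINFO, &nmr) == 0) {
 *		printf("bridge %u port %u: %s\n",
 *		    nmr.nr_arg1, nmr.nr_arg2, nmr.nr_name);
 *		nmr.nr_name[0] = '\0';	// keep enumerating
 *		nmr.nr_arg2++;		// resume from the next port
 *	}
 */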
1362 
1363 int
1364 netmap_bdg_config(struct nmreq *nmr)
1365 {
1366 	struct nm_bridge *b;
1367 	int error = EINVAL;
1368 
1369 	NMG_LOCK();
1370 	b = nm_find_bridge(nmr->nr_name, 0);
1371 	if (!b) {
1372 		NMG_UNLOCK();
1373 		return error;
1374 	}
1375 	NMG_UNLOCK();
1376 	/* Don't call config() with NMG_LOCK() held */
1377 	BDG_RLOCK(b);
1378 	if (b->bdg_ops.config != NULL)
1379 		error = b->bdg_ops.config((struct nm_ifreq *)nmr);
1380 	BDG_RUNLOCK(b);
1381 	return error;
1382 }
1383 
1384 
1385 /* nm_krings_create callback for VALE ports.
1386  * Calls the standard netmap_krings_create, then adds leases on rx
1387  * rings and bdgfwd on tx rings.
1388  */
1389 static int
1390 netmap_vp_krings_create(struct netmap_adapter *na)
1391 {
1392 	u_int tailroom;
1393 	int error, i;
1394 	uint32_t *leases;
1395 	u_int nrx = netmap_real_rings(na, NR_RX);
1396 
1397 	/*
1398 	 * Leases are attached to RX rings on vale ports
1399 	 */
1400 	tailroom = sizeof(uint32_t) * na->num_rx_desc * nrx;
1401 
1402 	error = netmap_krings_create(na, tailroom);
1403 	if (error)
1404 		return error;
1405 
1406 	leases = na->tailroom;
1407 
1408 	for (i = 0; i < nrx; i++) { /* Receive rings */
1409 		na->rx_rings[i].nkr_leases = leases;
1410 		leases += na->num_rx_desc;
1411 	}
1412 
1413 	error = nm_alloc_bdgfwd(na);
1414 	if (error) {
1415 		netmap_krings_delete(na);
1416 		return error;
1417 	}
1418 
1419 	return 0;
1420 }
1421 
1422 
1423 /* nm_krings_delete callback for VALE ports. */
1424 static void
1425 netmap_vp_krings_delete(struct netmap_adapter *na)
1426 {
1427 	nm_free_bdgfwd(na);
1428 	netmap_krings_delete(na);
1429 }
1430 
1431 
1432 static int
1433 nm_bdg_flush(struct nm_bdg_fwd *ft, u_int n,
1434 	struct netmap_vp_adapter *na, u_int ring_nr);
1435 
1436 
1437 /*
1438  * main dispatch routine for the bridge.
1439  * Grab packets from a kring, move them into the ft structure
1440  * associated to the tx (input) port. Max one instance per port,
1441  * filtered on input (ioctl, poll or XXX).
1442  * Returns the next position in the ring.
1443  */
1444 static int
1445 nm_bdg_preflush(struct netmap_kring *kring, u_int end)
1446 {
1447 	struct netmap_vp_adapter *na =
1448 		(struct netmap_vp_adapter*)kring->na;
1449 	struct netmap_ring *ring = kring->ring;
1450 	struct nm_bdg_fwd *ft;
1451 	u_int ring_nr = kring->ring_id;
1452 	u_int j = kring->nr_hwcur, lim = kring->nkr_num_slots - 1;
1453 	u_int ft_i = 0;	/* start from 0 */
1454 	u_int frags = 1; /* how many frags ? */
1455 	struct nm_bridge *b = na->na_bdg;
1456 
1457 	/* To protect against modifications to the bridge we acquire a
1458 	 * shared lock, waiting if we can sleep (if the source port is
1459 	 * attached to a user process) or with a trylock otherwise (NICs).
1460 	 */
1461 	ND("wait rlock for %d packets", ((j > end ? lim+1 : 0) + end) - j);
1462 	if (na->up.na_flags & NAF_BDG_MAYSLEEP)
1463 		BDG_RLOCK(b);
1464 	else if (!BDG_RTRYLOCK(b))
1465 		return j;
1466 	ND(5, "rlock acquired for %d packets", ((j > end ? lim+1 : 0) + end) - j);
1467 	ft = kring->nkr_ft;
1468 
1469 	for (; likely(j != end); j = nm_next(j, lim)) {
1470 		struct netmap_slot *slot = &ring->slot[j];
1471 		char *buf;
1472 
1473 		ft[ft_i].ft_len = slot->len;
1474 		ft[ft_i].ft_flags = slot->flags;
1475 
1476 		ND("flags is 0x%x", slot->flags);
1477 		/* we do not use the buf changed flag, but we still need to reset it */
1478 		slot->flags &= ~NS_BUF_CHANGED;
1479 
1480 		/* this slot goes into a list so initialize the link field */
1481 		ft[ft_i].ft_next = NM_FT_NULL;
1482 		buf = ft[ft_i].ft_buf = (slot->flags & NS_INDIRECT) ?
1483 			(void *)(uintptr_t)slot->ptr : NMB(&na->up, slot);
1484 		if (unlikely(buf == NULL)) {
1485 			RD(5, "NULL %s buffer pointer from %s slot %d len %d",
1486 				(slot->flags & NS_INDIRECT) ? "INDIRECT" : "DIRECT",
1487 				kring->name, j, ft[ft_i].ft_len);
1488 			buf = ft[ft_i].ft_buf = NETMAP_BUF_BASE(&na->up);
1489 			ft[ft_i].ft_len = 0;
1490 			ft[ft_i].ft_flags = 0;
1491 		}
1492 		__builtin_prefetch(buf);
1493 		++ft_i;
1494 		if (slot->flags & NS_MOREFRAG) {
1495 			frags++;
1496 			continue;
1497 		}
1498 		if (unlikely(netmap_verbose && frags > 1))
1499 			RD(5, "%d frags at %d", frags, ft_i - frags);
1500 		ft[ft_i - frags].ft_frags = frags;
1501 		frags = 1;
1502 		if (unlikely((int)ft_i >= bridge_batch))
1503 			ft_i = nm_bdg_flush(ft, ft_i, na, ring_nr);
1504 	}
1505 	if (frags > 1) {
1506 		/* Here ft_i > 0, ft[ft_i-1].flags has NS_MOREFRAG, and we
1507 		 * have to fix frags count. */
1508 		frags--;
1509 		ft[ft_i - 1].ft_flags &= ~NS_MOREFRAG;
1510 		ft[ft_i - frags].ft_frags = frags;
1511 		D("Truncate incomplete fragment at %d (%d frags)", ft_i, frags);
1512 	}
1513 	if (ft_i)
1514 		ft_i = nm_bdg_flush(ft, ft_i, na, ring_nr);
1515 	BDG_RUNLOCK(b);
1516 	return j;
1517 }
1518 
1519 
1520 /* ----- FreeBSD if_bridge hash function ------- */
1521 
1522 /*
1523  * The following hash function is adapted from "Hash Functions" by Bob Jenkins
1524  * ("Algorithm Alley", Dr. Dobbs Journal, September 1997).
1525  *
1526  * http://www.burtleburtle.net/bob/hash/spooky.html
1527  */
1528 #define mix(a, b, c)                                                    \
1529 do {                                                                    \
1530         a -= b; a -= c; a ^= (c >> 13);                                 \
1531         b -= c; b -= a; b ^= (a << 8);                                  \
1532         c -= a; c -= b; c ^= (b >> 13);                                 \
1533         a -= b; a -= c; a ^= (c >> 12);                                 \
1534         b -= c; b -= a; b ^= (a << 16);                                 \
1535         c -= a; c -= b; c ^= (b >> 5);                                  \
1536         a -= b; a -= c; a ^= (c >> 3);                                  \
1537         b -= c; b -= a; b ^= (a << 10);                                 \
1538         c -= a; c -= b; c ^= (b >> 15);                                 \
1539 } while (/*CONSTCOND*/0)
1540 
1541 
1542 static __inline uint32_t
1543 nm_bridge_rthash(const uint8_t *addr)
1544 {
1545         uint32_t a = 0x9e3779b9, b = 0x9e3779b9, c = 0; // hash key
1546 
1547         b += addr[5] << 8;
1548         b += addr[4];
1549         a += addr[3] << 24;
1550         a += addr[2] << 16;
1551         a += addr[1] << 8;
1552         a += addr[0];
1553 
1554         mix(a, b, c);
1555 #define BRIDGE_RTHASH_MASK	(NM_BDG_HASH-1)
1556         return (c & BRIDGE_RTHASH_MASK);
1557 }
1558 
1559 #undef mix
1560 
1561 
1562 /* nm_register callback for VALE ports */
1563 static int
1564 netmap_vp_reg(struct netmap_adapter *na, int onoff)
1565 {
1566 	struct netmap_vp_adapter *vpna =
1567 		(struct netmap_vp_adapter*)na;
1568 	enum txrx t;
1569 	int i;
1570 
1571 	/* persistent ports may be put in netmap mode
1572 	 * before being attached to a bridge
1573 	 */
1574 	if (vpna->na_bdg)
1575 		BDG_WLOCK(vpna->na_bdg);
1576 	if (onoff) {
1577 		for_rx_tx(t) {
1578 			for (i = 0; i < nma_get_nrings(na, t) + 1; i++) {
1579 				struct netmap_kring *kring = &NMR(na, t)[i];
1580 
1581 				if (nm_kring_pending_on(kring))
1582 					kring->nr_mode = NKR_NETMAP_ON;
1583 			}
1584 		}
1585 		if (na->active_fds == 0)
1586 			na->na_flags |= NAF_NETMAP_ON;
1587 		 /* XXX on FreeBSD, persistent VALE ports should also
1588 		 * toggle IFCAP_NETMAP in na->ifp (2014-03-16)
1589 		 */
1590 	} else {
1591 		if (na->active_fds == 0)
1592 			na->na_flags &= ~NAF_NETMAP_ON;
1593 		for_rx_tx(t) {
1594 			for (i = 0; i < nma_get_nrings(na, t) + 1; i++) {
1595 				struct netmap_kring *kring = &NMR(na, t)[i];
1596 
1597 				if (nm_kring_pending_off(kring))
1598 					kring->nr_mode = NKR_NETMAP_OFF;
1599 			}
1600 		}
1601 	}
1602 	if (vpna->na_bdg)
1603 		BDG_WUNLOCK(vpna->na_bdg);
1604 	return 0;
1605 }
1606 
1607 
1608 /*
1609  * Lookup function for a learning bridge.
1610  * It updates the hash table with the source address,
1611  * then returns the destination port index and the
1612  * ring in *dst_ring (at the moment, always ring 0).
1613  */
1614 u_int
1615 netmap_bdg_learning(struct nm_bdg_fwd *ft, uint8_t *dst_ring,
1616 		struct netmap_vp_adapter *na)
1617 {
1618 	uint8_t *buf = ft->ft_buf;
1619 	u_int buf_len = ft->ft_len;
1620 	struct nm_hash_ent *ht = na->na_bdg->ht;
1621 	uint32_t sh, dh;
1622 	u_int dst, mysrc = na->bdg_port;
1623 	uint64_t smac, dmac;
1624 	uint8_t indbuf[12];
1625 
1626 	/* safety check, unfortunately we have many cases */
1627 	if (buf_len >= 14 + na->up.virt_hdr_len) {
1628 		/* virthdr + mac_hdr in the same slot */
1629 		buf += na->up.virt_hdr_len;
1630 		buf_len -= na->up.virt_hdr_len;
1631 	} else if (buf_len == na->up.virt_hdr_len && ft->ft_flags & NS_MOREFRAG) {
1632 		/* only header in first fragment */
1633 		ft++;
1634 		buf = ft->ft_buf;
1635 		buf_len = ft->ft_len;
1636 	} else {
1637 		RD(5, "invalid buf format, length %d", buf_len);
1638 		return NM_BDG_NOPORT;
1639 	}
1640 
1641 	if (ft->ft_flags & NS_INDIRECT) {
1642 		if (copyin(buf, indbuf, sizeof(indbuf))) {
1643 			return NM_BDG_NOPORT;
1644 		}
1645 		buf = indbuf;
1646 	}
1647 
1648 	dmac = le64toh(*(uint64_t *)(buf)) & 0xffffffffffff;
1649 	smac = le64toh(*(uint64_t *)(buf + 4));
1650 	smac >>= 16;
1651 
1652 	/*
1653 	 * The hash is somewhat expensive, there might be some
1654 	 * worthwhile optimizations here.
1655 	 */
1656 	if (((buf[6] & 1) == 0) && (na->last_smac != smac)) { /* valid src */
1657 		uint8_t *s = buf+6;
1658 		sh = nm_bridge_rthash(s); // XXX hash of source
1659 		/* update source port forwarding entry */
1660 		na->last_smac = ht[sh].mac = smac;	/* XXX expire ? */
1661 		ht[sh].ports = mysrc;
1662 		if (netmap_verbose)
1663 		    D("src %02x:%02x:%02x:%02x:%02x:%02x on port %d",
1664 			s[0], s[1], s[2], s[3], s[4], s[5], mysrc);
1665 	}
1666 	dst = NM_BDG_BROADCAST;
1667 	if ((buf[0] & 1) == 0) { /* unicast */
1668 		dh = nm_bridge_rthash(buf); // XXX hash of dst
1669 		if (ht[dh].mac == dmac) {	/* found dst */
1670 			dst = ht[dh].ports;
1671 		}
1672 		/* XXX otherwise return NM_BDG_UNKNOWN ? */
1673 	}
1674 	return dst;
1675 }
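/*
 * A worked example (illustrative): a frame entering from port 2 with
 * unicast source MAC M installs ht[hash(M)] = { M, 2 }; a later frame
 * addressed to M, arriving on any port, is then forwarded to port 2,
 * while frames to unknown or multicast destinations are broadcast.
 */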
1676 
1677 
1678 /*
1679  * Available space in the ring. Only used in VALE code
1680  * and only with is_rx = 1
1681  */
1682 static inline uint32_t
1683 nm_kr_space(struct netmap_kring *k, int is_rx)
1684 {
1685 	int space;
1686 
1687 	if (is_rx) {
1688 		int busy = k->nkr_hwlease - k->nr_hwcur;
1689 		if (busy < 0)
1690 			busy += k->nkr_num_slots;
1691 		space = k->nkr_num_slots - 1 - busy;
1692 	} else {
1693 		/* XXX never used in this branch */
1694 		space = k->nr_hwtail - k->nkr_hwlease;
1695 		if (space < 0)
1696 			space += k->nkr_num_slots;
1697 	}
1698 #if 0
1699 	// sanity check
1700 	if (k->nkr_hwlease >= k->nkr_num_slots ||
1701 		k->nr_hwcur >= k->nkr_num_slots ||
1702 		k->nr_hwtail >= k->nkr_num_slots ||
1703 		busy < 0 ||
1704 		busy >= k->nkr_num_slots) {
1705 		D("invalid kring, cur %d tail %d lease %d lease_idx %d lim %d",
			k->nr_hwcur, k->nr_hwtail, k->nkr_hwlease,
1706 			k->nkr_lease_idx, k->nkr_num_slots);
1707 	}
1708 #endif
1709 	return space;
1710 }
1711 
1712 
1713 
1714 
1715 /* make a lease on the kring for N positions. return the
1716  * lease index
1717  * XXX only used in VALE code and with is_rx = 1
1718  */
1719 static inline uint32_t
1720 nm_kr_lease(struct netmap_kring *k, u_int n, int is_rx)
1721 {
1722 	uint32_t lim = k->nkr_num_slots - 1;
1723 	uint32_t lease_idx = k->nkr_lease_idx;
1724 
1725 	k->nkr_leases[lease_idx] = NR_NOSLOT;
1726 	k->nkr_lease_idx = nm_next(lease_idx, lim);
1727 
1728 	if (n > nm_kr_space(k, is_rx)) {
1729 		D("invalid request for %d slots", n);
1730 		panic("not enough slots in kring");
1731 	}
1732 	/* XXX verify that there are n slots */
1733 	k->nkr_hwlease += n;
1734 	if (k->nkr_hwlease > lim)
1735 		k->nkr_hwlease -= lim + 1;
1736 
1737 	if (k->nkr_hwlease >= k->nkr_num_slots ||
1738 		k->nr_hwcur >= k->nkr_num_slots ||
1739 		k->nr_hwtail >= k->nkr_num_slots ||
1740 		k->nkr_lease_idx >= k->nkr_num_slots) {
1741 		D("invalid kring %s, cur %d tail %d lease %d lease_idx %d lim %d",
1742 			k->na->name,
1743 			k->nr_hwcur, k->nr_hwtail, k->nkr_hwlease,
1744 			k->nkr_lease_idx, k->nkr_num_slots);
1745 	}
1746 	return lease_idx;
1747 }
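/*
 * A sketch of how the lease primitives above are used by the flush
 * routine below (see nm_bdg_flush() for the real code):
 *
 *	mtx_lock(&kring->q_lock);
 *	my_start = kring->nkr_hwlease;		// first reserved slot
 *	howmany = nm_kr_space(kring, 1);	// at most this many
 *	lease_idx = nm_kr_lease(kring, howmany, 1);
 *	mtx_unlock(&kring->q_lock);
 *
 *	... copy packets into the reserved slots, lock dropped ...
 *
 *	mtx_lock(&kring->q_lock);
 *	kring->nkr_leases[lease_idx] = j;	// report completion
 *	... advance nr_hwtail over completed leases, notify ...
 *	mtx_unlock(&kring->q_lock);
 */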
1748 
1749 /*
1750  *
1751  * This flush routine supports only unicast and broadcast traffic, but a
1752  * large number of ports, and lets us replace the learn and dispatch functions.
1753  */
1754 int
1755 nm_bdg_flush(struct nm_bdg_fwd *ft, u_int n, struct netmap_vp_adapter *na,
1756 		u_int ring_nr)
1757 {
1758 	struct nm_bdg_q *dst_ents, *brddst;
1759 	uint16_t num_dsts = 0, *dsts;
1760 	struct nm_bridge *b = na->na_bdg;
1761 	u_int i, me = na->bdg_port;
1762 
1763 	/*
1764 	 * The work area (pointed to by ft) is followed by an array of
1765 	 * pointers to queues, dst_ents; there are NM_BDG_MAXRINGS
1766 	 * queues per port plus one for the broadcast traffic.
1767 	 * Then we have an array of destination indexes.
1768 	 */
1769 	dst_ents = (struct nm_bdg_q *)(ft + NM_BDG_BATCH_MAX);
1770 	dsts = (uint16_t *)(dst_ents + NM_BDG_MAXPORTS * NM_BDG_MAXRINGS + 1);
1771 
1772 	/* first pass: find a destination for each packet in the batch */
1773 	for (i = 0; likely(i < n); i += ft[i].ft_frags) {
1774 		uint8_t dst_ring = ring_nr; /* default, same ring as origin */
1775 		uint16_t dst_port, d_i;
1776 		struct nm_bdg_q *d;
1777 
1778 		ND("slot %d frags %d", i, ft[i].ft_frags);
1779 		/* Drop the packet if the virtio-net header is neither contained in
1780 		   the first fragment nor at the very beginning of the second. */
1781 		if (unlikely(na->up.virt_hdr_len > ft[i].ft_len))
1782 			continue;
1783 		dst_port = b->bdg_ops.lookup(&ft[i], &dst_ring, na);
1784 		if (netmap_verbose > 255)
1785 			RD(5, "slot %d port %d -> %d", i, me, dst_port);
1786 		if (dst_port == NM_BDG_NOPORT)
1787 			continue; /* the lookup marked this packet to be dropped */
1788 		else if (unlikely(dst_port > NM_BDG_MAXPORTS))
1789 			continue;
1790 		else if (dst_port == NM_BDG_BROADCAST)
1791 			dst_ring = 0; /* broadcasts always go to ring 0 */
1792 		else if (unlikely(dst_port == me ||
1793 		    !b->bdg_ports[dst_port]))
1794 			continue;
1795 
1796 		/* get a position in the scratch pad */
1797 		d_i = dst_port * NM_BDG_MAXRINGS + dst_ring;
1798 		d = dst_ents + d_i;
1799 
1800 		/* append the first fragment to the list */
1801 		if (d->bq_head == NM_FT_NULL) { /* new destination */
1802 			d->bq_head = d->bq_tail = i;
1803 			/* remember this position to be scanned later */
1804 			if (dst_port != NM_BDG_BROADCAST)
1805 				dsts[num_dsts++] = d_i;
1806 		} else {
1807 			ft[d->bq_tail].ft_next = i;
1808 			d->bq_tail = i;
1809 		}
1810 		d->bq_len += ft[i].ft_frags;
1811 	}
1812 
1813 	/*
1814 	 * Broadcast traffic goes to ring 0 on all destinations.
1815 	 * So we need to add these rings to the list of ports to scan.
1816 	 * XXX at the moment we scan all NM_BDG_MAXPORTS ports, which is
1817 	 * expensive. We should keep a compact list of active destinations
1818 	 * so we could shorten this loop.
1819 	 */
1820 	brddst = dst_ents + NM_BDG_BROADCAST * NM_BDG_MAXRINGS;
1821 	if (brddst->bq_head != NM_FT_NULL) {
1822 		u_int j;
1823 		for (j = 0; likely(j < b->bdg_active_ports); j++) {
1824 			uint16_t d_i;
1825 			i = b->bdg_port_index[j];
1826 			if (unlikely(i == me))
1827 				continue;
1828 			d_i = i * NM_BDG_MAXRINGS;
1829 			if (dst_ents[d_i].bq_head == NM_FT_NULL)
1830 				dsts[num_dsts++] = d_i;
1831 		}
1832 	}
1833 
1834 	ND(5, "pass 1 done %d pkts %d dsts", n, num_dsts);
1835 	/* second pass: scan destinations */
1836 	for (i = 0; i < num_dsts; i++) {
1837 		struct netmap_vp_adapter *dst_na;
1838 		struct netmap_kring *kring;
1839 		struct netmap_ring *ring;
1840 		u_int dst_nr, lim, j, d_i, next, brd_next;
1841 		u_int needed, howmany;
1842 		int retry = netmap_txsync_retry;
1843 		struct nm_bdg_q *d;
1844 		uint32_t my_start = 0, lease_idx = 0;
1845 		int nrings;
1846 		int virt_hdr_mismatch = 0;
1847 
1848 		d_i = dsts[i];
1849 		ND("second pass %d port %d", i, d_i);
1850 		d = dst_ents + d_i;
1851 		// XXX fix the division
1852 		dst_na = b->bdg_ports[d_i/NM_BDG_MAXRINGS];
1853 		/* protect from the lookup function returning an inactive
1854 		 * destination port
1855 		 */
1856 		if (unlikely(dst_na == NULL))
1857 			goto cleanup;
1858 		if (dst_na->up.na_flags & NAF_SW_ONLY)
1859 			goto cleanup;
1860 		/*
1861 		 * The interface may be in !netmap mode in two cases:
1862 		 * - when na is attached but not activated yet;
1863 		 * - when na is being deactivated but is still attached.
1864 		 */
1865 		if (unlikely(!nm_netmap_on(&dst_na->up))) {
1866 			ND("not in netmap mode!");
1867 			goto cleanup;
1868 		}
1869 
1870 		/* there is at least one packet, either unicast or broadcast */
1871 		brd_next = brddst->bq_head;
1872 		next = d->bq_head;
1873 		/* we need to reserve this many slots. If fewer are
1874 		 * available, some packets will be dropped.
1875 		 * Packets may have multiple fragments, so there is a
1876 		 * chance that we will not use all of the slots we have
1877 		 * claimed; the leftover ones are handled when we regain
1878 		 * the lock.
1879 		 */
1880 		needed = d->bq_len + brddst->bq_len;
1881 
1882 		if (unlikely(dst_na->up.virt_hdr_len != na->up.virt_hdr_len)) {
1883 			if (netmap_verbose) {
1884 			    RD(3, "virt_hdr_mismatch, src %d dst %d", na->up.virt_hdr_len,
1885 				  dst_na->up.virt_hdr_len);
1886 			}
1887 			/* There is a virtio-net header/offloadings mismatch between
1888 			 * source and destination. The slower mismatch datapath will
1889 			 * be used to cope with all the mismatches.
1890 			 */
1891 			virt_hdr_mismatch = 1;
1892 			if (dst_na->mfs < na->mfs) {
1893 				/* We may need to do segmentation offloadings, and so
1894 				 * we may need a number of destination slots greater
1895 				 * than the number of input slots ('needed').
1896 				 * We look for the smallest integer 'x' which satisfies:
1897 				 *	needed * na->mfs + x * H <= x * dst_na->mfs
1898 				 * where 'H' is the length of the longest header that may
1899 				 * be replicated in the segmentation process (e.g. for
1900 				 * TCPv4 we must account for ethernet header, IP header
1901 				 * and TCPv4 header).
1902 				 */
1903 				needed = (needed * na->mfs) /
1904 						(dst_na->mfs - WORST_CASE_GSO_HEADER) + 1;
1905 				ND(3, "srcmtu=%u, dstmtu=%u, x=%u", na->mfs, dst_na->mfs, needed);
1906 			}
1907 		}
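		/*
		 * Worked example (illustrative numbers; H stands for
		 * WORST_CASE_GSO_HEADER, assumed to be 114 here): with
		 * needed = 8, na->mfs = 9000 and dst_na->mfs = 1514,
		 * the smallest x satisfying
		 *	8 * 9000 + x * 114 <= x * 1514
		 * is ceil(72000 / 1400) = 52, and the expression above
		 * indeed computes 72000 / 1400 + 1 = 52.
		 */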
1908 
1909 		ND(5, "pass 2 dst %d is %x %s",
1910 			i, d_i, is_vp ? "virtual" : "nic/host");
1911 		dst_nr = d_i & (NM_BDG_MAXRINGS-1);
1912 		nrings = dst_na->up.num_rx_rings;
1913 		if (dst_nr >= nrings)
1914 			dst_nr = dst_nr % nrings;
1915 		kring = &dst_na->up.rx_rings[dst_nr];
1916 		ring = kring->ring;
1917 		lim = kring->nkr_num_slots - 1;
1918 
1919 retry:
1920 
1921 		if (dst_na->retry && retry) {
1922 			/* try to get some free slot from the previous run */
1923 			kring->nm_notify(kring, 0);
1924 			/* actually useful only for bwraps, since there
1925 			 * the notify will trigger a txsync on the hwna. VALE ports
1926 			 * have dst_na->retry == 0
1927 			 */
1928 		}
1929 		/* reserve the buffers in the queue and an entry
1930 		 * to report completion, then drop the lock. XXX this
1931 		 * might become a helper function (sketched below).
1932 		 */
1933 		mtx_lock(&kring->q_lock);
1934 		if (kring->nkr_stopped) {
1935 			mtx_unlock(&kring->q_lock);
1936 			goto cleanup;
1937 		}
1938 		my_start = j = kring->nkr_hwlease;
1939 		howmany = nm_kr_space(kring, 1);
1940 		if (needed < howmany)
1941 			howmany = needed;
1942 		lease_idx = nm_kr_lease(kring, howmany, 1);
1943 		mtx_unlock(&kring->q_lock);
1944 
1945 		/* only retry if we need more than available slots */
1946 		if (retry && needed <= howmany)
1947 			retry = 0;
1948 
1949 		/* copy to the destination queue */
1950 		while (howmany > 0) {
1951 			struct netmap_slot *slot;
1952 			struct nm_bdg_fwd *ft_p, *ft_end;
1953 			u_int cnt;
1954 
1955 			/* find the queue from which we pick next packet.
1956 			 * NM_FT_NULL is always higher than valid indexes
1957 			 * so we never dereference it if the other list
1958 			 * has packets (and if both are empty we never
1959 			 * get here).
1960 			 */
1961 			if (next < brd_next) {
1962 				ft_p = ft + next;
1963 				next = ft_p->ft_next;
1964 			} else { /* insert broadcast */
1965 				ft_p = ft + brd_next;
1966 				brd_next = ft_p->ft_next;
1967 			}
1968 			cnt = ft_p->ft_frags; // cnt > 0
1969 			if (unlikely(cnt > howmany))
1970 			    break; /* no more space */
1971 			if (netmap_verbose && cnt > 1)
1972 				RD(5, "rx %d frags to %d", cnt, j);
1973 			ft_end = ft_p + cnt;
1974 			if (unlikely(virt_hdr_mismatch)) {
1975 				bdg_mismatch_datapath(na, dst_na, ft_p, ring, &j, lim, &howmany);
1976 			} else {
1977 				howmany -= cnt;
1978 				do {
1979 					char *dst, *src = ft_p->ft_buf;
1980 					size_t copy_len = ft_p->ft_len, dst_len = copy_len;
1981 
1982 					slot = &ring->slot[j];
1983 					dst = NMB(&dst_na->up, slot);
1984 
1985 					ND("send [%d] %d(%d) bytes at %s:%d",
1986 							i, (int)copy_len, (int)dst_len,
1987 							NM_IFPNAME(dst_ifp), j);
1988 					/* round to a multiple of 64 */
1989 					copy_len = (copy_len + 63) & ~63;
1990 
1991 					if (unlikely(copy_len > NETMAP_BUF_SIZE(&dst_na->up) ||
1992 						     copy_len > NETMAP_BUF_SIZE(&na->up))) {
1993 						RD(5, "invalid len %d, down to 64", (int)copy_len);
1994 						copy_len = dst_len = 64; // XXX
1995 					}
1996 					if (ft_p->ft_flags & NS_INDIRECT) {
1997 						if (copyin(src, dst, copy_len)) {
1998 							// invalid user pointer, pretend len is 0
1999 							dst_len = 0;
2000 						}
2001 					} else {
2002 						//memcpy(dst, src, copy_len);
2003 						pkt_copy(src, dst, (int)copy_len);
2004 					}
2005 					slot->len = dst_len;
2006 					slot->flags = (cnt << 8) | NS_MOREFRAG;
2007 					j = nm_next(j, lim);
2008 					needed--;
2009 					ft_p++;
2010 				} while (ft_p != ft_end);
2011 				slot->flags = (cnt << 8); /* clear flag on last entry */
2012 			}
2013 			/* are we done ? */
2014 			if (next == NM_FT_NULL && brd_next == NM_FT_NULL)
2015 				break;
2016 		}
2017 		{
2018 		    /* current position */
2019 		    uint32_t *p = kring->nkr_leases; /* shorthand */
2020 		    uint32_t update_pos;
2021 		    int still_locked = 1;
2022 
2023 		    mtx_lock(&kring->q_lock);
2024 		    if (unlikely(howmany > 0)) {
2025 			/* we have not used all the buffers: if we are the
2026 			 * last lease holder we can recover the slots,
2027 			 * otherwise we must zero them to mark empty packets.
2028 			 */
2029 			ND("leftover %d bufs", howmany);
2030 			if (nm_next(lease_idx, lim) == kring->nkr_lease_idx) {
2031 			    /* yes i am the last one */
2032 			    ND("roll back nkr_hwlease to %d", j);
2033 			    kring->nkr_hwlease = j;
2034 			} else {
2035 			    while (howmany-- > 0) {
2036 				ring->slot[j].len = 0;
2037 				ring->slot[j].flags = 0;
2038 				j = nm_next(j, lim);
2039 			    }
2040 			}
2041 		    }
2042 		    p[lease_idx] = j; /* report I am done */
2043 
2044 		    update_pos = kring->nr_hwtail;
2045 
2046 		    if (my_start == update_pos) {
2047 			/* all slots before my_start have been reported,
2048 			 * so scan subsequent leases to see if other ranges
2049 			 * have been completed, and do a selwakeup or txsync.
2050 			 */
2051 			while (lease_idx != kring->nkr_lease_idx &&
2052 				p[lease_idx] != NR_NOSLOT) {
2053 			    j = p[lease_idx];
2054 			    p[lease_idx] = NR_NOSLOT;
2055 			    lease_idx = nm_next(lease_idx, lim);
2056 			}
2057 			/* j is the new 'write' position. j != my_start
2058 			 * means there are new buffers to report
2059 			 */
2060 			if (likely(j != my_start)) {
2061 				kring->nr_hwtail = j;
2062 				still_locked = 0;
2063 				mtx_unlock(&kring->q_lock);
2064 				kring->nm_notify(kring, 0);
2065 				/* this is netmap_notify for VALE ports and
2066 				 * netmap_bwrap_notify for bwrap. The latter will
2067 				 * trigger a txsync on the underlying hwna
2068 				 */
2069 				if (dst_na->retry && retry--) {
2070 					/* XXX this is going to call nm_notify again.
2071 					 * Only useful for bwrap in virtual machines
2072 					 */
2073 					goto retry;
2074 				}
2075 			}
2076 		    }
2077 		    if (still_locked)
2078 			mtx_unlock(&kring->q_lock);
2079 		}
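		/*
		 * Worked example of the lease protocol above (assumed
		 * numbers): thread A leases slots 0..9 (lease 0) and
		 * thread B slots 10..19 (lease 1). If B finishes first,
		 * it can only record p[1] = 20, since its my_start (10)
		 * differs from nr_hwtail (0). When A later finishes, it
		 * stores p[0] = 10, sees my_start == nr_hwtail == 0,
		 * scans both leases and advances nr_hwtail to 20 in a
		 * single step.
		 */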
2080 cleanup:
2081 		d->bq_head = d->bq_tail = NM_FT_NULL; /* cleanup */
2082 		d->bq_len = 0;
2083 	}
2084 	brddst->bq_head = brddst->bq_tail = NM_FT_NULL; /* cleanup */
2085 	brddst->bq_len = 0;
2086 	return 0;
2087 }
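
/*
 * A minimal sketch of the helper function suggested by the XXX in
 * nm_bdg_flush() above: reserve up to 'needed' slots on 'kring' under
 * q_lock and return the start position and the lease index. The name
 * and the exact interface are assumptions, not part of netmap.
 */
#if 0
static int
nm_kr_reserve(struct netmap_kring *kring, u_int needed,
	uint32_t *start, uint32_t *lease_idx, u_int *howmany)
{
	mtx_lock(&kring->q_lock);
	if (kring->nkr_stopped) {
		mtx_unlock(&kring->q_lock);
		return ENXIO;
	}
	*start = kring->nkr_hwlease;		/* first leased slot */
	*howmany = nm_kr_space(kring, 1);	/* slots we may lease */
	if (needed < *howmany)
		*howmany = needed;
	*lease_idx = nm_kr_lease(kring, *howmany, 1);
	mtx_unlock(&kring->q_lock);
	return 0;
}
#endif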
2088 
2089 /* nm_txsync callback for VALE ports */
2090 static int
2091 netmap_vp_txsync(struct netmap_kring *kring, int flags)
2092 {
2093 	struct netmap_vp_adapter *na =
2094 		(struct netmap_vp_adapter *)kring->na;
2095 	u_int done;
2096 	u_int const lim = kring->nkr_num_slots - 1;
2097 	u_int const head = kring->rhead;
2098 
2099 	if (bridge_batch <= 0) { /* testing only */
2100 		done = head; // used all
2101 		goto done;
2102 	}
2103 	if (!na->na_bdg) {
2104 		done = head;
2105 		goto done;
2106 	}
2107 	if (bridge_batch > NM_BDG_BATCH)
2108 		bridge_batch = NM_BDG_BATCH;
2109 
2110 	done = nm_bdg_preflush(kring, head);
2111 done:
2112 	if (done != head)
2113 		D("early break at %d/ %d, tail %d", done, head, kring->nr_hwtail);
2114 	/*
2115 	 * packets between 'done' and 'head' are left unsent.
2116 	 */
2117 	kring->nr_hwcur = done;
2118 	kring->nr_hwtail = nm_prev(done, lim);
2119 	if (netmap_verbose)
2120 		D("%s ring %d flags %d", na->up.name, kring->ring_id, flags);
2121 	return 0;
2122 }
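
/*
 * Worked example for the completion logic above (illustrative
 * numbers): with a 1024-slot ring (lim = 1023), if the flush consumed
 * everything up to head = 512, then nr_hwcur becomes 512 and
 * nr_hwtail becomes nm_prev(512, 1023) = 511, so the producer sees
 * the whole ring (minus the one reserved slot) as free again.
 */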
2123 
2124 
2125 /* rxsync code used by the VALE ports' nm_rxsync callback and also
2126  * internally by the bwrap
2127  */
2128 static int
2129 netmap_vp_rxsync_locked(struct netmap_kring *kring, int flags)
2130 {
2131 	struct netmap_adapter *na = kring->na;
2132 	struct netmap_ring *ring = kring->ring;
2133 	u_int nm_i, lim = kring->nkr_num_slots - 1;
2134 	u_int head = kring->rhead;
2135 	int n;
2136 
2137 	if (head > lim) {
2138 		D("ouch dangerous reset!!!");
2139 		n = netmap_ring_reinit(kring);
2140 		goto done;
2141 	}
2142 
2143 	/* First part, import newly received packets. */
2144 	/* actually nothing to do here, they are already in the kring */
2145 
2146 	/* Second part, skip past packets that userspace has released. */
2147 	nm_i = kring->nr_hwcur;
2148 	if (nm_i != head) {
2149 		/* consistency check, but nothing really important here */
2150 		for (n = 0; likely(nm_i != head); n++) {
2151 			struct netmap_slot *slot = &ring->slot[nm_i];
2152 			void *addr = NMB(na, slot);
2153 
2154 			if (addr == NETMAP_BUF_BASE(kring->na)) { /* bad buf */
2155 				D("bad buffer index %d, ignore ?",
2156 					slot->buf_idx);
2157 			}
2158 			slot->flags &= ~NS_BUF_CHANGED;
2159 			nm_i = nm_next(nm_i, lim);
2160 		}
2161 		kring->nr_hwcur = head;
2162 	}
2163 
2164 	n = 0;
2165 done:
2166 	return n;
2167 }
2168 
2169 /*
2170  * nm_rxsync callback for VALE ports
2171  * user process reading from a VALE switch.
2172  * Already protected against concurrent calls from userspace,
2173  * but we must acquire the queue's lock to protect against
2174  * writers on the same queue.
2175  */
2176 static int
2177 netmap_vp_rxsync(struct netmap_kring *kring, int flags)
2178 {
2179 	int n;
2180 
2181 	mtx_lock(&kring->q_lock);
2182 	n = netmap_vp_rxsync_locked(kring, flags);
2183 	mtx_unlock(&kring->q_lock);
2184 	return n;
2185 }
2186 
2187 
2188 /* nm_bdg_attach callback for VALE ports
2189  * The na_vp port is this same netmap_adapter. There is no host port.
2190  */
2191 static int
2192 netmap_vp_bdg_attach(const char *name, struct netmap_adapter *na)
2193 {
2194 	struct netmap_vp_adapter *vpna = (struct netmap_vp_adapter *)na;
2195 
2196 	if (vpna->na_bdg)
2197 		return EBUSY;
2198 	na->na_vp = vpna;
2199 	strncpy(na->name, name, sizeof(na->name));
2200 	na->na_hostvp = NULL;
2201 	return 0;
2202 }
2203 
2204 /* create a netmap_vp_adapter that describes a VALE port.
2205  * Only persistent VALE ports have a non-null ifp.
2206  */
2207 static int
2208 netmap_vp_create(struct nmreq *nmr, struct ifnet *ifp,
2209 		struct netmap_mem_d *nmd,
2210 		struct netmap_vp_adapter **ret)
2211 {
2212 	struct netmap_vp_adapter *vpna;
2213 	struct netmap_adapter *na;
2214 	int error = 0;
2215 	u_int npipes = 0;
2216 
2217 	vpna = nm_os_malloc(sizeof(*vpna));
2218 	if (vpna == NULL)
2219 		return ENOMEM;
2220 
2221 	na = &vpna->up;
2222 
2223 	na->ifp = ifp;
2224 	strncpy(na->name, nmr->nr_name, sizeof(na->name));
2225 
2226 	/* bounds checking */
2227 	na->num_tx_rings = nmr->nr_tx_rings;
2228 	nm_bound_var(&na->num_tx_rings, 1, 1, NM_BDG_MAXRINGS, NULL);
2229 	nmr->nr_tx_rings = na->num_tx_rings; // write back
2230 	na->num_rx_rings = nmr->nr_rx_rings;
2231 	nm_bound_var(&na->num_rx_rings, 1, 1, NM_BDG_MAXRINGS, NULL);
2232 	nmr->nr_rx_rings = na->num_rx_rings; // write back
2233 	nm_bound_var(&nmr->nr_tx_slots, NM_BRIDGE_RINGSIZE,
2234 			1, NM_BDG_MAXSLOTS, NULL);
2235 	na->num_tx_desc = nmr->nr_tx_slots;
2236 	nm_bound_var(&nmr->nr_rx_slots, NM_BRIDGE_RINGSIZE,
2237 			1, NM_BDG_MAXSLOTS, NULL);
2238 	/* validate the number of pipes. We want at least 1,
2239 	 * but can probably do with some more.
2240 	 * So let's use 2 as the default (when 0 is supplied).
2241 	 */
2242 	npipes = nmr->nr_arg1;
2243 	nm_bound_var(&npipes, 2, 1, NM_MAXPIPES, NULL);
2244 	nmr->nr_arg1 = npipes;	/* write back */
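	/*
	 * For example (assuming the usual nm_bound_var() semantics,
	 * which is defined elsewhere): nr_arg1 == 0 is below the
	 * minimum of 1 and is bumped to the default of 2, while a
	 * value above NM_MAXPIPES would be clamped to NM_MAXPIPES.
	 */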
2245 	/* validate extra bufs */
2246 	nm_bound_var(&nmr->nr_arg3, 0, 0,
2247 			128*NM_BDG_MAXSLOTS, NULL);
2248 	na->num_rx_desc = nmr->nr_rx_slots;
2249 	vpna->mfs = 1514;
2250 	vpna->last_smac = ~0llu;
2251 	/*if (vpna->mfs > netmap_buf_size)  TODO netmap_buf_size is zero??
2252 		vpna->mfs = netmap_buf_size; */
2253 	if (netmap_verbose)
2254 		D("max frame size %u", vpna->mfs);
2255 
2256 	na->na_flags |= NAF_BDG_MAYSLEEP;
2257 	/* persistent VALE ports look like hw devices
2258 	 * with a native netmap adapter
2259 	 */
2260 	if (ifp)
2261 		na->na_flags |= NAF_NATIVE;
2262 	na->nm_txsync = netmap_vp_txsync;
2263 	na->nm_rxsync = netmap_vp_rxsync;
2264 	na->nm_register = netmap_vp_reg;
2265 	na->nm_krings_create = netmap_vp_krings_create;
2266 	na->nm_krings_delete = netmap_vp_krings_delete;
2267 	na->nm_dtor = netmap_vp_dtor;
2268 	D("nr_arg2 %d", nmr->nr_arg2);
2269 	na->nm_mem = nmd ?
2270 		netmap_mem_get(nmd):
2271 		netmap_mem_private_new(
2272 			na->num_tx_rings, na->num_tx_desc,
2273 			na->num_rx_rings, na->num_rx_desc,
2274 			nmr->nr_arg3, npipes, &error);
2275 	if (na->nm_mem == NULL)
2276 		goto err;
2277 	na->nm_bdg_attach = netmap_vp_bdg_attach;
2278 	/* other nmd fields are set in the common routine */
2279 	error = netmap_attach_common(na);
2280 	if (error)
2281 		goto err;
2282 	*ret = vpna;
2283 	return 0;
2284 
2285 err:
2286 	if (na->nm_mem != NULL)
2287 		netmap_mem_put(na->nm_mem);
2288 	nm_os_free(vpna);
2289 	return error;
2290 }
2291 
2292 /* Bridge wrapper code (bwrap).
2293  * This is used to connect a non-VALE-port netmap_adapter (hwna) to a
2294  * VALE switch.
2295  * The main task is to swap the meaning of tx and rx rings to match the
2296  * expectations of the VALE switch code (see nm_bdg_flush).
2297  *
2298  * The bwrap works by interposing a netmap_bwrap_adapter between the
2299  * rest of the system and the hwna. The netmap_bwrap_adapter looks like
2300  * a netmap_vp_adapter to the rest of the system, but, internally, it
2301  * translates all callbacks to what the hwna expects.
2302  *
2303  * Note that we have to intercept callbacks coming from two sides:
2304  *
2305  *  - callbacks coming from the netmap module are intercepted by
2306  *    passing around the netmap_bwrap_adapter instead of the hwna
2307  *
2308  *  - callbacks coming from outside of the netmap module only know
2309  *    about the hwna. This, however, only happens in interrupt
2310  *    handlers, where only the hwna->nm_notify callback is called.
2311  *    What the bwrap does is to overwrite the hwna->nm_notify callback
2312  *    with its own netmap_bwrap_intr_notify.
2313  *    XXX This assumes that the hwna->nm_notify callback was the
2314  *    standard netmap_notify(), as it is the case for nic adapters.
2315  *    Any additional action performed by hwna->nm_notify will not be
2316  *    performed by netmap_bwrap_intr_notify.
2317  *
2318  * Additionally, the bwrap can optionally attach the host rings pair
2319  * of the wrapped adapter to a different port of the switch.
2320  */
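/*
 * Schematic of the arrangement described above (a sketch, not code):
 *
 *	VALE switch
 *	    |			(sees a netmap_vp_adapter)
 *	netmap_bwrap_adapter
 *	    |			(tx/rx meanings swapped)
 *	  hwna <----> NIC rings (+ optional host rings pair)
 */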
2321 
2322 
2323 static void
2324 netmap_bwrap_dtor(struct netmap_adapter *na)
2325 {
2326 	struct netmap_bwrap_adapter *bna = (struct netmap_bwrap_adapter*)na;
2327 	struct netmap_adapter *hwna = bna->hwna;
2328 	struct nm_bridge *b = bna->up.na_bdg,
2329 		*bh = bna->host.na_bdg;
2330 
2331 	netmap_mem_put(bna->host.up.nm_mem);
2332 
2333 	if (b) {
2334 		netmap_bdg_detach_common(b, bna->up.bdg_port,
2335 			    (bh ? bna->host.bdg_port : -1));
2336 	}
2337 
2338 	ND("na %p", na);
2339 	na->ifp = NULL;
2340 	bna->host.up.ifp = NULL;
2341 	hwna->na_private = NULL;
2342 	hwna->na_vp = hwna->na_hostvp = NULL;
2343 	hwna->na_flags &= ~NAF_BUSY;
2344 	netmap_adapter_put(hwna);
2345 
2346 }
2347 
2348 
2349 /*
2350  * Intr callback for NICs connected to a bridge.
2351  * Simply ignore tx interrupts (maybe we could try to recover space ?)
2352  * and pass received packets from nic to the bridge.
2353  *
2354  * XXX TODO check locking: this is called from the interrupt
2355  * handler so we should make sure that the interface is not
2356  * disconnected while passing down an interrupt.
2357  *
2358  * Note, no user process can access this NIC or the host stack.
2359  * The only significant parts of the ring are the slots,
2360  * and head/cur/tail are set from the kring as needed
2361  * (part as a receive ring, part as a transmit ring).
2362  *
2363  * callback that overwrites the hwna notify callback.
2364  * Packets come from the outside or from the host stack and are put on an
2365  * hwna rx ring.
2366  * The bridge wrapper then sends the packets through the bridge.
2367  */
2368 static int
2369 netmap_bwrap_intr_notify(struct netmap_kring *kring, int flags)
2370 {
2371 	struct netmap_adapter *na = kring->na;
2372 	struct netmap_bwrap_adapter *bna = na->na_private;
2373 	struct netmap_kring *bkring;
2374 	struct netmap_vp_adapter *vpna = &bna->up;
2375 	u_int ring_nr = kring->ring_id;
2376 	int ret = NM_IRQ_COMPLETED;
2377 	int error;
2378 
2379 	if (netmap_verbose)
2380 	    D("%s %s 0x%x", na->name, kring->name, flags);
2381 
2382 	bkring = &vpna->up.tx_rings[ring_nr];
2383 
2384 	/* make sure the ring is not disabled */
2385 	if (nm_kr_tryget(kring, 0 /* can't sleep */, NULL)) {
2386 		return EIO;
2387 	}
2388 
2389 	if (netmap_verbose)
2390 	    D("%s head %d cur %d tail %d",  na->name,
2391 		kring->rhead, kring->rcur, kring->rtail);
2392 
2393 	/* simulate a user wakeup on the rx ring
2394 	 * to fetch packets that have arrived.
2395 	 */
2396 	error = kring->nm_sync(kring, 0);
2397 	if (error)
2398 		goto put_out;
2399 	if (kring->nr_hwcur == kring->nr_hwtail) {
2400 		if (netmap_verbose)
2401 			D("how strange, interrupt with no packets on %s",
2402 			    na->name);
2403 		goto put_out;
2404 	}
2405 
2406 	/* new packets are kring->rcur to kring->nr_hwtail, and the bkring
2407 	 * had hwcur == bkring->rhead. So advance bkring->rhead to kring->nr_hwtail
2408 	 * to push all packets out.
2409 	 */
2410 	bkring->rhead = bkring->rcur = kring->nr_hwtail;
2411 
2412 	netmap_vp_txsync(bkring, flags);
2413 
2414 	/* mark all buffers as released on this ring */
2415 	kring->rhead = kring->rcur = kring->rtail = kring->nr_hwtail;
2416 	/* another call to actually release the buffers */
2417 	error = kring->nm_sync(kring, 0);
2418 
2419 	/* The second rxsync may have further advanced hwtail. If this happens,
2420 	 *  return NM_IRQ_RESCHED, otherwise just return NM_IRQ_COMPLETED. */
2421 	if (kring->rcur != kring->nr_hwtail) {
2422 		ret = NM_IRQ_RESCHED;
2423 	}
2424 put_out:
2425 	nm_kr_put(kring);
2426 
2427 	return error ? error : ret;
2428 }
2429 
2430 
2431 /* nm_register callback for bwrap */
2432 static int
2433 netmap_bwrap_reg(struct netmap_adapter *na, int onoff)
2434 {
2435 	struct netmap_bwrap_adapter *bna =
2436 		(struct netmap_bwrap_adapter *)na;
2437 	struct netmap_adapter *hwna = bna->hwna;
2438 	struct netmap_vp_adapter *hostna = &bna->host;
2439 	int error, i;
2440 	enum txrx t;
2441 
2442 	ND("%s %s", na->name, onoff ? "on" : "off");
2443 
2444 	if (onoff) {
2445 		/* netmap_do_regif has been called on the bwrap na.
2446 		 * We need to pass the information about the
2447 		 * memory allocator down to the hwna before
2448 		 * putting it in netmap mode
2449 		 */
2450 		hwna->na_lut = na->na_lut;
2451 
2452 		if (hostna->na_bdg) {
2453 			/* if the host rings have been attached to switch,
2454 			 * we need to copy the memory allocator information
2455 			 * in the hostna also
2456 			 */
2457 			hostna->up.na_lut = na->na_lut;
2458 		}
2459 
2460 		/* cross-link the netmap rings
2461 		 * The original number of rings comes from hwna,
2462 		 * rx rings on one side equals tx rings on the other.
2463 		 */
2464 		for_rx_tx(t) {
2465 			enum txrx r = nm_txrx_swap(t); /* swap NR_TX <-> NR_RX */
2466 			for (i = 0; i < nma_get_nrings(hwna, r) + 1; i++) {
2467 				NMR(hwna, r)[i].ring = NMR(na, t)[i].ring;
2468 			}
2469 		}
2470 
2471 		if (na->na_flags & NAF_HOST_RINGS) {
2472 			struct netmap_adapter *hna = &hostna->up;
2473 			/* the hostna rings are the host rings of the bwrap.
2474 			 * The corresponding krings must point back to the
2475 			 * hostna
2476 			 */
2477 			hna->tx_rings = &na->tx_rings[na->num_tx_rings];
2478 			hna->tx_rings[0].na = hna;
2479 			hna->rx_rings = &na->rx_rings[na->num_rx_rings];
2480 			hna->rx_rings[0].na = hna;
2481 		}
2482 	}
2483 
2484 	/* pass down the pending ring state information */
2485 	for_rx_tx(t) {
2486 		for (i = 0; i < nma_get_nrings(na, t) + 1; i++)
2487 			NMR(hwna, t)[i].nr_pending_mode =
2488 				NMR(na, t)[i].nr_pending_mode;
2489 	}
2490 
2491 	/* forward the request to the hwna */
2492 	error = hwna->nm_register(hwna, onoff);
2493 	if (error)
2494 		return error;
2495 
2496 	/* copy up the current ring state information */
2497 	for_rx_tx(t) {
2498 		for (i = 0; i < nma_get_nrings(na, t) + 1; i++)
2499 			NMR(na, t)[i].nr_mode =
2500 				NMR(hwna, t)[i].nr_mode;
2501 	}
2502 
2503 	/* impersonate a netmap_vp_adapter */
2504 	netmap_vp_reg(na, onoff);
2505 	if (hostna->na_bdg)
2506 		netmap_vp_reg(&hostna->up, onoff);
2507 
2508 	if (onoff) {
2509 		u_int i;
2510 		/* intercept the hwna nm_notify callback on the hw rings */
2511 		for (i = 0; i < hwna->num_rx_rings; i++) {
2512 			hwna->rx_rings[i].save_notify = hwna->rx_rings[i].nm_notify;
2513 			hwna->rx_rings[i].nm_notify = netmap_bwrap_intr_notify;
2514 		}
2515 		i = hwna->num_rx_rings; /* for safety */
2516 		/* save the host ring notify unconditionally */
2517 		hwna->rx_rings[i].save_notify = hwna->rx_rings[i].nm_notify;
2518 		if (hostna->na_bdg) {
2519 			/* also intercept the host ring notify */
2520 			hwna->rx_rings[i].nm_notify = netmap_bwrap_intr_notify;
2521 		}
2522 		if (na->active_fds == 0)
2523 			na->na_flags |= NAF_NETMAP_ON;
2524 	} else {
2525 		u_int i;
2526 
2527 		if (na->active_fds == 0)
2528 			na->na_flags &= ~NAF_NETMAP_ON;
2529 
2530 		/* reset all notify callbacks (including host ring) */
2531 		for (i = 0; i <= hwna->num_rx_rings; i++) {
2532 			hwna->rx_rings[i].nm_notify = hwna->rx_rings[i].save_notify;
2533 			hwna->rx_rings[i].save_notify = NULL;
2534 		}
2535 		hwna->na_lut.lut = NULL;
2536 		hwna->na_lut.objtotal = 0;
2537 		hwna->na_lut.objsize = 0;
2538 	}
2539 
2540 	return 0;
2541 }
2542 
2543 /* nm_config callback for bwrap */
2544 static int
2545 netmap_bwrap_config(struct netmap_adapter *na, u_int *txr, u_int *txd,
2546 				    u_int *rxr, u_int *rxd)
2547 {
2548 	struct netmap_bwrap_adapter *bna =
2549 		(struct netmap_bwrap_adapter *)na;
2550 	struct netmap_adapter *hwna = bna->hwna;
2551 
2552 	/* forward the request */
2553 	netmap_update_config(hwna);
2554 	/* swap the results */
2555 	*txr = hwna->num_rx_rings;
2556 	*txd = hwna->num_rx_desc;
2557 	*rxr = hwna->num_tx_rings;
2558 	*rxd = hwna->num_tx_desc;
2559 
2560 	return 0;
2561 }
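
/*
 * For example (illustrative numbers): a NIC with 4 tx and 2 rx rings
 * is reported through the bwrap as a port with 2 tx and 4 rx rings,
 * so that frames received by the NIC can be transmitted into the
 * switch and vice versa.
 */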
2562 
2563 
2564 /* nm_krings_create callback for bwrap */
2565 static int
2566 netmap_bwrap_krings_create(struct netmap_adapter *na)
2567 {
2568 	struct netmap_bwrap_adapter *bna =
2569 		(struct netmap_bwrap_adapter *)na;
2570 	struct netmap_adapter *hwna = bna->hwna;
2571 	int i, error = 0;
2572 	enum txrx t;
2573 
2574 	ND("%s", na->name);
2575 
2576 	/* impersonate a netmap_vp_adapter */
2577 	error = netmap_vp_krings_create(na);
2578 	if (error)
2579 		return error;
2580 
2581 	/* also create the hwna krings */
2582 	error = hwna->nm_krings_create(hwna);
2583 	if (error) {
2584 		goto err_del_vp_rings;
2585 	}
2586 
2587 	/* get each ring slot number from the corresponding hwna ring */
2588 	for_rx_tx(t) {
2589 		enum txrx r = nm_txrx_swap(t); /* swap NR_TX <-> NR_RX */
2590 		for (i = 0; i < nma_get_nrings(hwna, r) + 1; i++) {
2591 			NMR(na, t)[i].nkr_num_slots = NMR(hwna, r)[i].nkr_num_slots;
2592 		}
2593 	}
2594 
2595 	return 0;
2596 
2597 err_del_vp_rings:
2598 	netmap_vp_krings_delete(na);
2599 
2600 	return error;
2601 }
2602 
2603 
2604 static void
2605 netmap_bwrap_krings_delete(struct netmap_adapter *na)
2606 {
2607 	struct netmap_bwrap_adapter *bna =
2608 		(struct netmap_bwrap_adapter *)na;
2609 	struct netmap_adapter *hwna = bna->hwna;
2610 
2611 	ND("%s", na->name);
2612 
2613 	hwna->nm_krings_delete(hwna);
2614 	netmap_vp_krings_delete(na);
2615 }
2616 
2617 
2618 /* notify method for the bridge-->hwna direction */
2619 static int
2620 netmap_bwrap_notify(struct netmap_kring *kring, int flags)
2621 {
2622 	struct netmap_adapter *na = kring->na;
2623 	struct netmap_bwrap_adapter *bna = na->na_private;
2624 	struct netmap_adapter *hwna = bna->hwna;
2625 	u_int ring_n = kring->ring_id;
2626 	u_int lim = kring->nkr_num_slots - 1;
2627 	struct netmap_kring *hw_kring;
2628 	int error;
2629 
2630 	ND("%s: na %s hwna %s",
2631 			(kring ? kring->name : "NULL!"),
2632 			(na ? na->name : "NULL!"),
2633 			(hwna ? hwna->name : "NULL!"));
2634 	hw_kring = &hwna->tx_rings[ring_n];
2635 
2636 	if (nm_kr_tryget(hw_kring, 0, NULL)) {
2637 		return ENXIO;
2638 	}
2639 
2640 	/* first step: simulate a user wakeup on the rx ring */
2641 	netmap_vp_rxsync(kring, flags);
2642 	ND("%s[%d] PRE rx(c%3d t%3d l%3d) ring(h%3d c%3d t%3d) tx(c%3d ht%3d t%3d)",
2643 		na->name, ring_n,
2644 		kring->nr_hwcur, kring->nr_hwtail, kring->nkr_hwlease,
2645 		ring->head, ring->cur, ring->tail,
2646 		hw_kring->nr_hwcur, hw_kring->nr_hwtail, hw_kring->rtail);
2647 	/* second step: the new packets are sent on the tx ring
2648 	 * (which is actually the same ring)
2649 	 */
2650 	hw_kring->rhead = hw_kring->rcur = kring->nr_hwtail;
2651 	error = hw_kring->nm_sync(hw_kring, flags);
2652 	if (error)
2653 		goto put_out;
2654 
2655 	/* third step: now we are back on the rx ring */
2656 	/* claim ownership on all hw owned bufs */
2657 	kring->rhead = kring->rcur = nm_next(hw_kring->nr_hwtail, lim); /* skip past reserved slot */
2658 
2659 	/* fourth step: the user goes to sleep again, causing another rxsync */
2660 	netmap_vp_rxsync(kring, flags);
2661 	ND("%s[%d] PST rx(c%3d t%3d l%3d) ring(h%3d c%3d t%3d) tx(c%3d ht%3d t%3d)",
2662 		na->name, ring_n,
2663 		kring->nr_hwcur, kring->nr_hwtail, kring->nkr_hwlease,
2664 		ring->head, ring->cur, ring->tail,
2665 		hw_kring->nr_hwcur, hw_kring->nr_hwtail, hw_kring->rtail);
2666 put_out:
2667 	nm_kr_put(hw_kring);
2668 
2669 	return error ? error : NM_IRQ_COMPLETED;
2670 }
2671 
2672 
2673 /* nm_bdg_ctl callback for the bwrap.
2674  * Called on bridge-attach and detach, as an effect of vale-ctl -[ahd].
2675  * On attach, it needs to provide a fake netmap_priv_d structure and
2676  * perform a netmap_do_regif() on the bwrap. This will put both the
2677  * bwrap and the hwna in netmap mode, with the netmap rings shared
2678  * and cross-linked. Moreover, it will start intercepting interrupts
2679  * directed to hwna.
2680  */
2681 static int
2682 netmap_bwrap_bdg_ctl(struct netmap_adapter *na, struct nmreq *nmr, int attach)
2683 {
2684 	struct netmap_priv_d *npriv;
2685 	struct netmap_bwrap_adapter *bna = (struct netmap_bwrap_adapter*)na;
2686 	int error = 0;
2687 
2688 	if (attach) {
2689 		if (NETMAP_OWNED_BY_ANY(na)) {
2690 			return EBUSY;
2691 		}
2692 		if (bna->na_kpriv) {
2693 			/* nothing to do */
2694 			return 0;
2695 		}
2696 		npriv = netmap_priv_new();
2697 		if (npriv == NULL)
2698 			return ENOMEM;
2699 		npriv->np_ifp = na->ifp; /* let the priv destructor release the ref */
2700 		error = netmap_do_regif(npriv, na, 0, NR_REG_NIC_SW);
2701 		if (error) {
2702 			netmap_priv_delete(npriv);
2703 			return error;
2704 		}
2705 		bna->na_kpriv = npriv;
2706 		na->na_flags |= NAF_BUSY;
2707 	} else {
2708 		if (na->active_fds == 0) /* not registered */
2709 			return EINVAL;
2710 		netmap_priv_delete(bna->na_kpriv);
2711 		bna->na_kpriv = NULL;
2712 		na->na_flags &= ~NAF_BUSY;
2713 	}
2714 	return error;
2715 
2716 }
2717 
2718 /* attach a bridge wrapper to the 'real' device */
2719 int
2720 netmap_bwrap_attach(const char *nr_name, struct netmap_adapter *hwna)
2721 {
2722 	struct netmap_bwrap_adapter *bna;
2723 	struct netmap_adapter *na = NULL;
2724 	struct netmap_adapter *hostna = NULL;
2725 	int error = 0;
2726 	enum txrx t;
2727 
2728 	/* make sure the NIC is not already in use */
2729 	if (NETMAP_OWNED_BY_ANY(hwna)) {
2730 		D("NIC %s busy, cannot attach to bridge", hwna->name);
2731 		return EBUSY;
2732 	}
2733 
2734 	bna = nm_os_malloc(sizeof(*bna));
2735 	if (bna == NULL) {
2736 		return ENOMEM;
2737 	}
2738 
2739 	na = &bna->up.up;
2740 	/* make bwrap ifp point to the real ifp */
2741 	na->ifp = hwna->ifp;
2742 	if_ref(na->ifp);
2743 	na->na_private = bna;
2744 	strncpy(na->name, nr_name, sizeof(na->name));
2745 	/* fill the ring data for the bwrap adapter with rx/tx meanings
2746 	 * swapped. The real cross-linking will be done during register,
2747 	 * when all the krings will have been created.
2748 	 */
2749 	for_rx_tx(t) {
2750 		enum txrx r = nm_txrx_swap(t); /* swap NR_TX <-> NR_RX */
2751 		nma_set_nrings(na, t, nma_get_nrings(hwna, r));
2752 		nma_set_ndesc(na, t, nma_get_ndesc(hwna, r));
2753 	}
2754 	na->nm_dtor = netmap_bwrap_dtor;
2755 	na->nm_register = netmap_bwrap_reg;
2756 	// na->nm_txsync = netmap_bwrap_txsync;
2757 	// na->nm_rxsync = netmap_bwrap_rxsync;
2758 	na->nm_config = netmap_bwrap_config;
2759 	na->nm_krings_create = netmap_bwrap_krings_create;
2760 	na->nm_krings_delete = netmap_bwrap_krings_delete;
2761 	na->nm_notify = netmap_bwrap_notify;
2762 	na->nm_bdg_ctl = netmap_bwrap_bdg_ctl;
2763 	na->pdev = hwna->pdev;
2764 	na->nm_mem = netmap_mem_get(hwna->nm_mem);
2765 	na->virt_hdr_len = hwna->virt_hdr_len;
2766 	bna->up.retry = 1; /* XXX maybe this should depend on the hwna */
2767 
2768 	bna->hwna = hwna;
2769 	netmap_adapter_get(hwna);
2770 	hwna->na_private = bna; /* weak reference */
2771 	hwna->na_vp = &bna->up;
2772 
2773 	if (hwna->na_flags & NAF_HOST_RINGS) {
2774 		if (hwna->na_flags & NAF_SW_ONLY)
2775 			na->na_flags |= NAF_SW_ONLY;
2776 		na->na_flags |= NAF_HOST_RINGS;
2777 		hostna = &bna->host.up;
2778 		snprintf(hostna->name, sizeof(hostna->name), "%s^", nr_name);
2779 		hostna->ifp = hwna->ifp;
2780 		for_rx_tx(t) {
2781 			enum txrx r = nm_txrx_swap(t);
2782 			nma_set_nrings(hostna, t, 1);
2783 			nma_set_ndesc(hostna, t, nma_get_ndesc(hwna, r));
2784 		}
2785 		// hostna->nm_txsync = netmap_bwrap_host_txsync;
2786 		// hostna->nm_rxsync = netmap_bwrap_host_rxsync;
2787 		hostna->nm_notify = netmap_bwrap_notify;
2788 		hostna->nm_mem = netmap_mem_get(na->nm_mem);
2789 		hostna->na_private = bna;
2790 		hostna->na_vp = &bna->up;
2791 		na->na_hostvp = hwna->na_hostvp =
2792 			hostna->na_hostvp = &bna->host;
2793 		hostna->na_flags = NAF_BUSY; /* prevent NIOCREGIF */
2794 	}
2795 
2796 	ND("%s<->%s txr %d txd %d rxr %d rxd %d",
2797 		na->name, ifp->if_xname,
2798 		na->num_tx_rings, na->num_tx_desc,
2799 		na->num_rx_rings, na->num_rx_desc);
2800 
2801 	error = netmap_attach_common(na);
2802 	if (error) {
2803 		goto err_free;
2804 	}
2805 	hwna->na_flags |= NAF_BUSY;
2806 	return 0;
2807 
2808 err_free:
2809 	hwna->na_vp = hwna->na_hostvp = NULL;
2810 	netmap_adapter_put(hwna);
2811 	nm_os_free(bna);
2812 	return error;
2813 
2814 }
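/*
 * Typical call path (a rough sketch, for orientation): `vale-ctl -a
 * vale0:em0` issues a NETMAP_BDG_ATTACH request; the request handler
 * resolves 'em0' to its hardware adapter and, under NMG_LOCK(), ends
 * up calling netmap_bwrap_attach("vale0:em0", hwna), after which the
 * new bwrap port can be attached to the bridge.
 */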
2815 
2816 struct nm_bridge *
2817 netmap_init_bridges2(u_int n)
2818 {
2819 	int i;
2820 	struct nm_bridge *b;
2821 
2822 	b = nm_os_malloc(sizeof(struct nm_bridge) * n);
2823 	if (b == NULL)
2824 		return NULL;
2825 	for (i = 0; i < n; i++)
2826 		BDG_RWINIT(&b[i]);
2827 	return b;
2828 }
2829 
2830 void
2831 netmap_uninit_bridges2(struct nm_bridge *b, u_int n)
2832 {
2833 	int i;
2834 
2835 	if (b == NULL)
2836 		return;
2837 
2838 	for (i = 0; i < n; i++)
2839 		BDG_RWDESTROY(&b[i]);
2840 	nm_os_free(b);
2841 }
2842 
2843 int
2844 netmap_init_bridges(void)
2845 {
2846 #ifdef CONFIG_NET_NS
2847 	return netmap_bns_register();
2848 #else
2849 	nm_bridges = netmap_init_bridges2(NM_BRIDGES);
2850 	if (nm_bridges == NULL)
2851 		return ENOMEM;
2852 	return 0;
2853 #endif
2854 }
2855 
2856 void
2857 netmap_uninit_bridges(void)
2858 {
2859 #ifdef CONFIG_NET_NS
2860 	netmap_bns_unregister();
2861 #else
2862 	netmap_uninit_bridges2(nm_bridges, NM_BRIDGES);
2863 #endif
2864 }
2865 #endif /* WITH_VALE */
2866