xref: /freebsd/sys/dev/netmap/netmap_vale.c (revision 56e53cb8ef000c3ef72337a4095987a932cdedef)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (C) 2013-2016 Universita` di Pisa
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *   1. Redistributions of source code must retain the above copyright
11  *      notice, this list of conditions and the following disclaimer.
12  *   2. Redistributions in binary form must reproduce the above copyright
13  *      notice, this list of conditions and the following disclaimer in the
14  *      documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26  * SUCH DAMAGE.
27  */
28 
29 
30 /*
31  * This module implements the VALE switch for netmap
32 
33 --- VALE SWITCH ---
34 
35 NMG_LOCK() serializes all modifications to switches and ports.
36 A switch cannot be deleted until all ports are gone.
37 
38 For each switch, an SX lock (RWlock on linux) protects
39  * deletion of ports. When configuring or deleting a port, the
40 lock is acquired in exclusive mode (after holding NMG_LOCK).
41 When forwarding, the lock is acquired in shared mode (without NMG_LOCK).
42 The lock is held throughout the entire forwarding cycle,
43 during which the thread may incur a page fault.
44 Hence it is important that sleepable shared locks are used.
45 
46 On the rx ring, the per-port lock is grabbed initially to reserve
47 a number of slots in the ring, then the lock is released,
48 packets are copied from source to destination, and then
49 the lock is acquired again and the receive ring is updated.
50 (A similar thing is done on the tx ring for NIC and host stack
51 ports attached to the switch)
52 
53  */
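/*
 * A minimal sketch (illustrative only, not compiled) of the
 * forwarding-side protocol described above; the names mirror the real
 * routines defined later in this file (nm_bdg_preflush(), nm_kr_lease(),
 * nm_bdg_flush()).
 */
#if 0
	BDG_RLOCK(b);				/* shared, sleepable, whole cycle */
	mtx_lock(&kring->q_lock);
	lease_idx = nm_kr_lease(kring, n, 1);	/* reserve n rx slots */
	mtx_unlock(&kring->q_lock);
	/* ... copy packets source -> destination; may incur a page fault ... */
	mtx_lock(&kring->q_lock);
	/* complete the lease and advance nr_hwtail (see nm_bdg_flush()) */
	mtx_unlock(&kring->q_lock);
	BDG_RUNLOCK(b);
#endif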
54 
55 /*
56  * OS-specific code that is used only within this file.
57  * Other OS-specific code that must be accessed by drivers
58  * is present in netmap_kern.h
59  */
60 
61 #if defined(__FreeBSD__)
62 #include <sys/cdefs.h> /* prerequisite */
63 __FBSDID("$FreeBSD$");
64 
65 #include <sys/types.h>
66 #include <sys/errno.h>
67 #include <sys/param.h>	/* defines used in kernel.h */
68 #include <sys/kernel.h>	/* types used in module initialization */
69 #include <sys/conf.h>	/* cdevsw struct, UID, GID */
70 #include <sys/sockio.h>
71 #include <sys/socketvar.h>	/* struct socket */
72 #include <sys/malloc.h>
73 #include <sys/poll.h>
74 #include <sys/rwlock.h>
75 #include <sys/socket.h> /* sockaddrs */
76 #include <sys/selinfo.h>
77 #include <sys/sysctl.h>
78 #include <net/if.h>
79 #include <net/if_var.h>
80 #include <net/bpf.h>		/* BIOCIMMEDIATE */
81 #include <machine/bus.h>	/* bus_dmamap_* */
82 #include <sys/endian.h>
83 #include <sys/refcount.h>
84 
85 
86 #define BDG_RWLOCK_T		struct rwlock
87 
88 #define	BDG_RWINIT(b)		\
89 	rw_init_flags(&(b)->bdg_lock, "bdg lock", RW_NOWITNESS)
90 #define BDG_WLOCK(b)		rw_wlock(&(b)->bdg_lock)
91 #define BDG_WUNLOCK(b)		rw_wunlock(&(b)->bdg_lock)
92 #define BDG_RLOCK(b)		rw_rlock(&(b)->bdg_lock)
93 #define BDG_RTRYLOCK(b)		rw_try_rlock(&(b)->bdg_lock)
94 #define BDG_RUNLOCK(b)		rw_runlock(&(b)->bdg_lock)
95 #define BDG_RWDESTROY(b)	rw_destroy(&(b)->bdg_lock)
96 
97 
98 #elif defined(linux)
99 
100 #include "bsd_glue.h"
101 
102 #elif defined(__APPLE__)
103 
104 #warning OSX support is only partial
105 #include "osx_glue.h"
106 
107 #elif defined(_WIN32)
108 #include "win_glue.h"
109 
110 #else
111 
112 #error	Unsupported platform
113 
114 #endif /* unsupported */
115 
116 /*
117  * common headers
118  */
119 
120 #include <net/netmap.h>
121 #include <dev/netmap/netmap_kern.h>
122 #include <dev/netmap/netmap_mem2.h>
123 
124 #ifdef WITH_VALE
125 
126 /*
127  * system parameters (most of them in netmap_kern.h)
128  * NM_BDG_NAME	prefix for switch port names, default "vale"
129  * NM_BDG_MAXPORTS	number of ports
130  * NM_BRIDGES	max number of switches in the system.
131  *	XXX should become a sysctl or tunable
132  *
133  * Switch ports are named valeX:Y where X is the switch name and Y
134  * is the port. If Y matches a physical interface name, the port is
135  * connected to a physical device.
136  *
137  * Unlike physical interfaces, switch ports use their own memory region
138  * for rings and buffers.
139  * The virtual interfaces use per-queue locks instead of the core lock.
140  * In the tx loop, we aggregate traffic in batches to make all operations
141  * faster. The batch size is bridge_batch.
142  */
143 #define NM_BDG_MAXRINGS		16	/* XXX unclear how many. */
144 #define NM_BDG_MAXSLOTS		4096	/* XXX same as above */
145 #define NM_BRIDGE_RINGSIZE	1024	/* in the device */
146 #define NM_BDG_HASH		1024	/* forwarding table entries */
147 #define NM_BDG_BATCH		1024	/* entries in the forwarding buffer */
148 #define NM_MULTISEG		64	/* max size of a chain of bufs */
149 /* actual size of the tables */
150 #define NM_BDG_BATCH_MAX	(NM_BDG_BATCH + NM_MULTISEG)
151 /* NM_FT_NULL terminates a list of slots in the ft */
152 #define NM_FT_NULL		NM_BDG_BATCH_MAX
153 
154 
155 /*
156  * bridge_batch is set via sysctl to the max batch size to be
157  * used in the bridge. The actual value may be larger as the
158  * last packet in the block may overflow the size.
159  */
160 static int bridge_batch = NM_BDG_BATCH; /* bridge batch size */
161 SYSBEGIN(vars_vale);
162 SYSCTL_DECL(_dev_netmap);
163 SYSCTL_INT(_dev_netmap, OID_AUTO, bridge_batch, CTLFLAG_RW, &bridge_batch, 0 , "");
164 SYSEND;
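/*
 * Example (assuming the standard FreeBSD sysctl tree): the batch size
 * can be tuned at runtime with e.g. "sysctl dev.netmap.bridge_batch=512".
 */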
165 
166 static int netmap_vp_create(struct nmreq *, struct ifnet *,
167 		struct netmap_mem_d *nmd, struct netmap_vp_adapter **);
168 static int netmap_vp_reg(struct netmap_adapter *na, int onoff);
169 static int netmap_bwrap_reg(struct netmap_adapter *, int onoff);
170 
171 /*
172  * For each output interface, nm_bdg_q is used to construct a list.
173  * bq_len is the number of output buffers (we can have coalescing
174  * during the copy).
175  */
176 struct nm_bdg_q {
177 	uint16_t bq_head;
178 	uint16_t bq_tail;
179 	uint32_t bq_len;	/* number of buffers */
180 };
181 
182 /* XXX revise this */
183 struct nm_hash_ent {
184 	uint64_t	mac;	/* the top 2 bytes are the epoch */
185 	uint64_t	ports;
186 };
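/*
 * Illustrative sketch only (these helpers are hypothetical and not part
 * of this file): how a 48-bit MAC and the 16-bit epoch could be packed
 * into the 64-bit 'mac' field above.
 */
#if 0
static inline uint64_t
nm_hent_pack(uint64_t mac48, uint16_t epoch)
{
	return (mac48 & 0xffffffffffffULL) | ((uint64_t)epoch << 48);
}

static inline uint16_t
nm_hent_epoch(uint64_t mac)
{
	return (uint16_t)(mac >> 48);
}
#endif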
187 
188 /*
189  * nm_bridge is a descriptor for a VALE switch.
190  * Interfaces for a bridge are all in bdg_ports[].
191  * The array has a fixed size; an empty entry does not terminate
192  * the search, but lookups only occur on attach/detach so we
193  * don't mind if they are slow.
194  *
195  * The bridge is non-blocking on the transmit ports: excess
196  * packets are dropped if there is no room on the output port.
197  *
198  * bdg_lock protects accesses to the bdg_ports array.
199  * This is a rw lock (or equivalent).
200  */
201 struct nm_bridge {
202 	/* XXX what is the proper alignment/layout ? */
203 	BDG_RWLOCK_T	bdg_lock;	/* protects bdg_ports */
204 	int		bdg_namelen;
205 	uint32_t	bdg_active_ports; /* 0 means free */
206 	char		bdg_basename[IFNAMSIZ];
207 
208 	/* Indexes of active ports (up to active_ports)
209 	 * and all other remaining ports.
210 	 */
211 	uint8_t		bdg_port_index[NM_BDG_MAXPORTS];
212 
213 	struct netmap_vp_adapter *bdg_ports[NM_BDG_MAXPORTS];
214 
215 
216 	/*
217 	 * The function to decide the destination port.
218 	 * It returns either the index of the destination port,
219 	 * NM_BDG_BROADCAST to broadcast this packet, or NM_BDG_NOPORT not to
220 	 * forward this packet.  ring_nr is the source ring index, and the
221 	 * function may overwrite this value to forward this packet to a
222 	 * different ring index.
223 	 * This function must be set by netmap_bdg_ctl().
224 	 */
225 	struct netmap_bdg_ops bdg_ops;
226 
227 	/* the forwarding table, MAC+ports.
228 	 * XXX should be changed to an argument to be passed to
229 	 * the lookup function, and allocated on attach
230 	 */
231 	struct nm_hash_ent ht[NM_BDG_HASH];
232 
233 #ifdef CONFIG_NET_NS
234 	struct net *ns;
235 #endif /* CONFIG_NET_NS */
236 };
237 
238 const char*
239 netmap_bdg_name(struct netmap_vp_adapter *vp)
240 {
241 	struct nm_bridge *b = vp->na_bdg;
242 	if (b == NULL)
243 		return NULL;
244 	return b->bdg_basename;
245 }
246 
247 
248 #ifndef CONFIG_NET_NS
249 /*
250  * XXX in principle nm_bridges could be created dynamically.
251  * Right now we have a static array and deletions are protected
252  * by an exclusive lock.
253  */
254 static struct nm_bridge *nm_bridges;
255 #endif /* !CONFIG_NET_NS */
256 
257 
258 /*
259  * this is a slightly optimized copy routine which rounds
260  * to a multiple of 64 bytes and is often faster than dealing
261  * with other odd sizes. We assume there is enough room
262  * in the source and destination buffers.
263  *
264  * XXX only for multiples of 64 bytes, non-overlapping.
265  */
266 static inline void
267 pkt_copy(void *_src, void *_dst, int l)
268 {
269         uint64_t *src = _src;
270         uint64_t *dst = _dst;
271         if (unlikely(l >= 1024)) {
272                 memcpy(dst, src, l);
273                 return;
274         }
275         for (; likely(l > 0); l-=64) {
276                 *dst++ = *src++;
277                 *dst++ = *src++;
278                 *dst++ = *src++;
279                 *dst++ = *src++;
280                 *dst++ = *src++;
281                 *dst++ = *src++;
282                 *dst++ = *src++;
283                 *dst++ = *src++;
284         }
285 }
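/*
 * Example use (a sketch; 'na', 'src_slot' and 'dst_slot' are assumed
 * names): callers may pass the exact packet length as long as both
 * buffers are padded to a 64-byte boundary, because the loop above
 * copies in 64-byte chunks and may overshoot 'l'.
 */
#if 0
	char *src = NMB(na, src_slot), *dst = NMB(na, dst_slot);

	/* fine: netmap buffers (2 KB by default) absorb the overshoot */
	pkt_copy(src, dst, src_slot->len);
#endif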
286 
287 
288 static int
289 nm_is_id_char(const char c)
290 {
291 	return (c >= 'a' && c <= 'z') ||
292 	       (c >= 'A' && c <= 'Z') ||
293 	       (c >= '0' && c <= '9') ||
294 	       (c == '_');
295 }
296 
297 /* Validate the name of a VALE bridge port and return the
298  * position of the ":" character. */
299 static int
300 nm_vale_name_validate(const char *name)
301 {
302 	int colon_pos = -1;
303 	int i;
304 
305 	if (!name || strlen(name) < strlen(NM_BDG_NAME)) {
306 		return -1;
307 	}
308 
309 	for (i = 0; name[i]; i++) {
310 		if (name[i] == ':') {
311 			if (colon_pos != -1) {
312 				return -1;
313 			}
314 			colon_pos = i;
315 		} else if (!nm_is_id_char(name[i])) {
316 			return -1;
317 		}
318 	}
319 
320 	if (i >= IFNAMSIZ) {
321 		return -1;
322 	}
323 
324 	return colon_pos;
325 }
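/*
 * Examples: nm_vale_name_validate("vale0:1") returns 5 (the bridge
 * name is "vale0"); "vale0" (no ':'), "vale0:a:b" (two ':') and any
 * name containing characters outside [A-Za-z0-9_] all return -1.
 */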
326 
327 /*
328  * locate a bridge among the existing ones.
329  * MUST BE CALLED WITH NMG_LOCK()
330  *
331  * a ':' in the name terminates the bridge name. Otherwise, just NM_NAME.
332  * We assume that this is called with a name of at least NM_NAME chars.
333  */
334 static struct nm_bridge *
335 nm_find_bridge(const char *name, int create)
336 {
337 	int i, namelen;
338 	struct nm_bridge *b = NULL, *bridges;
339 	u_int num_bridges;
340 
341 	NMG_LOCK_ASSERT();
342 
343 	netmap_bns_getbridges(&bridges, &num_bridges);
344 
345 	namelen = nm_vale_name_validate(name);
346 	if (namelen < 0) {
347 		D("invalid bridge name %s", name ? name : NULL);
348 		return NULL;
349 	}
350 
351 	/* lookup the name, remember empty slot if there is one */
352 	for (i = 0; i < num_bridges; i++) {
353 		struct nm_bridge *x = bridges + i;
354 
355 		if (x->bdg_active_ports == 0) {
356 			if (create && b == NULL)
357 				b = x;	/* record empty slot */
358 		} else if (x->bdg_namelen != namelen) {
359 			continue;
360 		} else if (strncmp(name, x->bdg_basename, namelen) == 0) {
361 			ND("found '%.*s' at %d", namelen, name, i);
362 			b = x;
363 			break;
364 		}
365 	}
366 	if (i == num_bridges && b) { /* name not found, can create entry */
367 		/* initialize the bridge */
368 		strncpy(b->bdg_basename, name, namelen);
369 		ND("create new bridge %s with ports %d", b->bdg_basename,
370 			b->bdg_active_ports);
371 		b->bdg_namelen = namelen;
372 		b->bdg_active_ports = 0;
373 		for (i = 0; i < NM_BDG_MAXPORTS; i++)
374 			b->bdg_port_index[i] = i;
375 		/* set the default function */
376 		b->bdg_ops.lookup = netmap_bdg_learning;
377 		/* reset the MAC address table */
378 		bzero(b->ht, sizeof(struct nm_hash_ent) * NM_BDG_HASH);
379 		NM_BNS_GET(b);
380 	}
381 	return b;
382 }
383 
384 
385 /*
386  * Free the forwarding tables for rings attached to switch ports.
387  */
388 static void
389 nm_free_bdgfwd(struct netmap_adapter *na)
390 {
391 	int nrings, i;
392 	struct netmap_kring *kring;
393 
394 	NMG_LOCK_ASSERT();
395 	nrings = na->num_tx_rings;
396 	kring = na->tx_rings;
397 	for (i = 0; i < nrings; i++) {
398 		if (kring[i].nkr_ft) {
399 			nm_os_free(kring[i].nkr_ft);
400 			kring[i].nkr_ft = NULL; /* protect from freeing twice */
401 		}
402 	}
403 }
404 
405 
406 /*
407  * Allocate the forwarding tables for the rings attached to the bridge ports.
408  */
409 static int
410 nm_alloc_bdgfwd(struct netmap_adapter *na)
411 {
412 	int nrings, l, i, num_dstq;
413 	struct netmap_kring *kring;
414 
415 	NMG_LOCK_ASSERT();
416 	/* all port:rings + broadcast */
417 	num_dstq = NM_BDG_MAXPORTS * NM_BDG_MAXRINGS + 1;
418 	l = sizeof(struct nm_bdg_fwd) * NM_BDG_BATCH_MAX;
419 	l += sizeof(struct nm_bdg_q) * num_dstq;
420 	l += sizeof(uint16_t) * NM_BDG_BATCH_MAX;
421 
422 	nrings = netmap_real_rings(na, NR_TX);
423 	kring = na->tx_rings;
424 	for (i = 0; i < nrings; i++) {
425 		struct nm_bdg_fwd *ft;
426 		struct nm_bdg_q *dstq;
427 		int j;
428 
429 		ft = nm_os_malloc(l);
430 		if (!ft) {
431 			nm_free_bdgfwd(na);
432 			return ENOMEM;
433 		}
434 		dstq = (struct nm_bdg_q *)(ft + NM_BDG_BATCH_MAX);
435 		for (j = 0; j < num_dstq; j++) {
436 			dstq[j].bq_head = dstq[j].bq_tail = NM_FT_NULL;
437 			dstq[j].bq_len = 0;
438 		}
439 		kring[i].nkr_ft = ft;
440 	}
441 	return 0;
442 }
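/*
 * Layout of the scratch area allocated above for each tx ring (a single
 * nm_os_malloc() of size 'l'):
 *
 *	ft[NM_BDG_BATCH_MAX]				the batch (struct nm_bdg_fwd)
 *	dstq[NM_BDG_MAXPORTS * NM_BDG_MAXRINGS + 1]	per-destination lists
 *	dsts[NM_BDG_BATCH_MAX]				active destination indexes
 *
 * nm_bdg_flush() below recovers the dstq and dsts pointers with the
 * same pointer arithmetic.
 */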
443 
444 
445 /* remove from bridge b the ports in slots hw and sw
446  * (sw can be -1 if not needed)
447  */
448 static void
449 netmap_bdg_detach_common(struct nm_bridge *b, int hw, int sw)
450 {
451 	int s_hw = hw, s_sw = sw;
452 	int i, lim = b->bdg_active_ports;
453 	uint8_t tmp[NM_BDG_MAXPORTS];
454 
455 	/*
456 	New algorithm:
457 	make a copy of bdg_port_index;
458 	lookup NA(ifp)->bdg_port and SWNA(ifp)->bdg_port
459 	in the array of bdg_port_index, replacing them with
460 	entries from the bottom of the array;
461 	decrement bdg_active_ports;
462 	acquire BDG_WLOCK() and copy back the array.
463 	 */
464 
465 	if (netmap_verbose)
466 		D("detach %d and %d (lim %d)", hw, sw, lim);
467 	/* make a copy of the list of active ports, update it,
468 	 * and then copy back within BDG_WLOCK().
469 	 */
470 	memcpy(tmp, b->bdg_port_index, sizeof(tmp));
471 	for (i = 0; (hw >= 0 || sw >= 0) && i < lim; ) {
472 		if (hw >= 0 && tmp[i] == hw) {
473 			ND("detach hw %d at %d", hw, i);
474 			lim--; /* point to last active port */
475 			tmp[i] = tmp[lim]; /* swap with i */
476 			tmp[lim] = hw;	/* now this is inactive */
477 			hw = -1;
478 		} else if (sw >= 0 && tmp[i] == sw) {
479 			ND("detach sw %d at %d", sw, i);
480 			lim--;
481 			tmp[i] = tmp[lim];
482 			tmp[lim] = sw;
483 			sw = -1;
484 		} else {
485 			i++;
486 		}
487 	}
488 	if (hw >= 0 || sw >= 0) {
489 		D("XXX delete failed hw %d sw %d, should panic...", hw, sw);
490 	}
491 
492 	BDG_WLOCK(b);
493 	if (b->bdg_ops.dtor)
494 		b->bdg_ops.dtor(b->bdg_ports[s_hw]);
495 	b->bdg_ports[s_hw] = NULL;
496 	if (s_sw >= 0) {
497 		b->bdg_ports[s_sw] = NULL;
498 	}
499 	memcpy(b->bdg_port_index, tmp, sizeof(tmp));
500 	b->bdg_active_ports = lim;
501 	BDG_WUNLOCK(b);
502 
503 	ND("now %d active ports", lim);
504 	if (lim == 0) {
505 		ND("marking bridge %s as free", b->bdg_basename);
506 		bzero(&b->bdg_ops, sizeof(b->bdg_ops));
507 		NM_BNS_PUT(b);
508 	}
509 }
510 
511 /* nm_bdg_ctl callback for VALE ports */
512 static int
513 netmap_vp_bdg_ctl(struct netmap_adapter *na, struct nmreq *nmr, int attach)
514 {
515 	struct netmap_vp_adapter *vpna = (struct netmap_vp_adapter *)na;
516 	struct nm_bridge *b = vpna->na_bdg;
517 
518 	(void)nmr;	// XXX merge ?
519 	if (attach)
520 		return 0; /* nothing to do */
521 	if (b) {
522 		netmap_set_all_rings(na, 0 /* disable */);
523 		netmap_bdg_detach_common(b, vpna->bdg_port, -1);
524 		vpna->na_bdg = NULL;
525 		netmap_set_all_rings(na, 1 /* enable */);
526 	}
527 	/* we took the reference just for the attach */
528 	netmap_adapter_put(na);
529 	return 0;
530 }
531 
532 /* nm_dtor callback for ephemeral VALE ports */
533 static void
534 netmap_vp_dtor(struct netmap_adapter *na)
535 {
536 	struct netmap_vp_adapter *vpna = (struct netmap_vp_adapter*)na;
537 	struct nm_bridge *b = vpna->na_bdg;
538 
539 	ND("%s has %d references", na->name, na->na_refcount);
540 
541 	if (b) {
542 		netmap_bdg_detach_common(b, vpna->bdg_port, -1);
543 	}
544 
545 	if (vpna->autodelete && na->ifp != NULL) {
546 		ND("releasing %s", na->ifp->if_xname);
547 		NMG_UNLOCK();
548 		nm_os_vi_detach(na->ifp);
549 		NMG_LOCK();
550 	}
551 }
552 
553 /* remove a persistent VALE port from the system */
554 static int
555 nm_vi_destroy(const char *name)
556 {
557 	struct ifnet *ifp;
558 	struct netmap_vp_adapter *vpna;
559 	int error;
560 
561 	ifp = ifunit_ref(name);
562 	if (!ifp)
563 		return ENXIO;
564 	NMG_LOCK();
565 	/* make sure this is actually a VALE port */
566 	if (!NM_NA_VALID(ifp) || NA(ifp)->nm_register != netmap_vp_reg) {
567 		error = EINVAL;
568 		goto err;
569 	}
570 
571 	vpna = (struct netmap_vp_adapter *)NA(ifp);
572 
573 	/* we can only destroy ports that were created via NETMAP_BDG_NEWIF */
574 	if (vpna->autodelete) {
575 		error = EINVAL;
576 		goto err;
577 	}
578 
579 	/* also make sure that nobody is using the interface */
580 	if (NETMAP_OWNED_BY_ANY(&vpna->up) ||
581 	    vpna->up.na_refcount > 1 /* any ref besides the one in nm_vi_create()? */) {
582 		error = EBUSY;
583 		goto err;
584 	}
585 
586 	NMG_UNLOCK();
587 
588 	D("destroying a persistent vale interface %s", ifp->if_xname);
589 	/* Linux requires all the references are released
590 	 * before unregister
591 	 */
592 	netmap_detach(ifp);
593 	if_rele(ifp);
594 	nm_os_vi_detach(ifp);
595 	return 0;
596 
597 err:
598 	NMG_UNLOCK();
599 	if_rele(ifp);
600 	return error;
601 }
602 
603 static int
604 nm_update_info(struct nmreq *nmr, struct netmap_adapter *na)
605 {
606 	nmr->nr_rx_rings = na->num_rx_rings;
607 	nmr->nr_tx_rings = na->num_tx_rings;
608 	nmr->nr_rx_slots = na->num_rx_desc;
609 	nmr->nr_tx_slots = na->num_tx_desc;
610 	return netmap_mem_get_info(na->nm_mem, &nmr->nr_memsize, NULL, &nmr->nr_arg2);
611 }
612 
613 /*
614  * Create a virtual interface registered to the system.
615  * The interface will be attached to a bridge later.
616  */
617 int
618 netmap_vi_create(struct nmreq *nmr, int autodelete)
619 {
620 	struct ifnet *ifp;
621 	struct netmap_vp_adapter *vpna;
622 	struct netmap_mem_d *nmd = NULL;
623 	int error;
624 
625 	/* don't include VALE prefix */
626 	if (!strncmp(nmr->nr_name, NM_BDG_NAME, strlen(NM_BDG_NAME)))
627 		return EINVAL;
628 	ifp = ifunit_ref(nmr->nr_name);
629 	if (ifp) { /* already exists, cannot create a new one */
630 		error = EEXIST;
631 		NMG_LOCK();
632 		if (NM_NA_VALID(ifp)) {
633 			int update_err = nm_update_info(nmr, NA(ifp));
634 			if (update_err)
635 				error = update_err;
636 		}
637 		NMG_UNLOCK();
638 		if_rele(ifp);
639 		return error;
640 	}
641 	error = nm_os_vi_persist(nmr->nr_name, &ifp);
642 	if (error)
643 		return error;
644 
645 	NMG_LOCK();
646 	if (nmr->nr_arg2) {
647 		nmd = netmap_mem_find(nmr->nr_arg2);
648 		if (nmd == NULL) {
649 			error = EINVAL;
650 			goto err_1;
651 		}
652 	}
653 	/* netmap_vp_create creates a struct netmap_vp_adapter */
654 	error = netmap_vp_create(nmr, ifp, nmd, &vpna);
655 	if (error) {
656 		D("error %d", error);
657 		goto err_1;
658 	}
659 	/* persist-specific routines */
660 	vpna->up.nm_bdg_ctl = netmap_vp_bdg_ctl;
661 	if (!autodelete) {
662 		netmap_adapter_get(&vpna->up);
663 	} else {
664 		vpna->autodelete = 1;
665 	}
666 	NM_ATTACH_NA(ifp, &vpna->up);
667 	/* return the updated info */
668 	error = nm_update_info(nmr, &vpna->up);
669 	if (error) {
670 		goto err_2;
671 	}
672 	D("returning nr_arg2 %d", nmr->nr_arg2);
673 	if (nmd)
674 		netmap_mem_put(nmd);
675 	NMG_UNLOCK();
676 	D("created %s", ifp->if_xname);
677 	return 0;
678 
679 err_2:
680 	netmap_detach(ifp);
681 err_1:
682 	if (nmd)
683 		netmap_mem_put(nmd);
684 	NMG_UNLOCK();
685 	nm_os_vi_detach(ifp);
686 
687 	return error;
688 }
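/*
 * A hedged userspace sketch (declarations and error handling omitted,
 * "myport" is a made-up name): persistent VALE ports are created by
 * issuing NIOCREGIF with nr_cmd = NETMAP_BDG_NEWIF, which lands in
 * netmap_vi_create() above. See also vale(4).
 */
#if 0
	struct nmreq nmr;
	int fd = open("/dev/netmap", O_RDWR);

	bzero(&nmr, sizeof(nmr));
	nmr.nr_version = NETMAP_API;
	nmr.nr_cmd = NETMAP_BDG_NEWIF;
	strncpy(nmr.nr_name, "myport", sizeof(nmr.nr_name) - 1);
	ioctl(fd, NIOCREGIF, &nmr);	/* then attach it to a switch */
#endif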
689 
690 /* Try to get a reference to a netmap adapter attached to a VALE switch.
691  * If the adapter is found (or is created), this function returns 0, a
692  * non-NULL pointer is returned into *na, and the caller holds a
693  * reference to the adapter.
694  * If an adapter is not found, then no reference is grabbed and the
695  * function returns an error code, or 0 if there is just a VALE prefix
696  * mismatch. Therefore the caller holds a reference when
697  * (*na != NULL && return == 0).
698  */
699 int
700 netmap_get_bdg_na(struct nmreq *nmr, struct netmap_adapter **na,
701 		struct netmap_mem_d *nmd, int create)
702 {
703 	char *nr_name = nmr->nr_name;
704 	const char *ifname;
705 	struct ifnet *ifp = NULL;
706 	int error = 0;
707 	struct netmap_vp_adapter *vpna, *hostna = NULL;
708 	struct nm_bridge *b;
709 	int i, j, cand = -1, cand2 = -1;
710 	int needed;
711 
712 	*na = NULL;     /* default return value */
713 
714 	/* first try to see if this is a bridge port. */
715 	NMG_LOCK_ASSERT();
716 	if (strncmp(nr_name, NM_BDG_NAME, sizeof(NM_BDG_NAME) - 1)) {
717 		return 0;  /* no error, but no VALE prefix */
718 	}
719 
720 	b = nm_find_bridge(nr_name, create);
721 	if (b == NULL) {
722 		D("no bridges available for '%s'", nr_name);
723 		return (create ? ENOMEM : ENXIO);
724 	}
725 	if (strlen(nr_name) < b->bdg_namelen) /* impossible */
726 		panic("x");
727 
728 	/* Now we are sure that name starts with the bridge's name,
729 	 * lookup the port in the bridge. We need to scan the entire
730 	 * list. It is not important to hold a WLOCK on the bridge
731 	 * during the search because NMG_LOCK already guarantees
732 	 * that there are no other possible writers.
733 	 */
734 
735 	/* lookup in the local list of ports */
736 	for (j = 0; j < b->bdg_active_ports; j++) {
737 		i = b->bdg_port_index[j];
738 		vpna = b->bdg_ports[i];
739 		// KASSERT(na != NULL);
740 		ND("checking %s", vpna->up.name);
741 		if (!strcmp(vpna->up.name, nr_name)) {
742 			netmap_adapter_get(&vpna->up);
743 			ND("found existing if %s refs %d", nr_name)
744 			*na = &vpna->up;
745 			return 0;
746 		}
747 	}
748 	/* not found, should we create it? */
749 	if (!create)
750 		return ENXIO;
751 	/* yes we should, see if we have space to attach entries */
752 	needed = 2; /* in some cases we only need 1 */
753 	if (b->bdg_active_ports + needed >= NM_BDG_MAXPORTS) {
754 		D("bridge full %d, cannot create new port", b->bdg_active_ports);
755 		return ENOMEM;
756 	}
757 	/* record the next two ports available, but do not allocate yet */
758 	cand = b->bdg_port_index[b->bdg_active_ports];
759 	cand2 = b->bdg_port_index[b->bdg_active_ports + 1];
760 	ND("+++ bridge %s port %s used %d avail %d %d",
761 		b->bdg_basename, nr_name, b->bdg_active_ports, cand, cand2);
762 
763 	/*
764 	 * try to see if there is a matching NIC with this name
765 	 * (after the bridge's name)
766 	 */
767 	ifname = nr_name + b->bdg_namelen + 1;
768 	ifp = ifunit_ref(ifname);
769 	if (!ifp) {
770 		/* Create an ephemeral virtual port
771 		 * This block contains all the ephemeral-specific logic.
772 		 */
773 		if (nmr->nr_cmd) {
774 			/* nr_cmd must be 0 for a virtual port */
775 			error = EINVAL;
776 			goto out;
777 		}
778 
779 		/* bdg_netmap_attach creates a struct netmap_adapter */
780 		error = netmap_vp_create(nmr, NULL, nmd, &vpna);
781 		if (error) {
782 			D("error %d", error);
783 			goto out;
784 		}
785 		/* shortcut - we can skip get_hw_na(),
786 		 * ownership check and nm_bdg_attach()
787 		 */
788 	} else {
789 		struct netmap_adapter *hw;
790 
791 		error = netmap_get_hw_na(ifp, nmd, &hw);
792 		if (error || hw == NULL)
793 			goto out;
794 
795 		/* host adapter might not be created */
796 		error = hw->nm_bdg_attach(nr_name, hw);
797 		if (error)
798 			goto out;
799 		vpna = hw->na_vp;
800 		hostna = hw->na_hostvp;
801 		if (nmr->nr_arg1 != NETMAP_BDG_HOST)
802 			hostna = NULL;
803 	}
804 
805 	BDG_WLOCK(b);
806 	vpna->bdg_port = cand;
807 	ND("NIC  %p to bridge port %d", vpna, cand);
808 	/* bind the port to the bridge (virtual ports are not active) */
809 	b->bdg_ports[cand] = vpna;
810 	vpna->na_bdg = b;
811 	b->bdg_active_ports++;
812 	if (hostna != NULL) {
813 		/* also bind the host stack to the bridge */
814 		b->bdg_ports[cand2] = hostna;
815 		hostna->bdg_port = cand2;
816 		hostna->na_bdg = b;
817 		b->bdg_active_ports++;
818 		ND("host %p to bridge port %d", hostna, cand2);
819 	}
820 	ND("if %s refs %d", ifname, vpna->up.na_refcount);
821 	BDG_WUNLOCK(b);
822 	*na = &vpna->up;
823 	netmap_adapter_get(*na);
824 
825 out:
826 	if (ifp)
827 		if_rele(ifp);
828 
829 	return error;
830 }
831 
832 
833 /* Process NETMAP_BDG_ATTACH */
834 static int
835 nm_bdg_ctl_attach(struct nmreq *nmr)
836 {
837 	struct netmap_adapter *na;
838 	struct netmap_mem_d *nmd = NULL;
839 	int error;
840 
841 	NMG_LOCK();
842 
843 	if (nmr->nr_arg2) {
844 		nmd = netmap_mem_find(nmr->nr_arg2);
845 		if (nmd == NULL) {
846 			error = EINVAL;
847 			goto unlock_exit;
848 		}
849 	}
850 
851 	error = netmap_get_bdg_na(nmr, &na, nmd, 1 /* create if not exists */);
852 	if (error) /* no device */
853 		goto unlock_exit;
854 
855 	if (na == NULL) { /* VALE prefix missing */
856 		error = EINVAL;
857 		goto unlock_exit;
858 	}
859 
860 	if (NETMAP_OWNED_BY_ANY(na)) {
861 		error = EBUSY;
862 		goto unref_exit;
863 	}
864 
865 	if (na->nm_bdg_ctl) {
866 		/* nop for VALE ports. The bwrap needs to put the hwna
867 		 * in netmap mode (see netmap_bwrap_bdg_ctl)
868 		 */
869 		error = na->nm_bdg_ctl(na, nmr, 1);
870 		if (error)
871 			goto unref_exit;
872 		ND("registered %s to netmap-mode", na->name);
873 	}
874 	NMG_UNLOCK();
875 	return 0;
876 
877 unref_exit:
878 	netmap_adapter_put(na);
879 unlock_exit:
880 	NMG_UNLOCK();
881 	return error;
882 }
883 
884 static inline int
885 nm_is_bwrap(struct netmap_adapter *na)
886 {
887 	return na->nm_register == netmap_bwrap_reg;
888 }
889 
890 /* process NETMAP_BDG_DETACH */
891 static int
892 nm_bdg_ctl_detach(struct nmreq *nmr)
893 {
894 	struct netmap_adapter *na;
895 	int error;
896 
897 	NMG_LOCK();
898 	error = netmap_get_bdg_na(nmr, &na, NULL, 0 /* don't create */);
899 	if (error) { /* no device, or another bridge or user owns the device */
900 		goto unlock_exit;
901 	}
902 
903 	if (na == NULL) { /* VALE prefix missing */
904 		error = EINVAL;
905 		goto unlock_exit;
906 	} else if (nm_is_bwrap(na) &&
907 		   ((struct netmap_bwrap_adapter *)na)->na_polling_state) {
908 		/* Don't detach a NIC with polling */
909 		error = EBUSY;
910 		netmap_adapter_put(na);
911 		goto unlock_exit;
912 	}
913 	if (na->nm_bdg_ctl) {
914 		/* remove the port from bridge. The bwrap
915 		 * also needs to put the hwna in normal mode
916 		 */
917 		error = na->nm_bdg_ctl(na, nmr, 0);
918 	}
919 
920 	netmap_adapter_put(na);
921 unlock_exit:
922 	NMG_UNLOCK();
923 	return error;
924 
925 }
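/*
 * Matching userspace sketch for the two handlers above (assumptions:
 * 'fd' is an open /dev/netmap descriptor, switch "vale0" and NIC "em0"
 * exist; error handling omitted):
 */
#if 0
	struct nmreq nmr;

	bzero(&nmr, sizeof(nmr));
	nmr.nr_version = NETMAP_API;
	strncpy(nmr.nr_name, "vale0:em0", sizeof(nmr.nr_name) - 1);
	nmr.nr_cmd = NETMAP_BDG_ATTACH;
	nmr.nr_arg1 = NETMAP_BDG_HOST;	/* optionally attach the host stack too */
	ioctl(fd, NIOCREGIF, &nmr);
	/* ... later, detach the port ... */
	nmr.nr_cmd = NETMAP_BDG_DETACH;
	ioctl(fd, NIOCREGIF, &nmr);
#endif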
926 
927 struct nm_bdg_polling_state;
928 struct nm_bdg_kthread {
930 	struct nm_kctx *nmk;
931 	u_int qfirst;
932 	u_int qlast;
933 	struct nm_bdg_polling_state *bps;
934 };
935 
936 struct nm_bdg_polling_state {
937 	bool configured;
938 	bool stopped;
939 	struct netmap_bwrap_adapter *bna;
940 	u_int reg;
941 	u_int qfirst;
942 	u_int qlast;
943 	u_int cpu_from;
944 	u_int ncpus;
945 	struct nm_bdg_kthread *kthreads;
946 };
947 
948 static void
949 netmap_bwrap_polling(void *data, int is_kthread)
950 {
951 	struct nm_bdg_kthread *nbk = data;
952 	struct netmap_bwrap_adapter *bna;
953 	u_int qfirst, qlast, i;
954 	struct netmap_kring *kring0, *kring;
955 
956 	if (!nbk)
957 		return;
958 	qfirst = nbk->qfirst;
959 	qlast = nbk->qlast;
960 	bna = nbk->bps->bna;
961 	kring0 = NMR(bna->hwna, NR_RX);
962 
963 	for (i = qfirst; i < qlast; i++) {
964 		kring = kring0 + i;
965 		kring->nm_notify(kring, 0);
966 	}
967 }
968 
969 static int
970 nm_bdg_create_kthreads(struct nm_bdg_polling_state *bps)
971 {
972 	struct nm_kctx_cfg kcfg;
973 	int i, j;
974 
975 	bps->kthreads = nm_os_malloc(sizeof(struct nm_bdg_kthread) * bps->ncpus);
976 	if (bps->kthreads == NULL)
977 		return ENOMEM;
978 
979 	bzero(&kcfg, sizeof(kcfg));
980 	kcfg.worker_fn = netmap_bwrap_polling;
981 	kcfg.use_kthread = 1;
982 	for (i = 0; i < bps->ncpus; i++) {
983 		struct nm_bdg_kthread *t = bps->kthreads + i;
984 		int all = (bps->ncpus == 1 && bps->reg == NR_REG_ALL_NIC);
985 		int affinity = bps->cpu_from + i;
986 
987 		t->bps = bps;
988 		t->qfirst = all ? bps->qfirst /* must be 0 */: affinity;
989 		t->qlast = all ? bps->qlast : t->qfirst + 1;
990 		D("kthread %d a:%u qf:%u ql:%u", i, affinity, t->qfirst,
991 			t->qlast);
992 
993 		kcfg.type = i;
994 		kcfg.worker_private = t;
995 		t->nmk = nm_os_kctx_create(&kcfg, 0, NULL);
996 		if (t->nmk == NULL) {
997 			goto cleanup;
998 		}
999 		nm_os_kctx_worker_setaff(t->nmk, affinity);
1000 	}
1001 	return 0;
1002 
1003 cleanup:
1004 	for (j = 0; j < i; j++) {
1005 		struct nm_bdg_kthread *t = bps->kthreads + j;
1006 		nm_os_kctx_destroy(t->nmk);
1007 	}
1008 	nm_os_free(bps->kthreads);
1009 	return EFAULT;
1010 }
1011 
1012 /* A variant of ptnetmap_start_kthreads() */
1013 static int
1014 nm_bdg_polling_start_kthreads(struct nm_bdg_polling_state *bps)
1015 {
1016 	int error, i, j;
1017 
1018 	if (!bps) {
1019 		D("polling is not configured");
1020 		return EFAULT;
1021 	}
1022 	bps->stopped = false;
1023 
1024 	for (i = 0; i < bps->ncpus; i++) {
1025 		struct nm_bdg_kthread *t = bps->kthreads + i;
1026 		error = nm_os_kctx_worker_start(t->nmk);
1027 		if (error) {
1028 			D("error in nm_kthread_start()");
1029 			goto cleanup;
1030 		}
1031 	}
1032 	return 0;
1033 
1034 cleanup:
1035 	for (j = 0; j < i; j++) {
1036 		struct nm_bdg_kthread *t = bps->kthreads + j;
1037 		nm_os_kctx_worker_stop(t->nmk);
1038 	}
1039 	bps->stopped = true;
1040 	return error;
1041 }
1042 
1043 static void
1044 nm_bdg_polling_stop_delete_kthreads(struct nm_bdg_polling_state *bps)
1045 {
1046 	int i;
1047 
1048 	if (!bps)
1049 		return;
1050 
1051 	for (i = 0; i < bps->ncpus; i++) {
1052 		struct nm_bdg_kthread *t = bps->kthreads + i;
1053 		nm_os_kctx_worker_stop(t->nmk);
1054 		nm_os_kctx_destroy(t->nmk);
1055 	}
1056 	bps->stopped = true;
1057 }
1058 
1059 static int
1060 get_polling_cfg(struct nmreq *nmr, struct netmap_adapter *na,
1061 			struct nm_bdg_polling_state *bps)
1062 {
1063 	int req_cpus, avail_cpus, core_from;
1064 	u_int reg, i, qfirst, qlast;
1065 
1066 	avail_cpus = nm_os_ncpus();
1067 	req_cpus = nmr->nr_arg1;
1068 
1069 	if (req_cpus == 0) {
1070 		D("req_cpus must be > 0");
1071 		return EINVAL;
1072 	} else if (req_cpus >= avail_cpus) {
1073 		D("for safety, we need at least one core left in the system");
1074 		return EINVAL;
1075 	}
1076 	reg = nmr->nr_flags & NR_REG_MASK;
1077 	i = nmr->nr_ringid & NETMAP_RING_MASK;
1078 	/*
1079 	 * ONE_NIC: dedicate one core to one ring. If multiple cores
1080 	 *          are specified, consecutive rings are also polled.
1081 	 *          For example, if ringid=2 and 2 cores are given,
1082 	 *          ring 2 and 3 are polled by core 2 and 3, respectively.
1083 	 * ALL_NIC: poll all the rings using a core specified by ringid.
1084 	 *          the number of cores must be 1.
1085 	 */
1086 	if (reg == NR_REG_ONE_NIC) {
1087 		if (i + req_cpus > nma_get_nrings(na, NR_RX)) {
1088 			D("only %d rings exist (ring %u-%u is given)",
1089 				nma_get_nrings(na, NR_RX), i, i+req_cpus);
1090 			return EINVAL;
1091 		}
1092 		qfirst = i;
1093 		qlast = qfirst + req_cpus;
1094 		core_from = qfirst;
1095 	} else if (reg == NR_REG_ALL_NIC) {
1096 		if (req_cpus != 1) {
1097 			D("ncpus must be 1 not %d for REG_ALL_NIC", req_cpus);
1098 			return EINVAL;
1099 		}
1100 		qfirst = 0;
1101 		qlast = nma_get_nrings(na, NR_RX);
1102 		core_from = i;
1103 	} else {
1104 		D("reg must be ALL_NIC or ONE_NIC");
1105 		return EINVAL;
1106 	}
1107 
1108 	bps->reg = reg;
1109 	bps->qfirst = qfirst;
1110 	bps->qlast = qlast;
1111 	bps->cpu_from = core_from;
1112 	bps->ncpus = req_cpus;
1113 	D("%s qfirst %u qlast %u cpu_from %u ncpus %u",
1114 		reg == NR_REG_ALL_NIC ? "REG_ALL_NIC" : "REG_ONE_NIC",
1115 		qfirst, qlast, core_from, req_cpus);
1116 	return 0;
1117 }
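/*
 * Worked example for the code above: with NR_REG_ONE_NIC, ringid 2 and
 * nr_arg1 == 2 we get qfirst 2, qlast 4, cpu_from 2, i.e. rings 2 and 3
 * are polled by cores 2 and 3. With NR_REG_ALL_NIC, nr_arg1 must be 1
 * and the single core named by ringid polls all the rings.
 */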
1118 
1119 static int
1120 nm_bdg_ctl_polling_start(struct nmreq *nmr, struct netmap_adapter *na)
1121 {
1122 	struct nm_bdg_polling_state *bps;
1123 	struct netmap_bwrap_adapter *bna;
1124 	int error;
1125 
1126 	bna = (struct netmap_bwrap_adapter *)na;
1127 	if (bna->na_polling_state) {
1128 		D("ERROR adapter already in polling mode");
1129 		return EFAULT;
1130 	}
1131 
1132 	bps = nm_os_malloc(sizeof(*bps));
1133 	if (!bps)
1134 		return ENOMEM;
1135 	bps->configured = false;
1136 	bps->stopped = true;
1137 
1138 	if (get_polling_cfg(nmr, na, bps)) {
1139 		nm_os_free(bps);
1140 		return EINVAL;
1141 	}
1142 
1143 	if (nm_bdg_create_kthreads(bps)) {
1144 		nm_os_free(bps);
1145 		return EFAULT;
1146 	}
1147 
1148 	bps->configured = true;
1149 	bna->na_polling_state = bps;
1150 	bps->bna = bna;
1151 
1152 	/* disable interrupt if possible */
1153 	if (bna->hwna->nm_intr)
1154 		bna->hwna->nm_intr(bna->hwna, 0);
1155 	/* start kthread now */
1156 	error = nm_bdg_polling_start_kthreads(bps);
1157 	if (error) {
1158 		D("ERROR nm_bdg_polling_start_kthread()");
1159 		nm_os_free(bps->kthreads);
1160 		nm_os_free(bps);
1161 		bna->na_polling_state = NULL;
1162 		if (bna->hwna->nm_intr)
1163 			bna->hwna->nm_intr(bna->hwna, 1);
1164 	}
1165 	return error;
1166 }
1167 
1168 static int
1169 nm_bdg_ctl_polling_stop(struct nmreq *nmr, struct netmap_adapter *na)
1170 {
1171 	struct netmap_bwrap_adapter *bna = (struct netmap_bwrap_adapter *)na;
1172 	struct nm_bdg_polling_state *bps;
1173 
1174 	if (!bna->na_polling_state) {
1175 		D("ERROR adapter is not in polling mode");
1176 		return EFAULT;
1177 	}
1178 	bps = bna->na_polling_state;
1179 	nm_bdg_polling_stop_delete_kthreads(bna->na_polling_state);
1180 	bps->configured = false;
1181 	nm_os_free(bps);
1182 	bna->na_polling_state = NULL;
1183 	/* reenable interrupt */
1184 	if (bna->hwna->nm_intr)
1185 		bna->hwna->nm_intr(bna->hwna, 1);
1186 	return 0;
1187 }
1188 
1189 /* Called by either user's context (netmap_ioctl())
1190  * or external kernel modules (e.g., Openvswitch).
1191  * Operation is indicated in nmr->nr_cmd.
1192  * NETMAP_BDG_OPS that sets configure/lookup/dtor functions to the bridge
1193  * requires bdg_ops argument; the other commands ignore this argument.
1194  *
1195  * Called without NMG_LOCK.
1196  */
1197 int
1198 netmap_bdg_ctl(struct nmreq *nmr, struct netmap_bdg_ops *bdg_ops)
1199 {
1200 	struct nm_bridge *b, *bridges;
1201 	struct netmap_adapter *na;
1202 	struct netmap_vp_adapter *vpna;
1203 	char *name = nmr->nr_name;
1204 	int cmd = nmr->nr_cmd, namelen = strlen(name);
1205 	int error = 0, i, j;
1206 	u_int num_bridges;
1207 
1208 	netmap_bns_getbridges(&bridges, &num_bridges);
1209 
1210 	switch (cmd) {
1211 	case NETMAP_BDG_NEWIF:
1212 		error = netmap_vi_create(nmr, 0 /* no autodelete */);
1213 		break;
1214 
1215 	case NETMAP_BDG_DELIF:
1216 		error = nm_vi_destroy(nmr->nr_name);
1217 		break;
1218 
1219 	case NETMAP_BDG_ATTACH:
1220 		error = nm_bdg_ctl_attach(nmr);
1221 		break;
1222 
1223 	case NETMAP_BDG_DETACH:
1224 		error = nm_bdg_ctl_detach(nmr);
1225 		break;
1226 
1227 	case NETMAP_BDG_LIST:
1228 		/* this is used to enumerate bridges and ports */
1229 		if (namelen) { /* look up indexes of bridge and port */
1230 			if (strncmp(name, NM_BDG_NAME, strlen(NM_BDG_NAME))) {
1231 				error = EINVAL;
1232 				break;
1233 			}
1234 			NMG_LOCK();
1235 			b = nm_find_bridge(name, 0 /* don't create */);
1236 			if (!b) {
1237 				error = ENOENT;
1238 				NMG_UNLOCK();
1239 				break;
1240 			}
1241 
1242 			error = 0;
1243 			nmr->nr_arg1 = b - bridges; /* bridge index */
1244 			nmr->nr_arg2 = NM_BDG_NOPORT;
1245 			for (j = 0; j < b->bdg_active_ports; j++) {
1246 				i = b->bdg_port_index[j];
1247 				vpna = b->bdg_ports[i];
1248 				if (vpna == NULL) {
1249 					D("---AAAAAAAAARGH-------");
1250 					continue;
1251 				}
1252 				/* the former and the latter identify a
1253 				 * virtual port and a NIC, respectively
1254 				 */
1255 				if (!strcmp(vpna->up.name, name)) {
1256 					nmr->nr_arg2 = i; /* port index */
1257 					break;
1258 				}
1259 			}
1260 			NMG_UNLOCK();
1261 		} else {
1262 			/* return the first non-empty entry starting from
1263 			 * bridge nr_arg1 and port nr_arg2.
1264 			 *
1265 			 * Users can detect the end of the same bridge by
1266 			 * seeing the new and old values of nr_arg1, and can
1267 			 * detect the end of all the bridges by error != 0
1268 			 */
1269 			i = nmr->nr_arg1;
1270 			j = nmr->nr_arg2;
1271 
1272 			NMG_LOCK();
1273 			for (error = ENOENT; i < NM_BRIDGES; i++) {
1274 				b = bridges + i;
1275 				for ( ; j < NM_BDG_MAXPORTS; j++) {
1276 					if (b->bdg_ports[j] == NULL)
1277 						continue;
1278 					vpna = b->bdg_ports[j];
1279 					strncpy(name, vpna->up.name, (size_t)IFNAMSIZ);
1280 					error = 0;
1281 					goto out;
1282 				}
1283 				j = 0; /* following bridges scan from 0 */
1284 			}
1285 		out:
1286 			nmr->nr_arg1 = i;
1287 			nmr->nr_arg2 = j;
1288 			NMG_UNLOCK();
1289 		}
1290 		break;
1291 
1292 	case NETMAP_BDG_REGOPS: /* XXX this should not be available from userspace */
1293 		/* register callbacks to the given bridge.
1294 		 * nmr->nr_name may be just the bridge's name (including ':'
1295 		 * if it is not just NM_NAME).
1296 		 */
1297 		if (!bdg_ops) {
1298 			error = EINVAL;
1299 			break;
1300 		}
1301 		NMG_LOCK();
1302 		b = nm_find_bridge(name, 0 /* don't create */);
1303 		if (!b) {
1304 			error = EINVAL;
1305 		} else {
1306 			b->bdg_ops = *bdg_ops;
1307 		}
1308 		NMG_UNLOCK();
1309 		break;
1310 
1311 	case NETMAP_BDG_VNET_HDR:
1312 		/* Valid lengths for the virtio-net header are 0 (no header),
1313 		   10 and 12. */
1314 		if (nmr->nr_arg1 != 0 &&
1315 			nmr->nr_arg1 != sizeof(struct nm_vnet_hdr) &&
1316 				nmr->nr_arg1 != 12) {
1317 			error = EINVAL;
1318 			break;
1319 		}
1320 		NMG_LOCK();
1321 		error = netmap_get_bdg_na(nmr, &na, NULL, 0);
1322 		if (na && !error) {
1323 			vpna = (struct netmap_vp_adapter *)na;
1324 			na->virt_hdr_len = nmr->nr_arg1;
1325 			if (na->virt_hdr_len) {
1326 				vpna->mfs = NETMAP_BUF_SIZE(na);
1327 			}
1328 			D("Using vnet_hdr_len %d for %p", na->virt_hdr_len, na);
1329 			netmap_adapter_put(na);
1330 		} else if (!na) {
1331 			error = ENXIO;
1332 		}
1333 		NMG_UNLOCK();
1334 		break;
1335 
1336 	case NETMAP_BDG_POLLING_ON:
1337 	case NETMAP_BDG_POLLING_OFF:
1338 		NMG_LOCK();
1339 		error = netmap_get_bdg_na(nmr, &na, NULL, 0);
1340 		if (na && !error) {
1341 			if (!nm_is_bwrap(na)) {
1342 				error = EOPNOTSUPP;
1343 			} else if (cmd == NETMAP_BDG_POLLING_ON) {
1344 				error = nm_bdg_ctl_polling_start(nmr, na);
1345 				if (!error)
1346 					netmap_adapter_get(na);
1347 			} else {
1348 				error = nm_bdg_ctl_polling_stop(nmr, na);
1349 				if (!error)
1350 					netmap_adapter_put(na);
1351 			}
1352 			netmap_adapter_put(na);
1353 		}
1354 		NMG_UNLOCK();
1355 		break;
1356 
1357 	default:
1358 		D("invalid cmd (nmr->nr_cmd) (0x%x)", cmd);
1359 		error = EINVAL;
1360 		break;
1361 	}
1362 	return error;
1363 }
1364 
1365 int
1366 netmap_bdg_config(struct nmreq *nmr)
1367 {
1368 	struct nm_bridge *b;
1369 	int error = EINVAL;
1370 
1371 	NMG_LOCK();
1372 	b = nm_find_bridge(nmr->nr_name, 0);
1373 	if (!b) {
1374 		NMG_UNLOCK();
1375 		return error;
1376 	}
1377 	NMG_UNLOCK();
1378 	/* Don't call config() with NMG_LOCK() held */
1379 	BDG_RLOCK(b);
1380 	if (b->bdg_ops.config != NULL)
1381 		error = b->bdg_ops.config((struct nm_ifreq *)nmr);
1382 	BDG_RUNLOCK(b);
1383 	return error;
1384 }
1385 
1386 
1387 /* nm_krings_create callback for VALE ports.
1388  * Calls the standard netmap_krings_create, then adds leases on rx
1389  * rings and bdgfwd on tx rings.
1390  */
1391 static int
1392 netmap_vp_krings_create(struct netmap_adapter *na)
1393 {
1394 	u_int tailroom;
1395 	int error, i;
1396 	uint32_t *leases;
1397 	u_int nrx = netmap_real_rings(na, NR_RX);
1398 
1399 	/*
1400 	 * Leases are attached to RX rings on vale ports
1401 	 */
1402 	tailroom = sizeof(uint32_t) * na->num_rx_desc * nrx;
1403 
1404 	error = netmap_krings_create(na, tailroom);
1405 	if (error)
1406 		return error;
1407 
1408 	leases = na->tailroom;
1409 
1410 	for (i = 0; i < nrx; i++) { /* Receive rings */
1411 		na->rx_rings[i].nkr_leases = leases;
1412 		leases += na->num_rx_desc;
1413 	}
1414 
1415 	error = nm_alloc_bdgfwd(na);
1416 	if (error) {
1417 		netmap_krings_delete(na);
1418 		return error;
1419 	}
1420 
1421 	return 0;
1422 }
1423 
1424 
1425 /* nm_krings_delete callback for VALE ports. */
1426 static void
1427 netmap_vp_krings_delete(struct netmap_adapter *na)
1428 {
1429 	nm_free_bdgfwd(na);
1430 	netmap_krings_delete(na);
1431 }
1432 
1433 
1434 static int
1435 nm_bdg_flush(struct nm_bdg_fwd *ft, u_int n,
1436 	struct netmap_vp_adapter *na, u_int ring_nr);
1437 
1438 
1439 /*
1440  * main dispatch routine for the bridge.
1441  * Grab packets from a kring, move them into the ft structure
1442  * associated to the tx (input) port. Max one instance per port,
1443  * filtered on input (ioctl, poll or XXX).
1444  * Returns the next position in the ring.
1445  */
1446 static int
1447 nm_bdg_preflush(struct netmap_kring *kring, u_int end)
1448 {
1449 	struct netmap_vp_adapter *na =
1450 		(struct netmap_vp_adapter*)kring->na;
1451 	struct netmap_ring *ring = kring->ring;
1452 	struct nm_bdg_fwd *ft;
1453 	u_int ring_nr = kring->ring_id;
1454 	u_int j = kring->nr_hwcur, lim = kring->nkr_num_slots - 1;
1455 	u_int ft_i = 0;	/* start from 0 */
1456 	u_int frags = 1; /* how many frags ? */
1457 	struct nm_bridge *b = na->na_bdg;
1458 
1459 	/* To protect against modifications to the bridge we acquire a
1460 	 * shared lock, waiting if we can sleep (if the source port is
1461 	 * attached to a user process) or with a trylock otherwise (NICs).
1462 	 */
1463 	ND("wait rlock for %d packets", ((j > end ? lim+1 : 0) + end) - j);
1464 	if (na->up.na_flags & NAF_BDG_MAYSLEEP)
1465 		BDG_RLOCK(b);
1466 	else if (!BDG_RTRYLOCK(b))
1467 		return j;
1468 	ND(5, "rlock acquired for %d packets", ((j > end ? lim+1 : 0) + end) - j);
1469 	ft = kring->nkr_ft;
1470 
1471 	for (; likely(j != end); j = nm_next(j, lim)) {
1472 		struct netmap_slot *slot = &ring->slot[j];
1473 		char *buf;
1474 
1475 		ft[ft_i].ft_len = slot->len;
1476 		ft[ft_i].ft_flags = slot->flags;
1477 
1478 		ND("flags is 0x%x", slot->flags);
1479 		/* we do not use the buf changed flag, but we still need to reset it */
1480 		slot->flags &= ~NS_BUF_CHANGED;
1481 
1482 		/* this slot goes into a list so initialize the link field */
1483 		ft[ft_i].ft_next = NM_FT_NULL;
1484 		buf = ft[ft_i].ft_buf = (slot->flags & NS_INDIRECT) ?
1485 			(void *)(uintptr_t)slot->ptr : NMB(&na->up, slot);
1486 		if (unlikely(buf == NULL)) {
1487 			RD(5, "NULL %s buffer pointer from %s slot %d len %d",
1488 				(slot->flags & NS_INDIRECT) ? "INDIRECT" : "DIRECT",
1489 				kring->name, j, ft[ft_i].ft_len);
1490 			buf = ft[ft_i].ft_buf = NETMAP_BUF_BASE(&na->up);
1491 			ft[ft_i].ft_len = 0;
1492 			ft[ft_i].ft_flags = 0;
1493 		}
1494 		__builtin_prefetch(buf);
1495 		++ft_i;
1496 		if (slot->flags & NS_MOREFRAG) {
1497 			frags++;
1498 			continue;
1499 		}
1500 		if (unlikely(netmap_verbose && frags > 1))
1501 			RD(5, "%d frags at %d", frags, ft_i - frags);
1502 		ft[ft_i - frags].ft_frags = frags;
1503 		frags = 1;
1504 		if (unlikely((int)ft_i >= bridge_batch))
1505 			ft_i = nm_bdg_flush(ft, ft_i, na, ring_nr);
1506 	}
1507 	if (frags > 1) {
1508 		/* Here ft_i > 0, ft[ft_i-1].flags has NS_MOREFRAG, and we
1509 		 * have to fix frags count. */
1510 		frags--;
1511 		ft[ft_i - 1].ft_flags &= ~NS_MOREFRAG;
1512 		ft[ft_i - frags].ft_frags = frags;
1513 		D("Truncate incomplete fragment at %d (%d frags)", ft_i, frags);
1514 	}
1515 	if (ft_i)
1516 		ft_i = nm_bdg_flush(ft, ft_i, na, ring_nr);
1517 	BDG_RUNLOCK(b);
1518 	return j;
1519 }
1520 
1521 
1522 /* ----- FreeBSD if_bridge hash function ------- */
1523 
1524 /*
1525  * The following hash function is adapted from "Hash Functions" by Bob Jenkins
1526  * ("Algorithm Alley", Dr. Dobbs Journal, September 1997).
1527  *
1528  * http://www.burtleburtle.net/bob/hash/spooky.html
1529  */
1530 #define mix(a, b, c)                                                    \
1531 do {                                                                    \
1532         a -= b; a -= c; a ^= (c >> 13);                                 \
1533         b -= c; b -= a; b ^= (a << 8);                                  \
1534         c -= a; c -= b; c ^= (b >> 13);                                 \
1535         a -= b; a -= c; a ^= (c >> 12);                                 \
1536         b -= c; b -= a; b ^= (a << 16);                                 \
1537         c -= a; c -= b; c ^= (b >> 5);                                  \
1538         a -= b; a -= c; a ^= (c >> 3);                                  \
1539         b -= c; b -= a; b ^= (a << 10);                                 \
1540         c -= a; c -= b; c ^= (b >> 15);                                 \
1541 } while (/*CONSTCOND*/0)
1542 
1543 
1544 static __inline uint32_t
1545 nm_bridge_rthash(const uint8_t *addr)
1546 {
1547         uint32_t a = 0x9e3779b9, b = 0x9e3779b9, c = 0; // hash key
1548 
1549         b += addr[5] << 8;
1550         b += addr[4];
1551         a += addr[3] << 24;
1552         a += addr[2] << 16;
1553         a += addr[1] << 8;
1554         a += addr[0];
1555 
1556         mix(a, b, c);
1557 #define BRIDGE_RTHASH_MASK	(NM_BDG_HASH-1)
1558         return (c & BRIDGE_RTHASH_MASK);
1559 }
1560 
1561 #undef mix
1562 
1563 
1564 /* nm_register callback for VALE ports */
1565 static int
1566 netmap_vp_reg(struct netmap_adapter *na, int onoff)
1567 {
1568 	struct netmap_vp_adapter *vpna =
1569 		(struct netmap_vp_adapter*)na;
1570 	enum txrx t;
1571 	int i;
1572 
1573 	/* persistent ports may be put in netmap mode
1574 	 * before being attached to a bridge
1575 	 */
1576 	if (vpna->na_bdg)
1577 		BDG_WLOCK(vpna->na_bdg);
1578 	if (onoff) {
1579 		for_rx_tx(t) {
1580 			for (i = 0; i < nma_get_nrings(na, t) + 1; i++) {
1581 				struct netmap_kring *kring = &NMR(na, t)[i];
1582 
1583 				if (nm_kring_pending_on(kring))
1584 					kring->nr_mode = NKR_NETMAP_ON;
1585 			}
1586 		}
1587 		if (na->active_fds == 0)
1588 			na->na_flags |= NAF_NETMAP_ON;
1589 		 /* XXX on FreeBSD, persistent VALE ports should also
1590 		 * toggle IFCAP_NETMAP in na->ifp (2014-03-16)
1591 		 */
1592 	} else {
1593 		if (na->active_fds == 0)
1594 			na->na_flags &= ~NAF_NETMAP_ON;
1595 		for_rx_tx(t) {
1596 			for (i = 0; i < nma_get_nrings(na, t) + 1; i++) {
1597 				struct netmap_kring *kring = &NMR(na, t)[i];
1598 
1599 				if (nm_kring_pending_off(kring))
1600 					kring->nr_mode = NKR_NETMAP_OFF;
1601 			}
1602 		}
1603 	}
1604 	if (vpna->na_bdg)
1605 		BDG_WUNLOCK(vpna->na_bdg);
1606 	return 0;
1607 }
1608 
1609 
1610 /*
1611  * Lookup function for a learning bridge.
1612  * Update the hash table with the source address,
1613  * and then returns the destination port index, and the
1614  * ring in *dst_ring (at the moment, always use ring 0)
1615  */
1616 u_int
1617 netmap_bdg_learning(struct nm_bdg_fwd *ft, uint8_t *dst_ring,
1618 		struct netmap_vp_adapter *na)
1619 {
1620 	uint8_t *buf = ft->ft_buf;
1621 	u_int buf_len = ft->ft_len;
1622 	struct nm_hash_ent *ht = na->na_bdg->ht;
1623 	uint32_t sh, dh;
1624 	u_int dst, mysrc = na->bdg_port;
1625 	uint64_t smac, dmac;
1626 	uint8_t indbuf[12];
1627 
1628 	/* safety check, unfortunately we have many cases */
1629 	if (buf_len >= 14 + na->up.virt_hdr_len) {
1630 		/* virthdr + mac_hdr in the same slot */
1631 		buf += na->up.virt_hdr_len;
1632 		buf_len -= na->up.virt_hdr_len;
1633 	} else if (buf_len == na->up.virt_hdr_len && ft->ft_flags & NS_MOREFRAG) {
1634 		/* only header in first fragment */
1635 		ft++;
1636 		buf = ft->ft_buf;
1637 		buf_len = ft->ft_len;
1638 	} else {
1639 		RD(5, "invalid buf format, length %d", buf_len);
1640 		return NM_BDG_NOPORT;
1641 	}
1642 
1643 	if (ft->ft_flags & NS_INDIRECT) {
1644 		if (copyin(buf, indbuf, sizeof(indbuf))) {
1645 			return NM_BDG_NOPORT;
1646 		}
1647 		buf = indbuf;
1648 	}
1649 
1650 	dmac = le64toh(*(uint64_t *)(buf)) & 0xffffffffffff;
1651 	smac = le64toh(*(uint64_t *)(buf + 4));
1652 	smac >>= 16;
1653 
1654 	/*
1655 	 * The hash is somewhat expensive, there might be some
1656 	 * worthwhile optimizations here.
1657 	 */
1658 	if (((buf[6] & 1) == 0) && (na->last_smac != smac)) { /* valid src */
1659 		uint8_t *s = buf+6;
1660 		sh = nm_bridge_rthash(s); // XXX hash of source
1661 		/* update source port forwarding entry */
1662 		na->last_smac = ht[sh].mac = smac;	/* XXX expire ? */
1663 		ht[sh].ports = mysrc;
1664 		if (netmap_verbose)
1665 		    D("src %02x:%02x:%02x:%02x:%02x:%02x on port %d",
1666 			s[0], s[1], s[2], s[3], s[4], s[5], mysrc);
1667 	}
1668 	dst = NM_BDG_BROADCAST;
1669 	if ((buf[0] & 1) == 0) { /* unicast */
1670 		dh = nm_bridge_rthash(buf); // XXX hash of dst
1671 		if (ht[dh].mac == dmac) {	/* found dst */
1672 			dst = ht[dh].ports;
1673 		}
1674 		/* XXX otherwise return NM_BDG_UNKNOWN ? */
1675 	}
1676 	return dst;
1677 }
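/*
 * A note on the unaligned loads above: the first 8 bytes of the
 * Ethernet header are dst[0..5] src[0..1], so the little-endian load
 * at 'buf' masked with 0xffffffffffff yields the 48-bit destination
 * MAC, while the load at 'buf + 4' covers dst[4..5] src[0..5] and the
 * ">> 16" drops the two leftover destination bytes, leaving the
 * 48-bit source MAC.
 */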
1678 
1679 
1680 /*
1681  * Available space in the ring. Only used in VALE code
1682  * and only with is_rx = 1
1683  */
1684 static inline uint32_t
1685 nm_kr_space(struct netmap_kring *k, int is_rx)
1686 {
1687 	int space;
1688 
1689 	if (is_rx) {
1690 		int busy = k->nkr_hwlease - k->nr_hwcur;
1691 		if (busy < 0)
1692 			busy += k->nkr_num_slots;
1693 		space = k->nkr_num_slots - 1 - busy;
1694 	} else {
1695 		/* XXX never used in this branch */
1696 		space = k->nr_hwtail - k->nkr_hwlease;
1697 		if (space < 0)
1698 			space += k->nkr_num_slots;
1699 	}
1700 #if 0
1701 	// sanity check
1702 	if (k->nkr_hwlease >= k->nkr_num_slots ||
1703 		k->nr_hwcur >= k->nkr_num_slots ||
1704 		k->nr_hwtail >= k->nkr_num_slots ||
1705 		busy < 0 ||
1706 		busy >= k->nkr_num_slots) {
1707 		D("invalid kring, cur %d tail %d lease %d lease_idx %d lim %d",			k->nr_hwcur, k->nr_hwtail, k->nkr_hwlease,
1708 			k->nkr_lease_idx, k->nkr_num_slots);
1709 	}
1710 #endif
1711 	return space;
1712 }
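/*
 * Worked example (rx case): with nkr_num_slots == 1024, nr_hwcur == 100
 * and nkr_hwlease == 90, busy == -10 + 1024 == 1014, so only
 * space == 1024 - 1 - 1014 == 9 more slots can be leased.
 */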
1713 
1714 
1715 
1716 
1717 /* make a lease on the kring for N positions. return the
1718  * lease index
1719  * XXX only used in VALE code and with is_rx = 1
1720  */
1721 static inline uint32_t
1722 nm_kr_lease(struct netmap_kring *k, u_int n, int is_rx)
1723 {
1724 	uint32_t lim = k->nkr_num_slots - 1;
1725 	uint32_t lease_idx = k->nkr_lease_idx;
1726 
1727 	k->nkr_leases[lease_idx] = NR_NOSLOT;
1728 	k->nkr_lease_idx = nm_next(lease_idx, lim);
1729 
1730 	if (n > nm_kr_space(k, is_rx)) {
1731 		D("invalid request for %d slots", n);
1732 		panic("x");
1733 	}
1734 	/* XXX verify that there are n slots */
1735 	k->nkr_hwlease += n;
1736 	if (k->nkr_hwlease > lim)
1737 		k->nkr_hwlease -= lim + 1;
1738 
1739 	if (k->nkr_hwlease >= k->nkr_num_slots ||
1740 		k->nr_hwcur >= k->nkr_num_slots ||
1741 		k->nr_hwtail >= k->nkr_num_slots ||
1742 		k->nkr_lease_idx >= k->nkr_num_slots) {
1743 		D("invalid kring %s, cur %d tail %d lease %d lease_idx %d lim %d",
1744 			k->na->name,
1745 			k->nr_hwcur, k->nr_hwtail, k->nkr_hwlease,
1746 			k->nkr_lease_idx, k->nkr_num_slots);
1747 	}
1748 	return lease_idx;
1749 }
1750 
1751 /*
1752  *
1753  * This flush routine supports only unicast and broadcast but a large
1754  * number of ports, and lets us replace the learn and dispatch functions.
1755  */
1756 int
1757 nm_bdg_flush(struct nm_bdg_fwd *ft, u_int n, struct netmap_vp_adapter *na,
1758 		u_int ring_nr)
1759 {
1760 	struct nm_bdg_q *dst_ents, *brddst;
1761 	uint16_t num_dsts = 0, *dsts;
1762 	struct nm_bridge *b = na->na_bdg;
1763 	u_int i, me = na->bdg_port;
1764 
1765 	/*
1766 	 * The work area (pointed to by ft) is followed by an array of
1767 	 * queues, dst_ents; there are NM_BDG_MAXRINGS
1768 	 * queues per port plus one for the broadcast traffic.
1769 	 * Then we have an array of destination indexes.
1770 	 */
1771 	dst_ents = (struct nm_bdg_q *)(ft + NM_BDG_BATCH_MAX);
1772 	dsts = (uint16_t *)(dst_ents + NM_BDG_MAXPORTS * NM_BDG_MAXRINGS + 1);
1773 
1774 	/* first pass: find a destination for each packet in the batch */
1775 	for (i = 0; likely(i < n); i += ft[i].ft_frags) {
1776 		uint8_t dst_ring = ring_nr; /* default, same ring as origin */
1777 		uint16_t dst_port, d_i;
1778 		struct nm_bdg_q *d;
1779 
1780 		ND("slot %d frags %d", i, ft[i].ft_frags);
1781 		/* Drop the packet if the virtio-net header is not contained in the first
1782 		   fragment nor at the very beginning of the second. */
1783 		if (unlikely(na->up.virt_hdr_len > ft[i].ft_len))
1784 			continue;
1785 		dst_port = b->bdg_ops.lookup(&ft[i], &dst_ring, na);
1786 		if (netmap_verbose > 255)
1787 			RD(5, "slot %d port %d -> %d", i, me, dst_port);
1788 		if (dst_port == NM_BDG_NOPORT)
1789 			continue; /* this packet is identified to be dropped */
1790 		else if (unlikely(dst_port > NM_BDG_MAXPORTS))
1791 			continue;
1792 		else if (dst_port == NM_BDG_BROADCAST)
1793 			dst_ring = 0; /* broadcasts always go to ring 0 */
1794 		else if (unlikely(dst_port == me ||
1795 		    !b->bdg_ports[dst_port]))
1796 			continue;
1797 
1798 		/* get a position in the scratch pad */
1799 		d_i = dst_port * NM_BDG_MAXRINGS + dst_ring;
1800 		d = dst_ents + d_i;
1801 
1802 		/* append the first fragment to the list */
1803 		if (d->bq_head == NM_FT_NULL) { /* new destination */
1804 			d->bq_head = d->bq_tail = i;
1805 			/* remember this position to be scanned later */
1806 			if (dst_port != NM_BDG_BROADCAST)
1807 				dsts[num_dsts++] = d_i;
1808 		} else {
1809 			ft[d->bq_tail].ft_next = i;
1810 			d->bq_tail = i;
1811 		}
1812 		d->bq_len += ft[i].ft_frags;
1813 	}
1814 
1815 	/*
1816 	 * Broadcast traffic goes to ring 0 on all destinations.
1817 	 * So we need to add these rings to the list of ports to scan.
1818 	 * XXX at the moment we scan all NM_BDG_MAXPORTS ports, which is
1819 	 * expensive. We should keep a compact list of active destinations
1820 	 * so we could shorten this loop.
1821 	 */
1822 	brddst = dst_ents + NM_BDG_BROADCAST * NM_BDG_MAXRINGS;
1823 	if (brddst->bq_head != NM_FT_NULL) {
1824 		u_int j;
1825 		for (j = 0; likely(j < b->bdg_active_ports); j++) {
1826 			uint16_t d_i;
1827 			i = b->bdg_port_index[j];
1828 			if (unlikely(i == me))
1829 				continue;
1830 			d_i = i * NM_BDG_MAXRINGS;
1831 			if (dst_ents[d_i].bq_head == NM_FT_NULL)
1832 				dsts[num_dsts++] = d_i;
1833 		}
1834 	}
1835 
1836 	ND(5, "pass 1 done %d pkts %d dsts", n, num_dsts);
1837 	/* second pass: scan destinations */
1838 	for (i = 0; i < num_dsts; i++) {
1839 		struct netmap_vp_adapter *dst_na;
1840 		struct netmap_kring *kring;
1841 		struct netmap_ring *ring;
1842 		u_int dst_nr, lim, j, d_i, next, brd_next;
1843 		u_int needed, howmany;
1844 		int retry = netmap_txsync_retry;
1845 		struct nm_bdg_q *d;
1846 		uint32_t my_start = 0, lease_idx = 0;
1847 		int nrings;
1848 		int virt_hdr_mismatch = 0;
1849 
1850 		d_i = dsts[i];
1851 		ND("second pass %d port %d", i, d_i);
1852 		d = dst_ents + d_i;
1853 		// XXX fix the division
1854 		dst_na = b->bdg_ports[d_i/NM_BDG_MAXRINGS];
1855 		/* protect from the lookup function returning an inactive
1856 		 * destination port
1857 		 */
1858 		if (unlikely(dst_na == NULL))
1859 			goto cleanup;
1860 		if (dst_na->up.na_flags & NAF_SW_ONLY)
1861 			goto cleanup;
1862 		/*
1863 		 * The interface may be in !netmap mode in two cases:
1864 		 * - when na is attached but not activated yet;
1865 		 * - when na is being deactivated but is still attached.
1866 		 */
1867 		if (unlikely(!nm_netmap_on(&dst_na->up))) {
1868 			ND("not in netmap mode!");
1869 			goto cleanup;
1870 		}
1871 
1872 		/* there is at least one packet, either unicast or broadcast */
1873 		brd_next = brddst->bq_head;
1874 		next = d->bq_head;
1875 		/* We need to reserve this many slots. If fewer are
1876 		 * available, some packets will be dropped.
1877 		 * Packets may have multiple fragments, so there is a
1878 		 * chance that we will not use all of the slots we have
1879 		 * claimed, and we will need to handle the leftover
1880 		 * ones when we regain the lock.
1881 		 */
1882 		needed = d->bq_len + brddst->bq_len;
1883 
1884 		if (unlikely(dst_na->up.virt_hdr_len != na->up.virt_hdr_len)) {
1885 			if (netmap_verbose) {
1886 			    RD(3, "virt_hdr_mismatch, src %d dst %d", na->up.virt_hdr_len,
1887 				  dst_na->up.virt_hdr_len);
1888 			}
1889 			/* There is a virtio-net header/offloadings mismatch between
1890 			 * source and destination. The slower mismatch datapath will
1891 			 * be used to cope with all the mismatches.
1892 			 */
1893 			virt_hdr_mismatch = 1;
1894 			if (dst_na->mfs < na->mfs) {
1895 				/* We may need to do segmentation offloading, and so
1896 				 * we may need a number of destination slots greater
1897 				 * than the number of input slots ('needed').
1898 				 * We look for the smallest integer 'x' which satisfies
1899 				 *	needed * na->mfs + x * H <= x * dst_na->mfs,
1900 				 * i.e. x >= needed * na->mfs / (dst_na->mfs - H),
1901 				 * where 'H' is the length of the longest header that may
1902 				 * be replicated in the segmentation process (e.g. for
1903 				 * TCPv4: ethernet + IP + TCP headers).
1904 				 */
1905 				needed = (needed * na->mfs) /
1906 						(dst_na->mfs - WORST_CASE_GSO_HEADER) + 1;
1907 				ND(3, "srcmtu=%u, dstmtu=%u, x=%u", na->mfs, dst_na->mfs, needed);
1908 			}
1909 		}
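		/* Worked example for the bound above (numbers are purely
		 * illustrative): with needed = 10 oversized input slots,
		 * na->mfs = 65536, dst_na->mfs = 1514 and H = 54 bytes
		 * (14 ethernet + 20 IPv4 + 20 TCP), we reserve
		 *	10 * 65536 / (1514 - 54) + 1 = 449
		 * destination slots, i.e. about 45 MTU-sized segments
		 * for each input packet.
		 */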
1910 
1911 		ND(5, "pass 2 dst %d is %x %s",
1912 			i, d_i, is_vp ? "virtual" : "nic/host");
1913 		dst_nr = d_i & (NM_BDG_MAXRINGS-1);
1914 		nrings = dst_na->up.num_rx_rings;
1915 		if (dst_nr >= nrings)
1916 			dst_nr = dst_nr % nrings;
1917 		kring = &dst_na->up.rx_rings[dst_nr];
1918 		ring = kring->ring;
1919 		lim = kring->nkr_num_slots - 1;
1920 
1921 retry:
1922 
1923 		if (dst_na->retry && retry) {
1924 			/* try to get some free slot from the previous run */
1925 			kring->nm_notify(kring, 0);
1926 			/* actually useful only for bwraps, since there
1927 			 * the notify will trigger a txsync on the hwna. VALE ports
1928 			 * have dst_na->retry == 0
1929 			 */
1930 		}
1931 		/* reserve the buffers in the queue and an entry
1932 		 * to report completion, and drop lock.
1933 		 * XXX this might become a helper function.
1934 		 */
1935 		mtx_lock(&kring->q_lock);
1936 		if (kring->nkr_stopped) {
1937 			mtx_unlock(&kring->q_lock);
1938 			goto cleanup;
1939 		}
1940 		my_start = j = kring->nkr_hwlease;
1941 		howmany = nm_kr_space(kring, 1);
1942 		if (needed < howmany)
1943 			howmany = needed;
1944 		lease_idx = nm_kr_lease(kring, howmany, 1);
1945 		mtx_unlock(&kring->q_lock);
1946 
1947 		/* only retry if we need more than available slots */
1948 		if (retry && needed <= howmany)
1949 			retry = 0;
1950 
1951 		/* copy to the destination queue */
1952 		while (howmany > 0) {
1953 			struct netmap_slot *slot;
1954 			struct nm_bdg_fwd *ft_p, *ft_end;
1955 			u_int cnt;
1956 
1957 			/* find the queue from which we pick next packet.
1958 			 * NM_FT_NULL is always higher than valid indexes
1959 			 * so we never dereference it if the other list
1960 			 * has packets (and if both are empty we never
1961 			 * get here).
1962 			 */
1963 			if (next < brd_next) {
1964 				ft_p = ft + next;
1965 				next = ft_p->ft_next;
1966 			} else { /* insert broadcast */
1967 				ft_p = ft + brd_next;
1968 				brd_next = ft_p->ft_next;
1969 			}
1970 			cnt = ft_p->ft_frags; // cnt > 0
1971 			if (unlikely(cnt > howmany))
1972 			    break; /* no more space */
1973 			if (netmap_verbose && cnt > 1)
1974 				RD(5, "rx %d frags to %d", cnt, j);
1975 			ft_end = ft_p + cnt;
1976 			if (unlikely(virt_hdr_mismatch)) {
1977 				bdg_mismatch_datapath(na, dst_na, ft_p, ring, &j, lim, &howmany);
1978 			} else {
1979 				howmany -= cnt;
1980 				do {
1981 					char *dst, *src = ft_p->ft_buf;
1982 					size_t copy_len = ft_p->ft_len, dst_len = copy_len;
1983 
1984 					slot = &ring->slot[j];
1985 					dst = NMB(&dst_na->up, slot);
1986 
1987 					ND("send [%d] %d(%d) bytes at %s:%d",
1988 							i, (int)copy_len, (int)dst_len,
1989 							NM_IFPNAME(dst_ifp), j);
1990 					/* round to a multiple of 64 */
1991 					copy_len = (copy_len + 63) & ~63;
1992 
1993 					if (unlikely(copy_len > NETMAP_BUF_SIZE(&dst_na->up) ||
1994 						     copy_len > NETMAP_BUF_SIZE(&na->up))) {
1995 						RD(5, "invalid len %d, down to 64", (int)copy_len);
1996 						copy_len = dst_len = 64; // XXX
1997 					}
1998 					if (ft_p->ft_flags & NS_INDIRECT) {
1999 						if (copyin(src, dst, copy_len)) {
2000 							// invalid user pointer, pretend len is 0
2001 							dst_len = 0;
2002 						}
2003 					} else {
2004 						//memcpy(dst, src, copy_len);
2005 						pkt_copy(src, dst, (int)copy_len);
2006 					}
2007 					slot->len = dst_len;
2008 					slot->flags = (cnt << 8)| NS_MOREFRAG;
2009 					j = nm_next(j, lim);
2010 					needed--;
2011 					ft_p++;
2012 				} while (ft_p != ft_end);
2013 				slot->flags = (cnt << 8); /* clear NS_MOREFRAG on the last fragment */
2014 			}
2015 			/* are we done ? */
2016 			if (next == NM_FT_NULL && brd_next == NM_FT_NULL)
2017 				break;
2018 		}
2019 		{
2020 		    /* current position */
2021 		    uint32_t *p = kring->nkr_leases; /* shorthand */
2022 		    uint32_t update_pos;
2023 		    int still_locked = 1;
2024 
2025 		    mtx_lock(&kring->q_lock);
2026 		    if (unlikely(howmany > 0)) {
2027 			/* We have not used all of the buffers. If we hold
2028 			 * the last lease we can recover the slots; otherwise
2029 			 * we must mark them as empty packets (len = 0).
2030 			 */
2031 			ND("leftover %d bufs", howmany);
2032 			if (nm_next(lease_idx, lim) == kring->nkr_lease_idx) {
2033 			    /* yes i am the last one */
2034 			    ND("roll back nkr_hwlease to %d", j);
2035 			    kring->nkr_hwlease = j;
2036 			} else {
2037 			    while (howmany-- > 0) {
2038 				ring->slot[j].len = 0;
2039 				ring->slot[j].flags = 0;
2040 				j = nm_next(j, lim);
2041 			    }
2042 			}
2043 		    }
2044 		    p[lease_idx] = j; /* report I am done */
2045 
2046 		    update_pos = kring->nr_hwtail;
2047 
2048 		    if (my_start == update_pos) {
2049 			/* all slots before my_start have been reported,
2050 			 * so scan subsequent leases to see if other ranges
2051 			 * have been completed, and do a selwakeup or txsync.
2052 			 */
2053 			while (lease_idx != kring->nkr_lease_idx &&
2054 				p[lease_idx] != NR_NOSLOT) {
2055 			    j = p[lease_idx];
2056 			    p[lease_idx] = NR_NOSLOT;
2057 			    lease_idx = nm_next(lease_idx, lim);
2058 			}
2059 			/* j is the new 'write' position. j != my_start
2060 			 * means there are new buffers to report
2061 			 */
2062 			if (likely(j != my_start)) {
2063 				kring->nr_hwtail = j;
2064 				still_locked = 0;
2065 				mtx_unlock(&kring->q_lock);
2066 				kring->nm_notify(kring, 0);
2067 				/* this is netmap_notify for VALE ports and
2068 				 * netmap_bwrap_notify for bwrap. The latter will
2069 				 * trigger a txsync on the underlying hwna
2070 				 */
2071 				if (dst_na->retry && retry--) {
2072 					/* XXX this is going to call nm_notify again.
2073 					 * Only useful for bwrap in virtual machines
2074 					 */
2075 					goto retry;
2076 				}
2077 			}
2078 		    }
2079 		    if (still_locked)
2080 			mtx_unlock(&kring->q_lock);
2081 		}
2082 cleanup:
2083 		d->bq_head = d->bq_tail = NM_FT_NULL; /* cleanup */
2084 		d->bq_len = 0;
2085 	}
2086 	brddst->bq_head = brddst->bq_tail = NM_FT_NULL; /* cleanup */
2087 	brddst->bq_len = 0;
2088 	return 0;
2089 }
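/*
 * The lease protocol used by nm_bdg_flush() above, as a minimal sketch
 * (not compiled; nm_kr_space()/nm_kr_lease() are the helpers used in
 * the code above):
 *
 *	mtx_lock(&kring->q_lock);
 *	start = kring->nkr_hwlease;		// first slot we own
 *	avail = nm_kr_space(kring, 1);		// slots we may claim
 *	lease = nm_kr_lease(kring, want, 1);	// claim 'want' of them
 *	mtx_unlock(&kring->q_lock);
 *
 *	... copy packets into the claimed slots, lock dropped ...
 *
 *	mtx_lock(&kring->q_lock);
 *	kring->nkr_leases[lease] = next;	// report completion
 *	// if every lease before ours is complete, advance nr_hwtail
 *	// past all completed ranges and call kring->nm_notify()
 *	mtx_unlock(&kring->q_lock);
 */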
2090 
2091 /* nm_txsync callback for VALE ports */
2092 static int
2093 netmap_vp_txsync(struct netmap_kring *kring, int flags)
2094 {
2095 	struct netmap_vp_adapter *na =
2096 		(struct netmap_vp_adapter *)kring->na;
2097 	u_int done;
2098 	u_int const lim = kring->nkr_num_slots - 1;
2099 	u_int const head = kring->rhead;
2100 
2101 	if (bridge_batch <= 0) { /* testing only */
2102 		done = head; // used all
2103 		goto done;
2104 	}
2105 	if (!na->na_bdg) {
2106 		done = head;
2107 		goto done;
2108 	}
2109 	if (bridge_batch > NM_BDG_BATCH)
2110 		bridge_batch = NM_BDG_BATCH;
2111 
2112 	done = nm_bdg_preflush(kring, head);
2113 done:
2114 	if (done != head)
2115 		D("early break at %d/ %d, tail %d", done, head, kring->nr_hwtail);
2116 	/*
2117 	 * packets between 'done' and 'head' are left unsent.
2118 	 */
2119 	kring->nr_hwcur = done;
2120 	kring->nr_hwtail = nm_prev(done, lim);
2121 	if (netmap_verbose)
2122 		D("%s ring %d flags %d", na->up.name, kring->ring_id, flags);
2123 	return 0;
2124 }
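/*
 * Note on the final ring state (illustrative): a VALE port forwards
 * synchronously, so after a complete flush every slot has been
 * consumed and the whole ring is writable again, which is exactly
 * what the update above encodes:
 *
 *	kring->nr_hwcur  = done;		// all sent up to 'done'
 *	kring->nr_hwtail = nm_prev(done, lim);	// every slot free again
 */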
2125 
2126 
2127 /* rxsync code used by the VALE ports' nm_rxsync callback and also
2128  * internally by the bwrap
2129  */
2130 static int
2131 netmap_vp_rxsync_locked(struct netmap_kring *kring, int flags)
2132 {
2133 	struct netmap_adapter *na = kring->na;
2134 	struct netmap_ring *ring = kring->ring;
2135 	u_int nm_i, lim = kring->nkr_num_slots - 1;
2136 	u_int head = kring->rhead;
2137 	int n;
2138 
2139 	if (head > lim) {
2140 		D("ouch dangerous reset!!!");
2141 		n = netmap_ring_reinit(kring);
2142 		goto done;
2143 	}
2144 
2145 	/* First part, import newly received packets. */
2146 	/* actually nothing to do here, they are already in the kring */
2147 
2148 	/* Second part, skip past packets that userspace has released. */
2149 	nm_i = kring->nr_hwcur;
2150 	if (nm_i != head) {
2151 		/* consistency check, but nothing really important here */
2152 		for (n = 0; likely(nm_i != head); n++) {
2153 			struct netmap_slot *slot = &ring->slot[nm_i];
2154 			void *addr = NMB(na, slot);
2155 
2156 			if (addr == NETMAP_BUF_BASE(kring->na)) { /* bad buf */
2157 				D("bad buffer index %d, ignore ?",
2158 					slot->buf_idx);
2159 			}
2160 			slot->flags &= ~NS_BUF_CHANGED;
2161 			nm_i = nm_next(nm_i, lim);
2162 		}
2163 		kring->nr_hwcur = head;
2164 	}
2165 
2166 	n = 0;
2167 done:
2168 	return n;
2169 }
2170 
2171 /*
2172  * nm_rxsync callback for VALE ports
2173  * user process reading from a VALE switch.
2174  * Already protected against concurrent calls from userspace,
2175  * but we must acquire the queue's lock to protect against
2176  * writers on the same queue.
2177  */
2178 static int
2179 netmap_vp_rxsync(struct netmap_kring *kring, int flags)
2180 {
2181 	int n;
2182 
2183 	mtx_lock(&kring->q_lock);
2184 	n = netmap_vp_rxsync_locked(kring, flags);
2185 	mtx_unlock(&kring->q_lock);
2186 	return n;
2187 }
2188 
2189 
2190 /* nm_bdg_attach callback for VALE ports
2191  * The na_vp port is this same netmap_adapter. There is no host port.
2192  */
2193 static int
2194 netmap_vp_bdg_attach(const char *name, struct netmap_adapter *na)
2195 {
2196 	struct netmap_vp_adapter *vpna = (struct netmap_vp_adapter *)na;
2197 
2198 	if (vpna->na_bdg)
2199 		return EBUSY;
2200 	na->na_vp = vpna;
2201 	strncpy(na->name, name, sizeof(na->name));
2202 	na->na_hostvp = NULL;
2203 	return 0;
2204 }
2205 
2206 /* create a netmap_vp_adapter that describes a VALE port.
2207  * Only persistent VALE ports have a non-null ifp.
2208  */
2209 static int
2210 netmap_vp_create(struct nmreq *nmr, struct ifnet *ifp,
2211 		struct netmap_mem_d *nmd,
2212 		struct netmap_vp_adapter **ret)
2213 {
2214 	struct netmap_vp_adapter *vpna;
2215 	struct netmap_adapter *na;
2216 	int error = 0;
2217 	u_int npipes = 0;
2218 
2219 	vpna = nm_os_malloc(sizeof(*vpna));
2220 	if (vpna == NULL)
2221 		return ENOMEM;
2222 
2223 	na = &vpna->up;
2224 
2225 	na->ifp = ifp;
2226 	strncpy(na->name, nmr->nr_name, sizeof(na->name));
2227 
2228 	/* bound checking */
2229 	na->num_tx_rings = nmr->nr_tx_rings;
2230 	nm_bound_var(&na->num_tx_rings, 1, 1, NM_BDG_MAXRINGS, NULL);
2231 	nmr->nr_tx_rings = na->num_tx_rings; // write back
2232 	na->num_rx_rings = nmr->nr_rx_rings;
2233 	nm_bound_var(&na->num_rx_rings, 1, 1, NM_BDG_MAXRINGS, NULL);
2234 	nmr->nr_rx_rings = na->num_rx_rings; // write back
2235 	nm_bound_var(&nmr->nr_tx_slots, NM_BRIDGE_RINGSIZE,
2236 			1, NM_BDG_MAXSLOTS, NULL);
2237 	na->num_tx_desc = nmr->nr_tx_slots;
2238 	nm_bound_var(&nmr->nr_rx_slots, NM_BRIDGE_RINGSIZE,
2239 			1, NM_BDG_MAXSLOTS, NULL);
2240 	/* Validate the number of pipes: we want at least 1, and a few
2241 	 * more are usually useful, so use 2 as the default when 0 is
2242 	 * supplied.
2243 	 */
2244 	npipes = nmr->nr_arg1;
2245 	nm_bound_var(&npipes, 2, 1, NM_MAXPIPES, NULL);
2246 	nmr->nr_arg1 = npipes;	/* write back */
2247 	/* validate extra bufs */
2248 	nm_bound_var(&nmr->nr_arg3, 0, 0,
2249 			128*NM_BDG_MAXSLOTS, NULL);
2250 	na->num_rx_desc = nmr->nr_rx_slots;
2251 	vpna->mfs = 1514;
2252 	vpna->last_smac = ~0llu;
2253 	/*if (vpna->mfs > netmap_buf_size)  TODO netmap_buf_size is zero??
2254 		vpna->mfs = netmap_buf_size; */
2255 	if (netmap_verbose)
2256 		D("max frame size %u", vpna->mfs);
2257 
2258 	na->na_flags |= NAF_BDG_MAYSLEEP;
2259 	/* persistent VALE ports look like hw devices
2260 	 * with a native netmap adapter
2261 	 */
2262 	if (ifp)
2263 		na->na_flags |= NAF_NATIVE;
2264 	na->nm_txsync = netmap_vp_txsync;
2265 	na->nm_rxsync = netmap_vp_rxsync;
2266 	na->nm_register = netmap_vp_reg;
2267 	na->nm_krings_create = netmap_vp_krings_create;
2268 	na->nm_krings_delete = netmap_vp_krings_delete;
2269 	na->nm_dtor = netmap_vp_dtor;
2270 	D("nr_arg2 %d", nmr->nr_arg2);
2271 	na->nm_mem = nmd ?
2272 		netmap_mem_get(nmd):
2273 		netmap_mem_private_new(
2274 			na->num_tx_rings, na->num_tx_desc,
2275 			na->num_rx_rings, na->num_rx_desc,
2276 			nmr->nr_arg3, npipes, &error);
2277 	if (na->nm_mem == NULL)
2278 		goto err;
2279 	na->nm_bdg_attach = netmap_vp_bdg_attach;
2280 	/* other nmd fields are set in the common routine */
2281 	error = netmap_attach_common(na);
2282 	if (error)
2283 		goto err;
2284 	*ret = vpna;
2285 	return 0;
2286 
2287 err:
2288 	if (na->nm_mem != NULL)
2289 		netmap_mem_put(na->nm_mem);
2290 	nm_os_free(vpna);
2291 	return error;
2292 }
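/*
 * Minimal userspace sketch of how the path above is triggered (error
 * handling omitted; "vale0:p0" is an arbitrary switch:port name).
 * Registering a vale name on /dev/netmap creates the port on first
 * use:
 *
 *	struct nmreq req;
 *	int fd = open("/dev/netmap", O_RDWR);
 *
 *	bzero(&req, sizeof(req));
 *	strlcpy(req.nr_name, "vale0:p0", sizeof(req.nr_name));
 *	req.nr_version = NETMAP_API;
 *	ioctl(fd, NIOCREGIF, &req);
 *	mmap(NULL, req.nr_memsize, PROT_READ | PROT_WRITE,
 *	    MAP_SHARED, fd, 0);		// map rings and buffers
 */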
2293 
2294 /* Bridge wrapper code (bwrap).
2295  * This is used to connect a non-VALE-port netmap_adapter (hwna) to a
2296  * VALE switch.
2297  * The main task is to swap the meaning of tx and rx rings to match the
2298  * expectations of the VALE switch code (see nm_bdg_flush).
2299  *
2300  * The bwrap works by interposing a netmap_bwrap_adapter between the
2301  * rest of the system and the hwna. The netmap_bwrap_adapter looks like
2302  * a netmap_vp_adapter to the rest the system, but, internally, it
2303  * a netmap_vp_adapter to the rest of the system, but, internally, it
2304  *
2305  * Note that we have to intercept callbacks coming from two sides:
2306  *
2307  *  - callbacks coming from the netmap module are intercepted by
2308  *    passing around the netmap_bwrap_adapter instead of the hwna
2309  *
2310  *  - callbacks coming from outside of the netmap module only know
2311  *    about the hwna. This, however, only happens in interrupt
2312  *    handlers, where only the hwna->nm_notify callback is called.
2313  *    What the bwrap does is to overwrite the hwna->nm_notify callback
2314  *    with its own netmap_bwrap_intr_notify.
2315  *    XXX This assumes that the hwna->nm_notify callback was the
2316  *    standard netmap_notify(), as it is the case for nic adapters.
2317  *    Any additional action performed by hwna->nm_notify will not be
2318  *    performed by netmap_bwrap_intr_notify.
2319  *
2320  * Additionally, the bwrap can optionally attach the host rings pair
2321  * of the wrapped adapter to a different port of the switch.
2322  */
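/*
 * The interrupt-side interception described above reduces to this
 * save/overwrite/restore pattern (a sketch of what netmap_bwrap_reg()
 * does below):
 *
 *	// on register:
 *	kring->save_notify = kring->nm_notify;
 *	kring->nm_notify = netmap_bwrap_intr_notify;
 *	// on unregister:
 *	kring->nm_notify = kring->save_notify;
 *	kring->save_notify = NULL;
 */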
2323 
2324 
2325 static void
2326 netmap_bwrap_dtor(struct netmap_adapter *na)
2327 {
2328 	struct netmap_bwrap_adapter *bna = (struct netmap_bwrap_adapter*)na;
2329 	struct netmap_adapter *hwna = bna->hwna;
2330 	struct nm_bridge *b = bna->up.na_bdg,
2331 		*bh = bna->host.na_bdg;
2332 
2333 	netmap_mem_put(bna->host.up.nm_mem);
2334 
2335 	if (b) {
2336 		netmap_bdg_detach_common(b, bna->up.bdg_port,
2337 			    (bh ? bna->host.bdg_port : -1));
2338 	}
2339 
2340 	ND("na %p", na);
2341 	na->ifp = NULL;
2342 	bna->host.up.ifp = NULL;
2343 	hwna->na_private = NULL;
2344 	hwna->na_vp = hwna->na_hostvp = NULL;
2345 	hwna->na_flags &= ~NAF_BUSY;
2346 	netmap_adapter_put(hwna);
2347 
2348 }
2349 
2350 
2351 /*
2352  * Intr callback for NICs connected to a bridge.
2353  * Simply ignore tx interrupts (maybe we could try to recover space ?)
2354  * and pass received packets from nic to the bridge.
2355  *
2356  * XXX TODO check locking: this is called from the interrupt
2357  * handler so we should make sure that the interface is not
2358  * disconnected while passing down an interrupt.
2359  *
2360  * Note, no user process can access this NIC or the host stack.
2361  * The only significant parts of the ring are the slots,
2362  * and head/cur/tail are set from the kring as needed
2363  * (part as a receive ring, part as a transmit ring).
2364  *
2365  * This is the callback that overwrites the hwna notify callback.
2366  * Packets come from the outside or from the host stack and are put
2367  * on a hwna rx ring.
2368  * The bridge wrapper then sends the packets through the bridge.
2369  */
2370 static int
2371 netmap_bwrap_intr_notify(struct netmap_kring *kring, int flags)
2372 {
2373 	struct netmap_adapter *na = kring->na;
2374 	struct netmap_bwrap_adapter *bna = na->na_private;
2375 	struct netmap_kring *bkring;
2376 	struct netmap_vp_adapter *vpna = &bna->up;
2377 	u_int ring_nr = kring->ring_id;
2378 	int ret = NM_IRQ_COMPLETED;
2379 	int error;
2380 
2381 	if (netmap_verbose)
2382 	    D("%s %s 0x%x", na->name, kring->name, flags);
2383 
2384 	bkring = &vpna->up.tx_rings[ring_nr];
2385 
2386 	/* make sure the ring is not disabled */
2387 	if (nm_kr_tryget(kring, 0 /* can't sleep */, NULL)) {
2388 		return EIO;
2389 	}
2390 
2391 	if (netmap_verbose)
2392 	    D("%s head %d cur %d tail %d",  na->name,
2393 		kring->rhead, kring->rcur, kring->rtail);
2394 
2395 	/* simulate a user wakeup on the rx ring
2396 	 * to fetch packets that have arrived.
2397 	 */
2398 	error = kring->nm_sync(kring, 0);
2399 	if (error)
2400 		goto put_out;
2401 	if (kring->nr_hwcur == kring->nr_hwtail) {
2402 		if (netmap_verbose)
2403 			D("how strange, interrupt with no packets on %s",
2404 			    na->name);
2405 		goto put_out;
2406 	}
2407 
2408 	/* new packets are kring->rcur to kring->nr_hwtail, and the bkring
2409 	 * had hwcur == bkring->rhead. So advance bkring->rhead to kring->nr_hwtail
2410 	 * to push all packets out.
2411 	 */
2412 	bkring->rhead = bkring->rcur = kring->nr_hwtail;
2413 
2414 	netmap_vp_txsync(bkring, flags);
2415 
2416 	/* mark all buffers as released on this ring */
2417 	kring->rhead = kring->rcur = kring->rtail = kring->nr_hwtail;
2418 	/* another call to actually release the buffers */
2419 	error = kring->nm_sync(kring, 0);
2420 
2421 	/* The second rxsync may have further advanced hwtail. If this happens,
2422 	 *  return NM_IRQ_RESCHED, otherwise just return NM_IRQ_COMPLETED. */
2423 	if (kring->rcur != kring->nr_hwtail) {
2424 		ret = NM_IRQ_RESCHED;
2425 	}
2426 put_out:
2427 	nm_kr_put(kring);
2428 
2429 	return error ? error : ret;
2430 }
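/*
 * Data path summary for the function above (sketch):
 *
 *	NIC rx interrupt -> netmap_bwrap_intr_notify()
 *	    kring->nm_sync()		// rxsync: collect new packets
 *	    netmap_vp_txsync(bkring)	// push them through the switch
 *	    kring->nm_sync()		// rxsync: give the buffers back
 */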
2431 
2432 
2433 /* nm_register callback for bwrap */
2434 static int
2435 netmap_bwrap_reg(struct netmap_adapter *na, int onoff)
2436 {
2437 	struct netmap_bwrap_adapter *bna =
2438 		(struct netmap_bwrap_adapter *)na;
2439 	struct netmap_adapter *hwna = bna->hwna;
2440 	struct netmap_vp_adapter *hostna = &bna->host;
2441 	int error, i;
2442 	enum txrx t;
2443 
2444 	ND("%s %s", na->name, onoff ? "on" : "off");
2445 
2446 	if (onoff) {
2447 		/* netmap_do_regif has been called on the bwrap na.
2448 		 * We need to pass the information about the
2449 		 * memory allocator down to the hwna before
2450 		 * putting it in netmap mode
2451 		 */
2452 		hwna->na_lut = na->na_lut;
2453 
2454 		if (hostna->na_bdg) {
2455 			/* if the host rings have been attached to the switch,
2456 			 * we need to copy the memory allocator information
2457 			 * in the hostna also
2458 			 */
2459 			hostna->up.na_lut = na->na_lut;
2460 		}
2461 
2462 		/* cross-link the netmap rings.
2463 		 * The original number of rings comes from hwna;
2464 		 * the rx rings on one side map to the tx rings on the other.
2465 		 */
2466 		for_rx_tx(t) {
2467 			enum txrx r = nm_txrx_swap(t); /* swap NR_TX <-> NR_RX */
2468 			for (i = 0; i < nma_get_nrings(hwna, r) + 1; i++) {
2469 				NMR(hwna, r)[i].ring = NMR(na, t)[i].ring;
2470 			}
2471 		}
2472 
2473 		if (na->na_flags & NAF_HOST_RINGS) {
2474 			struct netmap_adapter *hna = &hostna->up;
2475 			/* the hostna rings are the host rings of the bwrap.
2476 			 * The corresponding krings must point back to the
2477 			 * hostna
2478 			 */
2479 			hna->tx_rings = &na->tx_rings[na->num_tx_rings];
2480 			hna->tx_rings[0].na = hna;
2481 			hna->rx_rings = &na->rx_rings[na->num_rx_rings];
2482 			hna->rx_rings[0].na = hna;
2483 		}
2484 	}
2485 
2486 	/* pass down the pending ring state information */
2487 	for_rx_tx(t) {
2488 		for (i = 0; i < nma_get_nrings(na, t) + 1; i++)
2489 			NMR(hwna, t)[i].nr_pending_mode =
2490 				NMR(na, t)[i].nr_pending_mode;
2491 	}
2492 
2493 	/* forward the request to the hwna */
2494 	error = hwna->nm_register(hwna, onoff);
2495 	if (error)
2496 		return error;
2497 
2498 	/* copy up the current ring state information */
2499 	for_rx_tx(t) {
2500 		for (i = 0; i < nma_get_nrings(na, t) + 1; i++)
2501 			NMR(na, t)[i].nr_mode =
2502 				NMR(hwna, t)[i].nr_mode;
2503 	}
2504 
2505 	/* impersonate a netmap_vp_adapter */
2506 	netmap_vp_reg(na, onoff);
2507 	if (hostna->na_bdg)
2508 		netmap_vp_reg(&hostna->up, onoff);
2509 
2510 	if (onoff) {
2511 		u_int i;
2512 		/* intercept the hwna nm_notify callback on the hw rings */
2513 		for (i = 0; i < hwna->num_rx_rings; i++) {
2514 			hwna->rx_rings[i].save_notify = hwna->rx_rings[i].nm_notify;
2515 			hwna->rx_rings[i].nm_notify = netmap_bwrap_intr_notify;
2516 		}
2517 		i = hwna->num_rx_rings; /* for safety */
2518 		/* save the host ring notify unconditionally */
2519 		hwna->rx_rings[i].save_notify = hwna->rx_rings[i].nm_notify;
2520 		if (hostna->na_bdg) {
2521 			/* also intercept the host ring notify */
2522 			hwna->rx_rings[i].nm_notify = netmap_bwrap_intr_notify;
2523 		}
2524 		if (na->active_fds == 0)
2525 			na->na_flags |= NAF_NETMAP_ON;
2526 	} else {
2527 		u_int i;
2528 
2529 		if (na->active_fds == 0)
2530 			na->na_flags &= ~NAF_NETMAP_ON;
2531 
2532 		/* reset all notify callbacks (including host ring) */
2533 		for (i = 0; i <= hwna->num_rx_rings; i++) {
2534 			hwna->rx_rings[i].nm_notify = hwna->rx_rings[i].save_notify;
2535 			hwna->rx_rings[i].save_notify = NULL;
2536 		}
2537 		hwna->na_lut.lut = NULL;
2538 		hwna->na_lut.objtotal = 0;
2539 		hwna->na_lut.objsize = 0;
2540 	}
2541 
2542 	return 0;
2543 }
2544 
2545 /* nm_config callback for bwrap */
2546 static int
2547 netmap_bwrap_config(struct netmap_adapter *na, u_int *txr, u_int *txd,
2548 				    u_int *rxr, u_int *rxd)
2549 {
2550 	struct netmap_bwrap_adapter *bna =
2551 		(struct netmap_bwrap_adapter *)na;
2552 	struct netmap_adapter *hwna = bna->hwna;
2553 
2554 	/* forward the request */
2555 	netmap_update_config(hwna);
2556 	/* swap the results */
2557 	*txr = hwna->num_rx_rings;
2558 	*txd = hwna->num_rx_desc;
2559 	*rxr = hwna->num_tx_rings;
2560 	*rxd = hwna->num_tx_desc;	/* swap: the bwrap rx side is the hwna tx side */
2561 
2562 	return 0;
2563 }
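/*
 * Example of the swap above (illustrative numbers): a NIC with 4 tx
 * and 2 rx rings is exposed to the switch as a port with 2 tx and
 * 4 rx rings, because frames the NIC receives enter the switch
 * through the port's tx side, while frames the switch delivers to
 * the port land on rings backed by the NIC's tx side.
 */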
2564 
2565 
2566 /* nm_krings_create callback for bwrap */
2567 static int
2568 netmap_bwrap_krings_create(struct netmap_adapter *na)
2569 {
2570 	struct netmap_bwrap_adapter *bna =
2571 		(struct netmap_bwrap_adapter *)na;
2572 	struct netmap_adapter *hwna = bna->hwna;
2573 	int i, error = 0;
2574 	enum txrx t;
2575 
2576 	ND("%s", na->name);
2577 
2578 	/* impersonate a netmap_vp_adapter */
2579 	error = netmap_vp_krings_create(na);
2580 	if (error)
2581 		return error;
2582 
2583 	/* also create the hwna krings */
2584 	error = hwna->nm_krings_create(hwna);
2585 	if (error) {
2586 		goto err_del_vp_rings;
2587 	}
2588 
2589 	/* get each ring slot number from the corresponding hwna ring */
2590 	for_rx_tx(t) {
2591 		enum txrx r = nm_txrx_swap(t); /* swap NR_TX <-> NR_RX */
2592 		for (i = 0; i < nma_get_nrings(hwna, r) + 1; i++) {
2593 			NMR(na, t)[i].nkr_num_slots = NMR(hwna, r)[i].nkr_num_slots;
2594 		}
2595 	}
2596 
2597 	return 0;
2598 
2599 err_del_vp_rings:
2600 	netmap_vp_krings_delete(na);
2601 
2602 	return error;
2603 }
2604 
2605 
2606 static void
2607 netmap_bwrap_krings_delete(struct netmap_adapter *na)
2608 {
2609 	struct netmap_bwrap_adapter *bna =
2610 		(struct netmap_bwrap_adapter *)na;
2611 	struct netmap_adapter *hwna = bna->hwna;
2612 
2613 	ND("%s", na->name);
2614 
2615 	hwna->nm_krings_delete(hwna);
2616 	netmap_vp_krings_delete(na);
2617 }
2618 
2619 
2620 /* notify method for the bridge-->hwna direction */
2621 static int
2622 netmap_bwrap_notify(struct netmap_kring *kring, int flags)
2623 {
2624 	struct netmap_adapter *na = kring->na;
2625 	struct netmap_bwrap_adapter *bna = na->na_private;
2626 	struct netmap_adapter *hwna = bna->hwna;
2627 	u_int ring_n = kring->ring_id;
2628 	u_int lim = kring->nkr_num_slots - 1;
2629 	struct netmap_kring *hw_kring;
2630 	int error;
2631 
2632 	ND("%s: na %s hwna %s",
2633 			(kring ? kring->name : "NULL!"),
2634 			(na ? na->name : "NULL!"),
2635 			(hwna ? hwna->name : "NULL!"));
2636 	hw_kring = &hwna->tx_rings[ring_n];
2637 
2638 	if (nm_kr_tryget(hw_kring, 0, NULL)) {
2639 		return ENXIO;
2640 	}
2641 
2642 	/* first step: simulate a user wakeup on the rx ring */
2643 	netmap_vp_rxsync(kring, flags);
2644 	ND("%s[%d] PRE rx(c%3d t%3d l%3d) ring(h%3d c%3d t%3d) tx(c%3d ht%3d t%3d)",
2645 		na->name, ring_n,
2646 		kring->nr_hwcur, kring->nr_hwtail, kring->nkr_hwlease,
2647 		ring->head, ring->cur, ring->tail,
2648 		hw_kring->nr_hwcur, hw_kring->nr_hwtail, hw_kring->rtail);
2649 	/* second step: the new packets are sent on the tx ring
2650 	 * (which is actually the same ring)
2651 	 */
2652 	hw_kring->rhead = hw_kring->rcur = kring->nr_hwtail;
2653 	error = hw_kring->nm_sync(hw_kring, flags);
2654 	if (error)
2655 		goto put_out;
2656 
2657 	/* third step: now we are back on the rx ring */
2658 	/* claim ownership on all hw owned bufs */
2659 	kring->rhead = kring->rcur = nm_next(hw_kring->nr_hwtail, lim); /* skip past reserved slot */
2660 
2661 	/* fourth step: the user goes to sleep again, causing another rxsync */
2662 	netmap_vp_rxsync(kring, flags);
2663 	ND("%s[%d] PST rx(c%3d t%3d l%3d) ring(h%3d c%3d t%3d) tx(c%3d ht%3d t%3d)",
2664 		na->name, ring_n,
2665 		kring->nr_hwcur, kring->nr_hwtail, kring->nkr_hwlease,
2666 		ring->head, ring->cur, ring->tail,
2667 		hw_kring->nr_hwcur, hw_kring->nr_hwtail, hw_kring->rtail);
2668 put_out:
2669 	nm_kr_put(hw_kring);
2670 
2671 	return error ? error : NM_IRQ_COMPLETED;
2672 }
2673 
2674 
2675 /* nm_bdg_ctl callback for the bwrap.
2676  * Called on bridge-attach and detach, as an effect of vale-ctl -[ahd].
2677  * On attach, it needs to provide a fake netmap_priv_d structure and
2678  * perform a netmap_do_regif() on the bwrap. This will put both the
2679  * bwrap and the hwna in netmap mode, with the netmap rings shared
2680  * and cross linked. Moroever, it will start intercepting interrupts
2681  * and cross linked. Moreover, it will start intercepting interrupts
2682  */
2683 static int
2684 netmap_bwrap_bdg_ctl(struct netmap_adapter *na, struct nmreq *nmr, int attach)
2685 {
2686 	struct netmap_priv_d *npriv;
2687 	struct netmap_bwrap_adapter *bna = (struct netmap_bwrap_adapter*)na;
2688 	int error = 0;
2689 
2690 	if (attach) {
2691 		if (NETMAP_OWNED_BY_ANY(na)) {
2692 			return EBUSY;
2693 		}
2694 		if (bna->na_kpriv) {
2695 			/* nothing to do */
2696 			return 0;
2697 		}
2698 		npriv = netmap_priv_new();
2699 		if (npriv == NULL)
2700 			return ENOMEM;
2701 		npriv->np_ifp = na->ifp; /* let the priv destructor release the ref */
2702 		error = netmap_do_regif(npriv, na, 0, NR_REG_NIC_SW);
2703 		if (error) {
2704 			netmap_priv_delete(npriv);
2705 			return error;
2706 		}
2707 		bna->na_kpriv = npriv;
2708 		na->na_flags |= NAF_BUSY;
2709 	} else {
2710 		if (na->active_fds == 0) /* not registered */
2711 			return EINVAL;
2712 		netmap_priv_delete(bna->na_kpriv);
2713 		bna->na_kpriv = NULL;
2714 		na->na_flags &= ~NAF_BUSY;
2715 	}
2716 	return error;
2717 
2718 }
2719 
2720 /* attach a bridge wrapper to the 'real' device */
2721 int
2722 netmap_bwrap_attach(const char *nr_name, struct netmap_adapter *hwna)
2723 {
2724 	struct netmap_bwrap_adapter *bna;
2725 	struct netmap_adapter *na = NULL;
2726 	struct netmap_adapter *hostna = NULL;
2727 	int error = 0;
2728 	enum txrx t;
2729 
2730 	/* make sure the NIC is not already in use */
2731 	if (NETMAP_OWNED_BY_ANY(hwna)) {
2732 		D("NIC %s busy, cannot attach to bridge", hwna->name);
2733 		return EBUSY;
2734 	}
2735 
2736 	bna = nm_os_malloc(sizeof(*bna));
2737 	if (bna == NULL) {
2738 		return ENOMEM;
2739 	}
2740 
2741 	na = &bna->up.up;
2742 	/* make bwrap ifp point to the real ifp */
2743 	na->ifp = hwna->ifp;
2744 	if_ref(na->ifp);
2745 	na->na_private = bna;
2746 	strncpy(na->name, nr_name, sizeof(na->name));
2747 	/* fill the ring data for the bwrap adapter with rx/tx meanings
2748 	 * swapped. The real cross-linking will be done during register,
2749 	 * when all the krings will have been created.
2750 	 */
2751 	for_rx_tx(t) {
2752 		enum txrx r = nm_txrx_swap(t); /* swap NR_TX <-> NR_RX */
2753 		nma_set_nrings(na, t, nma_get_nrings(hwna, r));
2754 		nma_set_ndesc(na, t, nma_get_ndesc(hwna, r));
2755 	}
2756 	na->nm_dtor = netmap_bwrap_dtor;
2757 	na->nm_register = netmap_bwrap_reg;
2758 	// na->nm_txsync = netmap_bwrap_txsync;
2759 	// na->nm_rxsync = netmap_bwrap_rxsync;
2760 	na->nm_config = netmap_bwrap_config;
2761 	na->nm_krings_create = netmap_bwrap_krings_create;
2762 	na->nm_krings_delete = netmap_bwrap_krings_delete;
2763 	na->nm_notify = netmap_bwrap_notify;
2764 	na->nm_bdg_ctl = netmap_bwrap_bdg_ctl;
2765 	na->pdev = hwna->pdev;
2766 	na->nm_mem = netmap_mem_get(hwna->nm_mem);
2767 	na->virt_hdr_len = hwna->virt_hdr_len;
2768 	bna->up.retry = 1; /* XXX maybe this should depend on the hwna */
2769 
2770 	bna->hwna = hwna;
2771 	netmap_adapter_get(hwna);
2772 	hwna->na_private = bna; /* weak reference */
2773 	hwna->na_vp = &bna->up;
2774 
2775 	if (hwna->na_flags & NAF_HOST_RINGS) {
2776 		if (hwna->na_flags & NAF_SW_ONLY)
2777 			na->na_flags |= NAF_SW_ONLY;
2778 		na->na_flags |= NAF_HOST_RINGS;
2779 		hostna = &bna->host.up;
2780 		snprintf(hostna->name, sizeof(hostna->name), "%s^", nr_name);
2781 		hostna->ifp = hwna->ifp;
2782 		for_rx_tx(t) {
2783 			enum txrx r = nm_txrx_swap(t);
2784 			nma_set_nrings(hostna, t, 1);
2785 			nma_set_ndesc(hostna, t, nma_get_ndesc(hwna, r));
2786 		}
2787 		// hostna->nm_txsync = netmap_bwrap_host_txsync;
2788 		// hostna->nm_rxsync = netmap_bwrap_host_rxsync;
2789 		hostna->nm_notify = netmap_bwrap_notify;
2790 		hostna->nm_mem = netmap_mem_get(na->nm_mem);
2791 		hostna->na_private = bna;
2792 		hostna->na_vp = &bna->up;
2793 		na->na_hostvp = hwna->na_hostvp =
2794 			hostna->na_hostvp = &bna->host;
2795 		hostna->na_flags = NAF_BUSY; /* prevent NIOCREGIF */
2796 	}
2797 
2798 	ND("%s<->%s txr %d txd %d rxr %d rxd %d",
2799 		na->name, ifp->if_xname,
2800 		na->num_tx_rings, na->num_tx_desc,
2801 		na->num_rx_rings, na->num_rx_desc);
2802 
2803 	error = netmap_attach_common(na);
2804 	if (error) {
2805 		goto err_free;
2806 	}
2807 	hwna->na_flags |= NAF_BUSY;
2808 	return 0;
2809 
2810 err_free:
2811 	hwna->na_vp = hwna->na_hostvp = NULL;
2812 	netmap_adapter_put(hwna);
2813 	nm_os_free(bna);
2814 	return error;
2815 
2816 }
2817 
2818 struct nm_bridge *
2819 netmap_init_bridges2(u_int n)
2820 {
2821 	int i;
2822 	struct nm_bridge *b;
2823 
2824 	b = nm_os_malloc(sizeof(struct nm_bridge) * n);
2825 	if (b == NULL)
2826 		return NULL;
2827 	for (i = 0; i < n; i++)
2828 		BDG_RWINIT(&b[i]);
2829 	return b;
2830 }
2831 
2832 void
2833 netmap_uninit_bridges2(struct nm_bridge *b, u_int n)
2834 {
2835 	int i;
2836 
2837 	if (b == NULL)
2838 		return;
2839 
2840 	for (i = 0; i < n; i++)
2841 		BDG_RWDESTROY(&b[i]);
2842 	nm_os_free(b);
2843 }
2844 
2845 int
2846 netmap_init_bridges(void)
2847 {
2848 #ifdef CONFIG_NET_NS
2849 	return netmap_bns_register();
2850 #else
2851 	nm_bridges = netmap_init_bridges2(NM_BRIDGES);
2852 	if (nm_bridges == NULL)
2853 		return ENOMEM;
2854 	return 0;
2855 #endif
2856 }
2857 
2858 void
2859 netmap_uninit_bridges(void)
2860 {
2861 #ifdef CONFIG_NET_NS
2862 	netmap_bns_unregister();
2863 #else
2864 	netmap_uninit_bridges2(nm_bridges, NM_BRIDGES);
2865 #endif
2866 }
2867 #endif /* WITH_VALE */
2868