xref: /freebsd/sys/dev/netmap/netmap_vale.c (revision f1951fd745b894fe6586c298874af98544a5e272)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (C) 2013-2016 Universita` di Pisa
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *   1. Redistributions of source code must retain the above copyright
11  *      notice, this list of conditions and the following disclaimer.
12  *   2. Redistributions in binary form must reproduce the above copyright
13  *      notice, this list of conditions and the following disclaimer in the
14  *      documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26  * SUCH DAMAGE.
27  */
28 
29 
30 /*
31  * This module implements the VALE switch for netmap
32 
33 --- VALE SWITCH ---
34 
35 NMG_LOCK() serializes all modifications to switches and ports.
36 A switch cannot be deleted until all ports are gone.
37 
38 For each switch, an SX lock (RWlock on linux) protects
39  * deletion of ports. When configuring a new port or deleting an existing one, the
40 lock is acquired in exclusive mode (after holding NMG_LOCK).
41 When forwarding, the lock is acquired in shared mode (without NMG_LOCK).
42 The lock is held throughout the entire forwarding cycle,
43  * during which the thread may incur a page fault.
44 Hence it is important that sleepable shared locks are used.
45 
46 On the rx ring, the per-port lock is grabbed initially to reserve
47 a number of slots in the ring, then the lock is released,
48 packets are copied from source to destination, and then
49 the lock is acquired again and the receive ring is updated.
50 (A similar thing is done on the tx ring for NIC and host stack
51 ports attached to the switch)
52 
53  */
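/*
 * Illustrative sketch only (not part of this file): the reserve/copy/update
 * cycle described above roughly follows the pattern below, using
 * nm_kr_lease() defined later in this file; the per-ring lock is shown
 * generically as lock()/unlock().
 *
 *	lock(dst_kring);				// per-ring lock
 *	lease_idx = nm_kr_lease(dst_kring, n, 1);	// reserve n rx slots
 *	unlock(dst_kring);
 *	// copy the packets into the reserved slots; this may sleep on a
 *	// page fault, hence the shared bridge lock must be sleepable
 *	lock(dst_kring);
 *	// complete the lease and advance nr_hwtail once all earlier
 *	// leases have been completed
 *	unlock(dst_kring);
 */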
54 
55 /*
56  * OS-specific code that is used only within this file.
57  * Other OS-specific code that must be accessed by drivers
58  * is present in netmap_kern.h
59  */
60 
61 #if defined(__FreeBSD__)
62 #include <sys/cdefs.h> /* prerequisite */
63 __FBSDID("$FreeBSD$");
64 
65 #include <sys/types.h>
66 #include <sys/errno.h>
67 #include <sys/param.h>	/* defines used in kernel.h */
68 #include <sys/kernel.h>	/* types used in module initialization */
69 #include <sys/conf.h>	/* cdevsw struct, UID, GID */
70 #include <sys/sockio.h>
71 #include <sys/socketvar.h>	/* struct socket */
72 #include <sys/malloc.h>
73 #include <sys/poll.h>
74 #include <sys/rwlock.h>
75 #include <sys/socket.h> /* sockaddrs */
76 #include <sys/selinfo.h>
77 #include <sys/sysctl.h>
78 #include <net/if.h>
79 #include <net/if_var.h>
80 #include <net/bpf.h>		/* BIOCIMMEDIATE */
81 #include <machine/bus.h>	/* bus_dmamap_* */
82 #include <sys/endian.h>
83 #include <sys/refcount.h>
84 
85 
86 #define BDG_RWLOCK_T		struct rwlock // struct rwlock
87 
88 #define	BDG_RWINIT(b)		\
89 	rw_init_flags(&(b)->bdg_lock, "bdg lock", RW_NOWITNESS)
90 #define BDG_WLOCK(b)		rw_wlock(&(b)->bdg_lock)
91 #define BDG_WUNLOCK(b)		rw_wunlock(&(b)->bdg_lock)
92 #define BDG_RLOCK(b)		rw_rlock(&(b)->bdg_lock)
93 #define BDG_RTRYLOCK(b)		rw_try_rlock(&(b)->bdg_lock)
94 #define BDG_RUNLOCK(b)		rw_runlock(&(b)->bdg_lock)
95 #define BDG_RWDESTROY(b)	rw_destroy(&(b)->bdg_lock)
96 
97 
98 #elif defined(linux)
99 
100 #include "bsd_glue.h"
101 
102 #elif defined(__APPLE__)
103 
104 #warning OSX support is only partial
105 #include "osx_glue.h"
106 
107 #elif defined(_WIN32)
108 #include "win_glue.h"
109 
110 #else
111 
112 #error	Unsupported platform
113 
114 #endif /* unsupported */
115 
116 /*
117  * common headers
118  */
119 
120 #include <net/netmap.h>
121 #include <dev/netmap/netmap_kern.h>
122 #include <dev/netmap/netmap_mem2.h>
123 
124 #ifdef WITH_VALE
125 
126 /*
127  * system parameters (most of them in netmap_kern.h)
128  * NM_BDG_NAME	prefix for switch port names, default "vale"
129  * NM_BDG_MAXPORTS	number of ports
130  * NM_BRIDGES	max number of switches in the system.
131  *	XXX should become a sysctl or tunable
132  *
133  * Switch ports are named valeX:Y where X is the switch name and Y
134  * is the port. If Y matches a physical interface name, the port is
135  * connected to a physical device.
136  *
137  * Unlike physical interfaces, switch ports use their own memory region
138  * for rings and buffers.
139  * The virtual interfaces use a per-queue lock instead of the core lock.
140  * In the tx loop, we aggregate traffic in batches to make all operations
141  * faster. The batch size is bridge_batch.
142  */
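/*
 * Example (user-space sketch, not part of this module): ports are created
 * simply by opening a name with the "vale" prefix, e.g. with nm_open()
 * from net/netmap_user.h:
 *
 *	#define NETMAP_WITH_LIBS
 *	#include <net/netmap_user.h>
 *
 *	struct nm_desc *d = nm_open("vale0:p1", NULL, 0, NULL);
 *	// "vale0" names the switch, "p1" an ephemeral virtual port;
 *	// "vale0:em0" would instead attach the physical interface em0.
 */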
143 #define NM_BDG_MAXRINGS		16	/* XXX unclear how many. */
144 #define NM_BDG_MAXSLOTS		4096	/* XXX same as above */
145 #define NM_BRIDGE_RINGSIZE	1024	/* in the device */
146 #define NM_BDG_HASH		1024	/* forwarding table entries */
147 #define NM_BDG_BATCH		1024	/* entries in the forwarding buffer */
148 #define NM_MULTISEG		64	/* max size of a chain of bufs */
149 /* actual size of the tables */
150 #define NM_BDG_BATCH_MAX	(NM_BDG_BATCH + NM_MULTISEG)
151 /* NM_FT_NULL terminates a list of slots in the ft */
152 #define NM_FT_NULL		NM_BDG_BATCH_MAX
153 /* Default size for the Maximum Frame Size. */
154 #define NM_BDG_MFS_DEFAULT	1514
155 
156 
157 /*
158  * bridge_batch is set via sysctl to the max batch size to be
159  * used in the bridge. The actual value may be larger as the
160  * last packet in the block may overflow the size.
161  */
162 static int bridge_batch = NM_BDG_BATCH; /* bridge batch size */
163 SYSBEGIN(vars_vale);
164 SYSCTL_DECL(_dev_netmap);
165 SYSCTL_INT(_dev_netmap, OID_AUTO, bridge_batch, CTLFLAG_RW, &bridge_batch, 0,
166 		"Max batch size to be used in the bridge");
167 SYSEND;
168 
169 static int netmap_vp_create(struct nmreq_header *hdr, struct ifnet *,
170 		struct netmap_mem_d *nmd, struct netmap_vp_adapter **);
171 static int netmap_vp_reg(struct netmap_adapter *na, int onoff);
172 static int netmap_bwrap_reg(struct netmap_adapter *, int onoff);
173 
174 /*
175  * For each output interface, nm_bdg_q is used to construct a list.
176  * bq_len is the number of output buffers (we can have coalescing
177  * during the copy).
178  */
179 struct nm_bdg_q {
180 	uint16_t bq_head;
181 	uint16_t bq_tail;
182 	uint32_t bq_len;	/* number of buffers */
183 };
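/*
 * Informative sketch (not part of this file): bq_head/bq_tail are indexes
 * into the ft[] array and, together with ft_next, form a singly linked
 * list terminated by NM_FT_NULL, e.g.
 *
 *	uint16_t i;
 *	for (i = q->bq_head; i != NM_FT_NULL; i = ft[i].ft_next) {
 *		// ft[i].ft_buf / ft[i].ft_len describe one buffer
 *	}
 */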
184 
185 /* XXX revise this */
186 struct nm_hash_ent {
187 	uint64_t	mac;	/* the top 2 bytes are the epoch */
188 	uint64_t	ports;
189 };
190 
191 /* Holds the default callbacks */
192 static struct netmap_bdg_ops default_bdg_ops = {netmap_bdg_learning, NULL, NULL};
193 
194 /*
195  * nm_bridge is a descriptor for a VALE switch.
196  * Interfaces for a bridge are all in bdg_ports[].
197  * The array has a fixed size; an empty entry does not terminate
198  * the search, but lookups only occur on attach/detach so we
199  * don't mind if they are slow.
200  *
201  * The bridge is non-blocking on the transmit ports: excess
202  * packets are dropped if there is no room on the output port.
203  *
204  * bdg_lock protects accesses to the bdg_ports array.
205  * This is a rw lock (or equivalent).
206  */
207 #define NM_BDG_IFNAMSIZ IFNAMSIZ
208 struct nm_bridge {
209 	/* XXX what is the proper alignment/layout ? */
210 	BDG_RWLOCK_T	bdg_lock;	/* protects bdg_ports */
211 	int		bdg_namelen;
212 	uint32_t	bdg_active_ports;
213 	char		bdg_basename[NM_BDG_IFNAMSIZ];
214 
215 	/* Indexes of active ports (up to active_ports)
216 	 * and all other remaining ports.
217 	 */
218 	uint32_t	bdg_port_index[NM_BDG_MAXPORTS];
219 	/* used by netmap_bdg_detach_common() */
220 	uint32_t	tmp_bdg_port_index[NM_BDG_MAXPORTS];
221 
222 	struct netmap_vp_adapter *bdg_ports[NM_BDG_MAXPORTS];
223 
224 	/*
225 	 * Programmable lookup function to figure out the destination port.
226 	 * It returns either the index of the destination port,
227 	 * NM_BDG_BROADCAST to broadcast this packet, or NM_BDG_NOPORT not to
228 	 * forward this packet.  ring_nr is the source ring index, and the
229 	 * function may overwrite this value to forward this packet to a
230 	 * different ring index.
231 	 * The function is set by netmap_bdg_regops().
232 	 */
233 	struct netmap_bdg_ops *bdg_ops;
234 
235 	/*
236 	 * Contains the data structure used by the bdg_ops.lookup function.
237 	 * By default it points to *ht, which is allocated on attach and used by the default
238 	 * lookup function; otherwise it points to the data structure received by netmap_bdg_regops().
239 	 */
240 	void *private_data;
241 	struct nm_hash_ent *ht;
242 
243 	/* Currently used to specify whether the bridge is still in use while empty and
244 	 * whether it has been put in exclusive mode by an external module; see netmap_bdg_regops()
245 	 * and netmap_bdg_create().
246 	 */
247 #define NM_BDG_ACTIVE		1
248 #define NM_BDG_EXCLUSIVE	2
249 	uint8_t			bdg_flags;
250 
251 
252 #ifdef CONFIG_NET_NS
253 	struct net *ns;
254 #endif /* CONFIG_NET_NS */
255 };
256 
257 const char*
258 netmap_bdg_name(struct netmap_vp_adapter *vp)
259 {
260 	struct nm_bridge *b = vp->na_bdg;
261 	if (b == NULL)
262 		return NULL;
263 	return b->bdg_basename;
264 }
265 
266 
267 #ifndef CONFIG_NET_NS
268 /*
269  * XXX in principle nm_bridges could be created dynamically.
270  * Right now we have a static array and deletions are protected
271  * by an exclusive lock.
272  */
273 static struct nm_bridge *nm_bridges;
274 #endif /* !CONFIG_NET_NS */
275 
276 
277 /*
278  * this is a slightly optimized copy routine which rounds
279  * to a multiple of 64 bytes and is often faster than dealing
280  * with other odd sizes. We assume there is enough room
281  * in the source and destination buffers.
282  *
283  * XXX only for multiples of 64 bytes, non-overlapping; see the usage note below.
284  */
285 static inline void
286 pkt_copy(void *_src, void *_dst, int l)
287 {
288 	uint64_t *src = _src;
289 	uint64_t *dst = _dst;
290 	if (unlikely(l >= 1024)) {
291 		memcpy(dst, src, l);
292 		return;
293 	}
294 	for (; likely(l > 0); l-=64) {
295 		*dst++ = *src++;
296 		*dst++ = *src++;
297 		*dst++ = *src++;
298 		*dst++ = *src++;
299 		*dst++ = *src++;
300 		*dst++ = *src++;
301 		*dst++ = *src++;
302 		*dst++ = *src++;
303 	}
304 }
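/*
 * Usage note (sketch, hypothetical caller): the length passed in is the
 * actual packet length; the loop above copies in 64-byte units and may
 * write up to 63 bytes past it, which is safe only because netmap buffers
 * have the extra room assumed in the comment above, e.g.
 *
 *	pkt_copy(NMB(&na->up, src_slot), dst_buf, src_slot->len);
 */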
305 
306 
307 static int
308 nm_is_id_char(const char c)
309 {
310 	return (c >= 'a' && c <= 'z') ||
311 	       (c >= 'A' && c <= 'Z') ||
312 	       (c >= '0' && c <= '9') ||
313 	       (c == '_');
314 }
315 
316 /* Validate the name of a VALE bridge port and return the
317  * position of the ":" character. */
318 static int
319 nm_vale_name_validate(const char *name)
320 {
321 	int colon_pos = -1;
322 	int i;
323 
324 	if (!name || strlen(name) < strlen(NM_BDG_NAME)) {
325 		return -1;
326 	}
327 
328 	for (i = 0; i < NM_BDG_IFNAMSIZ && name[i]; i++) {
329 		if (name[i] == ':') {
330 			colon_pos = i;
331 			break;
332 		} else if (!nm_is_id_char(name[i])) {
333 			return -1;
334 		}
335 	}
336 
337 	if (strlen(name) - colon_pos > IFNAMSIZ) {
338 		/* interface name too long */
339 		return -1;
340 	}
341 
342 	return colon_pos;
343 }
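/*
 * Informative examples (not part of this file):
 *
 *	nm_vale_name_validate("vale0:p1");	// returns 5, the ':' position
 *	nm_vale_name_validate("vale-0:p1");	// returns -1, '-' is not an id char
 *	nm_vale_name_validate("vale1");		// returns -1, no ':' found
 */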
344 
345 /*
346  * locate a bridge among the existing ones.
347  * MUST BE CALLED WITH NMG_LOCK()
348  *
349  * a ':' in the name terminates the bridge name. Otherwise, just NM_NAME.
350  * We assume that this is called with a name of at least NM_NAME chars.
351  */
352 static struct nm_bridge *
353 nm_find_bridge(const char *name, int create)
354 {
355 	int i, namelen;
356 	struct nm_bridge *b = NULL, *bridges;
357 	u_int num_bridges;
358 
359 	NMG_LOCK_ASSERT();
360 
361 	netmap_bns_getbridges(&bridges, &num_bridges);
362 
363 	namelen = nm_vale_name_validate(name);
364 	if (namelen < 0) {
365 		D("invalid bridge name %s", name ? name : "(null)");
366 		return NULL;
367 	}
368 
369 	/* lookup the name, remember empty slot if there is one */
370 	for (i = 0; i < num_bridges; i++) {
371 		struct nm_bridge *x = bridges + i;
372 
373 		if ((x->bdg_flags & NM_BDG_ACTIVE) + x->bdg_active_ports == 0) {
374 			if (create && b == NULL)
375 				b = x;	/* record empty slot */
376 		} else if (x->bdg_namelen != namelen) {
377 			continue;
378 		} else if (strncmp(name, x->bdg_basename, namelen) == 0) {
379 			ND("found '%.*s' at %d", namelen, name, i);
380 			b = x;
381 			break;
382 		}
383 	}
384 	if (i == num_bridges && b) { /* name not found, can create entry */
385 		/* initialize the bridge */
386 		ND("create new bridge %s with ports %d", b->bdg_basename,
387 			b->bdg_active_ports);
388 		b->ht = nm_os_malloc(sizeof(struct nm_hash_ent) * NM_BDG_HASH);
389 		if (b->ht == NULL) {
390 			D("failed to allocate hash table");
391 			return NULL;
392 		}
393 		strncpy(b->bdg_basename, name, namelen);
394 		b->bdg_namelen = namelen;
395 		b->bdg_active_ports = 0;
396 		for (i = 0; i < NM_BDG_MAXPORTS; i++)
397 			b->bdg_port_index[i] = i;
398 		/* set the default function */
399 		b->bdg_ops = &default_bdg_ops;
400 		b->private_data = b->ht;
401 		b->bdg_flags = 0;
402 		NM_BNS_GET(b);
403 	}
404 	return b;
405 }
406 
407 
408 /*
409  * Free the forwarding tables for rings attached to switch ports.
410  */
411 static void
412 nm_free_bdgfwd(struct netmap_adapter *na)
413 {
414 	int nrings, i;
415 	struct netmap_kring **kring;
416 
417 	NMG_LOCK_ASSERT();
418 	nrings = na->num_tx_rings;
419 	kring = na->tx_rings;
420 	for (i = 0; i < nrings; i++) {
421 		if (kring[i]->nkr_ft) {
422 			nm_os_free(kring[i]->nkr_ft);
423 			kring[i]->nkr_ft = NULL; /* protect from freeing twice */
424 		}
425 	}
426 }
427 
428 
429 /*
430  * Allocate the forwarding tables for the rings attached to the bridge ports.
431  */
432 static int
433 nm_alloc_bdgfwd(struct netmap_adapter *na)
434 {
435 	int nrings, l, i, num_dstq;
436 	struct netmap_kring **kring;
437 
438 	NMG_LOCK_ASSERT();
439 	/* all port:rings + broadcast */
440 	num_dstq = NM_BDG_MAXPORTS * NM_BDG_MAXRINGS + 1;
441 	l = sizeof(struct nm_bdg_fwd) * NM_BDG_BATCH_MAX;
442 	l += sizeof(struct nm_bdg_q) * num_dstq;
443 	l += sizeof(uint16_t) * NM_BDG_BATCH_MAX;
444 
445 	nrings = netmap_real_rings(na, NR_TX);
446 	kring = na->tx_rings;
447 	for (i = 0; i < nrings; i++) {
448 		struct nm_bdg_fwd *ft;
449 		struct nm_bdg_q *dstq;
450 		int j;
451 
452 		ft = nm_os_malloc(l);
453 		if (!ft) {
454 			nm_free_bdgfwd(na);
455 			return ENOMEM;
456 		}
457 		dstq = (struct nm_bdg_q *)(ft + NM_BDG_BATCH_MAX);
458 		for (j = 0; j < num_dstq; j++) {
459 			dstq[j].bq_head = dstq[j].bq_tail = NM_FT_NULL;
460 			dstq[j].bq_len = 0;
461 		}
462 		kring[i]->nkr_ft = ft;
463 	}
464 	return 0;
465 }
466 
467 static int
468 netmap_bdg_free(struct nm_bridge *b)
469 {
470 	if ((b->bdg_flags & NM_BDG_ACTIVE) + b->bdg_active_ports != 0) {
471 		return EBUSY;
472 	}
473 
474 	ND("marking bridge %s as free", b->bdg_basename);
475 	nm_os_free(b->ht);
476 	b->bdg_ops = NULL;
477 	b->bdg_flags = 0;
478 	NM_BNS_PUT(b);
479 	return 0;
480 }
481 
482 
483 /* remove from bridge b the ports in slots hw and sw
484  * (sw can be -1 if not needed)
485  */
486 static void
487 netmap_bdg_detach_common(struct nm_bridge *b, int hw, int sw)
488 {
489 	int s_hw = hw, s_sw = sw;
490 	int i, lim =b->bdg_active_ports;
491 	uint32_t *tmp = b->tmp_bdg_port_index;
492 
493 	/*
494 	New algorithm:
495 	make a copy of bdg_port_index;
496 	lookup NA(ifp)->bdg_port and SWNA(ifp)->bdg_port
497 	in the array of bdg_port_index, replacing them with
498 	entries from the bottom of the array;
499 	decrement bdg_active_ports;
500 	acquire BDG_WLOCK() and copy back the array.
501 	 */
502 
503 	if (netmap_verbose)
504 		D("detach %d and %d (lim %d)", hw, sw, lim);
505 	/* make a copy of the list of active ports, update it,
506 	 * and then copy back within BDG_WLOCK().
507 	 */
508 	memcpy(b->tmp_bdg_port_index, b->bdg_port_index, sizeof(b->tmp_bdg_port_index));
509 	for (i = 0; (hw >= 0 || sw >= 0) && i < lim; ) {
510 		if (hw >= 0 && tmp[i] == hw) {
511 			ND("detach hw %d at %d", hw, i);
512 			lim--; /* point to last active port */
513 			tmp[i] = tmp[lim]; /* swap with i */
514 			tmp[lim] = hw;	/* now this is inactive */
515 			hw = -1;
516 		} else if (sw >= 0 && tmp[i] == sw) {
517 			ND("detach sw %d at %d", sw, i);
518 			lim--;
519 			tmp[i] = tmp[lim];
520 			tmp[lim] = sw;
521 			sw = -1;
522 		} else {
523 			i++;
524 		}
525 	}
526 	if (hw >= 0 || sw >= 0) {
527 		D("XXX delete failed hw %d sw %d, should panic...", hw, sw);
528 	}
529 
530 	BDG_WLOCK(b);
531 	if (b->bdg_ops->dtor)
532 		b->bdg_ops->dtor(b->bdg_ports[s_hw]);
533 	b->bdg_ports[s_hw] = NULL;
534 	if (s_sw >= 0) {
535 		b->bdg_ports[s_sw] = NULL;
536 	}
537 	memcpy(b->bdg_port_index, b->tmp_bdg_port_index, sizeof(b->tmp_bdg_port_index));
538 	b->bdg_active_ports = lim;
539 	BDG_WUNLOCK(b);
540 
541 	ND("now %d active ports", lim);
542 	netmap_bdg_free(b);
543 }
544 
545 static inline void *
546 nm_bdg_get_auth_token(struct nm_bridge *b)
547 {
548 	return b->ht;
549 }
550 
551 /* bridge not in exclusive mode ==> always valid
552  * bridge in exclusive mode (created through netmap_bdg_create()) ==> check authentication token
553  */
554 static inline int
555 nm_bdg_valid_auth_token(struct nm_bridge *b, void *auth_token)
556 {
557 	return !(b->bdg_flags & NM_BDG_EXCLUSIVE) || b->ht == auth_token;
558 }
559 
560 /* Allows external modules to create bridges in exclusive mode,
561  * returns an authentication token that the external module will need
562  * to provide during nm_bdg_ctl_{attach, detach}(), netmap_bdg_regops(),
563  * and nm_bdg_update_private_data() operations.
564  * Successfully executed if ret != NULL and *return_status == 0.
565  */
566 void *
567 netmap_bdg_create(const char *bdg_name, int *return_status)
568 {
569 	struct nm_bridge *b = NULL;
570 	void *ret = NULL;
571 
572 	NMG_LOCK();
573 	b = nm_find_bridge(bdg_name, 0 /* don't create */);
574 	if (b) {
575 		*return_status = EEXIST;
576 		goto unlock_bdg_create;
577 	}
578 
579 	b = nm_find_bridge(bdg_name, 1 /* create */);
580 	if (!b) {
581 		*return_status = ENOMEM;
582 		goto unlock_bdg_create;
583 	}
584 
585 	b->bdg_flags |= NM_BDG_ACTIVE | NM_BDG_EXCLUSIVE;
586 	ret = nm_bdg_get_auth_token(b);
587 	*return_status = 0;
588 
589 unlock_bdg_create:
590 	NMG_UNLOCK();
591 	return ret;
592 }
593 
594 /* Allows external modules to destroy a bridge created through
595  * netmap_bdg_create(), the bridge must be empty.
596  */
597 int
598 netmap_bdg_destroy(const char *bdg_name, void *auth_token)
599 {
600 	struct nm_bridge *b = NULL;
601 	int ret = 0;
602 
603 	NMG_LOCK();
604 	b = nm_find_bridge(bdg_name, 0 /* don't create */);
605 	if (!b) {
606 		ret = ENXIO;
607 		goto unlock_bdg_free;
608 	}
609 
610 	if (!nm_bdg_valid_auth_token(b, auth_token)) {
611 		ret = EACCES;
612 		goto unlock_bdg_free;
613 	}
614 	if (!(b->bdg_flags & NM_BDG_EXCLUSIVE)) {
615 		ret = EINVAL;
616 		goto unlock_bdg_free;
617 	}
618 
619 	b->bdg_flags &= ~(NM_BDG_EXCLUSIVE | NM_BDG_ACTIVE);
620 	ret = netmap_bdg_free(b);
621 	if (ret) {
622 		b->bdg_flags |= NM_BDG_EXCLUSIVE | NM_BDG_ACTIVE;
623 	}
624 
625 unlock_bdg_free:
626 	NMG_UNLOCK();
627 	return ret;
628 }
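/*
 * Illustrative usage from an external kernel module (sketch only; the
 * bridge name "vale1:" and the ops structure are hypothetical, error
 * handling omitted):
 *
 *	int status;
 *	void *token = netmap_bdg_create("vale1:", &status);
 *	if (token != NULL && status == 0) {
 *		// the token authenticates later calls, e.g.
 *		// netmap_bdg_regops("vale1:", &my_ops, my_data, token);
 *		// ...
 *		// netmap_bdg_destroy("vale1:", token);
 *	}
 */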
629 
630 
631 
632 /* nm_bdg_ctl callback for VALE ports */
633 static int
634 netmap_vp_bdg_ctl(struct nmreq_header *hdr, struct netmap_adapter *na)
635 {
636 	struct netmap_vp_adapter *vpna = (struct netmap_vp_adapter *)na;
637 	struct nm_bridge *b = vpna->na_bdg;
638 
639 	if (hdr->nr_reqtype == NETMAP_REQ_VALE_ATTACH) {
640 		return 0; /* nothing to do */
641 	}
642 	if (b) {
643 		netmap_set_all_rings(na, 0 /* disable */);
644 		netmap_bdg_detach_common(b, vpna->bdg_port, -1);
645 		vpna->na_bdg = NULL;
646 		netmap_set_all_rings(na, 1 /* enable */);
647 	}
648 	/* we took the reference just for the attach */
649 	netmap_adapter_put(na);
650 	return 0;
651 }
652 
653 /* nm_dtor callback for ephemeral VALE ports */
654 static void
655 netmap_vp_dtor(struct netmap_adapter *na)
656 {
657 	struct netmap_vp_adapter *vpna = (struct netmap_vp_adapter*)na;
658 	struct nm_bridge *b = vpna->na_bdg;
659 
660 	ND("%s has %d references", na->name, na->na_refcount);
661 
662 	if (b) {
663 		netmap_bdg_detach_common(b, vpna->bdg_port, -1);
664 	}
665 
666 	if (na->ifp != NULL && !nm_iszombie(na)) {
667 		WNA(na->ifp) = NULL;
668 		if (vpna->autodelete) {
669 			ND("releasing %s", na->ifp->if_xname);
670 			NMG_UNLOCK();
671 			nm_os_vi_detach(na->ifp);
672 			NMG_LOCK();
673 		}
674 	}
675 }
676 
677 /* creates a persistent VALE port */
678 int
679 nm_vi_create(struct nmreq_header *hdr)
680 {
681 	struct nmreq_vale_newif *req =
682 		(struct nmreq_vale_newif *)(uintptr_t)hdr->nr_body;
683 	int error = 0;
684 	/* Build a nmreq_register out of the nmreq_vale_newif,
685 	 * so that we can call netmap_get_bdg_na(). */
686 	struct nmreq_register regreq;
687 	bzero(&regreq, sizeof(regreq));
688 	regreq.nr_tx_slots = req->nr_tx_slots;
689 	regreq.nr_rx_slots = req->nr_rx_slots;
690 	regreq.nr_tx_rings = req->nr_tx_rings;
691 	regreq.nr_rx_rings = req->nr_rx_rings;
692 	regreq.nr_mem_id = req->nr_mem_id;
693 	hdr->nr_reqtype = NETMAP_REQ_REGISTER;
694 	hdr->nr_body = (uintptr_t)&regreq;
695 	error = netmap_vi_create(hdr, 0 /* no autodelete */);
696 	hdr->nr_reqtype = NETMAP_REQ_VALE_NEWIF;
697 	hdr->nr_body = (uintptr_t)req;
698 	/* Write back to the original struct. */
699 	req->nr_tx_slots = regreq.nr_tx_slots;
700 	req->nr_rx_slots = regreq.nr_rx_slots;
701 	req->nr_tx_rings = regreq.nr_tx_rings;
702 	req->nr_rx_rings = regreq.nr_rx_rings;
703 	req->nr_mem_id = regreq.nr_mem_id;
704 	return error;
705 }
706 
707 /* remove a persistent VALE port from the system */
708 int
709 nm_vi_destroy(const char *name)
710 {
711 	struct ifnet *ifp;
712 	struct netmap_vp_adapter *vpna;
713 	int error;
714 
715 	ifp = ifunit_ref(name);
716 	if (!ifp)
717 		return ENXIO;
718 	NMG_LOCK();
719 	/* make sure this is actually a VALE port */
720 	if (!NM_NA_VALID(ifp) || NA(ifp)->nm_register != netmap_vp_reg) {
721 		error = EINVAL;
722 		goto err;
723 	}
724 
725 	vpna = (struct netmap_vp_adapter *)NA(ifp);
726 
727 	/* we can only destroy ports that were created via NETMAP_BDG_NEWIF */
728 	if (vpna->autodelete) {
729 		error = EINVAL;
730 		goto err;
731 	}
732 
733 	/* also make sure that nobody is using the interface */
734 	if (NETMAP_OWNED_BY_ANY(&vpna->up) ||
735 	    vpna->up.na_refcount > 1 /* any ref besides the one in nm_vi_create()? */) {
736 		error = EBUSY;
737 		goto err;
738 	}
739 
740 	NMG_UNLOCK();
741 
742 	D("destroying a persistent vale interface %s", ifp->if_xname);
743 	/* Linux requires that all references be released
744 	 * before unregistering
745 	 */
746 	netmap_detach(ifp);
747 	if_rele(ifp);
748 	nm_os_vi_detach(ifp);
749 	return 0;
750 
751 err:
752 	NMG_UNLOCK();
753 	if_rele(ifp);
754 	return error;
755 }
756 
757 static int
758 nm_update_info(struct nmreq_register *req, struct netmap_adapter *na)
759 {
760 	req->nr_rx_rings = na->num_rx_rings;
761 	req->nr_tx_rings = na->num_tx_rings;
762 	req->nr_rx_slots = na->num_rx_desc;
763 	req->nr_tx_slots = na->num_tx_desc;
764 	return netmap_mem_get_info(na->nm_mem, &req->nr_memsize, NULL,
765 					&req->nr_mem_id);
766 }
767 
768 /*
769  * Create a virtual interface registered to the system.
770  * The interface will be attached to a bridge later.
771  */
772 int
773 netmap_vi_create(struct nmreq_header *hdr, int autodelete)
774 {
775 	struct nmreq_register *req = (struct nmreq_register *)(uintptr_t)hdr->nr_body;
776 	struct ifnet *ifp;
777 	struct netmap_vp_adapter *vpna;
778 	struct netmap_mem_d *nmd = NULL;
779 	int error;
780 
781 	if (hdr->nr_reqtype != NETMAP_REQ_REGISTER) {
782 		return EINVAL;
783 	}
784 
785 	/* don't include VALE prefix */
786 	if (!strncmp(hdr->nr_name, NM_BDG_NAME, strlen(NM_BDG_NAME)))
787 		return EINVAL;
788 	if (strlen(hdr->nr_name) >= IFNAMSIZ) {
789 		return EINVAL;
790 	}
791 	ifp = ifunit_ref(hdr->nr_name);
792 	if (ifp) { /* already exists, cannot create a new one */
793 		error = EEXIST;
794 		NMG_LOCK();
795 		if (NM_NA_VALID(ifp)) {
796 			int update_err = nm_update_info(req, NA(ifp));
797 			if (update_err)
798 				error = update_err;
799 		}
800 		NMG_UNLOCK();
801 		if_rele(ifp);
802 		return error;
803 	}
804 	error = nm_os_vi_persist(hdr->nr_name, &ifp);
805 	if (error)
806 		return error;
807 
808 	NMG_LOCK();
809 	if (req->nr_mem_id) {
810 		nmd = netmap_mem_find(req->nr_mem_id);
811 		if (nmd == NULL) {
812 			error = EINVAL;
813 			goto err_1;
814 		}
815 	}
816 	/* netmap_vp_create creates a struct netmap_vp_adapter */
817 	error = netmap_vp_create(hdr, ifp, nmd, &vpna);
818 	if (error) {
819 		D("error %d", error);
820 		goto err_1;
821 	}
822 	/* persist-specific routines */
823 	vpna->up.nm_bdg_ctl = netmap_vp_bdg_ctl;
824 	if (!autodelete) {
825 		netmap_adapter_get(&vpna->up);
826 	} else {
827 		vpna->autodelete = 1;
828 	}
829 	NM_ATTACH_NA(ifp, &vpna->up);
830 	/* return the updated info */
831 	error = nm_update_info(req, &vpna->up);
832 	if (error) {
833 		goto err_2;
834 	}
835 	ND("returning nr_mem_id %d", req->nr_mem_id);
836 	if (nmd)
837 		netmap_mem_put(nmd);
838 	NMG_UNLOCK();
839 	ND("created %s", ifp->if_xname);
840 	return 0;
841 
842 err_2:
843 	netmap_detach(ifp);
844 err_1:
845 	if (nmd)
846 		netmap_mem_put(nmd);
847 	NMG_UNLOCK();
848 	nm_os_vi_detach(ifp);
849 
850 	return error;
851 }
852 
853 /* Try to get a reference to a netmap adapter attached to a VALE switch.
854  * If the adapter is found (or is created), this function returns 0, a
855  * non-NULL pointer is returned into *na, and the caller holds a
856  * reference to the adapter.
857  * If an adapter is not found, then no reference is grabbed and the
858  * function returns an error code, or 0 if there is just a VALE prefix
859  * mismatch. Therefore the caller holds a reference when
860  * (*na != NULL && return == 0).
861  */
862 int
863 netmap_get_bdg_na(struct nmreq_header *hdr, struct netmap_adapter **na,
864 		struct netmap_mem_d *nmd, int create)
865 {
866 	char *nr_name = hdr->nr_name;
867 	const char *ifname;
868 	struct ifnet *ifp = NULL;
869 	int error = 0;
870 	struct netmap_vp_adapter *vpna, *hostna = NULL;
871 	struct nm_bridge *b;
872 	uint32_t i, j;
873 	uint32_t cand = NM_BDG_NOPORT, cand2 = NM_BDG_NOPORT;
874 	int needed;
875 
876 	*na = NULL;     /* default return value */
877 
878 	/* first try to see if this is a bridge port. */
879 	NMG_LOCK_ASSERT();
880 	if (strncmp(nr_name, NM_BDG_NAME, sizeof(NM_BDG_NAME) - 1)) {
881 		return 0;  /* no error, but no VALE prefix */
882 	}
883 
884 	b = nm_find_bridge(nr_name, create);
885 	if (b == NULL) {
886 		ND("no bridges available for '%s'", nr_name);
887 		return (create ? ENOMEM : ENXIO);
888 	}
889 	if (strlen(nr_name) < b->bdg_namelen) /* impossible */
890 		panic("x");
891 
892 	/* Now we are sure that name starts with the bridge's name,
893 	 * lookup the port in the bridge. We need to scan the entire
894 	 * list. It is not important to hold a WLOCK on the bridge
895 	 * during the search because NMG_LOCK already guarantees
896 	 * that there are no other possible writers.
897 	 */
898 
899 	/* lookup in the local list of ports */
900 	for (j = 0; j < b->bdg_active_ports; j++) {
901 		i = b->bdg_port_index[j];
902 		vpna = b->bdg_ports[i];
903 		ND("checking %s", vpna->up.name);
904 		if (!strcmp(vpna->up.name, nr_name)) {
905 			netmap_adapter_get(&vpna->up);
906 			ND("found existing if %s refs %d", nr_name, vpna->up.na_refcount);
907 			*na = &vpna->up;
908 			return 0;
909 		}
910 	}
911 	/* not found, should we create it? */
912 	if (!create)
913 		return ENXIO;
914 	/* yes we should, see if we have space to attach entries */
915 	needed = 2; /* in some cases we only need 1 */
916 	if (b->bdg_active_ports + needed >= NM_BDG_MAXPORTS) {
917 		D("bridge full %d, cannot create new port", b->bdg_active_ports);
918 		return ENOMEM;
919 	}
920 	/* record the next two ports available, but do not allocate yet */
921 	cand = b->bdg_port_index[b->bdg_active_ports];
922 	cand2 = b->bdg_port_index[b->bdg_active_ports + 1];
923 	ND("+++ bridge %s port %s used %d avail %d %d",
924 		b->bdg_basename, ifname, b->bdg_active_ports, cand, cand2);
925 
926 	/*
927 	 * try to see if there is a matching NIC with this name
928 	 * (after the bridge's name)
929 	 */
930 	ifname = nr_name + b->bdg_namelen + 1;
931 	ifp = ifunit_ref(ifname);
932 	if (!ifp) {
933 		/* Create an ephemeral virtual port.
934 		 * This block contains all the ephemeral-specific logic.
935 		 */
936 
937 		if (hdr->nr_reqtype != NETMAP_REQ_REGISTER) {
938 			error = EINVAL;
939 			goto out;
940 		}
941 
942 		/* bdg_netmap_attach creates a struct netmap_adapter */
943 		error = netmap_vp_create(hdr, NULL, nmd, &vpna);
944 		if (error) {
945 			D("error %d", error);
946 			goto out;
947 		}
948 		/* shortcut - we can skip get_hw_na(),
949 		 * ownership check and nm_bdg_attach()
950 		 */
951 
952 	} else {
953 		struct netmap_adapter *hw;
954 
955 		/* the vale:nic syntax is only valid for some commands */
956 		switch (hdr->nr_reqtype) {
957 		case NETMAP_REQ_VALE_ATTACH:
958 		case NETMAP_REQ_VALE_DETACH:
959 		case NETMAP_REQ_VALE_POLLING_ENABLE:
960 		case NETMAP_REQ_VALE_POLLING_DISABLE:
961 			break; /* ok */
962 		default:
963 			error = EINVAL;
964 			goto out;
965 		}
966 
967 		error = netmap_get_hw_na(ifp, nmd, &hw);
968 		if (error || hw == NULL)
969 			goto out;
970 
971 		/* host adapter might not be created */
972 		error = hw->nm_bdg_attach(nr_name, hw);
973 		if (error)
974 			goto out;
975 		vpna = hw->na_vp;
976 		hostna = hw->na_hostvp;
977 		if (hdr->nr_reqtype == NETMAP_REQ_VALE_ATTACH) {
978 			/* Check if we need to skip the host rings. */
979 			struct nmreq_vale_attach *areq =
980 				(struct nmreq_vale_attach *)(uintptr_t)hdr->nr_body;
981 			if (areq->reg.nr_mode != NR_REG_NIC_SW) {
982 				hostna = NULL;
983 			}
984 		}
985 	}
986 
987 	BDG_WLOCK(b);
988 	vpna->bdg_port = cand;
989 	ND("NIC  %p to bridge port %d", vpna, cand);
990 	/* bind the port to the bridge (virtual ports are not active) */
991 	b->bdg_ports[cand] = vpna;
992 	vpna->na_bdg = b;
993 	b->bdg_active_ports++;
994 	if (hostna != NULL) {
995 		/* also bind the host stack to the bridge */
996 		b->bdg_ports[cand2] = hostna;
997 		hostna->bdg_port = cand2;
998 		hostna->na_bdg = b;
999 		b->bdg_active_ports++;
1000 		ND("host %p to bridge port %d", hostna, cand2);
1001 	}
1002 	ND("if %s refs %d", ifname, vpna->up.na_refcount);
1003 	BDG_WUNLOCK(b);
1004 	*na = &vpna->up;
1005 	netmap_adapter_get(*na);
1006 
1007 out:
1008 	if (ifp)
1009 		if_rele(ifp);
1010 
1011 	return error;
1012 }
1013 
1014 /* Process NETMAP_REQ_VALE_ATTACH.
1015  */
1016 int
1017 nm_bdg_ctl_attach(struct nmreq_header *hdr, void *auth_token)
1018 {
1019 	struct nmreq_vale_attach *req =
1020 		(struct nmreq_vale_attach *)(uintptr_t)hdr->nr_body;
1021 	struct netmap_vp_adapter * vpna;
1022 	struct netmap_adapter *na;
1023 	struct netmap_mem_d *nmd = NULL;
1024 	struct nm_bridge *b = NULL;
1025 	int error;
1026 
1027 	NMG_LOCK();
1028 	/* permission check for modified bridges */
1029 	b = nm_find_bridge(hdr->nr_name, 0 /* don't create */);
1030 	if (b && !nm_bdg_valid_auth_token(b, auth_token)) {
1031 		error = EACCES;
1032 		goto unlock_exit;
1033 	}
1034 
1035 	if (req->reg.nr_mem_id) {
1036 		nmd = netmap_mem_find(req->reg.nr_mem_id);
1037 		if (nmd == NULL) {
1038 			error = EINVAL;
1039 			goto unlock_exit;
1040 		}
1041 	}
1042 
1043 	/* check for existing one */
1044 	error = netmap_get_bdg_na(hdr, &na, nmd, 0);
1045 	if (!error) {
1046 		error = EBUSY;
1047 		goto unref_exit;
1048 	}
1049 	error = netmap_get_bdg_na(hdr, &na,
1050 				nmd, 1 /* create if not exists */);
1051 	if (error) { /* no device */
1052 		goto unlock_exit;
1053 	}
1054 
1055 	if (na == NULL) { /* VALE prefix missing */
1056 		error = EINVAL;
1057 		goto unlock_exit;
1058 	}
1059 
1060 	if (NETMAP_OWNED_BY_ANY(na)) {
1061 		error = EBUSY;
1062 		goto unref_exit;
1063 	}
1064 
1065 	if (na->nm_bdg_ctl) {
1066 		/* nop for VALE ports. The bwrap needs to put the hwna
1067 		 * in netmap mode (see netmap_bwrap_bdg_ctl)
1068 		 */
1069 		error = na->nm_bdg_ctl(hdr, na);
1070 		if (error)
1071 			goto unref_exit;
1072 		ND("registered %s to netmap-mode", na->name);
1073 	}
1074 	vpna = (struct netmap_vp_adapter *)na;
1075 	req->port_index = vpna->bdg_port;
1076 	NMG_UNLOCK();
1077 	return 0;
1078 
1079 unref_exit:
1080 	netmap_adapter_put(na);
1081 unlock_exit:
1082 	NMG_UNLOCK();
1083 	return error;
1084 }
1085 
1086 static inline int
1087 nm_is_bwrap(struct netmap_adapter *na)
1088 {
1089 	return na->nm_register == netmap_bwrap_reg;
1090 }
1091 
1092 /* Process NETMAP_REQ_VALE_DETACH.
1093  */
1094 int
1095 nm_bdg_ctl_detach(struct nmreq_header *hdr, void *auth_token)
1096 {
1097 	struct nmreq_vale_detach *nmreq_det = (void *)(uintptr_t)hdr->nr_body;
1098 	struct netmap_vp_adapter *vpna;
1099 	struct netmap_adapter *na;
1100 	struct nm_bridge *b = NULL;
1101 	int error;
1102 
1103 	NMG_LOCK();
1104 	/* permission check for modified bridges */
1105 	b = nm_find_bridge(hdr->nr_name, 0 /* don't create */);
1106 	if (b && !nm_bdg_valid_auth_token(b, auth_token)) {
1107 		error = EACCES;
1108 		goto unlock_exit;
1109 	}
1110 
1111 	error = netmap_get_bdg_na(hdr, &na, NULL, 0 /* don't create */);
1112 	if (error) { /* no device, or another bridge or user owns the device */
1113 		goto unlock_exit;
1114 	}
1115 
1116 	if (na == NULL) { /* VALE prefix missing */
1117 		error = EINVAL;
1118 		goto unlock_exit;
1119 	} else if (nm_is_bwrap(na) &&
1120 		   ((struct netmap_bwrap_adapter *)na)->na_polling_state) {
1121 		/* Don't detach a NIC with polling */
1122 		error = EBUSY;
1123 		goto unref_exit;
1124 	}
1125 
1126 	vpna = (struct netmap_vp_adapter *)na;
1127 	if (na->na_vp != vpna) {
1128 		/* trying to detach the first attachment of a VALE persistent port
1129 		 * that is attached to 2 bridges
1130 		 */
1131 		error = EBUSY;
1132 		goto unref_exit;
1133 	}
1134 	nmreq_det->port_index = vpna->bdg_port;
1135 
1136 	if (na->nm_bdg_ctl) {
1137 		/* remove the port from bridge. The bwrap
1138 		 * also needs to put the hwna in normal mode
1139 		 */
1140 		error = na->nm_bdg_ctl(hdr, na);
1141 	}
1142 
1143 unref_exit:
1144 	netmap_adapter_put(na);
1145 unlock_exit:
1146 	NMG_UNLOCK();
1147 	return error;
1148 
1149 }
1150 
1151 struct nm_bdg_polling_state;
1152 struct
1153 nm_bdg_kthread {
1154 	struct nm_kctx *nmk;
1155 	u_int qfirst;
1156 	u_int qlast;
1157 	struct nm_bdg_polling_state *bps;
1158 };
1159 
1160 struct nm_bdg_polling_state {
1161 	bool configured;
1162 	bool stopped;
1163 	struct netmap_bwrap_adapter *bna;
1164 	uint32_t mode;
1165 	u_int qfirst;
1166 	u_int qlast;
1167 	u_int cpu_from;
1168 	u_int ncpus;
1169 	struct nm_bdg_kthread *kthreads;
1170 };
1171 
1172 static void
1173 netmap_bwrap_polling(void *data, int is_kthread)
1174 {
1175 	struct nm_bdg_kthread *nbk = data;
1176 	struct netmap_bwrap_adapter *bna;
1177 	u_int qfirst, qlast, i;
1178 	struct netmap_kring **kring0, *kring;
1179 
1180 	if (!nbk)
1181 		return;
1182 	qfirst = nbk->qfirst;
1183 	qlast = nbk->qlast;
1184 	bna = nbk->bps->bna;
1185 	kring0 = NMR(bna->hwna, NR_RX);
1186 
1187 	for (i = qfirst; i < qlast; i++) {
1188 		kring = kring0[i];
1189 		kring->nm_notify(kring, 0);
1190 	}
1191 }
1192 
1193 static int
1194 nm_bdg_create_kthreads(struct nm_bdg_polling_state *bps)
1195 {
1196 	struct nm_kctx_cfg kcfg;
1197 	int i, j;
1198 
1199 	bps->kthreads = nm_os_malloc(sizeof(struct nm_bdg_kthread) * bps->ncpus);
1200 	if (bps->kthreads == NULL)
1201 		return ENOMEM;
1202 
1203 	bzero(&kcfg, sizeof(kcfg));
1204 	kcfg.worker_fn = netmap_bwrap_polling;
1205 	kcfg.use_kthread = 1;
1206 	for (i = 0; i < bps->ncpus; i++) {
1207 		struct nm_bdg_kthread *t = bps->kthreads + i;
1208 		int all = (bps->ncpus == 1 &&
1209 			bps->mode == NETMAP_POLLING_MODE_SINGLE_CPU);
1210 		int affinity = bps->cpu_from + i;
1211 
1212 		t->bps = bps;
1213 		t->qfirst = all ? bps->qfirst /* must be 0 */: affinity;
1214 		t->qlast = all ? bps->qlast : t->qfirst + 1;
1215 		D("kthread %d a:%u qf:%u ql:%u", i, affinity, t->qfirst,
1216 			t->qlast);
1217 
1218 		kcfg.type = i;
1219 		kcfg.worker_private = t;
1220 		t->nmk = nm_os_kctx_create(&kcfg, NULL);
1221 		if (t->nmk == NULL) {
1222 			goto cleanup;
1223 		}
1224 		nm_os_kctx_worker_setaff(t->nmk, affinity);
1225 	}
1226 	return 0;
1227 
1228 cleanup:
1229 	for (j = 0; j < i; j++) {
1230 		struct nm_bdg_kthread *t = bps->kthreads + j;
1231 		nm_os_kctx_destroy(t->nmk);
1232 	}
1233 	nm_os_free(bps->kthreads);
1234 	return EFAULT;
1235 }
1236 
1237 /* A variant of ptnetmap_start_kthreads() */
1238 static int
1239 nm_bdg_polling_start_kthreads(struct nm_bdg_polling_state *bps)
1240 {
1241 	int error, i, j;
1242 
1243 	if (!bps) {
1244 		D("polling is not configured");
1245 		return EFAULT;
1246 	}
1247 	bps->stopped = false;
1248 
1249 	for (i = 0; i < bps->ncpus; i++) {
1250 		struct nm_bdg_kthread *t = bps->kthreads + i;
1251 		error = nm_os_kctx_worker_start(t->nmk);
1252 		if (error) {
1253 			D("error in nm_kthread_start()");
1254 			goto cleanup;
1255 		}
1256 	}
1257 	return 0;
1258 
1259 cleanup:
1260 	for (j = 0; j < i; j++) {
1261 		struct nm_bdg_kthread *t = bps->kthreads + j;
1262 		nm_os_kctx_worker_stop(t->nmk);
1263 	}
1264 	bps->stopped = true;
1265 	return error;
1266 }
1267 
1268 static void
1269 nm_bdg_polling_stop_delete_kthreads(struct nm_bdg_polling_state *bps)
1270 {
1271 	int i;
1272 
1273 	if (!bps)
1274 		return;
1275 
1276 	for (i = 0; i < bps->ncpus; i++) {
1277 		struct nm_bdg_kthread *t = bps->kthreads + i;
1278 		nm_os_kctx_worker_stop(t->nmk);
1279 		nm_os_kctx_destroy(t->nmk);
1280 	}
1281 	bps->stopped = true;
1282 }
1283 
1284 static int
1285 get_polling_cfg(struct nmreq_vale_polling *req, struct netmap_adapter *na,
1286 		struct nm_bdg_polling_state *bps)
1287 {
1288 	unsigned int avail_cpus, core_from;
1289 	unsigned int qfirst, qlast;
1290 	uint32_t i = req->nr_first_cpu_id;
1291 	uint32_t req_cpus = req->nr_num_polling_cpus;
1292 
1293 	avail_cpus = nm_os_ncpus();
1294 
1295 	if (req_cpus == 0) {
1296 		D("req_cpus must be > 0");
1297 		return EINVAL;
1298 	} else if (req_cpus >= avail_cpus) {
1299 		D("Cannot use all the CPUs in the system");
1300 		return EINVAL;
1301 	}
1302 
1303 	if (req->nr_mode == NETMAP_POLLING_MODE_MULTI_CPU) {
1304 		/* Use a separate core for each ring. If nr_num_polling_cpus > 1,
1305 		 * that many consecutive rings are polled.
1306 		 * For example, if nr_first_cpu_id=2 and nr_num_polling_cpus=2,
1307 		 * rings 2 and 3 are polled by cores 2 and 3, respectively (see the sketch below). */
1308 		if (i + req_cpus > nma_get_nrings(na, NR_RX)) {
1309 			D("Rings %u-%u not in range (have %d rings)",
1310 				i, i + req_cpus, nma_get_nrings(na, NR_RX));
1311 			return EINVAL;
1312 		}
1313 		qfirst = i;
1314 		qlast = qfirst + req_cpus;
1315 		core_from = qfirst;
1316 
1317 	} else if (req->nr_mode == NETMAP_POLLING_MODE_SINGLE_CPU) {
1318 		/* Poll all the rings using a core specified by nr_first_cpu_id.
1319 		 * The number of cores must be 1. */
1320 		if (req_cpus != 1) {
1321 			D("ncpus must be 1 for NETMAP_POLLING_MODE_SINGLE_CPU "
1322 				"(was %d)", req_cpus);
1323 			return EINVAL;
1324 		}
1325 		qfirst = 0;
1326 		qlast = nma_get_nrings(na, NR_RX);
1327 		core_from = i;
1328 	} else {
1329 		D("Invalid polling mode");
1330 		return EINVAL;
1331 	}
1332 
1333 	bps->mode = req->nr_mode;
1334 	bps->qfirst = qfirst;
1335 	bps->qlast = qlast;
1336 	bps->cpu_from = core_from;
1337 	bps->ncpus = req_cpus;
1338 	D("%s qfirst %u qlast %u cpu_from %u ncpus %u",
1339 		req->nr_mode == NETMAP_POLLING_MODE_MULTI_CPU ?
1340 		"MULTI" : "SINGLE",
1341 		qfirst, qlast, core_from, req_cpus);
1342 	return 0;
1343 }
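/*
 * Example (sketch): to poll rings 2 and 3 of an attached NIC with one
 * dedicated core each, a request would carry
 *
 *	req->nr_mode = NETMAP_POLLING_MODE_MULTI_CPU;
 *	req->nr_first_cpu_id = 2;
 *	req->nr_num_polling_cpus = 2;
 *
 * which get_polling_cfg() translates into qfirst=2, qlast=4, cpu_from=2.
 */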
1344 
1345 static int
1346 nm_bdg_ctl_polling_start(struct nmreq_vale_polling *req, struct netmap_adapter *na)
1347 {
1348 	struct nm_bdg_polling_state *bps;
1349 	struct netmap_bwrap_adapter *bna;
1350 	int error;
1351 
1352 	bna = (struct netmap_bwrap_adapter *)na;
1353 	if (bna->na_polling_state) {
1354 		D("ERROR adapter already in polling mode");
1355 		return EFAULT;
1356 	}
1357 
1358 	bps = nm_os_malloc(sizeof(*bps));
1359 	if (!bps)
1360 		return ENOMEM;
1361 	bps->configured = false;
1362 	bps->stopped = true;
1363 
1364 	if (get_polling_cfg(req, na, bps)) {
1365 		nm_os_free(bps);
1366 		return EINVAL;
1367 	}
1368 
1369 	if (nm_bdg_create_kthreads(bps)) {
1370 		nm_os_free(bps);
1371 		return EFAULT;
1372 	}
1373 
1374 	bps->configured = true;
1375 	bna->na_polling_state = bps;
1376 	bps->bna = bna;
1377 
1378 	/* disable interrupts if possible */
1379 	nma_intr_enable(bna->hwna, 0);
1380 	/* start kthread now */
1381 	error = nm_bdg_polling_start_kthreads(bps);
1382 	if (error) {
1383 		D("ERROR nm_bdg_polling_start_kthread()");
1384 		nm_os_free(bps->kthreads);
1385 		nm_os_free(bps);
1386 		bna->na_polling_state = NULL;
1387 		nma_intr_enable(bna->hwna, 1);
1388 	}
1389 	return error;
1390 }
1391 
1392 static int
1393 nm_bdg_ctl_polling_stop(struct netmap_adapter *na)
1394 {
1395 	struct netmap_bwrap_adapter *bna = (struct netmap_bwrap_adapter *)na;
1396 	struct nm_bdg_polling_state *bps;
1397 
1398 	if (!bna->na_polling_state) {
1399 		D("ERROR adapter is not in polling mode");
1400 		return EFAULT;
1401 	}
1402 	bps = bna->na_polling_state;
1403 	nm_bdg_polling_stop_delete_kthreads(bna->na_polling_state);
1404 	bps->configured = false;
1405 	nm_os_free(bps);
1406 	bna->na_polling_state = NULL;
1407 	/* reenable interrupts */
1408 	nma_intr_enable(bna->hwna, 1);
1409 	return 0;
1410 }
1411 
1412 int
1413 nm_bdg_polling(struct nmreq_header *hdr)
1414 {
1415 	struct nmreq_vale_polling *req =
1416 		(struct nmreq_vale_polling *)(uintptr_t)hdr->nr_body;
1417 	struct netmap_adapter *na = NULL;
1418 	int error = 0;
1419 
1420 	NMG_LOCK();
1421 	error = netmap_get_bdg_na(hdr, &na, NULL, /*create=*/0);
1422 	if (na && !error) {
1423 		if (!nm_is_bwrap(na)) {
1424 			error = EOPNOTSUPP;
1425 		} else if (hdr->nr_reqtype == NETMAP_BDG_POLLING_ON) {
1426 			error = nm_bdg_ctl_polling_start(req, na);
1427 			if (!error)
1428 				netmap_adapter_get(na);
1429 		} else {
1430 			error = nm_bdg_ctl_polling_stop(na);
1431 			if (!error)
1432 				netmap_adapter_put(na);
1433 		}
1434 		netmap_adapter_put(na);
1435 	} else if (!na && !error) {
1436 		/* Not VALE port. */
1437 		error = EINVAL;
1438 	}
1439 	NMG_UNLOCK();
1440 
1441 	return error;
1442 }
1443 
1444 /* Process NETMAP_REQ_VALE_LIST. */
1445 int
1446 netmap_bdg_list(struct nmreq_header *hdr)
1447 {
1448 	struct nmreq_vale_list *req =
1449 		(struct nmreq_vale_list *)(uintptr_t)hdr->nr_body;
1450 	int namelen = strlen(hdr->nr_name);
1451 	struct nm_bridge *b, *bridges;
1452 	struct netmap_vp_adapter *vpna;
1453 	int error = 0, i, j;
1454 	u_int num_bridges;
1455 
1456 	netmap_bns_getbridges(&bridges, &num_bridges);
1457 
1458 	/* this is used to enumerate bridges and ports */
1459 	if (namelen) { /* look up indexes of bridge and port */
1460 		if (strncmp(hdr->nr_name, NM_BDG_NAME,
1461 					strlen(NM_BDG_NAME))) {
1462 			return EINVAL;
1463 		}
1464 		NMG_LOCK();
1465 		b = nm_find_bridge(hdr->nr_name, 0 /* don't create */);
1466 		if (!b) {
1467 			NMG_UNLOCK();
1468 			return ENOENT;
1469 		}
1470 
1471 		req->nr_bridge_idx = b - bridges; /* bridge index */
1472 		req->nr_port_idx = NM_BDG_NOPORT;
1473 		for (j = 0; j < b->bdg_active_ports; j++) {
1474 			i = b->bdg_port_index[j];
1475 			vpna = b->bdg_ports[i];
1476 			if (vpna == NULL) {
1477 				D("This should not happen");
1478 				continue;
1479 			}
1480 			/* the former and the latter identify a
1481 			 * virtual port and a NIC, respectively
1482 			 */
1483 			if (!strcmp(vpna->up.name, hdr->nr_name)) {
1484 				req->nr_port_idx = i; /* port index */
1485 				break;
1486 			}
1487 		}
1488 		NMG_UNLOCK();
1489 	} else {
1490 		/* return the first non-empty entry starting from
1491 		 * bridge nr_bridge_idx and port nr_port_idx.
1492 		 *
1493 		 * Users can detect the end of the same bridge by
1494 		 * comparing the new and old values of nr_bridge_idx, and can
1495 		 * detect the end of all the bridges by error != 0
1496 		 */
1497 		i = req->nr_bridge_idx;
1498 		j = req->nr_port_idx;
1499 
1500 		NMG_LOCK();
1501 		for (error = ENOENT; i < NM_BRIDGES; i++) {
1502 			b = bridges + i;
1503 			for ( ; j < NM_BDG_MAXPORTS; j++) {
1504 				if (b->bdg_ports[j] == NULL)
1505 					continue;
1506 				vpna = b->bdg_ports[j];
1507 				/* write back the VALE switch name */
1508 				strncpy(hdr->nr_name, vpna->up.name,
1509 					(size_t)IFNAMSIZ);
1510 				error = 0;
1511 				goto out;
1512 			}
1513 			j = 0; /* following bridges scan from 0 */
1514 		}
1515 	out:
1516 		req->nr_bridge_idx = i;
1517 		req->nr_port_idx = j;
1518 		NMG_UNLOCK();
1519 	}
1520 
1521 	return error;
1522 }
1523 
1524 /* Called by external kernel modules (e.g., Openvswitch)
1525  * to set the configure/lookup/dtor functions of a VALE instance.
1526  * Register callbacks to the given bridge. 'name' may be just
1527  * bridge's name (including ':' if it is not just NM_BDG_NAME).
1528  *
1529  * Called without NMG_LOCK.
1530  */
1531 
1532 int
1533 netmap_bdg_regops(const char *name, struct netmap_bdg_ops *bdg_ops, void *private_data, void *auth_token)
1534 {
1535 	struct nm_bridge *b;
1536 	int error = 0;
1537 
1538 	NMG_LOCK();
1539 	b = nm_find_bridge(name, 0 /* don't create */);
1540 	if (!b) {
1541 		error = ENXIO;
1542 		goto unlock_regops;
1543 	}
1544 	if (!nm_bdg_valid_auth_token(b, auth_token)) {
1545 		error = EACCES;
1546 		goto unlock_regops;
1547 	}
1548 
1549 	BDG_WLOCK(b);
1550 	if (!bdg_ops) {
1551 		/* resetting the bridge */
1552 		bzero(b->ht, sizeof(struct nm_hash_ent) * NM_BDG_HASH);
1553 		b->bdg_ops = &default_bdg_ops;
1554 		b->private_data = b->ht;
1555 	} else {
1556 		/* modifying the bridge */
1557 		b->private_data = private_data;
1558 		b->bdg_ops = bdg_ops;
1559 	}
1560 	BDG_WUNLOCK(b);
1561 
1562 unlock_regops:
1563 	NMG_UNLOCK();
1564 	return error;
1565 }
1566 
1567 /* Called by external kernel modules (e.g., Openvswitch)
1568  * to modify the private data previously given to regops().
1569  * 'name' may be just bridge's name (including ':' if it
1570  * is not just NM_BDG_NAME).
1571  * Called without NMG_LOCK.
1572  */
1573 int
1574 nm_bdg_update_private_data(const char *name, bdg_update_private_data_fn_t callback,
1575 	void *callback_data, void *auth_token)
1576 {
1577 	void *private_data = NULL;
1578 	struct nm_bridge *b;
1579 	int error = 0;
1580 
1581 	NMG_LOCK();
1582 	b = nm_find_bridge(name, 0 /* don't create */);
1583 	if (!b) {
1584 		error = EINVAL;
1585 		goto unlock_update_priv;
1586 	}
1587 	if (!nm_bdg_valid_auth_token(b, auth_token)) {
1588 		error = EACCES;
1589 		goto unlock_update_priv;
1590 	}
1591 	BDG_WLOCK(b);
1592 	private_data = callback(b->private_data, callback_data, &error);
1593 	b->private_data = private_data;
1594 	BDG_WUNLOCK(b);
1595 
1596 unlock_update_priv:
1597 	NMG_UNLOCK();
1598 	return error;
1599 }
1600 
1601 int
1602 netmap_bdg_config(struct nm_ifreq *nr)
1603 {
1604 	struct nm_bridge *b;
1605 	int error = EINVAL;
1606 
1607 	NMG_LOCK();
1608 	b = nm_find_bridge(nr->nifr_name, 0);
1609 	if (!b) {
1610 		NMG_UNLOCK();
1611 		return error;
1612 	}
1613 	NMG_UNLOCK();
1614 	/* Don't call config() with NMG_LOCK() held */
1615 	BDG_RLOCK(b);
1616 	if (b->bdg_ops->config != NULL)
1617 		error = b->bdg_ops->config(nr);
1618 	BDG_RUNLOCK(b);
1619 	return error;
1620 }
1621 
1622 
1623 /* nm_krings_create callback for VALE ports.
1624  * Calls the standard netmap_krings_create, then adds leases on rx
1625  * rings and bdgfwd on tx rings.
1626  */
1627 static int
1628 netmap_vp_krings_create(struct netmap_adapter *na)
1629 {
1630 	u_int tailroom;
1631 	int error, i;
1632 	uint32_t *leases;
1633 	u_int nrx = netmap_real_rings(na, NR_RX);
1634 
1635 	/*
1636 	 * Leases are attached to RX rings on vale ports
1637 	 */
1638 	tailroom = sizeof(uint32_t) * na->num_rx_desc * nrx;
1639 
1640 	error = netmap_krings_create(na, tailroom);
1641 	if (error)
1642 		return error;
1643 
1644 	leases = na->tailroom;
1645 
1646 	for (i = 0; i < nrx; i++) { /* Receive rings */
1647 		na->rx_rings[i]->nkr_leases = leases;
1648 		leases += na->num_rx_desc;
1649 	}
1650 
1651 	error = nm_alloc_bdgfwd(na);
1652 	if (error) {
1653 		netmap_krings_delete(na);
1654 		return error;
1655 	}
1656 
1657 	return 0;
1658 }
1659 
1660 
1661 /* nm_krings_delete callback for VALE ports. */
1662 static void
1663 netmap_vp_krings_delete(struct netmap_adapter *na)
1664 {
1665 	nm_free_bdgfwd(na);
1666 	netmap_krings_delete(na);
1667 }
1668 
1669 
1670 static int
1671 nm_bdg_flush(struct nm_bdg_fwd *ft, u_int n,
1672 	struct netmap_vp_adapter *na, u_int ring_nr);
1673 
1674 
1675 /*
1676  * main dispatch routine for the bridge.
1677  * Grab packets from a kring, move them into the ft structure
1678  * associated with the tx (input) port. Max one instance per port,
1679  * filtered on input (ioctl, poll or XXX).
1680  * Returns the next position in the ring.
1681  */
1682 static int
1683 nm_bdg_preflush(struct netmap_kring *kring, u_int end)
1684 {
1685 	struct netmap_vp_adapter *na =
1686 		(struct netmap_vp_adapter*)kring->na;
1687 	struct netmap_ring *ring = kring->ring;
1688 	struct nm_bdg_fwd *ft;
1689 	u_int ring_nr = kring->ring_id;
1690 	u_int j = kring->nr_hwcur, lim = kring->nkr_num_slots - 1;
1691 	u_int ft_i = 0;	/* start from 0 */
1692 	u_int frags = 1; /* how many frags ? */
1693 	struct nm_bridge *b = na->na_bdg;
1694 
1695 	/* To protect against modifications to the bridge we acquire a
1696 	 * shared lock, waiting if we can sleep (if the source port is
1697 	 * attached to a user process) or with a trylock otherwise (NICs).
1698 	 */
1699 	ND("wait rlock for %d packets", ((j > end ? lim+1 : 0) + end) - j);
1700 	if (na->up.na_flags & NAF_BDG_MAYSLEEP)
1701 		BDG_RLOCK(b);
1702 	else if (!BDG_RTRYLOCK(b))
1703 		return j;
1704 	ND(5, "rlock acquired for %d packets", ((j > end ? lim+1 : 0) + end) - j);
1705 	ft = kring->nkr_ft;
1706 
1707 	for (; likely(j != end); j = nm_next(j, lim)) {
1708 		struct netmap_slot *slot = &ring->slot[j];
1709 		char *buf;
1710 
1711 		ft[ft_i].ft_len = slot->len;
1712 		ft[ft_i].ft_flags = slot->flags;
1713 		ft[ft_i].ft_offset = 0;
1714 
1715 		ND("flags is 0x%x", slot->flags);
1716 		/* we do not use the buf changed flag, but we still need to reset it */
1717 		slot->flags &= ~NS_BUF_CHANGED;
1718 
1719 		/* this slot goes into a list so initialize the link field */
1720 		ft[ft_i].ft_next = NM_FT_NULL;
1721 		buf = ft[ft_i].ft_buf = (slot->flags & NS_INDIRECT) ?
1722 			(void *)(uintptr_t)slot->ptr : NMB(&na->up, slot);
1723 		if (unlikely(buf == NULL)) {
1724 			RD(5, "NULL %s buffer pointer from %s slot %d len %d",
1725 				(slot->flags & NS_INDIRECT) ? "INDIRECT" : "DIRECT",
1726 				kring->name, j, ft[ft_i].ft_len);
1727 			buf = ft[ft_i].ft_buf = NETMAP_BUF_BASE(&na->up);
1728 			ft[ft_i].ft_len = 0;
1729 			ft[ft_i].ft_flags = 0;
1730 		}
1731 		__builtin_prefetch(buf);
1732 		++ft_i;
1733 		if (slot->flags & NS_MOREFRAG) {
1734 			frags++;
1735 			continue;
1736 		}
1737 		if (unlikely(netmap_verbose && frags > 1))
1738 			RD(5, "%d frags at %d", frags, ft_i - frags);
1739 		ft[ft_i - frags].ft_frags = frags;
1740 		frags = 1;
1741 		if (unlikely((int)ft_i >= bridge_batch))
1742 			ft_i = nm_bdg_flush(ft, ft_i, na, ring_nr);
1743 	}
1744 	if (frags > 1) {
1745 		/* Here ft_i > 0, ft[ft_i-1].ft_flags has NS_MOREFRAG, and we
1746 		 * have to fix the frags count. */
1747 		frags--;
1748 		ft[ft_i - 1].ft_flags &= ~NS_MOREFRAG;
1749 		ft[ft_i - frags].ft_frags = frags;
1750 		D("Truncate incomplete fragment at %d (%d frags)", ft_i, frags);
1751 	}
1752 	if (ft_i)
1753 		ft_i = nm_bdg_flush(ft, ft_i, na, ring_nr);
1754 	BDG_RUNLOCK(b);
1755 	return j;
1756 }
1757 
1758 
1759 /* ----- FreeBSD if_bridge hash function ------- */
1760 
1761 /*
1762  * The following hash function is adapted from "Hash Functions" by Bob Jenkins
1763  * ("Algorithm Alley", Dr. Dobbs Journal, September 1997).
1764  *
1765  * http://www.burtleburtle.net/bob/hash/spooky.html
1766  */
1767 #define mix(a, b, c)                                                    \
1768 do {                                                                    \
1769 	a -= b; a -= c; a ^= (c >> 13);                                 \
1770 	b -= c; b -= a; b ^= (a << 8);                                  \
1771 	c -= a; c -= b; c ^= (b >> 13);                                 \
1772 	a -= b; a -= c; a ^= (c >> 12);                                 \
1773 	b -= c; b -= a; b ^= (a << 16);                                 \
1774 	c -= a; c -= b; c ^= (b >> 5);                                  \
1775 	a -= b; a -= c; a ^= (c >> 3);                                  \
1776 	b -= c; b -= a; b ^= (a << 10);                                 \
1777 	c -= a; c -= b; c ^= (b >> 15);                                 \
1778 } while (/*CONSTCOND*/0)
1779 
1780 
1781 static __inline uint32_t
1782 nm_bridge_rthash(const uint8_t *addr)
1783 {
1784 	uint32_t a = 0x9e3779b9, b = 0x9e3779b9, c = 0; // hash key
1785 
1786 	b += addr[5] << 8;
1787 	b += addr[4];
1788 	a += addr[3] << 24;
1789 	a += addr[2] << 16;
1790 	a += addr[1] << 8;
1791 	a += addr[0];
1792 
1793 	mix(a, b, c);
1794 #define BRIDGE_RTHASH_MASK	(NM_BDG_HASH-1)
1795 	return (c & BRIDGE_RTHASH_MASK);
1796 }
1797 
1798 #undef mix
1799 
1800 
1801 /* nm_register callback for VALE ports */
1802 static int
1803 netmap_vp_reg(struct netmap_adapter *na, int onoff)
1804 {
1805 	struct netmap_vp_adapter *vpna =
1806 		(struct netmap_vp_adapter*)na;
1807 	enum txrx t;
1808 	int i;
1809 
1810 	/* persistent ports may be put in netmap mode
1811 	 * before being attached to a bridge
1812 	 */
1813 	if (vpna->na_bdg)
1814 		BDG_WLOCK(vpna->na_bdg);
1815 	if (onoff) {
1816 		for_rx_tx(t) {
1817 			for (i = 0; i < netmap_real_rings(na, t); i++) {
1818 				struct netmap_kring *kring = NMR(na, t)[i];
1819 
1820 				if (nm_kring_pending_on(kring))
1821 					kring->nr_mode = NKR_NETMAP_ON;
1822 			}
1823 		}
1824 		if (na->active_fds == 0)
1825 			na->na_flags |= NAF_NETMAP_ON;
1826 		 /* XXX on FreeBSD, persistent VALE ports should also
1827 		 * toggle IFCAP_NETMAP in na->ifp (2014-03-16)
1828 		 */
1829 	} else {
1830 		if (na->active_fds == 0)
1831 			na->na_flags &= ~NAF_NETMAP_ON;
1832 		for_rx_tx(t) {
1833 			for (i = 0; i < netmap_real_rings(na, t); i++) {
1834 				struct netmap_kring *kring = NMR(na, t)[i];
1835 
1836 				if (nm_kring_pending_off(kring))
1837 					kring->nr_mode = NKR_NETMAP_OFF;
1838 			}
1839 		}
1840 	}
1841 	if (vpna->na_bdg)
1842 		BDG_WUNLOCK(vpna->na_bdg);
1843 	return 0;
1844 }
1845 
1846 
1847 /*
1848  * Lookup function for a learning bridge.
1849  * Updates the hash table with the source address,
1850  * then returns the destination port index and the
1851  * ring in *dst_ring (at the moment, ring 0 is always used).
1852  */
1853 uint32_t
1854 netmap_bdg_learning(struct nm_bdg_fwd *ft, uint8_t *dst_ring,
1855 		struct netmap_vp_adapter *na, void *private_data)
1856 {
1857 	uint8_t *buf = ((uint8_t *)ft->ft_buf) + ft->ft_offset;
1858 	u_int buf_len = ft->ft_len - ft->ft_offset;
1859 	struct nm_hash_ent *ht = private_data;
1860 	uint32_t sh, dh;
1861 	u_int dst, mysrc = na->bdg_port;
1862 	uint64_t smac, dmac;
1863 	uint8_t indbuf[12];
1864 
1865 	if (buf_len < 14) {
1866 		return NM_BDG_NOPORT;
1867 	}
1868 
1869 	if (ft->ft_flags & NS_INDIRECT) {
1870 		if (copyin(buf, indbuf, sizeof(indbuf))) {
1871 			return NM_BDG_NOPORT;
1872 		}
1873 		buf = indbuf;
1874 	}
1875 
1876 	dmac = le64toh(*(uint64_t *)(buf)) & 0xffffffffffff;
1877 	smac = le64toh(*(uint64_t *)(buf + 4));
1878 	smac >>= 16;
1879 
1880 	/*
1881 	 * The hash is somewhat expensive, there might be some
1882 	 * worthwhile optimizations here.
1883 	 */
1884 	if (((buf[6] & 1) == 0) && (na->last_smac != smac)) { /* valid src */
1885 		uint8_t *s = buf+6;
1886 		sh = nm_bridge_rthash(s); /* hash of source */
1887 		/* update source port forwarding entry */
1888 		na->last_smac = ht[sh].mac = smac;	/* XXX expire ? */
1889 		ht[sh].ports = mysrc;
1890 		if (netmap_verbose)
1891 		    D("src %02x:%02x:%02x:%02x:%02x:%02x on port %d",
1892 			s[0], s[1], s[2], s[3], s[4], s[5], mysrc);
1893 	}
1894 	dst = NM_BDG_BROADCAST;
1895 	if ((buf[0] & 1) == 0) { /* unicast */
1896 		dh = nm_bridge_rthash(buf); /* hash of dst */
1897 		if (ht[dh].mac == dmac) {	/* found dst */
1898 			dst = ht[dh].ports;
1899 		}
1900 	}
1901 	return dst;
1902 }
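/*
 * Minimal sketch (not compiled in) of an alternative lookup function with
 * the contract that nm_bdg_flush() expects from bdg_ops->lookup: return
 * NM_BDG_NOPORT to drop the packet, NM_BDG_BROADCAST to flood it, or a
 * destination port index, optionally selecting a ring in *dst_ring.
 * The function name and the fixed destination port are hypothetical.
 */
#if 0
static uint32_t
nm_bdg_fixed_lookup(struct nm_bdg_fwd *ft, uint8_t *dst_ring,
		struct netmap_vp_adapter *na, void *private_data)
{
	u_int buf_len = ft->ft_len - ft->ft_offset;

	if (buf_len < 14)		/* not even a full ethernet header */
		return NM_BDG_NOPORT;	/* drop */
	*dst_ring = 0;			/* always deliver on ring 0 */
	return 1;			/* send everything to port 1 */
}
#endif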
1903 
1904 
1905 /*
1906  * Available space in the ring. Only used in VALE code
1907  * and only with is_rx = 1
1908  */
1909 static inline uint32_t
1910 nm_kr_space(struct netmap_kring *k, int is_rx)
1911 {
1912 	int space;
1913 
1914 	if (is_rx) {
1915 		int busy = k->nkr_hwlease - k->nr_hwcur;
1916 		if (busy < 0)
1917 			busy += k->nkr_num_slots;
1918 		space = k->nkr_num_slots - 1 - busy;
1919 	} else {
1920 		/* XXX never used in this branch */
1921 		space = k->nr_hwtail - k->nkr_hwlease;
1922 		if (space < 0)
1923 			space += k->nkr_num_slots;
1924 	}
1925 #if 0
1926 	// sanity check
1927 	if (k->nkr_hwlease >= k->nkr_num_slots ||
1928 		k->nr_hwcur >= k->nkr_num_slots ||
1929 		k->nr_hwtail >= k->nkr_num_slots ||
1930 		busy < 0 ||
1931 		busy >= k->nkr_num_slots) {
1932 		D("invalid kring, cur %d tail %d lease %d lease_idx %d lim %d",
			k->nr_hwcur, k->nr_hwtail, k->nkr_hwlease,
1933 			k->nkr_lease_idx, k->nkr_num_slots);
1934 	}
1935 #endif
1936 	return space;
1937 }
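/*
 * Worked example for the rx case above (the only one used by VALE):
 * with nkr_num_slots == 1024, nr_hwcur == 1000 and nkr_hwlease == 10,
 * busy = 10 - 1000 = -990, corrected to 34 slots already leased, so
 * the available space is 1024 - 1 - 34 = 989 slots.
 */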
1938 
1939 
1940 
1941 
1942 /* make a lease on the kring for N positions. return the
1943  * lease index
1944  * XXX only used in VALE code and with is_rx = 1
1945  */
1946 static inline uint32_t
1947 nm_kr_lease(struct netmap_kring *k, u_int n, int is_rx)
1948 {
1949 	uint32_t lim = k->nkr_num_slots - 1;
1950 	uint32_t lease_idx = k->nkr_lease_idx;
1951 
1952 	k->nkr_leases[lease_idx] = NR_NOSLOT;
1953 	k->nkr_lease_idx = nm_next(lease_idx, lim);
1954 
1955 	if (n > nm_kr_space(k, is_rx)) {
1956 		D("invalid request for %d slots", n);
1957 		panic("x");
1958 	}
1959 	/* XXX verify that there are n slots */
1960 	k->nkr_hwlease += n;
1961 	if (k->nkr_hwlease > lim)
1962 		k->nkr_hwlease -= lim + 1;
1963 
1964 	if (k->nkr_hwlease >= k->nkr_num_slots ||
1965 		k->nr_hwcur >= k->nkr_num_slots ||
1966 		k->nr_hwtail >= k->nkr_num_slots ||
1967 		k->nkr_lease_idx >= k->nkr_num_slots) {
1968 		D("invalid kring %s, cur %d tail %d lease %d lease_idx %d lim %d",
1969 			k->na->name,
1970 			k->nr_hwcur, k->nr_hwtail, k->nkr_hwlease,
1971 			k->nkr_lease_idx, k->nkr_num_slots);
1972 	}
1973 	return lease_idx;
1974 }
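/*
 * Condensed sketch (not compiled in) of the lease protocol that
 * nm_bdg_flush() below follows on a destination rx kring: reserve slots
 * under the queue lock, copy without holding it, then report completion
 * through nkr_leases[] and, if all earlier leases are done, advance
 * nr_hwtail. 'needed' and 'j' stand for the caller's own bookkeeping.
 */
#if 0
	mtx_lock(&kring->q_lock);
	j = kring->nkr_hwlease;			/* first reserved slot */
	howmany = nm_kr_space(kring, 1);	/* slots we can claim */
	if (needed < howmany)
		howmany = needed;
	lease_idx = nm_kr_lease(kring, howmany, 1);
	mtx_unlock(&kring->q_lock);

	/* ... copy up to 'howmany' packets into slots starting at 'j' ... */

	mtx_lock(&kring->q_lock);
	kring->nkr_leases[lease_idx] = j;	/* final write position */
	/* if all leases before this one are complete, advance nr_hwtail */
	mtx_unlock(&kring->q_lock);
#endif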
1975 
1976 /*
1977  *
1978  * This flush routine supports only unicast and broadcast but a large
1979  * number of ports, and lets us replace the learn and dispatch functions.
1980  */
1981 int
1982 nm_bdg_flush(struct nm_bdg_fwd *ft, u_int n, struct netmap_vp_adapter *na,
1983 		u_int ring_nr)
1984 {
1985 	struct nm_bdg_q *dst_ents, *brddst;
1986 	uint16_t num_dsts = 0, *dsts;
1987 	struct nm_bridge *b = na->na_bdg;
1988 	u_int i, me = na->bdg_port;
1989 
1990 	/*
1991 	 * The work area (pointed to by ft) is followed by an array of
1992 	 * pointers to queues, dst_ents; there are NM_BDG_MAXRINGS
1993 	 * queues per port plus one for the broadcast traffic.
1994 	 * Then we have an array of destination indexes.
1995 	 */
1996 	dst_ents = (struct nm_bdg_q *)(ft + NM_BDG_BATCH_MAX);
1997 	dsts = (uint16_t *)(dst_ents + NM_BDG_MAXPORTS * NM_BDG_MAXRINGS + 1);
1998 
1999 	/* first pass: find a destination for each packet in the batch */
2000 	for (i = 0; likely(i < n); i += ft[i].ft_frags) {
2001 		uint8_t dst_ring = ring_nr; /* default, same ring as origin */
2002 		uint16_t dst_port, d_i;
2003 		struct nm_bdg_q *d;
2004 		struct nm_bdg_fwd *start_ft = NULL;
2005 
2006 		ND("slot %d frags %d", i, ft[i].ft_frags);
2007 
2008 		if (na->up.virt_hdr_len < ft[i].ft_len) {
2009 			ft[i].ft_offset = na->up.virt_hdr_len;
2010 			start_ft = &ft[i];
2011 		} else if (na->up.virt_hdr_len == ft[i].ft_len && ft[i].ft_flags & NS_MOREFRAG) {
2012 			ft[i].ft_offset = ft[i].ft_len;
2013 			start_ft = &ft[i+1];
2014 		} else {
2015 			/* Drop the packet if the virtio-net header is neither contained
2016 			 * in the first fragment nor at the very beginning of the second.
2017 			 */
2018 			continue;
2019 		}
2020 		dst_port = b->bdg_ops->lookup(start_ft, &dst_ring, na, b->private_data);
2021 		if (netmap_verbose > 255)
2022 			RD(5, "slot %d port %d -> %d", i, me, dst_port);
2023 		if (dst_port >= NM_BDG_NOPORT)
2024 			continue; /* this packet is to be dropped */
2025 		else if (dst_port == NM_BDG_BROADCAST)
2026 			dst_ring = 0; /* broadcasts always go to ring 0 */
2027 		else if (unlikely(dst_port == me ||
2028 		    !b->bdg_ports[dst_port]))
2029 			continue;
2030 
2031 		/* get a position in the scratch pad */
2032 		d_i = dst_port * NM_BDG_MAXRINGS + dst_ring;
2033 		d = dst_ents + d_i;
2034 
2035 		/* append the first fragment to the list */
2036 		if (d->bq_head == NM_FT_NULL) { /* new destination */
2037 			d->bq_head = d->bq_tail = i;
2038 			/* remember this position to be scanned later */
2039 			if (dst_port != NM_BDG_BROADCAST)
2040 				dsts[num_dsts++] = d_i;
2041 		} else {
2042 			ft[d->bq_tail].ft_next = i;
2043 			d->bq_tail = i;
2044 		}
2045 		d->bq_len += ft[i].ft_frags;
2046 	}
2047 
2048 	/*
2049 	 * Broadcast traffic goes to ring 0 on all destinations.
2050 	 * So we need to add these rings to the list of ports to scan.
2051 	 * XXX at the moment we scan all the active ports, which can be
2052 	 * expensive. We should keep a compact list of active destinations
2053 	 * so we could shorten this loop.
2054 	 */
2055 	brddst = dst_ents + NM_BDG_BROADCAST * NM_BDG_MAXRINGS;
2056 	if (brddst->bq_head != NM_FT_NULL) {
2057 		u_int j;
2058 		for (j = 0; likely(j < b->bdg_active_ports); j++) {
2059 			uint16_t d_i;
2060 			i = b->bdg_port_index[j];
2061 			if (unlikely(i == me))
2062 				continue;
2063 			d_i = i * NM_BDG_MAXRINGS;
2064 			if (dst_ents[d_i].bq_head == NM_FT_NULL)
2065 				dsts[num_dsts++] = d_i;
2066 		}
2067 	}
2068 
2069 	ND(5, "pass 1 done %d pkts %d dsts", n, num_dsts);
2070 	/* second pass: scan destinations */
2071 	for (i = 0; i < num_dsts; i++) {
2072 		struct netmap_vp_adapter *dst_na;
2073 		struct netmap_kring *kring;
2074 		struct netmap_ring *ring;
2075 		u_int dst_nr, lim, j, d_i, next, brd_next;
2076 		u_int needed, howmany;
2077 		int retry = netmap_txsync_retry;
2078 		struct nm_bdg_q *d;
2079 		uint32_t my_start = 0, lease_idx = 0;
2080 		int nrings;
2081 		int virt_hdr_mismatch = 0;
2082 
2083 		d_i = dsts[i];
2084 		ND("second pass %d port %d", i, d_i);
2085 		d = dst_ents + d_i;
2086 		// XXX fix the division
2087 		dst_na = b->bdg_ports[d_i/NM_BDG_MAXRINGS];
2088 		/* protect from the lookup function returning an inactive
2089 		 * destination port
2090 		 */
2091 		if (unlikely(dst_na == NULL))
2092 			goto cleanup;
2093 		if (dst_na->up.na_flags & NAF_SW_ONLY)
2094 			goto cleanup;
2095 		/*
2096 		 * The interface may be in !netmap mode in two cases:
2097 		 * - when na is attached but not activated yet;
2098 		 * - when na is being deactivated but is still attached.
2099 		 */
2100 		if (unlikely(!nm_netmap_on(&dst_na->up))) {
2101 			ND("not in netmap mode!");
2102 			goto cleanup;
2103 		}
2104 
2105 		/* there is at least one packet, either unicast or broadcast */
2106 		brd_next = brddst->bq_head;
2107 		next = d->bq_head;
2108 		/* we need to reserve this many slots. If fewer are
2109 		 * available, some packets will be dropped.
2110 		 * Packets may have multiple fragments, so there is a chance
2111 		 * that we may not use all of the slots we have claimed,
2112 		 * and we will need to handle the leftover
2113 		 * ones when we regain the lock.
2114 		 */
2115 		needed = d->bq_len + brddst->bq_len;
2116 
2117 		if (unlikely(dst_na->up.virt_hdr_len != na->up.virt_hdr_len)) {
2118 			if (netmap_verbose) {
2119 				RD(3, "virt_hdr_mismatch, src %d dst %d", na->up.virt_hdr_len,
2120 						dst_na->up.virt_hdr_len);
2121 			}
2122 			/* There is a virtio-net header/offloadings mismatch between
2123 			 * source and destination. The slower mismatch datapath will
2124 			 * be used to cope with all the mismatches.
2125 			 */
2126 			virt_hdr_mismatch = 1;
2127 			if (dst_na->mfs < na->mfs) {
2128 				/* We may need to do segmentation offloadings, and so
2129 				 * we may need a number of destination slots greater
2130 				 * than the number of input slots ('needed').
2131 				 *	needed * na->mfs + x * H <= x * dst_na->mfs
2132 				 *	needed * na->mfs + x * H <= x * na->mfs
2133 				 * where 'H' is the length of the longest header that may
2134 				 * be replicated in the segmentation process (e.g. for
2135 				 * TCPv4 we must account for ethernet header, IP header
2136 				 * and TCPv4 header).
2137 				 */
2138 				KASSERT(dst_na->mfs > 0, ("vpna->mfs is 0"));
2139 				needed = (needed * na->mfs) /
2140 						(dst_na->mfs - WORST_CASE_GSO_HEADER) + 1;
2141 				ND(3, "srcmtu=%u, dstmtu=%u, x=%u", na->mfs, dst_na->mfs, needed);
2142 			}
2143 		}
2144 
2145 		ND(5, "pass 2 dst %d is %x %s",
2146 			i, d_i, is_vp ? "virtual" : "nic/host");
2147 		dst_nr = d_i & (NM_BDG_MAXRINGS-1);
2148 		nrings = dst_na->up.num_rx_rings;
2149 		if (dst_nr >= nrings)
2150 			dst_nr = dst_nr % nrings;
2151 		kring = dst_na->up.rx_rings[dst_nr];
2152 		ring = kring->ring;
2153 		/* the destination ring may not have been opened for RX */
2154 		if (unlikely(ring == NULL || kring->nr_mode != NKR_NETMAP_ON))
2155 			goto cleanup;
2156 		lim = kring->nkr_num_slots - 1;
2157 
2158 retry:
2159 
2160 		if (dst_na->retry && retry) {
2161 			/* try to get some free slot from the previous run */
2162 			kring->nm_notify(kring, 0);
2163 			/* actually useful only for bwraps, since there
2164 			 * the notify will trigger a txsync on the hwna. VALE ports
2165 			 * have dst_na->retry == 0
2166 			 */
2167 		}
2168 		/* reserve the buffers in the queue and an entry
2169 		 * to report completion, then drop the lock.
2170 		 * XXX this might become a helper function.
2171 		 */
2172 		mtx_lock(&kring->q_lock);
2173 		if (kring->nkr_stopped) {
2174 			mtx_unlock(&kring->q_lock);
2175 			goto cleanup;
2176 		}
2177 		my_start = j = kring->nkr_hwlease;
2178 		howmany = nm_kr_space(kring, 1);
2179 		if (needed < howmany)
2180 			howmany = needed;
2181 		lease_idx = nm_kr_lease(kring, howmany, 1);
2182 		mtx_unlock(&kring->q_lock);
2183 
2184 		/* only retry if we need more than available slots */
2185 		if (retry && needed <= howmany)
2186 			retry = 0;
2187 
2188 		/* copy to the destination queue */
2189 		while (howmany > 0) {
2190 			struct netmap_slot *slot;
2191 			struct nm_bdg_fwd *ft_p, *ft_end;
2192 			u_int cnt;
2193 
2194 			/* find the queue from which we pick next packet.
2195 			 * NM_FT_NULL is always higher than valid indexes
2196 			 * so we never dereference it if the other list
2197 			 * has packets (and if both are empty we never
2198 			 * get here).
2199 			 */
2200 			if (next < brd_next) {
2201 				ft_p = ft + next;
2202 				next = ft_p->ft_next;
2203 			} else { /* insert broadcast */
2204 				ft_p = ft + brd_next;
2205 				brd_next = ft_p->ft_next;
2206 			}
2207 			cnt = ft_p->ft_frags; // cnt > 0
2208 			if (unlikely(cnt > howmany))
2209 			    break; /* no more space */
2210 			if (netmap_verbose && cnt > 1)
2211 				RD(5, "rx %d frags to %d", cnt, j);
2212 			ft_end = ft_p + cnt;
2213 			if (unlikely(virt_hdr_mismatch)) {
2214 				bdg_mismatch_datapath(na, dst_na, ft_p, ring, &j, lim, &howmany);
2215 			} else {
2216 				howmany -= cnt;
2217 				do {
2218 					char *dst, *src = ft_p->ft_buf;
2219 					size_t copy_len = ft_p->ft_len, dst_len = copy_len;
2220 
2221 					slot = &ring->slot[j];
2222 					dst = NMB(&dst_na->up, slot);
2223 
2224 					ND("send [%d] %d(%d) bytes at %s:%d",
2225 							i, (int)copy_len, (int)dst_len,
2226 							NM_IFPNAME(dst_ifp), j);
2227 					/* round to a multiple of 64 */
2228 					copy_len = (copy_len + 63) & ~63;
2229 
2230 					if (unlikely(copy_len > NETMAP_BUF_SIZE(&dst_na->up) ||
2231 						     copy_len > NETMAP_BUF_SIZE(&na->up))) {
2232 						RD(5, "invalid len %d, down to 64", (int)copy_len);
2233 						copy_len = dst_len = 64; // XXX
2234 					}
2235 					if (ft_p->ft_flags & NS_INDIRECT) {
2236 						if (copyin(src, dst, copy_len)) {
2237 							// invalid user pointer, pretend len is 0
2238 							dst_len = 0;
2239 						}
2240 					} else {
2241 						//memcpy(dst, src, copy_len);
2242 						pkt_copy(src, dst, (int)copy_len);
2243 					}
2244 					slot->len = dst_len;
2245 					slot->flags = (cnt << 8)| NS_MOREFRAG;
2246 					j = nm_next(j, lim);
2247 					needed--;
2248 					ft_p++;
2249 				} while (ft_p != ft_end);
2250 				slot->flags = (cnt << 8); /* clear NS_MOREFRAG on the last entry */
2251 			}
2252 			/* are we done ? */
2253 			if (next == NM_FT_NULL && brd_next == NM_FT_NULL)
2254 				break;
2255 		}
2256 		{
2257 		    /* current position */
2258 		    uint32_t *p = kring->nkr_leases; /* shorthand */
2259 		    uint32_t update_pos;
2260 		    int still_locked = 1;
2261 
2262 		    mtx_lock(&kring->q_lock);
2263 		    if (unlikely(howmany > 0)) {
2264 			/* we have not used all the buffers. If I am the last one
2265 			 * I can recover the slots, otherwise I must
2266 			 * fill them with 0 to mark empty packets.
2267 			 */
2268 			ND("leftover %d bufs", howmany);
2269 			if (nm_next(lease_idx, lim) == kring->nkr_lease_idx) {
2270 			    /* yes i am the last one */
2271 			    ND("roll back nkr_hwlease to %d", j);
2272 			    kring->nkr_hwlease = j;
2273 			} else {
2274 			    while (howmany-- > 0) {
2275 				ring->slot[j].len = 0;
2276 				ring->slot[j].flags = 0;
2277 				j = nm_next(j, lim);
2278 			    }
2279 			}
2280 		    }
2281 		    p[lease_idx] = j; /* report I am done */
2282 
2283 		    update_pos = kring->nr_hwtail;
2284 
2285 		    if (my_start == update_pos) {
2286 			/* all slots before my_start have been reported,
2287 			 * so scan subsequent leases to see if other ranges
2288 			 * have been completed, and do a selwakeup or txsync.
2289 			 */
2290 			while (lease_idx != kring->nkr_lease_idx &&
2291 				p[lease_idx] != NR_NOSLOT) {
2292 			    j = p[lease_idx];
2293 			    p[lease_idx] = NR_NOSLOT;
2294 			    lease_idx = nm_next(lease_idx, lim);
2295 			}
2296 			/* j is the new 'write' position. j != my_start
2297 			 * means there are new buffers to report
2298 			 */
2299 			if (likely(j != my_start)) {
2300 				kring->nr_hwtail = j;
2301 				still_locked = 0;
2302 				mtx_unlock(&kring->q_lock);
2303 				kring->nm_notify(kring, 0);
2304 				/* this is netmap_notify for VALE ports and
2305 				 * netmap_bwrap_notify for bwrap. The latter will
2306 				 * trigger a txsync on the underlying hwna
2307 				 */
2308 				if (dst_na->retry && retry--) {
2309 					/* XXX this is going to call nm_notify again.
2310 					 * Only useful for bwrap in virtual machines
2311 					 */
2312 					goto retry;
2313 				}
2314 			}
2315 		    }
2316 		    if (still_locked)
2317 			mtx_unlock(&kring->q_lock);
2318 		}
2319 cleanup:
2320 		d->bq_head = d->bq_tail = NM_FT_NULL; /* cleanup */
2321 		d->bq_len = 0;
2322 	}
2323 	brddst->bq_head = brddst->bq_tail = NM_FT_NULL; /* cleanup */
2324 	brddst->bq_len = 0;
2325 	return 0;
2326 }
2327 
2328 /* nm_txsync callback for VALE ports */
2329 static int
2330 netmap_vp_txsync(struct netmap_kring *kring, int flags)
2331 {
2332 	struct netmap_vp_adapter *na =
2333 		(struct netmap_vp_adapter *)kring->na;
2334 	u_int done;
2335 	u_int const lim = kring->nkr_num_slots - 1;
2336 	u_int const head = kring->rhead;
2337 
2338 	if (bridge_batch <= 0) { /* testing only */
2339 		done = head; // used all
2340 		goto done;
2341 	}
2342 	if (!na->na_bdg) {
2343 		done = head;
2344 		goto done;
2345 	}
2346 	if (bridge_batch > NM_BDG_BATCH)
2347 		bridge_batch = NM_BDG_BATCH;
2348 
2349 	done = nm_bdg_preflush(kring, head);
2350 done:
2351 	if (done != head)
2352 		D("early break at %d/ %d, tail %d", done, head, kring->nr_hwtail);
2353 	/*
2354 	 * packets between 'done' and 'cur' are left unsent.
2355 	 */
2356 	kring->nr_hwcur = done;
2357 	kring->nr_hwtail = nm_prev(done, lim);
2358 	if (netmap_verbose)
2359 		D("%s ring %d flags %d", na->up.name, kring->ring_id, flags);
2360 	return 0;
2361 }
2362 
2363 
2364 /* rxsync code used by the VALE ports' nm_rxsync callback and also
2365  * internally by the bwrap
2366  */
2367 static int
2368 netmap_vp_rxsync_locked(struct netmap_kring *kring, int flags)
2369 {
2370 	struct netmap_adapter *na = kring->na;
2371 	struct netmap_ring *ring = kring->ring;
2372 	u_int nm_i, lim = kring->nkr_num_slots - 1;
2373 	u_int head = kring->rhead;
2374 	int n;
2375 
2376 	if (head > lim) {
2377 		D("ouch dangerous reset!!!");
2378 		n = netmap_ring_reinit(kring);
2379 		goto done;
2380 	}
2381 
2382 	/* First part, import newly received packets. */
2383 	/* actually nothing to do here, they are already in the kring */
2384 
2385 	/* Second part, skip past packets that userspace has released. */
2386 	nm_i = kring->nr_hwcur;
2387 	if (nm_i != head) {
2388 		/* consistency check, but nothing really important here */
2389 		for (n = 0; likely(nm_i != head); n++) {
2390 			struct netmap_slot *slot = &ring->slot[nm_i];
2391 			void *addr = NMB(na, slot);
2392 
2393 			if (addr == NETMAP_BUF_BASE(kring->na)) { /* bad buf */
2394 				D("bad buffer index %d, ignore ?",
2395 					slot->buf_idx);
2396 			}
2397 			slot->flags &= ~NS_BUF_CHANGED;
2398 			nm_i = nm_next(nm_i, lim);
2399 		}
2400 		kring->nr_hwcur = head;
2401 	}
2402 
2403 	n = 0;
2404 done:
2405 	return n;
2406 }
2407 
2408 /*
2409  * nm_rxsync callback for VALE ports, i.e. a
2410  * user process reading from a VALE switch.
2411  * Already protected against concurrent calls from userspace,
2412  * but we must acquire the queue's lock to protect against
2413  * writers on the same queue.
2414  */
2415 static int
2416 netmap_vp_rxsync(struct netmap_kring *kring, int flags)
2417 {
2418 	int n;
2419 
2420 	mtx_lock(&kring->q_lock);
2421 	n = netmap_vp_rxsync_locked(kring, flags);
2422 	mtx_unlock(&kring->q_lock);
2423 	return n;
2424 }
2425 
2426 
2427 /* nm_bdg_attach callback for VALE ports
2428  * The na_vp port is this same netmap_adapter. There is no host port.
2429  */
2430 static int
2431 netmap_vp_bdg_attach(const char *name, struct netmap_adapter *na)
2432 {
2433 	struct netmap_vp_adapter *vpna = (struct netmap_vp_adapter *)na;
2434 
2435 	if (vpna->na_bdg) {
2436 		return netmap_bwrap_attach(name, na);
2437 	}
2438 	na->na_vp = vpna;
2439 	strncpy(na->name, name, sizeof(na->name));
2440 	na->na_hostvp = NULL;
2441 	return 0;
2442 }
2443 
2444 /* create a netmap_vp_adapter that describes a VALE port.
2445  * Only persistent VALE ports have a non-null ifp.
2446  */
2447 static int
2448 netmap_vp_create(struct nmreq_header *hdr, struct ifnet *ifp,
2449 		struct netmap_mem_d *nmd, struct netmap_vp_adapter **ret)
2450 {
2451 	struct nmreq_register *req = (struct nmreq_register *)(uintptr_t)hdr->nr_body;
2452 	struct netmap_vp_adapter *vpna;
2453 	struct netmap_adapter *na;
2454 	int error = 0;
2455 	u_int npipes = 0;
2456 	u_int extrabufs = 0;
2457 
2458 	if (hdr->nr_reqtype != NETMAP_REQ_REGISTER) {
2459 		return EINVAL;
2460 	}
2461 
2462 	vpna = nm_os_malloc(sizeof(*vpna));
2463 	if (vpna == NULL)
2464 		return ENOMEM;
2465 
2466  	na = &vpna->up;
2467 
2468 	na->ifp = ifp;
2469 	strncpy(na->name, hdr->nr_name, sizeof(na->name));
2470 
2471 	/* bound checking */
2472 	na->num_tx_rings = req->nr_tx_rings;
2473 	nm_bound_var(&na->num_tx_rings, 1, 1, NM_BDG_MAXRINGS, NULL);
2474 	req->nr_tx_rings = na->num_tx_rings; /* write back */
2475 	na->num_rx_rings = req->nr_rx_rings;
2476 	nm_bound_var(&na->num_rx_rings, 1, 1, NM_BDG_MAXRINGS, NULL);
2477 	req->nr_rx_rings = na->num_rx_rings; /* write back */
2478 	nm_bound_var(&req->nr_tx_slots, NM_BRIDGE_RINGSIZE,
2479 			1, NM_BDG_MAXSLOTS, NULL);
2480 	na->num_tx_desc = req->nr_tx_slots;
2481 	nm_bound_var(&req->nr_rx_slots, NM_BRIDGE_RINGSIZE,
2482 			1, NM_BDG_MAXSLOTS, NULL);
2483 	/* validate number of pipes. We want at least 1,
2484 	 * but probably can do with some more.
2485 	 * So let's use 2 as default (when 0 is supplied)
2486 	 */
2487 	nm_bound_var(&npipes, 2, 1, NM_MAXPIPES, NULL);
2488 	/* validate extra bufs */
2489 	nm_bound_var(&extrabufs, 0, 0,
2490 			128*NM_BDG_MAXSLOTS, NULL);
2491 	req->nr_extra_bufs = extrabufs; /* write back */
2492 	na->num_rx_desc = req->nr_rx_slots;
2493 	/* Set the mfs to a default value, as it is needed on the VALE
2494 	 * mismatch datapath. XXX We should set it according to the MTU
2495 	 * known to the kernel. */
2496 	vpna->mfs = NM_BDG_MFS_DEFAULT;
2497 	vpna->last_smac = ~0llu;
2498 	/*if (vpna->mfs > netmap_buf_size)  TODO netmap_buf_size is zero??
2499 		vpna->mfs = netmap_buf_size; */
2500 	if (netmap_verbose)
2501 		D("max frame size %u", vpna->mfs);
2502 
2503 	na->na_flags |= NAF_BDG_MAYSLEEP;
2504 	/* persistent VALE ports look like hw devices
2505 	 * with a native netmap adapter
2506 	 */
2507 	if (ifp)
2508 		na->na_flags |= NAF_NATIVE;
2509 	na->nm_txsync = netmap_vp_txsync;
2510 	na->nm_rxsync = netmap_vp_rxsync;
2511 	na->nm_register = netmap_vp_reg;
2512 	na->nm_krings_create = netmap_vp_krings_create;
2513 	na->nm_krings_delete = netmap_vp_krings_delete;
2514 	na->nm_dtor = netmap_vp_dtor;
2515 	ND("nr_mem_id %d", req->nr_mem_id);
2516 	na->nm_mem = nmd ?
2517 		netmap_mem_get(nmd):
2518 		netmap_mem_private_new(
2519 			na->num_tx_rings, na->num_tx_desc,
2520 			na->num_rx_rings, na->num_rx_desc,
2521 			req->nr_extra_bufs, npipes, &error);
2522 	if (na->nm_mem == NULL)
2523 		goto err;
2524 	na->nm_bdg_attach = netmap_vp_bdg_attach;
2525 	/* other nmd fields are set in the common routine */
2526 	error = netmap_attach_common(na);
2527 	if (error)
2528 		goto err;
2529 	*ret = vpna;
2530 	return 0;
2531 
2532 err:
2533 	if (na->nm_mem != NULL)
2534 		netmap_mem_put(na->nm_mem);
2535 	nm_os_free(vpna);
2536 	return error;
2537 }
2538 
2539 /* Bridge wrapper code (bwrap).
2540  * This is used to connect a non-VALE-port netmap_adapter (hwna) to a
2541  * VALE switch.
2542  * The main task is to swap the meaning of tx and rx rings to match the
2543  * expectations of the VALE switch code (see nm_bdg_flush).
2544  *
2545  * The bwrap works by interposing a netmap_bwrap_adapter between the
2546  * rest of the system and the hwna. The netmap_bwrap_adapter looks like
2547  * a netmap_vp_adapter to the rest of the system, but, internally, it
2548  * translates all callbacks to what the hwna expects.
2549  *
2550  * Note that we have to intercept callbacks coming from two sides:
2551  *
2552  *  - callbacks coming from the netmap module are intercepted by
2553  *    passing around the netmap_bwrap_adapter instead of the hwna
2554  *
2555  *  - callbacks coming from outside of the netmap module only know
2556  *    about the hwna. This, however, only happens in interrupt
2557  *    handlers, where only the hwna->nm_notify callback is called.
2558  *    What the bwrap does is to overwrite the hwna->nm_notify callback
2559  *    with its own netmap_bwrap_intr_notify.
2560  *    XXX This assumes that the hwna->nm_notify callback was the
2561  *    standard netmap_notify(), as is the case for NIC adapters.
2562  *    Any additional action performed by hwna->nm_notify will not be
2563  *    performed by netmap_bwrap_intr_notify.
2564  *
2565  * Additionally, the bwrap can optionally attach the host ring pair
2566  * of the wrapped adapter to a different port of the switch.
2567  */
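/*
 * Condensed sketch (not compiled in) of the notify interception described
 * above; the actual code lives in netmap_bwrap_reg() below.
 */
#if 0
	/* on register: divert hwna rx interrupts into the switch */
	hwna->rx_rings[i]->save_notify = hwna->rx_rings[i]->nm_notify;
	hwna->rx_rings[i]->nm_notify = netmap_bwrap_intr_notify;

	/* on unregister: restore the original callback */
	hwna->rx_rings[i]->nm_notify = hwna->rx_rings[i]->save_notify;
	hwna->rx_rings[i]->save_notify = NULL;
#endif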
2568 
2569 
2570 static void
2571 netmap_bwrap_dtor(struct netmap_adapter *na)
2572 {
2573 	struct netmap_bwrap_adapter *bna = (struct netmap_bwrap_adapter*)na;
2574 	struct netmap_adapter *hwna = bna->hwna;
2575 	struct nm_bridge *b = bna->up.na_bdg,
2576 		*bh = bna->host.na_bdg;
2577 
2578 	if (bna->host.up.nm_mem)
2579 		netmap_mem_put(bna->host.up.nm_mem);
2580 
2581 	if (b) {
2582 		netmap_bdg_detach_common(b, bna->up.bdg_port,
2583 			    (bh ? bna->host.bdg_port : -1));
2584 	}
2585 
2586 	ND("na %p", na);
2587 	na->ifp = NULL;
2588 	bna->host.up.ifp = NULL;
2589 	hwna->na_vp = bna->saved_na_vp;
2590 	hwna->na_hostvp = NULL;
2591 	hwna->na_private = NULL;
2592 	hwna->na_flags &= ~NAF_BUSY;
2593 	netmap_adapter_put(hwna);
2594 
2595 }
2596 
2597 
2598 /*
2599  * Intr callback for NICs connected to a bridge.
2600  * Simply ignore tx interrupts (maybe we could try to recover space ?)
2601  * and pass received packets from nic to the bridge.
2602  *
2603  * XXX TODO check locking: this is called from the interrupt
2604  * handler so we should make sure that the interface is not
2605  * disconnected while passing down an interrupt.
2606  *
2607  * Note, no user process can access this NIC or the host stack.
2608  * The only parts of the ring that are significant are the slots,
2609  * and head/cur/tail are set from the kring as needed
2610  * (part as a receive ring, part as a transmit ring).
2611  *
2612  * callback that overwrites the hwna notify callback.
2613  * Packets come from the outside or from the host stack and are put on an
2614  * hwna rx ring.
2615  * The bridge wrapper then sends the packets through the bridge.
2616  */
2617 static int
2618 netmap_bwrap_intr_notify(struct netmap_kring *kring, int flags)
2619 {
2620 	struct netmap_adapter *na = kring->na;
2621 	struct netmap_bwrap_adapter *bna = na->na_private;
2622 	struct netmap_kring *bkring;
2623 	struct netmap_vp_adapter *vpna = &bna->up;
2624 	u_int ring_nr = kring->ring_id;
2625 	int ret = NM_IRQ_COMPLETED;
2626 	int error;
2627 
2628 	if (netmap_verbose)
2629 	    D("%s %s 0x%x", na->name, kring->name, flags);
2630 
2631 	bkring = vpna->up.tx_rings[ring_nr];
2632 
2633 	/* make sure the ring is not disabled */
2634 	if (nm_kr_tryget(kring, 0 /* can't sleep */, NULL)) {
2635 		return EIO;
2636 	}
2637 
2638 	if (netmap_verbose)
2639 	    D("%s head %d cur %d tail %d",  na->name,
2640 		kring->rhead, kring->rcur, kring->rtail);
2641 
2642 	/* simulate a user wakeup on the rx ring
2643 	 * fetch packets that have arrived.
2644 	 */
2645 	error = kring->nm_sync(kring, 0);
2646 	if (error)
2647 		goto put_out;
2648 	if (kring->nr_hwcur == kring->nr_hwtail) {
2649 		if (netmap_verbose)
2650 			D("how strange, interrupt with no packets on %s",
2651 			    na->name);
2652 		goto put_out;
2653 	}
2654 
2655 	/* new packets are kring->rcur to kring->nr_hwtail, and the bkring
2656 	 * had hwcur == bkring->rhead. So advance bkring->rhead to kring->nr_hwtail
2657 	 * to push all packets out.
2658 	 */
2659 	bkring->rhead = bkring->rcur = kring->nr_hwtail;
2660 
2661 	netmap_vp_txsync(bkring, flags);
2662 
2663 	/* mark all buffers as released on this ring */
2664 	kring->rhead = kring->rcur = kring->rtail = kring->nr_hwtail;
2665 	/* another call to actually release the buffers */
2666 	error = kring->nm_sync(kring, 0);
2667 
2668 	/* The second rxsync may have further advanced hwtail. If this happens,
2669 	 *  return NM_IRQ_RESCHED, otherwise just return NM_IRQ_COMPLETED. */
2670 	if (kring->rcur != kring->nr_hwtail) {
2671 		ret = NM_IRQ_RESCHED;
2672 	}
2673 put_out:
2674 	nm_kr_put(kring);
2675 
2676 	return error ? error : ret;
2677 }
2678 
2679 
2680 /* nm_register callback for bwrap */
2681 static int
2682 netmap_bwrap_reg(struct netmap_adapter *na, int onoff)
2683 {
2684 	struct netmap_bwrap_adapter *bna =
2685 		(struct netmap_bwrap_adapter *)na;
2686 	struct netmap_adapter *hwna = bna->hwna;
2687 	struct netmap_vp_adapter *hostna = &bna->host;
2688 	int error, i;
2689 	enum txrx t;
2690 
2691 	ND("%s %s", na->name, onoff ? "on" : "off");
2692 
2693 	if (onoff) {
2694 		/* netmap_do_regif has been called on the bwrap na.
2695 		 * We need to pass the information about the
2696 		 * memory allocator down to the hwna before
2697 		 * putting it in netmap mode
2698 		 */
2699 		hwna->na_lut = na->na_lut;
2700 
2701 		if (hostna->na_bdg) {
2702 			/* if the host rings have been attached to the switch,
2703 			 * we need to copy the memory allocator information
2704 			 * into the hostna as well
2705 			 */
2706 			hostna->up.na_lut = na->na_lut;
2707 		}
2708 
2709 	}
2710 
2711 	/* pass down the pending ring state information */
2712 	for_rx_tx(t) {
2713 		for (i = 0; i < nma_get_nrings(na, t) + 1; i++)
2714 			NMR(hwna, t)[i]->nr_pending_mode =
2715 				NMR(na, t)[i]->nr_pending_mode;
2716 	}
2717 
2718 	/* forward the request to the hwna */
2719 	error = hwna->nm_register(hwna, onoff);
2720 	if (error)
2721 		return error;
2722 
2723 	/* copy up the current ring state information */
2724 	for_rx_tx(t) {
2725 		for (i = 0; i < nma_get_nrings(na, t) + 1; i++) {
2726 			struct netmap_kring *kring = NMR(hwna, t)[i];
2727 			NMR(na, t)[i]->nr_mode = kring->nr_mode;
2728 		}
2729 	}
2730 
2731 	/* impersonate a netmap_vp_adapter */
2732 	netmap_vp_reg(na, onoff);
2733 	if (hostna->na_bdg)
2734 		netmap_vp_reg(&hostna->up, onoff);
2735 
2736 	if (onoff) {
2737 		u_int i;
2738 		/* intercept the hwna nm_notify callback on the hw rings */
2739 		for (i = 0; i < hwna->num_rx_rings; i++) {
2740 			hwna->rx_rings[i]->save_notify = hwna->rx_rings[i]->nm_notify;
2741 			hwna->rx_rings[i]->nm_notify = netmap_bwrap_intr_notify;
2742 		}
2743 		i = hwna->num_rx_rings; /* for safety */
2744 		/* save the host ring notify unconditionally */
2745 		hwna->rx_rings[i]->save_notify = hwna->rx_rings[i]->nm_notify;
2746 		if (hostna->na_bdg) {
2747 			/* also intercept the host ring notify */
2748 			hwna->rx_rings[i]->nm_notify = netmap_bwrap_intr_notify;
2749 		}
2750 		if (na->active_fds == 0)
2751 			na->na_flags |= NAF_NETMAP_ON;
2752 	} else {
2753 		u_int i;
2754 
2755 		if (na->active_fds == 0)
2756 			na->na_flags &= ~NAF_NETMAP_ON;
2757 
2758 		/* reset all notify callbacks (including host ring) */
2759 		for (i = 0; i <= hwna->num_rx_rings; i++) {
2760 			hwna->rx_rings[i]->nm_notify = hwna->rx_rings[i]->save_notify;
2761 			hwna->rx_rings[i]->save_notify = NULL;
2762 		}
2763 		hwna->na_lut.lut = NULL;
2764 		hwna->na_lut.plut = NULL;
2765 		hwna->na_lut.objtotal = 0;
2766 		hwna->na_lut.objsize = 0;
2767 
2768 		/* pass ownership of the netmap rings to the hwna */
2769 		for_rx_tx(t) {
2770 			for (i = 0; i < nma_get_nrings(na, t) + 1; i++) {
2771 				NMR(na, t)[i]->ring = NULL;
2772 			}
2773 		}
2774 
2775 	}
2776 
2777 	return 0;
2778 }
2779 
2780 /* nm_config callback for bwrap */
2781 static int
2782 netmap_bwrap_config(struct netmap_adapter *na, struct nm_config_info *info)
2783 {
2784 	struct netmap_bwrap_adapter *bna =
2785 		(struct netmap_bwrap_adapter *)na;
2786 	struct netmap_adapter *hwna = bna->hwna;
2787 	int error;
2788 
2789 	/* Forward the request to the hwna. It may happen that nobody
2790 	 * registered hwna yet, so netmap_mem_get_lut() may not have
2791 	 * been called yet. */
2792 	error = netmap_mem_get_lut(hwna->nm_mem, &hwna->na_lut);
2793 	if (error)
2794 		return error;
2795 	netmap_update_config(hwna);
2796 	/* swap the results and propagate */
2797 	info->num_tx_rings = hwna->num_rx_rings;
2798 	info->num_tx_descs = hwna->num_rx_desc;
2799 	info->num_rx_rings = hwna->num_tx_rings;
2800 	info->num_rx_descs = hwna->num_tx_desc;
2801 	info->rx_buf_maxsize = hwna->rx_buf_maxsize;
2802 
2803 	return 0;
2804 }
2805 
2806 
2807 /* nm_krings_create callback for bwrap */
2808 static int
2809 netmap_bwrap_krings_create(struct netmap_adapter *na)
2810 {
2811 	struct netmap_bwrap_adapter *bna =
2812 		(struct netmap_bwrap_adapter *)na;
2813 	struct netmap_adapter *hwna = bna->hwna;
2814 	struct netmap_adapter *hostna = &bna->host.up;
2815 	int i, error = 0;
2816 	enum txrx t;
2817 
2818 	ND("%s", na->name);
2819 
2820 	/* impersonate a netmap_vp_adapter */
2821 	error = netmap_vp_krings_create(na);
2822 	if (error)
2823 		return error;
2824 
2825 	/* also create the hwna krings */
2826 	error = hwna->nm_krings_create(hwna);
2827 	if (error) {
2828 		goto err_del_vp_rings;
2829 	}
2830 
2831 	/* increment the usage counter for all the hwna krings */
2832 	for_rx_tx(t) {
2833 		for (i = 0; i < nma_get_nrings(hwna, t) + 1; i++) {
2834 			NMR(hwna, t)[i]->users++;
2835 		}
2836 	}
2837 
2838 	/* now create the actual rings */
2839 	error = netmap_mem_rings_create(hwna);
2840 	if (error) {
2841 		goto err_dec_users;
2842 	}
2843 
2844 	/* cross-link the netmap rings
2845 	 * The original number of rings comes from hwna,
2846 	 * the rx rings on one side correspond to the tx rings on the other.
2847 	 */
2848 	for_rx_tx(t) {
2849 		enum txrx r = nm_txrx_swap(t); /* swap NR_TX <-> NR_RX */
2850 		for (i = 0; i < nma_get_nrings(hwna, r) + 1; i++) {
2851 			NMR(na, t)[i]->nkr_num_slots = NMR(hwna, r)[i]->nkr_num_slots;
2852 			NMR(na, t)[i]->ring = NMR(hwna, r)[i]->ring;
2853 		}
2854 	}
2855 
2856 	if (na->na_flags & NAF_HOST_RINGS) {
2857 		/* the hostna rings are the host rings of the bwrap.
2858 		 * The corresponding krings must point back to the
2859 		 * hostna
2860 		 */
2861 		hostna->tx_rings = &na->tx_rings[na->num_tx_rings];
2862 		hostna->tx_rings[0]->na = hostna;
2863 		hostna->rx_rings = &na->rx_rings[na->num_rx_rings];
2864 		hostna->rx_rings[0]->na = hostna;
2865 	}
2866 
2867 	return 0;
2868 
2869 err_dec_users:
	/* undo the users++ done above on all the hwna krings */
2870 	for_rx_tx(t) {
		for (i = 0; i < nma_get_nrings(hwna, t) + 1; i++)
2871 			NMR(hwna, t)[i]->users--;
2872 	}
2873 	hwna->nm_krings_delete(hwna);
2874 err_del_vp_rings:
2875 	netmap_vp_krings_delete(na);
2876 
2877 	return error;
2878 }
2879 
2880 
2881 static void
2882 netmap_bwrap_krings_delete(struct netmap_adapter *na)
2883 {
2884 	struct netmap_bwrap_adapter *bna =
2885 		(struct netmap_bwrap_adapter *)na;
2886 	struct netmap_adapter *hwna = bna->hwna;
2887 	enum txrx t;
2888 	int i;
2889 
2890 	ND("%s", na->name);
2891 
2892 	/* decrement the usage counter for all the hwna krings */
2893 	for_rx_tx(t) {
2894 		for (i = 0; i < nma_get_nrings(hwna, t) + 1; i++) {
2895 			NMR(hwna, t)[i]->users--;
2896 		}
2897 	}
2898 
2899 	/* delete any netmap rings that are no longer needed */
2900 	netmap_mem_rings_delete(hwna);
2901 	hwna->nm_krings_delete(hwna);
2902 	netmap_vp_krings_delete(na);
2903 }
2904 
2905 
2906 /* notify method for the bridge-->hwna direction */
2907 static int
2908 netmap_bwrap_notify(struct netmap_kring *kring, int flags)
2909 {
2910 	struct netmap_adapter *na = kring->na;
2911 	struct netmap_bwrap_adapter *bna = na->na_private;
2912 	struct netmap_adapter *hwna = bna->hwna;
2913 	u_int ring_n = kring->ring_id;
2914 	u_int lim = kring->nkr_num_slots - 1;
2915 	struct netmap_kring *hw_kring;
2916 	int error;
2917 
2918 	ND("%s: na %s hwna %s",
2919 			(kring ? kring->name : "NULL!"),
2920 			(na ? na->name : "NULL!"),
2921 			(hwna ? hwna->name : "NULL!"));
2922 	hw_kring = hwna->tx_rings[ring_n];
2923 
2924 	if (nm_kr_tryget(hw_kring, 0, NULL)) {
2925 		return ENXIO;
2926 	}
2927 
2928 	/* first step: simulate a user wakeup on the rx ring */
2929 	netmap_vp_rxsync(kring, flags);
2930 	ND("%s[%d] PRE rx(c%3d t%3d l%3d) ring(h%3d c%3d t%3d) tx(c%3d ht%3d t%3d)",
2931 		na->name, ring_n,
2932 		kring->nr_hwcur, kring->nr_hwtail, kring->nkr_hwlease,
2933 		ring->head, ring->cur, ring->tail,
2934 		hw_kring->nr_hwcur, hw_kring->nr_hwtail, hw_kring->rtail);
2935 	/* second step: the new packets are sent on the tx ring
2936 	 * (which is actually the same ring)
2937 	 */
2938 	hw_kring->rhead = hw_kring->rcur = kring->nr_hwtail;
2939 	error = hw_kring->nm_sync(hw_kring, flags);
2940 	if (error)
2941 		goto put_out;
2942 
2943 	/* third step: now we are back on the rx ring */
2944 	/* claim ownership on all hw owned bufs */
2945 	kring->rhead = kring->rcur = nm_next(hw_kring->nr_hwtail, lim); /* skip past reserved slot */
2946 
2947 	/* fourth step: the user goes to sleep again, causing another rxsync */
2948 	netmap_vp_rxsync(kring, flags);
2949 	ND("%s[%d] PST rx(c%3d t%3d l%3d) ring(h%3d c%3d t%3d) tx(c%3d ht%3d t%3d)",
2950 		na->name, ring_n,
2951 		kring->nr_hwcur, kring->nr_hwtail, kring->nkr_hwlease,
2952 		ring->head, ring->cur, ring->tail,
2953 		hw_kring->nr_hwcur, hw_kring->nr_hwtail, hw_kring->rtail);
2954 put_out:
2955 	nm_kr_put(hw_kring);
2956 
2957 	return error ? error : NM_IRQ_COMPLETED;
2958 }
2959 
2960 
2961 /* nm_bdg_ctl callback for the bwrap.
2962  * Called on bridge-attach and detach, as an effect of vale-ctl -[ahd].
2963  * On attach, it needs to provide a fake netmap_priv_d structure and
2964  * perform a netmap_do_regif() on the bwrap. This will put both the
2965  * bwrap and the hwna in netmap mode, with the netmap rings shared
2966  * and cross-linked. Moreover, it will start intercepting interrupts
2967  * directed to hwna.
2968  */
2969 static int
2970 netmap_bwrap_bdg_ctl(struct nmreq_header *hdr, struct netmap_adapter *na)
2971 {
2972 	struct netmap_priv_d *npriv;
2973 	struct netmap_bwrap_adapter *bna = (struct netmap_bwrap_adapter*)na;
2974 	int error = 0;
2975 
2976 	if (hdr->nr_reqtype == NETMAP_REQ_VALE_ATTACH) {
2977 		struct nmreq_vale_attach *req =
2978 			(struct nmreq_vale_attach *)(uintptr_t)hdr->nr_body;
2979 		if (req->reg.nr_ringid != 0 ||
2980 			(req->reg.nr_mode != NR_REG_ALL_NIC &&
2981 				req->reg.nr_mode != NR_REG_NIC_SW)) {
2982 			/* We only support attaching all the NIC rings
2983 			 * and/or the host stack. */
2984 			return EINVAL;
2985 		}
2986 		if (NETMAP_OWNED_BY_ANY(na)) {
2987 			return EBUSY;
2988 		}
2989 		if (bna->na_kpriv) {
2990 			/* nothing to do */
2991 			return 0;
2992 		}
2993 		npriv = netmap_priv_new();
2994 		if (npriv == NULL)
2995 			return ENOMEM;
2996 		npriv->np_ifp = na->ifp; /* let the priv destructor release the ref */
2997 		error = netmap_do_regif(npriv, na, req->reg.nr_mode,
2998 					req->reg.nr_ringid, req->reg.nr_flags);
2999 		if (error) {
3000 			netmap_priv_delete(npriv);
3001 			return error;
3002 		}
3003 		bna->na_kpriv = npriv;
3004 		na->na_flags |= NAF_BUSY;
3005 	} else {
3006 		if (na->active_fds == 0) /* not registered */
3007 			return EINVAL;
3008 		netmap_priv_delete(bna->na_kpriv);
3009 		bna->na_kpriv = NULL;
3010 		na->na_flags &= ~NAF_BUSY;
3011 	}
3012 
3013 	return error;
3014 }
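/*
 * For reference, the bdg_ctl callback above is typically exercised from
 * userspace with the vale-ctl tool; the switch and interface names here
 * are examples only:
 *	vale-ctl -a vale0:em0	attach the NIC rings of em0 to switch vale0
 *	vale-ctl -h vale0:em0	attach the NIC rings plus the host stack rings
 *	vale-ctl -d vale0:em0	detach em0 from vale0
 */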
3015 
3016 /* attach a bridge wrapper to the 'real' device */
3017 int
3018 netmap_bwrap_attach(const char *nr_name, struct netmap_adapter *hwna)
3019 {
3020 	struct netmap_bwrap_adapter *bna;
3021 	struct netmap_adapter *na = NULL;
3022 	struct netmap_adapter *hostna = NULL;
3023 	int error = 0;
3024 	enum txrx t;
3025 
3026 	/* make sure the NIC is not already in use */
3027 	if (NETMAP_OWNED_BY_ANY(hwna)) {
3028 		D("NIC %s busy, cannot attach to bridge", hwna->name);
3029 		return EBUSY;
3030 	}
3031 
3032 	bna = nm_os_malloc(sizeof(*bna));
3033 	if (bna == NULL) {
3034 		return ENOMEM;
3035 	}
3036 
3037 	na = &bna->up.up;
3038 	/* make bwrap ifp point to the real ifp */
3039 	na->ifp = hwna->ifp;
3040 	if_ref(na->ifp);
3041 	na->na_private = bna;
3042 	strncpy(na->name, nr_name, sizeof(na->name));
3043 	/* fill the ring data for the bwrap adapter with rx/tx meanings
3044 	 * swapped. The real cross-linking will be done during register,
3045 	 * when all the krings will have been created.
3046 	 */
3047 	for_rx_tx(t) {
3048 		enum txrx r = nm_txrx_swap(t); /* swap NR_TX <-> NR_RX */
3049 		nma_set_nrings(na, t, nma_get_nrings(hwna, r));
3050 		nma_set_ndesc(na, t, nma_get_ndesc(hwna, r));
3051 	}
3052 	na->nm_dtor = netmap_bwrap_dtor;
3053 	na->nm_register = netmap_bwrap_reg;
3054 	// na->nm_txsync = netmap_bwrap_txsync;
3055 	// na->nm_rxsync = netmap_bwrap_rxsync;
3056 	na->nm_config = netmap_bwrap_config;
3057 	na->nm_krings_create = netmap_bwrap_krings_create;
3058 	na->nm_krings_delete = netmap_bwrap_krings_delete;
3059 	na->nm_notify = netmap_bwrap_notify;
3060 	na->nm_bdg_ctl = netmap_bwrap_bdg_ctl;
3061 	na->pdev = hwna->pdev;
3062 	na->nm_mem = netmap_mem_get(hwna->nm_mem);
3063 	na->virt_hdr_len = hwna->virt_hdr_len;
3064 	na->rx_buf_maxsize = hwna->rx_buf_maxsize;
3065 	bna->up.retry = 1; /* XXX maybe this should depend on the hwna */
3066 	/* Set the mfs, needed on the VALE mismatch datapath. */
3067 	bna->up.mfs = NM_BDG_MFS_DEFAULT;
3068 
3069 	bna->hwna = hwna;
3070 	netmap_adapter_get(hwna);
3071 	hwna->na_private = bna; /* weak reference */
3072 	bna->saved_na_vp = hwna->na_vp;
3073 	hwna->na_vp = &bna->up;
3074 	bna->up.up.na_vp = &(bna->up);
3075 
3076 	if (hwna->na_flags & NAF_HOST_RINGS) {
3077 		if (hwna->na_flags & NAF_SW_ONLY)
3078 			na->na_flags |= NAF_SW_ONLY;
3079 		na->na_flags |= NAF_HOST_RINGS;
3080 		hostna = &bna->host.up;
3081 		snprintf(hostna->name, sizeof(hostna->name), "%s^", nr_name);
3082 		hostna->ifp = hwna->ifp;
3083 		for_rx_tx(t) {
3084 			enum txrx r = nm_txrx_swap(t);
3085 			nma_set_nrings(hostna, t, 1);
3086 			nma_set_ndesc(hostna, t, nma_get_ndesc(hwna, r));
3087 		}
3088 		// hostna->nm_txsync = netmap_bwrap_host_txsync;
3089 		// hostna->nm_rxsync = netmap_bwrap_host_rxsync;
3090 		hostna->nm_notify = netmap_bwrap_notify;
3091 		hostna->nm_mem = netmap_mem_get(na->nm_mem);
3092 		hostna->na_private = bna;
3093 		hostna->na_vp = &bna->up;
3094 		na->na_hostvp = hwna->na_hostvp =
3095 			hostna->na_hostvp = &bna->host;
3096 		hostna->na_flags = NAF_BUSY; /* prevent NIOCREGIF */
3097 		hostna->rx_buf_maxsize = hwna->rx_buf_maxsize;
3098 		bna->host.mfs = NM_BDG_MFS_DEFAULT;
3099 	}
3100 
3101 	ND("%s<->%s txr %d txd %d rxr %d rxd %d",
3102 		na->name, ifp->if_xname,
3103 		na->num_tx_rings, na->num_tx_desc,
3104 		na->num_rx_rings, na->num_rx_desc);
3105 
3106 	error = netmap_attach_common(na);
3107 	if (error) {
3108 		goto err_free;
3109 	}
3110 	hwna->na_flags |= NAF_BUSY;
3111 	return 0;
3112 
3113 err_free:
3114 	hwna->na_vp = hwna->na_hostvp = NULL;
3115 	netmap_adapter_put(hwna);
3116 	nm_os_free(bna);
3117 	return error;
3118 
3119 }
3120 
3121 struct nm_bridge *
3122 netmap_init_bridges2(u_int n)
3123 {
3124 	int i;
3125 	struct nm_bridge *b;
3126 
3127 	b = nm_os_malloc(sizeof(struct nm_bridge) * n);
3128 	if (b == NULL)
3129 		return NULL;
3130 	for (i = 0; i < n; i++)
3131 		BDG_RWINIT(&b[i]);
3132 	return b;
3133 }
3134 
3135 void
3136 netmap_uninit_bridges2(struct nm_bridge *b, u_int n)
3137 {
3138 	int i;
3139 
3140 	if (b == NULL)
3141 		return;
3142 
3143 	for (i = 0; i < n; i++)
3144 		BDG_RWDESTROY(&b[i]);
3145 	nm_os_free(b);
3146 }
3147 
3148 int
3149 netmap_init_bridges(void)
3150 {
3151 #ifdef CONFIG_NET_NS
3152 	return netmap_bns_register();
3153 #else
3154 	nm_bridges = netmap_init_bridges2(NM_BRIDGES);
3155 	if (nm_bridges == NULL)
3156 		return ENOMEM;
3157 	return 0;
3158 #endif
3159 }
3160 
3161 void
3162 netmap_uninit_bridges(void)
3163 {
3164 #ifdef CONFIG_NET_NS
3165 	netmap_bns_unregister();
3166 #else
3167 	netmap_uninit_bridges2(nm_bridges, NM_BRIDGES);
3168 #endif
3169 }
3170 #endif /* WITH_VALE */
3171