xref: /freebsd/sys/dev/netmap/netmap_vale.c (revision def7fe87e9b28032572ca6f820a260677fd0c2d5)
1 /*
2  * Copyright (C) 2013-2016 Universita` di Pisa
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  *   1. Redistributions of source code must retain the above copyright
9  *      notice, this list of conditions and the following disclaimer.
10  *   2. Redistributions in binary form must reproduce the above copyright
11  *      notice, this list of conditions and the following disclaimer in the
12  *      documentation and/or other materials provided with the distribution.
13  *
14  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24  * SUCH DAMAGE.
25  */
26 
27 
28 /*
29  * This module implements the VALE switch for netmap
30 
31 --- VALE SWITCH ---
32 
33 NMG_LOCK() serializes all modifications to switches and ports.
34 A switch cannot be deleted until all ports are gone.
35 
For each switch, an SX lock (RWlock on linux) protects
deletion of ports. When configuring a new port or deleting an
existing one, the lock is acquired in exclusive mode (after
holding NMG_LOCK).
When forwarding, the lock is acquired in shared mode (without NMG_LOCK).
The lock is held throughout the entire forwarding cycle,
during which the thread may incur a page fault.
Hence it is important that sleepable shared locks are used.
43 
On the rx ring, the per-port lock is grabbed initially to reserve
a number of slots in the ring, then the lock is released,
packets are copied from source to destination, and then
the lock is acquired again and the receive ring is updated.
(A similar thing is done on the tx ring for NIC and host stack
ports attached to the switch.)
50 
51  */
52 
53 /*
54  * OS-specific code that is used only within this file.
55  * Other OS-specific code that must be accessed by drivers
56  * is present in netmap_kern.h
57  */
58 
59 #if defined(__FreeBSD__)
60 #include <sys/cdefs.h> /* prerequisite */
61 __FBSDID("$FreeBSD$");
62 
63 #include <sys/types.h>
64 #include <sys/errno.h>
65 #include <sys/param.h>	/* defines used in kernel.h */
66 #include <sys/kernel.h>	/* types used in module initialization */
67 #include <sys/conf.h>	/* cdevsw struct, UID, GID */
68 #include <sys/sockio.h>
69 #include <sys/socketvar.h>	/* struct socket */
70 #include <sys/malloc.h>
71 #include <sys/poll.h>
72 #include <sys/rwlock.h>
73 #include <sys/socket.h> /* sockaddrs */
74 #include <sys/selinfo.h>
75 #include <sys/sysctl.h>
76 #include <net/if.h>
77 #include <net/if_var.h>
78 #include <net/bpf.h>		/* BIOCIMMEDIATE */
79 #include <machine/bus.h>	/* bus_dmamap_* */
80 #include <sys/endian.h>
81 #include <sys/refcount.h>
82 
83 
/*
 * FreeBSD flavour of the per-bridge lock wrappers. All of them operate
 * on the bdg_lock member of the bridge descriptor passed as 'b'.
 */
/* type of the bdg_lock member */
#define BDG_RWLOCK_T		struct rwlock // struct rwlock

/* initialize the lock; RW_NOWITNESS is passed to rw_init_flags() */
#define	BDG_RWINIT(b)		\
	rw_init_flags(&(b)->bdg_lock, "bdg lock", RW_NOWITNESS)
/* exclusive (writer) acquire / release */
#define BDG_WLOCK(b)		rw_wlock(&(b)->bdg_lock)
#define BDG_WUNLOCK(b)		rw_wunlock(&(b)->bdg_lock)
/* shared (reader) acquire, non-blocking attempt, and release */
#define BDG_RLOCK(b)		rw_rlock(&(b)->bdg_lock)
#define BDG_RTRYLOCK(b)		rw_try_rlock(&(b)->bdg_lock)
#define BDG_RUNLOCK(b)		rw_runlock(&(b)->bdg_lock)
/* tear the lock down */
#define BDG_RWDESTROY(b)	rw_destroy(&(b)->bdg_lock)
94 
95 
96 #elif defined(linux)
97 
98 #include "bsd_glue.h"
99 
100 #elif defined(__APPLE__)
101 
102 #warning OSX support is only partial
103 #include "osx_glue.h"
104 
105 #elif defined(_WIN32)
106 #include "win_glue.h"
107 
108 #else
109 
110 #error	Unsupported platform
111 
112 #endif /* unsupported */
113 
114 /*
115  * common headers
116  */
117 
118 #include <net/netmap.h>
119 #include <dev/netmap/netmap_kern.h>
120 #include <dev/netmap/netmap_mem2.h>
121 
122 #ifdef WITH_VALE
123 
124 /*
125  * system parameters (most of them in netmap_kern.h)
126  * NM_BDG_NAME	prefix for switch port names, default "vale"
127  * NM_BDG_MAXPORTS	number of ports
128  * NM_BRIDGES	max number of switches in the system.
129  *	XXX should become a sysctl or tunable
130  *
131  * Switch ports are named valeX:Y where X is the switch name and Y
132  * is the port. If Y matches a physical interface name, the port is
133  * connected to a physical device.
134  *
135  * Unlike physical interfaces, switch ports use their own memory region
136  * for rings and buffers.
137  * The virtual interfaces use per-queue lock instead of core lock.
138  * In the tx loop, we aggregate traffic in batches to make all operations
139  * faster. The batch size is bridge_batch.
140  */
/* Compile-time sizing of the switch and of its per-ring forwarding tables. */
#define NM_BDG_MAXRINGS		16	/* XXX unclear how many. */
#define NM_BDG_MAXSLOTS		4096	/* XXX same as above */
#define NM_BRIDGE_RINGSIZE	1024	/* in the device */
#define NM_BDG_HASH		1024	/* forwarding table entries */
#define NM_BDG_BATCH		1024	/* entries in the forwarding buffer */
#define NM_MULTISEG		64	/* max size of a chain of bufs */
/* actual size of the tables: a full batch plus one maximum-length chain */
#define NM_BDG_BATCH_MAX	(NM_BDG_BATCH + NM_MULTISEG)
/* NM_FT_NULL terminates a list of slots in the ft; it is one past the
 * last valid table index, so it can never name a real entry.
 */
#define NM_FT_NULL		NM_BDG_BATCH_MAX
151 
152 
153 /*
154  * bridge_batch is set via sysctl to the max batch size to be
155  * used in the bridge. The actual value may be larger as the
156  * last packet in the block may overflow the size.
157  */
158 static int bridge_batch = NM_BDG_BATCH; /* bridge batch size */
159 SYSBEGIN(vars_vale);
160 SYSCTL_DECL(_dev_netmap);
161 SYSCTL_INT(_dev_netmap, OID_AUTO, bridge_batch, CTLFLAG_RW, &bridge_batch, 0 , "");
162 SYSEND;
163 
/*
 * Forward declarations: these functions are referenced in this file
 * before their definitions appear.
 */
static int netmap_vp_create(struct nmreq *, struct ifnet *, struct netmap_vp_adapter **);
static int netmap_vp_reg(struct netmap_adapter *na, int onoff);
static int netmap_bwrap_reg(struct netmap_adapter *, int onoff);
167 
/*
 * For each output interface, nm_bdg_q is used to construct a list.
 * bq_len is the number of output buffers (we can have coalescing
 * during the copy).
 *
 * An empty queue has bq_head == bq_tail == NM_FT_NULL and bq_len == 0;
 * that is the state nm_alloc_bdgfwd() sets up for every queue.
 */
struct nm_bdg_q {
	uint16_t bq_head;	/* first entry of the list, or NM_FT_NULL */
	uint16_t bq_tail;	/* last entry of the list, or NM_FT_NULL */
	uint32_t bq_len;	/* number of buffers */
};
178 
/*
 * One entry of the bridge forwarding table (the ht[] array in
 * struct nm_bridge).
 * XXX revise this
 */
struct nm_hash_ent {
	uint64_t	mac;	/* the top 2 bytes are the epoch */
	uint64_t	ports;	/* NOTE(review): presumably the destination port(s)
				 * learned for this MAC - confirm against the
				 * lookup function */
};
184 
/*
 * nm_bridge is a descriptor for a VALE switch.
 * Interfaces for a bridge are all in bdg_ports[].
 * The array has fixed size, an empty entry does not terminate
 * the search, but lookups only occur on attach/detach so we
 * don't mind if they are slow.
 *
 * The bridge is non blocking on the transmit ports: excess
 * packets are dropped if there is no room on the output port.
 *
 * bdg_lock protects accesses to the bdg_ports array.
 * This is a rw lock (or equivalent).
 */
struct nm_bridge {
	/* XXX what is the proper alignment/layout ? */
	BDG_RWLOCK_T	bdg_lock;	/* protects bdg_ports */
	int		bdg_namelen;	/* number of significant chars in bdg_basename */
	uint32_t	bdg_active_ports; /* 0 means free */
	char		bdg_basename[IFNAMSIZ];	/* bridge name, i.e. the part before ':' */

	/* Indexes of active ports (up to active_ports)
	 * and all other remaining ports.
	 * The first bdg_active_ports entries name the slots of bdg_ports[]
	 * that are in use; the rest are the free slots.
	 */
	uint8_t		bdg_port_index[NM_BDG_MAXPORTS];

	/* the ports themselves; unused slots are NULL */
	struct netmap_vp_adapter *bdg_ports[NM_BDG_MAXPORTS];


	/*
	 * The function to decide the destination port.
	 * It returns either of an index of the destination port,
	 * NM_BDG_BROADCAST to broadcast this packet, or NM_BDG_NOPORT not to
	 * forward this packet.  ring_nr is the source ring index, and the
	 * function may overwrite this value to forward this packet to a
	 * different ring index.
	 * This function must be set by netmap_bdg_ctl().
	 */
	struct netmap_bdg_ops bdg_ops;

	/* the forwarding table, MAC+ports.
	 * XXX should be changed to an argument to be passed to
	 * the lookup function, and allocated on attach
	 */
	struct nm_hash_ent ht[NM_BDG_HASH];

#ifdef CONFIG_NET_NS
	struct net *ns;
#endif /* CONFIG_NET_NS */
};
234 
235 const char*
236 netmap_bdg_name(struct netmap_vp_adapter *vp)
237 {
238 	struct nm_bridge *b = vp->na_bdg;
239 	if (b == NULL)
240 		return NULL;
241 	return b->bdg_basename;
242 }
243 
244 
#ifndef CONFIG_NET_NS
/*
 * XXX in principle nm_bridges could be created dynamically
 * Right now we have a static array and deletions are protected
 * by an exclusive lock.
 *
 * Only defined when network namespaces (CONFIG_NET_NS) are not in use.
 */
static struct nm_bridge *nm_bridges;
#endif /* !CONFIG_NET_NS */
253 
254 
/*
 * Copy 'l' bytes from _src to _dst, rounding the length up to the next
 * multiple of 64 bytes; this is often faster than dealing with odd
 * sizes. Both buffers must have room for the rounded-up length and
 * must not overlap.
 *
 * Lengths of 1024 bytes or more are handed to memcpy() unrounded;
 * a length <= 0 copies nothing.
 */
static inline void
pkt_copy(void *_src, void *_dst, int l)
{
	uint64_t *from = _src;
	uint64_t *to = _dst;

	if (unlikely(l >= 1024)) {
		memcpy(to, from, l);
		return;
	}
	while (likely(l > 0)) {
		int w;

		/* one 64-byte chunk: eight 64-bit words */
		for (w = 0; w < 8; w++)
			*to++ = *from++;
		l -= 64;
	}
}
283 
284 
/*
 * Tell whether 'c' may appear in a VALE bridge/port identifier:
 * an ASCII letter, an ASCII digit or an underscore.
 * Returns 1 if it may, 0 otherwise.
 */
static int
nm_is_id_char(const char c)
{
	if (c >= '0' && c <= '9')
		return 1;
	if (c >= 'A' && c <= 'Z')
		return 1;
	if (c >= 'a' && c <= 'z')
		return 1;
	return c == '_';
}
293 
294 /* Validate the name of a VALE bridge port and return the
295  * position of the ":" character. */
296 static int
297 nm_vale_name_validate(const char *name)
298 {
299 	int colon_pos = -1;
300 	int i;
301 
302 	if (!name || strlen(name) < strlen(NM_BDG_NAME)) {
303 		return -1;
304 	}
305 
306 	for (i = 0; name[i]; i++) {
307 		if (name[i] == ':') {
308 			if (colon_pos != -1) {
309 				return -1;
310 			}
311 			colon_pos = i;
312 		} else if (!nm_is_id_char(name[i])) {
313 			return -1;
314 		}
315 	}
316 
317 	if (i >= IFNAMSIZ) {
318 		return -1;
319 	}
320 
321 	return colon_pos;
322 }
323 
324 /*
325  * locate a bridge among the existing ones.
326  * MUST BE CALLED WITH NMG_LOCK()
327  *
328  * a ':' in the name terminates the bridge name. Otherwise, just NM_NAME.
329  * We assume that this is called with a name of at least NM_NAME chars.
330  */
331 static struct nm_bridge *
332 nm_find_bridge(const char *name, int create)
333 {
334 	int i, namelen;
335 	struct nm_bridge *b = NULL, *bridges;
336 	u_int num_bridges;
337 
338 	NMG_LOCK_ASSERT();
339 
340 	netmap_bns_getbridges(&bridges, &num_bridges);
341 
342 	namelen = nm_vale_name_validate(name);
343 	if (namelen < 0) {
344 		D("invalid bridge name %s", name ? name : NULL);
345 		return NULL;
346 	}
347 
348 	/* lookup the name, remember empty slot if there is one */
349 	for (i = 0; i < num_bridges; i++) {
350 		struct nm_bridge *x = bridges + i;
351 
352 		if (x->bdg_active_ports == 0) {
353 			if (create && b == NULL)
354 				b = x;	/* record empty slot */
355 		} else if (x->bdg_namelen != namelen) {
356 			continue;
357 		} else if (strncmp(name, x->bdg_basename, namelen) == 0) {
358 			ND("found '%.*s' at %d", namelen, name, i);
359 			b = x;
360 			break;
361 		}
362 	}
363 	if (i == num_bridges && b) { /* name not found, can create entry */
364 		/* initialize the bridge */
365 		strncpy(b->bdg_basename, name, namelen);
366 		ND("create new bridge %s with ports %d", b->bdg_basename,
367 			b->bdg_active_ports);
368 		b->bdg_namelen = namelen;
369 		b->bdg_active_ports = 0;
370 		for (i = 0; i < NM_BDG_MAXPORTS; i++)
371 			b->bdg_port_index[i] = i;
372 		/* set the default function */
373 		b->bdg_ops.lookup = netmap_bdg_learning;
374 		/* reset the MAC address table */
375 		bzero(b->ht, sizeof(struct nm_hash_ent) * NM_BDG_HASH);
376 		NM_BNS_GET(b);
377 	}
378 	return b;
379 }
380 
381 
382 /*
383  * Free the forwarding tables for rings attached to switch ports.
384  */
385 static void
386 nm_free_bdgfwd(struct netmap_adapter *na)
387 {
388 	int nrings, i;
389 	struct netmap_kring *kring;
390 
391 	NMG_LOCK_ASSERT();
392 	nrings = na->num_tx_rings;
393 	kring = na->tx_rings;
394 	for (i = 0; i < nrings; i++) {
395 		if (kring[i].nkr_ft) {
396 			free(kring[i].nkr_ft, M_DEVBUF);
397 			kring[i].nkr_ft = NULL; /* protect from freeing twice */
398 		}
399 	}
400 }
401 
402 
403 /*
404  * Allocate the forwarding tables for the rings attached to the bridge ports.
405  */
406 static int
407 nm_alloc_bdgfwd(struct netmap_adapter *na)
408 {
409 	int nrings, l, i, num_dstq;
410 	struct netmap_kring *kring;
411 
412 	NMG_LOCK_ASSERT();
413 	/* all port:rings + broadcast */
414 	num_dstq = NM_BDG_MAXPORTS * NM_BDG_MAXRINGS + 1;
415 	l = sizeof(struct nm_bdg_fwd) * NM_BDG_BATCH_MAX;
416 	l += sizeof(struct nm_bdg_q) * num_dstq;
417 	l += sizeof(uint16_t) * NM_BDG_BATCH_MAX;
418 
419 	nrings = netmap_real_rings(na, NR_TX);
420 	kring = na->tx_rings;
421 	for (i = 0; i < nrings; i++) {
422 		struct nm_bdg_fwd *ft;
423 		struct nm_bdg_q *dstq;
424 		int j;
425 
426 		ft = malloc(l, M_DEVBUF, M_NOWAIT | M_ZERO);
427 		if (!ft) {
428 			nm_free_bdgfwd(na);
429 			return ENOMEM;
430 		}
431 		dstq = (struct nm_bdg_q *)(ft + NM_BDG_BATCH_MAX);
432 		for (j = 0; j < num_dstq; j++) {
433 			dstq[j].bq_head = dstq[j].bq_tail = NM_FT_NULL;
434 			dstq[j].bq_len = 0;
435 		}
436 		kring[i].nkr_ft = ft;
437 	}
438 	return 0;
439 }
440 
441 
/*
 * Remove from bridge b the ports in slots hw and sw
 * (sw can be -1 if not needed).
 *
 * hw and sw are indexes into b->bdg_ports[]. The bridge's dtor
 * callback, if set, is invoked on the hw port only. When the last
 * active port goes away the bridge ops are cleared and the bridge is
 * released through NM_BNS_PUT(), which marks it as free.
 */
static void
netmap_bdg_detach_common(struct nm_bridge *b, int hw, int sw)
{
	/* hw and sw are overwritten with -1 as the ports are found;
	 * keep the original values for the update under the lock.
	 */
	int s_hw = hw, s_sw = sw;
	int i, lim =b->bdg_active_ports;
	uint8_t tmp[NM_BDG_MAXPORTS];

	/*
	 * Algorithm:
	 * make a copy of bdg_port_index;
	 * look up the hw and sw ports in the active part of the copy,
	 * replacing each of them with the last active entry;
	 * decrement the number of active ports accordingly;
	 * acquire BDG_WLOCK() and copy back the array.
	 */

	if (netmap_verbose)
		D("detach %d and %d (lim %d)", hw, sw, lim);
	/* make a copy of the list of active ports, update it,
	 * and then copy back within BDG_WLOCK().
	 */
	memcpy(tmp, b->bdg_port_index, sizeof(tmp));
	/* i is not advanced after a swap: the entry moved into position i
	 * still has to be examined.
	 */
	for (i = 0; (hw >= 0 || sw >= 0) && i < lim; ) {
		if (hw >= 0 && tmp[i] == hw) {
			ND("detach hw %d at %d", hw, i);
			lim--; /* point to last active port */
			tmp[i] = tmp[lim]; /* swap with i */
			tmp[lim] = hw;	/* now this is inactive */
			hw = -1;
		} else if (sw >= 0 && tmp[i] == sw) {
			ND("detach sw %d at %d", sw, i);
			lim--;
			tmp[i] = tmp[lim];
			tmp[lim] = sw;
			sw = -1;
		} else {
			i++;
		}
	}
	/* a port that was requested but not found among the active ones */
	if (hw >= 0 || sw >= 0) {
		D("XXX delete failed hw %d sw %d, should panic...", hw, sw);
	}

	/* publish the new port list with the bridge lock held exclusively */
	BDG_WLOCK(b);
	if (b->bdg_ops.dtor)
		b->bdg_ops.dtor(b->bdg_ports[s_hw]);
	b->bdg_ports[s_hw] = NULL;
	if (s_sw >= 0) {
		b->bdg_ports[s_sw] = NULL;
	}
	memcpy(b->bdg_port_index, tmp, sizeof(tmp));
	b->bdg_active_ports = lim;
	BDG_WUNLOCK(b);

	ND("now %d active ports", lim);
	if (lim == 0) {
		ND("marking bridge %s as free", b->bdg_basename);
		bzero(&b->bdg_ops, sizeof(b->bdg_ops));
		NM_BNS_PUT(b);
	}
}
507 
508 /* nm_bdg_ctl callback for VALE ports */
509 static int
510 netmap_vp_bdg_ctl(struct netmap_adapter *na, struct nmreq *nmr, int attach)
511 {
512 	struct netmap_vp_adapter *vpna = (struct netmap_vp_adapter *)na;
513 	struct nm_bridge *b = vpna->na_bdg;
514 
515 	(void)nmr;	// XXX merge ?
516 	if (attach)
517 		return 0; /* nothing to do */
518 	if (b) {
519 		netmap_set_all_rings(na, 0 /* disable */);
520 		netmap_bdg_detach_common(b, vpna->bdg_port, -1);
521 		vpna->na_bdg = NULL;
522 		netmap_set_all_rings(na, 1 /* enable */);
523 	}
524 	/* I have took reference just for attach */
525 	netmap_adapter_put(na);
526 	return 0;
527 }
528 
529 /* nm_dtor callback for ephemeral VALE ports */
530 static void
531 netmap_vp_dtor(struct netmap_adapter *na)
532 {
533 	struct netmap_vp_adapter *vpna = (struct netmap_vp_adapter*)na;
534 	struct nm_bridge *b = vpna->na_bdg;
535 
536 	ND("%s has %d references", na->name, na->na_refcount);
537 
538 	if (b) {
539 		netmap_bdg_detach_common(b, vpna->bdg_port, -1);
540 	}
541 }
542 
543 /* remove a persistent VALE port from the system */
544 static int
545 nm_vi_destroy(const char *name)
546 {
547 	struct ifnet *ifp;
548 	int error;
549 
550 	ifp = ifunit_ref(name);
551 	if (!ifp)
552 		return ENXIO;
553 	NMG_LOCK();
554 	/* make sure this is actually a VALE port */
555 	if (!NM_NA_VALID(ifp) || NA(ifp)->nm_register != netmap_vp_reg) {
556 		error = EINVAL;
557 		goto err;
558 	}
559 
560 	if (NA(ifp)->na_refcount > 1) {
561 		error = EBUSY;
562 		goto err;
563 	}
564 	NMG_UNLOCK();
565 
566 	D("destroying a persistent vale interface %s", ifp->if_xname);
567 	/* Linux requires all the references are released
568 	 * before unregister
569 	 */
570 	if_rele(ifp);
571 	netmap_detach(ifp);
572 	nm_os_vi_detach(ifp);
573 	return 0;
574 
575 err:
576 	NMG_UNLOCK();
577 	if_rele(ifp);
578 	return error;
579 }
580 
581 /*
582  * Create a virtual interface registered to the system.
583  * The interface will be attached to a bridge later.
584  */
585 static int
586 nm_vi_create(struct nmreq *nmr)
587 {
588 	struct ifnet *ifp;
589 	struct netmap_vp_adapter *vpna;
590 	int error;
591 
592 	/* don't include VALE prefix */
593 	if (!strncmp(nmr->nr_name, NM_BDG_NAME, strlen(NM_BDG_NAME)))
594 		return EINVAL;
595 	ifp = ifunit_ref(nmr->nr_name);
596 	if (ifp) { /* already exist, cannot create new one */
597 		if_rele(ifp);
598 		return EEXIST;
599 	}
600 	error = nm_os_vi_persist(nmr->nr_name, &ifp);
601 	if (error)
602 		return error;
603 
604 	NMG_LOCK();
605 	/* netmap_vp_create creates a struct netmap_vp_adapter */
606 	error = netmap_vp_create(nmr, ifp, &vpna);
607 	if (error) {
608 		D("error %d", error);
609 		nm_os_vi_detach(ifp);
610 		return error;
611 	}
612 	/* persist-specific routines */
613 	vpna->up.nm_bdg_ctl = netmap_vp_bdg_ctl;
614 	netmap_adapter_get(&vpna->up);
615 	NM_ATTACH_NA(ifp, &vpna->up);
616 	NMG_UNLOCK();
617 	D("created %s", ifp->if_xname);
618 	return 0;
619 }
620 
621 /* Try to get a reference to a netmap adapter attached to a VALE switch.
622  * If the adapter is found (or is created), this function returns 0, a
623  * non NULL pointer is returned into *na, and the caller holds a
624  * reference to the adapter.
625  * If an adapter is not found, then no reference is grabbed and the
626  * function returns an error code, or 0 if there is just a VALE prefix
627  * mismatch. Therefore the caller holds a reference when
628  * (*na != NULL && return == 0).
629  */
630 int
631 netmap_get_bdg_na(struct nmreq *nmr, struct netmap_adapter **na, int create)
632 {
633 	char *nr_name = nmr->nr_name;
634 	const char *ifname;
635 	struct ifnet *ifp;
636 	int error = 0;
637 	struct netmap_vp_adapter *vpna, *hostna = NULL;
638 	struct nm_bridge *b;
639 	int i, j, cand = -1, cand2 = -1;
640 	int needed;
641 
642 	*na = NULL;     /* default return value */
643 
644 	/* first try to see if this is a bridge port. */
645 	NMG_LOCK_ASSERT();
646 	if (strncmp(nr_name, NM_BDG_NAME, sizeof(NM_BDG_NAME) - 1)) {
647 		return 0;  /* no error, but no VALE prefix */
648 	}
649 
650 	b = nm_find_bridge(nr_name, create);
651 	if (b == NULL) {
652 		D("no bridges available for '%s'", nr_name);
653 		return (create ? ENOMEM : ENXIO);
654 	}
655 	if (strlen(nr_name) < b->bdg_namelen) /* impossible */
656 		panic("x");
657 
658 	/* Now we are sure that name starts with the bridge's name,
659 	 * lookup the port in the bridge. We need to scan the entire
660 	 * list. It is not important to hold a WLOCK on the bridge
661 	 * during the search because NMG_LOCK already guarantees
662 	 * that there are no other possible writers.
663 	 */
664 
665 	/* lookup in the local list of ports */
666 	for (j = 0; j < b->bdg_active_ports; j++) {
667 		i = b->bdg_port_index[j];
668 		vpna = b->bdg_ports[i];
669 		// KASSERT(na != NULL);
670 		ND("checking %s", vpna->up.name);
671 		if (!strcmp(vpna->up.name, nr_name)) {
672 			netmap_adapter_get(&vpna->up);
673 			ND("found existing if %s refs %d", nr_name)
674 			*na = &vpna->up;
675 			return 0;
676 		}
677 	}
678 	/* not found, should we create it? */
679 	if (!create)
680 		return ENXIO;
681 	/* yes we should, see if we have space to attach entries */
682 	needed = 2; /* in some cases we only need 1 */
683 	if (b->bdg_active_ports + needed >= NM_BDG_MAXPORTS) {
684 		D("bridge full %d, cannot create new port", b->bdg_active_ports);
685 		return ENOMEM;
686 	}
687 	/* record the next two ports available, but do not allocate yet */
688 	cand = b->bdg_port_index[b->bdg_active_ports];
689 	cand2 = b->bdg_port_index[b->bdg_active_ports + 1];
690 	ND("+++ bridge %s port %s used %d avail %d %d",
691 		b->bdg_basename, ifname, b->bdg_active_ports, cand, cand2);
692 
693 	/*
694 	 * try see if there is a matching NIC with this name
695 	 * (after the bridge's name)
696 	 */
697 	ifname = nr_name + b->bdg_namelen + 1;
698 	ifp = ifunit_ref(ifname);
699 	if (!ifp) {
700 		/* Create an ephemeral virtual port
701 		 * This block contains all the ephemeral-specific logics
702 		 */
703 		if (nmr->nr_cmd) {
704 			/* nr_cmd must be 0 for a virtual port */
705 			return EINVAL;
706 		}
707 
708 		/* bdg_netmap_attach creates a struct netmap_adapter */
709 		error = netmap_vp_create(nmr, NULL, &vpna);
710 		if (error) {
711 			D("error %d", error);
712 			free(ifp, M_DEVBUF);
713 			return error;
714 		}
715 		/* shortcut - we can skip get_hw_na(),
716 		 * ownership check and nm_bdg_attach()
717 		 */
718 	} else {
719 		struct netmap_adapter *hw;
720 
721 		error = netmap_get_hw_na(ifp, &hw);
722 		if (error || hw == NULL)
723 			goto out;
724 
725 		/* host adapter might not be created */
726 		error = hw->nm_bdg_attach(nr_name, hw);
727 		if (error)
728 			goto out;
729 		vpna = hw->na_vp;
730 		hostna = hw->na_hostvp;
731 		if (nmr->nr_arg1 != NETMAP_BDG_HOST)
732 			hostna = NULL;
733 	}
734 
735 	BDG_WLOCK(b);
736 	vpna->bdg_port = cand;
737 	ND("NIC  %p to bridge port %d", vpna, cand);
738 	/* bind the port to the bridge (virtual ports are not active) */
739 	b->bdg_ports[cand] = vpna;
740 	vpna->na_bdg = b;
741 	b->bdg_active_ports++;
742 	if (hostna != NULL) {
743 		/* also bind the host stack to the bridge */
744 		b->bdg_ports[cand2] = hostna;
745 		hostna->bdg_port = cand2;
746 		hostna->na_bdg = b;
747 		b->bdg_active_ports++;
748 		ND("host %p to bridge port %d", hostna, cand2);
749 	}
750 	ND("if %s refs %d", ifname, vpna->up.na_refcount);
751 	BDG_WUNLOCK(b);
752 	*na = &vpna->up;
753 	netmap_adapter_get(*na);
754 	return 0;
755 
756 out:
757 	if_rele(ifp);
758 
759 	return error;
760 }
761 
762 
763 /* Process NETMAP_BDG_ATTACH */
764 static int
765 nm_bdg_ctl_attach(struct nmreq *nmr)
766 {
767 	struct netmap_adapter *na;
768 	int error;
769 
770 	NMG_LOCK();
771 
772 	error = netmap_get_bdg_na(nmr, &na, 1 /* create if not exists */);
773 	if (error) /* no device */
774 		goto unlock_exit;
775 
776 	if (na == NULL) { /* VALE prefix missing */
777 		error = EINVAL;
778 		goto unlock_exit;
779 	}
780 
781 	if (NETMAP_OWNED_BY_ANY(na)) {
782 		error = EBUSY;
783 		goto unref_exit;
784 	}
785 
786 	if (na->nm_bdg_ctl) {
787 		/* nop for VALE ports. The bwrap needs to put the hwna
788 		 * in netmap mode (see netmap_bwrap_bdg_ctl)
789 		 */
790 		error = na->nm_bdg_ctl(na, nmr, 1);
791 		if (error)
792 			goto unref_exit;
793 		ND("registered %s to netmap-mode", na->name);
794 	}
795 	NMG_UNLOCK();
796 	return 0;
797 
798 unref_exit:
799 	netmap_adapter_put(na);
800 unlock_exit:
801 	NMG_UNLOCK();
802 	return error;
803 }
804 
805 static inline int
806 nm_is_bwrap(struct netmap_adapter *na)
807 {
808 	return na->nm_register == netmap_bwrap_reg;
809 }
810 
811 /* process NETMAP_BDG_DETACH */
812 static int
813 nm_bdg_ctl_detach(struct nmreq *nmr)
814 {
815 	struct netmap_adapter *na;
816 	int error;
817 
818 	NMG_LOCK();
819 	error = netmap_get_bdg_na(nmr, &na, 0 /* don't create */);
820 	if (error) { /* no device, or another bridge or user owns the device */
821 		goto unlock_exit;
822 	}
823 
824 	if (na == NULL) { /* VALE prefix missing */
825 		error = EINVAL;
826 		goto unlock_exit;
827 	} else if (nm_is_bwrap(na) &&
828 		   ((struct netmap_bwrap_adapter *)na)->na_polling_state) {
829 		/* Don't detach a NIC with polling */
830 		error = EBUSY;
831 		netmap_adapter_put(na);
832 		goto unlock_exit;
833 	}
834 	if (na->nm_bdg_ctl) {
835 		/* remove the port from bridge. The bwrap
836 		 * also needs to put the hwna in normal mode
837 		 */
838 		error = na->nm_bdg_ctl(na, nmr, 0);
839 	}
840 
841 	netmap_adapter_put(na);
842 unlock_exit:
843 	NMG_UNLOCK();
844 	return error;
845 
846 }
847 
struct nm_bdg_polling_state;
/*
 * Per-kthread polling context: one per polling kernel thread.
 * The thread polls the RX rings in the range [qfirst, qlast) of the
 * hardware adapter reached through bps (see netmap_bwrap_polling()).
 */
struct
nm_bdg_kthread {
	struct nm_kthread *nmk;	/* handle returned by nm_os_kthread_create() */
	u_int qfirst;		/* first ring polled */
	u_int qlast;		/* one past the last ring polled */
	struct nm_bdg_polling_state *bps;	/* back pointer to the owner */
};
856 
/*
 * Polling state of a bwrap adapter, stored in its na_polling_state
 * while polling mode is active.
 */
struct nm_bdg_polling_state {
	bool configured;	/* set once the kthreads have been created */
	bool stopped;		/* true while the kthreads are not running */
	struct netmap_bwrap_adapter *bna;	/* adapter being polled */
	u_int reg;		/* NR_REG_ONE_NIC or NR_REG_ALL_NIC */
	u_int qfirst;		/* first ring covered by the request */
	u_int qlast;		/* one past the last ring covered */
	u_int cpu_from;		/* CPU the first kthread is bound to */
	u_int ncpus;		/* number of kthreads (one per CPU) */
	struct nm_bdg_kthread *kthreads;	/* array of ncpus entries */
};
868 
869 static void
870 netmap_bwrap_polling(void *data)
871 {
872 	struct nm_bdg_kthread *nbk = data;
873 	struct netmap_bwrap_adapter *bna;
874 	u_int qfirst, qlast, i;
875 	struct netmap_kring *kring0, *kring;
876 
877 	if (!nbk)
878 		return;
879 	qfirst = nbk->qfirst;
880 	qlast = nbk->qlast;
881 	bna = nbk->bps->bna;
882 	kring0 = NMR(bna->hwna, NR_RX);
883 
884 	for (i = qfirst; i < qlast; i++) {
885 		kring = kring0 + i;
886 		kring->nm_notify(kring, 0);
887 	}
888 }
889 
890 static int
891 nm_bdg_create_kthreads(struct nm_bdg_polling_state *bps)
892 {
893 	struct nm_kthread_cfg kcfg;
894 	int i, j;
895 
896 	bps->kthreads = malloc(sizeof(struct nm_bdg_kthread) * bps->ncpus,
897 				M_DEVBUF, M_NOWAIT | M_ZERO);
898 	if (bps->kthreads == NULL)
899 		return ENOMEM;
900 
901 	bzero(&kcfg, sizeof(kcfg));
902 	kcfg.worker_fn = netmap_bwrap_polling;
903 	for (i = 0; i < bps->ncpus; i++) {
904 		struct nm_bdg_kthread *t = bps->kthreads + i;
905 		int all = (bps->ncpus == 1 && bps->reg == NR_REG_ALL_NIC);
906 		int affinity = bps->cpu_from + i;
907 
908 		t->bps = bps;
909 		t->qfirst = all ? bps->qfirst /* must be 0 */: affinity;
910 		t->qlast = all ? bps->qlast : t->qfirst + 1;
911 		D("kthread %d a:%u qf:%u ql:%u", i, affinity, t->qfirst,
912 			t->qlast);
913 
914 		kcfg.type = i;
915 		kcfg.worker_private = t;
916 		t->nmk = nm_os_kthread_create(&kcfg);
917 		if (t->nmk == NULL) {
918 			goto cleanup;
919 		}
920 		nm_os_kthread_set_affinity(t->nmk, affinity);
921 	}
922 	return 0;
923 
924 cleanup:
925 	for (j = 0; j < i; j++) {
926 		struct nm_bdg_kthread *t = bps->kthreads + i;
927 		nm_os_kthread_delete(t->nmk);
928 	}
929 	free(bps->kthreads, M_DEVBUF);
930 	return EFAULT;
931 }
932 
933 /* a version of ptnetmap_start_kthreads() */
934 static int
935 nm_bdg_polling_start_kthreads(struct nm_bdg_polling_state *bps)
936 {
937 	int error, i, j;
938 
939 	if (!bps) {
940 		D("polling is not configured");
941 		return EFAULT;
942 	}
943 	bps->stopped = false;
944 
945 	for (i = 0; i < bps->ncpus; i++) {
946 		struct nm_bdg_kthread *t = bps->kthreads + i;
947 		error = nm_os_kthread_start(t->nmk);
948 		if (error) {
949 			D("error in nm_kthread_start()");
950 			goto cleanup;
951 		}
952 	}
953 	return 0;
954 
955 cleanup:
956 	for (j = 0; j < i; j++) {
957 		struct nm_bdg_kthread *t = bps->kthreads + i;
958 		nm_os_kthread_stop(t->nmk);
959 	}
960 	bps->stopped = true;
961 	return error;
962 }
963 
964 static void
965 nm_bdg_polling_stop_delete_kthreads(struct nm_bdg_polling_state *bps)
966 {
967 	int i;
968 
969 	if (!bps)
970 		return;
971 
972 	for (i = 0; i < bps->ncpus; i++) {
973 		struct nm_bdg_kthread *t = bps->kthreads + i;
974 		nm_os_kthread_stop(t->nmk);
975 		nm_os_kthread_delete(t->nmk);
976 	}
977 	bps->stopped = true;
978 }
979 
980 static int
981 get_polling_cfg(struct nmreq *nmr, struct netmap_adapter *na,
982 			struct nm_bdg_polling_state *bps)
983 {
984 	int req_cpus, avail_cpus, core_from;
985 	u_int reg, i, qfirst, qlast;
986 
987 	avail_cpus = nm_os_ncpus();
988 	req_cpus = nmr->nr_arg1;
989 
990 	if (req_cpus == 0) {
991 		D("req_cpus must be > 0");
992 		return EINVAL;
993 	} else if (req_cpus >= avail_cpus) {
994 		D("for safety, we need at least one core left in the system");
995 		return EINVAL;
996 	}
997 	reg = nmr->nr_flags & NR_REG_MASK;
998 	i = nmr->nr_ringid & NETMAP_RING_MASK;
999 	/*
1000 	 * ONE_NIC: dedicate one core to one ring. If multiple cores
1001 	 *          are specified, consecutive rings are also polled.
1002 	 *          For example, if ringid=2 and 2 cores are given,
1003 	 *          ring 2 and 3 are polled by core 2 and 3, respectively.
1004 	 * ALL_NIC: poll all the rings using a core specified by ringid.
1005 	 *          the number of cores must be 1.
1006 	 */
1007 	if (reg == NR_REG_ONE_NIC) {
1008 		if (i + req_cpus > nma_get_nrings(na, NR_RX)) {
1009 			D("only %d rings exist (ring %u-%u is given)",
1010 				nma_get_nrings(na, NR_RX), i, i+req_cpus);
1011 			return EINVAL;
1012 		}
1013 		qfirst = i;
1014 		qlast = qfirst + req_cpus;
1015 		core_from = qfirst;
1016 	} else if (reg == NR_REG_ALL_NIC) {
1017 		if (req_cpus != 1) {
1018 			D("ncpus must be 1 not %d for REG_ALL_NIC", req_cpus);
1019 			return EINVAL;
1020 		}
1021 		qfirst = 0;
1022 		qlast = nma_get_nrings(na, NR_RX);
1023 		core_from = i;
1024 	} else {
1025 		D("reg must be ALL_NIC or ONE_NIC");
1026 		return EINVAL;
1027 	}
1028 
1029 	bps->reg = reg;
1030 	bps->qfirst = qfirst;
1031 	bps->qlast = qlast;
1032 	bps->cpu_from = core_from;
1033 	bps->ncpus = req_cpus;
1034 	D("%s qfirst %u qlast %u cpu_from %u ncpus %u",
1035 		reg == NR_REG_ALL_NIC ? "REG_ALL_NIC" : "REG_ONE_NIC",
1036 		qfirst, qlast, core_from, req_cpus);
1037 	return 0;
1038 }
1039 
1040 static int
1041 nm_bdg_ctl_polling_start(struct nmreq *nmr, struct netmap_adapter *na)
1042 {
1043 	struct nm_bdg_polling_state *bps;
1044 	struct netmap_bwrap_adapter *bna;
1045 	int error;
1046 
1047 	bna = (struct netmap_bwrap_adapter *)na;
1048 	if (bna->na_polling_state) {
1049 		D("ERROR adapter already in polling mode");
1050 		return EFAULT;
1051 	}
1052 
1053 	bps = malloc(sizeof(*bps), M_DEVBUF, M_NOWAIT | M_ZERO);
1054 	if (!bps)
1055 		return ENOMEM;
1056 	bps->configured = false;
1057 	bps->stopped = true;
1058 
1059 	if (get_polling_cfg(nmr, na, bps)) {
1060 		free(bps, M_DEVBUF);
1061 		return EINVAL;
1062 	}
1063 
1064 	if (nm_bdg_create_kthreads(bps)) {
1065 		free(bps, M_DEVBUF);
1066 		return EFAULT;
1067 	}
1068 
1069 	bps->configured = true;
1070 	bna->na_polling_state = bps;
1071 	bps->bna = bna;
1072 
1073 	/* disable interrupt if possible */
1074 	if (bna->hwna->nm_intr)
1075 		bna->hwna->nm_intr(bna->hwna, 0);
1076 	/* start kthread now */
1077 	error = nm_bdg_polling_start_kthreads(bps);
1078 	if (error) {
1079 		D("ERROR nm_bdg_polling_start_kthread()");
1080 		free(bps->kthreads, M_DEVBUF);
1081 		free(bps, M_DEVBUF);
1082 		bna->na_polling_state = NULL;
1083 		if (bna->hwna->nm_intr)
1084 			bna->hwna->nm_intr(bna->hwna, 1);
1085 	}
1086 	return error;
1087 }
1088 
1089 static int
1090 nm_bdg_ctl_polling_stop(struct nmreq *nmr, struct netmap_adapter *na)
1091 {
1092 	struct netmap_bwrap_adapter *bna = (struct netmap_bwrap_adapter *)na;
1093 	struct nm_bdg_polling_state *bps;
1094 
1095 	if (!bna->na_polling_state) {
1096 		D("ERROR adapter is not in polling mode");
1097 		return EFAULT;
1098 	}
1099 	bps = bna->na_polling_state;
1100 	nm_bdg_polling_stop_delete_kthreads(bna->na_polling_state);
1101 	bps->configured = false;
1102 	free(bps, M_DEVBUF);
1103 	bna->na_polling_state = NULL;
1104 	/* reenable interrupt */
1105 	if (bna->hwna->nm_intr)
1106 		bna->hwna->nm_intr(bna->hwna, 1);
1107 	return 0;
1108 }
1109 
1110 /* Called by either user's context (netmap_ioctl())
1111  * or external kernel modules (e.g., Openvswitch).
1112  * Operation is indicated in nmr->nr_cmd.
1113  * NETMAP_BDG_OPS that sets configure/lookup/dtor functions to the bridge
1114  * requires bdg_ops argument; the other commands ignore this argument.
1115  *
1116  * Called without NMG_LOCK.
1117  */
1118 int
1119 netmap_bdg_ctl(struct nmreq *nmr, struct netmap_bdg_ops *bdg_ops)
1120 {
1121 	struct nm_bridge *b, *bridges;
1122 	struct netmap_adapter *na;
1123 	struct netmap_vp_adapter *vpna;
1124 	char *name = nmr->nr_name;
1125 	int cmd = nmr->nr_cmd, namelen = strlen(name);
1126 	int error = 0, i, j;
1127 	u_int num_bridges;
1128 
1129 	netmap_bns_getbridges(&bridges, &num_bridges);
1130 
1131 	switch (cmd) {
1132 	case NETMAP_BDG_NEWIF:
1133 		error = nm_vi_create(nmr);
1134 		break;
1135 
1136 	case NETMAP_BDG_DELIF:
1137 		error = nm_vi_destroy(nmr->nr_name);
1138 		break;
1139 
1140 	case NETMAP_BDG_ATTACH:
1141 		error = nm_bdg_ctl_attach(nmr);
1142 		break;
1143 
1144 	case NETMAP_BDG_DETACH:
1145 		error = nm_bdg_ctl_detach(nmr);
1146 		break;
1147 
1148 	case NETMAP_BDG_LIST:
1149 		/* this is used to enumerate bridges and ports */
1150 		if (namelen) { /* look up indexes of bridge and port */
1151 			if (strncmp(name, NM_BDG_NAME, strlen(NM_BDG_NAME))) {
1152 				error = EINVAL;
1153 				break;
1154 			}
1155 			NMG_LOCK();
1156 			b = nm_find_bridge(name, 0 /* don't create */);
1157 			if (!b) {
1158 				error = ENOENT;
1159 				NMG_UNLOCK();
1160 				break;
1161 			}
1162 
1163 			error = 0;
1164 			nmr->nr_arg1 = b - bridges; /* bridge index */
1165 			nmr->nr_arg2 = NM_BDG_NOPORT;
1166 			for (j = 0; j < b->bdg_active_ports; j++) {
1167 				i = b->bdg_port_index[j];
1168 				vpna = b->bdg_ports[i];
1169 				if (vpna == NULL) {
1170 					D("---AAAAAAAAARGH-------");
1171 					continue;
1172 				}
1173 				/* the former and the latter identify a
1174 				 * virtual port and a NIC, respectively
1175 				 */
1176 				if (!strcmp(vpna->up.name, name)) {
1177 					nmr->nr_arg2 = i; /* port index */
1178 					break;
1179 				}
1180 			}
1181 			NMG_UNLOCK();
1182 		} else {
1183 			/* return the first non-empty entry starting from
1184 			 * bridge nr_arg1 and port nr_arg2.
1185 			 *
1186 			 * Users can detect the end of the same bridge by
1187 			 * seeing the new and old value of nr_arg1, and can
1188 			 * detect the end of all the bridge by error != 0
1189 			 */
1190 			i = nmr->nr_arg1;
1191 			j = nmr->nr_arg2;
1192 
1193 			NMG_LOCK();
1194 			for (error = ENOENT; i < NM_BRIDGES; i++) {
1195 				b = bridges + i;
1196 				if (j >= b->bdg_active_ports) {
1197 					j = 0; /* following bridges scan from 0 */
1198 					continue;
1199 				}
1200 				nmr->nr_arg1 = i;
1201 				nmr->nr_arg2 = j;
1202 				j = b->bdg_port_index[j];
1203 				vpna = b->bdg_ports[j];
1204 				strncpy(name, vpna->up.name, (size_t)IFNAMSIZ);
1205 				error = 0;
1206 				break;
1207 			}
1208 			NMG_UNLOCK();
1209 		}
1210 		break;
1211 
1212 	case NETMAP_BDG_REGOPS: /* XXX this should not be available from userspace */
1213 		/* register callbacks to the given bridge.
1214 		 * nmr->nr_name may be just bridge's name (including ':'
1215 		 * if it is not just NM_NAME).
1216 		 */
1217 		if (!bdg_ops) {
1218 			error = EINVAL;
1219 			break;
1220 		}
1221 		NMG_LOCK();
1222 		b = nm_find_bridge(name, 0 /* don't create */);
1223 		if (!b) {
1224 			error = EINVAL;
1225 		} else {
1226 			b->bdg_ops = *bdg_ops;
1227 		}
1228 		NMG_UNLOCK();
1229 		break;
1230 
1231 	case NETMAP_BDG_VNET_HDR:
1232 		/* Valid lengths for the virtio-net header are 0 (no header),
1233 		   10 and 12. */
1234 		if (nmr->nr_arg1 != 0 &&
1235 			nmr->nr_arg1 != sizeof(struct nm_vnet_hdr) &&
1236 				nmr->nr_arg1 != 12) {
1237 			error = EINVAL;
1238 			break;
1239 		}
1240 		NMG_LOCK();
1241 		error = netmap_get_bdg_na(nmr, &na, 0);
1242 		if (na && !error) {
1243 			vpna = (struct netmap_vp_adapter *)na;
1244 			na->virt_hdr_len = nmr->nr_arg1;
1245 			if (na->virt_hdr_len) {
1246 				vpna->mfs = NETMAP_BUF_SIZE(na);
1247 			}
1248 			D("Using vnet_hdr_len %d for %p", na->virt_hdr_len, na);
1249 			netmap_adapter_put(na);
1250 		} else if (!na) {
1251 			error = ENXIO;
1252 		}
1253 		NMG_UNLOCK();
1254 		break;
1255 
1256 	case NETMAP_BDG_POLLING_ON:
1257 	case NETMAP_BDG_POLLING_OFF:
1258 		NMG_LOCK();
1259 		error = netmap_get_bdg_na(nmr, &na, 0);
1260 		if (na && !error) {
1261 			if (!nm_is_bwrap(na)) {
1262 				error = EOPNOTSUPP;
1263 			} else if (cmd == NETMAP_BDG_POLLING_ON) {
1264 				error = nm_bdg_ctl_polling_start(nmr, na);
1265 				if (!error)
1266 					netmap_adapter_get(na);
1267 			} else {
1268 				error = nm_bdg_ctl_polling_stop(nmr, na);
1269 				if (!error)
1270 					netmap_adapter_put(na);
1271 			}
1272 			netmap_adapter_put(na);
1273 		}
1274 		NMG_UNLOCK();
1275 		break;
1276 
1277 	default:
1278 		D("invalid cmd (nmr->nr_cmd) (0x%x)", cmd);
1279 		error = EINVAL;
1280 		break;
1281 	}
1282 	return error;
1283 }
1284 
1285 int
1286 netmap_bdg_config(struct nmreq *nmr)
1287 {
1288 	struct nm_bridge *b;
1289 	int error = EINVAL;
1290 
1291 	NMG_LOCK();
1292 	b = nm_find_bridge(nmr->nr_name, 0);
1293 	if (!b) {
1294 		NMG_UNLOCK();
1295 		return error;
1296 	}
1297 	NMG_UNLOCK();
1298 	/* Don't call config() with NMG_LOCK() held */
1299 	BDG_RLOCK(b);
1300 	if (b->bdg_ops.config != NULL)
1301 		error = b->bdg_ops.config((struct nm_ifreq *)nmr);
1302 	BDG_RUNLOCK(b);
1303 	return error;
1304 }
1305 
1306 
1307 /* nm_krings_create callback for VALE ports.
1308  * Calls the standard netmap_krings_create, then adds leases on rx
1309  * rings and bdgfwd on tx rings.
1310  */
1311 static int
1312 netmap_vp_krings_create(struct netmap_adapter *na)
1313 {
1314 	u_int tailroom;
1315 	int error, i;
1316 	uint32_t *leases;
1317 	u_int nrx = netmap_real_rings(na, NR_RX);
1318 
1319 	/*
1320 	 * Leases are attached to RX rings on vale ports
1321 	 */
1322 	tailroom = sizeof(uint32_t) * na->num_rx_desc * nrx;
1323 
1324 	error = netmap_krings_create(na, tailroom);
1325 	if (error)
1326 		return error;
1327 
1328 	leases = na->tailroom;
1329 
1330 	for (i = 0; i < nrx; i++) { /* Receive rings */
1331 		na->rx_rings[i].nkr_leases = leases;
1332 		leases += na->num_rx_desc;
1333 	}
1334 
1335 	error = nm_alloc_bdgfwd(na);
1336 	if (error) {
1337 		netmap_krings_delete(na);
1338 		return error;
1339 	}
1340 
1341 	return 0;
1342 }
1343 
1344 
/* nm_krings_delete callback for VALE ports.
 * Mirrors netmap_vp_krings_create() in reverse order: first
 * nm_free_bdgfwd() (the counterpart of the nm_alloc_bdgfwd() call made
 * at creation), then netmap_krings_delete() for the krings themselves.
 */
static void
netmap_vp_krings_delete(struct netmap_adapter *na)
{
	nm_free_bdgfwd(na);
	netmap_krings_delete(na);
}
1352 
1353 
/* Forward declaration: nm_bdg_preflush() below calls nm_bdg_flush()
 * before its definition appears in this file.
 */
static int
nm_bdg_flush(struct nm_bdg_fwd *ft, u_int n,
	struct netmap_vp_adapter *na, u_int ring_nr);
1358 
1359 /*
1360  * main dispatch routine for the bridge.
1361  * Grab packets from a kring, move them into the ft structure
1362  * associated to the tx (input) port. Max one instance per port,
1363  * filtered on input (ioctl, poll or XXX).
1364  * Returns the next position in the ring.
1365  */
1366 static int
1367 nm_bdg_preflush(struct netmap_kring *kring, u_int end)
1368 {
1369 	struct netmap_vp_adapter *na =
1370 		(struct netmap_vp_adapter*)kring->na;
1371 	struct netmap_ring *ring = kring->ring;
1372 	struct nm_bdg_fwd *ft;
1373 	u_int ring_nr = kring->ring_id;
1374 	u_int j = kring->nr_hwcur, lim = kring->nkr_num_slots - 1;
1375 	u_int ft_i = 0;	/* start from 0 */
1376 	u_int frags = 1; /* how many frags ? */
1377 	struct nm_bridge *b = na->na_bdg;
1378 
1379 	/* To protect against modifications to the bridge we acquire a
1380 	 * shared lock, waiting if we can sleep (if the source port is
1381 	 * attached to a user process) or with a trylock otherwise (NICs).
1382 	 */
1383 	ND("wait rlock for %d packets", ((j > end ? lim+1 : 0) + end) - j);
1384 	if (na->up.na_flags & NAF_BDG_MAYSLEEP)
1385 		BDG_RLOCK(b);
1386 	else if (!BDG_RTRYLOCK(b))
1387 		return 0;
1388 	ND(5, "rlock acquired for %d packets", ((j > end ? lim+1 : 0) + end) - j);
1389 	ft = kring->nkr_ft;
1390 
1391 	for (; likely(j != end); j = nm_next(j, lim)) {
1392 		struct netmap_slot *slot = &ring->slot[j];
1393 		char *buf;
1394 
1395 		ft[ft_i].ft_len = slot->len;
1396 		ft[ft_i].ft_flags = slot->flags;
1397 
1398 		ND("flags is 0x%x", slot->flags);
1399 		/* we do not use the buf changed flag, but we still need to reset it */
1400 		slot->flags &= ~NS_BUF_CHANGED;
1401 
1402 		/* this slot goes into a list so initialize the link field */
1403 		ft[ft_i].ft_next = NM_FT_NULL;
1404 		buf = ft[ft_i].ft_buf = (slot->flags & NS_INDIRECT) ?
1405 			(void *)(uintptr_t)slot->ptr : NMB(&na->up, slot);
1406 		if (unlikely(buf == NULL)) {
1407 			RD(5, "NULL %s buffer pointer from %s slot %d len %d",
1408 				(slot->flags & NS_INDIRECT) ? "INDIRECT" : "DIRECT",
1409 				kring->name, j, ft[ft_i].ft_len);
1410 			buf = ft[ft_i].ft_buf = NETMAP_BUF_BASE(&na->up);
1411 			ft[ft_i].ft_len = 0;
1412 			ft[ft_i].ft_flags = 0;
1413 		}
1414 		__builtin_prefetch(buf);
1415 		++ft_i;
1416 		if (slot->flags & NS_MOREFRAG) {
1417 			frags++;
1418 			continue;
1419 		}
1420 		if (unlikely(netmap_verbose && frags > 1))
1421 			RD(5, "%d frags at %d", frags, ft_i - frags);
1422 		ft[ft_i - frags].ft_frags = frags;
1423 		frags = 1;
1424 		if (unlikely((int)ft_i >= bridge_batch))
1425 			ft_i = nm_bdg_flush(ft, ft_i, na, ring_nr);
1426 	}
1427 	if (frags > 1) {
1428 		/* Here ft_i > 0, ft[ft_i-1].flags has NS_MOREFRAG, and we
1429 		 * have to fix frags count. */
1430 		frags--;
1431 		ft[ft_i - 1].ft_flags &= ~NS_MOREFRAG;
1432 		ft[ft_i - frags].ft_frags = frags;
1433 		D("Truncate incomplete fragment at %d (%d frags)", ft_i, frags);
1434 	}
1435 	if (ft_i)
1436 		ft_i = nm_bdg_flush(ft, ft_i, na, ring_nr);
1437 	BDG_RUNLOCK(b);
1438 	return j;
1439 }
1440 
1441 
1442 /* ----- FreeBSD if_bridge hash function ------- */
1443 
/*
 * The following hash function is adapted from "Hash Functions" by Bob Jenkins
 * ("Algorithm Alley", Dr. Dobbs Journal, September 1997).
 *
 * http://www.burtleburtle.net/bob/hash/spooky.html
 */
#define mix(a, b, c)                                                    \
do {                                                                    \
        a -= b; a -= c; a ^= (c >> 13);                                 \
        b -= c; b -= a; b ^= (a << 8);                                  \
        c -= a; c -= b; c ^= (b >> 13);                                 \
        a -= b; a -= c; a ^= (c >> 12);                                 \
        b -= c; b -= a; b ^= (a << 16);                                 \
        c -= a; c -= b; c ^= (b >> 5);                                  \
        a -= b; a -= c; a ^= (c >> 3);                                  \
        b -= c; b -= a; b ^= (a << 10);                                 \
        c -= a; c -= b; c ^= (b >> 15);                                 \
} while (/*CONSTCOND*/0)


/*
 * Hash the 6 bytes at 'addr' (a MAC address) into an index in the range
 * [0, NM_BDG_HASH - 1]. The result is masked with NM_BDG_HASH - 1, which
 * presumably relies on NM_BDG_HASH being a power of two.
 */
static __inline uint32_t
nm_bridge_rthash(const uint8_t *addr)
{
        uint32_t a = 0x9e3779b9, b = 0x9e3779b9, c = 0; /* hash key */

        /*
         * Widen each byte before shifting: a uint8_t is promoted to a
         * signed int, and shifting a value >= 0x80 left by 24 bits does
         * not fit in an int, which is undefined behaviour.
         */
        b += (uint32_t)addr[5] << 8;
        b += addr[4];
        a += (uint32_t)addr[3] << 24;
        a += (uint32_t)addr[2] << 16;
        a += (uint32_t)addr[1] << 8;
        a += addr[0];

        mix(a, b, c);
#define BRIDGE_RTHASH_MASK	(NM_BDG_HASH-1)
        return (c & BRIDGE_RTHASH_MASK);
}

#undef mix
1482 
1483 
1484 /* nm_register callback for VALE ports */
1485 static int
1486 netmap_vp_reg(struct netmap_adapter *na, int onoff)
1487 {
1488 	struct netmap_vp_adapter *vpna =
1489 		(struct netmap_vp_adapter*)na;
1490 	enum txrx t;
1491 	int i;
1492 
1493 	/* persistent ports may be put in netmap mode
1494 	 * before being attached to a bridge
1495 	 */
1496 	if (vpna->na_bdg)
1497 		BDG_WLOCK(vpna->na_bdg);
1498 	if (onoff) {
1499 		for_rx_tx(t) {
1500 			for (i = 0; i < nma_get_nrings(na, t) + 1; i++) {
1501 				struct netmap_kring *kring = &NMR(na, t)[i];
1502 
1503 				if (nm_kring_pending_on(kring))
1504 					kring->nr_mode = NKR_NETMAP_ON;
1505 			}
1506 		}
1507 		if (na->active_fds == 0)
1508 			na->na_flags |= NAF_NETMAP_ON;
1509 		 /* XXX on FreeBSD, persistent VALE ports should also
1510 		 * toggle IFCAP_NETMAP in na->ifp (2014-03-16)
1511 		 */
1512 	} else {
1513 		if (na->active_fds == 0)
1514 			na->na_flags &= ~NAF_NETMAP_ON;
1515 		for_rx_tx(t) {
1516 			for (i = 0; i < nma_get_nrings(na, t) + 1; i++) {
1517 				struct netmap_kring *kring = &NMR(na, t)[i];
1518 
1519 				if (nm_kring_pending_off(kring))
1520 					kring->nr_mode = NKR_NETMAP_OFF;
1521 			}
1522 		}
1523 	}
1524 	if (vpna->na_bdg)
1525 		BDG_WUNLOCK(vpna->na_bdg);
1526 	return 0;
1527 }
1528 
1529 
1530 /*
1531  * Lookup function for a learning bridge.
1532  * Update the hash table with the source address,
1533  * and then returns the destination port index, and the
1534  * ring in *dst_ring (at the moment, always use ring 0)
1535  */
1536 u_int
1537 netmap_bdg_learning(struct nm_bdg_fwd *ft, uint8_t *dst_ring,
1538 		struct netmap_vp_adapter *na)
1539 {
1540 	uint8_t *buf = ft->ft_buf;
1541 	u_int buf_len = ft->ft_len;
1542 	struct nm_hash_ent *ht = na->na_bdg->ht;
1543 	uint32_t sh, dh;
1544 	u_int dst, mysrc = na->bdg_port;
1545 	uint64_t smac, dmac;
1546 	uint8_t indbuf[12];
1547 
1548 	/* safety check, unfortunately we have many cases */
1549 	if (buf_len >= 14 + na->up.virt_hdr_len) {
1550 		/* virthdr + mac_hdr in the same slot */
1551 		buf += na->up.virt_hdr_len;
1552 		buf_len -= na->up.virt_hdr_len;
1553 	} else if (buf_len == na->up.virt_hdr_len && ft->ft_flags & NS_MOREFRAG) {
1554 		/* only header in first fragment */
1555 		ft++;
1556 		buf = ft->ft_buf;
1557 		buf_len = ft->ft_len;
1558 	} else {
1559 		RD(5, "invalid buf format, length %d", buf_len);
1560 		return NM_BDG_NOPORT;
1561 	}
1562 
1563 	if (ft->ft_flags & NS_INDIRECT) {
1564 		if (copyin(buf, indbuf, sizeof(indbuf))) {
1565 			return NM_BDG_NOPORT;
1566 		}
1567 		buf = indbuf;
1568 	}
1569 
1570 	dmac = le64toh(*(uint64_t *)(buf)) & 0xffffffffffff;
1571 	smac = le64toh(*(uint64_t *)(buf + 4));
1572 	smac >>= 16;
1573 
1574 	/*
1575 	 * The hash is somewhat expensive, there might be some
1576 	 * worthwhile optimizations here.
1577 	 */
1578 	if (((buf[6] & 1) == 0) && (na->last_smac != smac)) { /* valid src */
1579 		uint8_t *s = buf+6;
1580 		sh = nm_bridge_rthash(s); // XXX hash of source
1581 		/* update source port forwarding entry */
1582 		na->last_smac = ht[sh].mac = smac;	/* XXX expire ? */
1583 		ht[sh].ports = mysrc;
1584 		if (netmap_verbose)
1585 		    D("src %02x:%02x:%02x:%02x:%02x:%02x on port %d",
1586 			s[0], s[1], s[2], s[3], s[4], s[5], mysrc);
1587 	}
1588 	dst = NM_BDG_BROADCAST;
1589 	if ((buf[0] & 1) == 0) { /* unicast */
1590 		dh = nm_bridge_rthash(buf); // XXX hash of dst
1591 		if (ht[dh].mac == dmac) {	/* found dst */
1592 			dst = ht[dh].ports;
1593 		}
1594 		/* XXX otherwise return NM_BDG_UNKNOWN ? */
1595 	}
1596 	return dst;
1597 }
1598 
1599 
1600 /*
1601  * Available space in the ring. Only used in VALE code
1602  * and only with is_rx = 1
1603  */
1604 static inline uint32_t
1605 nm_kr_space(struct netmap_kring *k, int is_rx)
1606 {
1607 	int space;
1608 
1609 	if (is_rx) {
1610 		int busy = k->nkr_hwlease - k->nr_hwcur;
1611 		if (busy < 0)
1612 			busy += k->nkr_num_slots;
1613 		space = k->nkr_num_slots - 1 - busy;
1614 	} else {
1615 		/* XXX never used in this branch */
1616 		space = k->nr_hwtail - k->nkr_hwlease;
1617 		if (space < 0)
1618 			space += k->nkr_num_slots;
1619 	}
1620 #if 0
1621 	// sanity check
1622 	if (k->nkr_hwlease >= k->nkr_num_slots ||
1623 		k->nr_hwcur >= k->nkr_num_slots ||
1624 		k->nr_tail >= k->nkr_num_slots ||
1625 		busy < 0 ||
1626 		busy >= k->nkr_num_slots) {
1627 		D("invalid kring, cur %d tail %d lease %d lease_idx %d lim %d",			k->nr_hwcur, k->nr_hwtail, k->nkr_hwlease,
1628 			k->nkr_lease_idx, k->nkr_num_slots);
1629 	}
1630 #endif
1631 	return space;
1632 }
1633 
1634 
1635 
1636 
1637 /* make a lease on the kring for N positions. return the
1638  * lease index
1639  * XXX only used in VALE code and with is_rx = 1
1640  */
1641 static inline uint32_t
1642 nm_kr_lease(struct netmap_kring *k, u_int n, int is_rx)
1643 {
1644 	uint32_t lim = k->nkr_num_slots - 1;
1645 	uint32_t lease_idx = k->nkr_lease_idx;
1646 
1647 	k->nkr_leases[lease_idx] = NR_NOSLOT;
1648 	k->nkr_lease_idx = nm_next(lease_idx, lim);
1649 
1650 	if (n > nm_kr_space(k, is_rx)) {
1651 		D("invalid request for %d slots", n);
1652 		panic("x");
1653 	}
1654 	/* XXX verify that there are n slots */
1655 	k->nkr_hwlease += n;
1656 	if (k->nkr_hwlease > lim)
1657 		k->nkr_hwlease -= lim + 1;
1658 
1659 	if (k->nkr_hwlease >= k->nkr_num_slots ||
1660 		k->nr_hwcur >= k->nkr_num_slots ||
1661 		k->nr_hwtail >= k->nkr_num_slots ||
1662 		k->nkr_lease_idx >= k->nkr_num_slots) {
1663 		D("invalid kring %s, cur %d tail %d lease %d lease_idx %d lim %d",
1664 			k->na->name,
1665 			k->nr_hwcur, k->nr_hwtail, k->nkr_hwlease,
1666 			k->nkr_lease_idx, k->nkr_num_slots);
1667 	}
1668 	return lease_idx;
1669 }
1670 
/*
 * Forward the n entries collected in ft[] by port 'na' (from its ring
 * ring_nr) to their destinations.
 *
 * This flush routine supports only unicast and broadcast but a large
 * number of ports, and lets us replace the learn and dispatch functions.
 *
 * It works in two passes. The first pass asks the bridge lookup
 * callback (b->bdg_ops.lookup) for the destination of each packet and
 * links the packet into a per-(port, ring) queue; the second pass walks
 * the destinations that received something, leases slots on the
 * destination rx ring under the ring's q_lock, copies the packets with
 * the lock released, and finally reports completion and advances
 * nr_hwtail under the lock again.
 *
 * All the queues used are reset before returning. Always returns 0;
 * nm_bdg_preflush() stores this value as its new count of pending ft
 * entries.
 */
int
nm_bdg_flush(struct nm_bdg_fwd *ft, u_int n, struct netmap_vp_adapter *na,
		u_int ring_nr)
{
	struct nm_bdg_q *dst_ents, *brddst;
	uint16_t num_dsts = 0, *dsts;
	struct nm_bridge *b = na->na_bdg;
	u_int i, me = na->bdg_port;

	/*
	 * The work area (pointed by ft) is followed by an array of
	 * pointers to queues , dst_ents; there are NM_BDG_MAXRINGS
	 * queues per port plus one for the broadcast traffic.
	 * Then we have an array of destination indexes.
	 */
	dst_ents = (struct nm_bdg_q *)(ft + NM_BDG_BATCH_MAX);
	dsts = (uint16_t *)(dst_ents + NM_BDG_MAXPORTS * NM_BDG_MAXRINGS + 1);

	/* first pass: find a destination for each packet in the batch */
	for (i = 0; likely(i < n); i += ft[i].ft_frags) {
		uint8_t dst_ring = ring_nr; /* default, same ring as origin */
		uint16_t dst_port, d_i;
		struct nm_bdg_q *d;

		ND("slot %d frags %d", i, ft[i].ft_frags);
		/* Drop the packet if the virtio-net header is not into the first
		   fragment nor at the very beginning of the second. */
		if (unlikely(na->up.virt_hdr_len > ft[i].ft_len))
			continue;
		dst_port = b->bdg_ops.lookup(&ft[i], &dst_ring, na);
		if (netmap_verbose > 255)
			RD(5, "slot %d port %d -> %d", i, me, dst_port);
		if (dst_port == NM_BDG_NOPORT)
			continue; /* this packet is identified to be dropped */
		else if (unlikely(dst_port > NM_BDG_MAXPORTS))
			continue;
		else if (dst_port == NM_BDG_BROADCAST)
			dst_ring = 0; /* broadcasts always go to ring 0 */
		else if (unlikely(dst_port == me ||
		    !b->bdg_ports[dst_port]))
			continue; /* never send back to the source or to an empty port */

		/* get a position in the scratch pad */
		d_i = dst_port * NM_BDG_MAXRINGS + dst_ring;
		d = dst_ents + d_i;

		/* append the first fragment to the list */
		if (d->bq_head == NM_FT_NULL) { /* new destination */
			d->bq_head = d->bq_tail = i;
			/* remember this position to be scanned later */
			if (dst_port != NM_BDG_BROADCAST)
				dsts[num_dsts++] = d_i;
		} else {
			ft[d->bq_tail].ft_next = i;
			d->bq_tail = i;
		}
		d->bq_len += ft[i].ft_frags;
	}

	/*
	 * Broadcast traffic goes to ring 0 on all destinations.
	 * So we need to add these rings to the list of ports to scan.
	 * Only the active ports (b->bdg_port_index) are visited, and a
	 * ring 0 that already has unicast traffic queued is not added
	 * again because it is in dsts[] already.
	 */
	brddst = dst_ents + NM_BDG_BROADCAST * NM_BDG_MAXRINGS;
	if (brddst->bq_head != NM_FT_NULL) {
		u_int j;
		for (j = 0; likely(j < b->bdg_active_ports); j++) {
			uint16_t d_i;
			i = b->bdg_port_index[j];
			if (unlikely(i == me))
				continue;
			d_i = i * NM_BDG_MAXRINGS;
			if (dst_ents[d_i].bq_head == NM_FT_NULL)
				dsts[num_dsts++] = d_i;
		}
	}

	ND(5, "pass 1 done %d pkts %d dsts", n, num_dsts);
	/* second pass: scan destinations */
	for (i = 0; i < num_dsts; i++) {
		struct netmap_vp_adapter *dst_na;
		struct netmap_kring *kring;
		struct netmap_ring *ring;
		u_int dst_nr, lim, j, d_i, next, brd_next;
		u_int needed, howmany;
		int retry = netmap_txsync_retry;
		struct nm_bdg_q *d;
		uint32_t my_start = 0, lease_idx = 0;
		int nrings;
		int virt_hdr_mismatch = 0;

		d_i = dsts[i];
		ND("second pass %d port %d", i, d_i);
		d = dst_ents + d_i;
		// XXX fix the division
		dst_na = b->bdg_ports[d_i/NM_BDG_MAXRINGS];
		/* protect from the lookup function returning an inactive
		 * destination port
		 */
		if (unlikely(dst_na == NULL))
			goto cleanup;
		if (dst_na->up.na_flags & NAF_SW_ONLY)
			goto cleanup;
		/*
		 * The interface may be in !netmap mode in two cases:
		 * - when na is attached but not activated yet;
		 * - when na is being deactivated but is still attached.
		 */
		if (unlikely(!nm_netmap_on(&dst_na->up))) {
			ND("not in netmap mode!");
			goto cleanup;
		}

		/* there is at least one either unicast or broadcast packet */
		brd_next = brddst->bq_head;
		next = d->bq_head;
		/* We need to reserve this many slots. If fewer are
		 * available, some packets will be dropped.
		 * Packets may have multiple fragments, so there is a
		 * chance that we may not use all of the slots we have
		 * claimed, and we will need to handle the leftover
		 * ones when we regain the lock.
		 */
		needed = d->bq_len + brddst->bq_len;

		if (unlikely(dst_na->up.virt_hdr_len != na->up.virt_hdr_len)) {
			RD(3, "virt_hdr_mismatch, src %d dst %d", na->up.virt_hdr_len,
			      dst_na->up.virt_hdr_len);
			/* There is a virtio-net header/offloadings mismatch between
			 * source and destination. The slower mismatch datapath will
			 * be used to cope with all the mismatches.
			 */
			virt_hdr_mismatch = 1;
			if (dst_na->mfs < na->mfs) {
				/* We may need to do segmentation offloadings, and so
				 * we may need a number of destination slots greater
				 * than the number of input slots ('needed').
				 * We look for the smallest integer 'x' which satisfies:
				 *	needed * na->mfs + x * H <= x * na->mfs
				 * where 'H' is the length of the longest header that may
				 * be replicated in the segmentation process (e.g. for
				 * TCPv4 we must account for ethernet header, IP header
				 * and TCPv4 header).
				 */
				needed = (needed * na->mfs) /
						(dst_na->mfs - WORST_CASE_GSO_HEADER) + 1;
				ND(3, "srcmtu=%u, dstmtu=%u, x=%u", na->mfs, dst_na->mfs, needed);
			}
		}

		ND(5, "pass 2 dst %d is %x %s",
			i, d_i, is_vp ? "virtual" : "nic/host");
		/* pick the destination ring, folding the index onto the
		 * rings the destination actually has
		 */
		dst_nr = d_i & (NM_BDG_MAXRINGS-1);
		nrings = dst_na->up.num_rx_rings;
		if (dst_nr >= nrings)
			dst_nr = dst_nr % nrings;
		kring = &dst_na->up.rx_rings[dst_nr];
		ring = kring->ring;
		lim = kring->nkr_num_slots - 1;

retry:

		if (dst_na->retry && retry) {
			/* try to get some free slot from the previous run */
			kring->nm_notify(kring, 0);
			/* actually useful only for bwraps, since there
			 * the notify will trigger a txsync on the hwna. VALE ports
			 * have dst_na->retry == 0
			 */
		}
		/* reserve the buffers in the queue and an entry
		 * to report completion, and drop lock.
		 * XXX this might become a helper function.
		 */
		mtx_lock(&kring->q_lock);
		if (kring->nkr_stopped) {
			mtx_unlock(&kring->q_lock);
			goto cleanup;
		}
		my_start = j = kring->nkr_hwlease;
		howmany = nm_kr_space(kring, 1);
		if (needed < howmany)
			howmany = needed;
		lease_idx = nm_kr_lease(kring, howmany, 1);
		mtx_unlock(&kring->q_lock);

		/* only retry if we need more than available slots */
		if (retry && needed <= howmany)
			retry = 0;

		/* copy to the destination queue */
		while (howmany > 0) {
			struct netmap_slot *slot;
			struct nm_bdg_fwd *ft_p, *ft_end;
			u_int cnt;

			/* find the queue from which we pick next packet.
			 * NM_FT_NULL is always higher than valid indexes
			 * so we never dereference it if the other list
			 * has packets (and if both are empty we never
			 * get here).
			 */
			if (next < brd_next) {
				ft_p = ft + next;
				next = ft_p->ft_next;
			} else { /* insert broadcast */
				ft_p = ft + brd_next;
				brd_next = ft_p->ft_next;
			}
			cnt = ft_p->ft_frags; // cnt > 0
			if (unlikely(cnt > howmany))
			    break; /* no more space */
			if (netmap_verbose && cnt > 1)
				RD(5, "rx %d frags to %d", cnt, j);
			ft_end = ft_p + cnt;
			if (unlikely(virt_hdr_mismatch)) {
				bdg_mismatch_datapath(na, dst_na, ft_p, ring, &j, lim, &howmany);
			} else {
				howmany -= cnt;
				do {
					char *dst, *src = ft_p->ft_buf;
					size_t copy_len = ft_p->ft_len, dst_len = copy_len;

					slot = &ring->slot[j];
					dst = NMB(&dst_na->up, slot);

					ND("send [%d] %d(%d) bytes at %s:%d",
							i, (int)copy_len, (int)dst_len,
							NM_IFPNAME(dst_ifp), j);
					/* round to a multiple of 64 */
					copy_len = (copy_len + 63) & ~63;

					if (unlikely(copy_len > NETMAP_BUF_SIZE(&dst_na->up) ||
						     copy_len > NETMAP_BUF_SIZE(&na->up))) {
						RD(5, "invalid len %d, down to 64", (int)copy_len);
						copy_len = dst_len = 64; // XXX
					}
					if (ft_p->ft_flags & NS_INDIRECT) {
						if (copyin(src, dst, copy_len)) {
							// invalid user pointer, pretend len is 0
							dst_len = 0;
						}
					} else {
						//memcpy(dst, src, copy_len);
						pkt_copy(src, dst, (int)copy_len);
					}
					slot->len = dst_len;
					/* the fragment count goes in the bits above the low 8 */
					slot->flags = (cnt << 8)| NS_MOREFRAG;
					j = nm_next(j, lim);
					needed--;
					ft_p++;
				} while (ft_p != ft_end);
				slot->flags = (cnt << 8); /* clear flag on last entry */
			}
			/* are we done ? */
			if (next == NM_FT_NULL && brd_next == NM_FT_NULL)
				break;
		}
		{
		    /* current position */
		    uint32_t *p = kring->nkr_leases; /* shorthand */
		    uint32_t update_pos;
		    int still_locked = 1;

		    mtx_lock(&kring->q_lock);
		    if (unlikely(howmany > 0)) {
			/* not used all bufs. If i am the last one
			 * i can recover the slots, otherwise must
			 * fill them with 0 to mark empty packets.
			 */
			ND("leftover %d bufs", howmany);
			if (nm_next(lease_idx, lim) == kring->nkr_lease_idx) {
			    /* yes i am the last one */
			    ND("roll back nkr_hwlease to %d", j);
			    kring->nkr_hwlease = j;
			} else {
			    while (howmany-- > 0) {
				ring->slot[j].len = 0;
				ring->slot[j].flags = 0;
				j = nm_next(j, lim);
			    }
			}
		    }
		    p[lease_idx] = j; /* report I am done */

		    update_pos = kring->nr_hwtail;

		    if (my_start == update_pos) {
			/* all slots before my_start have been reported,
			 * so scan subsequent leases to see if other ranges
			 * have been completed, and do a selwakeup or txsync.
			 */
			while (lease_idx != kring->nkr_lease_idx &&
				p[lease_idx] != NR_NOSLOT) {
			    j = p[lease_idx];
			    p[lease_idx] = NR_NOSLOT;
			    lease_idx = nm_next(lease_idx, lim);
			}
			/* j is the new 'write' position. j != my_start
			 * means there are new buffers to report
			 */
			if (likely(j != my_start)) {
				kring->nr_hwtail = j;
				still_locked = 0;
				mtx_unlock(&kring->q_lock);
				kring->nm_notify(kring, 0);
				/* this is netmap_notify for VALE ports and
				 * netmap_bwrap_notify for bwrap. The latter will
				 * trigger a txsync on the underlying hwna
				 */
				if (dst_na->retry && retry--) {
					/* XXX this is going to call nm_notify again.
					 * Only useful for bwrap in virtual machines
					 */
					goto retry;
				}
			}
		    }
		    if (still_locked)
			mtx_unlock(&kring->q_lock);
		}
cleanup:
		d->bq_head = d->bq_tail = NM_FT_NULL; /* cleanup */
		d->bq_len = 0;
	}
	brddst->bq_head = brddst->bq_tail = NM_FT_NULL; /* cleanup */
	brddst->bq_len = 0;
	return 0;
}
2008 
2009 /* nm_txsync callback for VALE ports */
2010 static int
2011 netmap_vp_txsync(struct netmap_kring *kring, int flags)
2012 {
2013 	struct netmap_vp_adapter *na =
2014 		(struct netmap_vp_adapter *)kring->na;
2015 	u_int done;
2016 	u_int const lim = kring->nkr_num_slots - 1;
2017 	u_int const head = kring->rhead;
2018 
2019 	if (bridge_batch <= 0) { /* testing only */
2020 		done = head; // used all
2021 		goto done;
2022 	}
2023 	if (!na->na_bdg) {
2024 		done = head;
2025 		goto done;
2026 	}
2027 	if (bridge_batch > NM_BDG_BATCH)
2028 		bridge_batch = NM_BDG_BATCH;
2029 
2030 	done = nm_bdg_preflush(kring, head);
2031 done:
2032 	if (done != head)
2033 		D("early break at %d/ %d, tail %d", done, head, kring->nr_hwtail);
2034 	/*
2035 	 * packets between 'done' and 'cur' are left unsent.
2036 	 */
2037 	kring->nr_hwcur = done;
2038 	kring->nr_hwtail = nm_prev(done, lim);
2039 	if (netmap_verbose)
2040 		D("%s ring %d flags %d", na->up.name, kring->ring_id, flags);
2041 	return 0;
2042 }
2043 
2044 
2045 /* rxsync code used by VALE ports nm_rxsync callback and also
2046  * internally by the brwap
2047  */
2048 static int
2049 netmap_vp_rxsync_locked(struct netmap_kring *kring, int flags)
2050 {
2051 	struct netmap_adapter *na = kring->na;
2052 	struct netmap_ring *ring = kring->ring;
2053 	u_int nm_i, lim = kring->nkr_num_slots - 1;
2054 	u_int head = kring->rhead;
2055 	int n;
2056 
2057 	if (head > lim) {
2058 		D("ouch dangerous reset!!!");
2059 		n = netmap_ring_reinit(kring);
2060 		goto done;
2061 	}
2062 
2063 	/* First part, import newly received packets. */
2064 	/* actually nothing to do here, they are already in the kring */
2065 
2066 	/* Second part, skip past packets that userspace has released. */
2067 	nm_i = kring->nr_hwcur;
2068 	if (nm_i != head) {
2069 		/* consistency check, but nothing really important here */
2070 		for (n = 0; likely(nm_i != head); n++) {
2071 			struct netmap_slot *slot = &ring->slot[nm_i];
2072 			void *addr = NMB(na, slot);
2073 
2074 			if (addr == NETMAP_BUF_BASE(kring->na)) { /* bad buf */
2075 				D("bad buffer index %d, ignore ?",
2076 					slot->buf_idx);
2077 			}
2078 			slot->flags &= ~NS_BUF_CHANGED;
2079 			nm_i = nm_next(nm_i, lim);
2080 		}
2081 		kring->nr_hwcur = head;
2082 	}
2083 
2084 	n = 0;
2085 done:
2086 	return n;
2087 }
2088 
2089 /*
2090  * nm_rxsync callback for VALE ports
2091  * user process reading from a VALE switch.
2092  * Already protected against concurrent calls from userspace,
2093  * but we must acquire the queue's lock to protect against
2094  * writers on the same queue.
2095  */
2096 static int
2097 netmap_vp_rxsync(struct netmap_kring *kring, int flags)
2098 {
2099 	int n;
2100 
2101 	mtx_lock(&kring->q_lock);
2102 	n = netmap_vp_rxsync_locked(kring, flags);
2103 	mtx_unlock(&kring->q_lock);
2104 	return n;
2105 }
2106 
2107 
2108 /* nm_bdg_attach callback for VALE ports
2109  * The na_vp port is this same netmap_adapter. There is no host port.
2110  */
2111 static int
2112 netmap_vp_bdg_attach(const char *name, struct netmap_adapter *na)
2113 {
2114 	struct netmap_vp_adapter *vpna = (struct netmap_vp_adapter *)na;
2115 
2116 	if (vpna->na_bdg)
2117 		return EBUSY;
2118 	na->na_vp = vpna;
2119 	strncpy(na->name, name, sizeof(na->name));
2120 	na->na_hostvp = NULL;
2121 	return 0;
2122 }
2123 
2124 /* create a netmap_vp_adapter that describes a VALE port.
2125  * Only persistent VALE ports have a non-null ifp.
2126  */
2127 static int
2128 netmap_vp_create(struct nmreq *nmr, struct ifnet *ifp, struct netmap_vp_adapter **ret)
2129 {
2130 	struct netmap_vp_adapter *vpna;
2131 	struct netmap_adapter *na;
2132 	int error;
2133 	u_int npipes = 0;
2134 
2135 	vpna = malloc(sizeof(*vpna), M_DEVBUF, M_NOWAIT | M_ZERO);
2136 	if (vpna == NULL)
2137 		return ENOMEM;
2138 
2139  	na = &vpna->up;
2140 
2141 	na->ifp = ifp;
2142 	strncpy(na->name, nmr->nr_name, sizeof(na->name));
2143 
2144 	/* bound checking */
2145 	na->num_tx_rings = nmr->nr_tx_rings;
2146 	nm_bound_var(&na->num_tx_rings, 1, 1, NM_BDG_MAXRINGS, NULL);
2147 	nmr->nr_tx_rings = na->num_tx_rings; // write back
2148 	na->num_rx_rings = nmr->nr_rx_rings;
2149 	nm_bound_var(&na->num_rx_rings, 1, 1, NM_BDG_MAXRINGS, NULL);
2150 	nmr->nr_rx_rings = na->num_rx_rings; // write back
2151 	nm_bound_var(&nmr->nr_tx_slots, NM_BRIDGE_RINGSIZE,
2152 			1, NM_BDG_MAXSLOTS, NULL);
2153 	na->num_tx_desc = nmr->nr_tx_slots;
2154 	nm_bound_var(&nmr->nr_rx_slots, NM_BRIDGE_RINGSIZE,
2155 			1, NM_BDG_MAXSLOTS, NULL);
2156 	/* validate number of pipes. We want at least 1,
2157 	 * but probably can do with some more.
2158 	 * So let's use 2 as default (when 0 is supplied)
2159 	 */
2160 	npipes = nmr->nr_arg1;
2161 	nm_bound_var(&npipes, 2, 1, NM_MAXPIPES, NULL);
2162 	nmr->nr_arg1 = npipes;	/* write back */
2163 	/* validate extra bufs */
2164 	nm_bound_var(&nmr->nr_arg3, 0, 0,
2165 			128*NM_BDG_MAXSLOTS, NULL);
2166 	na->num_rx_desc = nmr->nr_rx_slots;
2167 	vpna->mfs = 1514;
2168 	vpna->last_smac = ~0llu;
2169 	/*if (vpna->mfs > netmap_buf_size)  TODO netmap_buf_size is zero??
2170 		vpna->mfs = netmap_buf_size; */
2171         if (netmap_verbose)
2172 		D("max frame size %u", vpna->mfs);
2173 
2174 	na->na_flags |= NAF_BDG_MAYSLEEP;
2175 	/* persistent VALE ports look like hw devices
2176 	 * with a native netmap adapter
2177 	 */
2178 	if (ifp)
2179 		na->na_flags |= NAF_NATIVE;
2180 	na->nm_txsync = netmap_vp_txsync;
2181 	na->nm_rxsync = netmap_vp_rxsync;
2182 	na->nm_register = netmap_vp_reg;
2183 	na->nm_krings_create = netmap_vp_krings_create;
2184 	na->nm_krings_delete = netmap_vp_krings_delete;
2185 	na->nm_dtor = netmap_vp_dtor;
2186 	na->nm_mem = netmap_mem_private_new(na->name,
2187 			na->num_tx_rings, na->num_tx_desc,
2188 			na->num_rx_rings, na->num_rx_desc,
2189 			nmr->nr_arg3, npipes, &error);
2190 	if (na->nm_mem == NULL)
2191 		goto err;
2192 	na->nm_bdg_attach = netmap_vp_bdg_attach;
2193 	/* other nmd fields are set in the common routine */
2194 	error = netmap_attach_common(na);
2195 	if (error)
2196 		goto err;
2197 	*ret = vpna;
2198 	return 0;
2199 
2200 err:
2201 	if (na->nm_mem != NULL)
2202 		netmap_mem_delete(na->nm_mem);
2203 	free(vpna, M_DEVBUF);
2204 	return error;
2205 }
2206 
2207 /* Bridge wrapper code (bwrap).
2208  * This is used to connect a non-VALE-port netmap_adapter (hwna) to a
2209  * VALE switch.
2210  * The main task is to swap the meaning of tx and rx rings to match the
2211  * expectations of the VALE switch code (see nm_bdg_flush).
2212  *
2213  * The bwrap works by interposing a netmap_bwrap_adapter between the
2214  * rest of the system and the hwna. The netmap_bwrap_adapter looks like
2215  * a netmap_vp_adapter to the rest of the system, but, internally, it
2216  * translates all callbacks to what the hwna expects.
2217  *
2218  * Note that we have to intercept callbacks coming from two sides:
2219  *
2220  *  - callbacks coming from the netmap module are intercepted by
2221  *    passing around the netmap_bwrap_adapter instead of the hwna
2222  *
2223  *  - callbacks coming from outside of the netmap module only know
2224  *    about the hwna. This, however, only happens in interrupt
2225  *    handlers, where only the hwna->nm_notify callback is called.
2226  *    What the bwrap does is to overwrite the hwna->nm_notify callback
2227  *    with its own netmap_bwrap_intr_notify.
2228  *    XXX This assumes that the hwna->nm_notify callback was the
2229  *    standard netmap_notify(), as it is the case for nic adapters.
2230  *    Any additional action performed by hwna->nm_notify will not be
2231  *    performed by netmap_bwrap_intr_notify.
2232  *
2233  * Additionally, the bwrap can optionally attach the host rings pair
2234  * of the wrapped adapter to a different port of the switch.
2235  */
2236 
2237 
2238 static void
2239 netmap_bwrap_dtor(struct netmap_adapter *na)
2240 {
2241 	struct netmap_bwrap_adapter *bna = (struct netmap_bwrap_adapter*)na;
2242 	struct netmap_adapter *hwna = bna->hwna;
2243 	struct nm_bridge *b = bna->up.na_bdg,
2244 		*bh = bna->host.na_bdg;
2245 
2246 	if (b) {
2247 		netmap_bdg_detach_common(b, bna->up.bdg_port,
2248 			    (bh ? bna->host.bdg_port : -1));
2249 	}
2250 
2251 	ND("na %p", na);
2252 	na->ifp = NULL;
2253 	bna->host.up.ifp = NULL;
2254 	hwna->na_private = NULL;
2255 	hwna->na_vp = hwna->na_hostvp = NULL;
2256 	hwna->na_flags &= ~NAF_BUSY;
2257 	netmap_adapter_put(hwna);
2258 
2259 }
2260 
2261 
2262 /*
2263  * Intr callback for NICs connected to a bridge.
2264  * Simply ignore tx interrupts (maybe we could try to recover space ?)
2265  * and pass received packets from nic to the bridge.
2266  *
2267  * XXX TODO check locking: this is called from the interrupt
2268  * handler so we should make sure that the interface is not
2269  * disconnected while passing down an interrupt.
2270  *
2271  * Note, no user process can access this NIC or the host stack.
2272  * The only part of the ring that is significant are the slots,
2273  * and head/cur/tail are set from the kring as needed
2274  * (part as a receive ring, part as a transmit ring).
2275  *
2276  * callback that overwrites the hwna notify callback.
2277  * Packets come from the outside or from the host stack and are put on an
2278  * hwna rx ring.
2279  * The bridge wrapper then sends the packets through the bridge.
2280  */
2281 static int
2282 netmap_bwrap_intr_notify(struct netmap_kring *kring, int flags)
2283 {
2284 	struct netmap_adapter *na = kring->na;
2285 	struct netmap_bwrap_adapter *bna = na->na_private;
2286 	struct netmap_kring *bkring;
2287 	struct netmap_vp_adapter *vpna = &bna->up;
2288 	u_int ring_nr = kring->ring_id;
2289 	int ret = NM_IRQ_COMPLETED;
2290 	int error;
2291 
2292 	if (netmap_verbose)
2293 	    D("%s %s 0x%x", na->name, kring->name, flags);
2294 
2295 	bkring = &vpna->up.tx_rings[ring_nr];
2296 
2297 	/* make sure the ring is not disabled */
2298 	if (nm_kr_tryget(kring, 0 /* can't sleep */, NULL)) {
2299 		return EIO;
2300 	}
2301 
2302 	if (netmap_verbose)
2303 	    D("%s head %d cur %d tail %d",  na->name,
2304 		kring->rhead, kring->rcur, kring->rtail);
2305 
2306 	/* simulate a user wakeup on the rx ring
2307 	 * fetch packets that have arrived.
2308 	 */
2309 	error = kring->nm_sync(kring, 0);
2310 	if (error)
2311 		goto put_out;
2312 	if (kring->nr_hwcur == kring->nr_hwtail) {
2313 		if (netmap_verbose)
2314 			D("how strange, interrupt with no packets on %s",
2315 			    na->name);
2316 		goto put_out;
2317 	}
2318 
2319 	/* new packets are kring->rcur to kring->nr_hwtail, and the bkring
2320 	 * had hwcur == bkring->rhead. So advance bkring->rhead to kring->nr_hwtail
2321 	 * to push all packets out.
2322 	 */
2323 	bkring->rhead = bkring->rcur = kring->nr_hwtail;
2324 
2325 	netmap_vp_txsync(bkring, flags);
2326 
2327 	/* mark all buffers as released on this ring */
2328 	kring->rhead = kring->rcur = kring->rtail = kring->nr_hwtail;
2329 	/* another call to actually release the buffers */
2330 	error = kring->nm_sync(kring, 0);
2331 
2332 	/* The second rxsync may have further advanced hwtail. If this happens,
2333 	 *  return NM_IRQ_RESCHED, otherwise just return NM_IRQ_COMPLETED. */
2334 	if (kring->rcur != kring->nr_hwtail) {
2335 		ret = NM_IRQ_RESCHED;
2336 	}
2337 put_out:
2338 	nm_kr_put(kring);
2339 
2340 	return error ? error : ret;
2341 }
2342 
2343 
2344 /* nm_register callback for bwrap */
2345 static int
2346 netmap_bwrap_reg(struct netmap_adapter *na, int onoff)
2347 {
2348 	struct netmap_bwrap_adapter *bna =
2349 		(struct netmap_bwrap_adapter *)na;
2350 	struct netmap_adapter *hwna = bna->hwna;
2351 	struct netmap_vp_adapter *hostna = &bna->host;
2352 	int error, i;
2353 	enum txrx t;
2354 
2355 	ND("%s %s", na->name, onoff ? "on" : "off");
2356 
2357 	if (onoff) {
2358 		/* netmap_do_regif has been called on the bwrap na.
2359 		 * We need to pass the information about the
2360 		 * memory allocator down to the hwna before
2361 		 * putting it in netmap mode
2362 		 */
2363 		hwna->na_lut = na->na_lut;
2364 
2365 		if (hostna->na_bdg) {
2366 			/* if the host rings have been attached to switch,
2367 			 * we need to copy the memory allocator information
2368 			 * in the hostna also
2369 			 */
2370 			hostna->up.na_lut = na->na_lut;
2371 		}
2372 
2373 		/* cross-link the netmap rings
2374 		 * The original number of rings comes from hwna,
2375 		 * rx rings on one side equals tx rings on the other.
2376 		 */
2377 		for_rx_tx(t) {
2378 			enum txrx r = nm_txrx_swap(t); /* swap NR_TX <-> NR_RX */
2379 			for (i = 0; i < nma_get_nrings(hwna, r) + 1; i++) {
2380 				NMR(hwna, r)[i].ring = NMR(na, t)[i].ring;
2381 			}
2382 		}
2383 
2384 		if (na->na_flags & NAF_HOST_RINGS) {
2385 			struct netmap_adapter *hna = &hostna->up;
2386 			/* the hostna rings are the host rings of the bwrap.
2387 			 * The corresponding krings must point back to the
2388 			 * hostna
2389 			 */
2390 			hna->tx_rings = &na->tx_rings[na->num_tx_rings];
2391 			hna->tx_rings[0].na = hna;
2392 			hna->rx_rings = &na->rx_rings[na->num_rx_rings];
2393 			hna->rx_rings[0].na = hna;
2394 		}
2395 	}
2396 
2397 	/* pass down the pending ring state information */
2398 	for_rx_tx(t) {
2399 		for (i = 0; i < nma_get_nrings(na, t) + 1; i++)
2400 			NMR(hwna, t)[i].nr_pending_mode =
2401 				NMR(na, t)[i].nr_pending_mode;
2402 	}
2403 
2404 	/* forward the request to the hwna */
2405 	error = hwna->nm_register(hwna, onoff);
2406 	if (error)
2407 		return error;
2408 
2409 	/* copy up the current ring state information */
2410 	for_rx_tx(t) {
2411 		for (i = 0; i < nma_get_nrings(na, t) + 1; i++)
2412 			NMR(na, t)[i].nr_mode =
2413 				NMR(hwna, t)[i].nr_mode;
2414 	}
2415 
2416 	/* impersonate a netmap_vp_adapter */
2417 	netmap_vp_reg(na, onoff);
2418 	if (hostna->na_bdg)
2419 		netmap_vp_reg(&hostna->up, onoff);
2420 
2421 	if (onoff) {
2422 		u_int i;
2423 		/* intercept the hwna nm_nofify callback on the hw rings */
2424 		for (i = 0; i < hwna->num_rx_rings; i++) {
2425 			hwna->rx_rings[i].save_notify = hwna->rx_rings[i].nm_notify;
2426 			hwna->rx_rings[i].nm_notify = netmap_bwrap_intr_notify;
2427 		}
2428 		i = hwna->num_rx_rings; /* for safety */
2429 		/* save the host ring notify unconditionally */
2430 		hwna->rx_rings[i].save_notify = hwna->rx_rings[i].nm_notify;
2431 		if (hostna->na_bdg) {
2432 			/* also intercept the host ring notify */
2433 			hwna->rx_rings[i].nm_notify = netmap_bwrap_intr_notify;
2434 		}
2435 		if (na->active_fds == 0)
2436 			na->na_flags |= NAF_NETMAP_ON;
2437 	} else {
2438 		u_int i;
2439 
2440 		if (na->active_fds == 0)
2441 			na->na_flags &= ~NAF_NETMAP_ON;
2442 
2443 		/* reset all notify callbacks (including host ring) */
2444 		for (i = 0; i <= hwna->num_rx_rings; i++) {
2445 			hwna->rx_rings[i].nm_notify = hwna->rx_rings[i].save_notify;
2446 			hwna->rx_rings[i].save_notify = NULL;
2447 		}
2448 		hwna->na_lut.lut = NULL;
2449 		hwna->na_lut.objtotal = 0;
2450 		hwna->na_lut.objsize = 0;
2451 	}
2452 
2453 	return 0;
2454 }
2455 
2456 /* nm_config callback for bwrap */
2457 static int
2458 netmap_bwrap_config(struct netmap_adapter *na, u_int *txr, u_int *txd,
2459 				    u_int *rxr, u_int *rxd)
2460 {
2461 	struct netmap_bwrap_adapter *bna =
2462 		(struct netmap_bwrap_adapter *)na;
2463 	struct netmap_adapter *hwna = bna->hwna;
2464 
2465 	/* forward the request */
2466 	netmap_update_config(hwna);
2467 	/* swap the results */
2468 	*txr = hwna->num_rx_rings;
2469 	*txd = hwna->num_rx_desc;
2470 	*rxr = hwna->num_tx_rings;
2471 	*rxd = hwna->num_rx_desc;
2472 
2473 	return 0;
2474 }
2475 
2476 
2477 /* nm_krings_create callback for bwrap */
2478 static int
2479 netmap_bwrap_krings_create(struct netmap_adapter *na)
2480 {
2481 	struct netmap_bwrap_adapter *bna =
2482 		(struct netmap_bwrap_adapter *)na;
2483 	struct netmap_adapter *hwna = bna->hwna;
2484 	int i, error = 0;
2485 	enum txrx t;
2486 
2487 	ND("%s", na->name);
2488 
2489 	/* impersonate a netmap_vp_adapter */
2490 	error = netmap_vp_krings_create(na);
2491 	if (error)
2492 		return error;
2493 
2494 	/* also create the hwna krings */
2495 	error = hwna->nm_krings_create(hwna);
2496 	if (error) {
2497 		goto err_del_vp_rings;
2498 	}
2499 
2500 	/* get each ring slot number from the corresponding hwna ring */
2501 	for_rx_tx(t) {
2502 		enum txrx r = nm_txrx_swap(t); /* swap NR_TX <-> NR_RX */
2503 		for (i = 0; i < nma_get_nrings(hwna, r) + 1; i++) {
2504 			NMR(na, t)[i].nkr_num_slots = NMR(hwna, r)[i].nkr_num_slots;
2505 		}
2506 	}
2507 
2508 	return 0;
2509 
2510 err_del_vp_rings:
2511 	netmap_vp_krings_delete(na);
2512 
2513 	return error;
2514 }
2515 
2516 
2517 static void
2518 netmap_bwrap_krings_delete(struct netmap_adapter *na)
2519 {
2520 	struct netmap_bwrap_adapter *bna =
2521 		(struct netmap_bwrap_adapter *)na;
2522 	struct netmap_adapter *hwna = bna->hwna;
2523 
2524 	ND("%s", na->name);
2525 
2526 	hwna->nm_krings_delete(hwna);
2527 	netmap_vp_krings_delete(na);
2528 }
2529 
2530 
2531 /* notify method for the bridge-->hwna direction */
2532 static int
2533 netmap_bwrap_notify(struct netmap_kring *kring, int flags)
2534 {
2535 	struct netmap_adapter *na = kring->na;
2536 	struct netmap_bwrap_adapter *bna = na->na_private;
2537 	struct netmap_adapter *hwna = bna->hwna;
2538 	u_int ring_n = kring->ring_id;
2539 	u_int lim = kring->nkr_num_slots - 1;
2540 	struct netmap_kring *hw_kring;
2541 	int error;
2542 
2543 	ND("%s: na %s hwna %s",
2544 			(kring ? kring->name : "NULL!"),
2545 			(na ? na->name : "NULL!"),
2546 			(hwna ? hwna->name : "NULL!"));
2547 	hw_kring = &hwna->tx_rings[ring_n];
2548 
2549 	if (nm_kr_tryget(hw_kring, 0, NULL)) {
2550 		return ENXIO;
2551 	}
2552 
2553 	/* first step: simulate a user wakeup on the rx ring */
2554 	netmap_vp_rxsync(kring, flags);
2555 	ND("%s[%d] PRE rx(c%3d t%3d l%3d) ring(h%3d c%3d t%3d) tx(c%3d ht%3d t%3d)",
2556 		na->name, ring_n,
2557 		kring->nr_hwcur, kring->nr_hwtail, kring->nkr_hwlease,
2558 		ring->head, ring->cur, ring->tail,
2559 		hw_kring->nr_hwcur, hw_kring->nr_hwtail, hw_ring->rtail);
2560 	/* second step: the new packets are sent on the tx ring
2561 	 * (which is actually the same ring)
2562 	 */
2563 	hw_kring->rhead = hw_kring->rcur = kring->nr_hwtail;
2564 	error = hw_kring->nm_sync(hw_kring, flags);
2565 	if (error)
2566 		goto put_out;
2567 
2568 	/* third step: now we are back the rx ring */
2569 	/* claim ownership on all hw owned bufs */
2570 	kring->rhead = kring->rcur = nm_next(hw_kring->nr_hwtail, lim); /* skip past reserved slot */
2571 
2572 	/* fourth step: the user goes to sleep again, causing another rxsync */
2573 	netmap_vp_rxsync(kring, flags);
2574 	ND("%s[%d] PST rx(c%3d t%3d l%3d) ring(h%3d c%3d t%3d) tx(c%3d ht%3d t%3d)",
2575 		na->name, ring_n,
2576 		kring->nr_hwcur, kring->nr_hwtail, kring->nkr_hwlease,
2577 		ring->head, ring->cur, ring->tail,
2578 		hw_kring->nr_hwcur, hw_kring->nr_hwtail, hw_kring->rtail);
2579 put_out:
2580 	nm_kr_put(hw_kring);
2581 
2582 	return error ? error : NM_IRQ_COMPLETED;
2583 }
2584 
2585 
2586 /* nm_bdg_ctl callback for the bwrap.
2587  * Called on bridge-attach and detach, as an effect of vale-ctl -[ahd].
2588  * On attach, it needs to provide a fake netmap_priv_d structure and
2589  * perform a netmap_do_regif() on the bwrap. This will put both the
2590  * bwrap and the hwna in netmap mode, with the netmap rings shared
2591  * and cross linked. Moroever, it will start intercepting interrupts
2592  * directed to hwna.
2593  */
2594 static int
2595 netmap_bwrap_bdg_ctl(struct netmap_adapter *na, struct nmreq *nmr, int attach)
2596 {
2597 	struct netmap_priv_d *npriv;
2598 	struct netmap_bwrap_adapter *bna = (struct netmap_bwrap_adapter*)na;
2599 	int error = 0;
2600 
2601 	if (attach) {
2602 		if (NETMAP_OWNED_BY_ANY(na)) {
2603 			return EBUSY;
2604 		}
2605 		if (bna->na_kpriv) {
2606 			/* nothing to do */
2607 			return 0;
2608 		}
2609 		npriv = netmap_priv_new();
2610 		if (npriv == NULL)
2611 			return ENOMEM;
2612 		npriv->np_ifp = na->ifp; /* let the priv destructor release the ref */
2613 		error = netmap_do_regif(npriv, na, 0, NR_REG_NIC_SW);
2614 		if (error) {
2615 			netmap_priv_delete(npriv);
2616 			return error;
2617 		}
2618 		bna->na_kpriv = npriv;
2619 		na->na_flags |= NAF_BUSY;
2620 	} else {
2621 		if (na->active_fds == 0) /* not registered */
2622 			return EINVAL;
2623 		netmap_priv_delete(bna->na_kpriv);
2624 		bna->na_kpriv = NULL;
2625 		na->na_flags &= ~NAF_BUSY;
2626 	}
2627 	return error;
2628 
2629 }
2630 
2631 /* attach a bridge wrapper to the 'real' device */
2632 int
2633 netmap_bwrap_attach(const char *nr_name, struct netmap_adapter *hwna)
2634 {
2635 	struct netmap_bwrap_adapter *bna;
2636 	struct netmap_adapter *na = NULL;
2637 	struct netmap_adapter *hostna = NULL;
2638 	int error = 0;
2639 	enum txrx t;
2640 
2641 	/* make sure the NIC is not already in use */
2642 	if (NETMAP_OWNED_BY_ANY(hwna)) {
2643 		D("NIC %s busy, cannot attach to bridge", hwna->name);
2644 		return EBUSY;
2645 	}
2646 
2647 	bna = malloc(sizeof(*bna), M_DEVBUF, M_NOWAIT | M_ZERO);
2648 	if (bna == NULL) {
2649 		return ENOMEM;
2650 	}
2651 
2652 	na = &bna->up.up;
2653 	/* make bwrap ifp point to the real ifp */
2654 	na->ifp = hwna->ifp;
2655 	na->na_private = bna;
2656 	strncpy(na->name, nr_name, sizeof(na->name));
2657 	/* fill the ring data for the bwrap adapter with rx/tx meanings
2658 	 * swapped. The real cross-linking will be done during register,
2659 	 * when all the krings will have been created.
2660 	 */
2661 	for_rx_tx(t) {
2662 		enum txrx r = nm_txrx_swap(t); /* swap NR_TX <-> NR_RX */
2663 		nma_set_nrings(na, t, nma_get_nrings(hwna, r));
2664 		nma_set_ndesc(na, t, nma_get_ndesc(hwna, r));
2665 	}
2666 	na->nm_dtor = netmap_bwrap_dtor;
2667 	na->nm_register = netmap_bwrap_reg;
2668 	// na->nm_txsync = netmap_bwrap_txsync;
2669 	// na->nm_rxsync = netmap_bwrap_rxsync;
2670 	na->nm_config = netmap_bwrap_config;
2671 	na->nm_krings_create = netmap_bwrap_krings_create;
2672 	na->nm_krings_delete = netmap_bwrap_krings_delete;
2673 	na->nm_notify = netmap_bwrap_notify;
2674 	na->nm_bdg_ctl = netmap_bwrap_bdg_ctl;
2675 	na->pdev = hwna->pdev;
2676 	na->nm_mem = hwna->nm_mem;
2677 	na->virt_hdr_len = hwna->virt_hdr_len;
2678 	bna->up.retry = 1; /* XXX maybe this should depend on the hwna */
2679 
2680 	bna->hwna = hwna;
2681 	netmap_adapter_get(hwna);
2682 	hwna->na_private = bna; /* weak reference */
2683 	hwna->na_vp = &bna->up;
2684 
2685 	if (hwna->na_flags & NAF_HOST_RINGS) {
2686 		if (hwna->na_flags & NAF_SW_ONLY)
2687 			na->na_flags |= NAF_SW_ONLY;
2688 		na->na_flags |= NAF_HOST_RINGS;
2689 		hostna = &bna->host.up;
2690 		snprintf(hostna->name, sizeof(hostna->name), "%s^", nr_name);
2691 		hostna->ifp = hwna->ifp;
2692 		for_rx_tx(t) {
2693 			enum txrx r = nm_txrx_swap(t);
2694 			nma_set_nrings(hostna, t, 1);
2695 			nma_set_ndesc(hostna, t, nma_get_ndesc(hwna, r));
2696 		}
2697 		// hostna->nm_txsync = netmap_bwrap_host_txsync;
2698 		// hostna->nm_rxsync = netmap_bwrap_host_rxsync;
2699 		hostna->nm_notify = netmap_bwrap_notify;
2700 		hostna->nm_mem = na->nm_mem;
2701 		hostna->na_private = bna;
2702 		hostna->na_vp = &bna->up;
2703 		na->na_hostvp = hwna->na_hostvp =
2704 			hostna->na_hostvp = &bna->host;
2705 		hostna->na_flags = NAF_BUSY; /* prevent NIOCREGIF */
2706 	}
2707 
2708 	ND("%s<->%s txr %d txd %d rxr %d rxd %d",
2709 		na->name, ifp->if_xname,
2710 		na->num_tx_rings, na->num_tx_desc,
2711 		na->num_rx_rings, na->num_rx_desc);
2712 
2713 	error = netmap_attach_common(na);
2714 	if (error) {
2715 		goto err_free;
2716 	}
2717 	hwna->na_flags |= NAF_BUSY;
2718 	return 0;
2719 
2720 err_free:
2721 	hwna->na_vp = hwna->na_hostvp = NULL;
2722 	netmap_adapter_put(hwna);
2723 	free(bna, M_DEVBUF);
2724 	return error;
2725 
2726 }
2727 
2728 struct nm_bridge *
2729 netmap_init_bridges2(u_int n)
2730 {
2731 	int i;
2732 	struct nm_bridge *b;
2733 
2734 	b = malloc(sizeof(struct nm_bridge) * n, M_DEVBUF,
2735 		M_NOWAIT | M_ZERO);
2736 	if (b == NULL)
2737 		return NULL;
2738 	for (i = 0; i < n; i++)
2739 		BDG_RWINIT(&b[i]);
2740 	return b;
2741 }
2742 
2743 void
2744 netmap_uninit_bridges2(struct nm_bridge *b, u_int n)
2745 {
2746 	int i;
2747 
2748 	if (b == NULL)
2749 		return;
2750 
2751 	for (i = 0; i < n; i++)
2752 		BDG_RWDESTROY(&b[i]);
2753 	free(b, M_DEVBUF);
2754 }
2755 
2756 int
2757 netmap_init_bridges(void)
2758 {
2759 #ifdef CONFIG_NET_NS
2760 	return netmap_bns_register();
2761 #else
2762 	nm_bridges = netmap_init_bridges2(NM_BRIDGES);
2763 	if (nm_bridges == NULL)
2764 		return ENOMEM;
2765 	return 0;
2766 #endif
2767 }
2768 
2769 void
2770 netmap_uninit_bridges(void)
2771 {
2772 #ifdef CONFIG_NET_NS
2773 	netmap_bns_unregister();
2774 #else
2775 	netmap_uninit_bridges2(nm_bridges, NM_BRIDGES);
2776 #endif
2777 }
2778 #endif /* WITH_VALE */
2779