xref: /freebsd/sys/dev/netmap/netmap_bdg.c (revision dacc43df34a7da82747af82be62cb645eb36f6ca)
1 /*
2  * Copyright (C) 2013-2016 Universita` di Pisa
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  *   1. Redistributions of source code must retain the above copyright
9  *      notice, this list of conditions and the following disclaimer.
10  *   2. Redistributions in binary form must reproduce the above copyright
11  *      notice, this list of conditions and the following disclaimer in the
12  *      documentation and/or other materials provided with the distribution.
13  *
14  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24  * SUCH DAMAGE.
25  */
26 
27 
28 /*
29  * This module implements the VALE switch for netmap
30 
31 --- VALE SWITCH ---
32 
33 NMG_LOCK() serializes all modifications to switches and ports.
34 A switch cannot be deleted until all ports are gone.
35 
36 For each switch, an SX lock (RWlock on linux) protects
37 deletion of ports. When a port is configured or deleted, the
38 lock is acquired in exclusive mode (after holding NMG_LOCK).
39 When forwarding, the lock is acquired in shared mode (without NMG_LOCK).
40 The lock is held throughout the entire forwarding cycle,
41 during which the thread may incur a page fault.
42 Hence it is important that sleepable shared locks are used.
43 
44 On the rx ring, the per-port lock is grabbed initially to reserve
45 a number of slots in the ring, then the lock is released,
46 packets are copied from source to destination, and then
47 the lock is acquired again and the receive ring is updated.
48 (A similar thing is done on the tx ring for NIC and host stack
49 ports attached to the switch)
50 
51  */
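
/*
 * As an illustration only (a sketch, not driver code): the per-port rx
 * protocol described above boils down to the following ordering, where
 * nm_bdg_reserve_slots(), copy_packets() and nm_bdg_publish_slots() are
 * hypothetical helpers standing in for the real forwarding code:
 *
 *	mtx_lock(&rxkring->q_lock);
 *	first = nm_bdg_reserve_slots(rxkring, nslots);	// reserve space
 *	mtx_unlock(&rxkring->q_lock);
 *
 *	copy_packets(srckring, rxkring, first, nslots);	// no lock held here,
 *							// may page fault
 *	mtx_lock(&rxkring->q_lock);
 *	nm_bdg_publish_slots(rxkring, first, nslots);	// update the rx ring
 *	mtx_unlock(&rxkring->q_lock);
 */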
52 
53 /*
54  * OS-specific code that is used only within this file.
55  * Other OS-specific code that must be accessed by drivers
56  * is present in netmap_kern.h
57  */
58 
59 #if defined(__FreeBSD__)
60 #include <sys/cdefs.h> /* prerequisite */
61 __FBSDID("$FreeBSD$");
62 
63 #include <sys/types.h>
64 #include <sys/errno.h>
65 #include <sys/param.h>	/* defines used in kernel.h */
66 #include <sys/kernel.h>	/* types used in module initialization */
67 #include <sys/conf.h>	/* cdevsw struct, UID, GID */
68 #include <sys/sockio.h>
69 #include <sys/socketvar.h>	/* struct socket */
70 #include <sys/malloc.h>
71 #include <sys/poll.h>
72 #include <sys/rwlock.h>
73 #include <sys/socket.h> /* sockaddrs */
74 #include <sys/selinfo.h>
75 #include <sys/sysctl.h>
76 #include <net/if.h>
77 #include <net/if_var.h>
78 #include <net/bpf.h>		/* BIOCIMMEDIATE */
79 #include <machine/bus.h>	/* bus_dmamap_* */
80 #include <sys/endian.h>
81 #include <sys/refcount.h>
82 #include <sys/smp.h>
83 
84 
85 #elif defined(linux)
86 
87 #include "bsd_glue.h"
88 
89 #elif defined(__APPLE__)
90 
91 #warning OSX support is only partial
92 #include "osx_glue.h"
93 
94 #elif defined(_WIN32)
95 #include "win_glue.h"
96 
97 #else
98 
99 #error	Unsupported platform
100 
101 #endif /* unsupported */
102 
103 /*
104  * common headers
105  */
106 
107 #include <net/netmap.h>
108 #include <dev/netmap/netmap_kern.h>
109 #include <dev/netmap/netmap_mem2.h>
110 
111 #include <dev/netmap/netmap_bdg.h>
112 
113 const char*
114 netmap_bdg_name(struct netmap_vp_adapter *vp)
115 {
116 	struct nm_bridge *b = vp->na_bdg;
117 	if (b == NULL)
118 		return NULL;
119 	return b->bdg_basename;
120 }
121 
122 
123 #ifndef CONFIG_NET_NS
124 /*
125  * XXX in principle nm_bridges could be created dynamically
126  * Right now we have a static array and deletions are protected
127  * by an exclusive lock.
128  */
129 static struct nm_bridge *nm_bridges;
130 #endif /* !CONFIG_NET_NS */
131 
132 
133 static int
134 nm_is_id_char(const char c)
135 {
136 	return (c >= 'a' && c <= 'z') ||
137 	       (c >= 'A' && c <= 'Z') ||
138 	       (c >= '0' && c <= '9') ||
139 	       (c == '_');
140 }
141 
142 /* Validate the name of a VALE bridge port and return the
143  * position of the ":" character. */
144 static int
145 nm_vale_name_validate(const char *name)
146 {
147 	int colon_pos = -1;
148 	int i;
149 
150 	if (!name || strlen(name) < strlen(NM_BDG_NAME)) {
151 		return -1;
152 	}
153 
154 	for (i = 0; i < NM_BDG_IFNAMSIZ && name[i]; i++) {
155 		if (name[i] == ':') {
156 			colon_pos = i;
157 			break;
158 		} else if (!nm_is_id_char(name[i])) {
159 			return -1;
160 		}
161 	}
162 
163 	if (strlen(name) - colon_pos > IFNAMSIZ) {
164 		/* interface name too long */
165 		return -1;
166 	}
167 
168 	return colon_pos;
169 }
170 
171 /*
172  * locate a bridge among the existing ones.
173  * MUST BE CALLED WITH NMG_LOCK()
174  *
175  * a ':' in the name terminates the bridge name. Otherwise, just NM_NAME.
176  * We assume that this is called with a name of at least NM_NAME chars.
177  */
178 struct nm_bridge *
179 nm_find_bridge(const char *name, int create, struct netmap_bdg_ops *ops)
180 {
181 	int i, namelen;
182 	struct nm_bridge *b = NULL, *bridges;
183 	u_int num_bridges;
184 
185 	NMG_LOCK_ASSERT();
186 
187 	netmap_bns_getbridges(&bridges, &num_bridges);
188 
189 	namelen = nm_vale_name_validate(name);
190 	if (namelen < 0) {
191 		D("invalid bridge name %s", name ? name : NULL);
192 		return NULL;
193 	}
194 
195 	/* lookup the name, remember empty slot if there is one */
196 	for (i = 0; i < num_bridges; i++) {
197 		struct nm_bridge *x = bridges + i;
198 
199 		if ((x->bdg_flags & NM_BDG_ACTIVE) + x->bdg_active_ports == 0) {
200 			if (create && b == NULL)
201 				b = x;	/* record empty slot */
202 		} else if (x->bdg_namelen != namelen) {
203 			continue;
204 		} else if (strncmp(name, x->bdg_basename, namelen) == 0) {
205 			ND("found '%.*s' at %d", namelen, name, i);
206 			b = x;
207 			break;
208 		}
209 	}
210 	if (i == num_bridges && b) { /* name not found, can create entry */
211 		/* initialize the bridge */
212 		ND("create new bridge %s with ports %d", b->bdg_basename,
213 			b->bdg_active_ports);
214 		b->ht = nm_os_malloc(sizeof(struct nm_hash_ent) * NM_BDG_HASH);
215 		if (b->ht == NULL) {
216 			D("failed to allocate hash table");
217 			return NULL;
218 		}
219 		strncpy(b->bdg_basename, name, namelen);
220 		b->bdg_namelen = namelen;
221 		b->bdg_active_ports = 0;
222 		for (i = 0; i < NM_BDG_MAXPORTS; i++)
223 			b->bdg_port_index[i] = i;
224 		/* set the default function */
225 		b->bdg_ops = ops;
226 		b->private_data = b->ht;
227 		b->bdg_flags = 0;
228 		NM_BNS_GET(b);
229 	}
230 	return b;
231 }
232 
233 
234 int
235 netmap_bdg_free(struct nm_bridge *b)
236 {
237 	if ((b->bdg_flags & NM_BDG_ACTIVE) + b->bdg_active_ports != 0) {
238 		return EBUSY;
239 	}
240 
241 	ND("marking bridge %s as free", b->bdg_basename);
242 	nm_os_free(b->ht);
243 	b->bdg_ops = NULL;
244 	b->bdg_flags = 0;
245 	NM_BNS_PUT(b);
246 	return 0;
247 }
248 
249 
250 /* remove from bridge b the ports in slots hw and sw
251  * (sw can be -1 if not needed)
252  */
253 void
254 netmap_bdg_detach_common(struct nm_bridge *b, int hw, int sw)
255 {
256 	int s_hw = hw, s_sw = sw;
257 	int i, lim = b->bdg_active_ports;
258 	uint32_t *tmp = b->tmp_bdg_port_index;
259 
260 	/*
261 	New algorithm:
262 	make a copy of bdg_port_index;
263 	lookup NA(ifp)->bdg_port and SWNA(ifp)->bdg_port
264 	in the array of bdg_port_index, replacing them with
265 	entries from the bottom of the array;
266 	decrement bdg_active_ports;
267 	acquire BDG_WLOCK() and copy back the array.
268 	 */
269 
270 	if (netmap_verbose)
271 		D("detach %d and %d (lim %d)", hw, sw, lim);
272 	/* make a copy of the list of active ports, update it,
273 	 * and then copy back within BDG_WLOCK().
274 	 */
275 	memcpy(b->tmp_bdg_port_index, b->bdg_port_index, sizeof(b->tmp_bdg_port_index));
276 	for (i = 0; (hw >= 0 || sw >= 0) && i < lim; ) {
277 		if (hw >= 0 && tmp[i] == hw) {
278 			ND("detach hw %d at %d", hw, i);
279 			lim--; /* point to last active port */
280 			tmp[i] = tmp[lim]; /* swap with i */
281 			tmp[lim] = hw;	/* now this is inactive */
282 			hw = -1;
283 		} else if (sw >= 0 && tmp[i] == sw) {
284 			ND("detach sw %d at %d", sw, i);
285 			lim--;
286 			tmp[i] = tmp[lim];
287 			tmp[lim] = sw;
288 			sw = -1;
289 		} else {
290 			i++;
291 		}
292 	}
293 	if (hw >= 0 || sw >= 0) {
294 		D("XXX delete failed hw %d sw %d, should panic...", hw, sw);
295 	}
296 
297 	BDG_WLOCK(b);
298 	if (b->bdg_ops->dtor)
299 		b->bdg_ops->dtor(b->bdg_ports[s_hw]);
300 	b->bdg_ports[s_hw] = NULL;
301 	if (s_sw >= 0) {
302 		b->bdg_ports[s_sw] = NULL;
303 	}
304 	memcpy(b->bdg_port_index, b->tmp_bdg_port_index, sizeof(b->tmp_bdg_port_index));
305 	b->bdg_active_ports = lim;
306 	BDG_WUNLOCK(b);
307 
308 	ND("now %d active ports", lim);
309 	netmap_bdg_free(b);
310 }
311 
312 
313 /* nm_bdg_ctl callback for VALE ports */
314 int
315 netmap_vp_bdg_ctl(struct nmreq_header *hdr, struct netmap_adapter *na)
316 {
317 	struct netmap_vp_adapter *vpna = (struct netmap_vp_adapter *)na;
318 	struct nm_bridge *b = vpna->na_bdg;
319 
320 	if (hdr->nr_reqtype == NETMAP_REQ_VALE_ATTACH) {
321 		return 0; /* nothing to do */
322 	}
323 	if (b) {
324 		netmap_set_all_rings(na, 0 /* disable */);
325 		netmap_bdg_detach_common(b, vpna->bdg_port, -1);
326 		vpna->na_bdg = NULL;
327 		netmap_set_all_rings(na, 1 /* enable */);
328 	}
329 	/* the reference was taken only for the attach */
330 	netmap_adapter_put(na);
331 	return 0;
332 }
333 
334 int
335 netmap_default_bdg_attach(const char *name, struct netmap_adapter *na,
336 		struct nm_bridge *b)
337 {
338 	return NM_NEED_BWRAP;
339 }
340 
341 /* Try to get a reference to a netmap adapter attached to a VALE switch.
342  * If the adapter is found (or is created), this function returns 0, a
343  * non-NULL pointer is returned into *na, and the caller holds a
344  * reference to the adapter.
345  * If an adapter is not found, then no reference is grabbed and the
346  * function returns an error code, or 0 if there is just a VALE prefix
347  * mismatch. Therefore the caller holds a reference when
348  * (*na != NULL && return == 0).
349  */
350 int
351 netmap_get_bdg_na(struct nmreq_header *hdr, struct netmap_adapter **na,
352 	struct netmap_mem_d *nmd, int create, struct netmap_bdg_ops *ops)
353 {
354 	char *nr_name = hdr->nr_name;
355 	const char *ifname;
356 	struct ifnet *ifp = NULL;
357 	int error = 0;
358 	struct netmap_vp_adapter *vpna, *hostna = NULL;
359 	struct nm_bridge *b;
360 	uint32_t i, j;
361 	uint32_t cand = NM_BDG_NOPORT, cand2 = NM_BDG_NOPORT;
362 	int needed;
363 
364 	*na = NULL;     /* default return value */
365 
366 	/* first try to see if this is a bridge port. */
367 	NMG_LOCK_ASSERT();
368 	if (strncmp(nr_name, ops->name, strlen(ops->name) - 1)) {
369 		return 0;  /* no error, but no VALE prefix */
370 	}
371 
372 	b = nm_find_bridge(nr_name, create, ops);
373 	if (b == NULL) {
374 		ND("no bridges available for '%s'", nr_name);
375 		return (create ? ENOMEM : ENXIO);
376 	}
377 	if (strlen(nr_name) < b->bdg_namelen) /* impossible */
378 		panic("x");
379 
380 	/* Now we are sure that name starts with the bridge's name,
381 	 * lookup the port in the bridge. We need to scan the entire
382 	 * list. It is not important to hold a WLOCK on the bridge
383 	 * during the search because NMG_LOCK already guarantees
384 	 * that there are no other possible writers.
385 	 */
386 
387 	/* lookup in the local list of ports */
388 	for (j = 0; j < b->bdg_active_ports; j++) {
389 		i = b->bdg_port_index[j];
390 		vpna = b->bdg_ports[i];
391 		ND("checking %s", vpna->up.name);
392 		if (!strcmp(vpna->up.name, nr_name)) {
393 			netmap_adapter_get(&vpna->up);
394 			ND("found existing if %s refs %d", nr_name)
395 			*na = &vpna->up;
396 			return 0;
397 		}
398 	}
399 	/* not found, should we create it? */
400 	if (!create)
401 		return ENXIO;
402 	/* yes we should, see if we have space to attach entries */
403 	needed = 2; /* in some cases we only need 1 */
404 	if (b->bdg_active_ports + needed >= NM_BDG_MAXPORTS) {
405 		D("bridge full %d, cannot create new port", b->bdg_active_ports);
406 		return ENOMEM;
407 	}
408 	/* record the next two ports available, but do not allocate yet */
409 	cand = b->bdg_port_index[b->bdg_active_ports];
410 	cand2 = b->bdg_port_index[b->bdg_active_ports + 1];
411 	ND("+++ bridge %s port %s used %d avail %d %d",
412 		b->bdg_basename, ifname, b->bdg_active_ports, cand, cand2);
413 
414 	/*
415 	 * try to see if there is a matching NIC with this name
416 	 * (after the bridge's name)
417 	 */
418 	ifname = nr_name + b->bdg_namelen + 1;
419 	ifp = ifunit_ref(ifname);
420 	if (!ifp) {
421 		/* Create an ephemeral virtual port.
422 		 * This block contains all the ephemeral-specific logic.
423 		 */
424 
425 		if (hdr->nr_reqtype != NETMAP_REQ_REGISTER) {
426 			error = EINVAL;
427 			goto out;
428 		}
429 
430 		/* the vp_create callback creates a struct netmap_adapter */
431 		error = b->bdg_ops->vp_create(hdr, NULL, nmd, &vpna);
432 		if (error) {
433 			D("error %d", error);
434 			goto out;
435 		}
436 		/* shortcut - we can skip get_hw_na(),
437 		 * ownership check and nm_bdg_attach()
438 		 */
439 
440 	} else {
441 		struct netmap_adapter *hw;
442 
443 		/* the vale:nic syntax is only valid for some commands */
444 		switch (hdr->nr_reqtype) {
445 		case NETMAP_REQ_VALE_ATTACH:
446 		case NETMAP_REQ_VALE_DETACH:
447 		case NETMAP_REQ_VALE_POLLING_ENABLE:
448 		case NETMAP_REQ_VALE_POLLING_DISABLE:
449 			break; /* ok */
450 		default:
451 			error = EINVAL;
452 			goto out;
453 		}
454 
455 		error = netmap_get_hw_na(ifp, nmd, &hw);
456 		if (error || hw == NULL)
457 			goto out;
458 
459 		/* host adapter might not be created */
460 		error = hw->nm_bdg_attach(nr_name, hw, b);
461 		if (error == NM_NEED_BWRAP) {
462 			error = b->bdg_ops->bwrap_attach(nr_name, hw);
463 		}
464 		if (error)
465 			goto out;
466 		vpna = hw->na_vp;
467 		hostna = hw->na_hostvp;
468 		if (hdr->nr_reqtype == NETMAP_REQ_VALE_ATTACH) {
469 			/* Check if we need to skip the host rings. */
470 			struct nmreq_vale_attach *areq =
471 				(struct nmreq_vale_attach *)(uintptr_t)hdr->nr_body;
472 			if (areq->reg.nr_mode != NR_REG_NIC_SW) {
473 				hostna = NULL;
474 			}
475 		}
476 	}
477 
478 	BDG_WLOCK(b);
479 	vpna->bdg_port = cand;
480 	ND("NIC  %p to bridge port %d", vpna, cand);
481 	/* bind the port to the bridge (virtual ports are not active) */
482 	b->bdg_ports[cand] = vpna;
483 	vpna->na_bdg = b;
484 	b->bdg_active_ports++;
485 	if (hostna != NULL) {
486 		/* also bind the host stack to the bridge */
487 		b->bdg_ports[cand2] = hostna;
488 		hostna->bdg_port = cand2;
489 		hostna->na_bdg = b;
490 		b->bdg_active_ports++;
491 		ND("host %p to bridge port %d", hostna, cand2);
492 	}
493 	ND("if %s refs %d", ifname, vpna->up.na_refcount);
494 	BDG_WUNLOCK(b);
495 	*na = &vpna->up;
496 	netmap_adapter_get(*na);
497 
498 out:
499 	if (ifp)
500 		if_rele(ifp);
501 
502 	return error;
503 }
504 
505 /* Process NETMAP_REQ_VALE_ATTACH.
506  */
507 int
508 nm_bdg_ctl_attach(struct nmreq_header *hdr, void *auth_token)
509 {
510 	struct nmreq_vale_attach *req =
511 		(struct nmreq_vale_attach *)(uintptr_t)hdr->nr_body;
512 	struct netmap_vp_adapter * vpna;
513 	struct netmap_adapter *na = NULL;
514 	struct netmap_mem_d *nmd = NULL;
515 	struct nm_bridge *b = NULL;
516 	int error;
517 
518 	NMG_LOCK();
519 	/* permission check for modified bridges */
520 	b = nm_find_bridge(hdr->nr_name, 0 /* don't create */, NULL);
521 	if (b && !nm_bdg_valid_auth_token(b, auth_token)) {
522 		error = EACCES;
523 		goto unlock_exit;
524 	}
525 
526 	if (req->reg.nr_mem_id) {
527 		nmd = netmap_mem_find(req->reg.nr_mem_id);
528 		if (nmd == NULL) {
529 			error = EINVAL;
530 			goto unlock_exit;
531 		}
532 	}
533 
534 	/* check for existing one */
535 	error = netmap_get_vale_na(hdr, &na, nmd, 0);
536 	if (na) {
537 		error = EBUSY;
538 		goto unref_exit;
539 	}
540 	error = netmap_get_vale_na(hdr, &na,
541 				nmd, 1 /* create if not exists */);
542 	if (error) { /* no device */
543 		goto unlock_exit;
544 	}
545 
546 	if (na == NULL) { /* VALE prefix missing */
547 		error = EINVAL;
548 		goto unlock_exit;
549 	}
550 
551 	if (NETMAP_OWNED_BY_ANY(na)) {
552 		error = EBUSY;
553 		goto unref_exit;
554 	}
555 
556 	if (na->nm_bdg_ctl) {
557 		/* nop for VALE ports. The bwrap needs to put the hwna
558 		 * in netmap mode (see netmap_bwrap_bdg_ctl)
559 		 */
560 		error = na->nm_bdg_ctl(hdr, na);
561 		if (error)
562 			goto unref_exit;
563 		ND("registered %s to netmap-mode", na->name);
564 	}
565 	vpna = (struct netmap_vp_adapter *)na;
566 	req->port_index = vpna->bdg_port;
567 	NMG_UNLOCK();
568 	return 0;
569 
570 unref_exit:
571 	netmap_adapter_put(na);
572 unlock_exit:
573 	NMG_UNLOCK();
574 	return error;
575 }
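
/*
 * For reference, a minimal user-space sketch (not part of this module) of
 * the request handled above; "valeA:em0" is just an example name and all
 * error handling is omitted:
 *
 *	struct nmreq_header hdr;
 *	struct nmreq_vale_attach req;
 *	int fd = open("/dev/netmap", O_RDWR);
 *
 *	memset(&hdr, 0, sizeof(hdr));
 *	memset(&req, 0, sizeof(req));
 *	hdr.nr_version = NETMAP_API;
 *	hdr.nr_reqtype = NETMAP_REQ_VALE_ATTACH;
 *	strlcpy(hdr.nr_name, "valeA:em0", sizeof(hdr.nr_name));
 *	hdr.nr_body = (uintptr_t)&req;
 *	req.reg.nr_mode = NR_REG_ALL_NIC;	// NR_REG_NIC_SW also attaches
 *						// the host rings
 *	ioctl(fd, NIOCCTRL, &hdr);		// on success req.port_index
 *						// holds the new port index
 */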
576 
577 static inline int
578 nm_is_bwrap(struct netmap_adapter *na)
579 {
580 	return na->nm_register == netmap_bwrap_reg;
581 }
582 
583 /* Process NETMAP_REQ_VALE_DETACH.
584  */
585 int
586 nm_bdg_ctl_detach(struct nmreq_header *hdr, void *auth_token)
587 {
588 	struct nmreq_vale_detach *nmreq_det = (void *)(uintptr_t)hdr->nr_body;
589 	struct netmap_vp_adapter *vpna;
590 	struct netmap_adapter *na;
591 	struct nm_bridge *b = NULL;
592 	int error;
593 
594 	NMG_LOCK();
595 	/* permission check for modified bridges */
596 	b = nm_find_bridge(hdr->nr_name, 0 /* don't create */, NULL);
597 	if (b && !nm_bdg_valid_auth_token(b, auth_token)) {
598 		error = EACCES;
599 		goto unlock_exit;
600 	}
601 
602 	error = netmap_get_vale_na(hdr, &na, NULL, 0 /* don't create */);
603 	if (error) { /* no device, or another bridge or user owns the device */
604 		goto unlock_exit;
605 	}
606 
607 	if (na == NULL) { /* VALE prefix missing */
608 		error = EINVAL;
609 		goto unlock_exit;
610 	} else if (nm_is_bwrap(na) &&
611 		   ((struct netmap_bwrap_adapter *)na)->na_polling_state) {
612 		/* Don't detach a NIC with polling */
613 		error = EBUSY;
614 		goto unref_exit;
615 	}
616 
617 	vpna = (struct netmap_vp_adapter *)na;
618 	if (na->na_vp != vpna) {
619 		/* trying to detach the first attachment of a persistent VALE port
620 		 * that is attached to 2 bridges
621 		 */
622 		error = EBUSY;
623 		goto unref_exit;
624 	}
625 	nmreq_det->port_index = vpna->bdg_port;
626 
627 	if (na->nm_bdg_ctl) {
628 		/* remove the port from bridge. The bwrap
629 		 * also needs to put the hwna in normal mode
630 		 */
631 		error = na->nm_bdg_ctl(hdr, na);
632 	}
633 
634 unref_exit:
635 	netmap_adapter_put(na);
636 unlock_exit:
637 	NMG_UNLOCK();
638 	return error;
639 
640 }
641 
642 struct nm_bdg_polling_state;
643 struct nm_bdg_kthread {
645 	struct nm_kctx *nmk;
646 	u_int qfirst;
647 	u_int qlast;
648 	struct nm_bdg_polling_state *bps;
649 };
650 
651 struct nm_bdg_polling_state {
652 	bool configured;
653 	bool stopped;
654 	struct netmap_bwrap_adapter *bna;
655 	uint32_t mode;
656 	u_int qfirst;
657 	u_int qlast;
658 	u_int cpu_from;
659 	u_int ncpus;
660 	struct nm_bdg_kthread *kthreads;
661 };
662 
663 static void
664 netmap_bwrap_polling(void *data, int is_kthread)
665 {
666 	struct nm_bdg_kthread *nbk = data;
667 	struct netmap_bwrap_adapter *bna;
668 	u_int qfirst, qlast, i;
669 	struct netmap_kring **kring0, *kring;
670 
671 	if (!nbk)
672 		return;
673 	qfirst = nbk->qfirst;
674 	qlast = nbk->qlast;
675 	bna = nbk->bps->bna;
676 	kring0 = NMR(bna->hwna, NR_RX);
677 
678 	for (i = qfirst; i < qlast; i++) {
679 		kring = kring0[i];
680 		kring->nm_notify(kring, 0);
681 	}
682 }
683 
684 static int
685 nm_bdg_create_kthreads(struct nm_bdg_polling_state *bps)
686 {
687 	struct nm_kctx_cfg kcfg;
688 	int i, j;
689 
690 	bps->kthreads = nm_os_malloc(sizeof(struct nm_bdg_kthread) * bps->ncpus);
691 	if (bps->kthreads == NULL)
692 		return ENOMEM;
693 
694 	bzero(&kcfg, sizeof(kcfg));
695 	kcfg.worker_fn = netmap_bwrap_polling;
696 	kcfg.use_kthread = 1;
697 	for (i = 0; i < bps->ncpus; i++) {
698 		struct nm_bdg_kthread *t = bps->kthreads + i;
699 		int all = (bps->ncpus == 1 &&
700 			bps->mode == NETMAP_POLLING_MODE_SINGLE_CPU);
701 		int affinity = bps->cpu_from + i;
702 
703 		t->bps = bps;
704 		t->qfirst = all ? bps->qfirst /* must be 0 */: affinity;
705 		t->qlast = all ? bps->qlast : t->qfirst + 1;
706 		D("kthread %d a:%u qf:%u ql:%u", i, affinity, t->qfirst,
707 			t->qlast);
708 
709 		kcfg.type = i;
710 		kcfg.worker_private = t;
711 		t->nmk = nm_os_kctx_create(&kcfg, NULL);
712 		if (t->nmk == NULL) {
713 			goto cleanup;
714 		}
715 		nm_os_kctx_worker_setaff(t->nmk, affinity);
716 	}
717 	return 0;
718 
719 cleanup:
720 	for (j = 0; j < i; j++) {
721 		struct nm_bdg_kthread *t = bps->kthreads + j;
722 		nm_os_kctx_destroy(t->nmk);
723 	}
724 	nm_os_free(bps->kthreads);
725 	return EFAULT;
726 }
727 
728 /* A variant of ptnetmap_start_kthreads() */
729 static int
730 nm_bdg_polling_start_kthreads(struct nm_bdg_polling_state *bps)
731 {
732 	int error, i, j;
733 
734 	if (!bps) {
735 		D("polling is not configured");
736 		return EFAULT;
737 	}
738 	bps->stopped = false;
739 
740 	for (i = 0; i < bps->ncpus; i++) {
741 		struct nm_bdg_kthread *t = bps->kthreads + i;
742 		error = nm_os_kctx_worker_start(t->nmk);
743 		if (error) {
744 			D("error in nm_kthread_start()");
745 			goto cleanup;
746 		}
747 	}
748 	return 0;
749 
750 cleanup:
751 	for (j = 0; j < i; j++) {
752 		struct nm_bdg_kthread *t = bps->kthreads + j;
753 		nm_os_kctx_worker_stop(t->nmk);
754 	}
755 	bps->stopped = true;
756 	return error;
757 }
758 
759 static void
760 nm_bdg_polling_stop_delete_kthreads(struct nm_bdg_polling_state *bps)
761 {
762 	int i;
763 
764 	if (!bps)
765 		return;
766 
767 	for (i = 0; i < bps->ncpus; i++) {
768 		struct nm_bdg_kthread *t = bps->kthreads + i;
769 		nm_os_kctx_worker_stop(t->nmk);
770 		nm_os_kctx_destroy(t->nmk);
771 	}
772 	bps->stopped = true;
773 }
774 
775 static int
776 get_polling_cfg(struct nmreq_vale_polling *req, struct netmap_adapter *na,
777 		struct nm_bdg_polling_state *bps)
778 {
779 	unsigned int avail_cpus, core_from;
780 	unsigned int qfirst, qlast;
781 	uint32_t i = req->nr_first_cpu_id;
782 	uint32_t req_cpus = req->nr_num_polling_cpus;
783 
784 	avail_cpus = nm_os_ncpus();
785 
786 	if (req_cpus == 0) {
787 		D("req_cpus must be > 0");
788 		return EINVAL;
789 	} else if (req_cpus >= avail_cpus) {
790 		D("Cannot use all the CPUs in the system");
791 		return EINVAL;
792 	}
793 
794 	if (req->nr_mode == NETMAP_POLLING_MODE_MULTI_CPU) {
795 		/* Use a separate core for each ring. If nr_num_polling_cpus>1
796 		 * more consecutive rings are polled.
797 		 * For example, if nr_first_cpu_id=2 and nr_num_polling_cpus=2,
798 		 * ring 2 and 3 are polled by core 2 and 3, respectively. */
799 		if (i + req_cpus > nma_get_nrings(na, NR_RX)) {
800 			D("Rings %u-%u not in range (have %d rings)",
801 				i, i + req_cpus, nma_get_nrings(na, NR_RX));
802 			return EINVAL;
803 		}
804 		qfirst = i;
805 		qlast = qfirst + req_cpus;
806 		core_from = qfirst;
807 
808 	} else if (req->nr_mode == NETMAP_POLLING_MODE_SINGLE_CPU) {
809 		/* Poll all the rings using a core specified by nr_first_cpu_id.
810 		 * the number of cores must be 1. */
811 		if (req_cpus != 1) {
812 			D("ncpus must be 1 for NETMAP_POLLING_MODE_SINGLE_CPU "
813 				"(was %d)", req_cpus);
814 			return EINVAL;
815 		}
816 		qfirst = 0;
817 		qlast = nma_get_nrings(na, NR_RX);
818 		core_from = i;
819 	} else {
820 		D("Invalid polling mode");
821 		return EINVAL;
822 	}
823 
824 	bps->mode = req->nr_mode;
825 	bps->qfirst = qfirst;
826 	bps->qlast = qlast;
827 	bps->cpu_from = core_from;
828 	bps->ncpus = req_cpus;
829 	D("%s qfirst %u qlast %u cpu_from %u ncpus %u",
830 		req->nr_mode == NETMAP_POLLING_MODE_MULTI_CPU ?
831 		"MULTI" : "SINGLE",
832 		qfirst, qlast, core_from, req_cpus);
833 	return 0;
834 }
835 
836 static int
837 nm_bdg_ctl_polling_start(struct nmreq_vale_polling *req, struct netmap_adapter *na)
838 {
839 	struct nm_bdg_polling_state *bps;
840 	struct netmap_bwrap_adapter *bna;
841 	int error;
842 
843 	bna = (struct netmap_bwrap_adapter *)na;
844 	if (bna->na_polling_state) {
845 		D("ERROR adapter already in polling mode");
846 		return EFAULT;
847 	}
848 
849 	bps = nm_os_malloc(sizeof(*bps));
850 	if (!bps)
851 		return ENOMEM;
852 	bps->configured = false;
853 	bps->stopped = true;
854 
855 	if (get_polling_cfg(req, na, bps)) {
856 		nm_os_free(bps);
857 		return EINVAL;
858 	}
859 
860 	if (nm_bdg_create_kthreads(bps)) {
861 		nm_os_free(bps);
862 		return EFAULT;
863 	}
864 
865 	bps->configured = true;
866 	bna->na_polling_state = bps;
867 	bps->bna = bna;
868 
869 	/* disable interrupts if possible */
870 	nma_intr_enable(bna->hwna, 0);
871 	/* start kthread now */
872 	error = nm_bdg_polling_start_kthreads(bps);
873 	if (error) {
874 		D("ERROR nm_bdg_polling_start_kthread()");
875 		nm_os_free(bps->kthreads);
876 		nm_os_free(bps);
877 		bna->na_polling_state = NULL;
878 		nma_intr_enable(bna->hwna, 1);
879 	}
880 	return error;
881 }
882 
883 static int
884 nm_bdg_ctl_polling_stop(struct netmap_adapter *na)
885 {
886 	struct netmap_bwrap_adapter *bna = (struct netmap_bwrap_adapter *)na;
887 	struct nm_bdg_polling_state *bps;
888 
889 	if (!bna->na_polling_state) {
890 		D("ERROR adapter is not in polling mode");
891 		return EFAULT;
892 	}
893 	bps = bna->na_polling_state;
894 	nm_bdg_polling_stop_delete_kthreads(bna->na_polling_state);
895 	bps->configured = false;
896 	nm_os_free(bps);
897 	bna->na_polling_state = NULL;
898 	/* reenable interrupts */
899 	nma_intr_enable(bna->hwna, 1);
900 	return 0;
901 }
902 
903 int
904 nm_bdg_polling(struct nmreq_header *hdr)
905 {
906 	struct nmreq_vale_polling *req =
907 		(struct nmreq_vale_polling *)(uintptr_t)hdr->nr_body;
908 	struct netmap_adapter *na = NULL;
909 	int error = 0;
910 
911 	NMG_LOCK();
912 	error = netmap_get_vale_na(hdr, &na, NULL, /*create=*/0);
913 	if (na && !error) {
914 		if (!nm_is_bwrap(na)) {
915 			error = EOPNOTSUPP;
916 		} else if (hdr->nr_reqtype == NETMAP_BDG_POLLING_ON) {
917 			error = nm_bdg_ctl_polling_start(req, na);
918 			if (!error)
919 				netmap_adapter_get(na);
920 		} else {
921 			error = nm_bdg_ctl_polling_stop(na);
922 			if (!error)
923 				netmap_adapter_put(na);
924 		}
925 		netmap_adapter_put(na);
926 	} else if (!na && !error) {
927 		/* Not VALE port. */
928 		error = EINVAL;
929 	}
930 	NMG_UNLOCK();
931 
932 	return error;
933 }
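
/*
 * For reference, a user-space sketch (not part of this module) that asks
 * for single-CPU polling on an attached port; the port name and CPU id are
 * arbitrary examples and error handling is omitted:
 *
 *	struct nmreq_header hdr;
 *	struct nmreq_vale_polling req;
 *
 *	memset(&hdr, 0, sizeof(hdr));
 *	memset(&req, 0, sizeof(req));
 *	hdr.nr_version = NETMAP_API;
 *	hdr.nr_reqtype = NETMAP_REQ_VALE_POLLING_ENABLE;
 *	strlcpy(hdr.nr_name, "valeA:em0", sizeof(hdr.nr_name));
 *	hdr.nr_body = (uintptr_t)&req;
 *	req.nr_mode = NETMAP_POLLING_MODE_SINGLE_CPU;
 *	req.nr_first_cpu_id = 1;
 *	req.nr_num_polling_cpus = 1;
 *	ioctl(fd, NIOCCTRL, &hdr);
 */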
934 
935 /* Process NETMAP_REQ_VALE_LIST. */
936 int
937 netmap_bdg_list(struct nmreq_header *hdr)
938 {
939 	struct nmreq_vale_list *req =
940 		(struct nmreq_vale_list *)(uintptr_t)hdr->nr_body;
941 	int namelen = strlen(hdr->nr_name);
942 	struct nm_bridge *b, *bridges;
943 	struct netmap_vp_adapter *vpna;
944 	int error = 0, i, j;
945 	u_int num_bridges;
946 
947 	netmap_bns_getbridges(&bridges, &num_bridges);
948 
949 	/* this is used to enumerate bridges and ports */
950 	if (namelen) { /* look up indexes of bridge and port */
951 		if (strncmp(hdr->nr_name, NM_BDG_NAME,
952 					strlen(NM_BDG_NAME))) {
953 			return EINVAL;
954 		}
955 		NMG_LOCK();
956 		b = nm_find_bridge(hdr->nr_name, 0 /* don't create */, NULL);
957 		if (!b) {
958 			NMG_UNLOCK();
959 			return ENOENT;
960 		}
961 
962 		req->nr_bridge_idx = b - bridges; /* bridge index */
963 		req->nr_port_idx = NM_BDG_NOPORT;
964 		for (j = 0; j < b->bdg_active_ports; j++) {
965 			i = b->bdg_port_index[j];
966 			vpna = b->bdg_ports[i];
967 			if (vpna == NULL) {
968 				D("This should not happen");
969 				continue;
970 			}
971 			/* the former and the latter identify a
972 			 * virtual port and a NIC, respectively
973 			 */
974 			if (!strcmp(vpna->up.name, hdr->nr_name)) {
975 				req->nr_port_idx = i; /* port index */
976 				break;
977 			}
978 		}
979 		NMG_UNLOCK();
980 	} else {
981 		/* return the first non-empty entry starting from
982 		 * bridge nr_bridge_idx and port nr_port_idx.
983 		 *
984 		 * Users can detect the end of the current bridge by
985 		 * comparing the new and old value of nr_bridge_idx, and can
986 		 * detect the end of all the bridges by error != 0
987 		 */
988 		i = req->nr_bridge_idx;
989 		j = req->nr_port_idx;
990 
991 		NMG_LOCK();
992 		for (error = ENOENT; i < NM_BRIDGES; i++) {
993 			b = bridges + i;
994 			for ( ; j < NM_BDG_MAXPORTS; j++) {
995 				if (b->bdg_ports[j] == NULL)
996 					continue;
997 				vpna = b->bdg_ports[j];
998 				/* write back the VALE switch name */
999 				strncpy(hdr->nr_name, vpna->up.name,
1000 					(size_t)IFNAMSIZ);
1001 				error = 0;
1002 				goto out;
1003 			}
1004 			j = 0; /* following bridges scan from 0 */
1005 		}
1006 	out:
1007 		req->nr_bridge_idx = i;
1008 		req->nr_port_idx = j;
1009 		NMG_UNLOCK();
1010 	}
1011 
1012 	return error;
1013 }
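
/*
 * For reference, a user-space sketch (not part of this module) of the
 * enumeration protocol above; error handling is omitted and the loop stops
 * on the first failing request (typically ENOENT):
 *
 *	struct nmreq_header hdr;
 *	struct nmreq_vale_list req;
 *
 *	memset(&hdr, 0, sizeof(hdr));
 *	memset(&req, 0, sizeof(req));
 *	hdr.nr_version = NETMAP_API;
 *	hdr.nr_reqtype = NETMAP_REQ_VALE_LIST;
 *	hdr.nr_body = (uintptr_t)&req;
 *	for (;;) {
 *		hdr.nr_name[0] = '\0';	// empty name selects enumeration
 *		if (ioctl(fd, NIOCCTRL, &hdr) != 0)
 *			break;
 *		printf("%s (bridge %u port %u)\n", hdr.nr_name,
 *			req.nr_bridge_idx, req.nr_port_idx);
 *		req.nr_port_idx++;	// ask for the next port
 *	}
 */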
1014 
1015 /* Called by external kernel modules (e.g., Openvswitch)
1016  * to set the configure/lookup/dtor functions of a VALE instance.
1017  * Register callbacks to the given bridge. 'name' may be just
1018  * bridge's name (including ':' if it is not just NM_BDG_NAME).
1019  *
1020  * Called without NMG_LOCK.
1021  */
1022 
1023 int
1024 netmap_bdg_regops(const char *name, struct netmap_bdg_ops *bdg_ops, void *private_data, void *auth_token)
1025 {
1026 	struct nm_bridge *b;
1027 	int error = 0;
1028 
1029 	NMG_LOCK();
1030 	b = nm_find_bridge(name, 0 /* don't create */, NULL);
1031 	if (!b) {
1032 		error = ENXIO;
1033 		goto unlock_regops;
1034 	}
1035 	if (!nm_bdg_valid_auth_token(b, auth_token)) {
1036 		error = EACCES;
1037 		goto unlock_regops;
1038 	}
1039 
1040 	BDG_WLOCK(b);
1041 	if (!bdg_ops) {
1042 		/* resetting the bridge */
1043 		bzero(b->ht, sizeof(struct nm_hash_ent) * NM_BDG_HASH);
1044 		b->bdg_ops = NULL;
1045 		b->private_data = b->ht;
1046 	} else {
1047 		/* modifying the bridge */
1048 		b->private_data = private_data;
1049 		b->bdg_ops = bdg_ops;
1050 	}
1051 	BDG_WUNLOCK(b);
1052 
1053 unlock_regops:
1054 	NMG_UNLOCK();
1055 	return error;
1056 }
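
/*
 * For reference, a sketch (not code from this module) of how an external
 * module might use the hook above; my_lookup(), my_config() and
 * my_private_state are hypothetical, and passing a NULL auth token assumes
 * the bridge is not owned exclusively by another user:
 *
 *	static struct netmap_bdg_ops my_ops = {
 *		.lookup	= my_lookup,	// replaces the learning-bridge logic
 *		.config	= my_config,
 *		.dtor	= NULL,
 *	};
 *
 *	error = netmap_bdg_regops("vale0:", &my_ops, my_private_state, NULL);
 *	...
 *	// restore the default behaviour:
 *	error = netmap_bdg_regops("vale0:", NULL, NULL, NULL);
 */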
1057 
1058 
1059 int
1060 netmap_bdg_config(struct nm_ifreq *nr)
1061 {
1062 	struct nm_bridge *b;
1063 	int error = EINVAL;
1064 
1065 	NMG_LOCK();
1066 	b = nm_find_bridge(nr->nifr_name, 0, NULL);
1067 	if (!b) {
1068 		NMG_UNLOCK();
1069 		return error;
1070 	}
1071 	NMG_UNLOCK();
1072 	/* Don't call config() with NMG_LOCK() held */
1073 	BDG_RLOCK(b);
1074 	if (b->bdg_ops->config != NULL)
1075 		error = b->bdg_ops->config(nr);
1076 	BDG_RUNLOCK(b);
1077 	return error;
1078 }
1079 
1080 
1081 /* nm_register callback for VALE ports */
1082 int
1083 netmap_vp_reg(struct netmap_adapter *na, int onoff)
1084 {
1085 	struct netmap_vp_adapter *vpna =
1086 		(struct netmap_vp_adapter*)na;
1087 	enum txrx t;
1088 	int i;
1089 
1090 	/* persistent ports may be put in netmap mode
1091 	 * before being attached to a bridge
1092 	 */
1093 	if (vpna->na_bdg)
1094 		BDG_WLOCK(vpna->na_bdg);
1095 	if (onoff) {
1096 		for_rx_tx(t) {
1097 			for (i = 0; i < netmap_real_rings(na, t); i++) {
1098 				struct netmap_kring *kring = NMR(na, t)[i];
1099 
1100 				if (nm_kring_pending_on(kring))
1101 					kring->nr_mode = NKR_NETMAP_ON;
1102 			}
1103 		}
1104 		if (na->active_fds == 0)
1105 			na->na_flags |= NAF_NETMAP_ON;
1106 		 /* XXX on FreeBSD, persistent VALE ports should also
1107 		 * toggle IFCAP_NETMAP in na->ifp (2014-03-16)
1108 		 */
1109 	} else {
1110 		if (na->active_fds == 0)
1111 			na->na_flags &= ~NAF_NETMAP_ON;
1112 		for_rx_tx(t) {
1113 			for (i = 0; i < netmap_real_rings(na, t); i++) {
1114 				struct netmap_kring *kring = NMR(na, t)[i];
1115 
1116 				if (nm_kring_pending_off(kring))
1117 					kring->nr_mode = NKR_NETMAP_OFF;
1118 			}
1119 		}
1120 	}
1121 	if (vpna->na_bdg)
1122 		BDG_WUNLOCK(vpna->na_bdg);
1123 	return 0;
1124 }
1125 
1126 
1127 /* rxsync code used by the VALE ports' nm_rxsync callback and also
1128  * internally by the bwrap
1129  */
1130 static int
1131 netmap_vp_rxsync_locked(struct netmap_kring *kring, int flags)
1132 {
1133 	struct netmap_adapter *na = kring->na;
1134 	struct netmap_ring *ring = kring->ring;
1135 	u_int nm_i, lim = kring->nkr_num_slots - 1;
1136 	u_int head = kring->rhead;
1137 	int n;
1138 
1139 	if (head > lim) {
1140 		D("ouch dangerous reset!!!");
1141 		n = netmap_ring_reinit(kring);
1142 		goto done;
1143 	}
1144 
1145 	/* First part, import newly received packets. */
1146 	/* actually nothing to do here, they are already in the kring */
1147 
1148 	/* Second part, skip past packets that userspace has released. */
1149 	nm_i = kring->nr_hwcur;
1150 	if (nm_i != head) {
1151 		/* consistency check, but nothing really important here */
1152 		for (n = 0; likely(nm_i != head); n++) {
1153 			struct netmap_slot *slot = &ring->slot[nm_i];
1154 			void *addr = NMB(na, slot);
1155 
1156 			if (addr == NETMAP_BUF_BASE(kring->na)) { /* bad buf */
1157 				D("bad buffer index %d, ignore ?",
1158 					slot->buf_idx);
1159 			}
1160 			slot->flags &= ~NS_BUF_CHANGED;
1161 			nm_i = nm_next(nm_i, lim);
1162 		}
1163 		kring->nr_hwcur = head;
1164 	}
1165 
1166 	n = 0;
1167 done:
1168 	return n;
1169 }
1170 
1171 /*
1172  * nm_rxsync callback for VALE ports,
1173  * called when a user process reads from a VALE switch.
1174  * Already protected against concurrent calls from userspace,
1175  * but we must acquire the queue's lock to protect against
1176  * writers on the same queue.
1177  */
1178 int
1179 netmap_vp_rxsync(struct netmap_kring *kring, int flags)
1180 {
1181 	int n;
1182 
1183 	mtx_lock(&kring->q_lock);
1184 	n = netmap_vp_rxsync_locked(kring, flags);
1185 	mtx_unlock(&kring->q_lock);
1186 	return n;
1187 }
1188 
1189 int
1190 netmap_bwrap_attach(const char *nr_name, struct netmap_adapter *hwna,
1191 		struct netmap_bdg_ops *ops)
1192 {
1193 	return ops->bwrap_attach(nr_name, hwna);
1194 }
1195 
1196 
1197 /* Bridge wrapper code (bwrap).
1198  * This is used to connect a non-VALE-port netmap_adapter (hwna) to a
1199  * VALE switch.
1200  * The main task is to swap the meaning of tx and rx rings to match the
1201  * expectations of the VALE switch code (see nm_bdg_flush).
1202  *
1203  * The bwrap works by interposing a netmap_bwrap_adapter between the
1204  * rest of the system and the hwna. The netmap_bwrap_adapter looks like
1205  * a netmap_vp_adapter to the rest of the system, but, internally, it
1206  * translates all callbacks to what the hwna expects.
1207  *
1208  * Note that we have to intercept callbacks coming from two sides:
1209  *
1210  *  - callbacks coming from the netmap module are intercepted by
1211  *    passing around the netmap_bwrap_adapter instead of the hwna
1212  *
1213  *  - callbacks coming from outside of the netmap module only know
1214  *    about the hwna. This, however, only happens in interrupt
1215  *    handlers, where only the hwna->nm_notify callback is called.
1216  *    What the bwrap does is to overwrite the hwna->nm_notify callback
1217  *    with its own netmap_bwrap_intr_notify.
1218  *    XXX This assumes that the hwna->nm_notify callback was the
1219  *    standard netmap_notify(), as it is the case for nic adapters.
1220  *    Any additional action performed by hwna->nm_notify will not be
1221  *    performed by netmap_bwrap_intr_notify.
1222  *
1223  * Additionally, the bwrap can optionally attach the host rings pair
1224  * of the wrapped adapter to a different port of the switch.
1225  */
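
/*
 * Concretely, after netmap_bwrap_krings_create_common() has run, the rings
 * are shared with tx/rx swapped (a simplified view, host rings aside):
 *
 *	NMR(na, NR_RX)[i]->ring == NMR(hwna, NR_TX)[i]->ring
 *	NMR(na, NR_TX)[i]->ring == NMR(hwna, NR_RX)[i]->ring
 *
 * so a packet that the switch places on the bwrap rx ring already sits on
 * the hwna tx ring and only needs a txsync (see netmap_bwrap_notify()) to
 * reach the wire, while a packet received by the NIC sits on the bwrap tx
 * ring, ready to be flushed into the switch (see netmap_bwrap_intr_notify()).
 */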
1226 
1227 
1228 static void
1229 netmap_bwrap_dtor(struct netmap_adapter *na)
1230 {
1231 	struct netmap_bwrap_adapter *bna = (struct netmap_bwrap_adapter*)na;
1232 	struct netmap_adapter *hwna = bna->hwna;
1233 	struct nm_bridge *b = bna->up.na_bdg,
1234 		*bh = bna->host.na_bdg;
1235 
1236 	if (bna->host.up.nm_mem)
1237 		netmap_mem_put(bna->host.up.nm_mem);
1238 
1239 	if (b) {
1240 		netmap_bdg_detach_common(b, bna->up.bdg_port,
1241 			    (bh ? bna->host.bdg_port : -1));
1242 	}
1243 
1244 	ND("na %p", na);
1245 	na->ifp = NULL;
1246 	bna->host.up.ifp = NULL;
1247 	hwna->na_vp = bna->saved_na_vp;
1248 	hwna->na_hostvp = NULL;
1249 	hwna->na_private = NULL;
1250 	hwna->na_flags &= ~NAF_BUSY;
1251 	netmap_adapter_put(hwna);
1252 
1253 }
1254 
1255 
1256 /*
1257  * Intr callback for NICs connected to a bridge.
1258  * Simply ignore tx interrupts (maybe we could try to recover space ?)
1259  * and pass received packets from nic to the bridge.
1260  *
1261  * XXX TODO check locking: this is called from the interrupt
1262  * handler so we should make sure that the interface is not
1263  * disconnected while passing down an interrupt.
1264  *
1265  * Note, no user process can access this NIC or the host stack.
1266  * The only part of the ring that is significant are the slots,
1267  * and head/cur/tail are set from the kring as needed
1268  * (part as a receive ring, part as a transmit ring).
1269  *
1270  * callback that overwrites the hwna notify callback.
1271  * Packets come from the outside or from the host stack and are put on an
1272  * hwna rx ring.
1273  * The bridge wrapper then sends the packets through the bridge.
1274  */
1275 static int
1276 netmap_bwrap_intr_notify(struct netmap_kring *kring, int flags)
1277 {
1278 	struct netmap_adapter *na = kring->na;
1279 	struct netmap_bwrap_adapter *bna = na->na_private;
1280 	struct netmap_kring *bkring;
1281 	struct netmap_vp_adapter *vpna = &bna->up;
1282 	u_int ring_nr = kring->ring_id;
1283 	int ret = NM_IRQ_COMPLETED;
1284 	int error;
1285 
1286 	if (netmap_verbose)
1287 	    D("%s %s 0x%x", na->name, kring->name, flags);
1288 
1289 	bkring = vpna->up.tx_rings[ring_nr];
1290 
1291 	/* make sure the ring is not disabled */
1292 	if (nm_kr_tryget(kring, 0 /* can't sleep */, NULL)) {
1293 		return EIO;
1294 	}
1295 
1296 	if (netmap_verbose)
1297 	    D("%s head %d cur %d tail %d",  na->name,
1298 		kring->rhead, kring->rcur, kring->rtail);
1299 
1300 	/* simulate a user wakeup on the rx ring
1301 	 * fetch packets that have arrived.
1302 	 */
1303 	error = kring->nm_sync(kring, 0);
1304 	if (error)
1305 		goto put_out;
1306 	if (kring->nr_hwcur == kring->nr_hwtail) {
1307 		if (netmap_verbose)
1308 			D("how strange, interrupt with no packets on %s",
1309 			    na->name);
1310 		goto put_out;
1311 	}
1312 
1313 	/* new packets are kring->rcur to kring->nr_hwtail, and the bkring
1314 	 * had hwcur == bkring->rhead. So advance bkring->rhead to kring->nr_hwtail
1315 	 * to push all packets out.
1316 	 */
1317 	bkring->rhead = bkring->rcur = kring->nr_hwtail;
1318 
1319 	bkring->nm_sync(bkring, flags);
1320 
1321 	/* mark all buffers as released on this ring */
1322 	kring->rhead = kring->rcur = kring->rtail = kring->nr_hwtail;
1323 	/* another call to actually release the buffers */
1324 	error = kring->nm_sync(kring, 0);
1325 
1326 	/* The second rxsync may have further advanced hwtail. If this happens,
1327 	 *  return NM_IRQ_RESCHED, otherwise just return NM_IRQ_COMPLETED. */
1328 	if (kring->rcur != kring->nr_hwtail) {
1329 		ret = NM_IRQ_RESCHED;
1330 	}
1331 put_out:
1332 	nm_kr_put(kring);
1333 
1334 	return error ? error : ret;
1335 }
1336 
1337 
1338 /* nm_register callback for bwrap */
1339 int
1340 netmap_bwrap_reg(struct netmap_adapter *na, int onoff)
1341 {
1342 	struct netmap_bwrap_adapter *bna =
1343 		(struct netmap_bwrap_adapter *)na;
1344 	struct netmap_adapter *hwna = bna->hwna;
1345 	struct netmap_vp_adapter *hostna = &bna->host;
1346 	int error, i;
1347 	enum txrx t;
1348 
1349 	ND("%s %s", na->name, onoff ? "on" : "off");
1350 
1351 	if (onoff) {
1352 		/* netmap_do_regif has been called on the bwrap na.
1353 		 * We need to pass the information about the
1354 		 * memory allocator down to the hwna before
1355 		 * putting it in netmap mode
1356 		 */
1357 		hwna->na_lut = na->na_lut;
1358 
1359 		if (hostna->na_bdg) {
1360 			/* if the host rings have been attached to switch,
1361 			 * we need to copy the memory allocator information
1362 			 * in the hostna also
1363 			 */
1364 			hostna->up.na_lut = na->na_lut;
1365 		}
1366 
1367 	}
1368 
1369 	/* pass down the pending ring state information */
1370 	for_rx_tx(t) {
1371 		for (i = 0; i < netmap_all_rings(na, t); i++) {
1372 			NMR(hwna, nm_txrx_swap(t))[i]->nr_pending_mode =
1373 				NMR(na, t)[i]->nr_pending_mode;
1374 		}
1375 	}
1376 
1377 	/* forward the request to the hwna */
1378 	error = hwna->nm_register(hwna, onoff);
1379 	if (error)
1380 		return error;
1381 
1382 	/* copy up the current ring state information */
1383 	for_rx_tx(t) {
1384 		for (i = 0; i < netmap_all_rings(na, t); i++) {
1385 			struct netmap_kring *kring = NMR(hwna, nm_txrx_swap(t))[i];
1386 			NMR(na, t)[i]->nr_mode = kring->nr_mode;
1387 		}
1388 	}
1389 
1390 	/* impersonate a netmap_vp_adapter */
1391 	netmap_vp_reg(na, onoff);
1392 	if (hostna->na_bdg)
1393 		netmap_vp_reg(&hostna->up, onoff);
1394 
1395 	if (onoff) {
1396 		u_int i;
1397 		/* intercept the hwna nm_notify callback on the hw rings */
1398 		for (i = 0; i < hwna->num_rx_rings; i++) {
1399 			hwna->rx_rings[i]->save_notify = hwna->rx_rings[i]->nm_notify;
1400 			hwna->rx_rings[i]->nm_notify = netmap_bwrap_intr_notify;
1401 		}
1402 		i = hwna->num_rx_rings; /* for safety */
1403 		/* save the host ring notify unconditionally */
1404 		for (; i < netmap_real_rings(hwna, NR_RX); i++) {
1405 			hwna->rx_rings[i]->save_notify =
1406 				hwna->rx_rings[i]->nm_notify;
1407 			if (hostna->na_bdg) {
1408 				/* also intercept the host ring notify */
1409 				hwna->rx_rings[i]->nm_notify =
1410 					netmap_bwrap_intr_notify;
1411 				na->tx_rings[i]->nm_sync = na->nm_txsync;
1412 			}
1413 		}
1414 		if (na->active_fds == 0)
1415 			na->na_flags |= NAF_NETMAP_ON;
1416 	} else {
1417 		u_int i;
1418 
1419 		if (na->active_fds == 0)
1420 			na->na_flags &= ~NAF_NETMAP_ON;
1421 
1422 		/* reset all notify callbacks (including host ring) */
1423 		for (i = 0; i < netmap_all_rings(hwna, NR_RX); i++) {
1424 			hwna->rx_rings[i]->nm_notify =
1425 				hwna->rx_rings[i]->save_notify;
1426 			hwna->rx_rings[i]->save_notify = NULL;
1427 		}
1428 		hwna->na_lut.lut = NULL;
1429 		hwna->na_lut.plut = NULL;
1430 		hwna->na_lut.objtotal = 0;
1431 		hwna->na_lut.objsize = 0;
1432 
1433 		/* pass ownership of the netmap rings to the hwna */
1434 		for_rx_tx(t) {
1435 			for (i = 0; i < netmap_all_rings(na, t); i++) {
1436 				NMR(na, t)[i]->ring = NULL;
1437 			}
1438 		}
1439 		/* reset the number of host rings to default */
1440 		for_rx_tx(t) {
1441 			nma_set_host_nrings(hwna, t, 1);
1442 		}
1443 
1444 	}
1445 
1446 	return 0;
1447 }
1448 
1449 /* nm_config callback for bwrap */
1450 static int
1451 netmap_bwrap_config(struct netmap_adapter *na, struct nm_config_info *info)
1452 {
1453 	struct netmap_bwrap_adapter *bna =
1454 		(struct netmap_bwrap_adapter *)na;
1455 	struct netmap_adapter *hwna = bna->hwna;
1456 	int error;
1457 
1458 	/* Forward the request to the hwna. It may happen that nobody
1459 	 * registered hwna yet, so netmap_mem_get_lut() may not have
1460 	 * been called yet. */
1461 	error = netmap_mem_get_lut(hwna->nm_mem, &hwna->na_lut);
1462 	if (error)
1463 		return error;
1464 	netmap_update_config(hwna);
1465 	/* swap the results and propagate */
1466 	info->num_tx_rings = hwna->num_rx_rings;
1467 	info->num_tx_descs = hwna->num_rx_desc;
1468 	info->num_rx_rings = hwna->num_tx_rings;
1469 	info->num_rx_descs = hwna->num_tx_desc;
1470 	info->rx_buf_maxsize = hwna->rx_buf_maxsize;
1471 
1472 	return 0;
1473 }
1474 
1475 
1476 /* nm_krings_create callback for bwrap */
1477 int
1478 netmap_bwrap_krings_create_common(struct netmap_adapter *na)
1479 {
1480 	struct netmap_bwrap_adapter *bna =
1481 		(struct netmap_bwrap_adapter *)na;
1482 	struct netmap_adapter *hwna = bna->hwna;
1483 	struct netmap_adapter *hostna = &bna->host.up;
1484 	int i, error = 0;
1485 	enum txrx t;
1486 
1487 	/* also create the hwna krings */
1488 	error = hwna->nm_krings_create(hwna);
1489 	if (error) {
1490 		return error;
1491 	}
1492 
1493 	/* increment the usage counter for all the hwna krings */
1494 	for_rx_tx(t) {
1495 		for (i = 0; i < netmap_all_rings(hwna, t); i++) {
1496 			NMR(hwna, t)[i]->users++;
1497 		}
1498 	}
1499 
1500 	/* now create the actual rings */
1501 	error = netmap_mem_rings_create(hwna);
1502 	if (error) {
1503 		goto err_dec_users;
1504 	}
1505 
1506 	/* cross-link the netmap rings
1507 	 * The original number of rings comes from hwna,
1508 	 * rx rings on one side match tx rings on the other.
1509 	 */
1510 	for_rx_tx(t) {
1511 		enum txrx r = nm_txrx_swap(t); /* swap NR_TX <-> NR_RX */
1512 		for (i = 0; i < netmap_all_rings(hwna, r); i++) {
1513 			NMR(na, t)[i]->nkr_num_slots = NMR(hwna, r)[i]->nkr_num_slots;
1514 			NMR(na, t)[i]->ring = NMR(hwna, r)[i]->ring;
1515 		}
1516 	}
1517 
1518 	if (na->na_flags & NAF_HOST_RINGS) {
1519 		/* the hostna rings are the host rings of the bwrap.
1520 		 * The corresponding krings must point back to the
1521 		 * hostna
1522 		 */
1523 		hostna->tx_rings = &na->tx_rings[na->num_tx_rings];
1524 		hostna->rx_rings = &na->rx_rings[na->num_rx_rings];
1525 		for_rx_tx(t) {
1526 			for (i = 0; i < nma_get_nrings(hostna, t); i++) {
1527 				NMR(hostna, t)[i]->na = hostna;
1528 			}
1529 		}
1530 	}
1531 
1532 	return 0;
1533 
1534 err_dec_users:
1535 	for_rx_tx(t) {
1536 		for (i = 0; i < netmap_all_rings(hwna, t); i++) {
1537 			NMR(hwna, t)[i]->users--;
1538 		}
1539 	}
1540 	hwna->nm_krings_delete(hwna);
1541 	return error;
1542 }
1543 
1544 
1545 void
1546 netmap_bwrap_krings_delete_common(struct netmap_adapter *na)
1547 {
1548 	struct netmap_bwrap_adapter *bna =
1549 		(struct netmap_bwrap_adapter *)na;
1550 	struct netmap_adapter *hwna = bna->hwna;
1551 	enum txrx t;
1552 	int i;
1553 
1554 	ND("%s", na->name);
1555 
1556 	/* decrement the usage counter for all the hwna krings */
1557 	for_rx_tx(t) {
1558 		for (i = 0; i < netmap_all_rings(hwna, t); i++) {
1559 			NMR(hwna, t)[i]->users--;
1560 		}
1561 	}
1562 
1563 	/* delete any netmap rings that are no longer needed */
1564 	netmap_mem_rings_delete(hwna);
1565 	hwna->nm_krings_delete(hwna);
1566 }
1567 
1568 
1569 /* notify method for the bridge-->hwna direction */
1570 int
1571 netmap_bwrap_notify(struct netmap_kring *kring, int flags)
1572 {
1573 	struct netmap_adapter *na = kring->na;
1574 	struct netmap_bwrap_adapter *bna = na->na_private;
1575 	struct netmap_adapter *hwna = bna->hwna;
1576 	u_int ring_n = kring->ring_id;
1577 	u_int lim = kring->nkr_num_slots - 1;
1578 	struct netmap_kring *hw_kring;
1579 	int error;
1580 
1581 	ND("%s: na %s hwna %s",
1582 			(kring ? kring->name : "NULL!"),
1583 			(na ? na->name : "NULL!"),
1584 			(hwna ? hwna->name : "NULL!"));
1585 	hw_kring = hwna->tx_rings[ring_n];
1586 
1587 	if (nm_kr_tryget(hw_kring, 0, NULL)) {
1588 		return ENXIO;
1589 	}
1590 
1591 	/* first step: simulate a user wakeup on the rx ring */
1592 	netmap_vp_rxsync(kring, flags);
1593 	ND("%s[%d] PRE rx(c%3d t%3d l%3d) ring(h%3d c%3d t%3d) tx(c%3d ht%3d t%3d)",
1594 		na->name, ring_n,
1595 		kring->nr_hwcur, kring->nr_hwtail, kring->nkr_hwlease,
1596 		ring->head, ring->cur, ring->tail,
1597 		hw_kring->nr_hwcur, hw_kring->nr_hwtail, hw_kring->rtail);
1598 	/* second step: the new packets are sent on the tx ring
1599 	 * (which is actually the same ring)
1600 	 */
1601 	hw_kring->rhead = hw_kring->rcur = kring->nr_hwtail;
1602 	error = hw_kring->nm_sync(hw_kring, flags);
1603 	if (error)
1604 		goto put_out;
1605 
1606 	/* third step: now we are back on the rx ring */
1607 	/* claim ownership on all hw owned bufs */
1608 	kring->rhead = kring->rcur = nm_next(hw_kring->nr_hwtail, lim); /* skip past reserved slot */
1609 
1610 	/* fourth step: the user goes to sleep again, causing another rxsync */
1611 	netmap_vp_rxsync(kring, flags);
1612 	ND("%s[%d] PST rx(c%3d t%3d l%3d) ring(h%3d c%3d t%3d) tx(c%3d ht%3d t%3d)",
1613 		na->name, ring_n,
1614 		kring->nr_hwcur, kring->nr_hwtail, kring->nkr_hwlease,
1615 		ring->head, ring->cur, ring->tail,
1616 		hw_kring->nr_hwcur, hw_kring->nr_hwtail, hw_kring->rtail);
1617 put_out:
1618 	nm_kr_put(hw_kring);
1619 
1620 	return error ? error : NM_IRQ_COMPLETED;
1621 }
1622 
1623 
1624 /* nm_bdg_ctl callback for the bwrap.
1625  * Called on bridge-attach and detach, as an effect of vale-ctl -[ahd].
1626  * On attach, it needs to provide a fake netmap_priv_d structure and
1627  * perform a netmap_do_regif() on the bwrap. This will put both the
1628  * bwrap and the hwna in netmap mode, with the netmap rings shared
1629  * and cross linked. Moreover, it will start intercepting interrupts
1630  * directed to hwna.
1631  */
1632 static int
1633 netmap_bwrap_bdg_ctl(struct nmreq_header *hdr, struct netmap_adapter *na)
1634 {
1635 	struct netmap_priv_d *npriv;
1636 	struct netmap_bwrap_adapter *bna = (struct netmap_bwrap_adapter*)na;
1637 	int error = 0;
1638 
1639 	if (hdr->nr_reqtype == NETMAP_REQ_VALE_ATTACH) {
1640 		struct nmreq_vale_attach *req =
1641 			(struct nmreq_vale_attach *)(uintptr_t)hdr->nr_body;
1642 		if (req->reg.nr_ringid != 0 ||
1643 			(req->reg.nr_mode != NR_REG_ALL_NIC &&
1644 				req->reg.nr_mode != NR_REG_NIC_SW)) {
1645 			/* We only support attaching all the NIC rings
1646 			 * and/or the host stack. */
1647 			return EINVAL;
1648 		}
1649 		if (NETMAP_OWNED_BY_ANY(na)) {
1650 			return EBUSY;
1651 		}
1652 		if (bna->na_kpriv) {
1653 			/* nothing to do */
1654 			return 0;
1655 		}
1656 		npriv = netmap_priv_new();
1657 		if (npriv == NULL)
1658 			return ENOMEM;
1659 		npriv->np_ifp = na->ifp; /* let the priv destructor release the ref */
1660 		error = netmap_do_regif(npriv, na, req->reg.nr_mode,
1661 					req->reg.nr_ringid, req->reg.nr_flags);
1662 		if (error) {
1663 			netmap_priv_delete(npriv);
1664 			return error;
1665 		}
1666 		bna->na_kpriv = npriv;
1667 		na->na_flags |= NAF_BUSY;
1668 	} else {
1669 		if (na->active_fds == 0) /* not registered */
1670 			return EINVAL;
1671 		netmap_priv_delete(bna->na_kpriv);
1672 		bna->na_kpriv = NULL;
1673 		na->na_flags &= ~NAF_BUSY;
1674 	}
1675 
1676 	return error;
1677 }
1678 
1679 /* attach a bridge wrapper to the 'real' device */
1680 int
1681 netmap_bwrap_attach_common(struct netmap_adapter *na,
1682 		struct netmap_adapter *hwna)
1683 {
1684 	struct netmap_bwrap_adapter *bna;
1685 	struct netmap_adapter *hostna = NULL;
1686 	int error = 0;
1687 	enum txrx t;
1688 
1689 	/* make sure the NIC is not already in use */
1690 	if (NETMAP_OWNED_BY_ANY(hwna)) {
1691 		D("NIC %s busy, cannot attach to bridge", hwna->name);
1692 		return EBUSY;
1693 	}
1694 
1695 	bna = (struct netmap_bwrap_adapter *)na;
1696 	/* make bwrap ifp point to the real ifp */
1697 	na->ifp = hwna->ifp;
1698 	if_ref(na->ifp);
1699 	na->na_private = bna;
1700 	/* fill the ring data for the bwrap adapter with rx/tx meanings
1701 	 * swapped. The real cross-linking will be done during register,
1702 	 * when all the krings will have been created.
1703 	 */
1704 	for_rx_tx(t) {
1705 		enum txrx r = nm_txrx_swap(t); /* swap NR_TX <-> NR_RX */
1706 		nma_set_nrings(na, t, nma_get_nrings(hwna, r));
1707 		nma_set_ndesc(na, t, nma_get_ndesc(hwna, r));
1708 	}
1709 	na->nm_dtor = netmap_bwrap_dtor;
1710 	na->nm_config = netmap_bwrap_config;
1711 	na->nm_bdg_ctl = netmap_bwrap_bdg_ctl;
1712 	na->pdev = hwna->pdev;
1713 	na->nm_mem = netmap_mem_get(hwna->nm_mem);
1714 	na->virt_hdr_len = hwna->virt_hdr_len;
1715 	na->rx_buf_maxsize = hwna->rx_buf_maxsize;
1716 
1717 	bna->hwna = hwna;
1718 	netmap_adapter_get(hwna);
1719 	hwna->na_private = bna; /* weak reference */
1720 	bna->saved_na_vp = hwna->na_vp;
1721 	hwna->na_vp = &bna->up;
1722 	bna->up.up.na_vp = &(bna->up);
1723 
1724 	if (hwna->na_flags & NAF_HOST_RINGS) {
1725 		if (hwna->na_flags & NAF_SW_ONLY)
1726 			na->na_flags |= NAF_SW_ONLY;
1727 		na->na_flags |= NAF_HOST_RINGS;
1728 		hostna = &bna->host.up;
1729 
1730 		/* limit the number of host rings to that of hw */
1731 		nm_bound_var(&hostna->num_tx_rings, 1, 1,
1732 				nma_get_nrings(hwna, NR_TX), NULL);
1733 		nm_bound_var(&hostna->num_rx_rings, 1, 1,
1734 				nma_get_nrings(hwna, NR_RX), NULL);
1735 
1736 		snprintf(hostna->name, sizeof(hostna->name), "%s^", na->name);
1737 		hostna->ifp = hwna->ifp;
1738 		for_rx_tx(t) {
1739 			enum txrx r = nm_txrx_swap(t);
1740 			u_int nr = nma_get_nrings(hostna, t);
1741 
1742 			nma_set_nrings(hostna, t, nr);
1743 			nma_set_host_nrings(na, t, nr);
1744 			if (nma_get_host_nrings(hwna, t) < nr) {
1745 				nma_set_host_nrings(hwna, t, nr);
1746 			}
1747 			nma_set_ndesc(hostna, t, nma_get_ndesc(hwna, r));
1748 		}
1749 		// hostna->nm_txsync = netmap_bwrap_host_txsync;
1750 		// hostna->nm_rxsync = netmap_bwrap_host_rxsync;
1751 		hostna->nm_mem = netmap_mem_get(na->nm_mem);
1752 		hostna->na_private = bna;
1753 		hostna->na_vp = &bna->up;
1754 		na->na_hostvp = hwna->na_hostvp =
1755 			hostna->na_hostvp = &bna->host;
1756 		hostna->na_flags = NAF_BUSY; /* prevent NIOCREGIF */
1757 		hostna->rx_buf_maxsize = hwna->rx_buf_maxsize;
1758 	}
1759 
1760 	ND("%s<->%s txr %d txd %d rxr %d rxd %d",
1761 		na->name, na->ifp->if_xname,
1762 		na->num_tx_rings, na->num_tx_desc,
1763 		na->num_rx_rings, na->num_rx_desc);
1764 
1765 	error = netmap_attach_common(na);
1766 	if (error) {
1767 		goto err_put;
1768 	}
1769 	hwna->na_flags |= NAF_BUSY;
1770 	return 0;
1771 
1772 err_put:
1773 	hwna->na_vp = hwna->na_hostvp = NULL;
1774 	netmap_adapter_put(hwna);
1775 	return error;
1776 
1777 }
1778 
1779 struct nm_bridge *
1780 netmap_init_bridges2(u_int n)
1781 {
1782 	int i;
1783 	struct nm_bridge *b;
1784 
1785 	b = nm_os_malloc(sizeof(struct nm_bridge) * n);
1786 	if (b == NULL)
1787 		return NULL;
1788 	for (i = 0; i < n; i++)
1789 		BDG_RWINIT(&b[i]);
1790 	return b;
1791 }
1792 
1793 void
1794 netmap_uninit_bridges2(struct nm_bridge *b, u_int n)
1795 {
1796 	int i;
1797 
1798 	if (b == NULL)
1799 		return;
1800 
1801 	for (i = 0; i < n; i++)
1802 		BDG_RWDESTROY(&b[i]);
1803 	nm_os_free(b);
1804 }
1805 
1806 int
1807 netmap_init_bridges(void)
1808 {
1809 #ifdef CONFIG_NET_NS
1810 	return netmap_bns_register();
1811 #else
1812 	nm_bridges = netmap_init_bridges2(NM_BRIDGES);
1813 	if (nm_bridges == NULL)
1814 		return ENOMEM;
1815 	return 0;
1816 #endif
1817 }
1818 
1819 void
1820 netmap_uninit_bridges(void)
1821 {
1822 #ifdef CONFIG_NET_NS
1823 	netmap_bns_unregister();
1824 #else
1825 	netmap_uninit_bridges2(nm_bridges, NM_BRIDGES);
1826 #endif
1827 }
1828