/*
 * Copyright (C) 2013-2016 Universita` di Pisa
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *   1. Redistributions of source code must retain the above copyright
 *      notice, this list of conditions and the following disclaimer.
 *   2. Redistributions in binary form must reproduce the above copyright
 *      notice, this list of conditions and the following disclaimer in the
 *      documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */


/*
 * This module implements the VALE switch for netmap

--- VALE SWITCH ---

NMG_LOCK() serializes all modifications to switches and ports.
A switch cannot be deleted until all ports are gone.

For each switch, an SX lock (RWlock on linux) protects
deletion of ports. When a new port is configured or an existing
one is deleted, the lock is acquired in exclusive mode (after
holding NMG_LOCK).
When forwarding, the lock is acquired in shared mode (without NMG_LOCK).
The lock is held throughout the entire forwarding cycle,
during which the thread may incur a page fault.
Hence it is important that sleepable shared locks are used.

On the rx ring, the per-port lock is grabbed initially to reserve
a number of slots in the ring, then the lock is released,
packets are copied from source to destination, and then
the lock is acquired again and the receive ring is updated.
(A similar thing is done on the tx ring for NIC and host stack
ports attached to the switch)

 */
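
/*
 * A minimal sketch of the two locking paths described above, using the
 * macros defined for this file (the surrounding logic is simplified and
 * illustrative only):
 *
 *	// control path: add or remove a port (serialized by NMG_LOCK)
 *	NMG_LOCK();
 *	b = nm_find_bridge(name, 1 /. create ./, ops);
 *	BDG_WLOCK(b);			// exclusive
 *	// ... attach or detach the port ...
 *	BDG_WUNLOCK(b);
 *	NMG_UNLOCK();
 *
 *	// data path: forwarding (NMG_LOCK not taken)
 *	BDG_RLOCK(b);			// shared, sleepable
 *	// ... lookup destinations and copy packets ...
 *	BDG_RUNLOCK(b);
 */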

/*
 * OS-specific code that is used only within this file.
 * Other OS-specific code that must be accessed by drivers
 * is present in netmap_kern.h
 */

#if defined(__FreeBSD__)
#include <sys/cdefs.h> /* prerequisite */
__FBSDID("$FreeBSD$");

#include <sys/types.h>
#include <sys/errno.h>
#include <sys/param.h>	/* defines used in kernel.h */
#include <sys/kernel.h>	/* types used in module initialization */
#include <sys/conf.h>	/* cdevsw struct, UID, GID */
#include <sys/sockio.h>
#include <sys/socketvar.h>	/* struct socket */
#include <sys/malloc.h>
#include <sys/poll.h>
#include <sys/rwlock.h>
#include <sys/socket.h> /* sockaddrs */
#include <sys/selinfo.h>
#include <sys/sysctl.h>
#include <net/if.h>
#include <net/if_var.h>
#include <net/bpf.h>		/* BIOCIMMEDIATE */
#include <machine/bus.h>	/* bus_dmamap_* */
#include <sys/endian.h>
#include <sys/refcount.h>
#include <sys/smp.h>


#elif defined(linux)

#include "bsd_glue.h"

#elif defined(__APPLE__)

#warning OSX support is only partial
#include "osx_glue.h"

#elif defined(_WIN32)
#include "win_glue.h"

#else

#error	Unsupported platform

#endif /* unsupported */

/*
 * common headers
 */

#include <net/netmap.h>
#include <dev/netmap/netmap_kern.h>
#include <dev/netmap/netmap_mem2.h>

#include <dev/netmap/netmap_bdg.h>

const char*
netmap_bdg_name(struct netmap_vp_adapter *vp)
{
	struct nm_bridge *b = vp->na_bdg;
	if (b == NULL)
		return NULL;
	return b->bdg_basename;
}


#ifndef CONFIG_NET_NS
/*
 * XXX in principle nm_bridges could be created dynamically
 * Right now we have a static array and deletions are protected
 * by an exclusive lock.
 */
struct nm_bridge *nm_bridges;
#endif /* !CONFIG_NET_NS */


static int
nm_is_id_char(const char c)
{
	return (c >= 'a' && c <= 'z') ||
	       (c >= 'A' && c <= 'Z') ||
	       (c >= '0' && c <= '9') ||
	       (c == '_');
}

/* Validate the name of a bdg port and return the
 * position of the ":" character. */
static int
nm_bdg_name_validate(const char *name, size_t prefixlen)
{
	int colon_pos = -1;
	int i;

	if (!name || strlen(name) < prefixlen) {
		return -1;
	}

	for (i = 0; i < NM_BDG_IFNAMSIZ && name[i]; i++) {
		if (name[i] == ':') {
			colon_pos = i;
			break;
		} else if (!nm_is_id_char(name[i])) {
			return -1;
		}
	}

	if (strlen(name) - colon_pos > IFNAMSIZ) {
		/* interface name too long */
		return -1;
	}

	return colon_pos;
}
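
/*
 * Example (illustrative): with name = "vale0:eth0" and prefixlen = 4,
 * the loop above stops at the ':' found in position 5, the "eth0"
 * suffix fits within IFNAMSIZ, and the function returns 5. A name
 * containing a character outside [a-zA-Z0-9_] before the ':' is
 * rejected with -1.
 */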

/*
 * locate a bridge among the existing ones.
 * MUST BE CALLED WITH NMG_LOCK()
 *
 * a ':' in the name terminates the bridge name. Otherwise, just NM_NAME.
 * We assume that this is called with a name of at least NM_NAME chars.
 */
struct nm_bridge *
nm_find_bridge(const char *name, int create, struct netmap_bdg_ops *ops)
{
	int i, namelen;
	struct nm_bridge *b = NULL, *bridges;
	u_int num_bridges;

	NMG_LOCK_ASSERT();

	netmap_bns_getbridges(&bridges, &num_bridges);

	namelen = nm_bdg_name_validate(name,
			(ops != NULL ? strlen(ops->name) : 0));
	if (namelen < 0) {
		nm_prerr("invalid bridge name %s", name ? name : NULL);
		return NULL;
	}

	/* lookup the name, remember empty slot if there is one */
	for (i = 0; i < num_bridges; i++) {
		struct nm_bridge *x = bridges + i;

		if ((x->bdg_flags & NM_BDG_ACTIVE) + x->bdg_active_ports == 0) {
			if (create && b == NULL)
				b = x;	/* record empty slot */
		} else if (x->bdg_namelen != namelen) {
			continue;
		} else if (strncmp(name, x->bdg_basename, namelen) == 0) {
			ND("found '%.*s' at %d", namelen, name, i);
			b = x;
			break;
		}
	}
	if (i == num_bridges && b) { /* name not found, can create entry */
		/* initialize the bridge */
		ND("create new bridge %s with ports %d", b->bdg_basename,
			b->bdg_active_ports);
		b->ht = nm_os_malloc(sizeof(struct nm_hash_ent) * NM_BDG_HASH);
		if (b->ht == NULL) {
			nm_prerr("failed to allocate hash table");
			return NULL;
		}
		strncpy(b->bdg_basename, name, namelen);
		b->bdg_namelen = namelen;
		b->bdg_active_ports = 0;
		for (i = 0; i < NM_BDG_MAXPORTS; i++)
			b->bdg_port_index[i] = i;
		/* set the default function */
		b->bdg_ops = b->bdg_saved_ops = *ops;
		b->private_data = b->ht;
		b->bdg_flags = 0;
		NM_BNS_GET(b);
	}
	return b;
}


int
netmap_bdg_free(struct nm_bridge *b)
{
	if ((b->bdg_flags & NM_BDG_ACTIVE) + b->bdg_active_ports != 0) {
		return EBUSY;
	}

	ND("marking bridge %s as free", b->bdg_basename);
	nm_os_free(b->ht);
	memset(&b->bdg_ops, 0, sizeof(b->bdg_ops));
	memset(&b->bdg_saved_ops, 0, sizeof(b->bdg_saved_ops));
	b->bdg_flags = 0;
	NM_BNS_PUT(b);
	return 0;
}

/* Called by external kernel modules (e.g., Openvswitch)
 * to modify the private data previously given to regops().
 * 'name' may be just the bridge's name (including ':' if it
 * is not just NM_BDG_NAME).
 * Called without NMG_LOCK.
 */
int
netmap_bdg_update_private_data(const char *name, bdg_update_private_data_fn_t callback,
	void *callback_data, void *auth_token)
{
	void *private_data = NULL;
	struct nm_bridge *b;
	int error = 0;

	NMG_LOCK();
	b = nm_find_bridge(name, 0 /* don't create */, NULL);
	if (!b) {
		error = EINVAL;
		goto unlock_update_priv;
	}
	if (!nm_bdg_valid_auth_token(b, auth_token)) {
		error = EACCES;
		goto unlock_update_priv;
	}
	BDG_WLOCK(b);
	private_data = callback(b->private_data, callback_data, &error);
	b->private_data = private_data;
	BDG_WUNLOCK(b);

unlock_update_priv:
	NMG_UNLOCK();
	return error;
}
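
/*
 * A hypothetical caller sketch (my_update() and my_state are made-up
 * names, not part of this API). The callback receives the old private
 * data and must return the new one, as seen at the call site above:
 *
 *	static void *
 *	my_update(void *old_private, void *cb_data, int *error)
 *	{
 *		struct my_state *new_state = cb_data;
 *
 *		*error = 0;
 *		return new_state;	// becomes b->private_data
 *	}
 *	...
 *	error = netmap_bdg_update_private_data("vale0:", my_update,
 *			new_state, auth_token);
 */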


/* remove from bridge b the ports in slots hw and sw
 * (sw can be -1 if not needed)
 */
void
netmap_bdg_detach_common(struct nm_bridge *b, int hw, int sw)
{
	int s_hw = hw, s_sw = sw;
	int i, lim = b->bdg_active_ports;
	uint32_t *tmp = b->tmp_bdg_port_index;

	/*
	New algorithm:
	make a copy of bdg_port_index;
	lookup NA(ifp)->bdg_port and SWNA(ifp)->bdg_port
	in the array of bdg_port_index, replacing them with
	entries from the bottom of the array;
	decrement bdg_active_ports;
	acquire BDG_WLOCK() and copy back the array.
	 */
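
	/*
	 * Worked example (illustrative): with lim = 4,
	 * tmp = { 0, 2, 5, 7 }, hw = 2 and sw = -1, the loop below
	 * finds tmp[1] == hw, decrements lim to 3 and swaps the entry
	 * with the last active one, leaving tmp = { 0, 7, 5, 2 };
	 * port 2 now sits in the inactive tail of the array.
	 */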

	if (netmap_debug & NM_DEBUG_BDG)
		nm_prinf("detach %d and %d (lim %d)", hw, sw, lim);
	/* make a copy of the list of active ports, update it,
	 * and then copy back within BDG_WLOCK().
	 */
	memcpy(b->tmp_bdg_port_index, b->bdg_port_index, sizeof(b->tmp_bdg_port_index));
	for (i = 0; (hw >= 0 || sw >= 0) && i < lim; ) {
		if (hw >= 0 && tmp[i] == hw) {
			ND("detach hw %d at %d", hw, i);
			lim--; /* point to last active port */
			tmp[i] = tmp[lim]; /* swap with i */
			tmp[lim] = hw;	/* now this is inactive */
			hw = -1;
		} else if (sw >= 0 && tmp[i] == sw) {
			ND("detach sw %d at %d", sw, i);
			lim--;
			tmp[i] = tmp[lim];
			tmp[lim] = sw;
			sw = -1;
		} else {
			i++;
		}
	}
	if (hw >= 0 || sw >= 0) {
		nm_prerr("delete failed hw %d sw %d, should panic...", hw, sw);
	}

	BDG_WLOCK(b);
	if (b->bdg_ops.dtor)
		b->bdg_ops.dtor(b->bdg_ports[s_hw]);
	b->bdg_ports[s_hw] = NULL;
	if (s_sw >= 0) {
		b->bdg_ports[s_sw] = NULL;
	}
	memcpy(b->bdg_port_index, b->tmp_bdg_port_index, sizeof(b->tmp_bdg_port_index));
	b->bdg_active_ports = lim;
	BDG_WUNLOCK(b);

	ND("now %d active ports", lim);
	netmap_bdg_free(b);
}


/* nm_bdg_ctl callback for VALE ports */
int
netmap_vp_bdg_ctl(struct nmreq_header *hdr, struct netmap_adapter *na)
{
	struct netmap_vp_adapter *vpna = (struct netmap_vp_adapter *)na;
	struct nm_bridge *b = vpna->na_bdg;

	if (hdr->nr_reqtype == NETMAP_REQ_VALE_ATTACH) {
		return 0; /* nothing to do */
	}
	if (b) {
		netmap_set_all_rings(na, 0 /* disable */);
		netmap_bdg_detach_common(b, vpna->bdg_port, -1);
		vpna->na_bdg = NULL;
		netmap_set_all_rings(na, 1 /* enable */);
	}
	/* we took a reference just for the attach */
	netmap_adapter_put(na);
	return 0;
}

int
netmap_default_bdg_attach(const char *name, struct netmap_adapter *na,
		struct nm_bridge *b)
{
	return NM_NEED_BWRAP;
}

/* Try to get a reference to a netmap adapter attached to a VALE switch.
 * If the adapter is found (or is created), this function returns 0, a
 * non-NULL pointer is returned into *na, and the caller holds a
 * reference to the adapter.
 * If an adapter is not found, then no reference is grabbed and the
 * function returns an error code, or 0 if there is just a VALE prefix
 * mismatch. Therefore the caller holds a reference when
 * (*na != NULL && return == 0).
 */
int
netmap_get_bdg_na(struct nmreq_header *hdr, struct netmap_adapter **na,
	struct netmap_mem_d *nmd, int create, struct netmap_bdg_ops *ops)
{
	char *nr_name = hdr->nr_name;
	const char *ifname;
	struct ifnet *ifp = NULL;
	int error = 0;
	struct netmap_vp_adapter *vpna, *hostna = NULL;
	struct nm_bridge *b;
	uint32_t i, j;
	uint32_t cand = NM_BDG_NOPORT, cand2 = NM_BDG_NOPORT;
	int needed;

	*na = NULL;     /* default return value */

	/* first try to see if this is a bridge port. */
	NMG_LOCK_ASSERT();
	if (strncmp(nr_name, ops->name, strlen(ops->name) - 1)) {
		return 0;  /* no error, but no VALE prefix */
	}

	b = nm_find_bridge(nr_name, create, ops);
	if (b == NULL) {
		ND("no bridges available for '%s'", nr_name);
		return (create ? ENOMEM : ENXIO);
	}
	if (strlen(nr_name) < b->bdg_namelen) /* impossible */
		panic("x");

	/* Now we are sure that name starts with the bridge's name,
	 * lookup the port in the bridge. We need to scan the entire
	 * list. It is not important to hold a WLOCK on the bridge
	 * during the search because NMG_LOCK already guarantees
	 * that there are no other possible writers.
	 */

	/* lookup in the local list of ports */
	for (j = 0; j < b->bdg_active_ports; j++) {
		i = b->bdg_port_index[j];
		vpna = b->bdg_ports[i];
		ND("checking %s", vpna->up.name);
		if (!strcmp(vpna->up.name, nr_name)) {
			netmap_adapter_get(&vpna->up);
			ND("found existing if %s refs %d", nr_name);
			*na = &vpna->up;
			return 0;
		}
	}
	/* not found, should we create it? */
	if (!create)
		return ENXIO;
	/* yes we should, see if we have space to attach entries */
	needed = 2; /* in some cases we only need 1 */
	if (b->bdg_active_ports + needed >= NM_BDG_MAXPORTS) {
		nm_prerr("bridge full %d, cannot create new port", b->bdg_active_ports);
		return ENOMEM;
	}
	/* record the next two ports available, but do not allocate yet */
	cand = b->bdg_port_index[b->bdg_active_ports];
	cand2 = b->bdg_port_index[b->bdg_active_ports + 1];
	ND("+++ bridge %s port %s used %d avail %d %d",
		b->bdg_basename, ifname, b->bdg_active_ports, cand, cand2);

	/*
	 * try to see if there is a matching NIC with this name
	 * (after the bridge's name)
	 */
	ifname = nr_name + b->bdg_namelen + 1;
	ifp = ifunit_ref(ifname);
	if (!ifp) {
		/* Create an ephemeral virtual port.
		 * This block contains all the ephemeral-specific logic.
		 */

		if (hdr->nr_reqtype != NETMAP_REQ_REGISTER) {
			error = EINVAL;
			goto out;
		}

		/* bdg_netmap_attach creates a struct netmap_adapter */
		error = b->bdg_ops.vp_create(hdr, NULL, nmd, &vpna);
		if (error) {
			if (netmap_debug & NM_DEBUG_BDG)
				nm_prerr("error %d", error);
			goto out;
		}
		/* shortcut - we can skip get_hw_na(),
		 * ownership check and nm_bdg_attach()
		 */

	} else {
		struct netmap_adapter *hw;

		/* the vale:nic syntax is only valid for some commands */
		switch (hdr->nr_reqtype) {
		case NETMAP_REQ_VALE_ATTACH:
		case NETMAP_REQ_VALE_DETACH:
		case NETMAP_REQ_VALE_POLLING_ENABLE:
		case NETMAP_REQ_VALE_POLLING_DISABLE:
			break; /* ok */
		default:
			error = EINVAL;
			goto out;
		}

		error = netmap_get_hw_na(ifp, nmd, &hw);
		if (error || hw == NULL)
			goto out;

		/* host adapter might not be created */
		error = hw->nm_bdg_attach(nr_name, hw, b);
		if (error == NM_NEED_BWRAP) {
			error = b->bdg_ops.bwrap_attach(nr_name, hw);
		}
		if (error)
			goto out;
		vpna = hw->na_vp;
		hostna = hw->na_hostvp;
		if (hdr->nr_reqtype == NETMAP_REQ_VALE_ATTACH) {
			/* Check if we need to skip the host rings. */
			struct nmreq_vale_attach *areq =
				(struct nmreq_vale_attach *)(uintptr_t)hdr->nr_body;
			if (areq->reg.nr_mode != NR_REG_NIC_SW) {
				hostna = NULL;
			}
		}
	}

	BDG_WLOCK(b);
	vpna->bdg_port = cand;
	ND("NIC  %p to bridge port %d", vpna, cand);
	/* bind the port to the bridge (virtual ports are not active) */
	b->bdg_ports[cand] = vpna;
	vpna->na_bdg = b;
	b->bdg_active_ports++;
	if (hostna != NULL) {
		/* also bind the host stack to the bridge */
		b->bdg_ports[cand2] = hostna;
		hostna->bdg_port = cand2;
		hostna->na_bdg = b;
		b->bdg_active_ports++;
		ND("host %p to bridge port %d", hostna, cand2);
	}
	ND("if %s refs %d", ifname, vpna->up.na_refcount);
	BDG_WUNLOCK(b);
	*na = &vpna->up;
	netmap_adapter_get(*na);

out:
	if (ifp)
		if_rele(ifp);

	return error;
}
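
/*
 * A sketch of the caller-side contract implied by the comment above
 * netmap_get_bdg_na() (illustrative only):
 *
 *	error = netmap_get_bdg_na(hdr, &na, nmd, create, ops);
 *	if (error)
 *		return error;		// real failure
 *	if (na == NULL)
 *		// not a VALE name, try other adapter types
 *	else
 *		// use na, then netmap_adapter_put(na) when done
 */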


int
nm_is_bwrap(struct netmap_adapter *na)
{
	return na->nm_register == netmap_bwrap_reg;
}


struct nm_bdg_polling_state;
struct nm_bdg_kthread {
	struct nm_kctx *nmk;
	u_int qfirst;
	u_int qlast;
	struct nm_bdg_polling_state *bps;
};

struct nm_bdg_polling_state {
	bool configured;
	bool stopped;
	struct netmap_bwrap_adapter *bna;
	uint32_t mode;
	u_int qfirst;
	u_int qlast;
	u_int cpu_from;
	u_int ncpus;
	struct nm_bdg_kthread *kthreads;
};

static void
netmap_bwrap_polling(void *data)
{
	struct nm_bdg_kthread *nbk = data;
	struct netmap_bwrap_adapter *bna;
	u_int qfirst, qlast, i;
	struct netmap_kring **kring0, *kring;

	if (!nbk)
		return;
	qfirst = nbk->qfirst;
	qlast = nbk->qlast;
	bna = nbk->bps->bna;
	kring0 = NMR(bna->hwna, NR_RX);

	for (i = qfirst; i < qlast; i++) {
		kring = kring0[i];
		kring->nm_notify(kring, 0);
	}
}

static int
nm_bdg_create_kthreads(struct nm_bdg_polling_state *bps)
{
	struct nm_kctx_cfg kcfg;
	int i, j;

	bps->kthreads = nm_os_malloc(sizeof(struct nm_bdg_kthread) * bps->ncpus);
	if (bps->kthreads == NULL)
		return ENOMEM;

	bzero(&kcfg, sizeof(kcfg));
	kcfg.worker_fn = netmap_bwrap_polling;
	for (i = 0; i < bps->ncpus; i++) {
		struct nm_bdg_kthread *t = bps->kthreads + i;
		int all = (bps->ncpus == 1 &&
			bps->mode == NETMAP_POLLING_MODE_SINGLE_CPU);
		int affinity = bps->cpu_from + i;

		t->bps = bps;
		t->qfirst = all ? bps->qfirst /* must be 0 */: affinity;
		t->qlast = all ? bps->qlast : t->qfirst + 1;
		if (netmap_verbose)
			nm_prinf("kthread %d a:%u qf:%u ql:%u", i, affinity, t->qfirst,
				t->qlast);

		kcfg.type = i;
		kcfg.worker_private = t;
		t->nmk = nm_os_kctx_create(&kcfg, NULL);
		if (t->nmk == NULL) {
			goto cleanup;
		}
		nm_os_kctx_worker_setaff(t->nmk, affinity);
	}
	return 0;

cleanup:
	for (j = 0; j < i; j++) {
		struct nm_bdg_kthread *t = bps->kthreads + j;
		nm_os_kctx_destroy(t->nmk);
	}
	nm_os_free(bps->kthreads);
	return EFAULT;
}

/* A variant of ptnetmap_start_kthreads() */
static int
nm_bdg_polling_start_kthreads(struct nm_bdg_polling_state *bps)
{
	int error, i, j;

	if (!bps) {
		nm_prerr("polling is not configured");
		return EFAULT;
	}
	bps->stopped = false;

	for (i = 0; i < bps->ncpus; i++) {
		struct nm_bdg_kthread *t = bps->kthreads + i;
		error = nm_os_kctx_worker_start(t->nmk);
		if (error) {
			nm_prerr("error in nm_kthread_start(): %d", error);
			goto cleanup;
		}
	}
	return 0;

cleanup:
	for (j = 0; j < i; j++) {
		struct nm_bdg_kthread *t = bps->kthreads + j;
		nm_os_kctx_worker_stop(t->nmk);
	}
	bps->stopped = true;
	return error;
}

static void
nm_bdg_polling_stop_delete_kthreads(struct nm_bdg_polling_state *bps)
{
	int i;

	if (!bps)
		return;

	for (i = 0; i < bps->ncpus; i++) {
		struct nm_bdg_kthread *t = bps->kthreads + i;
		nm_os_kctx_worker_stop(t->nmk);
		nm_os_kctx_destroy(t->nmk);
	}
	bps->stopped = true;
}

static int
get_polling_cfg(struct nmreq_vale_polling *req, struct netmap_adapter *na,
		struct nm_bdg_polling_state *bps)
{
	unsigned int avail_cpus, core_from;
	unsigned int qfirst, qlast;
	uint32_t i = req->nr_first_cpu_id;
	uint32_t req_cpus = req->nr_num_polling_cpus;

	avail_cpus = nm_os_ncpus();

	if (req_cpus == 0) {
		nm_prerr("req_cpus must be > 0");
		return EINVAL;
	} else if (req_cpus >= avail_cpus) {
		nm_prerr("Cannot use all the CPUs in the system");
		return EINVAL;
	}

	if (req->nr_mode == NETMAP_POLLING_MODE_MULTI_CPU) {
		/* Use a separate core for each ring. If nr_num_polling_cpus>1
		 * more consecutive rings are polled.
		 * For example, if nr_first_cpu_id=2 and nr_num_polling_cpus=2,
		 * rings 2 and 3 are polled by cores 2 and 3, respectively. */
		if (i + req_cpus > nma_get_nrings(na, NR_RX)) {
			nm_prerr("Rings %u-%u not in range (have %d rings)",
				i, i + req_cpus, nma_get_nrings(na, NR_RX));
			return EINVAL;
		}
		qfirst = i;
		qlast = qfirst + req_cpus;
		core_from = qfirst;

	} else if (req->nr_mode == NETMAP_POLLING_MODE_SINGLE_CPU) {
		/* Poll all the rings using a core specified by nr_first_cpu_id.
		 * The number of cores must be 1. */
		if (req_cpus != 1) {
			nm_prerr("ncpus must be 1 for NETMAP_POLLING_MODE_SINGLE_CPU "
				"(was %d)", req_cpus);
			return EINVAL;
		}
		qfirst = 0;
		qlast = nma_get_nrings(na, NR_RX);
		core_from = i;
	} else {
		nm_prerr("Invalid polling mode");
		return EINVAL;
	}

	bps->mode = req->nr_mode;
	bps->qfirst = qfirst;
	bps->qlast = qlast;
	bps->cpu_from = core_from;
	bps->ncpus = req_cpus;
	nm_prinf("%s qfirst %u qlast %u cpu_from %u ncpus %u",
		req->nr_mode == NETMAP_POLLING_MODE_MULTI_CPU ?
		"MULTI" : "SINGLE",
		qfirst, qlast, core_from, req_cpus);
	return 0;
}

static int
nm_bdg_ctl_polling_start(struct nmreq_vale_polling *req, struct netmap_adapter *na)
{
	struct nm_bdg_polling_state *bps;
	struct netmap_bwrap_adapter *bna;
	int error;

	bna = (struct netmap_bwrap_adapter *)na;
	if (bna->na_polling_state) {
		nm_prerr("ERROR adapter already in polling mode");
		return EFAULT;
	}

	bps = nm_os_malloc(sizeof(*bps));
	if (!bps)
		return ENOMEM;
	bps->configured = false;
	bps->stopped = true;

	if (get_polling_cfg(req, na, bps)) {
		nm_os_free(bps);
		return EINVAL;
	}

	if (nm_bdg_create_kthreads(bps)) {
		nm_os_free(bps);
		return EFAULT;
	}

	bps->configured = true;
	bna->na_polling_state = bps;
	bps->bna = bna;

	/* disable interrupts if possible */
	nma_intr_enable(bna->hwna, 0);
	/* start kthread now */
	error = nm_bdg_polling_start_kthreads(bps);
	if (error) {
		nm_prerr("ERROR nm_bdg_polling_start_kthread()");
		nm_os_free(bps->kthreads);
		nm_os_free(bps);
		bna->na_polling_state = NULL;
		nma_intr_enable(bna->hwna, 1);
	}
	return error;
}

static int
nm_bdg_ctl_polling_stop(struct netmap_adapter *na)
{
	struct netmap_bwrap_adapter *bna = (struct netmap_bwrap_adapter *)na;
	struct nm_bdg_polling_state *bps;

	if (!bna->na_polling_state) {
		nm_prerr("ERROR adapter is not in polling mode");
		return EFAULT;
	}
	bps = bna->na_polling_state;
	nm_bdg_polling_stop_delete_kthreads(bna->na_polling_state);
	bps->configured = false;
	nm_os_free(bps);
	bna->na_polling_state = NULL;
	/* reenable interrupts */
	nma_intr_enable(bna->hwna, 1);
	return 0;
}

int
nm_bdg_polling(struct nmreq_header *hdr)
{
	struct nmreq_vale_polling *req =
		(struct nmreq_vale_polling *)(uintptr_t)hdr->nr_body;
	struct netmap_adapter *na = NULL;
	int error = 0;

	NMG_LOCK();
	error = netmap_get_vale_na(hdr, &na, NULL, /*create=*/0);
	if (na && !error) {
		if (!nm_is_bwrap(na)) {
			error = EOPNOTSUPP;
		} else if (hdr->nr_reqtype == NETMAP_REQ_VALE_POLLING_ENABLE) {
			error = nm_bdg_ctl_polling_start(req, na);
			if (!error)
				netmap_adapter_get(na);
		} else {
			error = nm_bdg_ctl_polling_stop(na);
			if (!error)
				netmap_adapter_put(na);
		}
		netmap_adapter_put(na);
	} else if (!na && !error) {
		/* Not VALE port. */
		error = EINVAL;
	}
	NMG_UNLOCK();

	return error;
}
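
/*
 * A hedged userspace sketch (error handling omitted; "vale0:eth0" and
 * fd are assumptions, with fd an open /dev/netmap descriptor) of how a
 * polling request reaches this function through NIOCCTRL:
 *
 *	struct nmreq_header hdr;
 *	struct nmreq_vale_polling req;
 *
 *	memset(&hdr, 0, sizeof(hdr));
 *	hdr.nr_version = NETMAP_API;
 *	hdr.nr_reqtype = NETMAP_REQ_VALE_POLLING_ENABLE;
 *	strlcpy(hdr.nr_name, "vale0:eth0", sizeof(hdr.nr_name));
 *	memset(&req, 0, sizeof(req));
 *	req.nr_mode = NETMAP_POLLING_MODE_SINGLE_CPU;
 *	req.nr_first_cpu_id = 0;
 *	req.nr_num_polling_cpus = 1;
 *	hdr.nr_body = (uintptr_t)&req;
 *	ioctl(fd, NIOCCTRL, &hdr);
 */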

/* Called by external kernel modules (e.g., Openvswitch)
 * to set the configure/lookup/dtor functions of a VALE instance.
 * Register callbacks to the given bridge. 'name' may be just the
 * bridge's name (including ':' if it is not just NM_BDG_NAME).
 *
 * Called without NMG_LOCK.
 */

int
netmap_bdg_regops(const char *name, struct netmap_bdg_ops *bdg_ops, void *private_data, void *auth_token)
{
	struct nm_bridge *b;
	int error = 0;

	NMG_LOCK();
	b = nm_find_bridge(name, 0 /* don't create */, NULL);
	if (!b) {
		error = ENXIO;
		goto unlock_regops;
	}
	if (!nm_bdg_valid_auth_token(b, auth_token)) {
		error = EACCES;
		goto unlock_regops;
	}

	BDG_WLOCK(b);
	if (!bdg_ops) {
		/* resetting the bridge */
		bzero(b->ht, sizeof(struct nm_hash_ent) * NM_BDG_HASH);
		b->bdg_ops = b->bdg_saved_ops;
		b->private_data = b->ht;
	} else {
		/* modifying the bridge */
		b->private_data = private_data;
#define nm_bdg_override(m) if (bdg_ops->m) b->bdg_ops.m = bdg_ops->m
		nm_bdg_override(lookup);
		nm_bdg_override(config);
		nm_bdg_override(dtor);
		nm_bdg_override(vp_create);
		nm_bdg_override(bwrap_attach);
#undef nm_bdg_override

	}
	BDG_WUNLOCK(b);

unlock_regops:
	NMG_UNLOCK();
	return error;
}
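
/*
 * Illustrative only: an external module could override just the lookup
 * callback (my_lookup and my_priv are made-up names) and later restore
 * the saved defaults by passing a NULL bdg_ops:
 *
 *	static struct netmap_bdg_ops my_ops = {
 *		.lookup = my_lookup,	// other fields left NULL,
 *					// so the defaults are kept
 *	};
 *	...
 *	error = netmap_bdg_regops("vale0:", &my_ops, my_priv, token);
 *	...
 *	error = netmap_bdg_regops("vale0:", NULL, NULL, token);
 */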


int
netmap_bdg_config(struct nm_ifreq *nr)
{
	struct nm_bridge *b;
	int error = EINVAL;

	NMG_LOCK();
	b = nm_find_bridge(nr->nifr_name, 0, NULL);
	if (!b) {
		NMG_UNLOCK();
		return error;
	}
	NMG_UNLOCK();
	/* Don't call config() with NMG_LOCK() held */
	BDG_RLOCK(b);
	if (b->bdg_ops.config != NULL)
		error = b->bdg_ops.config(nr);
	BDG_RUNLOCK(b);
	return error;
}


/* nm_register callback for VALE ports */
int
netmap_vp_reg(struct netmap_adapter *na, int onoff)
{
	struct netmap_vp_adapter *vpna =
		(struct netmap_vp_adapter*)na;
	enum txrx t;
	int i;

	/* persistent ports may be put in netmap mode
	 * before being attached to a bridge
	 */
	if (vpna->na_bdg)
		BDG_WLOCK(vpna->na_bdg);
	if (onoff) {
		for_rx_tx(t) {
			for (i = 0; i < netmap_real_rings(na, t); i++) {
				struct netmap_kring *kring = NMR(na, t)[i];

				if (nm_kring_pending_on(kring))
					kring->nr_mode = NKR_NETMAP_ON;
			}
		}
		if (na->active_fds == 0)
			na->na_flags |= NAF_NETMAP_ON;
		/* XXX on FreeBSD, persistent VALE ports should also
		 * toggle IFCAP_NETMAP in na->ifp (2014-03-16)
		 */
	} else {
		if (na->active_fds == 0)
			na->na_flags &= ~NAF_NETMAP_ON;
		for_rx_tx(t) {
			for (i = 0; i < netmap_real_rings(na, t); i++) {
				struct netmap_kring *kring = NMR(na, t)[i];

				if (nm_kring_pending_off(kring))
					kring->nr_mode = NKR_NETMAP_OFF;
			}
		}
	}
	if (vpna->na_bdg)
		BDG_WUNLOCK(vpna->na_bdg);
	return 0;
}


/* rxsync code used by the VALE ports' nm_rxsync callback and also
 * internally by the bwrap
 */
static int
netmap_vp_rxsync_locked(struct netmap_kring *kring, int flags)
{
	struct netmap_adapter *na = kring->na;
	struct netmap_ring *ring = kring->ring;
	u_int nm_i, lim = kring->nkr_num_slots - 1;
	u_int head = kring->rhead;
	int n;

	if (head > lim) {
		nm_prerr("ouch dangerous reset!!!");
		n = netmap_ring_reinit(kring);
		goto done;
	}

	/* First part, import newly received packets. */
	/* actually nothing to do here, they are already in the kring */

	/* Second part, skip past packets that userspace has released. */
	nm_i = kring->nr_hwcur;
	if (nm_i != head) {
		/* consistency check, but nothing really important here */
		for (n = 0; likely(nm_i != head); n++) {
			struct netmap_slot *slot = &ring->slot[nm_i];
			void *addr = NMB(na, slot);

			if (addr == NETMAP_BUF_BASE(kring->na)) { /* bad buf */
				nm_prerr("bad buffer index %d, ignore ?",
					slot->buf_idx);
			}
			slot->flags &= ~NS_BUF_CHANGED;
			nm_i = nm_next(nm_i, lim);
		}
		kring->nr_hwcur = head;
	}

	n = 0;
done:
	return n;
}

/*
 * nm_rxsync callback for VALE ports
 * user process reading from a VALE switch.
 * Already protected against concurrent calls from userspace,
 * but we must acquire the queue's lock to protect against
 * writers on the same queue.
 */
int
netmap_vp_rxsync(struct netmap_kring *kring, int flags)
{
	int n;

	mtx_lock(&kring->q_lock);
	n = netmap_vp_rxsync_locked(kring, flags);
	mtx_unlock(&kring->q_lock);
	return n;
}

int
netmap_bwrap_attach(const char *nr_name, struct netmap_adapter *hwna,
		struct netmap_bdg_ops *ops)
{
	return ops->bwrap_attach(nr_name, hwna);
}


/* Bridge wrapper code (bwrap).
 * This is used to connect a non-VALE-port netmap_adapter (hwna) to a
 * VALE switch.
 * The main task is to swap the meaning of tx and rx rings to match the
 * expectations of the VALE switch code (see nm_bdg_flush).
 *
 * The bwrap works by interposing a netmap_bwrap_adapter between the
 * rest of the system and the hwna. The netmap_bwrap_adapter looks like
 * a netmap_vp_adapter to the rest of the system, but, internally, it
 * translates all callbacks to what the hwna expects.
 *
 * Note that we have to intercept callbacks coming from two sides:
 *
 *  - callbacks coming from the netmap module are intercepted by
 *    passing around the netmap_bwrap_adapter instead of the hwna
 *
 *  - callbacks coming from outside of the netmap module only know
 *    about the hwna. This, however, only happens in interrupt
 *    handlers, where only the hwna->nm_notify callback is called.
 *    What the bwrap does is to overwrite the hwna->nm_notify callback
 *    with its own netmap_bwrap_intr_notify.
 *    XXX This assumes that the hwna->nm_notify callback was the
 *    standard netmap_notify(), as is the case for nic adapters.
 *    Any additional action performed by hwna->nm_notify will not be
 *    performed by netmap_bwrap_intr_notify.
 *
 * Additionally, the bwrap can optionally attach the host rings pair
 * of the wrapped adapter to a different port of the switch.
 */


static void
netmap_bwrap_dtor(struct netmap_adapter *na)
{
	struct netmap_bwrap_adapter *bna = (struct netmap_bwrap_adapter*)na;
	struct netmap_adapter *hwna = bna->hwna;
	struct nm_bridge *b = bna->up.na_bdg,
		*bh = bna->host.na_bdg;

	if (bna->host.up.nm_mem)
		netmap_mem_put(bna->host.up.nm_mem);

	if (b) {
		netmap_bdg_detach_common(b, bna->up.bdg_port,
			    (bh ? bna->host.bdg_port : -1));
	}

	ND("na %p", na);
	na->ifp = NULL;
	bna->host.up.ifp = NULL;
	hwna->na_vp = bna->saved_na_vp;
	hwna->na_hostvp = NULL;
	hwna->na_private = NULL;
	hwna->na_flags &= ~NAF_BUSY;
	netmap_adapter_put(hwna);

}


/*
 * Intr callback for NICs connected to a bridge.
 * Simply ignore tx interrupts (maybe we could try to recover space ?)
 * and pass received packets from nic to the bridge.
 *
 * XXX TODO check locking: this is called from the interrupt
 * handler so we should make sure that the interface is not
 * disconnected while passing down an interrupt.
 *
 * Note, no user process can access this NIC or the host stack.
 * The only part of the ring that is significant are the slots,
 * and head/cur/tail are set from the kring as needed
 * (part as a receive ring, part as a transmit ring).
 *
 * callback that overwrites the hwna notify callback.
 * Packets come from the outside or from the host stack and are put on an
 * hwna rx ring.
 * The bridge wrapper then sends the packets through the bridge.
 */
static int
netmap_bwrap_intr_notify(struct netmap_kring *kring, int flags)
{
	struct netmap_adapter *na = kring->na;
	struct netmap_bwrap_adapter *bna = na->na_private;
	struct netmap_kring *bkring;
	struct netmap_vp_adapter *vpna = &bna->up;
	u_int ring_nr = kring->ring_id;
	int ret = NM_IRQ_COMPLETED;
	int error;

	if (netmap_debug & NM_DEBUG_RXINTR)
	    nm_prinf("%s %s 0x%x", na->name, kring->name, flags);

	bkring = vpna->up.tx_rings[ring_nr];

	/* make sure the ring is not disabled */
	if (nm_kr_tryget(kring, 0 /* can't sleep */, NULL)) {
		return EIO;
	}

	if (netmap_debug & NM_DEBUG_RXINTR)
	    nm_prinf("%s head %d cur %d tail %d",  na->name,
		kring->rhead, kring->rcur, kring->rtail);

	/* simulate a user wakeup on the rx ring
	 * fetch packets that have arrived.
	 */
	error = kring->nm_sync(kring, 0);
	if (error)
		goto put_out;
	if (kring->nr_hwcur == kring->nr_hwtail) {
		if (netmap_verbose)
			nm_prerr("how strange, interrupt with no packets on %s",
			    na->name);
		goto put_out;
	}

	/* new packets are kring->rcur to kring->nr_hwtail, and the bkring
	 * had hwcur == bkring->rhead. So advance bkring->rhead to kring->nr_hwtail
	 * to push all packets out.
	 */
	bkring->rhead = bkring->rcur = kring->nr_hwtail;

	bkring->nm_sync(bkring, flags);

	/* mark all buffers as released on this ring */
	kring->rhead = kring->rcur = kring->rtail = kring->nr_hwtail;
	/* another call to actually release the buffers */
	error = kring->nm_sync(kring, 0);

	/* The second rxsync may have further advanced hwtail. If this happens,
	 * return NM_IRQ_RESCHED, otherwise just return NM_IRQ_COMPLETED. */
	if (kring->rcur != kring->nr_hwtail) {
		ret = NM_IRQ_RESCHED;
	}
put_out:
	nm_kr_put(kring);

	return error ? error : ret;
}


/* nm_register callback for bwrap */
int
netmap_bwrap_reg(struct netmap_adapter *na, int onoff)
{
	struct netmap_bwrap_adapter *bna =
		(struct netmap_bwrap_adapter *)na;
	struct netmap_adapter *hwna = bna->hwna;
	struct netmap_vp_adapter *hostna = &bna->host;
	int error, i;
	enum txrx t;

	ND("%s %s", na->name, onoff ? "on" : "off");

	if (onoff) {
		/* netmap_do_regif has been called on the bwrap na.
		 * We need to pass the information about the
		 * memory allocator down to the hwna before
		 * putting it in netmap mode
		 */
		hwna->na_lut = na->na_lut;

		if (hostna->na_bdg) {
			/* if the host rings have been attached to switch,
			 * we need to copy the memory allocator information
			 * in the hostna also
			 */
			hostna->up.na_lut = na->na_lut;
		}

	}

	/* pass down the pending ring state information */
	for_rx_tx(t) {
		for (i = 0; i < netmap_all_rings(na, t); i++) {
			NMR(hwna, nm_txrx_swap(t))[i]->nr_pending_mode =
				NMR(na, t)[i]->nr_pending_mode;
		}
	}

	/* forward the request to the hwna */
	error = hwna->nm_register(hwna, onoff);
	if (error)
		return error;

	/* copy up the current ring state information */
	for_rx_tx(t) {
		for (i = 0; i < netmap_all_rings(na, t); i++) {
			struct netmap_kring *kring = NMR(hwna, nm_txrx_swap(t))[i];
			NMR(na, t)[i]->nr_mode = kring->nr_mode;
		}
	}

	/* impersonate a netmap_vp_adapter */
	netmap_vp_reg(na, onoff);
	if (hostna->na_bdg)
		netmap_vp_reg(&hostna->up, onoff);

	if (onoff) {
		u_int i;
		/* intercept the hwna nm_notify callback on the hw rings */
		for (i = 0; i < hwna->num_rx_rings; i++) {
			hwna->rx_rings[i]->save_notify = hwna->rx_rings[i]->nm_notify;
			hwna->rx_rings[i]->nm_notify = netmap_bwrap_intr_notify;
		}
		i = hwna->num_rx_rings; /* for safety */
		/* save the host ring notify unconditionally */
		for (; i < netmap_real_rings(hwna, NR_RX); i++) {
			hwna->rx_rings[i]->save_notify =
				hwna->rx_rings[i]->nm_notify;
			if (hostna->na_bdg) {
				/* also intercept the host ring notify */
				hwna->rx_rings[i]->nm_notify =
					netmap_bwrap_intr_notify;
				na->tx_rings[i]->nm_sync = na->nm_txsync;
			}
		}
		if (na->active_fds == 0)
			na->na_flags |= NAF_NETMAP_ON;
	} else {
		u_int i;

		if (na->active_fds == 0)
			na->na_flags &= ~NAF_NETMAP_ON;

		/* reset all notify callbacks (including host ring) */
		for (i = 0; i < netmap_all_rings(hwna, NR_RX); i++) {
			hwna->rx_rings[i]->nm_notify =
				hwna->rx_rings[i]->save_notify;
			hwna->rx_rings[i]->save_notify = NULL;
		}
		hwna->na_lut.lut = NULL;
		hwna->na_lut.plut = NULL;
		hwna->na_lut.objtotal = 0;
		hwna->na_lut.objsize = 0;

		/* pass ownership of the netmap rings to the hwna */
		for_rx_tx(t) {
			for (i = 0; i < netmap_all_rings(na, t); i++) {
				NMR(na, t)[i]->ring = NULL;
			}
		}
		/* reset the number of host rings to default */
		for_rx_tx(t) {
			nma_set_host_nrings(hwna, t, 1);
		}

	}

	return 0;
}

/* nm_config callback for bwrap */
static int
netmap_bwrap_config(struct netmap_adapter *na, struct nm_config_info *info)
{
	struct netmap_bwrap_adapter *bna =
		(struct netmap_bwrap_adapter *)na;
	struct netmap_adapter *hwna = bna->hwna;
	int error;

	/* Forward the request to the hwna. It may happen that nobody
	 * registered hwna yet, so netmap_mem_get_lut() may have not
	 * been called yet. */
	error = netmap_mem_get_lut(hwna->nm_mem, &hwna->na_lut);
	if (error)
		return error;
	netmap_update_config(hwna);
	/* swap the results and propagate */
	info->num_tx_rings = hwna->num_rx_rings;
	info->num_tx_descs = hwna->num_rx_desc;
	info->num_rx_rings = hwna->num_tx_rings;
	info->num_rx_descs = hwna->num_tx_desc;
	info->rx_buf_maxsize = hwna->rx_buf_maxsize;

	return 0;
}


/* nm_krings_create callback for bwrap */
int
netmap_bwrap_krings_create_common(struct netmap_adapter *na)
{
	struct netmap_bwrap_adapter *bna =
		(struct netmap_bwrap_adapter *)na;
	struct netmap_adapter *hwna = bna->hwna;
	struct netmap_adapter *hostna = &bna->host.up;
	int i, error = 0;
	enum txrx t;

	/* also create the hwna krings */
	error = hwna->nm_krings_create(hwna);
	if (error) {
		return error;
	}

	/* increment the usage counter for all the hwna krings */
	for_rx_tx(t) {
		for (i = 0; i < netmap_all_rings(hwna, t); i++) {
			NMR(hwna, t)[i]->users++;
		}
	}

	/* now create the actual rings */
	error = netmap_mem_rings_create(hwna);
	if (error) {
		goto err_dec_users;
	}

	/* cross-link the netmap rings
	 * The original number of rings comes from hwna,
	 * rx rings on one side equals tx rings on the other.
	 */
	for_rx_tx(t) {
		enum txrx r = nm_txrx_swap(t); /* swap NR_TX <-> NR_RX */
		for (i = 0; i < netmap_all_rings(hwna, r); i++) {
			NMR(na, t)[i]->nkr_num_slots = NMR(hwna, r)[i]->nkr_num_slots;
			NMR(na, t)[i]->ring = NMR(hwna, r)[i]->ring;
		}
	}

	if (na->na_flags & NAF_HOST_RINGS) {
		/* the hostna rings are the host rings of the bwrap.
		 * The corresponding krings must point back to the
		 * hostna
		 */
		hostna->tx_rings = &na->tx_rings[na->num_tx_rings];
		hostna->rx_rings = &na->rx_rings[na->num_rx_rings];
		for_rx_tx(t) {
			for (i = 0; i < nma_get_nrings(hostna, t); i++) {
				NMR(hostna, t)[i]->na = hostna;
			}
		}
	}

	return 0;

err_dec_users:
	for_rx_tx(t) {
		for (i = 0; i < netmap_all_rings(hwna, t); i++) {
			NMR(hwna, t)[i]->users--;
		}
	}
	hwna->nm_krings_delete(hwna);
	return error;
}


void
netmap_bwrap_krings_delete_common(struct netmap_adapter *na)
{
	struct netmap_bwrap_adapter *bna =
		(struct netmap_bwrap_adapter *)na;
	struct netmap_adapter *hwna = bna->hwna;
	enum txrx t;
	int i;

	ND("%s", na->name);

	/* decrement the usage counter for all the hwna krings */
	for_rx_tx(t) {
		for (i = 0; i < netmap_all_rings(hwna, t); i++) {
			NMR(hwna, t)[i]->users--;
		}
	}

	/* delete any netmap rings that are no longer needed */
	netmap_mem_rings_delete(hwna);
	hwna->nm_krings_delete(hwna);
}


/* notify method for the bridge-->hwna direction */
int
netmap_bwrap_notify(struct netmap_kring *kring, int flags)
{
	struct netmap_adapter *na = kring->na;
	struct netmap_bwrap_adapter *bna = na->na_private;
	struct netmap_adapter *hwna = bna->hwna;
	u_int ring_n = kring->ring_id;
	u_int lim = kring->nkr_num_slots - 1;
	struct netmap_kring *hw_kring;
	int error;

	ND("%s: na %s hwna %s",
			(kring ? kring->name : "NULL!"),
			(na ? na->name : "NULL!"),
			(hwna ? hwna->name : "NULL!"));
	hw_kring = hwna->tx_rings[ring_n];

	if (nm_kr_tryget(hw_kring, 0, NULL)) {
		return ENXIO;
	}

	/* first step: simulate a user wakeup on the rx ring */
	netmap_vp_rxsync(kring, flags);
	ND("%s[%d] PRE rx(c%3d t%3d l%3d) ring(h%3d c%3d t%3d) tx(c%3d ht%3d t%3d)",
		na->name, ring_n,
		kring->nr_hwcur, kring->nr_hwtail, kring->nkr_hwlease,
		kring->rhead, kring->rcur, kring->rtail,
		hw_kring->nr_hwcur, hw_kring->nr_hwtail, hw_kring->rtail);
	/* second step: the new packets are sent on the tx ring
	 * (which is actually the same ring)
	 */
	hw_kring->rhead = hw_kring->rcur = kring->nr_hwtail;
	error = hw_kring->nm_sync(hw_kring, flags);
	if (error)
		goto put_out;

	/* third step: now we are back on the rx ring */
	/* claim ownership on all hw owned bufs */
	kring->rhead = kring->rcur = nm_next(hw_kring->nr_hwtail, lim); /* skip past reserved slot */

	/* fourth step: the user goes to sleep again, causing another rxsync */
	netmap_vp_rxsync(kring, flags);
	ND("%s[%d] PST rx(c%3d t%3d l%3d) ring(h%3d c%3d t%3d) tx(c%3d ht%3d t%3d)",
		na->name, ring_n,
		kring->nr_hwcur, kring->nr_hwtail, kring->nkr_hwlease,
		kring->rhead, kring->rcur, kring->rtail,
		hw_kring->nr_hwcur, hw_kring->nr_hwtail, hw_kring->rtail);
put_out:
	nm_kr_put(hw_kring);

	return error ? error : NM_IRQ_COMPLETED;
}


/* nm_bdg_ctl callback for the bwrap.
 * Called on bridge-attach and detach, as an effect of vale-ctl -[ahd].
 * On attach, it needs to provide a fake netmap_priv_d structure and
 * perform a netmap_do_regif() on the bwrap. This will put both the
 * bwrap and the hwna in netmap mode, with the netmap rings shared
 * and cross linked. Moreover, it will start intercepting interrupts
 * directed to hwna.
 */
static int
netmap_bwrap_bdg_ctl(struct nmreq_header *hdr, struct netmap_adapter *na)
{
	struct netmap_priv_d *npriv;
	struct netmap_bwrap_adapter *bna = (struct netmap_bwrap_adapter*)na;
	int error = 0;

	if (hdr->nr_reqtype == NETMAP_REQ_VALE_ATTACH) {
		struct nmreq_vale_attach *req =
			(struct nmreq_vale_attach *)(uintptr_t)hdr->nr_body;
		if (req->reg.nr_ringid != 0 ||
			(req->reg.nr_mode != NR_REG_ALL_NIC &&
				req->reg.nr_mode != NR_REG_NIC_SW)) {
			/* We only support attaching all the NIC rings
			 * and/or the host stack. */
			return EINVAL;
		}
		if (NETMAP_OWNED_BY_ANY(na)) {
			return EBUSY;
		}
		if (bna->na_kpriv) {
			/* nothing to do */
			return 0;
		}
		npriv = netmap_priv_new();
		if (npriv == NULL)
			return ENOMEM;
		npriv->np_ifp = na->ifp; /* let the priv destructor release the ref */
		error = netmap_do_regif(npriv, na, req->reg.nr_mode,
					req->reg.nr_ringid, req->reg.nr_flags);
		if (error) {
			netmap_priv_delete(npriv);
			return error;
		}
		bna->na_kpriv = npriv;
		na->na_flags |= NAF_BUSY;
	} else {
		if (na->active_fds == 0) /* not registered */
			return EINVAL;
		netmap_priv_delete(bna->na_kpriv);
		bna->na_kpriv = NULL;
		na->na_flags &= ~NAF_BUSY;
	}

	return error;
}

/* attach a bridge wrapper to the 'real' device */
int
netmap_bwrap_attach_common(struct netmap_adapter *na,
		struct netmap_adapter *hwna)
{
	struct netmap_bwrap_adapter *bna;
	struct netmap_adapter *hostna = NULL;
	int error = 0;
	enum txrx t;

	/* make sure the NIC is not already in use */
	if (NETMAP_OWNED_BY_ANY(hwna)) {
		nm_prerr("NIC %s busy, cannot attach to bridge", hwna->name);
		return EBUSY;
	}

	bna = (struct netmap_bwrap_adapter *)na;
	/* make bwrap ifp point to the real ifp */
	na->ifp = hwna->ifp;
	if_ref(na->ifp);
	na->na_private = bna;
	/* fill the ring data for the bwrap adapter with rx/tx meanings
	 * swapped. The real cross-linking will be done during register,
	 * when all the krings will have been created.
	 */
	for_rx_tx(t) {
		enum txrx r = nm_txrx_swap(t); /* swap NR_TX <-> NR_RX */
		nma_set_nrings(na, t, nma_get_nrings(hwna, r));
		nma_set_ndesc(na, t, nma_get_ndesc(hwna, r));
	}
	na->nm_dtor = netmap_bwrap_dtor;
	na->nm_config = netmap_bwrap_config;
	na->nm_bdg_ctl = netmap_bwrap_bdg_ctl;
	na->pdev = hwna->pdev;
	na->nm_mem = netmap_mem_get(hwna->nm_mem);
	na->virt_hdr_len = hwna->virt_hdr_len;
	na->rx_buf_maxsize = hwna->rx_buf_maxsize;

	bna->hwna = hwna;
	netmap_adapter_get(hwna);
	hwna->na_private = bna; /* weak reference */
	bna->saved_na_vp = hwna->na_vp;
	hwna->na_vp = &bna->up;
	bna->up.up.na_vp = &(bna->up);

	if (hwna->na_flags & NAF_HOST_RINGS) {
		if (hwna->na_flags & NAF_SW_ONLY)
			na->na_flags |= NAF_SW_ONLY;
		na->na_flags |= NAF_HOST_RINGS;
		hostna = &bna->host.up;

		/* limit the number of host rings to that of hw */
		nm_bound_var(&hostna->num_tx_rings, 1, 1,
				nma_get_nrings(hwna, NR_TX), NULL);
		nm_bound_var(&hostna->num_rx_rings, 1, 1,
				nma_get_nrings(hwna, NR_RX), NULL);

		snprintf(hostna->name, sizeof(hostna->name), "%s^", na->name);
		hostna->ifp = hwna->ifp;
		for_rx_tx(t) {
			enum txrx r = nm_txrx_swap(t);
			u_int nr = nma_get_nrings(hostna, t);

			nma_set_nrings(hostna, t, nr);
			nma_set_host_nrings(na, t, nr);
			if (nma_get_host_nrings(hwna, t) < nr) {
				nma_set_host_nrings(hwna, t, nr);
			}
			nma_set_ndesc(hostna, t, nma_get_ndesc(hwna, r));
		}
		// hostna->nm_txsync = netmap_bwrap_host_txsync;
		// hostna->nm_rxsync = netmap_bwrap_host_rxsync;
		hostna->nm_mem = netmap_mem_get(na->nm_mem);
		hostna->na_private = bna;
		hostna->na_vp = &bna->up;
		na->na_hostvp = hwna->na_hostvp =
			hostna->na_hostvp = &bna->host;
		hostna->na_flags = NAF_BUSY; /* prevent NIOCREGIF */
		hostna->rx_buf_maxsize = hwna->rx_buf_maxsize;
	}
	if (hwna->na_flags & NAF_MOREFRAG)
		na->na_flags |= NAF_MOREFRAG;

	ND("%s<->%s txr %d txd %d rxr %d rxd %d",
		na->name, hwna->ifp->if_xname,
		na->num_tx_rings, na->num_tx_desc,
		na->num_rx_rings, na->num_rx_desc);

	error = netmap_attach_common(na);
	if (error) {
		goto err_put;
	}
	hwna->na_flags |= NAF_BUSY;
	return 0;

err_put:
	hwna->na_vp = hwna->na_hostvp = NULL;
	netmap_adapter_put(hwna);
	return error;

}

struct nm_bridge *
netmap_init_bridges2(u_int n)
{
	int i;
	struct nm_bridge *b;

	b = nm_os_malloc(sizeof(struct nm_bridge) * n);
	if (b == NULL)
		return NULL;
	for (i = 0; i < n; i++)
		BDG_RWINIT(&b[i]);
	return b;
}

void
netmap_uninit_bridges2(struct nm_bridge *b, u_int n)
{
	int i;

	if (b == NULL)
		return;

	for (i = 0; i < n; i++)
		BDG_RWDESTROY(&b[i]);
	nm_os_free(b);
}

int
netmap_init_bridges(void)
{
#ifdef CONFIG_NET_NS
	return netmap_bns_register();
#else
	nm_bridges = netmap_init_bridges2(NM_BRIDGES);
	if (nm_bridges == NULL)
		return ENOMEM;
	return 0;
#endif
}

void
netmap_uninit_bridges(void)
{
#ifdef CONFIG_NET_NS
	netmap_bns_unregister();
#else
	netmap_uninit_bridges2(nm_bridges, NM_BRIDGES);
#endif
}