xref: /freebsd/sys/dev/netmap/netmap_bdg.c (revision bc7512cc58af2e8bbe5bbf5ca0059b1daa1da897)
1 /*
2  * Copyright (C) 2013-2016 Universita` di Pisa
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  *   1. Redistributions of source code must retain the above copyright
9  *      notice, this list of conditions and the following disclaimer.
10  *   2. Redistributions in binary form must reproduce the above copyright
11  *      notice, this list of conditions and the following disclaimer in the
12  *      documentation and/or other materials provided with the distribution.
13  *
14  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24  * SUCH DAMAGE.
25  */
26 
27 
28 /*
29  * This module implements the VALE switch for netmap
30 
31 --- VALE SWITCH ---
32 
33 NMG_LOCK() serializes all modifications to switches and ports.
34 A switch cannot be deleted until all ports are gone.
35 
36 For each switch, an SX lock (RWlock on linux) protects
37 deletion of ports. When configuring or deleting a new port, the
38 lock is acquired in exclusive mode (after holding NMG_LOCK).
39 When forwarding, the lock is acquired in shared mode (without NMG_LOCK).
40 The lock is held throughout the entire forwarding cycle,
41 during which the thread may incur a page fault.
42 Hence it is important that sleepable shared locks are used.
43 
44 On the rx ring, the per-port lock is grabbed initially to reserve
45 a number of slots in the ring, then the lock is released,
46 packets are copied from source to destination, and then
47 the lock is acquired again and the receive ring is updated.
48 (A similar thing is done on the tx ring for NIC and host stack
49 ports attached to the switch)
50 
51  */
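/*
 * A minimal sketch of the rx-ring pattern described above, using kring->q_lock
 * to stand for the per-port lock mentioned in the comment; reserve_slots(),
 * copy_packets() and commit_slots() are hypothetical helper names for
 * illustration only, not functions of this file:
 *
 *	mtx_lock(&kring->q_lock);
 *	first = reserve_slots(kring, n);	// reserve n slots
 *	mtx_unlock(&kring->q_lock);
 *	copy_packets(src, kring, first, n);	// copy without the lock held
 *	mtx_lock(&kring->q_lock);
 *	commit_slots(kring, first, n);		// advance hwtail, publish slots
 *	mtx_unlock(&kring->q_lock);
 */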
52 
53 /*
54  * OS-specific code that is used only within this file.
55  * Other OS-specific code that must be accessed by drivers
56  * is present in netmap_kern.h
57  */
58 
59 #if defined(__FreeBSD__)
60 #include <sys/cdefs.h> /* prerequisite */
61 __FBSDID("$FreeBSD$");
62 
63 #include <sys/types.h>
64 #include <sys/errno.h>
65 #include <sys/param.h>	/* defines used in kernel.h */
66 #include <sys/kernel.h>	/* types used in module initialization */
67 #include <sys/conf.h>	/* cdevsw struct, UID, GID */
68 #include <sys/sockio.h>
69 #include <sys/socketvar.h>	/* struct socket */
70 #include <sys/malloc.h>
71 #include <sys/poll.h>
72 #include <sys/rwlock.h>
73 #include <sys/socket.h> /* sockaddrs */
74 #include <sys/selinfo.h>
75 #include <sys/sysctl.h>
76 #include <net/if.h>
77 #include <net/if_var.h>
78 #include <net/bpf.h>		/* BIOCIMMEDIATE */
79 #include <machine/bus.h>	/* bus_dmamap_* */
80 #include <sys/endian.h>
81 #include <sys/refcount.h>
82 #include <sys/smp.h>
83 
84 
85 #elif defined(linux)
86 
87 #include "bsd_glue.h"
88 
89 #elif defined(__APPLE__)
90 
91 #warning OSX support is only partial
92 #include "osx_glue.h"
93 
94 #elif defined(_WIN32)
95 #include "win_glue.h"
96 
97 #else
98 
99 #error	Unsupported platform
100 
101 #endif /* unsupported */
102 
103 /*
104  * common headers
105  */
106 
107 #include <net/netmap.h>
108 #include <dev/netmap/netmap_kern.h>
109 #include <dev/netmap/netmap_mem2.h>
110 
111 #include <dev/netmap/netmap_bdg.h>
112 
113 const char*
114 netmap_bdg_name(struct netmap_vp_adapter *vp)
115 {
116 	struct nm_bridge *b = vp->na_bdg;
117 	if (b == NULL)
118 		return NULL;
119 	return b->bdg_basename;
120 }
121 
122 
123 #ifndef CONFIG_NET_NS
124 /*
125  * XXX in principle nm_bridges could be created dynamically
126  * Right now we have a static array and deletions are protected
127  * by an exclusive lock.
128  */
129 struct nm_bridge *nm_bridges;
130 #endif /* !CONFIG_NET_NS */
131 
132 
133 static int
134 nm_is_id_char(const char c)
135 {
136 	return (c >= 'a' && c <= 'z') ||
137 	       (c >= 'A' && c <= 'Z') ||
138 	       (c >= '0' && c <= '9') ||
139 	       (c == '_');
140 }
141 
142 /* Validate the name of a bdg port and return the
143  * position of the ":" character. */
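/* For example, with the standard "vale" prefix (prefixlen 4), the name
 * "vale0:em0" returns 5, the index of ':' (which is also the length of the
 * bridge name "vale0"); a name without a ':' or containing a character that
 * is neither ':' nor in [A-Za-z0-9_] returns -1. */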
144 static int
145 nm_bdg_name_validate(const char *name, size_t prefixlen)
146 {
147 	int colon_pos = -1;
148 	int i;
149 
150 	if (!name || strlen(name) < prefixlen) {
151 		return -1;
152 	}
153 
154 	for (i = 0; i < NM_BDG_IFNAMSIZ && name[i]; i++) {
155 		if (name[i] == ':') {
156 			colon_pos = i;
157 			break;
158 		} else if (!nm_is_id_char(name[i])) {
159 			return -1;
160 		}
161 	}
162 
163 	if (strlen(name) - colon_pos > IFNAMSIZ) {
164 		/* interface name too long */
165 		return -1;
166 	}
167 
168 	return colon_pos;
169 }
170 
171 /*
172  * locate a bridge among the existing ones.
173  * MUST BE CALLED WITH NMG_LOCK()
174  *
175  * a ':' in the name terminates the bridge name. Otherwise, just NM_NAME.
176  * We assume that this is called with a name of at least NM_NAME chars.
177  */
178 struct nm_bridge *
179 nm_find_bridge(const char *name, int create, struct netmap_bdg_ops *ops)
180 {
181 	int i, namelen;
182 	struct nm_bridge *b = NULL, *bridges;
183 	u_int num_bridges;
184 
185 	NMG_LOCK_ASSERT();
186 
187 	netmap_bns_getbridges(&bridges, &num_bridges);
188 
189 	namelen = nm_bdg_name_validate(name,
190 			(ops != NULL ? strlen(ops->name) : 0));
191 	if (namelen < 0) {
192 		nm_prerr("invalid bridge name %s", name ? name : "(null)");
193 		return NULL;
194 	}
195 
196 	/* lookup the name, remember empty slot if there is one */
197 	for (i = 0; i < num_bridges; i++) {
198 		struct nm_bridge *x = bridges + i;
199 
200 		if ((x->bdg_flags & NM_BDG_ACTIVE) + x->bdg_active_ports == 0) {
201 			if (create && b == NULL)
202 				b = x;	/* record empty slot */
203 		} else if (x->bdg_namelen != namelen) {
204 			continue;
205 		} else if (strncmp(name, x->bdg_basename, namelen) == 0) {
206 			nm_prdis("found '%.*s' at %d", namelen, name, i);
207 			b = x;
208 			break;
209 		}
210 	}
211 	if (i == num_bridges && b) { /* name not found, can create entry */
212 		/* initialize the bridge */
213 		nm_prdis("create new bridge %s with ports %d", b->bdg_basename,
214 			b->bdg_active_ports);
215 		b->ht = nm_os_malloc(sizeof(struct nm_hash_ent) * NM_BDG_HASH);
216 		if (b->ht == NULL) {
217 			nm_prerr("failed to allocate hash table");
218 			return NULL;
219 		}
220 		strncpy(b->bdg_basename, name, namelen);
221 		b->bdg_namelen = namelen;
222 		b->bdg_active_ports = 0;
223 		for (i = 0; i < NM_BDG_MAXPORTS; i++)
224 			b->bdg_port_index[i] = i;
225 		/* set the default function */
226 		b->bdg_ops = b->bdg_saved_ops = *ops;
227 		b->private_data = b->ht;
228 		b->bdg_flags = 0;
229 		NM_BNS_GET(b);
230 	}
231 	return b;
232 }
233 
234 
235 int
236 netmap_bdg_free(struct nm_bridge *b)
237 {
238 	if ((b->bdg_flags & NM_BDG_ACTIVE) + b->bdg_active_ports != 0) {
239 		return EBUSY;
240 	}
241 
242 	nm_prdis("marking bridge %s as free", b->bdg_basename);
243 	nm_os_free(b->ht);
244 	memset(&b->bdg_ops, 0, sizeof(b->bdg_ops));
245 	memset(&b->bdg_saved_ops, 0, sizeof(b->bdg_saved_ops));
246 	b->bdg_flags = 0;
247 	NM_BNS_PUT(b);
248 	return 0;
249 }
250 
251 /* Called by external kernel modules (e.g., Openvswitch)
252  * to modify the private data previously given to regops().
253  * 'name' may be just the bridge's name (including ':' if it
254  * is not just NM_BDG_NAME).
255  * Called without NMG_LOCK.
256  */
257 int
258 netmap_bdg_update_private_data(const char *name, bdg_update_private_data_fn_t callback,
259 	void *callback_data, void *auth_token)
260 {
261 	void *private_data = NULL;
262 	struct nm_bridge *b;
263 	int error = 0;
264 
265 	NMG_LOCK();
266 	b = nm_find_bridge(name, 0 /* don't create */, NULL);
267 	if (!b) {
268 		error = EINVAL;
269 		goto unlock_update_priv;
270 	}
271 	if (!nm_bdg_valid_auth_token(b, auth_token)) {
272 		error = EACCES;
273 		goto unlock_update_priv;
274 	}
275 	BDG_WLOCK(b);
276 	private_data = callback(b->private_data, callback_data, &error);
277 	b->private_data = private_data;
278 	BDG_WUNLOCK(b);
279 
280 unlock_update_priv:
281 	NMG_UNLOCK();
282 	return error;
283 }
284 
285 
286 
287 /* remove from bridge b the ports in slots hw and sw
288  * (sw can be -1 if not needed)
289  */
290 void
291 netmap_bdg_detach_common(struct nm_bridge *b, int hw, int sw)
292 {
293 	int s_hw = hw, s_sw = sw;
294 	int i, lim = b->bdg_active_ports;
295 	uint32_t *tmp = b->tmp_bdg_port_index;
296 
297 	/*
298 	 * New algorithm:
299 	 * make a copy of bdg_port_index;
300 	 * lookup NA(ifp)->bdg_port and SWNA(ifp)->bdg_port
301 	 * in the array of bdg_port_index, replacing them with
302 	 * entries from the bottom of the array;
303 	 * decrement bdg_active_ports;
304 	 * acquire BDG_WLOCK() and copy back the array.
305 	 */
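	/*
	 * Worked example (illustrative, not from the original comment):
	 * with bdg_port_index = {3, 1, 4, 2}, bdg_active_ports = 4 and
	 * hw = 1, the scan swaps entry 1 with the last active entry,
	 * leaving {3, 2, 4, 1} and bdg_active_ports = 3; the removed
	 * port now sits past the active region and its slot can be
	 * reused by a later attach.
	 */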
306 
307 	if (netmap_debug & NM_DEBUG_BDG)
308 		nm_prinf("detach %d and %d (lim %d)", hw, sw, lim);
309 	/* make a copy of the list of active ports, update it,
310 	 * and then copy back within BDG_WLOCK().
311 	 */
312 	memcpy(b->tmp_bdg_port_index, b->bdg_port_index, sizeof(b->tmp_bdg_port_index));
313 	for (i = 0; (hw >= 0 || sw >= 0) && i < lim; ) {
314 		if (hw >= 0 && tmp[i] == hw) {
315 			nm_prdis("detach hw %d at %d", hw, i);
316 			lim--; /* point to last active port */
317 			tmp[i] = tmp[lim]; /* swap with i */
318 			tmp[lim] = hw;	/* now this is inactive */
319 			hw = -1;
320 		} else if (sw >= 0 && tmp[i] == sw) {
321 			nm_prdis("detach sw %d at %d", sw, i);
322 			lim--;
323 			tmp[i] = tmp[lim];
324 			tmp[lim] = sw;
325 			sw = -1;
326 		} else {
327 			i++;
328 		}
329 	}
330 	if (hw >= 0 || sw >= 0) {
331 		nm_prerr("delete failed hw %d sw %d, should panic...", hw, sw);
332 	}
333 
334 	BDG_WLOCK(b);
335 	if (b->bdg_ops.dtor)
336 		b->bdg_ops.dtor(b->bdg_ports[s_hw]);
337 	b->bdg_ports[s_hw] = NULL;
338 	if (s_sw >= 0) {
339 		b->bdg_ports[s_sw] = NULL;
340 	}
341 	memcpy(b->bdg_port_index, b->tmp_bdg_port_index, sizeof(b->tmp_bdg_port_index));
342 	b->bdg_active_ports = lim;
343 	BDG_WUNLOCK(b);
344 
345 	nm_prdis("now %d active ports", lim);
346 	netmap_bdg_free(b);
347 }
348 
349 
350 /* nm_bdg_ctl callback for VALE ports */
351 int
352 netmap_vp_bdg_ctl(struct nmreq_header *hdr, struct netmap_adapter *na)
353 {
354 	struct netmap_vp_adapter *vpna = (struct netmap_vp_adapter *)na;
355 	struct nm_bridge *b = vpna->na_bdg;
356 
357 	if (hdr->nr_reqtype == NETMAP_REQ_VALE_ATTACH) {
358 		return 0; /* nothing to do */
359 	}
360 	if (b) {
361 		netmap_set_all_rings(na, 0 /* disable */);
362 		netmap_bdg_detach_common(b, vpna->bdg_port, -1);
363 		vpna->na_bdg = NULL;
364 		netmap_set_all_rings(na, 1 /* enable */);
365 	}
366 	/* we took a reference just for the attach */
367 	netmap_adapter_put(na);
368 	return 0;
369 }
370 
371 int
372 netmap_default_bdg_attach(const char *name, struct netmap_adapter *na,
373 		struct nm_bridge *b)
374 {
375 	return NM_NEED_BWRAP;
376 }
377 
378 /* Try to get a reference to a netmap adapter attached to a VALE switch.
379  * If the adapter is found (or is created), this function returns 0, a
380  * non NULL pointer is returned into *na, and the caller holds a
381  * reference to the adapter.
382  * If an adapter is not found, then no reference is grabbed and the
383  * function returns an error code, or 0 if there is just a VALE prefix
384  * mismatch. Therefore the caller holds a reference when
385  * (*na != NULL && return == 0).
386  */
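/* Typical caller pattern (a sketch with a hypothetical caller, to be run
 * with NMG_LOCK held; compare netmap_bdg_attach() below, which goes through
 * the netmap_get_vale_na() wrapper):
 *
 *	error = netmap_get_bdg_na(hdr, &na, nmd, create, ops);
 *	if (error)
 *		return error;		// lookup or creation failed
 *	if (na == NULL)
 *		return EINVAL;		// name does not match this switch family
 *	... use na ...
 *	netmap_adapter_put(na);		// drop the reference we were given
 */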
387 int
388 netmap_get_bdg_na(struct nmreq_header *hdr, struct netmap_adapter **na,
389 	struct netmap_mem_d *nmd, int create, struct netmap_bdg_ops *ops)
390 {
391 	char *nr_name = hdr->nr_name;
392 	const char *ifname;
393 	struct ifnet *ifp = NULL;
394 	int error = 0;
395 	struct netmap_vp_adapter *vpna, *hostna = NULL;
396 	struct nm_bridge *b;
397 	uint32_t i, j;
398 	uint32_t cand = NM_BDG_NOPORT, cand2 = NM_BDG_NOPORT;
399 	int needed;
400 
401 	*na = NULL;     /* default return value */
402 
403 	/* first try to see if this is a bridge port. */
404 	NMG_LOCK_ASSERT();
405 	if (strncmp(nr_name, ops->name, strlen(ops->name) - 1)) {
406 		return 0;  /* no error, but no VALE prefix */
407 	}
408 
409 	b = nm_find_bridge(nr_name, create, ops);
410 	if (b == NULL) {
411 		nm_prdis("no bridges available for '%s'", nr_name);
412 		return (create ? ENOMEM : ENXIO);
413 	}
414 	if (strlen(nr_name) < b->bdg_namelen) /* impossible */
415 		panic("x");
416 
417 	/* Now we are sure that name starts with the bridge's name,
418 	 * lookup the port in the bridge. We need to scan the entire
419 	 * list. It is not important to hold a WLOCK on the bridge
420 	 * during the search because NMG_LOCK already guarantees
421 	 * that there are no other possible writers.
422 	 */
423 
424 	/* lookup in the local list of ports */
425 	for (j = 0; j < b->bdg_active_ports; j++) {
426 		i = b->bdg_port_index[j];
427 		vpna = b->bdg_ports[i];
428 		nm_prdis("checking %s", vpna->up.name);
429 		if (!strcmp(vpna->up.name, nr_name)) {
430 			netmap_adapter_get(&vpna->up);
431 			nm_prdis("found existing if %s refs %d", nr_name, vpna->up.na_refcount);
432 			*na = &vpna->up;
433 			return 0;
434 		}
435 	}
436 	/* not found, should we create it? */
437 	if (!create)
438 		return ENXIO;
439 	/* yes we should, see if we have space to attach entries */
440 	needed = 2; /* in some cases we only need 1 */
441 	if (b->bdg_active_ports + needed >= NM_BDG_MAXPORTS) {
442 		nm_prerr("bridge full %d, cannot create new port", b->bdg_active_ports);
443 		return ENOMEM;
444 	}
445 	/* record the next two ports available, but do not allocate yet */
446 	cand = b->bdg_port_index[b->bdg_active_ports];
447 	cand2 = b->bdg_port_index[b->bdg_active_ports + 1];
448 	/*
449 	 * try to see if there is a matching NIC with this name
450 	 * (after the bridge's name)
451 	 */
452 	ifname = nr_name + b->bdg_namelen + 1;
453 	nm_prdis("+++ bridge %s port %s used %d avail %d %d",
454 		b->bdg_basename, ifname, b->bdg_active_ports, cand, cand2);
455 
456 	ifp = ifunit_ref(ifname);
457 	if (!ifp) {
458 		/* Create an ephemeral virtual port.
459 		 * This block contains all the ephemeral-specific logic.
460 		 */
461 
462 		if (hdr->nr_reqtype != NETMAP_REQ_REGISTER) {
463 			error = EINVAL;
464 			goto out;
465 		}
466 
467 		/* bdg_netmap_attach creates a struct netmap_adapter */
468 		error = b->bdg_ops.vp_create(hdr, NULL, nmd, &vpna);
469 		if (error) {
470 			if (netmap_debug & NM_DEBUG_BDG)
471 				nm_prerr("error %d", error);
472 			goto out;
473 		}
474 		/* shortcut - we can skip get_hw_na(),
475 		 * ownership check and nm_bdg_attach()
476 		 */
477 
478 	} else {
479 		struct netmap_adapter *hw;
480 
481 		/* the vale:nic syntax is only valid for some commands */
482 		switch (hdr->nr_reqtype) {
483 		case NETMAP_REQ_VALE_ATTACH:
484 		case NETMAP_REQ_VALE_DETACH:
485 		case NETMAP_REQ_VALE_POLLING_ENABLE:
486 		case NETMAP_REQ_VALE_POLLING_DISABLE:
487 			break; /* ok */
488 		default:
489 			error = EINVAL;
490 			goto out;
491 		}
492 
493 		error = netmap_get_hw_na(ifp, nmd, &hw);
494 		if (error || hw == NULL)
495 			goto out;
496 
497 		/* host adapter might not be created */
498 		error = hw->nm_bdg_attach(nr_name, hw, b);
499 		if (error == NM_NEED_BWRAP) {
500 			error = b->bdg_ops.bwrap_attach(nr_name, hw);
501 		}
502 		if (error)
503 			goto out;
504 		vpna = hw->na_vp;
505 		hostna = hw->na_hostvp;
506 		if (hdr->nr_reqtype == NETMAP_REQ_VALE_ATTACH) {
507 			/* Check if we need to skip the host rings. */
508 			struct nmreq_vale_attach *areq =
509 				(struct nmreq_vale_attach *)(uintptr_t)hdr->nr_body;
510 			if (areq->reg.nr_mode != NR_REG_NIC_SW) {
511 				hostna = NULL;
512 			}
513 		}
514 	}
515 
516 	BDG_WLOCK(b);
517 	vpna->bdg_port = cand;
518 	nm_prdis("NIC  %p to bridge port %d", vpna, cand);
519 	/* bind the port to the bridge (virtual ports are not active) */
520 	b->bdg_ports[cand] = vpna;
521 	vpna->na_bdg = b;
522 	b->bdg_active_ports++;
523 	if (hostna != NULL) {
524 		/* also bind the host stack to the bridge */
525 		b->bdg_ports[cand2] = hostna;
526 		hostna->bdg_port = cand2;
527 		hostna->na_bdg = b;
528 		b->bdg_active_ports++;
529 		nm_prdis("host %p to bridge port %d", hostna, cand2);
530 	}
531 	nm_prdis("if %s refs %d", ifname, vpna->up.na_refcount);
532 	BDG_WUNLOCK(b);
533 	*na = &vpna->up;
534 	netmap_adapter_get(*na);
535 
536 out:
537 	if (ifp)
538 		if_rele(ifp);
539 
540 	return error;
541 }
542 
543 /* Process NETMAP_REQ_VALE_ATTACH.
544  */
545 int
546 netmap_bdg_attach(struct nmreq_header *hdr, void *auth_token)
547 {
548 	struct nmreq_vale_attach *req =
549 		(struct nmreq_vale_attach *)(uintptr_t)hdr->nr_body;
550 	struct netmap_vp_adapter * vpna;
551 	struct netmap_adapter *na = NULL;
552 	struct netmap_mem_d *nmd = NULL;
553 	struct nm_bridge *b = NULL;
554 	int error;
555 
556 	NMG_LOCK();
557 	/* permission check for modified bridges */
558 	b = nm_find_bridge(hdr->nr_name, 0 /* don't create */, NULL);
559 	if (b && !nm_bdg_valid_auth_token(b, auth_token)) {
560 		error = EACCES;
561 		goto unlock_exit;
562 	}
563 
564 	if (req->reg.nr_mem_id) {
565 		nmd = netmap_mem_find(req->reg.nr_mem_id);
566 		if (nmd == NULL) {
567 			error = EINVAL;
568 			goto unlock_exit;
569 		}
570 	}
571 
572 	/* check for existing one */
573 	error = netmap_get_vale_na(hdr, &na, nmd, 0);
574 	if (na) {
575 		error = EBUSY;
576 		goto unref_exit;
577 	}
578 	error = netmap_get_vale_na(hdr, &na,
579 				nmd, 1 /* create if not exists */);
580 	if (error) { /* no device */
581 		goto unlock_exit;
582 	}
583 
584 	if (na == NULL) { /* VALE prefix missing */
585 		error = EINVAL;
586 		goto unlock_exit;
587 	}
588 
589 	if (NETMAP_OWNED_BY_ANY(na)) {
590 		error = EBUSY;
591 		goto unref_exit;
592 	}
593 
594 	if (na->nm_bdg_ctl) {
595 		/* nop for VALE ports. The bwrap needs to put the hwna
596 		 * in netmap mode (see netmap_bwrap_bdg_ctl)
597 		 */
598 		error = na->nm_bdg_ctl(hdr, na);
599 		if (error)
600 			goto unref_exit;
601 		nm_prdis("registered %s to netmap-mode", na->name);
602 	}
603 	vpna = (struct netmap_vp_adapter *)na;
604 	req->port_index = vpna->bdg_port;
605 
606 	if (nmd)
607 		netmap_mem_put(nmd);
608 
609 	NMG_UNLOCK();
610 	return 0;
611 
612 unref_exit:
613 	netmap_adapter_put(na);
614 unlock_exit:
615 	if (nmd)
616 		netmap_mem_put(nmd);
617 
618 	NMG_UNLOCK();
619 	return error;
620 }
621 
622 
623 int
624 nm_is_bwrap(struct netmap_adapter *na)
625 {
626 	return na->nm_register == netmap_bwrap_reg;
627 }
628 
629 /* Process NETMAP_REQ_VALE_DETACH.
630  */
631 int
632 netmap_bdg_detach(struct nmreq_header *hdr, void *auth_token)
633 {
634 	int error;
635 
636 	NMG_LOCK();
637 	error = netmap_bdg_detach_locked(hdr, auth_token);
638 	NMG_UNLOCK();
639 	return error;
640 }
641 
642 int
643 netmap_bdg_detach_locked(struct nmreq_header *hdr, void *auth_token)
644 {
645 	struct nmreq_vale_detach *nmreq_det = (void *)(uintptr_t)hdr->nr_body;
646 	struct netmap_vp_adapter *vpna;
647 	struct netmap_adapter *na;
648 	struct nm_bridge *b = NULL;
649 	int error;
650 
651 	/* permission check for modified bridges */
652 	b = nm_find_bridge(hdr->nr_name, 0 /* don't create */, NULL);
653 	if (b && !nm_bdg_valid_auth_token(b, auth_token)) {
654 		error = EACCES;
655 		goto error_exit;
656 	}
657 
658 	error = netmap_get_vale_na(hdr, &na, NULL, 0 /* don't create */);
659 	if (error) { /* no device, or another bridge or user owns the device */
660 		goto error_exit;
661 	}
662 
663 	if (na == NULL) { /* VALE prefix missing */
664 		error = EINVAL;
665 		goto error_exit;
666 	} else if (nm_is_bwrap(na) &&
667 		   ((struct netmap_bwrap_adapter *)na)->na_polling_state) {
668 		/* Don't detach a NIC with polling */
669 		error = EBUSY;
670 		goto unref_exit;
671 	}
672 
673 	vpna = (struct netmap_vp_adapter *)na;
674 	if (na->na_vp != vpna) {
675 		/* trying to detach the first attachment of a VALE persistent
676 		 * port that is attached to two bridges
677 		 */
678 		error = EBUSY;
679 		goto unref_exit;
680 	}
681 	nmreq_det->port_index = vpna->bdg_port;
682 
683 	if (na->nm_bdg_ctl) {
684 		/* remove the port from the bridge. The bwrap
685 		 * also needs to put the hwna in normal mode
686 		 */
687 		error = na->nm_bdg_ctl(hdr, na);
688 	}
689 
690 unref_exit:
691 	netmap_adapter_put(na);
692 error_exit:
693 	return error;
694 
695 }
696 
697 
698 struct nm_bdg_polling_state;
699 struct
700 nm_bdg_kthread {
701 	struct nm_kctx *nmk;
702 	u_int qfirst;
703 	u_int qlast;
704 	struct nm_bdg_polling_state *bps;
705 };
706 
707 struct nm_bdg_polling_state {
708 	bool configured;
709 	bool stopped;
710 	struct netmap_bwrap_adapter *bna;
711 	uint32_t mode;
712 	u_int qfirst;
713 	u_int qlast;
714 	u_int cpu_from;
715 	u_int ncpus;
716 	struct nm_bdg_kthread *kthreads;
717 };
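/* Each nm_bdg_kthread polls the hwna rx rings in the range [qfirst, qlast)
 * (see netmap_bwrap_polling() below); nm_bdg_polling_state owns the array
 * of kthreads and records the overall ring range and CPU assignment. */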
718 
719 static void
720 netmap_bwrap_polling(void *data)
721 {
722 	struct nm_bdg_kthread *nbk = data;
723 	struct netmap_bwrap_adapter *bna;
724 	u_int qfirst, qlast, i;
725 	struct netmap_kring **kring0, *kring;
726 
727 	if (!nbk)
728 		return;
729 	qfirst = nbk->qfirst;
730 	qlast = nbk->qlast;
731 	bna = nbk->bps->bna;
732 	kring0 = NMR(bna->hwna, NR_RX);
733 
734 	for (i = qfirst; i < qlast; i++) {
735 		kring = kring0[i];
736 		kring->nm_notify(kring, 0);
737 	}
738 }
739 
740 static int
741 nm_bdg_create_kthreads(struct nm_bdg_polling_state *bps)
742 {
743 	struct nm_kctx_cfg kcfg;
744 	int i, j;
745 
746 	bps->kthreads = nm_os_malloc(sizeof(struct nm_bdg_kthread) * bps->ncpus);
747 	if (bps->kthreads == NULL)
748 		return ENOMEM;
749 
750 	bzero(&kcfg, sizeof(kcfg));
751 	kcfg.worker_fn = netmap_bwrap_polling;
752 	for (i = 0; i < bps->ncpus; i++) {
753 		struct nm_bdg_kthread *t = bps->kthreads + i;
754 		int all = (bps->ncpus == 1 &&
755 			bps->mode == NETMAP_POLLING_MODE_SINGLE_CPU);
756 		int affinity = bps->cpu_from + i;
757 
758 		t->bps = bps;
759 		t->qfirst = all ? bps->qfirst /* must be 0 */: affinity;
760 		t->qlast = all ? bps->qlast : t->qfirst + 1;
761 		if (netmap_verbose)
762 			nm_prinf("kthread %d a:%u qf:%u ql:%u", i, affinity, t->qfirst,
763 				t->qlast);
764 
765 		kcfg.type = i;
766 		kcfg.worker_private = t;
767 		t->nmk = nm_os_kctx_create(&kcfg, NULL);
768 		if (t->nmk == NULL) {
769 			goto cleanup;
770 		}
771 		nm_os_kctx_worker_setaff(t->nmk, affinity);
772 	}
773 	return 0;
774 
775 cleanup:
776 	for (j = 0; j < i; j++) {
777 		struct nm_bdg_kthread *t = bps->kthreads + j;
778 		nm_os_kctx_destroy(t->nmk);
779 	}
780 	nm_os_free(bps->kthreads);
781 	return EFAULT;
782 }
783 
784 /* A variant of ptnetmap_start_kthreads() */
785 static int
786 nm_bdg_polling_start_kthreads(struct nm_bdg_polling_state *bps)
787 {
788 	int error, i, j;
789 
790 	if (!bps) {
791 		nm_prerr("polling is not configured");
792 		return EFAULT;
793 	}
794 	bps->stopped = false;
795 
796 	for (i = 0; i < bps->ncpus; i++) {
797 		struct nm_bdg_kthread *t = bps->kthreads + i;
798 		error = nm_os_kctx_worker_start(t->nmk);
799 		if (error) {
800 			nm_prerr("error in nm_kthread_start(): %d", error);
801 			goto cleanup;
802 		}
803 	}
804 	return 0;
805 
806 cleanup:
807 	for (j = 0; j < i; j++) {
808 		struct nm_bdg_kthread *t = bps->kthreads + j;
809 		nm_os_kctx_worker_stop(t->nmk);
810 	}
811 	bps->stopped = true;
812 	return error;
813 }
814 
815 static void
816 nm_bdg_polling_stop_delete_kthreads(struct nm_bdg_polling_state *bps)
817 {
818 	int i;
819 
820 	if (!bps)
821 		return;
822 
823 	for (i = 0; i < bps->ncpus; i++) {
824 		struct nm_bdg_kthread *t = bps->kthreads + i;
825 		nm_os_kctx_worker_stop(t->nmk);
826 		nm_os_kctx_destroy(t->nmk);
827 	}
828 	bps->stopped = true;
829 }
830 
831 static int
832 get_polling_cfg(struct nmreq_vale_polling *req, struct netmap_adapter *na,
833 		struct nm_bdg_polling_state *bps)
834 {
835 	unsigned int avail_cpus, core_from;
836 	unsigned int qfirst, qlast;
837 	uint32_t i = req->nr_first_cpu_id;
838 	uint32_t req_cpus = req->nr_num_polling_cpus;
839 
840 	avail_cpus = nm_os_ncpus();
841 
842 	if (req_cpus == 0) {
843 		nm_prerr("req_cpus must be > 0");
844 		return EINVAL;
845 	} else if (req_cpus >= avail_cpus) {
846 		nm_prerr("Cannot use all the CPUs in the system");
847 		return EINVAL;
848 	}
849 
850 	if (req->nr_mode == NETMAP_POLLING_MODE_MULTI_CPU) {
851 		/* Use a separate core for each ring. If nr_num_polling_cpus > 1,
852 		 * that many consecutive rings are polled, one ring per core.
853 		 * For example, if nr_first_cpu_id=2 and nr_num_polling_cpus=2,
854 		 * rings 2 and 3 are polled by cores 2 and 3, respectively. */
855 		if (i + req_cpus > nma_get_nrings(na, NR_RX)) {
856 			nm_prerr("Rings %u-%u not in range (have %d rings)",
857 				i, i + req_cpus, nma_get_nrings(na, NR_RX));
858 			return EINVAL;
859 		}
860 		qfirst = i;
861 		qlast = qfirst + req_cpus;
862 		core_from = qfirst;
863 
864 	} else if (req->nr_mode == NETMAP_POLLING_MODE_SINGLE_CPU) {
865 		/* Poll all the rings using a core specified by nr_first_cpu_id.
866 		 * The number of cores must be 1. */
867 		if (req_cpus != 1) {
868 			nm_prerr("ncpus must be 1 for NETMAP_POLLING_MODE_SINGLE_CPU "
869 				"(was %d)", req_cpus);
870 			return EINVAL;
871 		}
872 		qfirst = 0;
873 		qlast = nma_get_nrings(na, NR_RX);
874 		core_from = i;
875 	} else {
876 		nm_prerr("Invalid polling mode");
877 		return EINVAL;
878 	}
879 
880 	bps->mode = req->nr_mode;
881 	bps->qfirst = qfirst;
882 	bps->qlast = qlast;
883 	bps->cpu_from = core_from;
884 	bps->ncpus = req_cpus;
885 	nm_prinf("%s qfirst %u qlast %u cpu_from %u ncpus %u",
886 		req->nr_mode == NETMAP_POLLING_MODE_MULTI_CPU ?
887 		"MULTI" : "SINGLE",
888 		qfirst, qlast, core_from, req_cpus);
889 	return 0;
890 }
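/*
 * Illustrative mapping (numbers are hypothetical): on an adapter with 8 rx
 * rings, NETMAP_POLLING_MODE_SINGLE_CPU with nr_first_cpu_id=3 yields
 * qfirst=0, qlast=8, cpu_from=3, ncpus=1 (one core polls every ring);
 * NETMAP_POLLING_MODE_MULTI_CPU with nr_first_cpu_id=2 and
 * nr_num_polling_cpus=3 yields qfirst=2, qlast=5, cpu_from=2, ncpus=3.
 */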
891 
892 static int
893 nm_bdg_ctl_polling_start(struct nmreq_vale_polling *req, struct netmap_adapter *na)
894 {
895 	struct nm_bdg_polling_state *bps;
896 	struct netmap_bwrap_adapter *bna;
897 	int error;
898 
899 	bna = (struct netmap_bwrap_adapter *)na;
900 	if (bna->na_polling_state) {
901 		nm_prerr("ERROR adapter already in polling mode");
902 		return EFAULT;
903 	}
904 
905 	bps = nm_os_malloc(sizeof(*bps));
906 	if (!bps)
907 		return ENOMEM;
908 	bps->configured = false;
909 	bps->stopped = true;
910 
911 	if (get_polling_cfg(req, na, bps)) {
912 		nm_os_free(bps);
913 		return EINVAL;
914 	}
915 
916 	if (nm_bdg_create_kthreads(bps)) {
917 		nm_os_free(bps);
918 		return EFAULT;
919 	}
920 
921 	bps->configured = true;
922 	bna->na_polling_state = bps;
923 	bps->bna = bna;
924 
925 	/* disable interrupts if possible */
926 	nma_intr_enable(bna->hwna, 0);
927 	/* start kthread now */
928 	error = nm_bdg_polling_start_kthreads(bps);
929 	if (error) {
930 		nm_prerr("ERROR nm_bdg_polling_start_kthreads()");
931 		nm_os_free(bps->kthreads);
932 		nm_os_free(bps);
933 		bna->na_polling_state = NULL;
934 		nma_intr_enable(bna->hwna, 1);
935 	}
936 	return error;
937 }
938 
939 static int
940 nm_bdg_ctl_polling_stop(struct netmap_adapter *na)
941 {
942 	struct netmap_bwrap_adapter *bna = (struct netmap_bwrap_adapter *)na;
943 	struct nm_bdg_polling_state *bps;
944 
945 	if (!bna->na_polling_state) {
946 		nm_prerr("ERROR adapter is not in polling mode");
947 		return EFAULT;
948 	}
949 	bps = bna->na_polling_state;
950 	nm_bdg_polling_stop_delete_kthreads(bna->na_polling_state);
951 	bps->configured = false;
952 	nm_os_free(bps);
953 	bna->na_polling_state = NULL;
954 	/* re-enable interrupts */
955 	nma_intr_enable(bna->hwna, 1);
956 	return 0;
957 }
958 
959 int
960 nm_bdg_polling(struct nmreq_header *hdr)
961 {
962 	struct nmreq_vale_polling *req =
963 		(struct nmreq_vale_polling *)(uintptr_t)hdr->nr_body;
964 	struct netmap_adapter *na = NULL;
965 	int error = 0;
966 
967 	NMG_LOCK();
968 	error = netmap_get_vale_na(hdr, &na, NULL, /*create=*/0);
969 	if (na && !error) {
970 		if (!nm_is_bwrap(na)) {
971 			error = EOPNOTSUPP;
972 		} else if (hdr->nr_reqtype == NETMAP_BDG_POLLING_ON) {
973 			error = nm_bdg_ctl_polling_start(req, na);
974 			if (!error)
975 				netmap_adapter_get(na);
976 		} else {
977 			error = nm_bdg_ctl_polling_stop(na);
978 			if (!error)
979 				netmap_adapter_put(na);
980 		}
981 		netmap_adapter_put(na);
982 	} else if (!na && !error) {
983 		/* Not VALE port. */
984 		error = EINVAL;
985 	}
986 	NMG_UNLOCK();
987 
988 	return error;
989 }
990 
991 /* Called by external kernel modules (e.g., Openvswitch)
992  * to set the configure/lookup/dtor functions of a VALE instance.
993  * Register callbacks to the given bridge. 'name' may be just the
994  * bridge's name (including ':' if it is not just NM_BDG_NAME).
995  *
996  * Called without NMG_LOCK.
997  */
998 
999 int
1000 netmap_bdg_regops(const char *name, struct netmap_bdg_ops *bdg_ops, void *private_data, void *auth_token)
1001 {
1002 	struct nm_bridge *b;
1003 	int error = 0;
1004 
1005 	NMG_LOCK();
1006 	b = nm_find_bridge(name, 0 /* don't create */, NULL);
1007 	if (!b) {
1008 		error = ENXIO;
1009 		goto unlock_regops;
1010 	}
1011 	if (!nm_bdg_valid_auth_token(b, auth_token)) {
1012 		error = EACCES;
1013 		goto unlock_regops;
1014 	}
1015 
1016 	BDG_WLOCK(b);
1017 	if (!bdg_ops) {
1018 		/* resetting the bridge */
1019 		bzero(b->ht, sizeof(struct nm_hash_ent) * NM_BDG_HASH);
1020 		b->bdg_ops = b->bdg_saved_ops;
1021 		b->private_data = b->ht;
1022 	} else {
1023 		/* modifying the bridge */
1024 		b->private_data = private_data;
1025 #define nm_bdg_override(m) if (bdg_ops->m) b->bdg_ops.m = bdg_ops->m
1026 		nm_bdg_override(lookup);
1027 		nm_bdg_override(config);
1028 		nm_bdg_override(dtor);
1029 		nm_bdg_override(vp_create);
1030 		nm_bdg_override(bwrap_attach);
1031 #undef nm_bdg_override
1032 
1033 	}
1034 	BDG_WUNLOCK(b);
1035 
1036 unlock_regops:
1037 	NMG_UNLOCK();
1038 	return error;
1039 }
1040 
1041 
1042 int
1043 netmap_bdg_config(struct nm_ifreq *nr)
1044 {
1045 	struct nm_bridge *b;
1046 	int error = EINVAL;
1047 
1048 	NMG_LOCK();
1049 	b = nm_find_bridge(nr->nifr_name, 0, NULL);
1050 	if (!b) {
1051 		NMG_UNLOCK();
1052 		return error;
1053 	}
1054 	NMG_UNLOCK();
1055 	/* Don't call config() with NMG_LOCK() held */
1056 	BDG_RLOCK(b);
1057 	if (b->bdg_ops.config != NULL)
1058 		error = b->bdg_ops.config(nr);
1059 	BDG_RUNLOCK(b);
1060 	return error;
1061 }
1062 
1063 
1064 /* nm_register callback for VALE ports */
1065 int
1066 netmap_vp_reg(struct netmap_adapter *na, int onoff)
1067 {
1068 	struct netmap_vp_adapter *vpna =
1069 		(struct netmap_vp_adapter*)na;
1070 
1071 	/* persistent ports may be put in netmap mode
1072 	 * before being attached to a bridge
1073 	 */
1074 	if (vpna->na_bdg)
1075 		BDG_WLOCK(vpna->na_bdg);
1076 	if (onoff) {
1077 		netmap_krings_mode_commit(na, onoff);
1078 		if (na->active_fds == 0)
1079 			na->na_flags |= NAF_NETMAP_ON;
1080 		 /* XXX on FreeBSD, persistent VALE ports should also
1081 		 * toggle IFCAP_NETMAP in na->ifp (2014-03-16)
1082 		 */
1083 	} else {
1084 		if (na->active_fds == 0)
1085 			na->na_flags &= ~NAF_NETMAP_ON;
1086 		netmap_krings_mode_commit(na, onoff);
1087 	}
1088 	if (vpna->na_bdg)
1089 		BDG_WUNLOCK(vpna->na_bdg);
1090 	return 0;
1091 }
1092 
1093 
1094 /* rxsync code used by the VALE ports' nm_rxsync callback and also
1095  * internally by the bwrap
1096  */
1097 static int
1098 netmap_vp_rxsync_locked(struct netmap_kring *kring, int flags)
1099 {
1100 	struct netmap_adapter *na = kring->na;
1101 	struct netmap_ring *ring = kring->ring;
1102 	u_int nm_i, lim = kring->nkr_num_slots - 1;
1103 	u_int head = kring->rhead;
1104 	int n;
1105 
1106 	if (head > lim) {
1107 		nm_prerr("ouch dangerous reset!!!");
1108 		n = netmap_ring_reinit(kring);
1109 		goto done;
1110 	}
1111 
1112 	/* First part, import newly received packets. */
1113 	/* actually nothing to do here, they are already in the kring */
1114 
1115 	/* Second part, skip past packets that userspace has released. */
1116 	nm_i = kring->nr_hwcur;
1117 	if (nm_i != head) {
1118 		/* consistency check, but nothing really important here */
1119 		for (n = 0; likely(nm_i != head); n++) {
1120 			struct netmap_slot *slot = &ring->slot[nm_i];
1121 			void *addr = NMB(na, slot);
1122 
1123 			if (addr == NETMAP_BUF_BASE(kring->na)) { /* bad buf */
1124 				nm_prerr("bad buffer index %d, ignore ?",
1125 					slot->buf_idx);
1126 			}
1127 			slot->flags &= ~NS_BUF_CHANGED;
1128 			nm_i = nm_next(nm_i, lim);
1129 		}
1130 		kring->nr_hwcur = head;
1131 	}
1132 
1133 	n = 0;
1134 done:
1135 	return n;
1136 }
1137 
1138 /*
1139  * nm_rxsync callback for VALE ports
1140  * user process reading from a VALE switch.
1141  * Already protected against concurrent calls from userspace,
1142  * but we must acquire the queue's lock to protect against
1143  * writers on the same queue.
1144  */
1145 int
1146 netmap_vp_rxsync(struct netmap_kring *kring, int flags)
1147 {
1148 	int n;
1149 
1150 	mtx_lock(&kring->q_lock);
1151 	n = netmap_vp_rxsync_locked(kring, flags);
1152 	mtx_unlock(&kring->q_lock);
1153 	return n;
1154 }
1155 
1156 int
1157 netmap_bwrap_attach(const char *nr_name, struct netmap_adapter *hwna,
1158 		struct netmap_bdg_ops *ops)
1159 {
1160 	return ops->bwrap_attach(nr_name, hwna);
1161 }
1162 
1163 
1164 /* Bridge wrapper code (bwrap).
1165  * This is used to connect a non-VALE-port netmap_adapter (hwna) to a
1166  * VALE switch.
1167  * The main task is to swap the meaning of tx and rx rings to match the
1168  * expectations of the VALE switch code (see nm_bdg_flush).
1169  *
1170  * The bwrap works by interposing a netmap_bwrap_adapter between the
1171  * rest of the system and the hwna. The netmap_bwrap_adapter looks like
1172  * a netmap_vp_adapter to the rest of the system, but, internally, it
1173  * translates all callbacks to what the hwna expects.
1174  *
1175  * Note that we have to intercept callbacks coming from two sides:
1176  *
1177  *  - callbacks coming from the netmap module are intercepted by
1178  *    passing around the netmap_bwrap_adapter instead of the hwna
1179  *
1180  *  - callbacks coming from outside of the netmap module only know
1181  *    about the hwna. This, however, only happens in interrupt
1182  *    handlers, where only the hwna->nm_notify callback is called.
1183  *    What the bwrap does is to overwrite the hwna->nm_notify callback
1184  *    with its own netmap_bwrap_intr_notify.
1185  *    XXX This assumes that the hwna->nm_notify callback was the
1186  *    standard netmap_notify(), as is the case for NIC adapters.
1187  *    Any additional action performed by hwna->nm_notify will not be
1188  *    performed by netmap_bwrap_intr_notify.
1189  *
1190  * Additionally, the bwrap can optionally attach the host rings pair
1191  * of the wrapped adapter to a different port of the switch.
1192  */
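/*
 * Rough picture of the interposition described above (sketch only):
 *
 *	VALE switch port <--> netmap_bwrap_adapter <--> hwna (real adapter)
 *	                         (tx/rx swapped)          ^
 *	                                                  | interrupts: nm_notify
 *	                                                  | overwritten with
 *	                                                  | netmap_bwrap_intr_notify
 *
 * Optionally, the host rings pair of the hwna is exposed as a second
 * switch port (bna->host).
 */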
1193 
1194 
1195 static void
1196 netmap_bwrap_dtor(struct netmap_adapter *na)
1197 {
1198 	struct netmap_bwrap_adapter *bna = (struct netmap_bwrap_adapter*)na;
1199 	struct netmap_adapter *hwna = bna->hwna;
1200 	struct nm_bridge *b = bna->up.na_bdg,
1201 		*bh = bna->host.na_bdg;
1202 
1203 	if (bna->host.up.nm_mem)
1204 		netmap_mem_put(bna->host.up.nm_mem);
1205 
1206 	if (b) {
1207 		netmap_bdg_detach_common(b, bna->up.bdg_port,
1208 			    (bh ? bna->host.bdg_port : -1));
1209 	}
1210 
1211 	nm_prdis("na %p", na);
1212 	na->ifp = NULL;
1213 	bna->host.up.ifp = NULL;
1214 	hwna->na_vp = bna->saved_na_vp;
1215 	hwna->na_hostvp = NULL;
1216 	hwna->na_private = NULL;
1217 	hwna->na_flags &= ~NAF_BUSY;
1218 	netmap_adapter_put(hwna);
1219 
1220 }
1221 
1222 
1223 /*
1224  * Intr callback for NICs connected to a bridge.
1225  * Simply ignore tx interrupts (maybe we could try to recover space ?)
1226  * and pass received packets from nic to the bridge.
1227  *
1228  * XXX TODO check locking: this is called from the interrupt
1229  * handler so we should make sure that the interface is not
1230  * disconnected while passing down an interrupt.
1231  *
1232  * Note, no user process can access this NIC or the host stack.
1233  * The only part of the ring that is significant are the slots,
1234  * and head/cur/tail are set from the kring as needed
1235  * (part as a receive ring, part as a transmit ring).
1236  *
1237  * callback that overwrites the hwna notify callback.
1238  * Packets come from the outside or from the host stack and are put on an
1239  * hwna rx ring.
1240  * The bridge wrapper then sends the packets through the bridge.
1241  */
1242 int
1243 netmap_bwrap_intr_notify(struct netmap_kring *kring, int flags)
1244 {
1245 	struct netmap_adapter *na = kring->na;
1246 	struct netmap_bwrap_adapter *bna = na->na_private;
1247 	struct netmap_kring *bkring;
1248 	struct netmap_vp_adapter *vpna = &bna->up;
1249 	u_int ring_nr = kring->ring_id;
1250 	int ret = NM_IRQ_COMPLETED;
1251 	int error;
1252 
1253 	if (netmap_debug & NM_DEBUG_RXINTR)
1254 	    nm_prinf("%s %s 0x%x", na->name, kring->name, flags);
1255 
1256 	bkring = vpna->up.tx_rings[ring_nr];
1257 
1258 	/* make sure the ring is not disabled */
1259 	if (nm_kr_tryget(kring, 0 /* can't sleep */, NULL)) {
1260 		return EIO;
1261 	}
1262 
1263 	if (netmap_debug & NM_DEBUG_RXINTR)
1264 	    nm_prinf("%s head %d cur %d tail %d",  na->name,
1265 		kring->rhead, kring->rcur, kring->rtail);
1266 
1267 	/* simulate a user wakeup on the rx ring
1268 	 * fetch packets that have arrived.
1269 	 */
1270 	error = kring->nm_sync(kring, 0);
1271 	if (error)
1272 		goto put_out;
1273 	if (kring->nr_hwcur == kring->nr_hwtail) {
1274 		if (netmap_verbose)
1275 			nm_prlim(1, "interrupt with no packets on %s",
1276 				kring->name);
1277 		goto put_out;
1278 	}
1279 
1280 	/* new packets are kring->rcur to kring->nr_hwtail, and the bkring
1281 	 * had hwcur == bkring->rhead. So advance bkring->rhead to kring->nr_hwtail
1282 	 * to push all packets out.
1283 	 */
1284 	bkring->rhead = bkring->rcur = kring->nr_hwtail;
1285 
1286 	bkring->nm_sync(bkring, flags);
1287 
1288 	/* mark all buffers as released on this ring */
1289 	kring->rhead = kring->rcur = kring->rtail = kring->nr_hwtail;
1290 	/* another call to actually release the buffers */
1291 	error = kring->nm_sync(kring, 0);
1292 
1293 	/* The second rxsync may have further advanced hwtail. If this happens,
1294 	 *  return NM_IRQ_RESCHED, otherwise just return NM_IRQ_COMPLETED. */
1295 	if (kring->rcur != kring->nr_hwtail) {
1296 		ret = NM_IRQ_RESCHED;
1297 	}
1298 put_out:
1299 	nm_kr_put(kring);
1300 
1301 	return error ? error : ret;
1302 }
1303 
1304 
1305 /* nm_register callback for bwrap */
1306 int
1307 netmap_bwrap_reg(struct netmap_adapter *na, int onoff)
1308 {
1309 	struct netmap_bwrap_adapter *bna =
1310 		(struct netmap_bwrap_adapter *)na;
1311 	struct netmap_adapter *hwna = bna->hwna;
1312 	struct netmap_vp_adapter *hostna = &bna->host;
1313 	int error, i;
1314 	enum txrx t;
1315 
1316 	nm_prdis("%s %s", na->name, onoff ? "on" : "off");
1317 
1318 	if (onoff) {
1319 		/* netmap_do_regif has been called on the bwrap na.
1320 		 * We need to pass the information about the
1321 		 * memory allocator down to the hwna before
1322 		 * putting it in netmap mode
1323 		 */
1324 		hwna->na_lut = na->na_lut;
1325 
1326 		if (hostna->na_bdg) {
1327 			/* if the host rings have been attached to switch,
1328 			 * we need to copy the memory allocator information
1329 			 * in the hostna also
1330 			 */
1331 			hostna->up.na_lut = na->na_lut;
1332 		}
1333 
1334 	}
1335 
1336 	/* pass down the pending ring state information */
1337 	for_rx_tx(t) {
1338 		for (i = 0; i < netmap_all_rings(na, t); i++) {
1339 			NMR(hwna, nm_txrx_swap(t))[i]->nr_pending_mode =
1340 				NMR(na, t)[i]->nr_pending_mode;
1341 		}
1342 	}
1343 
1344 	/* forward the request to the hwna */
1345 	error = hwna->nm_register(hwna, onoff);
1346 	if (error)
1347 		return error;
1348 
1349 	/* copy up the current ring state information */
1350 	for_rx_tx(t) {
1351 		for (i = 0; i < netmap_all_rings(na, t); i++) {
1352 			struct netmap_kring *kring = NMR(hwna, nm_txrx_swap(t))[i];
1353 			NMR(na, t)[i]->nr_mode = kring->nr_mode;
1354 		}
1355 	}
1356 
1357 	/* impersonate a netmap_vp_adapter */
1358 	netmap_vp_reg(na, onoff);
1359 	if (hostna->na_bdg)
1360 		netmap_vp_reg(&hostna->up, onoff);
1361 
1362 	if (onoff) {
1363 		u_int i;
1364 		/* intercept the hwna nm_notify callback on the hw rings */
1365 		for (i = 0; i < hwna->num_rx_rings; i++) {
1366 			hwna->rx_rings[i]->save_notify = hwna->rx_rings[i]->nm_notify;
1367 			hwna->rx_rings[i]->nm_notify = bna->nm_intr_notify;
1368 		}
1369 		i = hwna->num_rx_rings; /* for safety */
1370 		/* save the host ring notify unconditionally */
1371 		for (; i < netmap_real_rings(hwna, NR_RX); i++) {
1372 			hwna->rx_rings[i]->save_notify =
1373 				hwna->rx_rings[i]->nm_notify;
1374 			if (hostna->na_bdg) {
1375 				/* also intercept the host ring notify */
1376 				hwna->rx_rings[i]->nm_notify =
1377 					netmap_bwrap_intr_notify;
1378 				na->tx_rings[i]->nm_sync = na->nm_txsync;
1379 			}
1380 		}
1381 		if (na->active_fds == 0)
1382 			na->na_flags |= NAF_NETMAP_ON;
1383 	} else {
1384 		u_int i;
1385 
1386 		if (na->active_fds == 0)
1387 			na->na_flags &= ~NAF_NETMAP_ON;
1388 
1389 		/* reset all notify callbacks (including host ring) */
1390 		for (i = 0; i < netmap_all_rings(hwna, NR_RX); i++) {
1391 			hwna->rx_rings[i]->nm_notify =
1392 				hwna->rx_rings[i]->save_notify;
1393 			hwna->rx_rings[i]->save_notify = NULL;
1394 		}
1395 		hwna->na_lut.lut = NULL;
1396 		hwna->na_lut.plut = NULL;
1397 		hwna->na_lut.objtotal = 0;
1398 		hwna->na_lut.objsize = 0;
1399 
1400 		/* reset the number of host rings to default */
1401 		for_rx_tx(t) {
1402 			nma_set_host_nrings(hwna, t, 1);
1403 		}
1404 
1405 	}
1406 
1407 	return 0;
1408 }
1409 
1410 /* nm_config callback for bwrap */
1411 static int
1412 netmap_bwrap_config(struct netmap_adapter *na, struct nm_config_info *info)
1413 {
1414 	struct netmap_bwrap_adapter *bna =
1415 		(struct netmap_bwrap_adapter *)na;
1416 	struct netmap_adapter *hwna = bna->hwna;
1417 	int error;
1418 
1419 	/* cache the lut in the embedded host adapter */
1420 	error = netmap_mem_get_lut(hwna->nm_mem, &bna->host.up.na_lut);
1421 	if (error)
1422 		return error;
1423 
1424 	/* Forward the request to the hwna. It may happen that nobody
1425 	 * registered hwna yet, so netmap_mem_get_lut() may have not
1426 	 * been called yet. */
1427 	error = netmap_mem_get_lut(hwna->nm_mem, &hwna->na_lut);
1428 	if (error)
1429 		return error;
1430 	netmap_update_config(hwna);
1431 	/* swap the results and propagate */
1432 	info->num_tx_rings = hwna->num_rx_rings;
1433 	info->num_tx_descs = hwna->num_rx_desc;
1434 	info->num_rx_rings = hwna->num_tx_rings;
1435 	info->num_rx_descs = hwna->num_tx_desc;
1436 	info->rx_buf_maxsize = hwna->rx_buf_maxsize;
1437 
1438 	if (na->na_flags & NAF_HOST_RINGS) {
1439 		struct netmap_adapter *hostna = &bna->host.up;
1440 		enum txrx t;
1441 
1442 		/* limit the number of host rings to that of hw */
1443 		if (na->na_flags & NAF_HOST_ALL) {
1444 			hostna->num_tx_rings = nma_get_nrings(hwna, NR_RX);
1445 			hostna->num_rx_rings = nma_get_nrings(hwna, NR_TX);
1446 		} else {
1447 			nm_bound_var(&hostna->num_tx_rings, 1, 1,
1448 				nma_get_nrings(hwna, NR_TX), NULL);
1449 			nm_bound_var(&hostna->num_rx_rings, 1, 1,
1450 				nma_get_nrings(hwna, NR_RX), NULL);
1451 		}
1452 		for_rx_tx(t) {
1453 			enum txrx r = nm_txrx_swap(t);
1454 			u_int nr = nma_get_nrings(hostna, t);
1455 
1456 			nma_set_host_nrings(na, t, nr);
1457 			if (nma_get_host_nrings(hwna, t) < nr) {
1458 				nma_set_host_nrings(hwna, t, nr);
1459 			}
1460 			nma_set_ndesc(hostna, t, nma_get_ndesc(hwna, r));
1461 		}
1462 	}
1463 
1464 	return 0;
1465 }
1466 
1467 /* nm_bufcfg callback for bwrap */
1468 static int
1469 netmap_bwrap_bufcfg(struct netmap_kring *kring, uint64_t target)
1470 {
1471 	struct netmap_adapter *na = kring->na;
1472 	struct netmap_bwrap_adapter *bna =
1473 		(struct netmap_bwrap_adapter *)na;
1474 	struct netmap_adapter *hwna = bna->hwna;
1475 	struct netmap_kring *hwkring;
1476 	enum txrx r;
1477 	int error;
1478 
1479 	/* we need the hw kring that corresponds to the bwrap one:
1480 	 * remember that rx and tx are swapped
1481 	 */
1482 	r = nm_txrx_swap(kring->tx);
1483 	hwkring = NMR(hwna, r)[kring->ring_id];
1484 
1485 	/* copy down the offset information, forward the request
1486 	 * and copy up the results
1487 	 */
1488 	hwkring->offset_mask = kring->offset_mask;
1489 	hwkring->offset_max  = kring->offset_max;
1490 	hwkring->offset_gap  = kring->offset_gap;
1491 
1492 	error = hwkring->nm_bufcfg(hwkring, target);
1493 	if (error)
1494 		return error;
1495 
1496 	kring->hwbuf_len = hwkring->hwbuf_len;
1497 	kring->buf_align = hwkring->buf_align;
1498 
1499 	return 0;
1500 }
1501 
1502 /* nm_krings_create callback for bwrap */
1503 int
1504 netmap_bwrap_krings_create_common(struct netmap_adapter *na)
1505 {
1506 	struct netmap_bwrap_adapter *bna =
1507 		(struct netmap_bwrap_adapter *)na;
1508 	struct netmap_adapter *hwna = bna->hwna;
1509 	struct netmap_adapter *hostna = &bna->host.up;
1510 	int i, error = 0;
1511 	enum txrx t;
1512 
1513 	/* also create the hwna krings */
1514 	error = hwna->nm_krings_create(hwna);
1515 	if (error) {
1516 		return error;
1517 	}
1518 
1519 	/* increment the usage counter for all the hwna krings */
1520 	for_rx_tx(t) {
1521 		for (i = 0; i < netmap_all_rings(hwna, t); i++) {
1522 			NMR(hwna, t)[i]->users++;
1523 			/* this is to prevent deletion of the rings through
1524 			 * our krings, instead of through the hwna ones */
1525 			NMR(na, t)[i]->nr_kflags |= NKR_NEEDRING;
1526 		}
1527 	}
1528 
1529 	/* now create the actual rings */
1530 	error = netmap_mem_rings_create(hwna);
1531 	if (error) {
1532 		goto err_dec_users;
1533 	}
1534 
1535 	/* cross-link the netmap rings
1536 	 * The original number of rings comes from hwna,
1537 	 * rx rings on one side equals tx rings on the other.
1538 	 */
1539 	for_rx_tx(t) {
1540 		enum txrx r = nm_txrx_swap(t); /* swap NR_TX <-> NR_RX */
1541 		for (i = 0; i < netmap_all_rings(hwna, r); i++) {
1542 			NMR(na, t)[i]->nkr_num_slots = NMR(hwna, r)[i]->nkr_num_slots;
1543 			NMR(na, t)[i]->ring = NMR(hwna, r)[i]->ring;
1544 		}
1545 	}
1546 
1547 	if (na->na_flags & NAF_HOST_RINGS) {
1548 		/* the hostna rings are the host rings of the bwrap.
1549 		 * The corresponding krings must point back to the
1550 		 * hostna
1551 		 */
1552 		hostna->tx_rings = &na->tx_rings[na->num_tx_rings];
1553 		hostna->rx_rings = &na->rx_rings[na->num_rx_rings];
1554 		for_rx_tx(t) {
1555 			for (i = 0; i < nma_get_nrings(hostna, t); i++) {
1556 				NMR(hostna, t)[i]->na = hostna;
1557 			}
1558 		}
1559 	}
1560 
1561 	return 0;
1562 
1563 err_dec_users:
1564 	for_rx_tx(t) {
1565 		for (i = 0; i < netmap_all_rings(hwna, t); i++) {
1566 			NMR(hwna, t)[i]->users--;
1567 			NMR(na, t)[i]->users--;
1568 		}
1569 	}
1570 	hwna->nm_krings_delete(hwna);
1571 	return error;
1572 }
1573 
1574 
1575 void
1576 netmap_bwrap_krings_delete_common(struct netmap_adapter *na)
1577 {
1578 	struct netmap_bwrap_adapter *bna =
1579 		(struct netmap_bwrap_adapter *)na;
1580 	struct netmap_adapter *hwna = bna->hwna;
1581 	enum txrx t;
1582 	int i;
1583 
1584 	nm_prdis("%s", na->name);
1585 
1586 	/* decrement the usage counter for all the hwna krings */
1587 	for_rx_tx(t) {
1588 		for (i = 0; i < netmap_all_rings(hwna, t); i++) {
1589 			NMR(hwna, t)[i]->users--;
1590 			NMR(na, t)[i]->users--;
1591 		}
1592 	}
1593 
1594 	/* delete any netmap rings that are no longer needed */
1595 	netmap_mem_rings_delete(hwna);
1596 	hwna->nm_krings_delete(hwna);
1597 }
1598 
1599 
1600 /* notify method for the bridge-->hwna direction */
1601 int
1602 netmap_bwrap_notify(struct netmap_kring *kring, int flags)
1603 {
1604 	struct netmap_adapter *na = kring->na;
1605 	struct netmap_bwrap_adapter *bna = na->na_private;
1606 	struct netmap_adapter *hwna = bna->hwna;
1607 	u_int ring_n = kring->ring_id;
1608 	u_int lim = kring->nkr_num_slots - 1;
1609 	struct netmap_kring *hw_kring;
1610 	int error;
1611 
1612 	nm_prdis("%s: na %s hwna %s",
1613 			(kring ? kring->name : "NULL!"),
1614 			(na ? na->name : "NULL!"),
1615 			(hwna ? hwna->name : "NULL!"));
1616 	hw_kring = hwna->tx_rings[ring_n];
1617 
1618 	if (nm_kr_tryget(hw_kring, 0, NULL)) {
1619 		return ENXIO;
1620 	}
1621 
1622 	/* first step: simulate a user wakeup on the rx ring */
1623 	netmap_vp_rxsync(kring, flags);
1624 	nm_prdis("%s[%d] PRE rx(c%3d t%3d l%3d) ring(h%3d c%3d t%3d) tx(c%3d ht%3d t%3d)",
1625 		na->name, ring_n,
1626 		kring->nr_hwcur, kring->nr_hwtail, kring->nkr_hwlease,
1627 		kring->rhead, kring->rcur, kring->rtail,
1628 		hw_kring->nr_hwcur, hw_kring->nr_hwtail, hw_kring->rtail);
1629 	/* second step: the new packets are sent on the tx ring
1630 	 * (which is actually the same ring)
1631 	 */
1632 	hw_kring->rhead = hw_kring->rcur = kring->nr_hwtail;
1633 	error = hw_kring->nm_sync(hw_kring, flags);
1634 	if (error)
1635 		goto put_out;
1636 
1637 	/* third step: now we are back on the rx ring */
1638 	/* claim ownership on all hw owned bufs */
1639 	kring->rhead = kring->rcur = nm_next(hw_kring->nr_hwtail, lim); /* skip past reserved slot */
1640 
1641 	/* fourth step: the user goes to sleep again, causing another rxsync */
1642 	netmap_vp_rxsync(kring, flags);
1643 	nm_prdis("%s[%d] PST rx(c%3d t%3d l%3d) ring(h%3d c%3d t%3d) tx(c%3d ht%3d t%3d)",
1644 		na->name, ring_n,
1645 		kring->nr_hwcur, kring->nr_hwtail, kring->nkr_hwlease,
1646 		kring->rhead, kring->rcur, kring->rtail,
1647 		hw_kring->nr_hwcur, hw_kring->nr_hwtail, hw_kring->rtail);
1648 put_out:
1649 	nm_kr_put(hw_kring);
1650 
1651 	return error ? error : NM_IRQ_COMPLETED;
1652 }
1653 
1654 
1655 /* nm_bdg_ctl callback for the bwrap.
1656  * Called on bridge-attach and detach, as an effect of valectl -[ahd].
1657  * On attach, it needs to provide a fake netmap_priv_d structure and
1658  * perform a netmap_do_regif() on the bwrap. This will put both the
1659  * bwrap and the hwna in netmap mode, with the netmap rings shared
1660  * and cross-linked. Moreover, it will start intercepting interrupts
1661  * directed to hwna.
1662  */
1663 static int
1664 netmap_bwrap_bdg_ctl(struct nmreq_header *hdr, struct netmap_adapter *na)
1665 {
1666 	struct netmap_priv_d *npriv;
1667 	struct netmap_bwrap_adapter *bna = (struct netmap_bwrap_adapter*)na;
1668 	int error = 0;
1669 
1670 	if (hdr->nr_reqtype == NETMAP_REQ_VALE_ATTACH) {
1671 		struct nmreq_vale_attach *req =
1672 			(struct nmreq_vale_attach *)(uintptr_t)hdr->nr_body;
1673 		if (req->reg.nr_ringid != 0 ||
1674 			(req->reg.nr_mode != NR_REG_ALL_NIC &&
1675 				req->reg.nr_mode != NR_REG_NIC_SW)) {
1676 			/* We only support attaching all the NIC rings
1677 			 * and/or the host stack. */
1678 			return EINVAL;
1679 		}
1680 		if (NETMAP_OWNED_BY_ANY(na)) {
1681 			return EBUSY;
1682 		}
1683 		if (bna->na_kpriv) {
1684 			/* nothing to do */
1685 			return 0;
1686 		}
1687 		npriv = netmap_priv_new();
1688 		if (npriv == NULL)
1689 			return ENOMEM;
1690 		npriv->np_ifp = na->ifp; /* let the priv destructor release the ref */
1691 		error = netmap_do_regif(npriv, na, hdr);
1692 		if (error) {
1693 			netmap_priv_delete(npriv);
1694 			netmap_mem_restore(bna->hwna);
1695 			return error;
1696 		}
1697 		bna->na_kpriv = npriv;
1698 		na->na_flags |= NAF_BUSY;
1699 	} else {
1700 		if (na->active_fds == 0) /* not registered */
1701 			return EINVAL;
1702 		netmap_priv_delete(bna->na_kpriv);
1703 		bna->na_kpriv = NULL;
1704 		na->na_flags &= ~NAF_BUSY;
1705 		netmap_mem_restore(bna->hwna);
1706 	}
1707 
1708 	return error;
1709 }
1710 
1711 /* attach a bridge wrapper to the 'real' device */
1712 int
1713 netmap_bwrap_attach_common(struct netmap_adapter *na,
1714 		struct netmap_adapter *hwna)
1715 {
1716 	struct netmap_bwrap_adapter *bna;
1717 	struct netmap_adapter *hostna = NULL;
1718 	int error = 0;
1719 	enum txrx t;
1720 
1721 	/* make sure the NIC is not already in use */
1722 	if (NETMAP_OWNED_BY_ANY(hwna)) {
1723 		nm_prerr("NIC %s busy, cannot attach to bridge", hwna->name);
1724 		return EBUSY;
1725 	}
1726 
1727 	bna = (struct netmap_bwrap_adapter *)na;
1728 	/* make bwrap ifp point to the real ifp */
1729 	na->ifp = hwna->ifp;
1730 	if_ref(na->ifp);
1731 	na->na_private = bna;
1732 	/* fill the ring data for the bwrap adapter with rx/tx meanings
1733 	 * swapped. The real cross-linking will be done during register,
1734 	 * when all the krings will have been created.
1735 	 */
1736 	for_rx_tx(t) {
1737 		enum txrx r = nm_txrx_swap(t); /* swap NR_TX <-> NR_RX */
1738 		nma_set_nrings(na, t, nma_get_nrings(hwna, r));
1739 		nma_set_ndesc(na, t, nma_get_ndesc(hwna, r));
1740 	}
1741 	na->nm_dtor = netmap_bwrap_dtor;
1742 	na->nm_config = netmap_bwrap_config;
1743 	na->nm_bufcfg = netmap_bwrap_bufcfg;
1744 	na->nm_bdg_ctl = netmap_bwrap_bdg_ctl;
1745 	na->pdev = hwna->pdev;
1746 	na->nm_mem = netmap_mem_get(hwna->nm_mem);
1747 	na->virt_hdr_len = hwna->virt_hdr_len;
1748 	na->rx_buf_maxsize = hwna->rx_buf_maxsize;
1749 
1750 	bna->hwna = hwna;
1751 	netmap_adapter_get(hwna);
1752 	hwna->na_private = bna; /* weak reference */
1753 	bna->saved_na_vp = hwna->na_vp;
1754 	hwna->na_vp = &bna->up;
1755 	bna->up.up.na_vp = &(bna->up);
1756 
1757 	if (hwna->na_flags & NAF_HOST_RINGS) {
1758 		if (hwna->na_flags & NAF_SW_ONLY)
1759 			na->na_flags |= NAF_SW_ONLY;
1760 		na->na_flags |= NAF_HOST_RINGS;
1761 		hostna = &bna->host.up;
1762 
1763 		snprintf(hostna->name, sizeof(hostna->name), "%s^", na->name);
1764 		hostna->ifp = hwna->ifp;
1765 		// hostna->nm_txsync = netmap_bwrap_host_txsync;
1766 		// hostna->nm_rxsync = netmap_bwrap_host_rxsync;
1767 		hostna->nm_mem = netmap_mem_get(na->nm_mem);
1768 		hostna->na_private = bna;
1769 		hostna->na_vp = &bna->up;
1770 		na->na_hostvp = hwna->na_hostvp =
1771 			hostna->na_hostvp = &bna->host;
1772 		hostna->na_flags = NAF_BUSY; /* prevent NIOCREGIF */
1773 		hostna->rx_buf_maxsize = hwna->rx_buf_maxsize;
1774 		/* bwrap_config() will determine the number of host rings */
1775 	}
1776 	if (hwna->na_flags & NAF_MOREFRAG)
1777 		na->na_flags |= NAF_MOREFRAG;
1778 
1779 	nm_prdis("%s<->%s txr %d txd %d rxr %d rxd %d",
1780 		na->name, hwna->name,
1781 		na->num_tx_rings, na->num_tx_desc,
1782 		na->num_rx_rings, na->num_rx_desc);
1783 
1784 	error = netmap_attach_common(na);
1785 	if (error) {
1786 		goto err_put;
1787 	}
1788 	hwna->na_flags |= NAF_BUSY;
1789 	return 0;
1790 
1791 err_put:
1792 	hwna->na_vp = hwna->na_hostvp = NULL;
1793 	netmap_adapter_put(hwna);
1794 	return error;
1795 
1796 }
1797 
1798 struct nm_bridge *
1799 netmap_init_bridges2(u_int n)
1800 {
1801 	int i;
1802 	struct nm_bridge *b;
1803 
1804 	b = nm_os_malloc(sizeof(struct nm_bridge) * n);
1805 	if (b == NULL)
1806 		return NULL;
1807 	for (i = 0; i < n; i++)
1808 		BDG_RWINIT(&b[i]);
1809 	return b;
1810 }
1811 
1812 void
1813 netmap_uninit_bridges2(struct nm_bridge *b, u_int n)
1814 {
1815 	int i;
1816 
1817 	if (b == NULL)
1818 		return;
1819 
1820 	for (i = 0; i < n; i++)
1821 		BDG_RWDESTROY(&b[i]);
1822 	nm_os_free(b);
1823 }
1824 
1825 int
1826 netmap_init_bridges(void)
1827 {
1828 #ifdef CONFIG_NET_NS
1829 	return netmap_bns_register();
1830 #else
1831 	nm_bridges = netmap_init_bridges2(vale_max_bridges);
1832 	if (nm_bridges == NULL)
1833 		return ENOMEM;
1834 	return 0;
1835 #endif
1836 }
1837 
1838 void
1839 netmap_uninit_bridges(void)
1840 {
1841 #ifdef CONFIG_NET_NS
1842 	netmap_bns_unregister();
1843 #else
1844 	netmap_uninit_bridges2(nm_bridges, vale_max_bridges);
1845 #endif
1846 }
1847