xref: /freebsd/sys/dev/netmap/netmap_vale.c (revision 63cbe8d1d95f97e93929ec66f1138693d08dd9f6)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (C) 2013-2016 Universita` di Pisa
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *   1. Redistributions of source code must retain the above copyright
11  *      notice, this list of conditions and the following disclaimer.
12  *   2. Redistributions in binary form must reproduce the above copyright
13  *      notice, this list of conditions and the following disclaimer in the
14  *      documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26  * SUCH DAMAGE.
27  */
28 
29 
30 #if defined(__FreeBSD__)
31 #include <sys/cdefs.h> /* prerequisite */
32 __FBSDID("$FreeBSD$");
33 
34 #include <sys/types.h>
35 #include <sys/errno.h>
36 #include <sys/param.h>	/* defines used in kernel.h */
37 #include <sys/kernel.h>	/* types used in module initialization */
38 #include <sys/conf.h>	/* cdevsw struct, UID, GID */
39 #include <sys/sockio.h>
40 #include <sys/socketvar.h>	/* struct socket */
41 #include <sys/malloc.h>
42 #include <sys/poll.h>
43 #include <sys/rwlock.h>
44 #include <sys/socket.h> /* sockaddrs */
45 #include <sys/selinfo.h>
46 #include <sys/sysctl.h>
47 #include <net/if.h>
48 #include <net/if_var.h>
49 #include <net/bpf.h>		/* BIOCIMMEDIATE */
50 #include <machine/bus.h>	/* bus_dmamap_* */
51 #include <sys/endian.h>
52 #include <sys/refcount.h>
53 #include <sys/smp.h>
54 
55 
56 #elif defined(linux)
57 
58 #include "bsd_glue.h"
59 
60 #elif defined(__APPLE__)
61 
62 #warning OSX support is only partial
63 #include "osx_glue.h"
64 
65 #elif defined(_WIN32)
66 #include "win_glue.h"
67 
68 #else
69 
70 #error	Unsupported platform
71 
72 #endif /* unsupported */
73 
74 /*
75  * common headers
76  */
77 
78 #include <net/netmap.h>
79 #include <dev/netmap/netmap_kern.h>
80 #include <dev/netmap/netmap_mem2.h>
81 #include <dev/netmap/netmap_bdg.h>
82 
83 #ifdef WITH_VALE
84 
85 /*
86  * system parameters (most of them in netmap_kern.h)
87  * NM_BDG_NAME	prefix for switch port names, default "vale"
88  * NM_BDG_MAXPORTS	number of ports
89  * NM_BRIDGES	max number of switches in the system.
90  *	XXX should become a sysctl or tunable
91  *
92  * Switch ports are named valeX:Y where X is the switch name and Y
93  * is the port. If Y matches a physical interface name, the port is
94  * connected to a physical device.
95  *
96  * Unlike physical interfaces, switch ports use their own memory region
97  * for rings and buffers.
98  * The virtual interfaces use per-queue lock instead of core lock.
99  * In the tx loop, we aggregate traffic in batches to make all operations
100  * faster. The batch size is bridge_batch.
101  */
/* Upper bound on the rings of a port. nm_bdg_flush() extracts ring
 * indexes with a (NM_BDG_MAXRINGS - 1) mask, so keep it a power of two.
 * XXX unclear how many. */
#define NM_BDG_MAXRINGS		16
/* XXX same as above */
#define NM_BDG_MAXSLOTS		4096
/* in the device */
#define NM_BRIDGE_RINGSIZE	1024
/* entries in the forwarding buffer */
#define NM_BDG_BATCH		1024
/* actual size of the tables: one batch plus room for the fragments
 * of the last packet, which may overflow the batch */
#define NM_BDG_BATCH_MAX	(NM_BDG_BATCH + NETMAP_MAX_FRAGS)
/* NM_FT_NULL terminates a list of slots in the ft */
#define NM_FT_NULL		NM_BDG_BATCH_MAX
110 
111 
112 /*
113  * bridge_batch is set via sysctl to the max batch size to be
114  * used in the bridge. The actual value may be larger as the
115  * last packet in the block may overflow the size.
116  */
117 static int bridge_batch = NM_BDG_BATCH; /* bridge batch size */
118 SYSBEGIN(vars_vale);
119 SYSCTL_DECL(_dev_netmap);
120 SYSCTL_INT(_dev_netmap, OID_AUTO, bridge_batch, CTLFLAG_RW, &bridge_batch, 0,
121 		"Max batch size to be used in the bridge");
122 SYSEND;
123 
/* Forward declarations (some are needed by vale_bdg_ops below). */
static int netmap_vp_create(struct nmreq_header *hdr, struct ifnet *ifp,
		struct netmap_mem_d *nmd, struct netmap_vp_adapter **ret);
static int netmap_vp_bdg_attach(const char *name,
		struct netmap_adapter *na, struct nm_bridge *b);
static int netmap_vale_bwrap_attach(const char *nr_name,
		struct netmap_adapter *hwna);
129 
/*
 * Per-destination packet queue: one nm_bdg_q is kept for each output
 * port/ring. The queue is a list of indexes into the forwarding
 * table, terminated by NM_FT_NULL. bq_len is the number of output
 * buffers (we can have coalescing during the copy).
 */
struct nm_bdg_q {
	uint16_t bq_head;	/* first entry of the list, NM_FT_NULL if empty */
	uint16_t bq_tail;	/* last entry of the list, NM_FT_NULL if empty */
	uint32_t bq_len;	/* number of buffers */
};
140 
141 /* Holds the default callbacks */
142 struct netmap_bdg_ops vale_bdg_ops = {
143 	.lookup = netmap_bdg_learning,
144 	.config = NULL,
145 	.dtor = NULL,
146 	.vp_create = netmap_vp_create,
147 	.bwrap_attach = netmap_vale_bwrap_attach,
148 	.name = NM_BDG_NAME,
149 };
150 
/*
 * Slightly optimized copy routine which moves data in 64-byte chunks,
 * and is often faster than dealing with other odd sizes.
 * The buffers must not overlap.
 *
 * _src: source buffer
 * _dst: destination buffer
 * l:    number of bytes to copy; nothing is copied when l <= 0
 *
 * The unrolled loop is only used for lengths that are a multiple of
 * 64 bytes. Large lengths and lengths that are not a multiple of 64
 * are handed to memcpy(), so no byte past 'l' is read or written.
 */
static inline void
pkt_copy(const void *_src, void *_dst, int l)
{
	const uint64_t *src = _src;
	uint64_t *dst = _dst;

	if (unlikely(l <= 0))
		return;
	if (unlikely(l >= 1024 || (l % 64) != 0)) {
		memcpy(dst, src, l);
		return;
	}
	/* here l is a positive multiple of 64: copy 8 words per round */
	for (; likely(l > 0); l -= 64) {
		*dst++ = *src++;
		*dst++ = *src++;
		*dst++ = *src++;
		*dst++ = *src++;
		*dst++ = *src++;
		*dst++ = *src++;
		*dst++ = *src++;
		*dst++ = *src++;
	}
}
179 
180 
181 /*
182  * Free the forwarding tables for rings attached to switch ports.
183  */
184 static void
185 nm_free_bdgfwd(struct netmap_adapter *na)
186 {
187 	int nrings, i;
188 	struct netmap_kring **kring;
189 
190 	NMG_LOCK_ASSERT();
191 	nrings = na->num_tx_rings;
192 	kring = na->tx_rings;
193 	for (i = 0; i < nrings; i++) {
194 		if (kring[i]->nkr_ft) {
195 			nm_os_free(kring[i]->nkr_ft);
196 			kring[i]->nkr_ft = NULL; /* protect from freeing twice */
197 		}
198 	}
199 }
200 
201 
202 /*
203  * Allocate the forwarding tables for the rings attached to the bridge ports.
204  */
205 static int
206 nm_alloc_bdgfwd(struct netmap_adapter *na)
207 {
208 	int nrings, l, i, num_dstq;
209 	struct netmap_kring **kring;
210 
211 	NMG_LOCK_ASSERT();
212 	/* all port:rings + broadcast */
213 	num_dstq = NM_BDG_MAXPORTS * NM_BDG_MAXRINGS + 1;
214 	l = sizeof(struct nm_bdg_fwd) * NM_BDG_BATCH_MAX;
215 	l += sizeof(struct nm_bdg_q) * num_dstq;
216 	l += sizeof(uint16_t) * NM_BDG_BATCH_MAX;
217 
218 	nrings = netmap_real_rings(na, NR_TX);
219 	kring = na->tx_rings;
220 	for (i = 0; i < nrings; i++) {
221 		struct nm_bdg_fwd *ft;
222 		struct nm_bdg_q *dstq;
223 		int j;
224 
225 		ft = nm_os_malloc(l);
226 		if (!ft) {
227 			nm_free_bdgfwd(na);
228 			return ENOMEM;
229 		}
230 		dstq = (struct nm_bdg_q *)(ft + NM_BDG_BATCH_MAX);
231 		for (j = 0; j < num_dstq; j++) {
232 			dstq[j].bq_head = dstq[j].bq_tail = NM_FT_NULL;
233 			dstq[j].bq_len = 0;
234 		}
235 		kring[i]->nkr_ft = ft;
236 	}
237 	return 0;
238 }
239 
240 /* Allows external modules to create bridges in exclusive mode,
241  * returns an authentication token that the external module will need
242  * to provide during nm_bdg_ctl_{attach, detach}(), netmap_bdg_regops(),
243  * and nm_bdg_update_private_data() operations.
244  * Successfully executed if ret != NULL and *return_status == 0.
245  */
246 void *
247 netmap_vale_create(const char *bdg_name, int *return_status)
248 {
249 	struct nm_bridge *b = NULL;
250 	void *ret = NULL;
251 
252 	NMG_LOCK();
253 	b = nm_find_bridge(bdg_name, 0 /* don't create */, NULL);
254 	if (b) {
255 		*return_status = EEXIST;
256 		goto unlock_bdg_create;
257 	}
258 
259 	b = nm_find_bridge(bdg_name, 1 /* create */, &vale_bdg_ops);
260 	if (!b) {
261 		*return_status = ENOMEM;
262 		goto unlock_bdg_create;
263 	}
264 
265 	b->bdg_flags |= NM_BDG_ACTIVE | NM_BDG_EXCLUSIVE;
266 	ret = nm_bdg_get_auth_token(b);
267 	*return_status = 0;
268 
269 unlock_bdg_create:
270 	NMG_UNLOCK();
271 	return ret;
272 }
273 
274 /* Allows external modules to destroy a bridge created through
275  * netmap_bdg_create(), the bridge must be empty.
276  */
277 int
278 netmap_vale_destroy(const char *bdg_name, void *auth_token)
279 {
280 	struct nm_bridge *b = NULL;
281 	int ret = 0;
282 
283 	NMG_LOCK();
284 	b = nm_find_bridge(bdg_name, 0 /* don't create */, NULL);
285 	if (!b) {
286 		ret = ENXIO;
287 		goto unlock_bdg_free;
288 	}
289 
290 	if (!nm_bdg_valid_auth_token(b, auth_token)) {
291 		ret = EACCES;
292 		goto unlock_bdg_free;
293 	}
294 	if (!(b->bdg_flags & NM_BDG_EXCLUSIVE)) {
295 		ret = EINVAL;
296 		goto unlock_bdg_free;
297 	}
298 
299 	b->bdg_flags &= ~(NM_BDG_EXCLUSIVE | NM_BDG_ACTIVE);
300 	ret = netmap_bdg_free(b);
301 	if (ret) {
302 		b->bdg_flags |= NM_BDG_EXCLUSIVE | NM_BDG_ACTIVE;
303 	}
304 
305 unlock_bdg_free:
306 	NMG_UNLOCK();
307 	return ret;
308 }
309 
310 
311 
312 /* nm_dtor callback for ephemeral VALE ports */
313 static void
314 netmap_vp_dtor(struct netmap_adapter *na)
315 {
316 	struct netmap_vp_adapter *vpna = (struct netmap_vp_adapter*)na;
317 	struct nm_bridge *b = vpna->na_bdg;
318 
319 	ND("%s has %d references", na->name, na->na_refcount);
320 
321 	if (b) {
322 		netmap_bdg_detach_common(b, vpna->bdg_port, -1);
323 	}
324 
325 	if (na->ifp != NULL && !nm_iszombie(na)) {
326 		NM_DETACH_NA(na->ifp);
327 		if (vpna->autodelete) {
328 			ND("releasing %s", na->ifp->if_xname);
329 			NMG_UNLOCK();
330 			nm_os_vi_detach(na->ifp);
331 			NMG_LOCK();
332 		}
333 	}
334 }
335 
336 
337 /* Called by external kernel modules (e.g., Openvswitch).
338  * to modify the private data previously given to regops().
339  * 'name' may be just bridge's name (including ':' if it
340  * is not just NM_BDG_NAME).
341  * Called without NMG_LOCK.
342  */
343 int
344 nm_bdg_update_private_data(const char *name, bdg_update_private_data_fn_t callback,
345 	void *callback_data, void *auth_token)
346 {
347 	void *private_data = NULL;
348 	struct nm_bridge *b;
349 	int error = 0;
350 
351 	NMG_LOCK();
352 	b = nm_find_bridge(name, 0 /* don't create */, NULL);
353 	if (!b) {
354 		error = EINVAL;
355 		goto unlock_update_priv;
356 	}
357 	if (!nm_bdg_valid_auth_token(b, auth_token)) {
358 		error = EACCES;
359 		goto unlock_update_priv;
360 	}
361 	BDG_WLOCK(b);
362 	private_data = callback(b->private_data, callback_data, &error);
363 	b->private_data = private_data;
364 	BDG_WUNLOCK(b);
365 
366 unlock_update_priv:
367 	NMG_UNLOCK();
368 	return error;
369 }
370 
371 
372 /* nm_krings_create callback for VALE ports.
373  * Calls the standard netmap_krings_create, then adds leases on rx
374  * rings and bdgfwd on tx rings.
375  */
376 static int
377 netmap_vp_krings_create(struct netmap_adapter *na)
378 {
379 	u_int tailroom;
380 	int error, i;
381 	uint32_t *leases;
382 	u_int nrx = netmap_real_rings(na, NR_RX);
383 
384 	/*
385 	 * Leases are attached to RX rings on vale ports
386 	 */
387 	tailroom = sizeof(uint32_t) * na->num_rx_desc * nrx;
388 
389 	error = netmap_krings_create(na, tailroom);
390 	if (error)
391 		return error;
392 
393 	leases = na->tailroom;
394 
395 	for (i = 0; i < nrx; i++) { /* Receive rings */
396 		na->rx_rings[i]->nkr_leases = leases;
397 		leases += na->num_rx_desc;
398 	}
399 
400 	error = nm_alloc_bdgfwd(na);
401 	if (error) {
402 		netmap_krings_delete(na);
403 		return error;
404 	}
405 
406 	return 0;
407 }
408 
409 
/* nm_krings_delete callback for VALE ports: releases the forwarding
 * tables of the tx rings first, then the krings themselves.
 */
static void
netmap_vp_krings_delete(struct netmap_adapter *na)
{
	nm_free_bdgfwd(na);
	netmap_krings_delete(na);
}
417 
418 
419 static int
420 nm_bdg_flush(struct nm_bdg_fwd *ft, u_int n,
421 	struct netmap_vp_adapter *na, u_int ring_nr);
422 
423 
424 /*
425  * main dispatch routine for the bridge.
426  * Grab packets from a kring, move them into the ft structure
427  * associated to the tx (input) port. Max one instance per port,
428  * filtered on input (ioctl, poll or XXX).
429  * Returns the next position in the ring.
430  */
431 static int
432 nm_bdg_preflush(struct netmap_kring *kring, u_int end)
433 {
434 	struct netmap_vp_adapter *na =
435 		(struct netmap_vp_adapter*)kring->na;
436 	struct netmap_ring *ring = kring->ring;
437 	struct nm_bdg_fwd *ft;
438 	u_int ring_nr = kring->ring_id;
439 	u_int j = kring->nr_hwcur, lim = kring->nkr_num_slots - 1;
440 	u_int ft_i = 0;	/* start from 0 */
441 	u_int frags = 1; /* how many frags ? */
442 	struct nm_bridge *b = na->na_bdg;
443 
444 	/* To protect against modifications to the bridge we acquire a
445 	 * shared lock, waiting if we can sleep (if the source port is
446 	 * attached to a user process) or with a trylock otherwise (NICs).
447 	 */
448 	ND("wait rlock for %d packets", ((j > end ? lim+1 : 0) + end) - j);
449 	if (na->up.na_flags & NAF_BDG_MAYSLEEP)
450 		BDG_RLOCK(b);
451 	else if (!BDG_RTRYLOCK(b))
452 		return j;
453 	ND(5, "rlock acquired for %d packets", ((j > end ? lim+1 : 0) + end) - j);
454 	ft = kring->nkr_ft;
455 
456 	for (; likely(j != end); j = nm_next(j, lim)) {
457 		struct netmap_slot *slot = &ring->slot[j];
458 		char *buf;
459 
460 		ft[ft_i].ft_len = slot->len;
461 		ft[ft_i].ft_flags = slot->flags;
462 		ft[ft_i].ft_offset = 0;
463 
464 		ND("flags is 0x%x", slot->flags);
465 		/* we do not use the buf changed flag, but we still need to reset it */
466 		slot->flags &= ~NS_BUF_CHANGED;
467 
468 		/* this slot goes into a list so initialize the link field */
469 		ft[ft_i].ft_next = NM_FT_NULL;
470 		buf = ft[ft_i].ft_buf = (slot->flags & NS_INDIRECT) ?
471 			(void *)(uintptr_t)slot->ptr : NMB(&na->up, slot);
472 		if (unlikely(buf == NULL)) {
473 			RD(5, "NULL %s buffer pointer from %s slot %d len %d",
474 				(slot->flags & NS_INDIRECT) ? "INDIRECT" : "DIRECT",
475 				kring->name, j, ft[ft_i].ft_len);
476 			buf = ft[ft_i].ft_buf = NETMAP_BUF_BASE(&na->up);
477 			ft[ft_i].ft_len = 0;
478 			ft[ft_i].ft_flags = 0;
479 		}
480 		__builtin_prefetch(buf);
481 		++ft_i;
482 		if (slot->flags & NS_MOREFRAG) {
483 			frags++;
484 			continue;
485 		}
486 		if (unlikely(netmap_verbose && frags > 1))
487 			RD(5, "%d frags at %d", frags, ft_i - frags);
488 		ft[ft_i - frags].ft_frags = frags;
489 		frags = 1;
490 		if (unlikely((int)ft_i >= bridge_batch))
491 			ft_i = nm_bdg_flush(ft, ft_i, na, ring_nr);
492 	}
493 	if (frags > 1) {
494 		/* Here ft_i > 0, ft[ft_i-1].flags has NS_MOREFRAG, and we
495 		 * have to fix frags count. */
496 		frags--;
497 		ft[ft_i - 1].ft_flags &= ~NS_MOREFRAG;
498 		ft[ft_i - frags].ft_frags = frags;
499 		D("Truncate incomplete fragment at %d (%d frags)", ft_i, frags);
500 	}
501 	if (ft_i)
502 		ft_i = nm_bdg_flush(ft, ft_i, na, ring_nr);
503 	BDG_RUNLOCK(b);
504 	return j;
505 }
506 
507 
508 /* ----- FreeBSD if_bridge hash function ------- */
509 
510 /*
511  * The following hash function is adapted from "Hash Functions" by Bob Jenkins
512  * ("Algorithm Alley", Dr. Dobbs Journal, September 1997).
513  *
514  * http://www.burtleburtle.net/bob/hash/spooky.html
515  */
#define mix(a, b, c)                                                    \
do {                                                                    \
	a -= b; a -= c; a ^= (c >> 13);                                 \
	b -= c; b -= a; b ^= (a << 8);                                  \
	c -= a; c -= b; c ^= (b >> 13);                                 \
	a -= b; a -= c; a ^= (c >> 12);                                 \
	b -= c; b -= a; b ^= (a << 16);                                 \
	c -= a; c -= b; c ^= (b >> 5);                                  \
	a -= b; a -= c; a ^= (c >> 3);                                  \
	b -= c; b -= a; b ^= (a << 10);                                 \
	c -= a; c -= b; c ^= (b >> 15);                                 \
} while (/*CONSTCOND*/0)


/*
 * Hash a 6-byte MAC address into an index of the forwarding table.
 *
 * addr: the six bytes of the address
 *
 * Returns the hash masked with (NM_BDG_HASH - 1).
 */
static __inline uint32_t
nm_bridge_rthash(const uint8_t *addr)
{
	uint32_t a = 0x9e3779b9, b = 0x9e3779b9, c = 0; // hash key

	/*
	 * Widen each byte to uint32_t before shifting: a uint8_t operand
	 * is promoted to (signed) int, and shifting a byte >= 0x80 left
	 * by 24 bits would overflow it, which is undefined behavior.
	 */
	b += (uint32_t)addr[5] << 8;
	b += addr[4];
	a += (uint32_t)addr[3] << 24;
	a += (uint32_t)addr[2] << 16;
	a += (uint32_t)addr[1] << 8;
	a += addr[0];

	mix(a, b, c);
#define BRIDGE_RTHASH_MASK	(NM_BDG_HASH-1)
	return (c & BRIDGE_RTHASH_MASK);
}

#undef mix
548 
549 
550 /*
551  * Lookup function for a learning bridge.
552  * Update the hash table with the source address,
553  * and then returns the destination port index, and the
554  * ring in *dst_ring (at the moment, always use ring 0)
555  */
556 uint32_t
557 netmap_bdg_learning(struct nm_bdg_fwd *ft, uint8_t *dst_ring,
558 		struct netmap_vp_adapter *na, void *private_data)
559 {
560 	uint8_t *buf = ((uint8_t *)ft->ft_buf) + ft->ft_offset;
561 	u_int buf_len = ft->ft_len - ft->ft_offset;
562 	struct nm_hash_ent *ht = private_data;
563 	uint32_t sh, dh;
564 	u_int dst, mysrc = na->bdg_port;
565 	uint64_t smac, dmac;
566 	uint8_t indbuf[12];
567 
568 	if (buf_len < 14) {
569 		return NM_BDG_NOPORT;
570 	}
571 
572 	if (ft->ft_flags & NS_INDIRECT) {
573 		if (copyin(buf, indbuf, sizeof(indbuf))) {
574 			return NM_BDG_NOPORT;
575 		}
576 		buf = indbuf;
577 	}
578 
579 	dmac = le64toh(*(uint64_t *)(buf)) & 0xffffffffffff;
580 	smac = le64toh(*(uint64_t *)(buf + 4));
581 	smac >>= 16;
582 
583 	/*
584 	 * The hash is somewhat expensive, there might be some
585 	 * worthwhile optimizations here.
586 	 */
587 	if (((buf[6] & 1) == 0) && (na->last_smac != smac)) { /* valid src */
588 		uint8_t *s = buf+6;
589 		sh = nm_bridge_rthash(s); /* hash of source */
590 		/* update source port forwarding entry */
591 		na->last_smac = ht[sh].mac = smac;	/* XXX expire ? */
592 		ht[sh].ports = mysrc;
593 		if (netmap_verbose)
594 		    D("src %02x:%02x:%02x:%02x:%02x:%02x on port %d",
595 			s[0], s[1], s[2], s[3], s[4], s[5], mysrc);
596 	}
597 	dst = NM_BDG_BROADCAST;
598 	if ((buf[0] & 1) == 0) { /* unicast */
599 		dh = nm_bridge_rthash(buf); /* hash of dst */
600 		if (ht[dh].mac == dmac) {	/* found dst */
601 			dst = ht[dh].ports;
602 		}
603 	}
604 	return dst;
605 }
606 
607 
608 /*
609  * Available space in the ring. Only used in VALE code
610  * and only with is_rx = 1
611  */
612 static inline uint32_t
613 nm_kr_space(struct netmap_kring *k, int is_rx)
614 {
615 	int space;
616 
617 	if (is_rx) {
618 		int busy = k->nkr_hwlease - k->nr_hwcur;
619 		if (busy < 0)
620 			busy += k->nkr_num_slots;
621 		space = k->nkr_num_slots - 1 - busy;
622 	} else {
623 		/* XXX never used in this branch */
624 		space = k->nr_hwtail - k->nkr_hwlease;
625 		if (space < 0)
626 			space += k->nkr_num_slots;
627 	}
628 #if 0
629 	// sanity check
630 	if (k->nkr_hwlease >= k->nkr_num_slots ||
631 		k->nr_hwcur >= k->nkr_num_slots ||
632 		k->nr_tail >= k->nkr_num_slots ||
633 		busy < 0 ||
634 		busy >= k->nkr_num_slots) {
635 		D("invalid kring, cur %d tail %d lease %d lease_idx %d lim %d",			k->nr_hwcur, k->nr_hwtail, k->nkr_hwlease,
636 			k->nkr_lease_idx, k->nkr_num_slots);
637 	}
638 #endif
639 	return space;
640 }
641 
642 
643 
644 
645 /* make a lease on the kring for N positions. return the
646  * lease index
647  * XXX only used in VALE code and with is_rx = 1
648  */
649 static inline uint32_t
650 nm_kr_lease(struct netmap_kring *k, u_int n, int is_rx)
651 {
652 	uint32_t lim = k->nkr_num_slots - 1;
653 	uint32_t lease_idx = k->nkr_lease_idx;
654 
655 	k->nkr_leases[lease_idx] = NR_NOSLOT;
656 	k->nkr_lease_idx = nm_next(lease_idx, lim);
657 
658 	if (n > nm_kr_space(k, is_rx)) {
659 		D("invalid request for %d slots", n);
660 		panic("x");
661 	}
662 	/* XXX verify that there are n slots */
663 	k->nkr_hwlease += n;
664 	if (k->nkr_hwlease > lim)
665 		k->nkr_hwlease -= lim + 1;
666 
667 	if (k->nkr_hwlease >= k->nkr_num_slots ||
668 		k->nr_hwcur >= k->nkr_num_slots ||
669 		k->nr_hwtail >= k->nkr_num_slots ||
670 		k->nkr_lease_idx >= k->nkr_num_slots) {
671 		D("invalid kring %s, cur %d tail %d lease %d lease_idx %d lim %d",
672 			k->na->name,
673 			k->nr_hwcur, k->nr_hwtail, k->nkr_hwlease,
674 			k->nkr_lease_idx, k->nkr_num_slots);
675 	}
676 	return lease_idx;
677 }
678 
679 /*
680  *
681  * This flush routine supports only unicast and broadcast but a large
682  * number of ports, and lets us replace the learn and dispatch functions.
683  */
684 int
685 nm_bdg_flush(struct nm_bdg_fwd *ft, u_int n, struct netmap_vp_adapter *na,
686 		u_int ring_nr)
687 {
688 	struct nm_bdg_q *dst_ents, *brddst;
689 	uint16_t num_dsts = 0, *dsts;
690 	struct nm_bridge *b = na->na_bdg;
691 	u_int i, me = na->bdg_port;
692 
693 	/*
694 	 * The work area (pointed by ft) is followed by an array of
695 	 * pointers to queues , dst_ents; there are NM_BDG_MAXRINGS
696 	 * queues per port plus one for the broadcast traffic.
697 	 * Then we have an array of destination indexes.
698 	 */
699 	dst_ents = (struct nm_bdg_q *)(ft + NM_BDG_BATCH_MAX);
700 	dsts = (uint16_t *)(dst_ents + NM_BDG_MAXPORTS * NM_BDG_MAXRINGS + 1);
701 
702 	/* first pass: find a destination for each packet in the batch */
703 	for (i = 0; likely(i < n); i += ft[i].ft_frags) {
704 		uint8_t dst_ring = ring_nr; /* default, same ring as origin */
705 		uint16_t dst_port, d_i;
706 		struct nm_bdg_q *d;
707 		struct nm_bdg_fwd *start_ft = NULL;
708 
709 		ND("slot %d frags %d", i, ft[i].ft_frags);
710 
711 		if (na->up.virt_hdr_len < ft[i].ft_len) {
712 			ft[i].ft_offset = na->up.virt_hdr_len;
713 			start_ft = &ft[i];
714 		} else if (na->up.virt_hdr_len == ft[i].ft_len && ft[i].ft_flags & NS_MOREFRAG) {
715 			ft[i].ft_offset = ft[i].ft_len;
716 			start_ft = &ft[i+1];
717 		} else {
718 			/* Drop the packet if the virtio-net header is not into the first
719 			 * fragment nor at the very beginning of the second.
720 			 */
721 			continue;
722 		}
723 		dst_port = b->bdg_ops->lookup(start_ft, &dst_ring, na, b->private_data);
724 		if (netmap_verbose > 255)
725 			RD(5, "slot %d port %d -> %d", i, me, dst_port);
726 		if (dst_port >= NM_BDG_NOPORT)
727 			continue; /* this packet is identified to be dropped */
728 		else if (dst_port == NM_BDG_BROADCAST)
729 			dst_ring = 0; /* broadcasts always go to ring 0 */
730 		else if (unlikely(dst_port == me ||
731 		    !b->bdg_ports[dst_port]))
732 			continue;
733 
734 		/* get a position in the scratch pad */
735 		d_i = dst_port * NM_BDG_MAXRINGS + dst_ring;
736 		d = dst_ents + d_i;
737 
738 		/* append the first fragment to the list */
739 		if (d->bq_head == NM_FT_NULL) { /* new destination */
740 			d->bq_head = d->bq_tail = i;
741 			/* remember this position to be scanned later */
742 			if (dst_port != NM_BDG_BROADCAST)
743 				dsts[num_dsts++] = d_i;
744 		} else {
745 			ft[d->bq_tail].ft_next = i;
746 			d->bq_tail = i;
747 		}
748 		d->bq_len += ft[i].ft_frags;
749 	}
750 
751 	/*
752 	 * Broadcast traffic goes to ring 0 on all destinations.
753 	 * So we need to add these rings to the list of ports to scan.
754 	 * XXX at the moment we scan all NM_BDG_MAXPORTS ports, which is
755 	 * expensive. We should keep a compact list of active destinations
756 	 * so we could shorten this loop.
757 	 */
758 	brddst = dst_ents + NM_BDG_BROADCAST * NM_BDG_MAXRINGS;
759 	if (brddst->bq_head != NM_FT_NULL) {
760 		u_int j;
761 		for (j = 0; likely(j < b->bdg_active_ports); j++) {
762 			uint16_t d_i;
763 			i = b->bdg_port_index[j];
764 			if (unlikely(i == me))
765 				continue;
766 			d_i = i * NM_BDG_MAXRINGS;
767 			if (dst_ents[d_i].bq_head == NM_FT_NULL)
768 				dsts[num_dsts++] = d_i;
769 		}
770 	}
771 
772 	ND(5, "pass 1 done %d pkts %d dsts", n, num_dsts);
773 	/* second pass: scan destinations */
774 	for (i = 0; i < num_dsts; i++) {
775 		struct netmap_vp_adapter *dst_na;
776 		struct netmap_kring *kring;
777 		struct netmap_ring *ring;
778 		u_int dst_nr, lim, j, d_i, next, brd_next;
779 		u_int needed, howmany;
780 		int retry = netmap_txsync_retry;
781 		struct nm_bdg_q *d;
782 		uint32_t my_start = 0, lease_idx = 0;
783 		int nrings;
784 		int virt_hdr_mismatch = 0;
785 
786 		d_i = dsts[i];
787 		ND("second pass %d port %d", i, d_i);
788 		d = dst_ents + d_i;
789 		// XXX fix the division
790 		dst_na = b->bdg_ports[d_i/NM_BDG_MAXRINGS];
791 		/* protect from the lookup function returning an inactive
792 		 * destination port
793 		 */
794 		if (unlikely(dst_na == NULL))
795 			goto cleanup;
796 		if (dst_na->up.na_flags & NAF_SW_ONLY)
797 			goto cleanup;
798 		/*
799 		 * The interface may be in !netmap mode in two cases:
800 		 * - when na is attached but not activated yet;
801 		 * - when na is being deactivated but is still attached.
802 		 */
803 		if (unlikely(!nm_netmap_on(&dst_na->up))) {
804 			ND("not in netmap mode!");
805 			goto cleanup;
806 		}
807 
808 		/* there is at least one either unicast or broadcast packet */
809 		brd_next = brddst->bq_head;
810 		next = d->bq_head;
811 		/* we need to reserve this many slots. If fewer are
812 		 * available, some packets will be dropped.
813 		 * Packets may have multiple fragments, so we may not use
814 		 * there is a chance that we may not use all of the slots
815 		 * we have claimed, so we will need to handle the leftover
816 		 * ones when we regain the lock.
817 		 */
818 		needed = d->bq_len + brddst->bq_len;
819 
820 		if (unlikely(dst_na->up.virt_hdr_len != na->up.virt_hdr_len)) {
821 			if (netmap_verbose) {
822 				RD(3, "virt_hdr_mismatch, src %d dst %d", na->up.virt_hdr_len,
823 						dst_na->up.virt_hdr_len);
824 			}
825 			/* There is a virtio-net header/offloadings mismatch between
826 			 * source and destination. The slower mismatch datapath will
827 			 * be used to cope with all the mismatches.
828 			 */
829 			virt_hdr_mismatch = 1;
830 			if (dst_na->mfs < na->mfs) {
831 				/* We may need to do segmentation offloadings, and so
832 				 * we may need a number of destination slots greater
833 				 * than the number of input slots ('needed').
834 				 * We look for the smallest integer 'x' which satisfies:
835 				 *	needed * na->mfs + x * H <= x * na->mfs
836 				 * where 'H' is the length of the longest header that may
837 				 * be replicated in the segmentation process (e.g. for
838 				 * TCPv4 we must account for ethernet header, IP header
839 				 * and TCPv4 header).
840 				 */
841 				KASSERT(dst_na->mfs > 0, ("vpna->mfs is 0"));
842 				needed = (needed * na->mfs) /
843 						(dst_na->mfs - WORST_CASE_GSO_HEADER) + 1;
844 				ND(3, "srcmtu=%u, dstmtu=%u, x=%u", na->mfs, dst_na->mfs, needed);
845 			}
846 		}
847 
848 		ND(5, "pass 2 dst %d is %x %s",
849 			i, d_i, is_vp ? "virtual" : "nic/host");
850 		dst_nr = d_i & (NM_BDG_MAXRINGS-1);
851 		nrings = dst_na->up.num_rx_rings;
852 		if (dst_nr >= nrings)
853 			dst_nr = dst_nr % nrings;
854 		kring = dst_na->up.rx_rings[dst_nr];
855 		ring = kring->ring;
856 		/* the destination ring may have not been opened for RX */
857 		if (unlikely(ring == NULL || kring->nr_mode != NKR_NETMAP_ON))
858 			goto cleanup;
859 		lim = kring->nkr_num_slots - 1;
860 
861 retry:
862 
863 		if (dst_na->retry && retry) {
864 			/* try to get some free slot from the previous run */
865 			kring->nm_notify(kring, 0);
866 			/* actually useful only for bwraps, since there
867 			 * the notify will trigger a txsync on the hwna. VALE ports
868 			 * have dst_na->retry == 0
869 			 */
870 		}
871 		/* reserve the buffers in the queue and an entry
872 		 * to report completion, and drop lock.
873 		 * XXX this might become a helper function.
874 		 */
875 		mtx_lock(&kring->q_lock);
876 		if (kring->nkr_stopped) {
877 			mtx_unlock(&kring->q_lock);
878 			goto cleanup;
879 		}
880 		my_start = j = kring->nkr_hwlease;
881 		howmany = nm_kr_space(kring, 1);
882 		if (needed < howmany)
883 			howmany = needed;
884 		lease_idx = nm_kr_lease(kring, howmany, 1);
885 		mtx_unlock(&kring->q_lock);
886 
887 		/* only retry if we need more than available slots */
888 		if (retry && needed <= howmany)
889 			retry = 0;
890 
891 		/* copy to the destination queue */
892 		while (howmany > 0) {
893 			struct netmap_slot *slot;
894 			struct nm_bdg_fwd *ft_p, *ft_end;
895 			u_int cnt;
896 
897 			/* find the queue from which we pick next packet.
898 			 * NM_FT_NULL is always higher than valid indexes
899 			 * so we never dereference it if the other list
900 			 * has packets (and if both are empty we never
901 			 * get here).
902 			 */
903 			if (next < brd_next) {
904 				ft_p = ft + next;
905 				next = ft_p->ft_next;
906 			} else { /* insert broadcast */
907 				ft_p = ft + brd_next;
908 				brd_next = ft_p->ft_next;
909 			}
910 			cnt = ft_p->ft_frags; // cnt > 0
911 			if (unlikely(cnt > howmany))
912 			    break; /* no more space */
913 			if (netmap_verbose && cnt > 1)
914 				RD(5, "rx %d frags to %d", cnt, j);
915 			ft_end = ft_p + cnt;
916 			if (unlikely(virt_hdr_mismatch)) {
917 				bdg_mismatch_datapath(na, dst_na, ft_p, ring, &j, lim, &howmany);
918 			} else {
919 				howmany -= cnt;
920 				do {
921 					char *dst, *src = ft_p->ft_buf;
922 					size_t copy_len = ft_p->ft_len, dst_len = copy_len;
923 
924 					slot = &ring->slot[j];
925 					dst = NMB(&dst_na->up, slot);
926 
927 					ND("send [%d] %d(%d) bytes at %s:%d",
928 							i, (int)copy_len, (int)dst_len,
929 							NM_IFPNAME(dst_ifp), j);
930 					/* round to a multiple of 64 */
931 					copy_len = (copy_len + 63) & ~63;
932 
933 					if (unlikely(copy_len > NETMAP_BUF_SIZE(&dst_na->up) ||
934 						     copy_len > NETMAP_BUF_SIZE(&na->up))) {
935 						RD(5, "invalid len %d, down to 64", (int)copy_len);
936 						copy_len = dst_len = 64; // XXX
937 					}
938 					if (ft_p->ft_flags & NS_INDIRECT) {
939 						if (copyin(src, dst, copy_len)) {
940 							// invalid user pointer, pretend len is 0
941 							dst_len = 0;
942 						}
943 					} else {
944 						//memcpy(dst, src, copy_len);
945 						pkt_copy(src, dst, (int)copy_len);
946 					}
947 					slot->len = dst_len;
948 					slot->flags = (cnt << 8)| NS_MOREFRAG;
949 					j = nm_next(j, lim);
950 					needed--;
951 					ft_p++;
952 				} while (ft_p != ft_end);
953 				slot->flags = (cnt << 8); /* clear flag on last entry */
954 			}
955 			/* are we done ? */
956 			if (next == NM_FT_NULL && brd_next == NM_FT_NULL)
957 				break;
958 		}
959 		{
960 		    /* current position */
961 		    uint32_t *p = kring->nkr_leases; /* shorthand */
962 		    uint32_t update_pos;
963 		    int still_locked = 1;
964 
965 		    mtx_lock(&kring->q_lock);
966 		    if (unlikely(howmany > 0)) {
967 			/* not used all bufs. If i am the last one
968 			 * i can recover the slots, otherwise must
969 			 * fill them with 0 to mark empty packets.
970 			 */
971 			ND("leftover %d bufs", howmany);
972 			if (nm_next(lease_idx, lim) == kring->nkr_lease_idx) {
973 			    /* yes i am the last one */
974 			    ND("roll back nkr_hwlease to %d", j);
975 			    kring->nkr_hwlease = j;
976 			} else {
977 			    while (howmany-- > 0) {
978 				ring->slot[j].len = 0;
979 				ring->slot[j].flags = 0;
980 				j = nm_next(j, lim);
981 			    }
982 			}
983 		    }
984 		    p[lease_idx] = j; /* report I am done */
985 
986 		    update_pos = kring->nr_hwtail;
987 
988 		    if (my_start == update_pos) {
989 			/* all slots before my_start have been reported,
990 			 * so scan subsequent leases to see if other ranges
991 			 * have been completed, and to a selwakeup or txsync.
992 		         */
993 			while (lease_idx != kring->nkr_lease_idx &&
994 				p[lease_idx] != NR_NOSLOT) {
995 			    j = p[lease_idx];
996 			    p[lease_idx] = NR_NOSLOT;
997 			    lease_idx = nm_next(lease_idx, lim);
998 			}
999 			/* j is the new 'write' position. j != my_start
1000 			 * means there are new buffers to report
1001 			 */
1002 			if (likely(j != my_start)) {
1003 				kring->nr_hwtail = j;
1004 				still_locked = 0;
1005 				mtx_unlock(&kring->q_lock);
1006 				kring->nm_notify(kring, 0);
1007 				/* this is netmap_notify for VALE ports and
1008 				 * netmap_bwrap_notify for bwrap. The latter will
1009 				 * trigger a txsync on the underlying hwna
1010 				 */
1011 				if (dst_na->retry && retry--) {
1012 					/* XXX this is going to call nm_notify again.
1013 					 * Only useful for bwrap in virtual machines
1014 					 */
1015 					goto retry;
1016 				}
1017 			}
1018 		    }
1019 		    if (still_locked)
1020 			mtx_unlock(&kring->q_lock);
1021 		}
1022 cleanup:
1023 		d->bq_head = d->bq_tail = NM_FT_NULL; /* cleanup */
1024 		d->bq_len = 0;
1025 	}
1026 	brddst->bq_head = brddst->bq_tail = NM_FT_NULL; /* cleanup */
1027 	brddst->bq_len = 0;
1028 	return 0;
1029 }
1030 
1031 /* nm_txsync callback for VALE ports */
1032 static int
1033 netmap_vp_txsync(struct netmap_kring *kring, int flags)
1034 {
1035 	struct netmap_vp_adapter *na =
1036 		(struct netmap_vp_adapter *)kring->na;
1037 	u_int done;
1038 	u_int const lim = kring->nkr_num_slots - 1;
1039 	u_int const head = kring->rhead;
1040 
1041 	if (bridge_batch <= 0) { /* testing only */
1042 		done = head; // used all
1043 		goto done;
1044 	}
1045 	if (!na->na_bdg) {
1046 		done = head;
1047 		goto done;
1048 	}
1049 	if (bridge_batch > NM_BDG_BATCH)
1050 		bridge_batch = NM_BDG_BATCH;
1051 
1052 	done = nm_bdg_preflush(kring, head);
1053 done:
1054 	if (done != head)
1055 		D("early break at %d/ %d, tail %d", done, head, kring->nr_hwtail);
1056 	/*
1057 	 * packets between 'done' and 'cur' are left unsent.
1058 	 */
1059 	kring->nr_hwcur = done;
1060 	kring->nr_hwtail = nm_prev(done, lim);
1061 	if (netmap_verbose)
1062 		D("%s ring %d flags %d", na->up.name, kring->ring_id, flags);
1063 	return 0;
1064 }
1065 
1066 
1067 /* create a netmap_vp_adapter that describes a VALE port.
1068  * Only persistent VALE ports have a non-null ifp.
1069  */
1070 static int
1071 netmap_vp_create(struct nmreq_header *hdr, struct ifnet *ifp,
1072 		struct netmap_mem_d *nmd, struct netmap_vp_adapter **ret)
1073 {
1074 	struct nmreq_register *req = (struct nmreq_register *)(uintptr_t)hdr->nr_body;
1075 	struct netmap_vp_adapter *vpna;
1076 	struct netmap_adapter *na;
1077 	int error = 0;
1078 	u_int npipes = 0;
1079 	u_int extrabufs = 0;
1080 
1081 	if (hdr->nr_reqtype != NETMAP_REQ_REGISTER) {
1082 		return EINVAL;
1083 	}
1084 
1085 	vpna = nm_os_malloc(sizeof(*vpna));
1086 	if (vpna == NULL)
1087 		return ENOMEM;
1088 
1089  	na = &vpna->up;
1090 
1091 	na->ifp = ifp;
1092 	strncpy(na->name, hdr->nr_name, sizeof(na->name));
1093 
1094 	/* bound checking */
1095 	na->num_tx_rings = req->nr_tx_rings;
1096 	nm_bound_var(&na->num_tx_rings, 1, 1, NM_BDG_MAXRINGS, NULL);
1097 	req->nr_tx_rings = na->num_tx_rings; /* write back */
1098 	na->num_rx_rings = req->nr_rx_rings;
1099 	nm_bound_var(&na->num_rx_rings, 1, 1, NM_BDG_MAXRINGS, NULL);
1100 	req->nr_rx_rings = na->num_rx_rings; /* write back */
1101 	nm_bound_var(&req->nr_tx_slots, NM_BRIDGE_RINGSIZE,
1102 			1, NM_BDG_MAXSLOTS, NULL);
1103 	na->num_tx_desc = req->nr_tx_slots;
1104 	nm_bound_var(&req->nr_rx_slots, NM_BRIDGE_RINGSIZE,
1105 			1, NM_BDG_MAXSLOTS, NULL);
1106 	/* validate number of pipes. We want at least 1,
1107 	 * but probably can do with some more.
1108 	 * So let's use 2 as default (when 0 is supplied)
1109 	 */
1110 	nm_bound_var(&npipes, 2, 1, NM_MAXPIPES, NULL);
1111 	/* validate extra bufs */
1112 	nm_bound_var(&extrabufs, 0, 0,
1113 			128*NM_BDG_MAXSLOTS, NULL);
1114 	req->nr_extra_bufs = extrabufs; /* write back */
1115 	na->num_rx_desc = req->nr_rx_slots;
1116 	/* Set the mfs to a default value, as it is needed on the VALE
1117 	 * mismatch datapath. XXX We should set it according to the MTU
1118 	 * known to the kernel. */
1119 	vpna->mfs = NM_BDG_MFS_DEFAULT;
1120 	vpna->last_smac = ~0llu;
1121 	/*if (vpna->mfs > netmap_buf_size)  TODO netmap_buf_size is zero??
1122 		vpna->mfs = netmap_buf_size; */
1123 	if (netmap_verbose)
1124 		D("max frame size %u", vpna->mfs);
1125 
1126 	na->na_flags |= NAF_BDG_MAYSLEEP;
1127 	/* persistent VALE ports look like hw devices
1128 	 * with a native netmap adapter
1129 	 */
1130 	if (ifp)
1131 		na->na_flags |= NAF_NATIVE;
1132 	na->nm_txsync = netmap_vp_txsync;
1133 	na->nm_rxsync = netmap_vp_rxsync;
1134 	na->nm_register = netmap_vp_reg;
1135 	na->nm_krings_create = netmap_vp_krings_create;
1136 	na->nm_krings_delete = netmap_vp_krings_delete;
1137 	na->nm_dtor = netmap_vp_dtor;
1138 	ND("nr_mem_id %d", req->nr_mem_id);
1139 	na->nm_mem = nmd ?
1140 		netmap_mem_get(nmd):
1141 		netmap_mem_private_new(
1142 			na->num_tx_rings, na->num_tx_desc,
1143 			na->num_rx_rings, na->num_rx_desc,
1144 			req->nr_extra_bufs, npipes, &error);
1145 	if (na->nm_mem == NULL)
1146 		goto err;
1147 	na->nm_bdg_attach = netmap_vp_bdg_attach;
1148 	/* other nmd fields are set in the common routine */
1149 	error = netmap_attach_common(na);
1150 	if (error)
1151 		goto err;
1152 	*ret = vpna;
1153 	return 0;
1154 
1155 err:
1156 	if (na->nm_mem != NULL)
1157 		netmap_mem_put(na->nm_mem);
1158 	nm_os_free(vpna);
1159 	return error;
1160 }
1161 
1162 /* nm_bdg_attach callback for VALE ports
1163  * The na_vp port is this same netmap_adapter. There is no host port.
1164  */
1165 static int
1166 netmap_vp_bdg_attach(const char *name, struct netmap_adapter *na,
1167 		struct nm_bridge *b)
1168 {
1169 	struct netmap_vp_adapter *vpna = (struct netmap_vp_adapter *)na;
1170 
1171 	if (b->bdg_ops != &vale_bdg_ops) {
1172 		return NM_NEED_BWRAP;
1173 	}
1174 	if (vpna->na_bdg) {
1175 		return NM_NEED_BWRAP;
1176 	}
1177 	na->na_vp = vpna;
1178 	strncpy(na->name, name, sizeof(na->name));
1179 	na->na_hostvp = NULL;
1180 	return 0;
1181 }
1182 
/*
 * nm_krings_create callback installed by netmap_vale_bwrap_attach().
 *
 * Creates the krings as a netmap_vp_adapter would, then runs the
 * common bwrap creation step; if the latter fails the krings created
 * by the first step are deleted again.
 *
 * Returns 0 on success or the error of the step that failed.
 */
static int
netmap_vale_bwrap_krings_create(struct netmap_adapter *na)
{
	/* impersonate a netmap_vp_adapter */
	int error = netmap_vp_krings_create(na);

	if (error == 0) {
		error = netmap_bwrap_krings_create_common(na);
		if (error != 0) {
			/* undo the first step */
			netmap_vp_krings_delete(na);
		}
	}
	return error;
}
1198 
/*
 * nm_krings_delete callback installed by netmap_vale_bwrap_attach().
 *
 * Runs the two steps of netmap_vale_bwrap_krings_create() in the
 * opposite order: first the common bwrap deletion, then the deletion
 * of the krings as done for a netmap_vp_adapter.
 */
static void
netmap_vale_bwrap_krings_delete(struct netmap_adapter *na)
{
	netmap_bwrap_krings_delete_common(na);
	netmap_vp_krings_delete(na);
}
1205 
1206 static int
1207 netmap_vale_bwrap_attach(const char *nr_name, struct netmap_adapter *hwna)
1208 {
1209 	struct netmap_bwrap_adapter *bna;
1210 	struct netmap_adapter *na = NULL;
1211 	struct netmap_adapter *hostna = NULL;
1212 	int error;
1213 
1214 	bna = nm_os_malloc(sizeof(*bna));
1215 	if (bna == NULL) {
1216 		return ENOMEM;
1217 	}
1218 	na = &bna->up.up;
1219 	strncpy(na->name, nr_name, sizeof(na->name));
1220 	na->nm_register = netmap_bwrap_reg;
1221 	na->nm_txsync = netmap_vp_txsync;
1222 	// na->nm_rxsync = netmap_bwrap_rxsync;
1223 	na->nm_krings_create = netmap_vale_bwrap_krings_create;
1224 	na->nm_krings_delete = netmap_vale_bwrap_krings_delete;
1225 	na->nm_notify = netmap_bwrap_notify;
1226 	bna->up.retry = 1; /* XXX maybe this should depend on the hwna */
1227 	/* Set the mfs, needed on the VALE mismatch datapath. */
1228 	bna->up.mfs = NM_BDG_MFS_DEFAULT;
1229 
1230 	if (hwna->na_flags & NAF_HOST_RINGS) {
1231 		hostna = &bna->host.up;
1232 		hostna->nm_notify = netmap_bwrap_notify;
1233 		bna->host.mfs = NM_BDG_MFS_DEFAULT;
1234 	}
1235 
1236 	error = netmap_bwrap_attach_common(na, hwna);
1237 	if (error) {
1238 		nm_os_free(bna);
1239 	}
1240 	return error;
1241 }
1242 
1243 int
1244 netmap_get_vale_na(struct nmreq_header *hdr, struct netmap_adapter **na,
1245 		struct netmap_mem_d *nmd, int create)
1246 {
1247 	return netmap_get_bdg_na(hdr, na, nmd, create, &vale_bdg_ops);
1248 }
1249 
1250 
1251 /* creates a persistent VALE port */
1252 int
1253 nm_vi_create(struct nmreq_header *hdr)
1254 {
1255 	struct nmreq_vale_newif *req =
1256 		(struct nmreq_vale_newif *)(uintptr_t)hdr->nr_body;
1257 	int error = 0;
1258 	/* Build a nmreq_register out of the nmreq_vale_newif,
1259 	 * so that we can call netmap_get_bdg_na(). */
1260 	struct nmreq_register regreq;
1261 	bzero(&regreq, sizeof(regreq));
1262 	regreq.nr_tx_slots = req->nr_tx_slots;
1263 	regreq.nr_rx_slots = req->nr_rx_slots;
1264 	regreq.nr_tx_rings = req->nr_tx_rings;
1265 	regreq.nr_rx_rings = req->nr_rx_rings;
1266 	regreq.nr_mem_id = req->nr_mem_id;
1267 	hdr->nr_reqtype = NETMAP_REQ_REGISTER;
1268 	hdr->nr_body = (uintptr_t)&regreq;
1269 	error = netmap_vi_create(hdr, 0 /* no autodelete */);
1270 	hdr->nr_reqtype = NETMAP_REQ_VALE_NEWIF;
1271 	hdr->nr_body = (uintptr_t)req;
1272 	/* Write back to the original struct. */
1273 	req->nr_tx_slots = regreq.nr_tx_slots;
1274 	req->nr_rx_slots = regreq.nr_rx_slots;
1275 	req->nr_tx_rings = regreq.nr_tx_rings;
1276 	req->nr_rx_rings = regreq.nr_rx_rings;
1277 	req->nr_mem_id = regreq.nr_mem_id;
1278 	return error;
1279 }
1280 
1281 /* remove a persistent VALE port from the system */
1282 int
1283 nm_vi_destroy(const char *name)
1284 {
1285 	struct ifnet *ifp;
1286 	struct netmap_vp_adapter *vpna;
1287 	int error;
1288 
1289 	ifp = ifunit_ref(name);
1290 	if (!ifp)
1291 		return ENXIO;
1292 	NMG_LOCK();
1293 	/* make sure this is actually a VALE port */
1294 	if (!NM_NA_VALID(ifp) || NA(ifp)->nm_register != netmap_vp_reg) {
1295 		error = EINVAL;
1296 		goto err;
1297 	}
1298 
1299 	vpna = (struct netmap_vp_adapter *)NA(ifp);
1300 
1301 	/* we can only destroy ports that were created via NETMAP_BDG_NEWIF */
1302 	if (vpna->autodelete) {
1303 		error = EINVAL;
1304 		goto err;
1305 	}
1306 
1307 	/* also make sure that nobody is using the inferface */
1308 	if (NETMAP_OWNED_BY_ANY(&vpna->up) ||
1309 	    vpna->up.na_refcount > 1 /* any ref besides the one in nm_vi_create()? */) {
1310 		error = EBUSY;
1311 		goto err;
1312 	}
1313 
1314 	NMG_UNLOCK();
1315 
1316 	D("destroying a persistent vale interface %s", ifp->if_xname);
1317 	/* Linux requires all the references are released
1318 	 * before unregister
1319 	 */
1320 	netmap_detach(ifp);
1321 	if_rele(ifp);
1322 	nm_os_vi_detach(ifp);
1323 	return 0;
1324 
1325 err:
1326 	NMG_UNLOCK();
1327 	if_rele(ifp);
1328 	return error;
1329 }
1330 
1331 static int
1332 nm_update_info(struct nmreq_register *req, struct netmap_adapter *na)
1333 {
1334 	req->nr_rx_rings = na->num_rx_rings;
1335 	req->nr_tx_rings = na->num_tx_rings;
1336 	req->nr_rx_slots = na->num_rx_desc;
1337 	req->nr_tx_slots = na->num_tx_desc;
1338 	return netmap_mem_get_info(na->nm_mem, &req->nr_memsize, NULL,
1339 					&req->nr_mem_id);
1340 }
1341 
1342 
1343 /*
1344  * Create a virtual interface registered to the system.
1345  * The interface will be attached to a bridge later.
1346  */
1347 int
1348 netmap_vi_create(struct nmreq_header *hdr, int autodelete)
1349 {
1350 	struct nmreq_register *req = (struct nmreq_register *)(uintptr_t)hdr->nr_body;
1351 	struct ifnet *ifp;
1352 	struct netmap_vp_adapter *vpna;
1353 	struct netmap_mem_d *nmd = NULL;
1354 	int error;
1355 
1356 	if (hdr->nr_reqtype != NETMAP_REQ_REGISTER) {
1357 		return EINVAL;
1358 	}
1359 
1360 	/* don't include VALE prefix */
1361 	if (!strncmp(hdr->nr_name, NM_BDG_NAME, strlen(NM_BDG_NAME)))
1362 		return EINVAL;
1363 	if (strlen(hdr->nr_name) >= IFNAMSIZ) {
1364 		return EINVAL;
1365 	}
1366 	ifp = ifunit_ref(hdr->nr_name);
1367 	if (ifp) { /* already exist, cannot create new one */
1368 		error = EEXIST;
1369 		NMG_LOCK();
1370 		if (NM_NA_VALID(ifp)) {
1371 			int update_err = nm_update_info(req, NA(ifp));
1372 			if (update_err)
1373 				error = update_err;
1374 		}
1375 		NMG_UNLOCK();
1376 		if_rele(ifp);
1377 		return error;
1378 	}
1379 	error = nm_os_vi_persist(hdr->nr_name, &ifp);
1380 	if (error)
1381 		return error;
1382 
1383 	NMG_LOCK();
1384 	if (req->nr_mem_id) {
1385 		nmd = netmap_mem_find(req->nr_mem_id);
1386 		if (nmd == NULL) {
1387 			error = EINVAL;
1388 			goto err_1;
1389 		}
1390 	}
1391 	/* netmap_vp_create creates a struct netmap_vp_adapter */
1392 	error = netmap_vp_create(hdr, ifp, nmd, &vpna);
1393 	if (error) {
1394 		D("error %d", error);
1395 		goto err_1;
1396 	}
1397 	/* persist-specific routines */
1398 	vpna->up.nm_bdg_ctl = netmap_vp_bdg_ctl;
1399 	if (!autodelete) {
1400 		netmap_adapter_get(&vpna->up);
1401 	} else {
1402 		vpna->autodelete = 1;
1403 	}
1404 	NM_ATTACH_NA(ifp, &vpna->up);
1405 	/* return the updated info */
1406 	error = nm_update_info(req, &vpna->up);
1407 	if (error) {
1408 		goto err_2;
1409 	}
1410 	ND("returning nr_mem_id %d", req->nr_mem_id);
1411 	if (nmd)
1412 		netmap_mem_put(nmd);
1413 	NMG_UNLOCK();
1414 	ND("created %s", ifp->if_xname);
1415 	return 0;
1416 
1417 err_2:
1418 	netmap_detach(ifp);
1419 err_1:
1420 	if (nmd)
1421 		netmap_mem_put(nmd);
1422 	NMG_UNLOCK();
1423 	nm_os_vi_detach(ifp);
1424 
1425 	return error;
1426 }
1427 
1428 #endif /* WITH_VALE */
1429