xref: /freebsd/sys/dev/netmap/netmap.c (revision 84dfba8d183d31e3412639ecb4b8ad4433cf7e80)
1 /*
2  * Copyright (C) 2011-2013 Matteo Landi, Luigi Rizzo. All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions
6  * are met:
7  *   1. Redistributions of source code must retain the above copyright
8  *      notice, this list of conditions and the following disclaimer.
9  *   2. Redistributions in binary form must reproduce the above copyright
10  *      notice, this list of conditions and the following disclaimer in the
11  *    documentation and/or other materials provided with the distribution.
12  *
13  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
14  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
16  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
17  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
18  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
19  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
20  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
21  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
22  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
23  * SUCH DAMAGE.
24  */
25 
26 
27 /*
28  * This module supports memory mapped access to network devices,
29  * see netmap(4).
30  *
31  * The module uses a large memory pool allocated by the kernel
32  * and accessible as mmapped memory by multiple userspace threads/processes.
33  * The memory pool contains packet buffers and "netmap rings",
34  * i.e. user-accessible copies of the interface's queues.
35  *
36  * Access to the network card works like this:
37  * 1. a process/thread issues one or more open() on /dev/netmap, to create
38  *    a select()able file descriptor on which events are reported.
39  * 2. on each descriptor, the process issues an ioctl() to identify
40  *    the interface that should report events to the file descriptor.
41  * 3. on each descriptor, the process issues an mmap() request to
42  *    map the shared memory region within the process' address space.
43  *    The list of interesting queues is indicated by a location in
44  *    the shared memory region.
45  * 4. using the functions in the netmap(4) userspace API, a process
46  *    can look up the occupation state of a queue, access memory buffers,
47  *    and retrieve received packets or enqueue packets to transmit.
48  * 5. using some ioctl()s the process can synchronize the userspace view
49  *    of the queue with the actual status in the kernel. This includes both
50  *    receiving the notification of new packets, and transmitting new
51  *    packets on the output interface.
52  * 6. select() or poll() can be used to wait for events on individual
53  *    transmit or receive queues (or all queues for a given interface).
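 *
 * As an illustration only (a hedged sketch, not compiled here: it assumes
 * the userspace macros from <net/netmap_user.h>, an interface named "em0",
 * a single ring pair, and omits all error handling), the steps above map
 * to roughly the following userspace code:
 *
 *	struct nmreq req;
 *	int fd = open("/dev/netmap", O_RDWR);
 *
 *	bzero(&req, sizeof(req));
 *	req.nr_version = NETMAP_API;
 *	strncpy(req.nr_name, "em0", sizeof(req.nr_name));
 *	ioctl(fd, NIOCREGIF, &req);		// steps 1-2: bind fd to em0
 *	char *mem = mmap(0, req.nr_memsize,	// step 3: map the shared region
 *		PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
 *	struct netmap_if *nifp = NETMAP_IF(mem, req.nr_offset);
 *	struct netmap_ring *rxr = NETMAP_RXRING(nifp, 0);
 *	struct pollfd pfd = { .fd = fd, .events = POLLIN };
 *
 *	for (;;) {
 *		poll(&pfd, 1, -1);		// steps 5-6: wait and sync
 *		while (rxr->avail > 0) {	// step 4: consume packets
 *			uint32_t i = rxr->cur;
 *			char *buf = NETMAP_BUF(rxr, rxr->slot[i].buf_idx);
 *			// ... process rxr->slot[i].len bytes at buf ...
 *			rxr->cur = NETMAP_RING_NEXT(rxr, i);
 *			rxr->avail--;
 *		}
 *	}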
54  *
55 
56 		SYNCHRONIZATION (USER)
57 
58 The netmap rings and data structures may be shared among multiple
59 user threads or even independent processes.
60 Any synchronization among those threads/processes is delegated
61 to the threads themselves. Only one thread at a time can be in
62 a system call on the same netmap ring. The OS does not enforce
63 this and only guarantees against system crashes in case of
64 invalid usage.
65 
66 		LOCKING (INTERNAL)
67 
68 Within the kernel, access to the netmap rings is protected as follows:
69 
70 - a spinlock on each ring, to handle producer/consumer races on
71   RX rings attached to the host stack (against multiple host
72   threads writing from the host stack to the same ring),
73   and on 'destination' rings attached to a VALE switch
74   (i.e. RX rings in VALE ports, and TX rings in NIC/host ports),
75   protecting multiple active senders for the same destination.
76 
77 - an atomic variable to guarantee that there is at most one
78   instance of *_*xsync() on the ring at any time.
79   For rings connected to user file
80   descriptors, an atomic_test_and_set() protects this, and the
81   lock on the ring is not actually used.
82   For NIC RX rings connected to a VALE switch, an atomic_test_and_set()
83   is also used to prevent multiple executions (the driver might indeed
84   already guarantee this).
85   For NIC TX rings connected to a VALE switch, the lock arbitrates
86   access to the queue (both when allocating buffers and when pushing
87   them out).
88 
89 - *xsync() should be protected against initializations of the card.
90   On FreeBSD most devices have the reset routine protected by
91   a RING lock (ixgbe, igb, em) or core lock (re). lem is missing
92   the RING protection on rx_reset(); this should be added.
93 
94   On linux there is an external lock on the tx path, which probably
95   also arbitrates access to the reset routine. XXX to be revised
96 
97 - a per-interface core_lock protecting access from the host stack
98   while interfaces may be detached from netmap mode.
99   XXX there should be no need for this lock if we detach the interfaces
100   only while they are down.
101 
102 
103 --- VALE SWITCH ---
104 
105 NMG_LOCK() serializes all modifications to switches and ports.
106 A switch cannot be deleted until all ports are gone.
107 
108 For each switch, an SX lock (RWlock on linux) protects
109 deletion of ports. When configuring a new port or deleting an existing one, the
110 lock is acquired in exclusive mode (after holding NMG_LOCK).
111 When forwarding, the lock is acquired in shared mode (without NMG_LOCK).
112 The lock is held throughout the entire forwarding cycle,
113 during which the thread may incur a page fault.
114 Hence it is important that sleepable shared locks are used.
115 
116 On the rx ring, the per-port lock is grabbed initially to reserve
117 a number of slots in the ring, then the lock is released,
118 packets are copied from source to destination, and then
119 the lock is acquired again and the receive ring is updated.
120 (A similar thing is done on the tx ring for NIC and host stack
121 ports attached to the switch)
122 
123  */
124 
125 /*
126  * OS-specific code that is used only within this file.
127  * Other OS-specific code that must be accessed by drivers
128  * is present in netmap_kern.h
129  */
130 
131 #if defined(__FreeBSD__)
132 #include <sys/cdefs.h> /* prerequisite */
133 __FBSDID("$FreeBSD$");
134 
135 #include <sys/types.h>
136 #include <sys/module.h>
137 #include <sys/errno.h>
138 #include <sys/param.h>	/* defines used in kernel.h */
139 #include <sys/jail.h>
140 #include <sys/kernel.h>	/* types used in module initialization */
141 #include <sys/conf.h>	/* cdevsw struct */
142 #include <sys/uio.h>	/* uio struct */
143 #include <sys/sockio.h>
144 #include <sys/socketvar.h>	/* struct socket */
145 #include <sys/malloc.h>
146 #include <sys/mman.h>	/* PROT_EXEC */
147 #include <sys/poll.h>
148 #include <sys/proc.h>
149 #include <sys/rwlock.h>
150 #include <vm/vm.h>	/* vtophys */
151 #include <vm/pmap.h>	/* vtophys */
152 #include <vm/vm_param.h>
153 #include <vm/vm_object.h>
154 #include <vm/vm_page.h>
155 #include <vm/vm_pager.h>
156 #include <vm/uma.h>
157 #include <sys/socket.h> /* sockaddrs */
158 #include <sys/selinfo.h>
159 #include <sys/sysctl.h>
160 #include <net/if.h>
161 #include <net/if_var.h>
162 #include <net/bpf.h>		/* BIOCIMMEDIATE */
163 #include <net/vnet.h>
164 #include <machine/bus.h>	/* bus_dmamap_* */
165 #include <sys/endian.h>
166 #include <sys/refcount.h>
167 
168 #define prefetch(x)	__builtin_prefetch(x)
169 
170 #define BDG_RWLOCK_T		struct rwlock // struct rwlock
171 
172 #define	BDG_RWINIT(b)		\
173 	rw_init_flags(&(b)->bdg_lock, "bdg lock", RW_NOWITNESS)
174 #define BDG_WLOCK(b)		rw_wlock(&(b)->bdg_lock)
175 #define BDG_WUNLOCK(b)		rw_wunlock(&(b)->bdg_lock)
176 #define BDG_RLOCK(b)		rw_rlock(&(b)->bdg_lock)
177 #define BDG_RTRYLOCK(b)		rw_try_rlock(&(b)->bdg_lock)
178 #define BDG_RUNLOCK(b)		rw_runlock(&(b)->bdg_lock)
179 #define BDG_RWDESTROY(b)	rw_destroy(&(b)->bdg_lock)
180 
181 
182 /* netmap global lock.
183  * normally called within the user thread (upon a system call)
184  * or when a file descriptor or process is terminated
185  * (last close or last munmap)
186  */
187 
188 #define NMG_LOCK_T		struct mtx
189 #define NMG_LOCK_INIT()		mtx_init(&netmap_global_lock, "netmap global lock", NULL, MTX_DEF)
190 #define NMG_LOCK_DESTROY()	mtx_destroy(&netmap_global_lock)
191 #define NMG_LOCK()		mtx_lock(&netmap_global_lock)
192 #define NMG_UNLOCK()		mtx_unlock(&netmap_global_lock)
193 #define NMG_LOCK_ASSERT()	mtx_assert(&netmap_global_lock, MA_OWNED)
194 
195 
196 /* atomic operations */
197 #include <machine/atomic.h>
198 #define NM_ATOMIC_TEST_AND_SET(p)	(!atomic_cmpset_acq_int((p), 0, 1))
199 #define NM_ATOMIC_CLEAR(p)		atomic_store_rel_int((p), 0)
200 
201 
202 #elif defined(linux)
203 
204 #include "bsd_glue.h"
205 
206 static netdev_tx_t linux_netmap_start_xmit(struct sk_buff *, struct net_device *);
207 
208 static struct device_driver*
209 linux_netmap_find_driver(struct device *dev)
210 {
211 	struct device_driver *dd;
212 
213 	while ( (dd = dev->driver) == NULL ) {
214 		if ( (dev = dev->parent) == NULL )
215 			return NULL;
216 	}
217 	return dd;
218 }
219 
220 static struct net_device*
221 ifunit_ref(const char *name)
222 {
223 	struct net_device *ifp = dev_get_by_name(&init_net, name);
224 	struct device_driver *dd;
225 
226 	if (ifp == NULL)
227 		return NULL;
228 
229 	if ( (dd = linux_netmap_find_driver(&ifp->dev)) == NULL )
230 		goto error;
231 
232 	if (!try_module_get(dd->owner))
233 		goto error;
234 
235 	return ifp;
236 error:
237 	dev_put(ifp);
238 	return NULL;
239 }
240 
241 static void
242 if_rele(struct net_device *ifp)
243 {
244 	struct device_driver *dd;
245 	dd = linux_netmap_find_driver(&ifp->dev);
246 	dev_put(ifp);
247 	if (dd)
248 		module_put(dd->owner);
249 }
250 
251 // XXX a mtx would suffice here too 20130404 gl
252 #define NMG_LOCK_T		struct semaphore
253 #define NMG_LOCK_INIT()		sema_init(&netmap_global_lock, 1)
254 #define NMG_LOCK_DESTROY()
255 #define NMG_LOCK()		down(&netmap_global_lock)
256 #define NMG_UNLOCK()		up(&netmap_global_lock)
257 #define NMG_LOCK_ASSERT()	//	XXX to be completed
258 
259 
260 #elif defined(__APPLE__)
261 
262 #warning OSX support is only partial
263 #include "osx_glue.h"
264 
265 #else
266 
267 #error	Unsupported platform
268 
269 #endif /* unsupported */
270 
271 /*
272  * common headers
273  */
274 #include <net/netmap.h>
275 #include <dev/netmap/netmap_kern.h>
276 #include <dev/netmap/netmap_mem2.h>
277 
278 
279 MALLOC_DEFINE(M_NETMAP, "netmap", "Network memory map");
280 
281 /*
282  * The following variables are used by the drivers and replicate
283  * fields in the global memory pool. They only refer to buffers
284  * used by physical interfaces.
285  */
286 u_int netmap_total_buffers;
287 u_int netmap_buf_size;
288 char *netmap_buffer_base;	/* also address of an invalid buffer */
289 
290 /* user-controlled variables */
291 int netmap_verbose;
292 
293 static int netmap_no_timestamp; /* don't timestamp on rxsync */
294 
295 SYSCTL_NODE(_dev, OID_AUTO, netmap, CTLFLAG_RW, 0, "Netmap args");
296 SYSCTL_INT(_dev_netmap, OID_AUTO, verbose,
297     CTLFLAG_RW, &netmap_verbose, 0, "Verbose mode");
298 SYSCTL_INT(_dev_netmap, OID_AUTO, no_timestamp,
299     CTLFLAG_RW, &netmap_no_timestamp, 0, "no_timestamp");
300 int netmap_mitigate = 1;
301 SYSCTL_INT(_dev_netmap, OID_AUTO, mitigate, CTLFLAG_RW, &netmap_mitigate, 0, "");
302 int netmap_no_pendintr = 1;
303 SYSCTL_INT(_dev_netmap, OID_AUTO, no_pendintr,
304     CTLFLAG_RW, &netmap_no_pendintr, 0, "Always look for new received packets.");
305 int netmap_txsync_retry = 2;
306 SYSCTL_INT(_dev_netmap, OID_AUTO, txsync_retry, CTLFLAG_RW,
307     &netmap_txsync_retry, 0 , "Number of txsync loops in bridge's flush.");
308 
309 int netmap_drop = 0;	/* debugging */
310 int netmap_flags = 0;	/* debug flags */
311 int netmap_fwd = 0;	/* force transparent mode */
312 int netmap_mmap_unreg = 0; /* allow mmap of unregistered fds */
313 
314 SYSCTL_INT(_dev_netmap, OID_AUTO, drop, CTLFLAG_RW, &netmap_drop, 0 , "");
315 SYSCTL_INT(_dev_netmap, OID_AUTO, flags, CTLFLAG_RW, &netmap_flags, 0 , "");
316 SYSCTL_INT(_dev_netmap, OID_AUTO, fwd, CTLFLAG_RW, &netmap_fwd, 0 , "");
317 SYSCTL_INT(_dev_netmap, OID_AUTO, mmap_unreg, CTLFLAG_RW, &netmap_mmap_unreg, 0, "");
318 
319 NMG_LOCK_T	netmap_global_lock;
320 
321 /*
322  * protect against multiple threads using the same ring.
323  * also check that the ring has not been stopped.
324  */
325 #define NM_KR_BUSY	1
326 #define NM_KR_STOPPED	2
327 static void nm_kr_put(struct netmap_kring *kr);
328 static __inline int nm_kr_tryget(struct netmap_kring *kr)
329 {
330 	/* check a first time without taking the lock
331 	 * to avoid starvation for nm_kr_get()
332 	 */
333 	if (unlikely(kr->nkr_stopped)) {
334 		ND("ring %p stopped (%d)", kr, kr->nkr_stopped);
335 		return NM_KR_STOPPED;
336 	}
337 	if (unlikely(NM_ATOMIC_TEST_AND_SET(&kr->nr_busy)))
338 		return NM_KR_BUSY;
339 	/* check a second time with lock held */
340 	if (unlikely(kr->nkr_stopped)) {
341 		ND("ring %p stopped (%d)", kr, kr->nkr_stopped);
342 		nm_kr_put(kr);
343 		return NM_KR_STOPPED;
344 	}
345 	return 0;
346 }
347 
348 static __inline void nm_kr_put(struct netmap_kring *kr)
349 {
350 	NM_ATOMIC_CLEAR(&kr->nr_busy);
351 }
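
/*
 * A minimal sketch of the intended caller pattern (illustrative only;
 * do_ring_work() is a hypothetical placeholder for whatever per-ring
 * work the caller performs):
 *
 *	if (nm_kr_tryget(kring) != 0)
 *		return;			// ring is stopped or already busy
 *	do_ring_work(kring);
 *	nm_kr_put(kring);
 */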
352 
353 static void nm_kr_get(struct netmap_kring *kr)
354 {
355 	while (NM_ATOMIC_TEST_AND_SET(&kr->nr_busy))
356 		tsleep(kr, 0, "NM_KR_GET", 4);
357 }
358 
359 static void nm_disable_ring(struct netmap_kring *kr)
360 {
361 	kr->nkr_stopped = 1;
362 	nm_kr_get(kr);
363 	mtx_lock(&kr->q_lock);
364 	mtx_unlock(&kr->q_lock);
365 	nm_kr_put(kr);
366 }
367 
368 void netmap_disable_all_rings(struct ifnet *ifp)
369 {
370 	struct netmap_adapter *na;
371 	int i;
372 
373 	if (!(ifp->if_capenable & IFCAP_NETMAP))
374 		return;
375 
376 	na = NA(ifp);
377 
378 	for (i = 0; i < na->num_tx_rings + 1; i++) {
379 		nm_disable_ring(na->tx_rings + i);
380 		selwakeuppri(&na->tx_rings[i].si, PI_NET);
381 	}
382 	for (i = 0; i < na->num_rx_rings + 1; i++) {
383 		nm_disable_ring(na->rx_rings + i);
384 		selwakeuppri(&na->rx_rings[i].si, PI_NET);
385 	}
386 	selwakeuppri(&na->tx_si, PI_NET);
387 	selwakeuppri(&na->rx_si, PI_NET);
388 }
389 
390 void netmap_enable_all_rings(struct ifnet *ifp)
391 {
392 	struct netmap_adapter *na;
393 	int i;
394 
395 	if (!(ifp->if_capenable & IFCAP_NETMAP))
396 		return;
397 
398 	na = NA(ifp);
399 	for (i = 0; i < na->num_tx_rings + 1; i++) {
400 		D("enabling %p", na->tx_rings + i);
401 		na->tx_rings[i].nkr_stopped = 0;
402 	}
403 	for (i = 0; i < na->num_rx_rings + 1; i++) {
404 		D("enabling %p", na->rx_rings + i);
405 		na->rx_rings[i].nkr_stopped = 0;
406 	}
407 }
408 
409 
410 /*
411  * generic bounds-checking function
412  */
413 u_int
414 nm_bound_var(u_int *v, u_int dflt, u_int lo, u_int hi, const char *msg)
415 {
416 	u_int oldv = *v;
417 	const char *op = NULL;
418 
419 	if (dflt < lo)
420 		dflt = lo;
421 	if (dflt > hi)
422 		dflt = hi;
423 	if (oldv < lo) {
424 		*v = dflt;
425 		op = "Bump";
426 	} else if (oldv > hi) {
427 		*v = hi;
428 		op = "Clamp";
429 	}
430 	if (op && msg)
431 		printf("%s %s to %d (was %d)\n", op, msg, *v, oldv);
432 	return *v;
433 }
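
/*
 * Illustrative use only (the variable, bounds and message below are made
 * up for this example): clamp a requested ring size into a sane range.
 *
 *	u_int slots = req_slots;
 *
 *	nm_bound_var(&slots, 1024, 64, 4096, "requested ring size");
 *	// slots now lies in [64..4096]
 */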
434 
435 /*
436  * packet-dump function, user-supplied or static buffer.
437  * The destination buffer must be at least 30+4*len
438  */
439 const char *
440 nm_dump_buf(char *p, int len, int lim, char *dst)
441 {
442 	static char _dst[8192];
443 	int i, j, i0;
444 	static char hex[] ="0123456789abcdef";
445 	char *o;	/* output position */
446 
447 #define P_HI(x)	hex[((x) & 0xf0)>>4]
448 #define P_LO(x)	hex[((x) & 0xf)]
449 #define P_C(x)	((x) >= 0x20 && (x) <= 0x7e ? (x) : '.')
450 	if (!dst)
451 		dst = _dst;
452 	if (lim <= 0 || lim > len)
453 		lim = len;
454 	o = dst;
455 	sprintf(o, "buf 0x%p len %d lim %d\n", p, len, lim);
456 	o += strlen(o);
457 	/* hexdump routine */
458 	for (i = 0; i < lim; ) {
459 		sprintf(o, "%5d: ", i);
460 		o += strlen(o);
461 		memset(o, ' ', 48);
462 		i0 = i;
463 		for (j=0; j < 16 && i < lim; i++, j++) {
464 			o[j*3] = P_HI(p[i]);
465 			o[j*3+1] = P_LO(p[i]);
466 		}
467 		i = i0;
468 		for (j=0; j < 16 && i < lim; i++, j++)
469 			o[j + 48] = P_C(p[i]);
470 		o[j+48] = '\n';
471 		o += j+49;
472 	}
473 	*o = '\0';
474 #undef P_HI
475 #undef P_LO
476 #undef P_C
477 	return dst;
478 }
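
/*
 * Typical (illustrative) use, relying on the static internal buffer;
 * buf and len stand for whatever packet the caller has at hand:
 *
 *	D("%s", nm_dump_buf(buf, len, 128, NULL));
 */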
479 
480 /*
481  * system parameters (most of them in netmap_kern.h)
482  * NM_NAME	prefix for switch port names, default "vale"
483  * NM_BDG_MAXPORTS	number of ports
484  * NM_BRIDGES	max number of switches in the system.
485  *	XXX should become a sysctl or tunable
486  *
487  * Switch ports are named valeX:Y where X is the switch name and Y
488  * is the port. If Y matches a physical interface name, the port is
489  * connected to a physical device.
490  *
491  * Unlike physical interfaces, switch ports use their own memory region
492  * for rings and buffers.
493  * The virtual interfaces use a per-queue lock instead of the core lock.
494  * In the tx loop, we aggregate traffic in batches to make all operations
495  * faster. The batch size is bridge_batch.
496  */
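
/*
 * As a hedged example of the naming scheme above (names are arbitrary):
 * a process creates a virtual port "p1" on switch "vale0" simply by
 * registering that name on an open /dev/netmap descriptor,
 *
 *	strncpy(req.nr_name, "vale0:p1", sizeof(req.nr_name));
 *	ioctl(fd, NIOCREGIF, &req);
 *
 * whereas a name such as "vale0:em0" refers to the physical interface
 * em0 (normally attached to the switch through the NETMAP_BDG_ATTACH
 * subcommand rather than a plain register).
 */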
497 #define NM_BDG_MAXRINGS		16	/* XXX unclear how many. */
498 #define NM_BDG_MAXSLOTS		4096	/* XXX same as above */
499 #define NM_BRIDGE_RINGSIZE	1024	/* in the device */
500 #define NM_BDG_HASH		1024	/* forwarding table entries */
501 #define NM_BDG_BATCH		1024	/* entries in the forwarding buffer */
502 #define NM_MULTISEG		64	/* max size of a chain of bufs */
503 /* actual size of the tables */
504 #define NM_BDG_BATCH_MAX	(NM_BDG_BATCH + NM_MULTISEG)
505 /* NM_FT_NULL terminates a list of slots in the ft */
506 #define NM_FT_NULL		NM_BDG_BATCH_MAX
507 #define	NM_BRIDGES		8	/* number of bridges */
508 
509 
510 /*
511  * bridge_batch is set via sysctl to the max batch size to be
512  * used in the bridge. The actual value may be larger as the
513  * last packet in the block may overflow the size.
514  */
515 int bridge_batch = NM_BDG_BATCH; /* bridge batch size */
516 SYSCTL_INT(_dev_netmap, OID_AUTO, bridge_batch, CTLFLAG_RW, &bridge_batch, 0 , "");
517 
518 
519 /*
520  * These are used to handle reference counters for bridge ports.
521  */
522 #define	ADD_BDG_REF(ifp)	refcount_acquire(&NA(ifp)->na_bdg_refcount)
523 #define	DROP_BDG_REF(ifp)	refcount_release(&NA(ifp)->na_bdg_refcount)
524 
525 /* The bridge references the buffers using the device-specific lookup table */
526 static inline void *
527 BDG_NMB(struct netmap_mem_d *nmd, struct netmap_slot *slot)
528 {
529 	struct lut_entry *lut = nmd->pools[NETMAP_BUF_POOL].lut;
530 	uint32_t i = slot->buf_idx;
531 	return (unlikely(i >= nmd->pools[NETMAP_BUF_POOL].objtotal)) ?  lut[0].vaddr : lut[i].vaddr;
532 }
533 
534 static int bdg_netmap_attach(struct netmap_adapter *);
535 static int bdg_netmap_reg(struct ifnet *ifp, int onoff);
536 int kern_netmap_regif(struct nmreq *nmr);
537 
538 /*
539  * Each transmit queue accumulates a batch of packets into
540  * a structure before forwarding. Packets to the same
541  * destination are put in a list using ft_next as a link field.
542  * ft_frags and ft_next are valid only on the first fragment.
543  */
544 struct nm_bdg_fwd {	/* forwarding entry for a bridge */
545 	void *ft_buf;		/* netmap or indirect buffer */
546 	uint8_t ft_frags;	/* how many fragments (only on 1st frag) */
547 	uint8_t _ft_port;	/* dst port (unused) */
548 	uint16_t ft_flags;	/* flags, e.g. indirect */
549 	uint16_t ft_len;	/* src fragment len */
550 	uint16_t ft_next;	/* next packet to same destination */
551 };
552 
553 /*
554  * For each output interface, nm_bdg_q is used to construct a list.
555  * bq_len is the number of output buffers (we can have coalescing
556  * during the copy).
557  */
558 struct nm_bdg_q {
559 	uint16_t bq_head;
560 	uint16_t bq_tail;
561 	uint32_t bq_len;	/* number of buffers */
562 };
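
/*
 * Sketch of how one destination list is walked during forwarding
 * (illustrative only; here ft is the per-ring array of nm_bdg_fwd
 * entries and d is the nm_bdg_q of the chosen destination):
 *
 *	uint16_t i;
 *
 *	for (i = d->bq_head; i != NM_FT_NULL; i = ft[i].ft_next) {
 *		struct nm_bdg_fwd *p = &ft[i];
 *		// ... copy p->ft_len bytes from p->ft_buf into the
 *		// destination ring, honoring p->ft_frags for chains ...
 *	}
 */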
563 
564 /* XXX revise this */
565 struct nm_hash_ent {
566 	uint64_t	mac;	/* the top 2 bytes are the epoch */
567 	uint64_t	ports;
568 };
569 
570 /*
571  * nm_bridge is a descriptor for a VALE switch.
572  * Interfaces for a bridge are all in bdg_ports[].
573  * The array has a fixed size; an empty entry does not terminate
574  * the search, but lookups only occur on attach/detach so we
575  * don't mind if they are slow.
576  *
577  * The bridge is non-blocking on the transmit ports: excess
578  * packets are dropped if there is no room on the output port.
579  *
580  * bdg_lock protects accesses to the bdg_ports array.
581  * This is a rw lock (or equivalent).
582  */
583 struct nm_bridge {
584 	/* XXX what is the proper alignment/layout ? */
585 	BDG_RWLOCK_T	bdg_lock;	/* protects bdg_ports */
586 	int		bdg_namelen;
587 	uint32_t	bdg_active_ports; /* 0 means free */
588 	char		bdg_basename[IFNAMSIZ];
589 
590 	/* Indexes of active ports (up to active_ports)
591 	 * and all other remaining ports.
592 	 */
593 	uint8_t		bdg_port_index[NM_BDG_MAXPORTS];
594 
595 	struct netmap_adapter *bdg_ports[NM_BDG_MAXPORTS];
596 
597 
598 	/*
599 	 * The function to decide the destination port.
600 	 * It returns either the index of the destination port,
601 	 * NM_BDG_BROADCAST to broadcast this packet, or NM_BDG_NOPORT not to
602 	 * forward this packet.  ring_nr is the source ring index, and the
603 	 * function may overwrite this value to forward this packet to a
604 	 * different ring index.
605 	 * This function must be set by netmap_bdgctl().
606 	 * This function must be set by netmap_bdgctl() (see the sketch after this struct).
607 	bdg_lookup_fn_t nm_bdg_lookup;
608 
609 	/* the forwarding table, MAC+ports.
610 	 * XXX should be changed to an argument to be passed to
611 	 * the lookup function, and allocated on attach
612 	 */
613 	struct nm_hash_ent ht[NM_BDG_HASH];
614 };
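
/*
 * Minimal sketch of a custom lookup function as referenced above (hedged:
 * the authoritative bdg_lookup_fn_t prototype is in netmap_kern.h); this
 * example simply broadcasts every packet:
 *
 *	static u_int
 *	my_bdg_lookup(char *buf, u_int len, uint8_t *ring_nr,
 *		struct netmap_adapter *na)
 *	{
 *		(void)buf; (void)len; (void)ring_nr; (void)na;
 *		return NM_BDG_BROADCAST;
 *	}
 */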
615 
616 
617 /*
618  * XXX in principle nm_bridges could be created dynamically
619  * Right now we have a static array and deletions are protected
620  * by an exclusive lock.
621  */
622 struct nm_bridge nm_bridges[NM_BRIDGES];
623 
624 
625 /*
626  * A few functions to tell which kind of port we are using.
627  * XXX should we hold a lock ?
628  *
629  * nma_is_vp()		virtual port
630  * nma_is_host()	port connected to the host stack
631  * nma_is_hw()		port connected to a NIC
632  */
633 int nma_is_vp(struct netmap_adapter *na);
634 int
635 nma_is_vp(struct netmap_adapter *na)
636 {
637 	return na->nm_register == bdg_netmap_reg;
638 }
639 
640 static __inline int
641 nma_is_host(struct netmap_adapter *na)
642 {
643 	return na->nm_register == NULL;
644 }
645 
646 static __inline int
647 nma_is_hw(struct netmap_adapter *na)
648 {
649 	/* In case of sw adapter, nm_register is NULL */
650 	return !nma_is_vp(na) && !nma_is_host(na);
651 }
652 
653 
654 /*
655  * If the NIC is owned by the kernel
656  * (i.e., a bridge), neither another bridge nor a user can use it;
657  * if the NIC is owned by a user, only users can share it.
658  * Evaluation must be done under NMG_LOCK().
659  */
660 #define NETMAP_OWNED_BY_KERN(ifp)	(!nma_is_vp(NA(ifp)) && NA(ifp)->na_bdg)
661 #define NETMAP_OWNED_BY_ANY(ifp) \
662 	(NETMAP_OWNED_BY_KERN(ifp) || (NA(ifp)->refcount > 0))
663 
664 /*
665  * NA(ifp)->bdg_port	port index
666  */
667 
668 
669 /*
670  * this is a slightly optimized copy routine which rounds the length
671  * up to a multiple of 64 bytes and is often faster than dealing
672  * with other odd sizes. We assume there is enough room
673  * in the source and destination buffers.
674  *
675  * XXX only for multiples of 64 bytes, non overlapped.
676  */
677 static inline void
678 pkt_copy(void *_src, void *_dst, int l)
679 {
680 	uint64_t *src = _src;
681 	uint64_t *dst = _dst;
682 	if (unlikely(l >= 1024)) {
683 		memcpy(dst, src, l);
684 		return;
685 	}
686 	for (; likely(l > 0); l -= 64) {
687 		*dst++ = *src++;
688 		*dst++ = *src++;
689 		*dst++ = *src++;
690 		*dst++ = *src++;
691 		*dst++ = *src++;
692 		*dst++ = *src++;
693 		*dst++ = *src++;
694 		*dst++ = *src++;
695 	}
696 }
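
/*
 * Illustrative caller note: the length is effectively rounded up to a
 * multiple of 64, so both buffers need that much room, e.g.
 *
 *	pkt_copy(src_buf, dst_buf, 60);	// actually copies 64 bytes
 */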
697 
698 
699 /*
700  * locate a bridge among the existing ones.
701  * MUST BE CALLED WITH NMG_LOCK()
702  *
703  * a ':' in the name terminates the bridge name. Otherwise, just NM_NAME.
704  * We assume that this is called with a name of at least NM_NAME chars.
705  */
706 static struct nm_bridge *
707 nm_find_bridge(const char *name, int create)
708 {
709 	int i, l, namelen;
710 	struct nm_bridge *b = NULL;
711 
712 	NMG_LOCK_ASSERT();
713 
714 	namelen = strlen(NM_NAME);	/* base length */
715 	l = name ? strlen(name) : 0;		/* actual length */
716 	if (l < namelen) {
717 		D("invalid bridge name %s", name ? name : NULL);
718 		return NULL;
719 	}
720 	for (i = namelen + 1; i < l; i++) {
721 		if (name[i] == ':') {
722 			namelen = i;
723 			break;
724 		}
725 	}
726 	if (namelen >= IFNAMSIZ)
727 		namelen = IFNAMSIZ;
728 	ND("--- prefix is '%.*s' ---", namelen, name);
729 
730 	/* lookup the name, remember empty slot if there is one */
731 	for (i = 0; i < NM_BRIDGES; i++) {
732 		struct nm_bridge *x = nm_bridges + i;
733 
734 		if (x->bdg_active_ports == 0) {
735 			if (create && b == NULL)
736 				b = x;	/* record empty slot */
737 		} else if (x->bdg_namelen != namelen) {
738 			continue;
739 		} else if (strncmp(name, x->bdg_basename, namelen) == 0) {
740 			ND("found '%.*s' at %d", namelen, name, i);
741 			b = x;
742 			break;
743 		}
744 	}
745 	if (i == NM_BRIDGES && b) { /* name not found, can create entry */
746 		/* initialize the bridge */
747 		strncpy(b->bdg_basename, name, namelen);
748 		ND("create new bridge %s with ports %d", b->bdg_basename,
749 			b->bdg_active_ports);
750 		b->bdg_namelen = namelen;
751 		b->bdg_active_ports = 0;
752 		for (i = 0; i < NM_BDG_MAXPORTS; i++)
753 			b->bdg_port_index[i] = i;
754 		/* set the default function */
755 		b->nm_bdg_lookup = netmap_bdg_learning;
756 		/* reset the MAC address table */
757 		bzero(b->ht, sizeof(struct nm_hash_ent) * NM_BDG_HASH);
758 	}
759 	return b;
760 }
761 
762 
763 /*
764  * Free the forwarding tables for rings attached to switch ports.
765  */
766 static void
767 nm_free_bdgfwd(struct netmap_adapter *na)
768 {
769 	int nrings, i;
770 	struct netmap_kring *kring;
771 
772 	NMG_LOCK_ASSERT();
773 	nrings = nma_is_vp(na) ? na->num_tx_rings : na->num_rx_rings;
774 	kring = nma_is_vp(na) ? na->tx_rings : na->rx_rings;
775 	for (i = 0; i < nrings; i++) {
776 		if (kring[i].nkr_ft) {
777 			free(kring[i].nkr_ft, M_DEVBUF);
778 			kring[i].nkr_ft = NULL; /* protect from freeing twice */
779 		}
780 	}
781 	if (nma_is_hw(na))
782 		nm_free_bdgfwd(SWNA(na->ifp));
783 }
784 
785 
786 /*
787  * Allocate the forwarding tables for the rings attached to the bridge ports.
788  */
789 static int
790 nm_alloc_bdgfwd(struct netmap_adapter *na)
791 {
792 	int nrings, l, i, num_dstq;
793 	struct netmap_kring *kring;
794 
795 	NMG_LOCK_ASSERT();
796 	/* all port:rings + broadcast */
797 	num_dstq = NM_BDG_MAXPORTS * NM_BDG_MAXRINGS + 1;
798 	l = sizeof(struct nm_bdg_fwd) * NM_BDG_BATCH_MAX;
799 	l += sizeof(struct nm_bdg_q) * num_dstq;
800 	l += sizeof(uint16_t) * NM_BDG_BATCH_MAX;
801 
802 	nrings = nma_is_vp(na) ? na->num_tx_rings : na->num_rx_rings;
803 	kring = nma_is_vp(na) ? na->tx_rings : na->rx_rings;
804 	for (i = 0; i < nrings; i++) {
805 		struct nm_bdg_fwd *ft;
806 		struct nm_bdg_q *dstq;
807 		int j;
808 
809 		ft = malloc(l, M_DEVBUF, M_NOWAIT | M_ZERO);
810 		if (!ft) {
811 			nm_free_bdgfwd(na);
812 			return ENOMEM;
813 		}
814 		dstq = (struct nm_bdg_q *)(ft + NM_BDG_BATCH_MAX);
815 		for (j = 0; j < num_dstq; j++) {
816 			dstq[j].bq_head = dstq[j].bq_tail = NM_FT_NULL;
817 			dstq[j].bq_len = 0;
818 		}
819 		kring[i].nkr_ft = ft;
820 	}
821 	if (nma_is_hw(na))
822 		nm_alloc_bdgfwd(SWNA(na->ifp));
823 	return 0;
824 }
825 
826 
827 /*
828  * Fetch configuration from the device, to cope with dynamic
829  * reconfigurations after loading the module.
830  */
831 static int
832 netmap_update_config(struct netmap_adapter *na)
833 {
834 	struct ifnet *ifp = na->ifp;
835 	u_int txr, txd, rxr, rxd;
836 
837 	txr = txd = rxr = rxd = 0;
838 	if (na->nm_config) {
839 		na->nm_config(ifp, &txr, &txd, &rxr, &rxd);
840 	} else {
841 		/* take whatever we had at init time */
842 		txr = na->num_tx_rings;
843 		txd = na->num_tx_desc;
844 		rxr = na->num_rx_rings;
845 		rxd = na->num_rx_desc;
846 	}
847 
848 	if (na->num_tx_rings == txr && na->num_tx_desc == txd &&
849 	    na->num_rx_rings == rxr && na->num_rx_desc == rxd)
850 		return 0; /* nothing changed */
851 	if (netmap_verbose || na->refcount > 0) {
852 		D("stored config %s: txring %d x %d, rxring %d x %d",
853 			ifp->if_xname,
854 			na->num_tx_rings, na->num_tx_desc,
855 			na->num_rx_rings, na->num_rx_desc);
856 		D("new config %s: txring %d x %d, rxring %d x %d",
857 			ifp->if_xname, txr, txd, rxr, rxd);
858 	}
859 	if (na->refcount == 0) {
860 		D("configuration changed (but fine)");
861 		na->num_tx_rings = txr;
862 		na->num_tx_desc = txd;
863 		na->num_rx_rings = rxr;
864 		na->num_rx_desc = rxd;
865 		return 0;
866 	}
867 	D("configuration changed while active, this is bad...");
868 	return 1;
869 }
870 
871 static struct netmap_if *
872 netmap_if_new(const char *ifname, struct netmap_adapter *na)
873 {
874 	if (netmap_update_config(na)) {
875 		/* configuration mismatch, report and fail */
876 		return NULL;
877 	}
878 	return netmap_mem_if_new(ifname, na);
879 }
880 
881 
882 /* Structure associated to each thread which registered an interface.
883  *
884  * The first 4 fields of this structure are written by NIOCREGIF and
885  * read by poll() and NIOC?XSYNC.
886  * There is low contention among writers (actually, a correct user program
887  * should have no contention among writers) and among writers and readers,
888  * so we use a single global lock to protect the structure initialization.
889  * Since initialization involves the allocation of memory, we reuse the memory
890  * allocator lock.
891  * Read access to the structure is lock free. Readers must check that
892  * np_nifp is not NULL before using the other fields.
893  * If np_nifp is NULL initialization has not been performed, so they should
894  * return an error to userlevel.
895  *
896  * The ref_done field is used to regulate access to the refcount in the
897  * memory allocator. The refcount must be incremented at most once for
898  * each open("/dev/netmap"). The increment is performed by the first
899  * function that calls netmap_get_memory() (currently called by
900  * mmap(), NIOCGINFO and NIOCREGIF).
901  * If the refcount is incremented, it is then decremented when the
902  * private structure is destroyed.
903  */
904 struct netmap_priv_d {
905 	struct netmap_if * volatile np_nifp;	/* netmap if descriptor. */
906 
907 	struct ifnet	*np_ifp;	/* device for which we hold a ref. */
908 	int		np_ringid;	/* from the ioctl */
909 	u_int		np_qfirst, np_qlast;	/* range of rings to scan */
910 	uint16_t	np_txpoll;
911 
912 	struct netmap_mem_d *np_mref;	/* use with NMG_LOCK held */
913 #ifdef __FreeBSD__
914 	int		np_refcount;	/* use with NMG_LOCK held */
915 #endif /* __FreeBSD__ */
916 };
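
/*
 * Sketch of the lock-free reader check described above (illustrative;
 * the syscall paths follow this pattern before touching the other fields):
 *
 *	if (priv->np_nifp == NULL)	// not fully initialized yet
 *		return ENXIO;
 *	rmb();				// make sure following reads are not from cache
 *	// ... np_ifp, np_qfirst, np_qlast etc. can now be used ...
 */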
917 
918 /* grab a reference to the memory allocator, if we don't have one already.  The
919  * reference is taken from the netmap_adapter registered with the priv.
920  *
921  */
922 static int
923 netmap_get_memory_locked(struct netmap_priv_d* p)
924 {
925 	struct netmap_mem_d *nmd;
926 	int error = 0;
927 
928 	if (p->np_ifp == NULL) {
929 		if (!netmap_mmap_unreg)
930 			return ENODEV;
931 		/* for compatibility with older versions of the API
932  		 * we use the global allocator when no interface has been
933  		 * registered
934  		 */
935 		nmd = &nm_mem;
936 	} else {
937 		nmd = NA(p->np_ifp)->nm_mem;
938 	}
939 	if (p->np_mref == NULL) {
940 		error = netmap_mem_finalize(nmd);
941 		if (!error)
942 			p->np_mref = nmd;
943 	} else if (p->np_mref != nmd) {
944 		/* a virtual port has been registered, but previous
945  		 * syscalls already used the global allocator.
946  		 * We cannot continue
947  		 */
948 		error = ENODEV;
949 	}
950 	return error;
951 }
952 
953 static int
954 netmap_get_memory(struct netmap_priv_d* p)
955 {
956 	int error;
957 	NMG_LOCK();
958 	error = netmap_get_memory_locked(p);
959 	NMG_UNLOCK();
960 	return error;
961 }
962 
963 static int
964 netmap_have_memory_locked(struct netmap_priv_d* p)
965 {
966 	return p->np_mref != NULL;
967 }
968 
969 static void
970 netmap_drop_memory_locked(struct netmap_priv_d* p)
971 {
972 	if (p->np_mref) {
973 		netmap_mem_deref(p->np_mref);
974 		p->np_mref = NULL;
975 	}
976 }
977 
978 /*
979  * File descriptor's private data destructor.
980  *
981  * Call nm_register(ifp,0) to stop netmap mode on the interface and
982  * revert to normal operation. We expect that np_ifp has not gone.
983  * The second argument is the nifp to work on. In some cases it is
984  * not attached yet to the netmap_priv_d so we need to pass it as
985  * a separate argument.
986  */
987 /* call with NMG_LOCK held */
988 static void
989 netmap_do_unregif(struct netmap_priv_d *priv, struct netmap_if *nifp)
990 {
991 	struct ifnet *ifp = priv->np_ifp;
992 	struct netmap_adapter *na = NA(ifp);
993 
994 	NMG_LOCK_ASSERT();
995 	na->refcount--;
996 	if (na->refcount <= 0) {	/* last instance */
997 		u_int i;
998 
999 		if (netmap_verbose)
1000 			D("deleting last instance for %s", ifp->if_xname);
1001 		/*
1002 		 * (TO CHECK) This function is only called
1003 		 * when the last reference to this file descriptor goes
1004 		 * away. This means we cannot have any pending poll()
1005 		 * or interrupt routine operating on the structure.
1006 		 * XXX The file may be closed in a thread while
1007 		 * another thread is using it.
1008 		 * Linux keeps the file opened until the last reference
1009 		 * by any outstanding ioctl/poll or mmap is gone.
1010 		 * FreeBSD does not track mmap()s (but we do) and
1011 		 * wakes up any sleeping poll(). Need to check what
1012 		 * happens if the close() occurs while a concurrent
1013 		 * syscall is running.
1014 		 */
1015 		na->nm_register(ifp, 0); /* off, clear IFCAP_NETMAP */
1016 		/* Wake up any sleeping threads. netmap_poll will
1017 		 * then return POLLERR
1018 		 * XXX The wake up now must happen during *_down(), when
1019 		 * we order all activities to stop. -gl
1020 		 */
1021 		nm_free_bdgfwd(na);
1022 		for (i = 0; i < na->num_tx_rings + 1; i++) {
1023 			mtx_destroy(&na->tx_rings[i].q_lock);
1024 		}
1025 		for (i = 0; i < na->num_rx_rings + 1; i++) {
1026 			mtx_destroy(&na->rx_rings[i].q_lock);
1027 		}
1028 		/* XXX kqueue(9) needed; these will mirror knlist_init. */
1029 		/* knlist_destroy(&na->tx_si.si_note); */
1030 		/* knlist_destroy(&na->rx_si.si_note); */
1031 		if (nma_is_hw(na))
1032 			SWNA(ifp)->tx_rings = SWNA(ifp)->rx_rings = NULL;
1033 	}
1034 	/*
1035 	 * netmap_mem_if_delete() deletes the nifp, and if this is
1036 	 * the last instance also buffers, rings and krings.
1037 	 */
1038 	netmap_mem_if_delete(na, nifp);
1039 }
1040 
1041 
1042 /* we assume netmap adapter exists
1043  * Called with NMG_LOCK held
1044  */
1045 static void
1046 nm_if_rele(struct ifnet *ifp)
1047 {
1048 	int i, is_hw, hw, sw, lim;
1049 	struct nm_bridge *b;
1050 	struct netmap_adapter *na;
1051 	uint8_t tmp[NM_BDG_MAXPORTS];
1052 
1053 	NMG_LOCK_ASSERT();
1054 	/* This can be called not only for get_ifp()-ed references where netmap's
1055 	 * capability is guaranteed, but also for non-netmap-capable NICs.
1056 	 */
1057 	if (!NETMAP_CAPABLE(ifp) || !NA(ifp)->na_bdg) {
1058 		if_rele(ifp);
1059 		return;
1060 	}
1061 	na = NA(ifp);
1062 	b = na->na_bdg;
1063 	is_hw = nma_is_hw(na);
1064 
1065 	ND("%s has %d references", ifp->if_xname, NA(ifp)->na_bdg_refcount);
1066 
1067 	if (!DROP_BDG_REF(ifp))
1068 		return;
1069 
1070 	/*
1071 	 * New algorithm:
1072 	 * make a copy of bdg_port_index;
1073 	 * lookup NA(ifp)->bdg_port and SWNA(ifp)->bdg_port
1074 	 * in the array of bdg_port_index, replacing them with
1075 	 * entries from the bottom of the array;
1076 	 * decrement bdg_active_ports;
1077 	 * acquire BDG_WLOCK() and copy back the array.
1078 	 */
1079 
1080 	hw = NA(ifp)->bdg_port;
1081 	sw = (is_hw && SWNA(ifp)->na_bdg) ? SWNA(ifp)->bdg_port : -1;
1082 	lim = b->bdg_active_ports;
1083 
1084 	ND("detach %d and %d (lim %d)", hw, sw, lim);
1085 	/* make a copy of the list of active ports, update it,
1086 	 * and then copy back within BDG_WLOCK().
1087 	 */
1088 	memcpy(tmp, b->bdg_port_index, sizeof(tmp));
1089 	for (i = 0; (hw >= 0 || sw >= 0) && i < lim; ) {
1090 		if (hw >= 0 && tmp[i] == hw) {
1091 			ND("detach hw %d at %d", hw, i);
1092 			lim--; /* point to last active port */
1093 			tmp[i] = tmp[lim]; /* swap with i */
1094 			tmp[lim] = hw;	/* now this is inactive */
1095 			hw = -1;
1096 		} else if (sw >= 0 && tmp[i] == sw) {
1097 			ND("detach sw %d at %d", sw, i);
1098 			lim--;
1099 			tmp[i] = tmp[lim];
1100 			tmp[lim] = sw;
1101 			sw = -1;
1102 		} else {
1103 			i++;
1104 		}
1105 	}
1106 	if (hw >= 0 || sw >= 0) {
1107 		D("XXX delete failed hw %d sw %d, should panic...", hw, sw);
1108 	}
1109 	hw = NA(ifp)->bdg_port;
1110 	sw = (is_hw && SWNA(ifp)->na_bdg) ?  SWNA(ifp)->bdg_port : -1;
1111 
1112 	BDG_WLOCK(b);
1113 	b->bdg_ports[hw] = NULL;
1114 	na->na_bdg = NULL;
1115 	if (sw >= 0) {
1116 		b->bdg_ports[sw] = NULL;
1117 		SWNA(ifp)->na_bdg = NULL;
1118 	}
1119 	memcpy(b->bdg_port_index, tmp, sizeof(tmp));
1120 	b->bdg_active_ports = lim;
1121 	BDG_WUNLOCK(b);
1122 
1123 	ND("now %d active ports", lim);
1124 	if (lim == 0) {
1125 		ND("marking bridge %s as free", b->bdg_basename);
1126 		b->nm_bdg_lookup = NULL;
1127 	}
1128 
1129 	if (is_hw) {
1130 		if_rele(ifp);
1131 	} else {
1132 		if (na->na_flags & NAF_MEM_OWNER)
1133 			netmap_mem_private_delete(na->nm_mem);
1134 		bzero(na, sizeof(*na));
1135 		free(na, M_DEVBUF);
1136 		bzero(ifp, sizeof(*ifp));
1137 		free(ifp, M_DEVBUF);
1138 	}
1139 }
1140 
1141 
1142 /*
1143  * returns 1 if this is the last instance and we can free priv
1144  */
1145 static int
1146 netmap_dtor_locked(struct netmap_priv_d *priv)
1147 {
1148 	struct ifnet *ifp = priv->np_ifp;
1149 
1150 #ifdef __FreeBSD__
1151 	/*
1152 	 * np_refcount is the number of active mmaps on
1153 	 * this file descriptor
1154 	 */
1155 	if (--priv->np_refcount > 0) {
1156 		return 0;
1157 	}
1158 #endif /* __FreeBSD__ */
1159 	if (ifp) {
1160 		netmap_do_unregif(priv, priv->np_nifp);
1161 	}
1162 	netmap_drop_memory_locked(priv);
1163 	if (ifp) {
1164 		nm_if_rele(ifp); /* might also destroy *na */
1165 	}
1166 	return 1;
1167 }
1168 
1169 static void
1170 netmap_dtor(void *data)
1171 {
1172 	struct netmap_priv_d *priv = data;
1173 	int last_instance;
1174 
1175 	NMG_LOCK();
1176 	last_instance = netmap_dtor_locked(priv);
1177 	NMG_UNLOCK();
1178 	if (last_instance) {
1179 		bzero(priv, sizeof(*priv));	/* for safety */
1180 		free(priv, M_DEVBUF);
1181 	}
1182 }
1183 
1184 
1185 #ifdef __FreeBSD__
1186 
1187 /*
1188  * In order to track whether pages are still mapped, we hook into
1189  * the standard cdev_pager and intercept the constructor and
1190  * destructor.
1191  */
1192 
1193 struct netmap_vm_handle_t {
1194 	struct cdev 		*dev;
1195 	struct netmap_priv_d	*priv;
1196 };
1197 
1198 static int
1199 netmap_dev_pager_ctor(void *handle, vm_ooffset_t size, vm_prot_t prot,
1200     vm_ooffset_t foff, struct ucred *cred, u_short *color)
1201 {
1202 	struct netmap_vm_handle_t *vmh = handle;
1203 	D("handle %p size %jd prot %d foff %jd",
1204 		handle, (intmax_t)size, prot, (intmax_t)foff);
1205 	dev_ref(vmh->dev);
1206 	return 0;
1207 }
1208 
1209 
1210 static void
1211 netmap_dev_pager_dtor(void *handle)
1212 {
1213 	struct netmap_vm_handle_t *vmh = handle;
1214 	struct cdev *dev = vmh->dev;
1215 	struct netmap_priv_d *priv = vmh->priv;
1216 	D("handle %p", handle);
1217 	netmap_dtor(priv);
1218 	free(vmh, M_DEVBUF);
1219 	dev_rel(dev);
1220 }
1221 
1222 static int
1223 netmap_dev_pager_fault(vm_object_t object, vm_ooffset_t offset,
1224 	int prot, vm_page_t *mres)
1225 {
1226 	struct netmap_vm_handle_t *vmh = object->handle;
1227 	struct netmap_priv_d *priv = vmh->priv;
1228 	vm_paddr_t paddr;
1229 	vm_page_t page;
1230 	vm_memattr_t memattr;
1231 	vm_pindex_t pidx;
1232 
1233 	ND("object %p offset %jd prot %d mres %p",
1234 			object, (intmax_t)offset, prot, mres);
1235 	memattr = object->memattr;
1236 	pidx = OFF_TO_IDX(offset);
1237 	paddr = netmap_mem_ofstophys(priv->np_mref, offset);
1238 	if (paddr == 0)
1239 		return VM_PAGER_FAIL;
1240 
1241 	if (((*mres)->flags & PG_FICTITIOUS) != 0) {
1242 		/*
1243 		 * If the passed in result page is a fake page, update it with
1244 		 * the new physical address.
1245 		 */
1246 		page = *mres;
1247 		vm_page_updatefake(page, paddr, memattr);
1248 	} else {
1249 		/*
1250 		 * Replace the passed-in reqpage with our own fake page and
1251 		 * free up all of the original pages.
1252 		 */
1253 #ifndef VM_OBJECT_WUNLOCK	/* FreeBSD < 10.x */
1254 #define VM_OBJECT_WUNLOCK VM_OBJECT_UNLOCK
1255 #define VM_OBJECT_WLOCK	VM_OBJECT_LOCK
1256 #endif /* VM_OBJECT_WUNLOCK */
1257 
1258 		VM_OBJECT_WUNLOCK(object);
1259 		page = vm_page_getfake(paddr, memattr);
1260 		VM_OBJECT_WLOCK(object);
1261 		vm_page_lock(*mres);
1262 		vm_page_free(*mres);
1263 		vm_page_unlock(*mres);
1264 		*mres = page;
1265 		vm_page_insert(page, object, pidx);
1266 	}
1267 	page->valid = VM_PAGE_BITS_ALL;
1268 	return (VM_PAGER_OK);
1269 }
1270 
1271 
1272 static struct cdev_pager_ops netmap_cdev_pager_ops = {
1273         .cdev_pg_ctor = netmap_dev_pager_ctor,
1274         .cdev_pg_dtor = netmap_dev_pager_dtor,
1275         .cdev_pg_fault = netmap_dev_pager_fault,
1276 };
1277 
1278 
1279 static int
1280 netmap_mmap_single(struct cdev *cdev, vm_ooffset_t *foff,
1281 	vm_size_t objsize,  vm_object_t *objp, int prot)
1282 {
1283 	int error;
1284 	struct netmap_vm_handle_t *vmh;
1285 	struct netmap_priv_d *priv;
1286 	vm_object_t obj;
1287 
1288 	D("cdev %p foff %jd size %jd objp %p prot %d", cdev,
1289 	    (intmax_t )*foff, (intmax_t )objsize, objp, prot);
1290 
1291 	vmh = malloc(sizeof(struct netmap_vm_handle_t), M_DEVBUF,
1292 			      M_NOWAIT | M_ZERO);
1293 	if (vmh == NULL)
1294 		return ENOMEM;
1295 	vmh->dev = cdev;
1296 
1297 	NMG_LOCK();
1298 	error = devfs_get_cdevpriv((void**)&priv);
1299 	if (error)
1300 		goto err_unlock;
1301 	vmh->priv = priv;
1302 	priv->np_refcount++;
1303 	NMG_UNLOCK();
1304 
1305 	error = netmap_get_memory(priv);
1306 	if (error)
1307 		goto err_deref;
1308 
1309 	obj = cdev_pager_allocate(vmh, OBJT_DEVICE,
1310 		&netmap_cdev_pager_ops, objsize, prot,
1311 		*foff, NULL);
1312 	if (obj == NULL) {
1313 		D("cdev_pager_allocate failed");
1314 		error = EINVAL;
1315 		goto err_deref;
1316 	}
1317 
1318 	*objp = obj;
1319 	return 0;
1320 
1321 err_deref:
1322 	NMG_LOCK();
1323 	priv->np_refcount--;
1324 err_unlock:
1325 	NMG_UNLOCK();
1326 // err:
1327 	free(vmh, M_DEVBUF);
1328 	return error;
1329 }
1330 
1331 
1332 // XXX can we remove this ?
1333 static int
1334 netmap_close(struct cdev *dev, int fflag, int devtype, struct thread *td)
1335 {
1336 	if (netmap_verbose)
1337 		D("dev %p fflag 0x%x devtype %d td %p",
1338 			dev, fflag, devtype, td);
1339 	return 0;
1340 }
1341 
1342 
1343 static int
1344 netmap_open(struct cdev *dev, int oflags, int devtype, struct thread *td)
1345 {
1346 	struct netmap_priv_d *priv;
1347 	int error;
1348 
1349 	(void)dev;
1350 	(void)oflags;
1351 	(void)devtype;
1352 	(void)td;
1353 
1354 	// XXX wait or nowait ?
1355 	priv = malloc(sizeof(struct netmap_priv_d), M_DEVBUF,
1356 			      M_NOWAIT | M_ZERO);
1357 	if (priv == NULL)
1358 		return ENOMEM;
1359 
1360 	error = devfs_set_cdevpriv(priv, netmap_dtor);
1361 	if (error)
1362 	        return error;
1363 
1364 	priv->np_refcount = 1;
1365 
1366 	return 0;
1367 }
1368 #endif /* __FreeBSD__ */
1369 
1370 
1371 /*
1372  * Handlers for synchronization of the queues from/to the host.
1373  * Netmap has two operating modes:
1374  * - in the default mode, the rings connected to the host stack are
1375  *   just another ring pair managed by userspace;
1376  * - in transparent mode (XXX to be defined) incoming packets
1377  *   (from the host or the NIC) are marked as NS_FORWARD upon
1378  *   arrival, and the user application has a chance to reset the
1379  *   flag for packets that should be dropped.
1380  *   On the RXSYNC or poll(), packets in RX rings between
1381  *   kring->nr_hwcur and ring->cur with NS_FORWARD still set are moved
1382  *   to the other side.
1383  * The transfer NIC --> host is relatively easy, just encapsulate
1384  * into mbufs and we are done. The host --> NIC side is slightly
1385  * harder because there might not be room in the tx ring so it
1386  * might take a while before releasing the buffer.
1387  */
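
/*
 * In transparent mode the userspace side of the contract is small; an
 * illustrative sketch (slot i of the RX ring being scanned, with
 * should_drop() a hypothetical application filter):
 *
 *	if (should_drop(&ring->slot[i]))
 *		ring->slot[i].flags &= ~NS_FORWARD;	// do not forward this packet
 */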
1388 
1389 
1390 /*
1391  * pass a chain of buffers to the host stack as coming from 'dst'
1392  */
1393 static void
1394 netmap_send_up(struct ifnet *dst, struct mbuf *head)
1395 {
1396 	struct mbuf *m;
1397 
1398 	/* send packets up, outside the lock */
1399 	while ((m = head) != NULL) {
1400 		head = head->m_nextpkt;
1401 		m->m_nextpkt = NULL;
1402 		if (netmap_verbose & NM_VERB_HOST)
1403 			D("sending up pkt %p size %d", m, MBUF_LEN(m));
1404 		NM_SEND_UP(dst, m);
1405 	}
1406 }
1407 
1408 struct mbq {
1409 	struct mbuf *head;
1410 	struct mbuf *tail;
1411 	int count;
1412 };
1413 
1414 
1415 /*
1416  * put a copy of the buffers marked NS_FORWARD into an mbuf chain.
1417  * Run from hwcur to cur - reserved
1418  */
1419 static void
1420 netmap_grab_packets(struct netmap_kring *kring, struct mbq *q, int force)
1421 {
1422 	/* Take packets from hwcur to cur-reserved and pass them up.
1423 	 * In case of no buffers we give up. At the end of the loop,
1424 	 * the queue is drained in all cases.
1425 	 * XXX handle reserved
1426 	 */
1427 	u_int lim = kring->nkr_num_slots - 1;
1428 	struct mbuf *m, *tail = q->tail;
1429 	u_int k = kring->ring->cur, n = kring->ring->reserved;
1430 	struct netmap_mem_d *nmd = kring->na->nm_mem;
1431 
1432 	/* compute the final position, ring->cur - ring->reserved */
1433 	if (n > 0) {
1434 		if (k < n)
1435 			k += kring->nkr_num_slots;
1436 		k += n;
1437 	}
1438 	for (n = kring->nr_hwcur; n != k;) {
1439 		struct netmap_slot *slot = &kring->ring->slot[n];
1440 
1441 		n = nm_next(n, lim);
1442 		if ((slot->flags & NS_FORWARD) == 0 && !force)
1443 			continue;
1444 		if (slot->len < 14 || slot->len > NETMAP_BDG_BUF_SIZE(nmd)) {
1445 			D("bad pkt at %d len %d", n, slot->len);
1446 			continue;
1447 		}
1448 		slot->flags &= ~NS_FORWARD; // XXX needed ?
1449 		/* XXX adapt to the case of a multisegment packet */
1450 		m = m_devget(BDG_NMB(nmd, slot), slot->len, 0, kring->na->ifp, NULL);
1451 
1452 		if (m == NULL)
1453 			break;
1454 		if (tail)
1455 			tail->m_nextpkt = m;
1456 		else
1457 			q->head = m;
1458 		tail = m;
1459 		q->count++;
1460 		m->m_nextpkt = NULL;
1461 	}
1462 	q->tail = tail;
1463 }
1464 
1465 
1466 /*
1467  * The host ring has packets from nr_hwcur to (cur - reserved)
1468  * to be sent down to the NIC.
1469  * We need to use the queue lock on the source (host RX ring)
1470  * to protect against netmap_transmit.
1471  * If the user is well behaved we do not need to acquire locks
1472  * on the destination(s),
1473  * so we only need to make sure that there are no panics because
1474  * of user errors.
1475  * XXX verify
1476  *
1477  * We scan the tx rings, which have just been
1478  * flushed so nr_hwcur == cur. Pushing packets down means
1479  * increment cur and decrement avail.
1480  * XXX to be verified
1481  */
1482 static void
1483 netmap_sw_to_nic(struct netmap_adapter *na)
1484 {
1485 	struct netmap_kring *kring = &na->rx_rings[na->num_rx_rings];
1486 	struct netmap_kring *k1 = &na->tx_rings[0];
1487 	u_int i, howmany, src_lim, dst_lim;
1488 
1489 	/* XXX we should also check that the carrier is on */
1490 	if (kring->nkr_stopped)
1491 		return;
1492 
1493 	mtx_lock(&kring->q_lock);
1494 
1495 	if (kring->nkr_stopped)
1496 		goto out;
1497 
1498 	howmany = kring->nr_hwavail;	/* XXX otherwise cur - reserved - nr_hwcur */
1499 
1500 	src_lim = kring->nkr_num_slots - 1;
1501 	for (i = 0; howmany > 0 && i < na->num_tx_rings; i++, k1++) {
1502 		ND("%d packets left to ring %d (space %d)", howmany, i, k1->nr_hwavail);
1503 		dst_lim = k1->nkr_num_slots - 1;
1504 		while (howmany > 0 && k1->ring->avail > 0) {
1505 			struct netmap_slot *src, *dst, tmp;
1506 			src = &kring->ring->slot[kring->nr_hwcur];
1507 			dst = &k1->ring->slot[k1->ring->cur];
1508 			tmp = *src;
1509 			src->buf_idx = dst->buf_idx;
1510 			src->flags = NS_BUF_CHANGED;
1511 
1512 			dst->buf_idx = tmp.buf_idx;
1513 			dst->len = tmp.len;
1514 			dst->flags = NS_BUF_CHANGED;
1515 			ND("out len %d buf %d from %d to %d",
1516 				dst->len, dst->buf_idx,
1517 				kring->nr_hwcur, k1->ring->cur);
1518 
1519 			kring->nr_hwcur = nm_next(kring->nr_hwcur, src_lim);
1520 			howmany--;
1521 			kring->nr_hwavail--;
1522 			k1->ring->cur = nm_next(k1->ring->cur, dst_lim);
1523 			k1->ring->avail--;
1524 		}
1525 		kring->ring->cur = kring->nr_hwcur; // XXX
1526 		k1++; // XXX why?
1527 	}
1528 out:
1529 	mtx_unlock(&kring->q_lock);
1530 }
1531 
1532 
1533 /*
1534  * netmap_txsync_to_host() passes packets up. We are called from a
1535  * system call in user process context, and the only contention
1536  * can be among multiple user threads erroneously calling
1537  * this routine concurrently.
1538  */
1539 static void
1540 netmap_txsync_to_host(struct netmap_adapter *na)
1541 {
1542 	struct netmap_kring *kring = &na->tx_rings[na->num_tx_rings];
1543 	struct netmap_ring *ring = kring->ring;
1544 	u_int k, lim = kring->nkr_num_slots - 1;
1545 	struct mbq q = { NULL, NULL, 0 };
1546 
1547 	if (nm_kr_tryget(kring)) {
1548 		D("ring %p busy (user error)", kring);
1549 		return;
1550 	}
1551 	k = ring->cur;
1552 	if (k > lim) {
1553 		D("invalid ring index in stack TX kring %p", kring);
1554 		netmap_ring_reinit(kring);
1555 		nm_kr_put(kring);
1556 		return;
1557 	}
1558 
1559 	/* Take packets from hwcur to cur and pass them up.
1560 	 * In case of no buffers we give up. At the end of the loop,
1561 	 * the queue is drained in all cases.
1562 	 */
1563 	netmap_grab_packets(kring, &q, 1);
1564 	kring->nr_hwcur = k;
1565 	kring->nr_hwavail = ring->avail = lim;
1566 
1567 	nm_kr_put(kring);
1568 	netmap_send_up(na->ifp, q.head);
1569 }
1570 
1571 
1572 /*
1573  * This is the 'txsync' handler to send from a software ring to the
1574  * host stack.
1575  */
1576 /* SWNA(ifp)->txrings[0] is always NA(ifp)->txrings[NA(ifp)->num_txrings] */
1577 static int
1578 netmap_bdg_to_host(struct ifnet *ifp, u_int ring_nr, int flags)
1579 {
1580 	(void)ring_nr;
1581 	(void)flags;
1582 	if (netmap_verbose > 255)
1583 		RD(5, "sync to host %s ring %d", ifp->if_xname, ring_nr);
1584 	netmap_txsync_to_host(NA(ifp));
1585 	return 0;
1586 }
1587 
1588 
1589 /*
1590  * rxsync backend for packets coming from the host stack.
1591  * They have been put in the queue by netmap_transmit() so we
1592  * need to protect access to the kring using a lock.
1593  *
1594  * This routine also does the selrecord if called from the poll handler
1595  * (we know because td != NULL).
1596  *
1597  * NOTE: on linux, selrecord() is defined as a macro and uses pwait
1598  *     as an additional hidden argument.
1599  */
1600 static void
1601 netmap_rxsync_from_host(struct netmap_adapter *na, struct thread *td, void *pwait)
1602 {
1603 	struct netmap_kring *kring = &na->rx_rings[na->num_rx_rings];
1604 	struct netmap_ring *ring = kring->ring;
1605 	u_int j, n, lim = kring->nkr_num_slots;
1606 	u_int k = ring->cur, resvd = ring->reserved;
1607 
1608 	(void)pwait;	/* disable unused warnings */
1609 
1610 	if (kring->nkr_stopped) /* check a first time without lock */
1611 		return;
1612 
1613 	/* XXX as an optimization we could reuse na->core_lock */
1614 	mtx_lock(&kring->q_lock);
1615 
1616 	if (kring->nkr_stopped)  /* check again with lock held */
1617 		goto unlock_out;
1618 
1619 	if (k >= lim) {
1620 		netmap_ring_reinit(kring);
1621 		goto unlock_out;
1622 	}
1623 	/* new packets are already set in nr_hwavail */
1624 	/* skip past packets that userspace has released */
1625 	j = kring->nr_hwcur;
1626 	if (resvd > 0) {
1627 		if (resvd + ring->avail >= lim + 1) {
1628 			D("XXX invalid reserve/avail %d %d", resvd, ring->avail);
1629 			ring->reserved = resvd = 0; // XXX panic...
1630 		}
1631 		k = (k >= resvd) ? k - resvd : k + lim - resvd;
1632         }
1633 	if (j != k) {
1634 		n = k >= j ? k - j : k + lim - j;
1635 		kring->nr_hwavail -= n;
1636 		kring->nr_hwcur = k;
1637 	}
1638 	k = ring->avail = kring->nr_hwavail - resvd;
1639 	if (k == 0 && td)
1640 		selrecord(td, &kring->si);
1641 	if (k && (netmap_verbose & NM_VERB_HOST))
1642 		D("%d pkts from stack", k);
1643 unlock_out:
1644 
1645 	mtx_unlock(&kring->q_lock);
1646 }
1647 
1648 
1649 /*
1650  * MUST BE CALLED UNDER NMG_LOCK()
1651  *
1652  * get a refcounted reference to an interface.
1653  * This is always called in the execution of an ioctl().
1654  *
1655  * Return ENXIO if the interface does not exist, EINVAL if netmap
1656  * is not supported by the interface.
1657  * If successful, hold a reference.
1658  *
1659  * When the NIC is attached to a bridge, the reference is managed
1660  * at na->na_bdg_refcount using ADD/DROP_BDG_REF(), as is done for
1661  * virtual ports.  Hence, on the final DROP_BDG_REF(), the NIC
1662  * is detached from the bridge and then ifp's refcount is dropped
1663  * (for virtual ports this is equivalent to destroying the ifp).
1664  *
1665  * This function uses if_rele() when we want to prevent the NIC from
1666  * being detached from the bridge in error handling.  But once the refcount
1667  * is acquired by this function, it must be released using nm_if_rele().
1668  */
1669 static int
1670 get_ifp(struct nmreq *nmr, struct ifnet **ifp, int create)
1671 {
1672 	const char *name = nmr->nr_name;
1673 	int namelen = strlen(name);
1674 	struct ifnet *iter = NULL;
1675 	int no_prefix = 0;
1676 
1677 	/* first try to see if this is a bridge port. */
1678 	struct nm_bridge *b;
1679 	struct netmap_adapter *na;
1680 	int i, j, cand = -1, cand2 = -1;
1681 	int needed;
1682 
1683 	NMG_LOCK_ASSERT();
1684 	*ifp = NULL;	/* default */
1685 	if (strncmp(name, NM_NAME, sizeof(NM_NAME) - 1)) {
1686 		no_prefix = 1;	/* no VALE prefix */
1687 		goto no_bridge_port;
1688 	}
1689 
1690 	b = nm_find_bridge(name, create);
1691 	if (b == NULL) {
1692 		D("no bridges available for '%s'", name);
1693 		return (ENXIO);
1694 	}
1695 
1696 	/* Now we are sure that name starts with the bridge's name;
1697 	 * look up the port in the bridge. We need to scan the entire
1698 	 * list. It is not important to hold a WLOCK on the bridge
1699 	 * during the search because NMG_LOCK already guarantees
1700 	 * that there are no other possible writers.
1701 	 */
1702 
1703 	/* lookup in the local list of ports */
1704 	for (j = 0; j < b->bdg_active_ports; j++) {
1705 		i = b->bdg_port_index[j];
1706 		na = b->bdg_ports[i];
1707 		// KASSERT(na != NULL);
1708 		iter = na->ifp;
1709 		/* XXX make sure the name only contains one : */
1710 		if (!strcmp(iter->if_xname, name) /* virtual port */ ||
1711 		    (namelen > b->bdg_namelen && !strcmp(iter->if_xname,
1712 		    name + b->bdg_namelen + 1)) /* NIC */) {
1713 			ADD_BDG_REF(iter);
1714 			ND("found existing if %s refs %d", name,
1715 				NA(iter)->na_bdg_refcount);
1716 			*ifp = iter;
1717 			/* we are done, this is surely netmap capable */
1718 			return 0;
1719 		}
1720 	}
1721 	/* not found, should we create it? */
1722 	if (!create)
1723 		return ENXIO;
1724 	/* yes we should, see if we have space to attach entries */
1725 	needed = 2; /* in some cases we only need 1 */
1726 	if (b->bdg_active_ports + needed >= NM_BDG_MAXPORTS) {
1727 		D("bridge full %d, cannot create new port", b->bdg_active_ports);
1728 		return EINVAL;
1729 	}
1730 	/* record the next two ports available, but do not allocate yet */
1731 	cand = b->bdg_port_index[b->bdg_active_ports];
1732 	cand2 = b->bdg_port_index[b->bdg_active_ports + 1];
1733 	ND("+++ bridge %s port %s used %d avail %d %d",
1734 		b->bdg_basename, name, b->bdg_active_ports, cand, cand2);
1735 
1736 	/*
1737 	 * try see if there is a matching NIC with this name
1738 	 * (after the bridge's name)
1739 	 */
1740 	iter = ifunit_ref(name + b->bdg_namelen + 1);
1741 	if (!iter) { /* this is a virtual port */
1742 		/* Create a temporary NA with arguments, then
1743 		 * bdg_netmap_attach() will allocate the real one
1744 		 * and attach it to the ifp
1745 		 */
1746 		struct netmap_adapter tmp_na;
1747 		int error;
1748 
1749 		if (nmr->nr_cmd) {
1750 			/* nr_cmd must be 0 for a virtual port */
1751 			return EINVAL;
1752 		}
1753 		bzero(&tmp_na, sizeof(tmp_na));
1754 		/* bound checking */
1755 		tmp_na.num_tx_rings = nmr->nr_tx_rings;
1756 		nm_bound_var(&tmp_na.num_tx_rings, 1, 1, NM_BDG_MAXRINGS, NULL);
1757 		nmr->nr_tx_rings = tmp_na.num_tx_rings; // write back
1758 		tmp_na.num_rx_rings = nmr->nr_rx_rings;
1759 		nm_bound_var(&tmp_na.num_rx_rings, 1, 1, NM_BDG_MAXRINGS, NULL);
1760 		nmr->nr_rx_rings = tmp_na.num_rx_rings; // write back
1761 		nm_bound_var(&nmr->nr_tx_slots, NM_BRIDGE_RINGSIZE,
1762 				1, NM_BDG_MAXSLOTS, NULL);
1763 		tmp_na.num_tx_desc = nmr->nr_tx_slots;
1764 		nm_bound_var(&nmr->nr_rx_slots, NM_BRIDGE_RINGSIZE,
1765 				1, NM_BDG_MAXSLOTS, NULL);
1766 		tmp_na.num_rx_desc = nmr->nr_rx_slots;
1767 
1768 	 	/* create a struct ifnet for the new port.
1769 		 * need M_NOWAIT as we are under nma_lock
1770 		 */
1771 		iter = malloc(sizeof(*iter), M_DEVBUF, M_NOWAIT | M_ZERO);
1772 		if (!iter)
1773 			return ENOMEM;
1774 
1775 		strcpy(iter->if_xname, name);
1776 		tmp_na.ifp = iter;
1777 		/* bdg_netmap_attach creates a struct netmap_adapter */
1778 		error = bdg_netmap_attach(&tmp_na);
1779 		if (error) {
1780 			D("error %d", error);
1781 			free(iter, M_DEVBUF);
1782 			return error;
1783 		}
1784 		cand2 = -1;	/* only need one port */
1785 	} else if (NETMAP_CAPABLE(iter)) { /* this is a NIC */
1786 		/* make sure the NIC is not already in use */
1787 		if (NETMAP_OWNED_BY_ANY(iter)) {
1788 			D("NIC %s busy, cannot attach to bridge",
1789 				iter->if_xname);
1790 			if_rele(iter); /* don't detach from bridge */
1791 			return EINVAL;
1792 		}
1793 		if (nmr->nr_arg1 != NETMAP_BDG_HOST)
1794 			cand2 = -1; /* only need one port */
1795 	} else { /* not a netmap-capable NIC */
1796 		if_rele(iter); /* don't detach from bridge */
1797 		return EINVAL;
1798 	}
1799 	na = NA(iter);
1800 
1801 	BDG_WLOCK(b);
1802 	na->bdg_port = cand;
1803 	ND("NIC  %p to bridge port %d", NA(iter), cand);
1804 	/* bind the port to the bridge (virtual ports are not active) */
1805 	b->bdg_ports[cand] = na;
1806 	na->na_bdg = b;
1807 	b->bdg_active_ports++;
1808 	if (cand2 >= 0) {
1809 		/* also bind the host stack to the bridge */
1810 		b->bdg_ports[cand2] = SWNA(iter);
1811 		SWNA(iter)->bdg_port = cand2;
1812 		SWNA(iter)->na_bdg = b;
1813 		b->bdg_active_ports++;
1814 		ND("host %p to bridge port %d", SWNA(iter), cand2);
1815 	}
1816 	ADD_BDG_REF(iter);	// XXX one or two ?
1817 	ND("if %s refs %d", name, NA(iter)->na_bdg_refcount);
1818 	BDG_WUNLOCK(b);
1819 	*ifp = iter;
1820 	return 0;
1821 
1822 no_bridge_port:
1823 	*ifp = iter;
1824 	if (! *ifp)
1825 		*ifp = ifunit_ref(name);
1826 	if (*ifp == NULL)
1827 		return (ENXIO);
1828 
1829 	if (NETMAP_CAPABLE(*ifp)) {
1830 		/* Users cannot use the NIC attached to a bridge directly */
1831 		if (no_prefix && NETMAP_OWNED_BY_KERN(*ifp)) {
1832 			if_rele(*ifp); /* don't detach from bridge */
1833 			return EINVAL;
1834 		} else
1835 			return 0;	/* valid pointer, we hold the refcount */
1836 	}
1837 	nm_if_rele(*ifp);
1838 	return EINVAL;	// not NETMAP capable
1839 }
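
/*
 * Illustrative sketch (not part of the build): the typical calling
 * pattern for get_ifp(), as used by the ioctl handlers below.  All
 * names are taken from this file; error handling is abridged.
 *
 *	struct ifnet *ifp;
 *	int error;
 *
 *	NMG_LOCK();
 *	error = get_ifp(nmr, &ifp, 1);		// create if needed
 *	if (error == 0) {
 *		// use NA(ifp) while holding the reference ...
 *		nm_if_rele(ifp);		// return the refcount
 *	}
 *	NMG_UNLOCK();
 */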
1840 
1841 
1842 /*
1843  * Error routine called when txsync/rxsync detects an error.
1844  * Can't do much more than resetting cur = hwcur, avail = hwavail.
1845  * Return 1 on reinit.
1846  *
1847  * This routine is only called by the upper half of the kernel.
1848  * It only reads hwcur (which is changed only by the upper half, too)
1849  * and hwavail (which may be changed by the lower half, but only on
1850  * a tx ring and only to increase it, so any error will be recovered
1851  * on the next call). For the above, we don't strictly need to call
1852  * it under lock.
1853  */
1854 int
1855 netmap_ring_reinit(struct netmap_kring *kring)
1856 {
1857 	struct netmap_ring *ring = kring->ring;
1858 	u_int i, lim = kring->nkr_num_slots - 1;
1859 	int errors = 0;
1860 
1861 	// XXX KASSERT nm_kr_tryget
1862 	RD(10, "called for %s", kring->na->ifp->if_xname);
1863 	if (ring->cur > lim)
1864 		errors++;
1865 	for (i = 0; i <= lim; i++) {
1866 		u_int idx = ring->slot[i].buf_idx;
1867 		u_int len = ring->slot[i].len;
1868 		if (idx < 2 || idx >= netmap_total_buffers) {
1869 			if (!errors++)
1870 				D("bad buffer at slot %d idx %d len %d ", i, idx, len);
1871 			ring->slot[i].buf_idx = 0;
1872 			ring->slot[i].len = 0;
1873 		} else if (len > NETMAP_BDG_BUF_SIZE(kring->na->nm_mem)) {
1874 			ring->slot[i].len = 0;
1875 			if (!errors++)
1876 				D("bad len %d at slot %d idx %d",
1877 					len, i, idx);
1878 		}
1879 	}
1880 	if (errors) {
1881 		int pos = kring - kring->na->tx_rings;
1882 		int n = kring->na->num_tx_rings + 1;
1883 
1884 		RD(10, "total %d errors", errors);
1885 		errors++;
1886 		RD(10, "%s %s[%d] reinit, cur %d -> %d avail %d -> %d",
1887 			kring->na->ifp->if_xname,
1888 			pos < n ?  "TX" : "RX", pos < n ? pos : pos - n,
1889 			ring->cur, kring->nr_hwcur,
1890 			ring->avail, kring->nr_hwavail);
1891 		ring->cur = kring->nr_hwcur;
1892 		ring->avail = kring->nr_hwavail;
1893 	}
1894 	return (errors ? 1 : 0);
1895 }
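
/*
 * Illustrative sketch (not part of the build): a driver txsync/rxsync
 * routine typically validates the user-visible index and falls back
 * to netmap_ring_reinit() when an inconsistency is detected, e.g.:
 *
 *	u_int k = ring->cur;
 *	if (k > lim)
 *		return netmap_ring_reinit(kring);
 *
 * The exact check is driver-specific; this only shows the intended
 * use of the return value (nonzero means the ring was reinitialized).
 */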
1896 
1897 
1898 /*
1899  * Set the ring ID. For devices with a single queue, a request
1900  * for all rings is the same as a single ring.
1901  */
1902 static int
1903 netmap_set_ringid(struct netmap_priv_d *priv, u_int ringid)
1904 {
1905 	struct ifnet *ifp = priv->np_ifp;
1906 	struct netmap_adapter *na = NA(ifp);
1907 	u_int i = ringid & NETMAP_RING_MASK;
1908 	/* initially (np_qfirst == np_qlast) we don't want to lock */
1909 	u_int lim = na->num_rx_rings;
1910 
1911 	if (na->num_tx_rings > lim)
1912 		lim = na->num_tx_rings;
1913 	if ( (ringid & NETMAP_HW_RING) && i >= lim) {
1914 		D("invalid ring id %d", i);
1915 		return (EINVAL);
1916 	}
1917 	priv->np_ringid = ringid;
1918 	if (ringid & NETMAP_SW_RING) {
1919 		priv->np_qfirst = NETMAP_SW_RING;
1920 		priv->np_qlast = 0;
1921 	} else if (ringid & NETMAP_HW_RING) {
1922 		priv->np_qfirst = i;
1923 		priv->np_qlast = i + 1;
1924 	} else {
1925 		priv->np_qfirst = 0;
1926 		priv->np_qlast = NETMAP_HW_RING ;
1927 	}
1928 	priv->np_txpoll = (ringid & NETMAP_NO_TX_POLL) ? 0 : 1;
1929     if (netmap_verbose) {
1930 	if (ringid & NETMAP_SW_RING)
1931 		D("ringid %s set to SW RING", ifp->if_xname);
1932 	else if (ringid & NETMAP_HW_RING)
1933 		D("ringid %s set to HW RING %d", ifp->if_xname,
1934 			priv->np_qfirst);
1935 	else
1936 		D("ringid %s set to all %d HW RINGS", ifp->if_xname, lim);
1937     }
1938 	return 0;
1939 }
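
/*
 * Illustrative sketch (not part of the build): how nr_ringid is
 * typically encoded by userspace before NIOCREGIF, based on the
 * flags handled above:
 *
 *	nmr.nr_ringid = 0;			// bind all hw rings
 *	nmr.nr_ringid = NETMAP_HW_RING | 2;	// bind only hw ring 2
 *	nmr.nr_ringid = NETMAP_SW_RING;		// bind the host (sw) ring
 *	nmr.nr_ringid |= NETMAP_NO_TX_POLL;	// do not txsync on poll()
 */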
1940 
1941 
1942 /*
1943  * possibly move the interface to netmap-mode.
1944  * If success it returns a pointer to netmap_if, otherwise NULL.
1945  * This must be called with NMG_LOCK held.
1946  */
1947 static struct netmap_if *
1948 netmap_do_regif(struct netmap_priv_d *priv, struct ifnet *ifp,
1949 	uint16_t ringid, int *err)
1950 {
1951 	struct netmap_adapter *na = NA(ifp);
1952 	struct netmap_if *nifp = NULL;
1953 	int error, need_mem;
1954 
1955 	NMG_LOCK_ASSERT();
1956 	/* ring configuration may have changed, fetch from the card */
1957 	netmap_update_config(na);
1958 	priv->np_ifp = ifp;     /* store the reference */
1959 	error = netmap_set_ringid(priv, ringid);
1960 	if (error)
1961 		goto out;
1962 	/* ensure allocators are ready */
1963 	need_mem = !netmap_have_memory_locked(priv);
1964 	if (need_mem) {
1965 		error = netmap_get_memory_locked(priv);
1966 		ND("get_memory returned %d", error);
1967 		if (error)
1968 			goto out;
1969 	}
1970 	nifp = netmap_if_new(ifp->if_xname, na);
1971 	if (nifp == NULL) { /* allocation failed */
1972 		/* we should drop the allocator, but only
1973 		 * if we were the ones who grabbed it
1974 		 */
1975 		if (need_mem)
1976 			netmap_drop_memory_locked(priv);
1977 		error = ENOMEM;
1978 		goto out;
1979 	}
1980 	na->refcount++;
1981 	if (ifp->if_capenable & IFCAP_NETMAP) {
1982 		/* was already set */
1983 	} else {
1984 		u_int i;
1985 		/* Otherwise set the card in netmap mode
1986 		 * and make it use the shared buffers.
1987 		 *
1988 		 * If the interface is attached to a bridge, lock it.
1989 		 */
1990 		if (NETMAP_OWNED_BY_KERN(ifp))
1991 			BDG_WLOCK(NA(ifp)->na_bdg);
1992 		for (i = 0 ; i < na->num_tx_rings + 1; i++)
1993 			mtx_init(&na->tx_rings[i].q_lock, "nm_txq_lock",
1994 			    NULL, MTX_DEF);
1995 		for (i = 0 ; i < na->num_rx_rings + 1; i++) {
1996 			mtx_init(&na->rx_rings[i].q_lock, "nm_rxq_lock",
1997 			    NULL, MTX_DEF);
1998 		}
1999 		if (nma_is_hw(na)) {
2000 			SWNA(ifp)->tx_rings = &na->tx_rings[na->num_tx_rings];
2001 			SWNA(ifp)->rx_rings = &na->rx_rings[na->num_rx_rings];
2002 		}
2003 		/*
2004 		 * do not core lock because the race is harmless here,
2005 		 * there cannot be any traffic to netmap_transmit()
2006 		 */
2007 		error = na->nm_register(ifp, 1); /* mode on */
2008 		// XXX do we need to nm_alloc_bdgfwd() in all cases ?
2009 		if (!error)
2010 			error = nm_alloc_bdgfwd(na);
2011 		if (error) {
2012 			netmap_do_unregif(priv, nifp);
2013 			nifp = NULL;
2014 		}
2015 		if (NETMAP_OWNED_BY_KERN(ifp))
2016 			BDG_WUNLOCK(NA(ifp)->na_bdg);
2017 
2018 	}
2019 out:
2020 	*err = error;
2021 	if (nifp != NULL) {
2022 		/*
2023 		 * advertise that the interface is ready by setting np_nifp.
2024 		 * The barrier is needed because readers (poll and *SYNC)
2025 		 * check for priv->np_nifp != NULL without locking
2026 		 */
2027 		wmb(); /* make sure previous writes are visible to all CPUs */
2028 		priv->np_nifp = nifp;
2029 	}
2030 	return nifp;
2031 }
2032 
2033 /* Process NETMAP_BDG_ATTACH and NETMAP_BDG_DETACH */
2034 static int
2035 nm_bdg_attach(struct nmreq *nmr)
2036 {
2037 	struct ifnet *ifp;
2038 	struct netmap_if *nifp;
2039 	struct netmap_priv_d *npriv;
2040 	int error;
2041 
2042 	npriv = malloc(sizeof(*npriv), M_DEVBUF, M_NOWAIT|M_ZERO);
2043 	if (npriv == NULL)
2044 		return ENOMEM;
2045 	NMG_LOCK();
2046 	error = get_ifp(nmr, &ifp, 1 /* create if not exists */);
2047 	if (error) /* no device, or another bridge or user owns the device */
2048 		goto unlock_exit;
2049 	/* get_ifp() sets na_bdg if this is a physical interface
2050 	 * that we can attach to a switch.
2051 	 */
2052 	if (!NETMAP_OWNED_BY_KERN(ifp)) {
2053 		/* got reference to a virtual port or direct access to a NIC.
2054 		 * perhaps specified no bridge prefix or wrong NIC name
2055 		 */
2056 		error = EINVAL;
2057 		goto unref_exit;
2058 	}
2059 
2060 	if (NA(ifp)->refcount > 0) { /* already registered */
2061 		error = EBUSY;
2062 		DROP_BDG_REF(ifp);
2063 		goto unlock_exit;
2064 	}
2065 
2066 	nifp = netmap_do_regif(npriv, ifp, nmr->nr_ringid, &error);
2067 	if (!nifp) {
2068 		goto unref_exit;
2069 	}
2070 
2071 	NA(ifp)->na_kpriv = npriv;
2072 	NMG_UNLOCK();
2073 	ND("registered %s to netmap-mode", ifp->if_xname);
2074 	return 0;
2075 
2076 unref_exit:
2077 	nm_if_rele(ifp);
2078 unlock_exit:
2079 	NMG_UNLOCK();
2080 	bzero(npriv, sizeof(*npriv));
2081 	free(npriv, M_DEVBUF);
2082 	return error;
2083 }
2084 
2085 static int
2086 nm_bdg_detach(struct nmreq *nmr)
2087 {
2088 	struct ifnet *ifp;
2089 	int error;
2090 	int last_instance;
2091 
2092 	NMG_LOCK();
2093 	error = get_ifp(nmr, &ifp, 0 /* don't create */);
2094 	if (error) { /* no device, or another bridge or user owns the device */
2095 		goto unlock_exit;
2096 	}
2097 	/* XXX do we need to check this ? */
2098 	if (!NETMAP_OWNED_BY_KERN(ifp)) {
2099 		/* got reference to a virtual port or direct access to a NIC.
2100 		 * perhaps specified no bridge's prefix or wrong NIC's name
2101 		 */
2102 		error = EINVAL;
2103 		goto unref_exit;
2104 	}
2105 
2106 	if (NA(ifp)->refcount == 0) { /* not registered */
2107 		error = EINVAL;
2108 		goto unref_exit;
2109 	}
2110 
2111 	DROP_BDG_REF(ifp); /* the one from get_ifp */
2112 	last_instance = netmap_dtor_locked(NA(ifp)->na_kpriv); /* unregister */
2113 	NMG_UNLOCK();
2114 	if (!last_instance) {
2115 		D("--- error, trying to detach an entry with active mmaps");
2116 		error = EINVAL;
2117 	} else {
2118 		struct netmap_priv_d *npriv = NA(ifp)->na_kpriv;
2119 		NA(ifp)->na_kpriv = NULL;
2120 
2121 		bzero(npriv, sizeof(*npriv));
2122 		free(npriv, M_DEVBUF);
2123 	}
2124 	return error;
2125 
2126 unref_exit:
2127 	nm_if_rele(ifp);
2128 unlock_exit:
2129 	NMG_UNLOCK();
2130 	return error;
2131 }
2132 
2133 
2134 /* Initialize the necessary fields of the sw adapter, located right after
2135  * the hw one.  The sw adapter handles the pair of sw rings of the
2136  * netmap-mode NIC.  It is always activated and deactivated together with
2137  * the hw adapter, so we don't need refcounting on the sw adapter.
2138  * Regardless of the NIC's features we use a separate lock, so that anybody
2139  * can lock it independently of the hw adapter.
2140  * nm_register is left NULL so that nma_is_hw() treats this adapter as FALSE.
2141  */
2142 static void
2143 netmap_attach_sw(struct ifnet *ifp)
2144 {
2145 	struct netmap_adapter *hw_na = NA(ifp);
2146 	struct netmap_adapter *na = SWNA(ifp);
2147 
2148 	na->ifp = ifp;
2149 	na->num_rx_rings = na->num_tx_rings = 1;
2150 	na->num_tx_desc = hw_na->num_tx_desc;
2151 	na->num_rx_desc = hw_na->num_rx_desc;
2152 	na->nm_txsync = netmap_bdg_to_host;
2153 	/* we use the same memory allocator as the
2154 	 * the hw adapter */
2155 	na->nm_mem = hw_na->nm_mem;
2156 }
2157 
2158 
2159 /* exported to kernel callers, e.g. OVS ?
2160  * Entry point.
2161  * Called without NMG_LOCK.
2162  */
2163 int
2164 netmap_bdg_ctl(struct nmreq *nmr, bdg_lookup_fn_t func)
2165 {
2166 	struct nm_bridge *b;
2167 	struct netmap_adapter *na;
2168 	struct ifnet *iter;
2169 	char *name = nmr->nr_name;
2170 	int cmd = nmr->nr_cmd, namelen = strlen(name);
2171 	int error = 0, i, j;
2172 
2173 	switch (cmd) {
2174 	case NETMAP_BDG_ATTACH:
2175 		error = nm_bdg_attach(nmr);
2176 		break;
2177 
2178 	case NETMAP_BDG_DETACH:
2179 		error = nm_bdg_detach(nmr);
2180 		break;
2181 
2182 	case NETMAP_BDG_LIST:
2183 		/* this is used to enumerate bridges and ports */
2184 		if (namelen) { /* look up indexes of bridge and port */
2185 			if (strncmp(name, NM_NAME, strlen(NM_NAME))) {
2186 				error = EINVAL;
2187 				break;
2188 			}
2189 			NMG_LOCK();
2190 			b = nm_find_bridge(name, 0 /* don't create */);
2191 			if (!b) {
2192 				error = ENOENT;
2193 				NMG_UNLOCK();
2194 				break;
2195 			}
2196 
2197 			error = ENOENT;
2198 			for (j = 0; j < b->bdg_active_ports; j++) {
2199 				i = b->bdg_port_index[j];
2200 				na = b->bdg_ports[i];
2201 				if (na == NULL) {
2202 					D("---AAAAAAAAARGH-------");
2203 					continue;
2204 				}
2205 				iter = na->ifp;
2206 				/* the former and the latter identify a
2207 				 * virtual port and a NIC, respectively
2208 				 */
2209 				if (!strcmp(iter->if_xname, name) ||
2210 				    (namelen > b->bdg_namelen &&
2211 				    !strcmp(iter->if_xname,
2212 				    name + b->bdg_namelen + 1))) {
2213 					/* bridge index */
2214 					nmr->nr_arg1 = b - nm_bridges;
2215 					nmr->nr_arg2 = i; /* port index */
2216 					error = 0;
2217 					break;
2218 				}
2219 			}
2220 			NMG_UNLOCK();
2221 		} else {
2222 			/* return the first non-empty entry starting from
2223 			 * bridge nr_arg1 and port nr_arg2.
2224 			 *
2225 			 * Users can detect the end of the same bridge by
2226 			 * seeing the new and old value of nr_arg1, and can
2227 			 * detect the end of all the bridges by error != 0
2228 			 */
2229 			i = nmr->nr_arg1;
2230 			j = nmr->nr_arg2;
2231 
2232 			NMG_LOCK();
2233 			for (error = ENOENT; i < NM_BRIDGES; i++) {
2234 				b = nm_bridges + i;
2235 				if (j >= b->bdg_active_ports) {
2236 					j = 0; /* following bridges scan from 0 */
2237 					continue;
2238 				}
2239 				nmr->nr_arg1 = i;
2240 				nmr->nr_arg2 = j;
2241 				j = b->bdg_port_index[j];
2242 				na = b->bdg_ports[j];
2243 				iter = na->ifp;
2244 				strncpy(name, iter->if_xname, (size_t)IFNAMSIZ);
2245 				error = 0;
2246 				break;
2247 			}
2248 			NMG_UNLOCK();
2249 		}
2250 		break;
2251 
2252 	case NETMAP_BDG_LOOKUP_REG:
2253 		/* register a lookup function to the given bridge.
2254 		 * nmr->nr_name may be just bridge's name (including ':'
2255 		 * if it is not just NM_NAME).
2256 		 */
2257 		if (!func) {
2258 			error = EINVAL;
2259 			break;
2260 		}
2261 		NMG_LOCK();
2262 		b = nm_find_bridge(name, 0 /* don't create */);
2263 		if (!b) {
2264 			error = EINVAL;
2265 		} else {
2266 			b->nm_bdg_lookup = func;
2267 		}
2268 		NMG_UNLOCK();
2269 		break;
2270 
2271 	default:
2272 		D("invalid cmd (nmr->nr_cmd) (0x%x)", cmd);
2273 		error = EINVAL;
2274 		break;
2275 	}
2276 	return error;
2277 }
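
/*
 * Illustrative sketch (not part of the build): attaching a NIC to a
 * VALE switch from userspace goes through NIOCREGIF with nr_cmd set,
 * which lands in netmap_bdg_ctl() above.  The "vale0:em0" name is an
 * assumption based on the NM_NAME prefix convention.
 *
 *	struct nmreq nmr;
 *
 *	bzero(&nmr, sizeof(nmr));
 *	nmr.nr_version = NETMAP_API;
 *	strncpy(nmr.nr_name, "vale0:em0", sizeof(nmr.nr_name));
 *	nmr.nr_cmd = NETMAP_BDG_ATTACH;
 *	ioctl(fd, NIOCREGIF, &nmr);	// fd is an open /dev/netmap
 *
 * NETMAP_BDG_DETACH follows the same pattern with nr_cmd changed.
 */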
2278 
2279 
2280 /*
2281  * ioctl(2) support for the "netmap" device.
2282  *
2283  * Following a list of accepted commands:
2284  * - NIOCGINFO
2285  * - SIOCGIFADDR	just for convenience
2286  * - NIOCREGIF
2287  * - NIOCUNREGIF
2288  * - NIOCTXSYNC
2289  * - NIOCRXSYNC
2290  *
2291  * Return 0 on success, errno otherwise.
2292  */
2293 static int
2294 netmap_ioctl(struct cdev *dev, u_long cmd, caddr_t data,
2295 	int fflag, struct thread *td)
2296 {
2297 	struct netmap_priv_d *priv = NULL;
2298 	struct ifnet *ifp = NULL;
2299 	struct nmreq *nmr = (struct nmreq *) data;
2300 	struct netmap_adapter *na = NULL;
2301 	int error;
2302 	u_int i, lim;
2303 	struct netmap_if *nifp;
2304 	struct netmap_kring *krings;
2305 
2306 	(void)dev;	/* UNUSED */
2307 	(void)fflag;	/* UNUSED */
2308 #ifdef linux
2309 #define devfs_get_cdevpriv(pp)				\
2310 	({ *(struct netmap_priv_d **)pp = ((struct file *)td)->private_data; 	\
2311 		(*pp ? 0 : ENOENT); })
2312 
2313 /* devfs_set_cdevpriv cannot fail on linux */
2314 #define devfs_set_cdevpriv(p, fn)				\
2315 	({ ((struct file *)td)->private_data = p; (p ? 0 : EINVAL); })
2316 
2317 
2318 #define devfs_clear_cdevpriv()	do {				\
2319 		netmap_dtor(priv); ((struct file *)td)->private_data = 0;	\
2320 	} while (0)
2321 #endif /* linux */
2322 
2323 	CURVNET_SET(TD_TO_VNET(td));
2324 
2325 	error = devfs_get_cdevpriv((void **)&priv);
2326 	if (error) {
2327 		CURVNET_RESTORE();
2328 		/* XXX ENOENT should be impossible, since the priv
2329 		 * is now created in the open */
2330 		return (error == ENOENT ? ENXIO : error);
2331 	}
2332 
2333 	nmr->nr_name[sizeof(nmr->nr_name) - 1] = '\0';	/* truncate name */
2334 	switch (cmd) {
2335 	case NIOCGINFO:		/* return capabilities etc */
2336 		if (nmr->nr_version != NETMAP_API) {
2337 			D("API mismatch got %d have %d",
2338 				nmr->nr_version, NETMAP_API);
2339 			nmr->nr_version = NETMAP_API;
2340 			error = EINVAL;
2341 			break;
2342 		}
2343 		if (nmr->nr_cmd == NETMAP_BDG_LIST) {
2344 			error = netmap_bdg_ctl(nmr, NULL);
2345 			break;
2346 		}
2347 
2348 		NMG_LOCK();
2349 		do {
2350 			/* memsize is always valid */
2351 			struct netmap_mem_d *nmd = &nm_mem;
2352 			u_int memflags;
2353 
2354 			if (nmr->nr_name[0] != '\0') {
2355 				/* get a refcount */
2356 				error = get_ifp(nmr, &ifp, 1 /* create */);
2357 				if (error)
2358 					break;
2359 				na = NA(ifp);  /* retrieve the netmap adapter */
2360 				nmd = na->nm_mem; /* and its memory allocator */
2361 			}
2362 
2363 			error = netmap_mem_get_info(nmd, &nmr->nr_memsize, &memflags);
2364 			if (error)
2365 				break;
2366 			if (na == NULL) /* only memory info */
2367 				break;
2368 			nmr->nr_offset = 0;
2369 			nmr->nr_rx_slots = nmr->nr_tx_slots = 0;
2370 			netmap_update_config(na);
2371 			nmr->nr_rx_rings = na->num_rx_rings;
2372 			nmr->nr_tx_rings = na->num_tx_rings;
2373 			nmr->nr_rx_slots = na->num_rx_desc;
2374 			nmr->nr_tx_slots = na->num_tx_desc;
2375 			if (memflags & NETMAP_MEM_PRIVATE)
2376 				nmr->nr_ringid |= NETMAP_PRIV_MEM;
2377 		} while (0);
2378 		if (ifp)
2379 			nm_if_rele(ifp);	/* return the refcount */
2380 		NMG_UNLOCK();
2381 		break;
2382 
2383 	case NIOCREGIF:
2384 		if (nmr->nr_version != NETMAP_API) {
2385 			nmr->nr_version = NETMAP_API;
2386 			error = EINVAL;
2387 			break;
2388 		}
2389 		/* possibly attach/detach NIC and VALE switch */
2390 		i = nmr->nr_cmd;
2391 		if (i == NETMAP_BDG_ATTACH || i == NETMAP_BDG_DETACH) {
2392 			error = netmap_bdg_ctl(nmr, NULL);
2393 			break;
2394 		} else if (i != 0) {
2395 			D("nr_cmd must be 0 not %d", i);
2396 			error = EINVAL;
2397 			break;
2398 		}
2399 
2400 		/* protect access to priv from concurrent NIOCREGIF */
2401 		NMG_LOCK();
2402 		do {
2403 			u_int memflags;
2404 
2405 			if (priv->np_ifp != NULL) {	/* thread already registered */
2406 				error = netmap_set_ringid(priv, nmr->nr_ringid);
2407 				break;
2408 			}
2409 			/* find the interface and a reference */
2410 			error = get_ifp(nmr, &ifp, 1 /* create */); /* keep reference */
2411 			if (error)
2412 				break;
2413 			if (NETMAP_OWNED_BY_KERN(ifp)) {
2414 				nm_if_rele(ifp);
2415 				error = EBUSY;
2416 				break;
2417 			}
2418 			nifp = netmap_do_regif(priv, ifp, nmr->nr_ringid, &error);
2419 			if (!nifp) {    /* reg. failed, release priv and ref */
2420 				nm_if_rele(ifp);        /* return the refcount */
2421 				priv->np_ifp = NULL;
2422 				priv->np_nifp = NULL;
2423 				break;
2424 			}
2425 
2426 			/* return the offset of the netmap_if object */
2427 			na = NA(ifp); /* retrieve netmap adapter */
2428 			nmr->nr_rx_rings = na->num_rx_rings;
2429 			nmr->nr_tx_rings = na->num_tx_rings;
2430 			nmr->nr_rx_slots = na->num_rx_desc;
2431 			nmr->nr_tx_slots = na->num_tx_desc;
2432 			error = netmap_mem_get_info(na->nm_mem, &nmr->nr_memsize, &memflags);
2433 			if (error) {
2434 				nm_if_rele(ifp);
2435 				break;
2436 			}
2437 			if (memflags & NETMAP_MEM_PRIVATE) {
2438 				nmr->nr_ringid |= NETMAP_PRIV_MEM;
2439 				*(uint32_t *)(uintptr_t)&nifp->ni_flags |= NI_PRIV_MEM;
2440 			}
2441 			nmr->nr_offset = netmap_mem_if_offset(na->nm_mem, nifp);
2442 		} while (0);
2443 		NMG_UNLOCK();
2444 		break;
2445 
2446 	case NIOCUNREGIF:
2447 		// XXX we have no data here ?
2448 		D("deprecated, data is %p", nmr);
2449 		error = EINVAL;
2450 		break;
2451 
2452 	case NIOCTXSYNC:
2453 	case NIOCRXSYNC:
2454 		nifp = priv->np_nifp;
2455 
2456 		if (nifp == NULL) {
2457 			error = ENXIO;
2458 			break;
2459 		}
2460 		rmb(); /* make sure following reads are not from cache */
2461 
2462 		ifp = priv->np_ifp;	/* we have a reference */
2463 
2464 		if (ifp == NULL) {
2465 			D("Internal error: nifp != NULL && ifp == NULL");
2466 			error = ENXIO;
2467 			break;
2468 		}
2469 
2470 		na = NA(ifp); /* retrieve netmap adapter */
2471 		if (priv->np_qfirst == NETMAP_SW_RING) { /* host rings */
2472 			if (cmd == NIOCTXSYNC)
2473 				netmap_txsync_to_host(na);
2474 			else
2475 				netmap_rxsync_from_host(na, NULL, NULL);
2476 			break;
2477 		}
2478 		/* find the last ring to scan */
2479 		lim = priv->np_qlast;
2480 		if (lim == NETMAP_HW_RING)
2481 			lim = (cmd == NIOCTXSYNC) ?
2482 			    na->num_tx_rings : na->num_rx_rings;
2483 
2484 		krings = (cmd == NIOCTXSYNC) ? na->tx_rings : na->rx_rings;
2485 		for (i = priv->np_qfirst; i < lim; i++) {
2486 			struct netmap_kring *kring = krings + i;
2487 			if (nm_kr_tryget(kring)) {
2488 				error = EBUSY;
2489 				goto out;
2490 			}
2491 			if (cmd == NIOCTXSYNC) {
2492 				if (netmap_verbose & NM_VERB_TXSYNC)
2493 					D("pre txsync ring %d cur %d hwcur %d",
2494 					    i, kring->ring->cur,
2495 					    kring->nr_hwcur);
2496 				na->nm_txsync(ifp, i, NAF_FORCE_RECLAIM);
2497 				if (netmap_verbose & NM_VERB_TXSYNC)
2498 					D("post txsync ring %d cur %d hwcur %d",
2499 					    i, kring->ring->cur,
2500 					    kring->nr_hwcur);
2501 			} else {
2502 				na->nm_rxsync(ifp, i, NAF_FORCE_READ);
2503 				microtime(&na->rx_rings[i].ring->ts);
2504 			}
2505 			nm_kr_put(kring);
2506 		}
2507 
2508 		break;
2509 
2510 #ifdef __FreeBSD__
2511 	case BIOCIMMEDIATE:
2512 	case BIOCGHDRCMPLT:
2513 	case BIOCSHDRCMPLT:
2514 	case BIOCSSEESENT:
2515 		D("ignore BIOCIMMEDIATE/BIOCGHDRCMPLT/BIOCSHDRCMPLT/BIOCSSEESENT");
2516 		break;
2517 
2518 	default:	/* allow device-specific ioctls */
2519 	    {
2520 		struct socket so;
2521 
2522 		bzero(&so, sizeof(so));
2523 		NMG_LOCK();
2524 		error = get_ifp(nmr, &ifp, 0 /* don't create */); /* keep reference */
2525 		if (error) {
2526 			NMG_UNLOCK();
2527 			break;
2528 		}
2529 		so.so_vnet = ifp->if_vnet;
2530 		// so->so_proto not null.
2531 		error = ifioctl(&so, cmd, data, td);
2532 		nm_if_rele(ifp);
2533 		NMG_UNLOCK();
2534 		break;
2535 	    }
2536 
2537 #else /* linux */
2538 	default:
2539 		error = EOPNOTSUPP;
2540 #endif /* linux */
2541 	}
2542 out:
2543 
2544 	CURVNET_RESTORE();
2545 	return (error);
2546 }
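
/*
 * Illustrative sketch (not part of the build): the usual userspace
 * sequence driving the ioctls above.  Error handling omitted.
 *
 *	int fd = open("/dev/netmap", O_RDWR);
 *	struct nmreq nmr;
 *
 *	bzero(&nmr, sizeof(nmr));
 *	nmr.nr_version = NETMAP_API;
 *	strncpy(nmr.nr_name, "em0", sizeof(nmr.nr_name));
 *	ioctl(fd, NIOCREGIF, &nmr);	// switch em0 to netmap mode
 *	void *mem = mmap(NULL, nmr.nr_memsize,
 *	    PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
 *	struct netmap_if *nifp = NETMAP_IF(mem, nmr.nr_offset);
 *	ioctl(fd, NIOCTXSYNC, NULL);	// flush pending transmissions
 *
 * NETMAP_IF() is the userspace helper from the companion header; it is
 * mentioned here only as an assumption about the userspace API.
 */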
2547 
2548 
2549 /*
2550  * select(2) and poll(2) handlers for the "netmap" device.
2551  *
2552  * Can be called for one or more queues.
2553  * Return the event mask corresponding to ready events.
2554  * If there are no ready events, do a selrecord on either individual
2555  * selinfo or on the global one.
2556  * Device-dependent parts (locking and sync of tx/rx rings)
2557  * are done through callbacks.
2558  *
2559  * On linux, the arguments are really pwait, the poll table, and 'td' is
2560  * struct file *. The first one is remapped to pwait as selrecord() uses
2561  * the name as a hidden argument.
2562  */
2563 static int
2564 netmap_poll(struct cdev *dev, int events, struct thread *td)
2565 {
2566 	struct netmap_priv_d *priv = NULL;
2567 	struct netmap_adapter *na;
2568 	struct ifnet *ifp;
2569 	struct netmap_kring *kring;
2570 	u_int i, check_all_tx, check_all_rx, want_tx, want_rx, revents = 0;
2571 	u_int lim_tx, lim_rx, host_forwarded = 0;
2572 	struct mbq q = { NULL, NULL, 0 };
2573 	void *pwait = dev;	/* linux compatibility */
2574 
2575 	int retry_tx = 1;
2576 
2577 	(void)pwait;
2578 
2579 	if (devfs_get_cdevpriv((void **)&priv) != 0 || priv == NULL)
2580 		return POLLERR;
2581 
2582 	if (priv->np_nifp == NULL) {
2583 		D("No if registered");
2584 		return POLLERR;
2585 	}
2586 	rmb(); /* make sure following reads are not from cache */
2587 
2588 	ifp = priv->np_ifp;
2589 	// XXX check for deleting() ?
2590 	if ( (ifp->if_capenable & IFCAP_NETMAP) == 0)
2591 		return POLLERR;
2592 
2593 	if (netmap_verbose & 0x8000)
2594 		D("device %s events 0x%x", ifp->if_xname, events);
2595 	want_tx = events & (POLLOUT | POLLWRNORM);
2596 	want_rx = events & (POLLIN | POLLRDNORM);
2597 
2598 	na = NA(ifp); /* retrieve netmap adapter */
2599 
2600 	lim_tx = na->num_tx_rings;
2601 	lim_rx = na->num_rx_rings;
2602 
2603 	if (priv->np_qfirst == NETMAP_SW_RING) {
2604 		/* handle the host stack ring */
2605 		if (priv->np_txpoll || want_tx) {
2606 			/* push any packets up, then we are always ready */
2607 			netmap_txsync_to_host(na);
2608 			revents |= want_tx;
2609 		}
2610 		if (want_rx) {
2611 			kring = &na->rx_rings[lim_rx];
2612 			if (kring->ring->avail == 0)
2613 				netmap_rxsync_from_host(na, td, dev);
2614 			if (kring->ring->avail > 0) {
2615 				revents |= want_rx;
2616 			}
2617 		}
2618 		return (revents);
2619 	}
2620 
2621 	/* if we are in transparent mode, check also the host rx ring */
2622 	kring = &na->rx_rings[lim_rx];
2623 	if ( (priv->np_qlast == NETMAP_HW_RING) // XXX check_all
2624 			&& want_rx
2625 			&& (netmap_fwd || kring->ring->flags & NR_FORWARD) ) {
2626 		if (kring->ring->avail == 0)
2627 			netmap_rxsync_from_host(na, td, dev);
2628 		if (kring->ring->avail > 0)
2629 			revents |= want_rx;
2630 	}
2631 
2632 	/*
2633 	 * check_all is set if the card has more than one queue AND
2634 	 * the client is polling all of them. If true, we sleep on
2635 	 * the "global" selinfo, otherwise we sleep on individual selinfo
2636 	 * (FreeBSD only allows two selinfo's per file descriptor).
2637 	 * The interrupt routine in the driver wakes one or the other
2638 	 * (or both) depending on which clients are active.
2639 	 *
2640 	 * rxsync() is only called if we run out of buffers on a POLLIN.
2641 	 * txsync() is called if we run out of buffers on POLLOUT, or
2642 	 * there are pending packets to send. The latter can be disabled
2643 	 * passing NETMAP_NO_TX_POLL in the NIOCREG call.
2644 	 */
2645 	check_all_tx = (priv->np_qlast == NETMAP_HW_RING) && (lim_tx > 1);
2646 	check_all_rx = (priv->np_qlast == NETMAP_HW_RING) && (lim_rx > 1);
2647 
2648 	if (priv->np_qlast != NETMAP_HW_RING) {
2649 		lim_tx = lim_rx = priv->np_qlast;
2650 	}
2651 
2652 	/*
2653 	 * We start with a lock free round which is good if we have
2654 	 * data available. If this fails, then lock and call the sync
2655 	 * routines.
2656 	 */
2657 	for (i = priv->np_qfirst; want_rx && i < lim_rx; i++) {
2658 		kring = &na->rx_rings[i];
2659 		if (kring->ring->avail > 0) {
2660 			revents |= want_rx;
2661 			want_rx = 0;	/* also breaks the loop */
2662 		}
2663 	}
2664 	for (i = priv->np_qfirst; want_tx && i < lim_tx; i++) {
2665 		kring = &na->tx_rings[i];
2666 		if (kring->ring->avail > 0) {
2667 			revents |= want_tx;
2668 			want_tx = 0;	/* also breaks the loop */
2669 		}
2670 	}
2671 
2672 	/*
2673 	 * If we need to push packets out (priv->np_txpoll) or want_tx is
2674 	 * still set, we do need to run the txsync calls (on all rings,
2675 	 * to avoid that the tx rings stall).
2676 	 */
2677 	if (priv->np_txpoll || want_tx) {
2678 		/* If we really want to be woken up (want_tx),
2679 		 * do a selrecord, either on the global or on
2680 		 * the private structure.  Then issue the txsync
2681 		 * so there is no race in the selrecord/selwait
2682 		 */
2683 flush_tx:
2684 		for (i = priv->np_qfirst; i < lim_tx; i++) {
2685 			kring = &na->tx_rings[i];
2686 			/*
2687 			 * Skip this ring if want_tx == 0
2688 			 * (we have already done a successful sync on
2689 			 * a previous ring) AND kring->cur == kring->hwcur
2690 			 * (there are no pending transmissions for this ring).
2691 			 */
2692 			if (!want_tx && kring->ring->cur == kring->nr_hwcur)
2693 				continue;
2694 			/* make sure only one user thread is doing this */
2695 			if (nm_kr_tryget(kring)) {
2696 				ND("ring %p busy is %d", kring, (int)kring->nr_busy);
2697 				revents |= POLLERR;
2698 				goto out;
2699 			}
2700 
2701 			if (netmap_verbose & NM_VERB_TXSYNC)
2702 				D("send %d on %s %d",
2703 					kring->ring->cur, ifp->if_xname, i);
2704 			if (na->nm_txsync(ifp, i, 0))
2705 				revents |= POLLERR;
2706 
2707 			/* Check avail/call selrecord only if called with POLLOUT */
2708 			if (want_tx) {
2709 				if (kring->ring->avail > 0) {
2710 					/* stop at the first ring. We don't risk
2711 					 * starvation.
2712 					 */
2713 					revents |= want_tx;
2714 					want_tx = 0;
2715 				}
2716 			}
2717 			nm_kr_put(kring);
2718 		}
2719 		if (want_tx && retry_tx) {
2720 			selrecord(td, check_all_tx ?
2721 			    &na->tx_si : &na->tx_rings[priv->np_qfirst].si);
2722 			retry_tx = 0;
2723 			goto flush_tx;
2724 		}
2725 	}
2726 
2727 	/*
2728 	 * now if want_rx is still set we need to lock and rxsync.
2729 	 * Do it on all rings because otherwise we starve.
2730 	 */
2731 	if (want_rx) {
2732 		int retry_rx = 1;
2733 do_retry_rx:
2734 		for (i = priv->np_qfirst; i < lim_rx; i++) {
2735 			kring = &na->rx_rings[i];
2736 
2737 			if (nm_kr_tryget(kring)) {
2738 				revents |= POLLERR;
2739 				goto out;
2740 			}
2741 
2742 			/* XXX NR_FORWARD should only be read on
2743 			 * physical or NIC ports
2744 			 */
2745 			if (netmap_fwd ||kring->ring->flags & NR_FORWARD) {
2746 				ND(10, "forwarding some buffers up %d to %d",
2747 				    kring->nr_hwcur, kring->ring->cur);
2748 				netmap_grab_packets(kring, &q, netmap_fwd);
2749 			}
2750 
2751 			if (na->nm_rxsync(ifp, i, 0))
2752 				revents |= POLLERR;
2753 			if (netmap_no_timestamp == 0 ||
2754 					kring->ring->flags & NR_TIMESTAMP) {
2755 				microtime(&kring->ring->ts);
2756 			}
2757 
2758 			if (kring->ring->avail > 0) {
2759 				revents |= want_rx;
2760 				retry_rx = 0;
2761 			}
2762 			nm_kr_put(kring);
2763 		}
2764 		if (retry_rx) {
2765 			retry_rx = 0;
2766 			selrecord(td, check_all_rx ?
2767 			    &na->rx_si : &na->rx_rings[priv->np_qfirst].si);
2768 			goto do_retry_rx;
2769 		}
2770 	}
2771 
2772 	/* forward host to the netmap ring.
2773 	 * I am accessing nr_hwavail without lock, but netmap_transmit
2774 	 * can only increment it, so the operation is safe.
2775 	 */
2776 	kring = &na->rx_rings[lim_rx];
2777 	if ( (priv->np_qlast == NETMAP_HW_RING) // XXX check_all
2778 			&& (netmap_fwd || kring->ring->flags & NR_FORWARD)
2779 			 && kring->nr_hwavail > 0 && !host_forwarded) {
2780 		netmap_sw_to_nic(na);
2781 		host_forwarded = 1; /* prevent another pass */
2782 		want_rx = 0;
2783 		goto flush_tx;
2784 	}
2785 
2786 	if (q.head)
2787 		netmap_send_up(na->ifp, q.head);
2788 
2789 out:
2790 
2791 	return (revents);
2792 }
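
/*
 * Illustrative sketch (not part of the build): a userspace receive
 * loop built on the poll() semantics above.
 *
 *	struct pollfd pfd = { .fd = fd, .events = POLLIN };
 *
 *	for (;;) {
 *		poll(&pfd, 1, -1);	// sleeps until an rx ring has slots
 *		// scan the bound rx rings, consume ring->avail slots,
 *		// advance ring->cur, then loop; the next poll() will
 *		// run rxsync for us once avail drops to zero.
 *	}
 */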
2793 
2794 /*------- driver support routines ------*/
2795 
2796 
2797 /*
2798  * Initialize a ``netmap_adapter`` object created by driver on attach.
2799  * We allocate a block of memory with room for a struct netmap_adapter
2800  * plus two sets of N+2 struct netmap_kring (where N is the number
2801  * of hardware rings):
2802  * krings	0..N-1	are for the hardware queues.
2803  * kring	N	is for the host stack queue
2804  * kring	N+1	is only used for the selinfo for all queues.
2805  * Return 0 on success, ENOMEM otherwise.
2806  *
2807  * By default the receive and transmit adapter ring counts are both initialized
2808  * to num_queues.  na->num_tx_rings can be set for cards with different tx/rx
2809  * setups.
2810  */
2811 int
2812 netmap_attach(struct netmap_adapter *arg, u_int num_queues)
2813 {
2814 	struct netmap_adapter *na = NULL;
2815 	struct ifnet *ifp = arg ? arg->ifp : NULL;
2816 	size_t len;
2817 
2818 	if (arg == NULL || ifp == NULL)
2819 		goto fail;
2820 	/* a VALE port uses two endpoints */
2821 	len = nma_is_vp(arg) ? sizeof(*na) : sizeof(*na) * 2;
2822 	na = malloc(len, M_DEVBUF, M_NOWAIT | M_ZERO);
2823 	if (na == NULL)
2824 		goto fail;
2825 	WNA(ifp) = na;
2826 	*na = *arg; /* copy everything, trust the driver to not pass junk */
2827 	NETMAP_SET_CAPABLE(ifp);
2828 	if (na->num_tx_rings == 0)
2829 		na->num_tx_rings = num_queues;
2830 	na->num_rx_rings = num_queues;
2831 	na->refcount = na->na_single = na->na_multi = 0;
2832 	/* Core lock initialized here, others after netmap_if_new. */
2833 	mtx_init(&na->core_lock, "netmap core lock", MTX_NETWORK_LOCK, MTX_DEF);
2834 #ifdef linux
2835 	if (ifp->netdev_ops) {
2836 		ND("netdev_ops %p", ifp->netdev_ops);
2837 		/* prepare a clone of the netdev ops */
2838 #if LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 28)
2839 		na->nm_ndo.ndo_start_xmit = ifp->netdev_ops;
2840 #else
2841 		na->nm_ndo = *ifp->netdev_ops;
2842 #endif
2843 	}
2844 	na->nm_ndo.ndo_start_xmit = linux_netmap_start_xmit;
2845 #endif /* linux */
2846 	na->nm_mem = arg->nm_mem ? arg->nm_mem : &nm_mem;
2847 	if (!nma_is_vp(arg))
2848 		netmap_attach_sw(ifp);
2849 	D("success for %s", ifp->if_xname);
2850 	return 0;
2851 
2852 fail:
2853 	D("fail, arg %p ifp %p na %p", arg, ifp, na);
2854 	netmap_detach(ifp);
2855 	return (na ? EINVAL : ENOMEM);
2856 }
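
/*
 * Illustrative sketch (not part of the build): a driver attach routine
 * typically fills a netmap_adapter on the stack and passes it here.
 * Field names come from this file; 'adapter' (the driver softc) and the
 * foo_* callbacks are hypothetical.
 *
 *	struct netmap_adapter na;
 *
 *	bzero(&na, sizeof(na));
 *	na.ifp = ifp;
 *	na.num_tx_desc = adapter->num_tx_desc;	// hw ring sizes
 *	na.num_rx_desc = adapter->num_rx_desc;
 *	na.nm_register = foo_netmap_reg;
 *	na.nm_txsync = foo_netmap_txsync;
 *	na.nm_rxsync = foo_netmap_rxsync;
 *	netmap_attach(&na, adapter->num_queues);
 */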
2857 
2858 
2859 /*
2860  * Free the allocated memory linked to the given ``netmap_adapter``
2861  * object.
2862  */
2863 void
2864 netmap_detach(struct ifnet *ifp)
2865 {
2866 	struct netmap_adapter *na = NA(ifp);
2867 
2868 	if (!na)
2869 		return;
2870 
2871 	mtx_destroy(&na->core_lock);
2872 
2873 	if (na->tx_rings) { /* XXX should not happen */
2874 		D("freeing leftover tx_rings");
2875 		free(na->tx_rings, M_DEVBUF);
2876 	}
2877 	if (na->na_flags & NAF_MEM_OWNER)
2878 		netmap_mem_private_delete(na->nm_mem);
2879 	bzero(na, sizeof(*na));
2880 	WNA(ifp) = NULL;
2881 	free(na, M_DEVBUF);
2882 }
2883 
2884 
2885 int
2886 nm_bdg_flush(struct nm_bdg_fwd *ft, u_int n,
2887 	struct netmap_adapter *na, u_int ring_nr);
2888 
2889 
2890 /*
2891  * Intercept packets from the network stack and pass them
2892  * to netmap as incoming packets on the 'software' ring.
2893  * We rely on the OS to make sure that the ifp and na do not go
2894  * away (typically the caller checks for IFF_DRV_RUNNING or the like).
2895  * In nm_register() or whenever there is a reinitialization,
2896  * we make sure to access the core lock and per-ring locks
2897  * so that IFCAP_NETMAP is visible here.
2898  */
2899 int
2900 netmap_transmit(struct ifnet *ifp, struct mbuf *m)
2901 {
2902 	struct netmap_adapter *na = NA(ifp);
2903 	struct netmap_kring *kring;
2904 	u_int i, len = MBUF_LEN(m);
2905 	u_int error = EBUSY, lim;
2906 	struct netmap_slot *slot;
2907 
2908 	// XXX [Linux] we do not need this lock
2909 	// if we follow the down/configure/up protocol -gl
2910 	// mtx_lock(&na->core_lock);
2911 	if ( (ifp->if_capenable & IFCAP_NETMAP) == 0) {
2912 		/* interface not in netmap mode anymore */
2913 		error = ENXIO;
2914 		goto done;
2915 	}
2916 
2917 	kring = &na->rx_rings[na->num_rx_rings];
2918 	lim = kring->nkr_num_slots - 1;
2919 	if (netmap_verbose & NM_VERB_HOST)
2920 		D("%s packet %d len %d from the stack", ifp->if_xname,
2921 			kring->nr_hwcur + kring->nr_hwavail, len);
2922 	// XXX reconsider long packets if we handle fragments
2923 	if (len > NETMAP_BDG_BUF_SIZE(na->nm_mem)) { /* too long for us */
2924 		D("%s from_host, drop packet size %d > %d", ifp->if_xname,
2925 			len, NETMAP_BDG_BUF_SIZE(na->nm_mem));
2926 		goto done;
2927 	}
2928 	if (SWNA(ifp)->na_bdg) {
2929 		struct nm_bdg_fwd *ft;
2930 		char *dst;
2931 
2932 		na = SWNA(ifp); /* we operate on the host port */
2933 		ft = na->rx_rings[0].nkr_ft;
2934 		dst = BDG_NMB(na->nm_mem, &na->rx_rings[0].ring->slot[0]);
2935 
2936 		/* use slot 0 in the ft, there is nothing queued here */
2937 		/* XXX we can save the copy calling m_copydata in nm_bdg_flush,
2938 		 * need a special flag for this.
2939 		 */
2940 		m_copydata(m, 0, (int)len, dst);
2941 		ft->ft_flags = 0;
2942 		ft->ft_len = len;
2943 		ft->ft_buf = dst;
2944 		ft->ft_next = NM_FT_NULL;
2945 		ft->ft_frags = 1;
2946 		if (netmap_verbose & NM_VERB_HOST)
2947 			RD(5, "pkt %p size %d to bridge port %d",
2948 				dst, len, na->bdg_port);
2949 		nm_bdg_flush(ft, 1, na, 0);
2950 		na = NA(ifp);	/* back to the regular object/lock */
2951 		error = 0;
2952 		goto done;
2953 	}
2954 
2955 	/* protect against other instances of netmap_transmit,
2956 	 * and userspace invocations of rxsync().
2957 	 * XXX could reuse core_lock
2958 	 */
2959 	// XXX [Linux] there can be no other instances of netmap_transmit
2960 	// on this same ring, but we still need this lock to protect
2961 	// concurrent access from netmap_sw_to_nic() -gl
2962 	mtx_lock(&kring->q_lock);
2963 	if (kring->nr_hwavail >= lim) {
2964 		if (netmap_verbose)
2965 			D("stack ring %s full\n", ifp->if_xname);
2966 	} else {
2967 		/* compute the insert position */
2968 		i = nm_kr_rxpos(kring);
2969 		slot = &kring->ring->slot[i];
2970 		m_copydata(m, 0, (int)len, BDG_NMB(na->nm_mem, slot));
2971 		slot->len = len;
2972 		slot->flags = kring->nkr_slot_flags;
2973 		kring->nr_hwavail++;
2974 		if (netmap_verbose  & NM_VERB_HOST)
2975 			D("wake up host ring %s %d", na->ifp->if_xname, na->num_rx_rings);
2976 		selwakeuppri(&kring->si, PI_NET);
2977 		error = 0;
2978 	}
2979 	mtx_unlock(&kring->q_lock);
2980 
2981 done:
2982 	// mtx_unlock(&na->core_lock);
2983 
2984 	/* release the mbuf in either case, success or failure. As an
2985 	 * alternative, put the mbuf in a free list and free the list
2986 	 * only when really necessary.
2987 	 */
2988 	m_freem(m);
2989 
2990 	return (error);
2991 }
2992 
2993 
2994 /*
2995  * netmap_reset() is called by the driver routines when reinitializing
2996  * a ring. The driver is in charge of locking to protect the kring.
2997  * If netmap mode is not set just return NULL.
2998  */
2999 struct netmap_slot *
3000 netmap_reset(struct netmap_adapter *na, enum txrx tx, u_int n,
3001 	u_int new_cur)
3002 {
3003 	struct netmap_kring *kring;
3004 	int new_hwofs, lim;
3005 
3006 	if (na == NULL) {
3007 		D("NULL na, should not happen");
3008 		return NULL;	/* no netmap support here */
3009 	}
3010 	if (!(na->ifp->if_capenable & IFCAP_NETMAP)) {
3011 		ND("interface not in netmap mode");
3012 		return NULL;	/* nothing to reinitialize */
3013 	}
3014 
3015 	/* XXX note- in the new scheme, we are not guaranteed to be
3016 	 * under lock (e.g. when called on a device reset).
3017 	 * In this case, we should set a flag and do not trust too
3018 	 * much the values. In practice: TODO
3019 	 * - set a RESET flag somewhere in the kring
3020 	 * - do the processing in a conservative way
3021 	 * - let the *sync() fixup at the end.
3022 	 */
3023 	if (tx == NR_TX) {
3024 		if (n >= na->num_tx_rings)
3025 			return NULL;
3026 		kring = na->tx_rings + n;
3027 		new_hwofs = kring->nr_hwcur - new_cur;
3028 	} else {
3029 		if (n >= na->num_rx_rings)
3030 			return NULL;
3031 		kring = na->rx_rings + n;
3032 		new_hwofs = kring->nr_hwcur + kring->nr_hwavail - new_cur;
3033 	}
3034 	lim = kring->nkr_num_slots - 1;
3035 	if (new_hwofs > lim)
3036 		new_hwofs -= lim + 1;
3037 
3038 	/* Always set the new offset value and realign the ring. */
3039 	D("%s hwofs %d -> %d, hwavail %d -> %d",
3040 		tx == NR_TX ? "TX" : "RX",
3041 		kring->nkr_hwofs, new_hwofs,
3042 		kring->nr_hwavail,
3043 		tx == NR_TX ? lim : kring->nr_hwavail);
3044 	kring->nkr_hwofs = new_hwofs;
3045 	if (tx == NR_TX)
3046 		kring->nr_hwavail = lim;
3047 
3048 #if 0 // def linux
3049 	/* XXX check that the mappings are correct */
3050 	/* need ring_nr, adapter->pdev, direction */
3051 	buffer_info->dma = dma_map_single(&pdev->dev, addr, adapter->rx_buffer_len, DMA_FROM_DEVICE);
3052 	if (dma_mapping_error(&adapter->pdev->dev, buffer_info->dma)) {
3053 		D("error mapping rx netmap buffer %d", i);
3054 		// XXX fix error handling
3055 	}
3056 
3057 #endif /* linux */
3058 	/*
3059 	 * Wakeup on the individual and global selwait
3060 	 * We do the wakeup here, but the ring is not yet reconfigured.
3061 	 * However, we are under lock so there are no races.
3062 	 */
3063 	selwakeuppri(&kring->si, PI_NET);
3064 	selwakeuppri(tx == NR_TX ? &na->tx_si : &na->rx_si, PI_NET);
3065 	return kring->ring->slot;
3066 }
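
/*
 * Illustrative sketch (not part of the build): a driver ring-init
 * routine uses the returned slot array to reprogram its descriptors
 * with the netmap buffers, e.g. for an RX ring:
 *
 *	struct netmap_slot *slot = netmap_reset(na, NR_RX, ring_nr, 0);
 *
 *	if (slot != NULL) {		// we are in netmap mode
 *		for (j = 0; j < nslots; j++) {
 *			// translate slot[j].buf_idx to a physical address
 *			// and write it into the NIC's rx descriptor j
 *			// (drivers normally use a buffer-lookup helper
 *			// such as PNMB(); mentioned here as an assumption).
 *		}
 *	}
 */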
3067 
3068 
3069 /*
3070  * Grab packets from a kring, move them into the ft structure
3071  * associated with the tx (input) port. Max one instance per port,
3072  * filtered on input (ioctl, poll or XXX).
3073  * Returns the next position in the ring.
3074  */
3075 static int
3076 nm_bdg_preflush(struct netmap_adapter *na, u_int ring_nr,
3077 	struct netmap_kring *kring, u_int end)
3078 {
3079 	struct netmap_ring *ring = kring->ring;
3080 	struct nm_bdg_fwd *ft;
3081 	u_int j = kring->nr_hwcur, lim = kring->nkr_num_slots - 1;
3082 	u_int ft_i = 0;	/* start from 0 */
3083 	u_int frags = 1; /* how many frags ? */
3084 	struct nm_bridge *b = na->na_bdg;
3085 
3086 	/* To protect against modifications to the bridge we acquire a
3087 	 * shared lock, waiting if we can sleep (if the source port is
3088 	 * attached to a user process) or with a trylock otherwise (NICs).
3089 	 */
3090 	ND("wait rlock for %d packets", ((j > end ? lim+1 : 0) + end) - j);
3091 	if (na->na_flags & NAF_BDG_MAYSLEEP)
3092 		BDG_RLOCK(b);
3093 	else if (!BDG_RTRYLOCK(b))
3094 		return 0;
3095 	ND(5, "rlock acquired for %d packets", ((j > end ? lim+1 : 0) + end) - j);
3096 	ft = kring->nkr_ft;
3097 
3098 	for (; likely(j != end); j = nm_next(j, lim)) {
3099 		struct netmap_slot *slot = &ring->slot[j];
3100 		char *buf;
3101 
3102 		ft[ft_i].ft_len = slot->len;
3103 		ft[ft_i].ft_flags = slot->flags;
3104 
3105 		ND("flags is 0x%x", slot->flags);
3106 		/* this slot goes into a list so initialize the link field */
3107 		ft[ft_i].ft_next = NM_FT_NULL;
3108 		buf = ft[ft_i].ft_buf = (slot->flags & NS_INDIRECT) ?
3109 			(void *)(uintptr_t)slot->ptr : BDG_NMB(na->nm_mem, slot);
3110 		prefetch(buf);
3111 		++ft_i;
3112 		if (slot->flags & NS_MOREFRAG) {
3113 			frags++;
3114 			continue;
3115 		}
3116 		if (unlikely(netmap_verbose && frags > 1))
3117 			RD(5, "%d frags at %d", frags, ft_i - frags);
3118 		ft[ft_i - frags].ft_frags = frags;
3119 		frags = 1;
3120 		if (unlikely((int)ft_i >= bridge_batch))
3121 			ft_i = nm_bdg_flush(ft, ft_i, na, ring_nr);
3122 	}
3123 	if (frags > 1) {
3124 		D("truncate incomplete fragment at %d (%d frags)", ft_i, frags);
3125 		// ft_i > 0, ft[ft_i-1].flags has NS_MOREFRAG
3126 		ft[ft_i - 1].ft_frags &= ~NS_MOREFRAG;
3127 		ft[ft_i - frags].ft_frags = frags - 1;
3128 	}
3129 	if (ft_i)
3130 		ft_i = nm_bdg_flush(ft, ft_i, na, ring_nr);
3131 	BDG_RUNLOCK(b);
3132 	return j;
3133 }
3134 
3135 
3136 /*
3137  * Pass packets from nic to the bridge.
3138  * XXX TODO check locking: this is called from the interrupt
3139  * handler so we should make sure that the interface is not
3140  * disconnected while passing down an interrupt.
3141  *
3142  * Note, no user process can access this NIC so we can ignore
3143  * the info in the 'ring'.
3144  */
3145 static void
3146 netmap_nic_to_bdg(struct ifnet *ifp, u_int ring_nr)
3147 {
3148 	struct netmap_adapter *na = NA(ifp);
3149 	struct netmap_kring *kring = &na->rx_rings[ring_nr];
3150 	struct netmap_ring *ring = kring->ring;
3151 	u_int j, k;
3152 
3153 	/* make sure that only one thread is ever in here,
3154 	 * after which we can unlock. Probably unnecessary XXX.
3155 	 */
3156 	if (nm_kr_tryget(kring))
3157 		return;
3158 	/* fetch packets that have arrived.
3159 	 * XXX maybe do this in a loop ?
3160 	 */
3161 	if (na->nm_rxsync(ifp, ring_nr, 0))
3162 		goto put_out;
3163 	if (kring->nr_hwavail == 0 && netmap_verbose) {
3164 		D("how strange, interrupt with no packets on %s",
3165 			ifp->if_xname);
3166 		goto put_out;
3167 	}
3168 	k = nm_kr_rxpos(kring);
3169 
3170 	j = nm_bdg_preflush(na, ring_nr, kring, k);
3171 
3172 	/* we consume everything, but we cannot update kring directly
3173 	 * because the nic may have destroyed the info in the NIC ring.
3174 	 * So we need to call rxsync again to restore it.
3175 	 */
3176 	ring->cur = j;
3177 	ring->avail = 0;
3178 	na->nm_rxsync(ifp, ring_nr, 0);
3179 
3180 put_out:
3181 	nm_kr_put(kring);
3182 	return;
3183 }
3184 
3185 
3186 /*
3187  * Default functions to handle rx/tx interrupts from a physical device.
3188  * "work_done" is non-null on the RX path, NULL for the TX path.
3189  * We rely on the OS to make sure that there is only one active
3190  * instance per queue, and that there is appropriate locking.
3191  *
3192  * If the card is not in netmap mode, simply return 0,
3193  * so that the caller proceeds with regular processing.
3194  *
3195  * If the card is connected to a netmap file descriptor,
3196  * do a selwakeup on the individual queue, plus one on the global one
3197  * if needed (multiqueue card _and_ there are multiqueue listeners),
3198  * and return 1.
3199  *
3200  * Finally, if called on rx from an interface connected to a switch,
3201  * we call the proper forwarding routine and return 1.
3202  */
3203 int
3204 netmap_rx_irq(struct ifnet *ifp, u_int q, u_int *work_done)
3205 {
3206 	struct netmap_adapter *na;
3207 	struct netmap_kring *kring;
3208 
3209 	if (!(ifp->if_capenable & IFCAP_NETMAP))
3210 		return 0;
3211 
3212 	q &= NETMAP_RING_MASK;
3213 
3214 	if (netmap_verbose)
3215 		RD(5, "received %s queue %d", work_done ? "RX" : "TX" , q);
3216 	na = NA(ifp);
3217 	if (na->na_flags & NAF_SKIP_INTR) {
3218 		ND("use regular interrupt");
3219 		return 0;
3220 	}
3221 
3222 	if (work_done) { /* RX path */
3223 		if (q >= na->num_rx_rings)
3224 			return 0;	// not a physical queue
3225 		kring = na->rx_rings + q;
3226 		kring->nr_kflags |= NKR_PENDINTR;	// XXX atomic ?
3227 		if (na->na_bdg != NULL) {
3228 			netmap_nic_to_bdg(ifp, q);
3229 		} else {
3230 			selwakeuppri(&kring->si, PI_NET);
3231 			if (na->num_rx_rings > 1 /* or multiple listeners */ )
3232 				selwakeuppri(&na->rx_si, PI_NET);
3233 		}
3234 		*work_done = 1; /* do not fire napi again */
3235 	} else { /* TX path */
3236 		if (q >= na->num_tx_rings)
3237 			return 0;	// not a physical queue
3238 		kring = na->tx_rings + q;
3239 		selwakeuppri(&kring->si, PI_NET);
3240 		if (na->num_tx_rings > 1 /* or multiple listeners */ )
3241 			selwakeuppri(&na->tx_si, PI_NET);
3242 	}
3243 	return 1;
3244 }
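
/*
 * Illustrative sketch (not part of the build): the usual hook in a
 * driver's rx interrupt handler, which lets netmap take over the
 * queue when the interface is in netmap mode:
 *
 *	if (netmap_rx_irq(adapter->ifp, rxr->me, &work_done))
 *		return;			// netmap handled the event
 *	// ... regular mbuf-based rx processing follows ...
 *
 * 'adapter' and 'rxr' are hypothetical driver structures; the important
 * part is the early return when the function returns nonzero.
 */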
3245 
3246 
3247 #ifdef linux	/* linux-specific routines */
3248 
3249 
3250 /*
3251  * Remap linux arguments into the FreeBSD call.
3252  * - pwait is the poll table, passed as 'dev';
3253  *   If pwait == NULL someone else already woke up before. We can report
3254  *   events but they are filtered upstream.
3255  *   If pwait != NULL, then pwait->key contains the list of events.
3256  * - events is computed from pwait as above.
3257  * - file is passed as 'td';
3258  */
3259 static u_int
3260 linux_netmap_poll(struct file * file, struct poll_table_struct *pwait)
3261 {
3262 #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,28)
3263 	int events = POLLIN | POLLOUT; /* XXX maybe... */
3264 #elif LINUX_VERSION_CODE < KERNEL_VERSION(3,4,0)
3265 	int events = pwait ? pwait->key : POLLIN | POLLOUT;
3266 #else /* in 3.4.0 field 'key' was renamed to '_key' */
3267 	int events = pwait ? pwait->_key : POLLIN | POLLOUT;
3268 #endif
3269 	return netmap_poll((void *)pwait, events, (void *)file);
3270 }
3271 
3272 
3273 static int
3274 linux_netmap_mmap(struct file *f, struct vm_area_struct *vma)
3275 {
3276 	int error = 0;
3277 	unsigned long off, va;
3278 	vm_ooffset_t pa;
3279 	struct netmap_priv_d *priv = f->private_data;
3280 	/*
3281 	 * vma->vm_start: start of mapping user address space
3282 	 * vma->vm_end: end of the mapping user address space
3283 	 * vma->vm_pgoff: offset of the first page in the device
3284 	 */
3285 
3286 	// XXX security checks
3287 
3288 	error = netmap_get_memory(priv);
3289 	ND("get_memory returned %d", error);
3290 	if (error)
3291 	    return -error;
3292 
3293 	if ((vma->vm_start & ~PAGE_MASK) || (vma->vm_end & ~PAGE_MASK)) {
3294 		ND("vm_start = %lx vm_end = %lx", vma->vm_start, vma->vm_end);
3295 		return -EINVAL;
3296 	}
3297 
3298 	for (va = vma->vm_start, off = vma->vm_pgoff;
3299 	     va < vma->vm_end;
3300 	     va += PAGE_SIZE, off++)
3301 	{
3302 		pa = netmap_mem_ofstophys(priv->np_mref, off << PAGE_SHIFT);
3303 		if (pa == 0)
3304 			return -EINVAL;
3305 
3306 		ND("va %lx pa %p", va, pa);
3307 		error = remap_pfn_range(vma, va, pa >> PAGE_SHIFT, PAGE_SIZE, vma->vm_page_prot);
3308 		if (error)
3309 			return error;
3310 	}
3311 	return 0;
3312 }
3313 
3314 
3315 /*
3316  * This one is probably already protected by the netif lock XXX
3317  */
3318 static netdev_tx_t
3319 linux_netmap_start_xmit(struct sk_buff *skb, struct net_device *dev)
3320 {
3321 	netmap_transmit(dev, skb);
3322 	return (NETDEV_TX_OK);
3323 }
3324 
3325 
3326 #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,36)	// XXX was 37
3327 #define LIN_IOCTL_NAME	.ioctl
3328 int
3329 linux_netmap_ioctl(struct inode *inode, struct file *file, u_int cmd, u_long data /* arg */)
3330 #else
3331 #define LIN_IOCTL_NAME	.unlocked_ioctl
3332 long
3333 linux_netmap_ioctl(struct file *file, u_int cmd, u_long data /* arg */)
3334 #endif
3335 {
3336 	int ret;
3337 	struct nmreq nmr;
3338 	bzero(&nmr, sizeof(nmr));
3339 
3340 	if (cmd == NIOCTXSYNC || cmd == NIOCRXSYNC) {
3341 		data = 0;	/* no argument required here */
3342 	}
3343 	if (data && copy_from_user(&nmr, (void *)data, sizeof(nmr) ) != 0)
3344 		return -EFAULT;
3345 	ret = netmap_ioctl(NULL, cmd, (caddr_t)&nmr, 0, (void *)file);
3346 	if (data && copy_to_user((void*)data, &nmr, sizeof(nmr) ) != 0)
3347 		return -EFAULT;
3348 	return -ret;
3349 }
3350 
3351 
3352 static int
3353 netmap_release(struct inode *inode, struct file *file)
3354 {
3355 	(void)inode;	/* UNUSED */
3356 	if (file->private_data)
3357 		netmap_dtor(file->private_data);
3358 	return (0);
3359 }
3360 
3361 
3362 static int
3363 linux_netmap_open(struct inode *inode, struct file *file)
3364 {
3365 	struct netmap_priv_d *priv;
3366 	(void)inode;	/* UNUSED */
3367 
3368 	priv = malloc(sizeof(struct netmap_priv_d), M_DEVBUF,
3369 			      M_NOWAIT | M_ZERO);
3370 	if (priv == NULL)
3371 		return -ENOMEM;
3372 
3373 	file->private_data = priv;
3374 
3375 	return (0);
3376 }
3377 
3378 
3379 static struct file_operations netmap_fops = {
3380     .owner = THIS_MODULE,
3381     .open = linux_netmap_open,
3382     .mmap = linux_netmap_mmap,
3383     LIN_IOCTL_NAME = linux_netmap_ioctl,
3384     .poll = linux_netmap_poll,
3385     .release = netmap_release,
3386 };
3387 
3388 
3389 static struct miscdevice netmap_cdevsw = {	/* same name as FreeBSD */
3390 	MISC_DYNAMIC_MINOR,
3391 	"netmap",
3392 	&netmap_fops,
3393 };
3394 
3395 static int netmap_init(void);
3396 static void netmap_fini(void);
3397 
3398 
3399 /* Errors have negative values on linux */
3400 static int linux_netmap_init(void)
3401 {
3402 	return -netmap_init();
3403 }
3404 
3405 module_init(linux_netmap_init);
3406 module_exit(netmap_fini);
3407 /* export certain symbols to other modules */
3408 EXPORT_SYMBOL(netmap_attach);		// driver attach routines
3409 EXPORT_SYMBOL(netmap_detach);		// driver detach routines
3410 EXPORT_SYMBOL(netmap_ring_reinit);	// ring init on error
3411 EXPORT_SYMBOL(netmap_buffer_lut);
3412 EXPORT_SYMBOL(netmap_total_buffers);	// index check
3413 EXPORT_SYMBOL(netmap_buffer_base);
3414 EXPORT_SYMBOL(netmap_reset);		// ring init routines
3415 EXPORT_SYMBOL(netmap_buf_size);
3416 EXPORT_SYMBOL(netmap_rx_irq);		// default irq handler
3417 EXPORT_SYMBOL(netmap_no_pendintr);	// XXX mitigation - should go away
3418 EXPORT_SYMBOL(netmap_bdg_ctl);		// bridge configuration routine
3419 EXPORT_SYMBOL(netmap_bdg_learning);	// the default lookup function
3420 EXPORT_SYMBOL(netmap_disable_all_rings);
3421 EXPORT_SYMBOL(netmap_enable_all_rings);
3422 
3423 
3424 MODULE_AUTHOR("http://info.iet.unipi.it/~luigi/netmap/");
3425 MODULE_DESCRIPTION("The netmap packet I/O framework");
3426 MODULE_LICENSE("Dual BSD/GPL"); /* the code here is all BSD. */
3427 
3428 #else /* __FreeBSD__ */
3429 
3430 
3431 static struct cdevsw netmap_cdevsw = {
3432 	.d_version = D_VERSION,
3433 	.d_name = "netmap",
3434 	.d_open = netmap_open,
3435 	.d_mmap_single = netmap_mmap_single,
3436 	.d_ioctl = netmap_ioctl,
3437 	.d_poll = netmap_poll,
3438 	.d_close = netmap_close,
3439 };
3440 #endif /* __FreeBSD__ */
3441 
3442 /*
3443  *---- support for virtual bridge -----
3444  */
3445 
3446 /* ----- FreeBSD if_bridge hash function ------- */
3447 
3448 /*
3449  * The following hash function is adapted from "Hash Functions" by Bob Jenkins
3450  * ("Algorithm Alley", Dr. Dobbs Journal, September 1997).
3451  *
3452  * http://www.burtleburtle.net/bob/hash/spooky.html
3453  */
3454 #define mix(a, b, c)                                                    \
3455 do {                                                                    \
3456         a -= b; a -= c; a ^= (c >> 13);                                 \
3457         b -= c; b -= a; b ^= (a << 8);                                  \
3458         c -= a; c -= b; c ^= (b >> 13);                                 \
3459         a -= b; a -= c; a ^= (c >> 12);                                 \
3460         b -= c; b -= a; b ^= (a << 16);                                 \
3461         c -= a; c -= b; c ^= (b >> 5);                                  \
3462         a -= b; a -= c; a ^= (c >> 3);                                  \
3463         b -= c; b -= a; b ^= (a << 10);                                 \
3464         c -= a; c -= b; c ^= (b >> 15);                                 \
3465 } while (/*CONSTCOND*/0)
3466 
3467 static __inline uint32_t
3468 nm_bridge_rthash(const uint8_t *addr)
3469 {
3470         uint32_t a = 0x9e3779b9, b = 0x9e3779b9, c = 0; // hash key
3471 
3472         b += addr[5] << 8;
3473         b += addr[4];
3474         a += addr[3] << 24;
3475         a += addr[2] << 16;
3476         a += addr[1] << 8;
3477         a += addr[0];
3478 
3479         mix(a, b, c);
3480 #define BRIDGE_RTHASH_MASK	(NM_BDG_HASH-1)
3481         return (c & BRIDGE_RTHASH_MASK);
3482 }
3483 
3484 #undef mix
3485 
3486 
3487 static int
3488 bdg_netmap_reg(struct ifnet *ifp, int onoff)
3489 {
3490 	/* the interface is already attached to the bridge,
3491 	 * so we only need to toggle IFCAP_NETMAP.
3492 	 */
3493 	if (onoff) {
3494 		ifp->if_capenable |= IFCAP_NETMAP;
3495 	} else {
3496 		ifp->if_capenable &= ~IFCAP_NETMAP;
3497 	}
3498 	return 0;
3499 }
3500 
3501 
3502 /*
3503  * Lookup function for a learning bridge.
3504  * Update the hash table with the source address,
3505  * and then return the destination port index and the
3506  * ring in *dst_ring (at the moment, ring 0 is always used).
3507  */
3508 u_int
3509 netmap_bdg_learning(char *buf, u_int buf_len, uint8_t *dst_ring,
3510 		struct netmap_adapter *na)
3511 {
3512 	struct nm_hash_ent *ht = na->na_bdg->ht;
3513 	uint32_t sh, dh;
3514 	u_int dst, mysrc = na->bdg_port;
3515 	uint64_t smac, dmac;
3516 
3517 	if (buf_len < 14) {
3518 		D("invalid buf length %d", buf_len);
3519 		return NM_BDG_NOPORT;
3520 	}
3521 	dmac = le64toh(*(uint64_t *)(buf)) & 0xffffffffffff;
3522 	smac = le64toh(*(uint64_t *)(buf + 4));
3523 	smac >>= 16;
3524 
3525 	/*
3526 	 * The hash is somewhat expensive; there might be some
3527 	 * worthwhile optimizations here.
3528 	 */
3529 	if ((buf[6] & 1) == 0) { /* valid src */
3530 		uint8_t *s = buf+6;
3531 		sh = nm_bridge_rthash(s); // XXX hash of source
3532 		/* update source port forwarding entry */
3533 		ht[sh].mac = smac;	/* XXX expire ? */
3534 		ht[sh].ports = mysrc;
3535 		if (netmap_verbose)
3536 		    D("src %02x:%02x:%02x:%02x:%02x:%02x on port %d",
3537 			s[0], s[1], s[2], s[3], s[4], s[5], mysrc);
3538 	}
3539 	dst = NM_BDG_BROADCAST;
3540 	if ((buf[0] & 1) == 0) { /* unicast */
3541 		dh = nm_bridge_rthash(buf); // XXX hash of dst
3542 		if (ht[dh].mac == dmac) {	/* found dst */
3543 			dst = ht[dh].ports;
3544 		}
3545 		/* XXX otherwise return NM_BDG_UNKNOWN ? */
3546 	}
3547 	*dst_ring = 0;
3548 	return dst;
3549 }
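
/*
 * Illustration only: how a caller (e.g. nm_bdg_flush() below, through
 * b->nm_bdg_lookup) consumes the return value.  buf and len are
 * hypothetical names here; in the real code they come from the
 * forwarding table entries ft[i].
 *
 *	uint8_t dst_ring = 0;
 *	u_int dst_port = netmap_bdg_learning(buf, len, &dst_ring, na);
 *	if (dst_port == NM_BDG_NOPORT)
 *		;	// malformed frame, drop it
 *	else if (dst_port == NM_BDG_BROADCAST)
 *		;	// flood to all other active ports, ring 0
 *	else
 *		;	// forward to b->bdg_ports[dst_port], ring *dst_ring
 *
 * The function reads 8 bytes at buf and at buf + 4 to recover the two
 * 48-bit addresses, which is why it insists on at least 14 valid bytes
 * (a complete Ethernet header).
 */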
3550 
3551 
3552 /*
3553  * This flush routine handles only unicast and broadcast forwarding,
3554  * but supports many ports and lets us replace the learn and dispatch functions.
3555  */
3556 int
3557 nm_bdg_flush(struct nm_bdg_fwd *ft, u_int n, struct netmap_adapter *na,
3558 		u_int ring_nr)
3559 {
3560 	struct nm_bdg_q *dst_ents, *brddst;
3561 	uint16_t num_dsts = 0, *dsts;
3562 	struct nm_bridge *b = na->na_bdg;
3563 	u_int i, j, me = na->bdg_port;
3564 
3565 	/*
3566 	 * The work area (pointed by ft) is followed by an array of
3567 	 * pointers to queues, dst_ents; there are NM_BDG_MAXRINGS
3568 	 * queues per port plus one for the broadcast traffic.
3569 	 * Then we have an array of destination indexes; see the sketch below.
3570 	 */
3571 	dst_ents = (struct nm_bdg_q *)(ft + NM_BDG_BATCH_MAX);
3572 	dsts = (uint16_t *)(dst_ents + NM_BDG_MAXPORTS * NM_BDG_MAXRINGS + 1);
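	/*
	 * Layout of the scratch pad implied by the two assignments above
	 * (element counts, not bytes):
	 *
	 *	ft[0 .. NM_BDG_BATCH_MAX-1]	packet descriptors of the batch
	 *	dst_ents[]	NM_BDG_MAXPORTS * NM_BDG_MAXRINGS queues
	 *			(one per port/ring pair) plus one for broadcast
	 *	dsts[]		indexes of the queues actually used by the batch
	 */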
3573 
3574 	/* first pass: find a destination for each packet in the batch */
3575 	for (i = 0; likely(i < n); i += ft[i].ft_frags) {
3576 		uint8_t dst_ring = ring_nr; /* default, same ring as origin */
3577 		uint16_t dst_port, d_i;
3578 		struct nm_bdg_q *d;
3579 
3580 		ND("slot %d frags %d", i, ft[i].ft_frags);
3581 		dst_port = b->nm_bdg_lookup(ft[i].ft_buf, ft[i].ft_len,
3582 			&dst_ring, na);
3583 		if (netmap_verbose > 255)
3584 			RD(5, "slot %d port %d -> %d", i, me, dst_port);
3585 		if (dst_port == NM_BDG_NOPORT)
3586 			continue; /* this packet is marked to be dropped */
3587 		else if (unlikely(dst_port > NM_BDG_MAXPORTS))
3588 			continue;
3589 		else if (dst_port == NM_BDG_BROADCAST)
3590 			dst_ring = 0; /* broadcasts always go to ring 0 */
3591 		else if (unlikely(dst_port == me ||
3592 		    !b->bdg_ports[dst_port]))
3593 			continue;
3594 
3595 		/* get a position in the scratch pad */
3596 		d_i = dst_port * NM_BDG_MAXRINGS + dst_ring;
3597 		d = dst_ents + d_i;
3598 
3599 		/* append the first fragment to the list */
3600 		if (d->bq_head == NM_FT_NULL) { /* new destination */
3601 			d->bq_head = d->bq_tail = i;
3602 			/* remember this position to be scanned later */
3603 			if (dst_port != NM_BDG_BROADCAST)
3604 				dsts[num_dsts++] = d_i;
3605 		} else {
3606 			ft[d->bq_tail].ft_next = i;
3607 			d->bq_tail = i;
3608 		}
3609 		d->bq_len += ft[i].ft_frags;
3610 	}
3611 
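
	/*
	 * Illustration only: each per-destination queue built above is a
	 * singly linked list threaded through ft[] itself, using slot
	 * indexes instead of pointers and NM_FT_NULL as terminator
	 * (assuming the producer initialized every ft_next to NM_FT_NULL).
	 * A consumer can walk one as in this sketch:
	 *
	 *	for (u_int s = d->bq_head; s != NM_FT_NULL; s = ft[s].ft_next) {
	 *		// ft[s] is the first fragment of a packet bound
	 *		// for this (port, ring) destination
	 *	}
	 */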
3612 	/*
3613 	 * Broadcast traffic goes to ring 0 on all destinations.
3614 	 * So we need to add these rings to the list of ports to scan.
3615 	 * The loop below uses bdg_port_index[], the compact list of
3616 	 * active ports, so only bdg_active_ports entries are scanned
3617 	 * instead of all NM_BDG_MAXPORTS possible ports.
3618 	 */
3619 	brddst = dst_ents + NM_BDG_BROADCAST * NM_BDG_MAXRINGS;
3620 	if (brddst->bq_head != NM_FT_NULL) {
3621 		for (j = 0; likely(j < b->bdg_active_ports); j++) {
3622 			uint16_t d_i;
3623 			i = b->bdg_port_index[j];
3624 			if (unlikely(i == me))
3625 				continue;
3626 			d_i = i * NM_BDG_MAXRINGS;
3627 			if (dst_ents[d_i].bq_head == NM_FT_NULL)
3628 				dsts[num_dsts++] = d_i;
3629 		}
3630 	}
3631 
3632 	ND(5, "pass 1 done %d pkts %d dsts", n, num_dsts);
3633 	/* second pass: scan destinations (XXX will be modular somehow) */
3634 	for (i = 0; i < num_dsts; i++) {
3635 		struct ifnet *dst_ifp;
3636 		struct netmap_adapter *dst_na;
3637 		struct netmap_kring *kring;
3638 		struct netmap_ring *ring;
3639 		u_int dst_nr, is_vp, lim, j, sent = 0, d_i, next, brd_next;
3640 		u_int needed, howmany;
3641 		int retry = netmap_txsync_retry;
3642 		struct nm_bdg_q *d;
3643 		uint32_t my_start = 0, lease_idx = 0;
3644 		int nrings;
3645 
3646 		d_i = dsts[i];
3647 		ND("second pass %d port %d", i, d_i);
3648 		d = dst_ents + d_i;
3649 		// XXX fix the division
3650 		dst_na = b->bdg_ports[d_i/NM_BDG_MAXRINGS];
3651 		/* protect from the lookup function returning an inactive
3652 		 * destination port
3653 		 */
3654 		if (unlikely(dst_na == NULL))
3655 			goto cleanup;
3656 		if (dst_na->na_flags & NAF_SW_ONLY)
3657 			goto cleanup;
3658 		dst_ifp = dst_na->ifp;
3659 		/*
3660 		 * The interface may be in !netmap mode in two cases:
3661 		 * - when na is attached but not activated yet;
3662 		 * - when na is being deactivated but is still attached.
3663 		 */
3664 		if (unlikely(!(dst_ifp->if_capenable & IFCAP_NETMAP))) {
3665 			ND("not in netmap mode!");
3666 			goto cleanup;
3667 		}
3668 
3669 		/* there is at least one either unicast or broadcast packet */
3670 		brd_next = brddst->bq_head;
3671 		next = d->bq_head;
3672 		/* we need to reserve this many slots. If fewer are
3673 		 * available, some packets will be dropped.
3674 		 * Packets may have multiple fragments, so there is a
3675 		 * chance that we will not use all of the slots we have
3676 		 * claimed; we will need to handle the leftover
3677 		 * ones when we regain the lock.
3678 		 */
3679 		needed = d->bq_len + brddst->bq_len;
3680 
3681 		is_vp = nma_is_vp(dst_na);
3682 		ND(5, "pass 2 dst %d is %x %s",
3683 			i, d_i, is_vp ? "virtual" : "nic/host");
3684 		dst_nr = d_i & (NM_BDG_MAXRINGS-1);
3685 		if (is_vp) { /* virtual port */
3686 			nrings = dst_na->num_rx_rings;
3687 		} else {
3688 			nrings = dst_na->num_tx_rings;
3689 		}
3690 		if (dst_nr >= nrings)
3691 			dst_nr = dst_nr % nrings;
3692 		kring = is_vp ?  &dst_na->rx_rings[dst_nr] :
3693 				&dst_na->tx_rings[dst_nr];
3694 		ring = kring->ring;
3695 		lim = kring->nkr_num_slots - 1;
3696 
3697 retry:
3698 
3699 		/* reserve the buffers in the queue and an entry
3700 		 * to report completion, and drop lock.
3701 		 * XXX this might become a helper function.
3702 		 */
3703 		mtx_lock(&kring->q_lock);
3704 		if (kring->nkr_stopped) {
3705 			mtx_unlock(&kring->q_lock);
3706 			goto cleanup;
3707 		}
3708 		/* on physical interfaces, do a txsync to recover
3709 		 * slots for packets already transmitted.
3710 		 * XXX maybe we could be optimistic and rely on a retry
3711 		 * in case of failure.
3712 		 */
3713 		if (nma_is_hw(dst_na)) {
3714 			dst_na->nm_txsync(dst_ifp, dst_nr, 0);
3715 		}
3716 		my_start = j = kring->nkr_hwlease;
3717 		howmany = nm_kr_space(kring, is_vp);
3718 		if (needed < howmany)
3719 			howmany = needed;
3720 		lease_idx = nm_kr_lease(kring, howmany, is_vp);
3721 		mtx_unlock(&kring->q_lock);
3722 
3723 		/* only retry if we need more than available slots */
3724 		if (retry && needed <= howmany)
3725 			retry = 0;
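
		/*
		 * Sketch of the lease protocol used here, assuming that
		 * nm_kr_space() returns the number of free slots and that
		 * nm_kr_lease() advances nkr_hwlease and returns an index
		 * into the nkr_leases[] array:
		 *
		 *	lock(q_lock)
		 *	my_start  = nkr_hwlease		   (start of my range)
		 *	howmany   = min(needed, free slots)
		 *	lease_idx = lease on [my_start, my_start + howmany)
		 *	unlock(q_lock)
		 *	... copy packets into the leased slots, lock not held ...
		 *	lock(q_lock)
		 *	nkr_leases[lease_idx] = final write position
		 *	(whoever completes the range starting at the current
		 *	 ring position advances it on behalf of everyone)
		 *	unlock(q_lock)
		 */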
3726 
3727 		/* copy to the destination queue */
3728 		while (howmany > 0) {
3729 			struct netmap_slot *slot;
3730 			struct nm_bdg_fwd *ft_p, *ft_end;
3731 			u_int cnt;
3732 
3733 			/* find the queue from which we pick the next packet.
3734 			 * NM_FT_NULL is always higher than valid indexes
3735 			 * so we never dereference it if the other list
3736 			 * has packets (and if both are empty we never
3737 			 * get here).
3738 			 */
3739 			if (next < brd_next) {
3740 				ft_p = ft + next;
3741 				next = ft_p->ft_next;
3742 			} else { /* insert broadcast */
3743 				ft_p = ft + brd_next;
3744 				brd_next = ft_p->ft_next;
3745 			}
3746 			cnt = ft_p->ft_frags; // cnt > 0
3747 			if (unlikely(cnt > howmany))
3748 			    break; /* no more space */
3749 			howmany -= cnt;
3750 			if (netmap_verbose && cnt > 1)
3751 				RD(5, "rx %d frags to %d", cnt, j);
3752 			ft_end = ft_p + cnt;
3753 			do {
3754 			    void *dst, *src = ft_p->ft_buf;
3755 			    /* copy length rounded up to a multiple of 64 */
3756 			    size_t len = (ft_p->ft_len + 63) & ~63;
3757 
3758 			    slot = &ring->slot[j];
3759 			    dst = BDG_NMB(dst_na->nm_mem, slot);
3760 
3761 			    ND("send %d %d bytes at %s:%d",
3762 				i, ft_p->ft_len, dst_ifp->if_xname, j);
3763 			    if (ft_p->ft_flags & NS_INDIRECT) {
3764 				if (copyin(src, dst, len)) {
3765 					// invalid user pointer, pretend len is 0
3766 					ft_p->ft_len = 0;
3767 				}
3768 			    } else {
3769 				//memcpy(dst, src, len);
3770 				pkt_copy(src, dst, (int)len);
3771 			    }
3772 			    slot->len = ft_p->ft_len;
3773 			    slot->flags = (cnt << 8)| NS_MOREFRAG;
3774 			    j = nm_next(j, lim);
3775 			    ft_p++;
3776 			    sent++;
3777 			} while (ft_p != ft_end);
3778 			slot->flags = (cnt << 8); /* clear flag on last entry */
3779 			/* are we done ? */
3780 			if (next == NM_FT_NULL && brd_next == NM_FT_NULL)
3781 				break;
3782 		}
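
		/*
		 * Illustration only: for a packet made of 3 fragments the
		 * copy loop above leaves the slot flags as
		 *
		 *	slot[j  ].flags = (3 << 8) | NS_MOREFRAG
		 *	slot[j+1].flags = (3 << 8) | NS_MOREFRAG
		 *	slot[j+2].flags = (3 << 8)	(last fragment)
		 *
		 * i.e. the fragment count travels in the high byte and
		 * NS_MOREFRAG is clear only on the final slot.
		 */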
3783 		{
3784 		    /* current position */
3785 		    uint32_t *p = kring->nkr_leases; /* shorthand */
3786 		    uint32_t update_pos;
3787 		    int still_locked = 1;
3788 
3789 		    mtx_lock(&kring->q_lock);
3790 		    if (unlikely(howmany > 0)) {
3791 			/* we have not used all the buffers. If I am the
3792 			 * last one I can recover the slots, otherwise I must
3793 			 * fill them with 0 to mark empty packets.
3794 			 */
3795 			ND("leftover %d bufs", howmany);
3796 			if (nm_next(lease_idx, lim) == kring->nkr_lease_idx) {
3797 			    /* yes, I am the last one */
3798 			    ND("roll back nkr_hwlease to %d", j);
3799 			    kring->nkr_hwlease = j;
3800 			} else {
3801 			    while (howmany-- > 0) {
3802 				ring->slot[j].len = 0;
3803 				ring->slot[j].flags = 0;
3804 				j = nm_next(j, lim);
3805 			    }
3806 			}
3807 		    }
3808 		    p[lease_idx] = j; /* report I am done */
3809 
3810 		    update_pos = is_vp ? nm_kr_rxpos(kring) : ring->cur;
3811 
3812 		    if (my_start == update_pos) {
3813 			/* all slots before my_start have been reported,
3814 			 * so scan subsequent leases to see if other ranges
3815 			 * have been completed, and do a selwakeup or txsync.
3816 			 */
3817 			while (lease_idx != kring->nkr_lease_idx &&
3818 				p[lease_idx] != NR_NOSLOT) {
3819 			    j = p[lease_idx];
3820 			    p[lease_idx] = NR_NOSLOT;
3821 			    lease_idx = nm_next(lease_idx, lim);
3822 			}
3823 			/* j is the new 'write' position. j != my_start
3824 			 * means there are new buffers to report
3825 			 */
3826 			if (likely(j != my_start)) {
3827 			    if (is_vp) {
3828 				uint32_t old_avail = kring->nr_hwavail;
3829 
3830 				kring->nr_hwavail = (j >= kring->nr_hwcur) ?
3831 					j - kring->nr_hwcur :
3832 					j + lim + 1 - kring->nr_hwcur;
3833 				if (kring->nr_hwavail < old_avail) {
3834 					D("avail shrink %d -> %d",
3835 						old_avail, kring->nr_hwavail);
3836 				}
3837 				still_locked = 0;
3838 				mtx_unlock(&kring->q_lock);
3839 				selwakeuppri(&kring->si, PI_NET);
3840 			    } else {
3841 				ring->cur = j;
3842 				/* XXX update avail ? */
3843 				still_locked = 0;
3844 				dst_na->nm_txsync(dst_ifp, dst_nr, 0);
3845 				mtx_unlock(&kring->q_lock);
3846 
3847 				/* retry to send more packets */
3848 				if (nma_is_hw(dst_na) && retry--)
3849 					goto retry;
3850 			    }
3851 			}
3852 		    }
3853 		    if (still_locked)
3854 			mtx_unlock(&kring->q_lock);
3855 		}
3856 cleanup:
3857 		d->bq_head = d->bq_tail = NM_FT_NULL; /* cleanup */
3858 		d->bq_len = 0;
3859 	}
3860 	brddst->bq_head = brddst->bq_tail = NM_FT_NULL; /* cleanup */
3861 	brddst->bq_len = 0;
3862 	return 0;
3863 }
3864 
3865 
3866 /*
3867  * Main dispatch routine for the bridge.
3868  * We already know that only one thread is running this.
3869  * We must run nm_bdg_preflush without holding the lock.
3870  */
3871 static int
3872 bdg_netmap_txsync(struct ifnet *ifp, u_int ring_nr, int flags)
3873 {
3874 	struct netmap_adapter *na = NA(ifp);
3875 	struct netmap_kring *kring = &na->tx_rings[ring_nr];
3876 	struct netmap_ring *ring = kring->ring;
3877 	u_int j, k, lim = kring->nkr_num_slots - 1;
3878 
3879 	k = ring->cur;
3880 	if (k > lim)
3881 		return netmap_ring_reinit(kring);
3882 
3883 	if (bridge_batch <= 0) { /* testing only */
3884 		j = k; // used all
3885 		goto done;
3886 	}
3887 	if (bridge_batch > NM_BDG_BATCH)
3888 		bridge_batch = NM_BDG_BATCH;
3889 
3890 	j = nm_bdg_preflush(na, ring_nr, kring, k);
3891 	if (j != k)
3892 		D("early break at %d/ %d, avail %d", j, k, kring->nr_hwavail);
3893 	/* k - j modulo the ring size is the number of slots not yet forwarded */
3894 	if (k < j)
3895 		k += kring->nkr_num_slots;
3896 	kring->nr_hwavail = lim - (k - j);
3897 
3898 done:
3899 	kring->nr_hwcur = j;
3900 	ring->avail = kring->nr_hwavail;
3901 	if (netmap_verbose)
3902 		D("%s ring %d flags %d", ifp->if_xname, ring_nr, flags);
3903 	return 0;
3904 }
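
/*
 * Worked example of the arithmetic above, with hypothetical numbers.
 * Assume nkr_num_slots = 256 (lim = 255), cur = k = 10, and
 * nm_bdg_preflush() stopped early at j = 250, leaving a wrapped tail of
 * unsent slots:
 *
 *	k < j, so k += 256		-> k = 266
 *	pending = k - j			-> 16 slots not yet forwarded
 *	nr_hwavail = lim - pending	-> 255 - 16 = 239 free slots
 *
 * When everything has been forwarded (j == k) the whole ring is free
 * again and nr_hwavail == lim.
 */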
3905 
3906 
3907 /*
3908  * user process reading from a VALE switch.
3909  * Already protected against concurrent calls from userspace,
3910  * but we must acquire the queue's lock to protect against
3911  * writers on the same queue.
3912  */
3913 static int
3914 bdg_netmap_rxsync(struct ifnet *ifp, u_int ring_nr, int flags)
3915 {
3916 	struct netmap_adapter *na = NA(ifp);
3917 	struct netmap_kring *kring = &na->rx_rings[ring_nr];
3918 	struct netmap_ring *ring = kring->ring;
3919 	u_int j, lim = kring->nkr_num_slots - 1;
3920 	u_int k = ring->cur, resvd = ring->reserved;
3921 	int n;
3922 
3923 	mtx_lock(&kring->q_lock);
3924 	if (k > lim) {
3925 		D("ouch dangerous reset!!!");
3926 		n = netmap_ring_reinit(kring);
3927 		goto done;
3928 	}
3929 
3930 	/* skip past packets that userspace has released */
3931 	j = kring->nr_hwcur;    /* netmap ring index */
3932 	if (resvd > 0) {
3933 		if (resvd + ring->avail >= lim + 1) {
3934 			D("XXX invalid reserve/avail %d %d", resvd, ring->avail);
3935 			ring->reserved = resvd = 0; // XXX panic...
3936 		}
3937 		k = (k >= resvd) ? k - resvd : k + lim + 1 - resvd;
3938 	}
3939 
3940 	if (j != k) { /* userspace has released some packets. */
3941 		n = k - j;
3942 		if (n < 0)
3943 			n += kring->nkr_num_slots;
3944 		ND("userspace releases %d packets", n);
3945 		for (n = 0; likely(j != k); n++) {
3946 			struct netmap_slot *slot = &ring->slot[j];
3947 			void *addr = BDG_NMB(na->nm_mem, slot);
3948 
3949 			if (addr == netmap_buffer_base) { /* bad buf */
3950 				D("bad buffer index %d, ignore ?",
3951 					slot->buf_idx);
3952 			}
3953 			slot->flags &= ~NS_BUF_CHANGED;
3954 			j = nm_next(j, lim);
3955 		}
3956 		kring->nr_hwavail -= n;
3957 		kring->nr_hwcur = k;
3958 	}
3959 	/* tell userspace that there are new packets */
3960 	ring->avail = kring->nr_hwavail - resvd;
3961 	n = 0;
3962 done:
3963 	mtx_unlock(&kring->q_lock);
3964 	return n;
3965 }
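
/*
 * Worked example of the reserved-slot adjustment above, with
 * hypothetical numbers.  Assume nkr_num_slots = 256 (lim = 255),
 * ring->cur = 3 and ring->reserved = 10: the application has advanced
 * cur but still holds on to the last 10 buffers, so the release point
 * becomes
 *
 *	k = 3 + 255 + 1 - 10 = 249	(cur - reserved, modulo 256)
 *
 * and slots 249..255 and 0..2 stay owned by userspace.  With cur = 20
 * and the same reserve, the first branch applies and k = 10.
 */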
3966 
3967 
3968 static int
3969 bdg_netmap_attach(struct netmap_adapter *arg)
3970 {
3971 	struct netmap_adapter na;
3972 
3973 	ND("attaching virtual bridge");
3974 	bzero(&na, sizeof(na));
3975 
3976 	na.ifp = arg->ifp;
3977 	na.na_flags = NAF_BDG_MAYSLEEP | NAF_MEM_OWNER;
3978 	na.num_tx_rings = arg->num_tx_rings;
3979 	na.num_rx_rings = arg->num_rx_rings;
3980 	na.num_tx_desc = arg->num_tx_desc;
3981 	na.num_rx_desc = arg->num_rx_desc;
3982 	na.nm_txsync = bdg_netmap_txsync;
3983 	na.nm_rxsync = bdg_netmap_rxsync;
3984 	na.nm_register = bdg_netmap_reg;
3985 	na.nm_mem = netmap_mem_private_new(arg->ifp->if_xname,
3986 			na.num_tx_rings, na.num_tx_desc,
3987 			na.num_rx_rings, na.num_rx_desc);
3988 	return netmap_attach(&na, na.num_tx_rings);
3989 }
3990 
3991 
3992 static struct cdev *netmap_dev; /* /dev/netmap character device. */
3993 
3994 
3995 /*
3996  * Module loader.
3997  *
3998  * Create the /dev/netmap device and initialize all global
3999  * variables.
4000  *
4001  * Return 0 on success, errno on failure.
4002  */
4003 static int
4004 netmap_init(void)
4005 {
4006 	int i, error;
4007 
4008 	NMG_LOCK_INIT();
4009 
4010 	error = netmap_mem_init();
4011 	if (error != 0) {
4012 		printf("netmap: unable to initialize the memory allocator.\n");
4013 		return (error);
4014 	}
4015 	printf("netmap: loaded module\n");
4016 	netmap_dev = make_dev(&netmap_cdevsw, 0, UID_ROOT, GID_WHEEL, 0660,
4017 			      "netmap");
4018 
4019 	bzero(nm_bridges, sizeof(struct nm_bridge) * NM_BRIDGES); /* safety */
4020 	for (i = 0; i < NM_BRIDGES; i++)
4021 		BDG_RWINIT(&nm_bridges[i]);
4022 	return (error);
4023 }
4024 
4025 
4026 /*
4027  * Module unloader.
4028  *
4029  * Free all the memory, and destroy the /dev/netmap device.
4030  */
4031 static void
4032 netmap_fini(void)
4033 {
4034 	destroy_dev(netmap_dev);
4035 	netmap_mem_fini();
4036 	NMG_LOCK_DESTROY();
4037 	printf("netmap: unloaded module.\n");
4038 }
4039 
4040 
4041 #ifdef __FreeBSD__
4042 /*
4043  * Kernel entry point.
4044  *
4045  * Initialize/finalize the module and return.
4046  *
4047  * Return 0 on success, errno on failure.
4048  */
4049 static int
4050 netmap_loader(__unused struct module *module, int event, __unused void *arg)
4051 {
4052 	int error = 0;
4053 
4054 	switch (event) {
4055 	case MOD_LOAD:
4056 		error = netmap_init();
4057 		break;
4058 
4059 	case MOD_UNLOAD:
4060 		netmap_fini();
4061 		break;
4062 
4063 	default:
4064 		error = EOPNOTSUPP;
4065 		break;
4066 	}
4067 
4068 	return (error);
4069 }
4070 
4071 
4072 DEV_MODULE(netmap, netmap_loader, NULL);
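
/*
 * On FreeBSD the module is typically loaded at boot or run time
 * ("kldload netmap" when built as a module, or "device netmap" in the
 * kernel configuration); the exact setup depends on the FreeBSD version.
 */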
4073 #endif /* __FreeBSD__ */
4074