1 /*
2  * Copyright (C) 2011-2014 Matteo Landi, Luigi Rizzo. All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions
6  * are met:
7  *   1. Redistributions of source code must retain the above copyright
8  *      notice, this list of conditions and the following disclaimer.
9  *   2. Redistributions in binary form must reproduce the above copyright
10  *      notice, this list of conditions and the following disclaimer in the
11  *      documentation and/or other materials provided with the distribution.
12  *
13  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
14  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
16  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
17  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
18  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
19  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
20  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
21  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
22  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
23  * SUCH DAMAGE.
24  */
25 
26 
27 /*
28  * $FreeBSD$
29  *
30  * This module supports memory mapped access to network devices,
31  * see netmap(4).
32  *
33  * The module uses a large memory pool allocated by the kernel
34  * and accessible as mmapped memory by multiple userspace threads/processes.
35  * The memory pool contains packet buffers and "netmap rings",
36  * i.e. user-accessible copies of the interface's queues.
37  *
38  * Access to the network card works like this:
39  * 1. a process/thread issues one or more open() on /dev/netmap, to create
40  *    a select()able file descriptor on which events are reported.
41  * 2. on each descriptor, the process issues an ioctl() to identify
42  *    the interface that should report events to the file descriptor.
43  * 3. on each descriptor, the process issues an mmap() request to
44  *    map the shared memory region within the process' address space.
45  *    The list of interesting queues is indicated by a location in
46  *    the shared memory region.
47  * 4. using the functions in the netmap(4) userspace API, a process
48  *    can look up the occupation state of a queue, access memory buffers,
49  *    and retrieve received packets or enqueue packets to transmit.
50  * 5. using some ioctl()s the process can synchronize the userspace view
51  *    of the queue with the actual status in the kernel. This includes both
52  *    receiving the notification of new packets, and transmitting new
53  *    packets on the output interface.
54  * 6. select() or poll() can be used to wait for events on individual
55  *    transmit or receive queues (or all queues for a given interface).
56  *
57 
58 		SYNCHRONIZATION (USER)
59 
60 The netmap rings and data structures may be shared among multiple
61 user threads or even independent processes.
62 Any synchronization among those threads/processes is delegated
63 to the threads themselves. Only one thread at a time can be in
64 a system call on the same netmap ring. The OS does not enforce
65 this and only guarantees against system crashes in case of
66 invalid usage.
67 
68 		LOCKING (INTERNAL)
69 
70 Within the kernel, access to the netmap rings is protected as follows:
71 
72 - a spinlock on each ring, to handle producer/consumer races on
73   RX rings attached to the host stack (against multiple host
74   threads writing from the host stack to the same ring),
75   and on 'destination' rings attached to a VALE switch
76   (i.e. RX rings in VALE ports, and TX rings in NIC/host ports)
77   protecting multiple active senders for the same destination.
78 
79 - an atomic variable to guarantee that there is at most one
80   instance of *_*xsync() on the ring at any time.
81   For rings connected to user file
82   descriptors, an atomic_test_and_set() protects this, and the
83   lock on the ring is not actually used.
84   For NIC RX rings connected to a VALE switch, an atomic_test_and_set()
85   is also used to prevent multiple executions (the driver might indeed
86   already guarantee this).
87   For NIC TX rings connected to a VALE switch, the lock arbitrates
88   access to the queue (both when allocating buffers and when pushing
89   them out).
90 
91 - *xsync() should be protected against initializations of the card.
92   On FreeBSD most devices have the reset routine protected by
93   a RING lock (ixgbe, igb, em) or core lock (re). lem is missing
94   the RING protection on rx_reset(); this should be added.
95 
96   On linux there is an external lock on the tx path, which probably
97   also arbitrates access to the reset routine. XXX to be revised
98 
99 - a per-interface core_lock protecting access from the host stack
100   while interfaces may be detached from netmap mode.
101   XXX there should be no need for this lock if we detach the interfaces
102   only while they are down.
103 
104 
105 --- VALE SWITCH ---
106 
107 NMG_LOCK() serializes all modifications to switches and ports.
108 A switch cannot be deleted until all ports are gone.
109 
110 For each switch, an SX lock (RWlock on linux) protects
111 deletion of ports. When configuring a new port or deleting an existing one, the
112 lock is acquired in exclusive mode (after holding NMG_LOCK).
113 When forwarding, the lock is acquired in shared mode (without NMG_LOCK).
114 The lock is held throughout the entire forwarding cycle,
115 during which the thread may incur a page fault.
116 Hence it is important that sleepable shared locks are used.
117 
118 On the rx ring, the per-port lock is grabbed initially to reserve
119 a number of slots in the ring, then the lock is released,
120 packets are copied from source to destination, and then
121 the lock is acquired again and the receive ring is updated.
122 (A similar thing is done on the tx ring for NIC and host stack
123 ports attached to the switch)
124 
125  */
126 
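/*
 * As an illustration of steps 1-6 above, a minimal userspace transmit
 * loop looks roughly like the sketch below (untested, error handling
 * omitted; "em0" is an example interface and build_frame() is a
 * placeholder for application code that fills a buffer and returns
 * the frame length):
 *
 *	int fd = open("/dev/netmap", O_RDWR);			// step 1
 *	struct nmreq req;
 *
 *	memset(&req, 0, sizeof(req));
 *	req.nr_version = NETMAP_API;
 *	strncpy(req.nr_name, "em0", sizeof(req.nr_name));
 *	ioctl(fd, NIOCREGIF, &req);				// step 2
 *
 *	void *mem = mmap(NULL, req.nr_memsize,
 *	    PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);		// step 3
 *	struct netmap_if *nifp = NETMAP_IF(mem, req.nr_offset);
 *	struct netmap_ring *txring = NETMAP_TXRING(nifp, 0);
 *
 *	for (;;) {
 *		struct pollfd pfd = { .fd = fd, .events = POLLOUT };
 *
 *		poll(&pfd, 1, -1);				// step 6
 *		while (!nm_ring_empty(txring)) {		// step 4
 *			uint32_t i = txring->cur;
 *			struct netmap_slot *slot = &txring->slot[i];
 *			char *buf = NETMAP_BUF(txring, slot->buf_idx);
 *
 *			slot->len = build_frame(buf);
 *			txring->head = txring->cur = nm_ring_next(txring, i);
 *		}
 *		ioctl(fd, NIOCTXSYNC, NULL);			// step 5
 *	}
 */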
127 
128 /* --- internals ----
129  *
130  * Roadmap to the code that implements the above.
131  *
132  * > 1. a process/thread issues one or more open() on /dev/netmap, to create
133  * >    a select()able file descriptor on which events are reported.
134  *
135  *  	Internally, we allocate a netmap_priv_d structure, which will be
136  *  	initialized on ioctl(NIOCREGIF).
137  *
138  *      os-specific:
139  *  	    FreeBSD: netmap_open (netmap_freebsd.c). The priv is
140  *  		     per-thread.
141  *  	    linux:   linux_netmap_open (netmap_linux.c). The priv is
142  *  		     per-open.
143  *
144  * > 2. on each descriptor, the process issues an ioctl() to identify
145  * >    the interface that should report events to the file descriptor.
146  *
147  * 	Implemented by netmap_ioctl(), NIOCREGIF case, with nmr->nr_cmd==0.
148  * 	Most important things happen in netmap_get_na() and
149  * 	netmap_do_regif(), called from there. Additional details can be
150  * 	found in the comments above those functions.
151  *
152  * 	In all cases, this action creates/takes-a-reference-to a
153  * 	netmap_*_adapter describing the port, and allocates a netmap_if
154  * 	and all necessary netmap rings, filling them with netmap buffers.
155  *
156  *      In this phase, the sync callbacks for each ring are set (these are used
157  *      in steps 5 and 6 below).  The callbacks depend on the type of adapter.
158  *      The adapter creation/initialization code puts them in the
159  * 	netmap_adapter (fields na->nm_txsync and na->nm_rxsync).  Then, they
160  * 	are copied from there to the netmap_kring's during netmap_do_regif(), by
161  * 	the nm_krings_create() callback.  All the nm_krings_create callbacks
162  * 	actually call netmap_krings_create() to perform this and the other
163  * 	common stuff. netmap_krings_create() also takes care of the host rings,
164  * 	if needed, by setting their sync callbacks appropriately.
165  *
166  * 	Additional actions depend on the kind of netmap_adapter that has been
167  * 	registered:
168  *
169  * 	- netmap_hw_adapter:  	     [netmap.c]
170  * 	     This is a system netdev/ifp with native netmap support.
171  * 	     The ifp is detached from the host stack by redirecting:
172  * 	       - transmissions (from the network stack) to netmap_transmit()
173  * 	       - receive notifications to the nm_notify() callback for
174  * 	         this adapter. The callback is normally netmap_notify(), unless
175  * 	         the ifp is attached to a bridge using bwrap, in which case it
176  * 	         is netmap_bwrap_intr_notify().
177  *
178  * 	- netmap_generic_adapter:      [netmap_generic.c]
179  * 	      A system netdev/ifp without native netmap support.
180  *
181  * 	(the decision about native/non native support is taken in
182  * 	 netmap_get_hw_na(), called by netmap_get_na())
183  *
184  * 	- netmap_vp_adapter 		[netmap_vale.c]
185  * 	      Returned by netmap_get_bdg_na().
186  * 	      This is a persistent or ephemeral VALE port. Ephemeral ports
187  * 	      are created on the fly if they don't already exist, and are
188  * 	      always attached to a bridge.
189  * 	      Persistent VALE ports must be created separately, and
190  * 	      then attached like normal NICs. The NIOCREGIF we are examining
191  * 	      will find them only if they had previously been created and
192  * 	      attached (see VALE_CTL below).
193  *
194  * 	- netmap_pipe_adapter 	      [netmap_pipe.c]
195  * 	      Returned by netmap_get_pipe_na().
196  * 	      Both pipe ends are created, if they didn't already exist.
197  *
198  * 	- netmap_monitor_adapter      [netmap_monitor.c]
199  * 	      Returned by netmap_get_monitor_na().
200  * 	      If successful, the nm_sync callbacks of the monitored adapter
201  * 	      will be intercepted by the returned monitor.
202  *
203  * 	- netmap_bwrap_adapter	      [netmap_vale.c]
204  * 	      Cannot be obtained in this way, see VALE_CTL below
205  *
206  *
207  * 	os-specific:
208  * 	    linux: we first go through linux_netmap_ioctl() to
209  * 	           adapt the FreeBSD interface to the linux one.
210  *
211  *
212  * > 3. on each descriptor, the process issues an mmap() request to
213  * >    map the shared memory region within the process' address space.
214  * >    The list of interesting queues is indicated by a location in
215  * >    the shared memory region.
216  *
217  *      os-specific:
218  *  	    FreeBSD: netmap_mmap_single (netmap_freebsd.c).
219  *  	    linux:   linux_netmap_mmap (netmap_linux.c).
220  *
221  * > 4. using the functions in the netmap(4) userspace API, a process
222  * >    can look up the occupation state of a queue, access memory buffers,
223  * >    and retrieve received packets or enqueue packets to transmit.
224  *
225  * 	These actions do not involve the kernel.
226  *
227  * > 5. using some ioctl()s the process can synchronize the userspace view
228  * >    of the queue with the actual status in the kernel. This includes both
229  * >    receiving the notification of new packets, and transmitting new
230  * >    packets on the output interface.
231  *
232  * 	These are implemented in netmap_ioctl(), NIOCTXSYNC and NIOCRXSYNC
233  * 	cases. They invoke the nm_sync callbacks on the netmap_kring
234  * 	structures, as initialized in step 2 and maybe later modified
235  * 	by a monitor. Monitors, however, will always call the original
236  * 	callback before doing anything else.
237  *
238  *
239  * > 6. select() or poll() can be used to wait for events on individual
240  * >    transmit or receive queues (or all queues for a given interface).
241  *
242  * 	Implemented in netmap_poll(). This will call the same nm_sync()
243  * 	callbacks as in step 5 above.
244  *
245  * 	os-specific:
246  * 		linux: we first go through linux_netmap_poll() to adapt
247  * 		       the FreeBSD interface to the linux one.
248  *
249  *
250  *  ----  VALE_CTL -----
251  *
252  *  VALE switches are controlled by issuing a NIOCREGIF with a non-null
253  *  nr_cmd in the nmreq structure. These subcommands are handled by
254  *  netmap_bdg_ctl() in netmap_vale.c. Persistent VALE ports are created
255  *  and destroyed by issuing the NETMAP_BDG_NEWIF and NETMAP_BDG_DELIF
256  *  subcommands, respectively.
257  *
258  *  Any network interface known to the system (including a persistent VALE
259  *  port) can be attached to a VALE switch by issuing the
260  *  NETMAP_BDG_ATTACH subcommand. After the attachment, persistent VALE ports
261  *  look exactly like ephemeral VALE ports (as created in step 2 above).  The
262  *  attachment of other interfaces, instead, requires the creation of a
263  *  netmap_bwrap_adapter.  Moreover, the attached interface must be put in
264  *  netmap mode. This may require the creation of a netmap_generic_adapter if
265  *  we have no native support for the interface, or if generic adapters have
266  *  been forced by sysctl.
267  *
268  *  Both persistent VALE ports and bwraps are handled by netmap_get_bdg_na(),
269  *  called by nm_bdg_ctl_attach(), and discriminated by the nm_bdg_attach()
270  *  callback.  In the case of the bwrap, the callback creates the
271  *  netmap_bwrap_adapter.  The initialization of the bwrap is then
272  *  completed by calling netmap_do_regif() on it, in the nm_bdg_ctl()
273  *  callback (netmap_bwrap_bdg_ctl in netmap_vale.c).
274  *  A generic adapter for the wrapped ifp will be created if needed, when
275  *  netmap_get_bdg_na() calls netmap_get_hw_na().
276  *
277  *
278  *  ---- DATAPATHS -----
279  *
280  *              -= SYSTEM DEVICE WITH NATIVE SUPPORT =-
281  *
282  *    na == NA(ifp) == netmap_hw_adapter created in DEVICE_netmap_attach()
283  *
284  *    - tx from netmap userspace:
285  *	 concurrently:
286  *           1) ioctl(NIOCTXSYNC)/netmap_poll() in process context
287  *                kring->nm_sync() == DEVICE_netmap_txsync()
288  *           2) device interrupt handler
289  *                na->nm_notify()  == netmap_notify()
290  *    - rx from netmap userspace:
291  *       concurrently:
292  *           1) ioctl(NIOCRXSYNC)/netmap_poll() in process context
293  *                kring->nm_sync() == DEVICE_netmap_rxsync()
294  *           2) device interrupt handler
295  *                na->nm_notify()  == netmap_notify()
296  *    - rx from host stack
297  *       concurrently:
298  *           1) host stack
299  *                netmap_transmit()
300  *                  na->nm_notify  == netmap_notify()
301  *           2) ioctl(NIOCRXSYNC)/netmap_poll() in process context
302  *                kring->nm_sync() == netmap_rxsync_from_host_compat
303  *                  netmap_rxsync_from_host(na, NULL, NULL)
304  *    - tx to host stack
305  *           ioctl(NIOCTXSYNC)/netmap_poll() in process context
306  *             kring->nm_sync() == netmap_txsync_to_host_compat
307  *               netmap_txsync_to_host(na)
308  *                 NM_SEND_UP()
309  *                   FreeBSD: na->if_input() == ?? XXX
310  *                   linux: netif_rx() with NM_MAGIC_PRIORITY_RX
311  *
312  *
313  *
314  *               -= SYSTEM DEVICE WITH GENERIC SUPPORT =-
315  *
316  *    na == NA(ifp) == generic_netmap_adapter created in generic_netmap_attach()
317  *
318  *    - tx from netmap userspace:
319  *       concurrently:
320  *           1) ioctl(NIOCTXSYNC)/netmap_poll() in process context
321  *               kring->nm_sync() == generic_netmap_txsync()
322  *                   linux:   dev_queue_xmit() with NM_MAGIC_PRIORITY_TX
323  *                       generic_ndo_start_xmit()
324  *                           orig. dev. start_xmit
325  *                   FreeBSD: na->if_transmit() == orig. dev if_transmit
326  *           2) generic_mbuf_destructor()
327  *                   na->nm_notify() == netmap_notify()
328  *    - rx from netmap userspace:
329  *           1) ioctl(NIOCRXSYNC)/netmap_poll() in process context
330  *               kring->nm_sync() == generic_netmap_rxsync()
331  *                   mbq_safe_dequeue()
332  *           2) device driver
333  *               generic_rx_handler()
334  *                   mbq_safe_enqueue()
335  *                   na->nm_notify() == netmap_notify()
336  *    - rx from host stack:
337  *        concurrently:
338  *           1) host stack
339  *               linux: generic_ndo_start_xmit()
340  *                   netmap_transmit()
341  *               FreeBSD: ifp->if_input() == netmap_transmit
342  *               both:
343  *                       na->nm_notify() == netmap_notify()
344  *           2) ioctl(NIOCRXSYNC)/netmap_poll() in process context
345  *                kring->nm_sync() == netmap_rxsync_from_host_compat
346  *                  netmap_rxsync_from_host(na, NULL, NULL)
347  *    - tx to host stack:
348  *           ioctl(NIOCTXSYNC)/netmap_poll() in process context
349  *             kring->nm_sync() == netmap_txsync_to_host_compat
350  *               netmap_txsync_to_host(na)
351  *                 NM_SEND_UP()
352  *                   FreeBSD: na->if_input() == ??? XXX
353  *                   linux: netif_rx() with NM_MAGIC_PRIORITY_RX
354  *
355  *
356  *                           -= VALE =-
357  *
358  *   INCOMING:
359  *
360  *      - VALE ports:
361  *          ioctl(NIOCTXSYNC)/netmap_poll() in process context
362  *              kring->nm_sync() == netmap_vp_txsync()
363  *
364  *      - system device with native support:
365  *         from cable:
366  *             interrupt
367  *                na->nm_notify() == netmap_bwrap_intr_notify(ring_nr != host ring)
368  *                     kring->nm_sync() == DEVICE_netmap_rxsync()
369  *                     netmap_vp_txsync()
370  *                     kring->nm_sync() == DEVICE_netmap_rxsync()
371  *         from host stack:
372  *             netmap_transmit()
373  *                na->nm_notify() == netmap_bwrap_intr_notify(ring_nr == host ring)
374  *                     kring->nm_sync() == netmap_rxsync_from_host_compat()
375  *                     netmap_vp_txsync()
376  *
377  *      - system device with generic support:
378  *         from device driver:
379  *            generic_rx_handler()
380  *                na->nm_notify() == netmap_bwrap_intr_notify(ring_nr != host ring)
381  *                     kring->nm_sync() == generic_netmap_rxsync()
382  *                     netmap_vp_txsync()
383  *                     kring->nm_sync() == generic_netmap_rxsync()
384  *         from host stack:
385  *            netmap_transmit()
386  *                na->nm_notify() == netmap_bwrap_intr_notify(ring_nr == host ring)
387  *                     kring->nm_sync() == netmap_rxsync_from_host_compat()
388  *                     netmap_vp_txsync()
389  *
390  *   (all cases) --> nm_bdg_flush()
391  *                      dest_na->nm_notify() == (see below)
392  *
393  *   OUTGOING:
394  *
395  *      - VALE ports:
396  *         concurrently:
397  *             1) ioctl(NIOCRXSYNC)/netmap_poll() in process context
398  *                    kring->nm_sync() == netmap_vp_rxsync()
399  *             2) from nm_bdg_flush()
400  *                    na->nm_notify() == netmap_notify()
401  *
402  *      - system device with native support:
403  *          to cable:
404  *             na->nm_notify() == netmap_bwrap_notify()
405  *                 netmap_vp_rxsync()
406  *                 kring->nm_sync() == DEVICE_netmap_txsync()
407  *                 netmap_vp_rxsync()
408  *          to host stack:
409  *                 netmap_vp_rxsync()
410  *                 kring->nm_sync() == netmap_txsync_to_host_compat
411  *                 netmap_vp_rxsync_locked()
412  *
413  *      - system device with generic adapter:
414  *          to device driver:
415  *             na->nm_notify() == netmap_bwrap_notify()
416  *                 netmap_vp_rxsync()
417  *                 kring->nm_sync() == generic_netmap_txsync()
418  *                 netmap_vp_rxsync()
419  *          to host stack:
420  *                 netmap_vp_rxsync()
421  *                 kring->nm_sync() == netmap_txsync_to_host_compat
422  *                 netmap_vp_rxsync()
423  *
424  */
425 
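/*
 * As an illustration of the VALE_CTL subcommands described above, a
 * persistent VALE port can be created, and a NIC attached to a switch,
 * roughly as follows (an untested sketch, error handling omitted;
 * "valeport0", "vale0" and "em1" are example names):
 *
 *	struct nmreq req;
 *	int fd = open("/dev/netmap", O_RDWR);
 *
 *	// create the persistent VALE port "valeport0"
 *	memset(&req, 0, sizeof(req));
 *	req.nr_version = NETMAP_API;
 *	strncpy(req.nr_name, "valeport0", sizeof(req.nr_name));
 *	req.nr_cmd = NETMAP_BDG_NEWIF;
 *	ioctl(fd, NIOCREGIF, &req);
 *
 *	// attach em1 to the switch vale0 (this creates a bwrap)
 *	memset(&req, 0, sizeof(req));
 *	req.nr_version = NETMAP_API;
 *	strncpy(req.nr_name, "vale0:em1", sizeof(req.nr_name));
 *	req.nr_cmd = NETMAP_BDG_ATTACH;
 *	ioctl(fd, NIOCREGIF, &req);
 */
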
426 /*
427  * OS-specific code that is used only within this file.
428  * Other OS-specific code that must be accessed by drivers
429  * is present in netmap_kern.h
430  */
431 
432 #if defined(__FreeBSD__)
433 #include <sys/cdefs.h> /* prerequisite */
434 #include <sys/types.h>
435 #include <sys/errno.h>
436 #include <sys/param.h>	/* defines used in kernel.h */
437 #include <sys/kernel.h>	/* types used in module initialization */
438 #include <sys/conf.h>	/* cdevsw struct, UID, GID */
439 #include <sys/filio.h>	/* FIONBIO */
440 #include <sys/sockio.h>
441 #include <sys/socketvar.h>	/* struct socket */
442 #include <sys/malloc.h>
443 #include <sys/poll.h>
444 #include <sys/rwlock.h>
445 #include <sys/socket.h> /* sockaddrs */
446 #include <sys/selinfo.h>
447 #include <sys/sysctl.h>
448 #include <sys/jail.h>
449 #include <net/vnet.h>
450 #include <net/if.h>
451 #include <net/if_var.h>
452 #include <net/bpf.h>		/* BIOCIMMEDIATE */
453 #include <machine/bus.h>	/* bus_dmamap_* */
454 #include <sys/endian.h>
455 #include <sys/refcount.h>
456 
457 
458 /* reduce conditional code */
459 // linux API, used for the knlist in FreeBSD
460 /* use a private mutex for the knlist */
461 #define init_waitqueue_head(x) do {			\
462 	struct mtx *m = &(x)->m;			\
463 	mtx_init(m, "nm_kn_lock", NULL, MTX_DEF);	\
464 	knlist_init_mtx(&(x)->si.si_note, m);		\
465     } while (0)
466 
467 #define OS_selrecord(a, b)	selrecord(a, &((b)->si))
468 #define OS_selwakeup(a, b)	freebsd_selwakeup(a, b)
469 
470 #elif defined(linux)
471 
472 #include "bsd_glue.h"
473 
474 
475 
476 #elif defined(__APPLE__)
477 
478 #warning OSX support is only partial
479 #include "osx_glue.h"
480 
481 #else
482 
483 #error	Unsupported platform
484 
485 #endif /* unsupported */
486 
487 /*
488  * common headers
489  */
490 #include <net/netmap.h>
491 #include <dev/netmap/netmap_kern.h>
492 #include <dev/netmap/netmap_mem2.h>
493 
494 
495 MALLOC_DEFINE(M_NETMAP, "netmap", "Network memory map");
496 
497 /* user-controlled variables */
498 int netmap_verbose;
499 
500 static int netmap_no_timestamp; /* don't timestamp on rxsync */
501 
502 SYSCTL_NODE(_dev, OID_AUTO, netmap, CTLFLAG_RW, 0, "Netmap args");
503 SYSCTL_INT(_dev_netmap, OID_AUTO, verbose,
504     CTLFLAG_RW, &netmap_verbose, 0, "Verbose mode");
505 SYSCTL_INT(_dev_netmap, OID_AUTO, no_timestamp,
506     CTLFLAG_RW, &netmap_no_timestamp, 0, "no_timestamp");
507 int netmap_mitigate = 1;
508 SYSCTL_INT(_dev_netmap, OID_AUTO, mitigate, CTLFLAG_RW, &netmap_mitigate, 0, "");
509 int netmap_no_pendintr = 1;
510 SYSCTL_INT(_dev_netmap, OID_AUTO, no_pendintr,
511     CTLFLAG_RW, &netmap_no_pendintr, 0, "Always look for new received packets.");
512 int netmap_txsync_retry = 2;
513 SYSCTL_INT(_dev_netmap, OID_AUTO, txsync_retry, CTLFLAG_RW,
514     &netmap_txsync_retry, 0 , "Number of txsync loops in bridge's flush.");
515 
516 int netmap_adaptive_io = 0;
517 SYSCTL_INT(_dev_netmap, OID_AUTO, adaptive_io, CTLFLAG_RW,
518     &netmap_adaptive_io, 0 , "Adaptive I/O on paravirt");
519 
520 int netmap_flags = 0;	/* debug flags */
521 int netmap_fwd = 0;	/* force transparent mode */
522 
523 /*
524  * netmap_admode selects the netmap mode to use.
525  * Invalid values are reset to NETMAP_ADMODE_BEST
526  */
527 enum { NETMAP_ADMODE_BEST = 0,	/* use native, fallback to generic */
528 	NETMAP_ADMODE_NATIVE,	/* either native or none */
529 	NETMAP_ADMODE_GENERIC,	/* force generic */
530 	NETMAP_ADMODE_LAST };
531 static int netmap_admode = NETMAP_ADMODE_BEST;
532 
533 int netmap_generic_mit = 100*1000;   /* Generic mitigation interval in nanoseconds. */
534 int netmap_generic_ringsize = 1024;   /* Generic ringsize. */
535 int netmap_generic_rings = 1;   /* number of queues in generic. */
536 
537 SYSCTL_INT(_dev_netmap, OID_AUTO, flags, CTLFLAG_RW, &netmap_flags, 0 , "");
538 SYSCTL_INT(_dev_netmap, OID_AUTO, fwd, CTLFLAG_RW, &netmap_fwd, 0 , "");
539 SYSCTL_INT(_dev_netmap, OID_AUTO, admode, CTLFLAG_RW, &netmap_admode, 0 , "");
540 SYSCTL_INT(_dev_netmap, OID_AUTO, generic_mit, CTLFLAG_RW, &netmap_generic_mit, 0 , "");
541 SYSCTL_INT(_dev_netmap, OID_AUTO, generic_ringsize, CTLFLAG_RW, &netmap_generic_ringsize, 0 , "");
542 SYSCTL_INT(_dev_netmap, OID_AUTO, generic_rings, CTLFLAG_RW, &netmap_generic_rings, 0 , "");
543 
544 NMG_LOCK_T	netmap_global_lock;
545 int netmap_use_count = 0; /* number of active netmap instances */
546 
547 /*
548  * mark the ring as stopped, and run through the locks
549  * to make sure other users get to see it.
550  */
551 static void
552 netmap_disable_ring(struct netmap_kring *kr)
553 {
554 	kr->nkr_stopped = 1;
555 	nm_kr_get(kr);
556 	mtx_lock(&kr->q_lock);
557 	mtx_unlock(&kr->q_lock);
558 	nm_kr_put(kr);
559 }
560 
561 /* stop or enable a single ring */
562 void
563 netmap_set_ring(struct netmap_adapter *na, u_int ring_id, enum txrx t, int stopped)
564 {
565 	if (stopped)
566 		netmap_disable_ring(NMR(na, t) + ring_id);
567 	else
568 		NMR(na, t)[ring_id].nkr_stopped = 0;
569 }
570 
571 
572 /* stop or enable all the rings of na */
573 void
574 netmap_set_all_rings(struct netmap_adapter *na, int stopped)
575 {
576 	int i;
577 	enum txrx t;
578 
579 	if (!nm_netmap_on(na))
580 		return;
581 
582 	for_rx_tx(t) {
583 		for (i = 0; i < netmap_real_rings(na, t); i++) {
584 			netmap_set_ring(na, i, t, stopped);
585 		}
586 	}
587 }
588 
589 /*
590  * Convenience function used in drivers.  Waits for current txsync()s/rxsync()s
591  * to finish and prevents any new one from starting.  Call this before turning
592  * netmap mode off, or before removing the hardware rings (e.g., on module
593  * unload).  As a rule of thumb for linux drivers, this should be placed near
594  * each napi_disable().
595  */
596 void
597 netmap_disable_all_rings(struct ifnet *ifp)
598 {
599 	netmap_set_all_rings(NA(ifp), 1 /* stopped */);
600 }
601 
602 /*
603  * Convenience function used in drivers.  Re-enables rxsync and txsync on the
604  * adapter's rings. In linux drivers, this should be placed near each
605  * napi_enable().
606  */
607 void
608 netmap_enable_all_rings(struct ifnet *ifp)
609 {
610 	netmap_set_all_rings(NA(ifp), 0 /* enabled */);
611 }
612 
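/*
 * For example (a sketch, not tied to any particular driver), a native
 * driver would typically bracket a hardware ring reinitialization with
 * the two functions above:
 *
 *	netmap_disable_all_rings(ifp);
 *	// ... stop and reprogram the hardware rings ...
 *	netmap_enable_all_rings(ifp);
 */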
613 
614 /*
615  * generic bound-checking function: values below lo are reset to dflt, values above hi are clamped to hi
616  */
617 u_int
618 nm_bound_var(u_int *v, u_int dflt, u_int lo, u_int hi, const char *msg)
619 {
620 	u_int oldv = *v;
621 	const char *op = NULL;
622 
623 	if (dflt < lo)
624 		dflt = lo;
625 	if (dflt > hi)
626 		dflt = hi;
627 	if (oldv < lo) {
628 		*v = dflt;
629 		op = "Bump";
630 	} else if (oldv > hi) {
631 		*v = hi;
632 		op = "Clamp";
633 	}
634 	if (op && msg)
635 		printf("%s %s to %d (was %d)\n", op, msg, *v, oldv);
636 	return *v;
637 }
638 
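/*
 * Typical (hypothetical) use, clamping a tunable into a sane range:
 *
 *	nm_bound_var(&netmap_generic_ringsize, 1024, 64, 16384,
 *	    "generic_ringsize");
 */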
639 
640 /*
641  * packet-dump function, into a user-supplied or a static buffer.
642  * The destination buffer must be at least 30+4*len bytes.
643  */
644 const char *
645 nm_dump_buf(char *p, int len, int lim, char *dst)
646 {
647 	static char _dst[8192];
648 	int i, j, i0;
649 	static char hex[] ="0123456789abcdef";
650 	char *o;	/* output position */
651 
652 #define P_HI(x)	hex[((x) & 0xf0)>>4]
653 #define P_LO(x)	hex[((x) & 0xf)]
654 #define P_C(x)	((x) >= 0x20 && (x) <= 0x7e ? (x) : '.')
655 	if (!dst)
656 		dst = _dst;
657 	if (lim <= 0 || lim > len)
658 		lim = len;
659 	o = dst;
660 	sprintf(o, "buf 0x%p len %d lim %d\n", p, len, lim);
661 	o += strlen(o);
662 	/* hexdump routine */
663 	for (i = 0; i < lim; ) {
664 		sprintf(o, "%5d: ", i);
665 		o += strlen(o);
666 		memset(o, ' ', 48);
667 		i0 = i;
668 		for (j=0; j < 16 && i < lim; i++, j++) {
669 			o[j*3] = P_HI(p[i]);
670 			o[j*3+1] = P_LO(p[i]);
671 		}
672 		i = i0;
673 		for (j=0; j < 16 && i < lim; i++, j++)
674 			o[j + 48] = P_C(p[i]);
675 		o[j+48] = '\n';
676 		o += j+49;
677 	}
678 	*o = '\0';
679 #undef P_HI
680 #undef P_LO
681 #undef P_C
682 	return dst;
683 }
684 
685 
686 /*
687  * Fetch configuration from the device, to cope with dynamic
688  * reconfigurations after loading the module.
689  */
690 /* call with NMG_LOCK held */
691 int
692 netmap_update_config(struct netmap_adapter *na)
693 {
694 	u_int txr, txd, rxr, rxd;
695 
696 	txr = txd = rxr = rxd = 0;
697 	if (na->nm_config == NULL ||
698 	    na->nm_config(na, &txr, &txd, &rxr, &rxd))
699 	{
700 		/* take whatever we had at init time */
701 		txr = na->num_tx_rings;
702 		txd = na->num_tx_desc;
703 		rxr = na->num_rx_rings;
704 		rxd = na->num_rx_desc;
705 	}
706 
707 	if (na->num_tx_rings == txr && na->num_tx_desc == txd &&
708 	    na->num_rx_rings == rxr && na->num_rx_desc == rxd)
709 		return 0; /* nothing changed */
710 	if (netmap_verbose || na->active_fds > 0) {
711 		D("stored config %s: txring %d x %d, rxring %d x %d",
712 			na->name,
713 			na->num_tx_rings, na->num_tx_desc,
714 			na->num_rx_rings, na->num_rx_desc);
715 		D("new config %s: txring %d x %d, rxring %d x %d",
716 			na->name, txr, txd, rxr, rxd);
717 	}
718 	if (na->active_fds == 0) {
719 		D("configuration changed (but fine)");
720 		na->num_tx_rings = txr;
721 		na->num_tx_desc = txd;
722 		na->num_rx_rings = rxr;
723 		na->num_rx_desc = rxd;
724 		return 0;
725 	}
726 	D("configuration changed while active, this is bad...");
727 	return 1;
728 }
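
/*
 * A driver that supports run-time reconfiguration would provide an
 * nm_config callback along these lines (a sketch; "foo_softc" and its
 * fields are placeholders for driver-private state):
 *
 *	static int
 *	foo_netmap_config(struct netmap_adapter *na,
 *	    u_int *txr, u_int *txd, u_int *rxr, u_int *rxd)
 *	{
 *		struct foo_softc *sc = na->ifp->if_softc;
 *
 *		*txr = sc->num_tx_queues;
 *		*txd = sc->num_tx_desc;
 *		*rxr = sc->num_rx_queues;
 *		*rxd = sc->num_rx_desc;
 *		return 0;	// parameters are valid
 *	}
 */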
729 
730 static void netmap_txsync_to_host(struct netmap_adapter *na);
731 static int netmap_rxsync_from_host(struct netmap_adapter *na, struct thread *td, void *pwait);
732 
733 /* kring->nm_sync callback for the host tx ring */
734 static int
735 netmap_txsync_to_host_compat(struct netmap_kring *kring, int flags)
736 {
737 	(void)flags; /* unused */
738 	netmap_txsync_to_host(kring->na);
739 	return 0;
740 }
741 
742 /* kring->nm_sync callback for the host rx ring */
743 static int
744 netmap_rxsync_from_host_compat(struct netmap_kring *kring, int flags)
745 {
746 	(void)flags; /* unused */
747 	netmap_rxsync_from_host(kring->na, NULL, NULL);
748 	return 0;
749 }
750 
751 
752 
753 /* create the krings array and initialize the fields common to all adapters.
754  * The array layout is this:
755  *
756  *                    +----------+
757  * na->tx_rings ----->|          | \
758  *                    |          |  } na->num_tx_rings
759  *                    |          | /
760  *                    +----------+
761  *                    |          |    host tx kring
762  * na->rx_rings ----> +----------+
763  *                    |          | \
764  *                    |          |  } na->num_rx_rings
765  *                    |          | /
766  *                    +----------+
767  *                    |          |    host rx kring
768  *                    +----------+
769  * na->tailroom ----->|          | \
770  *                    |          |  } tailroom bytes
771  *                    |          | /
772  *                    +----------+
773  *
774  * Note: for compatibility, host krings are created even when not needed.
775  * The tailroom space is currently used by vale ports for allocating leases.
776  */
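/*
 * With this layout, for example, the krings attached to the host stack
 * are simply:
 *
 *	struct netmap_kring *host_tx = &na->tx_rings[na->num_tx_rings];
 *	struct netmap_kring *host_rx = &na->rx_rings[na->num_rx_rings];
 *
 * which is how netmap_txsync_to_host() and netmap_rxsync_from_host()
 * below locate them.
 */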
777 /* call with NMG_LOCK held */
778 int
779 netmap_krings_create(struct netmap_adapter *na, u_int tailroom)
780 {
781 	u_int i, len, ndesc;
782 	struct netmap_kring *kring;
783 	u_int n[NR_TXRX];
784 	enum txrx t;
785 
786 	/* account for the (possibly fake) host rings */
787 	n[NR_TX] = na->num_tx_rings + 1;
788 	n[NR_RX] = na->num_rx_rings + 1;
789 
790 	len = (n[NR_TX] + n[NR_RX]) * sizeof(struct netmap_kring) + tailroom;
791 
792 	na->tx_rings = malloc((size_t)len, M_DEVBUF, M_NOWAIT | M_ZERO);
793 	if (na->tx_rings == NULL) {
794 		D("Cannot allocate krings");
795 		return ENOMEM;
796 	}
797 	na->rx_rings = na->tx_rings + n[NR_TX];
798 
799 	/*
800 	 * All fields in krings are 0 except the ones initialized below,
801 	 * but it is better to be explicit about important kring fields.
802 	 */
803 	for_rx_tx(t) {
804 		ndesc = nma_get_ndesc(na, t);
805 		for (i = 0; i < n[t]; i++) {
806 			kring = &NMR(na, t)[i];
807 			bzero(kring, sizeof(*kring));
808 			kring->na = na;
809 			kring->ring_id = i;
810 			kring->tx = t;
811 			kring->nkr_num_slots = ndesc;
812 			if (i < nma_get_nrings(na, t)) {
813 				kring->nm_sync = (t == NR_TX ? na->nm_txsync : na->nm_rxsync);
814 			} else if (i == na->num_tx_rings) {
815 				kring->nm_sync = (t == NR_TX ?
816 						netmap_txsync_to_host_compat :
817 						netmap_rxsync_from_host_compat);
818 			}
819 			kring->nm_notify = na->nm_notify;
820 			kring->rhead = kring->rcur = kring->nr_hwcur = 0;
821 			/*
822 			 * IMPORTANT: Always keep one slot empty.
823 			 */
824 			kring->rtail = kring->nr_hwtail = (t == NR_TX ? ndesc - 1 : 0);
825 			snprintf(kring->name, sizeof(kring->name) - 1, "%s %s%d", na->name,
826 					nm_txrx2str(t), i);
827 			ND("ktx %s h %d c %d t %d",
828 				kring->name, kring->rhead, kring->rcur, kring->rtail);
829 			mtx_init(&kring->q_lock, (t == NR_TX ? "nm_txq_lock" : "nm_rxq_lock"), NULL, MTX_DEF);
830 			init_waitqueue_head(&kring->si);
831 		}
832 		init_waitqueue_head(&na->si[t]);
833 	}
834 
835 	na->tailroom = na->rx_rings + n[NR_RX];
836 
837 	return 0;
838 }
839 
840 
841 #ifdef __FreeBSD__
842 static void
843 netmap_knlist_destroy(NM_SELINFO_T *si)
844 {
845 	/* XXX kqueue(9) needed; these will mirror knlist_init. */
846 	knlist_delete(&si->si.si_note, curthread, 0 /* not locked */ );
847 	knlist_destroy(&si->si.si_note);
848 	/* now we don't need the mutex anymore */
849 	mtx_destroy(&si->m);
850 }
851 #endif /* __FreeBSD__ */
852 
853 
854 /* undo the actions performed by netmap_krings_create */
855 /* call with NMG_LOCK held */
856 void
857 netmap_krings_delete(struct netmap_adapter *na)
858 {
859 	struct netmap_kring *kring = na->tx_rings;
860 	enum txrx t;
861 
862 	for_rx_tx(t)
863 		netmap_knlist_destroy(&na->si[t]);
864 
865 	/* we rely on the krings layout described above */
866 	for ( ; kring != na->tailroom; kring++) {
867 		mtx_destroy(&kring->q_lock);
868 		netmap_knlist_destroy(&kring->si);
869 	}
870 	free(na->tx_rings, M_DEVBUF);
871 	na->tx_rings = na->rx_rings = na->tailroom = NULL;
872 }
873 
874 
875 /*
876  * Destructor for NIC ports. They also have an mbuf queue
877  * on the rings connected to the host so we need to purge
878  * them first.
879  */
880 /* call with NMG_LOCK held */
881 static void
882 netmap_hw_krings_delete(struct netmap_adapter *na)
883 {
884 	struct mbq *q = &na->rx_rings[na->num_rx_rings].rx_queue;
885 
886 	ND("destroy sw mbq with len %d", mbq_len(q));
887 	mbq_purge(q);
888 	mbq_safe_destroy(q);
889 	netmap_krings_delete(na);
890 }
891 
892 
893 
894 /*
895  * Undo everything that was done in netmap_do_regif(). In particular,
896  * call nm_register(ifp,0) to stop netmap mode on the interface and
897  * revert to normal operation.
898  */
899 /* call with NMG_LOCK held */
900 static void netmap_unset_ringid(struct netmap_priv_d *);
901 static void netmap_rel_exclusive(struct netmap_priv_d *);
902 static void
903 netmap_do_unregif(struct netmap_priv_d *priv)
904 {
905 	struct netmap_adapter *na = priv->np_na;
906 
907 	NMG_LOCK_ASSERT();
908 	na->active_fds--;
909 	/* release exclusive use if it was requested on regif */
910 	netmap_rel_exclusive(priv);
911 	if (na->active_fds <= 0) {	/* last instance */
912 
913 		if (netmap_verbose)
914 			D("deleting last instance for %s", na->name);
915 
916 #ifdef	WITH_MONITOR
917 		/* walk through all the rings and tell any monitor
918 		 * that the port is going to exit netmap mode
919 		 */
920 		netmap_monitor_stop(na);
921 #endif
922 		/*
923 		 * (TO CHECK) This function is only called
924 		 * when the last reference to this file descriptor goes
925 		 * away. This means we cannot have any pending poll()
926 		 * or interrupt routine operating on the structure.
927 		 * XXX The file may be closed in a thread while
928 		 * another thread is using it.
929 		 * Linux keeps the file opened until the last reference
930 		 * by any outstanding ioctl/poll or mmap is gone.
931 		 * FreeBSD does not track mmap()s (but we do) and
932 		 * wakes up any sleeping poll(). Need to check what
933 		 * happens if the close() occurs while a concurrent
934 		 * syscall is running.
935 		 */
936 		na->nm_register(na, 0); /* off, clear flags */
937 		/* Wake up any sleeping threads. netmap_poll will
938 		 * then return POLLERR
939 		 * XXX The wake up now must happen during *_down(), when
940 		 * we order all activities to stop. -gl
941 		 */
942 		/* delete rings and buffers */
943 		netmap_mem_rings_delete(na);
944 		na->nm_krings_delete(na);
945 	}
946 	/* possibly decrement counter of tx_si/rx_si users */
947 	netmap_unset_ringid(priv);
948 	/* delete the nifp */
949 	netmap_mem_if_delete(na, priv->np_nifp);
950 	/* drop the allocator */
951 	netmap_mem_deref(na->nm_mem, na);
952 	/* mark the priv as unregistered */
953 	priv->np_na = NULL;
954 	priv->np_nifp = NULL;
955 }
956 
957 /* call with NMG_LOCK held */
958 static __inline int
959 nm_si_user(struct netmap_priv_d *priv, enum txrx t)
960 {
961 	return (priv->np_na != NULL &&
962 		(priv->np_qlast[t] - priv->np_qfirst[t] > 1));
963 }
964 
965 /*
966  * Destructor of the netmap_priv_d, called when the fd is closed.
967  * Action: undo all the things done by NIOCREGIF.
968  * On FreeBSD we need to track whether there are active mmap()s,
969  * and we use np_active_mmaps for that. On linux, the field is always 0.
970  * Return: 1 if we can free priv, 0 otherwise.
971  *
972  */
973 /* call with NMG_LOCK held */
974 int
975 netmap_dtor_locked(struct netmap_priv_d *priv)
976 {
977 	struct netmap_adapter *na = priv->np_na;
978 
979 	/* number of active references to this fd */
980 	if (--priv->np_refs > 0) {
981 		return 0;
982 	}
983 	netmap_use_count--;
984 	if (!na) {
985 		return 1; //XXX is it correct?
986 	}
987 	netmap_do_unregif(priv);
988 	netmap_adapter_put(na);
989 	return 1;
990 }
991 
992 
993 /* call with NMG_LOCK *not* held */
994 void
995 netmap_dtor(void *data)
996 {
997 	struct netmap_priv_d *priv = data;
998 	int last_instance;
999 
1000 	NMG_LOCK();
1001 	last_instance = netmap_dtor_locked(priv);
1002 	NMG_UNLOCK();
1003 	if (last_instance) {
1004 		bzero(priv, sizeof(*priv));	/* for safety */
1005 		free(priv, M_DEVBUF);
1006 	}
1007 }
1008 
1009 
1010 
1011 
1012 /*
1013  * Handlers for synchronization of the queues from/to the host.
1014  * Netmap has two operating modes:
1015  * - in the default mode, the rings connected to the host stack are
1016  *   just another ring pair managed by userspace;
1017  * - in transparent mode (XXX to be defined) incoming packets
1018  *   (from the host or the NIC) are marked as NS_FORWARD upon
1019  *   arrival, and the user application has a chance to reset the
1020  *   flag for packets that should be dropped.
1021  *   On the RXSYNC or poll(), packets in RX rings between
1022  *   kring->nr_hwcur and ring->cur with NS_FORWARD still set are moved
1023  *   to the other side.
1024  * The transfer NIC --> host is relatively easy, just encapsulate
1025  * into mbufs and we are done. The host --> NIC side is slightly
1026  * harder because there might not be room in the tx ring so it
1027  * might take a while before releasing the buffer.
1028  */
1029 
1030 
1031 /*
1032  * pass a chain of buffers to the host stack as coming from 'dst'
1033  * We do not need to lock because the queue is private.
1034  */
1035 static void
1036 netmap_send_up(struct ifnet *dst, struct mbq *q)
1037 {
1038 	struct mbuf *m;
1039 
1040 	/* send packets up, outside the lock */
1041 	while ((m = mbq_dequeue(q)) != NULL) {
1042 		if (netmap_verbose & NM_VERB_HOST)
1043 			D("sending up pkt %p size %d", m, MBUF_LEN(m));
1044 		NM_SEND_UP(dst, m);
1045 	}
1046 	mbq_destroy(q);
1047 }
1048 
1049 
1050 /*
1051  * put a copy of the buffers marked NS_FORWARD into an mbuf chain.
1052  * Take packets from hwcur to ring->head marked NS_FORWARD (or forced)
1053  * and pass them up. Drop remaining packets in the unlikely event
1054  * of an mbuf shortage.
1055  */
1056 static void
1057 netmap_grab_packets(struct netmap_kring *kring, struct mbq *q, int force)
1058 {
1059 	u_int const lim = kring->nkr_num_slots - 1;
1060 	u_int const head = kring->rhead;
1061 	u_int n;
1062 	struct netmap_adapter *na = kring->na;
1063 
1064 	for (n = kring->nr_hwcur; n != head; n = nm_next(n, lim)) {
1065 		struct mbuf *m;
1066 		struct netmap_slot *slot = &kring->ring->slot[n];
1067 
1068 		if ((slot->flags & NS_FORWARD) == 0 && !force)
1069 			continue;
1070 		if (slot->len < 14 || slot->len > NETMAP_BUF_SIZE(na)) {
1071 			RD(5, "bad pkt at %d len %d", n, slot->len);
1072 			continue;
1073 		}
1074 		slot->flags &= ~NS_FORWARD; // XXX needed ?
1075 		/* XXX TODO: adapt to the case of a multisegment packet */
1076 		m = m_devget(NMB(na, slot), slot->len, 0, na->ifp, NULL);
1077 
1078 		if (m == NULL)
1079 			break;
1080 		mbq_enqueue(q, m);
1081 	}
1082 }
1083 
1084 
1085 /*
1086  * Send to the NIC rings packets marked NS_FORWARD between
1087  * kring->nr_hwcur and kring->rhead.
1088  * Called under kring->rx_queue.lock on the sw rx ring.
1089  */
1090 static u_int
1091 netmap_sw_to_nic(struct netmap_adapter *na)
1092 {
1093 	struct netmap_kring *kring = &na->rx_rings[na->num_rx_rings];
1094 	struct netmap_slot *rxslot = kring->ring->slot;
1095 	u_int i, rxcur = kring->nr_hwcur;
1096 	u_int const head = kring->rhead;
1097 	u_int const src_lim = kring->nkr_num_slots - 1;
1098 	u_int sent = 0;
1099 
1100 	/* scan rings to find space, then fill as much as possible */
1101 	for (i = 0; i < na->num_tx_rings; i++) {
1102 		struct netmap_kring *kdst = &na->tx_rings[i];
1103 		struct netmap_ring *rdst = kdst->ring;
1104 		u_int const dst_lim = kdst->nkr_num_slots - 1;
1105 
1106 		/* XXX do we trust ring or kring->rcur,rtail ? */
1107 		for (; rxcur != head && !nm_ring_empty(rdst);
1108 		     rxcur = nm_next(rxcur, src_lim) ) {
1109 			struct netmap_slot *src, *dst, tmp;
1110 			u_int dst_cur = rdst->cur;
1111 
1112 			src = &rxslot[rxcur];
1113 			if ((src->flags & NS_FORWARD) == 0 && !netmap_fwd)
1114 				continue;
1115 
1116 			sent++;
1117 
1118 			dst = &rdst->slot[dst_cur];
1119 
1120 			tmp = *src;
1121 
1122 			src->buf_idx = dst->buf_idx;
1123 			src->flags = NS_BUF_CHANGED;
1124 
1125 			dst->buf_idx = tmp.buf_idx;
1126 			dst->len = tmp.len;
1127 			dst->flags = NS_BUF_CHANGED;
1128 
1129 			rdst->cur = nm_next(dst_cur, dst_lim);
1130 		}
1131 		/* if (sent) XXX txsync ? */
1132 	}
1133 	return sent;
1134 }
1135 
1136 
1137 /*
1138  * netmap_txsync_to_host() passes packets up. We are called from a
1139  * system call in user process context, and the only contention
1140  * can be among multiple user threads erroneously calling
1141  * this routine concurrently.
1142  */
1143 static void
1144 netmap_txsync_to_host(struct netmap_adapter *na)
1145 {
1146 	struct netmap_kring *kring = &na->tx_rings[na->num_tx_rings];
1147 	u_int const lim = kring->nkr_num_slots - 1;
1148 	u_int const head = kring->rhead;
1149 	struct mbq q;
1150 
1151 	/* Take packets from hwcur to head and pass them up.
1152 	 * force head = cur since netmap_grab_packets() stops at head.
1153 	 * In case of no buffers we give up. At the end of the loop,
1154 	 * the queue is drained in all cases.
1155 	 */
1156 	mbq_init(&q);
1157 	netmap_grab_packets(kring, &q, 1 /* force */);
1158 	ND("have %d pkts in queue", mbq_len(&q));
1159 	kring->nr_hwcur = head;
1160 	kring->nr_hwtail = head + lim;
1161 	if (kring->nr_hwtail > lim)
1162 		kring->nr_hwtail -= lim + 1;
1163 
1164 	netmap_send_up(na->ifp, &q);
1165 }
1166 
1167 
1168 /*
1169  * rxsync backend for packets coming from the host stack.
1170  * They have been put in kring->rx_queue by netmap_transmit().
1171  * We protect access to the kring using kring->rx_queue.lock
1172  *
1173  * This routine also does the selrecord if called from the poll handler
1174  * (we know because td != NULL).
1175  *
1176  * NOTE: on linux, selrecord() is defined as a macro and uses pwait
1177  *     as an additional hidden argument.
1178  * returns the number of packets delivered to tx queues in
1179  * transparent mode, or a negative value if error
1180  */
1181 static int
1182 netmap_rxsync_from_host(struct netmap_adapter *na, struct thread *td, void *pwait)
1183 {
1184 	struct netmap_kring *kring = &na->rx_rings[na->num_rx_rings];
1185 	struct netmap_ring *ring = kring->ring;
1186 	u_int nm_i, n;
1187 	u_int const lim = kring->nkr_num_slots - 1;
1188 	u_int const head = kring->rhead;
1189 	int ret = 0;
1190 	struct mbq *q = &kring->rx_queue, fq;
1191 
1192 	(void)pwait;	/* disable unused warnings */
1193 	(void)td;
1194 
1195 	mbq_init(&fq); /* fq holds packets to be freed */
1196 
1197 	mbq_lock(q);
1198 
1199 	/* First part: import newly received packets */
1200 	n = mbq_len(q);
1201 	if (n) { /* grab packets from the queue */
1202 		struct mbuf *m;
1203 		uint32_t stop_i;
1204 
1205 		nm_i = kring->nr_hwtail;
1206 		stop_i = nm_prev(nm_i, lim);
1207 		while ( nm_i != stop_i && (m = mbq_dequeue(q)) != NULL ) {
1208 			int len = MBUF_LEN(m);
1209 			struct netmap_slot *slot = &ring->slot[nm_i];
1210 
1211 			m_copydata(m, 0, len, NMB(na, slot));
1212 			ND("nm %d len %d", nm_i, len);
1213 			if (netmap_verbose)
1214                                 D("%s", nm_dump_buf(NMB(na, slot),len, 128, NULL));
1215 
1216 			slot->len = len;
1217 			slot->flags = kring->nkr_slot_flags;
1218 			nm_i = nm_next(nm_i, lim);
1219 			mbq_enqueue(&fq, m);
1220 		}
1221 		kring->nr_hwtail = nm_i;
1222 	}
1223 
1224 	/*
1225 	 * Second part: skip past packets that userspace has released.
1226 	 */
1227 	nm_i = kring->nr_hwcur;
1228 	if (nm_i != head) { /* something was released */
1229 		if (netmap_fwd || kring->ring->flags & NR_FORWARD)
1230 			ret = netmap_sw_to_nic(na);
1231 		kring->nr_hwcur = head;
1232 	}
1233 
1234 	/* access copies of cur,tail in the kring */
1235 	if (kring->rcur == kring->rtail && td) /* no bufs available */
1236 		OS_selrecord(td, &kring->si);
1237 
1238 	mbq_unlock(q);
1239 
1240 	mbq_purge(&fq);
1241 	mbq_destroy(&fq);
1242 
1243 	return ret;
1244 }
1245 
1246 
1247 /* Get a netmap adapter for the port.
1248  *
1249  * If it is possible to satisfy the request, return 0
1250  * with *na containing the netmap adapter found.
1251  * Otherwise return an error code, with *na containing NULL.
1252  *
1253  * When the port is attached to a bridge, we always return
1254  * EBUSY.
1255  * Otherwise, if the port is already bound to a file descriptor,
1256  * then we unconditionally return the existing adapter into *na.
1257  * In all the other cases, we return (into *na) either native,
1258  * generic or NULL, according to the following table:
1259  *
1260  *					native_support
1261  * active_fds   dev.netmap.admode         YES     NO
1262  * -------------------------------------------------------
1263  *    >0              *                 NA(ifp) NA(ifp)
1264  *
1265  *     0        NETMAP_ADMODE_BEST      NATIVE  GENERIC
1266  *     0        NETMAP_ADMODE_NATIVE    NATIVE   NULL
1267  *     0        NETMAP_ADMODE_GENERIC   GENERIC GENERIC
1268  *
1269  */
1270 
1271 int
1272 netmap_get_hw_na(struct ifnet *ifp, struct netmap_adapter **na)
1273 {
1274 	/* generic support */
1275 	int i = netmap_admode;	/* Take a snapshot. */
1276 	struct netmap_adapter *prev_na;
1277 #ifdef WITH_GENERIC
1278 	struct netmap_generic_adapter *gna;
1279 	int error = 0;
1280 #endif
1281 
1282 	*na = NULL; /* default */
1283 
1284 	/* reset in case of invalid value */
1285 	if (i < NETMAP_ADMODE_BEST || i >= NETMAP_ADMODE_LAST)
1286 		i = netmap_admode = NETMAP_ADMODE_BEST;
1287 
1288 	if (NETMAP_CAPABLE(ifp)) {
1289 		prev_na = NA(ifp);
1290 		/* If an adapter already exists, return it if
1291 		 * there are active file descriptors or if
1292 		 * netmap is not forced to use generic
1293 		 * adapters.
1294 		 */
1295 		if (NETMAP_OWNED_BY_ANY(prev_na)
1296 			|| i != NETMAP_ADMODE_GENERIC
1297 			|| prev_na->na_flags & NAF_FORCE_NATIVE
1298 #ifdef WITH_PIPES
1299 			/* ugly, but we cannot allow an adapter switch
1300 			 * if some pipe is referring to this one
1301 			 */
1302 			|| prev_na->na_next_pipe > 0
1303 #endif
1304 		) {
1305 			*na = prev_na;
1306 			return 0;
1307 		}
1308 	}
1309 
1310 	/* If there isn't native support and netmap is not allowed
1311 	 * to use generic adapters, we cannot satisfy the request.
1312 	 */
1313 	if (!NETMAP_CAPABLE(ifp) && i == NETMAP_ADMODE_NATIVE)
1314 		return EOPNOTSUPP;
1315 
1316 #ifdef WITH_GENERIC
1317 	/* Otherwise, create a generic adapter and return it,
1318 	 * saving the previously used netmap adapter, if any.
1319 	 *
1320 	 * Note that here 'prev_na', if not NULL, MUST be a
1321 	 * native adapter, and CANNOT be a generic one. This is
1322 	 * true because generic adapters are created on demand, and
1323 	 * destroyed when not used anymore. Therefore, if the adapter
1324 	 * currently attached to an interface 'ifp' is generic, it
1325 	 * must be that
1326 	 * (NA(ifp)->active_fds > 0 || NETMAP_OWNED_BY_KERN(NA(ifp))).
1327 	 * Consequently, if NA(ifp) is generic, we will enter one of
1328 	 * the branches above. This ensures that we never override
1329 	 * a generic adapter with another generic adapter.
1330 	 */
1331 	prev_na = NA(ifp);
1332 	error = generic_netmap_attach(ifp);
1333 	if (error)
1334 		return error;
1335 
1336 	*na = NA(ifp);
1337 	gna = (struct netmap_generic_adapter*)NA(ifp);
1338 	gna->prev = prev_na; /* save old na */
1339 	if (prev_na != NULL) {
1340 		ifunit_ref(ifp->if_xname);
1341 		// XXX add a refcount ?
1342 		netmap_adapter_get(prev_na);
1343 	}
1344 	ND("Created generic NA %p (prev %p)", gna, gna->prev);
1345 
1346 	return 0;
1347 #else /* !WITH_GENERIC */
1348 	return EOPNOTSUPP;
1349 #endif
1350 }
1351 
1352 
1353 /*
1354  * MUST BE CALLED UNDER NMG_LOCK()
1355  *
1356  * Get a refcounted reference to a netmap adapter attached
1357  * to the interface specified by nmr.
1358  * This is always called in the execution of an ioctl().
1359  *
1360  * Return ENXIO if the interface specified by the request does
1361  * not exist, ENOTSUP if netmap is not supported by the interface,
1362  * EBUSY if the interface is already attached to a bridge,
1363  * EINVAL if parameters are invalid, ENOMEM if needed resources
1364  * could not be allocated.
1365  * If successful, hold a reference to the netmap adapter.
1366  *
1367  * No reference is kept on the real interface, which may then
1368  * disappear at any time.
1369  */
1370 int
1371 netmap_get_na(struct nmreq *nmr, struct netmap_adapter **na, int create)
1372 {
1373 	struct ifnet *ifp = NULL;
1374 	int error = 0;
1375 	struct netmap_adapter *ret = NULL;
1376 
1377 	*na = NULL;     /* default return value */
1378 
1379 	NMG_LOCK_ASSERT();
1380 
1381 	/* we cascade through all possible types of netmap adapter.
1382 	 * All netmap_get_*_na() functions return an error and an na,
1383 	 * with the following combinations:
1384 	 *
1385 	 * error    na
1386 	 *   0	   NULL		type doesn't match
1387 	 *  !0	   NULL		type matches, but na creation/lookup failed
1388 	 *   0	  !NULL		type matches and na created/found
1389 	 *  !0    !NULL		impossible
1390 	 */
1391 
1392 	/* try to see if this is a monitor port */
1393 	error = netmap_get_monitor_na(nmr, na, create);
1394 	if (error || *na != NULL)
1395 		return error;
1396 
1397 	/* try to see if this is a pipe port */
1398 	error = netmap_get_pipe_na(nmr, na, create);
1399 	if (error || *na != NULL)
1400 		return error;
1401 
1402 	/* try to see if this is a bridge port */
1403 	error = netmap_get_bdg_na(nmr, na, create);
1404 	if (error)
1405 		return error;
1406 
1407 	if (*na != NULL) /* valid match in netmap_get_bdg_na() */
1408 		goto out;
1409 
1410 	/*
1411 	 * This must be a hardware na, lookup the name in the system.
1412 	 * Note that by hardware we actually mean "it shows up in ifconfig".
1413 	 * This may still be a tap, a veth/epair, or even a
1414 	 * persistent VALE port.
1415 	 */
1416 	ifp = ifunit_ref(nmr->nr_name);
1417 	if (ifp == NULL) {
1418 	        return ENXIO;
1419 	}
1420 
1421 	error = netmap_get_hw_na(ifp, &ret);
1422 	if (error)
1423 		goto out;
1424 
1425 	*na = ret;
1426 	netmap_adapter_get(ret);
1427 
1428 out:
1429 	if (error && ret != NULL)
1430 		netmap_adapter_put(ret);
1431 
1432 	if (ifp)
1433 		if_rele(ifp); /* allow live unloading of drivers modules */
1434 
1435 	return error;
1436 }
1437 
1438 
1439 /*
1440  * validate parameters on entry for *_txsync()
1441  * Returns ring->head if ok, or something >= kring->nkr_num_slots
1442  * in case of error.
1443  *
1444  * rhead, rcur and rtail=hwtail are stored from previous round.
1445  * hwcur is the next packet to send to the ring.
1446  *
1447  * We want
1448  *    hwcur <= *rhead <= head <= cur <= tail = *rtail <= hwtail
1449  *
1450  * hwcur, rhead, rtail and hwtail are reliable
1451  */
1452 static u_int
1453 nm_txsync_prologue(struct netmap_kring *kring)
1454 {
1455 #define NM_ASSERT(t) if (t) { D("fail " #t); goto error; }
1456 	struct netmap_ring *ring = kring->ring;
1457 	u_int head = ring->head; /* read only once */
1458 	u_int cur = ring->cur; /* read only once */
1459 	u_int n = kring->nkr_num_slots;
1460 
1461 	ND(5, "%s kcur %d ktail %d head %d cur %d tail %d",
1462 		kring->name,
1463 		kring->nr_hwcur, kring->nr_hwtail,
1464 		ring->head, ring->cur, ring->tail);
1465 #if 1 /* kernel sanity checks; but we can trust the kring. */
1466 	if (kring->nr_hwcur >= n || kring->rhead >= n ||
1467 	    kring->rtail >= n ||  kring->nr_hwtail >= n)
1468 		goto error;
1469 #endif /* kernel sanity checks */
1470 	/*
1471 	 * user sanity checks. We only use 'cur',
1472 	 * A, B, ... are possible positions for cur:
1473 	 *
1474 	 *  0    A  cur   B  tail  C  n-1
1475 	 *  0    D  tail  E  cur   F  n-1
1476 	 *
1477 	 * B, F, D are valid. A, C, E are wrong
1478 	 */
1479 	if (kring->rtail >= kring->rhead) {
1480 		/* want rhead <= head <= rtail */
1481 		NM_ASSERT(head < kring->rhead || head > kring->rtail);
1482 		/* and also head <= cur <= rtail */
1483 		NM_ASSERT(cur < head || cur > kring->rtail);
1484 	} else { /* here rtail < rhead */
1485 		/* we need head outside rtail .. rhead */
1486 		NM_ASSERT(head > kring->rtail && head < kring->rhead);
1487 
1488 		/* two cases now: head <= rtail or head >= rhead  */
1489 		if (head <= kring->rtail) {
1490 			/* want head <= cur <= rtail */
1491 			NM_ASSERT(cur < head || cur > kring->rtail);
1492 		} else { /* head >= rhead */
1493 			/* cur must be outside rtail..head */
1494 			NM_ASSERT(cur > kring->rtail && cur < head);
1495 		}
1496 	}
1497 	if (ring->tail != kring->rtail) {
1498 		RD(5, "tail overwritten was %d need %d",
1499 			ring->tail, kring->rtail);
1500 		ring->tail = kring->rtail;
1501 	}
1502 	kring->rhead = head;
1503 	kring->rcur = cur;
1504 	return head;
1505 
1506 error:
1507 	RD(5, "%s kring error: head %d cur %d tail %d rhead %d rcur %d rtail %d hwcur %d hwtail %d",
1508 		kring->name,
1509 		head, cur, ring->tail,
1510 		kring->rhead, kring->rcur, kring->rtail,
1511 		kring->nr_hwcur, kring->nr_hwtail);
1512 	return n;
1513 #undef NM_ASSERT
1514 }
1515 
1516 
1517 /*
1518  * validate parameters on entry for *_rxsync()
1519  * Returns ring->head if ok, kring->nkr_num_slots on error.
1520  *
1521  * For a valid configuration,
1522  * hwcur <= head <= cur <= tail <= hwtail
1523  *
1524  * We only consider head and cur.
1525  * hwcur and hwtail are reliable.
1526  *
1527  */
1528 static u_int
1529 nm_rxsync_prologue(struct netmap_kring *kring)
1530 {
1531 	struct netmap_ring *ring = kring->ring;
1532 	uint32_t const n = kring->nkr_num_slots;
1533 	uint32_t head, cur;
1534 
1535 	ND(5,"%s kc %d kt %d h %d c %d t %d",
1536 		kring->name,
1537 		kring->nr_hwcur, kring->nr_hwtail,
1538 		ring->head, ring->cur, ring->tail);
1539 	/*
1540 	 * Before storing the new values, we should check they do not
1541 	 * move backwards. However:
1542 	 * - head is not an issue because the previous value is hwcur;
1543 	 * - cur could in principle go back, however it does not matter
1544 	 *   because we are processing a brand new rxsync()
1545 	 */
1546 	cur = kring->rcur = ring->cur;	/* read only once */
1547 	head = kring->rhead = ring->head;	/* read only once */
1548 #if 1 /* kernel sanity checks */
1549 	if (kring->nr_hwcur >= n || kring->nr_hwtail >= n)
1550 		goto error;
1551 #endif /* kernel sanity checks */
1552 	/* user sanity checks */
1553 	if (kring->nr_hwtail >= kring->nr_hwcur) {
1554 		/* want hwcur <= rhead <= hwtail */
1555 		if (head < kring->nr_hwcur || head > kring->nr_hwtail)
1556 			goto error;
1557 		/* and also rhead <= rcur <= hwtail */
1558 		if (cur < head || cur > kring->nr_hwtail)
1559 			goto error;
1560 	} else {
1561 		/* we need rhead outside hwtail..hwcur */
1562 		if (head < kring->nr_hwcur && head > kring->nr_hwtail)
1563 			goto error;
1564 		/* two cases now: head <= hwtail or head >= hwcur  */
1565 		if (head <= kring->nr_hwtail) {
1566 			/* want head <= cur <= hwtail */
1567 			if (cur < head || cur > kring->nr_hwtail)
1568 				goto error;
1569 		} else {
1570 			/* cur must be outside hwtail..head */
1571 			if (cur < head && cur > kring->nr_hwtail)
1572 				goto error;
1573 		}
1574 	}
1575 	if (ring->tail != kring->rtail) {
1576 		RD(5, "%s tail overwritten was %d need %d",
1577 			kring->name,
1578 			ring->tail, kring->rtail);
1579 		ring->tail = kring->rtail;
1580 	}
1581 	return head;
1582 
1583 error:
1584 	RD(5, "kring error: hwcur %d rcur %d hwtail %d head %d cur %d tail %d",
1585 		kring->nr_hwcur,
1586 		kring->rcur, kring->nr_hwtail,
1587 		kring->rhead, kring->rcur, ring->tail);
1588 	return n;
1589 }
1590 
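/*
 * Illustrative userspace counterpart (editorial sketch, not part of the
 * kernel; it relies on the helpers in net/netmap_user.h and consume() is
 * a placeholder): after an rxsync the application advances head and cur
 * inside the region that this prologue accepts on the next syscall,
 * i.e. hwcur <= head <= cur <= hwtail:
 *
 *	struct netmap_ring *ring = NETMAP_RXRING(nifp, 0);
 *
 *	while (!nm_ring_empty(ring)) {
 *		struct netmap_slot *slot = &ring->slot[ring->cur];
 *
 *		consume(NETMAP_BUF(ring, slot->buf_idx), slot->len);
 *		ring->head = ring->cur = nm_ring_next(ring, ring->cur);
 *	}
 *	// the next poll()/NIOCRXSYNC runs nm_rxsync_prologue() on these values
 */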
1591 
1592 /*
1593  * Error routine called when txsync/rxsync detects an error.
1594  * Can't do much more than resetting head = cur = hwcur, tail = hwtail.
1595  * Return 1 on reinit.
1596  *
1597  * This routine is only called by the upper half of the kernel.
1598  * It only reads hwcur (which is changed only by the upper half, too)
1599  * and hwtail (which may be changed by the lower half, but only on
1600  * a tx ring and only to increase it, so any error will be recovered
1601  * on the next call). For the above, we don't strictly need to call
1602  * it under lock.
1603  */
1604 int
1605 netmap_ring_reinit(struct netmap_kring *kring)
1606 {
1607 	struct netmap_ring *ring = kring->ring;
1608 	u_int i, lim = kring->nkr_num_slots - 1;
1609 	int errors = 0;
1610 
1611 	// XXX KASSERT nm_kr_tryget
1612 	RD(10, "called for %s", kring->name);
1613 	// XXX probably wrong to trust userspace
1614 	kring->rhead = ring->head;
1615 	kring->rcur  = ring->cur;
1616 	kring->rtail = ring->tail;
1617 
1618 	if (ring->cur > lim)
1619 		errors++;
1620 	if (ring->head > lim)
1621 		errors++;
1622 	if (ring->tail > lim)
1623 		errors++;
1624 	for (i = 0; i <= lim; i++) {
1625 		u_int idx = ring->slot[i].buf_idx;
1626 		u_int len = ring->slot[i].len;
1627 		if (idx < 2 || idx >= kring->na->na_lut.objtotal) {
1628 			RD(5, "bad index at slot %d idx %d len %d ", i, idx, len);
1629 			ring->slot[i].buf_idx = 0;
1630 			ring->slot[i].len = 0;
1631 		} else if (len > NETMAP_BUF_SIZE(kring->na)) {
1632 			ring->slot[i].len = 0;
1633 			RD(5, "bad len at slot %d idx %d len %d", i, idx, len);
1634 		}
1635 	}
1636 	if (errors) {
1637 		RD(10, "total %d errors", errors);
1638 		RD(10, "%s reinit, cur %d -> %d tail %d -> %d",
1639 			kring->name,
1640 			ring->cur, kring->nr_hwcur,
1641 			ring->tail, kring->nr_hwtail);
1642 		ring->head = kring->rhead = kring->nr_hwcur;
1643 		ring->cur  = kring->rcur  = kring->nr_hwcur;
1644 		ring->tail = kring->rtail = kring->nr_hwtail;
1645 	}
1646 	return (errors ? 1 : 0);
1647 }
1648 
1649 /* interpret the ringid and flags fields of an nmreq, by translating them
1650  * into a pair of intervals of ring indices:
1651  *
1652  * [priv->np_txqfirst, priv->np_txqlast) and
1653  * [priv->np_rxqfirst, priv->np_rxqlast)
1654  *
1655  */
1656 int
1657 netmap_interp_ringid(struct netmap_priv_d *priv, uint16_t ringid, uint32_t flags)
1658 {
1659 	struct netmap_adapter *na = priv->np_na;
1660 	u_int j, i = ringid & NETMAP_RING_MASK;
1661 	u_int reg = flags & NR_REG_MASK;
1662 	enum txrx t;
1663 
1664 	if (reg == NR_REG_DEFAULT) {
1665 		/* convert from old ringid to flags */
1666 		if (ringid & NETMAP_SW_RING) {
1667 			reg = NR_REG_SW;
1668 		} else if (ringid & NETMAP_HW_RING) {
1669 			reg = NR_REG_ONE_NIC;
1670 		} else {
1671 			reg = NR_REG_ALL_NIC;
1672 		}
1673 		D("deprecated API, old ringid 0x%x -> ringid %x reg %d", ringid, i, reg);
1674 	}
1675 	switch (reg) {
1676 	case NR_REG_ALL_NIC:
1677 	case NR_REG_PIPE_MASTER:
1678 	case NR_REG_PIPE_SLAVE:
1679 		for_rx_tx(t) {
1680 			priv->np_qfirst[t] = 0;
1681 			priv->np_qlast[t] = nma_get_nrings(na, t);
1682 		}
1683 		ND("%s %d %d", "ALL/PIPE",
1684 			priv->np_qfirst[NR_RX], priv->np_qlast[NR_RX]);
1685 		break;
1686 	case NR_REG_SW:
1687 	case NR_REG_NIC_SW:
1688 		if (!(na->na_flags & NAF_HOST_RINGS)) {
1689 			D("host rings not supported");
1690 			return EINVAL;
1691 		}
1692 		for_rx_tx(t) {
1693 			priv->np_qfirst[t] = (reg == NR_REG_SW ?
1694 				nma_get_nrings(na, t) : 0);
1695 			priv->np_qlast[t] = nma_get_nrings(na, t) + 1;
1696 		}
1697 		ND("%s %d %d", reg == NR_REG_SW ? "SW" : "NIC+SW",
1698 			priv->np_qfirst[NR_RX], priv->np_qlast[NR_RX]);
1699 		break;
1700 	case NR_REG_ONE_NIC:
1701 		if (i >= na->num_tx_rings && i >= na->num_rx_rings) {
1702 			D("invalid ring id %d", i);
1703 			return EINVAL;
1704 		}
1705 		for_rx_tx(t) {
1706 			/* if not enough rings, use the first one */
1707 			j = i;
1708 			if (j >= nma_get_nrings(na, t))
1709 				j = 0;
1710 			priv->np_qfirst[t] = j;
1711 			priv->np_qlast[t] = j + 1;
1712 		}
1713 		break;
1714 	default:
1715 		D("invalid regif type %d", reg);
1716 		return EINVAL;
1717 	}
1718 	priv->np_flags = (flags & ~NR_REG_MASK) | reg;
1719 
1720 	if (netmap_verbose) {
1721 		D("%s: tx [%d,%d) rx [%d,%d) id %d",
1722 			na->name,
1723 			priv->np_qfirst[NR_TX],
1724 			priv->np_qlast[NR_TX],
1725 			priv->np_qfirst[NR_RX],
1726 			priv->np_qlast[NR_RX],
1727 			i);
1728 	}
1729 	return 0;
1730 }
1731 
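/*
 * Example (editorial sketch, "em0" and the ring number are hypothetical):
 * a userspace request that binds only hardware ring 2 of a NIC fills the
 * nmreq as follows before issuing NIOCREGIF:
 *
 *	struct nmreq nmr = { .nr_version = NETMAP_API };
 *
 *	strncpy(nmr.nr_name, "em0", sizeof(nmr.nr_name));
 *	nmr.nr_flags = NR_REG_ONE_NIC;
 *	nmr.nr_ringid = 2;	// i = ringid & NETMAP_RING_MASK above
 *
 * netmap_interp_ringid() then sets np_qfirst[t] = 2 and np_qlast[t] = 3
 * for both NR_TX and NR_RX (falling back to ring 0 on adapters with
 * fewer rings).
 */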
1732 
1733 /*
1734  * Set the ring ID. For devices with a single queue, a request
1735  * for all rings is the same as a single ring.
1736  */
1737 static int
1738 netmap_set_ringid(struct netmap_priv_d *priv, uint16_t ringid, uint32_t flags)
1739 {
1740 	struct netmap_adapter *na = priv->np_na;
1741 	int error;
1742 	enum txrx t;
1743 
1744 	error = netmap_interp_ringid(priv, ringid, flags);
1745 	if (error) {
1746 		return error;
1747 	}
1748 
1749 	priv->np_txpoll = (ringid & NETMAP_NO_TX_POLL) ? 0 : 1;
1750 
1751 	/* optimization: count the users registered for more than
1752 	 * one ring, which are the ones sleeping on the global queue.
1753 	 * The default netmap_notify() callback will then
1754 	 * avoid signaling the global queue if nobody is using it
1755 	 */
1756 	for_rx_tx(t) {
1757 		if (nm_si_user(priv, t))
1758 			na->si_users[t]++;
1759 	}
1760 	return 0;
1761 }
1762 
1763 static void
1764 netmap_unset_ringid(struct netmap_priv_d *priv)
1765 {
1766 	struct netmap_adapter *na = priv->np_na;
1767 	enum txrx t;
1768 
1769 	for_rx_tx(t) {
1770 		if (nm_si_user(priv, t))
1771 			na->si_users[t]--;
1772 		priv->np_qfirst[t] = priv->np_qlast[t] = 0;
1773 	}
1774 	priv->np_flags = 0;
1775 	priv->np_txpoll = 0;
1776 }
1777 
1778 
1779 /* check that the rings we want to bind are not exclusively owned by a previous
1780  * bind.  If exclusive ownership has been requested, we also mark the rings.
1781  */
1782 static int
1783 netmap_get_exclusive(struct netmap_priv_d *priv)
1784 {
1785 	struct netmap_adapter *na = priv->np_na;
1786 	u_int i;
1787 	struct netmap_kring *kring;
1788 	int excl = (priv->np_flags & NR_EXCLUSIVE);
1789 	enum txrx t;
1790 
1791 	ND("%s: grabbing tx [%d, %d) rx [%d, %d)",
1792 			na->name,
1793 			priv->np_qfirst[NR_TX],
1794 			priv->np_qlast[NR_TX],
1795 			priv->np_qfirst[NR_RX],
1796 			priv->np_qlast[NR_RX]);
1797 
1798 	/* first round: check that all the requested rings
1799 	 * are neither already exclusively owned, nor already
1800 	 * in use when we are requesting exclusive ownership
1801 	 */
1802 	for_rx_tx(t) {
1803 		for (i = priv->np_qfirst[t]; i < priv->np_qlast[t]; i++) {
1804 			kring = &NMR(na, t)[i];
1805 			if ((kring->nr_kflags & NKR_EXCLUSIVE) ||
1806 			    (kring->users && excl))
1807 			{
1808 				ND("ring %s busy", kring->name);
1809 				return EBUSY;
1810 			}
1811 		}
1812 	}
1813 
1814 	/* second round: increment usage count and possibly
1815 	 * mark as exclusive
1816 	 */
1817 
1818 	for_rx_tx(t) {
1819 		for (i = priv->np_qfirst[t]; i < priv->np_qlast[t]; i++) {
1820 			kring = &NMR(na, t)[i];
1821 			kring->users++;
1822 			if (excl)
1823 				kring->nr_kflags |= NKR_EXCLUSIVE;
1824 		}
1825 	}
1826 
1827 	return 0;
1828 
1829 }
1830 
1831 /* undo netmap_get_exclusive() */
1832 static void
1833 netmap_rel_exclusive(struct netmap_priv_d *priv)
1834 {
1835 	struct netmap_adapter *na = priv->np_na;
1836 	u_int i;
1837 	struct netmap_kring *kring;
1838 	int excl = (priv->np_flags & NR_EXCLUSIVE);
1839 	enum txrx t;
1840 
1841 	ND("%s: releasing tx [%d, %d) rx [%d, %d)",
1842 			na->name,
1843 			priv->np_qfirst[NR_TX],
1844 			priv->np_qlast[NR_TX],
1845 			priv->np_qfirst[NR_RX],
1846 			priv->np_qlast[NR_RX]);
1847 
1848 
1849 	for_rx_tx(t) {
1850 		for (i = priv->np_qfirst[t]; i < priv->np_qlast[t]; i++) {
1851 			kring = &NMR(na, t)[i];
1852 			if (excl)
1853 				kring->nr_kflags &= ~NKR_EXCLUSIVE;
1854 			kring->users--;
1855 		}
1856 	}
1857 }
1858 
1859 /*
1860  * possibly move the interface to netmap-mode.
1861  * On success it returns 0 (and sets priv->np_nifp), an error code otherwise.
1862  * This must be called with NMG_LOCK held.
1863  *
1864  * The following na callbacks are called in the process:
1865  *
1866  * na->nm_config()			[by netmap_update_config]
1867  * (get current number and size of rings)
1868  *
1869  *  	We have a generic one for linux (netmap_linux_config).
1870  *  	The bwrap has to override this, since it has to forward
1871  *  	the request to the wrapped adapter (netmap_bwrap_config).
1872  *
1873  *
1874  * na->nm_krings_create()
1875  * (create and init the krings array)
1876  *
1877  * 	One of the following:
1878  *
1879  *	* netmap_hw_krings_create, 			(hw ports)
1880  *		creates the standard layout for the krings
1881  * 		and adds the mbq (used for the host rings).
1882  *
1883  * 	* netmap_vp_krings_create			(VALE ports)
1884  * 		add leases and scratchpads
1885  *
1886  * 	* netmap_pipe_krings_create			(pipes)
1887  * 		create the krings and rings of both ends and
1888  * 		cross-link them
1889  *
1890  *      * netmap_monitor_krings_create 			(monitors)
1891  *      	avoid allocating the mbq
1892  *
1893  *      * netmap_bwrap_krings_create			(bwraps)
1894  *      	create both the bwrap krings array,
1895  *      	the krings array of the wrapped adapter, and
1896  *      	(if needed) the fake array for the host adapter
1897  *
1898  * na->nm_register(, 1)
1899  * (put the adapter in netmap mode)
1900  *
1901  * 	This may be one of the following:
1902  * 	(XXX these should be either all *_register or all *_reg 2014-03-15)
1903  *
1904  * 	* netmap_hw_register				(hw ports)
1905  * 		checks that the ifp is still there, then calls
1906  * 		the hardware specific callback;
1907  *
1908  * 	* netmap_vp_reg					(VALE ports)
1909  *		If the port is connected to a bridge,
1910  *		set the NAF_NETMAP_ON flag under the
1911  *		bridge write lock.
1912  *
1913  *	* netmap_pipe_reg				(pipes)
1914  *		inform the other pipe end that it is no
1915  *		longer responsible for the lifetime of this
1916  *		pipe end
1917  *
1918  *	* netmap_monitor_reg				(monitors)
1919  *		intercept the sync callbacks of the monitored
1920  *		rings
1921  *
1922  *	* netmap_bwrap_register				(bwraps)
1923  *		cross-link the bwrap and hwna rings,
1924  *		forward the request to the hwna, override
1925  *		the hwna notify callback (to get the frames
1926  *		coming from outside go through the bridge).
1927  *
1928  *
1929  */
1930 int
1931 netmap_do_regif(struct netmap_priv_d *priv, struct netmap_adapter *na,
1932 	uint16_t ringid, uint32_t flags)
1933 {
1934 	struct netmap_if *nifp = NULL;
1935 	int error;
1936 
1937 	NMG_LOCK_ASSERT();
1938 	/* ring configuration may have changed, fetch from the card */
1939 	netmap_update_config(na);
1940 	priv->np_na = na;     /* store the reference */
1941 	error = netmap_set_ringid(priv, ringid, flags);
1942 	if (error)
1943 		goto err;
1944 	error = netmap_mem_finalize(na->nm_mem, na);
1945 	if (error)
1946 		goto err;
1947 
1948 	if (na->active_fds == 0) {
1949 		/*
1950 		 * If this is the first registration of the adapter,
1951 		 * also create the netmap rings and their in-kernel view,
1952 		 * the netmap krings.
1953 		 */
1954 
1955 		/*
1956 		 * Depending on the adapter, this may also create
1957 		 * the netmap rings themselves
1958 		 */
1959 		error = na->nm_krings_create(na);
1960 		if (error)
1961 			goto err_drop_mem;
1962 
1963 		/* create all missing netmap rings */
1964 		error = netmap_mem_rings_create(na);
1965 		if (error)
1966 			goto err_del_krings;
1967 	}
1968 
1969 	/* now the kring must exist and we can check whether some
1970 	 * previous bind has exclusive ownership on them
1971 	 */
1972 	error = netmap_get_exclusive(priv);
1973 	if (error)
1974 		goto err_del_rings;
1975 
1976 	/* in all cases, create a new netmap if */
1977 	nifp = netmap_mem_if_new(na);
1978 	if (nifp == NULL) {
1979 		error = ENOMEM;
1980 		goto err_rel_excl;
1981 	}
1982 
1983 	na->active_fds++;
1984 	if (!nm_netmap_on(na)) {
1985 		/* Netmap not active, set the card in netmap mode
1986 		 * and make it use the shared buffers.
1987 		 */
1988 		/* cache the allocator info in the na */
1989 		netmap_mem_get_lut(na->nm_mem, &na->na_lut);
1990 		ND("%p->na_lut == %p", na, na->na_lut.lut);
1991 		error = na->nm_register(na, 1); /* mode on */
1992 		if (error)
1993 			goto err_del_if;
1994 	}
1995 
1996 	/*
1997 	 * advertise that the interface is ready by setting np_nifp.
1998 	 * The barrier is needed because readers (poll, *SYNC and mmap)
1999 	 * check for priv->np_nifp != NULL without locking
2000 	 */
2001 	mb(); /* make sure previous writes are visible to all CPUs */
2002 	priv->np_nifp = nifp;
2003 
2004 	return 0;
2005 
2006 err_del_if:
2007 	memset(&na->na_lut, 0, sizeof(na->na_lut));
2008 	na->active_fds--;
2009 	netmap_mem_if_delete(na, nifp);
2010 err_rel_excl:
2011 	netmap_rel_exclusive(priv);
2012 err_del_rings:
2013 	if (na->active_fds == 0)
2014 		netmap_mem_rings_delete(na);
2015 err_del_krings:
2016 	if (na->active_fds == 0)
2017 		na->nm_krings_delete(na);
2018 err_drop_mem:
2019 	netmap_mem_deref(na->nm_mem, na);
2020 err:
2021 	priv->np_na = NULL;
2022 	return error;
2023 }
2024 
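/*
 * Editorial sketch of the sequence performed by netmap_do_regif() above
 * for the first file descriptor bound to an adapter (error unwinding
 * omitted, see the err_* labels for that):
 *
 *	netmap_update_config(na);		// refresh ring count/size
 *	netmap_set_ringid(priv, ringid, flags);	// compute np_qfirst/np_qlast
 *	netmap_mem_finalize(na->nm_mem, na);	// allocator ready
 *	na->nm_krings_create(na);		// krings (and maybe rings)
 *	netmap_mem_rings_create(na);		// remaining netmap rings
 *	netmap_get_exclusive(priv);		// ownership checks
 *	nifp = netmap_mem_if_new(na);		// netmap_if for this fd
 *	na->nm_register(na, 1);			// only if not already on
 *	priv->np_nifp = nifp;			// publish after the barrier
 */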
2025 
2026 /*
2027  * update kring and ring at the end of txsync.
2028  */
2029 static inline void
2030 nm_txsync_finalize(struct netmap_kring *kring)
2031 {
2032 	/* update ring tail to what the kernel knows */
2033 	kring->ring->tail = kring->rtail = kring->nr_hwtail;
2034 
2035 	/* note, head/rhead/hwcur might be behind cur/rcur
2036 	 * if no carrier
2037 	 */
2038 	ND(5, "%s now hwcur %d hwtail %d head %d cur %d tail %d",
2039 		kring->name, kring->nr_hwcur, kring->nr_hwtail,
2040 		kring->rhead, kring->rcur, kring->rtail);
2041 }
2042 
2043 
2044 /*
2045  * update kring and ring at the end of rxsync
2046  */
2047 static inline void
2048 nm_rxsync_finalize(struct netmap_kring *kring)
2049 {
2050 	/* tell userspace that there might be new packets */
2051 	//struct netmap_ring *ring = kring->ring;
2052 	ND("head %d cur %d tail %d -> %d", kring->ring->head, kring->ring->cur,
2053 		kring->ring->tail, kring->nr_hwtail);
2054 	kring->ring->tail = kring->rtail = kring->nr_hwtail;
2055 	/* make a copy of the state for next round */
2056 	kring->rhead = kring->ring->head;
2057 	kring->rcur = kring->ring->cur;
2058 }
2059 
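/*
 * Editorial sketch: the canonical way the upper half drives a tx ring,
 * exactly as done below in netmap_ioctl() and netmap_poll() (the rx path
 * is symmetric, using the *_rxsync_* variants):
 *
 *	if (nm_txsync_prologue(kring) >= kring->nkr_num_slots) {
 *		netmap_ring_reinit(kring);	// userspace passed bad values
 *	} else if (kring->nm_sync(kring, 0) == 0) {
 *		nm_txsync_finalize(kring);	// publish the new tail
 *	}
 */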
2060 
2061 
2062 /*
2063  * ioctl(2) support for the "netmap" device.
2064  *
2065  * Following a list of accepted commands:
2066  * - NIOCGINFO
2067  * - SIOCGIFADDR	just for convenience
2068  * - NIOCREGIF
2069  * - NIOCTXSYNC
2070  * - NIOCRXSYNC
2071  *
2072  * Return 0 on success, errno otherwise.
2073  */
2074 int
2075 netmap_ioctl(struct cdev *dev, u_long cmd, caddr_t data,
2076 	int fflag, struct thread *td)
2077 {
2078 	struct netmap_priv_d *priv = NULL;
2079 	struct nmreq *nmr = (struct nmreq *) data;
2080 	struct netmap_adapter *na = NULL;
2081 	int error;
2082 	u_int i, qfirst, qlast;
2083 	struct netmap_if *nifp;
2084 	struct netmap_kring *krings;
2085 	enum txrx t;
2086 
2087 	(void)dev;	/* UNUSED */
2088 	(void)fflag;	/* UNUSED */
2089 
2090 	if (cmd == NIOCGINFO || cmd == NIOCREGIF) {
2091 		/* truncate name */
2092 		nmr->nr_name[sizeof(nmr->nr_name) - 1] = '\0';
2093 		if (nmr->nr_version != NETMAP_API) {
2094 			D("API mismatch for %s got %d need %d",
2095 				nmr->nr_name,
2096 				nmr->nr_version, NETMAP_API);
2097 			nmr->nr_version = NETMAP_API;
2098 		}
2099 		if (nmr->nr_version < NETMAP_MIN_API ||
2100 		    nmr->nr_version > NETMAP_MAX_API) {
2101 			return EINVAL;
2102 		}
2103 	}
2104 	CURVNET_SET(TD_TO_VNET(td));
2105 
2106 	error = devfs_get_cdevpriv((void **)&priv);
2107 	if (error) {
2108 		CURVNET_RESTORE();
2109 		/* XXX ENOENT should be impossible, since the priv
2110 		 * is now created in the open */
2111 		return (error == ENOENT ? ENXIO : error);
2112 	}
2113 
2114 	switch (cmd) {
2115 	case NIOCGINFO:		/* return capabilities etc */
2116 		if (nmr->nr_cmd == NETMAP_BDG_LIST) {
2117 			error = netmap_bdg_ctl(nmr, NULL);
2118 			break;
2119 		}
2120 
2121 		NMG_LOCK();
2122 		do {
2123 			/* memsize is always valid */
2124 			struct netmap_mem_d *nmd = &nm_mem;
2125 			u_int memflags;
2126 
2127 			if (nmr->nr_name[0] != '\0') {
2128 				/* get a refcount */
2129 				error = netmap_get_na(nmr, &na, 1 /* create */);
2130 				if (error)
2131 					break;
2132 				nmd = na->nm_mem; /* get memory allocator */
2133 			}
2134 
2135 			error = netmap_mem_get_info(nmd, &nmr->nr_memsize, &memflags,
2136 				&nmr->nr_arg2);
2137 			if (error)
2138 				break;
2139 			if (na == NULL) /* only memory info */
2140 				break;
2141 			nmr->nr_offset = 0;
2142 			nmr->nr_rx_slots = nmr->nr_tx_slots = 0;
2143 			netmap_update_config(na);
2144 			nmr->nr_rx_rings = na->num_rx_rings;
2145 			nmr->nr_tx_rings = na->num_tx_rings;
2146 			nmr->nr_rx_slots = na->num_rx_desc;
2147 			nmr->nr_tx_slots = na->num_tx_desc;
2148 			netmap_adapter_put(na);
2149 		} while (0);
2150 		NMG_UNLOCK();
2151 		break;
2152 
2153 	case NIOCREGIF:
2154 		/* possibly attach/detach NIC and VALE switch */
2155 		i = nmr->nr_cmd;
2156 		if (i == NETMAP_BDG_ATTACH || i == NETMAP_BDG_DETACH
2157 				|| i == NETMAP_BDG_VNET_HDR
2158 				|| i == NETMAP_BDG_NEWIF
2159 				|| i == NETMAP_BDG_DELIF) {
2160 			error = netmap_bdg_ctl(nmr, NULL);
2161 			break;
2162 		} else if (i != 0) {
2163 			D("nr_cmd must be 0 not %d", i);
2164 			error = EINVAL;
2165 			break;
2166 		}
2167 
2168 		/* protect access to priv from concurrent NIOCREGIF */
2169 		NMG_LOCK();
2170 		do {
2171 			u_int memflags;
2172 
2173 			if (priv->np_nifp != NULL) {	/* thread already registered */
2174 				error = EBUSY;
2175 				break;
2176 			}
2177 			/* find the interface and a reference */
2178 			error = netmap_get_na(nmr, &na, 1 /* create */); /* keep reference */
2179 			if (error)
2180 				break;
2181 			if (NETMAP_OWNED_BY_KERN(na)) {
2182 				netmap_adapter_put(na);
2183 				error = EBUSY;
2184 				break;
2185 			}
2186 			error = netmap_do_regif(priv, na, nmr->nr_ringid, nmr->nr_flags);
2187 			if (error) {    /* reg. failed, release priv and ref */
2188 				netmap_adapter_put(na);
2189 				break;
2190 			}
2191 			nifp = priv->np_nifp;
2192 			priv->np_td = td; // XXX kqueue, debugging only
2193 
2194 			/* return the offset of the netmap_if object */
2195 			nmr->nr_rx_rings = na->num_rx_rings;
2196 			nmr->nr_tx_rings = na->num_tx_rings;
2197 			nmr->nr_rx_slots = na->num_rx_desc;
2198 			nmr->nr_tx_slots = na->num_tx_desc;
2199 			error = netmap_mem_get_info(na->nm_mem, &nmr->nr_memsize, &memflags,
2200 				&nmr->nr_arg2);
2201 			if (error) {
2202 				netmap_do_unregif(priv);
2203 				netmap_adapter_put(na);
2204 				break;
2205 			}
2206 			if (memflags & NETMAP_MEM_PRIVATE) {
2207 				*(uint32_t *)(uintptr_t)&nifp->ni_flags |= NI_PRIV_MEM;
2208 			}
2209 			for_rx_tx(t) {
2210 				priv->np_si[t] = nm_si_user(priv, t) ?
2211 					&na->si[t] : &NMR(na, t)[priv->np_qfirst[t]].si;
2212 			}
2213 
2214 			if (nmr->nr_arg3) {
2215 				D("requested %d extra buffers", nmr->nr_arg3);
2216 				nmr->nr_arg3 = netmap_extra_alloc(na,
2217 					&nifp->ni_bufs_head, nmr->nr_arg3);
2218 				D("got %d extra buffers", nmr->nr_arg3);
2219 			}
2220 			nmr->nr_offset = netmap_mem_if_offset(na->nm_mem, nifp);
2221 		} while (0);
2222 		NMG_UNLOCK();
2223 		break;
2224 
2225 	case NIOCTXSYNC:
2226 	case NIOCRXSYNC:
2227 		nifp = priv->np_nifp;
2228 
2229 		if (nifp == NULL) {
2230 			error = ENXIO;
2231 			break;
2232 		}
2233 		mb(); /* make sure following reads are not from cache */
2234 
2235 		na = priv->np_na;      /* we have a reference */
2236 
2237 		if (na == NULL) {
2238 			D("Internal error: nifp != NULL && na == NULL");
2239 			error = ENXIO;
2240 			break;
2241 		}
2242 
2243 		if (!nm_netmap_on(na)) {
2244 			error = ENXIO;
2245 			break;
2246 		}
2247 
2248 		t = (cmd == NIOCTXSYNC ? NR_TX : NR_RX);
2249 		krings = NMR(na, t);
2250 		qfirst = priv->np_qfirst[t];
2251 		qlast = priv->np_qlast[t];
2252 
2253 		for (i = qfirst; i < qlast; i++) {
2254 			struct netmap_kring *kring = krings + i;
2255 			if (nm_kr_tryget(kring)) {
2256 				error = EBUSY;
2257 				goto out;
2258 			}
2259 			if (cmd == NIOCTXSYNC) {
2260 				if (netmap_verbose & NM_VERB_TXSYNC)
2261 					D("pre txsync ring %d cur %d hwcur %d",
2262 					    i, kring->ring->cur,
2263 					    kring->nr_hwcur);
2264 				if (nm_txsync_prologue(kring) >= kring->nkr_num_slots) {
2265 					netmap_ring_reinit(kring);
2266 				} else if (kring->nm_sync(kring, NAF_FORCE_RECLAIM) == 0) {
2267 					nm_txsync_finalize(kring);
2268 				}
2269 				if (netmap_verbose & NM_VERB_TXSYNC)
2270 					D("post txsync ring %d cur %d hwcur %d",
2271 					    i, kring->ring->cur,
2272 					    kring->nr_hwcur);
2273 			} else {
2274 				if (nm_rxsync_prologue(kring) >= kring->nkr_num_slots) {
2275 					netmap_ring_reinit(kring);
2276 				} else if (kring->nm_sync(kring, NAF_FORCE_READ) == 0) {
2277 					nm_rxsync_finalize(kring);
2278 				}
2279 				microtime(&na->rx_rings[i].ring->ts);
2280 			}
2281 			nm_kr_put(kring);
2282 		}
2283 
2284 		break;
2285 
2286 #ifdef WITH_VALE
2287 	case NIOCCONFIG:
2288 		error = netmap_bdg_config(nmr);
2289 		break;
2290 #endif
2291 #ifdef __FreeBSD__
2292 	case FIONBIO:
2293 	case FIOASYNC:
2294 		ND("FIONBIO/FIOASYNC are no-ops");
2295 		break;
2296 
2297 	case BIOCIMMEDIATE:
2298 	case BIOCGHDRCMPLT:
2299 	case BIOCSHDRCMPLT:
2300 	case BIOCSSEESENT:
2301 		D("ignore BIOCIMMEDIATE/BIOCGHDRCMPLT/BIOCSHDRCMPLT/BIOCSSEESENT");
2302 		break;
2303 
2304 	default:	/* allow device-specific ioctls */
2305 	    {
2306 		struct ifnet *ifp = ifunit_ref(nmr->nr_name);
2307 		if (ifp == NULL) {
2308 			error = ENXIO;
2309 		} else {
2310 			struct socket so;
2311 
2312 			bzero(&so, sizeof(so));
2313 			so.so_vnet = ifp->if_vnet;
2314 			// so->so_proto not null.
2315 			error = ifioctl(&so, cmd, data, td);
2316 			if_rele(ifp);
2317 		}
2318 		break;
2319 	    }
2320 
2321 #else /* linux */
2322 	default:
2323 		error = EOPNOTSUPP;
2324 #endif /* linux */
2325 	}
2326 out:
2327 
2328 	CURVNET_RESTORE();
2329 	return (error);
2330 }
2331 
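/*
 * Illustrative userspace sequence (editorial sketch, error checks omitted
 * and "em0" is hypothetical) exercising the commands handled above:
 *
 *	int fd = open("/dev/netmap", O_RDWR);
 *	struct nmreq nmr = { .nr_version = NETMAP_API };
 *
 *	strncpy(nmr.nr_name, "em0", sizeof(nmr.nr_name));
 *	ioctl(fd, NIOCREGIF, &nmr);		// bind the interface
 *	void *mem = mmap(NULL, nmr.nr_memsize,
 *	    PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
 *	struct netmap_if *nifp = NETMAP_IF(mem, nmr.nr_offset);
 *	...
 *	ioctl(fd, NIOCTXSYNC, NULL);		// flush pending transmissions
 *	ioctl(fd, NIOCRXSYNC, NULL);		// collect received packets
 */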
2332 
2333 /*
2334  * select(2) and poll(2) handlers for the "netmap" device.
2335  *
2336  * Can be called for one or more queues.
2337  * Return the event mask corresponding to ready events.
2338  * If there are no ready events, do a selrecord on either individual
2339  * selinfo or on the global one.
2340  * Device-dependent parts (locking and sync of tx/rx rings)
2341  * are done through callbacks.
2342  *
2343  * On linux, the arguments are really pwait, the poll table, and 'td' is struct file *.
2344  * The first one is remapped to pwait as selrecord() uses the name as a
2345  * hidden argument.
2346  */
2347 int
2348 netmap_poll(struct cdev *dev, int events, struct thread *td)
2349 {
2350 	struct netmap_priv_d *priv = NULL;
2351 	struct netmap_adapter *na;
2352 	struct netmap_kring *kring;
2353 	u_int i, check_all_tx, check_all_rx, want[NR_TXRX], revents = 0;
2354 #define want_tx want[NR_TX]
2355 #define want_rx want[NR_RX]
2356 	struct mbq q;		/* packets from hw queues to host stack */
2357 	void *pwait = dev;	/* linux compatibility */
2358 	int is_kevent = 0;
2359 	enum txrx t;
2360 
2361 	/*
2362 	 * In order to avoid nested locks, we need to "double check"
2363 	 * txsync and rxsync if we decide to do a selrecord().
2364 	 * retry_tx (and retry_rx, later) prevent looping forever.
2365 	 */
2366 	int retry_tx = 1, retry_rx = 1;
2367 
2368 	(void)pwait;
2369 	mbq_init(&q);
2370 
2371 	/*
2372 	 * XXX kevent has curthread->td_fpop == NULL,
2373 	 * so devfs_get_cdevpriv() fails. We circumvent this by passing
2374 	 * priv as the first argument, which is also useful to avoid
2375 	 * the selrecord() which are not necessary in that case.
2376 	 */
2377 	if (devfs_get_cdevpriv((void **)&priv) != 0) {
2378 		is_kevent = 1;
2379 		if (netmap_verbose)
2380 			D("called from kevent");
2381 		priv = (struct netmap_priv_d *)dev;
2382 	}
2383 	if (priv == NULL)
2384 		return POLLERR;
2385 
2386 	if (priv->np_nifp == NULL) {
2387 		D("No if registered");
2388 		return POLLERR;
2389 	}
2390 	mb(); /* make sure following reads are not from cache */
2391 
2392 	na = priv->np_na;
2393 
2394 	if (!nm_netmap_on(na))
2395 		return POLLERR;
2396 
2397 	if (netmap_verbose & 0x8000)
2398 		D("device %s events 0x%x", na->name, events);
2399 	want_tx = events & (POLLOUT | POLLWRNORM);
2400 	want_rx = events & (POLLIN | POLLRDNORM);
2401 
2402 
2403 	/*
2404 	 * check_all_{tx|rx} are set if the card has more than one queue AND
2405 	 * the file descriptor is bound to all of them. If so, we sleep on
2406 	 * the "global" selinfo, otherwise we sleep on individual selinfo
2407 	 * (FreeBSD only allows two selinfo's per file descriptor).
2408 	 * The interrupt routine in the driver wakes one or the other
2409 	 * (or both) depending on which clients are active.
2410 	 *
2411 	 * rxsync() is only called if we run out of buffers on a POLLIN.
2412 	 * txsync() is called if we run out of buffers on POLLOUT, or
2413 	 * there are pending packets to send. The latter can be disabled
2414 	 * passing NETMAP_NO_TX_POLL in the NIOCREG call.
2415 	 */
2416 	check_all_tx = nm_si_user(priv, NR_TX);
2417 	check_all_rx = nm_si_user(priv, NR_RX);
2418 
2419 	/*
2420 	 * We start with a lock free round which is cheap if we have
2421 	 * slots available. If this fails, then lock and call the sync
2422 	 * routines.
2423 	 */
2424 	for_rx_tx(t) {
2425 		for (i = priv->np_qfirst[t]; want[t] && i < priv->np_qlast[t]; i++) {
2426 			kring = &NMR(na, t)[i];
2427 			/* XXX compare ring->cur and kring->tail */
2428 			if (!nm_ring_empty(kring->ring)) {
2429 				revents |= want[t];
2430 				want[t] = 0;	/* also breaks the loop */
2431 			}
2432 		}
2433 	}
2434 
2435 	/*
2436 	 * If we want to push packets out (priv->np_txpoll) or
2437 	 * want_tx is still set, we must issue txsync calls
2438 	 * (on all rings, to avoid that the tx rings stall).
2439 	 * XXX should also check cur != hwcur on the tx rings.
2440 	 * Fortunately, normal tx mode has np_txpoll set.
2441 	 */
2442 	if (priv->np_txpoll || want_tx) {
2443 		/*
2444 		 * The first round checks if anyone is ready, if not
2445 		 * do a selrecord and another round to handle races.
2446 		 * want_tx goes to 0 if any space is found, and is
2447 		 * used to skip rings with no pending transmissions.
2448 		 */
2449 flush_tx:
2450 		for (i = priv->np_qfirst[NR_TX]; i < priv->np_qlast[NR_TX]; i++) {
2451 			int found = 0;
2452 
2453 			kring = &na->tx_rings[i];
2454 			if (!want_tx && kring->ring->cur == kring->nr_hwcur)
2455 				continue;
2456 			/* only one thread does txsync */
2457 			if (nm_kr_tryget(kring)) {
2458 				/* either busy or stopped
2459 				 * XXX if the ring is stopped, sleeping would
2460 				 * be better. In current code, however, we only
2461 				 * stop the rings for brief intervals (2014-03-14)
2462 				 */
2463 				if (netmap_verbose)
2464 					RD(2, "%p lost race on txring %d, ok",
2465 					    priv, i);
2466 				continue;
2467 			}
2468 			if (nm_txsync_prologue(kring) >= kring->nkr_num_slots) {
2469 				netmap_ring_reinit(kring);
2470 				revents |= POLLERR;
2471 			} else {
2472 				if (kring->nm_sync(kring, 0))
2473 					revents |= POLLERR;
2474 				else
2475 					nm_txsync_finalize(kring);
2476 			}
2477 
2478 			/*
2479 			 * If we found new slots, notify potential
2480 			 * listeners on the same ring.
2481 			 * Since we just did a txsync, look at the copies
2482 			 * of cur,tail in the kring.
2483 			 */
2484 			found = kring->rcur != kring->rtail;
2485 			nm_kr_put(kring);
2486 			if (found) { /* notify other listeners */
2487 				revents |= want_tx;
2488 				want_tx = 0;
2489 				kring->nm_notify(kring, 0);
2490 			}
2491 		}
2492 		if (want_tx && retry_tx && !is_kevent) {
2493 			OS_selrecord(td, check_all_tx ?
2494 			    &na->si[NR_TX] : &na->tx_rings[priv->np_qfirst[NR_TX]].si);
2495 			retry_tx = 0;
2496 			goto flush_tx;
2497 		}
2498 	}
2499 
2500 	/*
2501 	 * If want_rx is still set scan receive rings.
2502 	 * Do it on all rings because otherwise we starve.
2503 	 */
2504 	if (want_rx) {
2505 		int send_down = 0; /* transparent mode */
2506 		/* two rounds here for race avoidance */
2507 do_retry_rx:
2508 		for (i = priv->np_qfirst[NR_RX]; i < priv->np_qlast[NR_RX]; i++) {
2509 			int found = 0;
2510 
2511 			kring = &na->rx_rings[i];
2512 
2513 			if (nm_kr_tryget(kring)) {
2514 				if (netmap_verbose)
2515 					RD(2, "%p lost race on rxring %d, ok",
2516 					    priv, i);
2517 				continue;
2518 			}
2519 
2520 			if (nm_rxsync_prologue(kring) >= kring->nkr_num_slots) {
2521 				netmap_ring_reinit(kring);
2522 				revents |= POLLERR;
2523 			}
2524 			/* now we can use kring->rcur, rtail */
2525 
2526 			/*
2527 			 * transparent mode support: collect packets
2528 			 * from the rxring(s).
2529 			 * XXX NR_FORWARD should only be read on
2530 			 * physical or NIC ports
2531 			 */
2532 			if (netmap_fwd || (kring->ring->flags & NR_FORWARD)) {
2533 				ND(10, "forwarding some buffers up %d to %d",
2534 				    kring->nr_hwcur, kring->ring->cur);
2535 				netmap_grab_packets(kring, &q, netmap_fwd);
2536 			}
2537 
2538 			if (kring->nm_sync(kring, 0))
2539 				revents |= POLLERR;
2540 			else
2541 				nm_rxsync_finalize(kring);
2542 			if (netmap_no_timestamp == 0 ||
2543 					kring->ring->flags & NR_TIMESTAMP) {
2544 				microtime(&kring->ring->ts);
2545 			}
2546 			found = kring->rcur != kring->rtail;
2547 			nm_kr_put(kring);
2548 			if (found) {
2549 				revents |= want_rx;
2550 				retry_rx = 0;
2551 				kring->nm_notify(kring, 0);
2552 			}
2553 		}
2554 
2555 		/* transparent mode XXX only during first pass ? */
2556 		if (na->na_flags & NAF_HOST_RINGS) {
2557 			kring = &na->rx_rings[na->num_rx_rings];
2558 			if (check_all_rx
2559 			    && (netmap_fwd || kring->ring->flags & NR_FORWARD)) {
2560 				/* XXX fix to use kring fields */
2561 				if (nm_ring_empty(kring->ring))
2562 					send_down = netmap_rxsync_from_host(na, td, dev);
2563 				if (!nm_ring_empty(kring->ring))
2564 					revents |= want_rx;
2565 			}
2566 		}
2567 
2568 		if (retry_rx && !is_kevent)
2569 			OS_selrecord(td, check_all_rx ?
2570 			    &na->si[NR_RX] : &na->rx_rings[priv->np_qfirst[NR_RX]].si);
2571 		if (send_down > 0 || retry_rx) {
2572 			retry_rx = 0;
2573 			if (send_down)
2574 				goto flush_tx; /* and retry_rx */
2575 			else
2576 				goto do_retry_rx;
2577 		}
2578 	}
2579 
2580 	/*
2581 	 * Transparent mode: marked bufs on rx rings between
2582 	 * kring->nr_hwcur and ring->head
2583 	 * are passed to the other endpoint.
2584 	 *
2585 	 * In this mode we also scan the sw rxring, which in
2586 	 * turn passes packets up.
2587 	 *
2588 	 * XXX Transparent mode at the moment requires to bind all
2589 	 * rings to a single file descriptor.
2590 	 */
2591 
2592 	if (q.head && na->ifp != NULL)
2593 		netmap_send_up(na->ifp, &q);
2594 
2595 	return (revents);
2596 #undef want_tx
2597 #undef want_rx
2598 }
2599 
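/*
 * Illustrative userspace counterpart (editorial sketch): a registered
 * descriptor is normally driven through poll(2), which lands in
 * netmap_poll() above:
 *
 *	struct pollfd pfd = { .fd = fd, .events = POLLIN | POLLOUT };
 *
 *	for (;;) {
 *		poll(&pfd, 1, 1000);
 *		if (pfd.revents & POLLIN)
 *			;	// drain the rx rings
 *		if (pfd.revents & POLLOUT)
 *			;	// refill the tx rings
 *	}
 */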
2600 
2601 /*-------------------- driver support routines -------------------*/
2602 
2603 static int netmap_hw_krings_create(struct netmap_adapter *);
2604 
2605 /* default notify callback */
2606 static int
2607 netmap_notify(struct netmap_kring *kring, int flags)
2608 {
2609 	struct netmap_adapter *na = kring->na;
2610 	enum txrx t = kring->tx;
2611 
2612 	OS_selwakeup(&kring->si, PI_NET);
2613 	/* optimization: avoid a wake up on the global
2614 	 * queue if nobody has registered for more
2615 	 * than one ring
2616 	 */
2617 	if (na->si_users[t] > 0)
2618 		OS_selwakeup(&na->si[t], PI_NET);
2619 
2620 	return 0;
2621 }
2622 
2623 
2624 /* called by all routines that create netmap_adapters.
2625  * Attach na to the ifp (if any) and provide defaults
2626  * for optional callbacks. Defaults assume that we
2627  * are creating a hardware netmap_adapter.
2628  */
2629 int
2630 netmap_attach_common(struct netmap_adapter *na)
2631 {
2632 	struct ifnet *ifp = na->ifp;
2633 
2634 	if (na->num_tx_rings == 0 || na->num_rx_rings == 0) {
2635 		D("%s: invalid rings tx %d rx %d",
2636 			na->name, na->num_tx_rings, na->num_rx_rings);
2637 		return EINVAL;
2638 	}
2639 	/* ifp is NULL for virtual adapters (bwrap, non-persistent VALE ports,
2640 	 * pipes, monitors). For bwrap we actually have a non-null ifp for
2641 	 * use by the external modules, but that is set after this
2642 	 * function has been called.
2643 	 * XXX this is ugly, maybe split this function in two (2014-03-14)
2644 	 */
2645 	if (ifp != NULL) {
2646 		WNA(ifp) = na;
2647 
2648 	/* the following is only needed for na that use the host port.
2649 	 * XXX do we have something similar for linux ?
2650 	 */
2651 #ifdef __FreeBSD__
2652 		na->if_input = ifp->if_input; /* for netmap_send_up */
2653 #endif /* __FreeBSD__ */
2654 
2655 		NETMAP_SET_CAPABLE(ifp);
2656 	}
2657 	if (na->nm_krings_create == NULL) {
2658 		/* we assume that we have been called by a driver,
2659 		 * since other port types all provide their own
2660 		 * nm_krings_create
2661 		 */
2662 		na->nm_krings_create = netmap_hw_krings_create;
2663 		na->nm_krings_delete = netmap_hw_krings_delete;
2664 	}
2665 	if (na->nm_notify == NULL)
2666 		na->nm_notify = netmap_notify;
2667 	na->active_fds = 0;
2668 
2669 	if (na->nm_mem == NULL)
2670 		/* use the global allocator */
2671 		na->nm_mem = &nm_mem;
2672 	netmap_mem_get(na->nm_mem);
2673 #ifdef WITH_VALE
2674 	if (na->nm_bdg_attach == NULL)
2675 		/* no special nm_bdg_attach callback. On VALE
2676 		 * attach, we need to interpose a bwrap
2677 		 */
2678 		na->nm_bdg_attach = netmap_bwrap_attach;
2679 #endif
2680 	return 0;
2681 }
2682 
2683 
2684 /* standard cleanup, called by all destructors */
2685 void
2686 netmap_detach_common(struct netmap_adapter *na)
2687 {
2688 	if (na->ifp != NULL)
2689 		WNA(na->ifp) = NULL; /* XXX do we need this? */
2690 
2691 	if (na->tx_rings) { /* XXX should not happen */
2692 		D("freeing leftover tx_rings");
2693 		na->nm_krings_delete(na);
2694 	}
2695 	netmap_pipe_dealloc(na);
2696 	if (na->nm_mem)
2697 		netmap_mem_put(na->nm_mem);
2698 	bzero(na, sizeof(*na));
2699 	free(na, M_DEVBUF);
2700 }
2701 
2702 /* Wrapper for the register callback provided by hardware drivers.
2703  * na->ifp == NULL means that the driver module has been
2704  * unloaded, so we cannot call into it.
2705  * Note that module unloading, in our patched linux drivers,
2706  * happens under NMG_LOCK and after having stopped all the
2707  * nic rings (see netmap_detach). This provides sufficient
2708  * protection for the other driver-provided callbacks
2709  * (i.e., nm_config and nm_*xsync), which therefore don't need
2710  * to be wrapped.
2711  */
2712 static int
2713 netmap_hw_register(struct netmap_adapter *na, int onoff)
2714 {
2715 	struct netmap_hw_adapter *hwna =
2716 		(struct netmap_hw_adapter*)na;
2717 
2718 	if (na->ifp == NULL)
2719 		return onoff ? ENXIO : 0;
2720 
2721 	return hwna->nm_hw_register(na, onoff);
2722 }
2723 
2724 
2725 /*
2726  * Initialize a ``netmap_adapter`` object created by driver on attach.
2727  * We allocate a block of memory with room for a struct netmap_adapter
2728  * plus two sets of N+2 struct netmap_kring (where N is the number
2729  * of hardware rings):
2730  * krings	0..N-1	are for the hardware queues.
2731  * kring	N	is for the host stack queue
2732  * kring	N+1	is only used for the selinfo for all queues. // XXX still true ?
2733  * Return 0 on success, ENOMEM otherwise.
2734  */
2735 int
2736 netmap_attach(struct netmap_adapter *arg)
2737 {
2738 	struct netmap_hw_adapter *hwna = NULL;
2739 	// XXX when is arg == NULL ?
2740 	struct ifnet *ifp = arg ? arg->ifp : NULL;
2741 
2742 	if (arg == NULL || ifp == NULL)
2743 		goto fail;
2744 	hwna = malloc(sizeof(*hwna), M_DEVBUF, M_NOWAIT | M_ZERO);
2745 	if (hwna == NULL)
2746 		goto fail;
2747 	hwna->up = *arg;
2748 	hwna->up.na_flags |= NAF_HOST_RINGS | NAF_NATIVE;
2749 	strncpy(hwna->up.name, ifp->if_xname, sizeof(hwna->up.name));
2750 	hwna->nm_hw_register = hwna->up.nm_register;
2751 	hwna->up.nm_register = netmap_hw_register;
2752 	if (netmap_attach_common(&hwna->up)) {
2753 		free(hwna, M_DEVBUF);
2754 		goto fail;
2755 	}
2756 	netmap_adapter_get(&hwna->up);
2757 
2758 #ifdef linux
2759 	if (ifp->netdev_ops) {
2760 		/* prepare a clone of the netdev ops */
2761 #ifndef NETMAP_LINUX_HAVE_NETDEV_OPS
2762 		hwna->nm_ndo.ndo_start_xmit = ifp->netdev_ops;
2763 #else
2764 		hwna->nm_ndo = *ifp->netdev_ops;
2765 #endif
2766 	}
2767 	hwna->nm_ndo.ndo_start_xmit = linux_netmap_start_xmit;
2768 	if (ifp->ethtool_ops) {
2769 		hwna->nm_eto = *ifp->ethtool_ops;
2770 	}
2771 	hwna->nm_eto.set_ringparam = linux_netmap_set_ringparam;
2772 #ifdef NETMAP_LINUX_HAVE_SET_CHANNELS
2773 	hwna->nm_eto.set_channels = linux_netmap_set_channels;
2774 #endif
2775 	if (arg->nm_config == NULL) {
2776 		hwna->up.nm_config = netmap_linux_config;
2777 	}
2778 #endif /* linux */
2779 
2780 	if_printf(ifp, "netmap queues/slots: TX %d/%d, RX %d/%d\n",
2781 	    hwna->up.num_tx_rings, hwna->up.num_tx_desc,
2782 	    hwna->up.num_rx_rings, hwna->up.num_rx_desc);
2783 	return 0;
2784 
2785 fail:
2786 	D("fail, arg %p ifp %p na %p", arg, ifp, hwna);
2787 	if (ifp)
2788 		netmap_detach(ifp);
2789 	return (hwna ? EINVAL : ENOMEM);
2790 }
2791 
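/*
 * Editorial sketch of how a native driver calls netmap_attach() at the
 * end of its attach routine; the foo_* callbacks and the adapter fields
 * are hypothetical driver-side names:
 *
 *	struct netmap_adapter na;
 *
 *	bzero(&na, sizeof(na));
 *	na.ifp = ifp;
 *	na.num_tx_desc = adapter->num_tx_desc;
 *	na.num_rx_desc = adapter->num_rx_desc;
 *	na.nm_txsync = foo_netmap_txsync;
 *	na.nm_rxsync = foo_netmap_rxsync;
 *	na.nm_register = foo_netmap_reg;
 *	na.num_tx_rings = na.num_rx_rings = adapter->num_queues;
 *	netmap_attach(&na);
 */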
2792 
2793 void
2794 NM_DBG(netmap_adapter_get)(struct netmap_adapter *na)
2795 {
2796 	if (!na) {
2797 		return;
2798 	}
2799 
2800 	refcount_acquire(&na->na_refcount);
2801 }
2802 
2803 
2804 /* returns 1 iff the netmap_adapter is destroyed */
2805 int
2806 NM_DBG(netmap_adapter_put)(struct netmap_adapter *na)
2807 {
2808 	if (!na)
2809 		return 1;
2810 
2811 	if (!refcount_release(&na->na_refcount))
2812 		return 0;
2813 
2814 	if (na->nm_dtor)
2815 		na->nm_dtor(na);
2816 
2817 	netmap_detach_common(na);
2818 
2819 	return 1;
2820 }
2821 
2822 /* nm_krings_create callback for all hardware native adapters */
2823 int
2824 netmap_hw_krings_create(struct netmap_adapter *na)
2825 {
2826 	int ret = netmap_krings_create(na, 0);
2827 	if (ret == 0) {
2828 		/* initialize the mbq for the sw rx ring */
2829 		mbq_safe_init(&na->rx_rings[na->num_rx_rings].rx_queue);
2830 		ND("initialized sw rx queue %d", na->num_rx_rings);
2831 	}
2832 	return ret;
2833 }
2834 
2835 
2836 
2837 /*
2838  * Called on module unload by the netmap-enabled drivers
2839  */
2840 void
2841 netmap_detach(struct ifnet *ifp)
2842 {
2843 	struct netmap_adapter *na = NA(ifp);
2844 
2845 	if (!na)
2846 		return;
2847 
2848 	NMG_LOCK();
2849 	netmap_disable_all_rings(ifp);
2850 	na->ifp = NULL;
2851 	na->na_flags &= ~NAF_NETMAP_ON;
2852 	/*
2853 	 * if the netmap adapter is not native, somebody
2854 	 * changed it, so we can not release it here.
2855 	 * The NULL na->ifp will notify the new owner that
2856 	 * the driver is gone.
2857 	 */
2858 	if (na->na_flags & NAF_NATIVE) {
2859 	        netmap_adapter_put(na);
2860 	}
2861 	/* give them a chance to notice */
2862 	netmap_enable_all_rings(ifp);
2863 	NMG_UNLOCK();
2864 }
2865 
2866 
2867 /*
2868  * Intercept packets from the network stack and pass them
2869  * to netmap as incoming packets on the 'software' ring.
2870  *
2871  * We only store packets in a bounded mbq and then copy them
2872  * in the relevant rxsync routine.
2873  *
2874  * We rely on the OS to make sure that the ifp and na do not go
2875  * away (typically the caller checks for IFF_DRV_RUNNING or the like).
2876  * In nm_register() or whenever there is a reinitialization,
2877  * we make sure to make the mode change visible here.
2878  */
2879 int
2880 netmap_transmit(struct ifnet *ifp, struct mbuf *m)
2881 {
2882 	struct netmap_adapter *na = NA(ifp);
2883 	struct netmap_kring *kring;
2884 	u_int len = MBUF_LEN(m);
2885 	u_int error = ENOBUFS;
2886 	struct mbq *q;
2887 	int space;
2888 
2889 	kring = &na->rx_rings[na->num_rx_rings];
2890 	// XXX [Linux] we do not need this lock
2891 	// if we follow the down/configure/up protocol -gl
2892 	// mtx_lock(&na->core_lock);
2893 
2894 	if (!nm_netmap_on(na)) {
2895 		D("%s not in netmap mode anymore", na->name);
2896 		error = ENXIO;
2897 		goto done;
2898 	}
2899 
2900 	q = &kring->rx_queue;
2901 
2902 	// XXX reconsider long packets if we handle fragments
2903 	if (len > NETMAP_BUF_SIZE(na)) { /* too long for us */
2904 		D("%s from_host, drop packet size %d > %d", na->name,
2905 			len, NETMAP_BUF_SIZE(na));
2906 		goto done;
2907 	}
2908 
2909 	/* protect against rxsync_from_host(), netmap_sw_to_nic()
2910 	 * and maybe other instances of netmap_transmit (the latter
2911 	 * not possible on Linux).
2912 	 * Also avoid overflowing the queue.
2913 	 */
2914 	mbq_lock(q);
2915 
2916 	space = kring->nr_hwtail - kring->nr_hwcur;
2917 	if (space < 0)
2918 		space += kring->nkr_num_slots;
2919 	if (space + mbq_len(q) >= kring->nkr_num_slots - 1) { // XXX
2920 		RD(10, "%s full hwcur %d hwtail %d qlen %d len %d m %p",
2921 			na->name, kring->nr_hwcur, kring->nr_hwtail, mbq_len(q),
2922 			len, m);
2923 	} else {
2924 		mbq_enqueue(q, m);
2925 		ND(10, "%s %d bufs in queue len %d m %p",
2926 			na->name, mbq_len(q), len, m);
2927 		/* notify outside the lock */
2928 		m = NULL;
2929 		error = 0;
2930 	}
2931 	mbq_unlock(q);
2932 
2933 done:
2934 	if (m)
2935 		m_freem(m);
2936 	/* unconditionally wake up listeners */
2937 	kring->nm_notify(kring, 0);
2938 	/* this is normally netmap_notify(), but for nics
2939 	 * connected to a bridge it is netmap_bwrap_intr_notify(),
2940 	 * that possibly forwards the frames through the switch
2941 	 */
2942 
2943 	return (error);
2944 }
2945 
2946 
2947 /*
2948  * netmap_reset() is called by the driver routines when reinitializing
2949  * a ring. The driver is in charge of locking to protect the kring.
2950  * If native netmap mode is not set just return NULL.
2951  */
2952 struct netmap_slot *
2953 netmap_reset(struct netmap_adapter *na, enum txrx tx, u_int n,
2954 	u_int new_cur)
2955 {
2956 	struct netmap_kring *kring;
2957 	int new_hwofs, lim;
2958 
2959 	if (!nm_native_on(na)) {
2960 		ND("interface not in native netmap mode");
2961 		return NULL;	/* nothing to reinitialize */
2962 	}
2963 
2964 	/* XXX note- in the new scheme, we are not guaranteed to be
2965 	 * under lock (e.g. when called on a device reset).
2966 	 * In this case, we should set a flag and do not trust too
2967 	 * much the values. In practice: TODO
2968 	 * - set a RESET flag somewhere in the kring
2969 	 * - do the processing in a conservative way
2970 	 * - let the *sync() fixup at the end.
2971 	 */
2972 	if (tx == NR_TX) {
2973 		if (n >= na->num_tx_rings)
2974 			return NULL;
2975 		kring = na->tx_rings + n;
2976 		// XXX check whether we should use hwcur or rcur
2977 		new_hwofs = kring->nr_hwcur - new_cur;
2978 	} else {
2979 		if (n >= na->num_rx_rings)
2980 			return NULL;
2981 		kring = na->rx_rings + n;
2982 		new_hwofs = kring->nr_hwtail - new_cur;
2983 	}
2984 	lim = kring->nkr_num_slots - 1;
2985 	if (new_hwofs > lim)
2986 		new_hwofs -= lim + 1;
2987 
2988 	/* Always set the new offset value and realign the ring. */
2989 	if (netmap_verbose)
2990 	    D("%s %s%d hwofs %d -> %d, hwtail %d -> %d",
2991 		na->name,
2992 		tx == NR_TX ? "TX" : "RX", n,
2993 		kring->nkr_hwofs, new_hwofs,
2994 		kring->nr_hwtail,
2995 		tx == NR_TX ? lim : kring->nr_hwtail);
2996 	kring->nkr_hwofs = new_hwofs;
2997 	if (tx == NR_TX) {
2998 		kring->nr_hwtail = kring->nr_hwcur + lim;
2999 		if (kring->nr_hwtail > lim)
3000 			kring->nr_hwtail -= lim + 1;
3001 	}
3002 
3003 #if 0 // def linux
3004 	/* XXX check that the mappings are correct */
3005 	/* need ring_nr, adapter->pdev, direction */
3006 	buffer_info->dma = dma_map_single(&pdev->dev, addr, adapter->rx_buffer_len, DMA_FROM_DEVICE);
3007 	if (dma_mapping_error(&adapter->pdev->dev, buffer_info->dma)) {
3008 		D("error mapping rx netmap buffer %d", i);
3009 		// XXX fix error handling
3010 	}
3011 
3012 #endif /* linux */
3013 	/*
3014 	 * Wakeup on the individual and global selwait
3015 	 * We do the wakeup here, but the ring is not yet reconfigured.
3016 	 * However, we are under lock so there are no races.
3017 	 */
3018 	kring->nm_notify(kring, 0);
3019 	return kring->ring->slot;
3020 }
3021 
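/*
 * Editorial sketch of the expected use of netmap_reset() in a driver's
 * ring (re)initialization path (driver names are hypothetical): when it
 * returns non-NULL, the driver programs the NIC descriptors with the
 * netmap buffers instead of allocating mbufs.
 *
 *	struct netmap_slot *slot = netmap_reset(na, NR_RX, ring_nr, 0);
 *
 *	if (slot != NULL) {
 *		// netmap mode: point the hardware descriptors at the
 *		// netmap buffers (see NMB()) rather than at mbufs
 *	}
 */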
3022 
3023 /*
3024  * Dispatch rx/tx interrupts to the netmap rings.
3025  *
3026  * "work_done" is non-null on the RX path, NULL for the TX path.
3027  * We rely on the OS to make sure that there is only one active
3028  * instance per queue, and that there is appropriate locking.
3029  *
3030  * The 'notify' routine depends on what the ring is attached to.
3031  * - for a netmap file descriptor, do a selwakeup on the individual
3032  *   waitqueue, plus one on the global one if needed
3033  *   (see netmap_notify)
3034  * - for a nic connected to a switch, call the proper forwarding routine
3035  *   (see netmap_bwrap_intr_notify)
3036  */
3037 void
3038 netmap_common_irq(struct ifnet *ifp, u_int q, u_int *work_done)
3039 {
3040 	struct netmap_adapter *na = NA(ifp);
3041 	struct netmap_kring *kring;
3042 	enum txrx t = (work_done ? NR_RX : NR_TX);
3043 
3044 	q &= NETMAP_RING_MASK;
3045 
3046 	if (netmap_verbose) {
3047 		RD(5, "received %s queue %d", work_done ? "RX" : "TX", q);
3048 	}
3049 
3050 	if (q >= nma_get_nrings(na, t))
3051 		return;	// not a physical queue
3052 
3053 	kring = NMR(na, t) + q;
3054 
3055 	if (t == NR_RX) {
3056 		kring->nr_kflags |= NKR_PENDINTR;	// XXX atomic ?
3057 		*work_done = 1; /* do not fire napi again */
3058 	}
3059 	kring->nm_notify(kring, 0);
3060 }
3061 
3062 
3063 /*
3064  * Default functions to handle rx/tx interrupts from a physical device.
3065  * "work_done" is non-null on the RX path, NULL for the TX path.
3066  *
3067  * If the card is not in netmap mode, simply return 0,
3068  * so that the caller proceeds with regular processing.
3069  * Otherwise call netmap_common_irq() and return 1.
3070  *
3071  * If the card is connected to a netmap file descriptor,
3072  * do a selwakeup on the individual queue, plus one on the global one
3073  * if needed (multiqueue card _and_ there are multiqueue listeners),
3074  * and return 1.
3075  *
3076  * Finally, if called on rx from an interface connected to a switch,
3077  * calls the proper forwarding routine, and return 1.
3078  */
3079 int
3080 netmap_rx_irq(struct ifnet *ifp, u_int q, u_int *work_done)
3081 {
3082 	struct netmap_adapter *na = NA(ifp);
3083 
3084 	/*
3085 	 * XXX emulated netmap mode sets NAF_SKIP_INTR so
3086 	 * we still use the regular driver even though the previous
3087 	 * check fails. It is unclear whether we should use
3088 	 * nm_native_on() here.
3089 	 */
3090 	if (!nm_netmap_on(na))
3091 		return 0;
3092 
3093 	if (na->na_flags & NAF_SKIP_INTR) {
3094 		ND("use regular interrupt");
3095 		return 0;
3096 	}
3097 
3098 	netmap_common_irq(ifp, q, work_done);
3099 	return 1;
3100 }
3101 
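/*
 * Editorial sketch of the expected use in a driver interrupt handler
 * (again with hypothetical driver-side names): the handler returns early
 * when netmap has consumed the event.
 *
 *	// rx interrupt path
 *	if (netmap_rx_irq(adapter->ifp, rxr->me, &work_done))
 *		return;		// netmap listeners have been notified
 *	// ... regular mbuf-based processing otherwise ...
 *
 * The tx path typically goes through the companion netmap_tx_irq()
 * helper, which passes a NULL work_done.
 */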
3102 
3103 /*
3104  * Module loader and unloader
3105  *
3106  * netmap_init() creates the /dev/netmap device and initializes
3107  * all global variables. Returns 0 on success, errno on failure
3108  * (but there is no chance)
3109  *
3110  * netmap_fini() destroys everything.
3111  */
3112 
3113 static struct cdev *netmap_dev; /* /dev/netmap character device. */
3114 extern struct cdevsw netmap_cdevsw;
3115 
3116 
3117 void
3118 netmap_fini(void)
3119 {
3120 	netmap_uninit_bridges();
3121 	if (netmap_dev)
3122 		destroy_dev(netmap_dev);
3123 	netmap_mem_fini();
3124 	NMG_LOCK_DESTROY();
3125 	printf("netmap: unloaded module.\n");
3126 }
3127 
3128 
3129 int
3130 netmap_init(void)
3131 {
3132 	int error;
3133 
3134 	NMG_LOCK_INIT();
3135 
3136 	error = netmap_mem_init();
3137 	if (error != 0)
3138 		goto fail;
3139 	/*
3140 	 * MAKEDEV_ETERNAL_KLD avoids an expensive check on syscalls
3141 	 * when the module is compiled in.
3142 	 * XXX could use make_dev_credv() to get error number
3143 	 */
3144 	netmap_dev = make_dev_credf(MAKEDEV_ETERNAL_KLD,
3145 		&netmap_cdevsw, 0, NULL, UID_ROOT, GID_WHEEL, 0600,
3146 			      "netmap");
3147 	if (!netmap_dev)
3148 		goto fail;
3149 
3150 	error = netmap_init_bridges();
3151 	if (error)
3152 		goto fail;
3153 
3154 #ifdef __FreeBSD__
3155 	nm_vi_init_index();
3156 #endif
3157 
3158 	printf("netmap: loaded module\n");
3159 	return (0);
3160 fail:
3161 	netmap_fini();
3162 	return (EINVAL); /* may be incorrect */
3163 }
3164