xref: /freebsd/sys/dev/netmap/netmap.c (revision 0629b152762b06325dd75a41bcb0a2789514141b)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (C) 2011-2014 Matteo Landi
5  * Copyright (C) 2011-2016 Luigi Rizzo
6  * Copyright (C) 2011-2016 Giuseppe Lettieri
7  * Copyright (C) 2011-2016 Vincenzo Maffione
8  * All rights reserved.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  *   1. Redistributions of source code must retain the above copyright
14  *      notice, this list of conditions and the following disclaimer.
15  *   2. Redistributions in binary form must reproduce the above copyright
16  *      notice, this list of conditions and the following disclaimer in the
17  *      documentation and/or other materials provided with the distribution.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
20  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
23  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
25  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
26  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
29  * SUCH DAMAGE.
30  */
31 
32 
33 /*
34  * $FreeBSD$
35  *
36  * This module supports memory mapped access to network devices,
37  * see netmap(4).
38  *
39  * The module uses a large memory pool allocated by the kernel
40  * and accessible as mmapped memory by multiple userspace threads/processes.
41  * The memory pool contains packet buffers and "netmap rings",
42  * i.e. user-accessible copies of the interface's queues.
43  *
44  * Access to the network card works like this:
45  * 1. a process/thread issues one or more open() on /dev/netmap, to create
46  *    a select()able file descriptor on which events are reported.
47  * 2. on each descriptor, the process issues an ioctl() to identify
48  *    the interface that should report events to the file descriptor.
49  * 3. on each descriptor, the process issues an mmap() request to
50  *    map the shared memory region within the process' address space.
51  *    The list of interesting queues is indicated by a location in
52  *    the shared memory region.
53  * 4. using the functions in the netmap(4) userspace API, a process
54  *    can look up the occupation state of a queue, access memory buffers,
55  *    and retrieve received packets or enqueue packets to transmit.
56  * 5. using some ioctl()s the process can synchronize the userspace view
57  *    of the queue with the actual status in the kernel. This includes both
58  *    receiving the notification of new packets, and transmitting new
59  *    packets on the output interface.
60  * 6. select() or poll() can be used to wait for events on individual
61  *    transmit or receive queues (or all queues for a given interface).
62  *
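 * A minimal userspace sketch of the sequence above, using the macros from
 * net/netmap_user.h (hedged: error handling is omitted, and "em0" and the
 * use of tx ring 0 are purely illustrative):
 *
 *	int fd = open("/dev/netmap", O_RDWR);			// step 1
 *	struct nmreq req = { .nr_version = NETMAP_API };
 *	strncpy(req.nr_name, "em0", sizeof(req.nr_name) - 1);
 *	ioctl(fd, NIOCREGIF, &req);				// step 2
 *	char *mem = mmap(NULL, req.nr_memsize, PROT_READ | PROT_WRITE,
 *	    MAP_SHARED, fd, 0);					// step 3
 *	struct netmap_if *nifp = NETMAP_IF(mem, req.nr_offset);
 *	struct netmap_ring *txr = NETMAP_TXRING(nifp, 0);	// step 4
 *	// ... fill slots between txr->head and txr->tail, advance head/cur ...
 *	ioctl(fd, NIOCTXSYNC, NULL);				// step 5
 *	struct pollfd pfd = { .fd = fd, .events = POLLOUT };
 *	poll(&pfd, 1, -1);					// step 6
 *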
63 
64 		SYNCHRONIZATION (USER)
65 
66 The netmap rings and data structures may be shared among multiple
67 user threads or even independent processes.
68 Any synchronization among those threads/processes is delegated
69 to the threads themselves. Only one thread at a time can be in
70 a system call on the same netmap ring. The OS does not enforce
71 this and only guarantees against system crashes in case of
72 invalid usage.
73 
74 		LOCKING (INTERNAL)
75 
76 Within the kernel, access to the netmap rings is protected as follows:
77 
78 - a spinlock on each ring, to handle producer/consumer races on
79   RX rings attached to the host stack (against multiple host
80   threads writing from the host stack to the same ring),
81   and on 'destination' rings attached to a VALE switch
82   (i.e. RX rings in VALE ports, and TX rings in NIC/host ports)
83   protecting multiple active senders for the same destination.
84 
85 - an atomic variable to guarantee that there is at most one
86   instance of *_*xsync() on the ring at any time.
87   For rings connected to user file
88   descriptors, an atomic_test_and_set() protects this, and the
89   lock on the ring is not actually used.
90   For NIC RX rings connected to a VALE switch, an atomic_test_and_set()
91   is also used to prevent multiple executions (the driver might indeed
92   already guarantee this).
93   For NIC TX rings connected to a VALE switch, the lock arbitrates
94   access to the queue (both when allocating buffers and when pushing
95   them out).
96 
97 - *xsync() should be protected against initializations of the card.
98   On FreeBSD most devices have the reset routine protected by
99   a RING lock (ixgbe, igb, em) or core lock (re). lem is missing
100   the RING protection on rx_reset(); this should be added.
101 
102   On linux there is an external lock on the tx path, which probably
103   also arbitrates access to the reset routine. XXX to be revised
104 
105 - a per-interface core_lock protecting access from the host stack
106   while interfaces may be detached from netmap mode.
107   XXX there should be no need for this lock if we detach the interfaces
108   only while they are down.
109 
110 
111 --- VALE SWITCH ---
112 
113 NMG_LOCK() serializes all modifications to switches and ports.
114 A switch cannot be deleted until all ports are gone.
115 
116 For each switch, an SX lock (RWlock on linux) protects
117 deletion of ports. When configuring a new port or deleting an existing one, the
118 lock is acquired in exclusive mode (after holding NMG_LOCK).
119 When forwarding, the lock is acquired in shared mode (without NMG_LOCK).
120 The lock is held throughout the entire forwarding cycle,
121 during which the thread may incur a page fault.
122 Hence it is important that sleepable shared locks are used.
123 
124 On the rx ring, the per-port lock is grabbed initially to reserve
125 a number of slots in the ring, then the lock is released,
126 packets are copied from source to destination, and then
127 the lock is acquired again and the receive ring is updated.
128 (A similar thing is done on the tx ring for NIC and host stack
129 ports attached to the switch)
130 
131  */
132 
133 
134 /* --- internals ----
135  *
136  * Roadmap to the code that implements the above.
137  *
138  * > 1. a process/thread issues one or more open() on /dev/netmap, to create
139  * >    a select()able file descriptor on which events are reported.
140  *
141  *  	Internally, we allocate a netmap_priv_d structure that will be
142  *  	initialized on ioctl(NIOCREGIF). There is one netmap_priv_d
143  *  	structure for each open().
144  *
145  *      os-specific:
146  *  	    FreeBSD: see netmap_open() (netmap_freebsd.c)
147  *  	    linux:   see linux_netmap_open() (netmap_linux.c)
148  *
149  * > 2. on each descriptor, the process issues an ioctl() to identify
150  * >    the interface that should report events to the file descriptor.
151  *
152  * 	Implemented by netmap_ioctl(), NIOCREGIF case, with nmr->nr_cmd==0.
153  * 	Most important things happen in netmap_get_na() and
154  * 	netmap_do_regif(), called from there. Additional details can be
155  * 	found in the comments above those functions.
156  *
157  * 	In all cases, this action creates/takes-a-reference-to a
158  * 	netmap_*_adapter describing the port, and allocates a netmap_if
159  * 	and all necessary netmap rings, filling them with netmap buffers.
160  *
161  *      In this phase, the sync callbacks for each ring are set (these are used
162  *      in steps 5 and 6 below).  The callbacks depend on the type of adapter.
163  *      The adapter creation/initialization code puts them in the
164  * 	netmap_adapter (fields na->nm_txsync and na->nm_rxsync).  Then, they
165  * 	are copied from there to the netmap_kring's during netmap_do_regif(), by
166  * 	the nm_krings_create() callback.  All the nm_krings_create callbacks
167  * 	actually call netmap_krings_create() to perform this and the other
168  * 	common stuff. netmap_krings_create() also takes care of the host rings,
169  * 	if needed, by setting their sync callbacks appropriately.
170  *
171  * 	Additional actions depend on the kind of netmap_adapter that has been
172  * 	registered:
173  *
174  * 	- netmap_hw_adapter:  	     [netmap.c]
175  * 	     This is a system netdev/ifp with native netmap support.
176  * 	     The ifp is detached from the host stack by redirecting:
177  * 	       - transmissions (from the network stack) to netmap_transmit()
178  * 	       - receive notifications to the nm_notify() callback for
179  * 	         this adapter. The callback is normally netmap_notify(), unless
180  * 	         the ifp is attached to a bridge using bwrap, in which case it
181  * 	         is netmap_bwrap_intr_notify().
182  *
183  * 	- netmap_generic_adapter:      [netmap_generic.c]
184  * 	      A system netdev/ifp without native netmap support.
185  *
186  * 	(the decision about native/non-native support is taken in
187  * 	 netmap_get_hw_na(), called by netmap_get_na())
188  *
189  * 	- netmap_vp_adapter 		[netmap_vale.c]
190  * 	      Returned by netmap_get_bdg_na().
191  * 	      This is a persistent or ephemeral VALE port. Ephemeral ports
192  * 	      are created on the fly if they don't already exist, and are
193  * 	      always attached to a bridge.
194  * 	      Persistent VALE ports must be created separately, and
195  * 	      then attached like normal NICs. The NIOCREGIF we are examining
196  * 	      will find them only if they had previously been created and
197  * 	      attached (see VALE_CTL below).
198  *
199  * 	- netmap_pipe_adapter 	      [netmap_pipe.c]
200  * 	      Returned by netmap_get_pipe_na().
201  * 	      Both pipe ends are created, if they didn't already exist.
202  *
203  * 	- netmap_monitor_adapter      [netmap_monitor.c]
204  * 	      Returned by netmap_get_monitor_na().
205  * 	      If successful, the nm_sync callbacks of the monitored adapter
206  * 	      will be intercepted by the returned monitor.
207  *
208  * 	- netmap_bwrap_adapter	      [netmap_vale.c]
209  * 	      Cannot be obtained in this way, see VALE_CTL below
210  *
211  *
212  * 	os-specific:
213  * 	    linux: we first go through linux_netmap_ioctl() to
214  * 	           adapt the FreeBSD interface to the linux one.
215  *
216  *
217  * > 3. on each descriptor, the process issues an mmap() request to
218  * >    map the shared memory region within the process' address space.
219  * >    The list of interesting queues is indicated by a location in
220  * >    the shared memory region.
221  *
222  *      os-specific:
223  *  	    FreeBSD: netmap_mmap_single (netmap_freebsd.c).
224  *  	    linux:   linux_netmap_mmap (netmap_linux.c).
225  *
226  * > 4. using the functions in the netmap(4) userspace API, a process
227  * >    can look up the occupation state of a queue, access memory buffers,
228  * >    and retrieve received packets or enqueue packets to transmit.
229  *
230  * 	These actions do not involve the kernel.
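 *
 * 	A hedged illustration using the helpers from net/netmap_user.h
 * 	(the ring pointer is whatever was obtained in step 3; the names
 * 	here are illustrative):
 *
 * 	    u_int avail = nm_ring_space(txring);
 * 	    struct netmap_slot *slot = &txring->slot[txring->cur];
 * 	    char *buf = NETMAP_BUF(txring, slot->buf_idx);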
231  *
232  * > 5. using some ioctl()s the process can synchronize the userspace view
233  * >    of the queue with the actual status in the kernel. This includes both
234  * >    receiving the notification of new packets, and transmitting new
235  * >    packets on the output interface.
236  *
237  * 	These are implemented in netmap_ioctl(), NIOCTXSYNC and NIOCRXSYNC
238  * 	cases. They invoke the nm_sync callbacks on the netmap_kring
239  * 	structures, as initialized in step 2 and maybe later modified
240  * 	by a monitor. Monitors, however, will always call the original
241  * 	callback before doing anything else.
242  *
243  *
244  * > 6. select() or poll() can be used to wait for events on individual
245  * >    transmit or receive queues (or all queues for a given interface).
246  *
247  * 	Implemented in netmap_poll(). This will call the same nm_sync()
248  * 	callbacks as in step 5 above.
249  *
250  * 	os-specific:
251  * 		linux: we first go through linux_netmap_poll() to adapt
252  * 		       the FreeBSD interface to the linux one.
253  *
254  *
255  *  ----  VALE_CTL -----
256  *
257  *  VALE switches are controlled by issuing a NIOCREGIF with a non-null
258  *  nr_cmd in the nmreq structure. These subcommands are handled by
259  *  netmap_bdg_ctl() in netmap_vale.c. Persistent VALE ports are created
260  *  and destroyed by issuing the NETMAP_BDG_NEWIF and NETMAP_BDG_DELIF
261  *  subcommands, respectively.
262  *
263  *  Any network interface known to the system (including a persistent VALE
264  *  port) can be attached to a VALE switch by issuing the
265  *  NETMAP_BDG_ATTACH subcommand. After the attachment, persistent VALE ports
266  *  look exactly like ephemeral VALE ports (as created in step 2 above).  The
267  *  attachment of other interfaces, instead, requires the creation of a
268  *  netmap_bwrap_adapter.  Moreover, the attached interface must be put in
269  *  netmap mode. This may require the creation of a netmap_generic_adapter if
270  *  we have no native support for the interface, or if generic adapters have
271  *  been forced by sysctl.
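 *
 *  As a hedged sketch (the switch and interface names are illustrative),
 *  an attach request looks like:
 *
 *	struct nmreq req = { .nr_version = NETMAP_API,
 *	                     .nr_cmd = NETMAP_BDG_ATTACH };
 *	strncpy(req.nr_name, "vale0:em0", sizeof(req.nr_name) - 1);
 *	ioctl(fd, NIOCREGIF, &req);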
272  *
273  *  Both persistent VALE ports and bwraps are handled by netmap_get_bdg_na(),
274  *  called by nm_bdg_ctl_attach(), and discriminated by the nm_bdg_attach()
275  *  callback.  In the case of the bwrap, the callback creates the
276  *  netmap_bwrap_adapter.  The initialization of the bwrap is then
277  *  completed by calling netmap_do_regif() on it, in the nm_bdg_ctl()
278  *  callback (netmap_bwrap_bdg_ctl in netmap_vale.c).
279  *  A generic adapter for the wrapped ifp will be created if needed, when
280  *  netmap_get_bdg_na() calls netmap_get_hw_na().
281  *
282  *
283  *  ---- DATAPATHS -----
284  *
285  *              -= SYSTEM DEVICE WITH NATIVE SUPPORT =-
286  *
287  *    na == NA(ifp) == netmap_hw_adapter created in DEVICE_netmap_attach()
288  *
289  *    - tx from netmap userspace:
290  *	 concurrently:
291  *           1) ioctl(NIOCTXSYNC)/netmap_poll() in process context
292  *                kring->nm_sync() == DEVICE_netmap_txsync()
293  *           2) device interrupt handler
294  *                na->nm_notify()  == netmap_notify()
295  *    - rx from netmap userspace:
296  *       concurrently:
297  *           1) ioctl(NIOCRXSYNC)/netmap_poll() in process context
298  *                kring->nm_sync() == DEVICE_netmap_rxsync()
299  *           2) device interrupt handler
300  *                na->nm_notify()  == netmap_notify()
301  *    - rx from host stack
302  *       concurrently:
303  *           1) host stack
304  *                netmap_transmit()
305  *                  na->nm_notify  == netmap_notify()
306  *           2) ioctl(NIOCRXSYNC)/netmap_poll() in process context
307  *                kring->nm_sync() == netmap_rxsync_from_host
308  *                  netmap_rxsync_from_host(na, NULL, NULL)
309  *    - tx to host stack
310  *           ioctl(NIOCTXSYNC)/netmap_poll() in process context
311  *             kring->nm_sync() == netmap_txsync_to_host
312  *               netmap_txsync_to_host(na)
313  *                 nm_os_send_up()
314  *                   FreeBSD: na->if_input() == ether_input()
315  *                   linux: netif_rx() with NM_MAGIC_PRIORITY_RX
316  *
317  *
318  *               -= SYSTEM DEVICE WITH GENERIC SUPPORT =-
319  *
320  *    na == NA(ifp) == generic_netmap_adapter created in generic_netmap_attach()
321  *
322  *    - tx from netmap userspace:
323  *       concurrently:
324  *           1) ioctl(NIOCTXSYNC)/netmap_poll() in process context
325  *               kring->nm_sync() == generic_netmap_txsync()
326  *                   nm_os_generic_xmit_frame()
327  *                       linux:   dev_queue_xmit() with NM_MAGIC_PRIORITY_TX
328  *                           ifp->ndo_start_xmit == generic_ndo_start_xmit()
329  *                               gna->save_start_xmit == orig. dev. start_xmit
330  *                       FreeBSD: na->if_transmit() == orig. dev if_transmit
331  *           2) generic_mbuf_destructor()
332  *                   na->nm_notify() == netmap_notify()
333  *    - rx from netmap userspace:
334  *           1) ioctl(NIOCRXSYNC)/netmap_poll() in process context
335  *               kring->nm_sync() == generic_netmap_rxsync()
336  *                   mbq_safe_dequeue()
337  *           2) device driver
338  *               generic_rx_handler()
339  *                   mbq_safe_enqueue()
340  *                   na->nm_notify() == netmap_notify()
341  *    - rx from host stack
342  *        FreeBSD: same as native
343  *        Linux: same as native except:
344  *           1) host stack
345  *               dev_queue_xmit() without NM_MAGIC_PRIORITY_TX
346  *                   ifp->ndo_start_xmit == generic_ndo_start_xmit()
347  *                       netmap_transmit()
348  *                           na->nm_notify() == netmap_notify()
349  *    - tx to host stack (same as native):
350  *
351  *
352  *                           -= VALE =-
353  *
354  *   INCOMING:
355  *
356  *      - VALE ports:
357  *          ioctl(NIOCTXSYNC)/netmap_poll() in process context
358  *              kring->nm_sync() == netmap_vp_txsync()
359  *
360  *      - system device with native support:
361  *         from cable:
362  *             interrupt
363  *                na->nm_notify() == netmap_bwrap_intr_notify(ring_nr != host ring)
364  *                     kring->nm_sync() == DEVICE_netmap_rxsync()
365  *                     netmap_vp_txsync()
366  *                     kring->nm_sync() == DEVICE_netmap_rxsync()
367  *         from host stack:
368  *             netmap_transmit()
369  *                na->nm_notify() == netmap_bwrap_intr_notify(ring_nr == host ring)
370  *                     kring->nm_sync() == netmap_rxsync_from_host()
371  *                     netmap_vp_txsync()
372  *
373  *      - system device with generic support:
374  *         from device driver:
375  *            generic_rx_handler()
376  *                na->nm_notify() == netmap_bwrap_intr_notify(ring_nr != host ring)
377  *                     kring->nm_sync() == generic_netmap_rxsync()
378  *                     netmap_vp_txsync()
379  *                     kring->nm_sync() == generic_netmap_rxsync()
380  *         from host stack:
381  *            netmap_transmit()
382  *                na->nm_notify() == netmap_bwrap_intr_notify(ring_nr == host ring)
383  *                     kring->nm_sync() == netmap_rxsync_from_host()
384  *                     netmap_vp_txsync()
385  *
386  *   (all cases) --> nm_bdg_flush()
387  *                      dest_na->nm_notify() == (see below)
388  *
389  *   OUTGOING:
390  *
391  *      - VALE ports:
392  *         concurrently:
393  *             1) ioctl(NIOCRXSYNC)/netmap_poll() in process context
394  *                    kring->nm_sync() == netmap_vp_rxsync()
395  *             2) from nm_bdg_flush()
396  *                    na->nm_notify() == netmap_notify()
397  *
398  *      - system device with native support:
399  *          to cable:
400  *             na->nm_notify() == netmap_bwrap_notify()
401  *                 netmap_vp_rxsync()
402  *                 kring->nm_sync() == DEVICE_netmap_txsync()
403  *                 netmap_vp_rxsync()
404  *          to host stack:
405  *                 netmap_vp_rxsync()
406  *                 kring->nm_sync() == netmap_txsync_to_host
407  *                 netmap_vp_rxsync_locked()
408  *
409  *      - system device with generic adapter:
410  *          to device driver:
411  *             na->nm_notify() == netmap_bwrap_notify()
412  *                 netmap_vp_rxsync()
413  *                 kring->nm_sync() == generic_netmap_txsync()
414  *                 netmap_vp_rxsync()
415  *          to host stack:
416  *                 netmap_vp_rxsync()
417  *                 kring->nm_sync() == netmap_txsync_to_host
418  *                 netmap_vp_rxsync()
419  *
420  */
421 
422 /*
423  * OS-specific code that is used only within this file.
424  * Other OS-specific code that must be accessed by drivers
425  * is present in netmap_kern.h
426  */
427 
428 #if defined(__FreeBSD__)
429 #include <sys/cdefs.h> /* prerequisite */
430 #include <sys/types.h>
431 #include <sys/errno.h>
432 #include <sys/param.h>	/* defines used in kernel.h */
433 #include <sys/kernel.h>	/* types used in module initialization */
434 #include <sys/conf.h>	/* cdevsw struct, UID, GID */
435 #include <sys/filio.h>	/* FIONBIO */
436 #include <sys/sockio.h>
437 #include <sys/socketvar.h>	/* struct socket */
438 #include <sys/malloc.h>
439 #include <sys/poll.h>
440 #include <sys/rwlock.h>
441 #include <sys/socket.h> /* sockaddrs */
442 #include <sys/selinfo.h>
443 #include <sys/sysctl.h>
444 #include <sys/jail.h>
445 #include <net/vnet.h>
446 #include <net/if.h>
447 #include <net/if_var.h>
448 #include <net/bpf.h>		/* BIOCIMMEDIATE */
449 #include <machine/bus.h>	/* bus_dmamap_* */
450 #include <sys/endian.h>
451 #include <sys/refcount.h>
452 
453 
454 #elif defined(linux)
455 
456 #include "bsd_glue.h"
457 
458 #elif defined(__APPLE__)
459 
460 #warning OSX support is only partial
461 #include "osx_glue.h"
462 
463 #elif defined (_WIN32)
464 
465 #include "win_glue.h"
466 
467 #else
468 
469 #error	Unsupported platform
470 
471 #endif /* unsupported */
472 
473 /*
474  * common headers
475  */
476 #include <net/netmap.h>
477 #include <dev/netmap/netmap_kern.h>
478 #include <dev/netmap/netmap_mem2.h>
479 
480 
481 /* user-controlled variables */
482 int netmap_verbose;
483 
484 static int netmap_no_timestamp; /* don't timestamp on rxsync */
485 int netmap_no_pendintr = 1;
486 int netmap_txsync_retry = 2;
487 static int netmap_fwd = 0;	/* force transparent forwarding */
488 
489 /*
490  * netmap_admode selects the netmap mode to use.
491  * Invalid values are reset to NETMAP_ADMODE_BEST
492  */
493 enum {	NETMAP_ADMODE_BEST = 0,	/* use native, fallback to generic */
494 	NETMAP_ADMODE_NATIVE,	/* either native or none */
495 	NETMAP_ADMODE_GENERIC,	/* force generic */
496 	NETMAP_ADMODE_LAST };
497 static int netmap_admode = NETMAP_ADMODE_BEST;
498 
499 /* netmap_generic_mit controls mitigation of RX notifications for
500  * the generic netmap adapter. The value is a time interval in
501  * nanoseconds. */
502 int netmap_generic_mit = 100*1000;
503 
504 /* We use by default netmap-aware qdiscs with generic netmap adapters,
505  * even though this may cause a small performance hit with hardware NICs.
506  * However, using the qdisc is the safer approach, for two reasons:
507  * 1) it prevents non-fifo qdiscs from breaking the TX notification
508  *    scheme, which is based on mbuf destructors when txqdisc is
509  *    not used.
510  * 2) it makes it possible to transmit over software devices that
511  *    change skb->dev, like bridge, veth, ...
512  *
513  * In any case, users looking for the best performance should
514  * use native adapters.
515  */
516 #ifdef linux
517 int netmap_generic_txqdisc = 1;
518 #endif
519 
520 /* Default number of slots and queues for generic adapters. */
521 int netmap_generic_ringsize = 1024;
522 int netmap_generic_rings = 1;
523 
524 /* Non-zero if ptnet devices are allowed to use virtio-net headers. */
525 int ptnet_vnet_hdr = 1;
526 
527 /* 0 if ptnetmap should not use worker threads for TX processing */
528 int ptnetmap_tx_workers = 1;
529 
530 /*
531  * SYSCTL calls are grouped between SYSBEGIN and SYSEND to be emulated
532  * in some other operating systems
533  */
534 SYSBEGIN(main_init);
535 
536 SYSCTL_DECL(_dev_netmap);
537 SYSCTL_NODE(_dev, OID_AUTO, netmap, CTLFLAG_RW, 0, "Netmap args");
538 SYSCTL_INT(_dev_netmap, OID_AUTO, verbose,
539     CTLFLAG_RW, &netmap_verbose, 0, "Verbose mode");
540 SYSCTL_INT(_dev_netmap, OID_AUTO, no_timestamp,
541     CTLFLAG_RW, &netmap_no_timestamp, 0, "no_timestamp");
542 SYSCTL_INT(_dev_netmap, OID_AUTO, no_pendintr, CTLFLAG_RW, &netmap_no_pendintr,
543     0, "Always look for new received packets.");
544 SYSCTL_INT(_dev_netmap, OID_AUTO, txsync_retry, CTLFLAG_RW,
545     &netmap_txsync_retry, 0, "Number of txsync loops in bridge's flush.");
546 
547 SYSCTL_INT(_dev_netmap, OID_AUTO, fwd, CTLFLAG_RW, &netmap_fwd, 0,
548     "Force NR_FORWARD mode");
549 SYSCTL_INT(_dev_netmap, OID_AUTO, admode, CTLFLAG_RW, &netmap_admode, 0,
550     "Adapter mode. 0 selects the best option available,"
551     "1 forces native adapter, 2 forces emulated adapter");
552 SYSCTL_INT(_dev_netmap, OID_AUTO, generic_mit, CTLFLAG_RW, &netmap_generic_mit,
553     0, "RX notification interval in nanoseconds");
554 SYSCTL_INT(_dev_netmap, OID_AUTO, generic_ringsize, CTLFLAG_RW,
555     &netmap_generic_ringsize, 0,
556     "Number of per-ring slots for emulated netmap mode");
557 SYSCTL_INT(_dev_netmap, OID_AUTO, generic_rings, CTLFLAG_RW,
558     &netmap_generic_rings, 0,
559     "Number of TX/RX queues for emulated netmap adapters");
560 #ifdef linux
561 SYSCTL_INT(_dev_netmap, OID_AUTO, generic_txqdisc, CTLFLAG_RW,
562     &netmap_generic_txqdisc, 0, "Use qdisc for generic adapters");
563 #endif
564 SYSCTL_INT(_dev_netmap, OID_AUTO, ptnet_vnet_hdr, CTLFLAG_RW, &ptnet_vnet_hdr,
565     0, "Allow ptnet devices to use virtio-net headers");
566 SYSCTL_INT(_dev_netmap, OID_AUTO, ptnetmap_tx_workers, CTLFLAG_RW,
567     &ptnetmap_tx_workers, 0, "Use worker threads for ptnetmap TX processing");
568 
569 SYSEND;
570 
571 NMG_LOCK_T	netmap_global_lock;
572 
573 /*
574  * mark the ring as stopped, and run through the locks
575  * to make sure other users get to see it.
576  * stopped must be either NM_KR_STOPPED (for unbounded stop)
577  * or NM_KR_LOCKED (brief stop for mutual exclusion purposes)
578  */
579 static void
580 netmap_disable_ring(struct netmap_kring *kr, int stopped)
581 {
582 	nm_kr_stop(kr, stopped);
583 	// XXX check if nm_kr_stop is sufficient
584 	mtx_lock(&kr->q_lock);
585 	mtx_unlock(&kr->q_lock);
586 	nm_kr_put(kr);
587 }
588 
589 /* stop or enable a single ring */
590 void
591 netmap_set_ring(struct netmap_adapter *na, u_int ring_id, enum txrx t, int stopped)
592 {
593 	if (stopped)
594 		netmap_disable_ring(NMR(na, t) + ring_id, stopped);
595 	else
596 		NMR(na, t)[ring_id].nkr_stopped = 0;
597 }
598 
599 
600 /* stop or enable all the rings of na */
601 void
602 netmap_set_all_rings(struct netmap_adapter *na, int stopped)
603 {
604 	int i;
605 	enum txrx t;
606 
607 	if (!nm_netmap_on(na))
608 		return;
609 
610 	for_rx_tx(t) {
611 		for (i = 0; i < netmap_real_rings(na, t); i++) {
612 			netmap_set_ring(na, i, t, stopped);
613 		}
614 	}
615 }
616 
617 /*
618  * Convenience function used in drivers.  Waits for current txsync()s/rxsync()s
619  * to finish and prevents any new one from starting.  Call this before turning
620  * netmap mode off, or before removing the hardware rings (e.g., on module
621  * unload).
622  */
623 void
624 netmap_disable_all_rings(struct ifnet *ifp)
625 {
626 	if (NM_NA_VALID(ifp)) {
627 		netmap_set_all_rings(NA(ifp), NM_KR_STOPPED);
628 	}
629 }
630 
631 /*
632  * Convenience function used in drivers.  Re-enables rxsync and txsync on the
633  * adapter's rings. In linux drivers, this should be placed near each
634  * napi_enable().
635  */
636 void
637 netmap_enable_all_rings(struct ifnet *ifp)
638 {
639 	if (NM_NA_VALID(ifp)) {
640 		netmap_set_all_rings(NA(ifp), 0 /* enabled */);
641 	}
642 }
643 
644 void
645 netmap_make_zombie(struct ifnet *ifp)
646 {
647 	if (NM_NA_VALID(ifp)) {
648 		struct netmap_adapter *na = NA(ifp);
649 		netmap_set_all_rings(na, NM_KR_LOCKED);
650 		na->na_flags |= NAF_ZOMBIE;
651 		netmap_set_all_rings(na, 0);
652 	}
653 }
654 
655 void
656 netmap_undo_zombie(struct ifnet *ifp)
657 {
658 	if (NM_NA_VALID(ifp)) {
659 		struct netmap_adapter *na = NA(ifp);
660 		if (na->na_flags & NAF_ZOMBIE) {
661 			netmap_set_all_rings(na, NM_KR_LOCKED);
662 			na->na_flags &= ~NAF_ZOMBIE;
663 			netmap_set_all_rings(na, 0);
664 		}
665 	}
666 }
667 
668 /*
669  * generic bounds-checking function
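 * A hedged usage sketch (the variable, bounds and message are illustrative):
 *	nm_bound_var(&some_ring_size, 1024, 64, 16384, "ring size");
 * values below lo are reset to dflt, values above hi are clamped to hi.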
670  */
671 u_int
672 nm_bound_var(u_int *v, u_int dflt, u_int lo, u_int hi, const char *msg)
673 {
674 	u_int oldv = *v;
675 	const char *op = NULL;
676 
677 	if (dflt < lo)
678 		dflt = lo;
679 	if (dflt > hi)
680 		dflt = hi;
681 	if (oldv < lo) {
682 		*v = dflt;
683 		op = "Bump";
684 	} else if (oldv > hi) {
685 		*v = hi;
686 		op = "Clamp";
687 	}
688 	if (op && msg)
689 		nm_prinf("%s %s to %d (was %d)\n", op, msg, *v, oldv);
690 	return *v;
691 }
692 
693 
694 /*
695  * packet-dump function, user-supplied or static buffer.
696  * The destination buffer must be at least 30+4*len
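 * Typical use, as in netmap_rxsync_from_host() below:
 *	nm_dump_buf(NMB(na, slot), len, 128, NULL);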
697  */
698 const char *
699 nm_dump_buf(char *p, int len, int lim, char *dst)
700 {
701 	static char _dst[8192];
702 	int i, j, i0;
703 	static char hex[] ="0123456789abcdef";
704 	char *o;	/* output position */
705 
706 #define P_HI(x)	hex[((x) & 0xf0)>>4]
707 #define P_LO(x)	hex[((x) & 0xf)]
708 #define P_C(x)	((x) >= 0x20 && (x) <= 0x7e ? (x) : '.')
709 	if (!dst)
710 		dst = _dst;
711 	if (lim <= 0 || lim > len)
712 		lim = len;
713 	o = dst;
714 	sprintf(o, "buf 0x%p len %d lim %d\n", p, len, lim);
715 	o += strlen(o);
716 	/* hexdump routine */
717 	for (i = 0; i < lim; ) {
718 		sprintf(o, "%5d: ", i);
719 		o += strlen(o);
720 		memset(o, ' ', 48);
721 		i0 = i;
722 		for (j=0; j < 16 && i < lim; i++, j++) {
723 			o[j*3] = P_HI(p[i]);
724 			o[j*3+1] = P_LO(p[i]);
725 		}
726 		i = i0;
727 		for (j=0; j < 16 && i < lim; i++, j++)
728 			o[j + 48] = P_C(p[i]);
729 		o[j+48] = '\n';
730 		o += j+49;
731 	}
732 	*o = '\0';
733 #undef P_HI
734 #undef P_LO
735 #undef P_C
736 	return dst;
737 }
738 
739 
740 /*
741  * Fetch configuration from the device, to cope with dynamic
742  * reconfigurations after loading the module.
743  */
744 /* call with NMG_LOCK held */
745 int
746 netmap_update_config(struct netmap_adapter *na)
747 {
748 	u_int txr, txd, rxr, rxd;
749 
750 	txr = txd = rxr = rxd = 0;
751 	if (na->nm_config == NULL ||
752 	    na->nm_config(na, &txr, &txd, &rxr, &rxd))
753 	{
754 		/* take whatever we had at init time */
755 		txr = na->num_tx_rings;
756 		txd = na->num_tx_desc;
757 		rxr = na->num_rx_rings;
758 		rxd = na->num_rx_desc;
759 	}
760 
761 	if (na->num_tx_rings == txr && na->num_tx_desc == txd &&
762 	    na->num_rx_rings == rxr && na->num_rx_desc == rxd)
763 		return 0; /* nothing changed */
764 	if (netmap_verbose || na->active_fds > 0) {
765 		D("stored config %s: txring %d x %d, rxring %d x %d",
766 			na->name,
767 			na->num_tx_rings, na->num_tx_desc,
768 			na->num_rx_rings, na->num_rx_desc);
769 		D("new config %s: txring %d x %d, rxring %d x %d",
770 			na->name, txr, txd, rxr, rxd);
771 	}
772 	if (na->active_fds == 0) {
773 		D("configuration changed (but fine)");
774 		na->num_tx_rings = txr;
775 		na->num_tx_desc = txd;
776 		na->num_rx_rings = rxr;
777 		na->num_rx_desc = rxd;
778 		return 0;
779 	}
780 	D("configuration changed while active, this is bad...");
781 	return 1;
782 }
783 
784 /* nm_sync callbacks for the host rings */
785 static int netmap_txsync_to_host(struct netmap_kring *kring, int flags);
786 static int netmap_rxsync_from_host(struct netmap_kring *kring, int flags);
787 
788 /* create the krings array and initialize the fields common to all adapters.
789  * The array layout is this:
790  *
791  *                    +----------+
792  *                    |          |  } na->num_tx_rings
793  *                    |          |  } na->num_tx_ring
794  *                    |          | /
795  *                    +----------+
796  *                    |          |    host tx kring
797  * na->rx_rings ----> +----------+
798  *                    |          | \
799  *                    |          |  } na->num_rx_rings
800  *                    |          | /
801  *                    +----------+
802  *                    |          |    host rx kring
803  *                    +----------+
804  * na->tailroom ----->|          | \
805  *                    |          |  } tailroom bytes
806  *                    |          | /
807  *                    +----------+
808  *
809  * Note: for compatibility, host krings are created even when not needed.
810  * The tailroom space is currently used by vale ports for allocating leases.
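 * For example (a hedged sketch, the counts are illustrative), with
 * na->num_tx_rings == 2 and na->num_rx_rings == 2 the code below ends up with
 *	na->rx_rings == na->tx_rings + 3;
 *	na->tailroom == na->rx_rings + 3;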
811  */
812 /* call with NMG_LOCK held */
813 int
814 netmap_krings_create(struct netmap_adapter *na, u_int tailroom)
815 {
816 	u_int i, len, ndesc;
817 	struct netmap_kring *kring;
818 	u_int n[NR_TXRX];
819 	enum txrx t;
820 
821 	if (na->tx_rings != NULL) {
822 		D("warning: krings were already created");
823 		return 0;
824 	}
825 
826 	/* account for the (possibly fake) host rings */
827 	n[NR_TX] = na->num_tx_rings + 1;
828 	n[NR_RX] = na->num_rx_rings + 1;
829 
830 	len = (n[NR_TX] + n[NR_RX]) * sizeof(struct netmap_kring) + tailroom;
831 
832 	na->tx_rings = nm_os_malloc((size_t)len);
833 	if (na->tx_rings == NULL) {
834 		D("Cannot allocate krings");
835 		return ENOMEM;
836 	}
837 	na->rx_rings = na->tx_rings + n[NR_TX];
838 
839 	/*
840 	 * All fields in krings are 0 except the ones initialized below,
841 	 * but better be explicit on important kring fields.
842 	 */
843 	for_rx_tx(t) {
844 		ndesc = nma_get_ndesc(na, t);
845 		for (i = 0; i < n[t]; i++) {
846 			kring = &NMR(na, t)[i];
847 			bzero(kring, sizeof(*kring));
848 			kring->na = na;
849 			kring->ring_id = i;
850 			kring->tx = t;
851 			kring->nkr_num_slots = ndesc;
852 			kring->nr_mode = NKR_NETMAP_OFF;
853 			kring->nr_pending_mode = NKR_NETMAP_OFF;
854 			if (i < nma_get_nrings(na, t)) {
855 				kring->nm_sync = (t == NR_TX ? na->nm_txsync : na->nm_rxsync);
856 			} else {
857 				kring->nm_sync = (t == NR_TX ?
858 						netmap_txsync_to_host:
859 						netmap_rxsync_from_host);
860 			}
861 			kring->nm_notify = na->nm_notify;
862 			kring->rhead = kring->rcur = kring->nr_hwcur = 0;
863 			/*
864 			 * IMPORTANT: Always keep one slot empty.
865 			 */
866 			kring->rtail = kring->nr_hwtail = (t == NR_TX ? ndesc - 1 : 0);
867 			snprintf(kring->name, sizeof(kring->name) - 1, "%s %s%d", na->name,
868 					nm_txrx2str(t), i);
869 			ND("ktx %s h %d c %d t %d",
870 				kring->name, kring->rhead, kring->rcur, kring->rtail);
871 			mtx_init(&kring->q_lock, (t == NR_TX ? "nm_txq_lock" : "nm_rxq_lock"), NULL, MTX_DEF);
872 			nm_os_selinfo_init(&kring->si);
873 		}
874 		nm_os_selinfo_init(&na->si[t]);
875 	}
876 
877 	na->tailroom = na->rx_rings + n[NR_RX];
878 
879 	return 0;
880 }
881 
882 
883 /* undo the actions performed by netmap_krings_create */
884 /* call with NMG_LOCK held */
885 void
886 netmap_krings_delete(struct netmap_adapter *na)
887 {
888 	struct netmap_kring *kring = na->tx_rings;
889 	enum txrx t;
890 
891 	if (na->tx_rings == NULL) {
892 		D("warning: krings were already deleted");
893 		return;
894 	}
895 
896 	for_rx_tx(t)
897 		nm_os_selinfo_uninit(&na->si[t]);
898 
899 	/* we rely on the krings layout described above */
900 	for ( ; kring != na->tailroom; kring++) {
901 		mtx_destroy(&kring->q_lock);
902 		nm_os_selinfo_uninit(&kring->si);
903 	}
904 	nm_os_free(na->tx_rings);
905 	na->tx_rings = na->rx_rings = na->tailroom = NULL;
906 }
907 
908 
909 /*
910  * Destructor for NIC ports. They also have an mbuf queue
911  * on the rings connected to the host so we need to purge
912  * them first.
913  */
914 /* call with NMG_LOCK held */
915 void
916 netmap_hw_krings_delete(struct netmap_adapter *na)
917 {
918 	struct mbq *q = &na->rx_rings[na->num_rx_rings].rx_queue;
919 
920 	ND("destroy sw mbq with len %d", mbq_len(q));
921 	mbq_purge(q);
922 	mbq_safe_fini(q);
923 	netmap_krings_delete(na);
924 }
925 
926 static void
927 netmap_mem_drop(struct netmap_adapter *na)
928 {
929 	int last = netmap_mem_deref(na->nm_mem, na);
930 	/* if the native allocator had been overridden on regif,
931 	 * restore it now and drop the temporary one
932 	 */
933 	if (last && na->nm_mem_prev) {
934 		netmap_mem_put(na->nm_mem);
935 		na->nm_mem = na->nm_mem_prev;
936 		na->nm_mem_prev = NULL;
937 	}
938 }
939 
940 /*
941  * Undo everything that was done in netmap_do_regif(). In particular,
942  * call na->nm_register(na, 0) to stop netmap mode on the interface and
943  * revert to normal operation.
944  */
945 /* call with NMG_LOCK held */
946 static void netmap_unset_ringid(struct netmap_priv_d *);
947 static void netmap_krings_put(struct netmap_priv_d *);
948 void
949 netmap_do_unregif(struct netmap_priv_d *priv)
950 {
951 	struct netmap_adapter *na = priv->np_na;
952 
953 	NMG_LOCK_ASSERT();
954 	na->active_fds--;
955 	/* unset nr_pending_mode and possibly release exclusive mode */
956 	netmap_krings_put(priv);
957 
958 #ifdef	WITH_MONITOR
959 	/* XXX check whether we have to do something with monitor
960 	 * when rings change nr_mode. */
961 	if (na->active_fds <= 0) {
962 		/* walk through all the rings and tell any monitor
963 		 * that the port is going to exit netmap mode
964 		 */
965 		netmap_monitor_stop(na);
966 	}
967 #endif
968 
969 	if (na->active_fds <= 0 || nm_kring_pending(priv)) {
970 		na->nm_register(na, 0);
971 	}
972 
973 	/* delete rings and buffers that are no longer needed */
974 	netmap_mem_rings_delete(na);
975 
976 	if (na->active_fds <= 0) {	/* last instance */
977 		/*
978 		 * (TO CHECK) We enter here
979 		 * when the last reference to this file descriptor goes
980 		 * away. This means we cannot have any pending poll()
981 		 * or interrupt routine operating on the structure.
982 		 * XXX The file may be closed in a thread while
983 		 * another thread is using it.
984 		 * Linux keeps the file opened until the last reference
985 		 * by any outstanding ioctl/poll or mmap is gone.
986 		 * FreeBSD does not track mmap()s (but we do) and
987 		 * wakes up any sleeping poll(). Need to check what
988 		 * happens if the close() occurs while a concurrent
989 		 * syscall is running.
990 		 */
991 		if (netmap_verbose)
992 			D("deleting last instance for %s", na->name);
993 
994 		if (nm_netmap_on(na)) {
995 			D("BUG: netmap on while going to delete the krings");
996 		}
997 
998 		na->nm_krings_delete(na);
999 	}
1000 
1001 	/* possibly decrement counter of tx_si/rx_si users */
1002 	netmap_unset_ringid(priv);
1003 	/* delete the nifp */
1004 	netmap_mem_if_delete(na, priv->np_nifp);
1005 	/* drop the allocator */
1006 	netmap_mem_drop(na);
1007 	/* mark the priv as unregistered */
1008 	priv->np_na = NULL;
1009 	priv->np_nifp = NULL;
1010 }
1011 
1012 /* call with NMG_LOCK held */
1013 static __inline int
1014 nm_si_user(struct netmap_priv_d *priv, enum txrx t)
1015 {
1016 	return (priv->np_na != NULL &&
1017 		(priv->np_qlast[t] - priv->np_qfirst[t] > 1));
1018 }
1019 
1020 struct netmap_priv_d*
1021 netmap_priv_new(void)
1022 {
1023 	struct netmap_priv_d *priv;
1024 
1025 	priv = nm_os_malloc(sizeof(struct netmap_priv_d));
1026 	if (priv == NULL)
1027 		return NULL;
1028 	priv->np_refs = 1;
1029 	nm_os_get_module();
1030 	return priv;
1031 }
1032 
1033 /*
1034  * Destructor of the netmap_priv_d, called when the fd is closed
1035  * Action: undo all the things done by NIOCREGIF.
1036  * On FreeBSD we need to track whether there are active mmap()s,
1037  * and we use np_active_mmaps for that. On linux, the field is always 0.
1038  * The priv structure is freed when the last reference to it goes away.
1039  *
1040  */
1041 /* call with NMG_LOCK held */
1042 void
1043 netmap_priv_delete(struct netmap_priv_d *priv)
1044 {
1045 	struct netmap_adapter *na = priv->np_na;
1046 
1047 	/* number of active references to this fd */
1048 	if (--priv->np_refs > 0) {
1049 		return;
1050 	}
1051 	nm_os_put_module();
1052 	if (na) {
1053 		netmap_do_unregif(priv);
1054 	}
1055 	netmap_unget_na(na, priv->np_ifp);
1056 	bzero(priv, sizeof(*priv));	/* for safety */
1057 	nm_os_free(priv);
1058 }
1059 
1060 
1061 /* call with NMG_LOCK *not* held */
1062 void
1063 netmap_dtor(void *data)
1064 {
1065 	struct netmap_priv_d *priv = data;
1066 
1067 	NMG_LOCK();
1068 	netmap_priv_delete(priv);
1069 	NMG_UNLOCK();
1070 }
1071 
1072 
1073 /*
1074  * Handlers for synchronization of the rings from/to the host stack.
1075  * These are associated with a network interface and are just another
1076  * ring pair managed by userspace.
1077  *
1078  * Netmap also supports transparent forwarding (NS_FORWARD and NR_FORWARD
1079  * flags):
1080  *
1081  * - Before releasing buffers on hw RX rings, the application can mark
1082  *   them with the NS_FORWARD flag. During the next RXSYNC or poll(), they
1083  *   will be forwarded to the host stack, similarly to what would happen if
1084  *   the application moved them to the host TX ring.
1085  *
1086  * - Before releasing buffers on the host RX ring, the application can
1087  *   mark them with the NS_FORWARD flag. During the next RXSYNC or poll(),
1088  *   they will be forwarded to the hw TX rings, saving the application
1089  *   from doing the same task in user-space.
1090  *
1091  * Transparent forwarding can be enabled per-ring, by setting the NR_FORWARD
1092  * flag, or globally with the netmap_fwd sysctl.
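 *
 * A hedged userspace sketch of the per-ring opt-in (the slot index i and
 * the descriptor fd are whatever the application already has):
 *
 *	ring->flags |= NR_FORWARD;		// allow forwarding on this ring
 *	ring->slot[i].flags |= NS_FORWARD;	// mark this buffer
 *	ring->head = ring->cur = nm_ring_next(ring, i);
 *	ioctl(fd, NIOCRXSYNC, NULL);		// forwarding happens here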
1093  *
1094  * The transfer NIC --> host is relatively easy, just encapsulate
1095  * into mbufs and we are done. The host --> NIC side is slightly
1096  * harder because there might not be room in the tx ring so it
1097  * might take a while before releasing the buffer.
1098  */
1099 
1100 
1101 /*
1102  * Pass a whole queue of mbufs to the host stack as coming from 'dst'
1103  * We do not need to lock because the queue is private.
1104  * After this call the queue is empty.
1105  */
1106 static void
1107 netmap_send_up(struct ifnet *dst, struct mbq *q)
1108 {
1109 	struct mbuf *m;
1110 	struct mbuf *head = NULL, *prev = NULL;
1111 
1112 	/* Send packets up, outside the lock; head/prev machinery
1113 	 * is only useful for Windows. */
1114 	while ((m = mbq_dequeue(q)) != NULL) {
1115 		if (netmap_verbose & NM_VERB_HOST)
1116 			D("sending up pkt %p size %d", m, MBUF_LEN(m));
1117 		prev = nm_os_send_up(dst, m, prev);
1118 		if (head == NULL)
1119 			head = prev;
1120 	}
1121 	if (head)
1122 		nm_os_send_up(dst, NULL, head);
1123 	mbq_fini(q);
1124 }
1125 
1126 
1127 /*
1128  * Scan the buffers from hwcur to ring->head, and put a copy of those
1129  * marked NS_FORWARD (or all of them if forced) into a queue of mbufs.
1130  * Drop remaining packets in the unlikely event
1131  * of an mbuf shortage.
1132  */
1133 static void
1134 netmap_grab_packets(struct netmap_kring *kring, struct mbq *q, int force)
1135 {
1136 	u_int const lim = kring->nkr_num_slots - 1;
1137 	u_int const head = kring->rhead;
1138 	u_int n;
1139 	struct netmap_adapter *na = kring->na;
1140 
1141 	for (n = kring->nr_hwcur; n != head; n = nm_next(n, lim)) {
1142 		struct mbuf *m;
1143 		struct netmap_slot *slot = &kring->ring->slot[n];
1144 
1145 		if ((slot->flags & NS_FORWARD) == 0 && !force)
1146 			continue;
1147 		if (slot->len < 14 || slot->len > NETMAP_BUF_SIZE(na)) {
1148 			RD(5, "bad pkt at %d len %d", n, slot->len);
1149 			continue;
1150 		}
1151 		slot->flags &= ~NS_FORWARD; // XXX needed ?
1152 		/* XXX TODO: adapt to the case of a multisegment packet */
1153 		m = m_devget(NMB(na, slot), slot->len, 0, na->ifp, NULL);
1154 
1155 		if (m == NULL)
1156 			break;
1157 		mbq_enqueue(q, m);
1158 	}
1159 }
1160 
1161 static inline int
1162 _nm_may_forward(struct netmap_kring *kring)
1163 {
1164 	return	((netmap_fwd || kring->ring->flags & NR_FORWARD) &&
1165 		 kring->na->na_flags & NAF_HOST_RINGS &&
1166 		 kring->tx == NR_RX);
1167 }
1168 
1169 static inline int
1170 nm_may_forward_up(struct netmap_kring *kring)
1171 {
1172 	return	_nm_may_forward(kring) &&
1173 		 kring->ring_id != kring->na->num_rx_rings;
1174 }
1175 
1176 static inline int
1177 nm_may_forward_down(struct netmap_kring *kring, int sync_flags)
1178 {
1179 	return	_nm_may_forward(kring) &&
1180 		 (sync_flags & NAF_CAN_FORWARD_DOWN) &&
1181 		 kring->ring_id == kring->na->num_rx_rings;
1182 }
1183 
1184 /*
1185  * Send to the NIC rings packets marked NS_FORWARD between
1186  * kring->nr_hwcur and kring->rhead.
1187  * Called under kring->rx_queue.lock on the sw rx ring.
1188  *
1189  * It can only be called if the user opened all the TX hw rings,
1190  * see NAF_CAN_FORWARD_DOWN flag.
1191  * We can touch the TX netmap rings (slots, head and cur) since
1192  * we are in poll/ioctl system call context, and the application
1193  * is not supposed to touch the ring (using a different thread)
1194  * during the execution of the system call.
1195  */
1196 static u_int
1197 netmap_sw_to_nic(struct netmap_adapter *na)
1198 {
1199 	struct netmap_kring *kring = &na->rx_rings[na->num_rx_rings];
1200 	struct netmap_slot *rxslot = kring->ring->slot;
1201 	u_int i, rxcur = kring->nr_hwcur;
1202 	u_int const head = kring->rhead;
1203 	u_int const src_lim = kring->nkr_num_slots - 1;
1204 	u_int sent = 0;
1205 
1206 	/* scan rings to find space, then fill as much as possible */
1207 	for (i = 0; i < na->num_tx_rings; i++) {
1208 		struct netmap_kring *kdst = &na->tx_rings[i];
1209 		struct netmap_ring *rdst = kdst->ring;
1210 		u_int const dst_lim = kdst->nkr_num_slots - 1;
1211 
1212 		/* XXX do we trust ring or kring->rcur,rtail ? */
1213 		for (; rxcur != head && !nm_ring_empty(rdst);
1214 		     rxcur = nm_next(rxcur, src_lim) ) {
1215 			struct netmap_slot *src, *dst, tmp;
1216 			u_int dst_head = rdst->head;
1217 
1218 			src = &rxslot[rxcur];
1219 			if ((src->flags & NS_FORWARD) == 0 && !netmap_fwd)
1220 				continue;
1221 
1222 			sent++;
1223 
1224 			dst = &rdst->slot[dst_head];
1225 
1226 			tmp = *src;
1227 
1228 			src->buf_idx = dst->buf_idx;
1229 			src->flags = NS_BUF_CHANGED;
1230 
1231 			dst->buf_idx = tmp.buf_idx;
1232 			dst->len = tmp.len;
1233 			dst->flags = NS_BUF_CHANGED;
1234 
1235 			rdst->head = rdst->cur = nm_next(dst_head, dst_lim);
1236 		}
1237 		/* if (sent) XXX txsync ? it would be just an optimization */
1238 	}
1239 	return sent;
1240 }
1241 
1242 
1243 /*
1244  * netmap_txsync_to_host() passes packets up. We are called from a
1245  * system call in user process context, and the only contention
1246  * can be among multiple user threads erroneously calling
1247  * this routine concurrently.
1248  */
1249 static int
1250 netmap_txsync_to_host(struct netmap_kring *kring, int flags)
1251 {
1252 	struct netmap_adapter *na = kring->na;
1253 	u_int const lim = kring->nkr_num_slots - 1;
1254 	u_int const head = kring->rhead;
1255 	struct mbq q;
1256 
1257 	/* Take packets from hwcur to head and pass them up.
1258 	 * Force hwcur = head since netmap_grab_packets() stops at head
1259 	 */
1260 	mbq_init(&q);
1261 	netmap_grab_packets(kring, &q, 1 /* force */);
1262 	ND("have %d pkts in queue", mbq_len(&q));
1263 	kring->nr_hwcur = head;
1264 	kring->nr_hwtail = head + lim;
1265 	if (kring->nr_hwtail > lim)
1266 		kring->nr_hwtail -= lim + 1;
1267 
1268 	netmap_send_up(na->ifp, &q);
1269 	return 0;
1270 }
1271 
1272 
1273 /*
1274  * rxsync backend for packets coming from the host stack.
1275  * They have been put in kring->rx_queue by netmap_transmit().
1276  * We protect access to the kring using kring->rx_queue.lock
1277  *
1278  * This also moves to the NIC hw rings any packet the user has marked
1279  * for transparent-mode forwarding, then sets the NR_FORWARD
1280  * flag in the kring to let the caller push them out
1281  */
1282 static int
1283 netmap_rxsync_from_host(struct netmap_kring *kring, int flags)
1284 {
1285 	struct netmap_adapter *na = kring->na;
1286 	struct netmap_ring *ring = kring->ring;
1287 	u_int nm_i, n;
1288 	u_int const lim = kring->nkr_num_slots - 1;
1289 	u_int const head = kring->rhead;
1290 	int ret = 0;
1291 	struct mbq *q = &kring->rx_queue, fq;
1292 
1293 	mbq_init(&fq); /* fq holds packets to be freed */
1294 
1295 	mbq_lock(q);
1296 
1297 	/* First part: import newly received packets */
1298 	n = mbq_len(q);
1299 	if (n) { /* grab packets from the queue */
1300 		struct mbuf *m;
1301 		uint32_t stop_i;
1302 
1303 		nm_i = kring->nr_hwtail;
1304 		stop_i = nm_prev(kring->nr_hwcur, lim);
1305 		while ( nm_i != stop_i && (m = mbq_dequeue(q)) != NULL ) {
1306 			int len = MBUF_LEN(m);
1307 			struct netmap_slot *slot = &ring->slot[nm_i];
1308 
1309 			m_copydata(m, 0, len, NMB(na, slot));
1310 			ND("nm %d len %d", nm_i, len);
1311 			if (netmap_verbose)
1312 				D("%s", nm_dump_buf(NMB(na, slot), len, 128, NULL));
1313 
1314 			slot->len = len;
1315 			slot->flags = 0;
1316 			nm_i = nm_next(nm_i, lim);
1317 			mbq_enqueue(&fq, m);
1318 		}
1319 		kring->nr_hwtail = nm_i;
1320 	}
1321 
1322 	/*
1323 	 * Second part: skip past packets that userspace has released.
1324 	 */
1325 	nm_i = kring->nr_hwcur;
1326 	if (nm_i != head) { /* something was released */
1327 		if (nm_may_forward_down(kring, flags)) {
1328 			ret = netmap_sw_to_nic(na);
1329 			if (ret > 0) {
1330 				kring->nr_kflags |= NR_FORWARD;
1331 				ret = 0;
1332 			}
1333 		}
1334 		kring->nr_hwcur = head;
1335 	}
1336 
1337 	mbq_unlock(q);
1338 
1339 	mbq_purge(&fq);
1340 	mbq_fini(&fq);
1341 
1342 	return ret;
1343 }
1344 
1345 
1346 /* Get a netmap adapter for the port.
1347  *
1348  * If it is possible to satisfy the request, return 0
1349  * with *na containing the netmap adapter found.
1350  * Otherwise return an error code, with *na containing NULL.
1351  *
1352  * When the port is attached to a bridge, we always return
1353  * EBUSY.
1354  * Otherwise, if the port is already bound to a file descriptor,
1355  * then we unconditionally return the existing adapter into *na.
1356  * In all the other cases, we return (into *na) either native,
1357  * generic or NULL, according to the following table:
1358  *
1359  *					native_support
1360  * active_fds   dev.netmap.admode         YES     NO
1361  * -------------------------------------------------------
1362  *    >0              *                 NA(ifp) NA(ifp)
1363  *
1364  *     0        NETMAP_ADMODE_BEST      NATIVE  GENERIC
1365  *     0        NETMAP_ADMODE_NATIVE    NATIVE   NULL
1366  *     0        NETMAP_ADMODE_GENERIC   GENERIC GENERIC
1367  *
1368  */
1369 static void netmap_hw_dtor(struct netmap_adapter *); /* needed by NM_IS_NATIVE() */
1370 int
1371 netmap_get_hw_na(struct ifnet *ifp, struct netmap_mem_d *nmd, struct netmap_adapter **na)
1372 {
1373 	/* generic support */
1374 	int i = netmap_admode;	/* Take a snapshot. */
1375 	struct netmap_adapter *prev_na;
1376 	int error = 0;
1377 
1378 	*na = NULL; /* default */
1379 
1380 	/* reset in case of invalid value */
1381 	if (i < NETMAP_ADMODE_BEST || i >= NETMAP_ADMODE_LAST)
1382 		i = netmap_admode = NETMAP_ADMODE_BEST;
1383 
1384 	if (NM_NA_VALID(ifp)) {
1385 		prev_na = NA(ifp);
1386 		/* If an adapter already exists, return it if
1387 		 * there are active file descriptors or if
1388 		 * netmap is not forced to use generic
1389 		 * adapters.
1390 		 */
1391 		if (NETMAP_OWNED_BY_ANY(prev_na)
1392 			|| i != NETMAP_ADMODE_GENERIC
1393 			|| prev_na->na_flags & NAF_FORCE_NATIVE
1394 #ifdef WITH_PIPES
1395 			/* ugly, but we cannot allow an adapter switch
1396 			 * if some pipe is referring to this one
1397 			 */
1398 			|| prev_na->na_next_pipe > 0
1399 #endif
1400 		) {
1401 			*na = prev_na;
1402 			goto assign_mem;
1403 		}
1404 	}
1405 
1406 	/* If there isn't native support and netmap is not allowed
1407 	 * to use generic adapters, we cannot satisfy the request.
1408 	 */
1409 	if (!NM_IS_NATIVE(ifp) && i == NETMAP_ADMODE_NATIVE)
1410 		return EOPNOTSUPP;
1411 
1412 	/* Otherwise, create a generic adapter and return it,
1413 	 * saving the previously used netmap adapter, if any.
1414 	 *
1415 	 * Note that here 'prev_na', if not NULL, MUST be a
1416 	 * native adapter, and CANNOT be a generic one. This is
1417 	 * true because generic adapters are created on demand, and
1418 	 * destroyed when not used anymore. Therefore, if the adapter
1419 	 * currently attached to an interface 'ifp' is generic, it
1420 	 * must be that
1421 	 * (NA(ifp)->active_fds > 0 || NETMAP_OWNED_BY_KERN(NA(ifp))).
1422 	 * Consequently, if NA(ifp) is generic, we will enter one of
1423 	 * the branches above. This ensures that we never override
1424 	 * a generic adapter with another generic adapter.
1425 	 */
1426 	error = generic_netmap_attach(ifp);
1427 	if (error)
1428 		return error;
1429 
1430 	*na = NA(ifp);
1431 
1432 assign_mem:
1433 	if (nmd != NULL && !((*na)->na_flags & NAF_MEM_OWNER) &&
1434 	    (*na)->active_fds == 0 && ((*na)->nm_mem != nmd)) {
1435 		(*na)->nm_mem_prev = (*na)->nm_mem;
1436 		(*na)->nm_mem = netmap_mem_get(nmd);
1437 	}
1438 
1439 	return 0;
1440 }
1441 
1442 /*
1443  * MUST BE CALLED UNDER NMG_LOCK()
1444  *
1445  * Get a refcounted reference to a netmap adapter attached
1446  * to the interface specified by nmr.
1447  * This is always called in the execution of an ioctl().
1448  *
1449  * Return ENXIO if the interface specified by the request does
1450  * not exist, ENOTSUP if netmap is not supported by the interface,
1451  * EBUSY if the interface is already attached to a bridge,
1452  * EINVAL if parameters are invalid, ENOMEM if needed resources
1453  * could not be allocated.
1454  * If successful, hold a reference to the netmap adapter.
1455  *
1456  * If the interface specified by nmr is a system one, also keep
1457  * a reference to it and return a valid *ifp.
1458  */
1459 int
1460 netmap_get_na(struct nmreq *nmr, struct netmap_adapter **na,
1461 	      struct ifnet **ifp, struct netmap_mem_d *nmd, int create)
1462 {
1463 	int error = 0;
1464 	struct netmap_adapter *ret = NULL;
1465 	int nmd_ref = 0;
1466 
1467 	*na = NULL;     /* default return value */
1468 	*ifp = NULL;
1469 
1470 	NMG_LOCK_ASSERT();
1471 
1472 	/* if the request contains a memid, try to find the
1473 	 * corresponding memory region
1474 	 */
1475 	if (nmd == NULL && nmr->nr_arg2) {
1476 		nmd = netmap_mem_find(nmr->nr_arg2);
1477 		if (nmd == NULL)
1478 			return EINVAL;
1479 		/* keep the reference */
1480 		nmd_ref = 1;
1481 	}
1482 
1483 	/* We cascade through all possible types of netmap adapter.
1484 	 * All netmap_get_*_na() functions return an error and an na,
1485 	 * with the following combinations:
1486 	 *
1487 	 * error    na
1488 	 *   0	   NULL		type doesn't match
1489 	 *  !0	   NULL		type matches, but na creation/lookup failed
1490 	 *   0	  !NULL		type matches and na created/found
1491 	 *  !0    !NULL		impossible
1492 	 */
1493 
1494 	/* try to see if this is a ptnetmap port */
1495 	error = netmap_get_pt_host_na(nmr, na, nmd, create);
1496 	if (error || *na != NULL)
1497 		goto out;
1498 
1499 	/* try to see if this is a monitor port */
1500 	error = netmap_get_monitor_na(nmr, na, nmd, create);
1501 	if (error || *na != NULL)
1502 		goto out;
1503 
1504 	/* try to see if this is a pipe port */
1505 	error = netmap_get_pipe_na(nmr, na, nmd, create);
1506 	if (error || *na != NULL)
1507 		goto out;
1508 
1509 	/* try to see if this is a bridge port */
1510 	error = netmap_get_bdg_na(nmr, na, nmd, create);
1511 	if (error)
1512 		goto out;
1513 
1514 	if (*na != NULL) /* valid match in netmap_get_bdg_na() */
1515 		goto out;
1516 
1517 	/*
1518 	 * This must be a hardware na, lookup the name in the system.
1519 	 * Note that by hardware we actually mean "it shows up in ifconfig".
1520 	 * This may still be a tap, a veth/epair, or even a
1521 	 * persistent VALE port.
1522 	 */
1523 	*ifp = ifunit_ref(nmr->nr_name);
1524 	if (*ifp == NULL) {
1525 		error = ENXIO;
1526 		goto out;
1527 	}
1528 
1529 	error = netmap_get_hw_na(*ifp, nmd, &ret);
1530 	if (error)
1531 		goto out;
1532 
1533 	*na = ret;
1534 	netmap_adapter_get(ret);
1535 
1536 out:
1537 	if (error) {
1538 		if (ret)
1539 			netmap_adapter_put(ret);
1540 		if (*ifp) {
1541 			if_rele(*ifp);
1542 			*ifp = NULL;
1543 		}
1544 	}
1545 	if (nmd_ref)
1546 		netmap_mem_put(nmd);
1547 
1548 	return error;
1549 }
1550 
1551 /* undo netmap_get_na() */
1552 void
1553 netmap_unget_na(struct netmap_adapter *na, struct ifnet *ifp)
1554 {
1555 	if (ifp)
1556 		if_rele(ifp);
1557 	if (na)
1558 		netmap_adapter_put(na);
1559 }
1560 
1561 
1562 #define NM_FAIL_ON(t) do {						\
1563 	if (unlikely(t)) {						\
1564 		RD(5, "%s: fail '" #t "' "				\
1565 			"h %d c %d t %d "				\
1566 			"rh %d rc %d rt %d "				\
1567 			"hc %d ht %d",					\
1568 			kring->name,					\
1569 			head, cur, ring->tail,				\
1570 			kring->rhead, kring->rcur, kring->rtail,	\
1571 			kring->nr_hwcur, kring->nr_hwtail);		\
1572 		return kring->nkr_num_slots;				\
1573 	}								\
1574 } while (0)
1575 
1576 /*
1577  * validate parameters on entry for *_txsync()
1578  * Returns ring->cur if ok, or something >= kring->nkr_num_slots
1579  * in case of error.
1580  *
1581  * rhead, rcur and rtail=hwtail are stored from previous round.
1582  * hwcur is the next packet to send to the ring.
1583  *
1584  * We want
1585  *    hwcur <= *rhead <= head <= cur <= tail = *rtail <= hwtail
1586  *
1587  * hwcur, rhead, rtail and hwtail are reliable
1588  */
1589 u_int
1590 nm_txsync_prologue(struct netmap_kring *kring, struct netmap_ring *ring)
1591 {
1592 	u_int head = ring->head; /* read only once */
1593 	u_int cur = ring->cur; /* read only once */
1594 	u_int n = kring->nkr_num_slots;
1595 
1596 	ND(5, "%s kcur %d ktail %d head %d cur %d tail %d",
1597 		kring->name,
1598 		kring->nr_hwcur, kring->nr_hwtail,
1599 		ring->head, ring->cur, ring->tail);
1600 #if 1 /* kernel sanity checks; but we can trust the kring. */
1601 	NM_FAIL_ON(kring->nr_hwcur >= n || kring->rhead >= n ||
1602 	    kring->rtail >= n ||  kring->nr_hwtail >= n);
1603 #endif /* kernel sanity checks */
1604 	/*
1605 	 * user sanity checks. We only use head;
1606 	 * A, B, ... are possible positions for head:
1607 	 *
1608 	 *  0    A  rhead   B  rtail   C  n-1
1609 	 *  0    D  rtail   E  rhead   F  n-1
1610 	 *
1611 	 * B, F, D are valid. A, C, E are wrong
1612 	 */
1613 	if (kring->rtail >= kring->rhead) {
1614 		/* want rhead <= head <= rtail */
1615 		NM_FAIL_ON(head < kring->rhead || head > kring->rtail);
1616 		/* and also head <= cur <= rtail */
1617 		NM_FAIL_ON(cur < head || cur > kring->rtail);
1618 	} else { /* here rtail < rhead */
1619 		/* we need head outside rtail .. rhead */
1620 		NM_FAIL_ON(head > kring->rtail && head < kring->rhead);
1621 
1622 		/* two cases now: head <= rtail or head >= rhead  */
1623 		if (head <= kring->rtail) {
1624 			/* want head <= cur <= rtail */
1625 			NM_FAIL_ON(cur < head || cur > kring->rtail);
1626 		} else { /* head >= rhead */
1627 			/* cur must be outside rtail..head */
1628 			NM_FAIL_ON(cur > kring->rtail && cur < head);
1629 		}
1630 	}
1631 	if (ring->tail != kring->rtail) {
1632 		RD(5, "%s tail overwritten was %d need %d", kring->name,
1633 			ring->tail, kring->rtail);
1634 		ring->tail = kring->rtail;
1635 	}
1636 	kring->rhead = head;
1637 	kring->rcur = cur;
1638 	return head;
1639 }
1640 
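/*
 * Worked example (illustrative only) for the checks above: with
 * nkr_num_slots = 8, rhead = 6 and rtail = 2 (rtail < rhead, i.e. the
 * ring has wrapped), the new head may be 6, 7, 0, 1 or 2, that is,
 * anything outside the interval (rtail..rhead) = {3, 4, 5}.
 * If head = 7, cur may be 7, 0, 1 or 2 (it must stay outside
 * rtail..head = {3, 4, 5, 6}); if head = 1, cur must lie in
 * head..rtail, i.e. {1, 2}.
 */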
1641 
1642 /*
1643  * validate parameters on entry for *_rxsync()
1644  * Returns ring->head if ok, kring->nkr_num_slots on error.
1645  *
1646  * For a valid configuration,
1647  * hwcur <= head <= cur <= tail <= hwtail
1648  *
1649  * We only consider head and cur.
1650  * hwcur and hwtail are reliable.
1651  *
1652  */
1653 u_int
1654 nm_rxsync_prologue(struct netmap_kring *kring, struct netmap_ring *ring)
1655 {
1656 	uint32_t const n = kring->nkr_num_slots;
1657 	uint32_t head, cur;
1658 
1659 	ND(5,"%s kc %d kt %d h %d c %d t %d",
1660 		kring->name,
1661 		kring->nr_hwcur, kring->nr_hwtail,
1662 		ring->head, ring->cur, ring->tail);
1663 	/*
1664 	 * Before storing the new values, we should check they do not
1665 	 * move backwards. However:
1666 	 * - head is not an issue because the previous value is hwcur;
1667 	 * - cur could in principle go back, however it does not matter
1668 	 *   because we are processing a brand new rxsync()
1669 	 */
1670 	cur = kring->rcur = ring->cur;	/* read only once */
1671 	head = kring->rhead = ring->head;	/* read only once */
1672 #if 1 /* kernel sanity checks */
1673 	NM_FAIL_ON(kring->nr_hwcur >= n || kring->nr_hwtail >= n);
1674 #endif /* kernel sanity checks */
1675 	/* user sanity checks */
1676 	if (kring->nr_hwtail >= kring->nr_hwcur) {
1677 		/* want hwcur <= rhead <= hwtail */
1678 		NM_FAIL_ON(head < kring->nr_hwcur || head > kring->nr_hwtail);
1679 		/* and also rhead <= rcur <= hwtail */
1680 		NM_FAIL_ON(cur < head || cur > kring->nr_hwtail);
1681 	} else {
1682 		/* we need rhead outside hwtail..hwcur */
1683 		NM_FAIL_ON(head < kring->nr_hwcur && head > kring->nr_hwtail);
1684 		/* two cases now: head <= hwtail or head >= hwcur  */
1685 		if (head <= kring->nr_hwtail) {
1686 			/* want head <= cur <= hwtail */
1687 			NM_FAIL_ON(cur < head || cur > kring->nr_hwtail);
1688 		} else {
1689 			/* cur must be outside hwtail..head */
1690 			NM_FAIL_ON(cur < head && cur > kring->nr_hwtail);
1691 		}
1692 	}
1693 	if (ring->tail != kring->rtail) {
1694 		RD(5, "%s tail overwritten was %d need %d",
1695 			kring->name,
1696 			ring->tail, kring->rtail);
1697 		ring->tail = kring->rtail;
1698 	}
1699 	return head;
1700 }
1701 
1702 
1703 /*
1704  * Error routine called when txsync/rxsync detects an error.
1705  * Can't do much more than resetting head = cur = hwcur, tail = hwtail
1706  * Return 1 on reinit.
1707  *
1708  * This routine is only called by the upper half of the kernel.
1709  * It only reads hwcur (which is changed only by the upper half, too)
1710  * and hwtail (which may be changed by the lower half, but only on
1711  * a tx ring and only to increase it, so any error will be recovered
1712  * on the next call). For the above, we don't strictly need to call
1713  * it under lock.
1714  */
1715 int
1716 netmap_ring_reinit(struct netmap_kring *kring)
1717 {
1718 	struct netmap_ring *ring = kring->ring;
1719 	u_int i, lim = kring->nkr_num_slots - 1;
1720 	int errors = 0;
1721 
1722 	// XXX KASSERT nm_kr_tryget
1723 	RD(10, "called for %s", kring->name);
1724 	// XXX probably wrong to trust userspace
1725 	kring->rhead = ring->head;
1726 	kring->rcur  = ring->cur;
1727 	kring->rtail = ring->tail;
1728 
1729 	if (ring->cur > lim)
1730 		errors++;
1731 	if (ring->head > lim)
1732 		errors++;
1733 	if (ring->tail > lim)
1734 		errors++;
1735 	for (i = 0; i <= lim; i++) {
1736 		u_int idx = ring->slot[i].buf_idx;
1737 		u_int len = ring->slot[i].len;
1738 		if (idx < 2 || idx >= kring->na->na_lut.objtotal) {
1739 			RD(5, "bad index at slot %d idx %d len %d ", i, idx, len);
1740 			ring->slot[i].buf_idx = 0;
1741 			ring->slot[i].len = 0;
1742 		} else if (len > NETMAP_BUF_SIZE(kring->na)) {
1743 			ring->slot[i].len = 0;
1744 			RD(5, "bad len at slot %d idx %d len %d", i, idx, len);
1745 		}
1746 	}
1747 	if (errors) {
1748 		RD(10, "total %d errors", errors);
1749 		RD(10, "%s reinit, cur %d -> %d tail %d -> %d",
1750 			kring->name,
1751 			ring->cur, kring->nr_hwcur,
1752 			ring->tail, kring->nr_hwtail);
1753 		ring->head = kring->rhead = kring->nr_hwcur;
1754 		ring->cur  = kring->rcur  = kring->nr_hwcur;
1755 		ring->tail = kring->rtail = kring->nr_hwtail;
1756 	}
1757 	return (errors ? 1 : 0);
1758 }
1759 
1760 /* interpret the ringid and flags fields of an nmreq, by translating them
1761  * into a pair of intervals of ring indices:
1762  *
1763  * [priv->np_txqfirst, priv->np_txqlast) and
1764  * [priv->np_rxqfirst, priv->np_rxqlast)
1765  *
1766  */
1767 int
1768 netmap_interp_ringid(struct netmap_priv_d *priv, uint16_t ringid, uint32_t flags)
1769 {
1770 	struct netmap_adapter *na = priv->np_na;
1771 	u_int j, i = ringid & NETMAP_RING_MASK;
1772 	u_int reg = flags & NR_REG_MASK;
1773 	int excluded_direction[] = { NR_TX_RINGS_ONLY, NR_RX_RINGS_ONLY };
1774 	enum txrx t;
1775 
1776 	if (reg == NR_REG_DEFAULT) {
1777 		/* convert from old ringid to flags */
1778 		if (ringid & NETMAP_SW_RING) {
1779 			reg = NR_REG_SW;
1780 		} else if (ringid & NETMAP_HW_RING) {
1781 			reg = NR_REG_ONE_NIC;
1782 		} else {
1783 			reg = NR_REG_ALL_NIC;
1784 		}
1785 		D("deprecated API, old ringid 0x%x -> ringid %x reg %d", ringid, i, reg);
1786 	}
1787 
1788 	if ((flags & NR_PTNETMAP_HOST) && ((reg != NR_REG_ALL_NIC &&
1789                     reg != NR_REG_PIPE_MASTER && reg != NR_REG_PIPE_SLAVE) ||
1790 			flags & (NR_RX_RINGS_ONLY|NR_TX_RINGS_ONLY))) {
1791 		D("Error: only NR_REG_ALL_NIC supported with netmap passthrough");
1792 		return EINVAL;
1793 	}
1794 
1795 	for_rx_tx(t) {
1796 		if (flags & excluded_direction[t]) {
1797 			priv->np_qfirst[t] = priv->np_qlast[t] = 0;
1798 			continue;
1799 		}
1800 		switch (reg) {
1801 		case NR_REG_ALL_NIC:
1802 		case NR_REG_PIPE_MASTER:
1803 		case NR_REG_PIPE_SLAVE:
1804 			priv->np_qfirst[t] = 0;
1805 			priv->np_qlast[t] = nma_get_nrings(na, t);
1806 			ND("ALL/PIPE: %s %d %d", nm_txrx2str(t),
1807 				priv->np_qfirst[t], priv->np_qlast[t]);
1808 			break;
1809 		case NR_REG_SW:
1810 		case NR_REG_NIC_SW:
1811 			if (!(na->na_flags & NAF_HOST_RINGS)) {
1812 				D("host rings not supported");
1813 				return EINVAL;
1814 			}
1815 			priv->np_qfirst[t] = (reg == NR_REG_SW ?
1816 				nma_get_nrings(na, t) : 0);
1817 			priv->np_qlast[t] = nma_get_nrings(na, t) + 1;
1818 			ND("%s: %s %d %d", reg == NR_REG_SW ? "SW" : "NIC+SW",
1819 				nm_txrx2str(t),
1820 				priv->np_qfirst[t], priv->np_qlast[t]);
1821 			break;
1822 		case NR_REG_ONE_NIC:
1823 			if (i >= na->num_tx_rings && i >= na->num_rx_rings) {
1824 				D("invalid ring id %d", i);
1825 				return EINVAL;
1826 			}
1827 			/* if not enough rings, use the first one */
1828 			j = i;
1829 			if (j >= nma_get_nrings(na, t))
1830 				j = 0;
1831 			priv->np_qfirst[t] = j;
1832 			priv->np_qlast[t] = j + 1;
1833 			ND("ONE_NIC: %s %d %d", nm_txrx2str(t),
1834 				priv->np_qfirst[t], priv->np_qlast[t]);
1835 			break;
1836 		default:
1837 			D("invalid regif type %d", reg);
1838 			return EINVAL;
1839 		}
1840 	}
1841 	priv->np_flags = (flags & ~NR_REG_MASK) | reg;
1842 
1843 	/* Allow transparent forwarding mode in the host --> nic
1844 	 * direction only if all the TX hw rings have been opened. */
1845 	if (priv->np_qfirst[NR_TX] == 0 &&
1846 			priv->np_qlast[NR_TX] >= na->num_tx_rings) {
1847 		priv->np_sync_flags |= NAF_CAN_FORWARD_DOWN;
1848 	}
1849 
1850 	if (netmap_verbose) {
1851 		D("%s: tx [%d,%d) rx [%d,%d) id %d",
1852 			na->name,
1853 			priv->np_qfirst[NR_TX],
1854 			priv->np_qlast[NR_TX],
1855 			priv->np_qfirst[NR_RX],
1856 			priv->np_qlast[NR_RX],
1857 			i);
1858 	}
1859 	return 0;
1860 }
1861 
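/*
 * Examples (illustrative only) of the mapping above, for an adapter
 * with 4 hw tx/rx rings plus host rings:
 *
 *	NR_REG_ALL_NIC			tx [0,4)  rx [0,4)
 *	NR_REG_ONE_NIC, ring 2		tx [2,3)  rx [2,3)
 *	NR_REG_SW			tx [4,5)  rx [4,5)   (host rings)
 *	NR_REG_NIC_SW			tx [0,5)  rx [0,5)
 */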
1862 
1863 /*
1864  * Set the ring ID. For devices with a single queue, a request
1865  * for all rings is the same as a single ring.
1866  */
1867 static int
1868 netmap_set_ringid(struct netmap_priv_d *priv, uint16_t ringid, uint32_t flags)
1869 {
1870 	struct netmap_adapter *na = priv->np_na;
1871 	int error;
1872 	enum txrx t;
1873 
1874 	error = netmap_interp_ringid(priv, ringid, flags);
1875 	if (error) {
1876 		return error;
1877 	}
1878 
1879 	priv->np_txpoll = (ringid & NETMAP_NO_TX_POLL) ? 0 : 1;
1880 
1881 	/* optimization: count the users registered for more than
1882 	 * one ring, which are the ones sleeping on the global queue.
1883 	 * The default netmap_notify() callback will then
1884 	 * avoid signaling the global queue if nobody is using it
1885 	 */
1886 	for_rx_tx(t) {
1887 		if (nm_si_user(priv, t))
1888 			na->si_users[t]++;
1889 	}
1890 	return 0;
1891 }
1892 
1893 static void
1894 netmap_unset_ringid(struct netmap_priv_d *priv)
1895 {
1896 	struct netmap_adapter *na = priv->np_na;
1897 	enum txrx t;
1898 
1899 	for_rx_tx(t) {
1900 		if (nm_si_user(priv, t))
1901 			na->si_users[t]--;
1902 		priv->np_qfirst[t] = priv->np_qlast[t] = 0;
1903 	}
1904 	priv->np_flags = 0;
1905 	priv->np_txpoll = 0;
1906 }
1907 
1908 
1909 /* Set the nr_pending_mode for the requested rings.
1910  * If requested, also try to get exclusive access to the rings, provided
1911  * the rings we want to bind are not exclusively owned by a previous bind.
1912  */
1913 static int
1914 netmap_krings_get(struct netmap_priv_d *priv)
1915 {
1916 	struct netmap_adapter *na = priv->np_na;
1917 	u_int i;
1918 	struct netmap_kring *kring;
1919 	int excl = (priv->np_flags & NR_EXCLUSIVE);
1920 	enum txrx t;
1921 
1922 	if (netmap_verbose)
1923 		D("%s: grabbing tx [%d, %d) rx [%d, %d)",
1924 			na->name,
1925 			priv->np_qfirst[NR_TX],
1926 			priv->np_qlast[NR_TX],
1927 			priv->np_qfirst[NR_RX],
1928 			priv->np_qlast[NR_RX]);
1929 
1930 	/* first round: check that none of the requested rings
1931 	 * is already exclusively owned, and that we are not asking
1932 	 * for exclusive ownership of rings that are already in use
1933 	 */
1934 	for_rx_tx(t) {
1935 		for (i = priv->np_qfirst[t]; i < priv->np_qlast[t]; i++) {
1936 			kring = &NMR(na, t)[i];
1937 			if ((kring->nr_kflags & NKR_EXCLUSIVE) ||
1938 			    (kring->users && excl))
1939 			{
1940 				ND("ring %s busy", kring->name);
1941 				return EBUSY;
1942 			}
1943 		}
1944 	}
1945 
1946 	/* second round: increment usage count (possibly marking them
1947 	 * as exclusive) and set the nr_pending_mode
1948 	 */
1949 	for_rx_tx(t) {
1950 		for (i = priv->np_qfirst[t]; i < priv->np_qlast[t]; i++) {
1951 			kring = &NMR(na, t)[i];
1952 			kring->users++;
1953 			if (excl)
1954 				kring->nr_kflags |= NKR_EXCLUSIVE;
1955 			kring->nr_pending_mode = NKR_NETMAP_ON;
1956 		}
1957 	}
1958 
1959 	return 0;
1960 
1961 }
1962 
1963 /* Undo netmap_krings_get(). This is done by clearing the exclusive mode
1964  * if it was asked on regif, and unsetting the nr_pending_mode if we
1965  * are the last users of the involved rings. */
1966 static void
1967 netmap_krings_put(struct netmap_priv_d *priv)
1968 {
1969 	struct netmap_adapter *na = priv->np_na;
1970 	u_int i;
1971 	struct netmap_kring *kring;
1972 	int excl = (priv->np_flags & NR_EXCLUSIVE);
1973 	enum txrx t;
1974 
1975 	ND("%s: releasing tx [%d, %d) rx [%d, %d)",
1976 			na->name,
1977 			priv->np_qfirst[NR_TX],
1978 			priv->np_qlast[NR_TX],
1979 			priv->np_qfirst[NR_RX],
1980 			priv->np_qlast[NR_RX]);
1981 
1982 
1983 	for_rx_tx(t) {
1984 		for (i = priv->np_qfirst[t]; i < priv->np_qlast[t]; i++) {
1985 			kring = &NMR(na, t)[i];
1986 			if (excl)
1987 				kring->nr_kflags &= ~NKR_EXCLUSIVE;
1988 			kring->users--;
1989 			if (kring->users == 0)
1990 				kring->nr_pending_mode = NKR_NETMAP_OFF;
1991 		}
1992 	}
1993 }
1994 
1995 /*
1996  * possibly move the interface to netmap-mode.
1997  * On success it returns 0 and sets priv->np_nifp, otherwise an error code.
1998  * This must be called with NMG_LOCK held.
1999  *
2000  * The following na callbacks are called in the process:
2001  *
2002  * na->nm_config()			[by netmap_update_config]
2003  * (get current number and size of rings)
2004  *
2005  *  	We have a generic one for linux (netmap_linux_config).
2006  *  	The bwrap has to override this, since it has to forward
2007  *  	the request to the wrapped adapter (netmap_bwrap_config).
2008  *
2009  *
2010  * na->nm_krings_create()
2011  * (create and init the krings array)
2012  *
2013  * 	One of the following:
2014  *
2015  *	* netmap_hw_krings_create, 			(hw ports)
2016  *		creates the standard layout for the krings
2017  * 		and adds the mbq (used for the host rings).
2018  *
2019  * 	* netmap_vp_krings_create			(VALE ports)
2020  * 		add leases and scratchpads
2021  *
2022  * 	* netmap_pipe_krings_create			(pipes)
2023  * 		create the krings and rings of both ends and
2024  * 		cross-link them
2025  *
2026  *      * netmap_monitor_krings_create 			(monitors)
2027  *      	avoid allocating the mbq
2028  *
2029  *      * netmap_bwrap_krings_create			(bwraps)
2030  *      	create the bwrap krings array,
2031  *      	the krings array of the wrapped adapter, and
2032  *      	(if needed) the fake array for the host adapter
2033  *
2034  * na->nm_register(, 1)
2035  * (put the adapter in netmap mode)
2036  *
2037  * 	This may be one of the following:
2038  *
2039  * 	* netmap_hw_reg				        (hw ports)
2040  * 		checks that the ifp is still there, then calls
2041  * 		the hardware specific callback;
2042  *
2043  * 	* netmap_vp_reg					(VALE ports)
2044  *		If the port is connected to a bridge,
2045  *		set the NAF_NETMAP_ON flag under the
2046  *		bridge write lock.
2047  *
2048  *	* netmap_pipe_reg				(pipes)
2049  *		inform the other pipe end that it is no
2050  *		longer responsible for the lifetime of this
2051  *		pipe end
2052  *
2053  *	* netmap_monitor_reg				(monitors)
2054  *		intercept the sync callbacks of the monitored
2055  *		rings
2056  *
2057  *	* netmap_bwrap_reg				(bwraps)
2058  *		cross-link the bwrap and hwna rings,
2059  *		forward the request to the hwna, override
2060  *		the hwna notify callback (to get the frames
2061  *		coming from outside go through the bridge).
2062  *
2063  *
2064  */
2065 int
2066 netmap_do_regif(struct netmap_priv_d *priv, struct netmap_adapter *na,
2067 	uint16_t ringid, uint32_t flags)
2068 {
2069 	struct netmap_if *nifp = NULL;
2070 	int error;
2071 
2072 	NMG_LOCK_ASSERT();
2073 	/* ring configuration may have changed, fetch from the card */
2074 	netmap_update_config(na);
2075 	priv->np_na = na;     /* store the reference */
2076 	error = netmap_set_ringid(priv, ringid, flags);
2077 	if (error)
2078 		goto err;
2079 	error = netmap_mem_finalize(na->nm_mem, na);
2080 	if (error)
2081 		goto err;
2082 
2083 	if (na->active_fds == 0) {
2084 		/*
2085 		 * If this is the first registration of the adapter,
2086 		 * perform sanity checks and create the in-kernel view
2087 		 * of the netmap rings (the netmap krings).
2088 		 */
2089 		if (na->ifp) {
2090 			/* This netmap adapter is attached to an ifnet. */
2091 			unsigned nbs = netmap_mem_bufsize(na->nm_mem);
2092 			unsigned mtu = nm_os_ifnet_mtu(na->ifp);
2093 			/* The maximum number of bytes that a single
2094 			 * receive or transmit NIC descriptor can hold. */
2095 			unsigned hw_max_slot_len = 4096;
2096 
2097 			if (mtu <= hw_max_slot_len) {
2098 				/* The MTU fits a single NIC slot. We only
2099 			 * need to check that netmap buffers are
2100 				 * large enough to hold an MTU. NS_MOREFRAG
2101 				 * cannot be used in this case. */
2102 				if (nbs < mtu) {
2103 					nm_prerr("error: netmap buf size (%u) "
2104 						"< device MTU (%u)", nbs, mtu);
2105 					error = EINVAL;
2106 					goto err_drop_mem;
2107 				}
2108 			} else {
2109 				/* More NIC slots may be needed to receive
2110 				 * or transmit a single packet. Check that
2111 				 * the adapter supports NS_MOREFRAG and that
2112 				 * netmap buffers are large enough to hold
2113 				 * the maximum per-slot size. */
2114 				if (!(na->na_flags & NAF_MOREFRAG)) {
2115 					nm_prerr("error: large MTU (%d) needed "
2116 						"but %s does not support "
2117 						"NS_MOREFRAG", mtu,
2118 						na->ifp->if_xname);
2119 					error = EINVAL;
2120 					goto err_drop_mem;
2121 				} else if (nbs < hw_max_slot_len) {
2122 					nm_prerr("error: using NS_MOREFRAG on "
2123 						"%s requires netmap buf size "
2124 						">= %u", na->ifp->if_xname,
2125 						hw_max_slot_len);
2126 					error = EINVAL;
2127 					goto err_drop_mem;
2128 				} else {
2129 					nm_prinf("info: netmap application on "
2130 						"%s needs to support "
2131 						"NS_MOREFRAG "
2132 						"(MTU=%u,netmap_buf_size=%u)",
2133 						na->ifp->if_xname, mtu, nbs);
2134 				}
2135 			}
2136 		}
2137 
2138 		/*
2139 		 * Depending on the adapter, this may also create
2140 		 * the netmap rings themselves
2141 		 */
2142 		error = na->nm_krings_create(na);
2143 		if (error)
2144 			goto err_drop_mem;
2145 
2146 	}
2147 
2148 	/* now the krings must exist and we can check whether some
2149 	 * previous bind has exclusive ownership on them, and set
2150 	 * nr_pending_mode
2151 	 */
2152 	error = netmap_krings_get(priv);
2153 	if (error)
2154 		goto err_del_krings;
2155 
2156 	/* create all needed missing netmap rings */
2157 	error = netmap_mem_rings_create(na);
2158 	if (error)
2159 		goto err_rel_excl;
2160 
2161 	/* in all cases, create a new netmap if */
2162 	nifp = netmap_mem_if_new(na, priv);
2163 	if (nifp == NULL) {
2164 		error = ENOMEM;
2165 		goto err_del_rings;
2166 	}
2167 
2168 	if (na->active_fds == 0) {
2169 		/* cache the allocator info in the na */
2170 		error = netmap_mem_get_lut(na->nm_mem, &na->na_lut);
2171 		if (error)
2172 			goto err_del_if;
2173 		ND("lut %p bufs %u size %u", na->na_lut.lut, na->na_lut.objtotal,
2174 					    na->na_lut.objsize);
2175 	}
2176 
2177 	if (nm_kring_pending(priv)) {
2178 		/* Some kring is switching mode, tell the adapter to
2179 		 * react on this. */
2180 		error = na->nm_register(na, 1);
2181 		if (error)
2182 			goto err_put_lut;
2183 	}
2184 
2185 	/* Commit the reference. */
2186 	na->active_fds++;
2187 
2188 	/*
2189 	 * advertise that the interface is ready by setting np_nifp.
2190 	 * The barrier is needed because readers (poll, *SYNC and mmap)
2191 	 * check for priv->np_nifp != NULL without locking
2192 	 */
2193 	mb(); /* make sure previous writes are visible to all CPUs */
2194 	priv->np_nifp = nifp;
2195 
2196 	return 0;
2197 
2198 err_put_lut:
2199 	if (na->active_fds == 0)
2200 		memset(&na->na_lut, 0, sizeof(na->na_lut));
2201 err_del_if:
2202 	netmap_mem_if_delete(na, nifp);
2203 err_del_rings:
2204 	netmap_mem_rings_delete(na);
2205 err_rel_excl:
2206 	netmap_krings_put(priv);
2207 err_del_krings:
2208 	if (na->active_fds == 0)
2209 		na->nm_krings_delete(na);
2210 err_drop_mem:
2211 	netmap_mem_drop(na);
2212 err:
2213 	priv->np_na = NULL;
2214 	return error;
2215 }
2216 
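/*
 * Worked example (illustrative only) of the MTU/buffer checks in
 * netmap_do_regif(): with the common 2048-byte netmap buffers
 * (nbs = 2048), an MTU of 1500 fits a single slot and the bind
 * succeeds, since nbs >= mtu. An MTU of 9000 exceeds hw_max_slot_len
 * (4096), so the adapter must advertise NAF_MOREFRAG and nbs must be
 * at least 4096; with 2048-byte buffers the bind therefore fails
 * with EINVAL unless the buffer size is raised.
 */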
2217 
2218 /*
2219  * update kring and ring at the end of rxsync/txsync.
2220  */
2221 static inline void
2222 nm_sync_finalize(struct netmap_kring *kring)
2223 {
2224 	/*
2225 	 * Update ring tail to what the kernel knows
2226 	 * After txsync: head/rhead/hwcur might be behind cur/rcur
2227 	 * if no carrier.
2228 	 */
2229 	kring->ring->tail = kring->rtail = kring->nr_hwtail;
2230 
2231 	ND(5, "%s now hwcur %d hwtail %d head %d cur %d tail %d",
2232 		kring->name, kring->nr_hwcur, kring->nr_hwtail,
2233 		kring->rhead, kring->rcur, kring->rtail);
2234 }
2235 
2236 /* set ring timestamp */
2237 static inline void
2238 ring_timestamp_set(struct netmap_ring *ring)
2239 {
2240 	if (netmap_no_timestamp == 0 || ring->flags & NR_TIMESTAMP) {
2241 		microtime(&ring->ts);
2242 	}
2243 }
2244 
2245 
2246 /*
2247  * ioctl(2) support for the "netmap" device.
2248  *
2249  * The following commands are accepted:
2250  * - NIOCGINFO
2251  * - SIOCGIFADDR	just for convenience
2252  * - NIOCREGIF
2253  * - NIOCTXSYNC
2254  * - NIOCRXSYNC
2255  *
2256  * Return 0 on success, errno otherwise.
2257  */
2258 int
2259 netmap_ioctl(struct netmap_priv_d *priv, u_long cmd, caddr_t data, struct thread *td)
2260 {
2261 	struct mbq q;	/* packets from RX hw queues to host stack */
2262 	struct nmreq *nmr = (struct nmreq *) data;
2263 	struct netmap_adapter *na = NULL;
2264 	struct netmap_mem_d *nmd = NULL;
2265 	struct ifnet *ifp = NULL;
2266 	int error = 0;
2267 	u_int i, qfirst, qlast;
2268 	struct netmap_if *nifp;
2269 	struct netmap_kring *krings;
2270 	int sync_flags;
2271 	enum txrx t;
2272 
2273 	if (cmd == NIOCGINFO || cmd == NIOCREGIF) {
2274 		/* truncate name */
2275 		nmr->nr_name[sizeof(nmr->nr_name) - 1] = '\0';
2276 		if (nmr->nr_version != NETMAP_API) {
2277 			D("API mismatch for %s got %d need %d",
2278 				nmr->nr_name,
2279 				nmr->nr_version, NETMAP_API);
2280 			nmr->nr_version = NETMAP_API;
2281 		}
2282 		if (nmr->nr_version < NETMAP_MIN_API ||
2283 		    nmr->nr_version > NETMAP_MAX_API) {
2284 			return EINVAL;
2285 		}
2286 	}
2287 
2288 	switch (cmd) {
2289 	case NIOCGINFO:		/* return capabilities etc */
2290 		if (nmr->nr_cmd == NETMAP_BDG_LIST) {
2291 			error = netmap_bdg_ctl(nmr, NULL);
2292 			break;
2293 		}
2294 
2295 		NMG_LOCK();
2296 		do {
2297 			/* memsize is always valid */
2298 			u_int memflags;
2299 			uint64_t memsize;
2300 
2301 			if (nmr->nr_name[0] != '\0') {
2302 
2303 				/* get a refcount */
2304 				error = netmap_get_na(nmr, &na, &ifp, NULL, 1 /* create */);
2305 				if (error) {
2306 					na = NULL;
2307 					ifp = NULL;
2308 					break;
2309 				}
2310 				nmd = na->nm_mem; /* get memory allocator */
2311 			} else {
2312 				nmd = netmap_mem_find(nmr->nr_arg2 ? nmr->nr_arg2 : 1);
2313 				if (nmd == NULL) {
2314 					error = EINVAL;
2315 					break;
2316 				}
2317 			}
2318 
2319 			error = netmap_mem_get_info(nmd, &memsize, &memflags,
2320 				&nmr->nr_arg2);
2321 			if (error)
2322 				break;
2323 			nmr->nr_memsize = (uint32_t)memsize;
2324 			if (na == NULL) /* only memory info */
2325 				break;
2326 			nmr->nr_offset = 0;
2327 			nmr->nr_rx_slots = nmr->nr_tx_slots = 0;
2328 			netmap_update_config(na);
2329 			nmr->nr_rx_rings = na->num_rx_rings;
2330 			nmr->nr_tx_rings = na->num_tx_rings;
2331 			nmr->nr_rx_slots = na->num_rx_desc;
2332 			nmr->nr_tx_slots = na->num_tx_desc;
2333 		} while (0);
2334 		netmap_unget_na(na, ifp);
2335 		NMG_UNLOCK();
2336 		break;
2337 
2338 	case NIOCREGIF:
2339 		/*
2340 		 * If nmr->nr_cmd is not zero, this NIOCREGIF is not really
2341 		 * a regif operation, but a different one, specified by the
2342 		 * value of nmr->nr_cmd.
2343 		 */
2344 		i = nmr->nr_cmd;
2345 		if (i == NETMAP_BDG_ATTACH || i == NETMAP_BDG_DETACH
2346 				|| i == NETMAP_BDG_VNET_HDR
2347 				|| i == NETMAP_BDG_NEWIF
2348 				|| i == NETMAP_BDG_DELIF
2349 				|| i == NETMAP_BDG_POLLING_ON
2350 				|| i == NETMAP_BDG_POLLING_OFF) {
2351 			/* possibly attach/detach NIC and VALE switch */
2352 			error = netmap_bdg_ctl(nmr, NULL);
2353 			break;
2354 		} else if (i == NETMAP_PT_HOST_CREATE || i == NETMAP_PT_HOST_DELETE) {
2355 			/* forward the command to the ptnetmap subsystem */
2356 			error = ptnetmap_ctl(nmr, priv->np_na);
2357 			break;
2358 		} else if (i == NETMAP_VNET_HDR_GET) {
2359 			/* get vnet-header length for this netmap port */
2360 			struct ifnet *ifp;
2361 
2362 			NMG_LOCK();
2363 			error = netmap_get_na(nmr, &na, &ifp, NULL, 0);
2364 			if (na && !error) {
2365 				nmr->nr_arg1 = na->virt_hdr_len;
2366 			}
2367 			netmap_unget_na(na, ifp);
2368 			NMG_UNLOCK();
2369 			break;
2370 		} else if (i == NETMAP_POOLS_INFO_GET) {
2371 			/* get information from the memory allocator */
2372 			NMG_LOCK();
2373 			if (priv->np_na && priv->np_na->nm_mem) {
2374 				struct netmap_mem_d *nmd = priv->np_na->nm_mem;
2375 				error = netmap_mem_pools_info_get(nmr, nmd);
2376 			} else {
2377 				error = EINVAL;
2378 			}
2379 			NMG_UNLOCK();
2380 			break;
2381 		} else if (i == NETMAP_POOLS_CREATE) {
2382 			nmd = netmap_mem_ext_create(nmr, &error);
2383 			if (nmd == NULL)
2384 				break;
2385 			/* reset the fields used by POOLS_CREATE to
2386 			 * avoid confusing the rest of the code
2387 			 */
2388 			nmr->nr_cmd = 0;
2389 			nmr->nr_arg1 = 0;
2390 			nmr->nr_arg2 = 0;
2391 			nmr->nr_arg3 = 0;
2392 		} else if (i != 0) {
2393 			D("nr_cmd must be 0 not %d", i);
2394 			error = EINVAL;
2395 			break;
2396 		}
2397 
2398 		/* protect access to priv from concurrent NIOCREGIF */
2399 		NMG_LOCK();
2400 		do {
2401 			u_int memflags;
2402 			uint64_t memsize;
2403 
2404 			if (priv->np_nifp != NULL) {	/* thread already registered */
2405 				error = EBUSY;
2406 				break;
2407 			}
2408 
2409 			if (nmr->nr_arg2) {
2410 				/* find the allocator and get a reference */
2411 				nmd = netmap_mem_find(nmr->nr_arg2);
2412 				if (nmd == NULL) {
2413 					error = EINVAL;
2414 					break;
2415 				}
2416 			}
2417 			/* find the interface and a reference */
2418 			error = netmap_get_na(nmr, &na, &ifp, nmd,
2419 					      1 /* create */); /* keep reference */
2420 			if (error)
2421 				break;
2422 			if (NETMAP_OWNED_BY_KERN(na)) {
2423 				error = EBUSY;
2424 				break;
2425 			}
2426 
2427 			if (na->virt_hdr_len && !(nmr->nr_flags & NR_ACCEPT_VNET_HDR)) {
2428 				error = EIO;
2429 				break;
2430 			}
2431 
2432 			error = netmap_do_regif(priv, na, nmr->nr_ringid, nmr->nr_flags);
2433 			if (error) {    /* reg. failed, release priv and ref */
2434 				break;
2435 			}
2436 			nifp = priv->np_nifp;
2437 			priv->np_td = td; // XXX kqueue, debugging only
2438 
2439 			/* return the offset of the netmap_if object */
2440 			nmr->nr_rx_rings = na->num_rx_rings;
2441 			nmr->nr_tx_rings = na->num_tx_rings;
2442 			nmr->nr_rx_slots = na->num_rx_desc;
2443 			nmr->nr_tx_slots = na->num_tx_desc;
2444 			error = netmap_mem_get_info(na->nm_mem, &memsize, &memflags,
2445 				&nmr->nr_arg2);
2446 			if (error) {
2447 				netmap_do_unregif(priv);
2448 				break;
2449 			}
2450 			nmr->nr_memsize = (uint32_t)memsize;
2451 			if (memflags & NETMAP_MEM_PRIVATE) {
2452 				*(uint32_t *)(uintptr_t)&nifp->ni_flags |= NI_PRIV_MEM;
2453 			}
2454 			for_rx_tx(t) {
2455 				priv->np_si[t] = nm_si_user(priv, t) ?
2456 					&na->si[t] : &NMR(na, t)[priv->np_qfirst[t]].si;
2457 			}
2458 
2459 			if (nmr->nr_arg3) {
2460 				if (netmap_verbose)
2461 					D("requested %d extra buffers", nmr->nr_arg3);
2462 				nmr->nr_arg3 = netmap_extra_alloc(na,
2463 					&nifp->ni_bufs_head, nmr->nr_arg3);
2464 				if (netmap_verbose)
2465 					D("got %d extra buffers", nmr->nr_arg3);
2466 			}
2467 			nmr->nr_offset = netmap_mem_if_offset(na->nm_mem, nifp);
2468 
2469 			/* store ifp reference so that priv destructor may release it */
2470 			priv->np_ifp = ifp;
2471 		} while (0);
2472 		if (error) {
2473 			netmap_unget_na(na, ifp);
2474 		}
2475 		/* release the reference from netmap_mem_find() or
2476 		 * netmap_mem_ext_create()
2477 		 */
2478 		if (nmd)
2479 			netmap_mem_put(nmd);
2480 		NMG_UNLOCK();
2481 		break;
2482 
2483 	case NIOCTXSYNC:
2484 	case NIOCRXSYNC:
2485 		nifp = priv->np_nifp;
2486 
2487 		if (nifp == NULL) {
2488 			error = ENXIO;
2489 			break;
2490 		}
2491 		mb(); /* make sure following reads are not from cache */
2492 
2493 		na = priv->np_na;      /* we have a reference */
2494 
2495 		if (na == NULL) {
2496 			D("Internal error: nifp != NULL && na == NULL");
2497 			error = ENXIO;
2498 			break;
2499 		}
2500 
2501 		mbq_init(&q);
2502 		t = (cmd == NIOCTXSYNC ? NR_TX : NR_RX);
2503 		krings = NMR(na, t);
2504 		qfirst = priv->np_qfirst[t];
2505 		qlast = priv->np_qlast[t];
2506 		sync_flags = priv->np_sync_flags;
2507 
2508 		for (i = qfirst; i < qlast; i++) {
2509 			struct netmap_kring *kring = krings + i;
2510 			struct netmap_ring *ring = kring->ring;
2511 
2512 			if (unlikely(nm_kr_tryget(kring, 1, &error))) {
2513 				error = (error ? EIO : 0);
2514 				continue;
2515 			}
2516 
2517 			if (cmd == NIOCTXSYNC) {
2518 				if (netmap_verbose & NM_VERB_TXSYNC)
2519 					D("pre txsync ring %d cur %d hwcur %d",
2520 					    i, ring->cur,
2521 					    kring->nr_hwcur);
2522 				if (nm_txsync_prologue(kring, ring) >= kring->nkr_num_slots) {
2523 					netmap_ring_reinit(kring);
2524 				} else if (kring->nm_sync(kring, sync_flags | NAF_FORCE_RECLAIM) == 0) {
2525 					nm_sync_finalize(kring);
2526 				}
2527 				if (netmap_verbose & NM_VERB_TXSYNC)
2528 					D("post txsync ring %d cur %d hwcur %d",
2529 					    i, ring->cur,
2530 					    kring->nr_hwcur);
2531 			} else {
2532 				if (nm_rxsync_prologue(kring, ring) >= kring->nkr_num_slots) {
2533 					netmap_ring_reinit(kring);
2534 				}
2535 				if (nm_may_forward_up(kring)) {
2536 					/* transparent forwarding, see netmap_poll() */
2537 					netmap_grab_packets(kring, &q, netmap_fwd);
2538 				}
2539 				if (kring->nm_sync(kring, sync_flags | NAF_FORCE_READ) == 0) {
2540 					nm_sync_finalize(kring);
2541 				}
2542 				ring_timestamp_set(ring);
2543 			}
2544 			nm_kr_put(kring);
2545 		}
2546 
2547 		if (mbq_peek(&q)) {
2548 			netmap_send_up(na->ifp, &q);
2549 		}
2550 
2551 		break;
2552 
2553 #ifdef WITH_VALE
2554 	case NIOCCONFIG:
2555 		error = netmap_bdg_config(nmr);
2556 		break;
2557 #endif
2558 #ifdef __FreeBSD__
2559 	case FIONBIO:
2560 	case FIOASYNC:
2561 		ND("FIONBIO/FIOASYNC are no-ops");
2562 		break;
2563 
2564 	case BIOCIMMEDIATE:
2565 	case BIOCGHDRCMPLT:
2566 	case BIOCSHDRCMPLT:
2567 	case BIOCSSEESENT:
2568 		D("ignore BIOCIMMEDIATE/BIOCSHDRCMPLT/BIOCSHDRCMPLT/BIOCSSEESENT");
2569 		break;
2570 
2571 	default:	/* allow device-specific ioctls */
2572 	    {
2573 		struct ifnet *ifp = ifunit_ref(nmr->nr_name);
2574 		if (ifp == NULL) {
2575 			error = ENXIO;
2576 		} else {
2577 			struct socket so;
2578 
2579 			bzero(&so, sizeof(so));
2580 			so.so_vnet = ifp->if_vnet;
2581 			// so->so_proto not null.
2582 			error = ifioctl(&so, cmd, data, td);
2583 			if_rele(ifp);
2584 		}
2585 		break;
2586 	    }
2587 
2588 #else /* linux */
2589 	default:
2590 		error = EOPNOTSUPP;
2591 #endif /* linux */
2592 	}
2593 
2594 	return (error);
2595 }
2596 
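/*
 * Illustrative userspace sketch (not part of this module): how an
 * application exercises the NIOCREGIF/NIOCTXSYNC/NIOCRXSYNC paths
 * handled above. The interface name "em0" and the use of ring 0 are
 * arbitrary; the macros come from net/netmap_user.h and error
 * handling is omitted:
 *
 *	struct nmreq nmr;
 *	int fd = open("/dev/netmap", O_RDWR);
 *
 *	memset(&nmr, 0, sizeof(nmr));
 *	strncpy(nmr.nr_name, "em0", sizeof(nmr.nr_name));
 *	nmr.nr_version = NETMAP_API;
 *	ioctl(fd, NIOCREGIF, &nmr);		// bind all hw rings of em0
 *	void *mem = mmap(NULL, nmr.nr_memsize, PROT_READ | PROT_WRITE,
 *			 MAP_SHARED, fd, 0);
 *	struct netmap_if *nifp = NETMAP_IF(mem, nmr.nr_offset);
 *	struct netmap_ring *txring = NETMAP_TXRING(nifp, 0);
 *
 *	... fill free slots, then advance txring->head and txring->cur ...
 *
 *	ioctl(fd, NIOCTXSYNC, NULL);		// flush packets to the NIC
 *	ioctl(fd, NIOCRXSYNC, NULL);		// or use poll()/select()
 */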
2597 
2598 /*
2599  * select(2) and poll(2) handlers for the "netmap" device.
2600  *
2601  * Can be called for one or more queues.
2602  * Return the event mask corresponding to ready events.
2603  * If there are no ready events, do a selrecord on either individual
2604  * selinfo or on the global one.
2605  * Device-dependent parts (locking and sync of tx/rx rings)
2606  * are done through callbacks.
2607  *
2608  * On linux, arguments are really pwait, the poll table, and 'td' is struct file *
2609  * The first one is remapped to pwait as selrecord() uses the name as an
2610  * hidden argument.
2611  */
2612 int
2613 netmap_poll(struct netmap_priv_d *priv, int events, NM_SELRECORD_T *sr)
2614 {
2615 	struct netmap_adapter *na;
2616 	struct netmap_kring *kring;
2617 	struct netmap_ring *ring;
2618 	u_int i, check_all_tx, check_all_rx, want[NR_TXRX], revents = 0;
2619 #define want_tx want[NR_TX]
2620 #define want_rx want[NR_RX]
2621 	struct mbq q;	/* packets from RX hw queues to host stack */
2622 
2623 	/*
2624 	 * In order to avoid nested locks, we need to "double check"
2625 	 * txsync and rxsync if we decide to do a selrecord().
2626 	 * retry_tx (and retry_rx, later) prevent looping forever.
2627 	 */
2628 	int retry_tx = 1, retry_rx = 1;
2629 
2630 	/* Transparent mode: send_down is 1 if we have found some
2631 	 * packets to forward (host RX ring --> NIC) during the rx
2632 	 * scan and we have not sent them down to the NIC yet.
2633 	 * Transparent mode requires binding all rings to a single
2634 	 * file descriptor.
2635 	 */
2636 	int send_down = 0;
2637 	int sync_flags = priv->np_sync_flags;
2638 
2639 	mbq_init(&q);
2640 
2641 	if (priv->np_nifp == NULL) {
2642 		D("No if registered");
2643 		return POLLERR;
2644 	}
2645 	mb(); /* make sure following reads are not from cache */
2646 
2647 	na = priv->np_na;
2648 
2649 	if (!nm_netmap_on(na))
2650 		return POLLERR;
2651 
2652 	if (netmap_verbose & 0x8000)
2653 		D("device %s events 0x%x", na->name, events);
2654 	want_tx = events & (POLLOUT | POLLWRNORM);
2655 	want_rx = events & (POLLIN | POLLRDNORM);
2656 
2657 	/*
2658 	 * check_all_{tx|rx} are set if the card has more than one queue AND
2659 	 * the file descriptor is bound to all of them. If so, we sleep on
2660 	 * the "global" selinfo, otherwise we sleep on individual selinfo
2661 	 * (FreeBSD only allows two selinfo's per file descriptor).
2662 	 * The interrupt routine in the driver wake one or the other
2663 	 * (or both) depending on which clients are active.
2664 	 *
2665 	 * rxsync() is only called if we run out of buffers on a POLLIN.
2666 	 * txsync() is called if we run out of buffers on POLLOUT, or
2667 	 * there are pending packets to send. The latter can be disabled
2668 	 * passing NETMAP_NO_TX_POLL in the NIOCREG call.
2669 	 */
2670 	check_all_tx = nm_si_user(priv, NR_TX);
2671 	check_all_rx = nm_si_user(priv, NR_RX);
2672 
2673 #ifdef __FreeBSD__
2674 	/*
2675 	 * We start with a lock free round which is cheap if we have
2676 	 * slots available. If this fails, then lock and call the sync
2677 	 * routines. We can't do this on Linux, as the contract says
2678 	 * that we must call nm_os_selrecord() unconditionally.
2679 	 */
2680 	if (want_tx) {
2681 		enum txrx t = NR_TX;
2682 		for (i = priv->np_qfirst[t]; want[t] && i < priv->np_qlast[t]; i++) {
2683 			kring = &NMR(na, t)[i];
2684 			/* XXX compare ring->cur and kring->tail */
2685 			if (!nm_ring_empty(kring->ring)) {
2686 				revents |= want[t];
2687 				want[t] = 0;	/* also breaks the loop */
2688 			}
2689 		}
2690 	}
2691 	if (want_rx) {
2692 		enum txrx t = NR_RX;
2693 		want_rx = 0; /* look for a reason to run the handlers */
2694 		for (i = priv->np_qfirst[t]; i < priv->np_qlast[t]; i++) {
2695 			kring = &NMR(na, t)[i];
2696 			if (kring->ring->cur == kring->ring->tail /* try fetch new buffers */
2697 			    || kring->rhead != kring->ring->head /* release buffers */) {
2698 				want_rx = 1;
2699 			}
2700 		}
2701 		if (!want_rx)
2702 			revents |= events & (POLLIN | POLLRDNORM); /* we have data */
2703 	}
2704 #endif
2705 
2706 #ifdef linux
2707 	/* The selrecord must be unconditional on linux. */
2708 	nm_os_selrecord(sr, check_all_tx ?
2709 	    &na->si[NR_TX] : &na->tx_rings[priv->np_qfirst[NR_TX]].si);
2710 	nm_os_selrecord(sr, check_all_rx ?
2711 		&na->si[NR_RX] : &na->rx_rings[priv->np_qfirst[NR_RX]].si);
2712 #endif /* linux */
2713 
2714 	/*
2715 	 * If we want to push packets out (priv->np_txpoll) or
2716 	 * want_tx is still set, we must issue txsync calls
2717 	 * (on all rings, to avoid stalling the tx rings).
2718 	 * Fortunately, normal tx mode has np_txpoll set.
2719 	 */
2720 	if (priv->np_txpoll || want_tx) {
2721 		/*
2722 		 * The first round checks if anyone is ready, if not
2723 		 * do a selrecord and another round to handle races.
2724 		 * want_tx goes to 0 if any space is found, and is
2725 		 * used to skip rings with no pending transmissions.
2726 		 */
2727 flush_tx:
2728 		for (i = priv->np_qfirst[NR_TX]; i < priv->np_qlast[NR_TX]; i++) {
2729 			int found = 0;
2730 
2731 			kring = &na->tx_rings[i];
2732 			ring = kring->ring;
2733 
2734 			/*
2735 			 * Don't try to txsync this TX ring if we already found some
2736 			 * space in some of the TX rings (want_tx == 0) and there are no
2737 			 * TX slots in this ring that need to be flushed to the NIC
2738 			 * (cur == hwcur).
2739 			 */
2740 			if (!send_down && !want_tx && ring->cur == kring->nr_hwcur)
2741 				continue;
2742 
2743 			if (nm_kr_tryget(kring, 1, &revents))
2744 				continue;
2745 
2746 			if (nm_txsync_prologue(kring, ring) >= kring->nkr_num_slots) {
2747 				netmap_ring_reinit(kring);
2748 				revents |= POLLERR;
2749 			} else {
2750 				if (kring->nm_sync(kring, sync_flags))
2751 					revents |= POLLERR;
2752 				else
2753 					nm_sync_finalize(kring);
2754 			}
2755 
2756 			/*
2757 			 * If we found new slots, notify potential
2758 			 * listeners on the same ring.
2759 			 * Since we just did a txsync, look at the copies
2760 			 * of cur,tail in the kring.
2761 			 */
2762 			found = kring->rcur != kring->rtail;
2763 			nm_kr_put(kring);
2764 			if (found) { /* notify other listeners */
2765 				revents |= want_tx;
2766 				want_tx = 0;
2767 #ifndef linux
2768 				kring->nm_notify(kring, 0);
2769 #endif /* linux */
2770 			}
2771 		}
2772 		/* if there were any packets to forward, we must have handled them by now */
2773 		send_down = 0;
2774 		if (want_tx && retry_tx && sr) {
2775 #ifndef linux
2776 			nm_os_selrecord(sr, check_all_tx ?
2777 			    &na->si[NR_TX] : &na->tx_rings[priv->np_qfirst[NR_TX]].si);
2778 #endif /* !linux */
2779 			retry_tx = 0;
2780 			goto flush_tx;
2781 		}
2782 	}
2783 
2784 	/*
2785 	 * If want_rx is still set scan receive rings.
2786 	 * Do it on all rings because otherwise we starve.
2787 	 */
2788 	if (want_rx) {
2789 		/* two rounds here for race avoidance */
2790 do_retry_rx:
2791 		for (i = priv->np_qfirst[NR_RX]; i < priv->np_qlast[NR_RX]; i++) {
2792 			int found = 0;
2793 
2794 			kring = &na->rx_rings[i];
2795 			ring = kring->ring;
2796 
2797 			if (unlikely(nm_kr_tryget(kring, 1, &revents)))
2798 				continue;
2799 
2800 			if (nm_rxsync_prologue(kring, ring) >= kring->nkr_num_slots) {
2801 				netmap_ring_reinit(kring);
2802 				revents |= POLLERR;
2803 			}
2804 			/* now we can use kring->rcur, rtail */
2805 
2806 			/*
2807 			 * transparent mode support: collect packets from
2808 			 * hw rxring(s) that have been released by the user
2809 			 */
2810 			if (nm_may_forward_up(kring)) {
2811 				netmap_grab_packets(kring, &q, netmap_fwd);
2812 			}
2813 
2814 			/* Clear the NR_FORWARD flag anyway, it may be set by
2815 			 * the nm_sync() below only for the host RX ring (see
2816 			 * netmap_rxsync_from_host()). */
2817 			kring->nr_kflags &= ~NR_FORWARD;
2818 			if (kring->nm_sync(kring, sync_flags))
2819 				revents |= POLLERR;
2820 			else
2821 				nm_sync_finalize(kring);
2822 			send_down |= (kring->nr_kflags & NR_FORWARD);
2823 			ring_timestamp_set(ring);
2824 			found = kring->rcur != kring->rtail;
2825 			nm_kr_put(kring);
2826 			if (found) {
2827 				revents |= want_rx;
2828 				retry_rx = 0;
2829 #ifndef linux
2830 				kring->nm_notify(kring, 0);
2831 #endif /* linux */
2832 			}
2833 		}
2834 
2835 #ifndef linux
2836 		if (retry_rx && sr) {
2837 			nm_os_selrecord(sr, check_all_rx ?
2838 			    &na->si[NR_RX] : &na->rx_rings[priv->np_qfirst[NR_RX]].si);
2839 		}
2840 #endif /* !linux */
2841 		if (send_down || retry_rx) {
2842 			retry_rx = 0;
2843 			if (send_down)
2844 				goto flush_tx; /* and retry_rx */
2845 			else
2846 				goto do_retry_rx;
2847 		}
2848 	}
2849 
2850 	/*
2851 	 * Transparent mode: released bufs (i.e. between kring->nr_hwcur and
2852 	 * ring->head) marked with NS_FORWARD on hw rx rings are passed up
2853 	 * to the host stack.
2854 	 */
2855 
2856 	if (mbq_peek(&q)) {
2857 		netmap_send_up(na->ifp, &q);
2858 	}
2859 
2860 	return (revents);
2861 #undef want_tx
2862 #undef want_rx
2863 }
2864 
2865 int
2866 nma_intr_enable(struct netmap_adapter *na, int onoff)
2867 {
2868 	bool changed = false;
2869 	enum txrx t;
2870 	int i;
2871 
2872 	for_rx_tx(t) {
2873 		for (i = 0; i < nma_get_nrings(na, t); i++) {
2874 			struct netmap_kring *kring = &NMR(na, t)[i];
2875 			int on = !(kring->nr_kflags & NKR_NOINTR);
2876 
2877 			if (!!onoff != !!on) {
2878 				changed = true;
2879 			}
2880 			if (onoff) {
2881 				kring->nr_kflags &= ~NKR_NOINTR;
2882 			} else {
2883 				kring->nr_kflags |= NKR_NOINTR;
2884 			}
2885 		}
2886 	}
2887 
2888 	if (!changed) {
2889 		return 0; /* nothing to do */
2890 	}
2891 
2892 	if (!na->nm_intr) {
2893 		D("Cannot %s interrupts for %s", onoff ? "enable" : "disable",
2894 		  na->name);
2895 		return -1;
2896 	}
2897 
2898 	na->nm_intr(na, onoff);
2899 
2900 	return 0;
2901 }
2902 
2903 
2904 /*-------------------- driver support routines -------------------*/
2905 
2906 /* default notify callback */
2907 static int
2908 netmap_notify(struct netmap_kring *kring, int flags)
2909 {
2910 	struct netmap_adapter *na = kring->na;
2911 	enum txrx t = kring->tx;
2912 
2913 	nm_os_selwakeup(&kring->si);
2914 	/* optimization: avoid a wake up on the global
2915 	 * queue if nobody has registered for more
2916 	 * than one ring
2917 	 */
2918 	if (na->si_users[t] > 0)
2919 		nm_os_selwakeup(&na->si[t]);
2920 
2921 	return NM_IRQ_COMPLETED;
2922 }
2923 
2924 /* called by all routines that create netmap_adapters.
2925  * provide some defaults and get a reference to the
2926  * memory allocator
2927  */
2928 int
2929 netmap_attach_common(struct netmap_adapter *na)
2930 {
2931 	if (na->num_tx_rings == 0 || na->num_rx_rings == 0) {
2932 		D("%s: invalid rings tx %d rx %d",
2933 			na->name, na->num_tx_rings, na->num_rx_rings);
2934 		return EINVAL;
2935 	}
2936 
2937 #ifdef __FreeBSD__
2938 	if (na->na_flags & NAF_HOST_RINGS && na->ifp) {
2939 		na->if_input = na->ifp->if_input; /* for netmap_send_up */
2940 	}
2941 	na->pdev = na; /* make sure netmap_mem_map() is called */
2942 #endif /* __FreeBSD__ */
2943 	if (na->nm_krings_create == NULL) {
2944 		/* we assume that we have been called by a driver,
2945 		 * since other port types all provide their own
2946 		 * nm_krings_create
2947 		 */
2948 		na->nm_krings_create = netmap_hw_krings_create;
2949 		na->nm_krings_delete = netmap_hw_krings_delete;
2950 	}
2951 	if (na->nm_notify == NULL)
2952 		na->nm_notify = netmap_notify;
2953 	na->active_fds = 0;
2954 
2955 	if (na->nm_mem == NULL) {
2956 		/* use the global allocator */
2957 		na->nm_mem = netmap_mem_get(&nm_mem);
2958 	}
2959 #ifdef WITH_VALE
2960 	if (na->nm_bdg_attach == NULL)
2961 		/* no special nm_bdg_attach callback. On VALE
2962 		 * attach, we need to interpose a bwrap
2963 		 */
2964 		na->nm_bdg_attach = netmap_bwrap_attach;
2965 #endif
2966 
2967 	return 0;
2968 }
2969 
2970 /* Wrapper for the register callback provided by netmap-enabled
2971  * hardware drivers.
2972  * nm_iszombie(na) means that the driver module has been
2973  * unloaded, so we cannot call into it.
2974  * nm_os_ifnet_lock() must guarantee mutual exclusion with
2975  * module unloading.
2976  */
2977 static int
2978 netmap_hw_reg(struct netmap_adapter *na, int onoff)
2979 {
2980 	struct netmap_hw_adapter *hwna =
2981 		(struct netmap_hw_adapter*)na;
2982 	int error = 0;
2983 
2984 	nm_os_ifnet_lock();
2985 
2986 	if (nm_iszombie(na)) {
2987 		if (onoff) {
2988 			error = ENXIO;
2989 		} else if (na != NULL) {
2990 			na->na_flags &= ~NAF_NETMAP_ON;
2991 		}
2992 		goto out;
2993 	}
2994 
2995 	error = hwna->nm_hw_register(na, onoff);
2996 
2997 out:
2998 	nm_os_ifnet_unlock();
2999 
3000 	return error;
3001 }
3002 
3003 static void
3004 netmap_hw_dtor(struct netmap_adapter *na)
3005 {
3006 	if (nm_iszombie(na) || na->ifp == NULL)
3007 		return;
3008 
3009 	WNA(na->ifp) = NULL;
3010 }
3011 
3012 
3013 /*
3014  * Allocate a netmap_adapter object, and initialize it from the
3015  * 'arg' passed by the driver on attach.
3016  * We allocate a block of memory of 'size' bytes, which has room
3017  * for struct netmap_adapter plus additional room private to
3018  * the caller.
3019  * Return 0 on success, ENOMEM otherwise.
3020  */
3021 int
3022 netmap_attach_ext(struct netmap_adapter *arg, size_t size, int override_reg)
3023 {
3024 	struct netmap_hw_adapter *hwna = NULL;
3025 	struct ifnet *ifp = NULL;
3026 
3027 	if (size < sizeof(struct netmap_hw_adapter)) {
3028 		D("Invalid netmap adapter size %d", (int)size);
3029 		return EINVAL;
3030 	}
3031 
3032 	if (arg == NULL || arg->ifp == NULL)
3033 		goto fail;
3034 
3035 	ifp = arg->ifp;
3036 	if (NA(ifp) && !NM_NA_VALID(ifp)) {
3037 		/* If NA(ifp) is not null but there is no valid netmap
3038 		 * adapter it means that someone else is using the same
3039 		 * pointer (e.g. ax25_ptr on linux). This happens for
3040 		 * instance when also PF_RING is in use. */
3041 		D("Error: netmap adapter hook is busy");
3042 		return EBUSY;
3043 	}
3044 
3045 	hwna = nm_os_malloc(size);
3046 	if (hwna == NULL)
3047 		goto fail;
3048 	hwna->up = *arg;
3049 	hwna->up.na_flags |= NAF_HOST_RINGS | NAF_NATIVE;
3050 	strncpy(hwna->up.name, ifp->if_xname, sizeof(hwna->up.name));
3051 	if (override_reg) {
3052 		hwna->nm_hw_register = hwna->up.nm_register;
3053 		hwna->up.nm_register = netmap_hw_reg;
3054 	}
3055 	if (netmap_attach_common(&hwna->up)) {
3056 		nm_os_free(hwna);
3057 		goto fail;
3058 	}
3059 	netmap_adapter_get(&hwna->up);
3060 
3061 	NM_ATTACH_NA(ifp, &hwna->up);
3062 
3063 #ifdef linux
3064 	if (ifp->netdev_ops) {
3065 		/* prepare a clone of the netdev ops */
3066 #ifndef NETMAP_LINUX_HAVE_NETDEV_OPS
3067 		hwna->nm_ndo.ndo_start_xmit = ifp->netdev_ops;
3068 #else
3069 		hwna->nm_ndo = *ifp->netdev_ops;
3070 #endif /* NETMAP_LINUX_HAVE_NETDEV_OPS */
3071 	}
3072 	hwna->nm_ndo.ndo_start_xmit = linux_netmap_start_xmit;
3073 	hwna->nm_ndo.ndo_change_mtu = linux_netmap_change_mtu;
3074 	if (ifp->ethtool_ops) {
3075 		hwna->nm_eto = *ifp->ethtool_ops;
3076 	}
3077 	hwna->nm_eto.set_ringparam = linux_netmap_set_ringparam;
3078 #ifdef NETMAP_LINUX_HAVE_SET_CHANNELS
3079 	hwna->nm_eto.set_channels = linux_netmap_set_channels;
3080 #endif /* NETMAP_LINUX_HAVE_SET_CHANNELS */
3081 	if (arg->nm_config == NULL) {
3082 		hwna->up.nm_config = netmap_linux_config;
3083 	}
3084 #endif /* linux */
3085 	if (arg->nm_dtor == NULL) {
3086 		hwna->up.nm_dtor = netmap_hw_dtor;
3087 	}
3088 
3089 	if_printf(ifp, "netmap queues/slots: TX %d/%d, RX %d/%d\n",
3090 	    hwna->up.num_tx_rings, hwna->up.num_tx_desc,
3091 	    hwna->up.num_rx_rings, hwna->up.num_rx_desc);
3092 	return 0;
3093 
3094 fail:
3095 	D("fail, arg %p ifp %p na %p", arg, ifp, hwna);
3096 	return (hwna ? EINVAL : ENOMEM);
3097 }
3098 
3099 
3100 int
3101 netmap_attach(struct netmap_adapter *arg)
3102 {
3103 	return netmap_attach_ext(arg, sizeof(struct netmap_hw_adapter),
3104 			1 /* override nm_reg */);
3105 }
3106 
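/*
 * Illustrative driver-side sketch (fictitious "foo" driver, callback
 * bodies not shown): how a NIC driver typically fills the template
 * adapter and calls netmap_attach() at attach time:
 *
 *	static void
 *	foo_netmap_attach(struct foo_softc *sc)
 *	{
 *		struct netmap_adapter na;
 *
 *		bzero(&na, sizeof(na));
 *		na.ifp = sc->ifp;
 *		na.num_tx_desc = sc->num_tx_desc;
 *		na.num_rx_desc = sc->num_rx_desc;
 *		na.num_tx_rings = na.num_rx_rings = sc->num_queues;
 *		na.nm_register = foo_netmap_reg;
 *		na.nm_txsync = foo_netmap_txsync;
 *		na.nm_rxsync = foo_netmap_rxsync;
 *		netmap_attach(&na);
 *	}
 */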
3107 
3108 void
3109 NM_DBG(netmap_adapter_get)(struct netmap_adapter *na)
3110 {
3111 	if (!na) {
3112 		return;
3113 	}
3114 
3115 	refcount_acquire(&na->na_refcount);
3116 }
3117 
3118 
3119 /* returns 1 iff the netmap_adapter is destroyed */
3120 int
3121 NM_DBG(netmap_adapter_put)(struct netmap_adapter *na)
3122 {
3123 	if (!na)
3124 		return 1;
3125 
3126 	if (!refcount_release(&na->na_refcount))
3127 		return 0;
3128 
3129 	if (na->nm_dtor)
3130 		na->nm_dtor(na);
3131 
3132 	if (na->tx_rings) { /* XXX should not happen */
3133 		D("freeing leftover tx_rings");
3134 		na->nm_krings_delete(na);
3135 	}
3136 	netmap_pipe_dealloc(na);
3137 	if (na->nm_mem)
3138 		netmap_mem_put(na->nm_mem);
3139 	bzero(na, sizeof(*na));
3140 	nm_os_free(na);
3141 
3142 	return 1;
3143 }
3144 
3145 /* nm_krings_create callback for all hardware native adapters */
3146 int
3147 netmap_hw_krings_create(struct netmap_adapter *na)
3148 {
3149 	int ret = netmap_krings_create(na, 0);
3150 	if (ret == 0) {
3151 		/* initialize the mbq for the sw rx ring */
3152 		mbq_safe_init(&na->rx_rings[na->num_rx_rings].rx_queue);
3153 		ND("initialized sw rx queue %d", na->num_rx_rings);
3154 	}
3155 	return ret;
3156 }
3157 
3158 
3159 
3160 /*
3161  * Called on module unload by the netmap-enabled drivers
3162  */
3163 void
3164 netmap_detach(struct ifnet *ifp)
3165 {
3166 	struct netmap_adapter *na = NA(ifp);
3167 
3168 	if (!na)
3169 		return;
3170 
3171 	NMG_LOCK();
3172 	netmap_set_all_rings(na, NM_KR_LOCKED);
3173 	/*
3174 	 * if the netmap adapter is not native, somebody
3175 	 * changed it, so we can not release it here.
3176 	 * The NAF_ZOMBIE flag will notify the new owner that
3177 	 * the driver is gone.
3178 	 */
3179 	if (!(na->na_flags & NAF_NATIVE) || !netmap_adapter_put(na)) {
3180 		na->na_flags |= NAF_ZOMBIE;
3181 	}
3182 	/* give active users a chance to notice that NAF_ZOMBIE has been
3183 	 * turned on, so that they can stop and return an error to userspace.
3184 	 * Note that this becomes a NOP if there are no active users and,
3185 	 * therefore, the put() above has deleted the na, since now NA(ifp) is
3186 	 * NULL.
3187 	 */
3188 	netmap_enable_all_rings(ifp);
3189 	NMG_UNLOCK();
3190 }
3191 
3192 
3193 /*
3194  * Intercept packets from the network stack and pass them
3195  * to netmap as incoming packets on the 'software' ring.
3196  *
3197  * We only store packets in a bounded mbq and then copy them
3198  * in the relevant rxsync routine.
3199  *
3200  * We rely on the OS to make sure that the ifp and na do not go
3201  * away (typically the caller checks for IFF_DRV_RUNNING or the like).
3202  * In nm_register() or whenever there is a reinitialization,
3203  * we make sure to make the mode change visible here.
3204  */
3205 int
3206 netmap_transmit(struct ifnet *ifp, struct mbuf *m)
3207 {
3208 	struct netmap_adapter *na = NA(ifp);
3209 	struct netmap_kring *kring, *tx_kring;
3210 	u_int len = MBUF_LEN(m);
3211 	u_int error = ENOBUFS;
3212 	unsigned int txr;
3213 	struct mbq *q;
3214 	int busy;
3215 
3216 	kring = &na->rx_rings[na->num_rx_rings];
3217 	// XXX [Linux] we do not need this lock
3218 	// if we follow the down/configure/up protocol -gl
3219 	// mtx_lock(&na->core_lock);
3220 
3221 	if (!nm_netmap_on(na)) {
3222 		D("%s not in netmap mode anymore", na->name);
3223 		error = ENXIO;
3224 		goto done;
3225 	}
3226 
3227 	txr = MBUF_TXQ(m);
3228 	if (txr >= na->num_tx_rings) {
3229 		txr %= na->num_tx_rings;
3230 	}
3231 	tx_kring = &NMR(na, NR_TX)[txr];
3232 
3233 	if (tx_kring->nr_mode == NKR_NETMAP_OFF) {
3234 		return MBUF_TRANSMIT(na, ifp, m);
3235 	}
3236 
3237 	q = &kring->rx_queue;
3238 
3239 	// XXX reconsider long packets if we handle fragments
3240 	if (len > NETMAP_BUF_SIZE(na)) { /* too long for us */
3241 		D("%s from_host, drop packet size %d > %d", na->name,
3242 			len, NETMAP_BUF_SIZE(na));
3243 		goto done;
3244 	}
3245 
3246 	if (nm_os_mbuf_has_offld(m)) {
3247 		RD(1, "%s drop mbuf that needs offloadings", na->name);
3248 		goto done;
3249 	}
3250 
3251 	/* protect against netmap_rxsync_from_host(), netmap_sw_to_nic()
3252 	 * and maybe other instances of netmap_transmit (the latter
3253 	 * not possible on Linux).
3254 	 * We enqueue the mbuf only if we are sure there is going to be
3255 	 * enough room in the host RX ring, otherwise we drop it.
3256 	 */
3257 	mbq_lock(q);
3258 
3259 	busy = kring->nr_hwtail - kring->nr_hwcur;
3260 	if (busy < 0)
3261 		busy += kring->nkr_num_slots;
3262 	if (busy + mbq_len(q) >= kring->nkr_num_slots - 1) {
3263 		RD(2, "%s full hwcur %d hwtail %d qlen %d", na->name,
3264 			kring->nr_hwcur, kring->nr_hwtail, mbq_len(q));
3265 	} else {
3266 		mbq_enqueue(q, m);
3267 		ND(2, "%s %d bufs in queue", na->name, mbq_len(q));
3268 		/* notify outside the lock */
3269 		m = NULL;
3270 		error = 0;
3271 	}
3272 	mbq_unlock(q);
3273 
3274 done:
3275 	if (m)
3276 		m_freem(m);
3277 	/* unconditionally wake up listeners */
3278 	kring->nm_notify(kring, 0);
3279 	/* this is normally netmap_notify(), but for nics
3280 	 * connected to a bridge it is netmap_bwrap_intr_notify(),
3281 	 * that possibly forwards the frames through the switch
3282 	 */
3283 
3284 	return (error);
3285 }
3286 
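/*
 * Worked example (illustrative only) of the "busy" computation above:
 * with nkr_num_slots = 1024, nr_hwtail = 100 and nr_hwcur = 900,
 * busy = 100 - 900 = -800, corrected to 224 slots already held by the
 * kernel. With 50 mbufs already queued, 224 + 50 = 274 < 1023, so the
 * new mbuf is enqueued; the "- 1" keeps at least one slot free, as
 * usual for circular rings.
 */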
3287 
3288 /*
3289  * netmap_reset() is called by the driver routines when reinitializing
3290  * a ring. The driver is in charge of locking to protect the kring.
3291  * If native netmap mode is not set just return NULL.
3292  * If native netmap mode is set, in particular, we have to set nr_mode to
3293  * NKR_NETMAP_ON.
3294  */
3295 struct netmap_slot *
3296 netmap_reset(struct netmap_adapter *na, enum txrx tx, u_int n,
3297 	u_int new_cur)
3298 {
3299 	struct netmap_kring *kring;
3300 	int new_hwofs, lim;
3301 
3302 	if (!nm_native_on(na)) {
3303 		ND("interface not in native netmap mode");
3304 		return NULL;	/* nothing to reinitialize */
3305 	}
3306 
3307 	/* XXX note- in the new scheme, we are not guaranteed to be
3308 	 * under lock (e.g. when called on a device reset).
3309 	 * In this case, we should set a flag and not trust the
3310 	 * values too much. In practice: TODO
3311 	 * - set a RESET flag somewhere in the kring
3312 	 * - do the processing in a conservative way
3313 	 * - let the *sync() fixup at the end.
3314 	 */
3315 	if (tx == NR_TX) {
3316 		if (n >= na->num_tx_rings)
3317 			return NULL;
3318 
3319 		kring = na->tx_rings + n;
3320 
3321 		if (kring->nr_pending_mode == NKR_NETMAP_OFF) {
3322 			kring->nr_mode = NKR_NETMAP_OFF;
3323 			return NULL;
3324 		}
3325 
3326 		// XXX check whether we should use hwcur or rcur
3327 		new_hwofs = kring->nr_hwcur - new_cur;
3328 	} else {
3329 		if (n >= na->num_rx_rings)
3330 			return NULL;
3331 		kring = na->rx_rings + n;
3332 
3333 		if (kring->nr_pending_mode == NKR_NETMAP_OFF) {
3334 			kring->nr_mode = NKR_NETMAP_OFF;
3335 			return NULL;
3336 		}
3337 
3338 		new_hwofs = kring->nr_hwtail - new_cur;
3339 	}
3340 	lim = kring->nkr_num_slots - 1;
3341 	if (new_hwofs > lim)
3342 		new_hwofs -= lim + 1;
3343 
3344 	/* Always set the new offset value and realign the ring. */
3345 	if (netmap_verbose)
3346 	    D("%s %s%d hwofs %d -> %d, hwtail %d -> %d",
3347 		na->name,
3348 		tx == NR_TX ? "TX" : "RX", n,
3349 		kring->nkr_hwofs, new_hwofs,
3350 		kring->nr_hwtail,
3351 		tx == NR_TX ? lim : kring->nr_hwtail);
3352 	kring->nkr_hwofs = new_hwofs;
3353 	if (tx == NR_TX) {
3354 		kring->nr_hwtail = kring->nr_hwcur + lim;
3355 		if (kring->nr_hwtail > lim)
3356 			kring->nr_hwtail -= lim + 1;
3357 	}
3358 
3359 	/*
3360 	 * Wakeup on the individual and global selwait
3361 	 * We do the wakeup here, but the ring is not yet reconfigured.
3362 	 * However, we are under lock so there are no races.
3363 	 */
3364 	kring->nr_mode = NKR_NETMAP_ON;
3365 	kring->nm_notify(kring, 0);
3366 	return kring->ring->slot;
3367 }
3368 
3369 
3370 /*
3371  * Dispatch rx/tx interrupts to the netmap rings.
3372  *
3373  * "work_done" is non-null on the RX path, NULL for the TX path.
3374  * We rely on the OS to make sure that there is only one active
3375  * instance per queue, and that there is appropriate locking.
3376  *
3377  * The 'notify' routine depends on what the ring is attached to.
3378  * - for a netmap file descriptor, do a selwakeup on the individual
3379  *   waitqueue, plus one on the global one if needed
3380  *   (see netmap_notify)
3381  * - for a nic connected to a switch, call the proper forwarding routine
3382  *   (see netmap_bwrap_intr_notify)
3383  */
3384 int
3385 netmap_common_irq(struct netmap_adapter *na, u_int q, u_int *work_done)
3386 {
3387 	struct netmap_kring *kring;
3388 	enum txrx t = (work_done ? NR_RX : NR_TX);
3389 
3390 	q &= NETMAP_RING_MASK;
3391 
3392 	if (netmap_verbose) {
3393 		RD(5, "received %s queue %d", work_done ? "RX" : "TX", q);
3394 	}
3395 
3396 	if (q >= nma_get_nrings(na, t))
3397 		return NM_IRQ_PASS; // not a physical queue
3398 
3399 	kring = NMR(na, t) + q;
3400 
3401 	if (kring->nr_mode == NKR_NETMAP_OFF) {
3402 		return NM_IRQ_PASS;
3403 	}
3404 
3405 	if (t == NR_RX) {
3406 		kring->nr_kflags |= NKR_PENDINTR;	// XXX atomic ?
3407 		*work_done = 1; /* do not fire napi again */
3408 	}
3409 
3410 	return kring->nm_notify(kring, 0);
3411 }
3412 
3413 
3414 /*
3415  * Default functions to handle rx/tx interrupts from a physical device.
3416  * "work_done" is non-null on the RX path, NULL for the TX path.
3417  *
3418  * If the card is not in netmap mode, simply return NM_IRQ_PASS,
3419  * so that the caller proceeds with regular processing.
3420  * Otherwise call netmap_common_irq().
3421  *
3422  * If the card is connected to a netmap file descriptor,
3423  * do a selwakeup on the individual queue, plus one on the global one
3424  * if needed (multiqueue card _and_ there are multiqueue listeners),
3425  * and return NM_IRQ_COMPLETED.
3426  *
3427  * Finally, if called on rx from an interface connected to a switch,
3428  * call the proper forwarding routine.
3429  */
3430 int
3431 netmap_rx_irq(struct ifnet *ifp, u_int q, u_int *work_done)
3432 {
3433 	struct netmap_adapter *na = NA(ifp);
3434 
3435 	/*
3436 	 * we still use the regular driver path even though
3437 	 * nm_netmap_on() below returns true. It is unclear whether
3438 	 * we should use nm_native_on() here instead.
3439 	 * nm_native_on() here.
3440 	 */
3441 	if (!nm_netmap_on(na))
3442 		return NM_IRQ_PASS;
3443 
3444 	if (na->na_flags & NAF_SKIP_INTR) {
3445 		ND("use regular interrupt");
3446 		return NM_IRQ_PASS;
3447 	}
3448 
3449 	return netmap_common_irq(na, q, work_done);
3450 }
3451 
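/*
 * Usage sketch (hypothetical, for illustration only): a driver's RX
 * interrupt handler typically tries netmap first and falls back to the
 * normal mbuf path only when NM_IRQ_PASS is returned. The foo_* names
 * are made up.
 *
 *	static void
 *	foo_rxintr(struct foo_rxring *rxr)
 *	{
 *		u_int work_done = 0;
 *
 *		if (netmap_rx_irq(rxr->ifp, rxr->me, &work_done) != NM_IRQ_PASS)
 *			return;		// netmap consumed the interrupt
 *		// ... regular mbuf-based receive processing ...
 *	}
 *
 * The TX path is analogous, passing a NULL work_done pointer
 * (drivers normally use the netmap_tx_irq() wrapper for that).
 */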
3452 
3453 /*
3454  * Module loader and unloader
3455  *
3456  * netmap_init() creates the /dev/netmap device and initializes
3457  * all global variables. Returns 0 on success, an errno on failure
3458  * (in which case the module is not loaded).
3459  *
3460  * netmap_fini() destroys everything.
3461  */
3462 
3463 static struct cdev *netmap_dev; /* /dev/netmap character device. */
3464 extern struct cdevsw netmap_cdevsw;
3465 
3466 
3467 void
3468 netmap_fini(void)
3469 {
3470 	if (netmap_dev)
3471 		destroy_dev(netmap_dev);
3472 	/* we assume that there are no longer netmap users */
3473 	nm_os_ifnet_fini();
3474 	netmap_uninit_bridges();
3475 	netmap_mem_fini();
3476 	NMG_LOCK_DESTROY();
3477 	nm_prinf("netmap: unloaded module.\n");
3478 }
3479 
3480 
3481 int
3482 netmap_init(void)
3483 {
3484 	int error;
3485 
3486 	NMG_LOCK_INIT();
3487 
3488 	error = netmap_mem_init();
3489 	if (error != 0)
3490 		goto fail;
3491 	/*
3492 	 * MAKEDEV_ETERNAL_KLD avoids an expensive check on syscalls
3493 	 * when the module is compiled in.
3494 	 * XXX could use make_dev_credv() to get error number
3495 	 */
3496 	netmap_dev = make_dev_credf(MAKEDEV_ETERNAL_KLD,
3497 		&netmap_cdevsw, 0, NULL, UID_ROOT, GID_WHEEL, 0600,
3498 			      "netmap");
3499 	if (!netmap_dev)
3500 		goto fail;
3501 
3502 	error = netmap_init_bridges();
3503 	if (error)
3504 		goto fail;
3505 
3506 #ifdef __FreeBSD__
3507 	nm_os_vi_init_index();
3508 #endif
3509 
3510 	error = nm_os_ifnet_init();
3511 	if (error)
3512 		goto fail;
3513 
3514 	nm_prinf("netmap: loaded module\n");
3515 	return (0);
3516 fail:
3517 	netmap_fini();
3518 	return (error ? error : EINVAL); /* fall back to EINVAL if no errno was recorded */
3519 }
3520
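/*
 * Module glue sketch (hypothetical, for illustration only): on FreeBSD
 * netmap_init() and netmap_fini() are invoked from a module event
 * handler registered by the OS-specific code, roughly as follows:
 *
 *	static int
 *	netmap_loader(struct module *m, int event, void *arg)
 *	{
 *		int error = 0;
 *
 *		switch (event) {
 *		case MOD_LOAD:
 *			error = netmap_init();
 *			break;
 *		case MOD_UNLOAD:
 *			netmap_fini();
 *			break;
 *		default:
 *			error = EOPNOTSUPP;
 *			break;
 *		}
 *		return (error);
 *	}
 */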