xref: /freebsd/sys/dev/netmap/netmap.c (revision 6966ac055c3b7a39266fb982493330df7a097997)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (C) 2011-2014 Matteo Landi
5  * Copyright (C) 2011-2016 Luigi Rizzo
6  * Copyright (C) 2011-2016 Giuseppe Lettieri
7  * Copyright (C) 2011-2016 Vincenzo Maffione
8  * All rights reserved.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  *   1. Redistributions of source code must retain the above copyright
14  *      notice, this list of conditions and the following disclaimer.
15  *   2. Redistributions in binary form must reproduce the above copyright
16  *      notice, this list of conditions and the following disclaimer in the
17  *      documentation and/or other materials provided with the distribution.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
20  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
23  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
25  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
26  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
29  * SUCH DAMAGE.
30  */
31 
32 
33 /*
34  * $FreeBSD$
35  *
36  * This module supports memory mapped access to network devices,
37  * see netmap(4).
38  *
39  * The module uses a large, memory pool allocated by the kernel
40  * and accessible as mmapped memory by multiple userspace threads/processes.
41  * The memory pool contains packet buffers and "netmap rings",
42  * i.e. user-accessible copies of the interface's queues.
43  *
44  * Access to the network card works like this:
45  * 1. a process/thread issues one or more open() on /dev/netmap, to create
46  *    select()able file descriptor on which events are reported.
47  * 2. on each descriptor, the process issues an ioctl() to identify
48  *    the interface that should report events to the file descriptor.
49  * 3. on each descriptor, the process issues an mmap() request to
50  *    map the shared memory region within the process' address space.
51  *    The list of interesting queues is indicated by a location in
52  *    the shared memory region.
53  * 4. using the functions in the netmap(4) userspace API, a process
54  *    can look up the occupation state of a queue, access memory buffers,
55  *    and retrieve received packets or enqueue packets to transmit.
56  * 5. using some ioctl()s the process can synchronize the userspace view
57  *    of the queue with the actual status in the kernel. This includes both
58  *    receiving the notification of new packets, and transmitting new
59  *    packets on the output interface.
60  * 6. select() or poll() can be used to wait for events on individual
61  *    transmit or receive queues (or all queues for a given interface).
62  *
63 
64 		SYNCHRONIZATION (USER)
65 
66 The netmap rings and data structures may be shared among multiple
67 user threads or even independent processes.
68 Any synchronization among those threads/processes is delegated
69 to the threads themselves. Only one thread at a time can be in
70 a system call on the same netmap ring. The OS does not enforce
71 this and only guarantees against system crashes in case of
72 invalid usage.
73 
74 		LOCKING (INTERNAL)
75 
76 Within the kernel, access to the netmap rings is protected as follows:
77 
78 - a spinlock on each ring, to handle producer/consumer races on
79   RX rings attached to the host stack (against multiple host
80   threads writing from the host stack to the same ring),
81   and on 'destination' rings attached to a VALE switch
82   (i.e. RX rings in VALE ports, and TX rings in NIC/host ports)
83   protecting multiple active senders for the same destination)
84 
85 - an atomic variable to guarantee that there is at most one
86   instance of *_*xsync() on the ring at any time.
87   For rings connected to user file
88   descriptors, an atomic_test_and_set() protects this, and the
89   lock on the ring is not actually used.
90   For NIC RX rings connected to a VALE switch, an atomic_test_and_set()
91   is also used to prevent multiple executions (the driver might indeed
92   already guarantee this).
93   For NIC TX rings connected to a VALE switch, the lock arbitrates
94   access to the queue (both when allocating buffers and when pushing
95   them out).
96 
97 - *xsync() should be protected against initializations of the card.
98   On FreeBSD most devices have the reset routine protected by
99   a RING lock (ixgbe, igb, em) or core lock (re). lem is missing
100   the RING protection on rx_reset(), this should be added.
101 
102   On linux there is an external lock on the tx path, which probably
103   also arbitrates access to the reset routine. XXX to be revised
104 
105 - a per-interface core_lock protecting access from the host stack
106   while interfaces may be detached from netmap mode.
107   XXX there should be no need for this lock if we detach the interfaces
108   only while they are down.
109 
110 
111 --- VALE SWITCH ---
112 
113 NMG_LOCK() serializes all modifications to switches and ports.
114 A switch cannot be deleted until all ports are gone.
115 
116 For each switch, an SX lock (RWlock on linux) protects
117 deletion of ports. When configuring or deleting a new port, the
118 lock is acquired in exclusive mode (after holding NMG_LOCK).
119 When forwarding, the lock is acquired in shared mode (without NMG_LOCK).
120 The lock is held throughout the entire forwarding cycle,
121 during which the thread may incur in a page fault.
122 Hence it is important that sleepable shared locks are used.
123 
124 On the rx ring, the per-port lock is grabbed initially to reserve
125 a number of slot in the ring, then the lock is released,
126 packets are copied from source to destination, and then
127 the lock is acquired again and the receive ring is updated.
128 (A similar thing is done on the tx ring for NIC and host stack
129 ports attached to the switch)
130 
131  */
132 
133 
134 /* --- internals ----
135  *
136  * Roadmap to the code that implements the above.
137  *
138  * > 1. a process/thread issues one or more open() on /dev/netmap, to create
139  * >    select()able file descriptor on which events are reported.
140  *
141  *  	Internally, we allocate a netmap_priv_d structure, that will be
142  *  	initialized on ioctl(NIOCREGIF). There is one netmap_priv_d
143  *  	structure for each open().
144  *
145  *      os-specific:
146  *  	    FreeBSD: see netmap_open() (netmap_freebsd.c)
147  *  	    linux:   see linux_netmap_open() (netmap_linux.c)
148  *
149  * > 2. on each descriptor, the process issues an ioctl() to identify
150  * >    the interface that should report events to the file descriptor.
151  *
152  * 	Implemented by netmap_ioctl(), NIOCREGIF case, with nmr->nr_cmd==0.
153  * 	Most important things happen in netmap_get_na() and
154  * 	netmap_do_regif(), called from there. Additional details can be
155  * 	found in the comments above those functions.
156  *
157  * 	In all cases, this action creates/takes-a-reference-to a
158  * 	netmap_*_adapter describing the port, and allocates a netmap_if
159  * 	and all necessary netmap rings, filling them with netmap buffers.
160  *
161  *      In this phase, the sync callbacks for each ring are set (these are used
162  *      in steps 5 and 6 below).  The callbacks depend on the type of adapter.
163  *      The adapter creation/initialization code puts them in the
164  * 	netmap_adapter (fields na->nm_txsync and na->nm_rxsync).  Then, they
165  * 	are copied from there to the netmap_kring's during netmap_do_regif(), by
166  * 	the nm_krings_create() callback.  All the nm_krings_create callbacks
167  * 	actually call netmap_krings_create() to perform this and the other
168  * 	common stuff. netmap_krings_create() also takes care of the host rings,
169  * 	if needed, by setting their sync callbacks appropriately.
170  *
171  * 	Additional actions depend on the kind of netmap_adapter that has been
172  * 	registered:
173  *
174  * 	- netmap_hw_adapter:  	     [netmap.c]
175  * 	     This is a system netdev/ifp with native netmap support.
176  * 	     The ifp is detached from the host stack by redirecting:
177  * 	       - transmissions (from the network stack) to netmap_transmit()
178  * 	       - receive notifications to the nm_notify() callback for
179  * 	         this adapter. The callback is normally netmap_notify(), unless
180  * 	         the ifp is attached to a bridge using bwrap, in which case it
181  * 	         is netmap_bwrap_intr_notify().
182  *
183  * 	- netmap_generic_adapter:      [netmap_generic.c]
184  * 	      A system netdev/ifp without native netmap support.
185  *
186  * 	(the decision about native/non native support is taken in
187  * 	 netmap_get_hw_na(), called by netmap_get_na())
188  *
189  * 	- netmap_vp_adapter 		[netmap_vale.c]
190  * 	      Returned by netmap_get_bdg_na().
191  * 	      This is a persistent or ephemeral VALE port. Ephemeral ports
192  * 	      are created on the fly if they don't already exist, and are
193  * 	      always attached to a bridge.
194  * 	      Persistent VALE ports must must be created separately, and i
195  * 	      then attached like normal NICs. The NIOCREGIF we are examining
196  * 	      will find them only if they had previosly been created and
197  * 	      attached (see VALE_CTL below).
198  *
199  * 	- netmap_pipe_adapter 	      [netmap_pipe.c]
200  * 	      Returned by netmap_get_pipe_na().
201  * 	      Both pipe ends are created, if they didn't already exist.
202  *
203  * 	- netmap_monitor_adapter      [netmap_monitor.c]
204  * 	      Returned by netmap_get_monitor_na().
205  * 	      If successful, the nm_sync callbacks of the monitored adapter
206  * 	      will be intercepted by the returned monitor.
207  *
208  * 	- netmap_bwrap_adapter	      [netmap_vale.c]
209  * 	      Cannot be obtained in this way, see VALE_CTL below
210  *
211  *
212  * 	os-specific:
213  * 	    linux: we first go through linux_netmap_ioctl() to
214  * 	           adapt the FreeBSD interface to the linux one.
215  *
216  *
217  * > 3. on each descriptor, the process issues an mmap() request to
218  * >    map the shared memory region within the process' address space.
219  * >    The list of interesting queues is indicated by a location in
220  * >    the shared memory region.
221  *
222  *      os-specific:
223  *  	    FreeBSD: netmap_mmap_single (netmap_freebsd.c).
224  *  	    linux:   linux_netmap_mmap (netmap_linux.c).
225  *
226  * > 4. using the functions in the netmap(4) userspace API, a process
227  * >    can look up the occupation state of a queue, access memory buffers,
228  * >    and retrieve received packets or enqueue packets to transmit.
229  *
230  * 	these actions do not involve the kernel.
231  *
232  * > 5. using some ioctl()s the process can synchronize the userspace view
233  * >    of the queue with the actual status in the kernel. This includes both
234  * >    receiving the notification of new packets, and transmitting new
235  * >    packets on the output interface.
236  *
237  * 	These are implemented in netmap_ioctl(), NIOCTXSYNC and NIOCRXSYNC
238  * 	cases. They invoke the nm_sync callbacks on the netmap_kring
239  * 	structures, as initialized in step 2 and maybe later modified
240  * 	by a monitor. Monitors, however, will always call the original
241  * 	callback before doing anything else.
242  *
243  *
244  * > 6. select() or poll() can be used to wait for events on individual
245  * >    transmit or receive queues (or all queues for a given interface).
246  *
247  * 	Implemented in netmap_poll(). This will call the same nm_sync()
248  * 	callbacks as in step 5 above.
249  *
250  * 	os-specific:
251  * 		linux: we first go through linux_netmap_poll() to adapt
252  * 		       the FreeBSD interface to the linux one.
253  *
254  *
255  *  ----  VALE_CTL -----
256  *
257  *  VALE switches are controlled by issuing a NIOCREGIF with a non-null
258  *  nr_cmd in the nmreq structure. These subcommands are handled by
259  *  netmap_bdg_ctl() in netmap_vale.c. Persistent VALE ports are created
260  *  and destroyed by issuing the NETMAP_BDG_NEWIF and NETMAP_BDG_DELIF
261  *  subcommands, respectively.
262  *
263  *  Any network interface known to the system (including a persistent VALE
264  *  port) can be attached to a VALE switch by issuing the
265  *  NETMAP_REQ_VALE_ATTACH command. After the attachment, persistent VALE ports
266  *  look exactly like ephemeral VALE ports (as created in step 2 above).  The
267  *  attachment of other interfaces, instead, requires the creation of a
268  *  netmap_bwrap_adapter.  Moreover, the attached interface must be put in
269  *  netmap mode. This may require the creation of a netmap_generic_adapter if
270  *  we have no native support for the interface, or if generic adapters have
271  *  been forced by sysctl.
272  *
273  *  Both persistent VALE ports and bwraps are handled by netmap_get_bdg_na(),
274  *  called by nm_bdg_ctl_attach(), and discriminated by the nm_bdg_attach()
275  *  callback.  In the case of the bwrap, the callback creates the
276  *  netmap_bwrap_adapter.  The initialization of the bwrap is then
277  *  completed by calling netmap_do_regif() on it, in the nm_bdg_ctl()
278  *  callback (netmap_bwrap_bdg_ctl in netmap_vale.c).
279  *  A generic adapter for the wrapped ifp will be created if needed, when
280  *  netmap_get_bdg_na() calls netmap_get_hw_na().
281  *
282  *
283  *  ---- DATAPATHS -----
284  *
285  *              -= SYSTEM DEVICE WITH NATIVE SUPPORT =-
286  *
287  *    na == NA(ifp) == netmap_hw_adapter created in DEVICE_netmap_attach()
288  *
289  *    - tx from netmap userspace:
290  *	 concurrently:
291  *           1) ioctl(NIOCTXSYNC)/netmap_poll() in process context
292  *                kring->nm_sync() == DEVICE_netmap_txsync()
293  *           2) device interrupt handler
294  *                na->nm_notify()  == netmap_notify()
295  *    - rx from netmap userspace:
296  *       concurrently:
297  *           1) ioctl(NIOCRXSYNC)/netmap_poll() in process context
298  *                kring->nm_sync() == DEVICE_netmap_rxsync()
299  *           2) device interrupt handler
300  *                na->nm_notify()  == netmap_notify()
301  *    - rx from host stack
302  *       concurrently:
303  *           1) host stack
304  *                netmap_transmit()
305  *                  na->nm_notify  == netmap_notify()
306  *           2) ioctl(NIOCRXSYNC)/netmap_poll() in process context
307  *                kring->nm_sync() == netmap_rxsync_from_host
308  *                  netmap_rxsync_from_host(na, NULL, NULL)
309  *    - tx to host stack
310  *           ioctl(NIOCTXSYNC)/netmap_poll() in process context
311  *             kring->nm_sync() == netmap_txsync_to_host
312  *               netmap_txsync_to_host(na)
313  *                 nm_os_send_up()
314  *                   FreeBSD: na->if_input() == ether_input()
315  *                   linux: netif_rx() with NM_MAGIC_PRIORITY_RX
316  *
317  *
318  *               -= SYSTEM DEVICE WITH GENERIC SUPPORT =-
319  *
320  *    na == NA(ifp) == generic_netmap_adapter created in generic_netmap_attach()
321  *
322  *    - tx from netmap userspace:
323  *       concurrently:
324  *           1) ioctl(NIOCTXSYNC)/netmap_poll() in process context
325  *               kring->nm_sync() == generic_netmap_txsync()
326  *                   nm_os_generic_xmit_frame()
327  *                       linux:   dev_queue_xmit() with NM_MAGIC_PRIORITY_TX
328  *                           ifp->ndo_start_xmit == generic_ndo_start_xmit()
329  *                               gna->save_start_xmit == orig. dev. start_xmit
330  *                       FreeBSD: na->if_transmit() == orig. dev if_transmit
331  *           2) generic_mbuf_destructor()
332  *                   na->nm_notify() == netmap_notify()
333  *    - rx from netmap userspace:
334  *           1) ioctl(NIOCRXSYNC)/netmap_poll() in process context
335  *               kring->nm_sync() == generic_netmap_rxsync()
336  *                   mbq_safe_dequeue()
337  *           2) device driver
338  *               generic_rx_handler()
339  *                   mbq_safe_enqueue()
340  *                   na->nm_notify() == netmap_notify()
341  *    - rx from host stack
342  *        FreeBSD: same as native
343  *        Linux: same as native except:
344  *           1) host stack
345  *               dev_queue_xmit() without NM_MAGIC_PRIORITY_TX
346  *                   ifp->ndo_start_xmit == generic_ndo_start_xmit()
347  *                       netmap_transmit()
348  *                           na->nm_notify() == netmap_notify()
349  *    - tx to host stack (same as native):
350  *
351  *
352  *                           -= VALE =-
353  *
354  *   INCOMING:
355  *
356  *      - VALE ports:
357  *          ioctl(NIOCTXSYNC)/netmap_poll() in process context
358  *              kring->nm_sync() == netmap_vp_txsync()
359  *
360  *      - system device with native support:
361  *         from cable:
362  *             interrupt
363  *                na->nm_notify() == netmap_bwrap_intr_notify(ring_nr != host ring)
364  *                     kring->nm_sync() == DEVICE_netmap_rxsync()
365  *                     netmap_vp_txsync()
366  *                     kring->nm_sync() == DEVICE_netmap_rxsync()
367  *         from host stack:
368  *             netmap_transmit()
369  *                na->nm_notify() == netmap_bwrap_intr_notify(ring_nr == host ring)
370  *                     kring->nm_sync() == netmap_rxsync_from_host()
371  *                     netmap_vp_txsync()
372  *
373  *      - system device with generic support:
374  *         from device driver:
375  *            generic_rx_handler()
376  *                na->nm_notify() == netmap_bwrap_intr_notify(ring_nr != host ring)
377  *                     kring->nm_sync() == generic_netmap_rxsync()
378  *                     netmap_vp_txsync()
379  *                     kring->nm_sync() == generic_netmap_rxsync()
380  *         from host stack:
381  *            netmap_transmit()
382  *                na->nm_notify() == netmap_bwrap_intr_notify(ring_nr == host ring)
383  *                     kring->nm_sync() == netmap_rxsync_from_host()
384  *                     netmap_vp_txsync()
385  *
386  *   (all cases) --> nm_bdg_flush()
387  *                      dest_na->nm_notify() == (see below)
388  *
389  *   OUTGOING:
390  *
391  *      - VALE ports:
392  *         concurrently:
393  *             1) ioctl(NIOCRXSYNC)/netmap_poll() in process context
394  *                    kring->nm_sync() == netmap_vp_rxsync()
395  *             2) from nm_bdg_flush()
396  *                    na->nm_notify() == netmap_notify()
397  *
398  *      - system device with native support:
399  *          to cable:
400  *             na->nm_notify() == netmap_bwrap_notify()
401  *                 netmap_vp_rxsync()
402  *                 kring->nm_sync() == DEVICE_netmap_txsync()
403  *                 netmap_vp_rxsync()
404  *          to host stack:
405  *                 netmap_vp_rxsync()
406  *                 kring->nm_sync() == netmap_txsync_to_host
407  *                 netmap_vp_rxsync_locked()
408  *
409  *      - system device with generic adapter:
410  *          to device driver:
411  *             na->nm_notify() == netmap_bwrap_notify()
412  *                 netmap_vp_rxsync()
413  *                 kring->nm_sync() == generic_netmap_txsync()
414  *                 netmap_vp_rxsync()
415  *          to host stack:
416  *                 netmap_vp_rxsync()
417  *                 kring->nm_sync() == netmap_txsync_to_host
418  *                 netmap_vp_rxsync()
419  *
420  */
421 
422 /*
423  * OS-specific code that is used only within this file.
424  * Other OS-specific code that must be accessed by drivers
425  * is present in netmap_kern.h
426  */
427 
428 #if defined(__FreeBSD__)
429 #include <sys/cdefs.h> /* prerequisite */
430 #include <sys/types.h>
431 #include <sys/errno.h>
432 #include <sys/param.h>	/* defines used in kernel.h */
433 #include <sys/kernel.h>	/* types used in module initialization */
434 #include <sys/conf.h>	/* cdevsw struct, UID, GID */
435 #include <sys/filio.h>	/* FIONBIO */
436 #include <sys/sockio.h>
437 #include <sys/socketvar.h>	/* struct socket */
438 #include <sys/malloc.h>
439 #include <sys/poll.h>
440 #include <sys/proc.h>
441 #include <sys/rwlock.h>
442 #include <sys/socket.h> /* sockaddrs */
443 #include <sys/selinfo.h>
444 #include <sys/sysctl.h>
445 #include <sys/jail.h>
446 #include <sys/epoch.h>
447 #include <net/vnet.h>
448 #include <net/if.h>
449 #include <net/if_var.h>
450 #include <net/bpf.h>		/* BIOCIMMEDIATE */
451 #include <machine/bus.h>	/* bus_dmamap_* */
452 #include <sys/endian.h>
453 #include <sys/refcount.h>
454 #include <net/ethernet.h>	/* ETHER_BPF_MTAP */
455 
456 
457 #elif defined(linux)
458 
459 #include "bsd_glue.h"
460 
461 #elif defined(__APPLE__)
462 
463 #warning OSX support is only partial
464 #include "osx_glue.h"
465 
466 #elif defined (_WIN32)
467 
468 #include "win_glue.h"
469 
470 #else
471 
472 #error	Unsupported platform
473 
474 #endif /* unsupported */
475 
476 /*
477  * common headers
478  */
479 #include <net/netmap.h>
480 #include <dev/netmap/netmap_kern.h>
481 #include <dev/netmap/netmap_mem2.h>
482 
483 
484 /* user-controlled variables */
485 int netmap_verbose;
486 #ifdef CONFIG_NETMAP_DEBUG
487 int netmap_debug;
488 #endif /* CONFIG_NETMAP_DEBUG */
489 
490 static int netmap_no_timestamp; /* don't timestamp on rxsync */
491 int netmap_no_pendintr = 1;
492 int netmap_txsync_retry = 2;
493 static int netmap_fwd = 0;	/* force transparent forwarding */
494 
495 /*
496  * netmap_admode selects the netmap mode to use.
497  * Invalid values are reset to NETMAP_ADMODE_BEST
498  */
499 enum {	NETMAP_ADMODE_BEST = 0,	/* use native, fallback to generic */
500 	NETMAP_ADMODE_NATIVE,	/* either native or none */
501 	NETMAP_ADMODE_GENERIC,	/* force generic */
502 	NETMAP_ADMODE_LAST };
503 static int netmap_admode = NETMAP_ADMODE_BEST;
504 
505 /* netmap_generic_mit controls mitigation of RX notifications for
506  * the generic netmap adapter. The value is a time interval in
507  * nanoseconds. */
508 int netmap_generic_mit = 100*1000;
509 
510 /* We use by default netmap-aware qdiscs with generic netmap adapters,
511  * even if there can be a little performance hit with hardware NICs.
512  * However, using the qdisc is the safer approach, for two reasons:
513  * 1) it prevents non-fifo qdiscs to break the TX notification
514  *    scheme, which is based on mbuf destructors when txqdisc is
515  *    not used.
516  * 2) it makes it possible to transmit over software devices that
517  *    change skb->dev, like bridge, veth, ...
518  *
519  * Anyway users looking for the best performance should
520  * use native adapters.
521  */
522 #ifdef linux
523 int netmap_generic_txqdisc = 1;
524 #endif
525 
526 /* Default number of slots and queues for generic adapters. */
527 int netmap_generic_ringsize = 1024;
528 int netmap_generic_rings = 1;
529 
530 /* Non-zero to enable checksum offloading in NIC drivers */
531 int netmap_generic_hwcsum = 0;
532 
533 /* Non-zero if ptnet devices are allowed to use virtio-net headers. */
534 int ptnet_vnet_hdr = 1;
535 
536 /*
537  * SYSCTL calls are grouped between SYSBEGIN and SYSEND to be emulated
538  * in some other operating systems
539  */
540 SYSBEGIN(main_init);
541 
542 SYSCTL_DECL(_dev_netmap);
543 SYSCTL_NODE(_dev, OID_AUTO, netmap, CTLFLAG_RW, 0, "Netmap args");
544 SYSCTL_INT(_dev_netmap, OID_AUTO, verbose,
545 		CTLFLAG_RW, &netmap_verbose, 0, "Verbose mode");
546 #ifdef CONFIG_NETMAP_DEBUG
547 SYSCTL_INT(_dev_netmap, OID_AUTO, debug,
548 		CTLFLAG_RW, &netmap_debug, 0, "Debug messages");
549 #endif /* CONFIG_NETMAP_DEBUG */
550 SYSCTL_INT(_dev_netmap, OID_AUTO, no_timestamp,
551 		CTLFLAG_RW, &netmap_no_timestamp, 0, "no_timestamp");
552 SYSCTL_INT(_dev_netmap, OID_AUTO, no_pendintr, CTLFLAG_RW, &netmap_no_pendintr,
553 		0, "Always look for new received packets.");
554 SYSCTL_INT(_dev_netmap, OID_AUTO, txsync_retry, CTLFLAG_RW,
555 		&netmap_txsync_retry, 0, "Number of txsync loops in bridge's flush.");
556 
557 SYSCTL_INT(_dev_netmap, OID_AUTO, fwd, CTLFLAG_RW, &netmap_fwd, 0,
558 		"Force NR_FORWARD mode");
559 SYSCTL_INT(_dev_netmap, OID_AUTO, admode, CTLFLAG_RW, &netmap_admode, 0,
560 		"Adapter mode. 0 selects the best option available,"
561 		"1 forces native adapter, 2 forces emulated adapter");
562 SYSCTL_INT(_dev_netmap, OID_AUTO, generic_hwcsum, CTLFLAG_RW, &netmap_generic_hwcsum,
563 		0, "Hardware checksums. 0 to disable checksum generation by the NIC (default),"
564 		"1 to enable checksum generation by the NIC");
565 SYSCTL_INT(_dev_netmap, OID_AUTO, generic_mit, CTLFLAG_RW, &netmap_generic_mit,
566 		0, "RX notification interval in nanoseconds");
567 SYSCTL_INT(_dev_netmap, OID_AUTO, generic_ringsize, CTLFLAG_RW,
568 		&netmap_generic_ringsize, 0,
569 		"Number of per-ring slots for emulated netmap mode");
570 SYSCTL_INT(_dev_netmap, OID_AUTO, generic_rings, CTLFLAG_RW,
571 		&netmap_generic_rings, 0,
572 		"Number of TX/RX queues for emulated netmap adapters");
573 #ifdef linux
574 SYSCTL_INT(_dev_netmap, OID_AUTO, generic_txqdisc, CTLFLAG_RW,
575 		&netmap_generic_txqdisc, 0, "Use qdisc for generic adapters");
576 #endif
577 SYSCTL_INT(_dev_netmap, OID_AUTO, ptnet_vnet_hdr, CTLFLAG_RW, &ptnet_vnet_hdr,
578 		0, "Allow ptnet devices to use virtio-net headers");
579 
580 SYSEND;
581 
582 NMG_LOCK_T	netmap_global_lock;
583 
584 /*
585  * mark the ring as stopped, and run through the locks
586  * to make sure other users get to see it.
587  * stopped must be either NR_KR_STOPPED (for unbounded stop)
588  * of NR_KR_LOCKED (brief stop for mutual exclusion purposes)
589  */
590 static void
591 netmap_disable_ring(struct netmap_kring *kr, int stopped)
592 {
593 	nm_kr_stop(kr, stopped);
594 	// XXX check if nm_kr_stop is sufficient
595 	mtx_lock(&kr->q_lock);
596 	mtx_unlock(&kr->q_lock);
597 	nm_kr_put(kr);
598 }
599 
600 /* stop or enable a single ring */
601 void
602 netmap_set_ring(struct netmap_adapter *na, u_int ring_id, enum txrx t, int stopped)
603 {
604 	if (stopped)
605 		netmap_disable_ring(NMR(na, t)[ring_id], stopped);
606 	else
607 		NMR(na, t)[ring_id]->nkr_stopped = 0;
608 }
609 
610 
611 /* stop or enable all the rings of na */
612 void
613 netmap_set_all_rings(struct netmap_adapter *na, int stopped)
614 {
615 	int i;
616 	enum txrx t;
617 
618 	if (!nm_netmap_on(na))
619 		return;
620 
621 	for_rx_tx(t) {
622 		for (i = 0; i < netmap_real_rings(na, t); i++) {
623 			netmap_set_ring(na, i, t, stopped);
624 		}
625 	}
626 }
627 
628 /*
629  * Convenience function used in drivers.  Waits for current txsync()s/rxsync()s
630  * to finish and prevents any new one from starting.  Call this before turning
631  * netmap mode off, or before removing the hardware rings (e.g., on module
632  * onload).
633  */
634 void
635 netmap_disable_all_rings(struct ifnet *ifp)
636 {
637 	if (NM_NA_VALID(ifp)) {
638 		netmap_set_all_rings(NA(ifp), NM_KR_STOPPED);
639 	}
640 }
641 
642 /*
643  * Convenience function used in drivers.  Re-enables rxsync and txsync on the
644  * adapter's rings In linux drivers, this should be placed near each
645  * napi_enable().
646  */
647 void
648 netmap_enable_all_rings(struct ifnet *ifp)
649 {
650 	if (NM_NA_VALID(ifp)) {
651 		netmap_set_all_rings(NA(ifp), 0 /* enabled */);
652 	}
653 }
654 
655 void
656 netmap_make_zombie(struct ifnet *ifp)
657 {
658 	if (NM_NA_VALID(ifp)) {
659 		struct netmap_adapter *na = NA(ifp);
660 		netmap_set_all_rings(na, NM_KR_LOCKED);
661 		na->na_flags |= NAF_ZOMBIE;
662 		netmap_set_all_rings(na, 0);
663 	}
664 }
665 
666 void
667 netmap_undo_zombie(struct ifnet *ifp)
668 {
669 	if (NM_NA_VALID(ifp)) {
670 		struct netmap_adapter *na = NA(ifp);
671 		if (na->na_flags & NAF_ZOMBIE) {
672 			netmap_set_all_rings(na, NM_KR_LOCKED);
673 			na->na_flags &= ~NAF_ZOMBIE;
674 			netmap_set_all_rings(na, 0);
675 		}
676 	}
677 }
678 
679 /*
680  * generic bound_checking function
681  */
682 u_int
683 nm_bound_var(u_int *v, u_int dflt, u_int lo, u_int hi, const char *msg)
684 {
685 	u_int oldv = *v;
686 	const char *op = NULL;
687 
688 	if (dflt < lo)
689 		dflt = lo;
690 	if (dflt > hi)
691 		dflt = hi;
692 	if (oldv < lo) {
693 		*v = dflt;
694 		op = "Bump";
695 	} else if (oldv > hi) {
696 		*v = hi;
697 		op = "Clamp";
698 	}
699 	if (op && msg)
700 		nm_prinf("%s %s to %d (was %d)", op, msg, *v, oldv);
701 	return *v;
702 }
703 
704 
705 /*
706  * packet-dump function, user-supplied or static buffer.
707  * The destination buffer must be at least 30+4*len
708  */
709 const char *
710 nm_dump_buf(char *p, int len, int lim, char *dst)
711 {
712 	static char _dst[8192];
713 	int i, j, i0;
714 	static char hex[] ="0123456789abcdef";
715 	char *o;	/* output position */
716 
717 #define P_HI(x)	hex[((x) & 0xf0)>>4]
718 #define P_LO(x)	hex[((x) & 0xf)]
719 #define P_C(x)	((x) >= 0x20 && (x) <= 0x7e ? (x) : '.')
720 	if (!dst)
721 		dst = _dst;
722 	if (lim <= 0 || lim > len)
723 		lim = len;
724 	o = dst;
725 	sprintf(o, "buf 0x%p len %d lim %d\n", p, len, lim);
726 	o += strlen(o);
727 	/* hexdump routine */
728 	for (i = 0; i < lim; ) {
729 		sprintf(o, "%5d: ", i);
730 		o += strlen(o);
731 		memset(o, ' ', 48);
732 		i0 = i;
733 		for (j=0; j < 16 && i < lim; i++, j++) {
734 			o[j*3] = P_HI(p[i]);
735 			o[j*3+1] = P_LO(p[i]);
736 		}
737 		i = i0;
738 		for (j=0; j < 16 && i < lim; i++, j++)
739 			o[j + 48] = P_C(p[i]);
740 		o[j+48] = '\n';
741 		o += j+49;
742 	}
743 	*o = '\0';
744 #undef P_HI
745 #undef P_LO
746 #undef P_C
747 	return dst;
748 }
749 
750 
751 /*
752  * Fetch configuration from the device, to cope with dynamic
753  * reconfigurations after loading the module.
754  */
755 /* call with NMG_LOCK held */
756 int
757 netmap_update_config(struct netmap_adapter *na)
758 {
759 	struct nm_config_info info;
760 
761 	bzero(&info, sizeof(info));
762 	if (na->nm_config == NULL ||
763 	    na->nm_config(na, &info)) {
764 		/* take whatever we had at init time */
765 		info.num_tx_rings = na->num_tx_rings;
766 		info.num_tx_descs = na->num_tx_desc;
767 		info.num_rx_rings = na->num_rx_rings;
768 		info.num_rx_descs = na->num_rx_desc;
769 		info.rx_buf_maxsize = na->rx_buf_maxsize;
770 	}
771 
772 	if (na->num_tx_rings == info.num_tx_rings &&
773 	    na->num_tx_desc == info.num_tx_descs &&
774 	    na->num_rx_rings == info.num_rx_rings &&
775 	    na->num_rx_desc == info.num_rx_descs &&
776 	    na->rx_buf_maxsize == info.rx_buf_maxsize)
777 		return 0; /* nothing changed */
778 	if (na->active_fds == 0) {
779 		na->num_tx_rings = info.num_tx_rings;
780 		na->num_tx_desc = info.num_tx_descs;
781 		na->num_rx_rings = info.num_rx_rings;
782 		na->num_rx_desc = info.num_rx_descs;
783 		na->rx_buf_maxsize = info.rx_buf_maxsize;
784 		if (netmap_verbose)
785 			nm_prinf("configuration changed for %s: txring %d x %d, "
786 				"rxring %d x %d, rxbufsz %d",
787 				na->name, na->num_tx_rings, na->num_tx_desc,
788 				na->num_rx_rings, na->num_rx_desc, na->rx_buf_maxsize);
789 		return 0;
790 	}
791 	nm_prerr("WARNING: configuration changed for %s while active: "
792 		"txring %d x %d, rxring %d x %d, rxbufsz %d",
793 		na->name, info.num_tx_rings, info.num_tx_descs,
794 		info.num_rx_rings, info.num_rx_descs,
795 		info.rx_buf_maxsize);
796 	return 1;
797 }
798 
799 /* nm_sync callbacks for the host rings */
800 static int netmap_txsync_to_host(struct netmap_kring *kring, int flags);
801 static int netmap_rxsync_from_host(struct netmap_kring *kring, int flags);
802 
803 /* create the krings array and initialize the fields common to all adapters.
804  * The array layout is this:
805  *
806  *                    +----------+
807  * na->tx_rings ----->|          | \
808  *                    |          |  } na->num_tx_ring
809  *                    |          | /
810  *                    +----------+
811  *                    |          |    host tx kring
812  * na->rx_rings ----> +----------+
813  *                    |          | \
814  *                    |          |  } na->num_rx_rings
815  *                    |          | /
816  *                    +----------+
817  *                    |          |    host rx kring
818  *                    +----------+
819  * na->tailroom ----->|          | \
820  *                    |          |  } tailroom bytes
821  *                    |          | /
822  *                    +----------+
823  *
824  * Note: for compatibility, host krings are created even when not needed.
825  * The tailroom space is currently used by vale ports for allocating leases.
826  */
827 /* call with NMG_LOCK held */
828 int
829 netmap_krings_create(struct netmap_adapter *na, u_int tailroom)
830 {
831 	u_int i, len, ndesc;
832 	struct netmap_kring *kring;
833 	u_int n[NR_TXRX];
834 	enum txrx t;
835 	int err = 0;
836 
837 	if (na->tx_rings != NULL) {
838 		if (netmap_debug & NM_DEBUG_ON)
839 			nm_prerr("warning: krings were already created");
840 		return 0;
841 	}
842 
843 	/* account for the (possibly fake) host rings */
844 	n[NR_TX] = netmap_all_rings(na, NR_TX);
845 	n[NR_RX] = netmap_all_rings(na, NR_RX);
846 
847 	len = (n[NR_TX] + n[NR_RX]) *
848 		(sizeof(struct netmap_kring) + sizeof(struct netmap_kring *))
849 		+ tailroom;
850 
851 	na->tx_rings = nm_os_malloc((size_t)len);
852 	if (na->tx_rings == NULL) {
853 		nm_prerr("Cannot allocate krings");
854 		return ENOMEM;
855 	}
856 	na->rx_rings = na->tx_rings + n[NR_TX];
857 	na->tailroom = na->rx_rings + n[NR_RX];
858 
859 	/* link the krings in the krings array */
860 	kring = (struct netmap_kring *)((char *)na->tailroom + tailroom);
861 	for (i = 0; i < n[NR_TX] + n[NR_RX]; i++) {
862 		na->tx_rings[i] = kring;
863 		kring++;
864 	}
865 
866 	/*
867 	 * All fields in krings are 0 except the one initialized below.
868 	 * but better be explicit on important kring fields.
869 	 */
870 	for_rx_tx(t) {
871 		ndesc = nma_get_ndesc(na, t);
872 		for (i = 0; i < n[t]; i++) {
873 			kring = NMR(na, t)[i];
874 			bzero(kring, sizeof(*kring));
875 			kring->notify_na = na;
876 			kring->ring_id = i;
877 			kring->tx = t;
878 			kring->nkr_num_slots = ndesc;
879 			kring->nr_mode = NKR_NETMAP_OFF;
880 			kring->nr_pending_mode = NKR_NETMAP_OFF;
881 			if (i < nma_get_nrings(na, t)) {
882 				kring->nm_sync = (t == NR_TX ? na->nm_txsync : na->nm_rxsync);
883 			} else {
884 				if (!(na->na_flags & NAF_HOST_RINGS))
885 					kring->nr_kflags |= NKR_FAKERING;
886 				kring->nm_sync = (t == NR_TX ?
887 						netmap_txsync_to_host:
888 						netmap_rxsync_from_host);
889 			}
890 			kring->nm_notify = na->nm_notify;
891 			kring->rhead = kring->rcur = kring->nr_hwcur = 0;
892 			/*
893 			 * IMPORTANT: Always keep one slot empty.
894 			 */
895 			kring->rtail = kring->nr_hwtail = (t == NR_TX ? ndesc - 1 : 0);
896 			snprintf(kring->name, sizeof(kring->name) - 1, "%s %s%d", na->name,
897 					nm_txrx2str(t), i);
898 			nm_prdis("ktx %s h %d c %d t %d",
899 				kring->name, kring->rhead, kring->rcur, kring->rtail);
900 			err = nm_os_selinfo_init(&kring->si, kring->name);
901 			if (err) {
902 				netmap_krings_delete(na);
903 				return err;
904 			}
905 			mtx_init(&kring->q_lock, (t == NR_TX ? "nm_txq_lock" : "nm_rxq_lock"), NULL, MTX_DEF);
906 			kring->na = na;	/* setting this field marks the mutex as initialized */
907 		}
908 		err = nm_os_selinfo_init(&na->si[t], na->name);
909 		if (err) {
910 			netmap_krings_delete(na);
911 			return err;
912 		}
913 	}
914 
915 	return 0;
916 }
917 
918 
919 /* undo the actions performed by netmap_krings_create */
920 /* call with NMG_LOCK held */
921 void
922 netmap_krings_delete(struct netmap_adapter *na)
923 {
924 	struct netmap_kring **kring = na->tx_rings;
925 	enum txrx t;
926 
927 	if (na->tx_rings == NULL) {
928 		if (netmap_debug & NM_DEBUG_ON)
929 			nm_prerr("warning: krings were already deleted");
930 		return;
931 	}
932 
933 	for_rx_tx(t)
934 		nm_os_selinfo_uninit(&na->si[t]);
935 
936 	/* we rely on the krings layout described above */
937 	for ( ; kring != na->tailroom; kring++) {
938 		if ((*kring)->na != NULL)
939 			mtx_destroy(&(*kring)->q_lock);
940 		nm_os_selinfo_uninit(&(*kring)->si);
941 	}
942 	nm_os_free(na->tx_rings);
943 	na->tx_rings = na->rx_rings = na->tailroom = NULL;
944 }
945 
946 
947 /*
948  * Destructor for NIC ports. They also have an mbuf queue
949  * on the rings connected to the host so we need to purge
950  * them first.
951  */
952 /* call with NMG_LOCK held */
953 void
954 netmap_hw_krings_delete(struct netmap_adapter *na)
955 {
956 	u_int lim = netmap_real_rings(na, NR_RX), i;
957 
958 	for (i = nma_get_nrings(na, NR_RX); i < lim; i++) {
959 		struct mbq *q = &NMR(na, NR_RX)[i]->rx_queue;
960 		nm_prdis("destroy sw mbq with len %d", mbq_len(q));
961 		mbq_purge(q);
962 		mbq_safe_fini(q);
963 	}
964 	netmap_krings_delete(na);
965 }
966 
967 static void
968 netmap_mem_drop(struct netmap_adapter *na)
969 {
970 	int last = netmap_mem_deref(na->nm_mem, na);
971 	/* if the native allocator had been overrided on regif,
972 	 * restore it now and drop the temporary one
973 	 */
974 	if (last && na->nm_mem_prev) {
975 		netmap_mem_put(na->nm_mem);
976 		na->nm_mem = na->nm_mem_prev;
977 		na->nm_mem_prev = NULL;
978 	}
979 }
980 
981 /*
982  * Undo everything that was done in netmap_do_regif(). In particular,
983  * call nm_register(ifp,0) to stop netmap mode on the interface and
984  * revert to normal operation.
985  */
986 /* call with NMG_LOCK held */
987 static void netmap_unset_ringid(struct netmap_priv_d *);
988 static void netmap_krings_put(struct netmap_priv_d *);
989 void
990 netmap_do_unregif(struct netmap_priv_d *priv)
991 {
992 	struct netmap_adapter *na = priv->np_na;
993 
994 	NMG_LOCK_ASSERT();
995 	na->active_fds--;
996 	/* unset nr_pending_mode and possibly release exclusive mode */
997 	netmap_krings_put(priv);
998 
999 #ifdef	WITH_MONITOR
1000 	/* XXX check whether we have to do something with monitor
1001 	 * when rings change nr_mode. */
1002 	if (na->active_fds <= 0) {
1003 		/* walk through all the rings and tell any monitor
1004 		 * that the port is going to exit netmap mode
1005 		 */
1006 		netmap_monitor_stop(na);
1007 	}
1008 #endif
1009 
1010 	if (na->active_fds <= 0 || nm_kring_pending(priv)) {
1011 		na->nm_register(na, 0);
1012 	}
1013 
1014 	/* delete rings and buffers that are no longer needed */
1015 	netmap_mem_rings_delete(na);
1016 
1017 	if (na->active_fds <= 0) {	/* last instance */
1018 		/*
1019 		 * (TO CHECK) We enter here
1020 		 * when the last reference to this file descriptor goes
1021 		 * away. This means we cannot have any pending poll()
1022 		 * or interrupt routine operating on the structure.
1023 		 * XXX The file may be closed in a thread while
1024 		 * another thread is using it.
1025 		 * Linux keeps the file opened until the last reference
1026 		 * by any outstanding ioctl/poll or mmap is gone.
1027 		 * FreeBSD does not track mmap()s (but we do) and
1028 		 * wakes up any sleeping poll(). Need to check what
1029 		 * happens if the close() occurs while a concurrent
1030 		 * syscall is running.
1031 		 */
1032 		if (netmap_debug & NM_DEBUG_ON)
1033 			nm_prinf("deleting last instance for %s", na->name);
1034 
1035 		if (nm_netmap_on(na)) {
1036 			nm_prerr("BUG: netmap on while going to delete the krings");
1037 		}
1038 
1039 		na->nm_krings_delete(na);
1040 
1041 		/* restore the default number of host tx and rx rings */
1042 		if (na->na_flags & NAF_HOST_RINGS) {
1043 			na->num_host_tx_rings = 1;
1044 			na->num_host_rx_rings = 1;
1045 		} else {
1046 			na->num_host_tx_rings = 0;
1047 			na->num_host_rx_rings = 0;
1048 		}
1049 	}
1050 
1051 	/* possibily decrement counter of tx_si/rx_si users */
1052 	netmap_unset_ringid(priv);
1053 	/* delete the nifp */
1054 	netmap_mem_if_delete(na, priv->np_nifp);
1055 	/* drop the allocator */
1056 	netmap_mem_drop(na);
1057 	/* mark the priv as unregistered */
1058 	priv->np_na = NULL;
1059 	priv->np_nifp = NULL;
1060 }
1061 
1062 struct netmap_priv_d*
1063 netmap_priv_new(void)
1064 {
1065 	struct netmap_priv_d *priv;
1066 
1067 	priv = nm_os_malloc(sizeof(struct netmap_priv_d));
1068 	if (priv == NULL)
1069 		return NULL;
1070 	priv->np_refs = 1;
1071 	nm_os_get_module();
1072 	return priv;
1073 }
1074 
1075 /*
1076  * Destructor of the netmap_priv_d, called when the fd is closed
1077  * Action: undo all the things done by NIOCREGIF,
1078  * On FreeBSD we need to track whether there are active mmap()s,
1079  * and we use np_active_mmaps for that. On linux, the field is always 0.
1080  * Return: 1 if we can free priv, 0 otherwise.
1081  *
1082  */
1083 /* call with NMG_LOCK held */
1084 void
1085 netmap_priv_delete(struct netmap_priv_d *priv)
1086 {
1087 	struct netmap_adapter *na = priv->np_na;
1088 
1089 	/* number of active references to this fd */
1090 	if (--priv->np_refs > 0) {
1091 		return;
1092 	}
1093 	nm_os_put_module();
1094 	if (na) {
1095 		netmap_do_unregif(priv);
1096 	}
1097 	netmap_unget_na(na, priv->np_ifp);
1098 	bzero(priv, sizeof(*priv));	/* for safety */
1099 	nm_os_free(priv);
1100 }
1101 
1102 
1103 /* call with NMG_LOCK *not* held */
1104 void
1105 netmap_dtor(void *data)
1106 {
1107 	struct netmap_priv_d *priv = data;
1108 
1109 	NMG_LOCK();
1110 	netmap_priv_delete(priv);
1111 	NMG_UNLOCK();
1112 }
1113 
1114 
1115 /*
1116  * Handlers for synchronization of the rings from/to the host stack.
1117  * These are associated to a network interface and are just another
1118  * ring pair managed by userspace.
1119  *
1120  * Netmap also supports transparent forwarding (NS_FORWARD and NR_FORWARD
1121  * flags):
1122  *
1123  * - Before releasing buffers on hw RX rings, the application can mark
1124  *   them with the NS_FORWARD flag. During the next RXSYNC or poll(), they
1125  *   will be forwarded to the host stack, similarly to what happened if
1126  *   the application moved them to the host TX ring.
1127  *
1128  * - Before releasing buffers on the host RX ring, the application can
1129  *   mark them with the NS_FORWARD flag. During the next RXSYNC or poll(),
1130  *   they will be forwarded to the hw TX rings, saving the application
1131  *   from doing the same task in user-space.
1132  *
1133  * Transparent fowarding can be enabled per-ring, by setting the NR_FORWARD
1134  * flag, or globally with the netmap_fwd sysctl.
1135  *
1136  * The transfer NIC --> host is relatively easy, just encapsulate
1137  * into mbufs and we are done. The host --> NIC side is slightly
1138  * harder because there might not be room in the tx ring so it
1139  * might take a while before releasing the buffer.
1140  */
1141 
1142 
1143 /*
1144  * Pass a whole queue of mbufs to the host stack as coming from 'dst'
1145  * We do not need to lock because the queue is private.
1146  * After this call the queue is empty.
1147  */
1148 static void
1149 netmap_send_up(struct ifnet *dst, struct mbq *q)
1150 {
1151 	struct epoch_tracker et;
1152 	struct mbuf *m;
1153 	struct mbuf *head = NULL, *prev = NULL;
1154 
1155 	NET_EPOCH_ENTER(et);
1156 	/* Send packets up, outside the lock; head/prev machinery
1157 	 * is only useful for Windows. */
1158 	while ((m = mbq_dequeue(q)) != NULL) {
1159 		if (netmap_debug & NM_DEBUG_HOST)
1160 			nm_prinf("sending up pkt %p size %d", m, MBUF_LEN(m));
1161 		prev = nm_os_send_up(dst, m, prev);
1162 		if (head == NULL)
1163 			head = prev;
1164 	}
1165 	if (head)
1166 		nm_os_send_up(dst, NULL, head);
1167 	NET_EPOCH_EXIT(et);
1168 	mbq_fini(q);
1169 }
1170 
1171 
1172 /*
1173  * Scan the buffers from hwcur to ring->head, and put a copy of those
1174  * marked NS_FORWARD (or all of them if forced) into a queue of mbufs.
1175  * Drop remaining packets in the unlikely event
1176  * of an mbuf shortage.
1177  */
1178 static void
1179 netmap_grab_packets(struct netmap_kring *kring, struct mbq *q, int force)
1180 {
1181 	u_int const lim = kring->nkr_num_slots - 1;
1182 	u_int const head = kring->rhead;
1183 	u_int n;
1184 	struct netmap_adapter *na = kring->na;
1185 
1186 	for (n = kring->nr_hwcur; n != head; n = nm_next(n, lim)) {
1187 		struct mbuf *m;
1188 		struct netmap_slot *slot = &kring->ring->slot[n];
1189 
1190 		if ((slot->flags & NS_FORWARD) == 0 && !force)
1191 			continue;
1192 		if (slot->len < 14 || slot->len > NETMAP_BUF_SIZE(na)) {
1193 			nm_prlim(5, "bad pkt at %d len %d", n, slot->len);
1194 			continue;
1195 		}
1196 		slot->flags &= ~NS_FORWARD; // XXX needed ?
1197 		/* XXX TODO: adapt to the case of a multisegment packet */
1198 		m = m_devget(NMB(na, slot), slot->len, 0, na->ifp, NULL);
1199 
1200 		if (m == NULL)
1201 			break;
1202 		mbq_enqueue(q, m);
1203 	}
1204 }
1205 
1206 static inline int
1207 _nm_may_forward(struct netmap_kring *kring)
1208 {
1209 	return	((netmap_fwd || kring->ring->flags & NR_FORWARD) &&
1210 		 kring->na->na_flags & NAF_HOST_RINGS &&
1211 		 kring->tx == NR_RX);
1212 }
1213 
1214 static inline int
1215 nm_may_forward_up(struct netmap_kring *kring)
1216 {
1217 	return	_nm_may_forward(kring) &&
1218 		 kring->ring_id != kring->na->num_rx_rings;
1219 }
1220 
1221 static inline int
1222 nm_may_forward_down(struct netmap_kring *kring, int sync_flags)
1223 {
1224 	return	_nm_may_forward(kring) &&
1225 		 (sync_flags & NAF_CAN_FORWARD_DOWN) &&
1226 		 kring->ring_id == kring->na->num_rx_rings;
1227 }
1228 
1229 /*
1230  * Send to the NIC rings packets marked NS_FORWARD between
1231  * kring->nr_hwcur and kring->rhead.
1232  * Called under kring->rx_queue.lock on the sw rx ring.
1233  *
1234  * It can only be called if the user opened all the TX hw rings,
1235  * see NAF_CAN_FORWARD_DOWN flag.
1236  * We can touch the TX netmap rings (slots, head and cur) since
1237  * we are in poll/ioctl system call context, and the application
1238  * is not supposed to touch the ring (using a different thread)
1239  * during the execution of the system call.
1240  */
1241 static u_int
1242 netmap_sw_to_nic(struct netmap_adapter *na)
1243 {
1244 	struct netmap_kring *kring = na->rx_rings[na->num_rx_rings];
1245 	struct netmap_slot *rxslot = kring->ring->slot;
1246 	u_int i, rxcur = kring->nr_hwcur;
1247 	u_int const head = kring->rhead;
1248 	u_int const src_lim = kring->nkr_num_slots - 1;
1249 	u_int sent = 0;
1250 
1251 	/* scan rings to find space, then fill as much as possible */
1252 	for (i = 0; i < na->num_tx_rings; i++) {
1253 		struct netmap_kring *kdst = na->tx_rings[i];
1254 		struct netmap_ring *rdst = kdst->ring;
1255 		u_int const dst_lim = kdst->nkr_num_slots - 1;
1256 
1257 		/* XXX do we trust ring or kring->rcur,rtail ? */
1258 		for (; rxcur != head && !nm_ring_empty(rdst);
1259 		     rxcur = nm_next(rxcur, src_lim) ) {
1260 			struct netmap_slot *src, *dst, tmp;
1261 			u_int dst_head = rdst->head;
1262 
1263 			src = &rxslot[rxcur];
1264 			if ((src->flags & NS_FORWARD) == 0 && !netmap_fwd)
1265 				continue;
1266 
1267 			sent++;
1268 
1269 			dst = &rdst->slot[dst_head];
1270 
1271 			tmp = *src;
1272 
1273 			src->buf_idx = dst->buf_idx;
1274 			src->flags = NS_BUF_CHANGED;
1275 
1276 			dst->buf_idx = tmp.buf_idx;
1277 			dst->len = tmp.len;
1278 			dst->flags = NS_BUF_CHANGED;
1279 
1280 			rdst->head = rdst->cur = nm_next(dst_head, dst_lim);
1281 		}
1282 		/* if (sent) XXX txsync ? it would be just an optimization */
1283 	}
1284 	return sent;
1285 }
1286 
1287 
1288 /*
1289  * netmap_txsync_to_host() passes packets up. We are called from a
1290  * system call in user process context, and the only contention
1291  * can be among multiple user threads erroneously calling
1292  * this routine concurrently.
1293  */
1294 static int
1295 netmap_txsync_to_host(struct netmap_kring *kring, int flags)
1296 {
1297 	struct netmap_adapter *na = kring->na;
1298 	u_int const lim = kring->nkr_num_slots - 1;
1299 	u_int const head = kring->rhead;
1300 	struct mbq q;
1301 
1302 	/* Take packets from hwcur to head and pass them up.
1303 	 * Force hwcur = head since netmap_grab_packets() stops at head
1304 	 */
1305 	mbq_init(&q);
1306 	netmap_grab_packets(kring, &q, 1 /* force */);
1307 	nm_prdis("have %d pkts in queue", mbq_len(&q));
1308 	kring->nr_hwcur = head;
1309 	kring->nr_hwtail = head + lim;
1310 	if (kring->nr_hwtail > lim)
1311 		kring->nr_hwtail -= lim + 1;
1312 
1313 	netmap_send_up(na->ifp, &q);
1314 	return 0;
1315 }
1316 
1317 
1318 /*
1319  * rxsync backend for packets coming from the host stack.
1320  * They have been put in kring->rx_queue by netmap_transmit().
1321  * We protect access to the kring using kring->rx_queue.lock
1322  *
1323  * also moves to the nic hw rings any packet the user has marked
1324  * for transparent-mode forwarding, then sets the NR_FORWARD
1325  * flag in the kring to let the caller push them out
1326  */
1327 static int
1328 netmap_rxsync_from_host(struct netmap_kring *kring, int flags)
1329 {
1330 	struct netmap_adapter *na = kring->na;
1331 	struct netmap_ring *ring = kring->ring;
1332 	u_int nm_i, n;
1333 	u_int const lim = kring->nkr_num_slots - 1;
1334 	u_int const head = kring->rhead;
1335 	int ret = 0;
1336 	struct mbq *q = &kring->rx_queue, fq;
1337 
1338 	mbq_init(&fq); /* fq holds packets to be freed */
1339 
1340 	mbq_lock(q);
1341 
1342 	/* First part: import newly received packets */
1343 	n = mbq_len(q);
1344 	if (n) { /* grab packets from the queue */
1345 		struct mbuf *m;
1346 		uint32_t stop_i;
1347 
1348 		nm_i = kring->nr_hwtail;
1349 		stop_i = nm_prev(kring->nr_hwcur, lim);
1350 		while ( nm_i != stop_i && (m = mbq_dequeue(q)) != NULL ) {
1351 			int len = MBUF_LEN(m);
1352 			struct netmap_slot *slot = &ring->slot[nm_i];
1353 
1354 			m_copydata(m, 0, len, NMB(na, slot));
1355 			nm_prdis("nm %d len %d", nm_i, len);
1356 			if (netmap_debug & NM_DEBUG_HOST)
1357 				nm_prinf("%s", nm_dump_buf(NMB(na, slot),len, 128, NULL));
1358 
1359 			slot->len = len;
1360 			slot->flags = 0;
1361 			nm_i = nm_next(nm_i, lim);
1362 			mbq_enqueue(&fq, m);
1363 		}
1364 		kring->nr_hwtail = nm_i;
1365 	}
1366 
1367 	/*
1368 	 * Second part: skip past packets that userspace has released.
1369 	 */
1370 	nm_i = kring->nr_hwcur;
1371 	if (nm_i != head) { /* something was released */
1372 		if (nm_may_forward_down(kring, flags)) {
1373 			ret = netmap_sw_to_nic(na);
1374 			if (ret > 0) {
1375 				kring->nr_kflags |= NR_FORWARD;
1376 				ret = 0;
1377 			}
1378 		}
1379 		kring->nr_hwcur = head;
1380 	}
1381 
1382 	mbq_unlock(q);
1383 
1384 	mbq_purge(&fq);
1385 	mbq_fini(&fq);
1386 
1387 	return ret;
1388 }
1389 
1390 
1391 /* Get a netmap adapter for the port.
1392  *
1393  * If it is possible to satisfy the request, return 0
1394  * with *na containing the netmap adapter found.
1395  * Otherwise return an error code, with *na containing NULL.
1396  *
1397  * When the port is attached to a bridge, we always return
1398  * EBUSY.
1399  * Otherwise, if the port is already bound to a file descriptor,
1400  * then we unconditionally return the existing adapter into *na.
1401  * In all the other cases, we return (into *na) either native,
1402  * generic or NULL, according to the following table:
1403  *
1404  *					native_support
1405  * active_fds   dev.netmap.admode         YES     NO
1406  * -------------------------------------------------------
1407  *    >0              *                 NA(ifp) NA(ifp)
1408  *
1409  *     0        NETMAP_ADMODE_BEST      NATIVE  GENERIC
1410  *     0        NETMAP_ADMODE_NATIVE    NATIVE   NULL
1411  *     0        NETMAP_ADMODE_GENERIC   GENERIC GENERIC
1412  *
1413  */
1414 static void netmap_hw_dtor(struct netmap_adapter *); /* needed by NM_IS_NATIVE() */
1415 int
1416 netmap_get_hw_na(struct ifnet *ifp, struct netmap_mem_d *nmd, struct netmap_adapter **na)
1417 {
1418 	/* generic support */
1419 	int i = netmap_admode;	/* Take a snapshot. */
1420 	struct netmap_adapter *prev_na;
1421 	int error = 0;
1422 
1423 	*na = NULL; /* default */
1424 
1425 	/* reset in case of invalid value */
1426 	if (i < NETMAP_ADMODE_BEST || i >= NETMAP_ADMODE_LAST)
1427 		i = netmap_admode = NETMAP_ADMODE_BEST;
1428 
1429 	if (NM_NA_VALID(ifp)) {
1430 		prev_na = NA(ifp);
1431 		/* If an adapter already exists, return it if
1432 		 * there are active file descriptors or if
1433 		 * netmap is not forced to use generic
1434 		 * adapters.
1435 		 */
1436 		if (NETMAP_OWNED_BY_ANY(prev_na)
1437 			|| i != NETMAP_ADMODE_GENERIC
1438 			|| prev_na->na_flags & NAF_FORCE_NATIVE
1439 #ifdef WITH_PIPES
1440 			/* ugly, but we cannot allow an adapter switch
1441 			 * if some pipe is referring to this one
1442 			 */
1443 			|| prev_na->na_next_pipe > 0
1444 #endif
1445 		) {
1446 			*na = prev_na;
1447 			goto assign_mem;
1448 		}
1449 	}
1450 
1451 	/* If there isn't native support and netmap is not allowed
1452 	 * to use generic adapters, we cannot satisfy the request.
1453 	 */
1454 	if (!NM_IS_NATIVE(ifp) && i == NETMAP_ADMODE_NATIVE)
1455 		return EOPNOTSUPP;
1456 
1457 	/* Otherwise, create a generic adapter and return it,
1458 	 * saving the previously used netmap adapter, if any.
1459 	 *
1460 	 * Note that here 'prev_na', if not NULL, MUST be a
1461 	 * native adapter, and CANNOT be a generic one. This is
1462 	 * true because generic adapters are created on demand, and
1463 	 * destroyed when not used anymore. Therefore, if the adapter
1464 	 * currently attached to an interface 'ifp' is generic, it
1465 	 * must be that
1466 	 * (NA(ifp)->active_fds > 0 || NETMAP_OWNED_BY_KERN(NA(ifp))).
1467 	 * Consequently, if NA(ifp) is generic, we will enter one of
1468 	 * the branches above. This ensures that we never override
1469 	 * a generic adapter with another generic adapter.
1470 	 */
1471 	error = generic_netmap_attach(ifp);
1472 	if (error)
1473 		return error;
1474 
1475 	*na = NA(ifp);
1476 
1477 assign_mem:
1478 	if (nmd != NULL && !((*na)->na_flags & NAF_MEM_OWNER) &&
1479 	    (*na)->active_fds == 0 && ((*na)->nm_mem != nmd)) {
1480 		(*na)->nm_mem_prev = (*na)->nm_mem;
1481 		(*na)->nm_mem = netmap_mem_get(nmd);
1482 	}
1483 
1484 	return 0;
1485 }
1486 
1487 /*
1488  * MUST BE CALLED UNDER NMG_LOCK()
1489  *
1490  * Get a refcounted reference to a netmap adapter attached
1491  * to the interface specified by req.
1492  * This is always called in the execution of an ioctl().
1493  *
1494  * Return ENXIO if the interface specified by the request does
1495  * not exist, ENOTSUP if netmap is not supported by the interface,
1496  * EBUSY if the interface is already attached to a bridge,
1497  * EINVAL if parameters are invalid, ENOMEM if needed resources
1498  * could not be allocated.
1499  * If successful, hold a reference to the netmap adapter.
1500  *
1501  * If the interface specified by req is a system one, also keep
1502  * a reference to it and return a valid *ifp.
1503  */
1504 int
1505 netmap_get_na(struct nmreq_header *hdr,
1506 	      struct netmap_adapter **na, struct ifnet **ifp,
1507 	      struct netmap_mem_d *nmd, int create)
1508 {
1509 	struct nmreq_register *req = (struct nmreq_register *)(uintptr_t)hdr->nr_body;
1510 	int error = 0;
1511 	struct netmap_adapter *ret = NULL;
1512 	int nmd_ref = 0;
1513 
1514 	*na = NULL;     /* default return value */
1515 	*ifp = NULL;
1516 
1517 	if (hdr->nr_reqtype != NETMAP_REQ_REGISTER) {
1518 		return EINVAL;
1519 	}
1520 
1521 	if (req->nr_mode == NR_REG_PIPE_MASTER ||
1522 			req->nr_mode == NR_REG_PIPE_SLAVE) {
1523 		/* Do not accept deprecated pipe modes. */
1524 		nm_prerr("Deprecated pipe nr_mode, use xx{yy or xx}yy syntax");
1525 		return EINVAL;
1526 	}
1527 
1528 	NMG_LOCK_ASSERT();
1529 
1530 	/* if the request contain a memid, try to find the
1531 	 * corresponding memory region
1532 	 */
1533 	if (nmd == NULL && req->nr_mem_id) {
1534 		nmd = netmap_mem_find(req->nr_mem_id);
1535 		if (nmd == NULL)
1536 			return EINVAL;
1537 		/* keep the rereference */
1538 		nmd_ref = 1;
1539 	}
1540 
1541 	/* We cascade through all possible types of netmap adapter.
1542 	 * All netmap_get_*_na() functions return an error and an na,
1543 	 * with the following combinations:
1544 	 *
1545 	 * error    na
1546 	 *   0	   NULL		type doesn't match
1547 	 *  !0	   NULL		type matches, but na creation/lookup failed
1548 	 *   0	  !NULL		type matches and na created/found
1549 	 *  !0    !NULL		impossible
1550 	 */
1551 	error = netmap_get_null_na(hdr, na, nmd, create);
1552 	if (error || *na != NULL)
1553 		goto out;
1554 
1555 	/* try to see if this is a monitor port */
1556 	error = netmap_get_monitor_na(hdr, na, nmd, create);
1557 	if (error || *na != NULL)
1558 		goto out;
1559 
1560 	/* try to see if this is a pipe port */
1561 	error = netmap_get_pipe_na(hdr, na, nmd, create);
1562 	if (error || *na != NULL)
1563 		goto out;
1564 
1565 	/* try to see if this is a bridge port */
1566 	error = netmap_get_vale_na(hdr, na, nmd, create);
1567 	if (error)
1568 		goto out;
1569 
1570 	if (*na != NULL) /* valid match in netmap_get_bdg_na() */
1571 		goto out;
1572 
1573 	/*
1574 	 * This must be a hardware na, lookup the name in the system.
1575 	 * Note that by hardware we actually mean "it shows up in ifconfig".
1576 	 * This may still be a tap, a veth/epair, or even a
1577 	 * persistent VALE port.
1578 	 */
1579 	*ifp = ifunit_ref(hdr->nr_name);
1580 	if (*ifp == NULL) {
1581 		error = ENXIO;
1582 		goto out;
1583 	}
1584 
1585 	error = netmap_get_hw_na(*ifp, nmd, &ret);
1586 	if (error)
1587 		goto out;
1588 
1589 	*na = ret;
1590 	netmap_adapter_get(ret);
1591 
1592 	/*
1593 	 * if the adapter supports the host rings and it is not alread open,
1594 	 * try to set the number of host rings as requested by the user
1595 	 */
1596 	if (((*na)->na_flags & NAF_HOST_RINGS) && (*na)->active_fds == 0) {
1597 		if (req->nr_host_tx_rings)
1598 			(*na)->num_host_tx_rings = req->nr_host_tx_rings;
1599 		if (req->nr_host_rx_rings)
1600 			(*na)->num_host_rx_rings = req->nr_host_rx_rings;
1601 	}
1602 	nm_prdis("%s: host tx %d rx %u", (*na)->name, (*na)->num_host_tx_rings,
1603 			(*na)->num_host_rx_rings);
1604 
1605 out:
1606 	if (error) {
1607 		if (ret)
1608 			netmap_adapter_put(ret);
1609 		if (*ifp) {
1610 			if_rele(*ifp);
1611 			*ifp = NULL;
1612 		}
1613 	}
1614 	if (nmd_ref)
1615 		netmap_mem_put(nmd);
1616 
1617 	return error;
1618 }
1619 
1620 /* undo netmap_get_na() */
1621 void
1622 netmap_unget_na(struct netmap_adapter *na, struct ifnet *ifp)
1623 {
1624 	if (ifp)
1625 		if_rele(ifp);
1626 	if (na)
1627 		netmap_adapter_put(na);
1628 }
1629 
1630 
1631 #define NM_FAIL_ON(t) do {						\
1632 	if (unlikely(t)) {						\
1633 		nm_prlim(5, "%s: fail '" #t "' "				\
1634 			"h %d c %d t %d "				\
1635 			"rh %d rc %d rt %d "				\
1636 			"hc %d ht %d",					\
1637 			kring->name,					\
1638 			head, cur, ring->tail,				\
1639 			kring->rhead, kring->rcur, kring->rtail,	\
1640 			kring->nr_hwcur, kring->nr_hwtail);		\
1641 		return kring->nkr_num_slots;				\
1642 	}								\
1643 } while (0)
1644 
1645 /*
1646  * validate parameters on entry for *_txsync()
1647  * Returns ring->cur if ok, or something >= kring->nkr_num_slots
1648  * in case of error.
1649  *
1650  * rhead, rcur and rtail=hwtail are stored from previous round.
1651  * hwcur is the next packet to send to the ring.
1652  *
1653  * We want
1654  *    hwcur <= *rhead <= head <= cur <= tail = *rtail <= hwtail
1655  *
1656  * hwcur, rhead, rtail and hwtail are reliable
1657  */
1658 u_int
1659 nm_txsync_prologue(struct netmap_kring *kring, struct netmap_ring *ring)
1660 {
1661 	u_int head = ring->head; /* read only once */
1662 	u_int cur = ring->cur; /* read only once */
1663 	u_int n = kring->nkr_num_slots;
1664 
1665 	nm_prdis(5, "%s kcur %d ktail %d head %d cur %d tail %d",
1666 		kring->name,
1667 		kring->nr_hwcur, kring->nr_hwtail,
1668 		ring->head, ring->cur, ring->tail);
1669 #if 1 /* kernel sanity checks; but we can trust the kring. */
1670 	NM_FAIL_ON(kring->nr_hwcur >= n || kring->rhead >= n ||
1671 	    kring->rtail >= n ||  kring->nr_hwtail >= n);
1672 #endif /* kernel sanity checks */
1673 	/*
1674 	 * user sanity checks. We only use head,
1675 	 * A, B, ... are possible positions for head:
1676 	 *
1677 	 *  0    A  rhead   B  rtail   C  n-1
1678 	 *  0    D  rtail   E  rhead   F  n-1
1679 	 *
1680 	 * B, F, D are valid. A, C, E are wrong
1681 	 */
1682 	if (kring->rtail >= kring->rhead) {
1683 		/* want rhead <= head <= rtail */
1684 		NM_FAIL_ON(head < kring->rhead || head > kring->rtail);
1685 		/* and also head <= cur <= rtail */
1686 		NM_FAIL_ON(cur < head || cur > kring->rtail);
1687 	} else { /* here rtail < rhead */
1688 		/* we need head outside rtail .. rhead */
1689 		NM_FAIL_ON(head > kring->rtail && head < kring->rhead);
1690 
1691 		/* two cases now: head <= rtail or head >= rhead  */
1692 		if (head <= kring->rtail) {
1693 			/* want head <= cur <= rtail */
1694 			NM_FAIL_ON(cur < head || cur > kring->rtail);
1695 		} else { /* head >= rhead */
1696 			/* cur must be outside rtail..head */
1697 			NM_FAIL_ON(cur > kring->rtail && cur < head);
1698 		}
1699 	}
1700 	if (ring->tail != kring->rtail) {
1701 		nm_prlim(5, "%s tail overwritten was %d need %d", kring->name,
1702 			ring->tail, kring->rtail);
1703 		ring->tail = kring->rtail;
1704 	}
1705 	kring->rhead = head;
1706 	kring->rcur = cur;
1707 	return head;
1708 }
1709 
1710 
1711 /*
1712  * validate parameters on entry for *_rxsync()
1713  * Returns ring->head if ok, kring->nkr_num_slots on error.
1714  *
1715  * For a valid configuration,
1716  * hwcur <= head <= cur <= tail <= hwtail
1717  *
1718  * We only consider head and cur.
1719  * hwcur and hwtail are reliable.
1720  *
1721  */
1722 u_int
1723 nm_rxsync_prologue(struct netmap_kring *kring, struct netmap_ring *ring)
1724 {
1725 	uint32_t const n = kring->nkr_num_slots;
1726 	uint32_t head, cur;
1727 
1728 	nm_prdis(5,"%s kc %d kt %d h %d c %d t %d",
1729 		kring->name,
1730 		kring->nr_hwcur, kring->nr_hwtail,
1731 		ring->head, ring->cur, ring->tail);
1732 	/*
1733 	 * Before storing the new values, we should check they do not
1734 	 * move backwards. However:
1735 	 * - head is not an issue because the previous value is hwcur;
1736 	 * - cur could in principle go back, however it does not matter
1737 	 *   because we are processing a brand new rxsync()
1738 	 */
1739 	cur = kring->rcur = ring->cur;	/* read only once */
1740 	head = kring->rhead = ring->head;	/* read only once */
1741 #if 1 /* kernel sanity checks */
1742 	NM_FAIL_ON(kring->nr_hwcur >= n || kring->nr_hwtail >= n);
1743 #endif /* kernel sanity checks */
1744 	/* user sanity checks */
1745 	if (kring->nr_hwtail >= kring->nr_hwcur) {
1746 		/* want hwcur <= rhead <= hwtail */
1747 		NM_FAIL_ON(head < kring->nr_hwcur || head > kring->nr_hwtail);
1748 		/* and also rhead <= rcur <= hwtail */
1749 		NM_FAIL_ON(cur < head || cur > kring->nr_hwtail);
1750 	} else {
1751 		/* we need rhead outside hwtail..hwcur */
1752 		NM_FAIL_ON(head < kring->nr_hwcur && head > kring->nr_hwtail);
1753 		/* two cases now: head <= hwtail or head >= hwcur  */
1754 		if (head <= kring->nr_hwtail) {
1755 			/* want head <= cur <= hwtail */
1756 			NM_FAIL_ON(cur < head || cur > kring->nr_hwtail);
1757 		} else {
1758 			/* cur must be outside hwtail..head */
1759 			NM_FAIL_ON(cur < head && cur > kring->nr_hwtail);
1760 		}
1761 	}
1762 	if (ring->tail != kring->rtail) {
1763 		nm_prlim(5, "%s tail overwritten was %d need %d",
1764 			kring->name,
1765 			ring->tail, kring->rtail);
1766 		ring->tail = kring->rtail;
1767 	}
1768 	return head;
1769 }
1770 
1771 
1772 /*
1773  * Error routine called when txsync/rxsync detects an error.
1774  * Can't do much more than resetting head = cur = hwcur, tail = hwtail
1775  * Return 1 on reinit.
1776  *
1777  * This routine is only called by the upper half of the kernel.
1778  * It only reads hwcur (which is changed only by the upper half, too)
1779  * and hwtail (which may be changed by the lower half, but only on
1780  * a tx ring and only to increase it, so any error will be recovered
1781  * on the next call). For the above, we don't strictly need to call
1782  * it under lock.
1783  */
1784 int
1785 netmap_ring_reinit(struct netmap_kring *kring)
1786 {
1787 	struct netmap_ring *ring = kring->ring;
1788 	u_int i, lim = kring->nkr_num_slots - 1;
1789 	int errors = 0;
1790 
1791 	// XXX KASSERT nm_kr_tryget
1792 	nm_prlim(10, "called for %s", kring->name);
1793 	// XXX probably wrong to trust userspace
1794 	kring->rhead = ring->head;
1795 	kring->rcur  = ring->cur;
1796 	kring->rtail = ring->tail;
1797 
1798 	if (ring->cur > lim)
1799 		errors++;
1800 	if (ring->head > lim)
1801 		errors++;
1802 	if (ring->tail > lim)
1803 		errors++;
1804 	for (i = 0; i <= lim; i++) {
1805 		u_int idx = ring->slot[i].buf_idx;
1806 		u_int len = ring->slot[i].len;
1807 		if (idx < 2 || idx >= kring->na->na_lut.objtotal) {
1808 			nm_prlim(5, "bad index at slot %d idx %d len %d ", i, idx, len);
1809 			ring->slot[i].buf_idx = 0;
1810 			ring->slot[i].len = 0;
1811 		} else if (len > NETMAP_BUF_SIZE(kring->na)) {
1812 			ring->slot[i].len = 0;
1813 			nm_prlim(5, "bad len at slot %d idx %d len %d", i, idx, len);
1814 		}
1815 	}
1816 	if (errors) {
1817 		nm_prlim(10, "total %d errors", errors);
1818 		nm_prlim(10, "%s reinit, cur %d -> %d tail %d -> %d",
1819 			kring->name,
1820 			ring->cur, kring->nr_hwcur,
1821 			ring->tail, kring->nr_hwtail);
1822 		ring->head = kring->rhead = kring->nr_hwcur;
1823 		ring->cur  = kring->rcur  = kring->nr_hwcur;
1824 		ring->tail = kring->rtail = kring->nr_hwtail;
1825 	}
1826 	return (errors ? 1 : 0);
1827 }
1828 
1829 /* interpret the ringid and flags fields of an nmreq, by translating them
1830  * into a pair of intervals of ring indices:
1831  *
1832  * [priv->np_txqfirst, priv->np_txqlast) and
1833  * [priv->np_rxqfirst, priv->np_rxqlast)
1834  *
1835  */
1836 int
1837 netmap_interp_ringid(struct netmap_priv_d *priv, uint32_t nr_mode,
1838 			uint16_t nr_ringid, uint64_t nr_flags)
1839 {
1840 	struct netmap_adapter *na = priv->np_na;
1841 	int excluded_direction[] = { NR_TX_RINGS_ONLY, NR_RX_RINGS_ONLY };
1842 	enum txrx t;
1843 	u_int j;
1844 
1845 	for_rx_tx(t) {
1846 		if (nr_flags & excluded_direction[t]) {
1847 			priv->np_qfirst[t] = priv->np_qlast[t] = 0;
1848 			continue;
1849 		}
1850 		switch (nr_mode) {
1851 		case NR_REG_ALL_NIC:
1852 		case NR_REG_NULL:
1853 			priv->np_qfirst[t] = 0;
1854 			priv->np_qlast[t] = nma_get_nrings(na, t);
1855 			nm_prdis("ALL/PIPE: %s %d %d", nm_txrx2str(t),
1856 				priv->np_qfirst[t], priv->np_qlast[t]);
1857 			break;
1858 		case NR_REG_SW:
1859 		case NR_REG_NIC_SW:
1860 			if (!(na->na_flags & NAF_HOST_RINGS)) {
1861 				nm_prerr("host rings not supported");
1862 				return EINVAL;
1863 			}
1864 			priv->np_qfirst[t] = (nr_mode == NR_REG_SW ?
1865 				nma_get_nrings(na, t) : 0);
1866 			priv->np_qlast[t] = netmap_all_rings(na, t);
1867 			nm_prdis("%s: %s %d %d", nr_mode == NR_REG_SW ? "SW" : "NIC+SW",
1868 				nm_txrx2str(t),
1869 				priv->np_qfirst[t], priv->np_qlast[t]);
1870 			break;
1871 		case NR_REG_ONE_NIC:
1872 			if (nr_ringid >= na->num_tx_rings &&
1873 					nr_ringid >= na->num_rx_rings) {
1874 				nm_prerr("invalid ring id %d", nr_ringid);
1875 				return EINVAL;
1876 			}
1877 			/* if not enough rings, use the first one */
1878 			j = nr_ringid;
1879 			if (j >= nma_get_nrings(na, t))
1880 				j = 0;
1881 			priv->np_qfirst[t] = j;
1882 			priv->np_qlast[t] = j + 1;
1883 			nm_prdis("ONE_NIC: %s %d %d", nm_txrx2str(t),
1884 				priv->np_qfirst[t], priv->np_qlast[t]);
1885 			break;
1886 		case NR_REG_ONE_SW:
1887 			if (!(na->na_flags & NAF_HOST_RINGS)) {
1888 				nm_prerr("host rings not supported");
1889 				return EINVAL;
1890 			}
1891 			if (nr_ringid >= na->num_host_tx_rings &&
1892 					nr_ringid >= na->num_host_rx_rings) {
1893 				nm_prerr("invalid ring id %d", nr_ringid);
1894 				return EINVAL;
1895 			}
1896 			/* if not enough rings, use the first one */
1897 			j = nr_ringid;
1898 			if (j >= nma_get_host_nrings(na, t))
1899 				j = 0;
1900 			priv->np_qfirst[t] = nma_get_nrings(na, t) + j;
1901 			priv->np_qlast[t] = nma_get_nrings(na, t) + j + 1;
1902 			nm_prdis("ONE_SW: %s %d %d", nm_txrx2str(t),
1903 				priv->np_qfirst[t], priv->np_qlast[t]);
1904 			break;
1905 		default:
1906 			nm_prerr("invalid regif type %d", nr_mode);
1907 			return EINVAL;
1908 		}
1909 	}
1910 	priv->np_flags = nr_flags;
1911 
1912 	/* Allow transparent forwarding mode in the host --> nic
1913 	 * direction only if all the TX hw rings have been opened. */
1914 	if (priv->np_qfirst[NR_TX] == 0 &&
1915 			priv->np_qlast[NR_TX] >= na->num_tx_rings) {
1916 		priv->np_sync_flags |= NAF_CAN_FORWARD_DOWN;
1917 	}
1918 
1919 	if (netmap_verbose) {
1920 		nm_prinf("%s: tx [%d,%d) rx [%d,%d) id %d",
1921 			na->name,
1922 			priv->np_qfirst[NR_TX],
1923 			priv->np_qlast[NR_TX],
1924 			priv->np_qfirst[NR_RX],
1925 			priv->np_qlast[NR_RX],
1926 			nr_ringid);
1927 	}
1928 	return 0;
1929 }
1930 
1931 
1932 /*
1933  * Set the ring ID. For devices with a single queue, a request
1934  * for all rings is the same as a single ring.
1935  */
1936 static int
1937 netmap_set_ringid(struct netmap_priv_d *priv, uint32_t nr_mode,
1938 		uint16_t nr_ringid, uint64_t nr_flags)
1939 {
1940 	struct netmap_adapter *na = priv->np_na;
1941 	int error;
1942 	enum txrx t;
1943 
1944 	error = netmap_interp_ringid(priv, nr_mode, nr_ringid, nr_flags);
1945 	if (error) {
1946 		return error;
1947 	}
1948 
1949 	priv->np_txpoll = (nr_flags & NR_NO_TX_POLL) ? 0 : 1;
1950 
1951 	/* optimization: count the users registered for more than
1952 	 * one ring, which are the ones sleeping on the global queue.
1953 	 * The default netmap_notify() callback will then
1954 	 * avoid signaling the global queue if nobody is using it
1955 	 */
1956 	for_rx_tx(t) {
1957 		if (nm_si_user(priv, t))
1958 			na->si_users[t]++;
1959 	}
1960 	return 0;
1961 }
1962 
1963 static void
1964 netmap_unset_ringid(struct netmap_priv_d *priv)
1965 {
1966 	struct netmap_adapter *na = priv->np_na;
1967 	enum txrx t;
1968 
1969 	for_rx_tx(t) {
1970 		if (nm_si_user(priv, t))
1971 			na->si_users[t]--;
1972 		priv->np_qfirst[t] = priv->np_qlast[t] = 0;
1973 	}
1974 	priv->np_flags = 0;
1975 	priv->np_txpoll = 0;
1976 	priv->np_kloop_state = 0;
1977 }
1978 
1979 
1980 /* Set the nr_pending_mode for the requested rings.
1981  * If requested, also try to get exclusive access to the rings, provided
1982  * the rings we want to bind are not exclusively owned by a previous bind.
1983  */
1984 static int
1985 netmap_krings_get(struct netmap_priv_d *priv)
1986 {
1987 	struct netmap_adapter *na = priv->np_na;
1988 	u_int i;
1989 	struct netmap_kring *kring;
1990 	int excl = (priv->np_flags & NR_EXCLUSIVE);
1991 	enum txrx t;
1992 
1993 	if (netmap_debug & NM_DEBUG_ON)
1994 		nm_prinf("%s: grabbing tx [%d, %d) rx [%d, %d)",
1995 			na->name,
1996 			priv->np_qfirst[NR_TX],
1997 			priv->np_qlast[NR_TX],
1998 			priv->np_qfirst[NR_RX],
1999 			priv->np_qlast[NR_RX]);
2000 
2001 	/* first round: check that all the requested rings
2002 	 * are neither alread exclusively owned, nor we
2003 	 * want exclusive ownership when they are already in use
2004 	 */
2005 	for_rx_tx(t) {
2006 		for (i = priv->np_qfirst[t]; i < priv->np_qlast[t]; i++) {
2007 			kring = NMR(na, t)[i];
2008 			if ((kring->nr_kflags & NKR_EXCLUSIVE) ||
2009 			    (kring->users && excl))
2010 			{
2011 				nm_prdis("ring %s busy", kring->name);
2012 				return EBUSY;
2013 			}
2014 		}
2015 	}
2016 
2017 	/* second round: increment usage count (possibly marking them
2018 	 * as exclusive) and set the nr_pending_mode
2019 	 */
2020 	for_rx_tx(t) {
2021 		for (i = priv->np_qfirst[t]; i < priv->np_qlast[t]; i++) {
2022 			kring = NMR(na, t)[i];
2023 			kring->users++;
2024 			if (excl)
2025 				kring->nr_kflags |= NKR_EXCLUSIVE;
2026 	                kring->nr_pending_mode = NKR_NETMAP_ON;
2027 		}
2028 	}
2029 
2030 	return 0;
2031 
2032 }
2033 
2034 /* Undo netmap_krings_get(). This is done by clearing the exclusive mode
2035  * if was asked on regif, and unset the nr_pending_mode if we are the
2036  * last users of the involved rings. */
2037 static void
2038 netmap_krings_put(struct netmap_priv_d *priv)
2039 {
2040 	struct netmap_adapter *na = priv->np_na;
2041 	u_int i;
2042 	struct netmap_kring *kring;
2043 	int excl = (priv->np_flags & NR_EXCLUSIVE);
2044 	enum txrx t;
2045 
2046 	nm_prdis("%s: releasing tx [%d, %d) rx [%d, %d)",
2047 			na->name,
2048 			priv->np_qfirst[NR_TX],
2049 			priv->np_qlast[NR_TX],
2050 			priv->np_qfirst[NR_RX],
2051 			priv->np_qlast[MR_RX]);
2052 
2053 	for_rx_tx(t) {
2054 		for (i = priv->np_qfirst[t]; i < priv->np_qlast[t]; i++) {
2055 			kring = NMR(na, t)[i];
2056 			if (excl)
2057 				kring->nr_kflags &= ~NKR_EXCLUSIVE;
2058 			kring->users--;
2059 			if (kring->users == 0)
2060 				kring->nr_pending_mode = NKR_NETMAP_OFF;
2061 		}
2062 	}
2063 }
2064 
2065 static int
2066 nm_priv_rx_enabled(struct netmap_priv_d *priv)
2067 {
2068 	return (priv->np_qfirst[NR_RX] != priv->np_qlast[NR_RX]);
2069 }
2070 
2071 /* Validate the CSB entries for both directions (atok and ktoa).
2072  * To be called under NMG_LOCK(). */
2073 static int
2074 netmap_csb_validate(struct netmap_priv_d *priv, struct nmreq_opt_csb *csbo)
2075 {
2076 	struct nm_csb_atok *csb_atok_base =
2077 		(struct nm_csb_atok *)(uintptr_t)csbo->csb_atok;
2078 	struct nm_csb_ktoa *csb_ktoa_base =
2079 		(struct nm_csb_ktoa *)(uintptr_t)csbo->csb_ktoa;
2080 	enum txrx t;
2081 	int num_rings[NR_TXRX], tot_rings;
2082 	size_t entry_size[2];
2083 	void *csb_start[2];
2084 	int i;
2085 
2086 	if (priv->np_kloop_state & NM_SYNC_KLOOP_RUNNING) {
2087 		nm_prerr("Cannot update CSB while kloop is running");
2088 		return EBUSY;
2089 	}
2090 
2091 	tot_rings = 0;
2092 	for_rx_tx(t) {
2093 		num_rings[t] = priv->np_qlast[t] - priv->np_qfirst[t];
2094 		tot_rings += num_rings[t];
2095 	}
2096 	if (tot_rings <= 0)
2097 		return 0;
2098 
2099 	if (!(priv->np_flags & NR_EXCLUSIVE)) {
2100 		nm_prerr("CSB mode requires NR_EXCLUSIVE");
2101 		return EINVAL;
2102 	}
2103 
2104 	entry_size[0] = sizeof(*csb_atok_base);
2105 	entry_size[1] = sizeof(*csb_ktoa_base);
2106 	csb_start[0] = (void *)csb_atok_base;
2107 	csb_start[1] = (void *)csb_ktoa_base;
2108 
2109 	for (i = 0; i < 2; i++) {
2110 		/* On Linux we could use access_ok() to simplify
2111 		 * the validation. However, the advantage of
2112 		 * this approach is that it works also on
2113 		 * FreeBSD. */
2114 		size_t csb_size = tot_rings * entry_size[i];
2115 		void *tmp;
2116 		int err;
2117 
2118 		if ((uintptr_t)csb_start[i] & (entry_size[i]-1)) {
2119 			nm_prerr("Unaligned CSB address");
2120 			return EINVAL;
2121 		}
2122 
2123 		tmp = nm_os_malloc(csb_size);
2124 		if (!tmp)
2125 			return ENOMEM;
2126 		if (i == 0) {
2127 			/* Application --> kernel direction. */
2128 			err = copyin(csb_start[i], tmp, csb_size);
2129 		} else {
2130 			/* Kernel --> application direction. */
2131 			memset(tmp, 0, csb_size);
2132 			err = copyout(tmp, csb_start[i], csb_size);
2133 		}
2134 		nm_os_free(tmp);
2135 		if (err) {
2136 			nm_prerr("Invalid CSB address");
2137 			return err;
2138 		}
2139 	}
2140 
2141 	priv->np_csb_atok_base = csb_atok_base;
2142 	priv->np_csb_ktoa_base = csb_ktoa_base;
2143 
2144 	/* Initialize the CSB. */
2145 	for_rx_tx(t) {
2146 		for (i = 0; i < num_rings[t]; i++) {
2147 			struct netmap_kring *kring =
2148 				NMR(priv->np_na, t)[i + priv->np_qfirst[t]];
2149 			struct nm_csb_atok *csb_atok = csb_atok_base + i;
2150 			struct nm_csb_ktoa *csb_ktoa = csb_ktoa_base + i;
2151 
2152 			if (t == NR_RX) {
2153 				csb_atok += num_rings[NR_TX];
2154 				csb_ktoa += num_rings[NR_TX];
2155 			}
2156 
2157 			CSB_WRITE(csb_atok, head, kring->rhead);
2158 			CSB_WRITE(csb_atok, cur, kring->rcur);
2159 			CSB_WRITE(csb_atok, appl_need_kick, 1);
2160 			CSB_WRITE(csb_atok, sync_flags, 1);
2161 			CSB_WRITE(csb_ktoa, hwcur, kring->nr_hwcur);
2162 			CSB_WRITE(csb_ktoa, hwtail, kring->nr_hwtail);
2163 			CSB_WRITE(csb_ktoa, kern_need_kick, 1);
2164 
2165 			nm_prinf("csb_init for kring %s: head %u, cur %u, "
2166 				"hwcur %u, hwtail %u", kring->name,
2167 				kring->rhead, kring->rcur, kring->nr_hwcur,
2168 				kring->nr_hwtail);
2169 		}
2170 	}
2171 
2172 	return 0;
2173 }
2174 
2175 /* Ensure that the netmap adapter can support the given MTU.
2176  * @return EINVAL if the na cannot be set to mtu, 0 otherwise.
2177  */
2178 int
2179 netmap_buf_size_validate(const struct netmap_adapter *na, unsigned mtu) {
2180 	unsigned nbs = NETMAP_BUF_SIZE(na);
2181 
2182 	if (mtu <= na->rx_buf_maxsize) {
2183 		/* The MTU fits a single NIC slot. We only
2184 		 * Need to check that netmap buffers are
2185 		 * large enough to hold an MTU. NS_MOREFRAG
2186 		 * cannot be used in this case. */
2187 		if (nbs < mtu) {
2188 			nm_prerr("error: netmap buf size (%u) "
2189 				 "< device MTU (%u)", nbs, mtu);
2190 			return EINVAL;
2191 		}
2192 	} else {
2193 		/* More NIC slots may be needed to receive
2194 		 * or transmit a single packet. Check that
2195 		 * the adapter supports NS_MOREFRAG and that
2196 		 * netmap buffers are large enough to hold
2197 		 * the maximum per-slot size. */
2198 		if (!(na->na_flags & NAF_MOREFRAG)) {
2199 			nm_prerr("error: large MTU (%d) needed "
2200 				 "but %s does not support "
2201 				 "NS_MOREFRAG", mtu,
2202 				 na->ifp->if_xname);
2203 			return EINVAL;
2204 		} else if (nbs < na->rx_buf_maxsize) {
2205 			nm_prerr("error: using NS_MOREFRAG on "
2206 				 "%s requires netmap buf size "
2207 				 ">= %u", na->ifp->if_xname,
2208 				 na->rx_buf_maxsize);
2209 			return EINVAL;
2210 		} else {
2211 			nm_prinf("info: netmap application on "
2212 				 "%s needs to support "
2213 				 "NS_MOREFRAG "
2214 				 "(MTU=%u,netmap_buf_size=%u)",
2215 				 na->ifp->if_xname, mtu, nbs);
2216 		}
2217 	}
2218 	return 0;
2219 }
2220 
2221 
2222 /*
2223  * possibly move the interface to netmap-mode.
2224  * If success it returns a pointer to netmap_if, otherwise NULL.
2225  * This must be called with NMG_LOCK held.
2226  *
2227  * The following na callbacks are called in the process:
2228  *
2229  * na->nm_config()			[by netmap_update_config]
2230  * (get current number and size of rings)
2231  *
2232  *  	We have a generic one for linux (netmap_linux_config).
2233  *  	The bwrap has to override this, since it has to forward
2234  *  	the request to the wrapped adapter (netmap_bwrap_config).
2235  *
2236  *
2237  * na->nm_krings_create()
2238  * (create and init the krings array)
2239  *
2240  * 	One of the following:
2241  *
2242  *	* netmap_hw_krings_create, 			(hw ports)
2243  *		creates the standard layout for the krings
2244  * 		and adds the mbq (used for the host rings).
2245  *
2246  * 	* netmap_vp_krings_create			(VALE ports)
2247  * 		add leases and scratchpads
2248  *
2249  * 	* netmap_pipe_krings_create			(pipes)
2250  * 		create the krings and rings of both ends and
2251  * 		cross-link them
2252  *
2253  *      * netmap_monitor_krings_create 			(monitors)
2254  *      	avoid allocating the mbq
2255  *
2256  *      * netmap_bwrap_krings_create			(bwraps)
2257  *      	create both the brap krings array,
2258  *      	the krings array of the wrapped adapter, and
2259  *      	(if needed) the fake array for the host adapter
2260  *
2261  * na->nm_register(, 1)
2262  * (put the adapter in netmap mode)
2263  *
2264  * 	This may be one of the following:
2265  *
2266  * 	* netmap_hw_reg				        (hw ports)
2267  * 		checks that the ifp is still there, then calls
2268  * 		the hardware specific callback;
2269  *
2270  * 	* netmap_vp_reg					(VALE ports)
2271  *		If the port is connected to a bridge,
2272  *		set the NAF_NETMAP_ON flag under the
2273  *		bridge write lock.
2274  *
2275  *	* netmap_pipe_reg				(pipes)
2276  *		inform the other pipe end that it is no
2277  *		longer responsible for the lifetime of this
2278  *		pipe end
2279  *
2280  *	* netmap_monitor_reg				(monitors)
2281  *		intercept the sync callbacks of the monitored
2282  *		rings
2283  *
2284  *	* netmap_bwrap_reg				(bwraps)
2285  *		cross-link the bwrap and hwna rings,
2286  *		forward the request to the hwna, override
2287  *		the hwna notify callback (to get the frames
2288  *		coming from outside go through the bridge).
2289  *
2290  *
2291  */
2292 int
2293 netmap_do_regif(struct netmap_priv_d *priv, struct netmap_adapter *na,
2294 	uint32_t nr_mode, uint16_t nr_ringid, uint64_t nr_flags)
2295 {
2296 	struct netmap_if *nifp = NULL;
2297 	int error;
2298 
2299 	NMG_LOCK_ASSERT();
2300 	priv->np_na = na;     /* store the reference */
2301 	error = netmap_mem_finalize(na->nm_mem, na);
2302 	if (error)
2303 		goto err;
2304 
2305 	if (na->active_fds == 0) {
2306 
2307 		/* cache the allocator info in the na */
2308 		error = netmap_mem_get_lut(na->nm_mem, &na->na_lut);
2309 		if (error)
2310 			goto err_drop_mem;
2311 		nm_prdis("lut %p bufs %u size %u", na->na_lut.lut, na->na_lut.objtotal,
2312 					    na->na_lut.objsize);
2313 
2314 		/* ring configuration may have changed, fetch from the card */
2315 		netmap_update_config(na);
2316 	}
2317 
2318 	/* compute the range of tx and rx rings to monitor */
2319 	error = netmap_set_ringid(priv, nr_mode, nr_ringid, nr_flags);
2320 	if (error)
2321 		goto err_put_lut;
2322 
2323 	if (na->active_fds == 0) {
2324 		/*
2325 		 * If this is the first registration of the adapter,
2326 		 * perform sanity checks and create the in-kernel view
2327 		 * of the netmap rings (the netmap krings).
2328 		 */
2329 		if (na->ifp && nm_priv_rx_enabled(priv)) {
2330 			/* This netmap adapter is attached to an ifnet. */
2331 			unsigned mtu = nm_os_ifnet_mtu(na->ifp);
2332 
2333 			nm_prdis("%s: mtu %d rx_buf_maxsize %d netmap_buf_size %d",
2334 				na->name, mtu, na->rx_buf_maxsize, NETMAP_BUF_SIZE(na));
2335 
2336 			if (na->rx_buf_maxsize == 0) {
2337 				nm_prerr("%s: error: rx_buf_maxsize == 0", na->name);
2338 				error = EIO;
2339 				goto err_drop_mem;
2340 			}
2341 
2342 			error = netmap_buf_size_validate(na, mtu);
2343 			if (error)
2344 				goto err_drop_mem;
2345 		}
2346 
2347 		/*
2348 		 * Depending on the adapter, this may also create
2349 		 * the netmap rings themselves
2350 		 */
2351 		error = na->nm_krings_create(na);
2352 		if (error)
2353 			goto err_put_lut;
2354 
2355 	}
2356 
2357 	/* now the krings must exist and we can check whether some
2358 	 * previous bind has exclusive ownership on them, and set
2359 	 * nr_pending_mode
2360 	 */
2361 	error = netmap_krings_get(priv);
2362 	if (error)
2363 		goto err_del_krings;
2364 
2365 	/* create all needed missing netmap rings */
2366 	error = netmap_mem_rings_create(na);
2367 	if (error)
2368 		goto err_rel_excl;
2369 
2370 	/* in all cases, create a new netmap if */
2371 	nifp = netmap_mem_if_new(na, priv);
2372 	if (nifp == NULL) {
2373 		error = ENOMEM;
2374 		goto err_rel_excl;
2375 	}
2376 
2377 	if (nm_kring_pending(priv)) {
2378 		/* Some kring is switching mode, tell the adapter to
2379 		 * react on this. */
2380 		error = na->nm_register(na, 1);
2381 		if (error)
2382 			goto err_del_if;
2383 	}
2384 
2385 	/* Commit the reference. */
2386 	na->active_fds++;
2387 
2388 	/*
2389 	 * advertise that the interface is ready by setting np_nifp.
2390 	 * The barrier is needed because readers (poll, *SYNC and mmap)
2391 	 * check for priv->np_nifp != NULL without locking
2392 	 */
2393 	mb(); /* make sure previous writes are visible to all CPUs */
2394 	priv->np_nifp = nifp;
2395 
2396 	return 0;
2397 
2398 err_del_if:
2399 	netmap_mem_if_delete(na, nifp);
2400 err_rel_excl:
2401 	netmap_krings_put(priv);
2402 	netmap_mem_rings_delete(na);
2403 err_del_krings:
2404 	if (na->active_fds == 0)
2405 		na->nm_krings_delete(na);
2406 err_put_lut:
2407 	if (na->active_fds == 0)
2408 		memset(&na->na_lut, 0, sizeof(na->na_lut));
2409 err_drop_mem:
2410 	netmap_mem_drop(na);
2411 err:
2412 	priv->np_na = NULL;
2413 	return error;
2414 }
2415 
2416 
2417 /*
2418  * update kring and ring at the end of rxsync/txsync.
2419  */
2420 static inline void
2421 nm_sync_finalize(struct netmap_kring *kring)
2422 {
2423 	/*
2424 	 * Update ring tail to what the kernel knows
2425 	 * After txsync: head/rhead/hwcur might be behind cur/rcur
2426 	 * if no carrier.
2427 	 */
2428 	kring->ring->tail = kring->rtail = kring->nr_hwtail;
2429 
2430 	nm_prdis(5, "%s now hwcur %d hwtail %d head %d cur %d tail %d",
2431 		kring->name, kring->nr_hwcur, kring->nr_hwtail,
2432 		kring->rhead, kring->rcur, kring->rtail);
2433 }
2434 
2435 /* set ring timestamp */
2436 static inline void
2437 ring_timestamp_set(struct netmap_ring *ring)
2438 {
2439 	if (netmap_no_timestamp == 0 || ring->flags & NR_TIMESTAMP) {
2440 		microtime(&ring->ts);
2441 	}
2442 }
2443 
2444 static int nmreq_copyin(struct nmreq_header *, int);
2445 static int nmreq_copyout(struct nmreq_header *, int);
2446 static int nmreq_checkoptions(struct nmreq_header *);
2447 
2448 /*
2449  * ioctl(2) support for the "netmap" device.
2450  *
2451  * Following a list of accepted commands:
2452  * - NIOCCTRL		device control API
2453  * - NIOCTXSYNC		sync TX rings
2454  * - NIOCRXSYNC		sync RX rings
2455  * - SIOCGIFADDR	just for convenience
2456  * - NIOCGINFO		deprecated (legacy API)
2457  * - NIOCREGIF		deprecated (legacy API)
2458  *
2459  * Return 0 on success, errno otherwise.
2460  */
2461 int
2462 netmap_ioctl(struct netmap_priv_d *priv, u_long cmd, caddr_t data,
2463 		struct thread *td, int nr_body_is_user)
2464 {
2465 	struct mbq q;	/* packets from RX hw queues to host stack */
2466 	struct netmap_adapter *na = NULL;
2467 	struct netmap_mem_d *nmd = NULL;
2468 	struct ifnet *ifp = NULL;
2469 	int error = 0;
2470 	u_int i, qfirst, qlast;
2471 	struct netmap_kring **krings;
2472 	int sync_flags;
2473 	enum txrx t;
2474 
2475 	switch (cmd) {
2476 	case NIOCCTRL: {
2477 		struct nmreq_header *hdr = (struct nmreq_header *)data;
2478 
2479 		if (hdr->nr_version < NETMAP_MIN_API ||
2480 		    hdr->nr_version > NETMAP_MAX_API) {
2481 			nm_prerr("API mismatch: got %d need %d",
2482 				hdr->nr_version, NETMAP_API);
2483 			return EINVAL;
2484 		}
2485 
2486 		/* Make a kernel-space copy of the user-space nr_body.
2487 		 * For convenince, the nr_body pointer and the pointers
2488 		 * in the options list will be replaced with their
2489 		 * kernel-space counterparts. The original pointers are
2490 		 * saved internally and later restored by nmreq_copyout
2491 		 */
2492 		error = nmreq_copyin(hdr, nr_body_is_user);
2493 		if (error) {
2494 			return error;
2495 		}
2496 
2497 		/* Sanitize hdr->nr_name. */
2498 		hdr->nr_name[sizeof(hdr->nr_name) - 1] = '\0';
2499 
2500 		switch (hdr->nr_reqtype) {
2501 		case NETMAP_REQ_REGISTER: {
2502 			struct nmreq_register *req =
2503 				(struct nmreq_register *)(uintptr_t)hdr->nr_body;
2504 			struct netmap_if *nifp;
2505 
2506 			/* Protect access to priv from concurrent requests. */
2507 			NMG_LOCK();
2508 			do {
2509 				struct nmreq_option *opt;
2510 				u_int memflags;
2511 
2512 				if (priv->np_nifp != NULL) {	/* thread already registered */
2513 					error = EBUSY;
2514 					break;
2515 				}
2516 
2517 #ifdef WITH_EXTMEM
2518 				opt = nmreq_getoption(hdr, NETMAP_REQ_OPT_EXTMEM);
2519 				if (opt != NULL) {
2520 					struct nmreq_opt_extmem *e =
2521 						(struct nmreq_opt_extmem *)opt;
2522 
2523 					nmd = netmap_mem_ext_create(e->nro_usrptr,
2524 							&e->nro_info, &error);
2525 					opt->nro_status = error;
2526 					if (nmd == NULL)
2527 						break;
2528 				}
2529 #endif /* WITH_EXTMEM */
2530 
2531 				if (nmd == NULL && req->nr_mem_id) {
2532 					/* find the allocator and get a reference */
2533 					nmd = netmap_mem_find(req->nr_mem_id);
2534 					if (nmd == NULL) {
2535 						if (netmap_verbose) {
2536 							nm_prerr("%s: failed to find mem_id %u",
2537 									hdr->nr_name, req->nr_mem_id);
2538 						}
2539 						error = EINVAL;
2540 						break;
2541 					}
2542 				}
2543 				/* find the interface and a reference */
2544 				error = netmap_get_na(hdr, &na, &ifp, nmd,
2545 						      1 /* create */); /* keep reference */
2546 				if (error)
2547 					break;
2548 				if (NETMAP_OWNED_BY_KERN(na)) {
2549 					error = EBUSY;
2550 					break;
2551 				}
2552 
2553 				if (na->virt_hdr_len && !(req->nr_flags & NR_ACCEPT_VNET_HDR)) {
2554 					nm_prerr("virt_hdr_len=%d, but application does "
2555 						"not accept it", na->virt_hdr_len);
2556 					error = EIO;
2557 					break;
2558 				}
2559 
2560 				error = netmap_do_regif(priv, na, req->nr_mode,
2561 							req->nr_ringid, req->nr_flags);
2562 				if (error) {    /* reg. failed, release priv and ref */
2563 					break;
2564 				}
2565 
2566 				opt = nmreq_getoption(hdr, NETMAP_REQ_OPT_CSB);
2567 				if (opt != NULL) {
2568 					struct nmreq_opt_csb *csbo =
2569 						(struct nmreq_opt_csb *)opt;
2570 					error = netmap_csb_validate(priv, csbo);
2571 					opt->nro_status = error;
2572 					if (error) {
2573 						netmap_do_unregif(priv);
2574 						break;
2575 					}
2576 				}
2577 
2578 				nifp = priv->np_nifp;
2579 
2580 				/* return the offset of the netmap_if object */
2581 				req->nr_rx_rings = na->num_rx_rings;
2582 				req->nr_tx_rings = na->num_tx_rings;
2583 				req->nr_rx_slots = na->num_rx_desc;
2584 				req->nr_tx_slots = na->num_tx_desc;
2585 				req->nr_host_tx_rings = na->num_host_tx_rings;
2586 				req->nr_host_rx_rings = na->num_host_rx_rings;
2587 				error = netmap_mem_get_info(na->nm_mem, &req->nr_memsize, &memflags,
2588 					&req->nr_mem_id);
2589 				if (error) {
2590 					netmap_do_unregif(priv);
2591 					break;
2592 				}
2593 				if (memflags & NETMAP_MEM_PRIVATE) {
2594 					*(uint32_t *)(uintptr_t)&nifp->ni_flags |= NI_PRIV_MEM;
2595 				}
2596 				for_rx_tx(t) {
2597 					priv->np_si[t] = nm_si_user(priv, t) ?
2598 						&na->si[t] : &NMR(na, t)[priv->np_qfirst[t]]->si;
2599 				}
2600 
2601 				if (req->nr_extra_bufs) {
2602 					if (netmap_verbose)
2603 						nm_prinf("requested %d extra buffers",
2604 							req->nr_extra_bufs);
2605 					req->nr_extra_bufs = netmap_extra_alloc(na,
2606 						&nifp->ni_bufs_head, req->nr_extra_bufs);
2607 					if (netmap_verbose)
2608 						nm_prinf("got %d extra buffers", req->nr_extra_bufs);
2609 				}
2610 				req->nr_offset = netmap_mem_if_offset(na->nm_mem, nifp);
2611 
2612 				error = nmreq_checkoptions(hdr);
2613 				if (error) {
2614 					netmap_do_unregif(priv);
2615 					break;
2616 				}
2617 
2618 				/* store ifp reference so that priv destructor may release it */
2619 				priv->np_ifp = ifp;
2620 			} while (0);
2621 			if (error) {
2622 				netmap_unget_na(na, ifp);
2623 			}
2624 			/* release the reference from netmap_mem_find() or
2625 			 * netmap_mem_ext_create()
2626 			 */
2627 			if (nmd)
2628 				netmap_mem_put(nmd);
2629 			NMG_UNLOCK();
2630 			break;
2631 		}
2632 
2633 		case NETMAP_REQ_PORT_INFO_GET: {
2634 			struct nmreq_port_info_get *req =
2635 				(struct nmreq_port_info_get *)(uintptr_t)hdr->nr_body;
2636 
2637 			NMG_LOCK();
2638 			do {
2639 				u_int memflags;
2640 
2641 				if (hdr->nr_name[0] != '\0') {
2642 					/* Build a nmreq_register out of the nmreq_port_info_get,
2643 					 * so that we can call netmap_get_na(). */
2644 					struct nmreq_register regreq;
2645 					bzero(&regreq, sizeof(regreq));
2646 					regreq.nr_mode = NR_REG_ALL_NIC;
2647 					regreq.nr_tx_slots = req->nr_tx_slots;
2648 					regreq.nr_rx_slots = req->nr_rx_slots;
2649 					regreq.nr_tx_rings = req->nr_tx_rings;
2650 					regreq.nr_rx_rings = req->nr_rx_rings;
2651 					regreq.nr_host_tx_rings = req->nr_host_tx_rings;
2652 					regreq.nr_host_rx_rings = req->nr_host_rx_rings;
2653 					regreq.nr_mem_id = req->nr_mem_id;
2654 
2655 					/* get a refcount */
2656 					hdr->nr_reqtype = NETMAP_REQ_REGISTER;
2657 					hdr->nr_body = (uintptr_t)&regreq;
2658 					error = netmap_get_na(hdr, &na, &ifp, NULL, 1 /* create */);
2659 					hdr->nr_reqtype = NETMAP_REQ_PORT_INFO_GET; /* reset type */
2660 					hdr->nr_body = (uintptr_t)req; /* reset nr_body */
2661 					if (error) {
2662 						na = NULL;
2663 						ifp = NULL;
2664 						break;
2665 					}
2666 					nmd = na->nm_mem; /* get memory allocator */
2667 				} else {
2668 					nmd = netmap_mem_find(req->nr_mem_id ? req->nr_mem_id : 1);
2669 					if (nmd == NULL) {
2670 						if (netmap_verbose)
2671 							nm_prerr("%s: failed to find mem_id %u",
2672 									hdr->nr_name,
2673 									req->nr_mem_id ? req->nr_mem_id : 1);
2674 						error = EINVAL;
2675 						break;
2676 					}
2677 				}
2678 
2679 				error = netmap_mem_get_info(nmd, &req->nr_memsize, &memflags,
2680 					&req->nr_mem_id);
2681 				if (error)
2682 					break;
2683 				if (na == NULL) /* only memory info */
2684 					break;
2685 				netmap_update_config(na);
2686 				req->nr_rx_rings = na->num_rx_rings;
2687 				req->nr_tx_rings = na->num_tx_rings;
2688 				req->nr_rx_slots = na->num_rx_desc;
2689 				req->nr_tx_slots = na->num_tx_desc;
2690 				req->nr_host_tx_rings = na->num_host_tx_rings;
2691 				req->nr_host_rx_rings = na->num_host_rx_rings;
2692 			} while (0);
2693 			netmap_unget_na(na, ifp);
2694 			NMG_UNLOCK();
2695 			break;
2696 		}
2697 #ifdef WITH_VALE
2698 		case NETMAP_REQ_VALE_ATTACH: {
2699 			error = netmap_vale_attach(hdr, NULL /* userspace request */);
2700 			break;
2701 		}
2702 
2703 		case NETMAP_REQ_VALE_DETACH: {
2704 			error = netmap_vale_detach(hdr, NULL /* userspace request */);
2705 			break;
2706 		}
2707 
2708 		case NETMAP_REQ_VALE_LIST: {
2709 			error = netmap_vale_list(hdr);
2710 			break;
2711 		}
2712 
2713 		case NETMAP_REQ_PORT_HDR_SET: {
2714 			struct nmreq_port_hdr *req =
2715 				(struct nmreq_port_hdr *)(uintptr_t)hdr->nr_body;
2716 			/* Build a nmreq_register out of the nmreq_port_hdr,
2717 			 * so that we can call netmap_get_bdg_na(). */
2718 			struct nmreq_register regreq;
2719 			bzero(&regreq, sizeof(regreq));
2720 			regreq.nr_mode = NR_REG_ALL_NIC;
2721 
2722 			/* For now we only support virtio-net headers, and only for
2723 			 * VALE ports, but this may change in future. Valid lengths
2724 			 * for the virtio-net header are 0 (no header), 10 and 12. */
2725 			if (req->nr_hdr_len != 0 &&
2726 				req->nr_hdr_len != sizeof(struct nm_vnet_hdr) &&
2727 					req->nr_hdr_len != 12) {
2728 				if (netmap_verbose)
2729 					nm_prerr("invalid hdr_len %u", req->nr_hdr_len);
2730 				error = EINVAL;
2731 				break;
2732 			}
2733 			NMG_LOCK();
2734 			hdr->nr_reqtype = NETMAP_REQ_REGISTER;
2735 			hdr->nr_body = (uintptr_t)&regreq;
2736 			error = netmap_get_vale_na(hdr, &na, NULL, 0);
2737 			hdr->nr_reqtype = NETMAP_REQ_PORT_HDR_SET;
2738 			hdr->nr_body = (uintptr_t)req;
2739 			if (na && !error) {
2740 				struct netmap_vp_adapter *vpna =
2741 					(struct netmap_vp_adapter *)na;
2742 				na->virt_hdr_len = req->nr_hdr_len;
2743 				if (na->virt_hdr_len) {
2744 					vpna->mfs = NETMAP_BUF_SIZE(na);
2745 				}
2746 				if (netmap_verbose)
2747 					nm_prinf("Using vnet_hdr_len %d for %p", na->virt_hdr_len, na);
2748 				netmap_adapter_put(na);
2749 			} else if (!na) {
2750 				error = ENXIO;
2751 			}
2752 			NMG_UNLOCK();
2753 			break;
2754 		}
2755 
2756 		case NETMAP_REQ_PORT_HDR_GET: {
2757 			/* Get vnet-header length for this netmap port */
2758 			struct nmreq_port_hdr *req =
2759 				(struct nmreq_port_hdr *)(uintptr_t)hdr->nr_body;
2760 			/* Build a nmreq_register out of the nmreq_port_hdr,
2761 			 * so that we can call netmap_get_bdg_na(). */
2762 			struct nmreq_register regreq;
2763 			struct ifnet *ifp;
2764 
2765 			bzero(&regreq, sizeof(regreq));
2766 			regreq.nr_mode = NR_REG_ALL_NIC;
2767 			NMG_LOCK();
2768 			hdr->nr_reqtype = NETMAP_REQ_REGISTER;
2769 			hdr->nr_body = (uintptr_t)&regreq;
2770 			error = netmap_get_na(hdr, &na, &ifp, NULL, 0);
2771 			hdr->nr_reqtype = NETMAP_REQ_PORT_HDR_GET;
2772 			hdr->nr_body = (uintptr_t)req;
2773 			if (na && !error) {
2774 				req->nr_hdr_len = na->virt_hdr_len;
2775 			}
2776 			netmap_unget_na(na, ifp);
2777 			NMG_UNLOCK();
2778 			break;
2779 		}
2780 
2781 		case NETMAP_REQ_VALE_NEWIF: {
2782 			error = nm_vi_create(hdr);
2783 			break;
2784 		}
2785 
2786 		case NETMAP_REQ_VALE_DELIF: {
2787 			error = nm_vi_destroy(hdr->nr_name);
2788 			break;
2789 		}
2790 
2791 		case NETMAP_REQ_VALE_POLLING_ENABLE:
2792 		case NETMAP_REQ_VALE_POLLING_DISABLE: {
2793 			error = nm_bdg_polling(hdr);
2794 			break;
2795 		}
2796 #endif  /* WITH_VALE */
2797 		case NETMAP_REQ_POOLS_INFO_GET: {
2798 			/* Get information from the memory allocator used for
2799 			 * hdr->nr_name. */
2800 			struct nmreq_pools_info *req =
2801 				(struct nmreq_pools_info *)(uintptr_t)hdr->nr_body;
2802 			NMG_LOCK();
2803 			do {
2804 				/* Build a nmreq_register out of the nmreq_pools_info,
2805 				 * so that we can call netmap_get_na(). */
2806 				struct nmreq_register regreq;
2807 				bzero(&regreq, sizeof(regreq));
2808 				regreq.nr_mem_id = req->nr_mem_id;
2809 				regreq.nr_mode = NR_REG_ALL_NIC;
2810 
2811 				hdr->nr_reqtype = NETMAP_REQ_REGISTER;
2812 				hdr->nr_body = (uintptr_t)&regreq;
2813 				error = netmap_get_na(hdr, &na, &ifp, NULL, 1 /* create */);
2814 				hdr->nr_reqtype = NETMAP_REQ_POOLS_INFO_GET; /* reset type */
2815 				hdr->nr_body = (uintptr_t)req; /* reset nr_body */
2816 				if (error) {
2817 					na = NULL;
2818 					ifp = NULL;
2819 					break;
2820 				}
2821 				nmd = na->nm_mem; /* grab the memory allocator */
2822 				if (nmd == NULL) {
2823 					error = EINVAL;
2824 					break;
2825 				}
2826 
2827 				/* Finalize the memory allocator, get the pools
2828 				 * information and release the allocator. */
2829 				error = netmap_mem_finalize(nmd, na);
2830 				if (error) {
2831 					break;
2832 				}
2833 				error = netmap_mem_pools_info_get(req, nmd);
2834 				netmap_mem_drop(na);
2835 			} while (0);
2836 			netmap_unget_na(na, ifp);
2837 			NMG_UNLOCK();
2838 			break;
2839 		}
2840 
2841 		case NETMAP_REQ_CSB_ENABLE: {
2842 			struct nmreq_option *opt;
2843 
2844 			opt = nmreq_getoption(hdr, NETMAP_REQ_OPT_CSB);
2845 			if (opt == NULL) {
2846 				error = EINVAL;
2847 			} else {
2848 				struct nmreq_opt_csb *csbo =
2849 					(struct nmreq_opt_csb *)opt;
2850 				NMG_LOCK();
2851 				error = netmap_csb_validate(priv, csbo);
2852 				NMG_UNLOCK();
2853 				opt->nro_status = error;
2854 			}
2855 			break;
2856 		}
2857 
2858 		case NETMAP_REQ_SYNC_KLOOP_START: {
2859 			error = netmap_sync_kloop(priv, hdr);
2860 			break;
2861 		}
2862 
2863 		case NETMAP_REQ_SYNC_KLOOP_STOP: {
2864 			error = netmap_sync_kloop_stop(priv);
2865 			break;
2866 		}
2867 
2868 		default: {
2869 			error = EINVAL;
2870 			break;
2871 		}
2872 		}
2873 		/* Write back request body to userspace and reset the
2874 		 * user-space pointer. */
2875 		error = nmreq_copyout(hdr, error);
2876 		break;
2877 	}
2878 
2879 	case NIOCTXSYNC:
2880 	case NIOCRXSYNC: {
2881 		if (unlikely(priv->np_nifp == NULL)) {
2882 			error = ENXIO;
2883 			break;
2884 		}
2885 		mb(); /* make sure following reads are not from cache */
2886 
2887 		if (unlikely(priv->np_csb_atok_base)) {
2888 			nm_prerr("Invalid sync in CSB mode");
2889 			error = EBUSY;
2890 			break;
2891 		}
2892 
2893 		na = priv->np_na;      /* we have a reference */
2894 
2895 		mbq_init(&q);
2896 		t = (cmd == NIOCTXSYNC ? NR_TX : NR_RX);
2897 		krings = NMR(na, t);
2898 		qfirst = priv->np_qfirst[t];
2899 		qlast = priv->np_qlast[t];
2900 		sync_flags = priv->np_sync_flags;
2901 
2902 		for (i = qfirst; i < qlast; i++) {
2903 			struct netmap_kring *kring = krings[i];
2904 			struct netmap_ring *ring = kring->ring;
2905 
2906 			if (unlikely(nm_kr_tryget(kring, 1, &error))) {
2907 				error = (error ? EIO : 0);
2908 				continue;
2909 			}
2910 
2911 			if (cmd == NIOCTXSYNC) {
2912 				if (netmap_debug & NM_DEBUG_TXSYNC)
2913 					nm_prinf("pre txsync ring %d cur %d hwcur %d",
2914 					    i, ring->cur,
2915 					    kring->nr_hwcur);
2916 				if (nm_txsync_prologue(kring, ring) >= kring->nkr_num_slots) {
2917 					netmap_ring_reinit(kring);
2918 				} else if (kring->nm_sync(kring, sync_flags | NAF_FORCE_RECLAIM) == 0) {
2919 					nm_sync_finalize(kring);
2920 				}
2921 				if (netmap_debug & NM_DEBUG_TXSYNC)
2922 					nm_prinf("post txsync ring %d cur %d hwcur %d",
2923 					    i, ring->cur,
2924 					    kring->nr_hwcur);
2925 			} else {
2926 				if (nm_rxsync_prologue(kring, ring) >= kring->nkr_num_slots) {
2927 					netmap_ring_reinit(kring);
2928 				}
2929 				if (nm_may_forward_up(kring)) {
2930 					/* transparent forwarding, see netmap_poll() */
2931 					netmap_grab_packets(kring, &q, netmap_fwd);
2932 				}
2933 				if (kring->nm_sync(kring, sync_flags | NAF_FORCE_READ) == 0) {
2934 					nm_sync_finalize(kring);
2935 				}
2936 				ring_timestamp_set(ring);
2937 			}
2938 			nm_kr_put(kring);
2939 		}
2940 
2941 		if (mbq_peek(&q)) {
2942 			netmap_send_up(na->ifp, &q);
2943 		}
2944 
2945 		break;
2946 	}
2947 
2948 	default: {
2949 		return netmap_ioctl_legacy(priv, cmd, data, td);
2950 		break;
2951 	}
2952 	}
2953 
2954 	return (error);
2955 }
2956 
2957 size_t
2958 nmreq_size_by_type(uint16_t nr_reqtype)
2959 {
2960 	switch (nr_reqtype) {
2961 	case NETMAP_REQ_REGISTER:
2962 		return sizeof(struct nmreq_register);
2963 	case NETMAP_REQ_PORT_INFO_GET:
2964 		return sizeof(struct nmreq_port_info_get);
2965 	case NETMAP_REQ_VALE_ATTACH:
2966 		return sizeof(struct nmreq_vale_attach);
2967 	case NETMAP_REQ_VALE_DETACH:
2968 		return sizeof(struct nmreq_vale_detach);
2969 	case NETMAP_REQ_VALE_LIST:
2970 		return sizeof(struct nmreq_vale_list);
2971 	case NETMAP_REQ_PORT_HDR_SET:
2972 	case NETMAP_REQ_PORT_HDR_GET:
2973 		return sizeof(struct nmreq_port_hdr);
2974 	case NETMAP_REQ_VALE_NEWIF:
2975 		return sizeof(struct nmreq_vale_newif);
2976 	case NETMAP_REQ_VALE_DELIF:
2977 	case NETMAP_REQ_SYNC_KLOOP_STOP:
2978 	case NETMAP_REQ_CSB_ENABLE:
2979 		return 0;
2980 	case NETMAP_REQ_VALE_POLLING_ENABLE:
2981 	case NETMAP_REQ_VALE_POLLING_DISABLE:
2982 		return sizeof(struct nmreq_vale_polling);
2983 	case NETMAP_REQ_POOLS_INFO_GET:
2984 		return sizeof(struct nmreq_pools_info);
2985 	case NETMAP_REQ_SYNC_KLOOP_START:
2986 		return sizeof(struct nmreq_sync_kloop_start);
2987 	}
2988 	return 0;
2989 }
2990 
2991 static size_t
2992 nmreq_opt_size_by_type(uint32_t nro_reqtype, uint64_t nro_size)
2993 {
2994 	size_t rv = sizeof(struct nmreq_option);
2995 #ifdef NETMAP_REQ_OPT_DEBUG
2996 	if (nro_reqtype & NETMAP_REQ_OPT_DEBUG)
2997 		return (nro_reqtype & ~NETMAP_REQ_OPT_DEBUG);
2998 #endif /* NETMAP_REQ_OPT_DEBUG */
2999 	switch (nro_reqtype) {
3000 #ifdef WITH_EXTMEM
3001 	case NETMAP_REQ_OPT_EXTMEM:
3002 		rv = sizeof(struct nmreq_opt_extmem);
3003 		break;
3004 #endif /* WITH_EXTMEM */
3005 	case NETMAP_REQ_OPT_SYNC_KLOOP_EVENTFDS:
3006 		if (nro_size >= rv)
3007 			rv = nro_size;
3008 		break;
3009 	case NETMAP_REQ_OPT_CSB:
3010 		rv = sizeof(struct nmreq_opt_csb);
3011 		break;
3012 	case NETMAP_REQ_OPT_SYNC_KLOOP_MODE:
3013 		rv = sizeof(struct nmreq_opt_sync_kloop_mode);
3014 		break;
3015 	}
3016 	/* subtract the common header */
3017 	return rv - sizeof(struct nmreq_option);
3018 }
3019 
3020 /*
3021  * nmreq_copyin: create an in-kernel version of the request.
3022  *
3023  * We build the following data structure:
3024  *
3025  * hdr -> +-------+                buf
3026  *        |       |          +---------------+
3027  *        +-------+          |usr body ptr   |
3028  *        |options|-.        +---------------+
3029  *        +-------+ |        |usr options ptr|
3030  *        |body   |--------->+---------------+
3031  *        +-------+ |        |               |
3032  *                  |        |  copy of body |
3033  *                  |        |               |
3034  *                  |        +---------------+
3035  *                  |        |    NULL       |
3036  *                  |        +---------------+
3037  *                  |    .---|               |\
3038  *                  |    |   +---------------+ |
3039  *                  | .------|               | |
3040  *                  | |  |   +---------------+  \ option table
3041  *                  | |  |   |      ...      |  / indexed by option
3042  *                  | |  |   +---------------+ |  type
3043  *                  | |  |   |               | |
3044  *                  | |  |   +---------------+/
3045  *                  | |  |   |usr next ptr 1 |
3046  *                  `-|----->+---------------+
3047  *                    |  |   | copy of opt 1 |
3048  *                    |  |   |               |
3049  *                    |  | .-| nro_next      |
3050  *                    |  | | +---------------+
3051  *                    |  | | |usr next ptr 2 |
3052  *                    |  `-`>+---------------+
3053  *                    |      | copy of opt 2 |
3054  *                    |      |               |
3055  *                    |    .-| nro_next      |
3056  *                    |    | +---------------+
3057  *                    |    | |               |
3058  *                    ~    ~ ~      ...      ~
3059  *                    |    .-|               |
3060  *                    `----->+---------------+
3061  *                         | |usr next ptr n |
3062  *                         `>+---------------+
3063  *                           | copy of opt n |
3064  *                           |               |
3065  *                           | nro_next(NULL)|
3066  *                           +---------------+
3067  *
3068  * The options and body fields of the hdr structure are overwritten
3069  * with in-kernel valid pointers inside the buf. The original user
3070  * pointers are saved in the buf and restored on copyout.
3071  * The list of options is copied and the pointers adjusted. The
3072  * original pointers are saved before the option they belonged.
3073  *
3074  * The option table has an entry for every availabe option.  Entries
3075  * for options that have not been passed contain NULL.
3076  *
3077  */
3078 
3079 int
3080 nmreq_copyin(struct nmreq_header *hdr, int nr_body_is_user)
3081 {
3082 	size_t rqsz, optsz, bufsz;
3083 	int error = 0;
3084 	char *ker = NULL, *p;
3085 	struct nmreq_option **next, *src, **opt_tab;
3086 	struct nmreq_option buf;
3087 	uint64_t *ptrs;
3088 
3089 	if (hdr->nr_reserved) {
3090 		if (netmap_verbose)
3091 			nm_prerr("nr_reserved must be zero");
3092 		return EINVAL;
3093 	}
3094 
3095 	if (!nr_body_is_user)
3096 		return 0;
3097 
3098 	hdr->nr_reserved = nr_body_is_user;
3099 
3100 	/* compute the total size of the buffer */
3101 	rqsz = nmreq_size_by_type(hdr->nr_reqtype);
3102 	if (rqsz > NETMAP_REQ_MAXSIZE) {
3103 		error = EMSGSIZE;
3104 		goto out_err;
3105 	}
3106 	if ((rqsz && hdr->nr_body == (uintptr_t)NULL) ||
3107 		(!rqsz && hdr->nr_body != (uintptr_t)NULL)) {
3108 		/* Request body expected, but not found; or
3109 		 * request body found but unexpected. */
3110 		if (netmap_verbose)
3111 			nm_prerr("nr_body expected but not found, or vice versa");
3112 		error = EINVAL;
3113 		goto out_err;
3114 	}
3115 
3116 	bufsz = 2 * sizeof(void *) + rqsz +
3117 		NETMAP_REQ_OPT_MAX * sizeof(opt_tab);
3118 	/* compute the size of the buf below the option table.
3119 	 * It must contain a copy of every received option structure.
3120 	 * For every option we also need to store a copy of the user
3121 	 * list pointer.
3122 	 */
3123 	optsz = 0;
3124 	for (src = (struct nmreq_option *)(uintptr_t)hdr->nr_options; src;
3125 	     src = (struct nmreq_option *)(uintptr_t)buf.nro_next)
3126 	{
3127 		error = copyin(src, &buf, sizeof(*src));
3128 		if (error)
3129 			goto out_err;
3130 		optsz += sizeof(*src);
3131 		optsz += nmreq_opt_size_by_type(buf.nro_reqtype, buf.nro_size);
3132 		if (rqsz + optsz > NETMAP_REQ_MAXSIZE) {
3133 			error = EMSGSIZE;
3134 			goto out_err;
3135 		}
3136 		bufsz += sizeof(void *);
3137 	}
3138 	bufsz += optsz;
3139 
3140 	ker = nm_os_malloc(bufsz);
3141 	if (ker == NULL) {
3142 		error = ENOMEM;
3143 		goto out_err;
3144 	}
3145 	p = ker;	/* write pointer into the buffer */
3146 
3147 	/* make a copy of the user pointers */
3148 	ptrs = (uint64_t*)p;
3149 	*ptrs++ = hdr->nr_body;
3150 	*ptrs++ = hdr->nr_options;
3151 	p = (char *)ptrs;
3152 
3153 	/* copy the body */
3154 	error = copyin((void *)(uintptr_t)hdr->nr_body, p, rqsz);
3155 	if (error)
3156 		goto out_restore;
3157 	/* overwrite the user pointer with the in-kernel one */
3158 	hdr->nr_body = (uintptr_t)p;
3159 	p += rqsz;
3160 	/* start of the options table */
3161 	opt_tab = (struct nmreq_option **)p;
3162 	p += sizeof(opt_tab) * NETMAP_REQ_OPT_MAX;
3163 
3164 	/* copy the options */
3165 	next = (struct nmreq_option **)&hdr->nr_options;
3166 	src = *next;
3167 	while (src) {
3168 		struct nmreq_option *opt;
3169 
3170 		/* copy the option header */
3171 		ptrs = (uint64_t *)p;
3172 		opt = (struct nmreq_option *)(ptrs + 1);
3173 		error = copyin(src, opt, sizeof(*src));
3174 		if (error)
3175 			goto out_restore;
3176 		/* make a copy of the user next pointer */
3177 		*ptrs = opt->nro_next;
3178 		/* overwrite the user pointer with the in-kernel one */
3179 		*next = opt;
3180 
3181 		/* initialize the option as not supported.
3182 		 * Recognized options will update this field.
3183 		 */
3184 		opt->nro_status = EOPNOTSUPP;
3185 
3186 		/* check for invalid types */
3187 		if (opt->nro_reqtype < 1) {
3188 			if (netmap_verbose)
3189 				nm_prinf("invalid option type: %u", opt->nro_reqtype);
3190 			opt->nro_status = EINVAL;
3191 			error = EINVAL;
3192 			goto next;
3193 		}
3194 
3195 		if (opt->nro_reqtype >= NETMAP_REQ_OPT_MAX) {
3196 			/* opt->nro_status is already EOPNOTSUPP */
3197 			error = EOPNOTSUPP;
3198 			goto next;
3199 		}
3200 
3201 		/* if the type is valid, index the option in the table
3202 		 * unless it is a duplicate.
3203 		 */
3204 		if (opt_tab[opt->nro_reqtype] != NULL) {
3205 			if (netmap_verbose)
3206 				nm_prinf("duplicate option: %u", opt->nro_reqtype);
3207 			opt->nro_status = EINVAL;
3208 			opt_tab[opt->nro_reqtype]->nro_status = EINVAL;
3209 			error = EINVAL;
3210 			goto next;
3211 		}
3212 		opt_tab[opt->nro_reqtype] = opt;
3213 
3214 		p = (char *)(opt + 1);
3215 
3216 		/* copy the option body */
3217 		optsz = nmreq_opt_size_by_type(opt->nro_reqtype,
3218 						opt->nro_size);
3219 		if (optsz) {
3220 			/* the option body follows the option header */
3221 			error = copyin(src + 1, p, optsz);
3222 			if (error)
3223 				goto out_restore;
3224 			p += optsz;
3225 		}
3226 
3227 	next:
3228 		/* move to next option */
3229 		next = (struct nmreq_option **)&opt->nro_next;
3230 		src = *next;
3231 	}
3232 	if (error)
3233 		nmreq_copyout(hdr, error);
3234 	return error;
3235 
3236 out_restore:
3237 	ptrs = (uint64_t *)ker;
3238 	hdr->nr_body = *ptrs++;
3239 	hdr->nr_options = *ptrs++;
3240 	hdr->nr_reserved = 0;
3241 	nm_os_free(ker);
3242 out_err:
3243 	return error;
3244 }
3245 
3246 static int
3247 nmreq_copyout(struct nmreq_header *hdr, int rerror)
3248 {
3249 	struct nmreq_option *src, *dst;
3250 	void *ker = (void *)(uintptr_t)hdr->nr_body, *bufstart;
3251 	uint64_t *ptrs;
3252 	size_t bodysz;
3253 	int error;
3254 
3255 	if (!hdr->nr_reserved)
3256 		return rerror;
3257 
3258 	/* restore the user pointers in the header */
3259 	ptrs = (uint64_t *)ker - 2;
3260 	bufstart = ptrs;
3261 	hdr->nr_body = *ptrs++;
3262 	src = (struct nmreq_option *)(uintptr_t)hdr->nr_options;
3263 	hdr->nr_options = *ptrs;
3264 
3265 	if (!rerror) {
3266 		/* copy the body */
3267 		bodysz = nmreq_size_by_type(hdr->nr_reqtype);
3268 		error = copyout(ker, (void *)(uintptr_t)hdr->nr_body, bodysz);
3269 		if (error) {
3270 			rerror = error;
3271 			goto out;
3272 		}
3273 	}
3274 
3275 	/* copy the options */
3276 	dst = (struct nmreq_option *)(uintptr_t)hdr->nr_options;
3277 	while (src) {
3278 		size_t optsz;
3279 		uint64_t next;
3280 
3281 		/* restore the user pointer */
3282 		next = src->nro_next;
3283 		ptrs = (uint64_t *)src - 1;
3284 		src->nro_next = *ptrs;
3285 
3286 		/* always copy the option header */
3287 		error = copyout(src, dst, sizeof(*src));
3288 		if (error) {
3289 			rerror = error;
3290 			goto out;
3291 		}
3292 
3293 		/* copy the option body only if there was no error */
3294 		if (!rerror && !src->nro_status) {
3295 			optsz = nmreq_opt_size_by_type(src->nro_reqtype,
3296 							src->nro_size);
3297 			if (optsz) {
3298 				error = copyout(src + 1, dst + 1, optsz);
3299 				if (error) {
3300 					rerror = error;
3301 					goto out;
3302 				}
3303 			}
3304 		}
3305 		src = (struct nmreq_option *)(uintptr_t)next;
3306 		dst = (struct nmreq_option *)(uintptr_t)*ptrs;
3307 	}
3308 
3309 
3310 out:
3311 	hdr->nr_reserved = 0;
3312 	nm_os_free(bufstart);
3313 	return rerror;
3314 }
3315 
3316 struct nmreq_option *
3317 nmreq_getoption(struct nmreq_header *hdr, uint16_t reqtype)
3318 {
3319 	struct nmreq_option **opt_tab;
3320 
3321 	if (!hdr->nr_options)
3322 		return NULL;
3323 
3324 	opt_tab = (struct nmreq_option **)((uintptr_t)hdr->nr_options) -
3325 	    (NETMAP_REQ_OPT_MAX + 1);
3326 	return opt_tab[reqtype];
3327 }
3328 
3329 static int
3330 nmreq_checkoptions(struct nmreq_header *hdr)
3331 {
3332 	struct nmreq_option *opt;
3333 	/* return error if there is still any option
3334 	 * marked as not supported
3335 	 */
3336 
3337 	for (opt = (struct nmreq_option *)(uintptr_t)hdr->nr_options; opt;
3338 	     opt = (struct nmreq_option *)(uintptr_t)opt->nro_next)
3339 		if (opt->nro_status == EOPNOTSUPP)
3340 			return EOPNOTSUPP;
3341 
3342 	return 0;
3343 }
3344 
3345 /*
3346  * select(2) and poll(2) handlers for the "netmap" device.
3347  *
3348  * Can be called for one or more queues.
3349  * Return true the event mask corresponding to ready events.
3350  * If there are no ready events (and 'sr' is not NULL), do a
3351  * selrecord on either individual selinfo or on the global one.
3352  * Device-dependent parts (locking and sync of tx/rx rings)
3353  * are done through callbacks.
3354  *
3355  * On linux, arguments are really pwait, the poll table, and 'td' is struct file *
3356  * The first one is remapped to pwait as selrecord() uses the name as an
3357  * hidden argument.
3358  */
3359 int
3360 netmap_poll(struct netmap_priv_d *priv, int events, NM_SELRECORD_T *sr)
3361 {
3362 	struct netmap_adapter *na;
3363 	struct netmap_kring *kring;
3364 	struct netmap_ring *ring;
3365 	u_int i, want[NR_TXRX], revents = 0;
3366 	NM_SELINFO_T *si[NR_TXRX];
3367 #define want_tx want[NR_TX]
3368 #define want_rx want[NR_RX]
3369 	struct mbq q;	/* packets from RX hw queues to host stack */
3370 
3371 	/*
3372 	 * In order to avoid nested locks, we need to "double check"
3373 	 * txsync and rxsync if we decide to do a selrecord().
3374 	 * retry_tx (and retry_rx, later) prevent looping forever.
3375 	 */
3376 	int retry_tx = 1, retry_rx = 1;
3377 
3378 	/* Transparent mode: send_down is 1 if we have found some
3379 	 * packets to forward (host RX ring --> NIC) during the rx
3380 	 * scan and we have not sent them down to the NIC yet.
3381 	 * Transparent mode requires to bind all rings to a single
3382 	 * file descriptor.
3383 	 */
3384 	int send_down = 0;
3385 	int sync_flags = priv->np_sync_flags;
3386 
3387 	mbq_init(&q);
3388 
3389 	if (unlikely(priv->np_nifp == NULL)) {
3390 		return POLLERR;
3391 	}
3392 	mb(); /* make sure following reads are not from cache */
3393 
3394 	na = priv->np_na;
3395 
3396 	if (unlikely(!nm_netmap_on(na)))
3397 		return POLLERR;
3398 
3399 	if (unlikely(priv->np_csb_atok_base)) {
3400 		nm_prerr("Invalid poll in CSB mode");
3401 		return POLLERR;
3402 	}
3403 
3404 	if (netmap_debug & NM_DEBUG_ON)
3405 		nm_prinf("device %s events 0x%x", na->name, events);
3406 	want_tx = events & (POLLOUT | POLLWRNORM);
3407 	want_rx = events & (POLLIN | POLLRDNORM);
3408 
3409 	/*
3410 	 * If the card has more than one queue AND the file descriptor is
3411 	 * bound to all of them, we sleep on the "global" selinfo, otherwise
3412 	 * we sleep on individual selinfo (FreeBSD only allows two selinfo's
3413 	 * per file descriptor).
3414 	 * The interrupt routine in the driver wake one or the other
3415 	 * (or both) depending on which clients are active.
3416 	 *
3417 	 * rxsync() is only called if we run out of buffers on a POLLIN.
3418 	 * txsync() is called if we run out of buffers on POLLOUT, or
3419 	 * there are pending packets to send. The latter can be disabled
3420 	 * passing NETMAP_NO_TX_POLL in the NIOCREG call.
3421 	 */
3422 	si[NR_RX] = priv->np_si[NR_RX];
3423 	si[NR_TX] = priv->np_si[NR_TX];
3424 
3425 #ifdef __FreeBSD__
3426 	/*
3427 	 * We start with a lock free round which is cheap if we have
3428 	 * slots available. If this fails, then lock and call the sync
3429 	 * routines. We can't do this on Linux, as the contract says
3430 	 * that we must call nm_os_selrecord() unconditionally.
3431 	 */
3432 	if (want_tx) {
3433 		const enum txrx t = NR_TX;
3434 		for (i = priv->np_qfirst[t]; i < priv->np_qlast[t]; i++) {
3435 			kring = NMR(na, t)[i];
3436 			if (kring->ring->cur != kring->ring->tail) {
3437 				/* Some unseen TX space is available, so what
3438 				 * we don't need to run txsync. */
3439 				revents |= want[t];
3440 				want[t] = 0;
3441 				break;
3442 			}
3443 		}
3444 	}
3445 	if (want_rx) {
3446 		const enum txrx t = NR_RX;
3447 		int rxsync_needed = 0;
3448 
3449 		for (i = priv->np_qfirst[t]; i < priv->np_qlast[t]; i++) {
3450 			kring = NMR(na, t)[i];
3451 			if (kring->ring->cur == kring->ring->tail
3452 				|| kring->rhead != kring->ring->head) {
3453 				/* There are no unseen packets on this ring,
3454 				 * or there are some buffers to be returned
3455 				 * to the netmap port. We therefore go ahead
3456 				 * and run rxsync. */
3457 				rxsync_needed = 1;
3458 				break;
3459 			}
3460 		}
3461 		if (!rxsync_needed) {
3462 			revents |= want_rx;
3463 			want_rx = 0;
3464 		}
3465 	}
3466 #endif
3467 
3468 #ifdef linux
3469 	/* The selrecord must be unconditional on linux. */
3470 	nm_os_selrecord(sr, si[NR_RX]);
3471 	nm_os_selrecord(sr, si[NR_TX]);
3472 #endif /* linux */
3473 
3474 	/*
3475 	 * If we want to push packets out (priv->np_txpoll) or
3476 	 * want_tx is still set, we must issue txsync calls
3477 	 * (on all rings, to avoid that the tx rings stall).
3478 	 * Fortunately, normal tx mode has np_txpoll set.
3479 	 */
3480 	if (priv->np_txpoll || want_tx) {
3481 		/*
3482 		 * The first round checks if anyone is ready, if not
3483 		 * do a selrecord and another round to handle races.
3484 		 * want_tx goes to 0 if any space is found, and is
3485 		 * used to skip rings with no pending transmissions.
3486 		 */
3487 flush_tx:
3488 		for (i = priv->np_qfirst[NR_TX]; i < priv->np_qlast[NR_TX]; i++) {
3489 			int found = 0;
3490 
3491 			kring = na->tx_rings[i];
3492 			ring = kring->ring;
3493 
3494 			/*
3495 			 * Don't try to txsync this TX ring if we already found some
3496 			 * space in some of the TX rings (want_tx == 0) and there are no
3497 			 * TX slots in this ring that need to be flushed to the NIC
3498 			 * (head == hwcur).
3499 			 */
3500 			if (!send_down && !want_tx && ring->head == kring->nr_hwcur)
3501 				continue;
3502 
3503 			if (nm_kr_tryget(kring, 1, &revents))
3504 				continue;
3505 
3506 			if (nm_txsync_prologue(kring, ring) >= kring->nkr_num_slots) {
3507 				netmap_ring_reinit(kring);
3508 				revents |= POLLERR;
3509 			} else {
3510 				if (kring->nm_sync(kring, sync_flags))
3511 					revents |= POLLERR;
3512 				else
3513 					nm_sync_finalize(kring);
3514 			}
3515 
3516 			/*
3517 			 * If we found new slots, notify potential
3518 			 * listeners on the same ring.
3519 			 * Since we just did a txsync, look at the copies
3520 			 * of cur,tail in the kring.
3521 			 */
3522 			found = kring->rcur != kring->rtail;
3523 			nm_kr_put(kring);
3524 			if (found) { /* notify other listeners */
3525 				revents |= want_tx;
3526 				want_tx = 0;
3527 #ifndef linux
3528 				kring->nm_notify(kring, 0);
3529 #endif /* linux */
3530 			}
3531 		}
3532 		/* if there were any packet to forward we must have handled them by now */
3533 		send_down = 0;
3534 		if (want_tx && retry_tx && sr) {
3535 #ifndef linux
3536 			nm_os_selrecord(sr, si[NR_TX]);
3537 #endif /* !linux */
3538 			retry_tx = 0;
3539 			goto flush_tx;
3540 		}
3541 	}
3542 
3543 	/*
3544 	 * If want_rx is still set scan receive rings.
3545 	 * Do it on all rings because otherwise we starve.
3546 	 */
3547 	if (want_rx) {
3548 		/* two rounds here for race avoidance */
3549 do_retry_rx:
3550 		for (i = priv->np_qfirst[NR_RX]; i < priv->np_qlast[NR_RX]; i++) {
3551 			int found = 0;
3552 
3553 			kring = na->rx_rings[i];
3554 			ring = kring->ring;
3555 
3556 			if (unlikely(nm_kr_tryget(kring, 1, &revents)))
3557 				continue;
3558 
3559 			if (nm_rxsync_prologue(kring, ring) >= kring->nkr_num_slots) {
3560 				netmap_ring_reinit(kring);
3561 				revents |= POLLERR;
3562 			}
3563 			/* now we can use kring->rcur, rtail */
3564 
3565 			/*
3566 			 * transparent mode support: collect packets from
3567 			 * hw rxring(s) that have been released by the user
3568 			 */
3569 			if (nm_may_forward_up(kring)) {
3570 				netmap_grab_packets(kring, &q, netmap_fwd);
3571 			}
3572 
3573 			/* Clear the NR_FORWARD flag anyway, it may be set by
3574 			 * the nm_sync() below only on for the host RX ring (see
3575 			 * netmap_rxsync_from_host()). */
3576 			kring->nr_kflags &= ~NR_FORWARD;
3577 			if (kring->nm_sync(kring, sync_flags))
3578 				revents |= POLLERR;
3579 			else
3580 				nm_sync_finalize(kring);
3581 			send_down |= (kring->nr_kflags & NR_FORWARD);
3582 			ring_timestamp_set(ring);
3583 			found = kring->rcur != kring->rtail;
3584 			nm_kr_put(kring);
3585 			if (found) {
3586 				revents |= want_rx;
3587 				retry_rx = 0;
3588 #ifndef linux
3589 				kring->nm_notify(kring, 0);
3590 #endif /* linux */
3591 			}
3592 		}
3593 
3594 #ifndef linux
3595 		if (retry_rx && sr) {
3596 			nm_os_selrecord(sr, si[NR_RX]);
3597 		}
3598 #endif /* !linux */
3599 		if (send_down || retry_rx) {
3600 			retry_rx = 0;
3601 			if (send_down)
3602 				goto flush_tx; /* and retry_rx */
3603 			else
3604 				goto do_retry_rx;
3605 		}
3606 	}
3607 
3608 	/*
3609 	 * Transparent mode: released bufs (i.e. between kring->nr_hwcur and
3610 	 * ring->head) marked with NS_FORWARD on hw rx rings are passed up
3611 	 * to the host stack.
3612 	 */
3613 
3614 	if (mbq_peek(&q)) {
3615 		netmap_send_up(na->ifp, &q);
3616 	}
3617 
3618 	return (revents);
3619 #undef want_tx
3620 #undef want_rx
3621 }
3622 
3623 int
3624 nma_intr_enable(struct netmap_adapter *na, int onoff)
3625 {
3626 	bool changed = false;
3627 	enum txrx t;
3628 	int i;
3629 
3630 	for_rx_tx(t) {
3631 		for (i = 0; i < nma_get_nrings(na, t); i++) {
3632 			struct netmap_kring *kring = NMR(na, t)[i];
3633 			int on = !(kring->nr_kflags & NKR_NOINTR);
3634 
3635 			if (!!onoff != !!on) {
3636 				changed = true;
3637 			}
3638 			if (onoff) {
3639 				kring->nr_kflags &= ~NKR_NOINTR;
3640 			} else {
3641 				kring->nr_kflags |= NKR_NOINTR;
3642 			}
3643 		}
3644 	}
3645 
3646 	if (!changed) {
3647 		return 0; /* nothing to do */
3648 	}
3649 
3650 	if (!na->nm_intr) {
3651 		nm_prerr("Cannot %s interrupts for %s", onoff ? "enable" : "disable",
3652 		  na->name);
3653 		return -1;
3654 	}
3655 
3656 	na->nm_intr(na, onoff);
3657 
3658 	return 0;
3659 }
3660 
3661 
3662 /*-------------------- driver support routines -------------------*/
3663 
3664 /* default notify callback */
3665 static int
3666 netmap_notify(struct netmap_kring *kring, int flags)
3667 {
3668 	struct netmap_adapter *na = kring->notify_na;
3669 	enum txrx t = kring->tx;
3670 
3671 	nm_os_selwakeup(&kring->si);
3672 	/* optimization: avoid a wake up on the global
3673 	 * queue if nobody has registered for more
3674 	 * than one ring
3675 	 */
3676 	if (na->si_users[t] > 0)
3677 		nm_os_selwakeup(&na->si[t]);
3678 
3679 	return NM_IRQ_COMPLETED;
3680 }
3681 
3682 /* called by all routines that create netmap_adapters.
3683  * provide some defaults and get a reference to the
3684  * memory allocator
3685  */
3686 int
3687 netmap_attach_common(struct netmap_adapter *na)
3688 {
3689 	if (!na->rx_buf_maxsize) {
3690 		/* Set a conservative default (larger is safer). */
3691 		na->rx_buf_maxsize = PAGE_SIZE;
3692 	}
3693 
3694 #ifdef __FreeBSD__
3695 	if (na->na_flags & NAF_HOST_RINGS && na->ifp) {
3696 		na->if_input = na->ifp->if_input; /* for netmap_send_up */
3697 	}
3698 	na->pdev = na; /* make sure netmap_mem_map() is called */
3699 #endif /* __FreeBSD__ */
3700 	if (na->na_flags & NAF_HOST_RINGS) {
3701 		if (na->num_host_rx_rings == 0)
3702 			na->num_host_rx_rings = 1;
3703 		if (na->num_host_tx_rings == 0)
3704 			na->num_host_tx_rings = 1;
3705 	}
3706 	if (na->nm_krings_create == NULL) {
3707 		/* we assume that we have been called by a driver,
3708 		 * since other port types all provide their own
3709 		 * nm_krings_create
3710 		 */
3711 		na->nm_krings_create = netmap_hw_krings_create;
3712 		na->nm_krings_delete = netmap_hw_krings_delete;
3713 	}
3714 	if (na->nm_notify == NULL)
3715 		na->nm_notify = netmap_notify;
3716 	na->active_fds = 0;
3717 
3718 	if (na->nm_mem == NULL) {
3719 		/* use the global allocator */
3720 		na->nm_mem = netmap_mem_get(&nm_mem);
3721 	}
3722 #ifdef WITH_VALE
3723 	if (na->nm_bdg_attach == NULL)
3724 		/* no special nm_bdg_attach callback. On VALE
3725 		 * attach, we need to interpose a bwrap
3726 		 */
3727 		na->nm_bdg_attach = netmap_default_bdg_attach;
3728 #endif
3729 
3730 	return 0;
3731 }
3732 
3733 /* Wrapper for the register callback provided netmap-enabled
3734  * hardware drivers.
3735  * nm_iszombie(na) means that the driver module has been
3736  * unloaded, so we cannot call into it.
3737  * nm_os_ifnet_lock() must guarantee mutual exclusion with
3738  * module unloading.
3739  */
3740 static int
3741 netmap_hw_reg(struct netmap_adapter *na, int onoff)
3742 {
3743 	struct netmap_hw_adapter *hwna =
3744 		(struct netmap_hw_adapter*)na;
3745 	int error = 0;
3746 
3747 	nm_os_ifnet_lock();
3748 
3749 	if (nm_iszombie(na)) {
3750 		if (onoff) {
3751 			error = ENXIO;
3752 		} else if (na != NULL) {
3753 			na->na_flags &= ~NAF_NETMAP_ON;
3754 		}
3755 		goto out;
3756 	}
3757 
3758 	error = hwna->nm_hw_register(na, onoff);
3759 
3760 out:
3761 	nm_os_ifnet_unlock();
3762 
3763 	return error;
3764 }
3765 
3766 static void
3767 netmap_hw_dtor(struct netmap_adapter *na)
3768 {
3769 	if (na->ifp == NULL)
3770 		return;
3771 
3772 	NM_DETACH_NA(na->ifp);
3773 }
3774 
3775 
3776 /*
3777  * Allocate a netmap_adapter object, and initialize it from the
3778  * 'arg' passed by the driver on attach.
3779  * We allocate a block of memory of 'size' bytes, which has room
3780  * for struct netmap_adapter plus additional room private to
3781  * the caller.
3782  * Return 0 on success, ENOMEM otherwise.
3783  */
3784 int
3785 netmap_attach_ext(struct netmap_adapter *arg, size_t size, int override_reg)
3786 {
3787 	struct netmap_hw_adapter *hwna = NULL;
3788 	struct ifnet *ifp = NULL;
3789 
3790 	if (size < sizeof(struct netmap_hw_adapter)) {
3791 		if (netmap_debug & NM_DEBUG_ON)
3792 			nm_prerr("Invalid netmap adapter size %d", (int)size);
3793 		return EINVAL;
3794 	}
3795 
3796 	if (arg == NULL || arg->ifp == NULL) {
3797 		if (netmap_debug & NM_DEBUG_ON)
3798 			nm_prerr("either arg or arg->ifp is NULL");
3799 		return EINVAL;
3800 	}
3801 
3802 	if (arg->num_tx_rings == 0 || arg->num_rx_rings == 0) {
3803 		if (netmap_debug & NM_DEBUG_ON)
3804 			nm_prerr("%s: invalid rings tx %d rx %d",
3805 				arg->name, arg->num_tx_rings, arg->num_rx_rings);
3806 		return EINVAL;
3807 	}
3808 
3809 	ifp = arg->ifp;
3810 	if (NM_NA_CLASH(ifp)) {
3811 		/* If NA(ifp) is not null but there is no valid netmap
3812 		 * adapter it means that someone else is using the same
3813 		 * pointer (e.g. ax25_ptr on linux). This happens for
3814 		 * instance when also PF_RING is in use. */
3815 		nm_prerr("Error: netmap adapter hook is busy");
3816 		return EBUSY;
3817 	}
3818 
3819 	hwna = nm_os_malloc(size);
3820 	if (hwna == NULL)
3821 		goto fail;
3822 	hwna->up = *arg;
3823 	hwna->up.na_flags |= NAF_HOST_RINGS | NAF_NATIVE;
3824 	strlcpy(hwna->up.name, ifp->if_xname, sizeof(hwna->up.name));
3825 	if (override_reg) {
3826 		hwna->nm_hw_register = hwna->up.nm_register;
3827 		hwna->up.nm_register = netmap_hw_reg;
3828 	}
3829 	if (netmap_attach_common(&hwna->up)) {
3830 		nm_os_free(hwna);
3831 		goto fail;
3832 	}
3833 	netmap_adapter_get(&hwna->up);
3834 
3835 	NM_ATTACH_NA(ifp, &hwna->up);
3836 
3837 	nm_os_onattach(ifp);
3838 
3839 	if (arg->nm_dtor == NULL) {
3840 		hwna->up.nm_dtor = netmap_hw_dtor;
3841 	}
3842 
3843 	if_printf(ifp, "netmap queues/slots: TX %d/%d, RX %d/%d\n",
3844 	    hwna->up.num_tx_rings, hwna->up.num_tx_desc,
3845 	    hwna->up.num_rx_rings, hwna->up.num_rx_desc);
3846 	return 0;
3847 
3848 fail:
3849 	nm_prerr("fail, arg %p ifp %p na %p", arg, ifp, hwna);
3850 	return (hwna ? EINVAL : ENOMEM);
3851 }
3852 
3853 
3854 int
3855 netmap_attach(struct netmap_adapter *arg)
3856 {
3857 	return netmap_attach_ext(arg, sizeof(struct netmap_hw_adapter),
3858 			1 /* override nm_reg */);
3859 }
3860 
3861 
3862 void
3863 NM_DBG(netmap_adapter_get)(struct netmap_adapter *na)
3864 {
3865 	if (!na) {
3866 		return;
3867 	}
3868 
3869 	refcount_acquire(&na->na_refcount);
3870 }
3871 
3872 
3873 /* returns 1 iff the netmap_adapter is destroyed */
3874 int
3875 NM_DBG(netmap_adapter_put)(struct netmap_adapter *na)
3876 {
3877 	if (!na)
3878 		return 1;
3879 
3880 	if (!refcount_release(&na->na_refcount))
3881 		return 0;
3882 
3883 	if (na->nm_dtor)
3884 		na->nm_dtor(na);
3885 
3886 	if (na->tx_rings) { /* XXX should not happen */
3887 		if (netmap_debug & NM_DEBUG_ON)
3888 			nm_prerr("freeing leftover tx_rings");
3889 		na->nm_krings_delete(na);
3890 	}
3891 	netmap_pipe_dealloc(na);
3892 	if (na->nm_mem)
3893 		netmap_mem_put(na->nm_mem);
3894 	bzero(na, sizeof(*na));
3895 	nm_os_free(na);
3896 
3897 	return 1;
3898 }
3899 
3900 /* nm_krings_create callback for all hardware native adapters */
3901 int
3902 netmap_hw_krings_create(struct netmap_adapter *na)
3903 {
3904 	int ret = netmap_krings_create(na, 0);
3905 	if (ret == 0) {
3906 		/* initialize the mbq for the sw rx ring */
3907 		u_int lim = netmap_real_rings(na, NR_RX), i;
3908 		for (i = na->num_rx_rings; i < lim; i++) {
3909 			mbq_safe_init(&NMR(na, NR_RX)[i]->rx_queue);
3910 		}
3911 		nm_prdis("initialized sw rx queue %d", na->num_rx_rings);
3912 	}
3913 	return ret;
3914 }
3915 
3916 
3917 
3918 /*
3919  * Called on module unload by the netmap-enabled drivers
3920  */
3921 void
3922 netmap_detach(struct ifnet *ifp)
3923 {
3924 	struct netmap_adapter *na = NA(ifp);
3925 
3926 	if (!na)
3927 		return;
3928 
3929 	NMG_LOCK();
3930 	netmap_set_all_rings(na, NM_KR_LOCKED);
3931 	/*
3932 	 * if the netmap adapter is not native, somebody
3933 	 * changed it, so we can not release it here.
3934 	 * The NAF_ZOMBIE flag will notify the new owner that
3935 	 * the driver is gone.
3936 	 */
3937 	if (!(na->na_flags & NAF_NATIVE) || !netmap_adapter_put(na)) {
3938 		na->na_flags |= NAF_ZOMBIE;
3939 	}
3940 	/* give active users a chance to notice that NAF_ZOMBIE has been
3941 	 * turned on, so that they can stop and return an error to userspace.
3942 	 * Note that this becomes a NOP if there are no active users and,
3943 	 * therefore, the put() above has deleted the na, since now NA(ifp) is
3944 	 * NULL.
3945 	 */
3946 	netmap_enable_all_rings(ifp);
3947 	NMG_UNLOCK();
3948 }
3949 
3950 
3951 /*
3952  * Intercept packets from the network stack and pass them
3953  * to netmap as incoming packets on the 'software' ring.
3954  *
3955  * We only store packets in a bounded mbq and then copy them
3956  * in the relevant rxsync routine.
3957  *
3958  * We rely on the OS to make sure that the ifp and na do not go
3959  * away (typically the caller checks for IFF_DRV_RUNNING or the like).
3960  * In nm_register() or whenever there is a reinitialization,
3961  * we make sure to make the mode change visible here.
3962  */
3963 int
3964 netmap_transmit(struct ifnet *ifp, struct mbuf *m)
3965 {
3966 	struct netmap_adapter *na = NA(ifp);
3967 	struct netmap_kring *kring, *tx_kring;
3968 	u_int len = MBUF_LEN(m);
3969 	u_int error = ENOBUFS;
3970 	unsigned int txr;
3971 	struct mbq *q;
3972 	int busy;
3973 	u_int i;
3974 
3975 	i = MBUF_TXQ(m);
3976 	if (i >= na->num_host_rx_rings) {
3977 		i = i % na->num_host_rx_rings;
3978 	}
3979 	kring = NMR(na, NR_RX)[nma_get_nrings(na, NR_RX) + i];
3980 
3981 	// XXX [Linux] we do not need this lock
3982 	// if we follow the down/configure/up protocol -gl
3983 	// mtx_lock(&na->core_lock);
3984 
3985 	if (!nm_netmap_on(na)) {
3986 		nm_prerr("%s not in netmap mode anymore", na->name);
3987 		error = ENXIO;
3988 		goto done;
3989 	}
3990 
3991 	txr = MBUF_TXQ(m);
3992 	if (txr >= na->num_tx_rings) {
3993 		txr %= na->num_tx_rings;
3994 	}
3995 	tx_kring = NMR(na, NR_TX)[txr];
3996 
3997 	if (tx_kring->nr_mode == NKR_NETMAP_OFF) {
3998 		return MBUF_TRANSMIT(na, ifp, m);
3999 	}
4000 
4001 	q = &kring->rx_queue;
4002 
4003 	// XXX reconsider long packets if we handle fragments
4004 	if (len > NETMAP_BUF_SIZE(na)) { /* too long for us */
4005 		nm_prerr("%s from_host, drop packet size %d > %d", na->name,
4006 			len, NETMAP_BUF_SIZE(na));
4007 		goto done;
4008 	}
4009 
4010 	if (!netmap_generic_hwcsum) {
4011 		if (nm_os_mbuf_has_csum_offld(m)) {
4012 			nm_prlim(1, "%s drop mbuf that needs checksum offload", na->name);
4013 			goto done;
4014 		}
4015 	}
4016 
4017 	if (nm_os_mbuf_has_seg_offld(m)) {
4018 		nm_prlim(1, "%s drop mbuf that needs generic segmentation offload", na->name);
4019 		goto done;
4020 	}
4021 
4022 #ifdef __FreeBSD__
4023 	ETHER_BPF_MTAP(ifp, m);
4024 #endif /* __FreeBSD__ */
4025 
4026 	/* protect against netmap_rxsync_from_host(), netmap_sw_to_nic()
4027 	 * and maybe other instances of netmap_transmit (the latter
4028 	 * not possible on Linux).
4029 	 * We enqueue the mbuf only if we are sure there is going to be
4030 	 * enough room in the host RX ring, otherwise we drop it.
4031 	 */
4032 	mbq_lock(q);
4033 
4034 	busy = kring->nr_hwtail - kring->nr_hwcur;
4035 	if (busy < 0)
4036 		busy += kring->nkr_num_slots;
4037 	if (busy + mbq_len(q) >= kring->nkr_num_slots - 1) {
4038 		nm_prlim(2, "%s full hwcur %d hwtail %d qlen %d", na->name,
4039 			kring->nr_hwcur, kring->nr_hwtail, mbq_len(q));
4040 	} else {
4041 		mbq_enqueue(q, m);
4042 		nm_prdis(2, "%s %d bufs in queue", na->name, mbq_len(q));
4043 		/* notify outside the lock */
4044 		m = NULL;
4045 		error = 0;
4046 	}
4047 	mbq_unlock(q);
4048 
4049 done:
4050 	if (m)
4051 		m_freem(m);
4052 	/* unconditionally wake up listeners */
4053 	kring->nm_notify(kring, 0);
4054 	/* this is normally netmap_notify(), but for nics
4055 	 * connected to a bridge it is netmap_bwrap_intr_notify(),
4056 	 * that possibly forwards the frames through the switch
4057 	 */
4058 
4059 	return (error);
4060 }
4061 
4062 
4063 /*
4064  * netmap_reset() is called by the driver routines when reinitializing
4065  * a ring. The driver is in charge of locking to protect the kring.
4066  * If native netmap mode is not set just return NULL.
4067  * If native netmap mode is set, in particular, we have to set nr_mode to
4068  * NKR_NETMAP_ON.
4069  */
4070 struct netmap_slot *
4071 netmap_reset(struct netmap_adapter *na, enum txrx tx, u_int n,
4072 	u_int new_cur)
4073 {
4074 	struct netmap_kring *kring;
4075 	int new_hwofs, lim;
4076 
4077 	if (!nm_native_on(na)) {
4078 		nm_prdis("interface not in native netmap mode");
4079 		return NULL;	/* nothing to reinitialize */
4080 	}
4081 
4082 	/* XXX note- in the new scheme, we are not guaranteed to be
4083 	 * under lock (e.g. when called on a device reset).
4084 	 * In this case, we should set a flag and do not trust too
4085 	 * much the values. In practice: TODO
4086 	 * - set a RESET flag somewhere in the kring
4087 	 * - do the processing in a conservative way
4088 	 * - let the *sync() fixup at the end.
4089 	 */
4090 	if (tx == NR_TX) {
4091 		if (n >= na->num_tx_rings)
4092 			return NULL;
4093 
4094 		kring = na->tx_rings[n];
4095 
4096 		if (kring->nr_pending_mode == NKR_NETMAP_OFF) {
4097 			kring->nr_mode = NKR_NETMAP_OFF;
4098 			return NULL;
4099 		}
4100 
4101 		// XXX check whether we should use hwcur or rcur
4102 		new_hwofs = kring->nr_hwcur - new_cur;
4103 	} else {
4104 		if (n >= na->num_rx_rings)
4105 			return NULL;
4106 		kring = na->rx_rings[n];
4107 
4108 		if (kring->nr_pending_mode == NKR_NETMAP_OFF) {
4109 			kring->nr_mode = NKR_NETMAP_OFF;
4110 			return NULL;
4111 		}
4112 
4113 		new_hwofs = kring->nr_hwtail - new_cur;
4114 	}
4115 	lim = kring->nkr_num_slots - 1;
4116 	if (new_hwofs > lim)
4117 		new_hwofs -= lim + 1;
4118 
4119 	/* Always set the new offset value and realign the ring. */
4120 	if (netmap_debug & NM_DEBUG_ON)
4121 	    nm_prinf("%s %s%d hwofs %d -> %d, hwtail %d -> %d",
4122 		na->name,
4123 		tx == NR_TX ? "TX" : "RX", n,
4124 		kring->nkr_hwofs, new_hwofs,
4125 		kring->nr_hwtail,
4126 		tx == NR_TX ? lim : kring->nr_hwtail);
4127 	kring->nkr_hwofs = new_hwofs;
4128 	if (tx == NR_TX) {
4129 		kring->nr_hwtail = kring->nr_hwcur + lim;
4130 		if (kring->nr_hwtail > lim)
4131 			kring->nr_hwtail -= lim + 1;
4132 	}
4133 
4134 	/*
4135 	 * Wakeup on the individual and global selwait
4136 	 * We do the wakeup here, but the ring is not yet reconfigured.
4137 	 * However, we are under lock so there are no races.
4138 	 */
4139 	kring->nr_mode = NKR_NETMAP_ON;
4140 	kring->nm_notify(kring, 0);
4141 	return kring->ring->slot;
4142 }
4143 
4144 
4145 /*
4146  * Dispatch rx/tx interrupts to the netmap rings.
4147  *
4148  * "work_done" is non-null on the RX path, NULL for the TX path.
4149  * We rely on the OS to make sure that there is only one active
4150  * instance per queue, and that there is appropriate locking.
4151  *
4152  * The 'notify' routine depends on what the ring is attached to.
4153  * - for a netmap file descriptor, do a selwakeup on the individual
4154  *   waitqueue, plus one on the global one if needed
4155  *   (see netmap_notify)
4156  * - for a nic connected to a switch, call the proper forwarding routine
4157  *   (see netmap_bwrap_intr_notify)
4158  */
4159 int
4160 netmap_common_irq(struct netmap_adapter *na, u_int q, u_int *work_done)
4161 {
4162 	struct netmap_kring *kring;
4163 	enum txrx t = (work_done ? NR_RX : NR_TX);
4164 
4165 	q &= NETMAP_RING_MASK;
4166 
4167 	if (netmap_debug & (NM_DEBUG_RXINTR|NM_DEBUG_TXINTR)) {
4168 	        nm_prlim(5, "received %s queue %d", work_done ? "RX" : "TX" , q);
4169 	}
4170 
4171 	if (q >= nma_get_nrings(na, t))
4172 		return NM_IRQ_PASS; // not a physical queue
4173 
4174 	kring = NMR(na, t)[q];
4175 
4176 	if (kring->nr_mode == NKR_NETMAP_OFF) {
4177 		return NM_IRQ_PASS;
4178 	}
4179 
4180 	if (t == NR_RX) {
4181 		kring->nr_kflags |= NKR_PENDINTR;	// XXX atomic ?
4182 		*work_done = 1; /* do not fire napi again */
4183 	}
4184 
4185 	return kring->nm_notify(kring, 0);
4186 }
4187 
4188 
4189 /*
4190  * Default functions to handle rx/tx interrupts from a physical device.
4191  * "work_done" is non-null on the RX path, NULL for the TX path.
4192  *
4193  * If the card is not in netmap mode, simply return NM_IRQ_PASS,
4194  * so that the caller proceeds with regular processing.
4195  * Otherwise call netmap_common_irq().
4196  *
4197  * If the card is connected to a netmap file descriptor,
4198  * do a selwakeup on the individual queue, plus one on the global one
4199  * if needed (multiqueue card _and_ there are multiqueue listeners),
4200  * and return NR_IRQ_COMPLETED.
4201  *
4202  * Finally, if called on rx from an interface connected to a switch,
4203  * calls the proper forwarding routine.
4204  */
4205 int
4206 netmap_rx_irq(struct ifnet *ifp, u_int q, u_int *work_done)
4207 {
4208 	struct netmap_adapter *na = NA(ifp);
4209 
4210 	/*
4211 	 * XXX emulated netmap mode sets NAF_SKIP_INTR so
4212 	 * we still use the regular driver even though the previous
4213 	 * check fails. It is unclear whether we should use
4214 	 * nm_native_on() here.
4215 	 */
4216 	if (!nm_netmap_on(na))
4217 		return NM_IRQ_PASS;
4218 
4219 	if (na->na_flags & NAF_SKIP_INTR) {
4220 		nm_prdis("use regular interrupt");
4221 		return NM_IRQ_PASS;
4222 	}
4223 
4224 	return netmap_common_irq(na, q, work_done);
4225 }
4226 
4227 /* set/clear native flags and if_transmit/netdev_ops */
4228 void
4229 nm_set_native_flags(struct netmap_adapter *na)
4230 {
4231 	struct ifnet *ifp = na->ifp;
4232 
4233 	/* We do the setup for intercepting packets only if we are the
4234 	 * first user of this adapapter. */
4235 	if (na->active_fds > 0) {
4236 		return;
4237 	}
4238 
4239 	na->na_flags |= NAF_NETMAP_ON;
4240 	nm_os_onenter(ifp);
4241 	nm_update_hostrings_mode(na);
4242 }
4243 
4244 void
4245 nm_clear_native_flags(struct netmap_adapter *na)
4246 {
4247 	struct ifnet *ifp = na->ifp;
4248 
4249 	/* We undo the setup for intercepting packets only if we are the
4250 	 * last user of this adapter. */
4251 	if (na->active_fds > 0) {
4252 		return;
4253 	}
4254 
4255 	nm_update_hostrings_mode(na);
4256 	nm_os_onexit(ifp);
4257 
4258 	na->na_flags &= ~NAF_NETMAP_ON;
4259 }
4260 
4261 void
4262 netmap_krings_mode_commit(struct netmap_adapter *na, int onoff)
4263 {
4264 	enum txrx t;
4265 
4266 	for_rx_tx(t) {
4267 		int i;
4268 
4269 		for (i = 0; i < netmap_real_rings(na, t); i++) {
4270 			struct netmap_kring *kring = NMR(na, t)[i];
4271 
4272 			if (onoff && nm_kring_pending_on(kring))
4273 				kring->nr_mode = NKR_NETMAP_ON;
4274 			else if (!onoff && nm_kring_pending_off(kring))
4275 				kring->nr_mode = NKR_NETMAP_OFF;
4276 		}
4277 	}
4278 }
4279 
4280 /*
4281  * Module loader and unloader
4282  *
4283  * netmap_init() creates the /dev/netmap device and initializes
4284  * all global variables. Returns 0 on success, errno on failure
4285  * (but there is no chance)
4286  *
4287  * netmap_fini() destroys everything.
4288  */
4289 
4290 static struct cdev *netmap_dev; /* /dev/netmap character device. */
4291 extern struct cdevsw netmap_cdevsw;
4292 
4293 
4294 void
4295 netmap_fini(void)
4296 {
4297 	if (netmap_dev)
4298 		destroy_dev(netmap_dev);
4299 	/* we assume that there are no longer netmap users */
4300 	nm_os_ifnet_fini();
4301 	netmap_uninit_bridges();
4302 	netmap_mem_fini();
4303 	NMG_LOCK_DESTROY();
4304 	nm_prinf("netmap: unloaded module.");
4305 }
4306 
4307 
4308 int
4309 netmap_init(void)
4310 {
4311 	int error;
4312 
4313 	NMG_LOCK_INIT();
4314 
4315 	error = netmap_mem_init();
4316 	if (error != 0)
4317 		goto fail;
4318 	/*
4319 	 * MAKEDEV_ETERNAL_KLD avoids an expensive check on syscalls
4320 	 * when the module is compiled in.
4321 	 * XXX could use make_dev_credv() to get error number
4322 	 */
4323 	netmap_dev = make_dev_credf(MAKEDEV_ETERNAL_KLD,
4324 		&netmap_cdevsw, 0, NULL, UID_ROOT, GID_WHEEL, 0600,
4325 			      "netmap");
4326 	if (!netmap_dev)
4327 		goto fail;
4328 
4329 	error = netmap_init_bridges();
4330 	if (error)
4331 		goto fail;
4332 
4333 #ifdef __FreeBSD__
4334 	nm_os_vi_init_index();
4335 #endif
4336 
4337 	error = nm_os_ifnet_init();
4338 	if (error)
4339 		goto fail;
4340 
4341 	nm_prinf("netmap: loaded module");
4342 	return (0);
4343 fail:
4344 	netmap_fini();
4345 	return (EINVAL); /* may be incorrect */
4346 }
4347