xref: /freebsd/sys/dev/netmap/netmap.c (revision d34048812292b714a0bf99967270d18fe3097c62)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (C) 2011-2014 Matteo Landi
5  * Copyright (C) 2011-2016 Luigi Rizzo
6  * Copyright (C) 2011-2016 Giuseppe Lettieri
7  * Copyright (C) 2011-2016 Vincenzo Maffione
8  * All rights reserved.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  *   1. Redistributions of source code must retain the above copyright
14  *      notice, this list of conditions and the following disclaimer.
15  *   2. Redistributions in binary form must reproduce the above copyright
16  *      notice, this list of conditions and the following disclaimer in the
17  *      documentation and/or other materials provided with the distribution.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
20  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
23  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
25  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
26  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
29  * SUCH DAMAGE.
30  */
31 
32 
33 /*
34  * $FreeBSD$
35  *
36  * This module supports memory mapped access to network devices,
37  * see netmap(4).
38  *
39  * The module uses a large memory pool allocated by the kernel
40  * and accessible as mmapped memory by multiple userspace threads/processes.
41  * The memory pool contains packet buffers and "netmap rings",
42  * i.e. user-accessible copies of the interface's queues.
43  *
44  * Access to the network card works like this:
45  * 1. a process/thread issues one or more open() on /dev/netmap, to create
46  *    a select()able file descriptor on which events are reported.
47  * 2. on each descriptor, the process issues an ioctl() to identify
48  *    the interface that should report events to the file descriptor.
49  * 3. on each descriptor, the process issues an mmap() request to
50  *    map the shared memory region within the process' address space.
51  *    The list of interesting queues is indicated by a location in
52  *    the shared memory region.
53  * 4. using the functions in the netmap(4) userspace API, a process
54  *    can look up the occupation state of a queue, access memory buffers,
55  *    and retrieve received packets or enqueue packets to transmit.
56  * 5. using some ioctl()s the process can synchronize the userspace view
57  *    of the queue with the actual status in the kernel. This includes both
58  *    receiving the notification of new packets, and transmitting new
59  *    packets on the output interface.
60  * 6. select() or poll() can be used to wait for events on individual
61  *    transmit or receive queues (or all queues for a given interface).
62  *
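 *    As a rough, illustrative sketch of steps 1-6 (most real applications use
 *    nm_open() from netmap_user.h, which hides these details; the struct and
 *    field names below follow the legacy nmreq API described in netmap(4)):
 *
 *	fd = open("/dev/netmap", O_RDWR);                  // step 1
 *	strncpy(req.nr_name, "em0", sizeof(req.nr_name));
 *	ioctl(fd, NIOCREGIF, &req);                        // step 2: bind to em0
 *	mem = mmap(NULL, req.nr_memsize, PROT_READ | PROT_WRITE,
 *	           MAP_SHARED, fd, 0);                     // step 3: map rings/buffers
 *	nifp = NETMAP_IF(mem, req.nr_offset);              // step 4: find the rings
 *	poll(&pfd, 1, -1);                                 // step 6: wait for events
 *	ioctl(fd, NIOCRXSYNC, NULL);                       // step 5: sync the rx view
 *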
63 
64 		SYNCHRONIZATION (USER)
65 
66 The netmap rings and data structures may be shared among multiple
67 user threads or even independent processes.
68 Any synchronization among those threads/processes is delegated
69 to the threads themselves. Only one thread at a time can be in
70 a system call on the same netmap ring. The OS does not enforce
71 this and only guarantees against system crashes in case of
72 invalid usage.
73 
74 		LOCKING (INTERNAL)
75 
76 Within the kernel, access to the netmap rings is protected as follows:
77 
78 - a spinlock on each ring, to handle producer/consumer races on
79   RX rings attached to the host stack (against multiple host
80   threads writing from the host stack to the same ring),
81   and on 'destination' rings attached to a VALE switch
82   (i.e. RX rings in VALE ports, and TX rings in NIC/host ports)
83   protecting multiple active senders for the same destination.
84 
85 - an atomic variable to guarantee that there is at most one
86   instance of *_*xsync() on the ring at any time.
87   For rings connected to user file
88   descriptors, an atomic_test_and_set() protects this, and the
89   lock on the ring is not actually used.
90   For NIC RX rings connected to a VALE switch, an atomic_test_and_set()
91   is also used to prevent multiple executions (the driver might indeed
92   already guarantee this).
93   For NIC TX rings connected to a VALE switch, the lock arbitrates
94   access to the queue (both when allocating buffers and when pushing
95   them out).
96 
97 - *xsync() should be protected against initializations of the card.
98   On FreeBSD most devices have the reset routine protected by
99   a RING lock (ixgbe, igb, em) or core lock (re). lem is missing
100   the RING protection on rx_reset(); this should be added.
101 
102   On linux there is an external lock on the tx path, which probably
103   also arbitrates access to the reset routine. XXX to be revised
104 
105 - a per-interface core_lock protecting access from the host stack
106   while interfaces may be detached from netmap mode.
107   XXX there should be no need for this lock if we detach the interfaces
108   only while they are down.
109 
110 
111 --- VALE SWITCH ---
112 
113 NMG_LOCK() serializes all modifications to switches and ports.
114 A switch cannot be deleted until all ports are gone.
115 
116 For each switch, an SX lock (RWlock on linux) protects
117 deletion of ports. When configuring or deleting a port, the
118 lock is acquired in exclusive mode (after holding NMG_LOCK).
119 When forwarding, the lock is acquired in shared mode (without NMG_LOCK).
120 The lock is held throughout the entire forwarding cycle,
121 during which the thread may incur a page fault.
122 Hence it is important that sleepable shared locks are used.
123 
124 On the rx ring, the per-port lock is grabbed initially to reserve
125 a number of slots in the ring, then the lock is released,
126 packets are copied from source to destination, and then
127 the lock is acquired again and the receive ring is updated.
128 (A similar thing is done on the tx ring for NIC and host stack
129 ports attached to the switch.)
130 
131  */
132 
133 
134 /* --- internals ----
135  *
136  * Roadmap to the code that implements the above.
137  *
138  * > 1. a process/thread issues one or more open() on /dev/netmap, to create
139  * >    a select()able file descriptor on which events are reported.
140  *
141  *  	Internally, we allocate a netmap_priv_d structure, which will be
142  *  	initialized on ioctl(NIOCREGIF). There is one netmap_priv_d
143  *  	structure for each open().
144  *
145  *      os-specific:
146  *  	    FreeBSD: see netmap_open() (netmap_freebsd.c)
147  *  	    linux:   see linux_netmap_open() (netmap_linux.c)
148  *
149  * > 2. on each descriptor, the process issues an ioctl() to identify
150  * >    the interface that should report events to the file descriptor.
151  *
152  * 	Implemented by netmap_ioctl(), NIOCREGIF case, with nmr->nr_cmd==0.
153  * 	Most important things happen in netmap_get_na() and
154  * 	netmap_do_regif(), called from there. Additional details can be
155  * 	found in the comments above those functions.
156  *
157  * 	In all cases, this action creates/takes-a-reference-to a
158  * 	netmap_*_adapter describing the port, and allocates a netmap_if
159  * 	and all necessary netmap rings, filling them with netmap buffers.
160  *
161  *      In this phase, the sync callbacks for each ring are set (these are used
162  *      in steps 5 and 6 below).  The callbacks depend on the type of adapter.
163  *      The adapter creation/initialization code puts them in the
164  * 	netmap_adapter (fields na->nm_txsync and na->nm_rxsync).  Then, they
165  * 	are copied from there to the netmap_kring's during netmap_do_regif(), by
166  * 	the nm_krings_create() callback.  All the nm_krings_create callbacks
167  * 	actually call netmap_krings_create() to perform this and the other
168  * 	common stuff. netmap_krings_create() also takes care of the host rings,
169  * 	if needed, by setting their sync callbacks appropriately.
170  *
171  * 	Additional actions depend on the kind of netmap_adapter that has been
172  * 	registered:
173  *
174  * 	- netmap_hw_adapter:  	     [netmap.c]
175  * 	     This is a system netdev/ifp with native netmap support.
176  * 	     The ifp is detached from the host stack by redirecting:
177  * 	       - transmissions (from the network stack) to netmap_transmit()
178  * 	       - receive notifications to the nm_notify() callback for
179  * 	         this adapter. The callback is normally netmap_notify(), unless
180  * 	         the ifp is attached to a bridge using bwrap, in which case it
181  * 	         is netmap_bwrap_intr_notify().
182  *
183  * 	- netmap_generic_adapter:      [netmap_generic.c]
184  * 	      A system netdev/ifp without native netmap support.
185  *
186  * 	(the decision about native/non-native support is taken in
187  * 	 netmap_get_hw_na(), called by netmap_get_na())
188  *
189  * 	- netmap_vp_adapter 		[netmap_vale.c]
190  * 	      Returned by netmap_get_bdg_na().
191  * 	      This is a persistent or ephemeral VALE port. Ephemeral ports
192  * 	      are created on the fly if they don't already exist, and are
193  * 	      always attached to a bridge.
194  * 	      Persistent VALE ports must be created separately, and
195  * 	      then attached like normal NICs. The NIOCREGIF we are examining
196  * 	      will find them only if they had previously been created and
197  * 	      attached (see VALE_CTL below).
198  *
199  * 	- netmap_pipe_adapter 	      [netmap_pipe.c]
200  * 	      Returned by netmap_get_pipe_na().
201  * 	      Both pipe ends are created, if they didn't already exist.
202  *
203  * 	- netmap_monitor_adapter      [netmap_monitor.c]
204  * 	      Returned by netmap_get_monitor_na().
205  * 	      If successful, the nm_sync callbacks of the monitored adapter
206  * 	      will be intercepted by the returned monitor.
207  *
208  * 	- netmap_bwrap_adapter	      [netmap_vale.c]
209  * 	      Cannot be obtained in this way; see VALE_CTL below.
210  *
211  *
212  * 	os-specific:
213  * 	    linux: we first go through linux_netmap_ioctl() to
214  * 	           adapt the FreeBSD interface to the linux one.
215  *
216  *
217  * > 3. on each descriptor, the process issues an mmap() request to
218  * >    map the shared memory region within the process' address space.
219  * >    The list of interesting queues is indicated by a location in
220  * >    the shared memory region.
221  *
222  *      os-specific:
223  *  	    FreeBSD: netmap_mmap_single (netmap_freebsd.c).
224  *  	    linux:   linux_netmap_mmap (netmap_linux.c).
225  *
226  * > 4. using the functions in the netmap(4) userspace API, a process
227  * >    can look up the occupation state of a queue, access memory buffers,
228  * >    and retrieve received packets or enqueue packets to transmit.
229  *
230  * 	These actions do not involve the kernel.
231  *
232  * > 5. using some ioctl()s the process can synchronize the userspace view
233  * >    of the queue with the actual status in the kernel. This includes both
234  * >    receiving the notification of new packets, and transmitting new
235  * >    packets on the output interface.
236  *
237  * 	These are implemented in netmap_ioctl(), NIOCTXSYNC and NIOCRXSYNC
238  * 	cases. They invoke the nm_sync callbacks on the netmap_kring
239  * 	structures, as initialized in step 2 and maybe later modified
240  * 	by a monitor. Monitors, however, will always call the original
241  * 	callback before doing anything else.
242  *
243  *
244  * > 6. select() or poll() can be used to wait for events on individual
245  * >    transmit or receive queues (or all queues for a given interface).
246  *
247  * 	Implemented in netmap_poll(). This will call the same nm_sync()
248  * 	callbacks as in step 5 above.
249  *
250  * 	os-specific:
251  * 		linux: we first go through linux_netmap_poll() to adapt
252  * 		       the FreeBSD interface to the linux one.
253  *
254  *
255  *  ----  VALE_CTL -----
256  *
257  *  VALE switches are controlled by issuing a NIOCREGIF with a non-null
258  *  nr_cmd in the nmreq structure. These subcommands are handled by
259  *  netmap_bdg_ctl() in netmap_vale.c. Persistent VALE ports are created
260  *  and destroyed by issuing the NETMAP_BDG_NEWIF and NETMAP_BDG_DELIF
261  *  subcommands, respectively.
262  *
263  *  Any network interface known to the system (including a persistent VALE
264  *  port) can be attached to a VALE switch by issuing the
265  *  NETMAP_REQ_VALE_ATTACH command. After the attachment, persistent VALE ports
266  *  look exactly like ephemeral VALE ports (as created in step 2 above).  The
267  *  attachment of other interfaces, instead, requires the creation of a
268  *  netmap_bwrap_adapter.  Moreover, the attached interface must be put in
269  *  netmap mode. This may require the creation of a netmap_generic_adapter if
270  *  we have no native support for the interface, or if generic adapters have
271  *  been forced by sysctl.
272  *
273  *  Both persistent VALE ports and bwraps are handled by netmap_get_bdg_na(),
274  *  called by nm_bdg_ctl_attach(), and discriminated by the nm_bdg_attach()
275  *  callback.  In the case of the bwrap, the callback creates the
276  *  netmap_bwrap_adapter.  The initialization of the bwrap is then
277  *  completed by calling netmap_do_regif() on it, in the nm_bdg_ctl()
278  *  callback (netmap_bwrap_bdg_ctl in netmap_vale.c).
279  *  A generic adapter for the wrapped ifp will be created if needed, when
280  *  netmap_get_bdg_na() calls netmap_get_hw_na().
281  *
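 *  As a hedged illustration (the command and flag spellings belong to the
 *  vale-ctl tool distributed with netmap and may differ): a persistent port
 *  could be created with "vale-ctl -n p0" (NETMAP_BDG_NEWIF) and attached
 *  with "vale-ctl -a vale0:p0", while simply opening a port name of the form
 *  "vale0:em0" attaches em0 (creating a bwrap, or an ephemeral port if no
 *  such interface exists) on the fly.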
282  *
283  *  ---- DATAPATHS -----
284  *
285  *              -= SYSTEM DEVICE WITH NATIVE SUPPORT =-
286  *
287  *    na == NA(ifp) == netmap_hw_adapter created in DEVICE_netmap_attach()
288  *
289  *    - tx from netmap userspace:
290  *	 concurrently:
291  *           1) ioctl(NIOCTXSYNC)/netmap_poll() in process context
292  *                kring->nm_sync() == DEVICE_netmap_txsync()
293  *           2) device interrupt handler
294  *                na->nm_notify()  == netmap_notify()
295  *    - rx from netmap userspace:
296  *       concurrently:
297  *           1) ioctl(NIOCRXSYNC)/netmap_poll() in process context
298  *                kring->nm_sync() == DEVICE_netmap_rxsync()
299  *           2) device interrupt handler
300  *                na->nm_notify()  == netmap_notify()
301  *    - rx from host stack
302  *       concurrently:
303  *           1) host stack
304  *                netmap_transmit()
305  *                  na->nm_notify  == netmap_notify()
306  *           2) ioctl(NIOCRXSYNC)/netmap_poll() in process context
307  *                kring->nm_sync() == netmap_rxsync_from_host
308  *                  netmap_rxsync_from_host(na, NULL, NULL)
309  *    - tx to host stack
310  *           ioctl(NIOCTXSYNC)/netmap_poll() in process context
311  *             kring->nm_sync() == netmap_txsync_to_host
312  *               netmap_txsync_to_host(na)
313  *                 nm_os_send_up()
314  *                   FreeBSD: na->if_input() == ether_input()
315  *                   linux: netif_rx() with NM_MAGIC_PRIORITY_RX
316  *
317  *
318  *               -= SYSTEM DEVICE WITH GENERIC SUPPORT =-
319  *
320  *    na == NA(ifp) == generic_netmap_adapter created in generic_netmap_attach()
321  *
322  *    - tx from netmap userspace:
323  *       concurrently:
324  *           1) ioctl(NIOCTXSYNC)/netmap_poll() in process context
325  *               kring->nm_sync() == generic_netmap_txsync()
326  *                   nm_os_generic_xmit_frame()
327  *                       linux:   dev_queue_xmit() with NM_MAGIC_PRIORITY_TX
328  *                           ifp->ndo_start_xmit == generic_ndo_start_xmit()
329  *                               gna->save_start_xmit == orig. dev. start_xmit
330  *                       FreeBSD: na->if_transmit() == orig. dev if_transmit
331  *           2) generic_mbuf_destructor()
332  *                   na->nm_notify() == netmap_notify()
333  *    - rx from netmap userspace:
334  *           1) ioctl(NIOCRXSYNC)/netmap_poll() in process context
335  *               kring->nm_sync() == generic_netmap_rxsync()
336  *                   mbq_safe_dequeue()
337  *           2) device driver
338  *               generic_rx_handler()
339  *                   mbq_safe_enqueue()
340  *                   na->nm_notify() == netmap_notify()
341  *    - rx from host stack
342  *        FreeBSD: same as native
343  *        Linux: same as native except:
344  *           1) host stack
345  *               dev_queue_xmit() without NM_MAGIC_PRIORITY_TX
346  *                   ifp->ndo_start_xmit == generic_ndo_start_xmit()
347  *                       netmap_transmit()
348  *                           na->nm_notify() == netmap_notify()
349  *    - tx to host stack (same as native):
350  *
351  *
352  *                           -= VALE =-
353  *
354  *   INCOMING:
355  *
356  *      - VALE ports:
357  *          ioctl(NIOCTXSYNC)/netmap_poll() in process context
358  *              kring->nm_sync() == netmap_vp_txsync()
359  *
360  *      - system device with native support:
361  *         from cable:
362  *             interrupt
363  *                na->nm_notify() == netmap_bwrap_intr_notify(ring_nr != host ring)
364  *                     kring->nm_sync() == DEVICE_netmap_rxsync()
365  *                     netmap_vp_txsync()
366  *                     kring->nm_sync() == DEVICE_netmap_rxsync()
367  *         from host stack:
368  *             netmap_transmit()
369  *                na->nm_notify() == netmap_bwrap_intr_notify(ring_nr == host ring)
370  *                     kring->nm_sync() == netmap_rxsync_from_host()
371  *                     netmap_vp_txsync()
372  *
373  *      - system device with generic support:
374  *         from device driver:
375  *            generic_rx_handler()
376  *                na->nm_notify() == netmap_bwrap_intr_notify(ring_nr != host ring)
377  *                     kring->nm_sync() == generic_netmap_rxsync()
378  *                     netmap_vp_txsync()
379  *                     kring->nm_sync() == generic_netmap_rxsync()
380  *         from host stack:
381  *            netmap_transmit()
382  *                na->nm_notify() == netmap_bwrap_intr_notify(ring_nr == host ring)
383  *                     kring->nm_sync() == netmap_rxsync_from_host()
384  *                     netmap_vp_txsync()
385  *
386  *   (all cases) --> nm_bdg_flush()
387  *                      dest_na->nm_notify() == (see below)
388  *
389  *   OUTGOING:
390  *
391  *      - VALE ports:
392  *         concurrently:
393  *             1) ioctl(NIOCRXSYNC)/netmap_poll() in process context
394  *                    kring->nm_sync() == netmap_vp_rxsync()
395  *             2) from nm_bdg_flush()
396  *                    na->nm_notify() == netmap_notify()
397  *
398  *      - system device with native support:
399  *          to cable:
400  *             na->nm_notify() == netmap_bwrap_notify()
401  *                 netmap_vp_rxsync()
402  *                 kring->nm_sync() == DEVICE_netmap_txsync()
403  *                 netmap_vp_rxsync()
404  *          to host stack:
405  *                 netmap_vp_rxsync()
406  *                 kring->nm_sync() == netmap_txsync_to_host
407  *                 netmap_vp_rxsync_locked()
408  *
409  *      - system device with generic adapter:
410  *          to device driver:
411  *             na->nm_notify() == netmap_bwrap_notify()
412  *                 netmap_vp_rxsync()
413  *                 kring->nm_sync() == generic_netmap_txsync()
414  *                 netmap_vp_rxsync()
415  *          to host stack:
416  *                 netmap_vp_rxsync()
417  *                 kring->nm_sync() == netmap_txsync_to_host
418  *                 netmap_vp_rxsync()
419  *
420  */
421 
422 /*
423  * OS-specific code that is used only within this file.
424  * Other OS-specific code that must be accessed by drivers
425  * is present in netmap_kern.h
426  */
427 
428 #if defined(__FreeBSD__)
429 #include <sys/cdefs.h> /* prerequisite */
430 #include <sys/types.h>
431 #include <sys/errno.h>
432 #include <sys/param.h>	/* defines used in kernel.h */
433 #include <sys/kernel.h>	/* types used in module initialization */
434 #include <sys/conf.h>	/* cdevsw struct, UID, GID */
435 #include <sys/filio.h>	/* FIONBIO */
436 #include <sys/sockio.h>
437 #include <sys/socketvar.h>	/* struct socket */
438 #include <sys/malloc.h>
439 #include <sys/poll.h>
440 #include <sys/rwlock.h>
441 #include <sys/socket.h> /* sockaddrs */
442 #include <sys/selinfo.h>
443 #include <sys/sysctl.h>
444 #include <sys/jail.h>
445 #include <net/vnet.h>
446 #include <net/if.h>
447 #include <net/if_var.h>
448 #include <net/bpf.h>		/* BIOCIMMEDIATE */
449 #include <machine/bus.h>	/* bus_dmamap_* */
450 #include <sys/endian.h>
451 #include <sys/refcount.h>
452 
453 
454 #elif defined(linux)
455 
456 #include "bsd_glue.h"
457 
458 #elif defined(__APPLE__)
459 
460 #warning OSX support is only partial
461 #include "osx_glue.h"
462 
463 #elif defined (_WIN32)
464 
465 #include "win_glue.h"
466 
467 #else
468 
469 #error	Unsupported platform
470 
471 #endif /* unsupported */
472 
473 /*
474  * common headers
475  */
476 #include <net/netmap.h>
477 #include <dev/netmap/netmap_kern.h>
478 #include <dev/netmap/netmap_mem2.h>
479 
480 
481 /* user-controlled variables */
482 int netmap_verbose;
483 
484 static int netmap_no_timestamp; /* don't timestamp on rxsync */
485 int netmap_no_pendintr = 1;
486 int netmap_txsync_retry = 2;
487 static int netmap_fwd = 0;	/* force transparent forwarding */
488 
489 /*
490  * netmap_admode selects the netmap mode to use.
491  * Invalid values are reset to NETMAP_ADMODE_BEST
492  */
493 enum {	NETMAP_ADMODE_BEST = 0,	/* use native, fallback to generic */
494 	NETMAP_ADMODE_NATIVE,	/* either native or none */
495 	NETMAP_ADMODE_GENERIC,	/* force generic */
496 	NETMAP_ADMODE_LAST };
497 static int netmap_admode = NETMAP_ADMODE_BEST;
498 
499 /* netmap_generic_mit controls mitigation of RX notifications for
500  * the generic netmap adapter. The value is a time interval in
501  * nanoseconds. */
502 int netmap_generic_mit = 100*1000;
503 
504 /* By default we use netmap-aware qdiscs with generic netmap adapters,
505  * even though there can be a small performance hit with hardware NICs.
506  * However, using the qdisc is the safer approach, for two reasons:
507  * 1) it prevents non-fifo qdiscs from breaking the TX notification
508  *    scheme, which is based on mbuf destructors when txqdisc is
509  *    not used.
510  * 2) it makes it possible to transmit over software devices that
511  *    change skb->dev, like bridge, veth, ...
512  *
513  * Anyway, users looking for the best performance should
514  * use native adapters.
515  */
516 #ifdef linux
517 int netmap_generic_txqdisc = 1;
518 #endif
519 
520 /* Default number of slots and queues for generic adapters. */
521 int netmap_generic_ringsize = 1024;
522 int netmap_generic_rings = 1;
523 
524 /* Non-zero to enable checksum offloading in NIC drivers */
525 int netmap_generic_hwcsum = 0;
526 
527 /* Non-zero if ptnet devices are allowed to use virtio-net headers. */
528 int ptnet_vnet_hdr = 1;
529 
530 /* 0 if ptnetmap should not use worker threads for TX processing */
531 int ptnetmap_tx_workers = 1;
532 
533 /*
534  * SYSCTL calls are grouped between SYSBEGIN and SYSEND to be emulated
535  * in some other operating systems
536  */
537 SYSBEGIN(main_init);
538 
539 SYSCTL_DECL(_dev_netmap);
540 SYSCTL_NODE(_dev, OID_AUTO, netmap, CTLFLAG_RW, 0, "Netmap args");
541 SYSCTL_INT(_dev_netmap, OID_AUTO, verbose,
542 		CTLFLAG_RW, &netmap_verbose, 0, "Verbose mode");
543 SYSCTL_INT(_dev_netmap, OID_AUTO, no_timestamp,
544 		CTLFLAG_RW, &netmap_no_timestamp, 0, "no_timestamp");
545 SYSCTL_INT(_dev_netmap, OID_AUTO, no_pendintr, CTLFLAG_RW, &netmap_no_pendintr,
546 		0, "Always look for new received packets.");
547 SYSCTL_INT(_dev_netmap, OID_AUTO, txsync_retry, CTLFLAG_RW,
548 		&netmap_txsync_retry, 0, "Number of txsync loops in bridge's flush.");
549 
550 SYSCTL_INT(_dev_netmap, OID_AUTO, fwd, CTLFLAG_RW, &netmap_fwd, 0,
551 		"Force NR_FORWARD mode");
552 SYSCTL_INT(_dev_netmap, OID_AUTO, admode, CTLFLAG_RW, &netmap_admode, 0,
553 		"Adapter mode. 0 selects the best option available, "
554 		"1 forces native adapter, 2 forces emulated adapter");
555 SYSCTL_INT(_dev_netmap, OID_AUTO, generic_hwcsum, CTLFLAG_RW, &netmap_generic_hwcsum,
556 		0, "Hardware checksums. 0 to disable checksum generation by the NIC (default), "
557 		"1 to enable checksum generation by the NIC");
558 SYSCTL_INT(_dev_netmap, OID_AUTO, generic_mit, CTLFLAG_RW, &netmap_generic_mit,
559 		0, "RX notification interval in nanoseconds");
560 SYSCTL_INT(_dev_netmap, OID_AUTO, generic_ringsize, CTLFLAG_RW,
561 		&netmap_generic_ringsize, 0,
562 		"Number of per-ring slots for emulated netmap mode");
563 SYSCTL_INT(_dev_netmap, OID_AUTO, generic_rings, CTLFLAG_RW,
564 		&netmap_generic_rings, 0,
565 		"Number of TX/RX queues for emulated netmap adapters");
566 #ifdef linux
567 SYSCTL_INT(_dev_netmap, OID_AUTO, generic_txqdisc, CTLFLAG_RW,
568 		&netmap_generic_txqdisc, 0, "Use qdisc for generic adapters");
569 #endif
570 SYSCTL_INT(_dev_netmap, OID_AUTO, ptnet_vnet_hdr, CTLFLAG_RW, &ptnet_vnet_hdr,
571 		0, "Allow ptnet devices to use virtio-net headers");
572 SYSCTL_INT(_dev_netmap, OID_AUTO, ptnetmap_tx_workers, CTLFLAG_RW,
573 		&ptnetmap_tx_workers, 0, "Use worker threads for ptnetmap TX processing");
574 
575 SYSEND;
576 
577 NMG_LOCK_T	netmap_global_lock;
578 
579 /*
580  * mark the ring as stopped, and run through the locks
581  * to make sure other users get to see it.
582  * stopped must be either NM_KR_STOPPED (for an unbounded stop)
583  * or NM_KR_LOCKED (a brief stop for mutual exclusion purposes)
584  */
585 static void
586 netmap_disable_ring(struct netmap_kring *kr, int stopped)
587 {
588 	nm_kr_stop(kr, stopped);
589 	// XXX check if nm_kr_stop is sufficient
590 	mtx_lock(&kr->q_lock);
591 	mtx_unlock(&kr->q_lock);
592 	nm_kr_put(kr);
593 }
594 
595 /* stop or enable a single ring */
596 void
597 netmap_set_ring(struct netmap_adapter *na, u_int ring_id, enum txrx t, int stopped)
598 {
599 	if (stopped)
600 		netmap_disable_ring(NMR(na, t)[ring_id], stopped);
601 	else
602 		NMR(na, t)[ring_id]->nkr_stopped = 0;
603 }
604 
605 
606 /* stop or enable all the rings of na */
607 void
608 netmap_set_all_rings(struct netmap_adapter *na, int stopped)
609 {
610 	int i;
611 	enum txrx t;
612 
613 	if (!nm_netmap_on(na))
614 		return;
615 
616 	for_rx_tx(t) {
617 		for (i = 0; i < netmap_real_rings(na, t); i++) {
618 			netmap_set_ring(na, i, t, stopped);
619 		}
620 	}
621 }
622 
623 /*
624  * Convenience function used in drivers.  Waits for current txsync()s/rxsync()s
625  * to finish and prevents any new one from starting.  Call this before turning
626  * netmap mode off, or before removing the hardware rings (e.g., on module
627  * unload).
628  */
629 void
630 netmap_disable_all_rings(struct ifnet *ifp)
631 {
632 	if (NM_NA_VALID(ifp)) {
633 		netmap_set_all_rings(NA(ifp), NM_KR_STOPPED);
634 	}
635 }
636 
637 /*
638  * Convenience function used in drivers.  Re-enables rxsync and txsync on the
639  * adapter's rings. In linux drivers, this should be placed near each
640  * napi_enable().
641  */
642 void
643 netmap_enable_all_rings(struct ifnet *ifp)
644 {
645 	if (NM_NA_VALID(ifp)) {
646 		netmap_set_all_rings(NA(ifp), 0 /* enabled */);
647 	}
648 }
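
/*
 * Hedged usage sketch for the two helpers above (driver and function names
 * are purely illustrative): a NIC driver that must reinitialize its hardware
 * rings while netmap may be active would bracket the critical section as
 *
 *	foo_reinit_rings(struct foo_softc *sc)
 *	{
 *		netmap_disable_all_rings(sc->ifp);  // wait for *_*xsync() to drain
 *		foo_reset_hw_rings(sc);             // hw rings can be touched safely
 *		netmap_enable_all_rings(sc->ifp);   // resume netmap operation
 *	}
 */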
649 
650 void
651 netmap_make_zombie(struct ifnet *ifp)
652 {
653 	if (NM_NA_VALID(ifp)) {
654 		struct netmap_adapter *na = NA(ifp);
655 		netmap_set_all_rings(na, NM_KR_LOCKED);
656 		na->na_flags |= NAF_ZOMBIE;
657 		netmap_set_all_rings(na, 0);
658 	}
659 }
660 
661 void
662 netmap_undo_zombie(struct ifnet *ifp)
663 {
664 	if (NM_NA_VALID(ifp)) {
665 		struct netmap_adapter *na = NA(ifp);
666 		if (na->na_flags & NAF_ZOMBIE) {
667 			netmap_set_all_rings(na, NM_KR_LOCKED);
668 			na->na_flags &= ~NAF_ZOMBIE;
669 			netmap_set_all_rings(na, 0);
670 		}
671 	}
672 }
673 
674 /*
675  * generic bound-checking function
676  */
677 u_int
678 nm_bound_var(u_int *v, u_int dflt, u_int lo, u_int hi, const char *msg)
679 {
680 	u_int oldv = *v;
681 	const char *op = NULL;
682 
683 	if (dflt < lo)
684 		dflt = lo;
685 	if (dflt > hi)
686 		dflt = hi;
687 	if (oldv < lo) {
688 		*v = dflt;
689 		op = "Bump";
690 	} else if (oldv > hi) {
691 		*v = hi;
692 		op = "Clamp";
693 	}
694 	if (op && msg)
695 		nm_prinf("%s %s to %d (was %d)\n", op, msg, *v, oldv);
696 	return *v;
697 }
698 
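/*
 * Hedged example of nm_bound_var() use (the variable and the bounds are
 * illustrative): a driver attach routine could sanitize a tunable with
 *
 *	nm_bound_var(&ring_size, 1024, 64, 16384, "ring size");
 *
 * which resets ring_size to 1024 if it was below 64, clamps it to 16384 if
 * it was above, and logs the adjustment via nm_prinf().
 */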
699 
700 /*
701  * packet-dump function, user-supplied or static buffer.
702  * The destination buffer must be at least 30+4*len
703  */
704 const char *
705 nm_dump_buf(char *p, int len, int lim, char *dst)
706 {
707 	static char _dst[8192];
708 	int i, j, i0;
709 	static char hex[] ="0123456789abcdef";
710 	char *o;	/* output position */
711 
712 #define P_HI(x)	hex[((x) & 0xf0)>>4]
713 #define P_LO(x)	hex[((x) & 0xf)]
714 #define P_C(x)	((x) >= 0x20 && (x) <= 0x7e ? (x) : '.')
715 	if (!dst)
716 		dst = _dst;
717 	if (lim <= 0 || lim > len)
718 		lim = len;
719 	o = dst;
720 	sprintf(o, "buf 0x%p len %d lim %d\n", p, len, lim);
721 	o += strlen(o);
722 	/* hexdump routine */
723 	for (i = 0; i < lim; ) {
724 		sprintf(o, "%5d: ", i);
725 		o += strlen(o);
726 		memset(o, ' ', 48);
727 		i0 = i;
728 		for (j=0; j < 16 && i < lim; i++, j++) {
729 			o[j*3] = P_HI(p[i]);
730 			o[j*3+1] = P_LO(p[i]);
731 		}
732 		i = i0;
733 		for (j=0; j < 16 && i < lim; i++, j++)
734 			o[j + 48] = P_C(p[i]);
735 		o[j+48] = '\n';
736 		o += j+49;
737 	}
738 	*o = '\0';
739 #undef P_HI
740 #undef P_LO
741 #undef P_C
742 	return dst;
743 }
744 
745 
746 /*
747  * Fetch configuration from the device, to cope with dynamic
748  * reconfigurations after loading the module.
749  */
750 /* call with NMG_LOCK held */
751 int
752 netmap_update_config(struct netmap_adapter *na)
753 {
754 	struct nm_config_info info;
755 
756 	bzero(&info, sizeof(info));
757 	if (na->nm_config == NULL ||
758 	    na->nm_config(na, &info)) {
759 		/* take whatever we had at init time */
760 		info.num_tx_rings = na->num_tx_rings;
761 		info.num_tx_descs = na->num_tx_desc;
762 		info.num_rx_rings = na->num_rx_rings;
763 		info.num_rx_descs = na->num_rx_desc;
764 		info.rx_buf_maxsize = na->rx_buf_maxsize;
765 	}
766 
767 	if (na->num_tx_rings == info.num_tx_rings &&
768 	    na->num_tx_desc == info.num_tx_descs &&
769 	    na->num_rx_rings == info.num_rx_rings &&
770 	    na->num_rx_desc == info.num_rx_descs &&
771 	    na->rx_buf_maxsize == info.rx_buf_maxsize)
772 		return 0; /* nothing changed */
773 	if (na->active_fds == 0) {
774 		na->num_tx_rings = info.num_tx_rings;
775 		na->num_tx_desc = info.num_tx_descs;
776 		na->num_rx_rings = info.num_rx_rings;
777 		na->num_rx_desc = info.num_rx_descs;
778 		na->rx_buf_maxsize = info.rx_buf_maxsize;
779 		D("configuration changed for %s: txring %d x %d, "
780 			"rxring %d x %d, rxbufsz %d",
781 			na->name, na->num_tx_rings, na->num_tx_desc,
782 			na->num_rx_rings, na->num_rx_desc, na->rx_buf_maxsize);
783 		return 0;
784 	}
785 	D("WARNING: configuration changed for %s while active: "
786 		"txring %d x %d, rxring %d x %d, rxbufsz %d",
787 		na->name, info.num_tx_rings, info.num_tx_descs,
788 		info.num_rx_rings, info.num_rx_descs,
789 		info.rx_buf_maxsize);
790 	return 1;
791 }
792 
793 /* nm_sync callbacks for the host rings */
794 static int netmap_txsync_to_host(struct netmap_kring *kring, int flags);
795 static int netmap_rxsync_from_host(struct netmap_kring *kring, int flags);
796 
797 /* create the krings array and initialize the fields common to all adapters.
798  * The array layout is this:
799  *
800  *                    +----------+
801  * na->tx_rings ----->|          | \
802  *                    |          |  } na->num_tx_rings
803  *                    |          | /
804  *                    +----------+
805  *                    |          |    host tx kring
806  * na->rx_rings ----> +----------+
807  *                    |          | \
808  *                    |          |  } na->num_rx_rings
809  *                    |          | /
810  *                    +----------+
811  *                    |          |    host rx kring
812  *                    +----------+
813  * na->tailroom ----->|          | \
814  *                    |          |  } tailroom bytes
815  *                    |          | /
816  *                    +----------+
817  *
818  * Note: for compatibility, host krings are created even when not needed.
819  * The tailroom space is currently used by vale ports for allocating leases.
820  */
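/*
 * For example, with the layout above, the first host TX kring (when host
 * rings are in use) is na->tx_rings[nma_get_nrings(na, NR_TX)], i.e. it sits
 * right after the hardware TX krings, and the host RX kring follows the
 * hardware RX krings in the same way.
 */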
821 /* call with NMG_LOCK held */
822 int
823 netmap_krings_create(struct netmap_adapter *na, u_int tailroom)
824 {
825 	u_int i, len, ndesc;
826 	struct netmap_kring *kring;
827 	u_int n[NR_TXRX];
828 	enum txrx t;
829 
830 	if (na->tx_rings != NULL) {
831 		D("warning: krings were already created");
832 		return 0;
833 	}
834 
835 	/* account for the (possibly fake) host rings */
836 	n[NR_TX] = netmap_all_rings(na, NR_TX);
837 	n[NR_RX] = netmap_all_rings(na, NR_RX);
838 
839 	len = (n[NR_TX] + n[NR_RX]) *
840 		(sizeof(struct netmap_kring) + sizeof(struct netmap_kring *))
841 		+ tailroom;
842 
843 	na->tx_rings = nm_os_malloc((size_t)len);
844 	if (na->tx_rings == NULL) {
845 		D("Cannot allocate krings");
846 		return ENOMEM;
847 	}
848 	na->rx_rings = na->tx_rings + n[NR_TX];
849 	na->tailroom = na->rx_rings + n[NR_RX];
850 
851 	/* link the krings in the krings array */
852 	kring = (struct netmap_kring *)((char *)na->tailroom + tailroom);
853 	for (i = 0; i < n[NR_TX] + n[NR_RX]; i++) {
854 		na->tx_rings[i] = kring;
855 		kring++;
856 	}
857 
858 	/*
859 	 * All fields in krings are 0 except the ones initialized below,
860 	 * but it is better to be explicit on the important kring fields.
861 	 */
862 	for_rx_tx(t) {
863 		ndesc = nma_get_ndesc(na, t);
864 		for (i = 0; i < n[t]; i++) {
865 			kring = NMR(na, t)[i];
866 			bzero(kring, sizeof(*kring));
867 			kring->na = na;
868 			kring->notify_na = na;
869 			kring->ring_id = i;
870 			kring->tx = t;
871 			kring->nkr_num_slots = ndesc;
872 			kring->nr_mode = NKR_NETMAP_OFF;
873 			kring->nr_pending_mode = NKR_NETMAP_OFF;
874 			if (i < nma_get_nrings(na, t)) {
875 				kring->nm_sync = (t == NR_TX ? na->nm_txsync : na->nm_rxsync);
876 			} else {
877 				if (!(na->na_flags & NAF_HOST_RINGS))
878 					kring->nr_kflags |= NKR_FAKERING;
879 				kring->nm_sync = (t == NR_TX ?
880 						netmap_txsync_to_host:
881 						netmap_rxsync_from_host);
882 			}
883 			kring->nm_notify = na->nm_notify;
884 			kring->rhead = kring->rcur = kring->nr_hwcur = 0;
885 			/*
886 			 * IMPORTANT: Always keep one slot empty.
887 			 */
888 			kring->rtail = kring->nr_hwtail = (t == NR_TX ? ndesc - 1 : 0);
889 			snprintf(kring->name, sizeof(kring->name) - 1, "%s %s%d", na->name,
890 					nm_txrx2str(t), i);
891 			ND("ktx %s h %d c %d t %d",
892 				kring->name, kring->rhead, kring->rcur, kring->rtail);
893 			mtx_init(&kring->q_lock, (t == NR_TX ? "nm_txq_lock" : "nm_rxq_lock"), NULL, MTX_DEF);
894 			nm_os_selinfo_init(&kring->si);
895 		}
896 		nm_os_selinfo_init(&na->si[t]);
897 	}
898 
899 
900 	return 0;
901 }
902 
903 
904 /* undo the actions performed by netmap_krings_create */
905 /* call with NMG_LOCK held */
906 void
907 netmap_krings_delete(struct netmap_adapter *na)
908 {
909 	struct netmap_kring **kring = na->tx_rings;
910 	enum txrx t;
911 
912 	if (na->tx_rings == NULL) {
913 		D("warning: krings were already deleted");
914 		return;
915 	}
916 
917 	for_rx_tx(t)
918 		nm_os_selinfo_uninit(&na->si[t]);
919 
920 	/* we rely on the krings layout described above */
921 	for ( ; kring != na->tailroom; kring++) {
922 		mtx_destroy(&(*kring)->q_lock);
923 		nm_os_selinfo_uninit(&(*kring)->si);
924 	}
925 	nm_os_free(na->tx_rings);
926 	na->tx_rings = na->rx_rings = na->tailroom = NULL;
927 }
928 
929 
930 /*
931  * Destructor for NIC ports. They also have an mbuf queue
932  * on the rings connected to the host so we need to purge
933  * them first.
934  */
935 /* call with NMG_LOCK held */
936 void
937 netmap_hw_krings_delete(struct netmap_adapter *na)
938 {
939 	u_int lim = netmap_real_rings(na, NR_RX), i;
940 
941 	for (i = nma_get_nrings(na, NR_RX); i < lim; i++) {
942 		struct mbq *q = &NMR(na, NR_RX)[i]->rx_queue;
943 		ND("destroy sw mbq with len %d", mbq_len(q));
944 		mbq_purge(q);
945 		mbq_safe_fini(q);
946 	}
947 	netmap_krings_delete(na);
948 }
949 
950 static void
951 netmap_mem_drop(struct netmap_adapter *na)
952 {
953 	int last = netmap_mem_deref(na->nm_mem, na);
954 	/* if the native allocator had been overridden on regif,
955 	 * restore it now and drop the temporary one
956 	 */
957 	if (last && na->nm_mem_prev) {
958 		netmap_mem_put(na->nm_mem);
959 		na->nm_mem = na->nm_mem_prev;
960 		na->nm_mem_prev = NULL;
961 	}
962 }
963 
964 /*
965  * Undo everything that was done in netmap_do_regif(). In particular,
966  * call nm_register(ifp,0) to stop netmap mode on the interface and
967  * revert to normal operation.
968  */
969 /* call with NMG_LOCK held */
970 static void netmap_unset_ringid(struct netmap_priv_d *);
971 static void netmap_krings_put(struct netmap_priv_d *);
972 void
973 netmap_do_unregif(struct netmap_priv_d *priv)
974 {
975 	struct netmap_adapter *na = priv->np_na;
976 
977 	NMG_LOCK_ASSERT();
978 	na->active_fds--;
979 	/* unset nr_pending_mode and possibly release exclusive mode */
980 	netmap_krings_put(priv);
981 
982 #ifdef	WITH_MONITOR
983 	/* XXX check whether we have to do something with monitor
984 	 * when rings change nr_mode. */
985 	if (na->active_fds <= 0) {
986 		/* walk through all the rings and tell any monitor
987 		 * that the port is going to exit netmap mode
988 		 */
989 		netmap_monitor_stop(na);
990 	}
991 #endif
992 
993 	if (na->active_fds <= 0 || nm_kring_pending(priv)) {
994 		na->nm_register(na, 0);
995 	}
996 
997 	/* delete rings and buffers that are no longer needed */
998 	netmap_mem_rings_delete(na);
999 
1000 	if (na->active_fds <= 0) {	/* last instance */
1001 		/*
1002 		 * (TO CHECK) We enter here
1003 		 * when the last reference to this file descriptor goes
1004 		 * away. This means we cannot have any pending poll()
1005 		 * or interrupt routine operating on the structure.
1006 		 * XXX The file may be closed in a thread while
1007 		 * another thread is using it.
1008 		 * Linux keeps the file opened until the last reference
1009 		 * by any outstanding ioctl/poll or mmap is gone.
1010 		 * FreeBSD does not track mmap()s (but we do) and
1011 		 * wakes up any sleeping poll(). Need to check what
1012 		 * happens if the close() occurs while a concurrent
1013 		 * syscall is running.
1014 		 */
1015 		if (netmap_verbose)
1016 			D("deleting last instance for %s", na->name);
1017 
1018 		if (nm_netmap_on(na)) {
1019 			D("BUG: netmap on while going to delete the krings");
1020 		}
1021 
1022 		na->nm_krings_delete(na);
1023 	}
1024 
1025 	/* possibly decrement counter of tx_si/rx_si users */
1026 	netmap_unset_ringid(priv);
1027 	/* delete the nifp */
1028 	netmap_mem_if_delete(na, priv->np_nifp);
1029 	/* drop the allocator */
1030 	netmap_mem_drop(na);
1031 	/* mark the priv as unregistered */
1032 	priv->np_na = NULL;
1033 	priv->np_nifp = NULL;
1034 }
1035 
1036 /* call with NMG_LOCK held */
1037 static __inline int
1038 nm_si_user(struct netmap_priv_d *priv, enum txrx t)
1039 {
1040 	return (priv->np_na != NULL &&
1041 		(priv->np_qlast[t] - priv->np_qfirst[t] > 1));
1042 }
1043 
1044 struct netmap_priv_d*
1045 netmap_priv_new(void)
1046 {
1047 	struct netmap_priv_d *priv;
1048 
1049 	priv = nm_os_malloc(sizeof(struct netmap_priv_d));
1050 	if (priv == NULL)
1051 		return NULL;
1052 	priv->np_refs = 1;
1053 	nm_os_get_module();
1054 	return priv;
1055 }
1056 
1057 /*
1058  * Destructor of the netmap_priv_d, called when the fd is closed.
1059  * Action: undo all the things done by NIOCREGIF.
1060  * On FreeBSD we need to track whether there are active mmap()s,
1061  * and we use np_active_mmaps for that. On linux, the field is always 0.
1062  * The priv itself is freed once the last reference to the fd is gone.
1063  *
1064  */
1065 /* call with NMG_LOCK held */
1066 void
1067 netmap_priv_delete(struct netmap_priv_d *priv)
1068 {
1069 	struct netmap_adapter *na = priv->np_na;
1070 
1071 	/* number of active references to this fd */
1072 	if (--priv->np_refs > 0) {
1073 		return;
1074 	}
1075 	nm_os_put_module();
1076 	if (na) {
1077 		netmap_do_unregif(priv);
1078 	}
1079 	netmap_unget_na(na, priv->np_ifp);
1080 	bzero(priv, sizeof(*priv));	/* for safety */
1081 	nm_os_free(priv);
1082 }
1083 
1084 
1085 /* call with NMG_LOCK *not* held */
1086 void
1087 netmap_dtor(void *data)
1088 {
1089 	struct netmap_priv_d *priv = data;
1090 
1091 	NMG_LOCK();
1092 	netmap_priv_delete(priv);
1093 	NMG_UNLOCK();
1094 }
1095 
1096 
1097 /*
1098  * Handlers for synchronization of the rings from/to the host stack.
1099  * These are associated to a network interface and are just another
1100  * ring pair managed by userspace.
1101  *
1102  * Netmap also supports transparent forwarding (NS_FORWARD and NR_FORWARD
1103  * flags):
1104  *
1105  * - Before releasing buffers on hw RX rings, the application can mark
1106  *   them with the NS_FORWARD flag. During the next RXSYNC or poll(), they
1107  *   will be forwarded to the host stack, similarly to what would happen if
1108  *   the application had moved them to the host TX ring.
1109  *
1110  * - Before releasing buffers on the host RX ring, the application can
1111  *   mark them with the NS_FORWARD flag. During the next RXSYNC or poll(),
1112  *   they will be forwarded to the hw TX rings, saving the application
1113  *   from doing the same task in user-space.
1114  *
1115  * Transparent forwarding can be enabled per-ring, by setting the NR_FORWARD
1116  * flag, or globally with the netmap_fwd sysctl.
1117  *
1118  * The transfer NIC --> host is relatively easy, just encapsulate
1119  * into mbufs and we are done. The host --> NIC side is slightly
1120  * harder because there might not be room in the tx ring so it
1121  * might take a while before releasing the buffer.
1122  */
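
/*
 * Hedged userspace sketch of transparent forwarding (flag and helper names
 * as documented in netmap(4) and netmap_user.h): to bounce a received buffer
 * back out instead of consuming it, an application would do roughly
 *
 *	ring->flags |= NR_FORWARD;           // or set the dev.netmap.fwd sysctl
 *	ring->slot[i].flags |= NS_FORWARD;   // mark the buffer
 *	ring->head = ring->cur = nm_ring_next(ring, i);
 *	ioctl(fd, NIOCRXSYNC, NULL);         // the forwarding happens here
 */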
1123 
1124 
1125 /*
1126  * Pass a whole queue of mbufs to the host stack as coming from 'dst'
1127  * We do not need to lock because the queue is private.
1128  * After this call the queue is empty.
1129  */
1130 static void
1131 netmap_send_up(struct ifnet *dst, struct mbq *q)
1132 {
1133 	struct mbuf *m;
1134 	struct mbuf *head = NULL, *prev = NULL;
1135 
1136 	/* Send packets up, outside the lock; head/prev machinery
1137 	 * is only useful for Windows. */
1138 	while ((m = mbq_dequeue(q)) != NULL) {
1139 		if (netmap_verbose & NM_VERB_HOST)
1140 			D("sending up pkt %p size %d", m, MBUF_LEN(m));
1141 		prev = nm_os_send_up(dst, m, prev);
1142 		if (head == NULL)
1143 			head = prev;
1144 	}
1145 	if (head)
1146 		nm_os_send_up(dst, NULL, head);
1147 	mbq_fini(q);
1148 }
1149 
1150 
1151 /*
1152  * Scan the buffers from hwcur to ring->head, and put a copy of those
1153  * marked NS_FORWARD (or all of them if forced) into a queue of mbufs.
1154  * Drop remaining packets in the unlikely event
1155  * of an mbuf shortage.
1156  */
1157 static void
1158 netmap_grab_packets(struct netmap_kring *kring, struct mbq *q, int force)
1159 {
1160 	u_int const lim = kring->nkr_num_slots - 1;
1161 	u_int const head = kring->rhead;
1162 	u_int n;
1163 	struct netmap_adapter *na = kring->na;
1164 
1165 	for (n = kring->nr_hwcur; n != head; n = nm_next(n, lim)) {
1166 		struct mbuf *m;
1167 		struct netmap_slot *slot = &kring->ring->slot[n];
1168 
1169 		if ((slot->flags & NS_FORWARD) == 0 && !force)
1170 			continue;
1171 		if (slot->len < 14 || slot->len > NETMAP_BUF_SIZE(na)) {
1172 			RD(5, "bad pkt at %d len %d", n, slot->len);
1173 			continue;
1174 		}
1175 		slot->flags &= ~NS_FORWARD; // XXX needed ?
1176 		/* XXX TODO: adapt to the case of a multisegment packet */
1177 		m = m_devget(NMB(na, slot), slot->len, 0, na->ifp, NULL);
1178 
1179 		if (m == NULL)
1180 			break;
1181 		mbq_enqueue(q, m);
1182 	}
1183 }
1184 
1185 static inline int
1186 _nm_may_forward(struct netmap_kring *kring)
1187 {
1188 	return	((netmap_fwd || kring->ring->flags & NR_FORWARD) &&
1189 		 kring->na->na_flags & NAF_HOST_RINGS &&
1190 		 kring->tx == NR_RX);
1191 }
1192 
1193 static inline int
1194 nm_may_forward_up(struct netmap_kring *kring)
1195 {
1196 	return	_nm_may_forward(kring) &&
1197 		 kring->ring_id != kring->na->num_rx_rings;
1198 }
1199 
1200 static inline int
1201 nm_may_forward_down(struct netmap_kring *kring, int sync_flags)
1202 {
1203 	return	_nm_may_forward(kring) &&
1204 		 (sync_flags & NAF_CAN_FORWARD_DOWN) &&
1205 		 kring->ring_id == kring->na->num_rx_rings;
1206 }
1207 
1208 /*
1209  * Send to the NIC rings packets marked NS_FORWARD between
1210  * kring->nr_hwcur and kring->rhead.
1211  * Called under kring->rx_queue.lock on the sw rx ring.
1212  *
1213  * It can only be called if the user opened all the TX hw rings,
1214  * see NAF_CAN_FORWARD_DOWN flag.
1215  * We can touch the TX netmap rings (slots, head and cur) since
1216  * we are in poll/ioctl system call context, and the application
1217  * is not supposed to touch the ring (using a different thread)
1218  * during the execution of the system call.
1219  */
1220 static u_int
1221 netmap_sw_to_nic(struct netmap_adapter *na)
1222 {
1223 	struct netmap_kring *kring = na->rx_rings[na->num_rx_rings];
1224 	struct netmap_slot *rxslot = kring->ring->slot;
1225 	u_int i, rxcur = kring->nr_hwcur;
1226 	u_int const head = kring->rhead;
1227 	u_int const src_lim = kring->nkr_num_slots - 1;
1228 	u_int sent = 0;
1229 
1230 	/* scan rings to find space, then fill as much as possible */
1231 	for (i = 0; i < na->num_tx_rings; i++) {
1232 		struct netmap_kring *kdst = na->tx_rings[i];
1233 		struct netmap_ring *rdst = kdst->ring;
1234 		u_int const dst_lim = kdst->nkr_num_slots - 1;
1235 
1236 		/* XXX do we trust ring or kring->rcur,rtail ? */
1237 		for (; rxcur != head && !nm_ring_empty(rdst);
1238 		     rxcur = nm_next(rxcur, src_lim) ) {
1239 			struct netmap_slot *src, *dst, tmp;
1240 			u_int dst_head = rdst->head;
1241 
1242 			src = &rxslot[rxcur];
1243 			if ((src->flags & NS_FORWARD) == 0 && !netmap_fwd)
1244 				continue;
1245 
1246 			sent++;
1247 
1248 			dst = &rdst->slot[dst_head];
1249 
1250 			tmp = *src;
1251 
1252 			src->buf_idx = dst->buf_idx;
1253 			src->flags = NS_BUF_CHANGED;
1254 
1255 			dst->buf_idx = tmp.buf_idx;
1256 			dst->len = tmp.len;
1257 			dst->flags = NS_BUF_CHANGED;
1258 
1259 			rdst->head = rdst->cur = nm_next(dst_head, dst_lim);
1260 		}
1261 		/* if (sent) XXX txsync ? it would be just an optimization */
1262 	}
1263 	return sent;
1264 }
1265 
1266 
1267 /*
1268  * netmap_txsync_to_host() passes packets up. We are called from a
1269  * system call in user process context, and the only contention
1270  * can be among multiple user threads erroneously calling
1271  * this routine concurrently.
1272  */
1273 static int
1274 netmap_txsync_to_host(struct netmap_kring *kring, int flags)
1275 {
1276 	struct netmap_adapter *na = kring->na;
1277 	u_int const lim = kring->nkr_num_slots - 1;
1278 	u_int const head = kring->rhead;
1279 	struct mbq q;
1280 
1281 	/* Take packets from hwcur to head and pass them up.
1282 	 * Force hwcur = head since netmap_grab_packets() stops at head
1283 	 */
1284 	mbq_init(&q);
1285 	netmap_grab_packets(kring, &q, 1 /* force */);
1286 	ND("have %d pkts in queue", mbq_len(&q));
1287 	kring->nr_hwcur = head;
1288 	kring->nr_hwtail = head + lim;
1289 	if (kring->nr_hwtail > lim)
1290 		kring->nr_hwtail -= lim + 1;
1291 
1292 	netmap_send_up(na->ifp, &q);
1293 	return 0;
1294 }
1295 
1296 
1297 /*
1298  * rxsync backend for packets coming from the host stack.
1299  * They have been put in kring->rx_queue by netmap_transmit().
1300  * We protect access to the kring using kring->rx_queue.lock
1301  *
1302  * This routine also moves to the NIC hw rings any packets the user has marked
1303  * for transparent-mode forwarding, and then sets the NR_FORWARD
1304  * flag in the kring to let the caller push them out.
1305  */
1306 static int
1307 netmap_rxsync_from_host(struct netmap_kring *kring, int flags)
1308 {
1309 	struct netmap_adapter *na = kring->na;
1310 	struct netmap_ring *ring = kring->ring;
1311 	u_int nm_i, n;
1312 	u_int const lim = kring->nkr_num_slots - 1;
1313 	u_int const head = kring->rhead;
1314 	int ret = 0;
1315 	struct mbq *q = &kring->rx_queue, fq;
1316 
1317 	mbq_init(&fq); /* fq holds packets to be freed */
1318 
1319 	mbq_lock(q);
1320 
1321 	/* First part: import newly received packets */
1322 	n = mbq_len(q);
1323 	if (n) { /* grab packets from the queue */
1324 		struct mbuf *m;
1325 		uint32_t stop_i;
1326 
1327 		nm_i = kring->nr_hwtail;
1328 		stop_i = nm_prev(kring->nr_hwcur, lim);
1329 		while ( nm_i != stop_i && (m = mbq_dequeue(q)) != NULL ) {
1330 			int len = MBUF_LEN(m);
1331 			struct netmap_slot *slot = &ring->slot[nm_i];
1332 
1333 			m_copydata(m, 0, len, NMB(na, slot));
1334 			ND("nm %d len %d", nm_i, len);
1335 			if (netmap_verbose)
1336 				D("%s", nm_dump_buf(NMB(na, slot),len, 128, NULL));
1337 
1338 			slot->len = len;
1339 			slot->flags = 0;
1340 			nm_i = nm_next(nm_i, lim);
1341 			mbq_enqueue(&fq, m);
1342 		}
1343 		kring->nr_hwtail = nm_i;
1344 	}
1345 
1346 	/*
1347 	 * Second part: skip past packets that userspace has released.
1348 	 */
1349 	nm_i = kring->nr_hwcur;
1350 	if (nm_i != head) { /* something was released */
1351 		if (nm_may_forward_down(kring, flags)) {
1352 			ret = netmap_sw_to_nic(na);
1353 			if (ret > 0) {
1354 				kring->nr_kflags |= NR_FORWARD;
1355 				ret = 0;
1356 			}
1357 		}
1358 		kring->nr_hwcur = head;
1359 	}
1360 
1361 	mbq_unlock(q);
1362 
1363 	mbq_purge(&fq);
1364 	mbq_fini(&fq);
1365 
1366 	return ret;
1367 }
1368 
1369 
1370 /* Get a netmap adapter for the port.
1371  *
1372  * If it is possible to satisfy the request, return 0
1373  * with *na containing the netmap adapter found.
1374  * Otherwise return an error code, with *na containing NULL.
1375  *
1376  * When the port is attached to a bridge, we always return
1377  * EBUSY.
1378  * Otherwise, if the port is already bound to a file descriptor,
1379  * then we unconditionally return the existing adapter into *na.
1380  * In all the other cases, we return (into *na) either native,
1381  * generic or NULL, according to the following table:
1382  *
1383  *					native_support
1384  * active_fds   dev.netmap.admode         YES     NO
1385  * -------------------------------------------------------
1386  *    >0              *                 NA(ifp) NA(ifp)
1387  *
1388  *     0        NETMAP_ADMODE_BEST      NATIVE  GENERIC
1389  *     0        NETMAP_ADMODE_NATIVE    NATIVE   NULL
1390  *     0        NETMAP_ADMODE_GENERIC   GENERIC GENERIC
1391  *
1392  */
1393 static void netmap_hw_dtor(struct netmap_adapter *); /* needed by NM_IS_NATIVE() */
1394 int
1395 netmap_get_hw_na(struct ifnet *ifp, struct netmap_mem_d *nmd, struct netmap_adapter **na)
1396 {
1397 	/* generic support */
1398 	int i = netmap_admode;	/* Take a snapshot. */
1399 	struct netmap_adapter *prev_na;
1400 	int error = 0;
1401 
1402 	*na = NULL; /* default */
1403 
1404 	/* reset in case of invalid value */
1405 	if (i < NETMAP_ADMODE_BEST || i >= NETMAP_ADMODE_LAST)
1406 		i = netmap_admode = NETMAP_ADMODE_BEST;
1407 
1408 	if (NM_NA_VALID(ifp)) {
1409 		prev_na = NA(ifp);
1410 		/* If an adapter already exists, return it if
1411 		 * there are active file descriptors or if
1412 		 * netmap is not forced to use generic
1413 		 * adapters.
1414 		 */
1415 		if (NETMAP_OWNED_BY_ANY(prev_na)
1416 			|| i != NETMAP_ADMODE_GENERIC
1417 			|| prev_na->na_flags & NAF_FORCE_NATIVE
1418 #ifdef WITH_PIPES
1419 			/* ugly, but we cannot allow an adapter switch
1420 			 * if some pipe is referring to this one
1421 			 */
1422 			|| prev_na->na_next_pipe > 0
1423 #endif
1424 		) {
1425 			*na = prev_na;
1426 			goto assign_mem;
1427 		}
1428 	}
1429 
1430 	/* If there isn't native support and netmap is not allowed
1431 	 * to use generic adapters, we cannot satisfy the request.
1432 	 */
1433 	if (!NM_IS_NATIVE(ifp) && i == NETMAP_ADMODE_NATIVE)
1434 		return EOPNOTSUPP;
1435 
1436 	/* Otherwise, create a generic adapter and return it,
1437 	 * saving the previously used netmap adapter, if any.
1438 	 *
1439 	 * Note that here 'prev_na', if not NULL, MUST be a
1440 	 * native adapter, and CANNOT be a generic one. This is
1441 	 * true because generic adapters are created on demand, and
1442 	 * destroyed when not used anymore. Therefore, if the adapter
1443 	 * currently attached to an interface 'ifp' is generic, it
1444 	 * must be that
1445 	 * (NA(ifp)->active_fds > 0 || NETMAP_OWNED_BY_KERN(NA(ifp))).
1446 	 * Consequently, if NA(ifp) is generic, we will enter one of
1447 	 * the branches above. This ensures that we never override
1448 	 * a generic adapter with another generic adapter.
1449 	 */
1450 	error = generic_netmap_attach(ifp);
1451 	if (error)
1452 		return error;
1453 
1454 	*na = NA(ifp);
1455 
1456 assign_mem:
1457 	if (nmd != NULL && !((*na)->na_flags & NAF_MEM_OWNER) &&
1458 	    (*na)->active_fds == 0 && ((*na)->nm_mem != nmd)) {
1459 		(*na)->nm_mem_prev = (*na)->nm_mem;
1460 		(*na)->nm_mem = netmap_mem_get(nmd);
1461 	}
1462 
1463 	return 0;
1464 }
1465 
1466 /*
1467  * MUST BE CALLED UNDER NMG_LOCK()
1468  *
1469  * Get a refcounted reference to a netmap adapter attached
1470  * to the interface specified by req.
1471  * This is always called in the execution of an ioctl().
1472  *
1473  * Return ENXIO if the interface specified by the request does
1474  * not exist, ENOTSUP if netmap is not supported by the interface,
1475  * EBUSY if the interface is already attached to a bridge,
1476  * EINVAL if parameters are invalid, ENOMEM if needed resources
1477  * could not be allocated.
1478  * If successful, hold a reference to the netmap adapter.
1479  *
1480  * If the interface specified by req is a system one, also keep
1481  * a reference to it and return a valid *ifp.
1482  */
1483 int
1484 netmap_get_na(struct nmreq_header *hdr,
1485 	      struct netmap_adapter **na, struct ifnet **ifp,
1486 	      struct netmap_mem_d *nmd, int create)
1487 {
1488 	struct nmreq_register *req = (struct nmreq_register *)(uintptr_t)hdr->nr_body;
1489 	int error = 0;
1490 	struct netmap_adapter *ret = NULL;
1491 	int nmd_ref = 0;
1492 
1493 	*na = NULL;     /* default return value */
1494 	*ifp = NULL;
1495 
1496 	if (hdr->nr_reqtype != NETMAP_REQ_REGISTER) {
1497 		return EINVAL;
1498 	}
1499 
1500 	if (req->nr_mode == NR_REG_PIPE_MASTER ||
1501 			req->nr_mode == NR_REG_PIPE_SLAVE) {
1502 		/* Do not accept deprecated pipe modes. */
1503 		D("Deprecated pipe nr_mode, use xx{yy or xx}yy syntax");
1504 		return EINVAL;
1505 	}
1506 
1507 	NMG_LOCK_ASSERT();
1508 
1509 	/* if the request contains a memid, try to find the
1510 	 * corresponding memory region
1511 	 */
1512 	if (nmd == NULL && req->nr_mem_id) {
1513 		nmd = netmap_mem_find(req->nr_mem_id);
1514 		if (nmd == NULL)
1515 			return EINVAL;
1516 		/* keep the reference */
1517 		nmd_ref = 1;
1518 	}
1519 
1520 	/* We cascade through all possible types of netmap adapter.
1521 	 * All netmap_get_*_na() functions return an error and an na,
1522 	 * with the following combinations:
1523 	 *
1524 	 * error    na
1525 	 *   0	   NULL		type doesn't match
1526 	 *  !0	   NULL		type matches, but na creation/lookup failed
1527 	 *   0	  !NULL		type matches and na created/found
1528 	 *  !0    !NULL		impossible
1529 	 */
1530 
1531 	/* try to see if this is a ptnetmap port */
1532 	error = netmap_get_pt_host_na(hdr, na, nmd, create);
1533 	if (error || *na != NULL)
1534 		goto out;
1535 
1536 	/* try to see if this is a monitor port */
1537 	error = netmap_get_monitor_na(hdr, na, nmd, create);
1538 	if (error || *na != NULL)
1539 		goto out;
1540 
1541 	/* try to see if this is a pipe port */
1542 	error = netmap_get_pipe_na(hdr, na, nmd, create);
1543 	if (error || *na != NULL)
1544 		goto out;
1545 
1546 	/* try to see if this is a bridge port */
1547 	error = netmap_get_vale_na(hdr, na, nmd, create);
1548 	if (error)
1549 		goto out;
1550 
1551 	if (*na != NULL) /* valid match in netmap_get_bdg_na() */
1552 		goto out;
1553 
1554 	/*
1555 	 * This must be a hardware na; look up the name in the system.
1556 	 * Note that by hardware we actually mean "it shows up in ifconfig".
1557 	 * This may still be a tap, a veth/epair, or even a
1558 	 * persistent VALE port.
1559 	 */
1560 	*ifp = ifunit_ref(hdr->nr_name);
1561 	if (*ifp == NULL) {
1562 		error = ENXIO;
1563 		goto out;
1564 	}
1565 
1566 	error = netmap_get_hw_na(*ifp, nmd, &ret);
1567 	if (error)
1568 		goto out;
1569 
1570 	*na = ret;
1571 	netmap_adapter_get(ret);
1572 
1573 out:
1574 	if (error) {
1575 		if (ret)
1576 			netmap_adapter_put(ret);
1577 		if (*ifp) {
1578 			if_rele(*ifp);
1579 			*ifp = NULL;
1580 		}
1581 	}
1582 	if (nmd_ref)
1583 		netmap_mem_put(nmd);
1584 
1585 	return error;
1586 }
1587 
1588 /* undo netmap_get_na() */
1589 void
1590 netmap_unget_na(struct netmap_adapter *na, struct ifnet *ifp)
1591 {
1592 	if (ifp)
1593 		if_rele(ifp);
1594 	if (na)
1595 		netmap_adapter_put(na);
1596 }
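
/*
 * Illustrative sketch (not a real caller) of the expected pairing of
 * netmap_get_na() and netmap_unget_na() under NMG_LOCK(); error
 * handling is reduced to a minimum and 'hdr' is assumed to carry a
 * NETMAP_REQ_REGISTER body, as required by netmap_get_na().
 *
 *	struct netmap_adapter *na = NULL;
 *	struct ifnet *ifp = NULL;
 *	int error;
 *
 *	NMG_LOCK();
 *	error = netmap_get_na(hdr, &na, &ifp, NULL, 1); // create if needed
 *	if (error == 0) {
 *		// ... use na, e.g. read na->num_tx_rings ...
 *		netmap_unget_na(na, ifp);	// drop both references
 *	}
 *	NMG_UNLOCK();
 */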
1597 
1598 
1599 #define NM_FAIL_ON(t) do {						\
1600 	if (unlikely(t)) {						\
1601 		RD(5, "%s: fail '" #t "' "				\
1602 			"h %d c %d t %d "				\
1603 			"rh %d rc %d rt %d "				\
1604 			"hc %d ht %d",					\
1605 			kring->name,					\
1606 			head, cur, ring->tail,				\
1607 			kring->rhead, kring->rcur, kring->rtail,	\
1608 			kring->nr_hwcur, kring->nr_hwtail);		\
1609 		return kring->nkr_num_slots;				\
1610 	}								\
1611 } while (0)
1612 
1613 /*
1614  * validate parameters on entry for *_txsync()
1615  * Returns ring->head if ok, or a value >= kring->nkr_num_slots
1616  * in case of error.
1617  *
1618  * rhead, rcur and rtail=hwtail are stored from previous round.
1619  * hwcur is the next packet to send to the ring.
1620  *
1621  * We want
1622  *    hwcur <= *rhead <= head <= cur <= tail = *rtail <= hwtail
1623  *
1624  * hwcur, rhead, rtail and hwtail are reliable
1625  */
1626 u_int
1627 nm_txsync_prologue(struct netmap_kring *kring, struct netmap_ring *ring)
1628 {
1629 	u_int head = ring->head; /* read only once */
1630 	u_int cur = ring->cur; /* read only once */
1631 	u_int n = kring->nkr_num_slots;
1632 
1633 	ND(5, "%s kcur %d ktail %d head %d cur %d tail %d",
1634 		kring->name,
1635 		kring->nr_hwcur, kring->nr_hwtail,
1636 		ring->head, ring->cur, ring->tail);
1637 #if 1 /* kernel sanity checks; but we can trust the kring. */
1638 	NM_FAIL_ON(kring->nr_hwcur >= n || kring->rhead >= n ||
1639 	    kring->rtail >= n ||  kring->nr_hwtail >= n);
1640 #endif /* kernel sanity checks */
1641 	/*
1642 	 * user sanity checks. We only use head,
1643 	 * A, B, ... are possible positions for head:
1644 	 *
1645 	 *  0    A  rhead   B  rtail   C  n-1
1646 	 *  0    D  rtail   E  rhead   F  n-1
1647 	 *
1648 	 * B, F, D are valid. A, C, E are wrong
1649 	 */
1650 	if (kring->rtail >= kring->rhead) {
1651 		/* want rhead <= head <= rtail */
1652 		NM_FAIL_ON(head < kring->rhead || head > kring->rtail);
1653 		/* and also head <= cur <= rtail */
1654 		NM_FAIL_ON(cur < head || cur > kring->rtail);
1655 	} else { /* here rtail < rhead */
1656 		/* we need head outside rtail .. rhead */
1657 		NM_FAIL_ON(head > kring->rtail && head < kring->rhead);
1658 
1659 		/* two cases now: head <= rtail or head >= rhead  */
1660 		if (head <= kring->rtail) {
1661 			/* want head <= cur <= rtail */
1662 			NM_FAIL_ON(cur < head || cur > kring->rtail);
1663 		} else { /* head >= rhead */
1664 			/* cur must be outside rtail..head */
1665 			NM_FAIL_ON(cur > kring->rtail && cur < head);
1666 		}
1667 	}
1668 	if (ring->tail != kring->rtail) {
1669 		RD(5, "%s tail overwritten was %d need %d", kring->name,
1670 			ring->tail, kring->rtail);
1671 		ring->tail = kring->rtail;
1672 	}
1673 	kring->rhead = head;
1674 	kring->rcur = cur;
1675 	return head;
1676 }
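
/*
 * Worked example for the checks above (illustrative numbers only),
 * with nkr_num_slots = 8:
 *
 *	linear case, rhead = 2, rtail = 6:
 *		valid head values are 2..6, and head <= cur <= 6;
 *
 *	wrapped case, rtail = 2, rhead = 6:
 *		valid head values are 6,7,0,1,2 (i.e. not in 3..5);
 *		if head = 1 (<= rtail) then cur must be 1 or 2,
 *		if head = 7 (>= rhead) then cur must be 7,0,1 or 2.
 *
 * The same reasoning, with hwcur/hwtail in place of rhead/rtail,
 * applies to nm_rxsync_prologue() below.
 */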
1677 
1678 
1679 /*
1680  * validate parameters on entry for *_rxsync()
1681  * Returns ring->head if ok, kring->nkr_num_slots on error.
1682  *
1683  * For a valid configuration,
1684  * hwcur <= head <= cur <= tail <= hwtail
1685  *
1686  * We only consider head and cur.
1687  * hwcur and hwtail are reliable.
1688  *
1689  */
1690 u_int
1691 nm_rxsync_prologue(struct netmap_kring *kring, struct netmap_ring *ring)
1692 {
1693 	uint32_t const n = kring->nkr_num_slots;
1694 	uint32_t head, cur;
1695 
1696 	ND(5,"%s kc %d kt %d h %d c %d t %d",
1697 		kring->name,
1698 		kring->nr_hwcur, kring->nr_hwtail,
1699 		ring->head, ring->cur, ring->tail);
1700 	/*
1701 	 * Before storing the new values, we should check they do not
1702 	 * move backwards. However:
1703 	 * - head is not an issue because the previous value is hwcur;
1704 	 * - cur could in principle go back, however it does not matter
1705 	 *   because we are processing a brand new rxsync()
1706 	 */
1707 	cur = kring->rcur = ring->cur;	/* read only once */
1708 	head = kring->rhead = ring->head;	/* read only once */
1709 #if 1 /* kernel sanity checks */
1710 	NM_FAIL_ON(kring->nr_hwcur >= n || kring->nr_hwtail >= n);
1711 #endif /* kernel sanity checks */
1712 	/* user sanity checks */
1713 	if (kring->nr_hwtail >= kring->nr_hwcur) {
1714 		/* want hwcur <= rhead <= hwtail */
1715 		NM_FAIL_ON(head < kring->nr_hwcur || head > kring->nr_hwtail);
1716 		/* and also rhead <= rcur <= hwtail */
1717 		NM_FAIL_ON(cur < head || cur > kring->nr_hwtail);
1718 	} else {
1719 		/* we need rhead outside hwtail..hwcur */
1720 		NM_FAIL_ON(head < kring->nr_hwcur && head > kring->nr_hwtail);
1721 		/* two cases now: head <= hwtail or head >= hwcur  */
1722 		if (head <= kring->nr_hwtail) {
1723 			/* want head <= cur <= hwtail */
1724 			NM_FAIL_ON(cur < head || cur > kring->nr_hwtail);
1725 		} else {
1726 			/* cur must be outside hwtail..head */
1727 			NM_FAIL_ON(cur < head && cur > kring->nr_hwtail);
1728 		}
1729 	}
1730 	if (ring->tail != kring->rtail) {
1731 		RD(5, "%s tail overwritten was %d need %d",
1732 			kring->name,
1733 			ring->tail, kring->rtail);
1734 		ring->tail = kring->rtail;
1735 	}
1736 	return head;
1737 }
1738 
1739 
1740 /*
1741  * Error routine called when txsync/rxsync detects an error.
1742  * Can't do much more than resetting head = cur = hwcur, tail = hwtail.
1743  * Return 1 on reinit.
1744  *
1745  * This routine is only called by the upper half of the kernel.
1746  * It only reads hwcur (which is changed only by the upper half, too)
1747  * and hwtail (which may be changed by the lower half, but only on
1748  * a tx ring and only to increase it, so any error will be recovered
1749  * on the next call). For the above, we don't strictly need to call
1750  * it under lock.
1751  */
1752 int
1753 netmap_ring_reinit(struct netmap_kring *kring)
1754 {
1755 	struct netmap_ring *ring = kring->ring;
1756 	u_int i, lim = kring->nkr_num_slots - 1;
1757 	int errors = 0;
1758 
1759 	// XXX KASSERT nm_kr_tryget
1760 	RD(10, "called for %s", kring->name);
1761 	// XXX probably wrong to trust userspace
1762 	kring->rhead = ring->head;
1763 	kring->rcur  = ring->cur;
1764 	kring->rtail = ring->tail;
1765 
1766 	if (ring->cur > lim)
1767 		errors++;
1768 	if (ring->head > lim)
1769 		errors++;
1770 	if (ring->tail > lim)
1771 		errors++;
1772 	for (i = 0; i <= lim; i++) {
1773 		u_int idx = ring->slot[i].buf_idx;
1774 		u_int len = ring->slot[i].len;
1775 		if (idx < 2 || idx >= kring->na->na_lut.objtotal) {
1776 			RD(5, "bad index at slot %d idx %d len %d ", i, idx, len);
1777 			ring->slot[i].buf_idx = 0;
1778 			ring->slot[i].len = 0;
1779 		} else if (len > NETMAP_BUF_SIZE(kring->na)) {
1780 			ring->slot[i].len = 0;
1781 			RD(5, "bad len at slot %d idx %d len %d", i, idx, len);
1782 		}
1783 	}
1784 	if (errors) {
1785 		RD(10, "total %d errors", errors);
1786 		RD(10, "%s reinit, cur %d -> %d tail %d -> %d",
1787 			kring->name,
1788 			ring->cur, kring->nr_hwcur,
1789 			ring->tail, kring->nr_hwtail);
1790 		ring->head = kring->rhead = kring->nr_hwcur;
1791 		ring->cur  = kring->rcur  = kring->nr_hwcur;
1792 		ring->tail = kring->rtail = kring->nr_hwtail;
1793 	}
1794 	return (errors ? 1 : 0);
1795 }
1796 
1797 /* interpret the ringid and flags fields of an nmreq, by translating them
1798  * into a pair of intervals of ring indices:
1799  *
1800  * [priv->np_qfirst[NR_TX], priv->np_qlast[NR_TX]) and
1801  * [priv->np_qfirst[NR_RX], priv->np_qlast[NR_RX])
1802  *
1803  */
1804 int
1805 netmap_interp_ringid(struct netmap_priv_d *priv, uint32_t nr_mode,
1806 			uint16_t nr_ringid, uint64_t nr_flags)
1807 {
1808 	struct netmap_adapter *na = priv->np_na;
1809 	int excluded_direction[] = { NR_TX_RINGS_ONLY, NR_RX_RINGS_ONLY };
1810 	enum txrx t;
1811 	u_int j;
1812 
1813 	if ((nr_flags & NR_PTNETMAP_HOST) && ((nr_mode != NR_REG_ALL_NIC) ||
1814 			nr_flags & (NR_RX_RINGS_ONLY|NR_TX_RINGS_ONLY))) {
1815 		D("Error: only NR_REG_ALL_NIC supported with netmap passthrough");
1816 		return EINVAL;
1817 	}
1818 
1819 	for_rx_tx(t) {
1820 		if (nr_flags & excluded_direction[t]) {
1821 			priv->np_qfirst[t] = priv->np_qlast[t] = 0;
1822 			continue;
1823 		}
1824 		switch (nr_mode) {
1825 		case NR_REG_ALL_NIC:
1826 			priv->np_qfirst[t] = 0;
1827 			priv->np_qlast[t] = nma_get_nrings(na, t);
1828 			ND("ALL/PIPE: %s %d %d", nm_txrx2str(t),
1829 				priv->np_qfirst[t], priv->np_qlast[t]);
1830 			break;
1831 		case NR_REG_SW:
1832 		case NR_REG_NIC_SW:
1833 			if (!(na->na_flags & NAF_HOST_RINGS)) {
1834 				D("host rings not supported");
1835 				return EINVAL;
1836 			}
1837 			priv->np_qfirst[t] = (nr_mode == NR_REG_SW ?
1838 				nma_get_nrings(na, t) : 0);
1839 			priv->np_qlast[t] = netmap_all_rings(na, t);
1840 			ND("%s: %s %d %d", nr_mode == NR_REG_SW ? "SW" : "NIC+SW",
1841 				nm_txrx2str(t),
1842 				priv->np_qfirst[t], priv->np_qlast[t]);
1843 			break;
1844 		case NR_REG_ONE_NIC:
1845 			if (nr_ringid >= na->num_tx_rings &&
1846 					nr_ringid >= na->num_rx_rings) {
1847 				D("invalid ring id %d", nr_ringid);
1848 				return EINVAL;
1849 			}
1850 			/* if not enough rings, use the first one */
1851 			j = nr_ringid;
1852 			if (j >= nma_get_nrings(na, t))
1853 				j = 0;
1854 			priv->np_qfirst[t] = j;
1855 			priv->np_qlast[t] = j + 1;
1856 			ND("ONE_NIC: %s %d %d", nm_txrx2str(t),
1857 				priv->np_qfirst[t], priv->np_qlast[t]);
1858 			break;
1859 		default:
1860 			D("invalid regif type %d", nr_mode);
1861 			return EINVAL;
1862 		}
1863 	}
1864 	priv->np_flags = nr_flags | nr_mode; // TODO
1865 
1866 	/* Allow transparent forwarding mode in the host --> nic
1867 	 * direction only if all the TX hw rings have been opened. */
1868 	if (priv->np_qfirst[NR_TX] == 0 &&
1869 			priv->np_qlast[NR_TX] >= na->num_tx_rings) {
1870 		priv->np_sync_flags |= NAF_CAN_FORWARD_DOWN;
1871 	}
1872 
1873 	if (netmap_verbose) {
1874 		D("%s: tx [%d,%d) rx [%d,%d) id %d",
1875 			na->name,
1876 			priv->np_qfirst[NR_TX],
1877 			priv->np_qlast[NR_TX],
1878 			priv->np_qfirst[NR_RX],
1879 			priv->np_qlast[NR_RX],
1880 			nr_ringid);
1881 	}
1882 	return 0;
1883 }
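
/*
 * Example of the intervals computed above, assuming (for illustration
 * only) an adapter with 4 tx and 4 rx hardware rings plus one host
 * ring per direction:
 *
 *	NR_REG_ALL_NIC			tx [0,4)  rx [0,4)
 *	NR_REG_ONE_NIC, nr_ringid 2	tx [2,3)  rx [2,3)
 *	NR_REG_SW			tx [4,5)  rx [4,5)  (host rings only)
 *	NR_REG_NIC_SW			tx [0,5)  rx [0,5)
 *
 * NR_TX_RINGS_ONLY / NR_RX_RINGS_ONLY reset the interval of the
 * excluded direction to the empty range [0,0).
 */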
1884 
1885 
1886 /*
1887  * Set the ring ID. For devices with a single queue, a request
1888  * for all rings is the same as a single ring.
1889  */
1890 static int
1891 netmap_set_ringid(struct netmap_priv_d *priv, uint32_t nr_mode,
1892 		uint16_t nr_ringid, uint64_t nr_flags)
1893 {
1894 	struct netmap_adapter *na = priv->np_na;
1895 	int error;
1896 	enum txrx t;
1897 
1898 	error = netmap_interp_ringid(priv, nr_mode, nr_ringid, nr_flags);
1899 	if (error) {
1900 		return error;
1901 	}
1902 
1903 	priv->np_txpoll = (nr_flags & NR_NO_TX_POLL) ? 0 : 1;
1904 
1905 	/* optimization: count the users registered for more than
1906 	 * one ring, which are the ones sleeping on the global queue.
1907 	 * The default netmap_notify() callback will then
1908 	 * avoid signaling the global queue if nobody is using it
1909 	 */
1910 	for_rx_tx(t) {
1911 		if (nm_si_user(priv, t))
1912 			na->si_users[t]++;
1913 	}
1914 	return 0;
1915 }
1916 
1917 static void
1918 netmap_unset_ringid(struct netmap_priv_d *priv)
1919 {
1920 	struct netmap_adapter *na = priv->np_na;
1921 	enum txrx t;
1922 
1923 	for_rx_tx(t) {
1924 		if (nm_si_user(priv, t))
1925 			na->si_users[t]--;
1926 		priv->np_qfirst[t] = priv->np_qlast[t] = 0;
1927 	}
1928 	priv->np_flags = 0;
1929 	priv->np_txpoll = 0;
1930 }
1931 
1932 
1933 /* Set the nr_pending_mode for the requested rings.
1934  * If requested, also try to get exclusive access to the rings, provided
1935  * the rings we want to bind are not exclusively owned by a previous bind.
1936  */
1937 static int
1938 netmap_krings_get(struct netmap_priv_d *priv)
1939 {
1940 	struct netmap_adapter *na = priv->np_na;
1941 	u_int i;
1942 	struct netmap_kring *kring;
1943 	int excl = (priv->np_flags & NR_EXCLUSIVE);
1944 	enum txrx t;
1945 
1946 	if (netmap_verbose)
1947 		D("%s: grabbing tx [%d, %d) rx [%d, %d)",
1948 			na->name,
1949 			priv->np_qfirst[NR_TX],
1950 			priv->np_qlast[NR_TX],
1951 			priv->np_qfirst[NR_RX],
1952 			priv->np_qlast[NR_RX]);
1953 
1954 	/* first round: check that none of the requested rings
1955 	 * is already exclusively owned, and that we are not
1956 	 * requesting exclusive ownership of rings already in use
1957 	 */
1958 	for_rx_tx(t) {
1959 		for (i = priv->np_qfirst[t]; i < priv->np_qlast[t]; i++) {
1960 			kring = NMR(na, t)[i];
1961 			if ((kring->nr_kflags & NKR_EXCLUSIVE) ||
1962 			    (kring->users && excl))
1963 			{
1964 				ND("ring %s busy", kring->name);
1965 				return EBUSY;
1966 			}
1967 		}
1968 	}
1969 
1970 	/* second round: increment usage count (possibly marking them
1971 	 * as exclusive) and set the nr_pending_mode
1972 	 */
1973 	for_rx_tx(t) {
1974 		for (i = priv->np_qfirst[t]; i < priv->np_qlast[t]; i++) {
1975 			kring = NMR(na, t)[i];
1976 			kring->users++;
1977 			if (excl)
1978 				kring->nr_kflags |= NKR_EXCLUSIVE;
1979 			kring->nr_pending_mode = NKR_NETMAP_ON;
1980 		}
1981 	}
1982 
1983 	return 0;
1984 
1985 }
1986 
1987 /* Undo netmap_krings_get(). This is done by clearing the exclusive mode
1988  * if it was asked at regif time, and by unsetting nr_pending_mode if we
1989  * are the last users of the involved rings. */
1990 static void
1991 netmap_krings_put(struct netmap_priv_d *priv)
1992 {
1993 	struct netmap_adapter *na = priv->np_na;
1994 	u_int i;
1995 	struct netmap_kring *kring;
1996 	int excl = (priv->np_flags & NR_EXCLUSIVE);
1997 	enum txrx t;
1998 
1999 	ND("%s: releasing tx [%d, %d) rx [%d, %d)",
2000 			na->name,
2001 			priv->np_qfirst[NR_TX],
2002 			priv->np_qlast[NR_TX],
2003 			priv->np_qfirst[NR_RX],
2004 			priv->np_qlast[NR_RX]);
2005 
2006 	for_rx_tx(t) {
2007 		for (i = priv->np_qfirst[t]; i < priv->np_qlast[t]; i++) {
2008 			kring = NMR(na, t)[i];
2009 			if (excl)
2010 				kring->nr_kflags &= ~NKR_EXCLUSIVE;
2011 			kring->users--;
2012 			if (kring->users == 0)
2013 				kring->nr_pending_mode = NKR_NETMAP_OFF;
2014 		}
2015 	}
2016 }
2017 
2018 static int
2019 nm_priv_rx_enabled(struct netmap_priv_d *priv)
2020 {
2021 	return (priv->np_qfirst[NR_RX] != priv->np_qlast[NR_RX]);
2022 }
2023 
2024 /*
2025  * possibly move the interface to netmap mode.
2026  * On success it returns 0 and stores the new netmap_if in priv->np_nifp; otherwise an error code.
2027  * This must be called with NMG_LOCK held.
2028  *
2029  * The following na callbacks are called in the process:
2030  *
2031  * na->nm_config()			[by netmap_update_config]
2032  * (get current number and size of rings)
2033  *
2034  *  	We have a generic one for linux (netmap_linux_config).
2035  *  	The bwrap has to override this, since it has to forward
2036  *  	the request to the wrapped adapter (netmap_bwrap_config).
2037  *
2038  *
2039  * na->nm_krings_create()
2040  * (create and init the krings array)
2041  *
2042  * 	One of the following:
2043  *
2044  *	* netmap_hw_krings_create, 			(hw ports)
2045  *		creates the standard layout for the krings
2046  * 		and adds the mbq (used for the host rings).
2047  *
2048  * 	* netmap_vp_krings_create			(VALE ports)
2049  * 		add leases and scratchpads
2050  *
2051  * 	* netmap_pipe_krings_create			(pipes)
2052  * 		create the krings and rings of both ends and
2053  * 		cross-link them
2054  *
2055  *      * netmap_monitor_krings_create 			(monitors)
2056  *      	avoid allocating the mbq
2057  *
2058  *      * netmap_bwrap_krings_create			(bwraps)
2059  *      	create both the bwrap krings array,
2060  *      	the krings array of the wrapped adapter, and
2061  *      	(if needed) the fake array for the host adapter
2062  *
2063  * na->nm_register(, 1)
2064  * (put the adapter in netmap mode)
2065  *
2066  * 	This may be one of the following:
2067  *
2068  * 	* netmap_hw_reg				        (hw ports)
2069  * 		checks that the ifp is still there, then calls
2070  * 		the hardware specific callback;
2071  *
2072  * 	* netmap_vp_reg					(VALE ports)
2073  *		If the port is connected to a bridge,
2074  *		set the NAF_NETMAP_ON flag under the
2075  *		bridge write lock.
2076  *
2077  *	* netmap_pipe_reg				(pipes)
2078  *		inform the other pipe end that it is no
2079  *		longer responsible for the lifetime of this
2080  *		pipe end
2081  *
2082  *	* netmap_monitor_reg				(monitors)
2083  *		intercept the sync callbacks of the monitored
2084  *		rings
2085  *
2086  *	* netmap_bwrap_reg				(bwraps)
2087  *		cross-link the bwrap and hwna rings,
2088  *		forward the request to the hwna, override
2089  *		the hwna notify callback (to get the frames
2090  *		coming from outside go through the bridge).
2091  *
2092  *
2093  */
2094 int
2095 netmap_do_regif(struct netmap_priv_d *priv, struct netmap_adapter *na,
2096 	uint32_t nr_mode, uint16_t nr_ringid, uint64_t nr_flags)
2097 {
2098 	struct netmap_if *nifp = NULL;
2099 	int error;
2100 
2101 	NMG_LOCK_ASSERT();
2102 	priv->np_na = na;     /* store the reference */
2103 	error = netmap_mem_finalize(na->nm_mem, na);
2104 	if (error)
2105 		goto err;
2106 
2107 	if (na->active_fds == 0) {
2108 
2109 		/* cache the allocator info in the na */
2110 		error = netmap_mem_get_lut(na->nm_mem, &na->na_lut);
2111 		if (error)
2112 			goto err_drop_mem;
2113 		ND("lut %p bufs %u size %u", na->na_lut.lut, na->na_lut.objtotal,
2114 					    na->na_lut.objsize);
2115 
2116 		/* ring configuration may have changed, fetch from the card */
2117 		netmap_update_config(na);
2118 	}
2119 
2120 	/* compute the range of tx and rx rings to monitor */
2121 	error = netmap_set_ringid(priv, nr_mode, nr_ringid, nr_flags);
2122 	if (error)
2123 		goto err_put_lut;
2124 
2125 	if (na->active_fds == 0) {
2126 		/*
2127 		 * If this is the first registration of the adapter,
2128 		 * perform sanity checks and create the in-kernel view
2129 		 * of the netmap rings (the netmap krings).
2130 		 */
2131 		if (na->ifp && nm_priv_rx_enabled(priv)) {
2132 			/* This netmap adapter is attached to an ifnet. */
2133 			unsigned nbs = NETMAP_BUF_SIZE(na);
2134 			unsigned mtu = nm_os_ifnet_mtu(na->ifp);
2135 
2136 			ND("%s: mtu %d rx_buf_maxsize %d netmap_buf_size %d",
2137 					na->name, mtu, na->rx_buf_maxsize, nbs);
2138 
2139 			if (na->rx_buf_maxsize == 0) {
2140 				D("%s: error: rx_buf_maxsize == 0", na->name);
2141 				error = EIO;
2142 				goto err_drop_mem;
2143 			}
2144 
2145 			if (mtu <= na->rx_buf_maxsize) {
2146 				/* The MTU fits a single NIC slot. We only
2147 				 * need to check that netmap buffers are
2148 				 * large enough to hold an MTU. NS_MOREFRAG
2149 				 * cannot be used in this case. */
2150 				if (nbs < mtu) {
2151 					nm_prerr("error: netmap buf size (%u) "
2152 						"< device MTU (%u)\n", nbs, mtu);
2153 					error = EINVAL;
2154 					goto err_drop_mem;
2155 				}
2156 			} else {
2157 				/* More NIC slots may be needed to receive
2158 				 * or transmit a single packet. Check that
2159 				 * the adapter supports NS_MOREFRAG and that
2160 				 * netmap buffers are large enough to hold
2161 				 * the maximum per-slot size. */
2162 				if (!(na->na_flags & NAF_MOREFRAG)) {
2163 					nm_prerr("error: large MTU (%d) needed "
2164 						"but %s does not support "
2165 						"NS_MOREFRAG\n", mtu,
2166 						na->ifp->if_xname);
2167 					error = EINVAL;
2168 					goto err_drop_mem;
2169 				} else if (nbs < na->rx_buf_maxsize) {
2170 					nm_prerr("error: using NS_MOREFRAG on "
2171 						"%s requires netmap buf size "
2172 						">= %u\n", na->ifp->if_xname,
2173 						na->rx_buf_maxsize);
2174 					error = EINVAL;
2175 					goto err_drop_mem;
2176 				} else {
2177 					nm_prinf("info: netmap application on "
2178 						"%s needs to support "
2179 						"NS_MOREFRAG "
2180 						"(MTU=%u,netmap_buf_size=%u)\n",
2181 						na->ifp->if_xname, mtu, nbs);
2182 				}
2183 			}
2184 		}
2185 
2186 		/*
2187 		 * Depending on the adapter, this may also create
2188 		 * the netmap rings themselves
2189 		 */
2190 		error = na->nm_krings_create(na);
2191 		if (error)
2192 			goto err_put_lut;
2193 
2194 	}
2195 
2196 	/* now the krings must exist and we can check whether some
2197 	 * previous bind has exclusive ownership on them, and set
2198 	 * nr_pending_mode
2199 	 */
2200 	error = netmap_krings_get(priv);
2201 	if (error)
2202 		goto err_del_krings;
2203 
2204 	/* create all needed missing netmap rings */
2205 	error = netmap_mem_rings_create(na);
2206 	if (error)
2207 		goto err_rel_excl;
2208 
2209 	/* in all cases, create a new netmap_if */
2210 	nifp = netmap_mem_if_new(na, priv);
2211 	if (nifp == NULL) {
2212 		error = ENOMEM;
2213 		goto err_rel_excl;
2214 	}
2215 
2216 	if (nm_kring_pending(priv)) {
2217 		/* Some kring is switching mode, tell the adapter to
2218 		 * react on this. */
2219 		error = na->nm_register(na, 1);
2220 		if (error)
2221 			goto err_del_if;
2222 	}
2223 
2224 	/* Commit the reference. */
2225 	na->active_fds++;
2226 
2227 	/*
2228 	 * advertise that the interface is ready by setting np_nifp.
2229 	 * The barrier is needed because readers (poll, *SYNC and mmap)
2230 	 * check for priv->np_nifp != NULL without locking
2231 	 */
2232 	mb(); /* make sure previous writes are visible to all CPUs */
2233 	priv->np_nifp = nifp;
2234 
2235 	return 0;
2236 
2237 err_del_if:
2238 	netmap_mem_if_delete(na, nifp);
2239 err_rel_excl:
2240 	netmap_krings_put(priv);
2241 	netmap_mem_rings_delete(na);
2242 err_del_krings:
2243 	if (na->active_fds == 0)
2244 		na->nm_krings_delete(na);
2245 err_put_lut:
2246 	if (na->active_fds == 0)
2247 		memset(&na->na_lut, 0, sizeof(na->na_lut));
2248 err_drop_mem:
2249 	netmap_mem_drop(na);
2250 err:
2251 	priv->np_na = NULL;
2252 	return error;
2253 }
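
/*
 * Numeric examples for the MTU/buffer-size checks performed above
 * (illustrative values only):
 *
 *	mtu 1500, rx_buf_maxsize 2048, netmap buf size 2048:
 *		the MTU fits one slot and nbs >= mtu -> ok,
 *		NS_MOREFRAG is not needed;
 *
 *	mtu 9000, rx_buf_maxsize 2048, netmap buf size 2048:
 *		multiple slots per packet -> ok only if the adapter
 *		sets NAF_MOREFRAG (and the application handles
 *		NS_MOREFRAG);
 *
 *	mtu 9000, rx_buf_maxsize 4096, netmap buf size 2048:
 *		nbs < rx_buf_maxsize -> EINVAL (even with
 *		NAF_MOREFRAG), a larger netmap buffer size is needed.
 */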
2254 
2255 
2256 /*
2257  * update kring and ring at the end of rxsync/txsync.
2258  */
2259 static inline void
2260 nm_sync_finalize(struct netmap_kring *kring)
2261 {
2262 	/*
2263 	 * Update ring tail to what the kernel knows
2264 	 * After txsync: head/rhead/hwcur might be behind cur/rcur
2265 	 * if no carrier.
2266 	 */
2267 	kring->ring->tail = kring->rtail = kring->nr_hwtail;
2268 
2269 	ND(5, "%s now hwcur %d hwtail %d head %d cur %d tail %d",
2270 		kring->name, kring->nr_hwcur, kring->nr_hwtail,
2271 		kring->rhead, kring->rcur, kring->rtail);
2272 }
2273 
2274 /* set ring timestamp */
2275 static inline void
2276 ring_timestamp_set(struct netmap_ring *ring)
2277 {
2278 	if (netmap_no_timestamp == 0 || ring->flags & NR_TIMESTAMP) {
2279 		microtime(&ring->ts);
2280 	}
2281 }
2282 
2283 static int nmreq_copyin(struct nmreq_header *, int);
2284 static int nmreq_copyout(struct nmreq_header *, int);
2285 static int nmreq_checkoptions(struct nmreq_header *);
2286 
2287 /*
2288  * ioctl(2) support for the "netmap" device.
2289  *
2290  * The following is a list of accepted commands:
2291  * - NIOCCTRL		device control API
2292  * - NIOCTXSYNC		sync TX rings
2293  * - NIOCRXSYNC		sync RX rings
2294  * - SIOCGIFADDR	just for convenience
2295  * - NIOCGINFO		deprecated (legacy API)
2296  * - NIOCREGIF		deprecated (legacy API)
2297  *
2298  * Return 0 on success, errno otherwise.
2299  */
2300 int
2301 netmap_ioctl(struct netmap_priv_d *priv, u_long cmd, caddr_t data,
2302 		struct thread *td, int nr_body_is_user)
2303 {
2304 	struct mbq q;	/* packets from RX hw queues to host stack */
2305 	struct netmap_adapter *na = NULL;
2306 	struct netmap_mem_d *nmd = NULL;
2307 	struct ifnet *ifp = NULL;
2308 	int error = 0;
2309 	u_int i, qfirst, qlast;
2310 	struct netmap_if *nifp;
2311 	struct netmap_kring **krings;
2312 	int sync_flags;
2313 	enum txrx t;
2314 
2315 	switch (cmd) {
2316 	case NIOCCTRL: {
2317 		struct nmreq_header *hdr = (struct nmreq_header *)data;
2318 
2319 		if (hdr->nr_version != NETMAP_API) {
2320 			D("API mismatch for reqtype %d: got %d need %d",
2321 				hdr->nr_reqtype,
2322 				hdr->nr_version, NETMAP_API);
2323 			hdr->nr_version = NETMAP_API;
2324 		}
2325 		if (hdr->nr_version < NETMAP_MIN_API ||
2326 		    hdr->nr_version > NETMAP_MAX_API) {
2327 			return EINVAL;
2328 		}
2329 
2330 		/* Make a kernel-space copy of the user-space nr_body.
2331 		 * For convenience, the nr_body pointer and the pointers
2332 		 * in the options list will be replaced with their
2333 		 * kernel-space counterparts. The original pointers are
2334 		 * saved internally and later restored by nmreq_copyout
2335 		 */
2336 		error = nmreq_copyin(hdr, nr_body_is_user);
2337 		if (error) {
2338 			return error;
2339 		}
2340 
2341 		/* Sanitize hdr->nr_name. */
2342 		hdr->nr_name[sizeof(hdr->nr_name) - 1] = '\0';
2343 
2344 		switch (hdr->nr_reqtype) {
2345 		case NETMAP_REQ_REGISTER: {
2346 			struct nmreq_register *req =
2347 				(struct nmreq_register *)(uintptr_t)hdr->nr_body;
2348 			/* Protect access to priv from concurrent requests. */
2349 			NMG_LOCK();
2350 			do {
2351 				u_int memflags;
2352 #ifdef WITH_EXTMEM
2353 				struct nmreq_option *opt;
2354 #endif /* WITH_EXTMEM */
2355 
2356 				if (priv->np_nifp != NULL) {	/* thread already registered */
2357 					error = EBUSY;
2358 					break;
2359 				}
2360 
2361 #ifdef WITH_EXTMEM
2362 				opt = nmreq_findoption((struct nmreq_option *)(uintptr_t)hdr->nr_options,
2363 						NETMAP_REQ_OPT_EXTMEM);
2364 				if (opt != NULL) {
2365 					struct nmreq_opt_extmem *e =
2366 						(struct nmreq_opt_extmem *)opt;
2367 
2368 					error = nmreq_checkduplicate(opt);
2369 					if (error) {
2370 						opt->nro_status = error;
2371 						break;
2372 					}
2373 					nmd = netmap_mem_ext_create(e->nro_usrptr,
2374 							&e->nro_info, &error);
2375 					opt->nro_status = error;
2376 					if (nmd == NULL)
2377 						break;
2378 				}
2379 #endif /* WITH_EXTMEM */
2380 
2381 				if (nmd == NULL && req->nr_mem_id) {
2382 					/* find the allocator and get a reference */
2383 					nmd = netmap_mem_find(req->nr_mem_id);
2384 					if (nmd == NULL) {
2385 						error = EINVAL;
2386 						break;
2387 					}
2388 				}
2389 				/* find the interface and a reference */
2390 				error = netmap_get_na(hdr, &na, &ifp, nmd,
2391 						      1 /* create */); /* keep reference */
2392 				if (error)
2393 					break;
2394 				if (NETMAP_OWNED_BY_KERN(na)) {
2395 					error = EBUSY;
2396 					break;
2397 				}
2398 
2399 				if (na->virt_hdr_len && !(req->nr_flags & NR_ACCEPT_VNET_HDR)) {
2400 					error = EIO;
2401 					break;
2402 				}
2403 
2404 				error = netmap_do_regif(priv, na, req->nr_mode,
2405 							req->nr_ringid, req->nr_flags);
2406 				if (error) {    /* reg. failed, release priv and ref */
2407 					break;
2408 				}
2409 				nifp = priv->np_nifp;
2410 				priv->np_td = td; /* for debugging purposes */
2411 
2412 				/* return the offset of the netmap_if object */
2413 				req->nr_rx_rings = na->num_rx_rings;
2414 				req->nr_tx_rings = na->num_tx_rings;
2415 				req->nr_rx_slots = na->num_rx_desc;
2416 				req->nr_tx_slots = na->num_tx_desc;
2417 				error = netmap_mem_get_info(na->nm_mem, &req->nr_memsize, &memflags,
2418 					&req->nr_mem_id);
2419 				if (error) {
2420 					netmap_do_unregif(priv);
2421 					break;
2422 				}
2423 				if (memflags & NETMAP_MEM_PRIVATE) {
2424 					*(uint32_t *)(uintptr_t)&nifp->ni_flags |= NI_PRIV_MEM;
2425 				}
2426 				for_rx_tx(t) {
2427 					priv->np_si[t] = nm_si_user(priv, t) ?
2428 						&na->si[t] : &NMR(na, t)[priv->np_qfirst[t]]->si;
2429 				}
2430 
2431 				if (req->nr_extra_bufs) {
2432 					if (netmap_verbose)
2433 						D("requested %d extra buffers",
2434 							req->nr_extra_bufs);
2435 					req->nr_extra_bufs = netmap_extra_alloc(na,
2436 						&nifp->ni_bufs_head, req->nr_extra_bufs);
2437 					if (netmap_verbose)
2438 						D("got %d extra buffers", req->nr_extra_bufs);
2439 				}
2440 				req->nr_offset = netmap_mem_if_offset(na->nm_mem, nifp);
2441 
2442 				error = nmreq_checkoptions(hdr);
2443 				if (error) {
2444 					netmap_do_unregif(priv);
2445 					break;
2446 				}
2447 
2448 				/* store ifp reference so that priv destructor may release it */
2449 				priv->np_ifp = ifp;
2450 			} while (0);
2451 			if (error) {
2452 				netmap_unget_na(na, ifp);
2453 			}
2454 			/* release the reference from netmap_mem_find() or
2455 			 * netmap_mem_ext_create()
2456 			 */
2457 			if (nmd)
2458 				netmap_mem_put(nmd);
2459 			NMG_UNLOCK();
2460 			break;
2461 		}
2462 
2463 		case NETMAP_REQ_PORT_INFO_GET: {
2464 			struct nmreq_port_info_get *req =
2465 				(struct nmreq_port_info_get *)(uintptr_t)hdr->nr_body;
2466 
2467 			NMG_LOCK();
2468 			do {
2469 				u_int memflags;
2470 
2471 				if (hdr->nr_name[0] != '\0') {
2472 					/* Build a nmreq_register out of the nmreq_port_info_get,
2473 					 * so that we can call netmap_get_na(). */
2474 					struct nmreq_register regreq;
2475 					bzero(&regreq, sizeof(regreq));
2476 					regreq.nr_tx_slots = req->nr_tx_slots;
2477 					regreq.nr_rx_slots = req->nr_rx_slots;
2478 					regreq.nr_tx_rings = req->nr_tx_rings;
2479 					regreq.nr_rx_rings = req->nr_rx_rings;
2480 					regreq.nr_mem_id = req->nr_mem_id;
2481 
2482 					/* get a refcount */
2483 					hdr->nr_reqtype = NETMAP_REQ_REGISTER;
2484 					hdr->nr_body = (uintptr_t)&regreq;
2485 					error = netmap_get_na(hdr, &na, &ifp, NULL, 1 /* create */);
2486 					hdr->nr_reqtype = NETMAP_REQ_PORT_INFO_GET; /* reset type */
2487 					hdr->nr_body = (uintptr_t)req; /* reset nr_body */
2488 					if (error) {
2489 						na = NULL;
2490 						ifp = NULL;
2491 						break;
2492 					}
2493 					nmd = na->nm_mem; /* get memory allocator */
2494 				} else {
2495 					nmd = netmap_mem_find(req->nr_mem_id ? req->nr_mem_id : 1);
2496 					if (nmd == NULL) {
2497 						error = EINVAL;
2498 						break;
2499 					}
2500 				}
2501 
2502 				error = netmap_mem_get_info(nmd, &req->nr_memsize, &memflags,
2503 					&req->nr_mem_id);
2504 				if (error)
2505 					break;
2506 				if (na == NULL) /* only memory info */
2507 					break;
2508 				req->nr_offset = 0;
2509 				req->nr_rx_slots = req->nr_tx_slots = 0;
2510 				netmap_update_config(na);
2511 				req->nr_rx_rings = na->num_rx_rings;
2512 				req->nr_tx_rings = na->num_tx_rings;
2513 				req->nr_rx_slots = na->num_rx_desc;
2514 				req->nr_tx_slots = na->num_tx_desc;
2515 			} while (0);
2516 			netmap_unget_na(na, ifp);
2517 			NMG_UNLOCK();
2518 			break;
2519 		}
2520 #ifdef WITH_VALE
2521 		case NETMAP_REQ_VALE_ATTACH: {
2522 			error = nm_bdg_ctl_attach(hdr, NULL /* userspace request */);
2523 			break;
2524 		}
2525 
2526 		case NETMAP_REQ_VALE_DETACH: {
2527 			error = nm_bdg_ctl_detach(hdr, NULL /* userspace request */);
2528 			break;
2529 		}
2530 
2531 		case NETMAP_REQ_VALE_LIST: {
2532 			error = netmap_bdg_list(hdr);
2533 			break;
2534 		}
2535 
2536 		case NETMAP_REQ_PORT_HDR_SET: {
2537 			struct nmreq_port_hdr *req =
2538 				(struct nmreq_port_hdr *)(uintptr_t)hdr->nr_body;
2539 			/* Build a nmreq_register out of the nmreq_port_hdr,
2540 			 * so that we can call netmap_get_bdg_na(). */
2541 			struct nmreq_register regreq;
2542 			bzero(&regreq, sizeof(regreq));
2543 			/* For now we only support virtio-net headers, and only for
2544 			 * VALE ports, but this may change in the future. Valid lengths
2545 			 * for the virtio-net header are 0 (no header), 10 and 12. */
2546 			if (req->nr_hdr_len != 0 &&
2547 				req->nr_hdr_len != sizeof(struct nm_vnet_hdr) &&
2548 					req->nr_hdr_len != 12) {
2549 				error = EINVAL;
2550 				break;
2551 			}
2552 			NMG_LOCK();
2553 			hdr->nr_reqtype = NETMAP_REQ_REGISTER;
2554 			hdr->nr_body = (uintptr_t)&regreq;
2555 			error = netmap_get_vale_na(hdr, &na, NULL, 0);
2556 			hdr->nr_reqtype = NETMAP_REQ_PORT_HDR_SET;
2557 			hdr->nr_body = (uintptr_t)req;
2558 			if (na && !error) {
2559 				struct netmap_vp_adapter *vpna =
2560 					(struct netmap_vp_adapter *)na;
2561 				na->virt_hdr_len = req->nr_hdr_len;
2562 				if (na->virt_hdr_len) {
2563 					vpna->mfs = NETMAP_BUF_SIZE(na);
2564 				}
2565 				D("Using vnet_hdr_len %d for %p", na->virt_hdr_len, na);
2566 				netmap_adapter_put(na);
2567 			} else if (!na) {
2568 				error = ENXIO;
2569 			}
2570 			NMG_UNLOCK();
2571 			break;
2572 		}
2573 
2574 		case NETMAP_REQ_PORT_HDR_GET: {
2575 			/* Get vnet-header length for this netmap port */
2576 			struct nmreq_port_hdr *req =
2577 				(struct nmreq_port_hdr *)(uintptr_t)hdr->nr_body;
2578 			/* Build a nmreq_register out of the nmreq_port_hdr,
2579 			 * so that we can call netmap_get_bdg_na(). */
2580 			struct nmreq_register regreq;
2581 			struct ifnet *ifp;
2582 
2583 			bzero(&regreq, sizeof(regreq));
2584 			NMG_LOCK();
2585 			hdr->nr_reqtype = NETMAP_REQ_REGISTER;
2586 			hdr->nr_body = (uintptr_t)&regreq;
2587 			error = netmap_get_na(hdr, &na, &ifp, NULL, 0);
2588 			hdr->nr_reqtype = NETMAP_REQ_PORT_HDR_GET;
2589 			hdr->nr_body = (uintptr_t)req;
2590 			if (na && !error) {
2591 				req->nr_hdr_len = na->virt_hdr_len;
2592 			}
2593 			netmap_unget_na(na, ifp);
2594 			NMG_UNLOCK();
2595 			break;
2596 		}
2597 
2598 		case NETMAP_REQ_VALE_NEWIF: {
2599 			error = nm_vi_create(hdr);
2600 			break;
2601 		}
2602 
2603 		case NETMAP_REQ_VALE_DELIF: {
2604 			error = nm_vi_destroy(hdr->nr_name);
2605 			break;
2606 		}
2607 
2608 		case NETMAP_REQ_VALE_POLLING_ENABLE:
2609 		case NETMAP_REQ_VALE_POLLING_DISABLE: {
2610 			error = nm_bdg_polling(hdr);
2611 			break;
2612 		}
2613 #endif  /* WITH_VALE */
2614 		case NETMAP_REQ_POOLS_INFO_GET: {
2615 			struct nmreq_pools_info *req =
2616 				(struct nmreq_pools_info *)(uintptr_t)hdr->nr_body;
2617 			/* Get information from the memory allocator. This
2618 			 * netmap device must already be bound to a port.
2619 			 * Note that hdr->nr_name is ignored. */
2620 			NMG_LOCK();
2621 			if (priv->np_na && priv->np_na->nm_mem) {
2622 				struct netmap_mem_d *nmd = priv->np_na->nm_mem;
2623 				error = netmap_mem_pools_info_get(req, nmd);
2624 			} else {
2625 				error = EINVAL;
2626 			}
2627 			NMG_UNLOCK();
2628 			break;
2629 		}
2630 
2631 		default: {
2632 			error = EINVAL;
2633 			break;
2634 		}
2635 		}
2636 		/* Write back request body to userspace and reset the
2637 		 * user-space pointer. */
2638 		error = nmreq_copyout(hdr, error);
2639 		break;
2640 	}
2641 
2642 	case NIOCTXSYNC:
2643 	case NIOCRXSYNC: {
2644 		nifp = priv->np_nifp;
2645 
2646 		if (nifp == NULL) {
2647 			error = ENXIO;
2648 			break;
2649 		}
2650 		mb(); /* make sure following reads are not from cache */
2651 
2652 		na = priv->np_na;      /* we have a reference */
2653 
2654 		if (na == NULL) {
2655 			D("Internal error: nifp != NULL && na == NULL");
2656 			error = ENXIO;
2657 			break;
2658 		}
2659 
2660 		mbq_init(&q);
2661 		t = (cmd == NIOCTXSYNC ? NR_TX : NR_RX);
2662 		krings = NMR(na, t);
2663 		qfirst = priv->np_qfirst[t];
2664 		qlast = priv->np_qlast[t];
2665 		sync_flags = priv->np_sync_flags;
2666 
2667 		for (i = qfirst; i < qlast; i++) {
2668 			struct netmap_kring *kring = krings[i];
2669 			struct netmap_ring *ring = kring->ring;
2670 
2671 			if (unlikely(nm_kr_tryget(kring, 1, &error))) {
2672 				error = (error ? EIO : 0);
2673 				continue;
2674 			}
2675 
2676 			if (cmd == NIOCTXSYNC) {
2677 				if (netmap_verbose & NM_VERB_TXSYNC)
2678 					D("pre txsync ring %d cur %d hwcur %d",
2679 					    i, ring->cur,
2680 					    kring->nr_hwcur);
2681 				if (nm_txsync_prologue(kring, ring) >= kring->nkr_num_slots) {
2682 					netmap_ring_reinit(kring);
2683 				} else if (kring->nm_sync(kring, sync_flags | NAF_FORCE_RECLAIM) == 0) {
2684 					nm_sync_finalize(kring);
2685 				}
2686 				if (netmap_verbose & NM_VERB_TXSYNC)
2687 					D("post txsync ring %d cur %d hwcur %d",
2688 					    i, ring->cur,
2689 					    kring->nr_hwcur);
2690 			} else {
2691 				if (nm_rxsync_prologue(kring, ring) >= kring->nkr_num_slots) {
2692 					netmap_ring_reinit(kring);
2693 				}
2694 				if (nm_may_forward_up(kring)) {
2695 					/* transparent forwarding, see netmap_poll() */
2696 					netmap_grab_packets(kring, &q, netmap_fwd);
2697 				}
2698 				if (kring->nm_sync(kring, sync_flags | NAF_FORCE_READ) == 0) {
2699 					nm_sync_finalize(kring);
2700 				}
2701 				ring_timestamp_set(ring);
2702 			}
2703 			nm_kr_put(kring);
2704 		}
2705 
2706 		if (mbq_peek(&q)) {
2707 			netmap_send_up(na->ifp, &q);
2708 		}
2709 
2710 		break;
2711 	}
2712 
2713 	default: {
2714 		return netmap_ioctl_legacy(priv, cmd, data, td);
2715 		break;
2716 	}
2717 	}
2718 
2719 	return (error);
2720 }
2721 
2722 size_t
2723 nmreq_size_by_type(uint16_t nr_reqtype)
2724 {
2725 	switch (nr_reqtype) {
2726 	case NETMAP_REQ_REGISTER:
2727 		return sizeof(struct nmreq_register);
2728 	case NETMAP_REQ_PORT_INFO_GET:
2729 		return sizeof(struct nmreq_port_info_get);
2730 	case NETMAP_REQ_VALE_ATTACH:
2731 		return sizeof(struct nmreq_vale_attach);
2732 	case NETMAP_REQ_VALE_DETACH:
2733 		return sizeof(struct nmreq_vale_detach);
2734 	case NETMAP_REQ_VALE_LIST:
2735 		return sizeof(struct nmreq_vale_list);
2736 	case NETMAP_REQ_PORT_HDR_SET:
2737 	case NETMAP_REQ_PORT_HDR_GET:
2738 		return sizeof(struct nmreq_port_hdr);
2739 	case NETMAP_REQ_VALE_NEWIF:
2740 		return sizeof(struct nmreq_vale_newif);
2741 	case NETMAP_REQ_VALE_DELIF:
2742 		return 0;
2743 	case NETMAP_REQ_VALE_POLLING_ENABLE:
2744 	case NETMAP_REQ_VALE_POLLING_DISABLE:
2745 		return sizeof(struct nmreq_vale_polling);
2746 	case NETMAP_REQ_POOLS_INFO_GET:
2747 		return sizeof(struct nmreq_pools_info);
2748 	}
2749 	return 0;
2750 }
2751 
2752 static size_t
2753 nmreq_opt_size_by_type(uint16_t nro_reqtype)
2754 {
2755 	size_t rv = sizeof(struct nmreq_option);
2756 #ifdef NETMAP_REQ_OPT_DEBUG
2757 	if (nro_reqtype & NETMAP_REQ_OPT_DEBUG)
2758 		return (nro_reqtype & ~NETMAP_REQ_OPT_DEBUG);
2759 #endif /* NETMAP_REQ_OPT_DEBUG */
2760 	switch (nro_reqtype) {
2761 #ifdef WITH_EXTMEM
2762 	case NETMAP_REQ_OPT_EXTMEM:
2763 		rv = sizeof(struct nmreq_opt_extmem);
2764 		break;
2765 #endif /* WITH_EXTMEM */
2766 	}
2767 	/* subtract the common header */
2768 	return rv - sizeof(struct nmreq_option);
2769 }
2770 
2771 int
2772 nmreq_copyin(struct nmreq_header *hdr, int nr_body_is_user)
2773 {
2774 	size_t rqsz, optsz, bufsz;
2775 	int error;
2776 	char *ker = NULL, *p;
2777 	struct nmreq_option **next, *src;
2778 	struct nmreq_option buf;
2779 	uint64_t *ptrs;
2780 
2781 	if (hdr->nr_reserved)
2782 		return EINVAL;
2783 
2784 	if (!nr_body_is_user)
2785 		return 0;
2786 
2787 	hdr->nr_reserved = nr_body_is_user;
2788 
2789 	/* compute the total size of the buffer */
2790 	rqsz = nmreq_size_by_type(hdr->nr_reqtype);
2791 	if (rqsz > NETMAP_REQ_MAXSIZE) {
2792 		error = EMSGSIZE;
2793 		goto out_err;
2794 	}
2795 	if ((rqsz && hdr->nr_body == (uintptr_t)NULL) ||
2796 		(!rqsz && hdr->nr_body != (uintptr_t)NULL)) {
2797 		/* Request body expected, but not found; or
2798 		 * request body found but unexpected. */
2799 		error = EINVAL;
2800 		goto out_err;
2801 	}
2802 
2803 	bufsz = 2 * sizeof(void *) + rqsz;
2804 	optsz = 0;
2805 	for (src = (struct nmreq_option *)(uintptr_t)hdr->nr_options; src;
2806 	     src = (struct nmreq_option *)(uintptr_t)buf.nro_next)
2807 	{
2808 		error = copyin(src, &buf, sizeof(*src));
2809 		if (error)
2810 			goto out_err;
2811 		optsz += sizeof(*src);
2812 		optsz += nmreq_opt_size_by_type(buf.nro_reqtype);
2813 		if (rqsz + optsz > NETMAP_REQ_MAXSIZE) {
2814 			error = EMSGSIZE;
2815 			goto out_err;
2816 		}
2817 		bufsz += optsz + sizeof(void *);
2818 	}
2819 
2820 	ker = nm_os_malloc(bufsz);
2821 	if (ker == NULL) {
2822 		error = ENOMEM;
2823 		goto out_err;
2824 	}
2825 	p = ker;
2826 
2827 	/* make a copy of the user pointers */
2828 	ptrs = (uint64_t*)p;
2829 	*ptrs++ = hdr->nr_body;
2830 	*ptrs++ = hdr->nr_options;
2831 	p = (char *)ptrs;
2832 
2833 	/* copy the body */
2834 	error = copyin((void *)(uintptr_t)hdr->nr_body, p, rqsz);
2835 	if (error)
2836 		goto out_restore;
2837 	/* overwrite the user pointer with the in-kernel one */
2838 	hdr->nr_body = (uintptr_t)p;
2839 	p += rqsz;
2840 
2841 	/* copy the options */
2842 	next = (struct nmreq_option **)&hdr->nr_options;
2843 	src = *next;
2844 	while (src) {
2845 		struct nmreq_option *opt;
2846 
2847 		/* copy the option header */
2848 		ptrs = (uint64_t *)p;
2849 		opt = (struct nmreq_option *)(ptrs + 1);
2850 		error = copyin(src, opt, sizeof(*src));
2851 		if (error)
2852 			goto out_restore;
2853 		/* make a copy of the user next pointer */
2854 		*ptrs = opt->nro_next;
2855 		/* overwrite the user pointer with the in-kernel one */
2856 		*next = opt;
2857 
2858 		/* initialize the option as not supported.
2859 		 * Recognized options will update this field.
2860 		 */
2861 		opt->nro_status = EOPNOTSUPP;
2862 
2863 		p = (char *)(opt + 1);
2864 
2865 		/* copy the option body */
2866 		optsz = nmreq_opt_size_by_type(opt->nro_reqtype);
2867 		if (optsz) {
2868 			/* the option body follows the option header */
2869 			error = copyin(src + 1, p, optsz);
2870 			if (error)
2871 				goto out_restore;
2872 			p += optsz;
2873 		}
2874 
2875 		/* move to next option */
2876 		next = (struct nmreq_option **)&opt->nro_next;
2877 		src = *next;
2878 	}
2879 	return 0;
2880 
2881 out_restore:
2882 	ptrs = (uint64_t *)ker;
2883 	hdr->nr_body = *ptrs++;
2884 	hdr->nr_options = *ptrs++;
2885 	hdr->nr_reserved = 0;
2886 	nm_os_free(ker);
2887 out_err:
2888 	return error;
2889 }
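
/*
 * Layout of the kernel buffer built by nmreq_copyin() above (the two
 * leading words and the word before each option hold the saved
 * user-space pointers; sizes are rqsz and optsz as computed above):
 *
 *	+-------------+----------------+------------+
 *	| saved       | saved          | request    |
 *	| nr_body ptr | nr_options ptr | body       | ...
 *	+-------------+----------------+------------+
 *	    +----------------+---------------+-------------+
 *	... | saved          | option        | option      | ...
 *	    | nro_next ptr   | header        | body        |
 *	    +----------------+---------------+-------------+
 *
 * nmreq_copyout() finds each saved pointer right before the
 * corresponding kernel copy and restores it before copying the
 * results back to user space.
 */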
2890 
2891 static int
2892 nmreq_copyout(struct nmreq_header *hdr, int rerror)
2893 {
2894 	struct nmreq_option *src, *dst;
2895 	void *ker = (void *)(uintptr_t)hdr->nr_body, *bufstart;
2896 	uint64_t *ptrs;
2897 	size_t bodysz;
2898 	int error;
2899 
2900 	if (!hdr->nr_reserved)
2901 		return rerror;
2902 
2903 	/* restore the user pointers in the header */
2904 	ptrs = (uint64_t *)ker - 2;
2905 	bufstart = ptrs;
2906 	hdr->nr_body = *ptrs++;
2907 	src = (struct nmreq_option *)(uintptr_t)hdr->nr_options;
2908 	hdr->nr_options = *ptrs;
2909 
2910 	if (!rerror) {
2911 		/* copy the body */
2912 		bodysz = nmreq_size_by_type(hdr->nr_reqtype);
2913 		error = copyout(ker, (void *)(uintptr_t)hdr->nr_body, bodysz);
2914 		if (error) {
2915 			rerror = error;
2916 			goto out;
2917 		}
2918 	}
2919 
2920 	/* copy the options */
2921 	dst = (struct nmreq_option *)(uintptr_t)hdr->nr_options;
2922 	while (src) {
2923 		size_t optsz;
2924 		uint64_t next;
2925 
2926 		/* restore the user pointer */
2927 		next = src->nro_next;
2928 		ptrs = (uint64_t *)src - 1;
2929 		src->nro_next = *ptrs;
2930 
2931 		/* always copy the option header */
2932 		error = copyout(src, dst, sizeof(*src));
2933 		if (error) {
2934 			rerror = error;
2935 			goto out;
2936 		}
2937 
2938 		/* copy the option body only if there was no error */
2939 		if (!rerror && !src->nro_status) {
2940 			optsz = nmreq_opt_size_by_type(src->nro_reqtype);
2941 			if (optsz) {
2942 				error = copyout(src + 1, dst + 1, optsz);
2943 				if (error) {
2944 					rerror = error;
2945 					goto out;
2946 				}
2947 			}
2948 		}
2949 		src = (struct nmreq_option *)(uintptr_t)next;
2950 		dst = (struct nmreq_option *)(uintptr_t)*ptrs;
2951 	}
2952 
2953 
2954 out:
2955 	hdr->nr_reserved = 0;
2956 	nm_os_free(bufstart);
2957 	return rerror;
2958 }
2959 
2960 struct nmreq_option *
2961 nmreq_findoption(struct nmreq_option *opt, uint16_t reqtype)
2962 {
2963 	for ( ; opt; opt = (struct nmreq_option *)(uintptr_t)opt->nro_next)
2964 		if (opt->nro_reqtype == reqtype)
2965 			return opt;
2966 	return NULL;
2967 }
2968 
2969 int
2970 nmreq_checkduplicate(struct nmreq_option *opt) {
2971 	uint16_t type = opt->nro_reqtype;
2972 	int dup = 0;
2973 
2974 	while ((opt = nmreq_findoption((struct nmreq_option *)(uintptr_t)opt->nro_next,
2975 			type))) {
2976 		dup++;
2977 		opt->nro_status = EINVAL;
2978 	}
2979 	return (dup ? EINVAL : 0);
2980 }
2981 
2982 static int
2983 nmreq_checkoptions(struct nmreq_header *hdr)
2984 {
2985 	struct nmreq_option *opt;
2986 	/* return error if there is still any option
2987 	 * marked as not supported
2988 	 */
2989 
2990 	for (opt = (struct nmreq_option *)(uintptr_t)hdr->nr_options; opt;
2991 	     opt = (struct nmreq_option *)(uintptr_t)opt->nro_next)
2992 		if (opt->nro_status == EOPNOTSUPP)
2993 			return EOPNOTSUPP;
2994 
2995 	return 0;
2996 }
2997 
2998 /*
2999  * select(2) and poll(2) handlers for the "netmap" device.
3000  *
3001  * Can be called for one or more queues.
3002  * Return the event mask corresponding to ready events.
3003  * If there are no ready events, do a selrecord on either individual
3004  * selinfo or on the global one.
3005  * Device-dependent parts (locking and sync of tx/rx rings)
3006  * are done through callbacks.
3007  *
3008  * On linux, the arguments are really pwait (the poll table) and 'td' is a struct file *.
3009  * The first one is remapped to pwait as selrecord() uses the name as a
3010  * hidden argument.
3011  */
3012 int
3013 netmap_poll(struct netmap_priv_d *priv, int events, NM_SELRECORD_T *sr)
3014 {
3015 	struct netmap_adapter *na;
3016 	struct netmap_kring *kring;
3017 	struct netmap_ring *ring;
3018 	u_int i, check_all_tx, check_all_rx, want[NR_TXRX], revents = 0;
3019 #define want_tx want[NR_TX]
3020 #define want_rx want[NR_RX]
3021 	struct mbq q;	/* packets from RX hw queues to host stack */
3022 
3023 	/*
3024 	 * In order to avoid nested locks, we need to "double check"
3025 	 * txsync and rxsync if we decide to do a selrecord().
3026 	 * retry_tx (and retry_rx, later) prevent looping forever.
3027 	 */
3028 	int retry_tx = 1, retry_rx = 1;
3029 
3030 	/* Transparent mode: send_down is 1 if we have found some
3031 	 * packets to forward (host RX ring --> NIC) during the rx
3032 	 * scan and we have not sent them down to the NIC yet.
3033  * Transparent mode requires all rings to be bound to a single
3034  * file descriptor.
3035 	 */
3036 	int send_down = 0;
3037 	int sync_flags = priv->np_sync_flags;
3038 
3039 	mbq_init(&q);
3040 
3041 	if (priv->np_nifp == NULL) {
3042 		D("No if registered");
3043 		return POLLERR;
3044 	}
3045 	mb(); /* make sure following reads are not from cache */
3046 
3047 	na = priv->np_na;
3048 
3049 	if (!nm_netmap_on(na))
3050 		return POLLERR;
3051 
3052 	if (netmap_verbose & 0x8000)
3053 		D("device %s events 0x%x", na->name, events);
3054 	want_tx = events & (POLLOUT | POLLWRNORM);
3055 	want_rx = events & (POLLIN | POLLRDNORM);
3056 
3057 	/*
3058 	 * check_all_{tx|rx} are set if the card has more than one queue AND
3059 	 * the file descriptor is bound to all of them. If so, we sleep on
3060 	 * the "global" selinfo, otherwise we sleep on individual selinfo
3061 	 * (FreeBSD only allows two selinfo's per file descriptor).
3062 	 * The interrupt routine in the driver wakes one or the other
3063 	 * (or both) depending on which clients are active.
3064 	 *
3065 	 * rxsync() is only called if we run out of buffers on a POLLIN.
3066 	 * txsync() is called if we run out of buffers on POLLOUT, or
3067 	 * there are pending packets to send. The latter can be disabled
3068 	 * passing NETMAP_NO_TX_POLL in the NIOCREG call.
3069 	 */
3070 	check_all_tx = nm_si_user(priv, NR_TX);
3071 	check_all_rx = nm_si_user(priv, NR_RX);
3072 
3073 #ifdef __FreeBSD__
3074 	/*
3075 	 * We start with a lock-free round which is cheap if we have
3076 	 * slots available. If this fails, then lock and call the sync
3077 	 * routines. We can't do this on Linux, as the contract says
3078 	 * that we must call nm_os_selrecord() unconditionally.
3079 	 */
3080 	if (want_tx) {
3081 		enum txrx t = NR_TX;
3082 		for (i = priv->np_qfirst[t]; want[t] && i < priv->np_qlast[t]; i++) {
3083 			kring = NMR(na, t)[i];
3084 			/* XXX compare ring->cur and kring->tail */
3085 			if (!nm_ring_empty(kring->ring)) {
3086 				revents |= want[t];
3087 				want[t] = 0;	/* also breaks the loop */
3088 			}
3089 		}
3090 	}
3091 	if (want_rx) {
3092 		enum txrx t = NR_RX;
3093 		want_rx = 0; /* look for a reason to run the handlers */
3094 		for (i = priv->np_qfirst[t]; i < priv->np_qlast[t]; i++) {
3095 			kring = NMR(na, t)[i];
3096 			if (kring->ring->cur == kring->ring->tail /* try fetch new buffers */
3097 			    || kring->rhead != kring->ring->head /* release buffers */) {
3098 				want_rx = 1;
3099 			}
3100 		}
3101 		if (!want_rx)
3102 			revents |= events & (POLLIN | POLLRDNORM); /* we have data */
3103 	}
3104 #endif
3105 
3106 #ifdef linux
3107 	/* The selrecord must be unconditional on linux. */
3108 	nm_os_selrecord(sr, check_all_tx ?
3109 	    &na->si[NR_TX] : &na->tx_rings[priv->np_qfirst[NR_TX]]->si);
3110 	nm_os_selrecord(sr, check_all_rx ?
3111 		&na->si[NR_RX] : &na->rx_rings[priv->np_qfirst[NR_RX]]->si);
3112 #endif /* linux */
3113 
3114 	/*
3115 	 * If we want to push packets out (priv->np_txpoll) or
3116 	 * want_tx is still set, we must issue txsync calls
3117 	 * (on all rings, to avoid that the tx rings stall).
3118 	 * Fortunately, normal tx mode has np_txpoll set.
3119 	 */
3120 	if (priv->np_txpoll || want_tx) {
3121 		/*
3122 		 * The first round checks if anyone is ready, if not
3123 		 * do a selrecord and another round to handle races.
3124 		 * want_tx goes to 0 if any space is found, and is
3125 		 * used to skip rings with no pending transmissions.
3126 		 */
3127 flush_tx:
3128 		for (i = priv->np_qfirst[NR_TX]; i < priv->np_qlast[NR_TX]; i++) {
3129 			int found = 0;
3130 
3131 			kring = na->tx_rings[i];
3132 			ring = kring->ring;
3133 
3134 			/*
3135 			 * Don't try to txsync this TX ring if we already found some
3136 			 * space in some of the TX rings (want_tx == 0) and there are no
3137 			 * TX slots in this ring that need to be flushed to the NIC
3138 			 * (head == hwcur).
3139 			 */
3140 			if (!send_down && !want_tx && ring->head == kring->nr_hwcur)
3141 				continue;
3142 
3143 			if (nm_kr_tryget(kring, 1, &revents))
3144 				continue;
3145 
3146 			if (nm_txsync_prologue(kring, ring) >= kring->nkr_num_slots) {
3147 				netmap_ring_reinit(kring);
3148 				revents |= POLLERR;
3149 			} else {
3150 				if (kring->nm_sync(kring, sync_flags))
3151 					revents |= POLLERR;
3152 				else
3153 					nm_sync_finalize(kring);
3154 			}
3155 
3156 			/*
3157 			 * If we found new slots, notify potential
3158 			 * listeners on the same ring.
3159 			 * Since we just did a txsync, look at the copies
3160 			 * of cur,tail in the kring.
3161 			 */
3162 			found = kring->rcur != kring->rtail;
3163 			nm_kr_put(kring);
3164 			if (found) { /* notify other listeners */
3165 				revents |= want_tx;
3166 				want_tx = 0;
3167 #ifndef linux
3168 				kring->nm_notify(kring, 0);
3169 #endif /* linux */
3170 			}
3171 		}
3172 		/* if there were any packets to forward, we must have handled them by now */
3173 		send_down = 0;
3174 		if (want_tx && retry_tx && sr) {
3175 #ifndef linux
3176 			nm_os_selrecord(sr, check_all_tx ?
3177 			    &na->si[NR_TX] : &na->tx_rings[priv->np_qfirst[NR_TX]]->si);
3178 #endif /* !linux */
3179 			retry_tx = 0;
3180 			goto flush_tx;
3181 		}
3182 	}
3183 
3184 	/*
3185 	 * If want_rx is still set scan receive rings.
3186 	 * Do it on all rings because otherwise we starve.
3187 	 */
3188 	if (want_rx) {
3189 		/* two rounds here for race avoidance */
3190 do_retry_rx:
3191 		for (i = priv->np_qfirst[NR_RX]; i < priv->np_qlast[NR_RX]; i++) {
3192 			int found = 0;
3193 
3194 			kring = na->rx_rings[i];
3195 			ring = kring->ring;
3196 
3197 			if (unlikely(nm_kr_tryget(kring, 1, &revents)))
3198 				continue;
3199 
3200 			if (nm_rxsync_prologue(kring, ring) >= kring->nkr_num_slots) {
3201 				netmap_ring_reinit(kring);
3202 				revents |= POLLERR;
3203 			}
3204 			/* now we can use kring->rcur, rtail */
3205 
3206 			/*
3207 			 * transparent mode support: collect packets from
3208 			 * hw rxring(s) that have been released by the user
3209 			 */
3210 			if (nm_may_forward_up(kring)) {
3211 				netmap_grab_packets(kring, &q, netmap_fwd);
3212 			}
3213 
3214 			/* Clear the NR_FORWARD flag anyway, it may be set by
3215 			 * the nm_sync() below, but only for the host RX ring (see
3216 			 * netmap_rxsync_from_host()). */
3217 			kring->nr_kflags &= ~NR_FORWARD;
3218 			if (kring->nm_sync(kring, sync_flags))
3219 				revents |= POLLERR;
3220 			else
3221 				nm_sync_finalize(kring);
3222 			send_down |= (kring->nr_kflags & NR_FORWARD);
3223 			ring_timestamp_set(ring);
3224 			found = kring->rcur != kring->rtail;
3225 			nm_kr_put(kring);
3226 			if (found) {
3227 				revents |= want_rx;
3228 				retry_rx = 0;
3229 #ifndef linux
3230 				kring->nm_notify(kring, 0);
3231 #endif /* linux */
3232 			}
3233 		}
3234 
3235 #ifndef linux
3236 		if (retry_rx && sr) {
3237 			nm_os_selrecord(sr, check_all_rx ?
3238 			    &na->si[NR_RX] : &na->rx_rings[priv->np_qfirst[NR_RX]]->si);
3239 		}
3240 #endif /* !linux */
3241 		if (send_down || retry_rx) {
3242 			retry_rx = 0;
3243 			if (send_down)
3244 				goto flush_tx; /* and retry_rx */
3245 			else
3246 				goto do_retry_rx;
3247 		}
3248 	}
3249 
3250 	/*
3251 	 * Transparent mode: released bufs (i.e. between kring->nr_hwcur and
3252 	 * ring->head) marked with NS_FORWARD on hw rx rings are passed up
3253 	 * to the host stack.
3254 	 */
3255 
3256 	if (mbq_peek(&q)) {
3257 		netmap_send_up(na->ifp, &q);
3258 	}
3259 
3260 	return (revents);
3261 #undef want_tx
3262 #undef want_rx
3263 }
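
/*
 * Illustrative user-space sketch (not part of this module) of the
 * poll()-based RX loop served by the handler above; it assumes the
 * nm_open()/NETMAP_RXRING helpers from <net/netmap_user.h> and a
 * hypothetical interface name "em0".
 *
 *	struct nm_desc *d = nm_open("netmap:em0", NULL, 0, NULL);
 *	struct pollfd pfd = { .fd = d->fd, .events = POLLIN };
 *
 *	for (;;) {
 *		poll(&pfd, 1, -1);	// sleeps in netmap_poll()
 *		for (int i = d->first_rx_ring; i <= d->last_rx_ring; i++) {
 *			struct netmap_ring *ring = NETMAP_RXRING(d->nifp, i);
 *			while (!nm_ring_empty(ring)) {
 *				// ... process ring->slot[ring->cur] ...
 *				ring->head = ring->cur = nm_ring_next(ring, ring->cur);
 *			}
 *		}
 *	}
 */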
3264 
3265 int
3266 nma_intr_enable(struct netmap_adapter *na, int onoff)
3267 {
3268 	bool changed = false;
3269 	enum txrx t;
3270 	int i;
3271 
3272 	for_rx_tx(t) {
3273 		for (i = 0; i < nma_get_nrings(na, t); i++) {
3274 			struct netmap_kring *kring = NMR(na, t)[i];
3275 			int on = !(kring->nr_kflags & NKR_NOINTR);
3276 
3277 			if (!!onoff != !!on) {
3278 				changed = true;
3279 			}
3280 			if (onoff) {
3281 				kring->nr_kflags &= ~NKR_NOINTR;
3282 			} else {
3283 				kring->nr_kflags |= NKR_NOINTR;
3284 			}
3285 		}
3286 	}
3287 
3288 	if (!changed) {
3289 		return 0; /* nothing to do */
3290 	}
3291 
3292 	if (!na->nm_intr) {
3293 		D("Cannot %s interrupts for %s", onoff ? "enable" : "disable",
3294 		  na->name);
3295 		return -1;
3296 	}
3297 
3298 	na->nm_intr(na, onoff);
3299 
3300 	return 0;
3301 }
3302 
3303 
3304 /*-------------------- driver support routines -------------------*/
3305 
3306 /* default notify callback */
3307 static int
3308 netmap_notify(struct netmap_kring *kring, int flags)
3309 {
3310 	struct netmap_adapter *na = kring->notify_na;
3311 	enum txrx t = kring->tx;
3312 
3313 	nm_os_selwakeup(&kring->si);
3314 	/* optimization: avoid a wake up on the global
3315 	 * queue if nobody has registered for more
3316 	 * than one ring
3317 	 */
3318 	if (na->si_users[t] > 0)
3319 		nm_os_selwakeup(&na->si[t]);
3320 
3321 	return NM_IRQ_COMPLETED;
3322 }
3323 
3324 /* called by all routines that create netmap_adapters.
3325  * provide some defaults and get a reference to the
3326  * memory allocator
3327  */
3328 int
3329 netmap_attach_common(struct netmap_adapter *na)
3330 {
3331 	if (na->num_tx_rings == 0 || na->num_rx_rings == 0) {
3332 		D("%s: invalid rings tx %d rx %d",
3333 			na->name, na->num_tx_rings, na->num_rx_rings);
3334 		return EINVAL;
3335 	}
3336 
3337 	if (!na->rx_buf_maxsize) {
3338 		/* Set a conservative default (larger is safer). */
3339 		na->rx_buf_maxsize = PAGE_SIZE;
3340 	}
3341 
3342 #ifdef __FreeBSD__
3343 	if (na->na_flags & NAF_HOST_RINGS && na->ifp) {
3344 		na->if_input = na->ifp->if_input; /* for netmap_send_up */
3345 	}
3346 	na->pdev = na; /* make sure netmap_mem_map() is called */
3347 #endif /* __FreeBSD__ */
3348 	if (na->na_flags & NAF_HOST_RINGS) {
3349 		if (na->num_host_rx_rings == 0)
3350 			na->num_host_rx_rings = 1;
3351 		if (na->num_host_tx_rings == 0)
3352 			na->num_host_tx_rings = 1;
3353 	}
3354 	if (na->nm_krings_create == NULL) {
3355 		/* we assume that we have been called by a driver,
3356 		 * since other port types all provide their own
3357 		 * nm_krings_create
3358 		 */
3359 		na->nm_krings_create = netmap_hw_krings_create;
3360 		na->nm_krings_delete = netmap_hw_krings_delete;
3361 	}
3362 	if (na->nm_notify == NULL)
3363 		na->nm_notify = netmap_notify;
3364 	na->active_fds = 0;
3365 
3366 	if (na->nm_mem == NULL) {
3367 		/* use the global allocator */
3368 		na->nm_mem = netmap_mem_get(&nm_mem);
3369 	}
3370 #ifdef WITH_VALE
3371 	if (na->nm_bdg_attach == NULL)
3372 		/* no special nm_bdg_attach callback. On VALE
3373 		 * attach, we need to interpose a bwrap
3374 		 */
3375 		na->nm_bdg_attach = netmap_default_bdg_attach;
3376 #endif
3377 
3378 	return 0;
3379 }
3380 
3381 /* Wrapper for the register callback provided by netmap-enabled
3382  * hardware drivers.
3383  * nm_iszombie(na) means that the driver module has been
3384  * unloaded, so we cannot call into it.
3385  * nm_os_ifnet_lock() must guarantee mutual exclusion with
3386  * module unloading.
3387  */
3388 static int
3389 netmap_hw_reg(struct netmap_adapter *na, int onoff)
3390 {
3391 	struct netmap_hw_adapter *hwna =
3392 		(struct netmap_hw_adapter*)na;
3393 	int error = 0;
3394 
3395 	nm_os_ifnet_lock();
3396 
3397 	if (nm_iszombie(na)) {
3398 		if (onoff) {
3399 			error = ENXIO;
3400 		} else if (na != NULL) {
3401 			na->na_flags &= ~NAF_NETMAP_ON;
3402 		}
3403 		goto out;
3404 	}
3405 
3406 	error = hwna->nm_hw_register(na, onoff);
3407 
3408 out:
3409 	nm_os_ifnet_unlock();
3410 
3411 	return error;
3412 }
3413 
3414 static void
3415 netmap_hw_dtor(struct netmap_adapter *na)
3416 {
3417 	if (na->ifp == NULL)
3418 		return;
3419 
3420 	NM_DETACH_NA(na->ifp);
3421 }
3422 
3423 
3424 /*
3425  * Allocate a netmap_adapter object, and initialize it from the
3426  * 'arg' passed by the driver on attach.
3427  * We allocate a block of memory of 'size' bytes, which has room
3428  * for struct netmap_adapter plus additional room private to
3429  * the caller.
3430  * Return 0 on success, ENOMEM otherwise.
3431  */
3432 int
3433 netmap_attach_ext(struct netmap_adapter *arg, size_t size, int override_reg)
3434 {
3435 	struct netmap_hw_adapter *hwna = NULL;
3436 	struct ifnet *ifp = NULL;
3437 
3438 	if (size < sizeof(struct netmap_hw_adapter)) {
3439 		D("Invalid netmap adapter size %d", (int)size);
3440 		return EINVAL;
3441 	}
3442 
3443 	if (arg == NULL || arg->ifp == NULL)
3444 		return EINVAL;
3445 
3446 	ifp = arg->ifp;
3447 	if (NM_NA_CLASH(ifp)) {
3448 		/* If NA(ifp) is not null but there is no valid netmap
3449 		 * adapter it means that someone else is using the same
3450 		 * pointer (e.g. ax25_ptr on linux). This happens for
3451 		 * instance when also PF_RING is in use. */
3452 		D("Error: netmap adapter hook is busy");
3453 		return EBUSY;
3454 	}
3455 
3456 	hwna = nm_os_malloc(size);
3457 	if (hwna == NULL)
3458 		goto fail;
3459 	hwna->up = *arg;
3460 	hwna->up.na_flags |= NAF_HOST_RINGS | NAF_NATIVE;
3461 	strncpy(hwna->up.name, ifp->if_xname, sizeof(hwna->up.name));
3462 	if (override_reg) {
3463 		hwna->nm_hw_register = hwna->up.nm_register;
3464 		hwna->up.nm_register = netmap_hw_reg;
3465 	}
3466 	if (netmap_attach_common(&hwna->up)) {
3467 		nm_os_free(hwna);
3468 		goto fail;
3469 	}
3470 	netmap_adapter_get(&hwna->up);
3471 
3472 	NM_ATTACH_NA(ifp, &hwna->up);
3473 
3474 	nm_os_onattach(ifp);
3475 
3476 	if (arg->nm_dtor == NULL) {
3477 		hwna->up.nm_dtor = netmap_hw_dtor;
3478 	}
3479 
3480 	if_printf(ifp, "netmap queues/slots: TX %d/%d, RX %d/%d\n",
3481 	    hwna->up.num_tx_rings, hwna->up.num_tx_desc,
3482 	    hwna->up.num_rx_rings, hwna->up.num_rx_desc);
3483 	return 0;
3484 
3485 fail:
3486 	D("fail, arg %p ifp %p na %p", arg, ifp, hwna);
3487 	return (hwna ? EINVAL : ENOMEM);
3488 }
3489 
3490 
3491 int
3492 netmap_attach(struct netmap_adapter *arg)
3493 {
3494 	return netmap_attach_ext(arg, sizeof(struct netmap_hw_adapter),
3495 			1 /* override nm_reg */);
3496 }
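
/*
 * Attach sketch (illustrative only; the foo_* names are hypothetical):
 * a native driver typically fills a struct netmap_adapter on the stack
 * at attach time and registers it with netmap_attach():
 *
 *	static void
 *	foo_netmap_attach(struct foo_softc *sc)
 *	{
 *		struct netmap_adapter na;
 *
 *		bzero(&na, sizeof(na));
 *		na.ifp = sc->ifp;
 *		na.num_tx_desc = sc->num_tx_desc;
 *		na.num_rx_desc = sc->num_rx_desc;
 *		na.num_tx_rings = na.num_rx_rings = sc->num_queues;
 *		na.nm_register = foo_netmap_reg;
 *		na.nm_txsync = foo_netmap_txsync;
 *		na.nm_rxsync = foo_netmap_rxsync;
 *		netmap_attach(&na);
 *	}
 *
 * netmap_attach_ext() copies the structure (hwna->up = *arg above), so
 * the stack variable may go out of scope after the call returns.
 */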
3497 
3498 
3499 void
3500 NM_DBG(netmap_adapter_get)(struct netmap_adapter *na)
3501 {
3502 	if (!na) {
3503 		return;
3504 	}
3505 
3506 	refcount_acquire(&na->na_refcount);
3507 }
3508 
3509 
3510 /* returns 1 iff the netmap_adapter is destroyed */
3511 int
3512 NM_DBG(netmap_adapter_put)(struct netmap_adapter *na)
3513 {
3514 	if (!na)
3515 		return 1;
3516 
3517 	if (!refcount_release(&na->na_refcount))
3518 		return 0;
3519 
3520 	if (na->nm_dtor)
3521 		na->nm_dtor(na);
3522 
3523 	if (na->tx_rings) { /* XXX should not happen */
3524 		D("freeing leftover tx_rings");
3525 		na->nm_krings_delete(na);
3526 	}
3527 	netmap_pipe_dealloc(na);
3528 	if (na->nm_mem)
3529 		netmap_mem_put(na->nm_mem);
3530 	bzero(na, sizeof(*na));
3531 	nm_os_free(na);
3532 
3533 	return 1;
3534 }
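
/*
 * Reference counting sketch (illustrative): code that keeps a pointer
 * to an adapter beyond the lookup that produced it should pair a get
 * with a put:
 *
 *	netmap_adapter_get(na);         // take a reference
 *	(use na)
 *	if (netmap_adapter_put(na))     // returns 1 if na was destroyed
 *		na = NULL;              // do not touch it anymore
 */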
3535 
3536 /* nm_krings_create callback for all hardware native adapters */
3537 int
3538 netmap_hw_krings_create(struct netmap_adapter *na)
3539 {
3540 	int ret = netmap_krings_create(na, 0);
3541 	if (ret == 0) {
3542 		/* initialize the mbq for the sw rx ring */
3543 		u_int lim = netmap_real_rings(na, NR_RX), i;
3544 		for (i = na->num_rx_rings; i < lim; i++) {
3545 			mbq_safe_init(&NMR(na, NR_RX)[i]->rx_queue);
3546 		}
3547 		ND("initialized sw rx queue %d", na->num_rx_rings);
3548 	}
3549 	return ret;
3550 }
3551 
3552 
3553 
3554 /*
3555  * Called on module unload by the netmap-enabled drivers
3556  */
3557 void
3558 netmap_detach(struct ifnet *ifp)
3559 {
3560 	struct netmap_adapter *na = NA(ifp);
3561 
3562 	if (!na)
3563 		return;
3564 
3565 	NMG_LOCK();
3566 	netmap_set_all_rings(na, NM_KR_LOCKED);
3567 	/*
3568 	 * If the netmap adapter is not native, somebody else has
3569 	 * changed it, so we cannot release it here.
3570 	 * The NAF_ZOMBIE flag will notify the new owner that
3571 	 * the driver is gone.
3572 	 */
3573 	if (!(na->na_flags & NAF_NATIVE) || !netmap_adapter_put(na)) {
3574 		na->na_flags |= NAF_ZOMBIE;
3575 	}
3576 	/* give active users a chance to notice that NAF_ZOMBIE has been
3577 	 * turned on, so that they can stop and return an error to userspace.
3578 	 * Note that this becomes a NOP if there are no active users and,
3579 	 * therefore, the put() above has deleted the na, since now NA(ifp) is
3580 	 * NULL.
3581 	 */
3582 	netmap_enable_all_rings(ifp);
3583 	NMG_UNLOCK();
3584 }
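
/*
 * Detach sketch (illustrative; foo_detach is a hypothetical driver
 * routine): netmap_detach() is called on the driver detach/unload
 * path, before the ifnet itself goes away:
 *
 *	static int
 *	foo_detach(device_t dev)
 *	{
 *		struct foo_softc *sc = device_get_softc(dev);
 *
 *		netmap_detach(sc->ifp);    // release or zombify the adapter
 *		ether_ifdetach(sc->ifp);
 *		(driver-specific teardown follows)
 *		return (0);
 *	}
 */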
3585 
3586 
3587 /*
3588  * Intercept packets from the network stack and pass them
3589  * to netmap as incoming packets on the 'software' ring.
3590  *
3591  * We only store packets in a bounded mbq and then copy them
3592  * in the relevant rxsync routine.
3593  *
3594  * We rely on the OS to make sure that the ifp and na do not go
3595  * away (typically the caller checks for IFF_DRV_RUNNING or the like).
3596  * In nm_register() or whenever there is a reinitialization,
3597  * we make sure to make the mode change visible here.
3598  */
3599 int
3600 netmap_transmit(struct ifnet *ifp, struct mbuf *m)
3601 {
3602 	struct netmap_adapter *na = NA(ifp);
3603 	struct netmap_kring *kring, *tx_kring;
3604 	u_int len = MBUF_LEN(m);
3605 	u_int error = ENOBUFS;
3606 	unsigned int txr;
3607 	struct mbq *q;
3608 	int busy;
3609 	u_int i;
3610 
3611 	i = MBUF_TXQ(m);
3612 	if (i >= na->num_host_rx_rings) {
3613 		i = i % na->num_host_rx_rings;
3614 	}
3615 	kring = NMR(na, NR_RX)[nma_get_nrings(na, NR_RX) + i];
3616 
3617 	// XXX [Linux] we do not need this lock
3618 	// if we follow the down/configure/up protocol -gl
3619 	// mtx_lock(&na->core_lock);
3620 
3621 	if (!nm_netmap_on(na)) {
3622 		D("%s not in netmap mode anymore", na->name);
3623 		error = ENXIO;
3624 		goto done;
3625 	}
3626 
3627 	txr = MBUF_TXQ(m);
3628 	if (txr >= na->num_tx_rings) {
3629 		txr %= na->num_tx_rings;
3630 	}
3631 	tx_kring = NMR(na, NR_TX)[txr];
3632 
3633 	if (tx_kring->nr_mode == NKR_NETMAP_OFF) {
3634 		return MBUF_TRANSMIT(na, ifp, m);
3635 	}
3636 
3637 	q = &kring->rx_queue;
3638 
3639 	// XXX reconsider long packets if we handle fragments
3640 	if (len > NETMAP_BUF_SIZE(na)) { /* too long for us */
3641 		D("%s from_host, drop packet size %d > %d", na->name,
3642 			len, NETMAP_BUF_SIZE(na));
3643 		goto done;
3644 	}
3645 
3646 	if (!netmap_generic_hwcsum) {
3647 		if (nm_os_mbuf_has_csum_offld(m)) {
3648 			RD(1, "%s drop mbuf that needs checksum offload", na->name);
3649 			goto done;
3650 		}
3651 	}
3652 
3653 	if (nm_os_mbuf_has_seg_offld(m)) {
3654 		RD(1, "%s drop mbuf that needs generic segmentation offload", na->name);
3655 		goto done;
3656 	}
3657 
3658 	/* protect against netmap_rxsync_from_host(), netmap_sw_to_nic()
3659 	 * and maybe other instances of netmap_transmit (the latter
3660 	 * not possible on Linux).
3661 	 * We enqueue the mbuf only if we are sure there is going to be
3662 	 * enough room in the host RX ring, otherwise we drop it.
3663 	 */
3664 	mbq_lock(q);
3665 
3666 	busy = kring->nr_hwtail - kring->nr_hwcur;
3667 	if (busy < 0)
3668 		busy += kring->nkr_num_slots;
3669 	if (busy + mbq_len(q) >= kring->nkr_num_slots - 1) {
3670 		RD(2, "%s full hwcur %d hwtail %d qlen %d", na->name,
3671 			kring->nr_hwcur, kring->nr_hwtail, mbq_len(q));
3672 	} else {
3673 		mbq_enqueue(q, m);
3674 		ND(2, "%s %d bufs in queue", na->name, mbq_len(q));
3675 		/* notify outside the lock */
3676 		m = NULL;
3677 		error = 0;
3678 	}
3679 	mbq_unlock(q);
3680 
3681 done:
3682 	if (m)
3683 		m_freem(m);
3684 	/* unconditionally wake up listeners */
3685 	kring->nm_notify(kring, 0);
3686 	/* this is normally netmap_notify(), but for nics
3687 	 * connected to a bridge it is netmap_bwrap_intr_notify(),
3688 	 * that possibly forwards the frames through the switch
3689 	 */
3690 
3691 	return (error);
3692 }
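
/*
 * Host-stack TX path sketch (illustrative, assuming the FreeBSD OS
 * glue): while netmap mode is on, nm_os_onenter() typically saves the
 * interface's if_transmit pointer and redirects it to
 * netmap_transmit(), so a packet sent by the stack conceptually
 * follows:
 *
 *	struct mbuf *m;                           // built by the stack
 *	int error = ifp->if_transmit(ifp, m);     // now netmap_transmit()
 *	// on success the mbuf sits on the host RX kring's rx_queue and
 *	// is copied out by the host rxsync; otherwise it has been freed
 */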
3693 
3694 
3695 /*
3696  * netmap_reset() is called by the driver routines when reinitializing
3697  * a ring. The driver is in charge of locking to protect the kring.
3698  * If native netmap mode is not set, just return NULL.
3699  * If native netmap mode is set, we must in particular set nr_mode to
3700  * NKR_NETMAP_ON.
3701  */
3702 struct netmap_slot *
3703 netmap_reset(struct netmap_adapter *na, enum txrx tx, u_int n,
3704 	u_int new_cur)
3705 {
3706 	struct netmap_kring *kring;
3707 	int new_hwofs, lim;
3708 
3709 	if (!nm_native_on(na)) {
3710 		ND("interface not in native netmap mode");
3711 		return NULL;	/* nothing to reinitialize */
3712 	}
3713 
3714 	/* XXX note: in the new scheme, we are not guaranteed to be
3715 	 * under lock (e.g. when called on a device reset).
3716 	 * In this case, we should set a flag and not trust the
3717 	 * values too much. In practice: TODO
3718 	 * - set a RESET flag somewhere in the kring
3719 	 * - do the processing in a conservative way
3720 	 * - let the *sync() fixup at the end.
3721 	 */
3722 	if (tx == NR_TX) {
3723 		if (n >= na->num_tx_rings)
3724 			return NULL;
3725 
3726 		kring = na->tx_rings[n];
3727 
3728 		if (kring->nr_pending_mode == NKR_NETMAP_OFF) {
3729 			kring->nr_mode = NKR_NETMAP_OFF;
3730 			return NULL;
3731 		}
3732 
3733 		// XXX check whether we should use hwcur or rcur
3734 		new_hwofs = kring->nr_hwcur - new_cur;
3735 	} else {
3736 		if (n >= na->num_rx_rings)
3737 			return NULL;
3738 		kring = na->rx_rings[n];
3739 
3740 		if (kring->nr_pending_mode == NKR_NETMAP_OFF) {
3741 			kring->nr_mode = NKR_NETMAP_OFF;
3742 			return NULL;
3743 		}
3744 
3745 		new_hwofs = kring->nr_hwtail - new_cur;
3746 	}
3747 	lim = kring->nkr_num_slots - 1;
3748 	if (new_hwofs > lim)
3749 		new_hwofs -= lim + 1;
3750 
3751 	/* Always set the new offset value and realign the ring. */
3752 	if (netmap_verbose)
3753 	    D("%s %s%d hwofs %d -> %d, hwtail %d -> %d",
3754 		na->name,
3755 		tx == NR_TX ? "TX" : "RX", n,
3756 		kring->nkr_hwofs, new_hwofs,
3757 		kring->nr_hwtail,
3758 		tx == NR_TX ? lim : kring->nr_hwtail);
3759 	kring->nkr_hwofs = new_hwofs;
3760 	if (tx == NR_TX) {
3761 		kring->nr_hwtail = kring->nr_hwcur + lim;
3762 		if (kring->nr_hwtail > lim)
3763 			kring->nr_hwtail -= lim + 1;
3764 	}
3765 
3766 	/*
3767 	 * Wakeup on the individual and global selwait.
3768 	 * We do the wakeup here, but the ring is not yet reconfigured.
3769 	 * However, we are under lock so there are no races.
3770 	 */
3771 	kring->nr_mode = NKR_NETMAP_ON;
3772 	kring->nm_notify(kring, 0);
3773 	return kring->ring->slot;
3774 }
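
/*
 * Driver-side sketch (illustrative; index handling is simplified): a
 * ring (re)initialization routine asks netmap for the slot array and,
 * when netmap mode is active, programs the NIC descriptors with netmap
 * buffers instead of mbufs:
 *
 *	struct netmap_slot *slot = netmap_reset(na, NR_RX, ring_nr, 0);
 *
 *	if (slot != NULL) {                  // netmap mode is active
 *		for (i = 0; i < nslots; i++) {
 *			uint64_t paddr;
 *			void *addr = PNMB(na, slot + i, &paddr);
 *			// point NIC descriptor i at paddr/addr
 *		}
 *	}
 */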
3775 
3776 
3777 /*
3778  * Dispatch rx/tx interrupts to the netmap rings.
3779  *
3780  * "work_done" is non-null on the RX path, NULL for the TX path.
3781  * We rely on the OS to make sure that there is only one active
3782  * instance per queue, and that there is appropriate locking.
3783  *
3784  * The 'notify' routine depends on what the ring is attached to.
3785  * - for a netmap file descriptor, do a selwakeup on the individual
3786  *   waitqueue, plus one on the global one if needed
3787  *   (see netmap_notify)
3788  * - for a nic connected to a switch, call the proper forwarding routine
3789  *   (see netmap_bwrap_intr_notify)
3790  */
3791 int
3792 netmap_common_irq(struct netmap_adapter *na, u_int q, u_int *work_done)
3793 {
3794 	struct netmap_kring *kring;
3795 	enum txrx t = (work_done ? NR_RX : NR_TX);
3796 
3797 	q &= NETMAP_RING_MASK;
3798 
3799 	if (netmap_verbose) {
3800 	        RD(5, "received %s queue %d", work_done ? "RX" : "TX" , q);
3801 	}
3802 
3803 	if (q >= nma_get_nrings(na, t))
3804 		return NM_IRQ_PASS; // not a physical queue
3805 
3806 	kring = NMR(na, t)[q];
3807 
3808 	if (kring->nr_mode == NKR_NETMAP_OFF) {
3809 		return NM_IRQ_PASS;
3810 	}
3811 
3812 	if (t == NR_RX) {
3813 		kring->nr_kflags |= NKR_PENDINTR;	// XXX atomic ?
3814 		*work_done = 1; /* do not fire napi again */
3815 	}
3816 
3817 	return kring->nm_notify(kring, 0);
3818 }
3819 
3820 
3821 /*
3822  * Default functions to handle rx/tx interrupts from a physical device.
3823  * "work_done" is non-null on the RX path, NULL for the TX path.
3824  *
3825  * If the card is not in netmap mode, simply return NM_IRQ_PASS,
3826  * so that the caller proceeds with regular processing.
3827  * Otherwise call netmap_common_irq().
3828  *
3829  * If the card is connected to a netmap file descriptor,
3830  * do a selwakeup on the individual queue, plus one on the global one
3831  * if needed (multiqueue card _and_ there are multiqueue listeners),
3832  * and return NM_IRQ_COMPLETED.
3833  *
3834  * Finally, if called on rx from an interface connected to a switch,
3835  * calls the proper forwarding routine.
3836  */
3837 int
3838 netmap_rx_irq(struct ifnet *ifp, u_int q, u_int *work_done)
3839 {
3840 	struct netmap_adapter *na = NA(ifp);
3841 
3842 	/*
3843 	 * XXX emulated netmap mode sets NAF_SKIP_INTR so
3844 	 * we still use the regular driver even though the previous
3845 	 * check fails. It is unclear whether we should use
3846 	 * nm_native_on() here.
3847 	 */
3848 	if (!nm_netmap_on(na))
3849 		return NM_IRQ_PASS;
3850 
3851 	if (na->na_flags & NAF_SKIP_INTR) {
3852 		ND("use regular interrupt");
3853 		return NM_IRQ_PASS;
3854 	}
3855 
3856 	return netmap_common_irq(na, q, work_done);
3857 }
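
/*
 * Interrupt handler sketch (illustrative; the foo_* names are
 * hypothetical): a per-queue RX interrupt routine gives netmap the
 * first chance to handle the event and falls back to the regular path
 * only when netmap is not in charge:
 *
 *	static void
 *	foo_rxeof(struct foo_rx_queue *rxq)
 *	{
 *		u_int work_done = 0;
 *
 *		if (netmap_rx_irq(rxq->ifp, rxq->me, &work_done) != NM_IRQ_PASS)
 *			return;          // netmap consumed the interrupt
 *		// regular mbuf-based receive processing goes here
 *	}
 *
 * The TX completion path does the same, passing a NULL work_done (see
 * netmap_common_irq() above).
 */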
3858 
3859 /* set/clear native flags and if_transmit/netdev_ops */
3860 void
3861 nm_set_native_flags(struct netmap_adapter *na)
3862 {
3863 	struct ifnet *ifp = na->ifp;
3864 
3865 	/* We do the setup for intercepting packets only if we are the
3866 	 * first user of this adapter. */
3867 	if (na->active_fds > 0) {
3868 		return;
3869 	}
3870 
3871 	na->na_flags |= NAF_NETMAP_ON;
3872 	nm_os_onenter(ifp);
3873 	nm_update_hostrings_mode(na);
3874 }
3875 
3876 void
3877 nm_clear_native_flags(struct netmap_adapter *na)
3878 {
3879 	struct ifnet *ifp = na->ifp;
3880 
3881 	/* We undo the setup for intercepting packets only if we are the
3882 	 * last user of this adapter. */
3883 	if (na->active_fds > 0) {
3884 		return;
3885 	}
3886 
3887 	nm_update_hostrings_mode(na);
3888 	nm_os_onexit(ifp);
3889 
3890 	na->na_flags &= ~NAF_NETMAP_ON;
3891 }
3892 
3893 
3894 /*
3895  * Module loader and unloader
3896  *
3897  * netmap_init() creates the /dev/netmap device and initializes
3898  * all global variables. Returns 0 on success, errno on failure
3899  * (but there is no chance)
3900  *
3901  * netmap_fini() destroys everything.
3902  */
3903 
3904 static struct cdev *netmap_dev; /* /dev/netmap character device. */
3905 extern struct cdevsw netmap_cdevsw;
3906 
3907 
3908 void
3909 netmap_fini(void)
3910 {
3911 	if (netmap_dev)
3912 		destroy_dev(netmap_dev);
3913 	/* we assume that there are no remaining netmap users */
3914 	nm_os_ifnet_fini();
3915 	netmap_uninit_bridges();
3916 	netmap_mem_fini();
3917 	NMG_LOCK_DESTROY();
3918 	nm_prinf("netmap: unloaded module.\n");
3919 }
3920 
3921 
3922 int
3923 netmap_init(void)
3924 {
3925 	int error;
3926 
3927 	NMG_LOCK_INIT();
3928 
3929 	error = netmap_mem_init();
3930 	if (error != 0)
3931 		goto fail;
3932 	/*
3933 	 * MAKEDEV_ETERNAL_KLD avoids an expensive check on syscalls
3934 	 * when the module is compiled in.
3935 	 * XXX could use make_dev_credv() to get error number
3936 	 */
3937 	netmap_dev = make_dev_credf(MAKEDEV_ETERNAL_KLD,
3938 		&netmap_cdevsw, 0, NULL, UID_ROOT, GID_WHEEL, 0600,
3939 			      "netmap");
3940 	if (!netmap_dev)
3941 		goto fail;
3942 
3943 	error = netmap_init_bridges();
3944 	if (error)
3945 		goto fail;
3946 
3947 #ifdef __FreeBSD__
3948 	nm_os_vi_init_index();
3949 #endif
3950 
3951 	error = nm_os_ifnet_init();
3952 	if (error)
3953 		goto fail;
3954 
3955 	nm_prinf("netmap: loaded module\n");
3956 	return (0);
3957 fail:
3958 	netmap_fini();
3959 	return (EINVAL); /* may be incorrect */
3960 }
3961
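/*
 * Loader glue sketch (illustrative): the actual module declaration
 * lives in the OS-specific files; on FreeBSD it boils down to a
 * standard module event handler along these lines (the handler name
 * here is an assumption):
 *
 *	static int
 *	netmap_loader(struct module *m, int what, void *arg)
 *	{
 *		int error = 0;
 *
 *		switch (what) {
 *		case MOD_LOAD:
 *			error = netmap_init();
 *			break;
 *		case MOD_UNLOAD:
 *			netmap_fini();
 *			break;
 *		default:
 *			error = EOPNOTSUPP;
 *			break;
 *		}
 *		return (error);
 *	}
 */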