xref: /freebsd/sys/dev/netmap/netmap.c (revision 2a243b9539a45b392a515569cab2091844cf2bdf)
1 /*
2  * Copyright (C) 2011-2014 Matteo Landi
3  * Copyright (C) 2011-2016 Luigi Rizzo
4  * Copyright (C) 2011-2016 Giuseppe Lettieri
5  * Copyright (C) 2011-2016 Vincenzo Maffione
6  * All rights reserved.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  *   1. Redistributions of source code must retain the above copyright
12  *      notice, this list of conditions and the following disclaimer.
13  *   2. Redistributions in binary form must reproduce the above copyright
14  *      notice, this list of conditions and the following disclaimer in the
15  *      documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27  * SUCH DAMAGE.
28  */
29 
30 
31 /*
32  * $FreeBSD$
33  *
34  * This module supports memory mapped access to network devices,
35  * see netmap(4).
36  *
37  * The module uses a large memory pool allocated by the kernel
38  * and accessible as mmapped memory by multiple userspace threads/processes.
39  * The memory pool contains packet buffers and "netmap rings",
40  * i.e. user-accessible copies of the interface's queues.
41  *
42  * Access to the network card works like this:
43  * 1. a process/thread issues one or more open() on /dev/netmap, to create
44  *    select()able file descriptor on which events are reported.
45  * 2. on each descriptor, the process issues an ioctl() to identify
46  *    the interface that should report events to the file descriptor.
47  * 3. on each descriptor, the process issues an mmap() request to
48  *    map the shared memory region within the process' address space.
49  *    The list of interesting queues is indicated by a location in
50  *    the shared memory region.
51  * 4. using the functions in the netmap(4) userspace API, a process
52  *    can look up the occupation state of a queue, access memory buffers,
53  *    and retrieve received packets or enqueue packets to transmit.
54  * 5. using some ioctl()s the process can synchronize the userspace view
55  *    of the queue with the actual status in the kernel. This includes both
56  *    receiving the notification of new packets, and transmitting new
57  *    packets on the output interface.
58  * 6. select() or poll() can be used to wait for events on individual
59  *    transmit or receive queues (or all queues for a given interface).
60  *
61 
62 		SYNCHRONIZATION (USER)
63 
64 The netmap rings and data structures may be shared among multiple
65 user threads or even independent processes.
66 Any synchronization among those threads/processes is delegated
67 to the threads themselves. Only one thread at a time can be in
68 a system call on the same netmap ring. The OS does not enforce
69 this and only guarantees against system crashes in case of
70 invalid usage.
71 
72 		LOCKING (INTERNAL)
73 
74 Within the kernel, access to the netmap rings is protected as follows:
75 
76 - a spinlock on each ring, to handle producer/consumer races on
77   RX rings attached to the host stack (against multiple host
78   threads writing from the host stack to the same ring),
79   and on 'destination' rings attached to a VALE switch
80   (i.e. RX rings in VALE ports, and TX rings in NIC/host ports),
81   protecting against multiple active senders for the same destination.
82 
83 - an atomic variable to guarantee that there is at most one
84   instance of *_*xsync() on the ring at any time.
85   For rings connected to user file
86   descriptors, an atomic_test_and_set() protects this, and the
87   lock on the ring is not actually used.
88   For NIC RX rings connected to a VALE switch, an atomic_test_and_set()
89   is also used to prevent multiple executions (the driver might indeed
90   already guarantee this).
91   For NIC TX rings connected to a VALE switch, the lock arbitrates
92   access to the queue (both when allocating buffers and when pushing
93   them out).
94 
95 - *xsync() should be protected against initializations of the card.
96   On FreeBSD most devices have the reset routine protected by
97   a RING lock (ixgbe, igb, em) or core lock (re). lem is missing
98   the RING protection on rx_reset(), this should be added.
99 
100   On linux there is an external lock on the tx path, which probably
101   also arbitrates access to the reset routine. XXX to be revised
102 
103 - a per-interface core_lock protecting access from the host stack
104   while interfaces may be detached from netmap mode.
105   XXX there should be no need for this lock if we detach the interfaces
106   only while they are down.
107 
108 
109 --- VALE SWITCH ---
110 
111 NMG_LOCK() serializes all modifications to switches and ports.
112 A switch cannot be deleted until all ports are gone.
113 
114 For each switch, an SX lock (RWlock on linux) protects
115 deletion of ports. When configuring or deleting a new port, the
116 lock is acquired in exclusive mode (after holding NMG_LOCK).
117 When forwarding, the lock is acquired in shared mode (without NMG_LOCK).
118 The lock is held throughout the entire forwarding cycle,
119 during which the thread may incur a page fault.
120 Hence it is important that sleepable shared locks are used.
121 
122 On the rx ring, the per-port lock is grabbed initially to reserve
123 a number of slots in the ring, then the lock is released,
124 packets are copied from source to destination, and then
125 the lock is acquired again and the receive ring is updated.
126 (A similar thing is done on the tx ring for NIC and host stack
127 ports attached to the switch)
128 
129  */
130 
131 
132 /* --- internals ----
133  *
134  * Roadmap to the code that implements the above.
135  *
136  * > 1. a process/thread issues one or more open() on /dev/netmap, to create
137  * >    select()able file descriptor on which events are reported.
138  *
139  *  	Internally, we allocate a netmap_priv_d structure, that will be
140  *  	initialized on ioctl(NIOCREGIF). There is one netmap_priv_d
141  *  	structure for each open().
142  *
143  *      os-specific:
144  *  	    FreeBSD: see netmap_open() (netmap_freebsd.c)
145  *  	    linux:   see linux_netmap_open() (netmap_linux.c)
146  *
147  * > 2. on each descriptor, the process issues an ioctl() to identify
148  * >    the interface that should report events to the file descriptor.
149  *
150  * 	Implemented by netmap_ioctl(), NIOCREGIF case, with nmr->nr_cmd==0.
151  * 	Most important things happen in netmap_get_na() and
152  * 	netmap_do_regif(), called from there. Additional details can be
153  * 	found in the comments above those functions.
154  *
155  * 	In all cases, this action creates/takes-a-reference-to a
156  * 	netmap_*_adapter describing the port, and allocates a netmap_if
157  * 	and all necessary netmap rings, filling them with netmap buffers.
158  *
159  *      In this phase, the sync callbacks for each ring are set (these are used
160  *      in steps 5 and 6 below).  The callbacks depend on the type of adapter.
161  *      The adapter creation/initialization code puts them in the
162  * 	netmap_adapter (fields na->nm_txsync and na->nm_rxsync).  Then, they
163  * 	are copied from there to the netmap_kring's during netmap_do_regif(), by
164  * 	the nm_krings_create() callback.  All the nm_krings_create callbacks
165  * 	actually call netmap_krings_create() to perform this and the other
166  * 	common stuff. netmap_krings_create() also takes care of the host rings,
167  * 	if needed, by setting their sync callbacks appropriately.
168  *
169  * 	Additional actions depend on the kind of netmap_adapter that has been
170  * 	registered:
171  *
172  * 	- netmap_hw_adapter:  	     [netmap.c]
173  * 	     This is a system netdev/ifp with native netmap support.
174  * 	     The ifp is detached from the host stack by redirecting:
175  * 	       - transmissions (from the network stack) to netmap_transmit()
176  * 	       - receive notifications to the nm_notify() callback for
177  * 	         this adapter. The callback is normally netmap_notify(), unless
178  * 	         the ifp is attached to a bridge using bwrap, in which case it
179  * 	         is netmap_bwrap_intr_notify().
180  *
181  * 	- netmap_generic_adapter:      [netmap_generic.c]
182  * 	      A system netdev/ifp without native netmap support.
183  *
184  * 	(the decision about native/non native support is taken in
185  * 	 netmap_get_hw_na(), called by netmap_get_na())
186  *
187  * 	- netmap_vp_adapter 		[netmap_vale.c]
188  * 	      Returned by netmap_get_bdg_na().
189  * 	      This is a persistent or ephemeral VALE port. Ephemeral ports
190  * 	      are created on the fly if they don't already exist, and are
191  * 	      always attached to a bridge.
192  * 	      Persistent VALE ports must be created separately, and
193  * 	      then attached like normal NICs. The NIOCREGIF we are examining
194  * 	      will find them only if they had previously been created and
195  * 	      attached (see VALE_CTL below).
196  *
197  * 	- netmap_pipe_adapter 	      [netmap_pipe.c]
198  * 	      Returned by netmap_get_pipe_na().
199  * 	      Both pipe ends are created, if they didn't already exist.
200  *
201  * 	- netmap_monitor_adapter      [netmap_monitor.c]
202  * 	      Returned by netmap_get_monitor_na().
203  * 	      If successful, the nm_sync callbacks of the monitored adapter
204  * 	      will be intercepted by the returned monitor.
205  *
206  * 	- netmap_bwrap_adapter	      [netmap_vale.c]
207  * 	      Cannot be obtained in this way, see VALE_CTL below
208  *
209  *
210  * 	os-specific:
211  * 	    linux: we first go through linux_netmap_ioctl() to
212  * 	           adapt the FreeBSD interface to the linux one.
213  *
214  *
215  * > 3. on each descriptor, the process issues an mmap() request to
216  * >    map the shared memory region within the process' address space.
217  * >    The list of interesting queues is indicated by a location in
218  * >    the shared memory region.
219  *
220  *      os-specific:
221  *  	    FreeBSD: netmap_mmap_single (netmap_freebsd.c).
222  *  	    linux:   linux_netmap_mmap (netmap_linux.c).
223  *
224  * > 4. using the functions in the netmap(4) userspace API, a process
225  * >    can look up the occupation state of a queue, access memory buffers,
226  * >    and retrieve received packets or enqueue packets to transmit.
227  *
228  * 	these actions do not involve the kernel.
229  *
230  * > 5. using some ioctl()s the process can synchronize the userspace view
231  * >    of the queue with the actual status in the kernel. This includes both
232  * >    receiving the notification of new packets, and transmitting new
233  * >    packets on the output interface.
234  *
235  * 	These are implemented in netmap_ioctl(), NIOCTXSYNC and NIOCRXSYNC
236  * 	cases. They invoke the nm_sync callbacks on the netmap_kring
237  * 	structures, as initialized in step 2 and maybe later modified
238  * 	by a monitor. Monitors, however, will always call the original
239  * 	callback before doing anything else.
240  *
241  *
242  * > 6. select() or poll() can be used to wait for events on individual
243  * >    transmit or receive queues (or all queues for a given interface).
244  *
245  * 	Implemented in netmap_poll(). This will call the same nm_sync()
246  * 	callbacks as in step 5 above.
247  *
248  * 	os-specific:
249  * 		linux: we first go through linux_netmap_poll() to adapt
250  * 		       the FreeBSD interface to the linux one.
251  *
252  *
253  *  ----  VALE_CTL -----
254  *
255  *  VALE switches are controlled by issuing a NIOCREGIF with a non-null
256  *  nr_cmd in the nmreq structure. These subcommands are handled by
257  *  netmap_bdg_ctl() in netmap_vale.c. Persistent VALE ports are created
258  *  and destroyed by issuing the NETMAP_BDG_NEWIF and NETMAP_BDG_DELIF
259  *  subcommands, respectively.
260  *
261  *  Any network interface known to the system (including a persistent VALE
262  *  port) can be attached to a VALE switch by issuing the
263  *  NETMAP_BDG_ATTACH subcommand. After the attachment, persistent VALE ports
264  *  look exactly like ephemeral VALE ports (as created in step 2 above).  The
265  *  attachment of other interfaces, instead, requires the creation of a
266  *  netmap_bwrap_adapter.  Moreover, the attached interface must be put in
267  *  netmap mode. This may require the creation of a netmap_generic_adapter if
268  *  we have no native support for the interface, or if generic adapters have
269  *  been forced by sysctl.
270  *
271  *  Both persistent VALE ports and bwraps are handled by netmap_get_bdg_na(),
272  *  called by nm_bdg_ctl_attach(), and discriminated by the nm_bdg_attach()
273  *  callback.  In the case of the bwrap, the callback creates the
274  *  netmap_bwrap_adapter.  The initialization of the bwrap is then
275  *  completed by calling netmap_do_regif() on it, in the nm_bdg_ctl()
276  *  callback (netmap_bwrap_bdg_ctl in netmap_vale.c).
277  *  A generic adapter for the wrapped ifp will be created if needed, when
278  *  netmap_get_bdg_na() calls netmap_get_hw_na().
279  *
280  *
281  *  ---- DATAPATHS -----
282  *
283  *              -= SYSTEM DEVICE WITH NATIVE SUPPORT =-
284  *
285  *    na == NA(ifp) == netmap_hw_adapter created in DEVICE_netmap_attach()
286  *
287  *    - tx from netmap userspace:
288  *	 concurrently:
289  *           1) ioctl(NIOCTXSYNC)/netmap_poll() in process context
290  *                kring->nm_sync() == DEVICE_netmap_txsync()
291  *           2) device interrupt handler
292  *                na->nm_notify()  == netmap_notify()
293  *    - rx from netmap userspace:
294  *       concurrently:
295  *           1) ioctl(NIOCRXSYNC)/netmap_poll() in process context
296  *                kring->nm_sync() == DEVICE_netmap_rxsync()
297  *           2) device interrupt handler
298  *                na->nm_notify()  == netmap_notify()
299  *    - rx from host stack
300  *       concurrently:
301  *           1) host stack
302  *                netmap_transmit()
303  *                  na->nm_notify  == netmap_notify()
304  *           2) ioctl(NIOCRXSYNC)/netmap_poll() in process context
305  *                kring->nm_sync() == netmap_rxsync_from_host
306  *                  netmap_rxsync_from_host(na, NULL, NULL)
307  *    - tx to host stack
308  *           ioctl(NIOCTXSYNC)/netmap_poll() in process context
309  *             kring->nm_sync() == netmap_txsync_to_host
310  *               netmap_txsync_to_host(na)
311  *                 nm_os_send_up()
312  *                   FreeBSD: na->if_input() == ether_input()
313  *                   linux: netif_rx() with NM_MAGIC_PRIORITY_RX
314  *
315  *
316  *               -= SYSTEM DEVICE WITH GENERIC SUPPORT =-
317  *
318  *    na == NA(ifp) == generic_netmap_adapter created in generic_netmap_attach()
319  *
320  *    - tx from netmap userspace:
321  *       concurrently:
322  *           1) ioctl(NIOCTXSYNC)/netmap_poll() in process context
323  *               kring->nm_sync() == generic_netmap_txsync()
324  *                   nm_os_generic_xmit_frame()
325  *                       linux:   dev_queue_xmit() with NM_MAGIC_PRIORITY_TX
326  *                           ifp->ndo_start_xmit == generic_ndo_start_xmit()
327  *                               gna->save_start_xmit == orig. dev. start_xmit
328  *                       FreeBSD: na->if_transmit() == orig. dev if_transmit
329  *           2) generic_mbuf_destructor()
330  *                   na->nm_notify() == netmap_notify()
331  *    - rx from netmap userspace:
332  *           1) ioctl(NIOCRXSYNC)/netmap_poll() in process context
333  *               kring->nm_sync() == generic_netmap_rxsync()
334  *                   mbq_safe_dequeue()
335  *           2) device driver
336  *               generic_rx_handler()
337  *                   mbq_safe_enqueue()
338  *                   na->nm_notify() == netmap_notify()
339  *    - rx from host stack
340  *        FreeBSD: same as native
341  *        Linux: same as native except:
342  *           1) host stack
343  *               dev_queue_xmit() without NM_MAGIC_PRIORITY_TX
344  *                   ifp->ndo_start_xmit == generic_ndo_start_xmit()
345  *                       netmap_transmit()
346  *                           na->nm_notify() == netmap_notify()
347  *    - tx to host stack (same as native):
348  *
349  *
350  *                           -= VALE =-
351  *
352  *   INCOMING:
353  *
354  *      - VALE ports:
355  *          ioctl(NIOCTXSYNC)/netmap_poll() in process context
356  *              kring->nm_sync() == netmap_vp_txsync()
357  *
358  *      - system device with native support:
359  *         from cable:
360  *             interrupt
361  *                na->nm_notify() == netmap_bwrap_intr_notify(ring_nr != host ring)
362  *                     kring->nm_sync() == DEVICE_netmap_rxsync()
363  *                     netmap_vp_txsync()
364  *                     kring->nm_sync() == DEVICE_netmap_rxsync()
365  *         from host stack:
366  *             netmap_transmit()
367  *                na->nm_notify() == netmap_bwrap_intr_notify(ring_nr == host ring)
368  *                     kring->nm_sync() == netmap_rxsync_from_host()
369  *                     netmap_vp_txsync()
370  *
371  *      - system device with generic support:
372  *         from device driver:
373  *            generic_rx_handler()
374  *                na->nm_notify() == netmap_bwrap_intr_notify(ring_nr != host ring)
375  *                     kring->nm_sync() == generic_netmap_rxsync()
376  *                     netmap_vp_txsync()
377  *                     kring->nm_sync() == generic_netmap_rxsync()
378  *         from host stack:
379  *            netmap_transmit()
380  *                na->nm_notify() == netmap_bwrap_intr_notify(ring_nr == host ring)
381  *                     kring->nm_sync() == netmap_rxsync_from_host()
382  *                     netmap_vp_txsync()
383  *
384  *   (all cases) --> nm_bdg_flush()
385  *                      dest_na->nm_notify() == (see below)
386  *
387  *   OUTGOING:
388  *
389  *      - VALE ports:
390  *         concurrently:
391  *             1) ioctl(NIOCRXSYNC)/netmap_poll() in process context
392  *                    kring->nm_sync() == netmap_vp_rxsync()
393  *             2) from nm_bdg_flush()
394  *                    na->nm_notify() == netmap_notify()
395  *
396  *      - system device with native support:
397  *          to cable:
398  *             na->nm_notify() == netmap_bwrap_notify()
399  *                 netmap_vp_rxsync()
400  *                 kring->nm_sync() == DEVICE_netmap_txsync()
401  *                 netmap_vp_rxsync()
402  *          to host stack:
403  *                 netmap_vp_rxsync()
404  *                 kring->nm_sync() == netmap_txsync_to_host
405  *                 netmap_vp_rxsync_locked()
406  *
407  *      - system device with generic adapter:
408  *          to device driver:
409  *             na->nm_notify() == netmap_bwrap_notify()
410  *                 netmap_vp_rxsync()
411  *                 kring->nm_sync() == generic_netmap_txsync()
412  *                 netmap_vp_rxsync()
413  *          to host stack:
414  *                 netmap_vp_rxsync()
415  *                 kring->nm_sync() == netmap_txsync_to_host
416  *                 netmap_vp_rxsync()
417  *
418  */
419 
420 /*
421  * OS-specific code that is used only within this file.
422  * Other OS-specific code that must be accessed by drivers
423  * is present in netmap_kern.h
424  */
425 
426 #if defined(__FreeBSD__)
427 #include <sys/cdefs.h> /* prerequisite */
428 #include <sys/types.h>
429 #include <sys/errno.h>
430 #include <sys/param.h>	/* defines used in kernel.h */
431 #include <sys/kernel.h>	/* types used in module initialization */
432 #include <sys/conf.h>	/* cdevsw struct, UID, GID */
433 #include <sys/filio.h>	/* FIONBIO */
434 #include <sys/sockio.h>
435 #include <sys/socketvar.h>	/* struct socket */
436 #include <sys/malloc.h>
437 #include <sys/poll.h>
438 #include <sys/rwlock.h>
439 #include <sys/socket.h> /* sockaddrs */
440 #include <sys/selinfo.h>
441 #include <sys/sysctl.h>
442 #include <sys/jail.h>
443 #include <net/vnet.h>
444 #include <net/if.h>
445 #include <net/if_var.h>
446 #include <net/bpf.h>		/* BIOCIMMEDIATE */
447 #include <machine/bus.h>	/* bus_dmamap_* */
448 #include <sys/endian.h>
449 #include <sys/refcount.h>
450 
451 
452 #elif defined(linux)
453 
454 #include "bsd_glue.h"
455 
456 #elif defined(__APPLE__)
457 
458 #warning OSX support is only partial
459 #include "osx_glue.h"
460 
461 #elif defined (_WIN32)
462 
463 #include "win_glue.h"
464 
465 #else
466 
467 #error	Unsupported platform
468 
469 #endif /* unsupported */
470 
471 /*
472  * common headers
473  */
474 #include <net/netmap.h>
475 #include <dev/netmap/netmap_kern.h>
476 #include <dev/netmap/netmap_mem2.h>
477 
478 
479 /* user-controlled variables (all exported below under the dev.netmap sysctl node) */
480 int netmap_verbose;	/* non-zero enables verbose mode (extra diagnostics) */
481 
482 static int netmap_no_timestamp; /* don't timestamp on rxsync */
483 int netmap_mitigate = 1;	/* NOTE(review): purpose not visible in this file - presumably enables notification mitigation; confirm */
484 int netmap_no_pendintr = 1;	/* always look for new received packets */
485 int netmap_txsync_retry = 2;	/* number of txsync loops in bridge's flush */
486 int netmap_flags = 0;	/* debug flags */
487 static int netmap_fwd = 0;	/* force transparent forwarding */
488 
489 /*
490  * netmap_admode selects the netmap mode to use.
491  * Invalid values are reset to NETMAP_ADMODE_BEST
492  */
493 enum {	NETMAP_ADMODE_BEST = 0,	/* use native, fallback to generic */
494 	NETMAP_ADMODE_NATIVE,	/* either native or none */
495 	NETMAP_ADMODE_GENERIC,	/* force generic */
496 	NETMAP_ADMODE_LAST };	/* count of valid modes; presumably the bound used to detect invalid values - confirm */
497 static int netmap_admode = NETMAP_ADMODE_BEST;
498 
499 /* netmap_generic_mit controls mitigation of RX notifications for
500  * the generic netmap adapter. The value is a time interval in
501  * nanoseconds. */
502 int netmap_generic_mit = 100*1000;
503 
504 /* We use by default netmap-aware qdiscs with generic netmap adapters,
505  * even if there can be a little performance hit with hardware NICs.
506  * However, using the qdisc is the safer approach, for two reasons:
507  * 1) it prevents non-fifo qdiscs to break the TX notification
508  *    scheme, which is based on mbuf destructors when txqdisc is
509  *    not used.
510  * 2) it makes it possible to transmit over software devices that
511  *    change skb->dev, like bridge, veth, ...
512  *
513  * Anyway users looking for the best performance should
514  * use native adapters.
515  */
516 int netmap_generic_txqdisc = 1;
517 
518 /* Default number of slots and queues for generic adapters. */
519 int netmap_generic_ringsize = 1024;
520 int netmap_generic_rings = 1;
521 
522 /* Non-zero if ptnet devices are allowed to use virtio-net headers. */
523 int ptnet_vnet_hdr = 1;
524 
525 /* 0 if ptnetmap should not use worker threads for TX processing */
526 int ptnetmap_tx_workers = 1;
527 
528 /*
529  * SYSCTL calls are grouped between SYSBEGIN and SYSEND to be emulated
530  * in some other operating systems
531  */
532 SYSBEGIN(main_init);
533 
534 SYSCTL_DECL(_dev_netmap);
535 SYSCTL_NODE(_dev, OID_AUTO, netmap, CTLFLAG_RW, 0, "Netmap args");
536 SYSCTL_INT(_dev_netmap, OID_AUTO, verbose,
537     CTLFLAG_RW, &netmap_verbose, 0, "Verbose mode");
538 SYSCTL_INT(_dev_netmap, OID_AUTO, no_timestamp,
539     CTLFLAG_RW, &netmap_no_timestamp, 0, "no_timestamp");
540 SYSCTL_INT(_dev_netmap, OID_AUTO, mitigate, CTLFLAG_RW, &netmap_mitigate, 0, "");
541 SYSCTL_INT(_dev_netmap, OID_AUTO, no_pendintr,
542     CTLFLAG_RW, &netmap_no_pendintr, 0, "Always look for new received packets.");
543 SYSCTL_INT(_dev_netmap, OID_AUTO, txsync_retry, CTLFLAG_RW,
544     &netmap_txsync_retry, 0 , "Number of txsync loops in bridge's flush.");
545 
546 SYSCTL_INT(_dev_netmap, OID_AUTO, flags, CTLFLAG_RW, &netmap_flags, 0 , "");
547 SYSCTL_INT(_dev_netmap, OID_AUTO, fwd, CTLFLAG_RW, &netmap_fwd, 0 , "");
548 SYSCTL_INT(_dev_netmap, OID_AUTO, admode, CTLFLAG_RW, &netmap_admode, 0 , "");
549 SYSCTL_INT(_dev_netmap, OID_AUTO, generic_mit, CTLFLAG_RW, &netmap_generic_mit, 0 , "");
550 SYSCTL_INT(_dev_netmap, OID_AUTO, generic_ringsize, CTLFLAG_RW, &netmap_generic_ringsize, 0 , "");
551 SYSCTL_INT(_dev_netmap, OID_AUTO, generic_rings, CTLFLAG_RW, &netmap_generic_rings, 0 , "");
552 SYSCTL_INT(_dev_netmap, OID_AUTO, generic_txqdisc, CTLFLAG_RW, &netmap_generic_txqdisc, 0 , "");
553 SYSCTL_INT(_dev_netmap, OID_AUTO, ptnet_vnet_hdr, CTLFLAG_RW, &ptnet_vnet_hdr, 0 , "");
554 SYSCTL_INT(_dev_netmap, OID_AUTO, ptnetmap_tx_workers, CTLFLAG_RW, &ptnetmap_tx_workers, 0 , "");
555 
556 SYSEND;
557 
558 NMG_LOCK_T	netmap_global_lock;	/* global netmap lock; presumably the object NMG_LOCK()/NMG_UNLOCK() operate on - confirm in netmap_kern.h */
559 
560 /*
561  * mark the ring as stopped, and run through the locks
562  * to make sure other users get to see it.
563  * stopped must be either NM_KR_STOPPED (for unbounded stop)
564  * or NM_KR_LOCKED (brief stop for mutual exclusion purposes)
565  */
566 static void
567 netmap_disable_ring(struct netmap_kring *kr, int stopped)
568 {
569 	nm_kr_stop(kr, stopped);	/* record the requested stop state on the kring */
570 	// XXX check if nm_kr_stop is sufficient
571 	mtx_lock(&kr->q_lock);	/* take and immediately drop q_lock: the "run through the locks" step described above */
572 	mtx_unlock(&kr->q_lock);
573 	nm_kr_put(kr);	/* presumably releases the hold nm_kr_stop() left on the kring - confirm in netmap_kern.h */
574 }
575 
576 /* stop or enable a single ring */
577 void
578 netmap_set_ring(struct netmap_adapter *na, u_int ring_id, enum txrx t, int stopped)
579 {
580 	if (stopped)
581 		netmap_disable_ring(NMR(na, t) + ring_id, stopped);
582 	else
583 		NMR(na, t)[ring_id].nkr_stopped = 0;
584 }
585 
586 
587 /* stop or enable all the rings of na */
588 void
589 netmap_set_all_rings(struct netmap_adapter *na, int stopped)
590 {
591 	int i;
592 	enum txrx t;
593 
594 	if (!nm_netmap_on(na))
595 		return;
596 
597 	for_rx_tx(t) {
598 		for (i = 0; i < netmap_real_rings(na, t); i++) {
599 			netmap_set_ring(na, i, t, stopped);
600 		}
601 	}
602 }
603 
604 /*
605  * Convenience function used in drivers.  Waits for current txsync()s/rxsync()s
606  * to finish and prevents any new one from starting.  Call this before turning
607  * netmap mode off, or before removing the hardware rings (e.g., on module
608  * onload).
609  */
610 void
611 netmap_disable_all_rings(struct ifnet *ifp)
612 {
613 	if (NM_NA_VALID(ifp)) {
614 		netmap_set_all_rings(NA(ifp), NM_KR_STOPPED);
615 	}
616 }
617 
/*
 * Driver helper: re-enable rxsync and txsync on all the rings of the
 * interface's adapter.  In linux drivers, this should be placed near
 * each napi_enable().  Interfaces without a valid netmap adapter are
 * silently ignored.
 */
void
netmap_enable_all_rings(struct ifnet *ifp)
{
	if (!NM_NA_VALID(ifp))
		return;

	netmap_set_all_rings(NA(ifp), 0 /* enabled */);
}
630 
631 void
632 netmap_make_zombie(struct ifnet *ifp)
633 {
634 	if (NM_NA_VALID(ifp)) {
635 		struct netmap_adapter *na = NA(ifp);
636 		netmap_set_all_rings(na, NM_KR_LOCKED);
637 		na->na_flags |= NAF_ZOMBIE;
638 		netmap_set_all_rings(na, 0);
639 	}
640 }
641 
642 void
643 netmap_undo_zombie(struct ifnet *ifp)
644 {
645 	if (NM_NA_VALID(ifp)) {
646 		struct netmap_adapter *na = NA(ifp);
647 		if (na->na_flags & NAF_ZOMBIE) {
648 			netmap_set_all_rings(na, NM_KR_LOCKED);
649 			na->na_flags &= ~NAF_ZOMBIE;
650 			netmap_set_all_rings(na, 0);
651 		}
652 	}
653 }
654 
/*
 * Generic bounds-checking helper: force *v into the range [lo, hi].
 *
 *   v     variable to check; updated in place when out of range
 *   dflt  value assigned when *v is below lo (the default is itself
 *         forced into [lo, hi] first)
 *   lo    smallest acceptable value
 *   hi    largest acceptable value; a larger *v is clamped to hi
 *   msg   name of the variable for the log message, or NULL to adjust
 *         silently
 *
 * Returns the (possibly adjusted) value of *v.
 */
u_int
nm_bound_var(u_int *v, u_int dflt, u_int lo, u_int hi, const char *msg)
{
	u_int oldv = *v;
	const char *op = NULL;

	/* make sure the fallback value is itself acceptable */
	if (dflt < lo)
		dflt = lo;
	if (dflt > hi)
		dflt = hi;

	if (oldv < lo) {
		*v = dflt;
		op = "Bump";
	} else if (oldv > hi) {
		*v = hi;
		op = "Clamp";
	}

	/* all values are unsigned: %u keeps large ones from printing negative */
	if (op && msg)
		nm_prinf("%s %s to %u (was %u)\n", op, msg, *v, oldv);
	return *v;
}
679 
680 
681 /*
682  * packet-dump function, user-supplied or static buffer.
683  * The destination buffer must be at least 30+4*len
684  */
685 const char *
686 nm_dump_buf(char *p, int len, int lim, char *dst)
687 {
688 	static char _dst[8192];
689 	int i, j, i0;
690 	static char hex[] ="0123456789abcdef";
691 	char *o;	/* output position */
692 
693 #define P_HI(x)	hex[((x) & 0xf0)>>4]
694 #define P_LO(x)	hex[((x) & 0xf)]
695 #define P_C(x)	((x) >= 0x20 && (x) <= 0x7e ? (x) : '.')
696 	if (!dst)
697 		dst = _dst;
698 	if (lim <= 0 || lim > len)
699 		lim = len;
700 	o = dst;
701 	sprintf(o, "buf 0x%p len %d lim %d\n", p, len, lim);
702 	o += strlen(o);
703 	/* hexdump routine */
704 	for (i = 0; i < lim; ) {
705 		sprintf(o, "%5d: ", i);
706 		o += strlen(o);
707 		memset(o, ' ', 48);
708 		i0 = i;
709 		for (j=0; j < 16 && i < lim; i++, j++) {
710 			o[j*3] = P_HI(p[i]);
711 			o[j*3+1] = P_LO(p[i]);
712 		}
713 		i = i0;
714 		for (j=0; j < 16 && i < lim; i++, j++)
715 			o[j + 48] = P_C(p[i]);
716 		o[j+48] = '\n';
717 		o += j+49;
718 	}
719 	*o = '\0';
720 #undef P_HI
721 #undef P_LO
722 #undef P_C
723 	return dst;
724 }
725 
726 
727 /*
728  * Fetch configuration from the device, to cope with dynamic
729  * reconfigurations after loading the module.
730  */
731 /* call with NMG_LOCK held */
732 int
733 netmap_update_config(struct netmap_adapter *na)
734 {
735 	u_int txr, txd, rxr, rxd;
736 
737 	txr = txd = rxr = rxd = 0;
738 	if (na->nm_config == NULL ||
739 	    na->nm_config(na, &txr, &txd, &rxr, &rxd))
740 	{
741 		/* take whatever we had at init time */
742 		txr = na->num_tx_rings;
743 		txd = na->num_tx_desc;
744 		rxr = na->num_rx_rings;
745 		rxd = na->num_rx_desc;
746 	}
747 
748 	if (na->num_tx_rings == txr && na->num_tx_desc == txd &&
749 	    na->num_rx_rings == rxr && na->num_rx_desc == rxd)
750 		return 0; /* nothing changed */
751 	if (netmap_verbose || na->active_fds > 0) {
752 		D("stored config %s: txring %d x %d, rxring %d x %d",
753 			na->name,
754 			na->num_tx_rings, na->num_tx_desc,
755 			na->num_rx_rings, na->num_rx_desc);
756 		D("new config %s: txring %d x %d, rxring %d x %d",
757 			na->name, txr, txd, rxr, rxd);
758 	}
759 	if (na->active_fds == 0) {
760 		D("configuration changed (but fine)");
761 		na->num_tx_rings = txr;
762 		na->num_tx_desc = txd;
763 		na->num_rx_rings = rxr;
764 		na->num_rx_desc = rxd;
765 		return 0;
766 	}
767 	D("configuration changed while active, this is bad...");
768 	return 1;
769 }
770 
771 /* nm_sync callbacks for the host rings (forward declarations: netmap_krings_create() below installs them on the extra host TX and RX krings) */
772 static int netmap_txsync_to_host(struct netmap_kring *kring, int flags);
773 static int netmap_rxsync_from_host(struct netmap_kring *kring, int flags);
774 
775 /* create the krings array and initialize the fields common to all adapters.
776  * The array layout is this:
777  *
778  *                    +----------+
779  * na->tx_rings ----->|          | \
780  *                    |          |  } na->num_tx_rings
781  *                    |          | /
782  *                    +----------+
783  *                    |          |    host tx kring
784  * na->rx_rings ----> +----------+
785  *                    |          | \
786  *                    |          |  } na->num_rx_rings
787  *                    |          | /
788  *                    +----------+
789  *                    |          |    host rx kring
790  *                    +----------+
791  * na->tailroom ----->|          | \
792  *                    |          |  } tailroom bytes
793  *                    |          | /
794  *                    +----------+
795  *
796  * Note: for compatibility, host krings are created even when not needed.
797  * The tailroom space is currently used by vale ports for allocating leases.
798  */
799 /* call with NMG_LOCK held */
800 int
801 netmap_krings_create(struct netmap_adapter *na, u_int tailroom)
802 {
803 	u_int i, len, ndesc;
804 	struct netmap_kring *kring;
805 	u_int n[NR_TXRX];
806 	enum txrx t;
807 
808 	if (na->tx_rings != NULL) {
809 		D("warning: krings were already created");
810 		return 0;
811 	}
812 
813 	/* account for the (possibly fake) host rings */
814 	n[NR_TX] = na->num_tx_rings + 1;
815 	n[NR_RX] = na->num_rx_rings + 1;
816 
817 	len = (n[NR_TX] + n[NR_RX]) * sizeof(struct netmap_kring) + tailroom;
818 
819 	na->tx_rings = nm_os_malloc((size_t)len);
820 	if (na->tx_rings == NULL) {
821 		D("Cannot allocate krings");
822 		return ENOMEM;
823 	}
824 	na->rx_rings = na->tx_rings + n[NR_TX];
825 
826 	/*
827 	 * All fields in krings are 0 except the one initialized below.
828 	 * but better be explicit on important kring fields.
829 	 */
830 	for_rx_tx(t) {
831 		ndesc = nma_get_ndesc(na, t);
832 		for (i = 0; i < n[t]; i++) {
833 			kring = &NMR(na, t)[i];
834 			bzero(kring, sizeof(*kring));
835 			kring->na = na;
836 			kring->ring_id = i;
837 			kring->tx = t;
838 			kring->nkr_num_slots = ndesc;
839 			kring->nr_mode = NKR_NETMAP_OFF;
840 			kring->nr_pending_mode = NKR_NETMAP_OFF;
841 			if (i < nma_get_nrings(na, t)) {
842 				kring->nm_sync = (t == NR_TX ? na->nm_txsync : na->nm_rxsync);
843 			} else {
844 				kring->nm_sync = (t == NR_TX ?
845 						netmap_txsync_to_host:
846 						netmap_rxsync_from_host);
847 			}
848 			kring->nm_notify = na->nm_notify;
849 			kring->rhead = kring->rcur = kring->nr_hwcur = 0;
850 			/*
851 			 * IMPORTANT: Always keep one slot empty.
852 			 */
853 			kring->rtail = kring->nr_hwtail = (t == NR_TX ? ndesc - 1 : 0);
854 			snprintf(kring->name, sizeof(kring->name) - 1, "%s %s%d", na->name,
855 					nm_txrx2str(t), i);
856 			ND("ktx %s h %d c %d t %d",
857 				kring->name, kring->rhead, kring->rcur, kring->rtail);
858 			mtx_init(&kring->q_lock, (t == NR_TX ? "nm_txq_lock" : "nm_rxq_lock"), NULL, MTX_DEF);
859 			nm_os_selinfo_init(&kring->si);
860 		}
861 		nm_os_selinfo_init(&na->si[t]);
862 	}
863 
864 	na->tailroom = na->rx_rings + n[NR_RX];
865 
866 	return 0;
867 }
868 
869 
870 /* undo the actions performed by netmap_krings_create */
871 /* call with NMG_LOCK held */
872 void
873 netmap_krings_delete(struct netmap_adapter *na)
874 {
875 	struct netmap_kring *kring = na->tx_rings;
876 	enum txrx t;
877 
878 	if (na->tx_rings == NULL) {
879 		D("warning: krings were already deleted");
880 		return;
881 	}
882 
883 	for_rx_tx(t)
884 		nm_os_selinfo_uninit(&na->si[t]);
885 
886 	/* we rely on the krings layout described above */
887 	for ( ; kring != na->tailroom; kring++) {
888 		mtx_destroy(&kring->q_lock);
889 		nm_os_selinfo_uninit(&kring->si);
890 	}
891 	nm_os_free(na->tx_rings);
892 	na->tx_rings = na->rx_rings = na->tailroom = NULL;
893 }
894 
895 
896 /*
897  * Destructor for NIC ports. They also have an mbuf queue
898  * on the rings connected to the host so we need to purge
899  * them first.
900  */
901 /* call with NMG_LOCK held */
902 void
903 netmap_hw_krings_delete(struct netmap_adapter *na)
904 {
905 	struct mbq *q = &na->rx_rings[na->num_rx_rings].rx_queue;
906 
907 	ND("destroy sw mbq with len %d", mbq_len(q));
908 	mbq_purge(q);
909 	mbq_safe_fini(q);
910 	netmap_krings_delete(na);
911 }
912 
913 
914 
915 /*
916  * Undo everything that was done in netmap_do_regif(). In particular,
917  * call nm_register(ifp,0) to stop netmap mode on the interface and
918  * revert to normal operation.
919  */
920 /* call with NMG_LOCK held */
921 static void netmap_unset_ringid(struct netmap_priv_d *);
922 static void netmap_krings_put(struct netmap_priv_d *);
923 void
924 netmap_do_unregif(struct netmap_priv_d *priv)
925 {
926 	struct netmap_adapter *na = priv->np_na;
927 
928 	NMG_LOCK_ASSERT();
929 	na->active_fds--;
930 	/* unset nr_pending_mode and possibly release exclusive mode */
931 	netmap_krings_put(priv);
932 
933 #ifdef	WITH_MONITOR
934 	/* XXX check whether we have to do something with monitor
935 	 * when rings change nr_mode. */
936 	if (na->active_fds <= 0) {
937 		/* walk through all the rings and tell any monitor
938 		 * that the port is going to exit netmap mode
939 		 */
940 		netmap_monitor_stop(na);
941 	}
942 #endif
943 
944 	if (na->active_fds <= 0 || nm_kring_pending(priv)) {
945 		na->nm_register(na, 0);
946 	}
947 
948 	/* delete rings and buffers that are no longer needed */
949 	netmap_mem_rings_delete(na);
950 
951 	if (na->active_fds <= 0) {	/* last instance */
952 		/*
953 		 * (TO CHECK) We enter here
954 		 * when the last reference to this file descriptor goes
955 		 * away. This means we cannot have any pending poll()
956 		 * or interrupt routine operating on the structure.
957 		 * XXX The file may be closed in a thread while
958 		 * another thread is using it.
959 		 * Linux keeps the file opened until the last reference
960 		 * by any outstanding ioctl/poll or mmap is gone.
961 		 * FreeBSD does not track mmap()s (but we do) and
962 		 * wakes up any sleeping poll(). Need to check what
963 		 * happens if the close() occurs while a concurrent
964 		 * syscall is running.
965 		 */
966 		if (netmap_verbose)
967 			D("deleting last instance for %s", na->name);
968 
969                 if (nm_netmap_on(na)) {
970                     D("BUG: netmap on while going to delete the krings");
971                 }
972 
973 		na->nm_krings_delete(na);
974 	}
975 
976 	/* possibily decrement counter of tx_si/rx_si users */
977 	netmap_unset_ringid(priv);
978 	/* delete the nifp */
979 	netmap_mem_if_delete(na, priv->np_nifp);
980 	/* drop the allocator */
981 	netmap_mem_deref(na->nm_mem, na);
982 	/* mark the priv as unregistered */
983 	priv->np_na = NULL;
984 	priv->np_nifp = NULL;
985 }
986 
987 /* call with NMG_LOCK held */
988 static __inline int
989 nm_si_user(struct netmap_priv_d *priv, enum txrx t)
990 {
991 	return (priv->np_na != NULL &&
992 		(priv->np_qlast[t] - priv->np_qfirst[t] > 1));
993 }
994 
995 struct netmap_priv_d*
996 netmap_priv_new(void)
997 {
998 	struct netmap_priv_d *priv;
999 
1000 	priv = nm_os_malloc(sizeof(struct netmap_priv_d));
1001 	if (priv == NULL)
1002 		return NULL;
1003 	priv->np_refs = 1;
1004 	nm_os_get_module();
1005 	return priv;
1006 }
1007 
1008 /*
1009  * Destructor of the netmap_priv_d, called when the fd is closed
1010  * Action: undo all the things done by NIOCREGIF,
1011  * On FreeBSD we need to track whether there are active mmap()s,
1012  * and we use np_active_mmaps for that. On linux, the field is always 0.
1013  * Return: 1 if we can free priv, 0 otherwise.
1014  *
1015  */
1016 /* call with NMG_LOCK held */
1017 void
1018 netmap_priv_delete(struct netmap_priv_d *priv)
1019 {
1020 	struct netmap_adapter *na = priv->np_na;
1021 
1022 	/* number of active references to this fd */
1023 	if (--priv->np_refs > 0) {
1024 		return;
1025 	}
1026 	nm_os_put_module();
1027 	if (na) {
1028 		netmap_do_unregif(priv);
1029 	}
1030 	netmap_unget_na(na, priv->np_ifp);
1031 	bzero(priv, sizeof(*priv));	/* for safety */
1032 	nm_os_free(priv);
1033 }
1034 
1035 
/*
 * Wrapper around netmap_priv_delete() that takes and releases NMG_LOCK
 * itself; 'data' is the netmap_priv_d to release.
 */
/* call with NMG_LOCK *not* held */
void
netmap_dtor(void *data)
{
	NMG_LOCK();
	netmap_priv_delete((struct netmap_priv_d *)data);
	NMG_UNLOCK();
}
1046 
1047 
1048 /*
1049  * Handlers for synchronization of the rings from/to the host stack.
1050  * These are associated to a network interface and are just another
1051  * ring pair managed by userspace.
1052  *
1053  * Netmap also supports transparent forwarding (NS_FORWARD and NR_FORWARD
1054  * flags):
1055  *
1056  * - Before releasing buffers on hw RX rings, the application can mark
1057  *   them with the NS_FORWARD flag. During the next RXSYNC or poll(), they
1058  *   will be forwarded to the host stack, similarly to what would happen
1059  *   if the application moved them to the host TX ring.
1060  *
1061  * - Before releasing buffers on the host RX ring, the application can
1062  *   mark them with the NS_FORWARD flag. During the next RXSYNC or poll(),
1063  *   they will be forwarded to the hw TX rings, saving the application
1064  *   from doing the same task in user-space.
1065  *
1066  * Transparent forwarding can be enabled per-ring, by setting the NR_FORWARD
1067  * flag, or globally with the netmap_fwd sysctl.
1068  *
1069  * The transfer NIC --> host is relatively easy, just encapsulate
1070  * into mbufs and we are done. The host --> NIC side is slightly
1071  * harder because there might not be room in the tx ring so it
1072  * might take a while before releasing the buffer.
1073  */
1074 
1075 
1076 /*
1077  * Pass a whole queue of mbufs to the host stack as coming from 'dst'
1078  * We do not need to lock because the queue is private.
1079  * After this call the queue is empty.
1080  */
1081 static void
1082 netmap_send_up(struct ifnet *dst, struct mbq *q)
1083 {
1084 	struct mbuf *m;
1085 	struct mbuf *head = NULL, *prev = NULL;
1086 
1087 	/* Send packets up, outside the lock; head/prev machinery
1088 	 * is only useful for Windows. */
1089 	while ((m = mbq_dequeue(q)) != NULL) {
1090 		if (netmap_verbose & NM_VERB_HOST)
1091 			D("sending up pkt %p size %d", m, MBUF_LEN(m));
1092 		prev = nm_os_send_up(dst, m, prev);
1093 		if (head == NULL)
1094 			head = prev;
1095 	}
1096 	if (head)
1097 		nm_os_send_up(dst, NULL, head);
1098 	mbq_fini(q);
1099 }
1100 
1101 
1102 /*
1103  * Scan the buffers from hwcur to ring->head, and put a copy of those
1104  * marked NS_FORWARD (or all of them if forced) into a queue of mbufs.
1105  * Drop remaining packets in the unlikely event
1106  * of an mbuf shortage.
1107  */
1108 static void
1109 netmap_grab_packets(struct netmap_kring *kring, struct mbq *q, int force)
1110 {
1111 	u_int const lim = kring->nkr_num_slots - 1;
1112 	u_int const head = kring->rhead;
1113 	u_int n;
1114 	struct netmap_adapter *na = kring->na;
1115 
1116 	for (n = kring->nr_hwcur; n != head; n = nm_next(n, lim)) {
1117 		struct mbuf *m;
1118 		struct netmap_slot *slot = &kring->ring->slot[n];
1119 
1120 		if ((slot->flags & NS_FORWARD) == 0 && !force)
1121 			continue;
1122 		if (slot->len < 14 || slot->len > NETMAP_BUF_SIZE(na)) {
1123 			RD(5, "bad pkt at %d len %d", n, slot->len);
1124 			continue;
1125 		}
1126 		slot->flags &= ~NS_FORWARD; // XXX needed ?
1127 		/* XXX TODO: adapt to the case of a multisegment packet */
1128 		m = m_devget(NMB(na, slot), slot->len, 0, na->ifp, NULL);
1129 
1130 		if (m == NULL)
1131 			break;
1132 		mbq_enqueue(q, m);
1133 	}
1134 }
1135 
1136 static inline int
1137 _nm_may_forward(struct netmap_kring *kring)
1138 {
1139 	return	((netmap_fwd || kring->ring->flags & NR_FORWARD) &&
1140 		 kring->na->na_flags & NAF_HOST_RINGS &&
1141 		 kring->tx == NR_RX);
1142 }
1143 
1144 static inline int
1145 nm_may_forward_up(struct netmap_kring *kring)
1146 {
1147 	return	_nm_may_forward(kring) &&
1148 		 kring->ring_id != kring->na->num_rx_rings;
1149 }
1150 
1151 static inline int
1152 nm_may_forward_down(struct netmap_kring *kring, int sync_flags)
1153 {
1154 	return	_nm_may_forward(kring) &&
1155 		 (sync_flags & NAF_CAN_FORWARD_DOWN) &&
1156 		 kring->ring_id == kring->na->num_rx_rings;
1157 }
1158 
1159 /*
1160  * Send to the NIC rings packets marked NS_FORWARD between
1161  * kring->nr_hwcur and kring->rhead.
1162  * Called under kring->rx_queue.lock on the sw rx ring.
1163  *
1164  * It can only be called if the user opened all the TX hw rings,
1165  * see NAF_CAN_FORWARD_DOWN flag.
1166  * We can touch the TX netmap rings (slots, head and cur) since
1167  * we are in poll/ioctl system call context, and the application
1168  * is not supposed to touch the ring (using a different thread)
1169  * during the execution of the system call.
1170  */
1171 static u_int
1172 netmap_sw_to_nic(struct netmap_adapter *na)
1173 {
1174 	struct netmap_kring *kring = &na->rx_rings[na->num_rx_rings];
1175 	struct netmap_slot *rxslot = kring->ring->slot;
1176 	u_int i, rxcur = kring->nr_hwcur;
1177 	u_int const head = kring->rhead;
1178 	u_int const src_lim = kring->nkr_num_slots - 1;
1179 	u_int sent = 0;
1180 
1181 	/* scan rings to find space, then fill as much as possible */
1182 	for (i = 0; i < na->num_tx_rings; i++) {
1183 		struct netmap_kring *kdst = &na->tx_rings[i];
1184 		struct netmap_ring *rdst = kdst->ring;
1185 		u_int const dst_lim = kdst->nkr_num_slots - 1;
1186 
1187 		/* XXX do we trust ring or kring->rcur,rtail ? */
1188 		for (; rxcur != head && !nm_ring_empty(rdst);
1189 		     rxcur = nm_next(rxcur, src_lim) ) {
1190 			struct netmap_slot *src, *dst, tmp;
1191 			u_int dst_head = rdst->head;
1192 
1193 			src = &rxslot[rxcur];
1194 			if ((src->flags & NS_FORWARD) == 0 && !netmap_fwd)
1195 				continue;
1196 
1197 			sent++;
1198 
1199 			dst = &rdst->slot[dst_head];
1200 
1201 			tmp = *src;
1202 
1203 			src->buf_idx = dst->buf_idx;
1204 			src->flags = NS_BUF_CHANGED;
1205 
1206 			dst->buf_idx = tmp.buf_idx;
1207 			dst->len = tmp.len;
1208 			dst->flags = NS_BUF_CHANGED;
1209 
1210 			rdst->head = rdst->cur = nm_next(dst_head, dst_lim);
1211 		}
1212 		/* if (sent) XXX txsync ? it would be just an optimization */
1213 	}
1214 	return sent;
1215 }
1216 
1217 
1218 /*
1219  * netmap_txsync_to_host() passes packets up. We are called from a
1220  * system call in user process context, and the only contention
1221  * can be among multiple user threads erroneously calling
1222  * this routine concurrently.
1223  */
1224 static int
1225 netmap_txsync_to_host(struct netmap_kring *kring, int flags)
1226 {
1227 	struct netmap_adapter *na = kring->na;
1228 	u_int const lim = kring->nkr_num_slots - 1;
1229 	u_int const head = kring->rhead;
1230 	struct mbq q;
1231 
1232 	/* Take packets from hwcur to head and pass them up.
1233 	 * Force hwcur = head since netmap_grab_packets() stops at head
1234 	 */
1235 	mbq_init(&q);
1236 	netmap_grab_packets(kring, &q, 1 /* force */);
1237 	ND("have %d pkts in queue", mbq_len(&q));
1238 	kring->nr_hwcur = head;
1239 	kring->nr_hwtail = head + lim;
1240 	if (kring->nr_hwtail > lim)
1241 		kring->nr_hwtail -= lim + 1;
1242 
1243 	netmap_send_up(na->ifp, &q);
1244 	return 0;
1245 }
1246 
1247 
1248 /*
1249  * rxsync backend for packets coming from the host stack.
1250  * They have been put in kring->rx_queue by netmap_transmit().
1251  * We protect access to the kring using kring->rx_queue.lock
1252  *
1253  * also moves to the nic hw rings any packet the user has marked
1254  * for transparent-mode forwarding, then sets the NR_FORWARD
1255  * flag in the kring to let the caller push them out
1256  */
1257 static int
1258 netmap_rxsync_from_host(struct netmap_kring *kring, int flags)
1259 {
1260 	struct netmap_adapter *na = kring->na;
1261 	struct netmap_ring *ring = kring->ring;
1262 	u_int nm_i, n;
1263 	u_int const lim = kring->nkr_num_slots - 1;
1264 	u_int const head = kring->rhead;
1265 	int ret = 0;
1266 	struct mbq *q = &kring->rx_queue, fq;
1267 
1268 	mbq_init(&fq); /* fq holds packets to be freed */
1269 
1270 	mbq_lock(q);
1271 
1272 	/* First part: import newly received packets */
1273 	n = mbq_len(q);
1274 	if (n) { /* grab packets from the queue */
1275 		struct mbuf *m;
1276 		uint32_t stop_i;
1277 
1278 		nm_i = kring->nr_hwtail;
1279 		stop_i = nm_prev(kring->nr_hwcur, lim);
1280 		while ( nm_i != stop_i && (m = mbq_dequeue(q)) != NULL ) {
1281 			int len = MBUF_LEN(m);
1282 			struct netmap_slot *slot = &ring->slot[nm_i];
1283 
1284 			m_copydata(m, 0, len, NMB(na, slot));
1285 			ND("nm %d len %d", nm_i, len);
1286 			if (netmap_verbose)
1287                                 D("%s", nm_dump_buf(NMB(na, slot),len, 128, NULL));
1288 
1289 			slot->len = len;
1290 			slot->flags = kring->nkr_slot_flags;
1291 			nm_i = nm_next(nm_i, lim);
1292 			mbq_enqueue(&fq, m);
1293 		}
1294 		kring->nr_hwtail = nm_i;
1295 	}
1296 
1297 	/*
1298 	 * Second part: skip past packets that userspace has released.
1299 	 */
1300 	nm_i = kring->nr_hwcur;
1301 	if (nm_i != head) { /* something was released */
1302 		if (nm_may_forward_down(kring, flags)) {
1303 			ret = netmap_sw_to_nic(na);
1304 			if (ret > 0) {
1305 				kring->nr_kflags |= NR_FORWARD;
1306 				ret = 0;
1307 			}
1308 		}
1309 		kring->nr_hwcur = head;
1310 	}
1311 
1312 	mbq_unlock(q);
1313 
1314 	mbq_purge(&fq);
1315 	mbq_fini(&fq);
1316 
1317 	return ret;
1318 }
1319 
1320 
1321 /* Get a netmap adapter for the port.
1322  *
1323  * If it is possible to satisfy the request, return 0
1324  * with *na containing the netmap adapter found.
1325  * Otherwise return an error code, with *na containing NULL.
1326  *
1327  * When the port is attached to a bridge, we always return
1328  * EBUSY.
1329  * Otherwise, if the port is already bound to a file descriptor,
1330  * then we unconditionally return the existing adapter into *na.
1331  * In all the other cases, we return (into *na) either native,
1332  * generic or NULL, according to the following table:
1333  *
1334  *					native_support
1335  * active_fds   dev.netmap.admode         YES     NO
1336  * -------------------------------------------------------
1337  *    >0              *                 NA(ifp) NA(ifp)
1338  *
1339  *     0        NETMAP_ADMODE_BEST      NATIVE  GENERIC
1340  *     0        NETMAP_ADMODE_NATIVE    NATIVE   NULL
1341  *     0        NETMAP_ADMODE_GENERIC   GENERIC GENERIC
1342  *
1343  */
1344 static void netmap_hw_dtor(struct netmap_adapter *); /* needed by NM_IS_NATIVE() */
1345 int
1346 netmap_get_hw_na(struct ifnet *ifp, struct netmap_mem_d *nmd, struct netmap_adapter **na)
1347 {
1348 	/* generic support */
1349 	int i = netmap_admode;	/* Take a snapshot. */
1350 	struct netmap_adapter *prev_na;
1351 	int error = 0;
1352 
1353 	*na = NULL; /* default */
1354 
1355 	/* reset in case of invalid value */
1356 	if (i < NETMAP_ADMODE_BEST || i >= NETMAP_ADMODE_LAST)
1357 		i = netmap_admode = NETMAP_ADMODE_BEST;
1358 
1359 	if (NM_NA_VALID(ifp)) {
1360 		prev_na = NA(ifp);
1361 		/* If an adapter already exists, return it if
1362 		 * there are active file descriptors or if
1363 		 * netmap is not forced to use generic
1364 		 * adapters.
1365 		 */
1366 		if (NETMAP_OWNED_BY_ANY(prev_na)
1367 			|| i != NETMAP_ADMODE_GENERIC
1368 			|| prev_na->na_flags & NAF_FORCE_NATIVE
1369 #ifdef WITH_PIPES
1370 			/* ugly, but we cannot allow an adapter switch
1371 			 * if some pipe is referring to this one
1372 			 */
1373 			|| prev_na->na_next_pipe > 0
1374 #endif
1375 		) {
1376 			*na = prev_na;
1377 			goto assign_mem;
1378 		}
1379 	}
1380 
1381 	/* If there isn't native support and netmap is not allowed
1382 	 * to use generic adapters, we cannot satisfy the request.
1383 	 */
1384 	if (!NM_IS_NATIVE(ifp) && i == NETMAP_ADMODE_NATIVE)
1385 		return EOPNOTSUPP;
1386 
1387 	/* Otherwise, create a generic adapter and return it,
1388 	 * saving the previously used netmap adapter, if any.
1389 	 *
1390 	 * Note that here 'prev_na', if not NULL, MUST be a
1391 	 * native adapter, and CANNOT be a generic one. This is
1392 	 * true because generic adapters are created on demand, and
1393 	 * destroyed when not used anymore. Therefore, if the adapter
1394 	 * currently attached to an interface 'ifp' is generic, it
1395 	 * must be that
1396 	 * (NA(ifp)->active_fds > 0 || NETMAP_OWNED_BY_KERN(NA(ifp))).
1397 	 * Consequently, if NA(ifp) is generic, we will enter one of
1398 	 * the branches above. This ensures that we never override
1399 	 * a generic adapter with another generic adapter.
1400 	 */
1401 	error = generic_netmap_attach(ifp);
1402 	if (error)
1403 		return error;
1404 
1405 	*na = NA(ifp);
1406 
1407 assign_mem:
1408 	if (nmd != NULL && !((*na)->na_flags & NAF_MEM_OWNER) &&
1409 	    (*na)->active_fds == 0 && ((*na)->nm_mem != nmd)) {
1410 		netmap_mem_put((*na)->nm_mem);
1411 		(*na)->nm_mem = netmap_mem_get(nmd);
1412 	}
1413 
1414 	return 0;
1415 }
1416 
1417 /*
1418  * MUST BE CALLED UNDER NMG_LOCK()
1419  *
1420  * Get a refcounted reference to a netmap adapter attached
1421  * to the interface specified by nmr.
1422  * This is always called in the execution of an ioctl().
1423  *
1424  * Return ENXIO if the interface specified by the request does
1425  * not exist, ENOTSUP if netmap is not supported by the interface,
1426  * EBUSY if the interface is already attached to a bridge,
1427  * EINVAL if parameters are invalid, ENOMEM if needed resources
1428  * could not be allocated.
1429  * If successful, hold a reference to the netmap adapter.
1430  *
1431  * If the interface specified by nmr is a system one, also keep
1432  * a reference to it and return a valid *ifp.
1433  */
1434 int
1435 netmap_get_na(struct nmreq *nmr, struct netmap_adapter **na,
1436 	      struct ifnet **ifp, struct netmap_mem_d *nmd, int create)
1437 {
1438 	int error = 0;
1439 	struct netmap_adapter *ret = NULL;
1440 	int nmd_ref = 0;
1441 
1442 	*na = NULL;     /* default return value */
1443 	*ifp = NULL;
1444 
1445 	NMG_LOCK_ASSERT();
1446 
1447 	/* if the request contain a memid, try to find the
1448 	 * corresponding memory region
1449 	 */
1450 	if (nmd == NULL && nmr->nr_arg2) {
1451 		nmd = netmap_mem_find(nmr->nr_arg2);
1452 		if (nmd == NULL)
1453 			return EINVAL;
1454 		/* keep the rereference */
1455 		nmd_ref = 1;
1456 	}
1457 
1458 	/* We cascade through all possible types of netmap adapter.
1459 	 * All netmap_get_*_na() functions return an error and an na,
1460 	 * with the following combinations:
1461 	 *
1462 	 * error    na
1463 	 *   0	   NULL		type doesn't match
1464 	 *  !0	   NULL		type matches, but na creation/lookup failed
1465 	 *   0	  !NULL		type matches and na created/found
1466 	 *  !0    !NULL		impossible
1467 	 */
1468 
1469 	/* try to see if this is a ptnetmap port */
1470 	error = netmap_get_pt_host_na(nmr, na, nmd, create);
1471 	if (error || *na != NULL)
1472 		goto out;
1473 
1474 	/* try to see if this is a monitor port */
1475 	error = netmap_get_monitor_na(nmr, na, nmd, create);
1476 	if (error || *na != NULL)
1477 		goto out;
1478 
1479 	/* try to see if this is a pipe port */
1480 	error = netmap_get_pipe_na(nmr, na, nmd, create);
1481 	if (error || *na != NULL)
1482 		goto out;
1483 
1484 	/* try to see if this is a bridge port */
1485 	error = netmap_get_bdg_na(nmr, na, nmd, create);
1486 	if (error)
1487 		goto out;
1488 
1489 	if (*na != NULL) /* valid match in netmap_get_bdg_na() */
1490 		goto out;
1491 
1492 	/*
1493 	 * This must be a hardware na, lookup the name in the system.
1494 	 * Note that by hardware we actually mean "it shows up in ifconfig".
1495 	 * This may still be a tap, a veth/epair, or even a
1496 	 * persistent VALE port.
1497 	 */
1498 	*ifp = ifunit_ref(nmr->nr_name);
1499 	if (*ifp == NULL) {
1500 		error = ENXIO;
1501 		goto out;
1502 	}
1503 
1504 	error = netmap_get_hw_na(*ifp, nmd, &ret);
1505 	if (error)
1506 		goto out;
1507 
1508 	*na = ret;
1509 	netmap_adapter_get(ret);
1510 
1511 out:
1512 	if (error) {
1513 		if (ret)
1514 			netmap_adapter_put(ret);
1515 		if (*ifp) {
1516 			if_rele(*ifp);
1517 			*ifp = NULL;
1518 		}
1519 	}
1520 	if (nmd_ref)
1521 		netmap_mem_put(nmd);
1522 
1523 	return error;
1524 }
1525 
1526 /* undo netmap_get_na() */
1527 void
1528 netmap_unget_na(struct netmap_adapter *na, struct ifnet *ifp)
1529 {
1530 	if (ifp)
1531 		if_rele(ifp);
1532 	if (na)
1533 		netmap_adapter_put(na);
1534 }
1535 
1536 
/*
 * Validation helper for the *_prologue() functions: when condition 't'
 * holds, log the failed condition (stringified) together with the ring
 * and kring indices through RD(), then make the enclosing function
 * return kring->nkr_num_slots.
 * The expansion refers to 'kring', 'ring', 'head' and 'cur', which must
 * be in scope where the macro is used.
 */
#define NM_FAIL_ON(t) do {						\
	if (unlikely(t)) {						\
		RD(5,							\
		    "%s: fail '" #t					\
		    "' h %d c %d t %d rh %d rc %d rt %d hc %d ht %d",	\
		    kring->name,					\
		    head, cur, ring->tail,				\
		    kring->rhead, kring->rcur, kring->rtail,		\
		    kring->nr_hwcur, kring->nr_hwtail);			\
		return kring->nkr_num_slots;				\
	}								\
} while (0)
1550 
1551 /*
1552  * validate parameters on entry for *_txsync()
1553  * Returns ring->cur if ok, or something >= kring->nkr_num_slots
1554  * in case of error.
1555  *
1556  * rhead, rcur and rtail=hwtail are stored from previous round.
1557  * hwcur is the next packet to send to the ring.
1558  *
1559  * We want
1560  *    hwcur <= *rhead <= head <= cur <= tail = *rtail <= hwtail
1561  *
1562  * hwcur, rhead, rtail and hwtail are reliable
1563  */
1564 u_int
1565 nm_txsync_prologue(struct netmap_kring *kring, struct netmap_ring *ring)
1566 {
1567 	u_int head = ring->head; /* read only once */
1568 	u_int cur = ring->cur; /* read only once */
1569 	u_int n = kring->nkr_num_slots;
1570 
1571 	ND(5, "%s kcur %d ktail %d head %d cur %d tail %d",
1572 		kring->name,
1573 		kring->nr_hwcur, kring->nr_hwtail,
1574 		ring->head, ring->cur, ring->tail);
1575 #if 1 /* kernel sanity checks; but we can trust the kring. */
1576 	NM_FAIL_ON(kring->nr_hwcur >= n || kring->rhead >= n ||
1577 	    kring->rtail >= n ||  kring->nr_hwtail >= n);
1578 #endif /* kernel sanity checks */
1579 	/*
1580 	 * user sanity checks. We only use head,
1581 	 * A, B, ... are possible positions for head:
1582 	 *
1583 	 *  0    A  rhead   B  rtail   C  n-1
1584 	 *  0    D  rtail   E  rhead   F  n-1
1585 	 *
1586 	 * B, F, D are valid. A, C, E are wrong
1587 	 */
1588 	if (kring->rtail >= kring->rhead) {
1589 		/* want rhead <= head <= rtail */
1590 		NM_FAIL_ON(head < kring->rhead || head > kring->rtail);
1591 		/* and also head <= cur <= rtail */
1592 		NM_FAIL_ON(cur < head || cur > kring->rtail);
1593 	} else { /* here rtail < rhead */
1594 		/* we need head outside rtail .. rhead */
1595 		NM_FAIL_ON(head > kring->rtail && head < kring->rhead);
1596 
1597 		/* two cases now: head <= rtail or head >= rhead  */
1598 		if (head <= kring->rtail) {
1599 			/* want head <= cur <= rtail */
1600 			NM_FAIL_ON(cur < head || cur > kring->rtail);
1601 		} else { /* head >= rhead */
1602 			/* cur must be outside rtail..head */
1603 			NM_FAIL_ON(cur > kring->rtail && cur < head);
1604 		}
1605 	}
1606 	if (ring->tail != kring->rtail) {
1607 		RD(5, "%s tail overwritten was %d need %d", kring->name,
1608 			ring->tail, kring->rtail);
1609 		ring->tail = kring->rtail;
1610 	}
1611 	kring->rhead = head;
1612 	kring->rcur = cur;
1613 	return head;
1614 }
1615 
1616 
1617 /*
1618  * validate parameters on entry for *_rxsync()
1619  * Returns ring->head if ok, kring->nkr_num_slots on error.
1620  *
1621  * For a valid configuration,
1622  * hwcur <= head <= cur <= tail <= hwtail
1623  *
1624  * We only consider head and cur.
1625  * hwcur and hwtail are reliable.
1626  *
1627  */
1628 u_int
1629 nm_rxsync_prologue(struct netmap_kring *kring, struct netmap_ring *ring)
1630 {
1631 	uint32_t const n = kring->nkr_num_slots;
1632 	uint32_t head, cur;
1633 
1634 	ND(5,"%s kc %d kt %d h %d c %d t %d",
1635 		kring->name,
1636 		kring->nr_hwcur, kring->nr_hwtail,
1637 		ring->head, ring->cur, ring->tail);
1638 	/*
1639 	 * Before storing the new values, we should check they do not
1640 	 * move backwards. However:
1641 	 * - head is not an issue because the previous value is hwcur;
1642 	 * - cur could in principle go back, however it does not matter
1643 	 *   because we are processing a brand new rxsync()
1644 	 */
1645 	cur = kring->rcur = ring->cur;	/* read only once */
1646 	head = kring->rhead = ring->head;	/* read only once */
1647 #if 1 /* kernel sanity checks */
1648 	NM_FAIL_ON(kring->nr_hwcur >= n || kring->nr_hwtail >= n);
1649 #endif /* kernel sanity checks */
1650 	/* user sanity checks */
1651 	if (kring->nr_hwtail >= kring->nr_hwcur) {
1652 		/* want hwcur <= rhead <= hwtail */
1653 		NM_FAIL_ON(head < kring->nr_hwcur || head > kring->nr_hwtail);
1654 		/* and also rhead <= rcur <= hwtail */
1655 		NM_FAIL_ON(cur < head || cur > kring->nr_hwtail);
1656 	} else {
1657 		/* we need rhead outside hwtail..hwcur */
1658 		NM_FAIL_ON(head < kring->nr_hwcur && head > kring->nr_hwtail);
1659 		/* two cases now: head <= hwtail or head >= hwcur  */
1660 		if (head <= kring->nr_hwtail) {
1661 			/* want head <= cur <= hwtail */
1662 			NM_FAIL_ON(cur < head || cur > kring->nr_hwtail);
1663 		} else {
1664 			/* cur must be outside hwtail..head */
1665 			NM_FAIL_ON(cur < head && cur > kring->nr_hwtail);
1666 		}
1667 	}
1668 	if (ring->tail != kring->rtail) {
1669 		RD(5, "%s tail overwritten was %d need %d",
1670 			kring->name,
1671 			ring->tail, kring->rtail);
1672 		ring->tail = kring->rtail;
1673 	}
1674 	return head;
1675 }
1676 
1677 
1678 /*
1679  * Error routine called when txsync/rxsync detects an error.
1680  * Can't do much more than resetting head =cur = hwcur, tail = hwtail
1681  * Return 1 on reinit.
1682  *
1683  * This routine is only called by the upper half of the kernel.
1684  * It only reads hwcur (which is changed only by the upper half, too)
1685  * and hwtail (which may be changed by the lower half, but only on
1686  * a tx ring and only to increase it, so any error will be recovered
1687  * on the next call). For the above, we don't strictly need to call
1688  * it under lock.
1689  */
1690 int
1691 netmap_ring_reinit(struct netmap_kring *kring)
1692 {
1693 	struct netmap_ring *ring = kring->ring;
1694 	u_int i, lim = kring->nkr_num_slots - 1;
1695 	int errors = 0;
1696 
1697 	// XXX KASSERT nm_kr_tryget
1698 	RD(10, "called for %s", kring->name);
1699 	// XXX probably wrong to trust userspace
1700 	kring->rhead = ring->head;
1701 	kring->rcur  = ring->cur;
1702 	kring->rtail = ring->tail;
1703 
1704 	if (ring->cur > lim)
1705 		errors++;
1706 	if (ring->head > lim)
1707 		errors++;
1708 	if (ring->tail > lim)
1709 		errors++;
1710 	for (i = 0; i <= lim; i++) {
1711 		u_int idx = ring->slot[i].buf_idx;
1712 		u_int len = ring->slot[i].len;
1713 		if (idx < 2 || idx >= kring->na->na_lut.objtotal) {
1714 			RD(5, "bad index at slot %d idx %d len %d ", i, idx, len);
1715 			ring->slot[i].buf_idx = 0;
1716 			ring->slot[i].len = 0;
1717 		} else if (len > NETMAP_BUF_SIZE(kring->na)) {
1718 			ring->slot[i].len = 0;
1719 			RD(5, "bad len at slot %d idx %d len %d", i, idx, len);
1720 		}
1721 	}
1722 	if (errors) {
1723 		RD(10, "total %d errors", errors);
1724 		RD(10, "%s reinit, cur %d -> %d tail %d -> %d",
1725 			kring->name,
1726 			ring->cur, kring->nr_hwcur,
1727 			ring->tail, kring->nr_hwtail);
1728 		ring->head = kring->rhead = kring->nr_hwcur;
1729 		ring->cur  = kring->rcur  = kring->nr_hwcur;
1730 		ring->tail = kring->rtail = kring->nr_hwtail;
1731 	}
1732 	return (errors ? 1 : 0);
1733 }
1734 
1735 /* interpret the ringid and flags fields of an nmreq, by translating them
1736  * into a pair of intervals of ring indices:
1737  *
1738  * [priv->np_txqfirst, priv->np_txqlast) and
1739  * [priv->np_rxqfirst, priv->np_rxqlast)
1740  *
1741  */
1742 int
1743 netmap_interp_ringid(struct netmap_priv_d *priv, uint16_t ringid, uint32_t flags)
1744 {
1745 	struct netmap_adapter *na = priv->np_na;
1746 	u_int j, i = ringid & NETMAP_RING_MASK;
1747 	u_int reg = flags & NR_REG_MASK;
1748 	int excluded_direction[] = { NR_TX_RINGS_ONLY, NR_RX_RINGS_ONLY };
1749 	enum txrx t;
1750 
1751 	if (reg == NR_REG_DEFAULT) {
1752 		/* convert from old ringid to flags */
1753 		if (ringid & NETMAP_SW_RING) {
1754 			reg = NR_REG_SW;
1755 		} else if (ringid & NETMAP_HW_RING) {
1756 			reg = NR_REG_ONE_NIC;
1757 		} else {
1758 			reg = NR_REG_ALL_NIC;
1759 		}
1760 		D("deprecated API, old ringid 0x%x -> ringid %x reg %d", ringid, i, reg);
1761 	}
1762 
1763 	if ((flags & NR_PTNETMAP_HOST) && ((reg != NR_REG_ALL_NIC &&
1764                     reg != NR_REG_PIPE_MASTER && reg != NR_REG_PIPE_SLAVE) ||
1765 			flags & (NR_RX_RINGS_ONLY|NR_TX_RINGS_ONLY))) {
1766 		D("Error: only NR_REG_ALL_NIC supported with netmap passthrough");
1767 		return EINVAL;
1768 	}
1769 
1770 	for_rx_tx(t) {
1771 		if (flags & excluded_direction[t]) {
1772 			priv->np_qfirst[t] = priv->np_qlast[t] = 0;
1773 			continue;
1774 		}
1775 		switch (reg) {
1776 		case NR_REG_ALL_NIC:
1777 		case NR_REG_PIPE_MASTER:
1778 		case NR_REG_PIPE_SLAVE:
1779 			priv->np_qfirst[t] = 0;
1780 			priv->np_qlast[t] = nma_get_nrings(na, t);
1781 			ND("ALL/PIPE: %s %d %d", nm_txrx2str(t),
1782 				priv->np_qfirst[t], priv->np_qlast[t]);
1783 			break;
1784 		case NR_REG_SW:
1785 		case NR_REG_NIC_SW:
1786 			if (!(na->na_flags & NAF_HOST_RINGS)) {
1787 				D("host rings not supported");
1788 				return EINVAL;
1789 			}
1790 			priv->np_qfirst[t] = (reg == NR_REG_SW ?
1791 				nma_get_nrings(na, t) : 0);
1792 			priv->np_qlast[t] = nma_get_nrings(na, t) + 1;
1793 			ND("%s: %s %d %d", reg == NR_REG_SW ? "SW" : "NIC+SW",
1794 				nm_txrx2str(t),
1795 				priv->np_qfirst[t], priv->np_qlast[t]);
1796 			break;
1797 		case NR_REG_ONE_NIC:
1798 			if (i >= na->num_tx_rings && i >= na->num_rx_rings) {
1799 				D("invalid ring id %d", i);
1800 				return EINVAL;
1801 			}
1802 			/* if not enough rings, use the first one */
1803 			j = i;
1804 			if (j >= nma_get_nrings(na, t))
1805 				j = 0;
1806 			priv->np_qfirst[t] = j;
1807 			priv->np_qlast[t] = j + 1;
1808 			ND("ONE_NIC: %s %d %d", nm_txrx2str(t),
1809 				priv->np_qfirst[t], priv->np_qlast[t]);
1810 			break;
1811 		default:
1812 			D("invalid regif type %d", reg);
1813 			return EINVAL;
1814 		}
1815 	}
1816 	priv->np_flags = (flags & ~NR_REG_MASK) | reg;
1817 
1818 	/* Allow transparent forwarding mode in the host --> nic
1819 	 * direction only if all the TX hw rings have been opened. */
1820 	if (priv->np_qfirst[NR_TX] == 0 &&
1821 			priv->np_qlast[NR_TX] >= na->num_tx_rings) {
1822 		priv->np_sync_flags |= NAF_CAN_FORWARD_DOWN;
1823 	}
1824 
1825 	if (netmap_verbose) {
1826 		D("%s: tx [%d,%d) rx [%d,%d) id %d",
1827 			na->name,
1828 			priv->np_qfirst[NR_TX],
1829 			priv->np_qlast[NR_TX],
1830 			priv->np_qfirst[NR_RX],
1831 			priv->np_qlast[NR_RX],
1832 			i);
1833 	}
1834 	return 0;
1835 }
1836 
1837 
1838 /*
1839  * Set the ring ID. For devices with a single queue, a request
1840  * for all rings is the same as a single ring.
1841  */
1842 static int
1843 netmap_set_ringid(struct netmap_priv_d *priv, uint16_t ringid, uint32_t flags)
1844 {
1845 	struct netmap_adapter *na = priv->np_na;
1846 	int error;
1847 	enum txrx t;
1848 
1849 	error = netmap_interp_ringid(priv, ringid, flags);
1850 	if (error) {
1851 		return error;
1852 	}
1853 
1854 	priv->np_txpoll = (ringid & NETMAP_NO_TX_POLL) ? 0 : 1;
1855 
1856 	/* optimization: count the users registered for more than
1857 	 * one ring, which are the ones sleeping on the global queue.
1858 	 * The default netmap_notify() callback will then
1859 	 * avoid signaling the global queue if nobody is using it
1860 	 */
1861 	for_rx_tx(t) {
1862 		if (nm_si_user(priv, t))
1863 			na->si_users[t]++;
1864 	}
1865 	return 0;
1866 }
1867 
1868 static void
1869 netmap_unset_ringid(struct netmap_priv_d *priv)
1870 {
1871 	struct netmap_adapter *na = priv->np_na;
1872 	enum txrx t;
1873 
1874 	for_rx_tx(t) {
1875 		if (nm_si_user(priv, t))
1876 			na->si_users[t]--;
1877 		priv->np_qfirst[t] = priv->np_qlast[t] = 0;
1878 	}
1879 	priv->np_flags = 0;
1880 	priv->np_txpoll = 0;
1881 }
1882 
1883 
1884 /* Set the nr_pending_mode for the requested rings.
1885  * If requested, also try to get exclusive access to the rings, provided
1886  * the rings we want to bind are not exclusively owned by a previous bind.
1887  */
1888 static int
1889 netmap_krings_get(struct netmap_priv_d *priv)
1890 {
1891 	struct netmap_adapter *na = priv->np_na;
1892 	u_int i;
1893 	struct netmap_kring *kring;
1894 	int excl = (priv->np_flags & NR_EXCLUSIVE);
1895 	enum txrx t;
1896 
1897 	ND("%s: grabbing tx [%d, %d) rx [%d, %d)",
1898 			na->name,
1899 			priv->np_qfirst[NR_TX],
1900 			priv->np_qlast[NR_TX],
1901 			priv->np_qfirst[NR_RX],
1902 			priv->np_qlast[NR_RX]);
1903 
1904 	/* first round: check that all the requested rings
1905 	 * are neither alread exclusively owned, nor we
1906 	 * want exclusive ownership when they are already in use
1907 	 */
1908 	for_rx_tx(t) {
1909 		for (i = priv->np_qfirst[t]; i < priv->np_qlast[t]; i++) {
1910 			kring = &NMR(na, t)[i];
1911 			if ((kring->nr_kflags & NKR_EXCLUSIVE) ||
1912 			    (kring->users && excl))
1913 			{
1914 				ND("ring %s busy", kring->name);
1915 				return EBUSY;
1916 			}
1917 		}
1918 	}
1919 
1920 	/* second round: increment usage count (possibly marking them
1921 	 * as exclusive) and set the nr_pending_mode
1922 	 */
1923 	for_rx_tx(t) {
1924 		for (i = priv->np_qfirst[t]; i < priv->np_qlast[t]; i++) {
1925 			kring = &NMR(na, t)[i];
1926 			kring->users++;
1927 			if (excl)
1928 				kring->nr_kflags |= NKR_EXCLUSIVE;
1929 	                kring->nr_pending_mode = NKR_NETMAP_ON;
1930 		}
1931 	}
1932 
1933 	return 0;
1934 
1935 }
1936 
1937 /* Undo netmap_krings_get(). This is done by clearing the exclusive mode
1938  * if was asked on regif, and unset the nr_pending_mode if we are the
1939  * last users of the involved rings. */
1940 static void
1941 netmap_krings_put(struct netmap_priv_d *priv)
1942 {
1943 	struct netmap_adapter *na = priv->np_na;
1944 	u_int i;
1945 	struct netmap_kring *kring;
1946 	int excl = (priv->np_flags & NR_EXCLUSIVE);
1947 	enum txrx t;
1948 
1949 	ND("%s: releasing tx [%d, %d) rx [%d, %d)",
1950 			na->name,
1951 			priv->np_qfirst[NR_TX],
1952 			priv->np_qlast[NR_TX],
1953 			priv->np_qfirst[NR_RX],
1954 			priv->np_qlast[MR_RX]);
1955 
1956 
1957 	for_rx_tx(t) {
1958 		for (i = priv->np_qfirst[t]; i < priv->np_qlast[t]; i++) {
1959 			kring = &NMR(na, t)[i];
1960 			if (excl)
1961 				kring->nr_kflags &= ~NKR_EXCLUSIVE;
1962 			kring->users--;
1963 			if (kring->users == 0)
1964 				kring->nr_pending_mode = NKR_NETMAP_OFF;
1965 		}
1966 	}
1967 }
1968 
1969 /*
1970  * possibly move the interface to netmap-mode.
1971  * If success it returns a pointer to netmap_if, otherwise NULL.
1972  * This must be called with NMG_LOCK held.
1973  *
1974  * The following na callbacks are called in the process:
1975  *
1976  * na->nm_config()			[by netmap_update_config]
1977  * (get current number and size of rings)
1978  *
1979  *  	We have a generic one for linux (netmap_linux_config).
1980  *  	The bwrap has to override this, since it has to forward
1981  *  	the request to the wrapped adapter (netmap_bwrap_config).
1982  *
1983  *
1984  * na->nm_krings_create()
1985  * (create and init the krings array)
1986  *
1987  * 	One of the following:
1988  *
1989  *	* netmap_hw_krings_create, 			(hw ports)
1990  *		creates the standard layout for the krings
1991  * 		and adds the mbq (used for the host rings).
1992  *
1993  * 	* netmap_vp_krings_create			(VALE ports)
1994  * 		add leases and scratchpads
1995  *
1996  * 	* netmap_pipe_krings_create			(pipes)
1997  * 		create the krings and rings of both ends and
1998  * 		cross-link them
1999  *
2000  *      * netmap_monitor_krings_create 			(monitors)
2001  *      	avoid allocating the mbq
2002  *
2003  *      * netmap_bwrap_krings_create			(bwraps)
2004  *      	create both the brap krings array,
2005  *      	the krings array of the wrapped adapter, and
2006  *      	(if needed) the fake array for the host adapter
2007  *
2008  * na->nm_register(, 1)
2009  * (put the adapter in netmap mode)
2010  *
2011  * 	This may be one of the following:
2012  *
2013  * 	* netmap_hw_reg				        (hw ports)
2014  * 		checks that the ifp is still there, then calls
2015  * 		the hardware specific callback;
2016  *
2017  * 	* netmap_vp_reg					(VALE ports)
2018  *		If the port is connected to a bridge,
2019  *		set the NAF_NETMAP_ON flag under the
2020  *		bridge write lock.
2021  *
2022  *	* netmap_pipe_reg				(pipes)
2023  *		inform the other pipe end that it is no
2024  *		longer responsible for the lifetime of this
2025  *		pipe end
2026  *
2027  *	* netmap_monitor_reg				(monitors)
2028  *		intercept the sync callbacks of the monitored
2029  *		rings
2030  *
2031  *	* netmap_bwrap_reg				(bwraps)
2032  *		cross-link the bwrap and hwna rings,
2033  *		forward the request to the hwna, override
2034  *		the hwna notify callback (to get the frames
2035  *		coming from outside go through the bridge).
2036  *
2037  *
2038  */
2039 int
2040 netmap_do_regif(struct netmap_priv_d *priv, struct netmap_adapter *na,
2041 	uint16_t ringid, uint32_t flags)
2042 {
2043 	struct netmap_if *nifp = NULL;
2044 	int error;
2045 
2046 	NMG_LOCK_ASSERT();
2047 	/* ring configuration may have changed, fetch from the card */
2048 	netmap_update_config(na);
2049 	priv->np_na = na;     /* store the reference */
2050 	error = netmap_set_ringid(priv, ringid, flags);
2051 	if (error)
2052 		goto err;
2053 	error = netmap_mem_finalize(na->nm_mem, na);
2054 	if (error)
2055 		goto err;
2056 
2057 	if (na->active_fds == 0) {
2058 		/*
2059 		 * If this is the first registration of the adapter,
2060 		 * create the  in-kernel view of the netmap rings,
2061 		 * the netmap krings.
2062 		 */
2063 
2064 		/*
2065 		 * Depending on the adapter, this may also create
2066 		 * the netmap rings themselves
2067 		 */
2068 		error = na->nm_krings_create(na);
2069 		if (error)
2070 			goto err_drop_mem;
2071 
2072 	}
2073 
2074 	/* now the krings must exist and we can check whether some
2075 	 * previous bind has exclusive ownership on them, and set
2076 	 * nr_pending_mode
2077 	 */
2078 	error = netmap_krings_get(priv);
2079 	if (error)
2080 		goto err_del_krings;
2081 
2082 	/* create all needed missing netmap rings */
2083 	error = netmap_mem_rings_create(na);
2084 	if (error)
2085 		goto err_rel_excl;
2086 
2087 	/* in all cases, create a new netmap if */
2088 	nifp = netmap_mem_if_new(na, priv);
2089 	if (nifp == NULL) {
2090 		error = ENOMEM;
2091 		goto err_del_rings;
2092 	}
2093 
2094 	if (na->active_fds == 0) {
2095 		/* cache the allocator info in the na */
2096 		error = netmap_mem_get_lut(na->nm_mem, &na->na_lut);
2097 		if (error)
2098 			goto err_del_if;
2099 		ND("lut %p bufs %u size %u", na->na_lut.lut, na->na_lut.objtotal,
2100 					    na->na_lut.objsize);
2101 	}
2102 
2103 	if (nm_kring_pending(priv)) {
2104 		/* Some kring is switching mode, tell the adapter to
2105 		 * react on this. */
2106 		error = na->nm_register(na, 1);
2107 		if (error)
2108 			goto err_put_lut;
2109 	}
2110 
2111 	/* Commit the reference. */
2112 	na->active_fds++;
2113 
2114 	/*
2115 	 * advertise that the interface is ready by setting np_nifp.
2116 	 * The barrier is needed because readers (poll, *SYNC and mmap)
2117 	 * check for priv->np_nifp != NULL without locking
2118 	 */
2119 	mb(); /* make sure previous writes are visible to all CPUs */
2120 	priv->np_nifp = nifp;
2121 
2122 	return 0;
2123 
2124 err_put_lut:
2125 	if (na->active_fds == 0)
2126 		memset(&na->na_lut, 0, sizeof(na->na_lut));
2127 err_del_if:
2128 	netmap_mem_if_delete(na, nifp);
2129 err_rel_excl:
2130 	netmap_krings_put(priv);
2131 err_del_rings:
2132 	netmap_mem_rings_delete(na);
2133 err_del_krings:
2134 	if (na->active_fds == 0)
2135 		na->nm_krings_delete(na);
2136 err_drop_mem:
2137 	netmap_mem_deref(na->nm_mem, na);
2138 err:
2139 	priv->np_na = NULL;
2140 	return error;
2141 }
2142 
2143 
2144 /*
2145  * update kring and ring at the end of rxsync/txsync.
2146  */
2147 static inline void
2148 nm_sync_finalize(struct netmap_kring *kring)
2149 {
2150 	/*
2151 	 * Update ring tail to what the kernel knows
2152 	 * After txsync: head/rhead/hwcur might be behind cur/rcur
2153 	 * if no carrier.
2154 	 */
2155 	kring->ring->tail = kring->rtail = kring->nr_hwtail;
2156 
2157 	ND(5, "%s now hwcur %d hwtail %d head %d cur %d tail %d",
2158 		kring->name, kring->nr_hwcur, kring->nr_hwtail,
2159 		kring->rhead, kring->rcur, kring->rtail);
2160 }
2161 
2162 /* set ring timestamp */
2163 static inline void
2164 ring_timestamp_set(struct netmap_ring *ring)
2165 {
2166 	if (netmap_no_timestamp == 0 || ring->flags & NR_TIMESTAMP) {
2167 		microtime(&ring->ts);
2168 	}
2169 }
2170 
2171 
2172 /*
2173  * ioctl(2) support for the "netmap" device.
2174  *
2175  * Following a list of accepted commands:
2176  * - NIOCGINFO
2177  * - SIOCGIFADDR	just for convenience
2178  * - NIOCREGIF
2179  * - NIOCTXSYNC
2180  * - NIOCRXSYNC
2181  *
2182  * Return 0 on success, errno otherwise.
2183  */
2184 int
2185 netmap_ioctl(struct netmap_priv_d *priv, u_long cmd, caddr_t data, struct thread *td)
2186 {
2187 	struct mbq q;	/* packets from RX hw queues to host stack */
2188 	struct nmreq *nmr = (struct nmreq *) data;
2189 	struct netmap_adapter *na = NULL;
2190 	struct netmap_mem_d *nmd = NULL;
2191 	struct ifnet *ifp = NULL;
2192 	int error = 0;
2193 	u_int i, qfirst, qlast;
2194 	struct netmap_if *nifp;
2195 	struct netmap_kring *krings;
2196 	int sync_flags;
2197 	enum txrx t;
2198 
2199 	if (cmd == NIOCGINFO || cmd == NIOCREGIF) {
2200 		/* truncate name */
2201 		nmr->nr_name[sizeof(nmr->nr_name) - 1] = '\0';
2202 		if (nmr->nr_version != NETMAP_API) {
2203 			D("API mismatch for %s got %d need %d",
2204 				nmr->nr_name,
2205 				nmr->nr_version, NETMAP_API);
2206 			nmr->nr_version = NETMAP_API;
2207 		}
2208 		if (nmr->nr_version < NETMAP_MIN_API ||
2209 		    nmr->nr_version > NETMAP_MAX_API) {
2210 			return EINVAL;
2211 		}
2212 	}
2213 
2214 	switch (cmd) {
2215 	case NIOCGINFO:		/* return capabilities etc */
2216 		if (nmr->nr_cmd == NETMAP_BDG_LIST) {
2217 			error = netmap_bdg_ctl(nmr, NULL);
2218 			break;
2219 		}
2220 
2221 		NMG_LOCK();
2222 		do {
2223 			/* memsize is always valid */
2224 			u_int memflags;
2225 
2226 			if (nmr->nr_name[0] != '\0') {
2227 
2228 				/* get a refcount */
2229 				error = netmap_get_na(nmr, &na, &ifp, NULL, 1 /* create */);
2230 				if (error) {
2231 					na = NULL;
2232 					ifp = NULL;
2233 					break;
2234 				}
2235 				nmd = na->nm_mem; /* get memory allocator */
2236 			} else {
2237 				nmd = netmap_mem_find(nmr->nr_arg2 ? nmr->nr_arg2 : 1);
2238 				if (nmd == NULL) {
2239 					error = EINVAL;
2240 					break;
2241 				}
2242 			}
2243 
2244 			error = netmap_mem_get_info(nmd, &nmr->nr_memsize, &memflags,
2245 				&nmr->nr_arg2);
2246 			if (error)
2247 				break;
2248 			if (na == NULL) /* only memory info */
2249 				break;
2250 			nmr->nr_offset = 0;
2251 			nmr->nr_rx_slots = nmr->nr_tx_slots = 0;
2252 			netmap_update_config(na);
2253 			nmr->nr_rx_rings = na->num_rx_rings;
2254 			nmr->nr_tx_rings = na->num_tx_rings;
2255 			nmr->nr_rx_slots = na->num_rx_desc;
2256 			nmr->nr_tx_slots = na->num_tx_desc;
2257 		} while (0);
2258 		netmap_unget_na(na, ifp);
2259 		NMG_UNLOCK();
2260 		break;
2261 
2262 	case NIOCREGIF:
2263 		/*
2264 		 * If nmr->nr_cmd is not zero, this NIOCREGIF is not really
2265 		 * a regif operation, but a different one, specified by the
2266 		 * value of nmr->nr_cmd.
2267 		 */
2268 		i = nmr->nr_cmd;
2269 		if (i == NETMAP_BDG_ATTACH || i == NETMAP_BDG_DETACH
2270 				|| i == NETMAP_BDG_VNET_HDR
2271 				|| i == NETMAP_BDG_NEWIF
2272 				|| i == NETMAP_BDG_DELIF
2273 				|| i == NETMAP_BDG_POLLING_ON
2274 				|| i == NETMAP_BDG_POLLING_OFF) {
2275 			/* possibly attach/detach NIC and VALE switch */
2276 			error = netmap_bdg_ctl(nmr, NULL);
2277 			break;
2278 		} else if (i == NETMAP_PT_HOST_CREATE || i == NETMAP_PT_HOST_DELETE) {
2279 			/* forward the command to the ptnetmap subsystem */
2280 			error = ptnetmap_ctl(nmr, priv->np_na);
2281 			break;
2282 		} else if (i == NETMAP_VNET_HDR_GET) {
2283 			/* get vnet-header length for this netmap port */
2284 			struct ifnet *ifp;
2285 
2286 			NMG_LOCK();
2287 			error = netmap_get_na(nmr, &na, &ifp, NULL, 0);
2288 			if (na && !error) {
2289 				nmr->nr_arg1 = na->virt_hdr_len;
2290 			}
2291 			netmap_unget_na(na, ifp);
2292 			NMG_UNLOCK();
2293 			break;
2294 		} else if (i == NETMAP_POOLS_INFO_GET) {
2295 			/* get information from the memory allocator */
2296 			NMG_LOCK();
2297 			if (priv->np_na && priv->np_na->nm_mem) {
2298 				struct netmap_mem_d *nmd = priv->np_na->nm_mem;
2299 				error = netmap_mem_pools_info_get(nmr, nmd);
2300 			} else {
2301 				error = EINVAL;
2302 			}
2303 			NMG_UNLOCK();
2304 			break;
2305 		} else if (i != 0) {
2306 			D("nr_cmd must be 0 not %d", i);
2307 			error = EINVAL;
2308 			break;
2309 		}
2310 
2311 		/* protect access to priv from concurrent NIOCREGIF */
2312 		NMG_LOCK();
2313 		do {
2314 			u_int memflags;
2315 			struct ifnet *ifp;
2316 
2317 			if (priv->np_nifp != NULL) {	/* thread already registered */
2318 				error = EBUSY;
2319 				break;
2320 			}
2321 
2322 			if (nmr->nr_arg2) {
2323 				/* find the allocator and get a reference */
2324 				nmd = netmap_mem_find(nmr->nr_arg2);
2325 				if (nmd == NULL) {
2326 					error = EINVAL;
2327 					break;
2328 				}
2329 			}
2330 			/* find the interface and a reference */
2331 			error = netmap_get_na(nmr, &na, &ifp, nmd,
2332 					      1 /* create */); /* keep reference */
2333 			if (error)
2334 				break;
2335 			if (NETMAP_OWNED_BY_KERN(na)) {
2336 				error = EBUSY;
2337 				break;
2338 			}
2339 
2340 			if (na->virt_hdr_len && !(nmr->nr_flags & NR_ACCEPT_VNET_HDR)) {
2341 				error = EIO;
2342 				break;
2343 			}
2344 
2345 			error = netmap_do_regif(priv, na, nmr->nr_ringid, nmr->nr_flags);
2346 			if (error) {    /* reg. failed, release priv and ref */
2347 				break;
2348 			}
2349 			nifp = priv->np_nifp;
2350 			priv->np_td = td; // XXX kqueue, debugging only
2351 
2352 			/* return the offset of the netmap_if object */
2353 			nmr->nr_rx_rings = na->num_rx_rings;
2354 			nmr->nr_tx_rings = na->num_tx_rings;
2355 			nmr->nr_rx_slots = na->num_rx_desc;
2356 			nmr->nr_tx_slots = na->num_tx_desc;
2357 			error = netmap_mem_get_info(na->nm_mem, &nmr->nr_memsize, &memflags,
2358 				&nmr->nr_arg2);
2359 			if (error) {
2360 				netmap_do_unregif(priv);
2361 				break;
2362 			}
2363 			if (memflags & NETMAP_MEM_PRIVATE) {
2364 				*(uint32_t *)(uintptr_t)&nifp->ni_flags |= NI_PRIV_MEM;
2365 			}
2366 			for_rx_tx(t) {
2367 				priv->np_si[t] = nm_si_user(priv, t) ?
2368 					&na->si[t] : &NMR(na, t)[priv->np_qfirst[t]].si;
2369 			}
2370 
2371 			if (nmr->nr_arg3) {
2372 				if (netmap_verbose)
2373 					D("requested %d extra buffers", nmr->nr_arg3);
2374 				nmr->nr_arg3 = netmap_extra_alloc(na,
2375 					&nifp->ni_bufs_head, nmr->nr_arg3);
2376 				if (netmap_verbose)
2377 					D("got %d extra buffers", nmr->nr_arg3);
2378 			}
2379 			nmr->nr_offset = netmap_mem_if_offset(na->nm_mem, nifp);
2380 
2381 			/* store ifp reference so that priv destructor may release it */
2382 			priv->np_ifp = ifp;
2383 		} while (0);
2384 		if (error) {
2385 			netmap_unget_na(na, ifp);
2386 		}
2387 		/* release the reference from netmap_mem_find() or
2388 		 * netmap_mem_ext_create()
2389 		 */
2390 		if (nmd)
2391 			netmap_mem_put(nmd);
2392 		NMG_UNLOCK();
2393 		break;
2394 
2395 	case NIOCTXSYNC:
2396 	case NIOCRXSYNC:
2397 		nifp = priv->np_nifp;
2398 
2399 		if (nifp == NULL) {
2400 			error = ENXIO;
2401 			break;
2402 		}
2403 		mb(); /* make sure following reads are not from cache */
2404 
2405 		na = priv->np_na;      /* we have a reference */
2406 
2407 		if (na == NULL) {
2408 			D("Internal error: nifp != NULL && na == NULL");
2409 			error = ENXIO;
2410 			break;
2411 		}
2412 
2413 		mbq_init(&q);
2414 		t = (cmd == NIOCTXSYNC ? NR_TX : NR_RX);
2415 		krings = NMR(na, t);
2416 		qfirst = priv->np_qfirst[t];
2417 		qlast = priv->np_qlast[t];
2418 		sync_flags = priv->np_sync_flags;
2419 
2420 		for (i = qfirst; i < qlast; i++) {
2421 			struct netmap_kring *kring = krings + i;
2422 			struct netmap_ring *ring = kring->ring;
2423 
2424 			if (unlikely(nm_kr_tryget(kring, 1, &error))) {
2425 				error = (error ? EIO : 0);
2426 				continue;
2427 			}
2428 
2429 			if (cmd == NIOCTXSYNC) {
2430 				if (netmap_verbose & NM_VERB_TXSYNC)
2431 					D("pre txsync ring %d cur %d hwcur %d",
2432 					    i, ring->cur,
2433 					    kring->nr_hwcur);
2434 				if (nm_txsync_prologue(kring, ring) >= kring->nkr_num_slots) {
2435 					netmap_ring_reinit(kring);
2436 				} else if (kring->nm_sync(kring, sync_flags | NAF_FORCE_RECLAIM) == 0) {
2437 					nm_sync_finalize(kring);
2438 				}
2439 				if (netmap_verbose & NM_VERB_TXSYNC)
2440 					D("post txsync ring %d cur %d hwcur %d",
2441 					    i, ring->cur,
2442 					    kring->nr_hwcur);
2443 			} else {
2444 				if (nm_rxsync_prologue(kring, ring) >= kring->nkr_num_slots) {
2445 					netmap_ring_reinit(kring);
2446 				}
2447 				if (nm_may_forward_up(kring)) {
2448 					/* transparent forwarding, see netmap_poll() */
2449 					netmap_grab_packets(kring, &q, netmap_fwd);
2450 				}
2451 				if (kring->nm_sync(kring, sync_flags | NAF_FORCE_READ) == 0) {
2452 					nm_sync_finalize(kring);
2453 				}
2454 				ring_timestamp_set(ring);
2455 			}
2456 			nm_kr_put(kring);
2457 		}
2458 
2459 		if (mbq_peek(&q)) {
2460 			netmap_send_up(na->ifp, &q);
2461 		}
2462 
2463 		break;
2464 
2465 #ifdef WITH_VALE
2466 	case NIOCCONFIG:
2467 		error = netmap_bdg_config(nmr);
2468 		break;
2469 #endif
2470 #ifdef __FreeBSD__
2471 	case FIONBIO:
2472 	case FIOASYNC:
2473 		ND("FIONBIO/FIOASYNC are no-ops");
2474 		break;
2475 
2476 	case BIOCIMMEDIATE:
2477 	case BIOCGHDRCMPLT:
2478 	case BIOCSHDRCMPLT:
2479 	case BIOCSSEESENT:
2480 		D("ignore BIOCIMMEDIATE/BIOCSHDRCMPLT/BIOCSHDRCMPLT/BIOCSSEESENT");
2481 		break;
2482 
2483 	default:	/* allow device-specific ioctls */
2484 	    {
2485 		struct ifnet *ifp = ifunit_ref(nmr->nr_name);
2486 		if (ifp == NULL) {
2487 			error = ENXIO;
2488 		} else {
2489 			struct socket so;
2490 
2491 			bzero(&so, sizeof(so));
2492 			so.so_vnet = ifp->if_vnet;
2493 			// so->so_proto not null.
2494 			error = ifioctl(&so, cmd, data, td);
2495 			if_rele(ifp);
2496 		}
2497 		break;
2498 	    }
2499 
2500 #else /* linux */
2501 	default:
2502 		error = EOPNOTSUPP;
2503 #endif /* linux */
2504 	}
2505 
2506 	return (error);
2507 }
2508 
2509 
2510 /*
2511  * select(2) and poll(2) handlers for the "netmap" device.
2512  *
2513  * Can be called for one or more queues.
2514  * Return the event mask corresponding to ready events.
2515  * If there are no ready events, do a selrecord on either individual
2516  * selinfo or on the global one.
2517  * Device-dependent parts (locking and sync of tx/rx rings)
2518  * are done through callbacks.
2519  *
2520  * On linux, arguments are really pwait, the poll table, and 'td' is struct file *
2521  * The first one is remapped to pwait as selrecord() uses the name as an
2522  * hidden argument.
 *
 * priv:   per-file-descriptor state; its np_qfirst[]/np_qlast[] intervals
 *         select the rings that are examined.
 * events: requested poll events; only POLLOUT/POLLWRNORM and
 *         POLLIN/POLLRDNORM are looked at.
 * sr:     handle passed to nm_os_selrecord(); when it is NULL no
 *         selrecord is performed.
 *
 * POLLERR is returned right away when no interface is registered on priv
 * or the adapter is not in netmap mode, and is added to the result when a
 * ring has to be reinitialized or its nm_sync() callback fails.
2523  */
2524 int
2525 netmap_poll(struct netmap_priv_d *priv, int events, NM_SELRECORD_T *sr)
2526 {
2527 	struct netmap_adapter *na;
2528 	struct netmap_kring *kring;
2529 	struct netmap_ring *ring;
2530 	u_int i, check_all_tx, check_all_rx, want[NR_TXRX], revents = 0;
	/* want_tx/want_rx alias the two entries of want[] so that the same
	 * state can be reached by name or indexed by direction; they are
	 * #undef'ed at the end of the function */
2531 #define want_tx want[NR_TX]
2532 #define want_rx want[NR_RX]
2533 	struct mbq q;	/* packets from RX hw queues to host stack */
2534 	enum txrx t;
2535 
2536 	/*
2537 	 * In order to avoid nested locks, we need to "double check"
2538 	 * txsync and rxsync if we decide to do a selrecord().
2539 	 * retry_tx (and retry_rx, later) prevent looping forever.
2540 	 */
2541 	int retry_tx = 1, retry_rx = 1;
2542 
2543 	/* Transparent mode: send_down is 1 if we have found some
2544 	 * packets to forward (host RX ring --> NIC) during the rx
2545 	 * scan and we have not sent them down to the NIC yet.
2546 	 * Transparent mode requires to bind all rings to a single
2547 	 * file descriptor.
2548 	 */
2549 	int send_down = 0;
2550 	int sync_flags = priv->np_sync_flags;
2551 
2552 	mbq_init(&q);
2553 
2554 	if (priv->np_nifp == NULL) {
2555 		D("No if registered");
2556 		return POLLERR;
2557 	}
2558 	mb(); /* make sure following reads are not from cache */
2559 
2560 	na = priv->np_na;
2561 
2562 	if (!nm_netmap_on(na))
2563 		return POLLERR;
2564 
2565 	if (netmap_verbose & 0x8000)
2566 		D("device %s events 0x%x", na->name, events);
	/* keep only the event bits this handler knows how to serve */
2567 	want_tx = events & (POLLOUT | POLLWRNORM);
2568 	want_rx = events & (POLLIN | POLLRDNORM);
2569 
2570 	/*
2571 	 * check_all_{tx|rx} are set if the card has more than one queue AND
2572 	 * the file descriptor is bound to all of them. If so, we sleep on
2573 	 * the "global" selinfo, otherwise we sleep on individual selinfo
2574 	 * (FreeBSD only allows two selinfo's per file descriptor).
2575 	 * The interrupt routine in the driver wake one or the other
2576 	 * (or both) depending on which clients are active.
2577 	 *
2578 	 * rxsync() is only called if we run out of buffers on a POLLIN.
2579 	 * txsync() is called if we run out of buffers on POLLOUT, or
2580 	 * there are pending packets to send. The latter can be disabled
2581 	 * passing NETMAP_NO_TX_POLL in the NIOCREG call.
2582 	 */
2583 	check_all_tx = nm_si_user(priv, NR_TX);
2584 	check_all_rx = nm_si_user(priv, NR_RX);
2585 
2586 	/*
2587 	 * We start with a lock free round which is cheap if we have
2588 	 * slots available. If this fails, then lock and call the sync
2589 	 * routines.
2590 	 */
2591 #if 1 /* new code- call rx if any of the ring needs to release or read buffers */
2592 	if (want_tx) {
2593 		t = NR_TX;
2594 		for (i = priv->np_qfirst[t]; want[t] && i < priv->np_qlast[t]; i++) {
2595 			kring = &NMR(na, t)[i];
2596 			/* XXX compare ring->cur and kring->tail */
2597 			if (!nm_ring_empty(kring->ring)) {
2598 				revents |= want[t];
2599 				want[t] = 0;	/* also breaks the loop */
2600 			}
2601 		}
2602 	}
2603 	if (want_rx) {
2604 		want_rx = 0; /* look for a reason to run the handlers */
2605 		t = NR_RX;
2606 		for (i = priv->np_qfirst[t]; i < priv->np_qlast[t]; i++) {
2607 			kring = &NMR(na, t)[i];
2608 			if (kring->ring->cur == kring->ring->tail /* try fetch new buffers */
2609 			    || kring->rhead != kring->ring->head /* release buffers */) {
2610 				want_rx = 1;
2611 			}
2612 		}
		/* no ring needed the handlers: report the rx events as
		 * ready without going through rxsync */
2613 		if (!want_rx)
2614 			revents |= events & (POLLIN | POLLRDNORM); /* we have data */
2615 	}
2616 #else /* old code */
2617 	for_rx_tx(t) {
2618 		for (i = priv->np_qfirst[t]; want[t] && i < priv->np_qlast[t]; i++) {
2619 			kring = &NMR(na, t)[i];
2620 			/* XXX compare ring->cur and kring->tail */
2621 			if (!nm_ring_empty(kring->ring)) {
2622 				revents |= want[t];
2623 				want[t] = 0;	/* also breaks the loop */
2624 			}
2625 		}
2626 	}
2627 #endif /* old code */
2628 
2629 	/*
2630 	 * If we want to push packets out (priv->np_txpoll) or
2631 	 * want_tx is still set, we must issue txsync calls
2632 	 * (on all rings, to avoid that the tx rings stall).
2633 	 * XXX should also check cur != hwcur on the tx rings.
2634 	 * Fortunately, normal tx mode has np_txpoll set.
2635 	 */
2636 	if (priv->np_txpoll || want_tx) {
2637 		/*
2638 		 * The first round checks if anyone is ready, if not
2639 		 * do a selrecord and another round to handle races.
2640 		 * want_tx goes to 0 if any space is found, and is
2641 		 * used to skip rings with no pending transmissions.
2642 		 */
		/* flush_tx is also entered with a goto from the rx section
		 * below when send_down is set; that jump bypasses the
		 * np_txpoll/want_tx test above */
2643 flush_tx:
2644 		for (i = priv->np_qfirst[NR_TX]; i < priv->np_qlast[NR_TX]; i++) {
2645 			int found = 0;
2646 
2647 			kring = &na->tx_rings[i];
2648 			ring = kring->ring;
2649 
			/* nothing to forward, no space wanted and nothing
			 * queued by the user on this ring: skip the sync */
2650 			if (!send_down && !want_tx && ring->cur == kring->nr_hwcur)
2651 				continue;
2652 
			/* NOTE(review): &revents is handed to nm_kr_tryget(),
			 * presumably so that it can flag an error in the
			 * returned mask -- confirm against its definition */
2653 			if (nm_kr_tryget(kring, 1, &revents))
2654 				continue;
2655 
2656 			if (nm_txsync_prologue(kring, ring) >= kring->nkr_num_slots) {
2657 				netmap_ring_reinit(kring);
2658 				revents |= POLLERR;
2659 			} else {
2660 				if (kring->nm_sync(kring, sync_flags))
2661 					revents |= POLLERR;
2662 				else
2663 					nm_sync_finalize(kring);
2664 			}
2665 
2666 			/*
2667 			 * If we found new slots, notify potential
2668 			 * listeners on the same ring.
2669 			 * Since we just did a txsync, look at the copies
2670 			 * of cur,tail in the kring.
2671 			 */
2672 			found = kring->rcur != kring->rtail;
2673 			nm_kr_put(kring);
2674 			if (found) { /* notify other listeners */
2675 				revents |= want_tx;
2676 				want_tx = 0;
2677 				kring->nm_notify(kring, 0);
2678 			}
2679 		}
2680 		/* if there were any packet to forward we must have handled them by now */
2681 		send_down = 0;
		/* still no space: register on the selinfo once, then run
		 * one more round (retry_tx is cleared so this happens at
		 * most once) */
2682 		if (want_tx && retry_tx && sr) {
2683 			nm_os_selrecord(sr, check_all_tx ?
2684 			    &na->si[NR_TX] : &na->tx_rings[priv->np_qfirst[NR_TX]].si);
2685 			retry_tx = 0;
2686 			goto flush_tx;
2687 		}
2688 	}
2689 
2690 	/*
2691 	 * If want_rx is still set scan receive rings.
2692 	 * Do it on all rings because otherwise we starve.
2693 	 */
2694 	if (want_rx) {
2695 		/* two rounds here for race avoidance */
2696 do_retry_rx:
2697 		for (i = priv->np_qfirst[NR_RX]; i < priv->np_qlast[NR_RX]; i++) {
2698 			int found = 0;
2699 
2700 			kring = &na->rx_rings[i];
2701 			ring = kring->ring;
2702 
2703 			if (unlikely(nm_kr_tryget(kring, 1, &revents)))
2704 				continue;
2705 
			/* unlike the tx case, the sync below is still
			 * attempted after a reinit */
2706 			if (nm_rxsync_prologue(kring, ring) >= kring->nkr_num_slots) {
2707 				netmap_ring_reinit(kring);
2708 				revents |= POLLERR;
2709 			}
2710 			/* now we can use kring->rcur, rtail */
2711 
2712 			/*
2713 			 * transparent mode support: collect packets from
2714 			 * hw rxring(s) that have been released by the user
2715 			 */
2716 			if (nm_may_forward_up(kring)) {
2717 				netmap_grab_packets(kring, &q, netmap_fwd);
2718 			}
2719 
2720 			/* Clear the NR_FORWARD flag anyway, it may be set by
2721 			 * the nm_sync() below only on for the host RX ring (see
2722 			 * netmap_rxsync_from_host()). */
2723 			kring->nr_kflags &= ~NR_FORWARD;
2724 			if (kring->nm_sync(kring, sync_flags))
2725 				revents |= POLLERR;
2726 			else
2727 				nm_sync_finalize(kring);
			/* remember whether the sync asked for packets to be
			 * pushed down through the tx rings */
2728 			send_down |= (kring->nr_kflags & NR_FORWARD);
2729 			ring_timestamp_set(ring);
2730 			found = kring->rcur != kring->rtail;
2731 			nm_kr_put(kring);
2732 			if (found) {
2733 				revents |= want_rx;
2734 				retry_rx = 0;
2735 				kring->nm_notify(kring, 0);
2736 			}
2737 		}
2738 
2739 		if (retry_rx && sr) {
2740 			nm_os_selrecord(sr, check_all_rx ?
2741 			    &na->si[NR_RX] : &na->rx_rings[priv->np_qfirst[NR_RX]].si);
2742 		}
		/* retry_rx is cleared before either jump, so the rx scan is
		 * repeated at most once on its account */
2743 		if (send_down || retry_rx) {
2744 			retry_rx = 0;
2745 			if (send_down)
2746 				goto flush_tx; /* and retry_rx */
2747 			else
2748 				goto do_retry_rx;
2749 		}
2750 	}
2751 
2752 	/*
2753 	 * Transparent mode: released bufs (i.e. between kring->nr_hwcur and
2754 	 * ring->head) marked with NS_FORWARD on hw rx rings are passed up
2755 	 * to the host stack.
2756 	 */
2757 
2758 	if (mbq_peek(&q)) {
2759 		netmap_send_up(na->ifp, &q);
2760 	}
2761 
2762 	return (revents);
2763 #undef want_tx
2764 #undef want_rx
2765 }
2766 
2767 
2768 /*-------------------- driver support routines -------------------*/
2769 
2770 /* default notify callback */
2771 static int
2772 netmap_notify(struct netmap_kring *kring, int flags)
2773 {
2774 	struct netmap_adapter *na = kring->na;
2775 	enum txrx t = kring->tx;
2776 
2777 	nm_os_selwakeup(&kring->si);
2778 	/* optimization: avoid a wake up on the global
2779 	 * queue if nobody has registered for more
2780 	 * than one ring
2781 	 */
2782 	if (na->si_users[t] > 0)
2783 		nm_os_selwakeup(&na->si[t]);
2784 
2785 	return NM_IRQ_COMPLETED;
2786 }
2787 
2788 /* called by all routines that create netmap_adapters.
2789  * provide some defaults and get a reference to the
2790  * memory allocator
2791  */
2792 int
2793 netmap_attach_common(struct netmap_adapter *na)
2794 {
2795 	if (na->num_tx_rings == 0 || na->num_rx_rings == 0) {
2796 		D("%s: invalid rings tx %d rx %d",
2797 			na->name, na->num_tx_rings, na->num_rx_rings);
2798 		return EINVAL;
2799 	}
2800 
2801 #ifdef __FreeBSD__
2802 	if (na->na_flags & NAF_HOST_RINGS && na->ifp) {
2803 		na->if_input = na->ifp->if_input; /* for netmap_send_up */
2804 	}
2805 #endif /* __FreeBSD__ */
2806 	if (na->nm_krings_create == NULL) {
2807 		/* we assume that we have been called by a driver,
2808 		 * since other port types all provide their own
2809 		 * nm_krings_create
2810 		 */
2811 		na->nm_krings_create = netmap_hw_krings_create;
2812 		na->nm_krings_delete = netmap_hw_krings_delete;
2813 	}
2814 	if (na->nm_notify == NULL)
2815 		na->nm_notify = netmap_notify;
2816 	na->active_fds = 0;
2817 
2818 	if (na->nm_mem == NULL) {
2819 		/* use the global allocator */
2820 		na->nm_mem = netmap_mem_get(&nm_mem);
2821 	}
2822 #ifdef WITH_VALE
2823 	if (na->nm_bdg_attach == NULL)
2824 		/* no special nm_bdg_attach callback. On VALE
2825 		 * attach, we need to interpose a bwrap
2826 		 */
2827 		na->nm_bdg_attach = netmap_bwrap_attach;
2828 #endif
2829 
2830 	return 0;
2831 }
2832 
2833 
2834 /* standard cleanup, called by all destructors */
2835 void
2836 netmap_detach_common(struct netmap_adapter *na)
2837 {
2838 	if (na->tx_rings) { /* XXX should not happen */
2839 		D("freeing leftover tx_rings");
2840 		na->nm_krings_delete(na);
2841 	}
2842 	netmap_pipe_dealloc(na);
2843 	if (na->nm_mem)
2844 		netmap_mem_put(na->nm_mem);
2845 	bzero(na, sizeof(*na));
2846 	nm_os_free(na);
2847 }
2848 
2849 /* Wrapper for the register callback provided netmap-enabled
2850  * hardware drivers.
2851  * nm_iszombie(na) means that the driver module has been
2852  * unloaded, so we cannot call into it.
2853  * nm_os_ifnet_lock() must guarantee mutual exclusion with
2854  * module unloading.
2855  */
2856 static int
2857 netmap_hw_reg(struct netmap_adapter *na, int onoff)
2858 {
2859 	struct netmap_hw_adapter *hwna =
2860 		(struct netmap_hw_adapter*)na;
2861 	int error = 0;
2862 
2863 	nm_os_ifnet_lock();
2864 
2865 	if (nm_iszombie(na)) {
2866 		if (onoff) {
2867 			error = ENXIO;
2868 		} else if (na != NULL) {
2869 			na->na_flags &= ~NAF_NETMAP_ON;
2870 		}
2871 		goto out;
2872 	}
2873 
2874 	error = hwna->nm_hw_register(na, onoff);
2875 
2876 out:
2877 	nm_os_ifnet_unlock();
2878 
2879 	return error;
2880 }
2881 
2882 static void
2883 netmap_hw_dtor(struct netmap_adapter *na)
2884 {
2885 	if (nm_iszombie(na) || na->ifp == NULL)
2886 		return;
2887 
2888 	WNA(na->ifp) = NULL;
2889 }
2890 
2891 
2892 /*
2893  * Allocate a netmap_adapter object, and initialize it from the
2894  * 'arg' passed by the driver on attach.
2895  * We allocate a block of memory of 'size' bytes, which has room
2896  * for struct netmap_adapter plus additional room private to
2897  * the caller.
2898  * Return 0 on success, ENOMEM otherwise.
2899  */
2900 int
2901 netmap_attach_ext(struct netmap_adapter *arg, size_t size)
2902 {
2903 	struct netmap_hw_adapter *hwna = NULL;
2904 	struct ifnet *ifp = NULL;
2905 
2906 	if (size < sizeof(struct netmap_hw_adapter)) {
2907 		D("Invalid netmap adapter size %d", (int)size);
2908 		return EINVAL;
2909 	}
2910 
2911 	if (arg == NULL || arg->ifp == NULL)
2912 		goto fail;
2913 	ifp = arg->ifp;
2914 	hwna = nm_os_malloc(size);
2915 	if (hwna == NULL)
2916 		goto fail;
2917 	hwna->up = *arg;
2918 	hwna->up.na_flags |= NAF_HOST_RINGS | NAF_NATIVE;
2919 	strncpy(hwna->up.name, ifp->if_xname, sizeof(hwna->up.name));
2920 	hwna->nm_hw_register = hwna->up.nm_register;
2921 	hwna->up.nm_register = netmap_hw_reg;
2922 	if (netmap_attach_common(&hwna->up)) {
2923 		nm_os_free(hwna);
2924 		goto fail;
2925 	}
2926 	netmap_adapter_get(&hwna->up);
2927 
2928 	NM_ATTACH_NA(ifp, &hwna->up);
2929 
2930 #ifdef linux
2931 	if (ifp->netdev_ops) {
2932 		/* prepare a clone of the netdev ops */
2933 #ifndef NETMAP_LINUX_HAVE_NETDEV_OPS
2934 		hwna->nm_ndo.ndo_start_xmit = ifp->netdev_ops;
2935 #else
2936 		hwna->nm_ndo = *ifp->netdev_ops;
2937 #endif /* NETMAP_LINUX_HAVE_NETDEV_OPS */
2938 	}
2939 	hwna->nm_ndo.ndo_start_xmit = linux_netmap_start_xmit;
2940 	if (ifp->ethtool_ops) {
2941 		hwna->nm_eto = *ifp->ethtool_ops;
2942 	}
2943 	hwna->nm_eto.set_ringparam = linux_netmap_set_ringparam;
2944 #ifdef NETMAP_LINUX_HAVE_SET_CHANNELS
2945 	hwna->nm_eto.set_channels = linux_netmap_set_channels;
2946 #endif /* NETMAP_LINUX_HAVE_SET_CHANNELS */
2947 	if (arg->nm_config == NULL) {
2948 		hwna->up.nm_config = netmap_linux_config;
2949 	}
2950 #endif /* linux */
2951 	if (arg->nm_dtor == NULL) {
2952 		hwna->up.nm_dtor = netmap_hw_dtor;
2953 	}
2954 
2955 	if_printf(ifp, "netmap queues/slots: TX %d/%d, RX %d/%d\n",
2956 	    hwna->up.num_tx_rings, hwna->up.num_tx_desc,
2957 	    hwna->up.num_rx_rings, hwna->up.num_rx_desc);
2958 	return 0;
2959 
2960 fail:
2961 	D("fail, arg %p ifp %p na %p", arg, ifp, hwna);
2962 	return (hwna ? EINVAL : ENOMEM);
2963 }
2964 
2965 
2966 int
2967 netmap_attach(struct netmap_adapter *arg)
2968 {
2969 	return netmap_attach_ext(arg, sizeof(struct netmap_hw_adapter));
2970 }
2971 
2972 
2973 void
2974 NM_DBG(netmap_adapter_get)(struct netmap_adapter *na)
2975 {
2976 	if (!na) {
2977 		return;
2978 	}
2979 
2980 	refcount_acquire(&na->na_refcount);
2981 }
2982 
2983 
2984 /* returns 1 iff the netmap_adapter is destroyed */
2985 int
2986 NM_DBG(netmap_adapter_put)(struct netmap_adapter *na)
2987 {
2988 	if (!na)
2989 		return 1;
2990 
2991 	if (!refcount_release(&na->na_refcount))
2992 		return 0;
2993 
2994 	if (na->nm_dtor)
2995 		na->nm_dtor(na);
2996 
2997 	netmap_detach_common(na);
2998 
2999 	return 1;
3000 }
3001 
3002 /* nm_krings_create callback for all hardware native adapters */
3003 int
3004 netmap_hw_krings_create(struct netmap_adapter *na)
3005 {
3006 	int ret = netmap_krings_create(na, 0);
3007 	if (ret == 0) {
3008 		/* initialize the mbq for the sw rx ring */
3009 		mbq_safe_init(&na->rx_rings[na->num_rx_rings].rx_queue);
3010 		ND("initialized sw rx queue %d", na->num_rx_rings);
3011 	}
3012 	return ret;
3013 }
3014 
3015 
3016 
3017 /*
3018  * Called on module unload by the netmap-enabled drivers
3019  */
3020 void
3021 netmap_detach(struct ifnet *ifp)
3022 {
3023 	struct netmap_adapter *na = NA(ifp);
3024 
3025 	if (!na)
3026 		return;
3027 
3028 	NMG_LOCK();
3029 	netmap_set_all_rings(na, NM_KR_LOCKED);
3030 	na->na_flags |= NAF_ZOMBIE;
3031 	/*
3032 	 * if the netmap adapter is not native, somebody
3033 	 * changed it, so we can not release it here.
3034 	 * The NAF_ZOMBIE flag will notify the new owner that
3035 	 * the driver is gone.
3036 	 */
3037 	if (na->na_flags & NAF_NATIVE) {
3038 	        netmap_adapter_put(na);
3039 	}
3040 	/* give active users a chance to notice that NAF_ZOMBIE has been
3041 	 * turned on, so that they can stop and return an error to userspace.
3042 	 * Note that this becomes a NOP if there are no active users and,
3043 	 * therefore, the put() above has deleted the na, since now NA(ifp) is
3044 	 * NULL.
3045 	 */
3046 	netmap_enable_all_rings(ifp);
3047 	NMG_UNLOCK();
3048 }
3049 
3050 
3051 /*
3052  * Intercept packets from the network stack and pass them
3053  * to netmap as incoming packets on the 'software' ring.
3054  *
3055  * We only store packets in a bounded mbq and then copy them
3056  * in the relevant rxsync routine.
3057  *
3058  * We rely on the OS to make sure that the ifp and na do not go
3059  * away (typically the caller checks for IFF_DRV_RUNNING or the like).
3060  * In nm_register() or whenever there is a reinitialization,
3061  * we make sure to make the mode change visible here.
3062  */
3063 int
3064 netmap_transmit(struct ifnet *ifp, struct mbuf *m)
3065 {
3066 	struct netmap_adapter *na = NA(ifp);
3067 	struct netmap_kring *kring, *tx_kring;
3068 	u_int len = MBUF_LEN(m);
3069 	u_int error = ENOBUFS;
3070 	unsigned int txr;
3071 	struct mbq *q;
3072 	int busy;
3073 
3074 	kring = &na->rx_rings[na->num_rx_rings];
3075 	// XXX [Linux] we do not need this lock
3076 	// if we follow the down/configure/up protocol -gl
3077 	// mtx_lock(&na->core_lock);
3078 
3079 	if (!nm_netmap_on(na)) {
3080 		D("%s not in netmap mode anymore", na->name);
3081 		error = ENXIO;
3082 		goto done;
3083 	}
3084 
3085 	txr = MBUF_TXQ(m);
3086 	if (txr >= na->num_tx_rings) {
3087 		txr %= na->num_tx_rings;
3088 	}
3089 	tx_kring = &NMR(na, NR_TX)[txr];
3090 
3091 	if (tx_kring->nr_mode == NKR_NETMAP_OFF) {
3092 		return MBUF_TRANSMIT(na, ifp, m);
3093 	}
3094 
3095 	q = &kring->rx_queue;
3096 
3097 	// XXX reconsider long packets if we handle fragments
3098 	if (len > NETMAP_BUF_SIZE(na)) { /* too long for us */
3099 		D("%s from_host, drop packet size %d > %d", na->name,
3100 			len, NETMAP_BUF_SIZE(na));
3101 		goto done;
3102 	}
3103 
3104 	if (nm_os_mbuf_has_offld(m)) {
3105 		RD(1, "%s drop mbuf that needs offloadings", na->name);
3106 		goto done;
3107 	}
3108 
3109 	/* protect against netmap_rxsync_from_host(), netmap_sw_to_nic()
3110 	 * and maybe other instances of netmap_transmit (the latter
3111 	 * not possible on Linux).
3112 	 * We enqueue the mbuf only if we are sure there is going to be
3113 	 * enough room in the host RX ring, otherwise we drop it.
3114 	 */
3115 	mbq_lock(q);
3116 
3117 	busy = kring->nr_hwtail - kring->nr_hwcur;
3118 	if (busy < 0)
3119 		busy += kring->nkr_num_slots;
3120 	if (busy + mbq_len(q) >= kring->nkr_num_slots - 1) {
3121 		RD(2, "%s full hwcur %d hwtail %d qlen %d", na->name,
3122 			kring->nr_hwcur, kring->nr_hwtail, mbq_len(q));
3123 	} else {
3124 		mbq_enqueue(q, m);
3125 		ND(2, "%s %d bufs in queue", na->name, mbq_len(q));
3126 		/* notify outside the lock */
3127 		m = NULL;
3128 		error = 0;
3129 	}
3130 	mbq_unlock(q);
3131 
3132 done:
3133 	if (m)
3134 		m_freem(m);
3135 	/* unconditionally wake up listeners */
3136 	kring->nm_notify(kring, 0);
3137 	/* this is normally netmap_notify(), but for nics
3138 	 * connected to a bridge it is netmap_bwrap_intr_notify(),
3139 	 * that possibly forwards the frames through the switch
3140 	 */
3141 
3142 	return (error);
3143 }
3144 
3145 
3146 /*
3147  * netmap_reset() is called by the driver routines when reinitializing
3148  * a ring. The driver is in charge of locking to protect the kring.
3149  * If native netmap mode is not set just return NULL.
3150  * If native netmap mode is set, in particular, we have to set nr_mode to
3151  * NKR_NETMAP_ON.
3152  */
3153 struct netmap_slot *
3154 netmap_reset(struct netmap_adapter *na, enum txrx tx, u_int n,
3155 	u_int new_cur)
3156 {
3157 	struct netmap_kring *kring;
3158 	int new_hwofs, lim;
3159 
3160 	if (!nm_native_on(na)) {
3161 		ND("interface not in native netmap mode");
3162 		return NULL;	/* nothing to reinitialize */
3163 	}
3164 
3165 	/* XXX note- in the new scheme, we are not guaranteed to be
3166 	 * under lock (e.g. when called on a device reset).
3167 	 * In this case, we should set a flag and do not trust too
3168 	 * much the values. In practice: TODO
3169 	 * - set a RESET flag somewhere in the kring
3170 	 * - do the processing in a conservative way
3171 	 * - let the *sync() fixup at the end.
3172 	 */
3173 	if (tx == NR_TX) {
3174 		if (n >= na->num_tx_rings)
3175 			return NULL;
3176 
3177 		kring = na->tx_rings + n;
3178 
3179 		if (kring->nr_pending_mode == NKR_NETMAP_OFF) {
3180 			kring->nr_mode = NKR_NETMAP_OFF;
3181 			return NULL;
3182 		}
3183 
3184 		// XXX check whether we should use hwcur or rcur
3185 		new_hwofs = kring->nr_hwcur - new_cur;
3186 	} else {
3187 		if (n >= na->num_rx_rings)
3188 			return NULL;
3189 		kring = na->rx_rings + n;
3190 
3191 		if (kring->nr_pending_mode == NKR_NETMAP_OFF) {
3192 			kring->nr_mode = NKR_NETMAP_OFF;
3193 			return NULL;
3194 		}
3195 
3196 		new_hwofs = kring->nr_hwtail - new_cur;
3197 	}
3198 	lim = kring->nkr_num_slots - 1;
3199 	if (new_hwofs > lim)
3200 		new_hwofs -= lim + 1;
3201 
3202 	/* Always set the new offset value and realign the ring. */
3203 	if (netmap_verbose)
3204 	    D("%s %s%d hwofs %d -> %d, hwtail %d -> %d",
3205 		na->name,
3206 		tx == NR_TX ? "TX" : "RX", n,
3207 		kring->nkr_hwofs, new_hwofs,
3208 		kring->nr_hwtail,
3209 		tx == NR_TX ? lim : kring->nr_hwtail);
3210 	kring->nkr_hwofs = new_hwofs;
3211 	if (tx == NR_TX) {
3212 		kring->nr_hwtail = kring->nr_hwcur + lim;
3213 		if (kring->nr_hwtail > lim)
3214 			kring->nr_hwtail -= lim + 1;
3215 	}
3216 
3217 #if 0 // def linux
3218 	/* XXX check that the mappings are correct */
3219 	/* need ring_nr, adapter->pdev, direction */
3220 	buffer_info->dma = dma_map_single(&pdev->dev, addr, adapter->rx_buffer_len, DMA_FROM_DEVICE);
3221 	if (dma_mapping_error(&adapter->pdev->dev, buffer_info->dma)) {
3222 		D("error mapping rx netmap buffer %d", i);
3223 		// XXX fix error handling
3224 	}
3225 
3226 #endif /* linux */
3227 	/*
3228 	 * Wakeup on the individual and global selwait
3229 	 * We do the wakeup here, but the ring is not yet reconfigured.
3230 	 * However, we are under lock so there are no races.
3231 	 */
3232 	kring->nr_mode = NKR_NETMAP_ON;
3233 	kring->nm_notify(kring, 0);
3234 	return kring->ring->slot;
3235 }
3236 
3237 
3238 /*
3239  * Dispatch rx/tx interrupts to the netmap rings.
3240  *
3241  * "work_done" is non-null on the RX path, NULL for the TX path.
3242  * We rely on the OS to make sure that there is only one active
3243  * instance per queue, and that there is appropriate locking.
3244  *
3245  * The 'notify' routine depends on what the ring is attached to.
3246  * - for a netmap file descriptor, do a selwakeup on the individual
3247  *   waitqueue, plus one on the global one if needed
3248  *   (see netmap_notify)
3249  * - for a nic connected to a switch, call the proper forwarding routine
3250  *   (see netmap_bwrap_intr_notify)
3251  */
3252 int
3253 netmap_common_irq(struct netmap_adapter *na, u_int q, u_int *work_done)
3254 {
3255 	struct netmap_kring *kring;
3256 	enum txrx t = (work_done ? NR_RX : NR_TX);
3257 
3258 	q &= NETMAP_RING_MASK;
3259 
3260 	if (netmap_verbose) {
3261 	        RD(5, "received %s queue %d", work_done ? "RX" : "TX" , q);
3262 	}
3263 
3264 	if (q >= nma_get_nrings(na, t))
3265 		return NM_IRQ_PASS; // not a physical queue
3266 
3267 	kring = NMR(na, t) + q;
3268 
3269 	if (kring->nr_mode == NKR_NETMAP_OFF) {
3270 		return NM_IRQ_PASS;
3271 	}
3272 
3273 	if (t == NR_RX) {
3274 		kring->nr_kflags |= NKR_PENDINTR;	// XXX atomic ?
3275 		*work_done = 1; /* do not fire napi again */
3276 	}
3277 
3278 	return kring->nm_notify(kring, 0);
3279 }
3280 
3281 
3282 /*
3283  * Default functions to handle rx/tx interrupts from a physical device.
3284  * "work_done" is non-null on the RX path, NULL for the TX path.
3285  *
3286  * If the card is not in netmap mode, simply return NM_IRQ_PASS,
3287  * so that the caller proceeds with regular processing.
3288  * Otherwise call netmap_common_irq().
3289  *
3290  * If the card is connected to a netmap file descriptor,
3291  * do a selwakeup on the individual queue, plus one on the global one
3292  * if needed (multiqueue card _and_ there are multiqueue listeners),
3293  * and return NR_IRQ_COMPLETED.
3294  *
3295  * Finally, if called on rx from an interface connected to a switch,
3296  * calls the proper forwarding routine.
3297  */
3298 int
3299 netmap_rx_irq(struct ifnet *ifp, u_int q, u_int *work_done)
3300 {
3301 	struct netmap_adapter *na = NA(ifp);
3302 
3303 	/*
3304 	 * XXX emulated netmap mode sets NAF_SKIP_INTR so
3305 	 * we still use the regular driver even though the previous
3306 	 * check fails. It is unclear whether we should use
3307 	 * nm_native_on() here.
3308 	 */
3309 	if (!nm_netmap_on(na))
3310 		return NM_IRQ_PASS;
3311 
3312 	if (na->na_flags & NAF_SKIP_INTR) {
3313 		ND("use regular interrupt");
3314 		return NM_IRQ_PASS;
3315 	}
3316 
3317 	return netmap_common_irq(na, q, work_done);
3318 }
3319 
3320 
/*
 * Module loader and unloader
 *
 * netmap_init() creates the /dev/netmap device and initializes
 * all global variables. Returns 0 on success, errno on failure
 * (failure is not expected in practice).
 *
 * netmap_fini() destroys everything.
 */
3330 
/* The /dev/netmap character device: assigned by netmap_init() and
 * destroyed by netmap_fini() when it is not NULL.
 */
static struct cdev *netmap_dev;
/* cdevsw passed to make_dev_credf() in netmap_init(); not defined in
 * this part of the file.
 */
extern struct cdevsw netmap_cdevsw;
3333 
3334 
3335 void
3336 netmap_fini(void)
3337 {
3338 	if (netmap_dev)
3339 		destroy_dev(netmap_dev);
3340 	/* we assume that there are no longer netmap users */
3341 	nm_os_ifnet_fini();
3342 	netmap_uninit_bridges();
3343 	netmap_mem_fini();
3344 	NMG_LOCK_DESTROY();
3345 	nm_prinf("netmap: unloaded module.\n");
3346 }
3347 
3348 
3349 int
3350 netmap_init(void)
3351 {
3352 	int error;
3353 
3354 	NMG_LOCK_INIT();
3355 
3356 	error = netmap_mem_init();
3357 	if (error != 0)
3358 		goto fail;
3359 	/*
3360 	 * MAKEDEV_ETERNAL_KLD avoids an expensive check on syscalls
3361 	 * when the module is compiled in.
3362 	 * XXX could use make_dev_credv() to get error number
3363 	 */
3364 	netmap_dev = make_dev_credf(MAKEDEV_ETERNAL_KLD,
3365 		&netmap_cdevsw, 0, NULL, UID_ROOT, GID_WHEEL, 0600,
3366 			      "netmap");
3367 	if (!netmap_dev)
3368 		goto fail;
3369 
3370 	error = netmap_init_bridges();
3371 	if (error)
3372 		goto fail;
3373 
3374 #ifdef __FreeBSD__
3375 	nm_os_vi_init_index();
3376 #endif
3377 
3378 	error = nm_os_ifnet_init();
3379 	if (error)
3380 		goto fail;
3381 
3382 	nm_prinf("netmap: loaded module\n");
3383 	return (0);
3384 fail:
3385 	netmap_fini();
3386 	return (EINVAL); /* may be incorrect */
3387 }
3388