xref: /freebsd/sys/dev/netmap/netmap.c (revision d8a0fe102c0cfdfcd5b818f850eff09d8536c9bc)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (C) 2011-2014 Matteo Landi
5  * Copyright (C) 2011-2016 Luigi Rizzo
6  * Copyright (C) 2011-2016 Giuseppe Lettieri
7  * Copyright (C) 2011-2016 Vincenzo Maffione
8  * All rights reserved.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  *   1. Redistributions of source code must retain the above copyright
14  *      notice, this list of conditions and the following disclaimer.
15  *   2. Redistributions in binary form must reproduce the above copyright
16  *      notice, this list of conditions and the following disclaimer in the
17  *      documentation and/or other materials provided with the distribution.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
20  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
23  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
25  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
26  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
29  * SUCH DAMAGE.
30  */
31 
32 
33 /*
34  * $FreeBSD$
35  *
36  * This module supports memory-mapped access to network devices,
37  * see netmap(4).
38  *
39  * The module uses a large memory pool allocated by the kernel
40  * and accessible as mmapped memory by multiple userspace threads/processes.
41  * The memory pool contains packet buffers and "netmap rings",
42  * i.e. user-accessible copies of the interface's queues.
43  *
44  * Access to the network card works like this (a minimal userspace sketch follows the list):
45  * 1. a process/thread issues one or more open() on /dev/netmap, to create
46  *    a select()able file descriptor on which events are reported.
47  * 2. on each descriptor, the process issues an ioctl() to identify
48  *    the interface that should report events to the file descriptor.
49  * 3. on each descriptor, the process issues an mmap() request to
50  *    map the shared memory region within the process' address space.
51  *    The list of interesting queues is indicated by a location in
52  *    the shared memory region.
53  * 4. using the functions in the netmap(4) userspace API, a process
54  *    can look up the occupation state of a queue, access memory buffers,
55  *    and retrieve received packets or enqueue packets to transmit.
56  * 5. using some ioctl()s the process can synchronize the userspace view
57  *    of the queue with the actual status in the kernel. This includes both
58  *    receiving the notification of new packets, and transmitting new
59  *    packets on the output interface.
60  * 6. select() or poll() can be used to wait for events on individual
61  *    transmit or receive queues (or all queues for a given interface).
62  *
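 *    As an illustration only (error handling omitted; the interface name
 *    and ring index are hypothetical), the six steps above map to roughly
 *    the following userspace code, using the macros in net/netmap_user.h:
 *
 *	int fd = open("/dev/netmap", O_RDWR);			// step 1
 *	struct nmreq req;
 *	bzero(&req, sizeof(req));
 *	req.nr_version = NETMAP_API;
 *	strlcpy(req.nr_name, "em0", sizeof(req.nr_name));
 *	ioctl(fd, NIOCREGIF, &req);				// step 2
 *	void *mem = mmap(NULL, req.nr_memsize, PROT_READ | PROT_WRITE,
 *	    MAP_SHARED, fd, 0);					// step 3
 *	struct netmap_if *nifp = NETMAP_IF(mem, req.nr_offset);
 *	struct netmap_ring *rxring = NETMAP_RXRING(nifp, 0);	// step 4
 *	struct pollfd pfd = { .fd = fd, .events = POLLIN };
 *	poll(&pfd, 1, -1);					// step 6
 *	while (!nm_ring_empty(rxring)) {
 *		struct netmap_slot *slot = &rxring->slot[rxring->cur];
 *		char *buf = NETMAP_BUF(rxring, slot->buf_idx);
 *		// ... consume slot->len bytes from buf ...
 *		rxring->head = rxring->cur = nm_ring_next(rxring, rxring->cur);
 *	}
 *	ioctl(fd, NIOCRXSYNC, NULL);				// step 5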
63 
64 		SYNCHRONIZATION (USER)
65 
66 The netmap rings and data structures may be shared among multiple
67 user threads or even independent processes.
68 Any synchronization among those threads/processes is delegated
69 to the threads themselves. Only one thread at a time can be in
70 a system call on the same netmap ring. The OS does not enforce
71 this and only guarantees against system crashes in case of
72 invalid usage.
73 
74 		LOCKING (INTERNAL)
75 
76 Within the kernel, access to the netmap rings is protected as follows:
77 
78 - a spinlock on each ring, to handle producer/consumer races on
79   RX rings attached to the host stack (against multiple host
80   threads writing from the host stack to the same ring),
81   and on 'destination' rings attached to a VALE switch
82   (i.e. RX rings in VALE ports, and TX rings in NIC/host ports)
83   (protecting multiple active senders for the same destination)
84 
85 - an atomic variable to guarantee that there is at most one
86   instance of *_*xsync() on the ring at any time.
87   For rings connected to user file
88   descriptors, an atomic_test_and_set() protects this, and the
89   lock on the ring is not actually used.
90   For NIC RX rings connected to a VALE switch, an atomic_test_and_set()
91   is also used to prevent multiple executions (the driver might indeed
92   already guarantee this).
93   For NIC TX rings connected to a VALE switch, the lock arbitrates
94   access to the queue (both when allocating buffers and when pushing
95   them out).
96 
97 - *xsync() should be protected against initializations of the card.
98   On FreeBSD most devices have the reset routine protected by
99   a RING lock (ixgbe, igb, em) or core lock (re). lem is missing
100  * the RING protection on rx_reset(); this should be added.
101 
102   On linux there is an external lock on the tx path, which probably
103   also arbitrates access to the reset routine. XXX to be revised
104 
105 - a per-interface core_lock protecting access from the host stack
106   while interfaces may be detached from netmap mode.
107   XXX there should be no need for this lock if we detach the interfaces
108   only while they are down.
109 
110 
111 --- VALE SWITCH ---
112 
113 NMG_LOCK() serializes all modifications to switches and ports.
114 A switch cannot be deleted until all ports are gone.
115 
116 For each switch, an SX lock (RWlock on linux) protects
117  * deletion of ports. When configuring or deleting a port, the
118 lock is acquired in exclusive mode (after holding NMG_LOCK).
119 When forwarding, the lock is acquired in shared mode (without NMG_LOCK).
120 The lock is held throughout the entire forwarding cycle,
121 during which the thread may incur a page fault.
122 Hence it is important that sleepable shared locks are used.
123 
124 On the rx ring, the per-port lock is grabbed initially to reserve
125 a number of slots in the ring, then the lock is released,
126 packets are copied from source to destination, and then
127 the lock is acquired again and the receive ring is updated.
128 (A similar thing is done on the tx ring for NIC and host stack
129 ports attached to the switch)
130 
131  */
132 
133 
134 /* --- internals ----
135  *
136  * Roadmap to the code that implements the above.
137  *
138  * > 1. a process/thread issues one or more open() on /dev/netmap, to create
139  * >    a select()able file descriptor on which events are reported.
140  *
141  *  	Internally, we allocate a netmap_priv_d structure that will be
142  *  	initialized on ioctl(NIOCREGIF). There is one netmap_priv_d
143  *  	structure for each open().
144  *
145  *      os-specific:
146  *  	    FreeBSD: see netmap_open() (netmap_freebsd.c)
147  *  	    linux:   see linux_netmap_open() (netmap_linux.c)
148  *
149  * > 2. on each descriptor, the process issues an ioctl() to identify
150  * >    the interface that should report events to the file descriptor.
151  *
152  * 	Implemented by netmap_ioctl(), NIOCREGIF case, with nmr->nr_cmd==0.
153  * 	Most important things happen in netmap_get_na() and
154  * 	netmap_do_regif(), called from there. Additional details can be
155  * 	found in the comments above those functions.
156  *
157  * 	In all cases, this action creates/takes-a-reference-to a
158  * 	netmap_*_adapter describing the port, and allocates a netmap_if
159  * 	and all necessary netmap rings, filling them with netmap buffers.
160  *
161  *      In this phase, the sync callbacks for each ring are set (these are used
162  *      in steps 5 and 6 below).  The callbacks depend on the type of adapter.
163  *      The adapter creation/initialization code puts them in the
164  * 	netmap_adapter (fields na->nm_txsync and na->nm_rxsync).  Then, they
165  * 	are copied from there to the netmap_kring's during netmap_do_regif(), by
166  * 	the nm_krings_create() callback.  All the nm_krings_create callbacks
167  * 	actually call netmap_krings_create() to perform this and the other
168  * 	common stuff. netmap_krings_create() also takes care of the host rings,
169  * 	if needed, by setting their sync callbacks appropriately.
170  *
171  * 	Additional actions depend on the kind of netmap_adapter that has been
172  * 	registered:
173  *
174  * 	- netmap_hw_adapter:  	     [netmap.c]
175  * 	     This is a system netdev/ifp with native netmap support (a sketch of the attach pattern follows this list).
176  * 	     The ifp is detached from the host stack by redirecting:
177  * 	       - transmissions (from the network stack) to netmap_transmit()
178  * 	       - receive notifications to the nm_notify() callback for
179  * 	         this adapter. The callback is normally netmap_notify(), unless
180  * 	         the ifp is attached to a bridge using bwrap, in which case it
181  * 	         is netmap_bwrap_intr_notify().
182  *
183  * 	- netmap_generic_adapter:      [netmap_generic.c]
184  * 	      A system netdev/ifp without native netmap support.
185  *
186  * 	(the decision about native/non-native support is taken in
187  * 	 netmap_get_hw_na(), called by netmap_get_na())
188  *
189  * 	- netmap_vp_adapter 		[netmap_vale.c]
190  * 	      Returned by netmap_get_bdg_na().
191  * 	      This is a persistent or ephemeral VALE port. Ephemeral ports
192  * 	      are created on the fly if they don't already exist, and are
193  * 	      always attached to a bridge.
194  * 	      Persistent VALE ports must be created separately, and
195  * 	      then attached like normal NICs. The NIOCREGIF we are examining
196  * 	      will find them only if they have previously been created and
197  * 	      attached (see VALE_CTL below).
198  *
199  * 	- netmap_pipe_adapter 	      [netmap_pipe.c]
200  * 	      Returned by netmap_get_pipe_na().
201  * 	      Both pipe ends are created, if they didn't already exist.
202  *
203  * 	- netmap_monitor_adapter      [netmap_monitor.c]
204  * 	      Returned by netmap_get_monitor_na().
205  * 	      If successful, the nm_sync callbacks of the monitored adapter
206  * 	      will be intercepted by the returned monitor.
207  *
208  * 	- netmap_bwrap_adapter	      [netmap_vale.c]
209  * 	      Cannot be obtained in this way, see VALE_CTL below
210  *
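 * 	     As an illustration of how a netmap_hw_adapter comes into
 * 	     existence (a sketch only; the DEVICE_* names follow the
 * 	     placeholder convention used in the DATAPATHS section below,
 * 	     and the sc softc fields are hypothetical), a native driver
 * 	     typically does something like this in its attach path:
 *
 * 		struct netmap_adapter na;
 *
 * 		bzero(&na, sizeof(na));
 * 		na.ifp = sc->ifp;
 * 		na.num_tx_desc = sc->num_tx_desc;
 * 		na.num_rx_desc = sc->num_rx_desc;
 * 		na.num_tx_rings = na.num_rx_rings = sc->num_queues;
 * 		na.nm_txsync = DEVICE_netmap_txsync;
 * 		na.nm_rxsync = DEVICE_netmap_rxsync;
 * 		na.nm_register = DEVICE_netmap_reg;
 * 		netmap_attach(&na);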
211  *
212  * 	os-specific:
213  * 	    linux: we first go through linux_netmap_ioctl() to
214  * 	           adapt the FreeBSD interface to the linux one.
215  *
216  *
217  * > 3. on each descriptor, the process issues an mmap() request to
218  * >    map the shared memory region within the process' address space.
219  * >    The list of interesting queues is indicated by a location in
220  * >    the shared memory region.
221  *
222  *      os-specific:
223  *  	    FreeBSD: netmap_mmap_single (netmap_freebsd.c).
224  *  	    linux:   linux_netmap_mmap (netmap_linux.c).
225  *
226  * > 4. using the functions in the netmap(4) userspace API, a process
227  * >    can look up the occupation state of a queue, access memory buffers,
228  * >    and retrieve received packets or enqueue packets to transmit.
229  *
230  * 	these actions do not involve the kernel.
231  *
232  * > 5. using some ioctl()s the process can synchronize the userspace view
233  * >    of the queue with the actual status in the kernel. This includes both
234  * >    receiving the notification of new packets, and transmitting new
235  * >    packets on the output interface.
236  *
237  * 	These are implemented in netmap_ioctl(), NIOCTXSYNC and NIOCRXSYNC
238  * 	cases. They invoke the nm_sync callbacks on the netmap_kring
239  * 	structures, as initialized in step 2 and maybe later modified
240  * 	by a monitor. Monitors, however, will always call the original
241  * 	callback before doing anything else.
242  *
243  *
244  * > 6. select() or poll() can be used to wait for events on individual
245  * >    transmit or receive queues (or all queues for a given interface).
246  *
247  * 	Implemented in netmap_poll(). This will call the same nm_sync()
248  * 	callbacks as in step 5 above.
249  *
250  * 	os-specific:
251  * 		linux: we first go through linux_netmap_poll() to adapt
252  * 		       the FreeBSD interface to the linux one.
253  *
254  *
255  *  ----  VALE_CTL -----
256  *
257  *  VALE switches are controlled by issuing a NIOCREGIF with a non-null
258  *  nr_cmd in the nmreq structure. These subcommands are handled by
259  *  netmap_bdg_ctl() in netmap_vale.c. Persistent VALE ports are created
260  *  and destroyed by issuing the NETMAP_BDG_NEWIF and NETMAP_BDG_DELIF
261  *  subcommands, respectively.
262  *
263  *  Any network interface known to the system (including a persistent VALE
264  *  port) can be attached to a VALE switch by issuing the
265  *  NETMAP_BDG_ATTACH subcommand. After the attachment, persistent VALE ports
266  *  look exactly like ephemeral VALE ports (as created in step 2 above).  The
267  *  attachment of other interfaces, instead, requires the creation of a
268  *  netmap_bwrap_adapter.  Moreover, the attached interface must be put in
269  *  netmap mode. This may require the creation of a netmap_generic_adapter if
270  *  we have no native support for the interface, or if generic adapters have
271  *  been forced by sysctl.
272  *
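 *  As an illustration only (switch and interface names are made up, error
 *  handling omitted, fd is an open /dev/netmap descriptor), attaching an
 *  existing interface to a VALE switch looks roughly like this:
 *
 *	struct nmreq req;
 *
 *	bzero(&req, sizeof(req));
 *	req.nr_version = NETMAP_API;
 *	strlcpy(req.nr_name, "valeA:em0", sizeof(req.nr_name));
 *	req.nr_cmd = NETMAP_BDG_ATTACH;
 *	ioctl(fd, NIOCREGIF, &req);
 *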
273  *  Both persistent VALE ports and bwraps are handled by netmap_get_bdg_na(),
274  *  called by nm_bdg_ctl_attach(), and discriminated by the nm_bdg_attach()
275  *  callback.  In the case of the bwrap, the callback creates the
276  *  netmap_bwrap_adapter.  The initialization of the bwrap is then
277  *  completed by calling netmap_do_regif() on it, in the nm_bdg_ctl()
278  *  callback (netmap_bwrap_bdg_ctl in netmap_vale.c).
279  *  A generic adapter for the wrapped ifp will be created if needed, when
280  *  netmap_get_bdg_na() calls netmap_get_hw_na().
281  *
282  *
283  *  ---- DATAPATHS -----
284  *
285  *              -= SYSTEM DEVICE WITH NATIVE SUPPORT =-
286  *
287  *    na == NA(ifp) == netmap_hw_adapter created in DEVICE_netmap_attach()
288  *
289  *    - tx from netmap userspace:
290  *	 concurrently:
291  *           1) ioctl(NIOCTXSYNC)/netmap_poll() in process context
292  *                kring->nm_sync() == DEVICE_netmap_txsync()
293  *           2) device interrupt handler
294  *                na->nm_notify()  == netmap_notify()
295  *    - rx from netmap userspace:
296  *       concurrently:
297  *           1) ioctl(NIOCRXSYNC)/netmap_poll() in process context
298  *                kring->nm_sync() == DEVICE_netmap_rxsync()
299  *           2) device interrupt handler
300  *                na->nm_notify()  == netmap_notify()
301  *    - rx from host stack
302  *       concurrently:
303  *           1) host stack
304  *                netmap_transmit()
305  *                  na->nm_notify  == netmap_notify()
306  *           2) ioctl(NIOCRXSYNC)/netmap_poll() in process context
307  *                kring->nm_sync() == netmap_rxsync_from_host
308  *                  netmap_rxsync_from_host(na, NULL, NULL)
309  *    - tx to host stack
310  *           ioctl(NIOCTXSYNC)/netmap_poll() in process context
311  *             kring->nm_sync() == netmap_txsync_to_host
312  *               netmap_txsync_to_host(na)
313  *                 nm_os_send_up()
314  *                   FreeBSD: na->if_input() == ether_input()
315  *                   linux: netif_rx() with NM_MAGIC_PRIORITY_RX
316  *
317  *
318  *               -= SYSTEM DEVICE WITH GENERIC SUPPORT =-
319  *
320  *    na == NA(ifp) == generic_netmap_adapter created in generic_netmap_attach()
321  *
322  *    - tx from netmap userspace:
323  *       concurrently:
324  *           1) ioctl(NIOCTXSYNC)/netmap_poll() in process context
325  *               kring->nm_sync() == generic_netmap_txsync()
326  *                   nm_os_generic_xmit_frame()
327  *                       linux:   dev_queue_xmit() with NM_MAGIC_PRIORITY_TX
328  *                           ifp->ndo_start_xmit == generic_ndo_start_xmit()
329  *                               gna->save_start_xmit == orig. dev. start_xmit
330  *                       FreeBSD: na->if_transmit() == orig. dev if_transmit
331  *           2) generic_mbuf_destructor()
332  *                   na->nm_notify() == netmap_notify()
333  *    - rx from netmap userspace:
334  *           1) ioctl(NIOCRXSYNC)/netmap_poll() in process context
335  *               kring->nm_sync() == generic_netmap_rxsync()
336  *                   mbq_safe_dequeue()
337  *           2) device driver
338  *               generic_rx_handler()
339  *                   mbq_safe_enqueue()
340  *                   na->nm_notify() == netmap_notify()
341  *    - rx from host stack
342  *        FreeBSD: same as native
343  *        Linux: same as native except:
344  *           1) host stack
345  *               dev_queue_xmit() without NM_MAGIC_PRIORITY_TX
346  *                   ifp->ndo_start_xmit == generic_ndo_start_xmit()
347  *                       netmap_transmit()
348  *                           na->nm_notify() == netmap_notify()
349  *    - tx to host stack (same as native):
350  *
351  *
352  *                           -= VALE =-
353  *
354  *   INCOMING:
355  *
356  *      - VALE ports:
357  *          ioctl(NIOCTXSYNC)/netmap_poll() in process context
358  *              kring->nm_sync() == netmap_vp_txsync()
359  *
360  *      - system device with native support:
361  *         from cable:
362  *             interrupt
363  *                na->nm_notify() == netmap_bwrap_intr_notify(ring_nr != host ring)
364  *                     kring->nm_sync() == DEVICE_netmap_rxsync()
365  *                     netmap_vp_txsync()
366  *                     kring->nm_sync() == DEVICE_netmap_rxsync()
367  *         from host stack:
368  *             netmap_transmit()
369  *                na->nm_notify() == netmap_bwrap_intr_notify(ring_nr == host ring)
370  *                     kring->nm_sync() == netmap_rxsync_from_host()
371  *                     netmap_vp_txsync()
372  *
373  *      - system device with generic support:
374  *         from device driver:
375  *            generic_rx_handler()
376  *                na->nm_notify() == netmap_bwrap_intr_notify(ring_nr != host ring)
377  *                     kring->nm_sync() == generic_netmap_rxsync()
378  *                     netmap_vp_txsync()
379  *                     kring->nm_sync() == generic_netmap_rxsync()
380  *         from host stack:
381  *            netmap_transmit()
382  *                na->nm_notify() == netmap_bwrap_intr_notify(ring_nr == host ring)
383  *                     kring->nm_sync() == netmap_rxsync_from_host()
384  *                     netmap_vp_txsync()
385  *
386  *   (all cases) --> nm_bdg_flush()
387  *                      dest_na->nm_notify() == (see below)
388  *
389  *   OUTGOING:
390  *
391  *      - VALE ports:
392  *         concurrently:
393  *             1) ioctl(NIOCRXSYNC)/netmap_poll() in process context
394  *                    kring->nm_sync() == netmap_vp_rxsync()
395  *             2) from nm_bdg_flush()
396  *                    na->nm_notify() == netmap_notify()
397  *
398  *      - system device with native support:
399  *          to cable:
400  *             na->nm_notify() == netmap_bwrap_notify()
401  *                 netmap_vp_rxsync()
402  *                 kring->nm_sync() == DEVICE_netmap_txsync()
403  *                 netmap_vp_rxsync()
404  *          to host stack:
405  *                 netmap_vp_rxsync()
406  *                 kring->nm_sync() == netmap_txsync_to_host
407  *                 netmap_vp_rxsync_locked()
408  *
409  *      - system device with generic adapter:
410  *          to device driver:
411  *             na->nm_notify() == netmap_bwrap_notify()
412  *                 netmap_vp_rxsync()
413  *                 kring->nm_sync() == generic_netmap_txsync()
414  *                 netmap_vp_rxsync()
415  *          to host stack:
416  *                 netmap_vp_rxsync()
417  *                 kring->nm_sync() == netmap_txsync_to_host
418  *                 netmap_vp_rxsync()
419  *
420  */
421 
422 /*
423  * OS-specific code that is used only within this file.
424  * Other OS-specific code that must be accessed by drivers
425  * is present in netmap_kern.h
426  */
427 
428 #if defined(__FreeBSD__)
429 #include <sys/cdefs.h> /* prerequisite */
430 #include <sys/types.h>
431 #include <sys/errno.h>
432 #include <sys/param.h>	/* defines used in kernel.h */
433 #include <sys/kernel.h>	/* types used in module initialization */
434 #include <sys/conf.h>	/* cdevsw struct, UID, GID */
435 #include <sys/filio.h>	/* FIONBIO */
436 #include <sys/sockio.h>
437 #include <sys/socketvar.h>	/* struct socket */
438 #include <sys/malloc.h>
439 #include <sys/poll.h>
440 #include <sys/rwlock.h>
441 #include <sys/socket.h> /* sockaddrs */
442 #include <sys/selinfo.h>
443 #include <sys/sysctl.h>
444 #include <sys/jail.h>
445 #include <net/vnet.h>
446 #include <net/if.h>
447 #include <net/if_var.h>
448 #include <net/bpf.h>		/* BIOCIMMEDIATE */
449 #include <machine/bus.h>	/* bus_dmamap_* */
450 #include <sys/endian.h>
451 #include <sys/refcount.h>
452 
453 
454 #elif defined(linux)
455 
456 #include "bsd_glue.h"
457 
458 #elif defined(__APPLE__)
459 
460 #warning OSX support is only partial
461 #include "osx_glue.h"
462 
463 #elif defined (_WIN32)
464 
465 #include "win_glue.h"
466 
467 #else
468 
469 #error	Unsupported platform
470 
471 #endif /* unsupported */
472 
473 /*
474  * common headers
475  */
476 #include <net/netmap.h>
477 #include <dev/netmap/netmap_kern.h>
478 #include <dev/netmap/netmap_mem2.h>
479 
480 
481 /* user-controlled variables */
482 int netmap_verbose;
483 
484 static int netmap_no_timestamp; /* don't timestamp on rxsync */
485 int netmap_mitigate = 1;
486 int netmap_no_pendintr = 1;
487 int netmap_txsync_retry = 2;
488 int netmap_flags = 0;	/* debug flags */
489 static int netmap_fwd = 0;	/* force transparent forwarding */
490 
491 /*
492  * netmap_admode selects the netmap mode to use.
493  * Invalid values are reset to NETMAP_ADMODE_BEST
494  */
495 enum {	NETMAP_ADMODE_BEST = 0,	/* use native, fallback to generic */
496 	NETMAP_ADMODE_NATIVE,	/* either native or none */
497 	NETMAP_ADMODE_GENERIC,	/* force generic */
498 	NETMAP_ADMODE_LAST };
499 static int netmap_admode = NETMAP_ADMODE_BEST;
500 
501 /* netmap_generic_mit controls mitigation of RX notifications for
502  * the generic netmap adapter. The value is a time interval in
503  * nanoseconds. */
504 int netmap_generic_mit = 100*1000;
505 
506 /* By default we use netmap-aware qdiscs with generic netmap adapters,
507  * even if there can be a little performance hit with hardware NICs.
508  * However, using the qdisc is the safer approach, for two reasons:
509  * 1) it prevents non-fifo qdiscs from breaking the TX notification
510  *    scheme, which is based on mbuf destructors when txqdisc is
511  *    not used.
512  * 2) it makes it possible to transmit over software devices that
513  *    change skb->dev, like bridge, veth, ...
514  *
515  * In any case, users looking for the best performance should
516  * use native adapters.
517  */
518 int netmap_generic_txqdisc = 1;
519 
520 /* Default number of slots and queues for generic adapters. */
521 int netmap_generic_ringsize = 1024;
522 int netmap_generic_rings = 1;
523 
524 /* Non-zero if ptnet devices are allowed to use virtio-net headers. */
525 int ptnet_vnet_hdr = 1;
526 
527 /* 0 if ptnetmap should not use worker threads for TX processing */
528 int ptnetmap_tx_workers = 1;
529 
530 /*
531  * SYSCTL calls are grouped between SYSBEGIN and SYSEND so that they
532  * can be emulated on other operating systems
533  */
534 SYSBEGIN(main_init);
535 
536 SYSCTL_DECL(_dev_netmap);
537 SYSCTL_NODE(_dev, OID_AUTO, netmap, CTLFLAG_RW, 0, "Netmap args");
538 SYSCTL_INT(_dev_netmap, OID_AUTO, verbose,
539     CTLFLAG_RW, &netmap_verbose, 0, "Verbose mode");
540 SYSCTL_INT(_dev_netmap, OID_AUTO, no_timestamp,
541     CTLFLAG_RW, &netmap_no_timestamp, 0, "no_timestamp");
542 SYSCTL_INT(_dev_netmap, OID_AUTO, mitigate, CTLFLAG_RW, &netmap_mitigate, 0, "");
543 SYSCTL_INT(_dev_netmap, OID_AUTO, no_pendintr,
544     CTLFLAG_RW, &netmap_no_pendintr, 0, "Always look for new received packets.");
545 SYSCTL_INT(_dev_netmap, OID_AUTO, txsync_retry, CTLFLAG_RW,
546     &netmap_txsync_retry, 0 , "Number of txsync loops in bridge's flush.");
547 
548 SYSCTL_INT(_dev_netmap, OID_AUTO, flags, CTLFLAG_RW, &netmap_flags, 0 , "");
549 SYSCTL_INT(_dev_netmap, OID_AUTO, fwd, CTLFLAG_RW, &netmap_fwd, 0 , "");
550 SYSCTL_INT(_dev_netmap, OID_AUTO, admode, CTLFLAG_RW, &netmap_admode, 0 , "");
551 SYSCTL_INT(_dev_netmap, OID_AUTO, generic_mit, CTLFLAG_RW, &netmap_generic_mit, 0 , "");
552 SYSCTL_INT(_dev_netmap, OID_AUTO, generic_ringsize, CTLFLAG_RW, &netmap_generic_ringsize, 0 , "");
553 SYSCTL_INT(_dev_netmap, OID_AUTO, generic_rings, CTLFLAG_RW, &netmap_generic_rings, 0 , "");
554 SYSCTL_INT(_dev_netmap, OID_AUTO, generic_txqdisc, CTLFLAG_RW, &netmap_generic_txqdisc, 0 , "");
555 SYSCTL_INT(_dev_netmap, OID_AUTO, ptnet_vnet_hdr, CTLFLAG_RW, &ptnet_vnet_hdr, 0 , "");
556 SYSCTL_INT(_dev_netmap, OID_AUTO, ptnetmap_tx_workers, CTLFLAG_RW, &ptnetmap_tx_workers, 0 , "");
557 
558 SYSEND;
559 
560 NMG_LOCK_T	netmap_global_lock;
561 
562 /*
563  * mark the ring as stopped, and run through the locks
564  * to make sure other users get to see it.
565  * stopped must be either NM_KR_STOPPED (for unbounded stop)
566  * or NM_KR_LOCKED (brief stop for mutual exclusion purposes)
567  */
568 static void
569 netmap_disable_ring(struct netmap_kring *kr, int stopped)
570 {
571 	nm_kr_stop(kr, stopped);
572 	// XXX check if nm_kr_stop is sufficient
573 	mtx_lock(&kr->q_lock);
574 	mtx_unlock(&kr->q_lock);
575 	nm_kr_put(kr);
576 }
577 
578 /* stop or enable a single ring */
579 void
580 netmap_set_ring(struct netmap_adapter *na, u_int ring_id, enum txrx t, int stopped)
581 {
582 	if (stopped)
583 		netmap_disable_ring(NMR(na, t) + ring_id, stopped);
584 	else
585 		NMR(na, t)[ring_id].nkr_stopped = 0;
586 }
587 
588 
589 /* stop or enable all the rings of na */
590 void
591 netmap_set_all_rings(struct netmap_adapter *na, int stopped)
592 {
593 	int i;
594 	enum txrx t;
595 
596 	if (!nm_netmap_on(na))
597 		return;
598 
599 	for_rx_tx(t) {
600 		for (i = 0; i < netmap_real_rings(na, t); i++) {
601 			netmap_set_ring(na, i, t, stopped);
602 		}
603 	}
604 }
605 
606 /*
607  * Convenience function used in drivers.  Waits for current txsync()s/rxsync()s
608  * to finish and prevents any new one from starting.  Call this before turning
609  * netmap mode off, or before removing the hardware rings (e.g., on module
610  * unload).
611  */
612 void
613 netmap_disable_all_rings(struct ifnet *ifp)
614 {
615 	if (NM_NA_VALID(ifp)) {
616 		netmap_set_all_rings(NA(ifp), NM_KR_STOPPED);
617 	}
618 }
619 
620 /*
621  * Convenience function used in drivers.  Re-enables rxsync and txsync on the
622  * adapter's rings.  In linux drivers, this should be placed near each
623  * napi_enable().
624  */
625 void
626 netmap_enable_all_rings(struct ifnet *ifp)
627 {
628 	if (NM_NA_VALID(ifp)) {
629 		netmap_set_all_rings(NA(ifp), 0 /* enabled */);
630 	}
631 }
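
/*
 * Illustrative sketch (not taken from any specific driver) of how the two
 * helpers above are meant to bracket a hardware ring reconfiguration;
 * DEVICE_stop()/DEVICE_init() stand for the driver's own routines:
 *
 *	netmap_disable_all_rings(ifp);	// wait for pending *xsync() to drain
 *	DEVICE_stop(sc);
 *	DEVICE_init(sc);		// reprogram the hardware rings
 *	netmap_enable_all_rings(ifp);	// let new syncs through again
 */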
632 
633 void
634 netmap_make_zombie(struct ifnet *ifp)
635 {
636 	if (NM_NA_VALID(ifp)) {
637 		struct netmap_adapter *na = NA(ifp);
638 		netmap_set_all_rings(na, NM_KR_LOCKED);
639 		na->na_flags |= NAF_ZOMBIE;
640 		netmap_set_all_rings(na, 0);
641 	}
642 }
643 
644 void
645 netmap_undo_zombie(struct ifnet *ifp)
646 {
647 	if (NM_NA_VALID(ifp)) {
648 		struct netmap_adapter *na = NA(ifp);
649 		if (na->na_flags & NAF_ZOMBIE) {
650 			netmap_set_all_rings(na, NM_KR_LOCKED);
651 			na->na_flags &= ~NAF_ZOMBIE;
652 			netmap_set_all_rings(na, 0);
653 		}
654 	}
655 }
656 
657 /*
658  * generic bounds-checking function
659  */
660 u_int
661 nm_bound_var(u_int *v, u_int dflt, u_int lo, u_int hi, const char *msg)
662 {
663 	u_int oldv = *v;
664 	const char *op = NULL;
665 
666 	if (dflt < lo)
667 		dflt = lo;
668 	if (dflt > hi)
669 		dflt = hi;
670 	if (oldv < lo) {
671 		*v = dflt;
672 		op = "Bump";
673 	} else if (oldv > hi) {
674 		*v = hi;
675 		op = "Clamp";
676 	}
677 	if (op && msg)
678 		nm_prinf("%s %s to %d (was %d)\n", op, msg, *v, oldv);
679 	return *v;
680 }
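
/*
 * Example use of nm_bound_var(), with illustrative values only: clamp a
 * user-settable ring size into a sane range, falling back to the default
 * if the current value is below the minimum:
 *
 *	u_int ringsize = 2000000;	// out of range
 *	nm_bound_var(&ringsize, 1024, 64, 16384, "ringsize");
 *	// ringsize is now clamped to 16384
 */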
681 
682 
683 /*
684  * packet-dump function, writing into a user-supplied or a static buffer.
685  * The destination buffer must be at least 30+4*len bytes.
686  */
687 const char *
688 nm_dump_buf(char *p, int len, int lim, char *dst)
689 {
690 	static char _dst[8192];
691 	int i, j, i0;
692 	static char hex[] ="0123456789abcdef";
693 	char *o;	/* output position */
694 
695 #define P_HI(x)	hex[((x) & 0xf0)>>4]
696 #define P_LO(x)	hex[((x) & 0xf)]
697 #define P_C(x)	((x) >= 0x20 && (x) <= 0x7e ? (x) : '.')
698 	if (!dst)
699 		dst = _dst;
700 	if (lim <= 0 || lim > len)
701 		lim = len;
702 	o = dst;
703 	sprintf(o, "buf 0x%p len %d lim %d\n", p, len, lim);
704 	o += strlen(o);
705 	/* hexdump routine */
706 	for (i = 0; i < lim; ) {
707 		sprintf(o, "%5d: ", i);
708 		o += strlen(o);
709 		memset(o, ' ', 48);
710 		i0 = i;
711 		for (j=0; j < 16 && i < lim; i++, j++) {
712 			o[j*3] = P_HI(p[i]);
713 			o[j*3+1] = P_LO(p[i]);
714 		}
715 		i = i0;
716 		for (j=0; j < 16 && i < lim; i++, j++)
717 			o[j + 48] = P_C(p[i]);
718 		o[j+48] = '\n';
719 		o += j+49;
720 	}
721 	*o = '\0';
722 #undef P_HI
723 #undef P_LO
724 #undef P_C
725 	return dst;
726 }
727 
728 
729 /*
730  * Fetch configuration from the device, to cope with dynamic
731  * reconfigurations after loading the module.
732  */
733 /* call with NMG_LOCK held */
734 int
735 netmap_update_config(struct netmap_adapter *na)
736 {
737 	u_int txr, txd, rxr, rxd;
738 
739 	txr = txd = rxr = rxd = 0;
740 	if (na->nm_config == NULL ||
741 	    na->nm_config(na, &txr, &txd, &rxr, &rxd))
742 	{
743 		/* take whatever we had at init time */
744 		txr = na->num_tx_rings;
745 		txd = na->num_tx_desc;
746 		rxr = na->num_rx_rings;
747 		rxd = na->num_rx_desc;
748 	}
749 
750 	if (na->num_tx_rings == txr && na->num_tx_desc == txd &&
751 	    na->num_rx_rings == rxr && na->num_rx_desc == rxd)
752 		return 0; /* nothing changed */
753 	if (netmap_verbose || na->active_fds > 0) {
754 		D("stored config %s: txring %d x %d, rxring %d x %d",
755 			na->name,
756 			na->num_tx_rings, na->num_tx_desc,
757 			na->num_rx_rings, na->num_rx_desc);
758 		D("new config %s: txring %d x %d, rxring %d x %d",
759 			na->name, txr, txd, rxr, rxd);
760 	}
761 	if (na->active_fds == 0) {
762 		D("configuration changed (but fine)");
763 		na->num_tx_rings = txr;
764 		na->num_tx_desc = txd;
765 		na->num_rx_rings = rxr;
766 		na->num_rx_desc = rxd;
767 		return 0;
768 	}
769 	D("configuration changed while active, this is bad...");
770 	return 1;
771 }
772 
773 /* nm_sync callbacks for the host rings */
774 static int netmap_txsync_to_host(struct netmap_kring *kring, int flags);
775 static int netmap_rxsync_from_host(struct netmap_kring *kring, int flags);
776 
777 /* create the krings array and initialize the fields common to all adapters.
778  * The array layout is this:
779  *
780  *                    +----------+
781  * na->tx_rings ----->|          | \
782  *                    |          |  } na->num_tx_rings
783  *                    |          | /
784  *                    +----------+
785  *                    |          |    host tx kring
786  * na->rx_rings ----> +----------+
787  *                    |          | \
788  *                    |          |  } na->num_rx_rings
789  *                    |          | /
790  *                    +----------+
791  *                    |          |    host rx kring
792  *                    +----------+
793  * na->tailroom ----->|          | \
794  *                    |          |  } tailroom bytes
795  *                    |          | /
796  *                    +----------+
797  *
798  * Note: for compatibility, host krings are created even when not needed.
799  * The tailroom space is currently used by vale ports for allocating leases.
800  */
801 /* call with NMG_LOCK held */
802 int
803 netmap_krings_create(struct netmap_adapter *na, u_int tailroom)
804 {
805 	u_int i, len, ndesc;
806 	struct netmap_kring *kring;
807 	u_int n[NR_TXRX];
808 	enum txrx t;
809 
810 	if (na->tx_rings != NULL) {
811 		D("warning: krings were already created");
812 		return 0;
813 	}
814 
815 	/* account for the (possibly fake) host rings */
816 	n[NR_TX] = na->num_tx_rings + 1;
817 	n[NR_RX] = na->num_rx_rings + 1;
818 
819 	len = (n[NR_TX] + n[NR_RX]) * sizeof(struct netmap_kring) + tailroom;
820 
821 	na->tx_rings = nm_os_malloc((size_t)len);
822 	if (na->tx_rings == NULL) {
823 		D("Cannot allocate krings");
824 		return ENOMEM;
825 	}
826 	na->rx_rings = na->tx_rings + n[NR_TX];
827 
828 	/*
829 	 * All fields in krings are 0 except the ones initialized below,
830 	 * but better be explicit on important kring fields.
831 	 */
832 	for_rx_tx(t) {
833 		ndesc = nma_get_ndesc(na, t);
834 		for (i = 0; i < n[t]; i++) {
835 			kring = &NMR(na, t)[i];
836 			bzero(kring, sizeof(*kring));
837 			kring->na = na;
838 			kring->ring_id = i;
839 			kring->tx = t;
840 			kring->nkr_num_slots = ndesc;
841 			kring->nr_mode = NKR_NETMAP_OFF;
842 			kring->nr_pending_mode = NKR_NETMAP_OFF;
843 			if (i < nma_get_nrings(na, t)) {
844 				kring->nm_sync = (t == NR_TX ? na->nm_txsync : na->nm_rxsync);
845 			} else {
846 				kring->nm_sync = (t == NR_TX ?
847 						netmap_txsync_to_host:
848 						netmap_rxsync_from_host);
849 			}
850 			kring->nm_notify = na->nm_notify;
851 			kring->rhead = kring->rcur = kring->nr_hwcur = 0;
852 			/*
853 			 * IMPORTANT: Always keep one slot empty.
854 			 */
855 			kring->rtail = kring->nr_hwtail = (t == NR_TX ? ndesc - 1 : 0);
856 			snprintf(kring->name, sizeof(kring->name) - 1, "%s %s%d", na->name,
857 					nm_txrx2str(t), i);
858 			ND("ktx %s h %d c %d t %d",
859 				kring->name, kring->rhead, kring->rcur, kring->rtail);
860 			mtx_init(&kring->q_lock, (t == NR_TX ? "nm_txq_lock" : "nm_rxq_lock"), NULL, MTX_DEF);
861 			nm_os_selinfo_init(&kring->si);
862 		}
863 		nm_os_selinfo_init(&na->si[t]);
864 	}
865 
866 	na->tailroom = na->rx_rings + n[NR_RX];
867 
868 	return 0;
869 }
870 
871 
872 /* undo the actions performed by netmap_krings_create */
873 /* call with NMG_LOCK held */
874 void
875 netmap_krings_delete(struct netmap_adapter *na)
876 {
877 	struct netmap_kring *kring = na->tx_rings;
878 	enum txrx t;
879 
880 	if (na->tx_rings == NULL) {
881 		D("warning: krings were already deleted");
882 		return;
883 	}
884 
885 	for_rx_tx(t)
886 		nm_os_selinfo_uninit(&na->si[t]);
887 
888 	/* we rely on the krings layout described above */
889 	for ( ; kring != na->tailroom; kring++) {
890 		mtx_destroy(&kring->q_lock);
891 		nm_os_selinfo_uninit(&kring->si);
892 	}
893 	nm_os_free(na->tx_rings);
894 	na->tx_rings = na->rx_rings = na->tailroom = NULL;
895 }
896 
897 
898 /*
899  * Destructor for NIC ports. They also have an mbuf queue
900  * on the rings connected to the host so we need to purge
901  * them first.
902  */
903 /* call with NMG_LOCK held */
904 void
905 netmap_hw_krings_delete(struct netmap_adapter *na)
906 {
907 	struct mbq *q = &na->rx_rings[na->num_rx_rings].rx_queue;
908 
909 	ND("destroy sw mbq with len %d", mbq_len(q));
910 	mbq_purge(q);
911 	mbq_safe_fini(q);
912 	netmap_krings_delete(na);
913 }
914 
915 
916 
917 /*
918  * Undo everything that was done in netmap_do_regif(). In particular,
919  * call na->nm_register(na, 0) to stop netmap mode on the interface and
920  * revert to normal operation.
921  */
922 /* call with NMG_LOCK held */
923 static void netmap_unset_ringid(struct netmap_priv_d *);
924 static void netmap_krings_put(struct netmap_priv_d *);
925 void
926 netmap_do_unregif(struct netmap_priv_d *priv)
927 {
928 	struct netmap_adapter *na = priv->np_na;
929 
930 	NMG_LOCK_ASSERT();
931 	na->active_fds--;
932 	/* unset nr_pending_mode and possibly release exclusive mode */
933 	netmap_krings_put(priv);
934 
935 #ifdef	WITH_MONITOR
936 	/* XXX check whether we have to do something with monitor
937 	 * when rings change nr_mode. */
938 	if (na->active_fds <= 0) {
939 		/* walk through all the rings and tell any monitor
940 		 * that the port is going to exit netmap mode
941 		 */
942 		netmap_monitor_stop(na);
943 	}
944 #endif
945 
946 	if (na->active_fds <= 0 || nm_kring_pending(priv)) {
947 		na->nm_register(na, 0);
948 	}
949 
950 	/* delete rings and buffers that are no longer needed */
951 	netmap_mem_rings_delete(na);
952 
953 	if (na->active_fds <= 0) {	/* last instance */
954 		/*
955 		 * (TO CHECK) We enter here
956 		 * when the last reference to this file descriptor goes
957 		 * away. This means we cannot have any pending poll()
958 		 * or interrupt routine operating on the structure.
959 		 * XXX The file may be closed in a thread while
960 		 * another thread is using it.
961 		 * Linux keeps the file opened until the last reference
962 		 * by any outstanding ioctl/poll or mmap is gone.
963 		 * FreeBSD does not track mmap()s (but we do) and
964 		 * wakes up any sleeping poll(). Need to check what
965 		 * happens if the close() occurs while a concurrent
966 		 * syscall is running.
967 		 */
968 		if (netmap_verbose)
969 			D("deleting last instance for %s", na->name);
970 
971 		if (nm_netmap_on(na)) {
972 			D("BUG: netmap on while going to delete the krings");
973 		}
974 
975 		na->nm_krings_delete(na);
976 	}
977 
978 	/* possibly decrement the counter of tx_si/rx_si users */
979 	netmap_unset_ringid(priv);
980 	/* delete the nifp */
981 	netmap_mem_if_delete(na, priv->np_nifp);
982 	/* drop the allocator */
983 	netmap_mem_deref(na->nm_mem, na);
984 	/* mark the priv as unregistered */
985 	priv->np_na = NULL;
986 	priv->np_nifp = NULL;
987 }
988 
989 /* call with NMG_LOCK held */
990 static __inline int
991 nm_si_user(struct netmap_priv_d *priv, enum txrx t)
992 {
993 	return (priv->np_na != NULL &&
994 		(priv->np_qlast[t] - priv->np_qfirst[t] > 1));
995 }
996 
997 struct netmap_priv_d*
998 netmap_priv_new(void)
999 {
1000 	struct netmap_priv_d *priv;
1001 
1002 	priv = nm_os_malloc(sizeof(struct netmap_priv_d));
1003 	if (priv == NULL)
1004 		return NULL;
1005 	priv->np_refs = 1;
1006 	nm_os_get_module();
1007 	return priv;
1008 }
1009 
1010 /*
1011  * Destructor of the netmap_priv_d, called when the fd is closed.
1012  * Action: undo all the things done by NIOCREGIF.
1013  * On FreeBSD we need to track whether there are active mmap()s,
1014  * and we use np_active_mmaps for that. On linux, the field is always 0.
1015  * The priv itself is freed only when the last reference to it is gone.
1016  *
1017  */
1018 /* call with NMG_LOCK held */
1019 void
1020 netmap_priv_delete(struct netmap_priv_d *priv)
1021 {
1022 	struct netmap_adapter *na = priv->np_na;
1023 
1024 	/* number of active references to this fd */
1025 	if (--priv->np_refs > 0) {
1026 		return;
1027 	}
1028 	nm_os_put_module();
1029 	if (na) {
1030 		netmap_do_unregif(priv);
1031 	}
1032 	netmap_unget_na(na, priv->np_ifp);
1033 	bzero(priv, sizeof(*priv));	/* for safety */
1034 	nm_os_free(priv);
1035 }
1036 
1037 
1038 /* call with NMG_LOCK *not* held */
1039 void
1040 netmap_dtor(void *data)
1041 {
1042 	struct netmap_priv_d *priv = data;
1043 
1044 	NMG_LOCK();
1045 	netmap_priv_delete(priv);
1046 	NMG_UNLOCK();
1047 }
1048 
1049 
1050 /*
1051  * Handlers for synchronization of the rings from/to the host stack.
1052  * These are associated to a network interface and are just another
1053  * ring pair managed by userspace.
1054  *
1055  * Netmap also supports transparent forwarding (NS_FORWARD and NR_FORWARD
1056  * flags):
1057  *
1058  * - Before releasing buffers on hw RX rings, the application can mark
1059  *   them with the NS_FORWARD flag. During the next RXSYNC or poll(), they
1060  *   will be forwarded to the host stack, similarly to what happened if
1061  *   the application moved them to the host TX ring.
1062  *
1063  * - Before releasing buffers on the host RX ring, the application can
1064  *   mark them with the NS_FORWARD flag. During the next RXSYNC or poll(),
1065  *   they will be forwarded to the hw TX rings, saving the application
1066  *   from doing the same task in user-space.
1067  *
1068  * Transparent forwarding can be enabled per-ring, by setting the NR_FORWARD
1069  * flag, or globally with the netmap_fwd sysctl.
1070  *
1071  * The transfer NIC --> host is relatively easy, just encapsulate
1072  * into mbufs and we are done. The host --> NIC side is slightly
1073  * harder because there might not be room in the tx ring so it
1074  * might take a while before releasing the buffer.
1075  */
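
/*
 * Illustrative userspace sketch (not a complete program): opting a ring
 * into transparent forwarding and marking one received slot so that the
 * next rxsync/poll() pushes a copy to the other side:
 *
 *	ring->flags |= NR_FORWARD;	// per-ring opt-in, or sysctl dev.netmap.fwd
 *	...
 *	struct netmap_slot *slot = &ring->slot[ring->cur];
 *	slot->flags |= NS_FORWARD;	// forward this buffer on the next sync
 *	ring->head = ring->cur = nm_ring_next(ring, ring->cur);
 */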
1076 
1077 
1078 /*
1079  * Pass a whole queue of mbufs to the host stack as coming from 'dst'
1080  * We do not need to lock because the queue is private.
1081  * After this call the queue is empty.
1082  */
1083 static void
1084 netmap_send_up(struct ifnet *dst, struct mbq *q)
1085 {
1086 	struct mbuf *m;
1087 	struct mbuf *head = NULL, *prev = NULL;
1088 
1089 	/* Send packets up, outside the lock; head/prev machinery
1090 	 * is only useful for Windows. */
1091 	while ((m = mbq_dequeue(q)) != NULL) {
1092 		if (netmap_verbose & NM_VERB_HOST)
1093 			D("sending up pkt %p size %d", m, MBUF_LEN(m));
1094 		prev = nm_os_send_up(dst, m, prev);
1095 		if (head == NULL)
1096 			head = prev;
1097 	}
1098 	if (head)
1099 		nm_os_send_up(dst, NULL, head);
1100 	mbq_fini(q);
1101 }
1102 
1103 
1104 /*
1105  * Scan the buffers from hwcur to ring->head, and put a copy of those
1106  * marked NS_FORWARD (or all of them if forced) into a queue of mbufs.
1107  * Drop remaining packets in the unlikely event
1108  * of an mbuf shortage.
1109  */
1110 static void
1111 netmap_grab_packets(struct netmap_kring *kring, struct mbq *q, int force)
1112 {
1113 	u_int const lim = kring->nkr_num_slots - 1;
1114 	u_int const head = kring->rhead;
1115 	u_int n;
1116 	struct netmap_adapter *na = kring->na;
1117 
1118 	for (n = kring->nr_hwcur; n != head; n = nm_next(n, lim)) {
1119 		struct mbuf *m;
1120 		struct netmap_slot *slot = &kring->ring->slot[n];
1121 
1122 		if ((slot->flags & NS_FORWARD) == 0 && !force)
1123 			continue;
1124 		if (slot->len < 14 || slot->len > NETMAP_BUF_SIZE(na)) {
1125 			RD(5, "bad pkt at %d len %d", n, slot->len);
1126 			continue;
1127 		}
1128 		slot->flags &= ~NS_FORWARD; // XXX needed ?
1129 		/* XXX TODO: adapt to the case of a multisegment packet */
1130 		m = m_devget(NMB(na, slot), slot->len, 0, na->ifp, NULL);
1131 
1132 		if (m == NULL)
1133 			break;
1134 		mbq_enqueue(q, m);
1135 	}
1136 }
1137 
1138 static inline int
1139 _nm_may_forward(struct netmap_kring *kring)
1140 {
1141 	return	((netmap_fwd || kring->ring->flags & NR_FORWARD) &&
1142 		 kring->na->na_flags & NAF_HOST_RINGS &&
1143 		 kring->tx == NR_RX);
1144 }
1145 
1146 static inline int
1147 nm_may_forward_up(struct netmap_kring *kring)
1148 {
1149 	return	_nm_may_forward(kring) &&
1150 		 kring->ring_id != kring->na->num_rx_rings;
1151 }
1152 
1153 static inline int
1154 nm_may_forward_down(struct netmap_kring *kring, int sync_flags)
1155 {
1156 	return	_nm_may_forward(kring) &&
1157 		 (sync_flags & NAF_CAN_FORWARD_DOWN) &&
1158 		 kring->ring_id == kring->na->num_rx_rings;
1159 }
1160 
1161 /*
1162  * Send to the NIC rings packets marked NS_FORWARD between
1163  * kring->nr_hwcur and kring->rhead.
1164  * Called under kring->rx_queue.lock on the sw rx ring.
1165  *
1166  * It can only be called if the user opened all the TX hw rings,
1167  * see NAF_CAN_FORWARD_DOWN flag.
1168  * We can touch the TX netmap rings (slots, head and cur) since
1169  * we are in poll/ioctl system call context, and the application
1170  * is not supposed to touch the ring (using a different thread)
1171  * during the execution of the system call.
1172  */
1173 static u_int
1174 netmap_sw_to_nic(struct netmap_adapter *na)
1175 {
1176 	struct netmap_kring *kring = &na->rx_rings[na->num_rx_rings];
1177 	struct netmap_slot *rxslot = kring->ring->slot;
1178 	u_int i, rxcur = kring->nr_hwcur;
1179 	u_int const head = kring->rhead;
1180 	u_int const src_lim = kring->nkr_num_slots - 1;
1181 	u_int sent = 0;
1182 
1183 	/* scan rings to find space, then fill as much as possible */
1184 	for (i = 0; i < na->num_tx_rings; i++) {
1185 		struct netmap_kring *kdst = &na->tx_rings[i];
1186 		struct netmap_ring *rdst = kdst->ring;
1187 		u_int const dst_lim = kdst->nkr_num_slots - 1;
1188 
1189 		/* XXX do we trust ring or kring->rcur,rtail ? */
1190 		for (; rxcur != head && !nm_ring_empty(rdst);
1191 		     rxcur = nm_next(rxcur, src_lim) ) {
1192 			struct netmap_slot *src, *dst, tmp;
1193 			u_int dst_head = rdst->head;
1194 
1195 			src = &rxslot[rxcur];
1196 			if ((src->flags & NS_FORWARD) == 0 && !netmap_fwd)
1197 				continue;
1198 
1199 			sent++;
1200 
1201 			dst = &rdst->slot[dst_head];
1202 
1203 			tmp = *src;
1204 
1205 			src->buf_idx = dst->buf_idx;
1206 			src->flags = NS_BUF_CHANGED;
1207 
1208 			dst->buf_idx = tmp.buf_idx;
1209 			dst->len = tmp.len;
1210 			dst->flags = NS_BUF_CHANGED;
1211 
1212 			rdst->head = rdst->cur = nm_next(dst_head, dst_lim);
1213 		}
1214 		/* if (sent) XXX txsync ? it would be just an optimization */
1215 	}
1216 	return sent;
1217 }
1218 
1219 
1220 /*
1221  * netmap_txsync_to_host() passes packets up. We are called from a
1222  * system call in user process context, and the only contention
1223  * can be among multiple user threads erroneously calling
1224  * this routine concurrently.
1225  */
1226 static int
1227 netmap_txsync_to_host(struct netmap_kring *kring, int flags)
1228 {
1229 	struct netmap_adapter *na = kring->na;
1230 	u_int const lim = kring->nkr_num_slots - 1;
1231 	u_int const head = kring->rhead;
1232 	struct mbq q;
1233 
1234 	/* Take packets from hwcur to head and pass them up.
1235 	 * Force hwcur = head since netmap_grab_packets() stops at head
1236 	 */
1237 	mbq_init(&q);
1238 	netmap_grab_packets(kring, &q, 1 /* force */);
1239 	ND("have %d pkts in queue", mbq_len(&q));
1240 	kring->nr_hwcur = head;
1241 	kring->nr_hwtail = head + lim;
1242 	if (kring->nr_hwtail > lim)
1243 		kring->nr_hwtail -= lim + 1;
1244 
1245 	netmap_send_up(na->ifp, &q);
1246 	return 0;
1247 }
1248 
1249 
1250 /*
1251  * rxsync backend for packets coming from the host stack.
1252  * They have been put in kring->rx_queue by netmap_transmit().
1253  * We protect access to the kring using kring->rx_queue.lock.
1254  *
1255  * This also moves to the nic hw rings any packets the user has marked
1256  * for transparent-mode forwarding, then sets the NR_FORWARD
1257  * flag in the kring to let the caller push them out.
1258  */
1259 static int
1260 netmap_rxsync_from_host(struct netmap_kring *kring, int flags)
1261 {
1262 	struct netmap_adapter *na = kring->na;
1263 	struct netmap_ring *ring = kring->ring;
1264 	u_int nm_i, n;
1265 	u_int const lim = kring->nkr_num_slots - 1;
1266 	u_int const head = kring->rhead;
1267 	int ret = 0;
1268 	struct mbq *q = &kring->rx_queue, fq;
1269 
1270 	mbq_init(&fq); /* fq holds packets to be freed */
1271 
1272 	mbq_lock(q);
1273 
1274 	/* First part: import newly received packets */
1275 	n = mbq_len(q);
1276 	if (n) { /* grab packets from the queue */
1277 		struct mbuf *m;
1278 		uint32_t stop_i;
1279 
1280 		nm_i = kring->nr_hwtail;
1281 		stop_i = nm_prev(kring->nr_hwcur, lim);
1282 		while ( nm_i != stop_i && (m = mbq_dequeue(q)) != NULL ) {
1283 			int len = MBUF_LEN(m);
1284 			struct netmap_slot *slot = &ring->slot[nm_i];
1285 
1286 			m_copydata(m, 0, len, NMB(na, slot));
1287 			ND("nm %d len %d", nm_i, len);
1288 			if (netmap_verbose)
1289 				D("%s", nm_dump_buf(NMB(na, slot), len, 128, NULL));
1290 
1291 			slot->len = len;
1292 			slot->flags = kring->nkr_slot_flags;
1293 			nm_i = nm_next(nm_i, lim);
1294 			mbq_enqueue(&fq, m);
1295 		}
1296 		kring->nr_hwtail = nm_i;
1297 	}
1298 
1299 	/*
1300 	 * Second part: skip past packets that userspace has released.
1301 	 */
1302 	nm_i = kring->nr_hwcur;
1303 	if (nm_i != head) { /* something was released */
1304 		if (nm_may_forward_down(kring, flags)) {
1305 			ret = netmap_sw_to_nic(na);
1306 			if (ret > 0) {
1307 				kring->nr_kflags |= NR_FORWARD;
1308 				ret = 0;
1309 			}
1310 		}
1311 		kring->nr_hwcur = head;
1312 	}
1313 
1314 	mbq_unlock(q);
1315 
1316 	mbq_purge(&fq);
1317 	mbq_fini(&fq);
1318 
1319 	return ret;
1320 }
1321 
1322 
1323 /* Get a netmap adapter for the port.
1324  *
1325  * If it is possible to satisfy the request, return 0
1326  * with *na containing the netmap adapter found.
1327  * Otherwise return an error code, with *na containing NULL.
1328  *
1329  * When the port is attached to a bridge, we always return
1330  * EBUSY.
1331  * Otherwise, if the port is already bound to a file descriptor,
1332  * then we unconditionally return the existing adapter into *na.
1333  * In all the other cases, we return (into *na) either native,
1334  * generic or NULL, according to the following table:
1335  *
1336  *					native_support
1337  * active_fds   dev.netmap.admode         YES     NO
1338  * -------------------------------------------------------
1339  *    >0              *                 NA(ifp) NA(ifp)
1340  *
1341  *     0        NETMAP_ADMODE_BEST      NATIVE  GENERIC
1342  *     0        NETMAP_ADMODE_NATIVE    NATIVE   NULL
1343  *     0        NETMAP_ADMODE_GENERIC   GENERIC GENERIC
1344  *
1345  */
1346 static void netmap_hw_dtor(struct netmap_adapter *); /* needed by NM_IS_NATIVE() */
1347 int
1348 netmap_get_hw_na(struct ifnet *ifp, struct netmap_mem_d *nmd, struct netmap_adapter **na)
1349 {
1350 	/* generic support */
1351 	int i = netmap_admode;	/* Take a snapshot. */
1352 	struct netmap_adapter *prev_na;
1353 	int error = 0;
1354 
1355 	*na = NULL; /* default */
1356 
1357 	/* reset in case of invalid value */
1358 	if (i < NETMAP_ADMODE_BEST || i >= NETMAP_ADMODE_LAST)
1359 		i = netmap_admode = NETMAP_ADMODE_BEST;
1360 
1361 	if (NM_NA_VALID(ifp)) {
1362 		prev_na = NA(ifp);
1363 		/* If an adapter already exists, return it if
1364 		 * there are active file descriptors or if
1365 		 * netmap is not forced to use generic
1366 		 * adapters.
1367 		 */
1368 		if (NETMAP_OWNED_BY_ANY(prev_na)
1369 			|| i != NETMAP_ADMODE_GENERIC
1370 			|| prev_na->na_flags & NAF_FORCE_NATIVE
1371 #ifdef WITH_PIPES
1372 			/* ugly, but we cannot allow an adapter switch
1373 			 * if some pipe is referring to this one
1374 			 */
1375 			|| prev_na->na_next_pipe > 0
1376 #endif
1377 		) {
1378 			*na = prev_na;
1379 			goto assign_mem;
1380 		}
1381 	}
1382 
1383 	/* If there isn't native support and netmap is not allowed
1384 	 * to use generic adapters, we cannot satisfy the request.
1385 	 */
1386 	if (!NM_IS_NATIVE(ifp) && i == NETMAP_ADMODE_NATIVE)
1387 		return EOPNOTSUPP;
1388 
1389 	/* Otherwise, create a generic adapter and return it,
1390 	 * saving the previously used netmap adapter, if any.
1391 	 *
1392 	 * Note that here 'prev_na', if not NULL, MUST be a
1393 	 * native adapter, and CANNOT be a generic one. This is
1394 	 * true because generic adapters are created on demand, and
1395 	 * destroyed when not used anymore. Therefore, if the adapter
1396 	 * currently attached to an interface 'ifp' is generic, it
1397 	 * must be that
1398 	 * (NA(ifp)->active_fds > 0 || NETMAP_OWNED_BY_KERN(NA(ifp))).
1399 	 * Consequently, if NA(ifp) is generic, we will enter one of
1400 	 * the branches above. This ensures that we never override
1401 	 * a generic adapter with another generic adapter.
1402 	 */
1403 	error = generic_netmap_attach(ifp);
1404 	if (error)
1405 		return error;
1406 
1407 	*na = NA(ifp);
1408 
1409 assign_mem:
1410 	if (nmd != NULL && !((*na)->na_flags & NAF_MEM_OWNER) &&
1411 	    (*na)->active_fds == 0 && ((*na)->nm_mem != nmd)) {
1412 		netmap_mem_put((*na)->nm_mem);
1413 		(*na)->nm_mem = netmap_mem_get(nmd);
1414 	}
1415 
1416 	return 0;
1417 }
1418 
1419 /*
1420  * MUST BE CALLED UNDER NMG_LOCK()
1421  *
1422  * Get a refcounted reference to a netmap adapter attached
1423  * to the interface specified by nmr.
1424  * This is always called in the execution of an ioctl().
1425  *
1426  * Return ENXIO if the interface specified by the request does
1427  * not exist, ENOTSUP if netmap is not supported by the interface,
1428  * EBUSY if the interface is already attached to a bridge,
1429  * EINVAL if parameters are invalid, ENOMEM if needed resources
1430  * could not be allocated.
1431  * If successful, hold a reference to the netmap adapter.
1432  *
1433  * If the interface specified by nmr is a system one, also keep
1434  * a reference to it and return a valid *ifp.
1435  */
1436 int
1437 netmap_get_na(struct nmreq *nmr, struct netmap_adapter **na,
1438 	      struct ifnet **ifp, struct netmap_mem_d *nmd, int create)
1439 {
1440 	int error = 0;
1441 	struct netmap_adapter *ret = NULL;
1442 	int nmd_ref = 0;
1443 
1444 	*na = NULL;     /* default return value */
1445 	*ifp = NULL;
1446 
1447 	NMG_LOCK_ASSERT();
1448 
1449 	/* if the request contains a memid, try to find the
1450 	 * corresponding memory region
1451 	 */
1452 	if (nmd == NULL && nmr->nr_arg2) {
1453 		nmd = netmap_mem_find(nmr->nr_arg2);
1454 		if (nmd == NULL)
1455 			return EINVAL;
1456 		/* keep the reference */
1457 		nmd_ref = 1;
1458 	}
1459 
1460 	/* We cascade through all possible types of netmap adapter.
1461 	 * All netmap_get_*_na() functions return an error and an na,
1462 	 * with the following combinations:
1463 	 *
1464 	 * error    na
1465 	 *   0	   NULL		type doesn't match
1466 	 *  !0	   NULL		type matches, but na creation/lookup failed
1467 	 *   0	  !NULL		type matches and na created/found
1468 	 *  !0    !NULL		impossible
1469 	 */
1470 
1471 	/* try to see if this is a ptnetmap port */
1472 	error = netmap_get_pt_host_na(nmr, na, nmd, create);
1473 	if (error || *na != NULL)
1474 		goto out;
1475 
1476 	/* try to see if this is a monitor port */
1477 	error = netmap_get_monitor_na(nmr, na, nmd, create);
1478 	if (error || *na != NULL)
1479 		goto out;
1480 
1481 	/* try to see if this is a pipe port */
1482 	error = netmap_get_pipe_na(nmr, na, nmd, create);
1483 	if (error || *na != NULL)
1484 		goto out;
1485 
1486 	/* try to see if this is a bridge port */
1487 	error = netmap_get_bdg_na(nmr, na, nmd, create);
1488 	if (error)
1489 		goto out;
1490 
1491 	if (*na != NULL) /* valid match in netmap_get_bdg_na() */
1492 		goto out;
1493 
1494 	/*
1495 	 * This must be a hardware na, lookup the name in the system.
1496 	 * Note that by hardware we actually mean "it shows up in ifconfig".
1497 	 * This may still be a tap, a veth/epair, or even a
1498 	 * persistent VALE port.
1499 	 */
1500 	*ifp = ifunit_ref(nmr->nr_name);
1501 	if (*ifp == NULL) {
1502 		error = ENXIO;
1503 		goto out;
1504 	}
1505 
1506 	error = netmap_get_hw_na(*ifp, nmd, &ret);
1507 	if (error)
1508 		goto out;
1509 
1510 	*na = ret;
1511 	netmap_adapter_get(ret);
1512 
1513 out:
1514 	if (error) {
1515 		if (ret)
1516 			netmap_adapter_put(ret);
1517 		if (*ifp) {
1518 			if_rele(*ifp);
1519 			*ifp = NULL;
1520 		}
1521 	}
1522 	if (nmd_ref)
1523 		netmap_mem_put(nmd);
1524 
1525 	return error;
1526 }
1527 
1528 /* undo netmap_get_na() */
1529 void
1530 netmap_unget_na(struct netmap_adapter *na, struct ifnet *ifp)
1531 {
1532 	if (ifp)
1533 		if_rele(ifp);
1534 	if (na)
1535 		netmap_adapter_put(na);
1536 }
1537 
1538 
1539 #define NM_FAIL_ON(t) do {						\
1540 	if (unlikely(t)) {						\
1541 		RD(5, "%s: fail '" #t "' "				\
1542 			"h %d c %d t %d "				\
1543 			"rh %d rc %d rt %d "				\
1544 			"hc %d ht %d",					\
1545 			kring->name,					\
1546 			head, cur, ring->tail,				\
1547 			kring->rhead, kring->rcur, kring->rtail,	\
1548 			kring->nr_hwcur, kring->nr_hwtail);		\
1549 		return kring->nkr_num_slots;				\
1550 	}								\
1551 } while (0)
1552 
1553 /*
1554  * validate parameters on entry for *_txsync()
1555  * Returns ring->head if ok, or something >= kring->nkr_num_slots
1556  * in case of error.
1557  *
1558  * rhead, rcur and rtail=hwtail are stored from previous round.
1559  * hwcur is the next packet to send to the ring.
1560  *
1561  * We want
1562  *    hwcur <= *rhead <= head <= cur <= tail = *rtail <= hwtail
1563  *
1564  * hwcur, rhead, rtail and hwtail are reliable
1565  */
1566 u_int
1567 nm_txsync_prologue(struct netmap_kring *kring, struct netmap_ring *ring)
1568 {
1569 	u_int head = ring->head; /* read only once */
1570 	u_int cur = ring->cur; /* read only once */
1571 	u_int n = kring->nkr_num_slots;
1572 
1573 	ND(5, "%s kcur %d ktail %d head %d cur %d tail %d",
1574 		kring->name,
1575 		kring->nr_hwcur, kring->nr_hwtail,
1576 		ring->head, ring->cur, ring->tail);
1577 #if 1 /* kernel sanity checks; but we can trust the kring. */
1578 	NM_FAIL_ON(kring->nr_hwcur >= n || kring->rhead >= n ||
1579 	    kring->rtail >= n ||  kring->nr_hwtail >= n);
1580 #endif /* kernel sanity checks */
1581 	/*
1582 	 * user sanity checks. We only use head,
1583 	 * A, B, ... are possible positions for head:
1584 	 *
1585 	 *  0    A  rhead   B  rtail   C  n-1
1586 	 *  0    D  rtail   E  rhead   F  n-1
1587 	 *
1588 	 * B, F, D are valid. A, C, E are wrong
1589 	 */
1590 	if (kring->rtail >= kring->rhead) {
1591 		/* want rhead <= head <= rtail */
1592 		NM_FAIL_ON(head < kring->rhead || head > kring->rtail);
1593 		/* and also head <= cur <= rtail */
1594 		NM_FAIL_ON(cur < head || cur > kring->rtail);
1595 	} else { /* here rtail < rhead */
1596 		/* we need head outside rtail .. rhead */
1597 		NM_FAIL_ON(head > kring->rtail && head < kring->rhead);
1598 
1599 		/* two cases now: head <= rtail or head >= rhead  */
1600 		if (head <= kring->rtail) {
1601 			/* want head <= cur <= rtail */
1602 			NM_FAIL_ON(cur < head || cur > kring->rtail);
1603 		} else { /* head >= rhead */
1604 			/* cur must be outside rtail..head */
1605 			NM_FAIL_ON(cur > kring->rtail && cur < head);
1606 		}
1607 	}
1608 	if (ring->tail != kring->rtail) {
1609 		RD(5, "%s tail overwritten was %d need %d", kring->name,
1610 			ring->tail, kring->rtail);
1611 		ring->tail = kring->rtail;
1612 	}
1613 	kring->rhead = head;
1614 	kring->rcur = cur;
1615 	return head;
1616 }
1617 
1618 
1619 /*
1620  * validate parameters on entry for *_rxsync()
1621  * Returns ring->head if ok, kring->nkr_num_slots on error.
1622  *
1623  * For a valid configuration,
1624  * hwcur <= head <= cur <= tail <= hwtail
1625  *
1626  * We only consider head and cur.
1627  * hwcur and hwtail are reliable.
1628  *
1629  */
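/*
 * Worked example (illustrative only): with nkr_num_slots = 8, nr_hwcur = 1
 * and nr_hwtail = 5, userspace may advance head and cur within [1..5]
 * (e.g. head = cur = 4 to return buffers 1..3 to the kernel); head = 6 or
 * cur = 7 would trip the checks below and the ring would be reinited.
 */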
1630 u_int
1631 nm_rxsync_prologue(struct netmap_kring *kring, struct netmap_ring *ring)
1632 {
1633 	uint32_t const n = kring->nkr_num_slots;
1634 	uint32_t head, cur;
1635 
1636 	ND(5,"%s kc %d kt %d h %d c %d t %d",
1637 		kring->name,
1638 		kring->nr_hwcur, kring->nr_hwtail,
1639 		ring->head, ring->cur, ring->tail);
1640 	/*
1641 	 * Before storing the new values, we should check they do not
1642 	 * move backwards. However:
1643 	 * - head is not an issue because the previous value is hwcur;
1644 	 * - cur could in principle go back, however it does not matter
1645 	 *   because we are processing a brand new rxsync()
1646 	 */
1647 	cur = kring->rcur = ring->cur;	/* read only once */
1648 	head = kring->rhead = ring->head;	/* read only once */
1649 #if 1 /* kernel sanity checks */
1650 	NM_FAIL_ON(kring->nr_hwcur >= n || kring->nr_hwtail >= n);
1651 #endif /* kernel sanity checks */
1652 	/* user sanity checks */
1653 	if (kring->nr_hwtail >= kring->nr_hwcur) {
1654 		/* want hwcur <= rhead <= hwtail */
1655 		NM_FAIL_ON(head < kring->nr_hwcur || head > kring->nr_hwtail);
1656 		/* and also rhead <= rcur <= hwtail */
1657 		NM_FAIL_ON(cur < head || cur > kring->nr_hwtail);
1658 	} else {
1659 		/* we need rhead outside hwtail..hwcur */
1660 		NM_FAIL_ON(head < kring->nr_hwcur && head > kring->nr_hwtail);
1661 		/* two cases now: head <= hwtail or head >= hwcur  */
1662 		if (head <= kring->nr_hwtail) {
1663 			/* want head <= cur <= hwtail */
1664 			NM_FAIL_ON(cur < head || cur > kring->nr_hwtail);
1665 		} else {
1666 			/* cur must be outside hwtail..head */
1667 			NM_FAIL_ON(cur < head && cur > kring->nr_hwtail);
1668 		}
1669 	}
1670 	if (ring->tail != kring->rtail) {
1671 		RD(5, "%s tail overwritten was %d need %d",
1672 			kring->name,
1673 			ring->tail, kring->rtail);
1674 		ring->tail = kring->rtail;
1675 	}
1676 	return head;
1677 }
1678 
1679 
1680 /*
1681  * Error routine called when txsync/rxsync detects an error.
1682  * Can't do much more than resetting head = cur = hwcur, tail = hwtail
1683  * Return 1 on reinit.
1684  *
1685  * This routine is only called by the upper half of the kernel.
1686  * It only reads hwcur (which is changed only by the upper half, too)
1687  * and hwtail (which may be changed by the lower half, but only on
1688  * a tx ring and only to increase it, so any error will be recovered
1689  * on the next call). For the above, we don't strictly need to call
1690  * it under lock.
1691  */
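/*
 * Typical caller pattern (as in netmap_ioctl() and netmap_poll() below):
 *
 *	if (nm_txsync_prologue(kring, ring) >= kring->nkr_num_slots)
 *		netmap_ring_reinit(kring);
 */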
1692 int
1693 netmap_ring_reinit(struct netmap_kring *kring)
1694 {
1695 	struct netmap_ring *ring = kring->ring;
1696 	u_int i, lim = kring->nkr_num_slots - 1;
1697 	int errors = 0;
1698 
1699 	// XXX KASSERT nm_kr_tryget
1700 	RD(10, "called for %s", kring->name);
1701 	// XXX probably wrong to trust userspace
1702 	kring->rhead = ring->head;
1703 	kring->rcur  = ring->cur;
1704 	kring->rtail = ring->tail;
1705 
1706 	if (ring->cur > lim)
1707 		errors++;
1708 	if (ring->head > lim)
1709 		errors++;
1710 	if (ring->tail > lim)
1711 		errors++;
1712 	for (i = 0; i <= lim; i++) {
1713 		u_int idx = ring->slot[i].buf_idx;
1714 		u_int len = ring->slot[i].len;
1715 		if (idx < 2 || idx >= kring->na->na_lut.objtotal) {
1716 			RD(5, "bad index at slot %d idx %d len %d ", i, idx, len);
1717 			ring->slot[i].buf_idx = 0;
1718 			ring->slot[i].len = 0;
1719 		} else if (len > NETMAP_BUF_SIZE(kring->na)) {
1720 			ring->slot[i].len = 0;
1721 			RD(5, "bad len at slot %d idx %d len %d", i, idx, len);
1722 		}
1723 	}
1724 	if (errors) {
1725 		RD(10, "total %d errors", errors);
1726 		RD(10, "%s reinit, cur %d -> %d tail %d -> %d",
1727 			kring->name,
1728 			ring->cur, kring->nr_hwcur,
1729 			ring->tail, kring->nr_hwtail);
1730 		ring->head = kring->rhead = kring->nr_hwcur;
1731 		ring->cur  = kring->rcur  = kring->nr_hwcur;
1732 		ring->tail = kring->rtail = kring->nr_hwtail;
1733 	}
1734 	return (errors ? 1 : 0);
1735 }
1736 
1737 /* interpret the ringid and flags fields of an nmreq, by translating them
1738  * into a pair of intervals of ring indices:
1739  *
1740  * [priv->np_txqfirst, priv->np_txqlast) and
1741  * [priv->np_rxqfirst, priv->np_rxqlast)
1742  *
1743  */
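/*
 * Worked examples (illustrative only): on an adapter with 4 hw tx/rx rings
 * plus the host rings, the code below produces:
 *
 *	NR_REG_ALL_NIC			tx [0,4)  rx [0,4)
 *	NR_REG_ONE_NIC, ring id 2	tx [2,3)  rx [2,3)
 *	NR_REG_SW			tx [4,5)  rx [4,5)	(host rings only)
 *	NR_REG_NIC_SW			tx [0,5)  rx [0,5)
 */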
1744 int
1745 netmap_interp_ringid(struct netmap_priv_d *priv, uint16_t ringid, uint32_t flags)
1746 {
1747 	struct netmap_adapter *na = priv->np_na;
1748 	u_int j, i = ringid & NETMAP_RING_MASK;
1749 	u_int reg = flags & NR_REG_MASK;
1750 	int excluded_direction[] = { NR_TX_RINGS_ONLY, NR_RX_RINGS_ONLY };
1751 	enum txrx t;
1752 
1753 	if (reg == NR_REG_DEFAULT) {
1754 		/* convert from old ringid to flags */
1755 		if (ringid & NETMAP_SW_RING) {
1756 			reg = NR_REG_SW;
1757 		} else if (ringid & NETMAP_HW_RING) {
1758 			reg = NR_REG_ONE_NIC;
1759 		} else {
1760 			reg = NR_REG_ALL_NIC;
1761 		}
1762 		D("deprecated API, old ringid 0x%x -> ringid %x reg %d", ringid, i, reg);
1763 	}
1764 
1765 	if ((flags & NR_PTNETMAP_HOST) && ((reg != NR_REG_ALL_NIC &&
1766                     reg != NR_REG_PIPE_MASTER && reg != NR_REG_PIPE_SLAVE) ||
1767 			flags & (NR_RX_RINGS_ONLY|NR_TX_RINGS_ONLY))) {
1768 		D("Error: only NR_REG_ALL_NIC supported with netmap passthrough");
1769 		return EINVAL;
1770 	}
1771 
1772 	for_rx_tx(t) {
1773 		if (flags & excluded_direction[t]) {
1774 			priv->np_qfirst[t] = priv->np_qlast[t] = 0;
1775 			continue;
1776 		}
1777 		switch (reg) {
1778 		case NR_REG_ALL_NIC:
1779 		case NR_REG_PIPE_MASTER:
1780 		case NR_REG_PIPE_SLAVE:
1781 			priv->np_qfirst[t] = 0;
1782 			priv->np_qlast[t] = nma_get_nrings(na, t);
1783 			ND("ALL/PIPE: %s %d %d", nm_txrx2str(t),
1784 				priv->np_qfirst[t], priv->np_qlast[t]);
1785 			break;
1786 		case NR_REG_SW:
1787 		case NR_REG_NIC_SW:
1788 			if (!(na->na_flags & NAF_HOST_RINGS)) {
1789 				D("host rings not supported");
1790 				return EINVAL;
1791 			}
1792 			priv->np_qfirst[t] = (reg == NR_REG_SW ?
1793 				nma_get_nrings(na, t) : 0);
1794 			priv->np_qlast[t] = nma_get_nrings(na, t) + 1;
1795 			ND("%s: %s %d %d", reg == NR_REG_SW ? "SW" : "NIC+SW",
1796 				nm_txrx2str(t),
1797 				priv->np_qfirst[t], priv->np_qlast[t]);
1798 			break;
1799 		case NR_REG_ONE_NIC:
1800 			if (i >= na->num_tx_rings && i >= na->num_rx_rings) {
1801 				D("invalid ring id %d", i);
1802 				return EINVAL;
1803 			}
1804 			/* if not enough rings, use the first one */
1805 			j = i;
1806 			if (j >= nma_get_nrings(na, t))
1807 				j = 0;
1808 			priv->np_qfirst[t] = j;
1809 			priv->np_qlast[t] = j + 1;
1810 			ND("ONE_NIC: %s %d %d", nm_txrx2str(t),
1811 				priv->np_qfirst[t], priv->np_qlast[t]);
1812 			break;
1813 		default:
1814 			D("invalid regif type %d", reg);
1815 			return EINVAL;
1816 		}
1817 	}
1818 	priv->np_flags = (flags & ~NR_REG_MASK) | reg;
1819 
1820 	/* Allow transparent forwarding mode in the host --> nic
1821 	 * direction only if all the TX hw rings have been opened. */
1822 	if (priv->np_qfirst[NR_TX] == 0 &&
1823 			priv->np_qlast[NR_TX] >= na->num_tx_rings) {
1824 		priv->np_sync_flags |= NAF_CAN_FORWARD_DOWN;
1825 	}
1826 
1827 	if (netmap_verbose) {
1828 		D("%s: tx [%d,%d) rx [%d,%d) id %d",
1829 			na->name,
1830 			priv->np_qfirst[NR_TX],
1831 			priv->np_qlast[NR_TX],
1832 			priv->np_qfirst[NR_RX],
1833 			priv->np_qlast[NR_RX],
1834 			i);
1835 	}
1836 	return 0;
1837 }
1838 
1839 
1840 /*
1841  * Set the ring ID. For devices with a single queue, a request
1842  * for all rings is the same as a single ring.
1843  */
1844 static int
1845 netmap_set_ringid(struct netmap_priv_d *priv, uint16_t ringid, uint32_t flags)
1846 {
1847 	struct netmap_adapter *na = priv->np_na;
1848 	int error;
1849 	enum txrx t;
1850 
1851 	error = netmap_interp_ringid(priv, ringid, flags);
1852 	if (error) {
1853 		return error;
1854 	}
1855 
1856 	priv->np_txpoll = (ringid & NETMAP_NO_TX_POLL) ? 0 : 1;
1857 
1858 	/* optimization: count the users registered for more than
1859 	 * one ring, which are the ones sleeping on the global queue.
1860 	 * The default netmap_notify() callback will then
1861 	 * avoid signaling the global queue if nobody is using it
1862 	 */
1863 	for_rx_tx(t) {
1864 		if (nm_si_user(priv, t))
1865 			na->si_users[t]++;
1866 	}
1867 	return 0;
1868 }
1869 
1870 static void
1871 netmap_unset_ringid(struct netmap_priv_d *priv)
1872 {
1873 	struct netmap_adapter *na = priv->np_na;
1874 	enum txrx t;
1875 
1876 	for_rx_tx(t) {
1877 		if (nm_si_user(priv, t))
1878 			na->si_users[t]--;
1879 		priv->np_qfirst[t] = priv->np_qlast[t] = 0;
1880 	}
1881 	priv->np_flags = 0;
1882 	priv->np_txpoll = 0;
1883 }
1884 
1885 
1886 /* Set the nr_pending_mode for the requested rings.
1887  * If requested, also try to get exclusive access to the rings, provided
1888  * the rings we want to bind are not exclusively owned by a previous bind.
1889  */
1890 static int
1891 netmap_krings_get(struct netmap_priv_d *priv)
1892 {
1893 	struct netmap_adapter *na = priv->np_na;
1894 	u_int i;
1895 	struct netmap_kring *kring;
1896 	int excl = (priv->np_flags & NR_EXCLUSIVE);
1897 	enum txrx t;
1898 
1899 	ND("%s: grabbing tx [%d, %d) rx [%d, %d)",
1900 			na->name,
1901 			priv->np_qfirst[NR_TX],
1902 			priv->np_qlast[NR_TX],
1903 			priv->np_qfirst[NR_RX],
1904 			priv->np_qlast[NR_RX]);
1905 
1906 	/* first round: check that all the requested rings
1907 	 * are neither already exclusively owned, nor requested with
1908 	 * exclusive ownership while they are already in use
1909 	 */
1910 	for_rx_tx(t) {
1911 		for (i = priv->np_qfirst[t]; i < priv->np_qlast[t]; i++) {
1912 			kring = &NMR(na, t)[i];
1913 			if ((kring->nr_kflags & NKR_EXCLUSIVE) ||
1914 			    (kring->users && excl))
1915 			{
1916 				ND("ring %s busy", kring->name);
1917 				return EBUSY;
1918 			}
1919 		}
1920 	}
1921 
1922 	/* second round: increment usage count (possibly marking them
1923 	 * as exclusive) and set the nr_pending_mode
1924 	 */
1925 	for_rx_tx(t) {
1926 		for (i = priv->np_qfirst[t]; i < priv->np_qlast[t]; i++) {
1927 			kring = &NMR(na, t)[i];
1928 			kring->users++;
1929 			if (excl)
1930 				kring->nr_kflags |= NKR_EXCLUSIVE;
1931 			kring->nr_pending_mode = NKR_NETMAP_ON;
1932 		}
1933 	}
1934 
1935 	return 0;
1936 
1937 }
1938 
1939 /* Undo netmap_krings_get(). This is done by clearing the exclusive mode
1940  * if it was asked on regif, and unsetting the nr_pending_mode if we are
1941  * the last users of the involved rings. */
1942 static void
1943 netmap_krings_put(struct netmap_priv_d *priv)
1944 {
1945 	struct netmap_adapter *na = priv->np_na;
1946 	u_int i;
1947 	struct netmap_kring *kring;
1948 	int excl = (priv->np_flags & NR_EXCLUSIVE);
1949 	enum txrx t;
1950 
1951 	ND("%s: releasing tx [%d, %d) rx [%d, %d)",
1952 			na->name,
1953 			priv->np_qfirst[NR_TX],
1954 			priv->np_qlast[NR_TX],
1955 			priv->np_qfirst[NR_RX],
1956 			priv->np_qlast[NR_RX]);
1957 
1958 
1959 	for_rx_tx(t) {
1960 		for (i = priv->np_qfirst[t]; i < priv->np_qlast[t]; i++) {
1961 			kring = &NMR(na, t)[i];
1962 			if (excl)
1963 				kring->nr_kflags &= ~NKR_EXCLUSIVE;
1964 			kring->users--;
1965 			if (kring->users == 0)
1966 				kring->nr_pending_mode = NKR_NETMAP_OFF;
1967 		}
1968 	}
1969 }
1970 
1971 /*
1972  * Possibly move the interface to netmap mode.
1973  * On success it sets priv->np_nifp and returns 0, otherwise an error code.
1974  * This must be called with NMG_LOCK held.
1975  *
1976  * The following na callbacks are called in the process:
1977  *
1978  * na->nm_config()			[by netmap_update_config]
1979  * (get current number and size of rings)
1980  *
1981  *  	We have a generic one for linux (netmap_linux_config).
1982  *  	The bwrap has to override this, since it has to forward
1983  *  	the request to the wrapped adapter (netmap_bwrap_config).
1984  *
1985  *
1986  * na->nm_krings_create()
1987  * (create and init the krings array)
1988  *
1989  * 	One of the following:
1990  *
1991  *	* netmap_hw_krings_create, 			(hw ports)
1992  *		creates the standard layout for the krings
1993  * 		and adds the mbq (used for the host rings).
1994  *
1995  * 	* netmap_vp_krings_create			(VALE ports)
1996  * 		add leases and scratchpads
1997  *
1998  * 	* netmap_pipe_krings_create			(pipes)
1999  * 		create the krings and rings of both ends and
2000  * 		cross-link them
2001  *
2002  *      * netmap_monitor_krings_create 			(monitors)
2003  *      	avoid allocating the mbq
2004  *
2005  *      * netmap_bwrap_krings_create			(bwraps)
2006  *      	create both the bwrap krings array,
2007  *      	the krings array of the wrapped adapter, and
2008  *      	(if needed) the fake array for the host adapter
2009  *
2010  * na->nm_register(, 1)
2011  * (put the adapter in netmap mode)
2012  *
2013  * 	This may be one of the following:
2014  *
2015  * 	* netmap_hw_reg				        (hw ports)
2016  * 		checks that the ifp is still there, then calls
2017  * 		the hardware specific callback;
2018  *
2019  * 	* netmap_vp_reg					(VALE ports)
2020  *		If the port is connected to a bridge,
2021  *		set the NAF_NETMAP_ON flag under the
2022  *		bridge write lock.
2023  *
2024  *	* netmap_pipe_reg				(pipes)
2025  *		inform the other pipe end that it is no
2026  *		longer responsible for the lifetime of this
2027  *		pipe end
2028  *
2029  *	* netmap_monitor_reg				(monitors)
2030  *		intercept the sync callbacks of the monitored
2031  *		rings
2032  *
2033  *	* netmap_bwrap_reg				(bwraps)
2034  *		cross-link the bwrap and hwna rings,
2035  *		forward the request to the hwna, override
2036  *		the hwna notify callback (to get the frames
2037  *		coming from outside go through the bridge).
2038  *
2039  *
2040  */
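/*
 * Illustrative sketch (not part of this file): the hardware specific
 * register callback invoked by netmap_hw_reg() typically follows the
 * pattern below; the foo_* names are placeholders for a hypothetical
 * driver.
 */
#if 0
static int
foo_netmap_reg(struct netmap_adapter *na, int onoff)
{
	struct foo_softc *sc = na->ifp->if_softc;

	/* quiesce the device, flip the netmap flags, then restart it so
	 * that the datapath is rebuilt for (or out of) netmap mode */
	foo_stop(sc);
	if (onoff)
		nm_set_native_flags(na);
	else
		nm_clear_native_flags(na);
	foo_init(sc);
	return 0;
}
#endif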
2041 int
2042 netmap_do_regif(struct netmap_priv_d *priv, struct netmap_adapter *na,
2043 	uint16_t ringid, uint32_t flags)
2044 {
2045 	struct netmap_if *nifp = NULL;
2046 	int error;
2047 
2048 	NMG_LOCK_ASSERT();
2049 	/* ring configuration may have changed, fetch from the card */
2050 	netmap_update_config(na);
2051 	priv->np_na = na;     /* store the reference */
2052 	error = netmap_set_ringid(priv, ringid, flags);
2053 	if (error)
2054 		goto err;
2055 	error = netmap_mem_finalize(na->nm_mem, na);
2056 	if (error)
2057 		goto err;
2058 
2059 	if (na->active_fds == 0) {
2060 		/*
2061 		 * If this is the first registration of the adapter,
2062 		 * create the  in-kernel view of the netmap rings,
2063 		 * the netmap krings.
2064 		 */
2065 
2066 		/*
2067 		 * Depending on the adapter, this may also create
2068 		 * the netmap rings themselves
2069 		 */
2070 		error = na->nm_krings_create(na);
2071 		if (error)
2072 			goto err_drop_mem;
2073 
2074 	}
2075 
2076 	/* now the krings must exist and we can check whether some
2077 	 * previous bind has exclusive ownership on them, and set
2078 	 * nr_pending_mode
2079 	 */
2080 	error = netmap_krings_get(priv);
2081 	if (error)
2082 		goto err_del_krings;
2083 
2084 	/* create all needed missing netmap rings */
2085 	error = netmap_mem_rings_create(na);
2086 	if (error)
2087 		goto err_rel_excl;
2088 
2089 	/* in all cases, create a new netmap if */
2090 	nifp = netmap_mem_if_new(na, priv);
2091 	if (nifp == NULL) {
2092 		error = ENOMEM;
2093 		goto err_del_rings;
2094 	}
2095 
2096 	if (na->active_fds == 0) {
2097 		/* cache the allocator info in the na */
2098 		error = netmap_mem_get_lut(na->nm_mem, &na->na_lut);
2099 		if (error)
2100 			goto err_del_if;
2101 		ND("lut %p bufs %u size %u", na->na_lut.lut, na->na_lut.objtotal,
2102 					    na->na_lut.objsize);
2103 	}
2104 
2105 	if (nm_kring_pending(priv)) {
2106 		/* Some kring is switching mode, tell the adapter to
2107 		 * react on this. */
2108 		error = na->nm_register(na, 1);
2109 		if (error)
2110 			goto err_put_lut;
2111 	}
2112 
2113 	/* Commit the reference. */
2114 	na->active_fds++;
2115 
2116 	/*
2117 	 * advertise that the interface is ready by setting np_nifp.
2118 	 * The barrier is needed because readers (poll, *SYNC and mmap)
2119 	 * check for priv->np_nifp != NULL without locking
2120 	 */
2121 	mb(); /* make sure previous writes are visible to all CPUs */
2122 	priv->np_nifp = nifp;
2123 
2124 	return 0;
2125 
2126 err_put_lut:
2127 	if (na->active_fds == 0)
2128 		memset(&na->na_lut, 0, sizeof(na->na_lut));
2129 err_del_if:
2130 	netmap_mem_if_delete(na, nifp);
2131 err_rel_excl:
2132 	netmap_krings_put(priv);
2133 err_del_rings:
2134 	netmap_mem_rings_delete(na);
2135 err_del_krings:
2136 	if (na->active_fds == 0)
2137 		na->nm_krings_delete(na);
2138 err_drop_mem:
2139 	netmap_mem_deref(na->nm_mem, na);
2140 err:
2141 	priv->np_na = NULL;
2142 	return error;
2143 }
2144 
2145 
2146 /*
2147  * update kring and ring at the end of rxsync/txsync.
2148  */
2149 static inline void
2150 nm_sync_finalize(struct netmap_kring *kring)
2151 {
2152 	/*
2153 	 * Update ring tail to what the kernel knows
2154 	 * After txsync: head/rhead/hwcur might be behind cur/rcur
2155 	 * if no carrier.
2156 	 */
2157 	kring->ring->tail = kring->rtail = kring->nr_hwtail;
2158 
2159 	ND(5, "%s now hwcur %d hwtail %d head %d cur %d tail %d",
2160 		kring->name, kring->nr_hwcur, kring->nr_hwtail,
2161 		kring->rhead, kring->rcur, kring->rtail);
2162 }
2163 
2164 /* set ring timestamp */
2165 static inline void
2166 ring_timestamp_set(struct netmap_ring *ring)
2167 {
2168 	if (netmap_no_timestamp == 0 || ring->flags & NR_TIMESTAMP) {
2169 		microtime(&ring->ts);
2170 	}
2171 }
2172 
2173 
2174 /*
2175  * ioctl(2) support for the "netmap" device.
2176  *
2177  * Following a list of accepted commands:
2178  * - NIOCGINFO
2179  * - SIOCGIFADDR	just for convenience
2180  * - NIOCREGIF
2181  * - NIOCTXSYNC
2182  * - NIOCRXSYNC
2183  *
2184  * Return 0 on success, errno otherwise.
2185  */
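/*
 * Minimal userspace sketch (illustrative only), assuming a descriptor
 * already bound with NIOCREGIF and the shared region already mmap()ed
 * as described in netmap(4); new_head and next_to_read are placeholders:
 */
#if 0	/* not kernel code */
	struct netmap_ring *txring = NETMAP_TXRING(nifp, 0);
	struct netmap_ring *rxring = NETMAP_RXRING(nifp, 0);

	/* publish the slots we filled, then push them to the NIC */
	txring->head = txring->cur = new_head;
	ioctl(fd, NIOCTXSYNC, NULL);

	/* return the slots we consumed and ask for newly received packets */
	rxring->head = rxring->cur = next_to_read;
	ioctl(fd, NIOCRXSYNC, NULL);
#endif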
2186 int
2187 netmap_ioctl(struct netmap_priv_d *priv, u_long cmd, caddr_t data, struct thread *td)
2188 {
2189 	struct mbq q;	/* packets from RX hw queues to host stack */
2190 	struct nmreq *nmr = (struct nmreq *) data;
2191 	struct netmap_adapter *na = NULL;
2192 	struct netmap_mem_d *nmd = NULL;
2193 	struct ifnet *ifp = NULL;
2194 	int error = 0;
2195 	u_int i, qfirst, qlast;
2196 	struct netmap_if *nifp;
2197 	struct netmap_kring *krings;
2198 	int sync_flags;
2199 	enum txrx t;
2200 
2201 	if (cmd == NIOCGINFO || cmd == NIOCREGIF) {
2202 		/* truncate name */
2203 		nmr->nr_name[sizeof(nmr->nr_name) - 1] = '\0';
2204 		if (nmr->nr_version != NETMAP_API) {
2205 			D("API mismatch for %s got %d need %d",
2206 				nmr->nr_name,
2207 				nmr->nr_version, NETMAP_API);
2208 			nmr->nr_version = NETMAP_API;
2209 		}
2210 		if (nmr->nr_version < NETMAP_MIN_API ||
2211 		    nmr->nr_version > NETMAP_MAX_API) {
2212 			return EINVAL;
2213 		}
2214 	}
2215 
2216 	switch (cmd) {
2217 	case NIOCGINFO:		/* return capabilities etc */
2218 		if (nmr->nr_cmd == NETMAP_BDG_LIST) {
2219 			error = netmap_bdg_ctl(nmr, NULL);
2220 			break;
2221 		}
2222 
2223 		NMG_LOCK();
2224 		do {
2225 			/* memsize is always valid */
2226 			u_int memflags;
2227 
2228 			if (nmr->nr_name[0] != '\0') {
2229 
2230 				/* get a refcount */
2231 				error = netmap_get_na(nmr, &na, &ifp, NULL, 1 /* create */);
2232 				if (error) {
2233 					na = NULL;
2234 					ifp = NULL;
2235 					break;
2236 				}
2237 				nmd = na->nm_mem; /* get memory allocator */
2238 			} else {
2239 				nmd = netmap_mem_find(nmr->nr_arg2 ? nmr->nr_arg2 : 1);
2240 				if (nmd == NULL) {
2241 					error = EINVAL;
2242 					break;
2243 				}
2244 			}
2245 
2246 			error = netmap_mem_get_info(nmd, &nmr->nr_memsize, &memflags,
2247 				&nmr->nr_arg2);
2248 			if (error)
2249 				break;
2250 			if (na == NULL) /* only memory info */
2251 				break;
2252 			nmr->nr_offset = 0;
2253 			nmr->nr_rx_slots = nmr->nr_tx_slots = 0;
2254 			netmap_update_config(na);
2255 			nmr->nr_rx_rings = na->num_rx_rings;
2256 			nmr->nr_tx_rings = na->num_tx_rings;
2257 			nmr->nr_rx_slots = na->num_rx_desc;
2258 			nmr->nr_tx_slots = na->num_tx_desc;
2259 		} while (0);
2260 		netmap_unget_na(na, ifp);
2261 		NMG_UNLOCK();
2262 		break;
2263 
2264 	case NIOCREGIF:
2265 		/*
2266 		 * If nmr->nr_cmd is not zero, this NIOCREGIF is not really
2267 		 * a regif operation, but a different one, specified by the
2268 		 * value of nmr->nr_cmd.
2269 		 */
2270 		i = nmr->nr_cmd;
2271 		if (i == NETMAP_BDG_ATTACH || i == NETMAP_BDG_DETACH
2272 				|| i == NETMAP_BDG_VNET_HDR
2273 				|| i == NETMAP_BDG_NEWIF
2274 				|| i == NETMAP_BDG_DELIF
2275 				|| i == NETMAP_BDG_POLLING_ON
2276 				|| i == NETMAP_BDG_POLLING_OFF) {
2277 			/* possibly attach/detach NIC and VALE switch */
2278 			error = netmap_bdg_ctl(nmr, NULL);
2279 			break;
2280 		} else if (i == NETMAP_PT_HOST_CREATE || i == NETMAP_PT_HOST_DELETE) {
2281 			/* forward the command to the ptnetmap subsystem */
2282 			error = ptnetmap_ctl(nmr, priv->np_na);
2283 			break;
2284 		} else if (i == NETMAP_VNET_HDR_GET) {
2285 			/* get vnet-header length for this netmap port */
2286 			struct ifnet *ifp;
2287 
2288 			NMG_LOCK();
2289 			error = netmap_get_na(nmr, &na, &ifp, NULL, 0);
2290 			if (na && !error) {
2291 				nmr->nr_arg1 = na->virt_hdr_len;
2292 			}
2293 			netmap_unget_na(na, ifp);
2294 			NMG_UNLOCK();
2295 			break;
2296 		} else if (i == NETMAP_POOLS_INFO_GET) {
2297 			/* get information from the memory allocator */
2298 			NMG_LOCK();
2299 			if (priv->np_na && priv->np_na->nm_mem) {
2300 				struct netmap_mem_d *nmd = priv->np_na->nm_mem;
2301 				error = netmap_mem_pools_info_get(nmr, nmd);
2302 			} else {
2303 				error = EINVAL;
2304 			}
2305 			NMG_UNLOCK();
2306 			break;
2307 		} else if (i != 0) {
2308 			D("nr_cmd must be 0 not %d", i);
2309 			error = EINVAL;
2310 			break;
2311 		}
2312 
2313 		/* protect access to priv from concurrent NIOCREGIF */
2314 		NMG_LOCK();
2315 		do {
2316 			u_int memflags;
2317 			struct ifnet *ifp;
2318 
2319 			if (priv->np_nifp != NULL) {	/* thread already registered */
2320 				error = EBUSY;
2321 				break;
2322 			}
2323 
2324 			if (nmr->nr_arg2) {
2325 				/* find the allocator and get a reference */
2326 				nmd = netmap_mem_find(nmr->nr_arg2);
2327 				if (nmd == NULL) {
2328 					error = EINVAL;
2329 					break;
2330 				}
2331 			}
2332 			/* find the interface and a reference */
2333 			error = netmap_get_na(nmr, &na, &ifp, nmd,
2334 					      1 /* create */); /* keep reference */
2335 			if (error)
2336 				break;
2337 			if (NETMAP_OWNED_BY_KERN(na)) {
2338 				error = EBUSY;
2339 				break;
2340 			}
2341 
2342 			if (na->virt_hdr_len && !(nmr->nr_flags & NR_ACCEPT_VNET_HDR)) {
2343 				error = EIO;
2344 				break;
2345 			}
2346 
2347 			error = netmap_do_regif(priv, na, nmr->nr_ringid, nmr->nr_flags);
2348 			if (error) {    /* reg. failed, release priv and ref */
2349 				break;
2350 			}
2351 			nifp = priv->np_nifp;
2352 			priv->np_td = td; // XXX kqueue, debugging only
2353 
2354 			/* return the offset of the netmap_if object */
2355 			nmr->nr_rx_rings = na->num_rx_rings;
2356 			nmr->nr_tx_rings = na->num_tx_rings;
2357 			nmr->nr_rx_slots = na->num_rx_desc;
2358 			nmr->nr_tx_slots = na->num_tx_desc;
2359 			error = netmap_mem_get_info(na->nm_mem, &nmr->nr_memsize, &memflags,
2360 				&nmr->nr_arg2);
2361 			if (error) {
2362 				netmap_do_unregif(priv);
2363 				break;
2364 			}
2365 			if (memflags & NETMAP_MEM_PRIVATE) {
2366 				*(uint32_t *)(uintptr_t)&nifp->ni_flags |= NI_PRIV_MEM;
2367 			}
2368 			for_rx_tx(t) {
2369 				priv->np_si[t] = nm_si_user(priv, t) ?
2370 					&na->si[t] : &NMR(na, t)[priv->np_qfirst[t]].si;
2371 			}
2372 
2373 			if (nmr->nr_arg3) {
2374 				if (netmap_verbose)
2375 					D("requested %d extra buffers", nmr->nr_arg3);
2376 				nmr->nr_arg3 = netmap_extra_alloc(na,
2377 					&nifp->ni_bufs_head, nmr->nr_arg3);
2378 				if (netmap_verbose)
2379 					D("got %d extra buffers", nmr->nr_arg3);
2380 			}
2381 			nmr->nr_offset = netmap_mem_if_offset(na->nm_mem, nifp);
2382 
2383 			/* store ifp reference so that priv destructor may release it */
2384 			priv->np_ifp = ifp;
2385 		} while (0);
2386 		if (error) {
2387 			netmap_unget_na(na, ifp);
2388 		}
2389 		/* release the reference from netmap_mem_find() or
2390 		 * netmap_mem_ext_create()
2391 		 */
2392 		if (nmd)
2393 			netmap_mem_put(nmd);
2394 		NMG_UNLOCK();
2395 		break;
2396 
2397 	case NIOCTXSYNC:
2398 	case NIOCRXSYNC:
2399 		nifp = priv->np_nifp;
2400 
2401 		if (nifp == NULL) {
2402 			error = ENXIO;
2403 			break;
2404 		}
2405 		mb(); /* make sure following reads are not from cache */
2406 
2407 		na = priv->np_na;      /* we have a reference */
2408 
2409 		if (na == NULL) {
2410 			D("Internal error: nifp != NULL && na == NULL");
2411 			error = ENXIO;
2412 			break;
2413 		}
2414 
2415 		mbq_init(&q);
2416 		t = (cmd == NIOCTXSYNC ? NR_TX : NR_RX);
2417 		krings = NMR(na, t);
2418 		qfirst = priv->np_qfirst[t];
2419 		qlast = priv->np_qlast[t];
2420 		sync_flags = priv->np_sync_flags;
2421 
2422 		for (i = qfirst; i < qlast; i++) {
2423 			struct netmap_kring *kring = krings + i;
2424 			struct netmap_ring *ring = kring->ring;
2425 
2426 			if (unlikely(nm_kr_tryget(kring, 1, &error))) {
2427 				error = (error ? EIO : 0);
2428 				continue;
2429 			}
2430 
2431 			if (cmd == NIOCTXSYNC) {
2432 				if (netmap_verbose & NM_VERB_TXSYNC)
2433 					D("pre txsync ring %d cur %d hwcur %d",
2434 					    i, ring->cur,
2435 					    kring->nr_hwcur);
2436 				if (nm_txsync_prologue(kring, ring) >= kring->nkr_num_slots) {
2437 					netmap_ring_reinit(kring);
2438 				} else if (kring->nm_sync(kring, sync_flags | NAF_FORCE_RECLAIM) == 0) {
2439 					nm_sync_finalize(kring);
2440 				}
2441 				if (netmap_verbose & NM_VERB_TXSYNC)
2442 					D("post txsync ring %d cur %d hwcur %d",
2443 					    i, ring->cur,
2444 					    kring->nr_hwcur);
2445 			} else {
2446 				if (nm_rxsync_prologue(kring, ring) >= kring->nkr_num_slots) {
2447 					netmap_ring_reinit(kring);
2448 				}
2449 				if (nm_may_forward_up(kring)) {
2450 					/* transparent forwarding, see netmap_poll() */
2451 					netmap_grab_packets(kring, &q, netmap_fwd);
2452 				}
2453 				if (kring->nm_sync(kring, sync_flags | NAF_FORCE_READ) == 0) {
2454 					nm_sync_finalize(kring);
2455 				}
2456 				ring_timestamp_set(ring);
2457 			}
2458 			nm_kr_put(kring);
2459 		}
2460 
2461 		if (mbq_peek(&q)) {
2462 			netmap_send_up(na->ifp, &q);
2463 		}
2464 
2465 		break;
2466 
2467 #ifdef WITH_VALE
2468 	case NIOCCONFIG:
2469 		error = netmap_bdg_config(nmr);
2470 		break;
2471 #endif
2472 #ifdef __FreeBSD__
2473 	case FIONBIO:
2474 	case FIOASYNC:
2475 		ND("FIONBIO/FIOASYNC are no-ops");
2476 		break;
2477 
2478 	case BIOCIMMEDIATE:
2479 	case BIOCGHDRCMPLT:
2480 	case BIOCSHDRCMPLT:
2481 	case BIOCSSEESENT:
2482 		D("ignore BIOCIMMEDIATE/BIOCSHDRCMPLT/BIOCSHDRCMPLT/BIOCSSEESENT");
2483 		break;
2484 
2485 	default:	/* allow device-specific ioctls */
2486 	    {
2487 		struct ifnet *ifp = ifunit_ref(nmr->nr_name);
2488 		if (ifp == NULL) {
2489 			error = ENXIO;
2490 		} else {
2491 			struct socket so;
2492 
2493 			bzero(&so, sizeof(so));
2494 			so.so_vnet = ifp->if_vnet;
2495 			// so->so_proto not null.
2496 			error = ifioctl(&so, cmd, data, td);
2497 			if_rele(ifp);
2498 		}
2499 		break;
2500 	    }
2501 
2502 #else /* linux */
2503 	default:
2504 		error = EOPNOTSUPP;
2505 #endif /* linux */
2506 	}
2507 
2508 	return (error);
2509 }
2510 
2511 
2512 /*
2513  * select(2) and poll(2) handlers for the "netmap" device.
2514  *
2515  * Can be called for one or more queues.
2516  * Return the event mask corresponding to ready events.
2517  * If there are no ready events, do a selrecord on either individual
2518  * selinfo or on the global one.
2519  * Device-dependent parts (locking and sync of tx/rx rings)
2520  * are done through callbacks.
2521  *
2522  * On linux, arguments are really pwait, the poll table, and 'td' is a
2523  * struct file *. The first one is remapped to pwait as selrecord() uses
2524  * the name as a hidden argument.
2525  */
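/*
 * Userspace sketch (illustrative only): waiting for either direction on a
 * bound netmap file descriptor.
 *
 *	struct pollfd pfd = { .fd = fd, .events = POLLIN | POLLOUT };
 *
 *	poll(&pfd, 1, timeout_ms);
 *	if (pfd.revents & POLLIN)  { ... consume the rx rings ... }
 *	if (pfd.revents & POLLOUT) { ... fill the tx rings ... }
 */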
2526 int
2527 netmap_poll(struct netmap_priv_d *priv, int events, NM_SELRECORD_T *sr)
2528 {
2529 	struct netmap_adapter *na;
2530 	struct netmap_kring *kring;
2531 	struct netmap_ring *ring;
2532 	u_int i, check_all_tx, check_all_rx, want[NR_TXRX], revents = 0;
2533 #define want_tx want[NR_TX]
2534 #define want_rx want[NR_RX]
2535 	struct mbq q;	/* packets from RX hw queues to host stack */
2536 	enum txrx t;
2537 
2538 	/*
2539 	 * In order to avoid nested locks, we need to "double check"
2540 	 * txsync and rxsync if we decide to do a selrecord().
2541 	 * retry_tx (and retry_rx, later) prevent looping forever.
2542 	 */
2543 	int retry_tx = 1, retry_rx = 1;
2544 
2545 	/* Transparent mode: send_down is 1 if we have found some
2546 	 * packets to forward (host RX ring --> NIC) during the rx
2547 	 * scan and we have not sent them down to the NIC yet.
2548 	 * Transparent mode requires binding all rings to a single
2549 	 * file descriptor.
2550 	 */
2551 	int send_down = 0;
2552 	int sync_flags = priv->np_sync_flags;
2553 
2554 	mbq_init(&q);
2555 
2556 	if (priv->np_nifp == NULL) {
2557 		D("No if registered");
2558 		return POLLERR;
2559 	}
2560 	mb(); /* make sure following reads are not from cache */
2561 
2562 	na = priv->np_na;
2563 
2564 	if (!nm_netmap_on(na))
2565 		return POLLERR;
2566 
2567 	if (netmap_verbose & 0x8000)
2568 		D("device %s events 0x%x", na->name, events);
2569 	want_tx = events & (POLLOUT | POLLWRNORM);
2570 	want_rx = events & (POLLIN | POLLRDNORM);
2571 
2572 	/*
2573 	 * check_all_{tx|rx} are set if the card has more than one queue AND
2574 	 * the file descriptor is bound to all of them. If so, we sleep on
2575 	 * the "global" selinfo, otherwise we sleep on individual selinfo
2576 	 * (FreeBSD only allows two selinfo's per file descriptor).
2577 	 * The interrupt routine in the driver wakes one or the other
2578 	 * (or both) depending on which clients are active.
2579 	 *
2580 	 * rxsync() is only called if we run out of buffers on a POLLIN.
2581 	 * txsync() is called if we run out of buffers on POLLOUT, or
2582 	 * there are pending packets to send. The latter can be disabled
2583 	 * passing NETMAP_NO_TX_POLL in the NIOCREGIF call.
2584 	 */
2585 	check_all_tx = nm_si_user(priv, NR_TX);
2586 	check_all_rx = nm_si_user(priv, NR_RX);
2587 
2588 	/*
2589 	 * We start with a lock free round which is cheap if we have
2590 	 * slots available. If this fails, then lock and call the sync
2591 	 * routines.
2592 	 */
2593 #if 1 /* new code - call rx if any of the rings needs to release or read buffers */
2594 	if (want_tx) {
2595 		t = NR_TX;
2596 		for (i = priv->np_qfirst[t]; want[t] && i < priv->np_qlast[t]; i++) {
2597 			kring = &NMR(na, t)[i];
2598 			/* XXX compare ring->cur and kring->tail */
2599 			if (!nm_ring_empty(kring->ring)) {
2600 				revents |= want[t];
2601 				want[t] = 0;	/* also breaks the loop */
2602 			}
2603 		}
2604 	}
2605 	if (want_rx) {
2606 		want_rx = 0; /* look for a reason to run the handlers */
2607 		t = NR_RX;
2608 		for (i = priv->np_qfirst[t]; i < priv->np_qlast[t]; i++) {
2609 			kring = &NMR(na, t)[i];
2610 			if (kring->ring->cur == kring->ring->tail /* try fetch new buffers */
2611 			    || kring->rhead != kring->ring->head /* release buffers */) {
2612 				want_rx = 1;
2613 			}
2614 		}
2615 		if (!want_rx)
2616 			revents |= events & (POLLIN | POLLRDNORM); /* we have data */
2617 	}
2618 #else /* old code */
2619 	for_rx_tx(t) {
2620 		for (i = priv->np_qfirst[t]; want[t] && i < priv->np_qlast[t]; i++) {
2621 			kring = &NMR(na, t)[i];
2622 			/* XXX compare ring->cur and kring->tail */
2623 			if (!nm_ring_empty(kring->ring)) {
2624 				revents |= want[t];
2625 				want[t] = 0;	/* also breaks the loop */
2626 			}
2627 		}
2628 	}
2629 #endif /* old code */
2630 
2631 	/*
2632 	 * If we want to push packets out (priv->np_txpoll) or
2633 	 * want_tx is still set, we must issue txsync calls
2634 	 * (on all rings, to avoid that the tx rings stall).
2635 	 * XXX should also check cur != hwcur on the tx rings.
2636 	 * Fortunately, normal tx mode has np_txpoll set.
2637 	 */
2638 	if (priv->np_txpoll || want_tx) {
2639 		/*
2640 		 * The first round checks if anyone is ready, if not
2641 		 * do a selrecord and another round to handle races.
2642 		 * want_tx goes to 0 if any space is found, and is
2643 		 * used to skip rings with no pending transmissions.
2644 		 */
2645 flush_tx:
2646 		for (i = priv->np_qfirst[NR_TX]; i < priv->np_qlast[NR_TX]; i++) {
2647 			int found = 0;
2648 
2649 			kring = &na->tx_rings[i];
2650 			ring = kring->ring;
2651 
2652 			if (!send_down && !want_tx && ring->cur == kring->nr_hwcur)
2653 				continue;
2654 
2655 			if (nm_kr_tryget(kring, 1, &revents))
2656 				continue;
2657 
2658 			if (nm_txsync_prologue(kring, ring) >= kring->nkr_num_slots) {
2659 				netmap_ring_reinit(kring);
2660 				revents |= POLLERR;
2661 			} else {
2662 				if (kring->nm_sync(kring, sync_flags))
2663 					revents |= POLLERR;
2664 				else
2665 					nm_sync_finalize(kring);
2666 			}
2667 
2668 			/*
2669 			 * If we found new slots, notify potential
2670 			 * listeners on the same ring.
2671 			 * Since we just did a txsync, look at the copies
2672 			 * of cur,tail in the kring.
2673 			 */
2674 			found = kring->rcur != kring->rtail;
2675 			nm_kr_put(kring);
2676 			if (found) { /* notify other listeners */
2677 				revents |= want_tx;
2678 				want_tx = 0;
2679 				kring->nm_notify(kring, 0);
2680 			}
2681 		}
2682 		/* if there were any packets to forward, we must have handled them by now */
2683 		send_down = 0;
2684 		if (want_tx && retry_tx && sr) {
2685 			nm_os_selrecord(sr, check_all_tx ?
2686 			    &na->si[NR_TX] : &na->tx_rings[priv->np_qfirst[NR_TX]].si);
2687 			retry_tx = 0;
2688 			goto flush_tx;
2689 		}
2690 	}
2691 
2692 	/*
2693 	 * If want_rx is still set scan receive rings.
2694 	 * Do it on all rings because otherwise we starve.
2695 	 */
2696 	if (want_rx) {
2697 		/* two rounds here for race avoidance */
2698 do_retry_rx:
2699 		for (i = priv->np_qfirst[NR_RX]; i < priv->np_qlast[NR_RX]; i++) {
2700 			int found = 0;
2701 
2702 			kring = &na->rx_rings[i];
2703 			ring = kring->ring;
2704 
2705 			if (unlikely(nm_kr_tryget(kring, 1, &revents)))
2706 				continue;
2707 
2708 			if (nm_rxsync_prologue(kring, ring) >= kring->nkr_num_slots) {
2709 				netmap_ring_reinit(kring);
2710 				revents |= POLLERR;
2711 			}
2712 			/* now we can use kring->rcur, rtail */
2713 
2714 			/*
2715 			 * transparent mode support: collect packets from
2716 			 * hw rxring(s) that have been released by the user
2717 			 */
2718 			if (nm_may_forward_up(kring)) {
2719 				netmap_grab_packets(kring, &q, netmap_fwd);
2720 			}
2721 
2722 			/* Clear the NR_FORWARD flag anyway, it may be set by
2723 			 * the nm_sync() below only for the host RX ring (see
2724 			 * netmap_rxsync_from_host()). */
2725 			kring->nr_kflags &= ~NR_FORWARD;
2726 			if (kring->nm_sync(kring, sync_flags))
2727 				revents |= POLLERR;
2728 			else
2729 				nm_sync_finalize(kring);
2730 			send_down |= (kring->nr_kflags & NR_FORWARD);
2731 			ring_timestamp_set(ring);
2732 			found = kring->rcur != kring->rtail;
2733 			nm_kr_put(kring);
2734 			if (found) {
2735 				revents |= want_rx;
2736 				retry_rx = 0;
2737 				kring->nm_notify(kring, 0);
2738 			}
2739 		}
2740 
2741 		if (retry_rx && sr) {
2742 			nm_os_selrecord(sr, check_all_rx ?
2743 			    &na->si[NR_RX] : &na->rx_rings[priv->np_qfirst[NR_RX]].si);
2744 		}
2745 		if (send_down || retry_rx) {
2746 			retry_rx = 0;
2747 			if (send_down)
2748 				goto flush_tx; /* and retry_rx */
2749 			else
2750 				goto do_retry_rx;
2751 		}
2752 	}
2753 
2754 	/*
2755 	 * Transparent mode: released bufs (i.e. between kring->nr_hwcur and
2756 	 * ring->head) marked with NS_FORWARD on hw rx rings are passed up
2757 	 * to the host stack.
2758 	 */
2759 
2760 	if (mbq_peek(&q)) {
2761 		netmap_send_up(na->ifp, &q);
2762 	}
2763 
2764 	return (revents);
2765 #undef want_tx
2766 #undef want_rx
2767 }
2768 
2769 
2770 /*-------------------- driver support routines -------------------*/
2771 
2772 /* default notify callback */
2773 static int
2774 netmap_notify(struct netmap_kring *kring, int flags)
2775 {
2776 	struct netmap_adapter *na = kring->na;
2777 	enum txrx t = kring->tx;
2778 
2779 	nm_os_selwakeup(&kring->si);
2780 	/* optimization: avoid a wake up on the global
2781 	 * queue if nobody has registered for more
2782 	 * than one ring
2783 	 */
2784 	if (na->si_users[t] > 0)
2785 		nm_os_selwakeup(&na->si[t]);
2786 
2787 	return NM_IRQ_COMPLETED;
2788 }
2789 
2790 /* called by all routines that create netmap_adapters.
2791  * provide some defaults and get a reference to the
2792  * memory allocator
2793  */
2794 int
2795 netmap_attach_common(struct netmap_adapter *na)
2796 {
2797 	if (na->num_tx_rings == 0 || na->num_rx_rings == 0) {
2798 		D("%s: invalid rings tx %d rx %d",
2799 			na->name, na->num_tx_rings, na->num_rx_rings);
2800 		return EINVAL;
2801 	}
2802 
2803 #ifdef __FreeBSD__
2804 	if (na->na_flags & NAF_HOST_RINGS && na->ifp) {
2805 		na->if_input = na->ifp->if_input; /* for netmap_send_up */
2806 	}
2807 #endif /* __FreeBSD__ */
2808 	if (na->nm_krings_create == NULL) {
2809 		/* we assume that we have been called by a driver,
2810 		 * since other port types all provide their own
2811 		 * nm_krings_create
2812 		 */
2813 		na->nm_krings_create = netmap_hw_krings_create;
2814 		na->nm_krings_delete = netmap_hw_krings_delete;
2815 	}
2816 	if (na->nm_notify == NULL)
2817 		na->nm_notify = netmap_notify;
2818 	na->active_fds = 0;
2819 
2820 	if (na->nm_mem == NULL) {
2821 		/* use the global allocator */
2822 		na->nm_mem = netmap_mem_get(&nm_mem);
2823 	}
2824 #ifdef WITH_VALE
2825 	if (na->nm_bdg_attach == NULL)
2826 		/* no special nm_bdg_attach callback. On VALE
2827 		 * attach, we need to interpose a bwrap
2828 		 */
2829 		na->nm_bdg_attach = netmap_bwrap_attach;
2830 #endif
2831 
2832 	return 0;
2833 }
2834 
2835 
2836 /* standard cleanup, called by all destructors */
2837 void
2838 netmap_detach_common(struct netmap_adapter *na)
2839 {
2840 	if (na->tx_rings) { /* XXX should not happen */
2841 		D("freeing leftover tx_rings");
2842 		na->nm_krings_delete(na);
2843 	}
2844 	netmap_pipe_dealloc(na);
2845 	if (na->nm_mem)
2846 		netmap_mem_put(na->nm_mem);
2847 	bzero(na, sizeof(*na));
2848 	nm_os_free(na);
2849 }
2850 
2851 /* Wrapper for the register callback provided by netmap-enabled
2852  * hardware drivers.
2853  * nm_iszombie(na) means that the driver module has been
2854  * unloaded, so we cannot call into it.
2855  * nm_os_ifnet_lock() must guarantee mutual exclusion with
2856  * module unloading.
2857  */
2858 static int
2859 netmap_hw_reg(struct netmap_adapter *na, int onoff)
2860 {
2861 	struct netmap_hw_adapter *hwna =
2862 		(struct netmap_hw_adapter*)na;
2863 	int error = 0;
2864 
2865 	nm_os_ifnet_lock();
2866 
2867 	if (nm_iszombie(na)) {
2868 		if (onoff) {
2869 			error = ENXIO;
2870 		} else if (na != NULL) {
2871 			na->na_flags &= ~NAF_NETMAP_ON;
2872 		}
2873 		goto out;
2874 	}
2875 
2876 	error = hwna->nm_hw_register(na, onoff);
2877 
2878 out:
2879 	nm_os_ifnet_unlock();
2880 
2881 	return error;
2882 }
2883 
2884 static void
2885 netmap_hw_dtor(struct netmap_adapter *na)
2886 {
2887 	if (nm_iszombie(na) || na->ifp == NULL)
2888 		return;
2889 
2890 	WNA(na->ifp) = NULL;
2891 }
2892 
2893 
2894 /*
2895  * Allocate a netmap_adapter object, and initialize it from the
2896  * 'arg' passed by the driver on attach.
2897  * We allocate a block of memory of 'size' bytes, which has room
2898  * for struct netmap_adapter plus additional room private to
2899  * the caller.
2900  * Return 0 on success, ENOMEM otherwise.
2901  */
2902 int
2903 netmap_attach_ext(struct netmap_adapter *arg, size_t size)
2904 {
2905 	struct netmap_hw_adapter *hwna = NULL;
2906 	struct ifnet *ifp = NULL;
2907 
2908 	if (size < sizeof(struct netmap_hw_adapter)) {
2909 		D("Invalid netmap adapter size %d", (int)size);
2910 		return EINVAL;
2911 	}
2912 
2913 	if (arg == NULL || arg->ifp == NULL)
2914 		goto fail;
2915 	ifp = arg->ifp;
2916 	hwna = nm_os_malloc(size);
2917 	if (hwna == NULL)
2918 		goto fail;
2919 	hwna->up = *arg;
2920 	hwna->up.na_flags |= NAF_HOST_RINGS | NAF_NATIVE;
2921 	strncpy(hwna->up.name, ifp->if_xname, sizeof(hwna->up.name));
2922 	hwna->nm_hw_register = hwna->up.nm_register;
2923 	hwna->up.nm_register = netmap_hw_reg;
2924 	if (netmap_attach_common(&hwna->up)) {
2925 		nm_os_free(hwna);
2926 		goto fail;
2927 	}
2928 	netmap_adapter_get(&hwna->up);
2929 
2930 	NM_ATTACH_NA(ifp, &hwna->up);
2931 
2932 #ifdef linux
2933 	if (ifp->netdev_ops) {
2934 		/* prepare a clone of the netdev ops */
2935 #ifndef NETMAP_LINUX_HAVE_NETDEV_OPS
2936 		hwna->nm_ndo.ndo_start_xmit = ifp->netdev_ops;
2937 #else
2938 		hwna->nm_ndo = *ifp->netdev_ops;
2939 #endif /* NETMAP_LINUX_HAVE_NETDEV_OPS */
2940 	}
2941 	hwna->nm_ndo.ndo_start_xmit = linux_netmap_start_xmit;
2942 	if (ifp->ethtool_ops) {
2943 		hwna->nm_eto = *ifp->ethtool_ops;
2944 	}
2945 	hwna->nm_eto.set_ringparam = linux_netmap_set_ringparam;
2946 #ifdef NETMAP_LINUX_HAVE_SET_CHANNELS
2947 	hwna->nm_eto.set_channels = linux_netmap_set_channels;
2948 #endif /* NETMAP_LINUX_HAVE_SET_CHANNELS */
2949 	if (arg->nm_config == NULL) {
2950 		hwna->up.nm_config = netmap_linux_config;
2951 	}
2952 #endif /* linux */
2953 	if (arg->nm_dtor == NULL) {
2954 		hwna->up.nm_dtor = netmap_hw_dtor;
2955 	}
2956 
2957 	if_printf(ifp, "netmap queues/slots: TX %d/%d, RX %d/%d\n",
2958 	    hwna->up.num_tx_rings, hwna->up.num_tx_desc,
2959 	    hwna->up.num_rx_rings, hwna->up.num_rx_desc);
2960 	return 0;
2961 
2962 fail:
2963 	D("fail, arg %p ifp %p na %p", arg, ifp, hwna);
2964 	return (hwna ? EINVAL : ENOMEM);
2965 }
2966 
2967 
2968 int
2969 netmap_attach(struct netmap_adapter *arg)
2970 {
2971 	return netmap_attach_ext(arg, sizeof(struct netmap_hw_adapter));
2972 }
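/*
 * Illustrative sketch (hypothetical driver, foo_* names are placeholders):
 * how a netmap-enabled driver typically fills a netmap_adapter and calls
 * netmap_attach() from its attach routine.
 */
#if 0
static void
foo_netmap_attach(struct foo_softc *sc)
{
	struct netmap_adapter na;

	bzero(&na, sizeof(na));
	na.ifp = sc->ifp;
	na.num_tx_desc = sc->num_tx_desc;
	na.num_rx_desc = sc->num_rx_desc;
	na.num_tx_rings = na.num_rx_rings = sc->num_queues;
	na.nm_txsync = foo_netmap_txsync;
	na.nm_rxsync = foo_netmap_rxsync;
	na.nm_register = foo_netmap_reg;
	netmap_attach(&na);
}
#endif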
2973 
2974 
2975 void
2976 NM_DBG(netmap_adapter_get)(struct netmap_adapter *na)
2977 {
2978 	if (!na) {
2979 		return;
2980 	}
2981 
2982 	refcount_acquire(&na->na_refcount);
2983 }
2984 
2985 
2986 /* returns 1 iff the netmap_adapter is destroyed */
2987 int
2988 NM_DBG(netmap_adapter_put)(struct netmap_adapter *na)
2989 {
2990 	if (!na)
2991 		return 1;
2992 
2993 	if (!refcount_release(&na->na_refcount))
2994 		return 0;
2995 
2996 	if (na->nm_dtor)
2997 		na->nm_dtor(na);
2998 
2999 	netmap_detach_common(na);
3000 
3001 	return 1;
3002 }
3003 
3004 /* nm_krings_create callback for all hardware native adapters */
3005 int
3006 netmap_hw_krings_create(struct netmap_adapter *na)
3007 {
3008 	int ret = netmap_krings_create(na, 0);
3009 	if (ret == 0) {
3010 		/* initialize the mbq for the sw rx ring */
3011 		mbq_safe_init(&na->rx_rings[na->num_rx_rings].rx_queue);
3012 		ND("initialized sw rx queue %d", na->num_rx_rings);
3013 	}
3014 	return ret;
3015 }
3016 
3017 
3018 
3019 /*
3020  * Called on module unload by the netmap-enabled drivers
3021  */
3022 void
3023 netmap_detach(struct ifnet *ifp)
3024 {
3025 	struct netmap_adapter *na = NA(ifp);
3026 
3027 	if (!na)
3028 		return;
3029 
3030 	NMG_LOCK();
3031 	netmap_set_all_rings(na, NM_KR_LOCKED);
3032 	na->na_flags |= NAF_ZOMBIE;
3033 	/*
3034 	 * if the netmap adapter is not native, somebody
3035 	 * changed it, so we can not release it here.
3036 	 * The NAF_ZOMBIE flag will notify the new owner that
3037 	 * the driver is gone.
3038 	 */
3039 	if (na->na_flags & NAF_NATIVE) {
3040 	        netmap_adapter_put(na);
3041 	}
3042 	/* give active users a chance to notice that NAF_ZOMBIE has been
3043 	 * turned on, so that they can stop and return an error to userspace.
3044 	 * Note that this becomes a NOP if there are no active users and,
3045 	 * therefore, the put() above has deleted the na, since now NA(ifp) is
3046 	 * NULL.
3047 	 */
3048 	netmap_enable_all_rings(ifp);
3049 	NMG_UNLOCK();
3050 }
3051 
3052 
3053 /*
3054  * Intercept packets from the network stack and pass them
3055  * to netmap as incoming packets on the 'software' ring.
3056  *
3057  * We only store packets in a bounded mbq and then copy them
3058  * in the relevant rxsync routine.
3059  *
3060  * We rely on the OS to make sure that the ifp and na do not go
3061  * away (typically the caller checks for IFF_DRV_RUNNING or the like).
3062  * In nm_register() or whenever there is a reinitialization,
3063  * we make sure to make the mode change visible here.
3064  */
3065 int
3066 netmap_transmit(struct ifnet *ifp, struct mbuf *m)
3067 {
3068 	struct netmap_adapter *na = NA(ifp);
3069 	struct netmap_kring *kring, *tx_kring;
3070 	u_int len = MBUF_LEN(m);
3071 	u_int error = ENOBUFS;
3072 	unsigned int txr;
3073 	struct mbq *q;
3074 	int busy;
3075 
3076 	kring = &na->rx_rings[na->num_rx_rings];
3077 	// XXX [Linux] we do not need this lock
3078 	// if we follow the down/configure/up protocol -gl
3079 	// mtx_lock(&na->core_lock);
3080 
3081 	if (!nm_netmap_on(na)) {
3082 		D("%s not in netmap mode anymore", na->name);
3083 		error = ENXIO;
3084 		goto done;
3085 	}
3086 
3087 	txr = MBUF_TXQ(m);
3088 	if (txr >= na->num_tx_rings) {
3089 		txr %= na->num_tx_rings;
3090 	}
3091 	tx_kring = &NMR(na, NR_TX)[txr];
3092 
3093 	if (tx_kring->nr_mode == NKR_NETMAP_OFF) {
3094 		return MBUF_TRANSMIT(na, ifp, m);
3095 	}
3096 
3097 	q = &kring->rx_queue;
3098 
3099 	// XXX reconsider long packets if we handle fragments
3100 	if (len > NETMAP_BUF_SIZE(na)) { /* too long for us */
3101 		D("%s from_host, drop packet size %d > %d", na->name,
3102 			len, NETMAP_BUF_SIZE(na));
3103 		goto done;
3104 	}
3105 
3106 	if (nm_os_mbuf_has_offld(m)) {
3107 		RD(1, "%s drop mbuf that needs offloadings", na->name);
3108 		goto done;
3109 	}
3110 
3111 	/* protect against netmap_rxsync_from_host(), netmap_sw_to_nic()
3112 	 * and maybe other instances of netmap_transmit (the latter
3113 	 * not possible on Linux).
3114 	 * We enqueue the mbuf only if we are sure there is going to be
3115 	 * enough room in the host RX ring, otherwise we drop it.
3116 	 */
3117 	mbq_lock(q);
3118 
3119 	busy = kring->nr_hwtail - kring->nr_hwcur;
3120 	if (busy < 0)
3121 		busy += kring->nkr_num_slots;
3122 	if (busy + mbq_len(q) >= kring->nkr_num_slots - 1) {
3123 		RD(2, "%s full hwcur %d hwtail %d qlen %d", na->name,
3124 			kring->nr_hwcur, kring->nr_hwtail, mbq_len(q));
3125 	} else {
3126 		mbq_enqueue(q, m);
3127 		ND(2, "%s %d bufs in queue", na->name, mbq_len(q));
3128 		/* notify outside the lock */
3129 		m = NULL;
3130 		error = 0;
3131 	}
3132 	mbq_unlock(q);
3133 
3134 done:
3135 	if (m)
3136 		m_freem(m);
3137 	/* unconditionally wake up listeners */
3138 	kring->nm_notify(kring, 0);
3139 	/* this is normally netmap_notify(), but for nics
3140 	 * connected to a bridge it is netmap_bwrap_intr_notify(),
3141 	 * that possibly forwards the frames through the switch
3142 	 */
3143 
3144 	return (error);
3145 }
3146 
3147 
3148 /*
3149  * netmap_reset() is called by the driver routines when reinitializing
3150  * a ring. The driver is in charge of locking to protect the kring.
3151  * If native netmap mode is not set just return NULL.
3152  * If native netmap mode is set, in particular, we have to set nr_mode to
3153  * NKR_NETMAP_ON.
3154  */
3155 struct netmap_slot *
3156 netmap_reset(struct netmap_adapter *na, enum txrx tx, u_int n,
3157 	u_int new_cur)
3158 {
3159 	struct netmap_kring *kring;
3160 	int new_hwofs, lim;
3161 
3162 	if (!nm_native_on(na)) {
3163 		ND("interface not in native netmap mode");
3164 		return NULL;	/* nothing to reinitialize */
3165 	}
3166 
3167 	/* XXX note- in the new scheme, we are not guaranteed to be
3168 	 * under lock (e.g. when called on a device reset).
3169 	 * In this case, we should set a flag and not trust the
3170 	 * values too much. In practice: TODO
3171 	 * - set a RESET flag somewhere in the kring
3172 	 * - do the processing in a conservative way
3173 	 * - let the *sync() fixup at the end.
3174 	 */
3175 	if (tx == NR_TX) {
3176 		if (n >= na->num_tx_rings)
3177 			return NULL;
3178 
3179 		kring = na->tx_rings + n;
3180 
3181 		if (kring->nr_pending_mode == NKR_NETMAP_OFF) {
3182 			kring->nr_mode = NKR_NETMAP_OFF;
3183 			return NULL;
3184 		}
3185 
3186 		// XXX check whether we should use hwcur or rcur
3187 		new_hwofs = kring->nr_hwcur - new_cur;
3188 	} else {
3189 		if (n >= na->num_rx_rings)
3190 			return NULL;
3191 		kring = na->rx_rings + n;
3192 
3193 		if (kring->nr_pending_mode == NKR_NETMAP_OFF) {
3194 			kring->nr_mode = NKR_NETMAP_OFF;
3195 			return NULL;
3196 		}
3197 
3198 		new_hwofs = kring->nr_hwtail - new_cur;
3199 	}
3200 	lim = kring->nkr_num_slots - 1;
3201 	if (new_hwofs > lim)
3202 		new_hwofs -= lim + 1;
3203 
3204 	/* Always set the new offset value and realign the ring. */
3205 	if (netmap_verbose)
3206 	    D("%s %s%d hwofs %d -> %d, hwtail %d -> %d",
3207 		na->name,
3208 		tx == NR_TX ? "TX" : "RX", n,
3209 		kring->nkr_hwofs, new_hwofs,
3210 		kring->nr_hwtail,
3211 		tx == NR_TX ? lim : kring->nr_hwtail);
3212 	kring->nkr_hwofs = new_hwofs;
3213 	if (tx == NR_TX) {
3214 		kring->nr_hwtail = kring->nr_hwcur + lim;
3215 		if (kring->nr_hwtail > lim)
3216 			kring->nr_hwtail -= lim + 1;
3217 	}
3218 
3219 #if 0 // def linux
3220 	/* XXX check that the mappings are correct */
3221 	/* need ring_nr, adapter->pdev, direction */
3222 	buffer_info->dma = dma_map_single(&pdev->dev, addr, adapter->rx_buffer_len, DMA_FROM_DEVICE);
3223 	if (dma_mapping_error(&adapter->pdev->dev, buffer_info->dma)) {
3224 		D("error mapping rx netmap buffer %d", i);
3225 		// XXX fix error handling
3226 	}
3227 
3228 #endif /* linux */
3229 	/*
3230 	 * Wakeup on the individual and global selwait
3231 	 * We do the wakeup here, but the ring is not yet reconfigured.
3232 	 * However, we are under lock so there are no races.
3233 	 */
3234 	kring->nr_mode = NKR_NETMAP_ON;
3235 	kring->nm_notify(kring, 0);
3236 	return kring->ring->slot;
3237 }
3238 
3239 
3240 /*
3241  * Dispatch rx/tx interrupts to the netmap rings.
3242  *
3243  * "work_done" is non-null on the RX path, NULL for the TX path.
3244  * We rely on the OS to make sure that there is only one active
3245  * instance per queue, and that there is appropriate locking.
3246  *
3247  * The 'notify' routine depends on what the ring is attached to.
3248  * - for a netmap file descriptor, do a selwakeup on the individual
3249  *   waitqueue, plus one on the global one if needed
3250  *   (see netmap_notify)
3251  * - for a nic connected to a switch, call the proper forwarding routine
3252  *   (see netmap_bwrap_intr_notify)
3253  */
3254 int
3255 netmap_common_irq(struct netmap_adapter *na, u_int q, u_int *work_done)
3256 {
3257 	struct netmap_kring *kring;
3258 	enum txrx t = (work_done ? NR_RX : NR_TX);
3259 
3260 	q &= NETMAP_RING_MASK;
3261 
3262 	if (netmap_verbose) {
3263 	        RD(5, "received %s queue %d", work_done ? "RX" : "TX" , q);
3264 	}
3265 
3266 	if (q >= nma_get_nrings(na, t))
3267 		return NM_IRQ_PASS; // not a physical queue
3268 
3269 	kring = NMR(na, t) + q;
3270 
3271 	if (kring->nr_mode == NKR_NETMAP_OFF) {
3272 		return NM_IRQ_PASS;
3273 	}
3274 
3275 	if (t == NR_RX) {
3276 		kring->nr_kflags |= NKR_PENDINTR;	// XXX atomic ?
3277 		*work_done = 1; /* do not fire napi again */
3278 	}
3279 
3280 	return kring->nm_notify(kring, 0);
3281 }
3282 
3283 
3284 /*
3285  * Default functions to handle rx/tx interrupts from a physical device.
3286  * "work_done" is non-null on the RX path, NULL for the TX path.
3287  *
3288  * If the card is not in netmap mode, simply return NM_IRQ_PASS,
3289  * so that the caller proceeds with regular processing.
3290  * Otherwise call netmap_common_irq().
3291  *
3292  * If the card is connected to a netmap file descriptor,
3293  * do a selwakeup on the individual queue, plus one on the global one
3294  * if needed (multiqueue card _and_ there are multiqueue listeners),
3295  * and return NM_IRQ_COMPLETED.
3296  *
3297  * Finally, if called on rx from an interface connected to a switch,
3298  * calls the proper forwarding routine.
3299  */
3300 int
3301 netmap_rx_irq(struct ifnet *ifp, u_int q, u_int *work_done)
3302 {
3303 	struct netmap_adapter *na = NA(ifp);
3304 
3305 	/*
3306 	 * XXX emulated netmap mode sets NAF_SKIP_INTR so
3307 	 * we still use the regular driver even though the previous
3308 	 * check fails. It is unclear whether we should use
3309 	 * nm_native_on() here.
3310 	 */
3311 	if (!nm_netmap_on(na))
3312 		return NM_IRQ_PASS;
3313 
3314 	if (na->na_flags & NAF_SKIP_INTR) {
3315 		ND("use regular interrupt");
3316 		return NM_IRQ_PASS;
3317 	}
3318 
3319 	return netmap_common_irq(na, q, work_done);
3320 }
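/*
 * Illustrative sketch (hypothetical driver): how an rx interrupt handler
 * hands a queue over to netmap; 'que->me' stands for the queue index.
 */
#if 0
	u_int work_done = 0;

	if (netmap_rx_irq(ifp, que->me, &work_done) != NM_IRQ_PASS)
		return;	/* netmap consumed the event, skip the normal rx path */
#endif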
3321 
3322 
3323 /*
3324  * Module loader and unloader
3325  *
3326  * netmap_init() creates the /dev/netmap device and initializes
3327  * all global variables. Returns 0 on success, errno on failure
3328  * (but there is no chance)
3329  *
3330  * netmap_fini() destroys everything.
3331  */
3332 
3333 static struct cdev *netmap_dev; /* /dev/netmap character device. */
3334 extern struct cdevsw netmap_cdevsw;
3335 
3336 
3337 void
3338 netmap_fini(void)
3339 {
3340 	if (netmap_dev)
3341 		destroy_dev(netmap_dev);
3342 	/* we assume that there are no longer netmap users */
3343 	nm_os_ifnet_fini();
3344 	netmap_uninit_bridges();
3345 	netmap_mem_fini();
3346 	NMG_LOCK_DESTROY();
3347 	nm_prinf("netmap: unloaded module.\n");
3348 }
3349 
3350 
3351 int
3352 netmap_init(void)
3353 {
3354 	int error;
3355 
3356 	NMG_LOCK_INIT();
3357 
3358 	error = netmap_mem_init();
3359 	if (error != 0)
3360 		goto fail;
3361 	/*
3362 	 * MAKEDEV_ETERNAL_KLD avoids an expensive check on syscalls
3363 	 * when the module is compiled in.
3364 	 * XXX could use make_dev_credv() to get error number
3365 	 */
3366 	netmap_dev = make_dev_credf(MAKEDEV_ETERNAL_KLD,
3367 		&netmap_cdevsw, 0, NULL, UID_ROOT, GID_WHEEL, 0600,
3368 			      "netmap");
3369 	if (!netmap_dev)
3370 		goto fail;
3371 
3372 	error = netmap_init_bridges();
3373 	if (error)
3374 		goto fail;
3375 
3376 #ifdef __FreeBSD__
3377 	nm_os_vi_init_index();
3378 #endif
3379 
3380 	error = nm_os_ifnet_init();
3381 	if (error)
3382 		goto fail;
3383 
3384 	nm_prinf("netmap: loaded module\n");
3385 	return (0);
3386 fail:
3387 	netmap_fini();
3388 	return (EINVAL); /* may be incorrect */
3389 }
3390